# cell_graph_features.py
#
# Global (whole-population) cell-graph features of nuclear centroids,
# derived from the Voronoi diagram, Delaunay triangulation, minimum
# spanning tree, and local-density statistics of the point set.
from __future__ import division
from collections import namedtuple
import numpy as np
from numpy import linalg
from pandas import DataFrame
from scipy.spatial import Delaunay, cKDTree as KDTree, Voronoi
from scipy import sparse
from scipy.sparse.csgraph import minimum_spanning_tree
from scipy.spatial.distance import pdist
# Summary statistics of a scalar population: mean, standard deviation,
# minimum-to-maximum ratio, and disorder (stddev / (mean + stddev)).
PopStats = namedtuple('PopStats', ['mean', 'stddev', 'min_max_ratio', 'disorder'])
# Per-population stats of Voronoi cell polygons: area, perimeter, and
# maximum vertex-to-vertex distance.
PolyProps = namedtuple('PolyProps', ['area', 'peri', 'max_dist'])
# Per-population stats of Delaunay triangles: side lengths and areas.
TriProps = namedtuple('TriProps', ['sides', 'area'])
# Density features: neighbor counts within given radii, and the distance
# needed to enclose given neighbor counts.
DensityProps = namedtuple('DensityProps', ['neighbors_in_distance',
                                           'distance_for_neighbors'])
# Top-level result container returned by _compute_global_cell_graph_features.
Props = namedtuple('Props', ['voronoi', 'delaunay', 'mst_branches', 'density'])
def compute_global_cell_graph_features(
        centroids,
        neighbor_distances=None,
        neighbor_counts=(3, 5, 7),
):
    r"""Compute global (i.e., not per-nucleus) features of the nuclei with
    the given centroids based on the partitioning of the space into
    Voronoi cells and on the induced graph structure.

    Parameters
    ----------
    centroids : array_like
        Nx2 numpy array of nuclear centroids
    neighbor_distances : array_like, optional
        Radii to count neighbors in.  If None (the default), the radii
        10, 20, 30, 40, 50 are used.
    neighbor_counts : sequence
        Sequence of numbers of neighbors, each of which is used to
        compute statistics relating to the distance required to reach
        that many neighbors.

    Returns
    -------
    props : pandas.DataFrame
        A single-row DataFrame with the following columns:

        - voronoi\_...: Voronoi diagram features

          - area\_...: Polygon area features
          - peri\_...: Polygon perimeter features
          - max_dist\_...: Maximum distance in polygon features

        - delaunay\_...: Delaunay triangulation features

          - sides\_...: Triangle side length features
          - area\_...: Triangle area features

        - mst_branches\_...: Minimum spanning tree branch features
        - density\_...: Density features

          - neighbors_in_distance\_...

            - 0, 1, ..., len(neighbor_distances) - 1: Neighbor count
              within given radius features.

          - distance_for_neighbors\_...

            - 0, 1, ..., len(neighbor_counts) - 1: Minimum distance to
              enclose count neighbors features

        The "..."s are meant to signify that what precedes is the
        start of a column name.  At the end of each column name is one
        of 'mean', 'stddev', 'min_max_ratio', and 'disorder'.
        'min_max_ratio' is the minimum-to-maximum ratio, and disorder
        is stddev / (mean + stddev).

    Note
    ----
    The indices for the density features are with respect to the
    *sorted* values of the corresponding argument sequence.

    References
    ----------
    .. [#] Doyle, S., Agner, S., Madabhushi, A., Feldman, M., & Tomaszewski, J.
       (2008, May). Automated grading of breast cancer histopathology using
       spectral clustering with textural and architectural image features.
       In Biomedical Imaging: From Nano to Macro, 2008. ISBI 2008.
       5th IEEE International Symposium on (pp. 496-499). IEEE.

    """
    # Avoid a mutable default argument (the historical default was a
    # shared numpy array).  None preserves the old default behavior.
    if neighbor_distances is None:
        neighbor_distances = 10. * np.arange(1, 6)
    return _flatten_to_dataframe(_compute_global_cell_graph_features(
        centroids,
        neighbor_distances,
        neighbor_counts,
    ))
def _compute_global_cell_graph_features(
        centroids,
        neighbor_distances,
        neighbor_counts,
):
    """Internal support for compute_global_cell_graph_features that
    returns its result in a nested namedtuple structure instead of a
    pandas DataFrame.
    """
    vor = Voronoi(centroids)
    centroids = vor.points
    vertices = vor.vertices
    # Keep only non-empty, bounded regions (-1 marks a vertex at infinity).
    regions = [r for r in vor.regions if r and -1 not in r]
    # BUG FIX: np.stack requires a sequence of arrays; passing a
    # generator (or map object) raises TypeError on modern NumPy, so
    # materialize with list comprehensions throughout.
    areas = np.stack([_poly_area(vertices[r]) for r in regions])
    peris = np.stack([_poly_peri(vertices[r]) for r in regions])
    max_dists = np.stack([pdist(vertices[r]).max() for r in regions])
    poly_props = PolyProps._make(map(_pop_stats, (areas, peris, max_dists)))
    de = Delaunay(centroids)
    # From the docs: "Coplanar points are input points which were not
    # included in the triangulation due to numerical precision
    # issues."  I don't know how this would affect the results if
    # present, and it doesn't appear to happen, so it's excluded here.
    assert not de.coplanar.size
    indptr, indices = de.vertex_neighbor_vertices
    bin_connectivity = sparse.csr_matrix(
        (np.ones(len(indices), dtype=bool), indices, indptr),
        (len(centroids),) * 2,
    )
    # triu keeps each undirected edge exactly once (row < col).
    ridge_points = sparse.triu(bin_connectivity, format='coo')
    ridge_points = np.stack((ridge_points.row, ridge_points.col), axis=-1)
    # This isn't exactly the collection of sides, since if they should
    # be counted per-triangle then we weight border ridges wrong
    # relative to ridges that are part of two triangles.
    ridge_lengths = _dist(*np.swapaxes(centroids[ridge_points], 0, 1))
    sides = ridge_lengths
    areas = np.stack([_poly_area(centroids[t]) for t in de.simplices])
    tri_props = TriProps._make(map(_pop_stats, (sides, areas)))
    graph = sparse.coo_matrix((ridge_lengths, ridge_points.T),
                              (len(centroids), len(centroids)))
    mst = minimum_spanning_tree(graph)
    # Without looking into exactly how minimum_spanning_tree
    # constructs its output, eliminate any explicit zeros to be on the
    # safe side.
    mst_branches = _pop_stats(mst.data[mst.data != 0])
    tree = KDTree(centroids)
    neighbors_in_distance = {
        # Yes, we just throw away the actual points.  Subtract 1 so a
        # point does not count itself as its own neighbor.
        r: _pop_stats(
            np.stack([len(p) for p in tree.query_ball_tree(tree, r)]) - 1)
        for r in neighbor_distances
    }
    # Query c + 1 nearest points because the closest "neighbor" of any
    # point is the point itself at distance 0.
    distance_for_neighbors = dict(zip(
        neighbor_counts,
        map(_pop_stats,
            tree.query(centroids, [c + 1 for c in neighbor_counts])[0].T),
    ))
    density_props = DensityProps(neighbors_in_distance, distance_for_neighbors)
    return Props(poly_props, tri_props, mst_branches, density_props)
def _poly_area(vertices):
return abs(_poly_signed_area(vertices))
def _poly_signed_area(vertices):
return .5 * linalg.det(
np.stack((vertices, np.roll(vertices, -1, axis=-2)), -1)
).sum(-1)
def _poly_peri(vertices):
return _dist(vertices, np.roll(vertices, -1, axis=-2)).sum(-1)
def _dist(x, y):
"""Compute the distance between two sets of points. Has signature
(i),(i)->().
"""
return (np.subtract(x, y) ** 2).sum(-1) ** .5
def _pop_stats(pop):
    """Return a PopStats of the population, after iteratively discarding
    outliers (values more than three standard deviations from the mean
    of the current, already-trimmed population).
    """
    mean = pop.mean()
    stddev = pop.std()
    inliers = abs(pop - mean) <= 3 * stddev
    while not inliers.all():
        pop = pop[inliers]
        mean = pop.mean()
        stddev = pop.std()
        inliers = abs(pop - mean) <= 3 * stddev
    min_max_ratio = pop.min() / pop.max()
    disorder = stddev / (mean + stddev)
    return PopStats(mean, stddev, min_max_ratio, disorder)
def _flatten_to_dataframe(nt):
"""Flatten the result of _compute_global_cell_graph_features to the
DataFrame returned by compute_global_cell_graph_features.
"""
return DataFrame(_flatten_to_dict(nt), index=[0])
def _flatten_to_dict(nt, prefix=''):
result = {}
assert isinstance(nt, (tuple, dict))
if isinstance(nt, tuple):
d = nt._asdict()
else: # nt is a dict
# We only have numeric keys, and they may be non-nice floats,
# so just number them by sort order instead of doing something
# else for names
d = {str(i): kv[1] for i, kv in enumerate(sorted(nt.items()))}
for k, v in d.items():
if not isinstance(v, (tuple, dict)):
# Terminate
result[prefix + k] = v
else:
result.update(_flatten_to_dict(v, prefix + k + '_'))
return result