def default_view(self, **kwargs):
    """
    Returns a diagnostic view of this operation.

    Parameters
    ----------
    channels : list of str, optional
        The channels to plot; one or two, each of which must be in the
        operation's ``channels``.  Defaults to the operation's channels.

    scale : dict, optional
        Maps channel name to the scale to plot it with.  Defaults to the
        operation's ``scale``.  Channels without an entry get
        ``util.get_default_scale()``.

    density : bool, optional
        Only used with two channels: if true, return a
        ``FlowPeaks2DDensityView`` instead of a ``FlowPeaks2DView``.

    Any remaining keyword arguments are passed to the view's constructor.

    Returns
    -------
    IView : an IView, call plot() to see the diagnostic plot.

    Raises
    ------
    util.CytoflowViewError
        If a channel or scale key isn't one of the operation's channels,
        or if the number of channels is not one or two.
    """
    channels = kwargs.pop('channels', self.channels)

    # Work on a copy: the defaults filled in below must not leak into the
    # operation's own `scale` (the fallback value) or into a dict that the
    # caller still owns.
    scale = dict(kwargs.pop('scale', self.scale))
    density = kwargs.pop('density', False)

    for c in channels:
        if c not in self.channels:
            raise util.CytoflowViewError(
                "Channel {} isn't in the operation's channels".format(c))

    for s in scale:
        if s not in self.channels:
            raise util.CytoflowViewError(
                "Channel {} isn't in the operation's channels".format(s))

    for c in channels:
        if c not in scale:
            scale[c] = util.get_default_scale()

    if len(channels) == 0:
        raise util.CytoflowViewError(
            "Must specify at least one channel for a default view")
    elif len(channels) == 1:
        return FlowPeaks1DView(op=self,
                               channel=channels[0],
                               scale=scale[channels[0]],
                               **kwargs)
    elif len(channels) == 2:
        if density:
            return FlowPeaks2DDensityView(op=self,
                                          xchannel=channels[0],
                                          ychannel=channels[1],
                                          xscale=scale[channels[0]],
                                          yscale=scale[channels[1]],
                                          **kwargs)
        else:
            return FlowPeaks2DView(op=self,
                                   xchannel=channels[0],
                                   ychannel=channels[1],
                                   xscale=scale[channels[0]],
                                   yscale=scale[channels[1]],
                                   **kwargs)
    else:
        raise util.CytoflowViewError(
            "Can't specify more than two channels for a default view")
def default_view(self, **kwargs):
    """
    Returns a diagnostic plot of the k-means clustering.

    Parameters
    ----------
    channels : list of str, optional
        The channels to plot; one or two, each of which must be in the
        operation's ``channels``.  Defaults to the operation's channels.

    scale : dict, optional
        Maps channel name to the scale to plot it with.  Defaults to the
        operation's ``scale``.  Channels without an entry get
        ``util.get_default_scale()``.

    Any remaining keyword arguments are set as traits on the view.

    Returns
    -------
    IView : an IView, call :meth:`KMeans1DView.plot` to see the diagnostic
        plot.

    Raises
    ------
    util.CytoflowViewError
        If a channel or scale key isn't one of the operation's channels,
        or if the number of channels is not one or two.
    """
    channels = kwargs.pop('channels', self.channels)

    # Work on a copy: the defaults filled in below must not leak into the
    # operation's own `scale` (the fallback value) or into a dict that the
    # caller still owns.
    scale = dict(kwargs.pop('scale', self.scale))

    for c in channels:
        if c not in self.channels:
            raise util.CytoflowViewError(
                'channels',
                "Channel {} isn't in the operation's channels".format(c))

    for s in scale:
        if s not in self.channels:
            raise util.CytoflowViewError(
                'scale',
                "Channel {} isn't in the operation's channels".format(s))

    for c in channels:
        if c not in scale:
            scale[c] = util.get_default_scale()

    if len(channels) == 0:
        raise util.CytoflowViewError(
            'channels',
            "Must specify at least one channel for a default view")
    elif len(channels) == 1:
        v = KMeans1DView(op=self)
        v.trait_set(channel=channels[0],
                    scale=scale[channels[0]],
                    **kwargs)
        return v
    elif len(channels) == 2:
        v = KMeans2DView(op=self)
        v.trait_set(xchannel=channels[0],
                    ychannel=channels[1],
                    xscale=scale[channels[0]],
                    yscale=scale[channels[1]],
                    **kwargs)
        return v
    else:
        raise util.CytoflowViewError(
            'channels',
            "Can't specify more than two channels for a default view")
def plot(self, experiment, **kwargs):
    """
    Check the view's channels and scales, then hand off to the parent
    class's ``plot``.

    Parameters
    ----------
    lim : Dict(Str : (float, float))
        Set the range of each channel's axis.  Channels without an entry
        are passed on as ``None``; if unspecified, assume that the limits
        are the minimum and maximum of the clipped data
    """
    if experiment is None:
        raise util.CytoflowViewError('experiment', "No experiment specified")

    n_channels = len(self.channels)
    if n_channels == 0:
        raise util.CytoflowOpError('channels',
                                   "Must set at least one channel")

    if len(set(self.channels)) != n_channels:
        raise util.CytoflowOpError('channels',
                                   "Must not duplicate channels")

    for chan in self.channels:
        if chan in experiment.data:
            continue
        raise util.CytoflowOpError(
            'channels',
            "Channel {0} not found in the experiment".format(chan))

    for chan in self.scale:
        if chan in self.channels:
            continue
        raise util.CytoflowOpError(
            'scale',
            "Scale set for channel {0}, but it isn't in 'channels'"
            .format(chan))

    # build one scale object per channel, falling back to the default
    # scale for channels without an explicit setting
    scale = {}
    for chan in self.channels:
        if chan in self.scale:
            scale_name = self.scale[chan]
        else:
            scale_name = util.get_default_scale()
        scale[chan] = util.scale_factory(scale_name,
                                         experiment,
                                         channel=chan)

    # every channel gets a limits entry; None means "not specified"
    lim = kwargs.pop("lim", {})
    for chan in self.channels:
        lim.setdefault(chan, None)

    super().plot(experiment, lim=lim, scale=scale, **kwargs)
def estimate(self, experiment, subset=None):
    """
    Estimate the cluster, density and peak parameters from an experiment.

    For each group of events (one per unique combination of the ``by``
    columns, or a single group if ``by`` is empty) this:

    1. scales the channel data and drops events that scale to NaN,
    2. fits a ``MiniBatchKMeans`` clustering,
    3. builds one Gaussian component per cluster, with a covariance
       smoothed toward a diagonal matrix derived from the data range,
    4. finds a local maximum of the mixture density near each cluster
       mean using COBYLA, and
    5. merges peaks into groups based on ``tol`` and ``merge_dist``.

    Results are stored in ``_scale``, ``_kmeans``, ``_normals``,
    ``_density``, ``_peaks``, ``_cluster_peak`` and ``_cluster_group``.

    Parameters
    ----------
    experiment : Experiment
        The data to estimate the model from.

    subset : str (default = None)
        If set, an expression passed to ``experiment.query`` to select the
        events used for the estimate.

    Raises
    ------
    util.CytoflowOpError
        If the experiment, channels, scales or ``by`` columns are invalid,
        if a group has no data, or if an optimization step fails.

    util.CytoflowViewError
        If ``subset`` is invalid or selects no events.
    """
    if experiment is None:
        raise util.CytoflowOpError("No experiment specified")

    if len(self.channels) == 0:
        raise util.CytoflowOpError("Must set at least one channel")

    for c in self.channels:
        if c not in experiment.data:
            raise util.CytoflowOpError(
                "Channel {0} not found in the experiment".format(c))

    for c in self.scale:
        if c not in self.channels:
            # the check is against `channels`, so say so
            raise util.CytoflowOpError(
                "Scale set for channel {0}, but it isn't "
                "in `channels`".format(c))

    for b in self.by:
        if b not in experiment.data:
            raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                       " in the experiment".format(b))
        if len(experiment.data[b].unique()) > 100:  #WARNING - magic number
            raise util.CytoflowOpError(
                "More than 100 unique values found for"
                " aggregation metadata {0}. Did you"
                " accidentally specify a data channel?".format(b))

    if subset:
        try:
            experiment = experiment.query(subset)
        except Exception as e:
            # don't swallow KeyboardInterrupt / SystemExit
            raise util.CytoflowViewError(
                "Subset string '{0}' isn't valid".format(subset)) from e

        if len(experiment) == 0:
            raise util.CytoflowViewError(
                "Subset string '{0}' returned no events".format(subset))

    if self.by:
        groupby = experiment.data.groupby(self.by)
    else:
        # use a lambda expression to return a group that contains
        # all the events
        groupby = experiment.data.groupby(lambda _: True)

    # get the scale. estimate the scale params for the ENTIRE data set,
    # not subsets we get from groupby().  And we need to save it so that
    # the data is transformed the same way when we apply()
    for c in self.channels:
        if c in self.scale:
            self._scale[c] = util.scale_factory(self.scale[c],
                                                experiment,
                                                channel=c)
        else:
            self._scale[c] = util.scale_factory(util.get_default_scale(),
                                                experiment,
                                                channel=c)

    d = len(self.channels)

    for data_group, data_subset in groupby:
        if len(data_subset) == 0:
            raise util.CytoflowOpError(
                "Group {} had no data".format(data_group))

        x = data_subset.loc[:, self.channels[:]]
        for c in self.channels:
            x[c] = self._scale[c](x[c])

        # drop data that isn't in the scale range
        for c in self.channels:
            x = x[~(np.isnan(x[c]))]
        x = x.values

        #### choose the number of clusters and fit the kmeans
        num_clusters = int(np.ceil(np.median(
            [util.num_hist_bins(x[:, ci]) for ci in range(d)])))

        self._kmeans[data_group] = kmeans = \
            sklearn.cluster.MiniBatchKMeans(n_clusters=num_clusters)
        kmeans.fit(x)
        x_labels = kmeans.predict(x)

        #### use the kmeans centroids to parameterize a finite gaussian
        #### mixture model which estimates the density function

        # s0 is the diagonal matrix that cluster covariances are smoothed
        # toward; each entry is derived from the range of that CHANNEL,
        # i.e. column j of x (not row d, which is a single event).
        s0 = np.zeros([d, d])
        for j in range(d):
            r = x[:, j].max() - x[:, j].min()
            s0[j, j] = (r / (num_clusters ** (1. / d))) ** 0.5

        means = []
        weights = []
        normals = []
        beta_max = []

        for k in range(num_clusters):
            in_k = x_labels == k
            xk = x[in_k]
            num_k = np.sum(in_k)
            weight_k = num_k / len(x_labels)
            mu = xk.mean(axis=0)
            means.append(mu)
            s = np.cov(xk, rowvar=False)

            # small clusters lean more heavily on s0
            el = num_k / (num_clusters + num_k)
            s_smooth = el * self.h * s + (1.0 - el) * self.h0 * s0

            n = scipy.stats.multivariate_normal(mean=mu, cov=s_smooth)
            weights.append(weight_k)
            normals.append(lambda x, n=n: n.pdf(x))

            # step size for peak finding: the smallest standard deviation
            # along any axis of this cluster's smoothed covariance
            beta_max.append(np.sqrt(np.diagonal(s_smooth)).min())

        self._normals[data_group] = normals
        self._density[data_group] = density = \
            lambda x, weights=weights, normals=normals: np.sum(
                [w * n(x) for w, n in zip(weights, normals)], axis=0)

        ### use optimization on the finite gmm to find the local peak for
        ### each kmeans cluster
        peaks = []
        peak_clusters = []  # peak idx --> list of clusters

        # per-channel bounding box of the cluster means
        means_arr = np.array(means)
        min_mu = means_arr.min(axis=0)
        max_mu = means_arr.max(axis=0)

        # keep the search inside that box: coordinate ci of the candidate
        # must lie between the smallest and largest mean on channel ci
        constraints = []
        for ci in range(d):
            constraints.append({
                'type': 'ineq',
                'fun': lambda x, ci=ci, lo=min_mu[ci]: x[ci] - lo
            })
            constraints.append({
                'type': 'ineq',
                'fun': lambda x, ci=ci, hi=max_mu[ci]: hi - x[ci]
            })

        def neg_density(x):
            return -1.0 * density(x)

        # NOTE: the step-wise peak search described in the paper works
        # fine but is slow; COBYLA with an appropriate initial step
        # (rhobeg) gives similar results.
        for k in range(num_clusters):
            res = scipy.optimize.minimize(neg_density,
                                          means[k],
                                          method='COBYLA',
                                          constraints=constraints,
                                          options={
                                              'rhobeg': beta_max[k],
                                              'maxiter': 5000
                                          })
            if not res.success:
                raise util.CytoflowOpError(
                    "Peak finding failed for cluster {}: {}".format(
                        k, res.message))

            # clusters whose searches end at (nearly) the same point
            # share one peak
            merged = False
            for pi, p in enumerate(peaks):
                if np.linalg.norm(p - res.x) < (1e-2):
                    peak_clusters[pi].append(k)
                    merged = True
                    break

            if not merged:
                peak_clusters.append([k])
                peaks.append(res.x)

        ### merge peaks that are sufficiently close
        groups = [[pi] for pi in range(len(peaks))]

        def max_tol(p, q):
            # largest relative deviation of the density from its linear
            # interpolation along the segment from p to q
            f = lambda a: density(a[np.newaxis, :])

            def tol(t):
                zt = p + t * (q - p)
                fhat_zt = f(p) + t * (f(q) - f(p))
                return -1.0 * abs((f(zt) - fhat_zt) / fhat_zt)

            res = scipy.optimize.minimize_scalar(tol,
                                                 bounds=[0, 1],
                                                 method='Bounded')
            if res.status != 0:
                raise util.CytoflowOpError(
                    "tol optimization failed for {}, {}".format(p, q))
            return -1.0 * res.fun

        def nearest_neighbor_dist(k):
            # distance from cluster k's mean to the closest other mean
            return min((np.linalg.norm(means[k] - means[i])
                        for i in range(num_clusters) if i != k),
                       default=np.inf)

        sk = [nearest_neighbor_dist(k) for k in range(num_clusters)]

        def neighbor_scale(v):
            # nearest-neighbor distance of the cluster that v falls in
            return sk[kmeans.predict(v[np.newaxis, :])[0]]

        def can_merge(g, h):
            for pg in g:
                for ph in h:
                    vg = peaks[pg]
                    vh = peaks[ph]
                    dist_gh = np.linalg.norm(vg - vh)

                    if max_tol(vg, vh) < self.tol and dist_gh / (
                            neighbor_scale(vg)
                            + neighbor_scale(vh)) <= self.merge_dist:
                        return True

            return False

        while True:
            if len(groups) == 1:
                break

            # find closest mergable groups
            min_dist = np.inf
            for gi, g in enumerate(groups):
                for hi in range(gi + 1, len(groups)):
                    h = groups[hi]

                    if can_merge(g, h):
                        dist_gh = min(
                            np.linalg.norm(peaks[pg] - peaks[ph])
                            for pg in g for ph in h)

                        if dist_gh < min_dist:
                            min_gi = gi
                            min_hi = hi
                            min_dist = dist_gh

            if min_dist == np.inf:
                break

            # merge the groups
            groups[min_gi].extend(groups[min_hi])
            del groups[min_hi]

        cluster_group = [0] * num_clusters
        cluster_peaks = [0] * num_clusters

        for gi, g in enumerate(groups):
            for p in g:
                for cluster in peak_clusters[p]:
                    cluster_group[cluster] = gi
                    cluster_peaks[cluster] = p

        self._peaks[data_group] = peaks
        self._cluster_peak[data_group] = cluster_peaks
        self._cluster_group[data_group] = cluster_group
def estimate(self, experiment, subset = None):
    """
    Estimate the Gaussian mixture model parameters

    Fits one ``sklearn.mixture.GaussianMixture`` per group of events (one
    group per unique combination of the ``by`` columns, or a single group
    if ``by`` is empty) and stores them in ``_gmms``, keyed by group.  The
    components of each model are re-ordered by the Euclidean norm of
    their means.

    Parameters
    ----------
    experiment : Experiment
        The data to use to estimate the mixture parameters

    subset : str (default = None)
        If set, a Python expression to determine the subset of the data
        to use in the estimation.

    Raises
    ------
    util.CytoflowOpError
        If the experiment, channels, scales or ``by`` columns are invalid,
        if a group has no data, or if the estimator doesn't converge.

    util.CytoflowViewError
        If ``subset`` is invalid or selects no events.
    """
    if experiment is None:
        raise util.CytoflowOpError('experiment',
                                   "No experiment specified")

    if len(self.channels) == 0:
        raise util.CytoflowOpError('channels',
                                   "Must set at least one channel")

    if len(self.channels) != len(set(self.channels)):
        raise util.CytoflowOpError('channels',
                                   "Must not duplicate channels")

    for c in self.channels:
        if c not in experiment.data:
            raise util.CytoflowOpError('channels',
                                       "Channel {0} not found in the experiment"
                                       .format(c))

    for c in self.scale:
        if c not in self.channels:
            # this validates `scale` against `channels`, so report it
            # that way
            raise util.CytoflowOpError('scale',
                                       "Scale set for channel {0}, but it isn't "
                                       "in `channels`"
                                       .format(c))

    for b in self.by:
        if b not in experiment.data:
            raise util.CytoflowOpError('by',
                                       "Aggregation metadata {} not found, "
                                       "must be one of {}"
                                       .format(b, experiment.conditions))

    if subset:
        try:
            experiment = experiment.query(subset)
        except Exception as e:
            # don't swallow KeyboardInterrupt / SystemExit
            raise util.CytoflowViewError('subset',
                                         "Subset string '{0}' isn't valid"
                                         .format(subset)) from e

        if len(experiment) == 0:
            raise util.CytoflowViewError('subset',
                                         "Subset string '{0}' returned no events"
                                         .format(subset))

    if self.by:
        groupby = experiment.data.groupby(self.by)
    else:
        # use a lambda expression to return a group that contains
        # all the events
        groupby = experiment.data.groupby(lambda _: True)

    # get the scale. estimate the scale params for the ENTIRE data set,
    # not subsets we get from groupby().  And we need to save it so that
    # the data is transformed the same way when we apply()
    for c in self.channels:
        if c in self.scale:
            self._scale[c] = util.scale_factory(self.scale[c],
                                                experiment,
                                                channel = c)
        else:
            self._scale[c] = util.scale_factory(util.get_default_scale(),
                                                experiment,
                                                channel = c)

    # build the models locally and publish them in one assignment at the
    # end, so a failure part-way through leaves `_gmms` untouched
    gmms = {}

    for group, data_subset in groupby:
        if len(data_subset) == 0:
            raise util.CytoflowOpError(None,
                                       "Group {} had no data"
                                       .format(group))
        x = data_subset.loc[:, self.channels[:]]
        for c in self.channels:
            x[c] = self._scale[c](x[c])

        # drop data that isn't in the scale range
        for c in self.channels:
            x = x[~(np.isnan(x[c]))]
        x = x.values

        gmm = sklearn.mixture.GaussianMixture(n_components = self.num_components,
                                              covariance_type = "full",
                                              random_state = 1)
        gmm.fit(x)

        if not gmm.converged_:
            raise util.CytoflowOpError(None,
                                       "Estimator didn't converge"
                                       " for group {0}"
                                       .format(group))

        # in the 1D version, we sorted the components by the means -- so
        # the first component has the lowest mean, the second component
        # has the next-lowest mean, etc.

        # that doesn't work in the general case.  instead, we assume that
        # the clusters are likely (?) to be arranged along *one* of the
        # axes, so we take the |norm| of the mean of each cluster and
        # sort that way.

        norms = np.sum(gmm.means_ ** 2, axis = 1) ** 0.5
        sort_idx = np.argsort(norms)

        # every per-component array must be permuted the same way
        gmm.means_ = gmm.means_[sort_idx]
        gmm.weights_ = gmm.weights_[sort_idx]
        gmm.covariances_ = gmm.covariances_[sort_idx]
        gmm.precisions_ = gmm.precisions_[sort_idx]
        gmm.precisions_cholesky_ = gmm.precisions_cholesky_[sort_idx]

        gmms[group] = gmm

    self._gmms = gmms
def default_view(self, **kwargs):
    """
    Returns a diagnostic view of this operation.

    Parameters
    ----------
    channels : List(Str)
        Which channels to plot?  Must contain either one or two channels,
        each of which is in the operation's ``channels``.  Defaults to
        the operation's channels.

    scale : Dict(Str : {'linear', 'log', 'logicle'})
        How to scale the channels before plotting them, keyed by channel
        name.  Defaults to the operation's ``scale``; channels without an
        entry get ``util.get_default_scale()``.

    density : bool
        Should we plot a scatterplot or the estimated density function?
        Only used when two channels are given.

    Any remaining keyword arguments are set as traits on the view.

    Returns
    -------
    IView
        an IView, call :meth:`plot` to see the diagnostic plot.

    Raises
    ------
    util.CytoflowViewError
        If a channel or scale key isn't one of the operation's channels,
        or if the number of channels is not one or two.
    """
    channels = kwargs.pop('channels', self.channels)

    # Work on a copy: the defaults filled in below must not leak into the
    # operation's own `scale` (the fallback value) or into a dict that the
    # caller still owns.
    scale = dict(kwargs.pop('scale', self.scale))
    density = kwargs.pop('density', False)

    for c in channels:
        if c not in self.channels:
            raise util.CytoflowViewError(
                'channels',
                "Channel {} isn't in the operation's channels".format(c))

    for s in scale:
        if s not in self.channels:
            raise util.CytoflowViewError(
                'channels',
                "Channel {} isn't in the operation's channels".format(s))

    for c in channels:
        if c not in scale:
            scale[c] = util.get_default_scale()

    if len(channels) == 0:
        raise util.CytoflowViewError(
            'channels',
            "Must specify at least one channel for a default view")
    elif len(channels) == 1:
        v = FlowPeaks1DView(op=self)
        v.trait_set(channel=channels[0],
                    scale=scale[channels[0]],
                    **kwargs)
        return v
    elif len(channels) == 2:
        if density:
            v = FlowPeaks2DDensityView(op=self)
        else:
            v = FlowPeaks2DView(op=self)
        v.trait_set(xchannel=channels[0],
                    ychannel=channels[1],
                    xscale=scale[channels[0]],
                    yscale=scale[channels[1]],
                    **kwargs)
        return v
    else:
        raise util.CytoflowViewError(
            None,
            "Can't specify more than two channels for a default view")
def estimate(self, experiment, subset=None):
    """
    Estimate the decomposition

    Fits one ``sklearn.decomposition.PCA`` per group of events (one group
    per unique combination of the ``by`` columns, or a single group if
    ``by`` is empty) and stores it in ``_pca``, keyed by group.

    Parameters
    ----------
    experiment : Experiment
        The :class:`.Experiment` to use to estimate the decomposition

    subset : str (default = None)
        A Python expression that specifies a subset of the data in
        ``experiment`` to use to parameterize the operation.

    Raises
    ------
    util.CytoflowOpError
        If the experiment, channels, ``num_components``, scales, ``by``
        columns or ``subset`` are invalid, or if a group has no data.
    """
    if experiment is None:
        raise util.CytoflowOpError('experiment', "No experiment specified")

    if len(self.channels) == 0:
        raise util.CytoflowOpError('channels',
                                   "Must set at least one channel")

    for c in self.channels:
        if c not in experiment.data:
            raise util.CytoflowOpError(
                'channels',
                "Channel {0} not found in the experiment".format(c))

    if self.num_components > len(self.channels):
        raise util.CytoflowOpError(
            'num_components',
            "Number of components must be less than "
            "or equal to number of channels.")

    for c in self.scale:
        if c not in self.channels:
            raise util.CytoflowOpError(
                'scale',
                "Scale set for channel {0}, but it isn't "
                "in `channels`".format(c))

    for b in self.by:
        if b not in experiment.data:
            raise util.CytoflowOpError(
                'by',
                "Aggregation metadata {} not found, "
                "must be one of {}".format(b, experiment.conditions))

    if subset:
        try:
            experiment = experiment.query(subset)
        except Exception as e:
            # don't swallow KeyboardInterrupt / SystemExit
            raise util.CytoflowOpError(
                'subset',
                "Subset string '{0}' isn't valid".format(subset)) from e

        if len(experiment) == 0:
            raise util.CytoflowOpError(
                'subset',
                "Subset string '{0}' returned no events".format(subset))

    if self.by:
        groupby = experiment.data.groupby(self.by)
    else:
        # use a lambda expression to return a group that contains
        # all the events
        groupby = experiment.data.groupby(lambda _: True)

    # get the scale. estimate the scale params for the ENTIRE data set,
    # not subsets we get from groupby().  And we need to save it so that
    # the data is transformed the same way when we apply()
    for c in self.channels:
        if c in self.scale:
            self._scale[c] = util.scale_factory(self.scale[c],
                                                experiment,
                                                channel=c)
        else:
            self._scale[c] = util.scale_factory(util.get_default_scale(),
                                                experiment,
                                                channel=c)

    for group, data_subset in groupby:
        if len(data_subset) == 0:
            raise util.CytoflowOpError(
                'by', "Group {} had no data".format(group))
        x = data_subset.loc[:, self.channels[:]]
        for c in self.channels:
            x[c] = self._scale[c](x[c])

        # drop data that isn't in the scale range
        for c in self.channels:
            x = x[~(np.isnan(x[c]))]
        x = x.values

        # fixed random_state so repeated estimates give the same result
        self._pca[group] = pca = \
            sklearn.decomposition.PCA(n_components = self.num_components,
                                      whiten = self.whiten,
                                      random_state = 0)

        pca.fit(x)
def estimate(self, experiment, subset=None):
    """
    Estimate the k-means clusters

    Fits one ``sklearn.cluster.MiniBatchKMeans`` with ``num_clusters``
    clusters per group of events (one group per unique combination of the
    ``by`` columns, or a single group if ``by`` is empty) and stores it in
    ``_kmeans``, keyed by group.

    Parameters
    ----------
    experiment : Experiment
        The data to estimate the clusters from.

    subset : str (default = None)
        If set, an expression passed to ``experiment.query`` to select the
        events used for the estimate.

    Raises
    ------
    util.CytoflowOpError
        If the experiment, ``num_clusters``, channels, scales or ``by``
        columns are invalid, or if a group has no data.

    util.CytoflowViewError
        If ``subset`` is invalid or selects no events.
    """
    if experiment is None:
        raise util.CytoflowOpError("No experiment specified")

    if self.num_clusters < 2:
        raise util.CytoflowOpError("num_clusters must be >= 2")

    if len(self.channels) == 0:
        raise util.CytoflowOpError("Must set at least one channel")

    for c in self.channels:
        if c not in experiment.data:
            raise util.CytoflowOpError(
                "Channel {0} not found in the experiment".format(c))

    for c in self.scale:
        if c not in self.channels:
            # the check is against `channels`, so say so
            raise util.CytoflowOpError(
                "Scale set for channel {0}, but it isn't "
                "in `channels`".format(c))

    for b in self.by:
        if b not in experiment.data:
            raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                       " in the experiment".format(b))
        if len(experiment.data[b].unique()) > 100:  #WARNING - magic number
            raise util.CytoflowOpError(
                "More than 100 unique values found for"
                " aggregation metadata {0}. Did you"
                " accidentally specify a data channel?".format(b))

    if subset:
        try:
            experiment = experiment.query(subset)
        except Exception as e:
            # don't swallow KeyboardInterrupt / SystemExit
            raise util.CytoflowViewError(
                "Subset string '{0}' isn't valid".format(subset)) from e

        if len(experiment) == 0:
            raise util.CytoflowViewError(
                "Subset string '{0}' returned no events".format(subset))

    if self.by:
        groupby = experiment.data.groupby(self.by)
    else:
        # use a lambda expression to return a group that contains
        # all the events
        groupby = experiment.data.groupby(lambda _: True)

    # get the scale. estimate the scale params for the ENTIRE data set,
    # not subsets we get from groupby().  And we need to save it so that
    # the data is transformed the same way when we apply()
    for c in self.channels:
        if c in self.scale:
            self._scale[c] = util.scale_factory(self.scale[c],
                                                experiment,
                                                channel=c)
        else:
            self._scale[c] = util.scale_factory(util.get_default_scale(),
                                                experiment,
                                                channel=c)

    for group, data_subset in groupby:
        if len(data_subset) == 0:
            raise util.CytoflowOpError(
                "Group {} had no data".format(group))
        x = data_subset.loc[:, self.channels[:]]
        for c in self.channels:
            x[c] = self._scale[c](x[c])

        # drop data that isn't in the scale range
        for c in self.channels:
            x = x[~(np.isnan(x[c]))]
        x = x.values

        self._kmeans[group] = kmeans = \
            sklearn.cluster.MiniBatchKMeans(n_clusters = self.num_clusters)

        kmeans.fit(x)