def _grid_plot(self, experiment, grid, xlim, ylim, xscale, yscale, **kwargs):
    """
    Draw a 2D histogram of ``self.xchannel`` vs ``self.ychannel`` on ``grid``.

    Bin edges are spaced evenly in scaled coordinates between the axis
    limits and then mapped back to data coordinates with each scale's
    ``inverse``.

    Keyword arguments consumed here (everything else is forwarded to
    ``_hist2d`` through ``grid.map``):

    ``xbins``, ``ybins``
        Number of points used to build the X / Y bin edges.  Estimated with
        ``util.num_hist_bins`` from the non-NaN scaled data when absent.
    ``max_bins``
        Upper cap applied to both counts (default 100); a warning is issued
        when a count is capped.

    Returns an empty dict.
    """
    def drop_nan(values):
        # keep only the entries that the scale could represent
        return values[~np.isnan(values)]

    x_scaled = drop_nan(xscale(experiment[self.xchannel]))
    y_scaled = drop_nan(yscale(experiment[self.ychannel]))

    # find good bin counts
    x_count = kwargs.pop('xbins', util.num_hist_bins(x_scaled))
    y_count = kwargs.pop('ybins', util.num_hist_bins(y_scaled))
    cap = kwargs.pop('max_bins', 100)

    # the estimate is occasionally unreasonable, so clamp it
    if x_count > cap:
        warnings.warn("Capping X bins to {}! To increase this limit, "
                      "change max_bins".format(cap))
        x_count = cap

    if y_count > cap:
        warnings.warn("Capping Y bins to {}! To increase this limit, "
                      "change max_bins".format(cap))
        y_count = cap

    kwargs.setdefault('smoothed', False)

    x_edges_scaled = np.linspace(xscale(xlim[0]), xscale(xlim[1]), x_count)
    x_edges = xscale.inverse(x_edges_scaled)
    y_edges_scaled = np.linspace(yscale(ylim[0]), yscale(ylim[1]), y_count)
    y_edges = yscale.inverse(y_edges_scaled)

    for option, value in (('antialiased', False),
                          ('linewidth', 0),
                          ('edgecolors', 'face')):
        kwargs.setdefault(option, value)

    grid.map(_hist2d,
             self.xchannel,
             self.ychannel,
             xbins=x_edges,
             ybins=y_edges,
             **kwargs)

    return {}
def estimate(self, experiment, subset=None):
    """
    Estimate density peaks for ``self.channels`` and group the clusters.

    For every group produced by grouping the data on ``self.by`` (or for one
    group holding all events when ``self.by`` is empty) this:

    1. scales each channel and drops events that scale to NaN;
    2. fits a ``MiniBatchKMeans`` whose cluster count is the (rounded-up)
       median of the per-channel ``util.num_hist_bins`` estimates;
    3. builds a weighted mixture of smoothed multivariate normals, one per
       k-means cluster, and uses it as the density estimate;
    4. maximizes that density starting from each cluster mean (COBYLA) to
       find the cluster's peak, collapsing peaks closer than 1e-2;
    5. repeatedly merges the closest pair of peak groups that satisfies the
       ``self.tol`` and ``self.merge_dist`` criteria.

    Results are stored per group in ``self._kmeans``, ``self._normals``,
    ``self._density``, ``self._peaks``, ``self._cluster_peak`` and
    ``self._cluster_group``.  The scales are estimated on the whole
    (subsetted) experiment and kept in ``self._scale``.

    Parameters
    ----------
    experiment
        The experiment to estimate from; must not be ``None``.
    subset : str, optional
        Query string passed to ``experiment.query`` before estimating.

    Raises
    ------
    util.CytoflowOpError
        If the experiment, channels, scales or ``by`` entries are invalid,
        a group has no data, or one of the optimizations fails.
    util.CytoflowViewError
        If ``subset`` is not a valid query or selects no events.
    """
    if experiment is None:
        raise util.CytoflowOpError("No experiment specified")

    if len(self.channels) == 0:
        raise util.CytoflowOpError("Must set at least one channel")

    for c in self.channels:
        if c not in experiment.data:
            raise util.CytoflowOpError(
                "Channel {0} not found in the experiment".format(c))

    for c in self.scale:
        if c not in self.channels:
            raise util.CytoflowOpError(
                "Scale set for channel {0}, but it isn't "
                "in the experiment".format(c))

    for b in self.by:
        if b not in experiment.data:
            raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                       " in the experiment".format(b))
        if len(experiment.data[b].unique()) > 100:  # WARNING - magic number
            raise util.CytoflowOpError(
                "More than 100 unique values found for"
                " aggregation metadata {0}. Did you"
                " accidentally specify a data channel?".format(b))

    if subset:
        try:
            experiment = experiment.query(subset)
        except Exception:
            raise util.CytoflowViewError(
                "Subset string '{0}' isn't valid".format(subset))

        if len(experiment) == 0:
            raise util.CytoflowViewError(
                "Subset string '{0}' returned no events".format(subset))

    if self.by:
        groupby = experiment.data.groupby(self.by)
    else:
        # use a lambda expression to return a group that contains
        # all the events
        groupby = experiment.data.groupby(lambda _: True)

    # get the scale. estimate the scale params for the ENTIRE data set,
    # not subsets we get from groupby().  And we need to save it so that
    # the data is transformed the same way when we apply()
    for c in self.channels:
        if c in self.scale:
            self._scale[c] = util.scale_factory(self.scale[c],
                                                experiment,
                                                channel=c)
        else:
            self._scale[c] = util.scale_factory(util.get_default_scale(),
                                                experiment,
                                                channel=c)

    d = len(self.channels)

    for data_group, data_subset in groupby:
        if len(data_subset) == 0:
            raise util.CytoflowOpError(
                "Group {} had no data".format(data_group))

        x = data_subset.loc[:, self.channels[:]]
        for c in self.channels:
            x[c] = self._scale[c](x[c])

        # drop data that isn't in the scale range
        for c in self.channels:
            x = x[~(np.isnan(x[c]))]
        x = x.values

        #### choose the number of clusters and fit the kmeans
        num_clusters = [util.num_hist_bins(x[:, c]) for c in range(d)]
        num_clusters = int(np.ceil(np.median(num_clusters)))

        self._kmeans[data_group] = kmeans = \
            sklearn.cluster.MiniBatchKMeans(n_clusters=num_clusters)

        kmeans.fit(x)
        x_labels = kmeans.predict(x)

        #### use the kmeans centroids to parameterize a finite gaussian
        #### mixture model which estimates the density function
        s0 = np.zeros([d, d])
        for j in range(d):
            # range of channel j (a column of x), not of a single event row
            r = x[:, j].max() - x[:, j].min()
            s0[j, j] = (r / (num_clusters ** (1. / d))) ** 0.5

        means = []
        weights = []
        normals = []
        beta_max = []

        for k in range(num_clusters):
            in_cluster = (x_labels == k)
            xk = x[in_cluster]
            num_k = np.sum(in_cluster)
            weight_k = num_k / len(x_labels)
            mu = xk.mean(axis=0)
            means.append(mu)
            s = np.cov(xk, rowvar=False)

            # blend the cluster covariance with the global prior s0
            el = num_k / (num_clusters + num_k)
            s_smooth = el * self.h * s + (1.0 - el) * self.h0 * s0

            n = scipy.stats.multivariate_normal(mean=mu, cov=s_smooth)
            weights.append(weight_k)
            normals.append(lambda x, n=n: n.pdf(x))

            # step size for peak finding: the smallest standard deviation
            # along any axis of the smoothed covariance
            min_b = np.inf
            for b in np.diagonal(s_smooth):
                if np.sqrt(b) < min_b:
                    min_b = np.sqrt(b)
            beta_max.append(min_b)

        self._normals[data_group] = normals
        self._density[data_group] = density = \
            lambda x, weights=weights, normals=normals: np.sum(
                [w * n(x) for w, n in zip(weights, normals)], axis=0)

        ### use optimization on the finite gmm to find the local peak for
        ### each kmeans cluster
        peaks = []
        peak_clusters = []  # peak idx --> list of clusters

        # per-channel bounding box of the cluster means
        min_mu = [np.inf] * d
        max_mu = [-1.0 * np.inf] * d

        for mu in means:
            for ci in range(d):
                if mu[ci] < min_mu[ci]:
                    min_mu[ci] = mu[ci]
                if mu[ci] > max_mu[ci]:
                    max_mu[ci] = mu[ci]

        # keep the search inside the bounding box: constrain coordinate ci
        # against channel ci's bounds only ('ineq' means fun(x) >= 0)
        constraints = []
        for ci in range(d):
            constraints.append({
                'type': 'ineq',
                'fun': lambda x, ci=ci, lo=min_mu[ci]: x[ci] - lo
            })
            constraints.append({
                'type': 'ineq',
                'fun': lambda x, ci=ci, hi=max_mu[ci]: hi - x[ci]
            })

        def negative_density(x):
            return -1.0 * density(x)

        for k in range(num_clusters):
            # NOTE: the original comments say the peak-search algorithm from
            # the paper works but is slow, and that COBYLA with an
            # appropriate rho gives similar results.
            res = scipy.optimize.minimize(negative_density,
                                          means[k],
                                          method='COBYLA',
                                          constraints=constraints,
                                          options={
                                              'rhobeg': beta_max[k],
                                              'maxiter': 5000
                                          })
            if not res.success:
                raise util.CytoflowOpError(
                    "Peak finding failed for cluster {}: {}".format(
                        k, res.message))

            # clusters whose peaks (nearly) coincide share one peak
            merged = False
            for pi, p in enumerate(peaks):
                if np.linalg.norm(p - res.x) < (1e-2):
                    peak_clusters[pi].append(k)
                    merged = True
                    break

            if not merged:
                peak_clusters.append([k])
                peaks.append(res.x)

        ### merge peaks that are sufficiently close
        groups = [[pi] for pi in range(len(peaks))]

        def max_tol(x, y):
            # largest relative deviation of the density from the straight
            # line between its values at x and y, over the segment x -> y
            f = lambda a: density(a[np.newaxis, :])
            fx = f(x)
            fy = f(y)

            def tol(t):
                zt = x + t * (y - x)
                fhat_zt = fx + t * (fy - fx)
                return -1.0 * abs((f(zt) - fhat_zt) / fhat_zt)

            res = scipy.optimize.minimize_scalar(tol,
                                                 bounds=[0, 1],
                                                 method='Bounded')

            if res.status != 0:
                raise util.CytoflowOpError(
                    "tol optimization failed for {}, {}".format(x, y))
            return -1.0 * res.fun

        def nearest_neighbor_dist(k):
            # distance from cluster k's mean to the closest other mean
            min_dist = np.inf
            for i in range(num_clusters):
                if i == k:
                    continue
                dist = np.linalg.norm(means[k] - means[i])
                if dist < min_dist:
                    min_dist = dist
            return min_dist

        sk = [nearest_neighbor_dist(k) for k in range(num_clusters)]

        def s(x):
            # nearest-neighbor distance of the cluster that x falls in
            k = kmeans.predict(x[np.newaxis, :])[0]
            return sk[k]

        def can_merge(g, h):
            for pg in g:
                for ph in h:
                    vg = peaks[pg]
                    vh = peaks[ph]
                    dist_gh = np.linalg.norm(vg - vh)

                    if max_tol(vg, vh) < self.tol and dist_gh / (
                            s(vg) + s(vh)) <= self.merge_dist:
                        return True

            return False

        while len(groups) > 1:
            # find closest mergable groups
            min_dist = np.inf
            for gi in range(len(groups)):
                g = groups[gi]

                for hi in range(gi + 1, len(groups)):
                    h = groups[hi]

                    if can_merge(g, h):
                        dist_gh = np.inf
                        for pg in g:
                            vg = peaks[pg]
                            for ph in h:
                                vh = peaks[ph]
                                dist_gh = min(dist_gh,
                                              np.linalg.norm(vg - vh))

                        if dist_gh < min_dist:
                            min_gi = gi
                            min_hi = hi
                            min_dist = dist_gh

            if min_dist == np.inf:
                # nothing left that can be merged
                break

            # merge the groups
            groups[min_gi].extend(groups[min_hi])
            del groups[min_hi]

        cluster_group = [0] * num_clusters
        cluster_peaks = [0] * num_clusters

        for gi, g in enumerate(groups):
            for p in g:
                for cluster in peak_clusters[p]:
                    cluster_group[cluster] = gi
                    cluster_peaks[cluster] = p

        self._peaks[data_group] = peaks
        self._cluster_peak[data_group] = cluster_peaks
        self._cluster_group[data_group] = cluster_group
def plot(self, experiment, **kwargs):
    """
    Plot a faceted histogram view of a channel.

    Validates the view's channel and facets against ``experiment``,
    optionally restricts the data with ``self.subset``, chooses bin edges in
    scaled coordinates and draws one ``plt.hist`` per facet on a seaborn
    ``FacetGrid``.  ``kwargs`` are forwarded to ``plt.hist``; defaults are
    supplied for ``histtype``, ``alpha``, ``antialiased`` and ``bins``.

    Raises
    ------
    util.CytoflowViewError
        If the experiment, channel, a facet or the subset is invalid, or the
        subset selects no events.
    """
    if not experiment:
        raise util.CytoflowViewError("No experiment specified")

    if not self.channel:
        raise util.CytoflowViewError("Must specify a channel")

    if self.channel not in experiment.data:
        raise util.CytoflowViewError("Channel {0} not in the experiment"
                                     .format(self.channel))

    if self.xfacet and self.xfacet not in experiment.conditions:
        raise util.CytoflowViewError("X facet {0} not in the experiment"
                                     .format(self.xfacet))

    if self.yfacet and self.yfacet not in experiment.conditions:
        raise util.CytoflowViewError("Y facet {0} not in the experiment"
                                     .format(self.yfacet))

    if self.huefacet and self.huefacet not in experiment.conditions:
        raise util.CytoflowViewError("Hue facet {0} not in the experiment"
                                     .format(self.huefacet))

    if self.subset:
        try:
            data = experiment.query(self.subset).data.reset_index()
        except Exception:
            raise util.CytoflowViewError("Subset string '{0}' isn't valid"
                                         .format(self.subset))

        # check the queried data, not the untouched experiment
        if len(data) == 0:
            raise util.CytoflowViewError("Subset string '{0}' returned no events"
                                         .format(self.subset))
    else:
        data = experiment.data

    # get the scale
    scale = util.scale_factory(self.scale, experiment, self.channel)
    scaled_data = scale(data[self.channel])

    kwargs.setdefault('histtype', 'stepfilled')
    kwargs.setdefault('alpha', 0.5)
    kwargs.setdefault('antialiased', True)

    # estimate a "good" number of bins; see cytoflow.utility.num_hist_bins
    # for a reference.
    num_bins = util.num_hist_bins(scaled_data)

    # clip num_bins to (50, 1000)
    num_bins = max(min(num_bins, 1000), 50)

    xmin = bottleneck.nanmin(scaled_data)
    xmax = bottleneck.nanmax(scaled_data)

    if (self.huefacet
            and "bins" in experiment.metadata[self.huefacet]
            and experiment.metadata[self.huefacet]["bin_scale"] == self.scale):
        # if we color facet by the result of a BinningOp and we don't
        # match the BinningOp bins with the histogram bins, we get
        # gnarly aliasing.

        # each color gets at least one bin.  however, if the estimated
        # number of bins for the histogram is much larger than the
        # number of colors, sub-divide each color into multiple bins.
        bins = experiment.metadata[self.huefacet]["bins"]
        bins = np.append(bins, xmax)

        num_hues = len(data[self.huefacet].unique())
        bins_per_hue = math.ceil(num_bins / num_hues)

        new_bins = [xmin]
        for end in [b for b in bins if (b > xmin and b <= xmax)]:
            new_bins = np.append(new_bins,
                                 np.linspace(new_bins[-1],
                                             end,
                                             bins_per_hue + 1,
                                             endpoint=True)[1:])

        bins = scale.inverse(new_bins)
    else:
        bin_width = (xmax - xmin) / num_bins
        bins = scale.inverse(np.arange(xmin, xmax, bin_width))
        bins = np.append(bins, scale.inverse(xmax))

    # take care of a rare rounding error, where the last observation is
    # a little bit more than the last bin, which makes plt.hist() fail
    bins[-1] += 1

    kwargs.setdefault('bins', bins)

    # mask out the data that's not in the scale domain
    data = data[~np.isnan(scaled_data)]

    g = sns.FacetGrid(data,
                      size=6,
                      aspect=1.5,
                      col=(self.xfacet if self.xfacet else None),
                      row=(self.yfacet if self.yfacet else None),
                      hue=(self.huefacet if self.huefacet else None),
                      col_order=(np.sort(data[self.xfacet].unique()) if self.xfacet else None),
                      row_order=(np.sort(data[self.yfacet].unique()) if self.yfacet else None),
                      hue_order=(np.sort(data[self.huefacet].unique()) if self.huefacet else None),
                      legend_out=False,
                      sharex=False,
                      sharey=False)

    # set the scale for each set of axes; can't just call plt.xscale()
    for ax in g.axes.flatten():
        ax.set_xscale(self.scale, **scale.mpl_params)

    g.map(plt.hist, self.channel, **kwargs)

    # if we have a hue facet and a lot of hues, make a color bar instead
    # of a super-long legend.
    if self.huefacet:
        current_palette = mpl.rcParams['axes.color_cycle']
        if len(g.hue_names) > len(current_palette):
            plot_ax = plt.gca()
            cmap = mpl.colors.ListedColormap(
                sns.color_palette("husl", n_colors=len(g.hue_names)))
            cax, _ = mpl.colorbar.make_axes(plt.gca())
            norm = mpl.colors.Normalize(vmin=np.min(g.hue_names),
                                        vmax=np.max(g.hue_names),
                                        clip=False)
            mpl.colorbar.ColorbarBase(cax,
                                      cmap=cmap,
                                      norm=norm,
                                      label=self.huefacet)
            # make the histogram axes current again
            plt.sca(plot_ax)
        else:
            g.add_legend(title=self.huefacet)
def _grid_plot(self, experiment, grid, **kwargs):
    """
    Draw one histogram of ``self.channel`` per facet on ``grid``.

    Keyword arguments consumed here:

    ``scale``, ``lim`` (required)
        Mappings indexed by ``self.channel`` giving the scale object and the
        axis limits for the channel.
    ``num_bins`` (optional)
        Number of bins; estimated with ``util.num_hist_bins`` when absent or
        ``None``, then clipped to the range 100..1000.

    Defaults are filled in for ``histtype``, ``alpha``, ``antialiased``,
    ``bins``, ``orientation`` and ``linewidth``; the remaining keyword
    arguments are forwarded to ``plt.hist``.

    Returns a dict holding the scale and limits of the data axis
    (``xscale``/``xlim`` for vertical orientation, ``yscale``/``ylim``
    otherwise) and a count-axis limit of ``(0, 1.05 * largest bin value)``.
    """
    kwargs.setdefault('histtype', 'stepfilled')
    kwargs.setdefault('alpha', 0.5)
    kwargs.setdefault('antialiased', True)

    scale = kwargs.pop('scale')[self.channel]
    lim = kwargs.pop('lim')[self.channel]

    scaled_data = scale(experiment[self.channel])

    # estimate a "good" number of bins, but only when the caller did not
    # supply one; see cytoflow.utility.num_hist_bins for a reference.
    num_bins = kwargs.pop('num_bins', None)
    if num_bins is None:
        num_bins = util.num_hist_bins(scaled_data)

    # clip num_bins to (100, 1000)
    num_bins = max(min(num_bins, 1000), 100)

    if (self.huefacet
            and "bins" in experiment.metadata[self.huefacet]
            and experiment.metadata[self.huefacet]["bin_scale"] == self.scale):
        # if we color facet by the result of a BinningOp and we don't
        # match the BinningOp bins with the histogram bins, we get
        # gnarly aliasing.

        # each color gets at least one bin.  however, if the estimated
        # number of bins for the histogram is much larger than the
        # number of colors, sub-divide each color into multiple bins.
        bins = experiment.metadata[self.huefacet]["bins"]
        scaled_bins = scale(bins)

        num_hues = len(experiment[self.huefacet].unique())
        bins_per_hue = math.floor(num_bins / num_hues)

        if bins_per_hue == 1:
            new_bins = scaled_bins
        else:
            new_bins = []
            for idx in range(1, len(scaled_bins)):
                new_bins = np.append(
                    new_bins,
                    np.linspace(scaled_bins[idx - 1],
                                scaled_bins[idx],
                                bins_per_hue + 1,
                                endpoint=False))

        bins = scale.inverse(new_bins)
    else:
        xmin = bottleneck.nanmin(scaled_data)
        xmax = bottleneck.nanmax(scaled_data)
        bins = scale.inverse(
            np.linspace(xmin, xmax, num=int(num_bins), endpoint=True))

    kwargs.setdefault('bins', bins)
    kwargs.setdefault('orientation', 'vertical')

    # a missing or None linewidth falls back to a histtype-dependent default
    if kwargs.get('linewidth') is None:
        kwargs['linewidth'] = 0 if kwargs['histtype'] == "stepfilled" else 2

    # if we have a hue facet, the y scaling is frequently wrong.  this
    # will capture the maximum bin count of each call to plt.hist, so
    # we don't have to compute the histogram multiple times
    count_max = []

    def hist_lims(*args, **hist_kwargs):
        # there's some bug in the above code where we get data that isn't
        # in the range of `bins`, which makes hist() fail.  so get rid
        # of it.
        edges = hist_kwargs.get('bins')
        new_args = []
        for x in args:
            x = x[x > edges[0]]
            x = x[x < edges[-1]]
            new_args.append(x)

        if scale.name != "linear" and hist_kwargs.get("density"):
            # on a non-linear scale, draw per-bin fractions as weights
            # instead of letting plt.hist normalize
            hist_kwargs["density"] = False
            counts, _ = np.histogram(new_args, bins=hist_kwargs["bins"])
            hist_kwargs["weights"] = counts / np.sum(counts)
            n, _, _ = plt.hist(hist_kwargs["bins"][:-1], **hist_kwargs)
        else:
            n, _, _ = plt.hist(*new_args, **hist_kwargs)

        count_max.append(max(n))

    grid.map(hist_lims, self.channel, **kwargs)

    ret = {}
    if kwargs['orientation'] == 'vertical':
        ret['xscale'] = scale
        ret['xlim'] = lim
        ret['ylim'] = (0, 1.05 * max(count_max))
    else:
        ret['yscale'] = scale
        ret['ylim'] = lim
        ret['xlim'] = (0, 1.05 * max(count_max))

    return ret
def plot(self, experiment, **kwargs):
    """
    Plot a faceted histogram view of a channel.

    Validates the view against ``experiment``, optionally restricts the data
    with ``self.subset``, chooses bin edges in scaled coordinates and draws
    one ``plt.hist`` per facet on a seaborn ``FacetGrid``.

    Keyword arguments consumed here (the rest go to ``plt.hist``):
    ``col_wrap``, ``scale``, ``min_quantile`` (default 0.001),
    ``max_quantile`` (default 0.999), ``xlim``, ``sharex`` (default True),
    ``sharey`` (default True) and ``legend`` (default True).

    Returns the ``FacetGrid``.

    Raises
    ------
    util.CytoflowViewError
        If the experiment, channel, facets, ``col_wrap`` combination or
        subset is invalid, or the subset selects no events.
    """
    if not experiment:
        raise util.CytoflowViewError("No experiment specified")

    if not self.channel:
        raise util.CytoflowViewError("Must specify a channel")

    if self.channel not in experiment.data:
        raise util.CytoflowViewError(
            "Channel {0} not in the experiment".format(self.channel))

    if self.xfacet and self.xfacet not in experiment.conditions:
        raise util.CytoflowViewError(
            "X facet {0} not in the experiment".format(self.xfacet))

    if self.yfacet and self.yfacet not in experiment.conditions:
        raise util.CytoflowViewError(
            "Y facet {0} not in the experiment".format(self.yfacet))

    if self.huefacet and self.huefacet not in experiment.conditions:
        raise util.CytoflowViewError(
            "Hue facet {0} not in the experiment".format(self.huefacet))

    # a list, not filter(): filter() returns an iterator without len() on
    # Python 3
    facets = [f for f in (self.xfacet, self.yfacet, self.huefacet) if f]
    if len(facets) != len(set(facets)):
        raise util.CytoflowViewError("Can't reuse facets")

    col_wrap = kwargs.pop('col_wrap', None)

    if col_wrap and self.yfacet:
        raise util.CytoflowViewError(
            "Can't set yfacet and col_wrap at the same time.")

    if col_wrap and not self.xfacet:
        raise util.CytoflowViewError("Must set xfacet to use col_wrap.")

    if self.subset:
        try:
            data = experiment.query(self.subset).data.reset_index()
        except util.CytoflowError as e:
            raise util.CytoflowViewError(str(e))
        except Exception:
            raise util.CytoflowViewError(
                "Subset string '{0}' isn't valid".format(self.subset))

        if len(data) == 0:
            raise util.CytoflowViewError(
                "Subset string '{0}' returned no events".format(self.subset))
    else:
        data = experiment.data

    # get the scale
    scale = kwargs.pop('scale', None)
    if scale is None:
        scale = util.scale_factory(self.scale,
                                   experiment,
                                   channel=self.channel)

    scaled_data = scale(data[self.channel])

    kwargs.setdefault('histtype', 'stepfilled')
    kwargs.setdefault('alpha', 0.5)
    kwargs.setdefault('antialiased', True)

    # estimate a "good" number of bins; see cytoflow.utility.num_hist_bins
    # for a reference.
    num_bins = util.num_hist_bins(scaled_data)

    # clip num_bins to (50, 1000)
    num_bins = max(min(num_bins, 1000), 50)

    xmin = bottleneck.nanmin(scaled_data)
    xmax = bottleneck.nanmax(scaled_data)

    if (self.huefacet
            and "bins" in experiment.metadata[self.huefacet]
            and experiment.metadata[self.huefacet]["bin_scale"] == self.scale):
        # if we color facet by the result of a BinningOp and we don't
        # match the BinningOp bins with the histogram bins, we get
        # gnarly aliasing.

        # each color gets at least one bin.  however, if the estimated
        # number of bins for the histogram is much larger than the
        # number of colors, sub-divide each color into multiple bins.
        bins = experiment.metadata[self.huefacet]["bins"]
        bins = np.append(bins, xmax)

        num_hues = len(data[self.huefacet].unique())
        bins_per_hue = math.ceil(num_bins / num_hues)

        new_bins = [xmin]
        for end in [b for b in bins if (b > xmin and b <= xmax)]:
            new_bins = np.append(
                new_bins,
                np.linspace(new_bins[-1],
                            end,
                            bins_per_hue + 1,
                            endpoint=True)[1:])

        bins = scale.inverse(new_bins)
    else:
        bin_width = (xmax - xmin) / num_bins
        bins = scale.inverse(np.arange(xmin, xmax, bin_width))
        bins = np.append(bins, scale.inverse(xmax))

    # take care of a rare rounding error, where the first observation is
    # less than the first bin or the last observation is more than the last
    # bin, which makes plt.hist() fail
    bins[-1] += 1
    bins[0] -= 1

    kwargs.setdefault('bins', bins)

    # mask out the data that's not in the scale domain
    data = data[~np.isnan(scaled_data)]

    # adjust the limits to clip extreme values
    min_quantile = kwargs.pop("min_quantile", 0.001)
    max_quantile = kwargs.pop("max_quantile", 0.999)

    xlim = kwargs.pop("xlim", None)
    if xlim is None:
        xlim = (data[self.channel].quantile(min_quantile),
                data[self.channel].quantile(max_quantile))

    sharex = kwargs.pop("sharex", True)
    sharey = kwargs.pop("sharey", True)

    # number of columns, used to shrink each facet
    cols = col_wrap if col_wrap else \
        len(data[self.xfacet].unique()) if self.xfacet else 1

    g = sns.FacetGrid(data,
                      size=6 / cols,
                      aspect=1.5,
                      col=(self.xfacet if self.xfacet else None),
                      row=(self.yfacet if self.yfacet else None),
                      hue=(self.huefacet if self.huefacet else None),
                      col_order=(np.sort(data[self.xfacet].unique()) if self.xfacet else None),
                      row_order=(np.sort(data[self.yfacet].unique()) if self.yfacet else None),
                      hue_order=(np.sort(data[self.huefacet].unique()) if self.huefacet else None),
                      col_wrap=col_wrap,
                      legend_out=False,
                      sharex=sharex,
                      sharey=sharey,
                      xlim=xlim)

    # set the scale for each set of axes; can't just call plt.xscale()
    for ax in g.axes.flatten():
        ax.set_xscale(self.scale, **scale.mpl_params)

    legend = kwargs.pop('legend', True)

    g.map(plt.hist, self.channel, **kwargs)

    # if we are sharing y axes, make sure the y scale is the same for each
    if sharey:
        fig = plt.gcf()
        fig_y_max = float("-inf")

        for ax in fig.get_axes():
            _, ax_y_max = ax.get_ylim()
            if ax_y_max > fig_y_max:
                fig_y_max = ax_y_max

        for ax in fig.get_axes():
            ax.set_ylim(None, fig_y_max)

    # if we are sharing x axes, make sure the x scale is the same for each
    if sharex:
        fig = plt.gcf()
        fig_x_min = float("inf")
        fig_x_max = float("-inf")

        for ax in fig.get_axes():
            ax_x_min, ax_x_max = ax.get_xlim()
            if ax_x_min < fig_x_min:
                fig_x_min = ax_x_min
            if ax_x_max > fig_x_max:
                fig_x_max = ax_x_max

        for ax in fig.get_axes():
            ax.set_xlim(fig_x_min, fig_x_max)

    # if we have a hue facet, the y scaling is frequently wrong.
    if self.huefacet:
        h = np.histogram(data[self.channel], bins=bins)
        ymax = np.max(h[0])
        plt.ylim(0, 1.1 * ymax)

    # if we have a hue facet and a lot of hues, make a color bar instead
    # of a super-long legend.
    if self.huefacet and legend:
        current_palette = mpl.rcParams['axes.color_cycle']
        if util.is_numeric(experiment.data[self.huefacet]) and \
                len(g.hue_names) > len(current_palette):
            plot_ax = plt.gca()
            cmap = mpl.colors.ListedColormap(
                sns.color_palette("husl", n_colors=len(g.hue_names)))
            cax, _ = mpl.colorbar.make_axes(plt.gca())
            norm = mpl.colors.Normalize(vmin=np.min(g.hue_names),
                                        vmax=np.max(g.hue_names),
                                        clip=False)
            mpl.colorbar.ColorbarBase(cax,
                                      cmap=cmap,
                                      norm=norm,
                                      label=self.huefacet)
            # make the histogram axes current again
            plt.sca(plot_ax)
        else:
            g.add_legend(title=self.huefacet)

    return g
def plot(self, experiment, **kwargs):
    """
    Plot a faceted 2D histogram of ``self.xchannel`` vs ``self.ychannel``.

    Validates the view against ``experiment``, optionally restricts the data
    with ``self.subset``, drops events either scale maps to NaN, computes
    bin edges with ``np.histogram2d`` in scaled coordinates and draws
    ``_hist2d`` per facet on a seaborn ``FacetGrid``.

    Keyword arguments consumed here (the rest go to ``_hist2d``):
    ``col_wrap``, ``min_quantile`` (default 0.001), ``max_quantile``
    (default 0.999), ``xlim``, ``ylim``, ``sharex`` and ``sharey`` (both
    default True).  ``smoothed`` (default False) halves the bin counts.

    Raises
    ------
    util.CytoflowViewError
        If the experiment, channels, facets, ``col_wrap`` combination or
        subset is invalid, or the subset selects no events.
    """
    if not experiment:
        raise util.CytoflowViewError("No experiment specified")

    if not self.xchannel:
        raise util.CytoflowViewError("X channel not specified")

    if self.xchannel not in experiment.data:
        raise util.CytoflowViewError(
            "X channel {0} not in the experiment".format(self.xchannel))

    if not self.ychannel:
        raise util.CytoflowViewError("Y channel not specified")

    if self.ychannel not in experiment.data:
        raise util.CytoflowViewError(
            "Y channel {0} not in the experiment".format(self.ychannel))

    if self.xfacet and self.xfacet not in experiment.conditions:
        raise util.CytoflowViewError(
            "X facet {0} not in the experiment".format(self.xfacet))

    if self.yfacet and self.yfacet not in experiment.conditions:
        raise util.CytoflowViewError(
            "Y facet {0} not in the experiment".format(self.yfacet))

    if self.huefacet and self.huefacet not in experiment.metadata:
        raise util.CytoflowViewError(
            "Hue facet {0} not in the experiment".format(self.huefacet))

    # a list, not filter(): filter() returns an iterator without len() on
    # Python 3
    facets = [f for f in (self.xfacet, self.yfacet, self.huefacet) if f]
    if len(facets) != len(set(facets)):
        raise util.CytoflowViewError("Can't reuse facets")

    col_wrap = kwargs.pop('col_wrap', None)

    if col_wrap and self.yfacet:
        raise util.CytoflowViewError(
            "Can't set yfacet and col_wrap at the same time.")

    if col_wrap and not self.xfacet:
        raise util.CytoflowViewError("Must set xfacet to use col_wrap.")

    if self.subset:
        try:
            data = experiment.query(self.subset).data.reset_index()
        except Exception:
            raise util.CytoflowViewError(
                "Subset string '{0}' not valid".format(self.subset))

        if len(data) == 0:
            raise util.CytoflowViewError(
                "Subset string '{0}' returned no events".format(self.subset))
    else:
        data = experiment.data

    xscale = util.scale_factory(self.xscale,
                                experiment,
                                channel=self.xchannel)
    yscale = util.scale_factory(self.yscale,
                                experiment,
                                channel=self.ychannel)

    kwargs['xscale'] = xscale
    kwargs['yscale'] = yscale

    # drop events outside the X scale's domain
    scaled_xdata = xscale(data[self.xchannel])
    x_ok = ~np.isnan(scaled_xdata)
    data = data[x_ok]
    scaled_xdata = scaled_xdata[x_ok]

    # drop events outside the Y scale's domain; apply the same mask to the
    # X values so both arrays stay the same length for np.histogram2d
    scaled_ydata = yscale(data[self.ychannel])
    y_ok = ~np.isnan(scaled_ydata)
    data = data[y_ok]
    scaled_xdata = scaled_xdata[y_ok]
    scaled_ydata = scaled_ydata[y_ok]

    # find good bin counts
    num_xbins = util.num_hist_bins(scaled_xdata)
    num_ybins = util.num_hist_bins(scaled_ydata)

    # there are situations where this produces an unreasonable estimate.
    if num_xbins > self._max_bins:
        warnings.warn("Capping X bins to {}! To increase this limit, "
                      "change _max_bins".format(self._max_bins))
        num_xbins = self._max_bins

    if num_ybins > self._max_bins:
        warnings.warn("Capping Y bins to {}! To increase this limit, "
                      "change _max_bins".format(self._max_bins))
        num_ybins = self._max_bins

    kwargs.setdefault('smoothed', False)

    if kwargs['smoothed']:
        # halve with integer arithmetic: np.histogram2d needs whole-number
        # bin counts, and at least one bin
        num_xbins = max(int(num_xbins) // 2, 1)
        num_ybins = max(int(num_ybins) // 2, 1)

    _, xedges, yedges = np.histogram2d(scaled_xdata,
                                       scaled_ydata,
                                       bins=(num_xbins, num_ybins))

    kwargs['xedges'] = xscale.inverse(xedges)
    kwargs['yedges'] = yscale.inverse(yedges)

    kwargs.setdefault('antialiased', True)

    # adjust the limits to clip extreme values
    min_quantile = kwargs.pop("min_quantile", 0.001)
    max_quantile = kwargs.pop("max_quantile", 0.999)

    xlim = kwargs.pop("xlim", None)
    if xlim is None:
        xlim = (data[self.xchannel].quantile(min_quantile),
                data[self.xchannel].quantile(max_quantile))

    ylim = kwargs.pop("ylim", None)
    if ylim is None:
        ylim = (data[self.ychannel].quantile(min_quantile),
                data[self.ychannel].quantile(max_quantile))

    sharex = kwargs.pop('sharex', True)
    sharey = kwargs.pop('sharey', True)

    # number of columns, used to shrink each facet
    cols = col_wrap if col_wrap else \
        len(data[self.xfacet].unique()) if self.xfacet else 1

    g = sns.FacetGrid(data,
                      size=(6 / cols),
                      aspect=1.5,
                      col=(self.xfacet if self.xfacet else None),
                      row=(self.yfacet if self.yfacet else None),
                      hue=(self.huefacet if self.huefacet else None),
                      col_order=(np.sort(data[self.xfacet].unique()) if self.xfacet else None),
                      row_order=(np.sort(data[self.yfacet].unique()) if self.yfacet else None),
                      hue_order=(np.sort(data[self.huefacet].unique()) if self.huefacet else None),
                      col_wrap=col_wrap,
                      sharex=sharex,
                      sharey=sharey,
                      xlim=xlim,
                      ylim=ylim)

    for ax in g.axes.flatten():
        ax.set_xscale(self.xscale, **xscale.mpl_params)
        ax.set_yscale(self.yscale, **yscale.mpl_params)

    g.map(_hist2d, self.xchannel, self.ychannel, **kwargs)

    # if we are sharing x axes, make sure the x scale is the same for each
    if sharex:
        fig = plt.gcf()
        fig_x_min = float("inf")
        fig_x_max = float("-inf")

        for ax in fig.get_axes():
            ax_x_min, ax_x_max = ax.get_xlim()
            if ax_x_min < fig_x_min:
                fig_x_min = ax_x_min
            if ax_x_max > fig_x_max:
                fig_x_max = ax_x_max

        for ax in fig.get_axes():
            ax.set_xlim(fig_x_min, fig_x_max)

    # if we are sharing y axes, make sure the y scale is the same for each
    if sharey:
        fig = plt.gcf()
        fig_y_min = float("inf")
        fig_y_max = float("-inf")

        for ax in fig.get_axes():
            ax_y_min, ax_y_max = ax.get_ylim()
            if ax_y_min < fig_y_min:
                fig_y_min = ax_y_min
            if ax_y_max > fig_y_max:
                fig_y_max = ax_y_max

        for ax in fig.get_axes():
            ax.set_ylim(fig_y_min, fig_y_max)

    # if we have a hue facet and a lot of hues, make a color bar instead
    # of a super-long legend.
    if self.huefacet:
        current_palette = mpl.rcParams['axes.color_cycle']
        if util.is_numeric(experiment.data[self.huefacet]) and \
                len(g.hue_names) > len(current_palette):
            plot_ax = plt.gca()
            cmap = mpl.colors.ListedColormap(
                sns.color_palette("husl", n_colors=len(g.hue_names)))
            cax, _ = mpl.colorbar.make_axes(plt.gca())
            hue_scale = util.scale_factory(self.huescale,
                                           experiment,
                                           condition=self.huefacet)
            mpl.colorbar.ColorbarBase(cax,
                                      cmap=cmap,
                                      norm=hue_scale.color_norm(),
                                      label=self.huefacet)
            # make the plot axes current again
            plt.sca(plot_ax)
        else:
            g.add_legend(title=self.huefacet)
def _grid_plot(self, experiment, grid, xlim, ylim, xscale, yscale, **kwargs):
    """
    Draw one histogram of ``self.channel`` per facet on ``grid``.

    Bin edges are chosen in ``xscale`` coordinates and mapped back with
    ``xscale.inverse``.  Defaults are filled in for ``histtype``, ``alpha``,
    ``antialiased`` and ``bins``; ``kwargs`` are forwarded to ``plt.hist``.
    After drawing, the current axes' y limit is set to 1.05 times the
    largest bin value seen.  ``xlim``, ``ylim`` and ``yscale`` are accepted
    but not used in this body.

    Returns an empty dict.
    """
    kwargs.setdefault('histtype', 'stepfilled')
    kwargs.setdefault('alpha', 0.5)
    kwargs.setdefault('antialiased', True)

    # estimate a "good" number of bins; see cytoflow.utility.num_hist_bins
    # for a reference.
    scaled_data = xscale(experiment[self.channel])
    num_bins = util.num_hist_bins(scaled_data)

    # clip num_bins to (100, 1000)
    num_bins = max(min(num_bins, 1000), 100)

    if (self.huefacet
            and "bins" in experiment.metadata[self.huefacet]
            and experiment.metadata[self.huefacet]["bin_scale"] == self.scale):
        # if we color facet by the result of a BinningOp and we don't
        # match the BinningOp bins with the histogram bins, we get
        # gnarly aliasing.

        # each color gets at least one bin.  however, if the estimated
        # number of bins for the histogram is much larger than the
        # number of colors, sub-divide each color into multiple bins.
        bins = experiment.metadata[self.huefacet]["bins"]
        scaled_bins = xscale(bins)

        num_hues = len(experiment[self.huefacet].unique())
        bins_per_hue = math.floor(num_bins / num_hues)

        if bins_per_hue == 1:
            new_bins = scaled_bins
        else:
            new_bins = []
            for idx in range(1, len(scaled_bins)):
                new_bins = np.append(
                    new_bins,
                    np.linspace(scaled_bins[idx - 1],
                                scaled_bins[idx],
                                bins_per_hue + 1,
                                endpoint=False))

        bins = xscale.inverse(new_bins)
    else:
        xmin = bottleneck.nanmin(scaled_data)
        xmax = bottleneck.nanmax(scaled_data)
        # endpoint=True already makes xmax the last edge, so no extra edge
        # is appended (that would create a zero-width final bin)
        bins = xscale.inverse(
            np.linspace(xmin, xmax, num=int(num_bins), endpoint=True))

    kwargs.setdefault('bins', bins)

    # if we have a hue facet, the y scaling is frequently wrong.  this
    # will capture the maximum bin count of each call to plt.hist, so
    # we don't have to compute the histogram multiple times
    ymax = []

    def hist_lims(*args, **hist_kwargs):
        # there's some bug in the above code where we get data that isn't
        # in the range of `bins`, which makes hist() fail.  so get rid
        # of it.
        edges = hist_kwargs.get('bins')
        new_args = []
        for x in args:
            x = x[x > edges[0]]
            x = x[x < edges[-1]]
            new_args.append(x)

        n, _, _ = plt.hist(*new_args, **hist_kwargs)
        ymax.append(max(n))

    grid.map(hist_lims, self.channel, **kwargs)

    # nothing was drawn if hist_lims was never called; leave the limits alone
    if ymax:
        plt.ylim(0, 1.05 * max(ymax))

    return {}
def plot(self, experiment, **kwargs):
    """
    Plot a faceted 2D histogram of ``self.xchannel`` vs ``self.ychannel``.

    Validates the view against ``experiment``, optionally restricts the data
    with ``self.subset``, drops events either scale maps to NaN, computes
    bin edges with ``np.histogram2d`` in scaled coordinates and draws
    ``_hist2d`` per facet on a seaborn ``FacetGrid``.  ``kwargs`` are
    forwarded to ``_hist2d``; ``smoothed`` (default False) halves the bin
    counts.

    Raises
    ------
    util.CytoflowViewError
        If the experiment, channels, facets or subset is invalid, or the
        subset selects no events.
    """
    if not experiment:
        raise util.CytoflowViewError("No experiment specified")

    if not self.xchannel:
        raise util.CytoflowViewError("X channel not specified")

    if self.xchannel not in experiment.data:
        raise util.CytoflowViewError("X channel {0} not in the experiment"
                                     .format(self.xchannel))

    if not self.ychannel:
        raise util.CytoflowViewError("Y channel not specified")

    if self.ychannel not in experiment.data:
        raise util.CytoflowViewError("Y channel {0} not in the experiment"
                                     .format(self.ychannel))

    if self.xfacet and self.xfacet not in experiment.conditions:
        raise util.CytoflowViewError("X facet {0} not in the experiment"
                                     .format(self.xfacet))

    if self.yfacet and self.yfacet not in experiment.conditions:
        raise util.CytoflowViewError("Y facet {0} not in the experiment"
                                     .format(self.yfacet))

    if self.huefacet and self.huefacet not in experiment.metadata:
        raise util.CytoflowViewError("Hue facet {0} not in the experiment"
                                     .format(self.huefacet))

    if self.subset:
        try:
            data = experiment.query(self.subset).data.reset_index()
        except Exception:
            raise util.CytoflowViewError("Subset string '{0}' not valid"
                                         .format(self.subset))

        if len(data.index) == 0:
            raise util.CytoflowViewError("Subset string '{0}' returned no events"
                                         .format(self.subset))
    else:
        data = experiment.data

    xscale = util.scale_factory(self.xscale, experiment, self.xchannel)
    yscale = util.scale_factory(self.yscale, experiment, self.ychannel)

    kwargs['xscale'] = xscale
    kwargs['yscale'] = yscale

    # drop events outside the X scale's domain
    scaled_xdata = xscale(data[self.xchannel])
    x_ok = ~np.isnan(scaled_xdata)
    data = data[x_ok]
    scaled_xdata = scaled_xdata[x_ok]

    # drop events outside the Y scale's domain; apply the same mask to the
    # X values so both arrays stay the same length for np.histogram2d
    scaled_ydata = yscale(data[self.ychannel])
    y_ok = ~np.isnan(scaled_ydata)
    data = data[y_ok]
    scaled_xdata = scaled_xdata[y_ok]
    scaled_ydata = scaled_ydata[y_ok]

    # find good bin counts
    num_xbins = util.num_hist_bins(scaled_xdata)
    num_ybins = util.num_hist_bins(scaled_ydata)

    # there are situations where this produces an unreasonable estimate.
    if num_xbins > self._max_bins:
        warnings.warn("Capping X bins to {}! To increase this limit, "
                      "change _max_bins"
                      .format(self._max_bins))
        num_xbins = self._max_bins

    if num_ybins > self._max_bins:
        warnings.warn("Capping Y bins to {}! To increase this limit, "
                      "change _max_bins"
                      .format(self._max_bins))
        num_ybins = self._max_bins

    kwargs.setdefault('smoothed', False)

    if kwargs['smoothed']:
        # halve with integer arithmetic: np.histogram2d needs whole-number
        # bin counts, and at least one bin
        num_xbins = max(int(num_xbins) // 2, 1)
        num_ybins = max(int(num_ybins) // 2, 1)

    _, xedges, yedges = np.histogram2d(scaled_xdata,
                                       scaled_ydata,
                                       bins=(num_xbins, num_ybins))

    kwargs['xedges'] = xscale.inverse(xedges)
    kwargs['yedges'] = yscale.inverse(yedges)

    kwargs.setdefault('antialiased', True)

    g = sns.FacetGrid(data,
                      size=6,
                      aspect=1.5,
                      col=(self.xfacet if self.xfacet else None),
                      row=(self.yfacet if self.yfacet else None),
                      hue=(self.huefacet if self.huefacet else None),
                      col_order=(np.sort(data[self.xfacet].unique()) if self.xfacet else None),
                      row_order=(np.sort(data[self.yfacet].unique()) if self.yfacet else None),
                      hue_order=(np.sort(data[self.huefacet].unique()) if self.huefacet else None),
                      sharex=False,
                      sharey=False)

    for ax in g.axes.flatten():
        ax.set_xscale(self.xscale, **xscale.mpl_params)
        ax.set_yscale(self.yscale, **yscale.mpl_params)

    g.map(_hist2d, self.xchannel, self.ychannel, **kwargs)

    # if we have a hue facet and a lot of hues, make a color bar instead
    # of a super-long legend.
    if self.huefacet:
        current_palette = mpl.rcParams['axes.color_cycle']
        if len(g.hue_names) > len(current_palette):
            plot_ax = plt.gca()
            cmap = mpl.colors.ListedColormap(
                sns.color_palette("husl", n_colors=len(g.hue_names)))
            cax, _ = mpl.colorbar.make_axes(plt.gca())
            norm = mpl.colors.Normalize(vmin=np.min(g.hue_names),
                                        vmax=np.max(g.hue_names),
                                        clip=False)
            mpl.colorbar.ColorbarBase(cax,
                                      cmap=cmap,
                                      norm=norm,
                                      label=self.huefacet)
            # make the plot axes current again
            plt.sca(plot_ax)
        else:
            g.add_legend(title=self.huefacet)
def plot(self, experiment, **kwargs):
    """
    Plot a faceted hexbin view of ``xchannel`` against ``ychannel``.

    Parameters
    ----------
    experiment
        The experiment to plot.  This method reads ``experiment.data``,
        ``experiment.conditions``, ``experiment.metadata``, indexes the
        experiment by channel name, and calls ``experiment.query()`` when
        ``self.subset`` is set.
    **kwargs
        Forwarded to ``plt.hexbin`` through ``FacetGrid.map``.  Defaults
        are supplied for ``edgecolor``, ``antialiased``, ``extent`` and
        ``bins`` when the caller does not give them.

    Raises
    ------
    util.CytoflowViewError
        If no experiment is given, a channel is unset or missing, a facet
        is missing, or the subset string is invalid or selects no events.
    """
    if not experiment:
        raise util.CytoflowViewError("No experiment specified")

    if not self.xchannel:
        raise util.CytoflowViewError("X channel not specified")

    if self.xchannel not in experiment.data:
        raise util.CytoflowViewError("X channel {0} not in the experiment"
                                     .format(self.xchannel))

    if not self.ychannel:
        raise util.CytoflowViewError("Y channel not specified")

    if self.ychannel not in experiment.data:
        raise util.CytoflowViewError("Y channel {0} not in the experiment"
                                     .format(self.ychannel))

    if self.xfacet and self.xfacet not in experiment.conditions:
        raise util.CytoflowViewError("X facet {0} not in the experiment"
                                     .format(self.xfacet))

    if self.yfacet and self.yfacet not in experiment.conditions:
        raise util.CytoflowViewError("Y facet {0} not in the experiment"
                                     .format(self.yfacet))

    # NOTE(review): the hue facet is validated against experiment.metadata
    # while the x/y facets use experiment.conditions -- confirm intended.
    if self.huefacet and self.huefacet not in experiment.metadata:
        raise util.CytoflowViewError("Hue facet {0} not in the experiment"
                                     .format(self.huefacet))

    if self.subset:
        try:
            data = experiment.query(self.subset)
        except Exception:
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit are not reported as an invalid subset.
            raise util.CytoflowViewError("Subset string '{0}' not valid"
                                         .format(self.subset))

        if len(data.index) == 0:
            raise util.CytoflowViewError(
                "Subset string '{0}' returned no events"
                .format(self.subset))
    else:
        data = experiment.data

    kwargs.setdefault('edgecolor', 'none')
    kwargs.setdefault('antialiased', True)

    xmin, xmax = (np.amin(data[self.xchannel]), np.amax(data[self.xchannel]))
    ymin, ymax = (np.amin(data[self.ychannel]), np.amax(data[self.ychannel]))

    # to avoid issues with singular data, expand the min/max pairs
    xmin, xmax = mtrans.nonsingular(xmin, xmax, expander=0.1)
    ymin, ymax = mtrans.nonsingular(ymin, ymax, expander=0.1)

    extent = (xmin, xmax, ymin, ymax)
    kwargs.setdefault('extent', extent)

    # The bin estimate uses the whole experiment, not the subset in `data`;
    # the default is the mean of the per-channel estimates.
    xbins = util.num_hist_bins(experiment[self.xchannel])
    ybins = util.num_hist_bins(experiment[self.ychannel])
    bins = np.mean([xbins, ybins])

    kwargs.setdefault('bins', bins)
    # Do not move above.  don't ask.
    # (Original author's warning, reason not recorded: the statement order
    # around this point is deliberately left untouched.)

    g = sns.FacetGrid(data,
                      size=6,
                      aspect=1.5,
                      col=(self.xfacet if self.xfacet else None),
                      row=(self.yfacet if self.yfacet else None),
                      hue=(self.huefacet if self.huefacet else None),
                      col_order=(np.sort(data[self.xfacet].unique())
                                 if self.xfacet else None),
                      row_order=(np.sort(data[self.yfacet].unique())
                                 if self.yfacet else None),
                      hue_order=(np.sort(data[self.huefacet].unique())
                                 if self.huefacet else None),
                      sharex=False,
                      sharey=False)

    if self.xscale != "linear" or self.yscale != "linear":
        warnings.warn("hexbin is broken with scales other than \"linear\"",
                      util.CytoflowViewWarning)

    xscale = util.scale_factory(self.xscale, experiment, self.xchannel)
    yscale = util.scale_factory(self.yscale, experiment, self.ychannel)

    # set the scale on every set of axes in the grid
    for ax in g.axes.flatten():
        ax.set_xscale(self.xscale, **xscale.mpl_params)
        ax.set_yscale(self.yscale, **yscale.mpl_params)

    g.map(plt.hexbin, self.xchannel, self.ychannel, **kwargs)
def plot(self, experiment, **kwargs):
    """
    Plot a faceted histogram view of a channel.

    Parameters
    ----------
    experiment
        The experiment to plot.  This method reads ``experiment.data``,
        ``experiment.conditions``, ``experiment.metadata`` and calls
        ``experiment.query()`` when ``self.subset`` is set.
    **kwargs
        Forwarded to ``plt.hist`` through ``FacetGrid.map``.  Defaults are
        supplied for ``histtype`` ('stepfilled'), ``alpha`` (0.5),
        ``antialiased`` (True) and ``bins`` when the caller does not give
        them.

    Raises
    ------
    util.CytoflowViewError
        If no experiment is given, the channel or a facet is missing, or
        the subset string is invalid or selects no events.
    """
    if not experiment:
        raise util.CytoflowViewError("No experiment specified")

    if self.channel not in experiment.data:
        raise util.CytoflowViewError(
            "Channel {0} not in the experiment".format(self.channel))

    if self.xfacet and self.xfacet not in experiment.conditions:
        raise util.CytoflowViewError(
            "X facet {0} not in the experiment".format(self.xfacet))

    if self.yfacet and self.yfacet not in experiment.conditions:
        raise util.CytoflowViewError(
            "Y facet {0} not in the experiment".format(self.yfacet))

    if self.huefacet and self.huefacet not in experiment.conditions:
        raise util.CytoflowViewError(
            "Hue facet {0} not in the experiment".format(self.huefacet))

    if self.subset:
        try:
            data = experiment.query(self.subset)
        except Exception:
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit are not reported as an invalid subset.
            raise util.CytoflowViewError(
                "Subset string '{0}' isn't valid".format(self.subset))

        if len(data.index) == 0:
            raise util.CytoflowViewError(
                "Subset string '{0}' returned no events".format(
                    self.subset))
    else:
        data = experiment.data

    # get the scale
    scale = util.scale_factory(self.scale, experiment, self.channel)
    scaled_data = scale(data[self.channel])

    kwargs.setdefault('histtype', 'stepfilled')
    kwargs.setdefault('alpha', 0.5)
    kwargs.setdefault('antialiased', True)

    # estimate a "good" number of bins; see cytoflow.utility.num_hist_bins
    # for a reference.  Never use fewer than 50.
    num_bins = max(util.num_hist_bins(scaled_data), 50)

    # NaN-aware range of the data in scaled space
    xmin = bottleneck.nanmin(scaled_data)
    xmax = bottleneck.nanmax(scaled_data)

    if (self.huefacet
            and "bins" in experiment.metadata[self.huefacet]
            and experiment.metadata[self.huefacet]["bin_scale"] == self.scale):
        # if we color facet by the result of a BinningOp and we don't
        # match the BinningOp bins with the histogram bins, we get
        # gnarly aliasing.

        # each color gets at least one bin.  however, if the estimated
        # number of bins for the histogram is much larger than the
        # number of colors, sub-divide each color into multiple bins.
        hue_edges = np.append(experiment.metadata[self.huefacet]["bins"],
                              xmax)

        num_hues = len(data[self.huefacet].unique())
        # True division before ceil (int / int would floor first under
        # Python 2), and an int result because linspace needs an integer
        # sample count.
        bins_per_hue = int(math.ceil(float(num_bins) / num_hues))

        new_bins = [xmin]
        for end in [b for b in hue_edges if xmin < b <= xmax]:
            # split (previous edge, end] into bins_per_hue equal pieces;
            # [1:] drops the start point, which is already in new_bins.
            new_bins = np.append(
                new_bins,
                np.linspace(new_bins[-1], end, bins_per_hue + 1,
                            endpoint=True)[1:])

        bins = scale.inverse(new_bins)
    else:
        # num_bins equal-width bins in scaled space.  linspace gives exactly
        # num_bins + 1 edges ending on xmax; arange with a float step can
        # leave an extra sliver bin next to the appended end point.
        bins = scale.inverse(np.linspace(xmin, xmax, int(num_bins) + 1))

    kwargs.setdefault('bins', bins)

    # mask out the data that's not in the scale domain
    data = data[~np.isnan(scaled_data)]

    g = sns.FacetGrid(data,
                      size=6,
                      aspect=1.5,
                      col=(self.xfacet if self.xfacet else None),
                      row=(self.yfacet if self.yfacet else None),
                      hue=(self.huefacet if self.huefacet else None),
                      col_order=(np.sort(data[self.xfacet].unique())
                                 if self.xfacet else None),
                      row_order=(np.sort(data[self.yfacet].unique())
                                 if self.yfacet else None),
                      hue_order=(np.sort(data[self.huefacet].unique())
                                 if self.huefacet else None),
                      legend_out=False,
                      sharex=False,
                      sharey=False)

    # set the scale for each set of axes; can't just call plt.xscale()
    for ax in g.axes.flatten():
        ax.set_xscale(self.scale, **scale.mpl_params)

    g.map(plt.hist, self.channel, **kwargs)

    # if we have a hue facet and a lot of hues, make a color bar instead
    # of a super-long legend.
    if self.huefacet:
        # NOTE(review): 'axes.color_cycle' is the pre-2.0 matplotlib key
        # ('axes.prop_cycle' in newer releases) -- confirm target version.
        current_palette = mpl.rcParams['axes.color_cycle']
        if len(g.hue_names) > len(current_palette):
            plot_ax = plt.gca()
            cmap = mpl.colors.ListedColormap(
                sns.color_palette("husl", n_colors=len(g.hue_names)))
            cax, _ = mpl.colorbar.make_axes(plt.gca())
            norm = mpl.colors.Normalize(vmin=np.min(g.hue_names),
                                        vmax=np.max(g.hue_names),
                                        clip=False)
            mpl.colorbar.ColorbarBase(cax, cmap=cmap, norm=norm)
            # make_axes changed the current axes; restore the plot axes
            plt.sca(plot_ax)
        else:
            g.add_legend()