def _get_subset_str(self):
    """Build a pandas-query clause for this boolean condition.

    Returns ``(name == True)`` when only the True box is selected,
    ``(name == False)`` when only the False box is selected, and an
    empty string when both or neither are selected.
    """
    if self.selected_t and not self.selected_f:
        return "({0} == True)".format(sanitize_identifier(self.name))
    if self.selected_f and not self.selected_t:
        return "({0} == False)".format(sanitize_identifier(self.name))
    return ""
def _get_str(self):
    """Build a pandas-query clause for this boolean condition.

    When exactly one of the True/False boxes is selected, returns
    ``(name == True)`` or ``(name == False)``; otherwise returns "".
    """
    # exactly one selected <=> the two flags differ; formatting the bool
    # yields the literal "True"/"False" text the original produced
    if self.selected_t != self.selected_f:
        return "({0} == {1})".format(util.sanitize_identifier(self.name),
                                     self.selected_t)
    return ""
def _get_str(self):
    """Build a pandas-query clause restricting this condition to [low, high].

    Returns "" when the full range of values is selected (no restriction),
    an equality clause when low == high, and a closed-interval clause
    otherwise.
    """
    # selecting the entire value range means "no filtering at all"
    if self.low == self.values[0] and self.high == self.values[-1]:
        return ""

    cond = util.sanitize_identifier(self.name)
    if self.low == self.high:
        return "({0} == {1})".format(cond, self.low)
    return "({0} >= {1} and {0} <= {2})".format(cond, self.low, self.high)
def _set_subset_str(self, val):
    """Update the view based on a subset string"""
    # compare against the two canonical clause forms; anything else
    # (including "") de-selects both checkboxes
    true_clause = "({0} == True)".format(sanitize_identifier(self.name))
    false_clause = "({0} == False)".format(sanitize_identifier(self.name))
    self.selected_t = (val == true_clause)
    self.selected_f = (val == false_clause)
def _on_import(self):
    """
    Import format: CSV, first column is filename, path relative to CSV.
    others are conditions, type is autodetected. first row is header
    with names.
    """
    # ask the user for the CSV file
    file_dialog = FileDialog()
    file_dialog.wildcard = "CSV files (*.csv)|*.csv|"
    file_dialog.action = 'open'
    file_dialog.open()

    if file_dialog.return_code != PyfaceOK:
        return

    csv = pandas.read_csv(file_dialog.path)
    # filenames in column 0 are interpreted relative to the CSV's folder
    csv_folder = Path(file_dialog.path).parent

    # importing replaces the whole model; make the user confirm first
    if self.model.tubes or self.model.tube_traits:
        if confirm(parent=None,
                   message="This will clear the current conditions and tubes! "
                           "Are you sure you want to continue?",
                   title="Clear tubes and conditions?") != YES:
            return

    # every column after the first becomes a (category) condition
    for col in csv.columns[1:]:
        self.model.tube_traits.append(
            TubeTrait(model=self.model,
                      name=util.sanitize_identifier(col),
                      type='category'))

    # one tube per CSV row; skip (with a warning) files that won't parse
    for _, row in csv.iterrows():
        filename = csv_folder / row[0]

        try:
            metadata, _ = parse_tube(str(filename), metadata_only=True)
        except Exception as exc:
            warning(None,
                    "Had trouble loading file {}: {}".format(filename, str(exc)))
            continue

        metadata['CF_File'] = Path(filename).stem
        new_tube = Tube(file=str(filename),
                        parent=self.model,
                        metadata=sanitize_metadata(metadata))
        self.model.tubes.append(new_tube)

        # copy this row's condition values onto the new tube
        for col in csv.columns[1:]:
            new_tube.trait_set(**{util.sanitize_identifier(col): row[col]})
def validate(self, obj, name, value):
    """Accept *value* only when it is already a valid, sanitized Python
    identifier; otherwise report a trait error."""
    value = super(ValidPythonIdentifier, self).validate(obj, name, value)
    # a value is acceptable iff sanitizing it is a no-op
    if util.sanitize_identifier(value) != value:
        self.error(obj, name, value)
    else:
        return value
def query(self, expr, **kwargs): """ Expose pandas.DataFrame.query() to the outside world This method "sanitizes" column names first, replacing characters that are not valid in a Python identifier with an underscore '_'. So, the column name `a column` becomes `a_column`, and can be queried with an `a_column == True` or such. Parameters ---------- expr : string The expression to pass to `pandas.DataFrame.query()`. Must be a valid Python expression, something you could pass to `eval()`. **kwargs : dict Other named parameters to pass to `pandas.DataFrame.query()`. """ resolvers = {} for name, col in self.data.iteritems(): new_name = util.sanitize_identifier(name) if new_name in resolvers: raise util.CytoflowError( "Tried to sanitize column name {1} to " "{2} but it already existed in the " " DataFrame.".format(name, new_name)) else: resolvers[new_name] = col return self.data.query(expr, resolvers=({}, resolvers), **kwargs)
def query(self, expr, **kwargs):
    """ Expose pandas.DataFrame.query() to the outside world

    This method "sanitizes" column names first, replacing characters that
    are not valid in a Python identifier with an underscore '_'. So, the
    column name `a column` becomes `a_column`, and can be queried with
    an `a_column == True` or such.

    Parameters
    ----------
    expr : string
        The expression to pass to `pandas.DataFrame.query()`.  Must be
        a valid Python expression, something you could pass to `eval()`.

    **kwargs : dict
        Other named parameters to pass to `pandas.DataFrame.query()`.

    Raises
    ------
    CytoflowError
        If two different column names sanitize to the same identifier.
    """
    resolvers = {}
    # .items() -- DataFrame.iteritems() was removed in pandas 2.0
    for name, col in self.data.items():
        new_name = util.sanitize_identifier(name)
        if new_name in resolvers:
            # bug fix: the placeholders were {1}/{2} with only two
            # format arguments, which raised IndexError instead of the
            # intended CytoflowError
            raise util.CytoflowError(
                "Tried to sanitize column name {0} to "
                "{1} but it already existed in the "
                "DataFrame.".format(name, new_name))
        else:
            resolvers[new_name] = col

    return self.data.query(expr, resolvers=({}, resolvers), **kwargs)
def subset(self, name, value):
    """ A fast way to get a subset of the data where a condition equals a
    particular value.

    This method "sanitizes" column names first, replacing characters that
    are not valid in a Python identifier with an underscore '_'. So, the
    column name `a column` becomes `a_column`, and can be queried with
    an `a_column == True` or such.

    Parameters
    ----------
    name : Str
        A condition; ie, a key in `self.conditions`.

    value : Any
        The value to look for.  Will be checked with equality, ie `==`
    """
    sanitized = util.sanitize_identifier(name)
    if sanitized not in self.conditions:
        raise util.CytoflowError("Can't find condition '{}'".format(name))

    ret = self.clone()
    ret.data = self.data[self.data[sanitized] == value]
    # give the subset a fresh 0..n-1 index
    ret.data.reset_index(drop=True, inplace=True)
    return ret
def subset(self, name, value):
    """ A fast way to get a subset of the data where a condition equals a
    particular value.

    This method "sanitizes" column names first, replacing characters that
    are not valid in a Python identifier with an underscore '_'. So, the
    column name `a column` becomes `a_column`, and can be queried with
    an `a_column == True` or such.

    Parameters
    ----------
    name : Str
        A condition; ie, a key in `self.conditions`.

    value : Any
        The value to look for.  Will be checked with equality, ie `==`
    """
    sanitized = util.sanitize_identifier(name)
    if sanitized not in self.conditions:
        raise util.CytoflowError("Can't find condition '{}'".format(name))

    ret = self.clone()
    # NOTE(review): unlike the other subset() variant in this codebase,
    # this one does NOT reset the index of the returned data -- confirm
    # whether that difference is intentional.
    ret.data = self.data[self.data[sanitized] == value]
    return ret
def apply(self, experiment):
    """Applies the ratio operation to an experiment

    Parameters
    ----------
    experiment : Experiment
        the old experiment to which this op is applied

    Returns
    -------
    Experiment
        a new experiment with the new ratio channel

        The new channel also has the following new metadata:

        - **numerator** : Str
            What was the numerator channel for the new one?

        - **denominator** : Str
            What was the denominator channel for the new one?
    """
    if experiment is None:
        raise util.CytoflowOpError('experiment', "No experiment specified")

    if self.numerator not in experiment.channels:
        raise util.CytoflowOpError(
            'numerator',
            "Channel {0} not in the experiment".format(self.numerator))

    if self.denominator not in experiment.channels:
        raise util.CytoflowOpError(
            'denominator',
            "Channel {0} not in the experiment".format(self.denominator))

    if self.name != util.sanitize_identifier(self.name):
        # bug fix: the message had no {0} placeholder, so the
        # .format(self.name) argument was silently ignored
        raise util.CytoflowOpError(
            'name',
            "Name {0} can only contain letters, numbers and underscores."
            .format(self.name))

    if self.name in experiment.channels:
        raise util.CytoflowOpError(
            'name',
            "New channel {0} is already in the experiment".format(
                self.name))

    new_experiment = experiment.clone()
    new_experiment.add_channel(
        self.name,
        experiment[self.numerator] / experiment[self.denominator])

    # a zero denominator yields +/-inf; drop those events entirely
    new_experiment.data.replace([np.inf, -np.inf], np.nan, inplace=True)
    new_experiment.data.dropna(inplace=True)

    new_experiment.history.append(
        self.clone_traits(transient=lambda t: True))
    new_experiment.metadata[self.name]['numerator'] = self.numerator
    new_experiment.metadata[self.name]['denominator'] = self.denominator
    return new_experiment
def sanitize_metadata(meta):
    """Return a copy of *meta* whose keys are valid Python identifiers.

    A leading '$' (FCS keyword prefix) is stripped from each key before
    sanitizing; values are passed through untouched.
    """
    return {util.sanitize_identifier(k[1:] if k.startswith('$') else k): v
            for k, v in meta.items()}
def apply(self, experiment):
    """Applies the threshold to an experiment.

    Parameters
    ----------
    experiment : Experiment
        the experiment to which this operation is applied

    Returns
    -------
    Experiment
        a new :class:`~experiment`, the same as the old experiment but with
        a new column of type ``bool`` with the same name as the operation
        :attr:`name`.  The new condition is ``True`` if the event's
        measurement in :attr:`channel` is greater than :attr:`threshold`;
        it is ``False`` otherwise.
    """
    if experiment is None:
        raise util.CytoflowOpError('experiment', "No experiment specified")

    # make sure name got set!
    if not self.name:
        raise util.CytoflowOpError(
            'name',
            "You have to set the gate's name "
            "before applying it!")

    if self.name != util.sanitize_identifier(self.name):
        # bug fix: the message had no {0} placeholder, so the
        # .format(self.name) argument was silently ignored
        raise util.CytoflowOpError(
            'name',
            "Name {0} can only contain letters, numbers and underscores."
            .format(self.name))

    # make sure old_experiment doesn't already have a column named self.name
    if (self.name in experiment.data.columns):
        raise util.CytoflowOpError(
            'name',
            "Experiment already contains a column {0}".format(self.name))

    if self.channel not in experiment.channels:
        raise util.CytoflowOpError(
            'channel',
            "{0} isn't a channel in the experiment".format(self.channel))

    if self.threshold is None:
        raise util.CytoflowOpError('threshold', "must set 'threshold'")

    # the gate itself: strictly-greater-than comparison per event
    gate = pd.Series(experiment[self.channel] > self.threshold)

    new_experiment = experiment.clone()
    new_experiment.add_condition(self.name, "bool", gate)
    new_experiment.history.append(
        self.clone_traits(transient=lambda t: True))

    return new_experiment
def _get_str(self):
    """Build a pandas-query clause matching any selected category.

    Returns ``(name == "cat1" or name == "cat2" ...)`` for the selected
    categories, or "" when nothing is selected.
    """
    if not self.selected:
        return ""

    cond = util.sanitize_identifier(self.name)
    clauses = ["{0} == \"{1}\"".format(cond, cat) for cat in self.selected]
    return "(" + " or ".join(clauses) + ")"
def _get_subset_str(self):
    """Build a pandas-query clause matching any selected category.

    Returns ``(name == "cat1" or name == "cat2" ...)`` for the selected
    categories, or "" when nothing is selected.
    """
    if not self.selected:
        return ""

    cond = sanitize_identifier(self.name)
    clauses = ["{0} == \"{1}\"".format(cond, cat) for cat in self.selected]
    return "(" + " or ".join(clauses) + ")"
def apply(self, experiment):
    """Applies the ratio operation to an experiment

    Parameters
    ----------
    experiment : Experiment
        the old experiment to which this op is applied

    Returns
    -------
    Experiment
        a new experiment with the new ratio channel
    """
    # bug fix: this used to read `if not experiment:`, which is also
    # True for an experiment with zero events (len(experiment) == 0),
    # mis-reporting "No experiment specified" for an empty experiment.
    if experiment is None:
        raise util.CytoflowOpError("No experiment specified")

    if self.numerator not in experiment.channels:
        raise util.CytoflowOpError(
            "Channel {0} not in the experiment".format(self.numerator))

    if self.denominator not in experiment.channels:
        raise util.CytoflowOpError(
            "Channel {0} not in the experiment".format(self.denominator))

    if self.name != util.sanitize_identifier(self.name):
        raise util.CytoflowOpError(
            "New channel {0} must be a valid Python identifier".format(
                self.name))

    if self.name in experiment.channels:
        raise util.CytoflowOpError(
            "New channel {0} is already in the experiment".format(
                self.name))

    new_experiment = experiment.clone()
    new_experiment.add_channel(
        self.name,
        experiment[self.numerator] / experiment[self.denominator])

    # a zero denominator yields +/-inf; drop those events entirely
    new_experiment.data.replace([np.inf, -np.inf], np.nan, inplace=True)
    new_experiment.data.dropna(inplace=True)

    new_experiment.history.append(
        self.clone_traits(transient=lambda t: True))
    new_experiment.metadata[self.name]['numerator'] = self.numerator
    new_experiment.metadata[self.name]['denominator'] = self.denominator
    return new_experiment
def query(self, expr, **kwargs):
    """
    Return an experiment whose data is a subset of this one where ``expr``
    evaluates to ``True``.

    This method "sanitizes" column names first, replacing characters that
    are not valid in a Python identifier with an underscore ``_``. So, the
    column name ``a column`` becomes ``a_column``, and can be queried with
    an ``a_column == True`` or such.

    Parameters
    ----------
    expr : string
        The expression to pass to :meth:`pandas.DataFrame.query`.  Must be
        a valid Python expression, something you could pass to :func:`eval`.

    **kwargs : dict
        Other named parameters to pass to :meth:`pandas.DataFrame.query`.

    Returns
    -------
    Experiment
        A new :class:`Experiment`, a clone of this one with the data
        returned by :meth:`pandas.DataFrame.query()`

    Raises
    ------
    CytoflowError
        If two column names sanitize to the same identifier, or if no
        events match ``expr``.
    """
    resolvers = {}
    # .items() -- DataFrame.iteritems() was removed in pandas 2.0
    for name, col in self.data.items():
        new_name = util.sanitize_identifier(name)
        if new_name in resolvers:
            # bug fix: the placeholders were {1}/{2} with only two
            # format arguments, which raised IndexError instead of the
            # intended CytoflowError
            raise util.CytoflowError(
                "Tried to sanitize column name {0} to "
                "{1} but it already existed in the "
                "DataFrame.".format(name, new_name))
        else:
            resolvers[new_name] = col

    ret = self.clone()
    ret.data = self.data.query(expr, resolvers=({}, resolvers), **kwargs)
    ret.data.reset_index(drop=True, inplace=True)
    if len(ret.data) == 0:
        raise util.CytoflowError("No events matched {}".format(expr))
    return ret
def apply(self, experiment):
    """
    Apply the operation to an :class:`.Experiment`.

    Parameters
    ----------
    experiment
        The :class:`.Experiment` to apply this operation to.

    Returns
    -------
    Experiment
        A new :class:`.Experiment`, containing a new entry in
        :attr:`.Experiment.statistics`.  The key of the new entry is a
        tuple ``(name, function)`` (or ``(name, statistic_name)`` if
        :attr:`statistic_name` is set.
    """
    if experiment is None:
        raise util.CytoflowOpError('experiment', "Must specify an experiment")

    if not self.name:
        raise util.CytoflowOpError('name', "Must specify a name")

    if self.name != util.sanitize_identifier(self.name):
        # bug fix: the message had no {0} placeholder, so the
        # .format(self.name) argument was silently ignored
        raise util.CytoflowOpError(
            'name',
            "Name {0} can only contain letters, numbers and underscores."
            .format(self.name))

    if not self.channel:
        raise util.CytoflowOpError('channel', "Must specify a channel")

    if not self.function:
        raise util.CytoflowOpError('function', "Must specify a function")

    if self.channel not in experiment.data:
        raise util.CytoflowOpError(
            'channel',
            "Channel {0} not found in the experiment".format(self.channel))

    if not self.by:
        raise util.CytoflowOpError(
            'by',
            "Must specify some grouping conditions "
            "in 'by'")

    stat_name = (self.name, self.statistic_name) \
                if self.statistic_name \
                else (self.name, self.function.__name__)

    if stat_name in experiment.statistics:
        raise util.CytoflowOpError(
            'name',
            "{} is already in the experiment's statistics".format(
                stat_name))

    new_experiment = experiment.clone()

    # restrict to the subset *before* grouping, if one was given
    if self.subset:
        try:
            experiment = experiment.query(self.subset)
        except Exception as exc:
            raise util.CytoflowOpError(
                'subset',
                "Subset string '{0}' isn't valid".format(
                    self.subset)) from exc

        if len(experiment) == 0:
            raise util.CytoflowOpError(
                'subset',
                "Subset string '{0}' returned no events".format(
                    self.subset))

    for b in self.by:
        if b not in experiment.conditions:
            raise util.CytoflowOpError(
                'by',
                "Aggregation metadata {} not found, "
                "must be one of {}".format(b, experiment.conditions))
        unique = experiment.data[b].unique()

        if len(unique) == 1:
            warn("Only one category for {}".format(b),
                 util.CytoflowOpWarning)

    groupby = experiment.data.groupby(self.by)

    for group, data_subset in groupby:
        if len(data_subset) == 0:
            warn("Group {} had no data".format(group),
                 util.CytoflowOpWarning)

    # this shouldn't be necessary, but see pandas bug #38053
    if len(self.by) == 1:
        idx = pd.Index(experiment[self.by[0]].unique(), name=self.by[0])
    else:
        idx = pd.MultiIndex.from_product(
            [experiment[x].unique() for x in self.by], names=self.by)

    # pre-fill the statistic with self.fill so missing groups get a value
    stat = pd.Series(data=[self.fill] * len(idx),
                     index=idx,
                     name="{} : {}".format(stat_name[0], stat_name[1]),
                     dtype=np.dtype(object)).sort_index()

    for group, data_subset in groupby:
        if len(data_subset) == 0:
            continue

        if not isinstance(group, tuple):
            group = (group, )

        try:
            v = self.function(data_subset[self.channel])
            stat.at[group] = v
        except Exception as e:
            raise util.CytoflowOpError(
                None,
                "Your function threw an error in group {}".format(
                    group)) from e

        # check for, and warn about, NaNs.
        if pd.Series(stat.loc[group]).isna().any():
            warn(
                "Found NaN in category {} returned {}".format(
                    group, stat.loc[group]),
                util.CytoflowOpWarning)

    # try to convert to numeric, but if there are non-numeric bits ignore
    stat = pd.to_numeric(stat, errors='ignore')

    new_experiment.history.append(
        self.clone_traits(transient=lambda _: True))
    new_experiment.statistics[stat_name] = stat

    return new_experiment
def apply(self, experiment):
    """
    Assigns new metadata to events using the mixture model estimated in
    :meth:`estimate`.

    Returns
    -------
    Experiment
        A new :class:`.Experiment` with the new condition variables as
        described in the class documentation.  Also adds the following
        new statistics:

        - **mean** : Float
            the mean of the fitted gaussian in each channel for each
            component.

        - **sigma** : (Float, Float)
            the locations the mean +/- one standard deviation in each
            channel for each component.

        - **correlation** : Float
            the correlation coefficient between each pair of channels
            for each component.

        - **proportion** : Float
            the proportion of events in each component of the mixture
            model.  only added if :attr:`num_components` ``> 1``.
    """
    if experiment is None:
        raise util.CytoflowOpError('experiment', "No experiment specified")

    if len(self.channels) == 0:
        raise util.CytoflowOpError('channels', "Must set at least one channel")

    # make sure name got set!
    if not self.name:
        raise util.CytoflowOpError('name',
                                   "You have to set the gate's name "
                                   "before applying it!")

    if self.name != util.sanitize_identifier(self.name):
        # NOTE(review): this message has no placeholder, so the
        # .format(self.name) argument is ignored -- preserved as-is.
        raise util.CytoflowOpError('name',
                                   "Name can only contain letters, numbers and underscores."
                                   .format(self.name))

    if self.num_components > 1 and self.name in experiment.data.columns:
        raise util.CytoflowOpError('name',
                                   "Experiment already has a column named {0}"
                                   .format(self.name))

    if self.sigma is not None:
        for i in range(1, self.num_components + 1):
            cname = "{}_{}".format(self.name, i)
            if cname in experiment.data.columns:
                raise util.CytoflowOpError('name',
                                           "Experiment already has a column named {}"
                                           .format(cname))

    if self.posteriors:
        for i in range(1, self.num_components + 1):
            cname = "{}_{}_posterior".format(self.name, i)
            if cname in experiment.data.columns:
                raise util.CytoflowOpError('name',
                                           "Experiment already has a column named {}"
                                           .format(cname))

    if not self._gmms:
        raise util.CytoflowOpError(None,
                                   "No components found. Did you forget to "
                                   "call estimate()?")

    for c in self.channels:
        if c not in self._scale:
            raise util.CytoflowOpError(None,
                                       "Model scale not set. Did you forget "
                                       "to call estimate()?")

    for c in self.channels:
        if c not in experiment.channels:
            raise util.CytoflowOpError('channels',
                                       "Channel {0} not found in the experiment"
                                       .format(c))

    for b in self.by:
        if b not in experiment.conditions:
            raise util.CytoflowOpError('by',
                                       "Aggregation metadata {} not found, "
                                       "must be one of {}"
                                       .format(b, experiment.conditions))

    # if self.num_components == 1 and self.sigma == 0.0:
    #     raise util.CytoflowOpError('sigma',
    #                                "if num_components is 1, sigma must be > 0.0")

    if self.num_components == 1 and self.posteriors:
        warn("If num_components == 1, all posteriors will be 1",
             util.CytoflowOpWarning)
    #     raise util.CytoflowOpError('posteriors',
    #                                "If num_components == 1, all posteriors will be 1.")

    # per-event output columns, filled in group-by-group below
    if self.num_components > 1:
        event_assignments = pd.Series(["{}_None".format(self.name)] * len(experiment),
                                      dtype="object")

    if self.sigma is not None:
        event_gate = {i: pd.Series([False] * len(experiment), dtype="double")
                      for i in range(self.num_components)}

    if self.posteriors:
        event_posteriors = {i: pd.Series([0.0] * len(experiment), dtype="double")
                            for i in range(self.num_components)}

    if self.by:
        groupby = experiment.data.groupby(self.by)
    else:
        # use a lambda expression to return a group that
        # contains all the events
        groupby = experiment.data.groupby(lambda _: True)

    # make the statistics
    components = [x + 1 for x in range(self.num_components)]

    prop_idx = pd.MultiIndex.from_product(
        [experiment[x].unique() for x in self.by] + [components],
        names=list(self.by) + ["Component"])
    prop_stat = pd.Series(name="{} : {}".format(self.name, "proportion"),
                          index=prop_idx,
                          dtype=np.dtype(object)).sort_index()

    mean_idx = pd.MultiIndex.from_product(
        [experiment[x].unique() for x in self.by] + [components] + [self.channels],
        names=list(self.by) + ["Component"] + ["Channel"])
    mean_stat = pd.Series(name="{} : {}".format(self.name, "mean"),
                          index=mean_idx,
                          dtype=np.dtype(object)).sort_index()
    sigma_stat = pd.Series(name="{} : {}".format(self.name, "sigma"),
                           index=mean_idx,
                           dtype=np.dtype(object)).sort_index()
    interval_stat = pd.Series(name="{} : {}".format(self.name, "interval"),
                              index=mean_idx,
                              dtype=np.dtype(object)).sort_index()

    corr_idx = pd.MultiIndex.from_product(
        [experiment[x].unique() for x in self.by] + [components] + [self.channels] + [self.channels],
        names=list(self.by) + ["Component"] + ["Channel_1"] + ["Channel_2"])
    corr_stat = pd.Series(name="{} : {}".format(self.name, "correlation"),
                          index=corr_idx,
                          dtype=np.dtype(object)).sort_index()

    for group, data_subset in groupby:
        if group not in self._gmms:
            # there weren't any events in this group, so we didn't get
            # a gmm.
            continue

        gmm = self._gmms[group]
        x = data_subset.loc[:, self.channels[:]]
        for c in self.channels:
            x[c] = self._scale[c](x[c])

        # which values are missing?
        x_na = pd.Series([False] * len(x))
        for c in self.channels:
            x_na[np.isnan(x[c]).values] = True

        x = x.values
        x_na = x_na.values
        group_idx = groupby.groups[group]

        if self.num_components > 1:
            predicted = np.full(len(x), -1, "int")
            predicted[~x_na] = gmm.predict(x[~x_na])

            predicted_str = pd.Series(["(none)"] * len(predicted))
            for c in range(0, self.num_components):
                predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
            predicted_str[predicted == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx

            event_assignments.iloc[group_idx] = predicted_str

        # if we're doing sigma-based gating, for each component check
        # to see if the event is in the sigma gate.
        if self.sigma is not None:
            for c in range(self.num_components):
                s = np.linalg.pinv(gmm.covariances_[c])
                mu = gmm.means_[c]

                # compute the Mahalanobis distance
                f = lambda x, mu, s: np.dot(np.dot((x - mu).T, s), (x - mu))
                dist = np.apply_along_axis(f, 1, x, mu, s)

                # come up with a threshold based on sigma. you'll note we
                # didn't sqrt dist: that's because for a multivariate
                # Gaussian, the square of the Mahalanobis distance is
                # chi-square distributed
                p = (scipy.stats.norm.cdf(self.sigma) - 0.5) * 2
                thresh = scipy.stats.chi2.ppf(p, 1)

                event_gate[c].iloc[group_idx] = np.less_equal(dist, thresh)

        if self.posteriors:
            p = np.full((len(x), self.num_components), 0.0)
            p[~x_na] = gmm.predict_proba(x[~x_na])
            for c in range(self.num_components):
                event_posteriors[c].iloc[group_idx] = p[:, c]

        for c in range(self.num_components):
            if len(self.by) == 0:
                g = tuple([c + 1])
            elif hasattr(group, '__iter__') and not isinstance(group, (str, bytes)):
                g = tuple(list(group) + [c + 1])
            else:
                g = tuple([group] + [c + 1])

            prop_stat.at[g] = gmm.weights_[c]

            for cidx1, channel1 in enumerate(self.channels):
                g2 = tuple(list(g) + [channel1])
                mean_stat.at[g2] = self._scale[channel1].inverse(gmm.means_[c, cidx1])

                s, corr = util.cov2corr(gmm.covariances_[c])
                sigma_stat[g2] = (self._scale[channel1].inverse(s[cidx1]))
                interval_stat.at[g2] = (
                    self._scale[channel1].inverse(gmm.means_[c, cidx1] - s[cidx1]),
                    self._scale[channel1].inverse(gmm.means_[c, cidx1] + s[cidx1]))

                for cidx2, channel2 in enumerate(self.channels):
                    g3 = tuple(list(g2) + [channel2])
                    corr_stat[g3] = corr[cidx1, cidx2]

                # drop the (channel, channel) self-correlation entry
                corr_stat.drop(tuple(list(g2) + [channel1]), inplace=True)

    new_experiment = experiment.clone()

    if self.num_components > 1:
        new_experiment.add_condition(self.name, "category", event_assignments)

    if self.sigma is not None:
        for c in range(self.num_components):
            gate_name = "{}_{}".format(self.name, c + 1)
            new_experiment.add_condition(gate_name, "bool", event_gate[c])

    if self.posteriors:
        for c in range(self.num_components):
            post_name = "{}_{}_posterior".format(self.name, c + 1)
            new_experiment.add_condition(post_name, "double", event_posteriors[c])

    new_experiment.statistics[(self.name, "mean")] = pd.to_numeric(mean_stat)
    new_experiment.statistics[(self.name, "sigma")] = sigma_stat
    new_experiment.statistics[(self.name, "interval")] = interval_stat
    if len(corr_stat) > 0:
        new_experiment.statistics[(self.name, "correlation")] = pd.to_numeric(corr_stat)
    if self.num_components > 1:
        new_experiment.statistics[(self.name, "proportion")] = pd.to_numeric(prop_stat)

    new_experiment.history.append(self.clone_traits(transient=lambda _: True))

    return new_experiment
def apply(self, experiment):
    """
    Creates a new condition based on membership in the gate that was
    parameterized with :meth:`estimate`.

    Parameters
    ----------
    experiment : Experiment
        the :class:`.Experiment` to apply the gate to.

    Returns
    -------
    Experiment
        a new :class:`.Experiment` with the new gate applied.
    """
    if experiment is None:
        raise util.CytoflowOpError('experiment', "No experiment specified")

    if not self.xchannel:
        raise util.CytoflowOpError('xchannel', "Must set X channel")

    if not self.ychannel:
        raise util.CytoflowOpError('ychannel', "Must set Y channel")

    # make sure name got set!
    if not self.name:
        raise util.CytoflowOpError('name',
                                   "You have to set the gate's name "
                                   "before applying it!")

    if self.name != util.sanitize_identifier(self.name):
        # NOTE(review): this message has no placeholder, so the
        # .format(self.name) argument is ignored -- preserved as-is.
        raise util.CytoflowOpError('name',
                                   "Name can only contain letters, numbers and underscores."
                                   .format(self.name))

    if self.name in experiment.data.columns:
        raise util.CytoflowOpError('name',
                                   "Experiment already has a column named {0}"
                                   .format(self.name))

    if not (self._xbins.size and self._ybins.size and self._keep_xbins):
        raise util.CytoflowOpError(None,
                                   "No gate estimate found. Did you forget to "
                                   "call estimate()?")

    if not self._xscale:
        raise util.CytoflowOpError(None,
                                   "Couldn't find _xscale. What happened??")

    if not self._yscale:
        raise util.CytoflowOpError(None,
                                   "Couldn't find _yscale. What happened??")

    if self.xchannel not in experiment.data:
        raise util.CytoflowOpError('xchannel',
                                   "Column {0} not found in the experiment"
                                   .format(self.xchannel))

    if self.ychannel not in experiment.data:
        raise util.CytoflowOpError('ychannel',
                                   "Column {0} not found in the experiment"
                                   .format(self.ychannel))

    for b in self.by:
        if b not in experiment.conditions:
            raise util.CytoflowOpError('by',
                                       "Aggregation metadata {} not found, "
                                       "must be one of {}"
                                       .format(b, experiment.conditions))

    if self.by:
        groupby = experiment.data.groupby(self.by)
    else:
        # use a lambda expression to return a group that
        # contains all the events
        groupby = experiment.data.groupby(lambda _: True)

    event_assignments = pd.Series([False] * len(experiment), dtype="bool")

    for group, group_data in groupby:
        if group not in self._keep_xbins:
            # there weren't any events in this group, so we didn't get
            # an estimate
            continue

        group_idx = groupby.groups[group]

        # assign each event to an (x, y) bin
        cX = pd.cut(group_data[self.xchannel], self._xbins,
                    include_lowest=True, labels=False)
        cY = pd.cut(group_data[self.ychannel], self._ybins,
                    include_lowest=True, labels=False)

        group_keep = pd.Series([False] * len(group_data))

        keep_x = self._keep_xbins[group]
        keep_y = self._keep_ybins[group]

        # an event is kept if its (x, y) bin is one of the kept bins
        for (xbin, ybin) in zip(keep_x, keep_y):
            group_keep = group_keep | ((cX == xbin) & (cY == ybin))

        event_assignments.iloc[group_idx] = group_keep

    new_experiment = experiment.clone()
    new_experiment.add_condition(self.name, "bool", event_assignments)
    new_experiment.history.append(self.clone_traits(transient=lambda _: True))

    return new_experiment
def apply(self, experiment):
    """
    Applies the binning to an experiment.

    Parameters
    ----------
    experiment : Experiment
        the old_experiment to which this op is applied

    Returns
    -------
    Experiment
        A new experiment with a condition column named :attr:`name`, which
        contains the location of the left-most edge of the bin that the
        event is in.  If :attr:`bin_count_name` is set, another column
        is added with that name as well, containing the number of events
        in the same bin as the event.
    """
    if experiment is None:
        raise util.CytoflowOpError('experiment', "no experiment specified")

    if not self.name:
        raise util.CytoflowOpError('name', "Name is not set")

    if self.name != util.sanitize_identifier(self.name):
        # bug fix: the message had no {0} placeholder, so the
        # .format(self.name) argument was silently ignored
        raise util.CytoflowOpError(
            'name',
            "Name {0} can only contain letters, numbers and underscores."
            .format(self.name))

    if self.name in experiment.data.columns:
        raise util.CytoflowOpError(
            'name',
            "Name {} is in the experiment already".format(self.name))

    if self.bin_count_name and self.bin_count_name in experiment.data.columns:
        raise util.CytoflowOpError(
            'bin_count_name',
            "bin_count_name {} is in the experiment already".format(
                self.bin_count_name))

    if not self.channel:
        raise util.CytoflowOpError('channel', "channel is not set")

    if self.channel not in experiment.data.columns:
        raise util.CytoflowOpError(
            'channel',
            "channel {} isn't in the experiment".format(self.channel))

    if not self.bin_width:
        raise util.CytoflowOpError('bin_width', "must set bin width")

    if not (self.scale == "linear" or self.scale == "log"):
        raise util.CytoflowOpError(
            'scale',
            "Can only use binning op with linear or log scale")

    scale = util.scale_factory(self.scale, experiment, channel=self.channel)

    scaled_min = scale(scale.clip(experiment.data[self.channel]).min())
    scaled_max = scale(scale.clip(experiment.data[self.channel]).max())

    # anchor the bin grid at 0 (linear) or 1 (log) in scaled space --
    # presumably to avoid a bin edge at log-scale 0; confirm if changed
    if self.scale == 'linear':
        start = 0
    else:
        start = 1

    # build the grid outward from the anchor in both directions, then
    # join the two halves (dropping the duplicated anchor edge)
    scaled_bins_left = np.arange(start=-1.0 * start,
                                 stop=(-1.0 * scaled_min) + self.bin_width,
                                 step=self.bin_width) * -1.0
    scaled_bins_left = scaled_bins_left[::-1][:-1]
    scaled_bins_right = np.arange(start=start,
                                  stop=scaled_max + self.bin_width,
                                  step=self.bin_width)
    scaled_bins = np.append(scaled_bins_left, scaled_bins_right)

    if len(scaled_bins) > self._max_num_bins:
        raise util.CytoflowOpError(
            None,
            "Too many bins! To increase this limit, "
            "change _max_num_bins (currently {})".format(
                self._max_num_bins))

    if len(scaled_bins) < 2:
        raise util.CytoflowOpError('bin_width', "Must have more than one bin")

    # now, back into data space
    bins = scale.inverse(scaled_bins)

    # reduce to 4 sig figs
    bins = ['%.4g' % x for x in bins]
    bins = [float(x) for x in bins]
    bins = np.array(bins)

    # put the data in bins
    bin_idx = np.digitize(experiment.data[self.channel], bins[1:-1])

    new_experiment = experiment.clone()
    new_experiment.add_condition(self.name, "float64", bins[bin_idx])

    # keep track of the bins we used, for prettier plotting later.
    new_experiment.metadata[self.name]["bin_scale"] = self.scale
    new_experiment.metadata[self.name]["bins"] = bins

    if self.bin_count_name:
        # TODO - this is a HUGE memory hog?!
        # TODO - fix this, then turn it on by default
        agg_count = new_experiment.data.groupby(self.name).count()
        agg_count = agg_count[agg_count.columns[0]]

        # have to make the condition a float64, because if we're in log
        # space there may be events that have NaN as the bin number.
        new_experiment.add_condition(
            self.bin_count_name,
            "float64",
            new_experiment[self.name].map(agg_count))

    new_experiment.history.append(
        self.clone_traits(transient=lambda _: True))

    return new_experiment
def apply(self, experiment):
    """
    Applies :attr:`function` to a statistic.

    Parameters
    ----------
    experiment : Experiment
        The experiment to apply the operation to

    Returns
    -------
    Experiment
        The same as the old experiment, but with a new statistic that
        results from applying :attr:`function` to the statistic specified
        in :attr:`statistic`.
    """
    if experiment is None:
        raise util.CytoflowOpError('experiment', "Must specify an experiment")

    if not self.name:
        raise util.CytoflowOpError('name', "Must specify a name")

    if self.name != util.sanitize_identifier(self.name):
        # NOTE(review): this format string has no placeholder, so
        # .format(self.name) is a no-op -- the name is dropped from the message
        raise util.CytoflowOpError('name',
                                   "Name can only contain letters, numbers and underscores."
                                   .format(self.name))

    if not self.statistic:
        raise util.CytoflowViewError('statistic', "Statistic not set")

    if self.statistic not in experiment.statistics:
        raise util.CytoflowViewError('statistic',
                                     "Can't find the statistic {} in the experiment"
                                     .format(self.statistic))
    else:
        # the source statistic that will be transformed
        stat = experiment.statistics[self.statistic]

    if not self.function:
        raise util.CytoflowOpError('function', "Must specify a function")

    # the new statistic's key: (name, statistic_name) if the user supplied
    # one, else fall back to the function's own __name__
    stat_name = (self.name, self.statistic_name) \
                if self.statistic_name \
                else (self.name, self.function.__name__)

    if stat_name in experiment.statistics:
        raise util.CytoflowOpError('name',
                                   "{} is already in the experiment's statistics"
                                   .format(stat_name))

    # every 'by' level must be a level of the source statistic's index
    for b in self.by:
        if b not in stat.index.names:
            raise util.CytoflowOpError('by',
                                       "{} is not a statistic index; "
                                       " must be one of {}"
                                       .format(b, stat.index.names))

    # flatten the source statistic so index levels become columns
    data = stat.reset_index()

    if self.by:
        idx = pd.MultiIndex.from_product([data[x].unique() for x in self.by],
                                         names = self.by)
    else:
        idx = stat.index.copy()

    # pre-fill the result with self.fill; dtype object so the function may
    # return any type
    new_stat = pd.Series(data = self.fill,
                         index = idx,
                         dtype = np.dtype(object)).sort_index()

    if self.by:
        for group in data[self.by].itertuples(index = False, name = None):
            # select this group's slice of the source statistic
            if isinstance(stat.index, pd.MultiIndex):
                s = stat.xs(group, level = self.by, drop_level = False)
            else:
                s = stat.loc[list(group)]

            if len(s) == 0:
                continue

            try:
                new_stat[group] = self.function(s)
            except Exception as e:
                raise util.CytoflowOpError('function',
                                           "Your function threw an error in group {}"
                                           .format(group)) from e

            # check for, and warn about, NaNs.
            if np.any(np.isnan(new_stat.loc[group])):
                warn("Category {} returned {}".format(group, new_stat.loc[group]),
                     util.CytoflowOpWarning)
    else:
        # no grouping: transform the whole statistic at once; the function
        # must return a Series in this case
        new_stat = self.function(stat)

        if not isinstance(new_stat, pd.Series):
            raise util.CytoflowOpError('by',
                                       "Transform function {} does not return a Series; "
                                       "in this case, you must set 'by'"
                                       .format(self.function))

    new_stat.name = "{} : {}".format(stat_name[0], stat_name[1])

    # if the function returned a Series per group whose index matches the
    # group's slice of the source statistic, flatten the Series-of-Series
    # into one Series with pd.concat.
    # NOTE(review): this loop iterates data[self.by] even when self.by is
    # empty (it then yields empty tuples) -- presumably only meaningful when
    # 'by' is set; confirm against callers.
    matched_series = True
    for group in data[self.by].itertuples(index = False, name = None):
        if isinstance(stat.index, pd.MultiIndex):
            s = stat.xs(group, level = self.by, drop_level = False)
        else:
            s = stat.loc[list(group)]

        if isinstance(new_stat.loc[group], pd.Series) and \
           s.index.equals(new_stat.loc[group].index):
            pass
        else:
            matched_series = False
            break

    if matched_series and len(self.by) > 0:
        new_stat = pd.concat(new_stat.values)

    # try to convert to numeric, but if there are non-numeric bits ignore
    new_stat = pd.to_numeric(new_stat, errors = 'ignore')

    # sort the index, for performance
    new_stat = new_stat.sort_index()

    new_experiment = experiment.clone()
    new_experiment.history.append(self.clone_traits(transient = lambda t: True))
    if self.statistic_name:
        new_experiment.statistics[(self.name, self.statistic_name)] = new_stat
    else:
        new_experiment.statistics[(self.name, self.function.__name__)] = new_stat

    return new_experiment
def apply(self, experiment):
    """
    Apply the KMeans clustering to the data.

    Parameters
    ----------
    experiment : Experiment
        the experiment to apply the clustering to

    Returns
    -------
    Experiment
        a new Experiment with one additional :attr:`~Experiment.condition`
        named :attr:`name`, of type ``category``.  The new category has
        values ``name_1, name_2, etc`` to indicate which k-means cluster an
        event is a member of.

        The new :class:`.Experiment` also has one new statistic called
        ``centers``, which is a list of tuples encoding the centroids of
        each k-means cluster.
    """
    if experiment is None:
        raise util.CytoflowOpError('experiment', "No experiment specified")

    # make sure name got set!
    if not self.name:
        raise util.CytoflowOpError('name',
                                   "You have to set the gate's name "
                                   "before applying it!")

    if self.name != util.sanitize_identifier(self.name):
        # NOTE(review): this format string has no placeholder, so
        # .format(self.name) is a no-op
        raise util.CytoflowOpError('name',
                                   "Name can only contain letters, numbers and underscores."
                                   .format(self.name))

    if self.name in experiment.data.columns:
        raise util.CytoflowOpError('name',
                                   "Experiment already has a column named {0}"
                                   .format(self.name))

    if not self._kmeans:
        raise util.CytoflowOpError(None,
                                   "No components found. Did you forget to "
                                   "call estimate()?")

    if len(self.channels) == 0:
        raise util.CytoflowOpError('channels',
                                   "Must set at least one channel")

    for c in self.channels:
        if c not in experiment.data:
            raise util.CytoflowOpError('channels',
                                       "Channel {0} not found in the experiment"
                                       .format(c))

    # NOTE(review): this checks scale keys against self.channels, but the
    # message says "in the experiment" -- confirm intended wording
    for c in self.scale:
        if c not in self.channels:
            raise util.CytoflowOpError('scale',
                                       "Scale set for channel {0}, but it isn't "
                                       "in the experiment"
                                       .format(c))

    for b in self.by:
        if b not in experiment.data:
            raise util.CytoflowOpError('by',
                                       "Aggregation metadata {} not found, "
                                       "must be one of {}"
                                       .format(b, experiment.conditions))

    if self.by:
        groupby = experiment.data.groupby(self.by)
    else:
        # use a lambda expression to return a group that contains
        # all the events
        groupby = experiment.data.groupby(lambda _: True)

    # default assignment for events that end up in no cluster
    event_assignments = pd.Series(["{}_None".format(self.name)] * len(experiment),
                                  dtype = "object")

    # make the statistics
    clusters = [x + 1 for x in range(self.num_clusters)]

    # index: one row per (by-values x cluster x channel)
    idx = pd.MultiIndex.from_product([experiment[x].unique() for x in self.by] +
                                     [clusters] +
                                     [self.channels],
                                     names = list(self.by) + ["Cluster"] + ["Channel"])
    centers_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()

    for group, data_subset in groupby:
        if len(data_subset) == 0:
            raise util.CytoflowOpError('by',
                                       "Group {} had no data"
                                       .format(group))

        if group not in self._kmeans:
            raise util.CytoflowOpError('by',
                                       "Group {} not found in the estimated model. "
                                       "Do you need to re-run estimate()?"
                                       .format(group))

    # scale this group's data, channel by channel
        x = data_subset.loc[:, self.channels[:]]
        for c in self.channels:
            x[c] = self._scale[c](x[c])

        # which values are missing?
        x_na = pd.Series([False] * len(x))
        for c in self.channels:
            x_na[np.isnan(x[c]).values] = True

        x = x.values
        x_na = x_na.values
        group_idx = groupby.groups[group]

        kmeans = self._kmeans[group]

        # predict only on rows with no missing values; the rest stay -1
        predicted = np.full(len(x), -1, "int")
        predicted[~x_na] = kmeans.predict(x[~x_na])

        # map cluster numbers to "<name>_<cluster+1>" category labels
        predicted_str = pd.Series(["(none)"] * len(predicted))
        for c in range(0, self.num_clusters):
            predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
        predicted_str[predicted == -1] = "{0}_None".format(self.name)
        predicted_str.index = group_idx

        event_assignments.iloc[group_idx] = predicted_str

        # record the (inverse-scaled) cluster centers in the statistic
        for c in range(self.num_clusters):
            if len(self.by) == 0:
                g = [c + 1]
            elif hasattr(group, '__iter__') and not isinstance(group, (str, bytes)):
                g = tuple(list(group) + [c + 1])
            else:
                g = tuple([group] + [c + 1])

            for cidx1, channel1 in enumerate(self.channels):
                g2 = tuple(list(g) + [channel1])
                centers_stat.loc[g2] = self._scale[channel1].inverse(kmeans.cluster_centers_[c, cidx1])

    new_experiment = experiment.clone()
    new_experiment.add_condition(self.name, "category", event_assignments)
    new_experiment.statistics[(self.name, "centers")] = pd.to_numeric(centers_stat)

    new_experiment.history.append(self.clone_traits(transient = lambda _: True))
    return new_experiment
def add_condition(self, name, dtype, data=None):
    """
    Add a new column of per-event metadata to this :class:`Experiment`.

    .. note::
        :meth:`add_condition` operates **in place.**

    There are two places to call `add_condition`.

    - As you're setting up a new :class:`Experiment`, call
      :meth:`add_condition` with ``data`` set to ``None`` to specify the
      conditions the new events will have.
    - If you compute some new per-event metadata on an existing
      :class:`Experiment`, call :meth:`add_condition` to add it.

    Parameters
    ----------
    name : String
        The name of the new column in :attr:`data`.  Must be a valid Python
        identifier: must start with ``[A-Za-z_]`` and contain only the
        characters ``[A-Za-z0-9_]``.
    dtype : String
        The type of the new column in :attr:`data`.  Must be a string that
        :class:`pandas.Series` recognizes as a ``dtype``: common types are
        ``category``, ``float``, ``int``, and ``bool``.
    data : pandas.Series (default = None)
        The :class:`pandas.Series` to add to :attr:`data`.  Must be the
        same length as :attr:`data`, and it must be convertable to a
        :class:`pandas.Series` of type ``dtype``.  If ``None``, will add an
        empty column to the :class:`Experiment` ... but the
        :class:`Experiment` must be empty to do so!

    Raises
    ------
    :class:`.CytoflowError`
        If the :class:`pandas.Series` passed in ``data`` isn't the same
        length as :attr:`data`, or isn't convertable to type ``dtype``.

    Examples
    --------
    >>> import cytoflow as flow
    >>> ex = flow.Experiment()
    >>> ex.add_condition("Time", "float")
    >>> ex.add_condition("Strain", "category")
    """
    # -- validation guards, in order of increasing cost --
    if name != util.sanitize_identifier(name):
        raise util.CytoflowError(
            "Name '{}' is not a valid Python identifier".format(name))

    if name in self.data:
        raise util.CytoflowError(
            "Already a column named {0} in self.data".format(name))

    if data is None and len(self) > 0:
        raise util.CytoflowError(
            "If data is None, self.data must be empty!")

    if data is not None and len(self) != len(data):
        raise util.CytoflowError(
            "data must be the same length as self.data")

    # -- install the column, coercing to the requested dtype --
    try:
        if data is None:
            new_column = pd.Series(dtype=dtype)
        else:
            new_column = data.astype(dtype, copy=True)
        self.data[name] = new_column
    except (ValueError, TypeError) as err:
        raise util.CytoflowError(
            "Had trouble converting data to type {0}".format(dtype)) from err

    # -- register the column as a condition in the experiment metadata --
    self.metadata[name] = {}
    self.metadata[name]['type'] = "condition"
def apply(self, experiment=None, metadata_only=False):
    """
    Load a new :class:`.Experiment`.

    Parameters
    ----------
    experiment : Experiment
        Ignored

    metadata_only : bool (default = False)
        Only "import" the metadata, creating an Experiment with all the
        expected metadata and structure but 0 events.

    Returns
    -------
    Experiment
        The new :class:`.Experiment`.  New channels have the following
        metadata:

        - **voltage** - int
            The voltage that this channel was collected at.  Determined
            by the ``$PnV`` field from the first FCS file.

        - **range** - int
            The maximum range of this channel.  Determined by the ``$PnR``
            field from the first FCS file.

        New experimental conditions do not have **voltage** or **range**
        metadata, obviously.  Instead, they have **experiment** set to
        ``True``, to distinguish the experimental variables from the
        conditions that were added by gates, etc.

        If :attr:`ignore_v` is set, it is added as a key to the
        :class:`.Experiment`-wide metadata.
    """
    if not self.tubes or len(self.tubes) == 0:
        raise util.CytoflowOpError('tubes', "Must specify some tubes!")

    # if we have channel renaming, make sure the new names are valid
    # python identifiers
    if self.channels:
        for old_name, new_name in self.channels.items():
            if old_name != new_name and new_name != util.sanitize_identifier(new_name):
                raise util.CytoflowOpError(
                    'channels', "Channel name {} must be a "
                    "valid Python identifier.".format(new_name))

    # make sure each tube has the same conditions
    tube0_conditions = set(self.tubes[0].conditions)
    for tube in self.tubes:
        tube_conditions = set(tube.conditions)
        if len(tube0_conditions ^ tube_conditions) > 0:
            raise util.CytoflowOpError(
                'tubes', "Tube {0} didn't have the same "
                "conditions as tube {1}".format(tube.file,
                                                self.tubes[0].file))

    # make sure experimental conditions are unique
    for idx, i in enumerate(self.tubes[0:-1]):
        for j in self.tubes[idx + 1:]:
            if i.conditions_equal(j):
                raise util.CytoflowOpError(
                    'tubes', "The same conditions specified for "
                    "tube {0} and tube {1}".format(i.file, j.file))

    experiment = Experiment()

    experiment.metadata["ignore_v"] = self.ignore_v

    # register the experimental conditions and mark them as such
    for condition, dtype in list(self.conditions.items()):
        experiment.add_condition(condition, dtype)
        experiment.metadata[condition]['experiment'] = True

    try:
        # silence warnings about duplicate channels;
        # we'll figure that out below
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            tube0_meta = fcsparser.parse(self.tubes[0].file,
                                         data_set=self.data_set,
                                         meta_data_only=True,
                                         reformat_meta=True)
    except Exception as e:
        raise util.CytoflowOpError(
            'tubes', "FCS reader threw an error reading metadata "
            "for tube {}: {}".format(self.tubes[0].file, str(e))) from e

    meta_channels = tube0_meta["_channels_"]

    # which FCS keyword supplies the channel names ($PnN vs $PnS)
    if self.name_metadata:
        experiment.metadata["name_metadata"] = self.name_metadata
    else:
        experiment.metadata["name_metadata"] = autodetect_name_metadata(
            self.tubes[0].file, data_set=self.data_set)

    meta_channels['Index'] = meta_channels.index
    meta_channels.set_index(experiment.metadata["name_metadata"],
                            inplace=True)

    channels = list(self.channels.keys()) if self.channels \
               else list(meta_channels.index.values)

    # make sure everything in self.channels is in the tube channels
    for channel in channels:
        if channel not in meta_channels.index:
            raise util.CytoflowOpError(
                'channels', "Channel {0} not in tube {1}".format(
                    channel, self.tubes[0].file))

    # now that we have the metadata, load it into experiment
    for channel in channels:
        experiment.add_channel(channel)
        experiment.metadata[channel]["fcs_name"] = channel

        # keep track of the channel's PMT voltage
        if ("$PnV" in meta_channels.loc[channel]):
            v = meta_channels.loc[channel]['$PnV']
            if v:
                experiment.metadata[channel]["voltage"] = v

        # add the maximum possible value for this channel.
        data_range = meta_channels.loc[channel]['$PnR']
        data_range = float(data_range)
        experiment.metadata[channel]['range'] = data_range

    experiment.metadata['fcs_metadata'] = {}
    for tube in self.tubes:
        if metadata_only:
            tube_meta, tube_data = parse_tube(tube.file,
                                              experiment,
                                              data_set=self.data_set,
                                              metadata_only=True)
        else:
            tube_meta, tube_data = parse_tube(tube.file,
                                              experiment,
                                              data_set=self.data_set)

            # optionally subsample each tube to self.events events
            if self.events:
                if self.events <= len(tube_data):
                    tube_data = tube_data.loc[np.random.choice(
                        tube_data.index, self.events, replace=False)]
                else:
                    warnings.warn(
                        "Only {0} events in tube {1}".format(
                            len(tube_data), tube.file),
                        util.CytoflowWarning)

            experiment.add_events(tube_data[channels], tube.conditions)

        # extract the row and column from wells collected on a
        # BD HTS
        if 'WELL ID' in tube_meta:
            pos = tube_meta['WELL ID']
            tube_meta['CF_Row'] = pos[0]
            tube_meta['CF_Col'] = int(pos[1:3])

        for i, channel in enumerate(channels):
            # remove the PnV tube metadata
            if '$P{}V'.format(i + 1) in tube_meta:
                del tube_meta['$P{}V'.format(i + 1)]

            # work around a bug where the PnR is sometimes not the detector range
            # but the data range.
            pnr = '$P{}R'.format(i + 1)
            if pnr in tube_meta and float(
                    tube_meta[pnr]) > experiment.metadata[channel]['range']:
                experiment.metadata[channel]['range'] = float(
                    tube_meta[pnr])

        tube_meta['CF_File'] = Path(tube.file).stem
        experiment.metadata['fcs_metadata'][tube.file] = tube_meta

    # apply any requested channel renames to both the data and the metadata
    for channel in channels:
        if self.channels and channel in self.channels:
            new_name = self.channels[channel]
            if channel == new_name:
                continue
            experiment.data.rename(columns={channel: new_name}, inplace=True)
            experiment.metadata[new_name] = experiment.metadata[channel]
            experiment.metadata[new_name]["fcs_name"] = channel
            del experiment.metadata[channel]

    # this catches an odd corner case where some instruments store
    # instrument-specific info in the "extra" bits.  we have to
    # clear them out.
    # NOTE(review): 'channel' below is the leftover variable from the loop
    # above, so this masking/rescaling appears to touch only the *last*
    # channel -- and would KeyError if that channel was just renamed.
    # Confirm against the upstream source whether this block belongs inside
    # a per-channel loop.
    if tube0_meta['$DATATYPE'] == 'I':
        data_bits = int(meta_channels.loc[channel]['$PnB'])
        data_range = float(meta_channels.loc[channel]['$PnR'])
        range_bits = int(math.log(data_range, 2))

        if range_bits < data_bits:
            # build a mask of 'range_bits' low-order 1s
            mask = 1
            for _ in range(1, range_bits):
                mask = mask << 1 | 1
            experiment.data[channel] = experiment.data[
                channel].values.astype('int') & mask

    # re-scale the data to linear if if's recorded as log-scaled with
    # integer channels
    data_range = float(meta_channels.loc[channel]['$PnR'])
    f1 = float(meta_channels.loc[channel]['$PnE'][0])
    f2 = float(meta_channels.loc[channel]['$PnE'][1])

    if f1 > 0.0 and f2 == 0.0:
        # per the FCS spec, $PnE = f1,0 is invalid; treat f2 as 1.0
        warnings.warn(
            'Invalid $PnE = {},{} for channel {}, changing it to {},1.0'
            .format(f1, f2, channel, f1),
            util.CytoflowWarning)
        f2 = 1.0

    if f1 > 0.0 and f2 > 0.0 and tube0_meta['$DATATYPE'] == 'I':
        warnings.warn(
            'Converting channel {} from logarithmic to linear'.format(
                channel),
            util.CytoflowWarning)
        # experiment.data[channel] = 10 ** (f1 * experiment.data[channel] / data_range) * f2

    return experiment
def apply(self, experiment):
    """
    Assigns new metadata to events using the mixture model estimated in
    :meth:`estimate`.

    Returns
    -------
    Experiment
        A new :class:`.Experiment`, with a new column named :attr:`name`,
        and possibly one named :attr:`name` _Posterior.  Also the following
        new :attr:`~.Experiment.statistics`:

        - **mean** : Float
            the mean of the fitted gaussian

        - **stdev** : Float
            the inverse-scaled standard deviation of the fitted gaussian.
            on a linear scale, this is in the same units as the mean; on a
            log scale, this is a scalar multiple; and on a logicle scale,
            this is probably meaningless!

        - **interval** : (Float, Float)
            the inverse-scaled (mean - stdev, mean + stdev) of the fitted
            gaussian.  this is likely more meaningful than ``stdev``,
            especially on the ``logicle`` scale.

        - **proportion** : Float
            the proportion of events in each component of the mixture
            model.  only set if :attr:`num_components` ``> 1``.
    """
    warn("GaussianMixture1DOp is DEPRECATED. Please use GaussianMixtureOp.",
         util.CytoflowOpWarning)

    if experiment is None:
        raise util.CytoflowOpError('experiment', "No experiment specified")

    if not self._gmms:
        raise util.CytoflowOpError(None,
                                   "No model found. Did you forget to "
                                   "call estimate()?")

    # make sure name got set!
    if not self.name:
        raise util.CytoflowOpError('name',
                                   "You have to set the gate's name "
                                   "before applying it!")

    if self.name != util.sanitize_identifier(self.name):
        # NOTE(review): this format string has no placeholder, so
        # .format(self.name) is a no-op
        raise util.CytoflowOpError('name',
                                   "Name can only contain letters, numbers and underscores."
                                   .format(self.name))

    if self.name in experiment.data.columns:
        raise util.CytoflowOpError('name',
                                   "Experiment already has a column named {0}"
                                   .format(self.name))

    # NOTE(review): duplicate of the `if not self._gmms` check above
    if not self._gmms:
        raise util.CytoflowOpError(None,
                                   "No components found. Did you forget to "
                                   "call estimate()?")

    if not self._scale:
        raise util.CytoflowOpError(None,
                                   "Couldn't find _scale. What happened??")

    if self.channel not in experiment.data:
        raise util.CytoflowOpError('channel',
                                   "Column {0} not found in the experiment"
                                   .format(self.channel))

    if self.posteriors:
        col_name = "{0}_Posterior".format(self.name)
        if col_name in experiment.data:
            raise util.CytoflowOpError('posteriors',
                                       "Column {0} already found in the experiment"
                                       .format(col_name))

    for b in self.by:
        if b not in experiment.data:
            raise util.CytoflowOpError('by',
                                       "Aggregation metadata {} not found, "
                                       "must be one of {}"
                                       .format(b, experiment.conditions))

    if self.sigma < 0.0:
        raise util.CytoflowOpError('sigma', "sigma must be >= 0.0")

    if self.by:
        by = sorted(self.by)
        groupby = experiment.data.groupby(by)
    else:
        # use a lambda expression to return a group that
        # contains all the events
        groupby = experiment.data.groupby(lambda _: True)

    event_assignments = pd.Series([None] * len(experiment), dtype = "object")

    if self.posteriors:
        event_posteriors = pd.Series([0.0] * len(experiment))

    # what we DON'T want to do is iterate through event-by-event.
    # the more of this we can push into numpy, sklearn and pandas,
    # the faster it's going to be.
    for group, data_subset in groupby:
        # if there weren't any events in this group, there's no gmm
        if group not in self._gmms:
            warn("There wasn't a GMM for data subset {}".format(group),
                 util.CytoflowOpWarning)
            continue

        gmm = self._gmms[group]
        x = data_subset[self.channel]
        x = self._scale(x).values

        # which values are missing?
        x_na = np.isnan(x)

        group_idx = groupby.groups[group]

        # make a preliminary assignment
        predicted = np.full(len(x), -1, "int")
        predicted[~x_na] = gmm.predict(x[~x_na, np.newaxis])

        # if we're doing sigma-based gating, for each component check
        # to see if the event is in the sigma gate.
        if self.sigma > 0.0:
            # make a quick dataframe with the value and the predicted
            # component
            gate_df = pd.DataFrame({"x" : x, "p" : predicted})

            # for each component, get the low and the high threshold
            for c in range(0, self.num_components):
                lo = (gmm.means_[c][0]    # @UnusedVariable
                      - self.sigma * np.sqrt(gmm.covariances_[c][0]))
                hi = (gmm.means_[c][0]    # @UnusedVariable
                      + self.sigma * np.sqrt(gmm.covariances_[c][0]))

                # and build an expression with numexpr so it evaluates fast!
                gate_bool = gate_df.eval("p == @c and x >= @lo and x <= @hi").values
                # events predicted in component c but outside the sigma
                # gate get un-assigned (-1)
                predicted[np.logical_and(predicted == c, gate_bool == False)] = -1

        # map component numbers to "<name>_<c+1>" labels
        predicted_str = pd.Series(["(none)"] * len(predicted))
        for c in range(0, self.num_components):
            predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
        predicted_str[predicted == -1] = "{0}_None".format(self.name)
        predicted_str.index = group_idx

        event_assignments.iloc[group_idx] = predicted_str

        if self.posteriors:
            # posterior probability of each event's assigned component
            probability = np.full((len(x), self.num_components), 0.0, "float")
            probability[~x_na, :] = gmm.predict_proba(x[~x_na, np.newaxis])
            posteriors = pd.Series([0.0] * len(predicted))
            for i in range(0, self.num_components):
                posteriors[predicted == i] = probability[predicted == i, i]
            posteriors.index = group_idx
            event_posteriors.iloc[group_idx] = posteriors

    new_experiment = experiment.clone()

    # one component + sigma gate -> boolean gate; multiple components ->
    # categorical assignment
    if self.num_components == 1 and self.sigma > 0:
        new_experiment.add_condition(self.name, "bool",
                                     event_assignments == "{0}_1".format(self.name))
    elif self.num_components > 1:
        new_experiment.add_condition(self.name, "category", event_assignments)

    if self.posteriors and self.num_components > 1:
        col_name = "{0}_Posterior".format(self.name)
        new_experiment.add_condition(col_name, "float", event_posteriors)

    # add the statistics
    levels = list(self.by)
    if self.num_components > 1:
        levels.append(self.name)

    if levels:
        idx = pd.MultiIndex.from_product([new_experiment[x].unique() for x in levels],
                                         names = levels)

        mean_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()
        stdev_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()
        interval_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()
        prop_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()

        for group, _ in groupby:
            gmm = self._gmms[group]
            for c in range(self.num_components):
                # build the index key 'g' for this (group, component) pair
                if self.num_components > 1:
                    component_name = "{}_{}".format(self.name, c + 1)

                    if group is True:
                        g = [component_name]
                    elif isinstance(group, tuple):
                        g = list(group)
                        g.append(component_name)
                    else:
                        g = list([group])
                        g.append(component_name)

                    if len(g) > 1:
                        g = tuple(g)
                    else:
                        g = (g[0],)
                else:
                    g = group

                mean_stat.at[g] = self._scale.inverse(gmm.means_[c][0])
                stdev_stat.at[g] = self._scale.inverse(np.sqrt(gmm.covariances_[c][0]))[0]
                interval_stat.at[g] = (self._scale.inverse(gmm.means_[c][0] - np.sqrt(gmm.covariances_[c][0][0])),
                                       self._scale.inverse(gmm.means_[c][0] + np.sqrt(gmm.covariances_[c][0][0])))
                prop_stat.at[g] = gmm.weights_[c]

        new_experiment.statistics[(self.name, "mean")] = pd.to_numeric(mean_stat)
        new_experiment.statistics[(self.name, "stdev")] = pd.to_numeric(stdev_stat)
        new_experiment.statistics[(self.name, "interval")] = interval_stat
        if self.num_components > 1:
            new_experiment.statistics[(self.name, "proportion")] = pd.to_numeric(prop_stat)

    new_experiment.history.append(self.clone_traits(transient = lambda _: True))

    return new_experiment
def apply(self, experiment):
    """Applies the range gate to an experiment.

    Parameters
    ----------
    experiment : Experiment
        the old_experiment to which this op is applied

    Returns
    -------
    Experiment
        a new experiment, the same as old :class:`~Experiment` but with a
        new column of type ``bool`` with the same name as the operation
        name.  The bool is ``True`` if the event's measurement in
        :attr:`channel` is greater than :attr:`low` and less than
        :attr:`high`; it is ``False`` otherwise.

    Raises
    ------
    util.CytoflowOpError
        if the gate's parameters are inconsistent with the experiment
    """
    if experiment is None:
        raise util.CytoflowOpError('experiment', "No experiment specified")

    # make sure name got set!
    if not self.name:
        raise util.CytoflowOpError(
            'name', "You have to set the gate's name "
            "before applying it!")

    if self.name != util.sanitize_identifier(self.name):
        # BUG FIX: the original format string had no placeholder, so the
        # offending name was silently dropped from the error message.
        raise util.CytoflowOpError(
            'name',
            "Name {} can only contain letters, numbers and underscores."
            .format(self.name))

    if self.name in experiment.data.columns:
        raise util.CytoflowOpError(
            'name',
            "Experiment already has a column named {0}".format(self.name))

    if not self.channel:
        raise util.CytoflowOpError('channel', "Channel not specified")

    if self.channel not in experiment.channels:
        raise util.CytoflowOpError(
            'channel',
            "Channel {0} not in the experiment".format(self.channel))

    if self.high <= self.low:
        raise util.CytoflowOpError('high', "range high must be > range low")

    if self.high <= experiment[self.channel].min():
        raise util.CytoflowOpError(
            'high', "range high must be > {0}".format(
                experiment[self.channel].min()))

    if self.low >= experiment[self.channel].max():
        raise util.CytoflowOpError(
            'low', "range low must be < {0}".format(
                experiment[self.channel].max()))

    # vectorized containment test over the whole channel column
    gate = experiment[self.channel].between(self.low, self.high)

    new_experiment = experiment.clone()
    new_experiment.add_condition(self.name, "bool", gate)
    new_experiment.history.append(
        self.clone_traits(transient=lambda _: True))

    return new_experiment
def apply(self, experiment):
    """
    Apply the operation to an experiment: group the experiment's events by
    the conditions in :attr:`by` (optionally restricted to :attr:`subset`),
    apply :attr:`function` to each group's events, and collect the results
    in a new statistic named ``(name, statistic_name)`` -- or
    ``(name, function.__name__)`` if :attr:`statistic_name` is unset.

    Parameters
    ----------
    experiment : Experiment
        the experiment to apply the operation to

    Returns
    -------
    Experiment
        a clone of ``experiment`` with the new statistic added

    Raises
    ------
    util.CytoflowOpError
        if the operation's parameters are inconsistent with the experiment
    """
    if experiment is None:
        raise util.CytoflowOpError('experiment', "No experiment specified")

    if not self.name:
        raise util.CytoflowOpError('name', "Must specify a name")

    if self.name != util.sanitize_identifier(self.name):
        # BUG FIX: the original format string had no placeholder, so the
        # offending name was silently dropped from the error message.
        raise util.CytoflowOpError(
            'name',
            "Name {} can only contain letters, numbers and underscores."
            .format(self.name))

    if not self.function:
        raise util.CytoflowOpError('function', "Must specify a function")

    if not self.by:
        raise util.CytoflowOpError(
            'by', "Must specify some grouping conditions "
            "in 'by'")

    stat_name = (self.name, self.statistic_name) \
                if self.statistic_name \
                else (self.name, self.function.__name__)

    if stat_name in experiment.statistics:
        raise util.CytoflowOpError(
            'name',
            "{} is already in the experiment's statistics".format(
                stat_name))

    # clone *before* applying the subset, so the returned experiment keeps
    # all the events
    new_experiment = experiment.clone()

    if self.subset:
        try:
            experiment = experiment.query(self.subset)
        except Exception as e:
            raise util.CytoflowOpError(
                'subset', "Subset string '{0}' isn't valid".format(
                    self.subset)) from e

        if len(experiment) == 0:
            raise util.CytoflowOpError(
                'subset', "Subset string '{0}' returned no events".format(
                    self.subset))

    for b in self.by:
        if b not in experiment.conditions:
            raise util.CytoflowOpError(
                'by', "Aggregation metadata {} not found, "
                " must be one of {}".format(b, experiment.conditions))
        unique = experiment.data[b].unique()

        if len(unique) == 1:
            warn("Only one category for {}".format(b),
                 util.CytoflowOpWarning)

    groupby = experiment.data.groupby(self.by)

    # warn about empty groups up front; they're skipped when computing below
    for group, data_subset in groupby:
        if len(data_subset) == 0:
            warn("Group {} had no data".format(group),
                 util.CytoflowOpWarning)

    idx = pd.MultiIndex.from_product(
        [experiment[x].unique() for x in self.by], names=self.by)

    # pre-fill with self.fill; dtype object so the function may return
    # anything
    stat = pd.Series(data=self.fill,
                     index=idx,
                     name="{} : {}".format(stat_name[0], stat_name[1]),
                     dtype=np.dtype(object)).sort_index()

    for group, data_subset in groupby:
        if len(data_subset) == 0:
            continue

        try:
            stat.loc[group] = self.function(data_subset)
        except Exception as e:
            raise util.CytoflowOpError(
                'function',
                "Your function threw an error in group {}".format(
                    group)) from e

        # check for, and warn about, NaNs.
        if np.any(np.isnan(stat.loc[group])):
            warn("Category {} returned {}".format(group, stat.loc[group]),
                 util.CytoflowOpWarning)

    # try to convert to numeric, but if there are non-numeric bits ignore
    stat = pd.to_numeric(stat, errors='ignore')

    new_experiment.history.append(
        self.clone_traits(transient=lambda t: True))
    new_experiment.statistics[stat_name] = stat

    return new_experiment
def reset_channels(self):
    """Rebuild ``channels_list`` from ``original_channels``, pairing each
    raw channel name with a sanitized Python-identifier version."""
    rebuilt = []
    for raw_name in self.original_channels:
        rebuilt.append(Channel(channel=raw_name,
                               name=util.sanitize_identifier(raw_name)))
    self.channels_list = rebuilt
def apply(self, experiment):
    """Applies the polygon gate to an experiment.

    Parameters
    ----------
    experiment : Experiment
        the old :class:`Experiment` to which this op is applied

    Returns
    -------
    Experiment
        a new :class:`Experiment`, the same as ``old_experiment`` but with
        a new column of type ``bool`` with the same name as the operation
        :attr:`name`.  The bool is ``True`` if the event's measurement is
        within the polygon, and ``False`` otherwise.

    Raises
    ------
    util.CytoflowOpError
        if for some reason the operation can't be applied to this
        experiment. The reason is in :attr:`.CytoflowOpError.args`
    """
    if experiment is None:
        raise util.CytoflowOpError('experiment', "No experiment specified")

    if self.name in experiment.data.columns:
        raise util.CytoflowOpError(
            'name',
            "{} is in the experiment already!".format(self.name))

    if self.name != util.sanitize_identifier(self.name):
        # BUG FIX: the original format string had no placeholder, so the
        # offending name was silently dropped from the error message.
        raise util.CytoflowOpError(
            'name',
            "Name {} can only contain letters, numbers and underscores."
            .format(self.name))

    if not self.xchannel:
        raise util.CytoflowOpError('xchannel', "Must specify an x channel")

    if not self.ychannel:
        raise util.CytoflowOpError('ychannel', "Must specify a y channel")

    if self.xchannel not in experiment.channels:
        raise util.CytoflowOpError(
            'xchannel',
            "xchannel {0} is not in the experiment".format(self.xchannel))

    if self.ychannel not in experiment.channels:
        raise util.CytoflowOpError(
            'ychannel',
            "ychannel {0} is not in the experiment".format(self.ychannel))

    if len(self.vertices) < 3:
        raise util.CytoflowOpError('vertices',
                                   "Must have at least 3 vertices")

    if any([len(x) != 2 for x in self.vertices]):
        # BUG FIX: this error was previously *returned* instead of raised,
        # so malformed vertices silently produced a bogus return value
        # instead of failing.
        raise util.CytoflowOpError(
            'vertices', "All vertices must be lists or tuples "
            "of length = 2")

    # make sure name got set!
    if not self.name:
        raise util.CytoflowOpError(
            'name', "You have to set the Polygon gate's name "
            "before applying it!")

    # (the duplicate name-in-columns check that used to follow here was
    # redundant with the identical check above and has been removed)

    # there's a bit of a subtlety here: if the vertices were
    # selected with an interactive plot, and that plot had scaled
    # axes, we need to apply that scale function to both the
    # vertices and the data before looking for path membership
    xscale = util.scale_factory(self.xscale, experiment,
                                channel=self.xchannel)
    yscale = util.scale_factory(self.yscale, experiment,
                                channel=self.ychannel)

    vertices = [(xscale(x), yscale(y)) for (x, y) in self.vertices]
    data = experiment.data[[self.xchannel, self.ychannel]].copy()
    data[self.xchannel] = xscale(data[self.xchannel])
    data[self.ychannel] = yscale(data[self.ychannel])

    # use a matplotlib Path because testing for membership is a fast C fn.
    path = mpl.path.Path(np.array(vertices))
    xy_data = data[[self.xchannel, self.ychannel]].values

    new_experiment = experiment.clone()
    new_experiment.add_condition(self.name, "bool",
                                 path.contains_points(xy_data))
    new_experiment.history.append(
        self.clone_traits(transient=lambda _: True))

    return new_experiment
def apply(self, experiment):
    """
    Applies the 2D range gate to an experiment.

    Parameters
    ----------
    experiment : Experiment
        the old experiment to which this op is applied

    Returns
    -------
    Experiment
        a new :class:`~Experiment`, the same as the old experiment but with
        a new column with a data type of ``bool`` and the same name as the
        operation :attr:`name`.  The bool is ``True`` if the event's
        measurement in :attr:`xchannel` is greater than :attr:`xlow` and
        less than :attr:`xhigh`, and the event's measurement in
        :attr:`ychannel` is greater than :attr:`ylow` and less than
        :attr:`yhigh`; it is ``False`` otherwise.

    Raises
    ------
    util.CytoflowOpError
        if the operation can't be applied to this experiment.
    """
    if experiment is None:
        raise util.CytoflowOpError('experiment', "No experiment specified")

    # make sure name got set!
    if not self.name:
        raise util.CytoflowOpError('name',
                                   "You have to set the gate's name "
                                   "before applying it!")

    # dead ".format(self.name)" removed -- the message has no placeholder
    if self.name != util.sanitize_identifier(self.name):
        raise util.CytoflowOpError(
            'name',
            "Name can only contain letters, numbers and underscores.")

    # make sure old_experiment doesn't already have a column named self.name
    if (self.name in experiment.data.columns):
        raise util.CytoflowOpError(
            'name',
            "Experiment already contains a column {0}".format(self.name))

    # BUG FIX: a missing ychannel used to trip the combined
    # "if not self.xchannel or not self.ychannel" test and report
    # "Must specify xchannel"; check each channel separately so the
    # error names the right trait.
    if not self.xchannel:
        raise util.CytoflowOpError('xchannel', "Must specify xchannel")

    if not self.xchannel in experiment.channels:
        raise util.CytoflowOpError('xchannel',
                                   "xchannel isn't in the experiment")

    if not self.ychannel:
        raise util.CytoflowOpError('ychannel', "Must specify ychannel")

    if not self.ychannel in experiment.channels:
        raise util.CytoflowOpError('ychannel',
                                   "ychannel isn't in the experiment")

    # sanity-check the gate bounds against the observed data range
    if self.xhigh <= experiment[self.xchannel].min():
        raise util.CytoflowOpError(
            'xhigh',
            "x channel range high must be > {0}".format(
                experiment[self.xchannel].min()))
    if self.xlow >= experiment[self.xchannel].max():
        raise util.CytoflowOpError(
            'xlow',
            "x channel range low must be < {0}".format(
                experiment[self.xchannel].max()))

    if self.yhigh <= experiment[self.ychannel].min():
        raise util.CytoflowOpError(
            'yhigh',
            "y channel range high must be > {0}".format(
                experiment[self.ychannel].min()))
    if self.ylow >= experiment[self.ychannel].max():
        raise util.CytoflowOpError(
            'ylow',
            "y channel range low must be < {0}".format(
                experiment[self.ychannel].max()))

    # an event is in the gate iff it is inside BOTH 1D ranges
    x = experiment[self.xchannel].between(self.xlow, self.xhigh)
    y = experiment[self.ychannel].between(self.ylow, self.yhigh)
    gate = pd.Series(x & y)

    new_experiment = experiment.clone()
    new_experiment.add_condition(self.name, "bool", gate)
    new_experiment.history.append(self.clone_traits(transient=lambda t: True))

    return new_experiment
def _get_subset_str(self):
    """
    Build the subset-expression string for this range selection.

    Returns an empty string when the selection spans the entire range of
    values (i.e. nothing is actually filtered out); otherwise returns a
    pandas-query-style expression bounding the sanitized condition name
    between ``low`` and ``high``, inclusive.
    """
    spans_everything = (self.low == self.values[0]
                        and self.high == self.values[-1])
    if spans_everything:
        return ""

    ident = sanitize_identifier(self.name)
    return "({0} >= {1} and {0} <= {2})".format(ident, self.low, self.high)
def apply(self, experiment=None):
    """
    Load a new :class:`.Experiment`.

    Returns
    -------
    Experiment
        The new :class:`.Experiment`.  New channels have the following
        metadata:

        - **voltage** - int
            The voltage that this channel was collected at.  Determined
            by the ``$PnV`` field from the first FCS file.

        - **range** - int
            The maximum range of this channel.  Determined by the ``$PnR``
            field from the first FCS file.

        New experimental conditions do not have **voltage** or **range**
        metadata, obviously.  Instead, they have **experiment** set to
        ``True``, to distinguish the experimental variables from the
        conditions that were added by gates, etc.

        If :attr:`ignore_v` is set, it is added as a key to the
        :class:`.Experiment`-wide metadata.
    """
    # NOTE(review): the ``experiment`` parameter is never read -- it is
    # shadowed by a fresh Experiment() below.  Presumably kept only to
    # match the common operation interface; confirm against callers.
    if not self.tubes or len(self.tubes) == 0:
        raise util.CytoflowOpError('tubes', "Must specify some tubes!")

    # if we have channel renaming, make sure the new names are valid
    # python identifiers
    if self.channels:
        for old_name, new_name in self.channels.items():
            if old_name != new_name and new_name != util.sanitize_identifier(new_name):
                raise util.CytoflowOpError('channels',
                                           "Channel name {} must be a "
                                           "valid Python identifier.".format(new_name))

    # make sure each tube has the same conditions
    # (symmetric difference is non-empty iff the condition sets differ)
    tube0_conditions = set(self.tubes[0].conditions)
    for tube in self.tubes:
        tube_conditions = set(tube.conditions)
        if len(tube0_conditions ^ tube_conditions) > 0:
            raise util.CytoflowOpError('tubes',
                                       "Tube {0} didn't have the same "
                                       "conditions as tube {1}".format(tube.file, self.tubes[0].file))

    # make sure experimental conditions are unique across tubes
    # (pairwise comparison over all tube pairs)
    for idx, i in enumerate(self.tubes[0:-1]):
        for j in self.tubes[idx + 1:]:
            if i.conditions_equal(j):
                raise util.CytoflowOpError('tubes',
                                           "The same conditions specified for "
                                           "tube {0} and tube {1}".format(i.file, j.file))

    experiment = Experiment()
    experiment.metadata["ignore_v"] = self.ignore_v

    # declare the experimental conditions, marking each as coming from the
    # experiment itself (vs. added later by a gate)
    for condition, dtype in list(self.conditions.items()):
        experiment.add_condition(condition, dtype)
        experiment.metadata[condition]['experiment'] = True

    try:
        # silence warnings about duplicate channels;
        # we'll figure that out below
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            tube0_meta = fcsparser.parse(self.tubes[0].file,
                                         meta_data_only=True,
                                         reformat_meta=True)
    except Exception as e:
        raise util.CytoflowOpError('tubes',
                                   "FCS reader threw an error reading metadata "
                                   "for tube {}".format(self.tubes[0].file)) from e

    meta_channels = tube0_meta["_channels_"]

    if self.name_metadata:
        experiment.metadata["name_metadata"] = self.name_metadata
    else:
        # try to autodetect the metadata: prefer whichever of $PnN / $PnS
        # is present; if both are, prefer the one with unique values
        if "$PnN" in meta_channels and not "$PnS" in meta_channels:
            experiment.metadata["name_metadata"] = "$PnN"
        elif "$PnN" not in meta_channels and "$PnS" in meta_channels:
            experiment.metadata["name_metadata"] = "$PnS"
        else:
            PnN = meta_channels["$PnN"]
            PnS = meta_channels["$PnS"]

            # sometimes one is unique and the other isn't
            if (len(set(PnN)) == len(PnN) and len(set(PnS)) != len(PnS)):
                experiment.metadata["name_metadata"] = "$PnN"
            elif (len(set(PnN)) != len(PnN) and len(set(PnS)) == len(PnS)):
                experiment.metadata["name_metadata"] = "$PnS"
            else:
                # as per fcsparser.api, $PnN is the "short name" (like FL-1)
                # and $PnS is the "actual name" (like "FSC-H"). so let's
                # use $PnS.
                experiment.metadata["name_metadata"] = "$PnS"

    # index the channel metadata table by the chosen channel-name field
    meta_channels.set_index(experiment.metadata["name_metadata"],
                            inplace=True)

    channels = list(self.channels.keys()) if self.channels \
               else list(tube0_meta["_channel_names_"])

    # make sure everything in self.channels is in the tube channels
    for channel in channels:
        if channel not in meta_channels.index:
            raise util.CytoflowOpError('channels',
                                       "Channel {0} not in tube {1}".format(channel, self.tubes[0].file))

    # now that we have the metadata, load it into experiment
    for channel in channels:
        experiment.add_channel(channel)

        experiment.metadata[channel]["fcs_name"] = channel

        # keep track of the channel's PMT voltage
        if ("$PnV" in meta_channels.loc[channel]):
            v = meta_channels.loc[channel]['$PnV']
            if v:
                experiment.metadata[channel]["voltage"] = v

        # add the maximum possible value for this channel.
        data_range = meta_channels.loc[channel]['$PnR']
        data_range = float(data_range)
        experiment.metadata[channel]['range'] = data_range

    experiment.metadata['fcs_metadata'] = {}
    for tube in self.tubes:
        tube_meta, tube_data = parse_tube(tube.file, experiment)

        if self.events:
            if self.events <= len(tube_data):
                # subsample (without replacement) down to self.events rows
                tube_data = tube_data.loc[np.random.choice(tube_data.index,
                                                           self.events,
                                                           replace=False)]
            else:
                # fewer events than requested: keep them all, but warn
                warnings.warn("Only {0} events in tube {1}".format(len(tube_data), tube.file),
                              util.CytoflowWarning)

        experiment.add_events(tube_data[channels], tube.conditions)
        experiment.metadata['fcs_metadata'][tube.file] = tube_meta

    # apply any channel renaming, moving the per-channel metadata along
    # with the renamed data column
    for channel in channels:
        if self.channels and channel in self.channels:
            new_name = self.channels[channel]
            if channel == new_name:
                continue
            experiment.data.rename(columns={channel: new_name}, inplace=True)
            experiment.metadata[new_name] = experiment.metadata[channel]
            experiment.metadata[new_name]["fcs_name"] = channel
            del experiment.metadata[channel]

    return experiment
def apply(self, experiment):
    """
    Assign events to a cluster.

    Assigns each event to one of the k-means centroids from
    :meth:`estimate`, then groups together events in the same cluster
    hierarchy.

    Parameters
    ----------
    experiment : Experiment
        the :class:`.Experiment` to apply the gate to.

    Returns
    -------
    Experiment
        A new :class:`.Experiment` with the gate applied to it.
        TODO - document the extra statistics
    """
    if experiment is None:
        raise util.CytoflowOpError('experiment', "No experiment specified")

    # make sure name got set!
    if not self.name:
        raise util.CytoflowOpError('name',
                                   "You have to set the gate's name "
                                   "before applying it!")

    if self.name != util.sanitize_identifier(self.name):
        raise util.CytoflowOpError('name',
                                   "Name can only contain letters, numbers and underscores.".format(self.name))

    if self.name in experiment.data.columns:
        raise util.CytoflowOpError('name',
                                   "Experiment already has a column named {0}".format(self.name))

    if len(self.channels) == 0:
        raise util.CytoflowOpError('channels',
                                   "Must set at least one channel")

    # self._peaks is populated by estimate(); empty means the model
    # was never fitted
    if not self._peaks:
        raise util.CytoflowOpError(None,
                                   "No model found. Did you forget to "
                                   "call estimate()?")

    for c in self.channels:
        if c not in experiment.data:
            raise util.CytoflowOpError('channels',
                                       "Channel {0} not found in the experiment".format(c))

    for c in self.scale:
        if c not in self.channels:
            raise util.CytoflowOpError('scale',
                                       "Scale set for channel {0}, but it isn't "
                                       "in the experiment".format(c))

    for b in self.by:
        if b not in experiment.conditions:
            raise util.CytoflowOpError('by',
                                       "Aggregation metadata {} not found, "
                                       "must be one of {}".format(b, experiment.conditions))

    if self.by:
        groupby = experiment.data.groupby(self.by)
    else:
        # use a lambda expression to return a group that contains
        # all the events
        groupby = experiment.data.groupby(lambda _: True)

    # default every event to the "no cluster" label; per-group results
    # overwrite the relevant slots below
    event_assignments = pd.Series(["{}_None".format(self.name)] * len(experiment),
                                  dtype="object")

    # NOTE(review): disabled "centers" statistics construction removed for
    # clarity -- it built a MultiIndex Series over (by-values, cluster,
    # channel); retrieve from version control if it is revived.

    for group, data_subset in groupby:
        if len(data_subset) == 0:
            raise util.CytoflowOpError('by',
                                       "Group {} had no data".format(group))

        if group not in self._kmeans:
            raise util.CytoflowOpError('by',
                                       "Group {} not found in the estimated "
                                       "model. Do you need to re-run estimate()?".format(group))

        x = data_subset.loc[:, self.channels[:]]
        # apply the per-channel scale functions fitted in estimate()
        for c in self.channels:
            x[c] = self._scale[c](x[c])

        # which values are missing? (scaling can introduce NaNs, e.g. a
        # log scale applied to non-positive values)
        x_na = pd.Series([False] * len(x))
        for c in self.channels:
            x_na[np.isnan(x[c]).values] = True

        x = x.values
        x_na = x_na.values
        group_idx = groupby.groups[group]

        kmeans = self._kmeans[group]

        # -1 marks events that couldn't be assigned (NaN rows)
        predicted_km = np.full(len(x), -1, "int")
        predicted_km[~x_na] = kmeans.predict(x[~x_na])

        # map each k-means centroid to its merged cluster-hierarchy group
        groups = np.asarray(self._cluster_group[group])
        predicted_group = np.full(len(x), -1, "int")
        predicted_group[~x_na] = groups[predicted_km[~x_na]]

        # NOTE(review): a large block of outlier-detection code (density
        # ratio tests per merged group) was disabled here "because it is
        # really slow"; retrieve from version control if needed.

        # turn numeric group ids into categorical labels "<name>_<k>";
        # unassigned events get "<name>_None"
        predicted_str = pd.Series(["(none)"] * len(predicted_group))
        for c in range(len(self._cluster_group[group])):
            predicted_str[predicted_group == c] = "{0}_{1}".format(self.name, c + 1)
        predicted_str[predicted_group == -1] = "{0}_None".format(self.name)
        predicted_str.index = group_idx

        # NOTE(review): iloc with groupby label indices is positionally
        # correct only when experiment.data has the default RangeIndex --
        # presumably guaranteed by Experiment; confirm.
        event_assignments.iloc[group_idx] = predicted_str

    new_experiment = experiment.clone()
    new_experiment.add_condition(self.name, "category", event_assignments)
    # new_experiment.statistics[(self.name, "centers")] = pd.to_numeric(centers_stat)
    new_experiment.history.append(self.clone_traits(transient=lambda _: True))

    return new_experiment
def _validate_condition_name(self, x):
    """
    Trait validator: coerce a proposed condition name into a valid
    Python identifier via :func:`util.sanitize_identifier`.
    """
    sanitized = util.sanitize_identifier(x)
    return sanitized