def check_tube(filename, experiment, ignore_v = False):
    """
    Check that an FCS file is compatible with an existing experiment.

    Only the file's metadata is parsed.  The tube must expose exactly the
    same set of channel names as ``experiment.channels``, and for every
    channel whose experiment metadata contains a ``"voltage"`` entry the
    tube's ``$PnV`` value must be equal to it.

    Parameters
    ----------
    filename : str
        The FCS file to check.

    experiment : Experiment
        The experiment to compare against.  Its ``metadata`` must contain
        ``"name_metadata"`` and one entry per channel.

    ignore_v : bool (default = False)
        If true, a voltage that differs from the experiment's is not an error.

    Raises
    ------
    CytoflowOpError
        If the FCS parser raises while reading the metadata.

    CytoflowError
        If the channel names differ, if a channel has no ``$PnV`` entry, or
        if a voltage differs and ``ignore_v`` is false.
    """
    try:
        tube_meta = fcsparser.parse(
            filename,
            channel_naming = experiment.metadata["name_metadata"],
            meta_data_only = True,
            reformat_meta = True)
    except Exception as e:
        raise util.CytoflowOpError("FCS reader threw an error reading metadata "
                                   "for tube {0}: {1}"
                                   .format(filename, str(e))) from e

    # first make sure the tube has the right channels
    if set(tube_meta["_channel_names_"]) != set(experiment.channels):
        raise util.CytoflowError("Tube {0} doesn't have the same channels "
                                 "as the first tube added".format(filename))

    # index the per-channel table by channel name so rows can be looked up
    # by label below
    tube_channels = tube_meta["_channels_"]
    tube_channels.set_index(experiment.metadata["name_metadata"],
                            inplace = True)

    # next check the per-channel parameters
    for channel in experiment.channels:
        # only channels with a recorded voltage are checked
        if "voltage" not in experiment.metadata[channel]:
            continue

        # label-based lookup; `.ix` no longer exists in pandas
        channel_row = tube_channels.loc[channel]
        if "$PnV" not in channel_row:
            raise util.CytoflowError("Didn't find a voltage for channel {0} "
                                     "in tube {1}".format(channel, filename))

        old_v = experiment.metadata[channel]["voltage"]
        new_v = channel_row['$PnV']

        if old_v != new_v and not ignore_v:
            raise util.CytoflowError("Tube {0} doesn't have the same voltages"
                                     .format(filename))
def add_channel(self, name, data=None):
    """
    Add a new column of per-event data (as opposed to metadata) to this
    :class:`Experiment`: something measured per cell, or derived from
    per-cell measurements.

    .. note:: :meth:`add_channel` operates *in place*.

    Parameters
    ----------
    name : String
        The name of the new column to be added to :attr:`data`.

    data : pandas.Series
        The :class:`pandas.Series` to add to :attr:`data`.  Must be the same
        length as :attr:`data`, and must be convertible to a dtype of
        ``float64``.  If ``None``, an empty column is added ... but the
        :class:`Experiment` must be empty to do so!

    Raises
    ------
    :exc:`.CytoflowError`
        If a column called ``name`` already exists, if ``data`` is ``None``
        while the experiment has events, if ``data`` has a different length
        than :attr:`data`, or if it can't be converted to ``float64``.

    Examples
    --------
    >>> ex.add_channel("FSC_over_2", ex.data["FSC-A"] / 2.0)
    """
    if name in self.data:
        raise util.CytoflowError(
            "Already a column named {0} in self.data".format(name))

    n_events = len(self)
    if data is None:
        # an empty column only makes sense on an empty experiment
        if n_events > 0:
            raise util.CytoflowError(
                "If data is None, self.data must be empty!")
    elif n_events != len(data):
        raise util.CytoflowError(
            "data must be the same length as self.data")

    try:
        if data is None:
            column = pd.Series(dtype="float64")
        else:
            column = data.astype("float64", copy=True)
        self.data[name] = column
    except (ValueError, TypeError) as exc:
        raise util.CytoflowError(
            "Had trouble converting data to type \"float64\"") from exc

    # record that this column is a data channel
    self.metadata[name] = {}
    self.metadata[name]['type'] = "channel"
def subset(self, name, value):
    """
    Return a clone of this experiment holding only the events where one
    condition equals a particular value.

    The condition name is "sanitized" first with
    ``util.sanitize_identifier``: characters that are not valid in a Python
    identifier are replaced with an underscore ``_``, so the column name
    `a column` becomes `a_column`.

    Parameters
    ----------
    name : Str
        A condition; ie, a key in `self.conditions`.

    value : Any
        The value to look for.  Compared with equality, ie `==`.

    Returns
    -------
    A clone of this experiment whose ``data`` contains only the matching
    rows, re-indexed from 0.

    Raises
    ------
    CytoflowError
        If the sanitized name is not in `self.conditions`.
    """
    column = util.sanitize_identifier(name)

    if column not in self.conditions:
        raise util.CytoflowError("Can't find condition '{}'"
                                 .format(name))

    result = self.clone()
    matches = self.data[column] == value
    result.data = self.data[matches]
    # drop the original row labels so the subset is indexed 0..n-1
    result.data.reset_index(drop = True, inplace = True)
    return result
def parse_tube(filename, experiment=None, data_set=0, metadata_only=False):
    """
    Parse one FCS file, optionally validating it against an experiment first.

    Parameters
    ----------
    filename : str
        The FCS file to read.

    experiment : Experiment (default = None)
        If given (and truthy), the tube is first validated with
        ``check_tube`` and channels are named using
        ``experiment.metadata["name_metadata"]``.  Otherwise channels are
        named from ``$PnS``.

    data_set : int (default = 0)
        Which data set in the FCS file to read.  It is used for both the
        validation and the parse.

    metadata_only : bool (default = False)
        If true, only the metadata is read and the returned data is ``None``.

    Returns
    -------
    (tube_meta, tube_data)
        The metadata returned by the parser with its ``'__header__'`` entry
        removed, and the event data (``None`` if ``metadata_only``).

    Raises
    ------
    CytoflowError
        If the FCS parser raises; the original exception is chained.
    """
    # NOTE(review): this is a truthiness test, not `is not None`.  If the
    # experiment type defines __len__, an experiment with zero events is
    # falsy here and takes the '$PnS' branch -- confirm that is intended.
    if experiment:
        # validate the same data set that is about to be read
        check_tube(filename, experiment, data_set=data_set)
        name_metadata = experiment.metadata["name_metadata"]
    else:
        name_metadata = '$PnS'

    try:
        # silence parser warnings, as in both original branches
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            parsed = fcsparser.parse(filename,
                                     meta_data_only=bool(metadata_only),
                                     data_set=data_set,
                                     channel_naming=name_metadata)

        # the parser returns only the metadata when asked for metadata only,
        # otherwise a (metadata, data) pair
        if metadata_only:
            tube_meta, tube_data = parsed, None
        else:
            tube_meta, tube_data = parsed
    except Exception as e:
        raise util.CytoflowError(
            "FCS reader threw an error reading data for tube {}".format(
                filename)) from e

    del tube_meta['__header__']

    return tube_meta, tube_data
def query(self, expr, **kwargs):
    """
    Expose pandas.DataFrame.query() to the outside world

    This method "sanitizes" column names first, replacing characters that
    are not valid in a Python identifier with an underscore '_'. So, the
    column name `a column` becomes `a_column`, and can be queried with
    an `a_column == True` or such.

    Parameters
    ----------
    expr : string
        The expression to pass to `pandas.DataFrame.query()`.  Must be
        a valid Python expression, something you could pass to `eval()`.

    **kwargs : dict
        Other named parameters to pass to `pandas.DataFrame.query()`.

    Returns
    -------
    The result of `self.data.query()`.

    Raises
    ------
    CytoflowError
        If two column names sanitize to the same identifier.
    """
    resolvers = {}
    # DataFrame.items() yields (column name, column); the old iteritems()
    # spelling is gone from current pandas
    for name, col in self.data.items():
        new_name = util.sanitize_identifier(name)
        if new_name in resolvers:
            # the placeholders must match the two arguments; the previous
            # {1}/{2} pair raised IndexError instead of this error
            raise util.CytoflowError(
                "Tried to sanitize column name {0} to "
                "{1} but it already existed in the "
                "DataFrame.".format(name, new_name))
        resolvers[new_name] = col

    return self.data.query(expr, resolvers=({}, resolvers), **kwargs)
def subset(self, conditions, values):
    """
    Returns a subset of this experiment including only the events where
    each condition in ``conditions`` equals the corresponding value in
    ``values``.

    Parameters
    ----------
    conditions : Str or Tuple(Str)
        A condition or list of conditions

    values : Any or Tuple(Any)
        The value(s) of the condition(s).  When ``conditions`` is a single
        string, ``values`` is taken as the single value to match.

    Returns
    -------
    Experiment
        A new :class:`Experiment` containing only the events specified in
        ``conditions`` and ``values``, re-indexed from 0.

    Raises
    ------
    :exc:`.CytoflowError`
        If no condition is given, if ``conditions`` and ``values`` differ in
        length, if a name is not a condition, or if a value is not one of
        that condition's values.
    """
    # normalize both call styles to parallel lists so validation is written
    # once
    if isinstance(conditions, str):
        names = [conditions]
        wanted = [values]
    else:
        names = list(conditions)
        wanted = list(values)

    if not names:
        raise util.CytoflowError("Must specify at least one condition")

    if len(names) != len(wanted):
        raise util.CytoflowError(
            "conditions and values must be the same length")

    for c, v in zip(names, wanted):
        if c not in self.conditions:
            raise util.CytoflowError("{} is not a condition".format(c))
        if v not in list(self.conditions[c]):
            raise util.CytoflowError(
                "{} is not a value of condition {}".format(v, c))

    if len(names) == 1:
        # group by the bare column name so the group key is a scalar
        grouped = self.data.groupby(names[0])
        key = wanted[0]
    else:
        # pandas treats a tuple `by` as ONE key, so pass a list; the group
        # key for several columns must be a tuple
        grouped = self.data.groupby(names)
        key = tuple(wanted)

    ret = self.clone()
    ret.data = grouped.get_group(key).reset_index(drop=True)
    return ret
def query(self, expr, **kwargs):
    """
    Return an experiment whose data is a subset of this one where ``expr``
    evaluates to ``True``.

    This method "sanitizes" column names first, replacing characters that
    are not valid in a Python identifier with an underscore ``_``. So, the
    column name ``a column`` becomes ``a_column``, and can be queried with
    an ``a_column == True`` or such.

    Parameters
    ----------
    expr : string
        The expression to pass to :meth:`pandas.DataFrame.query`.  Must be
        a valid Python expression, something you could pass to :func:`eval`.

    **kwargs : dict
        Other named parameters to pass to :meth:`pandas.DataFrame.query`.

    Returns
    -------
    Experiment
        A new :class:`Experiment`, a clone of this one with the data
        returned by :meth:`pandas.DataFrame.query()`, re-indexed from 0.

    Raises
    ------
    :exc:`.CytoflowError`
        If two column names sanitize to the same identifier, or if no
        events match ``expr``.
    """
    resolvers = {}
    # DataFrame.items() yields (column name, column); the old iteritems()
    # spelling is gone from current pandas
    for name, col in self.data.items():
        new_name = util.sanitize_identifier(name)
        if new_name in resolvers:
            # the placeholders must match the two arguments; the previous
            # {1}/{2} pair raised IndexError instead of this error
            raise util.CytoflowError(
                "Tried to sanitize column name {0} to "
                "{1} but it already existed in the "
                "DataFrame.".format(name, new_name))
        resolvers[new_name] = col

    ret = self.clone()
    ret.data = self.data.query(expr, resolvers=({}, resolvers), **kwargs)
    ret.data.reset_index(drop=True, inplace=True)

    if len(ret.data) == 0:
        raise util.CytoflowError("No events matched {}".format(expr))

    return ret
def check_tube(filename, experiment, data_set=0):
    """
    Check that an FCS file is compatible with an existing experiment.

    Only the file's metadata is parsed.  Every channel of the experiment
    (identified by the ``"fcs_name"`` in its metadata) must be present in
    the tube; the tube may have additional channels.  For every channel
    whose experiment metadata contains a ``"voltage"`` entry, the tube's
    ``$PnV`` value must equal it unless the channel is listed in
    ``experiment.metadata['ignore_v']``.

    Parameters
    ----------
    filename : str
        The FCS file to check.

    experiment : Experiment
        The experiment to compare against.

    data_set : int (default = 0)
        Which data set in the FCS file to read the metadata from.

    Raises
    ------
    CytoflowError
        If ``experiment`` is ``None``, if the FCS parser raises (the
        original exception is chained), if a channel is missing from the
        tube, if a channel has no ``$PnV`` entry, or if a voltage differs
        for a channel that is not in ``ignore_v``.
    """
    if experiment is None:
        raise util.CytoflowError("No experiment specified")

    ignore_v = experiment.metadata['ignore_v']

    try:
        tube_meta = fcsparser.parse(
            filename,
            channel_naming=experiment.metadata["name_metadata"],
            data_set=data_set,
            meta_data_only=True,
            reformat_meta=True)
    except Exception as e:
        raise util.CytoflowError("FCS reader threw an error reading metadata "
                                 "for tube {0}".format(filename)) from e

    # first make sure the tube has the right channels: the experiment's
    # channels must be a subset of the tube's
    expected = {experiment.metadata[c]["fcs_name"]
                for c in experiment.channels}
    if not expected <= set(tube_meta["_channel_names_"]):
        raise util.CytoflowError(
            "Tube {0} doesn't have the same channels".format(filename))

    # index the per-channel table by channel name so rows can be looked up
    # by label below
    tube_channels = tube_meta["_channels_"]
    tube_channels.set_index(experiment.metadata["name_metadata"],
                            inplace=True)

    # next check the per-channel parameters
    for channel in experiment.channels:
        # only channels with a recorded voltage are checked
        if "voltage" not in experiment.metadata[channel]:
            continue

        fcs_name = experiment.metadata[channel]["fcs_name"]
        channel_row = tube_channels.loc[fcs_name]

        if "$PnV" not in channel_row:
            raise util.CytoflowError("Didn't find a voltage for channel {0} "
                                     "in tube {1}".format(channel, filename))

        old_v = experiment.metadata[channel]["voltage"]
        new_v = channel_row['$PnV']

        if old_v != new_v and channel not in ignore_v:
            raise util.CytoflowError(
                "Tube {0} doesn't have the same voltages for channel {1}"
                .format(filename, channel))
def include_condition(self, condition):
    """
    Decide whether ``condition`` passes the ``self.when`` filter.

    Parameters
    ----------
    condition : Str
        The name of a condition; looked up in ``self.metadata``.

    Returns
    -------
    ``True`` if ``self.when`` is empty.  ``False`` if ``condition`` has no
    entry in ``self.metadata``.  Otherwise, the result of evaluating
    ``self.when`` as a Python expression with that condition's metadata as
    the local namespace.

    Raises
    ------
    CytoflowError
        If evaluating ``self.when`` raises; the original exception is
        chained.
    """
    if not self.when:
        return True

    if condition not in self.metadata:
        return False

    try:
        # NOTE(review): eval() runs arbitrary code.  This is only safe if
        # `when` is always developer-supplied and never user input --
        # confirm against the callers.
        return eval(self.when, globals(), self.metadata[condition])
    except Exception as exc:
        # `except Exception` rather than a bare `except` so that
        # KeyboardInterrupt / SystemExit are not converted into this error
        raise util.CytoflowError("Bad when statement: {}"
                                 .format(self.when)) from exc
def parse_tube(filename, experiment):
    """
    Validate an FCS file against ``experiment``, then parse it.

    The tube is first checked with ``check_tube``.  It is then parsed with
    channels named according to ``experiment.metadata["name_metadata"]``.

    Returns
    -------
    (tube_meta, tube_data)
        The metadata and event data returned by the FCS parser.

    Raises
    ------
    CytoflowError
        If parsing raises; the original exception is chained.
    """
    check_tube(filename, experiment)

    try:
        naming = experiment.metadata["name_metadata"]
        meta, events = fcsparser.parse(filename, channel_naming=naming)
    except Exception as err:
        message = "FCS reader threw an error reading data for tube {}".format(
            filename)
        raise util.CytoflowError(message) from err
    else:
        return meta, events
def _on_conditions_change(self, obj, name, old, new):
    """
    Bring ``self.value`` (the list of subset models) in line with
    ``self.conditions``.

    Only conditions accepted by ``self.include_condition`` are considered.
    Subsets whose condition is gone are removed (skipped while the context
    status is ``"loading"``), a subset is created for every new condition,
    and subsets that are already tracked get their values refreshed.
    Finally ``self.value`` is replaced by a copy sorted by subset name.

    The ``obj``, ``name``, ``old`` and ``new`` arguments are not used.

    Raises
    ------
    CytoflowError
        If a new condition's values have a dtype kind that is not handled.
    """
    tracked = {s.name for s in self.value}
    wanted = {c for c in list(self.conditions.keys())
              if self.include_condition(c)}

    def find(subset_name):
        # first tracked subset with this name
        return next(s for s in self.value if s.name == subset_name)

    is_loading = self.ui.context["context"].status == "loading"

    if not is_loading:
        # remove subsets that aren't in conditions
        for stale in tracked - wanted:
            self.value.remove(find(stale))

    # add subsets that are new conditions
    for fresh in wanted - tracked:
        values = self.conditions[fresh].sort_values()
        dtype = pd.Series(list(values)).dtype
        kind = dtype.kind

        if kind == 'b':
            created = BoolSubset(name=fresh)
        elif kind in "ifu":
            created = RangeSubset(name=fresh, values=list(values))
        elif kind in "OSU":
            created = CategorySubset(name=fresh, values=sorted(values))
        else:
            raise util.CytoflowError(
                "Unknown dtype {} in ViewController".format(dtype))

        self.value.append(created)

    # update values for subsets we're already tracking
    for shared in wanted & tracked:
        current = find(shared)
        if set(current.values) != set(self.conditions[shared]):
            current.values = list(self.conditions[shared].sort_values())

    self.value = sorted(self.value, key=lambda s: s.name)
def _on_conditions_change(self, obj, name, old, new):
    """
    Bring ``self.condition_models`` in line with ``self.conditions``.

    Models whose condition is gone, or is rejected by
    ``self.include_condition``, are removed.  The remaining models get
    their ``values`` replaced only when the set of values actually changed.
    A new model is then appended for every included condition that has
    none.

    The ``obj``, ``name``, ``old`` and ``new`` arguments are not used.

    Raises
    ------
    CytoflowError
        If a new condition's values have a dtype kind that is not handled.
    """
    # to prevent unnecessary updates, be careful about how these are
    # updated

    # first, check current models against the new conditions.  remove any
    # that are no longer present, and update the values for the rest.
    # iterate over a copy because models are removed inside the loop
    for model in list(self.condition_models):
        if (model.name not in self.conditions
                or not self.include_condition(model.name)):
            self.condition_models.remove(model)
            continue

        if set(model.values) != set(self.conditions[model.name]):
            model.values = list(self.conditions[model.name])

    # then, see if there are any new conditions to add.  items() works for
    # a dict as well as a pandas object; iteritems() does neither on
    # Python 3 with current pandas
    for cond_name, values in self.conditions.items():
        if any(m.name == cond_name for m in self.condition_models):
            continue

        if not self.include_condition(cond_name):
            continue

        dtype = pd.Series(list(values)).dtype
        if dtype.kind == 'b':
            model = BoolCondition(name = cond_name)
        elif dtype.kind in "ifu":
            model = RangeCondition(name = cond_name,
                                   values = list(values))
        elif dtype.kind in "OSU":
            model = CategoryCondition(name = cond_name,
                                      values = list(values))
        else:
            raise util.CytoflowError("Unknown dtype {} in SubsetEditor"
                                     .format(dtype))

        self.condition_models.append(model)
def apply(self, experiment):
    """
    Assigns new metadata to events using the mixture model estimated
    in `estimate`.

    Events are processed one group at a time (grouped by ``self.by``, or
    as a single group if ``self.by`` is empty).  For each group the two
    channels are scaled with ``self._xscale`` / ``self._yscale`` and every
    event without a missing value is assigned to a mixture component with
    the group's model from ``self._gmms``.  If ``self.sigma > 0``, events
    that fall outside their component's ellipse are un-assigned again.

    Parameters
    ----------
    experiment : Experiment
        The experiment to gate.  It is not modified; a clone is returned.

    Returns
    -------
    A clone of ``experiment`` with a new condition named ``self.name``:

    - if ``num_components == 1``, a ``bool`` condition that is true for
      events assigned to the component;
    - otherwise a ``category`` condition whose values are
      ``<name>_<i>`` (``i`` counted from 1) or ``<name>_None``.

    If ``self.posteriors`` is set, a ``float`` condition
    ``<name>_Posterior`` is added too, holding the probability of the
    assigned component (0.0 for un-assigned events).  A copy of this
    operation's traits is appended to the clone's ``history``.

    Raises
    ------
    CytoflowOpError
        If the experiment, name, models or scales are missing, if a needed
        column is missing or a new column already exists, if a ``by``
        entry is unknown or has more than 100 unique values, or if
        ``sigma`` is negative.

    CytoflowError
        If ``num_components == 1`` and ``sigma == 0.0``.
    """

    # NOTE(review): this is a truthiness test; if the experiment type
    # defines __len__, an experiment with no events is rejected here too.
    if not experiment:
        raise util.CytoflowOpError("No experiment specified")

    # make sure name got set!
    if not self.name:
        raise util.CytoflowOpError("You have to set the gate's name "
                                   "before applying it!")

    if self.name in experiment.data.columns:
        raise util.CytoflowOpError("Experiment already has a column named {0}"
                                   .format(self.name))

    if not self._gmms:
        raise util.CytoflowOpError("No components found. Did you forget to "
                                   "call estimate()?")

    if not self._xscale:
        raise util.CytoflowOpError("Couldn't find _xscale. What happened??")

    if not self._yscale:
        raise util.CytoflowOpError("Couldn't find _yscale. What happened??")

    if self.xchannel not in experiment.data:
        raise util.CytoflowOpError("Column {0} not found in the experiment"
                                   .format(self.xchannel))

    if self.ychannel not in experiment.data:
        raise util.CytoflowOpError("Column {0} not found in the experiment"
                                   .format(self.ychannel))

    # this check is unconditional; the `if self.posteriors` block below
    # repeats it
    if (self.name + "_Posterior") in experiment.data:
        raise util.CytoflowOpError("Column {0} already found in the experiment"
                                   .format(self.name + "_Posterior"))

    # with one component and no sigma gate every event would be assigned,
    # so the resulting condition would carry no information
    if self.num_components == 1 and self.sigma == 0.0:
        raise util.CytoflowError("If num_components == 1, sigma must be > 0")

    if self.posteriors:
        col_name = "{0}_Posterior".format(self.name)
        if col_name in experiment.data:
            raise util.CytoflowOpError("Column {0} already found in the experiment"
                                       .format(col_name))

    for b in self.by:
        if b not in experiment.data:
            raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                       " in the experiment"
                                       .format(b))
        if len(experiment.data[b].unique()) > 100: #WARNING - magic number
            raise util.CytoflowOpError("More than 100 unique values found for"
                                       " aggregation metadata {0}. Did you"
                                       " accidentally specify a data channel?"
                                       .format(b))

    if self.sigma < 0.0:
        raise util.CytoflowOpError("sigma must be >= 0.0")

    # per-event results, filled in group by group below
    event_assignments = pd.Series([None] * len(experiment), dtype = "object")

    if self.posteriors:
        event_posteriors = pd.Series([0.0] * len(experiment))

    # what we DON'T want to do is iterate through event-by-event.
    # the more of this we can push into numpy, sklearn and pandas,
    # the faster it's going to be. for example, this is why
    # we don't use Ellipse.contains().

    if self.by:
        groupby = experiment.data.groupby(self.by)
    else:
        # use a lambda expression to return a group that
        # contains all the events
        groupby = experiment.data.groupby(lambda x: True)

    for group, data_subset in groupby:
        gmm = self._gmms[group]
        x = data_subset.loc[:, [self.xchannel, self.ychannel]]
        x[self.xchannel] = self._xscale(x[self.xchannel])
        x[self.ychannel] = self._yscale(x[self.ychannel])

        # which values are missing?
        x_na = np.isnan(x[self.xchannel]) | np.isnan(x[self.ychannel])
        x_na = x_na.values

        # from here on x is a plain 2-column array: column 0 is the scaled
        # xchannel, column 1 the scaled ychannel
        x = x.values
        group_idx = groupby.groups[group]

        # make a preliminary assignment; -1 means "no component" and stays
        # in place for events with a missing value
        predicted = np.full(len(x), -1, "int")
        predicted[~x_na] = gmm.predict(x[~x_na])

        # if we're doing sigma-based gating, for each component check
        # to see if the event is in the sigma gate.
        if self.sigma > 0.0:

            # make a quick dataframe with the value and the predicted
            # component
            gate_df = pd.DataFrame({"x" : x[:, 0],
                                    "y" : x[:, 1],
                                    "p" : predicted})

            # for each component, get the ellipse that follows the isoline
            # around the mixture component
            # cf. http://scikit-learn.org/stable/auto_examples/mixture/plot_gmm.html
            # and http://www.mathworks.com/matlabcentral/newsreader/view_thread/298389
            # and http://stackoverflow.com/questions/7946187/point-and-ellipse-rotated-position-test-algorithm
            # i am not proud of how many tries this took me to get right.

            for c in range(0, self.num_components):
                mean = gmm.means_[c]
                # NOTE(review): _get_covars() is a private method of the
                # model object -- confirm it exists in the installed
                # library version.
                covar = gmm._get_covars()[c]

                # xc is the center on the x axis
                # yc is the center on the y axis
                # (these names look unused but are referenced as @xc / @yc
                # inside the eval() expression below)
                xc = mean[0]  # @UnusedVariable
                yc = mean[1]  # @UnusedVariable

                v, w = linalg.eigh(covar)
                u = w[0] / linalg.norm(w[0])

                # xl is the length along the x axis
                # yl is the length along the y axis
                xl = np.sqrt(v[0]) * self.sigma  # @UnusedVariable
                yl = np.sqrt(v[1]) * self.sigma  # @UnusedVariable

                # t is the rotation in radians (counter-clockwise)
                t = 2 * np.pi - np.arctan(u[1] / u[0])

                sin_t = np.sin(t)  # @UnusedVariable
                cos_t = np.cos(t)  # @UnusedVariable

                # and build an expression with numexpr so it evaluates fast!

                gate_bool = gate_df.eval("p == @c and "
                                         "((x - @xc) * @cos_t - (y - @yc) * @sin_t) ** 2 / ((@xl / 2) ** 2) + "
                                         "((x - @xc) * @sin_t + (y - @yc) * @cos_t) ** 2 / ((@yl / 2) ** 2) <= 1").values

                # un-assign events of this component that are outside the
                # ellipse
                predicted[np.logical_and(predicted == c, gate_bool == False)] = -1

        # turn component numbers into the category labels
        predicted_str = pd.Series(["(none)"] * len(predicted))
        for c in range(0, self.num_components):
            predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
        predicted_str[predicted == -1] = "{0}_None".format(self.name)
        predicted_str.index = group_idx

        # NOTE(review): group_idx holds index labels from groupby.groups but
        # is used positionally with .iloc -- presumably experiment.data has
        # a default 0..n-1 index; confirm.
        event_assignments.iloc[group_idx] = predicted_str

        if self.posteriors:
            probability = np.full((len(x), self.num_components), 0.0, "float")
            probability[~x_na, :] = gmm.predict_proba(x[~x_na, :])
            posteriors = pd.Series([0.0] * len(predicted))
            # keep only the probability of the component each event ended
            # up assigned to
            for c in range(0, self.num_components):
                posteriors[predicted == c] = probability[predicted == c, c]
            posteriors.index = group_idx
            event_posteriors.iloc[group_idx] = posteriors

    new_experiment = experiment.clone()

    if self.num_components == 1:
        new_experiment.add_condition(self.name, "bool", event_assignments == "{0}_1".format(self.name))
    else:
        new_experiment.add_condition(self.name, "category", event_assignments)

    if self.posteriors:
        col_name = "{0}_Posterior".format(self.name)
        new_experiment.add_condition(col_name, "float", event_posteriors)

    new_experiment.history.append(self.clone_traits())
    return new_experiment
def add_condition(self, name, dtype, data = None):
    """Add a new column of per-event metadata to this `Experiment`.  Operates
    *in place*.

    There are two places to call `add_condition`.

    - As you're setting up a new `Experiment`, call `add_condition()`
      with `data` set to `None` to specify the conditions the new events
      will have.
    - If you compute some new per-event metadata on an existing
      `Experiment`, call `add_condition()` to add it.

    Parameters
    ----------
    name : String
        The name of the new column in `self.data`.

    dtype : String
        The type of the new column in `self.data`.  Must be a string that
        `pandas.Series` recognizes as a `dtype`: common types are
        "category", "float", "int", and "bool".  It is also recorded as
        `self.metadata[name]['type']`.

    data : pandas.Series (default = None)
        The `pandas.Series` to add to `self.data`.  Must be the same
        length as `self.data`, and it must be convertible to a
        `pandas.Series` of type `dtype`.  If `None`, will add an empty
        column to the `Experiment` ... but the `Experiment` must be empty
        to do so!

    Raises
    ------
    CytoflowError
        If a column called `name` already exists, if `data` is `None`
        while the experiment has events, if `data` isn't the same length
        as `self.data`, or if it isn't convertible to type `dtype` (the
        original exception is chained).

    Examples
    --------
    >>> import cytoflow as flow
    >>> ex = flow.Experiment()
    >>> ex.add_condition("Time", "float")
    >>> ex.add_condition("Strain", "category")
    """
    if name in self.data:
        raise util.CytoflowError("Already a column named {0} in self.data"
                                 .format(name))

    if data is None and len(self) > 0:
        raise util.CytoflowError("If data is None, self.data must be empty!")

    if data is not None and len(self) != len(data):
        raise util.CytoflowError("data must be the same length as self.data")

    # only the conversion can fail with these exceptions, so keep the try
    # body down to it
    try:
        if data is not None:
            self.data[name] = data.astype(dtype, copy = True)
        else:
            self.data[name] = pd.Series(dtype = dtype)
    except (ValueError, TypeError) as exc:
        raise util.CytoflowError("Had trouble converting data to type {0}"
                                 .format(dtype)) from exc

    self.metadata[name] = {}
    self.metadata[name]['type'] = dtype
def add_events(self, data, conditions):
    """
    Add new events to this :class:`Experiment`.

    Each new event in ``data`` is appended to :attr:`data`, and its
    per-event metadata columns will be set with the values specified in
    ``conditions``.  Thus, it is particularly useful for adding tubes of
    data to new experiments, before additional per-event metadata is added
    by gates, etc.

    .. note:: *Every* column in :attr:`data` must be accounted for.  Each
        column of type ``channel`` must appear in ``data``; each column of
        metadata must have a key:value pair in ``conditions``.

    Parameters
    ----------
    data : pandas.DataFrame
        A single tube or well's worth of data.  Must be a DataFrame with
        the same columns as :attr:`channels`.  It is converted to
        ``float64``; the caller's frame is not modified.

    conditions : Dict(Str, Any)
        A dictionary of the tube's metadata.  The keys must match
        :attr:`conditions`, and the values must be coercible to the
        relevant ``numpy`` dtype.

    Raises
    ------
    :exc:`.CytoflowError`
        :meth:`add_events` raises if:

        - the experiment already has events and the columns in ``data``
          differ from the experiment's channels.
        - there are keys in ``conditions`` that aren't conditions in the
          experiment, or vice versa.

    Examples
    --------
    >>> import cytoflow as flow
    >>> import fcsparser
    >>> ex = flow.Experiment()
    >>> ex.add_condition("Time", "float")
    >>> ex.add_condition("Strain", "category")
    >>> _, tube1 = fcsparser.parse('CFP_Well_A4.fcs')
    >>> _, tube2 = fcsparser.parse('RFP_Well_A3.fcs')
    >>> ex.add_events(tube1, {"Time" : 1, "Strain" : "BL21"})
    >>> ex.add_events(tube2, {"Time" : 1, "Strain" : "Top10G"})
    """
    # make sure the new tube's channels match the rest of the
    # channels in the Experiment
    if len(self) > 0 and set(data.columns) != set(self.channels):
        raise util.CytoflowError("New events don't have the same channels")

    # check that the conditions for this tube exist in the experiment
    # already, and that none is missing
    if (any(k not in self.conditions for k in conditions)
            or any(k not in conditions for k in self.conditions)):
        raise util.CytoflowError(
            "Metadata for this tube should be {}".format(
                list(self.conditions.keys())))

    # add the conditions to tube's internal data frame.  specify the
    # conditions dtype using self.conditions.

    # take this chance to up-convert the float32s to float64.
    # this happened automatically in DataFrame.append(), below, but
    # only in certain cases.... :-/

    # TODO - the FCS standard says you can specify the precision.
    # check with int/float/double files!

    # astype() returns a new frame, so the caller's data is left alone
    new_data = data.astype("float64")

    for meta_name, meta_value in conditions.items():
        meta_type = self.conditions[meta_name].dtype
        is_categorical = isinstance(meta_type, CategoricalDtype)

        if is_categorical:
            # the new column starts out with just this tube's value as its
            # only category
            meta_type = CategoricalDtype([meta_value])

        new_data[meta_name] = pd.Series(data = [meta_value] * len(new_data),
                                        index = new_data.index,
                                        dtype = meta_type)

        # if we're categorical, merge the categories: both frames must
        # carry identical categories for the column to stay categorical
        # when they are concatenated
        if is_categorical and meta_name in self.data:
            # keep the existing categories in order and put new ones after
            # them, so the result does not depend on set iteration order
            cats = list(self.data[meta_name].cat.categories)
            seen = set(cats)
            for cat in new_data[meta_name].cat.categories:
                if cat not in seen:
                    cats.append(cat)
                    seen.add(cat)

            self.data[meta_name] = \
                self.data[meta_name].cat.set_categories(cats)
            new_data[meta_name] = \
                new_data[meta_name].cat.set_categories(cats)

    # DataFrame.append() was removed in pandas 2.0; concat is the
    # replacement
    self.data = pd.concat([self.data, new_data], ignore_index=True)
def apply(self, experiment):
    """
    Assigns new metadata to events using the mixture model estimated
    in `estimate`.

    Events are processed one group at a time (grouped by ``self.by``, or
    as a single group if ``self.by`` is empty).  For each group the channel
    is scaled with ``self._scale`` and every event without a missing value
    is assigned to a mixture component with the group's model from
    ``self._gmms``.  If ``self.sigma > 0``, events further than ``sigma``
    standard deviations from their component's mean are un-assigned again.

    Parameters
    ----------
    experiment : Experiment
        The experiment to gate.  It is not modified; a clone is returned.

    Returns
    -------
    A clone of ``experiment`` with a new condition named ``self.name``:

    - if ``num_components == 1``, a ``bool`` condition that is true for
      events assigned to the component;
    - otherwise a ``category`` condition whose values are
      ``<name>_<i>`` (``i`` counted from 1) or ``<name>_None``.

    If ``self.posteriors`` is set, a ``float`` condition
    ``<name>_Posterior`` is added too, holding the probability of the
    assigned component (0.0 for un-assigned events).  A copy of this
    operation's traits is appended to the clone's ``history``.

    Raises
    ------
    CytoflowOpError
        If the experiment, name, models or scale are missing, if the
        channel is missing or a new column already exists, if a ``by``
        entry is unknown or has more than 100 unique values, or if
        ``sigma`` is negative.

    CytoflowError
        If ``num_components == 1`` and ``sigma == 0.0``.
    """

    # NOTE(review): this is a truthiness test; if the experiment type
    # defines __len__, an experiment with no events is rejected here too.
    if not experiment:
        raise util.CytoflowOpError("No experiment specified")

    # make sure name got set!
    if not self.name:
        raise util.CytoflowOpError("You have to set the gate's name "
                                   "before applying it!")

    if self.name in experiment.data.columns:
        raise util.CytoflowOpError("Experiment already has a column named {0}"
                                   .format(self.name))

    if not self._gmms:
        raise util.CytoflowOpError("No components found. Did you forget to "
                                   "call estimate()?")

    if not self._scale:
        raise util.CytoflowOpError("Couldn't find _scale. What happened??")

    if self.channel not in experiment.data:
        raise util.CytoflowOpError("Column {0} not found in the experiment"
                                   .format(self.channel))

    # with one component and no sigma gate every event would be assigned,
    # so the resulting condition would carry no information
    if self.num_components == 1 and self.sigma == 0.0:
        raise util.CytoflowError("If num_components == 1, sigma must be > 0")

    # this check is unconditional; the `if self.posteriors` block below
    # repeats it
    if (self.name + "_Posterior") in experiment.data:
        raise util.CytoflowOpError("Column {0} already found in the experiment"
                                   .format(self.name + "_Posterior"))

    if self.posteriors:
        col_name = "{0}_Posterior".format(self.name)
        if col_name in experiment.data:
            raise util.CytoflowOpError("Column {0} already found in the experiment"
                                       .format(col_name))

    for b in self.by:
        if b not in experiment.data:
            raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                       " in the experiment"
                                       .format(b))
        if len(experiment.data[b].unique()) > 100: #WARNING - magic number
            raise util.CytoflowOpError("More than 100 unique values found for"
                                       " aggregation metadata {0}. Did you"
                                       " accidentally specify a data channel?"
                                       .format(b))

    if self.sigma < 0.0:
        raise util.CytoflowOpError("sigma must be >= 0.0")

    # per-event results, filled in group by group below
    event_assignments = pd.Series([None] * len(experiment), dtype = "object")

    if self.posteriors:
        event_posteriors = pd.Series([0.0] * len(experiment))

    # what we DON'T want to do is iterate through event-by-event.
    # the more of this we can push into numpy, sklearn and pandas,
    # the faster it's going to be.

    if self.by:
        groupby = experiment.data.groupby(self.by)
    else:
        # use a lambda expression to return a group that
        # contains all the events
        groupby = experiment.data.groupby(lambda x: True)

    for group, data_subset in groupby:
        gmm = self._gmms[group]
        x = data_subset[self.channel]
        x = self._scale(x)

        # which values are missing?
        x_na = np.isnan(x)

        group_idx = groupby.groups[group]

        # make a preliminary assignment; -1 means "no component" and stays
        # in place for events with a missing value
        predicted = np.full(len(x), -1, "int")
        # NOTE(review): the [mask, np.newaxis] indexing presumably expects
        # the scaled x to behave like a numpy array -- confirm what
        # self._scale returns.
        predicted[~x_na] = gmm.predict(x[~x_na, np.newaxis])

        # if we're doing sigma-based gating, for each component check
        # to see if the event is in the sigma gate.
        if self.sigma > 0.0:

            # make a quick dataframe with the value and the predicted
            # component
            gate_df = pd.DataFrame({"x" : x, "p" : predicted})

            # for each component, get the low and the high threshold
            # (lo / hi look unused but are referenced as @lo / @hi inside
            # the eval() expression below)
            for c in range(0, self.num_components):
                # NOTE(review): covars_ is read as a per-component
                # variance here -- confirm against the model class in use.
                lo = (gmm.means_[c][0]    # @UnusedVariable
                      - self.sigma * np.sqrt(gmm.covars_[c][0]))
                hi = (gmm.means_[c][0]    # @UnusedVariable
                      + self.sigma * np.sqrt(gmm.covars_[c][0]))

                # and build an expression with numexpr so it evaluates fast!
                gate_bool = gate_df.eval("p == @c and x >= @lo and x <= @hi").values

                # un-assign events of this component that are outside
                # [lo, hi]
                predicted[np.logical_and(predicted == c, gate_bool == False)] = -1

        # turn component numbers into the category labels
        predicted_str = pd.Series(["(none)"] * len(predicted))
        for c in range(0, self.num_components):
            predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
        predicted_str[predicted == -1] = "{0}_None".format(self.name)
        predicted_str.index = group_idx

        # NOTE(review): group_idx holds index labels from groupby.groups but
        # is used positionally with .iloc -- presumably experiment.data has
        # a default 0..n-1 index; confirm.
        event_assignments.iloc[group_idx] = predicted_str

        if self.posteriors:
            probability = np.full((len(x), self.num_components), 0.0, "float")
            probability[~x_na, :] = gmm.predict_proba(x[~x_na, np.newaxis])
            posteriors = pd.Series([0.0] * len(predicted))
            # keep only the probability of the component each event ended
            # up assigned to
            for i in range(0, self.num_components):
                posteriors[predicted == i] = probability[predicted == i, i]
            posteriors.index = group_idx
            event_posteriors.iloc[group_idx] = posteriors

    new_experiment = experiment.clone()

    if self.num_components == 1:
        new_experiment.add_condition(self.name, "bool", event_assignments == "{0}_1".format(self.name))
    else:
        new_experiment.add_condition(self.name, "category", event_assignments)

    if self.posteriors:
        col_name = "{0}_Posterior".format(self.name)
        new_experiment.add_condition(col_name, "float", event_posteriors)

    new_experiment.history.append(self.clone_traits())
    return new_experiment
def add_condition(self, name, dtype, data=None):
    """
    Add a new column of per-event metadata to this :class:`Experiment`.

    .. note:: :meth:`add_condition` operates **in place.**

    There are two places to call `add_condition`.

    - As you're setting up a new :class:`Experiment`, call
      :meth:`add_condition` with ``data`` set to ``None`` to specify the
      conditions the new events will have.
    - If you compute some new per-event metadata on an existing
      :class:`Experiment`, call :meth:`add_condition` to add it.

    Parameters
    ----------
    name : String
        The name of the new column in :attr:`data`.  Must be a valid Python
        identifier: must start with ``[A-Za-z_]`` and contain only the
        characters ``[A-Za-z0-9_]``.

    dtype : String
        The type of the new column in :attr:`data`.  Must be a string that
        :class:`pandas.Series` recognizes as a ``dtype``: common types are
        ``category``, ``float``, ``int``, and ``bool``.

    data : pandas.Series (default = None)
        The :class:`pandas.Series` to add to :attr:`data`.  Must be the same
        length as :attr:`data`, and it must be convertible to a
        :class:`pandas.Series` of type ``dtype``.  If ``None``, will add an
        empty column to the :class:`Experiment` ... but the
        :class:`Experiment` must be empty to do so!

    Raises
    ------
    :class:`.CytoflowError`
        If ``name`` is changed by ``util.sanitize_identifier``, if a column
        called ``name`` already exists, if ``data`` is ``None`` while the
        experiment has events, if ``data`` isn't the same length as
        :attr:`data`, or if it isn't convertible to type ``dtype``.

    Examples
    --------
    >>> import cytoflow as flow
    >>> ex = flow.Experiment()
    >>> ex.add_condition("Time", "float")
    >>> ex.add_condition("Strain", "category")
    """
    # a name that sanitizing would alter is not a usable identifier
    if util.sanitize_identifier(name) != name:
        raise util.CytoflowError(
            "Name '{}' is not a valid Python identifier".format(name))

    if name in self.data:
        raise util.CytoflowError(
            "Already a column named {0} in self.data".format(name))

    n_events = len(self)
    if data is None:
        # an empty column only makes sense on an empty experiment
        if n_events > 0:
            raise util.CytoflowError(
                "If data is None, self.data must be empty!")
    elif n_events != len(data):
        raise util.CytoflowError(
            "data must be the same length as self.data")

    try:
        if data is None:
            column = pd.Series(dtype=dtype)
        else:
            column = data.astype(dtype, copy=True)
        self.data[name] = column
    except (ValueError, TypeError) as exc:
        raise util.CytoflowError(
            "Had trouble converting data to type {0}".format(
                dtype)) from exc

    # record that this column is a condition
    self.metadata[name] = {}
    self.metadata[name]['type'] = "condition"
def apply(self, experiment = None, metadata_only = False):
    """
    Load a new :class:`.Experiment`.

    Each tube supplies its events either from an FCS file (``tube.file``)
    or from an in-memory :class:`pandas.DataFrame` (``tube.frame``), but
    never both.  Channel names and channel metadata are taken from the
    first tube.

    Parameters
    ----------
    experiment : Experiment
        Ignored

    metadata_only : bool (default = False)
        Only "import" the metadata, creating an Experiment with all the
        expected metadata and structure but 0 events.

    Returns
    -------
    Experiment
        The new :class:`.Experiment`.  New channels have the following
        metadata:

        - **voltage** - int
            The voltage that this channel was collected at.  Determined by
            the ``$PnV`` field from the first FCS file.

        - **range** - int
            The maximum range of this channel.  Determined by the ``$PnR``
            field from the first FCS file; if the first tube is backed by
            a DataFrame instead of a file, a fixed default of ``1e6`` is
            used.

        New experimental conditions do not have **voltage** or **range**
        metadata, obviously.  Instead, they have **experiment** set to
        ``True``, to distinguish the experimental variables from the
        conditions that were added by gates, etc.

        If :attr:`ignore_v` is set, it is added as a key to the
        :class:`.Experiment`-wide metadata.

    Raises
    ------
    :class:`.CytoflowOpError`
        If no tubes are specified; if a renamed channel is not a valid
        Python identifier; if the tubes don't all have the same
        conditions or two tubes have identical condition values; if the
        FCS reader fails on the first tube's metadata; if a requested
        channel is not in the first tube; or if a tube has neither an FCS
        file nor a non-empty DataFrame.

    :class:`.CytoflowError`
        If a tube specifies both an FCS file and a DataFrame.
    """

    if not self.tubes:
        raise util.CytoflowOpError('tubes',
                                   "Must specify some tubes!")

    # if we have channel renaming, make sure the new names are valid
    # python identifiers
    if self.channels:
        for old_name, new_name in self.channels.items():
            if old_name != new_name and new_name != util.sanitize_identifier(new_name):
                raise util.CytoflowOpError('channels',
                                           "Channel name {} must be a "
                                           "valid Python identifier."
                                           .format(new_name))

    tube0 = self.tubes[0]

    # make sure each tube has the same conditions
    tube0_conditions = set(tube0.conditions)
    for tube in self.tubes:
        tube_conditions = set(tube.conditions)
        if len(tube0_conditions ^ tube_conditions) > 0:
            raise util.CytoflowOpError('tubes',
                                       "Tube {0} didn't have the same "
                                       "conditions as tube {1}"
                                       .format(tube.file, tube0.file))

    # make sure experimental conditions are unique
    for idx, i in enumerate(self.tubes[0:-1]):
        for j in self.tubes[idx+1:]:
            if i.conditions_equal(j):
                raise util.CytoflowOpError('tubes',
                                           "The same conditions specified for "
                                           "tube {0} and tube {1}"
                                           .format(i.file, j.file))

    experiment = Experiment()

    experiment.metadata["ignore_v"] = self.ignore_v

    for condition, dtype in list(self.conditions.items()):
        experiment.add_condition(condition, dtype)
        experiment.metadata[condition]['experiment'] = True

    if tube0.file:
        try:
            # silence warnings about duplicate channels;
            # we'll figure that out below
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                tube0_meta = fcsparser.parse(tube0.file,
                                             data_set = self.data_set,
                                             meta_data_only = True,
                                             reformat_meta = True)
        except Exception as e:
            raise util.CytoflowOpError('tubes',
                                       "FCS reader threw an error reading metadata "
                                       "for tube {}: {}"
                                       .format(tube0.file, str(e))) from e

        meta_channels = tube0_meta["_channels_"]

        if self.name_metadata:
            experiment.metadata["name_metadata"] = self.name_metadata
        else:
            experiment.metadata["name_metadata"] = autodetect_name_metadata(tube0.file,
                                                                            data_set = self.data_set)

        # keep the original row labels in a column, then index the
        # per-channel metadata by channel name
        meta_channels['Index'] = meta_channels.index
        meta_channels.set_index(experiment.metadata["name_metadata"],
                                inplace = True)

        channels = list(self.channels.keys()) if self.channels \
                   else list(meta_channels.index.values)

        # make sure everything in self.channels is in the tube channels
        for channel in channels:
            if channel not in meta_channels.index:
                raise util.CytoflowOpError('channels',
                                           "Channel {0} not in tube {1}"
                                           .format(channel, tube0.file))
    else:
        # the first tube is backed by a DataFrame: its columns are the
        # channels, and there is no FCS metadata to consult
        channels = list(self.channels.keys()) if self.channels else list(tube0.frame)
        meta_channels = DataFrame()
        experiment.metadata["name_metadata"] = None
        tube0_meta = {}

    # now that we have the metadata, load it into experiment
    for channel in channels:
        experiment.add_channel(channel)
        experiment.metadata[channel]["fcs_name"] = channel

        if not tube0.file:
            # no $PnR to read, so fall back to a fixed default range.
            # (this test used to read a loop variable left over from the
            # validation loop above, i.e. the *last* tube)
            experiment.metadata[channel]['range'] = 1e6

        if list(meta_channels):
            # keep track of the channel's PMT voltage
            if "$PnV" in meta_channels.loc[channel]:
                v = meta_channels.loc[channel]['$PnV']
                if v:
                    experiment.metadata[channel]["voltage"] = v

            # add the maximum possible value for this channel.
            data_range = meta_channels.loc[channel]['$PnR']
            data_range = float(data_range)
            experiment.metadata[channel]['range'] = data_range

    experiment.metadata['fcs_metadata'] = {}
    for tube in self.tubes:
        # NB: compare the frame with ``is None``; ``frame != None`` on a
        # DataFrame is elementwise and its truth value is ambiguous
        if tube.file:
            if tube.frame is not None:
                raise util.CytoflowError("Both a DataFrame and an FCS file were specified, "
                                         "tube with file {0} and conditions {1}".format(tube.file,tube.conditions))

            if metadata_only:
                tube_meta, tube_data = parse_tube(tube.file,
                                                  experiment,
                                                  data_set = self.data_set,
                                                  metadata_only = True)
            else:
                tube_meta, tube_data = parse_tube(tube.file,
                                                  experiment,
                                                  data_set = self.data_set)
        elif tube.frame is not None and not tube.frame.empty:
            tube_meta = {} # probably incorrect --tsj
            tube_data = tube.frame
        else:
            # without this, the loop would silently reuse the previous
            # tube's metadata and events (or fail on an unbound name)
            raise util.CytoflowOpError('tubes',
                                       "Tube with conditions {0} has neither an "
                                       "FCS file nor a non-empty DataFrame"
                                       .format(tube.conditions))

        # optionally subsample each tube down to self.events events
        if self.events:
            if self.events <= len(tube_data):
                tube_data = tube_data.loc[np.random.choice(tube_data.index,
                                                           self.events,
                                                           replace = False)]
            else:
                warnings.warn("Only {0} events in tube {1}"
                              .format(len(tube_data), tube.file),
                              util.CytoflowWarning)

        experiment.add_events(tube_data[channels], tube.conditions)

        # extract the row and column from wells collected on a
        # BD HTS
        if 'WELL ID' in tube_meta:
            pos = tube_meta['WELL ID']
            tube_meta['CF_Row'] = pos[0]
            tube_meta['CF_Col'] = int(pos[1:3])

        for i, channel in enumerate(channels):
            # remove the PnV tube metadata
            pnv = '$P{}V'.format(i+1)
            if pnv in tube_meta:
                del tube_meta[pnv]

            # work around a bug where the PnR is sometimes not the detector range
            # but the data range.
            pnr = '$P{}R'.format(i+1)
            if pnr in tube_meta and float(tube_meta[pnr]) > experiment.metadata[channel]['range']:
                experiment.metadata[channel]['range'] = float(tube_meta[pnr])

        tube_meta['CF_File'] = Path(tube.file).stem

        # NOTE(review): entries are keyed by tube.file, so tubes that share
        # the same (e.g. empty) file value overwrite each other here
        experiment.metadata['fcs_metadata'][tube.file] = tube_meta

    # read once; the original indexed tube0_meta['$DATATYPE'] unguarded in
    # one of its two uses
    integer_data = tube0_meta.get('$DATATYPE') == 'I'

    for channel in channels:
        # the column in experiment.data that holds this channel's events;
        # it changes if the channel is renamed just below
        data_name = channel

        if self.channels and channel in self.channels:
            new_name = self.channels[channel]
            if channel != new_name:
                experiment.data.rename(columns = {channel : new_name}, inplace = True)
                experiment.metadata[new_name] = experiment.metadata[channel]
                experiment.metadata[new_name]["fcs_name"] = channel
                del experiment.metadata[channel]
                data_name = new_name

        if tube0.file:
            # meta_channels is still indexed by the original FCS name
            channel_meta = meta_channels.loc[channel]

            # this catches an odd corner case where some instruments store
            # instrument-specific info in the "extra" bits.  we have to
            # clear them out.
            if integer_data:
                data_bits = int(channel_meta['$PnB'])
                data_range = float(channel_meta['$PnR'])
                range_bits = int(math.log(data_range, 2))

                if range_bits < data_bits:
                    mask = 1
                    for _ in range(1, range_bits):
                        mask = mask << 1 | 1

                    experiment.data[data_name] = experiment.data[data_name].values.astype('int') & mask

            # re-scale the data to linear if it's recorded as log-scaled with
            # integer channels
            data_range = float(channel_meta['$PnR'])
            f1 = float(channel_meta['$PnE'][0])
            f2 = float(channel_meta['$PnE'][1])

            if f1 > 0.0 and f2 == 0.0:
                warnings.warn('Invalid $PnE = {},{} for channel {}, changing it to {},1.0'
                              .format(f1, f2, channel, f1),
                              util.CytoflowWarning)
                f2 = 1.0

            if f1 > 0.0 and f2 > 0.0 and integer_data:
                warnings.warn('Converting channel {} from logarithmic to linear'
                              .format(channel),
                              util.CytoflowWarning)
                # NOTE: only the warning is issued; the conversion itself
                # is currently disabled:
                # experiment.data[channel] = 10 ** (f1 * experiment.data[channel] / data_range) * f2

    return experiment
def plot(self, experiment, **kwargs):
    """
    Plot some data from an experiment.

    This function takes care of checking for facet name validity and
    subsetting, then passes the underlying dataframe to `BaseView.plot`

    Parameters
    ----------
    min_quantile : float (>0.0 and <1.0, default = 0.001)
        Clip data that is less than this quantile.

    max_quantile : float (>0.0 and <1.0, default = 1.00)
        Clip data that is greater than this quantile.

    Other Parameters
    ----------------
    lim : Dict(Str : (float, float))
        Set the range of each channel's axis.  If unspecified, assume
        that the limits are the minimum and maximum of the clipped data.
        Required.  The dictionary is updated in place with the resolved,
        scale-clipped limits before being passed on.

    scale : Dict(Str : IScale)
        Scale the data on each axis.  Required.

    Raises
    ------
    :class:`.CytoflowViewError`
        If no experiment is given; if a facet is not one of the
        experiment's conditions or is used more than once; if the
        quantiles are out of range or out of order; if ``lim`` or
        ``scale`` is missing or ``lim`` is malformed; or if the subset
        string is invalid or selects no events.
    """
    if experiment is None:
        raise util.CytoflowViewError('experiment', "No experiment specified")

    if self.xfacet and self.xfacet not in experiment.conditions:
        raise util.CytoflowViewError(
            'xfacet',
            "X facet {0} not in the experiment".format(self.xfacet))

    if self.yfacet and self.yfacet not in experiment.conditions:
        raise util.CytoflowViewError(
            'yfacet',
            "Y facet {0} not in the experiment".format(self.yfacet))

    if self.huefacet and self.huefacet not in experiment.conditions:
        raise util.CytoflowViewError(
            'huefacet',
            "Hue facet {0} not in the experiment".format(self.huefacet))

    # adjust the limits to clip extreme values
    min_quantile = kwargs.pop("min_quantile", 0.001)
    max_quantile = kwargs.pop("max_quantile", 1.0)

    if min_quantile < 0.0 or min_quantile > 1:
        raise util.CytoflowViewError(
            'min_quantile', "min_quantile must be between 0 and 1")

    if max_quantile < 0.0 or max_quantile > 1:
        raise util.CytoflowViewError(
            'max_quantile', "max_quantile must be between 0 and 1")

    if min_quantile >= max_quantile:
        raise util.CytoflowViewError(
            'min_quantile', "min_quantile must be less than max_quantile")

    # lim and scale stay in kwargs (get, not pop) so that the resolved
    # limits are forwarded to the parent class
    lim = kwargs.get('lim')
    scale = kwargs.get('scale')

    if lim is None:
        raise util.CytoflowViewError('lim', "lim must be specified")

    if lim and scale is None:
        raise util.CytoflowViewError('scale', "scale must be specified")

    for c in lim:
        bounds = lim[c]

        if bounds is None:
            bounds = (experiment[c].quantile(min_quantile),
                      experiment[c].quantile(max_quantile))
        elif isinstance(bounds, (list, tuple)):
            if len(bounds) != 2:
                raise util.CytoflowViewError(
                    'lim', r'Length of lim\[{}\] must be 2'.format(c))

            # fill in whichever end was left unspecified
            low, high = bounds
            if low is None:
                low = experiment[c].quantile(min_quantile)
            if high is None:
                high = experiment[c].quantile(max_quantile)
            bounds = (low, high)
        else:
            raise util.CytoflowViewError(
                'lim', r"lim\[{}\] is an unknown data type".format(c))

        lim[c] = [scale[c].clip(x) for x in bounds]

    facets = [x for x in [self.xfacet, self.yfacet, self.huefacet] if x]
    if len(facets) != len(set(facets)):
        raise util.CytoflowViewError(None, "Can't reuse facets")

    if self.subset:
        try:
            experiment = experiment.query(self.subset)
        except util.CytoflowError as e:
            raise util.CytoflowViewError('subset', str(e)) from e
        except Exception as e:
            raise util.CytoflowViewError(
                'subset',
                "Subset string '{0}' isn't valid".format(self.subset)) from e

        if len(experiment) == 0:
            raise util.CytoflowViewError(
                'subset',
                "Subset string '{0}' returned no events".format(self.subset))

    super().plot(experiment, experiment.data, **kwargs)
def plot(self, experiment, plot_name=None, **kwargs):
    """
    Plot a chart of a variable's values against a statistic.

    Checks that :attr:`variable` is a numeric condition of the experiment,
    builds the scale for the variable axis from :attr:`variable_scale`,
    and hands everything to the parent class's ``plot``.

    Parameters
    ----------
    variable_lim : (float, float)
        The limits on the variable axis

    color : a matplotlib color
        The color to plot with.  Overridden if `huefacet` is not `None`

    linewidth : float
        The width of the line, in points

    linestyle : ['solid' | 'dashed', 'dashdot', 'dotted' | (offset, on-off-dash-seq) | '-' | '--' | '-.' | ':' | 'None' | ' ' | '']

    marker : a matplotlib marker style
        See http://matplotlib.org/api/markers_api.html#module-matplotlib.markers

    markersize : int
        The marker size in points

    markerfacecolor : a matplotlib color
        The color to make the markers.  Overridden (?) if `huefacet` is
        not `None`

    alpha : the alpha blending value, from 0.0 (transparent) to 1.0 (opaque)

    capsize : scalar
        The size of the error bar caps, in points

    shade_error : bool
        If `False` (the default), plot the error statistic as traditional
        "error bars."  If `True`, plot error statistic as a filled, shaded
        region.

    shade_alpha : float
        The transparency of the shaded error region, from 0.0
        (transparent) to 1.0 (opaque.)  Default is 0.2.

    Raises
    ------
    :class:`.CytoflowViewError`
        If no experiment is given, or if :attr:`variable` is not one of
        the experiment's conditions or is not numeric.

    Notes
    -----
    Other `kwargs` are passed to `matplotlib.pyplot.plot <https://matplotlib.org/devdocs/api/_as_gen/matplotlib.pyplot.plot.html>`_
    """
    if experiment is None:
        raise util.CytoflowViewError('experiment', "No experiment specified")

    # these two checks take (trait name, message) just like the one above,
    # so they are raised as view errors as well
    if self.variable not in experiment.conditions:
        raise util.CytoflowViewError(
            'variable',
            "Variable {} not in the experiment".format(self.variable))

    if not util.is_numeric(experiment[self.variable]):
        raise util.CytoflowViewError(
            'variable',
            "Variable {} must be numeric".format(self.variable))

    variable_scale = util.scale_factory(self.variable_scale,
                                        experiment,
                                        condition=self.variable)

    super().plot(experiment,
                 plot_name,
                 variable_scale=variable_scale,
                 **kwargs)