class ImportPluginOp(PluginOpMixin, ImportOp):
    """
    Variant of :class:`ImportOp` whose :meth:`apply` is gated.

    The channels to import are edited as :attr:`channels_list` and are only
    converted into the :attr:`channels` mapping when the import actually runs.
    :meth:`apply` performs the import only after :meth:`estimate` has set
    :attr:`do_import` (or when called with ``force=True``); otherwise it
    raises an error telling the user which button to press.
    """

    handler_factory = Callable(ImportHandler, transient=True)

    original_channels = List(Str, estimate=True)
    channels_list = List(Channel, estimate=True)
    events = util.CIntOrNone(None, estimate=True)
    tubes = List(Tube, estimate=True)
    channels = Dict(Str, Str, transient=True)
    name_metadata = Enum(None, "$PnN", "$PnS", estimate=True)

    # number of events returned by the most recent successful apply()
    ret_events = util.PositiveInt(0, allow_zero=True, status=True)

    # set by estimate(), cleared by clear_estimate(); gates apply()
    do_import = Bool(False)

    def reset_channels(self):
        """
        Replace :attr:`channels_list` with one :class:`Channel` per entry of
        :attr:`original_channels`, each named with the sanitized form of its
        original name.
        """
        defaults = []
        for fcs_name in self.original_channels:
            defaults.append(
                Channel(channel=fcs_name,
                        name=util.sanitize_identifier(fcs_name)))

        # assign once so the trait sees a single wholesale replacement
        self.channels_list = defaults

    @on_trait_change('channels_list_items, channels_list.+')
    def _channels_changed(self, obj, name, old, new):
        """Publish an ESTIMATE-level change carrying the channel list."""
        payload = ('channels_list', self.channels_list)
        self.changed = (Changed.ESTIMATE, payload)

    @on_trait_change('tubes_items, tubes:+')
    def _tubes_changed(self, obj, name, old, new):
        """Publish an ESTIMATE-level change carrying the tube list."""
        payload = ('tubes', self.tubes)
        self.changed = (Changed.ESTIMATE, payload)

    def estimate(self, _):
        """
        Arm the import by leaving :attr:`do_import` set to ``True``.

        The flag is written ``False`` first and ``True`` second.
        NOTE(review): presumably so a change notification fires even when the
        flag was already ``True`` -- confirm.
        """
        for state in (False, True):
            self.do_import = state

    def apply(self, experiment=None, metadata_only=False, force=False):
        """
        Run the import if it has been armed, or if ``force`` is true.

        Parameters
        ----------
        experiment : Experiment
            Passed through to the parent class's ``apply``.

        metadata_only : bool (default = False)
            Passed through to the parent class's ``apply``.

        force : bool (default = False)
            Import even if :attr:`do_import` is not set.

        Returns
        -------
        Experiment
            Whatever the parent class's ``apply`` returns.

        Raises
        ------
        util.CytoflowOpError
            If the import has not been armed and ``force`` is false.
        """
        if not (self.do_import or force):
            # not armed: pick the hint that matches how far the user has got
            if not self.tubes:
                raise util.CytoflowOpError(
                    None, 'Click "Set up experiment", '
                    'then "Import!"')
            raise util.CytoflowOpError(None, "Press 'Import!'")

        # translate the editable channel list into the mapping ImportOp reads
        self.channels = dict(
            (entry.channel, entry.name) for entry in self.channels_list)

        imported = super().apply(experiment=experiment,
                                 metadata_only=metadata_only)
        self.ret_events = len(imported.data)
        return imported

    def clear_estimate(self):
        """Disarm the import by clearing :attr:`do_import`."""
        self.do_import = False

    def get_notebook_code(self, idx):
        """
        Return source code that re-creates this import as a plain
        :class:`ImportOp` named ``op_<idx>`` and applies it into ``ex_<idx>``.
        """
        notebook_op = ImportOp()
        notebook_op.copy_traits(self, notebook_op.copyable_trait_names())
        notebook_op.channels = dict(
            (entry.channel, entry.name) for entry in self.channels_list)

        # the template is formatted first and dedented afterwards
        source = """
            op_{idx} = {repr}

            ex_{idx} = op_{idx}.apply()""".format(repr=repr(notebook_op),
                                                  idx=idx)
        return dedent(source)
class ImportWorkflowOp(WorkflowOperation, ImportOp):
    """
    Variant of :class:`ImportOp` that imports during :meth:`estimate`.

    :meth:`estimate` builds the :attr:`channels` mapping from
    :attr:`channels_list`, runs the parent class's ``apply`` and caches the
    result in :attr:`ret_experiment`. :meth:`apply` only hands back that
    cached experiment.
    """

    original_channels = List(Str)
    channels_list = List(Channel, estimate=True)
    events = util.CIntOrNone(None, estimate=True)
    tubes = List(Tube, estimate=True)
    conditions = Dict(Str, Str, estimate=True)
    channels = Dict(Str, Str, transient=True)
    name_metadata = Enum(None, "$PnN", "$PnS", estimate=True)

    # how many events did we load?
    ret_events = util.PositiveInt(0,
                                  allow_zero=True,
                                  status=True,
                                  estimate_result=True,
                                  transient=True)

    # since we're actually calling super().apply() from self.estimate(), we
    # need to keep around the actual experiment that's returned
    ret_experiment = Instance('cytoflow.experiment.Experiment',
                              transient=True)

    def reset_channels(self):
        """
        Replace :attr:`channels_list` with one :class:`Channel` per entry of
        :attr:`original_channels`, each named with the sanitized form of its
        original name.
        """
        self.channels_list = [
            Channel(channel=x, name=util.sanitize_identifier(x))
            for x in self.original_channels
        ]

    def estimate(self, _):
        """
        Perform the import and cache the resulting experiment.

        Sets :attr:`channels` from :attr:`channels_list`, stores the result of
        the parent class's ``apply`` in :attr:`ret_experiment` and its length
        in :attr:`ret_events`.
        """
        self.channels = {c.channel: c.name for c in self.channels_list}
        self.ret_experiment = super().apply()
        self.ret_events = len(self.ret_experiment)

    def apply(self, _):
        """
        Return the experiment cached by :meth:`estimate`.

        Raises
        ------
        util.CytoflowOpError
            If :meth:`estimate` has not produced an experiment yet. The
            message depends on whether any tubes have been set up.
        """
        # Compare against None instead of relying on truthiness: estimate()
        # takes len() of the experiment, so a cached experiment holding zero
        # events may evaluate as falsy and be mistaken for "not imported".
        if self.ret_experiment is not None:
            return self.ret_experiment

        if not self.tubes:
            raise util.CytoflowOpError(
                None, 'Click "Set up experiment", then "Import!"')

        raise util.CytoflowOpError(None, 'Click "Import!"')

    def clear_estimate(self):
        """Drop the cached experiment and reset the event count."""
        self.ret_experiment = None
        self.ret_events = 0

    def get_notebook_code(self, idx):
        """
        Return source code that re-creates this import as a plain
        :class:`ImportOp` named ``op_<idx>`` and applies it into ``ex_<idx>``.
        """
        op = ImportOp()
        op.copy_traits(self, op.copyable_trait_names())
        op.channels = {c.channel: c.name for c in self.channels_list}

        return dedent("""
            op_{idx} = {repr}

            ex_{idx} = op_{idx}.apply()""".format(repr=repr(op), idx=idx))
class ImportOp(HasStrictTraits):
    """
    An operation for importing data and making an :class:`.Experiment`.

    To use, set the :attr:`conditions` dict to a mapping between condition name
    and NumPy ``dtype``.  Useful dtypes include ``category``, ``float``,
    ``int``, ``bool``.

    Next, set :attr:`tubes` to a list of :class:`Tube` containing FCS filenames
    and the corresponding conditions.

    If you would rather not analyze every single event in every FCS file,
    set :attr:`events` to the number of events from each FCS file you want to
    load.

    Call :meth:`apply` to load the data.  The usual ``experiment`` parameter
    can be ``None``.

    Attributes
    ----------
    conditions : Dict(Str, Str)
        A dictionary mapping condition names (keys) to NumPy ``dtype``\\ s
        (values).  Useful ``dtype``\\ s include ``category``, ``float``,
        ``int``, and ``bool``.

    tubes : List(Tube)
        A list of :class:`Tube` instances, which map FCS files to their
        corresponding experimental conditions.  Each :class:`Tube` must have a
        :attr:`~Tube.conditions` dict whose keys match those of
        :attr:`conditions`.

    channels : Dict(Str, Str)
        If you only need a subset of the channels available in the data set,
        specify them here.  Each ``(key, value)`` pair specifies a channel to
        include in the output experiment.  The key is the channel name in the
        FCS file, and the value is the name of the channel in the Experiment.
        You can use this to rename channels as you import data (because flow
        channel names are frequently not terribly informative.)  New channel
        names must be valid Python identifiers: start with a letter or ``_``,
        and all characters must be letters, numbers or ``_``.  If
        :attr:`channels` is empty, load all channels in the FCS files.

    events : Int
        If not None, import only a random subset of events of size
        :attr:`events`.  Presumably the analysis will go faster but less
        precisely; good for interactive data exploration.  Then, unset
        :attr:`events` and re-run the analysis non-interactively.

    name_metadata : {None, "$PnN", "$PnS"} (default = None)
        Which FCS metadata is the channel name?  If ``None``, attempt to
        autodetect.

    data_set : Int (default = 0)
        The FCS standard allows you to encode multiple data sets in a single
        FCS file.  Some software (such as the Beckman-Coulter software) also
        encode the same data in two different formats -- for example, FCS2.0
        and FCS3.0.  To access a data set other than the first one, set
        :attr:`data_set` to the 0-based index of the data set you would like
        to use.  This will be used for *all FCS files imported by this
        operation.*

    ignore_v : List(Str)
        :class:`cytoflow` is designed to operate on an :class:`.Experiment`
        containing tubes that were all collected under the same instrument
        settings.  In particular, the same PMT voltages ensure that data can
        be compared across samples.

        *Very rarely*, you may need to set up an :class:`.Experiment` with
        different voltage settings on different :class:`Tube`\\ s.  This is
        likely only to be the case when you are trying to figure out which
        voltages should be used in future experiments.  If so, set
        :attr:`ignore_v` to a :class:`List` of channel names to ignore
        particular channels.

        .. warning::

            THIS WILL BREAK REAL EXPERIMENTS

    Examples
    --------
    >>> tube1 = flow.Tube(file = 'RFP_Well_A3.fcs', conditions = {"Dox" : 10.0})
    >>> tube2 = flow.Tube(file='CFP_Well_A4.fcs', conditions = {"Dox" : 1.0})
    >>> import_op = flow.ImportOp(conditions = {"Dox" : "float"},
    ...                           tubes = [tube1, tube2])
    >>> ex = import_op.apply()
    """

    id = Constant("edu.mit.synbio.cytoflow.operations.import")
    friendly_id = Constant("Import")
    name = Constant("Import Data")

    # experimental conditions: name --> dtype.
    conditions = Dict(Str, Str)

    # the tubes
    tubes = List(Tube)

    # which channels do we import?
    channels = Dict(Str, Str)

    # which FCS metadata has the channel names in it?
    name_metadata = Enum(None, "$PnN", "$PnS")

    # which data set to get out of the FCS files?
    data_set = Int(0)

    # are we subsetting?
    events = util.CIntOrNone(None)
    coarse_events = util.Deprecated(new='events')

    # DON'T DO THIS
    ignore_v = List(Str)

    def apply(self, experiment=None, metadata_only=False):
        """
        Load a new :class:`.Experiment`.

        Parameters
        ----------
        experiment : Experiment
            Ignored

        metadata_only : bool (default = False)
            Only "import" the metadata, creating an Experiment with all the
            expected metadata and structure but 0 events.

        Returns
        -------
        Experiment
            The new :class:`.Experiment`.  New channels have the following
            metadata:

            - **voltage** - int
                The voltage that this channel was collected at.  Determined
                by the ``$PnV`` field from the first FCS file.

            - **range** - int
                The maximum range of this channel.  Determined by the ``$PnR``
                field from the first FCS file.

            New experimental conditions do not have **voltage** or **range**
            metadata, obviously.  Instead, they have **experiment** set to
            ``True``, to distinguish the experimental variables from the
            conditions that were added by gates, etc.

            If :attr:`ignore_v` is set, it is added as a key to the
            :class:`.Experiment`-wide metadata.

        Raises
        ------
        util.CytoflowOpError
            If no tubes are specified; if a renamed channel's new name is not
            a valid Python identifier; if the tubes do not all have the same
            condition names; if two tubes have equal conditions; if the
            metadata of the first tube cannot be read; or if a requested
            channel is not present in the first tube.
        """
        if not self.tubes:
            raise util.CytoflowOpError('tubes', "Must specify some tubes!")

        # if we have channel renaming, make sure the new names are valid
        # python identifiers
        for old_name, new_name in self.channels.items():
            if old_name != new_name and new_name != util.sanitize_identifier(
                    new_name):
                raise util.CytoflowOpError(
                    'channels', "Channel name {} must be a "
                    "valid Python identifier.".format(new_name))

        # make sure each tube has the same conditions
        first_tube = self.tubes[0]
        tube0_conditions = set(first_tube.conditions)
        for tube in self.tubes:
            if set(tube.conditions) != tube0_conditions:
                raise util.CytoflowOpError(
                    'tubes', "Tube {0} didn't have the same "
                    "conditions as tube {1}".format(tube.file,
                                                    first_tube.file))

        # make sure experimental conditions are unique: compare every tube
        # with each tube that follows it
        for idx, tube_a in enumerate(self.tubes[:-1]):
            for tube_b in self.tubes[idx + 1:]:
                if tube_a.conditions_equal(tube_b):
                    raise util.CytoflowOpError(
                        'tubes', "The same conditions specified for "
                        "tube {0} and tube {1}".format(tube_a.file,
                                                       tube_b.file))

        experiment = Experiment()
        experiment.metadata["ignore_v"] = self.ignore_v

        for condition, dtype in self.conditions.items():
            experiment.add_condition(condition, dtype)
            experiment.metadata[condition]['experiment'] = True

        try:
            # silence warnings about duplicate channels;
            # we'll figure that out below
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                tube0_meta = fcsparser.parse(first_tube.file,
                                             data_set=self.data_set,
                                             meta_data_only=True,
                                             reformat_meta=True)
        except Exception as e:
            raise util.CytoflowOpError(
                'tubes', "FCS reader threw an error reading metadata "
                "for tube {}: {}".format(first_tube.file, str(e))) from e

        meta_channels = tube0_meta["_channels_"]

        if self.name_metadata:
            experiment.metadata["name_metadata"] = self.name_metadata
        else:
            experiment.metadata["name_metadata"] = autodetect_name_metadata(
                first_tube.file, data_set=self.data_set)

        # keep the parser's original index as a column, then re-index the
        # per-channel metadata by channel name
        meta_channels['Index'] = meta_channels.index
        meta_channels.set_index(experiment.metadata["name_metadata"],
                                inplace=True)

        channels = list(self.channels) if self.channels \
            else list(meta_channels.index.values)

        # make sure everything in self.channels is in the tube channels
        for channel in channels:
            if channel not in meta_channels.index:
                raise util.CytoflowOpError(
                    'channels', "Channel {0} not in tube {1}".format(
                        channel, first_tube.file))

        # now that we have the metadata, load it into experiment
        for channel in channels:
            channel_meta = meta_channels.loc[channel]

            experiment.add_channel(channel)
            experiment.metadata[channel]["fcs_name"] = channel

            # keep track of the channel's PMT voltage
            if "$PnV" in channel_meta:
                v = channel_meta['$PnV']
                if v:
                    experiment.metadata[channel]["voltage"] = v

            # add the maximum possible value for this channel.
            experiment.metadata[channel]['range'] = float(
                channel_meta['$PnR'])

        experiment.metadata['fcs_metadata'] = {}
        for tube in self.tubes:
            if metadata_only:
                tube_meta, tube_data = parse_tube(tube.file,
                                                  experiment,
                                                  data_set=self.data_set,
                                                  metadata_only=True)
            else:
                tube_meta, tube_data = parse_tube(tube.file,
                                                  experiment,
                                                  data_set=self.data_set)

                # sub-sample only when real event data was loaded
                if self.events:
                    if self.events <= len(tube_data):
                        tube_data = tube_data.loc[np.random.choice(
                            tube_data.index, self.events, replace=False)]
                    else:
                        warnings.warn(
                            "Only {0} events in tube {1}".format(
                                len(tube_data), tube.file),
                            util.CytoflowWarning)

            experiment.add_events(tube_data[channels], tube.conditions)

            # extract the row and column from wells collected on a
            # BD HTS
            if 'WELL ID' in tube_meta:
                pos = tube_meta['WELL ID']
                tube_meta['CF_Row'] = pos[0]
                tube_meta['CF_Col'] = int(pos[1:3])

            # NOTE(review): the $PnV / $PnR keys below are built from the
            # position of the channel in ``channels``.  When ``channels`` is a
            # subset (or reordering) of the file's channels, that position may
            # not be the FCS parameter number -- confirm whether the saved
            # 'Index' column should be used instead.
            for i, channel in enumerate(channels):
                # remove the PnV tube metadata
                pnv = '$P{}V'.format(i + 1)
                if pnv in tube_meta:
                    del tube_meta[pnv]

                # work around a bug where the PnR is sometimes not the
                # detector range but the data range.
                pnr = '$P{}R'.format(i + 1)
                if pnr in tube_meta and float(
                        tube_meta[pnr]
                ) > experiment.metadata[channel]['range']:
                    experiment.metadata[channel]['range'] = float(
                        tube_meta[pnr])

            tube_meta['CF_File'] = Path(tube.file).stem

            experiment.metadata['fcs_metadata'][tube.file] = tube_meta

        is_integer_data = tube0_meta['$DATATYPE'] == 'I'

        for channel in channels:
            channel_meta = meta_channels.loc[channel]

            # ``column`` is the name this channel's data currently has in
            # experiment.data.  It stops being the FCS name once the channel
            # is renamed, so everything below that touches the data must use
            # ``column`` while FCS metadata lookups keep using ``channel``.
            column = channel
            new_name = self.channels.get(channel, channel)
            if new_name != channel:
                experiment.data.rename(columns={channel: new_name},
                                       inplace=True)
                experiment.metadata[new_name] = experiment.metadata[channel]
                experiment.metadata[new_name]["fcs_name"] = channel
                del experiment.metadata[channel]
                column = new_name

            # this catches an odd corner case where some instruments store
            # instrument-specific info in the "extra" bits.  we have to
            # clear them out.
            if is_integer_data:
                data_bits = int(channel_meta['$PnB'])
                data_range = float(channel_meta['$PnR'])

                # log2 is exact for powers of two, unlike log(x, 2).
                # NOTE(review): int() floors, so a $PnR that is not a power
                # of two yields a mask narrower than the range -- confirm
                # whether this should round up instead.
                range_bits = int(math.log2(data_range))

                if range_bits < data_bits:
                    # the low ``range_bits`` bits set, and never fewer than
                    # one bit
                    mask = (1 << max(range_bits, 1)) - 1

                    experiment.data[column] = experiment.data[
                        column].values.astype('int') & mask

            # check whether the data is recorded as log-scaled with integer
            # channels
            f1 = float(channel_meta['$PnE'][0])
            f2 = float(channel_meta['$PnE'][1])

            if f1 > 0.0 and f2 == 0.0:
                warnings.warn(
                    'Invalid $PnE = {},{} for channel {}, changing it to {},1.0'
                    .format(f1, f2, channel, f1), util.CytoflowWarning)
                f2 = 1.0

            if f1 > 0.0 and f2 > 0.0 and is_integer_data:
                # NOTE(review): only the warning is emitted; the rescaling
                # itself, 10 ** (f1 * data / $PnR) * f2, is not applied here.
                warnings.warn(
                    'Converting channel {} from logarithmic to linear'.format(
                        channel), util.CytoflowWarning)

        return experiment