def run_job(self, batch, assume_clean_state=False):
    '''Extracts OMEXML from microscope image or metadata files.

    Parameters
    ----------
    batch: dict
        description of the *run* job
    assume_clean_state: bool, optional
        assume that output of previous runs has already been cleaned up

    Note
    ----
    The actual processing is delegated to the
    `showinf <http://www.openmicroscopy.org/site/support/bio-formats5.1/users/comlinetools/display.html>`_
    Bioformats command line tool.

    Raises
    ------
    subprocess.CalledProcessError
        when extraction failed
    '''
    # NOTE: Ideally, we would use the BFOmeXmlReader together with JavaBridge
    # but this approach has several shortcomings and requires too much
    # memory to run efficiently on individual cores.
    with tm.utils.ExperimentSession(self.experiment_id) as session:
        for file_id in batch['microscope_image_file_ids']:
            img_file = session.query(tm.MicroscopeImageFile).get(file_id)
            logger.info('process image %d' % img_file.id)
            # "showinf" prints the extracted OMEXML on standard output;
            # the flags suppress pixel data and validation overhead.
            showinf_args = [
                'showinf', '-omexml-only', '-nopix', '-novalid',
                '-nocore', '-no-upgrade', '-no-sas', img_file.location,
            ]
            proc = subprocess.Popen(
                showinf_args,
                stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )
            stdout, stderr = proc.communicate()
            logger.debug("showinf STDOUT: \n```%s```\n", stdout)
            logger.debug("showinf STDERR: \n```%s```\n", stderr)
            failed = proc.returncode != 0 or not stdout
            if failed:
                raise MetadataError(
                    'Extraction of OMEXML failed! Error message:\n%s' % stderr)
            # The OME-XML payload sits between `<OME ...>` and `</OME>`;
            # everything outside those tags (warnings etc.) is discarded.
            ome_start = stdout.find("<OME")
            if ome_start == -1:
                raise ValueError(
                    "Cannot find OME-XML start tag in `showinf` output.")
            ome_end = stdout.rfind("</OME>", ome_start)
            if ome_end == -1:
                raise ValueError(
                    "Cannot find OME-XML closing tag in `showinf` output.")
            xml_text = stdout[ome_start:ome_end + len('</OME>')]
            img_file.omexml = unicode(xml_text)
            session.add(img_file)
            session.commit()
            session.expunge(img_file)
def run_job(self, batch, assume_clean_state=False):
    '''Extracts OMEXML from microscope image or metadata files.

    Parameters
    ----------
    batch: dict
        description of the *run* job
    assume_clean_state: bool, optional
        assume that output of previous runs has already been cleaned up

    Note
    ----
    The actual processing is delegated to the
    `showinf <http://www.openmicroscopy.org/site/support/bio-formats5.1/users/comlinetools/display.html>`_
    Bioformats command line tool.

    Raises
    ------
    subprocess.CalledProcessError
        when extraction failed
    '''
    # NOTE: Ideally, we would use the BFOmeXmlReader together with JavaBridge
    # but this approach has several shortcomings and requires too much
    # memory to run efficiently on individual cores.
    with tm.utils.ExperimentSession(self.experiment_id) as session:
        for fid in batch['microscope_image_file_ids']:
            img_file = session.query(tm.MicroscopeImageFile).get(fid)
            logger.info('process image %d' % img_file.id)
            # The "showinf" command line tool writes the extracted OMEXML
            # to standard output.
            command = [
                'showinf', '-omexml-only', '-nopix', '-novalid',
                '-nocore', '-no-upgrade', '-no-sas', img_file.location
            ]
            p = subprocess.Popen(
                command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )
            stdout, stderr = p.communicate()
            if p.returncode != 0 or not stdout:
                raise MetadataError(
                    'Extraction of OMEXML failed! Error message:\n%s' % stderr)
            # We only want the XML. This will remove potential
            # warnings and other stuff we don't want.
            # FIX: the previous version wrapped `.group()` in a bare
            # "except:", which swallowed every exception (including
            # KeyboardInterrupt/SystemExit) and masked unrelated errors.
            # An explicit None check on the match is both safer and clearer.
            match = re.search(r'<(\w+).*</\1>', stdout, flags=re.DOTALL)
            if match is None:
                raise RegexError('OMEXML metadata could not be extracted.')
            img_file.omexml = unicode(match.group())
            session.add(img_file)
            session.commit()
            session.expunge(img_file)
def determine_grid_coordinates_from_layout(self, stitch_layout,
                                           stitch_dimensions):
    '''Determines the coordinates of each image acquisition site within the
    continuous acquisition grid (slide or well in a plate)
    based on a provided layout.

    Parameters
    ----------
    stitch_layout: str
        layout of the acquisition grid
        (options: ``"horizontal"``, ``"zigzag_horizontal"``, ``"vertical"``,
        or ``"zigzag_vertical"``)
    stitch_dimensions: Tuple[int]
        dimensions of the acquisition grid, i.e. number of images along the
        vertical and horizontal axis of the acquired area

    Returns
    -------
    pandas.DataFrame
        metadata for each 2D *Plane* element

    Raises
    ------
    MetadataError
        when wells have unequal numbers of acquisition sites
    ValueError
        when the provided stitch dimensions don't match the number of sites

    See also
    --------
    :func:`illuminati.stitch.calc_grid_coordinates_from_layout`
    '''
    md = self.metadata
    logger.info('determine acquisition grid coordinates based on layout')
    # Determine the number of unique positions per well
    acquisitions_per_well = md.groupby(
        ['well_name', 'channel_name', 'zplane', 'tpoint'])
    n_acquisitions_per_well = acquisitions_per_well.count().name
    if len(np.unique(n_acquisitions_per_well)) > 1:
        raise MetadataError(
            'Each well must have the same number of acquisition sites.')
    # FIX: removed the unused local `n_sites` that was assigned from
    # n_acquisitions_per_well[0] but never read.
    sites = acquisitions_per_well.groups.values()
    logger.debug('stitch layout: {0}; stitch dimensions: {1}'.format(
        stitch_layout, stitch_dimensions))
    coordinates = stitch.calc_grid_coordinates_from_layout(
        stitch_dimensions, stitch_layout)
    # Each coordinate is a (y, x) pair within the well grid.
    y_coordinates = [c[0] for c in coordinates]
    x_coordinates = [c[1] for c in coordinates]
    for indices in sites:
        if len(indices) != len(coordinates):
            raise ValueError('Incorrect stitch dimensions provided.')
        md.loc[indices, 'well_position_y'] = y_coordinates
        md.loc[indices, 'well_position_x'] = x_coordinates
    return self.metadata
def configure_from_filenames(self, plate_dimensions, regex):
    '''Configures metadata based on information encoded in image filenames
    using a regular expression with the following fields:

        - *w*: well
        - *t*: time point
        - *s*: acquisition site
        - *z*: focal plane (z dimension)
        - *c*: channel

    Parameters
    ----------
    plate_dimensions: Tuple[int]
        number of rows and columns in the well plate
    regex: str
        named regular expression

    Raises
    ------
    tmlib.errors.MetadataError
        when image files contain more than one plane, since this case
        wouldn't allow a 1-to-1 mapping of information from filename to
        image plane

    Returns
    -------
    pandas.DataFrame
        metadata for each 2D *Plane* element
    '''
    # NOTE(review): `plate_dimensions` is not referenced anywhere in this
    # body — presumably kept for interface compatibility with other
    # configure_* methods; confirm before removing.
    logger.info('update image metadata with filename information')
    md = self.metadata
    # Collect the unique set of file names across all file mappings and
    # order them "naturally" (file2 before file10).
    filenames = natsorted(
        list(set([f for fm in self._file_mapper_list for f in fm.files])))
    # One metadata row per file: only valid when every file holds a
    # single 2D plane.
    if md.shape[0] != len(filenames):
        raise MetadataError(
            'Configuration of metadata based on filenames '
            'works only when each image file contains only a single plane.'
        )
    logger.info('retrieve metadata from filenames via regular expression')
    self.check_regular_expression(regex)
    for i, f in enumerate(filenames):
        # Not every microscope provides all the information in the filename.
        fields = self.extract_fields_from_filename(regex, f)
        md.at[i, 'channel_name'] = str(fields.c)
        md.at[i, 'site'] = int(fields.s)
        md.at[i, 'zplane'] = int(fields.z)
        md.at[i, 'tpoint'] = int(fields.t)
        md.at[i, 'well_name'] = str(fields.w)
    return self.metadata
def determine_grid_coordinates_from_stage_positions(self):
    '''Determines the coordinates of each image acquisition site within the
    continuous acquisition grid (slide or well in a plate)
    based on the absolute microscope stage positions.

    Returns
    -------
    pandas.DataFrame
        metadata for each 2D *Plane* element

    Raises
    ------
    MetadataError
        when stage position information is not available from `metadata`

    See also
    --------
    :func:`illuminati.stitch.calc_grid_coordinates_from_positions`
    '''
    md = self.metadata
    # All rows must carry stage positions; partial information cannot be
    # translated into grid coordinates.
    if (any(md.stage_position_y.isnull()) or
            any(md.stage_position_x.isnull())):
        raise MetadataError('Stage position information is not available.')
    logger.info('translate absolute microscope stage positions into '
                'relative acquisition grid coordinates')
    planes_per_well = md.groupby(['well_name'])
    n_tpoints = len(np.unique(md.tpoint))
    n_channels = len(np.unique(md.channel_name))
    n_zplanes = len(np.unique(md.zplane))
    for well_name in np.unique(md.well_name):
        # Row indices of all planes belonging to this well.
        ix = planes_per_well.groups[well_name]
        # (y, x) stage position pairs — on Python 2 `zip` returns a list,
        # which the `len()` below relies on.
        positions = zip(md.loc[ix, 'stage_position_y'],
                        md.loc[ix, 'stage_position_x'])
        # Number of distinct acquisition sites per well: total planes
        # divided by planes per site. NOTE(review): this is Python-2
        # integer division ("/" truncates for ints) — under Python 3 this
        # would yield a float; confirm intended semantics if porting.
        n = len(positions) / (n_tpoints * n_channels * n_zplanes)
        coordinates = self._calculate_coordinates(positions, n)
        md.loc[ix, 'well_position_y'] = [c[0] for c in coordinates]
        md.loc[ix, 'well_position_x'] = [c[1] for c in coordinates]
    return self.metadata
def _combine_omexml_elements(self, omexml_images, omexml_metadata):
    '''Merges OMEXML extracted from individual image files with the
    (optional) OMEXML obtained from additional metadata files, and builds
    the file mappings that link each resulting plane back to its source
    file, series and plane index.

    Parameters
    ----------
    omexml_images: dict
        mapping of image filename to its ``bioformats.OMEXML`` element
    omexml_metadata: bioformats.omexml.OMEXML or None
        OMEXML from additional microscope metadata files, if available

    Returns
    -------
    bioformats.omexml.OMEXML
        combined metadata covering all images

    Raises
    ------
    TypeError
        when `omexml_metadata` is provided but has the wrong type
    MetadataError
        when image counts or plane counts disagree between the two sources
    '''
    logger.info('combine OMEXML elements')
    # We assume here that each image files contains the same number images.
    n_images = omexml_images.values()[0].image_count * len(omexml_images)
    if omexml_metadata is not None:
        extra_omexml_available = True
        if not isinstance(omexml_metadata, bioformats.omexml.OMEXML):
            raise TypeError('Argument "omexml_metadata" must have type '
                            'bioformats.omexml.OMEXML.')
        if omexml_metadata.image_count != n_images:
            raise MetadataError(
                'Number of images in "omexml_metadata" must match '
                'the total number of Image elements in "omexml_images".')
    else:
        # No extra metadata: start from an empty OMEXML skeleton that the
        # loop below fills in from the per-image elements.
        extra_omexml_available = False
        omexml_metadata = bioformats.OMEXML(XML_DECLARATION)
        omexml_metadata.image_count = n_images
    # Attributes copied from the extracted elements into the combined
    # document, per element type.
    image_element_attributes = {'AcquisitionDate', 'Name'}
    channel_element_attributes = {'Name'}
    pixel_element_attributes = {
        'PixelType', 'SizeC', 'SizeT', 'SizeX', 'SizeY', 'SizeZ'
    }
    plane_element_attributes = {
        'PositionX', 'PositionY', 'PositionZ', 'TheC', 'TheT', 'TheZ'
    }
    filenames = natsorted(omexml_images)
    # `count` is the global Image index within the combined document.
    count = 0
    for i, f in enumerate(filenames):
        omexml_img = omexml_images[f]
        n_series = omexml_img.image_count
        for s in xrange(n_series):
            extracted_image = omexml_img.image(s)
            md_image = omexml_metadata.image(count)
            for attr in image_element_attributes:
                extracted_value = getattr(extracted_image, attr)
                if extracted_value is not None:
                    setattr(md_image, attr, extracted_value)
            extracted_pixels = extracted_image.Pixels
            n_planes = extracted_pixels.plane_count
            if n_planes == 0:
                # Sometimes an image doesn't have any plane elements.
                # Let's create them for consistency.
                extracted_pixels = self._create_channel_planes(
                    extracted_pixels)
                n_planes = extracted_pixels.plane_count
            md_pixels = md_image.Pixels
            md_pixels.plane_count = n_planes
            # NOTE(review): this condition can never be True — plane_count
            # was assigned n_planes on the previous line; the intended
            # comparison was presumably against the pre-assignment value.
            # Confirm before changing.
            if extra_omexml_available and (md_pixels.plane_count != n_planes):
                raise MetadataError(
                    'Image element #%d in OMEXML obtained from additional '
                    'metdata files must have the same number of Plane '
                    'elements as the corresponding Image elements in the '
                    'OMEXML element obtained from image file "%s".' % (i, f))
            for attr in pixel_element_attributes:
                extracted_value = getattr(extracted_pixels, attr)
                if extracted_value is not None:
                    # This is python-bioformats being stupid by setting
                    # random default values.
                    setattr(md_pixels, attr, extracted_value)
            for p in xrange(n_planes):
                extracted_plane = extracted_pixels.Plane(p)
                md_plane = md_pixels.Plane(p)
                for attr in plane_element_attributes:
                    extracted_value = getattr(extracted_plane, attr)
                    md_value = getattr(md_plane, attr)
                    # Extracted values only fill gaps; existing values from
                    # the metadata files take precedence.
                    if md_value is None and extracted_value is not None:
                        setattr(md_plane, attr, extracted_value)
                # Record where this plane physically lives: file f,
                # series s, plane p — keyed by the global plane index.
                fm = ImageFileMapping()
                fm.ref_index = count + p
                fm.files = [f]
                fm.series = [s]
                fm.planes = [p]
                self._file_mapper_list.append(fm)
                self._file_mapper_lut[f].append(fm)
            n_channels = extracted_pixels.channel_count
            md_image.channel_count = n_channels
            for c in xrange(n_channels):
                extracted_channel = extracted_pixels.Channel(c)
                md_channel = md_pixels.Channel(c)
                for attr in channel_element_attributes:
                    extracted_value = getattr(extracted_channel, attr)
                    if extracted_value is not None:
                        setattr(md_channel, attr, extracted_value)
            count += 1
    return omexml_metadata
def collect_job_output(self, batch):
    '''Assigns registered image files from different acquisitions to
    separate *cycles*. If an acquisition includes multiple time points,
    a separate *cycle* is created for each time point.
    The mapping from *acquisitions* to *cycles* is consequently 1 -> n,
    where n is the number of time points per acquisition (n >= 1).

    Whether acquisition time points will be interpreted as actual time
    points in a time series depends on the value of
    :attr:`tm.Experiment.plate_acquisition_mode`.

    Parameters
    ----------
    batch: dict
        description of the *collect* job
    '''
    with tm.utils.ExperimentSession(self.experiment_id) as session:
        # We need to do this per plate to ensure correct indices
        # TODO: check plates have similar channels, etc
        experiment = session.query(tm.Experiment).one()
        acquisition_mode = experiment.plate_acquisition_mode
        logger.info('plates were acquired in mode "%s"', acquisition_mode)
        is_time_series = acquisition_mode == 'basic'
        if is_time_series:
            logger.info('time points are interpreted as time series')
        is_multiplexing = acquisition_mode == 'multiplexing'
        if is_multiplexing:
            logger.info('time points are interpreted as multiplexing cycles')
    with tm.utils.ExperimentSession(self.experiment_id) as session:
        # Lookup table: channel name -> channel id (updated as channels
        # get renamed/created below).
        channels = session.query(tm.Channel.name, tm.Channel.id).all()
        channel_lut = dict(channels)
        # NOTE(review): .one() already raises if more than one distinct
        # value exists, so the len() > 1 check below looks unreachable;
        # confirm intended behavior (.all() + check may have been meant).
        bit_depth = session.query(tm.Channel.bit_depth).distinct().one()
        if len(bit_depth) > 1:
            raise MetadataError('All channels must have the same bit depth.')
        bit_depth = bit_depth[0]
        wavelengths = session.query(tm.Channel.wavelength).\
            distinct().\
            all()
        wavelengths = [w[0] for w in wavelengths]
        # We order acquisitions by the time they got created. This will
        # determine the order of multiplexing cycles.
        plates = session.query(tm.Plate.id).\
            order_by(tm.Plate.created_at).\
            all()
        plate_ids = [p.id for p in plates]
        for p in plate_ids:
            acquisitions = session.query(tm.Acquisition.id).\
                filter_by(plate_id=p).\
                order_by(tm.Acquisition.created_at).\
                all()
            acquisition_ids = [a.id for a in acquisitions]
            # Per-plate running indices: t_index counts time points (time
            # series mode), c_index counts cycles (multiplexing mode).
            t_index = 0
            w_index = 0
            c_index = 0
            for a in acquisition_ids:
                logger.debug('acquisition %d', a)
                tpoints = session.query(tm.ChannelImageFile.tpoint).\
                    filter_by(acquisition_id=a).\
                    distinct().\
                    all()
                tpoints = [t[0] for t in tpoints]
                for t in tpoints:
                    logger.debug('time point #%d', t)
                    cycle = session.get_or_create(
                        tm.Cycle,
                        index=c_index, experiment_id=self.experiment_id
                    )
                    for w in wavelengths:
                        # Get all channel_image_files for the currently
                        # processed acquisition that match the old values
                        # of the "tpoint" and "channel_id" attributes.
                        image_files = session.query(tm.ChannelImageFile.id).\
                            filter_by(
                                tpoint=t, acquisition_id=a,
                                channel_id=channel_lut[w]
                            ).\
                            all()
                        if len(image_files) == 0:
                            # A wavelength might not have been used at
                            # every time point.
                            continue
                        logger.debug('wavelength "%s"', w)
                        if is_multiplexing:
                            # In case of a multiplexing experiment
                            # we create a separate channel for each
                            # combination of wavelength and tpoint.
                            new_channel_name = '{c}_{w}'.format(
                                c=c_index, w=w
                            )
                        else:
                            # In case of a time series experiment
                            # the name of the channel remains unchanged.
                            new_channel_name = w
                        # Check whether the channel already exists and
                        # update the name accordingly (upon creation, the
                        # "name" attribute should have been set to the
                        # value of the "wavelength" attribute).
                        channel = session.query(tm.Channel).\
                            filter_by(name=w, wavelength=w).\
                            one_or_none()
                        if channel is not None:
                            channel.name = new_channel_name
                            session.add(channel)
                            session.commit()
                        else:
                            channel = tm.Channel(
                                name=new_channel_name, wavelength=w,
                                bit_depth=bit_depth,
                                experiment_id=self.experiment_id
                            )
                            session.add(channel)
                            session.commit()
                        logger.info(
                            'update time point and channel id '
                            'of channel image files: tpoint=%d, channel=%s',
                            t_index, channel.name
                        )
                        # Update the attributes of channel_image_files with
                        # the new values for tpoint and channel_id and also
                        # add the cycle_id.
                        session.bulk_update_mappings(
                            tm.ChannelImageFile, [
                                {
                                    'id': f.id,
                                    'tpoint': t_index,
                                    'cycle_id': cycle.id,
                                    'channel_id': channel.id
                                }
                                for f in image_files
                            ]
                        )
                        # Update lookup table
                        channel_lut[new_channel_name] = channel.id
                    if is_time_series:
                        t_index += 1
                    else:
                        c_index += 1
def run_job(self, batch, assume_clean_state=False):
    '''Configures OMEXML metadata extracted from microscope image files and
    complements it with metadata retrieved from additional microscope
    metadata files and/or user input.

    The actual processing is delegated to a format-specific implementation of
    :class:`MetadataHandler <tmlib.workflow.metaconfig.base.MetadataHandler>`.

    Parameters
    ----------
    batch: dict
        job description
    assume_clean_state: bool, optional
        assume that output of previous runs has already been cleaned up

    See also
    --------
    :mod:`tmlib.workflow.metaconfig.cellvoyager`
    '''
    # Fall back to the microscope-type default regex when none was
    # supplied in the job description.
    regexp = batch.get('regex', '')
    if not regexp:
        regexp = get_microscope_type_regex(
            batch['microscope_type'], as_string=True
        )[0]
    with tm.utils.ExperimentSession(self.experiment_id) as session:
        experiment = session.query(tm.Experiment).one()
        plate_dimensions = experiment.plates[0].dimensions
        acquisition = session.query(tm.Acquisition).\
            get(batch['acquisition_id'])
        metadata_files = session.query(tm.MicroscopeMetadataFile.location).\
            filter_by(acquisition_id=batch['acquisition_id']).\
            all()
        metadata_filenames = [f.location for f in metadata_files]
        image_files = session.query(
                tm.MicroscopeImageFile.name, tm.MicroscopeImageFile.omexml
            ).\
            filter_by(acquisition_id=batch['acquisition_id']).\
            all()
        # Parse the OMEXML stored for each image file (extracted by the
        # metaextract step) into in-memory OMEXML objects.
        omexml_images = {
            f.name: bioformats.OMEXML(f.omexml) for f in image_files
        }
    MetadataReader = metadata_reader_factory(batch['microscope_type'])
    if MetadataReader is not None:
        with MetadataReader() as mdreader:
            omexml_metadata = mdreader.read(
                metadata_filenames, omexml_images.keys()
            )
    else:
        omexml_metadata = None
    MetadataHandler = metadata_handler_factory(batch['microscope_type'])
    mdhandler = MetadataHandler(omexml_images, omexml_metadata)
    mdhandler.configure_from_omexml()
    missing = mdhandler.determine_missing_metadata()
    if missing:
        logger.warning(
            'required metadata information is missing: "%s"',
            '", "'.join(missing)
        )
        logger.info(
            'try to retrieve missing metadata from filenames '
            'using regular expression'
        )
        # NOTE(review): regexp can never be None here — it defaults to ''
        # above and is then replaced when falsy — so this branch looks
        # unreachable; confirm before removing.
        if regexp is None:
            logger.warn('no regular expression provided')
        mdhandler.configure_from_filenames(
            plate_dimensions=plate_dimensions, regex=regexp
        )
    missing = mdhandler.determine_missing_metadata()
    if missing:
        raise MetadataError(
            'The following metadata information is missing:\n"%s"\n'
            % '", "'.join(missing)
        )
    # Once we have collected basic metadata such as information about
    # channels and focal planes, we try to determine the relative position
    # of images within the acquisition grid
    try:
        logger.info(
            'try to determine grid coordinates from microscope '
            'stage positions'
        )
        mdhandler.determine_grid_coordinates_from_stage_positions()
    except MetadataError as error:
        logger.warning(
            'microscope stage positions are not available: "%s"'
            % str(error)
        )
        logger.info(
            'try to determine grid coordinates from provided stitch layout'
        )
        # In general, the values of these arguments can be ``None``, because
        # they are not required and may not be used.
        # However, in case the grid coordinates should be determined based
        # on user interput, these arguments are required.
        if not isinstance(batch['n_vertical'], int):
            raise TypeError(
                'Value of argument "n_vertical" must be an integer.'
            )
        if not isinstance(batch['n_horizontal'], int):
            raise TypeError(
                'Value of argument "n_horizontal" must be an integer.'
            )
        mdhandler.determine_grid_coordinates_from_layout(
            stitch_layout=batch['stitch_layout'],
            stitch_dimensions=(batch['n_vertical'], batch['n_horizontal'])
        )
    if batch['perform_mip']:
        mdhandler.group_metadata_per_zstack()
    # Create consistent zero-based ids
    mdhandler.update_indices()
    mdhandler.assign_acquisition_site_indices()
    md = mdhandler.remove_redundant_columns()
    fmaps = mdhandler.create_image_file_mappings()
    logger.info('create database entries')
    with tm.utils.ExperimentSession(self.experiment_id) as session:
        channels = dict()
        bit_depth = md['bit_depth'][0]
        for ch_name in np.unique(md['channel_name']):
            logger.info('create channel "%s"', ch_name)
            ch = session.get_or_create(
                tm.Channel, experiment_id=self.experiment_id,
                name=ch_name, wavelength=ch_name, bit_depth=bit_depth,
            )
            channels[ch_name] = ch.id
    # One session per well keeps transactions small while bulk-creating
    # sites and channel image files.
    for w in np.unique(md.well_name):
        with tm.utils.ExperimentSession(self.experiment_id) as session:
            acquisition = session.query(tm.Acquisition).\
                get(batch['acquisition_id'])
            logger.info('create well "%s"', w)
            w_index = (md.well_name == w)
            well = session.get_or_create(
                tm.Well, plate_id=acquisition.plate.id, name=w
            )
            channel_image_files = []
            for s in np.unique(md.loc[w_index, 'site']):
                logger.debug('create site #%d', s)
                s_index = (md.site == s)
                # All planes of a site share position and dimensions;
                # take them from the first matching row.
                y = md.loc[s_index, 'well_position_y'].values[0]
                x = md.loc[s_index, 'well_position_x'].values[0]
                height = md.loc[s_index, 'height'].values[0]
                width = md.loc[s_index, 'width'].values[0]
                site = session.get_or_create(
                    tm.Site,
                    y=y, x=x, height=height, width=width, well_id=well.id
                )
                for index, i in md.ix[s_index].iterrows():
                    channel_image_files.append(
                        tm.ChannelImageFile(
                            tpoint=i.tpoint, zplane=i.zplane,
                            channel_id=channels[i.channel_name],
                            site_id=site.id, acquisition_id=acquisition.id,
                            file_map=fmaps[index],
                        )
                    )
            session.bulk_save_objects(channel_image_files)