def metadata_index(paths, processes=4):
    '''Build an external scan metadata index for a mass spectrometry data file

    This extended index is saved in a separate JSON file that can be loaded with
    :class:`~.ExtendedScanIndex`. It includes the scan time of all scans, the precursor
    mass of MSn scans, as well as the relationships between precursor and product ion
    scans, as well as other details. See :class:`~.ExtendedScanIndex` for more information
    '''
    for data_path in paths:
        reader = MSFileLoader(data_path)
        # Best-effort: pre-build the byte offset file when the reader type
        # supports it and one does not already exist on disk.
        try:
            build_offsets = reader.prebuild_byte_offset_file
            if not reader.source._check_has_byte_offset_file():
                build_offsets(data_path)
        except AttributeError:
            pass
        if processes > 1:
            # Parallel indexing path; the second return value is discarded.
            index, _ = quick_index.index(reader, processes)
        else:
            # Serial fallback: walk every scan bunch and accumulate it.
            index = quick_index.ExtendedScanIndex()
            reader.reset()
            for bunch in reader:
                index.add_scan_bunch(bunch)
        # Persist the index next to the source file as JSON.
        index_file_name = index.index_file_name(data_path)
        with open(index_file_name, 'w') as fh:
            index.serialize(fh)
def metadata_index(paths, processes=4):
    '''Build an external scan metadata index for a mass spectrometry data file

    This extended index is saved in a separate JSON file that can be loaded with
    :class:`~.ExtendedScanIndex`. It includes the scan time of all scans, the precursor
    mass of MSn scans, as well as the relationships between precursor and product ion
    scans, as well as other details. See :class:`~.ExtendedScanIndex` for more information
    '''
    for path in paths:
        click.echo("Indexing %s" % (path, ))
        reader = MSFileLoader(path)
        # Best-effort: pre-build the byte offset file if the reader supports
        # it and one is not already present on disk.
        try:
            fn = reader.prebuild_byte_offset_file
            if not reader.source._check_has_byte_offset_file():
                fn(path)
        except AttributeError:
            pass
        if processes > 1:
            # Parallel path: drive a percentage-based progress bar from the
            # fractional progress reported by quick_index.index.
            progbar = click.progressbar(label='Building Index', length=100)
            # Mutable cell holding the percentage already reported, so the
            # closure can compute deltas (py2-compatible alternative to
            # `nonlocal`).
            acc = [0]

            def update_bar(x):
                '''Progress Bar update callback for :func:`~.quick_index.index`
                '''
                # Convert the 0.0-1.0 fraction into whole percent, then feed
                # only the *increment* since the last call to the bar.
                x = int(x * 100)
                x -= acc[0]  # pylint: disable=cell-var-from-loop
                progbar.update(x)  # pylint: disable=cell-var-from-loop
                acc[0] += x  # pylint: disable=cell-var-from-loop

            with progbar:
                update_bar(0.0)
                index, _ = quick_index.index(
                    reader, processes, progress_indicator=update_bar)
        else:
            # Serial path: iterate the file directly, counting scans so the
            # progress bar advances once per scan rather than per bunch.
            index = quick_index.ExtendedScanIndex()
            reader.reset()
            try:
                # Readers without a length raise TypeError; fall back to an
                # indeterminate spinner in that case.
                n = len(reader)
                progbar = click.progressbar(
                    label='Building Index', length=n)
            except TypeError:
                progbar = spinner(title="Building Index")
            with progbar:
                for bunch in reader.make_iterator(grouped=True):
                    # Number of scans in this bunch: the precursor (if any)
                    # plus all product scans.
                    i = 0
                    i += bunch.precursor is not None
                    i += len(bunch.products)
                    index.add_scan_bunch(bunch)
                    progbar.update(i)
        # Persist the index next to the source file as JSON.
        name = path
        index_file_name = index.index_file_name(name)
        with open(index_file_name, 'w') as fh:
            index.serialize(fh)
def _run():
    # Generator driving interval extraction over every input file.
    # NOTE(review): this is a closure — `paths`, `processes`,
    # `interval_extraction`, and `interval_set` come from the enclosing
    # scope, which is not visible in this chunk; confirm against the
    # surrounding function.
    for path in paths:
        reader = MSFileLoader(path)
        # Results arrive out of order; ordering is presumably not required
        # for interval accumulation — TODO confirm.
        chunk_out_of_order = quick_index.run_task_in_chunks(
            reader, processes, processes * 4, task=interval_extraction)
        for chunk in chunk_out_of_order:
            interval_set.extend(chunk)
        # Yield a progress/status tick after each file is processed.
        yield 0
def make_scan(self):
    """Open the compressed test mzML file and return its first scan bunch."""
    path = datafile("20150710_3um_AGP_001_29_30.mzML.gz")
    return next(MSFileLoader(path))
def describe(path):
    '''Produces a minimal textual description of a mass spectrometry data file.

    Prints the file format, ID format, random-access support, file contents,
    and — when an extended index file exists alongside the data file —
    MS1/MSn scan counts, charge-state distribution, and scan-time extrema.

    Parameters
    ----------
    path : str
        Path to the mass spectrometry data file to describe.

    Returns
    -------
    int or None
        ``-1`` when the file cannot be opened or recognized, otherwise ``None``.
    '''
    click.echo("Describing \"%s\"" % (path,))
    try:
        sf = SourceFile.from_path(path)
    except IOError:
        click.echo("Could not open", err=True)
        # Fix: previously fell through and dereferenced the unbound `sf`,
        # raising NameError instead of exiting cleanly.
        return -1
    if sf.file_format is None:
        click.echo("It doesn't appear to be a mass spectrometry data file")
        return -1
    click.echo("File Format: %s" % (sf.file_format, ))
    click.echo("ID Format: %s" % (sf.id_format, ))
    reader = MSFileLoader(path)
    if isinstance(reader, RandomAccessScanSource):
        click.echo("Format Supports Random Access: True")
        first_scan = reader[0]
        last_scan = reader[-1]
        click.echo("First Scan: %s at %0.3f minutes" % (
            first_scan.id, first_scan.scan_time))
        click.echo("Last Scan: %s at %0.3f minutes" % (
            last_scan.id, last_scan.scan_time))
    else:
        click.echo("Format Supports Random Access: False")
    try:
        finfo = reader.file_description()
        click.echo("Contents:")
        for key in finfo.contents:
            click.echo("    %s" % (key, ))
    except AttributeError:
        pass
    index_file_name = quick_index.ExtendedScanIndex.index_file_name(path)
    # Extra introspection if the extended index is available
    if os.path.exists(index_file_name):
        with open(index_file_name, 'rt') as fh:
            index = quick_index.ExtendedScanIndex.deserialize(fh)
        ms1_scans = len(index.ms1_ids)
        msn_scans = len(index.msn_ids)
        click.echo("MS1 Scans: %d" % (ms1_scans, ))
        click.echo("MSn Scans: %d" % (msn_scans, ))
        n_defaulted = 0
        n_orphan = 0
        charges = Counter()
        first_msn = float('inf')
        last_msn = 0
        for scan_info in index.msn_ids.values():
            # Booleans sum as 0/1, counting flagged scans.
            n_defaulted += scan_info.get('defaulted', False)
            n_orphan += scan_info.get('orphan', False)
            charges[scan_info['charge']] += 1
            rt = scan_info['scan_time']
            if rt < first_msn:
                first_msn = rt
            if rt > last_msn:
                last_msn = rt
        click.echo("First MSn Scan: %0.3f minutes" % (first_msn,))
        click.echo("Last MSn Scan: %0.3f minutes" % (last_msn,))
        for charge, count in sorted(charges.items()):
            # Non-integer charge keys (e.g. None) are skipped.
            if not isinstance(charge, int):
                continue
            click.echo("Precursors with Charge State %d: %d" % (charge, count))
        if n_defaulted > 0:
            click.echo("Defaulted MSn Scans: %d" % (n_defaulted,))
        if n_orphan > 0:
            click.echo("Orphan MSn Scans: %d" % (n_orphan,))
def configure_storage(cls, path=None, name=None, source=None):
    """Create and configure a storage instance for writing processed spectra.

    Parameters
    ----------
    path : str, optional
        Output file path; defaults to ``"processed.mzML"``.
    name : str, optional
        Sample name; derived from ``path`` when omitted.
    source : object, optional
        The scan-processing source whose metadata (instrument configuration,
        deconvolution parameters, etc.) is copied into the output file.

    Returns
    -------
    cls
        The configured storage instance.
    """
    if path is not None:
        if name is None:
            sample_name = os.path.basename(path)
        else:
            sample_name = name
    else:
        path = "processed.mzML"
        # Fix: sample_name was never assigned on this branch, causing a
        # NameError at the cls(...) calls below.
        sample_name = name if name is not None else os.path.basename(path)
    if source is not None:
        reader = MSFileLoader(source.scan_source)
        n_spectra = len(reader.index)
        deconvoluting = source.deconvoluting
        inst = cls(path, sample_name, n_spectra=n_spectra,
                   deconvoluted=deconvoluting)
        try:
            description = reader.file_description()
        except AttributeError:
            description = FileInformation()
        source_file_metadata = MetadataSourceFile.from_path(source.scan_source)
        inst.serializer.add_file_information(description)
        # The output holds centroided data, so swap the content terms.
        try:
            inst.serializer.remove_file_contents("profile spectrum")
        except KeyError:
            pass
        inst.serializer.add_file_contents("centroid spectrum")
        if source_file_metadata not in description.source_files:
            inst.serializer.add_source_file(source_file_metadata)
        try:
            instrument_configs = reader.instrument_configuration()
            for config in instrument_configs:
                inst.serializer.add_instrument_configuration(config)
        except Exception as e:
            log_handle.error(
                "An error occurred while writing instrument configuration",
                e)
        # Fix: default to [] so a missing "transforms" key does not raise
        # TypeError when iterated (consistent with the sibling variant).
        for trans in source.ms1_peak_picking_args.get("transforms", []):
            inst.register_parameter(
                "parameter: ms1-%s" % trans.__class__.__name__, repr(trans))
        if deconvoluting:
            if source.ms1_deconvolution_args.get("averagine"):
                inst.register_parameter(
                    "parameter: ms1-averagine",
                    repr(source.ms1_deconvolution_args.get("averagine")))
            if source.ms1_deconvolution_args.get("scorer"):
                inst.register_parameter(
                    "parameter: ms1-scorer",
                    repr(source.ms1_deconvolution_args.get("scorer")))
        if source.ms1_averaging > 0:
            inst.register_parameter("parameter: ms1-averaging",
                                    repr(source.ms1_averaging))
        if source.ignore_tandem_scans:
            inst.register_parameter("parameter: ignore-tandem-scans", "")
        if source.extract_only_tandem_envelopes:
            inst.register_parameter(
                "parameter: extract-only-tandem-envelopes", "")
        if source.msn_peak_picking_args is not None:
            # Fix: default to [] here as well for the same reason.
            for trans in source.msn_peak_picking_args.get("transforms", []):
                inst.register_parameter(
                    "parameter: msn-%s" % trans.__class__.__name__,
                    repr(trans))
        if deconvoluting:
            if source.msn_deconvolution_args.get("averagine"):
                inst.register_parameter(
                    "parameter: msn-averagine",
                    repr(source.msn_deconvolution_args.get("averagine")))
            if source.msn_deconvolution_args.get("scorer"):
                inst.register_parameter(
                    "parameter: msn-scorer",
                    repr(source.msn_deconvolution_args.get("scorer")))
        data_processing = inst.serializer.build_processing_method()
        inst.serializer.add_data_processing(data_processing)
    else:
        # No source metadata available: reserve a generous spectrum count.
        n_spectra = 2e5
        inst = cls(path, sample_name, n_spectra=n_spectra)
    # Force marshalling of controlled vocabularies early.
    inst.serializer.writer.param("32-bit float")
    return inst
def configure_storage(cls, path=None, name=None, source=None):
    """Create and configure a storage instance for writing processed spectra.

    Parameters
    ----------
    path : str, optional
        Output file path; defaults to ``"processed.mzML"``.
    name : str, optional
        Sample name; derived from ``path`` when omitted.
    source : object, optional
        The scan-processing source whose metadata (instrument configuration,
        deconvolution parameters, etc.) is copied into the output file.

    Returns
    -------
    cls
        The configured storage instance.
    """
    if path is not None:
        if name is None:
            sample_name = os.path.basename(path)
        else:
            sample_name = name
    else:
        path = "processed.mzML"
        # Fix: sample_name was never assigned on this branch, causing a
        # NameError at the cls(...) calls below.
        sample_name = name if name is not None else os.path.basename(path)
    if source is not None:
        reader = MSFileLoader(source.scan_source)
        n_spectra = len(reader.index)
        deconvoluting = source.deconvoluting
        inst = cls(path, sample_name, n_spectra=n_spectra,
                   deconvoluted=deconvoluting)
        try:
            description = reader.file_description()
        except AttributeError:
            description = FileInformation()
        source_file_metadata = MetadataSourceFile.from_path(
            source.scan_source)
        inst.serializer.add_file_information(description)
        # The output holds centroided data, so swap the content terms.
        try:
            inst.serializer.remove_file_contents("profile spectrum")
        except KeyError:
            pass
        inst.serializer.add_file_contents("centroid spectrum")
        if source_file_metadata not in description.source_files:
            inst.serializer.add_source_file(source_file_metadata)
        try:
            instrument_configs = reader.instrument_configuration()
            for config in instrument_configs:
                inst.serializer.add_instrument_configuration(config)
        except Exception as e:
            inst.error(
                "An error occurred while writing instrument configuration",
                e)
        for trans in source.ms1_peak_picking_args.get("transforms", []):
            inst.register_parameter(
                "parameter: ms1-%s" % trans.__class__.__name__, repr(trans))
        if deconvoluting:
            if source.ms1_deconvolution_args.get("averagine"):
                inst.register_parameter(
                    "parameter: ms1-averagine",
                    repr(source.ms1_deconvolution_args.get("averagine")))
            if source.ms1_deconvolution_args.get("scorer"):
                inst.register_parameter(
                    "parameter: ms1-scorer",
                    repr(source.ms1_deconvolution_args.get("scorer")))
        if source.ms1_averaging > 0:
            inst.register_parameter("parameter: ms1-averaging",
                                    repr(source.ms1_averaging))
        if source.ignore_tandem_scans:
            inst.register_parameter("parameter: ignore-tandem-scans", "")
        if source.extract_only_tandem_envelopes:
            inst.register_parameter(
                "parameter: extract-only-tandem-envelopes", "")
        if source.msn_peak_picking_args is not None:
            for trans in source.msn_peak_picking_args.get(
                    "transforms", []):
                inst.register_parameter(
                    "parameter: msn-%s" % trans.__class__.__name__,
                    repr(trans))
        if deconvoluting:
            if source.msn_deconvolution_args.get("averagine"):
                inst.register_parameter(
                    "parameter: msn-averagine",
                    repr(source.msn_deconvolution_args.get("averagine")))
            if source.msn_deconvolution_args.get("scorer"):
                inst.register_parameter(
                    "parameter: msn-scorer",
                    repr(source.msn_deconvolution_args.get("scorer")))
        data_processing = inst.serializer.build_processing_method()
        inst.serializer.add_data_processing(data_processing)
    else:
        # No source metadata available: reserve a generous spectrum count.
        n_spectra = 2e5
        inst = cls(path, sample_name, n_spectra=n_spectra)
    # Force marshalling of controlled vocabularies early.
    inst.serializer.writer.param("32-bit float")
    return inst
def describe(path, diagnostics=False):
    '''Produces a minimal textual description of a mass spectrometry data file.

    Prints the file format, ID format, random-access support, file contents,
    and — when an extended index file exists alongside the data file —
    MS1/MSn scan counts, charge-state distribution, and scan-time extrema.
    With ``diagnostics=True`` the whole file is re-read to validate isolation
    windows and tally activation methods.

    Parameters
    ----------
    path : str
        Path to the mass spectrometry data file to describe.
    diagnostics : bool
        Whether to run the (slow) full-file diagnostic pass.

    Raises
    ------
    click.Abort
        If the file cannot be opened or is not recognized.
    '''
    click.echo("Describing \"%s\"" % (path,))
    try:
        sf = SourceFile.from_path(path)
    except IOError:
        # Fix: click.Abort is a RuntimeError subclass and takes no `err`
        # keyword; passing one raised TypeError instead of aborting. Echo
        # the message to stderr, then abort.
        click.echo("Could not open file \"%s\"" % (path, ), err=True)
        raise click.Abort()
    if sf.file_format is None:
        click.echo(
            "\"%s\" doesn't appear to be a mass spectrometry data file" % (
                path, ), err=True)
        raise click.Abort()
    click.echo("File Format: %s" % (sf.file_format, ))
    click.echo("ID Format: %s" % (sf.id_format, ))
    reader = MSFileLoader(path)
    if isinstance(reader, RandomAccessScanSource):
        click.echo("Format Supports Random Access: True")
        first_scan = reader[0]
        last_scan = reader[-1]
        click.echo("First Scan: %s at %0.3f minutes" % (
            first_scan.id, first_scan.scan_time))
        click.echo("Last Scan: %s at %0.3f minutes" % (
            last_scan.id, last_scan.scan_time))
    else:
        click.echo("Format Supports Random Access: False")
    try:
        finfo = reader.file_description()
        click.echo("Contents:")
        for key in finfo.contents:
            click.echo("    %s" % (key, ))
    except AttributeError:
        pass
    index_file_name = quick_index.ExtendedScanIndex.index_file_name(path)
    # Extra introspection if the extended index is available
    if os.path.exists(index_file_name):
        with open(index_file_name, 'rt') as fh:
            index = quick_index.ExtendedScanIndex.deserialize(fh)
        ms1_scans = len(index.ms1_ids)
        msn_scans = len(index.msn_ids)
        click.echo("MS1 Scans: %d" % (ms1_scans, ))
        click.echo("MSn Scans: %d" % (msn_scans, ))
        n_defaulted = 0
        n_orphan = 0
        charges = Counter()
        first_msn = float('inf')
        last_msn = 0
        for scan_info in index.msn_ids.values():
            # Booleans sum as 0/1, counting flagged scans.
            n_defaulted += scan_info.get('defaulted', False)
            n_orphan += scan_info.get('orphan', False)
            charges[scan_info['charge']] += 1
            rt = scan_info['scan_time']
            if rt < first_msn:
                first_msn = rt
            if rt > last_msn:
                last_msn = rt
        click.echo("First MSn Scan: %0.3f minutes" % (first_msn,))
        click.echo("Last MSn Scan: %0.3f minutes" % (last_msn,))
        for charge, count in sorted(charges.items()):
            # Non-integer charge keys (e.g. None) are skipped.
            if not isinstance(charge, int):
                continue
            click.echo("Precursors with Charge State %d: %d" % (
                charge, count))
        if n_defaulted > 0:
            click.echo("Defaulted MSn Scans: %d" % (n_defaulted,))
        if n_orphan > 0:
            click.echo("Orphan MSn Scans: %d" % (n_orphan,))
    if diagnostics:
        # Full re-read of the file: validate every product scan's isolation
        # window and tally activation methods.
        reader.reset()
        scan_ids_with_invalid_isolation_window = []
        scan_ids_with_empty_isolation_window = []
        activation_counter = Counter()
        n_ms1 = 0
        n_msn = 0
        for precursor, products in reader:
            if precursor is not None:
                n_ms1 += 1
            for product in products:
                n_msn += 1
                if not isolation_window_valid(product):
                    scan_ids_with_invalid_isolation_window.append(
                        (precursor.id, product.id))
                if is_isolation_window_empty(product):
                    scan_ids_with_empty_isolation_window.append(
                        (precursor.id, product.id))
                activation_counter[product.activation] += 1
        click.echo("MS1 Spectra: %d" % (n_ms1, ))
        click.echo("MSn Spectra: %d" % (n_msn, ))
        click.echo("Invalid Isolation Windows: %d" % (
            len(scan_ids_with_invalid_isolation_window), ))
        click.echo("Empty Isolation Windows: %d" % (
            len(scan_ids_with_empty_isolation_window), ))
        click.echo("Activation Methods")
        for activation, count in activation_counter.items():
            if not activation.is_multiple_dissociation():
                click.echo("\t{method}:{energy} = {count}".format(
                    method=activation.method, energy=activation.energy,
                    count=count))
            else:
                click.echo("\t{method}:{energy} = {count}".format(
                    method=','.join(map(str, activation.method)),
                    energy=','.join(map(str, activation.energies)),
                    count=count))
def make_scan():
    """Open the compressed test mzML file and fetch one scan by native ID."""
    source = MSFileLoader(datafile("20150710_3um_AGP_001_29_30.mzML.gz"))
    return source.get_scan_by_id("scanId=1740086")