Пример #1
0
def metadata_index(paths, processes=4):
    '''Build an external scan metadata index for each mass spectrometry data file

    The extended index is saved in a separate JSON file that can be loaded with
    :class:`~.ExtendedScanIndex`. It includes the scan time of all scans, the precursor
    mass of MSn scans, the relationships between precursor and product ion scans, and
    other details. See :class:`~.ExtendedScanIndex` for more information.

    Parameters
    ----------
    paths : Iterable
        The data file paths to index, each one opened with ``MSFileLoader``.
    processes : int
        When greater than 1, indexing is delegated to ``quick_index.index`` with
        this value; otherwise the reader is iterated directly in this process.
    '''
    for data_path in paths:
        loader = MSFileLoader(data_path)
        # Best effort: a loader without the byte offset attributes raises
        # AttributeError, in which case this preparation step is skipped.
        try:
            build_offsets = loader.prebuild_byte_offset_file
            if not loader.source._check_has_byte_offset_file():
                build_offsets(data_path)
        except AttributeError:
            pass

        if processes > 1:
            scan_index, _ = quick_index.index(loader, processes)
        else:
            scan_index = quick_index.ExtendedScanIndex()
            loader.reset()
            for scan_bunch in loader:
                scan_index.add_scan_bunch(scan_bunch)

        # The output file name is derived from the data file path by the index itself.
        with open(scan_index.index_file_name(data_path), 'w') as stream:
            scan_index.serialize(stream)
Пример #2
0
def metadata_index(paths, processes=4):
    '''Build an external scan metadata index for each mass spectrometry data file

    The extended index is saved in a separate JSON file that can be loaded with
    :class:`~.ExtendedScanIndex`. It includes the scan time of all scans, the precursor
    mass of MSn scans, the relationships between precursor and product ion scans, and
    other details. See :class:`~.ExtendedScanIndex` for more information.

    Progress is reported on the console through :mod:`click` while each file is indexed.

    Parameters
    ----------
    paths : Iterable
        The data file paths to index, each one opened with ``MSFileLoader``.
    processes : int
        When greater than 1, indexing is delegated to ``quick_index.index`` with
        this value; otherwise the reader is iterated directly in this process.
    '''
    for data_path in paths:
        click.echo("Indexing %s" % (data_path, ))
        loader = MSFileLoader(data_path)
        # Best effort: a loader without the byte offset attributes raises
        # AttributeError, in which case this preparation step is skipped.
        try:
            build_offsets = loader.prebuild_byte_offset_file
            if not loader.source._check_has_byte_offset_file():
                build_offsets(data_path)
        except AttributeError:
            pass

        if processes > 1:
            bar = click.progressbar(label='Building Index', length=100)
            # One-element list so the callback can mutate the running total.
            reported = [0]

            def report_progress(fraction):
                '''Progress Bar update callback for :func:`~.quick_index.index`

                Converts the value received to a whole percentage and advances
                the bar by the difference from what was already reported.
                '''
                step = int(fraction * 100) - reported[0]  # pylint: disable=cell-var-from-loop
                bar.update(step)  # pylint: disable=cell-var-from-loop
                reported[0] += step  # pylint: disable=cell-var-from-loop

            with bar:
                report_progress(0.0)
                scan_index, _ = quick_index.index(
                    loader, processes, progress_indicator=report_progress)
        else:
            scan_index = quick_index.ExtendedScanIndex()
            loader.reset()
            # Fall back to a spinner when the total cannot be determined.
            try:
                bar = click.progressbar(label='Building Index', length=len(loader))
            except TypeError:
                bar = spinner(title="Building Index")
            with bar:
                for scan_bunch in loader.make_iterator(grouped=True):
                    # Scans in this bunch: the precursor (if any) plus its products.
                    n_scans = int(scan_bunch.precursor is not None) + len(scan_bunch.products)
                    scan_index.add_scan_bunch(scan_bunch)
                    bar.update(n_scans)

        # The output file name is derived from the data file path by the index itself.
        with open(scan_index.index_file_name(data_path), 'w') as stream:
            scan_index.serialize(stream)
Пример #3
0
 def _run():
     '''Generator which extracts intervals from every file in ``paths``.

     Each file is opened with ``MSFileLoader`` and processed by
     ``quick_index.run_task_in_chunks`` using ``interval_extraction`` as the task.
     Every chunk produced is appended to ``interval_set`` from the enclosing
     scope, and ``0`` is yielded once per chunk.
     '''
     for source_path in paths:
         loader = MSFileLoader(source_path)
         # NOTE(review): the original local was named "chunk_out_of_order", so
         # chunks presumably arrive in no particular order - confirm.
         unordered_chunks = quick_index.run_task_in_chunks(
             loader, processes, processes * 4, task=interval_extraction)
         for intervals in unordered_chunks:
             interval_set.extend(intervals)
             yield 0
 def make_scan(self):
     '''Return the first item produced by a reader over the compressed mzML test file.'''
     source_path = datafile("20150710_3um_AGP_001_29_30.mzML.gz")
     return next(MSFileLoader(source_path))
Пример #5
0
def describe(path):
    '''Produces a minimal textual description of a mass spectrometry data file.

    The description is written to the console with :func:`click.echo`. If an
    extended scan index file exists next to the data file, summary statistics
    from it are reported as well.

    Parameters
    ----------
    path : str
        The path to the data file to describe.

    Returns
    -------
    int or None
        ``-1`` when the file could not be opened or its format could not be
        recognized, otherwise ``None``.
    '''
    click.echo("Describing \"%s\"" % (path,))
    try:
        sf = SourceFile.from_path(path)
    except IOError:
        click.echo("Could not open", err=True)
        # Nothing further can be reported without a SourceFile. Falling through
        # here used to fail on the unbound `sf` below.
        return -1
    if sf.file_format is None:
        click.echo("It doesn't appear to be a mass spectrometry data file")
        return -1
    click.echo("File Format: %s" % (sf.file_format, ))
    click.echo("ID Format: %s" % (sf.id_format, ))
    reader = MSFileLoader(path)
    if isinstance(reader, RandomAccessScanSource):
        click.echo("Format Supports Random Access: True")
        first_scan = reader[0]
        last_scan = reader[-1]
        click.echo("First Scan: %s at %0.3f minutes" % (first_scan.id, first_scan.scan_time))
        click.echo("Last Scan: %s at %0.3f minutes" % (last_scan.id, last_scan.scan_time))
    else:
        click.echo("Format Supports Random Access: False")
    # Readers which do not expose file_description() are skipped silently.
    try:
        finfo = reader.file_description()
        click.echo("Contents:")
        for key in finfo.contents:
            click.echo("    %s" % (key, ))
    except AttributeError:
        pass
    index_file_name = quick_index.ExtendedScanIndex.index_file_name(path)
    # Extra introspection if the extended index is available
    if os.path.exists(index_file_name):
        with open(index_file_name, 'rt') as fh:
            index = quick_index.ExtendedScanIndex.deserialize(fh)
        ms1_scans = len(index.ms1_ids)
        msn_scans = len(index.msn_ids)
        click.echo("MS1 Scans: %d" % (ms1_scans, ))
        click.echo("MSn Scans: %d" % (msn_scans, ))
        n_defaulted = 0
        n_orphan = 0

        charges = Counter()
        first_msn = float('inf')
        last_msn = 0
        for scan_info in index.msn_ids.values():
            n_defaulted += scan_info.get('defaulted', False)
            n_orphan += scan_info.get('orphan', False)
            charges[scan_info['charge']] += 1
            rt = scan_info['scan_time']
            if rt < first_msn:
                first_msn = rt
            if rt > last_msn:
                last_msn = rt
        # With no MSn scans the sentinels above were never replaced, so there
        # is no meaningful time range to report.
        if msn_scans > 0:
            click.echo("First MSn Scan: %0.3f minutes" % (first_msn,))
            click.echo("Last MSn Scan: %0.3f minutes" % (last_msn,))
        # Drop non-integer charge keys *before* sorting: on Python 3 ordering
        # integers against other types raises TypeError.
        integer_charges = [
            (charge, count) for charge, count in charges.items()
            if isinstance(charge, int)
        ]
        for charge, count in sorted(integer_charges):
            click.echo("Precursors with Charge State %d: %d" % (charge, count))
        if n_defaulted > 0:
            click.echo("Defaulted MSn Scans: %d" % (n_defaulted,))
        if n_orphan > 0:
            click.echo("Orphan MSn Scans: %d" % (n_orphan,))
Пример #6
0
    def configure_storage(cls, path=None, name=None, source=None):
        '''Create and configure an instance of `cls` for writing processed output.

        When `source` is given, its scan source is opened with ``MSFileLoader``
        and the file description, source file, instrument configurations and
        processing parameters are registered with the new instance's serializer.

        Parameters
        ----------
        path : str, optional
            Where the output is written. Defaults to ``"processed.mzML"``.
        name : str, optional
            The sample name. Defaults to the base name of `path`.
        source : object, optional
            The object describing how the data was processed. Its
            ``scan_source``, ``deconvoluting``, peak picking and deconvolution
            argument attributes are read here.

        Returns
        -------
        object
            The configured instance of `cls`.
        '''
        if path is None:
            path = "processed.mzML"
        # Resolve the sample name after defaulting `path`, so it is always bound.
        # It used to be left undefined whenever `path` was omitted.
        sample_name = os.path.basename(path) if name is None else name
        if source is not None:
            reader = MSFileLoader(source.scan_source)
            n_spectra = len(reader.index)
            deconvoluting = source.deconvoluting
            inst = cls(path, sample_name, n_spectra=n_spectra, deconvoluted=deconvoluting)
            # Readers without file_description() get an empty description.
            try:
                description = reader.file_description()
            except AttributeError:
                description = FileInformation()
            source_file_metadata = MetadataSourceFile.from_path(source.scan_source)
            inst.serializer.add_file_information(description)
            # Replace the "profile spectrum" content entry, if there is one,
            # with "centroid spectrum".
            try:
                inst.serializer.remove_file_contents("profile spectrum")
            except KeyError:
                pass
            inst.serializer.add_file_contents("centroid spectrum")
            if source_file_metadata not in description.source_files:
                inst.serializer.add_source_file(source_file_metadata)
            # Instrument configuration is best effort: failures are reported
            # but do not stop the storage from being configured.
            try:
                instrument_configs = reader.instrument_configuration()
                for config in instrument_configs:
                    inst.serializer.add_instrument_configuration(config)
            except Exception as e:
                log_handle.error(
                    "An error occurred while writing instrument configuration", e)
            # A missing or empty "transforms" entry means nothing to register.
            for trans in source.ms1_peak_picking_args.get("transforms") or []:
                inst.register_parameter("parameter: ms1-%s" % trans.__class__.__name__, repr(trans))
            if deconvoluting:
                if source.ms1_deconvolution_args.get("averagine"):
                    inst.register_parameter(
                        "parameter: ms1-averagine", repr(source.ms1_deconvolution_args.get("averagine")))
                if source.ms1_deconvolution_args.get("scorer"):
                    inst.register_parameter(
                        "parameter: ms1-scorer", repr(source.ms1_deconvolution_args.get("scorer")))
                if source.ms1_averaging > 0:
                    inst.register_parameter("parameter: ms1-averaging", repr(source.ms1_averaging))
                if source.ignore_tandem_scans:
                    inst.register_parameter("parameter: ignore-tandem-scans", "")
                if source.extract_only_tandem_envelopes:
                    inst.register_parameter("parameter: extract-only-tandem-envelopes", "")

            if source.msn_peak_picking_args is not None:
                for trans in source.msn_peak_picking_args.get("transforms") or []:
                    inst.register_parameter("parameter: msn-%s" % trans.__class__.__name__, repr(trans))
            if deconvoluting:
                if source.msn_deconvolution_args.get("averagine"):
                    inst.register_parameter(
                        "parameter: msn-averagine", repr(source.msn_deconvolution_args.get("averagine")))
                if source.msn_deconvolution_args.get("scorer"):
                    inst.register_parameter(
                        "parameter: msn-scorer", repr(source.msn_deconvolution_args.get("scorer")))
            data_processing = inst.serializer.build_processing_method()
            inst.serializer.add_data_processing(data_processing)
        else:
            # No source to count spectra from, so a fixed placeholder is used.
            n_spectra = 2e5
            inst = cls(path, sample_name, n_spectra=n_spectra)
        # Force marshalling of controlled vocabularies early.
        inst.serializer.writer.param("32-bit float")
        return inst
Пример #7
0
    def configure_storage(cls, path=None, name=None, source=None):
        '''Create and configure an instance of `cls` for writing processed output.

        When `source` is given, its scan source is opened with ``MSFileLoader``
        and the file description, source file, instrument configurations and
        processing parameters are registered with the new instance's serializer.

        Parameters
        ----------
        path : str, optional
            Where the output is written. Defaults to ``"processed.mzML"``.
        name : str, optional
            The sample name. Defaults to the base name of `path`.
        source : object, optional
            The object describing how the data was processed. Its
            ``scan_source``, ``deconvoluting``, peak picking and deconvolution
            argument attributes are read here.

        Returns
        -------
        object
            The configured instance of `cls`.
        '''
        if path is None:
            path = "processed.mzML"
        # Resolve the sample name after defaulting `path`, so it is always bound.
        # It used to be left undefined whenever `path` was omitted.
        sample_name = os.path.basename(path) if name is None else name
        if source is not None:
            reader = MSFileLoader(source.scan_source)
            n_spectra = len(reader.index)
            deconvoluting = source.deconvoluting
            inst = cls(path,
                       sample_name,
                       n_spectra=n_spectra,
                       deconvoluted=deconvoluting)
            # Readers without file_description() get an empty description.
            try:
                description = reader.file_description()
            except AttributeError:
                description = FileInformation()
            source_file_metadata = MetadataSourceFile.from_path(
                source.scan_source)
            inst.serializer.add_file_information(description)
            # Replace the "profile spectrum" content entry, if there is one,
            # with "centroid spectrum".
            try:
                inst.serializer.remove_file_contents("profile spectrum")
            except KeyError:
                pass
            inst.serializer.add_file_contents("centroid spectrum")
            if source_file_metadata not in description.source_files:
                inst.serializer.add_source_file(source_file_metadata)
            # Instrument configuration is best effort: failures are reported
            # but do not stop the storage from being configured.
            try:
                instrument_configs = reader.instrument_configuration()
                for config in instrument_configs:
                    inst.serializer.add_instrument_configuration(config)
            except Exception as e:
                inst.error(
                    "An error occurred while writing instrument configuration",
                    e)
            for trans in source.ms1_peak_picking_args.get("transforms", []):
                inst.register_parameter(
                    "parameter: ms1-%s" % trans.__class__.__name__,
                    repr(trans))
            if deconvoluting:
                if source.ms1_deconvolution_args.get("averagine"):
                    inst.register_parameter(
                        "parameter: ms1-averagine",
                        repr(source.ms1_deconvolution_args.get("averagine")))
                if source.ms1_deconvolution_args.get("scorer"):
                    inst.register_parameter(
                        "parameter: ms1-scorer",
                        repr(source.ms1_deconvolution_args.get("scorer")))
                if source.ms1_averaging > 0:
                    inst.register_parameter("parameter: ms1-averaging",
                                            repr(source.ms1_averaging))
                if source.ignore_tandem_scans:
                    inst.register_parameter("parameter: ignore-tandem-scans",
                                            "")
                if source.extract_only_tandem_envelopes:
                    inst.register_parameter(
                        "parameter: extract-only-tandem-envelopes", "")

            if source.msn_peak_picking_args is not None:
                for trans in source.msn_peak_picking_args.get(
                        "transforms", []):
                    inst.register_parameter(
                        "parameter: msn-%s" % trans.__class__.__name__,
                        repr(trans))
            if deconvoluting:
                if source.msn_deconvolution_args.get("averagine"):
                    inst.register_parameter(
                        "parameter: msn-averagine",
                        repr(source.msn_deconvolution_args.get("averagine")))
                if source.msn_deconvolution_args.get("scorer"):
                    inst.register_parameter(
                        "parameter: msn-scorer",
                        repr(source.msn_deconvolution_args.get("scorer")))
            data_processing = inst.serializer.build_processing_method()
            inst.serializer.add_data_processing(data_processing)
        else:
            # No source to count spectra from, so a fixed placeholder is used.
            n_spectra = 2e5
            inst = cls(path, sample_name, n_spectra=n_spectra)
        # Force marshalling of controlled vocabularies early.
        inst.serializer.writer.param("32-bit float")
        return inst
Пример #8
0
def describe(path, diagnostics=False):
    '''Produces a minimal textual description of a mass spectrometry data file.

    The description is written to the console with :func:`click.echo`. If an
    extended scan index file exists next to the data file, summary statistics
    from it are reported as well.

    Parameters
    ----------
    path : str
        The path to the data file to describe.
    diagnostics : bool
        When true, the reader is reset and iterated in full to count spectra,
        check the isolation window of every product scan and tally the
        activation methods used.

    Raises
    ------
    click.Abort
        If the file cannot be opened or its format is not recognized. The
        reason is written to standard error before the exception is raised.
    '''
    click.echo("Describing \"%s\"" % (path,))
    try:
        sf = SourceFile.from_path(path)
    except IOError:
        # Exceptions accept no keyword arguments, so `err=True` cannot be passed
        # to click.Abort. The message is echoed to stderr explicitly instead.
        message = "Could not open file \"%s\"" % (path, )
        click.echo(message, err=True)
        raise click.Abort(message)

    if sf.file_format is None:
        message = "\"%s\" doesn't appear to be a mass spectrometry data file" % (path, )
        click.echo(message, err=True)
        raise click.Abort(message)
    click.echo("File Format: %s" % (sf.file_format, ))
    click.echo("ID Format: %s" % (sf.id_format, ))
    reader = MSFileLoader(path)
    if isinstance(reader, RandomAccessScanSource):
        click.echo("Format Supports Random Access: True")
        first_scan = reader[0]
        last_scan = reader[-1]
        click.echo("First Scan: %s at %0.3f minutes" % (first_scan.id, first_scan.scan_time))
        click.echo("Last Scan: %s at %0.3f minutes" % (last_scan.id, last_scan.scan_time))
    else:
        click.echo("Format Supports Random Access: False")
    # Readers which do not expose file_description() are skipped silently.
    try:
        finfo = reader.file_description()
        click.echo("Contents:")
        for key in finfo.contents:
            click.echo("    %s" % (key, ))
    except AttributeError:
        pass
    index_file_name = quick_index.ExtendedScanIndex.index_file_name(path)
    # Extra introspection if the extended index is available
    if os.path.exists(index_file_name):
        with open(index_file_name, 'rt') as fh:
            index = quick_index.ExtendedScanIndex.deserialize(fh)
        ms1_scans = len(index.ms1_ids)
        msn_scans = len(index.msn_ids)
        click.echo("MS1 Scans: %d" % (ms1_scans, ))
        click.echo("MSn Scans: %d" % (msn_scans, ))
        n_defaulted = 0
        n_orphan = 0

        charges = Counter()
        first_msn = float('inf')
        last_msn = 0
        for scan_info in index.msn_ids.values():
            n_defaulted += scan_info.get('defaulted', False)
            n_orphan += scan_info.get('orphan', False)
            charges[scan_info['charge']] += 1
            rt = scan_info['scan_time']
            if rt < first_msn:
                first_msn = rt
            if rt > last_msn:
                last_msn = rt
        # With no MSn scans the sentinels above were never replaced, so there
        # is no meaningful time range to report.
        if msn_scans > 0:
            click.echo("First MSn Scan: %0.3f minutes" % (first_msn,))
            click.echo("Last MSn Scan: %0.3f minutes" % (last_msn,))
        # Drop non-integer charge keys *before* sorting: on Python 3 ordering
        # integers against other types raises TypeError.
        integer_charges = [
            (charge, count) for charge, count in charges.items()
            if isinstance(charge, int)
        ]
        for charge, count in sorted(integer_charges):
            click.echo("Precursors with Charge State %d: %d" % (charge, count))
        if n_defaulted > 0:
            click.echo("Defaulted MSn Scans: %d" % (n_defaulted,))
        if n_orphan > 0:
            click.echo("Orphan MSn Scans: %d" % (n_orphan,))
    if diagnostics:
        reader.reset()
        scan_ids_with_invalid_isolation_window = []
        scan_ids_with_empty_isolation_window = []
        activation_counter = Counter()
        n_ms1 = 0
        n_msn = 0
        for precursor, products in reader:
            if precursor is not None:
                n_ms1 += 1
                precursor_id = precursor.id
            else:
                # Products may arrive without a precursor scan; record them
                # with no precursor id rather than failing on `None.id`.
                precursor_id = None
            for product in products:
                n_msn += 1
                if not isolation_window_valid(product):
                    scan_ids_with_invalid_isolation_window.append((precursor_id, product.id))
                if is_isolation_window_empty(product):
                    scan_ids_with_empty_isolation_window.append(
                        (precursor_id, product.id))
                activation_counter[product.activation] += 1

        click.echo("MS1 Spectra: %d" % (n_ms1, ))
        click.echo("MSn Spectra: %d" % (n_msn, ))
        click.echo("Invalid Isolation Windows: %d" % (
            len(scan_ids_with_invalid_isolation_window), ))
        click.echo("Empty Isolation Windows: %d" % (
            len(scan_ids_with_empty_isolation_window), ))

        click.echo("Activation Methods")
        for activation, count in activation_counter.items():
            if not activation.is_multiple_dissociation():
                click.echo("\t{method}:{energy} = {count}".format(
                    method=activation.method, energy=activation.energy, count=count))
            else:
                # Multiple dissociation: methods and energies are sequences,
                # reported here as comma separated lists.
                click.echo("\t{method}:{energy} = {count}".format(
                    method=','.join(map(str, activation.method)),
                    energy=','.join(map(str, activation.energies)), count=count))
Пример #9
0
 def make_scan():
     '''Fetch the scan with id "scanId=1740086" from the compressed mzML test file.'''
     source_path = datafile("20150710_3um_AGP_001_29_30.mzML.gz")
     loader = MSFileLoader(source_path)
     return loader.get_scan_by_id("scanId=1740086")