Example #1
def metadata_index(paths, processes=4, deconvoluted=False):
    '''Build an external scan metadata index for a mass spectrometry data file.

    This extended index is saved in a separate JSON file that can be loaded with
    :class:`~.ExtendedScanIndex`. It includes the scan time of all scans, the precursor
    mass of MSn scans, the relationships between precursor and product ion scans, and
    other details. See :class:`~.ExtendedScanIndex` for more information.
    '''
    for path in paths:
        click.echo("Indexing %s" % (path, ))
        if deconvoluted:
            reader = ProcessedMzMLDeserializer(path, use_extended_index=False)
        else:
            reader = MSFileLoader(path)
        try:
            fn = reader.prebuild_byte_offset_file
            if not reader.source._check_has_byte_offset_file():
                fn(path)
        except AttributeError:
            pass
        if processes > 1:
            progbar = progress(label='Building Index', length=100)
            acc = [0]  # mutable cell tracking how much progress has already been reported

            def update_bar(x):
                '''Progress Bar update callback for :func:`~.quick_index.index`
                '''
                x = int(x * 100)
                x -= acc[0]  # pylint: disable=cell-var-from-loop
                progbar.update(x)  # pylint: disable=cell-var-from-loop
                acc[0] += x  # pylint: disable=cell-var-from-loop

            with progbar:
                update_bar(0.0)
                index, _ = quick_index.index(
                    reader, processes, progress_indicator=update_bar)
        else:
            index = quick_index.ExtendedScanIndex()
            reader.reset()
            try:
                n = len(reader)
                progbar = progress(label='Building Index', length=n)
            except TypeError:
                progbar = spinner(title="Building Index")
            with progbar:
                for bunch in reader.make_iterator(grouped=True):
                    # Count the scans in this bunch to advance the progress bar
                    i = 0
                    i += bunch.precursor is not None
                    i += len(bunch.products)
                    index.add_scan_bunch(bunch)
                    progbar.update(i)

        name = path
        index_file_name = index.index_file_name(name)
        with open(index_file_name, 'w') as fh:
            index.serialize(fh)
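
A minimal usage sketch for the example above (not part of the original listing);
the file name is a hypothetical placeholder, and processes=1 takes the serial
indexing path.

# Hypothetical usage: build the JSON metadata index for one mzML file.
metadata_index(["sample.mzML"], processes=1, deconvoluted=False)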
Example #2
def msms_intervals(paths, processes=4, time_radius=5, mz_lower=2., mz_higher=3., output=None):
    '''Construct an interval tree spanning time and m/z domains where MSn spectra were acquired
    in the LC-MS map. The interval tree is serialized to JSON.
    '''
    interval_extraction = _MSMSIntervalTask(time_radius, mz_lower, mz_higher)
    interval_set = []
    total_work_items = len(paths) * processes * 4

    def _run():
        for path in paths:
            reader = MSFileLoader(path)
            chunk_out_of_order = quick_index.run_task_in_chunks(
                reader, processes, processes * 4, task=interval_extraction)
            for chunk in chunk_out_of_order:
                interval_set.extend(chunk)
                yield 0
    work_iterator = _run()
    with progress(work_iterator, length=total_work_items, label='Extracting Intervals') as g:
        for _ in g:
            pass
    tree = scan_interval_tree.ScanIntervalTree(scan_interval_tree.make_rt_tree(interval_set))
    if output is not None:
        with open(output, 'wt') as fh:
            tree.serialize(fh)
    else:
        stream = click.get_text_stream('stdout')
        tree.serialize(stream)
        stream.flush()
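
A hedged usage sketch; the run names and output file are assumptions for
illustration only.

# Hypothetical usage: build the MSn interval tree for two runs and write
# the JSON serialization to a file instead of stdout.
msms_intervals(["run1.mzML", "run2.mzML"], processes=4, output="msms_intervals.json")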
Example #3
def to_mgf(reader, outstream, msn_filters=None):
    """Translate the spectra from `reader` into MGF format written to `outstream`.

    As MGF files do not usually contain MS1 spectra, these will be omitted. Additionally,
    MSn spectra will be centroided if they are not already.

    Parameters
    ----------
    reader : :class:`~.ScanIterator`
        The source of spectra to iterate over
    outstream : file-like
        The output stream to write to
    msn_filters : list, optional
        An optional list of strings or :class:`~.ScanFilterBase` instances which will be
        used to transform the m/z and intensity arrays of MSn spectra before they are further
        processed (the default is None, which results in no transformations)

    """
    if not msn_filters:
        msn_filters = []
    reader.make_iterator(grouped=False)
    writer = MGFSerializer(outstream, deconvoluted=False)
    try:
        n_spectra = len(reader)
    except TypeError:
        n_spectra = None
    progbar = progress(reader,
                       label="Processed Spectra",
                       length=n_spectra,
                       item_show_func=lambda x: str(x.id) if x else '',
                       color=True,
                       fill_char=click.style('-', 'green'))
    with progbar:
        for scan in progbar:
            if scan.ms_level == 1:
                continue
            if msn_filters:
                scan = scan.transform(msn_filters)
            if scan.peak_set is None:
                scan.pick_peaks()
            writer.save_scan(scan)
    outstream.flush()
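
A hedged usage sketch, assuming ms_deisotope is installed and "input.mzML" is a
placeholder path; MGF is a text format, so the output stream is opened in text mode.

import ms_deisotope

# Hypothetical usage: write the MSn spectra of an mzML file out as MGF.
reader = ms_deisotope.MSFileLoader("input.mzML")
with open("input.mgf", "wt") as outstream:
    to_mgf(reader, outstream)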
Example #4
def draw_tic(path, output_path=None, start_time=None, end_time=None):
    """Draw the Total Ion Chromatogram (TIC), the total signal at each time point.
    """
    if output_path is None:
        output_path = path + '.tic.png'
    if start_time is None:
        start_time = 0
    if end_time is None:
        end_time = float('inf')

    figure, axis = _make_figure()

    reader = ms_deisotope.MSFileLoader(path)
    reader.start_from_scan(rt=start_time, grouped=False)

    time = array('d')
    intensity = array('d')

    bar = progress(reader, item_show_func=lambda x: str(x.id) if x is not None else '',
                   color=True, fill_char=click.style('-', 'green'))
    with bar:
        for scan in bar:
            if scan.ms_level != 1:
                continue
            time.append(scan.scan_time)
            intensity.append(scan.arrays.intensity.sum())

    click.echo("Total Ion Current: %e" % np.sum(intensity))

    axis.plot(time, intensity)
    axis.set_xlabel("Scan Time (Min)", fontsize=16)
    axis.set_ylabel("Relative Intensity", fontsize=16)
    ylim = axis.get_ylim()
    axis.set_ylim(-10, ylim[1])
    axis.set_xlim(time[0] - 2, time[-1] + 2)
    figure.text(0.15, 0.8, "%0.3e" % np.sum(intensity), ha='left')
    figure.savefig(output_path, bbox_inches='tight', dpi=120)
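
A minimal usage sketch; the path and time window are hypothetical. When
output_path is omitted, the figure is saved to "<path>.tic.png".

# Hypothetical usage: render the TIC between 5 and 60 minutes.
draw_tic("sample.mzML", start_time=5.0, end_time=60.0)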
Example #5
def to_mzml(reader,
            outstream,
            pick_peaks=False,
            reprofile=False,
            ms1_filters=None,
            msn_filters=None,
            default_activation=None,
            correct_precursor_mz=False,
            write_index=True,
            update_metadata=True):
    """Translate the spectra from `reader` into mzML format written to `outstream`.

    Wraps the process of iterating over `reader`, performing a set of simple data transformations if desired,
    and then converts each :class:`~.Scan` into mzML format. Any data transformations are described in the
    appropriate metadata section. All other metadata from `reader` is copied into `outstream`.

    Parameters
    ----------
    reader : :class:`~.ScanIterator`
        The source of spectra to iterate over
    outstream : file-like
        The output stream to write mzML to.
    pick_peaks : bool, optional
        Whether to centroid profile spectra (the default is False)
    reprofile : bool, optional
        Whether to reprofile spectra from their centroids (the default is False)
    ms1_filters : list, optional
        An optional list of strings or :class:`~.ScanFilterBase` instances which will be
        used to transform the m/z and intensity arrays of MS1 spectra before they are further
        processed (the default is None, which results in no transformations)
    msn_filters : list, optional
        An optional list of strings or :class:`~.ScanFilterBase` instances which will be
        used to transform the m/z and intensity arrays of MSn spectra before they are further
        processed (the default is None, which results in no transformations)
    default_activation : :class:`str` or :class:`dict`, optional
        A default activation type to use when `reader` does not contain that information (the default is None)
    correct_precursor_mz : bool, optional
        Whether or not to assign the precursor m/z of each product scan to the nearest peak
        m/z in the precursor's peak list. (the default is False, which results in no correction)

    """
    if ms1_filters is None:
        ms1_filters = []
    if msn_filters is None:
        msn_filters = []
    reader.make_iterator(grouped=True)
    writer = MzMLSerializer(outstream,
                            len(reader),
                            deconvoluted=False,
                            build_extra_index=write_index,
                            include_software_entry=update_metadata)
    writer.copy_metadata_from(reader)
    if update_metadata:
        method = data_transformation.ProcessingMethod(
            software_id='ms_deisotope_1')
        if pick_peaks:
            method.add('MS:1000035')
        if correct_precursor_mz:
            method.add('MS:1000780')
        if reprofile:
            method.add('MS:1000784')
        method.add('MS:1000544')
        writer.add_data_processing(method)
    if default_activation is not None:
        if isinstance(default_activation, basestring):
            default_activation = activation_module.ActivationInformation(
                default_activation, unitfloat(0, 'electronvolt'))
        elif isinstance(default_activation, dict):
            default_activation = activation_module.ActivationInformation(
                **default_activation)
        else:
            click.secho("Could not convert %r into ActivationInformation" %
                        (default_activation, ),
                        err=True,
                        fg='yellow')
            default_activation = None
    if pick_peaks:
        try:
            writer.remove_file_contents("profile spectrum")
        except KeyError:
            pass
        writer.add_file_contents("centroid spectrum")
    n_spectra = len(reader)
    progbar = progress(
        label="Processed Spectra",
        length=n_spectra,
        item_show_func=lambda x: str(x.precursor.id
                                     if x.precursor else x.products[0].id)
        if x else '')
    with progbar:
        for bunch in reader:
            progbar.current_item = bunch
            progbar.update((bunch.precursor is not None) + len(bunch.products))
            discard_peaks = False  # True when peaks were picked only to correct the precursor m/z
            if bunch.precursor is not None:
                if reprofile:
                    bunch = bunch._replace(
                        precursor=bunch.precursor.reprofile())
                if ms1_filters:
                    bunch = bunch._replace(
                        precursor=bunch.precursor.transform(ms1_filters))
                if pick_peaks or not bunch.precursor.is_profile:
                    bunch.precursor.pick_peaks()
                if correct_precursor_mz:
                    if not pick_peaks:
                        bunch.precursor.pick_peaks()
                        if bunch.precursor.is_profile:
                            discard_peaks = True

            for i, product in enumerate(bunch.products):
                # if reprofile:
                #     product = bunch.products[i] = product.reprofile()
                if msn_filters:
                    product = bunch.products[i] = product.transform(
                        msn_filters)
                if pick_peaks or not product.is_profile:
                    product.pick_peaks()
                if product.activation is None and default_activation is not None:
                    product.activation = default_activation
                if correct_precursor_mz:
                    product.precursor_information.correct_mz()
            if discard_peaks:
                bunch.precursor.peak_set = None
            writer.save_scan_bunch(bunch)
    writer.complete()
    writer.format()
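
A hedged usage sketch, assuming ms_deisotope is installed and "input.mzML" is a
placeholder path; the mzML output is written as bytes, hence binary mode.

import ms_deisotope

# Hypothetical usage: centroid profile spectra and correct precursor m/z
# while converting to mzML.
reader = ms_deisotope.MSFileLoader("input.mzML")
with open("output.mzML", "wb") as outstream:
    to_mzml(reader, outstream, pick_peaks=True, correct_precursor_mz=True)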
Example #6
def ms1_spectrum_diagnostics(path, output_path=None):
    '''Collect diagnostic information from MS1 spectra.
    '''
    reader = ms_deisotope.MSFileLoader(path)

    reader.make_iterator(grouped=True)

    ms1_metric_names = [
        'scan_id', 'scan_index', 'scan_time', 'duty_cycle', 'tic',
        'base_peak_mz', 'base_peak_intensity', 'data_point_count',
        'injection_time', 'n_ms2_scans'
    ]
    ms1_metrics = []
    products = None
    last_ms1 = None
    prog = progress(length=len(reader), label='Processing Scans',
                    file=sys.stderr, item_show_func=lambda x: x.id if x else '')
    with prog:
        for precursor, products in reader:
            ms1_time = precursor.scan_time
            if last_ms1 is not None:
                duty_cycle = ms1_time - last_ms1
                ms1_metrics[-1]['duty_cycle'] = duty_cycle
            last_ms1 = ms1_time
            bp = precursor.base_peak()
            acquisition_info = precursor.acquisition_information
            if acquisition_info:
                scan_event = acquisition_info[0]
                inj = scan_event.injection_time
            else:
                inj = np.nan
            ms1_record = {
                "scan_id": precursor.id,
                "scan_index": precursor.index,
                "scan_time": precursor.scan_time,
                "duty_cycle": np.nan,
                "tic": precursor.tic(),
                "base_peak_mz": bp.mz,
                "base_peak_intensity": bp.intensity,
                "data_point_count": precursor.arrays.mz.size,
                "injection_time": inj,
                "n_ms2_scans": len([p for p in products if p.ms_level == 2])
            }
            ms1_metrics.append(ms1_record)
            prog.current_item = precursor
            prog.update(1 + len(products))

    if last_ms1 is not None:
        if products:
            last_time = max([p.scan_time for p in products])
            duty_cycle = last_time - last_ms1
            ms1_metrics[-1]['duty_cycle'] = duty_cycle

    if output_path is None:
        outfh = click.open_file("-", mode='wb')
    else:
        outfh = io.open(output_path, mode='wb')
    if six.PY3:
        stream = io.TextIOWrapper(outfh, encoding='utf8', newline='')
    else:
        stream = outfh
    writer = csv.DictWriter(stream, fieldnames=ms1_metric_names)
    writer.writeheader()
    writer.writerows(ms1_metrics)
    stream.flush()
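
A minimal usage sketch; the paths are hypothetical. Passing output_path=None
streams the CSV to stdout instead.

# Hypothetical usage: write the per-MS1 metrics table to a CSV file.
ms1_spectrum_diagnostics("sample.mzML", output_path="sample.ms1_metrics.csv")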
Example #7
def spectrum_clustering(paths, precursor_error_tolerance=1e-5, similarity_thresholds=None, output_path=None,
                        in_memory=False, deconvoluted=False, cache_size=2**10):
    '''Cluster spectra by precursor mass and cosine similarity.

    Spectrum clusters are written out to a text file recording
    cluster precursor mass, within-cluster similarity, and the
    source file and scan ID for each cluster member.
    '''
    if not similarity_thresholds:
        similarity_thresholds = [0.1, 0.4, 0.7]
    else:
        similarity_thresholds = sorted(similarity_thresholds)
    if output_path is None:
        output_path = "-"
    msn_scans = []
    n_spectra = 0

    with progress(paths, label="Indexing", item_show_func=lambda x: str(x) if x else '',
                  color=True, fill_char=click.style('-', 'cyan')) as progbar:
        key_seqs = []
        for path in progbar:
            if deconvoluted:
                reader = ProcessedMzMLDeserializer(path)
                reader.parse_envelopes = False
                index = reader.extended_index
            else:
                reader, index = _ensure_metadata_index(path)
            key_seqs.append((reader, index))
            n_spectra += len(index.msn_ids)

    with progress(label="Loading Spectra", length=n_spectra,
                  item_show_func=lambda x: str(x) if x else '', color=True, fill_char=click.style('-', 'green')) as progbar:
        for reader, index in key_seqs:
            if not in_memory:
                if not reader.has_fast_random_access:
                    click.secho(
                        "%s does not have fast random access, scan fetching may be slow!" % (
                            reader, ), fg='yellow')
                proxy_context = ScanProxyContext(reader, cache_size=cache_size)
                pinfo_map = {
                    pinfo.product_scan_id: pinfo for pinfo in
                    index.get_precursor_information()
                }
                for i in index.msn_ids:
                    progbar.current_item = i
                    progbar.update(1)
                    scan = proxy_context(i)
                    scan.precursor_information = pinfo_map[i]
                    msn_scans.append(scan)
            else:
                if reader.has_fast_random_access:
                    # We have fast random access so we can just loop over the index and pull out
                    # the MSn scans directly without completely traversing the file.
                    for i in index.msn_ids:
                        progbar.current_item = i
                        progbar.update(1)
                        scan = reader.get_scan_by_id(i)
                        if scan.peak_set is None and not deconvoluted:
                            scan = scan.pick_peaks().pack(bind=True)
                        msn_scans.append(scan)
                else:
                    # If we don't have fast random access, it's better just to loop over the file
                    # and absorb the cost of parsing the MS1 scans.
                    reader.reset()
                    reader.make_iterator(grouped=False)
                    for scan in reader:
                        if scan.ms_level != 1:
                            progbar.current_item = scan.id
                            progbar.update(1)
                            if scan.peak_set is None and not deconvoluted:
                                scan = scan.pick_peaks().pack(bind=True)
                            msn_scans.append(scan)
                # Dispose of the state that is no longer required.
                reader.reset()
                index.clear()

    click.echo("Begin Clustering", err=True)
    clusters = iterative_clustering(
        msn_scans, precursor_error_tolerance, similarity_thresholds)
    click.echo("Clusering Finished", err=True)
    by_size = Counter()
    for cluster in clusters:
        by_size[len(cluster)] += 1
    click.echo("Clusters: {:d}".format(len(clusters)), err=True)
    for key, value in sorted(by_size.items()):
        click.echo("Size {:d}: {:d}".format(key, value), err=True)
    with click.open_file(output_path, mode='w') as outfh:
        writer = ScanClusterWriter(outfh)
        for cluster in clusters:
            writer.save(cluster)
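
A hedged usage sketch; the run names, thresholds, and output file are
assumptions for illustration.

# Hypothetical usage: cluster MSn spectra from two runs, keeping scan
# proxies backed by the file (in_memory=False) to bound memory use.
spectrum_clustering(["run1.mzML", "run2.mzML"],
                    precursor_error_tolerance=1e-5,
                    similarity_thresholds=[0.1, 0.4, 0.7],
                    output_path="clusters.txt")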