def metadata_index(paths, processes=4, deconvoluted=False):
    '''Build an external scan metadata index for a mass spectrometry data file

    This extended index is saved in a separate JSON file that can be loaded with
    :class:`~.ExtendedScanIndex`. It includes the scan time of all scans, the precursor mass of MSn
    scans, as well as the relationships between precursor and product ion scans, as well as other
    details. See :class:`~.ExtendedScanIndex` for more information

    Parameters
    ----------
    paths : list
        Paths to the data files to index, processed one at a time.
    processes : int, optional
        Number of worker processes to use; values > 1 delegate to
        :func:`~.quick_index.index` for parallel indexing (default 4).
    deconvoluted : bool, optional
        Whether to open the file as an already-processed mzML file via
        :class:`ProcessedMzMLDeserializer` instead of :func:`MSFileLoader`
        (default False).
    '''
    for path in paths:
        click.echo("Indexing %s" % (path, ))
        if deconvoluted:
            reader = ProcessedMzMLDeserializer(path, use_extended_index=False)
        else:
            reader = MSFileLoader(path)
        # If the reader supports byte-offset prefix indexing and one has not
        # been built yet, build it now. Readers without this capability raise
        # AttributeError, which is deliberately ignored.
        try:
            fn = reader.prebuild_byte_offset_file
            if not reader.source._check_has_byte_offset_file():
                fn(path)
        except AttributeError:
            pass
        if processes > 1:
            # Parallel path: quick_index.index reports progress as a fraction in
            # [0, 1]; the closure converts that into incremental percentage steps
            # for a fixed-length (100) progress bar. `acc` holds the cumulative
            # percentage already reported, stored in a list so the closure can
            # mutate it (pre-nonlocal idiom).
            progbar = progress(label='Building Index', length=100)
            acc = [0]

            def update_bar(x):
                '''Progress Bar update callback for :func:`~.quick_index.index`
                '''
                x = int(x * 100)
                x -= acc[0]  # pylint: disable=cell-var-from-loop
                progbar.update(x)  # pylint: disable=cell-var-from-loop
                acc[0] += x  # pylint: disable=cell-var-from-loop

            with progbar:
                update_bar(0.0)
                index, _ = quick_index.index(
                    reader, processes, progress_indicator=update_bar)
        else:
            # Serial path: build the index in-process, one scan bunch at a time.
            index = quick_index.ExtendedScanIndex()
            reader.reset()
            # Readers without a known length cannot drive a bounded progress
            # bar, so fall back to an indeterminate spinner.
            try:
                n = len(reader)
                progbar = progress(label='Building Index', length=n)
            except TypeError:
                progbar = spinner(title="Building Index")
            with progbar:
                for bunch in reader.make_iterator(grouped=True):
                    # Count the precursor (if any) plus all products so the bar
                    # advances by the number of scans consumed in this bunch.
                    i = 0
                    i += bunch.precursor is not None
                    i += len(bunch.products)
                    index.add_scan_bunch(bunch)
                    progbar.update(i)
        # The index file name is derived from the source path and written as JSON.
        name = path
        index_file_name = index.index_file_name(name)
        with open(index_file_name, 'w') as fh:
            index.serialize(fh)
def msms_intervals(paths, processes=4, time_radius=5, mz_lower=2., mz_higher=3., output=None):
    '''Construct an interval tree spanning time and m/z domains where MSn spectra were acquired
    in the LC-MS map. The interval tree is serialized to JSON.
    '''
    extractor = _MSMSIntervalTask(time_radius, mz_lower, mz_higher)
    collected = []
    # Each path contributes `processes * 4` chunks of work, which sizes the
    # progress bar.
    expected_units = len(paths) * processes * 4

    def _extract_all():
        # Generator that drives the extraction: for every input file, run the
        # interval task over chunks in parallel, accumulating results as they
        # arrive (order does not matter) and yielding once per chunk so the
        # enclosing progress bar can tick.
        for source_path in paths:
            source = MSFileLoader(source_path)
            unordered_chunks = quick_index.run_task_in_chunks(
                source, processes, processes * 4, task=extractor)
            for finished_chunk in unordered_chunks:
                collected.extend(finished_chunk)
                yield 0

    with progress(_extract_all(), length=expected_units, label='Extracting Intervals') as bar:
        # Exhaust the generator purely for its side effects.
        for _ in bar:
            pass

    rt_tree = scan_interval_tree.make_rt_tree(collected)
    tree = scan_interval_tree.ScanIntervalTree(rt_tree)

    if output is None:
        # No output file requested: serialize to stdout.
        stream = click.get_text_stream('stdout')
        tree.serialize(stream)
        stream.flush()
    else:
        with open(output, 'wt') as fh:
            tree.serialize(fh)
def to_mgf(reader, outstream, msn_filters=None):
    """Translate the spectra from `reader` into MGF format written to `outstream`.

    As MGF files do not usually contain MS1 spectra, these will be omitted. Additionally,
    MSn spectra will be centroided if they are not already.

    Parameters
    ----------
    reader : :class:`~.ScanIterator`
        The source of spectra to iterate over
    outstream : file-like
        The output stream to write to
    msn_filters : list, optional
        An optional list of strings or :class:`~.ScanFilterBase` instances which will be used
        to transform the m/z and intensity arrays of MSn spectra before they are further
        processed. (the default is None, which results in no transformations)
    """
    if not msn_filters:
        msn_filters = []
    reader.make_iterator(grouped=False)
    writer = MGFSerializer(outstream, deconvoluted=False)
    # The total spectrum count is only needed for the progress bar; readers
    # without a known length raise TypeError and get an indeterminate bar.
    try:
        n_spectra = len(reader)
    except TypeError:
        n_spectra = None
    progbar = progress(reader, label="Processed Spectra", length=n_spectra,
                       item_show_func=lambda x: str(x.id) if x else '',
                       color=True, fill_char=click.style('-', 'green'))
    with progbar:
        for scan in progbar:
            # MGF conventionally contains only MSn spectra; skip MS1.
            if scan.ms_level == 1:
                continue
            if msn_filters:
                scan = scan.transform(msn_filters)
            # MGF stores centroided peak lists; pick peaks if not done already.
            if scan.peak_set is None:
                scan.pick_peaks()
            writer.save_scan(scan)
    outstream.flush()
def draw_tic(path, output_path=None, start_time=None, end_time=None):
    """Draw the Total Ion Chromatogram (TIC), the total signal at each time point.

    Parameters
    ----------
    path : str
        Path to the mass spectrometry data file to read.
    output_path : str, optional
        Where to save the figure; defaults to ``path + '.tic.png'``.
    start_time : float, optional
        Scan time (minutes) to start reading from (default 0).
    end_time : float, optional
        Scan time (minutes) to stop at (default unbounded).
    """
    if output_path is None:
        output_path = path + '.tic.png'
    if start_time is None:
        start_time = 0
    if end_time is None:
        end_time = float('inf')
    figure, axis = _make_figure()
    reader = ms_deisotope.MSFileLoader(path)
    reader.start_from_scan(rt=start_time, grouped=False)
    time = array('d')
    intensity = array('d')
    bar = progress(reader, item_show_func=lambda x: str(
        x.id) if x is not None else '', color=True, fill_char=click.style('-', 'green'))
    with bar:
        for scan in bar:
            # BUG FIX: `end_time` was previously accepted but never applied;
            # stop once the scan time passes the requested upper bound.
            if scan.scan_time > end_time:
                break
            if scan.ms_level != 1:
                continue
            time.append(scan.scan_time)
            intensity.append(scan.arrays.intensity.sum())
    # BUG FIX: guard against files/windows containing no MS1 scans, which
    # previously raised IndexError on `time[0]` below.
    if not time:
        click.secho("No MS1 scans found in %s" % (path, ), err=True, fg='yellow')
        return
    # Hoist the total so it is computed once for both the echo and the figure label.
    total_ion_current = np.sum(intensity)
    click.echo("Total Ion Current: %e" % total_ion_current)
    axis.plot(time, intensity)
    axis.set_xlabel("Scan Time (Min)", fontsize=16)
    axis.set_ylabel("Relative Intensity", fontsize=16)
    ylim = axis.get_ylim()
    axis.set_ylim(-10, ylim[1])
    axis.set_xlim(time[0] - 2, time[-1] + 2)
    figure.text(0.15, 0.8, "%0.3e" % total_ion_current, ha='left')
    figure.savefig(output_path, bbox_inches='tight', dpi=120)
def to_mzml(reader, outstream, pick_peaks=False, reprofile=False, ms1_filters=None,
            msn_filters=None, default_activation=None, correct_precursor_mz=False,
            write_index=True, update_metadata=True):
    """Translate the spectra from `reader` into mzML format written to `outstream`.

    Wraps the process of iterating over `reader`, performing a set of simple data
    transformations if desired, and then converts each :class:`~.Scan` into mzML
    format. Any data transformations are described in the appropriate metadata
    section.

    All other metadata from `reader` is copied to into `outstream`.

    Parameters
    ----------
    reader : :class:`~.ScanIterator`
        The source of spectra to iterate over
    outstream : file-like
        The output stream to write mzML to.
    pick_peaks : bool, optional
        Whether to centroid profile spectra (the default is False)
    reprofile: bool, optional
        Whether to reprofile spectra from their centroids (the default is False)
    ms1_filters : list, optional
        An optional list of strings or :class:`~.ScanFilterBase` instances which will be used
        to transform the m/z and intensity arrays of MS1 spectra before they are further
        processed (the default is None, which results in no transformations)
    msn_filters : list, optional
        An optional list of strings or :class:`~.ScanFilterBase` instances which will be used
        to transform the m/z and intensity arrays of MSn spectra before they are further
        processed. (the default is None, which results in no transformations)
    default_activation : :class:`str` or :class:`dict`, optional
        A default activation type to use when `reader` does not contain that information
        (the default is None)
    correct_precursor_mz : bool, optional
        Whether or not to assign the precursor m/z of each product scan to the nearest
        peak m/z in the precursor's peak list.
        (the default is False, which results in no correction)
    write_index : bool, optional
        Whether to build the extra index alongside the output file (the default is True)
    update_metadata : bool, optional
        Whether to record a software entry and data processing method describing the
        transformations applied here (the default is True)
    """
    if ms1_filters is None:
        ms1_filters = []
    if msn_filters is None:
        msn_filters = []
    reader.make_iterator(grouped=True)
    writer = MzMLSerializer(outstream, len(reader), deconvoluted=False,
                            build_extra_index=write_index,
                            include_software_entry=update_metadata)
    writer.copy_metadata_from(reader)
    if update_metadata:
        # Record a data processing method describing which transformations were
        # applied, using PSI-MS controlled vocabulary accessions.
        method = data_transformation.ProcessingMethod(software_id='ms_deisotope_1')
        if pick_peaks:
            method.add('MS:1000035')  # presumably "peak picking" — recorded only when centroiding
        if correct_precursor_mz:
            method.add('MS:1000780')  # recorded when precursor m/z correction is applied
        if reprofile:
            method.add('MS:1000784')  # recorded when reprofiling is applied
        method.add('MS:1000544')  # always recorded for the mzML conversion itself
        writer.add_data_processing(method)
    if default_activation is not None:
        # Normalize the user-supplied default activation into an
        # ActivationInformation instance; strings get a 0 eV placeholder energy.
        # `basestring` here implies Python 2/3 compatibility (e.g. via six).
        if isinstance(default_activation, basestring):
            default_activation = activation_module.ActivationInformation(
                default_activation, unitfloat(0, 'electronvolt'))
        elif isinstance(default_activation, dict):
            default_activation = activation_module.ActivationInformation(
                **default_activation)
        else:
            # Unconvertible values are warned about and ignored rather than
            # aborting the conversion.
            click.secho("Could not convert %r into ActivationInformation" %
                        (default_activation, ), err=1, fg='yellow')
            default_activation = None
    if pick_peaks:
        # The output will contain centroid spectra; update the <fileContent>
        # declarations accordingly. The removal may fail if "profile spectrum"
        # was never declared, which is fine.
        try:
            writer.remove_file_contents("profile spectrum")
        except KeyError:
            pass
        writer.add_file_contents("centroid spectrum")
    n_spectra = len(reader)
    progbar = progress(
        label="Processed Spectra", length=n_spectra,
        item_show_func=lambda x: str(x.precursor.id if x.precursor else x.products[0].id) if x else '')
    with progbar:
        for bunch in reader:
            progbar.current_item = bunch
            # Advance by the number of scans in this bunch (precursor counts as 1 if present).
            progbar.update((bunch.precursor is not None) + len(bunch.products))
            discard_peaks = False
            if bunch.precursor is not None:
                if (reprofile):
                    bunch = bunch._replace(
                        precursor=bunch.precursor.reprofile())
                if ms1_filters:
                    bunch = bunch._replace(
                        precursor=bunch.precursor.transform(ms1_filters))
                if (pick_peaks or not bunch.precursor.is_profile):
                    bunch.precursor.pick_peaks()
                if correct_precursor_mz:
                    if not pick_peaks:
                        # Peaks are needed for the m/z correction below; if this
                        # spectrum is profile mode, the temporary peak list is
                        # discarded afterwards so the output stays profile.
                        bunch.precursor.pick_peaks()
                        if bunch.precursor.is_profile:
                            discard_peaks = True
            for i, product in enumerate(bunch.products):
                # if reprofile:
                #     product = bunch.products[i] = product.reprofile()
                if msn_filters:
                    product = bunch.products[i] = product.transform(
                        msn_filters)
                if pick_peaks or not product.is_profile:
                    product.pick_peaks()
                if product.activation is None and default_activation is not None:
                    product.activation = default_activation
                if correct_precursor_mz:
                    # Snap the recorded precursor m/z to the nearest precursor peak.
                    product.precursor_information.correct_mz()
            if discard_peaks:
                # Drop the temporary peak list picked solely for m/z correction.
                bunch.precursor.peak_set = None
            writer.save_scan_bunch(bunch)
    writer.complete()
    writer.format()
def ms1_spectrum_diagnostics(path, output_path=None):
    '''Collect diagnostic information from MS1 spectra.

    For each MS1 spectrum, records scan identity, time, TIC, base peak,
    data point count, injection time, number of dependent MS2 scans, and the
    duty cycle (time to the next MS1 scan, back-filled once the next MS1 is
    seen). Results are written as CSV to `output_path`, or to stdout if it is
    None.
    '''
    reader = ms_deisotope.MSFileLoader(path)
    reader.make_iterator(grouped=True)
    # CSV column order.
    ms1_metric_names = [
        'scan_id', 'scan_index', 'scan_time', 'duty_cycle', 'tic',
        'base_peak_mz', 'base_peak_intensity', 'data_point_count',
        'injection_time', 'n_ms2_scans'
    ]
    ms1_metrics = []
    products = None
    last_ms1 = None
    prog = progress(length=len(reader), label='Processing Scans', file=sys.stderr,
                    item_show_func=lambda x: x.id if x else '')
    with prog:
        for precursor, products in reader:
            ms1_time = precursor.scan_time
            # The duty cycle of the *previous* MS1 scan is only knowable once
            # we see the current one, so back-patch the last record.
            if last_ms1 is not None:
                duty_cycle = ms1_time - last_ms1
                ms1_metrics[-1]['duty_cycle'] = duty_cycle
            last_ms1 = ms1_time
            # NOTE(review): base_peak() presumably returns None for an empty
            # spectrum, which would make bp.mz below fail — confirm upstream.
            bp = precursor.base_peak()
            acquisition_info = precursor.acquisition_information
            if acquisition_info:
                scan_event = acquisition_info[0]
                inj = scan_event.injection_time
            else:
                inj = np.nan
            ms1_record = {
                "scan_id": precursor.id,
                "scan_index": precursor.index,
                "scan_time": precursor.scan_time,
                # Placeholder; patched when the next MS1 arrives (or below for
                # the final scan).
                "duty_cycle": np.nan,
                "tic": precursor.tic(),
                "base_peak_mz": bp.mz,
                "base_peak_intensity": bp.intensity,
                "data_point_count": precursor.arrays.mz.size,
                "injection_time": inj,
                "n_ms2_scans": len([p for p in products if p.ms_level == 2])
            }
            ms1_metrics.append(ms1_record)
            prog.current_item = precursor
            prog.update(1 + len(products))
    # The last MS1 scan has no successor; approximate its duty cycle with the
    # time to its last product scan, when any products exist.
    if last_ms1 is not None:
        if products:
            last_time = max([p.scan_time for p in products])
            duty_cycle = last_time - last_ms1
            ms1_metrics[-1]['duty_cycle'] = duty_cycle
    # "-" means stdout for click.open_file. Open in binary and wrap with a
    # text layer on Python 3 so csv gets str with consistent newlines on both
    # major versions.
    if output_path is None:
        outfh = click.open_file("-", mode='wb')
    else:
        outfh = io.open(output_path, mode='wb')
    if six.PY3:
        stream = io.TextIOWrapper(outfh, encoding='utf8', newline='')
    else:
        stream = outfh
    writer = csv.DictWriter(stream, fieldnames=ms1_metric_names)
    writer.writeheader()
    writer.writerows(ms1_metrics)
    stream.flush()
def spectrum_clustering(paths, precursor_error_tolerance=1e-5, similarity_thresholds=None,
                        output_path=None, in_memory=False, deconvoluted=False,
                        cache_size=2**10):
    '''Cluster spectra by precursor mass and cosine similarity.

    Spectrum clusters are written out to a text file recording cluster precursor mass,
    within-cluster similarity, and the source file and scan ID for each cluster member.

    Parameters
    ----------
    paths : list
        Paths to the data files whose MSn spectra will be clustered.
    precursor_error_tolerance : float, optional
        Relative precursor mass tolerance for grouping (default 1e-5).
    similarity_thresholds : list, optional
        Similarity cut points for iterative clustering, applied in ascending
        order (default [0.1, 0.4, 0.7]).
    output_path : str, optional
        Where to write the cluster report; "-" (stdout) when None.
    in_memory : bool, optional
        Whether to load all spectra eagerly instead of using lazy scan proxies.
    deconvoluted : bool, optional
        Whether the inputs are processed mzML files with deconvoluted spectra.
    cache_size : int, optional
        Size of the scan proxy cache when not loading in memory (default 1024).
    '''
    if not similarity_thresholds:
        similarity_thresholds = [0.1, 0.4, 0.7]
    else:
        similarity_thresholds = sorted(similarity_thresholds)
    if output_path is None:
        output_path = "-"
    msn_scans = []
    n_spectra = 0
    # Pass 1: open every file and collect its extended index so we know how
    # many MSn spectra to expect in total.
    with progress(paths, label="Indexing", item_show_func=lambda x: str(x) if x else '',
                  color=True, fill_char=click.style('-', 'cyan')) as progbar:
        key_seqs = []
        for path in progbar:
            if deconvoluted:
                reader = ProcessedMzMLDeserializer(path)
                reader.parse_envelopes = False
                index = reader.extended_index
            else:
                reader, index = _ensure_metadata_index(path)
            key_seqs.append((reader, index))
            n_spectra += len(index.msn_ids)
    # Pass 2: materialize the MSn spectra, lazily (proxies) or eagerly.
    with progress(label="Loading Spectra", length=n_spectra,
                  item_show_func=lambda x: str(x) if x else '',
                  color=True, fill_char=click.style('-', 'green')) as progbar:
        for reader, index in key_seqs:
            if not in_memory:
                if not reader.has_fast_random_access:
                    click.secho(
                        "%s does not have fast random access, scan fetching may be slow!" % (
                            reader, ), fg='yellow')
                # Lazy proxies fetch spectra on demand through a bounded cache.
                proxy_context = ScanProxyContext(reader, cache_size=cache_size)
                pinfo_map = {
                    pinfo.product_scan_id: pinfo
                    for pinfo in index.get_precursor_information()
                }
                for i in index.msn_ids:
                    progbar.current_item = i
                    progbar.update(1)
                    scan = proxy_context(i)
                    scan.precursor_information = pinfo_map[i]
                    msn_scans.append(scan)
            else:
                if reader.has_fast_random_access:
                    # We have fast random access so we can just loop over the index and pull out
                    # the MSn scans directly without completely traversing the file.
                    for i in index.msn_ids:
                        progbar.current_item = i
                        progbar.update(1)
                        scan = reader.get_scan_by_id(i)
                        if scan.peak_set is None and not deconvoluted:
                            scan = scan.pick_peaks().pack(bind=True)
                        msn_scans.append(scan)
                else:
                    # If we don't have fast random access, it's better just to loop over the file,
                    # and absorb the cost of parsing the MS1 scans
                    reader.reset()
                    reader.make_iterator(grouped=False)
                    for scan in reader:
                        if scan.ms_level != 1:
                            progbar.current_item = scan.id
                            progbar.update(1)
                            if scan.peak_set is None and not deconvoluted:
                                scan = scan.pick_peaks().pack(bind=True)
                            msn_scans.append(scan)
            # Dispose of the state that is no longer required.
            reader.reset()
            index.clear()
    click.echo("Begin Clustering", err=True)
    clusters = iterative_clustering(
        msn_scans, precursor_error_tolerance, similarity_thresholds)
    # BUG FIX: corrected the user-facing message typo "Clusering Finished".
    click.echo("Clustering Finished", err=True)
    # Report a histogram of cluster sizes.
    by_size = Counter()
    for cluster in clusters:
        by_size[len(cluster)] += 1
    click.echo("Clusters: {:d}".format(len(clusters)), err=True)
    for key, value in sorted(by_size.items()):
        click.echo("Size {:d}: {:d}".format(key, value), err=True)
    with click.open_file(output_path, mode='w') as outfh:
        writer = ScanClusterWriter(outfh)
        for cluster in clusters:
            writer.save(cluster)