def test_ms_deisotope():
    runner = CliRunner(mix_stderr=False)
    path = datafile("20150710_3um_AGP_001_29_30.mzML.gz")
    reference = datafile("20150710_3um_AGP_001_29_30.preprocessed.mzML.gz")
    outpath = tempfile.mktemp()
    result = runner.invoke(deisotoper.deisotope, [
        "-b", 0, "-t", 20, "-tn", 10, "-m", 3, "-mn", 1, path, outpath
    ])
    assert result.exit_code == 0
    result_reader = ProcessedMzMLDeserializer(outpath)
    reference_reader = ProcessedMzMLDeserializer(_compression.get_opener(reference))
    assert len(result_reader) == len(reference_reader)
    for a_bunch, b_bunch in zip(result_reader, reference_reader):
        assert len(a_bunch.products) == len(b_bunch.products)
        aprec = a_bunch.precursor
        bprec = b_bunch.precursor
        assert aprec.id == bprec.id
        diffa, diffb = diff_deconvoluted_peak_set(
            aprec.deconvoluted_peak_set, bprec.deconvoluted_peak_set)
        assert len(aprec.deconvoluted_peak_set) == len(
            bprec.deconvoluted_peak_set), "Peak Counts Diff On %r, (%r, %r)" % (
                aprec.id, diffa, diffb)
        assert aprec.deconvoluted_peak_set == bprec.deconvoluted_peak_set, \
            "Peaks Diff On %r, (%r, %r)" % (aprec.id, diffa, diffb)
        for aprod, bprod in zip(a_bunch.products, b_bunch.products):
            assert aprod.id == bprod.id
            diffa, diffb = diff_deconvoluted_peak_set(
                aprod.deconvoluted_peak_set, bprod.deconvoluted_peak_set)
            assert len(aprod.deconvoluted_peak_set) == len(
                bprod.deconvoluted_peak_set), "Peak Counts Diff On %r, (%r, %r)" % (
                    aprod.id, diffa, diffb)
            assert aprod.deconvoluted_peak_set == bprod.deconvoluted_peak_set, \
                "Peaks Diff On %r, (%r, %r)" % (aprod.id, diffa, diffb)
    result_reader.close()
    reference_reader.close()
    os.remove(outpath)
def metadata_index(paths, processes=4, deconvoluted=False):
    '''Build an external scan metadata index for a mass spectrometry data file.

    This extended index is saved in a separate JSON file that can be loaded with
    :class:`~.ExtendedScanIndex`. It includes the scan time of every scan, the
    precursor mass of MSn scans, the relationships between precursor and product
    ion scans, and other details. See :class:`~.ExtendedScanIndex` for more
    information.
    '''
    for path in paths:
        click.echo("Indexing %s" % (path, ))
        if deconvoluted:
            reader = ProcessedMzMLDeserializer(path, use_extended_index=False)
        else:
            reader = MSFileLoader(path)
        try:
            fn = reader.prebuild_byte_offset_file
            if not reader.source._check_has_byte_offset_file():
                fn(path)
        except AttributeError:
            pass
        if processes > 1:
            progbar = progress(label='Building Index', length=100)
            acc = [0]

            def update_bar(x):
                '''Progress bar update callback for :func:`~.quick_index.index`'''
                x = int(x * 100)
                x -= acc[0]  # pylint: disable=cell-var-from-loop
                progbar.update(x)  # pylint: disable=cell-var-from-loop
                acc[0] += x  # pylint: disable=cell-var-from-loop

            with progbar:
                update_bar(0.0)
                index, _ = quick_index.index(
                    reader, processes, progress_indicator=update_bar)
        else:
            index = quick_index.ExtendedScanIndex()
            reader.reset()
            try:
                n = len(reader)
                progbar = progress(label='Building Index', length=n)
            except TypeError:
                progbar = spinner(title="Building Index")
            with progbar:
                for bunch in reader.make_iterator(grouped=True):
                    i = 0
                    i += bunch.precursor is not None
                    i += len(bunch.products)
                    index.add_scan_bunch(bunch)
                    progbar.update(i)
        name = path
        index_file_name = index.index_file_name(name)
        with open(index_file_name, 'w') as fh:
            index.serialize(fh)
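# A minimal usage sketch for metadata_index, not part of the tool itself. The
# input file name "example.mzML" is hypothetical, and this assumes the function
# can be called directly even though the progress/spinner helpers suggest it is
# normally wrapped as a click command. With processes=1 the sequential branch
# iterates scan bunches one at a time; with processes > 1 the work is fanned
# out through quick_index.index. Either way, a JSON index file named by
# ExtendedScanIndex.index_file_name is written next to the input.
def _example_build_index():
    metadata_index(["example.mzML"], processes=1)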
def main(source_paths, output_path):
    '''Combine multiple processed mzML files together into a single file
    sorted by scan time.
    '''
    sources = []
    for source_path in source_paths:
        click.echo("Reading %r" % source_path)
        sources.append(ProcessedMzMLDeserializer(source_path))
    total_n = sum(map(len, sources))
    writer = MzMLSerializer(open(output_path, 'wb'), total_n)
    iterator = TimeOrderMergingIterator(sources)
    writer.copy_metadata_from(sources[0])
    with writer:
        i = 0
        for bunch in iterator:
            i += 1
            if i % 100 == 0:
                click.echo("Processed %d batches. %d sources depleted" % (
                    i, iterator.count_exhausted()))
            writer.save(bunch)
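# Hedged usage sketch for the merge command above; the file names are
# hypothetical, and this assumes main can be invoked as a plain function (in
# the real CLI it is presumably a click command, in which case
# CliRunner().invoke(main, [...]) would be the equivalent). Note that only the
# first source's metadata is copied into the merged output, so the inputs
# should share compatible instrument and processing metadata.
def _example_merge():
    main(["run_a.processed.mzML", "run_b.processed.mzML"],
         "merged.processed.mzML")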
def reader(self):
    reader = ProcessedMzMLDeserializer(get_opener(self.path))
    return reader
def spectrum_clustering(paths, precursor_error_tolerance=1e-5, similarity_thresholds=None,
                        output_path=None, in_memory=False, deconvoluted=False):
    '''Cluster spectra by precursor mass and cosine similarity.

    Spectrum clusters are written out to a text file recording cluster precursor
    mass, within-cluster similarity, and the source file and scan ID for each
    cluster member.
    '''
    if not similarity_thresholds:
        similarity_thresholds = [0.1, 0.4, 0.7]
    else:
        similarity_thresholds = sorted(similarity_thresholds)
    if output_path is None:
        output_path = "-"
    msn_scans = []
    n_spectra = 0
    with click.progressbar(paths, label="Indexing",
                           item_show_func=lambda x: str(x) if x else '') as progbar:
        key_seqs = []
        for path in progbar:
            if deconvoluted:
                reader = ProcessedMzMLDeserializer(path)
                index = reader.extended_index
            else:
                reader, index = _ensure_metadata_index(path)
            key_seqs.append((reader, index))
            n_spectra += len(index.msn_ids)
    with click.progressbar(label="Loading Spectra", length=n_spectra,
                           item_show_func=lambda x: str(x) if x else '') as progbar:
        for reader, index in key_seqs:
            if not in_memory:
                proxy_context = ScanProxyContext(reader)
                pinfo_map = {
                    pinfo.product_scan_id: pinfo
                    for pinfo in index.get_precursor_information()
                }
                for i in index.msn_ids:
                    progbar.current_item = i
                    progbar.update(1)
                    scan = proxy_context(i)
                    scan.precursor_information = pinfo_map[i]
                    msn_scans.append(scan)
            else:
                for i in index.msn_ids:
                    progbar.current_item = i
                    progbar.update(1)
                    scan = reader.get_scan_by_id(i)
                    if scan.peak_set is None and not deconvoluted:
                        scan.pick_peaks()
                    msn_scans.append(scan)
    click.echo("Begin Clustering", err=True)
    clusters = iterative_clustering(
        msn_scans, precursor_error_tolerance, similarity_thresholds)
    by_size = Counter()
    for cluster in clusters:
        by_size[len(cluster)] += 1
    click.echo("Clusters: {:d}".format(len(clusters)))
    for key, value in sorted(by_size.items()):
        click.echo("Size {:d}: {:d}".format(key, value))
    with click.open_file(output_path, mode='w') as outfh:
        writer = ScanClusterWriter(outfh)
        for cluster in clusters:
            writer.save(cluster)
def load_spectra(self):
    return list(
        ProcessedMzMLDeserializer(
            get_test_data("example_glycopeptide_spectra.mzML")))
def reader(self):
    reader = ProcessedMzMLDeserializer(idzip.open(self.path))
    return reader
def spectrum_clustering(paths, precursor_error_tolerance=1e-5, similarity_thresholds=None,
                        output_path=None, in_memory=False, deconvoluted=False,
                        cache_size=2**10):
    '''Cluster spectra by precursor mass and cosine similarity.

    Spectrum clusters are written out to a text file recording cluster precursor
    mass, within-cluster similarity, and the source file and scan ID for each
    cluster member.
    '''
    if not similarity_thresholds:
        similarity_thresholds = [0.1, 0.4, 0.7]
    else:
        similarity_thresholds = sorted(similarity_thresholds)
    if output_path is None:
        output_path = "-"
    msn_scans = []
    n_spectra = 0
    with click.progressbar(paths, label="Indexing",
                           item_show_func=lambda x: str(x) if x else '') as progbar:
        key_seqs = []
        for path in progbar:
            if deconvoluted:
                reader = ProcessedMzMLDeserializer(path)
                index = reader.extended_index
            else:
                reader, index = _ensure_metadata_index(path)
            key_seqs.append((reader, index))
            n_spectra += len(index.msn_ids)
    with click.progressbar(label="Loading Spectra", length=n_spectra,
                           item_show_func=lambda x: str(x) if x else '') as progbar:
        for reader, index in key_seqs:
            if not in_memory:
                if not reader.has_fast_random_access:
                    click.secho(
                        "%s does not have fast random access, scan fetching may be slow!" % (
                            reader, ),
                        fg='yellow')
                proxy_context = ScanProxyContext(reader, cache_size=cache_size)
                pinfo_map = {
                    pinfo.product_scan_id: pinfo
                    for pinfo in index.get_precursor_information()
                }
                for i in index.msn_ids:
                    progbar.current_item = i
                    progbar.update(1)
                    scan = proxy_context(i)
                    scan.precursor_information = pinfo_map[i]
                    msn_scans.append(scan)
            else:
                if reader.has_fast_random_access:
                    # We have fast random access, so we can loop over the index and
                    # pull out the MSn scans directly without completely traversing
                    # the file.
                    for i in index.msn_ids:
                        progbar.current_item = i
                        progbar.update(1)
                        scan = reader.get_scan_by_id(i)
                        if scan.peak_set is None and not deconvoluted:
                            scan = scan.pick_peaks().pack()
                        msn_scans.append(scan)
                else:
                    # If we don't have fast random access, it's better to loop over
                    # the whole file and absorb the cost of parsing the MS1 scans.
                    reader.reset()
                    reader.make_iterator(grouped=False)
                    for scan in reader:
                        if scan.ms_level != 1:
                            progbar.current_item = scan.id
                            progbar.update(1)
                            if scan.peak_set is None and not deconvoluted:
                                scan = scan.pick_peaks().pack(bind=True)
                            msn_scans.append(scan)
                # Dispose of the state that is no longer required.
                reader.reset()
                index.clear()
    click.echo("Begin Clustering", err=True)
    clusters = iterative_clustering(
        msn_scans, precursor_error_tolerance, similarity_thresholds)
    click.echo("Clustering Finished", err=True)
    by_size = Counter()
    for cluster in clusters:
        by_size[len(cluster)] += 1
    click.echo("Clusters: {:d}".format(len(clusters)), err=True)
    for key, value in sorted(by_size.items()):
        click.echo("Size {:d}: {:d}".format(key, value), err=True)
    with click.open_file(output_path, mode='w') as outfh:
        writer = ScanClusterWriter(outfh)
        for cluster in clusters:
            writer.save(cluster)
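# Hedged usage sketch for spectrum_clustering: the file names are hypothetical,
# and direct invocation assumes the click decoration does not prevent calling
# the underlying function. A 10 ppm precursor window is expressed as 1e-5, the
# thresholds are re-sorted ascending inside the function, and cache_size bounds
# how many scans the ScanProxyContext keeps in memory at once when in_memory
# is False.
def _example_cluster():
    spectrum_clustering(
        ["run_a.mzML", "run_b.mzML"],
        precursor_error_tolerance=1e-5,
        similarity_thresholds=[0.1, 0.4, 0.7],
        output_path="clusters.txt",
        in_memory=False,
        cache_size=2**10)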
def main(path, reference_path):
    reader = ProcessedMzMLDeserializer(get_opener(path))
    reference_reader = ProcessedMzMLDeserializer(get_opener(reference_path))
    compare_readers(reader, reference_reader)
    print("Processed Files Appear to Match Perfectly.")
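# Hedged usage sketch for the comparison entry point above; the paths are
# hypothetical. This assumes compare_readers raises or asserts on the first
# mismatch, since the success message only prints after it returns, and that
# get_opener transparently handles gzip-compressed inputs as it does in the
# test at the top of this section.
def _example_compare():
    main("sample.preprocessed.mzML.gz", "reference.preprocessed.mzML.gz")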