Example #1
    def fetch_strings(table, id_col):
        collected_segments = defaultdict(list)
        for row in table.iterrows():
            id_ = row[id_col]
            segment_id = row["segment_id"]
            segment = row["segment"]
            collected_segments[id_].append((segment_id, segment))

        id_provider = IdProvider()
        for id_, segments in collected_segments.iteritems():
            segments.sort()  # sorts by segment_id
            full_str = "".join(s for (segment_id, s) in segments)
            id_provider.set_(id_, full_str)
        return id_provider
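
fetch_strings is the read-side counterpart of CompressedDataWriter.add_string in Example #3 below: strings are stored as fixed-size CHUNKLEN segments and reassembled by sorting on segment_id. Below is a minimal sketch of that round trip, assuming an illustrative CHUNKLEN of 4 and a plain list of dicts standing in for the pytables table; it is not the project's actual storage code.

    # illustrative sketch only: CHUNKLEN and the in-memory rows list are assumptions
    CHUNKLEN = 4

    def split_string(id_, string):
        # mimics CompressedDataWriter.add_string: one row per CHUNKLEN-sized segment
        rows = []
        for segment_id, imin in enumerate(range(0, len(string), CHUNKLEN)):
            rows.append({"aa_sequence_id": id_,
                         "segment_id": segment_id,
                         "segment": string[imin:imin + CHUNKLEN]})
        return rows

    def join_segments(rows, id_col):
        # mimics fetch_strings: group by id, sort by segment_id, concatenate
        collected = {}
        for row in rows:
            collected.setdefault(row[id_col], []).append((row["segment_id"], row["segment"]))
        return dict((id_, "".join(s for (_, s) in sorted(segments)))
                    for id_, segments in collected.items())

    rows = split_string(1, "PEPTIDESEQ")
    assert join_segments(rows, "aa_sequence_id") == {1: "PEPTIDESEQ"}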
Example #2
class CollectHitsData(object):

    def __init__(self, path_root_dir):
        pep_files = _find_pep_xml_files(path_root_dir)
        if not pep_files:
            raise Exception("no pep.xml file found below %s" % path_root_dir)
        if len(pep_files) > 1:
            raise Exception("multiple pepe files %s found below %s" % (", ".join(pep_files),
                                                                       path_root_dir))

        self.summed_sizes = 0
        self.hit_id_provider = IdProvider(10000000)
        self.pep_file = pep_files[0]

        self.summed_sizes += os.stat(self.pep_file).st_size

        logger.info("input pep.xml file is %s" % self.pep_file)

        self.peak_map_files = _find_mz_xml_files(path_root_dir)
        if self.peak_map_files:
            for f in self.peak_map_files.values():
                self.summed_sizes += os.stat(f).st_size
                logger.info("found peak map file %s" % f)
        else:
            logger.error("found no peak map files below %s" % path_root_dir)
            raise Exception("no peak maps found below %s" % path_root_dir)

        self.feature_map_files = _find_feature_xml_files(path_root_dir)
        if self.feature_map_files:
            for f in self.feature_map_files.values():
                self.summed_sizes += os.stat(f).st_size
                logger.info("found feature map file %s" % f)
        else:
            logger.error("found no feature map files below %s" % path_root_dir)
            raise Exception("no feature maps found below %s" % path_root_dir)

    def _extract_hits(self, peps, mz_tolerance_ppm, rt_tolerance_s):

        hits = []
        hit_finders = defaultdict(lambda: HitFinder(rt_tolerance_s, mz_tolerance_ppm))
        for pep in peps:
            li = []
            pep.getKeys(li)
            rt = pep.getRT()
            #  if rt == 0.0:
            #      continue
            mz = pep.getMZ()
            base_name = pep.getBaseName()
            base_name = os.path.basename(base_name)
            base_name, __, __ = base_name.partition("~")
            is_higher_score_better = pep.isHigherScoreBetter()
            for ph in pep.getHits():
                aa_sequence = ph.getSequence().toString()
                score = ph.getScore()
                hit_id = self.hit_id_provider.next_id()
                charge = ph.getCharge()
                hit = Hit(hit_id, aa_sequence, base_name, mz, rt, charge, score, is_higher_score_better)
                hits.append(hit)
                hit_finders[base_name].add_hit(hit)
        logger.info("extracted %d peptide hits" % len(hits))
        return hits, hit_finders

    def collect(self, out_file, unmatched_hits_file, mz_tolerance_ppm=20.0, rt_tolerance_s=5.0):
        with measure_time("collecting and compressing data for visualisation"):
            self._collect(out_file, unmatched_hits_file, mz_tolerance_ppm, rt_tolerance_s)

    def _collect(self, out_file, unmatched_hits_file, mz_tolerance_ppm, rt_tolerance_s):
        writer = CompressedDataWriter(out_file)

        # extract and write all hits from pep xml file:
        prots, peps = self._read_identifications()
        hits, hit_finders = self._extract_hits(peps, mz_tolerance_ppm, rt_tolerance_s)
        writer.write_hits(hits)
        logger.info("wrote hits")

        # find features related to hits
        self._match_and_write_features(hits, hit_finders, writer)

        # collect ms1 spectra and find ms2 spectra related to hits
        self._match_and_write_spectra(hits, hit_finders, unmatched_hits_file, writer)

        writer.close()

        self._report_size_of_final_file(out_file)

    def _read_identifications(self):
        prots, peps = [], []
        with measure_time("reading identifications"):
            fh = oms.PepXMLFile()
            e = oms.MSExperiment()
            fh.load(self.pep_file, prots, peps, "", e, 1)     # use precursor data
        logger.info("got %d protein and %d peptide identifications" % (len(prots), len(peps),))
        return prots, peps

    def _match_and_write_features(self, hits, hit_finders, writer):
        for p in self.feature_map_files.values():
            with measure_time("match features from %s" % p):
                feature_counter = 0
                feature_map = oms.FeatureMap()
                oms.FeatureXMLFile().load(p, feature_map)
                base_file_name, __, __ = os.path.basename(p).partition("~")
                base_name, __ = os.path.splitext(base_file_name)
                hit_finder = hit_finders.get(base_name)
                if hit_finder is None:
                    logger.warn("no hits in .pep.xml for features in %s" % p)
                    continue
                for feature in feature_map:
                    feature_id = writer.add_feature(feature, base_name)
                    feature_counter += 1
                    for pep_id in feature.getPeptideIdentifications():
                        rt = pep_id.getRT()
                        mz = pep_id.getMZ()
                        is_higher_score_better = pep_id.isHigherScoreBetter()
                        for oms_hit in pep_id.getHits():
                            for hit in hit_finder.find_hits(rt, mz):
                                if hit.aa_sequence == oms_hit.getSequence().toString():
                                    break
                            else:
                                # no matching hit found in the loop above, so create a new one:
                                hit_id = self.hit_id_provider.next_id()
                                aa_sequence = oms_hit.getSequence().toString()
                                score = oms_hit.getScore()
                                charge = oms_hit.getCharge()
                                hit = Hit(hit_id, aa_sequence, base_name, mz, rt, charge, score,
                                          is_higher_score_better)
                                hits.append(hit)
                                writer.add_hit(hit)
                            writer.link_feature_with_hit(feature_id, hit)

            logger.info("inserted %d features" % feature_counter)

    def _report_size_of_final_file(self, out_file):

        final_bytes = os.stat(out_file).st_size
        logger.info("target file %s written and closed" % out_file)
        logger.info("size of all input files: %s" % (format_bytes(self.summed_sizes)))
        logger.info("size of compressed file: %s" % (format_bytes(final_bytes)))
        factor = float(self.summed_sizes) / final_bytes
        logger.info("compression factor is %.1f" % factor)

    def _match_and_write_spectra(self, hits, hit_finders, unmatched_hits_file, writer):
        matched_hit_ids = set()
        for base_name, path in self.peak_map_files.items():
            hit_finder = hit_finders[base_name]
            with measure_time("fetching peaks from %s" % path):
                consumer = Consumer(writer, hit_finder, matched_hit_ids, base_name)
                mzxml_file = oms.MzXMLFile()
                mzxml_file.transform(path, consumer)
                logger.info("rt range in this file is %.1f ... %.1f seconds" % (consumer.min_rt,
                                                                                consumer.max_rt))

        all_hit_ids = set(h.id_ for h in hits)
        missing_hit_ids = sorted(all_hit_ids - matched_hit_ids)

        if missing_hit_ids:
            logger.warn("did not find ms2 spectrum for %d hits" % len(missing_hit_ids))
            if unmatched_hits_file is not None:
                i = 0
                hits.sort(key=lambda hit: (hit.base_name, hit.rt))
                with open(unmatched_hits_file, "w") as fp:
                    for hit in hits:
                        if hit.id_ in missing_hit_ids:
                            if i > 0:
                                print >> fp, "-" * 79
                            print >> fp, "%4d" % i, hit.aa_sequence
                            print >> fp, "base_name=", hit.base_name,
                            print >> fp, "mz=%10.5f" % hit.mz, "rt=%.1f" % hit.rt
                            i += 1
                logger.info("wrote unmatched hits to %s" % unmatched_hits_file)
        else:
            logger.info("found ms2 spectrum for all hits")
Example #3
class CompressedDataWriter(object):

    class AASequence(IsDescription):

        """ pytables has no variable length string arrays, so we split strings into segments
            of size CHUNKLEN indexed by segment_id
        """

        aa_sequence_id = Int64Col()     # no uint, as pytables can not index uints
        segment_id = Int8Col()     # no uint, as pytables can not index uints
        segment = StringCol(CHUNKLEN)

    class HitsPerAASequenceCounter(IsDescription):

        """ counts number of hits per aa sequenc
        """
        aa_sequence_id = Int64Col()     # no uint, as pytables can not index uints
        hit_count = UInt64Col()

    class BaseName(IsDescription):

        """ pytables has no variable length string arrays, so we split strings into segments
            of size CHUNKLEN indexed by segment_id
        """

        base_name_id = Int16Col()     # no uint, as pytables can not index uints
        segment_id = Int8Col()     # no uint, as pytables can not index uints
        segment = StringCol(CHUNKLEN)

    class Spectrum(IsDescription):

        """ we keep all peaks in two huge arrays for mz and intensities.
            this table references a spectrum with id 'spec_id' to mz[i_low:ihigh, :]
            and intensity[i_low:i_high, :]
        """

        spec_id = Int64Col()       # no uint, as pytables can not index uints
        base_name_id = Int64Col()  # no uint, as pytables can not index uints
        ms_level = UInt8Col()
        rt = Float32Col()
        i_low = UInt64Col()
        i_high = UInt64Col()

    class Feature(IsDescription):

        feature_id = Int64Col()     # no uint, as pytables can not index uints
        base_name_id = Int64Col()   # ditto
        feature_id_from_file = UInt64Col()  # size_t in OpenMS
        rtmin = Float32Col()
        rtmax = Float32Col()
        mzmin = Float64Col()
        mzmax = Float64Col()
        area = Float32Col()

    class MassTrace(IsDescription):

        mass_trace_id = Int64Col()     # no uint, as pytables can not index uints
        feature_id = Int64Col()       # ditto
        rtmin = Float32Col()
        rtmax = Float32Col()
        mzmin = Float64Col()
        mzmax = Float64Col()
        area = Float32Col()

    class HitFeatureLink(IsDescription):

        hit_id = Int64Col()           # no uint, as pytables can not index uints
        feature_id = Int64Col()       # ditto

    class HitData(IsDescription):

        hit_id = Int64Col()     # no uint, as pytables can not index uints
        base_name_id = Int16Col()
        mz = Float64Col()
        rt = Float32Col()
        aa_sequence_id = Int64Col()
        score = Float64Col()
        is_higher_score_better = BoolCol()
        charge = UInt8Col()

    class HitSpectrumLink(IsDescription):

        """
        in most cases we have one spec linked to 0..n hits, but depending on tolerance,
        this might be n:m, that is we find multiple specs for one hit....
        """

        hit_id = Int64Col()     # no uint, as pytables can not index uints
        spec_id = Int64Col()     # no uint, as pytables can not index uints

    def __init__(self, path):
        self.path = path
        self.file_ = open_file(path, mode="w")
        self.root = self.file_.root

        filters = Filters(complib='blosc', complevel=9)

        self.aa_sequence_table = self.file_.create_table(self.root,
                                                         'aa_sequences',
                                                         self.AASequence,
                                                         "AASequences",
                                                         filters=filters)

        self.hit_counts_table = self.file_.create_table(self.root,
                                                        'hit_counts',
                                                        self.HitsPerAASequenceCounter,
                                                        "HitsPerAASequenceCounter",
                                                        filters=filters)

        self.base_name_table = self.file_.create_table(self.root,
                                                       'base_names',
                                                       self.BaseName,
                                                       "BaseNames",
                                                       filters=filters)

        self.spectrum_table = self.file_.create_table(self.root,
                                                      'spectra',
                                                      self.Spectrum,
                                                      "Spectra",
                                                      filters=filters)

        self.feature_table = self.file_.create_table(self.root,
                                                     'features',
                                                     self.Feature,
                                                     "Features",
                                                     filters=filters)

        self.mass_trace_table = self.file_.create_table(self.root,
                                                        'mass_traces',
                                                        self.MassTrace,
                                                        "MassTraces",
                                                        filters=filters)

        self.hit_feature_link_table = self.file_.create_table(self.root,
                                                              'hit_feature_links',
                                                              self.HitFeatureLink,
                                                              "HitFeatureLink",
                                                              filters=filters)

        self.hit_data_table = self.file_.create_table(self.root,
                                                      "hit_data",
                                                      self.HitData,
                                                      "HitData",
                                                      filters=filters)

        self.hit_spectrum_link_table = self.file_.create_table(self.root,
                                                               "hit_spectrum_links",
                                                               self.HitSpectrumLink,
                                                               "HitSpectrumLinks",
                                                               filters=filters)

        self.mz_array = self.file_.create_earray(self.root,
                                                 'mzs',
                                                 Float64Atom(),
                                                 (0,),
                                                 filters=filters,)

        self.intensity_array = self.file_.create_earray(self.root,
                                                        'intensities',
                                                        Float32Atom(),
                                                        (0,),
                                                        filters=filters,)

        self.base_name_id_provider = IdProvider()
        self.aa_sequence_id_provider = IdProvider()
        self.peak_imin = 0
        self.peak_imax = 0
        self.spec_id_provider = IdProvider()
        self.feature_id_provider = IdProvider()
        self.mass_trace_id_provider = IdProvider()

    @staticmethod
    def add_string(table, id_col, id_, string):
        """ pytables has no variable length string arrays, so we split strings into chunks """
        row = table.row
        for i, imin in enumerate(xrange(0, len(string), CHUNKLEN)):
            segment = string[imin:imin + CHUNKLEN]
            row[id_col] = id_
            row['segment_id'] = i
            row['segment'] = segment
            row.append()
        table.flush()

    def add_aa_sequence(self, sequence):
        id_ = self.aa_sequence_id_provider.register(sequence)
        CompressedDataWriter.add_string(self.aa_sequence_table, "aa_sequence_id", id_, sequence)
        return id_

    def finish_writing_aa_sequences(self):
        self.aa_sequence_table.flush()
        self.hit_counts_table.flush()

    def add_base_name(self, name):
        id_ = self.base_name_id_provider.register(name)
        CompressedDataWriter.add_string(self.base_name_table, "base_name_id", id_, name)
        return id_

    def finish_writing_base_names(self):
        self.base_name_table.flush()

    def _add_aa_sequences(self, hits):

        for aa_sequence in set(h.aa_sequence for h in hits):
            self.add_aa_sequence(aa_sequence)

        for aa_sequence, count in Counter((h.aa_sequence for h in hits)).iteritems():
            id_ = self.aa_sequence_id_provider.lookup_id(aa_sequence)
            assert id_ is not None, "may not happen"
            row = self.hit_counts_table.row
            row["aa_sequence_id"] = id_
            row["hit_count"] = count
            row.append()

        self.finish_writing_aa_sequences()

    def _add_base_names(self, base_names):
        for base_name in set(base_names):
            self.add_base_name(base_name)
        self.finish_writing_base_names()

    def _lookup_or_insert_base_name(self, base_name):
        base_name_id = self.base_name_id_provider.lookup_id(base_name)
        if base_name_id is None:
            base_name_id = self.add_base_name(base_name)
        return base_name_id

    def _lookup_or_insert_aa_sequence(self, aa_sequence):
        aa_sequence_id = self.aa_sequence_id_provider.lookup_id(aa_sequence)
        if aa_sequence_id is None:
            aa_sequence_id = self.add_aa_sequence(aa_sequence)
        return aa_sequence_id

    def add_hit(self, hit):
        base_name_id = self._lookup_or_insert_base_name(hit.base_name)
        aa_sequence_id = self._lookup_or_insert_aa_sequence(hit.aa_sequence)
        row = self.hit_data_table.row
        row["hit_id"] = hit.id_
        row["base_name_id"] = base_name_id
        row["rt"] = hit.rt
        row["mz"] = hit.mz
        row["aa_sequence_id"] = aa_sequence_id
        row["score"] = hit.score
        row["charge"] = hit.charge
        row["is_higher_score_better"] = hit.is_higher_score_better
        row.append()

    def write_hits(self, hits):
        # it is important to write the aa sequences and base names before the hits,
        # as hits reference aa sequences and base names by id
        self._add_aa_sequences(hits)
        self._add_base_names(set(h.base_name for h in hits))
        for hit in hits:
            self.add_hit(hit)

    def add_spectrum(self, spec, base_name):
        base_name_id = self._lookup_or_insert_base_name(base_name)
        rt, mzs, intensities, precursors, ms_level = spec
        self.mz_array.append(mzs)
        self.intensity_array.append(intensities)
        self.peak_imax += mzs.shape[0]
        # register peaks
        row = self.spectrum_table.row
        row["spec_id"] = self.spec_id_provider.next_id()
        row["base_name_id"] = base_name_id
        row["ms_level"] = ms_level
        row["rt"] = rt
        row["i_low"] = self.peak_imin
        row["i_high"] = self.peak_imax
        id_ = row["spec_id"]  # row.append() below destroys content of row !
        row.append()
        self.peak_imin = self.peak_imax
        return id_

    def link_spec_with_hit(self, spec_id, hit_id):
        row = self.hit_spectrum_link_table.row
        row["spec_id"] = spec_id
        row["hit_id"] = hit_id
        row.append()

    def add_feature(self, feature, base_name):
        hull = feature.getConvexHull()
        rtmin, rtmax, mzmin, mzmax = self._range(hull)
        row = self.feature_table.row
        base_name_id = self.base_name_id_provider.lookup_id(base_name)
        fid = np.uint64(feature.getUniqueId())
        feature_id = self.feature_id_provider.next_id()
        row["feature_id"] = feature_id
        row["feature_id_from_file"] = fid
        row["base_name_id"] = base_name_id
        row["rtmin"] = rtmin
        row["rtmax"] = rtmax
        row["mzmin"] = mzmin
        row["mzmax"] = mzmax
        row["area"] = (rtmax - rtmin) * (mzmax - mzmin)
        row.append()

        for hull in feature.getConvexHulls():
            rtmin, rtmax, mzmin, mzmax = self._range(hull)
            row = self.mass_trace_table.row
            row["mass_trace_id"] = self.mass_trace_id_provider.next_id()
            row["feature_id"] = feature_id
            row["rtmin"] = rtmin
            row["rtmax"] = rtmax
            row["mzmin"] = mzmin
            row["mzmax"] = mzmax
            row["area"] = (rtmax - rtmin) * (mzmax - mzmin)
            row.append()

        return feature_id

    @staticmethod
    def _range(hull):
        hull_points = hull.getHullPoints()
        assert isinstance(hull_points, np.ndarray)
        assert hull_points.shape == (4, 2)   # 4 points, 2 coordinates
        rtmin, mzmin = hull_points.min(axis=0)
        rtmax, mzmax = hull_points.max(axis=0)
        return rtmin, rtmax, mzmin, mzmax

    def link_feature_with_hit(self, feature_id, hit):
        row = self.hit_feature_link_table.row
        row["feature_id"] = feature_id
        row["hit_id"] = hit.id_
        row.append()

    def close(self):
        self.aa_sequence_table.flush()
        self.aa_sequence_table.close()

        self.spectrum_table.flush()
        self.spectrum_table.cols.spec_id.create_index()
        self.spectrum_table.cols.ms_level.create_index()
        self.spectrum_table.cols.rt.create_index()
        self.spectrum_table.flush()
        self.spectrum_table.close()

        self.hit_data_table.flush()
        self.hit_data_table.cols.hit_id.create_index()
        self.hit_data_table.cols.aa_sequence_id.create_index()
        self.hit_data_table.flush()
        self.hit_data_table.close()

        self.hit_spectrum_link_table.flush()
        self.hit_spectrum_link_table.cols.hit_id.create_index()
        self.hit_spectrum_link_table.cols.spec_id.create_index()
        self.hit_spectrum_link_table.flush()
        self.hit_spectrum_link_table.close()

        self.mass_trace_table.flush()
        self.mass_trace_table.cols.mass_trace_id.create_index()
        self.mass_trace_table.cols.feature_id.create_index()
        self.mass_trace_table.cols.rtmin.create_index()
        self.mass_trace_table.cols.rtmax.create_index()
        self.mass_trace_table.cols.mzmin.create_index()
        self.mass_trace_table.cols.mzmax.create_index()
        self.mass_trace_table.flush()
        self.mass_trace_table.close()

        self.feature_table.flush()
        self.feature_table.cols.feature_id.create_index()
        self.feature_table.cols.rtmin.create_index()
        self.feature_table.cols.rtmax.create_index()
        self.feature_table.cols.mzmin.create_index()
        self.feature_table.cols.mzmax.create_index()
        self.feature_table.flush()
        self.feature_table.close()

        self.hit_feature_link_table.flush()
        self.hit_feature_link_table.cols.hit_id.create_index()
        self.hit_feature_link_table.cols.feature_id.create_index()
        self.hit_feature_link_table.flush()
        self.hit_feature_link_table.close()

        self.mz_array.flush()
        self.mz_array.close()

        self.intensity_array.flush()
        self.intensity_array.close()

        self.file_.close()
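
To show how the layout described in the Spectrum docstring is meant to be read back, here is a hedged read-side sketch: it opens the file with pytables and slices the shared mzs/intensities arrays using the i_low/i_high range stored per spectrum row. Only the table and array names are taken from __init__ above; the function name and the query are illustrative assumptions, not part of the project.

    import tables

    def read_spectrum_peaks(path, spec_id):
        # illustrative reader, returns (rt, mzs, intensities) for one spectrum
        with tables.open_file(path, mode="r") as f:
            node = f.root
            for row in node.spectra.where("spec_id == %d" % spec_id):
                i_low, i_high = int(row["i_low"]), int(row["i_high"])
                # the peaks of this spectrum live in the shared 1-d arrays
                return float(row["rt"]), node.mzs[i_low:i_high], node.intensities[i_low:i_high]
        raise KeyError("no spectrum with spec_id=%d" % spec_id)

The link tables written above (hit_spectrum_links, hit_feature_links) can be queried the same way through their indexed hit_id and spec_id / feature_id columns and joined against hit_data.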