def setUp(self):
    """Set up a file-based test fixture.

    Reads the peakgroups CSV test data, builds an MRExperiment with all
    multipeptides, aligns every run against every run (linear model) and
    selects two specific multipeptides for the individual tests.
    """
    # Set up dirs (os.path.join accepts multiple components)
    self.dirname = os.path.dirname(os.path.abspath(__file__))
    self.topdir = os.path.join(self.dirname, "..", "..")
    self.datadir = os.path.join(self.topdir, "test", "data")
    self.scriptdir = os.path.join(self.topdir, "analysis")

    # Set up files
    peakgroups_file = os.path.join(self.datadir, "imputeValues/imputeValues_5_input.csv")

    # Parameters
    self.initial_alignment_cutoff = 0.0001
    fdr_cutoff_all_pg = 1.0  # keep all peakgroups, no FDR filtering
    max_rt_diff = 30

    # Read input
    reader = SWATHScoringReader.newReader([peakgroups_file], "openswath", readmethod="complete")
    self.new_exp = MRExperiment()
    self.new_exp.runs = reader.parse_files()
    self.multipeptides = self.new_exp.get_all_multipeptides(fdr_cutoff_all_pg, verbose=False)

    # Align all against all
    self.tr_data = transformations.LightTransformationData()
    spl_aligner = SplineAligner(self.initial_alignment_cutoff)
    for run_0 in self.new_exp.runs:
        for run_1 in self.new_exp.runs:
            helper.addDataToTrafo(self.tr_data, run_0, run_1, spl_aligner,
                                  self.multipeptides, "linear", max_rt_diff)

    # Select two interesting peptides
    pepname = "21517_C[160]NVVISGGTGSGK/2_run0 0 0"
    self.current_mpep1 = [m for m in self.multipeptides if m.getAllPeptides()[0].get_id() == pepname][0]
    pepname = "26471_GYEDPPAALFR/2_run0 0 0"
    self.current_mpep2 = [m for m in self.multipeptides if m.getAllPeptides()[0].get_id() == pepname][0]
def setUp(self):
    """Set up a synthetic test fixture with five mock runs.

    Builds three "anchor" multipeptides (one high-confidence peakgroup per
    run, RTs increasing linearly across runs) used for the run-to-run
    alignment, plus one multipeptide under test ("precursor_1") in which
    some runs carry an extra peakgroup and some runs lack the correct one.
    Finally aligns all runs against all runs with a linear model.
    """
    import msproteomicstoolslib.data_structures.Precursor as precursor
    import msproteomicstoolslib.data_structures.PrecursorGroup as precursor_group
    import msproteomicstoolslib.format.TransformationCollection as transformations
    from msproteomicstoolslib.algorithms.alignment.SplineAligner import SplineAligner
    import msproteomicstoolslib.algorithms.alignment.AlignmentHelper as helper

    def add_precursor(mpep, run, seq, pg_tuples):
        # Build one Precursor holding the given peakgroup tuples, wrap it in
        # a PrecursorGroup and insert it into the multipeptide for this run.
        p = precursor.Precursor(seq, run)
        for tpl in pg_tuples:
            p.add_peakgroup_tpl(tpl, seq, -1)
        prgr = precursor_group.PrecursorGroup(p.get_id(), run)
        prgr.addPrecursor(p)
        mpep.insert(run.get_id(), prgr)

    # Peakgroup tuple layout:
    #   0. id
    #   1. quality score (FDR)
    #   2. retention time (normalized)
    #   3. intensity
    mpeps = [Multipeptide() for _ in range(3)]
    for m in mpeps:
        m.set_nr_runs(5)

    # Parameters
    self.initial_alignment_cutoff = 0.001

    runs = [MockRun("0_%s" % (i + 1)) for i in range(5)]
    ids = 0
    for i in range(5):
        # Two alignment (anchor) peptides plus one noise peptide per run;
        # peakgroup ids are numbered consecutively across peptides and runs.
        for mpep, (seq, rt) in zip(mpeps, [
                ("anchorpeptide_1", 100 + i * 10),
                ("anchorpeptide_2", 1000 + i * 100),
                ("anchorpeptide_3", 500 + i * 40)]):
            add_precursor(mpep, runs[i], seq,
                          [("id_%s" % ids, 0.0001, rt, 10000)])
            ids += 1

    # The multipeptide under test
    m = Multipeptide()
    m.set_nr_runs(5)
    # Run 1: peakgroup 1, RT = 100 [correct]
    add_precursor(m, runs[0], "precursor_1",
                  [("peakgroup1", 0.01, 100, 10000)])
    # Run 2: peakgroup 2, RT = 112 [correct]; peakgroup 3, RT = 130
    add_precursor(m, runs[1], "precursor_1",
                  [("peakgroup2", 0.2, 112, 10000),
                   ("peakgroup3", 0.18, 130, 10000)])
    # Run 3: peakgroup 4, RT = 120 [correct]; peakgroup 5, RT = 130
    add_precursor(m, runs[2], "precursor_1",
                  [("peakgroup4", 0.2, 120, 10000),
                   ("peakgroup5", 0.17, 130, 10000)])
    # Run 4: peakgroup 6, missing [correct]; peakgroup 7, RT = 145
    add_precursor(m, runs[3], "precursor_1",
                  [("peakgroup7", 0.18, 145, 10000)])
    # Run 5: peakgroup 8, RT = 139 [correct]; peakgroup 9, missing
    add_precursor(m, runs[4], "precursor_1",
                  [("peakgroup8", 0.1, 139, 10000)])

    self.mpep = m
    self.exp = Dummy()
    self.exp.runs = runs
    mpeps.append(m)
    self.multipeptides = mpeps

    # Align all against all (linear model, max RT diff 30)
    self.tr_data = transformations.LightTransformationData()
    spl_aligner = SplineAligner(self.initial_alignment_cutoff)
    for run_0 in self.exp.runs:
        for run_1 in self.exp.runs:
            helper.addDataToTrafo(self.tr_data, run_0, run_1, spl_aligner,
                                  self.multipeptides, "linear", 30)
def setUp(self):
    """Build a five-run synthetic fixture.

    Creates three anchor multipeptides (one confident peakgroup per run,
    with retention times rising linearly across runs) for alignment, plus
    a fourth multipeptide ("precursor_1") whose runs contain a mixture of
    correct, extra and missing peakgroups. All runs are then aligned
    pairwise with a linear model.
    """
    import msproteomicstoolslib.data_structures.Precursor as precursor
    import msproteomicstoolslib.data_structures.PrecursorGroup as precursor_group
    import msproteomicstoolslib.format.TransformationCollection as transformations
    from msproteomicstoolslib.algorithms.alignment.SplineAligner import SplineAligner
    import msproteomicstoolslib.algorithms.alignment.AlignmentHelper as helper

    n_runs = 5

    # Peakgroup tuples are (id, quality score (FDR), normalized RT, intensity)
    mpeps = [Multipeptide() for _ in range(3)]
    for mp in mpeps:
        mp.set_nr_runs(n_runs)

    # Parameters
    self.initial_alignment_cutoff = 0.001

    runs = [MockRun("0_%s" % (k + 1)) for k in range(n_runs)]

    # Per-run anchor/noise peptides: (sequence, base RT, RT step per run).
    # Peakgroup ids ("id_0", "id_1", ...) are assigned consecutively in
    # the order anchor_1, anchor_2, anchor_3 within each run.
    anchor_specs = [
        ("anchorpeptide_1", 100, 10),
        ("anchorpeptide_2", 1000, 100),
        ("anchorpeptide_3", 500, 40),  # the noise peptide
    ]
    ids = 0
    for k in range(n_runs):
        for target, (seq, base_rt, step) in zip(mpeps, anchor_specs):
            pep = precursor.Precursor(seq, runs[k])
            pep.add_peakgroup_tpl(
                ("id_%s" % ids, 0.0001, base_rt + k * step, 10000), seq, -1)
            group = precursor_group.PrecursorGroup(pep.get_id(), runs[k])
            group.addPrecursor(pep)
            target.insert(runs[k].get_id(), group)
            ids += 1

    # Peakgroups of "precursor_1" per run: run 1 has only the correct one
    # (RT 100); runs 2 and 3 carry an additional interfering peakgroup;
    # run 4 is missing the correct one; run 5 is missing the second one.
    pg_by_run = [
        [("peakgroup1", 0.01, 100, 10000)],
        [("peakgroup2", 0.2, 112, 10000), ("peakgroup3", 0.18, 130, 10000)],
        [("peakgroup4", 0.2, 120, 10000), ("peakgroup5", 0.17, 130, 10000)],
        [("peakgroup7", 0.18, 145, 10000)],
        [("peakgroup8", 0.1, 139, 10000)],
    ]
    test_mpep = Multipeptide()
    test_mpep.set_nr_runs(n_runs)
    for k, tuples in enumerate(pg_by_run):
        pep = precursor.Precursor("precursor_1", runs[k])
        for tpl in tuples:
            pep.add_peakgroup_tpl(tpl, "precursor_1", -1)
        group = precursor_group.PrecursorGroup(pep.get_id(), runs[k])
        group.addPrecursor(pep)
        test_mpep.insert(runs[k].get_id(), group)

    self.mpep = test_mpep
    self.exp = Dummy()
    self.exp.runs = runs
    mpeps.append(test_mpep)
    self.multipeptides = mpeps

    # Pairwise linear alignment of every run against every run
    self.tr_data = transformations.LightTransformationData()
    spl_aligner = SplineAligner(self.initial_alignment_cutoff)
    for run_0 in self.exp.runs:
        for run_1 in self.exp.runs:
            helper.addDataToTrafo(self.tr_data, run_0, run_1, spl_aligner,
                                  self.multipeptides, "linear", 30)
def runSingleFileImputation(options, peakgroups_file, mzML_file, method, is_test):
    """Impute values across chromatograms

    Args:
        options: parsed command-line options (uses file_format,
            disable_isotopic_grouping, realign_method, border_option,
            disable_isotopic_transfer)
        peakgroups_file(filename): CSV file containing all peakgroups
        mzML_file(filename): mzML file containing chromatograms
        method(str): either "singleClosestRun" or "singleShortestPath"
        is_test(bool): passed through to analyze_multipeptides

    Returns:
        A tuple of:
            new_exp(AlignmentExperiment): experiment containing the aligned peakgroups
            multipeptides(list(AlignmentHelper.Multipeptide)): list of multipeptides

    Raises:
        Exception: if method is not one of the two supported values.

    This function will read the csv file with all peakgroups as well as the
    provided chromatogram file (.chrom.mzML). It will then try to impute
    missing values for those peakgroups where no values is currently
    present, reading the raw chromatograms.
    """

    # We do not want to exclude any peakgroups for noiseIntegration (we assume
    # that alignment has already happened)
    fdr_cutoff_all_pg = 1.0

    start = time.time()
    reader = SWATHScoringReader.newReader([peakgroups_file],
                                          options.file_format,
                                          readmethod="complete",
                                          enable_isotopic_grouping = not options.disable_isotopic_grouping)
    new_exp = Experiment()
    new_exp.runs = reader.parse_files()
    multipeptides = new_exp.get_all_multipeptides(fdr_cutoff_all_pg, verbose=False)
    print("Parsing the peakgroups file took %ss" % (time.time() - start) )

    mapping = {}
    precursors_mapping = {}
    sequences_mapping = {}
    protein_mapping = {}
    inferMapping([ mzML_file ], [ peakgroups_file ], mapping, precursors_mapping,
                 sequences_mapping, protein_mapping, verbose=False)
    # items() (not the py2-only iteritems()) so this also runs on Python 3
    mapping_inv = {v[0]: k for k, v in mapping.items()}
    if VERBOSE:
        print(mapping)

    # Do only a single run : read only one single file
    start = time.time()
    swath_chromatograms = SwathChromatogramCollection()
    swath_chromatograms.parseFromMzML([ mzML_file ], mapping_inv)
    print("Reading the chromatogram files took %ss" % (time.time() - start) )
    assert len(swath_chromatograms.getRunIDs() ) == 1
    rid = swath_chromatograms.getRunIDs()[0]

    start = time.time()
    initial_alignment_cutoff = 0.0001
    max_rt_diff = 30
    sd_data = -1 # We do not use the standard deviation data in this algorithm
    tr_data = transformations.LightTransformationData()
    spl_aligner = SplineAligner(initial_alignment_cutoff)

    if method == "singleClosestRun":
        # Align every run directly against the run present in the mzML file
        tree_mapped = None
        run_1 = [r for r in new_exp.runs if r.get_id() == rid][0]
        dist_matrix = getDistanceMatrix(new_exp, multipeptides, spl_aligner,
                                        singleRowId=run_1.get_id())
        print("Distance matrix took %ss" % (time.time() - start) )
        start = time.time()
        for run_0 in new_exp.runs:
            helper.addDataToTrafo(tr_data, run_0, run_1,
                                  spl_aligner, multipeptides, options.realign_method,
                                  max_rt_diff, sd_max_data_length=sd_data)
    elif method == "singleShortestPath":
        # Align along the edges of the minimum spanning tree of the runs
        dist_matrix = None
        tree = MinimumSpanningTree(getDistanceMatrix(new_exp, multipeptides, spl_aligner))
        tree_mapped = [(new_exp.runs[a].get_id(), new_exp.runs[b].get_id()) for a, b in tree]
        print("Distance matrix took %ss" % (time.time() - start) )
        start = time.time()
        for edge in tree:
            helper.addDataToTrafo(tr_data, new_exp.runs[edge[0]], new_exp.runs[edge[1]],
                                  spl_aligner, multipeptides, options.realign_method,
                                  max_rt_diff, sd_max_data_length=sd_data)
    else:
        raise Exception("Unknown method: " + method)

    print("Alignment took %ss" % (time.time() - start) )
    start = time.time()
    multipeptides = analyze_multipeptides(new_exp, multipeptides, swath_chromatograms,
                                          tr_data, options.border_option, rid,
                                          tree=tree_mapped, mat=dist_matrix,
                                          disable_isotopic_transfer=options.disable_isotopic_transfer,
                                          is_test=is_test)
    print("Analyzing the runs took %ss" % (time.time() - start) )

    return new_exp, multipeptides