def write_out(new_exp, multipeptides, outfile, matrix_outfile, single_outfile):
    """Write the result to disk.

    Args:
        new_exp: experiment object; ``new_exp.runs`` must be non-empty and
            all runs must share the same ``header``.
        multipeptides: sized iterable of multipeptides. Each one is asked
            for ``getAllPeptides()`` and every entry of each peptide's
            ``peakgroups`` is a candidate row.
        outfile(filename): tab-separated output file (overwritten).
        matrix_outfile(filename): when non-empty, a matrix file is written
            to this path via ``helper.write_out_matrix_file``.
        single_outfile: when truthy, only peakgroups whose ``m_score`` is
            larger than 1.0 are written (the original author's note calls
            these "the newly imputed ones"); otherwise every peakgroup is
            written.

    Raises:
        AssertionError: if the runs do not all share the same header.
    """
    # Validate the headers before touching the output file, so a failing
    # check does not leave a truncated file behind.
    header_first = new_exp.runs[0].header
    for run in new_exp.runs:
        assert header_first == run.header

    # Single pre-formatted argument: prints the same text on Python 2 and 3.
    print("number of precursors quantified: %s" % len(multipeptides))

    # write out the complete original files; the context manager guarantees
    # the handle is flushed and closed even if a row fails to serialize.
    with open(outfile, "w") as out_handle:
        writer = csv.writer(out_handle, delimiter="\t")
        writer.writerow(header_first)
        for m in multipeptides:
            for p in m.getAllPeptides():
                for selected_pg in p.peakgroups:
                    # Keep the positive "> 1.0" comparison so that NaN scores
                    # are still filtered out in single-file mode.
                    if (not single_outfile
                            or float(selected_pg.get_value("m_score")) > 1.0):
                        writer.writerow(selected_pg.row)

    if matrix_outfile:
        helper.write_out_matrix_file(matrix_outfile, new_exp.runs,
                                     multipeptides, 0.0,
                                     style=options.matrix_output_method)
def write_out(new_exp, multipeptides, outfile, matrix_outfile, single_outfile=None):
    """Write the result to disk.

    This writes all peakgroups to disk (newly imputed ones as previously found
    ones) as even some "previously good" peakgroups may have changed location
    due to isotopic_transfer.

    Args:
        new_exp: experiment object; ``new_exp.runs`` must be non-empty and
            all runs must share the same ``header``.
        multipeptides: sized iterable of multipeptides; they are written in
            the order of their ``str()`` representation.
        outfile(filename): tab-separated output file (overwritten).
        matrix_outfile(filename): when non-empty, a matrix file is written
            to this path via ``helper.write_out_matrix_file``.
        single_outfile: when not None, only peakgroups whose ``run_id``
            value equals this argument are written.

    Raises:
        AssertionError: if the runs do not all share the same header.
    """
    # Validate the headers before touching the output file, so a failing
    # check does not leave a truncated file behind.
    header_first = new_exp.runs[0].header
    for run in new_exp.runs:
        assert header_first == run.header

    # Single pre-formatted argument: prints the same text on Python 2 and 3.
    print("number of precursors quantified: %s" % len(multipeptides))

    # write out the complete original files; the context manager guarantees
    # the handle is flushed and closed even if a row fails to serialize.
    with open(outfile, "w") as out_handle:
        writer = csv.writer(out_handle, delimiter="\t")
        writer.writerow(header_first)
        # Sorting both levels fixes the row order of the output.
        for m in sorted(multipeptides, key=str):
            for p in m.getAllPeptides():
                for selected_pg in sorted(p.peakgroups):
                    # Only write the values for the requested run (if any)
                    if (single_outfile is None
                            or single_outfile == selected_pg.get_value("run_id")):
                        writer.writerow(selected_pg.row)

    if matrix_outfile:
        helper.write_out_matrix_file(matrix_outfile, new_exp.runs,
                                     multipeptides, 0.0,
                                     style=options.matrix_output_method)
def setUp(self):
    """Read the imputeValues_5 peakgroup fixture, align all runs pairwise
    and pick two multipeptides for the tests to work on."""
    # Set up dirs (everything is resolved relative to this file)
    here = os.path.dirname(os.path.abspath(__file__))
    self.dirname = here
    self.topdir = os.path.join(here, "..", "..")
    self.datadir = os.path.join(self.topdir, "test", "data")
    self.scriptdir = os.path.join(self.topdir, "analysis")

    # Set up files
    peakgroups_file = os.path.join(
        self.datadir, "imputeValues/imputeValues_5_input.csv")
    # NOTE: mzml_file is computed but not referenced again in this method.
    mzml_file = os.path.join(
        self.datadir,
        "imputeValues/r004_small/split_olgas_otherfile.chrom.mzML")

    # Parameters
    self.initial_alignment_cutoff = 0.0001
    fdr_cutoff_all_pg = 1.0
    # NOTE: max_rt_diff is not referenced again; the alignment call below
    # passes the literal 30.
    max_rt_diff = 30

    # Read input
    reader = SWATHScoringReader.newReader(
        [peakgroups_file], "openswath", readmethod="complete")
    experiment = MRExperiment()
    self.new_exp = experiment
    experiment.runs = reader.parse_files()
    self.multipeptides = experiment.get_all_multipeptides(
        fdr_cutoff_all_pg, verbose=False)

    # Align all against all (including every run against itself)
    self.tr_data = transformations.LightTransformationData()
    spl_aligner = SplineAligner(self.initial_alignment_cutoff)
    for source_run in self.new_exp.runs:
        for target_run in self.new_exp.runs:
            helper.addDataToTrafo(self.tr_data, source_run, target_run,
                                  spl_aligner, self.multipeptides,
                                  "linear", 30)

    def first_with_peptide_id(wanted_id):
        # First multipeptide whose first peptide carries the wanted id;
        # raises IndexError when there is no such multipeptide.
        hits = [mpep for mpep in self.multipeptides
                if mpep.getAllPeptides()[0].get_id() == wanted_id]
        return hits[0]

    # Select two interesting peptides
    self.current_mpep1 = first_with_peptide_id(
        "21517_C[160]NVVISGGTGSGK/2_run0 0 0")
    self.current_mpep2 = first_with_peptide_id(
        "26471_GYEDPPAALFR/2_run0 0 0")
def test_matrix_out_2(self):
    """Test the output matrix writers"""
    import msproteomicstoolslib.algorithms.alignment.AlignmentHelper as helper

    runs = self.exp2.runs
    multipeptides = self.multipeptides2

    # Same call for four output file names that differ only in their
    # extension; each file is removed again right after it was written.
    output_names = (
        "tmp.output.csv",
        "tmp.output.tsv",
        "tmp.output.xls",
        "tmp.output.xlsx",
    )
    for tmpfile in output_names:
        helper.write_out_matrix_file(tmpfile, runs, multipeptides, 0.0,
                                     style="full", write_requant=False)
        os.remove(tmpfile)
def setUp(self):
    """Build a mock experiment of five runs holding three anchor peptides and
    one precursor with competing peakgroups, then align all runs pairwise."""
    import msproteomicstoolslib.data_structures.Precursor as precursor
    import msproteomicstoolslib.data_structures.PrecursorGroup as precursor_group
    import msproteomicstoolslib.format.TransformationCollection as transformations
    from msproteomicstoolslib.algorithms.alignment.SplineAligner import SplineAligner
    import msproteomicstoolslib.algorithms.alignment.AlignmentHelper as helper

    def make_group(name, run, peakgroup_tuples):
        # One precursor carrying the given peakgroups, wrapped in its own
        # precursor group.
        prec = precursor.Precursor(name, run)
        for tpl in peakgroup_tuples:
            prec.add_peakgroup_tpl(tpl, name, -1)
        group = precursor_group.PrecursorGroup(prec.get_id(), run)
        group.addPrecursor(prec)
        return group

    # Layout of a peakgroup tuple:
    # 0. id
    # 1. quality score (FDR)
    # 2. retention time (normalized)
    # 3. intensity
    mpeps = [Multipeptide() for _ in range(3)]
    for anchor_mpep in mpeps:
        anchor_mpep.set_nr_runs(5)

    # Parameters
    self.initial_alignment_cutoff = 0.001

    runs = [MockRun("0_%s" % (run_nr + 1)) for run_nr in range(5)]

    # (name, RT in the first run, RT increase per run): two alignment
    # peptides followed by the noise peptide.
    anchor_specs = (
        ("anchorpeptide_1", 100, 10),
        ("anchorpeptide_2", 1000, 100),
        ("anchorpeptide_3", 500, 40),
    )
    next_id = 0
    for run_idx, run in enumerate(runs):
        for anchor_mpep, (name, rt_start, rt_step) in zip(mpeps, anchor_specs):
            tpl = ("id_%s" % next_id, 0.0001,
                   rt_start + run_idx * rt_step, 10000)
            group = make_group(name, run, [tpl])
            anchor_mpep.insert(run.get_id(), group)
            next_id += 1

    m = Multipeptide()
    m.set_nr_runs(5)

    # Peakgroups of "precursor_1" per run as (id, score, RT). "[correct]"
    # marks are carried over from the original fixture notes.
    candidates = (
        # Run 1: peakgroup 1 [correct]
        (("peakgroup1", 0.01, 100),),
        # Run 2: peakgroup 2 [correct], peakgroup 3
        (("peakgroup2", 0.2, 112), ("peakgroup3", 0.18, 130)),
        # Run 3: peakgroup 4 [correct], peakgroup 5
        (("peakgroup4", 0.2, 120), ("peakgroup5", 0.17, 130)),
        # Run 4: peakgroup 6 is missing [correct], only peakgroup 7 present
        (("peakgroup7", 0.18, 145),),
        # Run 5: peakgroup 8 [correct], peakgroup 9 is missing
        (("peakgroup8", 0.1, 139),),
    )
    for run, found in zip(runs, candidates):
        tuples = [(pg_id, score, rt, 10000) for pg_id, score, rt in found]
        group = make_group("precursor_1", run, tuples)
        m.insert(run.get_id(), group)

    self.mpep = m
    self.exp = Dummy()
    self.exp.runs = runs
    mpeps.append(m)
    self.multipeptides = mpeps

    # Align all against all (including every run against itself)
    self.tr_data = transformations.LightTransformationData()
    spl_aligner = SplineAligner(self.initial_alignment_cutoff)
    for source_run in self.exp.runs:
        for target_run in self.exp.runs:
            helper.addDataToTrafo(self.tr_data, source_run, target_run,
                                  spl_aligner, self.multipeptides,
                                  "linear", 30)
def setUp(self):
    """Create five mock runs with three anchor peptides plus one precursor
    that has several candidate peakgroups, and align every run pair."""
    import msproteomicstoolslib.data_structures.Precursor as precursor
    import msproteomicstoolslib.data_structures.PrecursorGroup as precursor_group
    import msproteomicstoolslib.format.TransformationCollection as transformations
    from msproteomicstoolslib.algorithms.alignment.SplineAligner import SplineAligner
    import msproteomicstoolslib.algorithms.alignment.AlignmentHelper as helper

    def add_precursor(target, name, run, peakgroup_tuples):
        # Build a precursor with the given peakgroups, wrap it in a
        # precursor group and register that group in `target` for `run`.
        prec = precursor.Precursor(name, run)
        for tpl in peakgroup_tuples:
            prec.add_peakgroup_tpl(tpl, name, -1)
        group = precursor_group.PrecursorGroup(prec.get_id(), run)
        group.addPrecursor(prec)
        target.insert(run.get_id(), group)

    # Fields of a peakgroup tuple:
    # 0. id
    # 1. quality score (FDR)
    # 2. retention time (normalized)
    # 3. intensity
    mpeps = [Multipeptide() for _ in range(3)]
    for anchor_mpep in mpeps:
        anchor_mpep.set_nr_runs(5)

    # Parameters
    self.initial_alignment_cutoff = 0.001

    runs = [MockRun("0_%s" % (run_nr + 1)) for run_nr in range(5)]

    # Per anchor: (name, RT offset, RT step per run). The first two are the
    # alignment peptides, the last one is the noise peptide.
    anchor_specs = [
        ("anchorpeptide_1", 100, 10),
        ("anchorpeptide_2", 1000, 100),
        ("anchorpeptide_3", 500, 40),
    ]
    ids = 0
    for i in range(5):
        for slot, (name, rt_offset, rt_step) in enumerate(anchor_specs):
            pg_tuple = ("id_%s" % ids, 0.0001, rt_offset + i * rt_step, 10000)
            add_precursor(mpeps[slot], name, runs[i], [pg_tuple])
            ids += 1

    m = Multipeptide()
    m.set_nr_runs(5)

    # "[correct]" marks below are carried over from the original fixture notes.
    # Run 1: peakgroup 1 [correct]
    add_precursor(m, "precursor_1", runs[0], [
        ("peakgroup1", 0.01, 100, 10000),
    ])
    # Run 2: peakgroup 2 [correct] and peakgroup 3
    add_precursor(m, "precursor_1", runs[1], [
        ("peakgroup2", 0.2, 112, 10000),
        ("peakgroup3", 0.18, 130, 10000),
    ])
    # Run 3: peakgroup 4 [correct] and peakgroup 5
    add_precursor(m, "precursor_1", runs[2], [
        ("peakgroup4", 0.2, 120, 10000),
        ("peakgroup5", 0.17, 130, 10000),
    ])
    # Run 4: peakgroup 6 is missing [correct]; only peakgroup 7 is present
    add_precursor(m, "precursor_1", runs[3], [
        ("peakgroup7", 0.18, 145, 10000),
    ])
    # Run 5: peakgroup 8 [correct]; peakgroup 9 is missing
    add_precursor(m, "precursor_1", runs[4], [
        ("peakgroup8", 0.1, 139, 10000),
    ])

    self.mpep = m
    self.exp = Dummy()
    self.exp.runs = runs
    mpeps.append(m)
    self.multipeptides = mpeps

    # Align all against all (a run is also aligned against itself)
    self.tr_data = transformations.LightTransformationData()
    spl_aligner = SplineAligner(self.initial_alignment_cutoff)
    for first_run in self.exp.runs:
        for second_run in self.exp.runs:
            helper.addDataToTrafo(self.tr_data, first_run, second_run,
                                  spl_aligner, self.multipeptides,
                                  "linear", 30)
def runSingleFileImputation(options, peakgroups_file, mzML_file, method, is_test):
    """Impute values across chromatograms

    Args:
        options: parsed options object. This function reads its
            ``file_format``, ``disable_isotopic_grouping``,
            ``realign_method``, ``border_option`` and
            ``disable_isotopic_transfer`` attributes.
        peakgroups_file(filename): CSV file containing all peakgroups
        mzML_file(filename): mzML file containing chromatograms
        method(str): either "singleClosestRun" or "singleShortestPath"
        is_test: forwarded unchanged to ``analyze_multipeptides``

    Returns:
        A tuple of:
            new_exp(AlignmentExperiment): experiment containing the aligned peakgroups
            multipeptides(list(AlignmentHelper.Multipeptide)): list of multipeptides

    Raises:
        Exception: if ``method`` is not one of the two supported values
        AssertionError: if the chromatogram file does not yield exactly one
            run id

    This function will read the csv file with all peakgroups as well as the
    provided chromatogram file (.chrom.mzML). It will then try to impute
    missing values for those peakgroups where no values is currently present,
    reading the raw chromatograms.
    """

    # We do not want to exclude any peakgroups for noiseIntegration (we assume
    # that alignment has already happened)
    fdr_cutoff_all_pg = 1.0

    start = time.time()
    reader = SWATHScoringReader.newReader(
        [peakgroups_file], options.file_format, readmethod="complete",
        enable_isotopic_grouping=not options.disable_isotopic_grouping)
    new_exp = Experiment()
    new_exp.runs = reader.parse_files()
    multipeptides = new_exp.get_all_multipeptides(fdr_cutoff_all_pg, verbose=False)
    print("Parsing the peakgroups file took %ss" % (time.time() - start))

    # The four dictionaries are handed to inferMapping together with the
    # input files; only `mapping` is read again below.
    mapping = {}
    precursors_mapping = {}
    sequences_mapping = {}
    protein_mapping = {}
    inferMapping([mzML_file], [peakgroups_file], mapping, precursors_mapping,
                 sequences_mapping, protein_mapping, verbose=False)
    # Invert the mapping: first element of each value -> key.
    # `.items()` (instead of `.iteritems()`) works on Python 2 and 3.
    mapping_inv = {v[0]: k for k, v in mapping.items()}
    if VERBOSE:
        # Parenthesized single argument prints identically on Python 2 and 3.
        print(mapping)

    # Do only a single run : read only one single file
    start = time.time()
    swath_chromatograms = SwathChromatogramCollection()
    swath_chromatograms.parseFromMzML([mzML_file], mapping_inv)
    print("Reading the chromatogram files took %ss" % (time.time() - start))
    assert len(swath_chromatograms.getRunIDs()) == 1
    rid = swath_chromatograms.getRunIDs()[0]

    start = time.time()
    initial_alignment_cutoff = 0.0001
    max_rt_diff = 30
    sd_data = -1  # We do not use the standard deviation data in this algorithm
    tr_data = transformations.LightTransformationData()
    spl_aligner = SplineAligner(initial_alignment_cutoff)

    if method == "singleClosestRun":
        # Distance matrix restricted to the current run; every run is then
        # aligned against the current run.
        tree_mapped = None

        run_1 = [r for r in new_exp.runs if r.get_id() == rid][0]
        dist_matrix = getDistanceMatrix(new_exp, multipeptides, spl_aligner,
                                        singleRowId=run_1.get_id())
        print("Distance matrix took %ss" % (time.time() - start))

        start = time.time()
        for run_0 in new_exp.runs:
            helper.addDataToTrafo(tr_data, run_0, run_1, spl_aligner,
                                  multipeptides, options.realign_method,
                                  max_rt_diff, sd_max_data_length=sd_data)

    elif method == "singleShortestPath":
        # Minimum spanning tree over the full distance matrix; only the runs
        # joined by a tree edge are aligned against each other.
        dist_matrix = None

        tree = MinimumSpanningTree(
            getDistanceMatrix(new_exp, multipeptides, spl_aligner))
        # Same edges, expressed as run ids instead of run indices.
        tree_mapped = [(new_exp.runs[a].get_id(), new_exp.runs[b].get_id())
                       for a, b in tree]
        print("Distance matrix took %ss" % (time.time() - start))

        start = time.time()
        for edge in tree:
            helper.addDataToTrafo(tr_data, new_exp.runs[edge[0]],
                                  new_exp.runs[edge[1]], spl_aligner,
                                  multipeptides, options.realign_method,
                                  max_rt_diff, sd_max_data_length=sd_data)

    else:
        raise Exception("Unknown method: " + method)

    print("Alignment took %ss" % (time.time() - start))
    start = time.time()
    multipeptides = analyze_multipeptides(
        new_exp, multipeptides, swath_chromatograms, tr_data,
        options.border_option, rid, tree=tree_mapped, mat=dist_matrix,
        disable_isotopic_transfer=options.disable_isotopic_transfer,
        is_test=is_test)
    print("Analyzing the runs took %ss" % (time.time() - start))
    return new_exp, multipeptides