def doMSTAlignment(exp, multipeptides, max_rt_diff, rt_diff_isotope,
                   initial_alignment_cutoff, fdr_cutoff, aligned_fdr_cutoff,
                   smoothing_method, method, use_RT_correction,
                   stdev_max_rt_per_run, use_local_stdev, mst_use_ref):
    """Minimum Spanning Tree (MST) based local alignment.

    Builds a tree over the runs of ``exp``, computes one transformation per
    tree edge and then runs ``TreeConsensusAlignment`` over ``multipeptides``.

    Args:
        exp: experiment whose ``runs`` are aligned; ``nr_ambiguous`` and
            ``nr_multiple_align`` are stored on it afterwards.
        multipeptides: multipeptides handed to the distance matrix, the
            transformation computation and the consensus alignment.
        max_rt_diff: forwarded to ``addDataToTrafo`` and
            ``TreeConsensusAlignment``.
        rt_diff_isotope, fdr_cutoff, aligned_fdr_cutoff, use_RT_correction,
            stdev_max_rt_per_run, use_local_stdev: forwarded to
            ``TreeConsensusAlignment``.
        initial_alignment_cutoff: forwarded to ``SplineAligner``.
        smoothing_method: forwarded to ``addDataToTrafo``.
        method: "LocalMST" selects ``alignBestCluster``,
            "LocalMSTAllCluster" selects ``alignAllCluster``; any other value
            performs no alignment call.
        mst_use_ref: if true, skip the MST and connect every run directly to
            the run returned by ``SplineAligner._determine_best_run``.

    Returns:
        The tree as a list of ``(i, j)`` index pairs into ``exp.runs``.
    """
    spl_aligner = SplineAligner(initial_alignment_cutoff)

    if mst_use_ref:
        # Force reference-based alignment: a star-shaped "tree" with the best
        # run in the centre. The best run is determined once only.
        ref = spl_aligner._determine_best_run(exp).get_id()
        refrun_id = [i for i, run in enumerate(exp.runs)
                     if run.get_id() == ref][0]
        tree = [(i, refrun_id) for i in range(len(exp.runs))
                if i != refrun_id]
    else:
        tree = MinimumSpanningTree(
            getDistanceMatrix(exp, multipeptides, spl_aligner))

    print("Computed Tree:", tree)

    # Get alignments: one transformation per tree edge
    tr_data = LightTransformationData()
    for edge in tree:
        addDataToTrafo(tr_data, exp.runs[edge[0]], exp.runs[edge[1]],
                       spl_aligner, multipeptides, smoothing_method,
                       max_rt_diff)

    # The consensus alignment addresses runs by id, not by index
    tree_mapped = [(exp.runs[a].get_id(), exp.runs[b].get_id())
                   for a, b in tree]

    # Perform work
    al = TreeConsensusAlignment(max_rt_diff, fdr_cutoff, aligned_fdr_cutoff,
                                rt_diff_isotope=rt_diff_isotope,
                                correctRT_using_pg=use_RT_correction,
                                stdev_max_rt_per_run=stdev_max_rt_per_run,
                                use_local_stdev=use_local_stdev)

    if method == "LocalMST":
        al.alignBestCluster(multipeptides, tree_mapped, tr_data)
    elif method == "LocalMSTAllCluster":
        al.alignAllCluster(multipeptides, tree_mapped, tr_data)

    # Store number of ambiguous cases (e.g. where more than one peakgroup below
    # the strict quality cutoff was found in the RT window) and the number of
    # cases where multiple possibilities were found.
    exp.nr_ambiguous = al.nr_ambiguous
    exp.nr_multiple_align = al.nr_multiple_align
    return tree
def setUp(self):
    # Fixture: parse the imputeValues test CSV, build the multipeptides and
    # pre-compute a linear transformation for every ordered pair of runs.

    # Set up dirs
    here = os.path.dirname(os.path.abspath(__file__))
    self.dirname = here
    self.topdir = os.path.join(here, "..", "..")
    self.datadir = os.path.join(self.topdir, "test", "data")
    self.scriptdir = os.path.join(self.topdir, "analysis")

    # Set up files (the mzML path is built but not read by this fixture)
    peakgroups_file = os.path.join(
        self.datadir, "imputeValues/imputeValues_5_input.csv")
    mzml_file = os.path.join(
        self.datadir,
        "imputeValues/r004_small/split_olgas_otherfile.chrom.mzML")

    # Parameters
    self.initial_alignment_cutoff = 0.0001
    fdr_cutoff_all_pg = 1.0
    max_rt_diff = 30

    # Read input
    reader = SWATHScoringReader.newReader(
        [peakgroups_file], "openswath", readmethod="complete")
    self.new_exp = MRExperiment()
    self.new_exp.runs = reader.parse_files()
    self.multipeptides = self.new_exp.get_all_multipeptides(
        fdr_cutoff_all_pg, verbose=False)

    # Align all against all
    self.tr_data = transformations.LightTransformationData()
    spl_aligner = SplineAligner(self.initial_alignment_cutoff)
    all_runs = self.new_exp.runs
    for run_0 in all_runs:
        for run_1 in all_runs:
            helper.addDataToTrafo(self.tr_data, run_0, run_1, spl_aligner,
                                  self.multipeptides, "linear", max_rt_diff)

    def first_multipeptide(pepname):
        # First multipeptide whose first peptide carries the given id
        hits = [m for m in self.multipeptides
                if m.getAllPeptides()[0].get_id() == pepname]
        return hits[0]

    # Select two interesting peptides
    self.current_mpep1 = first_multipeptide(
        "21517_C[160]NVVISGGTGSGK/2_run0 0 0")
    self.current_mpep2 = first_multipeptide(
        "26471_GYEDPPAALFR/2_run0 0 0")
def test_alignAllCluster_1(self):
    """Align all clusters while correcting the alignment thread.

    Runs alignAllCluster with max_rt_diff=6 and correctRT_using_pg=True,
    i.e. the search RT is updated from each peakgroup that was found:

        - Run1 : 100s     [threadRT = 100s]
        - Run2 : 112s     [threadRT = 106s]
        - Run3 : 120s     [threadRT = 118s]
        - Run4 : xxx      [threadRT = 126s] (should be around 130s)
        - Run5 : 139s     [threadRT = 133s]
    """
    aligner = SplineAligner(self.initial_alignment_cutoff)
    distances = algo.getDistanceMatrix(self.exp, self.multipeptides, aligner)
    mst = MinimumSpanningTree(distances)

    # Translate run indices into run ids
    mst_ids = []
    for a, b in mst:
        mst_ids.append(
            (self.exp.runs[a].get_id(), self.exp.runs[b].get_id()))

    consensus = algo.TreeConsensusAlignment(
        max_rt_diff=6, fdr_cutoff=0.1, aligned_fdr_cutoff=0.25,
        correctRT_using_pg=True, verbose=True)
    consensus.alignAllCluster(self.multipeptides, mst_ids, self.tr_data)

    # We should have 4 peakgroups
    target = self.mpep
    self.assertEqual(len(target.get_selected_peakgroups()), 4)

    # Check that we have all the correct ones (1,2,4,8)
    expected_ids = {'peakgroup8', 'peakgroup2', 'peakgroup4', 'peakgroup1'}
    selected_ids = {pg.get_feature_id()
                    for pg in target.get_selected_peakgroups()}
    self.assertEqual(expected_ids, selected_ids)
def test_reference_1(self):
    # Integration borders for run 0_0 when the reference run is 0_2.
    rid = "0_0"
    self.tr_data.reference = "0_2"  # set reference run to 0_2

    aligner = SplineAligner(self.initial_alignment_cutoff)
    mst = MinimumSpanningTree(
        getDistanceMatrix(self.new_exp, self.multipeptides, aligner))
    # Computed as in the sibling tests; not consumed by the reference method
    mst_ids = [(self.new_exp.runs[a].get_id(), self.new_exp.runs[b].get_id())
               for a, b in mst]

    # Select peakgroups (cluster 1 only), compute left/right border
    cluster_pgs = []
    for pep in self.current_mpep1.getAllPeptides():
        for pg in pep.get_all_peakgroups():
            if pg.get_cluster_id() == 1:
                cluster_pgs.append(pg)

    expected_left = 77.992277992277934
    expected_right = 84.1698841699

    left, right = integrationBorderReference(
        self.new_exp, cluster_pgs, rid, self.tr_data, "median")

    # Direct transformation from 0_2 to 0_0
    self.assertAlmostEqual(
        left, self.tr_data.getTrafo("0_2", "0_0").predict([240.0])[0])
    self.assertAlmostEqual(
        right, self.tr_data.getTrafo("0_2", "0_0").predict([260.0])[0])
    self.assertAlmostEqual(left, expected_left)
    self.assertAlmostEqual(right, expected_right)

    # The other border options yield the same borders for this selection
    for border_option in ("mean", "max_width"):
        left, right = integrationBorderReference(
            self.new_exp, cluster_pgs, rid, self.tr_data, border_option)
        self.assertAlmostEqual(left, expected_left)
        self.assertAlmostEqual(right, expected_right)

    # An unknown border option is rejected
    self.assertRaises(Exception, integrationBorderReference, self.new_exp,
                      cluster_pgs, rid, self.tr_data, "dummy")
def test_reference_2(self):
    # Integration borders for run 0_1 when the reference run is 0_0.
    rid = "0_1"
    self.tr_data.reference = "0_0"  # set reference run to 0_0

    spl_aligner = SplineAligner(self.initial_alignment_cutoff)
    tree = MinimumSpanningTree(
        getDistanceMatrix(self.new_exp, self.multipeptides, spl_aligner))
    # Computed as in the sibling tests; not consumed by the reference method
    tree_mapped = [(self.new_exp.runs[a].get_id(),
                    self.new_exp.runs[b].get_id()) for a, b in tree]

    # Select peakgroups (cluster 1 only), compute left/right border
    selected_pg = [pg for p in self.current_mpep1.getAllPeptides()
                   for pg in p.get_all_peakgroups()
                   if pg.get_cluster_id() == 1]
    border_l, border_r = integrationBorderReference(
        self.new_exp, selected_pg, rid, self.tr_data, "median")

    # Reference 0_0 means that we transformed from 0_2 to 0_0 and then to 0_1.
    # The borders are scalars, so compare against the first (only) predicted
    # value rather than against the whole prediction container.
    to_reference = self.tr_data.getTrafo("0_2", "0_0")
    from_reference = self.tr_data.getTrafo("0_0", "0_1")
    self.assertAlmostEqual(
        border_l,
        from_reference.predict(to_reference.predict([240.0]))[0])
    self.assertAlmostEqual(
        border_r,
        from_reference.predict(to_reference.predict([260.0]))[0])

    self.assertAlmostEqual(border_l, 187.18146718146681)
    self.assertAlmostEqual(border_r, 202.00772200772167)
def test_alignBestCluster_1(self):
    """Best-cluster alignment without correcting the alignment thread.

    With correctRT_using_pg=False the search RT is not updated from the
    peakgroup that was found. After finding the second peakgroup at 112 s,
    the search RT for run 2 is still at 106 seconds which gets mapped to
    112 seconds in run 3 - but the next peakgroup is at 120 s, which is too
    far away for the tolerance used here (max_rt_diff=6).
    """
    aligner = SplineAligner(self.initial_alignment_cutoff)
    distances = algo.getDistanceMatrix(self.exp, self.multipeptides, aligner)
    mst = MinimumSpanningTree(distances)

    # Translate run indices into run ids
    mst_ids = []
    for a, b in mst:
        mst_ids.append(
            (self.exp.runs[a].get_id(), self.exp.runs[b].get_id()))

    consensus = algo.TreeConsensusAlignment(
        max_rt_diff=6, fdr_cutoff=0.1, aligned_fdr_cutoff=0.25,
        correctRT_using_pg=False)
    consensus.alignBestCluster_legacy(self.multipeptides, mst_ids,
                                      self.tr_data)

    # Now only 2 peakgroups should be selected
    target = self.mpep
    self.assertEqual(len(target.get_selected_peakgroups()), 2)

    # Check that we have all the correct ones (only 1,2)
    expected_ids = {'peakgroup2', 'peakgroup1'}
    selected_ids = {pg.get_feature_id()
                    for pg in target.get_selected_peakgroups()}
    self.assertEqual(expected_ids, selected_ids)
def _parse_fdr_range(spec):
    """Turn a comma-separated ``start,stop,step`` string into a numpy range.

    The numbers are passed positionally to ``numpy.arange``, so between one
    and three values are accepted. Only plain numbers are supported;
    arbitrary expressions are deliberately not evaluated.

    Raises:
        ValueError: if ``spec`` is not one to three comma-separated numbers.
    """
    try:
        bounds = [float(token) for token in spec.split(",")]
    except ValueError:
        bounds = []
    if not 1 <= len(bounds) <= 3:
        raise ValueError(
            "aligned_fdr_cutoff either needs to be a single value or a "
            "comma-separated range 'start,stop,step'. Found instead: '%s'"
            % spec)
    return numpy.arange(*bounds)


def _resolve_auto_rt_diff_cutoff(setting, trafoError):
    """Translate an ``auto_*`` RT cutoff setting into a number.

    Args:
        setting: one of 'auto_2medianstdev', 'auto_3medianstdev',
            'auto_4medianstdev' or 'auto_maxstdev'.
        trafoError: object whose ``getStdev()`` yields the standard
            deviations to summarise, or None if no re-alignment was done.

    Raises:
        Exception: if ``setting`` is not a known option, or if a known option
            is requested although no transformation error is available.
    """
    median_factors = {
        "auto_2medianstdev": 2,
        "auto_3medianstdev": 3,
        "auto_4medianstdev": 4,
    }
    if setting not in median_factors and setting != "auto_maxstdev":
        raise Exception(
            "max_rt_diff either needs to be a value in seconds or "
            "one of ('auto_2medianstdev', 'auto_3medianstdev', "
            "'auto_4medianstdev', 'auto_maxstdev'). Found instead: '%s'"
            % setting)
    if trafoError is None:
        # The stdev-based cutoffs are derived from the re-alignment, which is
        # skipped for realign_method 'diRT'.
        raise Exception(
            "max_rt_diff '%s' needs the transformation error of a "
            "re-alignment, which is not computed for realign_method 'diRT'"
            % setting)

    stdevs = list(trafoError.getStdev())
    if setting == "auto_maxstdev":
        return max(stdevs)
    return median_factors[setting] * numpy.median(stdevs)


def doReferenceAlignment(options, this_exp, multipeptides):
    """Re-align all runs against a reference run and align the features.

    Unless ``options.realign_method`` is "diRT", the runs are first
    re-aligned with a ``SplineAligner`` and the resulting transformation
    collection is stored on ``this_exp``. ``options.aligned_fdr_cutoff`` and
    ``options.rt_diff_cutoff`` are then normalised to numbers in place before
    ``AlignmentAlgorithm.align_features`` is run.

    Args:
        options: option object; reads ``realign_method``, ``alignment_score``,
            ``tmpdir``, ``fdr_cutoff`` and ``method``; reads and overwrites
            ``aligned_fdr_cutoff`` (a number, or a 'start,stop,step' range to
            be estimated from) and ``rt_diff_cutoff`` (a number, or one of
            the ``auto_*`` settings).
        this_exp: experiment to align.
        multipeptides: multipeptides to align.

    Raises:
        ValueError: if ``aligned_fdr_cutoff`` is neither a number nor a valid
            range specification.
        Exception: if ``rt_diff_cutoff`` is neither a number nor a usable
            ``auto_*`` setting.
    """
    trafoError = None

    # Performing re-alignment using a reference run
    if options.realign_method != "diRT":
        start = time.time()
        spl_aligner = SplineAligner(
            alignment_fdr_threshold=options.alignment_score,
            smoother=options.realign_method,
            external_r_tmpdir=options.tmpdir,
            experiment=this_exp)
        this_exp.transformation_collection = spl_aligner.rt_align_all_runs(
            this_exp, multipeptides)
        trafoError = spl_aligner.getTransformationError()
        print("Aligning the runs took %0.2fs" % (time.time() - start))

    try:
        options.aligned_fdr_cutoff = float(options.aligned_fdr_cutoff)
    except ValueError:
        # We have a range of values to step through. It is parsed explicitly:
        # exec() cannot create a function-local name on Python 3 and would
        # execute whatever text the option contains.
        fdr_range = _parse_fdr_range(options.aligned_fdr_cutoff)
        options.aligned_fdr_cutoff = estimate_aligned_fdr_cutoff(
            options, this_exp, multipeptides, fdr_range)

    try:
        options.rt_diff_cutoff = float(options.rt_diff_cutoff)
    except ValueError:
        options.rt_diff_cutoff = _resolve_auto_rt_diff_cutoff(
            options.rt_diff_cutoff, trafoError)

    print("Will calculate with aligned_fdr cutoff of",
          options.aligned_fdr_cutoff, "and an RT difference of",
          options.rt_diff_cutoff)

    start = time.time()
    AlignmentAlgorithm().align_features(
        multipeptides, options.rt_diff_cutoff, options.fdr_cutoff,
        options.aligned_fdr_cutoff, options.method)
    print("Re-aligning peak groups took %0.2fs" % (time.time() - start))
def test_shortestPath_2(self):
    # Integration borders for run 0_1 using the shortest path through the MST.
    rid = "0_1"

    spl_aligner = SplineAligner(self.initial_alignment_cutoff)
    tree = MinimumSpanningTree(
        getDistanceMatrix(self.new_exp, self.multipeptides, spl_aligner))
    tree_mapped = [(self.new_exp.runs[a].get_id(),
                    self.new_exp.runs[b].get_id()) for a, b in tree]

    # Select peakgroups (cluster 1 only), compute left/right border
    selected_pg = [pg for p in self.current_mpep1.getAllPeptides()
                   for pg in p.get_all_peakgroups()
                   if pg.get_cluster_id() == 1]
    border_l, border_r = integrationBorderShortestPath(
        selected_pg, rid, self.tr_data, tree_mapped)

    # Shortest path means that we transformed from 0_2 to 0_1.
    # The borders are scalars, so compare against the first (only) predicted
    # value rather than against the whole prediction container.
    direct = self.tr_data.getTrafo("0_2", "0_1")
    self.assertAlmostEqual(border_l, direct.predict([240.0])[0])
    self.assertAlmostEqual(border_r, direct.predict([260.0])[0])

    self.assertAlmostEqual(border_l, 168.03088803088787)
    self.assertAlmostEqual(border_r, 183.32046332046318)
def test_shortestDistance_2(self):
    # Integration borders for run 0_1 using the closest run by distance.
    rid = "0_1"

    aligner = SplineAligner(self.initial_alignment_cutoff)
    distances = getDistanceMatrix(self.new_exp, self.multipeptides, aligner)

    # Select peakgroups (cluster 1 only), compute left/right border
    cluster_pgs = []
    for pep in self.current_mpep1.getAllPeptides():
        for pg in pep.get_all_peakgroups():
            if pg.get_cluster_id() == 1:
                cluster_pgs.append(pg)

    # Map each run id to its position in the experiment
    run_index = {}
    for position, run in enumerate(self.new_exp.runs):
        run_index[run.get_id()] = position

    left, right = integrationBorderShortestDistance(
        cluster_pgs, rid, self.tr_data, distances, run_index)

    # Shortest distance means that we transformed directly from 0_2 to 0_1
    self.assertAlmostEqual(
        left, self.tr_data.getTrafo("0_2", "0_1").predict([240.0])[0])
    self.assertAlmostEqual(
        right, self.tr_data.getTrafo("0_2", "0_1").predict([260.0])[0])

    self.assertAlmostEqual(left, 168.03088803088787)
    self.assertAlmostEqual(right, 183.32046332)
def test_shortestDistance_1(self):
    # Integration borders for run 0_0 using the closest run by distance.
    rid = "0_0"

    aligner = SplineAligner(self.initial_alignment_cutoff)
    distances = getDistanceMatrix(self.new_exp, self.multipeptides, aligner)

    # Select peakgroups (cluster 1 only), compute left/right border
    cluster_pgs = []
    for pep in self.current_mpep1.getAllPeptides():
        for pg in pep.get_all_peakgroups():
            if pg.get_cluster_id() == 1:
                cluster_pgs.append(pg)

    # Map each run id to its position in the experiment
    run_index = {}
    for position, run in enumerate(self.new_exp.runs):
        run_index[run.get_id()] = position

    left, right = integrationBorderShortestDistance(
        cluster_pgs, rid, self.tr_data, distances, run_index)

    # Direct transformation from 0_2 to 0_0
    self.assertAlmostEqual(
        left, self.tr_data.getTrafo("0_2", "0_0").predict([240.0])[0])
    self.assertAlmostEqual(
        right, self.tr_data.getTrafo("0_2", "0_0").predict([260.0])[0])

    self.assertAlmostEqual(left, 77.992277992277934)
    self.assertAlmostEqual(right, 84.1698841699)
def doMSTAlignment(exp, multipeptides, max_rt_diff, rt_diff_isotope,
                   initial_alignment_cutoff, fdr_cutoff, aligned_fdr_cutoff,
                   smoothing_method, method, use_RT_correction,
                   stdev_max_rt_per_run, use_local_stdev):
    """Minimum Spanning Tree (MST) based local alignment.

    Builds the MST over the runs of ``exp``, computes one transformation per
    tree edge and then runs ``TreeConsensusAlignment`` over ``multipeptides``.

    Args:
        exp: experiment whose ``runs`` are aligned; ``nr_ambiguous`` and
            ``nr_multiple_align`` are stored on it afterwards.
        multipeptides: multipeptides handed to the distance matrix, the
            transformation computation and the consensus alignment.
        max_rt_diff: forwarded to ``addDataToTrafo`` and
            ``TreeConsensusAlignment``.
        rt_diff_isotope, fdr_cutoff, aligned_fdr_cutoff, use_RT_correction,
            stdev_max_rt_per_run, use_local_stdev: forwarded to
            ``TreeConsensusAlignment``.
        initial_alignment_cutoff: forwarded to ``SplineAligner``.
        smoothing_method: forwarded to ``addDataToTrafo``.
        method: "LocalMST" selects ``alignBestCluster``,
            "LocalMSTAllCluster" selects ``alignAllCluster``; any other value
            performs no alignment call.
    """
    spl_aligner = SplineAligner(initial_alignment_cutoff)
    tree = MinimumSpanningTree(
        getDistanceMatrix(exp, multipeptides, spl_aligner))
    # Formatted into a single string so that the output is the same whether
    # print is a statement (Python 2) or a function (Python 3).
    print("Computed Tree: %s" % (tree,))

    # Get alignments: one transformation per tree edge
    tr_data = LightTransformationData()
    for edge in tree:
        addDataToTrafo(tr_data, exp.runs[edge[0]], exp.runs[edge[1]],
                       spl_aligner, multipeptides, smoothing_method,
                       max_rt_diff)

    # The consensus alignment addresses runs by id, not by index
    tree_mapped = [(exp.runs[a].get_id(), exp.runs[b].get_id())
                   for a, b in tree]

    # Perform work
    al = TreeConsensusAlignment(max_rt_diff, fdr_cutoff, aligned_fdr_cutoff,
                                rt_diff_isotope=rt_diff_isotope,
                                correctRT_using_pg=use_RT_correction,
                                stdev_max_rt_per_run=stdev_max_rt_per_run,
                                use_local_stdev=use_local_stdev)

    if method == "LocalMST":
        al.alignBestCluster(multipeptides, tree_mapped, tr_data)
    elif method == "LocalMSTAllCluster":
        al.alignAllCluster(multipeptides, tree_mapped, tr_data)

    # Store number of ambiguous cases (e.g. where more than one peakgroup below
    # the strict quality cutoff was found in the RT window) and the number of
    # cases where multiple possibilities were found.
    exp.nr_ambiguous = al.nr_ambiguous
    exp.nr_multiple_align = al.nr_multiple_align
def test_alignBestCluster_2(self):
    """Best-cluster alignment without thread correction, wide tolerance.

    With correctRT_using_pg=False the search RT is not updated from the
    peakgroup that was found (e.g. no correction of the threading):

        - Run1 : 100s     [threadRT = 100s]
        - Run2 : 112s     [threadRT = 106s]
        - Run3 : 120s     [threadRT = 112s]
        - Run4 : xxx      [threadRT = 118s]
        - Run5 : 139s     [threadRT = 124s]

    By using a larger tolerance of 15s, we can still manage to find all the
    correct peakgroups.
    """
    aligner = SplineAligner(self.initial_alignment_cutoff)
    distances = algo.getDistanceMatrix(self.exp, self.multipeptides, aligner)
    mst = MinimumSpanningTree(distances)

    # Translate run indices into run ids
    mst_ids = []
    for a, b in mst:
        mst_ids.append(
            (self.exp.runs[a].get_id(), self.exp.runs[b].get_id()))

    consensus = algo.TreeConsensusAlignment(
        max_rt_diff=15, fdr_cutoff=0.1, aligned_fdr_cutoff=0.25,
        correctRT_using_pg=False)
    consensus.alignBestCluster_legacy(self.multipeptides, mst_ids,
                                      self.tr_data)

    # With the wider tolerance all 4 peakgroups should be selected
    target = self.mpep
    self.assertEqual(len(target.get_selected_peakgroups()), 4)

    # Check that we have all the correct ones (1,2,4,8)
    expected_ids = {'peakgroup8', 'peakgroup2', 'peakgroup4', 'peakgroup1'}
    selected_ids = {pg.get_feature_id()
                    for pg in target.get_selected_peakgroups()}
    self.assertEqual(expected_ids, selected_ids)
def main(options):
    """Align chromatograms of every precursor to its reference run.

    Reads the feature files and chromatograms named in ``options``, picks a
    reference run per precursor, aligns the smoothed XICs of every other run
    against it and updates the retention times accordingly. Afterwards the
    features are aligned and the result matrix is written to
    ``options.matrix_outfile``.
    """
    infiles = options.feature_files
    chromatograms = options.chromatogram_files

    # Read the feature files and link them to their chromatogram files
    reader = SWATHScoringReader.newReader(
        infiles, 'openswath', "minimal", ReadFilter(),
        enable_isotopic_grouping=False, read_cluster_id=False)
    reader.map_infiles_chromfiles(chromatograms)
    runs = reader.parse_files()
    MStoFeature = MSfileRunMapping(chromatograms, runs)

    precursor_to_transitionID, precursor_sequence = \
        getPrecursorTransitionMapping(infiles[0])
    MZs = mzml_accessors(runs, MStoFeature)
    MZs.set_precursor_to_chromID(precursor_to_transitionID)

    this_exp = Experiment()
    this_exp.set_runs(runs)

    t0 = time.time()
    fdr_cutoff = options.aligned_fdr_cutoff
    multipeptides = this_exp.get_all_multipeptides(
        fdr_cutoff, verbose=False, verbosity=10)
    print("Mapping the precursors took %0.2fs" % (time.time() - t0))

    # Reference based alignment: one reference run per precursor
    ref_lookup = referenceForPrecursor(
        refType="precursor_specific",
        alignment_fdr_threshold=options.fdr_cutoff)
    reference_run = ref_lookup.get_reference_for_precursors(multipeptides)

    # Pairwise global alignment
    spl_aligner = SplineAligner(alignment_fdr_threshold=fdr_cutoff,
                                smoother="lowess", experiment=this_exp)
    tr_data = initialize_transformation()

    # Initialize XIC smoothing function
    chrom_smoother = chromSmoother(smoother="sgolay", kernelLen=11, polyOrd=4)

    # Calculate the aligned retention time for each precursor across all runs
    for prec_id in list(precursor_to_transitionID.keys()):
        refrun = reference_run.get(prec_id)
        if not refrun:
            print(
                "The precursor {} doesn't have any associated reference run. Skipping!"
                .format(prec_id))
            continue
        other_runs = list(set(runs) - {refrun})

        # Extract XICs from reference run and smooth it.
        XICs_ref = MZs.extractXIC_group(refrun, prec_id)
        if not XICs_ref:
            continue
        XICs_ref_sm = chrom_smoother.smoothXICs(XICs_ref)

        # For each precursor, we need peptide_group_label and trgr_id
        peptide_group_label = precursor_sequence[prec_id][0]

        # Iterate through all other runs and align them to the reference run
        for eXprun in other_runs:
            # Extract XICs from experiment run and smooth it.
            XICs_eXp = MZs.extractXIC_group(eXprun, prec_id)
            if not XICs_eXp:
                continue
            XICs_eXp_sm = chrom_smoother.smoothXICs(XICs_eXp)

            t_ref_aligned, t_eXp_aligned = RTofAlignedXICs(
                XICs_ref_sm, XICs_eXp_sm, tr_data, spl_aligner, eXprun,
                refrun, multipeptides,
                RSEdistFactor=4,
                alignType=b"hybrid",
                normalization=b"mean",
                simType=b"dotProductMasked",
                goFactor=0.125,
                geFactor=40,
                cosAngleThresh=0.3,
                OverlapAlignment=True,
                dotProdThresh=0.96,
                gapQuantile=0.5,
                hardConstrain=False,
                samples4gradient=100)

            # Update retention time of all peak-groups to reference peak-group
            updateRetentionTime(eXprun, peptide_group_label, prec_id,
                                t_ref_aligned, t_eXp_aligned)

    AlignmentAlgorithm().align_features(
        multipeptides,
        rt_diff_cutoff=40,
        fdr_cutoff=0.01,
        aligned_fdr_cutoff=options.aligned_fdr_cutoff,
        method=options.method)

    # The returned statistics object is not used any further
    this_exp.print_stats(multipeptides, 0.05, 0.1, 1)

    write_out_matrix_file(options.matrix_outfile, runs, multipeptides,
                          options.min_frac_selected,
                          options.matrix_output_method, True, 0.05,
                          precursor_sequence)
def test_prepare(self):
    # The MST over the five fixture runs chains neighbouring runs together.
    aligner = SplineAligner(self.initial_alignment_cutoff)
    distances = algo.getDistanceMatrix(self.exp, self.multipeptides, aligner)
    mst = MinimumSpanningTree(distances)

    expected_edges = [(3, 4), (2, 3), (1, 2), (0, 1)]
    self.assertEqual(mst, expected_edges)
def doMSTAlignment(exp, multipeptides, max_rt_diff, rt_diff_isotope,
                   initial_alignment_cutoff, fdr_cutoff, aligned_fdr_cutoff,
                   smoothing_method, method, use_RT_correction,
                   stdev_max_rt_per_run, use_local_stdev, mst_use_ref, force,
                   optimized_cython):
    """Minimum Spanning Tree (MST) based local alignment.

    Builds a tree over the runs of ``exp``, computes one transformation per
    tree edge and then runs ``TreeConsensusAlignment`` over ``multipeptides``.

    Args:
        exp: experiment whose ``runs`` are aligned; ``nr_ambiguous`` and
            ``nr_multiple_align`` are stored on it afterwards.
        multipeptides: multipeptides handed to the distance matrix, the
            transformation computation and the consensus alignment.
        max_rt_diff: forwarded to ``addDataToTrafo`` and
            ``TreeConsensusAlignment``.
        rt_diff_isotope, fdr_cutoff, aligned_fdr_cutoff, use_RT_correction,
            stdev_max_rt_per_run, use_local_stdev: forwarded to
            ``TreeConsensusAlignment``.
        initial_alignment_cutoff: forwarded to ``SplineAligner``.
        smoothing_method, force: forwarded to ``addDataToTrafo``.
        method: "LocalMST" selects the best-cluster alignment,
            "LocalMSTAllCluster" selects ``alignAllCluster``; any other value
            performs no alignment call.
        mst_use_ref: if true, skip the MST and connect every run directly to
            the run returned by ``SplineAligner._determine_best_run``.
        optimized_cython: if true, use ``CyLightTransformationData`` (when it
            can be imported) and ``alignBestCluster``; otherwise use the
            Python transformation data and ``alignBestCluster_legacy``.

    Returns:
        The tree as a list of ``(i, j)`` index pairs into ``exp.runs``.
    """
    spl_aligner = SplineAligner(initial_alignment_cutoff, experiment=exp)

    if mst_use_ref:
        # Force reference-based alignment: a star-shaped "tree" with the best
        # run in the centre. The best run is determined once only.
        ref = spl_aligner._determine_best_run(exp).get_id()
        refrun_id = [i for i, run in enumerate(exp.runs)
                     if run.get_id() == ref][0]
        tree = [(i, refrun_id) for i in range(len(exp.runs))
                if i != refrun_id]
    else:
        start = time.time()
        tree = MinimumSpanningTree(
            getDistanceMatrix(exp, multipeptides, spl_aligner))
        print("Computing tree took %0.2fs" % (time.time() - start))

    print("Computed Tree:", tree)

    # Get alignments
    start = time.time()
    try:
        from msproteomicstoolslib.cython._optimized import CyLightTransformationData
        if optimized_cython:
            tr_data = CyLightTransformationData()
        else:
            tr_data = LightTransformationData()
    except ImportError:
        print(
            "WARNING: cannot import CyLightTransformationData, will use Python version (slower)."
        )
        tr_data = LightTransformationData()

    # One transformation per tree edge
    for edge in tree:
        addDataToTrafo(tr_data, exp.runs[edge[0]], exp.runs[edge[1]],
                       spl_aligner, multipeptides, smoothing_method,
                       max_rt_diff, force=force)

    # The consensus alignment addresses runs by id, not by index
    tree_mapped = [(exp.runs[a].get_id(), exp.runs[b].get_id())
                   for a, b in tree]
    print("Computing transformations for all edges took %0.2fs" %
          (time.time() - start))

    # Perform work
    al = TreeConsensusAlignment(max_rt_diff, fdr_cutoff, aligned_fdr_cutoff,
                                rt_diff_isotope=rt_diff_isotope,
                                correctRT_using_pg=use_RT_correction,
                                stdev_max_rt_per_run=stdev_max_rt_per_run,
                                use_local_stdev=use_local_stdev)

    if method == "LocalMST":
        if optimized_cython:
            al.alignBestCluster(multipeptides, tree_mapped, tr_data)
        else:
            print(
                "WARNING: cannot utilize optimized MST alignment (needs readmethod = cminimal), will use Python version (slower)."
            )
            al.alignBestCluster_legacy(multipeptides, tree_mapped, tr_data)
    elif method == "LocalMSTAllCluster":
        al.alignAllCluster(multipeptides, tree_mapped, tr_data)

    # Store number of ambiguous cases (e.g. where more than one peakgroup below
    # the strict quality cutoff was found in the RT window) and the number of
    # cases where multiple possibilities were found.
    exp.nr_ambiguous = al.nr_ambiguous
    exp.nr_multiple_align = al.nr_multiple_align
    return tree
def runSingleFileImputation(options, peakgroups_file, mzML_file, method, is_test):
    """Impute values across chromatograms

    Args:
        options: option object; reads ``file_format``,
            ``disable_isotopic_grouping``, ``realign_method``,
            ``border_option`` and ``disable_isotopic_transfer``.
        peakgroups_file(filename): CSV file containing all peakgroups
        mzML_file(filename): mzML file containing chromatograms
        method(str): "singleClosestRun" (transformations from every run to
            the run of the chromatogram file) or "singleShortestPath"
            (transformations along the edges of the minimum spanning tree).
        is_test: forwarded to ``analyze_multipeptides``.

    Returns:
        A tuple of:
            new_exp(AlignmentExperiment): experiment containing the aligned peakgroups
            multipeptides(list(AlignmentHelper.Multipeptide)): list of multipeptides

    Raises:
        Exception: if ``method`` is not one of the two supported values.

    This function will read the csv file with all peakgroups as well as the
    provided chromatogram file (.chrom.mzML). It will then try to impute
    missing values for those peakgroups where no values is currently present,
    reading the raw chromatograms.
    """

    # We do not want to exclude any peakgroups for noiseIntegration (we assume
    # that alignment has already happened)
    fdr_cutoff_all_pg = 1.0

    start = time.time()
    reader = SWATHScoringReader.newReader(
        [peakgroups_file], options.file_format, readmethod="complete",
        enable_isotopic_grouping=not options.disable_isotopic_grouping)
    new_exp = Experiment()
    new_exp.runs = reader.parse_files()
    multipeptides = new_exp.get_all_multipeptides(fdr_cutoff_all_pg,
                                                  verbose=False)
    print("Parsing the peakgroups file took %ss" % (time.time() - start))

    mapping = {}
    precursors_mapping = {}
    sequences_mapping = {}
    protein_mapping = {}
    inferMapping([mzML_file], [peakgroups_file], mapping, precursors_mapping,
                 sequences_mapping, protein_mapping, verbose=False)
    # Invert the mapping: first entry of each value -> key.
    # items() works on Python 2 and 3 alike (iteritems() is Python 2 only).
    mapping_inv = {v[0]: k for k, v in mapping.items()}
    if VERBOSE:
        print(mapping)

    # Do only a single run : read only one single file
    start = time.time()
    swath_chromatograms = SwathChromatogramCollection()
    swath_chromatograms.parseFromMzML([mzML_file], mapping_inv)
    print("Reading the chromatogram files took %ss" % (time.time() - start))
    assert len(swath_chromatograms.getRunIDs()) == 1
    rid = swath_chromatograms.getRunIDs()[0]

    start = time.time()
    initial_alignment_cutoff = 0.0001
    max_rt_diff = 30
    sd_data = -1  # We do not use the standard deviation data in this algorithm
    tr_data = transformations.LightTransformationData()
    spl_aligner = SplineAligner(initial_alignment_cutoff)

    if method == "singleClosestRun":
        tree_mapped = None

        run_1 = [r for r in new_exp.runs if r.get_id() == rid][0]
        dist_matrix = getDistanceMatrix(new_exp, multipeptides, spl_aligner,
                                        singleRowId=run_1.get_id())
        print("Distance matrix took %ss" % (time.time() - start))

        start = time.time()
        for run_0 in new_exp.runs:
            helper.addDataToTrafo(tr_data, run_0, run_1, spl_aligner,
                                  multipeptides, options.realign_method,
                                  max_rt_diff, sd_max_data_length=sd_data)

    elif method == "singleShortestPath":
        dist_matrix = None

        tree = MinimumSpanningTree(
            getDistanceMatrix(new_exp, multipeptides, spl_aligner))
        tree_mapped = [(new_exp.runs[a].get_id(), new_exp.runs[b].get_id())
                       for a, b in tree]
        print("Distance matrix took %ss" % (time.time() - start))

        start = time.time()
        for edge in tree:
            helper.addDataToTrafo(tr_data, new_exp.runs[edge[0]],
                                  new_exp.runs[edge[1]], spl_aligner,
                                  multipeptides, options.realign_method,
                                  max_rt_diff, sd_max_data_length=sd_data)

    else:
        raise Exception("Unknown method: " + method)

    print("Alignment took %ss" % (time.time() - start))
    start = time.time()
    multipeptides = analyze_multipeptides(
        new_exp, multipeptides, swath_chromatograms, tr_data,
        options.border_option, rid, tree=tree_mapped, mat=dist_matrix,
        disable_isotopic_transfer=options.disable_isotopic_transfer,
        is_test=is_test)
    print("Analyzing the runs took %ss" % (time.time() - start))

    return new_exp, multipeptides
def setUp(self):
    # Fixture: five mock runs, three helper peptides present in every run and
    # one multipeptide ("precursor_1") with several candidate peakgroups.
    import msproteomicstoolslib.data_structures.Precursor as precursor
    import msproteomicstoolslib.data_structures.PrecursorGroup as precursor_group
    import msproteomicstoolslib.format.TransformationCollection as transformations
    from msproteomicstoolslib.algorithms.alignment.SplineAligner import SplineAligner
    import msproteomicstoolslib.algorithms.alignment.AlignmentHelper as helper

    def make_group(name, run, peakgroup_tuples):
        # One precursor holding the given peakgroups, wrapped in its group
        prec = precursor.Precursor(name, run)
        for pg_tuple in peakgroup_tuples:
            prec.add_peakgroup_tpl(pg_tuple, name, -1)
        group = precursor_group.PrecursorGroup(prec.get_id(), run)
        group.addPrecursor(prec)
        return group

    # Peakgroup tuples are laid out as:
    # 0. id
    # 1. quality score (FDR)
    # 2. retention time (normalized)
    # 3. intensity
    mpeps = [Multipeptide() for i in range(3)]
    for mpep in mpeps:
        mpep.set_nr_runs(5)

    # Parameters
    self.initial_alignment_cutoff = 0.001

    runs = [MockRun("0_%s" % (i + 1)) for i in range(5)]

    # (name, RT in the first run, RT increment per run): the first two are
    # the alignment peptides, the third one is the noise peptide.
    helper_peptides = (
        ("anchorpeptide_1", 100, 10),
        ("anchorpeptide_2", 1000, 100),
        ("anchorpeptide_3", 500, 40),
    )
    ids = 0
    for i in range(5):
        for mpep, (name, rt_first, rt_step) in zip(mpeps, helper_peptides):
            pg_tuple = ("id_%s" % ids, 0.0001, rt_first + i * rt_step, 10000)
            mpep.insert(runs[i].get_id(),
                        make_group(name, runs[i], [pg_tuple]))
            ids += 1

    # Candidate peakgroups of "precursor_1", one list per run
    target_peakgroups = [
        # Run 1: peakgroup 1 at 100 s [correct]
        [("peakgroup1", 0.01, 100, 10000)],
        # Run 2: peakgroup 2 at 112 s [correct], peakgroup 3 at 130 s
        [("peakgroup2", 0.2, 112, 10000), ("peakgroup3", 0.18, 130, 10000)],
        # Run 3: peakgroup 4 at 120 s [correct], peakgroup 5 at 130 s
        [("peakgroup4", 0.2, 120, 10000), ("peakgroup5", 0.17, 130, 10000)],
        # Run 4: the correct peakgroup (6) is missing, peakgroup 7 at 145 s
        [("peakgroup7", 0.18, 145, 10000)],
        # Run 5: peakgroup 8 at 139 s [correct], peakgroup 9 is missing
        [("peakgroup8", 0.1, 139, 10000)],
    ]
    m = Multipeptide()
    m.set_nr_runs(5)
    for run, peakgroup_tuples in zip(runs, target_peakgroups):
        m.insert(run.get_id(), make_group("precursor_1", run, peakgroup_tuples))

    self.mpep = m
    self.exp = Dummy()
    self.exp.runs = runs

    mpeps.append(m)
    self.multipeptides = mpeps

    # Align all against all
    self.tr_data = transformations.LightTransformationData()
    spl_aligner = SplineAligner(self.initial_alignment_cutoff)
    for run_0 in self.exp.runs:
        for run_1 in self.exp.runs:
            helper.addDataToTrafo(self.tr_data, run_0, run_1, spl_aligner,
                                  self.multipeptides, "linear", 30)
def doMSTAlignment(exp, multipeptides, max_rt_diff, rt_diff_isotope,
                   initial_alignment_cutoff, fdr_cutoff, aligned_fdr_cutoff,
                   smoothing_method, method, use_RT_correction,
                   stdev_max_rt_per_run, use_local_stdev, mst_use_ref, force,
                   optimized_cython):
    """Minimum Spanning Tree (MST) based local alignment.

    Builds a tree over the runs of ``exp``, computes one transformation per
    tree edge and then runs ``TreeConsensusAlignment`` over ``multipeptides``.

    Args:
        exp: experiment whose ``runs`` are aligned; ``nr_ambiguous`` and
            ``nr_multiple_align`` are stored on it afterwards.
        multipeptides: multipeptides handed to the distance matrix, the
            transformation computation and the consensus alignment.
        max_rt_diff: forwarded to ``addDataToTrafo`` and
            ``TreeConsensusAlignment``.
        rt_diff_isotope, fdr_cutoff, aligned_fdr_cutoff, use_RT_correction,
            stdev_max_rt_per_run, use_local_stdev: forwarded to
            ``TreeConsensusAlignment``.
        initial_alignment_cutoff: forwarded to ``SplineAligner``.
        smoothing_method, force: forwarded to ``addDataToTrafo``.
        method: "LocalMST" selects the best-cluster alignment,
            "LocalMSTAllCluster" selects ``alignAllCluster``; any other value
            performs no alignment call.
        mst_use_ref: if true, skip the MST and connect every run directly to
            the run returned by ``SplineAligner._determine_best_run``.
        optimized_cython: if true, use ``CyLightTransformationData`` (when it
            can be imported) and ``alignBestCluster``; otherwise use the
            Python transformation data and ``alignBestCluster_legacy``.

    Returns:
        The tree as a list of ``(i, j)`` index pairs into ``exp.runs``.
    """
    spl_aligner = SplineAligner(initial_alignment_cutoff, experiment=exp)

    if mst_use_ref:
        # Force reference-based alignment: every run is linked straight to
        # the best run, which is looked up a single time.
        ref = spl_aligner._determine_best_run(exp).get_id()
        refrun_id = [i for i, run in enumerate(exp.runs)
                     if run.get_id() == ref][0]
        tree = [(i, refrun_id) for i in range(len(exp.runs))
                if i != refrun_id]
    else:
        start = time.time()
        tree = MinimumSpanningTree(
            getDistanceMatrix(exp, multipeptides, spl_aligner))
        print("Computing tree took %0.2fs" % (time.time() - start))

    print("Computed Tree:", tree)

    # Get alignments
    start = time.time()
    try:
        from msproteomicstoolslib.cython._optimized import CyLightTransformationData
        if optimized_cython:
            tr_data = CyLightTransformationData()
        else:
            tr_data = LightTransformationData()
    except ImportError:
        print("WARNING: cannot import CyLightTransformationData, will use Python version (slower).")
        tr_data = LightTransformationData()

    # One transformation per tree edge
    for edge in tree:
        addDataToTrafo(tr_data, exp.runs[edge[0]], exp.runs[edge[1]],
                       spl_aligner, multipeptides, smoothing_method,
                       max_rt_diff, force=force)

    # The consensus alignment addresses runs by id, not by index
    tree_mapped = [(exp.runs[a].get_id(), exp.runs[b].get_id())
                   for a, b in tree]
    print("Computing transformations for all edges took %0.2fs" %
          (time.time() - start))

    # Perform work
    al = TreeConsensusAlignment(max_rt_diff, fdr_cutoff, aligned_fdr_cutoff,
                                rt_diff_isotope=rt_diff_isotope,
                                correctRT_using_pg=use_RT_correction,
                                stdev_max_rt_per_run=stdev_max_rt_per_run,
                                use_local_stdev=use_local_stdev)

    if method == "LocalMST":
        if optimized_cython:
            al.alignBestCluster(multipeptides, tree_mapped, tr_data)
        else:
            print("WARNING: cannot utilize optimized MST alignment (needs readmethod = cminimal), will use Python version (slower).")
            al.alignBestCluster_legacy(multipeptides, tree_mapped, tr_data)
    elif method == "LocalMSTAllCluster":
        al.alignAllCluster(multipeptides, tree_mapped, tr_data)

    # Store number of ambiguous cases (e.g. where more than one peakgroup below
    # the strict quality cutoff was found in the RT window) and the number of
    # cases where multiple possibilities were found.
    exp.nr_ambiguous = al.nr_ambiguous
    exp.nr_multiple_align = al.nr_multiple_align
    return tree