def test_reference_2(self):
    """Reference-based integration borders for run 0_1 with 0_0 as reference.

    With the reference run set to 0_0, the expected borders are obtained by
    chaining two transformations: first 0_2 -> 0_0, then 0_0 -> 0_1.
    """
    target_run = "0_1"
    self.tr_data.reference = "0_0"  # use 0_0 as the reference run

    aligner = SplineAligner(self.initial_alignment_cutoff)
    distances = getDistanceMatrix(self.new_exp, self.multipeptides, aligner)
    tree = MinimumSpanningTree(distances)
    tree_mapped = [
        (self.new_exp.runs[first].get_id(), self.new_exp.runs[second].get_id())
        for first, second in tree
    ]

    # Collect every peakgroup that belongs to cluster 1.
    selected_pg = []
    for peptide in self.current_mpep1.getAllPeptides():
        for peakgroup in peptide.get_all_peakgroups():
            if peakgroup.get_cluster_id() == 1:
                selected_pg.append(peakgroup)

    border_l, border_r = integrationBorderReference(
        self.new_exp, selected_pg, target_run, self.tr_data, "median")

    def via_reference(rt):
        # Map an RT from 0_2 into the reference 0_0 and from there into 0_1.
        ref_to_target = self.tr_data.getTrafo("0_0", "0_1")
        in_reference = self.tr_data.getTrafo("0_2", "0_0").predict([rt])
        return ref_to_target.predict(in_reference)

    self.assertAlmostEqual(border_l, via_reference(240.0))
    self.assertAlmostEqual(border_r, via_reference(260.0))

    # Pinned numerical values of the two borders.
    self.assertAlmostEqual(border_l, 187.18146718146681)
    self.assertAlmostEqual(border_r, 202.00772200772167)
def test_alignAllCluster_1(self):
    """Align all clusters with peakgroup-based RT correction enabled.

    Best-case scenario with a tight retention time tolerance, where the
    alignment thread is corrected by each peakgroup that is found:

        - Run1 : 100s     [threadRT = 100s]
        - Run2 : 112s     [threadRT = 106s]
        - Run3 : 120s     [threadRT = 118s]
        - Run4 : xxx      [threadRT = 126s] (should be around 130s)
        - Run5 : 139s     [threadRT = 133s]
    """
    aligner = SplineAligner(self.initial_alignment_cutoff)
    distances = algo.getDistanceMatrix(self.exp, self.multipeptides, aligner)
    tree = MinimumSpanningTree(distances)
    tree_mapped = []
    for first, second in tree:
        tree_mapped.append(
            (self.exp.runs[first].get_id(), self.exp.runs[second].get_id()))

    alignment = algo.TreeConsensusAlignment(
        max_rt_diff=6,
        fdr_cutoff=0.1,
        aligned_fdr_cutoff=0.25,
        correctRT_using_pg=True,
        verbose=True,
    )
    alignment.alignAllCluster(self.multipeptides, tree_mapped, self.tr_data)

    # Exactly four peakgroups are expected to be selected ...
    prec1 = self.mpep
    self.assertEqual(len(prec1.get_selected_peakgroups()), 4)

    # ... and they have to be peakgroups 1, 2, 4 and 8.
    expected_ids = {'peakgroup8', 'peakgroup2', 'peakgroup4', 'peakgroup1'}
    selected_ids = {pg.get_feature_id() for pg in prec1.get_selected_peakgroups()}
    self.assertEqual(expected_ids, selected_ids)
def test_alignBestCluster_1(self):
    """Best-cluster (legacy) alignment without peakgroup-based RT correction.

    The alignment thread is not corrected by the peakgroups that are found.
    After the second peakgroup is found at 112 s, the search RT for run 2
    therefore stays at 106 s, which maps to 112 s in run 3. The next
    peakgroup sits at 120 s, which is outside the tolerance, so the thread
    stops there.
    """
    aligner = SplineAligner(self.initial_alignment_cutoff)
    distances = algo.getDistanceMatrix(self.exp, self.multipeptides, aligner)
    tree = MinimumSpanningTree(distances)
    tree_mapped = []
    for first, second in tree:
        tree_mapped.append(
            (self.exp.runs[first].get_id(), self.exp.runs[second].get_id()))

    alignment = algo.TreeConsensusAlignment(
        max_rt_diff=6,
        fdr_cutoff=0.1,
        aligned_fdr_cutoff=0.25,
        correctRT_using_pg=False,
    )
    alignment.alignBestCluster_legacy(self.multipeptides, tree_mapped, self.tr_data)

    # Without thread correction only two peakgroups get selected ...
    prec1 = self.mpep
    self.assertEqual(len(prec1.get_selected_peakgroups()), 2)

    # ... namely peakgroups 1 and 2.
    expected_ids = {'peakgroup2', 'peakgroup1'}
    selected_ids = {pg.get_feature_id() for pg in prec1.get_selected_peakgroups()}
    self.assertEqual(expected_ids, selected_ids)
def test_reference_1(self):
    """Reference-based integration borders for run 0_0 with 0_2 as reference.

    With 0_2 as the reference run, the borders in 0_0 are expected to match
    a single, direct 0_2 -> 0_0 transformation. All supported border options
    must yield the same values and an unknown option must raise.
    """
    target_run = "0_0"
    self.tr_data.reference = "0_2"  # use 0_2 as the reference run

    aligner = SplineAligner(self.initial_alignment_cutoff)
    distances = getDistanceMatrix(self.new_exp, self.multipeptides, aligner)
    tree = MinimumSpanningTree(distances)
    tree_mapped = [
        (self.new_exp.runs[first].get_id(), self.new_exp.runs[second].get_id())
        for first, second in tree
    ]

    # Collect every peakgroup that belongs to cluster 1.
    selected_pg = []
    for peptide in self.current_mpep1.getAllPeptides():
        for peakgroup in peptide.get_all_peakgroups():
            if peakgroup.get_cluster_id() == 1:
                selected_pg.append(peakgroup)

    expected_left = 77.992277992277934
    expected_right = 84.1698841699

    def direct(rt):
        # Map an RT straight from 0_2 to 0_0.
        return self.tr_data.getTrafo("0_2", "0_0").predict([rt])[0]

    border_l, border_r = integrationBorderReference(
        self.new_exp, selected_pg, target_run, self.tr_data, "median")
    self.assertAlmostEqual(border_l, direct(240.0))
    self.assertAlmostEqual(border_r, direct(260.0))
    self.assertAlmostEqual(border_l, expected_left)
    self.assertAlmostEqual(border_r, expected_right)

    # The remaining border options have to agree with the median result.
    for border_option in ("mean", "max_width"):
        border_l, border_r = integrationBorderReference(
            self.new_exp, selected_pg, target_run, self.tr_data, border_option)
        self.assertAlmostEqual(border_l, expected_left)
        self.assertAlmostEqual(border_r, expected_right)

    # An unknown border option is rejected.
    self.assertRaises(
        Exception, integrationBorderReference,
        self.new_exp, selected_pg, target_run, self.tr_data, "dummy")
def doMSTAlignment(exp, multipeptides, max_rt_diff, rt_diff_isotope,
                   initial_alignment_cutoff, fdr_cutoff, aligned_fdr_cutoff,
                   smoothing_method, method, use_RT_correction,
                   stdev_max_rt_per_run, use_local_stdev, mst_use_ref, force):
    """Minimum Spanning Tree (MST) based local alignment.

    Builds a tree over the runs of ``exp``, computes one transformation per
    tree edge and then runs a TreeConsensusAlignment over the multipeptides.

    Args:
        exp: experiment whose ``runs`` are aligned; ``nr_ambiguous`` and
            ``nr_multiple_align`` are written back onto it.
        multipeptides: multipeptides to align (modified by the alignment).
        max_rt_diff: passed to addDataToTrafo and TreeConsensusAlignment.
        rt_diff_isotope: passed to TreeConsensusAlignment.
        initial_alignment_cutoff: passed to SplineAligner.
        fdr_cutoff: passed to TreeConsensusAlignment.
        aligned_fdr_cutoff: passed to TreeConsensusAlignment.
        smoothing_method: passed to addDataToTrafo.
        method: "LocalMST" (align best cluster) or "LocalMSTAllCluster"
            (align all clusters); any other value performs no alignment.
        use_RT_correction: passed to TreeConsensusAlignment as
            ``correctRT_using_pg``.
        stdev_max_rt_per_run: passed to TreeConsensusAlignment.
        use_local_stdev: passed to TreeConsensusAlignment.
        mst_use_ref: if true, connect every run directly to the best run
            instead of computing a minimum spanning tree.
        force: passed to addDataToTrafo.

    Returns:
        The tree as a list of ``(i, j)`` index pairs into ``exp.runs``.
    """
    spl_aligner = SplineAligner(initial_alignment_cutoff, experiment=exp)

    if mst_use_ref:
        # Force reference-based alignment: a star-shaped tree where every
        # run is connected to the reference (best) run. The best run is
        # determined only once.
        ref = spl_aligner._determine_best_run(exp).get_id()
        refrun_id = [i for i, run in enumerate(exp.runs)
                     if run.get_id() == ref][0]
        tree = [(i, refrun_id) for i in range(len(exp.runs))
                if i != refrun_id]
    else:
        tree = MinimumSpanningTree(
            getDistanceMatrix(exp, multipeptides, spl_aligner))

    print("Computed Tree:", tree)

    # Get alignments: one transformation per edge of the tree
    tr_data = LightTransformationData()
    for edge in tree:
        addDataToTrafo(tr_data, exp.runs[edge[0]], exp.runs[edge[1]],
                       spl_aligner, multipeptides, smoothing_method,
                       max_rt_diff, force=force)

    # The alignment works on run ids rather than on indices into exp.runs
    tree_mapped = [(exp.runs[a].get_id(), exp.runs[b].get_id())
                   for a, b in tree]

    # Perform work
    al = TreeConsensusAlignment(max_rt_diff, fdr_cutoff, aligned_fdr_cutoff,
                                rt_diff_isotope=rt_diff_isotope,
                                correctRT_using_pg=use_RT_correction,
                                stdev_max_rt_per_run=stdev_max_rt_per_run,
                                use_local_stdev=use_local_stdev)

    if method == "LocalMST":
        al.alignBestCluster(multipeptides, tree_mapped, tr_data)
    elif method == "LocalMSTAllCluster":
        al.alignAllCluster(multipeptides, tree_mapped, tr_data)

    # Store number of ambiguous cases (e.g. where more than one peakgroup
    # below the strict quality cutoff was found in the RT window) and the
    # number of cases where multiple possibilities were found.
    exp.nr_ambiguous = al.nr_ambiguous
    exp.nr_multiple_align = al.nr_multiple_align

    return tree
def test_shortestPath_2(self):
    """Shortest-path integration borders for run 0_1.

    The expected borders correspond to a direct 0_2 -> 0_1 transformation.
    """
    target_run = "0_1"

    aligner = SplineAligner(self.initial_alignment_cutoff)
    distances = getDistanceMatrix(self.new_exp, self.multipeptides, aligner)
    tree = MinimumSpanningTree(distances)
    tree_mapped = []
    for first, second in tree:
        tree_mapped.append(
            (self.new_exp.runs[first].get_id(), self.new_exp.runs[second].get_id()))

    # Collect every peakgroup that belongs to cluster 1.
    selected_pg = []
    for peptide in self.current_mpep1.getAllPeptides():
        for peakgroup in peptide.get_all_peakgroups():
            if peakgroup.get_cluster_id() == 1:
                selected_pg.append(peakgroup)

    border_l, border_r = integrationBorderShortestPath(
        selected_pg, target_run, self.tr_data, tree_mapped)

    def direct(rt):
        # Map an RT straight from 0_2 to 0_1.
        return self.tr_data.getTrafo("0_2", "0_1").predict([rt])

    self.assertAlmostEqual(border_l, direct(240.0))
    self.assertAlmostEqual(border_r, direct(260.0))

    # Pinned numerical values of the two borders.
    self.assertAlmostEqual(border_l, 168.03088803088787)
    self.assertAlmostEqual(border_r, 183.32046332046318)
def doMSTAlignment(exp, multipeptides, max_rt_diff, rt_diff_isotope,
                   initial_alignment_cutoff, fdr_cutoff, aligned_fdr_cutoff,
                   smoothing_method, method, use_RT_correction,
                   stdev_max_rt_per_run, use_local_stdev):
    """Minimum Spanning Tree (MST) based local alignment.

    Computes a minimum spanning tree over the runs of ``exp``, derives one
    transformation per tree edge and then runs a TreeConsensusAlignment over
    the multipeptides.

    Args:
        exp: experiment whose ``runs`` are aligned; ``nr_ambiguous`` and
            ``nr_multiple_align`` are written back onto it.
        multipeptides: multipeptides to align (modified by the alignment).
        max_rt_diff: passed to addDataToTrafo and TreeConsensusAlignment.
        rt_diff_isotope: passed to TreeConsensusAlignment.
        initial_alignment_cutoff: passed to SplineAligner.
        fdr_cutoff: passed to TreeConsensusAlignment.
        aligned_fdr_cutoff: passed to TreeConsensusAlignment.
        smoothing_method: passed to addDataToTrafo.
        method: "LocalMST" (align best cluster) or "LocalMSTAllCluster"
            (align all clusters); any other value performs no alignment.
        use_RT_correction: passed to TreeConsensusAlignment as
            ``correctRT_using_pg``.
        stdev_max_rt_per_run: passed to TreeConsensusAlignment.
        use_local_stdev: passed to TreeConsensusAlignment.
    """
    spl_aligner = SplineAligner(initial_alignment_cutoff)
    tree = MinimumSpanningTree(
        getDistanceMatrix(exp, multipeptides, spl_aligner))

    # Single-argument print call: emits the same text whether or not
    # print is a statement (Python 2) or a function (Python 3).
    print("Computed Tree: %s" % (tree,))

    # Get alignments: one transformation per edge of the tree
    tr_data = LightTransformationData()
    for edge in tree:
        addDataToTrafo(tr_data, exp.runs[edge[0]], exp.runs[edge[1]],
                       spl_aligner, multipeptides, smoothing_method,
                       max_rt_diff)

    # The alignment works on run ids rather than on indices into exp.runs
    tree_mapped = [(exp.runs[a].get_id(), exp.runs[b].get_id())
                   for a, b in tree]

    # Perform work
    al = TreeConsensusAlignment(max_rt_diff, fdr_cutoff, aligned_fdr_cutoff,
                                rt_diff_isotope=rt_diff_isotope,
                                correctRT_using_pg=use_RT_correction,
                                stdev_max_rt_per_run=stdev_max_rt_per_run,
                                use_local_stdev=use_local_stdev)

    if method == "LocalMST":
        al.alignBestCluster(multipeptides, tree_mapped, tr_data)
    elif method == "LocalMSTAllCluster":
        al.alignAllCluster(multipeptides, tree_mapped, tr_data)

    # Store number of ambiguous cases (e.g. where more than one peakgroup
    # below the strict quality cutoff was found in the RT window) and the
    # number of cases where multiple possibilities were found.
    exp.nr_ambiguous = al.nr_ambiguous
    exp.nr_multiple_align = al.nr_multiple_align
def test_alignBestCluster_2(self):
    """Best-cluster (legacy) alignment without RT correction, wide tolerance.

    The alignment thread is not corrected by the peakgroups that are found:

        - Run1 : 100s     [threadRT = 100s]
        - Run2 : 112s     [threadRT = 106s]
        - Run3 : 120s     [threadRT = 112s]
        - Run4 : xxx      [threadRT = 118s]
        - Run5 : 139s     [threadRT = 124s]

    With the larger tolerance of 15s all correct peakgroups are still found.
    """
    aligner = SplineAligner(self.initial_alignment_cutoff)
    distances = algo.getDistanceMatrix(self.exp, self.multipeptides, aligner)
    tree = MinimumSpanningTree(distances)
    tree_mapped = []
    for first, second in tree:
        tree_mapped.append(
            (self.exp.runs[first].get_id(), self.exp.runs[second].get_id()))

    alignment = algo.TreeConsensusAlignment(
        max_rt_diff=15,
        fdr_cutoff=0.1,
        aligned_fdr_cutoff=0.25,
        correctRT_using_pg=False,
    )
    alignment.alignBestCluster_legacy(self.multipeptides, tree_mapped, self.tr_data)

    # All four peakgroups should be selected ...
    prec1 = self.mpep
    self.assertEqual(len(prec1.get_selected_peakgroups()), 4)

    # ... namely peakgroups 1, 2, 4 and 8.
    expected_ids = {'peakgroup8', 'peakgroup2', 'peakgroup4', 'peakgroup1'}
    selected_ids = {pg.get_feature_id() for pg in prec1.get_selected_peakgroups()}
    self.assertEqual(expected_ids, selected_ids)
def test_prepare(self):
    """The minimum spanning tree of the test experiment has the expected edges."""
    aligner = SplineAligner(self.initial_alignment_cutoff)
    distances = algo.getDistanceMatrix(self.exp, self.multipeptides, aligner)
    expected_edges = [(3, 4), (2, 3), (1, 2), (0, 1)]
    self.assertEqual(MinimumSpanningTree(distances), expected_edges)
def runSingleFileImputation(options, peakgroups_file, mzML_file, method, is_test):
    """Impute values across chromatograms for a single chromatogram file.

    This function reads the csv file with all peakgroups as well as the
    provided chromatogram file (.chrom.mzML). It then tries to impute missing
    values for those peakgroups where no value is currently present, reading
    the raw chromatograms.

    Args:
        options: parsed options; this function reads ``file_format``,
            ``disable_isotopic_grouping``, ``realign_method``,
            ``border_option`` and ``disable_isotopic_transfer``.
        peakgroups_file(filename): CSV file containing all peakgroups
        mzML_file(filename): mzML file containing chromatograms
        method(str): "singleClosestRun" (transformations from every run to
            the run of the chromatogram file) or "singleShortestPath"
            (transformations along the edges of a minimum spanning tree).
        is_test: passed through to analyze_multipeptides.

    Returns:
        A tuple of:
            new_exp(AlignmentExperiment): experiment containing the aligned peakgroups
            multipeptides(list(AlignmentHelper.Multipeptide)): list of multipeptides

    Raises:
        Exception: if ``method`` is not one of the two supported values.
    """
    # We do not want to exclude any peakgroups for noiseIntegration (we
    # assume that alignment has already happened)
    fdr_cutoff_all_pg = 1.0

    start = time.time()
    reader = SWATHScoringReader.newReader(
        [peakgroups_file], options.file_format, readmethod="complete",
        enable_isotopic_grouping=not options.disable_isotopic_grouping)
    new_exp = Experiment()
    new_exp.runs = reader.parse_files()
    multipeptides = new_exp.get_all_multipeptides(fdr_cutoff_all_pg, verbose=False)
    print("Parsing the peakgroups file took %ss" % (time.time() - start))

    mapping = {}
    precursors_mapping = {}
    sequences_mapping = {}
    protein_mapping = {}
    inferMapping([mzML_file], [peakgroups_file], mapping, precursors_mapping,
                 sequences_mapping, protein_mapping, verbose=False)
    # Invert the mapping: first element of each value -> key.
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    mapping_inv = {v[0]: k for k, v in mapping.items()}
    if VERBOSE:
        print(mapping)

    # Do only a single run : read only one single file
    start = time.time()
    swath_chromatograms = SwathChromatogramCollection()
    swath_chromatograms.parseFromMzML([mzML_file], mapping_inv)
    print("Reading the chromatogram files took %ss" % (time.time() - start))
    run_ids = swath_chromatograms.getRunIDs()
    assert len(run_ids) == 1
    rid = run_ids[0]

    start = time.time()
    initial_alignment_cutoff = 0.0001
    max_rt_diff = 30
    sd_data = -1  # We do not use the standard deviation data in this algorithm
    tr_data = transformations.LightTransformationData()
    spl_aligner = SplineAligner(initial_alignment_cutoff)

    if method == "singleClosestRun":
        tree_mapped = None

        run_1 = [r for r in new_exp.runs if r.get_id() == rid][0]
        dist_matrix = getDistanceMatrix(new_exp, multipeptides, spl_aligner,
                                        singleRowId=run_1.get_id())
        print("Distance matrix took %ss" % (time.time() - start))

        start = time.time()
        # One transformation from every run to the run of the mzML file
        for run_0 in new_exp.runs:
            helper.addDataToTrafo(tr_data, run_0, run_1, spl_aligner,
                                  multipeptides, options.realign_method,
                                  max_rt_diff, sd_max_data_length=sd_data)

    elif method == "singleShortestPath":
        dist_matrix = None

        tree = MinimumSpanningTree(
            getDistanceMatrix(new_exp, multipeptides, spl_aligner))
        tree_mapped = [(new_exp.runs[a].get_id(), new_exp.runs[b].get_id())
                       for a, b in tree]
        print("Distance matrix took %ss" % (time.time() - start))

        start = time.time()
        # One transformation per edge of the minimum spanning tree
        for edge in tree:
            helper.addDataToTrafo(tr_data, new_exp.runs[edge[0]],
                                  new_exp.runs[edge[1]], spl_aligner,
                                  multipeptides, options.realign_method,
                                  max_rt_diff, sd_max_data_length=sd_data)

    else:
        raise Exception("Unknown method: " + method)

    print("Alignment took %ss" % (time.time() - start))
    start = time.time()
    multipeptides = analyze_multipeptides(
        new_exp, multipeptides, swath_chromatograms, tr_data,
        options.border_option, rid, tree=tree_mapped, mat=dist_matrix,
        disable_isotopic_transfer=options.disable_isotopic_transfer,
        is_test=is_test)
    print("Analyzing the runs took %ss" % (time.time() - start))

    return new_exp, multipeptides
def doMSTAlignment(exp, multipeptides, max_rt_diff, rt_diff_isotope,
                   initial_alignment_cutoff, fdr_cutoff, aligned_fdr_cutoff,
                   smoothing_method, method, use_RT_correction,
                   stdev_max_rt_per_run, use_local_stdev, mst_use_ref, force,
                   optimized_cython):
    """Minimum Spanning Tree (MST) based local alignment.

    Builds a tree over the runs of ``exp``, computes one transformation per
    tree edge and then runs a TreeConsensusAlignment over the multipeptides.
    Timing information for the individual steps is printed.

    Args:
        exp: experiment whose ``runs`` are aligned; ``nr_ambiguous`` and
            ``nr_multiple_align`` are written back onto it.
        multipeptides: multipeptides to align (modified by the alignment).
        max_rt_diff: passed to addDataToTrafo and TreeConsensusAlignment.
        rt_diff_isotope: passed to TreeConsensusAlignment.
        initial_alignment_cutoff: passed to SplineAligner.
        fdr_cutoff: passed to TreeConsensusAlignment.
        aligned_fdr_cutoff: passed to TreeConsensusAlignment.
        smoothing_method: passed to addDataToTrafo.
        method: "LocalMST" (align best cluster) or "LocalMSTAllCluster"
            (align all clusters); any other value performs no alignment.
        use_RT_correction: passed to TreeConsensusAlignment as
            ``correctRT_using_pg``.
        stdev_max_rt_per_run: passed to TreeConsensusAlignment.
        use_local_stdev: passed to TreeConsensusAlignment.
        mst_use_ref: if true, connect every run directly to the best run
            instead of computing a minimum spanning tree.
        force: passed to addDataToTrafo.
        optimized_cython: if true, use CyLightTransformationData (when it
            can be imported) and alignBestCluster; otherwise use
            LightTransformationData and alignBestCluster_legacy.

    Returns:
        The tree as a list of ``(i, j)`` index pairs into ``exp.runs``.
    """
    spl_aligner = SplineAligner(initial_alignment_cutoff, experiment=exp)

    if mst_use_ref:
        # Force reference-based alignment: a star-shaped tree where every
        # run is connected to the reference (best) run. The best run is
        # determined only once.
        ref = spl_aligner._determine_best_run(exp).get_id()
        refrun_id = [i for i, run in enumerate(exp.runs)
                     if run.get_id() == ref][0]
        tree = [(i, refrun_id) for i in range(len(exp.runs))
                if i != refrun_id]
    else:
        start = time.time()
        tree = MinimumSpanningTree(
            getDistanceMatrix(exp, multipeptides, spl_aligner))
        print("Computing tree took %0.2fs" % (time.time() - start))

    print("Computed Tree:", tree)

    # Get alignments
    start = time.time()
    try:
        from msproteomicstoolslib.cython._optimized import CyLightTransformationData
    except ImportError:
        print("WARNING: cannot import CyLightTransformationData, will use Python version (slower).")
        tr_data = LightTransformationData()
    else:
        if optimized_cython:
            tr_data = CyLightTransformationData()
        else:
            tr_data = LightTransformationData()

    # One transformation per edge of the tree
    for edge in tree:
        addDataToTrafo(tr_data, exp.runs[edge[0]], exp.runs[edge[1]],
                       spl_aligner, multipeptides, smoothing_method,
                       max_rt_diff, force=force)

    # The alignment works on run ids rather than on indices into exp.runs
    tree_mapped = [(exp.runs[a].get_id(), exp.runs[b].get_id())
                   for a, b in tree]
    print("Computing transformations for all edges took %0.2fs" % (time.time() - start))

    # Perform work
    al = TreeConsensusAlignment(max_rt_diff, fdr_cutoff, aligned_fdr_cutoff,
                                rt_diff_isotope=rt_diff_isotope,
                                correctRT_using_pg=use_RT_correction,
                                stdev_max_rt_per_run=stdev_max_rt_per_run,
                                use_local_stdev=use_local_stdev)

    if method == "LocalMST":
        if optimized_cython:
            # NOTE(review): if the import above failed, tr_data is the pure
            # Python implementation here -- confirm alignBestCluster
            # accepts it.
            al.alignBestCluster(multipeptides, tree_mapped, tr_data)
        else:
            print("WARNING: cannot utilize optimized MST alignment (needs readmethod = cminimal), will use Python version (slower).")
            al.alignBestCluster_legacy(multipeptides, tree_mapped, tr_data)
    elif method == "LocalMSTAllCluster":
        al.alignAllCluster(multipeptides, tree_mapped, tr_data)

    # Store number of ambiguous cases (e.g. where more than one peakgroup
    # below the strict quality cutoff was found in the RT window) and the
    # number of cases where multiple possibilities were found.
    exp.nr_ambiguous = al.nr_ambiguous
    exp.nr_multiple_align = al.nr_multiple_align

    return tree