def runImputeValues(options, peakgroups_file, trafo_fnames, is_test):
    """Impute values across chromatograms

    This function will read the csv file with all peakgroups as well as the
    transformation files (.tr) and the corresponding raw chromatograms which
    need to be in the same folder. It will then try to impute missing values
    for those peakgroups where no value is currently present, reading the raw
    chromatograms.

    Args:
        options: Options object. The attributes read by this function are
            file_format, disable_isotopic_grouping, realign_method,
            do_single_run, dry_run, cache_in_memory, border_option and
            disable_isotopic_transfer.
        peakgroups_file(filename): CSV file containing all peakgroups
        trafo_fnames(filename): A list of .tr filenames (it is assumed that in
            the same directory also the chromatogram mzML reside)
        is_test: Forwarded unchanged to analyze_multipeptides.

    Returns:
        A tuple of:
            new_exp: the Experiment built from the parsed peakgroups file
            multipeptides: the result of analyze_multipeptides

        When options.dry_run is set, no analysis is performed and a pair of
        empty lists ([], []) is returned instead.
    """

    # We do not want to exclude any peakgroups for noiseIntegration (we assume
    # that alignment has already happened)
    fdr_cutoff_all_pg = 1.0

    # Step 1: parse the peakgroups
    start = time.time()
    reader = SWATHScoringReader.newReader(
        [peakgroups_file],
        options.file_format,
        readmethod="complete",
        enable_isotopic_grouping=not options.disable_isotopic_grouping)
    new_exp = Experiment()
    new_exp.runs = reader.parse_files()
    multipeptides = new_exp.get_all_multipeptides(fdr_cutoff_all_pg, verbose=False)
    print("Parsing the peakgroups file took %ss" % (time.time() - start))

    # Step 2: read the transformation data
    start = time.time()
    transformation_collection_ = transformations.TransformationCollection()
    for filename in trafo_fnames:
        transformation_collection_.readTransformationData(filename)
    print("Reading the trafo file took %ss" % (time.time() - start))

    # Step 3: read the datapoints and perform the smoothing
    start = time.time()
    transformation_collection_.initialize_from_data(
        reverse=True, smoother=options.realign_method)
    print("Initializing the trafo file took %ss" % (time.time() - start))

    if options.do_single_run and not options.dry_run:
        # Do only a single run : read only one single file
        start = time.time()
        swath_chromatograms = SwathChromatogramCollection()
        swath_chromatograms.parseFromTrafoFiles([options.do_single_run])
        print("Reading the chromatogram files took %ss" % (time.time() - start))
        assert len(swath_chromatograms.getRunIDs()) == 1
        rid = swath_chromatograms.getRunIDs()[0]

        # Restart the timer so that only the analysis itself is measured
        start = time.time()
        multipeptides = analyze_multipeptides(
            new_exp, multipeptides, swath_chromatograms,
            transformation_collection_, options.border_option,
            onlyExtractFromRun=rid,
            disable_isotopic_transfer=options.disable_isotopic_transfer,
            is_test=is_test)
        print("Analyzing the runs took %ss" % (time.time() - start))
        return new_exp, multipeptides

    # Restart the timer here, otherwise the reported reading time would also
    # include the initialization of the transformations above.
    start = time.time()
    swath_chromatograms = SwathChromatogramCollection()
    swath_chromatograms.parseFromTrafoFiles(trafo_fnames)
    print("Reading the chromatogram files took %ss" % (time.time() - start))

    if options.dry_run:
        # Only report what was found, do not analyze anything.
        # (single-argument print calls produce the same output with the
        # print statement and with the print function)
        print("Dry Run only")
        print("Found multipeptides: %s" % (len(multipeptides),))
        print("Found swath chromatograms: %s" % (swath_chromatograms,))
        return [], []

    start = time.time()
    if options.cache_in_memory:
        run_ids = [r.get_id() for r in new_exp.runs]
        for rid in run_ids:
            # Create the cache for run "rid" and then only extract peakgroups from this run
            swath_chromatograms.createRunCache(rid)
            multipeptides = analyze_multipeptides(
                new_exp, multipeptides, swath_chromatograms,
                transformation_collection_, options.border_option,
                onlyExtractFromRun=rid,
                disable_isotopic_transfer=options.disable_isotopic_transfer,
                is_test=is_test)
    else:
        multipeptides = analyze_multipeptides(
            new_exp, multipeptides, swath_chromatograms,
            transformation_collection_, options.border_option,
            disable_isotopic_transfer=options.disable_isotopic_transfer,
            is_test=is_test)
    print("Analyzing the runs took %ss" % (time.time() - start))
    return new_exp, multipeptides
def runSingleFileImputation(options, peakgroups_file, mzML_file, method, is_test):
    """Impute values across chromatograms

    This function will read the csv file with all peakgroups as well as the
    provided chromatogram file (.chrom.mzML). It will then try to impute
    missing values for those peakgroups where no value is currently present,
    reading the raw chromatograms.

    Args:
        options: Options object. The attributes read by this function are
            file_format, disable_isotopic_grouping, realign_method,
            border_option and disable_isotopic_transfer.
        peakgroups_file(filename): CSV file containing all peakgroups
        mzML_file(filename): mzML file containing chromatograms
        method(str): Either "singleClosestRun" (add transformation data
            between every run and the run found in the mzML file, using a
            single-row distance matrix) or "singleShortestPath" (add
            transformation data along the edges of a minimum spanning tree
            computed from the full distance matrix).
        is_test: Forwarded unchanged to analyze_multipeptides.

    Returns:
        A tuple of:
            new_exp: the Experiment built from the parsed peakgroups file
            multipeptides: the result of analyze_multipeptides

    Raises:
        Exception: if method is not one of the two supported values.
    """

    # Fail fast on an unsupported method, before any expensive file parsing.
    # (%-formatting also copes with a non-string method such as None)
    if method not in ("singleClosestRun", "singleShortestPath"):
        raise Exception("Unknown method: %s" % (method,))

    # We do not want to exclude any peakgroups for noiseIntegration (we assume
    # that alignment has already happened)
    fdr_cutoff_all_pg = 1.0

    start = time.time()
    reader = SWATHScoringReader.newReader(
        [peakgroups_file],
        options.file_format,
        readmethod="complete",
        enable_isotopic_grouping=not options.disable_isotopic_grouping)
    new_exp = Experiment()
    new_exp.runs = reader.parse_files()
    multipeptides = new_exp.get_all_multipeptides(fdr_cutoff_all_pg, verbose=False)
    print("Parsing the peakgroups file took %ss" % (time.time() - start))

    # inferMapping fills the dictionaries passed to it; only "mapping" is
    # used afterwards.
    mapping = {}
    precursors_mapping = {}
    sequences_mapping = {}
    protein_mapping = {}
    inferMapping([mzML_file], [peakgroups_file], mapping, precursors_mapping,
                 sequences_mapping, protein_mapping, verbose=False)
    # Invert the mapping: first element of each value -> key
    mapping_inv = dict((v[0], k) for k, v in mapping.items())
    if VERBOSE:
        print(mapping)

    # Do only a single run : read only one single file
    start = time.time()
    swath_chromatograms = SwathChromatogramCollection()
    swath_chromatograms.parseFromMzML([mzML_file], mapping_inv)
    print("Reading the chromatogram files took %ss" % (time.time() - start))
    assert len(swath_chromatograms.getRunIDs()) == 1
    rid = swath_chromatograms.getRunIDs()[0]

    start = time.time()
    initial_alignment_cutoff = 0.0001
    max_rt_diff = 30
    sd_data = -1  # We do not use the standard deviation data in this algorithm
    tr_data = transformations.LightTransformationData()
    spl_aligner = SplineAligner(initial_alignment_cutoff)

    if method == "singleClosestRun":
        tree_mapped = None

        # The run whose chromatograms were loaded from the mzML file
        run_1 = [r for r in new_exp.runs if r.get_id() == rid][0]
        dist_matrix = getDistanceMatrix(new_exp, multipeptides, spl_aligner,
                                        singleRowId=run_1.get_id())
        print("Distance matrix took %ss" % (time.time() - start))

        start = time.time()
        for run_0 in new_exp.runs:
            helper.addDataToTrafo(tr_data, run_0, run_1, spl_aligner,
                                  multipeptides, options.realign_method,
                                  max_rt_diff, sd_max_data_length=sd_data)
    else:
        # method == "singleShortestPath" (validated at the top)
        dist_matrix = None

        tree = MinimumSpanningTree(
            getDistanceMatrix(new_exp, multipeptides, spl_aligner))
        # Express the tree edges as pairs of run ids instead of run indices
        tree_mapped = [(new_exp.runs[a].get_id(), new_exp.runs[b].get_id())
                       for a, b in tree]
        print("Distance matrix took %ss" % (time.time() - start))

        start = time.time()
        for edge in tree:
            helper.addDataToTrafo(tr_data, new_exp.runs[edge[0]],
                                  new_exp.runs[edge[1]], spl_aligner,
                                  multipeptides, options.realign_method,
                                  max_rt_diff, sd_max_data_length=sd_data)

    print("Alignment took %ss" % (time.time() - start))

    start = time.time()
    multipeptides = analyze_multipeptides(
        new_exp, multipeptides, swath_chromatograms, tr_data,
        options.border_option, rid, tree=tree_mapped, mat=dist_matrix,
        disable_isotopic_transfer=options.disable_isotopic_transfer,
        is_test=is_test)
    print("Analyzing the runs took %ss" % (time.time() - start))
    return new_exp, multipeptides