def test_get_peptide(self):
    """Test precursor-group lookup by id and iteration over a Run.

    Fix: compare against list(range(5)) — on Python 3, a range object does
    not compare equal to a list, so assertEqual(ids, range(5)) always failed.
    """
    r = Run([], {}, "run1", "file1.txt", filename="file1.csv", aligned_filename="file1.tsv")
    r.all_precursor_groups_ = dict([(str(i), MockPrecursorGroup(i)) for i in range(5)])
    # Lookup by string key returns the group with the matching integer id
    self.assertEqual(r.getPrecursorGroup("2").id_, 2)
    # Unknown keys return None rather than raising
    self.assertIsNone(r.getPrecursorGroup("9_dummy"))
    ids = sorted([p.id_ for p in r])
    self.assertEqual(ids, list(range(5)))
def test_get_best_peaks(self):
    """get_best_peaks should return the best peak of every precursor group."""
    run = Run([], {}, "run1", "file1.txt", filename="file1.csv", aligned_filename="file1.tsv")
    run.all_precursor_groups_ = {str(idx): MockPrecursorGroup(idx) for idx in range(5)}
    expected = ["42"] * 5
    self.assertEqual(run.get_best_peaks(), expected)
def test_createRun(self):
    """Test that a freshly created Run reports its id and file names.

    Fix: removed the no-op ``self.assertTrue(True)`` — it can never fail
    and adds no coverage.
    """
    r = Run([], {}, "run1", "file1.txt", filename="file1.csv", aligned_filename="file1.tsv")
    self.assertEqual(r.get_id(), "run1")
    self.assertEqual(r.get_openswath_filename(), "file1.csv")
    self.assertEqual(r.get_aligned_filename(), "file1.tsv")
def test_get_peptide(self):
    """Precursor groups are retrievable by id, missing ids give None, and iteration yields all groups."""
    run = Run([], {}, "run1", "file1.txt", filename="file1.csv", aligned_filename="file1.tsv")
    run.all_precursor_groups_ = {str(idx): MockPrecursorGroup(idx) for idx in range(5)}
    self.assertEqual(run.getPrecursorGroup("2").id_, 2)
    self.assertIsNone(run.getPrecursorGroup("9_dummy"))
    collected = sorted(group.id_ for group in run)
    self.assertEqual(collected, list(range(5)))
def test_MSfileRunMapping(self):
    """MSfileRunMapping should pair each raw MS file name with its chromatogram file and Run."""
    from msproteomicstoolslib.data_structures.Run import Run
    feature_file = os.path.join(self.datadir_DIAlign, 'merged.osw')
    chrom0 = os.path.join(
        self.datadir_DIAlign,
        'hroest_K120808_Strep10%PlasmaBiolRepl1_R03_SW_filt.chrom.mzML')
    chrom2 = os.path.join(
        self.datadir_DIAlign,
        'hroest_K120809_Strep0%PlasmaBiolRepl2_R04_SW_filt.chrom.mzML')
    raw0 = 'data/raw/hroest_K120808_Strep10%PlasmaBiolRepl1_R03_SW_filt.mzML.gz'
    raw1 = 'data/raw/hroest_K120809_Strep0%PlasmaBiolRepl2_R04_SW_filt.mzML.gz'
    raw2 = 'data/raw/hroest_K120809_Strep10%PlasmaBiolRepl2_R04_SW_filt.mzML.gz'
    runs = [
        Run([], {}, 125704171604355508, feature_file, raw0, raw0, useCython=False),
        Run([], {}, 6752973645981403097, feature_file, raw1, raw1, useCython=False),
        Run([], {}, 2234664662238281994, feature_file, raw2, raw2, useCython=False),
    ]
    MStoFeature = mapper.MSfileRunMapping([chrom0, chrom2], runs)
    # Each raw file maps to (chromatogram file, Run) for the runs we supplied
    self.assertEqual(MStoFeature[raw0][0], chrom0)
    self.assertEqual(MStoFeature[raw0][1].get_id(), 125704171604355508)
    self.assertEqual(MStoFeature[raw1][0], chrom2)
    self.assertEqual(MStoFeature[raw1][1].get_id(), 6752973645981403097)
def setUp(self):
    """Resolve the DIAlign test-data directory and build two Runs plus their MS-file mapping."""
    here = os.path.dirname(os.path.abspath(__file__))
    self.dirname = here
    self.topdir = os.path.join(os.path.join(here, ".."), "..")
    self.datadir = os.path.join(os.path.join(self.topdir, "test"), "data")
    self.datadir_DIAlign = os.path.join(self.datadir, "DIAlign")
    # Instance attribute
    feature_file = os.path.join(self.datadir_DIAlign, 'merged.osw')
    self.chromFile0 = os.path.join(
        self.datadir_DIAlign,
        'hroest_K120808_Strep10%PlasmaBiolRepl1_R03_SW_filt.chrom.mzML')
    self.chromFile2 = os.path.join(
        self.datadir_DIAlign,
        'hroest_K120809_Strep10%PlasmaBiolRepl2_R04_SW_filt.chrom.mzML')
    raw0 = 'data/raw/hroest_K120808_Strep10%PlasmaBiolRepl1_R03_SW_filt.mzML.gz'
    raw2 = 'data/raw/hroest_K120809_Strep10%PlasmaBiolRepl2_R04_SW_filt.mzML.gz'
    run_a = Run([], {}, 125704171604355508, feature_file, raw0, raw0, useCython=False)
    run_b = Run([], {}, 2234664662238281994, feature_file, raw2, raw2, useCython=False)
    self.runs = [run_a, run_b]
    # Map raw MS file name -> (chromatogram file, Run)
    self.MStoFeature = {
        raw0: (self.chromFile0, run_a),
        raw2: (self.chromFile2, run_b),
    }
def test_updateRetentionTime(self):
    """updateRetentionTime should map a peakgroup RT through the alignment vectors."""
    run = Run([], {}, 125704171604355508, 'merged.osw', 'file.mzML.gz', 'file.mzML.gz',
              useCython=False)
    precursor = Precursor(self.trgr_id, run)
    run.addPrecursor(precursor, self.peptide_group_label)
    # One peakgroup at RT 1.47 (fdr 0.001, intensity 3000)
    run.getPrecursor(self.peptide_group_label, self.trgr_id).add_peakgroup_tpl(
        (364283, 0.001, 1.47, 3000), self.trgr_id, cluster_id=-1)
    t_ref = np.array([np.nan, 21.1, 21.2, 21.3, 21.35, 21.4,
                      21.5, 21.55, 21.6, 21.7, 21.8, np.nan])
    t_eXp = np.array([np.nan, 1.1, 1.2, 1.3, 1.35, 1.4,
                      1.5, 1.55, 1.6, 1.7, 1.8, np.nan])
    chromAlign.updateRetentionTime(run, self.peptide_group_label, self.trgr_id,
                                   t_ref, t_eXp)
    # RT 1.47 falls nearest 1.5 in t_eXp, which aligns to 21.5 in t_ref
    self.assertEqual(
        run.getPrecursor(self.peptide_group_label, self.trgr_id).peakgroups_,
        [(364283, 0.001, 21.5, 3000, None)])
def test_getMapping(self):
    """getMapping should map each feature file to Run objects matching its chromatogram files.

    Fix: removed the ``run0``/``run1`` locals that were constructed but never
    used by any assertion (dead code; the expected mapping is documented in
    the comment below instead).
    """
    filename = os.path.join(self.datadir_DIAlign, 'merged.osw')
    chromFile1 = os.path.join(
        self.datadir_DIAlign,
        'hroest_K120808_Strep10%PlasmaBiolRepl1_R03_SW_filt.chrom.mzML')
    chromFile2 = os.path.join(
        self.datadir_DIAlign,
        'hroest_K120809_Strep0%PlasmaBiolRepl2_R04_SW_filt.chrom.mzML')
    chromatogramFiles = [chromFile1, chromFile2]
    featureFiles = [filename]
    featureFiles_chromFiles_map = reader.getMapping(chromatogramFiles, featureFiles)
    # Expected: featureFiles_chromFiles_map == {filename: [run0, run1]} where the
    # two Runs correspond to the two raw .mzML.gz files asserted below.
    self.assertIsInstance(featureFiles_chromFiles_map[filename][0], Run)
    self.assertEqual(
        featureFiles_chromFiles_map[filename][0].get_openswath_filename(),
        "data/raw/hroest_K120808_Strep10%PlasmaBiolRepl1_R03_SW_filt.mzML.gz")
    self.assertEqual(featureFiles_chromFiles_map[filename][0].get_id(),
                     125704171604355508)
    self.assertEqual(
        featureFiles_chromFiles_map[filename][1].get_original_filename(), filename)
    self.assertEqual(
        featureFiles_chromFiles_map[filename][1].get_aligned_filename(),
        "data/raw/hroest_K120809_Strep0%PlasmaBiolRepl2_R04_SW_filt.mzML.gz")
def test_getRunfromFeatureFile(self):
    """getRunfromFeatureFile should return all Runs stored in a feature file.

    Fix: removed the ``run0``/``run1``/``run2`` locals that were constructed
    but never used by any assertion (dead code; the expected mapping is
    documented in the comment below instead).
    """
    filename = os.path.join(self.datadir_DIAlign, 'merged.osw')
    fileMapping = reader.getRunfromFeatureFile([filename])
    # Expected: fileMapping == {filename: [run0, run1, run2]} for the three
    # runs recorded in merged.osw, asserted individually below.
    self.assertIsInstance(fileMapping[filename][0], Run)
    self.assertEqual(
        fileMapping[filename][0].get_openswath_filename(),
        'data/raw/hroest_K120808_Strep10%PlasmaBiolRepl1_R03_SW_filt.mzML.gz')
    self.assertEqual(fileMapping[filename][1].get_id(), 6752973645981403097)
    self.assertEqual(fileMapping[filename][2].get_original_filename(), filename)
    self.assertEqual(
        fileMapping[filename][2].get_aligned_filename(),
        'data/raw/hroest_K120809_Strep10%PlasmaBiolRepl2_R04_SW_filt.mzML.gz')
def parse_file(self, filename, runs, useCython):
    """Parse a whole OSW file (which may contain data from multiple runs).

    Creates one Run per row of the RUN table, appends it to ``runs`` and
    parses its features. Returns the total number of rows read.

    Fix: the sqlite3 connection was never closed (resource leak); it is now
    released in a ``finally`` block even if parsing raises.
    """
    import sqlite3
    conn = sqlite3.connect(filename)
    try:
        c = conn.cursor()

        # Retrieve and then iterate over all available runs
        query = """SELECT ID, FILENAME FROM RUN"""
        res = [row for row in c.execute(query)]

        nrows = 0
        for row in res:
            runid = row[0]
            current_run = Run([], {}, runid, filename, row[1], row[1],
                              useCython=useCython)
            runs.append(current_run)
            # _parse_file reuses the open connection for the per-run queries
            nrows += self._parse_file(filename, current_run, runid, conn)
    finally:
        conn.close()

    return nrows
def getRunfromFeatureFile(featureFiles, useCython=False):
    """
    Return as dictionary with key as feature file and value as associated Run objects.

    >>> featureFiles = ["merged.osw"]
    >>> fileMapping = getRunfromFeatureFile(featureFiles)
    >>> fileMapping = {"merged.osw": [Run0, Run1, Run2]}

    Fix: ``conn.close()`` was called twice in a row; the close now happens
    exactly once in a ``finally`` block so the connection is released even
    when the query raises.
    """
    import sqlite3
    MSfile_featureFile_mapping = {}
    for filename in featureFiles:
        MSfile_featureFile_mapping[filename] = []
        conn = sqlite3.connect(filename)
        try:
            c = conn.cursor()
            # Retrieve and then iterate over all available runs
            query = """SELECT ID, FILENAME FROM RUN"""
            results = [row for row in c.execute(query)]
            for (run_id, MS_file) in results:
                current_run = Run([], {}, run_id, filename, MS_file, MS_file,
                                  useCython=useCython)
                MSfile_featureFile_mapping[filename].append(current_run)
        except sqlite3.Error as e:
            # Best-effort: report the failure and continue with the next file
            print("An error occured in reading file " + str(filename) + ", ", e.args[0])
        finally:
            conn.close()  # Close the connection
    return MSfile_featureFile_mapping
def parse_files(self, read_exp_RT=True, verbosity=10):
    """Parse the input file(s) (CSV).

    Args:
        read_exp_RT(bool) : to read the real, experimental retention time
            (default behavior) or the delta iRT should be used instead.

    Returns:
        runs(list(SWATHScoringReader.Run))

    A single CSV file might contain more than one run and thus to create
    unique run ids, we number the runs as xx_yy where xx is the current file
    number and yy is the run found in the current file. However, if an
    alignment has already been performed and each run has already obtained a
    unique run id, we can directly use the previous alignment id.

    Fixes (Python 3 compatibility, matching the rest of this file):
    ``print`` statements -> ``print()`` calls, ``reader.next()`` ->
    ``next(reader)``, ``dict.has_key(k)`` -> ``k in dict``, gzip opened in
    text mode ('rt') so csv receives str rows, and the file handle is now
    closed after each file instead of being leaked.
    """
    print("Parsing input files")
    from sys import stdout
    import csv
    skipped = 0
    read = 0
    runs = []
    for file_nr, f in enumerate(self.infiles):
        if verbosity >= 10:
            stdout.write("\rReading %s" % str(f))
            stdout.flush()
        header_dict = {}
        if f.endswith('.gz'):
            import gzip
            # 'rt' (text mode) so csv.reader gets str rows, not bytes
            filehandler = gzip.open(f, 'rt')
        else:
            filehandler = open(f)
        reader = csv.reader(filehandler, delimiter="\t")
        header = next(reader)  # reader.next() was removed in Python 3
        for i, n in enumerate(header):
            header_dict[n] = i
        if verbosity >= 10:
            stdout.write("\rReading file %s" % (str(f)))
            stdout.flush()
        # Check if runs are already aligned (only one input file and correct header)
        already_aligned = (len(self.infiles) == 1
                           and self.aligned_run_id_name in header_dict)
        for this_row in reader:
            if already_aligned:
                runid = this_row[header_dict[self.aligned_run_id_name]]
            else:
                runnr = this_row[header_dict[self.run_id_name]]
                runid = runnr + "_" + str(file_nr)
            current_run = [r for r in runs if r.get_id() == runid]
            # check if we have a new run
            if len(current_run) == 0:
                orig_fname = None
                aligned_fname = None
                if "align_origfilename" in header_dict:
                    aligned_fname = this_row[header_dict["align_origfilename"]]
                if "filename" in header_dict:
                    orig_fname = this_row[header_dict["filename"]]
                current_run = Run(header, header_dict, runid, f,
                                  orig_fname, aligned_fname)
                runs.append(current_run)
                print(current_run, "maps to ", orig_fname)
            else:
                assert len(current_run) == 1
                current_run = current_run[0]
            if not self.readfilter(this_row, current_run.header_dict):
                skipped += 1
                continue
            read += 1
            # Unfortunately, since we are using csv, tell() will not work...
            # print "parse row at", filehandler.tell()
            self.parse_row(current_run, this_row, read_exp_RT)
        filehandler.close()  # previously leaked one handle per input file

    # Here we check that each run indeed has a unique id
    assert len(set([r.get_id() for r in runs])) == len(runs)  # each run has a unique id

    if verbosity >= 10:
        stdout.write("\r\r\n")  # clean up

    print("Found %s runs, read %s lines and skipped %s lines"
          % (len(runs), read, skipped))
    return runs
def test_get_best_peaks(self):
    """Each of the five mock precursor groups contributes its best peak ("42")."""
    r = Run([], {}, "run1", "file1.txt", filename="file1.csv", aligned_filename="file1.tsv")
    groups = dict((str(n), MockPrecursorGroup(n)) for n in range(5))
    r.all_precursor_groups_ = groups
    self.assertEqual(r.get_best_peaks(), ["42", "42", "42", "42", "42"])