def test_num_ions_threshold(self, peak_list, tic):
    """
    Filter the peak list, first by removing all intensities in a peak less
    than a given relative threshold, then by removing all peaks that have
    less than a given number of ions above a given value.

    The filter is exercised twice: once with a fixed intensity cutoff and
    once with a cutoff estimated from the TIC by ``window_analyzer``.
    """

    # --- Fixed cutoff ---

    # trim by relative intensity
    pl = rel_threshold(peak_list, 2)

    # trim by threshold
    new_peak_list = num_ions_threshold(pl, 3, 10000)

    assert isinstance(new_peak_list, list)
    assert isinstance(new_peak_list[0], Peak)

    assert len(new_peak_list) == 215
    assert len(new_peak_list) <= len(peak_list)
    assert len(new_peak_list) <= len(pl)

    # --- With window_analyzer ---

    # estimate noise level from the TIC, used later to
    # discern true signal peaks
    noise_level = window_analyzer(tic)

    # trim by relative intensity
    apl = rel_threshold(peak_list, 1)

    # trim by number of ions above threshold.
    # The result gets its own name: rebinding ``peak_list`` here would turn
    # the size check against the fixture into a comparison of the result
    # with itself, which can never fail.
    noise_filtered = num_ions_threshold(apl, 3, noise_level)

    assert isinstance(noise_filtered, list)
    assert isinstance(noise_filtered[0], Peak)

    assert len(noise_filtered) in (87, 88)
    assert len(noise_filtered) <= len(peak_list)
    assert len(noise_filtered) <= len(apl)
def matrix_from_cdf(cdffile, name):
    """
    Read a file with ``ANDI_reader`` and return its intensity matrix
    together with a noise level estimated from the TIC.

    ``name`` is only printed; the reader's ``info()`` summary is printed too.

    :param cdffile: path handed to ``ANDI_reader``
    :param name: label printed before the data summary
    :return: tuple ``(intensity_matrix, noise_lvl)``
    """
    raw = ANDI_reader(cdffile)

    print(name)
    raw.info()

    # The noise level comes from window_analyzer applied to the TIC
    noise_lvl = window_analyzer(raw.get_tic())

    intensity_matrix = build_intensity_matrix(raw)
    return intensity_matrix, noise_lvl
def call_peaks(im, tic, smooth, args):
    """
    Detect peaks in an intensity matrix, filter them and attach areas.

    Steps, in order:

    1. If ``smooth`` is true, crop ``im`` to ``args.lowmass``..``args.highmass``
       and replace every ion chromatogram with a Savitzky-Golay smoothed,
       tophat baseline-corrected version (``im`` is modified in place).
    2. Estimate a noise level from the smoothed, baseline-corrected TIC.
       In this variant the value is only printed.
    3. Run ``BillerBiemann`` peak detection, then ``rel_threshold`` and
       ``num_ions_threshold`` with a fixed cutoff of 100000.
    4. Set the summed area and the top-ion areas on every remaining peak.

    The progress messages are written so that they produce the same text on
    Python 2 and Python 3.

    :param im: intensity matrix to process
    :param tic: total ion chromatogram used for the noise estimate
    :param smooth: whether to crop and smooth ``im`` before peak detection
    :param args: options object; reads ``lowmass``, ``highmass``, ``window``,
        ``scans``, ``minintensity``, ``minions`` and ``topions``
    :return: the filtered list of peaks
    """
    print("calling peaks")

    if smooth:
        print("Smoothing IM first...")
        im.crop_mass(args.lowmass, args.highmass)
        print("cropped masses...")

        # get the size of the intensity matrix (the scan count is not needed)
        _, n_mz = im.get_size()
        print("# masses in intensity matrix:  %s" % (n_mz,))

        # smooth and baseline-correct each ion chromatogram
        for ii in range(n_mz):
            ic = im.get_ic_at_index(ii)
            # JT: polynomial degree changed to 4 from 2
            ic_smooth = savitzky_golay(ic, window=args.window, degree=4)
            ic_base = tophat(ic_smooth, struct="1.5m")
            im.set_ic_at_index(ii, ic_base)
        print("smoothed IM...")

    # noise level calc
    # NOTE(review): the estimate is made whether or not ``smooth`` is set;
    # confirm that matches the intended flow.
    tic1 = savitzky_golay(tic)
    tic2 = tophat(tic1, struct="1.5m")  # JT: How does struct size work?
    noise_level = window_analyzer(tic2)
    print("Noise level in TIC:  %s" % (noise_level,))

    # get the list of Peak objects using BB peak detection / deconv
    pl = BillerBiemann(im, args.window, args.scans)
    print("Initial number of Peaks found: %s" % (len(pl),))

    # filter down the peaks.
    # - First: remove any masses from each peak that have intensity less
    #   than r percent of the max intensity in that peak
    # - Second: remove any peak where there are less than n ions with
    #   intensity above the cutoff
    pl2 = rel_threshold(pl, percent=args.minintensity)

    # JT: the estimated noise cutoff varied a lot between runs, so a fixed
    # 10^5 is used instead; it was chosen by looking at chromatograms to find
    # the baseline noise level (100000 for pegBT, 200 for peg3).
    # minions maybe 3 instead of 4?
    pl3 = num_ions_threshold(pl2, n=args.minions, cutoff=100000)
    print("Peaks remaining after filtering: %s" % (len(pl3),))

    # Nulling of masses 73, 207 (column bleed) and 84 (solvent tailing) is
    # disabled in this variant.
    for peak in pl3:
        # get the TIC area for this peak
        area = peak_sum_area(im, peak)
        peak.set_area(area)

        # get top n ion areas for this peak
        area_dict = peak_top_ion_areas(im, peak, args.topions)
        peak.set_ion_areas(area_dict)

    return pl3
def call_peaks(im, tic, smooth, args):
    """
    Detect peaks in an intensity matrix, filter them and attach areas.

    Steps, in order:

    1. If ``smooth`` is true, crop ``im`` to ``args.lowmass``..``args.highmass``
       and replace every ion chromatogram with a Savitzky-Golay smoothed,
       tophat baseline-corrected version (``im`` is modified in place).
    2. Estimate a noise level from the smoothed, baseline-corrected TIC.
    3. Run ``BillerBiemann`` peak detection, then ``rel_threshold`` and
       ``num_ions_threshold`` with a cutoff of
       ``noise_level * args.noisemult``.
    4. Null masses 207 and 84 on every remaining peak, then set its summed
       area and top-ion areas.

    The noise level is always computed before it is used for the cutoff, so
    the function does not depend on ``smooth`` being true. The progress
    messages are written so that they produce the same text on Python 2 and
    Python 3.

    :param im: intensity matrix to process
    :param tic: total ion chromatogram used for the noise estimate
    :param smooth: whether to crop and smooth ``im`` before peak detection
    :param args: options object; reads ``lowmass``, ``highmass``, ``window``,
        ``scans``, ``minintensity``, ``minions``, ``noisemult`` and ``topions``
    :return: the filtered list of peaks
    """
    print("calling peaks")

    if smooth:
        print("Smoothing IM first...")
        im.crop_mass(args.lowmass, args.highmass)
        print("cropped masses...")

        # get the size of the intensity matrix (the scan count is not needed)
        _, n_mz = im.get_size()
        print("# masses in intensity matrix:  %s" % (n_mz,))

        # smooth and baseline-correct each ion chromatogram
        for ii in range(n_mz):
            ic = im.get_ic_at_index(ii)
            ic_smooth = savitzky_golay(ic, window=args.window, degree=2)
            ic_base = tophat(ic_smooth, struct="1.5m")
            im.set_ic_at_index(ii, ic_base)
        print("smoothed IM...")

    # noise level calc -- needed below for the ion-count cutoff, so it must
    # be bound on every path through the function
    tic1 = savitzky_golay(tic)
    tic2 = tophat(tic1, struct="1.5m")
    noise_level = window_analyzer(tic2)
    print("Noise level in TIC:  %s" % (noise_level,))

    # get the list of Peak objects using BB peak detection / deconv
    pl = BillerBiemann(im, args.window, args.scans)
    print("Initial number of Peaks found: %s" % (len(pl),))

    # filter down the peaks.
    # - First: remove any masses from each peak that have intensity less
    #   than r percent of the max intensity in that peak
    # - Second: remove any peak where there are less than n ions with
    #   intensity above the cutoff
    pl2 = rel_threshold(pl, percent=args.minintensity)
    pl3 = num_ions_threshold(pl2, n=args.minions,
                             cutoff=noise_level * args.noisemult)
    print("Peaks remaining after filtering: %s" % (len(pl3),))

    for peak in pl3:
        # nulling of mass 73 is disabled
        peak.null_mass(207)  # column bleed
        peak.null_mass(84)   # solvent tailing

        # get the TIC area for this peak
        area = peak_sum_area(im, peak)
        peak.set_area(area)

        # get top n ion areas for this peak
        area_dict = peak_top_ion_areas(im, peak, args.topions)
        peak.set_ion_areas(area_dict)

    return pl3
def matrix_from_cdf(cdffile, name):
    '''
    Reads a .cdf file and produces an intensity matrix plus a noise level.

    The noise level is obtained by taking the TIC of the data and passing it
    to window_analyzer. The name, the reader's info() summary and the noise
    level are printed along the way.

    @param cdffile: Absolute path to a .cdf file to be processed
    @param name: file name associated with the .cdf file (only printed)
    @return: An intensity matrix and a corresponding noise level value
    '''
    reader = ANDI_reader(cdffile)

    print(name)
    reader.info()

    # noise approximation from the total ion chromatogram
    noise_lvl = window_analyzer(reader.get_tic())
    print('nz=', noise_lvl)

    matrix = build_intensity_matrix(reader)
    return matrix, noise_lvl
def test_window_anlyzer(tic):
    """
    Check ``window_analyzer`` on the TIC fixture: the seeded result, the
    return type for the accepted seed types, and ``TypeError`` for
    unsupported argument types.
    """
    # Seeded run: the value is pinned and must be a float
    noise_estimate = window_analyzer(tic, rand_seed=test_int)
    assert noise_estimate == 22524.833209785025
    assert isinstance(noise_estimate, float)

    # Unseeded, string-seeded and float-seeded runs all return a float
    assert isinstance(window_analyzer(tic), float)
    assert isinstance(window_analyzer(tic, rand_seed=test_string), float)
    assert isinstance(window_analyzer(tic, rand_seed=test_float), float)

    # Anything that is not a chromatogram is rejected as the first argument
    not_a_chromatogram = [test_string, *test_numbers, *test_lists, test_dict]
    for obj in not_a_chromatogram:
        with pytest.raises(TypeError):
            window_analyzer(obj)  # type: ignore

    # Keyword arguments and the values each of them must reject,
    # checked in the same order as listed
    rejected_keyword_values = (
        ("rand_seed", [*test_lists, test_dict]),
        ("window", [test_float, *test_lists, test_dict]),
        ("n_windows", [test_string, test_float, *test_lists, test_dict]),
    )
    for keyword, bad_values in rejected_keyword_values:
        for obj in bad_values:
            with pytest.raises(TypeError):
                window_analyzer(tic, **{keyword: obj})  # type: ignore
def run(self):
    """
    Load the sample's data file, detect and filter its peaks and build the
    Experiment for it.

    The reader is chosen from ``self.filetype``; for an unrecognised format
    the method returns early without doing anything further. On success it
    sets ``self.expr``, ``self.tic`` and ``self.filtered_peak_list`` and
    stamps "Date Created" / "Date Modified" in ``self.properties``.
    ``self.update_pbar()`` is called after each processing stage.
    """
    print("Quantitative Processing in Progress...")

    # TODO: Include data etc. in experiment file

    self.update_pbar()

    # Pick the reader that matches the file format
    filetype = self.filetype
    if filetype == ID_Format_jcamp:
        # Load data using JCAMP_reader
        from pyms.GCMS.IO.JCAMP import JCAMP_reader
        data = JCAMP_reader(self.properties["Original Filename"])

    elif filetype == ID_Format_mzML:
        # Load data using MZML_reader
        from pyms.GCMS.IO.MZML import MZML_reader
        data = MZML_reader(self.properties["Original Filename"])

    elif filetype == ID_Format_ANDI:
        # Load data using ANDI_reader
        from pyms.GCMS.IO.ANDI import ANDI_reader
        data = ANDI_reader(self.properties["Original Filename"])

    else:
        # Unknown Format
        return
    # TODO: Waters RAW, Thermo RAW, Agilent .d

    self.update_pbar()

    method = Method.Method(self.properties["Method"])

    self.update_pbar()

    # list of all retention times, in seconds
    times = data.get_time_list()
    # get Total Ion Chromatogram
    tic = data.get_tic()
    # RT Range, time step, no. scans, min, max, mean and median m/z
    data.info()

    # Build "intensity matrix" by binning data with integer bins and a
    # window of -0.3 to +0.7, the same as NIST uses
    im = build_intensity_matrix_i(data)

    self.update_pbar()

    # Show the m/z of the maximum and minimum bins
    print(" Minimum m/z bin: {}".format(im.get_min_mass()))
    print(" Maximum m/z bin: {}".format(im.get_max_mass()))

    # Crop masses, clamping the requested range to what the matrix holds
    min_mass, max_mass, *_ = method.mass_range
    min_mass = max(min_mass, im.get_min_mass())
    max_mass = min(max_mass, im.get_max_mass())
    im.crop_mass(min_mass, max_mass)

    self.update_pbar()

    # Perform Data filtering
    n_scan, n_mz = im.get_size()

    # Iterate over each IC in the intensity matrix
    for ic_index in range(n_mz):
        chromatogram = im.get_ic_at_index(ic_index)

        if method.enable_sav_gol:
            # Perform Savitzky-Golay smoothing.
            # Note that Turbomass does not use smoothing for qualitative method.
            chromatogram = savitzky_golay(chromatogram)

        if method.enable_tophat:
            # Perform Tophat baseline correction
            # Top-hat baseline Correction seems to bring down noise,
            # retaining shapes, but keeps points on actual peaks
            chromatogram = tophat(chromatogram, struct=method.tophat_struct)

        # Set the IC in the intensity matrix to the filtered one
        im.set_ic_at_index(ic_index, chromatogram)

    # NOTE(review): progress is advanced once per stage here; confirm it was
    # not meant to tick once per ion chromatogram / per peak.
    self.update_pbar()

    # Peak Detection based on Biller and Biemann (1974), with a window
    # of <points>, and combining <scans> if they apex next to each other
    peak_list = BillerBiemann(
        im,
        points=method.bb_points,
        scans=method.bb_scans,
    )

    self.update_pbar()

    print(" Number of peaks identified before filtering: {}".format(
        len(peak_list)))

    if method.enable_noise_filter:
        # Filtering peak lists with automatic noise filtering
        noise_level = window_analyzer(tic)
        # should we also do rel_threshold() here?
        # https://pymassspec.readthedocs.io/en/master/pyms/BillerBiemann.html#pyms.BillerBiemann.rel_threshold
        peak_list = num_ions_threshold(
            peak_list, method.noise_thresh, noise_level)

    self.update_pbar()

    filtered_peak_list = []

    for peak in peak_list:
        # Get mass and intensity lists for the mass spectrum at the apex of the peak
        apex_mass_list = peak.mass_spectrum.mass_list
        apex_mass_spec = peak.mass_spectrum.mass_spec

        # Determine the intensity of the base peak in the mass spectrum
        base_peak_intensity = max(apex_mass_spec)

        # Determine the index of the base peak in the mass spectrum
        # (first position holding the maximum intensity)
        positions_of_maximum = [
            position
            for position, intensity in enumerate(apex_mass_spec)
            if intensity == base_peak_intensity
        ]
        base_peak_index = positions_of_maximum[0]

        # Finally, determine the mass of the base peak
        base_peak_mass = apex_mass_list[base_peak_index]

        # skip the peak if the base peak is at e.g. m/z 73, i.e. septum bleed
        if base_peak_mass in method.base_peak_filter:
            continue

        peak.set_area(peak_sum_area(im, peak))
        filtered_peak_list.append(peak)

    self.update_pbar()

    print(" Number of peaks identified: {}".format(
        len(filtered_peak_list)))

    # Create an experiment, restricted to the method's target RT range (minutes)
    self.expr = Experiment(self.sample_name, filtered_peak_list)
    rt_lower = "{}m".format(method.target_range[0])
    rt_upper = "{}m".format(method.target_range[1])
    self.expr.sele_rt_range([rt_lower, rt_upper])

    self.update_pbar()

    current_time = time_now()

    # The date and time the experiment was created
    self.properties["Date Created"] = current_time

    # The date and time the experiment was last modified
    self.properties["Date Modified"] = current_time

    # Push the progress bar to its end, if there is one
    if self.pbar:
        self.pbar.Update(self.pbar.Range)

    self.tic = tic
    self.filtered_peak_list = filtered_peak_list
# Build the intensity matrix from the previously loaded ``data``
im = build_intensity_matrix(data)

n_scan, n_mz = im.size

# Smooth and baseline-correct every ion chromatogram, writing the result
# back into the intensity matrix
for ii in range(n_mz):
    ic = im.get_ic_at_index(ii)
    ic_smooth = savitzky_golay(ic)
    ic_bc = tophat(ic_smooth, struct="1.5m")
    im.set_ic_at_index(ii, ic_bc)

# Biller-Biemann peak detection on the processed matrix
peak_list = BillerBiemann(im, points=9, scans=2)

from pyms.Noise.Analysis import window_analyzer

# Noise level estimated from the TIC; used as the intensity cutoff below
tic = data.tic
noise_level = window_analyzer(tic)

from pyms.BillerBiemann import num_ions_threshold

# Filter the peaks with n=3 and the noise level as cutoff, then show the
# first ten of them
filtered_peak_list = num_ions_threshold(peak_list, n=3, cutoff=noise_level)
print(filtered_peak_list[:10])

# Given a list of peaks, areas can be determined and added as follows:

# In[ ]:

from pyms.Peak.Function import peak_sum_area

# NOTE(review): this iterates the unfiltered ``peak_list``, not
# ``filtered_peak_list`` -- confirm that is intended.
for peak in peak_list:
    area = peak_sum_area(im, peak)
    peak.area = area
def import_processing(jcamp_file, spectrum_csv_file, report_csv_file, combined_csv_file, bb_points = 9, bb_scans = 2, noise_thresh = 2, target_range = (0,120), tophat_struct="1.5m", nistpath = "../MSSEARCH", base_peak_filter = ['73'], ExprDir = "."):
    """
    Process one JCAMP-DX file: write its binned spectra, detect and filter
    peaks, search the largest peaks against NIST and store the results.

    Outputs written:

    * ``spectrum_csv_file`` -- one row of binned intensities per scan
    * ``report_csv_file`` -- one row per reported peak, followed by its hits
    * ``combined_csv_file`` -- per-peak header rows plus five hit rows each
    * ``<ExprDir>/<sample>_tic.dat``, ``<sample>_peaks.dat`` and
      ``<sample>.expr``

    All three CSV files are opened with ``with`` so they are closed even if
    processing fails part-way through.

    :param jcamp_file: path of the JCAMP-DX file to read
    :param spectrum_csv_file: output path for the binned mass spectra
    :param report_csv_file: output path for the peak report
    :param combined_csv_file: output path for the combined report
    :param bb_points: ``points`` argument passed to ``BillerBiemann``
    :param bb_scans: ``scans`` argument passed to ``BillerBiemann``
    :param noise_thresh: ion-count argument passed to ``num_ions_threshold``
    :param target_range: (lower, upper) retention time bounds in minutes;
        peaks outside ``lower < RT <= upper`` are left out of the reports
    :param tophat_struct: ``struct`` argument passed to ``tophat``
    :param nistpath: stored in the module-level ``nist_path``
    :param base_peak_filter: base-peak m/z values to discard; each entry is
        converted with ``int()`` (the default list is never mutated)
    :param ExprDir: directory for the TIC, peak list and experiment files
    :return: always 0
    """
    global nist_path
    nist_path = nistpath

    # Parameters
    base_peak_filter = [int(x) for x in base_peak_filter]
    target_range = tuple(target_range)
    sample_name = os.path.splitext(os.path.basename(jcamp_file))[0]
    number_of_peaks = 80

    data = JCAMP_reader(jcamp_file)

    # list of all retention times, in seconds
    times = data.get_time_list()
    # get Total Ion Chromatogram
    tic = data.get_tic()
    # RT Range, time step, no. scans, min, max, mean and median m/z
    data.info()

    # Mass Binning: convert to intensity matrix
    im = build_intensity_matrix_i(data)

    print(" Minimum m/z bin: {}".format(im.get_min_mass()))
    print(" Maximum m/z bin: {}".format(im.get_max_mass()))

    # Write Binned Mass Spectra to OpenChrom-like CSV file
    ms = im.get_ms_at_index(0)  # first mass spectrum, used for the header
    with open(spectrum_csv_file, 'w') as spectrum_csv:
        spectrum_csv.write('RT(milliseconds);RT(minutes) - NOT USED BY IMPORT;RI;')
        spectrum_csv.write(';'.join(str(mz) for mz in ms.mass_list))
        spectrum_csv.write("\n")

        for scan, scan_time in enumerate(times):
            spectrum_csv.write("{};{};{};".format(
                int(scan_time*1000),
                rounders((scan_time/60), "0.0000000000"),
                0,
            ))
            ms = im.get_ms_at_index(scan)
            spectrum_csv.write(';'.join(str(intensity) for intensity in ms.mass_spec))
            spectrum_csv.write('\n')

    ## Data filtering

    # Note that Turbomass does not use smoothing for qualitative method.
    # Top-hat baseline Correction seems to bring down noise,
    # retaining shapes, but keeps points on actual peaks
    n_scan, n_mz = im.get_size()
    for ii in range(n_mz):
        ic = im.get_ic_at_index(ii)
        ic_smooth = savitzky_golay(ic)
        ic_bc = tophat(ic_smooth, struct=tophat_struct)
        im.set_ic_at_index(ii, ic_bc)

    # Peak Detection based on Biller and Biemann, 1974, with a window
    # of n points, and combining y scans if they apex next to each other
    peak_list = BillerBiemann(im, points=bb_points, scans=bb_scans)

    print(" Number of peaks identified before filtering: {}".format(len(peak_list)))

    # Filtering peak lists with automatic noise filtering
    noise_level = window_analyzer(tic)
    # why use 2 for number of ions above threshold?
    peak_list = num_ions_threshold(peak_list, noise_thresh, noise_level)
    print(" Number of peaks identified: {}".format(len(peak_list)))

    # Peak Areas
    peak_area_list = []
    filtered_peak_list = []

    for peak in peak_list:
        apex_mass_list = peak.get_mass_spectrum().mass_list
        apex_mass_spec = peak.get_mass_spectrum().mass_spec
        base_peak_intensity = max(apex_mass_spec)
        base_peak_index = [
            index for index, intensity in enumerate(apex_mass_spec)
            if intensity == base_peak_intensity
        ][0]
        base_peak_mass = apex_mass_list[base_peak_index]

        # skip the peak if the base peak is at e.g. m/z 73, i.e. septum bleed
        if base_peak_mass in base_peak_filter:
            continue

        area = peak_sum_area(im, peak)
        peak.set_area(area)
        peak_area_list.append(area)
        filtered_peak_list.append(peak)

    # Save the TIC and Peak List
    tic.write(os.path.join(ExprDir, "{}_tic.dat".format(sample_name)), formatting=False)
    store_peaks(filtered_peak_list, os.path.join(ExprDir, "{}_peaks.dat".format(sample_name)))

    # Indices into filtered_peak_list, ordered by ascending peak area
    # from https://stackoverflow.com/questions/16878715/how-to-find-the-index-of-n-largest-elements-in-a-list-or-np-array-python?lq=1
    top_peaks = sorted(range(len(peak_area_list)), key=lambda x: peak_area_list[x])

    # report_csv: turbomass-like CSV file
    # combine_csv: GunShotMatch Combine-like CSV file
    with open(report_csv_file, "w") as report_csv, \
            open(combined_csv_file, "w") as combine_csv:

        combine_csv.write(sample_name)
        combine_csv.write("\n")

        report_csv.write("#;RT;Scan;Height;Area\n")
        combine_csv.write("Retention Time;Peak Area;;Lib;Match;R Match;Name;CAS Number;Scan\n")

        report_buffer = []

        # peak_number is the 1-based rank in ascending area order
        for peak_number, index in enumerate(top_peaks, start=1):
            peak = filtered_peak_list[index]
            scan_index = data.get_index_at_time(peak.get_rt())

            # Retention time (minutes, 3dp)
            RT = rounders(peak.get_rt()/60, "0.000")

            # skip the peak if it is outside the desired range
            if not target_range[0] < RT <= target_range[1]:
                continue

            # scan number, not that we really need it as the peak object has the spectrum
            Scan = scan_index + 1

            # TIC intensity, as proxy for Peak height, which should be from baseline
            Height = '{:,}'.format(rounders(tic.get_intensity_at_index(scan_index), "0"))

            # Peak area, originally in "intensity seconds", so dividing by 60 to
            # get "intensity minutes" like turbomass uses
            Area = '{:,}'.format(rounders(peak.get_area()/60, "0.0"))

            report_buffer.append([peak_number, RT, Scan, Height, Area])

        # Largest areas first, keep the top peaks, then order them by scan
        report_buffer = report_buffer[::-1]
        filtered_report_buffer = report_buffer[:number_of_peaks]
        filtered_report_buffer.sort(key=operator.itemgetter(2))

        for page, row in enumerate(filtered_report_buffer, start=1):
            # NOTE(review): no separator is written between the last column
            # of the row and the first hit -- confirm that is intended.
            report_csv.write(";".join([str(i) for i in row]))

            ms = im.get_ms_at_index(row[2]-1)

            spectrum_name = "{}_{}".format(sample_name, row[1])
            create_msp(spectrum_name, ms.mass_list, ms.mass_spec)
            matches_dict = nist_ms_comparison(spectrum_name, ms.mass_list, ms.mass_spec)

            combine_csv.write("{};{};Page {} of 80;;;;;;{}\n".format(row[1], row[4], page, row[2]))

            for hit in range(1, 6):
                match = matches_dict["Hit{}".format(hit)]
                report_csv.write(str(match))
                report_csv.write(";")
                combine_csv.write(";;{};{};{};{};{};{};\n".format(
                    hit,
                    match["Lib"],
                    match["MF"],
                    match["RMF"],
                    match["Name"],
                    match["CAS"],
                ))

            report_csv.write("\n")

            time.sleep(2)

    # Create an experiment
    expr = Experiment(sample_name, filtered_peak_list)
    expr.sele_rt_range(["{}m".format(target_range[0]), "{}m".format(target_range[1])])
    store_expr(os.path.join(ExprDir, "{}.expr".format(sample_name)), expr)

    return 0
def quantitative_processing(self, jcamp_file, log_stdout=True):
    """
    Import a JCAMP-DX file, detect and filter its peaks, and store the TIC,
    the peak list and an Experiment in ``self.config.expr_dir``.

    When ``log_stdout`` is true, ``sys.stdout`` is replaced by a log file in
    ``self.config.log_dir`` named after the sample; it is not restored by
    this method.

    :param jcamp_file: path of the JCAMP-DX file to process
    :type jcamp_file: str
    :param log_stdout: whether to redirect stdout to ``<sample name>.log``
    :type log_stdout: bool

    :return: nothing
    :rtype: None
    """
    config = self.config

    # Determine the name of the sample from the filename
    sample_name = os.path.splitext(os.path.basename(jcamp_file))[0]

    # Log Stdout to File
    if log_stdout:
        log_path = os.path.join(config.log_dir, sample_name + ".log")
        sys.stdout = open(log_path, "w")

    # Load data using JCAMP_reader
    data = JCAMP_reader(jcamp_file)

    # list of all retention times, in seconds
    times = data.get_time_list()
    # get Total Ion Chromatogram
    tic = data.get_tic()
    # RT Range, time step, no. scans, min, max, mean and median m/z
    data.info()

    # Build "intensity matrix" by binning data with integer bins and a
    # window of -0.3 to +0.7, the same as NIST uses
    im = build_intensity_matrix_i(data)

    # Show the m/z of the maximum and minimum bins
    print(" Minimum m/z bin: {}".format(im.get_min_mass()))
    print(" Maximum m/z bin: {}".format(im.get_max_mass()))

    # Crop masses, clamping the configured range to what the matrix holds
    min_mass, max_mass, *_ = config.mass_range
    min_mass = max(min_mass, im.get_min_mass())
    max_mass = min(max_mass, im.get_max_mass())
    im.crop_mass(min_mass, max_mass)

    # Perform Data filtering
    n_scan, n_mz = im.get_size()

    # Iterate over each IC in the intensity matrix
    for ic_index in range(n_mz):
        # Perform Savitzky-Golay smoothing.
        # Note that Turbomass does not use smoothing for qualitative method.
        smoothed = savitzky_golay(im.get_ic_at_index(ic_index))

        # Perform Tophat baseline correction
        # Top-hat baseline Correction seems to bring down noise,
        # retaining shapes, but keeps points on actual peaks
        corrected = tophat(smoothed, struct=config.tophat_struct)

        # Set the IC in the intensity matrix to the filtered one
        im.set_ic_at_index(ic_index, corrected)

    # Peak Detection based on Biller and Biemann (1974), with a window
    # of <points>, and combining <scans> if they apex next to each other
    peak_list = BillerBiemann(
        im,
        points=config.bb_points,
        scans=config.bb_scans,
    )

    print(" Number of peaks identified before filtering: {}".format(len(peak_list)))

    # Filtering peak lists with automatic noise filtering
    noise_level = window_analyzer(tic)
    # should we also do rel_threshold() here?
    # https://pymassspec.readthedocs.io/en/master/pyms/BillerBiemann.html#pyms.BillerBiemann.rel_threshold
    peak_list = num_ions_threshold(peak_list, config.noise_thresh, noise_level)

    filtered_peak_list = []

    for peak in peak_list:
        # Get mass and intensity lists for the mass spectrum at the apex of the peak
        apex_mass_list = peak.mass_spectrum.mass_list
        apex_mass_spec = peak.mass_spectrum.mass_spec

        # Determine the intensity of the base peak in the mass spectrum
        base_peak_intensity = max(apex_mass_spec)

        # Determine the index of the base peak in the mass spectrum
        # (first position holding the maximum intensity)
        positions_of_maximum = [
            position
            for position, intensity in enumerate(apex_mass_spec)
            if intensity == base_peak_intensity
        ]
        base_peak_index = positions_of_maximum[0]

        # Finally, determine the mass of the base peak
        base_peak_mass = apex_mass_list[base_peak_index]

        # skip the peak if the base peak is at e.g. m/z 73, i.e. septum bleed
        if base_peak_mass in config.base_peak_filter:
            continue

        peak.set_area(peak_sum_area(im, peak))
        filtered_peak_list.append(peak)

    print(" Number of peaks identified: {}".format(len(filtered_peak_list)))

    # Save the TIC and Peak List
    tic_path = os.path.join(config.expr_dir, "{}_tic.dat".format(sample_name))
    tic.write(tic_path, formatting=False)

    peaks_path = os.path.join(config.expr_dir, "{}_peaks.dat".format(sample_name))
    store_peaks(filtered_peak_list, peaks_path)

    # Create an experiment, restricted to the configured RT range (minutes)
    expr = Experiment(sample_name, filtered_peak_list)
    rt_lower = "{}m".format(config.target_range[0])
    rt_upper = "{}m".format(config.target_range[1])
    expr.sele_rt_range([rt_lower, rt_upper])

    expr_path = os.path.join(config.expr_dir, "{}.expr".format(sample_name))
    store_expr(expr_path, expr)
import sys

# 'pk_points' is the estimated number of points across signal peak
pk_points = 5
pk_scans = 2
n = 3
r = 1

#andi_file = "/x/PyMS/data/0605_549.CDF"
andi_file = "/x/PyMS/data/a0806_077.cdf"

# read raw data
data = ANDI_reader(andi_file)

# estimate noise level from the TIC, used later to
# discern true signal peaks
tic = data.get_tic()
noise_level = window_analyzer(tic)

# Progress message without a newline; written and flushed explicitly so it
# is visible while the matrix is being built and so the same text is
# produced on Python 2 and Python 3.
sys.stdout.write(" Building intensity matrix ... ")
sys.stdout.flush()
# build integer intensity matrix
im = build_intensity_matrix_i(data)
print(" done.")

# crop mass range to 50-540
im.crop_mass(50,540)

# ignore TMS ions 73 and 147
im.null_mass(73)
im.null_mass(147)

# get the size of the intensity matrix
n_scan, n_mz = im.get_size()