def test_BillerBiemann(self, im_i):
    im_i = copy.deepcopy(im_i)

    # Intensity matrix size (scans, masses)
    n_scan, n_mz = im_i.size

    # noise filter and baseline correct
    for ii in range(n_mz):
        ic = im_i.get_ic_at_index(ii)
        ic_smooth = savitzky_golay(ic)
        ic_bc = tophat(ic_smooth, struct="1.5m")
        im_i.set_ic_at_index(ii, ic_bc)

    # Use the Biller and Biemann technique to find apexing ions at a scan.
    # The default is maxima over three scans, without combining neighbouring scans.
    peak_list = BillerBiemann(im_i)
    assert isinstance(peak_list, list)
    assert len(peak_list) == 2101

    for peak in peak_list:
        assert isinstance(peak, Peak)

    # Find apex over 9 points and combine with the neighbouring peak
    # if two scans apex next to each other.
    peak_list2 = BillerBiemann(im_i, points=9, scans=2)
    assert len(peak_list2) == 805
    assert len(peak_list2) <= len(peak_list)
def test_smooth_im(data):
    # Build intensity matrix with defaults: float masses with an interval
    # (bin size) of one from the minimum mass
    im = build_intensity_matrix_i(data)

    n_scan, n_mz = im.size

    # process data
    for ii in range(n_mz):
        ic = im.get_ic_at_index(ii)
        assert isinstance(ic, IonChromatogram)

        ic_smooth = savitzky_golay(ic)
        assert isinstance(ic_smooth, IonChromatogram)

        ic_bc = tophat(ic_smooth, struct="1.5m")
        assert isinstance(ic_bc, IonChromatogram)

        im.set_ic_at_index(ii, ic_bc)
        assert im.get_ic_at_index(ii) == ic_bc
def expr_list(pyms_datadir):
    with tempfile.TemporaryDirectory() as tmpdir:
        outputdir = pathlib.Path(tmpdir)

        # Create experiment files
        for jcamp_file in eley_codes:
            im = build_intensity_matrix_i(JCAMP_reader(pyms_datadir / f"{jcamp_file}.JDX"))

            # Intensity matrix size (scans, masses)
            n_scan, n_mz = im.size

            # noise filter and baseline correct
            for ii in range(n_mz):
                ic = im.get_ic_at_index(ii)
                ic_smooth = savitzky_golay(ic)
                ic_bc = tophat(ic_smooth, struct="1.5m")
                im.set_ic_at_index(ii, ic_bc)

            peak_list = BillerBiemann(im, points=9, scans=2)
            apl = rel_threshold(peak_list, 2)
            new_peak_list = num_ions_threshold(apl, 3, 3000)

            # ignore TMS ions and set the mass range
            for peak in new_peak_list:
                peak.crop_mass(50, 400)
                peak.null_mass(73)
                peak.null_mass(147)

                # find the peak area
                area = peak_sum_area(im, peak)
                peak.area = area
                area_dict = peak_top_ion_areas(im, peak)
                peak.ion_areas = area_dict

            expr = Experiment(jcamp_file, new_peak_list)

            # set the time range for all experiments
            expr.sele_rt_range(["6.5m", "21m"])

            expr.dump(outputdir / f"{jcamp_file}.expr")

        # Load the experiments
        expr_list = []
        for expr_code in eley_codes:
            expr = load_expr(outputdir / f"{expr_code}.expr")
            assert isinstance(expr, Experiment)
            expr_list.append(expr)

        yield expr_list
def _peak_list(im_i):
    im_i = deepcopy(im_i)

    # Intensity matrix size (scans, masses)
    n_scan, n_mz = im_i.size

    # noise filter and baseline correct
    for ii in range(n_mz):
        ic = im_i.get_ic_at_index(ii)
        ic_smooth = savitzky_golay(ic)
        ic_bc = tophat(ic_smooth, struct="1.5m")
        im_i.set_ic_at_index(ii, ic_bc)

    # Use the Biller and Biemann technique to find apexing ions,
    # with maxima over 9 points, combining two scans if they apex
    # next to each other.
    peak_list = BillerBiemann(im_i, points=9, scans=2)
    return peak_list
def Preprocess_IntensityMatrixes(matrixes):
    # Noise removal and baseline correction of intensity matrices.
    # Input: list of matrices; output: list of corrected matrices.
    count = 1
    for im in matrixes:
        n_s, n_mz = im.get_size()
        for ii in range(n_mz):
            print("Working on IC#", ii + 1, " Unit", count)
            ic = im.get_ic_at_index(ii)
            ic_smooth = savitzky_golay(ic)
            ic_bc = tophat(ic_smooth, struct='1.5m')
            im.set_ic_at_index(ii, ic_bc)
        count += 1

    return matrixes  # save to file
def Preprocess_IntensityMatrices(matrices):
    """
    Baseline correction and smoothing of intensity matrices.

    Input: list of matrices; output: list of corrected/"cleansed" matrices.

    @param matrices: List of matrices generated by the matrix_from_cdf method
    @return: List of matrices that have been baseline corrected & smoothed for peak detection
    """
    for im in matrices:
        n_s, n_mz = im.get_size()
        for ii in range(n_mz):
            ic = im.get_ic_at_index(ii)
            ic_smooth = savitzky_golay(ic)
            ic_bc = tophat(ic_smooth, struct='1.5m')
            im.set_ic_at_index(ii, ic_bc)

    return matrices  # save to file
def Preprocess_IntensityMatrixes(matrixes):
    '''
    Noise removal and baseline correction of intensity matrices.

    Input: list of matrices; output: list of corrected/"cleansed" matrices.

    @param matrixes: List of matrixes generated by the matrix_from_cdf method
    @return: List of matrixes that have been 'cleansed'
    '''
    count = 1
    for im in matrixes:
        n_s, n_mz = im.get_size()
        for ii in range(n_mz):
            # print("Working on IC#", ii + 1, " Unit", count)
            ic = im.get_ic_at_index(ii)
            ic_smooth = savitzky_golay(ic)
            ic_bc = tophat(ic_smooth, struct='1.5m')
            im.set_ic_at_index(ii, ic_bc)
        count += 1

    return matrixes  # save to file
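# A minimal usage sketch for the preprocessing helpers above, assuming the
# intensity matrices are built with build_intensity_matrix_i() from ANDI/CDF
# files; the file names here are hypothetical placeholders.

from pyms.GCMS.IO.ANDI import ANDI_reader
from pyms.IntensityMatrix import build_intensity_matrix_i

matrixes = [
    build_intensity_matrix_i(ANDI_reader(cdf_file))
    for cdf_file in ["sample_01.cdf", "sample_02.cdf"]  # hypothetical files
]
corrected = Preprocess_IntensityMatrixes(matrixes)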
def test_savitzky_golay(tic):
    assert isinstance(tic, IonChromatogram)

    # apply noise smoothing
    tic1 = savitzky_golay(tic)
    assert isinstance(tic1, IonChromatogram)
    assert tic1 != tic
    assert tic1.is_tic()
    assert len(tic1) == 2103
    assert len(tic) == len(tic1)  # Length should be unchanged
    assert tic1.get_intensity_at_index(test_int) == 421885.76190476184
    assert tic1.get_time_at_index(test_int) == 1304.15599823
    assert tic1.get_time_at_index(test_int) == tic.get_time_at_index(test_int)
    assert tic1.time_list[0] == 1.05200003833
    assert tic1.time_list[0] == tic.time_list[0]
    assert tic1.time_step == 1.0560000035830972
    assert tic1.time_step == tic.time_step
    assert tic1.get_index_at_time(12) == 10
    assert tic1.get_index_at_time(12) == tic.get_index_at_time(12)

    with pytest.warns(Warning):
        tic1.mass

    # Test Errors
    for obj in [test_string, *test_numbers, *test_lists, test_dict]:
        with pytest.raises(TypeError):
            savitzky_golay(obj)  # type: ignore

    for obj in [test_string, test_float, *test_lists, test_dict]:
        with pytest.raises(TypeError):
            savitzky_golay(tic, degree=obj)  # type: ignore

    for obj in [test_float, *test_lists, test_dict]:
        with pytest.raises(TypeError):
            savitzky_golay(tic, window=obj)  # type: ignore
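# A short sketch of calling savitzky_golay() with explicit parameters rather
# than the defaults exercised by the test above, given an IonChromatogram
# `tic`. The default values (window=7, degree=2) are an assumption based on
# current PyMassSpec, not something asserted by this test.

tic_default = savitzky_golay(tic, window=7, degree=2)  # same as the assumed defaults
tic_heavier = savitzky_golay(tic, window=15)           # wider window, stronger smoothing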
def run(self):
    print("Quantitative Processing in Progress...")

    # TODO: Include data etc. in experiment file

    self.update_pbar()

    if self.filetype == ID_Format_jcamp:
        # Load data using JCAMP_reader
        from pyms.GCMS.IO.JCAMP import JCAMP_reader
        data = JCAMP_reader(self.properties["Original Filename"])
    elif self.filetype == ID_Format_mzML:
        # Load data using MZML_reader
        from pyms.GCMS.IO.MZML import MZML_reader
        data = MZML_reader(self.properties["Original Filename"])
    elif self.filetype == ID_Format_ANDI:
        # Load data using ANDI_reader
        from pyms.GCMS.IO.ANDI import ANDI_reader
        data = ANDI_reader(self.properties["Original Filename"])
    else:
        # Unknown Format
        return

    # TODO: Waters RAW, Thermo RAW, Agilent .d

    self.update_pbar()

    method = Method.Method(self.properties["Method"])

    self.update_pbar()

    # list of all retention times, in seconds
    times = data.get_time_list()
    # get the Total Ion Chromatogram
    tic = data.get_tic()
    # RT Range, time step, no. scans, min, max, mean and median m/z
    data.info()

    # Build "intensity matrix" by binning data with integer bins and a
    # window of -0.3 to +0.7, the same as NIST uses
    im = build_intensity_matrix_i(data)

    self.update_pbar()

    # Show the m/z of the maximum and minimum bins
    print(" Minimum m/z bin: {}".format(im.get_min_mass()))
    print(" Maximum m/z bin: {}".format(im.get_max_mass()))

    # Crop masses
    min_mass, max_mass, *_ = method.mass_range

    if min_mass < im.get_min_mass():
        min_mass = im.get_min_mass()
    if max_mass > im.get_max_mass():
        max_mass = im.get_max_mass()

    im.crop_mass(min_mass, max_mass)

    self.update_pbar()

    # Perform Data filtering
    n_scan, n_mz = im.get_size()

    # Iterate over each IC in the intensity matrix
    for ii in range(n_mz):
        # print("\rWorking on IC#", ii+1, ' ', end='')
        ic = im.get_ic_at_index(ii)

        if method.enable_sav_gol:
            # Perform Savitzky-Golay smoothing.
            # Note that Turbomass does not use smoothing for the qualitative method.
            ic = savitzky_golay(ic)

        if method.enable_tophat:
            # Perform Tophat baseline correction.
            # Top-hat baseline correction seems to bring down noise,
            # retaining shapes, but keeps points on actual peaks.
            ic = tophat(ic, struct=method.tophat_struct)

        # Set the IC in the intensity matrix to the filtered one
        im.set_ic_at_index(ii, ic)

    self.update_pbar()

    # Peak Detection based on Biller and Biemann (1974), with a window
    # of <points>, combining <scans> if they apex next to each other
    peak_list = BillerBiemann(im, points=method.bb_points, scans=method.bb_scans)

    self.update_pbar()

    print(" Number of peaks identified before filtering: {}".format(len(peak_list)))

    if method.enable_noise_filter:
        # Filtering peak lists with automatic noise filtering
        noise_level = window_analyzer(tic)
        # should we also do rel_threshold() here?
        # https://pymassspec.readthedocs.io/en/master/pyms/BillerBiemann.html#pyms.BillerBiemann.rel_threshold
        peak_list = num_ions_threshold(peak_list, method.noise_thresh, noise_level)

    self.update_pbar()

    filtered_peak_list = []

    for peak in peak_list:
        # Get mass and intensity lists for the mass spectrum at the apex of the peak
        apex_mass_list = peak.mass_spectrum.mass_list
        apex_mass_spec = peak.mass_spectrum.mass_spec

        # Determine the intensity of the base peak in the mass spectrum
        base_peak_intensity = max(apex_mass_spec)

        # Determine the index of the base peak in the mass spectrum
        base_peak_index = [
            index for index, intensity in enumerate(apex_mass_spec)
            if intensity == base_peak_intensity
        ][0]

        # Finally, determine the mass of the base peak
        base_peak_mass = apex_mass_list[base_peak_index]

        # skip the peak if the base peak is at e.g. m/z 73, i.e. septum bleed
        if base_peak_mass in method.base_peak_filter:
            continue

        area = peak_sum_area(im, peak)
        peak.set_area(area)
        filtered_peak_list.append(peak)

    self.update_pbar()

    print(" Number of peaks identified: {}".format(len(filtered_peak_list)))

    # Create an experiment
    self.expr = Experiment(self.sample_name, filtered_peak_list)
    self.expr.sele_rt_range([
        "{}m".format(method.target_range[0]),
        "{}m".format(method.target_range[1]),
    ])

    self.update_pbar()

    current_time = time_now()

    # The date and time the experiment was created
    self.properties["Date Created"] = current_time

    # The date and time the experiment was last modified
    self.properties["Date Modified"] = current_time

    if self.pbar:
        self.pbar.Update(self.pbar.Range)

    self.tic = tic
    self.filtered_peak_list = filtered_peak_list
def test_align_2_alignments(A1, pyms_datadir, tmp_pathplus):
    expr_list = []

    for jcamp_file in geco_codes:
        im = build_intensity_matrix_i(JCAMP_reader(pyms_datadir / f"{jcamp_file}.JDX"))

        # Intensity matrix size (scans, masses)
        n_scan, n_mz = im.size

        # noise filter and baseline correct
        for ii in range(n_mz):
            ic = im.get_ic_at_index(ii)
            ic_smooth = savitzky_golay(ic)
            ic_bc = tophat(ic_smooth, struct="1.5m")
            im.set_ic_at_index(ii, ic_bc)

        peak_list = BillerBiemann(im, points=9, scans=2)
        apl = rel_threshold(peak_list, 2)
        new_peak_list = num_ions_threshold(apl, 3, 3000)

        # ignore TMS ions and set the mass range
        for peak in new_peak_list:
            peak.crop_mass(50, 400)
            peak.null_mass(73)
            peak.null_mass(147)

            # find the peak area
            area = peak_sum_area(im, peak)
            peak.area = area
            area_dict = peak_top_ion_areas(im, peak)
            peak.ion_areas = area_dict

        expr = Experiment(jcamp_file, new_peak_list)

        # set the time range for all experiments
        expr.sele_rt_range(["6.5m", "21m"])
        expr_list.append(expr)

    F2 = exprl2alignment(expr_list)
    T2 = PairwiseAlignment(F2, Dw, Gw)
    A2 = align_with_tree(T2, min_peaks=2)

    # top_ion_list = A2.common_ion()
    # A2.write_common_ion_csv(tmp_pathplus/'area1.csv', top_ion_list)

    # between-replicates alignment parameters
    Db = 10.0  # rt modulation
    Gb = 0.30  # gap penalty

    print("Aligning input {1,2}")
    T9 = PairwiseAlignment([A1, A2], Db, Gb)
    A9 = align_with_tree(T9)

    A9.write_csv(tmp_pathplus / "rt.csv", tmp_pathplus / "area.csv")

    aligned_peaks = list(filter(None, A9.aligned_peaks()))
    store_peaks(aligned_peaks, tmp_pathplus / "peaks.bin")

    top_ion_list = A9.common_ion()
    A9.write_common_ion_csv(tmp_pathplus / "area.csv", top_ion_list)
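# Note: the within-replicates alignment parameters Dw and Gw used above are
# assumed to be module-level constants. A minimal sketch, using values from
# the PyMassSpec alignment examples (an assumption, not taken from this test):

Dw = 2.5   # within-replicates rt modulation [s]
Gw = 0.30  # within-replicates gap penalty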
"""proc.py """ import sys sys.path.append("/x/PyMS/") from pyms.GCMS.IO.ANDI.Function import ANDI_reader from pyms.Noise.SavitzkyGolay import savitzky_golay from pyms.Baseline.TopHat import tophat # read the raw data andi_file = "/x/PyMS/data/gc01_0812_066.cdf" data = ANDI_reader(andi_file) # get the TIC tic = data.get_tic() # apply noise smoothing and baseline correction tic1 = savitzky_golay(tic) tic2 = tophat(tic1, struct="1.5m") # save smoothed/baseline corrected TIC tic.write("output/tic.dat", minutes=True) tic1.write("output/tic_smooth.dat", minutes=True) tic2.write("output/tic_smooth_bc.dat", minutes=True)
ic_smooth1 = im_smooth1.get_ic_at_index(73)

ic.write(output_directory / "noise_smoothing_ic.dat", minutes=True)
ic_smooth1.write(output_directory / "noise_smoothing_ic_smooth1.dat", minutes=True)

# ## Savitzky-Golay noise filter
#
# A more sophisticated noise filter is the Savitzky-Golay filter.
# Given the data loaded as above, this filter can be applied as
# follows:

# In[9]:

from pyms.Noise.SavitzkyGolay import savitzky_golay

tic4 = savitzky_golay(tic)

# Write the smoothed TIC to disk:

# In[10]:

tic4.write(output_directory / "noise_smoothing_tic4.dat", minutes=True)

# In this example the default parameters were used.
#
# ### Savitzky-Golay noise filtering of Intensity Matrix objects
#
# The |savitzky_golay()| function described above acts on a single
# |IonChromatogram|. Where it is desired to perform Savitzky-Golay
# filtering on the whole |IntensityMatrix|, the function
# |savitzky_golay_im()| may be used as follows:
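# In[11]:

# A minimal sketch completing the example above, assuming `im` is an
# |IntensityMatrix| built earlier in the notebook; |savitzky_golay_im()|
# applies the same default filter to every ion chromatogram in the matrix.

from pyms.Noise.SavitzkyGolay import savitzky_golay_im

im_smooth = savitzky_golay_im(im)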
# read in the raw data
andi_file = "/x/PyMS/data/gc01_0812_066.cdf"
data = ANDI_reader(andi_file)

data.trim(4101, 4350)

# Build the Intensity Matrix
real_im = build_intensity_matrix_i(data)

n_scan, n_mz = real_im.get_size()

# perform the necessary pre-filtering
for ii in range(n_mz):
    ic = real_im.get_ic_at_index(ii)
    ic_smooth = savitzky_golay(ic)
    ic_bc = tophat(ic_smooth, struct="1.5m")
    real_im.set_ic_at_index(ii, ic_bc)

# Detect Peaks
peak_list = BillerBiemann(real_im, points=3, scans=2)

print("Number of peaks found in real data: ", len(peak_list))

######### Filter peaks ###############
# Filter the peak list,
# first by removing all intensities in a peak less than a given relative
# threshold,
# then by removing all peaks that have less than a given number of ions above
# a given value
def missing_peak_finder(sample, andi_file, points=7, null_ions=[73, 207],
                        crop_ions=[45, 300], threshold=100000, rt_window=10):
    """
    @summary: Integrates raw data around missing peak locations
              to fill in NAs in the data matrix

    @param sample: The sample object containing missing peaks
    @type sample: pyms.MissingPeak.Class.Sample
    @param andi_file: Name of the raw data file
    @type andi_file: stringType
    @param points: Peak finding - peak if maxima over 'points'
                   number of scans (Default 7)
    @type points: intType
    @param null_ions: Ions to be deleted in the matrix
    @type null_ions: listType
    @param crop_ions: Range of ions to be considered
    @type crop_ions: listType
    @param threshold: Minimum intensity of IonChromatogram allowable to fill
                      missing peak
    @type threshold: intType
    @param rt_window: Window in seconds around average RT to look for
                      missing peak
    @type rt_window: floatType

    @author: Sean O'Callaghan
    """

    # TODO: some error checks on null and crop ions
    # TODO: a for root, files, dirs in os.path.walk(): loop

    print("Sample:", sample.get_name(), "andi_file:", andi_file)

    data = ANDI_reader(andi_file)

    # build the integer intensity matrix
    im = build_intensity_matrix_i(data)

    for null_ion in null_ions:
        im.null_mass(null_ion)

    im.crop_mass(crop_ions[0], crop_ions[1])

    # get the size of the intensity matrix
    n_scan, n_mz = im.get_size()

    # smooth the data
    for ii in range(n_mz):
        ic = im.get_ic_at_index(ii)
        ic1 = savitzky_golay(ic, points)
        ic_smooth = savitzky_golay(ic1, points)
        ic_base = tophat(ic_smooth, struct="1.5m")
        im.set_ic_at_index(ii, ic_base)

    for mp in sample.get_missing_peaks():
        mp_rt = mp.get_rt()
        common_ion = mp.get_ci()
        qual_ion_1 = float(mp.get_qual_ion1())
        qual_ion_2 = float(mp.get_qual_ion2())

        ci_ion_chrom = im.get_ic_at_mass(common_ion)
        qi1_ion_chrom = im.get_ic_at_mass(qual_ion_1)
        qi2_ion_chrom = im.get_ic_at_mass(qual_ion_2)

        # Integrate the CI around that particular RT.
        # Convert time to points; how long between scans?
        points_1 = ci_ion_chrom.get_index_at_time(float(mp_rt))
        points_2 = ci_ion_chrom.get_index_at_time(float(mp_rt) - rt_window)
        rt_window_points = points_1 - points_2

        maxima_list = get_maxima_list_reduced(ci_ion_chrom, mp_rt, rt_window_points)

        large_peaks = []

        for rt, intens in maxima_list:
            if intens > threshold:
                q1_index = qi1_ion_chrom.get_index_at_time(rt)
                q2_index = qi2_ion_chrom.get_index_at_time(rt)

                q1_intensity = qi1_ion_chrom.get_intensity_at_index(q1_index)
                q2_intensity = qi2_ion_chrom.get_intensity_at_index(q2_index)

                if q1_intensity > threshold / 2 and q2_intensity > threshold / 2:
                    large_peaks.append([rt, intens])

        areas = []
        for peak in large_peaks:
            apex = ci_ion_chrom.get_index_at_time(peak[0])
            ia = ci_ion_chrom.get_intensity_array().tolist()
            area, left, right, l_share, r_share = ion_area(ia, apex, 0)
            areas.append(area)

        areas.sort()
        if len(areas) > 0:
            biggest_area = areas[-1]
            mp.set_ci_area(biggest_area)
        else:
            mp.set_ci_area('NA')
def import_processing(jcamp_file, spectrum_csv_file, report_csv_file, combined_csv_file,
                      bb_points=9, bb_scans=2, noise_thresh=2, target_range=(0, 120),
                      tophat_struct="1.5m", nistpath="../MSSEARCH",
                      base_peak_filter=['73'], ExprDir="."):
    global nist_path
    nist_path = nistpath

    # Parameters
    base_peak_filter = [int(x) for x in base_peak_filter]
    target_range = tuple(target_range)

    sample_name = os.path.splitext(os.path.basename(jcamp_file))[0]
    number_of_peaks = 80

    data = JCAMP_reader(jcamp_file)

    # list of all retention times, in seconds
    times = data.get_time_list()
    # get the Total Ion Chromatogram
    tic = data.get_tic()
    # RT Range, time step, no. scans, min, max, mean and median m/z
    data.info()
    # data.write("output/data")  # save output

    # Mass Binning
    im = build_intensity_matrix_i(data)  # convert to intensity matrix
    # im.get_size()  # number of scans, number of bins
    masses = im.get_mass_list()  # list of mass bins

    print(" Minimum m/z bin: {}".format(im.get_min_mass()))
    print(" Maximum m/z bin: {}".format(im.get_max_mass()))

    # Write the binned mass spectra to an OpenChrom-like CSV file
    ms = im.get_ms_at_index(0)  # first mass spectrum
    spectrum_csv = open(spectrum_csv_file, 'w')
    spectrum_csv.write('RT(milliseconds);RT(minutes) - NOT USED BY IMPORT;RI;')
    spectrum_csv.write(';'.join(str(mz) for mz in ms.mass_list))
    spectrum_csv.write("\n")

    for scan in range(len(times)):
        spectrum_csv.write("{};{};{};".format(
            int(times[scan] * 1000),
            rounders((times[scan] / 60), "0.0000000000"),
            0))
        ms = im.get_ms_at_index(scan)
        spectrum_csv.write(';'.join(str(intensity) for intensity in ms.mass_spec))
        spectrum_csv.write('\n')
    spectrum_csv.close()

    ## Data filtering
    # Note that Turbomass does not use smoothing for the qualitative method.
    # Top-hat baseline correction seems to bring down noise,
    # retaining shapes, but keeps points on actual peaks.

    # dump_object(im, "output/im.dump")  # un-processed output

    n_scan, n_mz = im.get_size()

    for ii in range(n_mz):
        # print("\rWorking on IC#", ii+1, ' ', end='')
        ic = im.get_ic_at_index(ii)
        ic_smooth = savitzky_golay(ic)
        ic_bc = tophat(ic_smooth, struct=tophat_struct)
        im.set_ic_at_index(ii, ic_bc)

    # dump_object(im, "output/im-proc.dump")  # processed output

    # Peak Detection based on Biller and Biemann (1974), with a window
    # of n points, combining y scans if they apex next to each other
    peak_list = BillerBiemann(im, points=bb_points, scans=bb_scans)

    print(" Number of peaks identified before filtering: {}".format(len(peak_list)))

    # Filtering peak lists with automatic noise filtering
    noise_level = window_analyzer(tic)
    # why use 2 for the number of ions above the threshold?
    peak_list = num_ions_threshold(peak_list, noise_thresh, noise_level)

    print(" Number of peaks identified: {}".format(len(peak_list)))

    # Peak Areas
    peak_area_list = []
    filtered_peak_list = []

    for peak in peak_list:
        apex_mass_list = peak.get_mass_spectrum().mass_list
        apex_mass_spec = peak.get_mass_spectrum().mass_spec
        base_peak_intensity = max(apex_mass_spec)
        base_peak_index = [
            index for index, intensity in enumerate(apex_mass_spec)
            if intensity == base_peak_intensity][0]
        base_peak_mass = apex_mass_list[base_peak_index]

        # skip the peak if the base peak is at e.g. m/z 73, i.e. septum bleed
        if base_peak_mass in base_peak_filter:
            continue

        area = peak_sum_area(im, peak)
        peak.set_area(area)
        peak_area_list.append(area)
        filtered_peak_list.append(peak)

    # Save the TIC and Peak List
    tic.write(os.path.join(ExprDir, "{}_tic.dat".format(sample_name)), formatting=False)
    store_peaks(filtered_peak_list, os.path.join(ExprDir, "{}_peaks.dat".format(sample_name)))

    # from https://stackoverflow.com/questions/16878715/how-to-find-the-index-of-n-largest-elements-in-a-list-or-np-array-python?lq=1
    top_peaks = sorted(range(len(peak_area_list)), key=lambda x: peak_area_list[x])

    # Write to a Turbomass-like CSV file
    report_csv = open(report_csv_file, "w")

    # Write to a GunShotMatch Combine-like CSV file
    combine_csv = open(combined_csv_file, "w")

    combine_csv.write(sample_name)
    combine_csv.write("\n")

    report_csv.write("#;RT;Scan;Height;Area\n")
    combine_csv.write("Retention Time;Peak Area;;Lib;Match;R Match;Name;CAS Number;Scan\n")

    report_buffer = []

    for index in top_peaks:
        # Peak Number (1-80)
        peak_number = top_peaks.index(index) + 1
        # Retention time (minutes, 3dp)
        RT = rounders(filtered_peak_list[index].get_rt() / 60, "0.000")

        if not target_range[0] < RT <= target_range[1]:
            continue  # skip the peak if it is outside the desired range

        # scan number; not that we really need it, as the peak object has the spectrum
        Scan = data.get_index_at_time(filtered_peak_list[index].get_rt()) + 1

        # the binned mass spectrum
        filtered_peak_list[index].get_mass_spectrum()

        # TIC intensity, as a proxy for peak height, which should be from the baseline
        Height = '{:,}'.format(rounders(tic.get_intensity_at_index(
            data.get_index_at_time(filtered_peak_list[index].get_rt())), "0"))

        # Peak area, originally in "intensity seconds", so dividing by 60 to
        # get "intensity minutes" like Turbomass uses
        Area = '{:,}'.format(rounders(filtered_peak_list[index].get_area() / 60, "0.0"))

        report_buffer.append([peak_number, RT, Scan, Height, Area])

    report_buffer = report_buffer[::-1]  # Reverse the list order

    filtered_report_buffer = []
    for row in report_buffer:
        filtered_report_buffer.append(row)

    filtered_report_buffer = filtered_report_buffer[:number_of_peaks]
    filtered_report_buffer.sort(key=operator.itemgetter(2))

    for row in filtered_report_buffer:
        index = filtered_report_buffer.index(row)
        report_csv.write(";".join([str(i) for i in row]))

        ms = im.get_ms_at_index(row[2] - 1)

        create_msp("{}_{}".format(sample_name, row[1]), ms.mass_list, ms.mass_spec)
        matches_dict = nist_ms_comparison("{}_{}".format(sample_name, row[1]),
                                          ms.mass_list, ms.mass_spec)

        combine_csv.write("{};{};Page {} of 80;;;;;;{}\n".format(row[1], row[4], index + 1, row[2]))

        for hit in range(1, 6):
            report_csv.write(str(matches_dict["Hit{}".format(hit)]))
            report_csv.write(";")
            combine_csv.write(";;{};{};{};{};{};{};\n".format(
                hit,
                matches_dict["Hit{}".format(hit)]["Lib"],
                matches_dict["Hit{}".format(hit)]["MF"],
                matches_dict["Hit{}".format(hit)]["RMF"],
                matches_dict["Hit{}".format(hit)]["Name"],
                matches_dict["Hit{}".format(hit)]["CAS"],
            ))

        report_csv.write("\n")

        time.sleep(2)

    report_csv.close()
    combine_csv.close()

    # Create an experiment
    expr = Experiment(sample_name, filtered_peak_list)
    expr.sele_rt_range(["{}m".format(target_range[0]), "{}m".format(target_range[1])])
    store_expr(os.path.join(ExprDir, "{}.expr".format(sample_name)), expr)

    return 0
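# A hypothetical invocation of import_processing() above. All file names and
# the NIST search path are placeholders, not values from the original source.

import_processing(
    "sample_01.JDX",           # hypothetical JCAMP-DX input
    "sample_01_spectra.csv",   # OpenChrom-like binned spectra output
    "sample_01_report.csv",    # Turbomass-like report output
    "sample_01_combined.csv",  # GunShotMatch Combine-like output
    bb_points=9,
    bb_scans=2,
    noise_thresh=2,
    target_range=(0, 120),
    nistpath="../MSSEARCH",
)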
# read the raw data
andi_file = data_directory / "data/gc01_0812_066.cdf"
data = ANDI_reader(andi_file)

data.trim(4101, 4350)

# Build the Intensity Matrix
real_im = build_intensity_matrix_i(data)

n_scan, n_mz = real_im.size

# perform the necessary pre-filtering
for ii in range(n_mz):
    ic = real_im.get_ic_at_index(ii)
    ic_smooth = savitzky_golay(ic)
    ic_bc = tophat(ic_smooth, struct="1.5m")
    real_im.set_ic_at_index(ii, ic_bc)

# Detect Peaks
peak_list = BillerBiemann(real_im, points=3, scans=2)

print("Number of peaks found in real data: ", len(peak_list))

######### Filter peaks ###############
# Filter the peak list,
# first by removing all intensities in a peak less than a given relative
# threshold,
# then by removing all peaks that have less than a given number of ions above
# a given value
from pyms.BillerBiemann import BillerBiemann, rel_threshold, num_ions_threshold

# read the raw data as a GCMS_data object
andi_file = "data/gc01_0812_066.cdf"
data = ANDI_reader(andi_file)

im = build_intensity_matrix_i(data)

n_scan, n_mz = im.size
print("Intensity matrix size (scans, masses):", (n_scan, n_mz))

# noise filter and baseline correct
for ii in range(n_mz):
    ic = im.get_ic_at_index(ii)
    ic_smooth = savitzky_golay(ic)
    ic_bc = tophat(ic_smooth, struct="1.5m")
    im.set_ic_at_index(ii, ic_bc)

# Use the Biller and Biemann technique to find apexing ions at a scan.
# Find apex over 9 points and combine with the neighbouring peak if two
# scans apex next to each other.
peak_list = BillerBiemann(im, points=9, scans=2)

print("Number of peaks found: ", len(peak_list))

# Filter the peak list,
# first by removing all intensities in a peak less than a given relative
# threshold,
# then by removing all peaks that have less than a given number of ions above
# a given value
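# A minimal sketch of the two filtering stages described above, using
# rel_threshold() and num_ions_threshold() from the import at the top of this
# script; the 2% and 3-ions-above-3000 values mirror those used elsewhere in
# this document and are illustrative rather than prescriptive.

# remove, within each peak, all intensities below 2% of the base peak
apl = rel_threshold(peak_list, 2)

# keep only peaks with at least 3 ions above an intensity of 3000
new_peak_list = num_ions_threshold(apl, 3, 3000)

print("Number of peaks after filtering: ", len(new_peak_list))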
def missing_peak_finder(sample, filename, points=13, null_ions=[73, 147],
                        crop_ions=[50, 540], threshold=1000, rt_window=1, filetype='cdf'):
    """
    @summary: Integrates raw data around missing peak locations
              to fill in NAs in the data matrix

    @param sample: The sample object containing missing peaks
    @type sample: pyms.MissingPeak.Class.Sample
    @param filename: Name of the raw data file
    @type filename: stringType
    @param points: Peak finding - peak if maxima over 'points'
                   number of scans (Default 13)
    @type points: intType
    @param null_ions: Ions to be deleted in the matrix
    @type null_ions: listType
    @param crop_ions: Range of ions to be considered
    @type crop_ions: listType
    @param threshold: Minimum intensity of IonChromatogram allowable to fill
                      missing peak
    @type threshold: intType
    @param rt_window: Window in seconds around average RT to look for
                      missing peak
    @type rt_window: floatType
    @param filetype: Type of raw data file ('cdf' or 'mzml')
    @type filetype: stringType

    @author: Sean O'Callaghan
    """

    # TODO: some error checks on null and crop ions
    # TODO: a for root, files, dirs in os.path.walk(): loop

    print("Sample:", sample.get_name(), "File:", filename)

    if filetype == 'cdf':
        data = ANDI_reader(filename)
    elif filetype == 'mzml':
        data = mzML_reader(filename)
    else:
        print("file type not valid")

    # build the integer intensity matrix
    im = build_intensity_matrix_i(data)

    for null_ion in null_ions:
        im.null_mass(null_ion)

    im.crop_mass(crop_ions[0], crop_ions[1])

    # get the size of the intensity matrix
    n_scan, n_mz = im.get_size()

    # smooth the data
    for ii in range(n_mz):
        ic = im.get_ic_at_index(ii)
        ic1 = savitzky_golay(ic, points)
        ic_smooth = savitzky_golay(ic1, points)
        ic_base = tophat(ic_smooth, struct="1.5m")
        im.set_ic_at_index(ii, ic_base)

    for mp in sample.get_missing_peaks():
        mp_rt = mp.get_rt()
        common_ion = mp.get_ci()
        qual_ion_1 = float(mp.get_qual_ion1())
        qual_ion_2 = float(mp.get_qual_ion2())

        ci_ion_chrom = im.get_ic_at_mass(common_ion)
        print("ci = ", common_ion)
        qi1_ion_chrom = im.get_ic_at_mass(qual_ion_1)
        print("qi1 = ", qual_ion_1)
        qi2_ion_chrom = im.get_ic_at_mass(qual_ion_2)
        print("qi2 = ", qual_ion_2)

        # Integrate the CI around that particular RT.
        # Convert time to points; how long between scans?
        points_1 = ci_ion_chrom.get_index_at_time(float(mp_rt))
        points_2 = ci_ion_chrom.get_index_at_time(float(mp_rt) - rt_window)
        print("rt_window = ", points_1 - points_2)

        rt_window_points = points_1 - points_2

        maxima_list = get_maxima_list_reduced(ci_ion_chrom, mp_rt, rt_window_points)

        large_peaks = []

        for rt, intens in maxima_list:
            if intens > threshold:
                q1_index = qi1_ion_chrom.get_index_at_time(rt)
                q2_index = qi2_ion_chrom.get_index_at_time(rt)

                q1_intensity = qi1_ion_chrom.get_intensity_at_index(q1_index)
                q2_intensity = qi2_ion_chrom.get_intensity_at_index(q2_index)

                if q1_intensity > threshold / 2 and q2_intensity > threshold / 2:
                    large_peaks.append([rt, intens])

        print('found %d peaks above threshold' % len(large_peaks))

        areas = []
        for peak in large_peaks:
            apex = ci_ion_chrom.get_index_at_time(peak[0])
            ia = ci_ion_chrom.get_intensity_array().tolist()
            area, left, right, l_share, r_share = ion_area(ia, apex, 0)
            areas.append(area)

        areas.sort()
        if len(areas) > 0:
            biggest_area = areas[-1]
            mp.set_ci_area(biggest_area)
            print("found area:", biggest_area, "at rt:", mp_rt)
        else:
            print("Missing peak at rt = ", mp_rt)
            mp.set_ci_area('na')
def missing_peak_finder(
        sample: Sample,
        file_name: str,
        points: int = 3,
        null_ions: Optional[List] = None,
        crop_ions: Optional[List] = None,
        threshold: int = 1000,
        rt_window: float = 1,
        filetype: MissingPeakFiletype = MZML,
):
    r"""
    Integrates raw data around missing peak locations to fill ``NA``\s in the data matrix.

    :param sample: The sample object containing missing peaks
    :param file_name: Name of the raw data file
    :param points: Peak finding - peak if maxima over 'points' number of scans.
    :param null_ions: Ions to be deleted in the matrix.
    :default null_ions: ``[73, 147]``
    :param crop_ions: Range of ions to be considered.
    :default crop_ions: ``[50, 540]``
    :param threshold: Minimum intensity of IonChromatogram allowable to fill.
    :param rt_window: Window in seconds around average RT to look for.
    :param filetype:

    :author: Sean O'Callaghan
    """

    if not null_ions:
        null_ions = [73, 147]
    if not crop_ions:
        crop_ions = [50, 540]

    # TODO: some error checks on null and crop ions
    # TODO: a for root, files, dirs in os.path.walk(): loop

    print("Sample:", sample.name, "File:", file_name)

    if filetype == NETCDF:
        # this package
        from pyms.GCMS.IO.ANDI import ANDI_reader
        data = ANDI_reader(file_name)
    elif filetype == MZML:
        # this package
        from pyms.GCMS.IO.MZML import mzML_reader
        data = mzML_reader(file_name)
    else:
        print("file type not valid")

    # build the integer intensity matrix
    im = build_intensity_matrix_i(data)

    for null_ion in null_ions:
        im.null_mass(null_ion)

    im.crop_mass(crop_ions[0], crop_ions[1])

    # get the size of the intensity matrix
    n_scan, n_mz = im.size

    # smooth the data
    for ii in range(n_mz):
        ic = im.get_ic_at_index(ii)
        ic1 = savitzky_golay(ic, points)
        ic_smooth = savitzky_golay(ic1, points)
        ic_base = tophat(ic_smooth, struct="1.5m")
        im.set_ic_at_index(ii, ic_base)

    for mp in sample.missing_peaks:
        mp_rt = mp.rt
        common_ion = mp.common_ion
        qual_ion_1 = float(mp.qual_ion1)
        qual_ion_2 = float(mp.qual_ion2)

        ci_ion_chrom = im.get_ic_at_mass(common_ion)
        print("ci = ", common_ion)
        qi1_ion_chrom = im.get_ic_at_mass(qual_ion_1)
        print("qi1 = ", qual_ion_1)
        qi2_ion_chrom = im.get_ic_at_mass(qual_ion_2)
        print("qi2 = ", qual_ion_2)

        # Integrate the CI around that particular RT.
        # Convert time to points; how long between scans?
        points_1 = ci_ion_chrom.get_index_at_time(float(mp_rt))
        points_2 = ci_ion_chrom.get_index_at_time(float(mp_rt) - rt_window)
        print("rt_window = ", points_1 - points_2)

        rt_window_points = points_1 - points_2

        maxima_list = get_maxima_list_reduced(ci_ion_chrom, mp_rt, rt_window_points)

        large_peaks = []

        for rt, intens in maxima_list:
            if intens > threshold:
                q1_index = qi1_ion_chrom.get_index_at_time(rt)
                q2_index = qi2_ion_chrom.get_index_at_time(rt)

                q1_intensity = qi1_ion_chrom.get_intensity_at_index(q1_index)
                q2_intensity = qi2_ion_chrom.get_intensity_at_index(q2_index)

                if q1_intensity > threshold / 2 and q2_intensity > threshold / 2:
                    large_peaks.append([rt, intens])

        print(f"found {len(large_peaks):d} peaks above threshold")

        areas = []
        for peak in large_peaks:
            apex = ci_ion_chrom.get_index_at_time(peak[0])
            ia = ci_ion_chrom.intensity_array.tolist()
            area, left, right, l_share, r_share = ion_area(ia, apex, 0)
            areas.append(area)

        areas.sort()
        if len(areas) > 0:
            biggest_area = areas[-1]
            mp.common_ion_area = biggest_area
            # mp.exact_rt = f"{float(mp_rt) / 60.0:.3f}"
            mp.exact_rt = float(mp_rt) / 60.0
            print("found area:", biggest_area, "at rt:", mp_rt)
        else:
            print("Missing peak at rt = ", mp_rt)
            mp.common_ion_area = None
"""proc.py """ import sys sys.path.append("/x/PyMS/") from pyms.GCMS.IO.ANDI.Function import ANDI_reader from pyms.Noise.SavitzkyGolay import savitzky_golay from pyms.Baseline.TopHat import tophat # read the raw data andi_file = "/x/PyMS/data/gc01_0812_066.cdf" data = ANDI_reader(andi_file) # get the TIC tic = data.get_tic() # apply noise smoothing and baseline correction tic1 = savitzky_golay(tic) tic2 = tophat(tic1, struct="1.5m") # save smoothed/baseline corrected TIC tic.write("output/tic.dat",minutes=True) tic1.write("output/tic_smooth.dat",minutes=True) tic2.write("output/tic_smooth_bc.dat",minutes=True)
andi_file = data_directory / "a0806_077.cdf"
data = ANDI_reader(andi_file)

im = build_intensity_matrix_i(data)

# Preprocess the data (Savitzky-Golay smoothing and Tophat baseline detection)

# In[3]:

from pyms.Noise.SavitzkyGolay import savitzky_golay
from pyms.TopHat import tophat

n_scan, n_mz = im.size

for ii in range(n_mz):
    ic = im.get_ic_at_index(ii)
    ic1 = savitzky_golay(ic)
    ic_smooth = savitzky_golay(ic1)  # Why the second pass here?
    ic_bc = tophat(ic_smooth, struct="1.5m")
    im.set_ic_at_index(ii, ic_bc)

# Now the Biller and Biemann based technique can be applied to detect peaks.

# In[4]:

from pyms.BillerBiemann import BillerBiemann

pl = BillerBiemann(im, points=9, scans=2)
len(pl)

# Trim the peak list by relative intensity
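# In[5]:

# A minimal sketch of the trimming step named above; the 2% relative-intensity
# threshold matches the value used elsewhere in this document.

from pyms.BillerBiemann import rel_threshold

apl = rel_threshold(pl, percent=2)
len(apl)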
# read the raw data as a GCMS_data object
data = ANDI_reader(andi_file)
# data.trim(2431, 2469)

# IntensityMatrix
# default: float masses with an interval (bin interval) of one from the minimum mass
print("default intensity matrix, bin interval = 1, boundary +/- 0.5")
im = build_intensity_matrix(data)

im.null_mass(73)
im.null_mass(147)

n_scan, n_mz = im.get_size()

for ii in range(n_mz):
    ic = im.get_ic_at_index(ii)
    ic_smooth = savitzky_golay(ic)
    ic_base = tophat(ic_smooth, struct="1.5m")
    im.set_ic_at_index(ii, ic_base)

# Load the experiment
exper = load_expr(expr_file)

# Load the peak list
peak_list = exper.get_peak_list()

# Pass the ion chromatograms into a list of ICs
n_mz = len(im.get_mass_list())
ic = []
for m in range(n_mz):
    ic.append(im.get_ic_at_index(m))
def quantitative_processing(self, jcamp_file, log_stdout=True):
    """
    Import and process a JCAMP-DX file.

    :param jcamp_file: Path to the JCAMP-DX file to process
    :type jcamp_file: str
    :param log_stdout: Whether to log stdout to a file
    :type log_stdout: bool
    """

    # Determine the name of the sample from the filename
    sample_name = os.path.splitext(os.path.basename(jcamp_file))[0]

    # Log stdout to file
    if log_stdout:
        sys.stdout = open(os.path.join(self.config.log_dir, sample_name + ".log"), "w")

    # Load data using JCAMP_reader
    data = JCAMP_reader(jcamp_file)

    # list of all retention times, in seconds
    times = data.get_time_list()
    # get the Total Ion Chromatogram
    tic = data.get_tic()
    # RT Range, time step, no. scans, min, max, mean and median m/z
    data.info()

    # Build "intensity matrix" by binning data with integer bins and a
    # window of -0.3 to +0.7, the same as NIST uses
    im = build_intensity_matrix_i(data)

    # Show the m/z of the maximum and minimum bins
    print(" Minimum m/z bin: {}".format(im.get_min_mass()))
    print(" Maximum m/z bin: {}".format(im.get_max_mass()))

    # Crop masses
    min_mass, max_mass, *_ = self.config.mass_range

    if min_mass < im.get_min_mass():
        min_mass = im.get_min_mass()
    if max_mass > im.get_max_mass():
        max_mass = im.get_max_mass()

    im.crop_mass(min_mass, max_mass)

    # Perform Data filtering
    n_scan, n_mz = im.get_size()

    # Iterate over each IC in the intensity matrix
    for ii in range(n_mz):
        # print("\rWorking on IC#", ii+1, ' ', end='')
        ic = im.get_ic_at_index(ii)

        # Perform Savitzky-Golay smoothing.
        # Note that Turbomass does not use smoothing for the qualitative method.
        ic_smooth = savitzky_golay(ic)

        # Perform Tophat baseline correction.
        # Top-hat baseline correction seems to bring down noise,
        # retaining shapes, but keeps points on actual peaks.
        ic_bc = tophat(ic_smooth, struct=self.config.tophat_struct)

        # Set the IC in the intensity matrix to the filtered one
        im.set_ic_at_index(ii, ic_bc)

    # Peak Detection based on Biller and Biemann (1974), with a window
    # of <points>, combining <scans> if they apex next to each other
    peak_list = BillerBiemann(im, points=self.config.bb_points, scans=self.config.bb_scans)

    print(" Number of peaks identified before filtering: {}".format(len(peak_list)))

    # Filtering peak lists with automatic noise filtering
    noise_level = window_analyzer(tic)
    # should we also do rel_threshold() here?
    # https://pymassspec.readthedocs.io/en/master/pyms/BillerBiemann.html#pyms.BillerBiemann.rel_threshold
    peak_list = num_ions_threshold(peak_list, self.config.noise_thresh, noise_level)

    filtered_peak_list = []

    for peak in peak_list:
        # Get mass and intensity lists for the mass spectrum at the apex of the peak
        apex_mass_list = peak.mass_spectrum.mass_list
        apex_mass_spec = peak.mass_spectrum.mass_spec

        # Determine the intensity of the base peak in the mass spectrum
        base_peak_intensity = max(apex_mass_spec)

        # Determine the index of the base peak in the mass spectrum
        base_peak_index = [
            index for index, intensity in enumerate(apex_mass_spec)
            if intensity == base_peak_intensity][0]

        # Finally, determine the mass of the base peak
        base_peak_mass = apex_mass_list[base_peak_index]

        # skip the peak if the base peak is at e.g. m/z 73, i.e. septum bleed
        if base_peak_mass in self.config.base_peak_filter:
            continue

        area = peak_sum_area(im, peak)
        peak.set_area(area)
        filtered_peak_list.append(peak)

    print(" Number of peaks identified: {}".format(len(filtered_peak_list)))

    # Save the TIC and Peak List
    tic.write(os.path.join(self.config.expr_dir, "{}_tic.dat".format(sample_name)), formatting=False)
    store_peaks(filtered_peak_list, os.path.join(self.config.expr_dir, "{}_peaks.dat".format(sample_name)))

    # Create an experiment
    expr = Experiment(sample_name, filtered_peak_list)
    expr.sele_rt_range(["{}m".format(self.config.target_range[0]),
                        "{}m".format(self.config.target_range[1])])
    store_expr(os.path.join(self.config.expr_dir, "{}.expr".format(sample_name)), expr)