@pytest.fixture()  # Assumed to be a pytest fixture: it is consumed by test_align_2_alignments below
def A1(T1):
    # Align the input alignments in ``T1`` along the guide tree
    A1 = align_with_tree(T1, min_peaks=2)
    assert isinstance(A1, Alignment)
    assert isinstance(len(A1), int)
    assert len(A1) == 232

    # Drop peak columns with fewer than five peaks
    A1.filter_min_peaks(5)
    assert len(A1) == 50

    return A1
T1 = PairwiseAlignment(F1, Dw, Gw)

# The parameters for the alignment by dynamic programming are: ``Dw``, the
# retention time modulation in seconds; and ``Gw``, the gap penalty. These
# parameters are explained in detail in [1]_.
#
# The output of |PairwiseAlignment| (``T1``) is an object which contains the
# dendrogram tree that maps the similarity relationship between the input
# 1-alignments, and also the 1-alignments themselves.
#
# The function |align_with_tree()| then takes the object ``T1`` and aligns the
# individual alignment objects according to the guide tree.

# In[24]:

A1 = align_with_tree(T1, min_peaks=2)

# In this example, the individual alignments are three 1-alignments, and the
# function |align_with_tree()| first creates a 2-alignment from the two most
# similar 1-alignments and then adds the third 1-alignment to this to create
# a 3-alignment.
#
# The parameter ``min_peaks=2`` specifies that any peak column of the data
# matrix that has fewer than two peaks in the final alignment will be dropped.
# This is useful for cleaning the data matrix of accidental peaks that are not
# truly observed over the set of replicates.
#
# Finally, the resulting 3-alignment is saved by writing alignment tables
# containing peak retention times (``rt.csv``) and the corresponding peak areas
# (``area.csv``). These are plain ASCII files in CSV format.
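# As a minimal sketch of that final step (the file names follow the text
# above; any output directory could be used):

A1.write_csv("rt.csv", "area.csv")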
def test_align_2_alignments(A1, pyms_datadir, tmp_pathplus):
    expr_list = []

    for jcamp_file in geco_codes:
        im = build_intensity_matrix_i(JCAMP_reader(pyms_datadir / f"{jcamp_file}.JDX"))

        # Intensity matrix size (scans, masses)
        n_scan, n_mz = im.size

        # noise filter and baseline correct
        for ii in range(n_mz):
            ic = im.get_ic_at_index(ii)
            ic_smooth = savitzky_golay(ic)
            ic_bc = tophat(ic_smooth, struct="1.5m")
            im.set_ic_at_index(ii, ic_bc)

        peak_list = BillerBiemann(im, points=9, scans=2)
        apl = rel_threshold(peak_list, 2)
        new_peak_list = num_ions_threshold(apl, 3, 3000)

        # ignore TMS ions and set mass range
        for peak in new_peak_list:
            peak.crop_mass(50, 400)
            peak.null_mass(73)
            peak.null_mass(147)

            # find area
            area = peak_sum_area(im, peak)
            peak.area = area
            area_dict = peak_top_ion_areas(im, peak)
            peak.ion_areas = area_dict

        expr = Experiment(jcamp_file, new_peak_list)

        # set time range for all experiments
        expr.sele_rt_range(["6.5m", "21m"])
        expr_list.append(expr)

    F2 = exprl2alignment(expr_list)
    T2 = PairwiseAlignment(F2, Dw, Gw)
    A2 = align_with_tree(T2, min_peaks=2)

    # top_ion_list = A2.common_ion()
    # A2.write_common_ion_csv(tmp_pathplus / 'area1.csv', top_ion_list)

    # between replicates alignment parameters
    Db = 10.0  # rt modulation
    Gb = 0.30  # gap penalty

    print("Aligning input {1,2}")
    T9 = PairwiseAlignment([A1, A2], Db, Gb)
    A9 = align_with_tree(T9)

    A9.write_csv(tmp_pathplus / "rt.csv", tmp_pathplus / "area.csv")

    aligned_peaks = list(filter(None, A9.aligned_peaks()))
    store_peaks(aligned_peaks, tmp_pathplus / "peaks.bin")

    top_ion_list = A9.common_ion()
    A9.write_common_ion_csv(tmp_pathplus / "area.csv", top_ion_list)
def peak_comparison(self, a_value=0.05):
    """
    Compare the aligned peaks of the left and right samples and write the
    results to ``<comparison_name>_COMPARISON.CSV`` (also converted to xlsx).

    :param a_value: Significance level (α) for the t-tests.
    :type a_value: float
    """

    output_filename = os.path.join(self.config.results_dir, f"{self.comparison_name}_COMPARISON")

    # Open the output file, retrying if it is locked by another program
    while True:
        try:
            outputCSV = open(output_filename + ".CSV", "w")
            outputCSV.write(
                f"Explained Variance Ratio: {rounders(self.pca[0], '0.0000')}, "
                f"{rounders(self.pca[1], '0.0000')};;;;{self.left_sample};;;;;;;;"
                f"{self.right_sample};;;;;;;;t-tests;;;;;;;;\n")
            outputCSV.write(
                f"t-test Threshold α={a_value};;;;Retention Time;;;;Peak Area;;;;"
                f"Retention Time;;;;Peak Area;;;;Retention Time;;;;Peak Area;;;;"
                f"Welch's t-test Peak Area;;;;MS Comparison\n")
            outputCSV.write(
                "Name;CAS Number;;;Mean;STDEV;%RSD;;Mean;STDEV;%RSD;;Mean;STDEV;"
                "%RSD;;Mean;STDEV;%RSD;;t-statistic;p-value;Result;;t-statistic;"
                "p-value;Result;;t-statistic;p-value;Result;;Mean;STDEV;%RSD\n")
            break
        except IOError:
            print(f"The file '{output_filename}' is locked for editing in another program.")
            input("Press any key to try again.")

    """Peak Data"""
    left_peak_data = []
    with open(os.path.join(self.config.csv_dir, f"{self.left_sample}_peak_data.json"), "r") as jsonfile:
        for peak in jsonfile.readlines():
            left_peak_data.append(json.loads(peak))

    right_peak_data = []
    with open(os.path.join(self.config.csv_dir, f"{self.right_sample}_peak_data.json"), "r") as jsonfile:
        for peak in jsonfile.readlines():
            right_peak_data.append(json.loads(peak))

    """Alignment Data"""
    # define the input experiments list
    left_expr_list = []
    for prefix in self.left_prefixList:
        file_name = os.path.join(self.config.expr_dir, f"{prefix}.expr")
        left_expr_list.append(load_expr(file_name))

    right_expr_list = []
    for prefix in self.right_prefixList:
        file_name = os.path.join(self.config.expr_dir, f"{prefix}.expr")
        right_expr_list.append(load_expr(file_name))

    print("\nAligning\n")
    left_F1 = exprl2alignment(left_expr_list)
    left_T1 = PairwiseAlignment(
        left_F1, self.config.comparison_rt_modulation, self.config.comparison_gap_penalty)
    left_A1 = align_with_tree(left_T1, min_peaks=self.config.comparison_min_peaks)

    right_F2 = exprl2alignment(right_expr_list)
    right_T2 = PairwiseAlignment(
        right_F2, self.config.comparison_rt_modulation, self.config.comparison_gap_penalty)
    right_A2 = align_with_tree(right_T2, min_peaks=self.config.comparison_min_peaks)

    both_alignment = align(
        left_A1, right_A2, self.config.comparison_rt_modulation, self.config.comparison_gap_penalty)
    # print(score_matrix(left_A1, right_A2, Dw))

    rt_alignment = get_peak_alignment(both_alignment)

    if not rt_alignment.empty:
        rt_alignment[self.left_sample] = rt_alignment.apply(
            df_mean, axis=1, args=([prefix for prefix in self.left_prefixList],))
        rt_alignment[self.right_sample] = rt_alignment.apply(
            df_mean, axis=1, args=([prefix for prefix in self.right_prefixList],))

        ms_alignment = get_ms_alignment(both_alignment)

        left_aligned_peaks = find_aligned_peaks(
            self.left_sample, self.left_prefixList, left_peak_data, rt_alignment, ms_alignment)
        right_aligned_peaks = find_aligned_peaks(
            self.right_sample, self.right_prefixList, right_peak_data, rt_alignment, ms_alignment)

        # # print(f"{left_sample} Peaks")
        # for index, aligned_peak in enumerate(rt_alignment[self.left_sample]):
        #     found_peak = False
        #     for peak in left_peak_data:
        #         # print(aligned_peak, peak["average_rt"])
        #         if peak["average_rt"] == aligned_peak:
        #             # print(peak)
        #             # for key in peak:
        #             #     print(f"{key}: {peak[key]}")
        #             peak["ms_list"] = [ms_alignment.iloc[index][prefix] for prefix in self.left_prefixList]
        #             left_aligned_peaks.append(peak)
        #             found_peak = True
        #             break
        #     if not found_peak:
        #         left_aligned_peaks.append(None)
        #
        # # print(f"{right_sample} Peaks")
        # for index, aligned_peak in enumerate(rt_alignment[self.right_sample]):
        #     found_peak = False
        #     for peak in right_peak_data:
        #         if peak["average_rt"] == aligned_peak:
        #             # print(peak)
        #             # for key in peak:
        #             #     print(f"{key}: {peak[key]}")
        #             peak["ms_list"] = [ms_alignment.iloc[index][prefix] for prefix in self.right_prefixList]
        #             right_aligned_peaks.append(peak)
        #             found_peak = True
        #             break
        #     if not found_peak:
        #         right_aligned_peaks.append(None)

        aligned_non_matching_peaks = []

        for left_peak, right_peak in zip(left_aligned_peaks, right_aligned_peaks):
            if not any([left_peak is None, right_peak is None]):
                # print(f"{left_peak['average_rt']} {right_peak['average_rt']}")
                if f"{left_peak['hits'][0]['CAS']}" == f"{right_peak['hits'][0]['CAS']}":
                    # The top hit for each project is the same
                    # print(f"{left_peak['hits'][0]['Name']} {right_peak['hits'][0]['Name']}")
                    left_hit_number, right_hit_number = 0, 0
                else:
                    # Check if there is a match in the other four hits
                    left_hit_dict = {}
                    right_hit_dict = {}

                    for hit_num in range(0, 5):
                        left_hit_dict[f"{left_peak['hits'][hit_num]['CAS']}"] = hit_num
                        right_hit_dict[f"{right_peak['hits'][hit_num]['CAS']}"] = hit_num

                    left_hit_set = set(left_hit_dict)
                    right_hit_set = set(right_hit_dict)

                    results_list = []

                    for CAS in left_hit_set.intersection(right_hit_set):
                        # CAS Number and Hit Numbers of hits in common
                        left_hit_num = left_hit_dict[CAS]
                        right_hit_num = right_hit_dict[CAS]
                        left_hit_mf = left_peak["hits"][left_hit_num]["average_MF"]
                        right_hit_mf = right_peak["hits"][right_hit_num]["average_MF"]

                        results_list.append([
                            CAS,
                            left_hit_num,
                            right_hit_num,
                            numpy.mean([left_hit_num, right_hit_num]),
                            numpy.mean([left_hit_mf, right_hit_mf]),
                            ])

                    # Sort by mean hit number (best-ranked hits first), then mean match factor
                    results_list = sorted(results_list, key=operator.itemgetter(3, 4))
                    # print(results_list[0])
                    # print(results_list)

                    if not results_list:
                        aligned_non_matching_peaks.append((left_peak, right_peak))
                        continue

                    left_hit_number, right_hit_number = results_list[0][1:3]

                # print(f"{left_peak['hits'][left_hit_number]['Name']} {right_peak['hits'][right_hit_number]['Name']}")

                # Write output data
                name = left_peak['hits'][left_hit_number]['Name']
                CAS = left_peak['hits'][left_hit_number]['CAS']

                mf_mean, mf_stdev = ms_comparisons(left_peak["ms_list"], right_peak["ms_list"])

                write_peak(
                    outputCSV, name, CAS,
                    *get_peak_rt_stats(left_peak), *get_peak_area_stats(left_peak),
                    *get_peak_rt_stats(right_peak), *get_peak_area_stats(right_peak),
                    mf_mean, mf_stdev, a_value)

        # print('The following peaks were aligned by retention time but none of the "hits" matched:')
        for peak_pair in aligned_non_matching_peaks:
            # br()
            # print(f"Retention time: {peak_pair[0]['average_rt']}")
            # print(f"Left Peak: {pformat(peak_pair[0])}")
            left_peak = peak_pair[0]
            # print(f"Right Peak: {pformat(peak_pair[1])}")
            right_peak = peak_pair[1]

            # Write output data
            write_output_data(left_peak, outputCSV)
            write_output_data(right_peak, outputCSV)

        # print("Peaks in the left sample only:")
        for peak in left_peak_data:
            if peak not in left_aligned_peaks:
                # Write output data
                name = peak['hits'][0]['Name']
                CAS = peak['hits'][0]['CAS']

                left_rt_mean, left_rt_stdev, left_rt_n = get_peak_rt_stats(peak)
                left_area_mean, left_area_stdev, left_area_n = get_peak_area_stats(peak)

                write_peak(
                    outputCSV, name, CAS,
                    left_rt_mean=left_rt_mean, left_rt_stdev=left_rt_stdev,
                    left_area_mean=left_area_mean, left_area_stdev=left_area_stdev,
                    a_value=a_value)

                # outputCSV.write(f"{name};{CAS};;;")  # Name;CAS Number;;;
                #
                # outputCSV.write(
                #     f"{left_rt_mean};{left_rt_stdev};{left_rt_stdev / left_rt_mean};;")  # Mean RT left;STDEV RT left;%RSD RT left;;
                # outputCSV.write(
                #     f"{left_area_mean};{left_area_stdev};{left_area_stdev / left_area_mean};;")  # Mean Area left;STDEV Area left;%RSD Area left;;
                #
                # outputCSV.write(";;;;;;;;;;;;;;;;;;;;;;;;")
                # outputCSV.write("\n")

        # print("Peaks in the right sample only:")
        for peak in right_peak_data:
            if peak not in right_aligned_peaks:
                # Write output data
                name = peak['hits'][0]['Name']
                CAS = peak['hits'][0]['CAS']

                right_rt_mean, right_rt_stdev, right_rt_n = get_peak_rt_stats(peak)
                right_area_mean, right_area_stdev, right_area_n = get_peak_area_stats(peak)

                write_peak(
                    outputCSV, name, CAS,
                    right_rt_mean=right_rt_mean, right_rt_stdev=right_rt_stdev,
                    right_area_mean=right_area_mean, right_area_stdev=right_area_stdev,
                    a_value=a_value)

                # outputCSV.write(f"{name};{CAS};;;;;;;;;;;")  # Name;CAS Number
                #
                # outputCSV.write(f"{right_rt_mean};{right_rt_stdev};{right_rt_stdev / right_rt_mean};;")  # Mean RT right;STDEV RT right;%RSD RT right;;
                # outputCSV.write(f"{right_area_mean};{right_area_stdev};{right_area_stdev / right_area_mean};;")  # Mean Area right;STDEV Area right;%RSD Area right;;
                #
                # outputCSV.write(";;;;;;;;;;;;;;;;")
                # outputCSV.write("\n")

    else:
        print("No peaks were found in common")
        # TODO: peaks that only appear in left or right sample

    outputCSV.close()

    # Brief pause before converting the CSV to xlsx
    time.sleep(3)

    append_to_xlsx(
        output_filename + ".CSV", output_filename + ".xlsx", "Comparison",
        overwrite=True, separator=";", toFloats=True)

    comparison_format(output_filename + ".xlsx")
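# For reference, a minimal sketch of how the "Welch's t-test Peak Area" columns
# written by peak_comparison() could be computed. "left_areas" and "right_areas"
# are hypothetical lists of per-replicate peak areas, and scipy is assumed to be
# available; this is an illustration under those assumptions, not the project's
# own implementation.

from scipy import stats


def welch_ttest_result(left_areas, right_areas, a_value=0.05):
    # Welch's t-test does not assume equal variances between the two samples
    t_stat, p_val = stats.ttest_ind(left_areas, right_areas, equal_var=False)
    # The result is significant when the p-value falls below the α threshold
    return t_stat, p_val, p_val < a_value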
"""proc.py """ # This file has been replaced by jupyter/DPA.ipynb import os from pyms.Experiment import load_expr from pyms.DPA.PairwiseAlignment import PairwiseAlignment, align_with_tree from pyms.DPA.Alignment import exprl2alignment # define the input experiments list exprA_codes = ["a0806_077", "a0806_078", "a0806_079"] # within replicates alignment parameters Dw = 2.5 # rt modulation [s] Gw = 0.30 # gap penalty # do the alignment print('Aligning expt A') expr_list = [] expr_dir = "../61a/output/" for expr_code in exprA_codes: file_name = os.path.join(expr_dir, expr_code + ".expr") expr = load_expr(file_name) expr_list.append(expr) F1 = exprl2alignment(expr_list) T1 = PairwiseAlignment(F1, Dw, Gw) A1 = align_with_tree(T1, min_peaks=2) A1.write_csv('output/rt.csv', 'output/area.csv')
def align(self):
    """
    Perform Peak Alignment on the selected experiments
    """

    if self.alignment_performed:
        raise ValueError(f"Alignment has already been performed.\n{self.alignment_audit_record}")

    # Imports
    from pyms.DPA.Alignment import exprl2alignment
    from pyms.DPA.PairwiseAlignment import align_with_tree, PairwiseAlignment
    from pyms.json import PyMassSpecEncoder

    # Perform dynamic peak alignment
    print("\nAligning\n")

    # Read the experiment.expr file from each experiment into a list
    pyms_expr_list = []

    for experiment in self.experiment_objects:
        pyms_expr_list.append(experiment.experiment_data)

    F1 = exprl2alignment(pyms_expr_list)
    T1 = PairwiseAlignment(
        F1, self.method_data.alignment_rt_modulation, self.method_data.alignment_gap_penalty)
    A1 = align_with_tree(T1, min_peaks=self.method_data.alignment_min_peaks)

    # Save alignment to file and then add to tarfile
    with tempfile.TemporaryDirectory() as tmp:
        A1.write_csv(os.path.join(tmp, 'alignment_rt.csv'), os.path.join(tmp, 'alignment_area.csv'))
        self.add_to_archive(os.path.join(tmp, 'alignment_rt.csv'), arcname="alignment_rt.csv")
        self.add_to_archive(os.path.join(tmp, 'alignment_area.csv'), arcname="alignment_area.csv")

        self.rt_alignment = A1.get_peak_alignment(require_all_expr=False)
        self.rt_alignment.to_json(os.path.join(tmp, 'alignment_rt.json'))
        self.add_to_archive(os.path.join(tmp, 'alignment_rt.json'), arcname="alignment_rt.json")

        self.ms_alignment = A1.get_ms_alignment(require_all_expr=False)
        # self.ms_alignment.to_json(os.path.join(tmp, 'alignment_ms.json'))
        with open(os.path.join(tmp, 'alignment_ms.json'), "w") as fp:
            json.dump(self.ms_alignment.to_dict(), fp, cls=PyMassSpecEncoder)
        self.add_to_archive(os.path.join(tmp, 'alignment_ms.json'), arcname="alignment_ms.json")

        self.area_alignment = A1.get_area_alignment(require_all_expr=False)
        self.area_alignment.to_json(os.path.join(tmp, 'alignment_area.json'))
        self.add_to_archive(os.path.join(tmp, 'alignment_area.json'), arcname="alignment_area.json")

    self.alignment_performed = True
    self.alignment_audit_record = watchdog.AuditRecord()
    self.date_modified.value = datetime.datetime.now().timestamp()
    self.store()
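# A minimal sketch of loading one of the archived alignment tables back into a
# DataFrame for later inspection. It assumes "alignment_rt.json" has been
# extracted from the archive into the current directory; pandas.read_json()
# mirrors the DataFrame.to_json() call used in align() above.

import pandas

rt_alignment = pandas.read_json("alignment_rt.json")
print(rt_alignment.head())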
def run(self):
    # Indicate which steps to perform
    print(f"do_quantitative: {self.config.do_quantitative}")
    print(f"do_qualitative: {self.config.do_qualitative}")
    print(f"do_merge: {self.config.do_merge}")
    print(f"do_counter: {self.config.do_counter}")
    print(f"do_spectra: {self.config.do_spectra}")
    print(f"do_charts: {self.config.do_charts}")

    if self.config.do_quantitative:
        print("Quantitative Processing in Progress...")

        # Number of workers for performing Quantitative Processing in parallel.
        # If 0, processing will be performed sequentially.
        n_quant_workers = self.PL_len

        if n_quant_workers:
            # Perform Quantitative Processing in parallel
            with Pool(n_quant_workers) as p:
                p.map(self.quantitative_processing, [
                    os.path.join(self.config.raw_dir, "{}.JDX".format(prefix))
                    for prefix in self.config.prefixList])

        for prefix in self.config.prefixList:
            if not n_quant_workers:
                # Perform Quantitative Processing sequentially
                self.quantitative_processing(
                    os.path.join(self.config.raw_dir, "{}.JDX".format(prefix)), False)

            # Read the log file and print the contents
            with open(os.path.join(self.config.log_dir, prefix + ".log"), "r") as f:
                print(f.read())

    # Load the experiment files created during Quantitative Processing
    for prefix in self.config.prefixList:
        file_name = os.path.join(self.config.expr_dir, prefix + ".expr")
        self.expr_list.append(load_expr(file_name))

    if any([self.config.do_qualitative, self.config.do_merge, self.config.do_spectra, self.config.do_counter]):
        # Perform dynamic peak alignment
        print("\nAligning\n")
        print(self.expr_list)
        F1 = exprl2alignment(self.expr_list)
        print(F1)
        T1 = PairwiseAlignment(F1, self.config.rt_modulation, self.config.gap_penalty)
        A1 = align_with_tree(T1, min_peaks=self.config.min_peaks)

        A1.write_csv(
            os.path.join(self.config.expr_dir, '{}_rt.csv'.format(self.lot_name)),
            os.path.join(self.config.expr_dir, '{}_area.csv'.format(self.lot_name)))

        rt_alignment = get_peak_alignment(A1)
        ms_alignment = get_ms_alignment(A1)
        # print(rt_alignment)
        # print(ms_alignment)

    if self.config.do_qualitative:
        print("Qualitative Processing in Progress...")
        for prefix in self.config.prefixList:
            # print(list(rt_alignment[prefix]))
            self.qualitative_processing(prefix, list(rt_alignment[prefix]))

    if self.config.do_merge:
        self.merge()

    if self.config.do_counter:
        chart_data = self.match_counter(self.ms_comparisons(ms_alignment))
        chart_data = chart_data.set_index("Compound", drop=True)

        # remove duplicate compounds:
        # chart_data_count = Counter(chart_data["Compound"])
        chart_data_count = Counter(chart_data.index)

        replacement_data = {
            "Compound": [],
            f"{self.lot_name} Peak Area": [],
            f"{self.lot_name} Standard Deviation": [],
            }

        for prefix in self.config.prefixList:
            replacement_data[prefix] = []

        for compound in chart_data_count:
            if chart_data_count[compound] > 1:
                replacement_data["Compound"].append(compound)
                replacement_data[f"{self.lot_name} Peak Area"].append(
                    sum(chart_data.loc[compound, f"{self.lot_name} Peak Area"]))

                peak_data = []
                for prefix in self.config.prefixList:
                    replacement_data[prefix].append(sum(chart_data.loc[compound, prefix]))
                    peak_data.append(sum(chart_data.loc[compound, prefix]))

                replacement_data[f"{self.lot_name} Standard Deviation"].append(numpy.std(peak_data))

                chart_data = chart_data.drop(compound, axis=0)

        replacement_data = pandas.DataFrame(replacement_data)
        replacement_data = replacement_data.set_index("Compound", drop=False)
        chart_data = chart_data.append(replacement_data, sort=False)
        chart_data.sort_index(inplace=True)
        chart_data = chart_data.drop("Compound", axis=1)
        chart_data['Compound Names'] = chart_data.index
        chart_data.to_csv(
            os.path.join(self.config.csv_dir, "{}_CHART_DATA.csv".format(self.lot_name)), sep=";")
    else:
        chart_data = pandas.read_csv(
            os.path.join(self.config.csv_dir, "{}_CHART_DATA.csv".format(self.lot_name)),
            sep=";", index_col=0)
        # chart_data = chart_data.set_index("Compound", drop=True)

    if self.config.do_spectra:
        self.generate_spectra_from_alignment(rt_alignment, ms_alignment)

        def generate_spectra_csv(rt_data, ms_data, name):
            # Write Mass Spectra to OpenChrom-like CSV files
            ms = ms_data[0]  # first mass spectrum

            spectrum_csv_file = os.path.join(self.config.spectra_dir, self.lot_name, f"{name}_data.csv")
            spectrum_csv = open(spectrum_csv_file, 'w')
            spectrum_csv.write('RT(milliseconds);RT(minutes) - NOT USED BY IMPORT;RI;')
            spectrum_csv.write(';'.join(str(mz) for mz in ms.mass_list))
            spectrum_csv.write("\n")

            for rt, ms in zip(rt_data, ms_data):
                spectrum_csv.write(f"{int(rt * 60000)};{rounders(rt, '0.0000000000')};0;")
                spectrum_csv.write(';'.join(str(intensity) for intensity in ms.mass_spec))
                spectrum_csv.write('\n')

            spectrum_csv.close()

        for prefix in self.config.prefixList:
            print(prefix)
            # print(rt_alignment[prefix])
            # print(ms_alignment[prefix])
            generate_spectra_csv(rt_alignment[prefix], ms_alignment[prefix], prefix)

    if self.config.do_charts:
        print("\nGenerating Charts")

        chart_data.to_csv(
            os.path.join(self.config.csv_dir, "{}_CHART_DATA.csv".format(self.lot_name)), sep=";")

        maybe_make(os.path.join(self.config.charts_dir, self.lot_name))

        if chart_data.empty:
            print("ALERT: No peaks were found for compounds that have")
            print("       previously been reported in literature.")
            print("       Check the results for more information\n")
        else:
            from GSMatch.GSMatch_Core.charts import (
                box_whisker_wrapper, radar_chart_wrapper, mean_peak_area_wrapper, peak_area_wrapper)
            # from GSMatch.GSMatch_Core.charts import peak_area_wrapper, radar_chart_wrapper

            radar_chart_wrapper(
                chart_data, [self.lot_name], use_log=10, legend=False,
                mode=os.path.join(self.config.charts_dir, self.lot_name, "radar_log10_peak_area"))
            radar_chart_wrapper(
                chart_data, [self.lot_name], use_log=False, legend=False,
                mode=os.path.join(self.config.charts_dir, self.lot_name, "radar_peak_area"))
            mean_peak_area_wrapper(
                chart_data, [self.lot_name],
                mode=os.path.join(self.config.charts_dir, self.lot_name, "mean_peak_area"))
            peak_area_wrapper(
                chart_data, self.lot_name, self.config.prefixList,
                mode=os.path.join(self.config.charts_dir, self.lot_name, "peak_area_percentage"))
            peak_area_wrapper(
                chart_data, self.lot_name, self.config.prefixList, percentage=False,
                mode=os.path.join(self.config.charts_dir, self.lot_name, "peak_area"))
            peak_area_wrapper(
                chart_data, self.lot_name, self.config.prefixList, use_log=10,
                mode=os.path.join(self.config.charts_dir, self.lot_name, "log10_peak_area_percentage"))

            samples_to_compare = [(self.lot_name, self.config.prefixList)]

            box_whisker_wrapper(
                chart_data, samples_to_compare,
                mode=os.path.join(self.config.charts_dir, self.lot_name, "box_whisker"))

    with open(os.path.join(self.config.results_dir, f"{self.lot_name}.info"), "w") as info_file:
        for prefix in self.config.prefixList:
            info_file.write(f"{prefix}\n")

    # TODO: self.make_archive()

    pynist.reload_ini(self.config.nist_path)

    print("\nComplete.")
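# For reference, the OpenChrom-like CSV written by generate_spectra_csv() above
# contains one header row (the retention time columns, then the mass list)
# followed by one row per aligned scan. The values below are purely illustrative:
#
#   RT(milliseconds);RT(minutes) - NOT USED BY IMPORT;RI;50;51;52
#   390000;6.5000000000;0;103;0;27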