def convert_quantification(input_integrals_filename, workflow_parameters, output_filename):
    """Transpose an integrals table into a per-molecule feature table.

    The integrals CSV is read with its first line as the header and the
    next three lines (0-indexed lines 1-3) skipped. Every remaining row is
    one sample, named by the ``"No:"`` column; every other column is one
    molecule. The output CSV has one row per molecule with the columns
    ``row ID``, ``row m/z`` (always the string ``"0"``), ``row retention
    time`` and one ``"<sample> Peak area"`` column per sample.

    Args:
        input_integrals_filename: Path of the integrals CSV to convert.
        workflow_parameters: Path of the workflow parameter XML, used only
            to obtain the mangled-file mapping.
        output_filename: Path of the CSV to write.

    Raises:
        ValueError: If the integrals table has no ``"No:"`` column.
        KeyError: If a molecule is missing from the retention-time mapping.
    """
    params_obj = proteosafe.parse_xml_file(workflow_parameters)
    mangled_mapping = proteosafe.get_mangled_file_mapping(params_obj)

    # Map "basename without extension" -> "basename" so that sample names
    # found in the table can be given their file extension back.
    filename_mapping = {}
    for key in mangled_mapping:
        filename = os.path.basename(mangled_mapping[key])
        filename_mapping[os.path.splitext(filename)[0]] = filename

    integrals_df = pd.read_csv(input_integrals_filename, skiprows=[1, 2, 3])
    feature_to_rt_mapping = load_feature_to_rt_mapping(input_integrals_filename)

    all_molecules = list(integrals_df.keys())
    all_molecules.remove("No:")

    # The per-sample records do not depend on the molecule, so build them
    # once instead of re-converting the whole frame for every molecule.
    sample_records = integrals_df.to_dict(orient="records")

    output_list = []
    for molecule in all_molecules:
        output_dict = {
            "row ID": molecule,
            "row m/z": "0",
            "row retention time": feature_to_rt_mapping[molecule],
        }

        for record in sample_records:
            sample_name = record["No:"]
            # Unknown sample names are kept verbatim.
            sample_name = filename_mapping.get(sample_name, sample_name)
            output_dict[sample_name + " Peak area"] = record[molecule]

        output_list.append(output_dict)

    pd.DataFrame(output_list).to_csv(output_filename, sep=",", index=False)
def main():
    """Run the gc processing scripts in sequence and post-process the results.

    Steps, in order:

    1. Parse the command line and the workflow parameter XML.
    2. Build and run the external commands (import, intra-alignment, noise
       filter, inter-alignment, peak detection, export, report, tar). Each
       command is printed and then executed with ``subprocess.call``; return
       codes are not checked.
    3. Rewrite ``data_integrals.csv`` in the scratch folder so that leading
       ``spec...`` sample names are replaced by the names derived from the
       mangled-file mapping.
    4. Produce the clustered MGF, cluster info and cluster summary outputs
       through the helper functions defined elsewhere in this file.
    """
    parser = argparse.ArgumentParser(
        description='Processing and feature detecting all gc files')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('spectrum_folder', help='spectrum_folder')
    parser.add_argument('scratch_folder', help='scratch_folder')
    # These three help strings used to be copy-pasted as 'scratch_folder'.
    parser.add_argument('clustered_mgf', help='clustered_mgf')
    parser.add_argument('clusterinfo', help='clusterinfo')
    parser.add_argument('clustersummary', help='clustersummary')
    parser.add_argument('summary_output', help='summary_output')
    parser.add_argument('python_runtime', help='python_runtime')
    parser.add_argument('-import_script',
                        help='import_script',
                        default="./proc/io/importmsdata.py")
    parser.add_argument('-align_script',
                        help='align_script',
                        default="./proc/preproc/intrapalign.py")
    parser.add_argument('-noise_script',
                        help='noise_script',
                        default="./proc/preproc/noisefilter.py")
    parser.add_argument('-interalign_script',
                        help='interalign_script',
                        default="./proc/preproc/interpalign.py")
    parser.add_argument('-peakdetect_script',
                        help='peakdetect_script',
                        default="./proc/preproc/peakdetect.py")
    parser.add_argument('-export_script',
                        help='export_script',
                        default="./proc/io/export.py")
    parser.add_argument('-report_script',
                        help='report_script',
                        default="./proc/io/report.py")
    parser.add_argument('-vistic_script',
                        help='vistic_script',
                        default="./proc/vis/vistic.py")

    args = parser.parse_args()

    workflow_params = proteosafe.parse_xml_file(args.proteosafe_parameters)
    mangled_mapping = proteosafe.get_mangled_file_mapping(workflow_params)
    file_type_of_import = determine_filetype_of_import(args.spectrum_folder)

    hdf5_filename = os.path.join(args.scratch_folder, "data.h5")

    # The embedded single quotes in the option values below are passed to the
    # scripts literally (no shell is involved); they are kept unchanged.
    if workflow_params['TIME_UNIT'][0] == "MIN":
        time_units = "'min'"
    else:
        time_units = "'sec'"

    cmds = [
        # import data
        [
            args.python_runtime, args.import_script, "-f",
            file_type_of_import, args.spectrum_folder, hdf5_filename,
            "--timeunits", time_units
        ],
        # intra align
        [
            args.python_runtime, args.align_script, hdf5_filename,
            "--h5writepath", "'sp2D'"
        ],
        # noisefilter
        [
            args.python_runtime, args.noise_script, hdf5_filename,
            "--h5readpath", "'sp2D'", "--h5writepath", "'spproc2D'",
            "--window", "6", "--frame", "50"
        ],
        # inter align
        [
            args.python_runtime, args.interalign_script, hdf5_filename,
            "--h5readpath", "'spproc2D'", "--h5writepath", "'spal2D'"
        ],
        # peak detect (according to ipython)
        [
            args.python_runtime, args.peakdetect_script, hdf5_filename,
            "--h5readpath", "'spal2D'", "--individual", "no",
            "--frag_pattern", "deconvolution"
        ],
        # output detection
        [
            args.python_runtime, args.export_script, hdf5_filename,
            args.scratch_folder
        ],
        # report
        [
            args.python_runtime, args.report_script, "--output_prefix",
            "gnps-gc", hdf5_filename, "summary_temp"
        ],
        # archive the report
        [
            "tar", "-cvf",
            os.path.join(args.summary_output, "summary.tar"),
            os.path.join("summary_temp", "gnps-gc")
        ],
    ]

    # NOTE(review): return codes are ignored, so a failing step does not stop
    # the pipeline - confirm whether that is intended.
    for cmd in cmds:
        print(" ".join(cmd))
        subprocess.call(cmd)

    # Parse files
    output_peak_txt_filename = os.path.join(args.scratch_folder,
                                            "data_ms_peaks.txt")
    output_quant_filename = os.path.join(args.scratch_folder,
                                         "data_integrals.csv")

    # Mapping the input spec names: mangled name up to its first '.' ->
    # last path component of the original name up to its first '.'.
    mangled_mapping_filename = {}
    for key, value in mangled_mapping.items():
        mangled_mapping_filename[key.split('.')[0]] = value.split(
            '.')[0].split("/")[-1]

    with open(output_quant_filename, 'r') as quant_file:
        quant_lines = quant_file.readlines()

    quant_in_memory = []
    for line in quant_lines:
        fname = line.split(',')[0]
        if fname.startswith("spec") and fname in mangled_mapping_filename:
            print("before:")
            print(line)
            # fname is the leading field, so limiting the replacement to one
            # occurrence renames it without touching the rest of the row.
            line = line.replace(fname, mangled_mapping_filename[fname], 1)
            print("after:")
            print(line)
        quant_in_memory.append(line)

    with open(output_quant_filename, 'w') as rewrite_quant:
        rewrite_quant.write("".join(quant_in_memory))

    parse_peaks_for_output(output_peak_txt_filename, args.clustered_mgf)
    simple_presence_of_merged_spectra_processing(output_quant_filename,
                                                 args.clusterinfo,
                                                 mangled_mapping)
    generate_clustersummary(output_quant_filename, args.clustersummary)
def process_candidate_molecules(candidate_molecules, path_to_spectrum_files, proteosafe_param):
    """Select, for every candidate molecule, the best matching spectrum.

    Candidates are grouped by their ``"filename"``. For each file the spectra
    are loaded, and for each candidate the non-MS1 spectra whose ``mz`` lies
    within the candidate's ``"ppm_threshold"`` of its ``"monoisotopic_mass"``
    are compared by intensity. When a spectrum is found and its intensity
    exceeds the candidate's ``"min_precursor_int"``, one entry is appended to
    every column of the result.

    Args:
        candidate_molecules: Iterable of dicts describing the candidates.
        path_to_spectrum_files: Folder containing the spectrum files.
        proteosafe_param: Path of the workflow parameter XML, or a falsy
            value. When given, file names are translated through the
            mangled-file mapping before loading and for display.

    Returns:
        A ``defaultdict(list)`` mapping column name to the list of values,
        one value per accepted candidate. Empty when nothing matched.

    Raises:
        KeyError: If ``proteosafe_param`` is given and a candidate file name
            is not present in the mangled-file mapping.
    """
    # Grouping by filename
    structures_by_filename = defaultdict(list)
    for candidate_object in candidate_molecules:
        structures_by_filename[candidate_object["filename"]].append(candidate_object)

    output_dict = defaultdict(list)

    # Demangle: original file name (last path component) -> mangled name
    if proteosafe_param:
        workflow_params = proteosafe.parse_xml_file(proteosafe_param)
        mangled_mapping = proteosafe.get_mangled_file_mapping(workflow_params)
        reversed_mapping = {
            value.split("/")[-1]: key
            for key, value in mangled_mapping.items()
        }

    for filename, structures in structures_by_filename.items():
        path_to_spectrum_file = os.path.join(path_to_spectrum_files, filename)
        displaying_filename = filename

        # If the parameter file exists this is a proteosafe workflow, so the
        # file on disk carries the mangled name.
        if proteosafe_param:
            path_to_spectrum_file = os.path.join(path_to_spectrum_files, reversed_mapping[filename])

            # Try to resolve the full original path for display.
            for key in mangled_mapping:
                if displaying_filename in mangled_mapping[key]:
                    displaying_filename = mangled_mapping[key]
                    break

        # Loading is best-effort: an unreadable file yields no spectra.
        # KeyboardInterrupt is not an Exception subclass and still propagates.
        try:
            spectrum_list = ming_spectrum_library.load_mzxml_file(path_to_spectrum_file)
        except Exception as e:
            print(e)
            print("Could not load", path_to_spectrum_file)
            spectrum_list = []

        # structure_object is a candidate from the input, spectrum is an
        # input feature.
        for structure_object in structures:
            highest_intensity = -1000
            best_spectrum = None
            # Thresholds belong to the candidate being evaluated. They were
            # previously read from the leftover variable of the grouping
            # loop, i.e. always from the last candidate in the input.
            ppm_threshold = structure_object["ppm_threshold"]
            monoisotopic_mass = structure_object["monoisotopic_mass"]

            for spectrum in spectrum_list:
                if spectrum.ms_level == 1:
                    continue

                mz_delta = abs(spectrum.mz - monoisotopic_mass)
                ppm_delta = (mz_delta / monoisotopic_mass) * 1000000
                if ppm_delta > ppm_threshold:
                    continue

                # NOTE(review): the comparison uses the total spectrum
                # intensity while the stored value is the max of that and
                # totIonCurrent - confirm this asymmetry is intended.
                if spectrum.get_total_spectrum_intensity() > highest_intensity:
                    best_spectrum = spectrum
                    highest_intensity = max(spectrum.totIonCurrent, spectrum.get_total_spectrum_intensity())

            if best_spectrum is not None and highest_intensity > structure_object["min_precursor_int"]:
                output_dict["FILENAME"].append(displaying_filename)
                output_dict["SEQ"].append("*.*")
                output_dict["COMPOUND_NAME"].append(structure_object["name"])
                output_dict["MOLECULEMASS"].append(structure_object["monoisotopic_mass"])
                output_dict["INSTRUMENT"].append(structure_object["instrument"])
                output_dict["IONSOURCE"].append(structure_object["ionsource"])
                output_dict["EXTRACTSCAN"].append(best_spectrum.scan)
                output_dict["SMILES"].append(structure_object["smiles"])
                output_dict["INCHI"].append(structure_object["inchi"])
                output_dict["INCHIAUX"].append("N/A")
                output_dict["CHARGE"].append(structure_object["charge"])
                output_dict["IONMODE"].append(structure_object["ionmode"])
                output_dict["PUBMED"].append(structure_object["pubmed"])
                output_dict["ACQUISITION"].append(structure_object["acquisition"])
                output_dict["EXACTMASS"].append(structure_object["exact_mass"])
                output_dict["DATACOLLECTOR"].append(structure_object["datacollector"])
                output_dict["ADDUCT"].append(structure_object["adduct"])
                output_dict["INTEREST"].append("N/A")
                output_dict["LIBQUALITY"].append("1")
                output_dict["GENUS"].append("N/A")
                output_dict["SPECIES"].append("N/A")
                output_dict["STRAIN"].append("N/A")
                output_dict["CASNUMBER"].append(structure_object["casnumber"])
                output_dict["PI"].append(structure_object["pi"])

    return output_dict
def main():
    """Reformat a quantification table and prepare the matching MGF file.

    The tool name given on the command line selects the formatter. For
    MZMINE2, OPENMS, OPTIMUS, MSDIAL, METABOSCAPE and XCMS3 the single input
    MGF is copied to the output path and the quantification table is
    converted. PROGENESIS converts the table first and then rewrites the MGF
    using the returned mapping. MZTABM converts the table and builds the MGF
    from all input files. Any other tool name does nothing.
    """
    cli = argparse.ArgumentParser(description='Create parallel parameters')
    for arg_name, arg_help in (
        ('toolname', 'name of input tool'),
        ('quantification_table', 'quantification_table'),
        ('quantification_table_reformatted', 'quantification_table_reformatted'),
        ('input_spectra_folder', 'input_spectra_folder'),
        ('output_mgf', 'output_mgf'),
        ('workflowParameters', 'workflowParameters'),
    ):
        cli.add_argument(arg_name, help=arg_help)
    opts = cli.parse_args()

    spectra_paths = glob.glob(os.path.join(opts.input_spectra_folder, "*"))
    tool = opts.toolname

    def single_mgf():
        # These tools work on exactly one MGF; anything else aborts.
        if len(spectra_paths) != 1:
            print("Must input exactly 1 spectrum mgf file")
            exit(1)
        return spectra_paths[0]

    # Formatter modules are resolved lazily so that only the module of the
    # selected tool is ever looked up.
    copy_then_convert = {
        "MZMINE2": lambda: mzmine2_formatter,
        "OPENMS": lambda: openms_formatter,
        "OPTIMUS": lambda: optimus_formatter,
        "MSDIAL": lambda: msdial_formatter,
        "METABOSCAPE": lambda: metaboscape_formatter,
        "XCMS3": lambda: xcms_formatter,
    }

    if tool in copy_then_convert:
        print(tool)
        source_mgf = single_mgf()
        shutil.copyfile(source_mgf, opts.output_mgf)
        copy_then_convert[tool]().convert_to_feature_csv(
            opts.quantification_table,
            opts.quantification_table_reformatted)
        return

    if tool == "PROGENESIS":
        print("PROGENESIS")
        source_mgf = single_mgf()
        scan_mapping = progenesis_formatter.convert_to_feature_csv(
            opts.quantification_table,
            opts.quantification_table_reformatted)
        progenesis_formatter.convert_mgf(source_mgf, opts.output_mgf,
                                         scan_mapping)
        return

    if tool == "MZTABM":
        print("MZTABM")
        parsed_params = proteosafe.parse_xml_file(opts.workflowParameters)
        mangled_mapping = proteosafe.get_mangled_file_mapping(parsed_params)

        # Original base name -> path of the mangled file in the input folder.
        name_mangle_mapping = {
            os.path.basename(demangled): os.path.join(
                opts.input_spectra_folder, mangled)
            for mangled, demangled in mangled_mapping.items()
        }

        filename_mapping = mztabm_formatter.convert_to_feature_csv(
            opts.quantification_table,
            opts.quantification_table_reformatted)
        mztabm_formatter.create_mgf(spectra_paths,
                                    opts.output_mgf,
                                    filename_mapping,
                                    name_mangle_mapping=name_mangle_mapping)
def _require_single_mgf(input_filenames):
    """Return the only input file, or exit with status 1 if there is not exactly one."""
    import sys

    if len(input_filenames) != 1:
        print("Must input exactly 1 spectrum mgf file")
        sys.exit(1)
    return input_filenames[0]


def _renormalize_peak_areas(quant_table_path):
    """Rescale every "Peak area" column of the CSV in place so it sums to 1,000,000."""
    import pandas as pd

    quant_df = pd.read_csv(quant_table_path, sep=",")
    # Drop the unnamed columns produced by trailing separators.
    quant_df = quant_df.loc[:, ~quant_df.columns.str.contains('^Unnamed')]

    for column in quant_df:
        if "Peak area" in column:
            quant_df[column] = quant_df[column] / sum(quant_df[column]) * 1000000

    quant_df.to_csv(quant_table_path, sep=",", index=False)


def main():
    """Reformat a quantification table, prepare the MGF and optionally renormalize.

    The tool name given on the command line selects the formatter. For
    MZMINE2, OPENMS, OPTIMUS, MSDIAL, METABOSCAPE and XCMS3 the single input
    MGF is copied to the output path and the quantification table is
    converted. PROGENESIS converts the table first and then rewrites the MGF
    using the returned mapping. MZTABM converts the table and builds the MGF
    from all input files. Any other tool name performs no conversion.

    When ``--QUANT_FILE_NORM`` is ``"RowSum"``, every "Peak area" column of
    the reformatted table is afterwards divided by its sum and multiplied by
    1,000,000. That step is best-effort: a failure is reported on stdout and
    does not abort the program.
    """
    parser = argparse.ArgumentParser(description='Create parallel parameters')
    parser.add_argument('toolname', help='name of input tool')
    parser.add_argument('quantification_table', help='quantification_table')
    parser.add_argument('quantification_table_reformatted', help='quantification_table_reformatted')
    parser.add_argument('input_spectra_folder', help='input_spectra_folder')
    parser.add_argument('output_mgf', help='output_mgf')
    parser.add_argument('workflowParameters', help='workflowParameters')
    parser.add_argument('--QUANT_FILE_NORM', default="None", help='QUANT_FILE_NORM')
    args = parser.parse_args()

    input_filenames = glob.glob(os.path.join(args.input_spectra_folder, "*"))

    # Tools whose MGF is copied verbatim before the table is converted.
    # Formatter modules are resolved lazily so that only the module of the
    # selected tool is ever looked up.
    copy_then_convert = {
        "MZMINE2": lambda: mzmine2_formatter,
        "OPENMS": lambda: openms_formatter,
        "OPTIMUS": lambda: optimus_formatter,
        "MSDIAL": lambda: msdial_formatter,
        "METABOSCAPE": lambda: metaboscape_formatter,
        "XCMS3": lambda: xcms_formatter,
    }

    if args.toolname in copy_then_convert:
        print(args.toolname)
        input_mgf = _require_single_mgf(input_filenames)
        shutil.copyfile(input_mgf, args.output_mgf)
        formatter = copy_then_convert[args.toolname]()
        formatter.convert_to_feature_csv(args.quantification_table, args.quantification_table_reformatted)
    elif args.toolname == "PROGENESIS":
        print("PROGENESIS")
        input_mgf = _require_single_mgf(input_filenames)
        compound_scan_mapping = progenesis_formatter.convert_to_feature_csv(args.quantification_table, args.quantification_table_reformatted)
        progenesis_formatter.convert_mgf(input_mgf, args.output_mgf, compound_scan_mapping)
    elif args.toolname == "MZTABM":
        print("MZTABM")
        workflow_parameters = proteosafe.parse_xml_file(args.workflowParameters)
        mangled_mapping = proteosafe.get_mangled_file_mapping(workflow_parameters)

        # Original base name -> path of the mangled file in the input folder.
        name_mangle_mapping = {}
        for key in mangled_mapping:
            demangled_name = mangled_mapping[key]
            name_mangle_mapping[os.path.basename(demangled_name)] = os.path.join(args.input_spectra_folder, key)

        compound_filename_mapping = mztabm_formatter.convert_to_feature_csv(args.quantification_table, args.quantification_table_reformatted)
        mztabm_formatter.create_mgf(input_filenames, args.output_mgf, compound_filename_mapping, name_mangle_mapping=name_mangle_mapping)

    # Finally, we can renormalize the output. This stays best-effort, but a
    # failure is now reported instead of being swallowed silently, and
    # SystemExit / KeyboardInterrupt are no longer intercepted.
    if args.QUANT_FILE_NORM == "RowSum":
        try:
            _renormalize_peak_areas(args.quantification_table_reformatted)
        except Exception as error:
            print("Could not renormalize", args.quantification_table_reformatted, error)