def convert_quantification(input_integrals_filename, workflow_parameters,
                           output_filename):
    """Reshape an integrals CSV into a one-row-per-feature quantification CSV.

    Every column of the integrals table other than ``"No:"`` is treated as
    one feature. For each feature a row is written containing ``row ID``
    (the column name), a constant ``row m/z`` of ``"0"``, a
    ``row retention time`` taken from ``load_feature_to_rt_mapping`` and one
    ``"<sample> Peak area"`` column per record of the integrals table.

    Args:
        input_integrals_filename: Path of the integrals CSV. Rows 1-3
            (0-based, counting the header as row 0) are skipped when the
            abundances are read.
        workflow_parameters: Path of the ProteoSAFe parameter XML used to
            recover the original input file names.
        output_filename: Path of the CSV to write.

    Raises:
        ValueError: If the integrals table has no ``"No:"`` column.
        KeyError: If a feature column has no entry in the retention-time
            mapping.
    """
    params_obj = proteosafe.parse_xml_file(workflow_parameters)
    mangled_mapping = proteosafe.get_mangled_file_mapping(params_obj)

    # Extension-less basename -> full basename of every mapped input file,
    # so sample names that lost their extension can be restored.
    filename_mapping = {}
    for key in mangled_mapping:
        basename = os.path.basename(mangled_mapping[key])
        filename_mapping[os.path.splitext(basename)[0]] = basename

    integrals_df = pd.read_csv(input_integrals_filename, skiprows=[1, 2, 3])

    feature_to_rt_mapping = load_feature_to_rt_mapping(
        input_integrals_filename)

    all_molecules = list(integrals_df.keys())
    all_molecules.remove("No:")

    # The records do not depend on the molecule, so build them once instead
    # of converting the whole frame again for every feature column.
    sample_records = integrals_df.to_dict(orient="records")

    output_list = []
    for molecule in all_molecules:
        output_dict = {
            "row ID": molecule,
            "row m/z": "0",
            "row retention time": feature_to_rt_mapping[molecule],
        }
        for record in sample_records:
            sample_name = record["No:"]
            # Unknown sample names are kept exactly as they appear.
            sample_name = filename_mapping.get(sample_name, sample_name)
            output_dict[sample_name + " Peak area"] = record[molecule]

        output_list.append(output_dict)

    pd.DataFrame(output_list).to_csv(output_filename, sep=",", index=False)
# Example no. 2
# 0
def main():
    """Run the GC feature-detection scripts and post-process their outputs.

    The function performs these steps in order:

    1. Parse the command line and the ProteoSAFe parameter file.
    2. Build one external command per stage (import, intra-alignment, noise
       filter, inter-alignment, peak detection, export, report) plus a
       ``tar`` of the report folder, and run them sequentially.
    3. In the exported integrals CSV, replace the leading field of every row
       whose value starts with ``"spec"`` and is a known mangled name by
       the corresponding original file stem.
    4. Pass the exported files to ``parse_peaks_for_output``,
       ``simple_presence_of_merged_spectra_processing`` and
       ``generate_clustersummary``.

    A failing external command does not abort the run; its exit status is
    printed and the remaining commands are still executed.
    """
    parser = argparse.ArgumentParser(
        description='Processing and feature detecting all gc files')

    # Each positional argument is described by its own name (three of them
    # previously carried a copy-pasted 'scratch_folder' help text).
    positional_names = (
        'proteosafe_parameters',
        'spectrum_folder',
        'scratch_folder',
        'clustered_mgf',
        'clusterinfo',
        'clustersummary',
        'summary_output',
        'python_runtime',
    )
    for positional_name in positional_names:
        parser.add_argument(positional_name, help=positional_name)

    # Optional overrides for the location of every processing script.
    script_defaults = (
        ('import_script', "./proc/io/importmsdata.py"),
        ('align_script', "./proc/preproc/intrapalign.py"),
        ('noise_script', "./proc/preproc/noisefilter.py"),
        ('interalign_script', "./proc/preproc/interpalign.py"),
        ('peakdetect_script', "./proc/preproc/peakdetect.py"),
        ('export_script', "./proc/io/export.py"),
        ('report_script', "./proc/io/report.py"),
        ('vistic_script', "./proc/vis/vistic.py"),
    )
    for script_name, default_path in script_defaults:
        parser.add_argument('-' + script_name,
                            help=script_name,
                            default=default_path)
    args = parser.parse_args()

    workflow_params = proteosafe.parse_xml_file(args.proteosafe_parameters)
    mangled_mapping = proteosafe.get_mangled_file_mapping(workflow_params)

    file_type_of_import = determine_filetype_of_import(args.spectrum_folder)

    hdf5_filename = os.path.join(args.scratch_folder, "data.h5")
    python_runtime = args.python_runtime

    # NOTE(review): the single quotes below are part of the argument values
    # because the commands run without a shell - confirm the scripts expect
    # them.
    if workflow_params['TIME_UNIT'][0] == "MIN":
        time_units = "'min'"
    else:
        time_units = "'sec'"

    cmds = [
        # import data
        [
            python_runtime, args.import_script, "-f", file_type_of_import,
            args.spectrum_folder, hdf5_filename, "--timeunits", time_units
        ],
        # intra align
        [
            python_runtime, args.align_script, hdf5_filename,
            "--h5writepath", "'sp2D'"
        ],
        # noise filter
        [
            python_runtime, args.noise_script, hdf5_filename,
            "--h5readpath", "'sp2D'", "--h5writepath", "'spproc2D'",
            "--window", "6", "--frame", "50"
        ],
        # inter align
        [
            python_runtime, args.interalign_script, hdf5_filename,
            "--h5readpath", "'spproc2D'", "--h5writepath", "'spal2D'"
        ],
        # peak detect
        [
            python_runtime, args.peakdetect_script, hdf5_filename,
            "--h5readpath", "'spal2D'", "--individual", "no",
            "--frag_pattern", "deconvolution"
        ],
        # export detection results into the scratch folder
        [
            python_runtime, args.export_script, hdf5_filename,
            args.scratch_folder
        ],
        # report, then archive the report folder
        [
            python_runtime, args.report_script, "--output_prefix", "gnps-gc",
            hdf5_filename, "summary_temp"
        ],
        [
            "tar", "-cvf",
            os.path.join(args.summary_output, "summary.tar"),
            os.path.join("summary_temp", "gnps-gc")
        ],
    ]

    for cmd in cmds:
        print(" ".join(cmd))
        return_code = subprocess.call(cmd)
        if return_code != 0:
            # Keep going (best effort), but make the failure visible.
            print("Command exited with status", return_code)

    # Files written by the export step
    output_peak_txt_filename = os.path.join(args.scratch_folder,
                                            "data_ms_peaks.txt")
    output_quant_filename = os.path.join(args.scratch_folder,
                                         "data_integrals.csv")

    # Mangled stem (text before the first '.') -> original stem (last path
    # component of the text before the first '.').
    mangled_mapping_filename = {
        key.split('.')[0]: value.split('.')[0].split("/")[-1]
        for key, value in mangled_mapping.items()
    }

    with open(output_quant_filename, 'r') as quant_file:
        quant_lines = quant_file.readlines()

    quant_in_memory = []
    for line in quant_lines:
        fname = line.split(',')[0]
        if fname.startswith("spec") and fname in mangled_mapping_filename:
            print("before:")
            print(line)
            # Only the leading sample-name field is renamed; later fields
            # that happen to contain the same text are left untouched.
            line = line.replace(fname, mangled_mapping_filename[fname], 1)
            print("after:")
            print(line)
        quant_in_memory.append(line)

    with open(output_quant_filename, 'w') as rewrite_quant:
        rewrite_quant.write("".join(quant_in_memory))

    parse_peaks_for_output(output_peak_txt_filename, args.clustered_mgf)
    simple_presence_of_merged_spectra_processing(output_quant_filename,
                                                 args.clusterinfo,
                                                 mangled_mapping)
    generate_clustersummary(output_quant_filename, args.clustersummary)
def process_candidate_molecules(candidate_molecules, path_to_spectrum_files, proteosafe_param):
    """Find, for each candidate molecule, the best matching non-MS1 spectrum.

    Candidates are grouped by their ``"filename"``. Each file is loaded once
    and, for every candidate of that file, the spectra whose ``mz`` lies
    within the candidate's own ``"ppm_threshold"`` of its
    ``"monoisotopic_mass"`` are compared by intensity. A candidate is
    reported only when a spectrum was found and its intensity exceeds the
    candidate's own ``"min_precursor_int"``.

    Args:
        candidate_molecules: Iterable of dict-like candidates. Keys read
            here: ``filename``, ``ppm_threshold``, ``min_precursor_int``,
            ``monoisotopic_mass`` and the descriptive fields copied to the
            output (``name``, ``instrument``, ``ionsource``, ``smiles``,
            ``inchi``, ``charge``, ``ionmode``, ``pubmed``, ``acquisition``,
            ``exact_mass``, ``datacollector``, ``adduct``, ``casnumber``,
            ``pi``).
        path_to_spectrum_files: Folder containing the spectrum files.
        proteosafe_param: Path of a ProteoSAFe parameter file, or a falsy
            value. When given, candidate file names are translated to their
            mangled names for loading and to the mapped path for display.

    Returns:
        A ``defaultdict(list)`` of column name -> list of values, one entry
        per reported candidate.

    Raises:
        KeyError: If ``proteosafe_param`` is given and a candidate file name
            is not present in the file mapping.
    """
    # Group candidates by the file they should be searched in.
    structures_by_filename = defaultdict(list)
    for candidate_object in candidate_molecules:
        structures_by_filename[candidate_object["filename"]].append(candidate_object)

    output_dict = defaultdict(list)

    # Original base name -> mangled name, only needed for ProteoSAFe runs.
    mangled_mapping = {}
    reversed_mapping = {}
    if proteosafe_param:
        workflow_params = proteosafe.parse_xml_file(proteosafe_param)
        mangled_mapping = proteosafe.get_mangled_file_mapping(workflow_params)
        for key, value in mangled_mapping.items():
            reversed_mapping[value.split("/")[-1]] = key

    for filename, structures in structures_by_filename.items():
        path_to_spectrum_file = os.path.join(path_to_spectrum_files, filename)
        displaying_filename = filename
        if proteosafe_param:
            # On disk the file is stored under its mangled name.
            path_to_spectrum_file = os.path.join(path_to_spectrum_files, reversed_mapping[filename])

            # Prefer the first mapped path containing the file name for display.
            for key in mangled_mapping:
                if displaying_filename in mangled_mapping[key]:
                    displaying_filename = mangled_mapping[key]
                    break

        # A file that cannot be loaded is reported and treated as empty.
        try:
            spectrum_list = ming_spectrum_library.load_mzxml_file(path_to_spectrum_file)
        except Exception as e:
            print(e)
            print("Could not load", path_to_spectrum_file)
            spectrum_list = []

        for structure_object in structures:
            highest_intensity = -1000
            best_spectrum = None
            # Thresholds come from the candidate being evaluated, not from
            # whichever candidate the grouping loop happened to end on.
            ppm_threshold = structure_object["ppm_threshold"]
            monoisotopic_mass = structure_object["monoisotopic_mass"]

            for spectrum in spectrum_list:
                if spectrum.ms_level == 1:
                    continue

                mz_delta = abs(spectrum.mz - monoisotopic_mass)
                ppm_delta = (mz_delta / monoisotopic_mass) * 1000000
                if ppm_delta > ppm_threshold:
                    continue

                # NOTE(review): the comparison uses the summed spectrum
                # intensity while the stored value is the larger of that and
                # totIonCurrent - confirm this asymmetry is intended.
                total_intensity = spectrum.get_total_spectrum_intensity()
                if total_intensity > highest_intensity:
                    best_spectrum = spectrum
                    highest_intensity = max(spectrum.totIonCurrent, total_intensity)

            if best_spectrum is None:
                continue
            if not highest_intensity > structure_object["min_precursor_int"]:
                continue

            # Build the whole row first so a missing key cannot leave the
            # output columns with different lengths.
            row = (
                ("FILENAME", displaying_filename),
                ("SEQ", "*.*"),
                ("COMPOUND_NAME", structure_object["name"]),
                ("MOLECULEMASS", structure_object["monoisotopic_mass"]),
                ("INSTRUMENT", structure_object["instrument"]),
                ("IONSOURCE", structure_object["ionsource"]),
                ("EXTRACTSCAN", best_spectrum.scan),
                ("SMILES", structure_object["smiles"]),
                ("INCHI", structure_object["inchi"]),
                ("INCHIAUX", "N/A"),
                ("CHARGE", structure_object["charge"]),
                ("IONMODE", structure_object["ionmode"]),
                ("PUBMED", structure_object["pubmed"]),
                ("ACQUISITION", structure_object["acquisition"]),
                ("EXACTMASS", structure_object["exact_mass"]),
                ("DATACOLLECTOR", structure_object["datacollector"]),
                ("ADDUCT", structure_object["adduct"]),
                ("INTEREST", "N/A"),
                ("LIBQUALITY", "1"),
                ("GENUS", "N/A"),
                ("SPECIES", "N/A"),
                ("STRAIN", "N/A"),
                ("CASNUMBER", structure_object["casnumber"]),
                ("PI", structure_object["pi"]),
            )
            for column, value in row:
                output_dict[column].append(value)

    return output_dict
def main():
    """Reformat a tool-specific quantification table and produce the MGF.

    The ``toolname`` argument selects the formatter:

    * ``MZMINE2``, ``OPENMS``, ``OPTIMUS``, ``MSDIAL``, ``METABOSCAPE`` and
      ``XCMS3`` require exactly one file in ``input_spectra_folder``; it is
      copied to ``output_mgf`` and the quantification table is converted
      with the matching formatter module.
    * ``PROGENESIS`` also requires exactly one input file, but the MGF is
      rewritten by ``progenesis_formatter.convert_mgf`` using the mapping
      returned by the table conversion.
    * ``MZTABM`` builds the MGF from all input files with
      ``mztabm_formatter.create_mgf``, using the ProteoSAFe file mapping to
      translate original base names to paths inside the input folder.

    Any other tool name is reported and otherwise ignored.

    Raises:
        SystemExit: With status 1 when a single-file tool is selected and
            the input folder does not contain exactly one file.
    """
    parser = argparse.ArgumentParser(description='Create parallel parameters')
    parser.add_argument('toolname', help='name of input tool')
    for argument_name in ('quantification_table',
                          'quantification_table_reformatted',
                          'input_spectra_folder', 'output_mgf',
                          'workflowParameters'):
        parser.add_argument(argument_name, help=argument_name)
    args = parser.parse_args()

    input_filenames = glob.glob(os.path.join(args.input_spectra_folder, "*"))
    toolname = args.toolname

    def require_single_mgf():
        """Return the only input file, or exit with status 1."""
        if len(input_filenames) != 1:
            print("Must input exactly 1 spectrum mgf file")
            raise SystemExit(1)
        return input_filenames[0]

    # Tools whose MGF is copied through unchanged. The lambdas postpone the
    # module lookup until the tool is actually selected, exactly as the
    # separate branches did.
    copy_through_formatters = {
        "MZMINE2": lambda: mzmine2_formatter,
        "OPENMS": lambda: openms_formatter,
        "OPTIMUS": lambda: optimus_formatter,
        "MSDIAL": lambda: msdial_formatter,
        "METABOSCAPE": lambda: metaboscape_formatter,
        "XCMS3": lambda: xcms_formatter,
    }

    if toolname in copy_through_formatters:
        print(toolname)
        input_mgf = require_single_mgf()
        shutil.copyfile(input_mgf, args.output_mgf)
        formatter = copy_through_formatters[toolname]()
        formatter.convert_to_feature_csv(
            args.quantification_table, args.quantification_table_reformatted)
    elif toolname == "PROGENESIS":
        print("PROGENESIS")
        input_mgf = require_single_mgf()

        compound_scan_mapping = progenesis_formatter.convert_to_feature_csv(
            args.quantification_table, args.quantification_table_reformatted)
        progenesis_formatter.convert_mgf(input_mgf, args.output_mgf,
                                         compound_scan_mapping)
    elif toolname == "MZTABM":
        print("MZTABM")
        workflow_parameters = proteosafe.parse_xml_file(
            args.workflowParameters)
        mangled_mapping = proteosafe.get_mangled_file_mapping(
            workflow_parameters)

        # Original base name -> path of the mangled file in the input folder
        name_mangle_mapping = {
            os.path.basename(demangled_name):
            os.path.join(args.input_spectra_folder, mangled_name)
            for mangled_name, demangled_name in mangled_mapping.items()
        }

        compound_filename_mapping = mztabm_formatter.convert_to_feature_csv(
            args.quantification_table, args.quantification_table_reformatted)
        mztabm_formatter.create_mgf(input_filenames,
                                    args.output_mgf,
                                    compound_filename_mapping,
                                    name_mangle_mapping=name_mangle_mapping)
    else:
        # Previously an unrecognised tool produced no output and no message.
        print("Unknown toolname:", toolname)
# Example no. 5
# 0
def main():
    """Reformat a quantification table, produce the MGF, optionally normalise.

    The ``toolname`` argument selects the formatter:

    * ``MZMINE2``, ``OPENMS``, ``OPTIMUS``, ``MSDIAL``, ``METABOSCAPE`` and
      ``XCMS3`` require exactly one file in ``input_spectra_folder``; it is
      copied to ``output_mgf`` and the quantification table is converted
      with the matching formatter module.
    * ``PROGENESIS`` also requires exactly one input file, but the MGF is
      rewritten by ``progenesis_formatter.convert_mgf``.
    * ``MZTABM`` builds the MGF from all input files with
      ``mztabm_formatter.create_mgf``.

    Any other tool name is reported and otherwise ignored.

    When ``--QUANT_FILE_NORM`` is ``RowSum``, every column of the
    reformatted table whose name contains ``"Peak area"`` is rescaled so its
    values sum to 1,000,000, and the table is written back in place.
    Normalisation is best effort: a failure is printed and does not abort
    the program.

    Raises:
        SystemExit: With status 1 when a single-file tool is selected and
            the input folder does not contain exactly one file.
    """
    parser = argparse.ArgumentParser(description='Create parallel parameters')
    parser.add_argument('toolname', help='name of input tool')
    for argument_name in ('quantification_table',
                          'quantification_table_reformatted',
                          'input_spectra_folder', 'output_mgf',
                          'workflowParameters'):
        parser.add_argument(argument_name, help=argument_name)
    parser.add_argument('--QUANT_FILE_NORM', default="None", help='QUANT_FILE_NORM')

    args = parser.parse_args()

    input_filenames = glob.glob(os.path.join(args.input_spectra_folder, "*"))
    toolname = args.toolname

    def require_single_mgf():
        """Return the only input file, or exit with status 1."""
        if len(input_filenames) != 1:
            print("Must input exactly 1 spectrum mgf file")
            raise SystemExit(1)
        return input_filenames[0]

    # Tools whose MGF is copied through unchanged. The lambdas postpone the
    # module lookup until the tool is actually selected, exactly as the
    # separate branches did.
    copy_through_formatters = {
        "MZMINE2": lambda: mzmine2_formatter,
        "OPENMS": lambda: openms_formatter,
        "OPTIMUS": lambda: optimus_formatter,
        "MSDIAL": lambda: msdial_formatter,
        "METABOSCAPE": lambda: metaboscape_formatter,
        "XCMS3": lambda: xcms_formatter,
    }

    if toolname in copy_through_formatters:
        print(toolname)
        input_mgf = require_single_mgf()
        shutil.copyfile(input_mgf, args.output_mgf)
        formatter = copy_through_formatters[toolname]()
        formatter.convert_to_feature_csv(args.quantification_table, args.quantification_table_reformatted)
    elif toolname == "PROGENESIS":
        print("PROGENESIS")
        input_mgf = require_single_mgf()

        compound_scan_mapping = progenesis_formatter.convert_to_feature_csv(args.quantification_table, args.quantification_table_reformatted)
        progenesis_formatter.convert_mgf(input_mgf, args.output_mgf, compound_scan_mapping)
    elif toolname == "MZTABM":
        print("MZTABM")
        workflow_parameters = proteosafe.parse_xml_file(args.workflowParameters)
        mangled_mapping = proteosafe.get_mangled_file_mapping(workflow_parameters)

        # Original base name -> path of the mangled file in the input folder
        name_mangle_mapping = {
            os.path.basename(demangled_name): os.path.join(args.input_spectra_folder, mangled_name)
            for mangled_name, demangled_name in mangled_mapping.items()
        }

        compound_filename_mapping = mztabm_formatter.convert_to_feature_csv(args.quantification_table, args.quantification_table_reformatted)
        mztabm_formatter.create_mgf(input_filenames, args.output_mgf, compound_filename_mapping, name_mangle_mapping=name_mangle_mapping)
    else:
        # Previously an unrecognised tool produced no output and no message.
        print("Unknown toolname:", toolname)

    # Finally, optionally renormalise the reformatted table in place.
    if args.QUANT_FILE_NORM == "RowSum":
        try:
            import pandas as pd
            quant_df = pd.read_csv(args.quantification_table_reformatted, sep=",")
            quant_df = quant_df.loc[:, ~quant_df.columns.str.contains('^Unnamed')]

            for column in quant_df:
                if "Peak area" not in column:
                    continue
                # Series.sum ignores missing values, so one empty cell no
                # longer blanks the whole column.
                column_total = quant_df[column].sum()
                if column_total == 0:
                    # Nothing to scale; dividing would turn zeros into NaN.
                    continue
                quant_df[column] = quant_df[column] / column_total * 1000000

            quant_df.to_csv(args.quantification_table_reformatted, sep=",", index=False)
        except Exception as error:
            # Still best effort, but the failure is no longer invisible and
            # SystemExit / KeyboardInterrupt are no longer swallowed.
            print("Could not normalize", args.quantification_table_reformatted, error)