Example #1
# Imports needed by the snippets in this example:
import sys
import argparse
from collections import defaultdict

import ming_fileio_library
import ming_gnps_library


def main():
    results_filename = sys.argv[1]
    output_filename_unique_files = sys.argv[2]
    output_filename_all_matches = sys.argv[3]

    all_datasets = ming_gnps_library.get_all_datasets(gnps_only=True)
    all_matches = ming_fileio_library.parse_table_with_headers_object_list(
        results_filename)

    output_source_list = []
    output_match_list = []

    MetaDataServerStatus = test_metadata_server()

    for match_object in all_matches:
        dataset_accession = match_object["dataset_id"]
        dataset_scan = match_object["dataset_scan"]

        #output_source_list += trace_filename(all_datasets, dataset_accession, dataset_scan)
        current_filelist, current_match_list = trace_filename_filesystem(
            all_datasets,
            dataset_accession,
            dataset_scan,
            enrichmetadata=MetaDataServerStatus)
        output_source_list += current_filelist
        output_match_list += current_match_list

    ming_fileio_library.write_list_dict_table_data(
        output_source_list, output_filename_unique_files)
    ming_fileio_library.write_list_dict_table_data(
        output_match_list, output_filename_all_matches)
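
# Usage sketch (hypothetical file names; the script reads three positional
# arguments via sys.argv, as above):
#   python trace_script.py matches.tsv unique_files.tsv all_matches.tsv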
def main():
    parser = argparse.ArgumentParser(description='Creates alan table')
    parser.add_argument('input_identifications_filename', help='input_identifications_filename')
    parser.add_argument('output_filename', help='output_filename')
    args = parser.parse_args()

    print(args.input_identifications_filename)

    data_list = ming_fileio_library.parse_table_with_headers_object_list(args.input_identifications_filename)

    all_filenames = set()
    compounds_to_files = defaultdict(set)
    for data_object in data_list:
        query_filename = "f." + data_object["full_CCMS_path"]
        compound_name = data_object["Compound_Name"]
        all_filenames.add(query_filename)
        compounds_to_files[compound_name].add(query_filename)

    output_list = []
    for compound_name in compounds_to_files:
        output_dict = {}
        output_dict["LibraryID"] = compound_name
        output_dict["TotalFiles"] = len(compounds_to_files[compound_name])
        for filename in compounds_to_files[compound_name]:
            output_dict[filename] = "1"

        for filename in all_filenames:
            if filename not in output_dict:
                output_dict[filename] = "0"

        output_list.append(output_dict)

    ming_fileio_library.write_list_dict_table_data(output_list, args.output_filename)
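
# The output is a presence/absence matrix over files, e.g. (hypothetical):
#   LibraryID     TotalFiles  f.data/a.mzXML  f.data/b.mzXML
#   Compound X    1           1               0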
def main():
    input_library_identifications = sys.argv[1]
    output_library_identifications = sys.argv[2]

    annotations_list = ming_fileio_library.parse_table_with_headers_object_list(input_library_identifications)

    already_identified_compounds = set()
    already_identified_spectra = set()

    annotations_list = sorted(annotations_list, key=lambda identification: float(identification["MQScore"]), reverse=True)

    output_annotation_list = []
    for annotation in annotations_list:
        compound_name = annotation["Compound_Name"]
        spectrum_identifier = annotation["#Scan#"] + ":" + annotation["SpectrumFile"]

        if compound_name in already_identified_compounds:
            continue
        if spectrum_identifier in already_identified_spectra:
            continue

        print(compound_name, spectrum_identifier)

        output_annotation_list.append(annotation)
        already_identified_compounds.add(compound_name)
        already_identified_spectra.add(spectrum_identifier)

    ming_fileio_library.write_list_dict_table_data(output_annotation_list, output_library_identifications)
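
# The sort above (MQScore, descending) makes the loop a greedy one-to-one
# assignment: each compound and each spectrum keeps only its single
# best-scoring annotation.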
Example #5
# Imports needed by the snippets in this example:
import argparse
from collections import defaultdict

import ming_fileio_library
import ming_proteosafe_library


def main():
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('filestats', help='filestats')
    parser.add_argument('dbresults', help='dbresults')
    parser.add_argument('output_filestats', help='output folder for parameters')
    args = parser.parse_args()

    identified_spectra_in_filename = defaultdict(set)

    all_identifications = ming_fileio_library.parse_table_with_headers_object_list(args.dbresults)
    for identification in all_identifications:
        filename = identification["full_CCMS_path"]
        scan = identification["#Scan#"]

        identified_spectra_in_filename[filename].add(scan)

    # Debug output; this can be very large.
    print(identified_spectra_in_filename)

    output_list = []
    file_summaries = ming_fileio_library.parse_table_with_headers_object_list(args.filestats)

    for file_summary in file_summaries:
        filename = file_summary["full_CCMS_path"]
        count = len(identified_spectra_in_filename[filename])
        file_summary["identified_ms2"] = count
        try:
            percent_identified = float(count) / float(file_summary["MS2s"])
        except (KeyError, ValueError, ZeroDivisionError):
            percent_identified = 0

        file_summary["percent_identified"] = percent_identified
        output_list.append(file_summary)

    ming_fileio_library.write_list_dict_table_data(output_list, args.output_filestats)
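
# Note: despite its name, the percent_identified column above holds a 0-1
# fraction (identified_ms2 / MS2s), not a value multiplied by 100.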
def main():
    parser = argparse.ArgumentParser(description='Creates alan table')
    parser.add_argument('input_clusterinfosummary',
                        help='input_clusterinfosummary')
    parser.add_argument('output_filename', help='output_filename')
    args = parser.parse_args()

    print(args.input_clusterinfosummary)

    data_list = ming_fileio_library.parse_table_with_headers_object_list(
        args.input_clusterinfosummary)

    all_filenames = []
    for data_object in data_list:
        if "UniqueFileSources" in data_object:
            all_filenames += data_object["UniqueFileSources"].split("|")
        else:
            filenames = list(
                set([
                    filename.split(":")[0]
                    for filename in data_object["AllFiles"].split("###")
                    if len(filename) > 2
                ]))
            all_filenames += filenames

    all_filenames = list(set(all_filenames))

    compounds_to_files = defaultdict(list)
    for data_object in data_list:
        filenames = []
        if "UniqueFileSources" in data_object:
            filenames = data_object["UniqueFileSources"].split("|")
        else:
            filenames = list(
                set([
                    filename.split(":")[0]
                    for filename in data_object["AllFiles"].split("###")
                    if len(filename) > 2
                ]))
        compound_name = data_object["LibraryID"]
        compounds_to_files[compound_name] += filenames

    output_list = []
    for compound_name in compounds_to_files:
        output_dict = {}
        output_dict["LibraryID"] = compound_name
        output_dict["TotalFiles"] = len(compounds_to_files[compound_name])
        for filename in compounds_to_files[compound_name]:
            output_dict[filename] = "1"

        for filename in all_filenames:
            if filename not in output_dict:
                output_dict[filename] = "0"

        output_list.append(output_dict)

    ming_fileio_library.write_list_dict_table_data(output_list,
                                                   args.output_filename)
def main():
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('params_xml', help='params_xml')
    parser.add_argument('input_clusterinfo_summary', help='Input cluster info summary')
    parser.add_argument('input_network_pairs_file', help='network_pairs_file')
    parser.add_argument('input_library_search_file', help='library_search_file')
    parser.add_argument('output_clusterinfo_summary', help='output file')
    parser.add_argument('output_component_summary', help='output component file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.params_xml))

    all_clusterinfo_list = ming_fileio_library.parse_table_with_headers_object_list(args.input_clusterinfo_summary)

    library_ids_dict = load_library_id_dict(args.input_library_search_file)
    nodes_to_component, component_to_nodes = load_pairs_dict(args.input_network_pairs_file)

    for cluster in all_clusterinfo_list:
        cluster_index = cluster["cluster index"]
        if cluster_index in nodes_to_component:
            cluster["componentindex"] = nodes_to_component[cluster_index]
            cluster["GNPSLinkout_Network"] = "https://gnps.ucsd.edu/ProteoSAFe/result.jsp?view=network_displayer&componentindex=%s&task=%s" % (nodes_to_component[cluster_index], param_obj["task"][0])
        else:
            cluster["componentindex"] = "-1"
            cluster["GNPSLinkout_Network"] = 'https://gnps.ucsd.edu/ProteoSAFe/result.jsp?task=%s&view=view_all_clusters_withID#{"main.cluster index_lowerinput":"%s","main.cluster index_upperinput":"%s"}' % (param_obj["task"][0], cluster_index, cluster_index)

        if cluster_index in library_ids_dict:
            cluster["LibraryID"] = library_ids_dict[cluster_index]["Compound_Name"]
            cluster["MQScore"] = library_ids_dict[cluster_index]["MQScore"]
            cluster["SpectrumID"] = library_ids_dict[cluster_index]["SpectrumID"]
        else:
            cluster["LibraryID"] = "N/A"
            cluster["MQScore"] = "N/A"
            cluster["SpectrumID"] = "N/A"

    ming_fileio_library.write_list_dict_table_data(all_clusterinfo_list, args.output_clusterinfo_summary)

    output_component_list = []

    for componentindex in component_to_nodes:
        output_dict = {}
        output_dict["ComponentIndex"] = componentindex
        output_dict["NodeCount"] = len(component_to_nodes[componentindex])
        output_dict["#Spectra"] = len(component_to_nodes[componentindex])
        all_lib_identifications = []
        for node in component_to_nodes[componentindex]:
            if node in library_ids_dict:
                all_lib_identifications.append(library_ids_dict[node]["Compound_Name"])
        output_dict["AllIDs"] = "!".join(all_lib_identifications)
        output_component_list.append(output_dict)

    ming_fileio_library.write_list_dict_table_data(output_component_list, args.output_component_summary)
Example #8
# Imports needed by the snippets in this example:
import sys

import ming_fileio_library
import ming_spectrum_library


def generate_clustersummary(input_integrals_filename, output_clustersummary_filename):
    header_order = open(input_integrals_filename).readline().rstrip().split(",")[1:]
    output_list = []

    scan_number = 0
    for header in header_order:
        scan_number += 1
        output_dict = {}
        output_dict["cluster index"] = scan_number
        output_dict["RTMean"] = header

        output_list.append(output_dict)

    ming_fileio_library.write_list_dict_table_data(output_list, output_clustersummary_filename)
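
# Each column header of the integrals CSV (minus the leading index column)
# becomes one output row: a 1-based "cluster index" plus the header text,
# which this table stores under "RTMean".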
def main():
    results_filename = sys.argv[1]
    output_filename = sys.argv[2]

    input_results = ming_fileio_library.parse_table_with_headers_object_list(
        results_filename)
    output_results = []

    #Check if server is up

    for result_object in input_results:
        filename = result_object["filename"]
        metadata_list = get_metadata_information_per_filename(filename)
        # Keep each row and attach its metadata ("|"-joined, as elsewhere in
        # this module) so the output table is not written out empty.
        result_object["metadata"] = "|".join(metadata_list)
        output_results.append(result_object)

    ming_fileio_library.write_list_dict_table_data(output_results,
                                                   output_filename)
def main():
    parameters_filename = sys.argv[1]
    input_mgf_filename = sys.argv[2]
    output_clusterinfosummary = sys.argv[3]

    output_list = []

    spectrum_collection = ming_spectrum_library.SpectrumCollection(input_mgf_filename)
    spectrum_collection.load_from_file()

    for spectrum in spectrum_collection.spectrum_list:
        output_dict = {}
        output_dict["cluster index"] = spectrum.scan
        output_dict["RTMean"] = spectrum.retention_time
        output_list.append(output_dict)

    ming_fileio_library.write_list_dict_table_data(output_list, output_clusterinfosummary)
def main():
    input_intermediate_folder = sys.argv[1]
    output_filename = sys.argv[2]

    all_protein_stats = {}

    #Creating a command line for each partition
    all_intermediate_files = ming_fileio_library.list_files_in_dir(
        input_intermediate_folder)
    output_list = []
    for parallel_output_filename in all_intermediate_files:
        result_list = ming_fileio_library.parse_table_with_headers_object_list(
            parallel_output_filename)
        output_list += result_list

    ming_fileio_library.write_list_dict_table_data(output_list,
                                                   output_filename)
Example #12
# Imports needed by the snippets in this example:
import argparse
from collections import defaultdict

import networkx as nx

import ming_fileio_library


def output_graph_with_headers(G, filename):
    output_list = []

    #Outputting the graph
    component_index = 0
    for component in nx.connected_components(G):
        component_index += 1
        for edge in get_edges_of_component(G, component):
            output_dict = {}

            if int(edge[0]) < int(edge[1]):
                output_dict["CLUSTERID1"] = edge[0]
                output_dict["CLUSTERID2"] = edge[1]
            else:
                output_dict["CLUSTERID1"] = edge[1]
                output_dict["CLUSTERID2"] = edge[0]

            output_dict["DeltaMZ"] = edge[2]["mass_difference"]
            output_dict["Cosine"] = edge[2]["cosine_score"]
            output_dict["ComponentIndex"] = component_index

            output_list.append(output_dict)

    ming_fileio_library.write_list_dict_table_data(output_list, filename)
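
# get_edges_of_component is not shown in this excerpt; a minimal sketch of a
# helper with the shape the function above relies on (edges as
# (node, node, attribute-dict) triples restricted to one component):
def get_edges_of_component(G, component):
    return [edge for edge in G.edges(data=True)
            if edge[0] in component and edge[1] in component]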
def main():
    parser = argparse.ArgumentParser(description='Creates alan table')
    parser.add_argument('input_identifications_filename',
                        help='input_identifications_filename')
    parser.add_argument('output_filename', help='output_filename')
    args = parser.parse_args()

    print(args.input_identifications_filename)

    data_list = ming_fileio_library.parse_table_with_headers_object_list(
        args.input_identifications_filename)

    all_filenames = set()
    compounds_to_files = defaultdict(set)
    for data_object in data_list:
        query_filename = "f." + data_object["full_CCMS_path"]
        compound_name = data_object["Compound_Name"]
        all_filenames.add(query_filename)
        compounds_to_files[compound_name].add(query_filename)

    output_list = []
    for compound_name in compounds_to_files:
        output_dict = {}
        output_dict["LibraryID"] = compound_name
        output_dict["TotalFiles"] = len(compounds_to_files[compound_name])
        for filename in compounds_to_files[compound_name]:
            output_dict[filename] = "1"

        for filename in all_filenames:
            if filename not in output_dict:
                output_dict[filename] = "0"

        output_list.append(output_dict)

    ming_fileio_library.write_list_dict_table_data(output_list,
                                                   args.output_filename)
Example #15
# Imports needed by the snippets in this example:
import os
import sys
import argparse
import statistics
from collections import defaultdict

import ming_fileio_library
import ming_proteosafe_library


def main():
    parser = argparse.ArgumentParser(
        description='Creates enriched cluster info summary')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('input_clusterinfo_file',
                        help='input_clusterinfo_file')
    parser.add_argument('input_clusterinfosummary_file',
                        help='input_clusterinfosummary_file')
    parser.add_argument('input_group_mapping_filename',
                        help='input_group_mapping_filename')
    parser.add_argument('input_attribute_mapping_filename',
                        help='input_attribute_mapping_filename')
    parser.add_argument('input_networking_pairs',
                        help='input_networking_pairs')
    parser.add_argument('input_library_search', help='input_library_search')
    parser.add_argument('output_clusterinfosummary_filename',
                        help='output_clusterinfosummary_filename')
    args = parser.parse_args()
    """Loading group filenames"""
    group_to_files, files_to_groups = load_group_mapping(
        args.input_group_mapping_filename)
    print("Loaded Group Mapping")
    cluster_summary_list = ming_fileio_library.parse_table_with_headers_object_list(
        args.input_clusterinfosummary_file)
    print("Loaded Cluster Summary")

    attribute_to_groups = load_attribute_mapping(
        args.input_attribute_mapping_filename)

    params_object = ming_proteosafe_library.parse_xml_file(open(
        args.param_xml))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        params_object)

    CLUSTER_MIN_SIZE = int(params_object["CLUSTER_MIN_SIZE"][0])
    RUN_MSCLUSTER = params_object["RUN_MSCLUSTER"][0]

    #Calculating the spectrum counts per group
    cluster_to_group_counts = defaultdict(lambda: defaultdict(lambda: 0))
    cluster_to_files = defaultdict(set)
    cluster_to_RT = defaultdict(list)
    line_count = 0
    for line in open(args.input_clusterinfo_file):
        line_count += 1
        if line_count == 1:
            continue
        if line_count % 10000 == 0:
            print(line_count)

        splits = line.rstrip().split("\t")
        cluster_index = splits[0]
        filename = os.path.basename(splits[1])
        rt = float(splits[6])

        group_membership = files_to_groups[filename]
        cluster_to_files[cluster_index].add(filename)
        cluster_to_RT[cluster_index].append(rt)

        for group in group_membership:
            cluster_to_group_counts[cluster_index][group] += 1

    if RUN_MSCLUSTER == "on":
        cluster_summary_list = filter_clusters_based_on_cluster_size(
            cluster_summary_list, CLUSTER_MIN_SIZE)

    print(len(cluster_summary_list))

    print("Setting up grouping", len(group_to_files.keys()))
    for cluster_summary_object in cluster_summary_list:
        cluster_index = cluster_summary_object["cluster index"]
        for group in group_to_files:
            group_count = 0
            if group in cluster_to_group_counts[cluster_index]:
                group_count = cluster_to_group_counts[cluster_index][group]
            cluster_summary_object[group] = group_count

        for attribute in attribute_to_groups:
            groups_to_include = []
            for group in attribute_to_groups[attribute]:
                if group in cluster_summary_object:
                    if cluster_summary_object[group] > 0:
                        groups_to_include.append(group)

            cluster_summary_object[attribute] = ",".join(
                groups_to_include).replace("GNPSGROUP:", "")

    print("Default Attributes")
    calculate_default_attributes(cluster_summary_list, group_to_files.keys())

    print("calculate_cluster_file_stats")
    calculate_cluster_file_stats(cluster_summary_list, cluster_to_files,
                                 mangled_mapping)

    print("rt stats")
    calculate_rt_stats(cluster_summary_list, cluster_to_RT)

    print("populate_network_component")
    populate_network_component(cluster_summary_list,
                               args.input_networking_pairs)

    print("calculate_ancillary_information")
    calculate_ancillary_information(cluster_summary_list,
                                    params_object["task"][0])

    print("populate_network_identifications")
    populate_network_identifications(cluster_summary_list,
                                     args.input_library_search)

    ming_fileio_library.write_list_dict_table_data(
        cluster_summary_list, args.output_clusterinfosummary_filename)
def main():
    input_filename = sys.argv[1]
    output_tsv = sys.argv[2]

    results_list = ming_fileio_library.parse_table_with_headers_object_list(
        input_filename)
    results_by_compound_name = defaultdict(list)
    for result in results_list:
        annotation_string = result["Compound_Name"]
        results_by_compound_name[annotation_string].append(result)

    output_results = []
    for compound_name in results_by_compound_name:
        best_result = sorted(results_by_compound_name[compound_name],
                             key=lambda result: float(result["MQScore"]),
                             reverse=True)[0]

        all_RTs = [
            float(result["RT_Query"])
            for result in results_by_compound_name[compound_name]
        ]
        all_MZs = [
            float(result["SpecMZ"])
            for result in results_by_compound_name[compound_name]
        ]
        all_MZ_ppmerror = [
            float(result["MZErrorPPM"])
            for result in results_by_compound_name[compound_name]
        ]

        rt_mean = statistics.mean(all_RTs)
        rt_median = statistics.median(all_RTs)
        mz_mean = statistics.mean(all_MZs)
        mz_ppm_mean = statistics.mean(all_MZ_ppmerror)

        rt_max = max(all_RTs)
        rt_min = min(all_RTs)

        mz_max = max(all_MZs)
        mz_min = min(all_MZs)

        #STDDev
        rt_stdev = 0.0
        mz_stdev = 0.0
        ppmerror_stdev = 0.0
        if len(all_RTs) > 1:
            rt_stdev = statistics.stdev(all_RTs)
            mz_stdev = statistics.stdev(all_MZs)
            ppmerror_stdev = statistics.stdev(all_MZ_ppmerror)

        best_result["rt_mean"] = rt_mean
        best_result["rt_median"] = rt_median
        best_result["mz_mean"] = mz_mean
        best_result["mz_ppm_mean"] = mz_ppm_mean
        best_result["rt_max"] = rt_max
        best_result["rt_min"] = rt_min
        best_result["mz_max"] = mz_max
        best_result["mz_min"] = mz_min
        best_result["rt_stdev"] = rt_stdev
        best_result["mz_stdev"] = mz_stdev
        best_result["ppmerror_stdev"] = ppmerror_stdev
        best_result["number_spectra"] = len(all_RTs)

        output_results.append(best_result)

    ming_fileio_library.write_list_dict_table_data(output_results, output_tsv)
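
# Each compound keeps only its single best-scoring row (highest MQScore),
# enriched with RT/MZ aggregate statistics over all of that compound's hits.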
def main():
    parser = argparse.ArgumentParser(description='Creates enriched cluster info summary')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('input_clusterinfo_file', help='input_clusterinfo_file')
    parser.add_argument('input_clusterinfosummary_file', help='input_clusterinfosummary_file')
    parser.add_argument('input_group_mapping_filename', help='input_group_mapping_filename')
    parser.add_argument('input_attribute_mapping_filename', help='input_attribute_mapping_filename')
    parser.add_argument('input_networking_pairs', help='input_networking_pairs')
    parser.add_argument('input_library_search', help='input_library_search')
    parser.add_argument('output_clusterinfosummary_filename', help='output_clusterinfosummary_filename')
    args = parser.parse_args()

    """Loading group filenames"""
    group_to_files, files_to_groups = load_group_mapping(args.input_group_mapping_filename)
    print("Loaded Group Mapping")
    cluster_summary_list = ming_fileio_library.parse_table_with_headers_object_list(args.input_clusterinfosummary_file)
    print("Loaded Cluster Summary")

    attribute_to_groups = load_attribute_mapping(args.input_attribute_mapping_filename)

    params_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    CLUSTER_MIN_SIZE = int(params_object["CLUSTER_MIN_SIZE"][0])
    RUN_MSCLUSTER = params_object["RUN_MSCLUSTER"][0]

    #Calculating the spectrum counts per group
    cluster_to_group_counts = defaultdict(lambda: defaultdict(lambda: 0))
    cluster_to_files = defaultdict(set)
    cluster_to_RT = defaultdict(list)
    line_count = 0
    for line in open(args.input_clusterinfo_file):
        line_count += 1
        if line_count == 1:
            continue
        if line_count % 10000 == 0:
            print(line_count)

        splits = line.rstrip().split("\t")
        cluster_index = splits[0]
        filename = os.path.basename(splits[1])
        rt = float(splits[6])

        group_membership = files_to_groups[filename]
        cluster_to_files[cluster_index].add(filename)
        cluster_to_RT[cluster_index].append(rt)

        for group in group_membership:
            cluster_to_group_counts[cluster_index][group] += 1

    if RUN_MSCLUSTER == "on":
        cluster_summary_list = filter_clusters_based_on_cluster_size(cluster_summary_list, CLUSTER_MIN_SIZE)

    print(len(cluster_summary_list))

    print("Setting up grouping", len(group_to_files.keys()))
    for cluster_summary_object in cluster_summary_list:
        cluster_index = cluster_summary_object["cluster index"]
        for group in group_to_files:
            group_count = 0
            if group in cluster_to_group_counts[cluster_index]:
                group_count = cluster_to_group_counts[cluster_index][group]
            cluster_summary_object[group] = group_count

        for attribute in attribute_to_groups:
            groups_to_include = []
            for group in attribute_to_groups[attribute]:
                if group in cluster_summary_object:
                    if cluster_summary_object[group] > 0:
                        groups_to_include.append(group)

            cluster_summary_object[attribute] = ",".join(groups_to_include).replace("GNPSGROUP:", "")


    print("Default Attributes")
    calculate_default_attributes(cluster_summary_list, group_to_files.keys())

    print("calculate_cluster_file_stats")
    calculate_cluster_file_stats(cluster_summary_list, cluster_to_files, mangled_mapping)

    print("rt stats")
    calculate_rt_stats(cluster_summary_list, cluster_to_RT)

    print("calculate_ancillary_information")
    calculate_ancillary_information(cluster_summary_list, params_object["task"][0])

    print("populate_network_component")
    populate_network_component(cluster_summary_list, args.input_networking_pairs)

    print("populate_network_identifications")
    populate_network_identifications(cluster_summary_list, args.input_library_search)

    ming_fileio_library.write_list_dict_table_data(cluster_summary_list, args.output_clusterinfosummary_filename)
Example #18
# Imports needed by the snippets in this example:
import argparse
from collections import defaultdict

import ming_fileio_library
import ming_proteosafe_library
import trace_to_single_file


def main():
    parser = argparse.ArgumentParser(description='Modifying script')
    parser.add_argument('param_xml', help='metadata_folder')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_metadata_table', help='output_metadata_table')
    parser.add_argument('output_view_emporer', help='output_metadata_table')
    args = parser.parse_args()

    param_object = ming_proteosafe_library.parse_xml_file(
        open(args.param_xml, "r"))
    """Outputting html"""
    from urllib.parse import urlencode, quote_plus
    parameters_for_qiime = {
        'biom':
        'http://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=biom_output/networking_quant.biom'
        % (param_object["task"][0]),
        'metadata':
        'http://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=metadata_for_qiime/metadata_for_qiime.txt'
        % (param_object["task"][0])
    }

    output_html_file = open(args.output_view_emporer, "w")
    output_html_file.write("<script>\n")
    output_html_file.write(
        'window.location.replace("https://mingwangbeta.ucsd.edu/emperor?%s")\n'
        % urlencode(parameters_for_qiime))
    output_html_file.write("</script>\n")
    output_html_file.close()
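
    # The file written above is a small HTML snippet that redirects the
    # browser, roughly:
    #   <script>
    #   window.location.replace("https://mingwangbeta.ucsd.edu/emperor?biom=...&metadata=...")
    #   </script>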

    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(
        param_object)

    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(
        args.metadata_folder)

    object_list = []

    if len(metadata_files_in_folder) != 1:
        for real_name in reverse_file_mangling:
            mangled_name = reverse_file_mangling[real_name]
            if mangled_name.find("spec") == -1:
                continue
            object_list.append({"filename": real_name})
        #open(args.output_metadata_table, "w").write("NO OUTPUT")
        #open(args.output_view_emporer, "w").write("Please Include Metadata File")
        #exit(0)
    else:
        object_list = ming_fileio_library.parse_table_with_headers_object_list(
            metadata_files_in_folder[0])

        if len(object_list) == 0:
            for real_name in reverse_file_mangling:
                mangled_name = reverse_file_mangling[real_name]
                if mangled_name.find("spec") == -1:
                    continue
                object_list.append({"filename": real_name})
            #open(args.output_metadata_table, "w").write("NO OUTPUT")
            #open(args.output_view_emporer, "w").write("Please Include Non Empty Metadata File")
            #exit(0)

    #Writing headers
    header_list = ["#SampleID", "BarcodeSequence", "LinkerPrimerSequence"]
    for key in object_list[0]:
        if key not in header_list:
            header_list.append(key)

    header_list.append("ATTRIBUTE_GNPSDefaultGroup")

    for metadata_object in object_list:
        if not "#SampleID" in metadata_object:
            metadata_object[
                "#SampleID"] = ming_fileio_library.get_filename_without_extension(
                    metadata_object["filename"])
        if not "BarcodeSequence" in metadata_object:
            metadata_object["BarcodeSequence"] = "GATACA"
        if not "LinkerPrimerSequence" in metadata_object:
            metadata_object["LinkerPrimerSequence"] = "GATACA"

        mangled_name = reverse_file_mangling[metadata_object["filename"]]
        if mangled_name.find("spec-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G1"
        elif mangled_name.find("spectwo-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G2"
        elif mangled_name.find("specthree-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G3"
        elif mangled_name.find("specfour-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G4"
        elif mangled_name.find("specfive-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G5"
        elif mangled_name.find("specsix-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G6"

    ming_fileio_library.write_list_dict_table_data(object_list,
                                                   args.output_metadata_table,
                                                   header_list)
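
# "#SampleID", "BarcodeSequence" and "LinkerPrimerSequence" are mandatory
# QIIME mapping-file columns, hence the dummy "GATACA" values above for
# files without real barcode/primer metadata.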
def match_unclustered(match_parameters, spectrum_collection, dataset_dict,
                      all_datasets, output_matches_filename,
                      output_filename_unique_files,
                      output_filename_all_matches):
    MetaDataServerStatus = trace_to_single_file.test_metadata_server()

    all_matches_by_dataset = finding_matches_in_public_data(
        spectrum_collection, all_datasets, match_parameters)

    dataset_matches_output_list = []
    output_filename_unique_files_list = []
    output_filename_all_matches_list = []
    for dataset in all_matches_by_dataset:
        #For each dataset, lets try to find the clustering information
        if len(all_matches_by_dataset[dataset]["matches"]) == 0:
            continue

        top_match = sorted(all_matches_by_dataset[dataset]["matches"],
                           key=lambda match: match["cosine"],
                           reverse=True)[0]

        output_dict = {}
        output_dict['specs_filename'] = "specs_ms.mgf"
        output_dict['specs_scan'] = top_match["queryscan"]
        output_dict['dataset_id'] = dataset_dict[dataset]["dataset"]
        output_dict['dataset_title'] = dataset_dict[dataset]["title"]
        output_dict['dataset_description'] = dataset_dict[dataset][
            "description"].replace("\n", "").replace("\t", "")
        output_dict['dataset_organisms'] = dataset_dict[dataset][
            "species"].replace(";", "!")
        output_dict['dataset_filename'] = top_match["filename"]
        output_dict['dataset_scan'] = top_match["scan"]
        output_dict['score'] = top_match["cosine"]
        output_dict['matchedpeaks'] = top_match["matchedpeaks"]
        output_dict['mzerror'] = top_match["mzerror"]
        output_dict['files_count'] = len(
            all_matches_by_dataset[dataset]["matches"])

        dataset_matches_output_list.append(output_dict)
        """Unique Filenames Calculation"""
        unique_files = list(
            set([
                match["filename"]
                for match in all_matches_by_dataset[dataset]["matches"]
            ]))
        for source_file in unique_files:
            output_object = {}
            output_object["dataset_id"] = dataset_dict[dataset]["dataset"]
            output_object["cluster_scan"] = ""
            output_object["filename"] = source_file
            output_object["metadata"] = ""

            if MetaDataServerStatus:
                metadata_list = trace_to_single_file.get_metadata_information_per_filename(
                    source_file)
                output_object["metadata"] = "|".join(metadata_list)

            output_filename_unique_files_list.append(output_object)

        for match in all_matches_by_dataset[dataset]["matches"]:
            output_object = {}
            output_object["dataset_id"] = dataset
            output_object["cluster_scan"] = match["queryscan"]
            output_object["filename"] = match["filename"]
            output_object["filescan"] = match["scan"]
            output_object["metadata"] = ""

            if MetaDataServerStatus:
                metadata_list = trace_to_single_file.get_metadata_information_per_filename(
                    match["filename"])
                output_object["metadata"] = "|".join(metadata_list)

            output_filename_all_matches_list.append(output_object)

    ming_fileio_library.write_list_dict_table_data(dataset_matches_output_list,
                                                   output_matches_filename)
    ming_fileio_library.write_list_dict_table_data(
        output_filename_unique_files_list, output_filename_unique_files)
    ming_fileio_library.write_list_dict_table_data(
        output_filename_all_matches_list, output_filename_all_matches)
def match_clustered(match_parameters, spectrum_collection, dataset_dict,
                    all_datasets, output_matches_filename,
                    output_filename_unique_files, output_filename_all_matches):
    all_matches = finding_matches_in_public_data(spectrum_collection,
                                                 all_datasets,
                                                 match_parameters)
    """Resolving to File Level"""
    dataset_files_count = defaultdict(lambda: 0)
    output_source_list = []
    output_match_list = []

    MetaDataServerStatus = trace_to_single_file.test_metadata_server()

    for dataset in all_matches:
        for match_object in all_matches[dataset]["matches"]:
            dataset_accession = dataset_dict[dataset]["dataset"]
            dataset_scan = match_object["scan"]
            current_filelist, current_match_list = trace_to_single_file.trace_filename_filesystem(
                all_datasets,
                dataset_accession,
                dataset_scan,
                enrichmetadata=MetaDataServerStatus)
            output_source_list += current_filelist
            output_match_list += current_match_list

    seen_files = set()
    output_unique_source_list = []
    for output_file_object in output_source_list:
        dataset_accession = output_file_object["dataset_id"]
        dataset_filename = output_file_object["filename"]

        key = dataset_accession + ":" + dataset_filename
        if key in seen_files:
            continue

        dataset_files_count[dataset_accession] += 1

        seen_files.add(key)

        output_unique_source_list.append(output_file_object)

    ming_fileio_library.write_list_dict_table_data(
        output_unique_source_list, output_filename_unique_files)
    ming_fileio_library.write_list_dict_table_data(
        output_match_list, output_filename_all_matches)
    """ Summary """
    output_map = {
        "specs_filename": [],
        "specs_scan": [],
        "dataset_filename": [],
        "dataset_scan": [],
        "score": [],
        "dataset_id": [],
        "dataset_title": [],
        "dataset_description": [],
        "dataset_organisms": [],
        "matchedpeaks": [],
        "mzerror": [],
        "files_count": []
    }
    for dataset in all_matches:
        #For each dataset, lets try to find the clustering information
        if len(all_matches[dataset]["matches"]) == 0:
            continue

        match_object = None

        #If it is more than one match, we need to consolidate
        if len(all_matches[dataset]["matches"]) > 1:
            sorted_match_list = sorted(
                all_matches[dataset]["matches"],
                key=lambda match: float(match["cosine"]),
                reverse=True)
            match_object = sorted_match_list[0]
        else:
            match_object = all_matches[dataset]["matches"][0]

        output_map['specs_filename'].append("specs_ms.mgf")
        output_map['specs_scan'].append(match_object["queryscan"])
        output_map['dataset_id'].append(dataset_dict[dataset]["dataset"])
        output_map['dataset_title'].append(dataset_dict[dataset]["title"])
        output_map['dataset_description'].append(
            dataset_dict[dataset]["description"].replace("\n", "").replace(
                "\t", "").replace("\r", ""))
        output_map['dataset_organisms'].append(
            dataset_dict[dataset]["species"].replace(
                "<hr class='separator'\/>", "!"))
        output_map['dataset_filename'].append(match_object["filename"])
        output_map['dataset_scan'].append(match_object["scan"])
        output_map['score'].append(match_object["cosine"])
        output_map['matchedpeaks'].append(match_object["matchedpeaks"])
        output_map['mzerror'].append(match_object["mzerror"])
        output_map['files_count'].append(dataset_files_count[dataset])

    ming_fileio_library.write_dictionary_table_data(output_map,
                                                    output_matches_filename)
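
# Note: unlike write_list_dict_table_data (one dict per output row),
# write_dictionary_table_data takes a columnar dict of parallel lists, which
# is why output_map above is built as lists keyed by column name.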
Example #21
# Imports needed by the snippets in this example:
import os
import argparse

import ming_fileio_library
import ming_parallel_library
import ming_proteosafe_library


def main():
    parser = argparse.ArgumentParser(
        description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('workflow_parameters',
                        help='output folder for parameters')
    parser.add_argument('result_file', help='output folder for parameters')
    parser.add_argument('msaccess_binary', help='output folder for parameters')
    parser.add_argument('--parallelism',
                        default=1,
                        type=int,
                        help='Parallelism')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(
        open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        params_object)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)

    spectra_files.sort()

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except OSError:
        print("folder error")

    parameter_list = []
    for spectrum_file in spectra_files:
        param_dict = {}
        param_dict["spectrum_file"] = spectrum_file
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args

        parameter_list.append(param_dict)

    #for param_dict in parameter_list:
    #    search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(summary_wrapper, parameter_list, 10)
    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(
        tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        try:
            result_list = ming_fileio_library.parse_table_with_headers_object_list(
                input_file)
            for result in result_list:
                output_dict = {}
                output_dict["Filename"] = result["Filename"]
                output_dict["Vendor"] = result["Vendor"]
                output_dict["Model"] = result["Model"]
                output_dict["MS1s"] = result["MS1s"]
                output_dict["MS2s"] = result["MS2s"]
                full_result_list.append(output_dict)
        except Exception:
            #raise
            print("Error", input_file)

        #print(result_list)
        #full_result_list += result_list

    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["Filename"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    ming_fileio_library.write_list_dict_table_data(full_result_list,
                                                   args.result_file)
Example #22
# Imports needed by the snippets in this example:
import os
import glob
import json
import uuid
import argparse
import statistics
from collections import defaultdict

import ming_fileio_library
import ming_parallel_library
import ming_proteosafe_library
import ming_spectrum_library


def main():
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('workflow_parameters', help='output folder for parameters')
    parser.add_argument('result_file', help='output folder for parameters')
    parser.add_argument('msaccess_binary', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()


    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)

    spectra_files.sort()


    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except OSError:
        print("folder error")

    parameter_list = []
    for spectrum_file in spectra_files:
        param_dict = {}
        param_dict["spectrum_file"] = spectrum_file
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args

        parameter_list.append(param_dict)

    #for param_dict in parameter_list:
    #    search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(summary_wrapper, parameter_list, 10)


    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        try:
            result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
            for result in result_list:
                output_dict = {}
                output_dict["Filename"] = result["Filename"]
                output_dict["Vendor"] = result["Vendor"]
                output_dict["Model"] = result["Model"]
                output_dict["MS1s"] = result["MS1s"]
                output_dict["MS2s"] = result["MS2s"]
                full_result_list.append(output_dict)
        except Exception:
            #raise
            print("Error", input_file)

        #print(result_list)
        #full_result_list += result_list

    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["Filename"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    ming_fileio_library.write_list_dict_table_data(full_result_list, args.result_file)
def main():
    parser = argparse.ArgumentParser(
        description='Creating Clustering Info Summary')
    parser.add_argument('params_xml', help='params_xml')
    parser.add_argument('input_clusterinfo_summary',
                        help='Input cluster info summary')
    parser.add_argument('input_network_pairs_file', help='network_pairs_file')
    parser.add_argument('input_library_search_file', help='library_search_file')
    parser.add_argument('output_clusterinfo_summary', help='output file')
    parser.add_argument('output_component_summary',
                        help='output component file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.params_xml))

    all_clusterinfo_list = ming_fileio_library.parse_table_with_headers_object_list(
        args.input_clusterinfo_summary)

    library_ids_dict = load_library_id_dict(args.input_library_search_file)
    nodes_to_component, component_to_nodes = load_pairs_dict(
        args.input_network_pairs_file)

    for cluster in all_clusterinfo_list:
        cluster_index = cluster["cluster index"]
        if cluster_index in nodes_to_component:
            cluster["componentindex"] = nodes_to_component[cluster_index]
            cluster[
                "GNPSLinkout_Network"] = "https://gnps.ucsd.edu/ProteoSAFe/result.jsp?view=network_displayer&componentindex=%s&task=%s&show=true" % (
                    nodes_to_component[cluster_index], param_obj["task"][0])
        else:
            cluster["componentindex"] = "-1"
            cluster["GNPSLinkout_Network"] = 'This Node is a Singleton'

        if cluster_index in library_ids_dict:
            cluster["LibraryID"] = library_ids_dict[cluster_index][
                "Compound_Name"]
            cluster["MQScore"] = library_ids_dict[cluster_index]["MQScore"]
            cluster["SpectrumID"] = library_ids_dict[cluster_index][
                "SpectrumID"]
        else:
            cluster["LibraryID"] = "N/A"
            cluster["MQScore"] = "N/A"
            cluster["SpectrumID"] = "N/A"

    ming_fileio_library.write_list_dict_table_data(
        all_clusterinfo_list, args.output_clusterinfo_summary)

    output_component_list = []

    for componentindex in component_to_nodes:
        output_dict = {}
        output_dict["ComponentIndex"] = componentindex
        output_dict["NodeCount"] = len(component_to_nodes[componentindex])
        output_dict["#Spectra"] = len(component_to_nodes[componentindex])
        all_lib_identifications = []
        for node in component_to_nodes[componentindex]:
            if node in library_ids_dict:
                all_lib_identifications.append(
                    library_ids_dict[node]["Compound_Name"])
        output_dict["AllIDs"] = "!".join(all_lib_identifications)
        output_component_list.append(output_dict)

    ming_fileio_library.write_list_dict_table_data(
        output_component_list, args.output_component_summary)
def main():
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('params_xml', help='params_xml')
    parser.add_argument('consensus_feature_file', help='Consensus Quantification File')
    parser.add_argument('metadata_folder', help='metadata metadata_folder')
    parser.add_argument('mgf_filename', help='mgf_filename')
    parser.add_argument('output_clusterinfo_summary', help='output file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.params_xml))

    task_id = param_obj["task"][0]

    group_to_files_mapping = defaultdict(list)
    attributes_to_groups_mapping = defaultdict(set)

    metadata_files = glob.glob(os.path.join(args.metadata_folder, "*"))
    if len(metadata_files) == 1:
        group_to_files_mapping, attributes_to_groups_mapping = load_group_attribute_mappings(metadata_files[0])

    ROW_NORMALIZATION = "None"
    try:
        ROW_NORMALIZATION = param_obj["QUANT_FILE_NORM"][0]
    except (KeyError, IndexError):
        ROW_NORMALIZATION = "None"

    GROUP_COUNT_AGGREGATE_METHOD = "Sum"
    try:
        GROUP_COUNT_AGGREGATE_METHOD = param_obj["GROUP_COUNT_AGGREGATE_METHOD"][0]
    except (KeyError, IndexError):
        # NOTE: the fallback here is "None", not the "Sum" initialized above.
        GROUP_COUNT_AGGREGATE_METHOD = "None"


    quantification_list = ming_fileio_library.parse_table_with_headers_object_list(args.consensus_feature_file, delimiter=",")
    input_filenames, input_filename_headers = determine_input_files(quantification_list[0].keys())

    ### Filling in Quantification table if it is missing values
    for quantification_object in quantification_list:
        ###Handling empty quantification
        for filename in input_filename_headers:
            try:
                if len(quantification_object[filename]) == 0:
                    #print(filename, quantification_object[filename], quantification_object["row ID"])
                    quantification_object[filename] = 0
            except (KeyError, TypeError):
                # Value is already numeric (len() fails) or the column is absent.
                pass

    print("Number of Features", len(quantification_list))

    #Doing row sum normalization
    if ROW_NORMALIZATION == "RowSum":
        print("ROW SUM NORM")
        for filename_header in input_filename_headers:
            file_quants = [float(quantification_object[filename_header]) for quantification_object in quantification_list]
            for quantification_object in quantification_list:
                quantification_object[filename_header] = float(quantification_object[filename_header]) / sum(file_quants)

    """Loading MS2 Spectra"""
    mgf_collection = ming_spectrum_library.SpectrumCollection(args.mgf_filename)
    mgf_collection.load_from_file()

    clusters_list = []
    for quantification_object in quantification_list:

        cluster_obj = {}
        cluster_obj["cluster index"] = quantification_object["row ID"]
        cluster_obj["precursor mass"] = "{0:.4f}".format(float(quantification_object["row m/z"]))
        cluster_obj["RTConsensus"] = "{0:.4f}".format(float(quantification_object["row retention time"]))

        all_charges = []

        """Checking about the charge of this cluster"""
        try:
            spectrum_object = mgf_collection.scandict[int(cluster_obj["cluster index"])]
            charge = int(spectrum_object.charge)
        except (KeyError, ValueError, TypeError, AttributeError):
            charge = 0

        """Checking if this spectrum has no peaks"""
        # try:
        #     spectrum_object = mgf_collection.scandict[int(cluster_obj["cluster index"])]
        #
        # except:
        #     continue

        all_files = [os.path.basename(filename) for filename in input_filename_headers if float(quantification_object[filename]) > 0]
        abundance_per_file = [(os.path.basename(filename), float(quantification_object[filename])) for filename in input_filename_headers]
        all_abundances = [float(quantification_object[filename]) for filename in input_filename_headers]

        if charge != 0:
            cluster_obj["parent mass"] = "{0:.4f}".format(float(quantification_object["row m/z"]) * charge - charge + 1)
        else:
            cluster_obj["parent mass"] = "{0:.4f}".format(float(quantification_object["row m/z"]))
        cluster_obj["precursor charge"] = charge

        try:
            # NOTE: all_retention_times is never populated in this excerpt,
            # so this always falls back to the consensus RT below.
            cluster_obj["RTMean"] = statistics.mean(all_retention_times)
            cluster_obj["RTStdErr"] = statistics.stdev(all_retention_times)
        except (NameError, statistics.StatisticsError):
            cluster_obj["RTMean"] = cluster_obj["RTConsensus"]
            cluster_obj["RTStdErr"] = 0

        cluster_obj["GNPSLinkout_Cluster"] = 'https://gnps.ucsd.edu/ProteoSAFe/result.jsp?task=%s&view=view_all_clusters_withID#{"main.cluster index_lowerinput":"%s","main.cluster index_upperinput":"%s"}' % (task_id, quantification_object["row ID"], quantification_object["row ID"])
        #cluster_obj["AllFiles"] = "###".join(all_files)

        cluster_obj["sum(precursor intensity)"] = sum(all_abundances)
        cluster_obj["SumPeakIntensity"] = sum(all_abundances)
        cluster_obj["number of spectra"] = len(all_files)
        cluster_obj["UniqueFileSourcesCount"] = len(all_files)

        group_abundances = determine_group_abundances(group_to_files_mapping, abundance_per_file, operation=GROUP_COUNT_AGGREGATE_METHOD)

        default_groups = ["G1", "G2", "G3", "G4", "G5", "G6"]
        for group in group_to_files_mapping:
            group_header = "GNPSGROUP:" + group
            if group in default_groups:
                continue
            cluster_obj[group_header] = group_abundances[group]

        for group in default_groups:
            cluster_obj[group] = group_abundances[group]

        #Writing attributes
        for attribute in attributes_to_groups_mapping:
            groups_to_include = []
            for group in attributes_to_groups_mapping[attribute]:
                if group_abundances[group] > 0.0:
                    groups_to_include.append(group)
            if len(groups_to_include) == 0:
                cluster_obj[attribute] = ""
            else:
                cluster_obj[attribute] = ",".join(groups_to_include)


        """
        Enriching the cluster info with adduct collapsing information
        """
        enrich_adduct_annotations(cluster_obj, quantification_object)


        clusters_list.append(cluster_obj)

    ming_fileio_library.write_list_dict_table_data(clusters_list, args.output_clusterinfo_summary)
def main():
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('json_parameters', help='proteosafe xml parameters')
    parser.add_argument('workflow_parameters', help='output folder for parameters')
    parser.add_argument('library_folder', help='output folder for parameters')
    parser.add_argument('result_folder', help='output folder for parameters')
    parser.add_argument('convert_binary', help='output folder for parameters')
    parser.add_argument('librarysearch_binary', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    parallel_json = json.loads(open(args.json_parameters).read())

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)
    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)

    spectra_files.sort()

    print(spectra_files)
    spectra_files = spectra_files[parallel_json["node_partition"]::parallel_json["total_paritions"]]
    print(spectra_files)

    temp_folder = "temp"
    try:
        os.mkdir(temp_folder)
    except OSError:
        print("folder error")

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except OSError:
        print("folder error")


    list_of_spectrumfiles = chunks(spectra_files, 5)
    parameter_list = []
    for spectrum_files_chunk in list_of_spectrumfiles:
        param_dict = {}
        param_dict["spectra_files"] = spectrum_files_chunk
        param_dict["temp_folder"] = temp_folder
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        param_dict["params_object"] = params_object
        param_dict["library_files"] = library_files

        parameter_list.append(param_dict)

    #for param_dict in parameter_list:
    #    search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(search_wrapper, parameter_list, 5)


    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
        full_result_list += result_list

    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["SpectrumFile"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    ming_fileio_library.write_list_dict_table_data(full_result_list, os.path.join(args.result_folder, str(uuid.uuid4()) + ".tsv"))
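This driver assumes a chunks helper that splits the spectra file list into fixed-size batches; it is not shown on this page. A minimal sketch under that assumption:

def chunks(lst, n):
    # Hypothetical helper: yield successive n-sized slices of lst.
    for i in range(0, len(lst), n):
        yield lst[i:i + n]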
Example #26
0
def main():
    parser = argparse.ArgumentParser(
        description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectra folder')
    parser.add_argument('json_parameters',
                        help='ProteoSAFe parallelism parameters (JSON)')
    parser.add_argument('workflow_parameters',
                        help='ProteoSAFe workflow parameters XML')
    parser.add_argument('library_folder', help='library folder')
    parser.add_argument('result_folder', help='output folder for results')
    parser.add_argument('convert_binary', help='path to the conversion binary')
    parser.add_argument('librarysearch_binary',
                        help='path to the librarysearch binary')
    parser.add_argument('--parallelism',
                        default=1,
                        type=int,
                        help='Parallelism')
    args = parser.parse_args()

    parallel_json = json.loads(open(args.json_parameters).read())

    params_object = ming_proteosafe_library.parse_xml_file(
        open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        params_object)
    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)

    spectra_files.sort()

    print(spectra_files)
    spectra_files = spectra_files[
        parallel_json["node_partition"]::parallel_json["total_paritions"]]
    print(spectra_files)

    temp_folder = "temp"
    try:
        os.mkdir(temp_folder)
    except:
        print("folder error")

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except:
        print("folder error")

    list_of_spectrumfiles = chunks(spectra_files, 5)
    parameter_list = []
    for spectrum_files_chunk in list_of_spectrumfiles:
        param_dict = {}
        param_dict["spectra_files"] = spectrum_files_chunk
        param_dict["temp_folder"] = temp_folder
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        param_dict["params_object"] = params_object
        param_dict["library_files"] = library_files

        parameter_list.append(param_dict)

    #for param_dict in parameter_list:
    #    search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(search_wrapper, parameter_list, 5)
    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(
        tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        result_list = ming_fileio_library.parse_table_with_headers_object_list(
            input_file)
        full_result_list += result_list

    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["SpectrumFile"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    ming_fileio_library.write_list_dict_table_data(
        full_result_list,
        os.path.join(args.result_folder,
                     str(uuid.uuid4()) + ".tsv"))
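The node_partition / total_paritions slice above hands each worker node an interleaved share of the sorted spectra files. A short illustration of the slicing, with hypothetical filenames:

files = ["a.mzML", "b.mzML", "c.mzML", "d.mzML", "e.mzML"]
# Node k of N partitions takes files[k::N]
print(files[0::2])  # ['a.mzML', 'c.mzML', 'e.mzML']
print(files[1::2])  # ['b.mzML', 'd.mzML']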
def main():
    parser = argparse.ArgumentParser(
        description='Creating Clustering Info Summary')
    parser.add_argument('params_xml', help='params_xml')
    parser.add_argument('consensus_feature_file',
                        help='Consensus Quantification File')
    parser.add_argument('metadata_folder', help='metadata folder')
    parser.add_argument('mgf_filename', help='mgf_filename')
    parser.add_argument('output_clusterinfo_summary', help='output file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.params_xml))

    task_id = param_obj["task"][0]

    group_to_files_mapping = defaultdict(list)
    attributes_to_groups_mapping = defaultdict(set)

    metadata_files = glob.glob(os.path.join(args.metadata_folder, "*"))
    if len(metadata_files) == 1:
        group_to_files_mapping, attributes_to_groups_mapping = load_group_attribute_mappings(
            metadata_files[0])

    ROW_NORMALIZATION = "None"
    try:
        ROW_NORMALIZATION = param_obj["QUANT_FILE_NORM"][0]
    except:
        ROW_NORMALIZATION = "None"

    GROUP_COUNT_AGGREGATE_METHOD = "Sum"
    try:
        GROUP_COUNT_AGGREGATE_METHOD = param_obj[
            "GROUP_COUNT_AGGREGATE_METHOD"][0]
    except:
        GROUP_COUNT_AGGREGATE_METHOD = "None"

    quantification_list = ming_fileio_library.parse_table_with_headers_object_list(
        args.consensus_feature_file, delimiter=",")
    input_filenames, input_filename_headers = determine_input_files(
        quantification_list[0].keys())

    ### Filling in Quantification table if it is missing values
    for quantification_object in quantification_list:
        ###Handling empty quantification
        for filename in input_filename_headers:
            try:
                if len(quantification_object[filename]) == 0:
                    #print(filename, quantification_object[filename], quantification_object["row ID"])
                    quantification_object[filename] = 0
            except (KeyError, TypeError):
                # Value is already numeric or the column is missing; leave it.
                pass

    print("Number of Features", len(quantification_list))

    #Doing row sum normalization
    if ROW_NORMALIZATION == "RowSum":
        print("ROW SUM NORM")
        for filename_header in input_filename_headers:
            file_quants = [
                float(quantification_object[filename_header])
                for quantification_object in quantification_list
            ]
            # Hoist the column total out of the inner loop and skip all-zero
            # columns to avoid division by zero.
            column_total = sum(file_quants)
            if column_total == 0:
                continue
            for quantification_object in quantification_list:
                quantification_object[filename_header] = float(
                    quantification_object[filename_header]) / column_total
    """Loading MS2 Spectra"""
    mgf_collection = ming_spectrum_library.SpectrumCollection(
        args.mgf_filename)
    mgf_collection.load_from_file()
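    # mgf_collection.scandict maps scan number -> spectrum object; it is used
    # below to look up each cluster's consensus MS2 spectrum and its charge.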

    clusters_list = []
    for quantification_object in quantification_list:

        cluster_obj = {}
        cluster_obj["cluster index"] = quantification_object["row ID"]
        cluster_obj["precursor mass"] = "{0:.4f}".format(
            float(quantification_object["row m/z"]))
        cluster_obj["RTConsensus"] = "{0:.4f}".format(
            float(quantification_object["row retention time"]))

        """Checking the charge of this cluster"""
        try:
            spectrum_object = mgf_collection.scandict[int(
                cluster_obj["cluster index"])]
            charge = int(spectrum_object.charge)
        except (KeyError, ValueError, AttributeError):
            charge = 0
        """Checking if this spectrum has no peaks"""
        # try:
        #     spectrum_object = mgf_collection.scandict[int(cluster_obj["cluster index"])]
        #
        # except:
        #     continue

        all_files = [
            os.path.basename(filename) for filename in input_filename_headers
            if float(quantification_object[filename]) > 0
        ]
        abundance_per_file = [(os.path.basename(filename),
                               float(quantification_object[filename]))
                              for filename in input_filename_headers]
        all_abundances = [
            float(quantification_object[filename])
            for filename in input_filename_headers
        ]

        # Parent mass: m/z * z minus the (z - 1) charging protons, with the
        # proton mass approximated as 1 Da.
        if charge != 0:
            cluster_obj["parent mass"] = "{0:.4f}".format(
                float(quantification_object["row m/z"]) * charge - charge + 1)
        else:
            cluster_obj["parent mass"] = "{0:.4f}".format(
                float(quantification_object["row m/z"]))
        cluster_obj["precursor charge"] = charge

        try:
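            # all_retention_times is never built in this snippet, so this
            # always falls through to the consensus-RT fallback below.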
            cluster_obj["RTMean"] = statistics.mean(all_retention_times)
            cluster_obj["RTStdErr"] = statistics.stdev(all_retention_times)
        except:
            cluster_obj["RTMean"] = cluster_obj["RTConsensus"]
            cluster_obj["RTStdErr"] = 0

        cluster_obj[
            "GNPSLinkout_Cluster"] = 'https://gnps.ucsd.edu/ProteoSAFe/result.jsp?task=%s&view=view_all_clusters_withID&show=true#{"main.cluster index_lowerinput":"%s","main.cluster index_upperinput":"%s"}' % (
                task_id, quantification_object["row ID"],
                quantification_object["row ID"])
        #cluster_obj["AllFiles"] = "###".join(all_files)

        cluster_obj["sum(precursor intensity)"] = sum(all_abundances)
        cluster_obj["SumPeakIntensity"] = sum(all_abundances)
        cluster_obj["number of spectra"] = len(all_files)
        cluster_obj["UniqueFileSourcesCount"] = len(all_files)

        group_abundances = determine_group_abundances(
            group_to_files_mapping,
            abundance_per_file,
            operation=GROUP_COUNT_AGGREGATE_METHOD)

        default_groups = ["G1", "G2", "G3", "G4", "G5", "G6"]
        for group in group_to_files_mapping:
            group_header = "GNPSGROUP:" + group
            if group in default_groups:
                continue
            cluster_obj[group_header] = group_abundances[group]

        for group in default_groups:
            cluster_obj[group] = group_abundances[group]

        #Writing attributes
        for attribute in attributes_to_groups_mapping:
            groups_to_include = []
            for group in attributes_to_groups_mapping[attribute]:
                if group_abundances[group] > 0.0:
                    groups_to_include.append(group)
            if len(groups_to_include) == 0:
                cluster_obj[attribute] = ""
            else:
                cluster_obj[attribute] = ",".join(groups_to_include)
        """
        Enriching the cluster info with adduct collapsing information
        """
        enrich_adduct_annotations(cluster_obj, quantification_object)

        clusters_list.append(cluster_obj)

    ming_fileio_library.write_list_dict_table_data(
        clusters_list, args.output_clusterinfo_summary)
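The summary also depends on a determine_input_files helper to pick the per-file abundance columns out of the consensus feature table header. A minimal sketch, assuming MZmine-style "<filename> Peak area" column names (the suffix is an assumption; the original helper is not shown here):

def determine_input_files(headers):
    # Hypothetical sketch: split the header into (filenames, abundance column headers).
    filename_headers = [h for h in headers if h.endswith("Peak area")]
    filenames = [h.replace("Peak area", "").strip() for h in filename_headers]
    return filenames, filename_headers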