def main():
    # Trace every matched dataset spectrum back to its source file(s) on the
    # filesystem, writing one table of source files and one of all matches.
    # argv: [1] match results table, [2] unique-files output, [3] all-matches output
    results_filename = sys.argv[1]
    output_filename_unique_files = sys.argv[2]
    output_filename_all_matches = sys.argv[3]

    all_datasets = ming_gnps_library.get_all_datasets(gnps_only=True)
    all_matches = ming_fileio_library.parse_table_with_headers_object_list(
        results_filename)

    output_source_list = []
    output_match_list = []

    # Probe the metadata server once up front; per-file metadata enrichment is
    # only attempted when it is reachable.
    MetaDataServerStatus = test_metadata_server()

    for match_object in all_matches:
        dataset_accession = match_object["dataset_id"]
        dataset_scan = match_object["dataset_scan"]
        #output_source_list += trace_filename(all_datasets, dataset_accession, dataset_scan)
        current_filelist, current_match_list = trace_filename_filesystem(
            all_datasets,
            dataset_accession,
            dataset_scan,
            enrichmetadata=MetaDataServerStatus)
        output_source_list += current_filelist
        output_match_list += current_match_list

    ming_fileio_library.write_list_dict_table_data(
        output_source_list, output_filename_unique_files)
    ming_fileio_library.write_list_dict_table_data(
        output_match_list, output_filename_all_matches)
def main():
    """Build a compound-by-file 1/0 presence table from library identifications."""
    parser = argparse.ArgumentParser(description='Creates alan table')
    parser.add_argument('input_identifications_filename', help='input_identifications_filename')
    parser.add_argument('output_filename', help='output_filename')
    args = parser.parse_args()
    print(args.input_identifications_filename)

    data_list = ming_fileio_library.parse_table_with_headers_object_list(args.input_identifications_filename)

    # Map each library compound to the set of files it was identified in,
    # while collecting the universe of all files seen.
    all_filenames = set()
    compounds_to_files = defaultdict(set)
    for row in data_list:
        filename = "f." + row["full_CCMS_path"]
        all_filenames.add(filename)
        compounds_to_files[row["Compound_Name"]].add(filename)

    # One output row per compound: its files flagged "1" first, then "0"
    # for every other file observed anywhere.
    output_list = []
    for compound, present_files in compounds_to_files.items():
        row = {"LibraryID": compound, "TotalFiles": len(present_files)}
        for filename in present_files:
            row[filename] = "1"
        for filename in all_filenames:
            if filename not in row:
                row[filename] = "0"
        output_list.append(row)

    ming_fileio_library.write_list_dict_table_data(output_list, args.output_filename)
def main():
    """Keep only the single best-scoring identification per compound and per spectrum."""
    input_library_identifications = sys.argv[1]
    output_library_identifications = sys.argv[2]

    annotations_list = ming_fileio_library.parse_table_with_headers_object_list(input_library_identifications)

    # Highest MQScore first, so the first occurrence of a compound/spectrum wins.
    annotations_list.sort(key=lambda annotation: float(annotation["MQScore"]), reverse=True)

    seen_compounds = set()
    seen_spectra = set()
    output_annotation_list = []
    for annotation in annotations_list:
        compound_name = annotation["Compound_Name"]
        spectrum_identifier = annotation["#Scan#"] + ":" + annotation["SpectrumFile"]
        if compound_name in seen_compounds or spectrum_identifier in seen_spectra:
            continue
        print(compound_name, spectrum_identifier)
        output_annotation_list.append(annotation)
        seen_compounds.add(compound_name)
        seen_spectra.add(spectrum_identifier)

    ming_fileio_library.write_list_dict_table_data(output_annotation_list, output_library_identifications)
def main():
    """Deduplicate library identifications, retaining the top MQScore hit
    for each compound name and for each (scan, spectrum file) pair."""
    input_library_identifications = sys.argv[1]
    output_library_identifications = sys.argv[2]

    annotations_list = ming_fileio_library.parse_table_with_headers_object_list(
        input_library_identifications)

    # Rank by descending MQScore; earlier rows take precedence below.
    ranked = sorted(annotations_list,
                    key=lambda row: float(row["MQScore"]),
                    reverse=True)

    already_identified_compounds = set()
    already_identified_spectra = set()
    output_annotation_list = []
    for row in ranked:
        name = row["Compound_Name"]
        spectrum_key = row["#Scan#"] + ":" + row["SpectrumFile"]
        if name in already_identified_compounds:
            continue
        if spectrum_key in already_identified_spectra:
            continue
        print(name, spectrum_key)
        output_annotation_list.append(row)
        already_identified_compounds.add(name)
        already_identified_spectra.add(spectrum_key)

    ming_fileio_library.write_list_dict_table_data(
        output_annotation_list, output_library_identifications)
def main():
    """Annotate per-file statistics with MS2 identification counts and rates."""
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('filestats', help='filestats')
    parser.add_argument('dbresults', help='dbresults')
    parser.add_argument('output_filestats', help='output folder for parameters')
    args = parser.parse_args()

    # filename -> set of identified scan numbers (set dedupes repeated hits
    # on the same scan).
    identified_spectra_in_filename = defaultdict(set)
    all_identifications = ming_fileio_library.parse_table_with_headers_object_list(args.dbresults)
    for identification in all_identifications:
        filename = identification["full_CCMS_path"]
        scan = identification["#Scan#"]
        identified_spectra_in_filename[filename].add(scan)
    print(identified_spectra_in_filename)

    output_list = []
    file_summaries = ming_fileio_library.parse_table_with_headers_object_list(args.filestats)
    for file_summary in file_summaries:
        filename = file_summary["full_CCMS_path"]
        count = len(identified_spectra_in_filename[filename])
        file_summary["identified_ms2"] = count
        # Fix: the original bare `except:` hid every failure; only a missing,
        # non-numeric, or zero MS2 total should fall back to 0.
        try:
            percent_identified = float(count) / float(file_summary["MS2s"])
        except (KeyError, ValueError, ZeroDivisionError):
            percent_identified = 0
        file_summary["percent_identified"] = percent_identified
        output_list.append(file_summary)

    ming_fileio_library.write_list_dict_table_data(output_list, args.output_filestats)
def main(): parser = argparse.ArgumentParser(description='Creates alan table') parser.add_argument('input_clusterinfosummary', help='input_clusterinfosummary') parser.add_argument('output_filename', help='output_filename') args = parser.parse_args() print(args.input_clusterinfosummary) data_list = ming_fileio_library.parse_table_with_headers_object_list( args.input_clusterinfosummary) all_filenames = [] for data_object in data_list: if "UniqueFileSources" in data_object: all_filenames += data_object["UniqueFileSources"].split("|") else: filenames = list( set([ filename.split(":")[0] for filename in data_object["AllFiles"].split("###") if len(filename) > 2 ])) all_filenames += filenames all_filenames = list(set(all_filenames)) compounds_to_files = defaultdict(list) for data_object in data_list: filenames = [] if "UniqueFileSources" in data_object: filenames = data_object["UniqueFileSources"].split("|") else: filenames = list( set([ filename.split(":")[0] for filename in data_object["AllFiles"].split("###") if len(filename) > 2 ])) compound_name = data_object["LibraryID"] compounds_to_files[compound_name] += filenames output_list = [] for compound_name in compounds_to_files: output_dict = {} output_dict["LibraryID"] = compound_name output_dict["TotalFiles"] = len(compounds_to_files[compound_name]) for filename in compounds_to_files[compound_name]: output_dict[filename] = "1" for filename in all_filenames: if not filename in output_dict: output_dict[filename] = "0" output_list.append(output_dict) ming_fileio_library.write_list_dict_table_data(output_list, args.output_filename)
def main():
    # Enrich a cluster info summary with network component membership and
    # library identifications, then write a per-cluster table and a
    # per-component rollup table.
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('params_xml', help='params_xml')
    parser.add_argument('input_clusterinfo_summary', help='Input cluster info summary')
    parser.add_argument('input_network_pairs_file', help='network_pairs_file')
    parser.add_argument('input_library_search_file', help='network_pairs_file')
    parser.add_argument('output_clusterinfo_summary', help='output file')
    parser.add_argument('output_component_summary', help='output component file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.params_xml))
    all_clusterinfo_list = ming_fileio_library.parse_table_with_headers_object_list(args.input_clusterinfo_summary)
    library_ids_dict = load_library_id_dict(args.input_library_search_file)
    nodes_to_component, component_to_nodes = load_pairs_dict(args.input_network_pairs_file)

    for cluster in all_clusterinfo_list:
        cluster_index = cluster["cluster index"]
        if cluster_index in nodes_to_component:
            # Networked cluster: link to the component view of its network.
            cluster["componentindex"] = nodes_to_component[cluster_index]
            cluster["GNPSLinkout_Network"] = "https://gnps.ucsd.edu/ProteoSAFe/result.jsp?view=network_displayer&componentindex=%s&task=%s" % (nodes_to_component[cluster_index], param_obj["task"][0])
        else:
            # Singleton: component -1 and a link to the cluster browser,
            # pre-filtered to exactly this cluster index.
            cluster["componentindex"] = "-1"
            cluster["GNPSLinkout_Network"] = 'https://gnps.ucsd.edu/ProteoSAFe/result.jsp?task=%s&view=view_all_clusters_withID#{"main.cluster index_lowerinput":"%s","main.cluster index_upperinput":"%s"}' % (param_obj["task"][0], cluster_index, cluster_index)
        # Attach the library identification, or "N/A" placeholders.
        if cluster_index in library_ids_dict:
            cluster["LibraryID"] = library_ids_dict[cluster_index]["Compound_Name"]
            cluster["MQScore"] = library_ids_dict[cluster_index]["MQScore"]
            cluster["SpectrumID"] = library_ids_dict[cluster_index]["SpectrumID"]
        else:
            cluster["LibraryID"] = "N/A"
            cluster["MQScore"] = "N/A"
            cluster["SpectrumID"] = "N/A"

    ming_fileio_library.write_list_dict_table_data(all_clusterinfo_list, args.output_clusterinfo_summary)

    # Per-component rollup: node count and all library IDs found in the component.
    output_component_list = []
    for componentindex in component_to_nodes:
        output_dict = {}
        output_dict["ComponentIndex"] = componentindex
        output_dict["NodeCount"] = len(component_to_nodes[componentindex])
        output_dict["#Spectra"] = len(component_to_nodes[componentindex])
        all_lib_identifications = []
        for node in component_to_nodes[componentindex]:
            if node in library_ids_dict:
                all_lib_identifications.append(library_ids_dict[node]["Compound_Name"])
        output_dict["AllIDs"] = "!".join(all_lib_identifications)
        output_component_list.append(output_dict)
    ming_fileio_library.write_list_dict_table_data(output_component_list, args.output_component_summary)
def generate_clustersummary(input_integrals_filename, output_clustersummary_filename):
    """Write a minimal cluster summary from an integrals CSV.

    The first CSV row's columns (after the leading label column) become one
    output row each, numbered sequentially as "cluster index" with the raw
    header stored as "RTMean".

    Fix: the input file handle was opened and never closed; now managed
    with a context manager.
    """
    with open(input_integrals_filename) as integrals_file:
        header_order = integrals_file.readline().rstrip().split(",")[1:]

    output_list = []
    # enumerate replaces the manual scan_number counter.
    for scan_number, header in enumerate(header_order, start=1):
        output_dict = {}
        output_dict["cluster index"] = scan_number
        output_dict["RTMean"] = header
        output_list.append(output_dict)

    ming_fileio_library.write_list_dict_table_data(output_list, output_clustersummary_filename)
def main():
    """Enrich each result row with file-level metadata and write the table.

    Fix: the original discarded the return value of
    get_metadata_information_per_filename and never appended to
    output_results, so the output table was always empty. The enrichment
    shape ("|"-joined metadata list) follows the sibling usage in
    match_unclustered/match_clustered.
    """
    results_filename = sys.argv[1]
    output_filename = sys.argv[2]

    input_results = ming_fileio_library.parse_table_with_headers_object_list(
        results_filename)

    output_results = []
    #Check if server is up
    for result_object in input_results:
        filename = result_object["filename"]
        metadata_list = get_metadata_information_per_filename(filename)
        result_object["metadata"] = "|".join(metadata_list)
        output_results.append(result_object)

    ming_fileio_library.write_list_dict_table_data(output_results, output_filename)
def main():
    """Write a cluster info summary (scan number + retention time) from an MGF."""
    parameters_filename = sys.argv[1]
    input_mgf_filename = sys.argv[2]
    output_clusterinfosummary = sys.argv[3]

    spectrum_collection = ming_spectrum_library.SpectrumCollection(input_mgf_filename)
    spectrum_collection.load_from_file()

    # One row per spectrum in the collection.
    output_list = [
        {"cluster index": spectrum.scan, "RTMean": spectrum.retention_time}
        for spectrum in spectrum_collection.spectrum_list
    ]

    ming_fileio_library.write_list_dict_table_data(output_list, output_clusterinfosummary)
def main():
    """Concatenate every intermediate result table in a folder into one table.

    Fix: removed the unused all_protein_stats accumulator.
    """
    input_intermediate_folder = sys.argv[1]
    output_filename = sys.argv[2]

    #Creating a command line for each partition
    all_intermediate_files = ming_fileio_library.list_files_in_dir(
        input_intermediate_folder)

    output_list = []
    for parallel_output_filename in all_intermediate_files:
        output_list += ming_fileio_library.parse_table_with_headers_object_list(
            parallel_output_filename)

    ming_fileio_library.write_list_dict_table_data(output_list, output_filename)
def output_graph_with_headers(G, filename):
    """Write network edges to a table, numbering connected components from 1."""
    output_list = []
    component_index = 0
    for component in nx.connected_components(G):
        component_index += 1
        for edge in get_edges_of_component(G, component):
            node_a, node_b, attributes = edge[0], edge[1], edge[2]
            # Keep the numerically smaller cluster id in column 1 so edge
            # orientation is stable.
            if int(node_a) >= int(node_b):
                node_a, node_b = node_b, node_a
            output_list.append({
                "CLUSTERID1": node_a,
                "CLUSTERID2": node_b,
                "DeltaMZ": attributes["mass_difference"],
                "Cosine": attributes["cosine_score"],
                "ComponentIndex": component_index,
            })
    ming_fileio_library.write_list_dict_table_data(output_list, filename)
def output_graph_with_headers(G, filename):
    """Dump every edge of G to a table with 1-based connected-component indices."""
    rows = []
    for component_number, component in enumerate(nx.connected_components(G), start=1):
        for edge in get_edges_of_component(G, component):
            # Column 1 always carries the smaller cluster id.
            if int(edge[0]) < int(edge[1]):
                first, second = edge[0], edge[1]
            else:
                first, second = edge[1], edge[0]
            row = {}
            row["CLUSTERID1"] = first
            row["CLUSTERID2"] = second
            row["DeltaMZ"] = edge[2]["mass_difference"]
            row["Cosine"] = edge[2]["cosine_score"]
            row["ComponentIndex"] = component_number
            rows.append(row)
    ming_fileio_library.write_list_dict_table_data(rows, filename)
def main():
    """Produce a compound-vs-file occurrence matrix ("alan table") from identifications."""
    parser = argparse.ArgumentParser(description='Creates alan table')
    parser.add_argument('input_identifications_filename', help='input_identifications_filename')
    parser.add_argument('output_filename', help='output_filename')
    args = parser.parse_args()
    print(args.input_identifications_filename)

    identifications = ming_fileio_library.parse_table_with_headers_object_list(
        args.input_identifications_filename)

    # Collect the file universe and each compound's file set in one pass.
    all_filenames = set()
    compounds_to_files = defaultdict(set)
    for identification in identifications:
        query_filename = "f." + identification["full_CCMS_path"]
        all_filenames.add(query_filename)
        compounds_to_files[identification["Compound_Name"]].add(query_filename)

    output_list = []
    for compound_name, present_files in compounds_to_files.items():
        row = {"LibraryID": compound_name, "TotalFiles": len(present_files)}
        for query_filename in present_files:
            row[query_filename] = "1"
        for query_filename in all_filenames:
            if query_filename not in row:
                row[query_filename] = "0"
        output_list.append(row)

    ming_fileio_library.write_list_dict_table_data(output_list, args.output_filename)
def main():
    # Build the enriched cluster info summary: per-group spectrum counts,
    # attribute membership columns, file/RT statistics, network component
    # membership, and library identifications.
    parser = argparse.ArgumentParser(
        description='Creates enriched cluster info summary')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('input_clusterinfo_file', help='input_clusterinfo_file')
    parser.add_argument('input_clusterinfosummary_file', help='input_clusterinfosummary_file')
    parser.add_argument('input_group_mapping_filename', help='input_group_mapping_filename')
    parser.add_argument('input_attribute_mapping_filename', help='input_attribute_mapping_filename')
    parser.add_argument('input_networking_pairs', help='input_networking_pairs')
    parser.add_argument('input_library_search', help='input_library_search')
    parser.add_argument('output_clusterinfosummary_filename', help='output_clusterinfosummary_filename')
    args = parser.parse_args()

    """Loading group filenames"""
    group_to_files, files_to_groups = load_group_mapping(
        args.input_group_mapping_filename)
    print("Loaded Group Mapping")

    cluster_summary_list = ming_fileio_library.parse_table_with_headers_object_list(
        args.input_clusterinfosummary_file)
    print("Loaded Cluster Summary")

    attribute_to_groups = load_attribute_mapping(
        args.input_attribute_mapping_filename)

    params_object = ming_proteosafe_library.parse_xml_file(open(
        args.param_xml))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        params_object)
    CLUSTER_MIN_SIZE = int(params_object["CLUSTER_MIN_SIZE"][0])
    RUN_MSCLUSTER = params_object["RUN_MSCLUSTER"][0]

    #Calculating the spectrum counts per group
    cluster_to_group_counts = defaultdict(lambda: defaultdict(lambda: 0))
    cluster_to_files = defaultdict(set)
    cluster_to_RT = defaultdict(list)
    line_count = 0
    # Stream the (potentially large) tab-separated clusterinfo file line by line.
    for line in open(args.input_clusterinfo_file):
        line_count += 1
        if line_count == 1:
            # Skip the header row.
            continue
        if line_count % 10000 == 0:
            print(line_count)
        splits = line.rstrip().split("\t")
        cluster_index = splits[0]
        filename = os.path.basename(splits[1])
        # NOTE(review): column 6 is assumed to hold the retention time —
        # confirm against the writer of input_clusterinfo_file.
        rt = float(splits[6])
        group_membership = files_to_groups[filename]
        cluster_to_files[cluster_index].add(filename)
        cluster_to_RT[cluster_index].append(rt)
        for group in group_membership:
            cluster_to_group_counts[cluster_index][group] += 1

    # Only filter out small clusters when MSCluster actually ran.
    if RUN_MSCLUSTER == "on":
        cluster_summary_list = filter_clusters_based_on_cluster_size(
            cluster_summary_list, CLUSTER_MIN_SIZE)
    print(len(cluster_summary_list))

    print("Setting up grouping", len(group_to_files.keys()))
    for cluster_summary_object in cluster_summary_list:
        cluster_index = cluster_summary_object["cluster index"]
        # One column per group: spectrum count for this cluster in that group.
        for group in group_to_files:
            group_count = 0
            if group in cluster_to_group_counts[cluster_index]:
                group_count = cluster_to_group_counts[cluster_index][group]
            cluster_summary_object[group] = group_count
        # Attribute columns list the groups with non-zero counts, with the
        # "GNPSGROUP:" prefix stripped.
        for attribute in attribute_to_groups:
            groups_to_include = []
            for group in attribute_to_groups[attribute]:
                if group in cluster_summary_object:
                    if cluster_summary_object[group] > 0:
                        groups_to_include.append(group)
            cluster_summary_object[attribute] = ",".join(
                groups_to_include).replace("GNPSGROUP:", "")

    print("Default Attributes")
    calculate_default_attributes(cluster_summary_list, group_to_files.keys())

    print("calculate_cluster_file_stats")
    calculate_cluster_file_stats(cluster_summary_list, cluster_to_files,
                                 mangled_mapping)

    print("rt stats")
    calculate_rt_stats(cluster_summary_list, cluster_to_RT)

    print("populate_network_component")
    populate_network_component(cluster_summary_list,
                               args.input_networking_pairs)

    print("calculate_ancillary_information")
    calculate_ancillary_information(cluster_summary_list,
                                    params_object["task"][0])

    print("populate_network_identifications")
    populate_network_identifications(cluster_summary_list,
                                     args.input_library_search)

    ming_fileio_library.write_list_dict_table_data(
        cluster_summary_list, args.output_clusterinfosummary_filename)
def main():
    """Collapse identifications to one best-scoring row per compound,
    annotated with RT / m/z / ppm-error summary statistics."""
    input_filename = sys.argv[1]
    output_tsv = sys.argv[2]

    results_list = ming_fileio_library.parse_table_with_headers_object_list(input_filename)

    # Group all hits by compound name.
    results_by_compound_name = defaultdict(list)
    for result in results_list:
        results_by_compound_name[result["Compound_Name"]].append(result)

    output_results = []
    for compound_name, compound_results in results_by_compound_name.items():
        # Best hit = highest MQScore (first on ties, matching stable sort).
        best_result = max(compound_results, key=lambda result: float(result["MQScore"]))

        all_RTs = [float(result["RT_Query"]) for result in compound_results]
        all_MZs = [float(result["SpecMZ"]) for result in compound_results]
        all_MZ_ppmerror = [float(result["MZErrorPPM"]) for result in compound_results]

        # Spread statistics require at least two observations.
        if len(all_RTs) > 1:
            rt_stdev = statistics.stdev(all_RTs)
            mz_stdev = statistics.stdev(all_MZs)
            ppmerror_stdev = statistics.stdev(all_MZ_ppmerror)
        else:
            rt_stdev = 0.0
            mz_stdev = 0.0
            ppmerror_stdev = 0.0

        best_result["rt_mean"] = statistics.mean(all_RTs)
        best_result["rt_median"] = statistics.median(all_RTs)
        best_result["mz_mean"] = statistics.mean(all_MZs)
        best_result["mz_ppm_mean"] = statistics.mean(all_MZ_ppmerror)
        best_result["rt_max"] = max(all_RTs)
        best_result["rt_min"] = min(all_RTs)
        best_result["mz_max"] = max(all_MZs)
        best_result["mz_min"] = min(all_MZs)
        best_result["rt_stdev"] = rt_stdev
        best_result["mz_stdev"] = mz_stdev
        best_result["ppmerror_stdev"] = ppmerror_stdev
        best_result["number_spectra"] = len(all_RTs)
        output_results.append(best_result)

    ming_fileio_library.write_list_dict_table_data(output_results, output_tsv)
def main():
    # Build the enriched cluster info summary: per-group spectrum counts,
    # attribute membership columns, file/RT statistics, ancillary info,
    # network component membership, and library identifications.
    parser = argparse.ArgumentParser(description='Creates enriched cluster info summary')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('input_clusterinfo_file', help='input_clusterinfo_file')
    parser.add_argument('input_clusterinfosummary_file', help='input_clusterinfosummary_file')
    parser.add_argument('input_group_mapping_filename', help='input_group_mapping_filename')
    parser.add_argument('input_attribute_mapping_filename', help='input_attribute_mapping_filename')
    parser.add_argument('input_networking_pairs', help='input_networking_pairs')
    parser.add_argument('input_library_search', help='input_library_search')
    parser.add_argument('output_clusterinfosummary_filename', help='output_clusterinfosummary_filename')
    args = parser.parse_args()

    """Loading group filenames"""
    group_to_files, files_to_groups = load_group_mapping(args.input_group_mapping_filename)
    print("Loaded Group Mapping")

    cluster_summary_list = ming_fileio_library.parse_table_with_headers_object_list(args.input_clusterinfosummary_file)
    print("Loaded Cluster Summary")

    attribute_to_groups = load_attribute_mapping(args.input_attribute_mapping_filename)

    params_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)
    CLUSTER_MIN_SIZE = int(params_object["CLUSTER_MIN_SIZE"][0])
    RUN_MSCLUSTER = params_object["RUN_MSCLUSTER"][0]

    #Calculating the spectrum counts per group
    cluster_to_group_counts = defaultdict(lambda: defaultdict(lambda: 0))
    cluster_to_files = defaultdict(set)
    cluster_to_RT = defaultdict(list)
    line_count = 0
    # Stream the (potentially large) tab-separated clusterinfo file line by line.
    for line in open(args.input_clusterinfo_file):
        line_count += 1
        if line_count == 1:
            # Skip the header row.
            continue
        if line_count % 10000 == 0:
            print(line_count)
        splits = line.rstrip().split("\t")
        cluster_index = splits[0]
        filename = os.path.basename(splits[1])
        # NOTE(review): column 6 is assumed to hold the retention time —
        # confirm against the writer of input_clusterinfo_file.
        rt = float(splits[6])
        group_membership = files_to_groups[filename]
        cluster_to_files[cluster_index].add(filename)
        cluster_to_RT[cluster_index].append(rt)
        for group in group_membership:
            cluster_to_group_counts[cluster_index][group] += 1

    # Only filter out small clusters when MSCluster actually ran.
    if RUN_MSCLUSTER == "on":
        cluster_summary_list = filter_clusters_based_on_cluster_size(cluster_summary_list, CLUSTER_MIN_SIZE)
    print(len(cluster_summary_list))

    print("Setting up grouping", len(group_to_files.keys()))
    for cluster_summary_object in cluster_summary_list:
        cluster_index = cluster_summary_object["cluster index"]
        # One column per group: spectrum count for this cluster in that group.
        for group in group_to_files:
            group_count = 0
            if group in cluster_to_group_counts[cluster_index]:
                group_count = cluster_to_group_counts[cluster_index][group]
            cluster_summary_object[group] = group_count
        # Attribute columns list the groups with non-zero counts, with the
        # "GNPSGROUP:" prefix stripped.
        for attribute in attribute_to_groups:
            groups_to_include = []
            for group in attribute_to_groups[attribute]:
                if group in cluster_summary_object:
                    if cluster_summary_object[group] > 0:
                        groups_to_include.append(group)
            cluster_summary_object[attribute] = ",".join(groups_to_include).replace("GNPSGROUP:", "")

    print("Default Attributes")
    calculate_default_attributes(cluster_summary_list, group_to_files.keys())

    print("calculate_cluster_file_stats")
    calculate_cluster_file_stats(cluster_summary_list, cluster_to_files, mangled_mapping)

    print("rt stats")
    calculate_rt_stats(cluster_summary_list, cluster_to_RT)

    print("calculate_ancillary_information")
    calculate_ancillary_information(cluster_summary_list, params_object["task"][0])

    print("populate_network_component")
    populate_network_component(cluster_summary_list, args.input_networking_pairs)

    print("populate_network_identifications")
    populate_network_identifications(cluster_summary_list, args.input_library_search)

    ming_fileio_library.write_list_dict_table_data(cluster_summary_list, args.output_clusterinfosummary_filename)
def main():
    # Prepare a QIIME/Emperor-compatible metadata table from either the
    # user-supplied metadata file or (as a fallback) the mangled input
    # filenames, and write a redirect HTML page for the Emperor viewer.
    parser = argparse.ArgumentParser(description='Modifying script')
    parser.add_argument('param_xml', help='metadata_folder')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_metadata_table', help='output_metadata_table')
    parser.add_argument('output_view_emporer', help='output_metadata_table')
    args = parser.parse_args()

    param_object = ming_proteosafe_library.parse_xml_file(
        open(args.param_xml, "r"))

    """Outputting html"""
    from urllib.parse import urlencode, quote_plus
    # Links point back at this task's biom and metadata result files.
    parameters_for_qiime = {
        'biom':
        'http://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=biom_output/networking_quant.biom'
        % (param_object["task"][0]),
        'metadata':
        'http://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=metadata_for_qiime/metadata_for_qiime.txt'
        % (param_object["task"][0])
    }
    # The output "view" page is just a client-side redirect to Emperor.
    output_html_file = open(args.output_view_emporer, "w")
    output_html_file.write("<script>\n")
    output_html_file.write(
        'window.location.replace("https://mingwangbeta.ucsd.edu/emperor?%s")\n'
        % urlencode(parameters_for_qiime))
    output_html_file.write("</script>\n")
    output_html_file.close()

    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(
        param_object)
    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(
        args.metadata_folder)

    object_list = []
    if len(metadata_files_in_folder) != 1:
        # No single metadata file supplied: synthesize one row per mangled
        # spectrum input file ("spec" prefix) instead.
        for real_name in reverse_file_mangling:
            mangled_name = reverse_file_mangling[real_name]
            if mangled_name.find("spec") == -1:
                continue
            object_list.append({"filename": real_name})
        #open(args.output_metadata_table, "w").write("NO OUTPUT")
        #open(args.output_view_emporer, "w").write("Please Include Metadata File")
        #exit(0)
    else:
        object_list = ming_fileio_library.parse_table_with_headers_object_list(
            metadata_files_in_folder[0])
        if len(object_list) == 0:
            # Metadata file present but empty: same filename-based fallback.
            for real_name in reverse_file_mangling:
                mangled_name = reverse_file_mangling[real_name]
                if mangled_name.find("spec") == -1:
                    continue
                object_list.append({"filename": real_name})
            #open(args.output_metadata_table, "w").write("NO OUTPUT")
            #open(args.output_view_emporer, "w").write("Please Include Non Empty Metadata File")
            #exit(0)

    #Writing headers
    # QIIME mapping files require these three columns first.
    header_list = ["#SampleID", "BarcodeSequence", "LinkerPrimerSequence"]
    for key in object_list[0]:
        if not key in header_list:
            header_list.append(key)
    header_list.append("ATTRIBUTE_GNPSDefaultGroup")

    for metadata_object in object_list:
        # Fill required QIIME columns with placeholders when absent.
        if not "#SampleID" in metadata_object:
            metadata_object[
                "#SampleID"] = ming_fileio_library.get_filename_without_extension(
                    metadata_object["filename"])
        if not "BarcodeSequence" in metadata_object:
            metadata_object["BarcodeSequence"] = "GATACA"
        if not "LinkerPrimerSequence" in metadata_object:
            metadata_object["LinkerPrimerSequence"] = "GATACA"
        # Map the mangled upload prefix (spec-, spectwo-, ...) to the default
        # GNPS group G1..G6.
        mangled_name = reverse_file_mangling[metadata_object["filename"]]
        if mangled_name.find("spec-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G1"
        elif mangled_name.find("spectwo-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G2"
        elif mangled_name.find("specthree-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G3"
        elif mangled_name.find("specfour-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G4"
        elif mangled_name.find("specfive-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G5"
        elif mangled_name.find("specsix-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G6"

    ming_fileio_library.write_list_dict_table_data(object_list,
                                                   args.output_metadata_table,
                                                   header_list)
def match_unclustered(match_parameters, spectrum_collection, dataset_dict,
                      all_datasets, output_matches_filename,
                      output_filename_unique_files,
                      output_filename_all_matches):
    # For every dataset with spectral matches, write three tables: a
    # per-dataset top-hit summary, the unique source files, and every
    # individual match.
    MetaDataServerStatus = trace_to_single_file.test_metadata_server()

    all_matches_by_dataset = finding_matches_in_public_data(
        spectrum_collection, all_datasets, match_parameters)

    dataset_matches_output_list = []
    output_filename_unique_files_list = []
    output_filename_all_matches_list = []

    for dataset in all_matches_by_dataset:
        #For each dataset, lets try to find the clustering information
        if len(all_matches_by_dataset[dataset]["matches"]) == 0:
            continue

        # Best match for this dataset by cosine score.
        top_match = sorted(all_matches_by_dataset[dataset]["matches"],
                           key=lambda match: match["cosine"],
                           reverse=True)[0]

        output_dict = {}
        output_dict['specs_filename'] = "specs_ms.mgf"
        output_dict['specs_scan'] = top_match["queryscan"]
        output_dict['dataset_id'] = dataset_dict[dataset]["dataset"]
        output_dict['dataset_title'] = dataset_dict[dataset]["title"]
        # Flatten whitespace so the description stays on one TSV line.
        output_dict['dataset_description'] = dataset_dict[dataset][
            "description"].replace("\n", "").replace("\t", "")
        output_dict['dataset_organisms'] = dataset_dict[dataset][
            "species"].replace(";", "!")
        output_dict['dataset_filename'] = top_match["filename"]
        output_dict['dataset_scan'] = top_match["scan"]
        output_dict['score'] = top_match["cosine"]
        output_dict['matchedpeaks'] = top_match["matchedpeaks"]
        output_dict['mzerror'] = top_match["mzerror"]
        output_dict['files_count'] = len(
            all_matches_by_dataset[dataset]["matches"])
        dataset_matches_output_list.append(output_dict)

        """Unique Filenames Calculation"""
        unique_files = list(
            set([
                match["filename"]
                for match in all_matches_by_dataset[dataset]["matches"]
            ]))
        for source_file in unique_files:
            output_object = {}
            output_object["dataset_id"] = dataset_dict[dataset]["dataset"]
            output_object["cluster_scan"] = ""
            output_object["filename"] = source_file
            # Metadata enrichment only when the server responded earlier.
            output_object["metadata"] = ""
            if MetaDataServerStatus:
                metadata_list = trace_to_single_file.get_metadata_information_per_filename(
                    source_file)
                output_object["metadata"] = "|".join(metadata_list)
            output_filename_unique_files_list.append(output_object)

        for match in all_matches_by_dataset[dataset]["matches"]:
            output_object = {}
            # NOTE(review): this row uses the raw `dataset` key while the
            # unique-files rows above use dataset_dict[dataset]["dataset"] —
            # confirm these identifiers are interchangeable.
            output_object["dataset_id"] = dataset
            output_object["cluster_scan"] = match["queryscan"]
            output_object["filename"] = match["filename"]
            output_object["filescan"] = match["scan"]
            output_object["metadata"] = ""
            if MetaDataServerStatus:
                metadata_list = trace_to_single_file.get_metadata_information_per_filename(
                    match["filename"])
                output_object["metadata"] = "|".join(metadata_list)
            output_filename_all_matches_list.append(output_object)

    ming_fileio_library.write_list_dict_table_data(dataset_matches_output_list,
                                                   output_matches_filename)
    ming_fileio_library.write_list_dict_table_data(
        output_filename_unique_files_list, output_filename_unique_files)
    ming_fileio_library.write_list_dict_table_data(
        output_filename_all_matches_list, output_filename_all_matches)
def match_clustered(match_parameters, spectrum_collection, dataset_dict,
                    all_datasets, output_matches_filename,
                    output_filename_unique_files,
                    output_filename_all_matches):
    # Match clustered spectra against public data, resolve matches down to
    # source files, then write unique files, all matches, and a per-dataset
    # consolidated summary.
    all_matches = finding_matches_in_public_data(spectrum_collection,
                                                 all_datasets,
                                                 match_parameters)

    """Resolving to File Level"""
    dataset_files_count = defaultdict(lambda: 0)
    output_source_list = []
    output_match_list = []
    # Probe the metadata server once; enrichment is conditional on it.
    MetaDataServerStatus = trace_to_single_file.test_metadata_server()
    for dataset in all_matches:
        for match_object in all_matches[dataset]["matches"]:
            dataset_accession = dataset_dict[dataset]["dataset"]
            dataset_scan = match_object["scan"]
            current_filelist, current_match_list = trace_to_single_file.trace_filename_filesystem(
                all_datasets,
                dataset_accession,
                dataset_scan,
                enrichmetadata=MetaDataServerStatus)
            output_source_list += current_filelist
            output_match_list += current_match_list

    # Dedupe source files by (dataset, filename), counting unique files
    # per dataset accession along the way.
    seen_files = set()
    output_unique_source_list = []
    for output_file_object in output_source_list:
        dataset_accession = output_file_object["dataset_id"]
        dataset_filename = output_file_object["filename"]
        key = dataset_accession + ":" + dataset_filename
        if key in seen_files:
            continue
        dataset_files_count[dataset_accession] += 1
        seen_files.add(key)
        output_unique_source_list.append(output_file_object)

    ming_fileio_library.write_list_dict_table_data(
        output_unique_source_list, output_filename_unique_files)
    ming_fileio_library.write_list_dict_table_data(
        output_match_list, output_filename_all_matches)

    """ Summary """
    # Column-oriented table: one entry per dataset with at least one match.
    output_map = {
        "specs_filename": [],
        "specs_scan": [],
        "dataset_filename": [],
        "dataset_scan": [],
        "score": [],
        "dataset_id": [],
        "dataset_title": [],
        "dataset_description": [],
        "dataset_organisms": [],
        "matchedpeaks": [],
        "mzerror": [],
        "files_count": []
    }
    for dataset in all_matches:
        #For each dataset, lets try to find the clustering information
        if len(all_matches[dataset]["matches"]) == 0:
            continue
        match_object = None
        #If it is more than one match, we need to consolidate
        if len(all_matches[dataset]["matches"]) > 1:
            # Keep only the highest-cosine match for the summary row.
            sorted_match_list = sorted(
                all_matches[dataset]["matches"],
                key=lambda match: float(match["cosine"]),
                reverse=True)
            match_object = sorted_match_list[0]
        else:
            match_object = all_matches[dataset]["matches"][0]
        output_map['specs_filename'].append("specs_ms.mgf")
        output_map['specs_scan'].append(match_object["queryscan"])
        output_map['dataset_id'].append(dataset_dict[dataset]["dataset"])
        output_map['dataset_title'].append(dataset_dict[dataset]["title"])
        # Strip line breaks/tabs so the description stays on one table line.
        output_map['dataset_description'].append(
            dataset_dict[dataset]["description"].replace("\n", "").replace(
                "\t", "").replace("\r", ""))
        output_map['dataset_organisms'].append(
            dataset_dict[dataset]["species"].replace(
                "<hr class='separator'\/>", "!"))
        output_map['dataset_filename'].append(match_object["filename"])
        output_map['dataset_scan'].append(match_object["scan"])
        output_map['score'].append(match_object["cosine"])
        output_map['matchedpeaks'].append(match_object["matchedpeaks"])
        output_map['mzerror'].append(match_object["mzerror"])
        # NOTE(review): dataset_files_count is keyed by the accession
        # (dataset_dict[dataset]["dataset"]) above but indexed with the raw
        # `dataset` key here — confirm these are the same identifier.
        output_map['files_count'].append(dataset_files_count[dataset])

    ming_fileio_library.write_dictionary_table_data(output_map,
                                                    output_matches_filename)
def main():
    """Summarize instrument info for a folder of spectrum files in parallel.

    Runs `summary_wrapper` over every file in `spectra_folder`, merges the
    per-file result tables (Filename/Vendor/Model/MS1s/MS2s), and maps each
    mangled filename back to its original full CCMS path before writing the
    final table to `result_file`.
    """
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('workflow_parameters', help='output folder for parameters')
    parser.add_argument('result_file', help='output folder for parameters')
    parser.add_argument('msaccess_binary', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    # Mangled filename -> original full path mapping from the workflow XML.
    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    spectra_files = sorted(ming_fileio_library.list_files_in_dir(args.spectra_folder))

    # Fix: create the scratch folder idempotently instead of wrapping
    # os.mkdir in a bare except that swallowed every error type.
    tempresults_folder = "tempresults"
    os.makedirs(tempresults_folder, exist_ok=True)

    # One work item per spectrum file for the parallel runner.
    parameter_list = []
    for spectrum_file in spectra_files:
        parameter_list.append({
            "spectrum_file": spectrum_file,
            "tempresults_folder": tempresults_folder,
            "args": args,
        })

    # NOTE(review): the --parallelism argument is parsed but not used; the
    # pool size is hard-coded to 10 — confirm whether it should be
    # args.parallelism before changing scheduling behavior.
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(summary_wrapper, parameter_list, 10)

    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        try:
            result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
            for result in result_list:
                full_result_list.append({
                    "Filename": result["Filename"],
                    "Vendor": result["Vendor"],
                    "Model": result["Model"],
                    "MS1s": result["MS1s"],
                    "MS2s": result["MS2s"],
                })
        except Exception:
            # Fix: narrowed from a bare except — keeps the original
            # best-effort "skip unparseable result files" behavior without
            # swallowing KeyboardInterrupt/SystemExit.
            print("Error", input_file)

    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["Filename"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    ming_fileio_library.write_list_dict_table_data(full_result_list, args.result_file)
def main():
    """Run instrument-summary jobs over a spectra folder in parallel, merge
    the per-file result tables, and attach the original full CCMS path for
    each mangled input filename before writing the merged table."""
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('workflow_parameters', help='output folder for parameters')
    parser.add_argument('result_file', help='output folder for parameters')
    parser.add_argument('msaccess_binary', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    # Mangled filename -> original full path mapping from the workflow XML.
    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)
    spectra_files.sort()

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except:
        # Folder probably already exists; NOTE(review): the bare except also
        # hides any other OS error here.
        print("folder error")

    # One work item per spectrum file for the parallel runner.
    parameter_list = []
    for spectrum_file in spectra_files:
        param_dict = {}
        param_dict["spectrum_file"] = spectrum_file
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        parameter_list.append(param_dict)
    #for param_dict in parameter_list:
    #    search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    # NOTE(review): pool size is hard-coded to 10; the --parallelism
    # argument is parsed but unused — confirm intent.
    ming_parallel_library.run_parallel_job(summary_wrapper, parameter_list, 10)

    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        try:
            result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
            # Keep only the summary columns of interest from each row.
            for result in result_list:
                output_dict = {}
                output_dict["Filename"] = result["Filename"]
                output_dict["Vendor"] = result["Vendor"]
                output_dict["Model"] = result["Model"]
                output_dict["MS1s"] = result["MS1s"]
                output_dict["MS2s"] = result["MS2s"]
                full_result_list.append(output_dict)
        except:
            #raise
            # Best-effort: skip result files that fail to parse or lack columns.
            print("Error", input_file)
            #print(result_list)

    #full_result_list += result_list
    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["Filename"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    ming_fileio_library.write_list_dict_table_data(full_result_list, args.result_file)
def main():
    """Annotate a cluster info table with network membership and library IDs,
    then write per-cluster and per-component summary tables."""
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('params_xml', help='params_xml')
    parser.add_argument('input_clusterinfo_summary', help='Input cluster info summary')
    parser.add_argument('input_network_pairs_file', help='network_pairs_file')
    parser.add_argument('input_library_search_file', help='network_pairs_file')
    parser.add_argument('output_clusterinfo_summary', help='output file')
    parser.add_argument('output_component_summary', help='output component file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.params_xml))
    cluster_rows = ming_fileio_library.parse_table_with_headers_object_list(
        args.input_clusterinfo_summary)
    library_ids_dict = load_library_id_dict(args.input_library_search_file)
    nodes_to_component, component_to_nodes = load_pairs_dict(
        args.input_network_pairs_file)

    task = param_obj["task"][0]
    for row in cluster_rows:
        node = row["cluster index"]

        # Network membership: singleton nodes get a sentinel linkout.
        if node in nodes_to_component:
            component = nodes_to_component[node]
            row["componentindex"] = component
            row["GNPSLinkout_Network"] = "https://gnps.ucsd.edu/ProteoSAFe/result.jsp?view=network_displayer&componentindex=%s&task=%s&show=true" % (component, task)
        else:
            row["componentindex"] = "-1"
            row["GNPSLinkout_Network"] = 'This Node is a Singleton'

        # Spectral library identification, if any was found for this node.
        if node in library_ids_dict:
            hit = library_ids_dict[node]
            row["LibraryID"] = hit["Compound_Name"]
            row["MQScore"] = hit["MQScore"]
            row["SpectrumID"] = hit["SpectrumID"]
        else:
            row["LibraryID"] = "N/A"
            row["MQScore"] = "N/A"
            row["SpectrumID"] = "N/A"

    ming_fileio_library.write_list_dict_table_data(
        cluster_rows, args.output_clusterinfo_summary)

    # One summary row per connected component, listing all library IDs in it.
    component_rows = []
    for component, nodes in component_to_nodes.items():
        identifications = [
            library_ids_dict[node]["Compound_Name"]
            for node in nodes if node in library_ids_dict
        ]
        component_rows.append({
            "ComponentIndex": component,
            "NodeCount": len(nodes),
            "#Spectra": len(nodes),
            "AllIDs": "!".join(identifications),
        })

    ming_fileio_library.write_list_dict_table_data(
        component_rows, args.output_component_summary)
def main():
    """Build the per-feature cluster info summary for feature-based networking.

    Reads the consensus quantification CSV, optionally row-sum normalizes it,
    recovers precursor charges from the clustered MGF, aggregates per-group
    abundances from the (optional) metadata table, and writes one summary
    row per feature.
    """
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('params_xml', help='params_xml')
    parser.add_argument('consensus_feature_file', help='Consensus Quantification File')
    parser.add_argument('metadata_folder', help='metadata metadata_folder')
    parser.add_argument('mgf_filename', help='mgf_filename')
    parser.add_argument('output_clusterinfo_summary', help='output file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.params_xml))
    task_id = param_obj["task"][0]

    # Group/attribute mappings come from a single optional metadata file.
    group_to_files_mapping = defaultdict(list)
    attributes_to_groups_mapping = defaultdict(set)
    metadata_files = glob.glob(os.path.join(args.metadata_folder, "*"))
    if len(metadata_files) == 1:
        group_to_files_mapping, attributes_to_groups_mapping = load_group_attribute_mappings(metadata_files[0])

    # Workflow parameters are optional; missing keys fall back to "None".
    ROW_NORMALIZATION = "None"
    try:
        ROW_NORMALIZATION = param_obj["QUANT_FILE_NORM"][0]
    except:
        ROW_NORMALIZATION = "None"

    # NOTE(review): the pre-try default is "Sum" but the except path resets
    # it to "None" — confirm which fallback is actually intended.
    GROUP_COUNT_AGGREGATE_METHOD = "Sum"
    try:
        GROUP_COUNT_AGGREGATE_METHOD = param_obj["GROUP_COUNT_AGGREGATE_METHOD"][0]
    except:
        GROUP_COUNT_AGGREGATE_METHOD = "None"

    quantification_list = ming_fileio_library.parse_table_with_headers_object_list(args.consensus_feature_file, delimiter=",")
    input_filenames, input_filename_headers = determine_input_files(quantification_list[0].keys())

    ### Filling in Quantification table if it is missing values
    for quantification_object in quantification_list:
        ###Handling empty quantification
        for filename in input_filename_headers:
            try:
                if len(quantification_object[filename]) == 0:
                    #print(filename, quantification_object[filename], quantification_object["row ID"])
                    quantification_object[filename] = 0
            except:
                # len() raises on already-numeric cells; deliberately ignored.
                x = 1

    print("Number of Features", len(quantification_list))

    #Doing row sum normalization
    if ROW_NORMALIZATION == "RowSum":
        print("ROW SUM NORM")
        # Divide every value in a file's column by that column's total.
        for filename_header in input_filename_headers:
            file_quants = [float(quantification_object[filename_header]) for quantification_object in quantification_list]
            for quantification_object in quantification_list:
                quantification_object[filename_header] = float(quantification_object[filename_header]) / sum(file_quants)

    """Loading MS2 Spectra"""
    mgf_collection = ming_spectrum_library.SpectrumCollection(args.mgf_filename)
    mgf_collection.load_from_file()

    clusters_list = []
    for quantification_object in quantification_list:
        cluster_obj = {}
        cluster_obj["cluster index"] = quantification_object["row ID"]
        cluster_obj["precursor mass"] = "{0:.4f}".format(float(quantification_object["row m/z"]))
        cluster_obj["RTConsensus"] = "{0:.4f}".format(float(quantification_object["row retention time"]))

        all_charges = []  # NOTE(review): never populated or read below

        """Checking about the charge of this cluster"""
        # Charge comes from the MGF scan when present; 0 means unknown.
        try:
            spectrum_object = mgf_collection.scandict[int(cluster_obj["cluster index"])]
            charge = int(spectrum_object.charge)
        except:
            charge = 0

        """Checking if this spectrum has no peaks"""
        # try:
        #     spectrum_object = mgf_collection.scandict[int(cluster_obj["cluster index"])]
        #
        # except:
        #     continue

        all_files = [os.path.basename(filename) for filename in input_filename_headers if float(quantification_object[filename]) > 0]
        abundance_per_file = [(os.path.basename(filename), float(quantification_object[filename])) for filename in input_filename_headers]
        all_abundances = [float(quantification_object[filename]) for filename in input_filename_headers]

        # Derive a parent mass from m/z when the charge is known.
        if charge != 0:
            cluster_obj["parent mass"] = "{0:.4f}".format(float(quantification_object["row m/z"]) * charge - charge + 1)
        else:
            cluster_obj["parent mass"] = "{0:.4f}".format(float(quantification_object["row m/z"]))
        cluster_obj["precursor charge"] = charge

        # NOTE(review): all_retention_times is never assigned in this
        # function, so this always falls back to the consensus RT (unless
        # the name exists at module scope) — confirm intent.
        try:
            cluster_obj["RTMean"] = statistics.mean(all_retention_times)
            cluster_obj["RTStdErr"] = statistics.stdev(all_retention_times)
        except:
            cluster_obj["RTMean"] = cluster_obj["RTConsensus"]
            cluster_obj["RTStdErr"] = 0

        cluster_obj["GNPSLinkout_Cluster"] = 'https://gnps.ucsd.edu/ProteoSAFe/result.jsp?task=%s&view=view_all_clusters_withID#{"main.cluster index_lowerinput":"%s","main.cluster index_upperinput":"%s"}' % (task_id, quantification_object["row ID"], quantification_object["row ID"])
        #cluster_obj["AllFiles"] = "###".join(all_files)

        cluster_obj["sum(precursor intensity)"] = sum(all_abundances)
        cluster_obj["SumPeakIntensity"] = sum(all_abundances)
        cluster_obj["number of spectra"] = len(all_files)
        cluster_obj["UniqueFileSourcesCount"] = len(all_files)

        group_abundances = determine_group_abundances(group_to_files_mapping, abundance_per_file, operation=GROUP_COUNT_AGGREGATE_METHOD)

        # Non-default groups get a GNPSGROUP: prefix; G1..G6 are emitted bare.
        default_groups = ["G1", "G2", "G3", "G4", "G5", "G6"]
        for group in group_to_files_mapping:
            group_header = "GNPSGROUP:" + group
            if group in default_groups:
                continue
            cluster_obj[group_header] = group_abundances[group]

        for group in default_groups:
            cluster_obj[group] = group_abundances[group]

        #Writing attributes
        # For each attribute, list the groups with non-zero abundance.
        for attribute in attributes_to_groups_mapping:
            groups_to_include = []
            for group in attributes_to_groups_mapping[attribute]:
                if group_abundances[group] > 0.0:
                    groups_to_include.append(group)
            if len(groups_to_include) == 0:
                cluster_obj[attribute] = ""
            else:
                cluster_obj[attribute] = ",".join(groups_to_include)

        """ Enriching the cluster info with adduct collapsing information """
        enrich_adduct_annotations(cluster_obj, quantification_object)

        clusters_list.append(cluster_obj)

    ming_fileio_library.write_list_dict_table_data(clusters_list, args.output_clusterinfo_summary)
def main():
    """Partitioned parallel library search: search this node's slice of the
    spectra folder against the library files, merge the per-chunk result
    tables, and write a uniquely-named TSV with original CCMS paths."""
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('json_parameters', help='proteosafe xml parameters')
    parser.add_argument('workflow_parameters', help='output folder for parameters')
    parser.add_argument('library_folder', help='output folder for parameters')
    parser.add_argument('result_folder', help='output folder for parameters')
    parser.add_argument('convert_binary', help='output folder for parameters')
    parser.add_argument('librarysearch_binary', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    parallel_json = json.loads(open(args.json_parameters).read())
    # Mangled filename -> original full path mapping from the workflow XML.
    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)
    spectra_files.sort()
    print(spectra_files)
    # Stripe the sorted file list across nodes.
    # NOTE(review): "total_paritions" (sic) must match the key written by
    # whatever produces the partition JSON — do not fix the spelling here alone.
    spectra_files = spectra_files[parallel_json["node_partition"]::parallel_json["total_paritions"]]
    print(spectra_files)

    temp_folder = "temp"
    try:
        os.mkdir(temp_folder)
    except:
        # Folder probably already exists; NOTE(review): bare except also
        # hides any other OS error here.
        print("folder error")

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except:
        print("folder error")

    # Batch input files 5 per work item for the parallel runner.
    list_of_spectrumfiles = chunks(spectra_files, 5)
    parameter_list = []
    for spectrum_files_chunk in list_of_spectrumfiles:
        param_dict = {}
        param_dict["spectra_files"] = spectrum_files_chunk
        param_dict["temp_folder"] = temp_folder
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        param_dict["params_object"] = params_object
        param_dict["library_files"] = library_files
        parameter_list.append(param_dict)
    #for param_dict in parameter_list:
    #    search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    # NOTE(review): pool size is hard-coded to 5; --parallelism is unused.
    ming_parallel_library.run_parallel_job(search_wrapper, parameter_list, 5)

    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
        full_result_list += result_list

    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["SpectrumFile"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    # Unique output name so concurrent partitions never collide.
    ming_fileio_library.write_list_dict_table_data(full_result_list, os.path.join(args.result_folder, str(uuid.uuid4()) + ".tsv"))
def main():
    """Partitioned parallel library search.

    Takes this node's stripe of the sorted spectra folder, runs
    `search_wrapper` over 5-file chunks, merges the per-chunk result tables,
    restores the original full CCMS path for each mangled filename, and
    writes one uniquely-named TSV into the result folder.
    """
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('json_parameters', help='proteosafe xml parameters')
    parser.add_argument('workflow_parameters', help='output folder for parameters')
    parser.add_argument('library_folder', help='output folder for parameters')
    parser.add_argument('result_folder', help='output folder for parameters')
    parser.add_argument('convert_binary', help='output folder for parameters')
    parser.add_argument('librarysearch_binary', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    parallel_json = json.loads(open(args.json_parameters).read())
    # Mangled filename -> original full path mapping from the workflow XML.
    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)
    spectra_files = sorted(ming_fileio_library.list_files_in_dir(args.spectra_folder))
    print(spectra_files)
    # Take this node's stripe of the sorted file list.
    # NOTE(review): "total_paritions" (sic) must match the key written by the
    # producer of the partition JSON — do not fix the spelling here alone.
    spectra_files = spectra_files[parallel_json["node_partition"]::parallel_json["total_paritions"]]
    print(spectra_files)

    # Fix: create scratch folders idempotently instead of wrapping os.mkdir
    # in bare excepts that swallowed every error type.
    temp_folder = "temp"
    os.makedirs(temp_folder, exist_ok=True)
    tempresults_folder = "tempresults"
    os.makedirs(tempresults_folder, exist_ok=True)

    # Batch input files 5 per work item for the parallel runner.
    parameter_list = []
    for spectrum_files_chunk in chunks(spectra_files, 5):
        parameter_list.append({
            "spectra_files": spectrum_files_chunk,
            "temp_folder": temp_folder,
            "tempresults_folder": tempresults_folder,
            "args": args,
            "params_object": params_object,
            "library_files": library_files,
        })

    # NOTE(review): pool size is hard-coded to 5; the --parallelism argument
    # is parsed but unused — confirm before changing scheduling behavior.
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(search_wrapper, parameter_list, 5)

    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        full_result_list += ming_fileio_library.parse_table_with_headers_object_list(input_file)

    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["SpectrumFile"])
        result_object["full_CCMS_path"] = mangled_mapping[mangled_name]

    # Unique output name so concurrent partitions never collide.
    ming_fileio_library.write_list_dict_table_data(
        full_result_list,
        os.path.join(args.result_folder, str(uuid.uuid4()) + ".tsv"))
def main():
    """Build the per-feature cluster info summary for feature-based networking.

    Reads the consensus quantification CSV, optionally row-sum normalizes it,
    recovers precursor charges from the clustered MGF, aggregates per-group
    abundances from the (optional) metadata table, and writes one summary
    row per feature to `output_clusterinfo_summary`.
    """
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('params_xml', help='params_xml')
    parser.add_argument('consensus_feature_file', help='Consensus Quantification File')
    parser.add_argument('metadata_folder', help='metadata metadata_folder')
    parser.add_argument('mgf_filename', help='mgf_filename')
    parser.add_argument('output_clusterinfo_summary', help='output file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.params_xml))
    task_id = param_obj["task"][0]

    # Group/attribute mappings come from a single optional metadata file.
    group_to_files_mapping = defaultdict(list)
    attributes_to_groups_mapping = defaultdict(set)
    metadata_files = glob.glob(os.path.join(args.metadata_folder, "*"))
    if len(metadata_files) == 1:
        group_to_files_mapping, attributes_to_groups_mapping = load_group_attribute_mappings(metadata_files[0])

    # Workflow parameters are optional; fall back when absent.
    # Fix: narrowed the bare excepts to the lookups that can actually fail.
    try:
        ROW_NORMALIZATION = param_obj["QUANT_FILE_NORM"][0]
    except (KeyError, IndexError):
        ROW_NORMALIZATION = "None"

    # NOTE(review): the original pre-try default was "Sum" but the except
    # path set "None" — the effective fallback "None" is preserved here;
    # confirm which was intended.
    try:
        GROUP_COUNT_AGGREGATE_METHOD = param_obj["GROUP_COUNT_AGGREGATE_METHOD"][0]
    except (KeyError, IndexError):
        GROUP_COUNT_AGGREGATE_METHOD = "None"

    quantification_list = ming_fileio_library.parse_table_with_headers_object_list(args.consensus_feature_file, delimiter=",")
    input_filenames, input_filename_headers = determine_input_files(quantification_list[0].keys())

    # Fill empty quantification cells with 0. Fix: narrowed from a bare
    # except (len() raises TypeError on already-numeric cells).
    for quantification_object in quantification_list:
        for filename in input_filename_headers:
            try:
                if len(quantification_object[filename]) == 0:
                    quantification_object[filename] = 0
            except (TypeError, KeyError):
                pass

    print("Number of Features", len(quantification_list))

    # Row-sum normalization: divide each file column by its total.
    if ROW_NORMALIZATION == "RowSum":
        print("ROW SUM NORM")
        for filename_header in input_filename_headers:
            file_quants = [float(quantification_object[filename_header]) for quantification_object in quantification_list]
            # Fix: hoist the loop-invariant column total out of the inner loop
            # (was recomputed once per row).
            column_total = sum(file_quants)
            for quantification_object in quantification_list:
                quantification_object[filename_header] = float(quantification_object[filename_header]) / column_total

    """Loading MS2 Spectra"""
    mgf_collection = ming_spectrum_library.SpectrumCollection(args.mgf_filename)
    mgf_collection.load_from_file()

    clusters_list = []
    for quantification_object in quantification_list:
        cluster_obj = {}
        cluster_obj["cluster index"] = quantification_object["row ID"]
        cluster_obj["precursor mass"] = "{0:.4f}".format(float(quantification_object["row m/z"]))
        cluster_obj["RTConsensus"] = "{0:.4f}".format(float(quantification_object["row retention time"]))

        # Charge comes from the MGF scan when present; 0 means unknown.
        # (Fix: removed the unused all_charges local.)
        try:
            spectrum_object = mgf_collection.scandict[int(cluster_obj["cluster index"])]
            charge = int(spectrum_object.charge)
        except Exception:
            charge = 0

        all_files = [os.path.basename(filename) for filename in input_filename_headers if float(quantification_object[filename]) > 0]
        abundance_per_file = [(os.path.basename(filename), float(quantification_object[filename])) for filename in input_filename_headers]
        all_abundances = [float(quantification_object[filename]) for filename in input_filename_headers]

        # Derive a parent mass from m/z when the charge is known.
        if charge != 0:
            cluster_obj["parent mass"] = "{0:.4f}".format(float(quantification_object["row m/z"]) * charge - charge + 1)
        else:
            cluster_obj["parent mass"] = "{0:.4f}".format(float(quantification_object["row m/z"]))
        cluster_obj["precursor charge"] = charge

        # NOTE(review): all_retention_times is never assigned in this
        # function, so this always falls back to the consensus RT (unless
        # the name exists at module scope). Structure kept to preserve
        # behavior — confirm intent before removing.
        try:
            cluster_obj["RTMean"] = statistics.mean(all_retention_times)
            cluster_obj["RTStdErr"] = statistics.stdev(all_retention_times)
        except Exception:
            cluster_obj["RTMean"] = cluster_obj["RTConsensus"]
            cluster_obj["RTStdErr"] = 0

        cluster_obj["GNPSLinkout_Cluster"] = 'https://gnps.ucsd.edu/ProteoSAFe/result.jsp?task=%s&view=view_all_clusters_withID&show=true#{"main.cluster index_lowerinput":"%s","main.cluster index_upperinput":"%s"}' % (task_id, quantification_object["row ID"], quantification_object["row ID"])

        cluster_obj["sum(precursor intensity)"] = sum(all_abundances)
        cluster_obj["SumPeakIntensity"] = sum(all_abundances)
        cluster_obj["number of spectra"] = len(all_files)
        cluster_obj["UniqueFileSourcesCount"] = len(all_files)

        group_abundances = determine_group_abundances(group_to_files_mapping, abundance_per_file, operation=GROUP_COUNT_AGGREGATE_METHOD)

        # Non-default groups get a GNPSGROUP: prefix; G1..G6 are emitted bare.
        default_groups = ["G1", "G2", "G3", "G4", "G5", "G6"]
        for group in group_to_files_mapping:
            if group in default_groups:
                continue
            cluster_obj["GNPSGROUP:" + group] = group_abundances[group]

        for group in default_groups:
            cluster_obj[group] = group_abundances[group]

        # For each attribute, list the groups with non-zero abundance
        # (",".join of an empty list is "", matching the old empty branch).
        for attribute in attributes_to_groups_mapping:
            groups_to_include = [group for group in attributes_to_groups_mapping[attribute] if group_abundances[group] > 0.0]
            cluster_obj[attribute] = ",".join(groups_to_include)

        # Enrich with adduct-collapsing columns from the quantification row.
        enrich_adduct_annotations(cluster_obj, quantification_object)

        clusters_list.append(cluster_obj)

    ming_fileio_library.write_list_dict_table_data(clusters_list, args.output_clusterinfo_summary)