def main():
    parser = argparse.ArgumentParser(description='Summarizing library search identifications per input file')
    parser.add_argument('filestats', help='per-file summary statistics table')
    parser.add_argument('dbresults', help='library search results table')
    parser.add_argument('output_filestats', help='output filestats table')
    args = parser.parse_args()

    # Group identified scans by source filename
    identified_spectra_in_filename = defaultdict(set)
    all_identifications = ming_fileio_library.parse_table_with_headers_object_list(args.dbresults)
    for identification in all_identifications:
        filename = identification["full_CCMS_path"]
        scan = identification["#Scan#"]
        identified_spectra_in_filename[filename].add(scan)

    print(identified_spectra_in_filename)

    output_list = []
    file_summaries = ming_fileio_library.parse_table_with_headers_object_list(args.filestats)
    for file_summary in file_summaries:
        filename = file_summary["full_CCMS_path"]
        count = len(identified_spectra_in_filename[filename])
        file_summary["identified_ms2"] = count

        percent_identified = 0
        try:
            percent_identified = float(count) / float(file_summary["MS2s"])
        except:
            # Missing or zero MS2 count; leave the fraction at 0
            percent_identified = 0
        file_summary["percent_identified"] = percent_identified

        output_list.append(file_summary)

    ming_fileio_library.write_list_dict_table_data(output_list, args.output_filestats)
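# Nearly every script in this collection round-trips tables through
# ming_fileio_library, which lives outside this section. As a point of
# reference, here is a minimal sketch of the behavior these scripts appear to
# assume from the two helpers used everywhere. This is NOT the real
# implementation; the names carry a _sketch suffix to make that explicit.

import csv

def parse_table_with_headers_object_list_sketch(filename, delimiter="\t"):
    # Assumed behavior: read a delimited table with a header row and return
    # one dict per data row, keyed by column name.
    with open(filename) as table_file:
        return list(csv.DictReader(table_file, delimiter=delimiter))

def write_list_dict_table_data_sketch(object_list, filename, header_list=None):
    # Assumed behavior: write a list of dicts back out as a TSV, using an
    # explicit header order when given, else the union of all keys.
    fieldnames = header_list or sorted({key for obj in object_list for key in obj})
    with open(filename, "w", newline="") as table_file:
        writer = csv.DictWriter(table_file, fieldnames=fieldnames, delimiter="\t",
                                restval="", extrasaction="ignore")
        writer.writeheader()
        writer.writerows(object_list)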
def main():
    input_library_identifications = sys.argv[1]
    output_library_identifications = sys.argv[2]

    annotations_list = ming_fileio_library.parse_table_with_headers_object_list(input_library_identifications)

    already_identified_compounds = set()
    already_identified_spectra = set()

    annotations_list = sorted(annotations_list, key=lambda identification: float(identification["MQScore"]), reverse=True)

    output_annotation_list = []
    for annotation in annotations_list:
        compound_name = annotation["Compound_Name"]
        spectrum_identifier = annotation["#Scan#"] + ":" + annotation["SpectrumFile"]

        if compound_name in already_identified_compounds:
            continue
        if spectrum_identifier in already_identified_spectra:
            continue

        print(compound_name, spectrum_identifier)
        output_annotation_list.append(annotation)
        already_identified_compounds.add(compound_name)
        already_identified_spectra.add(spectrum_identifier)

    ming_fileio_library.write_list_dict_table_data(output_annotation_list, output_library_identifications)
def main():
    parser = argparse.ArgumentParser(description='Creates alan table')
    parser.add_argument('input_identifications_filename', help='input_identifications_filename')
    parser.add_argument('output_filename', help='output_filename')
    args = parser.parse_args()

    print(args.input_identifications_filename)

    data_list = ming_fileio_library.parse_table_with_headers_object_list(args.input_identifications_filename)

    all_filenames = set()
    compounds_to_files = defaultdict(set)
    for data_object in data_list:
        query_filename = "f." + data_object["full_CCMS_path"]
        compound_name = data_object["Compound_Name"]
        all_filenames.add(query_filename)
        compounds_to_files[compound_name].add(query_filename)

    output_list = []
    for compound_name in compounds_to_files:
        output_dict = {}
        output_dict["LibraryID"] = compound_name
        output_dict["TotalFiles"] = len(compounds_to_files[compound_name])
        for filename in compounds_to_files[compound_name]:
            output_dict[filename] = "1"
        for filename in all_filenames:
            if not filename in output_dict:
                output_dict[filename] = "0"
        output_list.append(output_dict)

    ming_fileio_library.write_list_dict_table_data(output_list, args.output_filename)
def simple_presence_of_merged_spectra_processing(input_integrals_filename, output_clusterinfo_filename, mangled_mapping):
    extension_stripped_mangled_mapping = {}
    for key in mangled_mapping:
        without_ext = ming_fileio_library.get_filename_without_extension(key)
        extension_stripped_mangled_mapping[without_ext] = mangled_mapping[key]

    header_order = open(input_integrals_filename).readline().rstrip().split(",")[1:]
    table_list = ming_fileio_library.parse_table_with_headers_object_list(input_integrals_filename, delimiter=",")

    # Removing other header information
    table_list = table_list[2:]

    output_dict = defaultdict(list)

    print("for zheng's sanity, print the whole table ----")
    print(table_list)

    for result_object in table_list:
        try:
            sample_name = result_object["RTS:"]
        except:
            sample_name = "unknown"

        scan_number = 0
        for header in header_order:
            scan_number += 1
            abundance = result_object[header]

            output_dict["filename"].append(sample_name)
            output_dict["abundance"].append(abundance)
            output_dict["scan_number"].append(scan_number)
            output_dict["RT"].append(header)

    ming_fileio_library.write_dictionary_table_data(output_dict, output_clusterinfo_filename)
def add_additional_edges(G, path_to_supplemental_edges): edge_list = ming_fileio_library.parse_table_with_headers_object_list(path_to_supplemental_edges, delimiter=",") edges_to_add = [] for additional_edge_row in edge_list: try: node1 = additional_edge_row["ID1"] node2 = additional_edge_row["ID2"] node1_mz = G.node[node1]["precursor mass"] node2_mz = G.node[node2]["precursor mass"] mass_difference = float(node1_mz) - float(node2_mz) edgetype = additional_edge_row["EdgeType"] score = additional_edge_row["Score"] annotation = additional_edge_row["Annotation"] edge_object = {} edge_object["node1"] = node1 edge_object["node2"] = node2 edge_object["EdgeType"] = edgetype edge_object["EdgeAnnotation"] = annotation.rstrip() edge_object["EdgeScore"] = float(score) edge_object["mass_difference"] = mass_difference edges_to_add.append((node1, node2, edge_object)) except: print("Error Adding Edge") continue G.add_edges_from(edges_to_add) return G
def add_additional_edges(G, path_to_supplemental_edges):
    edge_list = ming_fileio_library.parse_table_with_headers_object_list(path_to_supplemental_edges, delimiter=",")

    edges_to_add = []
    for additional_edge_row in edge_list:
        node1 = additional_edge_row["ID1"]
        node2 = additional_edge_row["ID2"]

        edgetype = additional_edge_row["EdgeType"]
        score = additional_edge_row["Score"]
        annotation = additional_edge_row["Annotation"]

        edge_object = {}
        edge_object["node1"] = node1
        edge_object["node2"] = node2
        edge_object["EdgeType"] = edgetype
        edge_object["EdgeAnnotation"] = annotation.rstrip()
        edge_object["EdgeScore"] = float(score)

        edges_to_add.append((node1, node2, edge_object))

    G.add_edges_from(edges_to_add)

    return G
def add_additional_edges(G, path_to_supplemental_edges): edge_list = ming_fileio_library.parse_table_with_headers_object_list(path_to_supplemental_edges, delimiter=",") edges_to_add = [] for additional_edge_row in edge_list: node1 = additional_edge_row["ID1"] node2 = additional_edge_row["ID2"] edgetype = additional_edge_row["EdgeType"] score = additional_edge_row["Score"] annotation = additional_edge_row["Annotation"] edge_object = {} edge_object["node1"] = node1 edge_object["node2"] = node2 edge_object["EdgeType"] = edgetype edge_object["EdgeAnnotation"] = annotation.rstrip() edge_object["EdgeScore"] = float(score) edges_to_add.append((node1, node2, edge_object)) G.add_edges_from(edges_to_add) return G
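# A hedged usage sketch for add_additional_edges above. It assumes a
# networkx 1.x-style graph (the first variant indexes G.node, which
# networkx >= 2.0 renames to G.nodes) and a supplemental-edge CSV with
# ID1, ID2, EdgeType, Score, and Annotation columns. The path and values
# below are hypothetical.

import networkx as nx

G = nx.Graph()
# "precursor mass" node attributes are only required by the variant that
# computes mass_difference.
G.add_node("1", **{"precursor mass": "301.1410"})
G.add_node("2", **{"precursor mass": "287.1253"})

# supplemental_edges.csv (hypothetical) might contain:
#   ID1,ID2,EdgeType,Score,Annotation
#   1,2,ion_identity,0.95,putative adduct pair
G = add_additional_edges(G, "supplemental_edges.csv")
print(list(G.edges(data=True)))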
def main():
    results_filename = sys.argv[1]
    output_filename_unique_files = sys.argv[2]
    output_filename_all_matches = sys.argv[3]

    all_datasets = ming_gnps_library.get_all_datasets(gnps_only=True)
    all_matches = ming_fileio_library.parse_table_with_headers_object_list(results_filename)

    output_source_list = []
    output_match_list = []

    MetaDataServerStatus = test_metadata_server()

    for match_object in all_matches:
        dataset_accession = match_object["dataset_id"]
        dataset_scan = match_object["dataset_scan"]
        #output_source_list += trace_filename(all_datasets, dataset_accession, dataset_scan)
        current_filelist, current_match_list = trace_filename_filesystem(all_datasets, dataset_accession, dataset_scan, enrichmetadata=MetaDataServerStatus)
        output_source_list += current_filelist
        output_match_list += current_match_list

    ming_fileio_library.write_list_dict_table_data(output_source_list, output_filename_unique_files)
    ming_fileio_library.write_list_dict_table_data(output_match_list, output_filename_all_matches)
def main():
    parser = argparse.ArgumentParser(description='Creates alan table')
    parser.add_argument('input_clusterinfosummary', help='input_clusterinfosummary')
    parser.add_argument('output_filename', help='output_filename')
    args = parser.parse_args()

    print(args.input_clusterinfosummary)

    data_list = ming_fileio_library.parse_table_with_headers_object_list(args.input_clusterinfosummary)

    all_filenames = []
    for data_object in data_list:
        if "UniqueFileSources" in data_object:
            all_filenames += data_object["UniqueFileSources"].split("|")
        else:
            filenames = list(set([filename.split(":")[0] for filename in data_object["AllFiles"].split("###") if len(filename) > 2]))
            all_filenames += filenames

    all_filenames = list(set(all_filenames))

    compounds_to_files = defaultdict(list)
    for data_object in data_list:
        filenames = []
        if "UniqueFileSources" in data_object:
            filenames = data_object["UniqueFileSources"].split("|")
        else:
            filenames = list(set([filename.split(":")[0] for filename in data_object["AllFiles"].split("###") if len(filename) > 2]))

        compound_name = data_object["LibraryID"]
        compounds_to_files[compound_name] += filenames

    output_list = []
    for compound_name in compounds_to_files:
        output_dict = {}
        output_dict["LibraryID"] = compound_name
        output_dict["TotalFiles"] = len(compounds_to_files[compound_name])
        for filename in compounds_to_files[compound_name]:
            output_dict[filename] = "1"
        for filename in all_filenames:
            if not filename in output_dict:
                output_dict[filename] = "0"
        output_list.append(output_dict)

    ming_fileio_library.write_list_dict_table_data(output_list, args.output_filename)
def load_library_id_dict(library_filename):
    results_list = ming_fileio_library.parse_table_with_headers_object_list(library_filename)

    output_dict = {}
    for result_obj in results_list:
        scan = result_obj["#Scan#"]
        output_dict[scan] = result_obj

    return output_dict
def main():
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('params_xml', help='params_xml')
    parser.add_argument('input_clusterinfo_summary', help='Input cluster info summary')
    parser.add_argument('input_network_pairs_file', help='network_pairs_file')
    parser.add_argument('input_library_search_file', help='library_search_file')
    parser.add_argument('output_clusterinfo_summary', help='output file')
    parser.add_argument('output_component_summary', help='output component file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.params_xml))

    all_clusterinfo_list = ming_fileio_library.parse_table_with_headers_object_list(args.input_clusterinfo_summary)
    library_ids_dict = load_library_id_dict(args.input_library_search_file)
    nodes_to_component, component_to_nodes = load_pairs_dict(args.input_network_pairs_file)

    for cluster in all_clusterinfo_list:
        cluster_index = cluster["cluster index"]
        if cluster_index in nodes_to_component:
            cluster["componentindex"] = nodes_to_component[cluster_index]
            cluster["GNPSLinkout_Network"] = "https://gnps.ucsd.edu/ProteoSAFe/result.jsp?view=network_displayer&componentindex=%s&task=%s" % (nodes_to_component[cluster_index], param_obj["task"][0])
        else:
            cluster["componentindex"] = "-1"
            cluster["GNPSLinkout_Network"] = 'https://gnps.ucsd.edu/ProteoSAFe/result.jsp?task=%s&view=view_all_clusters_withID#{"main.cluster index_lowerinput":"%s","main.cluster index_upperinput":"%s"}' % (param_obj["task"][0], cluster_index, cluster_index)

        if cluster_index in library_ids_dict:
            cluster["LibraryID"] = library_ids_dict[cluster_index]["Compound_Name"]
            cluster["MQScore"] = library_ids_dict[cluster_index]["MQScore"]
            cluster["SpectrumID"] = library_ids_dict[cluster_index]["SpectrumID"]
        else:
            cluster["LibraryID"] = "N/A"
            cluster["MQScore"] = "N/A"
            cluster["SpectrumID"] = "N/A"

    ming_fileio_library.write_list_dict_table_data(all_clusterinfo_list, args.output_clusterinfo_summary)

    output_component_list = []
    for componentindex in component_to_nodes:
        output_dict = {}
        output_dict["ComponentIndex"] = componentindex
        output_dict["NodeCount"] = len(component_to_nodes[componentindex])
        output_dict["#Spectra"] = len(component_to_nodes[componentindex])

        all_lib_identifications = []
        for node in component_to_nodes[componentindex]:
            if node in library_ids_dict:
                all_lib_identifications.append(library_ids_dict[node]["Compound_Name"])
        output_dict["AllIDs"] = "!".join(all_lib_identifications)

        output_component_list.append(output_dict)

    ming_fileio_library.write_list_dict_table_data(output_component_list, args.output_component_summary)
def main():
    results_filename = sys.argv[1]
    output_filename = sys.argv[2]

    input_results = ming_fileio_library.parse_table_with_headers_object_list(results_filename)
    output_results = []

    #Check if server is up
    for result_object in input_results:
        filename = result_object["filename"]
        # NOTE: the return value is discarded and output_results is never
        # populated, so as written the output table is empty
        get_metadata_information_per_filename(filename)

    ming_fileio_library.write_list_dict_table_data(output_results, output_filename)
def main():
    input_intermediate_folder = sys.argv[1]
    output_filename = sys.argv[2]

    all_protein_stats = {}

    # Merging the per-partition intermediate outputs into one table
    all_intermediate_files = ming_fileio_library.list_files_in_dir(input_intermediate_folder)

    output_list = []
    for parallel_output_filename in all_intermediate_files:
        result_list = ming_fileio_library.parse_table_with_headers_object_list(parallel_output_filename)
        output_list += result_list

    ming_fileio_library.write_list_dict_table_data(output_list, output_filename)
def populate_network_identifications(cluster_summary_list, library_search_filename):
    clusters_to_identifications = {}

    library_ids_list = ming_fileio_library.parse_table_with_headers_object_list(library_search_filename)
    for library_id in library_ids_list:
        cluster_index = library_id["#Scan#"]
        clusters_to_identifications[cluster_index] = library_id

    fields_to_copy = ["Smiles", "MQScore", "MassDiff", "MZErrorPPM", "SpectrumID"]

    for cluster in cluster_summary_list:
        cluster_index = cluster["cluster index"]
        if cluster_index in clusters_to_identifications:
            cluster["LibraryID"] = clusters_to_identifications[cluster_index]["Compound_Name"]
            for field in fields_to_copy:
                cluster[field] = clusters_to_identifications[cluster_index][field]
        else:
            cluster["LibraryID"] = "N/A"
            for field in fields_to_copy:
                cluster[field] = "N/A"
def load_pairs_dict(pairs_filename):
    results_list = ming_fileio_library.parse_table_with_headers_object_list(pairs_filename)

    node_to_component = {}
    component_to_node = defaultdict(set)

    for result_obj in results_list:
        node1 = result_obj["CLUSTERID1"]
        node2 = result_obj["CLUSTERID2"]
        component = result_obj["ComponentIndex"]

        node_to_component[node1] = component
        node_to_component[node2] = component

        component_to_node[component].add(node1)
        component_to_node[component].add(node2)

    return node_to_component, component_to_node
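# A toy, self-contained illustration of the shapes load_pairs_dict produces
# (data here is made up; the real input is the tab-separated network pairs
# file with CLUSTERID1, CLUSTERID2, and ComponentIndex columns):

from collections import defaultdict

rows = [
    {"CLUSTERID1": "1", "CLUSTERID2": "2", "ComponentIndex": "0"},
    {"CLUSTERID1": "2", "CLUSTERID2": "3", "ComponentIndex": "0"},
]
node_to_component = {}
component_to_node = defaultdict(set)
for row in rows:
    node_to_component[row["CLUSTERID1"]] = row["ComponentIndex"]
    node_to_component[row["CLUSTERID2"]] = row["ComponentIndex"]
    component_to_node[row["ComponentIndex"]].add(row["CLUSTERID1"])
    component_to_node[row["ComponentIndex"]].add(row["CLUSTERID2"])

assert node_to_component == {"1": "0", "2": "0", "3": "0"}
assert component_to_node == {"0": {"1", "2", "3"}}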
def populate_dataset_metadata(input_metadata_filename):
    # create_table(True) fails silently if the table already exists (peewee-style models)
    Filename.create_table(True)
    Attribute.create_table(True)
    AttributeTerm.create_table(True)
    Compound.create_table(True)
    CompoundFilenameConnection.create_table(True)
    FilenameAttributeConnection.create_table(True)
    CompoundTag.create_table(True)
    CompoundTagFilenameConnection.create_table(True)

    #Check if dataset metadata is in the database already
    included_accessions = []
    # try:
    #     accession_attribute = Attribute.select().where(Attribute.categoryname == "ATTRIBUTE_DatasetAccession")[0]
    #     for joined in FilenameAttributeConnection.select().where(FilenameAttributeConnection.attribute == accession_attribute).group_by(FilenameAttributeConnection.attributeterm):
    #         included_accessions.append(joined.attributeterm.term)
    # except:
    #     print("No Accessions")

    result_list = ming_fileio_library.parse_table_with_headers_object_list(input_metadata_filename, "\t")

    metadata_by_accession = defaultdict(list)
    for result in result_list:
        massive_accession = result["MassiveID"]
        metadata_by_accession[massive_accession].append(result)

    total_added_files = 0
    for dataset_accession in metadata_by_accession:
        print("Attempting Import", dataset_accession)
        if dataset_accession in included_accessions:
            print("Skipping %s, already imported" % (dataset_accession))
            continue
        added_files = add_metadata_per_accession(dataset_accession, metadata_by_accession[dataset_accession])
        total_added_files += added_files
        print(dataset_accession, len(metadata_by_accession[dataset_accession]), added_files)

    return total_added_files
def main():
    parser = argparse.ArgumentParser(description='Creates enriched cluster info summary')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('input_clusterinfo_file', help='input_clusterinfo_file')
    parser.add_argument('input_clusterinfosummary_file', help='input_clusterinfosummary_file')
    parser.add_argument('input_group_mapping_filename', help='input_group_mapping_filename')
    parser.add_argument('input_attribute_mapping_filename', help='input_attribute_mapping_filename')
    parser.add_argument('input_networking_pairs', help='input_networking_pairs')
    parser.add_argument('input_library_search', help='input_library_search')
    parser.add_argument('output_clusterinfosummary_filename', help='output_clusterinfosummary_filename')
    args = parser.parse_args()

    """Loading group filenames"""
    group_to_files, files_to_groups = load_group_mapping(args.input_group_mapping_filename)
    print("Loaded Group Mapping")

    cluster_summary_list = ming_fileio_library.parse_table_with_headers_object_list(args.input_clusterinfosummary_file)
    print("Loaded Cluster Summary")

    attribute_to_groups = load_attribute_mapping(args.input_attribute_mapping_filename)

    params_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    CLUSTER_MIN_SIZE = int(params_object["CLUSTER_MIN_SIZE"][0])
    RUN_MSCLUSTER = params_object["RUN_MSCLUSTER"][0]

    #Calculating the spectrum counts per group
    cluster_to_group_counts = defaultdict(lambda: defaultdict(lambda: 0))
    cluster_to_files = defaultdict(set)
    cluster_to_RT = defaultdict(list)

    line_count = 0
    for line in open(args.input_clusterinfo_file):
        line_count += 1
        if line_count == 1:
            continue
        if line_count % 10000 == 0:
            print(line_count)

        splits = line.rstrip().split("\t")
        cluster_index = splits[0]
        filename = os.path.basename(splits[1])
        rt = float(splits[6])

        group_membership = files_to_groups[filename]
        cluster_to_files[cluster_index].add(filename)
        cluster_to_RT[cluster_index].append(rt)

        for group in group_membership:
            cluster_to_group_counts[cluster_index][group] += 1

    if RUN_MSCLUSTER == "on":
        cluster_summary_list = filter_clusters_based_on_cluster_size(cluster_summary_list, CLUSTER_MIN_SIZE)

    print(len(cluster_summary_list))
    print("Setting up grouping", len(group_to_files.keys()))

    for cluster_summary_object in cluster_summary_list:
        cluster_index = cluster_summary_object["cluster index"]
        for group in group_to_files:
            group_count = 0
            if group in cluster_to_group_counts[cluster_index]:
                group_count = cluster_to_group_counts[cluster_index][group]
            cluster_summary_object[group] = group_count

        for attribute in attribute_to_groups:
            groups_to_include = []
            for group in attribute_to_groups[attribute]:
                if group in cluster_summary_object:
                    if cluster_summary_object[group] > 0:
                        groups_to_include.append(group)
            cluster_summary_object[attribute] = ",".join(groups_to_include).replace("GNPSGROUP:", "")

    print("Default Attributes")
    calculate_default_attributes(cluster_summary_list, group_to_files.keys())

    print("calculate_cluster_file_stats")
    calculate_cluster_file_stats(cluster_summary_list, cluster_to_files, mangled_mapping)

    print("rt stats")
    calculate_rt_stats(cluster_summary_list, cluster_to_RT)

    print("calculate_ancillary_information")
    calculate_ancillary_information(cluster_summary_list, params_object["task"][0])

    print("populate_network_component")
    populate_network_component(cluster_summary_list, args.input_networking_pairs)

    print("populate_network_identifications")
    populate_network_identifications(cluster_summary_list, args.input_library_search)

    ming_fileio_library.write_list_dict_table_data(cluster_summary_list, args.output_clusterinfosummary_filename)
def main():
    parser = argparse.ArgumentParser(description='Running msaccess file summaries in parallel')
    parser.add_argument('spectra_folder', help='spectra folder')
    parser.add_argument('workflow_parameters', help='workflow parameters xml')
    parser.add_argument('result_file', help='output result file')
    parser.add_argument('msaccess_binary', help='path to msaccess binary')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)
    spectra_files.sort()

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except:
        print("folder error")

    parameter_list = []
    for spectrum_file in spectra_files:
        param_dict = {}
        param_dict["spectrum_file"] = spectrum_file
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        parameter_list.append(param_dict)

    #for param_dict in parameter_list:
    #    search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(summary_wrapper, parameter_list, 10)

    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        try:
            result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
            for result in result_list:
                output_dict = {}
                output_dict["Filename"] = result["Filename"]
                output_dict["Vendor"] = result["Vendor"]
                output_dict["Model"] = result["Model"]
                output_dict["MS1s"] = result["MS1s"]
                output_dict["MS2s"] = result["MS2s"]
                full_result_list.append(output_dict)
        except:
            #raise
            print("Error", input_file)
            #print(result_list)
            #full_result_list += result_list

    used_files = set()
    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["Filename"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path
        result_object["CCMS_filename"] = os.path.basename(full_path)
        used_files.add(full_path)

    # Adding placeholder rows for input files that produced no summary output
    for mangled_name in spectra_files:
        full_path = mangled_mapping[os.path.basename(mangled_name)]
        if full_path in used_files:
            continue
        output_dict = {}
        output_dict["full_CCMS_path"] = full_path
        output_dict["CCMS_filename"] = os.path.basename(full_path)
        full_result_list.append(output_dict)

    pd.DataFrame(full_result_list).to_csv(args.result_file, sep="\t", index=False)
def main():
    parser = argparse.ArgumentParser(description='Modifying script')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_metadata_table', help='output_metadata_table')
    parser.add_argument('output_view_emporer', help='output_view_emporer')
    args = parser.parse_args()

    param_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml, "r"))

    """Outputting html"""
    from urllib.parse import urlencode

    parameters_for_qiime = {
        'biom': 'http://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=biom_output/networking_quant.biom' % (param_object["task"][0]),
        'metadata': 'http://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=metadata_for_qiime/metadata_for_qiime.txt' % (param_object["task"][0])
    }

    output_html_file = open(args.output_view_emporer, "w")
    output_html_file.write("<script>\n")
    output_html_file.write('window.location.replace("https://mingwangbeta.ucsd.edu/emperor?%s")\n' % urlencode(parameters_for_qiime))
    output_html_file.write("</script>\n")
    output_html_file.close()

    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_object)

    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)

    object_list = []
    if len(metadata_files_in_folder) != 1:
        for real_name in reverse_file_mangling:
            mangled_name = reverse_file_mangling[real_name]
            if mangled_name.find("spec") == -1:
                continue
            object_list.append({"filename": real_name})
        #open(args.output_metadata_table, "w").write("NO OUTPUT")
        #open(args.output_view_emporer, "w").write("Please Include Metadata File")
        #exit(0)
    else:
        object_list = ming_fileio_library.parse_table_with_headers_object_list(metadata_files_in_folder[0])
        if len(object_list) == 0:
            for real_name in reverse_file_mangling:
                mangled_name = reverse_file_mangling[real_name]
                if mangled_name.find("spec") == -1:
                    continue
                object_list.append({"filename": real_name})
            #open(args.output_metadata_table, "w").write("NO OUTPUT")
            #open(args.output_view_emporer, "w").write("Please Include Non Empty Metadata File")
            #exit(0)

    #Writing headers
    header_list = ["#SampleID", "BarcodeSequence", "LinkerPrimerSequence"]
    for key in object_list[0]:
        if not key in header_list:
            header_list.append(key)
    header_list.append("ATTRIBUTE_GNPSDefaultGroup")

    for metadata_object in object_list:
        if not "#SampleID" in metadata_object:
            metadata_object["#SampleID"] = ming_fileio_library.get_filename_without_extension(metadata_object["filename"])
        if not "BarcodeSequence" in metadata_object:
            metadata_object["BarcodeSequence"] = "GATACA"
        if not "LinkerPrimerSequence" in metadata_object:
            metadata_object["LinkerPrimerSequence"] = "GATACA"

        mangled_name = reverse_file_mangling[metadata_object["filename"]]
        if mangled_name.find("spec-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G1"
        elif mangled_name.find("spectwo-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G2"
        elif mangled_name.find("specthree-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G3"
        elif mangled_name.find("specfour-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G4"
        elif mangled_name.find("specfive-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G5"
        elif mangled_name.find("specsix-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G6"

    ming_fileio_library.write_list_dict_table_data(object_list, args.output_metadata_table, header_list)
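# Hedged illustration of the ProteoSAFe filename-mangling convention the
# scripts above and below rely on (entries here are hypothetical). Workflow
# inputs are renamed to prefixed files such as "spec-00000.mzXML"; the prefix
# encodes which input slot (mapped to default groups G1..G6) the file was
# submitted under, and the params XML preserves the mapping back to the
# original upload path:

mangled_mapping = {
    "spec-00000.mzXML": "d.user/sampleA.mzXML",    # default group G1
    "spectwo-00000.mzXML": "d.user/blankB.mzXML",  # default group G2
}
# get_reverse_mangled_file_mapping presumably inverts this, original -> mangled
reverse_file_mangling = {original: mangled for mangled, original in mangled_mapping.items()}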
def main(): parser = argparse.ArgumentParser(description='') parser.add_argument('param_xml', help='metadata_folder') parser.add_argument('cluster_buckets', help='cluster_buckets') parser.add_argument('metadata_folder', help='metadata_folder') parser.add_argument('output_folder', help='output_folder') args = parser.parse_args() param_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml, "r")) if param_object["CREATE_CLUSTER_BUCKETS"][0] == "0": print("Do not do things") exit(0) reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_object) """Reading Metadata File""" metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder) object_list = [] if len(metadata_files_in_folder) != 1: for real_name in reverse_file_mangling: mangled_name = reverse_file_mangling[real_name] if mangled_name.find("spec") == -1: continue object_list.append({"filename" : real_name}) else: print(metadata_files_in_folder[0]) object_list = ming_fileio_library.parse_table_with_headers_object_list(metadata_files_in_folder[0]) if len(object_list) == 0: for real_name in reverse_file_mangling: mangled_name = reverse_file_mangling[real_name] if mangled_name.find("spec") == -1: continue object_list.append({"filename" : real_name}) #Writing headers header_list = ["#SampleID", "BarcodeSequence", "LinkerPrimerSequence"] for key in object_list[0]: if not key in header_list: header_list.append(key) header_list.append("ATTRIBUTE_GNPSDefaultGroup") for metadata_object in object_list: if not "#SampleID" in metadata_object: if "#SampleID" in metadata_object: metadata_object["#SampleID"] = metadata_object["#SampleID"] else: #Stripping off all non-alphanumeric characters metadata_object["#SampleID"] = ''.join(ch for ch in metadata_object["filename"] if ch.isalnum()) if not "Description" in metadata_object: metadata_object["Description"] = "LoremIpsum" if not "BarcodeSequence" in metadata_object: metadata_object["BarcodeSequence"] = "GATACA" if not "LinkerPrimerSequence" in metadata_object: metadata_object["LinkerPrimerSequence"] = "GATACA" try: mangled_name = reverse_file_mangling[metadata_object["filename"]] if mangled_name.find("spec-") != -1: metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G1" elif mangled_name.find("spectwo-") != -1: metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G2" elif mangled_name.find("specthree-") != -1: metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G3" elif mangled_name.find("specfour-") != -1: metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G4" elif mangled_name.find("specfive-") != -1: metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G5" elif mangled_name.find("specsix-") != -1: metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G6" except: print(metadata_object["filename"], "Not Mapped") metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "Not Mapped" output_metadata_filename = os.path.join(args.output_folder, "qiime2_metadata.tsv") output_manifest_filename = os.path.join(args.output_folder, "qiime2_manifest.tsv") for metadatum in object_list: if "sample_name" in metadatum: if len(metadatum["sample_name"]) > 1: metadatum["#SampleID"] = metadatum["sample_name"] metadata_df = pd.DataFrame(object_list) metadata_df.to_csv(output_metadata_filename, index=False, sep="\t", columns=header_list) """Outputting Manifest Filename""" manifest_df = pd.DataFrame() manifest_df["sample_name"] = metadata_df["#SampleID"] manifest_df["filepath"] = metadata_df["filename"] manifest_df.to_csv(output_manifest_filename, index=False, sep=",") """Calling 
remote server to do the calculation""" SERVER_BASE = "http://dorresteinappshub.ucsd.edu:5024" #SERVER_BASE = "http://mingwangbeta.ucsd.edu:5024" files = {'manifest': open(output_manifest_filename, 'r'), \ 'metadata': open(output_metadata_filename, 'r'), \ 'bucket': open(args.cluster_buckets, 'r')} r_post = requests.post(SERVER_BASE + "/processclassic", files=files) response_dict = r_post.json() with open(os.path.join(args.output_folder, "qiime2_table.qza"), 'wb') as f: r = requests.get(SERVER_BASE + response_dict["table_qza"], stream=True) r.raw.decode_content = True shutil.copyfileobj(r.raw, f) with open(os.path.join(args.output_folder, "qiime2_emperor.qzv"), 'wb') as f: r = requests.get(SERVER_BASE + response_dict["emperor_qzv"], stream=True) r.raw.decode_content = True shutil.copyfileobj(r.raw, f)
def main():
    input_filename = sys.argv[1]
    output_tsv = sys.argv[2]

    results_list = ming_fileio_library.parse_table_with_headers_object_list(input_filename)

    results_by_compound_name = defaultdict(list)
    for result in results_list:
        annotation_string = result["Compound_Name"]
        results_by_compound_name[annotation_string].append(result)

    output_results = []
    for compound_name in results_by_compound_name:
        best_result = sorted(results_by_compound_name[compound_name], key=lambda result: float(result["MQScore"]), reverse=True)[0]

        all_RTs = [float(result["RT_Query"]) for result in results_by_compound_name[compound_name]]
        all_MZs = [float(result["SpecMZ"]) for result in results_by_compound_name[compound_name]]
        all_MZ_ppmerror = [float(result["MZErrorPPM"]) for result in results_by_compound_name[compound_name]]

        rt_mean = statistics.mean(all_RTs)
        rt_median = statistics.median(all_RTs)
        mz_mean = statistics.mean(all_MZs)
        mz_ppm_mean = statistics.mean(all_MZ_ppmerror)

        rt_max = max(all_RTs)
        rt_min = min(all_RTs)
        mz_max = max(all_MZs)
        mz_min = min(all_MZs)

        #STDDev
        rt_stdev = 0.0
        mz_stdev = 0.0
        ppmerror_stdev = 0.0
        if len(all_RTs) > 1:
            rt_stdev = statistics.stdev(all_RTs)
            mz_stdev = statistics.stdev(all_MZs)
            ppmerror_stdev = statistics.stdev(all_MZ_ppmerror)

        best_result["rt_mean"] = rt_mean
        best_result["rt_median"] = rt_median
        best_result["mz_mean"] = mz_mean
        best_result["mz_ppm_mean"] = mz_ppm_mean
        best_result["rt_max"] = rt_max
        best_result["rt_min"] = rt_min
        best_result["mz_max"] = mz_max
        best_result["mz_min"] = mz_min
        best_result["rt_stdev"] = rt_stdev
        best_result["mz_stdev"] = mz_stdev
        best_result["ppmerror_stdev"] = ppmerror_stdev
        best_result["number_spectra"] = len(all_RTs)

        output_results.append(best_result)

    ming_fileio_library.write_list_dict_table_data(output_results, output_tsv)
def main():
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectra folder')
    parser.add_argument('json_parameters', help='parallel partition json parameters')
    parser.add_argument('workflow_parameters', help='workflow parameters xml')
    parser.add_argument('library_folder', help='library spectra folder')
    parser.add_argument('result_folder', help='output folder for results')
    parser.add_argument('convert_binary', help='path to convert binary')
    parser.add_argument('librarysearch_binary', help='path to librarysearch binary')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    parallel_json = json.loads(open(args.json_parameters).read())

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)
    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)
    spectra_files.sort()

    print(spectra_files)
    # Take this node's slice of the inputs; "total_paritions" (sic) matches
    # the key name in the upstream partition JSON
    spectra_files = spectra_files[parallel_json["node_partition"]::parallel_json["total_paritions"]]
    print(spectra_files)

    temp_folder = "temp"
    try:
        os.mkdir(temp_folder)
    except:
        print("folder error")

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except:
        print("folder error")

    # Chunk the spectra so each parallel job searches 5 files
    list_of_spectrumfiles = chunks(spectra_files, 5)
    parameter_list = []
    for spectrum_files_chunk in list_of_spectrumfiles:
        param_dict = {}
        param_dict["spectra_files"] = spectrum_files_chunk
        param_dict["temp_folder"] = temp_folder
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        param_dict["params_object"] = params_object
        param_dict["library_files"] = library_files
        parameter_list.append(param_dict)

    #for param_dict in parameter_list:
    #    search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(search_wrapper, parameter_list, 5)

    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
        full_result_list += result_list

    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["SpectrumFile"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    ming_fileio_library.write_list_dict_table_data(full_result_list, os.path.join(args.result_folder, str(uuid.uuid4()) + ".tsv"))
def main(): parser = argparse.ArgumentParser(description='') parser.add_argument('param_xml', help='metadata_folder') parser.add_argument('cluster_buckets', help='cluster_buckets') parser.add_argument('metadata_folder', help='metadata_folder') parser.add_argument('output_folder', help='output_folder') args = parser.parse_args() param_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml, "r")) if param_object["CREATE_CLUSTER_BUCKETS"][0] == "0": print("Do not do things") exit(0) reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_object) """Reading Metadata File""" metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder) object_list = [] if len(metadata_files_in_folder) != 1: for real_name in reverse_file_mangling: mangled_name = reverse_file_mangling[real_name] if mangled_name.find("spec") == -1: continue object_list.append({"filename" : real_name}) else: object_list_temp = ming_fileio_library.parse_table_with_headers_object_list(metadata_files_in_folder[0]) #object_list_temp = pd.read_csv(metadata_files_in_folder[0], sep="\t") object_list = [] for metadata_object in object_list_temp: if len(metadata_object["filename"]) > 1: object_list.append(metadata_object) #Adding all files, if analyzed file is not in list for real_name in reverse_file_mangling: mangled_name = reverse_file_mangling[real_name] if mangled_name.find("spec") == -1: continue found = False for metadata_object in object_list: if os.path.basename(real_name) == metadata_object["filename"]: found = True break if found is False: object_list.append({"filename" : real_name}) #Writing headers header_list = ["#SampleID", "BarcodeSequence", "LinkerPrimerSequence"] for key in object_list[0]: if not key in header_list: header_list.append(key) header_list.append("ATTRIBUTE_GNPSDefaultGroup") for metadata_object in object_list: if not "#SampleID" in metadata_object: if "#SampleID" in metadata_object: metadata_object["#SampleID"] = metadata_object["#SampleID"] else: #Stripping off all non-alphanumeric characters #metadata_object["#SampleID"] = ''.join(ch for ch in metadata_object["filename"] if ch.isalnum()) metadata_object["#SampleID"] = metadata_object["filename"] if not "Description" in metadata_object: metadata_object["Description"] = "LoremIpsum" if not "BarcodeSequence" in metadata_object: metadata_object["BarcodeSequence"] = "GATACA" if not "LinkerPrimerSequence" in metadata_object: metadata_object["LinkerPrimerSequence"] = "GATACA" #Adding default grouping information try: mangled_name = reverse_file_mangling[metadata_object["filename"]] if mangled_name.find("spec-") != -1: metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G1" elif mangled_name.find("spectwo-") != -1: metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G2" elif mangled_name.find("specthree-") != -1: metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G3" elif mangled_name.find("specfour-") != -1: metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G4" elif mangled_name.find("specfive-") != -1: metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G5" elif mangled_name.find("specsix-") != -1: metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G6" except: print(metadata_object["filename"], "Not Mapped") metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "Not Mapped" output_metadata_filename = os.path.join(args.output_folder, "qiime2_metadata.tsv") output_manifest_filename = os.path.join(args.output_folder, "qiime2_manifest.tsv") for metadatum in object_list: if "sample_name" in metadatum: if 
len(metadatum["sample_name"]) > 1: metadatum["#SampleID"] = metadatum["sample_name"] #Removing metadata filenames that are not in the actual data #analysis_files = metadata_df = pd.DataFrame(object_list) metadata_df.to_csv(output_metadata_filename, index=False, sep="\t", columns=header_list) """Outputting Manifest Filename""" manifest_df = pd.DataFrame() manifest_df["sample_name"] = metadata_df["#SampleID"] manifest_df["filepath"] = metadata_df["filename"] manifest_df.to_csv(output_manifest_filename, index=False, sep=",") """Calling remote server to do the calculation""" SERVER_BASE = "http://dorresteinappshub.ucsd.edu:5024" files = {'manifest': open(output_manifest_filename, 'r'), \ 'metadata': open(output_metadata_filename, 'r'), \ 'bucket': open(args.cluster_buckets, 'r')} r_post = requests.post(SERVER_BASE + "/processclassic", files=files) response_dict = r_post.json() with open(os.path.join(args.output_folder, "qiime2_table.qza"), 'wb') as f: r = requests.get(SERVER_BASE + response_dict["table_qza"], stream=True) r.raw.decode_content = True shutil.copyfileobj(r.raw, f) with open(os.path.join(args.output_folder, "qiime2_emperor.qzv"), 'wb') as f: r = requests.get(SERVER_BASE + response_dict["emperor_qzv"], stream=True) r.raw.decode_content = True shutil.copyfileobj(r.raw, f)
def main():
    parser = argparse.ArgumentParser(description='Running msaccess file summaries in parallel')
    parser.add_argument('spectra_folder', help='spectra folder')
    parser.add_argument('workflow_parameters', help='workflow parameters xml')
    parser.add_argument('result_file', help='output result file')
    parser.add_argument('msaccess_binary', help='path to msaccess binary')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)
    spectra_files.sort()

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except:
        print("folder error")

    parameter_list = []
    for spectrum_file in spectra_files:
        param_dict = {}
        param_dict["spectrum_file"] = spectrum_file
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        parameter_list.append(param_dict)

    #for param_dict in parameter_list:
    #    search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(summary_wrapper, parameter_list, 10)

    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        try:
            result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
            for result in result_list:
                output_dict = {}
                output_dict["Filename"] = result["Filename"]
                output_dict["Vendor"] = result["Vendor"]
                output_dict["Model"] = result["Model"]
                output_dict["MS1s"] = result["MS1s"]
                output_dict["MS2s"] = result["MS2s"]
                full_result_list.append(output_dict)
        except:
            #raise
            print("Error", input_file)
            #print(result_list)
            #full_result_list += result_list

    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["Filename"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    ming_fileio_library.write_list_dict_table_data(full_result_list, args.result_file)
def main():
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('params_xml', help='params_xml')
    parser.add_argument('consensus_feature_file', help='Consensus Quantification File')
    parser.add_argument('metadata_folder', help='metadata folder')
    parser.add_argument('mgf_filename', help='mgf_filename')
    parser.add_argument('output_clusterinfo_summary', help='output file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.params_xml))
    task_id = param_obj["task"][0]

    group_to_files_mapping = defaultdict(list)
    attributes_to_groups_mapping = defaultdict(set)

    metadata_files = glob.glob(os.path.join(args.metadata_folder, "*"))
    if len(metadata_files) == 1:
        group_to_files_mapping, attributes_to_groups_mapping = load_group_attribute_mappings(metadata_files[0])

    ROW_NORMALIZATION = "None"
    try:
        ROW_NORMALIZATION = param_obj["QUANT_FILE_NORM"][0]
    except:
        ROW_NORMALIZATION = "None"

    GROUP_COUNT_AGGREGATE_METHOD = "Sum"
    try:
        GROUP_COUNT_AGGREGATE_METHOD = param_obj["GROUP_COUNT_AGGREGATE_METHOD"][0]
    except:
        GROUP_COUNT_AGGREGATE_METHOD = "None"

    quantification_list = ming_fileio_library.parse_table_with_headers_object_list(args.consensus_feature_file, delimiter=",")
    input_filenames, input_filename_headers = determine_input_files(quantification_list[0].keys())

    ### Filling in Quantification table if it is missing values
    for quantification_object in quantification_list:
        ### Handling empty quantification
        for filename in input_filename_headers:
            try:
                if len(quantification_object[filename]) == 0:
                    #print(filename, quantification_object[filename], quantification_object["row ID"])
                    quantification_object[filename] = 0
            except:
                # Value already numeric or column absent; leave it alone
                pass

    print("Number of Features", len(quantification_list))

    #Doing row sum normalization
    if ROW_NORMALIZATION == "RowSum":
        print("ROW SUM NORM")
        for filename_header in input_filename_headers:
            file_quants = [float(quantification_object[filename_header]) for quantification_object in quantification_list]
            for quantification_object in quantification_list:
                quantification_object[filename_header] = float(quantification_object[filename_header]) / sum(file_quants)

    """Loading MS2 Spectra"""
    mgf_collection = ming_spectrum_library.SpectrumCollection(args.mgf_filename)
    mgf_collection.load_from_file()

    clusters_list = []
    for quantification_object in quantification_list:
        cluster_obj = {}
        cluster_obj["cluster index"] = quantification_object["row ID"]
        cluster_obj["precursor mass"] = "{0:.4f}".format(float(quantification_object["row m/z"]))
        cluster_obj["RTConsensus"] = "{0:.4f}".format(float(quantification_object["row retention time"]))

        all_charges = []

        """Checking about the charge of this cluster"""
        try:
            spectrum_object = mgf_collection.scandict[int(cluster_obj["cluster index"])]
            charge = int(spectrum_object.charge)
        except:
            charge = 0

        """Checking if this spectrum has no peaks"""
        # try:
        #     spectrum_object = mgf_collection.scandict[int(cluster_obj["cluster index"])]
        # except:
        #     continue

        all_files = [os.path.basename(filename) for filename in input_filename_headers if float(quantification_object[filename]) > 0]
        abundance_per_file = [(os.path.basename(filename), float(quantification_object[filename])) for filename in input_filename_headers]
        all_abundances = [float(quantification_object[filename]) for filename in input_filename_headers]

        if charge != 0:
            cluster_obj["parent mass"] = "{0:.4f}".format(float(quantification_object["row m/z"]) * charge - charge + 1)
        else:
            cluster_obj["parent mass"] = "{0:.4f}".format(float(quantification_object["row m/z"]))
        cluster_obj["precursor charge"] = charge

        try:
            # all_retention_times is never populated here, so this always
            # falls back to the consensus RT below
            cluster_obj["RTMean"] = statistics.mean(all_retention_times)
            cluster_obj["RTStdErr"] = statistics.stdev(all_retention_times)
        except:
            cluster_obj["RTMean"] = cluster_obj["RTConsensus"]
            cluster_obj["RTStdErr"] = 0

        cluster_obj["GNPSLinkout_Cluster"] = 'https://gnps.ucsd.edu/ProteoSAFe/result.jsp?task=%s&view=view_all_clusters_withID#{"main.cluster index_lowerinput":"%s","main.cluster index_upperinput":"%s"}' % (task_id, quantification_object["row ID"], quantification_object["row ID"])
        #cluster_obj["AllFiles"] = "###".join(all_files)

        cluster_obj["sum(precursor intensity)"] = sum(all_abundances)
        cluster_obj["SumPeakIntensity"] = sum(all_abundances)
        cluster_obj["number of spectra"] = len(all_files)
        cluster_obj["UniqueFileSourcesCount"] = len(all_files)

        group_abundances = determine_group_abundances(group_to_files_mapping, abundance_per_file, operation=GROUP_COUNT_AGGREGATE_METHOD)

        default_groups = ["G1", "G2", "G3", "G4", "G5", "G6"]
        for group in group_to_files_mapping:
            group_header = "GNPSGROUP:" + group
            if group in default_groups:
                continue
            cluster_obj[group_header] = group_abundances[group]

        for group in default_groups:
            cluster_obj[group] = group_abundances[group]

        #Writing attributes
        for attribute in attributes_to_groups_mapping:
            groups_to_include = []
            for group in attributes_to_groups_mapping[attribute]:
                if group_abundances[group] > 0.0:
                    groups_to_include.append(group)
            if len(groups_to_include) == 0:
                cluster_obj[attribute] = ""
            else:
                cluster_obj[attribute] = ",".join(groups_to_include)

        """Enriching the cluster info with adduct collapsing information"""
        enrich_adduct_annotations(cluster_obj, quantification_object)

        clusters_list.append(cluster_obj)

    ming_fileio_library.write_list_dict_table_data(clusters_list, args.output_clusterinfo_summary)
def main():
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('params_xml', help='params_xml')
    parser.add_argument('input_clusterinfo_summary', help='Input cluster info summary')
    parser.add_argument('input_network_pairs_file', help='network_pairs_file')
    parser.add_argument('input_library_search_file', help='library_search_file')
    parser.add_argument('output_clusterinfo_summary', help='output file')
    parser.add_argument('output_component_summary', help='output component file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.params_xml))

    all_clusterinfo_list = ming_fileio_library.parse_table_with_headers_object_list(args.input_clusterinfo_summary)
    library_ids_dict = load_library_id_dict(args.input_library_search_file)
    nodes_to_component, component_to_nodes = load_pairs_dict(args.input_network_pairs_file)

    for cluster in all_clusterinfo_list:
        cluster_index = cluster["cluster index"]
        if cluster_index in nodes_to_component:
            cluster["componentindex"] = nodes_to_component[cluster_index]
            cluster["GNPSLinkout_Network"] = "https://gnps.ucsd.edu/ProteoSAFe/result.jsp?view=network_displayer&componentindex=%s&task=%s&show=true" % (nodes_to_component[cluster_index], param_obj["task"][0])
        else:
            cluster["componentindex"] = "-1"
            cluster["GNPSLinkout_Network"] = 'This Node is a Singleton'

        if cluster_index in library_ids_dict:
            cluster["LibraryID"] = library_ids_dict[cluster_index]["Compound_Name"]
            cluster["MQScore"] = library_ids_dict[cluster_index]["MQScore"]
            cluster["SpectrumID"] = library_ids_dict[cluster_index]["SpectrumID"]
        else:
            cluster["LibraryID"] = "N/A"
            cluster["MQScore"] = "N/A"
            cluster["SpectrumID"] = "N/A"

    ming_fileio_library.write_list_dict_table_data(all_clusterinfo_list, args.output_clusterinfo_summary)

    output_component_list = []
    for componentindex in component_to_nodes:
        output_dict = {}
        output_dict["ComponentIndex"] = componentindex
        output_dict["NodeCount"] = len(component_to_nodes[componentindex])
        output_dict["#Spectra"] = len(component_to_nodes[componentindex])

        all_lib_identifications = []
        for node in component_to_nodes[componentindex]:
            if node in library_ids_dict:
                all_lib_identifications.append(library_ids_dict[node]["Compound_Name"])
        output_dict["AllIDs"] = "!".join(all_lib_identifications)

        output_component_list.append(output_dict)

    ming_fileio_library.write_list_dict_table_data(output_component_list, args.output_component_summary)
def main(): parser = argparse.ArgumentParser(description='') parser.add_argument('param_xml', help='metadata_folder') parser.add_argument('cluster_buckets', help='cluster_buckets') parser.add_argument('metadata_folder', help='metadata_folder') parser.add_argument('output_folder', help='output_folder') parser.add_argument("conda_activate_bin") parser.add_argument("conda_environment") args = parser.parse_args() param_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml, "r")) if param_object["CREATE_CLUSTER_BUCKETS"][0] == "0": print("Do not do things") exit(0) reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_object) """Reading Metadata File""" metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder) object_list = [] if len(metadata_files_in_folder) != 1: for real_name in reverse_file_mangling: mangled_name = reverse_file_mangling[real_name] if mangled_name.find("spec") == -1: continue object_list.append({"filename" : real_name}) else: object_list_temp = ming_fileio_library.parse_table_with_headers_object_list(metadata_files_in_folder[0]) #object_list_temp = pd.read_csv(metadata_files_in_folder[0], sep="\t") object_list = [] for metadata_object in object_list_temp: if len(metadata_object["filename"]) > 1: object_list.append(metadata_object) #Adding all files, if analyzed file is not in list for real_name in reverse_file_mangling: mangled_name = reverse_file_mangling[real_name] if mangled_name.find("spec") == -1: continue found = False for metadata_object in object_list: if os.path.basename(real_name) == metadata_object["filename"]: found = True break if found is False: object_list.append({"filename" : real_name}) if len(object_list) == 0: print("Do not do things, not enough files") exit(0) #Writing headers header_list = ["#SampleID", "BarcodeSequence", "LinkerPrimerSequence"] for key in object_list[0]: if not key in header_list: header_list.append(key) header_list.append("ATTRIBUTE_GNPSDefaultGroup") for metadata_object in object_list: if not "#SampleID" in metadata_object: if "#SampleID" in metadata_object: metadata_object["#SampleID"] = metadata_object["#SampleID"] else: #Stripping off all non-alphanumeric characters #metadata_object["#SampleID"] = ''.join(ch for ch in metadata_object["filename"] if ch.isalnum()) metadata_object["#SampleID"] = metadata_object["filename"] if not "Description" in metadata_object: metadata_object["Description"] = "LoremIpsum" if not "BarcodeSequence" in metadata_object: metadata_object["BarcodeSequence"] = "GATACA" if not "LinkerPrimerSequence" in metadata_object: metadata_object["LinkerPrimerSequence"] = "GATACA" #Adding default grouping information try: mangled_name = reverse_file_mangling[metadata_object["filename"]] if mangled_name.find("spec-") != -1: metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G1" elif mangled_name.find("spectwo-") != -1: metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G2" elif mangled_name.find("specthree-") != -1: metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G3" elif mangled_name.find("specfour-") != -1: metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G4" elif mangled_name.find("specfive-") != -1: metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G5" elif mangled_name.find("specsix-") != -1: metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G6" except: print(metadata_object["filename"], "Not Mapped") metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "Not Mapped" output_metadata_filename = os.path.join(args.output_folder, "qiime2_metadata.tsv") 
output_manifest_filename = os.path.join(args.output_folder, "qiime2_manifest.tsv") for metadatum in object_list: if "sample_name" in metadatum: if len(metadatum["sample_name"]) > 1: metadatum["#SampleID"] = metadatum["sample_name"] metadata_df = pd.DataFrame(object_list) """Outputting Manifest Filename""" manifest_df = pd.DataFrame() manifest_df["sample_name"] = metadata_df["#SampleID"] manifest_df["filepath"] = metadata_df["filename"] manifest_df.to_csv(output_manifest_filename, index=False, sep=",") #Removing protected headers #metadata_df = metadata_df.drop(columns=["feature", "#SampleID"], errors="ignore") metadata_df.to_csv(output_metadata_filename, index=False, sep="\t", columns=header_list) #Running Qiime2 local_qza_table = os.path.join(args.output_folder, "qiime2_table.qza") local_qza_distance = os.path.join(args.output_folder, "qiime2_distance.qza") local_qza_pcoa = os.path.join(args.output_folder, "qiime2_pcoa.qza") local_qzv_emperor = os.path.join(args.output_folder, "qiime2_emperor.qzv") all_cmd = [] all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \ qiime metabolomics import-gnpsnetworkingclusteringbuckettable \ --p-manifest {} \ --p-buckettable {} \ --o-feature-table {}".format(args.conda_activate_bin, args.conda_environment, output_manifest_filename, args.cluster_buckets, local_qza_table)) all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \ qiime diversity beta \ --i-table {} \ --p-metric cosine \ --o-distance-matrix {}".format(args.conda_activate_bin, args.conda_environment, local_qza_table, local_qza_distance)) all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \ qiime diversity pcoa \ --i-distance-matrix {} \ --o-pcoa {}".format(args.conda_activate_bin, args.conda_environment, local_qza_distance, local_qza_pcoa)) all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \ qiime emperor plot \ --i-pcoa {} \ --m-metadata-file {} \ --o-visualization {} \ --p-ignore-missing-samples".format(args.conda_activate_bin, args.conda_environment, local_qza_pcoa, output_metadata_filename, local_qzv_emperor)) for cmd in all_cmd: os.system(cmd)
def main():
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('params_xml', help='params_xml')
    parser.add_argument('consensus_feature_file', help='Consensus Quantification File')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('mgf_filename', help='mgf_filename')
    parser.add_argument('output_clusterinfo_summary', help='output file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.params_xml))
    task_id = param_obj["task"][0]

    group_to_files_mapping = defaultdict(list)
    attributes_to_groups_mapping = defaultdict(set)
    metadata_files = glob.glob(os.path.join(args.metadata_folder, "*"))
    if len(metadata_files) == 1:
        group_to_files_mapping, attributes_to_groups_mapping = load_group_attribute_mappings(metadata_files[0])

    ROW_NORMALIZATION = param_obj.get("QUANT_FILE_NORM", ["None"])[0]
    GROUP_COUNT_AGGREGATE_METHOD = param_obj.get("GROUP_COUNT_AGGREGATE_METHOD", ["None"])[0]

    quantification_list = ming_fileio_library.parse_table_with_headers_object_list(args.consensus_feature_file, delimiter=",")
    input_filenames, input_filename_headers = determine_input_files(quantification_list[0].keys())

    ### Filling in the quantification table where it is missing values
    for quantification_object in quantification_list:
        for filename in input_filename_headers:
            try:
                if len(quantification_object[filename]) == 0:
                    quantification_object[filename] = 0
            except Exception:
                pass

    print("Number of Features", len(quantification_list))

    # Per-file normalization: divide each file's column by that column's total
    if ROW_NORMALIZATION == "RowSum":
        print("ROW SUM NORM")
        for filename_header in input_filename_headers:
            file_total = sum(float(quantification_object[filename_header]) for quantification_object in quantification_list)
            for quantification_object in quantification_list:
                quantification_object[filename_header] = float(quantification_object[filename_header]) / file_total

    """Loading MS2 Spectra"""
    mgf_collection = ming_spectrum_library.SpectrumCollection(args.mgf_filename)
    mgf_collection.load_from_file()

    clusters_list = []
    for quantification_object in quantification_list:
        cluster_obj = {}
        cluster_obj["cluster index"] = quantification_object["row ID"]
        cluster_obj["precursor mass"] = "{0:.4f}".format(float(quantification_object["row m/z"]))
        cluster_obj["RTConsensus"] = "{0:.4f}".format(float(quantification_object["row retention time"]))

        """Checking the charge of this cluster"""
        try:
            spectrum_object = mgf_collection.scandict[int(cluster_obj["cluster index"])]
            charge = int(spectrum_object.charge)
        except Exception:
            charge = 0

        """Checking if this spectrum has no peaks"""
        # try:
        #     spectrum_object = mgf_collection.scandict[int(cluster_obj["cluster index"])]
        # except:
        #     continue

        all_files = [os.path.basename(filename) for filename in input_filename_headers if float(quantification_object[filename]) > 0]
        abundance_per_file = [(os.path.basename(filename), float(quantification_object[filename])) for filename in input_filename_headers]
        all_abundances = [float(quantification_object[filename]) for filename in input_filename_headers]

        # (m/z * z) - (z - 1) protons recovers the singly protonated parent mass
        if charge != 0:
            cluster_obj["parent mass"] = "{0:.4f}".format(float(quantification_object["row m/z"]) * charge - charge + 1)
        else:
            cluster_obj["parent mass"] = "{0:.4f}".format(float(quantification_object["row m/z"]))
        cluster_obj["precursor charge"] = charge

        # Per-spectrum retention times are not available in the consensus
        # feature table, so RTMean/RTStdErr fall back to the consensus RT
        all_retention_times = []
        try:
            cluster_obj["RTMean"] = statistics.mean(all_retention_times)
            cluster_obj["RTStdErr"] = statistics.stdev(all_retention_times)
        except statistics.StatisticsError:
            cluster_obj["RTMean"] = cluster_obj["RTConsensus"]
            cluster_obj["RTStdErr"] = 0

        cluster_obj["GNPSLinkout_Cluster"] = 'https://gnps.ucsd.edu/ProteoSAFe/result.jsp?task=%s&view=view_all_clusters_withID&show=true#{"main.cluster index_lowerinput":"%s","main.cluster index_upperinput":"%s"}' % (task_id, quantification_object["row ID"], quantification_object["row ID"])
        # cluster_obj["AllFiles"] = "###".join(all_files)
        cluster_obj["sum(precursor intensity)"] = sum(all_abundances)
        cluster_obj["SumPeakIntensity"] = sum(all_abundances)
        cluster_obj["number of spectra"] = len(all_files)
        cluster_obj["UniqueFileSourcesCount"] = len(all_files)

        group_abundances = determine_group_abundances(group_to_files_mapping, abundance_per_file, operation=GROUP_COUNT_AGGREGATE_METHOD)

        default_groups = ["G1", "G2", "G3", "G4", "G5", "G6"]
        for group in group_to_files_mapping:
            if group in default_groups:
                continue
            cluster_obj["GNPSGROUP:" + group] = group_abundances[group]

        for group in default_groups:
            cluster_obj[group] = group_abundances[group]

        # Writing attributes: list the groups with non-zero abundance
        for attribute in attributes_to_groups_mapping:
            groups_to_include = [group for group in attributes_to_groups_mapping[attribute] if group_abundances[group] > 0.0]
            cluster_obj[attribute] = ",".join(groups_to_include)

        """Enriching the cluster info with adduct collapsing information"""
        enrich_adduct_annotations(cluster_obj, quantification_object)

        clusters_list.append(cluster_obj)

    ming_fileio_library.write_list_dict_table_data(clusters_list, args.output_clusterinfo_summary)
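
# determine_group_abundances is defined elsewhere in this script; the sketch
# below illustrates the aggregation it appears to perform, assuming the
# operation is "Sum" or "Mean" per GROUP_COUNT_AGGREGATE_METHOD. This is an
# assumption-based illustration, not the actual helper.
import statistics
from collections import defaultdict

def determine_group_abundances_sketch(group_to_files, abundance_per_file, operation="Sum"):
    abundance_lookup = dict(abundance_per_file)  # filename -> abundance
    # Returning a defaultdict so absent groups read as 0.0, matching how the
    # caller indexes the default groups G1..G6 unconditionally
    group_abundances = defaultdict(float)
    for group, filenames in group_to_files.items():
        values = [abundance_lookup.get(filename, 0.0) for filename in filenames]
        if operation == "Mean":
            group_abundances[group] = statistics.mean(values) if values else 0.0
        else:
            # "Sum" and any unrecognized value fall back to summing
            group_abundances[group] = sum(values)
    return group_abundances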
def trace_filename_filesystem(all_datasets, dataset_accession, dataset_scan, enrichmetadata=False):
    output_file_list = []
    output_match_list = []

    for dataset_object in all_datasets:
        if dataset_object["dataset"] != dataset_accession:
            continue

        networking_job = ming_gnps_library.get_most_recent_continuous_networking_of_dataset(dataset_object["task"])
        if networking_job is None:
            continue

        networking_task_info = ming_proteosafe_library.get_task_information("gnps.ucsd.edu", networking_job["task"])
        task_user = networking_task_info["user"]

        clustering_path = os.path.join("/data/ccms-data/tasks", task_user, networking_job["task"], "allclustered_spectra_info_withpath")
        clustering_files = ming_fileio_library.list_files_in_dir(clustering_path)
        if len(clustering_files) != 1:
            continue

        clustering_membership_list = ming_fileio_library.parse_table_with_headers_object_list(clustering_files[0])
        acceptable_raw_spectra = [spectrum for spectrum in clustering_membership_list if spectrum["cluster index"] == str(dataset_scan)]

        for raw_spectrum in acceptable_raw_spectra:
            output_object = {}
            output_object["dataset_id"] = dataset_accession
            output_object["cluster_scan"] = dataset_scan
            output_object["filename"] = raw_spectrum["Original_Path"]
            output_object["filescan"] = raw_spectrum["ScanNumber"]
            output_object["metadata"] = ""
            output_object["basefilename"] = os.path.basename(raw_spectrum["Original_Path"])

            if enrichmetadata:
                try:
                    metadata_list = get_metadata_information_per_filename(raw_spectrum["Original_Path"])
                    output_object["metadata"] = "|".join(metadata_list)
                except Exception:
                    print("ReDU is down")

            output_match_list.append(output_object)

        print("Matched raw spectra:", len(acceptable_raw_spectra))

        unique_files = list(set(spectrum["Original_Path"] for spectrum in acceptable_raw_spectra))
        print("Unique source files:", len(unique_files))

        for source_file in unique_files:
            output_object = {}
            output_object["dataset_id"] = dataset_accession
            output_object["cluster_scan"] = dataset_scan
            output_object["filename"] = source_file
            output_object["metadata"] = ""
            output_object["basefilename"] = os.path.basename(source_file)

            if enrichmetadata:
                try:
                    metadata_list = get_metadata_information_per_filename(source_file)
                    output_object["metadata"] = "|".join(metadata_list)
                except Exception:
                    print("ReDU is down")

            output_file_list.append(output_object)

    # Fixing paths because files were renamed from <dataset>/spectrum to <dataset>/ccms_peak
    for file_dict in output_file_list:
        splits = file_dict["filename"].split("/")
        splits[1] = splits[1].replace("spectrum", "ccms_peak")
        file_dict["filename"] = "/".join(splits)

    for file_dict in output_match_list:
        splits = file_dict["filename"].split("/")
        splits[1] = splits[1].replace("spectrum", "ccms_peak")
        file_dict["filename"] = "/".join(splits)

    return output_file_list, output_match_list
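
# The path fix at the end of trace_filename_filesystem rewrites the second
# path component because dataset files moved from <dataset>/spectrum to
# <dataset>/ccms_peak. A self-contained sketch of that rewrite; the path in
# the comment is a made-up example:
def rewrite_spectrum_path(ccms_path):
    # "MSV000079514/spectrum/raw/a.mzXML" -> "MSV000079514/ccms_peak/raw/a.mzXML"
    splits = ccms_path.split("/")
    if len(splits) > 1:
        splits[1] = splits[1].replace("spectrum", "ccms_peak")
    return "/".join(splits)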
def main():
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectra folder')
    parser.add_argument('json_parameters', help='parallelism partition json')
    parser.add_argument('workflow_parameters', help='proteosafe xml parameters')
    parser.add_argument('library_folder', help='library folder')
    parser.add_argument('result_folder', help='output folder for results')
    parser.add_argument('convert_binary', help='conversion binary')
    parser.add_argument('librarysearch_binary', help='library search binary')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    parallel_json = json.loads(open(args.json_parameters).read())

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)
    spectra_files.sort()
    print("All spectra files:", spectra_files)

    # Selecting this node's share of the files; "total_paritions" is the
    # key's spelling in the partition json
    spectra_files = spectra_files[parallel_json["node_partition"]::parallel_json["total_paritions"]]
    print("This partition's spectra files:", spectra_files)

    temp_folder = "temp"
    os.makedirs(temp_folder, exist_ok=True)

    tempresults_folder = "tempresults"
    os.makedirs(tempresults_folder, exist_ok=True)

    parameter_list = []
    for spectrum_files_chunk in chunks(spectra_files, 5):
        param_dict = {}
        param_dict["spectra_files"] = spectrum_files_chunk
        param_dict["temp_folder"] = temp_folder
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        param_dict["params_object"] = params_object
        param_dict["library_files"] = library_files
        parameter_list.append(param_dict)

    # Serial execution for debugging:
    # for param_dict in parameter_list:
    #     search_wrapper(param_dict)

    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(search_wrapper, parameter_list, 5)

    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        full_result_list += ming_fileio_library.parse_table_with_headers_object_list(input_file)

    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["SpectrumFile"])
        result_object["full_CCMS_path"] = mangled_mapping[mangled_name]

    ming_fileio_library.write_list_dict_table_data(full_result_list, os.path.join(args.result_folder, str(uuid.uuid4()) + ".tsv"))
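
# chunks is defined elsewhere in this script; the conventional implementation,
# shown here as an assumption, yields successive fixed-size slices of a list:
def chunks_sketch(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

# list(chunks_sketch([1, 2, 3, 4, 5], 2)) == [[1, 2], [3, 4], [5]]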
def main():
    parser = argparse.ArgumentParser(description='Creates enriched cluster info summary')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('input_clusterinfo_file', help='input_clusterinfo_file')
    parser.add_argument('input_clusterinfosummary_file', help='input_clusterinfosummary_file')
    parser.add_argument('input_group_mapping_filename', help='input_group_mapping_filename')
    parser.add_argument('input_attribute_mapping_filename', help='input_attribute_mapping_filename')
    parser.add_argument('input_networking_pairs', help='input_networking_pairs')
    parser.add_argument('input_library_search', help='input_library_search')
    parser.add_argument('output_clusterinfosummary_filename', help='output_clusterinfosummary_filename')
    args = parser.parse_args()

    """Loading group filenames"""
    group_to_files, files_to_groups = load_group_mapping(args.input_group_mapping_filename)
    print("Loaded Group Mapping")

    cluster_summary_list = ming_fileio_library.parse_table_with_headers_object_list(args.input_clusterinfosummary_file)
    print("Loaded Cluster Summary")

    attribute_to_groups = load_attribute_mapping(args.input_attribute_mapping_filename)

    params_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    CLUSTER_MIN_SIZE = int(params_object["CLUSTER_MIN_SIZE"][0])
    RUN_MSCLUSTER = params_object["RUN_MSCLUSTER"][0]

    # Calculating the spectrum counts per group
    cluster_to_group_counts = defaultdict(lambda: defaultdict(int))
    cluster_to_files = defaultdict(set)
    cluster_to_RT = defaultdict(list)

    line_count = 0
    for line in open(args.input_clusterinfo_file):
        line_count += 1
        if line_count == 1:
            continue  # header line
        if line_count % 10000 == 0:
            print(line_count)

        splits = line.rstrip().split("\t")
        cluster_index = splits[0]
        filename = os.path.basename(splits[1])
        rt = float(splits[6])

        group_membership = files_to_groups[filename]
        cluster_to_files[cluster_index].add(filename)
        cluster_to_RT[cluster_index].append(rt)

        for group in group_membership:
            cluster_to_group_counts[cluster_index][group] += 1

    if RUN_MSCLUSTER == "on":
        cluster_summary_list = filter_clusters_based_on_cluster_size(cluster_summary_list, CLUSTER_MIN_SIZE)
    print("Number of clusters:", len(cluster_summary_list))

    print("Setting up grouping", len(group_to_files.keys()))
    for cluster_summary_object in cluster_summary_list:
        cluster_index = cluster_summary_object["cluster index"]
        for group in group_to_files:
            cluster_summary_object[group] = cluster_to_group_counts[cluster_index].get(group, 0)

        for attribute in attribute_to_groups:
            groups_to_include = []
            for group in attribute_to_groups[attribute]:
                if group in cluster_summary_object and cluster_summary_object[group] > 0:
                    groups_to_include.append(group)
            cluster_summary_object[attribute] = ",".join(groups_to_include).replace("GNPSGROUP:", "")

    print("Default Attributes")
    calculate_default_attributes(cluster_summary_list, group_to_files.keys())

    print("calculate_cluster_file_stats")
    calculate_cluster_file_stats(cluster_summary_list, cluster_to_files, mangled_mapping)

    print("rt stats")
    calculate_rt_stats(cluster_summary_list, cluster_to_RT)

    print("populate_network_component")
    populate_network_component(cluster_summary_list, args.input_networking_pairs)

    print("calculate_ancillary_information")
    calculate_ancillary_information(cluster_summary_list, params_object["task"][0])

    print("populate_network_identifications")
    populate_network_identifications(cluster_summary_list, args.input_library_search)

    ming_fileio_library.write_list_dict_table_data(cluster_summary_list, args.output_clusterinfosummary_filename)
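
# filter_clusters_based_on_cluster_size is defined elsewhere; a plausible
# minimal sketch, assuming it keeps clusters whose spectrum count meets the
# CLUSTER_MIN_SIZE threshold. The "number of spectra" field name and the >=
# comparison are assumptions, not confirmed from the actual helper.
def filter_clusters_based_on_cluster_size_sketch(cluster_summary_list, min_cluster_size):
    return [
        cluster for cluster in cluster_summary_list
        if int(cluster.get("number of spectra", 0)) >= min_cluster_size
    ]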