def main():
    """Select this node's round-robin slice of consolidation tasks and grab their results."""
    parallel_json = json.loads(open(sys.argv[1]).read())
    params_filename = sys.argv[2]
    task_id_file = sys.argv[3]
    output_peptide_folder = sys.argv[4]
    output_psm_folder = sys.argv[5]
    #output_summary = sys.argv[5]

    params_dict = ming_proteosafe_library.parse_xml_file(open(params_filename))
    source_tasks_text = params_dict["tasks_to_consolidate"][0]
    row_count, task_file_table = ming_fileio_library.parse_table_with_headers(task_id_file)

    my_node = parallel_json["node_partition"]
    # NOTE(review): "total_paritions" is misspelled but is the key the partition JSON
    # is written with elsewhere — must stay as-is for compatibility.
    total_node = parallel_json["total_paritions"]

    # Per-node summary file lives under the folder given in argv[6]
    output_summary = os.path.join(sys.argv[6], str(my_node))

    if len(source_tasks_text) > 0:
        source_tasks_list = json.loads(source_tasks_text)
        source_tasks_list += task_file_table["TASKID"]
        # Sort first so every node slices the same ordering deterministically
        source_tasks_list.sort()
        source_tasks_list = source_tasks_list[my_node::total_node]
        grab_all_results(source_tasks_list, output_peptide_folder, output_psm_folder, output_summary, params_dict)
    else:
        open(output_summary, "w").write("None")
def main():
    """Filter the network pairs file by top-K edges per node and max component size."""
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('networking_pairs_results_file', help='networking_pairs_results_file')
    parser.add_argument('networking_pairs_results_file_filtered', help='networking_pairs_results_file_filtered')
    parser.add_argument('networking_pairs_results_file_filtered_classic_output', help='networking_pairs_results_file_filtered_classic_output')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.proteosafe_parameters))

    # Defaults used when the workflow parameters omit these knobs
    top_k_val = 10
    max_component_size = 0
    if "TOPK" in param_obj:
        top_k_val = int(param_obj["TOPK"][0])
    if "MAXIMUM_COMPONENT_SIZE" in param_obj:
        max_component_size = int(param_obj["MAXIMUM_COMPONENT_SIZE"][0])

    G = molecular_network_filtering_library.loading_network(args.networking_pairs_results_file, hasHeaders=True)
    # None means the input had no edges at all; nothing to filter or write
    if G == None:
        exit(0)

    molecular_network_filtering_library.filter_top_k(G, top_k_val)
    molecular_network_filtering_library.filter_component(G, max_component_size)
    molecular_network_filtering_library.output_graph_with_headers(G, args.networking_pairs_results_file_filtered)
    molecular_network_filtering_library.output_graph(G, args.networking_pairs_results_file_filtered_classic_output)
def main():
    """Partition the list of public datasets into N JSON files, one per parallel node."""
    paramxml_input_filename = sys.argv[1]
    output_json_folder = sys.argv[2]
    parallelism = int(sys.argv[3])

    params_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_input_filename))
    # Only fan out when public-data matching was requested; otherwise a single partition
    try:
        if params_obj["FIND_MATCHES_IN_PUBLIC_DATA"][0] != "1":
            parallelism = 1
    except:
        parallelism = 1

    #dataset_dict = ming_proteosafe_library.get_all_dataset_dict()
    all_datasets = ming_proteosafe_library.get_all_datasets()

    for partition_index in range(parallelism):
        # NOTE(review): "total_paritions" misspelling is the key consumers read — keep it.
        output_map = {"node_partition": partition_index, "total_paritions": parallelism}
        partitioned_datasets = all_datasets[partition_index::parallelism]
        output_map["all_datasets"] = partitioned_datasets

        dataset_map = {}
        for dataset in partitioned_datasets:
            dataset_map[dataset["dataset"]] = dataset
        output_map["dataset_dict"] = dataset_map

        output_filename = os.path.join(output_json_folder, str(partition_index) + ".json")
        open(output_filename, "w").write(json.dumps(output_map))
def create_bucket_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, output_filename, metadata_mapping):
    """Write a bucket table: one row per network cluster, one column per spectrum input file.

    Cell values are summed precursor intensities (each spectrum's contribution floored
    at 1.0). Column headers come from metadata_mapping when present, otherwise the
    demangled file name without extension.
    """
    output_file = open(output_filename, "w")
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
    param_object = ming_proteosafe_library.parse_xml_file(open(param_filename, "r"))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_object)

    # Only clusters that survived network filtering are bucketed
    clusters_in_network = set()
    for row in csv.DictReader(open(clusterinfosummary_filename), delimiter='\t'):
        clusters_in_network.add(row["cluster index"])

    cluster_index_to_file_map = {}
    clusters_map = {}
    all_files = {}
    for i in range(line_counts):
        cluster_number = table_data["#ClusterIdx"][i]
        if not(cluster_number in clusters_in_network):
            continue
        if not (cluster_number in clusters_map):
            clusters_map[cluster_number] = []
            cluster_index_to_file_map[cluster_number] = {}
            # Seed every known input file at 0 so each cluster row has all columns
            for mangled_name in mangled_mapping.keys():
                cluster_index_to_file_map[cluster_number][mangled_name] = 0.0
        #print table_data["#Filename"][i].split("/")[1]
        mangled_filename_only = os.path.basename(table_data["#Filename"][i])
        # Floor at 1.0 so a present spectrum never contributes zero
        cluster_index_to_file_map[cluster_number][mangled_filename_only] += max(float(table_data["#PrecIntensity"][i]), 1.0)
        spectrum_info = {"filename": table_data["#Filename"][i], "intensity": table_data["#PrecIntensity"][i]}
        all_files[table_data["#Filename"][i]] = 1
        clusters_map[cluster_number].append(spectrum_info)

    # Header row: only mangled names containing "spec" are spectrum inputs
    output_header_list = ["#OTU ID"]
    for header in mangled_mapping.keys():
        if header.find("spec") == -1:
            continue
        real_basename = os.path.basename(mangled_mapping[header])
        if real_basename in metadata_mapping:
            output_header_list.append(metadata_mapping[real_basename])
        else:
            output_header_list.append(ming_fileio_library.get_filename_without_extension(real_basename))
    output_file.write("\t".join(output_header_list) + "\n")

    # One data row per cluster, columns in the same order as the header
    for cluster_idx in cluster_index_to_file_map:
        line_output_list = [str(cluster_idx)]
        for header in mangled_mapping.keys():
            if header.find("spec") == -1:
                continue
            line_output_list.append(str(cluster_index_to_file_map[cluster_idx][header]))
        output_file.write("\t".join(line_output_list) + "\n")
    output_file.close()
def main():
    """Load a network, apply top-K and component-size filtering, write classic output."""
    usage()
    output_file_path = sys.argv[2]
    input_file_path = sys.argv[4]
    params_file_path = sys.argv[5]

    # Defaults when the parameters omit these options
    top_k_val = 10
    max_component_size = 0
    params = ming_proteosafe_library.parse_xml_file(open(params_file_path, "r"))
    if "TOPK" in params:
        top_k_val = int(params["TOPK"][0])
    if "MAXIMUM_COMPONENT_SIZE" in params:
        max_component_size = int(params["MAXIMUM_COMPONENT_SIZE"][0])

    #Doing other filtering
    G = molecular_network_filtering_library.loading_network(input_file_path, hasHeaders=True)
    #Returning None means that there are no edges in the output
    if G == None:
        exit(0)

    molecular_network_filtering_library.filter_top_k(G, top_k_val)
    molecular_network_filtering_library.filter_component(G, max_component_size)
    molecular_network_filtering_library.output_graph(G, output_file_path)
def main():
    """Split the public-dataset list into per-node JSON partition files."""
    paramxml_input_filename = sys.argv[1]
    output_json_folder = sys.argv[2]
    parallelism = int(sys.argv[3])

    params_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_input_filename))
    # Fall back to a single partition unless public-data matching is enabled
    try:
        if params_obj["FIND_MATCHES_IN_PUBLIC_DATA"][0] != "1":
            parallelism = 1
    except:
        parallelism = 1

    #dataset_dict = ming_proteosafe_library.get_all_dataset_dict()
    all_datasets = ming_proteosafe_library.get_all_datasets()

    for node in range(parallelism):
        # NOTE(review): "total_paritions" misspelling is the key readers expect — keep it.
        output_map = {"node_partition": node, "total_paritions": parallelism}
        node_datasets = all_datasets[node::parallelism]
        output_map["all_datasets"] = node_datasets

        dataset_map = {}
        for dataset in node_datasets:
            dataset_map[dataset["dataset"]] = dataset
        output_map["dataset_dict"] = dataset_map

        open(os.path.join(output_json_folder, str(node) + ".json"), "w").write(json.dumps(output_map))
def main():
    """Apply top-K and maximum-component-size filters to a network and write it out."""
    usage()
    output_file_path = sys.argv[2]
    input_file_path = sys.argv[4]
    params_file_path = sys.argv[5]

    params = ming_proteosafe_library.parse_xml_file(open(params_file_path, "r"))
    # Defaults when the workflow parameters don't specify them
    top_k_val = int(params["TOPK"][0]) if "TOPK" in params else 10
    max_component_size = int(params["MAXIMUM_COMPONENT_SIZE"][0]) if "MAXIMUM_COMPONENT_SIZE" in params else 0

    #Doing other filtering
    G = molecular_network_filtering_library.loading_network(input_file_path, hasHeaders=True)
    #Returning None means that there are no edges in the output
    if G == None:
        exit(0)

    molecular_network_filtering_library.filter_top_k(G, top_k_val)
    molecular_network_filtering_library.filter_component(G, max_component_size)
    molecular_network_filtering_library.output_graph(G, output_file_path)
def main():
    """Run the KL summary executable on every file in a folder and merge the summaries.

    FIX: the original concatenated raw paths into a shell command string — any file
    name with spaces or shell metacharacters broke the command (or worse). Paths are
    now quoted with shlex.quote; the shell redirection stays outside the quoting.
    """
    import shlex  # function-scope import: top-of-file import block not visible here

    input_param = ming_proteosafe_library.parse_xml_file(open(sys.argv[1]))
    input_folder = sys.argv[2]
    output_file = sys.argv[3]
    scratch_folder = sys.argv[4]
    path_to_executable = sys.argv[5]
    path_to_isotopes_table = sys.argv[6]

    #parent_mass_tolerance = input_param[]
    parent_mass_tolerance = 0.05

    all_input_file_paths = ming_fileio_library.list_files_in_dir(input_folder)

    output_kl_intermediates = []
    for input_file in all_input_file_paths:
        output_kl_file = os.path.join(scratch_folder, os.path.basename(input_file) + ".kl")
        cmd = "%s --input %s --output_summary %s --peak_tolerance %s --isotope_file %s >/dev/null 2>&1 " % (
            shlex.quote(path_to_executable),
            shlex.quote(input_file),
            shlex.quote(output_kl_file),
            str(parent_mass_tolerance),
            shlex.quote(path_to_isotopes_table))
        print(cmd)
        os.system(cmd)
        #subprocess.call([cmd])
        output_kl_intermediates.append(output_kl_file)

    # Concatenate all intermediate tables column-wise by header
    combined_table = defaultdict(list)
    for output_kl_file in output_kl_intermediates:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(output_kl_file)
        for key in table_data:
            combined_table[key] += table_data[key]

    ming_fileio_library.write_dictionary_table_data(combined_table, output_file)
def create_bucket_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, output_filename):
    """Write a bucket table (cluster x input-file precursor intensity sums).

    Rows are clusters present in the cluster summary; columns are all mangled input
    files. Skips output entirely (writes "No Output") unless CREATE_CLUSTER_BUCKETS
    is "1".

    FIX: the original never closed output_file on either exit path, relying on
    interpreter shutdown to flush; the handle is now closed explicitly.
    """
    param_object = ming_proteosafe_library.parse_xml_file(open(param_filename, "r"))
    output_file = open(output_filename, "w")

    if param_object["CREATE_CLUSTER_BUCKETS"][0] != "1":
        output_file.write("No Output")
        output_file.close()  # FIX: flush/close before the early return
        return

    test_network = molecular_network_library.MolecularNetwork()
    test_network.load_clustersummary(clusterinfosummary_filename)

    line_counts, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_object)

    cluster_index_to_file_map = {}
    clusters_map = {}
    all_files = {}
    for i in range(line_counts):
        cluster_number = table_data["#ClusterIdx"][i]
        # Only clusters known to the loaded network summary are included
        if test_network.get_cluster_index(cluster_number) == None:
            continue
        if not (cluster_number in clusters_map):
            clusters_map[cluster_number] = []
            cluster_index_to_file_map[cluster_number] = {}
            #Adding all file names to mapping
            for mangled_name in mangled_mapping.keys():
                cluster_index_to_file_map[cluster_number][mangled_name] = 0.0
        #print table_data["#Filename"][i].split("/")[1]
        mangled_filename_only = os.path.basename(table_data["#Filename"][i])
        cluster_index_to_file_map[cluster_number][mangled_filename_only] += float(table_data["#PrecIntensity"][i])
        spectrum_info = {
            "filename": table_data["#Filename"][i],
            "intensity": table_data["#PrecIntensity"][i]
        }
        all_files[table_data["#Filename"][i]] = 1
        clusters_map[cluster_number].append(spectrum_info)

    # Header: demangled basenames, tab separated (trailing tab preserved from original format)
    output_header = "#OTU ID\t"
    for header in mangled_mapping.keys():
        output_header += os.path.basename(mangled_mapping[header]) + "\t"
    output_file.write(output_header + "\n")

    for cluster_idx in cluster_index_to_file_map:
        line_string = str(cluster_idx) + "\t"
        for header in mangled_mapping.keys():
            line_string += str(cluster_index_to_file_map[cluster_idx][header]) + "\t"
        #print line_string
        output_file.write(line_string + "\n")
    output_file.close()  # FIX: original leaked the open handle
def main():
    """Add decoys and FDR to PSM results, then filter by the configured FDR strategy."""
    params = ming_proteosafe_library.parse_xml_file(open(sys.argv[1]))
    proteome = ming_protein_library.parse_fasta_proteome_file(sys.argv[2])
    row_count, table_data = ming_fileio_library.parse_table_with_headers(sys.argv[3])
    decoy_marker = sys.argv[5]

    add_decoy_to_results(table_data, row_count, decoy_marker)
    psm_results = add_fdr_to_results(table_data, row_count)

    output_table = defaultdict(list)

    #Performing filters
    filter_type = params["filter.filter"][0]
    if filter_type == "FDR":
        fdr_threshold = float(params["FDR.FDR"][0])
        for psm in psm_results:
            if psm["QValue"] < fdr_threshold:
                for key in psm:
                    output_table[key].append(psm[key])
    if filter_type == "PepFDR":
        # Peptide-level filter also requires the PSM-level QValue to pass
        fdr_threshold = float(params["PepFDR.PepFDR"][0])
        for psm in psm_results:
            if psm["PepQValue"] < fdr_threshold and psm["QValue"] < fdr_threshold:
                for key in psm:
                    output_table[key].append(psm[key])
    if filter_type == "FPR":
        print("Lets do nothing, don't know what this is")

    ming_fileio_library.write_dictionary_table_data(output_table, sys.argv[4])
def load_parameters_file(self, paramsfilename):
    """Parse a ProteoSAFe params XML and cache its mangled->original filename mapping on self."""
    #Loading the file mapping
    parsed_parameters = ming_proteosafe_library.parse_xml_file(open(paramsfilename, "r"))
    self.mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(parsed_parameters)
def main():
    """Write one alignment .params file per parallel partition of an input MGF.

    FIX: the original opened each partition's params file in the loop and never
    closed it — leaking a handle per partition and deferring flushes to GC. Each
    file is now written inside a context manager.
    """
    parser = argparse.ArgumentParser(description='Create parallel parameters')
    parser.add_argument('mgf_filename', help='Input mgf file to network')
    parser.add_argument('workflow_parameters', help='proteosafe xml parameters')
    parser.add_argument('parameters_output_folder', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))

    #Determing number of spectra in mgf file
    number_of_spectra, number_real_spectra = number_scans_in_mgf_file(args.mgf_filename)

    parallelism = args.parallelism
    if parallelism > number_of_spectra:
        parallelism = 1

    # Aim for roughly 1000 real spectra per partition, never below 1 partition
    recommended_parallelism = max(1, int(number_real_spectra / 1000))
    print("recommended_parallelism", recommended_parallelism)
    parallelism = min(recommended_parallelism, parallelism)

    number_per_partition = int(number_of_spectra / parallelism)
    for i in range(parallelism):
        # FIX: context manager guarantees the params file is flushed and closed
        with open(os.path.join(args.parameters_output_folder, str(i) + ".params"), "w") as output_parameter_file:
            output_parameter_file.write("ALIGNS_FORMAT=%s\n" % ("tsv"))
            output_parameter_file.write("MIN_MATCHED_PEAKS=%s\n" % (params_object["MIN_MATCHED_PEAKS"][0]))
            output_parameter_file.write("TOLERANCE_PEAK=%s\n" % (params_object["tolerance.Ion_tolerance"][0]))
            output_parameter_file.write("TOLERANCE_PM=%s\n" % (params_object["tolerance.PM_tolerance"][0]))
            output_parameter_file.write("PAIRS_MIN_COSINE=%s\n" % (params_object["PAIRS_MIN_COSINE"][0]))
            #output_parameter_file.write("MAX_SHIFT=%s\n" % (params_object["MAX_SHIFT"][0]))
            output_parameter_file.write("MAX_SHIFT=%s\n" % ("9999"))
            output_parameter_file.write("MIN_RATIO=%s\n" % ("0.4"))
            output_parameter_file.write("INPUT_SPECTRA_MS2=%s\n" % (args.mgf_filename))

            start_idx = number_per_partition * i
            end_idx = number_per_partition * (i + 1) - 1
            if i == parallelism - 1:
                # Last partition absorbs the remainder of the spectra
                end_idx = number_of_spectra
            output_parameter_file.write("IDX_START=%d\n" % (start_idx))
            output_parameter_file.write("IDX_END=%d\n" % (end_idx))
def main():
    """Create ili-compatible quantification output plus a redirect/status HTML page.

    Output is only produced when CREATE_ILI_OUTPUT is "1" and exactly one metadata
    file with coordinates is present; the HTML page reflects how many STL models
    were uploaded.
    """
    param_filename = sys.argv[1]
    metadata_folder = sys.argv[2]
    input_clusterinfo_file = sys.argv[3]
    input_clusterinfosummary = sys.argv[4]
    ili_stl_model_folder = sys.argv[5]
    output_ili_filename = sys.argv[6]
    view_ili_html_filename = sys.argv[7]

    param_object = ming_proteosafe_library.parse_xml_file(open(param_filename, "r"))
    create_output = True
    try:
        if param_object["CREATE_ILI_OUTPUT"][0] != "1":
            create_output = False
    except:
        create_output = False

    if create_output:
        stl_files = ming_fileio_library.list_files_in_dir(ili_stl_model_folder)
        metadata_files = ming_fileio_library.list_files_in_dir(metadata_folder)

        # Exactly one metadata file is required to supply the coordinates
        if len(metadata_files) != 1:
            print("Metadata file not provided, cannot create ili compatible output without coordinates")
            exit(1)

        filename_coordinate_mapping = load_filename_to_coordinate_mapping(metadata_files[0])
        create_ili_output_from_clusterinfo(input_clusterinfo_file, param_filename, input_clusterinfosummary, filename_coordinate_mapping, output_ili_filename)

        # The three cases below are mutually exclusive counts of uploaded STL models
        if len(stl_files) == 1:
            html_out = open(view_ili_html_filename, "w")
            html_out.write("<script>\n")
            html_out.write('window.location.replace("https://ili.embl.de/?https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=ili_stl_model/ili_stl_model-00000.stl;https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=ili_output/ili_quant.csv")\n' % (param_object["task"][0], param_object["task"][0]))
            html_out.write("</script>\n")
            html_out.close()
        elif len(stl_files) == 0:
            html_out = open(view_ili_html_filename, "w")
            html_out.write("No STL file uploaded, cannot directly link to ili\n")
            html_out.close()
        elif len(stl_files) > 1:
            html_out = open(view_ili_html_filename, "w")
            html_out.write("Too many stl files uploaded\n")
            html_out.close()
    else:
        open(output_ili_filename, "w").write("No Output")
        open(view_ili_html_filename, "w").write("ili output was not selected or no metadata file was provided")
def create_ili_output_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, filename_coordinate_mapping, output_filename):
    """Write an ili CSV: one row per sample file (with X/Y/Z/radius), one column per cluster.

    Only clusters present in the loaded network summary are included; samples without
    a coordinate entry are skipped.
    """
    output_file = open(output_filename, "w")

    test_network = molecular_network_library.MolecularNetwork()
    test_network.load_clustersummary(clusterinfosummary_filename)

    line_counts, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
    param_object = ming_proteosafe_library.parse_xml_file(open(param_filename, "r"))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_object)

    cluster_index_to_file_map = {}
    clusters_map = {}
    all_files = {}
    for i in range(line_counts):
        cluster_number = table_data["#ClusterIdx"][i]
        # Skip clusters that didn't make it into the network summary
        if test_network.get_cluster_index(cluster_number) == None:
            continue
        if not (cluster_number in clusters_map):
            clusters_map[cluster_number] = []
            cluster_index_to_file_map[cluster_number] = {}
            # Seed every input file at 0 so each cluster has a full column set
            for mangled_name in mangled_mapping.keys():
                cluster_index_to_file_map[cluster_number][mangled_name] = 0.0
        #print table_data["#Filename"][i].split("/")[1]
        mangled_filename_only = os.path.basename(table_data["#Filename"][i])
        cluster_index_to_file_map[cluster_number][mangled_filename_only] += float(table_data["#PrecIntensity"][i])
        spectrum_info = {"filename": table_data["#Filename"][i], "intensity": table_data["#PrecIntensity"][i]}
        all_files[table_data["#Filename"][i]] = 1
        clusters_map[cluster_number].append(spectrum_info)

    # Fixed coordinate columns followed by one column per cluster index
    all_headers = ["filename", "X", "Y", "Z", "radius"]
    for cluster_idx in cluster_index_to_file_map:
        all_headers.append(cluster_idx)

    #writing header
    output_file.write(",".join(all_headers) + "\n")

    for sample_name in mangled_mapping:
        # Only spectrum inputs become rows
        if sample_name.find("spec") == -1:
            continue
        real_filename = mangled_mapping[sample_name]
        if not os.path.basename(real_filename) in filename_coordinate_mapping:
            continue
        coordinate_object = filename_coordinate_mapping[os.path.basename(real_filename)]
        line_output = [real_filename]
        line_output.append(coordinate_object["x"])
        line_output.append(coordinate_object["y"])
        line_output.append(coordinate_object["z"])
        line_output.append(coordinate_object["radius"])
        print(line_output, coordinate_object)
        for cluster_idx in cluster_index_to_file_map:
            line_output.append(str(cluster_index_to_file_map[cluster_idx][sample_name]))
        output_file.write(",".join(line_output) + "\n")

    output_file.close()
def main():
    """Build a spectral-library search params file from workflow XML and run the search binary."""
    parser = argparse.ArgumentParser(description='Create parallel parameters')
    parser.add_argument('workflow_parameters', help='proteosafe xml parameters')
    parser.add_argument('input_mgf', help='Input mgf file to network')
    parser.add_argument('library_folder', help='library_folder')
    parser.add_argument('library_matches', help='output matches')
    parser.add_argument('binary_path', help='binary_path')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)

    temp_parameters_file = "temp_parameters" + ".params"
    output_parameter_file = open(temp_parameters_file, "w")

    #Search Criteria
    output_parameter_file.write("SCORE_THRESHOLD=%s\n" % (params_object["SCORE_THRESHOLD"][0]))
    output_parameter_file.write("MIN_MATCHED_PEAKS_SEARCH=%s\n" % (params_object["MIN_MATCHED_PEAKS"][0]))
    output_parameter_file.write("TOP_K_RESULTS=%s\n" % (params_object["TOP_K_RESULTS"][0]))
    output_parameter_file.write("search_peak_tolerance=%s\n" % (params_object["tolerance.Ion_tolerance"][0]))
    output_parameter_file.write("search_parentmass_tolerance=%s\n" % (params_object["tolerance.PM_tolerance"][0]))
    output_parameter_file.write("ANALOG_SEARCH=%s\n" % (params_object["ANALOG_SEARCH"][0]))
    output_parameter_file.write("MAX_SHIFT_MASS=%s\n" % (params_object["MAX_SHIFT_MASS"][0]))

    #Filtering Criteria
    output_parameter_file.write("FILTER_PRECURSOR_WINDOW=%s\n" % (params_object["FILTER_PRECURSOR_WINDOW"][0]))
    output_parameter_file.write("MIN_PEAK_INT=%s\n" % (params_object["MIN_PEAK_INT"][0]))
    output_parameter_file.write("WINDOW_FILTER=%s\n" % (params_object["WINDOW_FILTER"][0]))
    output_parameter_file.write("FILTER_LIBRARY=%s\n" % (params_object["FILTER_LIBRARY"][0]))

    # All library MGFs are passed space-separated on one line
    output_parameter_file.write("EXISTING_LIBRARY_MGF=%s\n" % (" ".join(library_files)))
    output_parameter_file.write("RESULTS_DIR=%s\n" % (args.library_matches))
    output_parameter_file.write("searchspectra=%s\n" % (args.input_mgf))
    output_parameter_file.close()

    cmd = "%s ExecSpectralLibrarySearchMolecular %s -ll 0" % (args.binary_path, temp_parameters_file)
    os.system(cmd)
def create_ili_output_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, filename_coordinate_mapping, output_filename):
    """Write an ili CSV: one row per sample (filename, X, Y, Z, radius), one column per cluster.

    Unlike the network-filtered variant, this version includes every cluster found
    in the cluster info table. Samples without coordinate entries are skipped.
    """
    output_file = open(output_filename, "w")

    line_counts, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
    param_object = ming_proteosafe_library.parse_xml_file(open(param_filename, "r"))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_object)

    cluster_index_to_file_map = {}
    clusters_map = {}
    all_files = {}
    for i in range(line_counts):
        cluster_number = table_data["#ClusterIdx"][i]
        if not (cluster_number in clusters_map):
            clusters_map[cluster_number] = []
            cluster_index_to_file_map[cluster_number] = {}
            # Seed all input files at 0 so every cluster has a complete column set
            for mangled_name in mangled_mapping.keys():
                cluster_index_to_file_map[cluster_number][mangled_name] = 0.0
        #print table_data["#Filename"][i].split("/")[1]
        mangled_filename_only = os.path.basename(table_data["#Filename"][i])
        cluster_index_to_file_map[cluster_number][mangled_filename_only] += float(table_data["#PrecIntensity"][i])
        spectrum_info = {"filename": table_data["#Filename"][i], "intensity": table_data["#PrecIntensity"][i]}
        all_files[table_data["#Filename"][i]] = 1
        clusters_map[cluster_number].append(spectrum_info)

    # Fixed coordinate columns, then one column per cluster index
    all_headers = ["filename", "X", "Y", "Z", "radius"]
    for cluster_idx in cluster_index_to_file_map:
        all_headers.append(cluster_idx)

    #writing header
    output_file.write(",".join(all_headers) + "\n")

    for sample_name in mangled_mapping:
        # Only spectrum inputs ("spec" in the mangled name) become rows
        if sample_name.find("spec") == -1:
            continue
        real_filename = mangled_mapping[sample_name]
        if not os.path.basename(real_filename) in filename_coordinate_mapping:
            continue
        coordinate_object = filename_coordinate_mapping[os.path.basename(real_filename)]
        line_output = [real_filename]
        line_output.append(coordinate_object["x"])
        line_output.append(coordinate_object["y"])
        line_output.append(coordinate_object["z"])
        line_output.append(coordinate_object["radius"])
        print(line_output, coordinate_object)
        for cluster_idx in cluster_index_to_file_map:
            line_output.append(str(cluster_index_to_file_map[cluster_idx][sample_name]))
        output_file.write(",".join(line_output) + "\n")

    output_file.close()
def main():
    """Enrich the cluster info summary with component membership, GNPS linkouts and
    library identifications, then write a per-component summary table."""
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('params_xml', help='params_xml')
    parser.add_argument('input_clusterinfo_summary', help='Input cluster info summary')
    parser.add_argument('input_network_pairs_file', help='network_pairs_file')
    parser.add_argument('input_library_search_file', help='network_pairs_file')
    parser.add_argument('output_clusterinfo_summary', help='output file')
    parser.add_argument('output_component_summary', help='output component file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.params_xml))

    all_clusterinfo_list = ming_fileio_library.parse_table_with_headers_object_list(args.input_clusterinfo_summary)
    library_ids_dict = load_library_id_dict(args.input_library_search_file)
    nodes_to_component, component_to_nodes = load_pairs_dict(args.input_network_pairs_file)

    for cluster in all_clusterinfo_list:
        cluster_index = cluster["cluster index"]

        # Clusters in a component link to the network displayer; singletons get -1
        # and link to the all-clusters view pre-filtered to this cluster index.
        if cluster_index in nodes_to_component:
            cluster["componentindex"] = nodes_to_component[cluster_index]
            cluster["GNPSLinkout_Network"] = "https://gnps.ucsd.edu/ProteoSAFe/result.jsp?view=network_displayer&componentindex=%s&task=%s" % (nodes_to_component[cluster_index], param_obj["task"][0])
        else:
            cluster["componentindex"] = "-1"
            cluster["GNPSLinkout_Network"] = 'https://gnps.ucsd.edu/ProteoSAFe/result.jsp?task=%s&view=view_all_clusters_withID#{"main.cluster index_lowerinput":"%s","main.cluster index_upperinput":"%s"}' % (param_obj["task"][0], cluster_index, cluster_index)

        # Library identification columns, "N/A" when the cluster had no hit
        if cluster_index in library_ids_dict:
            cluster["LibraryID"] = library_ids_dict[cluster_index]["Compound_Name"]
            cluster["MQScore"] = library_ids_dict[cluster_index]["MQScore"]
            cluster["SpectrumID"] = library_ids_dict[cluster_index]["SpectrumID"]
        else:
            cluster["LibraryID"] = "N/A"
            cluster["MQScore"] = "N/A"
            cluster["SpectrumID"] = "N/A"

    ming_fileio_library.write_list_dict_table_data(all_clusterinfo_list, args.output_clusterinfo_summary)

    # Per-component rollup: node count and all library IDs joined with "!"
    output_component_list = []
    for componentindex in component_to_nodes:
        component_row = {}
        component_row["ComponentIndex"] = componentindex
        component_row["NodeCount"] = len(component_to_nodes[componentindex])
        component_row["#Spectra"] = len(component_to_nodes[componentindex])
        all_lib_identifications = []
        for node in component_to_nodes[componentindex]:
            if node in library_ids_dict:
                all_lib_identifications.append(library_ids_dict[node]["Compound_Name"])
        component_row["AllIDs"] = "!".join(all_lib_identifications)
        output_component_list.append(component_row)

    ming_fileio_library.write_list_dict_table_data(output_component_list, args.output_component_summary)
def main():
    """Validate the user-supplied spectrum string and save it as an MGF file.

    FIX: the original passed an open("w") handle into save_to_mgf and never
    closed it; a context manager now guarantees the MGF is flushed and closed.
    """
    paramxml_input_filename = sys.argv[1]
    output_mgf_file = sys.argv[2]

    params_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_input_filename))

    #Validating the spectrum string
    if masst_validator.validate(params_obj["spectrum_string"][0], int(params_obj["MIN_MATCHED_PEAKS"][0])) != 0:
        print("Validation Error on Input")
        exit(1)

    spectrum_collection = get_spectrum_collection_from_param_obj(params_obj)
    with open(output_mgf_file, "w") as output_mgf_handle:
        spectrum_collection.save_to_mgf(output_mgf_handle)
def main():
    """Invoke a downstream ProteoSAFe workflow with parameters mapped from the current job,
    wait for it to finish, and write an HTML page linking to its status."""
    parser = argparse.ArgumentParser(description='Invoking new workflow with parameters of given workflow')
    parser.add_argument('workflowparamters', help='workflowparamters')
    parser.add_argument('credentials', help='credentials.json')
    parser.add_argument('outputhtml', default='output.html', help='output html with a url')
    parser.add_argument('--serverurl', default='proteomics2.ucsd.edu', help='Server URL, default is proteomics2.ucsd.edu, other options are massive.ucsd.edu and gnps.ucsd.edu')
    parser.add_argument('--parametermapping', action='append', help='mapping of current workflow parameters to new parameters in the format: <old parameter>:<new parameter>')
    parser.add_argument('--newparameters', action='append', help='parameter key: <param name>:<parameter value>')
    parser.add_argument('--runparameter', default='NONE', help='Workflow xml parameter to check if this parameter is equal to "1" to actually invoke the workflow')
    args = parser.parse_args()

    credentials = json.loads(open(args.credentials).read())
    workflow_parameters_map = ming_proteosafe_library.parse_xml_file(open(args.workflowparamters))

    # Optional opt-out: a designated workflow parameter set to "0" skips invocation
    if args.runparameter != "NONE":
        if workflow_parameters_map[args.runparameter][0] == "0":
            output_html_file = open(args.outputhtml, "w")
            output_html_file.write("User chose not to run tool\n")
            output_html_file.close()
            exit(0)

    new_parameters = {}
    new_parameters["desc"] = "Analysis subroutine from ProteoSAFe job %s" % (workflow_parameters_map["task"][0])

    # Literal key:value parameters supplied on the command line
    # NOTE(review): split(":")[1] truncates values that themselves contain ":" — confirm intended
    if args.newparameters != None:
        for parameter_string in args.newparameters:
            parameter_key = parameter_string.split(":")[0]
            parameter_value = parameter_string.split(":")[1]
            new_parameters[parameter_key] = parameter_value

    # old:new mappings copied from the current workflow's parameters
    if args.parametermapping != None:
        for parameter_string in args.parametermapping:
            parameter_old_key = parameter_string.split(":")[0]
            parameter_new_key = parameter_string.split(":")[1]
            new_parameters[parameter_new_key] = workflow_parameters_map[parameter_old_key][0]

    task_id = ming_proteosafe_library.invoke_workflow(args.serverurl, new_parameters, credentials["username"], credentials["password"])
    if task_id == None:
        exit(1)
    ming_proteosafe_library.wait_for_workflow_finish(args.serverurl, task_id)

    """Writing HTML output"""
    output_html_file = open(args.outputhtml, "w")
    output_html_file.write("<script>\n")
    output_html_file.write('window.open("https://%s/ProteoSAFe/status.jsp?task=%s", "_blank")\n' % (args.serverurl, task_id))
    output_html_file.write("</script>\n")
    output_html_file.close()
def main():
    """Rewrite raw cluster info rows into an enriched summary, keeping only clusters
    present in the cluster summary and renaming/deriving the output columns."""
    parser = argparse.ArgumentParser(description='Creates enriched cluster info summary')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('input_clustersummary', help='input_clustersummary')
    parser.add_argument('input_clusterinfo', help='input_clusterinfo')
    parser.add_argument('output_clusterinfo', help='output_clusterinfo')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    #Creating acceptable clusters to include in cluster info
    included_clusters = set()
    for row in csv.DictReader(open(args.input_clustersummary), delimiter='\t'):
        included_clusters.add(row["cluster index"])

    with open(args.input_clusterinfo) as input_clusterinfo:
        field_names = [
            "cluster index", "AllFiles", "sum(precursor intensity)", "RTMean",
            "RTStdErr", "parent mass", "ScanNumber", "ProteosafeFilePath",
            "Original_Path"
        ]
        output_clusterinfo_writer = csv.DictWriter(open(args.output_clusterinfo, "w"),
                                                   fieldnames=field_names,
                                                   delimiter='\t')
        output_clusterinfo_writer.writeheader()

        for row in csv.DictReader(input_clusterinfo, delimiter='\t'):
            # Drop clusters that were filtered out of the summary
            if not (row["#ClusterIdx"] in included_clusters):
                continue
            enriched_row = {}
            enriched_row["cluster index"] = row["#ClusterIdx"]
            enriched_row["AllFiles"] = row["#Filename"]
            enriched_row["sum(precursor intensity)"] = row["#PrecIntensity"]
            enriched_row["RTMean"] = row["#RetTime"]
            enriched_row["RTStdErr"] = "0"
            enriched_row["parent mass"] = row["#ParentMass"]
            enriched_row["ScanNumber"] = row["#Scan"]
            enriched_row["ProteosafeFilePath"] = os.path.join("spec", os.path.basename(row["#Filename"]))
            # Demangle back to the user's original upload path, "f."-prefixed
            enriched_row["Original_Path"] = "f." + mangled_mapping[os.path.basename(row["#Filename"])]
            output_clusterinfo_writer.writerow(enriched_row)

    exit(0)
def main():
    """Join KL-divergence metrics onto a PSM table by (filename, scan) and write it out.

    PSMs with no matching KL entry get -1 for all three KL columns.
    """
    paramxml_filename = sys.argv[1]
    psms_input_file = sys.argv[2]
    kl_input_file = sys.argv[3]
    output_psms_file = sys.argv[4]

    parameters_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_filename))

    # Build a (basename:scan) -> KL metrics lookup
    row_count, kl_data = ming_fileio_library.parse_table_with_headers(kl_input_file)
    kl_dict = {}
    for i in range(row_count):
        filename = os.path.basename(kl_data["Filename"][i])
        scan = kl_data["Scan"][i]
        key = filename + ":" + str(scan)
        kl_dict[key] = {
            "kl_strict": (kl_data["KL Strict"][i]),
            "kl_unstrict": (kl_data["KL"][i]),
            "kl_interpeak": (kl_data["Interpeak intensity"][i])
        }

    #Since we don't support more fields in the psm object, we're going to read this file in again as a tsv file and add the columns as necessary
    psm_rows, psm_table_data = ming_fileio_library.parse_table_with_headers(psms_input_file)
    psm_table_data["kl_strict"] = []
    psm_table_data["kl_unstrict"] = []
    psm_table_data["kl_interpeak"] = []
    for i in range(psm_rows):
        key = psm_table_data["filename"][i] + ":" + psm_table_data["scan"][i]
        if key in kl_dict:
            psm_table_data["kl_strict"].append(kl_dict[key]["kl_strict"])
            psm_table_data["kl_unstrict"].append(kl_dict[key]["kl_unstrict"])
            psm_table_data["kl_interpeak"].append(kl_dict[key]["kl_interpeak"])
        else:
            # Sentinel for "no KL data for this spectrum"
            psm_table_data["kl_strict"].append(-1)
            psm_table_data["kl_unstrict"].append(-1)
            psm_table_data["kl_interpeak"].append(-1)

    #Change C to C+57
    #if "cysteine_protease.cysteine" in parameters_obj:
    #    if parameters_obj["cysteine_protease.cysteine"][0] == "c57":
    #        #Lets replace all the cysteines
    #        for i in range(psm_rows):
    #            psm_table_data["sequence"][i] = psm_table_data["sequence"][i].replace("C", "C+57")

    ming_fileio_library.write_dictionary_table_data(psm_table_data, output_psms_file)
def main():
    """Consolidate peptide/PSM results from the tasks listed in the workflow params.

    Writes a "None" sentinel to the summary file when there are no source tasks.
    """
    params_filename = sys.argv[1]
    output_peptide_folder = sys.argv[2]
    output_psm_folder = sys.argv[3]
    output_summary = sys.argv[4]

    params_dict = ming_proteosafe_library.parse_xml_file(open(params_filename))
    source_tasks_text = params_dict["tasks_to_consolidate"][0]

    if source_tasks_text:
        source_tasks_list = json.loads(source_tasks_text)
        grab_all_results(source_tasks_list, output_peptide_folder,
                         output_psm_folder, output_summary)
    else:
        # Nothing to consolidate; leave a sentinel so downstream steps see output.
        open(output_summary, "w").write("None")
def main():
    """Merge per-partition search results and keep the top-K hits per spectrum.

    Results are grouped by (SpectrumFile, #Scan#) and ranked by MQScore
    descending; only the top TOP_K_RESULTS entries per spectrum are written.
    Fix: the TOP_K fallback used a bare ``except:``, which also catches
    SystemExit/KeyboardInterrupt; it now catches only the expected errors.
    """
    input_folder_path = sys.argv[1]
    param_xml_filename = sys.argv[2]
    output_tsv = sys.argv[3]

    files = ming_fileio_library.list_files_in_dir(input_folder_path)
    params_obj = ming_proteosafe_library.parse_xml_file(open(param_xml_filename))

    # Fall back to 1 when the parameter is absent or malformed.
    try:
        top_k = int(params_obj["TOP_K_RESULTS"][0])
    except (KeyError, IndexError, ValueError):
        top_k = 1

    # Flatten every partition's table into a list of per-row dicts.
    merged_results = []
    for input_file in files:
        print("loading", input_file)
        row_count, table_data = ming_fileio_library.parse_table_with_headers(input_file)
        for i in range(row_count):
            merged_results.append({key: table_data[key][i] for key in table_data})

    # Group results by spectrum (file + scan).
    results_per_spectrum = defaultdict(list)
    for result_obj in merged_results:
        spectrum_unique_key = result_obj["SpectrumFile"] + "___" + result_obj["#Scan#"]
        results_per_spectrum[spectrum_unique_key].append(result_obj)

    # Keep the top-K hits per spectrum by MQScore.
    output_results = []
    for spectrum_unique_key in results_per_spectrum:
        sorted_results = sorted(results_per_spectrum[spectrum_unique_key],
                                key=lambda spectrum_obj: float(spectrum_obj["MQScore"]),
                                reverse=True)
        output_results += sorted_results[:top_k]

    # Re-columnize for the tsv writer.
    output_dict = defaultdict(list)
    for result_obj in output_results:
        for key in result_obj:
            output_dict[key].append(result_obj[key])
    ming_fileio_library.write_dictionary_table_data(output_dict, output_tsv)
def main():
    """Create a cluster-by-file bucket table (and BIOM file) when enabled in params.

    Fixes: both bare ``except:`` clauses narrowed (metadata load stays
    best-effort via ``except Exception``; the param lookup catches only
    KeyError/IndexError); removed the dead commented-out sys.argv block and
    redundant local aliases.
    """
    parser = argparse.ArgumentParser(description='Creates bucket table')
    parser.add_argument('input_clusterinfo_file', help='input_clusterinfo_file')
    parser.add_argument('param_filename', help='param_filename')
    parser.add_argument('input_clusterinfosummary', help='input_clusterinfosummary')
    parser.add_argument('output_filename', help='output_filename')
    parser.add_argument('output_biom_filename', help='output_biom_filename')
    parser.add_argument('python_runtime', help='python_runtime')
    parser.add_argument('biom_run_script', help='biom_run_script')
    parser.add_argument('--metadata_folder', help='Metadata folder')
    args = parser.parse_args()

    # Metadata is optional; fall back to an empty mapping on any load failure.
    try:
        metadata_mapping = load_metadata_mapping(args.metadata_folder)
    except Exception:
        metadata_mapping = {}

    param_object = ming_proteosafe_library.parse_xml_file(open(args.param_filename, "r"))

    # Buckets are produced only when CREATE_CLUSTER_BUCKETS is explicitly "1".
    try:
        create_buckets = param_object["CREATE_CLUSTER_BUCKETS"][0] == "1"
    except (KeyError, IndexError):
        create_buckets = False

    if create_buckets:
        create_bucket_from_clusterinfo(args.input_clusterinfo_file,
                                       args.param_filename,
                                       args.input_clusterinfosummary,
                                       args.output_filename,
                                       metadata_mapping)
        create_biom_file(args.output_filename, args.output_biom_filename,
                         args.python_runtime, args.biom_run_script)
    else:
        open(args.output_filename, "w").write("No Output")
        open(args.output_biom_filename, "w").write("No Output")
def main():
    """Match workflow spectra against GNPS reference datasets (one parallel partition).

    Fix: the original wrapped ``exit(0)`` in a bare ``except:``, which caught
    the SystemExit it had just raised and re-ran the disabled-path code.
    Catching only lookup errors restores the intended early exit.
    """
    paramxml_input_filename = sys.argv[1]
    parallel_param_filename = sys.argv[2]
    input_spectra_folder = sys.argv[3]
    library_search_results_filename = sys.argv[4]
    output_matches_filename = sys.argv[5]

    params_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_input_filename))

    # Skip entirely unless MATCH_REFERENCE_DATASETS is explicitly enabled.
    try:
        enabled = params_obj["MATCH_REFERENCE_DATASETS"][0] == "1"
    except (KeyError, IndexError):
        enabled = False
    if not enabled:
        ming_fileio_library.write_dictionary_table_data({"EMPTY": []},
                                                        output_matches_filename)
        exit(0)

    # Identifications keyed for lookup during matching.
    identifications_map = load_identification_file_as_map(library_search_results_filename)

    # Parallel partition info ("paritions" spelling matches the producer side).
    params_map = json.loads(open(parallel_param_filename).read())
    partition_total = params_map["total_paritions"]
    partition_of_node = params_map["node_partition"]
    all_datasets = params_map["all_datasets"]

    all_matches = finding_matches_in_public_data(
        os.path.join(input_spectra_folder, "specs_ms.mgf"),
        all_datasets, identifications_map)

    # Re-columnize the matches for the tsv writer.
    output_map = defaultdict(list)
    for match in all_matches:
        for key in match:
            output_map[key].append(match[key])
    ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
def main():
    """Create `ili-compatible output (quant table + viewer redirect HTML) when enabled.

    Fixes: narrowed the bare ``except:`` on the param lookup to lookup errors;
    HTML output files are now written via ``with`` so handles are closed.
    """
    param_filename = sys.argv[1]
    metadata_folder = sys.argv[2]
    input_clusterinfo_file = sys.argv[3]
    input_clusterinfosummary = sys.argv[4]
    ili_stl_model_folder = sys.argv[5]
    output_ili_filename = sys.argv[6]
    view_ili_html_filename = sys.argv[7]

    param_object = ming_proteosafe_library.parse_xml_file(open(param_filename, "r"))

    # Output is produced only when CREATE_ILI_OUTPUT is explicitly "1".
    try:
        create_output = param_object["CREATE_ILI_OUTPUT"][0] == "1"
    except (KeyError, IndexError):
        create_output = False

    if not create_output:
        open(output_ili_filename, "w").write("No Output")
        open(view_ili_html_filename, "w").write("ili output was not selected or no metadata file was provided")
        return

    ili_stl_model_files_in_folder = ming_fileio_library.list_files_in_dir(ili_stl_model_folder)
    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(metadata_folder)

    # Coordinates must come from exactly one metadata file.
    if len(metadata_files_in_folder) != 1:
        print("Metadata file not provided, cannot create ili compatible output without coordinates")
        exit(1)

    filename_coordinate_mapping = load_filename_to_coordinate_mapping(metadata_files_in_folder[0])
    create_ili_output_from_clusterinfo(input_clusterinfo_file, param_filename,
                                       input_clusterinfosummary,
                                       filename_coordinate_mapping,
                                       output_ili_filename)

    if len(ili_stl_model_files_in_folder) == 1:
        # Redirect straight into the ili viewer, pointing at the STL model
        # and the quant table produced above.
        with open(view_ili_html_filename, "w") as output_ili_html_file:
            output_ili_html_file.write("<script>\n")
            output_ili_html_file.write('window.location.replace("https://ili.embl.de/?https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=ili_stl_model/ili_stl_model-00000.stl;https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=ili_output/ili_quant.csv")\n' % (param_object["task"][0], param_object["task"][0]))
            output_ili_html_file.write("</script>\n")
    elif len(ili_stl_model_files_in_folder) == 0:
        with open(view_ili_html_filename, "w") as output_ili_html_file:
            output_ili_html_file.write("No STL file uploaded, cannot directly link to ili\n")
    else:
        with open(view_ili_html_filename, "w") as output_ili_html_file:
            output_ili_html_file.write("Too many stl files uploaded\n")
def main():
    """Emit a tsv describing the data files of a GNPS job for MASSIVE-COMPLETE.

    BUG FIX: ``spec_on_server_group6`` was previously checked twice, so group-6
    files were added to the peak list twice; each group is now gathered once.
    """
    input_parameters_xml = sys.argv[1]
    output_filename = sys.argv[2]

    param_obj = ming_proteosafe_library.parse_xml_file(open(input_parameters_xml))
    print(param_obj.keys())

    # Gather peak list files across all six spectrum groups, each exactly once.
    peak_list_list = []
    group_keys = ["spec_on_server", "spec_on_server_group2", "spec_on_server_group3",
                  "spec_on_server_group4", "spec_on_server_group5", "spec_on_server_group6"]
    for group_key in group_keys:
        if group_key in param_obj:
            peak_list_list += param_obj[group_key]

    metadata_list = []
    if "metadatafile" in param_obj:
        metadata_list += param_obj["metadatafile"]

    # params_dict mirrors the original code; it is built but only the tsv below
    # is written out here.
    params_dict = {}
    params_dict["desc"] = "GNPS - Data for Analysis For GNPS Job " + param_obj["task"][0]
    params_dict["workflow"] = "MASSIVE-COMPLETE"
    params_dict["peak_list_files"] = ";".join(peak_list_list)
    params_dict["other_files"] = ";".join(metadata_list)

    output_file = open(output_filename, "w")
    output_file.write("filenames\tmetadatanames\ttask\n")
    output_file.write(";".join(peak_list_list))
    output_file.write("\t")
    output_file.write(";".join(metadata_list))
    output_file.write("\t")
    output_file.write(param_obj["task"][0])
    output_file.write("\n")
def main():
    """Compute topology path signatures for the molecular network when enabled.

    Fix: the bare ``except:`` also caught the SystemExit raised by ``exit(0)``
    inside the try body, causing the sentinel files to be written twice before
    exiting; catching only lookup errors restores the intended flow.
    """
    paramxml_input_filename = sys.argv[1]
    pairs_info_filename = sys.argv[2]
    clusterinfo_filename = sys.argv[3]
    output_all_paths_filename = sys.argv[4]
    output_all_paths_histogram_filename = sys.argv[5]

    params_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_input_filename))

    # Run only when CREATE_TOPOLOGY_SIGNATURES is explicitly "1"; otherwise
    # emit sentinel files and exit cleanly.
    try:
        enabled = params_obj["CREATE_TOPOLOGY_SIGNATURES"][0] == "1"
    except (KeyError, IndexError):
        enabled = False
    if not enabled:
        open(output_all_paths_filename, "w").write("NONE")
        open(output_all_paths_histogram_filename, "w").write("NONE")
        exit(0)

    find_features_in_network(clusterinfo_filename, pairs_info_filename,
                             output_all_paths_filename,
                             output_all_paths_histogram_filename)
def main():
    """Match query spectra against public datasets (clustered or raw) when enabled.

    Fix: the bare ``except:`` around the enable-flag check swallowed the
    SystemExit from ``exit(0)``; it now catches only lookup errors. The
    SEARCH_RAW probe likewise catches only lookup errors.
    """
    paramxml_input_filename = sys.argv[1]
    parallel_param_filename = sys.argv[2]
    output_matches_filename = sys.argv[3]
    output_filename_unique_files = sys.argv[4]
    output_filename_all_matches = sys.argv[5]

    params_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_input_filename))

    # Empty table written when matching is disabled or the flag is missing.
    output_map = {"specs_filename": [], "specs_scan": [], "dataset_filename": [],
                  "dataset_scan": [], "score": [], "dataset_id": [],
                  "dataset_title": [], "dataset_description": [],
                  "matchedpeaks": [], "mzerror": []}

    match_parameters = get_parameters(params_obj)

    try:
        enabled = params_obj["FIND_MATCHES_IN_PUBLIC_DATA"][0] == "1"
    except (KeyError, IndexError):
        enabled = False
    if not enabled:
        ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
        exit(0)

    # Parallel partition description produced by the partitioning step
    # ("paritions" spelling matches the producer side).
    params_map = json.loads(open(parallel_param_filename).read())
    partition_total = params_map["total_paritions"]
    partition_of_node = params_map["node_partition"]
    dataset_dict = params_map["dataset_dict"]
    all_datasets = params_map["all_datasets"]

    SEARCH_RAW = False
    try:
        if params_obj["SEARCH_RAW"][0] == "1":
            SEARCH_RAW = True
    except (KeyError, IndexError):
        print("Param Not Found", "SEARCH_RAW")

    """Matchign Clustered Data"""
    spectrum_collection = get_spectrum_collection_from_param_obj(params_obj)
    if SEARCH_RAW:
        match_unclustered(match_parameters, spectrum_collection, dataset_dict,
                          all_datasets, output_matches_filename,
                          output_filename_unique_files, output_filename_all_matches)
    else:
        match_clustered(match_parameters, spectrum_collection, dataset_dict,
                        all_datasets, output_matches_filename,
                        output_filename_unique_files, output_filename_all_matches)
def main():
    """Merge this node's partition of pickled PSM-set files into one .psms output.

    Fixes: removed dead locals (``precursor_string``, ``score``,
    ``current_score``, ``peptide_length`` were computed but never used);
    pickle handles are now closed via ``with``.
    """
    parallel_json = json.loads(open(sys.argv[1]).read())
    params_filename = sys.argv[2]
    input_folder_of_results = sys.argv[3]
    output_folder = sys.argv[4]

    my_node = parallel_json["node_partition"]
    total_node = parallel_json["total_paritions"]

    all_input_files = ming_fileio_library.list_files_in_dir(input_folder_of_results)
    all_input_files.sort()

    ###
    ### TODO We will have to read parameters and see if we need to eliminate some
    ### PSMs, with PSM FDR filter, KL Filter, ambiguity score filter, unique
    ### intensity filter
    ###
    params_obj = ming_proteosafe_library.parse_xml_file(open(params_filename))

    # Stable round-robin slice of the sorted inputs for this node.
    all_input_files = all_input_files[my_node::total_node]

    current_working_psm_set = ming_psm_library.PSMset("Ming")
    for total_file_count, input_file in enumerate(all_input_files, start=1):
        # Each input is a pickled PSM set (variant file); append all of its PSMs.
        print(input_file, total_file_count, "of", len(all_input_files))
        with open(input_file, 'rb') as input_pickle:
            temp_psm_set = pickle.load(input_pickle)
        print("Loaded", len(temp_psm_set.psms))
        current_working_psm_set.psms.extend(temp_psm_set.psms)

    # Saving out psms for this partition.
    output_filename = os.path.join(output_folder, str(my_node) + ".psms")
    current_working_psm_set.write_output(open(output_filename, "w"), True)
def main():
    """Resolve a list of USIs into an MGF file plus a tsv index of the spectra.

    Fixes: ``precursor_mz == None`` replaced with ``is None``; the MGF handle
    is now closed (via ``with``) before the tsv is written, guaranteeing the
    spectra are flushed to disk.
    """
    parser = argparse.ArgumentParser(
        description='Invoking new workflow with parameters of given workflow')
    parser.add_argument('workflowparamters', help='workflowparamters')
    parser.add_argument('output_mgf', help='output_mgf')
    parser.add_argument('output_tsv', help='output_tsv')
    args = parser.parse_args()

    workflow_parameters_map = ming_proteosafe_library.parse_xml_file(
        open(args.workflowparamters))

    usi_list = workflow_parameters_map["usi_string"][0].split("\n")
    # Drop blank/garbage entries; a real USI is longer than 5 characters.
    usi_list = [usi for usi in usi_list if len(usi) > 5]

    output_results_list = []
    with open(args.output_mgf, "w") as output_mgf:
        for i, usi in enumerate(usi_list):
            # Spectrum lookup; skip USIs that could not be resolved.
            precursor_mz, peaks = _get_spectrum(usi)
            if precursor_mz is None:
                continue
            output_mgf.write("BEGIN IONS\n")
            output_mgf.write("TITLE=USI:{}\n".format(usi))
            output_mgf.write("PEPMASS={}\n".format(precursor_mz))
            output_mgf.write("CHARGE=0\n")
            output_mgf.write("SCANS={}\n".format(i + 1))
            for peak in peaks:
                output_mgf.write("{} {}\n".format(peak[0], peak[1]))
            output_mgf.write("END IONS\n")

            output_results_list.append({
                "usi": usi,
                "filename": args.output_mgf,
                "scan": i + 1,
            })

    df = pd.DataFrame(output_results_list)
    df.to_csv(args.output_tsv, sep="\t", index=False)
def main():
    """Filter decoy PSMs: collision-energy, per-file, blacklist and redundancy filters."""
    paramxml_filename = sys.argv[1]
    input_spectrum_filename = sys.argv[2]
    input_spectrum_all = sys.argv[3]
    psms_input_file = sys.argv[4]
    input_collision_energy_folder = sys.argv[5]
    output_psms_file = sys.argv[6]

    parameters_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_filename))
    scan_metadata_maps = load_collision_energy_mapping(input_collision_energy_folder)
    target_filename_list, decoy_filename_list = \
        determine_set_of_target_and_decoy_spectrum_files(parameters_obj)

    input_psm_set = ming_psm_library.PSMset("input psms")
    input_psm_set.load_MSGF_Plus_tsvfile(psms_input_file)

    # Filtering on collision energy.
    print("Size Before Filtering", len(input_psm_set.psms))
    filter_psms_to_acceptable_metadata(input_psm_set, scan_metadata_maps, parameters_obj)
    print("Size After CE Filtering", len(input_psm_set.psms))

    # Restrict to PSMs for the current spectrum file and for the target files.
    current_file_psms = get_psms_to_current_file(input_psm_set, input_spectrum_filename)
    target_file_psms = get_psms_to_target_file(input_psm_set, target_filename_list)
    print(len(current_file_psms), len(target_file_psms))

    if os.path.basename(input_spectrum_filename) in target_filename_list:
        # Target file: no filtering, just save.
        print("Target")
        output_decoys_list = target_file_psms
    else:
        # Decoy file: drop blacklisted peptides, then remove decoys that
        # outscore the top target hit for each precursor.
        blacklisted_decoy_peptides = json.loads(
            parameters_obj["blacklisted_decoy_peptides_json"][0])
        current_file_psms = filtering_out_blacklisted_decoys(
            current_file_psms, blacklisted_decoy_peptides)
        output_decoys_list = filtering_out_high_scoring_decoys(
            current_file_psms,
            target_file_psms,
            os.path.join(input_spectrum_all, target_filename_list[0]),
            input_spectrum_filename)

    output_decoys_list = filtering_redundant_identifications_per_scan(output_decoys_list)
    input_psm_set.psms = output_decoys_list
    input_psm_set.write_output(open(output_psms_file, "w"))
def main():
    """Process ambiguity for an MSGF+ results tsv and write the annotated table."""
    input_file_of_tsv_results = sys.argv[1]
    input_params_xml_filename = sys.argv[2]
    input_library_identifications_filename = sys.argv[3]
    input_cutoff_scores = sys.argv[4]
    output_folder = sys.argv[5]

    # Output keeps the input's basename, placed inside the output folder.
    output_filename = os.path.join(output_folder,
                                   os.path.basename(input_file_of_tsv_results))

    params_object = ming_proteosafe_library.parse_xml_file(open(input_params_xml_filename))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)
    library_scans_to_identification = library_scans_to_identification_info(
        input_library_identifications_filename)
    cutoff_dict = json.loads(open(input_cutoff_scores).read())

    psm_list = ming_psm_library.parse_MSGFPlus_tsvfile(input_file_of_tsv_results)
    output_results_dict = process_ambiguity(psm_list, mangled_mapping,
                                            library_scans_to_identification,
                                            cutoff_dict)
    ming_fileio_library.write_dictionary_table_data(output_results_dict, output_filename)
def name_demangle_filenames(input_file, output_file, path_to_param,
                            old_filename_header, new_filename_header):
    """Rewrite mangled ProteoSAFe filenames in a tsv column to their original names.

    When the old and new headers are the same the column is rewritten in
    place; otherwise a new column is appended with the demangled names.
    """
    row_count, table_data = ming_fileio_library.parse_table_with_headers(input_file)
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        ming_proteosafe_library.parse_xml_file(open(path_to_param)))

    in_place = old_filename_header == new_filename_header
    if not in_place:
        table_data[new_filename_header] = []

    for i in range(row_count):
        unmangled_name = mangled_mapping[table_data[old_filename_header][i]]
        if in_place:
            table_data[new_filename_header][i] = unmangled_name
        else:
            table_data[new_filename_header].append(unmangled_name)

    ming_fileio_library.write_dictionary_table_data(table_data, output_file)
def main():
    """Write one parameter file per parallel partition for the library search."""
    parser = argparse.ArgumentParser(description='Create parallel parameters')
    parser.add_argument('library_folder', help='Input mgf file to network')
    parser.add_argument('workflow_parameters', help='proteosafe xml parameters')
    parser.add_argument('parameters_output_folder', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)

    # (output key, workflow parameter key) pairs copied verbatim into every
    # partition's file, in this exact order: search criteria, then filters.
    copied_params = [
        ("MIN_MATCHED_PEAKS", "MIN_MATCHED_PEAKS"),
        ("TOP_K_RESULTS", "TOP_K_RESULTS"),
        ("search_peak_tolerance", "tolerance.Ion_tolerance"),
        ("search_parentmass_tolerance", "tolerance.PM_tolerance"),
        ("ANALOG_SEARCH", "ANALOG_SEARCH"),
        ("MAX_SHIFT_MASS", "MAX_SHIFT_MASS"),
        ("SEARCH_LIBQUALITY", "SEARCH_LIBQUALITY"),
        ("FILTER_PRECURSOR_WINDOW", "FILTER_PRECURSOR_WINDOW"),
        ("MIN_PEAK_INT", "MIN_PEAK_INT"),
        ("WINDOW_FILTER", "WINDOW_FILTER"),
        ("FILTER_LIBRARY", "FILTER_LIBRARY"),
    ]

    for i in range(args.parallelism):
        partition_path = os.path.join(args.parameters_output_folder, str(i) + ".params")
        with open(partition_path, "w") as output_parameter_file:
            for out_key, src_key in copied_params:
                output_parameter_file.write("%s=%s\n" % (out_key, params_object[src_key][0]))
            # Partition bookkeeping.
            output_parameter_file.write("NODEIDX=%d\n" % (i))
            output_parameter_file.write("NODECOUNT=%d\n" % (args.parallelism))
            # For GC workflows.
            output_parameter_file.write("FORCE_EXACT_MATCH=%s\n" % (params_object["FORCE_EXACT_MATCH"][0]))
            # Library files, space separated.
            output_parameter_file.write("EXISTING_LIBRARY_MGF=%s\n" % (" ".join(library_files)))
def main():
    """Filter cluster info rows to clusters in the cluster summary and remap columns."""
    parser = argparse.ArgumentParser(description='Creates enriched cluster info summary')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('input_clustersummary', help='input_clustersummary')
    parser.add_argument('input_clusterinfo', help='input_clusterinfo')
    parser.add_argument('output_clusterinfo', help='output_clusterinfo')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    # Clusters surviving in the summary are the only ones propagated.
    summary_reader = csv.DictReader(open(args.input_clustersummary), delimiter='\t')
    included_clusters = {row["cluster index"] for row in summary_reader}

    field_names = ["cluster index", "AllFiles", "sum(precursor intensity)",
                   "RTMean", "RTStdErr", "parent mass", "ScanNumber",
                   "ProteosafeFilePath", "Original_Path"]

    with open(args.input_clusterinfo) as input_clusterinfo:
        writer = csv.DictWriter(open(args.output_clusterinfo, "w"),
                                fieldnames=field_names, delimiter='\t')
        writer.writeheader()
        for row in csv.DictReader(input_clusterinfo, delimiter='\t'):
            if row["#ClusterIdx"] not in included_clusters:
                continue
            source_basename = os.path.basename(row["#Filename"])
            output_dict = {}
            output_dict["cluster index"] = row["#ClusterIdx"]
            output_dict["AllFiles"] = row["#Filename"]
            output_dict["sum(precursor intensity)"] = row["#PrecIntensity"]
            output_dict["RTMean"] = row["#RetTime"]
            # RTStdErr is not available in the raw clusterinfo; emit "0".
            output_dict["RTStdErr"] = "0"
            output_dict["parent mass"] = row["#ParentMass"]
            output_dict["ScanNumber"] = row["#Scan"]
            output_dict["ProteosafeFilePath"] = os.path.join("spec", source_basename)
            output_dict["Original_Path"] = "f." + mangled_mapping[source_basename]
            writer.writerow(output_dict)
    exit(0)
def main():
    """Compute topology path signatures for the molecular network when enabled.

    Fix: the bare ``except:`` caught the SystemExit raised by the ``exit(0)``
    inside the try body, re-writing the sentinel files before exiting;
    catching only lookup errors restores the intended control flow.
    """
    paramxml_input_filename = sys.argv[1]
    pairs_info_filename = sys.argv[2]
    clusterinfo_filename = sys.argv[3]
    output_all_paths_filename = sys.argv[4]
    output_all_paths_histogram_filename = sys.argv[5]

    params_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_input_filename))

    # Run only when CREATE_TOPOLOGY_SIGNATURES is explicitly "1"; otherwise
    # write sentinel files and exit cleanly.
    try:
        enabled = params_obj["CREATE_TOPOLOGY_SIGNATURES"][0] == "1"
    except (KeyError, IndexError):
        enabled = False
    if not enabled:
        open(output_all_paths_filename, "w").write("NONE")
        open(output_all_paths_histogram_filename, "w").write("NONE")
        exit(0)

    find_features_in_network(clusterinfo_filename, pairs_info_filename,
                             output_all_paths_filename,
                             output_all_paths_histogram_filename)
def name_demangle_filenames_and_instrument_collision(input_file, output_file,
                                                     path_to_param,
                                                     path_to_original_results,
                                                     old_filename_header,
                                                     new_filename_header):
    """Demangle filenames in a tsv and ensure a FragMethod column exists.

    When the table lacks a "FragMethod" column it is reconstructed from the
    original results via a "<filename>_<scan>" -> collision-method mapping;
    rows with no mapping entry get "NO_COLLISION". Filename demangling then
    behaves like name_demangle_filenames (in place, or as a new column).
    """
    row_count, table_data = ming_fileio_library.parse_table_with_headers(
        input_file, skip_incomplete_lines=True)
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        ming_proteosafe_library.parse_xml_file(open(path_to_param)))

    if "FragMethod" not in table_data:
        print("Demangling", path_to_original_results, input_file)
        collision_mapping = get_scan_mapping_for_collision_method(path_to_original_results)
        # Adding collision column.
        table_data["FragMethod"] = []
        print(len(table_data["filename"]), len(table_data["scan"]))
        for i in range(row_count):
            key = table_data["filename"][i] + "_" + table_data["scan"][i]
            table_data["FragMethod"].append(collision_mapping.get(key, "NO_COLLISION"))

    if old_filename_header == new_filename_header:
        # Rewrite the column in place.
        for i in range(row_count):
            table_data[new_filename_header][i] = \
                mangled_mapping[table_data[old_filename_header][i]]
    else:
        # Append a fresh column with the demangled names.
        table_data[new_filename_header] = [
            mangled_mapping[table_data[old_filename_header][i]]
            for i in range(row_count)
        ]

    ming_fileio_library.write_dictionary_table_data(table_data, output_file)
def main():
    """Partition GNPS reference datasets into per-node JSON parameter files.

    Fixes: bare ``except:`` clauses narrowed — the enable-flag probe catches
    only lookup errors, and the dataset listing stays best-effort via
    ``except Exception`` (network/service failures yield empty partitions).
    """
    paramxml_input_filename = sys.argv[1]
    output_json_folder = sys.argv[2]
    parallelism = int(sys.argv[3])

    params_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_input_filename))

    # Only spread across nodes when reference-dataset matching is enabled.
    try:
        if params_obj["MATCH_REFERENCE_DATASETS"][0] != "1":
            parallelism = 1
    except (KeyError, IndexError):
        parallelism = 1

    # Best effort: if the dataset listing fails, emit empty partitions.
    all_datasets = []
    try:
        temp_datasets = ming_proteosafe_library.get_all_datasets()
        # Filtering datasets to reference datasets.
        for dataset in temp_datasets:
            if dataset["title"].find("GNPS_ref_") != -1:
                all_datasets.append(dataset)
    except Exception:
        all_datasets = []

    for i in range(parallelism):
        # NOTE: "paritions" spelling is kept — consumers read this exact key.
        output_map = {"node_partition": i, "total_paritions": parallelism}
        partitioned_datasets = all_datasets[i::parallelism]
        output_map["all_datasets"] = partitioned_datasets
        output_map["dataset_dict"] = {dataset["dataset"]: dataset
                                      for dataset in partitioned_datasets}
        output_filename = os.path.join(output_json_folder, str(i) + ".json")
        open(output_filename, "w").write(json.dumps(output_map))
def main():
    """Create per-partition parameter files for networking over an MGF's scan ranges.

    Fix: the parameter file handle was never closed; ``with`` now guarantees
    each partition file is flushed and closed.
    """
    parser = argparse.ArgumentParser(description='Create parallel parameters')
    parser.add_argument('mgf_filename', help='Input mgf file to network')
    parser.add_argument('workflow_parameters', help='proteosafe xml parameters')
    parser.add_argument('parameters_output_folder', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))

    # Determining number of spectra in mgf file; don't split more ways than
    # there are spectra.
    number_of_spectra = number_scans_in_mgf_file(args.mgf_filename)
    parallelism = args.parallelism
    if parallelism > number_of_spectra:
        parallelism = 1

    number_per_partition = int(number_of_spectra / parallelism)

    for i in range(parallelism):
        partition_path = os.path.join(args.parameters_output_folder, str(i) + ".params")
        with open(partition_path, "w") as output_parameter_file:
            output_parameter_file.write("ALIGNS_FORMAT=%s\n" % ("tsv"))
            output_parameter_file.write("MIN_MATCHED_PEAKS=%s\n" % (params_object["MIN_MATCHED_PEAKS"][0]))
            output_parameter_file.write("TOLERANCE_PEAK=%s\n" % (params_object["tolerance.Ion_tolerance"][0]))
            output_parameter_file.write("TOLERANCE_PM=%s\n" % (params_object["tolerance.PM_tolerance"][0]))
            output_parameter_file.write("PAIRS_MIN_COSINE=%s\n" % (params_object["PAIRS_MIN_COSINE"][0]))
            output_parameter_file.write("MAX_SHIFT=%s\n" % (params_object["MAX_SHIFT"][0]))
            output_parameter_file.write("INPUT_SPECTRA_MS2=%s\n" % (args.mgf_filename))

            # Each node gets an inclusive index range; the last node absorbs
            # the remainder.
            start_idx = number_per_partition * i
            end_idx = number_per_partition * (i + 1) - 1
            if i == parallelism - 1:
                end_idx = number_of_spectra
            output_parameter_file.write("IDX_START=%d\n" % (start_idx))
            output_parameter_file.write("IDX_END=%d\n" % (end_idx))
def main():
    """Build group-mapping and attribute-mapping files.

    Priority order: default G1..G6 groupings from mangled-filename prefixes
    are always written; then either a single metadata file (ATTRIBUTE_*
    columns) or legacy group/attribute mapping files drive the remaining
    output.
    """
    parser = argparse.ArgumentParser(description='Group Mapping from input, defaults and metadata file')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('groupmapping_folder', help='groupmapping_folder')
    parser.add_argument('attributemapping_folder', help='attributemapping_folder')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_groupmapping_file', help='output_groupmapping_file')
    parser.add_argument('output_attributemapping_file', help='output_attributemapping_file')
    parser.add_argument('inputspectrafolder', help='inputspectrafolder')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.proteosafe_parameters))
    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_obj)
    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_obj)
    file_path_prefix = args.inputspectrafolder

    output_group_file = open(args.output_groupmapping_file, "w")
    output_attribute_file = open(args.output_attributemapping_file, "w")

    # --- Default G1..G6 groupings from mangled-filename prefixes ---
    marker_to_group = [("spec-", "G1"), ("spectwo-", "G2"), ("specthree-", "G3"),
                       ("specfour-", "G4"), ("specfive-", "G5"), ("specsix-", "G6")]
    default_groupings = {'G1': [], 'G2': [], 'G3': [], 'G4': [], 'G5': [], 'G6': []}
    for mangled_name in mangled_file_mapping.keys():
        for marker, group_key in marker_to_group:
            if mangled_name.find(marker) != -1:
                default_groupings[group_key].append(mangled_name.rstrip())

    for default_group_key in default_groupings.keys():
        members = default_groupings[default_group_key]
        joined = ";".join(os.path.join(file_path_prefix, m) for m in members)
        output_group_file.write("GROUP_" + default_group_key + "=" + joined + "\n")

    # --- Decide between metadata-driven grouping and legacy mapping files ---
    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)
    groupmapping_files_in_folder = ming_fileio_library.list_files_in_dir(args.groupmapping_folder)
    attributemapping_files_in_folder = ming_fileio_library.list_files_in_dir(args.attributemapping_folder)

    if len(metadata_files_in_folder) > 1:
        print("Too many metafile inputted")
        exit(1)

    if len(metadata_files_in_folder) == 1:
        # Metadata-driven: every ATTRIBUTE_* column defines an attribute, and
        # each non-empty cell value becomes a group containing that row's file.
        row_count, table_data = ming_fileio_library.parse_table_with_headers(metadata_files_in_folder[0])
        if not "filename" in table_data:
            print("Missing 'filename' header in metadata file. Please specify the file name that goes along with each piece of metadata with the header: filename")
            exit(1)

        attributes_to_groups_mapping = defaultdict(set)
        group_to_files_mapping = defaultdict(list)
        for i in range(row_count):
            basename_filename = os.path.basename(table_data["filename"][i]).rstrip()
            if basename_filename not in reverse_file_mangling:
                # Filename is not part of the sample set.
                continue
            mangled_name = reverse_file_mangling[basename_filename]
            for key in table_data:
                if key.find("ATTRIBUTE_") == -1:
                    continue
                group_name = table_data[key][i]
                if len(group_name) < 1:
                    continue
                group_to_files_mapping[group_name].append(os.path.join(file_path_prefix, mangled_name))
                attributes_to_groups_mapping[key.replace("ATTRIBUTE_", "")].add(group_name)

        for group_name in group_to_files_mapping:
            group_string = "GROUP_" + group_name + "=" + ";".join(group_to_files_mapping[group_name])
            output_group_file.write(group_string + "\n")
        for attribute_name in attributes_to_groups_mapping:
            attribute_string = attribute_name + "=" + ";".join(list(attributes_to_groups_mapping[attribute_name]))
            output_attribute_file.write(attribute_string + "\n")
        exit(0)

    # --- Falling back on old group/attribute mapping files ---
    if len(groupmapping_files_in_folder) > 1 or len(attributemapping_files_in_folder) > 1:
        print("Too many group/attribute mappings inputted")
        exit(1)

    if len(groupmapping_files_in_folder) == 1:
        for line in open(groupmapping_files_in_folder[0], errors='ignore'):
            splits = line.rstrip().split("=")
            if len(splits) < 2:
                continue
            group_name = splits[0]
            group_files = []
            for filename in splits[1].split(";"):
                if os.path.basename(filename) in reverse_file_mangling:
                    mangled_name = reverse_file_mangling[os.path.basename(filename)]
                    group_files.append(os.path.join(file_path_prefix, mangled_name))
            output_group_file.write(group_name + "=" + ";".join(group_files) + "\n")

    if len(attributemapping_files_in_folder) == 1:
        for line in open(attributemapping_files_in_folder[0]):
            output_attribute_file.write(line)
def main():
    """Convert a feature-based molecular networking consensus quantification
    table (CSV) plus its clustered MS2 MGF into a per-cluster info summary
    table, including per-group abundance columns derived from the metadata.
    """
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('params_xml', help='params_xml')
    parser.add_argument('consensus_feature_file', help='Consensus Quantification File')
    parser.add_argument('metadata_folder', help='metadata metadata_folder')
    parser.add_argument('mgf_filename', help='mgf_filename')
    parser.add_argument('output_clusterinfo_summary', help='output file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.params_xml))
    task_id = param_obj["task"][0]

    group_to_files_mapping = defaultdict(list)
    attributes_to_groups_mapping = defaultdict(set)

    # Exactly one metadata file defines group membership; otherwise both maps stay empty.
    metadata_files = glob.glob(os.path.join(args.metadata_folder, "*"))
    if len(metadata_files) == 1:
        group_to_files_mapping, attributes_to_groups_mapping = load_group_attribute_mappings(metadata_files[0])

    ROW_NORMALIZATION = "None"
    try:
        ROW_NORMALIZATION = param_obj["QUANT_FILE_NORM"][0]
    except:
        ROW_NORMALIZATION = "None"

    GROUP_COUNT_AGGREGATE_METHOD = "Sum"
    try:
        GROUP_COUNT_AGGREGATE_METHOD = param_obj["GROUP_COUNT_AGGREGATE_METHOD"][0]
    except:
        GROUP_COUNT_AGGREGATE_METHOD = "None"

    quantification_list = ming_fileio_library.parse_table_with_headers_object_list(args.consensus_feature_file, delimiter=",")
    input_filenames, input_filename_headers = determine_input_files(quantification_list[0].keys())

    ### Filling in Quantification table if it is missing values
    for quantification_object in quantification_list:
        ###Handling empty quantification
        for filename in input_filename_headers:
            try:
                if len(quantification_object[filename]) == 0:
                    #print(filename, quantification_object[filename], quantification_object["row ID"])
                    quantification_object[filename] = 0
            except:
                x = 1  # len() raised, so the value is presumably already numeric; nothing to fill

    print("Number of Features", len(quantification_list))

    #Doing row sum normalization
    if ROW_NORMALIZATION == "RowSum":
        print("ROW SUM NORM")
        for filename_header in input_filename_headers:
            # Scale each file's column so it sums to 1 across all features.
            file_quants = [float(quantification_object[filename_header]) for quantification_object in quantification_list]
            for quantification_object in quantification_list:
                quantification_object[filename_header] = float(quantification_object[filename_header]) / sum(file_quants)

    """Loading MS2 Spectra"""
    mgf_collection = ming_spectrum_library.SpectrumCollection(args.mgf_filename)
    mgf_collection.load_from_file()

    clusters_list = []
    for quantification_object in quantification_list:
        cluster_obj = {}
        cluster_obj["cluster index"] = quantification_object["row ID"]
        cluster_obj["precursor mass"] = "{0:.4f}".format(float(quantification_object["row m/z"]))
        cluster_obj["RTConsensus"] = "{0:.4f}".format(float(quantification_object["row retention time"]))

        all_charges = []  # NOTE(review): never populated or read below

        """Checking about the charge of this cluster"""
        try:
            spectrum_object = mgf_collection.scandict[int(cluster_obj["cluster index"])]
            charge = int(spectrum_object.charge)
        except:
            # Missing scan or unparseable charge: treat as unknown (0).
            charge = 0

        """Checking if this spectrum has no peaks"""
        # try:
        #     spectrum_object = mgf_collection.scandict[int(cluster_obj["cluster index"])]
        #
        # except:
        #     continue

        all_files = [os.path.basename(filename) for filename in input_filename_headers if float(quantification_object[filename]) > 0]
        abundance_per_file = [(os.path.basename(filename), float(quantification_object[filename])) for filename in input_filename_headers]
        all_abundances = [float(quantification_object[filename]) for filename in input_filename_headers]

        # Singly-protonated parent mass from m/z and charge (m/z * z - (z-1) protons).
        if charge != 0:
            cluster_obj["parent mass"] = "{0:.4f}".format(float(quantification_object["row m/z"]) * charge - charge + 1)
        else:
            cluster_obj["parent mass"] = "{0:.4f}".format(float(quantification_object["row m/z"]))
        cluster_obj["precursor charge"] = charge

        try:
            # NOTE(review): all_retention_times is never defined in this function,
            # so this always raises NameError and the except fallback below is
            # the de-facto behavior — confirm before "fixing".
            cluster_obj["RTMean"] = statistics.mean(all_retention_times)
            cluster_obj["RTStdErr"] = statistics.stdev(all_retention_times)
        except:
            cluster_obj["RTMean"] = cluster_obj["RTConsensus"]
            cluster_obj["RTStdErr"] = 0

        cluster_obj["GNPSLinkout_Cluster"] = 'https://gnps.ucsd.edu/ProteoSAFe/result.jsp?task=%s&view=view_all_clusters_withID#{"main.cluster index_lowerinput":"%s","main.cluster index_upperinput":"%s"}' % (task_id, quantification_object["row ID"], quantification_object["row ID"])
        #cluster_obj["AllFiles"] = "###".join(all_files)
        cluster_obj["sum(precursor intensity)"] = sum(all_abundances)
        cluster_obj["SumPeakIntensity"] = sum(all_abundances)
        cluster_obj["number of spectra"] = len(all_files)
        cluster_obj["UniqueFileSourcesCount"] = len(all_files)

        group_abundances = determine_group_abundances(group_to_files_mapping, abundance_per_file, operation=GROUP_COUNT_AGGREGATE_METHOD)

        # Custom groups get a GNPSGROUP: prefix; the six default groups keep bare names.
        default_groups = ["G1", "G2", "G3", "G4", "G5", "G6"]
        for group in group_to_files_mapping:
            group_header = "GNPSGROUP:" + group
            if group in default_groups:
                continue
            cluster_obj[group_header] = group_abundances[group]

        for group in default_groups:
            cluster_obj[group] = group_abundances[group]

        #Writing attributes
        for attribute in attributes_to_groups_mapping:
            groups_to_include = []
            for group in attributes_to_groups_mapping[attribute]:
                if group_abundances[group] > 0.0:
                    groups_to_include.append(group)
            if len(groups_to_include) == 0:
                cluster_obj[attribute] = ""
            else:
                cluster_obj[attribute] = ",".join(groups_to_include)

        """ Enriching the cluster info with adduct collapsing information """
        enrich_adduct_annotations(cluster_obj, quantification_object)

        clusters_list.append(cluster_obj)

    ming_fileio_library.write_list_dict_table_data(clusters_list, args.output_clusterinfo_summary)
import sys
import os
import ming_proteosafe_library

# Build a human-readable methods paragraph describing the molecular
# networking parameters, suitable for pasting into a manuscript.
param_obj = ming_proteosafe_library.parse_xml_file(open(sys.argv[1]))
output_filename = sys.argv[2]

sentences = ["<strong>Network Description</strong><br><br>\n\n"]
sentences.append("A molecular network was created with the feature based molecular networking workflow (https://ccms-ucsd.github.io/GNPSDocumentation/featurebasedmolecularnetworking/) on the GNPS website (http://gnps.ucsd.edu).")

# Optional preprocessing steps, mentioned only when enabled.
if param_obj["FILTER_PRECURSOR_WINDOW"][0] == "1":
    sentences.append("The data was filtered by removing all MS/MS fragment ions within +/- 17 Da of the precursor m/z.")
if param_obj["WINDOW_FILTER"][0] == "1":
    sentences.append("MS/MS spectra were window filtered by choosing only the top 6 fragment ions in the +/- 50Da window throughout the spectrum.")

# Core networking parameters.
sentences.append("The precursor ion mass tolerance was set to %s Da and a MS/MS fragment ion tolerance of %s Da." % (param_obj["tolerance.PM_tolerance"][0], param_obj["tolerance.Ion_tolerance"][0]))
sentences.append("A network was then created where edges were filtered to have a cosine score above %s and more than %s matched peaks." % (param_obj["PAIRS_MIN_COSINE"][0], param_obj["MIN_MATCHED_PEAKS"][0]))
sentences.append("Further, edges between two nodes were kept in the network if and only if each of the nodes appeared in each other's respective top %s most similar nodes." % (param_obj["TOPK"][0]))
sentences.append("Finally, the maximum size of a molecular family was set to %s, and the lowest scoring edges were removed from molecular families until the molecular family size was below this threshold." % (param_obj["MAXIMUM_COMPONENT_SIZE"][0]))

# Library search description.
sentences.append("The spectra in the network were then searched against GNPS' spectral libraries.")
if param_obj["FILTER_LIBRARY"][0] == "1":
    sentences.append("The library spectra were filtered in the same manner as the input data.")
sentences.append("All matches kept between network spectra and library spectra were required to have a score above %s and at least %s matched peaks." % (param_obj["SCORE_THRESHOLD"][0], param_obj["MIN_MATCHED_PEAKS_SEARCH"][0]))

# Citation block.
sentences.append("<br><br>\n<strong>Citation</strong><br><br>\n")
sentences.append('Wang, Mingxun, et al. "Sharing and community curation of mass spectrometry data with Global Natural Products Social Molecular Networking." Nature Biotechnology 34.8 (2016): 828-837. PMID: 27504778, https://www.nature.com/articles/nbt.3597')

with open(output_filename, "w") as description_file:
    description_file.write(" ".join(sentences))
def main():
    """Run a spectral library search over this node's partition of the input
    spectra, fanning chunks of 5 files out to parallel workers, then merge
    the per-chunk result tables into one uniquely-named TSV.
    """
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('json_parameters', help='proteosafe xml parameters')
    parser.add_argument('workflow_parameters', help='output folder for parameters')
    parser.add_argument('library_folder', help='output folder for parameters')
    parser.add_argument('result_folder', help='output folder for parameters')
    parser.add_argument('convert_binary', help='output folder for parameters')
    parser.add_argument('librarysearch_binary', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    parallel_json = json.loads(open(args.json_parameters).read())

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)
    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)
    # Sort so every node slices the same deterministic ordering.
    spectra_files.sort()
    print(spectra_files)
    # Keep only this node's stride of the file list.
    # NOTE: "total_paritions" is misspelled by the upstream producer; keep the key as-is.
    spectra_files = spectra_files[parallel_json["node_partition"]::parallel_json["total_paritions"]]
    print(spectra_files)

    temp_folder = "temp"
    tempresults_folder = "tempresults"
    # Fix: the original wrapped os.mkdir in a bare try/except that printed
    # "folder error" and swallowed every failure (including permissions).
    # exist_ok=True keeps only the intended "already exists is fine" behavior.
    os.makedirs(temp_folder, exist_ok=True)
    os.makedirs(tempresults_folder, exist_ok=True)

    # One work item per chunk of 5 spectrum files.
    parameter_list = []
    for spectrum_files_chunk in chunks(spectra_files, 5):
        param_dict = {}
        param_dict["spectra_files"] = spectrum_files_chunk
        param_dict["temp_folder"] = temp_folder
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        param_dict["params_object"] = params_object
        param_dict["library_files"] = library_files
        parameter_list.append(param_dict)

    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(search_wrapper, parameter_list, 5)

    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        full_result_list += ming_fileio_library.parse_table_with_headers_object_list(input_file)

    # Map each mangled spectrum filename back to its original CCMS path.
    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["SpectrumFile"])
        result_object["full_CCMS_path"] = mangled_mapping[mangled_name]

    # Unique output name so concurrent partitions never collide.
    ming_fileio_library.write_list_dict_table_data(full_result_list, os.path.join(args.result_folder, str(uuid.uuid4()) + ".tsv"))
def main():
    """Enrich a cluster info summary table with per-group spectrum counts,
    attribute membership columns, per-cluster file/RT statistics, network
    component IDs, and library identifications, then write the result.
    """
    parser = argparse.ArgumentParser(description='Creates enriched cluster info summary')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('input_clusterinfo_file', help='input_clusterinfo_file')
    parser.add_argument('input_clusterinfosummary_file', help='input_clusterinfosummary_file')
    parser.add_argument('input_group_mapping_filename', help='input_group_mapping_filename')
    parser.add_argument('input_attribute_mapping_filename', help='input_attribute_mapping_filename')
    parser.add_argument('input_networking_pairs', help='input_networking_pairs')
    parser.add_argument('input_library_search', help='input_library_search')
    parser.add_argument('output_clusterinfosummary_filename', help='output_clusterinfosummary_filename')
    args = parser.parse_args()

    """Loading group filenames"""
    group_to_files, files_to_groups = load_group_mapping(args.input_group_mapping_filename)
    print("Loaded Group Mapping")
    cluster_summary_list = ming_fileio_library.parse_table_with_headers_object_list(args.input_clusterinfosummary_file)
    print("Loaded Cluster Summary")
    attribute_to_groups = load_attribute_mapping(args.input_attribute_mapping_filename)
    params_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    CLUSTER_MIN_SIZE = int(params_object["CLUSTER_MIN_SIZE"][0])
    RUN_MSCLUSTER = params_object["RUN_MSCLUSTER"][0]

    #Calculating the spectrum counts per group
    cluster_to_group_counts = defaultdict(lambda: defaultdict(lambda: 0))
    cluster_to_files = defaultdict(set)
    cluster_to_RT = defaultdict(list)
    line_count = 0
    # Stream the (potentially very large) clusterinfo TSV line by line.
    # Expected columns: 0 = cluster index, 1 = source filename, 6 = retention time.
    for line in open(args.input_clusterinfo_file):
        line_count += 1
        if line_count == 1:
            continue  # skip header row
        if line_count % 10000 == 0:
            print(line_count)  # progress heartbeat
        splits = line.rstrip().split("\t")
        cluster_index = splits[0]
        filename = os.path.basename(splits[1])
        rt = float(splits[6])
        group_membership = files_to_groups[filename]
        cluster_to_files[cluster_index].add(filename)
        cluster_to_RT[cluster_index].append(rt)
        for group in group_membership:
            cluster_to_group_counts[cluster_index][group] += 1

    # Minimum-size filtering only applies when MSCluster produced the clusters.
    if RUN_MSCLUSTER == "on":
        cluster_summary_list = filter_clusters_based_on_cluster_size(cluster_summary_list, CLUSTER_MIN_SIZE)

    print(len(cluster_summary_list))
    print("Setting up grouping", len(group_to_files.keys()))

    for cluster_summary_object in cluster_summary_list:
        cluster_index = cluster_summary_object["cluster index"]
        # One column per group: number of this cluster's spectra from that group.
        for group in group_to_files:
            group_count = 0
            if group in cluster_to_group_counts[cluster_index]:
                group_count = cluster_to_group_counts[cluster_index][group]
            cluster_summary_object[group] = group_count
        # One column per attribute: comma-joined groups with nonzero counts.
        for attribute in attribute_to_groups:
            groups_to_include = []
            for group in attribute_to_groups[attribute]:
                if group in cluster_summary_object:
                    if cluster_summary_object[group] > 0:
                        groups_to_include.append(group)
            cluster_summary_object[attribute] = ",".join(groups_to_include).replace("GNPSGROUP:", "")

    print("Default Attributes")
    calculate_default_attributes(cluster_summary_list, group_to_files.keys())
    print("calculate_cluster_file_stats")
    calculate_cluster_file_stats(cluster_summary_list, cluster_to_files, mangled_mapping)
    print("rt stats")
    calculate_rt_stats(cluster_summary_list, cluster_to_RT)
    print("calculate_ancillary_information")
    calculate_ancillary_information(cluster_summary_list, params_object["task"][0])
    print("populate_network_component")
    populate_network_component(cluster_summary_list, args.input_networking_pairs)
    print("populate_network_identifications")
    populate_network_identifications(cluster_summary_list, args.input_library_search)

    ming_fileio_library.write_list_dict_table_data(cluster_summary_list, args.output_clusterinfosummary_filename)
def main():
    """Run the Kovats retention-index step.

    Reads workflow parameters, optionally derives a carbon-marker supporting
    file by polynomial fitting, then builds the non-filtered/filtered Kovats
    result files. Placeholder result files are written when the user opted
    out or when fitting is impossible, so downstream steps always find
    their expected outputs.
    """
    parser = argparse.ArgumentParser(description='Running sirius wrapper')
    parser.add_argument('libFiles', help='input')
    parser.add_argument('input', help='input')
    parser.add_argument('input_filtered', help='input_filtered')
    parser.add_argument('workflow_parameters', help='workflow_parameters')
    parser.add_argument('carbonMarker', help='Carbon_Marker_File')
    parser.add_argument('result_nonfiltered', help='Kovats_Result_Nonfiltered')
    parser.add_argument('result_filtered', help='Kovats_Result_Nonfiltered')
    args = parser.parse_args()

    lib = args.libFiles
    param = args.workflow_parameters
    input_file = args.input  # renamed local: avoid shadowing builtin input()
    input_filtered = args.input_filtered
    carbonMarker = args.carbonMarker
    result_nonfiltered = args.result_nonfiltered
    result_filtered = args.result_filtered

    def _write_placeholder(message):
        # Bug fix: the original wrote placeholders to an undefined name
        # `result` (NameError on every placeholder path). Write the sentinel
        # to both expected output files so downstream steps find them.
        for out_path in (result_nonfiltered, result_filtered):
            with open(out_path, 'w') as placeholder:
                placeholder.write(message)

    # parse params
    params_obj = ming_proteosafe_library.parse_xml_file(open(param))
    try:
        cosineScore = float(params_obj["Kovats_Filter_Cosine_Threshold"][0])
    except (KeyError, IndexError, ValueError, TypeError):
        cosineScore = 0.9
    try:
        errorFilter = float(params_obj["Error_Filter_Threshold"][0]) / 100
    except (KeyError, IndexError, ValueError, TypeError):
        errorFilter = 0.1

    # Bug fix: `optin` was unbound when the "runKovats" key existed with a
    # value other than "on" (the try body assigned nothing and nothing
    # raised), causing a NameError at `if not optin` below.
    optin = False
    try:
        optin = params_obj["runKovats"][0] == "on"
    except (KeyError, IndexError):
        optin = False

    # Minimum number of data points required for polynomial fitting
    # (fixed at 10 for now; previously a commented-out workflow parameter).
    minimunFeature = 10

    if not optin:
        _write_placeholder('Kovats Calculation Opt Out')
        return

    if carbonMarker == '':
        # No carbon-marker CSV supplied: derive supporting parameters by
        # polynomial fitting against the filtered input.
        supporting_file = polyFitting.getParams(input_filtered, cosineScore, 1.5, lib, minimunFeature)
        if supporting_file is None:
            _write_placeholder('Not enough data for polynomial fitting')
            return
        mode = 'p'
    else:
        # Marker file provided by the user.
        supporting_file = carbonMarker
        mode = 'm'

    mapping.csv_builder(input_file, mode, supporting_file, cosineScore, errorFilter, result_nonfiltered, result_filtered, lib)
def load_parameters_file(self, paramsfilename):
    """Parse the ProteoSAFe params XML at *paramsfilename* and cache its
    mangled-filename mapping on this instance as ``self.mangled_mapping``."""
    with open(paramsfilename, "r") as params_handle:
        parsed_params = ming_proteosafe_library.parse_xml_file(params_handle)
    self.mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(parsed_params)
def main():
    """Build qiime2-style metadata and manifest TSVs from the workflow's
    (optional) metadata file and mangled-file mapping, then submit them with
    the cluster bucket table to a remote service and download the resulting
    .qza table and .qzv emperor plot.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('param_xml', help='metadata_folder')
    parser.add_argument('cluster_buckets', help='cluster_buckets')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_folder', help='output_folder')
    args = parser.parse_args()

    param_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml, "r"))

    if param_object["CREATE_CLUSTER_BUCKETS"][0] == "0":
        print("Do not do things")
        exit(0)

    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_object)

    """Reading Metadata File"""
    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)

    object_list = []
    if len(metadata_files_in_folder) != 1:
        # No (or ambiguous) metadata: synthesize one row per input spectrum file.
        for real_name in reverse_file_mangling:
            mangled_name = reverse_file_mangling[real_name]
            if mangled_name.find("spec") == -1:
                continue
            object_list.append({"filename": real_name})
    else:
        print(metadata_files_in_folder[0])
        object_list = ming_fileio_library.parse_table_with_headers_object_list(metadata_files_in_folder[0])
        if len(object_list) == 0:
            # Metadata file was empty: fall back to the synthesized rows.
            for real_name in reverse_file_mangling:
                mangled_name = reverse_file_mangling[real_name]
                if mangled_name.find("spec") == -1:
                    continue
                object_list.append({"filename": real_name})

    #Writing headers
    header_list = ["#SampleID", "BarcodeSequence", "LinkerPrimerSequence"]
    for key in object_list[0]:
        if not key in header_list:
            header_list.append(key)
    header_list.append("ATTRIBUTE_GNPSDefaultGroup")

    for metadata_object in object_list:
        if not "#SampleID" in metadata_object:
            # Bug fix: removed a dead inner `if "#SampleID" in metadata_object`
            # branch (always false inside this guard). Default the sample ID
            # to the filename stripped of all non-alphanumeric characters.
            metadata_object["#SampleID"] = ''.join(ch for ch in metadata_object["filename"] if ch.isalnum())
        if not "Description" in metadata_object:
            metadata_object["Description"] = "LoremIpsum"
        if not "BarcodeSequence" in metadata_object:
            metadata_object["BarcodeSequence"] = "GATACA"
        if not "LinkerPrimerSequence" in metadata_object:
            metadata_object["LinkerPrimerSequence"] = "GATACA"

        # Default group (G1..G6) is inferred from the mangled-name prefix.
        try:
            mangled_name = reverse_file_mangling[metadata_object["filename"]]
            if mangled_name.find("spec-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G1"
            elif mangled_name.find("spectwo-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G2"
            elif mangled_name.find("specthree-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G3"
            elif mangled_name.find("specfour-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G4"
            elif mangled_name.find("specfive-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G5"
            elif mangled_name.find("specsix-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G6"
        except KeyError:
            print(metadata_object["filename"], "Not Mapped")
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "Not Mapped"

    output_metadata_filename = os.path.join(args.output_folder, "qiime2_metadata.tsv")
    output_manifest_filename = os.path.join(args.output_folder, "qiime2_manifest.tsv")

    # Prefer an explicit sample_name column when present and non-trivial.
    for metadatum in object_list:
        if "sample_name" in metadatum:
            if len(metadatum["sample_name"]) > 1:
                metadatum["#SampleID"] = metadatum["sample_name"]

    metadata_df = pd.DataFrame(object_list)
    metadata_df.to_csv(output_metadata_filename, index=False, sep="\t", columns=header_list)

    """Outputting Manifest Filename"""
    manifest_df = pd.DataFrame()
    manifest_df["sample_name"] = metadata_df["#SampleID"]
    manifest_df["filepath"] = metadata_df["filename"]
    manifest_df.to_csv(output_manifest_filename, index=False, sep=",")

    """Calling remote server to do the calculation"""
    SERVER_BASE = "http://dorresteinappshub.ucsd.edu:5024"
    #SERVER_BASE = "http://mingwangbeta.ucsd.edu:5024"

    # Fix: close the uploaded file handles deterministically (they were
    # previously opened inline and never closed).
    with open(output_manifest_filename, 'r') as manifest_f, \
         open(output_metadata_filename, 'r') as metadata_f, \
         open(args.cluster_buckets, 'r') as bucket_f:
        files = {'manifest': manifest_f, \
                 'metadata': metadata_f, \
                 'bucket': bucket_f}
        r_post = requests.post(SERVER_BASE + "/processclassic", files=files)
    response_dict = r_post.json()

    with open(os.path.join(args.output_folder, "qiime2_table.qza"), 'wb') as f:
        r = requests.get(SERVER_BASE + response_dict["table_qza"], stream=True)
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)

    with open(os.path.join(args.output_folder, "qiime2_emperor.qzv"), 'wb') as f:
        r = requests.get(SERVER_BASE + response_dict["emperor_qzv"], stream=True)
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)
def main():
    """Search our clustered spectra against public GNPS datasets and write
    each match, annotated with library IDs and network-neighbor counts,
    to a TSV. Writes an empty table and exits when matching is disabled.
    """
    paramxml_input_filename = sys.argv[1]
    parallel_param_filename = sys.argv[2]
    input_spectra_folder = sys.argv[3]
    library_search_results_filename = sys.argv[4]
    output_matches_filename = sys.argv[5]

    params_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_input_filename))

    # Output schema; an empty table with these columns is still written
    # when public-data matching is disabled or the parameter is absent.
    output_map = {"specs_filename" : [],"specs_scan" : [], "dataset_filename" : [], "dataset_scan" : [], "score" : [], "dataset_id" : [], "dataset_title" : [], "dataset_neighbors" : [], "Compound_Name" : [], "SpectrumID" : []}
    try:
        if params_obj["FIND_MATCHES_IN_PUBLIC_DATA"][0] != "1":
            ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
            exit(0)
    except:
        # Parameter missing entirely: same as disabled.
        ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
        exit(0)

    #If we are doing parallel
    partition_total = 1
    partition_of_node = 0
    params_map = json.loads(open(parallel_param_filename).read())
    # NOTE: "total_paritions" is the upstream producer's spelling; keep as-is.
    partition_total = params_map["total_paritions"]
    partition_of_node = params_map["node_partition"]

    dataset_dict = params_map["dataset_dict"]
    all_datasets = params_map["all_datasets"]

    #print(len(all_datasets))
    #print(partition_of_node)
    #print(partition_total)

    # NOTE(review): the partition slice below is commented out, so every node
    # currently searches ALL datasets — confirm whether this is intentional.
    #all_datasets = all_datasets[partition_of_node::partition_total]

    all_matches = finding_matches_in_public_data(os.path.join(input_spectra_folder, "specs_ms.mgf"), all_datasets)

    #Lets parse the search results and then populate this thing with search results
    library_search_result_count, library_search_data = ming_fileio_library.parse_table_with_headers(library_search_results_filename)
    scan_to_library_map = {}
    for i in range(library_search_result_count):
        scan = library_search_data["Scan"][i]
        scan_to_library_map[scan] = {"Compound_Name" : library_search_data["Compound_Name"][i], "SpectrumID" : library_search_data["SpectrumID"][i]}

    for dataset in all_matches:
        #For each dataset, lets try to find the clustering information
        if len(all_matches[dataset]["matches"]) == 0:
            continue
        most_recent_molecular_networking_job = ming_gnps_library.get_most_recent_continuous_networking_of_dataset(dataset_dict[dataset]["task"])
        molecular_network = get_molecular_network_obj(most_recent_molecular_networking_job)
        for match in all_matches[dataset]["matches"]:
            output_map['specs_filename'].append("specs_ms.mgf")
            output_map['specs_scan'].append(match.query_scan)
            output_map['dataset_id'].append(dataset_dict[dataset]["dataset"])
            output_map['dataset_title'].append(dataset_dict[dataset]["title"])
            output_map['dataset_filename'].append(match.filename)
            output_map['dataset_scan'].append(match.scan)
            output_map['score'].append(match.score)

            #List the library identifications
            if str(match.query_scan) in scan_to_library_map:
                output_map['Compound_Name'].append(scan_to_library_map[str(match.query_scan)]["Compound_Name"])
                output_map['SpectrumID'].append(scan_to_library_map[str(match.query_scan)]["SpectrumID"])
            else:
                output_map['Compound_Name'].append("")
                output_map['SpectrumID'].append("")

            #Lets find all the analogs available
            if molecular_network != None:
                neighbors_in_dataset = molecular_network.get_node_neighbors(match.scan)
                output_map['dataset_neighbors'].append(len(neighbors_in_dataset))
            else:
                output_map['dataset_neighbors'].append(0)

    ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
def main():
    """Merge the workflow's default G1..G6 group assignments (derived from
    mangled filename prefixes) into the user metadata table, adding an
    ATTRIBUTE_DefaultGroup column and rows for unmentioned input files."""
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_metadata_file', help='output_metadata_file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.proteosafe_parameters))
    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_obj)

    # Mangled-name prefix markers and the default group each one denotes.
    marker_group_pairs = [
        ("specone-", "G1"),
        ("spectwo-", "G2"),
        ("specthree-", "G3"),
        ("specfour-", "G4"),
        ("specfive-", "G5"),
        ("specsix-", "G6"),
    ]

    default_group_mapping = defaultdict(list)
    file_to_group_mapping = {}
    for mangled_name in mangled_file_mapping:
        real_path = mangled_file_mapping[mangled_name]
        for marker, group in marker_group_pairs:
            if marker in mangled_name:
                default_group_mapping[group].append(real_path)
                file_to_group_mapping[os.path.basename(real_path)] = group

    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)

    row_count = 0
    table_data = defaultdict(list)
    if len(metadata_files_in_folder) == 1:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(metadata_files_in_folder[0])

    print(table_data)
    for key in table_data:
        print(key, len(table_data[key]))

    for row_index in range(row_count):
        print(row_index)
        filename = table_data["filename"][row_index]
        if len(filename) < 2:
            # NOTE(review): skipping here without appending to
            # ATTRIBUTE_DefaultGroup can misalign columns — confirm upstream
            # guarantees non-trivial filenames.
            continue
        print(filename, filename[0], filename[-1])
        # Trim one pair of surrounding double quotes, if present.
        if filename.startswith('"'):
            filename = filename[1:]
        if filename.endswith('"'):
            filename = filename[:-1]
        table_data["filename"][row_index] = filename
        basename_filename = os.path.basename(filename)
        group_name = file_to_group_mapping.get(basename_filename, "NoDefaultGroup")
        table_data["ATTRIBUTE_DefaultGroup"].append(group_name)

    # Append rows for input files the metadata never mentioned.
    for input_filename in file_to_group_mapping:
        if input_filename in table_data["filename"]:
            continue
        for key in table_data:
            if key != "ATTRIBUTE_DefaultGroup" and key != "filename":
                table_data[key].append("N/A")
        table_data["ATTRIBUTE_DefaultGroup"].append(file_to_group_mapping[input_filename])
        table_data["filename"].append(input_filename)

    ming_fileio_library.write_dictionary_table_data(table_data, args.output_metadata_file)
def main():
    """Summarize instrument/run metadata for each spectrum file in parallel
    (via the msaccess-based summary_wrapper), then merge the per-file result
    tables — keeping only the fields of interest — into one output table
    annotated with each file's original CCMS path.
    """
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('workflow_parameters', help='output folder for parameters')
    parser.add_argument('result_file', help='output folder for parameters')
    parser.add_argument('msaccess_binary', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)
    spectra_files.sort()

    tempresults_folder = "tempresults"
    # Fix: the original wrapped os.mkdir in a bare try/except that printed
    # "folder error" and swallowed every failure; exist_ok=True keeps only
    # the intended "already exists is fine" behavior.
    os.makedirs(tempresults_folder, exist_ok=True)

    # One work item per spectrum file.
    parameter_list = []
    for spectrum_file in spectra_files:
        param_dict = {}
        param_dict["spectrum_file"] = spectrum_file
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        parameter_list.append(param_dict)

    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(summary_wrapper, parameter_list, 10)

    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    wanted_fields = ["Filename", "Vendor", "Model", "MS1s", "MS2s"]
    for input_file in all_result_files:
        try:
            result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
            for result in result_list:
                full_result_list.append({field: result[field] for field in wanted_fields})
        except Exception:
            # Best-effort merge: a malformed or partial result file is
            # reported and skipped rather than failing the whole merge.
            print("Error", input_file)

    # Map each mangled spectrum filename back to its original CCMS path.
    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["Filename"])
        result_object["full_CCMS_path"] = mangled_mapping[mangled_name]

    ming_fileio_library.write_list_dict_table_data(full_result_list, args.result_file)