def main():
    """Write the ili-compatible quant output and an HTML page that links/redirects into ili.

    argv: params xml, metadata folder, clusterinfo file, clusterinfo summary,
    STL model folder, output ili filename, output HTML filename.
    """
    param_filename = sys.argv[1]
    metadata_folder = sys.argv[2]
    input_clusterinfo_file = sys.argv[3]
    input_clusterinfosummary = sys.argv[4]
    ili_stl_model_folder = sys.argv[5]
    output_ili_filename = sys.argv[6]
    view_ili_html_filename = sys.argv[7]

    param_object = ming_proteosafe_library.parse_xml_file(open(param_filename, "r"))

    # Output is produced only when the workflow parameter is explicitly "1";
    # a missing/malformed parameter counts as "do not create".
    try:
        create_output = param_object["CREATE_ILI_OUTPUT"][0] == "1"
    except:
        create_output = False

    if not create_output:
        open(output_ili_filename, "w").write("No Output")
        open(view_ili_html_filename, "w").write("ili output was not selected or no metadata file was provided")
        return

    stl_model_files = ming_fileio_library.list_files_in_dir(ili_stl_model_folder)
    metadata_files = ming_fileio_library.list_files_in_dir(metadata_folder)

    # Coordinates come from the (single) metadata file; without it we cannot proceed.
    if len(metadata_files) != 1:
        print("Metadata file not provided, cannot create ili compatible output without coordinates")
        exit(1)

    filename_coordinate_mapping = load_filename_to_coordinate_mapping(metadata_files[0])
    create_ili_output_from_clusterinfo(input_clusterinfo_file, param_filename, input_clusterinfosummary, filename_coordinate_mapping, output_ili_filename)

    # The HTML page either redirects straight into ili (exactly one STL model)
    # or explains why a direct link is impossible.
    if len(stl_model_files) == 1:
        with open(view_ili_html_filename, "w") as html_out:
            html_out.write("<script>\n")
            html_out.write('window.location.replace("https://ili.embl.de/?https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=ili_stl_model/ili_stl_model-00000.stl;https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=ili_output/ili_quant.csv")\n' % (param_object["task"][0], param_object["task"][0]))
            html_out.write("</script>\n")
    elif len(stl_model_files) == 0:
        with open(view_ili_html_filename, "w") as html_out:
            html_out.write("No STL file uploaded, cannot directly link to ili\n")
    else:
        with open(view_ili_html_filename, "w") as html_out:
            html_out.write("Too many stl files uploaded\n")
def retreive_proteosafe_backend_task_directory_file(task_id, servername, source_folder_name, target_file):
    """Copy the single file from a ProteoSAFe backend task folder to target_file.

    Looks in /data/<servername>/tasks/<task_id>/<source_folder_name>; copies
    only when exactly one file is present, otherwise does nothing.
    """
    proteosafe_data_path = "/data/" + servername + "/tasks/"
    source_folder_path = os.path.join(proteosafe_data_path, task_id, source_folder_name)
    source_files = ming_fileio_library.list_files_in_dir(source_folder_path)
    if len(source_files) == 1:
        # Can Copy
        # BUGFIX: was os.path.join(source_files, source_files[0]) — joining
        # against the list object raises TypeError; join against the folder.
        source_file = os.path.join(source_folder_path, source_files[0])
        print("Copying from " + source_file + " to " + target_file)
        shutil.copyfile(source_file, target_file)
def get_proteosafe_result_file_path(task_id, username, source_folder_name):
    """List the files in a user's ProteoSAFe task result folder; [] if the folder is absent."""
    result_folder = os.path.join("/data/ccms-data/tasks/", username, task_id, source_folder_name)
    if ming_fileio_library.is_path_present(result_folder):
        return ming_fileio_library.list_files_in_dir(result_folder)
    return []
def get_proteosafe_backend_result_file_path(task_id, source_folder_name, site):
    """List the files in a backend ProteoSAFe task result folder; [] if absent.

    The "proteomics2" site keeps its tasks under /data/beta-proteomics2.
    """
    base_path = "/data/"
    if site == "proteomics2":
        base_path += "beta-proteomics2"
    result_folder = os.path.join(base_path, "tasks", task_id, source_folder_name)
    if not ming_fileio_library.is_path_present(result_folder):
        return []
    return ming_fileio_library.list_files_in_dir(result_folder)
def main():
    """Merge every header-keyed table in a folder column-wise into one TSV.

    argv: input folder, output tsv path.
    """
    source_folder = sys.argv[1]
    merged_tsv_path = sys.argv[2]

    combined_columns = defaultdict(list)
    for table_path in ming_fileio_library.list_files_in_dir(source_folder):
        print("loading", table_path)
        _, columns = ming_fileio_library.parse_table_with_headers(table_path)
        # Append each file's column values under the shared header name.
        for header in columns:
            combined_columns[header].extend(columns[header])
    ming_fileio_library.write_dictionary_table_data(combined_columns, merged_tsv_path)
def determine_filetype_of_import(input_folder):
    """Map the extension of the first file in input_folder to a spectrum format name.

    Returns "netcdf", "mzxml" or "mzml"; prints a message and exits the
    process for any other extension (or an empty folder).
    """
    extension_to_format = {".CDF": "netcdf", ".MZXML": "mzxml", ".MZML": "mzml"}
    input_filenames = ming_fileio_library.list_files_in_dir(input_folder)
    # ROBUSTNESS: an empty folder previously crashed with IndexError on [0].
    if not input_filenames:
        print("Unsupported extension")
        exit(1)
    ext = ming_fileio_library.get_filename_extension(input_filenames[0])
    file_format = extension_to_format.get(ext.upper())
    if file_format is None:
        print("Unsupported extension")
        exit(1)
    return file_format
def main():
    """Column-wise merge of all per-partition intermediate tables into one output file.

    argv: intermediate folder, output filename.
    """
    input_intermediate_folder = sys.argv[1]
    output_filename = sys.argv[2]

    # Removed unused local `all_protein_stats` (dead leftover).
    # Creating a command line for each partition
    all_intermediate_files = ming_fileio_library.list_files_in_dir(input_intermediate_folder)

    output_map = defaultdict(list)
    for parallel_output_filename in all_intermediate_files:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(parallel_output_filename)
        for key in table_data:
            output_map[key] += table_data[key]

    ming_fileio_library.write_dictionary_table_data(output_map, output_filename)
def main():
    """Merge search-result tables and keep only the top-K hits per spectrum by MQScore.

    argv: results folder, workflow params xml, output tsv.
    """
    results_folder = sys.argv[1]
    param_xml_filename = sys.argv[2]
    output_tsv = sys.argv[3]

    params_obj = ming_proteosafe_library.parse_xml_file(open(param_xml_filename))
    # Default to keeping a single best hit when the parameter is missing/invalid.
    try:
        top_k = int(params_obj["TOP_K_RESULTS"][0])
    except:
        top_k = 1

    # Flatten every row of every input table into a list of row dicts.
    merged_results = []
    for input_file in ming_fileio_library.list_files_in_dir(results_folder):
        print("loading", input_file)
        row_count, table_data = ming_fileio_library.parse_table_with_headers(input_file)
        merged_results.extend(
            {key: table_data[key][i] for key in table_data} for i in range(row_count)
        )

    # Group hits per (file, scan) so the cutoff applies to each spectrum independently.
    results_per_spectrum = defaultdict(list)
    for result_obj in merged_results:
        spectrum_key = result_obj["SpectrumFile"] + "___" + result_obj["#Scan#"]
        results_per_spectrum[spectrum_key].append(result_obj)

    output_results = []
    for spectrum_key in results_per_spectrum:
        ranked = sorted(results_per_spectrum[spectrum_key],
                        key=lambda hit: float(hit["MQScore"]), reverse=True)
        output_results.extend(ranked[:top_k])

    # Back to column-oriented form for the table writer.
    output_dict = defaultdict(list)
    for result_obj in output_results:
        for key in result_obj:
            output_dict[key].append(result_obj[key])
    ming_fileio_library.write_dictionary_table_data(output_dict, output_tsv)
def load_metadata_mapping(metadata_folder):
    """Map each 'filename' to its '#SampleID' using the single metadata table in the folder.

    Returns {} unless the folder contains exactly one file.
    """
    metadata_files = ming_fileio_library.list_files_in_dir(metadata_folder)
    if len(metadata_files) != 1:
        return {}
    row_count, table_data = ming_fileio_library.parse_table_with_headers(metadata_files[0])
    # Later rows overwrite earlier duplicates, matching plain dict assignment.
    return {
        table_data["filename"][i]: table_data["#SampleID"][i]
        for i in range(row_count)
    }
def load_collision_energy_mapping(input_folder):
    """Index spectrum metadata dicts by "<filename>:<scan>" across all JSON files in a folder.

    Each input file is a JSON list of metadata objects that carry at least
    "filename" and "scan" keys; the whole metadata dict is stored per key.
    """
    scan_maps = {}
    all_files = ming_fileio_library.list_files_in_dir(input_folder)
    for input_file in all_files:
        print(input_file)
        list_of_metadata = json.loads(open(input_file).read())
        for metadata in list_of_metadata:
            # BUGFIX: removed dead `collision_energy = metadata["scan"]` — a
            # copy-paste slip (it read "scan" instead of "collision_energy")
            # whose value was never used; the full dict is stored instead.
            key = metadata["filename"] + ":" + str(metadata["scan"])
            scan_maps[key] = metadata
    return scan_maps
def main():
    """Append the columns of every table in a folder into a single merged TSV.

    argv: input folder, output tsv path.
    """
    folder = sys.argv[1]
    destination_tsv = sys.argv[2]

    merged = defaultdict(list)
    for path in ming_fileio_library.list_files_in_dir(folder):
        print("loading", path)
        count, table = ming_fileio_library.parse_table_with_headers(path)
        for column_name, values in table.items():
            merged[column_name] += values

    ming_fileio_library.write_dictionary_table_data(merged, destination_tsv)
def main():
    """Build the molecular network graphml: load pairs, annotate, and append extra edges.

    argv: pairs file, clusterinfo summary, library results, extra-pairs folder, output graphml.
    """
    network_graph = molecular_network_filtering_library.loading_network(sys.argv[1], hasHeaders=True)
    molecular_network_filtering_library.add_clusterinfo_summary_to_graph(network_graph, sys.argv[2])
    molecular_network_filtering_library.add_library_search_results_to_graph(network_graph, sys.argv[3])
    # Every file in the extra-pairs folder contributes additional edges.
    for extra_pairs_file in ming_fileio_library.list_files_in_dir(sys.argv[4]):
        print("Adding Additional Edges", extra_pairs_file)
        molecular_network_filtering_library.add_additional_edges(network_graph, extra_pairs_file)
    nx.write_graphml(network_graph, sys.argv[5], infer_numeric_types=True)
def find_matches_in_dataset(dataset_id, input_spectrum_collection, identification_map):
    """Search every peak file of a dataset against the query spectrum collection.

    Returns a list of match dicts (one per spectral match) carrying file,
    scan, score and — when available in identification_map — the compound
    identification and spectrum id.
    """
    matches = []
    peak_folder = os.path.join(PATH_TO_DATASET_UPLOADS, dataset_id, "peak")
    for peak_file in ming_fileio_library.list_files_in_dir(peak_folder):
        print(peak_file)
        relative_path = os.path.relpath(peak_file, PATH_TO_DATASET_UPLOADS)

        reference_spectra = ming_spectrum_library.SpectrumCollection(peak_file)
        reference_spectra.load_from_mzXML(drop_ms1=True)

        # Flag files whose name suggests a blank run.
        is_blank = 1 if peak_file.find("blank") != -1 else 0

        for query_spectrum in input_spectrum_collection.spectrum_list:
            for match in reference_spectra.search_spectrum(query_spectrum, 1.0, 1.0, 4, 0.7, 1):
                match_obj = {
                    "filename": relative_path,
                    "scan": match.scan,
                    "score": match.score,
                    "query_filename": match.query_filename,
                    "query_scan": match.query_scan,
                    "ppm_error": match.ppm_error,
                    "is_blank": is_blank,
                    "dataset_id": dataset_id,
                }
                # compound identification (blank when the scan is unidentified)
                if match.scan in identification_map:
                    identification = identification_map[match.scan]
                    match_obj["identification"] = identification["identification"]
                    match_obj["spectrum_id"] = identification["spectrum_id"]
                else:
                    match_obj["identification"] = ""
                    match_obj["spectrum_id"] = ""
                matches.append(match_obj)
    return matches
def main():
    """Row-wise merge of all intermediate result tables into one output file.

    argv: intermediate folder, output filename.
    """
    input_intermediate_folder = sys.argv[1]
    output_filename = sys.argv[2]

    # Removed unused local `all_protein_stats` (dead leftover).
    # Creating a command line for each partition
    all_intermediate_files = ming_fileio_library.list_files_in_dir(input_intermediate_folder)

    output_list = []
    for parallel_output_filename in all_intermediate_files:
        result_list = ming_fileio_library.parse_table_with_headers_object_list(parallel_output_filename)
        output_list += result_list

    ming_fileio_library.write_list_dict_table_data(output_list, output_filename)
def main():
    """Concatenate every file in a folder into a single "merged" file.

    argv: input folder, output folder. The merged file keeps the extension
    of the first input file, with a newline written after each file's content.
    """
    input_folder = sys.argv[1]
    output_filename_folder = sys.argv[2]

    input_files = ming_fileio_library.list_files_in_dir(input_folder)
    # BUGFIX: os.path.split(...)[1] returns the whole basename, not the
    # extension, so the output used to be named "merged<firstfilename>";
    # splitext yields the actual extension (e.g. ".mzML").
    extension = os.path.splitext(input_files[0])[1]
    output_filename = os.path.join(output_filename_folder, "merged" + extension)

    with open(output_filename, "w") as output_file:
        for input_file in input_files:
            for line in open(input_file):
                output_file.write(line)
            # Separate concatenated files even when one lacks a trailing newline.
            output_file.write("\n")
def main():
    """Column-wise merge of all per-partition intermediate tables into one output file.

    argv: intermediate folder, output filename.
    """
    input_intermediate_folder = sys.argv[1]
    output_filename = sys.argv[2]

    # Removed unused local `all_protein_stats` (dead leftover).
    # Creating a command line for each partition
    all_intermediate_files = ming_fileio_library.list_files_in_dir(input_intermediate_folder)

    output_map = defaultdict(list)
    for parallel_output_filename in all_intermediate_files:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(parallel_output_filename)
        for key in table_data:
            output_map[key] += table_data[key]

    ming_fileio_library.write_dictionary_table_data(output_map, output_filename)
def main():
    """Convert one JSON list of library spectra into MGF, TSV and sptxt outputs.

    argv: input JSON file, tsv output folder, mgf output folder, sptxt output folder.
    """
    input_intermediate_file = sys.argv[1]
    output_tsv_folder = sys.argv[2]
    output_mgf_folder = sys.argv[3]
    output_sptxt_folder = sys.argv[4]

    # BUGFIX: removed a list_files_in_dir call on `input_intermediate_folder`,
    # an undefined name (NameError) whose result was never used anyway.
    library_spectrum_collection = ming_spectrum_library.SpectrumCollection("library spectra")

    all_json_spectra_list = json.load(open(input_intermediate_file))
    print("Loaded", input_intermediate_file, len(all_json_spectra_list))

    # BUGFIX: the loop used to iterate `list_of_library_spectra`, an undefined
    # name; iterate the list actually loaded from the JSON file.
    for library_spectrum in all_json_spectra_list:
        lib_spec = ming_spectrum_library.PeptideLibrarySpectrum(
            "", 0, 0,
            library_spectrum["peaks"],
            library_spectrum["mz"],
            library_spectrum["charge"],
            library_spectrum["annotation"],
            library_spectrum["protein"])

        # Optional attributes are copied only when present in the JSON record.
        if "score" in library_spectrum:
            lib_spec.score = library_spectrum["score"]
        if "variant_score" in library_spectrum:
            lib_spec.variant_score = library_spectrum["variant_score"]
        if "spectra_to_consider" in library_spectrum:
            lib_spec.num_spectra = library_spectrum["spectra_to_consider"]
        if "ranking" in library_spectrum:
            lib_spec.spectrum_ranking = library_spectrum["ranking"]
        if "proteosafe_task" in library_spectrum:
            lib_spec.proteosafe_task = library_spectrum["proteosafe_task"]
        if "originalspectrum_filename" in library_spectrum:
            lib_spec.originalfile_filename = library_spectrum["originalspectrum_filename"]
        if "originalspectrum_scan" in library_spectrum:
            lib_spec.originalfile_scan = str(library_spectrum["originalspectrum_scan"])

        library_spectrum_collection.spectrum_list.append(lib_spec)

    base_name = os.path.splitext(os.path.basename(input_intermediate_file))[0]
    output_mgf_filename = os.path.join(output_mgf_folder, base_name + ".mgf")
    # BUGFIX: the tsv and sptxt paths were joined against `output_tsv_filename`
    # before it existed (NameError); join against their respective folders.
    output_tsv_filename = os.path.join(output_tsv_folder, base_name + ".tsv")
    output_sptxt_filename = os.path.join(output_sptxt_folder, base_name + ".sptxt")

    # BUGFIX: save_to_mgf/save_to_tsv were called on
    # `library_spectrum_collection_split`, which is never defined here.
    library_spectrum_collection.save_to_mgf(open(output_mgf_filename, "w"))
    library_spectrum_collection.save_to_tsv(open(output_tsv_filename, "w"), output_mgf_filename)
    try:
        library_spectrum_collection.save_to_sptxt(open(output_sptxt_filename, "w"))
    except:
        traceback.print_exc(file=sys.stdout)
        print("MEH")
def main():
    """Merge this partition's pickled PSM sets into a single .psms output file.

    argv: parallel-info JSON, params xml, folder of pickled PSM sets, output folder.
    """
    parallel_json = json.loads(open(sys.argv[1]).read())
    params_filename = sys.argv[2]
    input_folder_of_results = sys.argv[3]
    output_folder = sys.argv[4]

    my_node = parallel_json["node_partition"]
    # NOTE: "total_paritions" (sic) is the key actually present in the input JSON.
    total_node = parallel_json["total_paritions"]

    all_input_files = ming_fileio_library.list_files_in_dir(input_folder_of_results)
    all_input_files.sort()

    ###
    ### TODO We will have to read parameters and see if we need to eliminate some PSMs,
    ### with PSM FDR filter, KL Filter, ambiguity score filter, unique intensity filter
    ###
    params_obj = ming_proteosafe_library.parse_xml_file(open(params_filename))

    # Stride-partition the sorted file list across the parallel nodes.
    all_input_files = all_input_files[my_node::total_node]

    current_working_psm_set = ming_psm_library.PSMset("Ming")
    total_file_count = 0
    for input_file in all_input_files:
        # Assume these are variant files: treat each like a psm file and
        # combine them all into a new variants file.
        total_file_count += 1
        print(input_file, total_file_count, "of", len(all_input_files))
        temp_psm_set = pickle.load(open(input_file, 'rb'))
        print("Loaded", len(temp_psm_set.psms))
        # Removed dead per-PSM locals (precursor_string, score, current_score,
        # peptide_length) that were computed and never used — leftovers of an
        # unfinished score-cutoff feature (see TODO above).
        current_working_psm_set.psms.extend(temp_psm_set.psms)

    # Saving out psms
    output_filename = os.path.join(output_folder, str(my_node) + ".psms")
    current_working_psm_set.write_output(open(output_filename, "w"), True)
def main():
    """Merge per-file peptide library MGFs, remap each peptide's protein, and save MGF + TSV.

    argv: input folder, protein FDR file, peptide->protein mapping file,
    mgf output folder, tsv output folder, output filename prefix.
    """
    input_folder = sys.argv[1]
    input_protein_fdr_filename = sys.argv[2]
    input_peptide_protein_mapping_filename = sys.argv[3]
    precursor_to_protein_map = load_precursor_to_protein_mapping(input_peptide_protein_mapping_filename)
    output_mgf_folder = sys.argv[4]
    output_tsv_folder = sys.argv[5]
    output_filename_prefix = sys.argv[6]

    all_library_spectra = []
    for input_filename in ming_fileio_library.list_files_in_dir(input_folder):
        temp_spectra = ming_spectrum_library.load_mgf_peptide_library(input_filename)
        print("loaded ", len(temp_spectra), "from", input_filename)
        for spectrum in temp_spectra:
            # CREATION_FALSE_PROTEIN entries skip the remap only; note the
            # spectrum itself is still included in the merged output below.
            if spectrum.protein == "CREATION_FALSE_PROTEIN":
                continue
            spectrum.protein = precursor_to_protein_map[spectrum.peptide]
        all_library_spectra += temp_spectra

    merged_collection = ming_spectrum_library.SpectrumCollection("library spectra")
    merged_collection.spectrum_list = all_library_spectra

    output_tsv_filename = os.path.join(output_tsv_folder, output_filename_prefix + ".tsv")
    output_mgf_filename = os.path.join(output_mgf_folder, output_filename_prefix + ".mgf")
    merged_collection.save_to_mgf(open(output_mgf_filename, "w"))
    merged_collection.save_to_tsv(open(output_tsv_filename, "w"), output_mgf_filename)
def main():
    """Concatenate tabular files, keeping the header line only from the first file.

    argv: input folder, output file.
    """
    # Removed unused local `output_dict = defaultdict(list)` (dead leftover).
    input_files_list = ming_fileio_library.list_files_in_dir(sys.argv[1])
    with open(sys.argv[2], "w") as output_file:
        for file_count, input_file in enumerate(input_files_list):
            for row_count, line in enumerate(open(input_file)):
                # Row 0 is the header: written only for the very first file.
                if row_count != 0 or file_count == 0:
                    output_file.write(line)
def main():
    """Assemble the final network graphml with cluster info, library IDs and extra edges.

    argv: pairs file, clusterinfo summary, library results, extra-pairs folder, output graphml.
    """
    graph = molecular_network_filtering_library.loading_network(sys.argv[1], hasHeaders=True)
    molecular_network_filtering_library.add_clusterinfo_summary_to_graph(graph, sys.argv[2])
    molecular_network_filtering_library.add_library_search_results_to_graph(graph, sys.argv[3])

    additional_pairs_folder = sys.argv[4]
    for pairs_path in ming_fileio_library.list_files_in_dir(additional_pairs_folder):
        print("Adding Additional Edges", pairs_path)
        molecular_network_filtering_library.add_additional_edges(graph, pairs_path)

    nx.write_graphml(graph, sys.argv[5], infer_numeric_types=True)
def main():
    """Row-wise merge of intermediate tables, soft-capped at ~10M rows.

    argv: intermediate folder, output file.
    """
    intermediate_folder = sys.argv[1]
    output_file = sys.argv[2]

    merged_columns = defaultdict(list)
    total_rows = 0
    for filename in ming_fileio_library.list_files_in_dir(intermediate_folder):
        # Soft cap: once past 10M rows, remaining files are skipped (not parsed).
        if total_rows > 10000000:
            continue
        row_count, table_data = ming_fileio_library.parse_table_with_headers(filename)
        total_rows += row_count
        for i in range(row_count):
            for key in table_data:
                merged_columns[key].append(table_data[key][i])

    ming_fileio_library.write_dictionary_table_data(merged_columns, output_file)
def main():
    """Emit one parameter file per parallel partition for the library search."""
    parser = argparse.ArgumentParser(description='Create parallel parameters')
    parser.add_argument('library_folder', help='Input mgf file to network')
    parser.add_argument('workflow_parameters', help='proteosafe xml parameters')
    parser.add_argument('parameters_output_folder', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)

    # (key written to the params file, key looked up in the workflow XML) —
    # order matters: it fixes the line order of the emitted file.
    passthrough_keys = [
        # Search Criteria
        ("MIN_MATCHED_PEAKS", "MIN_MATCHED_PEAKS"),
        ("TOP_K_RESULTS", "TOP_K_RESULTS"),
        ("search_peak_tolerance", "tolerance.Ion_tolerance"),
        ("search_parentmass_tolerance", "tolerance.PM_tolerance"),
        ("ANALOG_SEARCH", "ANALOG_SEARCH"),
        ("MAX_SHIFT_MASS", "MAX_SHIFT_MASS"),
        ("SEARCH_LIBQUALITY", "SEARCH_LIBQUALITY"),
        # Filtering Criteria
        ("FILTER_PRECURSOR_WINDOW", "FILTER_PRECURSOR_WINDOW"),
        ("MIN_PEAK_INT", "MIN_PEAK_INT"),
        ("WINDOW_FILTER", "WINDOW_FILTER"),
        ("FILTER_LIBRARY", "FILTER_LIBRARY"),
    ]

    for i in range(args.parallelism):
        parameter_path = os.path.join(args.parameters_output_folder, str(i) + ".params")
        with open(parameter_path, "w") as output_parameter_file:
            for out_key, param_key in passthrough_keys:
                output_parameter_file.write("%s=%s\n" % (out_key, params_object[param_key][0]))
            output_parameter_file.write("NODEIDX=%d\n" % (i))
            output_parameter_file.write("NODECOUNT=%d\n" % (args.parallelism))
            # For GC
            output_parameter_file.write("FORCE_EXACT_MATCH=%s\n" % (params_object["FORCE_EXACT_MATCH"][0]))
            # Libraries
            output_parameter_file.write("EXISTING_LIBRARY_MGF=%s\n" % (" ".join(library_files)))
def determine_filenames_to_load(my_node_number, total_parallel, path_to_merged_library_spectra):
    """Assign this parallel node to one merged-library JSON file.

    Files named "<k>.json" are handed out round-robin by node number.
    Returns (path of the json to load, this node's position among the nodes
    sharing that file, how many nodes share that file).
    """
    merged_library_files = ming_fileio_library.list_files_in_dir(path_to_merged_library_spectra)
    file_count = len(merged_library_files)

    # Round-robin file assignment by node number.
    file_index = my_node_number % file_count
    merged_library_filename = os.path.join(path_to_merged_library_spectra, str(file_index) + ".json")

    # Base share per file; the first (total_parallel % file_count) files get one extra node.
    nodes_sharing_file = int(float(total_parallel) / float(file_count))
    if total_parallel % file_count > my_node_number % file_count:
        nodes_sharing_file += 1

    position_within_file = int(float(my_node_number) / float(file_count))
    return merged_library_filename, position_within_file, nodes_sharing_file
def main():
    """Build qiime2 metadata + manifest tables from GNPS cluster buckets, then
    run the qiime2 import / beta-diversity / PCoA / Emperor pipeline."""
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('param_xml', help='metadata_folder')
    parser.add_argument('cluster_buckets', help='cluster_buckets')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_folder', help='output_folder')
    parser.add_argument("conda_activate_bin")
    parser.add_argument("conda_environment")
    args = parser.parse_args()

    param_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml, "r"))

    if param_object["CREATE_CLUSTER_BUCKETS"][0] == "0":
        print("Do not do things")
        exit(0)

    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_object)

    # Reading Metadata File
    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)

    object_list = []
    if len(metadata_files_in_folder) != 1:
        # No (or ambiguous) metadata: synthesize one record per analyzed spectrum file.
        for real_name in reverse_file_mangling:
            mangled_name = reverse_file_mangling[real_name]
            if mangled_name.find("spec") == -1:
                continue
            object_list.append({"filename" : real_name})
    else:
        object_list_temp = ming_fileio_library.parse_table_with_headers_object_list(metadata_files_in_folder[0])
        object_list = []
        for metadata_object in object_list_temp:
            # Keep only rows with a non-trivial filename.
            if len(metadata_object["filename"]) > 1:
                object_list.append(metadata_object)

        # Adding all analyzed files that do not already appear in the metadata list
        for real_name in reverse_file_mangling:
            mangled_name = reverse_file_mangling[real_name]
            if mangled_name.find("spec") == -1:
                continue
            found = False
            for metadata_object in object_list:
                if os.path.basename(real_name) == metadata_object["filename"]:
                    found = True
                    break
            if found is False:
                object_list.append({"filename" : real_name})

    if len(object_list) == 0:
        print("Do not do things, not enough files")
        exit(0)

    # Writing headers: the qiime2-required columns first, then whatever the
    # first metadata record carries.
    header_list = ["#SampleID", "BarcodeSequence", "LinkerPrimerSequence"]
    for key in object_list[0]:
        if not key in header_list:
            header_list.append(key)
    header_list.append("ATTRIBUTE_GNPSDefaultGroup")

    for metadata_object in object_list:
        # BUGFIX: the original nested an `if "#SampleID" in metadata_object`
        # branch inside `if not "#SampleID" in ...` — code that could never
        # run; the surviving behavior is the filename fallback below.
        if not "#SampleID" in metadata_object:
            metadata_object["#SampleID"] = metadata_object["filename"]
        if not "Description" in metadata_object:
            metadata_object["Description"] = "LoremIpsum"
        if not "BarcodeSequence" in metadata_object:
            metadata_object["BarcodeSequence"] = "GATACA"
        if not "LinkerPrimerSequence" in metadata_object:
            metadata_object["LinkerPrimerSequence"] = "GATACA"

        # Adding default grouping information derived from the mangled-name prefix
        try:
            mangled_name = reverse_file_mangling[metadata_object["filename"]]
            if mangled_name.find("spec-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G1"
            elif mangled_name.find("spectwo-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G2"
            elif mangled_name.find("specthree-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G3"
            elif mangled_name.find("specfour-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G4"
            elif mangled_name.find("specfive-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G5"
            elif mangled_name.find("specsix-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G6"
        except:
            print(metadata_object["filename"], "Not Mapped")
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "Not Mapped"

    output_metadata_filename = os.path.join(args.output_folder, "qiime2_metadata.tsv")
    output_manifest_filename = os.path.join(args.output_folder, "qiime2_manifest.tsv")

    # A non-trivial sample_name column overrides the derived #SampleID.
    for metadatum in object_list:
        if "sample_name" in metadatum:
            if len(metadatum["sample_name"]) > 1:
                metadatum["#SampleID"] = metadatum["sample_name"]

    metadata_df = pd.DataFrame(object_list)

    # Outputting Manifest Filename
    manifest_df = pd.DataFrame()
    manifest_df["sample_name"] = metadata_df["#SampleID"]
    manifest_df["filepath"] = metadata_df["filename"]
    manifest_df.to_csv(output_manifest_filename, index=False, sep=",")

    metadata_df.to_csv(output_metadata_filename, index=False, sep="\t", columns=header_list)

    # Running Qiime2: import buckets -> cosine beta diversity -> PCoA -> Emperor plot.
    local_qza_table = os.path.join(args.output_folder, "qiime2_table.qza")
    local_qza_distance = os.path.join(args.output_folder, "qiime2_distance.qza")
    local_qza_pcoa = os.path.join(args.output_folder, "qiime2_pcoa.qza")
    local_qzv_emperor = os.path.join(args.output_folder, "qiime2_emperor.qzv")

    all_cmd = []
    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime metabolomics import-gnpsnetworkingclusteringbuckettable \
        --p-manifest {} \
        --p-buckettable {} \
        --o-feature-table {}".format(args.conda_activate_bin, args.conda_environment, output_manifest_filename, args.cluster_buckets, local_qza_table))
    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime diversity beta \
        --i-table {} \
        --p-metric cosine \
        --o-distance-matrix {}".format(args.conda_activate_bin, args.conda_environment, local_qza_table, local_qza_distance))
    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime diversity pcoa \
        --i-distance-matrix {} \
        --o-pcoa {}".format(args.conda_activate_bin, args.conda_environment, local_qza_distance, local_qza_pcoa))
    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime emperor plot \
        --i-pcoa {} \
        --m-metadata-file {} \
        --o-visualization {} \
        --p-ignore-missing-samples".format(args.conda_activate_bin, args.conda_environment, local_qza_pcoa, output_metadata_filename, local_qzv_emperor))

    for cmd in all_cmd:
        os.system(cmd)
def trace_filename_filesystem(all_datasets, dataset_accession, dataset_scan, enrichmetadata=False):
    """Trace a dataset cluster scan back to the raw files/scans that formed it.

    Walks the dataset's most recent continuous-networking job, reads its
    clustering membership table from the ProteoSAFe task directory, and
    collects the raw spectra whose "cluster index" matches dataset_scan.

    Parameters:
        all_datasets: iterable of dataset dicts with at least "dataset" and "task" keys
        dataset_accession: dataset id to match against each dict's "dataset"
        dataset_scan: cluster index to trace (compared as a string)
        enrichmetadata: when True, best-effort ReDU metadata lookup per filename

    Returns:
        (output_file_list, output_match_list) — one entry per unique source file,
        and one entry per individual matching raw spectrum, respectively.
    """
    output_file_list = []
    output_match_list = []
    for dataset_object in all_datasets:
        if dataset_object["dataset"] == dataset_accession:
            # Most recent continuous networking job for this dataset; skip if none.
            networking_job = ming_gnps_library.get_most_recent_continuous_networking_of_dataset(
                dataset_object["task"])
            if networking_job == None:
                continue
            networking_task_info = ming_proteosafe_library.get_task_information(
                "gnps.ucsd.edu", networking_job["task"])
            task_user = networking_task_info["user"]
            # Clustering membership lives in the task's result folder on disk.
            clustering_path = os.path.join(
                "/data/ccms-data/tasks", task_user, networking_job["task"],
                "allclustered_spectra_info_withpath")
            clustering_files = ming_fileio_library.list_files_in_dir(
                clustering_path)
            # Expect exactly one membership table; anything else is skipped.
            if len(clustering_files) != 1:
                continue
            clustering_membership_list = ming_fileio_library.parse_table_with_headers_object_list(
                clustering_files[0])
            # Raw spectra that clustered into the requested scan.
            acceptable_raw_spectra = [
                spectrum for spectrum in clustering_membership_list
                if spectrum["cluster index"] == str(dataset_scan)
            ]
            # One match record per raw spectrum.
            for raw_spectrum in acceptable_raw_spectra:
                output_object = {}
                output_object["dataset_id"] = dataset_accession
                output_object["cluster_scan"] = dataset_scan
                output_object["filename"] = raw_spectrum["Original_Path"]
                output_object["filescan"] = raw_spectrum["ScanNumber"]
                output_object["metadata"] = ""
                output_object["basefilename"] = os.path.basename(
                    raw_spectrum["Original_Path"])
                if enrichmetadata:
                    # Best effort: metadata enrichment must not break tracing.
                    try:
                        metadata_list = get_metadata_information_per_filename(
                            raw_spectrum["Original_Path"])
                        output_object["metadata"] = "|".join(metadata_list)
                    except:
                        print("ReDU is down")
                output_match_list.append(output_object)
            print(len(acceptable_raw_spectra))
            # One file record per distinct source file of the matching spectra.
            unique_files = list(
                set([
                    spectrum["Original_Path"] for spectrum in acceptable_raw_spectra
                ]))
            print(len(unique_files))
            for source_file in unique_files:
                output_object = {}
                output_object["dataset_id"] = dataset_accession
                output_object["cluster_scan"] = dataset_scan
                output_object["filename"] = source_file
                output_object["metadata"] = ""
                output_object["basefilename"] = os.path.basename(source_file)
                if enrichmetadata:
                    try:
                        metadata_list = get_metadata_information_per_filename(
                            source_file)
                        output_object["metadata"] = "|".join(metadata_list)
                    except:
                        print("ReDU is down")
                output_file_list.append(output_object)
    #Performing a fix to make sure the spectrum is present because of a renaming from <dataset>/spectrum to <dataset>/ccms_peak
    # NOTE(review): assumes paths look like "<dataset>/spectrum/..." so splits[1]
    # is the folder being renamed — confirm against upstream path format.
    for file_dict in output_file_list:
        splits = file_dict["filename"].split("/")
        splits[1] = splits[1].replace("spectrum", "ccms_peak")
        file_dict["filename"] = "/".join(splits)
    for file_dict in output_match_list:
        splits = file_dict["filename"].split("/")
        splits[1] = splits[1].replace("spectrum", "ccms_peak")
        file_dict["filename"] = "/".join(splits)
    return output_file_list, output_match_list
def main():
    """Collect this node's per-bin spectrum JSONs, group spectra by annotation+charge,
    and write one JSON-lines output plus a per-peptide summary TSV.

    argv: parallel-info JSON, intermediate folder, output folder, peptide-list folder.
    """
    input_json = json.loads(open(sys.argv[1]).read())
    input_intermediate_folder = sys.argv[2]
    output_folder = sys.argv[3]
    output_peptide_list_folder = sys.argv[4]

    my_node = input_json["node_partition"]
    output_filename = os.path.join(output_folder, str(my_node) + ".json")
    output_file = open(output_filename, "w")

    # Removed unused local `number_of_spectra` (dead leftover).
    input_json_files = ming_fileio_library.list_files_in_dir(input_intermediate_folder)
    input_json_files.sort()

    all_spectra = []
    for json_filename in input_json_files:
        # Skip files that belong to other partitions: the bin number is the
        # third "_"-separated token of the basename (e.g. x_y_<bin>.json).
        json_basename = os.path.basename(json_filename).split(".")[0]
        bin_peptide = int(json_basename.split("_")[2])
        if bin_peptide != my_node:
            continue
        print("Loading", json_filename)
        spectrum_list = json.load(open(json_filename))
        all_spectra += spectrum_list
        print("Total Spectra", len(spectrum_list), len(all_spectra))

    # Group spectra by "annotation.charge".
    peptide_dict = defaultdict(list)
    print("Creating hash")
    for spectrum in all_spectra:
        annotation = spectrum["annotation"] + "." + str(spectrum["charge"])
        peptide_dict[annotation].append(spectrum)

    # One JSON list per annotation, one per line, in sorted annotation order.
    print("writing out strings")
    all_annotation = list(peptide_dict.keys())
    all_annotation.sort()
    for annotation in all_annotation:
        output_file.write(json.dumps(peptide_dict[annotation]))
        output_file.write("\n")
    output_file.close()

    #Write out all the peptides into a file
    output_peptide_dict = defaultdict(list)
    for annotation_key in peptide_dict:
        max_score = -10
        if len(peptide_dict[annotation_key]) > 0:
            for spectrum in peptide_dict[annotation_key]:
                max_score = max(spectrum["score"], max_score)
        #max score per peptide
        output_peptide_dict["score"].append(max_score)
        output_peptide_dict["annotation_key"].append(annotation_key)
        output_peptide_dict["annotation"].append(peptide_dict[annotation_key][0]["annotation"])
        output_peptide_dict["charge"].append(peptide_dict[annotation_key][0]["charge"])
        output_peptide_dict["protein"].append(peptide_dict[annotation_key][0]["protein"])

    #writing out file
    output_peptide_filename = os.path.join(output_peptide_list_folder, str(my_node) + ".tsv")
    ming_fileio_library.write_dictionary_table_data(output_peptide_dict, output_peptide_filename)
def main():
    """Annotate a metadata table with the GNPS default group (G1-G6) each
    input file was assigned to, derived from its mangled-name prefix, and add
    rows for input files missing from the metadata."""
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_metadata_file', help='output_metadata_file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.proteosafe_parameters))
    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_obj)

    # Mangled-name prefix -> default group name (data-driven instead of six
    # copy-pasted branches).
    prefix_to_group = [
        ("specone-", "G1"),
        ("spectwo-", "G2"),
        ("specthree-", "G3"),
        ("specfour-", "G4"),
        ("specfive-", "G5"),
        ("specsix-", "G6"),
    ]

    default_group_mapping = defaultdict(list)
    file_to_group_mapping = {}
    for mangled_name in mangled_file_mapping:
        for prefix, group in prefix_to_group:
            if mangled_name.find(prefix) != -1:
                default_group_mapping[group].append(mangled_file_mapping[mangled_name])
                file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = group

    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)

    row_count = 0
    table_data = defaultdict(list)
    if len(metadata_files_in_folder) == 1:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(metadata_files_in_folder[0])

    print(table_data)
    for key in table_data:
        print(key, len(table_data[key]))

    for i in range(row_count):
        print(i)
        filename = table_data["filename"][i]
        # NOTE(review): skipping short filenames leaves no
        # ATTRIBUTE_DefaultGroup entry for this row, so that column can be
        # shorter than the others -- confirm the table writer tolerates this.
        if len(filename) < 2:
            continue

        print(filename, filename[0], filename[-1])

        # Strip surrounding double quotes some metadata files carry.
        if filename[0] == "\"":
            filename = filename[1:]
        if filename[-1] == "\"":
            filename = filename[:-1]
        table_data["filename"][i] = filename

        basename_filename = os.path.basename(filename)
        group_name = "NoDefaultGroup"
        if basename_filename in file_to_group_mapping:
            group_name = file_to_group_mapping[basename_filename]
        table_data["ATTRIBUTE_DefaultGroup"].append(group_name)

    # Add rows for input files that never appeared in the metadata table.
    for input_filename in file_to_group_mapping:
        if input_filename in table_data["filename"]:
            continue
        for key in table_data:
            if key != "ATTRIBUTE_DefaultGroup" and key != "filename":
                table_data[key].append("N/A")
        table_data["ATTRIBUTE_DefaultGroup"].append(file_to_group_mapping[input_filename])
        table_data["filename"].append(input_filename)

    ming_fileio_library.write_dictionary_table_data(table_data, args.output_metadata_file)
def main():
    """Run msaccess instrument summaries over all spectrum files in parallel,
    merge the per-file results, map mangled names back to original CCMS paths,
    and write one TSV (stub rows included for files that produced no result)."""
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('workflow_parameters', help='output folder for parameters')
    parser.add_argument('result_file', help='output folder for parameters')
    parser.add_argument('msaccess_binary', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)
    spectra_files.sort()

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except OSError:
        # Folder probably already exists; proceed best-effort.
        print("folder error")

    parameter_list = []
    for spectrum_file in spectra_files:
        param_dict = {}
        param_dict["spectrum_file"] = spectrum_file
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        parameter_list.append(param_dict)

    #for param_dict in parameter_list:
    #    search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(summary_wrapper, parameter_list, 10)

    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        try:
            result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
            for result in result_list:
                output_dict = {}
                output_dict["Filename"] = result["Filename"]
                output_dict["Vendor"] = result["Vendor"]
                output_dict["Model"] = result["Model"]
                output_dict["MS1s"] = result["MS1s"]
                output_dict["MS2s"] = result["MS2s"]
                full_result_list.append(output_dict)
        except Exception:
            # Best-effort merge: skip unparseable or incomplete result files.
            #raise
            print("Error", input_file)

    used_files = set()
    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["Filename"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path
        result_object["CCMS_filename"] = os.path.basename(full_path)
        used_files.add(full_path)

    # Emit a stub row for any input file that produced no summary result.
    for mangled_name in spectra_files:
        full_path = mangled_mapping[os.path.basename(mangled_name)]
        if full_path in used_files:
            continue
        output_dict = {}
        output_dict["full_CCMS_path"] = full_path
        output_dict["CCMS_filename"] = os.path.basename(full_path)
        full_result_list.append(output_dict)

    pd.DataFrame(full_result_list).to_csv(args.result_file, sep="\t", index=False)
def main():
    """Run spectral library search over this node's stripe of the spectrum
    files in parallel 5-file chunks, then merge results and attach the full
    CCMS path for each hit."""
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('json_parameters', help='proteosafe xml parameters')
    parser.add_argument('workflow_parameters', help='output folder for parameters')
    parser.add_argument('library_folder', help='output folder for parameters')
    parser.add_argument('result_folder', help='output folder for parameters')
    parser.add_argument('convert_binary', help='output folder for parameters')
    parser.add_argument('librarysearch_binary', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    parallel_json = json.loads(open(args.json_parameters).read())
    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)
    spectra_files.sort()

    print(spectra_files)
    # Keep only this node's stripe of the sorted file list.
    # NOTE(review): "total_paritions" (sic) mirrors the upstream JSON key --
    # do not "fix" the spelling here without changing the producer.
    spectra_files = spectra_files[parallel_json["node_partition"]::parallel_json["total_paritions"]]
    print(spectra_files)

    temp_folder = "temp"
    try:
        os.mkdir(temp_folder)
    except OSError:
        # Folder probably already exists; proceed best-effort.
        print("folder error")

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except OSError:
        print("folder error")

    # Batch the files into groups of 5 per search job.
    list_of_spectrumfiles = chunks(spectra_files, 5)
    parameter_list = []
    for spectrum_files_chunk in list_of_spectrumfiles:
        param_dict = {}
        param_dict["spectra_files"] = spectrum_files_chunk
        param_dict["temp_folder"] = temp_folder
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        param_dict["params_object"] = params_object
        param_dict["library_files"] = library_files
        parameter_list.append(param_dict)

    #for param_dict in parameter_list:
    #    search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(search_wrapper, parameter_list, 5)

    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
        full_result_list += result_list

    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["SpectrumFile"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    ming_fileio_library.write_list_dict_table_data(
        full_result_list,
        os.path.join(args.result_folder, str(uuid.uuid4()) + ".tsv"))
def main():
    """Write group-mapping and attribute-mapping files, preferring (in order):
    a metadata file, a legacy group/attribute mapping file, or just the
    default G1-G6 groupings implied by the mangled input filenames (the
    defaults are always written first)."""
    parser = argparse.ArgumentParser(description='Group Mapping from input, defaults and metadata file')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('groupmapping_folder', help='groupmapping_folder')
    parser.add_argument('attributemapping_folder', help='attributemapping_folder')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_groupmapping_file', help='output_groupmapping_file')
    parser.add_argument('output_attributemapping_file', help='output_attributemapping_file')
    parser.add_argument('inputspectrafolder', help='inputspectrafolder')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.proteosafe_parameters))
    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_obj)
    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_obj)

    print(reverse_file_mangling.keys())

    file_path_prefix = args.inputspectrafolder

    output_group_file = open(args.output_groupmapping_file, "w")
    output_attribute_file = open(args.output_attributemapping_file, "w")

    """ Writing Default Grouping to output file """
    # Default group -> mangled-name prefix.
    # NOTE(review): G1 matches "spec-" here while sibling scripts match
    # "specone-" -- confirm which prefix this workflow actually mangles with.
    group_prefixes = [
        ("G1", "spec-"),
        ("G2", "spectwo-"),
        ("G3", "specthree-"),
        ("G4", "specfour-"),
        ("G5", "specfive-"),
        ("G6", "specsix-"),
    ]
    default_groupings = {group: [] for group, _ in group_prefixes}
    for mangled_name in mangled_file_mapping.keys():
        for group, prefix in group_prefixes:
            if mangled_name.find(prefix) != -1:
                default_groupings[group].append(mangled_name.rstrip())

    # Emit "GROUP_<name>=path1;path2;..." (empty groups yield "GROUP_<name>=").
    for default_group_key in default_groupings.keys():
        default_group_string = "GROUP_" + default_group_key + "=" + ";".join(
            os.path.join(file_path_prefix, mangled_name)
            for mangled_name in default_groupings[default_group_key])
        output_group_file.write(default_group_string + "\n")

    """Determining output whether to use group mapping file or metadata file"""
    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)
    groupmapping_files_in_folder = ming_fileio_library.list_files_in_dir(args.groupmapping_folder)
    attributemapping_files_in_folder = ming_fileio_library.list_files_in_dir(args.attributemapping_folder)

    if len(metadata_files_in_folder) > 1:
        print("Too many metafile inputted")
        exit(1)

    if len(metadata_files_in_folder) == 1:
        # Using metadata file
        row_count, table_data = ming_fileio_library.parse_table_with_headers(metadata_files_in_folder[0])
        if not "filename" in table_data:
            print(
                "Missing 'filename' header in metadata file. Please specify the file name that goes along with each piece of metadata with the header: filename"
            )
            exit(1)

        attributes_to_groups_mapping = defaultdict(set)
        group_to_files_mapping = defaultdict(list)
        for i in range(row_count):
            filename = table_data["filename"][i]
            basename_filename = os.path.basename(filename).rstrip()
            print(basename_filename, len(reverse_file_mangling.keys()))
            if basename_filename in reverse_file_mangling:
                mangled_name = reverse_file_mangling[basename_filename]
                # Every ATTRIBUTE_* column value becomes a group for this file.
                for key in table_data:
                    if key.find("ATTRIBUTE_") != -1:
                        group_name = table_data[key][i]
                        if len(group_name) < 1:
                            continue
                        group_to_files_mapping[group_name].append(os.path.join(file_path_prefix, mangled_name))
                        attributes_to_groups_mapping[key.replace("ATTRIBUTE_", "")].add(group_name)
            else:
                # Filename is not part of sample set
                print(basename_filename, "missing")
                continue

        for group_name in group_to_files_mapping:
            group_string = "GROUP_" + group_name + "=" + ";".join(group_to_files_mapping[group_name])
            output_group_file.write(group_string + "\n")

        for attribute_name in attributes_to_groups_mapping:
            attribute_string = attribute_name + "=" + ";".join(list(attributes_to_groups_mapping[attribute_name]))
            output_attribute_file.write(attribute_string + "\n")

        exit(0)

    """Falling back on old group mapping file"""
    if len(groupmapping_files_in_folder) > 1 or len(attributemapping_files_in_folder) > 1:
        print("Too many group/attribute mappings inputted")
        exit(1)

    if len(groupmapping_files_in_folder) == 1:
        for line in open(groupmapping_files_in_folder[0], errors='ignore'):
            splits = line.rstrip().split("=")
            if len(splits) < 2:
                continue
            group_name = splits[0]
            group_files = []
            for filename in splits[1].split(";"):
                if os.path.basename(filename) in reverse_file_mangling:
                    mangled_name = reverse_file_mangling[os.path.basename(filename)]
                    group_files.append(os.path.join(file_path_prefix, mangled_name))
            group_string = group_name + "=" + ";".join(group_files)
            output_group_file.write(group_string + "\n")

    if len(attributemapping_files_in_folder) == 1:
        # Attribute mapping file passes through unchanged.
        for line in open(attributemapping_files_in_folder[0]):
            output_attribute_file.write(line)
def main():
    """Build QIIME2 metadata + manifest tables from GNPS inputs (with default
    G1-G6 grouping) and call the remote processing server for the feature
    table (.qza) and Emperor plot (.qzv)."""
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('param_xml', help='metadata_folder')
    parser.add_argument('cluster_buckets', help='cluster_buckets')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_folder', help='output_folder')
    args = parser.parse_args()

    param_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml, "r"))

    if param_object["CREATE_CLUSTER_BUCKETS"][0] == "0":
        print("Do not do things")
        exit(0)

    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_object)

    """Reading Metadata File"""
    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)
    object_list = []
    if len(metadata_files_in_folder) != 1:
        # No usable metadata file: one row per input spectrum file.
        for real_name in reverse_file_mangling:
            mangled_name = reverse_file_mangling[real_name]
            if mangled_name.find("spec") == -1:
                continue
            object_list.append({"filename": real_name})
    else:
        object_list_temp = ming_fileio_library.parse_table_with_headers_object_list(metadata_files_in_folder[0])
        #object_list_temp = pd.read_csv(metadata_files_in_folder[0], sep="\t")
        object_list = []
        for metadata_object in object_list_temp:
            if len(metadata_object["filename"]) > 1:
                object_list.append(metadata_object)

        # Adding all files, if analyzed file is not in list
        for real_name in reverse_file_mangling:
            mangled_name = reverse_file_mangling[real_name]
            if mangled_name.find("spec") == -1:
                continue
            found = False
            for metadata_object in object_list:
                if os.path.basename(real_name) == metadata_object["filename"]:
                    found = True
                    break
            if found is False:
                object_list.append({"filename": real_name})

    # Writing headers
    header_list = ["#SampleID", "BarcodeSequence", "LinkerPrimerSequence"]
    for key in object_list[0]:
        if not key in header_list:
            header_list.append(key)
    header_list.append("ATTRIBUTE_GNPSDefaultGroup")

    for metadata_object in object_list:
        # Fill in required QIIME columns when metadata didn't supply them.
        # (A dead nested re-assignment branch that could never execute was
        # removed; the filename fallback below is the only live path.)
        if not "#SampleID" in metadata_object:
            #Stripping off all non-alphanumeric characters
            #metadata_object["#SampleID"] = ''.join(ch for ch in metadata_object["filename"] if ch.isalnum())
            metadata_object["#SampleID"] = metadata_object["filename"]
        if not "Description" in metadata_object:
            metadata_object["Description"] = "LoremIpsum"
        if not "BarcodeSequence" in metadata_object:
            metadata_object["BarcodeSequence"] = "GATACA"
        if not "LinkerPrimerSequence" in metadata_object:
            metadata_object["LinkerPrimerSequence"] = "GATACA"

        # Adding default grouping information
        try:
            mangled_name = reverse_file_mangling[metadata_object["filename"]]
            if mangled_name.find("spec-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G1"
            elif mangled_name.find("spectwo-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G2"
            elif mangled_name.find("specthree-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G3"
            elif mangled_name.find("specfour-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G4"
            elif mangled_name.find("specfive-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G5"
            elif mangled_name.find("specsix-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G6"
        except Exception:
            # Filename not in the reverse mangling map.
            print(metadata_object["filename"], "Not Mapped")
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "Not Mapped"

    output_metadata_filename = os.path.join(args.output_folder, "qiime2_metadata.tsv")
    output_manifest_filename = os.path.join(args.output_folder, "qiime2_manifest.tsv")

    # Prefer an explicit sample_name column as the SampleID when present.
    for metadatum in object_list:
        if "sample_name" in metadatum:
            if len(metadatum["sample_name"]) > 1:
                metadatum["#SampleID"] = metadatum["sample_name"]

    #Removing metadata filenames that are not in the actual data
    #analysis_files =
    metadata_df = pd.DataFrame(object_list)
    metadata_df.to_csv(output_metadata_filename, index=False, sep="\t", columns=header_list)

    """Outputting Manifest Filename"""
    manifest_df = pd.DataFrame()
    manifest_df["sample_name"] = metadata_df["#SampleID"]
    manifest_df["filepath"] = metadata_df["filename"]
    manifest_df.to_csv(output_manifest_filename, index=False, sep=",")

    """Calling remote server to do the calculation"""
    SERVER_BASE = "http://dorresteinappshub.ucsd.edu:5024"

    files = {'manifest': open(output_manifest_filename, 'r'),
             'metadata': open(output_metadata_filename, 'r'),
             'bucket': open(args.cluster_buckets, 'r')}
    r_post = requests.post(SERVER_BASE + "/processclassic", files=files)
    response_dict = r_post.json()

    with open(os.path.join(args.output_folder, "qiime2_table.qza"), 'wb') as f:
        r = requests.get(SERVER_BASE + response_dict["table_qza"], stream=True)
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)

    with open(os.path.join(args.output_folder, "qiime2_emperor.qzv"), 'wb') as f:
        r = requests.get(SERVER_BASE + response_dict["emperor_qzv"], stream=True)
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)
def main():
    """Run msaccess instrument summaries over all spectrum files in parallel,
    merge the per-file results, attach the full CCMS path for each row, and
    write one result table."""
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('workflow_parameters', help='output folder for parameters')
    parser.add_argument('result_file', help='output folder for parameters')
    parser.add_argument('msaccess_binary', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)
    spectra_files.sort()

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except OSError:
        # Folder probably already exists; proceed best-effort.
        print("folder error")

    parameter_list = []
    for spectrum_file in spectra_files:
        param_dict = {}
        param_dict["spectrum_file"] = spectrum_file
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        parameter_list.append(param_dict)

    #for param_dict in parameter_list:
    #    search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(summary_wrapper, parameter_list, 10)

    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        try:
            result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
            for result in result_list:
                output_dict = {}
                output_dict["Filename"] = result["Filename"]
                output_dict["Vendor"] = result["Vendor"]
                output_dict["Model"] = result["Model"]
                output_dict["MS1s"] = result["MS1s"]
                output_dict["MS2s"] = result["MS2s"]
                full_result_list.append(output_dict)
        except Exception:
            # Best-effort merge: skip unparseable or incomplete result files.
            #raise
            print("Error", input_file)

    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["Filename"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    ming_fileio_library.write_list_dict_table_data(full_result_list, args.result_file)
def main():
    """Write per-node .params files for the library search: for each
    parallelism index i, one standard params file (analog search forced off)
    and one analog-search params file."""
    parser = argparse.ArgumentParser(description='Create parallel parameters')
    parser.add_argument('library_folder', help='Input mgf file to network')
    parser.add_argument('workflow_parameters', help='proteosafe xml parameters')
    parser.add_argument('parameters_output_folder', help='output folder for parameters')
    parser.add_argument('parameters_analog_output_folder', help='output folder for analog parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)

    for i in range(args.parallelism):
        output_parameter_file = open(os.path.join(args.parameters_output_folder, str(i) + ".params"), "w")
        output_analog_parameter_file = open(os.path.join(args.parameters_analog_output_folder, str(i) + ".params"), "w")

        #Search Criteria
        output_parameter_file.write("MIN_MATCHED_PEAKS_SEARCH=%s\n" % (params_object["MIN_MATCHED_PEAKS_SEARCH"][0]))
        output_parameter_file.write("TOP_K_RESULTS=%s\n" % (params_object["TOP_K_RESULTS"][0]))
        output_parameter_file.write("search_peak_tolerance=%s\n" % (params_object["tolerance.Ion_tolerance"][0]))
        output_parameter_file.write("search_parentmass_tolerance=%s\n" % (params_object["tolerance.PM_tolerance"][0]))
        # Standard search always disables analog matching.
        output_parameter_file.write("ANALOG_SEARCH=%s\n" % ("0"))
        output_parameter_file.write("MAX_SHIFT_MASS=%s\n" % (params_object["MAX_SHIFT_MASS"][0]))

        output_analog_parameter_file.write("MIN_MATCHED_PEAKS_SEARCH=%s\n" % (params_object["MIN_MATCHED_PEAKS_SEARCH"][0]))
        output_analog_parameter_file.write("TOP_K_RESULTS=%s\n" % (params_object["TOP_K_RESULTS"][0]))
        output_analog_parameter_file.write("search_peak_tolerance=%s\n" % (params_object["tolerance.Ion_tolerance"][0]))
        output_analog_parameter_file.write("search_parentmass_tolerance=%s\n" % (params_object["tolerance.PM_tolerance"][0]))
        output_analog_parameter_file.write("ANALOG_SEARCH=%s\n" % (params_object["ANALOG_SEARCH"][0]))
        output_analog_parameter_file.write("MAX_SHIFT_MASS=%s\n" % (params_object["MAX_SHIFT_MASS"][0]))

        #Filtering Criteria
        output_parameter_file.write("FILTER_PRECURSOR_WINDOW=%s\n" % (params_object["FILTER_PRECURSOR_WINDOW"][0]))
        output_parameter_file.write("MIN_PEAK_INT=%s\n" % (params_object["MIN_PEAK_INT"][0]))
        output_parameter_file.write("WINDOW_FILTER=%s\n" % (params_object["WINDOW_FILTER"][0]))
        output_parameter_file.write("FILTER_LIBRARY=%s\n" % (params_object["FILTER_LIBRARY"][0]))

        output_analog_parameter_file.write("FILTER_PRECURSOR_WINDOW=%s\n" % (params_object["FILTER_PRECURSOR_WINDOW"][0]))
        output_analog_parameter_file.write("MIN_PEAK_INT=%s\n" % (params_object["MIN_PEAK_INT"][0]))
        output_analog_parameter_file.write("WINDOW_FILTER=%s\n" % (params_object["WINDOW_FILTER"][0]))
        output_analog_parameter_file.write("FILTER_LIBRARY=%s\n" % (params_object["FILTER_LIBRARY"][0]))

        #Scoring Criteria
        # MIN_MATCHED_PEAKS_SEARCH was formerly re-written here with the
        # identical value already emitted above; the redundant line is dropped
        # (key/value content of the file is unchanged).
        output_parameter_file.write("SCORE_THRESHOLD=%s\n" % (params_object["SCORE_THRESHOLD"][0]))
        output_analog_parameter_file.write("SCORE_THRESHOLD=%s\n" % (params_object["SCORE_THRESHOLD"][0]))

        #Parallelism
        output_parameter_file.write("NODEIDX=%d\n" % (i))
        output_parameter_file.write("NODECOUNT=%d\n" % (args.parallelism))
        output_analog_parameter_file.write("NODEIDX=%d\n" % (i))
        output_analog_parameter_file.write("NODECOUNT=%d\n" % (args.parallelism))

        #Search Library
        output_parameter_file.write("EXISTING_LIBRARY_MGF=%s\n" % (" ".join(library_files)))
        output_analog_parameter_file.write("EXISTING_LIBRARY_MGF=%s\n" % (" ".join(library_files)))

        output_parameter_file.close()
        output_analog_parameter_file.close()
def main():
    """Annotate a metadata table with the GNPS default group (G1-G6) derived
    from each input file's mangled-name prefix; add rows for input files not
    present in the metadata."""
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_metadata_file', help='output_metadata_file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.proteosafe_parameters))
    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_obj)

    # Mangled-name prefix -> default group (replaces six duplicated branches).
    prefix_to_group = [
        ("specone-", "G1"),
        ("spectwo-", "G2"),
        ("specthree-", "G3"),
        ("specfour-", "G4"),
        ("specfive-", "G5"),
        ("specsix-", "G6"),
    ]

    default_group_mapping = defaultdict(list)
    file_to_group_mapping = {}
    for mangled_name in mangled_file_mapping:
        for prefix, group in prefix_to_group:
            if mangled_name.find(prefix) != -1:
                default_group_mapping[group].append(mangled_file_mapping[mangled_name])
                file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = group

    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)

    row_count = 0
    table_data = defaultdict(list)
    if len(metadata_files_in_folder) == 1:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(metadata_files_in_folder[0])

    print(table_data)
    for key in table_data:
        print(key, len(table_data[key]))

    for i in range(row_count):
        print(i)
        filename = table_data["filename"][i]
        # NOTE(review): skipping short filenames leaves ATTRIBUTE_DefaultGroup
        # shorter than the other columns -- confirm the writer tolerates it.
        if len(filename) < 2:
            continue

        print(filename, filename[0], filename[-1])

        # Strip surrounding double quotes some metadata files carry.
        if filename[0] == "\"":
            filename = filename[1:]
        if filename[-1] == "\"":
            filename = filename[:-1]
        table_data["filename"][i] = filename

        basename_filename = os.path.basename(filename)
        group_name = "NoDefaultGroup"
        if basename_filename in file_to_group_mapping:
            group_name = file_to_group_mapping[basename_filename]
        table_data["ATTRIBUTE_DefaultGroup"].append(group_name)

    # Add rows for input files that never appeared in the metadata table.
    for input_filename in file_to_group_mapping:
        if input_filename in table_data["filename"]:
            continue
        for key in table_data:
            if key != "ATTRIBUTE_DefaultGroup" and key != "filename":
                table_data[key].append("N/A")
        table_data["ATTRIBUTE_DefaultGroup"].append(file_to_group_mapping[input_filename])
        table_data["filename"].append(input_filename)

    ming_fileio_library.write_dictionary_table_data(table_data, args.output_metadata_file)
def main():
    """Build QIIME2 metadata + manifest tables from GNPS inputs (with default
    G1-G6 grouping) and call the remote server for the feature table (.qza)
    and Emperor visualization (.qzv)."""
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('param_xml', help='metadata_folder')
    parser.add_argument('cluster_buckets', help='cluster_buckets')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_folder', help='output_folder')
    args = parser.parse_args()

    param_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml, "r"))

    if param_object["CREATE_CLUSTER_BUCKETS"][0] == "0":
        print("Do not do things")
        exit(0)

    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_object)

    """Reading Metadata File"""
    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)
    object_list = []
    if len(metadata_files_in_folder) != 1:
        # No metadata file: one row per input spectrum file.
        for real_name in reverse_file_mangling:
            mangled_name = reverse_file_mangling[real_name]
            if mangled_name.find("spec") == -1:
                continue
            object_list.append({"filename": real_name})
    else:
        print(metadata_files_in_folder[0])
        object_list = ming_fileio_library.parse_table_with_headers_object_list(metadata_files_in_folder[0])
        if len(object_list) == 0:
            # Metadata file parsed empty; same fallback as the no-file case.
            for real_name in reverse_file_mangling:
                mangled_name = reverse_file_mangling[real_name]
                if mangled_name.find("spec") == -1:
                    continue
                object_list.append({"filename": real_name})

    # Writing headers
    header_list = ["#SampleID", "BarcodeSequence", "LinkerPrimerSequence"]
    for key in object_list[0]:
        if not key in header_list:
            header_list.append(key)
    header_list.append("ATTRIBUTE_GNPSDefaultGroup")

    for metadata_object in object_list:
        # Fill in required QIIME columns when metadata didn't supply them.
        # (A dead nested re-assignment branch that could never execute was
        # removed; the alnum-stripped filename is the only live path.)
        if not "#SampleID" in metadata_object:
            #Stripping off all non-alphanumeric characters
            metadata_object["#SampleID"] = ''.join(ch for ch in metadata_object["filename"] if ch.isalnum())
        if not "Description" in metadata_object:
            metadata_object["Description"] = "LoremIpsum"
        if not "BarcodeSequence" in metadata_object:
            metadata_object["BarcodeSequence"] = "GATACA"
        if not "LinkerPrimerSequence" in metadata_object:
            metadata_object["LinkerPrimerSequence"] = "GATACA"

        try:
            mangled_name = reverse_file_mangling[metadata_object["filename"]]
            if mangled_name.find("spec-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G1"
            elif mangled_name.find("spectwo-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G2"
            elif mangled_name.find("specthree-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G3"
            elif mangled_name.find("specfour-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G4"
            elif mangled_name.find("specfive-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G5"
            elif mangled_name.find("specsix-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G6"
        except Exception:
            # Filename not in the reverse mangling map.
            print(metadata_object["filename"], "Not Mapped")
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "Not Mapped"

    output_metadata_filename = os.path.join(args.output_folder, "qiime2_metadata.tsv")
    output_manifest_filename = os.path.join(args.output_folder, "qiime2_manifest.tsv")

    # Prefer an explicit sample_name column as the SampleID when present.
    for metadatum in object_list:
        if "sample_name" in metadatum:
            if len(metadatum["sample_name"]) > 1:
                metadatum["#SampleID"] = metadatum["sample_name"]

    metadata_df = pd.DataFrame(object_list)
    metadata_df.to_csv(output_metadata_filename, index=False, sep="\t", columns=header_list)

    """Outputting Manifest Filename"""
    manifest_df = pd.DataFrame()
    manifest_df["sample_name"] = metadata_df["#SampleID"]
    manifest_df["filepath"] = metadata_df["filename"]
    manifest_df.to_csv(output_manifest_filename, index=False, sep=",")

    """Calling remote server to do the calculation"""
    SERVER_BASE = "http://dorresteinappshub.ucsd.edu:5024"
    #SERVER_BASE = "http://mingwangbeta.ucsd.edu:5024"

    files = {'manifest': open(output_manifest_filename, 'r'),
             'metadata': open(output_metadata_filename, 'r'),
             'bucket': open(args.cluster_buckets, 'r')}
    r_post = requests.post(SERVER_BASE + "/processclassic", files=files)
    response_dict = r_post.json()

    with open(os.path.join(args.output_folder, "qiime2_table.qza"), 'wb') as f:
        r = requests.get(SERVER_BASE + response_dict["table_qza"], stream=True)
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)

    with open(os.path.join(args.output_folder, "qiime2_emperor.qzv"), 'wb') as f:
        r = requests.get(SERVER_BASE + response_dict["emperor_qzv"], stream=True)
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)
def main():
    """Build a qiime2-compatible metadata table from the workflow's metadata file.

    Reads the ProteoSAFe param XML and the (optional) uploaded metadata file,
    writes an HTML redirect page pointing at the emperor viewer, then emits a
    metadata table with #SampleID / BarcodeSequence / LinkerPrimerSequence
    headers plus an ATTRIBUTE_GNPSDefaultGroup column derived from the mangled
    upload names (spec- .. specsix- => G1 .. G6).
    """
    parser = argparse.ArgumentParser(description='Modifying script')
    parser.add_argument('param_xml', help='metadata_folder')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_metadata_table', help='output_metadata_table')
    parser.add_argument('output_view_emporer', help='output_metadata_table')
    args = parser.parse_args()

    param_object = ming_proteosafe_library.parse_xml_file(
        open(args.param_xml, "r"))

    # Outputting html redirect to the emperor viewer for this task
    from urllib.parse import urlencode, quote_plus
    parameters_for_qiime = {
        'biom': 'http://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=biom_output/networking_quant.biom' % (param_object["task"][0]),
        'metadata': 'http://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=metadata_for_qiime/metadata_for_qiime.txt' % (param_object["task"][0])
    }

    output_html_file = open(args.output_view_emporer, "w")
    output_html_file.write("<script>\n")
    output_html_file.write(
        'window.location.replace("https://mingwangbeta.ucsd.edu/emperor?%s")\n'
        % urlencode(parameters_for_qiime))
    output_html_file.write("</script>\n")
    output_html_file.close()

    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(
        param_object)

    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(
        args.metadata_folder)

    object_list = []
    if len(metadata_files_in_folder) != 1:
        # No (or too many) metadata files: synthesize rows from the spec* uploads
        for real_name in reverse_file_mangling:
            mangled_name = reverse_file_mangling[real_name]
            if mangled_name.find("spec") == -1:
                continue
            object_list.append({"filename": real_name})
        #open(args.output_metadata_table, "w").write("NO OUTPUT")
        #open(args.output_view_emporer, "w").write("Please Include Metadata File")
        #exit(0)
    else:
        object_list = ming_fileio_library.parse_table_with_headers_object_list(
            metadata_files_in_folder[0])
        if len(object_list) == 0:
            # Metadata file was empty: fall back to synthesizing rows as above
            for real_name in reverse_file_mangling:
                mangled_name = reverse_file_mangling[real_name]
                if mangled_name.find("spec") == -1:
                    continue
                object_list.append({"filename": real_name})
            #open(args.output_metadata_table, "w").write("NO OUTPUT")
            #open(args.output_view_emporer, "w").write("Please Include Non Empty Metadata File")
            #exit(0)

    # FIX: originally object_list[0] raised IndexError when there were no
    # metadata rows and no spec* uploads; bail out cleanly instead.
    if len(object_list) == 0:
        print("No metadata rows and no spec input files found, nothing to write")
        exit(1)

    # Writing headers: the three required qiime columns first, then any
    # extra metadata columns, then the derived default-group column.
    header_list = ["#SampleID", "BarcodeSequence", "LinkerPrimerSequence"]
    for key in object_list[0]:
        if not key in header_list:
            header_list.append(key)
    header_list.append("ATTRIBUTE_GNPSDefaultGroup")

    for metadata_object in object_list:
        if not "#SampleID" in metadata_object:
            metadata_object[
                "#SampleID"] = ming_fileio_library.get_filename_without_extension(
                    metadata_object["filename"])
        # GATACA is a placeholder sequence for the required qiime columns
        if not "BarcodeSequence" in metadata_object:
            metadata_object["BarcodeSequence"] = "GATACA"
        if not "LinkerPrimerSequence" in metadata_object:
            metadata_object["LinkerPrimerSequence"] = "GATACA"
        # FIX: this lookup raised KeyError for filenames absent from the
        # mangling map; handle it like the sibling script does ("Not Mapped").
        try:
            mangled_name = reverse_file_mangling[metadata_object["filename"]]
        except KeyError:
            print(metadata_object["filename"], "Not Mapped")
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "Not Mapped"
            continue
        if mangled_name.find("spec-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G1"
        elif mangled_name.find("spectwo-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G2"
        elif mangled_name.find("specthree-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G3"
        elif mangled_name.find("specfour-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G4"
        elif mangled_name.find("specfive-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G5"
        elif mangled_name.find("specsix-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G6"

    ming_fileio_library.write_list_dict_table_data(object_list,
                                                   args.output_metadata_table,
                                                   header_list)
def main():
    """Run one partition of a parallel spectral library search and merge results.

    Selects this node's slice of the sorted spectra files, chunks them, runs
    search_wrapper over the chunks in parallel, then merges all per-chunk
    result tables and rewrites SpectrumFile mangled names to full CCMS paths.
    """
    parser = argparse.ArgumentParser(
        description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('json_parameters', help='proteosafe xml parameters')
    parser.add_argument('workflow_parameters',
                        help='output folder for parameters')
    parser.add_argument('library_folder', help='output folder for parameters')
    parser.add_argument('result_folder', help='output folder for parameters')
    parser.add_argument('convert_binary', help='output folder for parameters')
    parser.add_argument('librarysearch_binary',
                        help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int,
                        help='Parallelism')
    args = parser.parse_args()

    parallel_json = json.loads(open(args.json_parameters).read())

    params_object = ming_proteosafe_library.parse_xml_file(
        open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        params_object)

    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)
    spectra_files.sort()
    print(spectra_files)
    # NOTE: "total_paritions" (sic) is the key actually written by the
    # upstream partitioner — do not "fix" the spelling here.
    spectra_files = spectra_files[
        parallel_json["node_partition"]::parallel_json["total_paritions"]]
    print(spectra_files)

    # FIX: the original used bare "try: os.mkdir(...) except: print(...)",
    # which swallowed every error; makedirs(exist_ok=True) only tolerates
    # the folder already existing and surfaces real failures.
    temp_folder = "temp"
    os.makedirs(temp_folder, exist_ok=True)

    tempresults_folder = "tempresults"
    os.makedirs(tempresults_folder, exist_ok=True)

    # Batch spectra files five at a time; each batch is one parallel job.
    list_of_spectrumfiles = chunks(spectra_files, 5)
    parameter_list = []
    for spectrum_files_chunk in list_of_spectrumfiles:
        param_dict = {}
        param_dict["spectra_files"] = spectrum_files_chunk
        param_dict["temp_folder"] = temp_folder
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        param_dict["params_object"] = params_object
        param_dict["library_files"] = library_files
        parameter_list.append(param_dict)

    #for param_dict in parameter_list:
    #    search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(search_wrapper, parameter_list, 5)

    # Merging per-chunk result files and adding the full CCMS path
    all_result_files = ming_fileio_library.list_files_in_dir(
        tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        result_list = ming_fileio_library.parse_table_with_headers_object_list(
            input_file)
        full_result_list += result_list

    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["SpectrumFile"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    # Random output name so concurrent partitions never collide
    ming_fileio_library.write_list_dict_table_data(
        full_result_list,
        os.path.join(args.result_folder, str(uuid.uuid4()) + ".tsv"))
def main():
    """Write molecular-networking group and attribute mapping files.

    Always emits the default G1..G6 groupings derived from the mangled upload
    names, then either derives groups/attributes from a metadata file
    (preferred, exits afterwards) or falls back to user-supplied group and
    attribute mapping files.
    """
    parser = argparse.ArgumentParser(
        description='Group Mapping from input, defaults and metadata file')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('groupmapping_folder', help='groupmapping_folder')
    parser.add_argument('attributemapping_folder', help='attributemapping_folder')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_groupmapping_file', help='output_groupmapping_file')
    parser.add_argument('output_attributemapping_file', help='output_attributemapping_file')
    parser.add_argument('inputspectrafolder', help='inputspectrafolder')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(
        open(args.proteosafe_parameters))
    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_obj)
    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_obj)

    file_path_prefix = args.inputspectrafolder

    output_group_file = open(args.output_groupmapping_file, "w")
    output_attribute_file = open(args.output_attributemapping_file, "w")

    # Writing Default Grouping to output file: the mangled prefix encodes
    # which upload slot (G1..G6) each file came from.
    default_groupings = {'G1': [], 'G2': [], 'G3': [], 'G4': [], 'G5': [], 'G6': []}
    for mangled_name in mangled_file_mapping.keys():
        if mangled_name.find("spec-") != -1:
            default_groupings['G1'].append(mangled_name.rstrip())
        if mangled_name.find("spectwo-") != -1:
            default_groupings['G2'].append(mangled_name.rstrip())
        if mangled_name.find("specthree-") != -1:
            default_groupings['G3'].append(mangled_name.rstrip())
        if mangled_name.find("specfour-") != -1:
            default_groupings['G4'].append(mangled_name.rstrip())
        if mangled_name.find("specfive-") != -1:
            default_groupings['G5'].append(mangled_name.rstrip())
        if mangled_name.find("specsix-") != -1:
            default_groupings['G6'].append(mangled_name.rstrip())

    for default_group_key in default_groupings.keys():
        default_group_string = ""
        default_group_string += "GROUP_" + default_group_key + "="
        for mangled_name in default_groupings[default_group_key]:
            default_group_string += os.path.join(file_path_prefix, mangled_name) + ";"
        if len(default_groupings[default_group_key]) > 0:
            # drop trailing ";"
            default_group_string = default_group_string[:-1]
        output_group_file.write(default_group_string + "\n")

    # Determining whether to use group mapping file or metadata file
    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)
    groupmapping_files_in_folder = ming_fileio_library.list_files_in_dir(args.groupmapping_folder)
    attributemapping_files_in_folder = ming_fileio_library.list_files_in_dir(args.attributemapping_folder)

    if len(metadata_files_in_folder) > 1:
        print("Too many metafile inputted")
        exit(1)

    if len(metadata_files_in_folder) == 1:
        # Using metadata file
        row_count, table_data = ming_fileio_library.parse_table_with_headers(
            metadata_files_in_folder[0])
        if not "filename" in table_data:
            print("Missing 'filename' header in metadata file. Please specify the file name that goes along with each piece of metadata with the header: filename")
            exit(1)

        attributes_to_groups_mapping = defaultdict(set)
        group_to_files_mapping = defaultdict(list)
        for i in range(row_count):
            filename = table_data["filename"][i]
            basename_filename = os.path.basename(filename).rstrip()
            if basename_filename in reverse_file_mangling:
                mangled_name = reverse_file_mangling[basename_filename]
                # Every ATTRIBUTE_* column value becomes a group; blank
                # values are skipped.
                for key in table_data:
                    if key.find("ATTRIBUTE_") != -1:
                        group_name = table_data[key][i]
                        if len(group_name) < 1:
                            continue
                        group_to_files_mapping[group_name].append(
                            os.path.join(file_path_prefix, mangled_name))
                        attributes_to_groups_mapping[
                            key.replace("ATTRIBUTE_", "")].add(group_name)
            else:
                # Filename is not part of sample set
                continue

        for group_name in group_to_files_mapping:
            group_string = "GROUP_" + group_name + "=" + ";".join(group_to_files_mapping[group_name])
            output_group_file.write(group_string + "\n")

        for attribute_name in attributes_to_groups_mapping:
            attribute_string = attribute_name + "=" + ";".join(list(attributes_to_groups_mapping[attribute_name]))
            output_attribute_file.write(attribute_string + "\n")

        # FIX: originally exit(0) was reached with both output files still
        # open; close them so buffered writes are flushed deterministically.
        output_group_file.close()
        output_attribute_file.close()
        exit(0)

    # Falling back on old group mapping file
    if len(groupmapping_files_in_folder) > 1 or len(attributemapping_files_in_folder) > 1:
        print("Too many group/attribute mappings inputted")
        exit(1)

    if len(groupmapping_files_in_folder) == 1:
        for line in open(groupmapping_files_in_folder[0], errors='ignore'):
            splits = line.rstrip().split("=")
            if len(splits) < 2:
                continue
            group_name = splits[0]
            group_files = []
            for filename in splits[1].split(";"):
                # Only keep files that are actually part of this task's inputs
                if os.path.basename(filename) in reverse_file_mangling:
                    mangled_name = reverse_file_mangling[os.path.basename(filename)]
                    group_files.append(os.path.join(file_path_prefix, mangled_name))
            group_string = group_name + "=" + ";".join(group_files)
            output_group_file.write(group_string + "\n")

    if len(attributemapping_files_in_folder) == 1:
        # Attribute mapping is passed through verbatim
        for line in open(attributemapping_files_in_folder[0]):
            output_attribute_file.write(line)

    # FIX: close outputs at the end of the fallback path as well.
    output_group_file.close()
    output_attribute_file.close()