import os
import sys
import csv
import json
import hashlib
import argparse
from collections import defaultdict
from urllib.parse import urlencode

import ming_fileio_library
import ming_proteosafe_library
import ming_psm_library


def get_stripped_extenstion_file_mapping(params):
    # Map each mangled filename, minus its extension, back to the original filename.
    all_mappings = ming_proteosafe_library.get_mangled_file_mapping(params)
    output_mapping = {}
    for mangled_name in all_mappings:
        stripped_name = ming_fileio_library.get_filename_without_extension(mangled_name)
        output_mapping[stripped_name] = all_mappings[mangled_name]
    return output_mapping
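# Illustrative usage sketch (not from the original source): assumes a parsed
# ProteoSAFe params object, and that get_mangled_file_mapping returns something
# like {"spec-00000.mzXML": "f.user/uploads/sampleA.mzXML"} (filenames hypothetical).
#
#     params = ming_proteosafe_library.parse_xml_file(open("params.xml"))
#     stripped = get_stripped_extenstion_file_mapping(params)
#     # stripped["spec-00000"] -> "f.user/uploads/sampleA.mzXML"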
def create_bucket_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, output_filename, metadata_mapping):
    output_file = open(output_filename, "w")
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
    param_object = ming_proteosafe_library.parse_xml_file(open(param_filename, "r"))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_object)

    # Only clusters present in the network summary are included in the bucket table.
    clusters_in_network = set()
    for row in csv.DictReader(open(clusterinfosummary_filename), delimiter='\t'):
        clusters_in_network.add(row["cluster index"])

    cluster_index_to_file_map = {}
    clusters_map = {}
    all_files = {}

    for i in range(line_counts):
        cluster_number = table_data["#ClusterIdx"][i]
        if cluster_number not in clusters_in_network:
            continue

        if cluster_number not in clusters_map:
            clusters_map[cluster_number] = []
            # Initialize every input file's intensity to zero for this cluster.
            cluster_index_to_file_map[cluster_number] = {}
            for mangled_name in mangled_mapping.keys():
                cluster_index_to_file_map[cluster_number][mangled_name] = 0.0

        mangled_filename_only = os.path.basename(table_data["#Filename"][i])
        # Accumulate precursor intensity, flooring each spectrum's contribution at 1.0.
        cluster_index_to_file_map[cluster_number][mangled_filename_only] += max(float(table_data["#PrecIntensity"][i]), 1.0)

        spectrum_info = {"filename": table_data["#Filename"][i], "intensity": table_data["#PrecIntensity"][i]}
        all_files[table_data["#Filename"][i]] = 1
        clusters_map[cluster_number].append(spectrum_info)

    # Header row: one column per spectrum file, renamed via the metadata mapping when available.
    output_header_list = ["#OTU ID"]
    for header in mangled_mapping.keys():
        if header.find("spec") == -1:
            continue
        real_basename = os.path.basename(mangled_mapping[header])
        if real_basename in metadata_mapping:
            output_header_list.append(metadata_mapping[real_basename])
        else:
            output_header_list.append(ming_fileio_library.get_filename_without_extension(real_basename))
    output_file.write("\t".join(output_header_list) + "\n")

    # One row per cluster, carrying the summed intensity for each spectrum file.
    for cluster_idx in cluster_index_to_file_map:
        line_output_list = [str(cluster_idx)]
        for header in mangled_mapping.keys():
            if header.find("spec") == -1:
                continue
            line_output_list.append(str(cluster_index_to_file_map[cluster_idx][header]))
        output_file.write("\t".join(line_output_list) + "\n")

    output_file.close()
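# Illustrative call (all filenames and the metadata mapping are hypothetical
# placeholders, shown only to document the expected argument shapes):
#
#     create_bucket_from_clusterinfo(
#         "clusterinfo.tsv",             # per-spectrum cluster table
#         "params.xml",                  # ProteoSAFe params
#         "clusterinfosummary.tsv",      # defines which clusters are in the network
#         "bucket_table.tsv",            # output: clusters x files intensity matrix
#         {"sampleA.mzXML": "SampleA"})  # optional display names for header columns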
def simple_presence_of_merged_spectra_processing(input_integrals_filename, output_clusterinfo_filename, mangled_mapping):
    # Re-key the mangled mapping by extension-stripped filenames.
    extension_stripped_mangled_mapping = {}
    for key in mangled_mapping:
        without_ext = ming_fileio_library.get_filename_without_extension(key)
        extension_stripped_mangled_mapping[without_ext] = mangled_mapping[key]

    # The first CSV line holds the retention-time headers; the first column is the sample label.
    header_order = open(input_integrals_filename).readline().rstrip().split(",")[1:]
    table_list = ming_fileio_library.parse_table_with_headers_object_list(input_integrals_filename, delimiter=",")

    # Skip the first two rows, which hold other header information.
    table_list = table_list[2:]

    output_dict = defaultdict(list)

    # Flatten each sample row into one output row per retention-time column.
    for result_object in table_list:
        try:
            sample_name = result_object["RTS:"]
        except KeyError:
            sample_name = "unknown"

        scan_number = 0
        for header in header_order:
            scan_number += 1
            abundance = result_object[header]

            output_dict["filename"].append(sample_name)
            output_dict["abundance"].append(abundance)
            output_dict["scan_number"].append(scan_number)
            output_dict["RT"].append(header)

    ming_fileio_library.write_dictionary_table_data(output_dict, output_clusterinfo_filename)
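# Sketch of the transformation under the assumed CSV layout (values hypothetical):
# an input row labeled "sampleA" with retention-time columns 10.1 and 10.2
# becomes one long-format output row per RT column:
#
#     filename   abundance   scan_number   RT
#     sampleA    5.0         1             10.1
#     sampleA    7.0         2             10.2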
def resolve_metadata_filename_to_all_files(filename, dataset_files):
    # Match a metadata filename (extension ignored) against the dataset file list;
    # succeed only when exactly one dataset file matches.
    stripped_extension = ming_fileio_library.get_filename_without_extension(filename)
    acceptable_filenames = ["f." + dataset_filename for dataset_filename in dataset_files if dataset_filename.find(stripped_extension) != -1]

    if len(acceptable_filenames) != 1:
        return None

    return acceptable_filenames[0]
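# Example (hypothetical dataset listing): resolves to the single "f."-prefixed
# match, or None when the match is ambiguous or absent.
#
#     resolve_metadata_filename_to_all_files(
#         "sampleA.mzXML", ["uploads/sampleA.mzXML", "uploads/sampleB.mzXML"])
#     # -> "f.uploads/sampleA.mzXML"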
def main():
    parser = argparse.ArgumentParser(description='Modifying script')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_metadata_table', help='output_metadata_table')
    parser.add_argument('output_view_emporer', help='output_view_emporer')
    args = parser.parse_args()

    param_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml, "r"))

    # Output an HTML page that redirects to the Emperor viewer, pointing it at
    # the biom table and metadata produced by this task.
    parameters_for_qiime = {
        'biom': 'http://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=biom_output/networking_quant.biom' % (param_object["task"][0]),
        'metadata': 'http://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=metadata_for_qiime/metadata_for_qiime.txt' % (param_object["task"][0])
    }

    output_html_file = open(args.output_view_emporer, "w")
    output_html_file.write("<script>\n")
    output_html_file.write('window.location.replace("https://mingwangbeta.ucsd.edu/emperor?%s")\n' % urlencode(parameters_for_qiime))
    output_html_file.write("</script>\n")
    output_html_file.close()

    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_object)
    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)

    object_list = []
    if len(metadata_files_in_folder) != 1:
        # No single metadata file supplied: synthesize one row per spectrum file.
        for real_name in reverse_file_mangling:
            mangled_name = reverse_file_mangling[real_name]
            if mangled_name.find("spec") == -1:
                continue
            object_list.append({"filename": real_name})
    else:
        object_list = ming_fileio_library.parse_table_with_headers_object_list(metadata_files_in_folder[0])
        if len(object_list) == 0:
            # Empty metadata file: fall back to one row per spectrum file.
            for real_name in reverse_file_mangling:
                mangled_name = reverse_file_mangling[real_name]
                if mangled_name.find("spec") == -1:
                    continue
                object_list.append({"filename": real_name})

    # Writing headers
    header_list = ["#SampleID", "BarcodeSequence", "LinkerPrimerSequence"]
    for key in object_list[0]:
        if key not in header_list:
            header_list.append(key)
    header_list.append("ATTRIBUTE_GNPSDefaultGroup")

    for metadata_object in object_list:
        # Fill in QIIME-required columns with placeholders when absent.
        if "#SampleID" not in metadata_object:
            metadata_object["#SampleID"] = ming_fileio_library.get_filename_without_extension(metadata_object["filename"])
        if "BarcodeSequence" not in metadata_object:
            metadata_object["BarcodeSequence"] = "GATACA"
        if "LinkerPrimerSequence" not in metadata_object:
            metadata_object["LinkerPrimerSequence"] = "GATACA"

        # Derive the default group (G1-G6) from the mangled filename prefix.
        mangled_name = reverse_file_mangling[metadata_object["filename"]]
        if mangled_name.find("spec-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G1"
        elif mangled_name.find("spectwo-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G2"
        elif mangled_name.find("specthree-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G3"
        elif mangled_name.find("specfour-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G4"
        elif mangled_name.find("specfive-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G5"
        elif mangled_name.find("specsix-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G6"

    ming_fileio_library.write_list_dict_table_data(object_list, args.output_metadata_table, header_list)
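# For reference, the emitted redirect page looks like this (task id hypothetical,
# query string percent-encoded by urlencode):
#
#     <script>
#     window.location.replace("https://mingwangbeta.ucsd.edu/emperor?biom=http%3A%2F%2Fgnps.ucsd.edu%2F...&metadata=http%3A%2F%2Fgnps.ucsd.edu%2F...")
#     </script>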
def main():
    input_paramxml = sys.argv[1]
    input_tsv_filename = sys.argv[2]
    intermediate_output_folder = sys.argv[3]
    output_file_bins = int(sys.argv[4])

    params_obj = ming_proteosafe_library.parse_xml_file(open(input_paramxml))
    snr_threshold = get_snr_filter(params_obj)

    # Filtering criteria, with permissive defaults when parameters are absent or malformed.
    minimum_explained_intensity = 0.0
    min_number_of_peaks_within_1_percent_of_max = 0.0
    min_signal_peaks = 0.0
    min_number_of_annotated_ions = 0.0
    max_kl_strict_score = 50
    max_ppm_error = 100000000

    try:
        minimum_explained_intensity = float(params_obj["min_explained_intensity"][0])
        min_number_of_peaks_within_1_percent_of_max = float(params_obj["min_number_of_peaks_within_1_percent_of_max"][0])
        min_signal_peaks = float(params_obj["min_signal_peaks"][0])
        min_number_of_annotated_ions = float(params_obj["min_number_of_annotated_ions"][0])
        max_kl_strict_score = float(params_obj["kl_strict_max"][0])
        if max_kl_strict_score == 0:
            max_kl_strict_score = 50
        max_ppm_error = float(params_obj["max_ppm_error"][0])
    except (KeyError, ValueError):
        # Fall back to the permissive defaults for every threshold.
        print("exception parsing filtering parameters, using defaults")
        minimum_explained_intensity = 0.0
        min_number_of_peaks_within_1_percent_of_max = 0.0
        min_signal_peaks = 0.0
        min_number_of_annotated_ions = 0.0
        max_kl_strict_score = 50
        max_ppm_error = 100000000

    # Find the 1% variant point; the naive solution is to take the top-scoring PSM.
    psm_set = ming_psm_library.PSMset("")
    psm_set.load_PSM_tsvfile(input_tsv_filename, load_extra_metadata=True)

    filename_to_psm_dict = group_psms_by_filename(psm_set)

    # All output files; bin the spectra across them starting now.
    output_filename_prefix = os.path.join(
        intermediate_output_folder,
        ming_fileio_library.get_filename_without_extension(os.path.basename(input_tsv_filename)) + "_partition_")

    output_files = {}
    output_files_number_spectra = {}
    for i in range(output_file_bins):
        output_filename = output_filename_prefix + str(i) + ".json"
        output_file = open(output_filename, "w")
        output_file.write("[")
        output_files[i] = output_file
        output_files_number_spectra[i] = 0

    for filename in filename_to_psm_dict:
        extracted_spectra = extract_psms_from_filename(
            filename,
            filename_to_psm_dict[filename],
            snr_threshold,
            minimum_explained_intensity,
            min_signal_peaks,
            min_number_of_peaks_within_1_percent_of_max,
            min_number_of_annotated_ions,
            max_ppm_error)

        for spectrum in extracted_spectra:
            # Route by a stable hash of the annotation so identical annotations
            # always land in the same bin.
            hashed_index = int(hashlib.sha1(spectrum["annotation"].encode('utf-8')).hexdigest(), 16) % output_file_bins
            if output_files_number_spectra[hashed_index] == 0:
                output_files[hashed_index].write(json.dumps(spectrum) + "\n")
            else:
                output_files[hashed_index].write("," + json.dumps(spectrum) + "\n")
            output_files_number_spectra[hashed_index] += 1

    # Close each bin's JSON array.
    for i in range(output_file_bins):
        output_files[i].write("]")
        output_files[i].close()
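# Example CLI invocation (script name and paths hypothetical; argument order
# matches the sys.argv reads above):
#
#     python partition_spectra.py params.xml psms.tsv intermediate/ 4
#
# which writes intermediate/psms_partition_0.json ... psms_partition_3.json,
# each holding a JSON array of the spectra hashed into that bin.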