def get_stripped_extenstion_file_mapping(params):
    """Re-key the mangled file mapping by extension-less mangled name.

    The mapping returned by ``get_mangled_file_mapping(params)`` is copied
    into a new dict whose keys are passed through
    ``ming_fileio_library.get_filename_without_extension``; the values are
    carried over unchanged. If two mangled names collapse to the same
    stripped key, the one iterated last wins.
    """
    mangled_to_value = get_mangled_file_mapping(params)
    return {
        ming_fileio_library.get_filename_without_extension(name): mangled_to_value[name]
        for name in mangled_to_value
    }
def create_bucket_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, output_filename, metadata_mapping):
    """Write a tab-separated table of summed precursor intensity per cluster and file.

    Rows of the cluster-info table (columns ``#ClusterIdx``, ``#Filename`` and
    ``#PrecIntensity``) are accumulated per cluster index and per file base
    name; every row contributes ``max(intensity, 1.0)``. Only clusters whose
    index appears in the ``cluster index`` column of the summary file are
    kept. The output has one ``#OTU ID`` column followed by one column for
    every mangled name that contains ``"spec"``.

    Args:
        cluster_info_filename: table parsed with
            ``ming_fileio_library.parse_table_with_headers``.
        param_filename: XML parameter file from which the mangled file
            mapping is derived.
        clusterinfosummary_filename: tab-separated file with a
            ``cluster index`` column.
        output_filename: path of the table to write.
        metadata_mapping: mapping from the base name of a mangled mapping
            value to the column label to use; when a base name is absent,
            its extension-less form is used as the label instead.

    Raises:
        KeyError: if a kept row refers to a file base name that is not a key
            of the mangled file mapping.
    """
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
    with open(param_filename, "r") as param_file:
        param_object = ming_proteosafe_library.parse_xml_file(param_file)
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_object)

    with open(clusterinfosummary_filename) as summary_file:
        clusters_in_network = {
            row["cluster index"]
            for row in csv.DictReader(summary_file, delimiter='\t')
        }

    cluster_index_to_file_map = {}
    for i in range(line_counts):
        cluster_number = table_data["#ClusterIdx"][i]
        if cluster_number not in clusters_in_network:
            continue

        if cluster_number not in cluster_index_to_file_map:
            # Every known mangled file starts at zero so that each output row
            # has a value for every column.
            cluster_index_to_file_map[cluster_number] = dict.fromkeys(mangled_mapping, 0.0)

        mangled_filename_only = os.path.basename(table_data["#Filename"][i])
        # A spectrum always counts for at least 1.0, even when its recorded
        # precursor intensity is smaller.
        cluster_index_to_file_map[cluster_number][mangled_filename_only] += max(float(table_data["#PrecIntensity"][i]), 1.0)

    # Only mangled names containing "spec" become output columns.
    spec_headers = [header for header in mangled_mapping if "spec" in header]

    output_header_list = ["#OTU ID"]
    for header in spec_headers:
        mapped_basename = os.path.basename(mangled_mapping[header])
        if mapped_basename in metadata_mapping:
            output_header_list.append(metadata_mapping[mapped_basename])
        else:
            output_header_list.append(ming_fileio_library.get_filename_without_extension(mapped_basename))

    # The output file is opened only once all inputs were read successfully,
    # and is closed even if writing fails part-way.
    with open(output_filename, "w") as output_file:
        output_file.write("\t".join(output_header_list) + "\n")
        for cluster_idx, file_intensities in cluster_index_to_file_map.items():
            line_output_list = [str(cluster_idx)]
            line_output_list.extend(str(file_intensities[header]) for header in spec_headers)
            output_file.write("\t".join(line_output_list) + "\n")
# Example #3
# 0
def simple_presence_of_merged_spectra_processing(input_integrals_filename, output_clusterinfo_filename, mangled_mapping):
    """Reshape a comma-separated integrals table into a long-format table.

    For every data row and every column named in the first line of the input
    (the first column excluded), one output record is produced with the
    fields ``filename`` (the row's ``"RTS:"`` value, or ``"unknown"`` when
    that key is absent), ``abundance`` (the cell value), ``scan_number``
    (1-based position of the column) and ``RT`` (the column name). The first
    two parsed rows are skipped as additional header information.

    Args:
        input_integrals_filename: path of the comma-separated input table.
        output_clusterinfo_filename: path handed to
            ``ming_fileio_library.write_dictionary_table_data``.
        mangled_mapping: not used by this function; kept so existing callers
            keep working.

    Raises:
        KeyError: if a data row lacks one of the header columns.
    """
    # Column order comes from the file's first line; the leading column is dropped.
    with open(input_integrals_filename) as integrals_file:
        header_order = integrals_file.readline().rstrip().split(",")[1:]

    table_list = ming_fileio_library.parse_table_with_headers_object_list(input_integrals_filename, delimiter=",")
    # Removing other header information
    table_list = table_list[2:]

    output_dict = defaultdict(list)

    print("for zheng's sanity print the wholetable ----")
    print(table_list)
    for result_object in table_list:
        try:
            sample_name = result_object["RTS:"]
        except KeyError:
            sample_name = "unknown"

        for scan_number, header in enumerate(header_order, start=1):
            output_dict["filename"].append(sample_name)
            output_dict["abundance"].append(result_object[header])
            output_dict["scan_number"].append(scan_number)
            output_dict["RT"].append(header)

    ming_fileio_library.write_dictionary_table_data(output_dict, output_clusterinfo_filename)
# Example #4
# 0
def create_bucket_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, output_filename, metadata_mapping):
    """Write a tab-separated table of summed precursor intensity per cluster and file.

    Rows of the cluster-info table (columns ``#ClusterIdx``, ``#Filename`` and
    ``#PrecIntensity``) are accumulated per cluster index and per file base
    name; every row contributes ``max(intensity, 1.0)``. Only clusters whose
    index appears in the ``cluster index`` column of the summary file are
    kept. The output has one ``#OTU ID`` column followed by one column for
    every mangled name that contains ``"spec"``.

    Args:
        cluster_info_filename: table parsed with
            ``ming_fileio_library.parse_table_with_headers``.
        param_filename: XML parameter file from which the mangled file
            mapping is derived.
        clusterinfosummary_filename: tab-separated file with a
            ``cluster index`` column.
        output_filename: path of the table to write.
        metadata_mapping: mapping from the base name of a mangled mapping
            value to the column label to use; when a base name is absent,
            its extension-less form is used as the label instead.

    Raises:
        KeyError: if a kept row refers to a file base name that is not a key
            of the mangled file mapping.
    """
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
    with open(param_filename, "r") as param_file:
        param_object = ming_proteosafe_library.parse_xml_file(param_file)
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_object)

    with open(clusterinfosummary_filename) as summary_file:
        clusters_in_network = {
            row["cluster index"]
            for row in csv.DictReader(summary_file, delimiter='\t')
        }

    cluster_index_to_file_map = {}
    for i in range(line_counts):
        cluster_number = table_data["#ClusterIdx"][i]
        if cluster_number not in clusters_in_network:
            continue

        if cluster_number not in cluster_index_to_file_map:
            # Every known mangled file starts at zero so that each output row
            # has a value for every column.
            cluster_index_to_file_map[cluster_number] = dict.fromkeys(mangled_mapping, 0.0)

        mangled_filename_only = os.path.basename(table_data["#Filename"][i])
        # A spectrum always counts for at least 1.0, even when its recorded
        # precursor intensity is smaller.
        cluster_index_to_file_map[cluster_number][mangled_filename_only] += max(float(table_data["#PrecIntensity"][i]), 1.0)

    # Only mangled names containing "spec" become output columns.
    spec_headers = [header for header in mangled_mapping if "spec" in header]

    output_header_list = ["#OTU ID"]
    for header in spec_headers:
        mapped_basename = os.path.basename(mangled_mapping[header])
        if mapped_basename in metadata_mapping:
            output_header_list.append(metadata_mapping[mapped_basename])
        else:
            output_header_list.append(ming_fileio_library.get_filename_without_extension(mapped_basename))

    # The output file is opened only once all inputs were read successfully,
    # and is closed even if writing fails part-way.
    with open(output_filename, "w") as output_file:
        output_file.write("\t".join(output_header_list) + "\n")
        for cluster_idx, file_intensities in cluster_index_to_file_map.items():
            line_output_list = [str(cluster_idx)]
            line_output_list.extend(str(file_intensities[header]) for header in spec_headers)
            output_file.write("\t".join(line_output_list) + "\n")
def get_stripped_extenstion_file_mapping(params):
    """Re-key the mangled file mapping by extension-less mangled name.

    The mapping returned by ``get_mangled_file_mapping(params)`` is copied
    into a new dict whose keys are passed through
    ``ming_fileio_library.get_filename_without_extension``; the values are
    carried over unchanged. If two mangled names collapse to the same
    stripped key, the one iterated last wins.
    """
    mangled_to_value = get_mangled_file_mapping(params)
    return {
        ming_fileio_library.get_filename_without_extension(name): mangled_to_value[name]
        for name in mangled_to_value
    }
def resolve_metadata_filename_to_all_files(filename, dataset_files):
    """Resolve a metadata file name to exactly one dataset file.

    The extension-less form of ``filename`` is searched as a substring in
    every entry of ``dataset_files``. When exactly one entry matches, that
    entry prefixed with ``"f."`` is returned; with zero or several matches
    the result is ``None``.
    """
    stem = ming_fileio_library.get_filename_without_extension(filename)

    matches = []
    for candidate in dataset_files:
        if stem in candidate:
            matches.append("f." + candidate)

    # Ambiguous or missing matches are both reported as "not resolvable".
    if len(matches) == 1:
        return matches[0]
    return None
# Example #7
# 0
def _default_metadata_rows(reverse_file_mangling):
    """Return a ``{"filename": real_name}`` row for each entry whose mangled name contains "spec"."""
    rows = []
    for real_name in reverse_file_mangling:
        if "spec" in reverse_file_mangling[real_name]:
            rows.append({"filename": real_name})
    return rows


def main():
    """Write a QIIME-style metadata table and an HTML redirect page.

    Command-line arguments: ``param_xml``, ``metadata_folder``,
    ``output_metadata_table`` and ``output_view_emporer``.

    The redirect page sends the browser to the Emperor viewer with the
    ``biom`` and ``metadata`` download URLs of the task named in the
    parameter file. The metadata rows come from the single file in
    ``metadata_folder``; when the folder does not hold exactly one file, or
    that file has no rows, one row per input file whose mangled name
    contains "spec" is generated instead. Each row is completed with
    ``#SampleID``, ``BarcodeSequence`` and ``LinkerPrimerSequence`` if they
    are missing, and gets ``ATTRIBUTE_GNPSDefaultGroup`` (G1..G6) according
    to the prefix found in its mangled name.

    Raises:
        KeyError: if a metadata row has no ``filename`` entry, or its
            ``filename`` is not a key of the reverse mangled mapping.
    """
    parser = argparse.ArgumentParser(description='Modifying script')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_metadata_table', help='output_metadata_table')
    parser.add_argument('output_view_emporer', help='output_view_emporer')
    args = parser.parse_args()

    with open(args.param_xml, "r") as param_file:
        param_object = ming_proteosafe_library.parse_xml_file(param_file)

    # Outputting html
    from urllib.parse import urlencode
    task_id = param_object["task"][0]
    parameters_for_qiime = {
        'biom':
        'http://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=biom_output/networking_quant.biom'
        % (task_id),
        'metadata':
        'http://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=metadata_for_qiime/metadata_for_qiime.txt'
        % (task_id)
    }

    with open(args.output_view_emporer, "w") as output_html_file:
        output_html_file.write("<script>\n")
        output_html_file.write(
            'window.location.replace("https://mingwangbeta.ucsd.edu/emperor?%s")\n'
            % urlencode(parameters_for_qiime))
        output_html_file.write("</script>\n")

    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(
        param_object)

    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(
        args.metadata_folder)

    if len(metadata_files_in_folder) != 1:
        # No (or ambiguous) metadata: fall back to one row per input file.
        object_list = _default_metadata_rows(reverse_file_mangling)
    else:
        object_list = ming_fileio_library.parse_table_with_headers_object_list(
            metadata_files_in_folder[0])
        if len(object_list) == 0:
            # Empty metadata file: same fallback as above.
            object_list = _default_metadata_rows(reverse_file_mangling)

    # Writing headers
    header_list = ["#SampleID", "BarcodeSequence", "LinkerPrimerSequence"]
    # The fallback can be empty when no mangled name contains "spec"; in that
    # case only the fixed headers are emitted instead of failing on [0].
    if object_list:
        for key in object_list[0]:
            if key not in header_list:
                header_list.append(key)

    header_list.append("ATTRIBUTE_GNPSDefaultGroup")

    # Checked in this order; the first prefix found in the mangled name wins.
    group_prefixes = (
        ("spec-", "G1"),
        ("spectwo-", "G2"),
        ("specthree-", "G3"),
        ("specfour-", "G4"),
        ("specfive-", "G5"),
        ("specsix-", "G6"),
    )

    for metadata_object in object_list:
        if "#SampleID" not in metadata_object:
            metadata_object[
                "#SampleID"] = ming_fileio_library.get_filename_without_extension(
                    metadata_object["filename"])
        if "BarcodeSequence" not in metadata_object:
            metadata_object["BarcodeSequence"] = "GATACA"
        if "LinkerPrimerSequence" not in metadata_object:
            metadata_object["LinkerPrimerSequence"] = "GATACA"

        mangled_name = reverse_file_mangling[metadata_object["filename"]]
        for prefix, group in group_prefixes:
            if prefix in mangled_name:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = group
                break

    ming_fileio_library.write_list_dict_table_data(object_list,
                                                   args.output_metadata_table,
                                                   header_list)
# Example #8
# 0
def main():
    """Filter PSMs from a TSV file and spread the extracted spectra over JSON bins.

    Positional command-line arguments (``sys.argv[1:5]``): parameter XML
    file, input PSM TSV file, intermediate output folder, number of output
    bins.

    Each extracted spectrum is assigned to a bin by the SHA-1 hash of its
    ``"annotation"`` string modulo the bin count, so equal annotations always
    land in the same file. Every bin file
    ``<tsv name>_partition_<i>.json`` holds one JSON array with one spectrum
    per line.

    Filter thresholds are read from the parameter file as a group: if any of
    them is missing or not a number, "exception" is printed and all of them
    fall back to their defaults.
    """
    from contextlib import ExitStack

    input_paramxml = sys.argv[1]
    input_tsv_filename = sys.argv[2]
    intermediate_output_folder = sys.argv[3]
    output_file_bins = int(sys.argv[4])

    with open(input_paramxml) as param_file:
        params_obj = ming_proteosafe_library.parse_xml_file(param_file)
    snr_threshold = get_snr_filter(params_obj)

    #Filtering Criteria
    minimum_explained_intensity = 0.0
    min_number_of_peaks_within_1_percent_of_max = 0.0
    min_signal_peaks = 0.0
    min_number_of_annotated_ions = 0.0
    max_kl_strict_score = 50
    max_ppm_error = 100000000

    parameter_keys = (
        "min_explained_intensity",
        "min_number_of_peaks_within_1_percent_of_max",
        "min_signal_peaks",
        "min_number_of_annotated_ions",
        "kl_strict_max",
        "max_ppm_error",
    )
    try:
        parsed_values = [float(params_obj[key][0]) for key in parameter_keys]
    except (KeyError, IndexError, TypeError, ValueError):
        # Values are only committed after all of them parsed, so a failure
        # leaves every threshold at its default rather than a partial mix.
        print("exception")
    else:
        (minimum_explained_intensity,
         min_number_of_peaks_within_1_percent_of_max,
         min_signal_peaks,
         min_number_of_annotated_ions,
         max_kl_strict_score,
         max_ppm_error) = parsed_values
        if max_kl_strict_score == 0:
            max_kl_strict_score = 50
    # NOTE(review): max_kl_strict_score is parsed but not forwarded to
    # extract_psms_from_filename below - confirm whether that is intended.

    #lets find the 1% variant point, and then the naive solution is to to take the top scoring one
    psm_set = ming_psm_library.PSMset("")
    psm_set.load_PSM_tsvfile(input_tsv_filename, load_extra_metadata=True)

    filename_to_psm_dict = group_psms_by_filename(psm_set)

    #All output files, we are going to bin them starting now
    output_filename_prefix = os.path.join(
        intermediate_output_folder,
        ming_fileio_library.get_filename_without_extension(
            os.path.basename(input_tsv_filename)) + "_partition_")

    # ExitStack closes every bin file even if extraction or writing fails.
    with ExitStack() as stack:
        output_files = []
        for i in range(output_file_bins):
            output_file = stack.enter_context(
                open(output_filename_prefix + str(i) + ".json", "w"))
            output_file.write("[")
            output_files.append(output_file)
        output_files_number_spectra = [0] * len(output_files)

        for filename in filename_to_psm_dict:
            extracted_spectra = extract_psms_from_filename(
                filename, filename_to_psm_dict[filename], snr_threshold,
                minimum_explained_intensity, min_signal_peaks,
                min_number_of_peaks_within_1_percent_of_max,
                min_number_of_annotated_ions, max_ppm_error)
            for spectrum in extracted_spectra:
                digest = hashlib.sha1(
                    spectrum["annotation"].encode('utf-8')).hexdigest()
                hashed_index = int(digest, 16) % output_file_bins
                # Array elements after the first are preceded by a comma.
                separator = "," if output_files_number_spectra[hashed_index] else ""
                output_files[hashed_index].write(
                    separator + json.dumps(spectrum) + "\n")
                output_files_number_spectra[hashed_index] += 1

        for output_file in output_files:
            output_file.write("]")