def create_bucket_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, output_filename, metadata_mapping):
    """Write a bucket table ("#OTU ID" TSV): one row per network cluster,
    one column per input spectrum file, cells holding summed precursor
    intensity.

    Parameters:
        cluster_info_filename: per-spectrum cluster table with
            "#ClusterIdx", "#Filename", "#PrecIntensity" columns.
        param_filename: ProteoSAFe params XML (for the mangled->original
            filename mapping).
        clusterinfosummary_filename: TSV with a "cluster index" column;
            only these clusters are emitted.
        output_filename: destination TSV path.
        metadata_mapping: original basename -> display header name;
            missing entries fall back to the basename without extension.
    """
    # Context managers replace the original's unclosed param/summary handles.
    with open(param_filename, "r") as param_fh:
        param_object = ming_proteosafe_library.parse_xml_file(param_fh)
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_object)

    line_counts, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_filename)

    # Restrict output to clusters that made it into the network summary.
    clusters_in_network = set()
    with open(clusterinfosummary_filename) as summary_fh:
        for row in csv.DictReader(summary_fh, delimiter='\t'):
            clusters_in_network.add(row["cluster index"])

    cluster_index_to_file_map = {}
    clusters_map = {}
    all_files = {}
    for i in range(line_counts):
        cluster_number = table_data["#ClusterIdx"][i]
        if cluster_number not in clusters_in_network:
            continue

        if cluster_number not in clusters_map:
            clusters_map[cluster_number] = []
            # Dense accumulator: every mangled filename starts at 0.0 so
            # each output row has a value for every column.
            cluster_index_to_file_map[cluster_number] = {
                mangled_name: 0.0 for mangled_name in mangled_mapping
            }

        mangled_filename_only = os.path.basename(table_data["#Filename"][i])
        # Floor each spectrum's contribution at 1.0 so presence is never
        # lost to a zero/tiny precursor intensity.
        cluster_index_to_file_map[cluster_number][mangled_filename_only] += max(float(table_data["#PrecIntensity"][i]), 1.0)
        spectrum_info = {"filename": table_data["#Filename"][i], "intensity": table_data["#PrecIntensity"][i]}
        all_files[table_data["#Filename"][i]] = 1
        clusters_map[cluster_number].append(spectrum_info)

    # Header: "#OTU ID" followed by one column per input spectrum file
    # (mangled names containing "spec"), preferring the metadata name.
    output_header_list = ["#OTU ID"]
    for header in mangled_mapping.keys():
        if header.find("spec") == -1:
            continue
        original_basename = os.path.basename(mangled_mapping[header])
        if original_basename in metadata_mapping:
            output_header_list.append(metadata_mapping[original_basename])
        else:
            output_header_list.append(ming_fileio_library.get_filename_without_extension(original_basename))

    # with-block guarantees the output handle is flushed/closed on error.
    with open(output_filename, "w") as output_file:
        output_file.write("\t".join(output_header_list) + "\n")
        for cluster_idx in cluster_index_to_file_map:
            line_output_list = [str(cluster_idx)]
            for header in mangled_mapping.keys():
                if header.find("spec") == -1:
                    continue
                line_output_list.append(str(cluster_index_to_file_map[cluster_idx][header]))
            output_file.write("\t".join(line_output_list) + "\n")
def create_bucket_from_clusterinfo(cluster_info_filename, param_filename,
                                   clusterinfosummary_filename,
                                   output_filename):
    """Write a cluster-by-file precursor-intensity bucket table as TSV.

    Honors the CREATE_CLUSTER_BUCKETS workflow flag: when it is not "1",
    the output file contains only the literal text "No Output". Only
    clusters known to the molecular network summary are emitted.
    """
    with open(param_filename, "r") as param_fh:
        param_object = ming_proteosafe_library.parse_xml_file(param_fh)

    if param_object["CREATE_CLUSTER_BUCKETS"][0] != "1":
        # Feature disabled: leave a sentinel file and stop.
        # (The original left this handle open forever on this path.)
        with open(output_filename, "w") as output_file:
            output_file.write("No Output")
        return

    test_network = molecular_network_library.MolecularNetwork()
    test_network.load_clustersummary(clusterinfosummary_filename)

    line_counts, table_data = ming_fileio_library.parse_table_with_headers(
        cluster_info_filename)

    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        param_object)

    cluster_index_to_file_map = {}
    clusters_map = {}
    all_files = {}
    for i in range(line_counts):
        cluster_number = table_data["#ClusterIdx"][i]
        # Skip spectra whose cluster is absent from the network.
        if test_network.get_cluster_index(cluster_number) is None:
            continue

        if cluster_number not in clusters_map:
            clusters_map[cluster_number] = []
            # Dense accumulator: every mangled filename starts at 0.0.
            cluster_index_to_file_map[cluster_number] = {
                mangled_name: 0.0 for mangled_name in mangled_mapping
            }

        mangled_filename_only = os.path.basename(table_data["#Filename"][i])
        cluster_index_to_file_map[cluster_number][
            mangled_filename_only] += float(table_data["#PrecIntensity"][i])
        spectrum_info = {
            "filename": table_data["#Filename"][i],
            "intensity": table_data["#PrecIntensity"][i]
        }
        all_files[table_data["#Filename"][i]] = 1
        clusters_map[cluster_number].append(spectrum_info)

    # NOTE: the trailing "\t" on header and rows is kept so downstream
    # parsers see an unchanged column layout.
    with open(output_filename, "w") as output_file:
        output_header = "#OTU ID\t"
        for header in mangled_mapping.keys():
            output_header += os.path.basename(mangled_mapping[header]) + "\t"
        output_file.write(output_header + "\n")

        for cluster_idx in cluster_index_to_file_map:
            line_string = str(cluster_idx) + "\t"
            for header in mangled_mapping.keys():
                line_string += str(
                    cluster_index_to_file_map[cluster_idx][header]) + "\t"
            output_file.write(line_string + "\n")
def determine_filenames_to_load(my_node_number, params_obj, path_to_existing_library, path_to_new_library_spectra):
    """Locate this node's library JSON files by demangling filenames.

    Returns (existing_library_filename, new_library_filename); either is
    the empty string when no file in the corresponding folder demangles
    to "<node_number>.json".
    """
    target_name = str(my_node_number) + ".json"
    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_obj)

    def _find_match(folder):
        # Last match wins, mirroring the original sequential scan.
        match = ""
        for candidate in ming_fileio_library.list_files_in_dir(folder):
            mangled_base = os.path.basename(candidate)
            original_name = mangled_file_mapping[mangled_base]
            if os.path.basename(original_name) == target_name:
                match = os.path.join(folder, mangled_base)
        return match

    return _find_match(path_to_existing_library), _find_match(path_to_new_library_spectra)
Пример #4
0
def create_bucket_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, output_filename, metadata_mapping):
    """Emit a "#OTU ID" bucket table: one row per network cluster, one
    column per input spectrum file, cells holding summed precursor
    intensity (each spectrum contributes at least 1.0)."""
    bucket_file = open(output_filename, "w")
    row_total, table = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
    params = ming_proteosafe_library.parse_xml_file(open(param_filename, "r"))
    mangled_names = ming_proteosafe_library.get_mangled_file_mapping(params)

    # Only clusters present in the network summary are reported.
    network_clusters = {row["cluster index"]
                        for row in csv.DictReader(open(clusterinfosummary_filename), delimiter='\t')}

    per_cluster_intensity = {}
    cluster_spectra = {}
    seen_files = {}
    for idx in range(row_total):
        cluster_id = table["#ClusterIdx"][idx]
        if cluster_id not in network_clusters:
            continue

        if cluster_id not in cluster_spectra:
            cluster_spectra[cluster_id] = []
            # Dense accumulator: every mangled filename starts at zero.
            per_cluster_intensity[cluster_id] = dict.fromkeys(mangled_names.keys(), 0.0)

        source_file = table["#Filename"][idx]
        intensity_text = table["#PrecIntensity"][idx]
        # Each spectrum adds its precursor intensity, floored at 1.0.
        per_cluster_intensity[cluster_id][os.path.basename(source_file)] += max(float(intensity_text), 1.0)
        cluster_spectra[cluster_id].append({"filename": source_file, "intensity": intensity_text})
        seen_files[source_file] = 1

    # Header: "#OTU ID" + one column per "spec" input file, preferring the
    # caller-supplied metadata display name over the raw basename.
    header_cells = ["#OTU ID"]
    for mangled in mangled_names.keys():
        if "spec" not in mangled:
            continue
        base = os.path.basename(mangled_names[mangled])
        if base in metadata_mapping:
            header_cells.append(metadata_mapping[base])
        else:
            header_cells.append(ming_fileio_library.get_filename_without_extension(base))
    bucket_file.write("\t".join(header_cells) + "\n")

    for cluster_id in per_cluster_intensity:
        cells = [str(cluster_id)]
        for mangled in mangled_names.keys():
            if "spec" not in mangled:
                continue
            cells.append(str(per_cluster_intensity[cluster_id][mangled]))
        bucket_file.write("\t".join(cells) + "\n")
    bucket_file.close()
 def load_parameters_file(self, paramsfilename):
     """Parse the ProteoSAFe params XML at *paramsfilename* and cache the
     mangled-to-original filename mapping on ``self.mangled_mapping``."""
     #Loading the file mapping
     # NOTE(review): the file handle below is never closed explicitly;
     # it relies on garbage collection.
     parameters = ming_proteosafe_library.parse_xml_file(
         open(paramsfilename, "r"))
     mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
         parameters)
     self.mangled_mapping = mangled_mapping
Пример #6
0
def create_ili_output_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, filename_coordinate_mapping, output_filename):
    """Write an `ili-style CSV: one row per sample file that has 3D
    coordinates, with filename/X/Y/Z/radius followed by one column per
    network cluster holding that file's summed precursor intensity.

    Only clusters found in the molecular network summary are included.
    """
    output_file = open(output_filename, "w")
    test_network = molecular_network_library.MolecularNetwork()
    test_network.load_clustersummary(clusterinfosummary_filename)
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
    param_object = ming_proteosafe_library.parse_xml_file(open(param_filename, "r"))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_object)

    cluster_index_to_file_map = {}

    clusters_map = {}
    all_files = {}
    for i in range(line_counts):
        cluster_number = table_data["#ClusterIdx"][i]
        # Skip spectra whose cluster is not part of the network.
        if test_network.get_cluster_index(cluster_number) == None:
            continue

        if not (cluster_number in clusters_map):
            clusters_map[cluster_number] = []
            cluster_index_to_file_map[cluster_number] = {}
            #Adding all file names to mapping
            # (dense per-file accumulator so every sample column exists)
            for mangled_name in mangled_mapping.keys():
                cluster_index_to_file_map[cluster_number][mangled_name] = 0.0

        #print table_data["#Filename"][i].split("/")[1]
        mangled_filename_only = os.path.basename(table_data["#Filename"][i])
        cluster_index_to_file_map[cluster_number][mangled_filename_only] += float(table_data["#PrecIntensity"][i])
        spectrum_info = {"filename":table_data["#Filename"][i], "intensity": table_data["#PrecIntensity"][i]}
        all_files[table_data["#Filename"][i]] = 1
        clusters_map[cluster_number].append(spectrum_info)

    # Header: fixed coordinate columns followed by one column per cluster,
    # in the same insertion order used by the row loop below.
    all_headers = ["filename", "X", "Y", "Z", "radius"]
    for cluster_idx in cluster_index_to_file_map:
        all_headers.append(cluster_idx)

    #writing header
    output_file.write(",".join(all_headers) + "\n")

    # Only input spectrum files (mangled names containing "spec") that
    # have an entry in the coordinate mapping produce a row.
    for sample_name in mangled_mapping:
        if sample_name.find("spec") == -1:
            continue
        real_filename = mangled_mapping[sample_name]

        if not os.path.basename(real_filename) in filename_coordinate_mapping:
            continue

        line_output = [real_filename]
        coordinate_object = filename_coordinate_mapping[os.path.basename(real_filename)]
        line_output.append(coordinate_object["x"])
        line_output.append(coordinate_object["y"])
        line_output.append(coordinate_object["z"])
        line_output.append(coordinate_object["radius"])
        # NOTE(review): assumes coordinate values are strings; non-string
        # values would make the ",".join below raise — confirm upstream.
        print(line_output, coordinate_object)
        for cluster_idx in cluster_index_to_file_map:
            line_output.append(str(cluster_index_to_file_map[cluster_idx][sample_name]))
        output_file.write(",".join(line_output) + "\n")

    output_file.close()
def create_ili_output_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, filename_coordinate_mapping, output_filename):
    """Write an `ili-style CSV: one row per located sample file, with
    filename/X/Y/Z/radius followed by the file's summed precursor
    intensity for every cluster (no network filtering in this variant)."""
    out_handle = open(output_filename, "w")
    total_rows, table = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
    params = ming_proteosafe_library.parse_xml_file(open(param_filename, "r"))
    mangled = ming_proteosafe_library.get_mangled_file_mapping(params)

    intensity_by_cluster = {}
    spectra_by_cluster = {}
    seen_files = {}
    for row_idx in range(total_rows):
        cluster_id = table["#ClusterIdx"][row_idx]

        if cluster_id not in spectra_by_cluster:
            spectra_by_cluster[cluster_id] = []
            # Dense per-file accumulator so every sample column exists.
            intensity_by_cluster[cluster_id] = dict.fromkeys(mangled.keys(), 0.0)

        source = table["#Filename"][row_idx]
        intensity_by_cluster[cluster_id][os.path.basename(source)] += float(table["#PrecIntensity"][row_idx])
        spectra_by_cluster[cluster_id].append({"filename": source, "intensity": table["#PrecIntensity"][row_idx]})
        seen_files[source] = 1

    # Header: coordinate columns, then one column per cluster in the same
    # insertion order the row loop uses below.
    header_cells = ["filename", "X", "Y", "Z", "radius"] + list(intensity_by_cluster)
    out_handle.write(",".join(header_cells) + "\n")

    # One row per "spec" input file that has coordinates.
    for sample_name in mangled:
        if "spec" not in sample_name:
            continue
        real_filename = mangled[sample_name]
        if os.path.basename(real_filename) not in filename_coordinate_mapping:
            continue

        coords = filename_coordinate_mapping[os.path.basename(real_filename)]
        row_cells = [real_filename, coords["x"], coords["y"], coords["z"], coords["radius"]]
        print(row_cells, coords)
        for cluster_id in intensity_by_cluster:
            row_cells.append(str(intensity_by_cluster[cluster_id][sample_name]))
        out_handle.write(",".join(row_cells) + "\n")

    out_handle.close()
Пример #8
0
def main():
    """CLI entry point: filter a raw clusterinfo table down to the clusters
    present in the cluster summary and re-emit it with renamed columns."""
    parser = argparse.ArgumentParser(
        description='Creates enriched cluster info summary')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('input_clustersummary', help='input_clustersummary')
    parser.add_argument('input_clusterinfo', help='input_clusterinfo')
    parser.add_argument('output_clusterinfo', help='output_clusterinfo')
    args = parser.parse_args()

    with open(args.param_xml) as param_fh:
        params_object = ming_proteosafe_library.parse_xml_file(param_fh)
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        params_object)

    # Creating acceptable clusters to include in cluster info
    included_clusters = set()
    with open(args.input_clustersummary) as summary_fh:
        for row in csv.DictReader(summary_fh, delimiter='\t'):
            included_clusters.add(row["cluster index"])

    field_names = [
        "cluster index", "AllFiles", "sum(precursor intensity)", "RTMean",
        "RTStdErr", "parent mass", "ScanNumber", "ProteosafeFilePath",
        "Original_Path"
    ]
    # Both handles in context managers so the output is flushed/closed even
    # on error (the original never closed the DictWriter's file handle).
    with open(args.input_clusterinfo) as input_clusterinfo, \
            open(args.output_clusterinfo, "w") as output_fh:
        output_clusterinfo_writer = csv.DictWriter(output_fh,
                                                   fieldnames=field_names,
                                                   delimiter='\t')
        output_clusterinfo_writer.writeheader()

        input_clusterinfo_reader = csv.DictReader(input_clusterinfo,
                                                  delimiter='\t')
        for row in input_clusterinfo_reader:
            if row["#ClusterIdx"] not in included_clusters:
                continue
            output_dict = {
                "cluster index": row["#ClusterIdx"],
                "AllFiles": row["#Filename"],
                "sum(precursor intensity)": row["#PrecIntensity"],
                "RTMean": row["#RetTime"],
                # RT standard error is not available in the raw table.
                "RTStdErr": "0",
                "parent mass": row["#ParentMass"],
                "ScanNumber": row["#Scan"],
                "ProteosafeFilePath": os.path.join(
                    "spec", os.path.basename(row["#Filename"])),
                "Original_Path": "f." + mangled_mapping[
                    os.path.basename(row["#Filename"])],
            }
            output_clusterinfo_writer.writerow(output_dict)

    exit(0)
def main():
    """sys.argv entry point for ambiguity post-processing of MSGF+ results.

    argv: [1] MSGF+ TSV results file, [2] ProteoSAFe params XML,
          [3] library identifications file, [4] JSON cutoff scores,
          [5] output folder.
    """
    input_file_of_tsv_results = sys.argv[1]
    input_params_xml_filename = sys.argv[2]
    input_library_identifications_filename = sys.argv[3]
    input_cutoff_scores = sys.argv[4]
    output_folder = sys.argv[5]

    # Output reuses the input file's basename inside the output folder.
    output_filename = os.path.join(output_folder, os.path.basename(input_file_of_tsv_results))

    params_object = ming_proteosafe_library.parse_xml_file(open(input_params_xml_filename))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    library_scans_to_identification = library_scans_to_identification_info(input_library_identifications_filename)

    cutoff_dict = json.loads(open(input_cutoff_scores).read())

    psm_list = ming_psm_library.parse_MSGFPlus_tsvfile(input_file_of_tsv_results)
    output_results_dict = process_ambiguity(psm_list, mangled_mapping, library_scans_to_identification, cutoff_dict)

    ming_fileio_library.write_dictionary_table_data(output_results_dict, output_filename)
def name_demangle_filenames(input_file, output_file, path_to_param,
                            old_filename_header, new_filename_header):
    """Replace mangled filenames in a table column with their originals.

    When the old and new headers are equal the column is rewritten in
    place; otherwise the demangled names become a new column alongside
    the old one. The result is written to *output_file*.
    """
    row_count, table_data = ming_fileio_library.parse_table_with_headers(
        input_file)
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        ming_proteosafe_library.parse_xml_file(open(path_to_param)))

    # Demangle every row first; each lookup raises KeyError on an unknown
    # mangled name, exactly as the original per-row loop did.
    demangled_names = [mangled_mapping[table_data[old_filename_header][i]]
                       for i in range(row_count)]

    if old_filename_header == new_filename_header:
        # In-place rewrite of the existing column.
        for i, clean_name in enumerate(demangled_names):
            table_data[new_filename_header][i] = clean_name
    else:
        # Attach the demangled names as a separate column.
        table_data[new_filename_header] = demangled_names

    ming_fileio_library.write_dictionary_table_data(table_data, output_file)
def main():
    """CLI entry point: filter a raw clusterinfo table down to the clusters
    present in the cluster summary and re-emit it with renamed columns."""
    parser = argparse.ArgumentParser(description='Creates enriched cluster info summary')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('input_clustersummary', help='input_clustersummary')
    parser.add_argument('input_clusterinfo', help='input_clusterinfo')
    parser.add_argument('output_clusterinfo', help='output_clusterinfo')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)


    #Creating acceptable clusters to include in cluster info
    included_clusters = set()
    for row in csv.DictReader(open(args.input_clustersummary), delimiter='\t'):
        included_clusters.add(row["cluster index"])

    with open(args.input_clusterinfo) as input_clusterinfo:
        field_names = ["cluster index", "AllFiles", "sum(precursor intensity)", "RTMean", "RTStdErr", "parent mass", "ScanNumber", "ProteosafeFilePath", "Original_Path"]
        # NOTE(review): the output handle created inline below is never
        # closed; rows may be lost if the interpreter dies before GC flushes.
        output_clusterinfo_writer = csv.DictWriter(open(args.output_clusterinfo, "w"), fieldnames=field_names, delimiter='\t')
        output_clusterinfo_writer.writeheader()

        input_clusterinfo_reader = csv.DictReader(input_clusterinfo, delimiter='\t')
        for row in input_clusterinfo_reader:
            # Only clusters listed in the summary are kept.
            if not (row["#ClusterIdx"] in included_clusters):
                continue
            output_dict = {}
            output_dict["cluster index"] = row["#ClusterIdx"]
            output_dict["AllFiles"] = row["#Filename"]
            output_dict["sum(precursor intensity)"] = row["#PrecIntensity"]
            output_dict["RTMean"] = row["#RetTime"]
            # RT standard error is not present in the raw table.
            output_dict["RTStdErr"] = "0"
            output_dict["parent mass"] = row["#ParentMass"]
            output_dict["ScanNumber"] = row["#Scan"]
            output_dict["ProteosafeFilePath"] = os.path.join("spec", os.path.basename(row["#Filename"]))
            output_dict["Original_Path"] = "f." + mangled_mapping[os.path.basename(row["#Filename"])]
            output_clusterinfo_writer.writerow(output_dict)

    exit(0)
Пример #12
0
def name_demangle_filenames_and_instrument_collision(input_file, output_file,
                                                     path_to_param,
                                                     path_to_original_results,
                                                     old_filename_header,
                                                     new_filename_header):
    """Demangle a filename column and backfill a "FragMethod" column.

    When the input table lacks "FragMethod", each row's collision method
    is looked up from the original results by "<filename>_<scan>" key;
    rows with no match get "NO_COLLISION". The mangled filename column is
    then rewritten in place (old == new header) or added as a new column.
    """
    row_count, table_data = ming_fileio_library.parse_table_with_headers(
        input_file, skip_incomplete_lines=True)
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        ming_proteosafe_library.parse_xml_file(open(path_to_param)))

    if not "FragMethod" in table_data:
        print("Demangling", path_to_original_results, input_file)
        collision_mapping = get_scan_mapping_for_collision_method(
            path_to_original_results)

        #Adding collision column
        table_data["FragMethod"] = []
        print(len(table_data["filename"]), len(table_data["scan"]))
        for i in range(row_count):
            # Key matches the mapping's "<filename>_<scan>" convention.
            key = table_data["filename"][i] + "_" + table_data["scan"][i]
            if key in collision_mapping:
                table_data["FragMethod"].append(collision_mapping[key])
            else:
                table_data["FragMethod"].append("NO_COLLISION")

    if old_filename_header == new_filename_header:
        # Rewrite the existing column in place.
        for i in range(row_count):
            mangled_name = table_data[old_filename_header][i]
            unmangled_name = mangled_mapping[mangled_name]
            table_data[new_filename_header][i] = unmangled_name
    else:
        # Attach the demangled names as a separate column.
        table_data[new_filename_header] = []
        for i in range(row_count):
            mangled_name = table_data[old_filename_header][i]
            unmangled_name = mangled_mapping[mangled_name]
            table_data[new_filename_header].append(unmangled_name)

    ming_fileio_library.write_dictionary_table_data(table_data, output_file)
Пример #13
0
def _ensure_folder(path):
    """Best-effort mkdir for scratch folders; a pre-existing folder
    (e.g. on re-runs) is not fatal."""
    try:
        os.mkdir(path)
    except OSError:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # propagate instead of being swallowed.
        print("folder error")


def main():
    """Partition spectra across nodes, run library search in parallel,
    then merge per-chunk results and attach full CCMS paths."""
    parser = argparse.ArgumentParser(
        description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('json_parameters', help='proteosafe xml parameters')
    parser.add_argument('workflow_parameters',
                        help='output folder for parameters')
    parser.add_argument('library_folder', help='output folder for parameters')
    parser.add_argument('result_folder', help='output folder for parameters')
    parser.add_argument('convert_binary', help='output folder for parameters')
    parser.add_argument('librarysearch_binary',
                        help='output folder for parameters')
    parser.add_argument('--parallelism',
                        default=1,
                        type=int,
                        help='Parallelism')
    args = parser.parse_args()

    parallel_json = json.loads(open(args.json_parameters).read())

    params_object = ming_proteosafe_library.parse_xml_file(
        open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        params_object)
    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)

    # Deterministic order so the node-partition slice below is stable
    # across all nodes.
    spectra_files.sort()

    print(spectra_files)
    # Take every Nth file starting at this node's offset.
    # (key "total_paritions" [sic] comes from the upstream JSON.)
    spectra_files = spectra_files[
        parallel_json["node_partition"]::parallel_json["total_paritions"]]
    print(spectra_files)

    temp_folder = "temp"
    _ensure_folder(temp_folder)

    tempresults_folder = "tempresults"
    _ensure_folder(tempresults_folder)

    # Batch spectra five files per search invocation.
    list_of_spectrumfiles = chunks(spectra_files, 5)
    parameter_list = []
    for spectrum_files_chunk in list_of_spectrumfiles:
        param_dict = {}
        param_dict["spectra_files"] = spectrum_files_chunk
        param_dict["temp_folder"] = temp_folder
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        param_dict["params_object"] = params_object
        param_dict["library_files"] = library_files

        parameter_list.append(param_dict)

    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(search_wrapper, parameter_list, 5)

    # Merging files and adding full path.
    all_result_files = ming_fileio_library.list_files_in_dir(
        tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        result_list = ming_fileio_library.parse_table_with_headers_object_list(
            input_file)
        full_result_list += result_list

    for result_object in full_result_list:
        # Map the mangled on-disk name back to the user's original path.
        mangled_name = os.path.basename(result_object["SpectrumFile"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    ming_fileio_library.write_list_dict_table_data(
        full_result_list,
        os.path.join(args.result_folder,
                     str(uuid.uuid4()) + ".tsv"))
def main():
    """Build/augment the GNPS metadata table with a default group column.

    Files are assigned to groups G1..G6 from their mangled-name prefix
    (specone- .. specsix-). A single metadata file in the folder is parsed
    and cleaned (surrounding double quotes stripped from filenames); files
    with a group but no metadata row are appended with "N/A" elsewhere.
    """
    parser = argparse.ArgumentParser(
        description='Creating Clustering Info Summary')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_metadata_file', help='output_metadata_file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(
        open(args.proteosafe_parameters))

    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        param_obj)

    # Mangled-name prefix -> default group (specone- => G1 ... specsix- => G6).
    default_group_mapping = defaultdict(list)
    file_to_group_mapping = {}
    for mangled_name in mangled_file_mapping:
        if mangled_name.find("specone-") != -1:
            default_group_mapping["G1"].append(
                mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(
                mangled_file_mapping[mangled_name])] = "G1"
        if mangled_name.find("spectwo-") != -1:
            default_group_mapping["G2"].append(
                mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(
                mangled_file_mapping[mangled_name])] = "G2"
        if mangled_name.find("specthree-") != -1:
            default_group_mapping["G3"].append(
                mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(
                mangled_file_mapping[mangled_name])] = "G3"
        if mangled_name.find("specfour-") != -1:
            default_group_mapping["G4"].append(
                mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(
                mangled_file_mapping[mangled_name])] = "G4"
        if mangled_name.find("specfive-") != -1:
            default_group_mapping["G5"].append(
                mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(
                mangled_file_mapping[mangled_name])] = "G5"
        if mangled_name.find("specsix-") != -1:
            default_group_mapping["G6"].append(
                mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(
                mangled_file_mapping[mangled_name])] = "G6"

    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(
        args.metadata_folder)

    # Only a single metadata file is supported; otherwise start empty.
    row_count = 0
    table_data = defaultdict(list)
    if len(metadata_files_in_folder) == 1:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(
            metadata_files_in_folder[0])

    print(table_data)
    for key in table_data:
        print(key, len(table_data[key]))

    for i in range(row_count):
        print(i)
        filename = table_data["filename"][i]
        if len(filename) < 2:
            continue
        print(filename, filename[0], filename[-1])

        # Strip a single leading/trailing double quote left by exports.
        if filename[0] == "\"":
            filename = filename[1:]
        if filename[-1] == "\"":
            filename = filename[:-1]

        table_data["filename"][i] = filename

        basename_filename = os.path.basename(filename)
        group_name = "NoDefaultGroup"
        if basename_filename in file_to_group_mapping:
            group_name = file_to_group_mapping[basename_filename]
        # NOTE(review): this append assumes table_data behaves like a
        # defaultdict(list) even when it came from parse_table_with_headers;
        # if that returns a plain dict this raises KeyError — confirm.
        table_data["ATTRIBUTE_DefaultGroup"].append(group_name)

    # Add rows for grouped files that had no metadata entry.
    for input_filename in file_to_group_mapping:
        # NOTE(review): input_filename is a basename while the metadata
        # "filename" column may hold full paths — this membership test
        # could miss matches; verify against real inputs.
        if input_filename in table_data["filename"]:
            continue
        else:
            for key in table_data:
                if key != "ATTRIBUTE_DefaultGroup" and key != "filename":
                    table_data[key].append("N/A")

            table_data["ATTRIBUTE_DefaultGroup"].append(
                file_to_group_mapping[input_filename])
            table_data["filename"].append(input_filename)

    ming_fileio_library.write_dictionary_table_data(table_data,
                                                    args.output_metadata_file)
Пример #15
0
def process(param_xml, metadata_folder, output_metadata_folder):
    """Resolve the run's metadata table and write it as gnps_metadata.tsv.

    Priority: exactly one uploaded metadata file > a Google Sheets URL from
    the workflow parameters > nothing (no output is written). More than
    one uploaded metadata file is a hard error (exit(1)).
    """
    params_object = ming_proteosafe_library.parse_xml_file(open(param_xml))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    input_metadata_filenames = glob.glob(os.path.join(metadata_folder, "*"))

    user_metadata_df = None
    if len(input_metadata_filenames) == 1:
        user_metadata_df = pd.read_csv(input_metadata_filenames[0], sep="\t")
    
    if len(input_metadata_filenames) > 1:
        print("You have selected too many metadata files, please only select one")
        exit(1)
    
    # We didnt input metadata file, lets see what we can do with sheets
    if len(input_metadata_filenames) == 0:
        # Best-effort fetch through a sheets proxy; any failure (bad URL,
        # network error, malformed JSON) falls through with no metadata.
        try:
            from urllib.parse import urlparse
            sheets_url = params_object["googlesheetsmetadata"][0]

            if len(sheets_url) > 10:
                parsed_url = urlparse(sheets_url)
                path = parsed_url.path
                path_splits = path.split("/")
                # URL shape /spreadsheets/d/<id>/... puts the id at index 3.
                sheets_id = path_splits[3]

                json_url = "https://gnps-sheets-proxy.herokuapp.com/sheets.json?sheets_id={}".format(sheets_id)

                r = requests.get(json_url)
                user_metadata_df = pd.DataFrame(r.json())
        except:
            # NOTE(review): bare except is deliberate best-effort, but it
            # also swallows KeyboardInterrupt — consider `except Exception`.
            pass

    # Merging Default Groups in
    # default_group_list = []
    # for mangled_name in mangled_mapping.keys():
    #     group_dict = {}
    #     group_dict["filename"] = os.path.basename(mangled_mapping[mangled_name])
    #     if mangled_name.find("spec-") != -1:
    #         group_dict["DefaultGroup"] = "G1"
    #     if mangled_name.find("specone-") != -1:
    #         group_dict["DefaultGroup"] = "G1"
    #     if mangled_name.find("spectwo-") != -1:
    #         group_dict["DefaultGroup"] = "G2"
    #     if mangled_name.find("specthree-") != -1:
    #         group_dict["DefaultGroup"] = "G3"
    #     if mangled_name.find("specfour-") != -1:
    #         group_dict["DefaultGroup"] = "G4"
    #     if mangled_name.find("specfive-") != -1:
    #         group_dict["DefaultGroup"] = "G5"
    #     if mangled_name.find("specsix-") != -1:
    #         group_dict["DefaultGroup"] = "G6"

    #     if len(group_dict) > 1:
    #         default_group_list.append(group_dict)

    # default_metadata_df = pd.DataFrame(default_group_list)

    # if user_metadata_df is not None:
    #     merged_metadata_df = default_metadata_df.merge(user_metadata_df, how="outer", on="filename")
    # else:
    #     merged_metadata_df = default_metadata_df

    merged_metadata_df = user_metadata_df

    # Nothing is written when no metadata source was resolved.
    if merged_metadata_df is not None:
        output_metadata_filename = os.path.join(output_metadata_folder, "gnps_metadata.tsv")
        merged_metadata_df.to_csv(output_metadata_filename, sep="\t", index=False)
Пример #16
0
def main():
    """Summarize instrument metadata for a folder of spectrum files in parallel.

    Runs the msaccess summary over every spectrum file, merges the per-file
    result tables (keeping Filename/Vendor/Model/MS1s/MS2s), restores each
    file's original CCMS upload path from the mangled-name mapping, and adds
    placeholder rows for inputs that produced no results before writing one
    TSV to ``result_file``.
    """
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('workflow_parameters', help='output folder for parameters')
    parser.add_argument('result_file', help='output folder for parameters')
    parser.add_argument('msaccess_binary', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)

    # Deterministic ordering so repeated runs process files in the same order
    spectra_files.sort()

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except OSError:
        # Folder already exists (e.g. restarted job); keep going and reuse it
        print("folder error")

    parameter_list = []
    for spectrum_file in spectra_files:
        param_dict = {}
        param_dict["spectrum_file"] = spectrum_file
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        parameter_list.append(param_dict)

    print("Parallel to execute", len(parameter_list))
    # Honor the user-requested --parallelism (was hard-coded to 10, ignoring the flag)
    ming_parallel_library.run_parallel_job(summary_wrapper, parameter_list, args.parallelism)

    # Merge per-file result tables, keeping only the columns of interest
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        try:
            result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
            for result in result_list:
                full_result_list.append({key: result[key] for key in ("Filename", "Vendor", "Model", "MS1s", "MS2s")})
        except Exception:
            # A malformed or partial result file should not abort the whole merge
            print("Error", input_file)

    # Restore the original upload path for every row; remember which inputs produced output
    used_files = set()
    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["Filename"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path
        result_object["CCMS_filename"] = os.path.basename(full_path)
        used_files.add(full_path)

    # Emit placeholder rows for spectrum files that produced no msaccess results
    for spectrum_file in spectra_files:
        full_path = mangled_mapping[os.path.basename(spectrum_file)]
        if full_path in used_files:
            continue
        full_result_list.append({
            "full_CCMS_path": full_path,
            "CCMS_filename": os.path.basename(full_path),
        })

    pd.DataFrame(full_result_list).to_csv(args.result_file, sep="\t", index=False)
def main():
    """Create an enriched cluster info summary table.

    Combines the per-spectrum cluster info file with the group and attribute
    mappings to add, for every cluster: per-group spectrum counts, attribute
    membership strings, file and retention-time statistics, network component
    ids and library identifications, then writes the enriched table out.
    """
    parser = argparse.ArgumentParser(description='Creates enriched cluster info summary')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('input_clusterinfo_file', help='input_clusterinfo_file')
    parser.add_argument('input_clusterinfosummary_file', help='input_clusterinfosummary_file')
    parser.add_argument('input_group_mapping_filename', help='input_group_mapping_filename')
    parser.add_argument('input_attribute_mapping_filename', help='input_attribute_mapping_filename')
    parser.add_argument('input_networking_pairs', help='input_networking_pairs')
    parser.add_argument('input_library_search', help='input_library_search')
    parser.add_argument('output_clusterinfosummary_filename', help='output_clusterinfosummary_filename')
    args = parser.parse_args()

    """Loading group filenames"""
    group_to_files, files_to_groups = load_group_mapping(args.input_group_mapping_filename)
    print("Loaded Group Mapping")
    cluster_summary_list = ming_fileio_library.parse_table_with_headers_object_list(args.input_clusterinfosummary_file)
    print("Loaded Cluster Summary")

    attribute_to_groups = load_attribute_mapping(args.input_attribute_mapping_filename)

    params_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    CLUSTER_MIN_SIZE = int(params_object["CLUSTER_MIN_SIZE"][0])
    RUN_MSCLUSTER = params_object["RUN_MSCLUSTER"][0]

    #Calculating the spectrum counts per group
    cluster_to_group_counts = defaultdict(lambda: defaultdict(lambda: 0))
    cluster_to_files = defaultdict(set)
    cluster_to_RT = defaultdict(list)
    line_count = 0
    for line in open(args.input_clusterinfo_file):
        line_count += 1
        if line_count == 1:
            # Skip the header row of the tab-separated cluster info file
            continue
        if line_count % 10000 == 0:
            # Progress indicator for very large cluster info files
            print(line_count)

        splits = line.rstrip().split("\t")
        # Positional columns assumed: 0 = cluster index, 1 = spectrum filename,
        # 6 = retention time -- TODO confirm against the clusterinfo writer
        cluster_index = splits[0]
        filename = os.path.basename(splits[1])
        rt = float(splits[6])

        group_membership = files_to_groups[filename]
        cluster_to_files[cluster_index].add(filename)
        cluster_to_RT[cluster_index].append(rt)

        for group in group_membership:
            cluster_to_group_counts[cluster_index][group] += 1

    if RUN_MSCLUSTER == "on":
        # Only enforce the minimum cluster size when clustering was actually run
        cluster_summary_list = filter_clusters_based_on_cluster_size(cluster_summary_list, CLUSTER_MIN_SIZE)

    print(len(cluster_summary_list))

    print("Setting up grouping", len(group_to_files.keys()))
    for cluster_summary_object in cluster_summary_list:
        cluster_index = cluster_summary_object["cluster index"]
        # One count column per group; clusters with no spectra in a group get 0
        for group in group_to_files:
            group_count = 0
            if group in cluster_to_group_counts[cluster_index]:
                group_count = cluster_to_group_counts[cluster_index][group]
            cluster_summary_object[group] = group_count

        # For each attribute, record the groups with at least one spectrum in this cluster
        for attribute in attribute_to_groups:
            groups_to_include = []
            for group in attribute_to_groups[attribute]:
                if group in cluster_summary_object:
                    if cluster_summary_object[group] > 0:
                        groups_to_include.append(group)

            cluster_summary_object[attribute] = ",".join(groups_to_include).replace("GNPSGROUP:", "")


    print("Default Attributes")
    calculate_default_attributes(cluster_summary_list, group_to_files.keys())

    print("calculate_cluster_file_stats")
    calculate_cluster_file_stats(cluster_summary_list, cluster_to_files, mangled_mapping)

    print("rt stats")
    calculate_rt_stats(cluster_summary_list, cluster_to_RT)

    print("calculate_ancillary_information")
    calculate_ancillary_information(cluster_summary_list, params_object["task"][0])

    print("populate_network_component")
    populate_network_component(cluster_summary_list, args.input_networking_pairs)

    print("populate_network_identifications")
    populate_network_identifications(cluster_summary_list, args.input_library_search)

    ming_fileio_library.write_list_dict_table_data(cluster_summary_list, args.output_clusterinfosummary_filename)
Пример #18
0
def main():
    """Summarize instrument metadata for a folder of spectrum files in parallel.

    Runs the msaccess summary over every spectrum file, merges the per-file
    result tables (keeping Filename/Vendor/Model/MS1s/MS2s), restores each
    row's original CCMS upload path from the mangled-name mapping, and writes
    one merged table to ``result_file``.
    """
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('workflow_parameters', help='output folder for parameters')
    parser.add_argument('result_file', help='output folder for parameters')
    parser.add_argument('msaccess_binary', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)

    # Deterministic ordering so repeated runs process files in the same order
    spectra_files.sort()

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except OSError:
        # Folder already exists (e.g. restarted job); keep going and reuse it
        print("folder error")

    parameter_list = []
    for spectrum_file in spectra_files:
        parameter_list.append({
            "spectrum_file": spectrum_file,
            "tempresults_folder": tempresults_folder,
            "args": args,
        })

    print("Parallel to execute", len(parameter_list))
    # Honor the user-requested --parallelism (was hard-coded to 10, ignoring the flag)
    ming_parallel_library.run_parallel_job(summary_wrapper, parameter_list, args.parallelism)

    # Merge per-file result tables, keeping only the columns of interest
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        try:
            result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
            for result in result_list:
                full_result_list.append({key: result[key] for key in ("Filename", "Vendor", "Model", "MS1s", "MS2s")})
        except Exception:
            # A malformed or partial result file should not abort the whole merge
            print("Error", input_file)

    # Restore the original upload path for every row
    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["Filename"])
        result_object["full_CCMS_path"] = mangled_mapping[mangled_name]

    ming_fileio_library.write_list_dict_table_data(full_result_list, args.result_file)
def main():
    """Merge default (G1-G6) groupings into an optional user metadata table.

    Assigns each uploaded file to a default group based on its mangled upload
    name prefix (specone- .. specsix-), strips stray surrounding quotes from
    metadata filenames, appends an ATTRIBUTE_DefaultGroup column, and adds
    N/A-padded rows for grouped files absent from the metadata before writing
    the merged table.
    """
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_metadata_file', help='output_metadata_file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.proteosafe_parameters))

    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_obj)

    # Default group per mangled-name prefix; file_to_group_mapping is keyed by basename
    default_group_mapping = defaultdict(list)
    file_to_group_mapping = {}
    for mangled_name in mangled_file_mapping:
        if mangled_name.find("specone-") != -1:
            default_group_mapping["G1"].append(mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = "G1"
        if mangled_name.find("spectwo-") != -1:
            default_group_mapping["G2"].append(mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = "G2"
        if mangled_name.find("specthree-") != -1:
            default_group_mapping["G3"].append(mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = "G3"
        if mangled_name.find("specfour-") != -1:
            default_group_mapping["G4"].append(mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = "G4"
        if mangled_name.find("specfive-") != -1:
            default_group_mapping["G5"].append(mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = "G5"
        if mangled_name.find("specsix-") != -1:
            default_group_mapping["G6"].append(mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = "G6"

    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)

    # At most one metadata file is expected; with none, start from an empty table
    row_count = 0
    table_data = defaultdict(list)
    if len(metadata_files_in_folder) == 1:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(metadata_files_in_folder[0])

    print(table_data)
    for key in table_data:
        print(key, len(table_data[key]))

    for i in range(row_count):
        print(i)
        filename = table_data["filename"][i]
        # NOTE(review): this continue skips appending ATTRIBUTE_DefaultGroup for the
        # row, leaving that column shorter than the others -- confirm metadata rows
        # can never carry filenames shorter than 2 characters
        if len(filename) < 2:
            continue
        print(filename, filename[0], filename[-1])

        # Strip a single pair of surrounding double quotes left by spreadsheet exports
        if filename[0] == "\"":
            filename = filename[1:]
        if filename[-1] == "\"":
            filename = filename[:-1]

        table_data["filename"][i] = filename

        basename_filename = os.path.basename(filename)
        group_name = "NoDefaultGroup"
        if basename_filename in file_to_group_mapping:
            group_name = file_to_group_mapping[basename_filename]
        table_data["ATTRIBUTE_DefaultGroup"].append(group_name)



    # Add rows for grouped input files that never appeared in the metadata,
    # padding every other column with "N/A" to keep column lengths aligned
    for input_filename in file_to_group_mapping:
        if input_filename in table_data["filename"]:
            continue
        else:
            for key in table_data:
                if key != "ATTRIBUTE_DefaultGroup" and key != "filename":
                    table_data[key].append("N/A")

            table_data["ATTRIBUTE_DefaultGroup"].append(file_to_group_mapping[input_filename])
            table_data["filename"].append(input_filename)

    ming_fileio_library.write_dictionary_table_data(table_data, args.output_metadata_file)
def main():
    """Run the library search over this node's partition of the spectrum files.

    Takes this node's slice of the (sorted) spectrum files, searches them
    against the library files in parallel chunks of five, then merges the
    per-chunk result tables -- restoring each hit's original CCMS upload path
    -- into a uniquely-named TSV in the result folder.
    """
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('json_parameters', help='proteosafe xml parameters')
    parser.add_argument('workflow_parameters', help='output folder for parameters')
    parser.add_argument('library_folder', help='output folder for parameters')
    parser.add_argument('result_folder', help='output folder for parameters')
    parser.add_argument('convert_binary', help='output folder for parameters')
    parser.add_argument('librarysearch_binary', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    parallel_json = json.loads(open(args.json_parameters).read())

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)
    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)

    # Sort first so every node slices the same deterministic ordering
    spectra_files.sort()

    print(spectra_files)
    # Keep only this node's stride ("total_paritions" is the upstream JSON key, typo included)
    spectra_files = spectra_files[parallel_json["node_partition"]::parallel_json["total_paritions"]]
    print(spectra_files)

    temp_folder = "temp"
    try:
        os.mkdir(temp_folder)
    except OSError:
        # Folder already exists (e.g. restarted job); keep going and reuse it
        print("folder error")

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except OSError:
        print("folder error")

    # Batch the spectra so each parallel job searches several files at once
    parameter_list = []
    for spectrum_files_chunk in chunks(spectra_files, 5):
        parameter_list.append({
            "spectra_files": spectrum_files_chunk,
            "temp_folder": temp_folder,
            "tempresults_folder": tempresults_folder,
            "args": args,
            "params_object": params_object,
            "library_files": library_files,
        })

    print("Parallel to execute", len(parameter_list))
    # Honor the user-requested --parallelism (was hard-coded to 5, ignoring the flag)
    ming_parallel_library.run_parallel_job(search_wrapper, parameter_list, args.parallelism)

    # Merge the per-chunk result files into one list of row dicts
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        full_result_list += ming_fileio_library.parse_table_with_headers_object_list(input_file)

    # Restore the original upload path for every hit
    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["SpectrumFile"])
        result_object["full_CCMS_path"] = mangled_mapping[mangled_name]

    ming_fileio_library.write_list_dict_table_data(full_result_list, os.path.join(args.result_folder, str(uuid.uuid4()) + ".tsv"))
Пример #21
0
def main():
    """Build group and attribute mapping files from defaults, metadata, or a legacy mapping.

    Always writes the default G1-G6 groupings derived from mangled upload
    names. Then, in priority order: a single metadata file (ATTRIBUTE_*
    columns) produces both group and attribute mappings and exits; otherwise
    a legacy group mapping file (and optional attribute mapping file) is
    translated to mangled paths.
    """
    parser = argparse.ArgumentParser(
        description='Group Mapping from input, defaults and metadata file')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('groupmapping_folder', help='groupmapping_folder')
    parser.add_argument('attributemapping_folder',
                        help='attributemapping_folder')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_groupmapping_file',
                        help='output_groupmapping_file')
    parser.add_argument('output_attributemapping_file',
                        help='output_attributemapping_file')
    parser.add_argument('inputspectrafolder', help='inputspectrafolder')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(
        open(args.proteosafe_parameters))
    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        param_obj)
    # reverse mapping: original basename -> mangled upload name
    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(
        param_obj)
    print(reverse_file_mangling.keys())
    file_path_prefix = args.inputspectrafolder

    output_group_file = open(args.output_groupmapping_file, "w")
    output_attribute_file = open(args.output_attributemapping_file, "w")
    """
    Writing Default Grouping to output file
    """
    # Default groups keyed by mangled-name prefix (note: "spec-" maps to G1 here)
    default_groupings = {
        'G1': [],
        'G2': [],
        'G3': [],
        'G4': [],
        'G5': [],
        'G6': []
    }
    for mangled_name in mangled_file_mapping.keys():
        if mangled_name.find("spec-") != -1:
            default_groupings['G1'].append(mangled_name.rstrip())
        if mangled_name.find("spectwo-") != -1:
            default_groupings['G2'].append(mangled_name.rstrip())
        if mangled_name.find("specthree-") != -1:
            default_groupings['G3'].append(mangled_name.rstrip())
        if mangled_name.find("specfour-") != -1:
            default_groupings['G4'].append(mangled_name.rstrip())
        if mangled_name.find("specfive-") != -1:
            default_groupings['G5'].append(mangled_name.rstrip())
        if mangled_name.find("specsix-") != -1:
            default_groupings['G6'].append(mangled_name.rstrip())

    # Emit one "GROUP_Gx=path;path;..." line per default group
    for default_group_key in default_groupings.keys():
        default_group_string = ""
        default_group_string += "GROUP_" + default_group_key + "="
        for mangled_name in default_groupings[default_group_key]:
            default_group_string += os.path.join(file_path_prefix,
                                                 mangled_name) + ";"
        if len(default_groupings[default_group_key]) > 0:
            # Drop the trailing semicolon
            default_group_string = default_group_string[:-1]
        output_group_file.write(default_group_string + "\n")
    """Determining output whether to use group mapping file or metadata file"""
    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(
        args.metadata_folder)
    groupmapping_files_in_folder = ming_fileio_library.list_files_in_dir(
        args.groupmapping_folder)
    attributemapping_files_in_folder = ming_fileio_library.list_files_in_dir(
        args.attributemapping_folder)

    if len(metadata_files_in_folder) > 1:
        print("Too many metafile inputted")
        exit(1)
    if len(metadata_files_in_folder) == 1:
        #Using metadatat file
        row_count, table_data = ming_fileio_library.parse_table_with_headers(
            metadata_files_in_folder[0])
        if not "filename" in table_data:
            print(
                "Missing 'filename' header in metadata file. Please specify the file name that goes along with each piece of metadata with the header: filename"
            )
            exit(1)
        attributes_to_groups_mapping = defaultdict(set)
        group_to_files_mapping = defaultdict(list)
        for i in range(row_count):
            filename = table_data["filename"][i]
            basename_filename = os.path.basename(filename).rstrip()
            print(basename_filename, len(reverse_file_mangling.keys()))
            if basename_filename in reverse_file_mangling:
                mangled_name = reverse_file_mangling[basename_filename]
                # Every ATTRIBUTE_* column value becomes a group containing this file
                for key in table_data:
                    if key.find("ATTRIBUTE_") != -1:
                        group_name = table_data[key][i]
                        if len(group_name) < 1:
                            # Empty cell: file is not in any group for this attribute
                            continue
                        group_to_files_mapping[group_name].append(
                            os.path.join(file_path_prefix, mangled_name))
                        attributes_to_groups_mapping[key.replace(
                            "ATTRIBUTE_", "")].add(group_name)
            else:
                #Filename is not part of sample set
                print(basename_filename, "missing")
                continue

        for group_name in group_to_files_mapping:
            group_string = "GROUP_" + group_name + "=" + ";".join(
                group_to_files_mapping[group_name])
            output_group_file.write(group_string + "\n")

        for attribute_name in attributes_to_groups_mapping:
            attribute_string = attribute_name + "=" + ";".join(
                list(attributes_to_groups_mapping[attribute_name]))
            output_attribute_file.write(attribute_string + "\n")
        # Metadata file handled everything; skip the legacy mapping path entirely
        exit(0)
    """Falling back on old group mapping file"""
    if len(groupmapping_files_in_folder) > 1 or len(
            attributemapping_files_in_folder) > 1:
        print("Too many group/attribute mappings inputted")
        exit(1)

    if len(groupmapping_files_in_folder) == 1:
        # Translate each "GROUP_name=path;path" line to mangled upload paths
        for line in open(groupmapping_files_in_folder[0], errors='ignore'):
            splits = line.rstrip().split("=")
            if len(splits) < 2:
                continue

            group_name = splits[0]
            group_files = []
            for filename in splits[1].split(";"):
                if os.path.basename(filename) in reverse_file_mangling:
                    mangled_name = reverse_file_mangling[os.path.basename(
                        filename)]
                    group_files.append(
                        os.path.join(file_path_prefix, mangled_name))

            group_string = group_name + "=" + ";".join(group_files)
            output_group_file.write(group_string + "\n")

    if len(attributemapping_files_in_folder) == 1:
        # Attribute mapping is copied through verbatim
        for line in open(attributemapping_files_in_folder[0]):
            output_attribute_file.write(line)
	def load_parameters_file(self, paramsfilename):
		"""Parse a ProteoSAFe params XML file and cache its mangled-file mapping.

		Stores the mapping on this object as ``self.mangled_mapping``.
		Fixes a resource leak: the params file handle was never closed.
		"""
		#Loading the file mapping
		with open(paramsfilename, "r") as params_file:
			parameters = ming_proteosafe_library.parse_xml_file(params_file)
		self.mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(parameters)
Пример #23
0
def main():
    """Create an enriched cluster info summary table.

    Combines the per-spectrum cluster info file with the group and attribute
    mappings to add, for every cluster: per-group spectrum counts, attribute
    membership strings, file and retention-time statistics, network component
    ids and library identifications, then writes the enriched table out.
    """
    parser = argparse.ArgumentParser(
        description='Creates enriched cluster info summary')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('input_clusterinfo_file',
                        help='input_clusterinfo_file')
    parser.add_argument('input_clusterinfosummary_file',
                        help='input_clusterinfosummary_file')
    parser.add_argument('input_group_mapping_filename',
                        help='input_group_mapping_filename')
    parser.add_argument('input_attribute_mapping_filename',
                        help='input_attribute_mapping_filename')
    parser.add_argument('input_networking_pairs',
                        help='input_networking_pairs')
    parser.add_argument('input_library_search', help='input_library_search')
    parser.add_argument('output_clusterinfosummary_filename',
                        help='output_clusterinfosummary_filename')
    args = parser.parse_args()
    """Loading group filenames"""
    group_to_files, files_to_groups = load_group_mapping(
        args.input_group_mapping_filename)
    print("Loaded Group Mapping")
    cluster_summary_list = ming_fileio_library.parse_table_with_headers_object_list(
        args.input_clusterinfosummary_file)
    print("Loaded Cluster Summary")

    attribute_to_groups = load_attribute_mapping(
        args.input_attribute_mapping_filename)

    params_object = ming_proteosafe_library.parse_xml_file(open(
        args.param_xml))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        params_object)

    CLUSTER_MIN_SIZE = int(params_object["CLUSTER_MIN_SIZE"][0])
    RUN_MSCLUSTER = params_object["RUN_MSCLUSTER"][0]

    #Calculating the spectrum counts per group
    cluster_to_group_counts = defaultdict(lambda: defaultdict(lambda: 0))
    cluster_to_files = defaultdict(set)
    cluster_to_RT = defaultdict(list)
    line_count = 0
    for line in open(args.input_clusterinfo_file):
        line_count += 1
        if line_count == 1:
            # Skip the header row of the tab-separated cluster info file
            continue
        if line_count % 10000 == 0:
            # Progress indicator for very large cluster info files
            print(line_count)

        splits = line.rstrip().split("\t")
        # Positional columns assumed: 0 = cluster index, 1 = spectrum filename,
        # 6 = retention time -- TODO confirm against the clusterinfo writer
        cluster_index = splits[0]
        filename = os.path.basename(splits[1])
        rt = float(splits[6])

        group_membership = files_to_groups[filename]
        cluster_to_files[cluster_index].add(filename)
        cluster_to_RT[cluster_index].append(rt)

        for group in group_membership:
            cluster_to_group_counts[cluster_index][group] += 1

    if RUN_MSCLUSTER == "on":
        # Only enforce the minimum cluster size when clustering was actually run
        cluster_summary_list = filter_clusters_based_on_cluster_size(
            cluster_summary_list, CLUSTER_MIN_SIZE)

    print(len(cluster_summary_list))

    print("Setting up grouping", len(group_to_files.keys()))
    for cluster_summary_object in cluster_summary_list:
        cluster_index = cluster_summary_object["cluster index"]
        # One count column per group; clusters with no spectra in a group get 0
        for group in group_to_files:
            group_count = 0
            if group in cluster_to_group_counts[cluster_index]:
                group_count = cluster_to_group_counts[cluster_index][group]
            cluster_summary_object[group] = group_count

        # For each attribute, record the groups with at least one spectrum in this cluster
        for attribute in attribute_to_groups:
            groups_to_include = []
            for group in attribute_to_groups[attribute]:
                if group in cluster_summary_object:
                    if cluster_summary_object[group] > 0:
                        groups_to_include.append(group)

            cluster_summary_object[attribute] = ",".join(
                groups_to_include).replace("GNPSGROUP:", "")

    print("Default Attributes")
    calculate_default_attributes(cluster_summary_list, group_to_files.keys())

    print("calculate_cluster_file_stats")
    calculate_cluster_file_stats(cluster_summary_list, cluster_to_files,
                                 mangled_mapping)

    print("rt stats")
    calculate_rt_stats(cluster_summary_list, cluster_to_RT)

    print("populate_network_component")
    populate_network_component(cluster_summary_list,
                               args.input_networking_pairs)

    print("calculate_ancillary_information")
    calculate_ancillary_information(cluster_summary_list,
                                    params_object["task"][0])

    print("populate_network_identifications")
    populate_network_identifications(cluster_summary_list,
                                     args.input_library_search)

    ming_fileio_library.write_list_dict_table_data(
        cluster_summary_list, args.output_clusterinfosummary_filename)
def _write_default_groupings(mangled_file_mapping, file_path_prefix, output_group_file):
    """Write the default G1-G6 groupings derived from the mangled upload names.

    Each uploaded spectrum file carries a mangled-name marker (spec-,
    spectwo-, ... specsix-) that encodes which of the six default groups it
    was uploaded under.  One "GROUP_Gx=path1;path2" line is written per
    group; an empty group yields a bare "GROUP_Gx=" line, matching the
    historical output format.
    """
    # Mangled-name substring that marks membership in each default group.
    group_markers = [
        ("G1", "spec-"),
        ("G2", "spectwo-"),
        ("G3", "specthree-"),
        ("G4", "specfour-"),
        ("G5", "specfive-"),
        ("G6", "specsix-"),
    ]
    for group_key, marker in group_markers:
        members = [
            os.path.join(file_path_prefix, mangled_name.rstrip())
            for mangled_name in mangled_file_mapping.keys()
            if marker in mangled_name
        ]
        output_group_file.write("GROUP_" + group_key + "=" + ";".join(members) + "\n")


def _write_metadata_groupings(metadata_filename, reverse_file_mangling,
                              file_path_prefix, output_group_file, output_attribute_file):
    """Derive group and attribute mappings from a metadata table and write them.

    The table must contain a "filename" column.  Every column whose header
    contains "ATTRIBUTE_" defines one attribute; each non-empty cell in such
    a column names a group the row's file belongs to.  Rows whose filename is
    not part of the analyzed sample set are skipped.  Exits with status 1
    when the "filename" column is missing.
    """
    row_count, table_data = ming_fileio_library.parse_table_with_headers(metadata_filename)

    if not "filename" in table_data:
        print("Missing 'filename' header in metadata file. Please specify the file name that goes along with each piece of metadata with the header: filename")
        exit(1)

    attributes_to_groups_mapping = defaultdict(set)
    group_to_files_mapping = defaultdict(list)
    for i in range(row_count):
        basename_filename = os.path.basename(table_data["filename"][i]).rstrip()
        if basename_filename not in reverse_file_mangling:
            # Filename is not part of the analyzed sample set; skip the row.
            continue
        mangled_name = reverse_file_mangling[basename_filename]
        for key in table_data:
            if "ATTRIBUTE_" not in key:
                continue
            group_name = table_data[key][i]
            if len(group_name) < 1:
                # Empty cell: this file carries no group for this attribute.
                continue
            group_to_files_mapping[group_name].append(os.path.join(file_path_prefix, mangled_name))
            attributes_to_groups_mapping[key.replace("ATTRIBUTE_", "")].add(group_name)

    for group_name in group_to_files_mapping:
        output_group_file.write("GROUP_" + group_name + "=" + ";".join(group_to_files_mapping[group_name]) + "\n")

    for attribute_name in attributes_to_groups_mapping:
        output_attribute_file.write(attribute_name + "=" + ";".join(list(attributes_to_groups_mapping[attribute_name])) + "\n")


def _write_legacy_groupings(groupmapping_filename, reverse_file_mangling,
                            file_path_prefix, output_group_file):
    """Translate a legacy "GROUP_x=file1;file2" mapping file to mangled paths.

    Each listed filename is rewritten to its mangled on-disk path under
    file_path_prefix; filenames not present in the sample set are dropped.
    Malformed lines (no "=") are ignored.
    """
    # errors='ignore' tolerates stray non-UTF-8 bytes in user-provided files.
    for line in open(groupmapping_filename, errors='ignore'):
        splits = line.rstrip().split("=")
        if len(splits) < 2:
            continue
        group_files = []
        for filename in splits[1].split(";"):
            basename = os.path.basename(filename)
            if basename in reverse_file_mangling:
                group_files.append(os.path.join(file_path_prefix, reverse_file_mangling[basename]))
        output_group_file.write(splits[0] + "=" + ";".join(group_files) + "\n")


def main():
    """Produce group and attribute mapping files from proteosafe parameters.

    Always emits the default G1-G6 groupings; then prefers a single metadata
    table when one is present, falling back on legacy group/attribute
    mapping files otherwise.  Exits 1 when multiple metadata or mapping
    files are supplied.
    """
    parser = argparse.ArgumentParser(description='Group Mapping from input, defaults and metadata file')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('groupmapping_folder', help='groupmapping_folder')
    parser.add_argument('attributemapping_folder', help='attributemapping_folder')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_groupmapping_file', help='output_groupmapping_file')
    parser.add_argument('output_attributemapping_file', help='output_attributemapping_file')
    parser.add_argument('inputspectrafolder', help='inputspectrafolder')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.proteosafe_parameters))
    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_obj)
    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_obj)
    file_path_prefix = args.inputspectrafolder

    output_group_file = open(args.output_groupmapping_file, "w")
    output_attribute_file = open(args.output_attributemapping_file, "w")

    # Default groupings are written unconditionally, before any
    # metadata/legacy-derived groups.
    _write_default_groupings(mangled_file_mapping, file_path_prefix, output_group_file)

    # Determine whether to use a metadata table or legacy mapping files.
    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)
    groupmapping_files_in_folder = ming_fileio_library.list_files_in_dir(args.groupmapping_folder)
    attributemapping_files_in_folder = ming_fileio_library.list_files_in_dir(args.attributemapping_folder)

    if len(metadata_files_in_folder) > 1:
        print("Too many metafile inputted")
        exit(1)

    if len(metadata_files_in_folder) == 1:
        # A metadata table takes precedence; legacy mappings are ignored.
        _write_metadata_groupings(metadata_files_in_folder[0], reverse_file_mangling,
                                  file_path_prefix, output_group_file, output_attribute_file)
        # BUGFIX: close (and thus flush) the outputs before terminating; the
        # original exited here without closing, relying on interpreter shutdown.
        output_group_file.close()
        output_attribute_file.close()
        exit(0)

    # Falling back on old group/attribute mapping files.
    if len(groupmapping_files_in_folder) > 1 or len(attributemapping_files_in_folder) > 1:
        print("Too many group/attribute mappings inputted")
        exit(1)

    if len(groupmapping_files_in_folder) == 1:
        _write_legacy_groupings(groupmapping_files_in_folder[0], reverse_file_mangling,
                                file_path_prefix, output_group_file)

    if len(attributemapping_files_in_folder) == 1:
        # Attribute mapping files are passed through verbatim.
        for line in open(attributemapping_files_in_folder[0]):
            output_attribute_file.write(line)

    output_group_file.close()
    output_attribute_file.close()