def main():
    """Export PSM-level and peptide-level tables for a two-pass search.

    Positional command-line arguments (``sys.argv``):
        1:  first-pass MSGF+ results TSV (input)
        2:  second-pass MSGF+ results TSV (input)
        3:  FASTA database filename (part of the command line, unused here)
        4:  second-pass proteins filename (part of the command line, unused
            here)
        5:  output: first-pass peptide list
        6:  output: second-pass peptide list
        7:  output: first-pass PSMs after ``filter_to_fdr_by_length(0.01)``
        8:  output: PSMs written by ``update_psm_set_with_second_pass_psms``
            with its default FDR
        9:  output: "original high FDR" first-pass PSMs
        10: output: "updated high FDR" PSMs (FDR argument 0.05)
    """
    first_pass_results_filename = sys.argv[1]
    second_pass_results_filename = sys.argv[2]

    output_first_pass_peptides = sys.argv[5]
    output_second_pass_peptides = sys.argv[6]

    output_psms_first_pass = sys.argv[7]
    output_psms_updated_evalues = sys.argv[8]

    output_original_high_FDR_psms = sys.argv[9]
    output_updated_high_FDR_psms = sys.argv[10]

    def load_deduplicated_psms(results_filename):
        # Load one MSGF+ TSV and drop duplicated rows.
        psm_set = ming_psm_library.PSMset(results_filename)
        psm_set.load_MSGF_Plus_tsvfile(results_filename)
        psm_set.remove_duplicated_rows()
        return psm_set

    def write_filtered_first_pass(fdr, output_filename):
        # Reload the first pass from disk each time because the filter
        # modifies the PSM set it is applied to.
        psm_set = load_deduplicated_psms(first_pass_results_filename)
        psm_set.filter_to_fdr_by_length(fdr)
        with open(output_filename, "w") as output_file:
            psm_set.write_output(output_file)

    def write_unique_peptides(results_filename, output_filename):
        # Collapse the PSMs of one pass into unique peptides and write them.
        psm_set = load_deduplicated_psms(results_filename)
        peptides = library_creation.create_library_unique_peptides_filtered(
            [psm_set], filter_by_length=True)
        with open(output_filename, "w") as output_file:
            peptides.write_output(output_file)

    # Low FDR: original and updated e-values.
    # NOTE(review): the update call relies on the helper's default FDR
    # (0.05 in the sibling definition), not 0.01 -- confirm this is intended.
    write_filtered_first_pass(0.01, output_psms_first_pass)
    update_psm_set_with_second_pass_psms(first_pass_results_filename,
                                         second_pass_results_filename,
                                         output_psms_updated_evalues)

    # High FDR, kept for reporting purposes.
    # NOTE(review): the original-PSM file is filtered at 0.01 here, which
    # makes it identical to the low-FDR file above; 0.05 may have been
    # intended. The threshold is left unchanged pending confirmation.
    write_filtered_first_pass(0.01, output_original_high_FDR_psms)
    update_psm_set_with_second_pass_psms(first_pass_results_filename,
                                         second_pass_results_filename,
                                         output_updated_high_FDR_psms, 0.05)

    # Precursor (peptide) level for each pass.
    write_unique_peptides(first_pass_results_filename,
                          output_first_pass_peptides)
    write_unique_peptides(second_pass_results_filename,
                          output_second_pass_peptides)
Пример #2
0
def grab_results_from_task(task_id, user, output_peptide_directory,
                           output_psm_directory, params_obj,
                           folder_for_results):
    """Collect the PSMs of one ProteoSAFe task and derive its peptide list.

    Args:
        task_id: Task identifier; also used as the stem of the output files.
        user: User passed to the ProteoSAFe result-path lookups.
        output_peptide_directory: Directory receiving ``<task_id>.peptides``.
        output_psm_directory: Directory receiving ``<task_id>.psms``.
        params_obj: Parameters forwarded to ``filter_psms_with_params``.
        folder_for_results: Result folder name in which the PSM file is
            looked up.

    Returns:
        dict with ``task_id``, ``number_psms`` and ``number_peptides``. Both
        counts stay 0 unless exactly one PSM result file is found.
    """
    return_dict = {
        "number_psms": 0,
        "number_peptides": 0,
        "task_id": task_id,
    }

    path_to_psm_files_list = ming_proteosafe_library.get_proteosafe_result_file_path(
        task_id, user, folder_for_results)
    if len(path_to_psm_files_list) != 1:
        return return_dict

    output_psm_path = os.path.join(output_psm_directory, task_id + ".psms")
    path_to_param_file = ming_proteosafe_library.get_proteosafe_result_file_path(
        task_id, user, "params")[0]

    # Original MSGF+ results that include the fragmentation method. The
    # lookup is performed once and reused for both the log line and the path.
    merged_result_paths = ming_proteosafe_library.get_proteosafe_result_file_path(
        task_id, user, "mergedResult")
    print(task_id, user, merged_result_paths)
    path_to_merged_results = merged_result_paths[0]

    print(path_to_psm_files_list[0] + " to " + output_psm_path)
    name_demangle_filenames_and_instrument_collision(
        path_to_psm_files_list[0], output_psm_path, path_to_param_file,
        path_to_merged_results, "filename", "filename")

    # Load the PSM table back (presumably written to output_psm_path by the
    # demangling helper above -- confirm) and apply the parameter filters.
    psm_set = ming_psm_library.PSMset("task results")
    psm_set.load_PSM_tsvfile(output_psm_path, True)
    print("PSM Count", len(psm_set.psms))
    psm_set.psms = filter_psms_with_params(params_obj, psm_set.psms)

    # Tag every PSM with the task it came from.
    for psm in psm_set.psms:
        psm.extra_metadata["proteosafe_task"] = task_id

    print("PSM Count Filtered", len(psm_set.psms))
    psm_set.filter_to_fdr_by_length(0.05)

    # The TSV at output_psm_path is replaced by a pickle of the filtered set.
    with open(output_psm_path, 'wb') as output_pickle:
        pickle.dump(psm_set, output_pickle, pickle.HIGHEST_PROTOCOL)

    output_peptide_path = os.path.join(output_peptide_directory,
                                       task_id + ".peptides")
    peptide_variant_set = save_psms_as_peptides(psm_set, output_peptide_path,
                                                0.05)

    return_dict["number_psms"] = len(psm_set.psms)
    return_dict["number_peptides"] = len(peptide_variant_set.peptide_list)
    return return_dict
def get_first_pass_variant_set(first_pass_results_filename):
    """Load first-pass MSGF+ results and return their unique peptide set.

    The PSMs are filtered with ``filter_to_fdr_by_length(0.05)`` before being
    collapsed by ``create_library_unique_peptides_filtered``. The size of the
    PSM set and of the resulting peptide set are printed.

    Args:
        first_pass_results_filename: Path to the first-pass MSGF+ TSV file.

    Returns:
        The object returned by ``create_library_unique_peptides_filtered``.
    """
    psm_list_first_pass = ming_psm_library.PSMset(first_pass_results_filename)
    psm_list_first_pass.load_MSGF_Plus_tsvfile(first_pass_results_filename)
    psm_list_first_pass.filter_to_fdr_by_length(0.05)
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("First Pass PSMs: " + str(len(psm_list_first_pass)))

    full_peptides_list_first_pass = library_creation.create_library_unique_peptides_filtered(
        [psm_list_first_pass], filter_by_length=True)
    print("First Pass Variants: " + str(len(full_peptides_list_first_pass)))

    return full_peptides_list_first_pass
Пример #4
0
def save_psms_as_peptides(psm_set, output_peptide_path, fdr):
    """Pickle the best PSM of every unique peptide derived from ``psm_set``.

    Args:
        psm_set: PSM set handed to ``create_library_unique_peptides_filtered``.
        output_peptide_path: Destination of the pickled PSMset.
        fdr: Passed positionally as the second argument of
            ``create_library_unique_peptides_filtered``.

    Returns:
        The peptide variant set built from ``psm_set``.
    """
    peptide_variant_set = library_creation.create_library_unique_peptides_filtered(
        [psm_set], fdr, filter_by_length=True)

    # Use a separate name so the caller's psm_set parameter is not shadowed.
    best_psm_set = ming_psm_library.PSMset("task results")
    for peptide in peptide_variant_set.peptide_list:
        best_psm_set.psms.append(peptide.get_best_psm())

    # The context manager closes the file even if pickling fails.
    with open(output_peptide_path, 'wb') as output_pickle:
        pickle.dump(best_psm_set, output_pickle, pickle.HIGHEST_PROTOCOL)

    return peptide_variant_set
def update_psm_set_with_second_pass_psms(first_pass_psms,
                                         second_pass_psms,
                                         output_psms,
                                         FDR=0.05):
    """Update first-pass PSMs from the second pass, filter, and write them.

    Both result files are loaded as MSGF+ TSVs and de-duplicated, the pair is
    handed to ``update_evalues_first_second_pass``, and the first-pass set is
    then filtered with ``filter_to_fdr_by_length(FDR)`` and written out with
    its extra metadata.

    Args:
        first_pass_psms: Path to the first-pass MSGF+ TSV file.
        second_pass_psms: Path to the second-pass MSGF+ TSV file.
        output_psms: Path of the TSV file to write.
        FDR: Threshold passed to ``filter_to_fdr_by_length``.
    """
    print("Loading second pass PSMs", second_pass_psms)
    psm_list_second_pass = ming_psm_library.PSMset(second_pass_psms)
    psm_list_second_pass.load_MSGF_Plus_tsvfile(second_pass_psms)
    psm_list_second_pass.remove_duplicated_rows()

    print("Loading first pass PSMs", first_pass_psms)
    psm_list_first_pass = ming_psm_library.PSMset(first_pass_psms)
    psm_list_first_pass.load_MSGF_Plus_tsvfile(first_pass_psms)
    psm_list_first_pass.remove_duplicated_rows()

    update_evalues_first_second_pass(psm_list_first_pass, psm_list_second_pass)

    psm_list_first_pass.filter_to_fdr_by_length(FDR)

    # Write the results; the context manager guarantees the file is closed.
    with open(output_psms, "w") as output_file:
        psm_list_first_pass.write_output(output_file,
                                         write_extra_metadata=True)
Пример #6
0
def main():
    """Write the unique peptide list and the full peptide table of a search.

    Positional command-line arguments (``sys.argv``):
        1: MSGF+ search results TSV (input)
        2: output: one stripped peptide sequence per line, taken from the
           peptides kept at 0.01
        3: output: peptide table built with threshold 1.0 (per the variable
           name, decoys are expected to be present -- confirm)
    """
    input_searchresults_filename = sys.argv[1]
    output_peptide_list = sys.argv[2]
    output_peptide_list_with_decoy_filename = sys.argv[3]

    psm_list = ming_psm_library.PSMset(input_searchresults_filename)
    psm_list.load_MSGF_Plus_tsvfile(input_searchresults_filename)
    psm_list.filter_to_fdr_by_length(0.01)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(len(psm_list))

    full_peptides_list = library_creation.create_library_unique_peptides_filtered(
        [psm_list], fdr=0.01, filter_by_length=True)

    # De-duplicate the stripped sequences; sorting makes the output file
    # reproducible instead of depending on set iteration order.
    unique_peptides = sorted(set(
        peptide.get_stripped_sequence()
        for peptide in full_peptides_list.peptide_list))

    with open(output_peptide_list, "w") as output_file:
        output_file.writelines(peptide + "\n" for peptide in unique_peptides)

    # Reload the PSMs, keep all variants, and write them out with the
    # 1.0 threshold so nothing is filtered away.
    print("GIVING US FULL RESULT SET")
    psm_list = ming_psm_library.PSMset(input_searchresults_filename)
    psm_list.load_MSGF_Plus_tsvfile(input_searchresults_filename)
    full_peptides_list = library_creation.create_library_unique_peptides_filtered(
        [psm_list], 1.0)

    with open(output_peptide_list_with_decoy_filename,
              "w") as output_peptide_list_with_decoy_file:
        output_peptide_list_with_decoy_file.write(
            ming_psm_library.PeptideVariant.output_header() + "\n")
        for peptide in full_peptides_list.peptide_list:
            output_peptide_list_with_decoy_file.write(str(peptide) + "\n")
Пример #7
0
def main():
    """Write per-protein target and decoy peptide counts as a TSV table.

    Positional command-line arguments (``sys.argv``):
        1: FASTA proteome file (input)
        2: MSGF+ search results TSV (input)
        3: output table with columns protein, decoy_count, target_count,
           total_count and length
    """
    input_fasta_filename = sys.argv[1]
    input_searchresults_filename = sys.argv[2]
    output_proteins_as_list = sys.argv[3]

    proteome = ming_protein_library.parse_fasta_proteome_file(
        input_fasta_filename)

    psm_list = ming_psm_library.PSMset(input_searchresults_filename)
    psm_list.load_MSGF_Plus_tsvfile(input_searchresults_filename)

    full_peptides_list = library_creation.create_library_unique_peptides_filtered(
        [psm_list], 0.01, filter_by_length=True)

    target_peptide_strings = []
    decoy_peptide_strings = []
    for peptide_obj in full_peptides_list.peptide_list:
        peptide_to_search = peptide_obj.get_stripped_sequence()
        if peptide_obj.is_decoy():
            # Decoy sequences are reversed before being looked up in the
            # proteome (presumably decoys are reversed targets -- confirm).
            decoy_peptide_strings.append(peptide_to_search[::-1])
        else:
            target_peptide_strings.append(peptide_to_search)

    protein_coverage_of_targets = proteome.get_proteins_with_number_of_peptides_covered_map(
        target_peptide_strings)
    protein_coverage_of_decoys = proteome.get_proteins_with_number_of_peptides_covered_map(
        decoy_peptide_strings)

    # The context manager closes the table even if a row fails to build.
    with open(output_proteins_as_list, "w") as output_file:
        output_file.write(
            "protein\tdecoy_count\ttarget_count\ttotal_count\tlength\n")

        for protein in protein_coverage_of_targets:
            decoy_count = protein_coverage_of_decoys[protein]
            target_count = protein_coverage_of_targets[protein]
            row = [
                protein,
                str(decoy_count),
                str(target_count),
                str(target_count + decoy_count),
                str(len(proteome.protein_map[protein].sequence)),
            ]
            output_file.write("\t".join(row) + "\n")
def main():
    """Merge this node's share of pickled PSM sets into one PSM table.

    Positional command-line arguments (``sys.argv``):
        1: JSON file providing ``node_partition`` and ``total_paritions``
        2: parameter XML file (parsed, but not otherwise used here)
        3: folder containing the pickled PSM sets to merge
        4: output folder; the result is written to ``<node_partition>.psms``
    """
    with open(sys.argv[1]) as parallel_file:
        parallel_json = json.loads(parallel_file.read())
    params_filename = sys.argv[2]
    input_folder_of_results = sys.argv[3]
    output_folder = sys.argv[4]

    my_node = parallel_json["node_partition"]
    # "total_paritions" (sic) is the key present in the partition JSON.
    total_node = parallel_json["total_paritions"]

    all_input_files = ming_fileio_library.list_files_in_dir(input_folder_of_results)
    all_input_files.sort()

    ###
    ### TODO We will have to read parameters and see if we need to eliminate some PSMs, with PSM FDR filter, KL Filter, ambiguity score filter, unique intensity filter
    ###

    # assumes parse_xml_file consumes the handle before returning -- TODO confirm
    with open(params_filename) as params_file:
        params_obj = ming_proteosafe_library.parse_xml_file(params_file)

    # Every total_node-th file, starting at this node's index, belongs to us.
    all_input_files = all_input_files[my_node::total_node]
    current_working_psm_set = ming_psm_library.PSMset("Ming")

    for total_file_count, input_file in enumerate(all_input_files, 1):
        # The inputs are treated as pickled PSM sets and concatenated.
        print(input_file, total_file_count, "of", len(all_input_files))
        # NOTE: pickle.load executes arbitrary code from the file; the input
        # folder must only contain trusted, pipeline-generated files.
        with open(input_file, 'rb') as input_pickle:
            temp_psm_set = pickle.load(input_pickle)
        print("Loaded", len(temp_psm_set.psms))

        current_working_psm_set.psms.extend(temp_psm_set.psms)

    # Save the merged PSMs.
    output_filename = os.path.join(output_folder, str(my_node) + ".psms")
    with open(output_filename, "w") as output_file:
        current_working_psm_set.write_output(output_file, True)
def grab_results_from_multipass(task_id, user, output_peptide_directory,
                                output_psm_directory):
    """Collect the PSMs of a multipass task and write its peptide list.

    Args:
        task_id: Task identifier; also used as the stem of the output files.
        user: User passed to the ProteoSAFe result-path lookups.
        output_peptide_directory: Directory receiving ``<task_id>.peptides``.
        output_psm_directory: Directory receiving ``<task_id>.psms``.

    Returns:
        dict with ``task_id``, ``number_psms`` and ``number_peptides``. Both
        counts stay 0 unless exactly one
        ``updated_eval_psms_with_kl_with_ambiguity`` file is found.
    """
    return_dict = {
        "number_psms": 0,
        "number_peptides": 0,
        "task_id": task_id,
    }

    path_to_psm_files_list = ming_proteosafe_library.get_proteosafe_result_file_path(
        task_id, user, "updated_eval_psms_with_kl_with_ambiguity")
    if len(path_to_psm_files_list) != 1:
        return return_dict

    output_psm_path = os.path.join(output_psm_directory, task_id + ".psms")
    path_to_param_file = ming_proteosafe_library.get_proteosafe_result_file_path(
        task_id, user, "params")[0]

    # The lookup is performed once and reused for the log line and the path.
    merged_result_paths = ming_proteosafe_library.get_proteosafe_result_file_path(
        task_id, user, "mergedResult")
    print(merged_result_paths)
    path_to_merged_results = merged_result_paths[0]

    print(path_to_psm_files_list[0] + " to " + output_psm_path)
    name_demangle_filenames_and_instrument_collision(
        path_to_psm_files_list[0], output_psm_path, path_to_param_file,
        path_to_merged_results, "filename", "filename")

    # Build the peptide list from the PSM table at output_psm_path
    # (presumably written by the demangling helper above -- confirm).
    psm_set = ming_psm_library.PSMset("task results")
    psm_set.load_PSM_tsvfile(output_psm_path)

    peptide_variant_set = library_creation.create_library_unique_peptides_filtered(
        [psm_set], 0.01)

    output_peptide_path = os.path.join(output_peptide_directory,
                                       task_id + ".peptides")
    with open(output_peptide_path, "w") as output_peptide_file:
        peptide_variant_set.write_output(output_peptide_file)

    return_dict["number_psms"] = len(psm_set.psms)
    return_dict["number_peptides"] = len(peptide_variant_set.peptide_list)
    return return_dict
Пример #10
0
def main():
    """Filter the PSMs for one spectrum file and write the surviving set.

    Positional command-line arguments (``sys.argv``):
        1: parameter XML file
        2: spectrum file currently being processed
        3: folder joined with the first target filename to locate the target
           spectrum file
        4: MSGF+ PSM results TSV (input)
        5: folder of collision-energy metadata
        6: output PSM file
    """
    paramxml_filename = sys.argv[1]
    input_spectrum_filename = sys.argv[2]
    input_spectrum_all = sys.argv[3]
    psms_input_file = sys.argv[4]
    input_collision_energy_folder = sys.argv[5]
    output_psms_file = sys.argv[6]

    # assumes parse_xml_file consumes the handle before returning -- TODO confirm
    with open(paramxml_filename) as paramxml_file:
        parameters_obj = ming_proteosafe_library.parse_xml_file(paramxml_file)
    scan_metadata_maps = load_collision_energy_mapping(input_collision_energy_folder)

    target_filename_list, decoy_filename_list = determine_set_of_target_and_decoy_spectrum_files(parameters_obj)

    input_psm_set = ming_psm_library.PSMset("input psms")
    input_psm_set.load_MSGF_Plus_tsvfile(psms_input_file)

    # Filtering on collision energy
    print("Size Before Filtering", len(input_psm_set.psms))
    filter_psms_to_acceptable_metadata(input_psm_set, scan_metadata_maps, parameters_obj)
    print("Size After CE Filtering", len(input_psm_set.psms))

    # Filtering to the current file
    current_file_psms = get_psms_to_current_file(input_psm_set, input_spectrum_filename)
    target_file_psms = get_psms_to_target_file(input_psm_set, target_filename_list)
    print(len(current_file_psms), len(target_file_psms))

    if os.path.basename(input_spectrum_filename) in target_filename_list:
        # Target file: no decoy filtering, keep the target PSMs as they are.
        print("Target")
        output_decoys_list = target_file_psms
    else:
        # Decoy file: drop blacklisted decoy peptides, then apply the
        # high-scoring-decoy filter against the first target spectrum file.
        blacklisted_decoy_peptides = json.loads(parameters_obj["blacklisted_decoy_peptides_json"][0])
        current_file_psms = filtering_out_blacklisted_decoys(current_file_psms, blacklisted_decoy_peptides)
        output_decoys_list = filtering_out_high_scoring_decoys(current_file_psms, target_file_psms, os.path.join(input_spectrum_all, target_filename_list[0]), input_spectrum_filename)

    output_decoys_list = filtering_redundant_identifications_per_scan(output_decoys_list)
    input_psm_set.psms = output_decoys_list

    with open(output_psms_file, "w") as output_file:
        input_psm_set.write_output(output_file)
Пример #11
0
def main():
    """List the proteins selected by ``get_proteins_covered_by_k_peptides``.

    Positional command-line arguments (``sys.argv``):
        1: FASTA proteome file (input)
        2: MSGF+ search results TSV (input)
        3: output file receiving the protein names as a JSON list (the
           variable is named ``output_fasta_filename`` but JSON is written)
        4: output text file with a ``Protein`` header and one name per line

    Exits the interpreter with status 0 when done.
    """
    input_fasta_filename = sys.argv[1]
    input_searchresults_filename = sys.argv[2]
    output_fasta_filename = sys.argv[3]
    output_proteins_as_list = sys.argv[4]

    proteome = ming_protein_library.parse_fasta_proteome_file(input_fasta_filename)

    psm_list = ming_psm_library.PSMset(input_searchresults_filename)
    psm_list.load_MSGF_Plus_tsvfile(input_searchresults_filename)
    psm_list.filter_to_fdr_by_length(0.01)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(len(psm_list))

    full_peptides_list = library_creation.create_library_unique_peptides_filtered([psm_list], fdr=0.01, filter_by_length=True)

    all_peptide_strings = [
        peptide_obj.get_stripped_sequence()
        for peptide_obj in full_peptides_list.peptide_list
    ]

    all_proteins = proteome.get_proteins_covered_by_k_peptides(all_peptide_strings, 2, True)
    all_protein_names = [protein.protein for protein in all_proteins]

    with open(output_fasta_filename, "w") as output_protein_file:
        output_protein_file.write(json.dumps(all_protein_names))

    # Outputting the list of proteins, one per line.
    with open(output_proteins_as_list, "w") as output_protein_list_file:
        output_protein_list_file.write("Protein\n")
        for protein in all_protein_names:
            output_protein_list_file.write(protein + "\n")

    # sys.exit is used instead of the site-provided exit() helper, which is
    # not guaranteed to exist (e.g. when Python runs with -S).
    sys.exit(0)
Пример #12
0
def create_library_merged_psm_list_separate_fdr_peptide_length(
        psm_set, fdr=0.01):
    """Build a combined peptide variant set, filtering each length separately.

    The PSMs are bucketed by the length of their stripped sequence. Each
    bucket is turned into its own PeptideVariantSet, filtered with
    ``filter_to_fdr(fdr)``, and then added to the combined set.

    Args:
        psm_set: Object whose ``psms`` attribute holds the PSMs to process.
        fdr: Threshold passed to ``filter_to_fdr`` for every length bucket.

    Returns:
        The combined ``PeptideVariantSet`` named "Combined".
    """
    # Bucket the PSMs by stripped-sequence length.
    psms_by_length = {}
    for current_psm in psm_set.psms:
        sequence_length = len(current_psm.get_stripped_sequence())
        try:
            length_bucket = psms_by_length[sequence_length]
        except KeyError:
            length_bucket = ming_psm_library.PSMset(
                "length" + str(sequence_length))
            psms_by_length[sequence_length] = length_bucket
        length_bucket.psms.append(current_psm)

    # Filter each length bucket on its own, then merge the survivors.
    combined_variants = ming_psm_library.PeptideVariantSet("Combined")
    for length_bucket in psms_by_length.values():
        bucket_variants = ming_psm_library.PeptideVariantSet("Test")
        bucket_variants.add_psms_set(length_bucket)
        bucket_variants.filter_to_fdr(fdr)
        combined_variants.add_variant_set(bucket_variants)

    return combined_variants
Пример #13
0
def main():
    """Filter PSMs and append KL columns to the written PSM table.

    Positional command-line arguments (``sys.argv``):
        1: parameter XML file (parsed, but not otherwise used here)
        2: MSGF+ PSM results TSV (input)
        3: KL table with the columns ``Filename``, ``Scan``, ``KL Strict``,
           ``KL`` and ``Interpeak intensity``
        4: output PSM table; written once by the PSM set and then rewritten
           with the added ``kl_strict``, ``kl_unstrict`` and ``kl_interpeak``
           columns
    """
    paramxml_filename = sys.argv[1]
    psms_input_file = sys.argv[2]
    kl_input_file = sys.argv[3]
    output_psms_file = sys.argv[4]

    # assumes parse_xml_file consumes the handle before returning -- TODO confirm
    with open(paramxml_filename) as paramxml_file:
        parameters_obj = ming_proteosafe_library.parse_xml_file(paramxml_file)

    input_psm_set = ming_psm_library.PSMset("input psms")
    input_psm_set.load_MSGF_Plus_tsvfile(psms_input_file)
    input_psm_set.remove_redundant_psms()
    input_psm_set.filter_to_fdr_by_length(0.01)

    # Index the KL rows by "<file basename>:<scan>".
    row_count, kl_data = ming_fileio_library.parse_table_with_headers(
        kl_input_file)
    kl_dict = {}
    for i in range(row_count):
        filename = os.path.basename(kl_data["Filename"][i])
        key = filename + ":" + str(kl_data["Scan"][i])
        kl_dict[key] = {
            "kl_strict": kl_data["KL Strict"][i],
            "kl_unstrict": kl_data["KL"][i],
            "kl_interpeak": kl_data["Interpeak intensity"][i]
        }

    with open(output_psms_file, "w") as output_file:
        input_psm_set.write_output(output_file)

    # Since we don't support more fields in the psm object, read the file
    # back as a plain table and add the columns as necessary.
    psm_rows, psm_table_data = ming_fileio_library.parse_table_with_headers(
        output_psms_file)
    kl_columns = ("kl_strict", "kl_unstrict", "kl_interpeak")
    for column in kl_columns:
        psm_table_data[column] = []

    for i in range(psm_rows):
        key = psm_table_data["filename"][i] + ":" + psm_table_data["scan"][i]
        kl_values = kl_dict.get(key)
        for column in kl_columns:
            # -1 marks PSMs that have no row in the KL table.
            psm_table_data[column].append(
                -1 if kl_values is None else kl_values[column])

    ming_fileio_library.write_dictionary_table_data(psm_table_data,
                                                    output_psms_file)
Пример #14
0
def main():
    """Split synthetic PSMs into kept and decoy sets and add extra columns.

    Positional command-line arguments (``sys.argv``):
        1: parameter XML file
        2: PSM TSV with extra metadata (input)
        3: KL table with the columns ``Filename``, ``Scan``, ``KL Strict``,
           ``KL`` and ``Interpeak intensity``
        4: output PSM table; written once by the PSM set and then rewritten
           with the KL and ambiguity placeholder columns
        5: output table for the decoy PSM set
    """
    print(sys.argv)
    paramxml_filename = sys.argv[1]
    psms_input_file = sys.argv[2]
    kl_input_file = sys.argv[3]

    output_psms_file = sys.argv[4]
    output_decoy_psms_file = sys.argv[5]

    # assumes parse_xml_file consumes the handle before returning -- TODO confirm
    with open(paramxml_filename) as paramxml_file:
        parameters_obj = ming_proteosafe_library.parse_xml_file(paramxml_file)

    target_filename_list, decoy_filename_list = determine_set_of_target_and_decoy_spectrum_files(
        parameters_obj)

    input_psm_set = ming_psm_library.PSMset("input psms")
    input_psm_set.load_PSM_tsvfile(psms_input_file, load_extra_metadata=True)

    # The decoy set must be derived before the input set is filtered below.
    decoy_psm_set = ming_psm_library.PSMset("decoy psms")
    decoy_psm_set.psms = input_psm_set.synthetic_psms_by_length_decoy_set(
        target_filename_list, decoy_filename_list)

    print("GETTING ALL SYNETHTIC with 0% FDR")
    input_psm_set.filter_synthetic_psms_by_length(target_filename_list,
                                                  decoy_filename_list)

    # Index the KL rows by "<file basename>:<scan>".
    row_count, kl_data = ming_fileio_library.parse_table_with_headers(
        kl_input_file)
    kl_dict = {}
    for i in range(row_count):
        filename = os.path.basename(kl_data["Filename"][i])
        key = filename + ":" + str(kl_data["Scan"][i])
        kl_dict[key] = {
            "kl_strict": kl_data["KL Strict"][i],
            "kl_unstrict": kl_data["KL"][i],
            "kl_interpeak": kl_data["Interpeak intensity"][i]
        }

    with open(output_psms_file, "w") as output_file:
        input_psm_set.write_output(output_file, write_extra_metadata=True)
    with open(output_decoy_psms_file, "w") as decoy_output_file:
        decoy_psm_set.write_output(decoy_output_file,
                                   write_extra_metadata=True)

    # Since we don't support more fields in the psm object, read the file
    # back as a plain table and add the columns as necessary.
    psm_rows, psm_table_data = ming_fileio_library.parse_table_with_headers(
        output_psms_file)

    kl_columns = ("kl_strict", "kl_unstrict", "kl_interpeak")
    # Columns are created in this order so the table layout is unchanged.
    new_columns = kl_columns + (
        "ambiguity_total_score",
        "first_second_unique_ratio",
        "first_unique_count",
        "first_unique_intensity",
        "numberpsms",
        "second_unique_count",
        "second_unique_intensity",
        "spectrum_unique_key",
        "modified_sequence",
    )
    for column in new_columns:
        psm_table_data[column] = []

    for i in range(psm_rows):
        key = psm_table_data["filename"][i] + ":" + psm_table_data["scan"][i]
        kl_values = kl_dict.get(key)
        for column in kl_columns:
            # -1 marks PSMs that have no row in the KL table.
            psm_table_data[column].append(
                -1 if kl_values is None else kl_values[column])

        # Writing the ambiguity columns, but just assuming no ambiguity.
        psm_table_data["ambiguity_total_score"].append("-1")
        psm_table_data["first_second_unique_ratio"].append("-1")
        psm_table_data["first_unique_count"].append("-1")
        psm_table_data["first_unique_intensity"].append("-1")
        psm_table_data["numberpsms"].append(1)
        psm_table_data["second_unique_count"].append("-1")
        psm_table_data["second_unique_intensity"].append("-1")
        psm_table_data["spectrum_unique_key"].append(key)
        # Drops the last two characters of the sequence (presumably a
        # ".<charge>" suffix -- confirm against the PSM writer).
        psm_table_data["modified_sequence"].append(
            psm_table_data["sequence"][i][:-2])

    ming_fileio_library.write_dictionary_table_data(psm_table_data,
                                                    output_psms_file)
def get_second_pass_psms(second_pass_results_filename):
    """Load the second-pass MSGF+ results TSV into a new PSMset.

    Args:
        second_pass_results_filename: Path to the MSGF+ TSV file; also used
            as the name given to the PSMset.

    Returns:
        The populated PSMset (no de-duplication or filtering is applied).
    """
    second_pass_set = ming_psm_library.PSMset(second_pass_results_filename)
    second_pass_set.load_MSGF_Plus_tsvfile(second_pass_results_filename)
    return second_pass_set
Пример #16
0
def main():
    """Extract spectra for a PSM table and distribute them over JSON bins.

    Positional command-line arguments (``sys.argv``):
        1: parameter XML file
        2: PSM TSV with extra metadata (input)
        3: folder receiving the ``<input stem>_partition_<i>.json`` files
        4: number of output bins (integer)

    Each bin file holds a JSON array. A spectrum is assigned to a bin by the
    SHA-1 of its ``annotation`` modulo the number of bins, so identical
    annotations always land in the same bin.
    """
    input_paramxml = sys.argv[1]
    input_tsv_filename = sys.argv[2]
    intermediate_output_folder = sys.argv[3]
    output_file_bins = int(sys.argv[4])

    # assumes parse_xml_file consumes the handle before returning -- TODO confirm
    with open(input_paramxml) as paramxml_file:
        params_obj = ming_proteosafe_library.parse_xml_file(paramxml_file)
    snr_threshold = get_snr_filter(params_obj)

    # Filtering criteria (defaults used when the parameters are unavailable).
    minimum_explained_intensity = 0.0
    min_number_of_peaks_within_1_percent_of_max = 0.0
    min_signal_peaks = 0.0
    min_number_of_annotated_ions = 0.0
    max_kl_strict_score = 50
    max_ppm_error = 100000000

    try:
        minimum_explained_intensity = float(
            params_obj["min_explained_intensity"][0])
        min_number_of_peaks_within_1_percent_of_max = float(
            params_obj["min_number_of_peaks_within_1_percent_of_max"][0])
        min_signal_peaks = float(params_obj["min_signal_peaks"][0])
        min_number_of_annotated_ions = float(
            params_obj["min_number_of_annotated_ions"][0])
        # NOTE(review): max_kl_strict_score is parsed but never passed to
        # extract_psms_from_filename below. The lookup is kept because a
        # missing "kl_strict_max" still triggers the fallback.
        max_kl_strict_score = float(params_obj["kl_strict_max"][0])
        if max_kl_strict_score == 0:
            max_kl_strict_score = 50
        max_ppm_error = float(params_obj["max_ppm_error"][0])
    except Exception as error:
        # Best-effort fallback: any failure while reading the parameters
        # resets the thresholds below. Unlike a bare ``except:``, this does
        # not swallow KeyboardInterrupt or SystemExit.
        # NOTE(review): min_number_of_annotated_ions and max_ppm_error are
        # not reset here, matching the previous behavior -- confirm intent.
        print("exception", repr(error))
        minimum_explained_intensity = 0.0
        min_number_of_peaks_within_1_percent_of_max = 0.0
        min_signal_peaks = 0.0
        max_kl_strict_score = 50

    psm_set = ming_psm_library.PSMset("")
    psm_set.load_PSM_tsvfile(input_tsv_filename, load_extra_metadata=True)

    filename_to_psm_dict = group_psms_by_filename(psm_set)

    # All output files; spectra are binned into them as they are extracted.
    output_filename_prefix = os.path.join(
        intermediate_output_folder,
        ming_fileio_library.get_filename_without_extension(
            os.path.basename(input_tsv_filename)) + "_partition_")
    output_files = {}
    output_files_number_spectra = {}
    try:
        for i in range(output_file_bins):
            output_file = open(output_filename_prefix + str(i) + ".json", "w")
            output_files[i] = output_file
            output_files_number_spectra[i] = 0
            output_file.write("[")

        for filename in filename_to_psm_dict:
            extracted_spectra = extract_psms_from_filename(
                filename, filename_to_psm_dict[filename], snr_threshold,
                minimum_explained_intensity, min_signal_peaks,
                min_number_of_peaks_within_1_percent_of_max,
                min_number_of_annotated_ions, max_ppm_error)
            for spectrum in extracted_spectra:
                annotation_digest = hashlib.sha1(
                    spectrum["annotation"].encode('utf-8')).hexdigest()
                hashed_index = int(annotation_digest, 16) % output_file_bins
                # Every element after the first is preceded by a comma so
                # the file remains a valid JSON array.
                separator = (
                    "" if output_files_number_spectra[hashed_index] == 0
                    else ",")
                output_files[hashed_index].write(
                    separator + json.dumps(spectrum) + "\n")
                output_files_number_spectra[hashed_index] += 1

        for i in range(output_file_bins):
            output_files[i].write("]")
    finally:
        # Close every bin that was opened, even if extraction failed.
        for output_file in output_files.values():
            output_file.close()