def grab_single_result(task_id, output_peptide_directory,
                       output_psm_directory):
    """Collect search results for a single ProteoSAFe task.

    Dispatches to the MSGFDB or multipass result grabber depending on
    whether the task has second-pass peptide output, and returns a summary
    dict containing at least "number_psms", "number_peptides" and
    "task_id" (zero counts for failed tasks).
    """
    return_dict = {
        "number_psms": 0,
        "number_peptides": 0,
        "task_id": task_id,
    }

    task_info = ming_proteosafe_library.get_task_information(
        "proteomics2.ucsd.edu", task_id)
    user = task_info["user"]
    # Failed tasks contribute zero counts.
    if task_info["status"] == "FAILED":
        return return_dict

    # Check whether this task has the second-pass peptide output; if not,
    # fall back to plain MSGFDB result extraction.
    path_to_secondpass_peptides_files_list = ming_proteosafe_library.get_proteosafe_result_file_path(
        task_id, user, "updated_eval_psms_with_kl_with_ambiguity")

    if len(path_to_secondpass_peptides_files_list) == 0:
        return grab_results_from_MSGFDB(task_id, user,
                                        output_peptide_directory,
                                        output_psm_directory)

    if len(path_to_secondpass_peptides_files_list) == 1:
        return grab_results_from_multipass(task_id, user,
                                           output_peptide_directory,
                                           output_psm_directory)

    # Fix: the original fell off the end (returning None) when more than one
    # second-pass file was found; return the zero-count summary instead so
    # callers always receive a dict.
    return return_dict
# Exemplo n.º 2 (example-site scrape separator; commented out so the file parses)
def main():
    """Walk an augmentation-task chain and dump its provenance tables.

    Starting from the augment task named in the workflow parameter file
    (sys.argv[1]), repeatedly follows each augment task back to its
    predecessor while recording which search tasks fed each
    extraction/augmentation step.  Writes:

    * sys.argv[2] - JSON provenance map (search task -> augment/extract task)
    * sys.argv[3] - table of all search tasks (with descriptions)
    * sys.argv[4] - table of all augment tasks
    * sys.argv[5] - table of all mzXML/mzML spectrum files per search task
    """
    with open(sys.argv[1]) as params_file:
        params_obj = ming_proteosafe_library.parse_xml_file(params_file)

    augment_task_id = params_obj["task"][0]

    all_tasks_output_dict = defaultdict(list)
    all_augments_output_dict = defaultdict(list)
    all_spectrum_files_output_dict = defaultdict(list)

    search_task_to_augment = {}
    search_task_to_extraction = {}

    all_search_tasks = set()

    process_tree = True
    while process_tree:
        print("AUGMENT", augment_task_id, len(augment_task_id))
        augment_task_information = ming_proteosafe_library.get_task_information(
            "proteomics2.ucsd.edu", augment_task_id)

        extract_task_id = ""
        previous_augment_task_id = ""

        # The predecessor augment task and the extraction task are recovered
        # from the paths of the files this augment task consumed.
        for filename in augment_task_information["files"]:
            if filename.find("unfiltered_peptide_list") != -1:
                previous_augment_task_id = ming_fileio_library.get_root_folder(
                    filename.replace(
                        ming_fileio_library.get_root_folder(filename) + "/",
                        ""))
            if filename.find("extracted_spectra_peptides_merged") != -1:
                extract_task_id = ming_fileio_library.get_root_folder(
                    filename.replace(
                        ming_fileio_library.get_root_folder(filename) + "/",
                        ""))

        # A real task id is a long hex string; anything shorter means we have
        # reached the root of the chain and should stop after this iteration.
        previous_augment_task_id = previous_augment_task_id.strip()
        if len(previous_augment_task_id) < 10:
            process_tree = False

        print(previous_augment_task_id, extract_task_id)

        all_augments_output_dict["augment_task"].append(augment_task_id)
        all_augments_output_dict["extract_task"].append(extract_task_id)
        all_augments_output_dict["precursor_count"].append(0)
        all_augments_output_dict["timestamp"].append(
            augment_task_information["createtime"])

        # Processing the extraction task to find the search tasks it covered.
        extract_task_info = ming_proteosafe_library.get_task_information(
            "proteomics2.ucsd.edu", extract_task_id)
        extract_task_parameters = ming_proteosafe_library.get_task_parameters(
            "proteomics2.ucsd.edu", extract_task_id)

        tasks_to_extract = json.loads(
            extract_task_parameters["tasks_to_consolidate"][0])

        for task in tasks_to_extract:
            search_task_to_augment[task] = augment_task_id
            search_task_to_extraction[task] = extract_task_id

            all_tasks_output_dict["search_task_id"].append(task)
            all_tasks_output_dict["extract_task_id"].append(extract_task_id)
            all_tasks_output_dict["augment_task_id"].append(augment_task_id)

            all_search_tasks.add(task)

        # Additional search tasks may be listed in an uploaded task file.
        print(extract_task_parameters["task_file"][0])
        path_to_task_file = os.path.join(
            "/data/ccms-data/uploads",
            extract_task_parameters["task_file"][0][2:-1])
        if os.path.isfile(path_to_task_file):
            print("SEARCH FILE", path_to_task_file)
            # NOTE(review): the original wrapped this in try/except that
            # unconditionally re-raised (a trailing `continue` was
            # unreachable), so parse errors abort the run - behavior kept.
            row_count, table_data = ming_fileio_library.parse_table_with_headers(
                path_to_task_file)
            print("Rows", row_count)
            for i in range(row_count):
                search_task_id = table_data["TASKID"][i]
                print(i, search_task_id)

                search_task_to_augment[search_task_id] = augment_task_id
                search_task_to_extraction[search_task_id] = extract_task_id

                all_tasks_output_dict["search_task_id"].append(
                    search_task_id)
                all_tasks_output_dict["extract_task_id"].append(
                    extract_task_id)
                all_tasks_output_dict["augment_task_id"].append(
                    augment_task_id)

                all_search_tasks.add(search_task_id)

        augment_task_id = previous_augment_task_id

    print(len(all_search_tasks))

    # Enrich each search task with its description and its mzXML/mzML files.
    for search_task in all_tasks_output_dict["search_task_id"]:
        description = ""
        try:
            print(search_task)
            task_information = ming_proteosafe_library.get_task_information(
                "proteomics2.ucsd.edu", search_task)
            description = task_information["description"]
            for filename in task_information["files"]:
                if filename.find(".mzXML") != -1 or filename.find(
                        ".mzML") != -1:
                    all_spectrum_files_output_dict["spectrum_filename"].append(
                        filename)
                    all_spectrum_files_output_dict["search_task"].append(
                        search_task)
                    all_spectrum_files_output_dict[
                        "search_description"].append(description)
        except KeyboardInterrupt:
            raise
        except Exception:
            print("error", search_task)
        # Fix: append exactly once per task.  The original appended inside the
        # try and again ("") in the except, so a failure after the first
        # append misaligned the search_description column.
        all_tasks_output_dict["search_description"].append(description)

    provenace_structure = {}
    provenace_structure["search_task_to_augment"] = search_task_to_augment
    provenace_structure[
        "search_task_to_extraction"] = search_task_to_extraction

    with open(sys.argv[2], "w") as output_file:
        json.dump(provenace_structure, output_file, indent=4)

    ming_fileio_library.write_dictionary_table_data(all_tasks_output_dict,
                                                    sys.argv[3])
    ming_fileio_library.write_dictionary_table_data(all_augments_output_dict,
                                                    sys.argv[4])
    ming_fileio_library.write_dictionary_table_data(
        all_spectrum_files_output_dict, sys.argv[5])
# Exemplo n.º 3 (example-site scrape separator; commented out so the file parses)
def _safe_metadata_string(path):
    """Best-effort ReDU metadata lookup for *path*; returns "" on failure."""
    try:
        return "|".join(get_metadata_information_per_filename(path))
    except Exception:
        print("ReDU is down")
        return ""


def _rewrite_spectrum_folder(object_list):
    """Rename <dataset>/spectrum paths to <dataset>/ccms_peak in place.

    Fix for the dataset folder renaming from <dataset>/spectrum to
    <dataset>/ccms_peak.  Assumes each filename has at least two
    path components (as the original code did).
    """
    for file_dict in object_list:
        splits = file_dict["filename"].split("/")
        splits[1] = splits[1].replace("spectrum", "ccms_peak")
        file_dict["filename"] = "/".join(splits)


def trace_filename_filesystem(all_datasets,
                              dataset_accession,
                              dataset_scan,
                              enrichmetadata=False):
    """Trace a clustered dataset scan back to its raw source spectra.

    For each dataset object matching *dataset_accession*, looks up the most
    recent continuous-networking job and reads its clustering membership
    table to find the raw spectra behind cluster index *dataset_scan*.

    Returns a (output_file_list, output_match_list) tuple: one entry per
    unique source file and one per matching raw spectrum, respectively.
    When *enrichmetadata* is true, each entry's "metadata" field is filled
    from ReDU (best effort; left "" if the service is unavailable).
    """
    output_file_list = []
    output_match_list = []
    for dataset_object in all_datasets:
        if dataset_object["dataset"] == dataset_accession:
            networking_job = ming_gnps_library.get_most_recent_continuous_networking_of_dataset(
                dataset_object["task"])
            if networking_job is None:
                continue

            networking_task_info = ming_proteosafe_library.get_task_information(
                "gnps.ucsd.edu", networking_job["task"])
            task_user = networking_task_info["user"]

            clustering_path = os.path.join(
                "/data/ccms-data/tasks", task_user, networking_job["task"],
                "allclustered_spectra_info_withpath")
            clustering_files = ming_fileio_library.list_files_in_dir(
                clustering_path)
            # Expect exactly one clustering-info file; skip otherwise.
            if len(clustering_files) != 1:
                continue

            clustering_membership_list = ming_fileio_library.parse_table_with_headers_object_list(
                clustering_files[0])

            acceptable_raw_spectra = [
                spectrum for spectrum in clustering_membership_list
                if spectrum["cluster index"] == str(dataset_scan)
            ]

            for raw_spectrum in acceptable_raw_spectra:
                output_object = {}
                output_object["dataset_id"] = dataset_accession
                output_object["cluster_scan"] = dataset_scan
                output_object["filename"] = raw_spectrum["Original_Path"]
                output_object["filescan"] = raw_spectrum["ScanNumber"]
                output_object["metadata"] = ""
                output_object["basefilename"] = os.path.basename(
                    raw_spectrum["Original_Path"])

                if enrichmetadata:
                    output_object["metadata"] = _safe_metadata_string(
                        raw_spectrum["Original_Path"])

                output_match_list.append(output_object)

            print(len(acceptable_raw_spectra))
            unique_files = list(
                set([
                    spectrum["Original_Path"]
                    for spectrum in acceptable_raw_spectra
                ]))
            print(len(unique_files))
            for source_file in unique_files:
                output_object = {}
                output_object["dataset_id"] = dataset_accession
                output_object["cluster_scan"] = dataset_scan
                output_object["filename"] = source_file
                output_object["metadata"] = ""
                output_object["basefilename"] = os.path.basename(source_file)

                if enrichmetadata:
                    output_object["metadata"] = _safe_metadata_string(
                        source_file)

                output_file_list.append(output_object)

    # Performing a fix to make sure the spectrum is present because of a
    # renaming from <dataset>/spectrum to <dataset>/ccms_peak
    _rewrite_spectrum_folder(output_file_list)
    _rewrite_spectrum_folder(output_match_list)

    return output_file_list, output_match_list