예제 #1
0
def sort_bgzip_tabix_release_files(bgzip_path, tabix_path, vcf_sort_script_path, assembly_accession,
                                   species_release_folder):
    """
    Sort, compress and index the release files of one assembly using a single chained command.

    For each category in release_vcf_file_categories, any existing release VCF is removed and
    vcf_sort_script_path is invoked on the unsorted VCF to produce the release VCF, followed by the
    commands returned by get_bgzip_tabix_commands_for_file. For each category in
    release_text_file_categories, the unsorted text file is piped through "sort -V | uniq" into the
    release text file, which is then gzipped alongside the uncompressed copy.
    All steps are joined with " && " and handed to run_command_with_output as one command string.

    :param bgzip_path: Forwarded to get_bgzip_tabix_commands_for_file
    :param tabix_path: Forwarded to get_bgzip_tabix_commands_for_file
    :param vcf_sort_script_path: Script invoked as "<script> -f <unsorted VCF> <sorted VCF>"
    :param assembly_accession: Assembly whose release files are processed
    :param species_release_folder: Folder holding one sub-folder per assembly accession
    """
    # The sort_vcf_sorted_chromosomes.sh script leaves *.chromosomes files behind;
    # deleting them up front keeps reruns idempotent.
    pipeline_steps = ["rm -f {0}/{1}/*.chromosomes".format(species_release_folder, assembly_accession)]

    for vcf_category in release_vcf_file_categories:
        unsorted_vcf = get_unsorted_release_vcf_file_name(species_release_folder, assembly_accession, vcf_category)
        sorted_vcf = get_release_vcf_file_name(species_release_folder, assembly_accession, vcf_category)
        sort_vcf_step = "rm -f {2} && {0} -f {1} {2}".format(vcf_sort_script_path, unsorted_vcf, sorted_vcf)
        pipeline_steps.append(sort_vcf_step)
        pipeline_steps += get_bgzip_tabix_commands_for_file(bgzip_path, tabix_path, sorted_vcf)

    for text_category in release_text_file_categories:
        unsorted_text = get_unsorted_release_text_file_name(species_release_folder, assembly_accession,
                                                            text_category)
        sorted_text = get_release_text_file_name(species_release_folder, assembly_accession, text_category)
        # The template only references {1} and {2}; the script path at position 0 is not used by it.
        sort_text_step = "(sort -V {1} | uniq > {2})".format(vcf_sort_script_path, unsorted_text, sorted_text)
        gzip_text_step = "(gzip < {0} > {0}.gz)".format(sorted_text)
        pipeline_steps += [sort_text_step, gzip_text_step]

    run_command_with_output("Sort, bgzip and tabix release files for assembly: " + assembly_accession,
                            " && ".join(pipeline_steps))
def count_rs_ids_in_release_files(count_ids_script_path, assembly_accession,
                                  species_release_folder):
    """
    Write README_rs_ids_counts.txt with the RS ID counts of an assembly's release files.

    The file is (re)created in <species_release_folder>/<assembly_accession>/ and starts with a
    "# Unique RS ID counts" header line. For each category in release_vcf_file_categories, the
    output of count_ids_script_path run on the gzipped release VCF is written verbatim. For each
    category in release_text_file_categories, a "<file name>.gz<TAB><count>" entry is written,
    where the count comes from "zcat | cut -f1 | uniq | wc -l" on the gzipped text file.

    NOTE(review): no newline is appended to the command outputs here - presumably
    run_command_with_output returns them newline-terminated; confirm against that helper.

    :param count_ids_script_path: Script invoked with the gzipped release VCF as its only argument
    :param assembly_accession: Assembly whose release files are counted
    :param species_release_folder: Folder holding one sub-folder per assembly accession
    """
    counts_readme_path = os.path.join(species_release_folder, assembly_accession, "README_rs_ids_counts.txt")
    with open(counts_readme_path, "w") as counts_readme:
        counts_readme.write("# Unique RS ID counts\n")

        for vcf_category in release_vcf_file_categories:
            vcf_path = get_release_vcf_file_name(species_release_folder, assembly_accession, vcf_category)
            vcf_count_command = "{0} {1}.gz".format(count_ids_script_path, vcf_path)
            script_output = run_command_with_output("Counting RS IDs in file: " + vcf_path,
                                                    vcf_count_command,
                                                    return_process_output=True)
            counts_readme.write(script_output)

        for text_category in release_text_file_categories:
            text_path = get_release_text_file_name(species_release_folder, assembly_accession, text_category)
            text_count_command = "zcat {0}.gz | cut -f1 | uniq | wc -l".format(text_path)
            id_count = run_command_with_output("Counting RS IDs in file: " + text_path,
                                               text_count_command,
                                               return_process_output=True)
            readme_entry = "{0}.gz\t{1}".format(os.path.basename(text_path), str(id_count))
            counts_readme.write(readme_entry)
def validate_release_vcf_files(private_config_xml_file, profile, taxonomy_id,
                               assembly_accession,
                               release_species_inventory_table,
                               release_version, species_release_folder,
                               vcf_validator_path, assembly_checker_path):
    """
    Run the VCF validator and the assembly checker on the release VCFs of an assembly.

    Outputs matching vcf_validation_output_file_pattern and asm_report_output_file_pattern in the
    assembly's release folder are removed first. The FASTA and assembly report paths are then read
    from the release inventory through a metadata database connection. For every category in
    release_vcf_file_categories whose name does not contain "multimap", one validator command and
    one assembly checker command are queued, both writing into the directory of the release VCF.
    The queued commands are joined with " ; " and run as one command string.

    :param private_config_xml_file: Settings file used to resolve the metadata database URI
    :param profile: Profile used to resolve the metadata database URI
    :param taxonomy_id: Forwarded to get_release_inventory_info_for_assembly
    :param assembly_accession: Assembly whose release VCFs are validated
    :param release_species_inventory_table: Forwarded to get_release_inventory_info_for_assembly
    :param release_version: Forwarded to get_release_inventory_info_for_assembly
    :param species_release_folder: Folder holding one sub-folder per assembly accession
    :param vcf_validator_path: Executable invoked as "<validator> -i <vcf> -o <dir>"
    :param assembly_checker_path: Executable invoked with the VCF, FASTA and assembly report
    """
    cleanup_command = "rm -f {0}/{1}/{2} {0}/{1}/{3}".format(species_release_folder, assembly_accession,
                                                             vcf_validation_output_file_pattern,
                                                             asm_report_output_file_pattern)
    run_command_with_output("Remove existing VCF validation and assembly report outputs...", cleanup_command)

    queued_commands = []
    metadata_uri = get_pg_metadata_uri_for_eva_profile(profile, private_config_xml_file)
    with psycopg2.connect(metadata_uri, user="******") as metadata_connection:
        inventory_info = get_release_inventory_info_for_assembly(taxonomy_id, assembly_accession,
                                                                 release_species_inventory_table,
                                                                 release_version, metadata_connection)
        fasta_path = inventory_info["fasta_path"]
        report_path = inventory_info["report_path"]
        remove_index_if_outdated(fasta_path)
        # The commands below need a plain file system path, so turn "file:/..." into "/...".
        uses_file_scheme = report_path.startswith("file:/")
        assembly_report_path = report_path.replace("file:/", "/") if uses_file_scheme else report_path

        for vcf_file_category in release_vcf_file_categories:
            vcf_path = get_release_vcf_file_name(species_release_folder, assembly_accession, vcf_file_category)
            output_dir = os.path.dirname(vcf_path)
            # Categories with "multimap" in their name are not validated.
            if "multimap" in vcf_file_category:
                continue
            # "|| true" lets each command report success even when the tool itself fails.
            validator_command = "({0} -i {1} -o {2}) || true".format(vcf_validator_path, vcf_path, output_dir)
            checker_command = "({0} -i {1} -f {2} -a {3} -o {4} -r text,summary) || true".format(
                assembly_checker_path, vcf_path, fasta_path, assembly_report_path, output_dir)
            queued_commands += [validator_command, checker_command]

        # We don't expect the validation commands to all pass, hence use semi-colon to run them back to back
        run_command_with_output("Validating release files for assembly: " + assembly_accession,
                                " ; ".join(queued_commands))
예제 #4
0
def move_release_files_to_unsorted_category(assembly_accession,
                                            species_release_folder,
                                            vcf_file_category,
                                            unsorted_release_file_path):
    """
    Rename the per-source ("eva" and "dbsnp") release VCFs of a category to their unsorted names.

    For each source, the file "<source>_<release file name>" located next to the release VCF is
    renamed to "<source>_<unsorted file name>" located next to unsorted_release_file_path.
    The rename only happens when the source file exists and the destination does not,
    so existing unsorted files are never overwritten and reruns are harmless.

    :param assembly_accession: Assembly whose release files are moved
    :param species_release_folder: Folder holding one sub-folder per assembly accession
    :param vcf_file_category: Category used to resolve the release VCF file name
    :param unsorted_release_file_path: Path of the unsorted release file for the category
    """
    # Split both paths into directory and file name so that the source prefix is applied to the
    # file name alone. Substituting the file name inside the whole path would also rewrite any
    # directory component that happens to contain the same text.
    unsorted_dir, unsorted_release_file_name = os.path.split(unsorted_release_file_path)
    release_file_path = get_release_vcf_file_name(species_release_folder,
                                                  assembly_accession,
                                                  vcf_file_category)
    release_dir, release_file_name = os.path.split(release_file_path)

    for variant_source in ["eva", "dbsnp"]:
        vcf_file_name = os.path.join(release_dir, "{0}_{1}".format(variant_source, release_file_name))
        unsorted_file_name = os.path.join(unsorted_dir,
                                          "{0}_{1}".format(variant_source, unsorted_release_file_name))
        if os.path.exists(vcf_file_name) and not os.path.exists(unsorted_file_name):
            os.rename(vcf_file_name, unsorted_file_name)