import os
import shutil
import subprocess
from string import ascii_letters, digits


def get_vcfs(input_bam_file, output_vcf_path):
    """
    Usage: get_vcfs(input_bam_file, output_vcf_path)
    :param: input_bam_file - One BAM file to be parsed.
    :param: output_vcf_path - The location where the generated VCF file will be placed.
    :return: Number indicating whether get_vcfs ran successfully.
        0 - ran successfully
        10 - invalid input file or output path
    """
    # Pre-create the output file (when its directory exists) so the os.access(..., os.W_OK) check below can succeed
    if not os.path.exists(output_vcf_path) and os.path.isdir(os.path.dirname(output_vcf_path)):
        open(output_vcf_path, 'w').close()
    if not isinstance(input_bam_file, str) or \
        input_bam_file == '' or \
        not input_bam_file.endswith(".bam") or \
        not os.path.isfile(input_bam_file) or \
        not os.access(input_bam_file, os.R_OK) or \
        not isinstance(output_vcf_path, str) or \
        output_vcf_path == '' or \
        not output_vcf_path.endswith(".vcf") or \
        output_vcf_path.rfind('/') == -1 or \
        not os.path.isdir(output_vcf_path[:output_vcf_path.rfind('/') + 1]) or \
        output_vcf_path[output_vcf_path.rfind('/') + 1:] == '' or \
        not all(c in ascii_letters + digits + '-' + '_' + '.' for c in \
        output_vcf_path[output_vcf_path.rfind('/') + 1:]) or \
        not os.access(output_vcf_path, os.W_OK):
        print(
            "Invalid input or output path supplied. The input file itself must exist, be readable, and end with "
            ".bam. The output file must be located in an existing directory, its name may not be empty and may "
            "only contain ASCII letters, numbers, '.', '-', and '_', and the intended path must be writable and "
            "must end with .vcf.")
        return 10

    print("Obtaining the VCF file from %s" %
          input_bam_file[input_bam_file.rfind('/') + 1:])

    check_container_status(input_bam_file[:input_bam_file.rfind('/') + 1])

    subprocess.run(
        'docker exec -ti bio_c sh -c "cd /bio/ && lumpyexpress -B %s -o %s -P"'
        % (input_bam_file[input_bam_file.rfind('/') + 1:],
           output_vcf_path[output_vcf_path.rfind('/') + 1:]),
        shell=True)

    shutil.move(
        input_bam_file[:input_bam_file.rfind('/') + 1] +
        output_vcf_path[output_vcf_path.rfind('/') + 1:], output_vcf_path)

    return 0


#
#get_vcfs("/home/nathantaitano/Desktop/pimpiSVs/new_bams/EA00676.bam", "/home/nathantaitano/Desktop/pimpiSVs/new_bams/EA00676.vcf")
#get_vcfs("/home/nathantaitano/Desktop/pimpiSVs/new_bams/BGV008037.bam", "/home/nathantaitano/Desktop/pimpiSVs/new_bams/BGV008037.vcf")
# Example 2
import os
import random
import subprocess

import docker_tools

def merge_bam_list(bam_list, output_directory=None, accession=None, v=False, k=False):
    """Merge a list of BAM files (after fixing their headers) into a single BAM via samtools merge."""
    mount_directory = os.path.dirname(bam_list[0])
    docker_tools.check_container_status(mount_directory)
    mount_directory += '/'

    if v: print("Fixing bam headers")
    fixed_header_bam_list = []
    for bam_file in bam_list:
        bam_name = bam_file.split('/')[-1]
        if bam_file.endswith("_fixed_header.bam"):
            fixed_header_bam_list.append("/bio/" + bam_name)
        else:
            fixed_header_bam = fix_header("/bio/" + bam_name, accession=accession)
            fixed_header_bam_list.append(fixed_header_bam)

    if v: print("Merging bams")
    bam_file_string = ' '.join(fixed_header_bam_list)
    if accession is None:
        # Derive the accession from the first fixed-header BAM name by stripping "_fixed_header.bam" (17 chars)
        accession = fixed_header_bam_list[0].split('/')[-1][0:-17]
    merged_file = "/bio/" + str(accession) + "m.bam"

    subprocess.call('docker exec -it bio_c sh -c "samtools merge -c %s %s"' \
                    % (merged_file, bam_file_string), shell=True)

    if not k:
        if v: print("Deleting fixed header bam files")
        for file in fixed_header_bam_list:
            fixed_header_file = mount_directory + file[5:]
            if fixed_header_file not in bam_list:
                os.remove(fixed_header_file)

    if output_directory is not None:
        merged_bam_base_name = merged_file[4:]  # strips "/bio", keeping the leading '/'
        output_location = output_directory + merged_bam_base_name
        os.rename(mount_directory + merged_bam_base_name, output_location)
    return
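
# A minimal usage sketch for merge_bam_list (hypothetical paths; the BAMs are assumed to live in the
# directory mounted at /bio/ in the bio_c container):
# merge_bam_list(["/data/bams/EA00676_fixed_header.bam", "/data/bams/BGV008037.bam"],
#                output_directory="/data/merged/", accession="EA00676", v=True, k=False)
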
def get_accession(bam_input, accession_file=None, q=False, v=False):
    """Extract the accession (the SM: field of the @RG header line) from a BAM file via samtools view -H."""
    mount_directory = os.path.dirname(bam_input)
    docker_tools.check_container_status(mount_directory)
    mount_directory += '/'

    bam_input_name = bam_input.split('/')[-1]
    already_parsed = False
    if not q:
        # Fall back to a default accession file in the mount directory when no .txt path is supplied
        if accession_file is None or not accession_file.endswith(".txt"):
            accession_file = mount_directory + "accession_file.txt"
        if os.path.exists(accession_file):
            if v: print("Checking if %s has already been parsed" % bam_input)
            with open(accession_file, 'r') as open_accession_file:
                for line in open_accession_file:
                    if bam_input_name == line.split(':')[0]:
                        already_parsed = True

        if already_parsed:
            print("\n%s has already been parsed. Its accession is in %s\n" %
                  (bam_input, accession_file))
            return 1

        if v: print("Opening accession file at %s" % accession_file)
        accession_file_exists = os.path.exists(accession_file)
        open_accession_file = open(accession_file, "a+")

    #Prepare temp file info
    base_name = bam_input_name[0:-4]
    random_suffix = str(random.randint(100000, 999999))
    temp_file_base_name = "temp" + base_name + '-' + random_suffix
    temp_file = mount_directory + temp_file_base_name

    if v: print("Creating temporary file at %s" % temp_file)
    open_temp_file = open(temp_file, 'x')
    open_temp_file.close()

    #Write input bam's header to temp file
    subprocess.call("docker exec -it bio_c sh -c 'samtools view -H %s > %s'" %
                    ("/bio/" + bam_input_name, "/bio/" + temp_file_base_name),
                    shell=True)

    if v: print("Parsing bam header for accession information")
    accession = None
    open_temp_file = open(temp_file, 'r')
    line = open_temp_file.readline()
    #Get to the @RG line where SM: is
    while line and not line.startswith("@RG"):
        line = open_temp_file.readline()

    try:
        SM_substring = line.split("SM:")[1]
    except IndexError:
        print(
            "\nAccession could not be found. BAM file header may be corrupt.\n"
        )
        open_temp_file.close()
        os.remove(temp_file)
        if not q:
            open_accession_file.close()
        return 2

    accession = SM_substring.split('\t')[0]

    open_temp_file.close()
    os.remove(temp_file)

    if not q:
        open_accession_file.write(bam_input_name + ':' + accession + '\n')
        open_accession_file.close()

    print("\nThe accession of %s is %s" % (bam_input, accession))
    if not q:
        print("Accession written to %s\n" % accession_file)

    return accession
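
# A minimal usage sketch for get_accession (hypothetical path; returns 1 if the BAM was already
# parsed and 2 if no SM: tag can be found in the header):
# accession = get_accession("/data/bams/EA00676.bam", v=True)
# if accession not in (1, 2):
#     print("Accession:", accession)
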
def unknown_N_filter(input_vcf,
                     output_vcf,
                     reference_genome,
                     v=False,
                     N_threshold=10,
                     flank_rad=50):
    """
    Filters a vcf based on the quality of the reference genome. An SV is
    filtered out if there are at least a threshold number of unknown nucleotide
    bases (N) within a given number of base pairs of either of the SV's endpoints. For
    example, with a flank radius of 50 and an N threshold of 10, an SV with 15
    unknown nucleotides in the 50 bp directly before and after its start position
    in the reference genome would be filtered out. The nucleotide content of the
    reference genome is computed with the bedtools nuc command.

    Usage:
    python unknown_N_filter.py -i /vcf/to/be/filtered.vcf -f /reference/fasta/file.fa

    Parameters:
        Required:
            -i: vcf to be filtered
            -f: reference genome in fasta format
                IMPORTANT: there must be an index file (.fai) for the reference genome in
                           the same directory as the reference genome. It should be named
                           like so: reference_genome_name.fa.fai
        Optional:
            -o: output vcf
                Default: same directory as the input vcf with "_n_filtered" suffix
            -n: threshold for amount of N in order for an SV to be filtered
                Default: 10
            -r: flank radius: the number of base pairs checked on each side of an SV endpoint
                Default: 50
            -v: verbose mode
            -h: help (display this message)

    Writes the filtered vcf to output_vcf and returns 0 on success.
    """
    mount_directory = os.path.dirname(reference_genome)
    docker_tools.check_container_status(mount_directory)
    mount_directory += '/'

    index_dict = {}
    reference_genome_index_file = reference_genome + ".fai"
    if v:
        print("Parsing reference genome index at %s" %
              reference_genome_index_file)
    try:
        open_index_file = open(reference_genome_index_file, 'r')
    except:
        raise Exception(
            "Cannot find index file (.fai) for reference genome in same directory as \
                            reference genome. Try running samtools faidx on the reference genome"
        )
    line = open_index_file.readline()
    while line:
        cols = line.split('\t')
        chr_name = cols[0]
        chr_length = int(cols[1])
        index_dict[chr_name] = chr_length
        line = open_index_file.readline()

    if v: print("Opening %s to create end point flanks" % input_vcf)
    vcf_reader = open(input_vcf, 'r')

    if v: print("Reading through the header")
    line = vcf_reader.readline()
    while line.startswith('#'):
        line = vcf_reader.readline()

    vcf_name = input_vcf.split('/')[-1][:-4]

    random_suffix = str(random.randint(100000, 999999))
    bed_name = "temp" + vcf_name + '-' + random_suffix + ".bed"
    bed_file = mount_directory + bed_name

    if v: print("Creating temporary bed file at %s" % bed_file)
    bed_writer = open(bed_file, 'w+')

    while line:
        cols = line.split('\t')
        chr_name = cols[0]
        start_pos = int(cols[1]) - 1
        sv_id = cols[2]
        sv_info = cols[7]

        if ";END=" in sv_info:  # If it is any SV but BND
            end_pos = int(sv_info.split("END=")[1].split(';')[0])
        else:
            end_pos = int(cols[1])

        start_pos_minus_flank = start_pos - flank_rad
        start_pos_plus_flank = start_pos + flank_rad
        end_pos_minus_flank = end_pos - flank_rad
        end_pos_plus_flank = end_pos + flank_rad

        chr_length = index_dict[chr_name]

        if start_pos_minus_flank < 0:
            start_pos_minus_flank = 0
        if end_pos_minus_flank < 0:
            end_pos_minus_flank = 0
        if start_pos_plus_flank > chr_length:
            start_pos_plus_flank = chr_length
        if end_pos_plus_flank > chr_length:
            end_pos_plus_flank = chr_length

        start_pos_minus_flank = str(start_pos_minus_flank)
        start_pos_plus_flank = str(start_pos_plus_flank)
        end_pos_minus_flank = str(end_pos_minus_flank)
        end_pos_plus_flank = str(end_pos_plus_flank)

        bed_writer.write(chr_name + "\t" + start_pos_minus_flank + "\t" +
                         start_pos_plus_flank + "\t" + sv_id + "\n")

        bed_writer.write(chr_name + "\t" + end_pos_minus_flank + "\t" +
                         end_pos_plus_flank + "\t" + sv_id + "\n")

        line = vcf_reader.readline()

    if v: print("%s converted to BED format at %s" % (input_vcf, bed_file))
    vcf_reader.close()
    bed_writer.close()

    fasta_name = reference_genome.split('/')[-1]
    nuc_name = "temp" + vcf_name + '-' + random_suffix + ".nuc"
    nuc_file = mount_directory + nuc_name

    if v: print("Generating nucleotide content at %s" % (nuc_file))
    subprocess.call(
        'docker exec -it bio_c sh -c "bedtools nuc -fi %s -bed %s > %s"' %
        ("/bio/" + fasta_name, "/bio/" + bed_name, "/bio/" + nuc_name),
        shell=True)

    os.remove(bed_file)

    if v: print("Parsing nucleotide content")
    open_nuc_file = open(nuc_file, 'r')
    open_nuc_file.readline()  #Skip header
    line = open_nuc_file.readline()

    clean_IDs = []
    bad_IDs = []

    # For a 4-column BED input, column 10 (0-based) of bedtools nuc output is the interval's N count
    while line:
        cols = line.split('\t')
        sv_id = cols[3]
        N_count = int(cols[10])

        if N_count >= N_threshold:
            if sv_id not in bad_IDs:
                bad_IDs.append(sv_id)
                if sv_id in clean_IDs:
                    clean_IDs.remove(sv_id)
        else:
            if sv_id not in bad_IDs:
                if sv_id not in clean_IDs:
                    clean_IDs.append(sv_id)

        line = open_nuc_file.readline()

    open_nuc_file.close()
    os.remove(nuc_file)

    if v:
        print("Finished parsing nuc content")
        print("Filtering vcf")
    vcf_reader = open(input_vcf, 'r')

    if v: print("Creating filtered vcf file at %s" % output_vcf)
    vcf_writer = open(output_vcf, 'w+')

    line = vcf_reader.readline()
    while line.startswith('#'):
        vcf_writer.write(line)
        line = vcf_reader.readline()

    while line:
        cols = line.split('\t')
        sv_id = cols[2]
        if sv_id in clean_IDs:
            vcf_writer.write(line)

        line = vcf_reader.readline()

    vcf_reader.close()
    vcf_writer.close()

    print("All SVs processed. Filtered vcf at %s" % output_vcf)

    return 0
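
    # NOTE: the speedseq align / lumpyexpress calls below are unreachable (they follow the return
    # above) and reference an `accession` variable not defined in this function; they appear to
    # belong to a separate per-accession processing step.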
    subprocess.call(
        "docker exec -i bio_c sh -c 'cd /bio/ && /speedseq/bin/speedseq align -M 20 -R \"@RG\tID:%s\tSM:%s\tLB:%s\" -o %s %s %s_1.fastq.gz %s_2.fastq.gz'"
        % (accession, accession, accession, accession,
           "/bio/BGV1.0_genome.fasta", accession, accession),
        shell=True)

    # Lumpy express
    subprocess.call(
        "docker exec -i bio_c sh -c 'cd /bio/ && lumpyexpress -B %s -S %s -D %s'"
        % (accession + ".bam", accession + ".splitters.bam",
           accession + ".discordants.bam"),
        shell=True)

    return
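
# A minimal usage sketch for unknown_N_filter (hypothetical paths; the reference .fasta must have
# a matching .fai index in the same directory):
# unknown_N_filter("/data/vcfs/EA00676.vcf", "/data/vcfs/EA00676_n_filtered.vcf",
#                  "/data/reference/BGV1.0_genome.fasta", v=True, N_threshold=10, flank_rad=50)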


# Enter your own mount_directory here. This is where the bams will go
mount_directory = "/home/nathantaitano/Desktop/pimpiSVs/new_bams"
docker_tools.check_container_status(mount_directory)

# Installs speedseq in the docker container
subprocess.call(
    "docker exec -i bio_c sh -c 'git clone --recursive https://github.com/hall-lab/speedseq && cd speedseq && make align'",
    shell=True)
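
# sraAccessionDict and ebiAccessionDict are assumed to map run names to sample accessions,
# e.g. (hypothetical): sraAccessionDict = {"SRR0000000": "EA00676"}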

for SRRname, accession in sraAccessionDict.items():
    sraProcessFile(SRRname, accession)

for EBIname, accession in ebiAccessionDict.items():
    ebiProcessFile(EBIname, accession)