Example #1
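These functions are excerpted from a TELR-style TE-detection pipeline. Each example assumes the module-level imports sketched below, plus project helpers defined elsewhere in the package (mkdir, check_exist, parse_rm_out, extract_reads, create_fa, get_cmd_output, create_loci_set, write_ins_seqs, format_time, and the pipeline stages called from main()). This preamble is a sketch of what the snippets need, not the project's exact header:

import argparse
import logging
import os
import re
import shutil
import subprocess
import sys
import time
from multiprocessing import Pool

import pysam           # BAM access (prep_assembly_inputs)
from Bio import SeqIO  # FASTA parsing (get_local_contigs, filter_vcf)
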
def run_flye_polishing(
    asm_cns, reads, asm_dir, contig_name, thread, polish_iterations, presets
):
    """Run Flye polishing"""
    if presets == "pacbio":
        presets_flye = "--pacbio-raw"
    else:
        presets_flye = "--nano-raw"

    tmp_out_dir = os.path.join(asm_dir, contig_name)
    mkdir(tmp_out_dir)
    try:
        subprocess.call(
            [
                "flye",
                "--polish-target",
                asm_cns,
                presets_flye,
                reads,
                "--out-dir",
                tmp_out_dir,
                "--thread",
                str(thread),
                "--iterations",
                str(polish_iterations),
            ]
        )
    except Exception as e:
        print(e)
        print("Polishing failed, exiting...")
        return None

    # rename contig file
    polished_contig = os.path.join(
        tmp_out_dir, "polished_" + str(polish_iterations) + ".fasta"
    )
    if check_exist(polished_contig):
        os.rename(polished_contig, asm_cns)
        shutil.rmtree(tmp_out_dir)
        return asm_cns
    else:
        return None
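
Note that subprocess.call only raises when the flye executable cannot be started at all; a polish run that exits non-zero is returned as an exit code, not raised, so the except branch above never fires for it. If the error handling is meant to catch failed runs too, a subprocess.run(..., check=True) wrapper would do it. A minimal sketch under that assumption, not the project's code:

# Sketch: surface non-zero exit codes as exceptions so callers can react.
import subprocess

def run_or_false(cmd):
    try:
        # check=True raises CalledProcessError on a non-zero exit status
        subprocess.run(cmd, check=True)
        return True
    except (subprocess.CalledProcessError, OSError) as e:
        print(e)
        return False

The same caveat applies to the other subprocess.call sites in these examples.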
Example #2
def repeatmask(ref, library, outdir, thread):
    mkdir(outdir)
    try:
        subprocess.call([
            "RepeatMasker",
            "-dir",
            outdir,
            "-gff",
            "-s",
            "-nolow",
            "-no_is",
            "-e",
            "ncbi",
            "-lib",
            library,
            "-pa",
            str(thread),
            ref,
        ])
        ref_rm = os.path.join(outdir, os.path.basename(ref) + ".masked")
        gff = os.path.join(outdir, os.path.basename(ref) + ".out.gff")
        gff3 = os.path.join(outdir, os.path.basename(ref) + ".out.gff3")
        if not os.path.isfile(ref_rm):
            ref_rm_out = os.path.join(outdir, os.path.basename(ref) + ".out")
            # check the whole .out report instead of raising on the first
            # non-matching line
            with open(ref_rm_out, "r") as input:
                if "There were no repetitive sequences detected" in input.read():
                    print("No repetitive sequences detected")
                    ref_rm = ref
                    gff = None
                    gff3 = None
                else:
                    raise Exception("Repeatmasking failed, exiting...")
        else:
            parse_rm_out(gff, gff3)
            # sanity check that the masked reference is readable
            with open(ref_rm, "r"):
                pass
    except Exception as e:
        print(e)
        print("Repeatmasking failed, exiting...")
        sys.exit(1)
    return ref_rm, gff3
Example #3
def run_flye_assembly(sv_reads, asm_dir, contig_name, thread, presets):
    """Run Flye assembly"""
    if presets == "pacbio":
        presets_flye = "--pacbio-raw"
    else:
        presets_flye = "--nano-raw"

    tmp_out_dir = os.path.join(asm_dir, contig_name)
    mkdir(tmp_out_dir)
    try:
        subprocess.call(
            [
                "flye",
                presets_flye,
                sv_reads,
                "--out-dir",
                tmp_out_dir,
                "--thread",
                str(thread),
                "--iterations",
                "0",
            ]
        )
    except Exception as e:
        print(e)
        print("Assembly failed, exiting...")
        return
    # rename contigs
    contig_path = os.path.join(tmp_out_dir, "assembly.fasta")
    contig_path_new = os.path.join(asm_dir, contig_name + ".cns.fa")
    if check_exist(contig_path):
        os.rename(contig_path, contig_path_new)
        # remove tmp files
        shutil.rmtree(tmp_out_dir)
        return contig_path_new
    else:
        print("assembly failed")
        return None
Example #4
def prep_assembly_inputs(
    vcf_parsed, out, sample_name, bam, raw_reads, reads_dir, read_type="sv"
):
    """Prepare reads for local assembly"""
    # logging.info("Prepare reads for local assembly")

    if read_type == "sv":  # TODO: figure out what this does
        # extract read IDs
        read_ids = os.path.join(out, sample_name + ".id")
        with open(vcf_parsed, "r") as input, open(read_ids, "w") as output:
            for line in input:
                entry = line.replace("\n", "").split("\t")
                read_list = entry[8].split(",")
                for read in read_list:
                    output.write(read + "\n")
    else:  # TODO: think about using this for assembly, filter for cigar reads
        window = 1000
        samfile = pysam.AlignmentFile(bam, "rb")
        read_ids = os.path.join(out, sample_name + ".id")
        vcf_parsed_new = vcf_parsed + ".new"
        with open(vcf_parsed, "r") as input, open(read_ids, "w") as output, open(
            vcf_parsed_new, "w"
        ) as VCF:
            for line in input:
                entry = line.replace("\n", "").split("\t")

                # get sniffles read list
                read_list = entry[8].split(",")
                reads_sniffles = set(read_list)

                ins_chr = entry[0]
                ins_breakpoint = round((int(entry[1]) + int(entry[2])) / 2)
                start = ins_breakpoint - window
                end = ins_breakpoint + window
                reads = set()
                # coverage = 0
                for read in samfile.fetch(ins_chr, start, end):
                    reads.add(read.query_name)
                for read in reads:
                    output.write(read + "\n")

                # write the entry with its local read-support count appended
                out_line = line.replace("\n", "") + "\t" + str(len(reads))
                VCF.write(out_line + "\n")
        # switch to the augmented VCF table for the steps below
        vcf_parsed = vcf_parsed_new

    # generate unique ID list
    read_ids_unique = read_ids + ".unique"
    command = "cat " + read_ids + " | sort | uniq"
    with open(read_ids_unique, "w") as output:
        subprocess.call(command, stdout=output, shell=True)

    # filter raw reads using read list
    subset_fa = os.path.join(out, sample_name + ".subset.fa")
    command = "seqtk subseq " + raw_reads + " " + read_ids_unique + " | seqtk seq -a"
    with open(subset_fa, "w") as output:
        subprocess.call(command, stdout=output, shell=True)

    # reorder reads
    subset_fa_reorder = out + "/" + sample_name + ".subset.reorder.fa"
    extract_reads(subset_fa, read_ids, subset_fa_reorder)

    # separate reads into multiple files, using csplit
    mkdir(reads_dir)
    csplit_prefix = reads_dir + "/contig"
    m = []
    k = 1
    with open(vcf_parsed, "r") as input:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            if read_type == "sv":
                k = k + 2 * (len(entry[8].split(",")))
            else:
                k = k + 2 * int(entry[14])
            m.append(k)
    if len(m) == 1:
        subprocess.call(["cp", subset_fa_reorder, reads_dir + "/contig0"])
    elif len(m) == 0:
        print("No insertion detected, exiting...")
    else:
        m = m[:-1]
        index = " ".join(str(i) for i in m)
        command = (
            "csplit -s -f " + csplit_prefix + " -n 1 " + subset_fa_reorder + " " + index
        )
        subprocess.call(command, shell=True)

    # remove tmp files
    os.remove(read_ids)
    os.remove(read_ids_unique)
    os.remove(subset_fa)
    os.remove(subset_fa_reorder)
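
The indexes handed to csplit are 1-based line numbers in the reordered FASTA: each read takes two lines (header plus sequence), so k advances by twice the read count of each locus and m records where the next locus starts. A small worked example with hypothetical read counts:

# Hypothetical loci supported by 3, 2 and 4 reads respectively.
read_counts = [3, 2, 4]
m = []
k = 1
for n in read_counts:
    k = k + 2 * n  # each read contributes a header line and a sequence line
    m.append(k)
print(m)       # [7, 11, 19]
print(m[:-1])  # [7, 11] -> csplit cuts before lines 7 and 11, producing
               # one reads file per locus (lines 1-6, 7-10, 11-18)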
Example #5
def get_local_contigs(
    assembler,
    polisher,
    contig_dir,
    vcf_parsed,
    out,
    sample_name,
    bam,
    raw_reads,
    thread,
    presets,
    polish_iterations,
):
    """Perform local assembly using reads from parsed VCF file in parallel"""

    # Prepare reads used for local assembly and polishing
    sv_reads_dir = os.path.join(out, "sv_reads")

    try:
        prep_assembly_inputs(
            vcf_parsed, out, sample_name, bam, raw_reads, sv_reads_dir, read_type="sv"
        )
    except Exception as e:
        print(e)
        print("Prepare local assembly input data failed, exiting...")
        sys.exit(1)

    mkdir(contig_dir)

    k = 0
    asm_pa_list = []
    with open(vcf_parsed, "r") as input:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            contig_name = "_".join([entry[0], entry[1], entry[2]])
            # rename variant reads
            sv_reads = sv_reads_dir + "/contig" + str(k)
            sv_reads_rename = sv_reads_dir + "/" + contig_name + ".reads.fa"
            os.rename(sv_reads, sv_reads_rename)
            thread_asm = 1
            asm_pa = [
                sv_reads_rename,
                contig_dir,
                contig_name,
                thread_asm,
                presets,
                assembler,
                polisher,
                polish_iterations,
            ]
            asm_pa_list.append(asm_pa)
            k = k + 1
    # run assembly in parallel
    logging.info("Perform local assembly of non-reference TE loci...")
    start_time = time.time()

    try:
        pool = Pool(processes=thread)
        contig_list = pool.map(run_assembly_polishing, asm_pa_list)
        pool.close()
        pool.join()
    except Exception as e:
        print(e)
        print("Local assembly failed, exiting...")
        sys.exit(1)

    proc_time = time.time() - start_time

    # merge all contigs
    assembly_passed_loci = set()
    merged_contigs = os.path.join(out, sample_name + ".contigs.fa")
    with open(merged_contigs, "w") as merged_output_handle:
        for contig in contig_list:
            if check_exist(contig):
                contig_name = os.path.basename(contig).replace(".cns.fa", "")
                assembly_passed_loci.add(contig_name)
                parsed_contig = os.path.join(contig_dir, contig_name + ".cns.ctg1.fa")
                with open(contig, "r") as input:
                    records = SeqIO.parse(input, "fasta")
                    for record in records:
                        if record.id == "ctg1" or record.id == "contig_1":
                            record.id = contig_name
                            record.description = "len=" + str(len(record.seq))
                            SeqIO.write(record, merged_output_handle, "fasta")
                            with open(parsed_contig, "w") as parsed_output_handle:
                                SeqIO.write(record, parsed_output_handle, "fasta")

    logging.info("Local assembly finished in " + format_time(proc_time))
    return merged_contigs, assembly_passed_loci
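
run_assembly_polishing, the worker mapped over asm_pa_list, is not shown on this page. Given the argument list packed above and the Flye helpers in Examples 1 and 3, a dispatcher along these lines would fit; this is a hypothetical sketch, not the project's implementation (the wtdbg2 branches are omitted):

# Hypothetical worker matching the asm_pa argument list packed above.
def run_assembly_polishing(args):
    reads, asm_dir, contig_name, thread, presets, assembler, polisher, iterations = args
    contig = None
    if assembler == "flye":
        contig = run_flye_assembly(reads, asm_dir, contig_name, thread, presets)
    if contig and polisher == "flye" and iterations > 0:
        contig = run_flye_polishing(
            contig, reads, asm_dir, contig_name, thread, iterations, presets
        )
    return contig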
Example #6
def get_args():
    parser = argparse.ArgumentParser(
        description="Program for detecting non-reference TEs in long read data"
    )
    optional = parser._action_groups.pop()
    required = parser.add_argument_group("required arguments")

    # required
    required.add_argument(
        "-i",
        "--reads",
        type=str,
        help="reads in fasta/fastq format or read alignments in bam format",
        required=True,
    )
    required.add_argument(
        "-r",
        "--reference",
        type=str,
        help="reference genome in fasta format",
        required=True,
    )
    required.add_argument(
        "-l",
        "--library",
        type=str,
        help="TE consensus sequences in fasta format",
        required=True,
    )

    # optional
    optional.add_argument(
        "--aligner",
        type=str,
        help=
        "choose method for read alignment, please provide 'nglmr' or 'minimap2' (default = 'nglmr')",
        required=False,
    )
    optional.add_argument(
        "--assembler",
        type=str,
        help=
        "Choose the method to be used for local contig assembly step, please provide 'wtdbg2' or 'flye' (default = 'wtdbg2')",
        required=False,
    )
    optional.add_argument(
        "--polisher",
        type=str,
        help=
        "Choose the method to be used for local contig polishing step, please provide 'wtdbg2' or 'flye' (default = 'wtdbg2')",
        required=False,
    )
    optional.add_argument(
        "-x",
        "--presets",
        type=str,
        help=
        "parameter presets for different sequencing technologies, please provide 'pacbio' or 'ont' (default = 'pacbio')",
        required=False,
    )
    optional.add_argument(
        "-p",
        "--polish_iterations",
        type=int,
        help="iterations of contig polishing (default = 1)",
        required=False,
    )
    optional.add_argument(
        "-o",
        "--out",
        type=str,
        help="directory to output data (default = '.')",
        required=False,
    )
    optional.add_argument(
        "-t",
        "--thread",
        type=int,
        help="max cpu threads to use (default = '1')",
        required=False,
    )
    optional.add_argument(
        "-g",
        "--gap",
        type=int,
        help="max gap size for flanking sequence alignment (default = '20')",
        required=False,
    )
    optional.add_argument(
        "-v",
        "--overlap",
        type=int,
        help=
        "max overlap size for flanking sequence alignment (default = '20')",
        required=False,
    )
    optional.add_argument(
        "--flank_len",
        type=int,
        help="flanking sequence length (default = '500')",
        required=False,
    )
    optional.add_argument(
        "--af_flank_interval",
        type=int,
        help=
        "5' and 3'flanking sequence interval size used for allele frequency estimation (default = '100')",
        required=False,
    )
    optional.add_argument(
        "--af_flank_offset",
        type=int,
        help=
        "5' and 3' flanking sequence offset size used for allele frequency estimation (default = '200')",
        required=False,
    )
    optional.add_argument(
        "--af_te_interval",
        type=int,
        help=
        "5' and 3' te sequence interval size used for allele frequency estimation (default: '50')",
        required=False,
    )
    optional.add_argument(
        "--af_te_offset",
        type=int,
        help=
        "5' and 3' te sequence offset size used for allele frequency estimation (default: '50')",
        required=False,
    )
    optional.add_argument(
        "--different_contig_name",
        action="store_true",
        help=
        "If provided then TELR does not require the contig name to match before and after annotation liftover (default: require contig name to be the same before and after liftover)",
        required=False,
    )
    optional.add_argument(
        "--minimap2_family",
        action="store_true",
        help=
        "If provided then minimap2 will be used to annotate TE families in the assembled contigs (default: use repeatmasker for contig TE annotation)",
        required=False,
    )
    optional.add_argument(
        "-k",
        "--keep_files",
        action="store_true",
        help=
        "If provided then all intermediate files will be kept (default: remove intermediate files)",
        required=False,
    )
    parser._action_groups.append(optional)
    args = parser.parse_args()

    # check that input files exist and are readable
    for input_file in (args.reads, args.reference, args.library):
        try:
            with open(input_file, "r"):
                pass
        except Exception as e:
            print(e)
            logging.exception("Cannot open input file: " + input_file)
            sys.exit(1)

    # check if optional arguments are valid
    if args.aligner is None:
        args.aligner = "nglmr"
    elif args.aligner not in ["nglmr", "minimap2"]:
        print(
            "Please provide a valid alignment method (nglmr/minimap2), exiting..."
        )
        sys.exit(1)

    if args.assembler is None:
        args.assembler = "wtdbg2"
    elif args.assembler not in ["wtdbg2", "flye"]:
        print(
            "Please provide a valid assembly method (wtdbg2/flye), exiting...")
        sys.exit(1)

    if args.polisher is None:
        args.polisher = "wtdbg2"
    elif args.polisher not in ["wtdbg2", "flye"]:
        print("Please provide a valid polish method (wtdbg2/flye), exiting...")
        sys.exit(1)

    if args.presets is None:
        args.presets = "pacbio"
    elif args.presets not in ["pacbio", "ont"]:
        print("Please provide a valid preset option (pacbio/ont), exiting...")
        sys.exit(1)

    if args.polish_iterations is None:
        args.polish_iterations = 1
    elif args.polish_iterations < 1:
        print(
            "Please provide a valid number of iterations for polishing, exiting..."
        )
        sys.exit(1)

    # sets up out dir variable
    if args.out is None:
        args.out = "."
    args.out = os.path.abspath(args.out)
    mkdir(args.out)

    if args.thread is None:
        args.thread = 1

    if args.flank_len is None:
        args.flank_len = 500

    if args.af_flank_interval is None:
        args.af_flank_interval = 100
    else:
        if args.af_flank_interval <= 0:
            print(
                "Please provide a valid flanking sequence interval size (positive integer) for allele frequency estimation, exiting..."
            )
            sys.exit(1)

    if args.af_flank_offset is None:
        args.af_flank_offset = 200
    else:
        if args.af_flank_offset < 0:
            print(
                "Please provide a valid flanking sequence offset size (non-negative integer) for allele frequency estimation, exiting..."
            )
            sys.exit(1)

    if args.af_te_interval is None:
        args.af_te_interval = 50
    else:
        if args.af_te_interval <= 0:
            print(
                "Please provide a valid TE interval size (positive integer) for allele frequency estimation, exiting..."
            )
            sys.exit(1)

    if args.af_te_offset is None:
        args.af_te_offset = 50
    else:
        if args.af_te_offset < 0:
            print(
                "Please provide a valid TE offset size (non-negative integer) for allele frequency estimation, exiting..."
            )
            sys.exit(1)

    if args.gap is None:
        args.gap = 20

    if args.overlap is None:
        args.overlap = 20

    return args
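
Much of the manual default-filling and validation above could also be expressed with argparse's built-in choices and default parameters, which reject invalid values and supply defaults automatically. A sketch of that alternative, not the project's code:

# Sketch: let argparse validate categorical options and fill defaults.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--aligner", choices=["nglmr", "minimap2"], default="nglmr")
parser.add_argument("--assembler", choices=["wtdbg2", "flye"], default="wtdbg2")
parser.add_argument("--polisher", choices=["wtdbg2", "flye"], default="wtdbg2")
parser.add_argument("-x", "--presets", choices=["pacbio", "ont"], default="pacbio")
parser.add_argument("-t", "--thread", type=int, default=1)
args = parser.parse_args([])  # defaults: aligner='nglmr', presets='pacbio', ...

One trade-off: on an invalid choice, argparse prints its own error message rather than the custom "exiting..." messages above.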
Example #7
def annotate_contig(
    contigs,
    assembly_passed_loci,
    te_library,
    vcf_parsed,
    out,
    sample_name,
    thread,
    presets,
    minimap2_family,
    loci_eval,
):
    logging.info("Annotate contigs...")
    if presets == "pacbio":
        minimap2_presets = "map-pb"
    else:
        minimap2_presets = "map-ont"

    # map sequence to contigs
    vcf_seq2contig_out = os.path.join(out, "seq2contig.paf")
    # if os.path.isfile(vcf_seq2contig_out):
    #     os.remove(vcf_seq2contig_out)

    # TODO: consider that some contigs might not exist
    seq2contig_passed_loci = set()
    vcf_seq2contig_dir = os.path.join(out, "vcf_seq2contig")
    mkdir(vcf_seq2contig_dir)
    with open(vcf_parsed, "r") as input, open(vcf_seq2contig_out,
                                              "w") as output:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            contig_name = "_".join([entry[0], entry[1], entry[2]])
            if contig_name in assembly_passed_loci:
                vcf_seq = entry[7]
                query = os.path.join(vcf_seq2contig_dir,
                                     contig_name + ".seq.fa")
                create_fa(contig_name, vcf_seq, query)
                subject = os.path.join(
                    vcf_seq2contig_dir,
                    contig_name + ".contig.fa")  ## TODO: this can be replaced
                with open(subject, "w") as subject_output_handle:
                    try:
                        subprocess.call(
                            ["samtools", "faidx", contigs, contig_name],
                            stdout=subject_output_handle,
                        )
                    except subprocess.CalledProcessError:
                        print(contig_name + ":contig assembly doesn't exist")
                        continue
                cmd = [
                    "minimap2",
                    "-cx",
                    minimap2_presets,
                    "--secondary=no",
                    "-v",
                    "0",
                    subject,
                    query,
                ]
                vcf_seq2contig_output = get_cmd_output(cmd)
                if vcf_seq2contig_output != "":
                    output.write(vcf_seq2contig_output)
                    seq2contig_passed_loci.add(contig_name)
                    # with open(vcf_seq2contig_out, "a") as output:
                os.remove(query)
                os.remove(subject)
    os.rmdir(vcf_seq2contig_dir)

    # convert to bed format (PAF columns: 0=query name, 4=strand,
    # 5=target name, 7/8=target start/end, 11=mapping quality)
    seq2contig_bed = os.path.join(out, "seq2contig.bed")
    with open(vcf_seq2contig_out, "r") as input, open(seq2contig_bed,
                                                      "w") as output:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            bed_line = "\t".join(
                [entry[0], entry[7], entry[8], entry[5], entry[11], entry[4]])
            output.write(bed_line + "\n")

    # # report ins-contig failed loci
    # with open(loci_eval, "a") as output:
    #     for locus in assembly_passed_loci:
    #         if locus not in seq2contig_passed_loci:
    #             output.write(
    #                 "\t".join(
    #                     [locus, "Sniffles VCF sequence not mapped to assembled contig"]
    #                 )
    #                 + "\n"
    #             )

    # map TE library to contigs using minimap2
    # TE-contig alignment
    te2contig_out = os.path.join(out, sample_name + ".te2contig.paf")
    if os.path.isfile(te2contig_out):
        os.remove(te2contig_out)
    for locus in seq2contig_passed_loci:
        contig_fa = os.path.join(out, locus + ".fa")
        with open(contig_fa, "w") as output:
            subprocess.call(["samtools", "faidx", contigs, locus],
                            stdout=output)
        # map TE library to contig using minimap2
        with open(te2contig_out, "a") as output:
            subprocess.call(
                [
                    "minimap2",
                    "-cx",
                    minimap2_presets,
                    contig_fa,
                    te_library,
                    "-v",
                    "0",
                    "-t",
                    str(thread),
                ],
                stdout=output,
            )
        os.remove(contig_fa)
    # convert to bed format (here the contig is the PAF target, entry[5],
    # and the TE family is the PAF query, entry[0])
    te2contig_bed = os.path.join(out, sample_name + ".te2contig.bed")
    with open(te2contig_out, "r") as input, open(te2contig_bed, "w") as output:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            bed_line = "\t".join(
                [entry[5], entry[7], entry[8], entry[0], entry[11], entry[4]])
            output.write(bed_line + "\n")

    # Use VCF sequence alignment to filter minimap2 TE-contig alignment
    te2contig_filter_raw = os.path.join(out,
                                        sample_name + ".te2contig_filter.tsv")
    with open(te2contig_filter_raw, "w") as output:
        subprocess.call(
            [
                "bedtools",
                "intersect",
                "-a",
                te2contig_bed,
                "-b",
                seq2contig_bed,
                "-wao",
            ],
            stdout=output,
        )

    # filter and merge
    # get rid of -1 and make it into bed format
    te2contig_filter_tmp_bed = os.path.join(
        out, sample_name + ".te2contig_filter.tmp.bed")
    with open(te2contig_filter_raw,
              "r") as input, open(te2contig_filter_tmp_bed, "w") as output:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            # the overlap between VCF sequence alignment and TE-contig alignment has to be over 10bp
            if int(entry[12]) > 10:
                out_line = "\t".join([
                    entry[0], entry[1], entry[2], entry[3], entry[4], entry[5]
                ])
                output.write(out_line + "\n")
    # sort # TODO: package this part, hide variables
    te2contig_filter_tmp_sort_bed = (out + "/" + sample_name +
                                     ".te2contig_filter.tmp.sort.bed")
    command = "bedtools sort -i " + te2contig_filter_tmp_bed
    with open(te2contig_filter_tmp_sort_bed, "w") as output:
        subprocess.call(command, shell=True, stdout=output)

    # find out what's filtered out
    seq_mm2_overlap_loci = set()
    with open(te2contig_filter_tmp_sort_bed, "r") as input:
        for line in input:
            seq_mm2_overlap_loci.add(line.split("\t")[0])
    # seq_mm2_overlap_loci = create_loci_set(te2contig_filter_tmp_sort_bed)
    with open(loci_eval, "a") as output:
        for locus in seq2contig_passed_loci:
            if locus not in seq_mm2_overlap_loci:
                output.write("\t".join([
                    locus, "VCF sequence doesn't overlap contig annotation"
                ]) + "\n")

    # merge
    contig_te_annotation_tmp = out + "/" + sample_name + ".te2contig_filter.bed.tmp"
    command = (
        'bedtools merge -d 10000 -c 4,6 -o distinct,distinct -delim "|" -i ' +
        te2contig_filter_tmp_sort_bed)
    with open(contig_te_annotation_tmp, "w") as output:
        subprocess.call(command, shell=True, stdout=output)

    contig_te_annotation = out + "/" + sample_name + ".te2contig_filter.bed"
    with open(contig_te_annotation_tmp,
              "r") as input, open(contig_te_annotation, "w") as output:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            contig_name = entry[0]
            contig_te_start = entry[1]
            contig_te_end = entry[2]
            contig_te_family = entry[3]
            contig_te_strand = entry[4]
            if contig_te_strand != "+" and contig_te_strand != "-":
                contig_te_strand = "."
            out_line = "\t".join([
                contig_name,
                contig_te_start,
                contig_te_end,
                contig_te_family,
                ".",
                contig_te_strand,
            ])
            output.write(out_line + "\n")

    contig_te_annotation_sorted = out + "/" + sample_name + ".te2contig_filter_sort.bed"
    command = "bedtools sort -i " + contig_te_annotation
    with open(contig_te_annotation_sorted, "w") as output:
        subprocess.call(command, shell=True, stdout=output)

    # seq_mm2_overlap_merge_loci = create_loci_set(contig_te_annotation)

    # remove tmp files
    os.remove(te2contig_bed)
    os.remove(te2contig_out)
    os.remove(seq2contig_bed)
    os.remove(te2contig_filter_raw)
    os.remove(te2contig_filter_tmp_bed)
    os.remove(te2contig_filter_tmp_sort_bed)
    os.remove(contig_te_annotation)

    # extract sequence and RM
    if "+" in sample_name:
        sample_name_replace = sample_name.replace("+", "plus")
    else:
        sample_name_replace = sample_name
    te_fa = out + "/" + sample_name_replace + ".te.fa"
    with open(te_fa, "w") as output:
        subprocess.call(
            [
                "bedtools",
                "getfasta",
                "-fi",
                contigs,
                "-bed",
                contig_te_annotation_sorted,
            ],
            stdout=output,
        )

    if not minimap2_family:
        print(
            "Use repeatmasker to annotate contig TE families instead of minimap2"
        )
        repeatmasker_dir = os.path.join(out, "contig_te_repeatmask")
        mkdir(repeatmasker_dir)
        try:
            subprocess.call([
                "RepeatMasker",
                "-dir",
                repeatmasker_dir,
                "-gff",
                "-s",
                "-nolow",
                "-no_is",
                "-xsmall",
                "-e",
                "ncbi",
                "-lib",
                te_library,
                "-pa",
                str(thread),
                te_fa,
            ])
            contig_te_repeatmasked = os.path.join(
                repeatmasker_dir,
                os.path.basename(te_fa) + ".out.gff")
            # sanity check that the RepeatMasker gff output is readable
            with open(contig_te_repeatmasked, "r"):
                pass
        except Exception as e:
            print(e)
            print("Repeatmasking contig TE sequences failed, exiting...")
            sys.exit(1)

        ## parse and merge
        te2contig_rm = out + "/" + sample_name + ".te2contig_rm.bed"
        with open(contig_te_repeatmasked,
                  "r") as input, open(te2contig_rm, "w") as output:
            for line in input:
                if "##" not in line:
                    entry = line.replace("\n", "").split("\t")
                    contig_name = entry[0].rsplit(":", 1)[0]
                    start = entry[0].rsplit(":", 1)[1].split("-")[0]
                    end = entry[0].rsplit(":", 1)[1].split("-")[1]
                    # contigs = entry[0].replace(':', '-').split("-")
                    family = re.sub('Target "Motif:|".*', "", entry[8])
                    strand = entry[6]
                    score = entry[5]
                    out_line = "\t".join(
                        [contig_name, start, end, family, score, strand])
                    output.write(out_line + "\n")
        print("Done\n")

        contig_rm_annotation = out + "/" + sample_name + ".te2contig_rm.merge.bed"
        command = 'bedtools merge -c 4,6 -o distinct -delim "|" -i ' + te2contig_rm
        with open(contig_rm_annotation, "w") as output:
            subprocess.call(command, shell=True, stdout=output)
        # os.remove(te2contig_rm)

        # replace contig_te_annotation family with ones from RM
        contig_te_annotation_new = contig_te_annotation_sorted.replace(
            "bed", "family_reannotated.bed")
        contig_rm_family_dict = dict()
        with open(contig_rm_annotation, "r") as input:
            for line in input:
                entry = line.replace("\n", "").split("\t")
                contig_name = entry[0]
                family = entry[3]
                contig_rm_family_dict[contig_name] = family

        with open(contig_te_annotation_new,
                  "w") as output, open(contig_te_annotation_sorted,
                                       "r") as input:
            for line in input:
                entry = line.replace("\n", "").split("\t")
                contig_name = entry[0]
                contig_te_start = entry[1]
                contig_te_end = entry[2]
                if contig_name in contig_rm_family_dict:
                    contig_te_family = contig_rm_family_dict[contig_name]
                    contig_te_strand = entry[5]
                    out_line = "\t".join([
                        contig_name,
                        contig_te_start,
                        contig_te_end,
                        contig_te_family,
                        ".",
                        contig_te_strand,
                    ])
                    output.write(out_line + "\n")

        contig_te_annotation_sorted = contig_te_annotation_new

    # build frequency dict
    # NOTE: te_freq is populated here but not returned by this function;
    # main() obtains allele frequencies separately via get_af()
    te_freq = dict()
    with open(vcf_parsed, "r") as input:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            contig_name = "_".join([entry[0], entry[1], entry[2]])
            freq = entry[5]
            te_freq[contig_name] = freq

    return contig_te_annotation_sorted, te_fa
Example #8
def main():
    args = get_args()
    # logging config
    formatstr = "%(asctime)s: %(levelname)s: %(message)s"
    datestr = "%m/%d/%Y %H:%M:%S"
    logging.basicConfig(
        level=logging.DEBUG,
        filename=os.path.join(args.out, "TELR.log"),
        filemode="w",
        format=formatstr,
        datefmt=datestr,
    )
    logging.info("CMD: " + " ".join(sys.argv))
    start_time = time.time()

    # create directory for intermediate files
    tmp_dir = os.path.join(args.out, "intermediate_files")
    mkdir(tmp_dir)

    # Parse input
    sample_name = os.path.splitext(os.path.basename(args.reads))[0]
    reads, reference, library, fasta, skip_alignment = parse_input(
        args.reads, args.reference, args.library, sample_name, tmp_dir)

    # Alignment
    bam = os.path.join(tmp_dir, sample_name + "_sort.bam")
    if not skip_alignment:
        alignment(
            bam,
            fasta,
            reference,
            tmp_dir,
            sample_name,
            args.thread,
            args.aligner,
            args.presets,
        )
    else:
        sort_index_bam(reads, bam, args.thread)

    # initialize loci evaluation file
    loci_eval = os.path.join(args.out, sample_name + ".loci_eval.tsv")
    if os.path.isfile(loci_eval):
        os.remove(loci_eval)

    # Detect and parse SV
    vcf = os.path.join(tmp_dir, sample_name + ".vcf")
    detect_sv(vcf, bam, reference, tmp_dir, sample_name, args.thread)

    # Parse SV and filter for TE candidate locus
    vcf_parsed = os.path.join(tmp_dir, sample_name + ".vcf_filtered.tsv")
    vcf_parse_filter(
        vcf,
        vcf_parsed,
        bam,
        library,
        tmp_dir,
        sample_name,
        args.thread,
        loci_eval,
    )

    # Local assembly
    contig_dir = os.path.join(tmp_dir, "contig_assembly")
    merged_contigs, assembly_passed_loci = get_local_contigs(
        assembler=args.assembler,
        polisher=args.polisher,
        contig_dir=contig_dir,
        vcf_parsed=vcf_parsed,
        out=tmp_dir,
        sample_name=sample_name,
        bam=bam,
        raw_reads=fasta,
        thread=args.thread,
        presets=args.presets,
        polish_iterations=args.polish_iterations,
    )

    # Annotate contig for TE region
    contig_te_annotation, te_fa = annotate_contig(
        merged_contigs,
        assembly_passed_loci,
        library,
        vcf_parsed,
        tmp_dir,
        sample_name,
        args.thread,
        args.presets,
        args.minimap2_family,
        loci_eval,
    )

    # calculate AF
    te_freq = get_af(
        tmp_dir,
        sample_name,
        bam,
        fasta,
        contig_te_annotation,
        contig_dir,
        vcf_parsed,
        args.af_flank_interval,
        args.af_flank_offset,
        args.af_te_interval,
        args.af_te_offset,
        args.presets,
        args.thread,
    )

    # repeatmask reference genome using custom TE library
    repeatmask_ref_dir = os.path.join(tmp_dir, "ref_repeatmask")
    ref_masked, te_gff = repeatmask(
        ref=reference,
        library=library,
        outdir=repeatmask_ref_dir,
        thread=args.thread,
    )
    ref_te_bed = os.path.join(tmp_dir, os.path.basename(reference) + ".te.bed")
    if te_gff is not None:
        gff3tobed(te_gff, ref_te_bed)
    else:
        ref_te_bed = None

    # find TEs
    liftover_json = find_te(
        reference=reference,
        contigs_fa=merged_contigs,
        contig_te_bed=contig_te_annotation,
        ref_te_bed=ref_te_bed,
        out=tmp_dir,
        gap=args.gap,
        overlap=args.overlap,
        flank_len=args.flank_len,
        different_contig_name=args.different_contig_name,
        keep_files=args.keep_files,
        thread=args.thread,
    )

    # generate output files
    if liftover_json:
        generate_output(
            liftover_report_path=liftover_json,
            te_freq_dict=te_freq,
            te_fa=te_fa,
            vcf_parsed=vcf_parsed,
            contig_te_annotation=contig_te_annotation,
            contig_fa=merged_contigs,
            out=args.out,
            sample_name=sample_name,
            ref=reference,
        )
    else:
        print("No non-reference TE insertion found")
        logging.info("TELR found no non-reference TE insertions")

    # clean tmp files
    if not args.keep_files:
        shutil.rmtree(tmp_dir)
    os.remove(loci_eval)

    # export conda environment
    env_file = os.path.join(args.out, "conda_env.yml")
    export_env(env_file)

    proc_time = time.time() - start_time
    print("TELR finished!")
    logging.info("TELR finished in " + format_time(proc_time))
Example #9
def filter_vcf(ins, ins_filtered, te_library, out, sample_name, thread,
               loci_eval):
    """
    Filter insertion sequences from Sniffles VCF by repeatmasking with TE concensus
    """
    # constrct fasta from parsed vcf file
    if "+" in sample_name:
        sample_name_replace = sample_name.replace("+", "plus")
    else:
        sample_name_replace = sample_name
    ins_seqs = os.path.join(out, sample_name_replace + ".vcf_ins.fasta")
    write_ins_seqs(ins, ins_seqs)

    # get the length of the insertion sequence TODO: this can be generalized
    contig_len = dict()
    if os.path.isfile(ins_seqs):
        with open(ins_seqs, "r") as handle:
            records = SeqIO.parse(handle, "fasta")
            for record in records:
                contig_len[record.id] = len(record.seq)

    # run RepeatMasker on the insertion sequences
    repeatmasker_dir = os.path.join(out, "vcf_ins_repeatmask")
    mkdir(repeatmasker_dir)
    try:
        subprocess.call([
            "RepeatMasker",
            "-dir",
            repeatmasker_dir,
            "-gff",
            "-s",
            "-nolow",
            "-no_is",
            "-xsmall",
            "-e",
            "ncbi",
            "-lib",
            te_library,
            "-pa",
            str(thread),
            ins_seqs,
        ])
        ins_repeatmasked = os.path.join(
            repeatmasker_dir,
            os.path.basename(ins_seqs) + ".out.gff")
        # sanity check that the RepeatMasker gff output is readable
        with open(ins_repeatmasked, "r"):
            pass
    except Exception as e:
        print(e)
        print("Repeatmasking VCF insertion sequences failed, exiting...")
        sys.exit(1)

    # merge RM gff
    ins_rm_merge = os.path.join(repeatmasker_dir,
                                os.path.basename(ins_seqs) + ".out.merge.bed")
    with open(ins_rm_merge, "w") as output:
        subprocess.call(["bedtools", "merge", "-i", ins_repeatmasked],
                        stdout=output)

    # extract VCF sequences that contain TEs
    ins_te_loci = dict()
    with open(ins_rm_merge, "r") as input:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            contig_name = entry[0]
            length = int(entry[2]) - int(entry[1])
            ins_te_prop = round(length / contig_len[contig_name], 2)
            if contig_name in ins_te_loci:
                ins_te_loci[contig_name] += ins_te_prop
            else:
                ins_te_loci[contig_name] = ins_te_prop

    with open(ins, "r") as input, open(ins_filtered, "w") as output:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            contig_name = "_".join([entry[0], entry[1], entry[2]])
            # TODO: maybe add filter for insertion sequences covered by TE?
            if contig_name in ins_te_loci:
                out_line = line.replace("\n", "") + "\t" + str(
                    ins_te_loci[contig_name])
                output.write(out_line + "\n")
    # os.remove(ins_seqs)

    # report removed loci
    with open(loci_eval, "a") as output:
        for locus in create_loci_set(ins):
            if locus not in ins_te_loci:
                output.write(
                    "\t".join([locus, "VCF sequence not repeatmasked"]) + "\n")