import argparse
import logging
import os
import re
import shutil
import subprocess
import sys
import time
from multiprocessing import Pool

import pysam
from Bio import SeqIO

# NOTE: helper functions used below (mkdir, check_exist, extract_reads,
# create_fa, get_cmd_output, parse_rm_out, create_loci_set, write_ins_seqs,
# format_time) and pipeline steps (parse_input, alignment, sort_index_bam,
# detect_sv, vcf_parse_filter, run_assembly_polishing, get_af, find_te,
# gff3tobed, generate_output, export_env) are defined elsewhere in the
# TELR package.


def run_flye_polishing(
    asm_cns, reads, asm_dir, contig_name, thread, polish_iterations, presets
):
    """Run Flye polishing"""
    if presets == "pacbio":
        presets_flye = "--pacbio-raw"
    else:
        presets_flye = "--nano-raw"

    tmp_out_dir = os.path.join(asm_dir, contig_name)
    mkdir(tmp_out_dir)
    try:
        subprocess.call(
            [
                "flye",
                "--polish-target",
                asm_cns,
                presets_flye,
                reads,
                "--out-dir",
                tmp_out_dir,
                "--threads",  # Flye's full flag is --threads
                str(thread),
                "--iterations",
                str(polish_iterations),
            ]
        )
    except Exception as e:
        print(e)
        print("Polishing failed, exiting...")
        return None

    # rename contig file
    polished_contig = os.path.join(
        tmp_out_dir, "polished_" + str(polish_iterations) + ".fasta"
    )
    if check_exist(polished_contig):
        os.rename(polished_contig, asm_cns)
        shutil.rmtree(tmp_out_dir)
        return asm_cns
    else:
        return None
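
# Usage sketch for run_flye_polishing (hypothetical file names; assumes the
# flye binary is on PATH and the consensus contig and read files exist):
#
#   polished = run_flye_polishing(
#       asm_cns="assembly/chr2L_100_200.cns.fa",
#       reads="sv_reads/chr2L_100_200.reads.fa",
#       asm_dir="assembly",
#       contig_name="chr2L_100_200",
#       thread=4,
#       polish_iterations=1,
#       presets="ont",
#   )
#   # returns the path to the polished consensus, or None on failure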

def repeatmask(ref, library, outdir, thread):
    mkdir(outdir)
    try:
        subprocess.call(
            [
                "RepeatMasker",
                "-dir",
                outdir,
                "-gff",
                "-s",
                "-nolow",
                "-no_is",
                "-e",
                "ncbi",
                "-lib",
                library,
                "-pa",
                str(thread),
                ref,
            ]
        )
        ref_rm = os.path.join(outdir, os.path.basename(ref) + ".masked")
        gff = os.path.join(outdir, os.path.basename(ref) + ".out.gff")
        gff3 = os.path.join(outdir, os.path.basename(ref) + ".out.gff3")
        if not os.path.isfile(ref_rm):
            # RepeatMasker writes no .masked file when nothing was masked;
            # scan the .out report to distinguish "no repeats" from a failure
            ref_rm_out = os.path.join(outdir, os.path.basename(ref) + ".out")
            no_repeats = False
            with open(ref_rm_out, "r") as input:
                for line in input:
                    if "There were no repetitive sequences detected" in line:
                        no_repeats = True
                        break
            if no_repeats:
                print("No repetitive sequences detected")
                ref_rm = ref
                gff = None
                gff3 = None
            else:
                raise Exception("Repeatmasking failed, exiting...")
        else:
            parse_rm_out(gff, gff3)
            open(ref_rm, "r")
    except Exception as e:
        print(e)
        print("Repeatmasking failed, exiting...")
        sys.exit(1)
    return ref_rm, gff3
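
# Example of the RepeatMasker outputs this function consumes, for a
# hypothetical reference "dm6.fasta" masked into outdir "ref_repeatmask/":
#
#   ref_repeatmask/dm6.fasta.masked   # masked genome (absent if no repeats found)
#   ref_repeatmask/dm6.fasta.out      # text report, scanned above for the
#                                     #   "no repetitive sequences" message
#   ref_repeatmask/dm6.fasta.out.gff  # repeat annotation; presumably converted
#                                     #   to .out.gff3 by parse_rm_out()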

def run_flye_assembly(sv_reads, asm_dir, contig_name, thread, presets):
    """Run Flye assembly"""
    if presets == "pacbio":
        presets_flye = "--pacbio-raw"
    else:
        presets_flye = "--nano-raw"

    tmp_out_dir = os.path.join(asm_dir, contig_name)
    mkdir(tmp_out_dir)
    try:
        subprocess.call(
            [
                "flye",
                presets_flye,
                sv_reads,
                "--out-dir",
                tmp_out_dir,
                "--threads",  # Flye's full flag is --threads
                str(thread),
                "--iterations",
                "0",  # skip polishing here; it is run as a separate step
            ]
        )
    except Exception as e:
        print(e)
        print("Assembly failed, exiting...")
        return None

    # rename contigs
    contig_path = os.path.join(tmp_out_dir, "assembly.fasta")
    contig_path_new = os.path.join(asm_dir, contig_name + ".cns.fa")
    if check_exist(contig_path):
        os.rename(contig_path, contig_path_new)
        # remove tmp files
        shutil.rmtree(tmp_out_dir)
        return contig_path_new
    else:
        print("assembly failed")
        return None

def prep_assembly_inputs(
    vcf_parsed, out, sample_name, bam, raw_reads, reads_dir, read_type="sv"
):
    """Prepare reads for local assembly"""
    # logging.info("Prepare reads for local assembly")
    if read_type == "sv":
        # extract supporting-read IDs listed in the parsed VCF
        # (column 9 holds the comma-separated Sniffles read names)
        read_ids = os.path.join(out, sample_name + ".id")
        with open(vcf_parsed, "r") as input, open(read_ids, "w") as output:
            for line in input:
                entry = line.replace("\n", "").split("\t")
                read_list = entry[8].split(",")
                for read in read_list:
                    output.write(read + "\n")
    else:
        # TODO: think about using this for assembly, filter for cigar reads
        window = 1000
        samfile = pysam.AlignmentFile(bam, "rb")
        read_ids = os.path.join(out, sample_name + ".id")
        vcf_parsed_new = vcf_parsed + ".new"
        with open(vcf_parsed, "r") as input, open(read_ids, "w") as output, open(
            vcf_parsed_new, "w"
        ) as VCF:
            for line in input:
                entry = line.replace("\n", "").split("\t")
                # get sniffles read list
                read_list = entry[8].split(",")
                reads_sniffles = set(read_list)

                ins_chr = entry[0]
                ins_breakpoint = round((int(entry[1]) + int(entry[2])) / 2)
                start = ins_breakpoint - window
                end = ins_breakpoint + window
                reads = set()
                # coverage = 0
                for read in samfile.fetch(ins_chr, start, end):
                    reads.add(read.query_name)
                for read in reads:
                    output.write(read + "\n")
                # write
                out_line = line.replace("\n", "") + "\t" + str(len(reads))
                VCF.write(out_line + "\n")
            vcf_parsed = vcf_parsed_new

    # generate unique ID list
    read_ids_unique = read_ids + ".unique"
    command = "cat " + read_ids + " | sort | uniq"
    with open(read_ids_unique, "w") as output:
        subprocess.call(command, stdout=output, shell=True)

    # filter raw reads using read list
    subset_fa = os.path.join(out, sample_name + ".subset.fa")
    command = "seqtk subseq " + raw_reads + " " + read_ids_unique + " | seqtk seq -a"
    with open(subset_fa, "w") as output:
        subprocess.call(command, stdout=output, shell=True)

    # reorder reads
    subset_fa_reorder = out + "/" + sample_name + ".subset.reorder.fa"
    extract_reads(subset_fa, read_ids, subset_fa_reorder)

    # separate reads into multiple files, using csplit
    mkdir(reads_dir)
    csplit_prefix = reads_dir + "/contig"
    m = []
    k = 1
    with open(vcf_parsed, "r") as input:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            if read_type == "sv":
                k = k + 2 * (len(entry[8].split(",")))
            else:
                k = k + 2 * int(entry[14])
            m.append(k)
    if len(m) == 1:
        subprocess.call(["cp", subset_fa_reorder, reads_dir + "/contig0"])
    elif len(m) == 0:
        print("No insertion detected, exiting...")
    else:
        m = m[:-1]
        index = " ".join(str(i) for i in m)
        command = (
            "csplit -s -f " + csplit_prefix + " -n 1 " + subset_fa_reorder + " " + index
        )
        subprocess.call(command, shell=True)

    # remove tmp files
    os.remove(read_ids)
    os.remove(read_ids_unique)
    os.remove(subset_fa)
    os.remove(subset_fa_reorder)
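
# Worked example of the csplit index arithmetic above: the reordered reads are
# written as 2-line FASTA records (header + sequence), so a locus supported by
# n reads occupies 2n lines. With three loci supported by 3, 1, and 2 reads,
# the cumulative line counters are m = [7, 9, 13]; dropping the last element
# gives the split points "7 9", and
#
#   csplit -s -f <reads_dir>/contig -n 1 <subset_fa_reorder> 7 9
#
# produces contig0 (lines 1-6), contig1 (lines 7-8), and contig2 (lines 9-13),
# i.e. one read file per insertion locus.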

def get_local_contigs(
    assembler,
    polisher,
    contig_dir,
    vcf_parsed,
    out,
    sample_name,
    bam,
    raw_reads,
    thread,
    presets,
    polish_iterations,
):
    """Perform local assembly using reads from parsed VCF file in parallel"""
    # Prepare reads used for local assembly and polishing
    sv_reads_dir = os.path.join(out, "sv_reads")
    try:
        prep_assembly_inputs(
            vcf_parsed, out, sample_name, bam, raw_reads, sv_reads_dir, read_type="sv"
        )
    except Exception as e:
        print(e)
        print("Prepare local assembly input data failed, exiting...")
        sys.exit(1)

    mkdir(contig_dir)
    k = 0
    asm_pa_list = []
    with open(vcf_parsed, "r") as input:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            contig_name = "_".join([entry[0], entry[1], entry[2]])
            # rename variant reads
            sv_reads = sv_reads_dir + "/contig" + str(k)
            sv_reads_rename = sv_reads_dir + "/" + contig_name + ".reads.fa"
            os.rename(sv_reads, sv_reads_rename)
            thread_asm = 1
            asm_pa = [
                sv_reads_rename,
                contig_dir,
                contig_name,
                thread_asm,
                presets,
                assembler,
                polisher,
                polish_iterations,
            ]
            asm_pa_list.append(asm_pa)
            k = k + 1

    # run assembly in parallel
    logging.info("Perform local assembly of non-reference TE loci...")
    start_time = time.time()
    try:
        pool = Pool(processes=thread)
        contig_list = pool.map(run_assembly_polishing, asm_pa_list)
        pool.close()
        pool.join()
    except Exception as e:
        print(e)
        print("Local assembly failed, exiting...")
        sys.exit(1)
    proc_time = time.time() - start_time

    # merge all contigs
    assembly_passed_loci = set()
    merged_contigs = os.path.join(out, sample_name + ".contigs.fa")
    with open(merged_contigs, "w") as merged_output_handle:
        for contig in contig_list:
            if check_exist(contig):
                contig_name = os.path.basename(contig).replace(".cns.fa", "")
                assembly_passed_loci.add(contig_name)
                parsed_contig = os.path.join(contig_dir, contig_name + ".cns.ctg1.fa")
                with open(contig, "r") as input:
                    records = SeqIO.parse(input, "fasta")
                    for record in records:
                        if record.id == "ctg1" or record.id == "contig_1":
                            record.id = contig_name
                            record.description = "len=" + str(len(record.seq))
                            SeqIO.write(record, merged_output_handle, "fasta")
                            with open(parsed_contig, "w") as parsed_output_handle:
                                SeqIO.write(record, parsed_output_handle, "fasta")

    logging.info("Local assembly finished in " + format_time(proc_time))
    return merged_contigs, assembly_passed_loci
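
# The worker consumed by Pool.map above, run_assembly_polishing, is defined
# elsewhere in the package. A minimal comment sketch of the contract implied
# by the asm_pa list (illustrative only, assuming Flye handles both steps):
#
#   def run_assembly_polishing(args):
#       reads, asm_dir, contig_name, thread, presets, assembler, polisher, iterations = args
#       contig = run_flye_assembly(reads, asm_dir, contig_name, thread, presets)
#       if contig is not None and iterations > 0:
#           contig = run_flye_polishing(
#               contig, reads, asm_dir, contig_name, thread, iterations, presets
#           )
#       return contig  # path to <contig_name>.cns.fa, or None on failure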

def get_args():
    parser = argparse.ArgumentParser(
        description="Program for detecting non-reference TEs in long read data"
    )
    optional = parser._action_groups.pop()
    required = parser.add_argument_group("required arguments")

    # required
    required.add_argument(
        "-i",
        "--reads",
        type=str,
        help="reads in fasta/fastq format or read alignments in bam format",
        required=True,
    )
    required.add_argument(
        "-r",
        "--reference",
        type=str,
        help="reference genome in fasta format",
        required=True,
    )
    required.add_argument(
        "-l",
        "--library",
        type=str,
        help="TE consensus sequences in fasta format",
        required=True,
    )

    # optional
    optional.add_argument(
        "--aligner",
        type=str,
        help="choose method for read alignment, please provide 'nglmr' or 'minimap2' (default = 'nglmr')",
        required=False,
    )
    optional.add_argument(
        "--assembler",
        type=str,
        help="choose method for the local contig assembly step, please provide 'wtdbg2' or 'flye' (default = 'wtdbg2')",
        required=False,
    )
    optional.add_argument(
        "--polisher",
        type=str,
        help="choose method for the local contig polishing step, please provide 'wtdbg2' or 'flye' (default = 'wtdbg2')",
        required=False,
    )
    optional.add_argument(
        "-x",
        "--presets",
        type=str,
        help="parameter presets for different sequencing technologies, please provide 'pacbio' or 'ont' (default = 'pacbio')",
        required=False,
    )
    optional.add_argument(
        "-p",
        "--polish_iterations",
        type=int,
        help="iterations of contig polishing (default = 1)",
        required=False,
    )
    optional.add_argument(
        "-o",
        "--out",
        type=str,
        help="directory to output data (default = '.')",
        required=False,
    )
    optional.add_argument(
        "-t",
        "--thread",
        type=int,
        help="max cpu threads to use (default = '1')",
        required=False,
    )
    optional.add_argument(
        "-g",
        "--gap",
        type=int,
        help="max gap size for flanking sequence alignment (default = '20')",
        required=False,
    )
    optional.add_argument(
        "-v",
        "--overlap",
        type=int,
        help="max overlap size for flanking sequence alignment (default = '20')",
        required=False,
    )
    optional.add_argument(
        "--flank_len",
        type=int,
        help="flanking sequence length (default = '500')",
        required=False,
    )
    optional.add_argument(
        "--af_flank_interval",
        type=int,
        help="5' and 3' flanking sequence interval size used for allele frequency estimation (default = '100')",
        required=False,
    )
    optional.add_argument(
        "--af_flank_offset",
        type=int,
        help="5' and 3' flanking sequence offset size used for allele frequency estimation (default = '200')",
        required=False,
    )
    optional.add_argument(
        "--af_te_interval",
        type=int,
        help="5' and 3' TE sequence interval size used for allele frequency estimation (default: '50')",
        required=False,
    )
    optional.add_argument(
        "--af_te_offset",
        type=int,
        help="5' and 3' TE sequence offset size used for allele frequency estimation (default: '50')",
        required=False,
    )
    optional.add_argument(
        "--different_contig_name",
        action="store_true",
        help="if provided then TELR does not require the contig name to match before and after annotation liftover (default: require contig name to be the same before and after liftover)",
        required=False,
    )
    optional.add_argument(
        "--minimap2_family",
        action="store_true",
        help="if provided then minimap2 will be used to annotate TE families in the assembled contigs (default: use RepeatMasker for contig TE annotation)",
        required=False,
    )
    optional.add_argument(
        "-k",
        "--keep_files",
        action="store_true",
        help="if provided then all intermediate files will be kept (default: remove intermediate files)",
        required=False,
    )
    parser._action_groups.append(optional)
    args = parser.parse_args()

    # check if input files exist
    try:
        test = open(args.reads, "r")
    except Exception as e:
        print(e)
        logging.exception("Can not open input file: " + args.reads)
        sys.exit(1)
    try:
        test = open(args.reference, "r")
    except Exception as e:
        print(e)
        logging.exception("Can not open input file: " + args.reference)
        sys.exit(1)
    try:
        test = open(args.library, "r")
    except Exception as e:
        print(e)
        logging.exception("Can not open input file: " + args.library)
        sys.exit(1)

    # check if optional arguments are valid
    if args.aligner is None:
        args.aligner = "nglmr"
    elif args.aligner not in ["nglmr", "minimap2"]:
        print("Please provide a valid alignment method (nglmr/minimap2), exiting...")
        sys.exit(1)

    if args.assembler is None:
        args.assembler = "wtdbg2"
    elif args.assembler not in ["wtdbg2", "flye"]:
        print("Please provide a valid assembly method (wtdbg2/flye), exiting...")
        sys.exit(1)

    if args.polisher is None:
        args.polisher = "wtdbg2"
    elif args.polisher not in ["wtdbg2", "flye"]:
        print("Please provide a valid polish method (wtdbg2/flye), exiting...")
        sys.exit(1)

    if args.presets is None:
        args.presets = "pacbio"
    elif args.presets not in ["pacbio", "ont"]:
        print("Please provide a valid preset option (pacbio/ont), exiting...")
        sys.exit(1)

    if args.polish_iterations is None:
        args.polish_iterations = 1
    elif args.polish_iterations < 1:
        print("Please provide a valid number of iterations for polishing, exiting...")
        sys.exit(1)

    # set up output directory
    if args.out is None:
        args.out = "."
    args.out = os.path.abspath(args.out)
    mkdir(args.out)

    if args.thread is None:
        args.thread = 1

    if args.flank_len is None:
        args.flank_len = 500

    if args.af_flank_interval is None:
        args.af_flank_interval = 100
    elif args.af_flank_interval <= 0:
        print(
            "Please provide a valid flanking sequence interval size (positive integer) for allele frequency estimation, exiting..."
        )
        sys.exit(1)

    if args.af_flank_offset is None:
        args.af_flank_offset = 200
    elif args.af_flank_offset < 0:
        print(
            "Please provide a valid flanking sequence offset size (non-negative integer) for allele frequency estimation, exiting..."
        )
        sys.exit(1)

    if args.af_te_interval is None:
        args.af_te_interval = 50
    elif args.af_te_interval <= 0:
        print(
            "Please provide a valid TE interval size (positive integer) for allele frequency estimation, exiting..."
        )
        sys.exit(1)

    if args.af_te_offset is None:
        args.af_te_offset = 50
    elif args.af_te_offset < 0:
        print(
            "Please provide a valid TE offset size (non-negative integer) for allele frequency estimation, exiting..."
        )
        sys.exit(1)

    if args.gap is None:
        args.gap = 20

    if args.overlap is None:
        args.overlap = 20

    return args
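
# Example invocation of the resulting CLI (hypothetical file names; invoked
# here as "telr", though the package's actual console entry point may differ):
#
#   telr -i sample.fastq -r dm6.fasta -l te_library.fasta \
#       -x ont --aligner minimap2 --assembler flye --polisher flye \
#       -p 2 -t 8 -o telr_out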

def annotate_contig(
    contigs,
    assembly_passed_loci,
    te_library,
    vcf_parsed,
    out,
    sample_name,
    thread,
    presets,
    minimap2_family,
    loci_eval,
):
    logging.info("Annotate contigs...")
    if presets == "pacbio":
        minimap2_presets = "map-pb"
    else:
        minimap2_presets = "map-ont"

    # map VCF insertion sequences to contigs
    vcf_seq2contig_out = os.path.join(out, "seq2contig.paf")
    # if os.path.isfile(vcf_seq2contig_out):
    #     os.remove(vcf_seq2contig_out)

    # TODO: consider that some contigs might not exist
    seq2contig_passed_loci = set()
    vcf_seq2contig_dir = os.path.join(out, "vcf_seq2contig")
    mkdir(vcf_seq2contig_dir)
    with open(vcf_parsed, "r") as input, open(vcf_seq2contig_out, "w") as output:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            contig_name = "_".join([entry[0], entry[1], entry[2]])
            if contig_name in assembly_passed_loci:
                vcf_seq = entry[7]
                query = os.path.join(vcf_seq2contig_dir, contig_name + ".seq.fa")
                create_fa(contig_name, vcf_seq, query)
                subject = os.path.join(vcf_seq2contig_dir, contig_name + ".contig.fa")
                # TODO: this can be replaced
                with open(subject, "w") as subject_output_handle:
                    try:
                        # check_call (rather than call) so a missing contig
                        # raises CalledProcessError and the locus is skipped
                        subprocess.check_call(
                            ["samtools", "faidx", contigs, contig_name],
                            stdout=subject_output_handle,
                        )
                    except subprocess.CalledProcessError:
                        print(contig_name + ": contig assembly doesn't exist")
                        continue
                cmd = [
                    "minimap2",
                    "-cx",
                    minimap2_presets,
                    "--secondary=no",
                    "-v",
                    "0",
                    subject,
                    query,
                ]
                vcf_seq2contig_output = get_cmd_output(cmd)
                if vcf_seq2contig_output != "":
                    output.write(vcf_seq2contig_output)
                    seq2contig_passed_loci.add(contig_name)
                os.remove(query)
                os.remove(subject)
    os.rmdir(vcf_seq2contig_dir)

    # convert to bed format
    # (PAF columns: 0=query name, 4=strand, 5=target name,
    #  7/8=target start/end, 11=MAPQ)
    seq2contig_bed = os.path.join(out, "seq2contig.bed")
    with open(vcf_seq2contig_out, "r") as input, open(seq2contig_bed, "w") as output:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            bed_line = "\t".join(
                [entry[0], entry[7], entry[8], entry[5], entry[11], entry[4]]
            )
            output.write(bed_line + "\n")

    # # report ins-contig failed loci
    # with open(loci_eval, "a") as output:
    #     for locus in assembly_passed_loci:
    #         if locus not in seq2contig_passed_loci:
    #             output.write(
    #                 "\t".join(
    #                     [locus, "Sniffles VCF sequence not mapped to assembled contig"]
    #                 )
    #                 + "\n"
    #             )

    # map TE library to contigs using minimap2
    # TE-contig alignment
    te2contig_out = os.path.join(out, sample_name + ".te2contig.paf")
    if os.path.isfile(te2contig_out):
        os.remove(te2contig_out)
    for locus in seq2contig_passed_loci:
        contig_fa = os.path.join(out, locus + ".fa")
        with open(contig_fa, "w") as output:
            subprocess.call(["samtools", "faidx", contigs, locus], stdout=output)
        # map TE library to contig using minimap2
        with open(te2contig_out, "a") as output:
            subprocess.call(
                [
                    "minimap2",
                    "-cx",
                    minimap2_presets,
                    contig_fa,
                    te_library,
                    "-v",
                    "0",
                    "-t",
                    str(thread),
                ],
                stdout=output,
            )
        os.remove(contig_fa)

    # convert to bed format (here the contig is the PAF target and the TE
    # family is the query, so columns 5 and 0 swap relative to seq2contig)
    te2contig_bed = os.path.join(out, sample_name + ".te2contig.bed")
    with open(te2contig_out, "r") as input, open(te2contig_bed, "w") as output:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            bed_line = "\t".join(
                [entry[5], entry[7], entry[8], entry[0], entry[11], entry[4]]
            )
            output.write(bed_line + "\n")

    # use VCF sequence alignment to filter minimap2 TE-contig alignment
    te2contig_filter_raw = os.path.join(out, sample_name + ".te2contig_filter.tsv")
    with open(te2contig_filter_raw, "w") as output:
        subprocess.call(
            [
                "bedtools",
                "intersect",
                "-a",
                te2contig_bed,
                "-b",
                seq2contig_bed,
                "-wao",
            ],
            stdout=output,
        )

    # filter and merge
    # get rid of -1 and make it into bed format
    te2contig_filter_tmp_bed = os.path.join(
        out, sample_name + ".te2contig_filter.tmp.bed"
    )
    with open(te2contig_filter_raw, "r") as input, open(
        te2contig_filter_tmp_bed, "w"
    ) as output:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            # the overlap between VCF sequence alignment and TE-contig
            # alignment has to be over 10bp (column 13 of -wao output)
            if int(entry[12]) > 10:
                out_line = "\t".join(
                    [entry[0], entry[1], entry[2], entry[3], entry[4], entry[5]]
                )
                output.write(out_line + "\n")

    # sort
    # TODO: package this part, hide variables
    te2contig_filter_tmp_sort_bed = (
        out + "/" + sample_name + ".te2contig_filter.tmp.sort.bed"
    )
    command = "bedtools sort -i " + te2contig_filter_tmp_bed
    with open(te2contig_filter_tmp_sort_bed, "w") as output:
        subprocess.call(command, shell=True, stdout=output)

    # find out what's filtered out
    seq_mm2_overlap_loci = set()
    with open(te2contig_filter_tmp_sort_bed, "r") as input:
        for line in input:
            seq_mm2_overlap_loci.add(line.split("\t")[0])
    # seq_mm2_overlap_loci = create_loci_set(te2contig_filter_tmp_sort_bed)
    with open(loci_eval, "a") as output:
        for locus in seq2contig_passed_loci:
            if locus not in seq_mm2_overlap_loci:
                output.write(
                    "\t".join([locus, "VCF sequence doesn't overlap contig annotation"])
                    + "\n"
                )

    # merge
    contig_te_annotation_tmp = out + "/" + sample_name + ".te2contig_filter.bed.tmp"
    command = (
        'bedtools merge -d 10000 -c 4,6 -o distinct,distinct -delim "|" -i '
        + te2contig_filter_tmp_sort_bed
    )
    with open(contig_te_annotation_tmp, "w") as output:
        subprocess.call(command, shell=True, stdout=output)

    contig_te_annotation = out + "/" + sample_name + ".te2contig_filter.bed"
    with open(contig_te_annotation_tmp, "r") as input, open(
        contig_te_annotation, "w"
    ) as output:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            contig_name = entry[0]
            contig_te_start = entry[1]
            contig_te_end = entry[2]
            contig_te_family = entry[3]
            contig_te_strand = entry[4]
            # merged features on mixed strands get "." instead of "+|-"
            if contig_te_strand != "+" and contig_te_strand != "-":
                contig_te_strand = "."
            out_line = "\t".join(
                [
                    contig_name,
                    contig_te_start,
                    contig_te_end,
                    contig_te_family,
                    ".",
                    contig_te_strand,
                ]
            )
            output.write(out_line + "\n")

    contig_te_annotation_sorted = (
        out + "/" + sample_name + ".te2contig_filter_sort.bed"
    )
    command = "bedtools sort -i " + contig_te_annotation
    with open(contig_te_annotation_sorted, "w") as output:
        subprocess.call(command, shell=True, stdout=output)
    # seq_mm2_overlap_merge_loci = create_loci_set(contig_te_annotation)

    # remove tmp files
    os.remove(te2contig_bed)
    os.remove(te2contig_out)
    os.remove(seq2contig_bed)
    os.remove(te2contig_filter_raw)
    os.remove(te2contig_filter_tmp_bed)
    os.remove(te2contig_filter_tmp_sort_bed)
    os.remove(contig_te_annotation)

    # extract sequence and RepeatMask
    if "+" in sample_name:
        sample_name_replace = sample_name.replace("+", "plus")
    else:
        sample_name_replace = sample_name
    te_fa = out + "/" + sample_name_replace + ".te.fa"
    with open(te_fa, "w") as output:
        subprocess.call(
            [
                "bedtools",
                "getfasta",
                "-fi",
                contigs,
                "-bed",
                contig_te_annotation_sorted,
            ],
            stdout=output,
        )

    if not minimap2_family:
        print("Use RepeatMasker to annotate contig TE families instead of minimap2")
        repeatmasker_dir = os.path.join(out, "contig_te_repeatmask")
        mkdir(repeatmasker_dir)
        try:
            subprocess.call(
                [
                    "RepeatMasker",
                    "-dir",
                    repeatmasker_dir,
                    "-gff",
                    "-s",
                    "-nolow",
                    "-no_is",
                    "-xsmall",
                    "-e",
                    "ncbi",
                    "-lib",
                    te_library,
                    "-pa",
                    str(thread),
                    te_fa,
                ]
            )
            contig_te_repeatmasked = os.path.join(
                repeatmasker_dir, os.path.basename(te_fa) + ".out.gff"
            )
            open(contig_te_repeatmasked, "r")
        except Exception as e:
            print(e)
            print("Repeatmasking contig TE sequences failed, exiting...")
            sys.exit(1)

        # parse and merge
        te2contig_rm = out + "/" + sample_name + ".te2contig_rm.bed"
        with open(contig_te_repeatmasked, "r") as input, open(
            te2contig_rm, "w"
        ) as output:
            for line in input:
                if "##" not in line:
                    entry = line.replace("\n", "").split("\t")
                    # sequence names look like "<contig>:<start>-<end>"
                    # (produced by bedtools getfasta above)
                    contig_name = entry[0].rsplit(":", 1)[0]
                    start = entry[0].rsplit(":", 1)[1].split("-")[0]
                    end = entry[0].rsplit(":", 1)[1].split("-")[1]
                    # contigs = entry[0].replace(':', '-').split("-")
                    family = re.sub('Target "Motif:|".*', "", entry[8])
                    strand = entry[6]
                    score = entry[5]
                    out_line = "\t".join(
                        [contig_name, start, end, family, score, strand]
                    )
                    output.write(out_line + "\n")
        print("Done\n")

        contig_rm_annotation = out + "/" + sample_name + ".te2contig_rm.merge.bed"
        command = 'bedtools merge -c 4,6 -o distinct -delim "|" -i ' + te2contig_rm
        with open(contig_rm_annotation, "w") as output:
            subprocess.call(command, shell=True, stdout=output)
        # os.remove(te2contig_rm)

        # replace contig_te_annotation family with ones from RepeatMasker
        contig_te_annotation_new = contig_te_annotation_sorted.replace(
            "bed", "family_reannotated.bed"
        )
        contig_rm_family_dict = dict()
        with open(contig_rm_annotation, "r") as input:
            for line in input:
                entry = line.replace("\n", "").split("\t")
                contig_name = entry[0]
                family = entry[3]
                contig_rm_family_dict[contig_name] = family

        with open(contig_te_annotation_new, "w") as output, open(
            contig_te_annotation_sorted, "r"
        ) as input:
            for line in input:
                entry = line.replace("\n", "").split("\t")
                contig_name = entry[0]
                contig_te_start = entry[1]
                contig_te_end = entry[2]
                if contig_name in contig_rm_family_dict:
                    contig_te_family = contig_rm_family_dict[contig_name]
                    contig_te_strand = entry[5]
                    out_line = "\t".join(
                        [
                            contig_name,
                            contig_te_start,
                            contig_te_end,
                            contig_te_family,
                            ".",
                            contig_te_strand,
                        ]
                    )
                    output.write(out_line + "\n")
        contig_te_annotation_sorted = contig_te_annotation_new

    # build frequency dict (currently unused here; allele frequencies are
    # recomputed by get_af() in main)
    te_freq = dict()
    with open(vcf_parsed, "r") as input:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            contig_name = "_".join([entry[0], entry[1], entry[2]])
            freq = entry[5]
            te_freq[contig_name] = freq

    return contig_te_annotation_sorted, te_fa
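
# Example line of the final contig TE annotation BED returned above
# (hypothetical values):
#
#   chr2L_100_200   1050    5470    roo     .       +
#
# columns: contig (locus) name, TE start/end on the contig, TE family
# (families joined with "|" when intervals were merged), placeholder score,
# and strand ("." when merged features disagree).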

def main():
    args = get_args()

    # logging config
    formatstr = "%(asctime)s: %(levelname)s: %(message)s"
    datestr = "%m/%d/%Y %H:%M:%S"
    logging.basicConfig(
        level=logging.DEBUG,
        filename=os.path.join(args.out, "TELR.log"),
        filemode="w",
        format=formatstr,
        datefmt=datestr,
    )
    logging.info("CMD: " + " ".join(sys.argv))
    start_time = time.time()

    # create directory for intermediate files
    tmp_dir = os.path.join(args.out, "intermediate_files")
    mkdir(tmp_dir)

    # Parse input
    sample_name = os.path.splitext(os.path.basename(args.reads))[0]
    reads, reference, library, fasta, skip_alignment = parse_input(
        args.reads, args.reference, args.library, sample_name, tmp_dir
    )

    # Alignment
    bam = os.path.join(tmp_dir, sample_name + "_sort.bam")
    if not skip_alignment:
        alignment(
            bam,
            fasta,
            reference,
            tmp_dir,
            sample_name,
            args.thread,
            args.aligner,
            args.presets,
        )
    else:
        sort_index_bam(reads, bam, args.thread)

    # initialize loci evaluation file
    loci_eval = os.path.join(args.out, sample_name + ".loci_eval.tsv")
    if os.path.isfile(loci_eval):
        os.remove(loci_eval)

    # Detect and parse SV
    vcf = os.path.join(tmp_dir, sample_name + ".vcf")
    detect_sv(vcf, bam, reference, tmp_dir, sample_name, args.thread)

    # Parse SV and filter for TE candidate loci
    vcf_parsed = os.path.join(tmp_dir, sample_name + ".vcf_filtered.tsv")
    vcf_parse_filter(
        vcf,
        vcf_parsed,
        bam,
        library,
        tmp_dir,
        sample_name,
        args.thread,
        loci_eval,
    )

    # Local assembly
    contig_dir = os.path.join(tmp_dir, "contig_assembly")
    merged_contigs, assembly_passed_loci = get_local_contigs(
        assembler=args.assembler,
        polisher=args.polisher,
        contig_dir=contig_dir,
        vcf_parsed=vcf_parsed,
        out=tmp_dir,
        sample_name=sample_name,
        bam=bam,
        raw_reads=fasta,
        thread=args.thread,
        presets=args.presets,
        polish_iterations=args.polish_iterations,
    )

    # Annotate contig for TE region
    contig_te_annotation, te_fa = annotate_contig(
        merged_contigs,
        assembly_passed_loci,
        library,
        vcf_parsed,
        tmp_dir,
        sample_name,
        args.thread,
        args.presets,
        args.minimap2_family,
        loci_eval,
    )

    # calculate allele frequency
    te_freq = get_af(
        tmp_dir,
        sample_name,
        bam,
        fasta,
        contig_te_annotation,
        contig_dir,
        vcf_parsed,
        args.af_flank_interval,
        args.af_flank_offset,
        args.af_te_interval,
        args.af_te_offset,
        args.presets,
        args.thread,
    )

    # repeatmask reference genome using custom TE library
    repeatmask_ref_dir = os.path.join(tmp_dir, "ref_repeatmask")
    ref_masked, te_gff = repeatmask(
        ref=reference,
        library=library,
        outdir=repeatmask_ref_dir,
        thread=args.thread,
    )
    ref_te_bed = os.path.join(tmp_dir, os.path.basename(reference) + ".te.bed")
    if te_gff is not None:
        gff3tobed(te_gff, ref_te_bed)
    else:
        ref_te_bed = None

    # find TEs
    liftover_json = find_te(
        reference=reference,
        contigs_fa=merged_contigs,
        contig_te_bed=contig_te_annotation,
        ref_te_bed=ref_te_bed,
        out=tmp_dir,
        gap=args.gap,
        overlap=args.overlap,
        flank_len=args.flank_len,
        different_contig_name=args.different_contig_name,
        keep_files=args.keep_files,
        thread=args.thread,
    )

    # generate output files
    if liftover_json:
        generate_output(
            liftover_report_path=liftover_json,
            te_freq_dict=te_freq,
            te_fa=te_fa,
            vcf_parsed=vcf_parsed,
            contig_te_annotation=contig_te_annotation,
            contig_fa=merged_contigs,
            out=args.out,
            sample_name=sample_name,
            ref=reference,
        )
    else:
        print("No non-reference TE insertion found")
        logging.info("TELR found no non-reference TE insertions")

    # clean tmp files
    if not args.keep_files:
        shutil.rmtree(tmp_dir)
        os.remove(loci_eval)

    # export conda environment
    env_file = os.path.join(args.out, "conda_env.yml")
    export_env(env_file)

    proc_time = time.time() - start_time
    print("TELR finished!")
    logging.info("TELR finished in " + format_time(proc_time))

def filter_vcf(ins, ins_filtered, te_library, out, sample_name, thread, loci_eval):
    """
    Filter insertion sequences from Sniffles VCF by repeatmasking with TE consensus
    """
    # construct fasta from parsed vcf file
    if "+" in sample_name:
        sample_name_replace = sample_name.replace("+", "plus")
    else:
        sample_name_replace = sample_name
    ins_seqs = os.path.join(out, sample_name_replace + ".vcf_ins.fasta")
    write_ins_seqs(ins, ins_seqs)

    # get the length of the insertion sequence TODO: this can be generalized
    contig_len = dict()
    if os.path.isfile(ins_seqs):
        with open(ins_seqs, "r") as handle:
            records = SeqIO.parse(handle, "fasta")
            for record in records:
                contig_len[record.id] = len(record.seq)

    # run RepeatMasker on the insertion sequences
    repeatmasker_dir = os.path.join(out, "vcf_ins_repeatmask")
    mkdir(repeatmasker_dir)
    try:
        subprocess.call(
            [
                "RepeatMasker",
                "-dir",
                repeatmasker_dir,
                "-gff",
                "-s",
                "-nolow",
                "-no_is",
                "-xsmall",
                "-e",
                "ncbi",
                "-lib",
                te_library,
                "-pa",
                str(thread),
                ins_seqs,
            ]
        )
        ins_repeatmasked = os.path.join(
            repeatmasker_dir, os.path.basename(ins_seqs) + ".out.gff"
        )
        open(ins_repeatmasked, "r")
    except Exception as e:
        print(e)
        print("Repeatmasking VCF insertion sequences failed, exiting...")
        sys.exit(1)

    # merge RepeatMasker gff
    ins_rm_merge = os.path.join(
        repeatmasker_dir, os.path.basename(ins_seqs) + ".out.merge.bed"
    )
    with open(ins_rm_merge, "w") as output:
        subprocess.call(["bedtools", "merge", "-i", ins_repeatmasked], stdout=output)

    # extract VCF sequences that contain TEs
    ins_te_loci = dict()
    with open(ins_rm_merge, "r") as input:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            contig_name = entry[0]
            length = int(entry[2]) - int(entry[1])
            # proportion of the insertion sequence covered by this TE interval
            ins_te_prop = round(length / contig_len[contig_name], 2)
            if contig_name in ins_te_loci:
                ins_te_loci[contig_name] = ins_te_loci[contig_name] + ins_te_prop
            else:
                ins_te_loci[contig_name] = ins_te_prop

    with open(ins, "r") as input, open(ins_filtered, "w") as output:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            contig_name = "_".join([entry[0], entry[1], entry[2]])
            # TODO: maybe add filter for insertion sequences covered by TE?
            if contig_name in ins_te_loci:
                out_line = (
                    line.replace("\n", "") + "\t" + str(ins_te_loci[contig_name])
                )
                output.write(out_line + "\n")
    # os.remove(ins_seqs)

    # report removed loci
    with open(loci_eval, "a") as output:
        for locus in create_loci_set(ins):
            if locus not in ins_te_loci:
                output.write(
                    "\t".join([locus, "VCF sequence not repeatmasked"]) + "\n"
                )
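
# Worked example of the TE-proportion bookkeeping above (hypothetical locus):
# a 1000 bp insertion sequence "chr2L_100_200" with two merged RepeatMasker
# intervals of 400 bp and 100 bp accumulates
#
#   ins_te_loci["chr2L_100_200"] = 0.4 + 0.1 = 0.5
#
# i.e. 50% of the insertion is masked as TE; the locus is kept and the
# proportion is appended as an extra column of the filtered VCF table.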