def read_insertions(telocate_out, sample_name, chromosomes, rp_threshold=0):
    """Parse a tab-separated telocate result file into insertion records.

    The first two lines of the file are skipped (presumably headers). For
    every remaining line an ``mccutils.Insertion`` is filled from the
    tab-separated fields (0-based): 0 = chromosome, 1 = start, 2 = length
    added to the start for reference calls, 3 = ``<x>/<te_name>``,
    7 = read-pair support, 12 = orientation, 15 = marker containing
    ``"old"`` for reference calls.

    Args:
        telocate_out: Path of the file to read.
        sample_name: Sample label embedded in each insertion name.
        chromosomes: Container of chromosome names to keep.
        rp_threshold: Minimum read-pair support an insertion must reach.

    Returns:
        List of insertions whose read-pair support is at least
        ``rp_threshold`` and whose chromosome is in ``chromosomes``.
    """
    # Any orientation other than these two is reported on the minus strand.
    strand_for_orientation = {"parallel": "+", "uncertain": "."}

    kept = []
    with open(telocate_out, "r") as handle:
        for line_index, record in enumerate(handle):
            if line_index <= 1:
                continue

            fields = record.split("\t")
            ins = mccutils.Insertion()
            ins.chromosome = fields[0]
            ins.start = int(fields[1])
            te_name = fields[3].split("/")[1]

            if "old" in fields[15]:
                ins.type = "reference"
                ins.end = ins.start + int(fields[2])
                ins.name = te_name + "_reference_" + sample_name + "_telocate_rp_"
            else:
                # Non-reference calls are reported as a single position.
                ins.type = "non-reference"
                ins.end = ins.start
                ins.name = te_name + "_non-reference_" + sample_name + "_telocate_rp_"

            ins.strand = strand_for_orientation.get(fields[12], "-")
            ins.telocate.read_pair_support = int(fields[7])

            supported = ins.telocate.read_pair_support >= rp_threshold
            if supported and ins.chromosome in chromosomes:
                kept.append(ins)

    return kept
def read_insertions(bed, te_to_family, sample_name, te_pos_to_family, chromosomes, reference=False):
    """Read a tab-separated BED-like file into insertion records.

    Lines whose chromosome (field 0) is not in ``chromosomes`` are ignored.
    Fields 1 and 2 give start and end. The remaining fields are interpreted
    according to ``reference``:

    * ``reference=True``: field 3 is the strand and the first
      comma-separated item of field 4 is looked up in ``te_to_family``.
    * ``reference=False``: fields 3, 4 and 5 are joined with ``"_"`` and
      looked up in ``te_pos_to_family``.

    The last field (newline removed) is stored as ``insert.tepid.id``.

    Args:
        bed: Path of the file to read.
        te_to_family: Mapping from TE name to family (reference mode).
        sample_name: Sample label embedded in each insertion name.
        te_pos_to_family: Mapping from ``chrom_start_end`` to family
            (non-reference mode).
        chromosomes: Container of chromosome names to keep.
        reference: Whether the file describes reference insertions.

    Returns:
        List of ``mccutils.Insertion`` objects on the requested chromosomes.

    Raises:
        KeyError: If a TE name or position key is missing from the
            corresponding mapping.
    """
    results = []
    with open(bed, "r") as bed_file:
        for record in bed_file:
            ins = mccutils.Insertion()
            fields = record.split("\t")
            ins.chromosome = fields[0]
            if ins.chromosome not in chromosomes:
                continue

            ins.start = int(fields[1])
            ins.end = int(fields[2])

            if reference:
                first_te = fields[4].split(",")[0]
                ins.family = te_to_family[first_te]
                ins.strand = fields[3]
                ins.type = "reference"
                ins.name = ins.family + "|reference|" + sample_name + "|tepid|nonab|"
            else:
                position_key = fields[3] + "_" + fields[4] + "_" + fields[5]
                ins.family = te_pos_to_family[position_key]
                ins.type = "non-reference"
                ins.name = ins.family + "|non-reference|" + sample_name + "|tepid|"

            ins.tepid.id = fields[-1].replace("\n", "")
            results.append(ins)

    return results
def get_non_absent_ref_tes(te_gff, absence_bed, sample, out, log):
    """Return the reference TEs that do not overlap any absence call.

    Runs ``bedtools subtract -A -a te_gff -b absence_bed`` through
    ``mccutils.run_command_stdout``, writing the result to
    ``<out>/tmp.ref_nonabs.gff``. Every line of that file that contains no
    ``#`` becomes a reference ``mccutils.Insertion``. The temporary file is
    removed afterwards.

    Args:
        te_gff: Path of the reference TE GFF passed to bedtools as ``-a``.
        absence_bed: Path of the absence calls passed to bedtools as ``-b``.
        sample: Sample label embedded in each insertion name.
        out: Directory in which the temporary GFF is written.
        log: Log destination forwarded to ``mccutils.run_command_stdout``.

    Returns:
        List of ``mccutils.Insertion`` objects of type ``"reference"``.
    """
    tmp_gff = out + "/tmp.ref_nonabs.gff"
    mccutils.run_command_stdout(
        ["bedtools", "subtract", "-A", "-a", te_gff, "-b", absence_bed],
        tmp_gff,
        log=log,
    )

    # These TEs have no per-call evidence, so all evidence fields are "!".
    placeholder = "!"
    kept = []
    with open(tmp_gff, "r") as handle:
        for record in handle:
            if "#" in record:
                continue

            # Splitting on ";" as well exposes each attribute as its own field.
            fields = record.replace(";", "\t").split("\t")
            ins = mccutils.Insertion()
            ins.chromosome = fields[0]
            ins.start = int(fields[3])
            ins.end = int(fields[4])
            ins.temp.support = placeholder
            ins.name = fields[9].split("=")[1] + "_reference_" + sample + "_temp_nonab_"
            ins.strand = fields[6]
            ins.temp.classification = placeholder
            ins.temp.junction1Support = placeholder
            ins.temp.junction2Support = placeholder
            ins.temp.junction1 = placeholder
            ins.temp.junction2 = placeholder
            ins.temp.frequency = placeholder
            ins.type = "reference"
            kept.append(ins)

    mccutils.remove(tmp_gff)
    return kept
def get_ref_tes(gff, taxon, chroms):
    """Load reference TE annotations and attach a family to each.

    Args:
        gff: Path of a tab-separated GFF file. Lines containing ``#`` are
            skipped. Field 0 is the chromosome, field 2 the TE identifier,
            fields 3 and 4 start and end, field 6 the strand.
        taxon: Path of a tab-separated file whose first column is a TE
            identifier and whose second column is its family.
        chroms: Container of chromosome names to keep.

    Returns:
        List of ``mccutils.Insertion`` objects of type ``"reference"``
        located on one of ``chroms``.

    Raises:
        KeyError: If a GFF TE identifier is missing from ``taxon``.
    """
    te_family = {}
    with open(taxon, "r") as t:
        for line in t:
            # Drop the line terminator first: when the family is the last
            # column it would otherwise keep a trailing "\n" that leaks into
            # every name built from the family.
            split_line = line.rstrip("\n").split("\t")
            te_family[split_line[0]] = split_line[1]

    ref_inserts = []
    with open(gff, "r") as g:
        for line in g:
            if "#" in line:
                continue

            split_line = line.split("\t")
            insert = mccutils.Insertion()
            insert.type = "reference"
            insert.chromosome = split_line[0]
            insert.start = int(split_line[3])
            insert.end = int(split_line[4])
            insert.strand = split_line[6]
            insert.family = te_family[split_line[2]]
            if insert.chromosome in chroms:
                ref_inserts.append(insert)

    return ref_inserts
def read_insertions(retroseq_vcf, sample_name, chromosomes, support_threshold=0, breakpoint_threshold=6):
    """Parse a retroseq VCF into non-reference insertion records.

    Lines containing ``#`` are skipped. On each remaining line every
    ``:``, ``=`` and ``,`` is turned into a tab before splitting, so the
    values are addressed by their position in that flattened list:
    0 = chromosome, 6 = read support, 9 = TE name, 10 = start, 11 = end,
    20 = breakpoint confidence.

    Args:
        retroseq_vcf: Path of the VCF file to read.
        sample_name: Sample label embedded in each insertion name.
        chromosomes: Container of chromosome names to keep.
        support_threshold: Minimum read support required.
        breakpoint_threshold: Minimum breakpoint confidence required.

    Returns:
        List of ``mccutils.Insertion`` objects passing both thresholds and
        located on one of ``chromosomes``.
    """
    # One translation pass replaces all three separators with tabs.
    separators_to_tabs = str.maketrans(":=,", "\t\t\t")

    kept = []
    with open(retroseq_vcf, "r") as handle:
        for record in handle:
            if "#" in record:
                continue

            ins = mccutils.Insertion()
            fields = record.translate(separators_to_tabs).split("\t")
            ins.chromosome = fields[0]
            ins.start = int(fields[10])
            ins.end = int(fields[11])
            ins.retroseq.read_support = int(fields[6])
            ins.name = fields[9] + "_non-reference_" + sample_name + "_retroseq_rp_"
            ins.retroseq.breakpoint_confidence = int(fields[20])

            if (
                ins.retroseq.read_support >= support_threshold
                and ins.retroseq.breakpoint_confidence >= breakpoint_threshold
                and ins.chromosome in chromosomes
            ):
                kept.append(ins)

    return kept
def get_insertions(gff, sample_name, chromosomes, l_support_threshold=0, r_support_threshold=0, l_junction_threshold=0, r_junction_threshold=0, insert_type="ref"):
    """Parse a relocate2 GFF into insertion records.

    Lines containing ``#`` are skipped. Every ``;`` is turned into a tab
    before splitting, so GFF attributes become separate fields. The four
    evidence counts are read in the order right junction, left junction,
    right support, left support:

    * ``insert_type == "ref"``: fields 11-14, each as ``key:value``. The
      name is the value of field 8.
    * anything else: fields 12-15, each as ``key=value``. The TE name is
      the value of field 9 and the insertion name is rebuilt from it.

    Args:
        gff: Path of the GFF file to read.
        sample_name: Sample label embedded in non-reference names.
        chromosomes: Container of chromosome names to keep.
        l_support_threshold: Minimum left support.
        r_support_threshold: Minimum right support.
        l_junction_threshold: Minimum left junction count.
        r_junction_threshold: Minimum right junction count.
        insert_type: ``"ref"`` for reference calls; any other value is
            treated as non-reference.

    Returns:
        List of ``mccutils.Insertion`` objects passing all thresholds,
        located on one of ``chromosomes``, and whose TE name is not the
        literal ``"repeat_name"``.
    """
    kept = []
    with open(gff, "r") as handle:
        for record in handle:
            if "#" in record:
                continue

            fields = record.replace(";", "\t").split("\t")
            ins = mccutils.Insertion()
            ins.chromosome = fields[0]
            ins.start = int(fields[3])
            ins.end = int(fields[4])
            ins.strand = fields[6]
            ins.name = fields[8].split("=")[1]

            te_name = ""
            if insert_type == "ref":
                ins.type = "reference"
                first_count_field, separator = 11, ":"
            else:
                ins.type = "non-reference"
                te_name = fields[9].split("=")[1]
                ins.name = te_name + "_non-reference_" + sample_name + "_relocate2_sr_"
                first_count_field, separator = 12, "="

            counts = [
                int(fields[first_count_field + offset].split(separator)[1])
                for offset in range(4)
            ]
            ins.relocate2.right_junction = counts[0]
            ins.relocate2.left_junction = counts[1]
            ins.relocate2.right_support = counts[2]
            ins.relocate2.left_support = counts[3]

            if (
                ins.relocate2.right_junction >= r_junction_threshold
                and ins.relocate2.left_junction >= l_junction_threshold
                and ins.relocate2.right_support >= r_support_threshold
                and ins.relocate2.left_support >= l_support_threshold
                and ins.chromosome in chromosomes
                and te_name != "repeat_name"
            ):
                kept.append(ins)

    return kept
def read_insertion_summary(infile, sample):
    """Parse a TEMP insertion summary table into non-reference insertions.

    The first line is skipped. Each later line must split into exactly 14
    tab-separated fields and describe a valid interval (``end >= start``,
    ``end > 0``, ``start > -1`` after the start is shifted down by one).
    Any other line is reported on stdout and skipped.

    When both junctions have support above zero, the junction positions
    replace start and end and the name ends in ``sr_``. Otherwise the name
    ends in ``rp_``.

    Args:
        infile: Path of the summary file to read.
        sample: Sample label embedded in each insertion name.

    Returns:
        List of ``mccutils.Insertion`` objects of type ``"non-reference"``.
    """
    malformed_message = "<TEMP POST> Omitting malformed line from insertion summary results:"

    results = []
    with open(infile, "r") as summary:
        for row_number, row in enumerate(summary):
            if row_number == 0:
                continue

            ins = mccutils.Insertion()
            cols = row.split("\t")
            if len(cols) != 14:
                print(malformed_message, row)
                continue

            ins.chromosome = cols[0]
            ins.start = int(cols[1]) - 1
            ins.end = int(cols[2])
            ins.name = cols[3] + "_non-reference_" + cols[7] + "_" + sample + "_temp_"
            ins.strand = "-" if "antisense" in cols[4] else "+"
            ins.temp.classification = cols[5]
            ins.temp.support = float(cols[6])
            ins.temp.frequency = float(cols[7])
            ins.temp.junction1 = int(cols[8])
            ins.temp.junction1Support = int(cols[9])
            ins.temp.junction2 = int(cols[10])
            ins.temp.junction2Support = int(cols[11])
            ins.temp.fivePrimeSupport = float(cols[12])
            ins.temp.threePrimeSupport = float(cols[13].replace("\n", ""))
            ins.type = "non-reference"

            valid_interval = ins.end >= ins.start and ins.end > 0 and ins.start > -1
            if not valid_interval:
                print(malformed_message, row)
                continue

            if ins.temp.junction1Support > 0 and ins.temp.junction2Support > 0:
                # Split-read evidence on both sides: use the junctions.
                ins.start = ins.temp.junction1
                ins.end = ins.temp.junction2
                ins.name = ins.name + "sr_"
            else:
                # Read-pair evidence only.
                ins.name = ins.name + "rp_"
            results.append(ins)

    return results
def read_insertions(predictions, ref_tes, chroms, sample, both_end_support_needed=True, support_threshold=0.1):
    """Parse popoolationte2 predictions and classify them against reference TEs.

    Each line is split on tabs: field 1 = chromosome, 2 = position,
    3 = strand, 4 = family, 6 = support type, 8 = frequency. A prediction
    is considered only if its chromosome is in ``chroms``, its frequency is
    strictly greater than ``support_threshold`` and, when
    ``both_end_support_needed`` is true, its support type is ``"FR"``.

    A prediction whose position falls inside a reference TE on the same
    chromosome takes that TE's family, coordinates and strand and becomes a
    reference insertion. Each reference TE is reported at most once: the
    first prediction that hits it sets ``popoolationte2.added`` on the TE,
    and later hits on that TE are dropped.

    Args:
        predictions: Path of the tab-separated prediction file.
        ref_tes: Reference TE records exposing ``chromosome``, ``start``,
            ``end``, ``strand``, ``family`` and ``popoolationte2.added``.
            Their ``added`` flag is modified in place.
        chroms: Container of chromosome names to keep.
        sample: Sample label embedded in each insertion name.
        both_end_support_needed: Require support type ``"FR"``.
        support_threshold: Frequency an insertion must exceed.

    Returns:
        List of ``mccutils.Insertion`` objects.
    """
    insertions = []
    with open(predictions, "r") as tsv:
        for line in tsv:
            split_line = line.split("\t")

            # The sibling parsers all filter on their chromosome list; this
            # one previously ignored ``chroms`` entirely.
            chromosome = split_line[1]
            if chromosome not in chroms:
                continue

            insert = mccutils.Insertion()
            insert.chromosome = chromosome
            insert.start = int(split_line[2])
            insert.end = int(split_line[2])
            insert.strand = split_line[3]
            insert.family = split_line[4]
            insert.popoolationte2.support_type = split_line[6]
            insert.popoolationte2.frequency = float(split_line[8])

            end_support_ok = (
                insert.popoolationte2.support_type == "FR"
                or not both_end_support_needed
            )
            if not (end_support_ok and insert.popoolationte2.frequency > support_threshold):
                continue

            # Determine whether the prediction lies inside a reference TE.
            for ref_te in ref_tes:
                # Coordinates are only comparable on the same chromosome;
                # without this check a TE on another chromosome could match.
                if ref_te.chromosome != insert.chromosome:
                    continue
                if ref_te.start <= insert.start <= ref_te.end:
                    insert.family = ref_te.family
                    # Remember whether this TE was already reported before
                    # marking it as reported.
                    insert.popoolationte2.added = ref_te.popoolationte2.added
                    if not ref_te.popoolationte2.added:
                        ref_te.popoolationte2.added = True
                    insert.type = "reference"
                    insert.start = ref_te.start
                    insert.end = ref_te.end
                    insert.strand = ref_te.strand

            if insert.type == "reference":
                insert.name = (
                    insert.family + "_reference_"
                    + str(insert.popoolationte2.frequency)
                    + "_" + sample + "_popoolationte2_rp_"
                )
            else:
                insert.type = "non-reference"
                insert.name = (
                    insert.family + "_non-reference_"
                    + str(insert.popoolationte2.frequency)
                    + "_" + sample + "_popoolationte2_rp_"
                )

            if not insert.popoolationte2.added:
                insertions.append(insert)

    return insertions
def get_insertions(gff, sample_name, chromosomes, ref_l_threshold=0, ref_r_threshold=0, nonref_l_threshold=0, nonref_r_threshold=0):
    """Parse a relocate GFF into reference and non-reference insertions.

    Lines containing ``#`` are skipped. The ``;``-separated attributes of
    field 8 supply the ID, the TE name, the call type (a ``Note=``
    attribute containing ``Shared`` means reference, ``Non-reference``
    means non-reference, anything else ``missing``) and the left and right
    flanking read counts.

    Reference calls are named after ``TE_Name``. Non-reference calls are
    named after the part of ``ID`` before the first ``"."``. Calls of any
    other type are never returned.

    Args:
        gff: Path of the GFF file to read.
        sample_name: Sample label embedded in each insertion name.
        chromosomes: Container of chromosome names to keep.
        ref_l_threshold: Minimum left support for reference calls.
        ref_r_threshold: Minimum right support for reference calls.
        nonref_l_threshold: Minimum left support for non-reference calls.
        nonref_r_threshold: Minimum right support for non-reference calls.

    Returns:
        List of ``mccutils.Insertion`` objects passing the thresholds for
        their type and located on one of ``chromosomes``.
    """
    kept = []
    with open(gff, "r") as handle:
        for record in handle:
            if "#" in record:
                continue

            fields = record.split("\t")
            attributes = fields[8].split(";")
            ins = mccutils.Insertion()
            ins.chromosome = fields[0]
            ins.start = int(fields[3])
            ins.end = int(fields[4])
            ins.strand = fields[6]

            feature_id = ""
            te_name = ""
            for attribute in attributes:
                if "ID=" in attribute:
                    feature_id = attribute.split("=")[1]
                elif "TE_Name=" in attribute:
                    te_name = attribute.split("=")[1]
                elif "Note=" in attribute:
                    if "Shared" in attribute:
                        ins.type = "reference"
                    elif "Non-reference" in attribute:
                        ins.type = "non-reference"
                    else:
                        ins.type = "missing"
                elif "left_flanking_read_count=" in attribute:
                    ins.relocate.left_support = int(attribute.split("=")[1])
                elif "right_flanking_read_count=" in attribute:
                    ins.relocate.right_support = int(attribute.split("=")[1])

            # Pick the name and thresholds that belong to the call type.
            if ins.type == "reference":
                ins.name = te_name + "_reference_" + sample_name + "_relocate_sr_"
                min_left, min_right = ref_l_threshold, ref_r_threshold
            elif ins.type == "non-reference":
                te_name = feature_id.split(".")[0]
                ins.name = te_name + "_non-reference_" + sample_name + "_relocate_sr_"
                min_left, min_right = nonref_l_threshold, nonref_r_threshold
            else:
                continue

            if (
                ins.relocate.left_support >= min_left
                and ins.relocate.right_support >= min_right
                and ins.chromosome in chromosomes
            ):
                kept.append(ins)

    return kept
def read_insertions(popoolationte, sample_name, chromosomes, require_both_end_support=True, percent_read_support_threshold=0.1):
    """Parse a popoolationte result table into insertion records.

    Each tab-separated line is read by field position. Start and end come
    from fields 9 and 15, except that field 1 replaces the start when
    field 8 contains ``-`` and replaces the end when only field 15
    contains ``-``. A ``-`` in field 6 marks the call as non-reference in
    its name. Field 2 is the support type: forward counts are read from
    fields 12 and 10 when it contains ``F``, reverse counts from fields 19
    and 17 when it contains ``R``.

    Args:
        popoolationte: Path of the tab-separated file to read.
        sample_name: Sample label embedded in each insertion name.
        chromosomes: Container of chromosome names to keep.
        require_both_end_support: Keep only calls whose support type
            contains ``"FR"``.
        percent_read_support_threshold: Value that the forward or the
            reverse read support fraction must reach.

    Returns:
        List of ``mccutils.Insertion`` objects located on one of
        ``chromosomes`` and passing the support filters.
    """
    kept = []
    with open(popoolationte, "r") as table:
        for row in table:
            ins = mccutils.Insertion()
            cols = row.split("\t")
            ins.chromosome = cols[0]

            # Choose which columns hold the interval boundaries.
            if "-" in cols[8]:
                start_col, end_col = 1, 15
            elif "-" in cols[15]:
                start_col, end_col = 9, 1
            else:
                start_col, end_col = 9, 15
            ins.start = int(float(cols[start_col]))
            ins.end = int(float(cols[end_col]))

            status = "_non-reference_" if "-" in cols[6] else "_reference_"
            ins.name = cols[3] + status + cols[4] + "_" + sample_name + "_popoolationte_rp_"

            ins.popoolationte.support_type = cols[2]
            if "F" in ins.popoolationte.support_type:
                ins.popoolationte.f_read_support = int(cols[12])
                ins.popoolationte.f_read_support_percent = float(cols[10])
            if "R" in ins.popoolationte.support_type:
                ins.popoolationte.r_read_support = int(cols[19])
                ins.popoolationte.r_read_support_percent = float(cols[17])

            if require_both_end_support and "FR" not in ins.popoolationte.support_type:
                continue

            enough_support = (
                ins.popoolationte.f_read_support_percent >= percent_read_support_threshold
                or ins.popoolationte.r_read_support_percent >= percent_read_support_threshold
            )
            if enough_support and ins.chromosome in chromosomes:
                kept.append(ins)

    return kept
def read_insertions(bed, chromosomes, sample_name, out_dir, min_read_cutoff=0):
    """Parse an ngs_te_mapper BED file into insertion records.

    Every ``;`` is turned into a tab before splitting. Fields used
    (0-based): 0 = chromosome, 1 = start (stored plus one), 2 = end,
    4 = strand, 5 = family, 7 = read support, 8 = insertion type.

    Args:
        bed: Path of the BED file to read.
        chromosomes: Container of chromosome names to keep.
        sample_name: Sample label embedded in each insertion name.
        out_dir: Not used by this function.
        min_read_cutoff: Read support an insertion must strictly exceed.

    Returns:
        List of ``mccutils.Insertion`` objects with support greater than
        ``min_read_cutoff`` located on one of ``chromosomes``.
    """
    kept = []
    with open(bed, "r") as handle:
        for record in handle:
            ins = mccutils.Insertion()
            fields = record.replace(";", "\t").split("\t")

            ins.chromosome = fields[0]
            ins.start = int(fields[1]) + 1
            ins.end = int(fields[2])
            ins.type = fields[8].replace("\n", "")
            ins.strand = fields[4]
            ins.family = fields[5]
            ins.name = ins.family + "_" + ins.type + "_" + sample_name + "_ngs_te_mapper_sr_"
            ins.ngs_te_mapper.support = int(fields[7])

            if ins.ngs_te_mapper.support <= min_read_cutoff:
                continue
            if ins.chromosome in chromosomes:
                kept.append(ins)

    return kept
def get_non_absent_ref_tes(deletions, te_gff, te_to_family, sample_name):
    """Return the reference TEs that were not called as deleted.

    Every annotation line of ``te_gff`` becomes a reference
    ``mccutils.Insertion``. A TE is dropped when a deletion has the same
    chromosome, start, end, strand and family.

    Args:
        deletions: Iterable of records exposing ``chromosome``, ``start``,
            ``end``, ``strand`` and ``family``.
        te_gff: Path of the tab-separated reference TE GFF. Lines starting
            with ``#`` and blank lines are ignored.
        te_to_family: Mapping from the GFF ``ID`` attribute to TE family.
        sample_name: Sample label embedded in each insertion name.

    Returns:
        List of reference ``mccutils.Insertion`` objects with no matching
        deletion, in file order.

    Raises:
        KeyError: If a TE ID is missing from ``te_to_family``.
    """

    def te_key(record):
        # Identity used to match a reference TE against a deletion call.
        return "_".join([
            record.chromosome,
            str(record.start),
            str(record.end),
            record.strand,
            record.family,
        ])

    ref_tes = []
    with open(te_gff, "r") as gff:
        for line in gff:
            # Header/comment and blank lines carry no annotation and would
            # otherwise fail on the column lookups below.
            if line.startswith("#") or not line.strip():
                continue

            ref_te = mccutils.Insertion()
            split_line = line.split("\t")
            ref_te.chromosome = split_line[0]
            ref_te.start = int(split_line[3])
            ref_te.end = int(split_line[4])
            ref_te.strand = split_line[6]

            te_id = ""
            for feat in split_line[8].split(";"):
                if "ID=" in feat:
                    te_id = feat.split("=")[1]

            ref_te.family = te_to_family[te_id]
            ref_te.type = "reference"
            ref_te.name = ref_te.family + "|reference|" + sample_name + "|tepid|nonab|"
            ref_tes.append(ref_te)

    # A set makes each membership test O(1) instead of scanning a list.
    absent = {te_key(deletion) for deletion in deletions}
    return [te for te in ref_tes if te_key(te) not in absent]
def read_insertions(jitterbug_gff, taxonomy, chroms, sample_name, min_fwd_read_support=0, min_rev_read_support=0, min_sr_support=0, min_zygosity=0.0):
    """Parse a jitterbug GFF into non-reference insertion records.

    Only lines with exactly nine tab-separated fields whose chromosome is
    in ``chroms`` are used. Spaces are removed from the attribute field
    before it is split on ``;``. When the ``softclipped_pos`` attribute
    holds two non-negative positions (after the first is shifted down by
    one) they replace the GFF interval and the call is named as a
    split-read (``sr|``) call. Otherwise it is named as a read-pair
    (``rp|``) call.

    Args:
        jitterbug_gff: Path of the GFF file to read.
        taxonomy: Path of a tab-separated file mapping TE name (column 0)
            to family (column 1).
        chroms: Container of chromosome names to keep.
        sample_name: Sample label embedded in each insertion name.
        min_fwd_read_support: Minimum supporting forward reads.
        min_rev_read_support: Minimum supporting reverse reads.
        min_sr_support: Minimum split-read (softclipped) support.
        min_zygosity: Minimum zygosity.

    Returns:
        List of ``mccutils.Insertion`` objects passing all four thresholds.

    Raises:
        KeyError: If a predicted TE is missing from ``taxonomy``.
    """
    te_family = {}
    with open(taxonomy, "r") as tsv:
        for line in tsv:
            split_line = line.replace("\n", "").split("\t")
            te_family[split_line[0]] = split_line[1]

    insertions = []
    with open(jitterbug_gff, "r") as gff:
        for line in gff:
            split_line = line.replace("\n", "").split("\t")
            if len(split_line) != 9:
                continue
            if split_line[0] not in chroms:
                continue

            insert = mccutils.Insertion()
            insert.chromosome = split_line[0]
            insert.start = int(split_line[3])
            insert.end = int(split_line[4])
            insert.type = "non-reference"

            sr = False
            family = "NONE"
            for feat in split_line[8].replace(" ", "").split(";"):
                if "softclipped_pos" in feat:
                    pos = feat.split("=")[1].replace("(", "").replace(")", "").split(",")
                    start = int(pos[0]) - 1
                    end = int(pos[1])
                    # Only use the positions when both are non-negative.
                    if start > -1 and end > -1:
                        insert.start = start
                        insert.end = end
                        sr = True

                if "predicted_superfam" in feat:
                    family = te_family[feat.split("=")[1]]

                if "supporting_fwd_reads" in feat:
                    insert.jitterbug.supporting_fwd_reads = int(feat.split("=")[1])

                if "supporting_rev_reads" in feat:
                    insert.jitterbug.supporting_rev_reads = int(feat.split("=")[1])

                if "softclipped_support" in feat:
                    insert.jitterbug.split_read_support = int(feat.split("=")[1])

                if "zygosity" in feat:
                    insert.jitterbug.zygosity = float(feat.split("=")[1])

            insert.name = family + "|non-reference|" + sample_name + "|jitterbug|"
            # Append the evidence tag in both cases; the read-pair branch
            # used to assign "rp|" and discard the rest of the name.
            if sr:
                insert.name += "sr|"
            else:
                insert.name += "rp|"

            if (
                insert.jitterbug.supporting_fwd_reads >= min_fwd_read_support
                and insert.jitterbug.supporting_rev_reads >= min_rev_read_support
                and insert.jitterbug.split_read_support >= min_sr_support
                and insert.jitterbug.zygosity >= min_zygosity
            ):
                insertions.append(insert)

    return insertions