Exemplo n.º 1
0
def read_insertions(telocate_out, sample_name, chromosomes, rp_threshold=0):
    """Collect insertion records from a tab-delimited TE-locate result file.

    The first two lines of the file are skipped. For each remaining line an
    ``mccutils.Insertion`` is filled in from the tab-separated columns and
    kept only if its read-pair support is at least ``rp_threshold`` and its
    chromosome is one of ``chromosomes``.

    Args:
        telocate_out: path of the result file to read.
        sample_name: sample label embedded in every insertion name.
        chromosomes: container of chromosome names that may be reported.
        rp_threshold: minimum read-pair support (column 8) to keep a record.

    Returns:
        List of ``mccutils.Insertion`` objects that passed both filters.
    """
    # Column 13 -> strand symbol; any other value is treated as "-".
    strand_symbols = {"parallel": "+", "uncertain": "."}
    kept = []
    with open(telocate_out, "r") as handle:
        for line_number, row in enumerate(handle):
            if line_number <= 1:
                continue

            record = mccutils.Insertion()
            fields = row.split("\t")
            record.chromosome = fields[0]
            record.start = int(fields[1])

            # Column 4 is "<prefix>/<te name>"; only the second part is used.
            family = fields[3].split("/")[1]
            if "old" in fields[15]:
                record.type = "reference"
                # The end is the start plus the value in column 3.
                record.end = record.start + int(fields[2])
                label = "_reference_"
            else:
                record.type = "non-reference"
                record.end = record.start
                label = "_non-reference_"
            record.name = family + label + sample_name + "_telocate_rp_"

            record.strand = strand_symbols.get(fields[12], "-")

            record.telocate.read_pair_support = int(fields[7])
            enough_support = record.telocate.read_pair_support >= rp_threshold
            if enough_support and record.chromosome in chromosomes:
                kept.append(record)

    return kept
Exemplo n.º 2
0
def read_insertions(bed,
                    te_to_family,
                    sample_name,
                    te_pos_to_family,
                    chromosomes,
                    reference=False):
    """Read a tab-delimited BED file into ``mccutils.Insertion`` records.

    Lines whose first column is not in ``chromosomes`` are ignored.

    Args:
        bed: path of the BED file.
        te_to_family: mapping from TE name to family; used when
            ``reference`` is true.
        sample_name: sample label embedded in every insertion name.
        te_pos_to_family: mapping from ``"<chrom>_<start>_<end>"`` (columns
            4-6) to family; used when ``reference`` is false.
        chromosomes: container of chromosome names to keep.
        reference: selects the reference layout (strand in column 4, comma
            separated TE names in column 5) instead of the non-reference one.

    Returns:
        List of ``mccutils.Insertion`` objects, in file order.
    """
    records = []
    with open(bed, "r") as handle:
        for row in handle:
            record = mccutils.Insertion()
            fields = row.split("\t")
            record.chromosome = fields[0]
            if record.chromosome not in chromosomes:
                continue

            record.start = int(fields[1])
            record.end = int(fields[2])

            if not reference:
                # The family is looked up by the TE coordinates in columns 4-6.
                position_key = "_".join((fields[3], fields[4], fields[5]))
                record.family = te_pos_to_family[position_key]
                record.type = "non-reference"
                record.name = (record.family + "|non-reference|" +
                               sample_name + "|tepid|")
            else:
                # Only the first of the comma separated TE names is used.
                first_te = fields[4].split(",")[0]
                record.family = te_to_family[first_te]
                record.strand = fields[3]
                record.type = "reference"
                record.name = (record.family + "|reference|" + sample_name +
                               "|tepid|nonab|")

            # The last column, minus its newline, is stored as the id.
            record.tepid.id = fields[-1].replace("\n", "")
            records.append(record)

    return records
Exemplo n.º 3
0
def get_non_absent_ref_tes(te_gff, absence_bed, sample, out, log):
    """Return reference TEs from ``te_gff`` that survive a bedtools subtract.

    ``bedtools subtract -A -a te_gff -b absence_bed`` is run through
    ``mccutils.run_command_stdout`` into a temporary GFF under ``out``; every
    line of that file that contains no ``#`` becomes one record. The
    temporary file is removed before returning.

    Args:
        te_gff: path of the reference TE GFF.
        absence_bed: path of the BED file passed to bedtools as ``-b``.
        sample: sample label embedded in every insertion name.
        out: directory in which the temporary GFF is written.
        log: forwarded to ``mccutils.run_command_stdout``.

    Returns:
        List of ``mccutils.Insertion`` objects of type ``"reference"`` whose
        ``temp`` fields all hold the placeholder ``"!"``.
    """
    placeholder = "!"
    placeholder_fields = (
        "support",
        "classification",
        "junction1Support",
        "junction2Support",
        "junction1",
        "junction2",
        "frequency",
    )

    scratch_gff = out + "/tmp.ref_nonabs.gff"
    mccutils.run_command_stdout(
        ["bedtools", "subtract", "-A", "-a", te_gff, "-b", absence_bed],
        scratch_gff,
        log=log)

    survivors = []
    with open(scratch_gff, "r") as handle:
        for row in handle:
            if "#" in row:
                continue

            # Attributes are split too, so they become extra columns.
            fields = row.replace(";", "\t").split("\t")
            record = mccutils.Insertion()
            record.chromosome = fields[0]
            record.start = int(fields[3])
            record.end = int(fields[4])
            record.name = (fields[9].split("=")[1] + "_reference_" + sample +
                           "_temp_nonab_")
            record.strand = fields[6]
            record.type = "reference"
            for field_name in placeholder_fields:
                setattr(record.temp, field_name, placeholder)

            survivors.append(record)

    mccutils.remove(scratch_gff)

    return survivors
Exemplo n.º 4
0
def get_ref_tes(gff, taxon, chroms):
    """Build reference TE insertions from a GFF and a TE-to-family table.

    Args:
        gff: path of a tab-delimited GFF file; lines containing ``#`` and
            blank lines are skipped.
        taxon: path of a tab-delimited file whose first column is a TE id
            and whose second column is its family.
        chroms: container of chromosome names to keep.

    Returns:
        List of ``mccutils.Insertion`` objects with ``type == "reference"``
        whose chromosome is in ``chroms``.

    Raises:
        KeyError: if the third GFF column of a data line has no entry in
            ``taxon``.
    """
    te_family = {}
    with open(taxon, "r") as t:
        for line in t:
            # Strip the line ending first; otherwise the family taken from
            # the last column would carry a trailing newline.
            line = line.rstrip("\r\n")
            if not line:
                continue
            split_line = line.split("\t")
            te_family[split_line[0]] = split_line[1]

    ref_inserts = []
    with open(gff, "r") as g:
        for line in g:
            if "#" in line or not line.strip():
                continue

            split_line = line.split("\t")
            insert = mccutils.Insertion()
            insert.type = "reference"
            insert.chromosome = split_line[0]
            insert.start = int(split_line[3])
            insert.end = int(split_line[4])
            insert.strand = split_line[6]
            # The third GFF column is used as the key into the family table.
            insert.family = te_family[split_line[2]]
            if insert.chromosome in chroms:
                ref_inserts.append(insert)

    return ref_inserts
Exemplo n.º 5
0
def read_insertions(retroseq_vcf,
                    sample_name,
                    chromosomes,
                    support_threshold=0,
                    breakpoint_threshold=6):
    """Read non-reference insertion calls from a RetroSeq VCF.

    Lines containing ``#`` are skipped. On every other line the characters
    ``:``, ``=`` and ``,`` are turned into tabs before splitting, so the
    sub-fields become ordinary columns addressed by position.

    Args:
        retroseq_vcf: path of the VCF file.
        sample_name: sample label embedded in every insertion name.
        chromosomes: container of chromosome names to keep.
        support_threshold: minimum read support required.
        breakpoint_threshold: minimum breakpoint confidence required.

    Returns:
        List of ``mccutils.Insertion`` objects meeting both thresholds and
        located on one of ``chromosomes``.
    """
    # One pass replaces all three sub-field delimiters with tabs.
    to_tabs = str.maketrans({":": "\t", "=": "\t", ",": "\t"})
    accepted = []

    with open(retroseq_vcf, "r") as handle:
        for row in handle:
            if "#" in row:
                continue

            record = mccutils.Insertion()
            fields = row.translate(to_tabs).split("\t")
            record.chromosome = fields[0]
            record.start = int(fields[10])
            record.end = int(fields[11])
            record.retroseq.read_support = int(fields[6])
            record.name = (fields[9] + "_non-reference_" + sample_name +
                           "_retroseq_rp_")
            record.retroseq.breakpoint_confidence = int(fields[20])

            supported = record.retroseq.read_support >= support_threshold
            confident = (record.retroseq.breakpoint_confidence >=
                         breakpoint_threshold)
            if supported and confident and record.chromosome in chromosomes:
                accepted.append(record)

    return accepted
Exemplo n.º 6
0
def get_insertions(gff,
                   sample_name,
                   chromosomes,
                   l_support_threshold=0,
                   r_support_threshold=0,
                   l_junction_threshold=0,
                   r_junction_threshold=0,
                   insert_type="ref"):
    """Read RelocaTE2 GFF records and keep those passing every threshold.

    Lines containing ``#`` are skipped. ``;`` is turned into a tab before
    splitting, so GFF attributes become extra columns. With
    ``insert_type == "ref"`` the four counts are read from columns 12-15 as
    ``key:value``; otherwise from columns 13-16 as ``key=value``. In both
    layouts the order is right junction, left junction, right support, left
    support.

    Args:
        gff: path of the GFF file.
        sample_name: sample label embedded in non-reference insertion names.
        chromosomes: container of chromosome names to keep.
        l_support_threshold: minimum left support.
        r_support_threshold: minimum right support.
        l_junction_threshold: minimum left junction count.
        r_junction_threshold: minimum right junction count.
        insert_type: ``"ref"`` for the reference layout; any other value
            selects the non-reference layout.

    Returns:
        List of ``mccutils.Insertion`` objects that passed all filters.
        Non-reference records whose TE name is ``"repeat_name"`` are dropped.
    """
    reference_layout = insert_type == "ref"
    accepted = []
    with open(gff, "r") as handle:
        for row in handle:
            if "#" in row:
                continue

            fields = row.replace(";", "\t").split("\t")
            record = mccutils.Insertion()
            record.chromosome = fields[0]
            record.start = int(fields[3])
            record.end = int(fields[4])
            record.strand = fields[6]
            record.name = fields[8].split("=")[1]

            te_name = ""
            if reference_layout:
                record.type = "reference"
                separator, first_column = ":", 11
            else:
                record.type = "non-reference"
                te_name = fields[9].split("=")[1]
                record.name = (te_name + "_non-reference_" + sample_name +
                               "_relocate2_sr_")
                separator, first_column = "=", 12

            details = record.relocate2
            details.right_junction = int(
                fields[first_column].split(separator)[1])
            details.left_junction = int(
                fields[first_column + 1].split(separator)[1])
            details.right_support = int(
                fields[first_column + 2].split(separator)[1])
            details.left_support = int(
                fields[first_column + 3].split(separator)[1])

            junctions_ok = (details.right_junction >= r_junction_threshold
                            and details.left_junction >= l_junction_threshold)
            support_ok = (details.right_support >= r_support_threshold
                          and details.left_support >= l_support_threshold)
            if (junctions_ok and support_ok
                    and record.chromosome in chromosomes
                    and te_name != "repeat_name"):
                accepted.append(record)

    return accepted
Exemplo n.º 7
0
def read_insertion_summary(infile, sample):
    """Read a TEMP insertion summary table into ``mccutils.Insertion`` records.

    The first line is skipped. A line is reported on stdout and dropped when
    it does not have exactly 14 tab-separated columns, or when its
    coordinates fail ``end >= start and end > 0 and start > -1`` (start being
    column 2 minus one).

    Args:
        infile: path of the summary file.
        sample: sample label embedded in every insertion name.

    Returns:
        List of non-reference ``mccutils.Insertion`` objects. Records with
        support at both junctions use the junction positions as start/end
        and get a name ending in ``sr_``; all others end in ``rp_``.
    """
    malformed_message = (
        "<TEMP POST> Omitting malformed line from insertion summary results:")
    records = []
    with open(infile, "r") as handle:
        for line_number, row in enumerate(handle):
            if line_number == 0:
                continue

            record = mccutils.Insertion()
            fields = row.split("\t")
            if len(fields) != 14:
                print(malformed_message, row)
                continue

            record.chromosome = fields[0]
            record.start = int(fields[1]) - 1
            record.end = int(fields[2])
            # Column 8 (the frequency text) is embedded in the name as-is.
            record.name = (fields[3] + "_non-reference_" + fields[7] + "_" +
                           sample + "_temp_")
            record.strand = "-" if "antisense" in fields[4] else "+"

            details = record.temp
            details.classification = fields[5]
            details.support = float(fields[6])
            details.frequency = float(fields[7])
            details.junction1 = int(fields[8])
            details.junction1Support = int(fields[9])
            details.junction2 = int(fields[10])
            details.junction2Support = int(fields[11])
            details.fivePrimeSupport = float(fields[12])
            details.threePrimeSupport = float(fields[13].replace("\n", ""))
            record.type = "non-reference"

            coordinates_valid = (record.end >= record.start
                                 and record.end > 0 and record.start > -1)
            if not coordinates_valid:
                print(malformed_message, row)
                continue

            if details.junction1Support > 0 and details.junction2Support > 0:
                # Split-read evidence: the junctions replace start and end.
                record.start = details.junction1
                record.end = details.junction2
                record.name = record.name + "sr_"
            else:
                # Read-pair evidence only.
                record.name = record.name + "rp_"

            records.append(record)

    return records
Exemplo n.º 8
0
def read_insertions(predictions,
                    ref_tes,
                    chroms,
                    sample,
                    both_end_support_needed=True,
                    support_threshold=0.1):
    """Read PoPoolationTE2 predictions and classify them against reference TEs.

    A prediction is considered only when its chromosome is in ``chroms``, its
    frequency is strictly greater than ``support_threshold`` and, if
    ``both_end_support_needed`` is true, its support type is ``"FR"``. A
    prediction whose position falls inside a reference TE on the same
    chromosome takes that TE's family, coordinates and strand and becomes a
    ``"reference"`` record; every other prediction is ``"non-reference"``.

    Each reference TE is reported at most once: the first prediction that
    hits it sets ``popoolationte2.added`` on the TE, and later predictions
    hitting the same TE are dropped.

    Args:
        predictions: path of the tab-delimited prediction file.
        ref_tes: list of reference TE records exposing ``chromosome``,
            ``start``, ``end``, ``strand``, ``family`` and
            ``popoolationte2.added``. Entries are mutated (``added`` is set).
        chroms: container of chromosome names to keep; ``None`` disables the
            chromosome filter.
        sample: sample label embedded in every insertion name.
        both_end_support_needed: require support type ``"FR"``.
        support_threshold: frequency that must be exceeded.

    Returns:
        List of ``mccutils.Insertion`` objects.
    """
    insertions = []

    with open(predictions, "r") as tsv:
        for line in tsv:
            if not line.strip():
                continue

            split_line = line.split("\t")
            chromosome = split_line[1]
            # Honour the chromosome whitelist like the other result parsers
            # do; the parameter used to be accepted but never consulted.
            if chroms is not None and chromosome not in chroms:
                continue

            insert = mccutils.Insertion()
            insert.chromosome = chromosome
            insert.start = int(split_line[2])
            insert.end = int(split_line[2])
            insert.strand = split_line[3]
            insert.family = split_line[4]
            insert.popoolationte2.support_type = split_line[6]
            insert.popoolationte2.frequency = float(split_line[8])

            ends_ok = (insert.popoolationte2.support_type == "FR"
                       or not both_end_support_needed)
            if not (ends_ok
                    and insert.popoolationte2.frequency > support_threshold):
                continue

            # Determine whether the prediction lies inside a reference TE.
            # Coordinates are only comparable on the same chromosome.
            for ref_te in ref_tes:
                if (ref_te.chromosome == insert.chromosome
                        and ref_te.start <= insert.start <= ref_te.end):
                    insert.family = ref_te.family
                    # Inherit the flag *before* marking the TE, so only the
                    # first prediction for this TE is reported.
                    insert.popoolationte2.added = ref_te.popoolationte2.added
                    if not ref_te.popoolationte2.added:
                        ref_te.popoolationte2.added = True

                    insert.type = "reference"
                    insert.start = ref_te.start
                    insert.end = ref_te.end
                    insert.strand = ref_te.strand

            if insert.type == "reference":
                insert.name = insert.family + "_reference_" + str(
                    insert.popoolationte2.frequency
                ) + "_" + sample + "_popoolationte2_rp_"
            else:
                insert.type = "non-reference"
                insert.name = insert.family + "_non-reference_" + str(
                    insert.popoolationte2.frequency
                ) + "_" + sample + "_popoolationte2_rp_"

            if not insert.popoolationte2.added:
                insertions.append(insert)

    return insertions
Exemplo n.º 9
0
def get_insertions(gff, sample_name, chromosomes, ref_l_threshold=0, ref_r_threshold=0, nonref_l_threshold=0, nonref_r_threshold=0):
    """Read RelocaTE GFF records and keep those passing the support filters.

    Lines containing ``#`` and blank lines are skipped. The ninth column is
    split on ``;`` and scanned for ``ID=``, ``TE_Name=``, ``Note=``,
    ``left_flanking_read_count=`` and ``right_flanking_read_count=``. A
    ``Note`` containing ``Shared`` marks a reference insertion, one
    containing ``Non-reference`` a non-reference insertion, and any other
    ``Note`` sets the type to ``"missing"``.

    Args:
        gff: path of the GFF file.
        sample_name: sample label embedded in every insertion name.
        chromosomes: container of chromosome names to keep.
        ref_l_threshold: minimum left support for reference insertions.
        ref_r_threshold: minimum right support for reference insertions.
        nonref_l_threshold: minimum left support for non-reference insertions.
        nonref_r_threshold: minimum right support for non-reference insertions.

    Returns:
        List of ``mccutils.Insertion`` objects; records that are neither
        reference nor non-reference are never returned.
    """
    insertions = []
    with open(gff, "r") as ingff:
        for line in ingff:
            # Everything below, including the filter, applies to data lines
            # only. Running the filter on comment lines used to fail on a
            # leading comment and to re-append the previous record on later
            # ones.
            if "#" in line or not line.strip():
                continue

            # Drop the line ending so the last attribute value stays clean.
            split_line = line.rstrip("\r\n").split("\t")
            feats = split_line[8].split(";")
            insert = mccutils.Insertion()
            insert.chromosome = split_line[0]
            insert.start = int(split_line[3])
            insert.end = int(split_line[4])
            insert.strand = split_line[6]

            feat_id = ""
            feat_te_name = ""
            for feat in feats:
                if "ID=" in feat:
                    feat_id = feat.split("=")[1]
                elif "TE_Name=" in feat:
                    feat_te_name = feat.split("=")[1]
                elif "Note=" in feat:
                    if "Shared" in feat:
                        insert.type = "reference"
                    elif "Non-reference" in feat:
                        insert.type = "non-reference"
                    else:
                        insert.type = "missing"
                elif "left_flanking_read_count=" in feat:
                    insert.relocate.left_support = int(feat.split("=")[1])
                elif "right_flanking_read_count=" in feat:
                    insert.relocate.right_support = int(feat.split("=")[1])

            if insert.type == "reference":
                insert.name = feat_te_name+"_reference_"+sample_name+"_relocate_sr_"
                left_min, right_min = ref_l_threshold, ref_r_threshold
            elif insert.type == "non-reference":
                # Non-reference names use the part of the ID before the
                # first "." rather than TE_Name.
                feat_te_name = feat_id.split(".")[0]
                insert.name = feat_te_name+"_non-reference_"+sample_name+"_relocate_sr_"
                left_min, right_min = nonref_l_threshold, nonref_r_threshold
            else:
                continue

            if (insert.relocate.left_support >= left_min
                    and insert.relocate.right_support >= right_min
                    and insert.chromosome in chromosomes):
                insertions.append(insert)

    return insertions
Exemplo n.º 10
0
def read_insertions(popoolationte, sample_name, chromosomes, require_both_end_support=True, percent_read_support_threshold=0.1):
    """Read a tab-delimited PoPoolationTE result table.

    Start and end normally come from columns 10 and 16. If column 9 contains
    ``-`` the start is taken from column 2 instead; otherwise, if column 16
    contains ``-``, the end is taken from column 2. A ``-`` in column 7
    marks the record as non-reference in its name.

    Args:
        popoolationte: path of the result file.
        sample_name: sample label embedded in every insertion name.
        chromosomes: container of chromosome names to keep.
        require_both_end_support: when true, only records whose support type
            contains ``"FR"`` are kept.
        percent_read_support_threshold: a record is kept when its forward or
            reverse read support percentage is at least this value.

    Returns:
        List of ``mccutils.Insertion`` objects that passed the filters.
    """
    accepted = []

    with open(popoolationte, "r") as handle:
        for row in handle:
            record = mccutils.Insertion()
            fields = row.split("\t")
            record.chromosome = fields[0]

            # Pick the columns that hold the start and end coordinates.
            if "-" in fields[8]:
                start_column, end_column = 1, 15
            elif "-" in fields[15]:
                start_column, end_column = 9, 1
            else:
                start_column, end_column = 9, 15
            record.start = int(float(fields[start_column]))
            record.end = int(float(fields[end_column]))

            label = "_non-reference_" if "-" in fields[6] else "_reference_"
            record.name = (fields[3] + label + fields[4] + "_" + sample_name +
                           "_popoolationte_rp_")

            details = record.popoolationte
            details.support_type = fields[2]

            if "F" in details.support_type:
                details.f_read_support = int(fields[12])
                details.f_read_support_percent = float(fields[10])

            if "R" in details.support_type:
                details.r_read_support = int(fields[19])
                details.r_read_support_percent = float(fields[17])

            if require_both_end_support and "FR" not in details.support_type:
                continue

            percent_ok = (
                details.f_read_support_percent >= percent_read_support_threshold
                or details.r_read_support_percent >= percent_read_support_threshold)
            if percent_ok and record.chromosome in chromosomes:
                accepted.append(record)

    return accepted
Exemplo n.º 11
0
def read_insertions(bed, chromosomes, sample_name, out_dir, min_read_cutoff=0):
    """Read ngs_te_mapper BED output into ``mccutils.Insertion`` records.

    ``;`` is turned into a tab before splitting, so the ``;``-separated
    sub-fields become ordinary columns. The start is column 2 plus one.

    Args:
        bed: path of the BED file.
        chromosomes: container of chromosome names to keep.
        sample_name: sample label embedded in every insertion name.
        out_dir: not used by this function.
        min_read_cutoff: a record is kept only when its support is strictly
            greater than this value.

    Returns:
        List of ``mccutils.Insertion`` objects that passed the filters.
    """
    accepted = []
    with open(bed, "r") as handle:
        for row in handle:
            record = mccutils.Insertion()
            fields = row.replace(";", "\t").split("\t")
            record.chromosome = fields[0]
            record.start = int(fields[1]) + 1
            record.end = int(fields[2])
            # Column 9 is the insertion type, minus its newline.
            record.type = fields[8].replace("\n", "")
            record.strand = fields[4]
            record.family = fields[5]
            record.name = "_".join((record.family, record.type, sample_name)) + "_ngs_te_mapper_sr_"
            record.ngs_te_mapper.support = int(fields[7])

            enough_reads = record.ngs_te_mapper.support > min_read_cutoff
            if enough_reads and record.chromosome in chromosomes:
                accepted.append(record)

    return accepted
Exemplo n.º 12
0
def get_non_absent_ref_tes(deletions, te_gff, te_to_family, sample_name):
    """Return the reference TEs in ``te_gff`` that are not listed as deleted.

    Each GFF data line becomes a ``"reference"`` record whose family is
    looked up from the ``ID=`` attribute. A record is dropped when a deletion
    has the same chromosome, start, end, strand and family.

    Args:
        deletions: iterable of records exposing ``chromosome``, ``start``,
            ``end``, ``strand`` and ``family``.
        te_gff: path of the reference TE GFF; comment lines (starting with
            ``#``) and blank lines are skipped.
        te_to_family: mapping from TE id to family name.
        sample_name: sample label embedded in every insertion name.

    Returns:
        List of ``mccutils.Insertion`` objects, in GFF order.

    Raises:
        KeyError: if a TE id (or ``""`` when a line has no ``ID=``
            attribute) is missing from ``te_to_family``.
    """

    def location_key(te):
        # Identity used to match a reference TE against a deletion call.
        return "_".join(
            [te.chromosome,
             str(te.start),
             str(te.end), te.strand, te.family])

    ref_tes = []
    with open(te_gff, "r") as gff:
        for line in gff:
            # Drop the line ending so an ID in the last attribute stays clean.
            line = line.rstrip("\r\n")
            if not line.strip() or line.startswith("#"):
                continue

            split_line = line.split("\t")
            ref_te = mccutils.Insertion()
            ref_te.chromosome = split_line[0]
            ref_te.start = int(split_line[3])
            ref_te.end = int(split_line[4])
            ref_te.strand = split_line[6]

            te_id = ""
            for feat in split_line[8].split(";"):
                if "ID=" in feat:
                    te_id = feat.split("=")[1]

            ref_te.family = te_to_family[te_id]
            ref_te.type = "reference"
            ref_te.name = ref_te.family + "|reference|" + sample_name + "|tepid|nonab|"
            ref_tes.append(ref_te)

    # A set makes each membership test O(1) instead of scanning a list.
    absent = {location_key(deletion) for deletion in deletions}

    return [te for te in ref_tes if location_key(te) not in absent]
Exemplo n.º 13
0
def read_insertions(jitterbug_gff,
                    taxonomy,
                    chroms,
                    sample_name,
                    min_fwd_read_support=0,
                    min_rev_read_support=0,
                    min_sr_support=0,
                    min_zygosity=0.0):
    """Read Jitterbug GFF predictions into ``mccutils.Insertion`` records.

    Only lines with exactly nine tab-separated columns whose chromosome is in
    ``chroms`` are parsed. The ninth column is stripped of spaces, split on
    ``;`` and scanned for the soft-clip position, predicted superfamily,
    forward/reverse read counts, soft-clip support and zygosity.

    Args:
        jitterbug_gff: path of the GFF file.
        taxonomy: path of a tab-delimited file mapping TE name (column 1) to
            family (column 2).
        chroms: container of chromosome names to keep.
        sample_name: sample label embedded in every insertion name.
        min_fwd_read_support: minimum supporting forward reads.
        min_rev_read_support: minimum supporting reverse reads.
        min_sr_support: minimum split-read (soft-clip) support.
        min_zygosity: minimum zygosity.

    Returns:
        List of non-reference ``mccutils.Insertion`` objects. Names end in
        ``sr|`` when a valid soft-clip position replaced the GFF
        coordinates, and in ``rp|`` otherwise.

    Raises:
        KeyError: if a predicted superfamily is missing from ``taxonomy``.
    """
    insertions = []

    te_family = {}
    with open(taxonomy, "r") as tsv:
        for line in tsv:
            line = line.replace("\n", "")
            split_line = line.split("\t")
            te_family[split_line[0]] = split_line[1]

    with open(jitterbug_gff, "r") as gff:
        for line in gff:
            line = line.replace("\n", "")
            split_line = line.split("\t")
            if len(split_line) != 9:
                continue

            insert = mccutils.Insertion()
            insert.chromosome = split_line[0]
            if insert.chromosome not in chroms:
                continue

            insert.start = int(split_line[3])
            insert.end = int(split_line[4])
            insert.type = "non-reference"

            feats = split_line[8].replace(" ", "").split(";")
            sr = False
            family = "NONE"
            for feat in feats:
                if "softclipped_pos" in feat:
                    # Value looks like "(start,end)"; start is shifted by -1.
                    pos = feat.split("=")[1]
                    pos = pos.replace("(", "").replace(")", "").split(",")
                    start = int(pos[0]) - 1
                    end = int(pos[1])

                    # Use the soft-clip coordinates only when both are
                    # non-negative after the shift.
                    if start > -1 and end > -1:
                        insert.start = start
                        insert.end = end
                        sr = True

                if "predicted_superfam" in feat:
                    family = te_family[feat.split("=")[1]]

                if "supporting_fwd_reads" in feat:
                    insert.jitterbug.supporting_fwd_reads = int(
                        feat.split("=")[1])

                if "supporting_rev_reads" in feat:
                    insert.jitterbug.supporting_rev_reads = int(
                        feat.split("=")[1])

                if "softclipped_support" in feat:
                    insert.jitterbug.split_read_support = int(
                        feat.split("=")[1])

                if "zygosity" in feat:
                    insert.jitterbug.zygosity = float(feat.split("=")[1])

            insert.name = family + "|non-reference|" + sample_name + "|jitterbug|"
            # Append the evidence suffix. The read-pair branch used to
            # assign instead of append, which discarded the rest of the name.
            if sr:
                insert.name += "sr|"
            else:
                insert.name += "rp|"

            if ((insert.jitterbug.supporting_fwd_reads >=
                 min_fwd_read_support)
                    and (insert.jitterbug.supporting_rev_reads >=
                         min_rev_read_support) and
                (insert.jitterbug.split_read_support >= min_sr_support)
                    and (insert.jitterbug.zygosity >= min_zygosity)):
                insertions.append(insert)

    return insertions