Example #1
File: find_RI.py Project: babonis/gimme
def add_intervals(graph, exonsDb):
    iv_tree = IntervalTree()
    for node in graph.nodes():
        exon = exonsDb[node]
        iv_tree.add_interval(exon)

    return iv_tree
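
These examples appear to use the bx-python IntervalTree (bx.intervals.intersection), which stores an arbitrary payload per interval via add()/insert() and returns the overlapping payloads from find(). A minimal standalone sketch of that API, assuming bx-python is installed:

# Minimal sketch of the bx-python IntervalTree API used in these examples.
from bx.intervals.intersection import Interval, IntervalTree

tree = IntervalTree()
tree.add(1000, 2000, Interval(1000, 2000))  # the payload can be any object
tree.add(3000, 4000, Interval(3000, 4000))
hits = tree.find(1500, 3500)  # payloads of intervals overlapping [1500, 3500)
print(len(hits))  # 2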
Example #2
    def __init__(self,
                 gff_filename,
                 group_filename,
                 internal_fuzzy_max_dist=0,
                 self_prefix=None,
                 allow_5merge=False,
                 fastq_filename=None):
        self.gff_filename = gff_filename
        self.group_filename = group_filename
        self.self_prefix = self_prefix
        self.internal_fuzzy_max_dist = internal_fuzzy_max_dist
        self.allow_5merge = allow_5merge
        self.record_d = dict(
            (r.seqid, r) for r in GFF.collapseGFFReader(gff_filename))
        #sanity_check_seqids(self.record_d.keys()) # sanity check all IDs look like PB.1.2
        self.tree = defaultdict(lambda: {
            '+': IntervalTree(),
            '-': IntervalTree()
        })  # chr --> strand --> tree
        self.fastq_dict = None
        if fastq_filename is not None:
            self.fastq_dict = MegaPBTree.read_fastq_to_dict(fastq_filename)

        #print >> sys.stderr, "self.internal_fuzzy_max_dist is", internal_fuzzy_max_dist
        #raw_input()
        self.read_gff_as_interval_tree()
        self.group_info = MegaPBTree.read_group(
            self.group_filename,
            self.self_prefix)  # ex: PB.1.1 --> [ RatHeart|i3_c123.... ]
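
The chrom → strand → IntervalTree layout built in __init__ above recurs in several later examples. A minimal sketch of the pattern, again assuming the bx-python tree:

# Sketch of the chr --> strand --> tree pattern used above.
from collections import defaultdict
from bx.intervals.intersection import IntervalTree

tree = defaultdict(lambda: {'+': IntervalTree(), '-': IntervalTree()})
tree['chr1']['+'].insert(100, 200, 'PB.1.1')  # any payload works
print(tree['chr1']['+'].find(150, 160))  # ['PB.1.1']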
Example #3
class TestFindRI(unittest.TestCase):
    def setUp(self):
        self.exonsDB = {}
        self.ex1 = Exon('chrX', 1000, 2000, 'ex1.1', '+')
        self.ex2 = Exon('chrX', 3000, 4000, 'ex1.1', '+')
        self.ex3 = Exon('chrX', 5000, 6000, 'ex1.1', '+')
        self.ex4 = Exon('chrX', 7000, 8000, 'ex1.1', '+')
        self.exonsDB[str(self.ex1)] = self.ex1
        self.exonsDB[str(self.ex2)] = self.ex2
        self.exonsDB[str(self.ex3)] = self.ex3
        self.exonsDB[str(self.ex4)] = self.ex4
        self.tree = IntervalTree()
        self.tree.add_interval(self.ex1)
        self.tree.add_interval(self.ex2)
        self.tree.add_interval(self.ex3)
        self.tree.add_interval(self.ex4)
        self.graph = nx.DiGraph()

    def test_no_retained_introns(self):
        self.path1 = [str(self.ex1), str(self.ex2), str(self.ex3)]
        self.path2 = [str(self.ex1), str(self.ex3), str(self.ex4)]
        self.graph.add_path(self.path1)
        self.graph.add_path(self.path2)
        self.events = list(find_RI(self.graph, self.tree, self.exonsDB))

        self.assertEqual(len(self.events), 0)

    def test_one_retained_introns(self):
        self.ex5 = Exon('chrX', 3000, 6000, 'ex1.1', '+')
        self.exonsDB[str(self.ex5)] = self.ex5
        self.tree.add_interval(self.ex5)

        self.path1 = [str(self.ex1), str(self.ex2),
                        str(self.ex3), str(self.ex4)]
        self.path2 = [str(self.ex1), str(self.ex5), str(self.ex4)]
        self.graph.add_path(self.path1)
        self.graph.add_path(self.path2)
        self.events = list(find_RI(self.graph, self.tree, self.exonsDB))

        self.assertEqual(len(self.events), 1)

    def test_two_retained_introns(self):
        self.ex5 = Exon('chrX', 1000, 4000, 'ex1.1', '+')
        self.exonsDB[str(self.ex5)] = self.ex5
        self.tree.add_interval(self.ex5)

        self.ex6 = Exon('chrX', 5000, 8000, 'ex1.1', '+')
        self.exonsDB[str(self.ex6)] = self.ex6
        self.tree.add_interval(self.ex6)

        self.path1 = [str(self.ex1), str(self.ex2),
                        str(self.ex3), str(self.ex4)]
        self.path2 = [str(self.ex5), str(self.ex6)]
        self.graph.add_path(self.path1)
        self.graph.add_path(self.path2)
        self.events = list(find_RI(self.graph, self.tree, self.exonsDB))

        self.assertEqual(len(self.events), 2)
Example #4
def read_scrubbed_junction_to_tree(junction_filename):
    tree = defaultdict(lambda: IntervalTree())
    for line in open(junction_filename):
        chrom, left, right, strand = line.strip().split('\t')
        left, right = int(left), int(right) # already 0-based start, 0-based end
        tree[chrom,strand].add(left, right, Interval(left, right))
    return tree
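
A short usage sketch for the (chrom, strand)-keyed structure returned above; the file name and coordinates are hypothetical:

# Hypothetical usage of read_scrubbed_junction_to_tree().
junction_tree = read_scrubbed_junction_to_tree('scrubbed.junctions.bed')
for iv in junction_tree['chr1', '+'].find(10000, 20000):
    print(iv.start, iv.end)  # junctions overlapping [10000, 20000)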
Example #5
def main_maize(ki11_snps=None, dirs=None):
    if ki11_snps is None:
        ki11_snps = defaultdict(lambda: {})  # chrom -> pos -> VCF record
        debug_count = 0
        for r in vcf.VCFReader(open('B73Ki11.q20.vcf')):
            ki11_snps[r.CHROM][r.POS] = r
            #if debug_count > 100000: break
            debug_count += 1

    print >> sys.stderr, 'Finished reading B73Ki11.q20.vcf.'

    ki11_shortread_cov = defaultdict(
        lambda: {})  # chrom -> pos -> short read cov
    # read the raw Ki11 pileup to get coverage in places where no SNPs were called
    for r in sp.MPileUpReader('Ki11.raw.mpileup'):
        if r is not None:
            ki11_shortread_cov[r.chr][r.pos] = r.cov
    print >> sys.stderr, "Fnished reading Ki11.raw.mpileup."

    repeat_by_chrom = {}
    # read the Tandem Repeat Finder summary
    for r in DictReader(open('B73_RefV4.fa.repeat_list.txt'), delimiter='\t'):
        if r['chrom'] not in repeat_by_chrom:
            repeat_by_chrom[r['chrom']] = IntervalTree()
        repeat_by_chrom[r['chrom']].add(int(r['start0']), int(r['end1']))

    print >> sys.stderr, 'Finished reading B73_RefV4.fa.repeat_list.txt.'

    FIELDS = [
        'dir', 'chrom', 'pos', 'ref', 'alt_Short', 'alt_PB', 'in_Short',
        'in_PB', 'cov_Short', 'cov_PB', 'genomic_HP'
    ]
    out_f = open('evaled.isophase_SNP.txt', 'w')
    writer_f = DictWriter(out_f, FIELDS, delimiter='\t')
    writer_f.writeheader()

    debug_count = 0
    if dirs is None: dirs = glob.glob('by_loci/*size*/')
    for d1 in dirs:
        #if debug_count > 100: break
        debug_count += 1
        mpileup = os.path.join(d1, 'ccs.mpileup')
        mapfile = os.path.join(d1, 'fake.mapping.txt')
        vcffile = os.path.join(d1, 'phased.partial.vcf')
        nosnp = os.path.join(d1, 'phased.partial.NO_SNPS_FOUND')
        if not os.path.exists(vcffile):
            assert os.path.exists(nosnp)
            print >> sys.stderr, (
                'Skipping {0} because no SNPs found.').format(d1)
        else:
            print >> sys.stderr, ('Evaluating {0}.').format(d1)
            good_positions, cov_at_pos = get_positions_to_recover(
                mapfile, mpileup, ki11_snps, min_cov=30
            )  # use a lower min cov here because of a few close cases where BQ filtering lowered cov
            name = d1.split('/')[1]
            eval_isophase(vcffile, ki11_snps, good_positions, cov_at_pos,
                          repeat_by_chrom, ki11_shortread_cov, writer_f, name)

    out_f.close()
    return ki11_snps
Example #6
def scrub_junctions(report_filename, output_filename, min_sample, min_transcript, accept_all_canonical):
    tree = defaultdict(lambda: IntervalTree())
    f = open(output_filename, 'w')
    for _label, junctions in read_junction_report(report_filename):
        good = scrub_junction_by_label(junctions, min_sample, min_transcript, accept_all_canonical)
        for r in good:
            a, b = int(r['left']), int(r['right']) # 0-based start, 0-based end
            f.write("{chrom}\t{left}\t{right}\t{strand}\n".format(\
                chrom=r['chr'], left=r['left'], right=r['right'], strand=r['strand']))
            tree[r['chr'],r['strand']].add(a, b, Interval(a, b))
    f.close()
    return tree
Example #7
    def __init__(self,
                 gff_filename,
                 group_filename,
                 internal_fuzzy_max_dist=0,
                 self_prefix=None):
        self.gff_filename = gff_filename
        self.group_filename = group_filename
        self.self_prefix = self_prefix
        self.internal_fuzzy_max_dist = internal_fuzzy_max_dist
        self.record_d = dict(
            (r.seqid, r) for r in GFF.collapseGFFReader(gff_filename))
        self.tree = defaultdict(lambda: {
            '+': IntervalTree(),
            '-': IntervalTree()
        })  # chr --> strand --> tree

        #print >> sys.stderr, "self.internal_fuzzy_max_dist is", internal_fuzzy_max_dist
        #raw_input()
        self.read_gff_as_interval_tree()
        self.group_info = MegaPBTree.read_group(
            self.group_filename,
            self.self_prefix)  # ex: PB.1.1 --> [ RatHeart|i3_c123.... ]
Example #8
File: find_MXE.py Project: babonis/gimme
def remove_overlaps(events, exonsDB):
    tree = IntervalTree()
    all_nodes = set()
    for path in events:
        for node in path:
            all_nodes.add(node)
            exon = exonsDB[node]
            tree.add_interval(exon)

    overlapped_exons = set()
    for node in all_nodes:
        exon = exonsDB[node]
        for overlap in tree.find(exon.start, exon.end):
            if (overlap.start != exon.start or
                    overlap.end != exon.end):
                overlapped_exons.add(node)

    new_events = []
    for path in events:
        if len(set(path).intersection(overlapped_exons)) == 0:
            new_events.append(path)
    return new_events
Example #9
def read_scrubbed_junction_to_tree(junction_filename):
    tree = defaultdict(lambda: IntervalTree())
    f = open(junction_filename)
    if not f.readline().startswith('track'): f.seek(0)
    for line in f:
        raw = line.strip().split('\t')
        if len(raw) == 4: chrom, left, right, strand = raw
        elif len(raw) == 6: chrom, left, right, _name, _count, strand = raw
        else:
            raise Exception, "Expects junction BED file to have either 4 or 6 columns! Saw {0}!".format(
                len(raw))
        left, right = int(left), int(
            right)  # already 0-based start, 0-based end
        tree[chrom, strand].add(left, right, Interval(left, right))
    return tree
Example #10
def read_probe_bed(bed_filename, start_base=0, end_base=1):
    """
    Read a probe BED file <chrom>, <start>, <end>
    Return dict of chrom --> IntervalTree w/ data=(index, interval)
    """
    tree = {}
    gene_info = {}
    i = 0
    reader = BED.SimpleBEDReader(bed_filename, start_base, end_base)
    for r in reader:
        if r.chr not in tree: tree[r.chr] = IntervalTree()
        tree[r.chr].add(r.start, r.end, (i, Interval(r.start, r.end)))
        if r.name is not None:
            gene_info[i] = r.name
        i += 1
    return tree, gene_info
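
A hedged usage sketch: each hit from the probe tree is an (index, Interval) tuple, and the index keys into the parallel gene_info dict; the file name and region are hypothetical:

# Hypothetical usage of read_probe_bed(); payloads are (index, Interval) tuples.
probe_tree, gene_info = read_probe_bed('probes.bed')
for idx, iv in probe_tree.get('chr1', IntervalTree()).find(5000, 6000):
    print(idx, gene_info.get(idx), iv.start, iv.end)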
Example #11
def setUp(self):
    self.exonsDB = {}
    self.ex1 = Exon('chrX', 1000, 2000, 'ex1.1', '+')
    self.ex2 = Exon('chrX', 3000, 4000, 'ex1.1', '+')
    self.ex3 = Exon('chrX', 5000, 6000, 'ex1.1', '+')
    self.ex4 = Exon('chrX', 7000, 8000, 'ex1.1', '+')
    self.exonsDB[str(self.ex1)] = self.ex1
    self.exonsDB[str(self.ex2)] = self.ex2
    self.exonsDB[str(self.ex3)] = self.ex3
    self.exonsDB[str(self.ex4)] = self.ex4
    self.tree = IntervalTree()
    self.tree.add_interval(self.ex1)
    self.tree.add_interval(self.ex2)
    self.tree.add_interval(self.ex3)
    self.tree.add_interval(self.ex4)
    self.graph = nx.DiGraph()
Example #12
def find_best_match_junction(
    tree: IntervalTree,
    donor: int,
    accep: int,
    max_diff: int = 20,
) -> Optional[Interval]:
    """
    donor, accep -- both should be 0-based
    """
    hits = tree.find(donor, accep)
    if len(hits) == 0:
        return None
    elif len(hits) == 1:
        if hits[0].start - donor > max_diff or hits[0].end - accep > max_diff:
            return None
        return hits[0]
    else:  # multiple hits, find the closest one
        diff = []
        for h in hits:
            if h.start - donor > max_diff or h.end - accep > max_diff:
                continue
            diff.append((abs(h.start - donor) + abs(h.end - accep), h))
        if len(diff) == 0:  # every hit exceeded max_diff
            return None
        diff.sort(key=lambda x: x[0])
        return diff[0][1]
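
A small self-contained check of the matching logic above, assuming the bx-python tree: a donor/acceptor pair a few bases off a stored junction should still return that junction.

# Hedged sketch: query with a slightly shifted donor/acceptor pair.
from bx.intervals.intersection import Interval, IntervalTree

tree = IntervalTree()
tree.add(10000, 11000, Interval(10000, 11000))
hit = find_best_match_junction(tree, donor=10005, accep=10995)
print(hit.start if hit else None)  # 10000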
Example #13
def collapse_fuzzy_junctions(gff_filename, group_filename, allow_extra_5exon,
                             internal_fuzzy_max_dist):
    def get_fl_from_id(members):
        # ex: 13cycle_1Mag1Diff|i0HQ_SIRV_1d1m|c139597/f1p0/178
        return sum(int(_id.split('/')[1].split('p')[0][1:]) for _id in members)

    def can_merge(m, r1, r2):
        if m == 'exact':
            return True
        else:
            if not allow_extra_5exon:
                return False
        # below is continued only if (a) is 'subset' or 'super' AND (b) allow_extra_5exon is True
        if m == 'subset':
            r1, r2 = r2, r1  #  rotate so r1 is always the longer one
        if m == 'super' or m == 'subset':
            n2 = len(r2.ref_exons)
            # check that (a) r1 and r2 end on same 3' exon, that is the last acceptor site agrees
            # AND (b) the 5' start of r2 is sandwiched between the matching r1 exon coordinates
            if r1.strand == '+':
                return abs(r1.ref_exons[-1].start - r2.ref_exons[-1].start) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[-n2].start <= r2.ref_exons[0].start < r1.ref_exons[-n2].end
            else:
                return abs(r1.ref_exons[0].end - r2.ref_exons[0].end) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[n2-1].start <= r2.ref_exons[-1].end < r1.ref_exons[n2].end
        return False

    d = {}
    recs = defaultdict(lambda: {
        '+': IntervalTree(),
        '-': IntervalTree()
    })  # chr --> strand --> tree
    fuzzy_match = defaultdict(lambda: [])
    for r in GFF.collapseGFFReader(gff_filename):
        d[r.seqid] = r
        has_match = False
        r.segments = r.ref_exons
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            r2.segments = r2.ref_exons
            m = compare_junctions.compare_junctions(
                r, r2, internal_fuzzy_max_dist=internal_fuzzy_max_dist)
            if can_merge(m, r, r2):
                fuzzy_match[r2.seqid].append(r.seqid)
                has_match = True
                break
        if not has_match:
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    group_info = {}
    with open(group_filename) as f:
        for line in f:
            pbid, members = line.strip().split('\t')
            group_info[pbid] = [x for x in members.split(',')]

    # pick for each fuzzy group the one that has the most exons (if tie, then most FL)
    keys = fuzzy_match.keys()
    keys.sort(key=lambda x: map(int, x.split('.')[1:]))
    f_gff = open(gff_filename + '.fuzzy', 'w')
    f_group = open(group_filename + '.fuzzy', 'w')
    for k in keys:
        all_members = []
        best_pbid, best_size, best_num_exons = fuzzy_match[k][0], len(
            group_info[fuzzy_match[k][0]]), len(d[fuzzy_match[k][0]].ref_exons)
        all_members += group_info[fuzzy_match[k][0]]
        for pbid in fuzzy_match[k][1:]:
            _size = get_fl_from_id(group_info[pbid])
            _num_exons = len(d[pbid].ref_exons)
            all_members += group_info[pbid]
            if _num_exons > best_num_exons or (_num_exons == best_num_exons
                                               and _size > best_size):
                best_pbid, best_size, best_num_exons = pbid, _size, _num_exons
        GFF.write_collapseGFF_format(f_gff, d[best_pbid])
        f_group.write("{0}\t{1}\n".format(best_pbid, ",".join(all_members)))
    f_gff.close()
    f_group.close()

    return fuzzy_match
Example #14
def categorize_aln_by_annotation(gene_annotation_file,
                                 input_fasta,
                                 input_sam,
                                 output_prefix,
                                 min_overlap_bp=200,
                                 min_query_overlap=.5,
                                 min_gene_overlap=.8):

    t = defaultdict(lambda: {
        '+': IntervalTree(),
        '-': IntervalTree()
    })  # chr -> strand -> IntervalTree
    info = {}

    #reader = DictReader(open('ProteinTable149_154224.txt'),delimiter='\t')
    for r in DictReader(open(gene_annotation_file), delimiter='\t'):
        if r['#Replicon Name'] != 'chr':
            print("Ignore", r, file=sys.stderr)
            continue
        info[r['Locus tag']] = (int(r['Start']), int(r['Stop']),
                                r['Locus tag'])
        t[r['Replicon Accession']][r['Strand']].add(int(r['Start']),
                                                    int(r['Stop']),
                                                    r['Locus tag'])

    #pdb.set_trace()

    result = defaultdict(lambda: [])  # gene -> list of rec
    d = dict(
        (r.id, len(r.seq)) for r in SeqIO.parse(open(input_fasta), 'fasta'))

    reader = BioReaders.GMAPSAMReader(input_sam, True, query_len_dict=d)
    for r in reader:
        #if r.qID == 'm151125_055539_42275_c100921822550000001823204305121656_s1_p0/121461/30_2108_CCS':
        #    pdb.set_trace()
        ans = match_w_annotation(t, r, info, min_overlap_bp, min_query_overlap,
                                 min_gene_overlap)
        # ans is AMatch(name, strand, start, end, record)
        result[ans.name].append(ans)

    novel_ct = defaultdict(lambda: {
        '+': ClusterTree(0, 0),
        '-': ClusterTree(0, 0)
    })
    novel_list = []
    novel_index = 0

    f = open(output_prefix + '.sam', 'w')
    f.write(reader.header)
    f1 = open(output_prefix + '.report.txt', 'w')
    f1.write("id\tread_group\tgene_name\tserial_number\tstrand\tstart\tend\n")
    for k, v in result.items():
        # v is: list of AMatch(name, strand, start, end, record)
        if k.startswith('novel-unannotated'):
            # write novel later, we are grouping them by loci first
            #tagRG='novel'
            for x in v:
                novel_ct[x.record.sID][x.strand].insert(
                    x.start, x.end, novel_index)
                novel_index += 1
                novel_list.append(x)
            continue
        elif k.startswith('novel-antisense'):
            tagRG = 'novel-antisense'
        elif k.startswith('novel-partial'):
            tagRG = 'novel-partial'
        elif k.startswith('poly-'):
            tagRG = 'poly'
        else:
            tagRG = 'single'
        v.sort(key=lambda x: (x.start, x.end),
               reverse=True
               if v[0].strand == '-' else False)  # sort by start, then end
        for i, x in enumerate(v):
            f.write("{0}\tSN:Z:{1:06d}\tRG:Z:{2}\tgn:Z:{3}\n".format(
                x.record.record_line, i + 1, tagRG, k))
            if x.strand == '+':
                f1.write("{0}\t{1}\t{2}\t{3:06d}\t{4}\t{5}\t{6}\n".format(\
                    x.record.qID, tagRG, k, i+1, x.strand, x.start+1, x.end))
            else:  # - strand, start is end, end is start
                f1.write("{0}\t{1}\t{2}\t{3:06d}\t{4}\t{5}\t{6}\n".format(\
                    x.record.qID, tagRG, k, i+1, x.strand, x.end, x.start+1))

    # now write the novel stuff, grouped by regions
    novel_region_index = 1
    for d1 in novel_ct.values():
        for ct in d1.values():
            gn = 'novel-' + str(novel_region_index)
            for _start, _end, _indices in ct.getregions():
                v = [novel_list[ind] for ind in _indices]
                v.sort(key=lambda x: (x.start, x.end),
                       reverse=True if v[0].strand == '-' else
                       False)  # sort by start, then end
                for i, x in enumerate(v):
                    f.write("{0}\tSN:Z:{1:06d}\tRG:Z:{2}\tgn:Z:{3}\n".format(
                        x.record.record_line, i + 1, "novel-unannotated", gn))
                    if x.strand == '+':
                        f1.write("{0}\t{1}\t{2}\t{3:06d}\t{4}\t{5}\t{6}\n".format(\
                            x.record.qID, "novel-unannotated", gn, i+1, x.strand, x.start+1, x.end))
                    else:
                        f1.write("{0}\t{1}\t{2}\t{3:06d}\t{4}\t{5}\t{6}\n".format(\
                            x.record.qID, "novel-unannotated", gn, i+1, x.strand, x.end, x.start+1))
                novel_region_index += 1

    f.close()
    f1.close()

    print("Output written to:", f.name, file=sys.stderr)
    print("Output written to:", f1.name, file=sys.stderr)
Example #15
def categorize_aln_by_annotation(
    gene_annotation_file: str,
    input_fasta: str,
    input_sam: str,
    output_prefix: str,
    min_overlap_bp: int = 200,
    min_query_overlap: float = 0.5,
    min_gene_overlap: float = 0.8,
) -> None:

    t = defaultdict(
        lambda: {"+": IntervalTree(), "-": IntervalTree()}
    )  # chr -> strand -> IntervalTree
    info = {}

    # reader = DictReader(open('ProteinTable149_154224.txt'),delimiter='\t')
    for r in DictReader(open(gene_annotation_file), delimiter="\t"):
        if r["#Replicon Name"] != "chr":
            logger.info(f"Ignore {r}")
            continue
        info[r["Locus tag"]] = (int(r["Start"]), int(r["Stop"]), r["Locus tag"])
        t[r["Replicon Accession"]][r["Strand"]].add(
            int(r["Start"]), int(r["Stop"]), r["Locus tag"]
        )

    # pdb.set_trace()

    result = defaultdict(lambda: [])  # gene -> list of rec
    d = {r.id: len(r.seq) for r in SeqIO.parse(open(input_fasta), "fasta")}

    reader = BioReaders.GMAPSAMReader(input_sam, True, query_len_dict=d)
    for r in reader:
        # if r.qID == 'm151125_055539_42275_c100921822550000001823204305121656_s1_p0/121461/30_2108_CCS':
        #    pdb.set_trace()
        ans = match_w_annotation(
            t, r, info, min_overlap_bp, min_query_overlap, min_gene_overlap
        )
        # ans is AMatch(name, strand, start, end, record)
        result[ans.name].append(ans)

    novel_ct = defaultdict(lambda: {"+": ClusterTree(0, 0), "-": ClusterTree(0, 0)})
    novel_list = []
    novel_index = 0

    with open(f"{output_prefix}.sam", "w") as f, open(
        f"{output_prefix}.report.txt", "w"
    ) as f1:
        f.write(reader.header)
        f1.write("id\tread_group\tgene_name\tserial_number\tstrand\tstart\tend\n")
        for k, v in result.items():
            # v is: list of AMatch(name, strand, start, end, record)
            if k.startswith("novel-unannotated"):
                # write novel later, we are grouping them by loci first
                # tagRG='novel'
                for x in v:
                    novel_ct[x.record.sID][x.strand].insert(x.start, x.end, novel_index)
                    novel_index += 1
                    novel_list.append(x)
                continue
            elif k.startswith("novel-antisense"):
                tagRG = "novel-antisense"
            elif k.startswith("novel-partial"):
                tagRG = "novel-partial"
            elif k.startswith("poly-"):
                tagRG = "poly"
            else:
                tagRG = "single"
            v.sort(
                key=lambda x: (x.start, x.end),
                reverse=bool(v[0].strand == "-"),
            )  # sort by start, then end
            for i, x in enumerate(v):
                f.write(
                    f"{x.record.record_line}\tSN:Z:{i + 1:06d}\tRG:Z:{tagRG}\tgn:Z:{k}\n"
                )
                if x.strand == "+":
                    f1.write(
                        f"{x.record.qID}\t{tagRG}\t{k}\t{i + 1:06d}\t{x.strand}\t{x.start + 1}\t{x.end}\n"
                    )
                else:  # - strand, start is end, end is start
                    f1.write(
                        f"{x.record.qID}\t{tagRG}\t{k}\t{i + 1:06d}\t{x.strand}\t{x.end}\t{x.start + 1}\n"
                    )

        # now write the novel stuff, grouped by regions
        novel_region_index = 1
        for d1 in novel_ct.values():
            for ct in d1.values():
                gn = f"novel-{str(novel_region_index)}"
                for *_, _indices in ct.getregions():
                    v = [novel_list[ind] for ind in _indices]
                    v.sort(
                        key=lambda x: (x.start, x.end),
                        reverse=bool(v[0].strand == "-"),
                    )  # sort by start, then end
                    for i, x in enumerate(v):
                        f.write(
                            f"{x.record.record_line}\tSN:Z:{i + 1:06d}\tRG:Z:{'novel-unannotated'}\tgn:Z:{gn}\n"
                        )
                        if x.strand == "+":
                            f1.write(
                                f"{x.record.qID}\t{'novel-unannotated'}\t{gn}\t{i + 1:06d}\t{x.strand}\t{x.start + 1}\t{x.end}\n"
                            )
                        else:
                            f1.write(
                                f"{x.record.qID}\t{'novel-unannotated'}\t{gn}\t{i + 1:06d}\t{x.strand}\t{x.end}\t{x.start + 1}\n"
                            )
                    novel_region_index += 1

        logger.info(f"Output written to: {f.name}")
        logger.info(f"Output written to: {f1.name}")
Example #16
def collapse_fuzzy_junctions(
    gff_filename: Union[str, Path],
    group_filename: Union[str, Path],
    allow_extra_5exon: bool,
    internal_fuzzy_max_dist: int,
    max_5_diff: int,
    max_3_diff: int,
) -> defaultdict:
    def can_merge(m, r1, r2):
        if m == "exact":
            return True
        else:
            if not allow_extra_5exon:
                return False
        # below is continued only if (a) is 'subset' or 'super' AND (b) allow_extra_5exon is True
        if m == "subset":
            r1, r2 = r2, r1  # rotate so r1 is always the longer one
        if m == "super" or m == "subset":
            n2 = len(r2.ref_exons)
            # check that (a) r1 and r2 end on same 3' exon, that is the last acceptor site agrees
            # AND (b) the 5' start of r2 is sandwiched between the matching r1 exon coordinates
            if r1.strand == "+":
                return (abs(r1.ref_exons[-1].start - r2.ref_exons[-1].start) <=
                        internal_fuzzy_max_dist and r1.ref_exons[-n2].start <=
                        r2.ref_exons[0].start < r1.ref_exons[-n2].end)
            else:
                return (abs(r1.ref_exons[0].end - r2.ref_exons[0].end) <=
                        internal_fuzzy_max_dist and r1.ref_exons[n2 - 1].start
                        <= r2.ref_exons[-1].end < r1.ref_exons[n2].end)
        return False

    d = {}
    # chr --> strand --> tree
    recs = defaultdict(lambda: {"+": IntervalTree(), "-": IntervalTree()})
    fuzzy_match = defaultdict(lambda: [])
    for r in GFF.collapseGFFReader(gff_filename):
        d[r.seqid] = r
        has_match = False
        r.segments = r.ref_exons
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            r2.segments = r2.ref_exons
            m = compare_junctions(
                r,
                r2,
                internal_fuzzy_max_dist=internal_fuzzy_max_dist,
                max_5_diff=max_5_diff,
                max_3_diff=max_3_diff,
            )
            if can_merge(m, r, r2):
                fuzzy_match[r2.seqid].append(r.seqid)
                has_match = True
                break
        if not has_match:
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    group_info = {}
    with open(group_filename) as f:
        for line in f:
            pbid, members = line.strip().split("\t")
            group_info[pbid] = members.split(",")

    # pick for each fuzzy group the one that has the most exons
    keys = list(fuzzy_match.keys())
    keys.sort(key=lambda x: int(x.split(".")[1]))

    with open(f"{gff_filename}.fuzzy",
              "w") as f_gff, open(f"{group_filename}.fuzzy", "w") as f_group:
        for k in keys:
            all_members = []
            best_pbid, best_size, best_num_exons = (
                fuzzy_match[k][0],
                len(group_info[fuzzy_match[k][0]]),
                len(d[fuzzy_match[k][0]].ref_exons),
            )
            all_members += group_info[fuzzy_match[k][0]]
            for pbid in fuzzy_match[k][1:]:
                _num_exons = len(d[pbid].ref_exons)
                _size = len(group_info[pbid])
                all_members += group_info[pbid]
                if _num_exons > best_num_exons or (_num_exons == best_num_exons
                                                   and _size > best_size):
                    best_pbid, best_size, best_num_exons = pbid, _size, _num_exons
            GFF.write_collapseGFF_format(f_gff, d[best_pbid])
            f_group.write(f'{best_pbid}\t{",".join(all_members)}\n')

    return fuzzy_match
Example #17
def main_maize(ki11_snps=None, dirs=None):
    if ki11_snps is None:
        ki11_snps = defaultdict(lambda: {})  # chrom -> pos -> VCF record
        debug_count = 0
        for r in vcfpy.Reader("B73Ki11.q20.vcf"):
            ki11_snps[r.CHROM][r.POS] = r
            # if debug_count > 100000: break
            debug_count += 1

    logger.info("Finished reading B73Ki11.q20.vcf.")

    ki11_shortread_cov = defaultdict(
        lambda: {})  # chrom -> pos -> short read cov
    # read the raw Ki11 pileup to get coverage in places where no SNPs were called
    for r in MPileUpReader("Ki11.raw.mpileup"):
        if r is not None:
            ki11_shortread_cov[r.chr][r.pos] = r.cov
    logger.info("Fnished reading Ki11.raw.mpileup.")

    repeat_by_chrom = {}
    # read the Tandem Repeat Finder summary
    for r in DictReader(open("B73_RefV4.fa.repeat_list.txt"), delimiter="\t"):
        if r["chrom"] not in repeat_by_chrom:
            repeat_by_chrom[r["chrom"]] = IntervalTree()
        repeat_by_chrom[r["chrom"]].add(int(r["start0"]), int(r["end1"]))

    logger.info("Finished reading B73_RefV4.fa.repeat_list.txt.")

    FIELDS = [
        "dir",
        "chrom",
        "pos",
        "ref",
        "alt_Short",
        "alt_PB",
        "in_Short",
        "in_PB",
        "cov_Short",
        "cov_PB",
        "genomic_HP",
    ]
    with open("evaled.isophase_SNP.txt", "w") as out_f:
        writer_f = DictWriter(out_f, FIELDS, delimiter="\t")
        writer_f.writeheader()

        debug_count = 0
        if dirs is None:
            dirs = glob.glob("by_loci/*size*/")
        for d1 in dirs:
            # if debug_count > 100: break
            debug_count += 1
            mpileup = Path(d1, "ccs.mpileup")
            mapfile = Path(d1, "fake.mapping.txt")
            vcffile = Path(d1, "phased.partial.vcf")
            nosnp = Path(d1, "phased.partial.NO_SNPS_FOUND")
            if not vcffile.exists():
                assert nosnp.exists()
                logger.info(f"Skipping {d1} because no SNPs found.")
            else:
                logger.info(f"Evaluating {d1}.")
                good_positions, cov_at_pos = get_positions_to_recover(
                    mapfile, mpileup, ki11_snps, min_cov=30
                )  # use a lower min cov here because of a few close cases where BQ filtering lowered cov
                name = d1.split("/")[1]
                eval_isophase(
                    vcffile,
                    ki11_snps,
                    good_positions,
                    cov_at_pos,
                    repeat_by_chrom,
                    ki11_shortread_cov,
                    writer_f,
                    name,
                )

    return ki11_snps