def add_intervals(graph, exonsDB):  # parameter renamed from exonsDb: the body referenced exonsDB, which would raise NameError
    iv_tree = IntervalTree()
    for node in graph.nodes():
        exon = exonsDB[node]
        iv_tree.add_interval(exon)
    return iv_tree
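# A minimal usage sketch of add_intervals, assuming bx-python's
# IntervalTree/Interval and networkx. The keys and coordinates are invented;
# in the real code exonsDB maps node IDs to Exon objects with .start/.end.
import networkx as nx
from bx.intervals.intersection import Interval, IntervalTree

exonsDB = {"ex1": Interval(1000, 2000), "ex2": Interval(3000, 4000)}
graph = nx.DiGraph()
graph.add_edge("ex1", "ex2")

tree = add_intervals(graph, exonsDB)
print(tree.find(1500, 3500))  # both exons overlap this query window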
def __init__(self, gff_filename, group_filename, internal_fuzzy_max_dist=0,
             self_prefix=None, allow_5merge=False, fastq_filename=None):
    self.gff_filename = gff_filename
    self.group_filename = group_filename
    self.self_prefix = self_prefix
    self.internal_fuzzy_max_dist = internal_fuzzy_max_dist
    self.allow_5merge = allow_5merge
    self.record_d = dict((r.seqid, r) for r in GFF.collapseGFFReader(gff_filename))
    #sanity_check_seqids(self.record_d.keys())  # sanity check all IDs look like PB.1.2
    self.tree = defaultdict(lambda: {'+': IntervalTree(), '-': IntervalTree()})  # chr --> strand --> tree
    self.fastq_dict = None
    if fastq_filename is not None:
        self.fastq_dict = MegaPBTree.read_fastq_to_dict(fastq_filename)
    self.read_gff_as_interval_tree()
    self.group_info = MegaPBTree.read_group(self.group_filename, self.self_prefix)  # ex: PB.1.1 --> [ RatHeart|i3_c123.... ]
class TestFindRI(unittest.TestCase):
    def setUp(self):
        self.exonsDB = {}
        self.ex1 = Exon('chrX', 1000, 2000, 'ex1.1', '+')
        self.ex2 = Exon('chrX', 3000, 4000, 'ex1.1', '+')
        self.ex3 = Exon('chrX', 5000, 6000, 'ex1.1', '+')
        self.ex4 = Exon('chrX', 7000, 8000, 'ex1.1', '+')
        self.exonsDB[str(self.ex1)] = self.ex1
        self.exonsDB[str(self.ex2)] = self.ex2
        self.exonsDB[str(self.ex3)] = self.ex3
        self.exonsDB[str(self.ex4)] = self.ex4
        self.tree = IntervalTree()
        self.tree.add_interval(self.ex1)
        self.tree.add_interval(self.ex2)
        self.tree.add_interval(self.ex3)
        self.tree.add_interval(self.ex4)
        self.graph = nx.DiGraph()

    def test_no_retained_introns(self):
        self.path1 = [str(self.ex1), str(self.ex2), str(self.ex3)]
        self.path2 = [str(self.ex1), str(self.ex3), str(self.ex4)]
        self.graph.add_path(self.path1)
        self.graph.add_path(self.path2)
        self.events = list(find_RI(self.graph, self.tree, self.exonsDB))
        self.assertEqual(len(self.events), 0)

    def test_one_retained_intron(self):
        # ex5 spans ex2, the intron between them, and ex3: one retained intron
        self.ex5 = Exon('chrX', 3000, 6000, 'ex1.1', '+')
        self.exonsDB[str(self.ex5)] = self.ex5
        self.tree.add_interval(self.ex5)
        self.path1 = [str(self.ex1), str(self.ex2), str(self.ex3), str(self.ex4)]
        self.path2 = [str(self.ex1), str(self.ex5), str(self.ex4)]
        self.graph.add_path(self.path1)
        self.graph.add_path(self.path2)
        self.events = list(find_RI(self.graph, self.tree, self.exonsDB))
        self.assertEqual(len(self.events), 1)

    def test_two_retained_introns(self):
        # ex5 retains the ex1/ex2 intron, ex6 retains the ex3/ex4 intron
        self.ex5 = Exon('chrX', 1000, 4000, 'ex1.1', '+')
        self.exonsDB[str(self.ex5)] = self.ex5
        self.tree.add_interval(self.ex5)
        self.ex6 = Exon('chrX', 5000, 8000, 'ex1.1', '+')
        self.exonsDB[str(self.ex6)] = self.ex6
        self.tree.add_interval(self.ex6)
        self.path1 = [str(self.ex1), str(self.ex2), str(self.ex3), str(self.ex4)]
        self.path2 = [str(self.ex5), str(self.ex6)]
        self.graph.add_path(self.path1)
        self.graph.add_path(self.path2)
        self.events = list(find_RI(self.graph, self.tree, self.exonsDB))
        self.assertEqual(len(self.events), 2)
def read_scrubbed_junction_to_tree(junction_filename):
    tree = defaultdict(lambda: IntervalTree())
    for line in open(junction_filename):
        chrom, left, right, strand = line.strip().split('\t')
        left, right = int(left), int(right)  # already 0-based start, 0-based end
        tree[chrom, strand].add(left, right, Interval(left, right))
    return tree
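# Hypothetical usage of read_scrubbed_junction_to_tree. The 4-column
# tab-delimited format (chrom, 0-based left, 0-based right, strand) is taken
# from the parser above; the file name itself is made up.
with open("demo.junctions.txt", "w") as f:
    f.write("chr1\t100\t200\t+\n")
    f.write("chr1\t300\t400\t+\n")

tree = read_scrubbed_junction_to_tree("demo.junctions.txt")
hits = tree["chr1", "+"].find(150, 160)  # overlaps the first junction only
print([(iv.start, iv.end) for iv in hits])  # [(100, 200)]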
def main_maize(ki11_snps=None, dirs=None):
    if ki11_snps is None:
        ki11_snps = defaultdict(lambda: {})  # chrom -> pos -> VCF record
    debug_count = 0
    for r in vcf.VCFReader(open('B73Ki11.q20.vcf')):
        ki11_snps[r.CHROM][r.POS] = r
        #if debug_count > 100000: break
        debug_count += 1
    print("Finished reading B73Ki11.q20.vcf.", file=sys.stderr)

    ki11_shortread_cov = defaultdict(lambda: {})  # chrom -> pos -> short read cov
    # read the raw Ki11 pileup to get coverage in places where no SNPs were called
    for r in sp.MPileUpReader('Ki11.raw.mpileup'):
        if r is not None:
            ki11_shortread_cov[r.chr][r.pos] = r.cov
    print("Finished reading Ki11.raw.mpileup.", file=sys.stderr)

    repeat_by_chrom = {}
    # read the Tandem Repeat Finder summary
    for r in DictReader(open('B73_RefV4.fa.repeat_list.txt'), delimiter='\t'):
        if r['chrom'] not in repeat_by_chrom:
            repeat_by_chrom[r['chrom']] = IntervalTree()
        repeat_by_chrom[r['chrom']].add(int(r['start0']), int(r['end1']))
    print("Finished reading B73_RefV4.fa.repeat_list.txt.", file=sys.stderr)

    FIELDS = ['dir', 'chrom', 'pos', 'ref', 'alt_Short', 'alt_PB',
              'in_Short', 'in_PB', 'cov_Short', 'cov_PB', 'genomic_HP']
    out_f = open('evaled.isophase_SNP.txt', 'w')
    writer_f = DictWriter(out_f, FIELDS, delimiter='\t')
    writer_f.writeheader()

    debug_count = 0
    if dirs is None:
        dirs = glob.glob('by_loci/*size*/')
    for d1 in dirs:
        #if debug_count > 100: break
        debug_count += 1
        mpileup = os.path.join(d1, 'ccs.mpileup')
        mapfile = os.path.join(d1, 'fake.mapping.txt')
        vcffile = os.path.join(d1, 'phased.partial.vcf')
        nosnp = os.path.join(d1, 'phased.partial.NO_SNPS_FOUND')
        if not os.path.exists(vcffile):
            assert os.path.exists(nosnp)
            print("Skipping {0} because no SNPs found.".format(d1), file=sys.stderr)
        else:
            print("Evaluating {0}.".format(d1), file=sys.stderr)
            # use a lower min cov here because of a few close cases where BQ filtering lowered cov
            good_positions, cov_at_pos = get_positions_to_recover(
                mapfile, mpileup, ki11_snps, min_cov=30)
            name = d1.split('/')[1]
            eval_isophase(vcffile, ki11_snps, good_positions, cov_at_pos,
                          repeat_by_chrom, ki11_shortread_cov, writer_f, name)
    out_f.close()
    return ki11_snps
def scrub_junctions(report_filename, output_filename, min_sample,
                    min_transcript, accept_all_canonical):
    tree = defaultdict(lambda: IntervalTree())
    f = open(output_filename, 'w')
    for _label, junctions in read_junction_report(report_filename):
        good = scrub_junction_by_label(junctions, min_sample, min_transcript,
                                       accept_all_canonical)
        for r in good:
            a, b = int(r['left']), int(r['right'])  # 0-based start, 0-based end
            f.write("{chrom}\t{left}\t{right}\t{strand}\n".format(
                chrom=r['chr'], left=r['left'], right=r['right'], strand=r['strand']))
            tree[r['chr'], r['strand']].add(a, b, Interval(a, b))
    f.close()
    return tree
def __init__(self, gff_filename, group_filename, internal_fuzzy_max_dist=0,
             self_prefix=None):
    self.gff_filename = gff_filename
    self.group_filename = group_filename
    self.self_prefix = self_prefix
    self.internal_fuzzy_max_dist = internal_fuzzy_max_dist
    self.record_d = dict((r.seqid, r) for r in GFF.collapseGFFReader(gff_filename))
    self.tree = defaultdict(lambda: {'+': IntervalTree(), '-': IntervalTree()})  # chr --> strand --> tree
    self.read_gff_as_interval_tree()
    self.group_info = MegaPBTree.read_group(self.group_filename, self.self_prefix)  # ex: PB.1.1 --> [ RatHeart|i3_c123.... ]
def remove_overlaps(events, exonsDB):
    tree = IntervalTree()
    all_nodes = set()
    for path in events:
        for node in path:
            all_nodes.add(node)
            exon = exonsDB[node]
            tree.add_interval(exon)

    # flag every exon that overlaps another exon with different coordinates
    overlapped_exons = set()
    for node in all_nodes:
        exon = exonsDB[node]
        for overlap in tree.find(exon.start, exon.end):
            if overlap.start != exon.start or overlap.end != exon.end:
                overlapped_exons.add(node)

    # keep only the paths that contain no overlapped exon
    new_events = []
    for path in events:
        if len(set(path).intersection(overlapped_exons)) == 0:
            new_events.append(path)
    return new_events
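# A small illustration of remove_overlaps, using bx-python Intervals as
# stand-ins for Exon objects (only .start/.end are used); keys, coordinates,
# and paths below are invented.
from bx.intervals.intersection import Interval

exonsDB = {
    "a": Interval(100, 200),
    "b": Interval(300, 400),
    "c": Interval(150, 400),   # partially overlaps both "a" and "b"
}
events = [["a", "b"], ["c"]]
print(remove_overlaps(events, exonsDB))  # [] -- every path touches an overlapped exon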
def read_scrubbed_junction_to_tree(junction_filename):
    tree = defaultdict(lambda: IntervalTree())
    f = open(junction_filename)
    if not f.readline().startswith('track'):
        f.seek(0)  # no track header; rewind and parse from the first line
    for line in f:
        raw = line.strip().split('\t')
        if len(raw) == 4:
            chrom, left, right, strand = raw
        elif len(raw) == 6:
            chrom, left, right, _name, _count, strand = raw
        else:
            raise Exception("Expects junction BED file to have either 4 or 6 columns! Saw {0}!".format(len(raw)))
        left, right = int(left), int(right)  # already 0-based start, 0-based end
        tree[chrom, strand].add(left, right, Interval(left, right))
    return tree
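# Hypothetical usage of the BED-aware variant above: both 4- and 6-column
# rows are accepted, and a leading "track" header line is skipped. The file
# name and contents are made up.
with open("demo.junctions.bed", "w") as f:
    f.write("track name=junctions\n")
    f.write("chr1\t100\t200\tjx1\t5\t+\n")   # 6-column BED row

tree = read_scrubbed_junction_to_tree("demo.junctions.bed")
print(len(tree["chr1", "+"].find(100, 200)))  # 1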
def read_probe_bed(bed_filename, start_base=0, end_base=1):
    """
    Read a probe BED file <chrom>, <start>, <end>
    Return dict of chrom --> IntervalTree w/ data=(index, interval)
    """
    tree = {}
    gene_info = {}
    i = 0
    reader = BED.SimpleBEDReader(bed_filename, start_base, end_base)
    for r in reader:
        if r.chr not in tree:
            tree[r.chr] = IntervalTree()
        tree[r.chr].add(r.start, r.end, (i, Interval(r.start, r.end)))
        if r.name is not None:
            gene_info[i] = r.name
        i += 1
    return tree, gene_info
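# Hypothetical usage of read_probe_bed; "probes.bed" is a placeholder for a
# real BED file of probe regions. Each stored datum is an (index, Interval)
# tuple, so hits can be mapped back to gene_info by index.
tree, gene_info = read_probe_bed("probes.bed")
for chrom, iv_tree in tree.items():
    for idx, interval in iv_tree.find(0, 10**9):  # dump everything on this chrom
        name = gene_info.get(idx, "<unnamed>")
        print(chrom, interval.start, interval.end, name)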
def setUp(self):
    self.exonsDB = {}
    self.ex1 = Exon('chrX', 1000, 2000, 'ex1.1', '+')
    self.ex2 = Exon('chrX', 3000, 4000, 'ex1.1', '+')
    self.ex3 = Exon('chrX', 5000, 6000, 'ex1.1', '+')
    self.ex4 = Exon('chrX', 7000, 8000, 'ex1.1', '+')
    self.exonsDB[str(self.ex1)] = self.ex1
    self.exonsDB[str(self.ex2)] = self.ex2
    self.exonsDB[str(self.ex3)] = self.ex3
    self.exonsDB[str(self.ex4)] = self.ex4
    self.tree = IntervalTree()
    self.tree.add_interval(self.ex1)
    self.tree.add_interval(self.ex2)
    self.tree.add_interval(self.ex3)
    self.tree.add_interval(self.ex4)
    self.graph = nx.DiGraph()
def find_best_match_junction(
    tree: IntervalTree,
    donor: int,
    accep: int,
    max_diff: int = 20,
) -> Optional[Interval]:
    """donor, accep -- both should be 0-based"""
    hits = tree.find(donor, accep)
    if len(hits) == 0:
        return None
    elif len(hits) == 1:
        if hits[0].start - donor > max_diff or hits[0].end - accep > max_diff:
            return None
        return hits[0]
    else:
        # multiple hits, find the closest one
        diff = []
        for h in hits:
            if h.start - donor > max_diff or h.end - accep > max_diff:
                continue
            diff.append((abs(h.start - donor) + abs(h.end - accep), h))
        if len(diff) == 0:  # every hit was beyond max_diff; avoid IndexError below
            return None
        diff.sort(key=lambda x: x[0])
        return diff[0][1]
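# A quick sketch of find_best_match_junction, assuming bx-python's
# IntervalTree/Interval; the coordinates are invented.
from bx.intervals.intersection import Interval, IntervalTree

tree = IntervalTree()
tree.add(1000, 2000, Interval(1000, 2000))
tree.add(1005, 1995, Interval(1005, 1995))

best = find_best_match_junction(tree, 1004, 1996)
if best is not None:
    print(best.start, best.end)  # 1005 1995 -- total distance 2 beats 8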
def collapse_fuzzy_junctions(gff_filename, group_filename, allow_extra_5exon,
                             internal_fuzzy_max_dist):

    def get_fl_from_id(members):
        # ex: 13cycle_1Mag1Diff|i0HQ_SIRV_1d1m|c139597/f1p0/178
        return sum(int(_id.split('/')[1].split('p')[0][1:]) for _id in members)

    def can_merge(m, r1, r2):
        if m == 'exact':
            return True
        if not allow_extra_5exon:
            return False
        # below is continued only if (a) m is 'subset' or 'super' AND (b) allow_extra_5exon is True
        if m == 'subset':
            r1, r2 = r2, r1  # rotate so r1 is always the longer one
        if m == 'super' or m == 'subset':
            n2 = len(r2.ref_exons)
            # check that (a) r1 and r2 end on the same 3' exon, i.e. the last acceptor site agrees,
            # AND (b) the 5' start of r2 is sandwiched between the matching r1 exon coordinates
            if r1.strand == '+':
                return abs(r1.ref_exons[-1].start - r2.ref_exons[-1].start) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[-n2].start <= r2.ref_exons[0].start < r1.ref_exons[-n2].end
            else:
                return abs(r1.ref_exons[0].end - r2.ref_exons[0].end) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[n2-1].start <= r2.ref_exons[-1].end < r1.ref_exons[n2].end
        return False

    d = {}
    recs = defaultdict(lambda: {'+': IntervalTree(), '-': IntervalTree()})  # chr --> strand --> tree
    fuzzy_match = defaultdict(lambda: [])
    for r in GFF.collapseGFFReader(gff_filename):
        d[r.seqid] = r
        has_match = False
        r.segments = r.ref_exons
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            r2.segments = r2.ref_exons
            m = compare_junctions.compare_junctions(
                r, r2, internal_fuzzy_max_dist=internal_fuzzy_max_dist)
            if can_merge(m, r, r2):
                fuzzy_match[r2.seqid].append(r.seqid)
                has_match = True
                break
        if not has_match:
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    group_info = {}
    with open(group_filename) as f:
        for line in f:
            pbid, members = line.strip().split('\t')
            group_info[pbid] = [x for x in members.split(',')]

    # pick for each fuzzy group the one that has the most exons (if tie, then most FL)
    keys = list(fuzzy_match.keys())  # list() + tuple key so this also runs under Python 3
    keys.sort(key=lambda x: tuple(int(p) for p in x.split('.')[1:]))

    f_gff = open(gff_filename + '.fuzzy', 'w')
    f_group = open(group_filename + '.fuzzy', 'w')
    for k in keys:
        all_members = []
        # initialize best_size with the FL count (not the member count) so the
        # tie-break comparison below compares like with like
        best_pbid = fuzzy_match[k][0]
        best_size = get_fl_from_id(group_info[best_pbid])
        best_num_exons = len(d[best_pbid].ref_exons)
        all_members += group_info[best_pbid]
        for pbid in fuzzy_match[k][1:]:
            _size = get_fl_from_id(group_info[pbid])
            _num_exons = len(d[pbid].ref_exons)
            all_members += group_info[pbid]
            if _num_exons > best_num_exons or (_num_exons == best_num_exons and _size > best_size):
                best_pbid, best_size, best_num_exons = pbid, _size, _num_exons
        GFF.write_collapseGFF_format(f_gff, d[best_pbid])
        f_group.write("{0}\t{1}\n".format(best_pbid, ",".join(all_members)))
    f_gff.close()
    f_group.close()
    return fuzzy_match
def categorize_aln_by_annotation(gene_annotation_file, input_fasta, input_sam,
                                 output_prefix, min_overlap_bp=200,
                                 min_query_overlap=.5, min_gene_overlap=.8):
    t = defaultdict(lambda: {'+': IntervalTree(), '-': IntervalTree()})  # chr -> strand -> IntervalTree
    info = {}
    for r in DictReader(open(gene_annotation_file), delimiter='\t'):
        if r['#Replicon Name'] != 'chr':
            print("Ignore", r, file=sys.stderr)
            continue
        info[r['Locus tag']] = (int(r['Start']), int(r['Stop']), r['Locus tag'])
        t[r['Replicon Accession']][r['Strand']].add(int(r['Start']), int(r['Stop']), r['Locus tag'])

    result = defaultdict(lambda: [])  # gene -> list of rec
    d = dict((r.id, len(r.seq)) for r in SeqIO.parse(open(input_fasta), 'fasta'))
    reader = BioReaders.GMAPSAMReader(input_sam, True, query_len_dict=d)
    for r in reader:
        ans = match_w_annotation(t, r, info, min_overlap_bp, min_query_overlap, min_gene_overlap)
        # ans is AMatch(name, strand, start, end, record)
        result[ans.name].append(ans)

    novel_ct = defaultdict(lambda: {'+': ClusterTree(0, 0), '-': ClusterTree(0, 0)})
    novel_list = []
    novel_index = 0
    f = open(output_prefix + '.sam', 'w')
    f.write(reader.header)
    f1 = open(output_prefix + '.report.txt', 'w')
    f1.write("id\tread_group\tgene_name\tserial_number\tstrand\tstart\tend\n")
    for k, v in result.items():
        # v is: list of AMatch(name, strand, start, end, record)
        if k.startswith('novel-unannotated'):
            # write novel later, we are grouping them by loci first
            for x in v:
                novel_ct[x.record.sID][x.strand].insert(x.start, x.end, novel_index)
                novel_index += 1
                novel_list.append(x)
            continue
        elif k.startswith('novel-antisense'):
            tagRG = 'novel-antisense'
        elif k.startswith('novel-partial'):
            tagRG = 'novel-partial'
        elif k.startswith('poly-'):
            tagRG = 'poly'
        else:
            tagRG = 'single'
        v.sort(key=lambda x: (x.start, x.end),
               reverse=True if v[0].strand == '-' else False)  # sort by start, then end
        for i, x in enumerate(v):
            f.write("{0}\tSN:Z:{1:06d}\tRG:Z:{2}\tgn:Z:{3}\n".format(
                x.record.record_line, i + 1, tagRG, k))
            if x.strand == '+':
                f1.write("{0}\t{1}\t{2}\t{3:06d}\t{4}\t{5}\t{6}\n".format(
                    x.record.qID, tagRG, k, i + 1, x.strand, x.start + 1, x.end))
            else:  # - strand, start is end, end is start
                f1.write("{0}\t{1}\t{2}\t{3:06d}\t{4}\t{5}\t{6}\n".format(
                    x.record.qID, tagRG, k, i + 1, x.strand, x.end, x.start + 1))

    # now write the novel stuff, grouped by regions
    novel_region_index = 1
    for d1 in novel_ct.values():
        for ct in d1.values():
            gn = 'novel-' + str(novel_region_index)
            for _start, _end, _indices in ct.getregions():
                v = [novel_list[ind] for ind in _indices]
                v.sort(key=lambda x: (x.start, x.end),
                       reverse=True if v[0].strand == '-' else False)  # sort by start, then end
                for i, x in enumerate(v):
                    f.write("{0}\tSN:Z:{1:06d}\tRG:Z:{2}\tgn:Z:{3}\n".format(
                        x.record.record_line, i + 1, "novel-unannotated", gn))
                    if x.strand == '+':
                        f1.write("{0}\t{1}\t{2}\t{3:06d}\t{4}\t{5}\t{6}\n".format(
                            x.record.qID, "novel-unannotated", gn, i + 1, x.strand, x.start + 1, x.end))
                    else:
                        f1.write("{0}\t{1}\t{2}\t{3:06d}\t{4}\t{5}\t{6}\n".format(
                            x.record.qID, "novel-unannotated", gn, i + 1, x.strand, x.end, x.start + 1))
            novel_region_index += 1
    f.close()
    f1.close()
    print("Output written to:", f.name, file=sys.stderr)
    print("Output written to:", f1.name, file=sys.stderr)
def categorize_aln_by_annotation(
    gene_annotation_file: str,
    input_fasta: str,
    input_sam: str,
    output_prefix: str,
    min_overlap_bp: int = 200,
    min_query_overlap: float = 0.5,
    min_gene_overlap: float = 0.8,
) -> None:
    t = defaultdict(
        lambda: {"+": IntervalTree(), "-": IntervalTree()}
    )  # chr -> strand -> IntervalTree
    info = {}
    for r in DictReader(open(gene_annotation_file), delimiter="\t"):
        if r["#Replicon Name"] != "chr":
            logger.info(f"Ignore {r}")
            continue
        info[r["Locus tag"]] = (int(r["Start"]), int(r["Stop"]), r["Locus tag"])
        t[r["Replicon Accession"]][r["Strand"]].add(
            int(r["Start"]), int(r["Stop"]), r["Locus tag"]
        )

    result = defaultdict(lambda: [])  # gene -> list of rec
    d = {r.id: len(r.seq) for r in SeqIO.parse(open(input_fasta), "fasta")}
    reader = BioReaders.GMAPSAMReader(input_sam, True, query_len_dict=d)
    for r in reader:
        ans = match_w_annotation(
            t, r, info, min_overlap_bp, min_query_overlap, min_gene_overlap
        )
        # ans is AMatch(name, strand, start, end, record)
        result[ans.name].append(ans)

    novel_ct = defaultdict(lambda: {"+": ClusterTree(0, 0), "-": ClusterTree(0, 0)})
    novel_list = []
    novel_index = 0
    with open(f"{output_prefix}.sam", "w") as f, open(
        f"{output_prefix}.report.txt", "w"
    ) as f1:
        f.write(reader.header)
        f1.write("id\tread_group\tgene_name\tserial_number\tstrand\tstart\tend\n")
        for k, v in result.items():
            # v is: list of AMatch(name, strand, start, end, record)
            if k.startswith("novel-unannotated"):
                # write novel later, we are grouping them by loci first
                for x in v:
                    novel_ct[x.record.sID][x.strand].insert(x.start, x.end, novel_index)
                    novel_index += 1
                    novel_list.append(x)
                continue
            elif k.startswith("novel-antisense"):
                tagRG = "novel-antisense"
            elif k.startswith("novel-partial"):
                tagRG = "novel-partial"
            elif k.startswith("poly-"):
                tagRG = "poly"
            else:
                tagRG = "single"
            v.sort(
                key=lambda x: (x.start, x.end),
                reverse=bool(v[0].strand == "-"),
            )  # sort by start, then end
            for i, x in enumerate(v):
                f.write(f"{x.record.record_line}\tSN:Z:{i + 1:06d}\tRG:Z:{tagRG}\tgn:Z:{k}\n")
                if x.strand == "+":
                    f1.write(
                        f"{x.record.qID}\t{tagRG}\t{k}\t{i + 1:06d}\t{x.strand}\t{x.start + 1}\t{x.end}\n"
                    )
                else:  # - strand, start is end, end is start
                    f1.write(
                        f"{x.record.qID}\t{tagRG}\t{k}\t{i + 1:06d}\t{x.strand}\t{x.end}\t{x.start + 1}\n"
                    )

        # now write the novel stuff, grouped by regions
        novel_region_index = 1
        for d1 in novel_ct.values():
            for ct in d1.values():
                gn = f"novel-{novel_region_index}"
                for *_, _indices in ct.getregions():
                    v = [novel_list[ind] for ind in _indices]
                    v.sort(
                        key=lambda x: (x.start, x.end),
                        reverse=bool(v[0].strand == "-"),
                    )  # sort by start, then end
                    for i, x in enumerate(v):
                        f.write(
                            f"{x.record.record_line}\tSN:Z:{i + 1:06d}\tRG:Z:novel-unannotated\tgn:Z:{gn}\n"
                        )
                        if x.strand == "+":
                            f1.write(
                                f"{x.record.qID}\tnovel-unannotated\t{gn}\t{i + 1:06d}\t{x.strand}\t{x.start + 1}\t{x.end}\n"
                            )
                        else:
                            f1.write(
                                f"{x.record.qID}\tnovel-unannotated\t{gn}\t{i + 1:06d}\t{x.strand}\t{x.end}\t{x.start + 1}\n"
                            )
                novel_region_index += 1
    logger.info(f"Output written to: {f.name}")
    logger.info(f"Output written to: {f1.name}")
def collapse_fuzzy_junctions(
    gff_filename: Union[str, Path],
    group_filename: Union[str, Path],
    allow_extra_5exon: bool,
    internal_fuzzy_max_dist: int,
    max_5_diff: int,
    max_3_diff: int,
) -> defaultdict:
    def can_merge(m, r1, r2):
        if m == "exact":
            return True
        if not allow_extra_5exon:
            return False
        # below is continued only if (a) m is 'subset' or 'super' AND (b) allow_extra_5exon is True
        if m == "subset":
            r1, r2 = r2, r1  # rotate so r1 is always the longer one
        if m == "super" or m == "subset":
            n2 = len(r2.ref_exons)
            # check that (a) r1 and r2 end on the same 3' exon, i.e. the last acceptor site agrees,
            # AND (b) the 5' start of r2 is sandwiched between the matching r1 exon coordinates
            if r1.strand == "+":
                return (
                    abs(r1.ref_exons[-1].start - r2.ref_exons[-1].start) <= internal_fuzzy_max_dist
                    and r1.ref_exons[-n2].start <= r2.ref_exons[0].start < r1.ref_exons[-n2].end
                )
            else:
                return (
                    abs(r1.ref_exons[0].end - r2.ref_exons[0].end) <= internal_fuzzy_max_dist
                    and r1.ref_exons[n2 - 1].start <= r2.ref_exons[-1].end < r1.ref_exons[n2].end
                )
        return False

    d = {}
    recs = defaultdict(lambda: {"+": IntervalTree(), "-": IntervalTree()})  # chr --> strand --> tree
    fuzzy_match = defaultdict(lambda: [])
    for r in GFF.collapseGFFReader(gff_filename):
        d[r.seqid] = r
        has_match = False
        r.segments = r.ref_exons
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            r2.segments = r2.ref_exons
            m = compare_junctions(
                r,
                r2,
                internal_fuzzy_max_dist=internal_fuzzy_max_dist,
                max_5_diff=max_5_diff,
                max_3_diff=max_3_diff,
            )
            if can_merge(m, r, r2):
                fuzzy_match[r2.seqid].append(r.seqid)
                has_match = True
                break
        if not has_match:
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    group_info = {}
    with open(group_filename) as f:
        for line in f:
            pbid, members = line.strip().split("\t")
            group_info[pbid] = members.split(",")

    # pick for each fuzzy group the one that has the most exons
    keys = list(fuzzy_match.keys())
    keys.sort(key=lambda x: int(x.split(".")[1]))

    with open(f"{gff_filename}.fuzzy", "w") as f_gff, open(f"{group_filename}.fuzzy", "w") as f_group:
        for k in keys:
            all_members = []
            best_pbid, best_size, best_num_exons = (
                fuzzy_match[k][0],
                len(group_info[fuzzy_match[k][0]]),
                len(d[fuzzy_match[k][0]].ref_exons),
            )
            all_members += group_info[fuzzy_match[k][0]]
            for pbid in fuzzy_match[k][1:]:
                _num_exons = len(d[pbid].ref_exons)
                _size = len(group_info[pbid])
                all_members += group_info[pbid]
                if _num_exons > best_num_exons or (_num_exons == best_num_exons and _size > best_size):
                    best_pbid, best_size, best_num_exons = pbid, _size, _num_exons
            GFF.write_collapseGFF_format(f_gff, d[best_pbid])
            f_group.write(f'{best_pbid}\t{",".join(all_members)}\n')
    return fuzzy_match
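# A hypothetical invocation of collapse_fuzzy_junctions; the file names and
# thresholds below are placeholders, not values taken from the source.
fuzzy = collapse_fuzzy_junctions(
    "collapsed.gff",
    "collapsed.group.txt",
    allow_extra_5exon=True,
    internal_fuzzy_max_dist=5,
    max_5_diff=1000,
    max_3_diff=100,
)
# side effects: writes collapsed.gff.fuzzy and collapsed.group.txt.fuzzy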
def main_maize(ki11_snps=None, dirs=None):
    if ki11_snps is None:
        ki11_snps = defaultdict(lambda: {})  # chrom -> pos -> VCF record
    debug_count = 0
    # vcfpy.Reader is constructed from a path via from_path(), not the bare constructor
    for r in vcfpy.Reader.from_path("B73Ki11.q20.vcf"):
        ki11_snps[r.CHROM][r.POS] = r
        # if debug_count > 100000: break
        debug_count += 1
    logger.info("Finished reading B73Ki11.q20.vcf.")

    ki11_shortread_cov = defaultdict(lambda: {})  # chrom -> pos -> short read cov
    # read the raw Ki11 pileup to get coverage in places where no SNPs were called
    for r in MPileUpReader("Ki11.raw.mpileup"):
        if r is not None:
            ki11_shortread_cov[r.chr][r.pos] = r.cov
    logger.info("Finished reading Ki11.raw.mpileup.")

    repeat_by_chrom = {}
    # read the Tandem Repeat Finder summary
    for r in DictReader(open("B73_RefV4.fa.repeat_list.txt"), delimiter="\t"):
        if r["chrom"] not in repeat_by_chrom:
            repeat_by_chrom[r["chrom"]] = IntervalTree()
        repeat_by_chrom[r["chrom"]].add(int(r["start0"]), int(r["end1"]))
    logger.info("Finished reading B73_RefV4.fa.repeat_list.txt.")

    FIELDS = [
        "dir", "chrom", "pos", "ref", "alt_Short", "alt_PB",
        "in_Short", "in_PB", "cov_Short", "cov_PB", "genomic_HP",
    ]
    with open("evaled.isophase_SNP.txt", "w") as out_f:
        writer_f = DictWriter(out_f, FIELDS, delimiter="\t")
        writer_f.writeheader()

        debug_count = 0
        if dirs is None:
            dirs = glob.glob("by_loci/*size*/")
        for d1 in dirs:
            # if debug_count > 100: break
            debug_count += 1
            mpileup = Path(d1, "ccs.mpileup")
            mapfile = Path(d1, "fake.mapping.txt")
            vcffile = Path(d1, "phased.partial.vcf")
            nosnp = Path(d1, "phased.partial.NO_SNPS_FOUND")
            if not vcffile.exists():
                assert nosnp.exists()
                logger.info(f"Skipping {d1} because no SNPs found.")
            else:
                logger.info(f"Evaluating {d1}.")
                # use a lower min cov here because of a few close cases where BQ filtering lowered cov
                good_positions, cov_at_pos = get_positions_to_recover(
                    mapfile, mpileup, ki11_snps, min_cov=30
                )
                name = d1.split("/")[1]
                eval_isophase(
                    vcffile,
                    ki11_snps,
                    good_positions,
                    cov_at_pos,
                    repeat_by_chrom,
                    ki11_shortread_cov,
                    writer_f,
                    name,
                )
    return ki11_snps