def make_hits_union(ids, d1, d2):
    '''Combine the per-id hit lists of d1 and d2 into one dictionary.

    For every id in ids, a new list is built holding the entries of d1[id]
    followed by those of d2[id] (whichever of them exist), and that list is
    then passed to genome_intervals.merge_overlapping_in_list().

    Returns a dict with one key per element of ids. An id found in neither
    d1 nor d2 maps to a list that started out empty.'''
    union = {}
    for key in ids:
        combined = []
        union[key] = combined
        for hits in (d1, d2):
            if key in hits:
                combined.extend(hits[key])
        # The return value is not used: the list stored in union is the one
        # handed to the merge routine.
        genome_intervals.merge_overlapping_in_list(combined)
    return union
def test_merge_overlapping_in_list(self):
    '''merge_overlapping_in_list() merges correctly

    The input is deliberately unsorted and contains a duplicate interval,
    intervals sharing an end point (10-20 / 20-30) and a chain of overlaps
    (20-30 / 29-50). The list is modified by the call and then compared
    against the expected merged result.'''
    unmerged_coords = [(1, 2), (51, 60), (10, 20), (20, 30), (20, 30), (29, 50), (65, 70)]
    expected_coords = [(1, 2), (10, 60), (65, 70)]

    intervals = [genome_intervals.Interval(start, end) for start, end in unmerged_coords]
    expected = [genome_intervals.Interval(start, end) for start, end in expected_coords]

    genome_intervals.merge_overlapping_in_list(intervals)
    self.assertSequenceEqual(intervals, expected)
def get_nucmer_hits(coords_file):
    '''Collect the reference and query intervals of every hit in coords_file.

    coords_file is handed to nucmer.file_reader(), and each hit it yields
    contributes one interval to its reference sequence and one to its query
    sequence. Once all hits are read, each per-sequence list is passed to
    genome_intervals.merge_overlapping_in_list().

    Returns a tuple (ref_hits, qry_hits): two dicts mapping sequence name to
    a list of genome_intervals.Interval objects.'''
    ref_hits = {}
    qry_hits = {}

    for hit in nucmer.file_reader(coords_file):
        # nucmer hits are 1-based. Inside the script, use 0-based.
        # sorted() puts the smaller coordinate first, whichever way round
        # the hit reports them.
        ref_lo, ref_hi = sorted([hit.ref_start - 1, hit.ref_end - 1])
        ref_hits.setdefault(hit.ref_name, []).append(genome_intervals.Interval(ref_lo, ref_hi))

        qry_lo, qry_hi = sorted([hit.qry_start - 1, hit.qry_end - 1])
        qry_hits.setdefault(hit.qry_name, []).append(genome_intervals.Interval(qry_lo, qry_hi))

    # Reference lists are merged first, then query lists.
    for hits_by_name in (ref_hits, qry_hits):
        for intervals in hits_by_name.values():
            genome_intervals.merge_overlapping_in_list(intervals)

    return ref_hits, qry_hits
def file2regions(fname):
    '''Read a file of regions into a dict of sequence name -> interval list.

    Lines starting with '#' and blank lines are skipped. Every other line
    must hold exactly three whitespace-separated fields: name, start, end;
    a line with a different number of fields raises ValueError. The start
    and end fields are passed to genome_intervals.Interval() as the strings
    read from the file, so any numeric conversion is left to Interval
    (NOTE(review): confirm Interval converts them).

    The input handle is closed even if reading or parsing fails.

    Returns a dict mapping each name to its list of Interval objects, in
    the order they appear in the file.'''
    regions = {}
    f = utils.open_file_read(fname)
    try:
        for line in f:
            if line.startswith('#'):
                continue

            fields = line.split()
            if not fields:
                # Blank or whitespace-only line: nothing to unpack.
                continue

            (chrom, start, end) = fields
            regions.setdefault(chrom, []).append(genome_intervals.Interval(start, end))
    finally:
        utils.close(f)

    return regions


# Merge the overlapping regions of each sequence and write them out as
# tab-separated name, start, end lines, ordered by sequence name.
regions = file2regions(options.infile)
f = utils.open_file_write(options.outfile)
try:
    for chrom, intervals in sorted(regions.items()):
        genome_intervals.merge_overlapping_in_list(intervals)
        for region in intervals:
            print(chrom, region.start, region.end, sep='\t', file=f)
finally:
    # Close the output handle even when merging or writing raises.
    utils.close(f)