def test_fill_subset(self):
    """
    Check `wiggelen.fill` when the fill region covers only part of the data.

    The input has defined positions 1, 3, 5, 6, 8 and 10 on region 'a' and
    filling is requested for (3, 8) only. The entries at 1 and 10 are
    expected to come through unchanged, while the middle section must equal
    whatever the `filled` helper yields for 3..8 with 4 and 7 listed
    (presumably as the undefined positions -- helper is defined elsewhere).
    """
    source = sparse('a', [1, 3, 5, 6, 8, 10])

    # Leading entry, filled middle section, trailing entry.
    expected = [('a', 1, 1)]
    expected.extend(filled('a', 3, 8, [4, 7]))
    expected.append(('a', 10, 10))

    observed = list(wiggelen.fill(source, regions={'a': (3, 8)}))
    assert_equal(observed, expected)
def test_fill_open(self):
    """
    Check `wiggelen.fill` when no regions are given.

    With defined positions 3, 5, 6 and 8 on region 'a', the output must
    equal what the `filled` helper yields for 3..8 with 4 and 7 listed
    (presumably as the undefined positions -- helper is defined elsewhere).
    """
    source = sparse('a', [3, 5, 6, 8])
    expected = list(filled('a', 3, 8, [4, 7]))

    observed = list(wiggelen.fill(source))
    assert_equal(observed, expected)
def test_fill_closed(self):
    """
    Check `wiggelen.fill` with an explicit start and stop for the region.

    The input only defines positions 3, 5, 6 and 8 on region 'a', but the
    requested region is (1, 10), so the expected output spans 1..10 as
    produced by the `filled` helper with 1, 2, 4, 7, 9 and 10 listed
    (presumably as the undefined positions -- helper is defined elsewhere).
    """
    source = sparse('a', [3, 5, 6, 8])
    expected = list(filled('a', 1, 10, [1, 2, 4, 7, 9, 10]))

    bounds = {'a': (1, 10)}
    observed = list(wiggelen.fill(source, regions=bounds))
    assert_equal(observed, expected)
def find_max(regions, wig):
    """
    Find the maximum value and recompute the score sum for each region.

    The wig data is walked once, position by position, via
    ``fill(walk(wig))``. Regions are assumed to be ordered and
    non-overlapping, and each region is assumed to end on a position that
    is present in the walked data.

    The score sum is recalculated here rather than trusted from the input
    tuples, because the incoming sum was produced while merging regions.

    NOTE(review): the region/chromosome name yielded by the walker is
    ignored, so positions are matched on coordinate alone -- confirm this
    is only used on single-chromosome data.

    Args:
        regions: sequence of ``(start, end, score_sum)`` tuples. It is not
            modified.
        wig: open wig file (or line iterable) accepted by ``walk``.

    Returns:
        A list of ``(start, end, new_score_sum, max_value, max_position)``
        tuples, one per region that was completed while walking. On ties
        the last position holding the maximum is reported. An empty
        ``regions`` yields an empty list.
    """
    # Iterate instead of popping from the front: this leaves the caller's
    # list intact and avoids O(n) shifts on every pop(0).
    pending = iter(regions)
    current = next(pending, None)
    if current is None:
        return []
    start, end, _ = current

    regions_with_max = []
    # None (rather than a numeric sentinel such as -1) marks "nothing seen
    # yet", so regions whose maximum is <= -1 are still reported.
    max_value = None
    max_position = None
    new_score_sum = 0

    for _chrom, position, value in fill(walk(wig)):
        # fill() yields None for undefined positions; they add nothing to
        # the sum and cannot be the maximum.
        if value is not None and start <= position <= end:
            new_score_sum += value
            if max_value is None or value >= max_value:
                max_value = value
                max_position = position

        if position >= end and max_position is not None:
            # Reached the end of the region and a maximum was recorded.
            regions_with_max.append(
                (start, end, new_score_sum, max_value, max_position))

            current = next(pending, None)
            if current is None:
                # No more regions to process.
                break
            start, end, _ = current
            max_value = None
            max_position = None
            new_score_sum = 0

    return regions_with_max
def test_fill_only_edges(self):
    """
    Check `wiggelen.fill` with ``only_edges=True``.

    The input defines positions 3, 5, 6 and 14 on region 'a'. The expected
    output contains those, the single-position gap at 4, and for the gap
    between 6 and 14 only its two edge positions 7 and 13, each with a
    value of None.
    """
    source = sparse('a', [3, 5, 6, 14])

    # (position, value) pairs, all on region 'a'.
    pairs = [
        (3, 3),
        (4, None),
        (5, 5),
        (6, 6),
        (7, None),
        (13, None),
        (14, 14),
    ]
    expected = [('a', position, value) for position, value in pairs]

    observed = list(wiggelen.fill(source, only_edges=True))
    assert_equal(observed, expected)
def read_wig(filename, norm=False):
    """
    Read a wig file into a ``{position: value}`` dictionary.

    Every position yielded by ``fill(walk(...))`` becomes a key; positions
    that ``fill`` reports as undefined map to ``None``. The region name is
    discarded, so if the file holds several regions, a later region
    overwrites earlier entries at the same position.

    Args:
        filename: path of the wig file to read.
        norm: when true, divide every defined value by the median of all
            defined values.

    Returns:
        The ``{position: value}`` dictionary, normalized if ``norm`` is
        true. Undefined positions stay ``None`` in both cases.
    """
    # The comprehension consumes the walker completely inside the ``with``
    # block, so the handle can be closed right after.
    with open(filename) as handle:
        wig_dict = {
            position: value
            for _region, position, value in fill(walk(handle))
        }

    if not norm:
        return wig_dict

    # Parenthesized form prints the same text on Python 2 and Python 3.
    print("Normalizing...")

    # Undefined (None) positions cannot take part in the median or be
    # divided; a real list is passed because a Python 3 dict view is not a
    # sequence numpy can convert element-wise.
    defined_values = [value for value in wig_dict.values()
                      if value is not None]
    wig_median = float(np.median(defined_values))

    return {
        position: (None if value is None else value / wig_median)
        for position, value in wig_dict.items()
    }
def main(infile, threshold, merge_dist, min_width, strand, outfile):
    """
    Call regions from a wig file and write them out via ``write_bed``.

    Steps:
      1. Walk every position of ``infile`` and collect runs of consecutive
         positions whose value is at least ``threshold``. A run ends at the
         first undefined or below-threshold position.
      2. Drop runs spanning fewer than ``min_width`` positions.
      3. Merge the remaining runs with ``merge_regions`` and ``merge_dist``.
      4. Re-read ``infile`` with ``find_max`` to get each merged region's
         maximum and recomputed score sum.
      5. Hand the result to ``write_bed`` with ``strand`` and ``outfile``.

    Args:
        infile: path of the wig file.
        threshold: minimum value for a position to belong to a region.
        merge_dist: passed through to ``merge_regions``.
        min_width: minimum region width, counted inclusively
            (``end - start + 1``).
        strand: passed through to ``write_bed``.
        outfile: passed through to ``write_bed``.
    """
    # List of (start, end, total_exp) tuples.
    called_regions = []
    start = None
    end = None
    total_exp = 0

    # fill() steps through every position and yields None as the value for
    # positions missing from the original wig file.
    print("Calling preliminary regions...")
    with open(infile) as wig:
        for _region, position, value in fill(walk(wig)):
            if start is None:
                # Tentatively start a new region at the current position.
                start = position

            # The None check comes first so that None is never compared
            # against the threshold.
            if value is None or value < threshold:
                if total_exp > 0:
                    # A region is in progress: end it.
                    called_regions.append((start, end, total_exp))
                # Reset so the next position becomes a candidate start.
                start = None
                end = None
                total_exp = 0
            elif value >= threshold:
                # Value reaches the threshold: extend the region.
                total_exp += value
                end = position

    if total_exp != 0:
        # Input ended while a region was still open.
        called_regions.append((start, end, total_exp))

    print("Filtering out regions smaller than minimum width...")
    filtered_regions = [
        (region_start, region_end, region_exp)
        for region_start, region_end, region_exp in called_regions
        if region_end - region_start + 1 >= min_width
    ]

    print("Merging regions...")
    merged_regions = merge_regions(filtered_regions, merge_dist)

    # Walk the wig file a second time to find each region's maximum and to
    # redo the score sum.
    print("Finding region max and calculating total score...")
    if merged_regions:
        with open(infile) as wig:
            regions_with_max = find_max(merged_regions, wig)
    else:
        # Nothing was called; skip the second pass so find_max is never
        # asked to process an empty region list.
        regions_with_max = []

    write_bed(regions_with_max, strand, outfile)
def test_fill_regions(self):
    """
    Check `wiggelen.fill` when fill bounds are given for only some regions.

    Regions 'a', 'b' and 'c' are walked back to back. No bounds are given
    for 'a', and its expected output is its sparse input unchanged. 'b' is
    filled over (1, 10), which is wider than its data, and 'c' over (3, 8),
    which is narrower, so its entries at 1 and 10 are expected unchanged.
    """
    source = chain(
        sparse('a', [3, 5, 6, 8]),
        sparse('b', [3, 5, 6, 8]),
        sparse('c', [1, 3, 5, 6, 8, 10]),
    )

    expected_a = list(sparse('a', [3, 5, 6, 8]))
    expected_b = list(filled('b', 1, 10, [1, 2, 4, 7, 9, 10]))
    expected_c = [('c', 1, 1)]
    expected_c.extend(filled('c', 3, 8, [4, 7]))
    expected_c.append(('c', 10, 10))
    expected = expected_a + expected_b + expected_c

    bounds = {'b': (1, 10), 'c': (3, 8)}
    observed = list(wiggelen.fill(source, regions=bounds))
    assert_equal(observed, expected)
def test_fill_regions(self):
    """
    Check `wiggelen.fill` across several regions with per-region bounds.

    The walker concatenates regions 'a', 'b' and 'c'. Bounds are supplied
    for 'b' as (1, 10) and for 'c' as (3, 8); 'a' has none and its
    expected output is the same sparse data that went in.
    """
    region_a = sparse('a', [3, 5, 6, 8])
    region_b = sparse('b', [3, 5, 6, 8])
    region_c = sparse('c', [1, 3, 5, 6, 8, 10])
    source = chain(region_a, region_b, region_c)

    # Build the expectation region by region, in walking order.
    expected = []
    expected.extend(sparse('a', [3, 5, 6, 8]))
    expected.extend(filled('b', 1, 10, [1, 2, 4, 7, 9, 10]))
    expected.append(('c', 1, 1))
    expected.extend(filled('c', 3, 8, [4, 7]))
    expected.append(('c', 10, 10))

    observed = list(
        wiggelen.fill(source, regions={'b': (1, 10), 'c': (3, 8)}))
    assert_equal(observed, expected)
return wig_sum if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('bed', help='bed file') parser.add_argument('plus_wig', help='wig file for plus strand') parser.add_argument('minus_wig', help='wig file for minus strand') parser.add_argument('output_name', help='name of output file') args = parser.parse_args() plus_wig_dict = { position: value for region, position, value in fill(walk(open(args.plus_wig))) } minus_wig_dict = { position: value for region, position, value in fill(walk(open(args.minus_wig))) } outfile = open(args.output_name, 'w') with open(args.bed) as infile: for line in infile: fields = line.strip().split() chrom = fields[0] start = int(fields[1]) end = int(fields[2]) name = fields[3]