def turnSegmentGWRIntoRBDict(gwr, extend_dist=20000, min_reciprocal_overlap=0.6, report=True):
    """
    2010-3-17
        Turn a segment-style (start-stop) gwr into an RBDict keyed by segment.

        For every data_obj in gwr.data_obj_ls a CNVSegmentBinarySearchTreeKey is
        built from data_obj.chromosome and the span
        [data_obj.position - extend_dist, data_obj.stop_position + extend_dist]
        (the start is clamped at 0), and data_obj is stored under that key.

        Arguments:
            gwr: object exposing data_obj_ls; each element must provide
                chromosome, position and stop_position.
            extend_dist: distance by which each segment is enlarged on both sides.
            min_reciprocal_overlap: passed unchanged to every
                CNVSegmentBinarySearchTreeKey.
            report: if true, print depth statistics of the resulting tree to stdout.

        Returns the populated RBDict.

        NOTE(review): keys that compare equal under leftWithinRightAlsoEqualCmp
        presumably replace one another on insertion -- confirm against RBDict.
    """
    # Imported locally so the function does not rely on module-level imports;
    # in the visible part of this file sys/math are only imported under the
    # __main__ guard.
    import math
    import sys

    sys.stderr.write("Turning a segment-gwr (start-stop style) into an RBDict ...")
    from RBTree import RBDict  # 2010-1-26 RBDict is more efficient than binary_tree.

    rbDict = RBDict(cmpfn=leftWithinRightAlsoEqualCmp)
    for data_obj in gwr.data_obj_ls:
        # enlarge the segment on both sides, never going below coordinate 0
        start = max(data_obj.position - extend_dist, 0)
        stop = data_obj.stop_position + extend_dist
        segmentKey = CNVSegmentBinarySearchTreeKey(
            chromosome=data_obj.chromosome,
            span_ls=[start, stop],
            min_reciprocal_overlap=min_reciprocal_overlap)
        rbDict[segmentKey] = data_obj

    if report:
        depth = rbDict.depth()
        optimum_depth = rbDict.optimumdepth()
        optimum_depth_ceil = math.ceil(optimum_depth)
        # The format string labels this value as a percentage, so scale the
        # ratio by 100; also avoid ZeroDivisionError should depth() report 0.
        if depth:
            depth_efficiency = 100.0 * optimum_depth_ceil / depth
        else:
            depth_efficiency = 0.0
        # A parenthesised single-expression print produces the same output as
        # the Python 2 print statement and is also valid as a Python 3 call.
        print("\tDepth of rbDict: %d" % (depth))
        print("\tOptimum Depth: %f (%d) (%f%% depth efficiency)" %
              (optimum_depth, optimum_depth_ceil, depth_efficiency))
    sys.stderr.write("%s objects converted.\n" % len(rbDict))
    return rbDict
# test program if this file is run if __name__ == "__main__": import os, sys, math #import pdb #pdb.set_trace() cnv_ls = [[1, (2323,2600)], [2,(50000,)], [3,(43214,78788)], [5,(150,500)], [5,(500,950)], [5, (43241, 43242)]] no_of_cnvs = len(cnv_ls) min_reciprocal_overlap = 0.6 #from BinarySearchTree import binary_tree #tree = binary_tree() from RBTree import RBDict #2010-1-26 binary_tree and RBDict are swappable. but RBDict is more efficient (balanced). tree = RBDict(cmpfn=leftWithinRightAlsoEqualCmp) # 2010-1-28 use the custom cmpfn if you want the case that left within right is regarded as equal as well. for cnv in cnv_ls: segmentKey = CNVSegmentBinarySearchTreeKey(chromosome=cnv[0], span_ls=cnv[1], min_reciprocal_overlap=min_reciprocal_overlap) tree[segmentKey] = cnv print "Binary Tree Test\n" print "Node Count: %d" % len(tree) print "Depth: %d" % tree.depth() print "Optimum Depth: %f (%d) (%f%% depth efficiency)" % (tree.optimumdepth(), math.ceil(tree.optimumdepth()), math.ceil(tree.optimumdepth()) / tree.depth()) print "Efficiency: %f%% (total possible used: %d, total wasted: %d): " % (tree.efficiency() * 100, len(tree) / tree.efficiency(), (len(tree) / tree.efficiency()) - len(tree)) """