def join(leftSet, rightSet, mincols=1, leftfill=True, rightfill=True):
    """Yield joined rows for left intervals that overlap right intervals.

    Every ``GenomicInterval`` read from ``rightSet`` is inserted into an
    ``IntervalTree``; other items from ``rightSet`` are ignored.  ``leftSet``
    is then streamed: items that are not ``GenomicInterval`` instances are
    yielded unchanged, and every interval is matched against the tree.

    Args:
        leftSet: iterable of left-hand items.
        rightSet: iterable of right-hand items; its ``linenum`` attribute is
            read when each interval is inserted into the tree.
        mincols: minimum overlap width; a pair whose computed overlap is
            smaller than this is not joined.
        leftfill: when true, right intervals that were never joined are
            emitted at the end, preceded by one "." per left column.
        rightfill: when true, a left interval with no qualifying partner is
            emitted followed by one "." per right column.

    Yields:
        Non-interval items from ``leftSet`` as-is, and lists of fields for
        joined or padded rows.
    """
    # Read rightSet into memory:
    rightlen = 0
    leftlen = 0
    rightTree = IntervalTree()
    for item in rightSet:
        if isinstance(item, GenomicInterval):
            rightTree.insert(item, rightSet.linenum, item.fields)
            if rightlen == 0:
                rightlen = item.nfields

    for interval in leftSet:
        if not isinstance(interval, GenomicInterval):
            yield interval
            continue
        if leftlen == 0:
            leftlen = interval.nfields

        result = []
        rightTree.intersect(interval, result.append)
        overlap_not_met = 0
        for item in result:
            # Chained comparisons replace `x in range(start, end + 1)`, which
            # rebuilt the range (a full list on Python 2) for every test.
            start_inside = interval.start <= item.start <= interval.end
            end_inside = interval.start <= item.end <= interval.end
            if start_inside and end_inside:
                overlap = item.end - item.start
            elif start_inside:
                overlap = interval.end - item.start
            elif end_inside:
                overlap = item.end - interval.start
            else:
                # the intersecting item's start and end are outside the interval range
                overlap = interval.end - interval.start
            if overlap < mincols:
                overlap_not_met += 1
                continue
            outfields = list(interval)
            # extend() instead of map(outfields.append, ...): map is lazy on
            # Python 3, so the right-hand fields were silently dropped.
            outfields.extend(item.other)
            # Mark the tree node so the leftfill pass below skips it.
            setattr(item, "visited", True)
            yield outfields
        # An empty result list also satisfies this equality (0 == 0).
        if overlap_not_met == len(result) and rightfill:
            yield list(interval) + ["."] * rightlen

    if leftfill:
        def report_unvisited(node, results):
            if not hasattr(node, "visited"):
                results.append(node)

        results = []
        rightTree.traverse(lambda x: report_unvisited(x, results))
        for item in results:
            yield ["."] * leftlen + list(item.other)
def join(leftSet, rightSet, mincols=1, leftfill=True, rightfill=True):
    """Join the intervals of ``leftSet`` against those of ``rightSet``.

    ``rightSet`` is consumed first: each ``GenomicInterval`` in it is stored
    in an ``IntervalTree`` (anything else is skipped).  ``leftSet`` is then
    processed lazily.  Items that are not ``GenomicInterval`` instances pass
    straight through; for an interval, every tree node reported by
    ``intersect`` is a candidate partner.

    Args:
        leftSet: iterable supplying the left-hand items.
        rightSet: iterable supplying the right-hand items; ``rightSet.linenum``
            is passed to the tree with each inserted interval.
        mincols: smallest overlap width that still counts as a match.
        leftfill: emit unmatched right intervals at the end, with "." in
            place of each left column.
        rightfill: emit left intervals that found no qualifying partner,
            with "." in place of each right column.

    Yields:
        Pass-through items, or a list of fields per joined / padded row.
    """
    # Read rightSet into memory:
    rightlen = 0
    leftlen = 0
    rightTree = IntervalTree()
    for item in rightSet:
        if isinstance(item, GenomicInterval):
            rightTree.insert(item, rightSet.linenum, item.fields)
            if rightlen == 0:
                rightlen = item.nfields

    for interval in leftSet:
        if leftlen == 0 and isinstance(interval, GenomicInterval):
            leftlen = interval.nfields
        if not isinstance(interval, GenomicInterval):
            yield interval
        else:
            result = []
            rightTree.intersect(interval, lambda node: result.append(node))
            overlap_not_met = 0
            lo, hi = interval.start, interval.end
            for item in result:
                # Plain comparisons instead of membership in
                # range(lo, hi + 1), which was rebuilt up to six times per
                # candidate (and is a materialised list on Python 2).
                starts_within = lo <= item.start <= hi
                ends_within = lo <= item.end <= hi
                if starts_within and not ends_within:
                    overlap = hi - item.start
                elif ends_within and not starts_within:
                    overlap = item.end - lo
                elif starts_within and ends_within:
                    overlap = item.end - item.start
                else:
                    # the intersecting item's start and end are outside the interval range
                    overlap = hi - lo
                if overlap < mincols:
                    overlap_not_met += 1
                    continue
                # map(outfields.append, item.other) relied on Python 2's eager
                # map; on Python 3 it is never consumed and appends nothing.
                outfields = list(interval) + list(item.other)
                # Flag the node as joined so leftfill does not report it.
                setattr(item, "visited", True)
                yield outfields
            if (len(result) == 0 or overlap_not_met == len(result)) and rightfill:
                outfields = list(interval)
                outfields.extend("." for _ in range(rightlen))
                yield outfields

    if leftfill:
        def report_unvisited(node, results):
            if not hasattr(node, "visited"):
                results.append(node)

        results = []
        rightTree.traverse(lambda x: report_unvisited(x, results))
        for item in results:
            outfields = ["."] * leftlen
            outfields.extend(item.other)
            yield outfields
def join(leftSet, rightSet, mincols=1, leftfill=True, rightfill=True, asfraction=False, matchStrand=STRAND_NEUTRAL, outColumns=[-1,-1]):
    """Yield joined rows for overlapping, strand-compatible interval pairs.

    All ``GenomicInterval`` items of ``rightSet`` are loaded into an
    ``IntervalTree``; ``leftSet`` is then streamed.  Items of ``leftSet``
    that are not ``GenomicInterval`` instances are yielded untouched.

    Args:
        leftSet: iterable of left-hand items.
        rightSet: iterable of right-hand items; ``rightSet.linenum`` is
            handed to the tree on every insert.
        mincols: minimum overlap.  With ``asfraction`` it is used as a
            multiplier of the shorter interval's length (floored).
        leftfill: when true, right intervals never joined are emitted last.
        rightfill: when true, left intervals without a qualifying partner
            are emitted.
        asfraction: switches ``mincols`` from an absolute width to a factor.
        matchStrand: strand policy.  A positive value rejects pairs whose
            strand product is -1, a negative value rejects product 1, and a
            magnitude above 1 additionally rejects product 0.
        outColumns: forwarded unchanged to ``getSelectedColumns``.

    Yields:
        Pass-through items, or whatever ``getSelectedColumns`` returns for a
        joined or filler row.
    """
    # Load the right-hand side into the tree.
    right_width = 0
    left_width = 0
    right_strand_col = -1
    tree = IntervalTree()
    for entry in rightSet:
        if not isinstance(entry, GenomicInterval):
            continue
        tree.insert(entry, rightSet.linenum, entry.fields)
        if right_width == 0:
            right_width = entry.nfields
        if right_strand_col == -1:
            right_strand_col = entry.strand_col

    for interval in leftSet:
        is_interval = isinstance(interval, GenomicInterval)
        if left_width == 0 and is_interval:
            left_width = interval.nfields
        if not is_interval:
            yield interval
            continue

        hits = []
        tree.intersect(interval, hits.append)
        rejected = 0
        left_bases = interval.end - interval.start
        for hit in hits:
            right_bases = hit.end - hit.start
            # Equality test kept on purpose: only True/1 enable fraction mode.
            if asfraction == True:
                required = math.floor(min(right_bases, left_bases) * mincols)
            else:
                required = mincols

            span = range(interval.start, interval.end + 1)
            start_inside = hit.start in span
            end_inside = hit.end in span
            if start_inside and end_inside:
                overlap = hit.end - hit.start
            elif start_inside:
                overlap = interval.end - hit.start
            elif end_inside:
                overlap = hit.end - interval.start
            else:
                # both ends of the hit lie outside the interval range
                overlap = interval.end - interval.start

            if overlap < required:
                rejected += 1
                continue

            # check strand
            strand_product = (STRAND_INTEGER_VALUES[interval.strand]
                              * STRAND_INTEGER_VALUES[hit.other[right_strand_col]])
            strand_rejected = (
                # needed match but found a complement
                (strand_product == -1 and matchStrand > 0)
                # needed complement but found a match
                or (strand_product == 1 and matchStrand < 0)
                # strict criteria but only permissive match found
                or (strand_product == 0 and (matchStrand < -1 or matchStrand > 1))
            )
            if strand_rejected:
                rejected += 1
                continue

            # strand criteria met; flag the node so leftfill skips it
            hit.visited = True
            yield getSelectedColumns(interval.fields, hit.other, outColumns)

        # Also true when there were no hits at all (0 == 0).
        if rejected == len(hits) and rightfill:
            yield getSelectedColumns(interval.fields, right_width, outColumns)

    if leftfill:
        unvisited = []

        def collect_unvisited(node):
            if not hasattr(node, "visited"):
                unvisited.append(node)

        tree.traverse(collect_unvisited)
        for node in unvisited:
            yield getSelectedColumns(left_width, node.other, outColumns)
def join(
    leftSet,
    rightSet,
    mincols=1,
    leftfill=True,
    rightfill=True,
    asfraction=False,
    matchStrand=STRAND_NEUTRAL,
    outColumns=[-1, -1],
):
    """Generate the join of two interval streams.

    The ``GenomicInterval`` members of ``rightSet`` are indexed in an
    ``IntervalTree`` up front.  Each item of ``leftSet`` is then handled in
    order: anything that is not a ``GenomicInterval`` is yielded as-is, and
    each interval is paired with every intersecting tree node that passes
    the overlap and strand checks.

    Args:
        leftSet: iterable producing the left-hand items.
        rightSet: iterable producing the right-hand items; its ``linenum``
            attribute accompanies every tree insert.
        mincols: required overlap, or (with ``asfraction``) the factor
            applied to the shorter of the two interval lengths.
        leftfill: emit right intervals that were never paired, after
            ``leftSet`` is exhausted.
        rightfill: emit left intervals for which no candidate qualified.
        asfraction: interpret ``mincols`` as a factor rather than a width.
        matchStrand: sign selects which strand product is rejected
            (positive rejects -1, negative rejects 1); values beyond +/-1
            also reject a product of 0.
        outColumns: passed through to ``getSelectedColumns``.

    Yields:
        Unmodified non-interval items and the results of
        ``getSelectedColumns`` for paired or filler rows.
    """

    def overlap_width(left, right):
        # Width shared by the two intervals, by which ends of `right` fall
        # inside the closed range [left.start, left.end].
        window = range(left.start, left.end + 1)
        starts_in = right.start in window
        ends_in = right.end in window
        if starts_in:
            return (right.end if ends_in else left.end) - right.start
        if ends_in:
            return right.end - left.start
        # the intersecting item's start and end are outside the interval range
        return left.end - left.start

    # Read rightSet into memory:
    n_right_fields = 0
    n_left_fields = 0
    strand_index = -1
    index = IntervalTree()
    for record in rightSet:
        if isinstance(record, GenomicInterval):
            index.insert(record, rightSet.linenum, record.fields)
            n_right_fields = n_right_fields or record.nfields
            if strand_index == -1:
                strand_index = record.strand_col

    for interval in leftSet:
        if not isinstance(interval, GenomicInterval):
            yield interval
            continue
        if n_left_fields == 0:
            n_left_fields = interval.nfields

        candidates = []
        index.intersect(interval, lambda node: candidates.append(node))
        misses = 0
        left_len = interval.end - interval.start
        for node in candidates:
            right_len = node.end - node.start
            threshold = mincols
            # `== True` retained so only True/1 switch on fraction mode.
            if asfraction == True:
                shorter = right_len if right_len < left_len else left_len
                threshold = math.floor(shorter * mincols)

            if overlap_width(interval, node) < threshold:
                misses += 1
                continue

            # check strand
            product = (
                STRAND_INTEGER_VALUES[interval.strand]
                * STRAND_INTEGER_VALUES[node.other[strand_index]]
            )
            if product == -1:
                # needed match but found a complement
                bad_strand = matchStrand > 0
            elif product == 1:
                # needed complement but found a match
                bad_strand = matchStrand < 0
            elif product == 0:
                # strict criteria but only permissive match found
                bad_strand = matchStrand < -1 or matchStrand > 1
            else:
                bad_strand = False
            if bad_strand:
                misses += 1
                continue

            # strand criteria met
            node.visited = True
            yield getSelectedColumns(interval.fields, node.other, outColumns)

        if rightfill and (not candidates or misses == len(candidates)):
            yield getSelectedColumns(interval.fields, n_right_fields, outColumns)

    if leftfill:
        leftovers = []
        index.traverse(
            lambda node: None if hasattr(node, "visited") else leftovers.append(node)
        )
        for node in leftovers:
            yield getSelectedColumns(n_left_fields, node.other, outColumns)