def join(leftSet, rightSet, mincols=1, leftfill=True, rightfill=True):
    """Join the intervals of ``leftSet`` against those of ``rightSet`` by overlap.

    This is a generator.  Every ``GenomicInterval`` read from ``rightSet`` is
    first loaded into an ``IntervalTree``; ``leftSet`` is then streamed.

    Yields, in order:

    * every item of ``leftSet`` that is not a ``GenomicInterval``, unchanged;
    * for each left interval and each intersecting tree node whose computed
      overlap is at least ``mincols``: ``list(interval)`` followed by the
      node's ``other`` values;
    * if ``rightfill`` is true and a left interval produced no such row:
      ``list(interval)`` followed by ``"."`` repeated ``nfields`` times, where
      ``nfields`` is taken from the first right-hand interval;
    * if ``leftfill`` is true, once ``leftSet`` is exhausted: for each tree
      node that never produced a row, ``"."`` repeated ``nfields`` times
      (taken from the first left-hand interval) followed by the node's
      ``other`` values.

    ``rightSet`` must expose a ``linenum`` attribute; it is passed to
    ``IntervalTree.insert`` together with each interval's ``fields``.
    """
    # Read rightSet into memory:
    rightlen = 0
    leftlen = 0
    rightTree = IntervalTree()
    for item in rightSet:
        if isinstance(item, GenomicInterval):
            rightTree.insert(item, rightSet.linenum, item.fields)
            if rightlen == 0:
                rightlen = item.nfields

    for interval in leftSet:
        if not isinstance(interval, GenomicInterval):
            # Headers, comments, etc. are passed straight through.
            yield interval
            continue
        if leftlen == 0:
            leftlen = interval.nfields

        result = []
        rightTree.intersect(interval, result.append)
        overlap_not_met = 0
        lo, hi = interval.start, interval.end
        for item in result:
            # Plain comparisons instead of `x in range(lo, hi + 1)`: same
            # answer for integer coordinates (assumed here) without
            # materialising a list per test on Python 2.
            start_inside = lo <= item.start <= hi
            end_inside = lo <= item.end <= hi
            if start_inside and not end_inside:
                overlap = hi - item.start
            elif end_inside and not start_inside:
                overlap = item.end - lo
            elif start_inside and end_inside:
                overlap = item.end - item.start
            else:
                # the intersecting item's start and end are outside the interval range
                overlap = hi - lo
            if overlap < mincols:
                overlap_not_met += 1
                continue
            outfields = list(interval)
            # extend() rather than map(outfields.append, ...): map() is lazy
            # on Python 3, so the mapped appends would never run and the
            # right-hand fields would silently be dropped.
            outfields.extend(item.other)
            # Mark the node so the leftfill pass below skips it.
            setattr(item, "visited", True)
            yield outfields
        if (len(result) == 0 or overlap_not_met == len(result)) and rightfill:
            yield list(interval) + ["."] * rightlen

    if leftfill:
        unvisited = []

        def report_unvisited(node):
            if not hasattr(node, "visited"):
                unvisited.append(node)

        rightTree.traverse(report_unvisited)
        for item in unvisited:
            outfields = ["."] * leftlen
            outfields.extend(item.other)
            yield outfields
def join(leftSet, rightSet, mincols=1, leftfill=True, rightfill=True):
    """Generate the overlap join of two interval streams.

    All ``GenomicInterval`` items of ``rightSet`` are inserted into an
    ``IntervalTree`` (with ``rightSet.linenum`` and the interval's ``fields``);
    ``leftSet`` is then consumed one item at a time.

    Args:
        leftSet: iterable of items; non-``GenomicInterval`` items are yielded
            back unchanged.
        rightSet: iterable of items with a ``linenum`` attribute; only its
            ``GenomicInterval`` items take part in the join.
        mincols: a left/right pairing is reported only when its computed
            overlap is at least this value.
        leftfill: when true, right-hand nodes that were never reported are
            emitted at the end, prefixed with one ``"."`` per left field.
        rightfill: when true, a left interval with no reported pairing is
            emitted followed by one ``"."`` per right field.

    Yields:
        Lists of field values (or the untouched non-interval items).  The
        padding widths come from ``nfields`` of the first interval seen on
        each side.
    """
    # Read rightSet into memory:
    rightlen = 0
    leftlen = 0
    rightTree = IntervalTree()
    for item in rightSet:
        if isinstance(item, GenomicInterval):
            rightTree.insert(item, rightSet.linenum, item.fields)
            if rightlen == 0:
                rightlen = item.nfields

    for interval in leftSet:
        if not isinstance(interval, GenomicInterval):
            yield interval
            continue
        if leftlen == 0:
            leftlen = interval.nfields

        result = []
        rightTree.intersect(interval, result.append)
        overlap_not_met = 0
        left_start = interval.start
        left_end = interval.end
        for item in result:
            # Closed-range comparisons replace `in range(start, end + 1)`;
            # equivalent for integer coordinates (assumed) and O(1) on every
            # Python version.
            start_inside = left_start <= item.start <= left_end
            end_inside = left_start <= item.end <= left_end
            if start_inside and end_inside:
                overlap = item.end - item.start
            elif start_inside:
                overlap = left_end - item.start
            elif end_inside:
                overlap = item.end - left_start
            else:
                # the intersecting item's start and end are outside the interval range
                overlap = left_end - left_start

            if overlap < mincols:
                overlap_not_met += 1
                continue

            outfields = list(interval)
            # list.extend is used because map(outfields.append, ...) is a
            # no-op on Python 3 (the lazy map object is never consumed).
            outfields.extend(item.other)
            # Flag the node so it is not reported again by the leftfill pass.
            setattr(item, "visited", True)
            yield outfields

        nothing_reported = len(result) == 0 or overlap_not_met == len(result)
        if nothing_reported and rightfill:
            yield list(interval) + ["."] * rightlen

    if leftfill:
        results = []

        def report_unvisited(node):
            if not hasattr(node, "visited"):
                results.append(node)

        rightTree.traverse(report_unvisited)
        for item in results:
            outfields = ["."] * leftlen
            outfields.extend(item.other)
            yield outfields
def join(leftSet, rightSet, mincols=1, leftfill=True, rightfill=True,
         asfraction=False, matchStrand=STRAND_NEUTRAL, outColumns=[-1, -1]):
    """Join ``leftSet`` against ``rightSet`` by overlap, with strand filtering.

    This is a generator.  Every ``GenomicInterval`` of ``rightSet`` is loaded
    into an ``IntervalTree``; ``leftSet`` is then streamed.  Output rows are
    built by ``getSelectedColumns(left, right, outColumns)``.

    Args:
        leftSet: iterable of items; non-``GenomicInterval`` items are yielded
            back unchanged.
        rightSet: iterable with a ``linenum`` attribute; only its
            ``GenomicInterval`` items are indexed.
        mincols: minimum overlap required.  With ``asfraction`` it is used as
            a multiplier on the shorter of the two interval lengths.
        leftfill: when true, right-hand nodes never reported are emitted at
            the end as ``getSelectedColumns(leftlen, node.other, outColumns)``.
        rightfill: when true, a left interval with no reported pairing is
            emitted as ``getSelectedColumns(interval.fields, rightlen,
            outColumns)``.
        asfraction: when equal to ``True``, the threshold for each pairing is
            ``floor(min(left length, right length) * mincols)``.
        matchStrand: compared with the product of the two strands'
            ``STRAND_INTEGER_VALUES``.  A product of -1 is rejected when
            ``matchStrand > 0``, a product of 1 when ``matchStrand < 0``, and
            a product of 0 when ``matchStrand`` is below -1 or above 1.
        outColumns: passed through untouched to ``getSelectedColumns``.
            NOTE(review): the default is a list object shared between calls;
            this function never modifies it.

    Yields:
        Whatever ``getSelectedColumns`` returns for each reported row, plus
        the untouched non-interval items of ``leftSet``.
    """
    # Read rightSet into memory:
    rightlen = 0
    leftlen = 0
    rightStrandCol = -1
    minoverlap = mincols
    rightTree = IntervalTree()
    for item in rightSet:
        if isinstance(item, GenomicInterval):
            rightTree.insert(item, rightSet.linenum, item.fields)
            if rightlen == 0:
                rightlen = item.nfields
            if rightStrandCol == -1:
                rightStrandCol = item.strand_col

    for interval in leftSet:
        if not isinstance(interval, GenomicInterval):
            yield interval
            continue
        if leftlen == 0:
            leftlen = interval.nfields

        result = []
        rightTree.intersect(interval, result.append)
        overlap_not_met = 0
        lo, hi = interval.start, interval.end
        leftbases = hi - lo
        for item in result:
            rightbases = item.end - item.start
            # Compared against True explicitly so that other truthy values
            # keep being treated as "not a fraction".
            if asfraction == True:  # noqa: E712
                mincols = math.floor(min(rightbases, leftbases) * minoverlap)

            # Chained comparisons replace `x in range(lo, hi + 1)`: identical
            # for integer coordinates (assumed) but no list is built per test
            # on Python 2.
            start_inside = lo <= item.start <= hi
            end_inside = lo <= item.end <= hi
            if start_inside and not end_inside:
                overlap = hi - item.start
            elif end_inside and not start_inside:
                overlap = item.end - lo
            elif start_inside and end_inside:
                overlap = item.end - item.start
            else:
                # the intersecting item's start and end are outside the interval range
                overlap = leftbases
            if overlap < mincols:
                overlap_not_met += 1
                continue

            # check strand
            strandMatched = (STRAND_INTEGER_VALUES[interval.strand]
                             * STRAND_INTEGER_VALUES[item.other[rightStrandCol]])
            rejected = (
                # needed match but found a complement
                (strandMatched == -1 and matchStrand > 0)
                # needed complement but found a match
                or (strandMatched == 1 and matchStrand < 0)
                # strict criteria but only permissive match found
                or (strandMatched == 0 and (matchStrand < -1 or matchStrand > 1))
            )
            if rejected:
                overlap_not_met += 1
                continue

            # strand criteria met; flag the node for the leftfill pass
            setattr(item, "visited", True)
            yield getSelectedColumns(interval.fields, item.other, outColumns)

        if (len(result) == 0 or overlap_not_met == len(result)) and rightfill:
            yield getSelectedColumns(interval.fields, rightlen, outColumns)

    if leftfill:
        results = []

        def report_unvisited(node):
            if not hasattr(node, "visited"):
                results.append(node)

        rightTree.traverse(report_unvisited)
        for item in results:
            yield getSelectedColumns(leftlen, item.other, outColumns)
def getpairs(leftSet, rightSet, leftCol, mincols=1, asfraction=False,
             matchStrand=STRAND_NEUTRAL, skipChrNames=True, skipStrandNames=True):
    """Pair words found in right-hand fields with a term from overlapping left intervals.

    This is a generator.  Every ``GenomicInterval`` of ``leftSet`` is loaded
    into an ``IntervalTree``; ``rightSet`` is then streamed.  For each right
    interval and each intersecting left node that passes the overlap and
    strand filters, the retained right-hand columns are tokenised with
    ``shlex`` and ``[token, leftTerm]`` is yielded for every token that is not
    ``"."`` and cannot be parsed by ``float()``.  ``leftTerm`` is
    ``node.other[leftCol]``.

    Args:
        leftSet: iterable with a ``linenum`` attribute; only its
            ``GenomicInterval`` items are indexed.
        rightSet: iterable of items; non-``GenomicInterval`` items are yielded
            back unchanged.
        leftCol: index into a left node's ``other`` giving the term to pair.
        mincols: minimum overlap required.  With ``asfraction`` it is used as
            a multiplier on the shorter of the two interval lengths.
        asfraction: when equal to ``True``, the threshold for each pairing is
            ``floor(min(left length, right length) * mincols)``.
        matchStrand: compared with the product of the two strands'
            ``STRAND_INTEGER_VALUES``.  A product of -1 is rejected when
            ``matchStrand > 0``, a product of 1 when ``matchStrand < 0``, and
            a product of 0 when ``matchStrand`` is below -1 or above 1.
        skipChrNames: when true, the right-hand chromosome column is not
            tokenised.
        skipStrandNames: when true, the right-hand strand column is not
            tokenised.

    Yields:
        ``[token, leftTerm]`` lists, plus the untouched non-interval items of
        ``rightSet``.
    """
    # Read leftSet into memory:
    leftStrandCol = -1
    minoverlap = mincols
    leftTree = IntervalTree()
    for item in leftSet:
        # isinstance (not an exact type check) for consistency with join().
        if isinstance(item, GenomicInterval):
            leftTree.insert(item, leftSet.linenum, item.fields)
            if leftStrandCol == -1:
                leftStrandCol = item.strand_col

    rightlen = 0
    rightCols = []
    for interval in rightSet:
        if not isinstance(interval, GenomicInterval):
            yield interval
            continue

        if rightlen == 0:
            # The column layout is taken from the first right-hand interval.
            rightlen = interval.nfields
            # remove the useless columns.  Built by exclusion: this works on
            # Python 3 (a range object has no .remove) and a column index
            # outside 0..rightlen-1 is ignored instead of raising ValueError.
            skipped = set([interval.start_col, interval.end_col])
            if skipChrNames:
                skipped.add(interval.chrom_col)
            if skipStrandNames:
                skipped.add(interval.strand_col)
            rightCols = [col for col in range(rightlen) if col not in skipped]

        result = []
        leftTree.intersect(interval, result.append)
        lo, hi = interval.start, interval.end
        rightbases = hi - lo
        for node in result:
            leftbases = node.end - node.start
            # Compared against True explicitly so that other truthy values
            # keep being treated as "not a fraction".
            if asfraction == True:  # noqa: E712
                mincols = math.floor(min(leftbases, rightbases) * minoverlap)

            start_inside = lo <= node.start <= hi
            end_inside = lo <= node.end <= hi
            if start_inside and not end_inside:
                overlap = hi - node.start
            elif end_inside and not start_inside:
                # Tests the node's *start* being outside the range; the
                # earlier form re-tested the end here, which only differed
                # for a node whose start lies beyond its end.
                overlap = node.end - lo
            elif start_inside and end_inside:
                overlap = node.end - node.start
            else:
                # the intersecting item's start and end are outside the interval range
                overlap = rightbases
            if overlap < mincols:
                continue

            # check strand
            strandMatched = (STRAND_INTEGER_VALUES[interval.strand]
                             * STRAND_INTEGER_VALUES[node.other[leftStrandCol]])
            if strandMatched == -1 and matchStrand > 0:
                # needed match but found a complement
                continue
            if strandMatched == 1 and matchStrand < 0:
                # needed complement but found a match
                continue
            if strandMatched == 0 and (matchStrand < -1 or matchStrand > 1):
                # strict criteria but only permissive match found
                continue

            # strand criteria met
            setattr(node, "visited", True)
            leftTerm = node.other[leftCol]
            for col in rightCols:
                # Split the field into words and associate each non-numeric
                # word with leftTerm.
                lexer = shlex.shlex(interval.fields[col])
                # Separators: tab, CR, LF, backslash, comma and semicolon.
                # A plain space is NOT a separator.
                lexer.whitespace = '\t\r\n\\,\\;'
                lexer.wordchars += ":'"
                lexer.whitespace_split = True
                lexer.quotes = '"'
                for token in lexer:
                    token = token.strip()
                    if token == ".":
                        continue
                    try:
                        float(token)
                    except ValueError:
                        # Not a number: report it.
                        yield [token, leftTerm]
def join(
    leftSet,
    rightSet,
    mincols=1,
    leftfill=True,
    rightfill=True,
    asfraction=False,
    matchStrand=STRAND_NEUTRAL,
    outColumns=[-1, -1],
):
    """Generate the overlap join of two interval streams with strand filtering.

    All ``GenomicInterval`` items of ``rightSet`` are inserted into an
    ``IntervalTree`` (with ``rightSet.linenum`` and the interval's ``fields``);
    ``leftSet`` is then consumed one item at a time.  Rows are produced by
    ``getSelectedColumns(left, right, outColumns)``.

    Yields, in order:

    * every item of ``leftSet`` that is not a ``GenomicInterval``, unchanged;
    * one row per left interval / intersecting node pair that passes both the
      overlap threshold and the strand test, built from ``interval.fields``
      and the node's ``other``;
    * if ``rightfill`` is true and a left interval produced no row: a row
      built from ``interval.fields`` and the right-hand field count;
    * if ``leftfill`` is true, once ``leftSet`` is exhausted: for each node
      never reported, a row built from the left-hand field count and the
      node's ``other``.

    Overlap threshold: ``mincols``, or, when ``asfraction == True``,
    ``floor(min(left length, right length) * mincols)`` recomputed per pair.

    Strand test: with ``p`` the product of both strands'
    ``STRAND_INTEGER_VALUES``, a pair is rejected when ``p == -1`` and
    ``matchStrand > 0``, when ``p == 1`` and ``matchStrand < 0``, or when
    ``p == 0`` and ``matchStrand`` lies outside ``[-1, 1]``.

    NOTE(review): ``outColumns`` defaults to a list shared between calls; it
    is only forwarded to ``getSelectedColumns`` here, never modified.
    """
    # Read rightSet into memory:
    rightlen = 0
    leftlen = 0
    rightStrandCol = -1
    minoverlap = mincols
    rightTree = IntervalTree()
    for item in rightSet:
        if isinstance(item, GenomicInterval):
            rightTree.insert(item, rightSet.linenum, item.fields)
            if rightlen == 0:
                rightlen = item.nfields
            if rightStrandCol == -1:
                rightStrandCol = item.strand_col

    for interval in leftSet:
        if not isinstance(interval, GenomicInterval):
            yield interval
            continue
        if leftlen == 0:
            leftlen = interval.nfields

        result = []
        rightTree.intersect(interval, result.append)
        overlap_not_met = 0
        left_start = interval.start
        left_end = interval.end
        leftbases = left_end - left_start
        for item in result:
            rightbases = item.end - item.start
            # Explicit comparison with True is kept on purpose: other truthy
            # values must continue to mean "mincols is an absolute count".
            if asfraction == True:  # noqa: E712
                mincols = math.floor(min(rightbases, leftbases) * minoverlap)

            # Each endpoint is tested once with a chained comparison rather
            # than up to six `in range(...)` scans; equivalent for integer
            # coordinates (assumed).
            start_inside = left_start <= item.start <= left_end
            end_inside = left_start <= item.end <= left_end
            if start_inside and end_inside:
                overlap = item.end - item.start
            elif start_inside:
                overlap = left_end - item.start
            elif end_inside:
                overlap = item.end - left_start
            else:
                # the intersecting item's start and end are outside the interval range
                overlap = leftbases

            if overlap < mincols:
                overlap_not_met += 1
                continue

            # check strand
            strandMatched = (
                STRAND_INTEGER_VALUES[interval.strand]
                * STRAND_INTEGER_VALUES[item.other[rightStrandCol]]
            )
            if (
                # needed match but found a complement
                (strandMatched == -1 and matchStrand > 0)
                # needed complement but found a match
                or (strandMatched == 1 and matchStrand < 0)
                # strict criteria but only permissive match found
                or (strandMatched == 0 and (matchStrand < -1 or matchStrand > 1))
            ):
                overlap_not_met += 1
                continue

            # strand criteria met; flag the node so leftfill skips it
            setattr(item, "visited", True)
            yield getSelectedColumns(interval.fields, item.other, outColumns)

        nothing_reported = len(result) == 0 or overlap_not_met == len(result)
        if nothing_reported and rightfill:
            yield getSelectedColumns(interval.fields, rightlen, outColumns)

    if leftfill:
        unvisited = []

        def report_unvisited(node):
            if not hasattr(node, "visited"):
                unvisited.append(node)

        rightTree.traverse(report_unvisited)
        for item in unvisited:
            yield getSelectedColumns(leftlen, item.other, outColumns)
def getpairs(leftSet, rightSet, leftCol, mincols=1, asfraction=False,
             matchStrand=STRAND_NEUTRAL, skipChrNames=True, skipStrandNames=True):
    """Yield ``[word, leftTerm]`` pairs for right intervals overlapping left ones.

    This is a generator.  ``GenomicInterval`` items of ``leftSet`` are indexed
    in an ``IntervalTree`` (with ``leftSet.linenum`` and the interval's
    ``fields``); ``rightSet`` is then consumed one item at a time.

    Yields, in order of ``rightSet``:

    * every item that is not a ``GenomicInterval``, unchanged;
    * for each right interval and each intersecting left node that passes the
      overlap threshold and the strand test: ``[word, leftTerm]`` for every
      word found in the retained right-hand columns, skipping the word
      ``"."`` and any word ``float()`` accepts.  ``leftTerm`` is
      ``node.other[leftCol]``.

    Retained columns: all columns of the first right-hand interval except its
    start and end columns, its chromosome column when ``skipChrNames`` is
    true, and its strand column when ``skipStrandNames`` is true.

    Overlap threshold: ``mincols``, or, when ``asfraction == True``,
    ``floor(min(left length, right length) * mincols)`` recomputed per pair.

    Strand test: with ``p`` the product of both strands'
    ``STRAND_INTEGER_VALUES``, a pair is rejected when ``p == -1`` and
    ``matchStrand > 0``, when ``p == 1`` and ``matchStrand < 0``, or when
    ``p == 0`` and ``matchStrand`` lies outside ``[-1, 1]``.
    """
    # Read leftSet into memory:
    leftStrandCol = -1
    minoverlap = mincols
    leftTree = IntervalTree()
    for item in leftSet:
        # isinstance rather than `type(...) is`, matching join().
        if isinstance(item, GenomicInterval):
            leftTree.insert(item, leftSet.linenum, item.fields)
            if leftStrandCol == -1:
                leftStrandCol = item.strand_col

    rightlen = 0
    rightCols = []
    for interval in rightSet:
        if not isinstance(interval, GenomicInterval):
            yield interval
            continue

        if rightlen == 0:
            rightlen = interval.nfields
            # remove the useless columns.  A list comprehension is used
            # because range() has no .remove on Python 3, and because
            # list.remove raises ValueError for an index that is not a real
            # column.
            useless = set([interval.start_col, interval.end_col])
            if skipChrNames:
                useless.add(interval.chrom_col)
            if skipStrandNames:
                useless.add(interval.strand_col)
            rightCols = [col for col in range(rightlen) if col not in useless]

        result = []
        leftTree.intersect(interval, result.append)
        right_start = interval.start
        right_end = interval.end
        rightbases = right_end - right_start
        for node in result:
            leftbases = node.end - node.start
            # Explicit comparison with True is kept on purpose: other truthy
            # values must continue to mean "mincols is an absolute count".
            if asfraction == True:  # noqa: E712
                mincols = math.floor(min(leftbases, rightbases) * minoverlap)

            start_inside = right_start <= node.start <= right_end
            end_inside = right_start <= node.end <= right_end
            if start_inside and end_inside:
                overlap = node.end - node.start
            elif start_inside:
                overlap = right_end - node.start
            elif end_inside:
                # Reached when the node's start is outside the range.  The
                # previous condition checked the end a second time instead of
                # the start; the two differ only for a node with start > end.
                overlap = node.end - right_start
            else:
                # the intersecting item's start and end are outside the interval range
                overlap = rightbases
            if overlap < mincols:
                continue

            # check strand
            strandMatched = (STRAND_INTEGER_VALUES[interval.strand]
                             * STRAND_INTEGER_VALUES[node.other[leftStrandCol]])
            if (
                # needed match but found a complement
                (strandMatched == -1 and matchStrand > 0)
                # needed complement but found a match
                or (strandMatched == 1 and matchStrand < 0)
                # strict criteria but only permissive match found
                or (strandMatched == 0 and (matchStrand < -1 or matchStrand > 1))
            ):
                continue

            # strand criteria met
            setattr(node, "visited", True)
            leftTerm = node.other[leftCol]
            for col in rightCols:
                lexer = shlex.shlex(interval.fields[col])
                # Words are separated by tab, CR, LF, backslash, comma or
                # semicolon; a plain space does not split a word.
                lexer.whitespace = '\t\r\n\\,\\;'
                lexer.wordchars += ":'"
                lexer.whitespace_split = True
                lexer.quotes = '"'
                # `word` (not `item`/`node`) so the tree node is not shadowed.
                for word in lexer:
                    word = word.strip()
                    if word == ".":
                        continue
                    try:
                        float(word)
                    except ValueError:
                        # take each field that's not a number
                        yield [word, leftTerm]