def test_sentinelize(self):
    """The last item produced by a sentinelized stream must be the sentinel."""
    rows = [(10, 12, 0.5), (14, 15, 1.2)]
    guarded = sentinelize(fstream(rows, fields=['start', 'end', 'score']), 'Z')
    # Exhaust the stream, remembering only the most recent item.
    for item in guarded:
        last_seen = item
    self.assertEqual(last_seen, 'Z')
def _stream(ts, tf):
    """Yield every item of *tf* extended with one aggregated score per stream of *ts*.

    *ts* is a sequence of score streams (indexed as start, end, score) and
    *tf* a stream whose ``fields`` contain 'start' and 'end'.  ``method``,
    ``_score_functions`` and ``_arithmetic_mean`` are not defined here; they
    are resolved in the surrounding scope.
    """
    # One sentinel-terminated reader and one buffer of pending items per score stream.
    readers = [common.sentinelize(track, [sys.maxint] * len(track.fields)) for track in ts]
    buffers = [[(-sys.maxint, -sys.maxint, 0.0)] for _ in ts]
    start_idx = tf.fields.index('start')
    end_idx = tf.fields.index('end')
    if hasattr(method, '__call__'):
        # A user-supplied callable only receives the list of scores.
        def aggregate_fn(values, denom):
            return method(values)
    else:
        aggregate_fn = _score_functions.get(method, _arithmetic_mean)
    for feature in tf:
        fstart, fend = feature[start_idx], feature[end_idx]
        per_track = []
        for i in range(len(ts)):
            pending = buffers[i]
            item = pending[-1]
            # Read score items until one starts at or after the feature end,
            # keeping those that end after the feature start.
            while item[0] < fend:
                item = readers[i].next()
                if item[1] > fstart:
                    pending.append(item)
            # Forget buffered items ending at or before the feature start.
            first_kept = 0
            while pending[first_kept][1] <= fstart:
                first_kept += 1
            pending = buffers[i] = pending[first_kept:]
            # Repeat each score once per position shared with the feature.
            covered = []
            for s in pending:
                if s[0] >= fend:
                    continue
                lo = max(s[0], fstart)
                hi = min(s[1], fend)
                covered.extend([s[2]] * (hi - lo))
            per_track.append(aggregate_fn(covered, 1.0 / (fend - fstart)))
        yield tuple(feature) + tuple(per_track)
def merge_scores(trackList, method="arithmetic"):
    """
    Build a stream holding, for each position, the combined score of several score tracks::

        X1: __________666666666______
        X2: _____2222222222__________
        R:  _____11111444443333______

    :param trackList: list of FeatureStream objects.
    :param method: (str) name looked up in ``_score_functions`` ('arithmetic',
        'geometric' or 'sum'; unknown names fall back to ``_arithmetic_mean``),
        or a callable that receives the list of overlapping scores.
    :rtype: FeatureStream
    """
    guarded = [FeatureStream(common.sentinelize(trk, [sys.maxint] * len(trk.fields)), trk.fields)
               for trk in trackList]
    tracks = [common.reorder(trk, ["start", "end", "score"]) for trk in guarded]
    # Fields of the first track that every track also provides.
    fields = [name for name in tracks[0].fields if all([name in trk.fields for trk in tracks])]
    elements = [list(trk.next()) for trk in tracks]
    track_denom = 1.0 / len(trackList)
    if hasattr(method, "__call__"):
        def mean_fn(scores, denom):
            return method(scores)
    else:
        mean_fn = _score_functions.get(method, _arithmetic_mean)
    # Discard the tracks whose very first item is already the sentinel.
    for pos in xrange(len(tracks) - 1, -1, -1):
        if elements[pos][0] == sys.maxint:
            del tracks[pos]
            del elements[pos]

    def _stream(tracks):
        # Positions of the fields that follow start, end, score.
        extra_columns = range(3, len(fields))
        while tracks:
            start = min([el[0] for el in elements])
            # The region ends at the nearest boundary after *start*.
            end = min([el[0] for el in elements if el[0] > start] + [el[1] for el in elements])
            overlapping = [el for el in elements if el[1] > start and el[0] < end]
            scores = [el[2] for el in overlapping]
            rest = []
            for col in extra_columns:
                labels = [str(el[col]) for el in overlapping if el[col] is not None]
                if all([label == labels[0] for label in labels]):
                    rest.append(labels[0])
                else:
                    rest.append("|".join(labels))
            yield (start, end, mean_fn(scores, track_denom)) + tuple(rest)
            # Advance: trim what was consumed, fetch the next item where needed,
            # and retire the tracks that reached their sentinel.
            for pos in xrange(len(tracks) - 1, -1, -1):
                head = elements[pos]
                if head[0] < end:
                    head[0] = end
                if head[1] <= end:
                    head = elements[pos] = list(tracks[pos].next())
                if head[0] == sys.maxint:
                    del tracks[pos]
                    del elements[pos]

    return FeatureStream(_stream(tracks), fields)
def _stream(ts, tf):
    """Yield the parts of the items of *ts* that overlap items of *tf*.

    Items of both streams are read positionally as (start, end, ...).
    ``stranded``, ``strict`` and ``annotate`` are not defined here; they are
    resolved in the surrounding scope.  Each yielded tuple is
    ``(start, end) + <rest of the ts item>``, followed, when ``annotate`` is
    true, by the *tf* values of the fields that *ts* does not have.
    """
    tf = common.sentinelize(tf, [sys.maxint] * len(tf.fields))
    info_idx = [pos for pos, name in enumerate(tf.fields) if name not in ts.fields]
    if stranded:
        ts_strand_idx = ts.fields.index("strand")
        tf_strand_idx = tf.fields.index("strand")

        def same_strand(score, feat):
            return score[ts_strand_idx] == feat[tf_strand_idx]
    else:
        def same_strand(score, feat):
            return True
    window = []
    upcoming = (-sys.maxint, -sys.maxint, 0.0)
    for score in ts:
        sstart, send = score[0], score[1]
        # Buffer every feature starting before the end of this score item
        # and ending after its start.
        while upcoming[0] < send:
            if upcoming[1] > sstart:
                window.append(upcoming)
            upcoming = tf.next()
        # Drop the leading features that end at or before the item start;
        # if none survives, restart the buffer from the next unread feature.
        if window:
            stale = 0
            while stale < len(window) and window[stale][1] <= sstart:
                stale += 1
            if stale < len(window):
                window = window[stale:]
            else:
                window = [upcoming]
        # Emit the intersections.
        for feat in window:
            if not same_strand(score, feat):
                continue
            info = tuple([feat[k] for k in info_idx]) if annotate else ()
            if strict and (feat[0] > sstart or feat[1] < send):
                continue
            if feat[0] >= send:
                continue  # keep for next iteration
            yield (max(feat[0], sstart), min(feat[1], send)) + tuple(score[2:]) + info
def _stream(ts, tf):
    """Yield every item of *tf* extended with one aggregated score per stream of *ts*.

    *ts* is a sequence of score streams (indexed as start, end, score) and
    *tf* a stream whose ``fields`` contain "start" and "end".  ``method``,
    ``_score_functions`` and ``_arithmetic_mean`` are not defined here; they
    are resolved in the surrounding scope.
    """
    # One sentinel-terminated reader and one buffer of pending items per score stream.
    readers = [common.sentinelize(track, [sys.maxint] * len(track.fields)) for track in ts]
    buffers = [[(-sys.maxint, -sys.maxint, 0.0)] for _ in ts]
    start_idx = tf.fields.index("start")
    end_idx = tf.fields.index("end")
    if hasattr(method, "__call__"):
        # A user-supplied callable only receives the list of scores.
        def aggregate_fn(values, denom):
            return method(values)
    else:
        aggregate_fn = _score_functions.get(method, _arithmetic_mean)
    for feature in tf:
        fstart, fend = feature[start_idx], feature[end_idx]
        per_track = []
        for i in range(len(ts)):
            pending = buffers[i]
            item = pending[-1]
            # Read score items until one starts at or after the feature end,
            # keeping those that end after the feature start.
            while item[0] < fend:
                item = readers[i].next()
                if item[1] > fstart:
                    pending.append(item)
            # Forget buffered items ending at or before the feature start.
            first_kept = 0
            while pending[first_kept][1] <= fstart:
                first_kept += 1
            pending = buffers[i] = pending[first_kept:]
            # Repeat each score once per position shared with the feature.
            covered = []
            for s in pending:
                if s[0] >= fend:
                    continue
                lo = max(s[0], fstart)
                hi = min(s[1], fend)
                covered.extend([s[2]] * (hi - lo))
            per_track.append(aggregate_fn(covered, 1.0 / (fend - fstart)))
        yield tuple(feature) + tuple(per_track)
def _stream(ts, tf):
    """Yield the parts of the items of *ts* that overlap items of *tf*.

    Items of both streams are read positionally as (start, end, ...).
    ``stranded``, ``strict`` and ``annotate`` are not defined here; they are
    resolved in the surrounding scope.  Each yielded tuple is
    ``(start, end) + <rest of the ts item>``, followed, when ``annotate`` is
    true, by the *tf* values of the fields that *ts* does not have.
    """
    tf = common.sentinelize(tf, [sys.maxint] * len(tf.fields))
    info_idx = [pos for pos, name in enumerate(tf.fields) if name not in ts.fields]
    if stranded:
        ts_strand_idx = ts.fields.index('strand')
        tf_strand_idx = tf.fields.index('strand')

        def same_strand(score, feat):
            return score[ts_strand_idx] == feat[tf_strand_idx]
    else:
        def same_strand(score, feat):
            return True
    window = []
    upcoming = (-sys.maxint, -sys.maxint, 0.0)
    for score in ts:
        sstart, send = score[0], score[1]
        # Buffer every feature starting before the end of this score item
        # and ending after its start.
        while upcoming[0] < send:
            if upcoming[1] > sstart:
                window.append(upcoming)
            upcoming = tf.next()
        # Drop the leading features that end at or before the item start;
        # if none survives, restart the buffer from the next unread feature.
        if window:
            stale = 0
            while stale < len(window) and window[stale][1] <= sstart:
                stale += 1
            if stale < len(window):
                window = window[stale:]
            else:
                window = [upcoming]
        # Emit the intersections.
        for feat in window:
            if not same_strand(score, feat):
                continue
            info = tuple([feat[k] for k in info_idx]) if annotate else ()
            if strict and (feat[0] > sstart or feat[1] < send):
                continue
            if feat[0] >= send:
                continue  # keep for next iteration
            yield (max(feat[0], sstart), min(feat[1], send)) + tuple(score[2:]) + info
def merge_scores(trackList, method='arithmetic'):
    """
    Build a stream holding, for each position, the combined score of several score tracks::

        X1: __________666666666______
        X2: _____2222222222__________
        R:  _____11111444443333______

    :param trackList: list of FeatureStream objects.
    :param method: (str) name looked up in ``_score_functions`` ('arithmetic',
        'geometric' or 'sum'; unknown names fall back to ``_arithmetic_mean``),
        or a callable that receives the list of overlapping scores.
    :rtype: FeatureStream
    """
    guarded = [FeatureStream(common.sentinelize(trk, [sys.maxint] * len(trk.fields)), trk.fields)
               for trk in trackList]
    tracks = [common.reorder(trk, ['start', 'end', 'score']) for trk in guarded]
    # Fields of the first track that every track also provides.
    fields = [name for name in tracks[0].fields if all([name in trk.fields for trk in tracks])]
    elements = [list(trk.next()) for trk in tracks]
    track_denom = 1.0 / len(trackList)
    if hasattr(method, '__call__'):
        def mean_fn(scores, denom):
            return method(scores)
    else:
        mean_fn = _score_functions.get(method, _arithmetic_mean)
    # Discard the tracks whose very first item is already the sentinel.
    for pos in xrange(len(tracks) - 1, -1, -1):
        if elements[pos][0] == sys.maxint:
            del tracks[pos]
            del elements[pos]

    def _stream(tracks):
        # Positions of the fields that follow start, end, score.
        extra_columns = range(3, len(fields))
        while tracks:
            start = min([el[0] for el in elements])
            # The region ends at the nearest boundary after *start*.
            end = min([el[0] for el in elements if el[0] > start] + [el[1] for el in elements])
            overlapping = [el for el in elements if el[1] > start and el[0] < end]
            scores = [el[2] for el in overlapping]
            rest = []
            for col in extra_columns:
                labels = [str(el[col]) for el in overlapping if el[col] is not None]
                if all([label == labels[0] for label in labels]):
                    rest.append(labels[0])
                else:
                    rest.append("|".join(labels))
            yield (start, end, mean_fn(scores, track_denom)) + tuple(rest)
            # Advance: trim what was consumed, fetch the next item where needed,
            # and retire the tracks that reached their sentinel.
            for pos in xrange(len(tracks) - 1, -1, -1):
                head = elements[pos]
                if head[0] < end:
                    head[0] = end
                if head[1] <= end:
                    head = elements[pos] = list(tracks[pos].next())
                if head[0] == sys.maxint:
                    del tracks[pos]
                    del elements[pos]

    return FeatureStream(_stream(tracks), fields)
def concatenate(trackList, fields=None, remove_duplicates=False, group_by=None, aggregate={}):
    """
    Returns one stream containing all features from a list of tracks, ordered by *fields*.

    When *trackList* holds a single track, that track is returned as is and
    the other arguments are not used.

    :param trackList: list of FeatureStream objects.
    :param fields: (list of str) list of fields to keep in the output (at least ['start','end']).
        Defaults to the fields of the first track; only fields present in every track are kept.
    :param remove_duplicates: (bool) whether to remove items that are identical in several
        of the tracks in *trackList*. [False]
    :param group_by: (list of str) if specified, elements having all values for these fields in
        common will be merged into a single element. Other fields are merged according to *aggregate*
        if specified, or `common.generic_merge` by default.
    :param aggregate: (dict) for each field name given as a key, its value is the function
        to apply to the vector containing all different values for this field in order to merge them.
        E.g. ``{'score': lambda x: sum(x)}`` will return the sum of all scores in the output.
        The default dict is only read (``.get``), never modified.
    :rtype: FeatureStream
    """
    def _find_min(feat_tuple):
        """Return the index of the 'smallest' element amongst a tuple of features from
        different tracks.

        Priority is given to the first field; if the first field items
        are equal amongst several elements, it looks at the second field, and so on.
        Values are compared through their ``hash``. Elements whose first field is
        ``sys.maxint`` (the end-of-track sentinel) are never selected over the
        current candidate."""
        nmin = 0
        xmin = feat_tuple[0]
        for n,x in enumerate(feat_tuple[1:]):
            if x[0] == sys.maxint: continue
            for k in range(len(x)):
                if cmp(hash(x[k]),hash(xmin[k]))<0:
                    xmin = x
                    nmin = n+1 # enumerate started at the second element
                    break
                elif cmp(hash(x[k]),hash(xmin[k]))>0:
                    break
        return nmin

    def _weave(_t,N):
        """Generator yielding all features represented in a list of tracks *_t*,
        sorted w.r.t the *N* first fields."""
        current = [x.next()[:N] for x in _t] # init: the head element of each track
        allfields = [t.fields for t in _t]
        n = _find_min(current)
        last = current[n]
        current[n] = _t[n].next()[:N]
        # Without grouping, elements are emitted immediately; with grouping,
        # *last* is held back until an element from a different group shows up.
        if not group_by:
            yield last
        while 1:
            # Remove duplicates
            if remove_duplicates:
                while not all([current.count(x)==1 for x in current]):
                    for k in range(len(current)):
                        if current.count(current[k]) > 1:
                            current[k] = _t[k].next()[:N]
            n = _find_min(current)
            # The smallest head being the sentinel means every track is exhausted
            if current[n][0] == sys.maxint: break
            if group_by:
                idx = [allfields[n].index(f) for f in group_by]
                if all(current[n][i] == last[i] for i in idx):
                    last = tuple(current[n][i] if i in idx \
                                 else aggregate.get(allfields[n][i],common.generic_merge)((last[i],current[n][i])) \
                                 for i in range(len(allfields[n]))) # merge last and current
                else:
                    yield last
                    last = current[n]
            else:
                yield current[n]
            current[n] = _t[n].next()[:N]
        # Flush the group still being accumulated
        if group_by: yield last

    if len(trackList) == 1: return trackList[0]
    if fields is None:
        fields = trackList[0].fields
    # Keep only the fields shared by all tracks
    fields = [f for f in fields if all(f in t.fields for t in trackList)]
    # Output field order: ('chr',) 'start', 'end', ('name',) then the remaining fields
    _of = ['start','end']
    if 'chr' in fields: _of = ['chr']+_of
    if 'name' in fields: _of += ['name']
    _of += [f for f in fields if not(f in _of)]
    tl = [common.reorder(t,_of) for t in trackList]
    # Terminate each track with a sentinel so that .next() never raises StopIteration in _weave
    tl = [FeatureStream(common.sentinelize(x,(sys.maxint,)*len(x.fields)),x.fields) for x in tl]
    return FeatureStream(_weave(tl,len(_of)),fields=_of)
def _combine(trackList,fn,win_size,aggregate):
    """Generator - see function `combine` below.

    Sweeps over the start and end positions of the items of all tracks and,
    for every interval between two consecutive positions where
    ``fn(activity)`` is true, yields ``(start, end) + <aggregated fields>``.

    :param trackList: list of streams; the output fields are taken from the first one.
    :param fn: function called with the list of per-track booleans (`activity`).
    :param win_size: (int) width of the window of positions loaded at each step.
    :param aggregate: (dict) field name -> function merging the values of the
        active tracks; `common.generic_merge` is used for missing keys.
    """
    N = len(trackList)
    fields = trackList[0].fields
    trackList = [common.sentinelize(t, [sys.maxint]*len(t.fields)) for t in trackList]
    init = [trackList[i].next() for i in range(N)] # the first element of each track
    activity = [False]*N # a vector of boolean values for the N tracks at a given position
    z = [None]*N # per-track meta info (fields after start, end) of the active item, else None
    # If there are empty tracks, remove them, and their index from init
    for i in xrange(N-1,-1,-1):
        if init[i][0] == sys.maxint:
            N-=1
            trackList.pop(i)
            init.pop(i)
    if N == 0: return
    available_tracks = range(N-1,-1,-1) # indices of the tracks not yet read to the end
    # Sort starts and ends of all init elements indifferently; record the origin track index.
    # Start events carry the item's meta info, end events are bare (position, index) pairs.
    current = [(init[i][0],i)+init[i][2:] for i in range(N)]+[(init[i][1],i) for i in range(N)]
    current.sort()
    # Init step: set all tracks beginning at the starting point as 'active'
    is_chr = 'chr' in fields
    if is_chr:
        empty = (current[0][2],)+('0',)*len(fields[3:]) # write chr name if a region has no other annotation
    else:
        empty = ('0',)*len(fields[2:])
    start = current[0][0]
    while current[0][0] == start:
        i = current[0][1]                   # track index
        activity[i] = True                  # set this track to 'active'
        z[i] = current.pop(0)[2:]           # z records all meta info
    k=1 # index of the current window; its upper bound is k*win_size
    while available_tracks or current:
        # Load all elements within *win_size* bp in *current*
        to_remove = []
        limit = k * win_size
        # Skip the windows lying entirely before the next pending event
        while current[0][0] >= limit:
            k+=1
            limit = k * win_size
        for i in available_tracks:
            a = [0,0]
            # Read track i until an item ends at or beyond the window limit
            while a[1] < limit:
                a = trackList[i].next()
                if a[0] == sys.maxint:      # track i is completely read:
                    to_remove.append(i)     # remove it from the tracks list
                else:
                    current.append((a[0],i)+a[2:])
                    current.append((a[1],i))
        for i in to_remove:
            available_tracks.remove(i)
        if not current:
            continue
        current.sort()
        # Calculate boolean values for this window
        while current and current[0][0] < limit:
            next = current[0][0] # NOTE: shadows the builtin `next` inside this generator
            if fn(activity):
                feat_aggreg = [None]*len(fields[2:])
                for n,f in enumerate(fields[2:]):
                    feats = tuple(zi[n] for zi in z if zi)
                    # An IndexError raised by the merge function replaces the whole
                    # result with the *empty* placeholder tuple
                    try: feat_aggreg[n] = aggregate.get(f,common.generic_merge)(feats)
                    except IndexError: feat_aggreg = empty
                yield (start,next) + tuple(feat_aggreg)
            # Apply every event located at position *next*
            while current and current[0][0] == next:
                i = current[0][1]           # track index
                activity[i] = not(activity[i])      # reverse activity
                zi = current.pop(0)[2:]             # record meta info
                z[i] = zi if activity[i] else None
            start = next
        k+=1
def test_sentinelize(self):
    """The last item produced by a sentinelized stream must be the sentinel."""
    rows = [(10,12,0.5), (14,15,1.2)]
    guarded = sentinelize(fstream(rows, fields=['start','end','score']), 'Z')
    # Exhaust the stream, remembering only the most recent item.
    for item in guarded:
        last_seen = item
    self.assertEqual(last_seen,'Z')
def _get_feature(_t, _a):
    """Yield each item of *_t* extended with ``(gene, typeLoc, dist)``.

    Items are read positionally: index 0 and 1 are treated as start and end;
    for items of *_a*, index 2 is used as the gene name and index 3 as the
    strand (compared with 1 and -1).  Items of *_t* must be tuples, since
    the result is built with ``peak + (...)``.  ``thresholdInter``,
    ``thresholdPromot`` and ``thresholdUTR`` are not defined here; they are
    resolved in the surrounding scope.

    :param _t: stream of peaks to annotate.
    :param _a: stream of annotations (genes).
    """
    F = []  # annotations read so far that may still be close to a peak
    _a = common.sentinelize(_a, [sys.maxint] * len(_a.fields))
    for peak in _t:
        distMinBefore = distMinAfter = thresholdInter + 1
        gene = dist = typeLoc = ""
        geneBefore = geneAfter = strandBefore = strandAfter = None
        included = 0
        # keep only genes which don't start too far
        for annot in _a:
            F.append(annot)
            if annot[0] > peak[1] + thresholdInter:
                break
        # remove genes that end too far
        fpop = -1  # always keep one gene before
        for annot in F:
            if annot[1] > peak[0] - thresholdInter:
                break
            fpop += 1
        if fpop > 0:
            F = F[fpop:]
        for annot in F:
            # if the peak is totally included in the gene
            if (peak[0] >= annot[0]) and (annot[1] >= peak[1]):
                includedGene = annot[2]
                # Conditional expression instead of `cond and a or b`: the
                # latter returned the wrong branch when the strand is -1 and
                # annot[1] - peak[1] is 0.
                includedDist = annot[1] - peak[1] if annot[3] == -1 else peak[0] - annot[0]
                included = 1
            # if the gene is totally included in the peak
            elif (annot[0] > peak[0]) and (peak[1] > annot[1]):
                includedGene = annot[2]
                includedDist = 0
                included = 1
            else:
                # if annot is not too far 3' and no intersection
                if 0 < (peak[0] - annot[1]) < distMinBefore:
                    distMinBefore = peak[0] - annot[1]
                    geneBefore = annot[2]
                    strandBefore = annot[3]
                # if intersection (annot is before)
                elif annot[0] < peak[0] < annot[1]:
                    distMinBefore = 0
                    geneBefore = annot[2]
                    strandBefore = annot[3]
                # if annot is not too far 5' and no intersection
                if 0 < (annot[0] - peak[1]) < distMinAfter:
                    distMinAfter = annot[0] - peak[1]
                    geneAfter = annot[2]
                    strandAfter = annot[3]
                # if intersection (annot is after)
                elif annot[0] < peak[1] < annot[1]:
                    distMinAfter = 0
                    geneAfter = annot[2]
                    strandAfter = annot[3]
        # detect intergenic peak
        if not included and distMinBefore > thresholdInter and distMinAfter > thresholdInter:
            yield peak + ('', 'Intergenic', thresholdInter)
            continue
        # detect peak before the first or after the last gene on the chromosome
        if geneBefore is None:
            if distMinAfter <= thresholdInter:
                gene = geneAfter
                dist = distMinAfter
                typeLoc = "Upstream" if strandAfter == 1 else "Downstream"
        elif geneAfter is None:
            if distMinBefore <= thresholdInter:
                gene = geneBefore
                dist = distMinBefore
                typeLoc = "Upstream" if strandBefore == -1 else "Downstream"
        # detect peak between two genes on the same strand
        elif strandBefore == strandAfter:
            if strandBefore == 1:
                if thresholdUTR * distMinAfter > 100 * distMinBefore:
                    gene = geneBefore
                    dist = distMinBefore
                    if distMinAfter < thresholdPromot:
                        typeLoc = "3UTR"
                    else:
                        typeLoc = "Downstream"
                else:
                    gene = geneAfter
                    dist = distMinAfter
                    if dist < thresholdPromot:
                        typeLoc = "Promot"
                    else:
                        typeLoc = "Upstream"
            else:
                if thresholdUTR * distMinBefore > 100 * distMinAfter:
                    gene = geneAfter
                    dist = distMinAfter
                    if distMinBefore < thresholdPromot:
                        typeLoc = "3UTR"
                    else:
                        typeLoc = "Downstream"
                else:
                    gene = geneBefore
                    dist = distMinBefore
                    if dist < thresholdPromot:
                        typeLoc = "Promot"
                    else:
                        typeLoc = "Upstream"
        # detect peak between two genes on different strands
        else:
            # detect peak between 2 promoters
            if strandBefore == -1:
                typeLoc = "Upstream"
                if distMinBefore < distMinAfter:
                    gene = geneBefore
                    dist = distMinBefore
                    if dist < thresholdPromot:
                        typeLoc = "Promot"
                        if distMinAfter < thresholdPromot:
                            typeLoc += "_Promot"
                            gene += "_" + geneAfter
                            dist = str(dist) + "_" + str(distMinAfter)
                else:
                    gene = geneAfter
                    dist = distMinAfter
                    if dist < thresholdPromot:
                        typeLoc = "Promot"
                        if distMinBefore < thresholdPromot:
                            typeLoc += "_Promot"
                            gene += "_" + geneBefore
                            dist = str(dist) + "_" + str(distMinBefore)
            # detect peak between 2 3UTR
            else:
                typeLoc = "Downstream"
                # detect peak overlapping the 2 3UTR
                if distMinBefore == distMinAfter:
                    if thresholdUTR * thresholdPromot > 100 * distMinBefore:
                        typeLoc = "3UTR"
                    typeLoc += "_" + typeLoc
                    gene = geneBefore + "_" + geneAfter
                    dist = str(distMinBefore) + "_" + str(distMinAfter)
                elif distMinBefore < distMinAfter:
                    dist = distMinBefore
                    gene = geneBefore
                    if thresholdUTR * thresholdPromot > 100 * dist:
                        typeLoc = "3UTR"
                else:
                    dist = distMinAfter
                    gene = geneAfter
                    if thresholdUTR * thresholdPromot > 100 * dist:
                        typeLoc = "3UTR"
        # append the including/included gene to whatever was found around the peak
        if included == 1:
            gene += "_" + includedGene if gene else includedGene
            dist = str(dist)
            dist = dist + "_" + str(includedDist) if dist else str(includedDist)
            typeLoc += "_Included" if typeLoc else "Included"
        yield peak + (gene, typeLoc, dist)
def _get_feature(_t,_a):
    """Yield each item of *_t* extended with ``(gene, typeLoc, dist)``.

    Items are read positionally: index 0 and 1 are treated as start and end;
    for items of *_a*, index 2 is used as the gene name and index 3 as the
    strand (compared with 1 and -1).  Items of *_t* must be tuples, since
    the result is built with ``peak+(...)``.  ``thresholdInter``,
    ``thresholdPromot`` and ``thresholdUTR`` are not defined here; they are
    resolved in the surrounding scope.

    :param _t: stream of peaks to annotate.
    :param _a: stream of annotations (genes).
    """
    F = [] # annotations read so far that may still be close to a peak
    _a = common.sentinelize(_a, [sys.maxint]*len(_a.fields))
    for peak in _t:
        distMinBefore = distMinAfter = thresholdInter+1
        gene = dist = typeLoc = ""
        geneBefore = geneAfter = strandBefore = strandAfter = None
        included = 0
        # keep only genes which don't start too far
        for annot in _a:
            F.append(annot)
            if annot[0] > peak[1]+thresholdInter: break
        # remove genes that end too far
        fpop = -1 # always keep one gene before
        for annot in F:
            if annot[1] > peak[0]-thresholdInter: break
            fpop += 1
        if fpop>0: F = F[fpop:]
        for annot in F:
            # if the peak is totally included in the gene
            if (peak[0]>=annot[0]) and (annot[1]>=peak[1]):
                includedGene = annot[2]
                # Conditional expression instead of `cond and a or b`: the
                # latter returned the wrong branch when the strand is -1 and
                # annot[1]-peak[1] is 0.
                includedDist = annot[1]-peak[1] if annot[3] == -1 else peak[0]-annot[0]
                included = 1
            # if the gene is totally included in the peak
            elif (annot[0]>peak[0]) and (peak[1]>annot[1]):
                includedGene = annot[2]
                includedDist = 0
                included = 1
            else:
                # if annot is not too far 3' and no intersection
                if 0 < (peak[0]-annot[1]) < distMinBefore:
                    distMinBefore = peak[0]-annot[1]
                    geneBefore = annot[2]
                    strandBefore = annot[3]
                # if intersection (annot is before)
                elif annot[0] < peak[0] < annot[1]:
                    distMinBefore = 0
                    geneBefore = annot[2]
                    strandBefore = annot[3]
                # if annot is not too far 5' and no intersection
                if 0 < (annot[0]-peak[1]) < distMinAfter:
                    distMinAfter = annot[0]-peak[1]
                    geneAfter = annot[2]
                    strandAfter = annot[3]
                # if intersection (annot is after)
                elif annot[0] < peak[1] < annot[1]:
                    distMinAfter = 0
                    geneAfter = annot[2]
                    strandAfter = annot[3]
        # detect intergenic peak
        if not(included) and distMinBefore > thresholdInter and distMinAfter > thresholdInter:
            yield peak+('','Intergenic',thresholdInter)
            continue
        # detect peak before the first or after the last gene on the chromosome
        if geneBefore is None:
            if distMinAfter <= thresholdInter:
                gene = geneAfter
                dist = distMinAfter
                typeLoc = "Upstream" if strandAfter == 1 else "Downstream"
        elif geneAfter is None:
            if distMinBefore <= thresholdInter:
                gene = geneBefore
                dist = distMinBefore
                typeLoc = "Upstream" if strandBefore == -1 else "Downstream"
        # detect peak between two genes on the same strand
        elif strandBefore == strandAfter:
            if strandBefore == 1:
                if thresholdUTR*distMinAfter > 100*distMinBefore:
                    gene = geneBefore
                    dist = distMinBefore
                    if distMinAfter < thresholdPromot:
                        typeLoc = "3UTR"
                    else:
                        typeLoc = "Downstream"
                else:
                    gene = geneAfter
                    dist = distMinAfter
                    if dist < thresholdPromot:
                        typeLoc = "Promot"
                    else:
                        typeLoc = "Upstream"
            else:
                if thresholdUTR*distMinBefore > 100*distMinAfter:
                    gene = geneAfter
                    dist = distMinAfter
                    if distMinBefore < thresholdPromot:
                        typeLoc = "3UTR"
                    else:
                        typeLoc = "Downstream"
                else:
                    gene = geneBefore
                    dist = distMinBefore
                    if dist < thresholdPromot:
                        typeLoc = "Promot"
                    else:
                        typeLoc = "Upstream"
        # detect peak between two genes on different strands
        else:
            # detect peak between 2 promoters
            if strandBefore == -1:
                typeLoc = "Upstream"
                if distMinBefore < distMinAfter:
                    gene = geneBefore
                    dist = distMinBefore
                    if dist < thresholdPromot:
                        typeLoc = "Promot"
                        if distMinAfter < thresholdPromot:
                            typeLoc += "_Promot"
                            gene += "_"+geneAfter
                            dist = str(dist)+"_"+str(distMinAfter)
                else:
                    gene = geneAfter
                    dist = distMinAfter
                    if dist < thresholdPromot:
                        typeLoc = "Promot"
                        if distMinBefore < thresholdPromot:
                            typeLoc += "_Promot"
                            gene += "_"+geneBefore
                            dist = str(dist)+"_"+str(distMinBefore)
            # detect peak between 2 3UTR
            else:
                typeLoc = "Downstream"
                # detect peak overlapping the 2 3UTR
                if distMinBefore == distMinAfter:
                    if thresholdUTR*thresholdPromot > 100*distMinBefore:
                        typeLoc = "3UTR"
                    typeLoc += "_"+typeLoc
                    gene = geneBefore+"_"+geneAfter
                    dist = str(distMinBefore)+"_"+str(distMinAfter)
                elif distMinBefore < distMinAfter:
                    dist = distMinBefore
                    gene = geneBefore
                    if thresholdUTR*thresholdPromot > 100*dist:
                        typeLoc = "3UTR"
                else:
                    dist = distMinAfter
                    gene = geneAfter
                    if thresholdUTR*thresholdPromot > 100*dist:
                        typeLoc = "3UTR"
        # append the including/included gene to whatever was found around the peak
        if included == 1:
            gene += "_"+includedGene if gene else includedGene
            dist = str(dist)
            dist = dist+"_"+str(includedDist) if dist else str(includedDist)
            typeLoc += "_Included" if typeLoc else "Included"
        yield peak+(gene,typeLoc,dist)