    return max(groups, key=_auxfun)[0]

print(files)

br = []
rr = []
bc = 0  # count bouke
rc = 0  # count raoul
for x in files:
    inter = InterLap()
    b = tgt.read_textgrid(mypath + x + "B.TextGrid").tiers[1]
    r = tgt.read_textgrid(mypath + x + "R.TextGrid").tiers[1]
    bc += len(b)
    rc += len(r)
    inter.add([convert_to_float(i) for i in r])
    tot_overlaps = set()
    for i in b:
        interval = convert_to_float(i)
        overlaps = list(inter.find(interval))
        #print(interval[2])
        if len(overlaps) > 0:
            overlaps = [tuple(o) for o in overlaps]
            for o in overlaps:
                tot_overlaps.add(o)
            rr.append(most_common([o[2] for o in overlaps]))
            br.append(interval[2])
        else:
def read_exons(gtf, chrom, cutoff, coverage_array, exclude):
    genes = defaultdict(IntervalSet)
    splitters = defaultdict(IntervalSet)
    interlaps = []
    split_iv = InterLap()
    # preempt any bugs by checking that we are getting a particular chrom
    assert gtf[0] == "|", ("expecting a tabix query so we can handle chroms correctly")
    #f1 = open("selfchaincut.txt","a")
    #f2 = open("segdupscut.txt","a")
    #f3 = open("coveragecut.txt","a")
    for bed in exclude:
        # expecting a tabix query so we can handle chroms correctly
        a = "|tabix {bed} {chrom}".format(chrom=chrom, bed=bed)
        # any file that gets sent in will be used to split regions (just like
        # low-coverage). For example, we split on self-chains as well.
        #TODO: comment this block if you don't want any filtering by self-chains or segdups
        for toks in (x.strip().split("\t") for x in ts.nopen(a)):
            # adds self chains and segdups to the splitters list, so that exons
            # can be split and these regions removed from CCRs
            s, e = int(toks[1]), int(toks[2])
            split_iv.add((s, e))
            #if len(toks) > 3:
            #    f1.write("\t".join(toks)+"\n") # self chain
            #else:
            #    f2.write("\t".join(toks)+"\n") # segdups

    for toks in (x.rstrip('\r\n').split("\t") for x in ts.nopen(gtf) if x[0] != "#"):
        if toks[2] not in ("CDS", "stop_codon") or toks[1] not in ("protein_coding",):
            continue
        #if toks[0] != "1": break
        start, end = map(int, toks[3:5])
        gene = toks[8].split('gene_name "')[1].split('"', 1)[0]
        assert start <= end, toks
        key = toks[0], gene

        #cutoff = 0.3

        # find sections of the exon under a certain coverage.
        #TODO: comment this if we don't want coverage cutoff filtering
        # only runs these operations if at least one bp is below the cutoff
        if coverage_array[start - 1:end].min() < cutoff:
            #splitters[key].add([(start - 1, end)])  # this takes out the whole exon for one section of poor coverage
            a = coverage_array[start - 1:end]
            #print str(start-1), end, a
            is_under, locs = False, []  # generates "locs" for each exon
            if a[0] < cutoff:
                locs.append([start - 1])
                is_under = True  # so you can initialize is_under
            # enumerate positions in the coverage array starting at the beginning of the exon
            for pos, v in enumerate(a[1:], start=start):
                if v < cutoff:
                    if not is_under:
                        is_under = True
                        # start; coverage is in BED format, so pos-1 is necessary,
                        # since splitters are open on the left and right side
                        locs.append([pos - 1])
                else:
                    if is_under:
                        is_under = False
                        locs[-1].append(pos)  # end
            if is_under:
                locs[-1].append(end)  # in this case the splitter ends at the end of the exon
            splitters[key].add(map(tuple, locs))
            #for i in locs:
            #    f3.write(chrom+"\t"+"\t".join(map(str,i))+"\n")

        for s, e in split_iv.find((start - 1, end)):
            splitters[key].add([(s, e)])

        # converts GTF exon coordinates to BED format (subtracts 1 from exon start)
        genes[key].add([(start - 1, end)])

    # sort by start so we can do binary search.
    genes = dict((k, sorted(v._vals)) for k, v in genes.iteritems())
    #ends = dict((k, sorted(v)) for k, v in ends.iteritems())
    splits, starts, ends = {}, {}, {}
    splitters = dict(splitters)
    for chrom_gene, sends in genes.iteritems():
        starts[chrom_gene] = [s[0] for s in sends]
        ends[chrom_gene] = [s[1] for s in sends]
        if chrom_gene in splitters:
            splits[chrom_gene] = splitters[chrom_gene]._vals

    return starts, ends, splits
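# --- illustrative sketch, not part of the original pipeline ------------------
# The low-coverage scan inside read_exons builds "locs": boundary pairs around
# runs of bases below the cutoff (open on both sides, per the comment above).
# The helper below reproduces that loop in isolation so it can be sanity-checked
# on a toy array; the name low_coverage_runs and the example values are
# assumptions for illustration only.
import numpy as np

def low_coverage_runs(coverage_array, start, end, cutoff):
    """Boundary pairs around sub-cutoff runs in coverage_array[start-1:end]."""
    a = coverage_array[start - 1:end]
    is_under, locs = False, []
    if a[0] < cutoff:
        locs.append([start - 1])
        is_under = True
    for pos, v in enumerate(a[1:], start=start):
        if v < cutoff:
            if not is_under:
                is_under = True
                locs.append([pos - 1])
        else:
            if is_under:
                is_under = False
                locs[-1].append(pos)
    if is_under:
        locs[-1].append(end)
    return [tuple(l) for l in locs]

# e.g. low_coverage_runs(np.array([0.9, 0.1, 0.1, 0.8, 0.2]), 1, 5, 0.3)
# returns [(0, 3), (3, 5)]: positions 1-2 and 4 (0-based) are below the cutoff.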