def getPerReadAverage(args, in_fh):
    """Print one tab-separated per-read summary row to ``args.out``.

    Each line of ``in_fh`` is decoded from ASCII when it supports
    ``.decode`` and is then parsed with ``MethRead``.  Calls whose ``seq``
    contains ``args.exclude`` are discarded.  Of the remaining calls, those
    equal to ``-1`` (unconfident) are left out of the call average; the
    ratio average is taken over every call that survived the ``seq`` filter.
    Reads with no confident call left produce no output.

    Output columns: rname, start, end, qname, mean ratio, ".", mean call,
    number of confident calls.
    """
    for line in in_fh:
        try:
            line = line.decode('ascii')
        except AttributeError:
            # Already text (no ``.decode`` method); use the line as-is.
            pass
        read = MethRead(line)
        methcalls = list(read.calldict.values())

        # Indices of calls whose sequence does not contain the excluded motif.
        keep = [
            idx for idx, mc in enumerate(methcalls)
            if args.exclude not in mc.seq
        ]
        kept_calls = np.array([mc.call for mc in methcalls])[keep]
        ratios = np.array([mc.ratio for mc in methcalls])[keep]

        # -1 marks an unconfident call; leave those out of the call average.
        confident = [c for c in kept_calls if c != -1]
        if not confident:
            continue

        row = [
            read.rname,
            read.start,
            read.end,
            read.qname,
            np.mean(ratios),
            '.',
            np.mean(confident),
            len(confident),
        ]
        print('\t'.join(map(str, row)), file=args.out)
def read_tabix(fpath, window):
    """Fetch the records overlapping ``window`` and group them by read name.

    Args:
        fpath: Path to the tabix-indexed file opened with
            ``pysam.TabixFile``.
        window: Region passed straight through to ``TabixFile.fetch``.

    Returns:
        A dict mapping each ``qname`` to the list of ``MethRead`` objects
        carrying that name, in the order they were fetched.  A name can
        occur in more than one record, hence the list.
    """
    with pysam.TabixFile(fpath) as tabix:
        entries = list(tabix.fetch(window))
    reads = [MethRead(entry) for entry in entries]

    rdict = dict()
    for read in reads:
        # setdefault creates the list on first sight of a qname; this
        # replaces a bare ``except:`` that would also have hidden unrelated
        # errors (including KeyboardInterrupt).
        rdict.setdefault(read.qname, []).append(read)
    return rdict
def read_tabix(fpath, window):
    """Fetch the records overlapping ``window`` and merge call arrays per read.

    Args:
        fpath: Path to the tabix-indexed file opened with
            ``pysam.TabixFile``.
        window: Region passed straight through to ``TabixFile.fetch``.

    Returns:
        A dict mapping each ``qname`` to its ``callarray``.  When a read
        name appears in several records (split reads), the arrays are
        joined along axis 0 in fetch order.
    """
    with pysam.TabixFile(fpath) as tabix:
        records = list(tabix.fetch(window))
    parsed = [MethRead(record) for record in records]

    calls_by_read = {}
    for meth in parsed:
        name = meth.qname
        if name not in calls_by_read:
            calls_by_read[name] = meth.callarray
        else:
            # Split read: extend the rows already stored under this name.
            calls_by_read[name] = np.append(
                calls_by_read[name], meth.callarray, 0
            )
    return calls_by_read
def getReadlevel(args, in_fh):
    """Print one tab-separated row per call of every read to ``args.out``.

    Each line of ``in_fh`` is decoded from ASCII when it supports
    ``.decode`` and is then parsed with ``MethRead``.  Calls are emitted in
    sorted order of their ``calldict`` key.  A row is the read's ``rname``
    followed by the stringified items of the call, with the read's
    ``qname`` inserted at index 3.
    """
    for line in in_fh:
        try:
            line = line.decode('ascii')
        except AttributeError:
            # Already text (no ``.decode`` method); use the line as-is.
            pass
        read = MethRead(line)
        for key in sorted(read.calldict):
            fields = [read.rname, *(str(item) for item in read.calldict[key])]
            # Splice the read name in as the fourth column; slicing clamps
            # the same way ``list.insert`` does on short rows.
            row = fields[:3] + [read.qname] + fields[3:]
            print("\t".join(row), file=args.out)
def readlevelHeatmap(datapath, reg, thr=0.1, window=10, verbose=False):
    """Build and plot a read-level heatmap for one region.

    Args:
        datapath: Passed to ``tabix`` together with the region coordinate.
        reg: Region description handed to ``HeatmapRegion``.
        thr: Forwarded to ``heat.plot``.
        window: Forwarded to ``heat.makeMatrix`` and ``heat.plot``.
        verbose: When true, progress information is written to stderr.

    Returns:
        Whatever ``HeatmapRegion.plot`` returns.

    Only reads that span the whole region (start at or before the region
    start, end at or after the region end) are added to the heatmap.
    """
    heat = HeatmapRegion(reg)
    if verbose:
        print(heat.coord, file=sys.stderr)

    data = tabix(datapath, heat.coord)
    for line in data:
        read = MethRead(line)
        spans_region = read.start <= heat.start and read.end >= heat.end
        if spans_region:
            heat.addread(read)

    if verbose:
        summary = "{} reads covering the entire region out of total {} in the region".format(
            heat.totreads, len(data))
        print(summary, file=sys.stderr)

    heat.makeMatrix(window)
    g = heat.plot(thr, window)
    if verbose:
        print(heat.title, file=sys.stderr)
    return g
def getFreq(args, in_fh):
    """Accumulate per-site statistics over the reads and print them.

    Each line of ``in_fh`` is decoded from ASCII when it supports
    ``.decode`` and is then parsed with ``MethRead``.  Sites are held in a
    dict keyed by the entries of ``read.keys``.  Before a read is merged,
    pending sites are flushed with ``printFreq(args.motif, args.out)`` and
    removed:

    * all of them, when the read's ``rname`` differs from that of the
      lowest pending site;
    * otherwise those whose key sorts before the read's first key.

    Whatever is still pending after the last line is printed in sorted key
    order.  With ``args.verbose`` set, progress goes to stderr.

    NOTE(review): the incremental flush presumably relies on the input
    being sorted by position — confirm against the caller.
    """
    if args.verbose:
        print("getting frequency", file=sys.stderr)

    sites = {}
    n_lines = 0
    for line in in_fh:
        try:
            line = line.decode('ascii')
        except AttributeError:
            # Already text (no ``.decode`` method); use the line as-is.
            pass
        n_lines += 1
        read = MethRead(line)

        pending = sorted(sites)
        try:
            if read.rname != sites[pending[0]].rname:
                # Different chromosome: everything pending is complete.
                n_flush = len(pending)
            else:
                # Sites strictly before the read's first key are complete.
                n_flush = bisect.bisect_left(pending, read.keys[0])
        except IndexError:
            # Nothing pending yet, or the read has no keys: nothing to flush.
            n_flush = 0

        for key in pending[:n_flush]:
            sites[key].printFreq(args.motif, args.out)
            del sites[key]

        for key in read.keys:
            if key not in sites:
                sites[key] = SiteStats(read.calldict[key], read.rname)
            sites[key].update(read.calldict[key])

        if args.verbose and n_lines % 10000 == 0:
            print("parsed {} lines".format(n_lines), file=sys.stderr)

    # Emit the sites that were still pending at end of input.
    for key in sorted(sites):
        sites[key].printFreq(args.motif, args.out)
def read_tabix(fpath, window):
    """Fetch the records overlapping ``window``, split by CG / GC tag.

    Args:
        fpath: Path to the tabix-indexed file opened with
            ``pysam.TabixFile``.
        window: Region passed straight through to ``TabixFile.fetch``.

    Returns:
        A ``(cgdict, gcdict)`` tuple.  Each dict maps ``qname`` to that
        read's ``callarray``.  Arrays of records sharing a read name (split
        reads) are joined along axis 0 in fetch order.  A record goes to
        ``cgdict`` when its last field is ``"CG"`` and to ``gcdict`` when
        it is ``"GC"``; records with any other last field are skipped.
    """
    with pysam.TabixFile(fpath) as tabix:
        records = list(tabix.fetch(window))
    parsed = [MethRead(record) for record in records]

    cgdict = {}
    gcdict = {}
    for meth in parsed:
        name = meth.qname
        tag = meth.fields[-1]
        if tag == "CG":
            target = cgdict
        elif tag == "GC":
            target = gcdict
        else:
            continue

        if name not in target:
            target[name] = meth.callarray
        else:
            # Split read: extend the rows already stored under this name.
            target[name] = np.append(target[name], meth.callarray, 0)
    return cgdict, gcdict