def __init__(self, sj, me, filepre, depth=500, maxcnt=10000):
    """Initialise the junction graph and add same-strand exon-overlap links.

    Args:
        sj: junction table, forwarded unchanged to MEGraph3.__init__.
        me: exon table; must provide columns chr, st, ed, strand, _id.
        filepre: path prefix used for the three temporary files
            (filepre+'ex1.txt.gz', filepre+'ex2.txt.gz', filepre+'ov.txt.gz').
        depth: forwarded to MEGraph3.__init__.
        maxcnt: forwarded to MEGraph3.__init__.

    Attributes set:
        pre: filepre.
        ov: table read back from the intersect output.
        ov1: rows of ov where _id != b__id and strand == b_strand.
        eoe: dict _id => list of b__id built from ov1.
    """
    MEGraph3.__init__(self, sj, me, depth, maxcnt)
    self.pre = filepre
    a = filepre + 'ex1.txt.gz'
    b = filepre + 'ex2.txt.gz'
    c = filepre + 'ov.txt.gz'
    cols0 = ['chr', 'st', 'ed', 'strand', '_id']
    try:
        # calculate exon overlap to self
        # single cell data contains float in st,ed in ex ???
        me = UT.check_int_nan(me)
        a = UT.write_pandas(me[cols0], a, '')
        b = UT.write_pandas(me[cols0], b, '')
        # presumably wraps `bedtools intersect -wao` -- confirm in BT
        c = BT.bedtoolintersect(a, b, c, wao=True)
        cols1 = cols0 + ['b_' + x for x in cols0] + ['ovl']
        self.ov = ov = UT.read_pandas(c, names=cols1)
        # select same strand overlap to non-self
        self.ov1 = ov1 = ov[(ov['_id'] != ov['b__id']) &
                            (ov['strand'] == ov['b_strand'])]
        # make connected dictionary _id => [b__id's]
        tmp = ov1.groupby('_id')['b__id'].apply(
            lambda x: list(x)).reset_index()
        if 'index' in tmp.columns:
            # the grouping key comes back as 'index' in this case; restore _id
            tmp['_id'] = tmp['index']
        self.eoe = dict(UT.izipcols(tmp, ['_id', 'b__id']))
    finally:
        # cleanup: always remove whatever temporary files were created, even
        # when one of the steps above failed part-way through
        for path in (a, b, c):
            if os.path.exists(path):
                os.unlink(path)
def __init__(self, mg, se=None):
    """Keep references to the graph and build exon/gene lookup tables.

    Args:
        mg: graph object exposing an `exons` table with columns
            _id, _gidx and gname.
        se: stored as-is on the instance (default None).

    Attributes set:
        mg, se: the arguments.
        ex: mg.exons re-indexed by _id.
        genes: groupby('_gidx').groups of `ex`.
        i2g: dict _id => gname.
    """
    exons = mg.exons
    by_id = exons.set_index('_id')
    self.mg = mg
    self.ex = by_id
    # Group on the _id-indexed table: .groups maps each _gidx to index
    # labels, and the index here is _id, so the values are exon _ids.
    self.genes = by_id.groupby('_gidx').groups
    self.se = se
    id_to_gname = UT.izipcols(exons, ['_id', 'gname'])
    self.i2g = dict(id_to_gname)
def calc_ecov_mp(covci, fname, np, blocksize=100):
    """Calculate per-exon coverage chromosome by chromosome.

    WARNING: this assumes _id is assigned according to sorted (chr,st,ed)

    Args:
        covci: interval table with columns chr, st, ed, name (comma-joined
            exon ids). Columns name1, eidmax and eidmin are added IN PLACE
            when missing.
        fname: output path; when None nothing is written.
        np: number of worker processes; 1 runs everything in-process.
        blocksize: forwarded to calc_ecov_chrom.

    Returns:
        dict eid => ecov when fname is None, otherwise the DataFrame
        (columns eid, chr, st, ed, ecov) that was written to fname.
    """
    LOG.debug('calc_ecov...')
    chroms = sorted(covci['chr'].unique())
    if 'name1' not in covci.columns:
        covci['name1'] = covci['name'].astype(str).apply(
            lambda x: [int(y) for y in x.split(',')])
    if 'eidmax' not in covci.columns:
        covci['eidmax'] = covci['name1'].apply(lambda x: max(x))
    if 'eidmin' not in covci.columns:
        covci['eidmin'] = covci['name1'].apply(lambda x: min(x))
    args = [(covci[covci['chr'] == c].copy(), blocksize) for c in chroms]
    e2cs = {}
    if np == 1:
        for arg in args:
            e2cs.update(calc_ecov_chrom(*arg))
    else:
        # Create the pool before entering try: if construction fails there is
        # nothing to close, and the original error is not masked by a
        # NameError on `p` in the finally clause.
        p = multiprocessing.Pool(np)
        try:
            rslts = p.map(mp_worker, zip(repeat(calc_ecov_chrom), args))
        finally:
            LOG.debug('closing pool')
            p.close()
            p.join()  # wait for the workers to exit
        for x in rslts:
            e2cs.update(x)
    LOG.debug('writing rslts...')
    if fname is None:
        return e2cs
    # one row per (interval, exon id) to recover each exon's chr/st/ed
    ccf = UT.flattendf(covci, 'name1')
    ccfg = ccf.groupby('name1')
    e2chr = dict(
        UT.izipcols(ccfg['chr'].first().reset_index(), ['name1', 'chr']))
    e2st = dict(UT.izipcols(ccfg['st'].min().reset_index(), ['name1', 'st']))
    e2ed = dict(UT.izipcols(ccfg['ed'].max().reset_index(), ['name1', 'ed']))
    df = PD.DataFrame(e2cs, index=['ecov']).T
    df.index.name = 'eid'
    df = df.reset_index()
    df['chr'] = [e2chr[x] for x in df['eid']]
    df['st'] = [e2st[x] for x in df['eid']]
    df['ed'] = [e2ed[x] for x in df['eid']]
    UT.save_tsv_nidx_whead(df[['eid', 'chr', 'st', 'ed', 'ecov']], fname)
    return df
def _gen():
    """Expand each row of the enclosing `df` into one record per block.

    Each row (read via `cols`) carries comma-separated block sizes in its
    third-from-last field and block offsets in its second-from-last field.
    For every block with a non-negative offset a fresh 7-item list is
    yielded: fields 0, 3, 4, 5 and the last field are copied from the row,
    while items 1 and 2 become row[1] + offset and row[1] + offset + size.
    """
    for row in UT.izipcols(df, cols):
        head = row[0]
        tail = [row[3], row[4], row[5], row[-1]]
        sizes = [int(tok) for tok in row[-3].split(',')]
        offsets = [int(tok) for tok in row[-2].split(',')]
        for offset, size in zip(offsets, sizes):
            if offset < 0:
                continue
            start = row[1] + offset
            yield [head, start, start + size] + tail
def __init__(self, sjexpre, th=0.1):
    """Load the junction/exon tables for a prefix and precompute lookups.

    Args:
        sjexpre: path prefix; sjexpre+'.sj.txt.gz' and sjexpre+'.ex.txt.gz'
            are read.
        th: threshold stored on the instance (default 0.1).

    Attributes set:
        sjexpre, th, sj, ex, mg (GP.MEGraph3 of sj and ex), exg (ex indexed
        by _gidx), exi (ex indexed by _id), nullidx, e2c (dict _id => cat).
        Finishes by calling self.precalc_branch_p().
    """
    self.sjexpre = sjexpre
    self.th = th

    junctions = UT.read_pandas(sjexpre + '.sj.txt.gz')
    self.sj = junctions
    exons = UT.read_pandas(sjexpre + '.ex.txt.gz')
    self.ex = exons

    # only consider splice junction connections
    graph = GP.MEGraph3(junctions, exons)
    self.mg = graph

    self.exg = exons.set_index('_gidx')
    self.exi = exons.set_index('_id')
    self.nullidx = UT.find_nullidx(exons)
    self.e2c = dict(UT.izipcols(exons, ['_id', 'cat']))
    self.precalc_branch_p()
def _egen():
    """Yield one exon record per exon for every transcript in the enclosing `kg`.

    Records are tuples (chr, st, ed, name, 0, strand, kind) where kind is
        's' : the only exon of a single-exon transcript
        '5' : first exon on '+' strand / last exon otherwise
        'i' : internal exon
        '3' : last exon on '+' strand / first exon otherwise
    """
    fields = ['chr', 'name', 'strand', '_ests', '_eeds']
    for chrom, tname, strand, est, eed in UT.izipcols(kg, fields):
        if len(est) == 1:
            # Use this transcript's own coordinates. The previous code yielded
            # `st, ed`, which are either unbound (first transcript) or left
            # over from the internal-exon loop of an earlier transcript.
            yield (chrom, est[0], eed[0], tname, 0, strand, 's')
            continue
        # the leftmost exon is the 5' end only on the '+' strand
        if strand == '+':
            left_kind, right_kind = '5', '3'
        else:
            left_kind, right_kind = '3', '5'
        yield (chrom, est[0], eed[0], tname, 0, strand, left_kind)
        for st, ed in izip(est[1:-1], eed[1:-1]):
            yield (chrom, st, ed, tname, 0, strand, 'i')
        yield (chrom, est[-1], eed[-1], tname, 0, strand, right_kind)
def calc_glen(ex, cipath):
    """Sum, per gene, the lengths of the chopped intervals its exons touch.

    Args:
        ex: exon table with columns _id and _gidx. When it has no 'cid'
            column, one is added IN PLACE holding, for each exon, the list
            of interval ids whose name field mentions that exon id.
        cipath: path of the chopped-interval bed file
            (5 col bed, name: comma-joined eids, sc1: cid).

    Returns:
        dict _gidx => summed length, each (cid, _gidx) pair counted once.
    """
    intervals = GGB.read_bed(cipath)
    intervals['len'] = intervals['ed'] - intervals['st']
    intervals['cid'] = intervals['sc1']
    cid_to_len = dict(UT.izipcols(intervals, ['cid', 'len']))

    if 'cid' not in ex.columns:
        eid_to_cids = {}
        for cid, names in intervals[['cid', 'name']].values:
            for token in names.split(','):
                eid_to_cids.setdefault(int(token), []).append(cid)
        ex['cid'] = [eid_to_cids[eid] for eid in ex['_id']]

    # unique (cid, gene) combinations so shared intervals count once per gene
    pairs = set()
    for gidx, cids in UT.izipcols(ex, ['_gidx', 'cid']):
        for cid in cids:
            pairs.add((cid, gidx))

    table = PD.DataFrame(list(pairs), columns=['cid', '_gidx'])
    table['len'] = [cid_to_len[cid] for cid in table['cid']]
    per_gene = table.groupby('_gidx')['len'].sum()
    return dict(zip(per_gene.index, per_gene.values))
def _gen():
    """Yield a (cid, _gidx) tuple for every cid listed on every row of the enclosing `ex`."""
    rows = UT.izipcols(ex, ['_gidx', 'cid'])
    flattened = ((cid, gidx) for gidx, cid_list in rows for cid in cid_list)
    for pair in flattened:
        yield pair
def _set_df2prop(src, tgt, default):
    """Copy column `src` of the enclosing `df2` into column `tgt` of the enclosing `df`.

    Rows are matched on _gidx; a _gidx of `df` that does not occur in `df2`
    receives `default`.
    """
    lookup = dict(UT.izipcols(df2, ['_gidx', src]))
    df[tgt] = [
        lookup[gidx] if gidx in lookup else default
        for gidx in df['_gidx']
    ]
def calc_gcov(expath, cipath, bwpath, dstprefix, override=False, np=4):
    """Calculate gene coverages.

    Args:
        expath: merged ex
        cipath: chopped interval for ex
        bwpath: bigwig file (sample)
        dstprefix: prefix for outputs (used verbatim; no separator is added)
        override: forwarded to UT.notstale when deciding whether an existing
            covci file can be reused
        np: number of processes handed to calc_cov_mp

    Outputs:
        1. dstprefix+'covci.txt.gz'
        2. dstprefix+'gcov.txt.gz' : DataFrame(col:_gidx,gcov)

    Returns:
        The DataFrame written to the gcov file (columns _gidx, gcov).

    Note:
        The following per-gene values are also computed but are currently
        NOT included in the saved/returned table:
        chr, st, ed, len, val, glen, len2, gcov2, cids, st2, ed2, glen2
          len2: length calculated from ci with cov > 0
                (normal length = use entire ci's belonging to the gene)
          gcov2 = val/len2
          cids: cid with cov > 0 for the gene ','.joined
    """
    ex = UT.read_pandas(expath)
    covcipath = dstprefix + 'covci.txt.gz'
    gcovpath = dstprefix + 'gcov.txt.gz'

    if UT.notstale([expath, cipath], covcipath, override):
        cc = UT.read_pandas(covcipath)
    else:
        if UT.notstale(expath, cipath, False):
            ci = UT.read_pandas(
                cipath, names=['chr', 'st', 'ed', 'name', 'id'])
        else:
            ci = UT.chopintervals(ex, cipath, idcol='_id')
        cc = calc_cov_mp(ci, bwpath, covcipath, np=np)

    if 'id' not in cc.columns:
        cc['id'] = cc['sc1']
    if 'eid' not in cc.columns:
        cc['eid'] = cc['name'].astype(str).apply(
            lambda x: [int(y) for y in x.split(',')])
    cc['len'] = cc['ed'] - cc['st']
    cc['val'] = cc['cov'] * cc['len']
    ccf = UT.flattendf(cc[['id', 'eid', 'len', 'val', 'st', 'ed']], 'eid')
    e2g = dict(UT.izipcols(ex, ['_id', '_gidx']))
    ccf['_gidx'] = [e2g[x] for x in ccf['eid']]

    # for normal gcov: take unique combination of (gid, id) (id=cid)
    # for gcov2 : first select ccf with val>0
    ccf2 = ccf[ccf['val'] > 0].groupby(['_gidx', 'id']).first().reset_index()
    ccf2g = ccf2.groupby('_gidx')
    df2 = ccf2g[['len', 'val']].sum()
    df2['gcov2'] = df2['val'] / df2['len']
    df2['cids'] = ccf2g['id'].apply(lambda x: ','.join([str(y) for y in x]))
    df2['gst2'] = ccf2g['st'].min()
    df2['ged2'] = ccf2g['ed'].max()
    df2['glen2'] = df2['ged2'] - df2['gst2']
    df2 = df2.reset_index()

    ccf1 = ccf.groupby(['_gidx', 'id']).first().reset_index()
    ccf1g = ccf1.groupby('_gidx')
    df = ccf1g[['len', 'val']].sum()
    df['gcov'] = df['val'] / df['len']
    df['st'] = ccf1g['st'].min()
    df['ed'] = ccf1g['ed'].max()
    df['glen'] = df['ed'] - df['st']
    df = df.reset_index()
    g2chr = dict(UT.izipcols(ex, ['_gidx', 'chr']))
    df['chr'] = [g2chr[x] for x in df['_gidx']]

    def _set_df2prop(src, tgt, default):
        # copy df2[src] into df[tgt] matched on _gidx; genes without any
        # covered interval are absent from df2 and get `default`
        dic = dict(UT.izipcols(df2, ['_gidx', src]))
        df[tgt] = [dic.get(x, default) for x in df['_gidx']]

    _set_df2prop('gcov2', 'gcov2', 0)
    _set_df2prop('len', 'len2', 0)
    _set_df2prop('cids', 'cids', '')
    _set_df2prop('gst2', 'st2', -1)
    _set_df2prop('ged2', 'ed2', -1)
    _set_df2prop('glen2', 'glen2', 0)

    # Only the gene id and its coverage are written; see the docstring for
    # the additional columns that are available on df at this point.
    cols = ['_gidx', 'gcov']
    df = df[cols]
    UT.save_tsv_nidx_whead(df, gcovpath)
    return df
def ex_d_e2p(self, eid):
    """Return {e_id_a: p} for the rows grouped under `eid` in self.d2ep.

    Args:
        eid: group key looked up with self.d2ep.get_group.

    Returns:
        dict mapping each e_id_a of the group to its p value, or an empty
        dict when self.d2ep has no group for `eid`.
    """
    try:
        group = self.d2ep.get_group(eid)
    except KeyError:
        # no group for this key; only this expected miss is treated as
        # "empty" so that genuine errors are no longer silently swallowed
        return {}
    return dict(UT.izipcols(group, ['e_id_a', 'p']))
def _igen():
    """Yield one 'j' record for each gap between consecutive exons.

    For every transcript row of the enclosing `kg`, the end of exon k is
    paired with the start of exon k+1 and emitted as
    (chr, end + 1, next_start, name, 0, strand, 'j').
    """
    fields = ['chr', 'name', 'strand', '_ests', '_eeds']
    for chrom, tname, strand, est, eed in UT.izipcols(kg, fields):
        left_ends = eed[:-1]
        right_starts = est[1:]
        for left_end, right_start in izip(left_ends, right_starts):
            yield (chrom, left_end + 1, right_start, tname, 0, strand, 'j')
def find_genes4(sj, ae, filepre, cachename=None, np=1, override=False,
                depth=500, separatese=True):
    """
    Adds _gidx column to ae
    Connection: 1) by junctions, 2) by overlap in the same strand

    Both tables are modified IN PLACE: ae and sj each receive _gidx and gname
    columns (plus _id / cat / a_id related columns when those are missing).

    Args:
        sj: junction table.
        ae: exon table.
        filepre: forwarded to mcore_allcomponents4.
        cachename: optional pickle path; when it exists and override is
            False the components are loaded from it, otherwise they are
            computed and written to it.
        np: forwarded to mcore_allcomponents4.
        override: recompute even when the cache file exists.
        depth: forwarded to mcore_allcomponents4.
        separatese: when True, UT.mese splits ae and every exon of the
            second part becomes its own single-exon gene.

    Returns genes [set([_id,..]), ...]
    """
    if '_id' not in ae.columns:
        LOG.info('setting ex _id...')
        UT.set_ids(ae)
    if '_id' not in sj.columns:
        LOG.info('setting sj _id...')
        UT.set_ids(sj)
    if 'cat' not in ae.columns:
        UT.set_exon_category(sj, ae)
    if 'a_id' not in ae.columns:
        UT.set_ad_info(sj, ae)

    ### FIND GENES
    if cachename and os.path.exists(cachename) and not override:
        LOG.info('loading cached genes (connected components)...')
        # NOTE: pickle must only be pointed at cache files this code wrote
        with open(cachename, 'rb') as fobj:
            genes = pickle.load(fobj)
    else:
        LOG.info('finding genes (connected components)...')
        _sttime = time.time()
        if separatese:
            me, se = UT.mese(ae)
            genes = mcore_allcomponents4(sj, me, filepre, np, depth=depth)
            # SE genes
            genes += [set([x]) for x in se['_id']]
        else:
            genes = mcore_allcomponents4(sj, ae, filepre, np, depth=depth)
        # version 4 graph: uses overlaps in addition to junctions to connect
        # genes = [set([_id's]),...]
        if cachename:
            UT.makedirs(os.path.dirname(cachename))
            with open(cachename, 'wb') as fobj:
                pickle.dump(genes, fobj)
        LOG.info(' time: {0:.3f}s'.format(time.time() - _sttime))

    ### WRITE EXONS W/ GENE number
    LOG.info('assigning gidx...')
    i2g = {}  # eid => _gidx
    i2gn = {}  # eid => gname
    g2gn = {}  # _gidx => gname
    i2s = dict(UT.izipcols(ae, ['_id', 'strand']))  # eid => strand
    s2n = {'+': 'P', '-': 'N', '.': '', '.+': '', '.-': ''}
    for i, ids in enumerate(genes):
        gid = i + 1
        # the strand of an arbitrary member names the whole component
        strand = s2n[i2s[next(iter(ids))]]
        cat = 'S' if len(ids) == 1 else 'G'
        if strand == 'N':  # negative strand
            gid = -gid
        gname = 'J{0}{1}{2}'.format(strand, cat, abs(gid))
        g2gn[gid] = gname
        for x in ids:
            i2g[x] = gid
            i2gn[x] = gname
    ae['_gidx'] = [i2g[x] for x in ae['_id']]
    ae['gname'] = [i2gn[x] for x in ae['_id']]

    ## set sj _gidx, use acceptor=>_gidx map (exon a_id, sj a_id)
    a2g = dict(UT.izipcols(ae, ['a_id', '_gidx']))
    d2g = dict(UT.izipcols(ae, ['d_id', '_gidx']))
    sj['_gidx'] = [
        a2g.get(x, d2g.get(y, 0))
        for x, y in UT.izipcols(sj, ['a_id', 'd_id'])
    ]
    sj['gname'] = [g2gn.get(x, '') for x in sj['_gidx']]

    # This shouldn't happen
    nmissing = N.sum(ae['_gidx'] == 0)
    if nmissing > 0:
        LOG.warning(
            '###### WARNING!!!!!! exons with no gene assignment:{0}'.format(
                nmissing))
    return genes
def __init__(self, ex, sj, xmargin=None, ymargin=0.25, compress=True,
             ecov='ecov', ucnt='ucnt', mcnt='mcnt', minlw=1,
             drawscalebar=True, ecovth=None, jcntth=None, origin=None,
             sortexby=None, fontsize=7):
    """Compute the layout (x/y positions) of exons for drawing.

    Works on copies of `ex` and `sj`; the caller's tables are not modified.

    Args:
        ex: exon table; needs strand, st, ed, a_id, d_id (must be non-empty).
        sj: junction table; needs a_id, d_id.
        xmargin: largest gap kept between exon groups when compressing;
            defaults to int(mean exon length).
        ymargin: stored on the instance.
        compress: when True, gaps between groups wider than xmargin are
            shrunk to xmargin.
        ecov: name of the exon coverage column (set to 1 when missing).
        ucnt, mcnt: names of the junction count columns; when either is
            missing the junction count is set to 1 with a solid line style.
        minlw, drawscalebar, fontsize: stored on the instance.
        ecovth: when given, keep only exons with ecov > ecovth.
        jcntth: when given, keep only junctions with jcnt > jcntth.
        origin: x origin; defaults to min(st) on '+' and max(ed) otherwise.
        sortexby: column used to order exons within a group
            (defaults to `ecov`).

    Attributes set:
        ymargin, ecov, ucnt, jcnt, mcnt, minlw, drawscalebar, ecovth,
        jcntth, ex, sj, compress, fontsize, sortexby, strand, origin,
        xmargin and (unless no exon survives the thresholds) gr, the
        per-group table with gst, ged, gsize, len, gypos, cst.
        self.ex gains xst, xed, len, asize, dsize, group, gst, ged, gsize,
        eypos, cst0, ced0, cst, ced, ey.
    """
    self.ymargin = ymargin
    self.ecov = ecov
    self.ucnt = ucnt
    self.jcnt = jcnt = 'jcnt'
    self.mcnt = mcnt
    self.minlw = minlw
    self.drawscalebar = drawscalebar
    self.ecovth = ecovth
    self.jcntth = jcntth
    self.ex = ex = ex.copy()
    self.sj = sj = sj.copy()
    self.compress = compress
    self.fontsize = fontsize
    # when plotting multiple and comparing, you want to use same sorting
    if sortexby is None:
        self.sortexby = ecov
    else:
        self.sortexby = sortexby

    # start and end, strand
    # x coordinates are measured from the origin along the strand direction
    if ex.iloc[0]['strand'] == '+':
        if origin is None:
            origin = ex['st'].min()
        ex['xst'] = ex['st'] - origin
        ex['xed'] = ex['ed'] - origin
        self.strand = '+'
        self.origin = origin
    else:
        if origin is None:
            origin = ex['ed'].max()
        ex['xst'] = origin - ex['ed']
        ex['xed'] = origin - ex['st']
        self.strand = '-'
        self.origin = origin

    # fix old a_id null (-1 was used instead of 0)
    if (ex['a_id'].min() == -1) and (N.sum(ex['a_id'] == 0) == 0):
        ex.loc[ex['a_id'] == -1, 'a_id'] = 0
        ex.loc[ex['d_id'] == -1, 'd_id'] = 0
        sj.loc[sj['a_id'] == -1, 'a_id'] = 0
        sj.loc[sj['d_id'] == -1, 'd_id'] = 0

    ex['len'] = ex['xed'] - ex['xst']
    if xmargin is None:
        xmargin = int(ex['len'].mean())
    self.xmargin = xmargin

    if ecov not in ex.columns:
        ex[ecov] = 1
    if (ucnt not in sj.columns) or (mcnt not in sj.columns):
        sj[jcnt] = 1
        sj[jcnt + '_ls'] = 'solid'
    else:
        # sj uniq, mult: use ucnt when non-zero, otherwise mcnt
        sj[jcnt] = [x or y for x, y in sj[[ucnt, mcnt]].values]
        sj[jcnt + '_ls'] = ['solid' if x else 'dashed' for x in sj[ucnt]]

    if ecovth is not None:
        self.ex = ex = ex[ex[ecov] > ecovth].copy()
    if jcntth is not None:
        self.sj = sj = sj[sj[jcnt] > jcntth].copy()

    if len(ex) == 0:
        return

    # find exon groups
    if 'asize' not in ex.columns:
        a2size = dict(
            UT.izipcols(
                ex.groupby('a_id').size().reset_index(), ['a_id', 0]))
        d2size = dict(
            UT.izipcols(
                ex.groupby('d_id').size().reset_index(), ['d_id', 0]))
        # id 0 is the null id: it never forms a group of its own
        a2size[0] = 0
        d2size[0] = 0
        ex['asize'] = [a2size[x] for x in ex['a_id']]
        ex['dsize'] = [d2size[x] for x in ex['d_id']]
    # an exon joins its a_id group when that group is the larger one,
    # otherwise its d_id group
    ex['group'] = [
        'a{0}'.format(ai) if (a != 0 and a > d) else 'd{0}'.format(di)
        for a, ai, d, di in ex[['asize', 'a_id', 'dsize', 'd_id']].values
    ]

    # find exon group st, ed
    exg = ex.groupby('group')
    g2st = dict(
        UT.izipcols(exg['xst'].min().reset_index(), ['group', 'xst']))
    g2ed = dict(
        UT.izipcols(exg['xed'].max().reset_index(), ['group', 'xed']))
    g2size = dict(UT.izipcols(exg.size().reset_index(), ['group', 0]))
    ex['gst'] = [g2st[x] for x in ex['group']]
    ex['ged'] = [g2ed[x] for x in ex['group']]
    ex['gsize'] = [g2size[x] for x in ex['group']]
    self.ex = ex = ex.sort_values(['group', self.sortexby])

    # find exon y pos within group
    def _eypos(gs):
        # gs: rows of (group, gsize) in sorted order; members of a group are
        # stacked at unit steps centred on 0
        g0, s0 = gs[0]  # first g
        cnt = 0
        yield cnt - (s0 - 1) / 2.
        for g1, s1 in gs[1:]:
            if g1 == g0:
                cnt += 1
            else:
                cnt = 0
            yield cnt - (s1 - 1) / 2.
            g0 = g1

    ex['eypos'] = [x for x in _eypos(ex[['group', 'gsize']].values)]

    # find group y center pos
    self.gr = gr = ex.groupby('group')[['gst', 'ged',
                                        'gsize']].first().sort_values(
                                            ['gst', 'ged'])
    gr['len'] = gr['ged'] - gr['gst']

    def _gypos(gr):
        # overlapping groups are stacked alternately above and below centre
        side = 1
        r0 = gr.iloc[0]
        h = r0['gsize'] / 2.
        ged0 = r0['ged']
        gy0 = {1: h, -1: -h}  # remember filled height both side (1,-1)
        yield 0  # first one gets center
        for gst1, ged1, gsiz1 in gr[['gst', 'ged', 'gsize']].values[1:]:
            h = gsiz1 / 2.
            if ged0 <= gst1:  # no overlap
                gy0 = {1: h, -1: -h}
                yield 0
            else:
                gy1 = gy0[side] + side * gsiz1 / 2.
                gy0[side] = gy0[side] + side * gsiz1
                side = -1 * side  # flip side
                yield gy1
            ged0 = max(ged0, ged1)

    gr['gypos'] = [x for x in _gypos(gr)]

    # compress x coord
    if compress:

        def _gxst(gr):
            # delta accumulates the amount removed from gaps wider than
            # xmargin
            r0 = gr.iloc[0]
            delta = 0
            yield r0['gst'] - delta  # 0
            ged0 = r0['ged']
            for i, r1 in gr.iloc[1:].iterrows():
                gst1 = r1['gst']
                if gst1 - ged0 > self.xmargin:
                    delta += (gst1 - ged0 - self.xmargin)
                yield gst1 - delta
                ged0 = r1['ged']

        gr['cst'] = [x for x in _gxst(gr)]
    else:
        gr['cst'] = gr['gst']

    # Label-based lookups by group name. `.loc` replaces `.ix`, which was
    # removed from pandas; the index of gr is the group label.
    group_cst = gr['cst']
    ex['cst0'] = [
        group_cst.loc[g] + (xst - gst)
        for g, xst, gst in ex[['group', 'xst', 'gst']].values
    ]
    ex['ced0'] = ex['cst0'] + ex['len']
    # back to genomic orientation
    if self.strand == '+':
        ex['cst'] = origin + ex['cst0']
        ex['ced'] = origin + ex['ced0']
    else:
        ex['cst'] = origin - ex['ced0']
        ex['ced'] = origin - ex['cst0']
    group_ypos = gr['gypos']
    ex['ey'] = [
        ey + group_ypos.loc[g] for ey, g in ex[['eypos', 'group']].values
    ]