예제 #1
0
파일: __init__.py 프로젝트: tsznxx/lncin
 def calPvalues(siRNASeeds,seedfile,outfile,method='RRA',N=10000):
     '''
     Score every gene in a seed dump against the siRNA seeds and write a result table.
     Parameters:
         siRNASeeds:
             Seed data of the siRNA; passed unchanged as the first argument
             of the scoring function.
         seedfile: string
             Gzip-compressed seed dump. The header is read with
             SeedUtils.parseHeader; after it, 2**17 bytes are read per gene.
         outfile: string
             Path of the tab-separated result table. SeedUtils.evdplot is
             additionally called with outfile + ".pdf".
         method: string
             Name of the scoring method. Only 'RRA' is known; any other value
             falls back to SeedUtils.aRRA as well.
         N: int
             Number of genes read and scored per chunk. [Default=10000]
     Returns:
         pandas.DataFrame with the columns gid, length, escore and pvalue,
         sorted by pvalue in descending order.
     '''
     # read headers
     sys.stderr.write("{0} {1}\n".format(Utils.touchtime(), "reading header"))
     # The with-block closes the dump even if parsing or scoring raises.
     with gzip.open(seedfile,'rb') as fh:
         genes,lengths = SeedUtils.parseHeader(fh)
         num_genes = len(genes)

         # aRRA
         sys.stderr.write("{0} {1}\n".format(Utils.touchtime(), "start analyzing data."))
         scorer = {'RRA':SeedUtils.aRRA}.get(method,SeedUtils.aRRA)
         escores = numpy.zeros(num_genes)
         for i in range(0,num_genes,N):
             # min() rather than num_genes % N: the remainder is 0 when
             # num_genes is an exact multiple of N, which would skip the
             # final full chunk and leave its escores at zero.
             n = min(N, num_genes - i)
             # Each gene occupies 2**16 uint16 values (2**17 bytes), viewed
             # as a (4, 2**14) table; the 4 rows are summed per gene.
             raw = fh.read(2**17*n)
             geneSeeds = numpy.sum(numpy.frombuffer(raw,dtype=numpy.uint16,count=2**16*n).reshape((n,4,2**14)),axis=1)
             escores[i:i+n] = scorer(siRNASeeds,geneSeeds)
             # Progress goes to stdout, unlike the other messages (stderr).
             sys.stdout.write("{0} {1} {2}\n".format(Utils.touchtime(), "analyzed", i+n))

     # Writing result to file
     sys.stderr.write("{0} {1}\n".format(Utils.touchtime(), 'writing result to file.'))
     df = pandas.DataFrame({'gid':genes,'length':lengths, 'escore':escores})
     rv = SeedUtils.evdplot(df,outfile+".pdf")
     # NOTE(review): the 'pvalue' column is filled from rv.pdf(); confirm
     # whether a tail probability was intended instead of a density.
     df.loc[:,'pvalue'] =  rv.pdf(df.escore)
     df = df.sort_values(by='pvalue',ascending=False)
     df.to_csv(outfile,index=False,sep='\t',columns=['gid','length','escore','pvalue'])
     sys.stderr.write("{0} {1}\n".format(Utils.touchtime(), "finished."))

     return df
예제 #2
0
파일: __init__.py 프로젝트: tsznxx/lncin
 def parseGeneSeeds(gfile,dumpfile,N=10):
     '''
     Parse seeds on gene sequences and dump them to a gzip file.
     Parameters:
         gfile: string
             Gene sequence file in Fasta format.
         dumpfile: string
             Path of the gzip-compressed output. It receives three uint64
             header values (the constant 0x19840405, the number of genes and
             the length of the id string), the "id:length" entries joined by
             ';', and then one (4, 2**14) uint16 seed table per gene.
         N: int
             Parse N sequences at a time instead of all at once to reduce
             memory usage. [Default=10]
             With N = 10000, memory usage for the two huge matrices is up to 1.83G.
     '''
     # create seed tables
     SeedUtils.createTables()
     # The with-block closes the dump even if reading or parsing raises.
     with gzip.open(dumpfile,'wb') as ofh:
         # read genes
         genes = list(IO.BioReader(gfile,'fasta'))
         # write headers
         gids = ';'.join(["{0}:{1}".format(fa.id,len(fa)) for fa in genes])
         header = numpy.array([0x19840405,len(genes),len(gids)],dtype=numpy.uint64)
         ofh.write(header.data)
         ofh.write(gids)
         # parse genes
         cnt = 0
         mat = numpy.zeros((N,2**15),dtype=numpy.uint16)
         seeds = numpy.zeros((N,4,2**14),dtype=numpy.uint16) # N x type x seeds
         for fa in genes:
             # Each gene fills its own row of the current batch.
             SeedUtils.findSeeds(fa.seq.seq.upper(),mat[cnt%N])
             cnt += 1
             if cnt%N == 0:
                 # calculate seeds for the N genes
                 SeedUtils.parseSeeds(mat,seeds)
                 ofh.write(seeds.data)
                 # Utils.touchtime() matches the final progress message below;
                 # the bare name touchtime was used only here.
                 sys.stderr.write("{0} {1}\n".format(Utils.touchtime(), "Parsed {0} genes .".format(cnt)))
                 # Reset both buffers before the next batch reuses them.
                 seeds.fill(0)
                 mat.fill(0)
         # calculate seeds for the rest of genes
         rest = cnt%N
         if rest:
             SeedUtils.parseSeeds(mat[:rest],seeds[:rest])
             ofh.write(seeds[:rest].data)
         sys.stderr.write("{0} {1}\n".format(Utils.touchtime(), "Parsed {0} genes.       ".format(cnt)))