def calPvalues(siRNASeeds,seedfile,outfile,method='RRA',N=10000):
    '''
    Score every gene stored in a seed dump and write the result table.

    Parameters:
        siRNASeeds:
            Passed unchanged as the first argument of the scoring function.
        seedfile: string
            Gzip-compressed dump. It starts with a header understood by
            SeedUtils.parseHeader, followed by one record of
            4 x 2**14 uint16 values per gene.
        outfile: string
            Path of the tab-separated result table. outfile + ".pdf" is
            handed to SeedUtils.evdplot (presumably the plot destination;
            verify against SeedUtils).
        method: string
            Name of the scoring method. Only 'RRA' is registered; any other
            value falls back to SeedUtils.aRRA.
        N: int
            Number of genes read and scored per chunk. [Default=10000]
    Returns:
        pandas.DataFrame with columns gid, length, escore and pvalue,
        sorted by pvalue in descending order.
    Raises:
        ValueError: if N is not positive.
    '''
    if N <= 0:
        raise ValueError("N must be a positive integer, got {0!r}".format(N))

    # One gene record is 4 x 2**14 uint16 values, i.e. 2**16 values / 2**17 bytes.
    values_per_gene = 4 * 2**14
    bytes_per_gene = 2 * values_per_gene

    # read headers
    sys.stderr.write("{0} reading header\n".format(Utils.touchtime()))
    fh = gzip.open(seedfile,'rb')
    try:
        genes,lengths = SeedUtils.parseHeader(fh)
        num_genes = len(genes)

        # aRRA
        sys.stderr.write("{0} start analyzing data.\n".format(Utils.touchtime()))
        score_func = {'RRA':SeedUtils.aRRA}.get(method,SeedUtils.aRRA)
        escores = numpy.zeros(num_genes)
        for i in range(0,num_genes,N):
            # Size of this chunk. The last chunk holds whatever is left, which
            # is a full N when num_genes is an exact multiple of N (using
            # num_genes % N here would yield 0 and silently skip that chunk).
            n = min(N,num_genes-i)
            raw = fh.read(bytes_per_gene*n)
            # frombuffer raises ValueError if the file is shorter than expected.
            records = numpy.frombuffer(raw,dtype=numpy.uint16,count=values_per_gene*n)
            # Collapse the 4 sub-tables of each gene into one vector of 2**14 counts.
            geneSeeds = numpy.sum(records.reshape((n,4,2**14)),axis=1)
            escores[i:i+n] = score_func(siRNASeeds,geneSeeds)
            sys.stdout.write("{0} analyzed {1}\n".format(Utils.touchtime(),i+n))
    finally:
        # Release the file handle even if parsing or scoring fails.
        fh.close()

    # Writing result to file
    sys.stderr.write("{0} writing result to file.\n".format(Utils.touchtime()))
    df = pandas.DataFrame({'gid':genes,'length':lengths,'escore':escores})
    rv = SeedUtils.evdplot(df,outfile+".pdf")
    # NOTE(review): the column is called 'pvalue' but is filled with rv.pdf(),
    # a density rather than a tail probability - confirm this is intended.
    df.loc[:,'pvalue'] = rv.pdf(df.escore)
    df = df.sort_values(by='pvalue',ascending=False)
    df.to_csv(outfile,index=False,sep='\t',columns=['gid','length','escore','pvalue'])
    sys.stderr.write("{0} finished.\n".format(Utils.touchtime()))
    return df
def parseGeneSeeds(gfile,dumpfile,N=10):
    '''
    Parse seeds on gene sequences and dump them to a gzip-compressed file.

    The dump starts with three uint64 values (the magic number 0x19840405,
    the number of genes and the length of the id string), followed by the
    id string itself ("id:length" entries joined by ';') and then one
    4 x 2**14 uint16 seed table per gene, in input order.

    Parameters:
        gfile: string
            Gene sequence file in Fasta format.
        dumpfile: string
            Path of the gzip-compressed output file.
        N: int
            Parse N sequences at a time instead of all at once to reduce
            memory usage. [Default=10]
            With N = 10000, memory usage for the two huge matrices is up
            to 1.83G.
    Raises:
        ValueError: if N is not positive.
    '''
    if N <= 0:
        # Fail before any output file is created; N is used as a modulus below.
        raise ValueError("N must be a positive integer, got {0!r}".format(N))

    # create seed tables
    SeedUtils.createTables()

    ofh = gzip.open(dumpfile,'wb')
    try:
        # read genes
        genes = list(IO.BioReader(gfile,'fasta'))

        # write headers
        gids = ';'.join(["{0}:{1}".format(fa.id,len(fa)) for fa in genes])
        header = numpy.array([0x19840405,len(genes),len(gids)],dtype=numpy.uint64)
        ofh.write(header.data)
        ofh.write(gids)

        # parse genes
        mat = numpy.zeros((N,2**15),dtype=numpy.uint16)
        seeds = numpy.zeros((N,4,2**14),dtype=numpy.uint16) # N x type x seeds
        for idx,fa in enumerate(genes):
            SeedUtils.findSeeds(fa.seq.seq.upper(),mat[idx%N])
            cnt = idx+1
            if cnt%N == 0:
                # calculate seeds for the N genes
                SeedUtils.parseSeeds(mat,seeds)
                ofh.write(seeds.data)
                sys.stderr.write("{0} Parsed {1} genes .\n".format(Utils.touchtime(),cnt))
                # Reset both buffers so the next batch starts from zero.
                seeds.fill(0)
                mat.fill(0)

        # calculate seeds for the rest of genes
        rest = len(genes)%N
        if rest:
            SeedUtils.parseSeeds(mat[:rest],seeds[:rest])
            ofh.write(seeds[:rest].data)
            sys.stderr.write("{0} Parsed {1} genes. \n".format(Utils.touchtime(),len(genes)))
    finally:
        # Close the dump even if reading or parsing fails.
        ofh.close()