Пример #1
0
 def getSeeds(fafile,sense=True,start_at=2,stop_at=8):
     '''
     Yield an (id, seed) pair for every record of a Fasta file.
     Parameters:
         fafile: string
             Fasta file, read through IO.BioReader.
         sense: bool
             If True the seed is cut from the sequence as stored, otherwise
             from Utils.rc() of the sequence. [Default=True]
         start_at: int
             1-based position of the first base of the seed. [Default=2]
         stop_at: int
             1-based position just past the last base of the seed; the slice
             taken is [start_at-1:stop_at-1]. [Default=8]
     '''
     for record in IO.BioReader(fafile,'fasta'):
         if sense:
             strand = record.seq.seq
         else:
             strand = Utils.rc(record.seq.seq)
         yield record.id,strand[start_at-1:stop_at-1]
Пример #2
0
 def calPvalues(siRNASeeds,seedfile,outfile,method='RRA',N=10000):
 
     # read headers
     print >>sys.stderr, Utils.touchtime(),"reading header"
     fh = gzip.open(seedfile,'rb')
     genes,lengths = SeedUtils.parseHeader(fh)
     num_genes = len(genes)
 
     # aRRA
     print >>sys.stderr, Utils.touchtime(),"start analyzing data."
     method = {'RRA':SeedUtils.aRRA}.get(method,SeedUtils.aRRA)
     escores = numpy.zeros(num_genes)
     for i in range(0,num_genes,N):
         n = N if i+N < num_genes else num_genes%N
         if n:
             geneSeeds = numpy.sum(numpy.frombuffer(fh.read(2**17*n),dtype=numpy.uint16,count=2**16*n).reshape((n,4,2**14)),axis=1)
             escores[i:i+n] = method(siRNASeeds,geneSeeds)
             print Utils.touchtime(), "analyzed", i+n
     fh.close()
 
     # Writting result to file
     print >>sys.stderr, Utils.touchtime(), 'writing result to file.'
     df = pandas.DataFrame({'gid':genes,'length':lengths, 'escore':escores})
     rv = SeedUtils.evdplot(df,outfile+".pdf")
     df.loc[:,'pvalue'] =  rv.pdf(df.escore)
     df = df.sort_values(by='pvalue',ascending=False)
     df.to_csv(outfile,index=False,sep='\t',columns=['gid','length','escore','pvalue'])
     print >>sys.stderr, Utils.touchtime(), "finished."
     
     return df
Пример #3
0
 def parseGeneSeeds(gfile,dumpfile,N=10):
     '''
     Parse seeds on gene sequences and dump them to a gzip file.
     Parameters:
         gfile: string
             Gene sequence file in Fasta format.
         dumpfile: string
             Output file, written gzip-compressed. Layout: three uint64
             values (0x19840405, number of genes, length of the id string),
             the "id:length" entries joined by ';', then 4 x 2**14 uint16
             values per gene.
         N: int
             Parse N sequences at a time instead of all at once to reduce
             memory usage. [Default=10]
             With N = 10000, memory usage for the two huge matrices is up to 1.83G.
     '''
     # create seed tables
     SeedUtils.createTables()
     # read genes
     ofh = gzip.open(dumpfile,'wb')
     try:
         genes = list(IO.BioReader(gfile,'fasta'))
         # write headers
         gids = ';'.join(["{0}:{1}".format(fa.id,len(fa)) for fa in genes])
         header = numpy.array([0x19840405,len(genes),len(gids)],dtype=numpy.uint64)
         ofh.write(header.data)
         ofh.write(gids)
         # parse genes
         cnt = 0
         mat = numpy.zeros((N,2**15),dtype=numpy.uint16)
         seeds = numpy.zeros((N,4,2**14),dtype=numpy.uint16) # N x type x seeds
         for fa in genes:
             SeedUtils.findSeeds(fa.seq.seq.upper(),mat[cnt%N])
             cnt += 1
             if cnt%N==0:
                 # calculate seeds for the N genes
                 SeedUtils.parseSeeds(mat,seeds)
                 ofh.write(seeds.data)
                 # Qualified like the final progress message below; the bare
                 # touchtime() used here before is not otherwise referenced.
                 print >>sys.stderr, Utils.touchtime(), "Parsed {0} genes .".format(cnt)
                 # reset both buffers before the next batch
                 seeds.fill(0)
                 mat.fill(0)
         # calculate seeds for the rest of genes
         rest = cnt%N
         if rest:
             SeedUtils.parseSeeds(mat[:rest],seeds[:rest])
             ofh.write(seeds[:rest].data)
         print >>sys.stderr, Utils.touchtime(), "Parsed {0} genes.       ".format(cnt)
     finally:
         # Always flush and close the gzip stream, even on failure.
         ofh.close()
Пример #4
0
    p._optionals.title = "Options"
    p.add_argument("-i","--input",dest='ifname',type=str,metavar="input.bed",required=True,help="Input file. Can be stdin.")
    p.add_argument("-f","--format",dest="ftype",type=str,metavar="bed",default="bed",help="Format of input file. Default is 'bed'. Can be 'bed3', 'bedgraph', 'bed','peak','wig', 'sam2bed' or 'genepred'.")
    p.add_argument("-g","--genome",dest='genome',type=str,metavar="Genome",default=None,help="Genome version (hg19, mm10 .etc) or genome size file with chrom and size in each line.")
    p.add_argument("-u","--up",dest="up",type=int,metavar="upstream",default=0,help="bps extended to upstream. If minus, trim the 5' end.")
    p.add_argument("-d","--down",dest="down",type=int,metavar="downstream",default=0,help="bps extended to downstream. If minus, trim the 3' end.")
    p.add_argument("-o","--output",dest="ofname",type=str,metavar="output.bed",default="stdout",help="Output file. Default is stdout.")
    if len(sys.argv)==1:
        sys.exit(p.print_help())
    args = p.parse_args()
    return args

# ------------------------------------
# Classes
# ------------------------------------

# ------------------------------------
# Main
# ------------------------------------

if __name__=="__main__":
    # Extend (or trim) every record of the input file and write the result.
    # Get parameters
    args=argParser()
    fh = IO.mopen(args.ofname, 'w')
    try:
        # Load the chromosome sizes only when a genome was requested.
        genome=Utils.genomeSize(args.genome) if args.genome else None
        for item in IO.BioReader(args.ifname,args.ftype):
            # Pass the loaded size table; it used to be computed and then
            # ignored in favour of the raw -g string.
            # NOTE(review): assumes extend() expects what Utils.genomeSize
            # returns -- confirm against its definition.
            tbed=item.extend(args.up,args.down, genome)
            print >> fh, tbed
    finally:
        # Close the output even if reading or extending fails.
        IO.mclose(fh)
Пример #5
0
if __name__=="__main__":
    if len(sys.argv)==1:
        print "Usage: "+sys.argv[0]+" annotation.tab/bed genomesize *.bed"
        print "       Find the nearest annotation for given bed."
    else:
        # check file
        if '.tab' in sys.argv[1]:
            ftype='gene'
        else:
            ftype='bed'

        # initiation annotations.
        annos={}
        #for chrom in IO.genomeSize('hg19'):
        for chrom in Utils.genomeSize(sys.argv[2]):
            if ftype=='bed':
                annos[chrom]=BedList()
            else:
                annos[chrom]=GeneBedList()
        
        # read annotations.
        for anno in IO.BioReader(sys.argv[1],ftype=ftype):
            if annos.has_key(anno.chrom):
                annos[anno.chrom].append(anno)

        # sort
        for chrom in annos:
            annos[chrom].sort()

        # Find nearest annoations
Пример #6
0
 def findSeeds(seq,sary):
     '''
     Count the 8-letter windows of Utils.rc(seq) into sary.
     Parameters:
         seq: string
             Sequence to scan; it is passed through Utils.rc() first
             (presumably the reverse complement).
         sary: array
             Counter array, zeroed and then filled in place. It is indexed
             by the value the module-level twoBytesTable gives each window.
     '''
     sary.fill(0)
     strand = Utils.rc(seq)
     # "end" is the exclusive end of each window, so the windows are
     # strand[0:8], strand[1:9], ... up to the last full one.
     for end in range(8,len(strand)+1):
         window = strand[end-8:end]
         sary[twoBytesTable[window]] += 1
Пример #7
0
                   "--output",
                   dest="ofname",
                   type=str,
                   metavar="output.bed",
                   default="stdout",
                   help="Output file. Default is stdout.")
    if len(sys.argv) == 1:
        sys.exit(p.print_help())
    args = p.parse_args()
    return args


# ------------------------------------
# Classes
# ------------------------------------

# ------------------------------------
# Main
# ------------------------------------

if __name__ == "__main__":
    # Extend (or trim) every record of the input file and write the result.
    # Get parameters
    args = argParser()
    fh = IO.mopen(args.ofname, 'w')
    try:
        # Load the chromosome sizes only when a genome was requested.
        genome = Utils.genomeSize(args.genome) if args.genome else None
        for item in IO.BioReader(args.ifname, args.ftype):
            # Pass the loaded size table; it used to be computed and then
            # ignored in favour of the raw -g string.
            # NOTE(review): assumes extend() expects what Utils.genomeSize
            # returns -- confirm against its definition.
            tbed = item.extend(args.up, args.down, genome)
            print >> fh, tbed
    finally:
        # Close the output even if reading or extending fails.
        IO.mclose(fh)