Exemplo n.º 1
0
from cruzdb import Genome
from cruzdb.sequence import sequence

# Mirror the needed tables from UCSC into a local sqlite db. The object
# returned by mirror() is what every query below runs against.
local = Genome('hg19').mirror(('refGene', 'targetScanS'), 'sqlite:///hg19.mirna.db')

# refSeq names of the selected genes, collected for the GO analysis at the end.
refseq_ids = []

# Walk refGene, looking only at coding genes.
for gene in local.refGene:
    if not gene.is_coding:
        continue

    utr3 = gene.utr3
    if None in utr3:
        continue  # skip genes with no 3' UTR

    utr_start, utr_end = utr3

    # Fetch the targetScan miRNA sites in the UTR interval via the bin query.
    sites = local.bin_query('targetScanS', gene.chrom, utr_start, utr_end)

    # Keep only genes whose 3' UTR holds a miR-96 target site scoring > 85.
    if not any("miR-96" in site.name and site.score > 85 for site in sites):
        continue

    refseq_ids.append(gene.name)  # remembered for the DAVID call below

    # The gene's string representation is a BED line; the UTR sequence is
    # printed after it, separated by a single space.
    print("%s %s" % (gene, sequence('hg19', gene.chrom, utr_start, utr_end)))

# Hand the selected refSeq ids to DAVID (the original note says this opens a
# web browser showing the enrichment).
Genome.david_go(refseq_ids)
Exemplo n.º 2
0
print(g.upstream("refGene", last, k=6))

# Deliberate ZeroDivisionError: execution stops here, so the benchmark below
# only runs once this line is removed (presumably a debugging stop).
1 / 0

# Benchmark: time g.bin_query() against an explicit txStart/txEnd range
# filter on chr1 for 100 random intervals sharing the same start.
seed(1)  # fixed seed so the random intervals are reproducible

istart = 12345
iend = 386539

qall = list(g.refGene.all())

for _trial in range(100):
    iend = randrange(istart, 65555555)

    # 1) lookup through bin_query; prints the hit count and elapsed seconds
    t = time.time()
    q = g.bin_query('refGene', 'chr1', istart, iend)
    a = list(q)
    print(len(a))
    print(time.time() - t)

    # 2) the same interval expressed as a plain overlap filter
    t = time.time()
    refGene = g.refGene

    rg = refGene.table()
    overlap = (
        rg.c.chrom == "chr1",
        rg.c.txStart <= iend,
        rg.c.txEnd >= istart,
    )
    # NOTE(review): this session-level query is built and then immediately
    # replaced by the next assignment; only the second one is executed.
    q = g.session.query(rg).filter(*overlap)
    q = refGene.filter(*overlap)
    b = list(q)
    print(len(b))
Exemplo n.º 3
0
# Run the refGene annotation only when its output file does not exist yet;
# later runs reuse whatever is already stored at `fname`.
if not op.exists(fname):
    # `with` guarantees the handle is closed even when annotate() raises.
    # NOTE(review): a failed run can still leave a partial file behind, which
    # the exists() check above would then treat as complete.
    with open(fname, 'w') as fhout:
        hg18.annotate(lamina(), ('refGene', ), feature_strand=True,
                      in_memory=True, parallel=True, out=fhout)


# For each score cutoff, write one gene name per line to
# /tmp/genes-<cutoff>.txt for every annotated row whose value reaches the
# cutoff and whose first refGene distance is 0 (presumably: the feature
# overlaps a gene -- confirm against the annotate() output format).
for cutoff in (0.90, 0.95):
    # `with` closes the output even if a row fails to parse.
    with open('/tmp/genes-%.2f.txt' % cutoff, 'w') as fh:
        for d in reader(fname):
            if float(d['value']) < cutoff:
                continue
            distance = d['refGene_distance']
            # Either the only distance is 0, or the first entry of a
            # ';'-separated list is 0.
            if distance == '0' or distance.startswith("0;"):
                # ';'-separated names become one name per line.
                fh.write(d['refGene_name'].replace(";", "\n") + "\n")

# Write the names of refGene entries that lie completely inside a lamina
# interval whose value exceeds the cutoff.
cutoff = 0.90
# `with` makes sure the output is flushed and closed before the deliberate
# stop below; previously the handle was never closed.
with open('/tmp/genes-overlap-complete.txt', 'w') as fh:
    for d in reader(lamina()):
        # Keep only rows strictly above the cutoff.
        if not float(d['value']) > cutoff:
            continue

        start, end = int(d['start']), int(d['end'])

        for r in hg18.bin_query('refGene', d['chrom'], start, end).all():
            # genes completely contained within an LAD
            if start <= r.start and end >= r.end:
                fh.write("%s\n" % (r.gene_name,))

# Deliberate ZeroDivisionError: halts the script here so nothing below runs
# (presumably a debugging stop).
1/0


# Benchmark: time g.bin_query() against an explicit txStart/txEnd range
# filter on chr1 for 100 random intervals sharing the same start.
seed(1)  # fixed seed so the random intervals are reproducible

istart = 12345
iend = 386539

qall = list(g.refGene.all())

for _trial in range(100):
    iend = randrange(istart, 65555555)

    # 1) lookup through bin_query; prints the hit count and elapsed seconds
    t = time.time()
    q = g.bin_query('refGene', 'chr1', istart, iend)
    a = list(q)
    print(len(a))
    print(time.time() - t)

    # 2) the same interval expressed as a plain overlap filter
    #    (the timer is restarted here but never read within this block)
    t = time.time()
    refGene = g.refGene

    rg = refGene.table()
    overlap = (
        rg.c.chrom == "chr1",
        rg.c.txStart <= iend,
        rg.c.txEnd >= istart,
    )
    # NOTE(review): this session-level query is built and then immediately
    # replaced by the next assignment; only the second one is executed.
    q = g.session.query(rg).filter(*overlap)
    q = refGene.filter(*overlap)
    b = list(q)
# MySQLdb setup notes:
#   sudo apt-get install mysql-server
#   sudo apt-get install libmysqlclient-dev   # provides mysql_config
#   then download the MySQLdb source and follow its INSTALL file

# For every variant row in INPUTFILE, print the +/-50 bp window around its
# position followed by the refGene name2 values found in that window.
hg19 = Genome('hg19')
INPUTFILE = "suggestive.pheno_simple.covar_none.test_wald.csv"

chrom_i = None
pos_i = None
# `with` closes the input file once all rows are processed (or on error).
with open(INPUTFILE) as infile:
    filereader = csv.reader(infile)

    # The first row is the header, e.g.
    # CHROM POS REF ALT N_INFORMATIVE Test Beta SE Pvalue PVALUE
    # An empty file yields no header and no rows, so nothing is printed.
    header = next(filereader, None)
    if header is not None:
        chrom_i = header.index('CHROM')
        pos_i = header.index('POS')

    for line in filereader:
        chrom = 'chr' + str(line[chrom_i])
        pos = int(line[pos_i])
        start = pos - 50  # kind of arbitrary: search 50 back, 50 forward
        end = pos + 50
        genes = hg19.bin_query('refGene', chrom, start, end)

        # Format the output: coordinates padded to a 30-character column.
        basic_str = ' '.join(map(str, [chrom, start, end]))
        # Always keep at least one space; the old `padding < 0` test let an
        # exactly-30-character prefix run straight into the gene names.
        padding = max(1, 30 - len(basic_str))
        # Sorted so the output is reproducible instead of following set order.
        gene_string = ' '.join(sorted(set(gene.name2 for gene in genes)))
        print(basic_str + ' ' * padding + gene_string)