Exemplo n.º 1
0
def Main():
    args=ArgParse()

    # initiate genome object from UCSC genome browser
    chromInfo=Genome(db=args.genome).chromInfo
    
    chroms={}
    sum_size=0
    sum_size_list=[]
    #store chromInfo 
    for i in range(chromInfo.count()):
        try: 
            if "random" in chromInfo[i].chrom: continue
            chroms[chromInfo[i].chrom]=chromInfo[i].size
            sum_size+=chromInfo[i].size
            sum_size_list.append((chromInfo[i].chrom,sum_size))
        except:
            break
    print >> sys.stderr, sum_size_list[-1][1]
    print >> sys.stderr, chroms
    
    print >>sys.stderr, "Chromosome information readed, %d chromosomes"%(len(chroms))
    i=0
    while i < args.num:
        # randomly select one chromosome and a region in this chromosome
        chrom=random_chr(sum_size_list)
        size=chroms[chrom]
        length=int(gauss(args.mean,args.sd))
        start=randrange(1,size-length-1)
        end=start+length-1
        strand=choice(['+','-'])
        print "\t".join(str(f) for f in [chrom,start,end,i+1,0,strand])
        i=i+1
Exemplo n.º 2
0
def Main():
    args = ArgParse()

    # initiate genome object from UCSC genome browser
    chromInfo = Genome(db=args.genome).chromInfo

    chroms = {}
    sum_size = 0
    sum_size_list = []
    #store chromInfo
    for i in range(chromInfo.count()):
        try:
            if "random" in chromInfo[i].chrom: continue
            chroms[chromInfo[i].chrom] = chromInfo[i].size
            sum_size += chromInfo[i].size
            sum_size_list.append((chromInfo[i].chrom, sum_size))
        except:
            break
    print >> sys.stderr, sum_size_list[-1][1]
    print >> sys.stderr, chroms

    print >> sys.stderr, "Chromosome information readed, %d chromosomes" % (
        len(chroms))
    i = 0
    while i < args.num:
        # randomly select one chromosome and a region in this chromosome
        chrom = random_chr(sum_size_list)
        size = chroms[chrom]
        length = int(gauss(args.mean, args.sd))
        start = randrange(1, size - length - 1)
        end = start + length - 1
        strand = choice(['+', '-'])
        print "\t".join(str(f) for f in [chrom, start, end, i + 1, 0, strand])
        i = i + 1
Exemplo n.º 3
0
    def test_dataframe(self):
        """Genome.dataframe accepts a table name as well as a query."""
        g = Genome('hg18')

        # whole table: the frame has as many rows as the table reports
        kg = g.dataframe('cpgIslandExt')
        self.assertEqual(kg.shape[0], g.cpgIslandExt.count())

        # a query limited to 10 rows must give a 10-row frame
        q = g.cpgIslandExt.filter(g.cpgIslandExt.chromStart < 300000).limit(10)

        df = g.dataframe(q)
        self.assertEqual(df.shape[0], 10)
Exemplo n.º 4
0
    def test_dataframe(self):
        """Build dataframes from a table name and from a limited query."""
        g = Genome('hg18')

        # by name: row count must equal the table's count()
        kg = g.dataframe('cpgIslandExt')
        self.assertEqual(kg.shape[0], g.cpgIslandExt.count())

        # by query: the limit(10) must carry through to the frame
        q = g.cpgIslandExt.filter(g.cpgIslandExt.chromStart < 300000).limit(10)

        df = g.dataframe(q)
        self.assertEqual(df.shape[0], 10)
Exemplo n.º 5
0
 def test_bed_gene_pred(self):
     """Save a knownGene query as BED and check every line that was written."""
     g = Genome('hg19')
     from sqlalchemy import and_
     from cStringIO import StringIO
     query = g.knownGene.filter(and_(g.knownGene.txStart > 10000, g.knownGene.txEnd < 20000))
     c = StringIO()
     Genome.save_bed(query, c)
     c.seek(0)
     rows = c.readlines()
     for toks in (row.split("\t") for row in rows):
         # 12 tab-separated fields, coordinates inside the queried window
         self.assertEqual(len(toks), 12)
         self.assertGreater(int(toks[1]), 10000)
         self.assertLess(int(toks[2]), 20000)
Exemplo n.º 6
0
 def test_bed_gene_pred(self):
     """Write a filtered knownGene query with save_bed and validate each row."""
     g = Genome('hg19', host="localhost", user="******")
     from sqlalchemy import and_
     from cStringIO import StringIO
     query = g.knownGene.filter(and_(g.table('knownGene').c.txStart > 10000, g.table('knownGene').c.txEnd < 20000))
     c = StringIO()
     Genome.save_bed(query, c)
     c.seek(0)
     rows = c.readlines()
     for toks in (row.split("\t") for row in rows):
         # every line has 12 fields and lies within the filter bounds
         self.assertEqual(len(toks), 12)
         self.assertGreater(int(toks[1]), 10000)
         self.assertLess(int(toks[2]), 20000)
Exemplo n.º 7
0
    def test_mirror(self):
        """Mirror chromInfo into a SQLite file and compare the first rows."""
        # start clean in case an earlier run left the file behind
        try:
            os.unlink('/tmp/__u.db')
        except OSError:
            pass
        try:
            g = Genome('hg18')
            g.mirror(['chromInfo'], 'sqlite:////tmp/__u.db')
            a = str(g.chromInfo.filter().first())

            gs = Genome('sqlite:////tmp/__u.db')

            b = str(gs.chromInfo.filter().first())
            self.assertEqual(a, b)
        finally:
            # remove the mirror even when the test fails part-way; the file
            # may not exist if mirroring itself failed
            if os.path.exists('/tmp/__u.db'):
                os.unlink('/tmp/__u.db')
Exemplo n.º 8
0
    def test_mirror(self):
        """The first chromInfo row must survive a mirror into SQLite."""
        # drop any database file left over from a previous run
        try:
            os.unlink('/tmp/__u.db')
        except OSError:
            pass
        try:
            g = Genome('hg18')
            g.mirror(['chromInfo'], 'sqlite:////tmp/__u.db')
            a = str(g.chromInfo.filter().first())

            gs = Genome('sqlite:////tmp/__u.db')

            b = str(gs.chromInfo.filter().first())
            self.assertEqual(a, b)
        finally:
            # clean up on failure as well; skip if the file was never created
            if os.path.exists('/tmp/__u.db'):
                os.unlink('/tmp/__u.db')
Exemplo n.º 9
0
 def test_blat(self):
     """BLAT a 30-base chr6 window and expect its coordinates in the hits."""
     try:
         import requests
     except ImportError:
         # report the test as skipped instead of letting it pass silently
         self.skipTest("requests is not installed")
     g = Genome('hg18')
     f = g.refGene[19]
     # re-point the fetched feature at the window to search for
     f.chrom = "chr6"
     f.txStart = 135646802
     f.txEnd = 135646832
     r = list(f.blat())
     self.assertIn(str(f.txStart), repr(r), r)
     self.assertIn(str(f.txEnd), repr(r), r)
Exemplo n.º 10
0
def mirror(genome, tables, connection_string):
    """Copy `tables` from `genome` into the database at `connection_string`.

    Every table is created in the destination (an OperationalError from the
    create call is ignored), filled in batches, and its original row count is
    recorded.  Afterwards the destination is reopened as a Genome and the row
    counts are compared; any mismatch is reported on stderr.

    Parameters:
        genome: source genome; each table is looked up as an attribute on it.
        tables: sequence of table names (iterated more than once).
        connection_string: connection string of the destination database.

    Returns:
        A Genome opened on `connection_string`.
    """
    destination, dengine = make_session(connection_string)
    dmeta = MetaData(bind=dengine)

    orig_counts = []
    for table_name in tables:
        # accessing the attribute causes the table to be mapped
        table = getattr(genome, table_name)._table
        print('Mirroring', table_name, file=sys.stderr)

        table = set_table(genome, table, table_name, connection_string, dmeta)
        try:
            table.create(dengine)
        except sqlalchemy.exc.OperationalError:
            # presumably the table already exists in the destination
            pass

        destination.commit()
        ins = table.insert()

        columns = list(table.columns.keys())
        records = []
        table_obj = getattr(genome, table_name)._table
        t = getattr(genome, table_name)
        for ii, record in enumerate(page_query(table_obj.select(), t.session)):
            data = dict(
                (str(column), getattr(record, column)) for column in columns)
            records.append(data)
            # flush the accumulated batch every 20000 records
            if ii % 20000 == 0 and ii > 0:
                destination.execute(ins, records)
                print(("processing record %i" % ii), file=sys.stderr)
                destination.commit()
                records = []
        # Flush the remainder.  Skip it when nothing is pending (empty table,
        # or the last record landed on a batch boundary): executing the
        # insert with an empty parameter list would not be a no-op.
        if records:
            destination.execute(ins, records)
        destination.commit()
        orig_counts.append(getattr(genome, table_name).count())

    destination, dengine = make_session(connection_string)
    from . import Genome
    newg = Genome(connection_string)
    new_counts = [getattr(newg, table_name).count() for table_name in tables]
    for tbl, oc, nc in zip(tables, orig_counts, new_counts):
        if oc != nc:
            print("ERROR: mirrored table '%s' has %i rows "
                  "while the original had %i" % (tbl, nc, oc),
                  file=sys.stderr)
    return newg
Exemplo n.º 11
0
 def test_bins(self):
     """Genome.bins(12345, 56779) must yield bins 1, 9, 73 and 585."""
     self.assertEqual(Genome.bins(12345, 56779), {1, 9, 73, 585})
Exemplo n.º 12
0
 def setUp(self):
     """Open one Genome per build: hg18 as ``dba`` and hg19 as ``dbb``."""
     for attr, build in (('dba', 'hg18'), ('dbb', 'hg19')):
         setattr(self, attr, Genome(build))
Exemplo n.º 13
0
 def setUp(self):
     """Open hg18 and keep the first refGene row whose name2 is MUC5B."""
     self.db = Genome('hg18')
     muc5b_rows = self.db.refGene.filter_by(name2="MUC5B")
     self.gene = muc5b_rows.first()
Exemplo n.º 14
0
            regions[item.chrom][0] = {}
        regions[item.chrom][0][item.chromEnd] = "+"
        #regions[item.chrom]["end"]=item.chromEnd
        #regions[item.chrom]["chrom"]=item.chrom
        #regions[item.chrom]["start"]=0
    regions["chrM"] = {}
    regions["chrM"][0] = {}
    regions["chrM"][0][16569] = "+"
    return (regions)


###MAIN PROGRAM###

# Input path handed to both the loading and the sliding-window step below.
# NOTE(review): `file` shadows the Python 2 builtin of the same name.
file = '/Users/mok6/Desktop/Doug_NEW/dataNewII/SP04HU_PyPu'

print "start"

# Get start and end of the chromosomes from the cytoband table of a local
# SQLite copy of hg19 (the original note credits the "dbcruz" package,
# presumably cruzdb).
regions = get_regions(
    Genome(
        "sqlite:////Users/mok6/Dropbox/LiClipse/MelArray/hg19_c.db").cytoband)
print "end load chr boundaries"

# Load dimer data; the second parameter is "0" = filtered, "1" = not filtered.
DamagePos = loadDamagePosII(file, "1")
print "dimer data loaded"

# Perform the sliding-window approach over the loaded positions.
slidingWindowII(regions, DamagePos, file)
print "sliding window done"
Exemplo n.º 15
0
def pipeline(col_num, step, dist, acf_dist, prefix, threshold, seed,
        bed_files, mlog=True, region_filter_p=1, region_filter_n=None,
        genome_control=False, db=None, use_fdr=True):
    """Run every analysis step on `bed_files`; outputs are named `prefix`.*.

    Steps, in order: autocorrelation (``.acf.txt``), adjusted p-values
    (``.slk.bed.gz``), optional genomic-control adjustment
    (``.slk.gc.bed.gz``), FDR (``.fdr.bed.gz``), peak finding
    (``.regions.bed.gz``), region p-values (``.regions-p.bed.gz``), region
    filtering (``.regions-t.bed``), a manhattan plot when ``cpv.manhattan``
    can be imported, and an optional annotation through cruzdb.

    Parameters:
        col_num: p-value column, passed on to the cpv steps.
        step: lag step; when None it is set to the smaller of `acf_dist` and
            ``stepsize.stepsize(bed_files, col_num)``.
        dist: distance argument passed to ``peaks.peaks``.
        acf_dist: upper bound of the lags used for the autocorrelation.
        prefix: output prefix; trailing dots are stripped.
        threshold, seed: passed to ``peaks.peaks``.
        bed_files: input BED files; the first one is also used for filtering.
        mlog: passed to ``acf.acf``.
        region_filter_p: rows whose column 6 exceeds this are dropped from
            the regions-t file.
        region_filter_n: rows whose column 4 is below this are dropped;
            None means 0.
        genome_control: when true, write the genomic-control adjusted file
            and run the FDR step on it.
        db: when not None, annotate the last file with ``Genome(db)``.
        use_fdr: peaks are called on the last column of the FDR file when
            true, on the second to last otherwise.

    Exits the process through ``sys.exit()`` when no region is found.
    """
    sys.path.insert(0, op.join(op.dirname(__file__), ".."))
    from cpv import acf, slk, fdr, peaks, region_p, stepsize, filter
    from cpv._common import genome_control_adjust, genomic_control, bediter
    import operator


    if step is None:
        step = min(acf_dist, stepsize.stepsize(bed_files, col_num))
        print("calculated stepsize as: %i" % step, file=sys.stderr)

    lags = list(range(1, acf_dist, step))
    lags.append(lags[-1] + step)

    prefix = prefix.rstrip(".")
    putative_acf_vals = acf.acf(bed_files, lags, col_num, simple=False,
                                mlog=mlog)
    acf_vals = []
    # go out to max requested distance but stop at the first autocorrelation
    # below 0.04: it is still kept unless more than two lags were collected.
    for a in putative_acf_vals:
        # a is ((lmin, lmax), (corr, N))
        if a[1][0] < 0.04 and len(acf_vals) > 2: break
        acf_vals.append(a)
        if a[1][0] < 0.04 and len(acf_vals): break

    # save the arguments that this was called with.
    with open(prefix + ".args.txt", "w") as fh:
        print(" ".join(sys.argv[1:]) + "\n", file=fh)
        import datetime
        print("date: %s" % datetime.datetime.today(), file=fh)
        from .__init__ import __version__
        print("version:", __version__, file=fh)

    with open(prefix + ".acf.txt", "w") as fh:
        acf_vals = acf.write_acf(acf_vals, fh)
        # progress note belongs on stderr, not inside the ACF file
        print("wrote: %s" % fh.name, file=sys.stderr)
    with open(prefix + ".acf.txt") as acf_fh:
        print("ACF:\n", acf_fh.read(), file=sys.stderr)

    spvals, opvals = array.array('f'), array.array('f')
    with ts.nopen(prefix + ".slk.bed.gz", "w") as fhslk:
        fhslk.write('#chrom\tstart\tend\tp\tregion-p\n')
        for chrom, results in slk.adjust_pvals(bed_files, col_num, acf_vals):
            fmt = chrom + "\t%i\t%i\t%.4g\t%.4g\n"
            for row in results:
                row = tuple(row)
                fhslk.write(fmt % row)
                opvals.append(row[-2])
                spvals.append(row[-1])

    print("# original lambda: %.2f" % genomic_control(opvals), file=sys.stderr)
    del opvals

    gc_lambda = genomic_control(spvals)
    print("wrote: %s with lambda: %.2f" % (fhslk.name, gc_lambda),
            file=sys.stderr)

    if genome_control:
        # fhslk is rebound so that the FDR step below reads the adjusted file
        fhslk = ts.nopen(prefix + ".slk.gc.bed.gz", "w")
        adj = genome_control_adjust([d['p'] for d in bediter(prefix + ".slk.bed.gz", -1)])
        for i, line in enumerate(ts.nopen(prefix + ".slk.bed.gz")):
            print("%s\t%.5g" % (line.rstrip("\r\n"), adj[i]), file=fhslk)

        fhslk.close()
        print("wrote: %s" % fhslk.name, file=sys.stderr)

    with ts.nopen(prefix + ".fdr.bed.gz", "w") as fh:
        fh.write('#chrom\tstart\tend\tp\tregion-p\tregion-q\n')
        for bh, l in fdr.fdr(fhslk.name, -1):
            fh.write("%s\t%.4g\n" % (l.rstrip("\r\n"), bh))
        print("wrote: %s" % fh.name, file=sys.stderr)
    fregions = prefix + ".regions.bed.gz"
    with ts.nopen(fregions, "w") as fh:
        list(peaks.peaks(prefix + ".fdr.bed.gz", -1 if use_fdr else -2, threshold, seed,
            dist, fh, operator.le))
    n_regions = sum(1 for _ in ts.nopen(fregions))
    print("wrote: %s (%i regions)" % (fregions, n_regions), file=sys.stderr)
    if n_regions == 0:
        sys.exit()

    with ts.nopen(prefix + ".regions-p.bed.gz", "w") as fh:
        N = 0
        fh.write("#chrom\tstart\tend\tmin_p\tn_probes\tz_p\tz_sidak_p\n")
        # use -2 for original, uncorrected p-values in slk.bed
        for region_line, slk_p, slk_sidak_p, sim_p in region_p.region_p(
                               prefix + ".slk.bed.gz",
                               prefix + ".regions.bed.gz", -2,
                               step):
            fh.write("%s\t%.4g\t%.4g\n" % (region_line, slk_p, slk_sidak_p))
            fh.flush()
            N += int(slk_sidak_p < 0.05)
        print("wrote: %s, (regions with corrected-p < 0.05: %i)" \
                % (fh.name, N), file=sys.stderr)

    regions_bed = fh.name
    if region_filter_n is None: region_filter_n = 0
    with ts.nopen(prefix + ".regions-t.bed", "w") as fh:
        N = 0
        for i, toks in enumerate(filter.filter(bed_files[0],
            regions_bed, p_col_name=col_num)):
            # the first row is the header: mark it as a comment line
            if i == 0: toks[0] = "#" + toks[0]
            else:
                if float(toks[6]) > region_filter_p: continue
                if int(toks[4]) < region_filter_n: continue

                N += 1
            # rows go into the regions-t file, which is read back below
            print("\t".join(toks), file=fh)
        print(("wrote: %s, (regions with region-p "
                            "< %.3f and n-probes >= %i: %i)") \
                % (fh.name, region_filter_p, region_filter_n, N),
                file=sys.stderr)

    try:
        from cpv import manhattan
        regions = manhattan.read_regions(fh.name)

        manhattan.manhattan(prefix + ".slk.bed.gz", 3, prefix.rstrip(".") + ".manhattan.png",
                         False, ['#959899', '#484B4C'], "", False, None,
                         regions=regions, bonferonni=False)
    except ImportError:
        pass # they dont have matplotlib


    if db is not None:
        from cruzdb import Genome
        g = Genome(db)
        lastf = fh.name
        with open(prefix + ".anno.%s.bed" % db, "w") as fh:
            fh.write('#')
            g.annotate(lastf, ("refGene", "cpgIslandExt"), out=fh,
                    feature_strand=True, parallel=len(spvals) > 500)
        print("wrote: %s annotated with %s" % (fh.name, db), file=sys.stderr)
Exemplo n.º 16
0
# Project directory layout (absolute paths on the author's machine).
DIR_PROJ = "/home/mokha/Documents/Krauthammer_Lab"
DIR_CURR = DIR_PROJ + "/PythonClasses/SVSv5"
DIR_DATA = DIR_CURR + "/TestData"
DIR_RESULTS = DIR_CURR + "/TestResults"
# Alternative result directories from earlier analyses:
# DIR_RESULTS = DIR_CURR + "/Results/160729_Analyze_KF"
# DIR_RESULTS = DIR_CURR + "/Results/160731_Analyze_KF"
# DIR_RESULTS = DIR_CURR + "/Results/160909_Analyze_KF"

DIR_FUSION = DIR_PROJ + "/160510_GeneFusions"

print "------------ TDD: 161002_IsoformFusion_1.py ------------"

# Set the kinase gene annotation file.
KinaseFusion.set_kinasefile( DIR_FUSION + "/Data/160910_KinaseAnnots_hg38_Final.txt" )
# Open the local SQLite database at /tmp/hg38_v2.db as a Genome ...
obj_cruzdb = Genome( 'sqlite:////tmp/hg38_v2.db' )
# ... and register that Genome instance with Isoform.
Isoform.set_cruzdb( obj_cruzdb )

# CASE: This returns "None" for the kinase domain for the kinase gene (TLK2 - NM_001284363)
# Fusion - ASIC2:TLK2
# Both breakpoints are on chr17; every read count is set to 5.
hash_multi_isoform = { "orientation": 'fr',
"chrom_start": 'chr17',
"chrom_end": 'chr17',
"pos_start": 34038904,
"pos_end": 62565136,
"read_span": 5, 
"read_matepair": 5,
"read_matepair_break": 5 }

obj_mif = MultiIsoformFusion( hash_multi_isoform )      #MIF = MultiIsoform Fusion instance
Exemplo n.º 17
0
import os.path as op
from toolshed import reader
from cruzdb import Genome

def lamina():
    """Return the path 'lamina.bed', creating the file on first use.

    When the file does not exist yet, the table published at nature.com is
    read with ``reader`` and fields 0, 3, 4 and 5 of every row are written
    under the header chrom, start, end, value.
    """
    if not op.exists('lamina.bed'):
        # the with-block closes the file even when the download fails
        with open('lamina.bed', 'w') as fh:
            fh.write("#chrom\tstart\tend\tvalue\n")
            for gff in reader('http://www.nature.com/nature/journal/v453/n7197/extref/nature06947-s2.txt', header=False):
                fh.write("\t".join([gff[0], gff[3], gff[4], gff[5]]) + "\n")
    return 'lamina.bed'

# Annotate the lamina intervals with refGene once; the annotated file is
# reused on later runs.
fname = 'supplement/Additional-File-11_lamina.anno.bed'
hg18 = Genome('sqlite:///hg18.db')
if not op.exists(fname):
    fhout = open(fname, 'w')
    hg18.annotate(lamina(), ('refGene', ), feature_strand=True, in_memory=True, parallel=True, out=fhout)
    fhout.close()


# For each cutoff, write the gene names of the intervals whose value reaches
# the cutoff and whose refGene distance is 0 (alone or first of a ';' list).
for cutoff in (0.90, 0.95):
    fh = open('/tmp/genes-%.2f.txt' % cutoff, 'w')
    for d in reader(fname):
        if float(d['value']) < cutoff: continue
        if d['refGene_distance'] == '0' or \
           d['refGene_distance'].startswith("0;"):
            # names are ';'-separated; write one per line
            print >>fh, "\n".join(d['refGene_name'].split(";"))
    fh.close()

# reset to the first cutoff for whatever follows
cutoff = 0.90
Exemplo n.º 18
0
 def test_bins(self):
     """Check the bin ids computed for the interval 12345-56779."""
     observed = Genome.bins(12345, 56779)
     wanted = set((1, 9, 73, 585))
     self.assertEqual(observed, wanted)
Exemplo n.º 19
0
		submitters='1000GENOMES,ABI,BCM-HGSC-SUB,BCM_SSAHASNP,BGI,BL,BUSHMAN,COMPLETE_GENOMICS,DDI,ENSEMBL,EVA-GONL,EVA_DECODE,EVA_GENOME_DK,EVA_UK10K_ALSPAC,EVA_UK10K_TWINSUK,GMI,HAMMER_LAB,HGSV,HUMANGENOME_JCVI,ILLUMINA-UK,JMKIDD_LAB,PJP,SSAHASNP,SSMP,TISHKOFF,WEILL_CORNELL_DGM,',	
		alleleFreqCount=2,	
		alleles='C,T,',	
		alleleNs='2634.000000,2374.000000,',	
		alleleFreqs='0.525958,0.474042,',	
		bitfields=set(['maf-5-all-pops', 'maf-5-some-pop'])	
	)	
	'''

    # snps
    reader = TsvReader(snpfile, cnames=False)
    snplist = list(set(r[snpcol] for r in reader))
    reader.close()

    from cruzdb import Genome
    g = Genome(genome)
    outfiletmp = outfile + '.tmp'
    writer = TsvWriter(outfiletmp)
    for i in range(0, len(snplist), 1000):
        chunk = snplist[i:i + 1000]
        sql = 'SELECT chrom, chromStart, chromEnd, name, score, strand, refUCSC, alleles, alleleFreqs FROM snp{dbsnpver} WHERE name in ({snps})'.format(
            dbsnpver=dbsnpver, snps=', '.join("'{}'".format(s) for s in chunk))
        result = g.sql(sql)
        for r in result:
            allfreqs = dict(zip(r.alleles.split(','),
                                r.alleleFreqs.split(',')))
            reffreq = allfreqs.get(r.refUCSC, '0')
            if r.refUCSC in allfreqs:
                del allfreqs[r.refUCSC]
            if '' in allfreqs:
                del allfreqs['']
Exemplo n.º 20
0
        for ii, record in enumerate(page_query(table_obj.select(), t.session)):
            data = dict(
                (str(column), getattr(record, column)) for column in columns)
            records.append(data)
            if ii % 20000 == 0 and ii > 0:
                destination.execute(ins, records)
                print >> sys.stderr, "processing record %i" % ii
                destination.commit()
                records = []
        destination.execute(ins, records)
        destination.commit()
        orig_counts.append(getattr(genome, table_name).count())

    destination, dengine = make_session(connection_string)
    from . import Genome
    newg = Genome(connection_string)
    new_counts = [getattr(newg, table_name).count() for table_name in tables]
    for tbl, oc, nc in zip(tables, orig_counts, new_counts):
        if oc != nc:
            print >> sys.stderr, "ERROR: mirrored table '%s' has %i \
            rows while the original had %i" % (tbl, nc, oc)
    return newg


if __name__ == "__main__":
    # manual check: mirror chromInfo from hg18 into the SQLite file /tmp/u.db
    from cruzdb import Genome
    g = Genome('hg18')
    mirror(g, ['chromInfo'], 'sqlite:////tmp/u.db')
Exemplo n.º 21
0
{% if args.header %}
skip = {{args.skip}} + 1
{% else %}
skip = {{args.skip}}
{% endif %}

# get the snps
delimit = {{args.delimit | quote}}
comment = {{args.comment | quote}}
col     = {{args.col}}
with open({{i.snpfile | quote}}) as f:
	snps = sorted(set([line.split(delimit)[col] for i, line in enumerate(f.read().splitlines()) if i >= skip and line.strip() and not line.startswith(comment)]))

genome = {{args.genome | quote}}

g     = Genome (db=genome)
dbsnp = g.snp{{args.dbsnpver}}

fout  = open ("{{o.outfile}}", "w")
for snp in snps:
	s = dbsnp.filter_by(name=snp).first()
	if not s:
		sys.stderr.write('pyppl.log.warning: Cannot find coordinates for SNP: %s\n' % snp)
	else:
		# chr start end  name score strand otherinfo
		chrom   = s.chrom
		start   = s.chromStart
		end     = s.chromEnd
		name    = snp
		strand  = s.strand
		ref     = s.refUCSC
print vcf_ex.head()

# Rename columns to fix the syntax in chromosome number
names = vcf_ex.columns.values
new_names = ['CHROM']
new_names.extend(names[1:])
print "\n", new_names
vcf_ex.columns = new_names

# If QUAL > 0.5, sample passes
vcf_ex_sub = vcf_ex.loc[vcf_ex.QUAL > 0.5, ['CHROM', 'POS']].copy()
print vcf_ex_sub.head()

# Get the Genome object from cruzdb
# connects to MySQL genome browser at UCSC
g = Genome('hg38')

# Convert table 'refGene' to pandas dataframe
# columns of interest 'chrom' (chrX, %s), 'txStart' (number, %s), 'txEnd' (number , %s)
print "Extracting reference genome table (HG38) from UCSC Genome Browser"
df = g.dataframe('refGene')
df[['txStart', 'txEnd']] = df[['txStart', 'txEnd']].astype(int)

genes = pd.Series(np.zeros(vcf_ex_sub.shape[0]))
#gene = hg19.bin_query('refGene', vcf_ex_sub.CHROM[1], vcf_ex_sub.POS[1], vcf_ex_sub.POS[1])
#print vcf_ex_sub.POS.iloc[0]

for i in range(0, vcf_ex_sub.shape[0]):
    #genes[i] = df[[df.chrom == str(vcf_ex_sub.CHROM.iloc[i]) and df.txStart >= str(vcf_ex_sub.POS.iloc[i])]].bool() #and df.txStart >= str(vcf_ex_sub.POS.iloc[i]) and df.txEnd <= vcf_ex_sub.POS.iloc[i]].bool(),
    chrom = vcf_ex_sub['CHROM'].iloc[i]
    location = vcf_ex_sub['POS'].iloc[i]
Exemplo n.º 23
0
import csv
from cruzdb import Genome

# DEPENDENCIES:

# python2 -m pip install --upgrade pip
# python2 -m pip install setuptools # first 2 may be already done
# python2 -m pip install cruzdb
# python2 -m pip install sqlalchemy

# MySQLdb stuff:
# sudo apt-get install mysql-server
# sudo apt-get install libmysqlclient-dev # gives us mysql_config
# FINALLY download MySQLdb source, do process described in INSTALL file

# Open the hg19 genome through cruzdb.
hg19 = Genome('hg19')
INPUTFILE = "suggestive.pheno_simple.covar_none.test_wald.csv"
# NOTE(review): the handle opened here is not closed in the visible code.
filereader = csv.reader(open(INPUTFILE))

# Column positions of CHROM and POS; filled in from the header row.
chrom_i = None
pos_i = None
for i, line in enumerate(filereader):
    if i == 0:
        # Header row, expected to look like:
        # CHROM POS REF ALT N_INFORMATIVE Test Beta SE Pvalue PVALUE
        chrom_i = line.index('CHROM')
        pos_i = line.index('POS')
        continue
    # The chromosome value gets a 'chr' prefix.
    chrom = 'chr' + str(line[chrom_i])
    pos = int(line[pos_i])
    start = pos - 50  # kind of arbitrary search 50 back 50 forward.
    end = pos + 50
Exemplo n.º 24
0
#!/usr/bin/env python

from cruzdb import Genome
import sys
import re
import tqdm
import pandas as pd
# Look up dbSNP rows for a list of rs ids and write them to a tab-separated
# file.  Usage: <script> BUILD RSID_FILE OUTPUT_FILE
limit = 1000  # number of ids sent per query
build = sys.argv[1]
fname = sys.argv[2]
mybuild = Genome(build)
# pick the dbSNP table that goes with the requested build
if "hg19" in build:
    snpdb = mybuild.snp138
elif "hg18" in build:
    snpdb = mybuild.snp130
else:
    # any other build used to fall through and fail later with a NameError
    sys.exit("unsupported build %r: expected a name containing hg19 or hg18"
             % build)
with open(sys.argv[3], "w") as f:
    f.write("chrom\tstart\tend\tname\tscore\tstrand\n")
    rs_ids = pd.read_csv(fname, names=["id"])
    for index in tqdm.tqdm(range(0, rs_ids.shape[0], limit)):
        chunk = list(rs_ids["id"][index:index + limit])
        # NOTE(review): limit() also caps the number of returned rows;
        # confirm that an id never maps to more than one row.
        result = snpdb.filter(snpdb.name.in_(chunk)).limit(limit).all()
        # skip chunks without hits so the output contains no blank lines
        if result:
            f.write("\n".join(map(str, result)))
            f.write("\n")
Exemplo n.º 25
0
def pipeline(col_num, step, dist, prefix, threshold, seed, bed_files, mlog=False,
    region_filter_p=1, region_filter_n=1, genome_control=False, db=None):
    """Run every analysis step on `bed_files`; outputs are named `prefix`.*.

    Steps, in order: autocorrelation (``.acf.txt``), adjusted p-values
    (``.slk.bed``), optional genomic-control adjustment (``.slk.gc.bed``),
    FDR (``.fdr.bed``), peak finding (``.regions.bed``), region p-values
    (``.regions-p.bed``), region filtering (``.regions-t.bed``, only when the
    first input has 't', 'start' and 'end' columns), a manhattan plot when
    ``cpv.manhattan`` can be imported, and an optional cruzdb annotation.

    Parameters:
        col_num: p-value column, passed on to the cpv steps.
        step: lag step; computed with ``stepsize.stepsize`` when None.
        dist: upper bound of the lags used for the autocorrelation.
        prefix: output prefix; trailing dots are stripped.
        threshold, seed: passed to ``peaks.peaks``.
        bed_files: input BED files; the first one is also used for filtering.
        mlog: passed to ``acf.acf`` and ``region_p.region_p``.
        region_filter_p: rows whose column 6 exceeds this are dropped.
        region_filter_n: rows whose column 4 is below this are dropped.
        genome_control: when true, write the adjusted file and run the FDR
            step on it.
        db: when not None, annotate the last file with ``Genome(db)``.
    """
    sys.path.insert(0, op.join(op.dirname(__file__), ".."))
    from cpv import acf, slk, fdr, peaks, region_p, stepsize, filter
    from cpv._common import genome_control_adjust, genomic_control, bediter
    import operator


    if step is None:
        step = stepsize.stepsize(bed_files, col_num)
        print >>sys.stderr, "calculated stepsize as: %i" % step

    lags = range(1, dist, step)
    lags.append(lags[-1] + step)

    prefix = prefix.rstrip(".")
    #if genome_control:
    #    with open(prefix + ".adj.bed", "w") as fh:
    #        genome_control_adjust_bed(bed_files, col_num, fh)
    #    bed_files = [fh.name]
    putative_acf_vals = acf.acf(bed_files, lags, col_num, simple=False,
                                mlog=mlog)
    acf_vals = []
    # go out to max requested distance but stop at the first autocorrelation
    # below 0.04: it is still kept unless more than two lags were collected.
    # NOTE(review): older notes here mentioned 0.05 and 0.08; the code uses 0.04.
    for a in putative_acf_vals:
        # a is ((lmin, lmax), (corr, N))
        if a[1][0] < 0.04 and len(acf_vals) > 2: break
        acf_vals.append(a)
        if a[1][0] < 0.04 and len(acf_vals): break

    # save the arguments that this was called with.
    with open(prefix + ".args.txt", "w") as fh:
        print >>fh, " ".join(sys.argv[1:]) + "\n"
        import datetime
        print >>fh, "date: %s" % datetime.datetime.today()

    with open(prefix + ".acf.txt", "w") as fh:
        acf_vals = acf.write_acf(acf_vals, fh)
        print >>sys.stderr, "wrote: %s" % fh.name
    # NOTE(review): the handle opened for this read is never closed explicitly
    print >>sys.stderr, "ACF:\n", open(prefix + ".acf.txt").read()

    # opvals collects row[-2] and spvals row[-1] of every adjusted row
    spvals, opvals = [], []
    with open(prefix + ".slk.bed", "w") as fhslk:

        for row in slk.adjust_pvals(bed_files, col_num, acf_vals):
            fhslk.write("%s\t%i\t%i\t%.4g\t%.4g\n" % row)
            opvals.append(row[-2])
            spvals.append(row[-1])

    print >>sys.stderr, "# original lambda: %.2f" % genomic_control(opvals)
    del opvals

    gc_lambda = genomic_control(spvals)
    print >>sys.stderr, "wrote: %s with lambda: %.2f" % (fhslk.name, gc_lambda)

    if genome_control:
        # fhslk is rebound so that the FDR step below reads the adjusted file
        fhslk = open(prefix + ".slk.gc.bed", "w")
        adj = genome_control_adjust([d['p'] for d in bediter(prefix + ".slk.bed", -1)])
        for i, line in enumerate(open(prefix + ".slk.bed")):
            print >>fhslk, "%s\t%.5g" % (line.rstrip("\r\n"), adj[i])

        fhslk.close()
        print >>sys.stderr, "wrote: %s" % fhslk.name

    with open(prefix + ".fdr.bed", "w") as fh:
        for bh, l in fdr.fdr(fhslk.name, -1):
            fh.write("%s\t%.4g\n" % (l.rstrip("\r\n"), bh))
        print >>sys.stderr, "wrote: %s" % fh.name

    fregions = prefix + ".regions.bed"
    with open(fregions, "w") as fh:
        # list() drains the generator; peaks.peaks is handed fh as its output
        list(peaks.peaks(prefix + ".fdr.bed", -1, threshold, seed,
            step, fh, operator.le))
    n_regions = sum(1 for _ in open(fregions))
    print >>sys.stderr, "wrote: %s (%i regions)" % (fregions, n_regions)

    with open(prefix + ".regions-p.bed", "w") as fh:
        N = 0
        fh.write("#chrom\tstart\tend\tmin_p\tn_probes\tslk_p\tslk_sidak_p\n")
        # use -2 for original, uncorrected p-values in slk.bed
        for region_line, slk_p, slk_sidak_p, sim_p in region_p.region_p(
                               prefix + ".slk.bed",
                               prefix + ".regions.bed", -2,
                               0, step, mlog=mlog):
            fh.write("%s\t%.4g\t%.4g\n" % (region_line, slk_p, slk_sidak_p))
            fh.flush()
            N += int(slk_sidak_p < 0.05)
        print >>sys.stderr, "wrote: %s, (regions with corrected-p < 0.05: %i)" \
                % (fh.name, N)

    regions_bed = fh.name
    # NOTE(review): the header line is split without stripping its line
    # ending, so a 't', 'start' or 'end' column in last position would
    # presumably not match below - confirm.
    header = (gzip.open(bed_files[0]) if bed_files[0].endswith(".gz")
            else open(bed_files[0])).next().split("\t")
    if all(h in header for h in ('t', 'start', 'end')):
        with open(prefix + ".regions-t.bed", "w") as fh:
            N = 0
            for i, toks in enumerate(filter.filter(bed_files[0], regions_bed,
                p_col_name=col_num)):
                # the first row is the header: mark it as a comment line
                if i == 0: toks[0] = "#" + toks[0]
                else:
                    if float(toks[6]) > region_filter_p: continue
                    if int(toks[4]) < region_filter_n: continue
                    N += 1
                print >>fh, "\t".join(toks)
            print >>sys.stderr, ("wrote: %s, (regions with region-p"
                                "< %.3f and n-probes >= %i: %i)") \
                    % (fh.name, region_filter_p, region_filter_n, N)

    try:
        from cpv import manhattan
        # fh is the regions-t file when the block above ran, otherwise it is
        # still the regions-p file
        regions = manhattan.read_regions(fh.name)

        manhattan.manhattan(prefix + ".slk.bed", 3, prefix.rstrip(".") + ".manhattan.png",
                         False, ['#959899', '#484B4C'], "", False, None,
                         regions=regions, bonferonni=True)
    except ImportError:
        pass # they dont have matplotlib


    if db is not None:
        from cruzdb import Genome
        g = Genome(db)
        lastf = fh.name
        with open(prefix + ".anno.%s.bed" % db, "w") as fh:
            g.annotate(lastf, ("refGene", "cpgIslandExt", "cytoBand"), out=fh,
                    feature_strand=True, parallel=len(spvals) > 500)
        print >>sys.stderr, "wrote: %s annotated with %s" % (fh.name, db)
Exemplo n.º 26
0
def getSnpInfo(x):
    """Return the first row of hg19's snp151 table whose name equals `x`.

    A new Genome is opened on every call; the result is whatever ``first()``
    yields for the filtered table.
    """
    from cruzdb import Genome
    return Genome(db="hg19").snp151.filter_by(name=x).first()
Exemplo n.º 27
0
 def setUp(self):
     """Open the hg18 genome and keep it on the test case as ``db``."""
     genome = Genome('hg18')
     self.db = genome
Exemplo n.º 28
0
def snpinfo(infile,
            outfile=None,
            notfound='ignore',
            genome='hg19',
            dbsnpver="150",
            inopts=None,
            outopts=None,
            snpcol=None,
            cachedir=gettempdir()):
    """Look up dbSNP records for the SNP names listed in `infile`.

    Names are first looked up in a local cache database; the ones missing
    there are fetched one by one from the ``snp<dbsnpver>`` table of
    `genome` and added to the cache afterwards.

    Parameters:
        infile: file with the SNP names, read with TsvReader.
        outfile: optional output file; when given, every record is also
            written to it with TsvWriter.
        notfound: what to do with a name that is not in dbSNP: 'error'
            raises RecordNotFound, 'skip' drops it silently, anything else
            reports it on stderr and continues.
        genome: genome build passed to Genome.
        dbsnpver: dbSNP version; selects the table ``snp<dbsnpver>``.
        inopts: overrides for the reader options (skip, comment, delimit).
        outopts: overrides for the writer options (delimit, head*, ftype,
            cnames).
        snpcol: column holding the SNP names; defaults to the first column
            of the reader's meta.
        cachedir: directory of the cache database.

    Returns:
        dict mapping SNP name to its record (cached and newly fetched).

    Raises:
        RecordNotFound: a name is missing and `notfound` is 'error'.
    """
    _inopts = Box(skip=0, comment='#', delimit='\t')
    _inopts.update(inopts or {})
    inopts = _inopts

    _outopts = Box(delimit='\t',
                   headDelimit='\t',
                   headPrefix='',
                   headTransform=None,
                   head=True,
                   ftype='bed',
                   cnames='refUCSC, alleles, alleleFreqs, alleleFreqCount')
    # outopts defaults to None, which update() would reject
    _outopts.update(outopts or {})
    outopts = _outopts
    cnames = alwaysList(outopts['cnames'])

    reader = TsvReader(infile, **inopts)
    if not reader.meta: reader.autoMeta()
    snpcol = snpcol or reader.meta.keys()[0]

    snps = list(set([r[snpcol] for r in reader]))
    reader.rewind()

    dbfile = path.join(cachedir, 'snpinfo_%s_%s.db' % (genome, dbsnpver))
    # NOTE(review): the types 'end' (chromEnd) and 'single' (class) look like
    # example values pasted over the column type - confirm before changing,
    # since existing cache files were created with this schema.
    schema = {
        'chrom': 'text',  # chr8
        'chromStart': 'int',  # 128700232L
        'chromEnd': 'end',  # 128700233L
        'name': 'text primary key',  # rs7005394
        'score': 'real',  # 0
        'strand': 'text',  # +
        'refNCBI': 'text',  # T
        'refUCSC': 'text',  # T
        'observed': 'text',  # C/T
        'class': 'single',  # single
        'avHet': 'real',  # 0.49
        'avHetSE': 'real',  # 0.02
        'func': 'text',  # set(['ncRNA'])
        'submitterCount': 'int',  # 20
        'submitters':
        'text',  # 1000GENOMES,ABI,BCM-HGSC-SUB,BCM_SSAHASNP,BGI,BL ...
        'alleleFreqCount': 'int',  # 2
        'alleles': 'text',  # C,T,
        'alleleNs': 'text',  # 2634.000000,2374.000000,
        'alleleFreqs': 'text',  # 0.525958,0.474042,
    }
    cache = Cache(dbfile, 'snpinfo', schema, 'name')
    # 'func' holds several values; they are stored as one ' // '-joined string
    dummies = {
        'func':
        dict(query=Cache.DUMMY['array']['query'],
             find=Cache.DUMMY['array']['find'],
             insert=lambda col, data:
             (col, ' // ' + ' // '.join(Cache._uniqueData(list(data), True))),
             # uses the lambda's own argument (was an undefined name `d`)
             update=lambda col, data:
             (col,
              Function.concat(Field(col),
                              value=' // ' + ' // '.join(
                                  Cache._uniqueData(list(data), True)))),
             result=lambda data: data if isinstance(data, list) else list(
                 filter(None, data.split(' // '))))
    }
    columns = ['chrom', 'name', 'chromStart', 'chromEnd', 'score', 'strand'
               ] + cnames
    # ret: records found in the cache; allrest: what still has to be fetched
    ret, allrest = cache.query(columns, {'name': snps}, dummies)

    dbsnp = Genome(db=genome)
    dbsnp = getattr(dbsnp, "snp%s" % dbsnpver)

    writer = None
    if outfile:
        # the head* options steer the header only; remove them before the
        # remaining options are handed to TsvWriter
        head = outopts['head']
        headPrefix = outopts['headPrefix']
        headDelimit = outopts['headDelimit']
        headTransform = outopts['headTransform']
        del outopts['head']
        del outopts['headPrefix']
        del outopts['headDelimit']
        del outopts['headTransform']
        writer = TsvWriter(outfile, **outopts)
        if head:
            writer.writeHead(prefix=headPrefix,
                             delimit=headDelimit,
                             transform=headTransform)

    if writer:
        # cached records: add the upper-case fields before writing
        for r in ret.values():
            r.CHR = r.chrom
            r.START = r.chromStart
            r.END = r.chromEnd
            r.NAME = r.name
            r.SCORE = r.score
            r.STRAND = r.strand
            writer.write(r)

    cached = []
    if allrest:
        for snp in allrest['name']:
            s = dbsnp.filter_by(name=snp).first()
            if not s:
                if notfound == 'error':
                    raise RecordNotFound('Record not found: %s' % snp)
                elif notfound == 'skip':
                    continue
                else:
                    stderr.write('Record not found: %s \n' % snp)
                    continue
            cached.append(s)
            if writer:
                r = TsvRecord()
                r.CHR = s.chrom
                r.START = s.chromStart
                r.END = s.chromEnd
                r.NAME = s.name
                r.SCORE = s.score
                r.STRAND = s.strand
                for cname in cnames:
                    setattr(r, cname, getattr(s, cname))
                writer.write(r)

    # save cached data, column-wise: one list of values per schema key
    cachedata = {}
    for c in cached:
        for k in schema.keys():
            if k not in cachedata:
                cachedata[k] = []
            cachedata[k].append(getattr(c, k))
    if cachedata:
        cache.save(cachedata, dummies)
    return {r.name: r for r in ret.values() + cached}
Exemplo n.º 29
0
from cruzdb import Genome
import time
from toolshed import nopen
import os

anno_file = "data_c_constant_early.bed"
# sub-sample to get fewer rows.
list(nopen("|awk 'NR == 1 || NR % 4 == 0'" +(" %s > %s.some" % (anno_file, anno_file))))
anno_file += ".some"
nlines = sum(1 for _ in nopen(anno_file))

print "loc\tinstance\tparallel\ttime"
for parallel in (True, False):
    for name, args in (('local\tsqlite', ('sqlite:///hg18.db',)),
                       ('remote\tmysql', ('hg18',)),
                       ('local\tmysql', ('hg18', 'brentp', 'localhost'))
                       ):
        g = Genome(*args)

        out = "%s-%s.anno.txt" % (name.replace("\t", "-"), parallel)

        t0 = time.time()
        g.annotate(anno_file, ('refGene',), out=out, feature_strand=True,
                parallel=parallel)
        t1 = time.time()
        print "\t".join(map(str, (name, parallel, ("%.1f" % (t1 - t0)))))
        assert nlines == sum(1 for _ in nopen(out))
        os.unlink(out)
Exemplo n.º 30
0
import sys
from cruzdb import Genome

db = sys.argv[1]

#db = Genome('sqlite:////usr/local/src/cruzdb/%s.db' % db)
db = Genome(db)
refGene = db.refGene

for g in refGene.all():
    for feat in g.gene_features:
        print "\t".join(map(str, feat))
Exemplo n.º 31
0
def pipeline(col_num,
             step,
             dist,
             acf_dist,
             prefix,
             threshold,
             seed,
             bed_files,
             mlog=True,
             region_filter_p=1,
             region_filter_n=None,
             genome_control=False,
             db=None,
             use_fdr=True):
    """Run the whole analysis over `bed_files`, writing one file per stage.

    Every output path is built from `prefix` (trailing "." stripped) and
    progress is reported on stderr. Stages, in order:

      1. acf.acf / acf.write_acf      -> <prefix>.acf.txt
         (call arguments are also saved to <prefix>.args.txt)
      2. slk.adjust_pvals             -> <prefix>.slk.bed.gz
         and, when `genome_control` is true, <prefix>.slk.gc.bed.gz
      3. fdr.fdr                      -> <prefix>.fdr.bed.gz
      4. peaks.peaks                  -> <prefix>.regions.bed.gz
         (the process exits via sys.exit() when no regions are found)
      5. region_p.region_p            -> <prefix>.regions-p.bed.gz
      6. filter.filter                -> <prefix>.regions-t.bed
      7. manhattan.manhattan          -> <prefix>.manhattan.png
         (skipped silently on ImportError)
      8. Genome(db).annotate          -> <prefix>.anno.<db>.bed
         (only when `db` is not None)

    Parameters:
      col_num: column selector forwarded to stepsize, acf, slk and (as
          `p_col_name`) to filter -- presumably the p-value column; confirm
          against those helpers.
      step: lag step; when None it is computed as
          min(acf_dist, stepsize.stepsize(bed_files, col_num)).
      dist, threshold, seed: forwarded unchanged to peaks.peaks.
      acf_dist: upper bound used to build the list of lags.
      prefix: output path prefix.
      bed_files: input files; bed_files[0] is additionally used for the
          final region filter.
      mlog: forwarded unchanged to acf.acf.
      region_filter_p: rows whose float(toks[6]) exceeds this are dropped
          from <prefix>.regions-t.bed.
      region_filter_n: rows whose int(toks[4]) is below this are dropped;
          None is treated as 0.
      genome_control: when true, also write the genome-control adjusted slk
          file; the fdr stage then reads that file instead.
      db: value passed to cruzdb.Genome for the annotation stage.
      use_fdr: selects column -1 (true) or -2 (false) for peaks.peaks.

    Returns None.

    NOTE: this is Python 2 code (print >> statements; `range` must return a
    list because it is appended to).
    """
    # make the parent directory of this file importable so `cpv` resolves
    sys.path.insert(0, op.join(op.dirname(__file__), ".."))
    from cpv import acf, slk, fdr, peaks, region_p, stepsize, filter
    from cpv._common import genome_control_adjust, genomic_control, bediter
    import operator

    if step is None:
        step = min(acf_dist, stepsize.stepsize(bed_files, col_num))
        print >> sys.stderr, "calculated stepsize as: %i" % step

    # lags run from 1 up to acf_dist in increments of step, plus one extra lag
    # one step beyond the last.
    # NOTE(review): lags[-1] raises IndexError when the range is empty
    # (acf_dist <= 1).
    lags = range(1, acf_dist, step)
    lags.append(lags[-1] + step)

    prefix = prefix.rstrip(".")
    putative_acf_vals = acf.acf(bed_files,
                                lags,
                                col_num,
                                simple=False,
                                mlog=mlog)
    acf_vals = []
    # go out to max requested distance but stop once an autocorrelation
    # < 0.05 is added.
    # NOTE(review): the comments here mention 0.05 and 0.08, but the cutoff
    # actually used in the code below is 0.04.
    for a in putative_acf_vals:
        # a is ((lmin, lmax), (corr, N))
        # this heuristic seems to work. stop just above the 0.08 correlation
        # lag.
        if a[1][0] < 0.04 and len(acf_vals) > 2: break
        acf_vals.append(a)
        # acf_vals is never empty after the append, so this breaks as soon as
        # a value below the cutoff has been added.
        if a[1][0] < 0.04 and len(acf_vals): break

    # save the arguments that this was called with.
    with open(prefix + ".args.txt", "w") as fh:
        print >> fh, " ".join(sys.argv[1:]) + "\n"
        import datetime
        print >> fh, "date: %s" % datetime.datetime.today()
        from .__init__ import __version__
        print >> fh, "version:", __version__

    with open(prefix + ".acf.txt", "w") as fh:
        # acf_vals is rebound to whatever write_acf returns; that value is
        # what slk.adjust_pvals receives below.
        acf_vals = acf.write_acf(acf_vals, fh)
        print >> sys.stderr, "wrote: %s" % fh.name
    # NOTE(review): this file handle is never explicitly closed.
    print >> sys.stderr, "ACF:\n", open(prefix + ".acf.txt").read()

    # spvals collects the last field of each slk row, opvals the one before
    # it (written under the "region-p" and "p" header columns respectively).
    spvals, opvals = [], []
    with ts.nopen(prefix + ".slk.bed.gz", "w") as fhslk:
        fhslk.write('#chrom\tstart\tend\tp\tregion-p\n')
        for row in slk.adjust_pvals(bed_files, col_num, acf_vals):
            fhslk.write("%s\t%i\t%i\t%.4g\t%.4g\n" % row)
            opvals.append(row[-2])
            spvals.append(row[-1])

    print >> sys.stderr, "# original lambda: %.2f" % genomic_control(opvals)
    del opvals

    gc_lambda = genomic_control(spvals)
    print >> sys.stderr, "wrote: %s with lambda: %.2f" % (fhslk.name,
                                                          gc_lambda)

    if genome_control:
        # fhslk is rebound here, so the fdr stage below (which reads
        # fhslk.name) picks up the genome-control adjusted file.
        fhslk = ts.nopen(prefix + ".slk.gc.bed.gz", "w")
        adj = genome_control_adjust(
            [d['p'] for d in bediter(prefix + ".slk.bed.gz", -1)])
        # append the adjusted value as an extra column on every line
        for i, line in enumerate(ts.nopen(prefix + ".slk.bed.gz")):
            print >> fhslk, "%s\t%.5g" % (line.rstrip("\r\n"), adj[i])

        fhslk.close()
        print >> sys.stderr, "wrote: %s" % fhslk.name

    with ts.nopen(prefix + ".fdr.bed.gz", "w") as fh:
        fh.write('#chrom\tstart\tend\tp\tregion-p\tregion-q\n')
        # each input line is copied with the value from fdr.fdr appended
        for bh, l in fdr.fdr(fhslk.name, -1):
            fh.write("%s\t%.4g\n" % (l.rstrip("\r\n"), bh))
        print >> sys.stderr, "wrote: %s" % fh.name
    fregions = prefix + ".regions.bed.gz"
    with ts.nopen(fregions, "w") as fh:
        # list() drains the iterable returned by peaks.peaks, which is given
        # fh as its output handle.
        list(
            peaks.peaks(prefix + ".fdr.bed.gz", -1 if use_fdr else -2,
                        threshold, seed, dist, fh, operator.le))
    n_regions = sum(1 for _ in ts.nopen(fregions))
    print >> sys.stderr, "wrote: %s (%i regions)" % (fregions, n_regions)
    if n_regions == 0:
        # nothing left for the remaining stages to work on
        sys.exit()

    with ts.nopen(prefix + ".regions-p.bed.gz", "w") as fh:
        N = 0
        fh.write("#chrom\tstart\tend\tmin_p\tn_probes\tz_p\tz_sidak_p\n")
        # use -2 for original, uncorrected p-values in slk.bed
        # (sim_p is unpacked but not written out)
        for region_line, slk_p, slk_sidak_p, sim_p in region_p.region_p(
                prefix + ".slk.bed.gz", prefix + ".regions.bed.gz", -2, step):
            fh.write("%s\t%.4g\t%.4g\n" % (region_line, slk_p, slk_sidak_p))
            fh.flush()
            N += int(slk_sidak_p < 0.05)
        print >>sys.stderr, "wrote: %s, (regions with corrected-p < 0.05: %i)" \
                % (fh.name, N)

    # fh still refers to the (closed) regions-p file; only its name is needed
    regions_bed = fh.name
    # header is only referenced by the commented-out check below
    header = ts.header(bed_files[0])
    #if all(h in header for h in ('t', 'start', 'end')):
    if region_filter_n is None: region_filter_n = 0
    with ts.nopen(prefix + ".regions-t.bed", "w") as fh:
        N = 0
        for i, toks in enumerate(
                filter.filter(bed_files[0], regions_bed, p_col_name=col_num)):
            # the first row is written as a "#"-prefixed header and is
            # neither filtered nor counted
            if i == 0: toks[0] = "#" + toks[0]
            else:
                if float(toks[6]) > region_filter_p: continue
                if int(toks[4]) < region_filter_n: continue
                #if region_filter_t and "/" in toks[7]:
                #    # t-pos/t-neg. if the lower one is > region_filter_t?
                #    vals = map(int, toks[7].split("/"))
                #    if min(vals) > region_filter_t: continue

                N += 1
            print >> fh, "\t".join(toks)
        print >>sys.stderr, ("wrote: %s, (regions with region-p "
                            "< %.3f and n-probes >= %i: %i)") \
                % (fh.name, region_filter_p, region_filter_n, N)

    try:
        from cpv import manhattan
        # fh.name is the regions-t.bed file written just above
        regions = manhattan.read_regions(fh.name)

        manhattan.manhattan(prefix + ".slk.bed.gz",
                            3,
                            prefix.rstrip(".") + ".manhattan.png",
                            False, ['#959899', '#484B4C'],
                            "",
                            False,
                            None,
                            regions=regions,
                            bonferonni=False)
    except ImportError:
        pass  # they dont have matplotlib

    if db is not None:
        from cruzdb import Genome
        g = Genome(db)
        # remember the regions-t.bed path before fh is rebound below
        lastf = fh.name
        with open(prefix + ".anno.%s.bed" % db, "w") as fh:
            # written before annotate() output so the file starts with '#'
            fh.write('#')
            # parallel annotation is requested only when more than 500 rows
            # were collected in spvals
            g.annotate(lastf, ("refGene", "cpgIslandExt"),
                       out=fh,
                       feature_strand=True,
                       parallel=len(spvals) > 500)
        print >> sys.stderr, "wrote: %s annotated with %s" % (fh.name, db)
"""
"testing" of bin queries and nearest queries
take a long time to run so not part of standard test suite
"""

import time
from cruzdb import Genome
from random import randrange, seed

#g = Genome('hg18', host='localhost', user='******')

#g.mirror(['refGene'], "sqlite:////tmp/u.db")


g = Genome('sqlite:////tmp/u.db')



# if we choose a huge distance all should have a distance of 0
#assert all(k.dist == 0  for k in  g.knearest("refGene", "chr1", 1234, 9915555, k=3))


print g.upstream("refGene", "chr1", 9444, 9555, k=6)

last = g.refGene.order_by(-g.refGene.table().c.txStart)[0]
print last
last.txStart = 1000 + last.txEnd
last.txEnd = last.txStart + 100
last.strand = "-"
print last
print g.upstream("refGene", last, k=6)
Exemplo n.º 33
0
import time
from toolshed import nopen
import os

anno_file = "data_c_constant_early.bed"
# sub-sample to get fewer rows.
list(
    nopen("|awk 'NR == 1 || NR % 4 == 0'" + (" %s > %s.some" %
                                             (anno_file, anno_file))))
anno_file += ".some"
nlines = sum(1 for _ in nopen(anno_file))

print "loc\tinstance\tparallel\ttime"
for parallel in (True, False):
    for name, args in (('local\tsqlite', ('sqlite:///hg18.db', )),
                       ('remote\tmysql', ('hg18', )),
                       ('local\tmysql', ('hg18', 'brentp', 'localhost'))):
        g = Genome(*args)

        out = "%s-%s.anno.txt" % (name.replace("\t", "-"), parallel)

        t0 = time.time()
        g.annotate(anno_file, ('refGene', ),
                   out=out,
                   feature_strand=True,
                   parallel=parallel)
        t1 = time.time()
        print "\t".join(map(str, (name, parallel, ("%.1f" % (t1 - t0)))))
        assert nlines == sum(1 for _ in nopen(out))
        os.unlink(out)
## Purpose:		Extract chrom/pos from rs #
##              dbsnp144, build 38
## =================================================

## ************* NOTE ******************************
## This script should be run in virtualenv with these installed:
# pip install six
# pip install cruzdb
# pip install sqlalchemy
# pip install mysql-python
## *************************************************

from cruzdb import Genome
import sys

fname = "variants_rs.csv"

# Both files are managed by `with` so they are closed (and the output is
# flushed) even if a lookup raises. The output is opened first so that, as
# before, 'chrom_pos.table' is created even when the input cannot be read.
with open('chrom_pos.table', 'w') as outfile, open(fname, "r") as infile:
    # Build the Genome once and reuse it; constructing it inside the loop
    # repeats its setup for every single rs id.
    genome = Genome('hg38')

    for line in infile:
        rs = line.rstrip('\r\n')
        # only lines that look like rs ids are looked up; all others are
        # ignored
        if not rs.startswith("rs"):
            continue

        var_info = genome.snp144.filter_by(name=rs).first()
        if var_info is None:
            # keep a marker line in the output for ids that were not found
            outfile.write(str("Caution:" + rs + "\n"))
            continue

        # chromosome without its "chr" prefix, and chromStart shifted by one
        chrom = str(var_info.chrom.split("chr")[1])
        position = str(var_info.chromStart + 1)
        outfile.write(chrom + "\t" + position + "\n")
Exemplo n.º 35
0
#!/usr/bin/env python2.7
from cruzdb import Genome

# This takes a list of gene symbols (described by the original author as genes
# on chromosome 20) and prints the transcript coordinates of each one.
g = Genome(db="hg19")

genes = [
    'AHCY', 'ARFGEF2', 'BMP2', 'DNAJC5', 'EDN3', 'GSS', 'GNAS1', 'JAG1',
    'PANK2', 'PRNP', 'tTG', 'SALL4', 'VAPB'
]

for gene in genes:
    # first refGene row whose name2 matches the symbol; symbols with no
    # match are skipped without any output
    gene_obj = g.refGene.filter_by(name2=gene).first()
    if gene_obj:
        # The output is meant to be one-based intervals:
        # http://gatkforums.broadinstitute.org/discussion/1204/what-input-files-does-the-gatk-accept-require
        # UCSC tables store a zero-based start and a one-based end, so only
        # the start needs the +1 shift to make the interval one-based.
        print("{0}:{1}-{2}".format(gene_obj.chrom.replace('chr', ''),
                                   gene_obj.txStart + 1, gene_obj.txEnd))
Exemplo n.º 36
0
"""
"testing" of bin queries and nearest queries
take a long time to run so not part of standard test suite
"""

import time
from cruzdb import Genome
from random import randrange, seed

#g = Genome('hg18', host='localhost', user='******')

#g.mirror(['refGene'], "sqlite:////tmp/u.db")

g = Genome('sqlite:////tmp/u.db')

# if we choose a huge distance all should have a distance of 0
#assert all(k.dist == 0  for k in  g.knearest("refGene", "chr1", 1234, 9915555, k=3))

print g.upstream("refGene", "chr1", 9444, 9555, k=6)

last = g.refGene.order_by(-g.refGene.table().c.txStart)[0]
print last
last.txStart = 1000 + last.txEnd
last.txEnd = last.txStart + 100
last.strand = "-"
print last
print g.upstream("refGene", last, k=6)

1 / 0

seed(1)
Exemplo n.º 37
0
#/usr/bin/python
import sys

from cruzdb import Genome

sys.path.insert(0, "/home/mokha/Documents/Krauthammer_Lab/PythonClasses")
from SVSv5 import Exon, Isoform, MultiIsoform, SpliceJunction, IsoformSJ

print "------------ Algorithm: 160919_Isoform_1.py ------------"
""" Reconstruct transcripts based on Splice Junctions """

#assign all splice junctions to specific gene: go through cruzdb & find end points for each gene --> assign
# g = Genome( 'sqlite:////tmp/hg19.db' )
g = Genome('sqlite:////tmp/hg19_v2.db')
Isoform.set_cruzdb(g)

#retrieve gene & print information on it based on
gene = g.refGene.filter_by(name2='BRAF').all()
# all_genes = g.refGene.filter_by( name2 = 'TTN' ).first()
# all_genes = g.refGene.filter_by( name2 = 'AGRN' ).all()
# all_genes = g.refGene.filter_by( name2 = 'AGRN' ).first()
# all_genes = g.refGene.filter_by( name2 = 'DIXDC1' ).all()

for each_isoform in gene:
    obj_iso = Isoform(each_isoform.name)

    #print name
    print obj_iso.isoform_id, ":", obj_iso.gene_sym

    print "obj_iso = ", obj_iso
Exemplo n.º 38
0
from cruzdb import Genome
from cruzdb.sequence import sequence

# mirror the neede tables from UCSC to a local sqlite db
local = Genome('hg19').mirror(('refGene', 'targetScanS'), 'sqlite:///hg19.mirna.db')

# connect to the newly created local sqlite database instance.
refseq_ids = []

# iterate over the coding in refGene
for gene in (rgene for rgene in local.refGene if rgene.is_coding):

    if None in gene.utr3: continue # skip genes with no UTR

    utr_start, utr_end = gene.utr3
    # query the targetScan miRNA table with efficient bin query 
    sites = local.bin_query('targetScanS', gene.chrom, utr_start, utr_end)

    # print BED file of genes whose 3'UTR contains a miR-96 target site
    # with a score > 85.
    if any("miR-96" in s.name and s.score > 85 for s in sites):
        refseq_ids.append(gene.name) # save the refSeq for later GO analysis

        # gene is a python object but its string representation is BED format
        # we also print out the UTR sequence.
        print gene, sequence('hg19', gene.chrom, utr_start, utr_end)

# open a webbrowser to show enrichment of the genes we've selected in DAVID
Genome.david_go(refseq_ids)
# NOTE(review): this is an excerpt -- vcf_ex, pd and np are defined before the
# visible code. vcf_ex is presumably a pandas DataFrame; confirm.
print vcf_ex.head()

# Rename columns to fix the syntax in chromosome number:
# the first column becomes 'CHROM', every other column name is kept.
names = vcf_ex.columns.values
new_names = ['CHROM']
new_names.extend(names[1:])
print "\n", new_names
vcf_ex.columns = new_names

# If QUAL > 0.5, sample passes
# (only the CHROM and POS columns of the passing rows are kept, as a copy)
vcf_ex_sub = vcf_ex.loc[vcf_ex.QUAL > 0.5, ['CHROM', 'POS']].copy()
print vcf_ex_sub.head()

# Get the Genome object from cruzdb
# connects to MySQL genome browser at UCSC
g = Genome('hg38')

# Convert table 'refGene' to pandas dataframe
# columns of interest 'chrom' (chrX, %s), 'txStart' (number, %s), 'txEnd' (number , %s)
print "Extracting reference genome table (HG38) from UCSC Genome Browser"
df = g.dataframe('refGene')
# make sure both coordinate columns are integers
df[['txStart', 'txEnd']] = df[['txStart', 'txEnd']].astype(int)

# one zero-initialised slot per row of vcf_ex_sub
genes = pd.Series(np.zeros(vcf_ex_sub.shape[0]))
#gene = hg19.bin_query('refGene', vcf_ex_sub.CHROM[1], vcf_ex_sub.POS[1], vcf_ex_sub.POS[1])
#print vcf_ex_sub.POS.iloc[0]


# Walk the passing rows by position; the rest of the loop body lies beyond
# the visible code.
for i in range(0, vcf_ex_sub.shape[0]):
     #genes[i] = df[[df.chrom == str(vcf_ex_sub.CHROM.iloc[i]) and df.txStart >= str(vcf_ex_sub.POS.iloc[i])]].bool() #and df.txStart >= str(vcf_ex_sub.POS.iloc[i]) and df.txEnd <= vcf_ex_sub.POS.iloc[i]].bool(),
    chrom = vcf_ex_sub['CHROM'].iloc[i]
Exemplo n.º 40
0
#/usr/bin/python
#Script: setup_cruzdb_databases.py
from cruzdb import Genome

# Tables to mirror for hg19.
arrTables_hg19 = [
    "refGene",
    "knownGene",
    "ensGene",
    "ccdsKgMap",
    "knownGeneMrna",
    "kgProtAlias",
    "knownToEnsembl",
    "knownToRefSeq",
    "wgEncodeGencodeBasicV19",
]

# Tables to mirror for hg38. Compared with hg19 this list leaves out
# "ensGene" (noted by the original author as not present in hg38) and
# "wgEncodeGencodeBasicV19".
arrTables_hg38 = [
    "refGene",
    "knownGene",
    "ccdsKgMap",
    "knownGeneMrna",
    "kgProtAlias",
    "knownToEnsembl",
    "knownToRefSeq",
]

# Destination databases (earlier runs used /tmp/hg19.db and /tmp/hg38.db).
db_hg19_path = "sqlite:////tmp/hg19_v2.db"
db_hg38_path = "sqlite:////tmp/hg38_v2.db"

# Mirror hg19 first, then hg38.
mirror_jobs = (
    ("hg19", arrTables_hg19, db_hg19_path),
    ("hg38", arrTables_hg38, db_hg38_path),
)
for build, tables, target in mirror_jobs:
    Genome(db=build).mirror(tables, target)