Exemplo n.º 1
0
    def test_dataframe(self):
        """Check ``Genome.dataframe`` with a table name and with a query.

        Two cases are exercised against the 'hg18' genome:

        * passing the table name 'cpgIslandExt' must yield as many rows as
          ``g.cpgIslandExt.count()`` reports;
        * passing a filtered query capped with ``limit(10)`` must yield
          exactly 10 rows.
        """
        g = Genome('hg18')

        # Whole table requested by name: row count must match count().
        kg = g.dataframe('cpgIslandExt')
        # assertEqual instead of the deprecated assert_ alias: it reports
        # both values on failure rather than just "False is not true".
        self.assertEqual(kg.shape[0], g.cpgIslandExt.count())

        # Query object: rows with chromStart below 300000, capped at 10.
        q = g.cpgIslandExt.filter(g.cpgIslandExt.chromStart < 300000).limit(10)

        df = g.dataframe(q)
        self.assertEqual(df.shape[0], 10)
Exemplo n.º 2
0
    def test_dataframe(self):
        """Check ``Genome.dataframe`` with a table name and with a query.

        Two cases are exercised against the 'hg18' genome:

        * passing the table name 'cpgIslandExt' must yield as many rows as
          ``g.cpgIslandExt.count()`` reports;
        * passing a filtered query capped with ``limit(10)`` must yield
          exactly 10 rows.
        """
        g = Genome('hg18')

        # Whole table requested by name: row count must match count().
        kg = g.dataframe('cpgIslandExt')
        # assertEqual instead of the deprecated assert_ alias: it reports
        # both values on failure rather than just "False is not true".
        self.assertEqual(kg.shape[0], g.cpgIslandExt.count())

        # Query object: rows with chromStart below 300000, capped at 10.
        q = g.cpgIslandExt.filter(g.cpgIslandExt.chromStart < 300000).limit(10)

        df = g.dataframe(q)
        self.assertEqual(df.shape[0], 10)
# NOTE: this script uses Python 2 print statements.
# Append every name except the first from `names` to `new_names`, then use
# the result as the column labels of vcf_ex.
# NOTE(review): `names`, `new_names` and `vcf_ex` are defined before this
# excerpt -- presumably the first name is replaced by a cleaned-up header
# field; confirm against the preceding code.
new_names.extend(names[1:])
print "\n", new_names
vcf_ex.columns = new_names

# Keep only rows with QUAL > 0.5 and only the CHROM and POS columns.
# .copy() makes the subset an independent frame rather than a view of vcf_ex.
vcf_ex_sub = vcf_ex.loc[vcf_ex.QUAL > 0.5, ['CHROM', 'POS']].copy()
print vcf_ex_sub.head()

# Get the Genome object from cruzdb
# (connects to the MySQL genome browser at UCSC).
g = Genome('hg38')

# Load the 'refGene' table into a pandas DataFrame.
# Columns of interest: 'chrom' (e.g. chrX), 'txStart' and 'txEnd' (numbers).
print "Extracting reference genome table (HG38) from UCSC Genome Browser"
df = g.dataframe('refGene')
# Cast the transcript bounds to int so they compare numerically with POS.
# NOTE(review): assumes POS is numeric as well -- TODO confirm.
df[['txStart', 'txEnd']] = df[['txStart', 'txEnd']].astype(int)

# One zero-initialised slot per retained variant.
genes = pd.Series(np.zeros(vcf_ex_sub.shape[0]))
# Disabled code:
#gene = hg19.bin_query('refGene', vcf_ex_sub.CHROM[1], vcf_ex_sub.POS[1], vcf_ex_sub.POS[1])
#print vcf_ex_sub.POS.iloc[0]


# For each retained variant, narrow the refGene table to the rows on the same
# chromosome whose txStart is at or before the variant position.
# NOTE(review): nothing is written to `genes` in the lines visible here.
for i in range(0, vcf_ex_sub.shape[0]):
    # Disabled code:
    #genes[i] = df[[df.chrom == str(vcf_ex_sub.CHROM.iloc[i]) and df.txStart >= str(vcf_ex_sub.POS.iloc[i])]].bool() #and df.txStart >= str(vcf_ex_sub.POS.iloc[i]) and df.txEnd <= vcf_ex_sub.POS.iloc[i]].bool(),
    chrom = vcf_ex_sub['CHROM'].iloc[i]
    location = vcf_ex_sub['POS'].iloc[i]
    #print i, chrom, location
    # Work on a fresh copy of the full table for every variant.
    tmp = df.copy()
    # Same chromosome only.
    tmp = tmp[tmp.chrom == chrom]
    # Transcript starts at or before the variant position.
    tmp = tmp[tmp['txStart'] <= location]
# NOTE: this script uses Python 2 print statements.
# Append every name except the first from `names` to `new_names`, then use
# the result as the column labels of vcf_ex.
# NOTE(review): `names`, `new_names` and `vcf_ex` are defined before this
# excerpt -- presumably the first name is replaced by a cleaned-up header
# field; confirm against the preceding code.
new_names.extend(names[1:])
print "\n", new_names
vcf_ex.columns = new_names

# Keep only rows with QUAL > 0.5 and only the CHROM and POS columns.
# .copy() makes the subset an independent frame rather than a view of vcf_ex.
vcf_ex_sub = vcf_ex.loc[vcf_ex.QUAL > 0.5, ['CHROM', 'POS']].copy()
print vcf_ex_sub.head()

# Get the Genome object from cruzdb
# (connects to the MySQL genome browser at UCSC).
g = Genome('hg38')

# Load the 'refGene' table into a pandas DataFrame.
# Columns of interest: 'chrom' (e.g. chrX), 'txStart' and 'txEnd' (numbers).
print "Extracting reference genome table (HG38) from UCSC Genome Browser"
df = g.dataframe('refGene')
# Cast the transcript bounds to int so they compare numerically with POS.
# NOTE(review): assumes POS is numeric as well -- TODO confirm.
df[['txStart', 'txEnd']] = df[['txStart', 'txEnd']].astype(int)

# One zero-initialised slot per retained variant.
genes = pd.Series(np.zeros(vcf_ex_sub.shape[0]))
# Disabled code:
#gene = hg19.bin_query('refGene', vcf_ex_sub.CHROM[1], vcf_ex_sub.POS[1], vcf_ex_sub.POS[1])
#print vcf_ex_sub.POS.iloc[0]

# For each retained variant, narrow the refGene table to the rows on the same
# chromosome whose [txStart, txEnd] interval contains the variant position.
# NOTE(review): nothing is written to `genes` in the lines visible here.
for i in range(0, vcf_ex_sub.shape[0]):
    # Disabled code:
    #genes[i] = df[[df.chrom == str(vcf_ex_sub.CHROM.iloc[i]) and df.txStart >= str(vcf_ex_sub.POS.iloc[i])]].bool() #and df.txStart >= str(vcf_ex_sub.POS.iloc[i]) and df.txEnd <= vcf_ex_sub.POS.iloc[i]].bool(),
    chrom = vcf_ex_sub['CHROM'].iloc[i]
    location = vcf_ex_sub['POS'].iloc[i]
    #print i, chrom, location
    # Work on a fresh copy of the full table for every variant.
    tmp = df.copy()
    # Same chromosome only.
    tmp = tmp[tmp.chrom == chrom]
    # Transcript starts at or before the variant position...
    tmp = tmp[tmp['txStart'] <= location]
    # ...and ends at or after it.
    tmp = tmp[tmp['txEnd'] >= location]