Exemplo n.º 1
0
import numpy as np
from sqlalchemy.sql import select
from htsint.database import db_connect, Taxon, Gene
from htsint.tools import read_matrix, read_de_results, print_rest_table_contents
from htsint.blast import BlastMapper

## Settings and inputs for the differential expression summary script.
#assembly = 'dn'
## NOTE(review): `threshold` is not read in the visible part of the script --
## presumably a significance cutoff applied further down; confirm.
threshold = 0.05
## NOTE(review): `os` and `csv` are used below but are not among the imports
## visible above -- confirm they are imported elsewhere in the file.
homeDir = os.path.join(os.path.expanduser("~"), "sequencing", "pieris")
transcript = 'isoforms'
featuresDir = os.path.join(homeDir, "features")

## load the differential expression results
## NOTE(review): `evalue` is not read in the visible part of the script --
## presumably a BLAST e-value cutoff used later; confirm.
evalue = 0.00001
deseqResultsPath = os.path.join(featuresDir, "deseq.csv")
deseqMatIds, deseqMatColumns, deseqMat = read_de_results(deseqResultsPath,
                                                         tool='DESeq')

## create a summary table and a csv file
## the output file name embeds the transcript type (de-summary-isoforms.csv)
outPath = os.path.join(featuresDir, 'de-summary-%s.csv' % (transcript))
## `fidout` stays open past this point; nothing in the visible part closes it
fidout = open(outPath, 'w')
writer = csv.writer(fidout)
## header row of the summary table
writer.writerow([
    'transcript ID', 'hitId', 'hitNcbiId', 'hitSpecies', 'e-value',
    'DESeq-pval', 'DESeq-adj-pval'
])

## load ref2gene
## the file object handed to csv.reader is not bound to a name, so it is
## never closed explicitly
reader = csv.reader(open("../gene2ref.tab", "r"), delimiter="\t")
ref2gene = {}
## rows are tab-separated and the second column is read as the gene id;
## `ref2gene` is not filled in the visible part of this loop
for linja in reader:
    geneId = linja[1]
Exemplo n.º 2
0
## fetch the taxon row(s) whose NCBI taxonomy id is 8364
s = select([Taxon.id, Taxon.ncbi_id, Taxon.name]).where(
    Taxon.ncbi_id.in_(['8364']))
_taxaQueries = conn.execute(s)
taxaQueries = _taxaQueries.fetchall()

## gene lookups, each keyed by the NCBI gene id converted to a string
gene2taxa = {}
gene2desc = {}
gene2sym = {}
for tquery in taxaQueries:
    ## all genes that belong to the current taxon
    s = select([Gene.taxa_id, Gene.ncbi_id, Gene.description, Gene.symbol],
               Gene.taxa_id == tquery['id'])
    _geneQueries = conn.execute(s)
    geneQueries = _geneQueries.fetchall()

    ## a later row with the same NCBI id overwrites an earlier one
    for r in geneQueries:
        gene2taxa[str(r['ncbi_id'])] = str(r['taxa_id'])
        gene2desc[str(r['ncbi_id'])] = str(r['description'])
        gene2sym[str(r['ncbi_id'])] = str(r['symbol'])

## load feature data
featuresDir = os.path.join(homeDir, "%s-trinity" % assembly, "features")

edgerResultsPath = os.path.join(
    featuresDir, "edger_%s_behavior_de.csv" % (transcript))
edgerIds, edgerColumns, edgerMat = read_de_results(
    edgerResultsPath, tool='edgeR')

deseqResultsPath = os.path.join(
    featuresDir, "deseq_%s_behavior_de.csv" % (transcript))
deseqIds, deseqColumns, deseqMat = read_de_results(
    deseqResultsPath, tool='DESeq')

dfeMatrixPath = os.path.join(
    featuresDir, "deseq_%s_behavior_de_samples.csv" % (transcript))
dfeIds, dfeColumns, dfeMat = read_matrix(dfeMatrixPath, mtype='float')

## load the blast map
## hits are collapsed to genes only when the analysis runs at the gene level
blastMap = get_blast_map(
    os.path.join("..", "..", "blast", "summary_blast_%s.csv" % assembly),
    taxaList=["8364"],
    asGenes=(transcript == 'genes'))

## setup filters for the transcripts
threshold = 0.1
Exemplo n.º 3
0
#!/usr/bin/python
"""

"""

import sys
import numpy as np
from htsint.tools import read_matrix,read_de_results,Heatmap

## load differential expression data
deseqIds, deseqColumns, deseqMat = read_de_results('deseq.csv', tool='DESeq')
dfeIds, dfeColumns, dfeMat = read_matrix('deseq-samples.csv', mtype='float')

## position of the adjusted p-value column; computed once and reused by both
## filters below
padjInd = np.where(deseqColumns == 'padj')[0]

## filter out nans
## print() with a single argument gives the same output on Python 2 and 3
print(deseqMat.shape)
size1 = deseqIds.shape[0]
filter1 = np.where(~np.isnan(deseqMat[:, padjInd]))[0]
deseqIds = deseqIds[filter1]
deseqMat = deseqMat[filter1, :]
## keep only the sample rows whose id survived the nan filter
mask = np.in1d(dfeIds, deseqIds)
dfeIds = dfeIds[mask]
dfeMat = dfeMat[mask, :]
print("... %s/%s transcripts pass nan filter" % (filter1.size, size1))

## filter for only the most significant transcripts (max 50)
## NOTE(review): this keeps the first 50 rows, in input order, whose padj is
## at or below the threshold; they are the 50 most significant only if the
## results are already sorted by padj -- confirm.
threshold = 0.5
size2 = deseqIds.shape[0]
filter2 = np.where(deseqMat[:, padjInd] <= threshold)[0][:50]
deseqIds = deseqIds[filter2]
    ## NOTE(review): these lines are the body of a loop whose header is not
    ## visible in this fragment; `tquery`, `conn` and the three gene2* dicts
    ## are defined outside it.
    ## select taxon id, NCBI id, description and symbol of every gene whose
    ## taxa_id equals the id of the current taxon row
    s = select([Gene.taxa_id, Gene.ncbi_id, Gene.description, Gene.symbol],
               Gene.taxa_id == tquery['id'])
    _geneQueries = conn.execute(s)
    geneQueries = _geneQueries.fetchall()
    ## index the fetched rows by NCBI gene id; keys and values are both
    ## converted to strings
    gene2taxa.update(
        dict([(str(r['ncbi_id']), str(r['taxa_id'])) for r in geneQueries]))
    gene2desc.update(
        dict([(str(r['ncbi_id']), str(r['description']))
              for r in geneQueries]))
    gene2sym.update(
        dict([(str(r['ncbi_id']), str(r['symbol'])) for r in geneQueries]))

## load feature data
featuresDir = os.path.join(homeDir, "%s-trinity" % assembly, "features")

## results stored in the `*_de.csv` files (suffix 1)
edgerResultsPath1 = os.path.join(featuresDir, "edger_%s_de.csv" % (transcript))
edgerIds1, edgerColumns1, edgerMat1 = read_de_results(edgerResultsPath1,
                                                      tool='edgeR')
deseqResultsPath1 = os.path.join(featuresDir, "deseq_%s_de.csv" % (transcript))
## read from deseqResultsPath1: the unsuffixed `deseqResultsPath` is not
## defined in this section, so using it would load the wrong file or fail
deseqIds1, deseqColumns1, deseqMat1 = read_de_results(deseqResultsPath1,
                                                      tool='DESeq')

## results stored in the `*_behavior_de.csv` files (suffix 2)
edgerResultsPath2 = os.path.join(featuresDir,
                                 "edger_%s_behavior_de.csv" % (transcript))
edgerIds2, edgerColumns2, edgerMat2 = read_de_results(edgerResultsPath2,
                                                      tool='edgeR')
deseqResultsPath2 = os.path.join(featuresDir,
                                 "deseq_%s_behavior_de.csv" % (transcript))
deseqIds2, deseqColumns2, deseqMat2 = read_de_results(deseqResultsPath2,
                                                      tool='DESeq')

#dfeMatrixPath = os.path.join(featuresDir,"deseq_%s_behavior_de_samples.csv"%(transcript))
#dfeIds,dfeColumns,dfeMat = read_matrix(dfeMatrixPath,mtype='float')
Exemplo n.º 5
0
#!/usr/bin/python
"""

"""

import sys
import numpy as np
from htsint.tools import read_matrix, read_de_results, Heatmap

## load differential expression data
deseqIds, deseqColumns, deseqMat = read_de_results('deseq.csv', tool='DESeq')
dfeIds, dfeColumns, dfeMat = read_matrix('deseq-samples.csv', mtype='float')

## position of the adjusted p-value column; computed once and reused by both
## filters below
padjInd = np.where(deseqColumns == 'padj')[0]

## filter out nans
## print() with a single argument gives the same output on Python 2 and 3
print(deseqMat.shape)
size1 = deseqIds.shape[0]
filter1 = np.where(~np.isnan(deseqMat[:, padjInd]))[0]
deseqIds = deseqIds[filter1]
deseqMat = deseqMat[filter1, :]
## keep only the sample rows whose id survived the nan filter
mask = np.in1d(dfeIds, deseqIds)
dfeIds = dfeIds[mask]
dfeMat = dfeMat[mask, :]
print("... %s/%s transcripts pass nan filter" % (filter1.size, size1))

## filter for only the most significant transcripts (max 50)
## NOTE(review): this keeps the first 50 rows, in input order, whose padj is
## at or below the threshold; they are the 50 most significant only if the
## results are already sorted by padj -- confirm.
threshold = 0.5
size2 = deseqIds.shape[0]
filter2 = np.where(deseqMat[:, padjInd] <= threshold)[0][:50]
deseqIds = deseqIds[filter2]
Exemplo n.º 6
0
def write_summary(name, aspect, transcript, assembly, geneset):
    """Write a CSV report for one gene set: transcripts, genes and GO terms.

    The report goes to ``../results/genesets/`` and is named
    ``<name>-<aspect>-<transcript>-<assembly>-<geneset>.csv`` with every
    ``gs-`` removed from *geneset*.  It contains one row per transcript of
    the gene set (transcript id, DESeq p-value, matched genes, inferred GO
    terms), a ``--------`` separator row, and then one row per GO term with
    how many of the gene set's transcripts carry it and the term's name,
    ordered by descending count.

    Relies on the module-level names ``session`` and ``GoTerm`` for the GO
    term name lookup.

    Args:
        name: label used in the GO-term pickle, gene set and report names.
        aspect: label used in the same file names.
        transcript: ``'genes'`` loads the BLAST summary with
            ``trinityGene=True``; any other value loads it with
            ``trinityGene=False``.  Also part of the file names.
        assembly: assembly label used in directory and file names.
        geneset: id of the gene set to report, as it appears in the first
            column of the ``.gmt`` file.

    Raises:
        Exception: if the gene set (``.gmt``) file does not exist.
        KeyError: if *geneset* is not in the ``.gmt`` file (and, assuming the
            BLAST map is a plain dict, if one of its transcripts has no
            entry in the map).
    """
    ## load the go dictionaries
    termsPath = os.path.join("..", "results",
                             "go-terms-%s-%s.pickle" % (name, aspect))
    ## NOTE(review): unpickling can execute arbitrary code -- only load
    ## pickles produced by this project
    with open(termsPath, 'r') as tmp:
        gene2go, go2gene = cPickle.load(tmp)

    ## load the blast map
    bm = BlastMapper()
    homeDir = os.path.join(os.path.expanduser("~"), "sequencing", "xenopus")
    summaryFile = os.path.join(homeDir, "%s-trinity" % (assembly),
                               'blast-%s-parsed_summary.csv' % assembly)
    bmap = bm.load_summary(summaryFile,
                           trinityGene=(transcript == 'genes'),
                           best=False,
                           taxaList=['8364', '8355', '9606'],
                           evalue=0.0001)

    ## get gene level differential expression results
    featuresDir = os.path.join(homeDir, "%s-trinity" % assembly, "features")
    deseqResultsPath = os.path.join(featuresDir,
                                    "deseq_%s_de.csv" % (transcript))
    deseqIds, deseqColumns, deseqMat = read_de_results(deseqResultsPath,
                                                       tool='DESeq')
    pvalInd = np.where(deseqColumns == 'pvalue')[0]

    ## input/output
    genesetSummaryFile = os.path.join(
        "..", "results", "genesets", "%s-%s-%s-%s-%s.csv" %
        (name, aspect, transcript, assembly, re.sub("gs-", "", geneset)))
    genesetFile = os.path.join(
        "..", "results",
        "%s-%s-%s-%s.gmt" % (name, aspect, assembly, transcript))

    if not os.path.exists(genesetFile):
        raise Exception("cannot find gene set file")

    ## each line is tab separated: gene set id, one skipped column, members
    allGenesets = {}
    with open(genesetFile, 'r') as fid:
        for linja in fid:
            fields = [re.sub(r"\s+", "", field) for field in linja.split("\t")]
            allGenesets[fields[0]] = fields[2:]

    gsTranscripts = allGenesets[geneset]

    ## map back to gene space: organize the hits of each transcript by
    ## species (hit[2]), collecting hit[1] as the gene id
    transcript2genes = {}
    for t in gsTranscripts:
        hitsBySpecies = {}
        for hit in bmap[t]:
            hitsBySpecies.setdefault(hit[2], []).append(hit[1])
        transcript2genes[t] = hitsBySpecies

    ## get inferred go terms for each transcript (unique and sorted)
    transcript2go = {}
    for t, hitsBySpecies in transcript2genes.items():
        inferred = set()
        for genes in hitsBySpecies.values():
            for gene in genes:
                if gene in gene2go:
                    inferred.update(gene2go[gene])
        transcript2go[t] = sorted(inferred)

    ## write to file; the with-block guarantees the report is flushed/closed
    with open(genesetSummaryFile, 'w') as fidout:
        writer = csv.writer(fidout)
        writer.writerow(["transcript", "p-value", "genes", "go-terms"])
        allTerms = []

        for ts in gsTranscripts:
            pvalue = deseqMat[np.where(deseqIds == ts)[0], pvalInd][0]

            ## report at most the first two genes per species, without
            ## duplicates; sorted so the output does not depend on set order
            reportedGenes = []
            for genes in transcript2genes[ts].values():
                reportedGenes.extend(genes[:2])
            reportedGenes = sorted(set(reportedGenes))

            if len(reportedGenes) == 1:
                genes = reportedGenes[0]
            else:
                ## also covers a transcript without hits (empty cell)
                genes = ";".join(reportedGenes)

            terms = transcript2go[ts]
            allTerms.extend(terms)

            if not terms:
                termField = "None"
            elif len(terms) == 1:
                termField = terms[0]
            else:
                termField = ";".join(terms)

            writer.writerow([ts, pvalue, genes, termField])

        writer.writerow(["--------"])

        ## write a summary of the go terms
        allTerms = np.array(list(set(allTerms)))
        allTermCounts = np.zeros(allTerms.size, )

        for t, term in enumerate(allTerms):
            for ts in gsTranscripts:
                allTermCounts[t] += transcript2go[ts].count(term)

        ## most frequent terms first
        order = np.argsort(allTermCounts)[::-1]
        sortedTerms = allTerms[order]
        sortedCounts = allTermCounts[order]
        writer.writerow(["ID", "Counts", "Description"])
        for term, count in zip(sortedTerms, sortedCounts):
            goTerm = session.query(GoTerm).filter(GoTerm.go_id == term).first()
            ## first() gives None when the id is not in the database; report
            ## that as "None" instead of failing halfway through the file
            desc = goTerm.name if goTerm is not None else "None"
            writer.writerow([term, count, desc])