def setUp(self):
    """
    locate the parsed blast results that sit next to this file
    and create the BlastMapper instance used by the tests
    """

    testDir = os.path.dirname(__file__)
    self.parsedFile = os.path.join(testDir, "blast-parsed.csv")
    self.bm = BlastMapper()
class BlastMapperTest(unittest.TestCase):
    """
    Exercise BlastMapper on the parsed blast results stored next to this
    file: create the summary file, then load it with different options
    """

    def setUp(self):
        """
        locate the parsed blast results, derive the summary file path
        and create the BlastMapper instance used by the tests
        """

        self.parsedFile = os.path.join(os.path.dirname(__file__), "blast-parsed.csv")

        ## raw string: "\." in a plain literal is an invalid escape sequence.
        ## The pattern itself is unchanged so the derived name stays the same.
        self.summaryFile = re.sub(r"\.csv", "", self.parsedFile) + "_summary.csv"
        self.bm = BlastMapper()

    def test01Summarize(self):
        """
        test the summarize function
        """

        ## start from a clean slate so the assertion checks this run
        if os.path.exists(self.summaryFile):
            os.remove(self.summaryFile)

        self.bm.create_summarized(self.parsedFile, uniprot=True)
        self.assertTrue(os.path.exists(self.summaryFile))

    def test02Something(self):
        """
        read in the results summary
        """

        ## do not rely on test01Summarize having run first
        if not os.path.exists(self.summaryFile):
            self.bm.create_summarized(self.parsedFile, uniprot=True)

        ## best hit per isoform
        bmap = self.bm.load_summary(self.summaryFile, taxaList=["10090"])
        self.assertEqual(bmap['GG11117|c2_g1_i1'][0], 'INT1_MOUSE')
        self.assertEqual(bmap['GG11117|c2_g1_i1'][1], '68510')

        ## best hit per trinity gene
        bmap = self.bm.load_summary(self.summaryFile, taxaList=["10090"], trinityGene=True)
        self.assertEqual(bmap['GG11117|c2_g1'][0], 'INT1_MOUSE')

        ## all hits per trinity gene (each key maps to a list of hits)
        bmap = self.bm.load_summary(self.summaryFile, taxaList=["10090"], trinityGene=True, best=False)
        self.assertEqual(bmap['GG11117|c2_g1'][0][0], 'INT1_MOUSE')
        self.assertEqual(bmap['GG11117|c2_g1'][0][4], 0.0)
## collect taxon, description and symbol for every gene of the queried taxa
for tquery in taxaQueries:
    s = select(
        [Gene.taxa_id, Gene.ncbi_id, Gene.description, Gene.symbol],
        Gene.taxa_id == tquery['id'])
    _geneQueries = conn.execute(s)
    geneQueries = _geneQueries.fetchall()

    ## one pass over the rows fills the three dictionaries (keys are str ncbi ids)
    for row in geneQueries:
        ncbiId = str(row['ncbi_id'])
        gene2taxa[ncbiId] = str(row['taxa_id'])
        gene2desc[ncbiId] = str(row['description'])
        gene2sym[ncbiId] = str(row['symbol'])

## load the blast map
bm = BlastMapper()
trinityDir = os.path.join(homeDir, "dn-trinity")
summaryFile1 = os.path.join(trinityDir, 'blast-dn-parsed_summary.csv')
summaryFile2 = os.path.join(trinityDir, 'blast-dm-parsed_summary.csv')
summaryFile3 = os.path.join(trinityDir, "blast-mc-parsed_summary.csv")
summaryFile4 = os.path.join(trinityDir, 'blast-dp-parsed_summary.csv')

## best hit per isoform for each summary file
bmapSP, bmapDM, bmapMC, bmapDP = [
    bm.load_summary(summaryFile, trinityGene=False, best=True)
    for summaryFile in (summaryFile1, summaryFile2, summaryFile3, summaryFile4)
]

## prepare supplement output
Take the parsed blast results and create a summary file to be read by BlastMapper """ import os, sys, csv, re, getopt, time from htsint.blast import BlastMapper homeDir = os.path.join(os.path.expanduser("~"), "sequencing", "pieris") parsedFilePath = os.path.realpath( os.path.join(homeDir, "dn-trinity", "blast-dn-parsed.csv")) summaryFile1 = os.path.join(homeDir, "blast", "blast-up-parsed_summary.csv") summaryFile2 = os.path.join(homeDir, "blast", 'blast-dm-parsed_summary.csv') summaryFile3 = os.path.join(homeDir, "blast", 'blast-dp-parsed_summary.csv') bm = BlastMapper() ## load the gene and isoform maps bmapSP = bm.load_summary(summaryFile1, trinityGene=False, best=True) bmapDM = bm.load_summary(summaryFile2, trinityGene=False, best=True) bmapDP = bm.load_summary(summaryFile3, trinityGene=False, best=True) print("-----------") print("SwissProt - isoforms") bm.print_summary(bmapSP) print("D. melanogaster - isoforms") bm.print_summary(bmapDM) print("Danaus plexippus - isoforms") bm.print_summary(bmapDP) bm.make_taxa_pie_chart_and_table(
def write(self,blastMap=None, transcriptMin=9, transcriptMax=1000,outFile="genesets.gmt"):
    """
    Write the clusters in self.allClusters to a tab-delimited gene set file

    blastMap: BlastMap returned after loading summary file in BlastMapper
              when provided the genes of each cluster are replaced by their
              mapped transcripts and a *.csv file with the gene transcript
              mapping is created alongside outFile
    transcriptMin: minimum size for a gene set
    transcriptMax: maximum size for a gene set
    outFile: specifies the output file path (*.gmt)

    gene sets with a size outside [transcriptMin,transcriptMax] are not
    written and the genes of their cluster count as failing the threshold
    """

    print("---------------------")
    if self.gene2go:
        print('There are %s genes with at least one annotation'%(len(self.gene2go.keys())))
    print('There are %s genes in the labels file'%(len(self.genes)))

    ## the BLAST statistics only exist when a map was provided
    gene2transcript = {}
    if blastMap:
        bm = BlastMapper()
        bmGenes = bm.print_summary(blastMap)
        gene2transcript = bm.get_gene_dict(blastMap)

        if self.gene2go:
            usableGenes = list(set(bmGenes).intersection(set(self.gene2go.keys())))
        else:
            usableGenes = bmGenes

        print('There are %s genes with at least one BLAST hit'%(len(bmGenes)))
        print('There are %s genes that have both a BLAST hit and an annotation'%(len(usableGenes)))

    ## prepare outfiles
    outFileMap = None
    if blastMap:
        outFileMap = re.sub(r"\.gmt",".csv",outFile)
        ## never let the mapping file overwrite the gene set file
        if outFileMap == outFile:
            outFileMap = outFile + ".csv"

    versionPattern = re.compile(r"\.[0-9]$")
    failedThreshold = 0
    fidSets = open(outFile,'w')
    fidMap = None
    try:
        writer = csv.writer(fidSets,delimiter="\t")
        writerMap = None
        if outFileMap:
            fidMap = open(outFileMap,'w')
            writerMap = csv.writer(fidMap)
            writerMap.writerow(["gene_set","gene_id","mapped_transcripts"])

        ## save gene sets to file
        for _k in self.allClusters:
            clusterInds = np.where(self.labels==_k)[0]
            clusterGenes = self.genes[clusterInds]
            gsName = "gs-"+str(_k)
            if self.gene2go:
                description = self.get_description(clusterGenes)
            else:
                description = "kegg pathway"

            ## map the genes
            if blastMap:
                mapped = set([])
                for gene in clusterGenes:
                    if gene not in gene2transcript:
                        continue

                    ## remove non-unique and versioned transcripts
                    geneTranscripts = list(set([versionPattern.sub("",g) for g in gene2transcript[gene]]))
                    writerMap.writerow([gsName,gene,";".join(geneTranscripts)])
                    mapped.update(geneTranscripts)
                mapped = list(mapped)
            else:
                ## a numpy array cannot be concatenated onto the row list below
                mapped = clusterGenes.tolist()

            if transcriptMin <= len(mapped) <= transcriptMax:
                writer.writerow([gsName,description] + mapped)
            else:
                failedThreshold += clusterGenes.size
    finally:
        ## close explicitly so that the rows are flushed to disk
        fidSets.close()
        if fidMap is not None:
            fidMap.close()

    print("-----------------")
    print('Total clusters: %s '%self.allClusters.size)

    ## report the accepted genes as a percentage (guard against an empty labels file)
    totalGenes = self.genes.size
    passedGenes = totalGenes - failedThreshold
    if totalGenes:
        percentAccepted = 100.0 * passedGenes / totalGenes
    else:
        percentAccepted = 0.0
    print("genes pass threshold %s/%s (%s%%)"%(passedGenes,totalGenes,round(percentAccepted,2)))
## scan the spectral clustering parameters
scps = SpectralClusterParamSearch(geneDistancePath, dtype='distance')
scps.run(chunks=15)

## plot the parameter search (skipped when the figure is already on disk)
psFigureFile = os.path.join(homeDir, "param-scan-%s.png" % _aspect)
if not os.path.exists(psFigureFile):
    scr = SpectralClusterResults(silvalFile, clustersFile)
    scr.plot(figName=psFigureFile)

## run spectral clustering (skipped when the labels were already saved)
k = 20
sigma = 0.08
labelsPath = os.path.join(homeDir, "sc-labels-%s.csv" % _aspect)
if not os.path.exists(labelsPath):
    sc = SpectralCluster(geneDistancePath, dtype='distance')
    sc.run(k, sk=None, sigma=sigma, verbose=True)
    sc.save(labelsPath=labelsPath)

## Save gene sets
bm = BlastMapper()
## NOTE(review): this path is relative to the working directory, unlike the
## homeDir based paths used for the other files -- confirm it is intended
bmap = bm.load_summary(
    'blast-parsed-summary.csv',
    best=False,
    taxaList=['8355', '8364'])
transcriptMin, transcriptMax = 9, 1000
gsFile = os.path.join(homeDir, "%s.gmt" % _aspect)
if not os.path.exists(gsFile):
    gsc = GeneSetCollection(labelsPath, gene2go)
    gsc.write(
        blastMap=bmap,
        transcriptMin=transcriptMin,
        transcriptMax=transcriptMax,
        outFile=gsFile)

print("process complete.")
taxaList = ['13037'] uniprot = False elif db == 'dm': species = 'Drosophila melanogaster' taxaList = ['7227'] uniprot = False else: species = None taxaList = [] uniprot = True homeDir = os.path.join(os.path.expanduser("~"), "sequencing", "pieris", "blast") parsedFilePath = os.path.realpath( os.path.join(homeDir, "blast-%s-parsed.csv" % (db))) bm = BlastMapper() ## read in the gene2ensemble file if db == 'dm': fid = open(os.path.join(homeDir, 'gene2ensembl'), 'r') reader = csv.reader(fid, delimiter="\t") header = reader.next() id2gene = {} for linja in reader: if linja[0] != taxaList[0]: continue id2gene[linja[4]] = linja[1] elif db == 'dp': transcript2uniprot = {} fid = open( os.path.join(homeDir, 'Danaus_plexippus.DanPle_1.0.28.uniprot.tsv'),
def write(self,
          blastMap=None,
          transcriptMin=9,
          transcriptMax=1000,
          outFile="genesets.gmt"):
    """
    Write the clusters in self.allClusters to a tab-delimited gene set file

    blastMap: BlastMap returned after loading summary file in BlastMapper
              when provided the genes of each cluster are replaced by their
              mapped transcripts and a *.csv file with the gene transcript
              mapping is created alongside outFile
    transcriptMin: minimum size for a gene set
    transcriptMax: maximum size for a gene set
    outFile: specifies the output file path (*.gmt)

    gene sets with a size outside [transcriptMin, transcriptMax] are not
    written and the genes of their cluster count as failing the threshold
    """

    print("---------------------")
    if self.gene2go:
        print('There are %s genes with at least one annotation' %
              (len(self.gene2go.keys())))
    print('There are %s genes in the labels file' % (len(self.genes)))

    ## the BLAST statistics only exist when a map was provided
    gene2transcript = {}
    if blastMap:
        bm = BlastMapper()
        bmGenes = bm.print_summary(blastMap)
        gene2transcript = bm.get_gene_dict(blastMap)

        if self.gene2go:
            usableGenes = list(
                set(bmGenes).intersection(set(self.gene2go.keys())))
        else:
            usableGenes = bmGenes

        print('There are %s genes with at least one BLAST hit' %
              (len(bmGenes)))
        print(
            'There are %s genes that have both a BLAST hit and an annotation'
            % (len(usableGenes)))

    ## prepare outfiles
    outFileMap = None
    if blastMap:
        outFileMap = re.sub(r"\.gmt", ".csv", outFile)
        ## never let the mapping file overwrite the gene set file
        if outFileMap == outFile:
            outFileMap = outFile + ".csv"

    versionPattern = re.compile(r"\.[0-9]$")
    failedThreshold = 0
    fidSets = open(outFile, 'w')
    fidMap = None
    try:
        writer = csv.writer(fidSets, delimiter="\t")
        writerMap = None
        if outFileMap:
            fidMap = open(outFileMap, 'w')
            writerMap = csv.writer(fidMap)
            writerMap.writerow(["gene_set", "gene_id", "mapped_transcripts"])

        ## save gene sets to file
        for _k in self.allClusters:
            clusterInds = np.where(self.labels == _k)[0]
            clusterGenes = self.genes[clusterInds]
            gsName = "gs-" + str(_k)
            if self.gene2go:
                description = self.get_description(clusterGenes)
            else:
                description = "kegg pathway"

            ## map the genes
            if blastMap:
                mapped = set([])
                for gene in clusterGenes:
                    if gene not in gene2transcript:
                        continue

                    ## remove non-unique and versioned transcripts
                    geneTranscripts = list(
                        set([
                            versionPattern.sub("", g)
                            for g in gene2transcript[gene]
                        ]))
                    writerMap.writerow(
                        [gsName, gene, ";".join(geneTranscripts)])
                    mapped.update(geneTranscripts)
                mapped = list(mapped)
            else:
                mapped = clusterGenes

            ## a numpy array cannot be concatenated onto the row list below
            if isinstance(mapped, np.ndarray):
                mapped = mapped.tolist()

            if transcriptMin <= len(mapped) <= transcriptMax:
                writer.writerow([gsName, description] + mapped)
            else:
                failedThreshold += clusterGenes.size
    finally:
        ## close explicitly so that the rows are flushed to disk
        fidSets.close()
        if fidMap is not None:
            fidMap.close()

    print("-----------------")
    print('Total clusters: %s ' % self.allClusters.size)

    ## report the accepted genes as a percentage (guard against an empty labels file)
    totalGenes = self.genes.size
    passedGenes = totalGenes - failedThreshold
    if totalGenes:
        percentAccepted = 100.0 * passedGenes / totalGenes
    else:
        percentAccepted = 0.0
    print("genes pass threshold %s/%s (%s%%)" %
          (passedGenes, totalGenes, round(percentAccepted, 2)))
def write_summary(name, aspect, transcript, assembly, geneset):
    """
    Write a csv summary of one gene set: a row per transcript with its
    DESeq p-value, the genes it hits and the GO terms inferred from those
    genes, followed by a table of GO term counts over the whole gene set

    name, aspect: used to build the go-terms pickle and gene set file names
    transcript: 'genes' loads the blast map at the trinity gene level, any
                other value loads it at the isoform level. It also selects
                the deseq_<transcript>_de.csv results file
    assembly: names the <assembly>-trinity directory and blast summary file
    geneset: key of the gene set (first column of the *.gmt file)

    An Exception is raised when the gene set file does not exist
    """

    ## load the go dictionaries
    termsPath = os.path.join("..", "results",
                             "go-terms-%s-%s.pickle" % (name, aspect))
    with open(termsPath, 'r') as tmp:
        gene2go, go2gene = cPickle.load(tmp)

    ## load the blast map (gene or isoform level depending on 'transcript')
    bm = BlastMapper()
    homeDir = os.path.join(os.path.expanduser("~"), "sequencing", "xenopus")
    summaryFile = os.path.join(homeDir, "%s-trinity" % (assembly),
                               'blast-%s-parsed_summary.csv' % assembly)
    bmap = bm.load_summary(summaryFile,
                           trinityGene=(transcript == 'genes'),
                           best=False,
                           taxaList=['8364', '8355', '9606'],
                           evalue=0.0001)

    ## get gene level differential exp results
    featuresDir = os.path.join(homeDir, "%s-trinity" % assembly, "features")
    deseqResultsPath = os.path.join(featuresDir,
                                    "deseq_%s_de.csv" % (transcript))
    deseqIds, deseqColumns, deseqMat = read_de_results(deseqResultsPath,
                                                       tool='DESeq')
    pvalInd = np.where(deseqColumns == 'pvalue')[0]

    ## input/output
    genesetSummaryFile = os.path.join(
        "..", "results", "genesets", "%s-%s-%s-%s-%s.csv" %
        (name, aspect, transcript, assembly, re.sub("gs-", "", geneset)))
    genesetFile = os.path.join(
        "..", "results",
        "%s-%s-%s-%s.gmt" % (name, aspect, assembly, transcript))

    if not os.path.exists(genesetFile):
        raise Exception("cannot find gene set file")

    ## read the gene sets (name, description, then the members)
    allGenesets = {}
    with open(genesetFile, 'r') as fid:
        for linja in fid:
            fields = [re.sub(r"\s+", "", field) for field in linja.split("\t")]
            allGenesets[fields[0]] = fields[2:]

    gsTranscripts = allGenesets[geneset]

    ## map back to gene space: organize the hits of each transcript by species
    transcript2genes = {}
    for t in gsTranscripts:
        hitsBySpecies = {}
        for hit in bmap[t]:
            hitsBySpecies.setdefault(hit[2], []).append(hit[1])
        transcript2genes[t] = hitsBySpecies

    ## get inferred go terms for each transcript (unique and sorted)
    transcript2go = {}
    for t, hitsBySpecies in transcript2genes.items():
        inferred = set()
        for genes in hitsBySpecies.values():
            for gene in genes:
                if gene in gene2go:
                    inferred.update(gene2go[gene])
        transcript2go[t] = sorted(inferred)

    ## write to file (the with block makes sure the rows are flushed)
    with open(genesetSummaryFile, 'w') as fidOut:
        writer = csv.writer(fidOut)
        writer.writerow(["transcript", "p-value", "genes", "go-terms"])
        allTerms = []

        for ts in gsTranscripts:
            pvalue = deseqMat[np.where(deseqIds == ts)[0], pvalInd][0]

            ## report at most two genes per species
            reportedGenes = []
            for taxaGenes in transcript2genes[ts].values():
                reportedGenes.extend(taxaGenes[:2])
            reportedGenes = list(set(reportedGenes))

            if len(reportedGenes) > 1:
                genes = ";".join(reportedGenes)
            elif reportedGenes:
                genes = reportedGenes[0]
            else:
                ## a transcript without hits used to raise an IndexError
                genes = "None"

            terms = transcript2go[ts]
            if terms:
                allTerms.extend(terms)

            if not terms:
                terms = "None"
            elif len(terms) > 1:
                terms = ";".join(terms)
            else:
                terms = terms[0]

            writer.writerow([ts, pvalue, genes, terms])

        writer.writerow(["--------"])

        ## write a summary of the go terms: for every term count the
        ## transcripts (with multiplicity) that carry it
        allTerms = np.array(list(set(allTerms)))
        allTermCounts = np.zeros(allTerms.size, )
        termSets = dict((ts, set(transcript2go[ts])) for ts in gsTranscripts)
        for t, term in enumerate(allTerms):
            for ts in gsTranscripts:
                if term in termSets[ts]:
                    allTermCounts[t] += 1

        ## most frequent terms first
        order = np.argsort(allTermCounts)[::-1]
        sortedTerms = allTerms[order]
        sortedCounts = allTermCounts[order]
        writer.writerow(["ID", "Counts", "Description"])
        for t, term in enumerate(sortedTerms):
            goTerm = session.query(GoTerm).filter(GoTerm.go_id == term).first()
            ## first() yields None when the term is missing from the database
            desc = goTerm.name if goTerm is not None else "None"
            writer.writerow([term, sortedCounts[t], desc])
## plot the parameter search unless the figure is already on disk
if not os.path.exists(psFigureFile):
    scr = SpectralClusterResults(silvalFile, clustersFile)
    scr.plot(figName=psFigureFile)

## run spectral clustering (skipped when the labels were already saved)
k = 20
sigma = 0.08
labelsPath = os.path.join(homeDir, "sc-labels-%s.csv" % _aspect)
if not os.path.exists(labelsPath):
    sc = SpectralCluster(geneDistancePath, dtype='distance')
    sc.run(k, sk=None, sigma=sigma, verbose=True)
    sc.save(labelsPath=labelsPath)

## Save gene sets
bm = BlastMapper()
## NOTE(review): this path is relative to the working directory, unlike the
## homeDir based paths used for the other files -- confirm it is intended
bmap = bm.load_summary(
    'blast-parsed-summary.csv',
    best=False,
    taxaList=['8355', '8364'],
)
transcriptMin, transcriptMax = 9, 1000
gsFile = os.path.join(homeDir, "%s.gmt" % _aspect)
if not os.path.exists(gsFile):
    gsc = GeneSetCollection(labelsPath, gene2go)
    gsc.write(
        blastMap=bmap,
        transcriptMin=transcriptMin,
        transcriptMax=transcriptMax,
        outFile=gsFile,
    )

print("process complete.")
## pick up the source from the command line options
for o, a in optlist:
    if o == '-s':
        source = a

## translate the source into the directory that holds its blast results
homeDir = os.path.join(os.path.expanduser("~"), "sequencing", "xenopus")
if source == 'ref':
    sourceDir = "reference"
elif source in ('dn', 'gg'):
    sourceDir = "%s-trinity" % source
else:
    raise Exception("Bad source")

resultsDir = os.path.join(homeDir, sourceDir)
summaryFile1 = os.path.join(resultsDir, "blast-%s-parsed_summary.csv" % source)
summaryFile2 = os.path.join(resultsDir, "blast-xt-parsed_summary.csv")

bm = BlastMapper()

## load the gene and isoform maps (best hit per isoform)
bmapIsoforms = bm.load_summary(summaryFile1, trinityGene=False, best=True)
bmapFrog = bm.load_summary(
    summaryFile1,
    trinityGene=False,
    best=True,
    taxaList=['8355', '8364'])
bmapXT = bm.load_summary(summaryFile2, trinityGene=False, best=True)

## print a summary for each map
print("-----------")
print("SwissProt - isoforms")
bm.print_summary(bmapIsoforms)

print("SwissProt [8355,8364] - isoforms")
bm.print_summary(bmapFrog)

print("X. tropicalis - isoforms")