def gba_map_taxa(dbname):
    '''
    Set source_taxon on the sequences of a database from their GenBank ids.

    Sequences are visited in windows of slc_n primary-key values. For each
    sequence with a truthy gb_id, the first TaxGBJoin row of the 'tax_gbs'
    database carrying that gbid supplies the new source_taxon. Sequences
    without a matching row are left untouched. Work is committed roughly
    every 100 visited sequences and once more at the end.

    inputs:
      dbname: name passed to cbdb.getName to open the sequence database

    outputs:
      None; rows are updated through the database session.
    '''
    cbdb = compbio.projects.cbdb
    dbi = cbdb.getName(dbname)
    tgb_dbi = cbdb.getName('tax_gbs')

    slc_n = 10000
    max_ofs = dbi.S.q(func.max(dbi.Sequence.id)).scalar()
    if max_ofs is None:
        # scalar() gives None when there are no rows; make the loop a no-op.
        max_ofs = -1

    count = 0
    slc_ofs = 0
    # '<=' rather than '<': when the largest id is an exact multiple of
    # slc_n it falls in a window of its own, which must still be visited.
    while slc_ofs <= max_ofs:
        window = (dbi.Session.query(dbi.Sequence)
                  .filter(dbi.Sequence.id >= slc_ofs)
                  .filter(dbi.Sequence.id < (slc_ofs + slc_n)))
        for s in window:
            gbid = s.gb_id
            count += 1
            if gbid:
                join_row = (tgb_dbi.Session.query(tgb_dbi.TaxGBJoin)
                            .filter_by(gbid=gbid).first())
                # first() returns None when no row matches; skip instead of
                # failing on None.taxid.
                if join_row is not None:
                    s.source_taxon = join_row.taxid
            if count > 100:
                dbi.Session.commit()
                print('committing')
                count = 0
        slc_ofs += slc_n
    dbi.Session.commit()
def db_metadata(clade):
    '''
    Annotate the nodes of a clade with database metadata, in place.

    Every terminal gets m['id'] (its position among the terminals) and,
    when all lookups succeed, m['gbacc'], m['gbid'] and m['taxid'].
    Every nonterminal gets m['id'], numbered after the terminals so that
    ids do not collide.

    inputs:
      clade: <biopython clade> whose nodes carry a dict-like attribute m

    outputs:
      None; the metadata is written into each node's m.
    '''
    tgb_dbi = cbdb.getName('tax_gbs')
    gba_dbi = cbdb.getName('gb_acc_idjoin')

    n_terminals = 0
    for idx, t in enumerate(clade.get_terminals()):
        n_terminals = idx + 1
        t.m['id'] = idx
        try:
            t.m['gbacc'] = clade_gbacc(t)
            t.m['gbid'] = gba_dbi.Session.query(gba_dbi.GBAcc).\
                filter_by(accession=t.m['gbacc']).one().gbid
            t.m['taxid'] = tgb_dbi.Session.query(tgb_dbi.TaxGBJoin).\
                filter_by(gbid=t.m['gbid']).one().taxid
        except Exception:
            # Best effort: a terminal whose accession cannot be resolved
            # keeps only the keys assigned before the failing lookup.
            continue

    # Start numbering at the terminal count; the previous 'last index + idx'
    # gave the first nonterminal the same id as the last terminal.
    for idx, t in enumerate(clade.get_nonterminals()):
        t.m['id'] = n_terminals + idx
def gba_map_gbids(dbname):
    '''
    Set gb_id on the sequences of a database from their GenBank accession.

    Sequences are visited in windows of slc_n primary-key values. Each
    sequence's gb_accession is looked up in the GBAcc table of the
    'gb_acc_idjoin' database; the first match supplies gb_id. Work is
    committed roughly every 100 visited sequences and once more at the
    end, after which the failure and success counts are printed.

    inputs:
      dbname: name passed to cbdb.getName to open the sequence database

    outputs:
      None; rows are updated through the database session.
    '''
    cbdb = compbio.projects.cbdb
    dbi = cbdb.getName(dbname)
    gbacc_dbi = cbdb.getName('gb_acc_idjoin')

    count = 0
    fail_count = 0
    sxs_count = 0

    slc_n = 10000
    max_ofs = dbi.S.q(func.max(dbi.Sequence.id)).scalar()
    if max_ofs is None:
        # scalar() gives None when there are no rows; make the loop a no-op.
        max_ofs = -1

    slc_ofs = 0
    # '<=' rather than '<': when the largest id is an exact multiple of
    # slc_n it falls in a window of its own, which must still be visited.
    while slc_ofs <= max_ofs:
        window = (dbi.Session.query(dbi.Sequence)
                  .filter(dbi.Sequence.id >= slc_ofs)
                  .filter(dbi.Sequence.id < (slc_ofs + slc_n)))
        for s in window:
            try:
                match = (gbacc_dbi.Session.query(gbacc_dbi.GBAcc)
                         .filter_by(accession=s.gb_accession).first())
            except Exception:
                # Keep the per-row best effort of the original, but let
                # KeyboardInterrupt / SystemExit through.
                match = None
            if match is None:
                print('failed!')
                fail_count += 1
            else:
                s.gb_id = match.gbid
                sxs_count += 1
            count += 1
            if count > 100:
                dbi.Session.commit()
                count = 0
                print('adding')
        slc_ofs += slc_n
    dbi.Session.commit()
    print('%s %s' % (fail_count, sxs_count))
def fill_db(name='16s', reset=True):
    '''
    Load GenBank records from every file in paths into a sequence database.

    For each path an empty Alignment row is added and committed, then one
    Sequence row is added per GenBank record. Taxon and organism are taken
    from the first feature when it is a 'source' feature, otherwise both
    are None. Commits happen every 1000 records and once at the end.

    inputs:
      name:  database name passed to cbdb.getName
      reset: passed on as np.mod(reset, 2)

    outputs:
      None; rows are written through the database session.

    NOTE(review): paths is read from the enclosing scope; it is not
    defined in this function.
    '''
    dbi = cbdb.getName(name, tables=get_tables(), reset=np.mod(reset, 2))
    count = 0
    for p in paths:
        # 'with' closes each input file; the handle used to be left open.
        with open(p) as fopen:
            a = dbi.Alignment()
            dbi.Session.add(a)
            dbi.Session.commit()
            for rec in SeqIO.parse(fopen, 'genbank'):
                # A record without any feature has no source information.
                f0 = rec.features[0] if rec.features else None
                if f0 is not None and f0.type == 'source':
                    # Drops the first six characters of the db_xref value
                    # (presumably a 'taxon:' prefix -- confirm).
                    source_taxon = f0.qualifiers['db_xref'][0][6:]
                    source_organism = f0.qualifiers['organism'][0]
                else:
                    source_taxon = None
                    source_organism = None
                # NOTE(review): tell() is taken after the record has been
                # parsed, so file_offset is not the record's start.
                seq = dbi.Sequence(name=rec.name,
                                   file_name=p,
                                   file_offset=fopen.tell(),
                                   sequence=str(rec.seq),
                                   source_taxon=source_taxon,
                                   source_organism=source_organism,
                                   gb_accession=rec.id,
                                   annotations=str(rec.annotations),
                                   alignmentid=0)
                dbi.Session.add(seq)
                if np.mod(count, 1000) == 0:
                    print('%s %s %s' % (count, p, seq.source_organism))
                    dbi.Session.commit()
                count += 1
    dbi.Session.commit()
def makeRank(self, rank='phylum', subtree=None):
    '''
    Tag each terminal that has a 'taxid' with the id of its taxon at rank.

    For every such terminal the matching Node is fetched from 'taxdmp' and
    its parent chain is followed until a node of the requested rank, the
    root, or a missing node is reached. The terminal's m[rank] is set to
    the id of the node found at that rank, or to None when there is none.

    inputs:
      rank:    rank name passed to ncbi.get_rank
      subtree: tree to annotate; defaults to self.t

    outputs:
      None; results are written into each terminal's m[rank].
    '''
    # Get the subtree and db connections to build meta for
    tree = subtree if subtree is not None else self.t
    dbi = cbdb.getName('taxdmp')

    print('Fetching taxonomic nodes from the db')
    # Get the terminal nodes and corresponding ncbi taxa
    terms = [t for t in tree.get_terminals() if 'taxid' in t.m]
    nodes = [dbi.S.q(dbi.Node).filter_by(id=t.m['taxid']).scalar()
             for t in terms]

    # Endpoints for the parental iteration
    taxa = ncbi.get_rank(rank)
    root = ncbi.get_root()

    print('Computing terminal node mappings for taxon: {0}'.format(rank))
    bar = pbar.simple(len(nodes))
    bar.start()
    for idx, node in enumerate(nodes):
        bar.update(idx)
        # Plain walk up the parent chain. The former iter()/lambda version
        # produced an empty list, hence an IndexError, when the starting
        # node was itself at the requested rank or was the root.
        cur = node
        while cur is not None and cur not in taxa and cur != root:
            cur = cur.parent
        terms[idx].m[rank] = cur.id if cur in taxa else None
    bar.finish()
    print('Done!')
def getGenealogy(node, root_node=None):
    '''
    Return the chain of nodes from the root down to node.

    The parent attribute is followed from node until root_node is reached.

    inputs:
      node:      starting node; it and its ancestors must expose parent
      root_node: node at which the walk stops; defaults to taxRoot()

    outputs:
      list of nodes ordered root first, node last. When node is the root
      the list holds that single node.
    '''
    if root_node is None:
        root_node = taxRoot()

    path = []
    cur = node
    while cur != root_node:
        path.append(cur)
        cur = cur.parent
    path.append(cur)
    path.reverse()
    return path
def fill_all_rdb16s(reset=True):
    '''
    Load every '.gbk' file under the 'alignments/16s' data path into '16s'.

    For each file an Alignment row is added and committed, then one
    Sequence row is added per GenBank record, linked to that alignment.
    source_taxon comes from the first feature's db_xref qualifier, or is
    None when that lookup fails. Commits happen every 1000 records of a
    file and once at the end.

    inputs:
      reset: passed on to cbdb.getName as np.mod(reset, 2)

    outputs:
      None; rows are written through the database session.
    '''
    paths = []
    for r, ds, fs in os.walk(config.dataPath('alignments/16s')):
        for f in fs:
            if '.gbk' in f:
                paths.append(os.path.join(r, f))

    cbdb = compbio.projects.cbdb
    dbi = cbdb.getName('16s', tables=get_tables(), reset=np.mod(reset, 2))

    for p in paths:
        # Offsets are per file: restart at 0 so the first record of a file
        # does not inherit the end position of the previous file.
        last_ofs = 0
        # 'with' closes each input file; the handle used to be left open.
        with open(p) as fopen:
            a = dbi.Alignment(file_name=config.dataURL(p))
            dbi.Session.add(a)
            dbi.Session.commit()
            count = 0
            for rec in SeqIO.parse(fopen, 'genbank'):
                try:
                    # Drops the first six characters of the db_xref value
                    # (presumably a 'taxon:' prefix -- confirm).
                    src_taxon = rec.features[0].qualifiers['db_xref'][0][6:]
                except Exception:
                    src_taxon = None
                ann = sjson.dumps(rec.annotations,
                                  default=lambda x: x.__str__())
                seq = dbi.Sequence(name=rec.name,
                                   file_name=p,
                                   file_offset=last_ofs,
                                   sequence=str(rec.seq),
                                   gb_accession=rec.id,
                                   gb_accession_version=1,
                                   gb_id=None,
                                   annotations=ann,
                                   alignment=a,
                                   source_taxon=src_taxon)
                dbi.Session.add(seq)
                # Handle position after this record; stored as the next
                # record's file_offset.
                last_ofs = fopen.tell()
                if np.mod(count, 1000) == 0:
                    print('%s %s %s' % (count, p, seq.source_organism))
                    dbi.Session.commit()
                count += 1
    dbi.Session.commit()
def fill_db(reset=True):
    '''
    Load the gencode, nodes, names and citations dump files into 'taxdmp'.

    Each file under the 'ncbi/taxdmp' data path is read line by line.
    Lines are accumulated until the text ends with the record separator
    '\\t|\\n', then split on the column separator '\\t|\\t'. get_maps()
    gives, per table, a mapping from attribute name to column index that
    is used to build the mapped object, which is merged into the session.
    A commit and a progress line are issued whenever the running line
    count is a multiple of 1000 at the end of a record, and a commit
    follows each file.

    inputs:
      reset: passed on to cbdb.getName as np.mod(reset, 2)

    outputs:
      None; rows are written through the database session.
    '''
    dbi = cbdb.getName('taxdmp', tables=get_tables(), reset=np.mod(reset, 2))
    filepath = config.dataPath('ncbi/taxdmp')
    maps = get_maps()

    record_sep = '\t|\n'
    col_sep = '\t|\t'

    def split_record(text):
        # Decode leniently, drop the record terminator, split into columns.
        return unicode(text, errors='replace').replace(record_sep, '').split(col_sep)

    fill_tables = {'Gencode': 'gencode.dmp',
                   'Node': 'nodes.dmp',
                   'Name': 'names.dmp',
                   'Citation': 'citations.dmp'}

    count = 0
    for k, v in fill_tables.items():
        dmp_path = os.path.join(filepath, v)
        mapped_class = dbi.__dict__[k]
        mapped_columns = maps[k]
        # 'with' closes each dump file; the handle used to be left open.
        with open(dmp_path) as fopen:
            fsize = os.path.getsize(dmp_path)
            pending = ''
            for line in fopen:
                count += 1
                pending += line
                if pending[-3:] != record_sep:
                    # Record continues on the next physical line.
                    continue
                cols = split_record(pending)
                pending = ''
                values = dict((attr, cols[col])
                              for attr, col in mapped_columns.items())
                dbi.Session.merge(mapped_class(**values))
                if np.mod(count, 1000) == 0:
                    dbi.Session.commit()
                    pct = '{0:4}%'.format(100 * float(fopen.tell()) / fsize)
                    print('%s %s %s %s %s' % (k, v, count, cols, pct))
        dbi.Session.commit()
    return
def fill_from_rfam_stk(p, reset=True):
    '''
    Load one Stockholm alignment file into a database named after the file.

    An Alignment row is added and committed, then one Sequence row is
    added per record. Each record's 'accession' annotation is split on
    '/' into an accession part and a range; the accession part is split
    on '.' into an id and a version, the version being 1 when there is no
    '.'. Commits happen every 100 records and once at the end.

    inputs:
      p:     path of the Stockholm file; its basename is the database name
      reset: passed on to cbdb.getName as np.mod(reset, 2)

    outputs:
      None; rows are written through the database session.
    '''
    cbdb = compbio.projects.cbdb
    aname = os.path.basename(p)
    dbi = cbdb.getName(aname, tables=get_tables(), reset=np.mod(reset, 2))

    # 'with' closes the input file; the handle used to be left open.
    with open(p) as fopen:
        a = dbi.Alignment(file_name=aname)
        dbi.Session.add(a)
        dbi.Session.commit()
        count = 0
        for rec in SeqIO.parse(fopen, 'stockholm'):
            acc = rec.annotations['accession']
            accidv, accrange = acc.split('/')
            acv_split = accidv.split('.')
            accid = acv_split[0]
            # Integer 1 when no version is given, else the version text.
            accid_version = 1 if len(acv_split) == 1 else acv_split[1]
            ann = sjson.dumps(rec.annotations,
                              default=lambda x: x.__str__())
            seq = dbi.Sequence(name=rec.name,
                               file_name=p,
                               file_offset=fopen.tell(),
                               sequence=str(rec.seq),
                               gb_accession=accid,
                               gb_accession_version=accid_version,
                               gb_accession_range=accrange,
                               gb_id=None,
                               annotations=ann,
                               alignment=a)
            dbi.Session.add(seq)
            if np.mod(count, 100) == 0:
                print('%s %s %s' % (count, p, seq.source_organism))
                dbi.Session.commit()
            count += 1
    dbi.Session.commit()
def investigatePhylum(self, aliname='group2.stk', p_node=None, **kwargs):
    '''
    Collect SeqNode records for every tree leaf and alignment sequence
    whose phylum equals p_node.

    For each selected taxonomy node the 16s sequences with that
    source_taxon are fetched; when there are none, the node's ancestors
    are tried in turn until one has sequences or the chain ends.

    inputs:
      aliname: alignment name passed to the ali helpers
      p_node:  phylum to select; defaults to
               ncbi.taxon_with_name('phylum', 'Thermotogae')
      kwargs:  forwarded to the helpers through mem.sr

    outputs:
      list of SeqNode objects, tree leaves first, alignment entries after
    '''
    if not p_node:
        p_node = ncbi.taxon_with_name('phylum', 'Thermotogae')

    # Alignment side: sequences, their taxonomy nodes and phyla.
    ali_seqs = ali.get_seqs(aliname, **mem.sr(kwargs))
    ali_nodes = array(ali.get_taxnodes(aliname, **mem.sr(kwargs)))
    ali_phyla = array(ali.get_taxon_forall(aliname,
                                           **mem.sr(kwargs, rank='phylum')))
    ali_inds = nonzero(equal(ali_phyla, p_node))[0]

    # Tree side: terminals, their taxonomy nodes and phyla.
    leaf_terminals = self.t.get_terminals()
    leaf_nodes = array(self.leafNodes(**mem.sr(kwargs)))
    leaf_phyla = array(self.getTaxon('phylum', **mem.sr(kwargs)))
    leaf_inds = nonzero(equal(leaf_phyla, p_node))[0]

    ag_sub = array(ali.get_taxon_forsome(ali_nodes[ali_inds], 'genus',
                                         'thermo', **mem.sr(kwargs)))
    lg_sub = array(self.getTaxon('genus', **mem.sr(kwargs)))[leaf_inds]
    # NOTE(review): unlike the genus call, this one does not forward
    # kwargs -- confirm whether that is intended.
    as_sub = array(ali.get_taxon_forsome(ali_nodes[ali_inds], 'species',
                                         'thermo'))
    ls_sub = array(self.getTaxon('species', **mem.sr(kwargs)))[leaf_inds]

    db16 = cbdb.getName('16s')

    def seqs_for(taxon_id):
        # All 16s Sequence rows recorded for one taxon id.
        return db16.S.q(db16.Sequence).filter_by(source_taxon=taxon_id).all()

    def fill_from_ancestors(seq_lists, start_nodes):
        # Fill any empty nodes (those lacking 16s rRNA) from the nearest
        # ancestor that has sequences.
        for seqs, node in zip(seq_lists, start_nodes):
            cur_node = node
            while not seqs:
                parent = cur_node.parent
                # Stop at the top of the chain: a missing parent, or a
                # node that is its own parent (presumably how the root is
                # stored -- confirm). Without this the loop never ends
                # when no ancestor has sequences.
                if parent is None or parent.id == cur_node.id:
                    break
                cur_node = parent
                seqs.extend(seqs_for(cur_node.id))

    a_16s = [seqs_for(n.id) for n in ali_nodes[ali_inds]]
    l_16s = [seqs_for(n.id) for n in leaf_nodes[leaf_inds]]
    fill_from_ancestors(a_16s, ali_nodes[ali_inds])
    fill_from_ancestors(l_16s, leaf_nodes[leaf_inds])

    def describe(seq_rows):
        # (sequence, gb_id, source_taxon, rank of that taxon) per row.
        return [(x.sequence, x.gb_id, x.source_taxon,
                 ncbi.get_node(x.source_taxon).rank)
                for x in seq_rows]

    leaf_sns = [SeqNode(lg_sub[i], ls_sub[i], leaf_nodes[idx],
                        describe(l_16s[i]),
                        src=leaf_terminals[idx],
                        node_id='btol:default:{0}'.format(
                            leaf_terminals[idx].m['id']))
                for i, idx in enumerate(leaf_inds)]
    ali_sns = [SeqNode(ag_sub[i], as_sub[i], ali_nodes[idx],
                       describe(a_16s[i]),
                       src=ali_seqs[idx],
                       node_id='ali:{0}:{1}'.format(aliname,
                                                    ali_seqs[idx].id))
               for i, idx in enumerate(ali_inds)]

    return list(it.chain(leaf_sns, ali_sns))
def setLeafNodes(**kwargs):
    '''
    Return one taxonomy node per terminal of self.t, in terminal order.

    A terminal whose m has a 'taxid' key maps to
    ncbi.get_node(taxid, dbi); any other terminal maps to None.

    NOTE(review): self is not a parameter here; it is taken from the
    enclosing scope. kwargs is accepted but not used.
    '''
    leaves = self.t.get_terminals()
    taxdb = cbdb.getName('taxdmp')

    resolved = []
    for leaf in leaves:
        if 'taxid' in leaf.m.keys():
            resolved.append(ncbi.get_node(leaf.m['taxid'], taxdb))
        else:
            resolved.append(None)
    return resolved
def taxRoot():
    '''
    Return the node attached to the one Name row whose name_txt is 'root'.

    The lookup uses one(), so it raises unless exactly one such row
    exists in the 'taxdmp' database.
    '''
    taxdb = cbdb.getName('taxdmp')
    root_name = taxdb.Session.query(taxdb.Name).filter_by(name_txt='root').one()
    return root_name.node
def rna4gbid(gbid, dbname='16s'):
    '''
    Return the sequence of a randomly chosen row of a sequence database.

    gbid is not used: per the original note, a random RNA is handed back
    because the taxonomy database had not been created yet.

    inputs:
      gbid:   ignored
      dbname: database name passed to cbdb.getName

    outputs:
      the sequence attribute of the Sequence row at a random position in
      the range 0..999 of the query
    '''
    dbi = cbdb.getName(dbname)
    # floor() returns a float under Python 2; a query position must be an
    # integer, so convert explicitly.
    seq_num = int(floor(1000 * random.random()))
    seq = dbi.Session.query(dbi.Sequence)[seq_num].sequence
    return seq
def get_root():
    '''
    Return the Node row with id 1 from the 'taxdmp' database.

    The lookup uses one(), so it raises unless exactly one such row exists.
    '''
    taxdb = cbdb.getName('taxdmp')
    root_query = taxdb.S.q(taxdb.Node).filter_by(id=1)
    return root_query.one()
def get_rank(rankname='phylum'):
    '''
    Return every Node row of the 'taxdmp' database whose rank is rankname.

    inputs:
      rankname: value matched against Node.rank

    outputs:
      list of matching Node rows (empty when none match)
    '''
    taxdb = cbdb.getName('taxdmp')
    return taxdb.S.q(taxdb.Node).filter_by(rank=rankname).all()
def get_node(nodeid, dbi=None):
    '''
    Return the Node row whose id is nodeid.

    inputs:
      nodeid: value matched against Node.id
      dbi:    database handle to query; when None, the 'taxdmp' handle
              from cbdb.getName is used

    outputs:
      the result of scalar() on the filtered Node query
    '''
    # Identity test: a handle that overloads '==' must not be mistaken
    # for a missing argument.
    if dbi is None:
        dbi = cbdb.getName('taxdmp')
    return dbi.S.q(dbi.Node).filter_by(id=nodeid).scalar()
def set_seqs(**kwargs):
    '''
    Return every Sequence row of the database named by kwargs['dbname'].

    inputs:
      kwargs: must contain 'dbname', the name passed to cbdb.getName

    outputs:
      list of all Sequence rows of that database
    '''
    cbdb = compbio.projects.cbdb
    target = kwargs['dbname']
    handle = cbdb.getName(target)
    return handle.S.q(handle.Sequence).all()
def fill_db(name='bacterial_genomes', reset=False, postgres=False,
            host='broad', gbk_root='/Volumes/ganymede/all.gbk/',
            skip_first=1668):
    '''
    Load GenBank genome files into a genome database.

    Every file under gbk_root whose name contains 'gbk' is collected. The
    first skip_first collected files are skipped. For each record of the
    remaining files the sequence is written as FASTA to
    'genomes/<record id>.fa' (resolved through config), a Genome row is
    added and committed, and Feature, SubFeature and Qualifier rows are
    added for the record's features. Feature rows are committed when the
    file counter is even, and once more at the end.

    inputs:
      name, postgres, host: passed to cbdb.getName
      reset:      passed on as np.mod(reset, 2)
      gbk_root:   directory walked for input files
      skip_first: number of collected files to skip before loading
                  (NOTE(review): looks like a resume point from an earlier
                  interrupted run -- confirm; os.walk order is not
                  guaranteed to be stable)

    outputs:
      None; rows are written through the database session.
    '''
    dbi = cbdb.getName(name,
                       postgres=postgres,
                       tables=get_tables(),
                       reset=np.mod(reset, 2),
                       host=host)

    paths = []
    for r, ds, fs in os.walk(gbk_root):
        for f in fs:
            if 'gbk' in f:
                paths.append(os.path.join(r, f))

    count = 0
    for p in paths:
        count += 1
        if count <= skip_first:
            continue

        # 'with' closes the input file. The old code reused one variable
        # for input and FASTA output, so the input was never closed.
        with open(p) as gbk_in:
            for rec in SeqIO.parse(gbk_in, 'genbank'):
                # A record without any feature has no source information.
                f0 = rec.features[0] if rec.features else None
                if f0 is not None and f0.type == 'source':
                    # Drops the first six characters of the db_xref value
                    # (presumably a 'taxon:' prefix -- confirm).
                    source_taxon = f0.qualifiers['db_xref'][0][6:]
                    source_organism = f0.qualifiers['organism'][0]
                else:
                    source_taxon = None
                    source_organism = None

                fa_seqpath = 'genomes/' + rec.id + '.fa'
                fa_sequrl = config.dataURL(fa_seqpath)
                fa_seqfile = config.dataPath(fa_sequrl)
                with open(fa_seqfile, 'w') as fa_out:
                    SeqIO.write(rec, fa_out, 'fasta')

                adds = []
                genome = dbi.Genome(name=rec.name,
                                    seq_url=fa_sequrl,
                                    source_taxon=source_taxon,
                                    source_organism=source_organism,
                                    gb_accession=rec.id,
                                    annotations=str(rec.annotations))
                # %-formatting: source_organism may be None, which made
                # the old string concatenation raise TypeError.
                print('adding genome %s' % source_organism)
                dbi.Session.add(genome)
                print('commiting update ')
                dbi.Session.commit()
                print('genome added! ')

                for f in rec.features:
                    feature = dbi.Feature(
                        type=f.type,
                        start=f.location.start.position,
                        start_ext=f.location.start.extension,
                        end=f.location.end.position,
                        end_ext=f.location.end.extension,
                        strand=f.strand,
                        genomeobj=genome)
                    adds.append(feature)
                    for k, v in f.qualifiers.items():
                        adds.append(dbi.Qualifier(key=k,
                                                  value=str(v),
                                                  featureobj=feature))
                    for sf in f.sub_features:
                        sub = dbi.SubFeature(
                            type=sf.type,
                            start=sf.location.start.position,
                            start_ext=sf.location.start.extension,
                            end=sf.location.end.position,
                            end_ext=sf.location.end.extension,
                            strand=sf.strand,
                            featureobj=feature)
                        adds.append(sub)
                        for k, v in sf.qualifiers.items():
                            # Link to the SubFeature row 'sub', matching
                            # how feature qualifiers link to 'feature';
                            # 'sf' is the parsed feature, not a row.
                            adds.append(dbi.Qualifier(key=k,
                                                      value=str(v),
                                                      subfeatureobj=sub))

                dbi.Session.add_all(adds)
                if np.mod(count, 2) == 0:
                    print(count)
                    print('committing update')
                    dbi.Session.commit()
                    print('update commited!')
    dbi.Session.commit()