def _loadNodes(self, ncbiDumpFile):
    """Parse a taxonomy nodes dump file and store the result in ``self.nodes``.

    Every record is split on the tab-pipe-tab delimiter and turned into a
    ``TaxaNode`` carrying ``id``, ``idpar``, ``rank`` and ``divid``.  The
    resulting ``{node.id: node}`` dict is assigned to ``self.nodes`` only
    after the whole file has been read successfully.

    Args:
        ncbiDumpFile: Path of the dump file, passed as-is to
            ``openCompressed`` (presumably handles compressed input --
            confirm against that helper).

    Raises:
        AssertionError: If a taxonomy ID is not below ``ncbiTaxidMax``.
        ValueError: If a numeric field cannot be converted to ``int``.
        IndexError: If a record has fewer fields than expected.
    """
    # Only fields 0..4 are consumed, so stop splitting after the fifth
    # delimiter; whatever follows stays unsplit in the last element and is
    # never looked at.
    max_splits = 5
    nodes = {}
    inp = openCompressed(ncbiDumpFile, 'r')
    try:
        for rec in inp:
            # On profiling, string methods outperformed regexes, hence
            # str.split() rather than a compiled delimiter pattern.
            values = rec.split("\t|\t", max_splits)
            node = TaxaNode()
            node.id = int(values[0])
            node.idpar = int(values[1])
            node.rank = values[2].replace(' ', '_')
            node.divid = int(values[4])
            # In the NCBI file, the root node points to itself as a parent.
            # We replace it with 0 for consistency with our SQL DB
            # representation, where circular self-reference would be
            # inconvenient.
            if node.idpar == node.id:
                node.idpar = 0
            nodes[node.id] = node
            assert node.id < ncbiTaxidMax, "We assume that dump file is pristine NCBI file and " + \
                "assert that taxonomy ID (%s) < our max limit (%s)" % (node.id, ncbiTaxidMax)
    finally:
        # Release the file handle even when a record fails to parse.
        inp.close()
    self.nodes = nodes
def load(self):
    """Read every row of the nodes table into ``TaxaNode`` objects.

    Issues ``select * from <self.tblNodes>`` through a bulk reader obtained
    from ``self.db`` and copies the columns listed below from each record
    onto a fresh ``TaxaNode``.

    Returns:
        dict: ``{"nodes": {node.id: TaxaNode}, "merged": {}}`` -- the
        ``merged`` entry is always an empty dict here.
    """
    # Columns copied verbatim from each record onto the node.
    columns = (
        'id',
        'idpar',
        'rank',
        'lnest',
        'rnest',
        'depth',
        'seq_len',
        'seq_len_tot',
        'idlevel',
        'divid',
    )
    db = self.db
    reader = db.makeBulkReader(sql="select * from %s" % (self.tblNodes,), bufLen=100000)
    nodes = {}
    try:
        for chunk in reader.chunks():
            for rec in chunk:
                node = TaxaNode()
                for column in columns:
                    setattr(node, column, rec[column])
                nodes[node.id] = node
    finally:
        # Close the reader even if iteration or a column lookup fails.
        reader.close()
    return dict(nodes=nodes, merged={})