def load_data(self):
    """Load the taxonomy dump into a tree of ``TreeNode`` objects.

    Reads the ``names.dmp`` and ``nodes.dmp`` members of ``self.data_files``
    (anything exposing ``extractfile(name)`` that returns a closable binary
    stream -- presumably a ``tarfile.TarFile``; confirm against the caller)
    and populates:

    * ``self.id2name``  -- ``defaultdict(str)`` mapping tax id (string) to name
    * ``self.name2node`` -- dict mapping name to its ``TreeNode``
    * ``self.tree``      -- the node named ``"root"``; left unset when the
      dump contains no such node

    Rows are expected to be ``"\\t|\\t"``-separated and terminated by
    ``"\\t|\\n"``. When a ``names.dmp`` row carries a fourth column (the name
    class in the NCBI taxdump layout), the row whose class is
    ``"scientific name"`` is preferred; ids without such a row keep the last
    name listed for them.

    Raises:
        KeyError: if a node's parent name has no node of its own.
        IndexError: if a non-blank row has fewer than two fields.
    """

    def read_rows(member):
        # Yield the cleaned fields of every non-blank line in *member*,
        # closing the extracted stream once it is exhausted.
        with self.data_files.extractfile(member) as handle:
            for raw in handle:
                text = raw.decode()
                if not text.strip():
                    continue
                fields = text.split('\t|\t')
                # The final field still carries the "\t|\n" row terminator.
                tail = fields[-1].rstrip('\r\n')
                if tail.endswith('\t|'):
                    tail = tail[:-2]
                fields[-1] = tail
                yield fields

    # map tax ids to names
    self.id2name = defaultdict(str)
    # map names to nodes
    self.name2node = {}
    # map child name to parent name
    child2parent = {}

    # ids whose scientific name is already recorded; later synonym or
    # common-name rows must not overwrite those entries
    has_scientific_name = set()
    for row in read_rows("names.dmp"):
        tax_id, name = row[0], row[1]
        if len(row) > 3 and row[3] == "scientific name":
            self.id2name[tax_id] = name
            has_scientific_name.add(tax_id)
        elif tax_id not in has_scientific_name:
            self.id2name[tax_id] = name

    # NOTE(review): nodes are keyed by name, so two tax ids sharing a name
    # collapse into a single node -- verify this is acceptable for the data.
    for row in read_rows("nodes.dmp"):
        # first two columns: tax id and parent tax id
        name = self.id2name[row[0]]
        child2parent[name] = self.id2name[row[1]]
        # create and store a node
        self.name2node[name] = TreeNode(name)

    # build classical tree data structure
    for name, node in self.name2node.items():
        if name == "root":
            # the root has no parent to link to; it becomes the tree itself
            self.tree = node
            continue
        parent_node = self.name2node[child2parent[name]]
        # link child to parent and parent to child
        node.parent = parent_node
        parent_node.add_child(node)