示例#1
0
    def load_data(self):
        """Load the taxonomy dump into a tree of ``TreeNode`` objects.

        Reads ``names.dmp`` and ``nodes.dmp`` from ``self.data_files`` and
        populates:

        * ``self.id2name`` -- tax id (as text) -> name.  It is a
          ``defaultdict(str)``, so an unknown id maps to ``""``.
        * ``self.name2node`` -- name -> ``TreeNode``.
        * ``self.tree`` -- the node named ``"root"`` (only assigned when
          such a node exists).

        ``names.dmp`` carries several rows per tax id (scientific name,
        synonyms, common names, ...), distinguished by the fourth
        "name class" column.  Only the ``scientific name`` row is used, so
        each tax id gets a single, well-defined label instead of whichever
        row happens to be listed last.

        Nodes are keyed by name: if two tax ids share a name, the later
        ``nodes.dmp`` row replaces the earlier one in ``self.name2node``.

        Raises:
            KeyError: if a non-root node's parent name has no node of its
                own in ``nodes.dmp``.
        """
        # map tax ids to names
        self.id2name = defaultdict(str)
        # map names to nodes
        self.name2node = {}
        # map child name to parent name
        child2parent = {}

        for line in self.data_files.extractfile("names.dmp"):
            row = line.decode().split('\t|\t')
            # The last field still carries the "\t|\n" line terminator, so
            # cut it off before comparing the name class.  Rows without a
            # name-class column are accepted unchanged.
            if len(row) > 3:
                name_class = row[3].partition('\t|')[0].strip()
                if name_class != "scientific name":
                    continue
            self.id2name[row[0]] = row[1]

        for line in self.data_files.extractfile("nodes.dmp"):
            # parse the line; the final field holds the line terminator
            # and is discarded
            row = line.decode().split('\t|\t')[:-1]
            # first column is the node's tax id, second its parent's
            name = self.id2name[row[0]]
            parent_name = self.id2name[row[1]]
            child2parent[name] = parent_name
            # create and store a node
            self.name2node[name] = TreeNode(name)

        # Link nodes in a second pass, because a parent may be listed
        # after its children in nodes.dmp.
        for name, node in self.name2node.items():
            if name == "root":
                # the root has no parent to link to
                self.tree = node
                continue
            parent_node = self.name2node[child2parent[name]]
            # link child to parent node
            node.parent = parent_node
            # link parent to child node
            parent_node.add_child(node)