def __init__(self, html):
    '''Store the input document and prepare the parser state.

    html is the input html doc; it is kept unchanged on the instance.
    The tree starts as a single empty TreeNode.Node that is both the
    root and the current insertion point. All regular expressions used
    while scanning the markup are compiled once here.
    '''
    compile_pattern = re.compile

    self.html = html

    # Tree state: the cursor starts on the root node.
    self.root = TreeNode.Node()
    self.curr = self.root

    # A whole tag, opening or closing; the lookahead rejects "<!...".
    self.tagPattern = compile_pattern(r'(?!<!)</?[^>]+>')

    # Attribute handling.
    # Greedy span from the first name=" up to the last double quote.
    self.attrPattern = compile_pattern(r'\w+=\".*\"')
    # Whitespace that directly follows a double quote.
    self.attrListPattern = compile_pattern(r'(?<=\")\s')
    # The word immediately before =".
    self.attrNamePattern = compile_pattern(r'\w+(?=\=\")')
    # The quoted text immediately after "=" (quotes included, greedy).
    self.attrContentPattern = compile_pattern(r'(?<=\=)\".*\"')

    # Text after a ">" and before a "</" (greedy).
    self.contentPattern = compile_pattern(r'(?<=>).*(?=</)')

    # Start / close tag prefixes and the names inside them.
    self.startPattern = compile_pattern(r'<\w+')
    self.closePattern = compile_pattern(r'</\w+')
    self.namePattern = compile_pattern(r'(?<=<)\w+')
    self.closeNamePattern = compile_pattern(r'(?<=</)\w+')
def build(pre, inord, in_l, in_r, pre_l, pre_r):
    '''Rebuild a binary tree from its preorder and inorder sequences.

    pre[pre_l..pre_r] and inord[in_l..in_r] (both bounds inclusive)
    describe the same subtree.

    Returns the TreeNode.Node at the root of that subtree, or None when
    either window is empty.

    Raises ValueError if the subtree's root value (pre[pre_l]) does not
    occur inside the current inorder window.
    '''
    if pre_l > pre_r or in_l > in_r:
        return None

    root_val = pre[pre_l]
    root = TreeNode.Node(root_val)

    # Search only inside the current inorder window. Scanning the whole
    # sequence from index 0 wastes work and, when values repeat, can
    # return a position outside [in_l, in_r], which corrupts left_size.
    # NOTE: a further optimization is a hash map from values to their
    # indices in inord, built once by the caller (unique values only).
    root_idx = inord.index(root_val, in_l, in_r + 1)
    left_size = root_idx - in_l

    # The left subtree takes the next left_size preorder entries and
    # the right subtree takes whatever remains.
    root.left = build(pre, inord, in_l, root_idx - 1,
                      pre_l + 1, pre_l + left_size)
    root.right = build(pre, inord, root_idx + 1, in_r,
                       pre_l + left_size + 1, pre_r)
    return root
def buildTree(self): '''Build the DOM tree by iterating self.html (each item is treated as one line) and matching tags with self.tagPattern. For a start tag, a TreeNode.Node is created for every name that self.namePattern finds in the line, given the attributes from self.getAttr(tag), and attached as a child of self.curr; unless self.isSpecialNames(name) is true, that node then becomes self.curr. For a close tag, self.curr is moved to the parent of the node returned by self.findStartTag(name). NOTE(review): node.appendAttrList(attr) is called twice per node, and tag names are collected from the whole line rather than from the current tag - confirm both are intended.''' for eachLine in self.html: tags = re.findall(self.tagPattern, eachLine) if tags: for tag in tags: # if tag: # print(tag) start = re.match(self.startPattern, tag) close = re.match(self.closePattern, tag) if start: # fill the content of the node names = re.findall(self.namePattern, eachLine) for name in names: node = TreeNode.Node() attr = self.getAttr(tag) node.appendAttrList(attr) node.name = name node.appendAttrList(attr) # judge if the tag is a special node, if the answer is yes # ignore content part and the tag must be a child node # if the tag is not a special node, change the node to curr node if self.isSpecialNames(name): node.parent = self.curr self.curr.appendChild(node) self.addContentToAllParent(eachLine, node) else: content = re.findall(self.contentPattern, eachLine) if content: node.appendContent(content[0]) node.parent = self.curr self.curr.appendChild(node) self.curr = node # if the node is a close node, search all the way to top # to find its start tag and close it all if close: names = re.findall(self.closeNamePattern, eachLine) for name in names: self.curr = self.findStartTag(name).parent self.curr.appendContent(eachLine)
def buildTree(rel, att, od):
    '''Build a tree from the entries of tn.getAttList(rel, att) and register it.

    The first (key, pointer) entry is inserted into a fresh root node
    allocated with tn.Node(tn.getPage()). Every later entry is inserted
    into the node that tn.search finds for its key. The resulting root
    is written with root.__write__(), the triple
    [rel, att, root.node_page] is appended to the JSON list stored in
    INDEX_PATH + 'directory.txt', and root.__print__() is printed.

    rel, att: forwarded to tn.getAttList and recorded in the directory.
    od: forwarded unchanged to tn.insert (presumably the tree order;
        confirm against tn.insert).

    Returns None. The entry list is assumed to be non-empty: the first
    entry is read unconditionally.
    '''
    tuple_list = tn.getAttList(rel, att)

    # create root node and seed it with the first entry
    root = tn.Node(tn.getPage())
    root = tn.insert(root, tuple_list[0][0], tuple_list[0][1], od, root)

    # Remaining entries: find the target node for the key, insert there,
    # and keep whatever root tn.insert hands back.
    for i in range(1, len(tuple_list)):
        entry = tuple_list[i]
        key = entry[0]
        pointer = entry[1]
        root = tn.insert(tn.search(root, key)[0], key, pointer, od, root)
    root.__write__()

    directory_path = INDEX_PATH + 'directory.txt'
    with open(directory_path) as f:
        directory = json.loads(f.read())

    # update directory
    directory.append([rel, att, root.node_page])
    # Serialize before reopening the file: mode 'w' truncates it at
    # once, so a failure while building or encoding the new entry would
    # otherwise leave the directory file empty.
    payload = json.dumps(directory)
    with open(directory_path, 'w') as f:
        f.write(payload)

    print(root.__print__())