def add_webentity_creation_rule_iter(self, rule_prefix, pattern, write_in_trie=True):
    '''
    Note: write_in_trie has 2 effects: store the rule in the trie, and
    apply it to existing entities. write_in_trie=False is essentially
    for init on an existing traph.

    Note 2: it seems obvious from the api design but let's restate it:
    we can only have one rule for each prefix (or none).
    '''
    rule_prefix = self.__encode(rule_prefix)

    self.webentity_creation_rules[rule_prefix] = re.compile(
        pattern,
        re.I
    )

    report = TraphWriteReport()
    state = TraphIteratorState()

    if write_in_trie:
        node, history = self.lru_trie.add_lru(rule_prefix)

        if not node:
            raise TraphException('Prefix not in tree: ' + rule_prefix)

        node.flag_as_webentity_creation_rule()
        node.write()

        # Spawn necessary web entities
        candidate_prefixes = set()

        for node2, lru in self.lru_trie.dfs_iter(node, rule_prefix):
            if node2.is_page():
                _, add_report = self.__add_page(lru)
                report += add_report

                if state.should_yield():
                    yield state

    yield state.finalize(report)
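# --- Usage sketch (illustrative, not part of the class) ---------------------
# These *_iter methods periodically yield a TraphIteratorState so a caller
# can keep an event loop responsive while long operations run; the last
# yield is whatever `state.finalize(result)` returns. A minimal way to drain
# such a generator, assuming nothing about TraphIteratorState beyond what is
# visible above, is the hypothetical helper below:
#
# def consume_iterator(iterator):
#     last_state = None
#     for last_state in iterator:
#         pass
#     return last_state
#
# final_state = consume_iterator(
#     traph.add_webentity_creation_rule_iter(rule_prefix, pattern)
# )
# where `traph`, `rule_prefix` and `pattern` are assumed to exist; the
# finalized result wraps the TraphWriteReport built above.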
def index_batch_crawl_iter(self, data):
    '''
    data must be a multimap 'source_lru' => 'target_lrus'
    '''
    store = self.link_store
    state = TraphIteratorState()
    report = TraphWriteReport()
    pages = dict()
    inlinks = defaultdict(list)

    for source_page, target_pages in data.items():
        source_page = self.__encode(source_page)

        # We need to add the page
        if source_page not in pages:
            source_node, source_page_report = self.__add_page(source_page, crawled=True)
            report += source_page_report
            pages[source_page] = source_node
        else:
            source_node = pages[source_page]

            if not source_node.is_crawled():
                source_node.refresh()
                source_node.flag_as_crawled()
                source_node.write()

        target_blocks = []

        for target_page in target_pages:
            target_page = self.__encode(target_page)

            if target_page not in pages:
                target_node, target_page_report = self.__add_page(target_page)
                report += target_page_report
                pages[target_page] = target_node
                target_blocks.append(target_node.block)

                if state.should_yield(200):
                    yield state
            else:
                target_blocks.append(pages[target_page].block)

            # TODO: possible to store block as value rather
            inlinks[target_page].append(source_page)

        source_node.refresh()
        store.add_outlinks(source_node, target_blocks)

    for target_page, source_pages in inlinks.items():
        target_node = pages[target_page]
        target_node.refresh()

        source_blocks = (pages[source_page].block for source_page in source_pages)
        store.add_inlinks(target_node, source_blocks)

        if state.should_yield():
            yield state

    yield state.finalize(report)
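# --- Usage sketch (illustrative, not part of the class) ---------------------
# Indexing one crawled page and its outgoing links; the LRUs below are
# hypothetical and `consume_iterator` is the helper sketched earlier:
#
# crawl_data = {
#     's:http|h:fr|h:sciences-po|h:medialab|': [
#         's:http|h:fr|h:sciences-po|',
#         's:http|h:com|h:twitter|p:medialab_ScPo|'
#     ]
# }
# final_state = consume_iterator(traph.index_batch_crawl_iter(crawl_data))
# The finalized result wraps the TraphWriteReport accumulated for the batch.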
def get_webentity_pagelinks_iter(self, weid, prefixes, include_inbound=False,
                                 include_internal=True, include_outbound=False):
    '''
    Returns all or part of: pagelinks to the entity, internal pagelinks,
    pagelinks out of the entity. Default is only internal pagelinks.

    Note: the prefixes are supposed to match the webentity id. We do not check.
    '''
    # TODO: can be optimized by caching windups
    state = TraphIteratorState()
    pagelinks = []

    source_node = self.lru_trie.node()
    target_node = self.lru_trie.node()

    for prefix in prefixes:
        prefix = self.__encode(prefix)
        starting_node = self.lru_trie.lru_node(prefix)

        if not starting_node:
            raise TraphException('LRU %s not in the traph' % (prefix))

        for node, lru in self.lru_trie.webentity_dfs_iter(starting_node, prefix):

            if not node.is_page():
                continue

            # Iterating over the page's outlinks
            if node.has_outlinks() and (include_outbound or include_internal):
                links_block = node.outlinks()

                for link_node in self.link_store.link_nodes_iter(links_block):
                    target_node.read(link_node.target())
                    target_lru = self.lru_trie.windup_lru(target_node.block)
                    target_webentity = self.lru_trie.windup_lru_for_webentity(target_node)

                    if (include_outbound and target_webentity != weid) or \
                       (include_internal and target_webentity == weid):
                        pagelinks.append([lru, target_lru, link_node.weight()])

                    if state.should_yield(5000):
                        yield state

            # Iterating over the page's inlinks
            if node.has_inlinks() and include_inbound:
                links_block = node.inlinks()

                for link_node in self.link_store.link_nodes_iter(links_block):
                    source_node.read(link_node.target())
                    source_lru = self.lru_trie.windup_lru(source_node.block)
                    source_webentity = self.lru_trie.windup_lru_for_webentity(source_node)

                    if source_webentity != weid:
                        pagelinks.append([source_lru, lru, link_node.weight()])

                    if state.should_yield(5000):
                        yield state

    yield state.finalize(pagelinks)
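# --- Usage sketch (illustrative, not part of the class) ---------------------
# Fetching internal and outbound pagelinks of a webentity, assuming `traph`,
# a webentity id `weid` and its matching `prefixes` are already known:
#
# final_state = consume_iterator(
#     traph.get_webentity_pagelinks_iter(
#         weid,
#         prefixes,
#         include_inbound=False,
#         include_internal=True,
#         include_outbound=True
#     )
# )
# Each collected pagelink is a [source_lru, target_lru, weight] triple.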
def get_webentities_links_iter(self, out=True, include_auto=False):
    '''
    This method should be faster than the slow version because it avoids
    unnecessary upward traversal.

    Note that it is also possible to solve the links right away and store
    them to solve their webentities later, but this is more costly in RAM.
    '''
    graph = defaultdict(Counter)
    page_to_webentity = dict()
    link_pointers = []
    state = TraphIteratorState()

    # Solving the page => webentity relation
    for node, source_webentity in self.lru_trie.dfs_with_webentity_iter():
        if not node.is_page():
            continue

        if not source_webentity:
            continue

        page_to_webentity[node.block] = source_webentity

        if node.has_links(out=out):
            link_pointers.append((source_webentity, node.links(out=out)))

        if state.should_yield():
            yield state

    # Computing the links
    for source_webentity, links_block in link_pointers:

        for link_node in self.link_store.link_nodes_iter(links_block):
            target_webentity = page_to_webentity.get(link_node.target())

            # The target page might not have a target webentity
            if not target_webentity:
                continue

            if not include_auto and source_webentity == target_webentity:
                continue

            # Adding to the graph
            graph[source_webentity][target_webentity] += link_node.weight()

            if state.should_yield(5000):
                yield state

    yield state.finalize(graph)
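# --- Usage sketch (illustrative, not part of the class) ---------------------
# Building the webentity-to-webentity graph, assuming `traph` exists. The
# accumulated structure is a defaultdict(Counter) mapping
# source_webentity => Counter({target_webentity: weight}), which flattens
# into an edge list like so:
#
# final_state = consume_iterator(traph.get_webentities_links_iter(out=True))
# graph = ...  # extract the finalized payload from final_state
# edges = [
#     (source, target, weight)
#     for source, counter in graph.items()
#     for target, weight in counter.items()
# ]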
def get_webentity_pages_iter(self, weid, prefixes):
    '''
    Note: the prefixes are supposed to match the webentity id. We do not check.
    '''
    state = TraphIteratorState()
    pages = []

    for node, lru in self.webentity_page_nodes_iter(weid, prefixes):
        pages.append({
            'lru': lru,
            'crawled': node.is_crawled()
        })

        if state.should_yield(2000):
            yield state

    yield state.finalize(pages)
def get_webentity_most_linked_pages_iter(self, weid, prefixes, pages_count=10):
    '''
    Returns a list of objects {lru:, indegree:}

    Note: the prefixes are supposed to match the webentity id. We do not check.
    '''
    state = TraphIteratorState()
    pages = []
    c = 0

    for prefix in prefixes:
        prefix = self.__encode(prefix)
        starting_node = self.lru_trie.lru_node(prefix)

        if not starting_node:
            raise TraphException('LRU %s not in the traph' % (prefix))

        for node, lru in self.lru_trie.webentity_dfs_iter(starting_node, prefix):

            if node.is_page():

                # Iterate over link nodes to compute the page's indegree
                indegree = 0
                for linknode in self.link_store.link_nodes_iter(node.inlinks()):
                    indegree += 1

                # Keep only the pages_count best pages in a min-heap
                # (c is a tie-breaker so heapq never has to compare lrus)
                c += 1
                heapq.heappush(pages, (indegree, c, lru))
                if len(pages) > pages_count:
                    heapq.heappop(pages)

                if state.should_yield(2000):
                    yield state

    # Pop the heap to list the pages by decreasing indegree
    sorted_pages = [None] * len(pages)

    i = len(pages) - 1
    while len(pages):
        page = heapq.heappop(pages)
        sorted_pages[i] = {'lru': page[2], 'indegree': page[0]}
        i -= 1

    yield state.finalize(sorted_pages)
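# --- Usage sketch (illustrative, not part of the class) ---------------------
# Top 25 most cited pages of a webentity, assuming `traph`, `weid` and
# matching `prefixes`:
#
# final_state = consume_iterator(
#     traph.get_webentity_most_linked_pages_iter(weid, prefixes, pages_count=25)
# )
# The finalized result wraps a list of {'lru': ..., 'indegree': ...} dicts
# sorted by decreasing indegree.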
def get_webentity_child_webentities_iter(self, weid, prefixes):
    '''
    Note: the prefixes are supposed to match the webentity id. We do not check.
    '''
    state = TraphIteratorState()
    weids = set()

    for prefix in prefixes:
        prefix = self.__encode(prefix)
        starting_node = self.lru_trie.lru_node(prefix)

        if not starting_node:
            raise TraphException('LRU %s not in the traph' % (prefix))

        for node, _ in self.lru_trie.dfs_iter(starting_node, prefix):
            weid2 = node.webentity()

            if weid2 and weid2 > 0 and weid2 != weid:
                weids.add(weid2)

            if state.should_yield(5000):
                yield state

    yield state.finalize(list(weids))
def get_webentity_inlinks_iter(self, weid, prefixes):
    '''
    Returns the list of citing web entities.

    Note: the prefixes are supposed to match the webentity id. We do not check.
    '''
    state = TraphIteratorState()
    done_blocks = set()
    weids = set()

    source_node = self.lru_trie.node()

    for prefix in prefixes:
        prefix = self.__encode(prefix)
        starting_node = self.lru_trie.lru_node(prefix)

        if not starting_node:
            raise TraphException('LRU %s not in the traph' % (prefix))

        for node, lru in self.lru_trie.webentity_dfs_iter(starting_node, prefix):

            if not node.is_page():
                continue

            # Iterating over the page's inlinks
            if node.has_inlinks():
                links_block = node.inlinks()

                for link_node in self.link_store.link_nodes_iter(links_block):
                    source_node.read(link_node.target())

                    if source_node.block not in done_blocks:
                        source_webentity = self.lru_trie.windup_lru_for_webentity(source_node)
                        done_blocks.add(source_node.block)
                        weids.add(source_webentity)

                    if state.should_yield(5000):
                        yield state

    yield state.finalize(weids)
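# --- Usage sketch (illustrative, not part of the class) ---------------------
# Listing the webentities citing a given one, assuming `traph`, `weid` and
# matching `prefixes`:
#
# final_state = consume_iterator(traph.get_webentity_inlinks_iter(weid, prefixes))
# The finalized result wraps the set of webentity ids found by winding up
# each distinct source page.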