Example #1
    def add_webentity_creation_rule_iter(self, rule_prefix, pattern, write_in_trie=True):
        '''
        Note: write_in_trie has two effects: it stores the rule in the trie and applies it to existing entities.
        Passing write_in_trie=False is essentially meant for initialization on an existing traph.

        Note 2: it may seem obvious from the API design, but let's restate it:
        we can only have one rule per prefix (or none).
        '''
        rule_prefix = self.__encode(rule_prefix)

        self.webentity_creation_rules[rule_prefix] = re.compile(
            pattern,
            re.I
        )

        report = TraphWriteReport()
        state = TraphIteratorState()

        if write_in_trie:
            node, history = self.lru_trie.add_lru(rule_prefix)
            if not node:
                raise TraphException('Prefix not in tree: ' + rule_prefix)
            node.flag_as_webentity_creation_rule()
            node.write()
            # Spawn necessary web entities
            candidate_prefixes = set()
            for node2, lru in self.lru_trie.dfs_iter(node, rule_prefix):
                if node2.is_page():
                    _, add_report = self.__add_page(lru)
                    report += add_report

                if state.should_yield():
                    yield state

        yield state.finalize(report)
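
These *_iter methods share one pattern: intermediate yields only hand control back to the caller, and the last value yielded is whatever state.finalize(...) returns. A minimal consumption sketch under that assumption; the consume_iterator helper and the prefix/pattern strings are made up for illustration and are not part of the Traph API.

def consume_iterator(iterator):
    """Exhaust a Traph *_iter generator and return the last value it
    yields, i.e. whatever state.finalize(...) returned above."""
    last = None
    for last in iterator:
        pass  # intermediate yields only release control to the caller
    return last

# Usage sketch, assuming an existing Traph instance named `traph`
# (the prefix and pattern below are purely illustrative):
#
#   final_state = consume_iterator(
#       traph.add_webentity_creation_rule_iter('s:http|h:com|h:example|', '.*')
#   )
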
Example #2
    def index_batch_crawl_iter(self, data):
        '''
        data must be a multimap 'source_lru' => 'target_lrus'
        '''
        store = self.link_store
        state = TraphIteratorState()
        report = TraphWriteReport()
        pages = dict()
        inlinks = defaultdict(list)

        for source_page, target_pages in data.items():
            source_page = self.__encode(source_page)

            # We need to add the page
            if source_page not in pages:
                source_node, source_page_report = self.__add_page(source_page, crawled=True)
                report += source_page_report
                pages[source_page] = source_node
            else:
                source_node = pages[source_page]

                if not source_node.is_crawled():
                    source_node.refresh()
                    source_node.flag_as_crawled()
                    source_node.write()

            target_blocks = []

            for target_page in target_pages:
                target_page = self.__encode(target_page)

                if target_page not in pages:
                    target_node, target_page_report = self.__add_page(target_page)
                    report += target_page_report
                    pages[target_page] = target_node
                    target_blocks.append(target_node.block)

                    if state.should_yield(200):
                        yield state

                else:
                    target_blocks.append(pages[target_page].block)

                # TODO: possibly store the block as value rather than the lru
                inlinks[target_page].append(source_page)

            source_node.refresh()
            store.add_outlinks(source_node, target_blocks)

        for target_page, source_pages in inlinks.items():
            target_node = pages[target_page]
            target_node.refresh()
            source_blocks = (pages[source_page].block for source_page in source_pages)
            store.add_inlinks(target_node, source_blocks)

            if state.should_yield():
                yield state

        yield state.finalize(report)
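
For illustration, this is the shape of the multimap index_batch_crawl_iter expects: one crawled source LRU mapped to the list of target LRUs it links to. The LRU strings below are made up.

# Hypothetical input for index_batch_crawl_iter (illustrative LRUs only).
crawl_batch = {
    's:http|h:com|h:example|': [
        's:http|h:com|h:example|p:about|',
        's:http|h:org|h:other|',
    ],
    's:http|h:org|h:other|': [
        's:http|h:com|h:example|',
    ],
}

# Each yield is a chance to release control (e.g. inside a server loop):
#
#   for state in traph.index_batch_crawl_iter(crawl_batch):
#       ...
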
Example #3
    def get_webentity_pagelinks_iter(self, weid, prefixes, include_inbound=False, include_internal=True, include_outbound=False):
        '''
        Returns all or part of: pagelinks into the entity, internal pagelinks, and pagelinks out of the entity.
        By default, only internal pagelinks are returned.
        Note: the prefixes are supposed to match the webentity id. We do not check.
        '''

        # TODO: can be optimized by caching windups

        state = TraphIteratorState()
        pagelinks = []

        source_node = self.lru_trie.node()
        target_node = self.lru_trie.node()

        for prefix in prefixes:
            prefix = self.__encode(prefix)

            starting_node = self.lru_trie.lru_node(prefix)
            if not starting_node:
                raise TraphException('LRU %s not in the traph' % (prefix))

            for node, lru in self.lru_trie.webentity_dfs_iter(starting_node, prefix):

                if not node.is_page():
                    continue

                # Iterating over the page's outlinks
                if node.has_outlinks() and (include_outbound or include_internal):
                    links_block = node.outlinks()
                    for link_node in self.link_store.link_nodes_iter(links_block):

                        target_node.read(link_node.target())
                        target_lru = self.lru_trie.windup_lru(target_node.block)
                        target_webentity = self.lru_trie.windup_lru_for_webentity(target_node)

                        if (include_outbound and target_webentity != weid) or (include_internal and target_webentity == weid):
                            pagelinks.append([lru, target_lru, link_node.weight()])

                        if state.should_yield(5000):
                            yield state

                # Iterating over the page's inlinks
                if node.has_inlinks() and include_inbound:
                    links_block = node.inlinks()
                    for link_node in self.link_store.link_nodes_iter(links_block):

                        source_node.read(link_node.target())
                        source_lru = self.lru_trie.windup_lru(source_node.block)
                        source_webentity = self.lru_trie.windup_lru_for_webentity(source_node)

                        if source_webentity != weid:
                            pagelinks.append([source_lru, lru, link_node.weight()])

                        if state.should_yield(5000):
                            yield state

        yield state.finalize(pagelinks)
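
Each collected item is a [source_lru, target_lru, weight] triple, as built above. Below, a sketch of a call combining all three flags (commented out, since it assumes an existing Traph instance) and a hypothetical helper summing the weights.

# Hypothetical call requesting inbound, internal and outbound pagelinks:
#
#   for state in traph.get_webentity_pagelinks_iter(
#       weid,
#       prefixes,
#       include_inbound=True,
#       include_internal=True,
#       include_outbound=True
#   ):
#       ...

def total_link_weight(pagelinks):
    """Sum the weights of a list of [source_lru, target_lru, weight] triples."""
    return sum(weight for _, _, weight in pagelinks)
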
Example #4
    def get_webentities_links_iter(self, out=True, include_auto=False):
        '''
        This method should be faster than the slow version because it avoids
        unnecessary upward traversal.

        Note that it is also possible to solve the links right away and store
        them to solve their webentities later, but this is more costly in RAM.
        '''
        graph = defaultdict(Counter)
        page_to_webentity = dict()
        link_pointers = []
        state = TraphIteratorState()

        # Solving the page => webentity relation
        for node, source_webentity in self.lru_trie.dfs_with_webentity_iter():
            if not node.is_page():
                continue

            if not source_webentity:
                continue

            page_to_webentity[node.block] = source_webentity

            if node.has_links(out=out):
                link_pointers.append((source_webentity, node.links(out=out)))

            if state.should_yield():
                yield state

        # Computing the links
        for source_webentity, links_block in link_pointers:

            for link_node in self.link_store.link_nodes_iter(links_block):
                target_webentity = page_to_webentity.get(link_node.target())

                # The target page might not have a target webentity
                if not target_webentity:
                    continue

                if not include_auto and source_webentity == target_webentity:
                    continue

                # Adding to the graph
                graph[source_webentity][target_webentity] += link_node.weight()

                if state.should_yield(5000):
                    yield state

        yield state.finalize(graph)
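
The finalized graph is a mapping from a source webentity id to a Counter of target webentity ids. Here is a sketch of a hypothetical helper flattening it into (source, target, weight) triples, e.g. to hand to a graph library.

def to_edge_list(graph):
    """Flatten the weid => Counter mapping produced above into
    (source_weid, target_weid, weight) triples."""
    return [
        (source, target, weight)
        for source, counter in graph.items()
        for target, weight in counter.items()
    ]
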
Example #5
    def get_webentity_pages_iter(self, weid, prefixes):
        '''
        Note: the prefixes are supposed to match the webentity id. We do not check.
        '''
        state = TraphIteratorState()
        pages = []
        for node, lru in self.webentity_page_nodes_iter(weid, prefixes):
            pages.append({
                'lru': lru,
                'crawled': node.is_crawled()
            })

            if state.should_yield(2000):
                yield state

        yield state.finalize(pages)
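
Each finalized item is a {'lru': ..., 'crawled': ...} dict. A trivial sketch of a hypothetical helper keeping only the crawled pages:

def crawled_lrus(pages):
    """Keep the LRUs of pages flagged as crawled, from the
    {'lru': ..., 'crawled': ...} dicts produced above."""
    return [page['lru'] for page in pages if page['crawled']]
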
Example #6
    def get_webentity_most_linked_pages_iter(self, weid, prefixes, pages_count=10):
        '''
        Returns a list of objects {lru:, indegree:}, sorted by decreasing indegree.
        Note: the prefixes are supposed to match the webentity id. We do not check.
        '''
        state = TraphIteratorState()
        pages = []
        c = 0
        for prefix in prefixes:
            prefix = self.__encode(prefix)

            starting_node = self.lru_trie.lru_node(prefix)
            if not starting_node:
                raise TraphException('LRU %s not in the traph' % (prefix))

            for node, lru in self.lru_trie.webentity_dfs_iter(starting_node, prefix):
                if node.is_page():

                    # Iterate over link nodes
                    indegree = 0

                    for linknode in self.link_store.link_nodes_iter(node.inlinks()):
                        indegree += 1

                    c += 1
                    heapq.heappush(pages, (indegree, c, lru))

                    if len(pages) > pages_count:
                        heapq.heappop(pages)

                if state.should_yield(2000):
                    yield state

        sorted_pages = [None] * len(pages)
        i = len(pages) - 1

        while len(pages):
            page = heapq.heappop(pages)
            sorted_pages[i] = {'lru': page[2], 'indegree': page[0]}
            i -= 1

        yield state.finalize(sorted_pages)
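
The method keeps the pages_count most linked pages with a bounded min-heap: it pushes (indegree, c, lru) tuples (the counter c is only a tie-breaker) and pops whenever the heap exceeds pages_count, discarding the current minimum so that only the largest indegrees survive. A standalone sketch of the same pattern on plain integers, with made-up data:

import heapq

def top_k(values, k):
    """Keep the k largest values using a bounded min-heap, as above."""
    heap = []
    for i, value in enumerate(values):
        heapq.heappush(heap, (value, i))  # i only serves as a tie-breaker
        if len(heap) > k:
            heapq.heappop(heap)  # drop the current minimum
    return sorted((value for value, _ in heap), reverse=True)

# top_k([5, 1, 9, 3, 7], 3) == [9, 7, 5]
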
Example #7
    def get_webentity_child_webentities_iter(self, weid, prefixes):
        '''
        Note: the prefixes are supposed to match the webentity id. We do not check.
        '''
        state = TraphIteratorState()
        weids = set()
        for prefix in prefixes:
            prefix = self.__encode(prefix)

            starting_node = self.lru_trie.lru_node(prefix)
            if not starting_node:
                raise TraphException('LRU %s not in the traph' % (prefix))

            for node, _ in self.lru_trie.dfs_iter(starting_node, prefix):
                weid2 = node.webentity()
                if weid2 and weid2 > 0 and weid2 != weid:
                    weids.add(weid2)

                if state.should_yield(5000):
                    yield state

        yield state.finalize(list(weids))
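
Child web entities are found purely by prefix nesting: any webentity flag met while walking the trie below one of the given prefixes (other than weid itself) belongs to a child. A string-level sketch of that nesting relation, with made-up LRU prefixes; the real test is the trie traversal above.

def is_child_prefix(child_prefix, parent_prefix):
    """Illustrative string check: a child webentity prefix strictly
    extends one of the parent webentity's prefixes."""
    return child_prefix != parent_prefix and child_prefix.startswith(parent_prefix)

# is_child_prefix('s:http|h:com|h:example|p:blog|', 's:http|h:com|h:example|') == True
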
Example #8
    def get_webentity_inlinks_iter(self, weid, prefixes):
        '''
        Returns the set of citing webentities (their ids)
        Note: the prefixes are supposed to match the webentity id. We do not check.
        '''

        state = TraphIteratorState()
        done_blocks = set()
        weids = set()

        source_node = self.lru_trie.node()

        for prefix in prefixes:
            prefix = self.__encode(prefix)

            starting_node = self.lru_trie.lru_node(prefix)
            if not starting_node:
                raise TraphException('LRU %s not in the traph' % (prefix))

            for node, lru in self.lru_trie.webentity_dfs_iter(starting_node, prefix):

                if not node.is_page():
                    continue

                # Iterating over the page's inlinks
                if node.has_inlinks():
                    links_block = node.inlinks()
                    for link_node in self.link_store.link_nodes_iter(links_block):
                        source_node.read(link_node.target())
                        if source_node.block not in done_blocks:
                            source_webentity = self.lru_trie.windup_lru_for_webentity(source_node)
                            done_blocks.add(source_node.block)
                            weids.add(source_webentity)

                        if state.should_yield(5000):
                            yield state

        yield state.finalize(weids)
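
The finalized value is a set of webentity ids, deduplicated through done_blocks so that each distinct source page is wound up only once. Since no weid filter is applied, the set may include weid itself when pages of the entity link to one another; a hypothetical helper counting only external citers:

def count_external_citers(citing_weids, weid):
    """Count the distinct citing webentities from the set produced above,
    excluding a possible self-citation."""
    return len(citing_weids - {weid})
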