Example #1
    def get_candidates(*tables: Table):
        """
        Combine the results of already-computed tables.
        :param tables: a list of annotated tables (the i-th table is annotated by the i-th generator of the HybridGenerator)
        :return: a list of GeneratorResult
        """
        results = []
        missing_cells = tables[0].get_gt_cells()
        for table in tables:
            new_missing = []
            for cell in missing_cells:
                if cell in table.cell_annotations:
                    entities = table.cell_annotations[cell].entities
                    if entities:
                        results.append(
                            GeneratorResult(
                                search_key=table.get_search_key(cell),
                                candidates=[entity.uri
                                            for entity in entities]))
                    else:
                        new_missing.append(cell)
                else:
                    new_missing.append(cell)
            missing_cells = new_missing
            if not missing_cells:
                break

        return results
Example #2
    def _lookup_candidates(
            self, search_keys: List[SearchKey]) -> List[GeneratorResult]:
        """
        Get a set of candidates from the configured lookup services.
        :param search_keys: a list of SearchKeys
        :return: a list of GeneratorResult
        """
        lookup_results = {}
        for lookup_service in self._lookup_services:
            labels = [
                search_key.label for search_key in search_keys
                if search_key.label not in lookup_results
                or not lookup_results[search_key.label]
            ]
            if not labels:
                break

            if self._config.max_subseq_len and self._config.max_subseq_len > 0:
                lookup_results.update(
                    dict(
                        lookup_service.lookup_subsequences(
                            labels, self._config.max_subseq_len)))
            else:
                lookup_results.update(dict(lookup_service.lookup(labels)))

        return [
            GeneratorResult(search_key, lookup_results[search_key.label])
            for search_key in search_keys
        ]
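Note that `lookup_subsequences` is not shown in these examples; a plausible reading of `max_subseq_len` is that each label is first expanded into its contiguous token subsequences of at most that length before being sent to the lookup service. A minimal sketch of such an expansion, using the hypothetical helper name `token_subsequences` (not part of the original API):

from typing import List

def token_subsequences(label: str, max_len: int) -> List[str]:
    """Hypothetical helper: contiguous token subsequences of `label`,
    longest first, each at most `max_len` tokens long."""
    tokens = label.split()
    subseqs = []
    for length in range(min(max_len, len(tokens)), 0, -1):
        for start in range(len(tokens) - length + 1):
            subseqs.append(" ".join(tokens[start:start + length]))
    return subseqs

print(token_subsequences("new york city council", 2))
# ['new york', 'york city', 'city council', 'new', 'york', 'city', 'council']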
Example #3
    def get_candidates(self, table: Table) -> List[GeneratorResult]:
        """
        Combine the candidates returned by the chained generators: each generator is
        queried in turn, and its results are kept only for the search keys that still
        have no candidates.
        :param table: a Table object
        :return: a list of GeneratorResult
        """
        res_dict = dict()
        empty_candidates = []
        for generator in self._generators:
            res = generator.get_candidates(table)
            if not res_dict:
                res_dict = dict(res)
            else:
                res_dict.update({
                    search_key: candidates
                    for search_key, candidates in dict(res).items()
                    if search_key in empty_candidates
                })
            empty_candidates = [
                search_key for search_key, candidates in res_dict.items()
                if not candidates
            ]
            if not empty_candidates:  # no more cells to annotate
                break
        return [
            GeneratorResult(search_key, candidates)
            for search_key, candidates in res_dict.items()
        ]
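Examples 1-3 all rely on the same fallback-chain pattern: each source is asked only about the search keys that are still unresolved, and the loop stops as soon as nothing is missing. A self-contained sketch of that pattern, with plain dictionaries standing in for the generators (all names here are illustrative, not part of the original API):

from typing import Dict, List

def fallback_chain(generators: List[Dict[str, List[str]]],
                   keys: List[str]) -> Dict[str, List[str]]:
    """Query each generator only for the keys that are still unresolved."""
    resolved: Dict[str, List[str]] = {}
    missing = list(keys)
    for lookup in generators:
        still_missing = []
        for key in missing:
            candidates = lookup.get(key, [])
            if candidates:
                resolved[key] = candidates
            else:
                still_missing.append(key)
        missing = still_missing
        if not missing:
            break  # every key has at least one candidate
    # keys no generator could resolve end up with an empty list
    return {key: resolved.get(key, []) for key in keys}

fast = {"rome": ["dbr:Rome"], "paris": []}
slow = {"paris": ["dbr:Paris"], "nice": ["dbr:Nice"]}
print(fallback_chain([fast, slow], ["rome", "paris", "nice"]))
# {'rome': ['dbr:Rome'], 'paris': ['dbr:Paris'], 'nice': ['dbr:Nice']}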
Example #4
    def get_candidates(self, table: Table) -> List[GeneratorResult]:
        """
        Return a list of candidates, sorted by the cosine distance between the label/context embedding of each search key and the embedding of the candidate's abstract.
        :param table: a Table object
        :return: a list of GeneratorResult
        """
        search_keys = [
            table.get_search_key(cell_) for cell_ in table.get_gt_cells()
        ]
        lookup_results = dict(self._lookup_candidates(
            search_keys))  # collect lookup results from the superclass

        # create an embedding for each label and context pair
        cached_entries, to_compute = self._get_cached_entries(search_keys)
        new_results = self._embed_search_keys(to_compute)
        self._update_cache(new_results)  # write new entries to cache

        search_keys_embs = dict(cached_entries + new_results)

        # create embeddings for the candidates' abstracts
        candidates_list = functools.reduce(operator.iconcat,
                                           lookup_results.values(), [])
        if self._config.abstract == 'short':
            abstracts = self._abstract_helper.fetch_short_abstracts(
                candidates_list)
        else:
            abstracts = self._abstract_helper.fetch_long_abstracts(
                candidates_list)
        abstracts = {
            candidate: truncate_string(abstract,
                                       self._config.abstract_max_tokens)
            for candidate, abstract in abstracts.items()
        }

        cached_entries, to_compute = self._get_cached_entries(
            abstracts.values())
        new_results = self._embed_abstracts(to_compute)
        self._update_cache(new_results)
        abstracts_embeddings = dict(cached_entries + new_results)

        # do not zip! abstracts.values() might contain duplicates...
        abstracts_embs = {
            candidate: abstracts_embeddings[abstract]
            for candidate, abstract in abstracts.items()
        }

        results = []
        for search_key in search_keys:
            candidates_embeddings = []
            context_emb = np.nan
            if search_key.context and search_keys_embs[search_key].size:
                context_emb = search_keys_embs[search_key]
            for candidate in lookup_results[search_key]:
                abstract_emb = np.nan
                if candidate in abstracts and abstracts_embs[candidate].size:
                    abstract_emb = abstracts_embs[candidate]
                candidates_embeddings.append(
                    CandidateEmbeddings(candidate, context_emb, abstract_emb))

            results.append(
                GeneratorResult(search_key, [
                    c.candidate for c in weighting_by_ranking(
                        candidates_embeddings, self._config.alpha,
                        self._config.default_score)
                ]))

        return results
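`weighting_by_ranking` is defined elsewhere; going by the docstring, it orders each candidate set by the cosine distance between the context embedding and the abstract embedding, falling back to a default score whenever one of the two vectors is missing. A minimal sketch of such a scorer under that assumption (the function name is illustrative, and the role of `alpha` is simplified away):

import numpy as np
from typing import Any, List, NamedTuple

class CandidateEmb(NamedTuple):   # simplified stand-in for CandidateEmbeddings
    candidate: str
    context_emb: Any              # np.ndarray, or np.nan when missing
    abstract_emb: Any             # np.ndarray, or np.nan when missing

def rank_candidates(candidates: List[CandidateEmb], default_score: float) -> List[str]:
    """Sort candidates by context/abstract cosine similarity (best first);
    a candidate with a missing embedding falls back to `default_score`."""
    def score(c: CandidateEmb) -> float:
        if isinstance(c.context_emb, np.ndarray) and isinstance(c.abstract_emb, np.ndarray):
            return float(np.dot(c.context_emb, c.abstract_emb)
                         / (np.linalg.norm(c.context_emb) * np.linalg.norm(c.abstract_emb)))
        return default_score
    return [c.candidate for c in sorted(candidates, key=score, reverse=True)]

ctx = np.array([1.0, 0.0])
print(rank_candidates([CandidateEmb("dbr:A", ctx, np.array([1.0, 0.1])),
                       CandidateEmb("dbr:B", ctx, np.nan)], default_score=0.5))
# ['dbr:A', 'dbr:B']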
Example #5
    def _get_candidates_for_column(self, search_keys: List[SearchKey]) -> List[GeneratorResult]:
        lookup_results = dict(self._lookup_candidates(search_keys))

        # Create a complete directed k-partite disambiguation graph where k is the number of search keys.
        disambiguation_graph = nx.DiGraph()
        sk_nodes = {}
        personalization = {}  # prepare dict for pagerank with normalized priors
        embeddings = {}
        for search_key, candidates in lookup_results.items():
            degrees = self._dbp.get_degree_for_uris(candidates)
            embeddings.update(self._w2v.get_vectors(candidates))

            # Keep only the candidates that have an embedding in w2v.
            nodes = sorted([(candidate, {'weight': degrees[candidate]})
                            for candidate in candidates
                            if embeddings[candidate] is not None],
                           key=lambda x: x[1]['weight'], reverse=True)

            # Take only the max_candidates most relevant candidates (highest prior probability).
            nodes = nodes[:self._config.max_candidates]
            disambiguation_graph.add_nodes_from(nodes)
            sk_nodes[search_key] = [n[0] for n in nodes]

            # Store normalized priors
            weights_sum = sum([x[1]['weight'] for x in nodes])
            for node, props in nodes:
                if node not in personalization:
                    personalization[node] = []
                personalization[node].append(props['weight'] / weights_sum if weights_sum > 0 else 0)

        # Add weighted edges among the nodes in the disambiguation graph.
        # Avoid connecting nodes that belong to the same partition.
        # The weight of an edge is the cosine similarity between the embeddings of the two nodes it connects.
        # Only positive weights are considered.
        for search_key, nodes in sk_nodes.items():
            other_nodes = set(disambiguation_graph.nodes()) - set(nodes)
            for node, other_node in product(nodes, other_nodes):
                v1 = embeddings[node]
                v2 = embeddings[other_node]
                cos_sim = cosine_similarity(v1, v2)
                if cos_sim > 0:
                    disambiguation_graph.add_weighted_edges_from([(node, other_node, cos_sim)])

        # Thin out the fraction of edges with the lowest weights
        thin_out = int(self._config.thin_out_frac * len(disambiguation_graph.edges.data("weight")))
        disambiguation_graph.remove_edges_from(
            sorted(disambiguation_graph.edges.data("weight"), key=lambda tup: tup[2])[:thin_out])

        # PageRank computation - epsilon is doubled until convergence
        page_rank = None
        epsilon = 1e-6
        while page_rank is None:
            try:
                page_rank = nx.pagerank(disambiguation_graph,
                                        tol=epsilon, max_iter=50, alpha=0.9,
                                        personalization={node: np.mean(weights)
                                                         for node, weights in personalization.items()})
            except nx.PowerIterationFailedConvergence:
                epsilon *= 2  # a lower factor could be used too, since PageRank is extremely fast

        # Sort candidates -> the higher the score, the better the candidate (reverse=True)
        return [GeneratorResult(search_key,
                                [c.candidate for c in sorted([ScoredCandidate(candidate, page_rank[candidate])
                                                              for candidate in candidates],
                                                             reverse=True)])
                for search_key, candidates in sk_nodes.items()]
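The convergence loop around `nx.pagerank` is a small pattern worth isolating: it starts from a tight tolerance and doubles it after every `PowerIterationFailedConvergence`, so the call terminates with the tightest tolerance that converged within the iteration budget. A standalone sketch of the same retry on a toy graph (the function name is illustrative):

import networkx as nx

def pagerank_with_relaxing_tol(graph, personalization=None,
                               tol=1e-6, max_iter=50, alpha=0.9):
    """Retry PageRank, doubling the tolerance until power iteration converges."""
    while True:
        try:
            return nx.pagerank(graph, tol=tol, max_iter=max_iter, alpha=alpha,
                               personalization=personalization)
        except nx.PowerIterationFailedConvergence:
            tol *= 2  # relax the tolerance and try again

g = nx.DiGraph()
g.add_weighted_edges_from([("a", "b", 0.8), ("b", "c", 0.5), ("c", "a", 0.3)])
scores = pagerank_with_relaxing_tol(g)
print(max(scores, key=scores.get))  # node with the highest PageRank score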
Example #6
    def _get_candidates_for_column(self, search_keys: List[SearchKey]) -> List[GeneratorResult]:
        """
        Generate candidates for a set of search keys.
        The assumption is that all the search keys belong to the same column.
        :param search_keys: a list of search_keys
        :return: a list of GeneratorResult
        """
        lookup_results = dict(self._lookup_candidates(search_keys))
        generator_results = {}

        # Pre-fetch types and descriptions of the top candidate of each candidate set
        candidates_set = list({candidates[0] for candidates in lookup_results.values() if candidates})
        types = functools.reduce(operator.iconcat,
                                 self._dbp.get_direct_types_for_uris(candidates_set).values(),
                                 [])
        description_tokens = functools.reduce(operator.iconcat,
                                              self._get_descriptions_tokens(candidates_set).values(),
                                              [])
        facts = {}  # dict of possible facts in table (fact := <top_concept, ?p, support_col_value>)

        # First scan - raw results
        for search_key, candidates in lookup_results.items():
            if candidates:  # Handle cells with some candidates (higher confidence)
                if len(candidates) == 1:
                    generator_results[search_key] = GeneratorResult(search_key, candidates)
                    # Check for relationships if there is only one candidate (very high confidence)
                    for col_id, col_value in search_key.context:
                        if col_id not in facts:
                            facts[col_id] = []
                        facts[col_id].append((candidates[0], col_value))
                    self._stats.incr_exact()

        acceptable_types = get_most_frequent(types, n=5)
        acceptable_tokens = get_most_frequent(description_tokens)
        relations = {col_id: candidate_relations[0][0]
                     for col_id, candidate_relations in self._contains_facts(facts, min_occurrences=5).items()
                     if candidate_relations}

        # Second scan - refinement and loose searches
        for search_key, candidates in lookup_results.items():
            # Skip already annotated cells
            if search_key in generator_results:
                continue

            if candidates:
                # Pre-fetch types and descriptions of all the candidates of the cells not yet annotated
                types = self._dbp.get_direct_types_for_uris(candidates)
                description_tokens = self._get_descriptions_tokens(candidates)

                # Strict search: filter lists of candidates by removing entities that do not match types and tokens
                refined_candidates = self._search_strict(candidates,
                                                         acceptable_types,
                                                         types,
                                                         acceptable_tokens,
                                                         description_tokens)
                if refined_candidates:
                    generator_results[search_key] = GeneratorResult(search_key, refined_candidates)
                    self._stats.incr_strict()
                    continue

            # Loose search: increase recall by allowing a large edit-distance (Levenshtein) margin
            context_dict = dict(search_key.context)
            for col_id, relation in relations.items():
                refined_candidates = self._search_loose(search_key.label, relation, context_dict[col_id])
                if len(refined_candidates) > 0:
                    generator_results[search_key] = GeneratorResult(search_key, refined_candidates)
                    self._stats.incr_loose()
                    break

            # Both the strict and the loose search failed: no results
            if search_key not in generator_results:
                generator_results[search_key] = GeneratorResult(search_key, [])
                self._stats.incr_empty()

        return list(generator_results.values())
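`get_most_frequent` is an external helper; a plausible implementation simply keeps the n most common items of a list, e.g. via `collections.Counter`. A sketch under that assumption (the actual signature and return type may differ):

from collections import Counter
from typing import Iterable, Set

def get_most_frequent(items: Iterable[str], n: int = 1) -> Set[str]:
    """Hypothetical reading of the helper: the n most frequent items."""
    return {item for item, _ in Counter(items).most_common(n)}

print(get_most_frequent(["dbo:City", "dbo:City", "dbo:Place", "dbo:Agent"], n=2))
# {'dbo:City', 'dbo:Place'}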
Example #7
    def get_candidates(self, table: Table) -> List[GeneratorResult]:
        col_search_keys = {}
        row_search_keys = {}
        for cell in table.get_gt_cells():
            if cell.col_id not in col_search_keys:
                col_search_keys[cell.col_id] = []
            if cell.row_id not in row_search_keys:
                row_search_keys[cell.row_id] = []
            col_search_keys[cell.col_id].append(table.get_search_key(cell))
            row_search_keys[cell.row_id].append(table.get_search_key(cell))

        lookup_results_col = {}
        for col, search_key in col_search_keys.items():
            lookup_results_col[col] = dict(self._lookup_candidates(search_key))
        lookup_results_row = {}
        for row, search_key in row_search_keys.items():
            lookup_results_row[row] = dict(self._lookup_candidates(search_key))

        # Create a complete directed k-partite disambiguation graph for each column and each row,
        # where k is the number of search keys in that column or row.
        disambiguation_graph_col = []
        disambiguation_graph_row = []
        sk_nodes_col = []
        sk_nodes_row = []
        personalization = {}  # prepare dict for pagerank with normalized priors
        embeddings = {}

        for (col, lookup) in enumerate(lookup_results_col.values()):
            disambiguation_graph_col.append(nx.DiGraph())
            sk_nodes_col.append(dict())
            for search_key, candidates in lookup.items():
                degrees = self._dbp.get_degree_for_uris(candidates)
                embeddings.update(self._w2v.get_vectors(candidates))

                # Keep only the candidates that have an embedding in w2v.
                nodes = sorted([(candidate, {'weight': degrees[candidate]})
                                for candidate in candidates
                                if embeddings[candidate] is not None],
                               key=lambda x: x[1]['weight'], reverse=True)

                # Take only the max_candidates most relevant candidates (highest prior probability).
                nodes = nodes[:self._config.max_candidates]
                disambiguation_graph_col[col].add_nodes_from(nodes)
                sk_nodes_col[col][search_key] = [n[0] for n in nodes]

                # Store normalized priors
                weights_sum = sum([x[1]['weight'] for x in nodes])
                for node, props in nodes:
                    if node not in personalization:
                        personalization[node] = []
                    personalization[node].append(props['weight'] / weights_sum if weights_sum > 0 else 0)

        for (row, lookup) in enumerate(lookup_results_row.values()):
            disambiguation_graph_row.append(nx.DiGraph())
            sk_nodes_row.append(dict())
            for search_key, candidates in lookup.items():
                degrees = self._dbp.get_degree_for_uris(candidates)
                embeddings.update(self._w2v.get_vectors(candidates))

                # Keep only the candidates that have an embedding in w2v.
                nodes = sorted([(candidate, {'weight': degrees[candidate]})
                                for candidate in candidates
                                if embeddings[candidate] is not None],
                               key=lambda x: x[1]['weight'], reverse=True)

                # Take only the max_candidates most relevant candidates (highest prior probability).
                nodes = nodes[:self._config.max_candidates]
                disambiguation_graph_row[row].add_nodes_from(nodes)
                sk_nodes_row[row][search_key] = [n[0] for n in nodes]

                # Store normalized priors
                weights_sum = sum([x[1]['weight'] for x in nodes])
                for node, props in nodes:
                    if node not in personalization:
                        personalization[node] = []
                    personalization[node].append(props['weight'] / weights_sum if weights_sum > 0 else 0)

        # Predict types using the classifier
        node_types = self._type_predictor.predict_types([node for sk_nodes in sk_nodes_col
                                                         for nodes in list(sk_nodes.values())
                                                         for node in nodes])
        # Predict types using the classifier
        node_types.update(self._type_predictor.predict_types([node for sk_nodes in sk_nodes_row
                                                              for nodes in list(sk_nodes.values())
                                                              for node in nodes]))
        # Get type embeddings. A set is used to remove duplicates.
        type_embeddings = self._tee.get_vectors(list({t for types in list(node_types.values()) for t in types}))

        # Add weighted edges among the nodes in the disambiguation graphs.
        # Avoid connecting nodes that belong to the same partition.
        # The weight of an edge is the cosine similarity between the embeddings of the two nodes it connects
        # (type embeddings for column graphs, entity embeddings for row graphs).
        # Only positive weights are considered.
        for (col, k) in enumerate(sk_nodes_col):
            for search_key, nodes in k.items():
                other_nodes = set(disambiguation_graph_col[col].nodes()) - set(nodes)
                for node, other_node in product(nodes, other_nodes):
                    if type_embeddings[node_types[node][0]] is not None \
                            and type_embeddings[node_types[other_node][0]] is not None:
                        v1 = type_embeddings[node_types[node][0]]
                        v2 = type_embeddings[node_types[other_node][0]]
                        cos_sim = cosine_similarity(v1, v2)
                        if cos_sim > 0:
                            disambiguation_graph_col[col].add_weighted_edges_from([(node, other_node, cos_sim)])

        for (row, k) in enumerate(sk_nodes_row):
            for search_key, nodes in k.items():
                other_nodes = set(disambiguation_graph_row[row].nodes()) - set(nodes)
                for node, other_node in product(nodes, other_nodes):
                    v1 = embeddings[node]
                    v2 = embeddings[other_node]
                    cos_sim = cosine_similarity(v1, v2)
                    if cos_sim > 0:
                        disambiguation_graph_row[row].add_weighted_edges_from([(node, other_node, cos_sim)])

        disambiguation_graph = nx.DiGraph()
        for col in disambiguation_graph_col:
            disambiguation_graph = nx.compose(disambiguation_graph, col)
        for row in disambiguation_graph_row:
            disambiguation_graph = nx.compose(disambiguation_graph, row)

        # Thin out the fraction of edges with the lowest weights
        thin_out = int(self._config.thin_out_frac * len(disambiguation_graph.edges.data("weight")))
        disambiguation_graph.remove_edges_from(
            sorted(disambiguation_graph.edges.data("weight"), key=lambda tup: tup[2])[:thin_out])

        # PageRank computation - epsilon is doubled until convergence
        page_rank = None
        epsilon = 1e-6
        while page_rank is None:
            try:
                page_rank = nx.pagerank(disambiguation_graph,
                                        tol=epsilon, max_iter=50, alpha=0.9,
                                        personalization={node: np.mean(weights)
                                                         for node, weights in personalization.items()})

            except nx.PowerIterationFailedConvergence:
                epsilon *= 2  # a lower factor could be used too, since PageRank is extremely fast

        # Sort candidates -> the higher the score, the better the candidate (reverse=True)
        return [GeneratorResult(search_key,
                                [c.candidate for c in sorted([ScoredCandidate(candidate, page_rank[candidate])
                                                              for candidate in candidates],
                                                             reverse=True)])
                for (x, sk_nodes) in enumerate(sk_nodes_col)
                for search_key, candidates in sk_nodes.items()]
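Unlike Example 5, this variant builds one disambiguation graph per column and one per row, then merges them with `nx.compose` before running PageRank, so a candidate that appears in both a row graph and a column graph keeps the union of its edges. A tiny illustration of that merge (the node names are made up):

import networkx as nx

col_graph = nx.DiGraph()
col_graph.add_weighted_edges_from([("dbr:Rome", "dbr:Milan", 0.7)])

row_graph = nx.DiGraph()
row_graph.add_weighted_edges_from([("dbr:Rome", "dbr:Tiber", 0.4)])

# nx.compose keeps the union of the nodes and edges of both graphs;
# attributes from the second graph win in case of conflicts.
merged = nx.compose(col_graph, row_graph)
print(sorted(merged.edges(data="weight")))
# [('dbr:Rome', 'dbr:Milan', 0.7), ('dbr:Rome', 'dbr:Tiber', 0.4)]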