예제 #1
0
    def get_candidates(self, table: Table) -> List[GeneratorResult]:
        """
        This method annotates each table column separately, by finding which are the column types and
        the relationships between the current column and the other.
        :param table: a list of search_keys, which must belong to the same table column
        :return: a list of GeneratorResult
        """
        col_search_keys = {}
        for cell in table.get_gt_cells():
            if cell.col_id not in col_search_keys:
                col_search_keys[cell.col_id] = []
            col_search_keys[cell.col_id].append(table.get_search_key(cell))

        col_search_keys = {col: chunk_list(search_keys, 500) for col, search_keys in col_search_keys.items()}

        if self._config.max_workers == 1:
            results = [self._get_candidates_for_column(search_keys)
                       for search_keys_list in col_search_keys.values()
                       for search_keys in search_keys_list]
        else:
            with ProcessPoolExecutor(self._config.max_workers) as pool:
                results = pool.map(self._get_candidates_for_column,
                                   [search_keys for search_keys_list in col_search_keys.values()
                                    for search_keys in search_keys_list])
        return functools.reduce(operator.iconcat, results, [])
예제 #2
0
    def get_candidates(self, table: Table) -> List[GeneratorResult]:
        """
        Candidate selection method. This implementation just forwards the LookupService results.
        :param table: a Table object
        :return: a list of GeneratorResult
        """
        search_keys = [table.get_search_key(cell_) for cell_ in table.get_gt_cells()]
        if self._config.max_workers == 1:
            results = self._lookup_candidates(search_keys)
        else:  # Parallelize at cell level (no dependencies between cells in the same col/row)
            with ProcessPoolExecutor(self._config.max_workers) as pool:
                results = pool.map(self._lookup_candidates, chunk_list(search_keys, self._config.chunk_size))

        return functools.reduce(operator.iconcat, results, [])
예제 #3
0
    def get_candidates(self, table: Table) -> List[GeneratorResult]:
        """
        Return a list of candidates, sorted by the cosine distance between their label and context embeddings.
        :param table: a Table object
        :return: a list of GeneratorResult
        """
        search_keys = [
            table.get_search_key(cell_) for cell_ in table.get_gt_cells()
        ]
        lookup_results = dict(self._lookup_candidates(
            search_keys))  # collect lookup result from the super class

        # create embed for each label and context pair
        cached_entries, to_compute = self._get_cached_entries(search_keys)
        new_results = self._embed_search_keys(to_compute)
        self._update_cache(new_results)  # write new entries to cache

        search_keys_embs = dict(cached_entries + new_results)

        # create embed for the candidates' abstracts
        candidates_list = functools.reduce(operator.iconcat,
                                           lookup_results.values(), [])
        if self._config.abstract == 'short':
            abstracts = self._abstract_helper.fetch_short_abstracts(
                candidates_list)
        else:
            abstracts = self._abstract_helper.fetch_long_abstracts(
                candidates_list)
        abstracts = {
            candidate: truncate_string(abstract,
                                       self._config.abstract_max_tokens)
            for candidate, abstract in abstracts.items()
        }

        cached_entries, to_compute = self._get_cached_entries(
            abstracts.values())
        new_results = self._embed_abstracts(to_compute)
        self._update_cache(new_results)
        abstracts_embeddings = dict(cached_entries + new_results)

        # do not zip! abstracts.values() might contain duplicates...
        abstracts_embs = {
            candidate: abstracts_embeddings[abstract]
            for candidate, abstract in abstracts.items()
        }

        results = []
        for search_key in search_keys:
            candidates_embeddings = []
            context_emb = np.nan
            if search_key.context and search_keys_embs[search_key].size:
                context_emb = search_keys_embs[search_key]
            for candidate in lookup_results[search_key]:
                abstract_emb = np.nan
                if candidate in abstracts and abstracts_embs[candidate].size:
                    abstract_emb = abstracts_embs[candidate]
                candidates_embeddings.append(
                    CandidateEmbeddings(candidate, context_emb, abstract_emb))

            results.append(
                GeneratorResult(search_key, [
                    c.candidate for c in weighting_by_ranking(
                        candidates_embeddings, self._config.alpha,
                        self._config.default_score)
                ]))

        return results
예제 #4
0
파일: ours.py 프로젝트: gianlucapuleri/nest
    def get_candidates(self, table: Table) -> List[GeneratorResult]:
        col_search_keys = {}
        row_search_keys = {}
        for cell in table.get_gt_cells():
            if cell.col_id not in col_search_keys:
                col_search_keys[cell.col_id] = []
            if cell.row_id not in row_search_keys:
                row_search_keys[cell.row_id] = []
            col_search_keys[cell.col_id].append(table.get_search_key(cell))
            row_search_keys[cell.row_id].append(table.get_search_key(cell))

        lookup_results_col = {}
        for col, search_key in col_search_keys.items():
            lookup_results_col[col] = dict(self._lookup_candidates(search_key))
        lookup_results_row = {}
        for row, search_key in row_search_keys.items():
            lookup_results_row[row] = dict(self._lookup_candidates(search_key))

        # Create a complete directed k-partite disambiguation graph where k is the number of search keys.
        disambiguation_graph_col = []
        disambiguation_graph_row = []
        sk_nodes_col = []
        sk_nodes_row = []
        personalization = {}  # prepare dict for pagerank with normalized priors
        embeddings = {}

        for (col, lookup) in enumerate(lookup_results_col.values()):
            disambiguation_graph_col.append(nx.DiGraph())
            sk_nodes_col.append(dict())
            for search_key, candidates in lookup.items():
                degrees = self._dbp.get_degree_for_uris(candidates)
                embeddings.update(self._w2v.get_vectors(candidates))

                # Filter candidates that have an embedding in w2v.
                nodes = sorted([(candidate, {'weight': degrees[candidate]})
                                for candidate in candidates
                                if embeddings[candidate] is not None],
                               key=lambda x: x[1]['weight'], reverse=True)

                # Take only the max_candidates most relevant (highest priors probability) candidates.
                nodes = nodes[:self._config.max_candidates]
                disambiguation_graph_col[col].add_nodes_from(nodes)
                sk_nodes_col[col][search_key] = [n[0] for n in nodes]

                # Store normalized priors
                weights_sum = sum([x[1]['weight'] for x in nodes])
                for node, props in nodes:
                    if node not in personalization:
                        personalization[node] = []
                    personalization[node].append(props['weight'] / weights_sum if weights_sum > 0 else 0)

        for (row, lookup) in enumerate(lookup_results_row.values()):
            disambiguation_graph_row.append(nx.DiGraph())
            sk_nodes_row.append(dict())
            for search_key, candidates in lookup.items():
                degrees = self._dbp.get_degree_for_uris(candidates)
                embeddings.update(self._w2v.get_vectors(candidates))

                # Filter candidates that have an embedding in w2v.
                nodes = sorted([(candidate, {'weight': degrees[candidate]})
                                for candidate in candidates
                                if embeddings[candidate] is not None],
                               key=lambda x: x[1]['weight'], reverse=True)

                # Take only the max_candidates most relevant (highest priors probability) candidates.
                nodes = nodes[:self._config.max_candidates]
                disambiguation_graph_row[row].add_nodes_from(nodes)
                sk_nodes_row[row][search_key] = [n[0] for n in nodes]

                # Store normalized priors
                weights_sum = sum([x[1]['weight'] for x in nodes])
                for node, props in nodes:
                    if node not in personalization:
                        personalization[node] = []
                    personalization[node].append(props['weight'] / weights_sum if weights_sum > 0 else 0)

        # Predict types using the classifier
        node_types = self._type_predictor.predict_types([node for sk_nodes in sk_nodes_col
                                                         for nodes in list(sk_nodes.values())
                                                         for node in nodes])
        # Predict types using the classifier
        node_types.update(self._type_predictor.predict_types([node for sk_nodes in sk_nodes_row
                                                              for nodes in list(sk_nodes.values())
                                                              for node in nodes]))
        # Get type embeddings. Set is used to remove duplicate
        type_embeddings = self._tee.get_vectors(list({t for types in list(node_types.values()) for t in types}))

        # Add weighted edges among the nodes in the disambiguation graph.
        # Avoid to connect nodes in the same partition.
        # Weights of edges are the cosine similarity between the nodes which the edge is connected to.
        # Only positive weights are considered.
        for (col, k) in enumerate(sk_nodes_col):
            for search_key, nodes in k.items():
                other_nodes = set(disambiguation_graph_col[col].nodes()) - set(nodes)
                for node, other_node in product(nodes, other_nodes):
                    if type_embeddings[node_types[node][0]] is not None \
                            and type_embeddings[node_types[other_node][0]] is not None:
                        v1 = type_embeddings[node_types[node][0]]
                        v2 = type_embeddings[node_types[other_node][0]]
                        cos_sim = cosine_similarity(v1, v2)
                        if cos_sim > 0:
                            disambiguation_graph_col[col].add_weighted_edges_from([(node, other_node, cos_sim)])

        for (row, k) in enumerate(sk_nodes_row):
            for search_key, nodes in k.items():
                other_nodes = set(disambiguation_graph_row[row].nodes()) - set(nodes)
                for node, other_node in product(nodes, other_nodes):
                    v1 = embeddings[node]
                    v2 = embeddings[other_node]
                    cos_sim = cosine_similarity(v1, v2)
                    if cos_sim > 0:
                        disambiguation_graph_row[row].add_weighted_edges_from([(node, other_node, cos_sim)])

        disambiguation_graph = nx.DiGraph()
        for col in disambiguation_graph_col:
            disambiguation_graph = nx.compose(disambiguation_graph, col)
        for row in disambiguation_graph_row:
            disambiguation_graph = nx.compose(disambiguation_graph, row)

        # Thin out a fraction of edges which weights are the lowest
        thin_out = int(self._config.thin_out_frac * len(disambiguation_graph.edges.data("weight")))
        disambiguation_graph.remove_edges_from(
            sorted(disambiguation_graph.edges.data("weight"), key=lambda tup: tup[2])[:thin_out])

        # Page rank computaton - epsilon is increased by a factor 2 until convergence
        page_rank = None
        epsilon = 1e-6
        while page_rank is None:
            try:
                page_rank = nx.pagerank(disambiguation_graph,
                                        tol=epsilon, max_iter=50, alpha=0.9,
                                        personalization={node: np.mean(weights)
                                                         for node, weights in personalization.items()})

            except nx.PowerIterationFailedConvergence:
                epsilon *= 2  # lower factor can be used too since pagerank is extremely fast

        # Sort candidates -> the higher the score, the better the candidate (reverse=True)
        return [GeneratorResult(search_key,
                                [c.candidate for c in sorted([ScoredCandidate(candidate, page_rank[candidate])
                                                              for candidate in candidates],
                                                             reverse=True)])
                for (x, sk_nodes) in enumerate(sk_nodes_col)
                for search_key, candidates in sk_nodes.items()]