Example no. 1
    def __init__(self, **kwargs):
        """
        Initialize qnodes_dict, the input required by the original TF-IDF computation.
        It is a dict with
            key: Q node id
            value: list of edges in the format "property#node2"
        :param kwargs:
        """
        self.input_df = pd.read_csv(kwargs['input_file'], dtype=object)
        self.output_col_name = kwargs["output_column_name"]
        self.similarity_column = kwargs["similarity_column"]
        if self.similarity_column not in self.input_df.columns:
            raise RequiredColumnMissingException(
                "Similarity column {} does not exist in input.".format(
                    self.similarity_column))

        self.es = Search(kwargs["url"],
                         kwargs["index"],
                         es_user=kwargs.get("user"),
                         es_pass=kwargs.get("password"))
        self.qnodes_dict = {}
        nodes_candidates = self.input_df["kg_id"].dropna().unique().tolist()
        for each in self.es.get_node_info(nodes_candidates):
            node_id = each["_source"]["id"]
            node_edges_info = each["_source"]["edges"]
            self.qnodes_dict[node_id] = node_edges_info
        # properties_classes_map is a dict mapping each P node or Q node to a unique integer id (starting from 0)
        self.properties_classes_map = self.create_all_properties_classes_map()
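For illustration, a minimal sketch of the data shapes this constructor builds, using hypothetical Q nodes and edges rather than values from a real index:

qnodes_dict = {
    "Q42": ["P31#Q5", "P106#Q36180"],   # hypothetical edges in "property#node2" form
    "Q1":  ["P31#Q1454986"],
}
# properties_classes_map assigns each property (and each P31 value) a unique integer id,
# e.g. {"P31": 0, "Q5": 1, "P106": 2, "Q1454986": 3} (the order is arbitrary).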
Example no. 2
    def __init__(self, parameters):
        self.vectors_map = {}
        self.sentence_map = {}
        self.kwargs = parameters
        self.loaded_file = None
        self.kgtk_format_input = None
        self.centroid = {}
        self.groups = defaultdict(set)
        self.es = Search(self.kwargs["url"],
                         self.kwargs["index"],
                         es_user=self.kwargs.get("user"),
                         es_pass=self.kwargs.get("password"))
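A minimal sketch of the `parameters` dict this constructor reads. The keys mirror the lookups in the code; the values are hypothetical, and this `__init__` matches the EmbeddingVector class shown in full in Example no. 7, so that name is used below. Later methods read further keys such as `column_vector_strategy` and `distance_function`:

es_parameters = {
    "url": "http://localhost:9200",   # hypothetical Elasticsearch URL
    "index": "wikidata_index",        # hypothetical index name
    "user": None,                     # optional credentials
    "password": None,
}
vector_ranker = EmbeddingVector(es_parameters)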
Example no. 3
    def __init__(self,
                 es_url,
                 es_index,
                 es_user=None,
                 es_pass=None,
                 output_column_name: str = "retrieval_score"):
        self.es = Search(es_url, es_index, es_user=es_user, es_pass=es_pass)
        self.utility = Utility(self.es, output_column_name)
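A usage sketch for this constructor style; the snippet does not show the class name, so `SomeCommand` below is a hypothetical stand-in, and the connection values are placeholders:

cmd = SomeCommand(
    es_url="http://localhost:9200",    # hypothetical
    es_index="wikidata_index",         # hypothetical
    es_user=None,
    es_pass=None,
    output_column_name="retrieval_score",
)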
Example no. 4
    def __init__(self, es_url, es_index, es_user=None, es_pass=None, score_column_name: str = "retrieval_score",
                 previous_match_column_name: str = "retrieval_score"):
        self.es = Search(es_url, es_index, es_user=es_user, es_pass=es_pass)
        self.utility = Utility(self.es, score_column_name, previous_match_column_name)
Example no. 5
class TFIDF(object):
    def __init__(self, **kwargs):
        """
        Initialize qnodes_dict, the input required by the original TF-IDF computation.
        It is a dict with
            key: Q node id
            value: list of edges in the format "property#node2"
        :param kwargs:
        """
        self.input_df = pd.read_csv(kwargs['input_file'], dtype=object)
        self.output_col_name = kwargs["output_column_name"]
        self.similarity_column = kwargs["similarity_column"]
        if self.similarity_column not in self.input_df.columns:
            raise RequiredColumnMissingException(
                "Similarity column {} does not exist in input.".format(
                    self.similarity_column))

        self.es = Search(kwargs["url"],
                         kwargs["index"],
                         es_user=kwargs.get("user"),
                         es_pass=kwargs.get("password"))
        self.qnodes_dict = {}
        nodes_candidates = self.input_df["kg_id"].dropna().unique().tolist()
        for each in self.es.get_node_info(nodes_candidates):
            node_id = each["_source"]["id"]
            node_edges_info = each["_source"]["edges"]
            self.qnodes_dict[node_id] = node_edges_info
        # properties_classes_map is a dict mapping each P node or Q node to a unique integer id (starting from 0)
        self.properties_classes_map = self.create_all_properties_classes_map()

    @staticmethod
    def get_properties_classes_for_qnode(edges):
        properties_classes_set = set()
        for wd_prop_val in edges:
            edge, value = wd_prop_val.split('#', 1)
            if len(value) > 6 and value[:3] == '"""' and value[-3:] == '"""':
                value = value[3:-3]
            elif len(value) > 2:
                if value[0] == "'" and value[-1] == "'":
                    value = value[1:-1]
                elif value[0] == '"' and value[-1] == '"':
                    value = value[1:-1]

            # add edges
            properties_classes_set.add(edge)
            # if the edge is "instance of" (P31), also keep its value
            if edge == 'P31':
                properties_classes_set.add(value)
        return properties_classes_set
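    # Illustration (hypothetical edges, not from a real node):
    #   get_properties_classes_for_qnode(["P31#Q5", "P106#Q36180", "P1477#'Douglas Adams'"])
    #   -> {"P31", "Q5", "P106", "P1477"}
    # Only P31 ("instance of") keeps its node2 value; every other edge contributes its property only.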

    def create_all_properties_classes_map(self):
        # map each property to a corresponding unique integer id
        properties_classes_set = set()
        for qnode in self.qnodes_dict:
            v = self.qnodes_dict[qnode]
            properties_classes_set.update(
                self.get_properties_classes_for_qnode(v))
        return {p: idx for idx, p in enumerate(properties_classes_set)}

    def create_feature_vector_dict(self, label_candidates_dict):
        # creates input for tfidf computation
        feature_vector_dict = {}
        _p_c_len = len(self.properties_classes_map)

        for label, candidates in label_candidates_dict.items():
            feature_vector_dict[label] = {}
            for candidate in candidates:
                feature_vector = [0] * _p_c_len
                if candidate in self.qnodes_dict:
                    prop_class_list = self.get_properties_classes_for_qnode(
                        self.qnodes_dict[candidate])
                    for _p_c in prop_class_list:
                        if _p_c in self.properties_classes_map:
                            feature_vector[
                                self.properties_classes_map[_p_c]] = 1
                feature_vector_dict[label][candidate] = feature_vector
        return feature_vector_dict
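    # Illustration of the structure returned above (hypothetical labels and candidates):
    #   {"douglas adams": {"Q42": [1, 0, 1, 0], "Q123": [0, 0, 0, 0]}}
    # Each candidate gets a binary vector over properties_classes_map; candidates that
    # are missing from qnodes_dict keep an all-zero vector.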

    def compute_tfidf(self):
        """
        Compute TF/IDF for all candidates.

        The following structures are built internally from `self.input_df`:
            candidates:
                ```
                {
                    e1: {
                        q1: [f1, f2, f3],
                        q2: [f1, f2, f3]
                    },
                    'e2': ...
                }
                ```
                `[f1, f2, f3]` is a feature vector. All vectors have the same length.
            feature_count: length of the feature vectors.
            high_precision_candidates: `{e1: {q1}, e2: {q2}}`.
                If empty for an entity, all of its candidate qnodes count toward tf.

        Returns:
            A copy of the input DataFrame with the TF/IDF score of each candidate
            stored in the column named by `self.output_col_name`.
        """

        label_candidates_dict = defaultdict(list)
        high_precision_candidates = defaultdict(set)

        for _, each in self.input_df.iterrows():
            if isinstance(each["kg_id"], str) and each["kg_id"] != "":
                label_candidates_dict[each["label"]].append(each["kg_id"])
                if each["method"] == "exact-match":
                    high_precision_candidates[each["label"]].add(each["kg_id"])

        candidates = self.create_feature_vector_dict(label_candidates_dict)
        feature_count = len(self.properties_classes_map)
        tfidf_values = [{
            'tf': 0,
            'df': 0,
            'idf': 0
        } for _ in range(feature_count)]
        corpus_num = sum(len(qs) for _, qs in candidates.items())

        # get normalized similarity score
        similarity_score_col = self.input_df[self.similarity_column].astype(
            float)
        max_score = max(similarity_score_col)
        min_score = min(similarity_score_col)
        temp = self.input_df.copy()
        if max_score != 1.0 or min_score < 0:
            score_range = max_score - min_score
            temp[
                "||similarity_score_col_normalized||"] = similarity_score_col.apply(
                    lambda x: (x - min_score) / score_range)
        else:
            temp["||similarity_score_col_normalized||"] = similarity_score_col

        similarity_score_dict = {}
        for _, each_row in temp.iterrows():
            similarity_score_dict[(
                each_row["label"], each_row["kg_id"]
            )] = each_row["||similarity_score_col_normalized||"]

        # compute tf
        for f_idx in range(feature_count):
            for e in candidates:
                for q, v in candidates[e].items():
                    if high_precision_candidates.get(
                            e) and q in high_precision_candidates[e]:
                        if v[f_idx] == 1:
                            tfidf_values[f_idx]['tf'] += 1
                    else:
                        tfidf_values[f_idx]['tf'] += 1

        # compute df
        for f_idx in range(feature_count):
            for e in candidates:
                for q, v in candidates[e].items():
                    if v[f_idx] == 1:
                        tfidf_values[f_idx]['df'] += 1

        # compute idf
        for f_idx in range(len(tfidf_values)):
            if tfidf_values[f_idx]['df'] == 0:
                tfidf_values[f_idx]['idf'] = 0
            else:
                tfidf_values[f_idx]['idf'] = math.log(
                    float(corpus_num) / tfidf_values[f_idx]['df'], 10)

        # compute final score
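        # For each candidate q of entity e, the score computed below is
        #   score(q) = sum over features f of tf[f] * idf[f] * v_q[f] * sim(e, q)
        # where sim(e, q) is the normalized similarity score (treated as 1 when missing).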
        ret = {}
        for e in candidates:
            for q, v in candidates[e].items():
                ret[q] = 0
                for f_idx in range(feature_count):
                    ret[q] += tfidf_values[f_idx]['tf'] * tfidf_values[f_idx]['idf'] * v[f_idx] \
                              * similarity_score_dict.get((e, q), 1)

        output_df = self.input_df.copy()
        output_df[self.output_col_name] = output_df['kg_id'].map(ret)
        return output_df
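A minimal usage sketch for the TFIDF class above, with hypothetical file names and connection settings (the pandas, Search, and RequiredColumnMissingException imports come from the surrounding package and are not shown here):

tfidf_ranker = TFIDF(
    input_file="candidates.csv",          # hypothetical candidate file with label/kg_id/method columns
    output_column_name="tfidf_score",
    similarity_column="retrieval_score",  # must exist in the input file
    url="http://localhost:9200",          # hypothetical Elasticsearch URL
    index="wikidata_index",               # hypothetical index name
    user=None,
    password=None,
)
scored_df = tfidf_ranker.compute_tfidf()
scored_df.to_csv("candidates_scored.csv", index=False)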
Example no. 6
    def __init__(self, es_url, es_index, es_user, es_pass, properties,
                 output_column_name):
        self.properties = properties
        self.es = Search(es_url, es_index, es_user=es_user, es_pass=es_pass)
        self.utility = Utility(self.es, output_column_name)
Example no. 7
class EmbeddingVector:
    """
        A class supporting embedding-vector ranking operations.
    """
    def __init__(self, parameters):
        self.vectors_map = {}
        self.sentence_map = {}
        self.kwargs = parameters
        self.loaded_file = None
        self.kgtk_format_input = None
        self.centroid = {}
        self.groups = defaultdict(set)
        self.es = Search(self.kwargs["url"],
                         self.kwargs["index"],
                         es_user=self.kwargs.get("user"),
                         es_pass=self.kwargs.get("password"))

    def load_input_file(self, input_file):
        """
            read the input file
        """
        self.loaded_file = pd.read_csv(input_file, dtype=object)
        self._to_kgtk_test_format()

    def _to_kgtk_test_format(self):
        """
        Wrap the input file into KGTK-format input.
        :return:
        """
        # remove rows whose evaluation label equals 0 (which means there is no ground truth)
        self.groups = defaultdict(set)
        if "evaluation_label" in self.loaded_file.columns:
            self.loaded_file = self.loaded_file[
                self.loaded_file['evaluation_label'] != '0']
        all_info = {}
        count = 0
        correspond_key = {
            "label_clean": "label",
            "kg_id": "candidates",
            "GT_kg_id": "kg_id"
        }
        for i, each_part in self.loaded_file.groupby(["column", "row"]):
            info = {}
            for each_choice in correspond_key.keys():
                if each_choice in each_part.columns:
                    temp = list(set(each_part[each_choice].unique()))
                    temp_filtered = []
                    for each in temp:
                        if each != "" and not isinstance(each, float):
                            temp_filtered.append(each)
                    info[correspond_key[each_choice]] = temp_filtered
                else:
                    info[correspond_key[each_choice]] = []

            if len(info['kg_id']) > 1 or len(info['label']) > 1:
                Utility.eprint(
                    "WARNING: pair {} has multiple ground truths?".format(i))
            self.groups[i[0]].update(info["candidates"])
            self.groups[i[0]].update(info["kg_id"])
            info["label"] = info["label"][0]
            if len(info["kg_id"]) > 0:
                info["kg_id"] = info["kg_id"][0]
            else:
                info["kg_id"] = " "
            info["candidates"] = "|".join(info["candidates"])

            all_info[count] = info
            count += 1

        self.kgtk_format_input = pd.DataFrame.from_dict(all_info,
                                                        orient='index')
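    # Illustration of one kgtk_format_input row built above (hypothetical values):
    #   label:       "douglas adams"
    #   candidates:  "Q42|Q123"   (pipe-joined candidate kg ids)
    #   kg_id:       "Q42"        (the ground truth, or " " when absent)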

    def process_vectors(self):
        """
        apply the chosen vector strategy to process the calculated vectors
        :return:
        """
        vector_strategy = self.kwargs.get("column_vector_strategy",
                                          "exact-matches")
        if vector_strategy == "page-rank":
            self._calculate_page_rank()
        elif vector_strategy == "page-rank-precomputed":
            self._get_precomputed_page_rank()
        else:
            self._get_centroid(vector_strategy)

    def _generate_graph(self):
        """
        Build the per-column similarity graphs used for the PageRank calculation.
        :return:
        """
        Utility.eprint("start calculating page rank, it may take some time.")
        import networkx as nx
        # calculate the transition probabilities, grouped by column
        col_memo = {}
        nodes_memo = {}
        graph_memo = {}
        similarity_memo = {}
        for col_number, each_part in self.loaded_file.groupby(["column"]):
            # first calculate and memoize all pairwise similarities
            all_nodes = set(each_part['kg_id']) - {"", np.nan}
            all_nodes_list = list(all_nodes)
            for i, each_node in enumerate(all_nodes):
                col_memo[each_node] = col_number
            for i in range(len(all_nodes_list)):
                for j in range(i + 1, len(all_nodes_list)):
                    similarity = self.compute_distance(
                        self.vectors_map[all_nodes_list[i]],
                        self.vectors_map[all_nodes_list[j]])
                    similarity_memo[(all_nodes_list[i],
                                     all_nodes_list[j])] = similarity
                    similarity_memo[(all_nodes_list[j],
                                     all_nodes_list[i])] = similarity
            similarity_graph = nx.DiGraph()
            similarity_graph.add_nodes_from(all_nodes)
            graph_memo[col_number] = similarity_graph
            nodes_memo[col_number] = all_nodes

        for i, each_row in self.kgtk_format_input.iterrows():
            each_surface = each_row["candidates"].split("|")
            if len(each_surface) > 0:
                for each_node_i in each_surface:
                    if each_node_i == "":
                        continue
                    col_number = col_memo[each_node_i]
                    all_nodes_set = nodes_memo[col_number]
                    remained_nodes = all_nodes_set - set(each_surface)
                    # calculate sum score first
                    sum_score = 0
                    for each_node_j in remained_nodes:
                        sum_score += similarity_memo[(each_node_i,
                                                      each_node_j)]
                    for each_node_j in remained_nodes:
                        # pos = (pos_memo[each_node_i], pos_memo[each_node_j])
                        each_weight = similarity_memo[
                            (each_node_i, each_node_j)] / sum_score
                        graph_memo[col_number].add_edge(each_node_i,
                                                        each_node_j,
                                                        weight=each_weight)
        return graph_memo
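    # In the graph built above, each candidate node i gets outgoing edges to the other
    # candidates j of the same column, excluding the candidates of its own cell, with
    #   weight(i -> j) = similarity(i, j) / sum of similarity(i, k) over those candidates k.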

    def _calculate_page_rank(self):
        import networkx as nx
        # just get initial page rank to do filtering
        weights_original = {}
        graph_memo = self._generate_graph()
        for each_graph in graph_memo.values():
            weights_original.update(dict(each_graph.degree(weight='weight')))
        self.loaded_file['|pr|'] = self.loaded_file['kg_id'].map(
            weights_original)
        from tl.features.normalize_scores import drop_by_score
        self.loaded_file = drop_by_score(column="|pr|",
                                         df=self.loaded_file,
                                         k=20)
        # we also need to update the kgtk-format input
        self._to_kgtk_test_format()
        # create the graph again based on the filtered result
        res = {}
        graph_memo = self._generate_graph()
        # it seems pagerank_numpy runs quickest
        for each_graph in graph_memo.values():
            res.update(nx.pagerank_numpy(each_graph, alpha=0.9))
        self.loaded_file['|pr|'] = self.loaded_file['kg_id'].map(res)

    def _get_precomputed_page_rank(self):
        """
        Get the precomputed PageRank from the whole Wikidata graph.
        :return:
        """
        pageranks = {
            k: v[0] if len(v) > 0 else 0
            for k, v in self.es.search_node_pagerank(
                self.loaded_file['kg_id'].dropna().unique().tolist()).items()
        }
        self.loaded_file["|pr|"] = self.loaded_file['kg_id'].map(
            pageranks).fillna(0)

    def _get_centroid(self, vector_strategy: str):
        """
            Calculate the column-vector (centroid) value for each column.
        """
        n_value = int(self.kwargs.pop("n_value"))

        if vector_strategy == "ground-truth":
            if "GT_kg_id" not in self.loaded_file:
                raise TLException(
                    "The input file does not have `GT_kg_id` column! Can't run with ground-truth "
                    "strategy")
            candidate_nodes = list(set(self.loaded_file["GT_kg_id"].tolist()))
        elif vector_strategy == "exact-matches":
            candidate_nodes = list(set(self.loaded_file["kg_id"].tolist()))
        else:
            raise TLException(
                "Unknown vector strategy {}".format(vector_strategy))
        candidate_nodes = [
            each for each in candidate_nodes
            if each != "" and each is not np.nan
        ]

        # get the corresponding column of each candidate node
        nodes_map = defaultdict(set)
        for each_node in candidate_nodes:
            for group, nodes in self.groups.items():
                if each_node in nodes:
                    nodes_map[group].add(each_node)

        # randomly sample nodes if needed
        nodes_map_updated = {}

        for group, nodes in nodes_map.items():
            if n_value != 0 and n_value < len(nodes):
                # convert the set to a list so random.sample works on newer Python versions
                nodes_map_updated[group] = random.sample(list(nodes), n_value)
            else:
                nodes_map_updated[group] = nodes

        # get centroid for each column
        for group, nodes in nodes_map_updated.items():
            temp = []
            for each_node in sorted(list(nodes)):
                temp.append(self.vectors_map[each_node])
            each_centroid = np.mean(np.array(temp), axis=0)
            self.centroid[group] = each_centroid

    def compute_distance(self, v1: typing.List[float], v2: typing.List[float]):
        if self.kwargs["distance_function"] == "cosine":
            val = 1 - cosine(v1, v2)

        elif self.kwargs["distance_function"] == "euclidean":
            val = euclidean(v1, v2)
            # higher scores should mean better matches, so use the reciprocal of the distance
            if val == 0:
                val = float("inf")
            else:
                val = 1 / val
        else:
            raise TLException("Unknown distance function {}".format(
                self.kwargs["distance_function"]))
        return val

    def add_score_column(self):
        score_column_name = self.kwargs["output_column_name"]
        if score_column_name is None:
            score_column_name = "score_{}".format(
                self.kwargs["column_vector_strategy"])
            i = 1
            while score_column_name in self.loaded_file:
                i += 1
                score_column_name = "score_{}_{}".format(
                    self.kwargs["column_vector_strategy"], i)

        if self.kwargs["column_vector_strategy"] in {
                "page-rank", "page-rank-precomputed"
        }:
            self.loaded_file = self.loaded_file.rename(
                columns={'|pr|': score_column_name})
        else:
            scores = []
            for i, each_row in self.loaded_file.iterrows():
                # a missing kg_id may appear as a float NaN
                if (isinstance(each_row["kg_id"], float) and math.isnan(
                        each_row["kg_id"])) or each_row["kg_id"] is np.nan:
                    each_score = ""
                else:
                    each_score = self.compute_distance(
                        self.centroid[each_row["column"]],
                        self.vectors_map[each_row["kg_id"]])

                scores.append(each_score)
            self.loaded_file[score_column_name] = scores

        if self.kwargs["save_embedding_feature"]:
            self.loaded_file['sentence'] = self.loaded_file['kg_id'].map(
                self.sentence_map)
            self.loaded_file['vector'] = self.loaded_file['kg_id'].map(
                self.vectors_map)

        if self.kwargs["ignore_empty_sentences"]:
            # remove rows whose sentence is just the kg id (i.e., the sentence is effectively empty)
            self.loaded_file = self.loaded_file[
                self.loaded_file['kg_id'] != self.loaded_file['sentence'].
                apply(lambda x: x[:-1] if isinstance(x, str) else x)]

    def _create_detail_has_properties(self):
        """
        Load the property file, drop properties that are not needed, and collect the ones whose values should be inspected further.
        :return: None
        """
        model_file_path = os.path.join(
            os.path.dirname(__file__), "predicate_counts_and_labels.tsv")
        if os.path.exists(model_file_path):
            properties_df = pd.read_csv(model_file_path, sep='\t')
        else:
            return
        # process
        need_isa_properties = {"P31"}
        need_has_properties = set()
        for _, each_row in properties_df.iterrows():
            if not isinstance(each_row["label"], str) and np.isnan(
                    each_row["label"]):
                continue
            if each_row["operation"] == "check_inside" or each_row[
                    "label"].endswith("of'@en"):
                need_isa_properties.add(each_row["predicate"])
                continue
            elif each_row["operation"] == "bl":
                continue
            else:
                if "ID" in each_row["label"] or \
                        "identifier" in each_row["label"].lower() or \
                        "common" in each_row["label"].lower():
                    continue
            need_has_properties.add(each_row["predicate"])

        self.kwargs["has_properties"] = list(need_has_properties)
        self.kwargs["isa_properties"] = list(need_isa_properties)
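    # The predicate_counts_and_labels.tsv file loaded above is expected to contain at
    # least "predicate", "label" and "operation" columns (inferred from the lookups in
    # _create_detail_has_properties).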

    def get_vectors(self):
        """
            Send the table-linker-format data to the kgtk vector embedding command,
            then load the output and build the vector map.
        """
        # no vector calculation needed for precomputed pagerank
        if self.kwargs.get(
                "column_vector_strategy") == "page-rank-precomputed":
            return

        # transform format to kgtk format input
        temp_file = tempfile.NamedTemporaryFile(mode='r+', suffix=".csv")
        self.kgtk_format_input.to_csv(temp_file, index=False)
        temp_file.seek(0)
        self.kwargs["input_file"] = Path(temp_file.name)
        self.kwargs["input_format"] = "test_format"
        self.kwargs["_debug"] = self.kwargs["debug"]
        self.kwargs["output_uri"] = "none"
        self.kwargs["use_cache"] = True
        # always send True to kgtk so that we get the sentences back and can check whether they are empty
        self.kwargs["save_embedding_sentence"] = True
        if self.kwargs["has_properties"] == ["all"] and self.kwargs["isa_properties"] == ["P31"] \
                and self.kwargs["use_default_file"]:
            self._create_detail_has_properties()

        # capture stdout into a string buffer
        old_stdout = sys.stdout
        sys.stdout = output_vectors = StringIO()

        main_embedding_function(**self.kwargs)
        sys.stdout = old_stdout
        # read the output vectors
        output_vectors.seek(0)
        _ = output_vectors.readline()
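        # each remaining line is tab-separated: node id, edge name, value;
        # "embedding_sentence" rows carry the sentence, all other rows carry a
        # comma-separated embedding vector (as parsed below)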
        for each_line in output_vectors.readlines():
            each_line = each_line.replace("\n", "").split("\t")
            each_q = each_line[0]
            each_edge = each_line[1]
            if each_edge == "embedding_sentence":
                each_sentence = each_line[2]
                self.sentence_map[each_q] = each_sentence
            else:
                each_vector = np.array(
                    [float(each_v) for each_v in each_line[2].split(",")])
                self.vectors_map[each_q] = each_vector

        # save kgtk output vector file if needed
        if self.kwargs["projector_file_name"] is not None:
            self.save_vector_file(output_vectors)
        output_vectors.close()

    def save_vector_file(self, vector_io):
        output_path = self.kwargs["projector_file_name"]
        if "/" not in output_path:
            output_path = os.path.join(os.getcwd(), output_path)
        vector_io.seek(0)
        with open(output_path, "w") as f:
            f.writelines(vector_io.readlines())

    def print_output(self):
        self.loaded_file.to_csv(sys.stdout, index=False)
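A minimal end-to-end sketch of how the EmbeddingVector class above might be driven, assuming a parameters dict with the keys the methods read; all values are hypothetical and additional keys may be required by specific code paths:

params = {
    "url": "http://localhost:9200",          # hypothetical Elasticsearch URL
    "index": "wikidata_index",               # hypothetical index name
    "user": None,
    "password": None,
    "column_vector_strategy": "exact-matches",
    "distance_function": "cosine",
    "n_value": 0,                            # 0 means "use all sampled nodes"
    "output_column_name": "embedding_score",
    "save_embedding_feature": False,
    "ignore_empty_sentences": False,
    "has_properties": ["all"],
    "isa_properties": ["P31"],
    "use_default_file": False,
    "debug": False,
    "projector_file_name": None,
}

ranker = EmbeddingVector(params)
ranker.load_input_file("candidates.csv")     # hypothetical table-linker candidate file
ranker.get_vectors()
ranker.process_vectors()
ranker.add_score_column()
ranker.print_output()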