Example #1
    def perform_clustering_quality(self, df):
        """

        :param df:
        :param type_info:
        :param min_cluster_size:
        :param min_samples:
        :return:
        """

        def create_binary_type_vector(t_types, a_types):
            # Binary indicator vector with a 1 at the position of each type in t_types
            vector = np.zeros(len(a_types))
            indexes = [a_types.index(t) for t in t_types]
            vector[indexes] = 1
            return vector

        type_info = ut.deserializer(path=self.p_folder, serialized_name='type_info')

        # All unique types, i.e. every o such that (s, rdf:type, o) occurs in the KG
        all_types = sorted(set.union(*list(type_info.values())))

        # Keep only those resources that have type information
        df_only_subjects = df.loc[list(type_info.keys())]

        # Apply HDBSCAN clustering to obtain pseudo-labels
        df_only_subjects = self.pseudo_label_HDBSCAN(df_only_subjects, min_cluster_size=26, min_samples=29)

        clusters = pd.unique(df_only_subjects.labels)

        sum_purity = 0
        for c in clusters:

            valid_indexes_in_c = df_only_subjects[df_only_subjects.labels == c].index.values

            sum_of_cosines = 0

            print('##### CLUSTER', c, ' #####')

            for i in valid_indexes_in_c:

                # type_info[i] is the set of type indexes assigned to entity i
                types_i = type_info[i]

                vector_type_i = create_binary_type_vector(types_i, all_types)

                for j in valid_indexes_in_c:
                    types_j = type_info[j]
                    vector_type_j = create_binary_type_vector(types_j, all_types)

                    # scipy's cosine() is a distance, so 1 - distance is the similarity
                    sum_of_cosines += 1 - cosine(vector_type_i, vector_type_j)

            # Mean similarity over all |c|^2 ordered pairs (self-pairs included)
            purity = sum_of_cosines / (len(valid_indexes_in_c) ** 2)

            sum_purity += purity

        mean_of_scores = sum_purity / len(clusters)
        print('Mean of cluster purity', mean_of_scores)
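
The purity of a cluster is the mean pairwise cosine similarity between the binary type vectors of its members (self-pairs included, hence the |c|^2 denominator). A minimal standalone sketch of that computation, on hypothetical toy vectors:

import numpy as np
from scipy.spatial.distance import cosine

# Hypothetical binary type vectors for three entities in one cluster
members = [
    np.array([1, 1, 0, 0]),
    np.array([1, 0, 0, 0]),
    np.array([1, 1, 0, 0]),
]

# purity(c) = (1 / |c|^2) * sum over all ordered pairs (i, j) of
# the cosine similarity between their type vectors
sum_of_cosines = sum(
    1 - cosine(v_i, v_j) for v_i in members for v_j in members
)
purity = sum_of_cosines / len(members) ** 2
print(round(purity, 4))  # ~0.87: the cluster is fairly type-homogeneous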
Example #2
# Use PPMI as the similarity measure during preprocessing
parser.set_similarity_measure(PPMI)

model = PYKE()

analyser = DataAnalyser(p_folder=storage_path)

# Preprocess the KG and build the structure consumed by the learning step
holder = parser.pipeline_of_preprocessing(kg_path)

vocab_size = len(holder)

# Randomly initialize the embedding space (vocab_size x num_of_dims)
embeddings = ut.randomly_initialize_embedding_space(vocab_size, num_of_dims)

learned_embeddings = model.pipeline_of_learning_embeddings(
    e=embeddings,
    max_iteration=bound_on_iter,
    energy_release_at_epoch=e_release,
    holder=holder,
    omega=omega)
# Free the initial embeddings and the preprocessing structure
del embeddings
del holder

#analyser.perform_clustering_quality(learned_embeddings)
analyser.perform_type_prediction(learned_embeddings)

# Restore human-readable labels by stripping the namespace prefix from each IRI
vocab = ut.deserializer(path=storage_path, serialized_name='vocabulary')
learned_embeddings.index = [
    i.replace('http://example.com/foo#', '') for i in vocab
]
learned_embeddings.to_csv(storage_path + '/PYKE_50_embd.csv')

analyser.plot2D(learned_embeddings)
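
The snippet above presupposes some setup that it does not show. A sketch of the assumed definitions follows; the import paths, constructor signature, and hyperparameter values are assumptions inferred from the calls the snippet makes, not confirmed by it:

# Assumed setup (hypothetical names/values except where used above)
import util as ut  # assumed module providing deserializer, randomly_initialize_embedding_space
from helper_classes import Parser, PYKE, DataAnalyser, PPMI  # assumed import path

num_of_dims = 50       # embedding dimensionality (hypothetical value)
bound_on_iter = 30     # upper bound on training iterations (hypothetical)
omega = 0.45557        # repulsion weight (hypothetical)
e_release = 0.0414     # energy released per epoch (hypothetical)

kg_path = 'KGs/'                              # input knowledge graph folder (assumed)
storage_path = 'Experiments/run_01'           # output folder (assumed)
parser = Parser(p_folder=storage_path, k=45)  # assumed constructor signature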
Example #3
    def perform_type_prediction(self, df):
        """Predict each entity's types from the types of its k nearest
        neighbours in the embedding space and report the mean cosine score."""
        def create_binary_type_vector(t_types, a_types):
            # Binary indicator vector over the full type vocabulary
            vector = np.zeros(len(a_types))
            indexes = [a_types.index(t) for t in t_types]
            vector[indexes] = 1
            return vector

        def create_binary_type_prediction_vector(t_types, a_types):
            # Count type votes over all k neighbours
            vector = np.zeros(len(a_types))
            indexes = [
                a_types.index(t)
                for t in itertools.chain.from_iterable(t_types)
            ]
            # np.add.at accumulates duplicate indexes; a fancy-indexed
            # `vector[indexes] += 1` would count each repeated type only once
            np.add.at(vector, indexes, 1)
            return vector

        # type_info maps each subject index to the set of its type (object) indexes
        type_info = ut.deserializer(path=self.p_folder,
                                    serialized_name='type_info')

        # All unique types, i.e. every o such that (s, rdf:type, o) occurs in the KG
        all_types = sorted(set.union(*list(type_info.values())))

        # Consider only entities that have type information
        e_w_types = df.loc[list(type_info.keys())]

        # n_neighbors=101: the query point itself plus its 100 nearest neighbours
        neigh = NearestNeighbors(n_neighbors=101,
                                 algorithm='kd_tree',
                                 metric='euclidean',
                                 n_jobs=-1).fit(e_w_types)

        # Get similarity results for selected entities
        df_most_similars = pd.DataFrame(
            neigh.kneighbors(e_w_types, return_distance=False))

        # Reindex the target
        df_most_similars.index = e_w_types.index.values

        # kneighbors returns the query point itself as its own nearest
        # neighbour, so drop that first column
        df_most_similars.drop(columns=[0], inplace=True)

        # Map positional KNN indexes back to the original DataFrame index,
        # since NearestNeighbors ignores the DataFrame's own index
        mapper = dict(zip(list(range(len(e_w_types))), e_w_types.index.values))
        df_most_similars = df_most_similars.applymap(lambda x: mapper[x])

        k_values = [1, 3, 5, 10, 15, 30, 50, 100]

        print('K values:', k_values)
        for k in k_values:
            print('#####', k, '####')
            similarities = list()
            for entity, S in df_most_similars.iterrows():
                true_types = type_info[entity]
                # Types of the entity's k most similar neighbours
                type_predictions = [type_info[n] for n in S.values[:k]]

                vector_true = create_binary_type_vector(true_types, all_types)
                vector_prediction = create_binary_type_prediction_vector(
                    type_predictions, all_types)

                # scipy's cosine() is a distance; store 1 - distance as similarity
                sim = cosine(vector_true, vector_prediction)
                similarities.append(1 - sim)

            report = pd.DataFrame(similarities)
            print('Mean type prediction', report.mean().values)
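
For each entity, the k nearest neighbours effectively vote on its types: the prediction vector sums the type indicators of all k neighbours, so types shared by many neighbours get more weight before the cosine comparison. A minimal sketch with a hypothetical type vocabulary:

import itertools
import numpy as np
from scipy.spatial.distance import cosine

all_types = ['Agent', 'Person', 'Place', 'Work']  # hypothetical vocabulary

# Types of the 3 nearest neighbours of some entity (hypothetical)
neighbour_types = [{'Agent', 'Person'}, {'Agent'}, {'Agent', 'Person'}]

# Count the votes: sum type indicators over all neighbours
prediction = np.zeros(len(all_types))
idx = [all_types.index(t) for t in itertools.chain.from_iterable(neighbour_types)]
np.add.at(prediction, idx, 1)  # accumulates duplicates, unlike prediction[idx] += 1
print(prediction)              # [3. 2. 0. 0.]

true_types = np.zeros(len(all_types))
true_types[[0, 1]] = 1         # the entity really is an Agent and a Person
print(1 - cosine(true_types, prediction))  # ~0.98 type-prediction score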