# Imports assumed by the excerpts below; ut refers to the project's util module
# (serialisation helpers, embedding initialisation).
import itertools

import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine
from sklearn.neighbors import NearestNeighbors

import util as ut


def perform_clustering_quality(self, df):
    """
    Measure clustering quality as the mean type-based purity over HDBSCAN clusters.

    :param df: pandas DataFrame of entity embeddings, indexed by vocabulary position.
    :return: None; the mean cluster purity is printed.
    """

    def create_binary_type_vector(t_types, a_types):
        # Binary indicator vector over all known types.
        vector = np.zeros(len(a_types))
        i = [a_types.index(_) for _ in t_types]
        vector[i] = 1
        return vector

    # Mapping from the index of a subject to the indexes of its types.
    type_info = ut.deserializer(path=self.p_folder, serialized_name='type_info')

    # All unique types, i.e. all o such that (s, rdf:type, o) is in the KG.
    all_types = sorted(set.union(*list(type_info.values())))

    # Keep only those resources that have type information.
    df_only_subjects = df.loc[list(type_info.keys())]

    # Apply clustering.
    df_only_subjects = self.pseudo_label_HDBSCAN(
        df_only_subjects, min_cluster_size=26, min_samples=29)
    clusters = pd.unique(df_only_subjects.labels)

    sum_purity = 0
    for c in clusters:
        valid_indexes_in_c = df_only_subjects[df_only_subjects.labels == c].index.values
        sum_of_cosines = 0
        print('##### CLUSTER', c, ' #####')

        for i in valid_indexes_in_c:
            # type_info[i] returns a set of type indexes.
            types_i = type_info[i]
            vector_type_i = create_binary_type_vector(types_i, all_types)

            for j in valid_indexes_in_c:
                types_j = type_info[j]
                vector_type_j = create_binary_type_vector(types_j, all_types)
                # scipy's cosine() is a distance, so 1 - cosine() is the similarity.
                sum_of_cosines += 1 - cosine(vector_type_i, vector_type_j)

        purity = sum_of_cosines / (len(valid_indexes_in_c) ** 2)
        sum_purity += purity

    mean_of_scores = sum_purity / len(clusters)
    print('Mean of cluster purity', mean_of_scores)
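# The purity of a cluster is the mean pairwise cosine similarity between the
# binary type vectors of its members. A minimal sketch on hypothetical toy data
# (all names below are illustrative), computing the same quantity without the
# quadratic Python loop by row-normalising and taking a Gram matrix:
import numpy as np

all_types = ['Agent', 'Organisation', 'Person', 'Place']
type_info = {0: {'Agent', 'Person'}, 1: {'Agent', 'Person'}, 2: {'Place'}}

# Stack the binary type vectors into an (n, |all_types|) matrix.
M = np.zeros((len(type_info), len(all_types)))
for row, types in enumerate(type_info.values()):
    M[row, [all_types.index(t) for t in types]] = 1

# Row-normalise; the Gram matrix then holds all pairwise cosine similarities.
M_norm = M / np.linalg.norm(M, axis=1, keepdims=True)
sims = M_norm @ M_norm.T

# Mean over all ordered pairs, matching the n ** 2 denominator above.
purity = sims.sum() / (len(type_info) ** 2)
print(purity)  # 0 and 1 share identical types, 2 is disjoint -> 5/9 ≈ 0.556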
parser.set_similarity_measure(PPMI)

model = PYKE()
analyser = DataAnalyser(p_folder=storage_path)

holder = parser.pipeline_of_preprocessing(kg_path)
vocab_size = len(holder)

embeddings = ut.randomly_initialize_embedding_space(vocab_size, num_of_dims)
learned_embeddings = model.pipeline_of_learning_embeddings(
    e=embeddings,
    max_iteration=bound_on_iter,
    energy_release_at_epoch=e_release,
    holder=holder,
    omega=omega)

# Free the initial embedding space and the preprocessing holder.
del embeddings
del holder

# analyser.perform_clustering_quality(learned_embeddings)
analyser.perform_type_prediction(learned_embeddings)

# Replace positional indexes with human-readable vocabulary entries before export.
vocab = ut.deserializer(path=storage_path, serialized_name='vocabulary')
learned_embeddings.index = [i.replace('http://example.com/foo#', '') for i in vocab]
learned_embeddings.to_csv(storage_path + '/PYKE_50_embd.csv')

analyser.plot2D(learned_embeddings)
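# The CSV written above can be reloaded for downstream use; a minimal sketch,
# assuming the same storage_path as in the script (the first CSV column holds
# the cleaned vocabulary entries and becomes the index again):
import pandas as pd

learned_embeddings = pd.read_csv(storage_path + '/PYKE_50_embd.csv', index_col=0)
print(learned_embeddings.shape)  # (vocab_size, num_of_dims)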
def perform_type_prediction(self, df):
    """
    Evaluate type prediction by letting the k nearest neighbours of each
    entity vote on its types.

    :param df: pandas DataFrame of entity embeddings, indexed by vocabulary position.
    :return: None; the mean prediction similarity is printed for each k.
    """

    def create_binary_type_vector(t_types, a_types):
        # Binary indicator vector over all known types.
        vector = np.zeros(len(a_types))
        i = [a_types.index(_) for _ in t_types]
        vector[i] = 1
        return vector

    def create_binary_type_prediction_vector(t_types, a_types):
        # Vote vector: every neighbour contributes all of its types.
        # np.add.at accumulates duplicate indexes; plain vector[i] += 1 would
        # count a repeatedly predicted type only once.
        vector = np.zeros(len(a_types))
        i = [a_types.index(_) for _ in itertools.chain.from_iterable(t_types)]
        np.add.at(vector, i, 1)
        return vector

    # Mapping from the index of a subject to the indexes of its types,
    # i.e. all o such that (s, rdf:type, o) is in the KG.
    type_info = ut.deserializer(path=self.p_folder, serialized_name='type_info')

    # All unique types occurring in the KG.
    all_types = sorted(set.union(*list(type_info.values())))

    # Consider only points with type information.
    e_w_types = df.loc[list(type_info.keys())]

    neigh = NearestNeighbors(
        n_neighbors=101, algorithm='kd_tree', metric='euclidean',
        n_jobs=-1).fit(e_w_types)

    # Nearest-neighbour lookup for the selected entities.
    df_most_similars = pd.DataFrame(
        neigh.kneighbors(e_w_types, return_distance=False))

    # Re-index the result with the original entity indexes.
    df_most_similars.index = e_w_types.index.values

    # sklearn's kneighbors returns each point itself as its most similar
    # neighbour; drop that column.
    df_most_similars.drop(columns=[0], inplace=True)

    # KNN works on positional indexes and ignores the DataFrame index;
    # map the results back to the original vocabulary positions.
    mapper = dict(zip(list(range(len(e_w_types))), e_w_types.index.values))
    df_most_similars = df_most_similars.applymap(lambda x: mapper[x])

    k_values = [1, 3, 5, 10, 15, 30, 50, 100]
    print('K values:', k_values)
    for k in k_values:
        print('#####', k, '####')
        similarities = list()
        for idx, S in df_most_similars.iterrows():
            true_types = type_info[idx]
            type_predictions = [type_info[s] for s in S.values[:k]]

            vector_true = create_binary_type_vector(true_types, all_types)
            vector_prediction = create_binary_type_prediction_vector(
                type_predictions, all_types)

            # Record the similarity, i.e. 1 - cosine distance.
            sim = cosine(vector_true, vector_prediction)
            similarities.append(1 - sim)

        report = pd.DataFrame(similarities)
        print('Mean type prediction', report.mean().values)
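# For each entity, its k nearest neighbours vote with their type sets, and the
# score is the cosine similarity between the true binary type vector and the
# accumulated vote vector. A worked sketch on hypothetical types; it also shows
# why np.add.at is needed: with duplicate indexes, plain fancy-indexed += would
# silently count each index only once.
import numpy as np
from scipy.spatial.distance import cosine

all_types = ['Agent', 'Organisation', 'Person', 'Place']
true_types = {'Agent', 'Person'}
neighbour_types = [{'Agent', 'Person'}, {'Person'}, {'Organisation'}]  # k = 3

vector_true = np.zeros(len(all_types))
vector_true[[all_types.index(t) for t in true_types]] = 1

vector_pred = np.zeros(len(all_types))
votes = [all_types.index(t) for ts in neighbour_types for t in ts]
np.add.at(vector_pred, votes, 1)  # 'Person' is voted for twice

print(vector_pred)                           # [1. 1. 2. 0.]
print(1 - cosine(vector_true, vector_pred))  # 3 / sqrt(2 * 6) ≈ 0.866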