def get_glove_similarity(self, num_hits=25, method='cosine'):
    """ Calculate GloVe based similarities (all-vs-all).

    Hands the GloVe document vectors (``self.vectors_glove``) to
    ``functions.calculate_similarities`` and stores the first two returned
    values in ``self.list_similars_glove_idx`` and
    ``self.list_similars_glove``. When no document vectors are available,
    only a hint is printed and nothing is stored.

    Args:
    -------
    num_hits: int
        Number of closest matches to keep; passed on to
        functions.calculate_similarities. Default is 25.
    method: str
        See scipy spatial.distance.cdist for options. Default is 'cosine'.
    """
    # Guard on the document vectors, because they are what gets passed on
    # below. A trained model without document vectors must end up here too,
    # instead of handing None to calculate_similarities.
    if self.vectors_glove is None:
        print("No GloVe document vectors found.")
        print("Please first train model using 'build_model_glove' function.")
        print("Then create document vectors using 'get_vectors_glove' function.")
        return

    # The third return value (named mean_similarity by the sibling methods)
    # is not used here.
    list_similars_idx, list_similars, _ = functions.calculate_similarities(
        self.vectors_glove, num_hits, method=method)
    print("Calculated distances between ", list_similars.shape[0], " documents.")

    self.list_similars_glove_idx = list_similars_idx
    self.list_similars_glove = list_similars
def get_doc2vec_similarity(self, num_hits=25, method='cosine'):
    """ Calculate Doc2Vec based similarities (all-vs-all).

    Collects one Doc2Vec document vector per entry of ``self.corpus`` from
    ``self.model_doc2vec.docvecs`` and hands the resulting matrix to
    ``functions.calculate_similarities``. When no Doc2Vec model is
    available, only a hint is printed and nothing is stored.

    Args:
    -------
    num_hits: int
        Number of closest matches to keep; passed on to
        functions.calculate_similarities. Default is 25.
    method: str
        See scipy spatial.distance.cdist for options. Default is 'cosine'.
    """
    # Guard on the Doc2Vec model, which is dereferenced below. The GloVe
    # vectors play no role in this method.
    if self.model_doc2vec is None:
        print("No trained Doc2Vec model found.")
        print("Please first train model using 'build_model_doc2vec' function.")
        return

    # Row i of the matrix holds the document vector of corpus entry i.
    num_docs = len(self.corpus)
    vectors = np.zeros((num_docs, self.model_doc2vec.vector_size))
    for i in range(num_docs):
        vectors[i, :] = self.model_doc2vec.docvecs[i]

    # The third return value (mean similarity) is not used here.
    list_similars_idx, list_similars, _ = functions.calculate_similarities(
        vectors, num_hits, method=method)

    # NOTE(review): the results are written to the *_ctr attributes, which
    # get_centroid_similarity also writes, so each call overwrites the
    # other's results. This looks like a copy-paste slip; confirm which
    # attributes downstream code reads before renaming.
    self.list_similars_ctr_idx = list_similars_idx
    self.list_similars_ctr = list_similars
def test_calculate_similarities():
    """ Check calculate_similarities against hand-computed cosine results.

    Vectors 0 and 1 share one of vector 1's two non-zero entries
    (cosine similarity 1/sqrt(2), about 0.707). Vectors 2 and 3 are
    parallel (similarity 1). Vectors 0 and 2 are orthogonal (similarity 0).
    """
    # Test with test-vectors and known outcome
    testvectors = np.array([[0, 0, 0, 0, 1],
                            [1, 0, 0, 0, 1],
                            [1, 1, 1, 0, 0],
                            [0.5, 0.5, 0.5, 0, 0]])

    # Run function (the third return value is not checked here):
    list_similars_ids, list_similars, _ = calculate_similarities(
        testvectors, num_hits=4, method='cosine')

    # Similarity values are floats, so compare with a tolerance; exact
    # equality can fail on rounding in the cosine computation.
    assert np.isclose(list_similars[0][1], list_similars[1][1])
    assert list_similars[1][1] > 0.7
    assert np.isclose(list_similars[2][1], 1)
    assert np.isclose(list_similars[3][1], 1)
    assert np.isclose(list_similars[0][3], 0)
    assert np.isclose(list_similars[2][3], 0)

    # Compare each id row separately: chaining `==` across arrays
    # (a == b == c) evaluates bool(a == b), which raises ValueError for
    # arrays with more than one element.
    assert np.array_equal(list_similars_ids[1, :], [1, 0, 2, 3])
    assert np.array_equal(list_similars_ids[2, :], [2, 3, 1, 0])
    assert np.array_equal(list_similars_ids[3, :], [2, 3, 1, 0])
def get_pca_similarity(self, num_hits=25, method='cosine'):
    """ Calculate PCA similarities (all-versus-all).

    Hands ``self.vectors_pca`` to ``functions.calculate_similarities`` and
    keeps the first two returned values in ``self.list_similars_pca_idx``
    and ``self.list_similars_pca``.

    Args:
    -------
    num_hits: int
        Number of closest matches to keep; passed on to
        functions.calculate_similarities. Default is 25.
    method: str
        See scipy spatial.distance.cdist for options. Default is 'cosine'.
    """
    outcome = functions.calculate_similarities(
        self.vectors_pca, num_hits, method=method)

    # Three values come back; the last one (mean similarity) is discarded.
    self.list_similars_pca_idx, self.list_similars_pca, _ = outcome
def get_centroid_similarity(self, num_hits=25, method='cosine'):
    """ Calculate centroid similarities (all-versus-all).

    Hands ``self.vectors_centroid`` to ``functions.calculate_similarities``,
    reports how many rows the returned similarity array has, and keeps the
    first two returned values in ``self.list_similars_ctr_idx`` and
    ``self.list_similars_ctr``.

    Args:
    -------
    num_hits: int
        Number of closest matches to keep; passed on to
        functions.calculate_similarities. Default is 25.
    method: str
        See scipy spatial.distance.cdist for options. Default is 'cosine'.
    """
    # Three values come back; the last one (mean similarity) is discarded.
    hit_indices, hit_scores, _ = functions.calculate_similarities(
        self.vectors_centroid, num_hits, method=method)

    num_documents = hit_scores.shape[0]
    print("Calculated distances between ", num_documents, " documents.")

    self.list_similars_ctr_idx = hit_indices
    self.list_similars_ctr = hit_scores
def get_autoencoder_similarity(self, num_hits=25, method='cosine'):
    """ Calculate autoencoder similarities (all-versus-all).

    Runs ``self.encoder.predict`` on ``self.X_data``, stores the result in
    ``self.vectors_ae``, and hands those vectors to
    ``functions.calculate_similarities``. The first two returned values are
    kept in ``self.list_similars_ae_idx`` and ``self.list_similars_ae``.

    Args:
    -------
    num_hits: int
        Number of closest matches to keep; passed on to
        functions.calculate_similarities. Default is 25.
    method: str
        See scipy spatial.distance.cdist for options. Default is 'cosine'.
    """
    # Encode first: the similarity search runs on the encoder output.
    self.vectors_ae = self.encoder.predict(self.X_data)

    outcome = functions.calculate_similarities(
        self.vectors_ae, num_hits, method=method)

    # Three values come back; the last one (mean similarity) is discarded.
    self.list_similars_ae_idx, self.list_similars_ae, _ = outcome