def avg_embed(self, train_data, body_dict, threshold):
    """Classify headline/body pairs as related or unrelated using the
    cosine similarity of their averaged word embeddings.

    :param train_data: list of training samples ['headline', 'bodyID', 'stance']
    :param body_dict: mapping {bodyID: 'bodyText'}
    :param threshold: similarity cut-off between related and unrelated
    """
    embeddings = LoadEmbeddings(filepath=self.embeddPath,
                                data_path=self.embeddData,
                                vocab_size=self.vocab_size,
                                embedding_size=self.embedding_size)

    # Tokenize each body text once (stopwords removed), keyed by bodyID.
    body_tokens = {body_id: sent2stokens_wostop(text)
                   for body_id, text in body_dict.items()}

    unrelated, related, y_true, y_pred = [], [], [], []
    for headline, body_id, stance in train_data:
        headline_tokens = sent2stokens_wostop(headline)
        sim = avg_embedding_similarity(embeddings,
                                       self.embedding_size,
                                       ' '.join(headline_tokens),
                                       ' '.join(body_tokens[body_id]))
        unrelated, related, y_true, y_pred = create_lists(
            sim, stance, threshold, [unrelated, related, y_true, y_pred])

    print_results([unrelated, related, y_true, y_pred], self.model_type)
def doc2vec_similarity_max(self, train_data, body_dict, threshold):
    """Relate a headline to a body via its best-matching body sentence.

    Each headline is compared against every sentence of the body text;
    the highest average-embedding similarity is then checked against
    `threshold` to decide related vs. unrelated.

    :param train_data: list of training samples ['headline', 'bodyID', 'stance']
    :param body_dict: mapping {bodyID: 'bodyText'}
    :param threshold: similarity cut-off between related and unrelated
    """
    logging.info('Load embeddings: Vocab-Size: ' + str(self.vocab_size) +
                 ' Embedding size: ' + str(self.embedding_size))
    embeddings = LoadEmbeddings(filepath=self.embeddPath,
                                data_path=self.embeddData,
                                vocab_size=self.vocab_size,
                                embedding_size=self.embedding_size)

    # Put the body texts into a workable, index-addressable form.
    body_texts = list(body_dict.values())
    index_of = {body_id: pos for pos, body_id in enumerate(body_dict.keys())}

    unrelated, related, y_true, y_pred = [], [], [], []
    for headline, body_id, stance in train_data:
        logging.info("Headline: " + headline)
        body_text = body_texts[index_of[body_id]]

        # Best similarity between the headline and any single sentence of
        # the body. avg_embedding_similarity tokenizes and lemmatizes its
        # inputs itself, so the raw strings are passed straight through.
        score = max((avg_embedding_similarity(embeddings,
                                              self.embedding_size,
                                              headline, sentence)
                     for sentence in text2sent(body_text)),
                    default=0)

        unrelated, related, y_true, y_pred = create_lists(
            score, stance, threshold, [unrelated, related, y_true, y_pred])

        # Manual cross-check logging only; classification happened above.
        calculated_stance = "unrelated" if score <= threshold else "related"
        logging.info(
            "Best score for this headline - sentence similarity: " + str(score))
        logging.info("Real/calculated stance: " + stance + " / " + calculated_stance)

    print_results([unrelated, related, y_true, y_pred], self.model_type)
def word_mover_distance_similarity(self, train_data, body_dict, threshold, type):
    """Classify headline/body pairs by Word Mover's Distance.

    :param train_data: list of training samples ['headline', 'bodyID', 'stance']
    :param body_dict: mapping {bodyID: 'bodyText'}
    :param threshold: distance cut-off; at or below it a pair counts as related
    :param type: 'sentence' to take the minimum distance over individual body
        sentences, 'wholeText' to compare against the full body text.
        (Name shadows the builtin, kept unchanged for caller compatibility.)

    Note: the distance is not normalized.
    """
    embeddings = LoadEmbeddings(filepath=self.embeddPath,
                                data_path=self.embeddData,
                                vocab_size=self.vocab_size,
                                embedding_size=self.embedding_size)

    # Align body texts in a workable, index-addressable format.
    bodyText_list = list(body_dict.values())
    bodyIds_index = {k: index for index, k in enumerate(body_dict.keys())}

    unrelated, related, y_true, y_pred = [], [], [], []
    for headline, bodyID, stance in train_data:
        bodyText = bodyText_list[bodyIds_index[bodyID]]
        # float('inf') instead of the arbitrary sentinel 99999, which could
        # silently mask genuinely larger distances (and stays the minimum
        # identity for min()). Also covers a body with no sentences.
        distance = float("inf")
        if type == "sentence":
            # Keep the smallest distance between the headline and any
            # single sentence of the body.
            for sentence in text2sent(bodyText):
                distance = min(distance,
                               abs(computeAverageWMD(embeddings,
                                                     headline, sentence)))
        elif type == "wholeText":
            distance = abs(computeAverageWMD(embeddings, headline, bodyText))

        unrelated, related, y_true, y_pred = create_lists_distance_based(
            distance, stance, threshold,
            [unrelated, related, y_true, y_pred])

    print_results_distance_based([unrelated, related, y_true, y_pred],
                                 self.model_type)
def load_embeddings(headlines, bodies):
    """Load the pre-trained GoogleNews word2vec embeddings.

    The `headlines` and `bodies` arguments are accepted for signature
    compatibility but are not used here.

    :return: tuple (embedding_size, embeddings)
    """
    # Fixed parameters of the GoogleNews-vectors-negative300 model.
    embedding_size = 300
    vocab_size = 3000000

    # Resolve the embedding file relative to the project root
    # (three directories above this module).
    project_root = path.dirname(path.dirname(path.dirname(path.abspath(__file__))))
    embedd_path = "%s/data/embeddings/google_news/GoogleNews-vectors-negative300.bin.gz" % (project_root)
    embedd_data = path.normpath("%s/data/" % (path.dirname(path.abspath(embedd_path))))

    embeddings = LoadEmbeddings(filepath=embedd_path,
                                data_path=embedd_data,
                                vocab_size=vocab_size,
                                embedding_size=embedding_size,
                                binary_val=True)
    return embedding_size, embeddings
# NOTE(review): the lines below are the tail of avg_feature_vector(), whose
# definition starts outside this view — indentation reconstructed, confirm
# against the full file.
            featureVec = np.add(featureVec, model.word2embedd(word))
        else:
            # Out-of-vocabulary words fall back to the "unknown" embedding.
            featureVec = np.add(featureVec, model.word2embedd(u"unknown"))
    if(nwords>0):
        # Average the summed word vectors.
        featureVec = np.divide(featureVec, nwords)
    return featureVec


def avg_embedding_similarity(embeddings, embedding_size, sent1, sent2):
    """Return the cosine similarity (1 - cosine distance) between the
    averaged embedding vectors of two sentences."""
    #print("Calculating similarity for: " + sent1 + "\n and\n" + sent2)
    v1 = avg_feature_vector(sent1, model=embeddings, num_features=embedding_size)
    v2 = avg_feature_vector(sent2, model=embeddings, num_features=embedding_size)
    cosine_distance = spatial.distance.cosine(v1, v2)
    score = 1 - cosine_distance
    #print("Score = " + str(score))
    return score


if __name__ == "__main__":
    # Manual smoke test: similarity of two paraphrases against the
    # GoogleNews word2vec vectors.
    sent1 = "United States of America"
    sent2 = "USA"

    data_path = myConstants.BASE_DIR + "/data/embeddings"
    embeddPath = os.path.normpath("%s/google_news/GoogleNews-vectors-negative300.bin.gz" % (data_path))
    embeddData = os.path.normpath("%s/google_news/data/" % (data_path))
    vocab_size = 3000000
    embedding_size = 300

    embeddings = LoadEmbeddings(filepath=embeddPath, data_path=embeddData,
                                vocab_size=vocab_size,
                                embedding_size=embedding_size)
    score = avg_embedding_similarity(embeddings, embedding_size, sent1, sent2)
    print(score)