def tfidf_sim(self, train_data, body_dict, threshold):
    '''
    Label headline/body pairs as related or unrelated using TF-IDF
    cosine similarity over stopword-free tokens.

    :param
    train_data : a list of training samples of type ['headline', 'bodyID', 'stance']
    body_dict : a dictionary of values containing {bodyID:'bodyText'}
    threshold : used distinguish between similar and not similar
    '''
    # Bodies are addressed by position; remember each bodyID's position.
    body_texts = list(body_dict.values())
    position_of = {body_id: pos for pos, body_id in enumerate(body_dict.keys())}
    tokenized_bodies = [sent2stokens_wostop(text) for text in body_texts]

    # Fit the TF-IDF model on the body corpus only.
    vocab = corpora.Dictionary(tokenized_bodies)
    body_bows = [vocab.doc2bow(tokens) for tokens in tokenized_bodies]
    tfidf_model = models.TfidfModel(body_bows)

    unrelated, related, y_true, y_pred = [], [], [], []
    for headline, bodyID, stance in train_data:
        headline_tfidf = tfidf_model[vocab.doc2bow(sent2stokens_wostop(headline))]
        body_tfidf = tfidf_model[body_bows[position_of[bodyID]]]
        sim = cossim(headline_tfidf, body_tfidf)
        unrelated, related, y_true, y_pred = create_lists(
            sim, stance, threshold, [unrelated, related, y_true, y_pred])

    print_results([unrelated, related, y_true, y_pred], self.model_type)
def avg_embed(self, train_data, body_dict, threshold):
    '''
    Label headline/body pairs as related or unrelated using the
    similarity of averaged word embeddings.

    :param
    train_data : a list of training samples of type ['headline', 'bodyID', 'stance']
    body_dict : a dictionary of values containing {bodyID:'bodyText'}
    threshold : used distinguish between similar and not similar
    '''
    embeddings = LoadEmbeddings(filepath=self.embeddPath,
                                data_path=self.embeddData,
                                vocab_size=self.vocab_size,
                                embedding_size=self.embedding_size)

    # Tokenize every body once up front; bodies are looked up by the
    # position their ID holds in the dict's iteration order.
    position_of = {}
    tokenized_bodies = []
    for pos, (body_id, text) in enumerate(body_dict.items()):
        position_of[body_id] = pos
        tokenized_bodies.append(sent2stokens_wostop(text))

    unrelated, related, y_true, y_pred = [], [], [], []
    for headline, bodyID, stance in train_data:
        headline_tokens = sent2stokens_wostop(headline)
        body_tokens = tokenized_bodies[position_of[bodyID]]
        sim = avg_embedding_similarity(embeddings, self.embedding_size,
                                       ' '.join(headline_tokens),
                                       ' '.join(body_tokens))
        unrelated, related, y_true, y_pred = create_lists(
            sim, stance, threshold, [unrelated, related, y_true, y_pred])

    print_results([unrelated, related, y_true, y_pred], self.model_type)
def doc2vec_similarity_max(self, train_data, body_dict, threshold):
    '''
    Label each headline/body pair using the maximum embedding similarity
    between the headline and any single sentence of the body.

    :param
    train_data : a list of training samples of type ['headline', 'bodyID', 'stance']
    body_dict : a dictionary of values containing {bodyID:'bodyText'}
    threshold : used distinguish between similar and not similar
    '''
    # Load embeddings
    logging.info('Load embeddings: Vocab-Size: ' + str(self.vocab_size) +
                 ' Embedding size: ' + str(self.embedding_size))
    embeddings = LoadEmbeddings(filepath=self.embeddPath,
                                data_path=self.embeddData,
                                vocab_size=self.vocab_size,
                                embedding_size=self.embedding_size)

    # Bodies are addressed by the position their ID holds in the dict.
    body_texts = list(body_dict.values())
    position_of = {body_id: pos for pos, body_id in enumerate(body_dict.keys())}

    unrelated, related, y_true, y_pred = [], [], [], []
    for headline, bodyID, stance in train_data:
        logging.info("Headline: " + headline)
        sentences = text2sent(body_texts[position_of[bodyID]])
        # Track the best headline-to-sentence similarity over the body.
        # avg_embedding_similarity tokenizes and lemmatizes internally, so
        # both texts are passed through untouched; the 0 start value also
        # covers bodies with no sentences.
        score = 0
        for sentence in sentences:
            sentence_sim = avg_embedding_similarity(
                embeddings, self.embedding_size, headline, sentence)
            score = max(score, sentence_sim)
        # Relate or un-relate the pair on the best sentence similarity.
        unrelated, related, y_true, y_pred = create_lists(
            score, stance, threshold, [unrelated, related, y_true, y_pred])
        # Manual cross-check of the thresholded decision.
        calculated_stance = "unrelated" if score <= threshold else "related"
        logging.info("Best score for this headline - sentence similarity: " + str(score))
        logging.info("Real/calculated stance: " + stance + " / " + calculated_stance)

    print_results([unrelated, related, y_true, y_pred], self.model_type)
def doc2vec_similarity_max(self, train_data, body_dict, threshold): ''' :param train_data : a list of training samples of type ['headline', 'bodyID', 'stance'] body_dict : a dictionary of values containing {bodyID:'bodyText'} threshold : used distinguish between similar and not similar ''' # Load embeddings logging.info('Load embeddings: Vocab-Size: ' + str(self.vocab_size) + ' Embedding size: ' + str(self.embedding_size)) embeddings = LoadEmbeddings(filepath=self.embeddPath, data_path=self.embeddData, vocab_size=self.vocab_size, embedding_size=self.embedding_size) # Align body-text in workable format bodyText_list = list(body_dict.values()) bodyIds_index = {k:index for index, k in enumerate(body_dict.keys())} unrelated, related, y_true, y_pred = [], [], [], [] sentence_list = [] for headline, bodyID, stance in train_data: logging.info("Headline: " + headline) score = 0 bodyText = bodyText_list[bodyIds_index[bodyID]] sentence_list = text2sent(bodyText) # logging.info("Bodytext: " + bodyText) for sentence in sentence_list: #logging.info("Sentence: " + sentence) # compare both sentences - vectors not necessary, since this procedure works with text # note: avg_embeddings_similarity tokenizes and lemmatizes the sentences prior to calculation, so no pre-assessment is necessary (Sentence to tokens without stopwords) temp_score = avg_embedding_similarity(embeddings, self.embedding_size, headline, sentence) #logging.info("Similarity: " + str(temp_score)) # store the highest similarity score score=max(score, temp_score) # asses headline - body as related or unrelated based on threshold, taken the highest similarity of sentences unrelated, related, y_true, y_pred = create_lists(score, stance, threshold, [unrelated, related, y_true, y_pred]) # following lines just for manual cross-checks if score <= threshold: calculated_stance = "unrelated" else: calculated_stance = "related" logging.info("Best score for this headline - sentence similarity: " + str(score)) 
logging.info("Real/calculated stance: " + stance + " / " + calculated_stance) # ToDo: Correctly write and evaluate the results print_results([unrelated, related, y_true, y_pred], self.model_type)
def sdm_sim(self, train_data, body_dict, threshold):
    '''
    Label headline/body pairs as related or unrelated using cortical.io
    semantic-fingerprint cosine similarity.

    :param
    train_data : a list of training samples of type ['headline', 'bodyID', 'stance']
    body_dict : a dictionary of values containing {bodyID:'bodyText'}
    threshold : used distinguish between similar and not similar
    '''
    import json
    import retinasdk
    # SECURITY NOTE(review): the API key is hard-coded; move it to
    # configuration or an environment variable.
    fullClient = retinasdk.FullClient(
        "e8bf8de0-fe52-11e6-b22d-93a4ae922ff1",
        apiServer="http://api.cortical.io/rest",
        retinaName="en_associative")

    # BUG FIX: dict.values() is a non-indexable view in Python 3;
    # materialize it so bodyText_list[pos] works.
    bodyText_list = list(body_dict.values())
    bodyIds_index = dict((k, index) for index, k in enumerate(body_dict.keys()))

    unrelated, related, y_true, y_pred = [], [], [], []
    for processed, (headline, bodyID, stance) in enumerate(train_data, start=1):
        # Build the payload with json.dumps so quotes/backslashes inside
        # the texts are escaped correctly (string concatenation produced
        # invalid JSON for such inputs).
        payload = json.dumps([
            {"text": headline},
            {"text": bodyText_list[bodyIds_index[bodyID]]},
        ])
        comp_with_stop_words = fullClient.compare(payload)
        sim = comp_with_stop_words.cosineSimilarity
        # sim = comp_with_stop_words.jaccardDistance
        # comp_without_stop_words = fullClient.compare('[{"text": "'+' '.join(sent2stokens_wostop(headline))+'"}, {"text": "'+' '.join(sent2stokens_wostop(bodyText_list[bodyIds_index[bodyID]]))+'"}]')
        # sim = comp_without_stop_words.cosineSimilarity
        unrelated, related, y_true, y_pred = create_lists(
            sim, stance, threshold, [unrelated, related, y_true, y_pred])

        # keep track of the processed examples (the original two-counter
        # version printed one example late; this prints exactly every 100)
        if processed % 100 == 0:
            print(processed)

    print_results([unrelated, related, y_true, y_pred], self.model_type)
# NOTE(review): this is a second definition of avg_embed; in a class body it
# silently shadows the earlier one. Keep exactly one — TODO confirm which
# version is intended and delete the other.
def avg_embed(self, train_data, body_dict, threshold):
    '''
    Label headline/body pairs as related or unrelated using the similarity
    of averaged word embeddings.

    :param
    train_data : a list of training samples of type ['headline', 'bodyID', 'stance']
    body_dict : a dictionary of values containing {bodyID:'bodyText'}
    threshold : used distinguish between similar and not similar
    '''
    embeddings = LoadEmbeddings(filepath=self.embeddPath, data_path=self.embeddData, vocab_size=self.vocab_size, embedding_size=self.embedding_size)
    # Bodies are indexed by the position their ID holds in the dict's order.
    bodyText_list = list(body_dict.values())
    bodyIds_index = {k:index for index, k in enumerate(body_dict.keys())}
    # Pre-tokenize every body once (stopwords removed).
    bodyText_w = [sent2stokens_wostop(text) for text in bodyText_list]
    unrelated, related, y_true, y_pred = [], [], [], []
    for headline, bodyID, stance in train_data:
        headline_w = sent2stokens_wostop(headline)
        body_w = bodyText_w[bodyIds_index[bodyID]]
        # Tokens are re-joined because avg_embedding_similarity takes text.
        sim = avg_embedding_similarity(embeddings, self.embedding_size, ' '.join(headline_w), ' '.join(body_w))
        unrelated, related, y_true, y_pred = create_lists(sim, stance, threshold, [unrelated, related, y_true, y_pred])
    print_results([unrelated, related, y_true, y_pred], self.model_type)
def sdm_sim(self, train_data, body_dict, threshold):
    '''
    Label headline/body pairs as related or unrelated using cortical.io
    semantic-fingerprint cosine similarity.

    :param
    train_data : a list of training samples of type ['headline', 'bodyID', 'stance']
    body_dict : a dictionary of values containing {bodyID:'bodyText'}
    threshold : used distinguish between similar and not similar
    '''
    import json
    import retinasdk
    # SECURITY NOTE(review): the API key is hard-coded; move it to
    # configuration or an environment variable.
    fullClient = retinasdk.FullClient(
        "e8bf8de0-fe52-11e6-b22d-93a4ae922ff1",
        apiServer="http://api.cortical.io/rest",
        retinaName="en_associative")

    # BUG FIX: dict.values() is a non-indexable view in Python 3;
    # materialize it so bodyText_list[pos] works.
    bodyText_list = list(body_dict.values())
    bodyIds_index = dict((k, index) for index, k in enumerate(body_dict.keys()))

    unrelated, related, y_true, y_pred = [], [], [], []
    for processed, (headline, bodyID, stance) in enumerate(train_data, start=1):
        # Build the payload with json.dumps so quotes/backslashes inside
        # the texts are escaped correctly (string concatenation produced
        # invalid JSON for such inputs).
        payload = json.dumps([
            {"text": headline},
            {"text": bodyText_list[bodyIds_index[bodyID]]},
        ])
        comp_with_stop_words = fullClient.compare(payload)
        sim = comp_with_stop_words.cosineSimilarity
        # sim = comp_with_stop_words.jaccardDistance
        # comp_without_stop_words = fullClient.compare('[{"text": "'+' '.join(sent2stokens_wostop(headline))+'"}, {"text": "'+' '.join(sent2stokens_wostop(bodyText_list[bodyIds_index[bodyID]]))+'"}]')
        # sim = comp_without_stop_words.cosineSimilarity
        unrelated, related, y_true, y_pred = create_lists(
            sim, stance, threshold, [unrelated, related, y_true, y_pred])

        # keep track of the processed examples (the original two-counter
        # version printed one example late; this prints exactly every 100)
        if processed % 100 == 0:
            print(processed)

    print_results([unrelated, related, y_true, y_pred], self.model_type)