Example #1
    def tfidf_sim(self, train_data, body_dict, threshold):
        '''
        :param train_data: a list of training samples of the form [headline, bodyID, stance]
        :param body_dict: a dictionary mapping {bodyID: 'bodyText'}
        :param threshold: similarity cutoff used to distinguish similar from not similar
        '''
        bodyText_list = list(body_dict.values())
        bodyIds_index = {k: index for index, k in enumerate(body_dict.keys())}

        bodyText_w = [sent2stokens_wostop(text) for text in bodyText_list]

        # build a TF-IDF model over all tokenized body texts
        vocab = corpora.Dictionary(bodyText_w)
        corporaBody_bow = [vocab.doc2bow(text) for text in bodyText_w]
        tfidf_model = models.TfidfModel(corporaBody_bow)

        unrelated, related, y_true, y_pred = [], [], [], []
        for headline, bodyID, stance in train_data:
            headline_bow = vocab.doc2bow(sent2stokens_wostop(headline))

            headline_tfidf = tfidf_model[headline_bow]
            corporaBody_tfidf = tfidf_model[corporaBody_bow[bodyIds_index[bodyID]]]

            # cosine similarity between the TF-IDF vectors decides related/unrelated
            sim = cossim(headline_tfidf, corporaBody_tfidf)
            unrelated, related, y_true, y_pred = create_lists(
                sim, stance, threshold, [unrelated, related, y_true, y_pred])

        print_results([unrelated, related, y_true, y_pred], self.model_type)
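The core of tfidf_sim is gensim's TF-IDF weighting plus sparse cosine similarity. Below is a minimal self-contained sketch of that pipeline, assuming gensim is installed; the two-document corpus, the pre-tokenized lists standing in for sent2stokens_wostop, and the 0.1 threshold are illustrative assumptions, not values from the project.

from gensim import corpora, models
from gensim.matutils import cossim

# toy stand-ins for the tokenized body texts
bodies = [["police", "find", "mass", "graves"],
          ["stocks", "rally", "after", "earnings"]]

vocab = corpora.Dictionary(bodies)
body_bow = [vocab.doc2bow(tokens) for tokens in bodies]
tfidf = models.TfidfModel(body_bow)

# score one headline against the first body
headline_bow = vocab.doc2bow(["mass", "graves", "found"])
sim = cossim(tfidf[headline_bow], tfidf[body_bow[0]])
print(sim, "related" if sim > 0.1 else "unrelated")  # threshold is a free parameter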
Example #2
    def order_by_tf_id_rank(self, headline, sentences, number_of_sentences):
        headline_bow = self.vocab.doc2bow(sent2stokens_wostop(headline))
        headline_tfidf = self.tfidf_model[headline_bow]

        scored_sentences = []
        # Replace newlines with blanks, since the punkt tokenizer does not
        # recognize ".[newline]" as a sentence boundary.
        sentences = sentences.replace('\n', ' ')

        for sentence in self.tokenizer.tokenize(sentences):
            # weight the sentence with the same TF-IDF model as the headline
            sentence_tfidf = self.tfidf_model[self.vocab.doc2bow(sent2stokens_wostop(sentence))]
            sim = cossim(headline_tfidf, sentence_tfidf)
            scored_sentences.append([sentence, sim])

        # highest similarity first
        sorted_sentences = sorted(scored_sentences, key=lambda pair: pair[1], reverse=True)

        sentences_string = ""
        current_sentence_number = 0
        for sentence in sorted_sentences:
            current_sentence_number += 1
            sentences_string += sentence[0] + ' '
            if current_sentence_number == number_of_sentences:
                break
        #print("Ranked: \n " + sentences_string)
        return sentences_string
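order_by_tf_id_rank depends on instance state (self.vocab, self.tfidf_model, self.tokenizer) built elsewhere (see Examples #5 and #6). The same ranking idea as a self-contained sketch, assuming gensim and NLTK's punkt data are available; the regex tokenizer is a simplified stand-in for sent2stokens_wostop, and the text and headline are made up for illustration.

import re
import nltk
from gensim import corpora, models
from gensim.matutils import cossim

nltk.download("punkt", quiet=True)  # punkt sentence-boundary model
from nltk.tokenize import sent_tokenize

def tokens(text):
    # crude stand-in for sent2stokens_wostop: lowercase word characters only
    return re.findall(r"[a-z]+", text.lower())

text = ("Police found mass graves in the north. "
        "Stocks rallied after strong earnings. "
        "More graves were found near the city.")
headline = "Mass graves found near city"

sentences = sent_tokenize(text)
sentence_tokens = [tokens(s) for s in sentences]

vocab = corpora.Dictionary(sentence_tokens)
tfidf = models.TfidfModel([vocab.doc2bow(t) for t in sentence_tokens])

# rank sentences by TF-IDF cosine similarity to the headline
head_vec = tfidf[vocab.doc2bow(tokens(headline))]
ranked = sorted(sentences,
                key=lambda s: cossim(head_vec, tfidf[vocab.doc2bow(tokens(s))]),
                reverse=True)
print(" ".join(ranked[:2]))  # the two sentences most similar to the headline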
Example #3
    def avg_embed(self, train_data, body_dict, threshold):

        embeddings = LoadEmbeddings(filepath=self.embeddPath,
                                    data_path=self.embeddData,
                                    vocab_size=self.vocab_size,
                                    embedding_size=self.embedding_size)

        bodyText_list = list(body_dict.values())
        bodyIds_index = {k: index for index, k in enumerate(body_dict.keys())}

        bodyText_w = [sent2stokens_wostop(text) for text in bodyText_list]

        unrelated, related, y_true, y_pred = [], [], [], []

        for headline, bodyID, stance in train_data:
            headline_w = sent2stokens_wostop(headline)
            body_w = bodyText_w[bodyIds_index[bodyID]]

            sim = avg_embedding_similarity(embeddings, self.embedding_size,
                                           ' '.join(headline_w),
                                           ' '.join(body_w))

            unrelated, related, y_true, y_pred = create_lists(
                sim, stance, threshold, [unrelated, related, y_true, y_pred])

        print_results([unrelated, related, y_true, y_pred], self.model_type)
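avg_embed represents headline and body as the average of their word vectors and thresholds the cosine of the two averages. A self-contained sketch of that computation with numpy, using a toy lookup table in place of the project's LoadEmbeddings and avg_embedding_similarity; the vectors and the 0.5 cutoff are illustrative, not trained values.

import numpy as np

# toy word vectors standing in for the loaded embedding matrix
toy_vectors = {
    "mass":   np.array([0.9, 0.1, 0.0]),
    "graves": np.array([0.8, 0.2, 0.1]),
    "stocks": np.array([0.0, 0.9, 0.4]),
}

def avg_vector(tokens, size=3):
    # average the vectors of in-vocabulary tokens; zero vector if none match
    vecs = [toy_vectors[t] for t in tokens if t in toy_vectors]
    return np.mean(vecs, axis=0) if vecs else np.zeros(size)

def avg_embedding_sim(headline_tokens, body_tokens):
    a, b = avg_vector(headline_tokens), avg_vector(body_tokens)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(a @ b / denom) if denom else 0.0

sim = avg_embedding_sim(["mass", "graves"], ["graves", "stocks"])
print(sim, "related" if sim > 0.5 else "unrelated")  # threshold is a free parameter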
Example #5
    def generate_tf_idf_corpora(self):
        data_path = "%s/../data/fnc-1" % (path.dirname(path.dirname(path.abspath(__file__))))
        reader = CorpusReader(data_path)
        body_dict = reader.load_body("train_bodies.csv")
        bodyText_list = list(body_dict.values())

        bodyText_w = [sent2stokens_wostop(text) for text in bodyText_list]

        self.vocab = corpora.Dictionary(bodyText_w)
        corporaBody_bow = [self.vocab.doc2bow(text) for text in bodyText_w]
        self.tfidf_model = models.TfidfModel(corporaBody_bow)
Example #6
    def generate_tf_idf_corpora(self):
        # bodies come from the shared constants module rather than a CSV read
        body_dict = myConstants.d.articles
        bodyText_list = list(body_dict.values())

        bodyText_w = [sent2stokens_wostop(text) for text in bodyText_list]

        self.vocab = corpora.Dictionary(bodyText_w)
        corporaBody_bow = [self.vocab.doc2bow(text) for text in bodyText_w]
        self.tfidf_model = models.TfidfModel(corporaBody_bow)
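Both variants leave the fitted objects on the instance for order_by_tf_id_rank (Example #2) to reuse. A minimal sketch of the state they build, assuming gensim; the two-article body_dict and whitespace tokenization are stand-ins for the real corpus and sent2stokens_wostop.

from gensim import corpora, models

body_dict = {"b1": "police find mass graves",
             "b2": "stocks rally after earnings"}
tokenized = [text.split() for text in body_dict.values()]  # stand-in tokenizer

vocab = corpora.Dictionary(tokenized)               # becomes self.vocab
tfidf_model = models.TfidfModel(                    # becomes self.tfidf_model
    [vocab.doc2bow(tokens) for tokens in tokenized])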