Пример #1
0
 def tfidf_sim(self, train_data, body_dict, threshold):
     '''
     Rate every headline/body pair by the cosine similarity of their TF-IDF vectors.

     The vocabulary and the TF-IDF model are built from the tokenized body
     texts only; each headline is converted to a bag-of-words over that same
     vocabulary before being weighted by the model.

     :param
     train_data : iterable of (headline, bodyID, stance) samples
     body_dict : dictionary mapping bodyID to body text
     threshold : cut-off forwarded to create_lists together with the similarity
     '''
     # Tokenize each body once and remember its position, so a sample's
     # bodyID can later be mapped to the matching bag-of-words.
     position_of = {}
     tokenized_bodies = []
     for position, (body_id, body_text) in enumerate(body_dict.items()):
         position_of[body_id] = position
         tokenized_bodies.append(sent2stokens_wostop(body_text))

     vocab = corpora.Dictionary(tokenized_bodies)
     body_bows = [vocab.doc2bow(tokens) for tokens in tokenized_bodies]
     tfidf_model = models.TfidfModel(body_bows)

     unrelated, related, y_true, y_pred = [], [], [], []
     for headline, bodyID, stance in train_data:
         headline_tfidf = tfidf_model[vocab.doc2bow(sent2stokens_wostop(headline))]
         body_tfidf = tfidf_model[body_bows[position_of[bodyID]]]

         # create_lists receives the current accumulators and returns the updated ones.
         unrelated, related, y_true, y_pred = create_lists(
             cossim(headline_tfidf, body_tfidf), stance, threshold,
             [unrelated, related, y_true, y_pred])

     print_results([unrelated, related, y_true, y_pred], self.model_type)
Пример #2
0
    def avg_embed(self, train_data, body_dict, threshold):
        '''
        Compare each headline with its body through avg_embedding_similarity.

        Headline and body are tokenized with sent2stokens_wostop and re-joined
        with single spaces before being handed to the similarity helper.

        :param
        train_data : iterable of (headline, bodyID, stance) samples
        body_dict : dictionary mapping bodyID to body text
        threshold : cut-off forwarded to create_lists together with the similarity
        '''
        embeddings = LoadEmbeddings(
            filepath=self.embeddPath, data_path=self.embeddData,
            vocab_size=self.vocab_size, embedding_size=self.embedding_size)

        # Tokenize every body once up front; samples find theirs by position.
        position_of = {}
        tokenized_bodies = []
        for position, (body_id, body_text) in enumerate(body_dict.items()):
            position_of[body_id] = position
            tokenized_bodies.append(sent2stokens_wostop(body_text))

        unrelated, related, y_true, y_pred = [], [], [], []

        for headline, bodyID, stance in train_data:
            headline_tokens = sent2stokens_wostop(headline)
            body_tokens = tokenized_bodies[position_of[bodyID]]

            headline_text = ' '.join(headline_tokens)
            body_text = ' '.join(body_tokens)
            sim = avg_embedding_similarity(
                embeddings, self.embedding_size, headline_text, body_text)

            # create_lists receives the current accumulators and returns the updated ones.
            unrelated, related, y_true, y_pred = create_lists(
                sim, stance, threshold, [unrelated, related, y_true, y_pred])

        print_results([unrelated, related, y_true, y_pred], self.model_type)
Пример #3
0
    def doc2vec_similarity_max(self, train_data, body_dict, threshold):
        '''
        Rate each headline by the body sentence it resembles most.

        The body is split into sentences with text2sent, the raw headline is
        compared with every sentence through avg_embedding_similarity, and the
        largest value seen (starting from 0) is handed to create_lists.

        :param
        train_data : iterable of (headline, bodyID, stance) samples
        body_dict : dictionary mapping bodyID to body text
        threshold : cut-off forwarded to create_lists; also used for the
                    related/unrelated label written to the log
        '''
        size_note = ('Load embeddings: Vocab-Size: ' + str(self.vocab_size) +
                     ' Embedding size: ' + str(self.embedding_size))
        logging.info(size_note)

        embeddings = LoadEmbeddings(
            filepath=self.embeddPath, data_path=self.embeddData,
            vocab_size=self.vocab_size, embedding_size=self.embedding_size)

        # Bodies are addressed by their position in the dictionary's iteration order.
        bodies = list(body_dict.values())
        slot_of = {body_id: slot for slot, body_id in enumerate(body_dict.keys())}

        unrelated, related, y_true, y_pred = [], [], [], []

        for headline, bodyID, stance in train_data:
            logging.info("Headline: " + headline)

            # Headline and sentences are passed as plain text; per the original
            # author's note, avg_embedding_similarity does its own
            # tokenizing/lemmatizing, so no preprocessing happens here.
            candidates = [0]
            for sentence in text2sent(bodies[slot_of[bodyID]]):
                candidates.append(
                    avg_embedding_similarity(embeddings, self.embedding_size,
                                             headline, sentence))

            # The best sentence-level similarity decides the sample.
            best = max(candidates)

            unrelated, related, y_true, y_pred = create_lists(
                best, stance, threshold, [unrelated, related, y_true, y_pred])

            # Logged only to allow manual cross-checks of the decision.
            verdict = "unrelated" if best <= threshold else "related"
            logging.info(
                "Best score for this headline - sentence similarity: " +
                str(best))
            logging.info("Real/calculated stance: " + stance + " / " + verdict)

        print_results([unrelated, related, y_true, y_pred], self.model_type)
Пример #4
0
    def doc2vec_similarity_max(self, train_data, body_dict, threshold):
        '''
        Score a headline against its body using the best-matching sentence.

        Every sentence produced by text2sent for the body is compared with the
        raw headline via avg_embedding_similarity; the maximum of those values
        and the starting score 0 is passed on to create_lists.

        :param
        train_data : iterable of (headline, bodyID, stance) samples
        body_dict : dictionary mapping bodyID to body text
        threshold : cut-off forwarded to create_lists; also used for the
                    related/unrelated label written to the log
        '''
        vocab_part = 'Load embeddings: Vocab-Size: ' + str(self.vocab_size)
        logging.info(vocab_part + ' Embedding size: ' + str(self.embedding_size))

        embeddings = LoadEmbeddings(filepath=self.embeddPath,
                                    data_path=self.embeddData,
                                    vocab_size=self.vocab_size,
                                    embedding_size=self.embedding_size)

        # Positional lookup: body ID -> index into the list of body texts.
        body_texts = list(body_dict.values())
        index_of = {}
        for index, body_id in enumerate(body_dict.keys()):
            index_of[body_id] = index

        unrelated, related, y_true, y_pred = [], [], [], []

        for headline, bodyID, stance in train_data:
            logging.info("Headline: " + headline)
            sentences = text2sent(body_texts[index_of[bodyID]])

            # Texts are compared as-is; per the original author's note,
            # avg_embedding_similarity tokenizes and lemmatizes internally.
            sentence_scores = [
                avg_embedding_similarity(embeddings, self.embedding_size, headline, sentence)
                for sentence in sentences
            ]
            # Keep the highest similarity, never dropping below the initial 0.
            top_score = max([0] + sentence_scores)

            unrelated, related, y_true, y_pred = create_lists(
                top_score, stance, threshold, [unrelated, related, y_true, y_pred])

            # Logged only to allow manual cross-checks of the decision.
            if top_score <= threshold:
                guessed_stance = "unrelated"
            else:
                guessed_stance = "related"

            logging.info("Best score for this headline - sentence similarity: " + str(top_score))
            logging.info("Real/calculated stance: " + stance + " / " + guessed_stance)

        # TODO: correctly write and evaluate the results
        print_results([unrelated, related, y_true, y_pred], self.model_type)
Пример #5
0
    def sdm_sim(self, train_data, body_dict, threshold):
        '''
        Score headline/body pairs with the Cortical.io Retina compare call.

        For every sample the headline and its body text are sent to
        FullClient.compare; the cosineSimilarity of the response is passed to
        create_lists together with the stance and threshold. A running count
        is printed after every 100 processed samples, and the collected
        results are reported through print_results.

        :param
        train_data : a list of training samples of type ['headline', 'bodyID', 'stance']
        body_dict : a dictionary of values containing {bodyID:'bodyText'}
        threshold : used to distinguish between similar and not similar
        '''
        import json
        import retinasdk

        # NOTE(review): the API key is hard-coded in source; consider loading
        # it from configuration instead.
        fullClient = retinasdk.FullClient(
            "e8bf8de0-fe52-11e6-b22d-93a4ae922ff1",
            apiServer="http://api.cortical.io/rest",
            retinaName="en_associative")

        # Materialize the values: on Python 3 a dict view cannot be indexed.
        bodyText_list = list(body_dict.values())
        bodyIds_index = {k: index for index, k in enumerate(body_dict.keys())}

        unrelated, related, y_true, y_pred = [], [], [], []
        for processed, (headline, bodyID, stance) in enumerate(train_data, start=1):
            # Serialize with json.dumps so quotes, backslashes and newlines in
            # the texts are escaped instead of corrupting the request body.
            payload = json.dumps([
                {"text": headline},
                {"text": bodyText_list[bodyIds_index[bodyID]]},
            ])
            comparison = fullClient.compare(payload)
            # Alternatives noted by the original author: the jaccardDistance
            # metric, or comparing the texts after sent2stokens_wostop.
            sim = comparison.cosineSimilarity

            unrelated, related, y_true, y_pred = create_lists(
                sim, stance, threshold, [unrelated, related, y_true, y_pred])

            # keep track of the processed examples
            if processed % 100 == 0:
                print(processed)

        print_results([unrelated, related, y_true, y_pred], self.model_type)
Пример #6
0
    def avg_embed(self, train_data, body_dict, threshold):
        '''
        Score headline/body pairs with avg_embedding_similarity.

        Each text is tokenized by sent2stokens_wostop and the tokens are joined
        back with single spaces before the similarity helper is called.

        :param
        train_data : iterable of (headline, bodyID, stance) samples
        body_dict : dictionary mapping bodyID to body text
        threshold : cut-off forwarded to create_lists together with the similarity
        '''
        embeddings = LoadEmbeddings(filepath=self.embeddPath,
                                    data_path=self.embeddData,
                                    vocab_size=self.vocab_size,
                                    embedding_size=self.embedding_size)

        # Body tokens are computed once; slot_of maps a body ID to its list slot.
        tokens_per_body = [sent2stokens_wostop(text) for text in body_dict.values()]
        slot_of = {}
        for slot, body_id in enumerate(body_dict.keys()):
            slot_of[body_id] = slot

        unrelated, related, y_true, y_pred = [], [], [], []

        for headline, bodyID, stance in train_data:
            headline_tokens = sent2stokens_wostop(headline)
            body_tokens = tokens_per_body[slot_of[bodyID]]

            sim = avg_embedding_similarity(
                embeddings,
                self.embedding_size,
                ' '.join(headline_tokens),
                ' '.join(body_tokens),
            )

            # create_lists receives the current accumulators and returns the updated ones.
            accumulators = [unrelated, related, y_true, y_pred]
            unrelated, related, y_true, y_pred = create_lists(
                sim, stance, threshold, accumulators)

        print_results([unrelated, related, y_true, y_pred], self.model_type)
Пример #7
0
    def sdm_sim(self, train_data, body_dict, threshold):
        '''
        Score headline/body pairs with the Cortical.io Retina compare call.

        For every sample the headline and its body text are sent to
        FullClient.compare; the cosineSimilarity of the response is passed to
        create_lists together with the stance and threshold. A running count
        is printed after every 100 processed samples, and the collected
        results are reported through print_results.

        :param
        train_data : a list of training samples of type ['headline', 'bodyID', 'stance']
        body_dict : a dictionary of values containing {bodyID:'bodyText'}
        threshold : used to distinguish between similar and not similar
        '''
        import json
        import retinasdk

        # NOTE(review): the API key is hard-coded in source; consider loading
        # it from configuration instead.
        fullClient = retinasdk.FullClient("e8bf8de0-fe52-11e6-b22d-93a4ae922ff1", apiServer="http://api.cortical.io/rest", retinaName="en_associative")

        # Materialize the values: on Python 3 a dict view cannot be indexed.
        bodyText_list = list(body_dict.values())
        bodyIds_index = {k: index for index, k in enumerate(body_dict.keys())}

        unrelated, related, y_true, y_pred = [], [], [], []
        for processed, (headline, bodyID, stance) in enumerate(train_data, start=1):
            bodyText = bodyText_list[bodyIds_index[bodyID]]

            # Serialize with json.dumps so quotes, backslashes and newlines in
            # the texts are escaped instead of corrupting the request body.
            comp_with_stop_words = fullClient.compare(json.dumps([{"text": headline}, {"text": bodyText}]))
            # Alternatives noted by the original author: the jaccardDistance
            # metric, or comparing the texts after sent2stokens_wostop.
            sim = comp_with_stop_words.cosineSimilarity

            unrelated, related, y_true, y_pred = create_lists(sim, stance, threshold, [unrelated, related, y_true, y_pred])

            # keep track of the processed examples
            if processed % 100 == 0:
                print(processed)

        print_results([unrelated, related, y_true, y_pred], self.model_type)