예제 #1
0
    def get_feature(self):
        """Cosine similarity between headline and body TF-IDF vectors.

        Builds (or loads) a TF-IDF vectorizer fitted on concatenated
        headline+body text, transforms headlines and bodies separately,
        and returns one cosine-similarity score per row.

        Returns:
            np.ndarray of shape (n_rows,) with values in [0, 1].
        """
        ds = DataSet(self.name, self.path)
        data = ds.preprocess(self.lemmatize, self.remove_stop, self.remove_punc, self.sent)

        # Shared vocabulary over headlines and bodies so both are projected
        # into the same TF-IDF space.
        vocabulary = set()
        headline_body = []
        for _, row in data.iterrows():
            vocabulary.update(row['Headline'].split(' '))
            vocabulary.update(row['Body'].split(' '))
            headline_body.append(row['Headline'] + ' ' + row['Body'])
        headlines = data.Headline.to_numpy()
        bodies = data.Body.to_numpy()

        if self.name == 'train':
            # Fit on the training split and persist the fitted vectorizer so
            # the other splits reuse the same feature space.
            vectorizer = TfidfVectorizer(vocabulary=vocabulary)
            headline_body_TF_IDF = vectorizer.fit(headline_body)
            with open('../feature_files/headline_body_tfidf_vectorizer.pkl', 'wb') as f:
                pickle.dump(headline_body_TF_IDF, f)
        else:
            # Fix: use a context manager — the original pickle.load(open(...))
            # never closed the file handle.
            with open('../feature_files/headline_body_tfidf_vectorizer.pkl', 'rb') as f:
                headline_body_TF_IDF = pickle.load(f)
        # Both branches transformed identically; hoisted out of the if/else.
        headline_TF_IDF = headline_body_TF_IDF.transform(headlines)
        body_TF_IDF = headline_body_TF_IDF.transform(bodies)

        # One similarity score per (headline, body) pair.
        features = [
            sklearn.metrics.pairwise.cosine_similarity(h, b)[0][0]
            for h, b in zip(headline_TF_IDF, body_TF_IDF)
        ]
        return np.array(features)
예제 #2
0
    def get_feature(self):
        """VADER sentiment scores for headline and body of every row.

        Returns:
            np.ndarray of shape (n_rows, 8): the four polarity scores
            (neg/neu/pos/compound) of the headline followed by the four
            scores of the body.
        """
        def get_sentiment(d):
            # polarity_scores returns a 4-entry dict: neg, neu, pos, compound.
            return list(sid.polarity_scores(d).values())

        ds = DataSet(path=self.path, name=self.name)
        data = ds.preprocess(self.lemmatize, self.remove_stop,
                             self.remove_punc, self.sent)
        sid = SentimentIntensityAnalyzer()
        sentiments = []
        for _, row in data.iterrows():
            # Fix: append the 8-element concatenation directly; the original
            # wrapped it in an extra list, producing an (n, 1, 8) array that
            # only worked because of the reshape below.
            sentiments.append(get_sentiment(row['Headline']) +
                              get_sentiment(row['Body']))
        # reshape keeps an explicit (n, 8) contract even for empty input.
        return np.array(sentiments).reshape(-1, 8)
예제 #3
0
 def __init__(self, modelInstance):
     """Attach the model and load cached train/test features and labels.

     Each feature extractor's precomputed columns are read from disk and
     horizontally stacked into one design matrix per split.
     """
     self.model = modelInstance
     extractors = [
         cosine_similarity.CosineSimilarity(),
         n_gram_matching.NGramMatching(),
         sentiment_feature.SentimentFeature(),
         SVD.SVD(),
         TFIDF.TFIDF(),
         baseline_features.BaselineFeature(),
         cue_words.CueWords(),
     ]
     # Stack each extractor's cached columns side by side, per split.
     train_columns = [extractor.read() for extractor in extractors]
     self.features_train = np.hstack(train_columns)
     self.labels_train = DataSet(path="../FNC-1").get_labels()
     test_columns = [extractor.read('competition_test') for extractor in extractors]
     self.features_test = np.hstack(test_columns)
     self.labels_test = DataSet(path="../FNC-1",
                                name="competition_test").get_labels()
예제 #4
0
 def get_feature(self):
     """Binary cue-word presence indicators for headline and body.

     Returns:
         np.ndarray of shape (n_rows, 2 * len(cue_words)): for each row,
         one 0/1 flag per cue word found in the headline, followed by one
         per cue word found in the body.
     """
     ds = DataSet(path=self.path, name=self.name)
     data = ds.preprocess(self.lemmatize, self.remove_stop,
                          self.remove_punc)
     cue_words_list = self.get_cue_words()
     X = []
     for _, row in data.iterrows():
         headline, body = row['Headline'], row['Body']
         # NOTE(review): `word in text` is a substring test, so e.g. "war"
         # also matches "warm" — confirm token-level matching wasn't intended.
         # Idiom fix: int(bool) comprehensions replace the manual 1/0 loops.
         row_flags = [int(word in headline) for word in cue_words_list]
         row_flags += [int(word in body) for word in cue_words_list]
         X.append(row_flags)
     return np.array(X)
예제 #5
0
 def nGramMathing(self):
     """Score each headline/body pair by IDF-weighted n-gram overlap.

     For n = 1..5 the n-grams of the headline (H) and body (A) are
     collected; each headline n-gram h contributes
     (count of h in H + count of h in A) * len(h) * idf(h), and the total
     is normalised by len(H) + len(A).

     Returns:
         np.ndarray of shape (n_rows, 1) with one overlap score per row.
     """
     from collections import Counter

     ds = DataSet(path=self.path, name=self.name)
     data = ds.preprocess(self.lemmatize, self.remove_stop,
                          self.remove_punc)
     idf = self.getIDF(data["Body"].to_numpy())
     features = []
     for _, row in data.iterrows():
         H = []
         A = []
         for n in range(1, 6):
             H.extend(self.get_ngram(n, row['Headline']).keys())
             A.extend(self.get_ngram(n, row["Body"]).keys())
         # Perf fix: precompute counts once instead of calling list.count()
         # inside the loop, which made the inner pass O(len(H)^2).
         h_counts = Counter(H)
         a_counts = Counter(A)
         # Fix: renamed accumulator — the original shadowed builtin `sum`.
         score = 0
         for h in H:
             tf_hi = (h_counts[h] + a_counts[h]) * len(h)
             score += tf_hi * idf.get(" ".join(h), 0)
         features.append(score / (len(H) + len(A)))
     return np.array(features).reshape(-1, 1)
예제 #6
0
    def get_feature(self):
        """Assemble the FNC-1 baseline feature set for every stance pair.

        Pairs each stance's headline with its referenced article body, then
        column-stacks the four baseline feature groups.
        """
        dataset = DataSet(path=self.path, name=self.name)
        # Resolve every stance to its (headline, article body) pair.
        headlines = [stance['Headline'] for stance in dataset.stances]
        bodies = [dataset.articles[stance['Body ID']] for stance in dataset.stances]

        overlap = word_overlap_features(headlines, bodies)
        refuting = refuting_features(headlines, bodies)
        polarity = polarity_features(headlines, bodies)
        hand = hand_features(headlines, bodies)

        # Same column order as the original baseline: hand, polarity,
        # refuting, overlap.
        return np.c_[hand, polarity, refuting, overlap]