import pickle

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def get_feature(self):
    ds = DataSet(self.name, self.path)
    data = ds.preprocess(self.lemmatize, self.remove_stop, self.remove_punc, self.sent)

    # Build the shared vocabulary and the combined headline+body documents
    # used to fit a single TF-IDF vectorizer over both text fields.
    vocabulary = set()
    headline_body = []
    for i, row in data.iterrows():
        vocabulary.update(row['Headline'].split(' '))
        vocabulary.update(row['Body'].split(' '))
        headline_body.append(row['Headline'] + ' ' + row['Body'])

    headlines = data.Headline.to_numpy()
    bodies = data.Body.to_numpy()

    if self.name == 'train':
        # Fit on the training split and persist the vectorizer so the test
        # split is transformed with the same vocabulary and IDF weights.
        vectorizer = TfidfVectorizer(vocabulary=vocabulary)
        vectorizer.fit(headline_body)
        with open('../feature_files/headline_body_tfidf_vectorizer.pkl', 'wb') as f:
            pickle.dump(vectorizer, f)
    else:
        with open('../feature_files/headline_body_tfidf_vectorizer.pkl', 'rb') as f:
            vectorizer = pickle.load(f)

    headline_TF_IDF = vectorizer.transform(headlines)
    body_TF_IDF = vectorizer.transform(bodies)

    # One feature per headline/body pair: the cosine similarity of the
    # two TF-IDF vectors.
    features = []
    for h, b in zip(headline_TF_IDF, body_TF_IDF):
        features.append(cosine_similarity(h, b)[0][0])
    return np.array(features)
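# Minimal standalone sketch of the similarity feature above, using only
# scikit-learn; the example texts are illustrative.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vec = TfidfVectorizer().fit(["police find mass graves", "graves found by police"])
h = vec.transform(["police find mass graves"])
b = vec.transform(["graves found by police near the border"])
print(cosine_similarity(h, b)[0][0])  # scalar in [0, 1]; higher = more shared vocabulary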
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer


def get_feature(self):
    ds = DataSet(path=self.path, name=self.name)
    data = ds.preprocess(self.lemmatize, self.remove_stop, self.remove_punc, self.sent)
    sid = SentimentIntensityAnalyzer()

    def get_sentiment(d):
        # VADER returns {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...}.
        return list(sid.polarity_scores(d).values())

    # Four scores for the headline followed by four for the body: an
    # 8-dimensional feature vector per pair.
    sentiments = []
    for index, row in data.iterrows():
        headline_sentiment = get_sentiment(row['Headline'])
        body_sentiment = get_sentiment(row['Body'])
        sentiments.append(headline_sentiment + body_sentiment)
    return np.array(sentiments)
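# Quick check of the VADER output concatenated above (requires NLTK's
# 'vader_lexicon' resource: nltk.download('vader_lexicon')).
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
scores = sid.polarity_scores("The report was a complete hoax.")
print(sorted(scores))  # ['compound', 'neg', 'neu', 'pos'] -> 4 scores per text, 8 per pair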
import numpy as np

# Feature modules from this repo; exact import paths depend on the
# package layout.
import baseline_features
import cosine_similarity
import cue_words
import n_gram_matching
import sentiment_feature
import SVD
import TFIDF


def __init__(self, modelInstance):
    self.model = modelInstance
    features = [
        cosine_similarity.CosineSimilarity(),
        n_gram_matching.NGramMatching(),
        sentiment_feature.SentimentFeature(),
        SVD.SVD(),
        TFIDF.TFIDF(),
        baseline_features.BaselineFeature(),
        cue_words.CueWords(),
    ]
    # Each feature's read() returns one row per headline/body pair;
    # hstack concatenates the blocks column-wise into one design matrix.
    self.features_train = np.hstack([feature.read() for feature in features])
    self.labels_train = DataSet(path="../FNC-1").get_labels()
    self.features_test = np.hstack(
        [feature.read('competition_test') for feature in features])
    self.labels_test = DataSet(path="../FNC-1",
                               name="competition_test").get_labels()
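# Hypothetical usage sketch: the constructor above is assumed to live on a
# classifier wrapper (the name StanceClassifier is illustrative, not from
# the repo) around any scikit-learn-style estimator.
from sklearn.ensemble import GradientBoostingClassifier

clf = StanceClassifier(GradientBoostingClassifier())  # assumed class name
clf.model.fit(clf.features_train, clf.labels_train)
print(clf.model.score(clf.features_test, clf.labels_test))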
import numpy as np


def get_feature(self):
    ds = DataSet(path=self.path, name=self.name)
    data = ds.preprocess(self.lemmatize, self.remove_stop, self.remove_punc)
    cue_words_list = self.get_cue_words()

    # One binary indicator per cue word for the headline, then one per
    # cue word for the body. Matching against token sets rather than raw
    # substrings avoids false hits such as 'no' inside 'know'.
    X = []
    for index, row in data.iterrows():
        headline_tokens = set(row['Headline'].split())
        body_tokens = set(row['Body'].split())
        X_row = [1 if word in headline_tokens else 0 for word in cue_words_list]
        X_row += [1 if word in body_tokens else 0 for word in cue_words_list]
        X.append(X_row)
    return np.array(X)
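# Tiny illustration of the indicator layout, assuming a two-word cue list.
cues = ["reportedly", "hoax"]
headline, body = "article reportedly fake", "officials deny hoax claims"
row = ([int(w in headline.split()) for w in cues]
       + [int(w in body.split()) for w in cues])
print(row)  # [1, 0, 0, 1]: headline indicators first, then body indicators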
import numpy as np


def nGramMatching(self):
    ds = DataSet(path=self.path, name=self.name)
    data = ds.preprocess(self.lemmatize, self.remove_stop, self.remove_punc)
    idf = self.getIDF(data["Body"].to_numpy())

    features = []
    for index, row in data.iterrows():
        # Collect the distinct 1- to 5-grams of the headline (H) and body (A).
        H = []
        A = []
        for n in range(1, 6):
            H_ngram = self.get_ngram(n, row['Headline'])
            A_ngram = self.get_ngram(n, row["Body"])
            H.extend(H_ngram.keys())
            A.extend(A_ngram.keys())

        # TF-IDF-weighted n-gram overlap: every headline n-gram h contributes
        # its combined frequency in H and A, scaled by its length and IDF.
        total = 0
        for h in H:
            TF_h = (H.count(h) + A.count(h)) * len(h)
            idf_h = idf.get(" ".join(h), 0)
            total += TF_h * idf_h
        score = total / (len(H) + len(A))
        features.append(score)
    return np.array(features).reshape(-1, 1)
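# As far as the code shows, the per-pair score computed above is
#
#     score(H, A) = sum_{h in H} (c_H(h) + c_A(h)) * |h| * idf(h)
#                   ---------------------------------------------
#                                   |H| + |A|
#
# where H and A are the 1..5-gram lists of headline and body, c_X(h)
# counts h in list X, |h| is the n-gram length, and idf(h) is looked up
# from the body corpus (0 for unseen n-grams).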
import numpy as np

# Feature functions from the official FNC-1 baseline's feature_engineering
# module; the exact import path depends on how the baseline is vendored.
from feature_engineering import (hand_features, polarity_features,
                                 refuting_features, word_overlap_features)


def get_feature(self):
    dataset = DataSet(path=self.path, name=self.name)

    # Pair each stance's headline with the referenced article body.
    h, b = [], []
    for stance in dataset.stances:
        h.append(stance['Headline'])
        b.append(dataset.articles[stance['Body ID']])

    X_overlap = word_overlap_features(h, b)
    X_refuting = refuting_features(h, b)
    X_polarity = polarity_features(h, b)
    X_hand = hand_features(h, b)

    # Concatenate column-wise in the baseline's original feature order.
    X = np.c_[X_hand, X_polarity, X_refuting, X_overlap]
    return X
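# np.c_ concatenates the per-pair feature blocks column-wise; a tiny
# shape check with dummy blocks:
import numpy as np

a = np.zeros((3, 2))
c = np.ones((3, 1))
print(np.c_[a, c].shape)  # (3, 3): rows (pairs) preserved, feature columns appended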