def __init__(self, name="train", path="fnc-1"):
        """Load the FNC-1 stance/body CSV pair for the split *name*.

        Reads ``<name>_stances.csv`` and ``<name>_bodies.csv`` via
        ``self.read`` (defined elsewhere in this class — not visible here),
        normalises each stance's 'Body ID' to int, and indexes article
        bodies by integer Body ID in ``self.articles``.

        NOTE(review): this method appears scraped out of its enclosing
        class (the ``class`` header and ``self.read`` are not in view, and
        the indentation is mangled) — confirm against the original file.

        Args:
            name: dataset split prefix, e.g. "train" or "competition_test".
            path: directory holding the CSVs; stored on ``self.path``.
        """
        params = parse_params()
        self.path = path

        print("Reading dataset")
        bodies = name + "_bodies.csv"
        stances = name + "_stances.csv"

        self.stances = self.read(stances)
        articles = self.read(bodies)
        self.articles = dict()

        # Make the body ID an integer value.  In 2-class mode, collapse
        # every non-'unrelated' stance to 'related' — except for the
        # competition_test split, which keeps its original 4-way labels.
        for s in self.stances:
            s['Body ID'] = int(s['Body ID'])
            if params.run_2_class and s[
                    'Stance'] != 'unrelated' and name != 'competition_test':
                s['Stance'] = 'related'

        # Copy all bodies into a dictionary keyed by integer Body ID.
        for article in articles:
            self.articles[int(article['Body ID'])] = article['articleBody']

        print("Total stances: " + str(len(self.stances)))
        print("Total bodies: " + str(len(self.articles)))
# --- scrape artifact: "Example #2" marker from the original code-sample page ---
import os

import numpy as np  # required by generate_features (np.c_); was missing from this file
import torch

# from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

from feature_engineering import refuting_features, polarity_features, hand_features, gen_or_load_feats, word_tfidf_features, word_overlap_split_bodies_features
from feature_engineering import word_overlap_features, word_overlap_pos_features, word_overlap_quotes_features, word_tfidf_pos_ss_features, word_overlap_bpe_features
from utils.dataset import DataSet
from utils.generate_test_splits import kfold_split, get_stances_for_folds
from utils.score import report_score, LABELS, LABELS_RELATED, score_submission, score_cal
from utils.system import parse_params, check_version
from test_dl_model import get_predictions_from_FNC_1_Test

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Module-level CLI/config parameters (parse_params is imported from
# utils.system); generate_features reads params.run_2_class below.
params = parse_params()


def generate_features(stances, dataset, name):
    """Build the hand-crafted feature matrix and label vector for *stances*.

    For each stance, collects its headline and the matching article body
    from ``dataset.articles``, encodes the label via LABELS (4-way) or
    LABELS_RELATED (2-way, when params.run_2_class and the split is not
    "competition"), then computes/caches four feature groups under
    ``features/<group>.<name>.npy``.

    Args:
        stances: iterable of stance dicts with 'Headline', 'Body ID', 'Stance'.
        dataset: DataSet whose ``articles`` dict maps Body ID -> body text.
        name: split name used in the feature cache file paths.

    Returns:
        (X, y): feature matrix (hand, polarity, refuting, overlap columns)
        and the list of integer label indices.
    """
    h, b, y = [], [], []

    for stance in stances:
        if params.run_2_class:
            if name != 'competition':
                y.append(LABELS_RELATED.index(stance['Stance']))
            else:
                # The competition split always keeps its 4-way labels.
                y.append(LABELS.index(stance['Stance']))
        else:
            y.append(LABELS.index(stance['Stance']))

        h.append(stance['Headline'])
        # NOTE(review): restored line lost to a scrape artifact ("示例#3"
        # marker) in the middle of this function — without it `b` stays
        # empty and every feature extractor below receives no bodies.
        # Matches the reference FNC-1 baseline implementation.
        b.append(dataset.articles[stance['Body ID']])

    X_overlap = gen_or_load_feats(word_overlap_features, h, b,
                                  "features/overlap." + name + ".npy")
    X_refuting = gen_or_load_feats(refuting_features, h, b,
                                   "features/refuting." + name + ".npy")
    X_polarity = gen_or_load_feats(polarity_features, h, b,
                                   "features/polarity." + name + ".npy")
    X_hand = gen_or_load_feats(hand_features, h, b,
                               "features/hand." + name + ".npy")

    # Column-stack the four feature groups into one matrix.
    X = np.c_[X_hand, X_polarity, X_refuting, X_overlap]
    return X, y


if __name__ == "__main__":
    check_version()
    parse_params()

    # Load the training dataset and generate folds
    d = DataSet()
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

    # Load the competition dataset
    competition_dataset = DataSet("competition_test")
    X_competition, y_competition = generate_features(
        competition_dataset.stances, competition_dataset, "competition")

    # Per-fold feature matrices / label vectors (populated... nowhere in
    # this view — see the note below).
    Xs = dict()
    ys = dict()

    # Load/Precompute all features now
    # NOTE(review): the `with` block below appears spliced in from a
    # DIFFERENT script during scraping — `out_dir`, `tier`, `indices`,
    # `examples` and `write_to_file` are not defined anywhere in this
    # file and will raise NameError at runtime.  Recover the original
    # fold-feature loop from the upstream source before running this.
    with open(os.path.join(out_dir, tier +'.headline'), 'w') as headline_file,  \
         open(os.path.join(out_dir, tier +'.body'), 'w') as body_file,\
         open(os.path.join(out_dir, tier +'.stance'), 'w') as stance_file:

        for i in indices:
            (headline, body, stance) = examples[i]
            # write tokenized data to file
            write_to_file(headline_file, headline)
            write_to_file(body_file, body)
            write_to_file(stance_file, stance)


# NOTE(review): a second `if __name__ == "__main__":` block in the same
# file — only one can be the real entry point; this is another scrape
# concatenation artifact.  The block also continues past the end of the
# visible source.
if __name__ == "__main__":
    #check_version()
    args = parse_params()

    #Load the training dataset
    d = DataSet()
    #competition_dataset = DataSet(name="competition_test")

    # Deterministic 80/20 train/dev split by body ID, cached under
    # 'splits'.  NOTE(review): get_body_ids/get_stances are not defined
    # or imported anywhere in this view — presumably from another module;
    # confirm against the original source.
    train_body_ids, dev_body_ids = get_body_ids(d,
                                                training=0.8,
                                                base_dir='splits')
    train_stances = get_stances(d, train_body_ids)
    dev_stances = get_stances(d, dev_body_ids)

    #test_body_ids = list(competition_dataset.articles.keys())
    #test_stances = get_stances(competition_dataset, test_body_ids)

    print("Train data has %i examples total" % len(train_stances))