Example #1
def get_head_body_tuples(include_holdout=False):
    # file paths
    splits_dir = "splits"
    dataset = DataSet()

    def get_stances(dataset, folds, holdout):
        # Creates the list with a dict {'headline': ..., 'body': ..., 'stance': ...} for each
        # stance in the data set (except for holdout)
        stances = []
        for stance in dataset.stances:
            if include_holdout and stance['Body ID'] in holdout:
                stances.append(stance)
            for fold in folds:  # TODO maybe just flatten folds beforehand
                if stance['Body ID'] in fold:
                    stances.append(stance)

        return stances

    # create new vocabulary
    folds, holdout = kfold_split(dataset, n_folds=10, base_dir=splits_dir)  # [[133,1334,65645,], [32323,...]] => body ids for each fold
    stances = get_stances(dataset, folds, holdout)

    print("Stances length: " + str(len(stances)))

    h = []
    b = []
    # create the final lists with all the headlines and bodies of the set except for holdout
    for stance in stances:
        h.append(stance['Headline'])
        b.append(dataset.articles[stance['Body ID']])

    return h, b
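A minimal usage sketch of the function above; the import paths below are assumptions based on the usual FNC-1 baseline layout:

from utils.dataset import DataSet                   # assumed import path
from utils.generate_test_splits import kfold_split  # assumed import path

# Headline/body pairs from the ten training folds only...
h, b = get_head_body_tuples()
# ...or with the holdout bodies included as well.
h_all, b_all = get_head_body_tuples(include_holdout=True)
print(len(h), len(h_all))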
Example #2
def do_reg():
    d = DataSet()
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

    Xs = dict()
    ys = dict()

    # Load/Precompute all features now
    X_holdout, y_holdout = generate_features(hold_out_stances, d, "holdout")
    for fold in fold_stances:
        Xs[fold], ys[fold] = generate_features(fold_stances[fold], d,
                                               str(fold))

    best_score = 0
    best_fold = None

    # Classifier for each fold
    for fold in fold_stances:
        ids = list(range(len(folds)))
        del ids[fold]

        X_train = np.vstack(tuple([Xs[i] for i in ids]))
        y_train = np.hstack(tuple([ys[i] for i in ids]))

        X_test = Xs[fold]
        y_test = ys[fold]

        clf = GradientBoostingClassifier(n_estimators=200,
                                         random_state=14128,
                                         verbose=True)
        # clf = GradientBoostingClassifier(n_estimators=50, random_state=14128, verbose=False)
        # Try random forest
        clf.fit(X_train, y_train)

        predicted = [LABELS[int(a)] for a in clf.predict(X_test)]
        actual = [LABELS[int(a)] for a in y_test]

        fold_score, _ = score_submission(actual, predicted)
        max_fold_score, _ = score_submission(actual, actual)

        score = fold_score / max_fold_score

        print("Score for fold " + str(fold) + " was - " + str(score))
        if score > best_score:
            best_score = score
            best_fold = clf

    #Run on Holdout set and report the final score on the holdout set
    predicted = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
    actual = [LABELS[int(a)] for a in y_holdout]

    report_score(actual, predicted)
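The commented-out classifier and the "Try random forest" note above suggest an alternative model; a minimal sketch of that swap, keeping the rest of the fold loop unchanged (the hyperparameters here are assumptions, not tuned values):

from sklearn.ensemble import RandomForestClassifier

# Drop-in replacement for the gradient boosting stage; fit, predict
# and scoring in the fold loop stay exactly as above.
clf = RandomForestClassifier(n_estimators=200,
                             random_state=14128,
                             n_jobs=-1)
clf.fit(X_train, y_train)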
Example #3
    def __init__(self, dataset, n_folds=10):
        self.dataset = dataset
        #print('generating folds')
        self.folds, self.hold_out = kfold_split(dataset, n_folds=n_folds)
        self.fold_stances, self.hold_out_stances = get_stances_for_folds(
            dataset, self.folds, self.hold_out)

        self.ys = dict()
        self.Xcs = dict()
        self.ys_nb = dict()
        self.Xcs_nb = dict()
        self.fold_stances_nb = dict()
        self.ys_true = dict()

        self.Xbasenb = dict()
        self.Xtotalnb = dict()
        self.Xtotal = dict()
        self.X_baseline = dict()
        self.y_baseline = dict()
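Only the initializer is shown, so the enclosing class name is unknown; a usage sketch with a hypothetical class name FoldData:

d = DataSet()
fold_data = FoldData(d, n_folds=10)  # FoldData is a hypothetical name for the class above
print(len(fold_data.folds), len(fold_data.hold_out_stances))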
Example #4
def get_head_body_tuples(include_holdout=False):
    # file paths
    '''
    data_path = "%s/data/fnc-1" % (path.dirname(path.dirname(path.dirname(path.dirname(path.abspath(__file__))))))
    splits_dir = "%s/data/fnc-1/splits" % (path.dirname(path.dirname(path.dirname(path.dirname(path.abspath(__file__))))))
    dataset = DataSet(data_path)
    '''
    data_path = myConstants.data_path
    splits_dir = myConstants.splits_dir
    dataset = myConstants.d

    def get_stances(dataset, folds, holdout):
        # Creates the list with a dict {'headline': ..., 'body': ..., 'stance': ...} for each
        # stance in the data set (except for holdout)
        stances = []
        for stance in dataset.stances:
            if include_holdout and stance['Body ID'] in holdout:
                stances.append(stance)
            for fold in folds:
                if stance['Body ID'] in fold:
                    stances.append(stance)

        return stances

    # create new vocabulary
    folds, holdout = kfold_split(
        dataset, n_folds=10, base_dir=splits_dir
    )  # [[133,1334,65645,], [32323,...]] => body ids for each fold
    stances = get_stances(dataset, folds, holdout)

    print("Stances length: " + str(len(stances)))

    h = []
    b = []
    # create the final lists with all the headlines and bodies of the set except for holdout
    for stance in stances:
        h.append(stance['Headline'])
        b.append(dataset.articles[stance['Body ID']])

    return h, b
Example #5
    X = np.c_[X_hand, X_polarity, X_refuting, X_overlap, X_overlap_quotes,
              X_overlap_pos, X_overlap_pos_sentence, X_tfidf, X_tfidf_max,
              X_overlap_bpe_SS]
    return X, y


if __name__ == "__main__":
    check_version()

    print('Running Conditioned CNN on FNC1 Dataset')
    dl_model_pred, _unused1, _unused2 = get_predictions_from_FNC_1_Test(
        params.dl_weights_file, params.apply_pos_filter, DEVICE)

    #Load the training dataset and generate folds
    d = DataSet()
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

    # Load the competition dataset
    competition_dataset = DataSet("competition_test")
    stances = pd.DataFrame(competition_dataset.stances)
    X_competition, y_competition = generate_features(
        competition_dataset.stances, competition_dataset, "competition")

    Xs = dict()
    ys = dict()

    # Load/Precompute all features now
    X_holdout, y_holdout = generate_features(hold_out_stances, d, "holdout")
    for fold in fold_stances:
        Xs[fold], ys[fold] = generate_features(fold_stances[fold], d,
                                               str(fold))
Example #6
    # X_tf_idf_cos : The cosine similarity between the TF-IDF vectors of the headline and body.
    X_tf_cos, X_tf_idf_cos = gen_tf_idf_feats(stances, dataset.articles,
                                              bow_vectorizer, tfreq_vectorizer,
                                              tfidf_vectorizer)

    X = np.c_[X_hand, X_polarity, X_refuting_head, X_overlap, X_tf_cos]
    return X, y


if __name__ == "__main__":
    check_version()
    parse_params()

    #Load the training dataset and generate folds
    d = DataSet(name="train", path="./../fnc-1")
    folds, hold_out = kfold_split(d, n_folds=10, base_dir="./../splits")
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

    # bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer = tf_idf_preprocess(d, competition_dataset, lim_unigram=lim_unigram)

    # Load the competition dataset
    competition_dataset = DataSet("competition_test", path="./../fnc-1")
    bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer = tf_idf_preprocess(
        d, competition_dataset, lim_unigram=lim_unigram)

    X_competition, y_competition = generate_features(
        competition_dataset.stances, competition_dataset, "competition",
        bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer)

    headline, bodyId, stance = [], [], []
    for s in competition_dataset.stances:
        headline.append(s['Headline'])
        bodyId.append(s['Body ID'])
        stance.append(s['Stance'])
Example #7

if __name__ == "__main__":

    params = {'n_folds': 5, 'n_estimators': 200}
    #params = { 'n_folds' : 5, 'n_estimators' : 25}

    check_version()
    parse_params()

    time_1 = time.time()
    d = DataSet()
    time_2 = time.time()
    print('Dataset load: ' + str(time_2 - time_1))

    folds, hold_out = kfold_split(d, n_folds=params['n_folds'])

    time_3 = time.time()
    print('Kfold_split: ' + str(time_3 - time_2))

    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)
    time_4 = time.time()
    print('Get stances: ' + str(time_4 - time_3))

    Xs = dict()
    ys = dict()
    Xnn = dict()
    ynn = dict()

    # Load/Precompute all features now
    X_holdout, y_holdout = generate_features(hold_out_stances, d, "holdout")
Example #8
def run_stage(fn, d, competition_dataset):
    global runpass
    runpass += 1

    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

    # Load/Precompute all features now
    Xs = dict()
    ys = dict()
    ids = dict()
    comp_stances = competition_dataset.get_unlabelled_stances()
    X_comp, y_comp, id_comp = fn(comp_stances, competition_dataset,
                                 "competition_{}".format(str(runpass)))
    X_holdout, y_holdout, id_holdout = fn(hold_out_stances, d,
                                          "holdout_{}".format(str(runpass)))
    for fold in fold_stances:
        Xs[fold], ys[fold], ids[fold] = fn(
            fold_stances[fold], d, "{}_{}".format(str(fold), str(runpass)))

    best_score = 0
    best_fold = None

    # Classifier for each fold
    for fold in fold_stances:
        id_train = np.hstack(
            tuple([ids[i] for i in range(len(fold_stances)) if i != fold]))
        X_train = np.vstack(
            tuple([Xs[i] for i in range(len(fold_stances)) if i != fold]))
        y_train = np.hstack(
            tuple([ys[i] for i in range(len(fold_stances)) if i != fold]))
        id_test = ids[fold]
        X_test = Xs[fold]
        y_test = ys[fold]

        clf = GradientBoostingClassifier(n_estimators=200,
                                         random_state=14128,
                                         verbose=True)
        clf.fit(X_train, y_train)

        predicted_test = [LABELS[int(a)] for a in clf.predict(X_test)]
        actual_test = [LABELS[int(a)] for a in y_test]
        for i in range(len(actual_test)):
            d.stances[id_test[i]]['Predict'] = actual_test[i]  # Data is known

        fold_score, _ = score_submission(actual_test, predicted_test)
        max_fold_score, _ = score_submission(actual_test, actual_test)

        score = fold_score / max_fold_score

        print("Score for fold " + str(fold) + " was - " + str(score))
        if score > best_score:
            best_score = score
            best_fold = clf

    #Run on Holdout set and report the final score on the holdout set
    predicted_hold = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
    actual_hold = [LABELS[int(a)] for a in y_holdout]
    for i in range(len(predicted_hold)):
        d.stances[id_holdout[i]]['Predict'] = predicted_hold[
            i]  # Data is unknown

    #Run on competition dataset
    predicted_comp = [LABELS[int(a)] for a in best_fold.predict(X_comp)]
    actual_comp = [LABELS[int(a)] for a in y_comp]
    for i in range(len(actual_comp)):
        competition_dataset.stances[id_comp[i]]['Predict'] = predicted_comp[
            i]  # Data is unknown

    return id_holdout
Example #9
    X = np.c_[X_hand, X_polarity, X_refuting, X_overlap]
    return X, y


if __name__ == "__main__":
    assert len(sys.argv) > 1  # filename to serialize the model to
    SERIALIZED_FN = os.path.join(SERIALIZED_DIR, sys.argv[1] + '.pkl')
    assert not os.path.exists(SERIALIZED_FN)
    print('Serialize to {}'.format(SERIALIZED_FN))
    #check_version()
    #parse_params()

    #Load the training dataset and generate folds
    d = DataSet(path=DATASET_DIR)
    folds, hold_out = kfold_split(d, n_folds=1, base_dir=BASE_DIR)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

    # Load the competition dataset
    competition_dataset = DataSet("competition_test", path=DATASET_DIR)
    X_competition, y_competition = generate_features(
        competition_dataset.stances, competition_dataset, "competition")

    Xs = dict()
    ys = dict()

    # Load/Precompute all features now
    X_holdout, y_holdout = generate_features(hold_out_stances, d, "holdout")
    for fold in fold_stances:
        Xs[fold], ys[fold] = generate_features(fold_stances[fold], d,
                                               str(fold))
Example #10

if __name__ == "__main__":
    check_version()
    parse_params()

    #Load the training dataset and generate folds
    d = DataSet()

    # Load the competition dataset
    competition_dataset = DataSet("competition_test")
    X_competition, y_competition, y_competition_bi = generate_features(
        competition_dataset.stances, competition_dataset, "competition")

    # step1 : classification model for related or unrelated
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)
    if not os.path.isfile("models/finalClassifier.1.model"):
        generate_model(fold_stances, 1)
    best_fold = joblib.load("models/finalClassifier.1.model")

    # Load/Precompute all features now
    X_holdout, y_holdout, y_holdout_bi = generate_features(
        hold_out_stances, d, "holdout")

    # step2 : classification model for related (3 classes : Agree, Disagree, Discuss)
    related_folds, related_hold_out = kfold_split(d, n_folds=10, biClass=True)
    related_fold_stances, related_hold_out_stances = get_stances_for_folds(
        d, related_folds, related_hold_out, only_related=True)
    if not os.path.isfile("models/finalClassifier.2.model"):
        generate_model(related_fold_stances, 2)
Example #11
    # Load the training dataset and generate folds
    dataSet = DataSet(
        name="fake_gold_real_articles")  # read the TRAINING dataset
    training_ids, testing_ids = generate_splited_data_ids(dataSet, 0.8)

    train_Headlines, train_bodies, train_labels = parseDataSet(
        training_ids, dataSet)
    test_Headlines, test_bodies, test_labels = parseDataSet(
        testing_ids, dataSet)

    #generate tfidf vectorizers
    bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer = tfIdf_parameteres(
        train_Headlines, train_bodies, 50)

    #split loaded data into folds
    folds_ids = kfold_split(training_ids, 10)

    # iterate over possibilities 1 to 15; each number is interpreted as a 4-bit binary mask (e.g. possibility = 1 => mask = 0001 => only the first feature is enabled)
    for possibility in range(1, 16):
        x = "{0:b}".format(possibility)
        x = x.rjust(4, '0')

        Xs = dict()
        ys = dict()

        index = 0
        # generate features for the folds:
        for fold_ids in folds_ids:
            fold_headlines, fold_bodies, fold_labels = parseDataSet(
                fold_ids, dataSet)
            ys[index] = fold_labels
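A small illustration of the 4-bit mask described above; the feature names are assumptions based on the X_hand, X_polarity, X_refuting, X_overlap columns used elsewhere on this page:

FEATURES = ['hand', 'polarity', 'refuting', 'overlap']  # assumed order

for possibility in range(1, 16):
    mask = "{0:b}".format(possibility).rjust(4, '0')
    # rightmost bit = first feature, matching the comment above
    enabled = [name for name, bit in zip(FEATURES, reversed(mask)) if bit == '1']
    print(possibility, mask, enabled)  # e.g. 1 -> '0001' -> ['hand']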
Example #12
    X = np.c_[X_hand, X_polarity, X_refuting, X_overlap]
    return X, y


if __name__ == "__main__":

    if sys.version_info.major < 3:
        sys.stderr.write('Please use Python version 3 and above\n')
        sys.exit(1)

    d = DataSet()
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

    Xs = dict()
    ys = dict()

    # Load/Precompute all features now
    X_holdout, y_holdout = generate_features(hold_out_stances, d, "holdout")
    for fold in fold_stances:
        Xs[fold], ys[fold] = generate_features(fold_stances[fold], d, str(fold))


    best_score = 0
    best_fold = None

Example #13
        optimizer='Adam',
        learning_rate=0.001)

    return ({
        'class': tf.argmax(logits, 1),
        'prob': tf.nn.softmax(logits)
    }, loss, train_op)


if __name__ == '__main__':

    start_time = time.time()

    print('loading data')
    d = DataSet()
    folds, hold_out = kfold_split(d, training=0.9, n_folds=1)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)
    body = pd.read_csv('fnc-1/train_bodies.csv', index_col=0)
    train = pd.DataFrame(fold_stances[0]).join(body, on='Body ID')
    test = pd.DataFrame(hold_out_stances).join(body, on='Body ID')
    print('data is loaded')

    print('loading pre-trained embedding')
    vocab, embd = loadGloVe('glove.6B.50d.txt')
    vocab_size = len(vocab)
    embedding_dim = len(embd[0])
    embedding = np.asarray(embd)

    with tf.name_scope('W'):
        W = tf.Variable(tf.constant(0.0, shape=[vocab_size, embedding_dim]),
                        trainable=False,
                        name="W")