def get_head_body_tuples(include_holdout=False):
    """Collect parallel lists of headlines and article bodies from the FNC-1
    training folds (and, optionally, the holdout split).

    Parameters
    ----------
    include_holdout : bool
        When True, stances whose body id is in the holdout split are
        included as well; by default only the k-fold stances are returned.

    Returns
    -------
    (h, b) : tuple of lists
        h[i] is the headline text and b[i] the full article body text of
        the i-th selected stance.
    """
    # file paths
    splits_dir = "splits"
    dataset = DataSet()

    def get_stances(dataset, folds, holdout):
        # Creates the list with a dict {'headline': ..., 'body': ..., 'stance': ...}
        # for each stance in the data set (except for holdout unless requested).
        # Folds are flattened into one set up front (resolves the old TODO):
        # membership testing per stance is now O(1) instead of O(#folds * fold size).
        # NOTE(review): assumes fold body-id lists are disjoint, as produced by
        # kfold_split — confirm if that ever changes.
        fold_ids = set()
        for fold in folds:
            fold_ids.update(fold)
        holdout_ids = set(holdout)

        stances = []
        for stance in dataset.stances:
            body_id = stance['Body ID']
            if include_holdout and body_id in holdout_ids:
                stances.append(stance)
            if body_id in fold_ids:
                stances.append(stance)
        return stances

    # create new vocabulary
    # folds: [[133, 1334, 65645, ...], [32323, ...]] => body ids for each fold
    folds, holdout = kfold_split(dataset, n_folds=10, base_dir=splits_dir)
    stances = get_stances(dataset, folds, holdout)
    print("Stances length: " + str(len(stances)))

    # create the final lists with all the headlines and bodies of the set
    # (except for holdout, unless include_holdout is set)
    h = []
    b = []
    for stance in stances:
        h.append(stance['Headline'])
        b.append(dataset.articles[stance['Body ID']])
    return h, b
def do_reg():
    """Train a GradientBoostingClassifier per fold on FNC-1 features, keep
    the classifier from the best-scoring fold, and report its score on the
    holdout split.

    Side effects: prints per-fold scores and the final holdout report;
    feature generation may read/write cached feature files.
    """
    d = DataSet()
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

    Xs = dict()
    ys = dict()

    # Load/Precompute all features now
    X_holdout, y_holdout = generate_features(hold_out_stances, d, "holdout")
    for fold in fold_stances:
        Xs[fold], ys[fold] = generate_features(fold_stances[fold], d, str(fold))

    best_score = 0
    best_fold = None  # holds the best *classifier*, not the fold index

    # Classifier for each fold
    for fold in fold_stances:
        # train on every fold except the current one
        ids = list(range(len(folds)))
        del ids[fold]

        X_train = np.vstack(tuple([Xs[i] for i in ids]))
        y_train = np.hstack(tuple([ys[i] for i in ids]))

        X_test = Xs[fold]
        y_test = ys[fold]

        # BUG FIX: the original assigned the classifier to `clf_stage1` but
        # then called `clf.fit(...)`, which raised NameError. One name is
        # used consistently now.
        clf = GradientBoostingClassifier(n_estimators=200, random_state=14128,
                                         verbose=True)
        clf.fit(X_train, y_train)

        predicted = [LABELS[int(a)] for a in clf.predict(X_test)]
        actual = [LABELS[int(a)] for a in y_test]

        # normalize the fold score by the maximum achievable score
        fold_score, _ = score_submission(actual, predicted)
        max_fold_score, _ = score_submission(actual, actual)
        score = fold_score / max_fold_score

        print("Score for fold " + str(fold) + " was - " + str(score))
        if score > best_score:
            best_score = score
            best_fold = clf

    # Run on Holdout set and report the final score on the holdout set
    predicted = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
    actual = [LABELS[int(a)] for a in y_holdout]
    report_score(actual, predicted)
def __init__(self, dataset, n_folds=10):
    """Split `dataset` into `n_folds` cross-validation folds plus a holdout
    split and initialize empty per-fold feature/label caches.

    Parameters
    ----------
    dataset : the FNC-1 DataSet to split
    n_folds : number of cross-validation folds (default 10)
    """
    self.dataset = dataset
    self.folds, self.hold_out = kfold_split(dataset, n_folds=n_folds)
    self.fold_stances, self.hold_out_stances = get_stances_for_folds(
        dataset, self.folds, self.hold_out)

    # Empty caches, presumably populated later by feature-generation code
    # elsewhere in the class — TODO confirm against the rest of the file.
    self.ys = {}
    self.Xcs = {}
    self.ys_nb = {}
    self.Xcs_nb = {}
    self.fold_stances_nb = {}
    self.ys_true = {}
    self.Xbasenb = {}
    self.Xtotalnb = {}
    self.Xtotal = {}
    self.X_baseline = {}
    self.y_baseline = {}
def get_head_body_tuples(include_holdout=False):
    """Collect parallel lists of headlines and article bodies from the FNC-1
    training folds (and, optionally, the holdout split), using the dataset
    and split directory configured in myConstants.

    Parameters
    ----------
    include_holdout : bool
        When True, stances whose body id is in the holdout split are
        included as well; by default only the k-fold stances are returned.

    Returns
    -------
    (h, b) : tuple of lists
        h[i] is the headline text and b[i] the full article body text of
        the i-th selected stance.
    """
    # Paths and dataset come from the project-wide myConstants module.
    # (The unused `data_path` local and the dead commented-out path
    # construction were removed.)
    splits_dir = myConstants.splits_dir
    dataset = myConstants.d

    def get_stances(dataset, folds, holdout):
        # Creates the list with a dict {'headline': ..., 'body': ..., 'stance': ...}
        # for each stance in the data set (except for holdout unless requested).
        # Folds are flattened into one set up front: membership testing per
        # stance is O(1) instead of O(#folds * fold size).
        # NOTE(review): assumes fold body-id lists are disjoint, as produced
        # by kfold_split — confirm if that ever changes.
        fold_ids = set()
        for fold in folds:
            fold_ids.update(fold)
        holdout_ids = set(holdout)

        stances = []
        for stance in dataset.stances:
            body_id = stance['Body ID']
            if include_holdout and body_id in holdout_ids:
                stances.append(stance)
            if body_id in fold_ids:
                stances.append(stance)
        return stances

    # create new vocabulary
    # folds: [[133, 1334, 65645, ...], [32323, ...]] => body ids for each fold
    folds, holdout = kfold_split(dataset, n_folds=10, base_dir=splits_dir)
    stances = get_stances(dataset, folds, holdout)
    print("Stances length: " + str(len(stances)))

    # create the final lists with all the headlines and bodies of the set
    # (except for holdout, unless include_holdout is set)
    h = []
    b = []
    for stance in stances:
        h.append(stance['Headline'])
        b.append(dataset.articles[stance['Body ID']])
    return h, b
X = np.c_[X_hand, X_polarity, X_refuting, X_overlap, X_overlap_quotes, X_overlap_pos, X_overlap_pos_sentence, X_tfidf, X_tfidf_max, X_overlap_bpe_SS] return X, y if __name__ == "__main__": check_version() print('Running Conditioned CNN on FNC1 Dataset') dl_model_pred, _unused1, _unused2 = get_predictions_from_FNC_1_Test( params.dl_weights_file, params.apply_pos_filter, DEVICE) #Load the training dataset and generate folds d = DataSet() folds, hold_out = kfold_split(d, n_folds=10) fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out) # Load the competition dataset competition_dataset = DataSet("competition_test") stances = pd.DataFrame(competition_dataset.stances) X_competition, y_competition = generate_features( competition_dataset.stances, competition_dataset, "competition") Xs = dict() ys = dict() # Load/Precompute all features now X_holdout, y_holdout = generate_features(hold_out_stances, d, "holdout") for fold in fold_stances: Xs[fold], ys[fold] = generate_features(fold_stances[fold], d,
# X_tf_idf_cos : The cosine similarity between the TF-IDF vectors of the headline and body. X_tf_cos, X_tf_idf_cos = gen_tf_idf_feats(stances, dataset.articles, bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer) X = np.c_[X_hand, X_polarity, X_refuting_head, X_overlap, X_tf_cos] return X, y if __name__ == "__main__": check_version() parse_params() #Load the training dataset and generate folds d = DataSet(name="train", path="./../fnc-1") folds, hold_out = kfold_split(d, n_folds=10, base_dir="./../splits") fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out) # bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer = tf_idf_preprocess(d, competition_dataset, lim_unigram=lim_unigram) # Load the competition dataset competition_dataset = DataSet("competition_test", path="./../fnc-1") bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer = tf_idf_preprocess( d, competition_dataset, lim_unigram=lim_unigram) X_competition, y_competition = generate_features( competition_dataset.stances, competition_dataset, "competition", bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer) headline, bodyId, stance = [], [], [] for s in competition_dataset.stances:
if __name__ == "__main__": params = {'n_folds': 5, 'n_estimators': 200} #params = { 'n_folds' : 5, 'n_estimators' : 25} check_version() parse_params() time_1 = time.time() d = DataSet() time_2 = time.time() print('Dataset load: ' + str(time_2 - time_1)) folds, hold_out = kfold_split(d, n_folds=params['n_folds']) time_3 = time.time() print('Kfold_split: ' + str(time_3 - time_2)) fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out) time_4 = time.time() print('Get stances: ' + str(time_4 - time_3)) Xs = dict() ys = dict() Xnn = dict() ynn = dict() # Load/Precompute all features now X_holdout, y_holdout = generate_features(hold_out_stances, d, "holdout")
def run_stage(fn, d, competition_dataset):
    """Run one classification stage: extract features via `fn`, train one
    GradientBoostingClassifier per fold, keep the best one by normalized
    fold score, and write its predictions back into the stance dicts of the
    holdout and competition sets.

    Parameters
    ----------
    fn : feature-extraction callable returning (X, y, ids) for a stance list
    d : training DataSet (its stances get 'Predict' set to the gold label)
    competition_dataset : competition DataSet (gets predicted 'Predict' labels)

    Returns
    -------
    id_holdout : stance ids of the holdout split.

    Side effects: increments the module-level `runpass` counter (used to
    namespace cached feature files) and prints per-fold scores.
    """
    global runpass
    runpass += 1

    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

    # Load/Precompute all features now
    Xs, ys, ids = {}, {}, {}

    comp_stances = competition_dataset.get_unlabelled_stances()
    X_comp, y_comp, id_comp = fn(comp_stances, competition_dataset,
                                 "competition_{}".format(str(runpass)))
    X_holdout, y_holdout, id_holdout = fn(hold_out_stances, d,
                                          "holdout_{}".format(str(runpass)))
    for fold in fold_stances:
        Xs[fold], ys[fold], ids[fold] = fn(
            fold_stances[fold], d, "{}_{}".format(str(fold), str(runpass)))

    best_score = 0
    best_fold = None  # holds the best *classifier*, not the fold index

    # Classifier for each fold
    for fold in fold_stances:
        # train on every fold except the current one
        train_folds = [i for i in range(len(fold_stances)) if i != fold]
        id_train = np.hstack(tuple(ids[i] for i in train_folds))
        X_train = np.vstack(tuple(Xs[i] for i in train_folds))
        y_train = np.hstack(tuple(ys[i] for i in train_folds))

        id_test = ids[fold]
        X_test = Xs[fold]
        y_test = ys[fold]

        clf = GradientBoostingClassifier(n_estimators=200,
                                         random_state=14128,
                                         verbose=True)
        clf.fit(X_train, y_train)

        predicted_test = [LABELS[int(a)] for a in clf.predict(X_test)]
        actual_test = [LABELS[int(a)] for a in y_test]

        # Training folds get their *gold* label recorded (data is known).
        for i, label in enumerate(actual_test):
            d.stances[id_test[i]]['Predict'] = label

        # normalize the fold score by the maximum achievable score
        fold_score, _ = score_submission(actual_test, predicted_test)
        max_fold_score, _ = score_submission(actual_test, actual_test)
        score = fold_score / max_fold_score

        print("Score for fold " + str(fold) + " was - " + str(score))
        if score > best_score:
            best_score = score
            best_fold = clf

    # Run on Holdout set: record *predicted* labels (data is unknown).
    predicted_hold = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
    actual_hold = [LABELS[int(a)] for a in y_holdout]
    for i, label in enumerate(predicted_hold):
        d.stances[id_holdout[i]]['Predict'] = label

    # Run on competition dataset: record *predicted* labels (data is unknown).
    predicted_comp = [LABELS[int(a)] for a in best_fold.predict(X_comp)]
    actual_comp = [LABELS[int(a)] for a in y_comp]
    for i, label in enumerate(predicted_comp):
        competition_dataset.stances[id_comp[i]]['Predict'] = label

    return id_holdout
X = np.c_[X_hand, X_polarity, X_refuting, X_overlap] return X, y if __name__ == "__main__": assert sys.argv[1] # filename to serialize the model to SERIALIZED_FN = os.path.join(SERIALIZED_DIR, sys.argv[1] + '.pkl') assert not os.path.exists(SERIALIZED_FN) print('Serialize to {}'.format(SERIALIZED_FN)) #check_version() #parse_params() #Load the training dataset and generate folds d = DataSet(path=DATASET_DIR) folds, hold_out = kfold_split(d, n_folds=1, base_dir=BASE_DIR) fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out) # Load the competition dataset competition_dataset = DataSet("competition_test", path=DATASET_DIR) X_competition, y_competition = generate_features( competition_dataset.stances, competition_dataset, "competition") Xs = dict() ys = dict() # Load/Precompute all features now X_holdout, y_holdout = generate_features(hold_out_stances, d, "holdout") for fold in fold_stances: Xs[fold], ys[fold] = generate_features(fold_stances[fold], d, str(fold))
if __name__ == "__main__": check_version() parse_params() #Load the training dataset and generate folds d = DataSet() # Load the competition dataset competition_dataset = DataSet("competition_test") X_competition, y_competition, y_competition_bi = generate_features( competition_dataset.stances, competition_dataset, "competition") # step1 : classification model for related or unrelated folds, hold_out = kfold_split(d, n_folds=10) fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out) if not os.path.isfile("models/finalClassifier.1.model"): generate_model(fold_stances, 1) best_fold = joblib.load("models/finalClassifier.1.model") # Load/Precompute all features now X_holdout, y_holdout, y_holdout_bi = generate_features( hold_out_stances, d, "holdout") # step2 : classification model for related (3 classes : Agree, Disagree, Discuss) related_folds, related_hold_out = kfold_split(d, n_folds=10, biClass=True) related_fold_stances, related_hold_out_stances = get_stances_for_folds( d, related_folds, related_hold_out, only_related=True) if not os.path.isfile("models/finalClassifier.2.model"): generate_model(related_fold_stances, 2)
# Load the training dataset and generate folds dataSet = DataSet( name="fake_gold_real_articles") # lire la dataset de TRAINING training_ids, testing_ids = generate_splited_data_ids(dataSet, 0.8) train_Headlines, train_bodies, train_labels = parseDataSet( training_ids, dataSet) test_Headlines, test_bodies, test_labels = parseDataSet( testing_ids, dataSet) #generate tfidf vectorizers bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer = tfIdf_parameteres( train_Headlines, train_bodies, 50) #split loaded data into folds folds_ids = kfold_split(training_ids, 10) #iterate possibilities, from 1 to 16, every number will be presented in binary format (i.e : possibility =1 => pssibility = 0001 => only first feature is enabled) for possibility in range(1, 16): x = "{0:b}".format(possibility) x = x.rjust(4, '0') Xs = dict() ys = dict() index = 0 #generate feature for the folds : for fold_ids in folds_ids: fold_headlines, fold_bodies, fold_lables = parseDataSet( fold_ids, dataSet) ys[index] = fold_lables
X = np.c_[X_hand, X_polarity, X_refuting, X_overlap] return X,y if __name__ == "__main__": if sys.version_info.major < 3: sys.stderr.write('Please use Python version 3 and above\n') sys.exit(1) d = DataSet() folds,hold_out = kfold_split(d,n_folds=10) fold_stances, hold_out_stances = get_stances_for_folds(d,folds,hold_out) Xs = dict() ys = dict() # Load/Precompute all features now X_holdout,y_holdout = generate_features(hold_out_stances,d,"holdout") for fold in fold_stances: Xs[fold],ys[fold] = generate_features(fold_stances[fold],d,str(fold)) best_score = 0 best_fold = None
optimizer='Adam', learning_rate=0.001) return ({ 'class': tf.argmax(logits, 1), 'prob': tf.nn.softmax(logits) }, loss, train_op) if __name__ == '__main__': start_time = time.time() print('loading data') d = DataSet() folds, hold_out = kfold_split(d, training=0.9, n_folds=1) fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out) body = pd.read_csv('fnc-1/train_bodies.csv', index_col=0) train = pd.DataFrame(fold_stances[0]).join(body, on='Body ID') test = pd.DataFrame(hold_out_stances).join(body, on='Body ID') print('data is loaded') print('loading pre-trained embedding') vocab, embd = loadGloVe('glove.6B.50d.txt') vocab_size = len(vocab) embedding_dim = len(embd[0]) embedding = np.asarray(embd) with tf.name_scope('W'): W = tf.Variable(tf.constant(0.0, shape=[vocab_size, embedding_dim]), trainable=False,