def run(self, X_test, y_test_true):
    """Predict stances for X_test and score them against y_test_true.

    Returns a tuple (predicted_labels, actual_labels, score) where score
    is the achieved submission score divided by the maximum score
    attainable on these labels (i.e. a relative score in [0, 1]).
    """
    raw_predictions = self.classifier.predict(X_test)
    predicted = [LABELS[int(p)] for p in raw_predictions]
    truth = [LABELS[int(t)] for t in y_test_true]
    achieved, _ = score_submission(truth, predicted)
    achievable, _ = score_submission(truth, truth)
    return predicted, truth, achieved / achievable
def naive_bayes_train(fold_stances, dataset, repl):
    """Train a Multinomial Naive Bayes stance classifier with k-fold CV.

    For each fold, fits a CountVectorizer -> TfidfTransformer -> MultinomialNB
    pipeline on the concatenated headline+body text of all other folds,
    evaluates on the held-out fold with the FNC relative score, and keeps
    the best fold's fitted components in the globals ``cvec``, ``tfidf``
    and ``mnb``.

    Fix: removed the dead ``_cvec2 = CountVectorizer(vocabulary=string.punctuation)``
    line — it was never used anywhere in the function.

    # Naive Bayes classifier, modified for k-folds estimation
    # Source: http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
    # Added by Julian
    """
    global cvec, tfidf, mnb
    best_score = 0
    ids = dict()
    Hs = dict()
    Bs = dict()
    ys = dict()
    # Per-fold stance ids, headlines, bodies and labels.
    for fold in fold_stances:
        ids[fold], Hs[fold], Bs[fold], ys[fold] = init_features(
            fold_stances[fold], dataset, repl)
    for fold in fold_stances:
        # Training labels come from every fold except the current one.
        y_train = np.hstack(
            tuple([ys[i] for i in range(len(fold_stances)) if i != fold]))
        id_test = ids[fold]
        H_test = Hs[fold]
        B_test = Bs[fold]
        y_test = ys[fold]
        # Training documents: "headline body" strings from the other folds.
        articles = []
        for i in range(len(fold_stances)):
            if i == fold:
                continue
            for h, b in zip(Hs[i], Bs[i]):
                articles.append(h + " " + b)
        _cvec = CountVectorizer()
        X_train_counts = _cvec.fit_transform(articles)
        _tfidf = TfidfTransformer()
        X_train_tfidf = _tfidf.fit_transform(X_train_counts)
        _mnb = MultinomialNB().fit(X_train_tfidf, y_train)
        # Transform the held-out fold with the fold's fitted vectorizers.
        articles = []
        for h, b in zip(H_test, B_test):
            articles.append(h + " " + b)
        X_test_counts = _cvec.transform(articles)
        X_test_tfidf = _tfidf.transform(X_test_counts)
        predicted_test = [LABELS[int(a)] for a in _mnb.predict(X_test_tfidf)]
        actual_test = [LABELS[int(a)] for a in y_test]
        for i in range(len(actual_test)):
            dataset.stances[id_test[i]]['Predict'] = actual_test[
                i]  # Data is known
        # Relative FNC score: achieved / maximum attainable.
        fold_score, _ = score_submission(actual_test, predicted_test)
        max_fold_score, _ = score_submission(actual_test, actual_test)
        score = fold_score / max_fold_score
        print("Score for fold " + str(fold) + " was - " + str(score))
        if score > best_score:
            best_score = score
            cvec = _cvec
            tfidf = _tfidf
            mnb = _mnb
def do_reg():
    """Run 10-fold gradient-boosting stance classification and report holdout score.

    Splits the dataset into 10 folds plus a holdout set, precomputes
    features, trains a GradientBoostingClassifier per fold (training on
    the other 9 folds), keeps the best-scoring fold's model, and finally
    reports its score on the holdout set.

    Fix: the classifier was assigned to ``clf_stage1`` but every later use
    referenced ``clf`` — a guaranteed NameError. The variable is now named
    ``clf`` consistently.
    """
    d = DataSet()
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)
    Xs = dict()
    ys = dict()
    # Load/Precompute all features now
    X_holdout, y_holdout = generate_features(hold_out_stances, d, "holdout")
    for fold in fold_stances:
        Xs[fold], ys[fold] = generate_features(fold_stances[fold], d, str(fold))
    best_score = 0
    best_fold = None
    # Classifier for each fold
    for fold in fold_stances:
        # Train on every fold except the current one.
        ids = list(range(len(folds)))
        del ids[fold]
        X_train = np.vstack(tuple([Xs[i] for i in ids]))
        y_train = np.hstack(tuple([ys[i] for i in ids]))
        X_test = Xs[fold]
        y_test = ys[fold]
        clf = GradientBoostingClassifier(n_estimators=200,
                                         random_state=14128,
                                         verbose=True)
        clf.fit(X_train, y_train)
        predicted = [LABELS[int(a)] for a in clf.predict(X_test)]
        actual = [LABELS[int(a)] for a in y_test]
        # Relative FNC score: achieved / maximum attainable.
        fold_score, _ = score_submission(actual, predicted)
        max_fold_score, _ = score_submission(actual, actual)
        score = fold_score / max_fold_score
        print("Score for fold " + str(fold) + " was - " + str(score))
        if score > best_score:
            best_score = score
            best_fold = clf
    #Run on Holdout set and report the final score on the holdout set
    predicted = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
    actual = [LABELS[int(a)] for a in y_holdout]
    report_score(actual, predicted)
def run_stage2(self, X_test_stg2):
    """Refine stage-1 predictions with the stage-2 classifier and rescore.

    Rows the stage-1 classifier labelled class 0 are re-predicted by
    ``self.classifier2`` and their labels overwritten in the final output.

    Returns (final_labels, actual_labels, relative_score).

    Fix: the original did ``self.final = self.predicted``, so overwriting
    entries of ``self.final`` silently mutated ``self.predicted`` through
    aliasing; ``self.final`` is now an independent copy.
    """
    # Indices that stage 1 assigned class 0 — only these get re-predicted.
    init_pred_ind = [i for i, e in enumerate(self.init_pred) if e == 0]
    X_test_temp = [X_test_stg2[x] for x in init_pred_ind]
    predicted_new = [
        LABELS[int(a)] for a in self.classifier2.predict(X_test_temp)
    ]
    # Copy so stage-2 overwrites do not mutate self.predicted in place.
    self.final = list(self.predicted)
    for i, e in enumerate(init_pred_ind):
        self.final[e] = predicted_new[i]
    # Relative FNC score: achieved / maximum attainable.
    fold_score, _ = score_submission(self.actual, self.final)
    max_fold_score, _ = score_submission(self.actual, self.actual)
    score = fold_score / max_fold_score
    return self.final, self.actual, score
def generate_model(fold_stances):
    """Train an XGBoost stance model via k-fold CV and persist the best one.

    Features are generated per fold (using the module-level ``d`` dataset
    and ``folds`` split), an XGBClassifier is trained on all folds but one,
    scored on the held-out fold with the relative FNC score, and the
    best-scoring model is dumped to ``models/xgboost.model``.
    """
    Xs = dict()
    ys = dict()
    for k in fold_stances:
        Xs[k], ys[k] = generate_features(fold_stances[k], d, str(k))

    best_score = 0
    best_fold = None

    # One classifier per held-out fold.
    for held_out in fold_stances:
        train_folds = [i for i in range(len(folds)) if i != held_out]
        X_train = np.vstack(tuple([Xs[i] for i in train_folds]))
        y_train = np.hstack(tuple([ys[i] for i in train_folds]))
        X_test, y_test = Xs[held_out], ys[held_out]

        model = xgb.XGBClassifier(seed=12345)
        model.fit(X_train, y_train)

        predicted = [LABELS[int(p)] for p in model.predict(X_test)]
        actual = [LABELS[int(t)] for t in y_test]

        achieved, _ = score_submission(actual, predicted)
        achievable, _ = score_submission(actual, actual)
        score = achieved / achievable
        print("Score for fold " + str(held_out) + " was - " + str(score))

        if score > best_score:
            best_score = score
            best_fold = model

    joblib.dump(best_fold, "models/xgboost.model")
# Fragment (enclosing function not visible in this view): trains a fold's
# classifier, scores it in either 2-class or 4-class mode, keeps the best
# model, pickles it, and begins the holdout evaluation. The final statement
# is truncated mid list-comprehension in the original source.
clf.fit(X_train, y_train)
if params.run_2_class:
    # 2-class mode: related/unrelated labels scored with score_cal.
    predicted = [
        LABELS_RELATED[int(a)] for a in clf.predict(X_test)
    ]
    actual = [LABELS_RELATED[int(a)] for a in y_test]
    fold_score = score_cal(actual, predicted)
    max_fold_score = score_cal(actual, actual)
else:
    # 4-class mode: full FNC labels scored with score_submission.
    predicted = [LABELS[int(a)] for a in clf.predict(X_test)]
    actual = [LABELS[int(a)] for a in y_test]
    fold_score, _ = score_submission(actual, predicted)
    max_fold_score, _ = score_submission(actual, actual)
# Relative score: achieved / maximum attainable on these labels.
score = fold_score / max_fold_score
print("Score for fold " + str(fold) + " was - " + str(score))
if score > best_score:
    best_score = score
    best_fold = clf
# Persist then immediately reload the best model from the weights file.
# NOTE(review): the open() handles are never closed — consider `with`.
pickle.dump(best_fold, open(params.gb_weights_file, 'wb'))
best_fold = pickle.load(open(params.gb_weights_file, 'rb'))
# Run on Holdout set and report the final score on the holdout set
if params.run_2_class:
    # Truncated in the original source: the list comprehension below is
    # missing its closing bracket and whatever followed it.
    predicted = [
        LABELS_RELATED[int(a)] for a in best_fold.predict(X_holdout)
# Fragment (enclosing TensorFlow session loop not visible): runs the stage-2
# network on the fold's test set, splices its predictions into the baseline
# predictions, scores the combined result, then rebuilds the baseline
# predictions for the holdout set. The last statement is truncated.
predic = sess.run(pred, feed_dict={
    x: test,
    labels: to_one_hot(y_test)
})
predic_lab = [LABELS[int(a)] for a in from_one_hot(predic)]
actual = [
    LABELS[int(a)] for a in np.argmax(to_one_hot(y_test), 1)
]
# Overwrite only the baseline entries that stage 1 flagged (init_pred_ind).
for i, e in enumerate(init_pred_ind[fold]):
    base_pred[e] = predic_lab[i]
print('confusion matrix')
#report_score(base_act, base_pred)
fold_score, _ = score_submission(base_act, base_pred)
max_fold_score, _ = score_submission(base_act, base_act)
score = fold_score / max_fold_score
print(fold, " : ", score)
if score > best_score:
    best_score = score
    best_fold1 = clf
# Baseline predictions on the holdout set: class 1 -> LABELS[3],
# anything else -> LABELS[0].
base_pred = [
    LABELS[3] if a == 1 else LABELS[0]
    for a in clf.predict(X_baseline_holdout)
]
base_act = [LABELS[int(a)] for a in y_holdout]
init_pred = dict()
init_pred_ind = dict()
# Truncated in the original source: this assignment has no right-hand side
# beyond the opening bracket.
init_pred = [
# Fragment (enclosing LSTM training loop not visible): the first two tokens
# are the tail of a feed_dict from a sess.run call that started before this
# view. Every lstm.display_step steps, the test set is run through the
# network and scored with the relative FNC score.
lstm.labels: y_tr_batch
})
classes = lstm.test(lstm.batch_size)
if (n_step % lstm.display_step) == 0:
    # Periodic evaluation on the full test split.
    outputs = sess.run(classes,
                       feed_dict={
                           lstm.head: XH_test,
                           lstm.head_lengths: X_htest_len,
                           lstm.body: XB_test,
                           lstm.body_lengths: X_btest_len
                       })
    print(type(outputs))
    predicted_labels = [LABELS[int(a)] for a in outputs]
    actual_labels = [LABELS[int(a)] for a in y_test]
    fold_score, _ = score_submission(
        actual_labels, predicted_labels)
    max_fold_score, _ = score_submission(
        actual_labels, actual_labels)
    score = fold_score / max_fold_score
    print("step is :" + str(n_step) + "cost is :" +
          str(result["cost"]) + "score is :" + str(score))
# Advance the mini-batch window and step/epoch counters.
start_ind += lstm.batch_size
n_step += 1
epoch += 1
# Fragment (enclosing function not visible): converts model output
# probabilities to class indices, prints a confusion matrix and accuracy,
# computes the relative FNC score, then starts preparing the holdout set
# as a TensorDataset. The final statement is truncated mid-call.
predicted = np.argmax(np.asarray(y_pred), axis=1)
actual = np.asarray(y_target)
print(predicted)
print(actual)
confmat = confusion_matrix(actual, predicted)
print('\nconfusion matrix:')
print(confmat)
accu = accuracy_score(actual, predicted)
# NOTE(review): 'accyracy' is a typo in this runtime string; left unchanged
# here because this edit only adds comments.
print('\naccyracy = {:>.4f}\n'.format(accu))
predicted1 = [LABELS[int(a)] for a in predicted]
actual1 = [LABELS[int(a)] for a in actual]
fold_score, _ = score_submission(actual1, predicted1)
max_fold_score, _ = score_submission(actual1, actual1)
score = fold_score / max_fold_score
print("Score for fold " + str(fold) + " was - " + str(score))
if score > best_score:
    best_score = score
X_holdout = np.array(X_holdout)
y_holdout = np.array(y_holdout)
# Trim the holdout set to a multiple of 100 — presumably the batch size;
# TODO confirm against the surrounding code.
testVal = X_holdout.shape[0] % 100
testVal = X_holdout.shape[0] - testVal
# Truncated in the original source: the call below is missing its arguments
# and closing parenthesis.
test = data_utils.TensorDataset(
# Fragment (enclosing fold loop not visible — references outer `fold`,
# `best_score`, `X_train`): trains a class-weighted linear SVM on the fold,
# scores it, tracks the best model, and reports the best model's score on
# the holdout ("dev") set.
clf = LinearSVC(verbose=True,
                random_state=14128,
                # Class weights presumably counteract class imbalance —
                # TODO confirm how these constants were derived.
                class_weight={
                    0: 0.74535,
                    1: 2.7549
                },
                loss='hinge',
                max_iter=15000)
print(X_train.shape)
clf.fit(X_train, y_train)
predic = [LABELS[int(a)] for a in clf.predict(X_test)]
act = [LABELS[int(a)] for a in y_test]
# NOTE(review): this variant unpacks THREE values from score_submission,
# unlike the two-value unpacking elsewhere in this file — confirm which
# score_submission signature is in scope here.
fold_score, _, fa = score_submission(act, predic)
max_fold_score, _, bfa = score_submission(act, act)
score = fold_score / max_fold_score
print("Score for fold " + str(fold) + " was - " + str(score))
if score > best_score:
    best_score = score
    best_fold = clf
#Run on Holdout set and report the final score on the holdout set
predic = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
act = [LABELS[int(a)] for a in y_holdout]
print("Scores on the dev set")
report_score(act, predic)
# Fragment (enclosing loop not visible — references outer `fold`,
# `best_score`, `X_train`, `X_test`, `model_dir`, `mode`, `cval`,
# `cval_ind`): trains a stage-1 random forest, scores it against the
# "true" labels, persists the best model, and begins predicting on the
# holdout set. The final statement is truncated mid list-comprehension.
X_holdout = X_stg1["holdout"]
y_test_true = y_stg1["true"]
y_train = y_stg1["train"]
y_test = y_stg1["test"]
y_holdout = y_stg1["holdout"]
clf = RandomForestClassifier(n_estimators=200, n_jobs=4, verbose=False)
clf.fit(X_train, y_train)
final = [LABELS[int(a)] for a in clf.predict(X_test)]
# Scored against y_test_true, not y_test — presumably the untransformed
# gold labels; verify against the caller.
actual = [LABELS[int(a)] for a in y_test_true]
fold_score, _ = score_submission(actual, final)
max_fold_score, _ = score_submission(actual, actual)
score = fold_score / max_fold_score
print("Score for fold " + str(fold) + " was - " + str(score))
if score > best_score:
    best_score = score
    best_fold1 = clf
    #best_fold2 = clf2
# NOTE(review): the open() handle is never closed — consider `with`.
filename = model_dir + "_" + mode + "_" + cval[cval_ind]
pickle.dump(best_fold1, open(filename, "wb"))
# Truncated in the original source: missing the closing bracket and
# whatever followed.
final = [
    LABELS[int(a)] for a in best_fold1.predict(X_holdout)
def run_stage(fn, d, competition_dataset):
    """Run one pipeline stage: k-fold train/evaluate, then label holdout and competition sets.

    ``fn`` is a feature-generation callable returning (X, y, ids) for a set
    of stances. A GradientBoostingClassifier is trained per fold (on the
    other folds), the best-scoring model is kept, and its predictions are
    written into the 'Predict' field of the holdout and competition stances.

    Returns the holdout stance ids.

    Fix: removed two dead locals — ``id_train`` (hstacked but never read)
    and ``actual_hold`` (computed but never read).
    """
    global runpass
    runpass += 1  # stage counter, used to namespace cached feature files
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)
    # Load/Precompute all features now
    Xs = dict()
    ys = dict()
    ids = dict()
    comp_stances = competition_dataset.get_unlabelled_stances()
    X_comp, y_comp, id_comp = fn(comp_stances, competition_dataset,
                                 "competition_{}".format(str(runpass)))
    X_holdout, y_holdout, id_holdout = fn(hold_out_stances, d,
                                          "holdout_{}".format(str(runpass)))
    for fold in fold_stances:
        Xs[fold], ys[fold], ids[fold] = fn(
            fold_stances[fold], d, "{}_{}".format(str(fold), str(runpass)))
    best_score = 0
    best_fold = None
    # Classifier for each fold
    for fold in fold_stances:
        # Train on every fold except the current one.
        X_train = np.vstack(
            tuple([Xs[i] for i in range(len(fold_stances)) if i != fold]))
        y_train = np.hstack(
            tuple([ys[i] for i in range(len(fold_stances)) if i != fold]))
        id_test = ids[fold]
        X_test = Xs[fold]
        y_test = ys[fold]
        clf = GradientBoostingClassifier(n_estimators=200,
                                         random_state=14128,
                                         verbose=True)
        clf.fit(X_train, y_train)
        predicted_test = [LABELS[int(a)] for a in clf.predict(X_test)]
        actual_test = [LABELS[int(a)] for a in y_test]
        for i in range(len(actual_test)):
            d.stances[id_test[i]]['Predict'] = actual_test[i]  # Data is known
        # Relative FNC score: achieved / maximum attainable.
        fold_score, _ = score_submission(actual_test, predicted_test)
        max_fold_score, _ = score_submission(actual_test, actual_test)
        score = fold_score / max_fold_score
        print("Score for fold " + str(fold) + " was - " + str(score))
        if score > best_score:
            best_score = score
            best_fold = clf
    #Run on Holdout set and report the final score on the holdout set
    predicted_hold = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
    for i in range(len(predicted_hold)):
        d.stances[id_holdout[i]]['Predict'] = predicted_hold[
            i]  # Data is unknown
    #Run on competition dataset
    predicted_comp = [LABELS[int(a)] for a in best_fold.predict(X_comp)]
    actual_comp = [LABELS[int(a)] for a in y_comp]
    for i in range(len(actual_comp)):
        competition_dataset.stances[id_comp[i]]['Predict'] = predicted_comp[
            i]  # Data is unknown
    return id_holdout
# Fragment (enclosing fold loop not visible — references outer `fold`,
# `clf`, `clf2`, `predicted`, `actual`, `best_score`): re-predicts the
# samples stage 1 assigned class 0 with the stage-2 classifier, splices
# them into the final predictions, and tracks the best stage-1 model.
init_pred[fold] = [
    int(a) for a in clf.predict(X_baseline[fold])
]
# Indices where stage 1 predicted class 0 — only these get re-predicted.
init_pred_ind[fold] = [
    i for i, e in enumerate(init_pred[fold]) if e == 0
]
Xcs_temp = [Xcs[fold][x] for x in init_pred_ind[fold]]
predicted_new = [
    LABELS[int(a)] for a in clf2.predict(Xcs_temp)
]
# NOTE(review): this aliases `predicted` — the loop below mutates
# `predicted` in place through `final`. Confirm that is intended.
final = predicted
for i, e in enumerate(init_pred_ind[fold]):
    final[e] = predicted_new[i]
fold_score, _ = score_submission(actual, final)
max_fold_score, _ = score_submission(actual, actual)
score = fold_score / max_fold_score
#for f in ids:
#    init_pred[f] = [int(a) for a in clf.predict(Xcs[f])]
#    init_pred_ind[f] = [i for i,e in enumerate(init_pred[f]) if e==0]
#    fold_stances_new[f] = [fold_stances[1][x]['Stance'] for x in init_pred_ind[f]]
#    Xcs_new[f],Xhs_new[f],Xbs_new[f],ys_new[f]=generate_features(fold_stances_new[f],d,str(f),model,binary=False)
print("Score for fold " + str(fold) + " was - " + str(score))
if score > best_score:
    best_score = score
    best_fold1 = clf
# Fragment (enclosing loop not visible — references outer `t`, `ys`,
# `index`, `Xs`, `best_score`, `p`): trains a default
# GradientBoostingClassifier on all folds but `index`, scores it, and
# tracks the best model/predictions for the current possibility `p`.
trains = np.vstack(t)
# Copy the label dict and drop the held-out fold's labels.
y_train = dict(ys)
del y_train[index]
trainsy = np.hstack(tuple([y_train[i] for i in y_train]))
clf = GradientBoostingClassifier()
clf.fit(trains, trainsy)
X_test = Xs[index]
y_test = ys[index]
predicted = [a for a in clf.predict(X_test)]
actual = [a for a in y_test]
# NOTE(review): unlike the other variants in this file, score_submission's
# return is NOT unpacked here before the division below — confirm that the
# score_submission in scope returns a plain number, not a tuple.
predicted_score = score_submission(actual, predicted)
max_fold_score = score_submission(actual, actual)
score = predicted_score / max_fold_score
if score > best_score:
    best_score = score
    best_fold = clf
    best_predicted = predicted
    best_actual = actual
index += 1
#print the possibility's string format
print(p)
#print the possibility's best-fold score and the holdout score
# Fragment (enclosing fold loop not visible — references outer `fold`,
# `folds`, `Xs`, `ys`, `best_score`): trains a gradient-boosting
# classifier on all folds but the current one, scores it, keeps the best
# model, then predicts on the holdout set. The fragment ends before the
# holdout score is reported.
ids = list(range(len(folds)))
del ids[fold]  # train on every fold except the held-out one
X_train = np.vstack(tuple([Xs[i] for i in ids]))
y_train = np.hstack(tuple([ys[i] for i in ids]))
X_test = Xs[fold]
y_test = ys[fold]
clf = GradientBoostingClassifier(n_estimators=200,
                                 random_state=14128,
                                 verbose=True)
clf.fit(X_train, y_train)
predicted = [LABELS[int(a)] for a in clf.predict(X_test)]
actual = [LABELS[int(a)] for a in y_test]
# Relative FNC score: achieved / maximum attainable.
fold_score, _ = score_submission(actual, predicted)
max_fold_score, _ = score_submission(actual, actual)
score = fold_score/max_fold_score
print("Score for fold "+ str(fold) + " was - " + str(score))
if score > best_score:
    best_score = score
    best_fold = clf
#Run on Holdout set and report the final score on the holdout set
predicted = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
actual = [LABELS[int(a)] for a in y_holdout]