# Evaluate each feature in isolation: for every feature, fit the regressor on
# each training set and print the correlation obtained on the paired test set
# as one row of a tab-separated table.
import sys

# Used with default settings here. However, in the real DKPro system, its
# settings were probably optimized by a CV grid search on the training data.
regressor = LinearRegression()

# TODO: this approach is inefficient, because the features are read from file
# again for every (feature, train/test pair) combination.

# Output goes through sys.stdout.write instead of the print statement with a
# trailing comma: every cell already ends in a tab, so the bytes written are
# the same, and the code no longer depends on Python 2 print syntax.

# Header row: a 64-character label column followed by one right-aligned,
# 16-character column per pair, labelled with the pair's second element.
sys.stdout.write("{:64s}\t".format("Features:"))
sys.stdout.write(
    "\t".join(["{:>16s}".format(p[1]) for p in id_pairs]) + "\n")

for feat in feats:
    # Row label: the name of the single feature under evaluation.
    sys.stdout.write("{:64s}\t".format(feat))

    for train_id, test_id in id_pairs:
        # Fit on the training data restricted to this one feature.
        train_feat, train_scores = read_train_data(train_id, [feat])
        regressor.fit(train_feat, train_scores)

        # Predict scores for the matching test data.
        test_feat, test_scores = read_test_data(test_id, [feat])
        sys_scores = regressor.predict(test_feat)

        # NOTE(review): the return value of postprocess is ignored, so it
        # presumably adjusts sys_scores in place - confirm against its
        # definition.
        sys_input = read_system_input(test_input_fnames[test_id])
        postprocess(sys_input, sys_scores)

        # NOTE(review): the joined id is not used again in this fragment.
        if isinstance(train_id, tuple):
            train_id = "+".join(train_id)

        # One cell per pair, written as soon as it is computed.
        sys.stdout.write(
            "{:16.2f}\t".format(correlation(sys_scores, test_scores)))

    # Terminate the row.
    sys.stdout.write("\n")
from sts.score import correlation

# Fit a support vector regressor on the STS2012 MSRpar training arrays and
# report the correlation of its post-processed predictions with the gold
# standard of the MSRpar test set.

train = np.load("_npz_data/_STS2012.train.MSRpar.npz")

svr_params = dict(kernel='rbf', C=50, epsilon=.2, gamma=.02)
clf = SVR(**svr_params)
print(clf)

X_train, y_train = train["X"], train["y"]
clf.fit(X_train, y_train)
#print clf.score(train["X"], train["y"])

test = np.load("_npz_data/_STS2012.test.MSRpar.npz")
#print clf.score(test["X"], test["y"])
X_test = test["X"]
sys_scores = clf.predict(X_test)

# postprocess
sys_inp = read_system_input("../../data/STS2012-test/STS.input.MSRpar.txt")
# pairs whose two sentences are equal get the maximum score ...
identical = sys_inp["s1"] == sys_inp["s2"]
sys_scores[identical] = 5.0
# ... and every prediction is limited to the range [0.0, 5.0], in place
np.clip(sys_scores, 0.0, 5.0, out=sys_scores)

# compute correlation score
gold = read_gold_standard("../../data/STS2012-test/STS.gs.MSRpar.txt")
gold_scores = gold["gold"]
print(correlation(gold_scores, sys_scores))

#from sklearn.cross_validation import KFold
#from sklearn.grid_search import GridSearchCV

#C_range = 10.0 ** np.arange(-2, 9)
#gamma_range = 10.0 ** np.arange(-5, 4)
#param_grid = dict(gamma=gamma_range, C=C_range)
test_ids, "FNWN"), (train_ids, test_ids, "OnWN") ] feats = takelab_feats + takelab_lsa_feats + subsem_best_feats scores = [] X_sts12_train, y_sts12_train = read_train_data(train_ids, feats) X_sts12_test, y_sts12_test = read_test_data(test_ids, feats) X_train = vstack([X_sts12_train, X_sts12_test]) y_train = hstack([y_sts12_train, y_sts12_test]) test_input = [read_system_input(test_input_fnames[sts13_test_id]) for sts13_test_id in sts13.test_ids] test_input = concatenate(test_input) X_sts13, y_sts13 = sts13.read_test_data(sts13.test_ids, feats) X_sts13_val = X_sts13[0:X_sts13.shape[0]/2, :] X_sts13_held = X_sts13[X_sts13.shape[0]/2:, :] y_sts_val = y_sts13[0:len(y_sts13)/2] y_sts_held = y_sts13[len(y_sts13)/2:] test_input_val = test_input[0:len(test_input)/2] test_input_held = test_input[len(test_input)/2:] n_train = len(y_train) n_test = len(y_sts_val)
filenames = [] for sts12_train_id, sts12_test_id, sts13_test_id, sts14_test_id in id_pairs: # combine 2012, 2013 training and test data X_sts12_train, y_sts12_train = ntnu_sts12.read_train_data(sts12_train_id, feats) X_sts12_test, y_sts12_test = ntnu_sts12.read_test_data(sts12_test_id, feats) X_sts13_test, y_sts13_test = sts13.read_test_data(sts13_test_id, feats) X_train = np.vstack([X_sts12_train, X_sts12_test, X_sts13_test]) y_train = np.hstack([y_sts12_train, y_sts12_test, y_sts13_test]) regressor.fit(X_train, y_train) X_test = read_blind_test_data(sts14_test_id, feats) y_test = regressor.predict(X_test) test_input = read_system_input(test_input_fnames[sts14_test_id]) postprocess(test_input, y_test) fname = "{}/STS-en.output.{}.txt".format(out_dir, sts14_test_id) write_scores(fname, y_test) filenames.append(fname) descr_fname = "{}/STS-en-{}-{}.description.txt".format(out_dir, GROUP, APPROACH) open(descr_fname, "w").write(DESCRIPTION) filenames.append(descr_fname) filenames = " ".join(filenames) zipfile = "STS-en-{}-{}.zip".format(GROUP, APPROACH) call("zip -rv {} {}".format(zipfile, filenames),
from sts.io import read_system_input, read_gold_standard
from sts.score import correlation

# Fit a support vector regressor on the STS2012 MSRpar training arrays and
# report the correlation of its post-processed predictions with the gold
# standard of the MSRpar test set.

train = np.load("_npz_data/_STS2012.train.MSRpar.npz")

svr_params = dict(kernel='rbf', C=50, epsilon=.2, gamma=.02)
clf = SVR(**svr_params)
print(clf)

X_train, y_train = train["X"], train["y"]
clf.fit(X_train, y_train)
#print clf.score(train["X"], train["y"])

test = np.load("_npz_data/_STS2012.test.MSRpar.npz")
#print clf.score(test["X"], test["y"])
X_test = test["X"]
sys_scores = clf.predict(X_test)

# postprocess
sys_inp = read_system_input("../../data/STS2012-test/STS.input.MSRpar.txt")
# pairs whose two sentences are equal get the maximum score ...
identical = sys_inp["s1"] == sys_inp["s2"]
sys_scores[identical] = 5.0
# ... and every prediction is limited to the range [0.0, 5.0], in place
np.clip(sys_scores, 0.0, 5.0, out=sys_scores)

# compute correlation score
gold = read_gold_standard("../../data/STS2012-test/STS.gs.MSRpar.txt")
gold_scores = gold["gold"]
print(correlation(gold_scores, sys_scores))

#from sklearn.cross_validation import KFold
#from sklearn.grid_search import GridSearchCV

#C_range = 10.0 ** np.arange(-2, 9)
#gamma_range = 10.0 ** np.arange(-5, 4)
#param_grid = dict(gamma=gamma_range, C=C_range)
# Produce the submission files: one score file per STS 2013 test set, plus a
# description file, and build the name of the zip archive for them.
filenames = []

for sts12_train_id, sts12_test_id, sts13_test_id in id_pairs:
    # combine 2012 training and test data
    X_sts12_train, y_sts12_train = read_train_data(sts12_train_id, feats)
    X_sts12_test, y_sts12_test = read_test_data(sts12_test_id, feats)
    X_train = np.vstack([X_sts12_train, X_sts12_test])
    y_train = np.hstack([y_sts12_train, y_sts12_test])

    regressor.fit(X_train, y_train)

    # Predict scores for the blind test data of this test set.
    X_test = read_blind_test_data(sts13_test_id, feats)
    y_test = regressor.predict(X_test)

    # NOTE(review): the return value of postprocess is ignored, so it
    # presumably adjusts y_test in place - confirm against its definition.
    test_input = read_system_input(test_input_fnames[sts13_test_id])
    postprocess(test_input, y_test)

    fname = "{}/STScore.output.{}.txt".format(out_dir, sts13_test_id)
    write_scores(fname, y_test)
    filenames.append(fname)

descr_fname = "{}/STScore-{}-{}.description.txt".format(
    out_dir, GROUP, APPROACH)
# Use a context manager so the description file is flushed and closed as soon
# as it has been written.
with open(descr_fname, "w") as descr_file:
    descr_file.write(DESCRIPTION)
filenames.append(descr_fname)

# From here on, filenames is a single space-separated string.
filenames = " ".join(filenames)

zipfile = "STScore-{}-{}.zip".format(GROUP, APPROACH)