Пример #1
0
def run_experiment(frame,
                   test_ids,
                   train_ids=ALL_TRAIN_IDS,
                   feats=FEATS,
                   svm_grid=SMALL_PARAM_GRID,
                   cv=5,
                   n_jobs=1,
                   **drc_args):
    """Train and evaluate one SVR per test data set, choosing training
    sets with a DatasetRelevanceClassifier.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'data_id', 'data_set', 'gs' and every
        feature column named in `feats`.
    test_ids : iterable
        Ids of the data sets to evaluate on.
    train_ids : iterable
        Ids of the candidate training data sets.
    feats : list
        Feature column names.
    svm_grid : dict or falsy
        SVR parameter grid for GridSearchCV; falsy skips the search and
        uses default SVR parameters.
    cv : int
        Number of cross-validation folds for the grid search.
    n_jobs : int
        Parallel jobs for GridSearchCV.
    **drc_args
        Passed through to DatasetRelevanceClassifier.

    Returns
    -------
    list of (data_id, score, train_sets, best_params) tuples.
    """
    # Feature matrix per training data set, keyed by id; input for the
    # relevance classifier.
    keyed_train_data = {}
    for data_id in train_ids:
        keyed_train_data[data_id] = (
            frame[frame.data_id == data_id][feats].as_matrix())

    keyed_test_data = {}
    for data_id in test_ids:
        keyed_test_data[data_id] = (
            frame[frame.data_id == data_id][feats].as_matrix())

    drc = DatasetRelevanceClassifier(**drc_args)
    drc.fit(keyed_train_data)

    result = []

    for data_id, X in keyed_test_data.items():
        # Predict which training sets are relevant for this test set.
        train_sets = drc.predict(X)

        # Filter once and reuse instead of re-filtering for X and y.
        train_rows = frame[frame.data_id.isin(train_sets)]
        x_train = train_rows[feats].as_matrix()
        y_train = train_rows['gs'].values

        test_rows = frame[frame.data_set == 'STS2013-test']
        test_rows = test_rows[test_rows.data_id == data_id]
        x_test = test_rows[feats].as_matrix()
        y_test = test_rows['gs'].values

        # reserve second half of test sets for final evaluation;
        # floor division keeps the slice index an int on Python 3,
        # where plain / would yield a float and break the slice
        half = len(y_test) // 2
        x_test = x_test[0:half, :]
        y_test = y_test[0:half]

        if svm_grid:
            grid = GridSearchCV(SVR(),
                                svm_grid,
                                cv=cv,
                                verbose=1,
                                n_jobs=n_jobs)
            grid.fit(x_train, y_train)

            best_params = grid.best_params_
        else:
            best_params = {}

        model = SVR(**best_params)
        model.fit(x_train, y_train)

        pred = model.predict(x_test)
        score = correlation(y_test, pred)

        result.append((data_id, score, train_sets, best_params))

    return result
Пример #2
0
def run_experiment(frame, test_ids, train_ids=ALL_TRAIN_IDS, feats=FEATS, svm_grid=SMALL_PARAM_GRID,
                   cv=5, n_jobs=1, **drc_args):
    """Evaluate SVR models on `test_ids`, selecting training sets per test
    set with a DatasetRelevanceClassifier.

    For each test data set: predict the relevant training sets, optionally
    grid-search SVR parameters over `svm_grid`, fit the SVR, and score the
    correlation on the first half of the matching STS2013-test rows (the
    second half is reserved for final evaluation).

    Returns a list of (data_id, score, train_sets, best_params) tuples.
    """
    # Per-data-set feature matrices for fitting the relevance classifier.
    keyed_train_data = {}
    for data_id in train_ids:
        keyed_train_data[data_id] = frame[frame.data_id == data_id][feats].as_matrix()

    keyed_test_data = {}
    for data_id in test_ids:
        keyed_test_data[data_id] = frame[frame.data_id == data_id][feats].as_matrix()

    drc = DatasetRelevanceClassifier(**drc_args)
    drc.fit(keyed_train_data)

    result = []

    for data_id, X in keyed_test_data.items():
        # Training sets predicted as relevant for this test set.
        train_sets = drc.predict(X)

        x_train = frame[frame.data_id.isin(train_sets)][feats].as_matrix()
        y_train = frame[frame.data_id.isin(train_sets)]['gs'].values
        x_test = frame[frame.data_set == 'STS2013-test']
        x_test = x_test[x_test.data_id == data_id][feats].as_matrix()
        y_test = frame[frame.data_set == 'STS2013-test']
        y_test = y_test[y_test.data_id == data_id]['gs'].values

        # reserve second half of test sets for final evaluation;
        # bug fix: use // so the slice index stays an int under Python 3
        x_test = x_test[0:len(y_test) // 2, :]
        y_test = y_test[0:len(y_test) // 2]

        if svm_grid:
            grid = GridSearchCV(SVR(), svm_grid, cv=cv, verbose=1, n_jobs=n_jobs)
            grid.fit(x_train, y_train)

            best_params = grid.best_params_
        else:
            best_params = {}

        model = SVR(**best_params)
        model.fit(x_train, y_train)

        pred = model.predict(x_test)
        score = correlation(y_test, pred)

        result.append((data_id, score, train_sets, best_params))

    return result
Пример #3
0
# Per-feature evaluation: train a linear regression on one feature at a time
# and print its correlation with gold scores for every (train, test) pairing.
regressor = LinearRegression()
# Used with default settings here. However, in the real DKPro system, its
# settings were probably optimized by a CV gridsearch on the training data


# TODO: this approach is brain dead, because it keeps reading features from files

# Header row; the trailing comma keeps the Python 2 print on the same line.
print "{:64s}\t".format("Features:"),

print "\t".join(["{:>16s}".format(p[1]) for p in id_pairs])


for feat in feats:
    # Row label: the feature name.
    print "{:64s}\t".format(feat),
    
    for train_id, test_id in id_pairs:
        # Fit on this single feature only.
        train_feat, train_scores = read_train_data(train_id, [feat])
        regressor.fit(train_feat, train_scores)
        
        test_feat, test_scores = read_test_data(test_id, [feat])
        sys_scores = regressor.predict(test_feat)
        
        # Post-process predictions in place (presumably clipping to the
        # 0-5 score range, as done explicitly elsewhere in this file).
        sys_input = read_system_input(test_input_fnames[test_id])
        postprocess(sys_input,  sys_scores)
        
        # Combined training ids are tuples; join them for display.
        if isinstance(train_id, tuple):
            train_id = "+".join(train_id)
        
        # One table cell: correlation between system and gold scores.
        print "{:16.2f}\t".format(correlation(sys_scores, test_scores)),
    print
    
Пример #4
0
# Fit the model on the training matrix, then score the STS2012 MSRpar test
# set. NOTE(review): `clf` and `train` are defined outside this excerpt.
clf.fit(train["X"], train["y"])
#print clf.score(train["X"], train["y"])
    
test = np.load("_npz_data/_STS2012.test.MSRpar.npz")
#print clf.score(test["X"], test["y"])
sys_scores = clf.predict(test["X"])

# postprocess: identical sentence pairs get the maximum score, and all
# predictions are clipped to the valid [0, 5] range
sys_inp = read_system_input("../../data/STS2012-test/STS.input.MSRpar.txt")
sys_scores[sys_inp["s1"] == sys_inp["s2"]] = 5.0 
sys_scores[sys_scores > 5.0] = 5.0 
sys_scores[sys_scores < 0.0] = 0.0 

# compute correlation score against the gold standard
gold_scores = read_gold_standard("../../data/STS2012-test/STS.gs.MSRpar.txt")["gold"]
print correlation(gold_scores, sys_scores)


# Leftover grid-search experiment, kept commented out for reference:
#from sklearn.cross_validation import KFold
#from sklearn.grid_search import GridSearchCV
    
#C_range = 10.0 ** np.arange(-2, 9)
#gamma_range = 10.0 ** np.arange(-5, 4)
#param_grid = dict(gamma=gamma_range, C=C_range)
#cv = KFold(train["y"].size, k=3, shuffle=True)
#grid = GridSearchCV(SVR(kernel='rbf'), param_grid=param_grid, cv=cv)
#grid.fit(train["X"], train["y"])

#print("The best classifier is: ", grid.best_estimator_)
Пример #5
0
    #SVR(C=200, epsilon=0.5, gamma=0.02),
    #SVR(C=100, epsilon=0.2, gamma=0.02),
    #SVR(C=100, epsilon=0.2, gamma=0.02),
    #SVR(C=10,  epsilon=0.5, gamma=0.02)
    #]

# One default-parameter SVR per (train, test) pair.
regressors = [SVR() for i in range(5)]

# Running sum of per-pair correlations, for the mean printed at the end.
total = 0.0

for (train_id, test_id), regressor in zip(id_pairs, regressors):
    train_feat, train_scores = read_train_data(train_id, feats)
    regressor.fit(train_feat, train_scores)
    
    test_feat, test_scores = read_test_data(test_id, feats)
    sys_scores = regressor.predict(test_feat)
    
    # Post-process predictions in place (presumably clipping to the
    # 0-5 score range, as done explicitly elsewhere in this file).
    sys_input = read_system_input(test_input_fnames[test_id])
    postprocess(sys_input,  sys_scores)
    
    # Combined training ids are tuples; join them for display.
    if isinstance(train_id, tuple):
        train_id = "+".join(train_id)

    r = correlation(sys_scores, test_scores)
    total += r
    
    print "{:32s} {:32s} {:2.4f}".format(train_id, test_id, r)
        
print "{:2.4f}".format(total / len(id_pairs))

    
Пример #6
0
# Results table: one record per evaluated (train, test) combination.
results = np.recarray(0,
                      dtype=[("train_id", "S16"), ("test_id", "S16"),
                             ("min_diff", "f"), ("max_diff", "f"),
                             ("samples", "i"), ("score", "f")])

# fit regressor on training data
for train_id, rgr in regressors.items():
    rgr.fit(X_train[train_id], y_train[train_id])

# compute and report initial score on test data
for test_id, train_id in test_id2train_id.items():
    rgr = regressors[train_id]
    sys_scores = rgr.predict(X_test[test_id])
    postprocess(sys_input[test_id], sys_scores)
    r = correlation(sys_scores, y_test[test_id])
    # Number of training samples used for this regressor.
    n = X_train[train_id].shape[0]
    # Grow the record array by one row and fill it in
    # (min_diff and max_diff start at 0).
    results.resize(results.size + 1)
    if isinstance(train_id, tuple):
        train_id = "+".join(train_id)
    results[-1] = (train_id, test_id, 0, 0, n, r)
    print "{:32s} {:32s} {:>8d} {:8.4f}".format(train_id, test_id, n, r)

# score headlines
for train_id in hline_regressors:
    # TODO: full postprocessing
    scores = regressors[train_id].predict(X_hline)
    # Clip predictions to the valid [0, 5] score range.
    scores[scores < 0] = 0.0
    scores[scores > 5] = 5.0
    y_hline[train_id] = scores
Пример #7
0
    #SVR(C=200, epsilon=0.5, gamma=0.02),
    #SVR(C=100, epsilon=0.2, gamma=0.02),
    #SVR(C=100, epsilon=0.2, gamma=0.02),
    #SVR(C=10,  epsilon=0.5, gamma=0.02)
    #]

# One SVR with C=200 per (train, test) pair.
regressors = [SVR(C=200) for i in range(5)]

# Running sum of per-pair correlations, for the mean printed at the end.
total = 0.0


for (train_id, test_id), regressor in zip(id_pairs, regressors):
    train_feat, train_scores = read_train_data(train_id, feats)
    regressor.fit(train_feat, train_scores)
    
    test_feat, test_scores = read_test_data(test_id, feats)
    sys_scores = regressor.predict(test_feat)
    
    # Post-process predictions in place (presumably clipping to the
    # 0-5 score range, as done explicitly elsewhere in this file).
    sys_input = read_system_input(test_input_fnames[test_id])
    postprocess(sys_input,  sys_scores)
    
    # Combined training ids are tuples; join them for display.
    if isinstance(train_id, tuple):
        train_id = "+".join(train_id)
    
    r = correlation(sys_scores, test_scores)
    total += r
    
    print "{:32s} {:32s} {:2.4f}".format(train_id, test_id, r)
        
print "{:2.4f}".format(total / len(id_pairs))
Пример #8
0
def score_stub(true, pred):
    """Scorer helper: correlation of gold scores with post-processed predictions."""
    adjusted = postprocess(test_input_val, pred)
    return correlation(true, adjusted)
Пример #9
0
# All non-empty combinations of the three STS2012 training sets (as tuples),
# used to study which training mix works best for each test set.
comb_train_ids = [("MSRpar", ),
                  ("MSRpar", "MSRvid"), ("MSRpar", "SMTeuroparl"),
                  ("MSRpar", "MSRvid", "SMTeuroparl"), ("MSRvid", ),
                  ("MSRvid", "SMTeuroparl"), ("SMTeuroparl", )]

# Cartesian product: every training combination against every test set.
id_pairs = [(train_id, test_id) for test_id in test_ids
            for train_id in comb_train_ids]

# features to be used
feats = all_feats

# regressors (a single SVR instance, refit for each pair)
regressor = SVR()

for train_id, test_id in id_pairs:
    train_feat, train_scores = read_train_data(train_id, feats)
    regressor.fit(train_feat, train_scores)

    test_feat, test_scores = read_test_data(test_id, feats)
    sys_scores = regressor.predict(test_feat)

    # Post-process predictions in place (presumably clipping to the
    # 0-5 score range, as done explicitly elsewhere in this file).
    sys_input = read_system_input(test_input_fnames[test_id])
    postprocess(sys_input, sys_scores)

    # Combined training ids are tuples; join them for display.
    if isinstance(train_id, tuple):
        train_id = "+".join(train_id)

    print "{:32s}\t{:32s}\t{:2.2f}".format(
        train_id, test_id, correlation(sys_scores, test_scores))
Пример #10
0
                                ("max_diff", "f"),
                                ("samples", "i"),
                                ("iteration", "i"),
                                ("score", "f")])    

         
# fit regressor on training data
for train_id, rgr in regressors.items():
    rgr.fit(X_train[train_id], y_train[train_id])
    
# compute and report initial score on test data
for test_id, train_id in test_id2train_id.items():
    rgr = regressors[train_id]
    sys_scores = rgr.predict(X_test[test_id])
    postprocess(sys_input[test_id],  sys_scores)
    r = correlation(sys_scores, y_test[test_id])
    # Number of training samples used for this regressor.
    n = X_train[train_id].shape[0]
    # Grow the record array by one row and fill it in
    # (min_diff, max_diff and iteration start at 0).
    results.resize(results.size + 1)
    if isinstance(train_id, tuple):
        train_id = "+".join(train_id)
    results[-1] = (train_id, test_id, 0, 0, n, 0, r)
    print "{:32s} {:32s} {:>8d} {:8.4f}".format(train_id, test_id, n, r)
   
# score headlines
for train_id in hline_regressors:
    # TODO: full postprocessing
    scores = regressors[train_id].predict(X_hline)
    # Clip predictions to the valid [0, 5] score range.
    scores[scores < 0] = 0.0
    scores[scores > 5] = 5.0
    y_hline[train_id] = scores
Пример #11
0
import numpy as np
import matplotlib.pyplot as plt

from sts.io import read_gold_standard, read_system_output
from sts.score import correlation

# Takelab system

# One scatter plot of gold vs. system scores per STS2012 test set,
# saved to a PNG file and displayed.
datasets = ("MSRpar", "MSRvid", "SMTeuroparl", "surprise.OnWN", "surprise.SMTnews")

for dataset in datasets:
    figure = plt.figure()
    axes = figure.add_subplot(111)

    gold = read_gold_standard("../../data/STS2012-test/STS.gs.{}.txt".format(dataset))
    out = read_system_output("takelab-out/{}-output.txt".format(dataset.lower()))

    axes.plot(gold, out, ".")
    r = correlation(gold["gold"], out["output"])

    # Axes span slightly beyond the 0-5 score range for readability.
    axes.set_xlim(-0.5, 5.5)
    axes.set_ylim(-0.5, 5.5)
    axes.set_xlabel("Gold")
    axes.set_ylabel("System")
    axes.set_title("TakeLab.TST12.Test.{} (n={}, r={})".format(dataset, len(out), r))
    axes.grid(True)

    plt.savefig("scatter-takelab-tst12-test-{}.png".format(dataset))
    plt.show()
Пример #12
0
# Fit the model on the training matrix, then score the STS2012 MSRpar test
# set. NOTE(review): `clf` and `train` are defined outside this excerpt.
clf.fit(train["X"], train["y"])
#print clf.score(train["X"], train["y"])

test = np.load("_npz_data/_STS2012.test.MSRpar.npz")
#print clf.score(test["X"], test["y"])
sys_scores = clf.predict(test["X"])

# postprocess: identical sentence pairs get the maximum score, and all
# predictions are clipped to the valid [0, 5] range
sys_inp = read_system_input("../../data/STS2012-test/STS.input.MSRpar.txt")
sys_scores[sys_inp["s1"] == sys_inp["s2"]] = 5.0
sys_scores[sys_scores > 5.0] = 5.0
sys_scores[sys_scores < 0.0] = 0.0

# compute correlation score against the gold standard
gold_scores = read_gold_standard(
    "../../data/STS2012-test/STS.gs.MSRpar.txt")["gold"]
print correlation(gold_scores, sys_scores)

# Leftover grid-search experiment, kept commented out for reference:
#from sklearn.cross_validation import KFold
#from sklearn.grid_search import GridSearchCV

#C_range = 10.0 ** np.arange(-2, 9)
#gamma_range = 10.0 ** np.arange(-5, 4)
#param_grid = dict(gamma=gamma_range, C=C_range)
#cv = KFold(train["y"].size, k=3, shuffle=True)
#grid = GridSearchCV(SVR(kernel='rbf'), param_grid=param_grid, cv=cv)
#grid.fit(train["X"], train["y"])

#print("The best classifier is: ", grid.best_estimator_)
Пример #13
0
def score_stub(true, pred):
    """Return the correlation between gold scores and post-processed predictions."""
    cleaned = postprocess(test_input_val, pred)
    return correlation(true, cleaned)
Пример #14
0
def score_stub(true, pred):
    # Apply the standard post-processing to the predictions before scoring.
    processed = postprocess(test_input_val, pred)
    return correlation(true, processed)


# grid cv on test held: a single predefined CV split, where the training rows
# form the fold's train part and the STS13 validation rows form its test part
cv_split = [(range(n_train), range(n_train, n_train + n_test))]
grid = GridSearchCV(SVR(),
                    param_grid,
                    cv=cv_split,
                    verbose=1,
                    n_jobs=N_JOBS,
                    scoring=make_scorer(correlation))
grid.fit(vstack([X_train, X_sts13_val]), hstack([y_train, y_sts_val]))

held_cv_score = grid.best_score_
held_cv_params = grid.best_params_

print held_cv_score
print held_cv_params

# Refit on the training data only, with the best parameters found above.
regressor = SVR(**held_cv_params)
regressor.fit(X_train, y_train)

y_test = regressor.predict(X_sts13_held)
# y_test = postprocess(test_input_held,  y_test)

# Final held-out score: correlation of predictions with the gold scores.
held_cv_held_score = correlation(y_sts_held, y_test)

print held_cv_held_score
Пример #15
0
        # NOTE(review): fragment of a larger function -- df, feats, train_sets,
        # data_id and scores are defined outside this excerpt.
        x_train = df[df.data_id.isin(train_sets)][feats].as_matrix()
        y_train = df[df.data_id.isin(train_sets)]['gs'].values
        x_test = df[df.data_set == 'STS2013-test']
        x_test = x_test[x_test.data_id == data_id][feats].as_matrix()
        y_test = df[df.data_set == 'STS2013-test']
        y_test = y_test[y_test.data_id == data_id]['gs'].values

        # use second half of test sets for final evaluation
        # (integer division under Python 2; would need // on Python 3)
        x_test = x_test[len(y_test) / 2:, :]
        y_test = y_test[len(y_test) / 2:]

        model = SVR()
        model.fit(x_train, y_train)

        pred = model.predict(x_test)
        score = correlation(y_test, pred)

        scores.append(score)

        print "%s\t%.04f" % (data_id, score)

    print "Mean\t%.04f" % mean(scores)

    # 2014 predictions
    print "Data set predictions for STS2014-test"

    keyed_train_data = {}

    # Feature matrix per data set, keyed by id, for the 2014 predictions.
    for data_id in ALL_TRAIN_IDS + ALL_TEST_IDS:
        m = df[df.data_id == data_id][feats].as_matrix()
        keyed_train_data[data_id] = m
Пример #16
0
# Sizes of the training part and of the STS13 validation part; used below to
# build a single predefined train/test split for the grid search.
n_train = len(y_train)
n_test = len(y_sts_val)

param_grid = LARGE_PARAM_GRID

def score_stub(true, pred):
    # Correlation of gold scores with post-processed predictions.
    return correlation(true, postprocess(test_input_val, pred))

# grid cv on test held: one fixed split -- training rows vs. validation rows
cv_split = [(range(n_train), range(n_train, n_train + n_test))]
grid = GridSearchCV(SVR(), param_grid, cv=cv_split, verbose=1, n_jobs=N_JOBS,
                    scoring=make_scorer(correlation))
grid.fit(vstack([X_train, X_sts13_val]), hstack([y_train, y_sts_val]))

held_cv_score = grid.best_score_
held_cv_params = grid.best_params_

print held_cv_score
print held_cv_params

# Refit on the training data only, with the best parameters found above.
regressor = SVR(**held_cv_params)
regressor.fit(X_train, y_train)

y_test = regressor.predict(X_sts13_held)
# y_test = postprocess(test_input_held,  y_test)

# Final held-out score: correlation of predictions with the gold scores.
held_cv_held_score = correlation(y_sts_held, y_test)

print held_cv_held_score
Пример #17
0
        # NOTE(review): fragment of a larger function -- df, feats, train_sets,
        # data_id and scores are defined outside this excerpt.
        x_train = df[df.data_id.isin(train_sets)][feats].as_matrix()
        y_train = df[df.data_id.isin(train_sets)]['gs'].values
        x_test = df[df.data_set == 'STS2013-test']
        x_test = x_test[x_test.data_id == data_id][feats].as_matrix()
        y_test = df[df.data_set == 'STS2013-test']
        y_test = y_test[y_test.data_id == data_id]['gs'].values

        # use second half of test sets for final evaluation
        # (integer division under Python 2; would need // on Python 3)
        x_test = x_test[len(y_test)/2:, :]
        y_test = y_test[len(y_test)/2:]

        model = SVR()
        model.fit(x_train, y_train)

        pred = model.predict(x_test)
        score = correlation(y_test, pred)

        scores.append(score)

        print "%s\t%.04f" % (data_id, score)

    print "Mean\t%.04f" % mean(scores)

    # 2014 predictions
    print "Data set predictions for STS2014-test"

    keyed_train_data = {}

    # Feature matrix per data set, keyed by id, for the 2014 predictions.
    for data_id in ALL_TRAIN_IDS + ALL_TEST_IDS:
        m = df[df.data_id == data_id][feats].as_matrix()
        keyed_train_data[data_id] = m