Example #1
# sklearn imports needed by this excerpt; project-level names
# (ALL_TRAIN_IDS, FEATS, SMALL_PARAM_GRID, DatasetRelevanceClassifier,
# correlation) are assumed to be defined elsewhere in the module.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR


def run_experiment(frame,
                   test_ids,
                   train_ids=ALL_TRAIN_IDS,
                   feats=FEATS,
                   svm_grid=SMALL_PARAM_GRID,
                   cv=5,
                   n_jobs=1,
                   **drc_args):
    # group rows into one feature matrix per dataset id
    keyed_train_data = {}

    for data_id in train_ids:
        m = frame[frame.data_id == data_id][feats].to_numpy()
        keyed_train_data[data_id] = m

    keyed_test_data = {}

    for data_id in test_ids:
        m = frame[frame.data_id == data_id][feats].to_numpy()
        keyed_test_data[data_id] = m

    # learn which training datasets are relevant to a given test set
    drc = DatasetRelevanceClassifier(**drc_args)
    drc.fit(keyed_train_data)

    result = []

    for data_id, X in keyed_test_data.items():
        train_sets = drc.predict(X)

        x_train = frame[frame.data_id.isin(train_sets)][feats].to_numpy()
        y_train = frame[frame.data_id.isin(train_sets)]['gs'].values
        x_test = frame[frame.data_set == 'STS2013-test']
        x_test = x_test[x_test.data_id == data_id][feats].to_numpy()
        y_test = frame[frame.data_set == 'STS2013-test']
        y_test = y_test[y_test.data_id == data_id]['gs'].values

        # reserve second half of test sets for final evaluation
        x_test = x_test[0:len(y_test) // 2, :]
        y_test = y_test[0:len(y_test) // 2]

        # tune SVR hyperparameters with cross-validated grid search
        if svm_grid:
            grid = GridSearchCV(SVR(),
                                svm_grid,
                                cv=cv,
                                verbose=1,
                                n_jobs=n_jobs)
            grid.fit(x_train, y_train)

            best_params = grid.best_params_
        else:
            best_params = {}

        model = SVR(**best_params)
        model.fit(x_train, y_train)

        pred = model.predict(x_test)
        score = correlation(y_test, pred)

        result.append((data_id, score, train_sets, best_params))

    return result
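
The constants and helpers referenced above (ALL_TRAIN_IDS, FEATS, SMALL_PARAM_GRID, DatasetRelevanceClassifier, correlation) come from the surrounding project. A minimal sketch of how run_experiment might be invoked, assuming a frame with data_id, data_set, gs, and feature columns; the path, test ids, and grid below are illustrative stand-ins, not values from the source:

import pandas as pd

frame = pd.read_csv('sts_features.csv')  # hypothetical path to the feature table

# illustrative stand-in for SMALL_PARAM_GRID
svm_grid = {'C': [0.1, 1.0, 10.0],
            'epsilon': [0.01, 0.1, 0.5]}

# extra keyword arguments are forwarded to DatasetRelevanceClassifier
results = run_experiment(frame,
                         test_ids=['FNWN', 'headlines'],  # hypothetical ids
                         svm_grid=svm_grid,
                         n_jobs=-1,
                         method='k-means',
                         n_clusters=3)

for data_id, score, train_sets, best_params in results:
    print(data_id, score, train_sets, best_params)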
Example #2
    # Held out eval
    print("Held out evaluation scores")
    keyed_train_data = {}

    for data_id in ALL_TRAIN_IDS:
        m = df[df.data_id == data_id][feats].to_numpy()
        keyed_train_data[data_id] = m

    keyed_test_data = {}

    for data_id in ALL_TEST_IDS:
        m = df[df.data_id == data_id][feats].to_numpy()
        keyed_test_data[data_id] = m

    drc = DatasetRelevanceClassifier(method='k-means',
                                     selection='top',
                                     representative='medoid',
                                     n_clusters=3)
    drc.fit(keyed_train_data)

    scores = []

    for data_id, X in keyed_test_data.items():
        train_sets = drc.predict(X)

        x_train = df[df.data_id.isin(train_sets)][feats].to_numpy()
        y_train = df[df.data_id.isin(train_sets)]['gs'].values
        x_test = df[df.data_set == 'STS2013-test']
        x_test = x_test[x_test.data_id == data_id][feats].to_numpy()
        y_test = df[df.data_set == 'STS2013-test']
        y_test = y_test[y_test.data_id == data_id]['gs'].values

        # use second half of test sets for final evaluation
        # (run_experiment above develops on the first half)
        x_test = x_test[len(y_test) // 2:, :]
        y_test = y_test[len(y_test) // 2:]

        # the remainder mirrors run_experiment: fit an SVR on the
        # selected training sets and score the held-out half
        model = SVR()
        model.fit(x_train, y_train)

        pred = model.predict(x_test)
        scores.append((data_id, correlation(y_test, pred)))
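
Both excerpts score predictions with a project-level correlation helper. A minimal stand-in, assuming it computes the Pearson correlation conventionally used for STS evaluation:

from scipy.stats import pearsonr


def correlation(y_true, y_pred):
    # assumed implementation: Pearson r between gold scores and predictions
    return pearsonr(y_true, y_pred)[0]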