Exemplo n.º 1
0
    def test_shuffle_list(self):
        """Fitting with several passes must leave the caller's list untouched."""
        # Four examples written directly in VW text format.
        expected = ['1 |Pet cat', '-1 |Pet dog', '1 |Pet cat', '1 |Pet cat']
        samples = list(expected)

        # Several passes over the data; the input is handed over as-is
        # (convert_to_vw=False).
        VWClassifier(passes=3, convert_to_vw=False).fit(samples)

        # Same items, same order as before the fit.
        assert samples == expected
Exemplo n.º 2
0
    def test_decision_function(self, data):
        """VWClassifier.predict must agree with thresholding raw VW predictions at 0.

        ``data`` is a fixture exposing feature rows as ``data.x`` and labels
        as ``data.y``.
        """
        classes = np.array([-1., 1.])
        raw_model = VW(loss_function='logistic')
        raw_model.fit(data.x, data.y)
        predictions = raw_model.predict(data.x)
        # Builtin ``int`` replaces the ``np.int`` alias, which NumPy deprecated
        # in 1.20 and removed in 1.24 (AttributeError on current releases).
        class_indices = (predictions > 0).astype(int)
        # Index 0 -> label -1, index 1 -> label +1.
        class_predictions = classes[class_indices]

        model = VWClassifier()
        model.fit(data.x, data.y)

        assert np.allclose(class_predictions, model.predict(data.x))
Exemplo n.º 3
0
    def test_decision_function(self, data):
        """VWClassifier.predict must agree with thresholding raw VW predictions at 0.

        ``data`` is a fixture exposing feature rows as ``data.x`` and labels
        as ``data.y``.
        """
        classes = np.array([-1., 1.])
        raw_model = VW(loss_function='logistic')
        raw_model.fit(data.x, data.y)
        predictions = raw_model.predict(data.x)
        # Builtin ``int`` replaces the ``np.int`` alias, which NumPy deprecated
        # in 1.20 and removed in 1.24 (AttributeError on current releases).
        class_indices = (predictions > 0).astype(int)
        # Index 0 -> label -1, index 1 -> label +1.
        class_predictions = classes[class_indices]

        model = VWClassifier()
        model.fit(data.x, data.y)

        assert np.allclose(class_predictions, model.predict(data.x))
Exemplo n.º 4
0
    def test_shuffle_pd_series(self):
        """Fitting on K-fold subsets of a pandas Series must not raise KeyError."""
        # Four examples written directly in VW text format.
        samples = pd.Series(
            ['1 |Pet cat', '-1 |Pet dog', '1 |Pet cat', '1 |Pet cat'],
            name='catdog')
        splitter = KFold(n_splits=3, random_state=314, shuffle=True)

        for train_idx, _ in splitter.split(samples):
            fold = samples[train_idx]
            # A fresh multi-pass classifier for every fold.
            model = VWClassifier(passes=3, convert_to_vw=False)
            # Only a KeyError during fit counts as a failure here.
            try:
                model.fit(fold)
            except KeyError:
                pytest.fail("Failed the fit over sub-sampled DataFrame")
class classifier_Vowpalwabbit(object):
    """Adapter giving a ``VWClassifier`` a ``learn``/``predict`` interface.

    ``name`` holds the display name "Vowpalwabbit" and ``model`` the wrapped
    ``VWClassifier`` built with default arguments.
    """

    def __init__(self):
        self.name = "Vowpalwabbit"
        self.model = VWClassifier()
    # ------------------------------------------------------------------------------------------------------------------

    def maybe_reshape(self, X):
        """Return ``X`` unchanged when it is 2-D, else reshape it to ``(-1, X.shape[0])``.

        A 1-D array of length ``n`` therefore becomes a single row ``(1, n)``.
        """
        if numpy.ndim(X) == 2:
            return X
        return numpy.reshape(X, (-1, X.shape[0]))
    # ------------------------------------------------------------------------------------------------------------------

    def learn(self, data_train, target_train):
        """Fit the wrapped model on ``data_train`` with targets mapped to -1/+1.

        ``target_train`` is copied by ``astype`` before relabelling, so the
        caller's array is left untouched. Values > 0 become 1, values <= 0
        become -1.
        """
        labels = target_train.astype(float)
        labels[labels > 0] = 1
        labels[labels <= 0] = -1
        # Builtin ``float`` replaces the ``numpy.float`` alias removed in
        # NumPy 1.24; both mean float64.
        features = self.maybe_reshape(data_train).astype(float)
        self.model.fit(features, labels)
    # ------------------------------------------------------------------------------------------------------------------

    def predict(self, array):
        """Return a two-column array: zeros, then the model's decision values.

        For a 1-D ``decision_function`` result of length ``n`` the output has
        shape ``(n, 2)``.
        """
        features = self.maybe_reshape(array).astype(float)

        res = numpy.array(self.model.decision_function(features))
        #res = numpy.array(self.model.predict(features))
        return numpy.vstack((numpy.zeros_like(res), res)).T
    # ------------------------------------------------------------------------------------------------------------------

    def sanitycheck(self):
        """Smoke-test a fresh ``VWClassifier`` on synthetic Hastie data.

        Fits on 80% of 1000 generated samples, then predicts and scores on
        both splits. Nothing is returned; the method only checks that the
        calls run. (This method used to be defined twice with identical
        bodies; the duplicate was dropped.)
        """
        X, y = datasets.make_hastie_10_2(n_samples=1000, random_state=1)
        X = X.astype(numpy.float32)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=256)

        model = VWClassifier()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        score_train = model.score(X_train, y_train)
        score_test = model.score(X_test, y_test)


# ----------------------------------------------------------------------------------------------------------------------
Exemplo n.º 7
0
def test_repr():
    """Check the repr strings of VW, VWClassifier and VWRegressor."""

    def check(model, expected):
        # Same comparison for every estimator under test.
        assert expected == repr(model)

    # Default-constructed estimators.
    check(VW(),
          "VW('convert_labels:False', 'quiet:True', 'sgd:False')")
    check(VWClassifier(),
          "VWClassifier('convert_labels:False', "
          "'loss_function:logistic', 'quiet:True', 'sgd:False')")
    check(VWRegressor(),
          "VWRegressor('convert_labels:False', 'quiet:True', 'sgd:False')")

    # Explicitly configured VW instance.
    configured = VW(convert_to_vw=False,
                    oaa=3,
                    loss_function='logistic',
                    probabilities=True)
    check(configured,
          "VW('convert_labels:False', 'loss_function:logistic', "
          "'oaa:3', 'probabilities:True', 'quiet:True', 'sgd:False')")
Exemplo n.º 8
0
def run_sklearn(train, test, model):
    """Cross-validate one classifier on the combined train and test corpora.

    Parameters
    ----------
    train, test
        Corpus objects. This function reads ``.documents`` (each document has
        ``.tokens`` whose items expose ``.token``) and passes the corpora to
        ``get_targets``.
    model : str
        One of 'naive', 'svm', 'knn' or 'vowpal'.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, train_time, test_time, topic_time,
        anchor_time)``. The first five are means over the shuffle splits; the
        last two are always 0.

    Raises
    ------
    TypeError
        If ``model`` is not a string.
    AttributeError
        If ``model`` is empty or ``train``/``test`` is falsy.
    KeyError
        If ``model`` names no known classifier.
    """
    if not isinstance(model, str):
        raise TypeError('Non-str value invalid for parameter \'model\'')
    if not model:
        raise AttributeError('Empty string invalid for parameter \'model\'')
    if not train:
        raise AttributeError('None-type or empty object invalid for parameter \'train\'')
    if not test:
        raise AttributeError('None-type or empty object invalid for parameter \'test\'')

    sklearn_model_dict = {
        'naive': BernoulliNB(),
        'svm': LinearSVC(),
        'knn': KNeighborsClassifier(),
    }
    # Fail fast with the same KeyError the later lookup would raise, instead
    # of vectorising both corpora first.
    if model != 'vowpal' and model not in sklearn_model_dict:
        raise KeyError(model)

    raw_train = [' '.join(str(t.token) for t in doc.tokens) for doc in train.documents]
    raw_test = [' '.join(str(t.token) for t in doc.tokens) for doc in test.documents]

    # Vocabulary indices follow first appearance in the training documents.
    voc = {}
    for doc in raw_train:
        for token in doc.split():
            voc.setdefault(token, len(voc))

    tfv = TfidfVectorizer(vocabulary=voc)
    train_matrix = tfv.fit_transform(raw_train)
    test_matrix = tfv.transform(raw_test)

    # Hacky, should fix this
    X = vstack([train_matrix, test_matrix])

    train_targets = get_targets(train)
    test_targets = get_targets(test)

    if model == 'vowpal':
        assert VWClassifier is not None
        sklearn_model_dict[model] = VWClassifier()

        # Vowpal Wabbit requires this format
        train_targets = [-1 if t == 0 else 1 for t in train_targets]
        test_targets = [-1 if t == 0 else 1 for t in test_targets]

    # Build a new list rather than extending in place, so the list returned
    # by get_targets is never mutated.
    Y = list(train_targets) + list(test_targets)

    sklearn_model = sklearn_model_dict[model]

    # random state 0 gives us consistency
    rs = ShuffleSplit(n_splits=5, test_size=1000, random_state=0)
    results = cross_validate(sklearn_model, X, Y, cv=rs, scoring=scoring_dict, error_score=0.0)

    accuracy = np.mean(results['test_accuracy'])
    precision = np.mean(results['test_precision'])
    recall = np.mean(results['test_recall'])

    train_time = np.mean(results['fit_time'])
    test_time = np.mean(results['score_time'])

    # We need to compare to ankura
    topic_time = 0
    anchor_time = 0

    return accuracy, precision, recall, train_time, test_time, topic_time, anchor_time
Exemplo n.º 9
0
from vowpalwabbit.sklearn_vw import VWClassifier

# Four two-feature rows: [1, 2], [3, 4], [5, 6], [7, 8].
X = [[first, first + 1] for first in (1, 3, 5, 7)]
# The first two rows are labelled -1, the last two +1.
y = [-1] * 2 + [1] * 2

# Logistic-loss classifier with explicit `l` and `l2` settings.
model = VWClassifier(loss_function='logistic', l=0.01, l2=0.1)
model.fit(X, y)

# Show the predictions for the training rows, then the model's score on them.
print(model.predict(X))
print(model.score(X, y))
Exemplo n.º 10
0
 def test_repr(self):
     """A default VWClassifier must render its parameters in this exact repr."""
     expected = ("VWClassifier(convert_labels: True, convert_to_vw: True, "
                 "loss_function: logistic, passes: 1, quiet: True)")
     assert expected == repr(VWClassifier())
Exemplo n.º 11
0
 def test_predict_proba(self, data):
     """predict_proba must return 100 rows, with a known first row."""
     clf = VWClassifier()
     clf.fit(data.x, data.y)
     proba = clf.predict_proba(data.x)
     # One row per input example.
     n_rows = proba.shape[0]
     assert n_rows == 100
     # Compare the first row against the expected values, within 1e-4.
     first_row = proba[0]
     assert np.allclose(first_row, [0.3997, 0.6003], atol=1e-4)
from vowpalwabbit.sklearn_vw import tovw

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold

from sklearn.metrics import accuracy_score

# In[ ]:

thres = 0.5
# We will evaluate and compare several VW models as well as simple gender-based models
models = {
    'VW_passes1':
    VWClassifier(quiet=False,
                 convert_to_vw=False,
                 passes=1,
                 link='logistic',
                 pos_threshold=thres,
                 random_seed=314),
    'VW_passes2':
    VWClassifier(quiet=False,
                 convert_to_vw=False,
                 passes=2,
                 link='logistic',
                 pos_threshold=thres,
                 random_seed=314),
    'VW_passes2_l2001':
    VWClassifier(quiet=False,
                 convert_to_vw=False,
                 passes=2,
                 link='logistic',
                 pos_threshold=thres,
Exemplo n.º 13
0
 def __init__(self):
     """Set the display name and create the wrapped default VWClassifier."""
     self.name = "Vowpalwabbit"
     self.model = VWClassifier()
Exemplo n.º 14
0
def run(dtrain, dtest, opts, retrain=False, fold_num=5):
    """Train and evaluate VW, optionally cross-validating the learning rate.

    Parameters
    ----------
    dtrain
        Training dataset; must have ``dtype == 'svm'``. Its ``work_dir``,
        ``rand_path()`` and ``class_num`` are used here.
    dtest
        Test dataset, passed on to ``test``.
    opts : dict
        Optional keys: 'cv' (a parameter grid for ``GridSearchCV`` that
        includes 'l') and 'lambda' (an iterable of l1 values to sweep).
    retrain : bool
        When true, redo cross validation even if a cached result exists.
    fold_num : int
        Number of cross-validation folds.

    Returns
    -------
    tuple
        ``(sparsity_list, test_accu_list)`` when 'lambda' is in ``opts``,
        otherwise ``(train_accu, train_time, test_accu, test_time)``.

    Raises
    ------
    Exception
        If ``dtrain.dtype`` is not 'svm'.
    """
    if dtrain.dtype != 'svm':
        raise Exception("vw only supports svm type data")

    best_l = 1.0
    if 'cv' in opts:
        cv_output_path = osp.join(dtrain.work_dir, 'cv-vw.txt')
        if os.path.exists(cv_output_path) and not retrain:
            with open(cv_output_path, 'r') as fh:
                line = fh.readline()
            # Fix: the cached value used to land in an unused local ``C``, so
            # a stored cross-validation result was silently ignored.
            best_l = float(line.split('=')[1])
        else:
            #cross validation
            logging.info("cross validation")
            logging.info("loading %s", dtrain.rand_path())
            x_train, y_train = datasets.load_svmlight_file(dtrain.rand_path())
            # The test set used to be loaded here as well but was never used
            # by the grid search, so that load was removed.

            clf = GridSearchCV(estimator=VWClassifier(),
                               param_grid=opts['cv'],
                               n_jobs=1,
                               cv=fold_num,
                               verbose=True)
            clf.fit(x_train, y_train)

            # ``grid_scores_`` no longer exists in current scikit-learn;
            # ``best_params_`` gives the parameters of the best-scoring
            # candidate directly.
            best_l = clf.best_params_['l']
            with open(cv_output_path, 'w') as fh:
                fh.write('Best Result: l=%f' % (best_l))

        logging.info('cross validation parameters: l=%f', best_l)

    vw_data_path = dtrain.rand_path('svm') + '.vw'
    convert_to_vw(dtrain.rand_path('svm'), vw_data_path, dtrain.class_num)
    # Remove any stale cache file left next to the regenerated VW data.
    cache_path = vw_data_path + '.cache'
    if osp.exists(cache_path):
        os.remove(cache_path)

    if 'lambda' in opts:
        #sol
        sparsity_list = []
        test_accu_list = []
        # The same readable-model path is reused for every l1 value.
        readable_model_path = osp.join(dtrain.work_dir, 'vw.r.model')
        for l1 in opts['lambda']:
            train_accu, train_time = train(dtrain,
                                           model_params=[('learning_rate',
                                                          best_l), ('l1', l1)],
                                           cache=True,
                                           readable_path=readable_model_path)
            logging.info("training accuracy: %.4f", train_accu)
            logging.info("training time: %.4f seconds", train_time)

            test_accu, test_time = test(dtest, cache=True)
            logging.info("test accuracy: %.4f", test_accu)
            logging.info("test time: %.4f seconds", test_time)
            #parse sparsity
            sparsity = parse_sparsity(dtrain, readable_model_path,
                                      dtrain.class_num)
            sparsity_list.append(sparsity)
            test_accu_list.append(test_accu)

        return sparsity_list, test_accu_list

    train_accu, train_time = train(dtrain,
                                   model_params=[('learning_rate', best_l)
                                                 ])
    logging.info("training accuracy: %.4f", train_accu)
    logging.info("training time: %.4f seconds", train_time)

    test_accu, test_time = test(dtest)
    logging.info("test accuracy: %.4f", test_accu)
    logging.info("test time: %.4f seconds", test_time)
    return train_accu, train_time, test_accu, test_time
Exemplo n.º 15
0
# Drop the first column of `metadata` in place (selected by position 0).
metadata.drop(metadata.columns[[0]], axis = 1, inplace = True)


print(seenMovie)
print(metadata)
print("Data loaded")
print(seenMovie.shape, '\t', metadata.shape)
seenMovie = seenMovie.astype('int')
# split train and test set (30% held out, fixed random_state)
X_train, X_test, y_train, y_test = train_test_split(metadata, seenMovie, test_size=0.3, random_state=256)

# build VW logistic regression model
# NOTE(review): the two figures below look like results recorded from an earlier run -- confirm.
# LogLossVal:  0.013283467177640678
# Mean Square Error of the Log for the 1st model:  0.00018478019510039388
print('Train\n', y_train)
model = VWClassifier(loss_function='logistic')
model.fit(X_train, y_train)
# predict class probabilities for the held-out rows
y_pred = model.predict_proba(X_test)
print(y_pred)
print("Training complete for model 1...")

print("starting LogLoss...")
# get log loss for the logistic regression model
# `eps` is no longer passed: scikit-learn deprecated that argument and later
# removed it, so passing it fails on current releases. The other keyword
# arguments were the defaults and are omitted.
logLossVal = log_loss(y_test, y_pred)
# r2_score_nnls = r2_score(y_test, y_pred)
# print("NNLS R2 score", r2_score_nnls)
# mse_1 = calculateMeanSquareError(y_test, y_pred)
# m1_recall = recall_score(y_test, y_pred, average='binary', zero_division=0)
# m1_precision = precision_score(y_test, y_pred, average='binary', zero_division=0)
# accuracy_m1 = accuracy_score(y_test, y_pred); 
Exemplo n.º 16
0
 def test_init(self):
     """A VWClassifier can be constructed with no arguments."""
     instance = VWClassifier()
     assert isinstance(instance, VWClassifier)
Exemplo n.º 17
0
# from vowpalwabbit import pyvw
#
# vw = pyvw.vw(quiet=True)
# ex = vw.example('1 | a b c')
# vw.learn(ex)
# vw.predict(ex)

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from vowpalwabbit.pyvw import vw
from vowpalwabbit.sklearn_vw import VWClassifier

# generate some data: 10000 synthetic samples, features cast to float32
X, y = datasets.make_hastie_10_2(n_samples=10000, random_state=1)
X = X.astype(np.float32)

# split train and test set (20% held out, fixed random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=256)

# build model
model = VWClassifier()
model.fit(X_train, y_train)

# predict model
y_pred = model.predict(X_test)
print(y_pred)
# evaluate model: print the scores on both splits; they used to be computed
# and silently discarded, which shows nothing when run as a script
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))
Exemplo n.º 18
0
 def test_decision_function(self, data):
     """decision_function must return 100 values, with a known first value."""
     clf = VWClassifier()
     clf.fit(data.x, data.y)
     scores = clf.decision_function(data.x)
     # One decision value per input example.
     assert scores.shape[0] == 100
     # Compare the first value against the expected one, within 1e-4.
     assert np.isclose(scores[0], 0.4069, atol=1e-4)
Exemplo n.º 19
0
    text_filename = os.listdir('D:/cadec/text')[i]
    Ori_filename = os.listdir('D:/cadec/Ori')[i]
    objects.append(
        document('D:/cadec/text/' + text_filename,
                 'D:/cadec/Ori/' + Ori_filename, dic))
    print(i)
# Transform every loaded document, then a 75% / 25% train/test split of them.
sel = len(objects)
x, y, l = transform(objects[:sel], dic)
train_cut = int(0.75 * len(objects))
test_cut = len(objects)
x_train, y_train, l_train = transform(objects[:train_cut], dic)
x_test, y_test, l_test = transform(objects[train_cut:test_cut], dic)

# Run every feature/label pair through clean().
x, y = clean(x, y)
x_train, y_train = clean(x_train, y_train)
x_test, y_test = clean(x_test, y_test)
print(x_test)
print(y_test)
print("completed")

# The two vectorizers are created here but not used in the lines that follow.
vecx = DictVectorizer(sparse=True)
vecy = DictVectorizer(sparse=False)
# Despite their names these hold the data itself, not a length.
train_len = x_train
test_len = x_test

# Fit VW on the training part and report on the held-out part.
clf = VWClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(len(y_pred))
print(y_pred)
print(y_test)
recall_str = classification_report(y_test, y_pred)
print(recall_str)
Exemplo n.º 20
0
def test_sgd_param():
    """``sgd=True`` given to the constructor must show up in ``get_params``."""
    regressor = VWRegressor(sgd=True)
    classifier = VWClassifier(sgd=True)
    # Same check for both estimators, regressor first.
    for estimator in (regressor, classifier):
        assert estimator.get_params()['sgd'] == True
Exemplo n.º 21
0
def train(dtrain,
          model_params=None,
          output_path=None,
          cv_params=None,
          fold_num=5,
          retrain=False,
          cv_process_num=1):
    """
    train a vw model

    Parameter
    ---------
    dtrain: DataSet
        the dataset used to train the model
    model_params: dict{param, val} or iterable of (param, val) pairs
        model parameters, each emitted as ``--param "val"`` on the vw command
        line; the argument itself is never modified
    output_path: str
        path to save the readable model (skipped when None)
    cv_params: dict{param, range}
        cross validation parameters; must include 'learning_rate'
    fold_num: int
        number of folds to do cross validation
    retrain: bool
        whether to re-do the cross validation
    cv_process_num: int
        number of processes to do cross validation

    Return
    ------
    tuple(train accuracy, train time)
        the time covers the vw run plus the evaluation on the training set

    Note
    ----
    Exits the process with status 1 when the vw command fails.
    """
    # Work on a private copy: the old ``{}`` default was shared across calls
    # and written to below, so one call's learning rate leaked into the next
    # (and into the caller's own dict).
    model_params = dict(model_params) if model_params else {}

    best_l = 1.0
    if cv_params is not None:
        cv_output_path = osp.join(dtrain.work_dir, 'cv-vw.txt')
        if osp.exists(cv_output_path) and retrain is False:
            # Reuse the cached result, stored as 'Best Result: l=<value>'.
            with open(cv_output_path, 'r') as rfh:
                line = rfh.readline()
            best_l = float(line.split('=')[1])
        else:
            #cross validation
            logging.info("cross validation")
            logging.info("loading %s", dtrain.rand_path('svm'))
            xtrain, ytrain = datasets.load_svmlight_file(
                dtrain.rand_path('svm'))

            clf = GridSearchCV(estimator=VWClassifier(),
                               param_grid=cv_params,
                               n_jobs=cv_process_num,
                               cv=fold_num,
                               verbose=True)
            clf.fit(xtrain, ytrain)

            best_l = clf.best_params_['learning_rate']
            with open(cv_output_path, 'w') as wfh:
                wfh.write('Best Result: l=%f' % (best_l))

        logging.info('cross validation parameters: learning_rate=%f', (best_l))

        model_params['learning_rate'] = best_l

    cmd = vw_exe()
    # Binary problems use plain vw; anything else is trained one-against-all.
    if dtrain.class_num != 2:
        cmd += ' --oaa %d' % (dtrain.class_num)

    model_path = osp.join(dtrain.work_dir, 'vw.model')
    cmd += ' -f \"%s\"' % (model_path)

    if output_path is not None:
        cmd += ' --readable_model \"%s\"' % (output_path)

    # ``items()`` works on Python 2 and 3; ``iteritems()`` is Python 2 only.
    for k, v in model_params.items():
        cmd += ' --%s \"%s\"' % (k, str(v))

    vw_data_path = dtrain.rand_path('svm') + '.vw'
    if osp.exists(vw_data_path) is False:
        convert_to_vw(dtrain.rand_path('svm'), vw_data_path, dtrain.class_num)

    cmd += ' \"%s\"' % (vw_data_path)

    logging.info(cmd)
    start_time = time.time()
    if os.system(cmd) != 0:
        logging.error('call vw failed, vw in path?')
        # A bare sys.exit() reports success (status 0); signal the failure.
        sys.exit(1)
    train_time = time.time() - start_time

    train_accu, train_test_time = test(dtrain)
    train_time += train_test_time

    logging.info("training accuracy: %.4f", train_accu)
    logging.info("training time: %.4f seconds", train_time)

    return train_accu, train_time