def test_shuffle_list(self):
    """Fitting with several passes must leave the caller's list untouched."""
    # Dummy examples already written in VW text format.
    samples = ['1 |Pet cat', '-1 |Pet dog', '1 |Pet cat', '1 |Pet cat']
    snapshot = list(samples)

    # Multiple passes over the data, no conversion of the input.
    VWClassifier(passes=3, convert_to_vw=False).fit(samples)

    # The input list must still hold the same examples in the same order.
    assert samples == snapshot
    assert snapshot == ['1 |Pet cat', '-1 |Pet dog', '1 |Pet cat', '1 |Pet cat']
def test_decision_function(self, data):
    """Check VWClassifier.predict against thresholded raw VW predictions.

    A plain ``VW`` model with logistic loss is fitted on ``data.x`` /
    ``data.y``; its predictions are thresholded at zero and mapped onto the
    labels ``-1.`` / ``1.``. A default ``VWClassifier`` fitted on the same
    data is expected to predict the same labels.
    """
    classes = np.array([-1., 1.])

    raw_model = VW(loss_function='logistic')
    raw_model.fit(data.x, data.y)
    predictions = raw_model.predict(data.x)

    # ``np.int`` was only an alias of the builtin ``int`` and no longer
    # exists in recent NumPy releases, so use the builtin directly.
    class_indices = (predictions > 0).astype(int)
    class_predictions = classes[class_indices]

    model = VWClassifier()
    model.fit(data.x, data.y)
    assert np.allclose(class_predictions, model.predict(data.x))
def test_shuffle_pd_series(self):
    """Fitting on K-fold sub-samples of a pandas Series must not raise KeyError."""
    # Dummy examples already written in VW text format.
    series = pd.Series(
        ['1 |Pet cat', '-1 |Pet dog', '1 |Pet cat', '1 |Pet cat'],
        name='catdog',
    )
    splitter = KFold(n_splits=3, random_state=314, shuffle=True)

    for train_idx, _valid_idx in splitter.split(series):
        fold = series[train_idx]
        # Classifier with multiple passes over the data.
        classifier = VWClassifier(passes=3, convert_to_vw=False)
        # The fit on each fold is expected to complete without a KeyError.
        try:
            classifier.fit(fold)
        except KeyError:
            pytest.fail("Failed the fit over sub-sampled DataFrame")
class classifier_Vowpalwabbit(object):
    """Wrapper exposing a ``VWClassifier`` through ``learn`` / ``predict``."""

    def __init__(self):
        """Create the wrapper with a default-configured ``VWClassifier``."""
        self.name = "Vowpalwabbit"
        self.model = VWClassifier()
# ----------------------------------------------------------------------------------------------------------------------
    def maybe_reshape(self, X):
        """Return ``X`` as is when it is 2-D, otherwise reshape it.

        Non-2-D input is reshaped to ``(-1, X.shape[0])``; for a 1-D array
        this yields a single row holding all of its values.
        """
        if numpy.ndim(X) == 2:
            return X
        return numpy.reshape(X, (-1, X.shape[0]))
# ----------------------------------------------------------------------------------------------------------------------
    def learn(self, data_train, target_train):
        """Fit the wrapped model after mapping targets onto ``-1`` / ``+1``.

        Targets are cast to float; strictly positive values become ``1`` and
        all remaining values become ``-1``. ``target_train`` itself is not
        modified because ``astype`` returns a copy.
        """
        labels = target_train.astype(float)
        labels[labels > 0] = 1
        labels[labels <= 0] = -1
        # ``numpy.float`` was an alias of the builtin ``float`` and has been
        # removed from NumPy, so the builtin is used for the cast.
        self.model.fit(self.maybe_reshape(data_train).astype(float), labels)
# ----------------------------------------------------------------------------------------------------------------------
    def predict(self, array):
        """Return a two-column array of scores for ``array``.

        The first column is all zeros and the second column holds the
        output of the wrapped model's ``decision_function``.
        """
        features = self.maybe_reshape(array).astype(float)
        res = numpy.array(self.model.decision_function(features))
        #res = numpy.array(self.model.predict(xxx))
        return numpy.vstack((numpy.zeros_like(res), res)).T
# ----------------------------------------------------------------------------------------------------------------------
    def sanitycheck(self):
        """Fit, predict and score a fresh ``VWClassifier`` on synthetic data.

        The results are computed but neither returned nor printed; the
        method only exercises the calls.
        """
        X, y = datasets.make_hastie_10_2(n_samples=1000, random_state=1)
        X = X.astype(numpy.float32)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=256)
        model = VWClassifier()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        score_train = model.score(X_train, y_train)
        score_test = model.score(X_test, y_test)
# ----------------------------------------------------------------------------------------------------------------------
def sanitycheck(self):
    """Fit, predict and score a fresh VWClassifier on synthetic data.

    Nothing is returned; the computed predictions and scores are discarded.
    """
    features, labels = datasets.make_hastie_10_2(n_samples=1000, random_state=1)
    features = features.astype(numpy.float32)

    split = train_test_split(features, labels, test_size=0.2, random_state=256)
    train_x, test_x, train_y, test_y = split

    classifier = VWClassifier()
    classifier.fit(train_x, train_y)

    # Results are intentionally unused: the calls themselves are the check.
    classifier.predict(test_x)
    classifier.score(train_x, train_y)
    classifier.score(test_x, test_y)
# ----------------------------------------------------------------------------------------------------------------------
def test_repr():
    """Check ``__repr__`` of VW, VWClassifier and VWRegressor instances."""
    # (estimator class, constructor kwargs, expected repr)
    cases = (
        (VW, {},
         "VW('convert_labels:False', 'quiet:True', 'sgd:False')"),
        (VWClassifier, {},
         "VWClassifier('convert_labels:False', "
         "'loss_function:logistic', 'quiet:True', 'sgd:False')"),
        (VWRegressor, {},
         "VWRegressor('convert_labels:False', 'quiet:True', 'sgd:False')"),
        (VW,
         dict(convert_to_vw=False, oaa=3, loss_function='logistic',
              probabilities=True),
         "VW('convert_labels:False', 'loss_function:logistic', "
         "'oaa:3', 'probabilities:True', 'quiet:True', 'sgd:False')"),
    )
    # Build each estimator only when its turn comes, then compare.
    for estimator_cls, kwargs, expected in cases:
        model = estimator_cls(**kwargs)
        assert expected == model.__repr__()
def run_sklearn(train, test, model):
    """Cross-validate a scikit-learn style classifier on two corpora.

    The token streams of ``train.documents`` and ``test.documents`` are
    joined into whitespace-separated strings and TF-IDF vectorized with a
    vocabulary built from the training documents only. Train and test
    matrices are then stacked and evaluated with a 5-split ``ShuffleSplit``
    (``test_size=1000``, ``random_state=0``).

    Parameters
    ----------
    train, test : corpus objects
        Must be truthy and provide ``documents`` whose items expose
        ``tokens`` with a ``token`` attribute; they are also passed to
        ``get_targets``.
    model : str
        One of ``'naive'``, ``'svm'``, ``'knn'`` or ``'vowpal'``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, train_time, test_time, topic_time,
        anchor_time)``; the first five are means over the splits and the
        last two are always ``0``.

    Raises
    ------
    TypeError
        If ``model`` is not a string.
    AttributeError
        If ``model`` is empty, or ``train`` / ``test`` is falsy.
    KeyError
        If ``model`` is not one of the supported names.
    """
    # isinstance (rather than an exact type check) also accepts str subclasses.
    if not isinstance(model, str):
        raise TypeError('Non-str value invalid for parameter \'model\'')
    if not model:
        raise AttributeError('Empty string invalid for parameter \'model\'')
    if not train:
        raise AttributeError('None-type or empty object invalid for parameter \'train\'')
    if not test:
        raise AttributeError('None-type or empty object invalid for parameter \'test\'')

    sklearn_model_dict = {
        'naive': BernoulliNB(),
        'svm': LinearSVC(),
        'knn': KNeighborsClassifier(),
    }

    raw_train = [' '.join(str(t.token) for t in doc.tokens) for doc in train.documents]
    raw_test = [' '.join(str(t.token) for t in doc.tokens) for doc in test.documents]

    # Map every training token to a column index in first-seen order.
    voc = {}
    for doc in raw_train:
        for t in doc.split():
            voc.setdefault(t, len(voc))

    tfv = TfidfVectorizer(vocabulary=voc)
    train_matrix = tfv.fit_transform(raw_train)
    test_matrix = tfv.transform(raw_test)

    # Hacky, should fix this
    X = vstack([train_matrix, test_matrix])

    train_targets = get_targets(train)
    test_targets = get_targets(test)

    if model == 'vowpal':
        assert VWClassifier is not None
        sklearn_model_dict[model] = VWClassifier()
        # Vowpal Wabbit requires this format
        train_targets = [-1 if t == 0 else 1 for t in train_targets]
        test_targets = [-1 if t == 0 else 1 for t in test_targets]

    # Build a new list instead of extending in place, so the object returned
    # by get_targets(train) is never modified.
    Y = list(train_targets) + list(test_targets)

    sklearn_model = sklearn_model_dict[model]

    # random state 0 gives us consistency
    rs = ShuffleSplit(n_splits=5, test_size=1000, random_state=0)
    results = cross_validate(sklearn_model, X, Y, cv=rs, scoring=scoring_dict, error_score=0.0)

    accuracy = np.mean(results['test_accuracy'])
    precision = np.mean(results['test_precision'])
    recall = np.mean(results['test_recall'])
    train_time = np.mean(results['fit_time'])
    test_time = np.mean(results['score_time'])

    # We need to compare to ankura
    topic_time = 0
    anchor_time = 0

    return accuracy, precision, recall, train_time, test_time, topic_time, anchor_time
from vowpalwabbit.sklearn_vw import VWClassifier

# Four two-feature samples with -1 / +1 labels.
X = [[1, 2], [3, 4], [5, 6], [7, 8]]
y = [-1, -1, 1, 1]

# Logistic-loss classifier configured with the `l` and `l2` options.
model = VWClassifier(loss_function='logistic', l=0.01, l2=0.1)
model.fit(X, y)

# Print the predictions on the training samples, then the score on them.
predictions = model.predict(X)
print(predictions)
training_score = model.score(X, y)
print(training_score)
def test_repr(self):
    """A default VWClassifier must render the expected ``__repr__`` string."""
    expected = "VWClassifier(convert_labels: True, convert_to_vw: True, loss_function: logistic, passes: 1, quiet: True)"
    rendered = VWClassifier().__repr__()
    assert rendered == expected
def test_predict_proba(self, data):
    """Check the row count and first row of ``predict_proba`` on the fixture."""
    classifier = VWClassifier()
    classifier.fit(data.x, data.y)

    probabilities = classifier.predict_proba(data.x)
    first_row = probabilities[0]

    assert probabilities.shape[0] == 100
    assert np.allclose(first_row, [0.3997, 0.6003], atol=1e-4)
from vowpalwabbit.sklearn_vw import tovw from sklearn.model_selection import StratifiedKFold from sklearn.model_selection import KFold from sklearn.metrics import accuracy_score # In[ ]: thres = 0.5 # We will evaluate and compare several VW models as well as simple gender-based models models = { 'VW_passes1': VWClassifier(quiet=False, convert_to_vw=False, passes=1, link='logistic', pos_threshold=thres, random_seed=314), 'VW_passes2': VWClassifier(quiet=False, convert_to_vw=False, passes=2, link='logistic', pos_threshold=thres, random_seed=314), 'VW_passes2_l2001': VWClassifier(quiet=False, convert_to_vw=False, passes=2, link='logistic', pos_threshold=thres,
def __init__(self):
    """Set the display name and create a default-configured VWClassifier."""
    self.name = "Vowpalwabbit"
    self.model = VWClassifier()
def run(dtrain, dtest, opts, retrain=False, fold_num=5):
    """Train and evaluate vw on ``dtrain`` / ``dtest``.

    Parameters
    ----------
    dtrain, dtest : dataset objects
        ``dtrain.dtype`` must be ``'svm'``. ``dtrain`` also provides
        ``work_dir``, ``class_num`` and ``rand_path``.
    opts : dict
        Optional keys: ``'cv'`` (parameter grid used to select the learning
        rate ``l``) and ``'lambda'`` (iterable of ``l1`` values to sweep).
    retrain : bool
        When false, a learning rate cached in ``cv-vw.txt`` inside
        ``dtrain.work_dir`` is reused instead of re-running cross validation.
    fold_num : int
        Number of cross-validation folds.

    Returns
    -------
    tuple
        ``(sparsity_list, test_accu_list)`` when ``'lambda'`` is in ``opts``;
        otherwise ``(train_accu, train_time, test_accu, test_time)``.

    Raises
    ------
    Exception
        If ``dtrain.dtype`` is not ``'svm'``.
    """
    if dtrain.dtype != 'svm':
        raise Exception("vw only supports svm type data")

    best_l = 1.0
    if 'cv' in opts:
        cv_output_path = osp.join(dtrain.work_dir, 'cv-vw.txt')
        if os.path.exists(cv_output_path) and not retrain:
            with open(cv_output_path, 'r') as fh:
                line = fh.readline()
            # The cached value must land in best_l; it used to be stored in
            # an unused variable, so the cache was silently ignored.
            best_l = float(line.split('=')[1])
        else:
            # cross validation
            logging.info("cross validation")
            logging.info("loading %s" % (dtrain.rand_path()))
            x_train, y_train = datasets.load_svmlight_file(dtrain.rand_path())

            clf = GridSearchCV(estimator=VWClassifier(), param_grid=opts['cv'],
                               n_jobs=1, cv=fold_num, verbose=True)
            clf.fit(x_train, y_train)
            # best_params_ replaces the removed grid_scores_ attribute.
            best_l = clf.best_params_['l']

            with open(cv_output_path, 'w') as fh:
                fh.write('Best Result: l=%f' % (best_l))

        logging.info('cross validation parameters: l=%f' % (best_l))

    vw_data_path = dtrain.rand_path('svm') + '.vw'
    convert_to_vw(dtrain.rand_path('svm'), vw_data_path, dtrain.class_num)

    # Remove any stale vw cache left next to the converted data file.
    cache_path = vw_data_path + '.cache'
    if osp.exists(cache_path):
        os.remove(cache_path)

    if 'lambda' in opts:
        # sol
        sparsity_list = []
        test_accu_list = []
        readable_model_path = osp.join(dtrain.work_dir, 'vw.r.model')
        for l1 in opts['lambda']:
            train_accu, train_time = train(dtrain,
                                           model_params=[('learning_rate', best_l),
                                                         ('l1', l1)],
                                           cache=True,
                                           readable_path=readable_model_path)
            logging.info("training accuracy: %.4f" % (train_accu))
            logging.info("training time: %.4f seconds" % (train_time))

            test_accu, test_time = test(dtest, cache=True)
            logging.info("test accuracy: %.4f" % (test_accu))
            logging.info("test time: %.4f seconds" % (test_time))

            # parse sparsity
            sparsity = parse_sparsity(dtrain, readable_model_path, dtrain.class_num)
            sparsity_list.append(sparsity)
            test_accu_list.append(test_accu)
        return sparsity_list, test_accu_list
    else:
        train_accu, train_time = train(dtrain,
                                       model_params=[('learning_rate', best_l)])
        logging.info("training accuracy: %.4f" % (train_accu))
        logging.info("training time: %.4f seconds" % (train_time))

        test_accu, test_time = test(dtest)
        logging.info("test accuracy: %.4f" % (test_accu))
        logging.info("test time: %.4f seconds" % (test_time))

        return train_accu, train_time, test_accu, test_time
# Drop the first column of the metadata frame in place.
metadata.drop(metadata.columns[[0]], axis = 1, inplace = True)
print(seenMovie)
print(metadata)

print("Data loaded")
print(seenMovie.shape, '\t', metadata.shape)
seenMovie = seenMovie.astype('int')

# split train and test set
X_train, X_test, y_train, y_test = train_test_split(metadata, seenMovie, test_size=0.3, random_state=256)

# build VW logistic regression model
# LogLossVal: 0.013283467177640678
# Mean Square Error of the Log for the 1st model: 0.00018478019510039388
print('Train\n', y_train)
model = VWClassifier(loss_function='logistic')
model.fit(X_train, y_train)

# predict model
y_pred = model.predict_proba(X_test)
print(y_pred)
print("Training complete for model 1...")

print("starting LogLoss...")
# get log loss for the logistic model fitted above
# The `eps` keyword is no longer accepted by recent scikit-learn releases,
# and the other keywords that used to be passed were defaults, so the call
# relies on the defaults.
# NOTE(review): the probability clipping bound may differ slightly from the
# former explicit eps=1e-15 - confirm this is acceptable.
logLossVal = log_loss(y_test, y_pred)

# r2_score_nnls = r2_score(y_test, y_pred)
# print("NNLS R2 score", r2_score_nnls)

# mse_1 = calculateMeanSquareError(y_test, y_pred)
# m1_recall = recall_score(y_test, y_pred, average='binary', zero_division=0)
# m1_precision = precision_score(y_test, y_pred, average='binary', zero_division=0)
# accuracy_m1 = accuracy_score(y_test, y_pred);
def test_init(self):
    """Constructing VWClassifier with no arguments yields a VWClassifier."""
    instance = VWClassifier()
    assert isinstance(instance, VWClassifier)
# from vowpalwabbit import pyvw
#
# vw = pyvw.vw(quiet=True)
# ex = vw.example('1 | a b c')
# vw.learn(ex)
# vw.predict(ex)

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from vowpalwabbit.pyvw import vw
from vowpalwabbit.sklearn_vw import VWClassifier

# Generate a synthetic dataset and cast the features to float32.
hastie = datasets.make_hastie_10_2(n_samples=10000, random_state=1)
X, y = hastie
X = X.astype(np.float32)

# Hold out 20% of the samples for testing.
split = train_test_split(X, y, test_size=0.2, random_state=256)
X_train, X_test, y_train, y_test = split

# Fit a default VWClassifier on the training part.
model = VWClassifier()
model.fit(X_train, y_train)

# Predict on the held-out part and show the predictions.
y_pred = model.predict(X_test)
print(y_pred)

# Score both parts; the values are not stored or printed.
model.score(X_train, y_train)
model.score(X_test, y_test)
def test_decision_function(self, data):
    """Check the length and first value of ``decision_function`` on the fixture."""
    classifier = VWClassifier()
    classifier.fit(data.x, data.y)

    scores = classifier.decision_function(data.x)
    first_score = scores[0]

    assert scores.shape[0] == 100
    assert np.isclose(first_score, 0.4069, atol=1e-4)
text_filename = os.listdir('D:/cadec/text')[i] Ori_filename = os.listdir('D:/cadec/Ori')[i] objects.append( document('D:/cadec/text/' + text_filename, 'D:/cadec/Ori/' + Ori_filename, dic)) print(i) sel = (len(objects)) x, y, l = transform(objects[:sel], dic) train_cut = int(0.75 * (len(objects))) test_cut = (len(objects)) x_train, y_train, l_train = transform(objects[:train_cut], dic) x_test, y_test, l_test = transform(objects[train_cut:test_cut], dic) x, y = clean(x, y) x_train, y_train = clean(x_train, y_train) x_test, y_test = clean(x_test, y_test) print(x_test) print(y_test) print("completed") vecx = DictVectorizer(sparse=True) vecy = DictVectorizer(sparse=False) train_len = (x_train) test_len = (x_test) clf = VWClassifier() clf.fit(x_train, y_train) y_pred = clf.predict(x_test) print(len(y_pred)) print((y_pred)) print((y_test)) recall_str = classification_report(y_test, y_pred) print(recall_str)
def test_sgd_param():
    """``sgd=True`` must be reported by ``get_params`` for both estimators."""
    # Build both estimators first, then check each one's reported parameter.
    estimators = (VWRegressor(sgd=True), VWClassifier(sgd=True))
    for estimator in estimators:
        params = estimator.get_params()
        assert params['sgd'] == True  # noqa: E712
def train(dtrain,
          model_params=None,
          output_path=None,
          cv_params=None,
          fold_num=5,
          retrain=False,
          cv_process_num=1):
    """
    train a vw model

    Parameter
    ---------
    dtrain: DataSet
        the dataset used to train the model
    model_params: dict{param, val}
        model parameters; the mapping passed in is copied, never modified
    output_path: str
        path to save the model
    cv_params: dict{param, range}
        cross validation parameters
    fold_num: int
        number of folds to do cross validation
    retrain: bool
        whether to re-do the cross validation
    cv_process_num: int
        number of processes to do cross validation

    Return
    ------
    tuple(train accuracy, train time)

    The process exits with status 1 when the vw command fails.
    """
    # Work on a private copy: the learning rate is written into this mapping
    # below, which must not leak into the caller's dict or into a default
    # shared between calls.
    model_params = dict(model_params) if model_params else {}

    best_l = 1.0
    if cv_params is not None:
        cv_output_path = osp.join(dtrain.work_dir, 'cv-vw.txt')
        if osp.exists(cv_output_path) and not retrain:
            # Reuse the learning rate stored by a previous cross validation.
            with open(cv_output_path, 'r') as rfh:
                line = rfh.readline()
            best_l = float(line.split('=')[1])
        else:
            # cross validation
            logging.info("cross validation")
            logging.info("loading %s", dtrain.rand_path('svm'))
            xtrain, ytrain = datasets.load_svmlight_file(
                dtrain.rand_path('svm'))

            clf = GridSearchCV(estimator=VWClassifier(),
                               param_grid=cv_params,
                               n_jobs=cv_process_num,
                               cv=fold_num,
                               verbose=True)
            clf.fit(xtrain, ytrain)
            best_l = clf.best_params_['learning_rate']

            with open(cv_output_path, 'w') as wfh:
                wfh.write('Best Result: l=%f' % (best_l))

        logging.info('cross validation parameters: learning_rate=%f',
                     (best_l))
        # NOTE(review): the cross-validated rate overrides any
        # 'learning_rate' supplied by the caller - confirm this is intended.
        model_params['learning_rate'] = best_l

    cmd = vw_exe()
    if dtrain.class_num != 2:
        cmd += ' --oaa %d' % (dtrain.class_num)

    model_path = osp.join(dtrain.work_dir, 'vw.model')
    cmd += ' -f \"%s\"' % (model_path)
    if output_path is not None:
        cmd += ' --readable_model \"%s\"' % (output_path)

    # .items() works on both Python 2 and Python 3 dictionaries.
    for k, v in model_params.items():
        cmd += ' --%s \"%s\"' % (k, str(v))

    # Convert the data to vw format only if it has not been converted yet.
    vw_data_path = dtrain.rand_path('svm') + '.vw'
    if not osp.exists(vw_data_path):
        convert_to_vw(dtrain.rand_path('svm'), vw_data_path, dtrain.class_num)
    cmd += ' \"%s\"' % (vw_data_path)

    logging.info(cmd)

    start_time = time.time()
    if os.system(cmd) != 0:
        logging.error('call vw failed, vw in path?')
        # Exit with a non-zero status so the failure is visible to callers.
        sys.exit(1)
    train_time = time.time() - start_time

    # Training accuracy is measured by testing on the training set; that
    # time is added to the reported training time.
    train_accu, train_test_time = test(dtrain)
    train_time += train_test_time

    logging.info("training accuracy: %.4f", train_accu)
    logging.info("training time: %.4f seconds", train_time)

    return train_accu, train_time