def test_keys_with_list_of_values():
    # No exception in creating and executing model with a key/list pair
    model = vw(quiet=True, q=["fa", "fb"])
    model.learn('1 | a b c')
    prediction = model.predict(' | a b c')
    assert isinstance(prediction, float)
    del model
def initialize(self, test, resume=False):
    if self.model_class == 'lookup':
        self.actor_model = {}
    elif self.model_class == 'vw_python':
        self.actor_model_path = self.base_folder_name + "/model.vw"
        if not test:
            if not resume:
                self.actor_model = pyvw.vw(quiet=True, l2=self.params['l2'],
                                           loss_function=self.params['loss_function'],
                                           holdout_off=True, f=self.actor_model_path,
                                           b=self.params['b'], lrq=self.params['lrq'],
                                           l=self.params['l'], k=True)
            else:
                self.actor_model = pyvw.vw("--quiet -f {0} -i {0}".format(self.actor_model_path))
        else:
            self.actor_model = pyvw.vw("--quiet -t -i {0}".format(self.actor_model_path))
def test_multilabel_prediction_type():
    model = vw(multilabel_oaa=4, quiet=True)
    model.learn('1 | a b c')
    assert model.get_prediction_type() == model.pMULTILABELS
    prediction = model.predict(' | a b c')
    assert isinstance(prediction, list)
    del model
def test_scalar_prediction_type():
    model = vw(quiet=True)
    model.learn('1 | a b c')
    assert model.get_prediction_type() == model.pSCALAR
    prediction = model.predict(' | a b c')
    assert isinstance(prediction, float)
    del model
def test_prob_prediction_type():
    model = vw(loss_function='logistic', csoaa_ldf='mc', probabilities=True, quiet=True)
    model.learn('1 | a b c')
    assert model.get_prediction_type() == model.pPROB
    prediction = model.predict(' | a b c')
    assert isinstance(prediction, float)
    del model
def test_action_probs_prediction_type():
    model = vw(cb_explore=2, ngram=2, quiet=True)
    model.learn('1 | a b c')
    assert model.get_prediction_type() == model.pACTION_PROBS
    prediction = model.predict(' | a b c')
    assert isinstance(prediction, list)
    del model
def test_action_scores_prediction_type():
    model = vw(loss_function='logistic', csoaa_ldf='m', quiet=True)
    model.learn('1 | a b c')
    assert model.get_prediction_type() == model.pMULTICLASS
    prediction = model.predict(' | a b c')
    assert isinstance(prediction, int)
    del model
def save_and_continue(self, thread_id, event):
    if self.epochs % 1000.0 == 0 and thread_id == 1:
        event.clear()
        print("saving model...")
        print("epochs: " + str(self.epochs))
        self.actor_model.finish()
        self.actor_model = pyvw.vw("--quiet --save_resume -f {0} -i {1}".format(
            self.actor_model_path, self.actor_model_path))
        event.set()
def test_multiclass_prediction_type():
    n = 3
    model = vw(loss_function='logistic', oaa=n, quiet=True)
    model.learn('1 | a b c')
    assert model.get_prediction_type() == model.pMULTICLASS
    prediction = model.predict(' | a b c')
    assert isinstance(prediction, int)
    del model
def test_action_scores_prediction_type():
    model = vw(loss_function="logistic", csoaa_ldf="m", quiet=True)
    multi_ex = [model.example("1:1 | a b c"), model.example("2:-1 | a b c")]
    model.learn(multi_ex)
    assert model.get_prediction_type() == model.pMULTICLASS
    multi_ex = [model.example("1 | a b c"), model.example("2 | a b c")]
    prediction = model.predict(multi_ex)
    assert isinstance(prediction, int)
    del model
def test_scalars_prediction_type():
    n = 3
    model = vw(loss_function='logistic', oaa=n, probabilities=True, quiet=True)
    model.learn('1 | a b c')
    assert model.get_prediction_type() == model.pSCALARS
    prediction = model.predict(' | a b c')
    assert isinstance(prediction, list)
    assert len(prediction) == n
    del model
def test_cost_sensitive_label():
    model = vw(csoaa=4, quiet=True)
    csl = pyvw.cost_sensitive_label(model.example('2:5 |'))
    assert csl.costs[0].label == 2
    assert csl.costs[0].wap_value == 0.0
    assert csl.costs[0].partial_prediction == 0.0
    assert csl.costs[0].cost == 5.0
    assert str(csl) == '2:5.0'
    del model
def test_cbandits_label():
    model = vw(cb=4, quiet=True)
    cbl = pyvw.cbandits_label(model.example('1:10:0.5 |'))
    assert cbl.costs[0].action == 1
    assert cbl.costs[0].probability == 0.5
    assert cbl.costs[0].partial_prediction == 0
    assert cbl.costs[0].cost == 10.0
    assert str(cbl) == '1:10.0:0.5'
    del model
def test_prob_prediction_type():
    model = vw(loss_function='logistic', csoaa_ldf='mc', probabilities=True, quiet=True)
    multi_ex = [model.example('1:0.2 | a b c'), model.example('2:0.8 | a b c')]
    model.learn(multi_ex)
    assert model.get_prediction_type() == model.pPROB
    multi_ex = [model.example('1 | a b c'), model.example('2 | a b c')]
    prediction = model.predict(multi_ex)
    assert isinstance(prediction, float)
    del model
def test_multiclass_label_example():
    n = 4
    model = pyvw.vw(loss_function="logistic", oaa=n, quiet=True)
    ex = model.example("1 | a b c d", 2)
    ml2 = pyvw.multiclass_label(ex)
    assert ml2.label == 1
    assert ml2.weight == 1.0
    assert ml2.prediction == 0
    assert str(ml2) == "1"
def test_action_scores_prediction_type():
    model = vw(loss_function='logistic', csoaa_ldf='m', quiet=True)
    multi_ex = [model.example('1:1 | a b c'), model.example('2:-1 | a b c')]
    model.learn(multi_ex)
    assert model.get_prediction_type() == model.pMULTICLASS
    multi_ex = [model.example('1 | a b c'), model.example('2 | a b c')]
    prediction = model.predict(multi_ex)
    assert isinstance(prediction, int)
    del model
def test_simple_label_example():
    vw_ex = vw(quiet=True)
    ex = vw_ex.example("1 |a two features |b more features here")
    sl2 = pyvw.simple_label(ex)
    assert sl2.label == 1.0
    assert sl2.weight == 1.0
    assert sl2.prediction == 0.0
    assert sl2.initial == 0.0
    assert str(sl2) == "1.0"
def train_lr(args):
    logger.info("Reading data")
    df = pd.read_csv(args.input, names=['query', 'category'])

    logger.info("Lemmatizing and preparing data")
    df['query_lem'] = df['query'].apply(lemmatize)
    # ensure that categories start from 1
    df.category = df.category + 1
    df['vw_train'] = df.category.astype(str) + ' | ' + df['query_lem'].values
    df['vw_test'] = '| ' + df['query_lem'].values
    category_count = len(df.category.unique())

    df_train, df_test = train_test_split(df, test_size=0.2,
                                         stratify=df.category.values,
                                         random_state=42)
    train_examples = list(df_train['vw_train'].values)
    test_examples = list(df_test['vw_train'].values)

    logger.info("Training LR model")
    vw_command = "--oaa {} --random_seed 17 --cache_file ./tmp1 -b 27 -f ./models/lr.vw ".format(
        category_count)
    logger.info(vw_command)
    vw = pyvw.vw(vw_command)
    for iteration in range(2):
        logger.info("Iteration %s", iteration)
        for i in range(len(train_examples)):
            vw.learn(train_examples[i])
    vw.finish()
    logger.info("Finished model training")

    logger.info("Calculating accuracy and F1 score on hold out data set")
    vw = pyvw.vw("-i ./models/lr.vw -t")
    pred = [vw.predict(sample) for sample in test_examples]
    logger.info("LR holdout accuracy score is %s",
                np.round(accuracy_score(df_test.category.values, pred), 2))
    logger.info("LR holdout F1 score is %s",
                np.round(f1_score(df_test.category.values, pred, average='weighted'), 2))
def _vw_run(args, data, predict_and_yield):
    vw = pyvw.vw(" ".join(args))
    log.info("Running: vw " + " ".join(args))
    for d in data:
        ex = vw.example((d.label or b"") + b" | " + d.features + b"\n")
        if predict_and_yield:
            yield vw.predict(ex, pylibvw.vw.lMulticlass), d.tag
        else:
            vw.learn(ex)
    vw.finish()
def _test_helper_save_load(vw_arg: str, num_iterations=2000, seed=10,
                           has_automl=False, log_filename=None):
    split = 1500
    before_save = num_iterations - split

    first_vw = pyvw.vw(arg_str=vw_arg)
    has_automl = "automl" in first_vw.get_enabled_reductions()
    sim = Simulator(seed=seed, has_automl=has_automl, debug_logfile=log_filename)
    # first chunk
    ctr = sim.run_simulation(first_vw, before_save, sim.users, sim.times_of_day,
                             sim.actions, sim.get_cost)

    # save
    model_file = "test_save_load.vw"
    first_vw.save(model_file)
    first_vw.finish()

    # reload in another instance
    other_vw = pyvw.vw(f"-i {model_file} {vw_arg}")  # todo remove vw_arg from here

    # continue
    ctr = sim.run_simulation(other_vw, split, sim.users, sim.times_of_day,
                             sim.actions, sim.get_cost, shift=before_save + 1)

    return ctr
def _vw_run(args, data, predict_and_yield):
    vw = pyvw.vw(' '.join(args))
    util.log.info('Running: vw ' + ' '.join(args))
    for d in data:
        ex = vw.example((d.label or b'') + b' | ' + d.features + b'\n')
        if predict_and_yield:
            yield vw.predict(ex, pylibvw.vw.lMulticlass), d.tag
        else:
            vw.learn(ex)
    vw.finish()
def test_ccb_single_slot_and_cb_equivalence_no_slot_features():
    # --- CCB
    ccb_model_file_name = "model_file_ccb_equiv.txt"
    ccb_workspace = pyvw.vw(quiet=True, ccb_explore_adf=True,
                            readable_model=ccb_model_file_name)

    ccb_ex = """
    ccb shared |User b
    ccb action |Action d
    ccb action |Action e
    ccb action |Action f
    ccb action |Action ff
    ccb action |Action fff
    ccb slot 4:1:0.2 |
    """
    ccb_workspace.learn(ccb_ex)
    ccb_workspace.finish()
    ccb_num_weights = count_weights_from_readable_model_file_for_equiv_test(
        ccb_model_file_name)

    # --- CB
    cb_model_file_name = "model_file_cb_equiv.txt"
    cb_workspace = pyvw.vw(quiet=True, cb_explore_adf=True,
                           readable_model=cb_model_file_name)

    cb_ex = """
    shared |User b
    |Action d
    |Action e
    |Action f
    |Action ff
    4:1:0.2 |Action fff
    """
    cb_workspace.learn(cb_ex)
    cb_workspace.finish()
    cb_num_weights = count_weights_from_readable_model_file_for_equiv_test(
        cb_model_file_name)

    assert ccb_num_weights == cb_num_weights
def get_vw(self):
    """Factory to create a vw instance on demand

    Returns
    -------
    pyvw.vw instance
    """
    if self.vw_ is None:
        self.vw_ = pyvw.vw(**self.params)
    return self.vw_
def test_get_weight_name():
    model = vw(quiet=True)
    model.learn("1 | a a b c |ns x")
    assert model.get_weight_from_name("a") != 0.
    assert model.get_weight_from_name("b") != 0.
    assert model.get_weight_from_name("b") == model.get_weight_from_name("c")
    assert model.get_weight_from_name("a") != model.get_weight_from_name("b")
    assert model.get_weight_from_name("x") == 0.
    assert model.get_weight_from_name("x", "ns") != 0.
    assert model.get_weight_from_name("x", "ns") == model.get_weight_from_name("b")
def test_multiclass_probabilities_label():
    n = 4
    model = pyvw.vw(loss_function="logistic", oaa=n, probabilities=True, quiet=True)
    ex = model.example("1 | a b c d", 2)
    model.learn(ex)
    mpl = pyvw.multiclass_probabilities_label(ex)
    assert str(mpl) == "1:0.25 2:0.25 3:0.25 4:0.25"
    mpl = pyvw.multiclass_probabilities_label([1, 2, 3], [0.4, 0.3, 0.3])
    assert str(mpl) == "1:0.4 2:0.3 3:0.3"
def test_vw_config_manager():
    expected_set = {'--quiet', '--loss_function logistic', '--save_resume',
                    '--data /root/vowpal_wabbit/test/train-sets/rcv1_small.dat'}

    vw = pyvw.vw(arg_str="--save_resume --loss_function logistic -d /root/vowpal_wabbit/test/train-sets/rcv1_small.dat --quiet")
    config = vw.get_config()
    cmd_str_list = helper_options_to_list_strings(config)
    assert set(cmd_str_list) == expected_set
    vw.finish()

    # do another iteration generating the cmd string from the output of previous
    new_args = " ".join(cmd_str_list)
    other_vw = pyvw.vw(new_args)
    new_config = other_vw.get_config()  # query the new instance, not the finished one
    new_cmd_str_list = helper_options_to_list_strings(new_config)
    assert set(new_cmd_str_list) == expected_set
    other_vw.finish()
def load(self, verify_on_load=True):
    """
    Loads the model file into memory (as a vw sub-process).
    Verifies the model first, then stops the process if its status is not active.

    Args:
        verify_on_load (bool): flag to call verify when loading a model
    """
    self.process = pyvw.vw(self.command)
    super(self.__class__, self).load(verify_on_load=verify_on_load)
def __init__(self, horizon, num_actions, policy,
             default_model="--power_t 0.0 -q la --quiet"):
    """
    Initialize variables to store basic information of the MDP and one Q model
    per step.

    Params:
        default_model: a default VW model to learn Q. You can try different
            settings to get the best model.
    """
    super().__init__(horizon, num_actions, policy)
    # We assume that each step is indexed from 1 to H (horizon)
    self.models = [pyvw.vw(default_model) for _ in range(self.horizon)]
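# A usage sketch (hypothetical helper, not from the source): each entry of
# self.models above is an ordinary VW regressor, so Q(state, action) at step h
# can be queried and updated with plain VW example strings. The "-q la" flag in
# default_model crosses a state namespace 'l' with an action namespace 'a'.
def q_example(state_features, action, target=None):
    label = "" if target is None else "{} ".format(target)
    return "{}|l {} |a a{}".format(label, " ".join(state_features), action)

# e.g., inside the agent for some step h:
#   q_val = self.models[h].predict(q_example(["f1:0.3", "f2:1.2"], action=2))
#   self.models[h].learn(q_example(["f1:0.3", "f2:1.2"], action=2, target=backup_value))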
def _create_model(self, project, initial_params={}):
    initial_params = initial_params.copy()  # don't mutate the original
    trainpath = os.path.join(self.datadir, self.TRAIN_FILE)
    initial_params['data'] = trainpath
    params = self._create_params(initial_params)
    if params.get('passes', 1) > 1:
        # need a cache file when there are multiple passes
        params.update({'cache': True, 'kill_cache': True})
    self.debug("model parameters: {}".format(params))
    self._model = pyvw.vw(**params)
    modelpath = os.path.join(self.datadir, self.MODEL_FILE)
    self._model.save(modelpath)
def test_vw_oml_problem_and_vanilla_vw(self):
    vw_oml_problem_args, vw_online_aml_problem = get_vw_tuning_problem()
    vanilla_vw = pyvw.vw(**vw_oml_problem_args["fixed_hp_config"])
    cumulative_loss_list = online_learning_loop(
        vw_online_aml_problem.max_iter_num,
        vw_online_aml_problem.vw_examples,
        vanilla_vw,
        loss_func=vw_oml_problem_args["fixed_hp_config"].get("loss_function", "squared"),
    )
    print("final average loss:",
          sum(cumulative_loss_list) / len(cumulative_loss_list))
def _create_bandit(self, num_actions, seed=None):
    # --epsilon: Epsilon-Greedy exploration
    # --cover: Online Cover exploration
    # --nn N: use sigmoidal feedforward network w/ N hidden units
    from vowpalwabbit import pyvw

    cmd = "--nn 16 --epsilon 0.1 --cover 3 --cb_explore {}".format(num_actions)
    if seed:
        cmd += " --random_seed {}".format(seed)
    bandit = pyvw.vw(cmd, quiet=True)
    return bandit
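# A minimal usage sketch (assumed driver code, not from the source): with
# --cb_explore the workspace consumes contextual-bandit lines of the form
# "action:cost:probability | features", and predict() returns a list of
# per-action probabilities (cf. pACTION_PROBS in the tests above).
bandit = pyvw.vw("--cb_explore 3 --epsilon 0.1 --quiet")
bandit.learn("1:0.5:0.8 | user_a time_morning")
pmf = bandit.predict("| user_a time_morning")  # list of probabilities, one per action
assert isinstance(pmf, list)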
def get_vw(self):
    """Factory to create a vw instance on demand

    Returns
    -------
    pyvw.vw instance
    """
    if self.vw_ is None:
        self.vw_ = vw(**self.params)

        # set label type
        self.label_type_ = self.vw_.get_label_type()
    return self.vw_
def test_example_features():
    vw_ex = vw(quiet=True)
    ex = vw_ex.example("1 |a two features |b more features here")
    ns = pyvw.namespace_id(ex, 1)
    assert ex.get_feature_id(ns, "a") == 127530
    ex.push_hashed_feature(ns, 1122)
    ex.push_features("x", [("c", 1.0), "d"])
    ex.push_feature(ns, 11000)
    assert ex.num_features_in("x") == 2
    assert ex.sum_feat_sq(ns) == 5.0
    ns2 = pyvw.namespace_id(ex, 2)
    ex.push_namespace(ns2)
    assert ex.pop_namespace()
def test_example_features():
    vw_ex = vw(quiet=True)
    ex = vw_ex.example('1 |a two features |b more features here')
    ns = pyvw.namespace_id(ex, 1)
    assert ex.get_feature_id(ns, 'a') == 127530
    ex.push_hashed_feature(ns, 1122)
    ex.push_features('x', [('c', 1.), 'd'])
    ex.push_feature(ns, 11000)
    assert ex.num_features_in('x') == 2
    assert ex.sum_feat_sq(ns) == 5.0
    ns2 = pyvw.namespace_id(ex, 2)
    ex.push_namespace(ns2)
    assert ex.pop_namespace()
def test_regressor_args():
    # load and parse external data file
    data_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             'resources', 'train.dat')
    model = vw(oaa=3, data=data_file, passes=30, c=True, k=True)
    assert model.predict('| feature1:2.5') == 1

    # update model in memory
    for _ in range(10):
        model.learn('3 | feature1:2.5')
    assert model.predict('| feature1:2.5') == 3

    # save model
    model.save('tmp.model')
    del model

    # load initial regressor and confirm updated prediction
    new_model = vw(i='tmp.model', quiet=True)
    assert new_model.predict('| feature1:2.5') == 3
    del new_model

    # clean up
    os.remove('{}.cache'.format(data_file))
    os.remove('tmp.model')
def model_pred(trn_tmp_xy, val_tmp_x, val_tmp_y, tst_x):
    param = ['-b 7 ' +
             '--link logistic ' +
             '--loss_function logistic ' +
             '-l 0.2 ' +
             '--l1 0 ' +
             '--l2 0 ' +
             '--holdout_off ' +
             '--total 32 ' +
             '-f vw.model ' +
             '--readable_model vw.readable.model']
    vw = pyvw.vw(*param)

    best_iter = 400
    for iteration in range(best_iter):
        for i in range(len(trn_tmp_xy)):
            vw.learn(trn_tmp_xy[i])
    vw.finish()

    vw = pyvw.vw("-i vw.model -t")
    pred_trn_tmp = [vw.predict(sample) for sample in val_tmp_x]
    pred_tst_tmp = [vw.predict(sample) for sample in tst_x]
    return pred_trn_tmp, pred_tst_tmp, best_iter
def _create_model(self, project):
    self.info('creating VW model (algorithm: {})'.format(self.algorithm))
    trainpath = os.path.join(self.datadir, self.TRAIN_FILE)
    params = self._create_params({
        'data': trainpath,
        self.algorithm: len(project.subjects)
    })
    if params.get('passes', 1) > 1:
        # need a cache file when there are multiple passes
        params.update({'cache': True, 'kill_cache': True})
    self.debug("model parameters: {}".format(params))
    self._model = pyvw.vw(**params)
    modelpath = os.path.join(self.datadir, self.MODEL_FILE)
    self._model.save(modelpath)
def initialize(self):
    if self._model is None:
        path = os.path.join(self.datadir, self.MODEL_FILE)
        if not os.path.exists(path):
            raise NotInitializedException(
                'model {} not found'.format(path),
                backend_id=self.backend_id)
        self.debug('loading VW model from {}'.format(path))
        params = self._create_params({'i': path, 'quiet': True})
        if 'passes' in params:
            # don't confuse the model with passes
            del params['passes']
        self.debug("model parameters: {}".format(params))
        self._model = pyvw.vw(**params)
        self.debug('loaded model {}'.format(str(self._model)))
def mini_vw(inputFile, numPasses, otherArgs):
    vw = pyvw.vw(otherArgs)
    for p in range(numPasses):
        print('pass', (p + 1))
        h = open(inputFile, 'r')
        for l in h.readlines():
            if learnFromStrings:  # module-level flag: learn from raw strings vs. parsed examples
                vw.learn(l)
            else:
                ex = vw.example(l)
                vw.learn(ex)
                ex.finish()
        h.close()
    vw.finish()
def test_namespace_id():
    vw_ex = vw(quiet=True)
    ex = vw_ex.example("1 |a two features |b more features here")
    nm1 = pyvw.namespace_id(ex, 0)
    nm2 = pyvw.namespace_id(ex, 1)
    nm3 = pyvw.namespace_id(ex, 2)

    assert nm1.id == 0
    assert nm1.ord_ns == 97
    assert nm1.ns == "a"
    assert nm2.id == 1
    assert nm2.ord_ns == 98
    assert nm2.ns == "b"
    assert nm3.id == 2
    assert nm3.ord_ns == 128
    assert nm3.ns == "\x80"  # represents the string of ord_ns
def test_prob_prediction_type():
    model = vw(loss_function='logistic', csoaa_ldf='mc', probabilities=True, quiet=True)
    multi_ex = [
        model.example('1:0.2 | a b c'),
        model.example('2:0.8 | a b c')
    ]
    model.learn(multi_ex)
    assert model.get_prediction_type() == model.pPROB
    multi_ex = [model.example('1 | a b c'), model.example('2 | a b c')]
    prediction = model.predict(multi_ex)
    assert isinstance(prediction, float)
    del model
def test_basic(self):
    vw = pyvw.vw(quiet=True)
    ex = vw.example('1 | a b c')
    vw.learn(ex)
    self.assertEqual(0.632030725479126, vw.predict(ex))
ds = config['DecisionService']
cache_folder = ds['CacheFolder']

for root, subdirs, files in os.walk(os.path.join(cache_folder, 'onlinetrainer')):
    print('looking at folder {0}'.format(root))
    model = None
    trackback = None
    for file in files:
        if file == 'model':
            model = os.path.join(root, file)
            continue
        if file == 'model.trackback':
            trackback = os.path.join(root, file)
            continue

    if model is None or trackback is None:
        continue

    print('looking at folder {0}'.format(root))
    with open(trackback, 'r') as f:
        first_line = f.readline()

    if first_line.startswith('modelid:'):
        continue

    vw = pyvw.vw("--quiet -i {0}".format(model))
    id = vw.get_id()
    del vw

    line_prepender(trackback, 'modelid: {0}\n'.format(id))
# (tail of WordAligner._run: pick the oracle spans, predict one, and record coverage)
        sortedSpans = []
        for s in spans:
            sortedSpans.append(s)
        sortedSpans.sort()
        oracle = []
        for id in range(len(sortedSpans)):
            if sortedSpans[id][0] > sortedSpans[0][0]:
                break
            oracle.append(sortedSpans[id][1])
        pred = self.sch.predict(examples=examples,
                                my_tag=i + 1,
                                oracle=oracle,
                                condition=[(i, 'p'), (i - 1, 'q')])
        self.vw.finish_example(examples)
        output.append(spans[pred][2])
        for j in spans[pred][2]:
            covered[j] = True
    return output


print('training LDF')
vw = pyvw.vw("--search 0 --csoaa_ldf m --search_task hook --ring_size 1024 --quiet -q ef -q ep")
task = vw.init_search_task(WordAligner)
for p in range(10):
    task.learn(my_dataset)

print('====== test ======')
print(task.predict(("the blue flower".split(), ([], [], []), "la fleur bleue".split())))
print('should have printed [[0], [2], [1]]')
from vowpalwabbit import pyvw

vw = pyvw.vw('--audit')

full = vw.example({'a': ['b'], 'x': ['y']})
full.learn()

part = vw.example({'a': ['b']})
part.learn()

part.push_features('x', ['y'])
part.learn()

part.erase_namespace(ord('x'))
part.push_features('x', ['z'])
part.learn()
def test_cost_sensitive_label():
    model = vw(csoaa=4, quiet=True)
    assert pyvw.cost_sensitive_label(model.example('1 |')).costs[0].label == 1
    del model
def test_cbandits_label():
    model = vw(cb=4, quiet=True)
    assert pyvw.cbandits_label(model.example('1 |')).costs[0].label == 1
    del model
def test_del(self):
    vw = pyvw.vw()
    del vw
def test_finish(self):
    vw = pyvw.vw()
    assert not vw.finished
    vw.finish()
    assert vw.finished
def vw(self):
    return pyvw.vw(quiet=True, b=BIT_SIZE)
from vowpalwabbit import pyvw


def my_predict(vw, ex):
    pp = 0.
    for f, v in ex.iter_features():
        pp += vw.get_weight(f) * v
    return pp


def ensure_close(a, b, eps=1e-6):
    if abs(a - b) > eps:
        raise Exception("test failed: expected " + str(a) + " and " + str(b) +
                        " to be " + str(eps) + "-close, but they differ by " +
                        str(abs(a - b)))

###############################################################################
vw = pyvw.vw("--quiet")

###############################################################################
vw.learn("1 |x a b")

###############################################################################
print('# do some stuff with a read example:')
ex = vw.example("1 |x a b |y c")
ex.learn(); ex.learn(); ex.learn(); ex.learn()

updated_pred = ex.get_updated_prediction()
print('current partial prediction =', updated_pred)

# compute our own prediction
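# The snippet above stops at "compute our own prediction". A plausible
# continuation (a sketch, not necessarily the original code) compares the
# manual dot product against VW's updated prediction using the helpers
# defined above:
my_pred = my_predict(vw, ex)
print('        my partial prediction =', my_pred)
ensure_close(updated_pred, my_pred)
ex.finish()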
import sys

# POS-tagged training sentences for the sequence labeling demo
my_dataset = [[(DET,  'the'),
               (NOUN, 'monster'),
               (VERB, 'ate'),
               (DET,  'a'),
               (ADJ,  'big'),
               (NOUN, 'sandwich')],
              [(DET,  'the'),
               (NOUN, 'sandwich'),
               (VERB, 'was'),
               (ADJ,  'tasty')],
              [(NOUN, 'it'),
               (VERB, 'ate'),
               (NOUN, 'it'),
               (ADJ,  'all')]]

# initialize VW as usual, but use 'hook' as the search_task
vw = pyvw.vw("--search 4 --quiet --search_task hook --ring_size 1024")

# tell VW to construct your search task object
sequenceLabeler = vw.init_search_task(SequenceLabeler)

# train it on the above dataset ten times; the my_dataset.__iter__ feeds into _run above
print('training!', file=sys.stderr)
for i in range(10):
    sequenceLabeler.learn(my_dataset)

# now see the predictions on a test sentence
print('predicting!', file=sys.stderr)
print(sequenceLabeler.predict([(1, w) for w in "the sandwich ate a monster".split()]))
print('should have printed: [1, 2, 3, 1, 2]')
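# For context: the SequenceLabeler referenced above is defined earlier in the
# VW search tutorial. A minimal sketch of that class and the POS-tag constants
# (an assumed reconstruction consistent with the expected output above, not
# necessarily the exact original) looks like this:
DET, NOUN, VERB, ADJ = 1, 2, 3, 4

class SequenceLabeler(pyvw.SearchTask):
    def __init__(self, vw, sch, num_actions):
        # the parent-class constructor must be called first
        pyvw.SearchTask.__init__(self, vw, sch, num_actions)
        sch.set_options(sch.AUTO_HAMMING_LOSS | sch.AUTO_CONDITION_FEATURES)

    def _run(self, sentence):
        output = []
        for n in range(len(sentence)):
            pos, word = sentence[n]
            # one example per token; condition on the previous predictions
            with self.vw.example({'w': [word]}) as ex:
                pred = self.sch.predict(examples=ex, my_tag=n + 1, oracle=pos,
                                        condition=[(n, 'p'), (n - 1, 'q')])
                output.append(pred)
        return output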
vw_args = {
    'quiet': True,
    'passes': 10,
    'cache': True,
    'f': "%s-predictor.vw" % topic,
    'k': True,
    'ngram': 2,
    'skips': 2,
    'ftrl': True,
    'decay_learning_rate': 0.99,
    'r': "%s-predictions.txt" % topic,
    # 'progressive_validation': "%s-validations.txt" % topic,
    'loss_function': 'hinge'
}

vw = pyvw.vw(**vw_args)

for tweet in get_vw('%s-classified.train.vw' % topic):
    if len(tweet) < 3:
        continue
    if tweet[:1] == '0':
        tweet = '-1' + tweet[1:]

    ex = vw.example(tweet)
    vw.learn(ex)
    # print(vw.predict(ex))
    # print(ex)
    print("%s" % (re.sub(r'\n', '', tweet)))
    # out.write(features + "\n")
    # counter += 1
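# A possible continuation (a sketch, not from the original script): flush the
# trained model to the file named by the 'f' argument above, then reload it in
# test-only mode to score new tweets.
vw.finish()
scorer = pyvw.vw("--quiet -t -i %s-predictor.vw" % topic)
score = scorer.predict("| example tweet features here")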
def test_delete():
    model = vw(quiet=True, b=BIT_SIZE)
    assert 'model' in locals()
    del model
    assert 'model' not in locals()