def do_predict(test_file, costs_file, model_file, output_trec_run=None, output_eval=None,
               override_cutoffs=None):
    """Run prediction with a saved cascade.

    Args:
        test_file: passed to ``load_data_file`` to obtain the test data.
        costs_file: passed to ``load_costs_data`` to obtain the feature costs.
        model_file: passed to ``load_model`` to obtain the cascade.
        output_trec_run: forwarded unchanged to ``predict``.
        output_eval: forwarded unchanged to ``predict``.
        override_cutoffs: optional string containing a Python literal that is
            indexable by stage position (e.g. ``"[1000, 100, 10]"``). When
            given, every stage's pruning object is replaced by
            ``Prune(rank=cutoffs[i])``.

    Raises:
        ValueError, SyntaxError: if ``override_cutoffs`` is not a valid
            Python literal (raised by ``ast.literal_eval``).
        IndexError: if ``override_cutoffs`` holds fewer entries than the
            cascade has stages.
    """
    test_data = load_data_file(test_file)
    costs, _ = load_costs_data(costs_file, None, n_features=test_data[0].shape[1])

    cascade = load_model(model_file)
    if 'scaler' in cascade:
        # The return value is discarded, so this only has an effect if the
        # scaler modifies test_data[0] in place.
        # NOTE(review): confirm the saved scaler was created with copy=False.
        cascade['scaler'].transform(test_data[0])

    if override_cutoffs:
        cutoffs = ast.literal_eval(override_cutoffs)
        # Hand cutoffs to logging as an argument instead of using
        # `'...%s' % cutoffs`: the latter raises TypeError when cutoffs is a
        # tuple with more than one element.
        logging.info('Override cutoffs with %s', cutoffs)
        # Keep each stage's model, swap only its pruning object.
        cascade['stages'] = [(Prune(rank=cutoffs[i]), model)
                             for i, (_, model) in enumerate(cascade['stages'])]

    predict(cascade, test_data, costs,
            output_trec_run=output_trec_run, output_eval=output_eval)
def do_info(model_file):
    """Print which features each stage of a saved cascade uses.

    For every stage (numbered from 1) this prints the number of features
    selected by the stage's feature mask and their ids, where an id is the
    mask index plus one. Finally the number of distinct ids across all
    stages is printed.
    """
    cascade = load_model(model_file)
    seen_fids = set()
    for stage_no, (_, stage) in enumerate(cascade['stages'], 1):
        mask = stage.get_feature_mask()
        # flatnonzero yields 0-based positions; shift to 1-based ids.
        fids = np.flatnonzero(mask) + 1
        print('stage', stage_no)
        print('n_features', len(fids))
        print('fids', fids)
        seen_fids.update(fids)
    print('total n_features', len(seen_fids))
def do_predict_LambdaMART(test_file, model_file, output_trec_run=None,
                          add_original_order=False):
    """Run prediction with a saved model, scoring with ``core.get_score``.

    Args:
        test_file: passed to ``load_data_file`` to obtain the test data.
        model_file: passed to ``load_model`` to obtain the model.
        output_trec_run: forwarded unchanged to ``predict``.
        add_original_order: when true, the first element of the test data is
            replaced by the result of ``add_original_order_as_feature`` and
            only the first three elements of the data tuple are kept.
    """
    test_data = load_data_file(test_file)
    if add_original_order:
        augmented = add_original_order_as_feature(test_data)
        test_data = (augmented, test_data[1], test_data[2])

    model = load_model(model_file)
    predict(model, test_data, core.get_score, None,
            output_trec_run=output_trec_run)
def do_predict_GBDT(test_file, model_file, output_trec_run=None,
                    add_original_order=False):
    """Run prediction with a saved model, scoring with ``core.get_score_multiclass``.

    Args:
        test_file: passed to ``load_data_file`` to obtain the test data.
        model_file: passed to ``load_model`` to obtain the model.
        output_trec_run: forwarded unchanged to ``predict``.
        add_original_order: when true, the first element of the test data is
            replaced by the result of ``add_original_order_as_feature`` and
            only the first three elements of the data tuple are kept.
    """
    test_data = load_data_file(test_file)
    if add_original_order:
        augmented = add_original_order_as_feature(test_data)
        test_data = (augmented, test_data[1], test_data[2])

    model = load_model(model_file)

    # FIXME: shouldn't peek into this
    # (the class weights are derived from the second element of the test data)
    class_weights = core.get_class_weights(test_data[1])

    predict(model, test_data, core.get_score_multiclass, class_weights,
            output_trec_run=output_trec_run)
def do_predict(test_file, costs_file, model_file, output_trec_run=None, output_eval=None,
               train_file=None):
    """Run prediction with a saved cascade.

    Args:
        test_file: passed to ``load_data_file`` to obtain the test data.
        costs_file: passed to ``load_costs_data`` to obtain the feature costs.
        model_file: passed to ``load_model`` to obtain the cascade.
        output_trec_run: forwarded unchanged to ``predict``.
        output_eval: forwarded unchanged to ``predict``.
        train_file: optional; when given, a ``MaxAbsScaler`` is fitted on the
            first element of the data loaded from it and applied to the test
            features before prediction.
    """
    test_data = load_data_file(test_file)
    test_features = test_data[0]
    costs, _ = load_costs_data(costs_file, None, n_features=test_features.shape[1])

    cascade = load_model(model_file)

    # FIXME: scaler needs to be saved along the cascade
    if train_file:
        train_data = load_data_file(train_file)
        # copy=False: transform() is relied upon to scale the test features
        # in place, since its return value is not used.
        train_scaler = MaxAbsScaler(copy=False)
        train_scaler.fit(train_data[0])
        train_scaler.transform(test_features)
        logging.info('Data scaled')

    # NOTE(review): if train_file is given and the cascade also carries a
    # scaler, the test features are transformed twice -- confirm intended.
    if 'scaler' in cascade:
        cascade['scaler'].transform(test_features)

    predict(cascade, test_data, costs,
            output_trec_run=output_trec_run, output_eval=output_eval)
def do_retrain(model_type, train_file, validation_file, model_file, new_model_file,
               test_file=None, costs_file=None, random=0, up_to=0,
               learning_rate="0.1", subsample="0.5",
               trees="[5,10,50,100,500,1000]", nodes="[32]",
               output_trec_run=None, output_eval=None):
    """Retrain a tree-based cascade using features learned in the linear models.

    Two modes:

    * ``random > 0``: run ``random`` trials, each retraining a shallow copy
      of the loaded cascade with one randomly drawn tree count (1..1000) and
      node count (one of 2, 4, 8, 16, 32, 64). Each trial is evaluated with
      ``predict`` when test data is available. Nothing is saved and the
      function returns after the last trial.
    * otherwise: retrain once over the ``trees`` / ``nodes`` settings, save
      the cascade to ``new_model_file`` and evaluate it when test data is
      available.

    ``learning_rate``, ``subsample``, ``trees`` and ``nodes`` are strings
    holding Python literals; they are parsed with ``ast.literal_eval``.
    Test data is only loaded when both ``test_file`` and ``costs_file`` are
    given.
    """
    train_data = load_data_file(train_file)
    valid_data = load_data_file(validation_file) if validation_file else (None, ) * 4

    if test_file is not None and costs_file is not None:
        test_data = load_data_file(test_file)
        costs, _ = load_costs_data(costs_file, None, n_features=test_data[0].shape[1])
    else:
        test_data = (None, ) * 4
        costs = None
    has_test = test_data[0] is not None

    cascade = load_model(model_file)
    if 'scaler' in cascade:
        # Return values are discarded; the scaler is relied upon to work in place.
        scaler = cascade['scaler']
        scaler.transform(train_data[0])
        if valid_data[0] is not None:
            scaler.transform(valid_data[0])
        if has_test:
            scaler.transform(test_data[0])

    if random > 0:
        # Random search: trials are evaluated but never persisted.
        for _ in range(random):
            n_trees = 1 + np.random.randint(1000)
            n_nodes = np.random.choice([2, 4, 8, 16, 32, 64])
            print('tree %i, node %i' % (n_trees, n_nodes))

            trial = cascade.copy()
            trial['stages'] = retrain(
                model_type, cascade['stages'], train_data, valid_data,
                learning_rate=ast.literal_eval(learning_rate),
                subsample=ast.literal_eval(subsample),
                trees=[n_trees],
                nodes=[n_nodes],
                up_to=up_to)
            if has_test:
                predict(trial, test_data, costs,
                        output_trec_run=output_trec_run, output_eval=output_eval)
        return

    cascade['stages'] = retrain(
        model_type, cascade['stages'], train_data, valid_data,
        learning_rate=ast.literal_eval(learning_rate),
        subsample=ast.literal_eval(subsample),
        trees=ast.literal_eval(trees),
        nodes=ast.literal_eval(nodes),
        up_to=up_to)
    save_model(cascade, new_model_file)

    if has_test:
        predict(cascade, test_data, costs,
                output_trec_run=output_trec_run, output_eval=output_eval)
def do_info(model_file):
    """Print the prune setting and non-zero weights of each stage of a saved cascade.

    The loaded cascade is iterated directly as ``(prune, stage)`` pairs,
    numbered from 1. For every stage the ids of its non-zero entries
    (index plus one) and the corresponding values are printed.
    """
    cascade = load_model(model_file)
    for stage_no, (prune, weights) in enumerate(cascade, 1):
        nonzero = np.flatnonzero(weights)
        # flatnonzero indices are 0-based, hence the + 1 for the printed fid.
        print('stage', stage_no, 'prune', prune,
              'fid', nonzero + 1, 'weight', weights[nonzero])