def do_predict(test_file, costs_file, model_file, output_trec_run=None,
               output_eval=None, override_cutoffs=None):
    """Run prediction with a saved cascade.

    Args:
        test_file: path to the test data, loaded via ``load_data_file``.
        costs_file: path to the per-feature costs file.
        model_file: path to a cascade saved with ``save_model``.
        output_trec_run: optional path for a TREC-format run file.
        output_eval: optional path for evaluation output.
        override_cutoffs: optional Python-literal string (e.g. "[None, 10, 5]"),
            parsed with ``ast.literal_eval``; replaces the rank cutoff of each
            pruning stage in the saved cascade.

    Raises:
        ValueError: if ``override_cutoffs`` supplies fewer cutoffs than the
            cascade has stages.
    """
    test_data = load_data_file(test_file)
    costs, _ = load_costs_data(costs_file, None, n_features=test_data[0].shape[1])

    cascade = load_model(model_file)
    if 'scaler' in cascade:
        # Apply the scaler persisted with the cascade (in-place transform).
        cascade['scaler'].transform(test_data[0])

    if override_cutoffs:
        cutoffs = ast.literal_eval(override_cutoffs)
        # Guard against a short cutoff list, which would otherwise surface as
        # an opaque IndexError deep in the rebuild loop.
        if len(cutoffs) < len(cascade['stages']):
            raise ValueError(
                'override_cutoffs must provide one cutoff per stage '
                '(%d stages, %d cutoffs given)' %
                (len(cascade['stages']), len(cutoffs)))
        logging.info('Override cutoffs with %s', cutoffs)
        # Keep each stage's model but swap in a new Prune with the new cutoff.
        cascade['stages'] = [(Prune(rank=cutoffs[i]), model)
                             for i, (_, model) in enumerate(cascade['stages'])]

    predict(cascade, test_data, costs,
            output_trec_run=output_trec_run, output_eval=output_eval)
def build_wlm11_cascade(train_file, validation_file, test_file, costs_file=None,
                        importance_file=None, model_prefix=None, **kwargs):
    """Train a cascade over a partition of disjoint feature sets.

    Loads train/validation/test splits (max-abs scaled in place), trains a
    cascade, optionally persists it under ``model_prefix``, then evaluates it
    on the test split. Extra keyword arguments are forwarded to ``train``.
    """
    datasets = load_data(train_file, validation_file, test_file,
                         scaler=MaxAbsScaler(copy=False))
    train_data, valid_data, test_data = datasets
    n_features = train_data[0].shape[1]
    costs, importance = load_costs_data(costs_file, importance_file,
                                        n_features=n_features)

    # NOTE: costs has to be untainted (make copy before passing it to functions)
    cascade = train(train_data, valid_data, costs.copy(), importance.copy(),
                    **kwargs)
    if model_prefix:
        save_model(cascade, model_prefix)
    predict(cascade, test_data, costs.copy())
def info(model_file, costs_file=None):
    """Print parameters, attributes, and feature usage of a saved booster.

    With ``costs_file`` given, also prints the total cost of the features the
    model actually uses (those appearing in its fscore map).
    """
    bst = joblib.load(model_file)
    # Feature names look like 'f123'; strip the prefix and sort the ids.
    fids = sorted(int(name[1:]) for name in bst.get_fscore())

    print('params', vars(bst))
    if hasattr(bst, 'attributes'):
        print('attributes', bst.attributes())
    print('n_features', len(fids))
    print('feature list', fids)

    if costs_file:
        from core.cascade import load_costs_data
        costs, _ = load_costs_data(costs_file, None, max(fids) + 1)
        # 0/1 indicator over all features, set at the used feature ids.
        used = np.zeros(costs.size, dtype=int)
        np.put(used, fids, 1)
        print('cost %d' % np.dot(costs, used))
def do_predict(test_file, costs_file, model_file, output_trec_run=None,
               output_eval=None, train_file=None):
    """Run prediction with a saved cascade.

    NOTE(review): the file defines ``do_predict`` twice; this later definition
    shadows the earlier one at import time — confirm which is intended.

    Args:
        test_file: path to the test data, loaded via ``load_data_file``.
        costs_file: path to the per-feature costs file.
        model_file: path to a cascade saved with ``save_model``.
        output_trec_run: optional path for a TREC-format run file.
        output_eval: optional path for evaluation output.
        train_file: legacy fallback for models saved without a scaler; a
            MaxAbsScaler is fit on this training data and applied to the
            test data. Ignored when the cascade carries its own scaler.
    """
    test_data = load_data_file(test_file)
    costs, _ = load_costs_data(costs_file, None, n_features=test_data[0].shape[1])

    cascade = load_model(model_file)
    if 'scaler' in cascade:
        # Prefer the scaler persisted with the cascade. Previously, supplying
        # train_file alongside a scaler-bearing model scaled the data twice.
        cascade['scaler'].transform(test_data[0])
        if train_file:
            logging.warning('Cascade has a saved scaler; ignoring train_file')
    elif train_file:
        # FIXME: scaler needs to be saved along the cascade
        train_data = load_data_file(train_file)
        scaler = MaxAbsScaler(copy=False)
        scaler.fit(train_data[0])
        scaler.transform(test_data[0])
        logging.info('Data scaled')

    predict(cascade, test_data, costs,
            output_trec_run=output_trec_run, output_eval=output_eval)
def do_retrain(model_type, train_file, validation_file, model_file, new_model_file,
               test_file=None, costs_file=None, random=0, up_to=0,
               learning_rate="0.1", subsample="0.5",
               trees="[5,10,50,100,500,1000]", nodes="[32]",
               output_trec_run=None, output_eval=None):
    """Retrain a tree-based cascade using features learned in the linear models

    Loads a saved cascade, rescales data with its persisted scaler if present,
    and retrains its stages as `model_type` models.

    With random > 0, runs that many random-search trials (random tree count in
    [1, 1000], node count from {2,4,8,16,32,64}), evaluating each trial on the
    test data without saving; the `trees`/`nodes` arguments are ignored in this
    mode. Otherwise retrains once with the given hyperparameters, saves to
    `new_model_file`, and evaluates if test data was provided.

    Note: `learning_rate`, `subsample`, `trees`, `nodes` are string literals
    (e.g. from a CLI) parsed with ast.literal_eval.
    """
    train_data = load_data_file(train_file)
    # Placeholder 4-tuples mirror the (x, y, qid, docno) layout of loaded data.
    valid_data = (None, ) * 4
    if validation_file:
        valid_data = load_data_file(validation_file)
    test_data = (None, ) * 4
    costs = None
    if test_file is not None and costs_file is not None:
        test_data = load_data_file(test_file)
        costs, _ = load_costs_data(costs_file, None, n_features=test_data[0].shape[1])
    cascade = load_model(model_file)
    if 'scaler' in cascade:
        # Apply the persisted scaler (in-place) to every split that was loaded.
        cascade['scaler'].transform(train_data[0])
        if valid_data[0] is not None:
            cascade['scaler'].transform(valid_data[0])
        if test_data[0] is not None:
            cascade['scaler'].transform(test_data[0])
    if random > 0:
        # Random-search mode: each trial retrains from the ORIGINAL stages
        # (not the previous trial) on a shallow copy of the cascade.
        for _ in range(random):
            tree = 1 + np.random.randint(1000)
            node = np.random.choice([2, 4, 8, 16, 32, 64])
            print('tree %i, node %i' % (tree, node))
            new_cascade = cascade.copy()
            new_cascade['stages'] = retrain(
                model_type, cascade['stages'], train_data, valid_data,
                learning_rate=ast.literal_eval(learning_rate),
                subsample=ast.literal_eval(subsample),
                trees=[tree], nodes=[node], up_to=up_to)
            if test_data[0] is not None:
                predict(new_cascade, test_data, costs,
                        output_trec_run=output_trec_run, output_eval=output_eval)
        # Random-search trials are never persisted.
        return
    cascade['stages'] = retrain(model_type, cascade['stages'], train_data, valid_data,
                                learning_rate=ast.literal_eval(learning_rate),
                                subsample=ast.literal_eval(subsample),
                                trees=ast.literal_eval(trees),
                                nodes=ast.literal_eval(nodes), up_to=up_to)
    save_model(cascade, new_model_file)
    if test_data[0] is not None:
        predict(cascade, test_data, costs,
                output_trec_run=output_trec_run, output_eval=output_eval)
def do_train_budgeted_GBDT(train_file, validation_file, test_file, costs_file=None,
                           importance_file=None, model_prefix=None, budget=None,
                           trees='[5, 10, 50, 100, 500, 1000]', nodes='[32]'):
    """Train a 1-stage budgeted GBDT cascade.

    Selects the cheapest features whose cumulative cost fits within ``budget``
    (all active features when no budget is given), trains a single GBDT stage
    on them, optionally saves it, and evaluates on the test split.

    Args:
        budget: optional cost budget; may be a number or numeric string.
            ``None``/0 means no budget (use all active features).
        trees, nodes: Python-literal strings parsed with ``ast.literal_eval``.
    """
    train_data, valid_data, test_data = load_data(train_file, validation_file, test_file)
    costs, importance = load_costs_data(costs_file, importance_file,
                                        n_features=train_data[0].shape[1])

    x_train, _, _ = train_data
    x_train = x_train.toarray()

    # not all features will be used in a full model
    all_fids = [i for i in range(x_train.shape[1]) if any(x_train[:, i])]

    # BUG FIX: float(None) raised TypeError when budget was left at its
    # default; convert only when a budget was actually supplied.
    if budget is not None:
        budget = float(budget)
    if budget:
        # Greedy-by-cost selection: keep features while the cumulative cost
        # (in ascending cost order) stays within budget.
        c = costs[all_fids]
        c[c.argsort()] = c[c.argsort()].cumsum()
        fids = [fid for fid, b in zip(all_fids, c) if b <= budget]
    else:
        fids = all_fids
    used_features = np.array(fids)
    # used_features = np.flatnonzero(model.get_feature_mask())
    print('Train a budgeted GBDT with %i features' % used_features.size)

    _, y_train, _ = train_data
    class_weights = core.get_class_weights(y_train)
    params = {
        'max_depth': 7,
        'eta': 0.1,
        'silent': True,
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'subsample': 0.5
    }

    import GBDT
    new_model = TreeModel(
        model=GBDT.train(train_data, valid_data, core.get_score_multiclass,
                         class_weights, params,
                         trees=ast.literal_eval(trees),
                         nodes=ast.literal_eval(nodes),
                         set_classes=True, features=used_features),
        score_function=core.get_score_multiclass,
        class_weights=class_weights,
        n_features=train_data[0].shape[1])

    cascade = {
        'stages': [(None, new_model)],
        'score_update': core.cascade.UpshiftUpdate(gap=0.1)
    }
    if model_prefix:
        save_model(cascade, model_prefix)
    predict(cascade, test_data, costs)
def train_disjoint_cascade(partition_criteria, train_file, validation_file, test_file,
                           costs_file=None, importance_file=None, model_prefix=None,
                           n_stages=3, cutoffs=None, alpha=0.1, epochs=10,
                           pairwise_transform=False, GBDT_retraining=False):
    """Train a cascade over a partition of disjoint feature sets.

    Args:
        partition_criteria: callable ``(n_stages, is_qf, costs, importance)``
            returning per-stage feature partitions.
        cutoffs: per-stage rank cutoffs; defaults to ``[None, 10, 5]``
            (was previously a mutable default argument — fixed).
        alpha: a single value applied to every stage, or a list per stage.
        pairwise_transform: apply a per-query pairwise transform to the
            training data (incompatible with ``GBDT_retraining``).
        GBDT_retraining: retrain the learned stages as GBDT models on the
            untransformed training data.
    """
    # BUG FIX: `cutoffs=[None, 10, 5]` was a mutable default argument shared
    # across calls; use a None sentinel and build the default per call.
    if cutoffs is None:
        cutoffs = [None, 10, 5]

    np.random.seed(0)  # freeze the randomness bit
    alphas = alpha if isinstance(alpha, list) else [alpha] * n_stages
    params = {'epochs': epochs, 'l1_ratio': 1.0, 'penalty': 'none'}

    scaler = MaxAbsScaler(copy=False)
    train_data, valid_data, test_data = load_data(train_file, validation_file,
                                                  test_file, scaler=scaler)
    costs, importance = load_costs_data(costs_file, importance_file,
                                        n_features=train_data[0].shape[1])

    # these options don't go well together (or I haven't figured out how to make them)
    assert not (pairwise_transform and GBDT_retraining)

    # keep the original as GBDT won't work with polarized labels
    original_train_data = train_data

    # massage the data a bit ...
    x_train, y_train, qid_train, docno_train = train_data
    y_train = core.polarize(y_train)
    if pairwise_transform:
        from utils import per_query_transform_pairwise
        x_train, y_train = per_query_transform_pairwise(
            x_train.toarray(), y_train, qid_train)
    train_data = (x_train, y_train, qid_train, docno_train)

    # Mark "query features": constant within every query's block of rows.
    is_qf = np.ones_like(costs)
    x = x_train.toarray()
    for j, _ in enumerate(costs):
        for a, b in group_offsets(qid_train):
            if (x[a:b, j] != x[a, j]).any():
                is_qf[j] = 0
                break

    # NOTE: costs has to be untainted (make copy before passing it to functions)
    partitions = partition_criteria(n_stages, is_qf, costs.copy(), importance)
    stages = train(train_data, valid_data, costs.copy(), importance, n_stages,
                   cutoffs=cutoffs, feature_partitions=partitions, alphas=alphas,
                   **params)

    if GBDT_retraining:
        stages = retrain('GBDT', stages, original_train_data, valid_data,
                         trees=[5, 10, 50, 100, 500, 1000], nodes=[32])

    cascade = {
        'stages': stages,
        'scaler': scaler,
        'score_update': core.cascade.UpshiftUpdate(gap=0.1)
    }
    if model_prefix:
        save_model(cascade, model_prefix)
    predict(cascade, test_data, costs)