def write_binaries(rows):
    """Run write_xgb over every split path of every row.

    Each element of ``rows`` is a two-item pair. The second item is a dict
    from split name (train/test) to the path where that split's data can be
    found; the first item is not used here. For each split, the localized
    input path and the name of the matching local output file (requested at
    the input path with ``.xgb`` appended) are handed to ``write_xgb``.
    """
    for _, splits in rows:
        with as_local_paths(splits.values(), with_query=True) as local_inputs:
            # Built after the inputs are localized, same order as the inputs,
            # so zip() below pairs each input with its own output.
            xgb_paths = [source + '.xgb' for source in splits.values()]
            with as_output_files(xgb_paths) as local_outputs:
                for src, dst in zip(local_inputs, local_outputs):
                    write_xgb(src, dst.name)
def trainWithFiles(
    fold: Mapping[str, str],
    train_matrix: str,
    params: Mapping[str, Any],
    **kwargs
) -> 'XGBoostModel':
    """Train an xgboost model from data files using xgb.train.

    Every path in fold is made available locally, loaded as an
    xgb.DMatrix, and passed to xgb.train as an evaluation set; the
    split named by train_matrix is additionally used as the training data.

    Parameters
    ----------
    fold : Mapping[str, str]
        Map from split name to data path. All provided splits will be
        evaluated on each boosting iteration.
    train_matrix : str
        Name of the split in fold to train against.
    params : dict
        XGBoost training parameters.
    **kwargs
        Forwarded unchanged to xgb.train.

    Returns
    -------
    mjolnir.training.xgboost.XGBoostModel
        trained xgboost ranking model

    Raises
    ------
    KeyError
        If train_matrix is not one of the split names in fold.
    """
    # Fail fast: without this check a bad split name is only noticed after
    # every file has been localized and parsed into a DMatrix.
    if train_matrix not in fold:
        raise KeyError(
            'train_matrix %r is not a split in fold; available splits: %r'
            % (train_matrix, sorted(fold.keys())))

    with as_local_paths(fold.values()) as local_paths:
        # Relies on local_paths coming back in the same order as fold.values().
        matrices = {
            name: xgb.DMatrix(path)
            for name, path in zip(fold.keys(), local_paths)
        }
        dtrain = matrices[train_matrix]
        evallist = [(dmat, name) for name, dmat in matrices.items()]
        # xgb.train fills this dict in place with per-iteration eval results.
        metrics = cast(Mapping, {})
        booster = xgb.train(
            params, dtrain, evals=evallist, evals_result=metrics, **kwargs)
        return XGBoostModel(booster, metrics, params)