def out_of_sample_validation(model, targets, features, config):
    _logger.info(f"Performing out-of-sample validation with "
                 f"{targets.observations.shape[0]} targets...")
    mpiops.comm.barrier()
    # Non-root workers load the trained model from disk; the root's copy is
    # then broadcast so every node predicts with the same model object.
    if mpiops.chunk_index != 0:
        with open(config.model_file, 'rb') as f:
            model, _, _ = pickle.load(f)
    model = mpiops.comm.bcast(model, root=0)
    classification = hasattr(model, 'predict_proba')

    # Split positions, fields and features across MPI workers.
    pos = np.array_split(targets.positions, mpiops.chunks)[mpiops.chunk_index]
    fields = {}
    for k, v in targets.fields.items():
        fields[k] = np.array_split(v, mpiops.chunks)[mpiops.chunk_index]
    features = np.array_split(features, mpiops.chunks)[mpiops.chunk_index]

    pred = predict.predict(features, model, fields=fields, lon_lat=pos)
    pred = mpiops.comm.gather(pred, root=0)

    if mpiops.chunk_index == 0:
        pred = np.concatenate(pred)
        if classification:
            hard, p = pred[:, 0], pred[:, 1:]
            scores = classification_validation_scores(
                targets.observations, hard, p)
        else:
            scores = regression_validation_scores(
                targets.observations, pred, features.shape[1], model)
        _logger.info("Out of sample validation complete, scores:")
        for k, v in scores.items():
            _logger.info(f"{k}: {v}")
        result_tags = model.get_predict_tags()
        y_pred_dict = dict(zip(result_tags, pred.T))
        if hasattr(model, '_notransform_predict'):
            y_pred_dict['transformedpredict'] = \
                model.target_transform.transform(pred[:, 0])
        return OOSInfo(scores, targets.observations, y_pred_dict,
                       classification, targets.positions)
    else:
        return None
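# `OOSInfo` is not defined in this section. A minimal sketch consistent with
# the constructor call above; the field names are assumptions inferred from
# the positional arguments, not the project's canonical definition:
#
# from collections import namedtuple
# OOSInfo = namedtuple(
#     'OOSInfo',
#     ['scores', 'y_true', 'y_pred_dict', 'classification', 'positions'])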
def oos_validate(targets_all, x_all, model, config):
    lon_lat = targets_all.positions
    weights = targets_all.weights
    observations = targets_all.observations
    predictions = predict.predict(x_all, model, interval=config.quantiles,
                                  lon_lat=lon_lat)
    if mpiops.chunk_index == 0:
        tags = model.get_predict_tags()
        y_true = targets_all.observations
        to_text = [predictions, y_true[:, np.newaxis], lon_lat]
        true_vs_pred = Path(config.output_dir).joinpath(
            config.name + "_oos_validation.csv")
        cols = tags + ['y_true', 'lon', 'lat']
        np.savetxt(true_vs_pred, X=np.hstack(to_text), delimiter=',',
                   fmt='%.8e', header=','.join(cols), comments='')
        scores = regression_validation_scores(observations, predictions,
                                              weights, model)
        score_string = "OOS Validation Scores:\n"
        for metric, score in scores.items():
            score_string += "{}\t= {}\n".format(metric, score)
        geoio.output_json(scores, Path(config.output_dir).joinpath(
            config.name + "_oos_validation_scores.json"))
        log.info(score_string)
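# Usage sketch: reading the CSV written by oos_validate back for inspection.
# Illustrative only; the filename and the 'Prediction' tag are assumptions
# (the actual column names come from model.get_predict_tags() plus the
# 'y_true', 'lon', 'lat' columns appended above).
#
# data = np.genfromtxt("model_oos_validation.csv", delimiter=',', names=True)
# residuals = data['Prediction'] - data['y_true']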
# Variant: parallel K-fold cross-validation with optional bootstrap models
# and per-target weights.
def local_crossval(x_all, targets_all, config):
    """ Performs K-fold cross validation to test the applicability of a model.

    Given a set of inputs and outputs, this function will evaluate the
    effectiveness of a model at predicting the targets by splitting all
    of the known data. A model is trained on a subset of the total data,
    and then used to predict all of the unseen targets; its performance
    provides a benchmark for evaluating the effectiveness of the model.

    Parameters
    ----------
    x_all: numpy.array
        A 2D array containing all of the training inputs.
    targets_all: Targets
        Collection of training targets (observations, positions, fields).
    config: Config
        The global config object, which is used to choose the model to train.

    Return
    ------
    result: CrossvalInfo
        Object containing all of the cross validation metrics, evaluated
        on the unseen data subset.
    """
    parallel_model = config.multicubist or config.multirandomforest \
        or config.bootstrap
    if config.bootstrap and config.parallel_validate:
        config.algorithm_args['parallel'] = False
    elif not config.bootstrap and not config.parallel_validate \
            and mpiops.chunk_index != 0:
        return
    if config.multicubist or config.multirandomforest:
        config.algorithm_args['parallel'] = False

    _logger.info("Validating with {} folds".format(config.folds))
    model = modelmaps[config.algorithm](**config.algorithm_args)
    classification = hasattr(model, 'predict_proba')
    y = targets_all.observations
    lon_lat = targets_all.positions
    _, cv_indices = split_cfold(y.shape[0], config.folds, config.crossval_seed)

    # Split folds over workers
    fold_list = np.arange(config.folds)
    if config.parallel_validate:
        fold_node = np.array_split(fold_list, mpiops.chunks)[mpiops.chunk_index]
    else:
        fold_node = fold_list

    y_pred = {}
    y_true = {}
    fold_scores = {}
    pos = {}

    # Train and score on each fold
    for fold in fold_node:
        _logger.info(":mpi:Training fold {} of {} (node {})".format(
            fold + 1, config.folds, mpiops.chunk_index))

        train_mask = cv_indices != fold
        test_mask = ~train_mask

        y_k_train = y[train_mask]
        if config.target_weight_property:
            y_k_weight = \
                targets_all.fields[config.target_weight_property][train_mask]
        else:
            y_k_weight = None
        lon_lat_train = lon_lat[train_mask]
        lon_lat_test = lon_lat[test_mask]

        # Extra fields
        fields_train = {f: v[train_mask]
                        for f, v in targets_all.fields.items()}
        fields_pred = {f: v[test_mask] for f, v in targets_all.fields.items()}

        # Train on this fold
        x_train = x_all[train_mask]
        apply_multiple_masked(model.fit, data=(x_train, y_k_train),
                              fields=fields_train, lon_lat=lon_lat_train,
                              sample_weight=y_k_weight)

        # Testing
        if not config.parallel_validate and mpiops.chunk_index != 0:
            continue
        else:
            y_k_pred = predict.predict(x_all[test_mask], model,
                                       fields=fields_pred,
                                       lon_lat=lon_lat_test)
            y_pred[fold] = y_k_pred
            n_covariates = x_all[test_mask].shape[1]

            # Regression
            if not classification:
                y_k_test = y[test_mask]
                fold_scores[fold] = regression_validation_scores(
                    y_k_test, y_k_pred, n_covariates, model)
            # Classification
            else:
                y_k_test = model.le.transform(y[test_mask])
                y_k_hard, p_k = y_k_pred[:, 0], y_k_pred[:, 1:]
                fold_scores[fold] = classification_validation_scores(
                    y_k_test, y_k_hard, p_k)

            y_true[fold] = y_k_test
            pos[fold] = lon_lat_test

    if config.parallel_validate:
        y_pred = _join_dicts(mpiops.comm.gather(y_pred, root=0))
        y_true = _join_dicts(mpiops.comm.gather(y_true, root=0))
        pos = _join_dicts(mpiops.comm.gather(pos, root=0))
        scores = _join_dicts(mpiops.comm.gather(fold_scores, root=0))
    else:
        scores = fold_scores

    result = None
    if mpiops.chunk_index == 0:
        y_true = np.concatenate([y_true[i] for i in range(config.folds)])
        y_pred = np.concatenate([y_pred[i] for i in range(config.folds)])
        pos = np.concatenate([pos[i] for i in range(config.folds)])
        valid_metrics = scores[0].keys()
        scores = {m: np.mean([d[m] for d in scores.values()], axis=0)
                  for m in valid_metrics}
        score_string = "Validation complete:\n"
        for metric, score in scores.items():
            score_string += "{}\t= {}\n".format(metric, score)
        _logger.info(score_string)

        result_tags = model.get_predict_tags()
        y_pred_dict = dict(zip(result_tags, y_pred.T))
        if hasattr(model, '_notransform_predict'):
            y_pred_dict['transformedpredict'] = \
                model.target_transform.transform(y_pred[:, 0])
        result = CrossvalInfo(scores, y_true, y_pred_dict, classification, pos)

    if parallel_model:
        config.algorithm_args['parallel'] = True

    return result
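# Neither `_join_dicts` nor `split_cfold` is defined in this section. Below
# are minimal sketches consistent with how they are used above; the bodies
# are assumptions, not the project's canonical implementations.

def _join_dicts(dicts):
    """Merge a list of per-worker {fold: value} dicts into a single dict.

    mpiops.comm.gather returns None on non-root ranks, so None passes through.
    """
    if dicts is None:
        return None
    return {k: v for d in dicts for k, v in d.items()}


def split_cfold(nsamples, k=5, seed=None):
    """Randomly assign each of `nsamples` indices to one of `k` folds.

    Returns the per-fold index arrays and a flat fold-assignment vector,
    matching the unpacking `_, cv_indices = split_cfold(...)` above.
    """
    rnd = np.random.RandomState(seed)
    permuted = rnd.permutation(nsamples)
    fold_indices = np.array_split(permuted, k)
    fold_assignments = np.zeros(nsamples, dtype=int)
    for n, inds in enumerate(fold_indices):
        fold_assignments[inds] = n
    return fold_indices, fold_assignments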
# Variant: weighted, group-aware K-fold cross-validation.
def local_crossval(x_all, targets_all: targ.Targets, config: Config):
    """ Performs K-fold cross validation to test the applicability of a model.

    Given a set of inputs and outputs, this function will evaluate the
    effectiveness of a model at predicting the targets by splitting all
    of the known data. A model is trained on a subset of the total data,
    and then used to predict all of the unseen targets; its performance
    provides a benchmark for evaluating the effectiveness of the model.

    Parameters
    ----------
    x_all: numpy.array
        A 2D array containing all of the training inputs.
    targets_all: Targets
        Collection of training targets (observations, positions, weights,
        groups, fields).
    config: Config
        The global config object, which is used to choose the model to train.

    Return
    ------
    result: CrossvalInfo
        Object containing all of the cross validation metrics, evaluated
        on the unseen data subset.
    """
    # run cross validation in parallel, but one thread for each fold
    if config.multicubist or config.multirandomforest:
        config.algorithm_args['parallel'] = False
    if (mpiops.chunk_index != 0) and (not config.parallel_validate):
        return

    log.info("Validating with {} folds".format(config.folds))
    model = modelmaps[config.algorithm](**config.algorithm_args)
    classification = hasattr(model, 'predict_proba')
    groups = targets_all.groups

    n_groups = len(np.unique(groups))
    if config.group_targets and n_groups < config.folds:
        raise ValueError(
            f"Cannot continue cross-validation with the chosen parameters: "
            f"the number of groups {n_groups} in the data is less than the "
            f"number of folds {config.folds}")

    random_state = config.algorithm_args['random_state'] \
        if 'random_state' in config.algorithm_args else np.random.randint(1000)
    x_all, y, lon_lat, groups, w, cv = setup_validation_data(
        x_all, targets_all, config.folds, random_state)
    _, cv_indices = split_gfold(groups, cv)

    # Split folds over workers
    fold_list = np.arange(config.folds)
    if config.parallel_validate:
        fold_node = np.array_split(fold_list, mpiops.chunks)[mpiops.chunk_index]
    else:
        fold_node = fold_list

    y_pred = {}
    y_true = {}
    weight = {}
    lon_lat_ = {}
    fold_scores = {}

    # Train and score on each fold
    for fold in fold_node:
        model = modelmaps[config.algorithm](**config.algorithm_args)
        log.info("Training fold {} of {} using process {}".format(
            fold + 1, config.folds, mpiops.chunk_index))

        train_mask = cv_indices != fold
        test_mask = ~train_mask

        y_k_train = y[train_mask]
        w_k_train = w[train_mask]
        lon_lat_train = lon_lat[train_mask, :]
        lon_lat_test = lon_lat[test_mask, :]

        # Extra fields
        fields_train = {f: v[train_mask]
                        for f, v in targets_all.fields.items()}
        fields_pred = {f: v[test_mask] for f, v in targets_all.fields.items()}

        # Train on this fold
        x_train = x_all[train_mask]
        apply_multiple_masked(model.fit, data=(x_train, y_k_train),
                              fields=fields_train, sample_weight=w_k_train,
                              lon_lat=lon_lat_train)

        # Testing
        y_k_pred = predict.predict(x_all[test_mask], model,
                                   fields=fields_pred, lon_lat=lon_lat_test)
        y_pred[fold] = y_k_pred

        # Regression
        if not classification:
            y_k_test = y[test_mask]
            y_true[fold] = y_k_test
            w_k_test = w[test_mask]
            weight[fold] = w_k_test
            lon_lat_[fold] = lon_lat_test
            fold_scores[fold] = regression_validation_scores(
                y_k_test, y_k_pred, w_k_test, model)
        # Classification
        else:
            y_k_test = model.le.transform(y[test_mask])
            y_true[fold] = y_k_test
            w_k_test = w[test_mask]
            weight[fold] = w_k_test
            lon_lat_[fold] = lon_lat_test
            y_k_hard, p_k = y_k_pred[:, 0], y_k_pred[:, 1:]
            fold_scores[fold] = classification_validation_scores(
                y_k_test, y_k_hard, w_k_test, p_k)

    if config.parallel_validate:
        y_pred = _join_dicts(mpiops.comm.gather(y_pred, root=0))
        lon_lat_ = _join_dicts(mpiops.comm.gather(lon_lat_, root=0))
        y_true = _join_dicts(mpiops.comm.gather(y_true, root=0))
        weight = _join_dicts(mpiops.comm.gather(weight, root=0))
        scores = _join_dicts(mpiops.comm.gather(fold_scores, root=0))
    else:
        scores = fold_scores

    result = None
    if mpiops.chunk_index == 0:
        y_true = np.concatenate([y_true[i] for i in range(config.folds)])
        weight = np.concatenate([weight[i] for i in range(config.folds)])
        lon_lat = np.concatenate([lon_lat_[i] for i in range(config.folds)])
        y_pred = np.concatenate([y_pred[i] for i in range(config.folds)])
        valid_metrics = scores[0].keys()
        scores = {m: np.mean([d[m] for d in scores.values()], axis=0)
                  for m in valid_metrics}
        score_string = "Validation complete:\n"
        for metric, score in scores.items():
            score_string += "{}\t= {}\n".format(metric, score)
        log.info(score_string)

        result_tags = model.get_predict_tags()
        y_pred_dict = dict(zip(result_tags, y_pred.T))
        if hasattr(model, '_notransform_predict'):
            y_pred_dict['transformedpredict'] = \
                model.target_transform.transform(y_pred[:, 0])
        result = CrossvalInfo(scores, y_true, y_pred_dict, weight, lon_lat,
                              classification)

    # change back to parallel
    if config.multicubist or config.multirandomforest:
        config.algorithm_args['parallel'] = True

    return result
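# `split_gfold` (and `setup_validation_data`) are not defined in this section.
# A minimal sketch of split_gfold, assuming `cv` is a scikit-learn group-aware
# splitter such as GroupKFold; it mirrors split_cfold's return shape
# (per-fold test-index arrays plus a flat fold-assignment vector):

def split_gfold(groups, cv):
    fold_indices = []
    fold_assignments = np.zeros(len(groups), dtype=int)
    # cv.split only needs a placeholder X here; `groups` drives the split.
    for fold, (_, test) in enumerate(
            cv.split(np.zeros(len(groups)), groups=groups)):
        fold_indices.append(test)
        fold_assignments[test] = fold
    return fold_indices, fold_assignments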
# Variant: basic K-fold cross-validation without sample weights or groups.
def local_crossval(x_all, targets_all, config):
    """ Performs K-fold cross validation to test the applicability of a model.

    Given a set of inputs and outputs, this function will evaluate the
    effectiveness of a model at predicting the targets by splitting all
    of the known data. A model is trained on a subset of the total data,
    and then used to predict all of the unseen targets; its performance
    provides a benchmark for evaluating the effectiveness of the model.

    Parameters
    ----------
    x_all: numpy.array
        A 2D array containing all of the training inputs.
    targets_all: Targets
        Collection of training targets (observations, positions, fields).
    config: Config
        The global config object, which is used to choose the model to train.

    Return
    ------
    result: CrossvalInfo
        Object containing all of the cross validation metrics, evaluated
        on the unseen data subset.
    """
    # run cross validation in parallel, but one thread for each fold
    if config.multicubist or config.multirandomforest:
        config.algorithm_args['parallel'] = False
    if (mpiops.chunk_index != 0) and (not config.parallel_validate):
        return

    log.info("Validating with {} folds".format(config.folds))
    model = modelmaps[config.algorithm](**config.algorithm_args)
    y = targets_all.observations
    lon_lat = targets_all.positions
    _, cv_indices = split_cfold(y.shape[0], config.folds, config.crossval_seed)

    # Split folds over workers
    fold_list = np.arange(config.folds)
    if config.parallel_validate:
        fold_node = np.array_split(fold_list, mpiops.chunks)[mpiops.chunk_index]
    else:
        fold_node = fold_list

    y_pred = {}
    y_true = {}
    fold_scores = {}

    # Train and score on each fold
    for fold in fold_node:
        log.info("Training fold {} of {} using process {}".format(
            fold + 1, config.folds, mpiops.chunk_index))

        train_mask = cv_indices != fold
        test_mask = ~train_mask

        y_k_train = y[train_mask]
        lon_lat_train = lon_lat[train_mask]
        lon_lat_test = lon_lat[test_mask]

        # Extra fields
        fields_train = {f: v[train_mask]
                        for f, v in targets_all.fields.items()}
        fields_pred = {f: v[test_mask] for f, v in targets_all.fields.items()}

        # Train on this fold
        apply_multiple_masked(model.fit, data=(x_all[train_mask], y_k_train),
                              kwargs={'fields': fields_train,
                                      'lon_lat': lon_lat_train})

        # Testing
        y_k_pred = predict.predict(x_all[test_mask], model,
                                   fields=fields_pred, lon_lat=lon_lat_test)
        y_k_test = y[test_mask]
        y_pred[fold] = y_k_pred
        y_true[fold] = y_k_test
        fold_scores[fold] = calculate_validation_scores(y_k_test, y_k_train,
                                                        y_k_pred)

    if config.parallel_validate:
        y_pred = _join_dicts(mpiops.comm.gather(y_pred, root=0))
        y_true = _join_dicts(mpiops.comm.gather(y_true, root=0))
        scores = _join_dicts(mpiops.comm.gather(fold_scores, root=0))
    else:
        scores = fold_scores

    result = None
    if mpiops.chunk_index == 0:
        y_true = np.concatenate([y_true[i] for i in range(config.folds)])
        y_pred = np.concatenate([y_pred[i] for i in range(config.folds)])
        valid_metrics = scores[0].keys()
        scores = {m: np.mean([d[m] for d in scores.values()])
                  for m in valid_metrics}
        score_string = "Validation complete:\n"
        for metric, score in scores.items():
            score_string += "{}\t= {}\n".format(metric, score)
        log.info(score_string)

        result_tags = model.get_predict_tags()
        y_pred_dict = dict(zip(result_tags, y_pred.T))
        result = CrossvalInfo(scores, y_true, y_pred_dict)

    # change back to parallel
    if config.multicubist or config.multirandomforest:
        config.algorithm_args['parallel'] = True

    return result
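# Hypothetical usage sketch for the cross-validation entry points above. The
# loader names (load_targets, load_features) are illustrative stand-ins, not
# part of this module; runs under MPI, e.g. `mpirun -n 4 python run_cv.py`.
#
# targets_all = load_targets(config)           # assumed target loader
# x_all = load_features(config, targets_all)   # assumed covariate loader
# result = local_crossval(x_all, targets_all, config)
# if result is not None and mpiops.chunk_index == 0:
#     for metric, score in result.scores.items():
#         log.info("{} = {}".format(metric, score))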