def local_learn_model(x_all, targets_all: Targets, config):
    model = None
    # Parallel models (multicubist / multirandomforest) are trained
    # across all MPI workers
    if config.multicubist or config.multirandomforest:
        y = targets_all.observations
        weights = targets_all.weights
        model = all_modelmaps[config.algorithm](**config.algorithm_args)
        apply_multiple_masked(model.fit, (x_all, y),
                              fields=targets_all.fields,
                              parallel=True,
                              sample_weight=weights,
                              lon_lat=targets_all.positions)
    # All other models are trained on the root worker only
    else:
        if mpiops.chunk_index == 0:
            y = targets_all.observations
            weights = targets_all.weights
            model = all_modelmaps[config.algorithm](**config.algorithm_args)
            apply_multiple_masked(model.fit, (x_all, y),
                                  fields=targets_all.fields,
                                  sample_weight=weights,
                                  lon_lat=targets_all.positions)
    return model
def calculate_validation_scores(ys, yt, eys):
    """
    Calculates the validation scores for a prediction

    Given the test and training data, as well as the outputs from every
    model, this function calculates all of the applicable metrics in the
    following list, and returns a dictionary with the following
    (possible) keys:
        + r2_score
        + expvar
        + smse
        + lins_ccc
        + mll
        + msll

    Parameters
    ----------
    ys: numpy.array
        The test data outputs
    yt: numpy.array
        The training data outputs, used to standardise the MSLL score
    eys: numpy.array
        The predictions made by the trained model on test data

    Returns
    -------
    scores: dict
        A dictionary containing all of the evaluated scores.
    """
    probscores = ['msll', 'mll']

    scores = {}
    # cubist can predict nan when a categorical variable is not
    # present in the training data
    # TODO: Can be removed except for cubist
    nans = ~np.isnan(eys[:, 0])
    ys = ys[nans]
    eys = eys[nans]  # keep eys 2D so the probabilistic scores stay reachable

    for m in metrics:
        if m not in probscores:
            score = apply_multiple_masked(score_first_dim(metrics[m]),
                                          (ys, eys))
        elif eys.ndim == 2:
            if m == 'mll' and eys.shape[1] > 1:
                score = apply_multiple_masked(mll, (ys, eys[:, 0], eys[:, 1]))
            elif m == 'msll' and eys.shape[1] > 1:
                score = apply_multiple_masked(msll, (ys, eys[:, 0], eys[:, 1]),
                                              (yt,))
            else:
                continue
        else:
            continue
        scores[m] = score
    return scores
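# A minimal standalone sketch of the NaN filter above, using synthetic
# arrays (not real model output): Cubist can emit NaN point predictions
# for categories unseen in training, so rows without a valid first-column
# prediction are dropped before scoring. Names suffixed _demo are
# hypothetical.
import numpy as np

eys_demo = np.array([[1.0, 0.2], [np.nan, 0.3], [2.5, 0.1]])  # pred, variance
ys_demo = np.array([1.1, 0.9, 2.4])                           # true targets
keep = ~np.isnan(eys_demo[:, 0])
ys_demo, eys_demo = ys_demo[keep], eys_demo[keep]  # eys stays 2D for mll/msll
print(ys_demo, eys_demo[:, 0])  # -> [1.1 2.4] [1.  2.5]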
def test_apply_multiple_masked(masked_data):
    yt, Xt, ys, Xs = masked_data
    yt_masked = np.ma.masked_array(yt, mask=Xt.mask.flatten())

    def fit(X, y):
        assert np.allclose(X, Xt.data[~Xt.mask.flatten()])
        assert np.allclose(y, yt_masked.data[~yt_masked.mask.flatten()])
        return

    def predict(X, y):
        return y

    yr = apply_multiple_masked(predict, (Xt, yt_masked))
    assert np.ma.all(yt_masked == yr)
    assert apply_multiple_masked(fit, (Xt, yt_masked)) is None
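# Context for the assertion style above: NumPy masked-array comparisons
# are themselves masked, so np.ma.all ignores masked elements. A tiny
# self-contained illustration with synthetic data (hypothetical names):
import numpy as np

a = np.ma.masked_array([1.0, 2.0, 3.0], mask=[False, True, False])
b = np.ma.masked_array([1.0, 9.0, 3.0], mask=[False, True, False])
print(np.ma.all(a == b))  # True: the differing element is masked out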
def permutation_importance(model, x_all, targets_all, config):
    _logger.info("Computing permutation importance!!")
    if config.algorithm not in transformed_modelmaps.keys():
        raise AttributeError("Only the following can be used for permutation "
                             "importance {}".format(
                                 list(transformed_modelmaps.keys())))

    y = targets_all.observations
    classification = hasattr(model, 'predict_proba')

    if not classification:
        for score in ['explained_variance',
                      'r2',
                      'neg_mean_absolute_error',
                      'neg_mean_squared_error']:
            pi_cv = apply_multiple_masked(
                PermutationImportance(model, scoring=score,
                                      cv='prefit', n_iter=10,
                                      refit=False).fit,
                data=(x_all, y))
            feature_names = geoio.feature_names(config)
            df_picv = eli5.explain_weights_df(
                pi_cv, feature_names=feature_names, top=100)
            csv = Path(config.output_dir).joinpath(
                config.name + "_permutation_importance_{}.csv".format(
                    score)).as_posix()
            df_picv.to_csv(csv, index=False)
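# For orientation, a hedged standalone sketch of the eli5 call pattern used
# above, on synthetic scikit-learn data. The feature names and model choice
# are illustrative stand-ins, not the pipeline's own.
import eli5
import numpy as np
from eli5.sklearn import PermutationImportance
from sklearn.ensemble import RandomForestRegressor

rng = np.random.RandomState(0)
X = rng.randn(200, 3)
y = 2.0 * X[:, 0] + 0.1 * rng.randn(200)

rf = RandomForestRegressor(n_estimators=20, random_state=0).fit(X, y)
# cv='prefit' with refit=False scores the already-trained model on
# column-permuted copies of X, as in permutation_importance above.
pi = PermutationImportance(rf, scoring='r2', cv='prefit',
                           n_iter=10, refit=False).fit(X, y)
print(eli5.explain_weights_df(pi, feature_names=['f0', 'f1', 'f2'], top=100))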
def classification_validation_scores(ys, eys, pys):
    """
    Calculates the validation scores for a classification prediction

    Given the test and training data, as well as the outputs from every
    model, this function calculates all of the applicable metrics in the
    following list, and returns a dictionary with the following
    (possible) keys:
        + accuracy
        + log_loss
        + f1

    Parameters
    ----------
    ys: numpy.array
        The test data outputs, one-hot representation
    eys: numpy.array
        The (hard) predictions made by the trained model on test data,
        one-hot representation
    pys: numpy.array
        The probabilistic predictions made by the trained model on test data

    Returns
    -------
    scores: dict
        A dictionary containing all of the evaluated scores.
    """
    scores = {}
    # clip hard 0/1 probabilities, which would make log_loss diverge
    pys = np.minimum(np.maximum(pys, MINPROB), 1. - MINPROB)

    for k, m in classification_metrics.items():
        scores[k] = apply_multiple_masked(m, (ys, eys, pys))

    return scores
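# The clipping step above in isolation: hard 0/1 probabilities are pushed
# into the open interval (MINPROB, 1 - MINPROB) before log_loss is
# evaluated. The MINPROB value here is an assumed stand-in for the
# module's constant.
import numpy as np

MINPROB_DEMO = 1e-5
pys_demo = np.array([[1.0, 0.0], [0.3, 0.7]])
clipped = np.minimum(np.maximum(pys_demo, MINPROB_DEMO), 1.0 - MINPROB_DEMO)
print(clipped)  # exact 0s and 1s are nudged away from the boundary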
def regression_validation_scores(y, ey, ws, model):
    """
    Calculates the validation scores for a regression prediction

    Given the test and training data, as well as the outputs from every
    model, this function calculates all of the applicable metrics in the
    following list, and returns a dictionary with the following
    (possible) keys:
        + r2_score
        + expvar
        + smse
        + lins_ccc
        + mll
        + msll

    Parameters
    ----------
    y: numpy.array
        The test data outputs
    ey: numpy.array
        The predictions made by the trained model on test data
    ws: numpy.array
        The weights of the test data
    model: Model
        The trained model, used to determine the available prediction
        tags and the target transform

    Returns
    -------
    scores: dict
        A dictionary containing all of the evaluated scores.
    """
    scores = {}
    result_tags = model.get_predict_tags()

    if 'Variance' in result_tags:
        py, vy = ey[:, 0], ey[:, 1]
    else:
        py, vy = ey[:, 0], ey[:, 0]
        # don't calculate mll when variance is not available
        regression_metrics.pop('mll', None)
        transformed_regression_metrics.pop('mll_transformed', None)

    if hasattr(model, '_notransform_predict') and \
            not isinstance(model.target_transform, Identity):
        # this is a transformed model
        y_t = model.target_transform.transform(y)    # transformed targets
        py_t = model.target_transform.transform(py)  # transformed predictions
        regression_metrics.update(transformed_regression_metrics)
        if 'Variance' in result_tags:
            # transform the standard deviation, then square it back
            v_t = model.target_transform.transform(np.sqrt(vy))
            vy_t = np.square(v_t)  # transformed variances
        else:
            vy_t = py
    else:
        # don't calculate transformed metrics when no transform is available
        y_t = y
        py_t = py
        vy_t = py

    for k, m in regression_metrics.items():
        scores[k] = apply_multiple_masked(m, (y, py, vy, ws, y_t, py_t, vy_t))

    return scores
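# The variance handling above, sketched with np.log1p standing in for the
# model's target_transform (purely illustrative): the transform is applied
# to standard deviations, and the result is squared back into variances.
import numpy as np

vy_demo = np.array([0.25, 1.0])       # predictive variances
sd_t = np.log1p(np.sqrt(vy_demo))     # transform acts on std deviations
vy_t_demo = np.square(sd_t)           # back to transformed variances
print(vy_t_demo)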
def local_learn_model(x_all, targets_all, config):
    model = None
    if config.multicubist or config.multirandomforest:
        y = targets_all.observations
        model = all_modelmaps[config.algorithm](**config.algorithm_args)
        apply_multiple_masked(model.fit, (x_all, y),
                              kwargs={'fields': targets_all.fields,
                                      'parallel': True,
                                      'lon_lat': targets_all.positions})
        if config.multirandomforest:
            rf_dicts = model._randomforests
            rf_dicts = mpiops.comm.gather(rf_dicts, root=0)
            mpiops.comm.barrier()
            if mpiops.chunk_index == 0:
                for rf in rf_dicts:
                    model._randomforests.update(rf)
    else:
        if mpiops.chunk_index == 0:
            y = targets_all.observations
            model = all_modelmaps[config.algorithm](**config.algorithm_args)
            apply_multiple_masked(model.fit, (x_all, y),
                                  kwargs={'fields': targets_all.fields,
                                          'lon_lat': targets_all.positions})

    # Save transformed targets for diagnostics
    if mpiops.chunk_index == 0 and hasattr(model, 'target_transform'):
        hdr = 'nontransformed,transformed'
        y = targets_all.observations
        y_t = model.target_transform.transform(y)
        np.savetxt(config.transformed_targets_file,
                   X=np.column_stack((y, y_t)),
                   delimiter=',', header=hdr, fmt='%.4e')
        if config.plot_target_scaling:
            diagnostics.plot_target_scaling(
                config.transformed_targets_file).savefig(
                    config.plot_target_scaling)

    return model
def y_y_plot(y1, y2,
             y_label=None, y_exp_label=None, title=None,
             outfile=None, display=None):
    """
    Makes a y-y plot from two corresponding vectors

    This function makes a y-y plot given two y vectors (y1, y2). This plot
    can be used to evaluate the performance of the machine learning models.

    Parameters
    ----------
    y1: numpy.array
        The first input vector
    y2: numpy.array
        The second input vector, of the same size as y1
    y_label: string
        The axis label for the first vector
    y_exp_label: string
        The axis label for the second vector
    title: string
        The plot title
    outfile: string
        The location to save an image of the plot
    display: boolean
        If true, a matplotlib graph will display in a window; note that
        this pauses the execution of the main program until the window
        is closed.
    """
    fig = pl.figure()
    maxy = max(y1.max(), get_first_dim(y2).max())
    miny = min(y1.min(), get_first_dim(y2).min())
    apply_multiple_masked(pl.plot, (y1, get_first_dim(y2)), ('k.',))
    pl.plot([miny, maxy], [miny, maxy], 'r')
    pl.grid(True)
    pl.xlabel(y_label)
    pl.ylabel(y_exp_label)
    pl.title(title)
    if outfile is not None:
        fig.savefig(outfile + ".png")
    if display:
        pl.show()
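# Hypothetical usage of y_y_plot with synthetic observations and
# predictions, assuming the module-level imports (pl, get_first_dim,
# apply_multiple_masked) are in scope. A good model's points should hug
# the red 1:1 line; the output path is illustrative only.
import numpy as np

rng = np.random.RandomState(1)
y_obs = rng.randn(100)
y_hat = y_obs + 0.1 * rng.randn(100)
y_y_plot(y_obs, y_hat, y_label='observed', y_exp_label='predicted',
         title='y-y plot', outfile='/tmp/yy_demo', display=False)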
def local_crossval(x_all, targets_all, config):
    """
    Performs K-fold cross validation to test the applicability of a model.

    Given a set of inputs and outputs, this function will evaluate the
    effectiveness of a model at predicting the targets, by splitting all
    of the known data. A model is trained on a subset of the total data,
    and then this model is used to predict all of the unseen targets; its
    performance provides a benchmark to evaluate the effectiveness of
    the model.

    Parameters
    ----------
    x_all: numpy.array
        A 2D array containing all of the training inputs
    targets_all: numpy.array
        A 1D vector containing all of the training outputs
    config: dict
        The global config object, which is used to choose the model
        to train.

    Returns
    -------
    result: dict
        A dictionary containing all of the cross validation metrics,
        evaluated on the unseen data subset.
    """
    parallel_model = config.multicubist or config.multirandomforest \
        or config.bootstrap
    if config.bootstrap and config.parallel_validate:
        config.algorithm_args['parallel'] = False
    elif not config.bootstrap and not config.parallel_validate \
            and mpiops.chunk_index != 0:
        return

    if config.multicubist or config.multirandomforest:
        config.algorithm_args['parallel'] = False

    _logger.info("Validating with {} folds".format(config.folds))
    model = modelmaps[config.algorithm](**config.algorithm_args)
    classification = hasattr(model, 'predict_proba')
    y = targets_all.observations
    lon_lat = targets_all.positions
    _, cv_indices = split_cfold(y.shape[0], config.folds,
                                config.crossval_seed)

    # Split folds over workers
    fold_list = np.arange(config.folds)
    if config.parallel_validate:
        fold_node = \
            np.array_split(fold_list, mpiops.chunks)[mpiops.chunk_index]
    else:
        fold_node = fold_list

    y_pred = {}
    y_true = {}
    fold_scores = {}
    pos = {}

    # Train and score on each fold
    for fold in fold_node:
        _logger.info(":mpi:Training fold {} of {} using process {}".format(
            fold + 1, config.folds, mpiops.chunk_index))

        train_mask = cv_indices != fold
        test_mask = ~train_mask

        y_k_train = y[train_mask]
        if config.target_weight_property:
            y_k_weight = \
                targets_all.fields[config.target_weight_property][train_mask]
        else:
            y_k_weight = None
        lon_lat_train = lon_lat[train_mask]
        lon_lat_test = lon_lat[test_mask]

        # Extra fields
        fields_train = {f: v[train_mask]
                        for f, v in targets_all.fields.items()}
        fields_pred = {f: v[test_mask] for f, v in targets_all.fields.items()}

        # Train on this fold
        x_train = x_all[train_mask]
        apply_multiple_masked(model.fit, data=(x_train, y_k_train),
                              fields=fields_train,
                              lon_lat=lon_lat_train,
                              sample_weight=y_k_weight)

        # Testing
        if not config.parallel_validate and mpiops.chunk_index != 0:
            continue
        else:
            y_k_pred = predict.predict(x_all[test_mask], model,
                                       fields=fields_pred,
                                       lon_lat=lon_lat_test)
            y_pred[fold] = y_k_pred
            n_covariates = x_all[test_mask].shape[1]

            # Regression
            if not classification:
                y_k_test = y[test_mask]
                fold_scores[fold] = regression_validation_scores(
                    y_k_test, y_k_pred, n_covariates, model)
            # Classification
            else:
                y_k_test = model.le.transform(y[test_mask])
                y_k_hard, p_k = y_k_pred[:, 0], y_k_pred[:, 1:]
                fold_scores[fold] = classification_validation_scores(
                    y_k_test, y_k_hard, p_k)

            y_true[fold] = y_k_test
            pos[fold] = lon_lat_test

    if config.parallel_validate:
        y_pred = _join_dicts(mpiops.comm.gather(y_pred, root=0))
        y_true = _join_dicts(mpiops.comm.gather(y_true, root=0))
        pos = _join_dicts(mpiops.comm.gather(pos, root=0))
        scores = _join_dicts(mpiops.comm.gather(fold_scores, root=0))
    else:
        scores = fold_scores

    result = None
    if mpiops.chunk_index == 0:
        y_true = np.concatenate([y_true[i] for i in range(config.folds)])
        y_pred = np.concatenate([y_pred[i] for i in range(config.folds)])
        pos = np.concatenate([pos[i] for i in range(config.folds)])
        valid_metrics = scores[0].keys()
        scores = {m: np.mean([d[m] for d in scores.values()], axis=0)
                  for m in valid_metrics}
        score_string = "Validation complete:\n"
        for metric, score in scores.items():
            score_string += "{}\t= {}\n".format(metric, score)
        _logger.info(score_string)

        result_tags = model.get_predict_tags()
        y_pred_dict = dict(zip(result_tags, y_pred.T))
        if hasattr(model, '_notransform_predict'):
            y_pred_dict['transformedpredict'] = \
                model.target_transform.transform(y_pred[:, 0])
        result = CrossvalInfo(scores, y_true, y_pred_dict,
                              classification, pos)

    if parallel_model:
        config.algorithm_args['parallel'] = True

    return result
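# The fold bookkeeping above in miniature: cv_indices assigns each sample
# a fold id, and each iteration trains on the complement of the held-out
# fold. The random assignment here stands in for split_cfold (hypothetical,
# for illustration only).
import numpy as np

folds = 3
rng = np.random.RandomState(0)
cv_indices_demo = rng.randint(0, folds, size=10)
for fold in range(folds):
    train_mask = cv_indices_demo != fold
    test_mask = ~train_mask
    print(fold, train_mask.sum(), test_mask.sum())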
def local_crossval(x_all, targets_all: targ.Targets, config: Config):
    """
    Performs K-fold cross validation to test the applicability of a model.

    Given a set of inputs and outputs, this function will evaluate the
    effectiveness of a model at predicting the targets, by splitting all
    of the known data. A model is trained on a subset of the total data,
    and then this model is used to predict all of the unseen targets; its
    performance provides a benchmark to evaluate the effectiveness of
    the model.

    Parameters
    ----------
    x_all: numpy.array
        A 2D array containing all of the training inputs
    targets_all: Targets
        The target observations and associated data (positions, weights,
        groups, fields)
    config: Config
        The global config object, which is used to choose the model
        to train.

    Returns
    -------
    result: dict
        A dictionary containing all of the cross validation metrics,
        evaluated on the unseen data subset.
    """
    # run cross validation in parallel, but one thread for each fold
    if config.multicubist or config.multirandomforest:
        config.algorithm_args['parallel'] = False

    if (mpiops.chunk_index != 0) and (not config.parallel_validate):
        return

    log.info("Validating with {} folds".format(config.folds))
    model = modelmaps[config.algorithm](**config.algorithm_args)
    classification = hasattr(model, 'predict_proba')
    groups = targets_all.groups

    if (len(np.unique(groups)) + 1 < config.folds) and config.group_targets:
        raise ValueError(
            f"Cannot continue cross-validation with the chosen parameters: "
            f"the number of groups {max(groups) + 1} in the data is less "
            f"than the number of folds {config.folds}")

    random_state = config.algorithm_args['random_state'] \
        if 'random_state' in config.algorithm_args \
        else np.random.randint(1000)
    x_all, y, lon_lat, groups, w, cv = setup_validation_data(
        x_all, targets_all, config.folds, random_state)
    _, cv_indices = split_gfold(groups, cv)

    # Split folds over workers
    fold_list = np.arange(config.folds)
    if config.parallel_validate:
        fold_node = \
            np.array_split(fold_list, mpiops.chunks)[mpiops.chunk_index]
    else:
        fold_node = fold_list

    y_pred = {}
    y_true = {}
    weight = {}
    lon_lat_ = {}
    fold_scores = {}

    # Train and score on each fold
    for fold in fold_node:
        model = modelmaps[config.algorithm](**config.algorithm_args)
        print("Training fold {} of {} using process {}".format(
            fold + 1, config.folds, mpiops.chunk_index))

        train_mask = cv_indices != fold
        test_mask = ~train_mask

        y_k_train = y[train_mask]
        w_k_train = w[train_mask]
        lon_lat_train = lon_lat[train_mask, :]
        lon_lat_test = lon_lat[test_mask, :]

        # Extra fields
        fields_train = {f: v[train_mask]
                        for f, v in targets_all.fields.items()}
        fields_pred = {f: v[test_mask] for f, v in targets_all.fields.items()}

        # Train on this fold
        x_train = x_all[train_mask]
        apply_multiple_masked(model.fit, data=(x_train, y_k_train),
                              fields=fields_train,
                              sample_weight=w_k_train,
                              lon_lat=lon_lat_train)

        # Testing
        y_k_pred = predict.predict(x_all[test_mask], model,
                                   fields=fields_pred,
                                   lon_lat=lon_lat_test)
        y_pred[fold] = y_k_pred

        # Regression
        if not classification:
            y_k_test = y[test_mask]
            y_true[fold] = y_k_test
            w_k_test = w[test_mask]
            weight[fold] = w_k_test
            lon_lat_[fold] = lon_lat_test
            fold_scores[fold] = regression_validation_scores(
                y_k_test, y_k_pred, w_k_test, model)
        # Classification
        else:
            y_k_test = model.le.transform(y[test_mask])
            y_true[fold] = y_k_test
            w_k_test = w[test_mask]
            weight[fold] = w_k_test
            lon_lat_[fold] = lon_lat_test
            y_k_hard, p_k = y_k_pred[:, 0], y_k_pred[:, 1:]
            fold_scores[fold] = classification_validation_scores(
                y_k_test, y_k_hard, w_k_test, p_k)

    if config.parallel_validate:
        y_pred = _join_dicts(mpiops.comm.gather(y_pred, root=0))
        lon_lat_ = _join_dicts(mpiops.comm.gather(lon_lat_, root=0))
        y_true = _join_dicts(mpiops.comm.gather(y_true, root=0))
        weight = _join_dicts(mpiops.comm.gather(weight, root=0))
        scores = _join_dicts(mpiops.comm.gather(fold_scores, root=0))
    else:
        scores = fold_scores

    result = None
    if mpiops.chunk_index == 0:
        y_true = np.concatenate([y_true[i] for i in range(config.folds)])
        weight = np.concatenate([weight[i] for i in range(config.folds)])
        lon_lat = np.concatenate([lon_lat_[i] for i in range(config.folds)])
        y_pred = np.concatenate([y_pred[i] for i in range(config.folds)])
        valid_metrics = scores[0].keys()
        scores = {m: np.mean([d[m] for d in scores.values()], axis=0)
                  for m in valid_metrics}
        score_string = "Validation complete:\n"
        for metric, score in scores.items():
            score_string += "{}\t= {}\n".format(metric, score)
        log.info(score_string)

        result_tags = model.get_predict_tags()
        y_pred_dict = dict(zip(result_tags, y_pred.T))
        if hasattr(model, '_notransform_predict'):
            y_pred_dict['transformedpredict'] = \
                model.target_transform.transform(y_pred[:, 0])
        result = CrossvalInfo(scores, y_true, y_pred_dict, weight,
                              lon_lat, classification)

    # change back to parallel
    if config.multicubist or config.multirandomforest:
        config.algorithm_args['parallel'] = True

    return result
def local_crossval(x_all, targets_all, config):
    """
    Performs K-fold cross validation to test the applicability of a model.

    Given a set of inputs and outputs, this function will evaluate the
    effectiveness of a model at predicting the targets, by splitting all
    of the known data. A model is trained on a subset of the total data,
    and then this model is used to predict all of the unseen targets; its
    performance provides a benchmark to evaluate the effectiveness of
    the model.

    Parameters
    ----------
    x_all: numpy.array
        A 2D array containing all of the training inputs
    targets_all: numpy.array
        A 1D vector containing all of the training outputs
    config: dict
        The global config object, which is used to choose the model
        to train.

    Returns
    -------
    result: dict
        A dictionary containing all of the cross validation metrics,
        evaluated on the unseen data subset.
    """
    # run cross validation in parallel, but one thread for each fold
    if config.multicubist or config.multirandomforest:
        config.algorithm_args['parallel'] = False

    if (mpiops.chunk_index != 0) and (not config.parallel_validate):
        return

    log.info("Validating with {} folds".format(config.folds))
    model = modelmaps[config.algorithm](**config.algorithm_args)
    y = targets_all.observations
    lon_lat = targets_all.positions
    _, cv_indices = split_cfold(y.shape[0], config.folds,
                                config.crossval_seed)

    # Split folds over workers
    fold_list = np.arange(config.folds)
    if config.parallel_validate:
        fold_node = \
            np.array_split(fold_list, mpiops.chunks)[mpiops.chunk_index]
    else:
        fold_node = fold_list

    y_pred = {}
    y_true = {}
    fold_scores = {}

    # Train and score on each fold
    for fold in fold_node:
        print("Training fold {} of {} using process {}".format(
            fold + 1, config.folds, mpiops.chunk_index))

        train_mask = cv_indices != fold
        test_mask = ~train_mask

        y_k_train = y[train_mask]
        lon_lat_train = lon_lat[train_mask]
        lon_lat_test = lon_lat[test_mask]

        # Extra fields
        fields_train = {f: v[train_mask]
                        for f, v in targets_all.fields.items()}
        fields_pred = {f: v[test_mask] for f, v in targets_all.fields.items()}

        # Train on this fold
        apply_multiple_masked(model.fit, data=(x_all[train_mask], y_k_train),
                              kwargs={'fields': fields_train,
                                      'lon_lat': lon_lat_train})

        # Testing
        y_k_pred = predict.predict(x_all[test_mask], model,
                                   fields=fields_pred,
                                   lon_lat=lon_lat_test)
        y_k_test = y[test_mask]
        y_pred[fold] = y_k_pred
        y_true[fold] = y_k_test
        fold_scores[fold] = calculate_validation_scores(y_k_test, y_k_train,
                                                        y_k_pred)

    if config.parallel_validate:
        y_pred = _join_dicts(mpiops.comm.gather(y_pred, root=0))
        y_true = _join_dicts(mpiops.comm.gather(y_true, root=0))
        scores = _join_dicts(mpiops.comm.gather(fold_scores, root=0))
    else:
        scores = fold_scores

    result = None
    if mpiops.chunk_index == 0:
        y_true = np.concatenate([y_true[i] for i in range(config.folds)])
        y_pred = np.concatenate([y_pred[i] for i in range(config.folds)])
        valid_metrics = scores[0].keys()
        scores = {m: np.mean([d[m] for d in scores.values()])
                  for m in valid_metrics}
        score_string = "Validation complete:\n"
        for metric, score in scores.items():
            score_string += "{}\t= {}\n".format(metric, score)
        log.info(score_string)

        result_tags = model.get_predict_tags()
        y_pred_dict = dict(zip(result_tags, y_pred.T))
        result = CrossvalInfo(scores, y_true, y_pred_dict)

    # change back to parallel
    if config.multicubist or config.multirandomforest:
        config.algorithm_args['parallel'] = True

    return result
def local_learn_model(x_all, targets_all, config):
    """
    Trains a model. Handles the special case of parallel models.

    Parameters
    ----------
    x_all : np.ndarray
        All covariate data, shape (n_samples, n_features), sorted using
        X, Y of target positions.
    targets_all : np.ndarray
        All target data, shape (n_samples,), sorted using X, Y of target
        positions.
    config : :class:`~uncoverml.config.Config`
        Config object.

    Returns
    -------
    :class:`~uncoverml.model.Model`
        A trained Model.
    """
    mpiops.comm.barrier()
    model = None

    if config.target_weight_property:
        weights = targets_all.fields[config.target_weight_property]
    else:
        weights = None

    # Handle models that can be trained in parallel
    if config.multicubist or config.multirandomforest or config.bootstrap:
        y = targets_all.observations
        model = all_modelmaps[config.algorithm](**config.algorithm_args)
        apply_multiple_masked(model.fit, (x_all, y),
                              fields=targets_all.fields,
                              lon_lat=targets_all.positions,
                              sample_weight=weights)
        # Special case: for MRF we need to gather the forests from each
        # process and cache them in the model
        if config.multirandomforest:
            rf_dicts = model._randomforests
            rf_dicts = mpiops.comm.gather(rf_dicts, root=0)
            mpiops.comm.barrier()
            if mpiops.chunk_index == 0:
                for rf in rf_dicts:
                    model._randomforests.update(rf)
    # Single-threaded models
    else:
        if mpiops.chunk_index == 0:
            y = targets_all.observations
            model = all_modelmaps[config.algorithm](**config.algorithm_args)
            apply_multiple_masked(model.fit, (x_all, y),
                                  fields=targets_all.fields,
                                  lon_lat=targets_all.positions,
                                  sample_weight=weights)

    # Save transformed targets for diagnostics
    if mpiops.chunk_index == 0 and hasattr(model, 'target_transform'):
        hdr = 'nontransformed,transformed'
        y = targets_all.observations
        y_t = model.target_transform.transform(y)
        np.savetxt(config.transformed_targets_file,
                   X=np.column_stack((y, y_t)),
                   delimiter=',', header=hdr, fmt='%.4e')
        if config.plot_target_scaling:
            diagnostics.plot_target_scaling(
                config.transformed_targets_file)\
                .savefig(config.plot_target_scaling)

    return model
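# The multirandomforest merge above follows a standard mpi4py gather
# pattern. A minimal sketch (run with e.g. `mpirun -n 4 python demo.py`;
# the forest payloads are illustrative stand-ins):
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

local_forests = {rank: 'forest-{}'.format(rank)}  # each rank's share
gathered = comm.gather(local_forests, root=0)     # list of dicts on root
if rank == 0:
    merged = {}
    for d in gathered:
        merged.update(d)
    print(merged)  # one entry per rank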