Example #1
def _benchmark_from_data(
    experiment: Experiment,
    *,
    estimator: BaseEstimator,
    X_train: DataType,
    y_train: TargetType,
    X_test: DataType,
    y_test: TargetType,
    save_train: bool = False,
) -> None:
    with _add_timing(experiment, "fit_time"):
        estimator.fit(X_train, y_train)

    _append_info(experiment, "fitted_estimator", estimator)

    with _add_timing(experiment, "score_time"):
        test_score = estimator.score(X_test, y_test)

    _append_info(experiment, "test_score", test_score)

    if save_train:
        train_score = estimator.score(X_train, y_train)
        _append_info(experiment, "train_score", train_score)

    for output in ("transform", "predict"):
        method = getattr(estimator, output, None)
        if method is not None:
            with _add_timing(experiment, f"{output}_time"):
                _append_info(experiment, f"{output}", method(X_test))
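
The _add_timing and _append_info helpers are not shown in this example. A minimal sketch of what they might look like, under the assumption that the experiment object simply exposes an info dict of lists (a hypothetical interface, not necessarily the project's actual one):

import time
from contextlib import contextmanager

@contextmanager
def _add_timing(experiment, key):
    # Time the enclosed block and record the elapsed seconds under the given key.
    start = time.perf_counter()
    try:
        yield
    finally:
        experiment.info.setdefault(key, []).append(time.perf_counter() - start)

def _append_info(experiment, key, value):
    # Store an arbitrary result under the given key.
    experiment.info.setdefault(key, []).append(value)
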
Example #2
def cross_validate(estimator: BaseEstimator, X: pd.DataFrame, y: pd.DataFrame,
                   num_splits: int, save_name: str) -> None:
    """
    function to perform cross validation and call error_profile at the end to generate an error report for a sklearn
    model
    :param estimator: SkLearn classification model
    :param X: dataframe containing data
    :param y: dataframe containing class labels corresponding to X
    :param num_splits: number of folds for k-fold cross validation
    :param save_name: save name for error profile plots (file extension will be appended)
    :return: None
    """
    splitter = StratifiedKFold(n_splits=num_splits,
                               shuffle=True,
                               random_state=0)

    predictions = {"test": [], "train": []}
    y_true = {"test": [], "train": []}

    for train_index, test_index in splitter.split(X, y):
        estimator.fit(X.iloc[train_index, :], y.iloc[train_index, 0])
        test_pred = estimator.predict(X.iloc[test_index, :])
        train_pred = estimator.predict(X.iloc[train_index, :])

        predictions["train"].append(train_pred)
        predictions["test"].append(test_pred)

        y_true["train"].append(np.array(y.iloc[train_index])[:, 0])
        y_true["test"].append(np.array(y.iloc[test_index])[:, 0])

    error_profile(y_true, predictions, model_type=save_name)
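
A hedged usage sketch for the function above, assuming error_profile from the same module is importable (synthetic data, purely illustrative):

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

X = pd.DataFrame(np.random.random((120, 6)))
y = pd.DataFrame(np.random.randint(0, 2, size=(120, 1)))

cross_validate(RandomForestClassifier(random_state=0), X, y,
               num_splits=5, save_name="rf_error_profile")
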
Example #3
def loop_snippet(clf: BaseEstimator, repeat: int, x, y, xt):
    time_table = []
    for i in range(repeat):
        start = time.perf_counter()
        clf.fit(x, y)
        clf.predict(xt)
        time_table.append(time.perf_counter() - start)
    return time_table
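
A minimal way to call this timing helper (synthetic data; any sklearn classifier works):

import numpy as np
from sklearn.linear_model import LogisticRegression

x = np.random.random((200, 5))
y = np.random.randint(0, 2, size=200)
xt = np.random.random((50, 5))

times = loop_snippet(LogisticRegression(max_iter=1000), repeat=5, x=x, y=y, xt=xt)
print(f"mean fit+predict time: {np.mean(times):.4f}s")
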
Example #4
def test_determine_offset(model: BaseEstimator, expected_offset: int):
    """
    Determine the correct output difference from the model
    """
    X, y = np.random.random((100, 10)), np.random.random((100, 10))
    model.fit(X, y)
    offset = ModelBuilder._determine_offset(model, X)
    assert offset == expected_offset
Example #5
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds,
                        degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """

    # TODO: Do K-fold cross validation to find the best hyperparameters
    #  Notes:
    #  - You can implement it yourself or use the built in sklearn utilities
    #    (recommended). See the docs for the sklearn.model_selection package
    #    http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    #  - If your model has more hyperparameters (not just lambda and degree)
    #    you should add them to the search.
    #  - Use get_params() on your model to see what hyperparameters it has
    #    and their names. The parameters dict you return should use the same
    #    names as keys.
    #  - You can use MSE or R^2 as a score.

    # ====== YOUR CODE: ======
    kf = sklearn.model_selection.KFold(k_folds)
    smallest_loss = np.inf
    best_params = {"bostonfeaturestransformer__degree": 1, "linearregressor__reg_lambda": 0.2}
    count = 0

    for lam in lambda_range:
        for deg in degree_range:
            model.set_params(linearregressor__reg_lambda=lam, bostonfeaturestransformer__degree=deg)
            avg_mse = 0.0
            count += 1

            for train_i, test_i in kf.split(X):
                x_train = X[train_i]
                y_train = y[train_i]
                model.fit(x_train, y_train)
                y_pred = model.predict(X[test_i])
                avg_mse += np.square(y[test_i] - y_pred).sum() / (2 * X.shape[0])

            avg_mse /= k_folds

            # check if the current params are the best
            if avg_mse <= smallest_loss:
                smallest_loss = avg_mse
                best_params = {"linearregressor__reg_lambda": lam, "bostonfeaturestransformer__degree": deg}
    # ========================
    print(count)
    return best_params
Example #6
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds, degree_range,
                        lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """

    #
    # Notes:
    # - You can implement it yourself or use the built in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.

    # ====== YOUR CODE: ======
    params = {
        'linearregressor__reg_lambda': lambda_range,
        'bostonfeaturestransformer__degree': degree_range
    }

    kf = KFold(n_splits=k_folds)
    best_params = ParameterGrid(params)[0]
    best_mse = np.inf
    best_r_2 = 0.0

    for p_dict in ParameterGrid(params):
        cur_acc = 0.0
        curr_r_2 = 0.0
        model.set_params(**p_dict)
        for train_index, test_index in kf.split(X):
            model.fit(X[train_index], y=y[train_index])
            mse, rsq = evaluate_accuracy(y[test_index],
                                         model.predict(X[test_index]))
            cur_acc += mse
            curr_r_2 += rsq

        cur_acc /= k_folds
        curr_r_2 /= k_folds

        if curr_r_2 > best_r_2:
            best_r_2 = curr_r_2
            best_params = p_dict
    # ========================

    return best_params
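
evaluate_accuracy is not defined in this snippet; a plausible stand-in that returns (MSE, R^2), offered only as an assumption about the missing helper:

import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_accuracy(y_true, y_pred):
    # Hypothetical helper: returns (MSE, R^2) for one fold's predictions.
    return mean_squared_error(y_true, y_pred), r2_score(y_true, y_pred)
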
Example #7
def test_data_types(est: BaseEstimator, feature, target):
    if hasattr(est, 'fit'):  # Meaning a Handler or Robust Model
        est.fit(feature, target).predict(feature)
    elif hasattr(est, 'detect'):
        est.detect(feature, target)
    elif hasattr(est, 'simulate_noise'):
        est.simulate_noise(feature, target)
    else:
        raise Exception("WTF")
Example #8
def auto_mlflow(
    run_name: str,
    model_name: BaseEstimator,
    data_params: dict = None,
    X: np.ndarray = "X_train",
    y: np.ndarray = "y_train",
) -> None:
    """
    Wrapper function that automates the application of mlflow to a model training event.

    Args:
        run_name (str): Desired name of the run, this will appear in the database
        model_name (BaseEstimator): Variable name of the sklearn estimator object
                                    (must refer to an already instantiated model)
        data_params (dict, optional): Dictionary containing params on the data
                                    e.g. {'standard_scaled': False}. Defaults to None.
        X (np.ndarray, optional): Feature array. Defaults to "X_train".
        y (np.ndarray, optional): Target array. Defaults to "y_train".

    Returns:
        None: Logs data to mlflow and prints a representation of the evaluation scores to the console
    """

    with mlflow.start_run(run_name=run_name):

        model_name.fit(X, y)

        no_val_rmse, no_val_r2, val_rmse_scores, cv_mean, cv_std, cv_cov = score_model(
            model_name, X, y
        )

        model_params = model_name.get_params()

        mlflow.log_params(data_params)
        mlflow.log_params(model_params)

        mlflow.log_metrics(
            {
                "no_val_rmse": no_val_rmse,
                "no_val_r2": no_val_r2,
                "cv_score_1": val_rmse_scores[0],
                "cv_score_2": val_rmse_scores[1],
                "cv_score_3": val_rmse_scores[2],
                "cv_score_4": val_rmse_scores[3],
                "cv_score_5": val_rmse_scores[4],
                "cv_mean": cv_mean,
                "cv_std": cv_std,
                "cv_cov": cv_cov,
            }
        )

        mlflow.sklearn.log_model(model_name, "model")

    return None
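
score_model is not shown above. One plausible sketch consistent with the six values unpacked there (an assumption, not the project's actual helper):

import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

def score_model(model, X, y):
    # Hypothetical helper matching the tuple unpacked in auto_mlflow.
    preds = model.predict(X)
    no_val_rmse = float(np.sqrt(mean_squared_error(y, preds)))
    no_val_r2 = float(r2_score(y, preds))
    neg_mse = cross_val_score(model, X, y, cv=5, scoring="neg_mean_squared_error")
    val_rmse_scores = np.sqrt(-neg_mse)
    cv_mean = float(val_rmse_scores.mean())
    cv_std = float(val_rmse_scores.std())
    cv_cov = cv_std / cv_mean  # coefficient of variation
    return no_val_rmse, no_val_r2, val_rmse_scores, cv_mean, cv_std, cv_cov
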
Example #9
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds, degree_range,
                        lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """

    # TODO: Do K-fold cross validation to find the best hyperparameters
    #
    # Notes:
    # - You can implement it yourself or use the built in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.

    # ====== YOUR CODE: ======
    kf = sklearn.model_selection.KFold(n_splits=k_folds)
    best_params = 0
    min_mse = np.inf
    for curr_degree in degree_range:
        for curr_lambda in lambda_range:
            params = dict(linearregressor__reg_lambda=curr_lambda,
                          bostonfeaturestransformer__degree=curr_degree)
            model.set_params(**params)
            mse = 0
            counter = 0
            for train_index, test_index in kf.split(X):
                counter = counter + 1
                model.fit(X[train_index], y[train_index])
                y_pred = model.predict(X[test_index])
                mse = mse + np.mean((y[test_index] - y_pred)**2)

            avg_mse = mse / counter
            print("avg_mse:", avg_mse, " labmda:", curr_lambda, " degree:",
                  curr_degree)
            if avg_mse < min_mse:
                best_params = params
                min_mse = avg_mse
    # ========================

    return best_params
Example #10
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds, degree_range,
                        lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """

    # TODO: Do K-fold cross validation to find the best hyperparameters
    #  Notes:
    #  - You can implement it yourself or use the built in sklearn utilities
    #    (recommended). See the docs for the sklearn.model_selection package
    #    http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    #  - If your model has more hyperparameters (not just lambda and degree)
    #    you should add them to the search.
    #  - Use get_params() on your model to see what hyperparameters it has
    #    and their names. The parameters dict you return should use the same
    #    names as keys.
    #  - You can use MSE or R^2 as a score.

    # ====== YOUR CODE: ======
    kf = sklearn.model_selection.KFold(k_folds)
    params_grid = sklearn.model_selection.ParameterGrid({
        'bostonfeaturestransformer__degree': degree_range,
        'linearregressor__reg_lambda': lambda_range
    })
    best_score = -np.inf
    best_params = None
    for param in params_grid:
        model.set_params(**param)
        avg_score = 0.0
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            avg_score += r2_score(y_test, y_pred)
        avg_score /= k_folds
        if avg_score > best_score:
            best_score = avg_score
            best_params = param
    # ========================

    return best_params
Example #11
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds, degree_range,
                        lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """

    # TODO: Do K-fold cross validation to find the best hyperparameters
    #  Notes:
    #  - You can implement it yourself or use the built in sklearn utilities
    #    (recommended). See the docs for the sklearn.model_selection package
    #    http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    #  - If your model has more hyperparameters (not just lambda and degree)
    #    you should add them to the search.
    #  - Use get_params() on your model to see what hyperparameters it has
    #    and their names. The parameters dict you return should use the same
    #    names as keys.
    #  - You can use MSE or R^2 as a score.

    # ====== YOUR CODE: ======
    # params = model.get_params()
    kf = sklearn.model_selection.KFold(n_splits=k_folds)
    params_grid = {
        'bostonfeaturestransformer__degree': degree_range,
        'linearregressor__reg_lambda': lambda_range
    }
    min_acc = np.inf

    for params in list(sklearn.model_selection.ParameterGrid(params_grid)):
        model.set_params(**params)
        curr_acc = 0
        for train_idx, test_idx in kf.split(X):
            train_x, train_y = X[train_idx], y[train_idx]
            test_x, test_y = X[test_idx], y[test_idx]
            model.fit(train_x, train_y)
            y_pred = model.predict(test_x)
            curr_acc += mse_score(test_y, y_pred)
        mean = curr_acc / k_folds
        if mean < min_acc:
            min_acc = mean
            best_params = params
    # ========================

    return best_params
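
mse_score is used here (and in the GridSearchCV variant further down) but not defined in the snippet; a minimal stand-in, assuming it is a plain mean-squared-error helper:

import numpy as np

def mse_score(y_true, y_pred):
    # Hypothetical helper: mean squared error between targets and predictions.
    return np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2)
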
Example #12
    def fit_transform(self, X_train: pd.DataFrame, X_test: pd.DataFrame,
                      y_train: pd.Series, y_test: pd.Series,
                      model: BaseEstimator) -> List:
        """
        Parameters
        ----------
        X_train:
            Данные для обучения
        X_test:
            Тестовый набор
        y_train:
            Целевая для обучающего набора
        y_test:
            Целевая для тестового набора
        model:
            Модель, совместимая с sklearn estimator

        Return value
        ------------
        Выбранный набор признаков
        """
        if self.permutation_importance_df is None:
            self.__fe = FeatureImportance(X_train, X_test, y_train, y_test,
                                          model, self.metric_name)
            self.permutation_importance_df = self.__fe.get_n_permutation_importance(
                self.n)

        self.permutation_importance_df = self.permutation_importance_df.sort_values(
            'permutation_' + self.metric_name, ascending=False)

        selected_features = []
        for i, col in enumerate(self.permutation_importance_df['features']):
            selected_features.append(col)
            if self.verbose:
                print('Fitting model on {0} features'.format(i + 1))
            model.fit(X_train[selected_features], y_train)
            current_metric = self.metric(model, X_test[selected_features],
                                         y_test)
            if self.verbose:
                print(self.metric_name + ' = {0}'.format(current_metric))
            self.subsets_.append({
                'score_' + self.metric_name: current_metric,
                'feature_names': list(selected_features)
            })

            if (i > self.early_stopping_rounds and
                    current_metric - self.subsets_[i - self.early_stopping_rounds]['score_' + self.metric_name] < self.epsilon):
                break

        return selected_features[:-self.early_stopping_rounds]
Example #13
    def _fit_step(self,
                  transformer: BaseEstimator,
                  ids: Tuple,
                  is_final: bool,
                  X: pd.DataFrame,
                  y: Iterable = None,
                  **fit_params):
        # make transformer unique for each CV split
        transformer.train_ = tuple(X.index)
        transformer.features_ = tuple(X.columns)

        # load transformer from database
        transformer_loaded, ids_loaded = self._load(transformer, ids)
        is_loaded = False if transformer_loaded is None else True
        if is_loaded:
            transformer = transformer_loaded
            ids = ids_loaded

        # fit final step
        if is_final:
            if not is_loaded:
                transformer.fit(X, y, **fit_params)

        # fit intermediate steps
        else:
            if not is_loaded:
                transformer.fit(X, y, **fit_params)

            transformed_data = transformer.transform(X)

            if isinstance(transformed_data, tuple):
                X, y = transformed_data

            else:
                Xnp = transformed_data

                # reshape input data
                if Xnp.shape != X.shape:
                    if isinstance(X, pd.DataFrame):
                        X = X.iloc[:, transformer.get_support()]

                else:
                    X = pd.DataFrame(Xnp)

        # save transformer
        if not is_loaded:
            ids = self._save(transformer, ids)

        return transformer, ids, X
Example #14
def _train_model(estimator: BaseEstimator,
                 grid_search_context: Dict[str, Any]) \
                 -> Tuple[BaseEstimator, Dict[str, Any]]:
    X = grid_search_context['X_train']
    y = grid_search_context['y_train']
    fit_params = grid_search_context['fit_params']

    fit_start = time()
    estimator.fit(X, y, **fit_params)
    fit_end = time()

    results = _evaluate_model(estimator, X, y, grid_search_context, "training")
    results["training_time_total"] = fit_end - fit_start

    return estimator, results
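
A hedged example of the context dict this helper expects, built from the keys looked up above; _evaluate_model is assumed to be available elsewhere in the module:

import numpy as np
from sklearn.ensemble import RandomForestRegressor

grid_search_context = {
    "X_train": np.random.random((100, 4)),
    "y_train": np.random.random(100),
    "fit_params": {},
}
fitted, results = _train_model(RandomForestRegressor(n_estimators=20), grid_search_context)
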
Example #15
    def out_of_fold(
            self,
            estimator: BaseEstimator,
            train_x, train_y,
            valid_x, valid_y):
        # For LightGBM and CatBoost, pass the following parameters at fit time
        fit_params = {}
        if type(estimator).__name__ in ('LGBMClassifier', 'CatBoostClassifier',):
            if 'eval_set' not in fit_params:
                fit_params['eval_set'] = [(valid_x, valid_y)]
            if 'early_stopping_rounds' not in fit_params:
                fit_params['early_stopping_rounds'] = 100

        estimator.fit(train_x, train_y, **fit_params)
        oof = self.make_pred(estimator, valid_x)
        return oof
Example #16
def train_fchl(rep_computer: FCHLRepresentation,
               model: BaseEstimator,
               mols: List[str],
               y: List[float],
               n_jobs: int = 1,
               y_lower: List[float] = None) -> BaseEstimator:
    """Retrain an FCHL-based model

    Args:
        rep_computer: Tool used to compute the FCHL-compatible representations for each molecule
        model: Model to be retrained
        mols: List of molecules (XYZ format) in training set
        y: List of other properties to predict
        n_jobs: Number of threads to use for generating representations
        y_lower: Lower-fidelity estimate of the property. Used for delta learning models
    Returns:
        Retrained model
    """

    # Convert the input molecules into FCHL-ready inputs
    rep_computer.n_jobs = n_jobs
    reps = rep_computer.transform(mols)

    # Retrain the model
    if y_lower is not None:
        y = np.subtract(y, y_lower)
    return model.fit(reps, y)
Example #17
def finalize_model(
    model: BaseEstimator,
    X_train: CSVData,
    Y_train: CSVData,
    X_test: CSVData,
    test_ids: CSVData,
    output: str,
    smote_fn: SamplerFnType = None,
    outlier_detection: Any = None,
    header: Tuple[str, str] = ("id", "y"),
    label_indexing: int = 0,
    export_int: bool = False,
) -> None:
    """Train the model on the complete data and generate the submission file.

    Parameters
    ----------
    model: The model
    X_train: The training data
    Y_train: The training labels
    X_test: The test data
    test_ids: The IDs for the test data
    output: The path where to dump the output
    smote_fn: The function that takes labels and returns SMOTE
    outlier_detection: Outlier detector whose fit_predict marks training rows to keep (1) or drop
    header: Column names for the submission file
    label_indexing: What to start indexing the label from
    export_int: Whether to export the CSV as integers
    """
    print("Training model...")

    if outlier_detection is not None:
        outliers = outlier_detection.fit_predict(X_train)
        X_train = X_train[outliers == 1]
        Y_train = Y_train[outliers == 1]

    if smote_fn:
        smote = smote_fn(Y_train)
        X_train, Y_train = smote.fit_resample(X_train, Y_train)

    model.fit(X_train, Y_train)

    print("Model trained")
    Y_pred = model.predict(X_test) + label_indexing
    submission: Any = np.stack([test_ids, Y_pred], 1)  # Add IDs
    create_submission_file(output,
                           submission,
                           header=header,
                           export_int=export_int)
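
smote_fn takes the labels and returns a sampler. A minimal sketch using imbalanced-learn's SMOTE (an assumption, since the sampler type is not shown here):

import numpy as np
from imblearn.over_sampling import SMOTE

def smote_fn(labels):
    # Hypothetical factory: cap k_neighbors by the size of the rarest class.
    _, counts = np.unique(labels, return_counts=True)
    return SMOTE(k_neighbors=max(1, min(5, counts.min() - 1)), random_state=0)
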
Example #18
def instantiate_and_fit(
    index: pd.DataFrame,
    fold: pd.DataFrame,
    X: np.ndarray,
    y: pd.DataFrame,
    estimator: BaseEstimator,
    n_splits: int = 5,
    param_grid: Optional[Dict[str, Any]] = None,
) -> BaseEstimator:
    assert fold.shape[0] == index.shape[0]
    assert fold.shape[0] == X.shape[0]
    assert fold.shape[0] == y.shape[0]

    fold_vals = fold.ravel()

    train_inds = fold_vals == "train"
    val_inds = fold_vals == "val"

    if val_inds.sum():
        raise NotImplementedError(
            "Explicit validation indices not yet supported.")

    y = y.values.ravel()

    nan_row, nan_col = np.nonzero(np.isnan(X) | np.isinf(X))
    if len(nan_row):
        logger.warning(
            f"Setting {len(nan_row)} NaN elements to zero before fitting {estimator}."
        )
        X[nan_row, nan_col] = 0

    logger.info(f"Fitting {estimator} on data (shape: {X.shape})")

    if param_grid is not None:
        group_k_fold = GroupKFold(n_splits=n_splits).split(
            X[train_inds], y[train_inds], index.trial.values[train_inds])

        grid_search = GridSearchCV(estimator=estimator,
                                   param_grid=param_grid,
                                   verbose=10,
                                   cv=list(group_k_fold))
        grid_search.fit(X[train_inds], y[train_inds])

        return grid_search.best_estimator_

    estimator.fit(X[train_inds], y[train_inds])
    return estimator
Example #19
def link_prediction_pipeline(graph: nx.Graph, embeddings: np.ndarray,
                             id2node: list, node2id: list,
                             classifier: BaseEstimator, **kwargs) -> dict:
    non_edges_train, non_edges_test = kwargs["non_edges_train"], kwargs["non_edges_test"]
    edges_train, edges_test = kwargs["edges_train"], kwargs["edges_test"]
    X_train, X_test, Y_train, Y_test = link_pred_train_test_split(
        embeddings, node2id, **kwargs)

    # Classify
    classifier.fit(X_train, Y_train)
    y_pred = classifier.predict(X_test)
    y_true = Y_test

    return {
        "micro_f1": f1_score(y_true=y_true, y_pred=y_pred, average="micro"),
        "macro_f1": f1_score(y_true=y_true, y_pred=y_pred, average="macro"),
        "accuracy": accuracy_score(y_true=y_true, y_pred=y_pred)
    }
Example #20
def fit_and_suppress_warnings(logger: PicklableClientLogger,
                              pipeline: BaseEstimator, X: Dict[str, Any],
                              y: Any) -> BaseEstimator:
    @no_type_check
    def send_warnings_to_log(message,
                             category,
                             filename,
                             lineno,
                             file=None,
                             line=None) -> None:
        logger.debug('%s:%s: %s:%s', filename, lineno, category.__name__,
                     message)
        return

    with warnings.catch_warnings():
        warnings.showwarning = send_warnings_to_log
        pipeline.fit(X, y)

    return pipeline
Example #21
def node_classification_pipeline(graph: nx.Graph, embeddings: np.ndarray,
                                 id2node: list, node2id: list,
                                 classifier: BaseEstimator, **kwargs) -> dict:
    test_size = kwargs["test_size"]

    node_vectors = embeddings
    labels = np.array([graph.nodes[word]["community"] for word in id2node])

    node_vectors_train, node_vectors_test, labels_train, labels_test = train_test_split(
        node_vectors, labels, test_size=test_size)

    classifier.fit(node_vectors_train, labels_train)
    y_pred = classifier.predict(node_vectors_test)
    y_true = labels_test

    return {
        "micro_f1": f1_score(y_true=y_true, y_pred=y_pred, average="micro"),
        "macro_f1": f1_score(y_true=y_true, y_pred=y_pred, average="macro")
    }
Example #22
def train_ss_ensemble(
    clf: BaseEstimator,
    params: SSEnsembleParams,
    X: np.ndarray,
    y: np.ndarray,
    lb_mask: np.ndarray,
):
    rng = np.random.RandomState(params.random_state)

    # We want to return out of bag predictions
    ulb_mask = ~lb_mask

    # TODO: not really necessary but we could set them all to zero from the outside as well
    y = y.copy()

    y[ulb_mask] = 0
    ulb_indices = ulb_mask.nonzero()[0]

    y_oob_sum = np.zeros(len(y))
    y_oob_hit = np.zeros(len(y))

    for i in range(params.n_estimators):
        bag_ulb_indices = rng.choice(ulb_indices, size=params.n_samples)
        bag_lb_indices = lb_mask.nonzero()[0]

        bag_indices = np.concatenate([bag_ulb_indices, bag_lb_indices])

        X_bag = X[bag_indices]
        y_bag = y[bag_indices]

        oob_mask = np.ones(len(y), dtype="bool")
        oob_mask[bag_indices] = False

        X_oob = X[oob_mask]

        clf = clone(clf)
        clf.fit(X_bag, y_bag)

        y_oob = clf.predict_proba(X_oob)
        y_oob_sum[oob_mask] += y_oob[:, 1]
        y_oob_hit[oob_mask] += 1

    return y_oob_sum / y_oob_hit
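
A hedged usage sketch, assuming SSEnsembleParams is a simple container with the attributes read above (n_estimators, n_samples, random_state):

import numpy as np
from dataclasses import dataclass
from sklearn.linear_model import LogisticRegression

@dataclass
class SSEnsembleParams:
    # Assumption: mirrors the attributes train_ss_ensemble reads.
    n_estimators: int = 10
    n_samples: int = 50
    random_state: int = 0

X = np.random.random((200, 5))
y = np.random.randint(0, 2, size=200)
lb_mask = np.zeros(200, dtype=bool)
lb_mask[:50] = True  # only the first 50 labels are treated as known

# Out-of-bag scores are only meaningful for unlabeled rows; labeled rows sit in every bag.
oob = train_ss_ensemble(LogisticRegression(max_iter=1000), SSEnsembleParams(), X, y, lb_mask)
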
Example #23
def test_get_metadata_helper(model: BaseEstimator, expect_empty_dict: bool):
    """
    Ensure the builder works with various model configs and that each has
    expected/valid metadata results.
    """

    X, y = np.random.random((1000, 4)), np.random.random((1000, ))

    model.fit(X, y)

    metadata = ModelBuilder._extract_metadata_from_model(model)

    # All the metadata we've implemented so far is 'history', so we'll check that
    if not expect_empty_dict:
        assert "history" in metadata
        assert all(name in metadata["history"]
                   for name in ("params", "loss", "accuracy"))
    else:
        assert dict() == metadata
Example #24
File: pca.py  Project: ravwojdyla/sgkit
def pca_fit(
    ds: Dataset,
    est: BaseEstimator,
    *,
    variable: str = "call_alternate_allele_count",
    check_missing: bool = True,
) -> BaseEstimator:
    """ Fit PCA estimator """
    AC = _allele_counts(ds, variable, check_missing=check_missing)
    return est.fit(da.asarray(AC).T)
Example #25
    def test_vector_alignment(self):
        # Mock out a generic scikit-learn classifier
        mocked_model = BaseEstimator()
        mocked_model.fit = MagicMock()
        mocked_model.predict = MagicMock(return_value=[True])

        # Create a simple data frame extending to January 15
        date_sequence = pd.date_range('1/1/2011', periods=15, freq='D')
        time_series = pd.DataFrame({
            # This column will be accessed by name to generate the targets vector.
            'Violent Crime Committed?': [True, True] + [False]*13,

            # Actual time series used for nonsequential prediction will contain more than one column.
            # However, we just need to verify that it grabs the correct slices of each column,
            # so one stand-in column will suffice.
            'Other Data': [0]*10 + [1]*5
        }, index=date_sequence)

        # Construct a NonsequentialPredictor with the mock
        predictor = NonsequentialPredictor(time_series, model=mocked_model)

        # The date to predict comes before the end of the time series,
        # so all rows from the 13th on should be discarded
        date_to_predict = datetime.date(2011, 1, 13)

        # The mock always predicts True, so predict() should return True
        self.assertTrue(predictor.predict(date_to_predict))

        # And both fit and predict should have been called
        self.assertTrue(mocked_model.fit.called)
        self.assertTrue(mocked_model.predict.called)

        # When feeding training data to the sklearn model,
        # predict() needs to align each day of the time series with whether a violent crime was committed the NEXT day.
        # Thus, the first element of the Violent Crime Committed? column should have been removed
        #  before being used as the model's targets vector because it has no previous day to partner with.
        expected_targets = [True] + [False]*11

        # Similarly, the last element of any other column (in this case, 'Other Data')
        # should only go up to the day before the day we're trying to predict
        expected_features = [[0]]*10 + [[1]]*2

        # Get the two arguments passed to mocked_model
        fit_args = mocked_model.fit.call_args
        observed_features = fit_args[0][0]
        observed_targets = fit_args[0][1]

        # Equality tests with numpy arrays are wonky, so I convert numpy arrays to Python lists
        self.assertEqual(observed_targets.tolist(), expected_targets)
        self.assertEqual(observed_features.tolist(), expected_features)

        # Confirm the correct argument was passed to predict
        print(mocked_model.predict.call_args)
        observed_day_to_predict = mocked_model.predict.call_args[0][0]
        self.assertEqual(observed_day_to_predict.tolist(), [[1]])
Example #26
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds, degree_range,
                        lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """

    # TODO: Do K-fold cross validation to find the best hyperparameters
    #  Notes:
    #  - You can implement it yourself or use the built in sklearn utilities
    #    (recommended). See the docs for the sklearn.model_selection package
    #    http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    #  - If your model has more hyperparameters (not just lambda and degree)
    #    you should add them to the search.
    #  - Use get_params() on your model to see what hyperparameters it has
    #    and their names. The parameters dict you return should use the same
    #    names as keys.
    #  - You can use MSE or R^2 as a score.

    # ====== YOUR CODE: ======
    model = sklearn.model_selection.GridSearchCV(
        estimator=model,
        param_grid={
            'linearregressor__reg_lambda': lambda_range,
            'bostonfeaturestransformer__degree': degree_range
        },
        scoring=sklearn.metrics.make_scorer(mse_score,
                                            greater_is_better=False),
        cv=k_folds)
    model.fit(X, y)
    best_params = model.best_params_
    # ========================

    return best_params
Example #27
def _fit_and_suppress_warnings(logger: Union[logging.Logger,
                                             PicklableClientLogger],
                               model: BaseEstimator, X: np.ndarray,
                               y: np.ndarray) -> BaseEstimator:
    def send_warnings_to_log(
        message: Union[Warning, str],
        category: Type[Warning],
        filename: str,
        lineno: int,
        file: Optional[TextIO] = None,
        line: Optional[str] = None,
    ) -> None:
        logger.debug('%s:%s: %s:%s' %
                     (filename, lineno, str(category), message))
        return

    with warnings.catch_warnings():
        warnings.showwarning = send_warnings_to_log
        model.fit(X, y)

    return model
Example #28
def _lookahead(points: np.ndarray, model: BaseEstimator, train_ixs: List[int],
               obs_labels: List[float], x: np.ndarray, label: float):
    """
    Does a lookahead at what the model would be if (x, label) were added to the
    known set.  If the model implements the partial_fit API from sklearn, then
    that will be used.  Otherwise, the model is retrained from scratch

    Args:
        points (ndarray): Pool of candidate data points, indexed by train_ixs
        model (BaseEstimator): sklearn model to be retrained
        train_ixs (ndarray): Indices of currently-labeled set
        obs_labels (ndarray): Labels for each labeled entry
        x (ndarray): Data point to simulate being labeled
        label (float): Simulated label
    """
    # If partial-fit available, use it
    if hasattr(model, "partial_fit"):
        return model.partial_fit([x], [label], [0, 1])

    # Update the training set
    X_train = np.concatenate([points[train_ixs], [x]])
    obs_labels = np.concatenate([obs_labels, [label]])

    # Refit the model
    model.fit(X_train, obs_labels)
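
A hedged usage sketch; the [0, 1] classes argument above implies a binary classifier, so GaussianNB (which implements partial_fit) is used here:

import numpy as np
from sklearn.naive_bayes import GaussianNB

points = np.random.random((100, 3))
labels = np.random.randint(0, 2, size=100)
train_ixs = list(range(10))

model = GaussianNB()
model.fit(points[train_ixs], labels[train_ixs])

# Simulate labeling point 42 as class 1.
_lookahead(points, model, train_ixs, labels[train_ixs], points[42], 1)
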
Example #29
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds,
                        degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """

    # TODO: Do K-fold cross validation to find the best hyperparameters
    #
    # Notes:
    # - You can implement it yourself or use the built in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.

    # ====== YOUR CODE: ======
    kf = sklearn.model_selection.KFold(n_splits=k_folds)

    best_params = {}

    best_mse = None
    best_degree = None
    best_lambda = None

    for degree in degree_range:
        for lambda_r in lambda_range:
            mse = 0
            cnt = 0

            model.set_params(bostonfeaturestransformer__degree=degree, linearregressor__reg_lambda=lambda_r)
            # model = sklearn.pipeline.make_pipeline(
            #     BiasTrickTransformer(),
            #     BostonFeaturesTransformer(degree),
            #     LinearRegressor(lambda_r)
            # )

            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                model.fit(X_train, y_train)

                y_test_pred = model.predict(X_test)
                mse += np.sum((y_test - y_test_pred) ** 2)
                cnt += y_test.shape[0]
            mse /= cnt
            if best_mse is None or best_mse > mse:
                best_mse = mse
                best_degree = degree
                best_lambda = lambda_r
    best_params['bostonfeaturestransformer__degree'] = best_degree
    best_params['linearregressor__reg_lambda'] = best_lambda
    # ========================

    return best_params
Example #30
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds, degree_range,
                        lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """

    # TODO: Do K-fold cross validation to find the best hyperparameters
    #
    # Notes:
    # - You can implement it yourself or use the built in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    best_params = None

    # ====== YOUR CODE: ======
    best_accr = np.inf

    # Splitting the data k-fold
    k_folder = sklearn.model_selection.KFold(k_folds)
    # Iterating over all parameters
    for degree_param in degree_range:
        for lambda_param in lambda_range:

            # Defining current params and setting the model
            params = {
                'bostonfeaturestransformer__degree': degree_param,
                'linearregressor__reg_lambda': lambda_param
            }
            model.set_params(**params)

            avg_accur = 0

            # Checking params on all k folds
            for train_indices, val_indices in k_folder.split(X):
                train_X, train_y = X[train_indices], y[train_indices]
                val_X, val_y = X[val_indices], y[val_indices]

                # Training model on training set
                model.fit(train_X, train_y)

                # Evaluate accuracy on validation set
                y_pred = model.predict(val_X)
                mse = np.mean((val_y - y_pred)**2)
                avg_accur += mse

            # Calculating avg of all k_folds
            avg_accur = avg_accur / k_folds

            # Updating Best params
            if avg_accur < best_accr:
                best_accr = avg_accur
                best_params = params

    # ========================
    return best_params
Example #31
def plot_feature_importance(
    estimator: BaseEstimator,
    X_train: pd.DataFrame,
    y_train: Optional[pd.DataFrame] = None,
    top_n: int = 10,
    figsize: Tuple[int, int] = (8, 8),
    plot_error_bars: bool = True,
    print_table: bool = True,
) -> Tuple[plt.Figure, pd.DataFrame]:
    """plot feature importances of a tree-based sklearn estimator

    Args:
        estimator (BaseEstimator): sklearn-based estimator
        X_train (pd.DataFrame): training set features
        y_train (Optional[pd.DataFrame], optional): training set target values. Defaults to None.
        top_n (int, optional): top n feature importances to plot. Defaults to 10.
        figsize (Tuple[int, int], optional): Defaults to (8, 8).
        plot_error_bars (bool, optional): whether to plot error bars (std). Default to True.
        print_table (bool, optional): whether to print the table after the plot. Defaults to True.

    Raises:
        AttributeError: When feature_importances_ does not exists for the estimator

    Returns:
        plt.Figure: feature importances plot
        pd.DataFrame: df with feature name, importance, std based on trees
    """
    if not hasattr(estimator, "feature_importances_"):
        estimator.fit(X_train.values, y_train.values.ravel())
        if not hasattr(estimator, "feature_importances_"):
            raise AttributeError(
                f"{estimator.__class__.__name__} does not have feature_importances_ attribute"
            )

    feat_imp = pd.DataFrame({
        "feature": X_train.columns,
        "importance": estimator.feature_importances_
    })

    try:
        feat_imp["std"] = np.std(
            [tree.feature_importances_ for tree in estimator.estimators_],
            axis=0)
    except AttributeError:
        if plot_error_bars:
            logger.warning(
                f"cannot plot error bars for this estimator: {estimator.__class__.__name__}"
            )
            plot_error_bars = False

    feat_imp = feat_imp.sort_values(by="importance",
                                    ascending=False).iloc[:top_n]
    feat_imp = feat_imp.set_index("feature", drop=True)
    feat_imp = feat_imp.sort_values(by="importance", ascending=True)

    plot_kwargs = dict(
        title=f"Feature Importances for {estimator.__class__.__name__}",
        figsize=figsize)
    if plot_error_bars is True:
        plot_kwargs["xerr"] = "std"
    fig = feat_imp.plot.barh(**plot_kwargs)
    plt.xlabel("Feature Importance")

    if print_table is True:
        from IPython.display import display

        msg = f" Top {top_n} features in descending order of importance "
        print(f"\n{msg:-^100}\n")
        display(feat_imp.sort_values(by="importance", ascending=False))

    return fig, feat_imp
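
A minimal usage sketch with a random forest on synthetic data (purely illustrative):

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

X_train = pd.DataFrame(np.random.random((200, 6)),
                       columns=[f"f{i}" for i in range(6)])
y_train = pd.DataFrame(np.random.random(200))

fig, imp_df = plot_feature_importance(RandomForestRegressor(n_estimators=50),
                                      X_train, y_train, top_n=5,
                                      print_table=False)
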