Example #1
def cross_validate(estimator: BaseEstimator, X: pd.DataFrame, y: pd.DataFrame,
                   num_splits: int, save_name: str) -> None:
    """
    Perform stratified k-fold cross-validation and call error_profile at the end to
    generate an error report for a scikit-learn model.
    :param estimator: scikit-learn classification model
    :param X: dataframe containing the feature data
    :param y: dataframe containing class labels corresponding to X
    :param num_splits: number of folds for k-fold cross validation
    :param save_name: save name for error profile plots (file extension will be appended)
    :return: None
    """
    splitter = StratifiedKFold(n_splits=num_splits,
                               shuffle=True,
                               random_state=0)

    predictions = {"test": [], "train": []}
    y_true = {"test": [], "train": []}

    for train_index, test_index in splitter.split(X, y):
        estimator.fit(X.iloc[train_index, :], y.iloc[train_index, 0])
        test_pred = estimator.predict(X.iloc[test_index, :])
        train_pred = estimator.predict(X.iloc[train_index, :])

        predictions["train"].append(train_pred)
        predictions["test"].append(test_pred)

        y_true["train"].append(np.array(y.iloc[train_index])[:, 0])
        y_true["test"].append(np.array(y.iloc[test_index])[:, 0])

    error_profile(y_true, predictions, model_type=save_name)
Example #2
def loop_snippet(clf: BaseEstimator, repeat: int, x, y, xt):
    time_table = []
    for i in range(repeat):
        start = time.perf_counter()
        clf.fit(x, y)
        clf.predict(xt)
        time_table.append(time.perf_counter() - start)
    return time_table
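
A minimal usage sketch for the timing helper above, with hypothetical data (iris) and LogisticRegression standing in for any scikit-learn estimator; the snippet assumes `import time` is available in its module:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
x_tr, x_te, y_tr, _ = train_test_split(X, y, random_state=0)

timings = loop_snippet(LogisticRegression(max_iter=200), repeat=5, x=x_tr, y=y_tr, xt=x_te)
print(f"fastest run: {min(timings):.4f}s, mean: {sum(timings) / len(timings):.4f}s")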
Example #3
    def _model_predict(self, model: BaseEstimator, data: pd.DataFrame) -> np.ndarray:

        if self._task._task_type == BINARY_CLASSIFICATION:
            predictions = model.predict_proba(data)

        elif self._task._task_type == MULTI_CLASS_CLASSIFICATION:
            predictions = model.predict(data)

        elif self._task._task_type == REGRESSION:
            predictions = model.predict(data)

        else:
            raise ValueError(f"Unsupported task type: {self._task._task_type}")

        return predictions
Example #4
    def evaluate(self, model: BaseEstimator, X, y, X_test, y_test):
        metrics_logger = MetricsLogger(
            classes=audioset.ontology.MUSIC_GENRE_CLASSES,
            classsmetrics_filepath=self.classmetrics_filepath,
            show_top_classes=25,
            class_sort_key='ap'
        )

        logging.info('---- Train stats ----')
        predictions = model.predict(X)
        metrics_logger.log(predictions, y)

        logging.info('---- Test stats ----')
        predictions = model.predict(X_test)
        metrics_logger.log(predictions, y_test, show_classes=True)
Example #5
def build_submission(model_sj: BaseEstimator, model_iq: BaseEstimator,
                     test_features_sj: pd.DataFrame,
                     test_features_iq: pd.DataFrame, raw_path: str,
                     pred_path: str, name: str) -> pd.DataFrame:

    submission = pd.read_csv(os.path.join(raw_path, 'submission_format.csv'))

    y_pred_sj = model_sj.predict(test_features_sj)
    y_pred_iq = model_iq.predict(test_features_iq)
    y_pred = np.concatenate((y_pred_sj, y_pred_iq))

    submission['total_cases'] = np.round(y_pred).astype(int)
    submission.to_csv(os.path.join(pred_path, name + '.csv'), index=None)

    return submission
Example #6
def summarize_feature_comparisons(
        base_clf: BaseEstimator, comparison_clfs: Dict[str, BaseEstimator], X_test, y_test
):
    from mlxtend.evaluate import mcnemar, cochrans_q, mcnemar_table

    summary_dict = collections.OrderedDict()
    mcnemar_tbs = dict()

    # create list of predicted values
    base_y_predict = base_clf.predict(X_test)
    y_predictions = [base_y_predict]
    for idx, (name, clf) in enumerate(comparison_clfs.items()):
        # get the probability
        y_predict_proba = clf.predict_proba(X_test)
        y_predict = clf.predict(X_test)

        # form mcnemar tables against base classifier
        tb = mcnemar_table(y_test, base_y_predict, y_predict)
        mcnemar_tbs[f"base vs {name}"] = tb.values()

        # store predictions per classifier
        y_predictions.append(y_predict)

    # first run cochrans Q test
    qstat, pval = cochrans_q(y_test, *y_predictions)
    summary_dict["cochrans_q"] = qstat
    summary_dict["cochrans_q_pval"] = pval

    # run mcnemars test against all the predictions
    for name, table in mcnemar_tbs.items():
        chi2stat, pval = mcnemar(table, exact=True)
        summary_dict[f"mcnemar_{name}_chi2stat"] = chi2stat
        summary_dict[f"mcnemar_{name}_pval"] = pval

    return summary_dict
Example #7
def standard_report(
    estimator: BaseEstimator,
    X_test: Union[pd.DataFrame, np.ndarray],
    y_test: Union[pd.Series, np.ndarray],
    zero_division: str = "warn",
) -> None:
    """Display standard report of diagnostic metrics and plots for classification.

    Parameters
    ----------
    estimator : BaseEstimator
        Fitted classification estimator for evaluation.
    X_test : DataFrame or ndarray of shape (n_samples, n_features)
        Predictor test set.
    y_test : Series or ndarray of shape (n_samples,)
        Target test set.
    zero_division : str, optional
        Value to return for division by zero: 0, 1, or 'warn'.
    """
    table = classification_report(y_test,
                                  estimator.predict(X_test),
                                  zero_division=zero_division,
                                  heatmap=True)
    classification_plots(estimator, X_test, y_test)
    display(table)
Example #8
def evaluate_fchl(rep_computer: FCHLRepresentation,
                  model: BaseEstimator,
                  mols: List[str],
                  n_jobs: int = 1,
                  y_lower: List[float] = None) -> np.ndarray:
    """Run an FCHL-based model

    Args:
        rep_computer: Tool used to compute the FCHL-compatible representations for each molecule
        model: Model to be evaluated
        mols: List of molecules (XYZ format) to evaluate
        n_jobs: Number of threads to use for generating representations
        y_lower: Lower-fidelity estimate of the property. Used for delta learning models
    Returns:
        Results from the inference
    """

    # Convert the input molecules into FCHL-ready inputs
    rep_computer.n_jobs = n_jobs
    reps = rep_computer.transform(mols)

    # Run the model
    y_pred = model.predict(reps)
    if y_lower is not None:
        y_pred = np.add(y_pred, y_lower)
    return y_pred
Example #9
def generate(model: base.BaseEstimator, sentences: List[List[str]]) -> None:
    """Tag the sentences with the given model.

    Parameters
    ----------
    model : BaseEstimator
        Trained tagging model whose predict accepts conllu-style token dicts.
    sentences : list
        List of lists of strings representing the sentences to tag.
    """
    print(f"Tagging {len(sentences)} sentences.")

    # Since the models were trained on the lemmatized version of the words,
    # we also lemmatize them when tagging unlabeled sentences.
    lemmatizer = stem.WordNetLemmatizer()

    for sentence in sentences:
        # Convert to the lemmatized versions
        lemmatized = [lemmatizer.lemmatize(w.lower()) for w in sentence]

        # Convert to conllu.TokenList because models expect that.
        # Since they are essentially dicts, we build them that way.
        tags = model.predict([[{"lemma": w} for w in lemmatized]])

        print("Word\tTag")
        for w, t in zip(sentence, tags[0]):
            print(f"{w}\t{t}")
        print()
Example #10
def max_std_sampling(regressor: BaseEstimator,
                     X: modALinput,
                     n_instances: int = 1,
                     random_tie_break=False,
                     **predict_kwargs) -> np.ndarray:
    """
    Regressor standard deviation sampling strategy.

    Args:
        regressor: The regressor for which the labels are to be queried.
        X: The pool of samples to query from.
        n_instances: Number of samples to be queried.
        random_tie_break: If True, shuffles utility scores to randomize the order. This
            can be used to break the tie when the highest utility score is not unique.
        **predict_kwargs: Keyword arguments to be passed to :meth:`predict` of the CommitteeRegressor.

    Returns:
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """
    _, std = regressor.predict(X, return_std=True, **predict_kwargs)
    std = std.reshape(X.shape[0], )

    if not random_tie_break:
        return multi_argmax(std, n_instances=n_instances)

    return shuffled_argmax(std, n_instances=n_instances)
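
A hedged usage sketch: GaussianProcessRegressor is one scikit-learn estimator whose predict supports return_std=True, which this strategy relies on; the pool below is hypothetical data and the helpers above are assumed importable.

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor

rng = np.random.default_rng(0)
X_pool = rng.random((100, 2))
X_seen, y_seen = X_pool[:10], np.sin(X_pool[:10, 0])

gp = GaussianProcessRegressor().fit(X_seen, y_seen)
query_idx = max_std_sampling(gp, X_pool, n_instances=5)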
Example #11
def evaluate(df: pd.DataFrame, target_column: Text,
             clf: BaseEstimator) -> Dict:
    """Evaluate classifier on a dataset

    Args:
        df {pandas.DataFrame}: dataset
        target_column {Text}: target column name
        clf {sklearn.base.BaseEstimator}: classifier (trained model)

    Returns:
        Dict: Dict of reported metrics
            'f1' - F1 score
            'cm' - Confusion Matrix
            'actual' - true values for test data
            'predicted' - predicted values for test data
    """

    # Get X and Y
    y_test = df.loc[:, target_column].values.astype('int32')
    X_test = df.drop(target_column, axis=1).values.astype('float32')

    prediction = clf.predict(X_test)
    f1 = f1_score(y_true=y_test, y_pred=prediction, average='macro')
    cm = confusion_matrix(y_test, prediction)

    return {'f1': f1, 'cm': cm, 'actual': y_test, 'predicted': prediction}
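
A minimal usage sketch with hypothetical data; the snippet above assumes f1_score and confusion_matrix are imported from sklearn.metrics.

import pandas as pd
from sklearn.linear_model import LogisticRegression

df = pd.DataFrame({"x1": [0.0, 1.0, 2.0, 3.0],
                   "x2": [1.0, 0.0, 1.0, 0.0],
                   "label": [0, 0, 1, 1]})
clf = LogisticRegression().fit(df.drop("label", axis=1).values.astype("float32"),
                               df["label"].values.astype("int32"))
report = evaluate(df, "label", clf)
print(report["f1"])
print(report["cm"])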
Example #12
    def run_inference(
            self, batch: Sequence[numpy.ndarray], model: BaseEstimator,
            **kwargs) -> Iterable[PredictionResult]:
        # Vectorize the batch for better performance
        vectorized_batch = numpy.stack(batch, axis=0)
        predictions = model.predict(vectorized_batch)
        return [PredictionResult(x, y) for x, y in zip(batch, predictions)]
Example #13
    def _predict_regression(
            self,
            X: np.ndarray,
            model: BaseEstimator,
            task_type: int,
            Y_train: Optional[np.ndarray] = None) -> np.ndarray:
        def send_warnings_to_log(
            message: Union[Warning, str],
            category: Type[Warning],
            filename: str,
            lineno: int,
            file: Optional[TextIO] = None,
            line: Optional[str] = None,
        ) -> None:
            self.logger.debug('%s:%s: %s:%s' %
                              (filename, lineno, str(category), message))
            return

        with warnings.catch_warnings():
            warnings.showwarning = send_warnings_to_log
            Y_pred = model.predict(X)

        if len(Y_pred.shape) == 1:
            Y_pred = Y_pred.reshape((-1, 1))

        return Y_pred
Example #14
def get_preds_probas(
    est: BaseEstimator, X_test: DataFrame, y_test: Series, mapper_dict: Dict
) -> DataFrame:
    """
    Get prediction probabilities (if available) or return true and predicted
    labels
    """
    df_preds = DataFrame(est.predict(X_test), index=X_test.index)
    if hasattr(est.named_steps["clf"], "predict_proba"):
        # Get prediction probabilities (if available)
        df_probas = DataFrame(est.predict_proba(X_test), index=X_test.index)

        # Append prediction and prediction probabilities
        df_summ = concat([df_preds, df_probas], axis=1)
        df_summ.columns = ["predicted_label"] + [
            f"probability_of_{i}" for i in range(0, len(np.unique(y_test)))
        ]

        # Get label (class) with maximum prediction probability for each row
        df_summ["max_class_number_manually"] = df_probas.idxmax(axis=1)
        df_summ["probability_of_max_class"] = df_probas.max(axis=1)

        # Compare .predict_proba() and manually extracted prediction
        # probability
        lhs = df_summ["max_class_number_manually"]
        rhs = df_summ["predicted_label"].replace(mapper_dict)
        assert (lhs == rhs).eq(True).all()
    else:
        df_summ = df_preds.copy()
    # Get true label
    df_summ.insert(0, "true_label", y_test)
    return df_summ
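
A hedged usage sketch: the estimator is assumed to be a Pipeline whose final step is named "clf", and mapper_dict is assumed to map each predicted label to the column index of that class in predict_proba's output.

from pandas import DataFrame, Series
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X = DataFrame({"a": [0.1, 0.4, 0.9, 1.2], "b": [1.0, 0.8, 0.2, 0.1]})
y = Series([0, 0, 1, 1])
pipe = Pipeline([("scale", StandardScaler()),
                 ("clf", LogisticRegression())]).fit(X, y)
summary = get_preds_probas(pipe, X, y, mapper_dict={0: 0, 1: 1})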
Example #15
    def decision_boundary(self, x: np.ndarray, y: np.ndarray,
                          model: BaseEstimator):
        x0 = x[:, 0]
        x1 = x[:, 1]

        x_min, x_max = x0.min() - 1, x0.max() + 1
        y_min, y_max = x1.min() - 1, x1.max() + 1

        xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                             np.arange(y_min, y_max, 0.1))

        z = model.predict(np.c_[xx.ravel(), yy.ravel()])
        z = z.reshape(xx.shape)
        z = z.astype(str)  # np.str was removed from NumPy; use the builtin str

        y = [str(label) for label in y]

        fig = px.scatter(x=x0, y=x1, color=y)

        # fig = go.Figure()

        contour = go.Contour(z=z,
                             x=np.arange(x_min, x_max, 0.1),
                             y=np.arange(y_min, y_max, 0.1),
                             line_width=0,
                             colorscale=[[0, '#ff9900'], [1, '#6666ff']],
                             opacity=0.4,
                             showscale=False)

        fig.add_trace(contour)

        fig.update_layout(title='Decision boundary', legend_title='Label')

        pyo.iplot(fig)
Example #16
File: train.py Project: ku222/GCP
    def evaluate_model(self, model: BaseEstimator, xtest: np.ndarray,
                       ytest: np.ndarray) -> ModelStats:
        """Get the accuracy, recall, and precision of this model."""
        ypreds = model.predict(xtest)
        # scikit-learn metrics expect (y_true, y_pred) in that order
        return ModelStats(accuracy=accuracy_score(ytest, ypreds),
                          precision=precision_score(ytest, ypreds),
                          recall=recall_score(ytest, ypreds))
Example #17
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds,
                            degree_range, lambda_range):
        """
        Cross-validate to find best hyperparameters with k-fold CV.
        :param X: Training data.
        :param y: Training targets.
        :param model: sklearn model.
        :param lambda_range: Range of values for the regularization hyperparam.
        :param degree_range: Range of values for the degree hyperparam.
        :param k_folds: Number of folds for splitting the training data into.
        :return: A dict containing the best model parameters,
            with some of the keys as returned by model.get_params()
        """

        # TODO: Do K-fold cross validation to find the best hyperparameters
        #  Notes:
        #  - You can implement it yourself or use the built in sklearn utilities
        #    (recommended). See the docs for the sklearn.model_selection package
        #    http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
        #  - If your model has more hyperparameters (not just lambda and degree)
        #    you should add them to the search.
        #  - Use get_params() on your model to see what hyperparameters it has
        #    and their names. The parameters dict you return should use the same
        #    names as keys.
        #  - You can use MSE or R^2 as a score.

        # ====== YOUR CODE: ======

        kf = sklearn.model_selection.KFold(k_folds)
        smallest_loss = np.inf
        best_params = {"bostonfeaturestransformer__degree": 1, "linearregressor__reg_lambda": 0.2}
        count = 0


        for lam in lambda_range:
            for deg in degree_range:
                model.set_params(linearregressor__reg_lambda=lam, bostonfeaturestransformer__degree=deg)
                avg_mse = 0.0
                count += 1

                for train_i, test_i in kf.split(X):
                    x_train = X[train_i]
                    y_train = y[train_i]
                    model.fit(x_train, y_train)
                    y_pred = model.predict(X[test_i])
                    avg_mse += np.square(y[test_i] - y_pred).sum() / (2 * X.shape[0])

                avg_mse /= k_folds

                # Check whether the current parameters are the best so far
                if avg_mse <= smallest_loss:
                    smallest_loss = avg_mse
                    best_params = {"linearregressor__reg_lambda": lam, "bostonfeaturestransformer__degree": deg}
        # ========================

        print(count)
        return best_params
Example #18
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds, degree_range,
                        lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """

    #
    # Notes:
    # - You can implement it yourself or use the built in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.

    # ====== YOUR CODE: ======
    params = {
        'linearregressor__reg_lambda': lambda_range,
        'bostonfeaturestransformer__degree': degree_range
    }

    kf = KFold(n_splits=k_folds)
    best_params = ParameterGrid(params)[0]
    best_mse = np.inf
    best_r_2 = 0.0

    for p_dict in ParameterGrid(params):
        cur_acc = 0.0
        curr_r_2 = 0.0
        model.set_params(**p_dict)
        for train_index, test_index in kf.split(X):
            model.fit(X[train_index], y=y[train_index])
            mse, rsq = evaluate_accuracy(y[test_index],
                                         model.predict(X[test_index]))
            cur_acc += mse
            curr_r_2 += rsq

        cur_acc /= k_folds
        curr_r_2 /= k_folds

        if curr_r_2 > best_r_2:
            best_r_2 = curr_r_2
            best_params = p_dict
    # ========================

    return best_params
Example #19
def score_model(estimator: BaseEstimator, X: np.ndarray, y: np.ndarray) -> tuple:
    """
    Runs a cross_val_score with cv = 5 on arrays X, y with a neg mean squared error score.
    Performs the RMSE conversion and prints out scores.

    Args:
        estimator (BaseEstimator): Trained sklearn estimator object (Regressor)
        X (np.ndarray): Feature array
        y (np.ndarray): Target array

    Returns:
        no_val_rmse: [np.float64] RMSE score based on the training data
        no_val_r2: [np.float64] R^2 score based on the training data
        val_rmse_scores: [np.ndarray] Series of RMSE scores from cross validation
        cv_mean: [np.float64] Mean of all cross-validated RMSE scores
        cv_std: [np.float64] StDev of all cross-validated RMSE scores
        cv_cov: [np.float64] CoV of all cross-validated RMSE scores (CoV = StDev / Mean)
    """

    val_scores = cross_val_score(estimator, X, y, scoring="neg_mean_squared_error")
    val_scores = val_scores * -1
    val_rmse_scores = np.sqrt(val_scores)

    no_val_mse = mean_squared_error(y, estimator.predict(X))
    no_val_rmse = np.sqrt(no_val_mse)
    no_val_r2 = r2_score(y, estimator.predict(X))

    cv_mean = np.mean(val_rmse_scores)
    cv_std = np.std(val_rmse_scores)
    cv_cov = cv_std / cv_mean

    print("Non-validation Scores")
    print("-----------")
    print(f"RMSE (No Val): {np.round(no_val_rmse, 3)}")
    print(f"R^2 (No Val): {np.round(no_val_r2, 3)}")
    print()
    print("Validation Scores")
    print("-----------")
    print(f"RMSE's: {np.round(val_rmse_scores, 3)}")
    print(f"Mean: {np.round(cv_mean, 3)}")
    print(f"StDev: {np.round(cv_std, 3)}")
    print(f"CoV: {np.round(cv_cov, 3)}")

    return no_val_rmse, no_val_r2, val_rmse_scores, cv_mean, cv_std, cv_cov
Example #20
def predict(model: BaseEstimator, sample: list) -> int:
    """Make a prediction.

    Returns:
    A -1 indicating an outlier or 1 indicating a normal value.
    """

    # Wrap the single sample in a list because the estimator expects 2D input
    result = model.predict([sample])
    return int(result[0])
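
A hedged usage sketch: IsolationForest is one estimator whose predict returns -1 for outliers and 1 for normal values, matching the contract above; the training data is hypothetical.

import numpy as np
from sklearn.ensemble import IsolationForest

model = IsolationForest(random_state=0).fit(np.random.randn(200, 3))
print(predict(model, [0.1, -0.2, 0.3]))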
Example #21
    def test_vector_alignment(self):
        # Mock out a generic scikit-learn classifier
        mocked_model = BaseEstimator()
        mocked_model.fit = MagicMock()
        mocked_model.predict = MagicMock(return_value=[True])

        # Create a simple data frame extending to January 15
        date_sequence = pd.date_range('1/1/2011', periods=15, freq='D')
        time_series = pd.DataFrame({
            # This column will be accessed by name to generate the targets vector.
            'Violent Crime Committed?': [True, True] + [False]*13,

            # Actual time series used for nonsequential prediction will contain more than one column.
            # However, we just need to verify that it grabs the correct slices of each column,
            # so one stand-in column will suffice.
            'Other Data': [0]*10 + [1]*5
        }, index=date_sequence)

        # Construct a NonsequentialPredictor with the mock
        predictor = NonsequentialPredictor(time_series, model=mocked_model)

        # The date to predict comes before the end of the time series,
        # so all rows from the 13th on should be discarded
        date_to_predict = datetime.date(2011, 1, 13)

        # The mock always predicts True, so predict() should return True
        self.assertTrue(predictor.predict(date_to_predict))

        # And both fit and predict should have been called
        self.assertTrue(mocked_model.fit.called)
        self.assertTrue(mocked_model.predict.called)

        # When feeding training data to the sklearn model,
        # predict() needs to align each day of the time series with whether a violent crime was committed the NEXT day.
        # Thus, the first element of the Violent Crime Committed? column should have been removed
        #  before being used as the model's targets vector because it has no previous day to partner with.
        expected_targets = [True] + [False]*11

        # Similarly, the last element of any other column (in this case, 'Other Data')
        # should only go up to the day before the day we're trying to predict
        expected_features = [[0]]*10 + [[1]]*2

        # Get the two arguments passed to mocked_model
        fit_args = mocked_model.fit.call_args
        observed_features = fit_args[0][0]
        observed_targets = fit_args[0][1]

        # Equality tests with numpy arrays are wonky, so I convert numpy arrays to Python lists
        self.assertEqual(observed_targets.tolist(), expected_targets)
        self.assertEqual(observed_features.tolist(), expected_features)

        # Confirm the correct argument was passed to predict
        print(mocked_model.predict.call_args)
        observed_day_to_predict = mocked_model.predict.call_args[0][0]
        self.assertEqual(observed_day_to_predict.tolist(), [[1]])
Example #22
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds, degree_range,
                        lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """

    # TODO: Do K-fold cross validation to find the best hyperparameters
    #
    # Notes:
    # - You can implement it yourself or use the built in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.

    # ====== YOUR CODE: ======
    kf = sklearn.model_selection.KFold(n_splits=k_folds)
    best_params = None
    min_mse = np.inf
    for curr_degree in degree_range:
        for curr_lambda in lambda_range:
            params = dict(linearregressor__reg_lambda=curr_lambda,
                          bostonfeaturestransformer__degree=curr_degree)
            model.set_params(**params)
            mse = 0
            counter = 0
            for train_index, test_index in kf.split(X):
                counter = counter + 1
                model.fit(X[train_index], y[train_index])
                y_pred = model.predict(X[test_index])
                mse = mse + np.mean((y[test_index] - y_pred)**2)

            avg_mse = mse / counter
            print("avg_mse:", avg_mse, " labmda:", curr_lambda, " degree:",
                  curr_degree)
            if avg_mse < min_mse:
                best_params = params
                min_mse = avg_mse
    # ========================

    return best_params
Example #23
def score_data(data: pd.DataFrame, model: BaseEstimator) -> pd.DataFrame:
    """Score data using model."""
    feature_columns = [
        'sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
        'petal width (cm)'
    ]
    label_to_classes_map = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
    X = data[feature_columns].values
    data['predicted_labels'] = model.predict(X)
    data['predicted_class'] = (
        data['predicted_labels'].apply(lambda e: label_to_classes_map[e]))
    return data
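
A minimal usage sketch with the iris dataset, assuming the model was trained on the same four feature columns used above.

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

iris = load_iris(as_frame=True)
model = LogisticRegression(max_iter=1000).fit(iris.data.values, iris.target.values)
scored = score_data(iris.data.copy(), model)
print(scored[["predicted_labels", "predicted_class"]].head())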
Example #24
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds, degree_range,
                        lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """

    # TODO: Do K-fold cross validation to find the best hyperparameters
    #  Notes:
    #  - You can implement it yourself or use the built in sklearn utilities
    #    (recommended). See the docs for the sklearn.model_selection package
    #    http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    #  - If your model has more hyperparameters (not just lambda and degree)
    #    you should add them to the search.
    #  - Use get_params() on your model to see what hyperparameters it has
    #    and their names. The parameters dict you return should use the same
    #    names as keys.
    #  - You can use MSE or R^2 as a score.

    # ====== YOUR CODE: ======
    kf = sklearn.model_selection.KFold(k_folds)
    params_grid = sklearn.model_selection.ParameterGrid({
        'bostonfeaturestransformer__degree':
        degree_range,
        'linearregressor__reg_lambda':
        lambda_range
    })
    best_loss = -np.inf
    best_params = None
    for param in params_grid:
        model.set_params(**param)
        avg_score = 0.0
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            avg_score += r2_score(y_test, y_pred)
        avg_score /= k_folds
        if avg_score > best_loss:
            best_loss = avg_score
            best_params = param

    # ========================

    return best_params
Example #25
def compute_score(model: BaseEstimator, designs: List[str]) -> List[float]:
    """Assign a score to a series of designs given a machine learning model

    Args:
        model (BaseEstimator): Scikit-learn model
        designs ([str]): List of strings describing the model
    """

    # Run inference
    y_pred = model.predict(designs)

    # Return results
    return y_pred.tolist()
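
A minimal sketch of a model that can score raw design strings as assumed above: a text-featurization step and a regressor wrapped in one scikit-learn Pipeline (the data here is hypothetical).

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline

model = make_pipeline(CountVectorizer(analyzer="char", ngram_range=(1, 2)), Ridge())
model.fit(["abba", "abab", "bbbb"], [1.0, 2.0, 3.0])
print(compute_score(model, ["aabb", "bbaa"]))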
Example #26
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds, degree_range,
                        lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """

    # TODO: Do K-fold cross validation to find the best hyperparameters
    #  Notes:
    #  - You can implement it yourself or use the built in sklearn utilities
    #    (recommended). See the docs for the sklearn.model_selection package
    #    http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    #  - If your model has more hyperparameters (not just lambda and degree)
    #    you should add them to the search.
    #  - Use get_params() on your model to see what hyperparameters it has
    #    and their names. The parameters dict you return should use the same
    #    names as keys.
    #  - You can use MSE or R^2 as a score.

    # ====== YOUR CODE: ======
    # params = model.get_params()
    kf = sklearn.model_selection.KFold(n_splits=k_folds)
    params_grid = {
        'bostonfeaturestransformer__degree': degree_range,
        'linearregressor__reg_lambda': lambda_range
    }
    min_acc = np.inf

    for params in list(sklearn.model_selection.ParameterGrid(params_grid)):
        model.set_params(**params)
        curr_acc = 0
        for train_idx, test_idx in kf.split(X):
            train_x, train_y = X[train_idx], y[train_idx]
            test_x, test_y = X[test_idx], y[test_idx]
            model.fit(train_x, train_y)
            y_pred = model.predict(test_x)
            curr_acc += mse_score(test_y, y_pred)
        mean = curr_acc / k_folds
        if mean < min_acc:
            min_acc = mean
            best_params = params
    # ========================

    return best_params
Example #27
    def fit(self, X, original_y):
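        # Seed the ensemble with a dummy estimator that always predicts zero,
        # so the first gradient is computed against a zero baseline.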
        base_est = BaseEstimator()
        base_est.predict = lambda X: np.zeros(X.shape[0], dtype=float)
        self.estimators_ = [base_est]

        for i in range(self.n_estimators):
            grad = self.loss_grad(original_y, self._predict(X))
            estimator = deepcopy(self.base_regressor)
            estimator.fit(X, grad)

            self.estimators_.append(estimator)

        self.out_ = self._outliers(grad)
        self.feature_importances_ = self._calc_feature_imps()

        return self
Example #28
def finalize_model(
    model: BaseEstimator,
    X_train: CSVData,
    Y_train: CSVData,
    X_test: CSVData,
    test_ids: CSVData,
    output: str,
    smote_fn: SamplerFnType = None,
    outlier_detection: Any = None,
    header: Tuple[str, str] = ("id", "y"),
    label_indexing: int = 0,
    export_int: bool = False,
) -> None:
    """Train the model on the complete data and generate the submission file.

    Parameters
    ----------
    model: The model
    X_train: The training data
    Y_train: The training labels
    X_test: The test data
    test_ids: The IDs for the test data
    output: The path where to dump the output
    smote_fn: The function that takes labels and returns SMOTE
    outlier_detection: Optional outlier detector whose fit_predict marks inliers with 1
    header: The column names for the submission CSV
    label_indexing: What to start indexing the label from
    export_int: Whether to export the CSV as integers
    """
    print("Training model...")

    if outlier_detection is not None:
        outliers = outlier_detection.fit_predict(X_train)
        X_train = X_train[outliers == 1]
        Y_train = Y_train[outliers == 1]

    if smote_fn:
        smote = smote_fn(Y_train)
        X_train, Y_train = smote.fit_resample(X_train, Y_train)

    model.fit(X_train, Y_train)

    print("Model trained")
    Y_pred = model.predict(X_test) + label_indexing
    submission: Any = np.stack([test_ids, Y_pred], 1)  # Add IDs
    create_submission_file(output,
                           submission,
                           header=header,
                           export_int=export_int)
Example #29
def log_performance(
    X_test: np.ndarray,
    y_test_binarized: np.ndarray,
    model: BaseEstimator,
    binarizer: MultiLabelBinarizer,
    logger: Logger,
) -> None:
    """Logs performance of the model to the log file"""
    y_test_pred_binarized = model.predict(X_test)
    logger.info("-" * 80)
    logger.info("**EVALUATION\nClassification Report \n**")
    logger.info(
        classification_report(y_test_binarized,
                              y_test_pred_binarized,
                              target_names=binarizer.classes_,
                              zero_division=1))
    logger.info("\nAccuracy Score: {}".format(
        accuracy_score(y_test_binarized, y_test_pred_binarized)))
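
A hedged usage sketch of a multi-label setup: the model is assumed to have been trained on indicator labels produced by the same MultiLabelBinarizer, and classification_report / accuracy_score are assumed to be imported in the module above.

import logging
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

binarizer = MultiLabelBinarizer()
Y = binarizer.fit_transform([["a"], ["b"], ["a", "b"], ["b"]])
X = np.array([[0.0], [1.0], [0.5], [0.9]])
model = OneVsRestClassifier(LogisticRegression()).fit(X, Y)

logging.basicConfig(level=logging.INFO)
log_performance(X, Y, model, binarizer, logging.getLogger(__name__))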
Example #30
def link_prediction_pipeline(graph: nx.Graph, embeddings: np.array,
                             id2node: list, node2id: list,
                             classifier: BaseEstimator, **kwargs) -> dict:
    non_edges_train, non_edges_test, edges_train, edges_test = (
        kwargs["non_edges_train"], kwargs["non_edges_test"],
        kwargs["edges_train"], kwargs["edges_test"])
    X_train, X_test, Y_train, Y_test = link_pred_train_test_split(
        embeddings, node2id, **kwargs)

    # Classify
    classifier.fit(X_train, Y_train)
    y_pred = classifier.predict(X_test)
    y_true = Y_test

    return {
        "micro_f1": f1_score(y_true=y_true, y_pred=y_pred, average="micro"),
        "macro_f1": f1_score(y_true=y_true, y_pred=y_pred, average="macro"),
        "accuracy": accuracy_score(y_true=y_true, y_pred=y_pred)
    }
Example #31
def eval(model: base.BaseEstimator, test_data: List[conllu.TokenList]) -> None:
    """Evaluate a model using the provided dataset.

    Parameters
    ----------
    model : BaseEstimator
        Trained tagging model to evaluate.
    test_data : list
        List of sentences represented as `conllu.TokenList`.
    """
    print(f"Evaluating with {len(test_data)} sentences.")

    y_test = feature_extraction.extract_tags(test_data)

    y_pred = model.predict(test_data)
    accuracy = metrics.accuracy(y_test, y_pred)
    amb_accuracy = metrics.ambiguous_accuracy(test_data, y_test, y_pred)

    print("Model accuracy:", accuracy)
    print("Model ambiguous words accuracy:", amb_accuracy)