Example #1
from keras.utils import np_utils


def _prep_keras_inputs(train, test):
    """Get the train/test data into the right format for Keras.

    First, split the train and test sets into features and target.
    Next, since Keras models only accept np.ndarrays, convert the
    features to that format. Finally, since this is a classification
    problem and a `softmax` will be used at the final layer, one-hot
    encode the targets so that they are two dimensional.

    Args:
    ----
        train: pandas DataFrame
        test: pandas DataFrame

    Returns:
    -------
        train_features: np.ndarray
        train_target: np.ndarray
        test_features: np.ndarray
        test_target: np.ndarray
    """

    # Break out the DataFrames.
    train_target, train_features = get_target_features(train)
    test_target, test_features = get_target_features(test)

    # Convert the features to np.ndarrays, as Keras expects.
    train_features = train_features.values
    test_features = test_features.values

    # One-hot encode the targets for the `softmax` output layer.
    train_target = np_utils.to_categorical(train_target.astype(int))
    test_target = np_utils.to_categorical(test_target.astype(int))

    return train_features, train_target, test_features, test_target
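
# A minimal usage sketch (assuming `train_df` and `test_df` are DataFrames
# whose target column `get_target_features` knows how to split out; the
# DataFrame names here are assumptions, not part of the original code):
X_train, y_train, X_test, y_test = _prep_keras_inputs(train_df, test_df)
print(X_train.shape, y_train.shape)  # (n_obs, n_feats), (n_obs, n_classes)
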
Example #2
def run_sklearn_param_search(model, train, cv_fold_generator, model_name, 
        random=False, num_iterations=10): 
    """Perform a model search over possible parameter values.
    
    Args: 
    ----
        model: varied
            Holds the model to perform the grid search over. Expected to implement 
            the sklearn model interface. 
        train: np.ndarray
        cv_fold_generator: SequentialTimeFold object 
            An object that generates folds to perform cross-validation over. 
        model_name: str
        random (optional): bool
            Whether to use RandomizedSearchCV instead of GridSearchCV.
        num_iterations (optional): int
            Number of iterations to use for random searching (if used). 

    Returns: 
    -------
        best_model: sklearn.<searcher>.best_estimator_
        best_mean_score: float
    """

    train_target, train_features = get_target_features(train)
    eval_metric = return_scorer('auc_precision_recall')

    fit_params = {}
    if random: 
        params = _get_random_params(model_name)
        grid_search = RandomizedSearchCV(estimator=model, param_distributions=params, 
                scoring=eval_metric, cv=cv_fold_generator, fit_params=fit_params, 
                n_iter=num_iterations)
    else: 
        params = _get_grid_params(model_name)
        grid_search = GridSearchCV(estimator=model, param_grid=params, 
                scoring=eval_metric, cv=cv_fold_generator, fit_params=fit_params)
    grid_search.fit(train_features.values, train_target.values)

    best_model = grid_search.best_estimator_
    best_mean_score = grid_search.best_score_

    return best_model, best_mean_score
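
# `return_scorer` is a project-specific helper; a minimal sketch of what it
# likely wraps, assuming sklearn's scorer factory (the mapping below is an
# assumption, not the project's actual implementation):
from sklearn.metrics import make_scorer, average_precision_score

def return_scorer(score_type):
    """Map a score name to an sklearn scorer object (sketch)."""
    if score_type == 'auc_precision_recall':
        # Area under the precision-recall curve, computed from the
        # model's decision scores/probabilities.
        return make_scorer(average_precision_score, needs_threshold=True)
    raise ValueError('Unsupported score type: {}'.format(score_type))
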
Example #3
        log_train_results(model_name, validation, best_fit_model, best_score, scores)
        # log_scores(best_fit_model, hold_out_features, hold_out_target, model_name, 
        # date_parts, hold_out_feats_pre_norm)
    else: 
        beg_dt, end_dt = sys.argv[3], sys.argv[4]
        beg_date, end_date = format_date(beg_dt), format_date(end_dt)

        model = get_model(model_name, {})

        best_params = get_best_params(model_name)
        model.set_params(**best_params)

        dt_range = pd.date_range(beg_date, end_date)
        for dt in dt_range: 
            validation, hold_out = get_train_test(input_df, 'date_fire', dt,
                    train=False)
            validation, hold_out = prep_data(validation), prep_data(hold_out)
            Y_train, X_train = get_target_features(validation)
            Y_test, X_test = get_target_features(hold_out)
            # Don't run models if there are no observations for a day.
            if X_train.shape[0] and X_test.shape[0]: 
                model.fit(X_train, Y_train)
                pred_probs = model.predict_proba(X_test)[:, 1]
                roc_auc, pr_auc = None, None
                # We can't get area under the curve if there are no fires :(. 
                if Y_test.sum() != 0: 
                    roc_auc = return_score('auc_roc', pred_probs, Y_test)
                    pr_auc = return_score('auc_precision_recall', pred_probs, Y_test)
                log_feat_importances(model, X_train, dt)
                log_test_results(dt, Y_test, pred_probs, roc_auc, pr_auc)
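
# `return_score` is a project-specific helper; a minimal sketch of what it
# likely wraps, assuming sklearn metrics (the mapping is an assumption):
from sklearn.metrics import roc_auc_score, average_precision_score

def return_score(score_type, pred_probs, y_true):
    """Map a score name to the corresponding sklearn metric (sketch)."""
    if score_type == 'auc_roc':
        return roc_auc_score(y_true, pred_probs)
    elif score_type == 'auc_precision_recall':
        return average_precision_score(y_true, pred_probs)
    raise ValueError('Unsupported score type: {}'.format(score_type))
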
Example #4
        log_train_results(model_name, validation, best_fit_model, best_score,
                          score_type)
    else:
        beg_dt, end_dt = sys.argv[3], sys.argv[4]
        beg_date, end_date = format_date(beg_dt), format_date(end_dt)

        model = get_model(model_name)

        best_params = get_best_params(model_name)
        model.set_params(**best_params)

        dt_range = pd.date_range(beg_date, end_date)
        for dt in dt_range:
            validation, hold_out = get_train_test(input_df, 'date_fire', dt)
            validation, hold_out = prep_data(validation), prep_data(hold_out)
            Y_train, X_train = get_target_features(validation)
            Y_test, X_test = get_target_features(hold_out)
            # Don't run models if there are no obs for a day.
            if X_train.shape[0] and X_test.shape[0]:
                model.fit(X_train, Y_train)
                pred_probs = model.predict_proba(X_test)[:, 1]
                roc_auc, pr_auc = None, None
                # We can't get area under the curve if there are no fires :(.
                if Y_test.sum() != 0:
                    roc_auc = return_score('auc_roc', pred_probs, Y_test)
                    pr_auc = return_score('auc_precision_recall', pred_probs,
                                          Y_test)
                log_feat_importances(model, X_train, dt)
                log_test_results(dt, geo_cols_df, Y_test, pred_probs, roc_auc,
                                 pr_auc)
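
# `format_date` is a project-specific helper; a minimal sketch, assuming the
# command-line arguments are 'YYYY-MM-DD' strings (the format is an assumption):
from datetime import datetime

def format_date(dt_str):
    """Parse a command-line date string into a datetime.date (sketch)."""
    return datetime.strptime(dt_str, '%Y-%m-%d').date()
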
Example #5
def run_sklearn_param_search(model, train, cv_fold_generator, 
        random=False, num_iterations=10, model_name=None, test=None): 
    """Perform a model grid search over the inputted parameters and folds. 
    
    For the given model and the relevant grid parameters, perform a 
    grid search with those grid parameters, and return the best model. 

    Args: 
    ----
        model: varied
            Holds the model to perform the grid search over. Expected 
            to implement the sklearn model interface. 
        train: np.ndarray
        cv_fold_generator: SequentialTimeFold/StratifiedTimeFold object 
            An object that generates folds to perform cross-validation over. 
        random (optional): bool
            Whether to use RandomizedSearchCV instead of GridSearchCV.
        num_iterations (optional): int
            Number of iterations to use for random searching (if used). 
        model_name (optional): str
            Holds the model name, used to determine whether it is a
            boosting model and hence whether to use early stopping. Must
            be passed in if `test` is passed in.
        test (optional): np.ndarray
            To be used for early stopping if passed in. 

    Returns: 
    -------
        best_model: sklearn.<searcher>.best_estimator_
            The best model as obtained through the parameter search. 
        best_mean_score: float
            The `mean_validation_score` from a sklearn.<searcher> object. 
        scores: list
            The scores from each run of the parameter search.
    """

    train_target, train_features = get_target_features(train)
    eval_metric = return_scorer('auc_precision_recall')

    fit_params = {}
    if test is not None and model_name in ('gboosting', 'xgboost'):
        test_target, test_features = get_target_features(test)
        # The sklearn `monitor` callback and xgboost both use code under
        # the hood that requires C-ordered, float32 arrays.
        test_target = test_target.values.astype('float32')
        test_features = test_features.values.astype('float32')
        test_target = test_target.copy(order='C')
        test_features = test_features.copy(order='C')

        early_stopping_tolerance = 5
        fit_params = _prep_fit_params(model_name, fit_params, 
                early_stopping_tolerance, test_features, test_target)

    if random: 
        params = _get_random_params(model_name)
        grid_search = RandomizedSearchCV(estimator=model, param_distributions=params, 
                scoring=eval_metric, cv=cv_fold_generator, fit_params=fit_params, 
                n_iter=num_iterations)
    else: 
        params = _get_grid_params(model_name)
        grid_search = GridSearchCV(estimator=model, param_grid=params, 
                scoring=eval_metric, cv=cv_fold_generator, fit_params=fit_params)
    grid_search.fit(train_features.values, train_target.values)

    best_model = grid_search.best_estimator_
    best_mean_score = grid_search.best_score_
    scores = grid_search.grid_scores_

    return best_model, best_mean_score, scores
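
# Example call (a sketch): parameter-search an xgboost model with early
# stopping on a held-out set. `XGBClassifier`, `train_df`, `test_df`, and
# `fold_generator` are stand-in assumptions for the project's real objects.
from xgboost import XGBClassifier

best_model, best_mean_score, scores = run_sklearn_param_search(
        XGBClassifier(), train_df, fold_generator, random=True,
        num_iterations=20, model_name='xgboost', test=test_df)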