예제 #1
0
def test_frequency_selection():
    """Grid-search with cross-validation should select the expected driver band.

    A driver-extraction -> delay -> DAR pipeline is searched over a small grid
    of driver center frequencies and bandwidths.  The test passes when the
    best combination found is ``low_fq == 5`` with ``low_fq_width == 1``.
    """
    # Build each pipeline stage separately so the step list stays readable.
    driver_step = ExtractDriver(fs=fs,
                                low_fq=4.,
                                max_low_fq=7.,
                                low_fq_width=low_fq_width,
                                random_state=0)
    delay_step = AddDriverDelay()
    dar_step = DARSklearn(fs=fs, ordar=20, ordriv=1, max_ordar=20)
    model = Pipeline(steps=[
        ('driver', driver_step),
        ('add', delay_step),
        ('dar', dar_step),
    ])

    # Candidate values; the "driver__" prefix routes them to the first step.
    candidates = dict(
        driver__low_fq=[3., 5., 7.],
        driver__low_fq_width=[0.25, 0.5, 1.],
    )

    search = GridSearchCVProgressBar(model,
                                     param_grid=candidates,
                                     return_train_score=False,
                                     verbose=1)
    # No separate high-frequency signal is supplied (second argument is None).
    search.fit(MultipleArray(raw_signal, None))

    winner = search.best_params_
    assert winner['driver__low_fq'] == 5
    assert winner['driver__low_fq_width'] == 1
예제 #2
0
def run_grid_search(model, path, param_grid, X, y, cv=3):
    """Run a ROC-AUC grid search, report the outcome and save the results.

    Parameters
    ----------
    model : estimator
        Estimator (or pipeline) handed to ``GridSearchCVProgressBar``.
    path : str
        Prefix of the results file name written under
        ``./grid_search_results/``.
    param_grid : dict or list of dict
        Parameter grid to search.
    X, y
        Training data and targets passed to ``fit``.
    cv : int or CV splitter, default 3
        Cross-validation strategy forwarded to the search.

    Returns
    -------
    tuple
        ``(search_results, best_score, best_params)`` where ``search_results``
        is a ``DataFrame`` built from ``cv_results_``.
    """
    import os

    results_dir = './grid_search_results/'
    # Create the output directory up front: without it ``to_csv`` fails only
    # after the whole (potentially very long) search has finished.
    os.makedirs(results_dir, exist_ok=True)

    start = time.time()

    search = GridSearchCVProgressBar(model,
                                     param_grid,
                                     scoring='roc_auc',
                                     cv=cv,
                                     n_jobs=-1,
                                     verbose=2)
    search.fit(X, y)

    # Fixed-point formatting.  The former "{:.4}" spec means "4 significant
    # digits" and falls back to scientific notation (e.g. 1.235e+03) for any
    # runtime of 1000 seconds or more.
    print("Total Runtime for Grid Search: {:.2f} seconds".format(
        time.time() - start))

    best_score = search.best_score_
    best_params = search.best_params_

    # NOTE: the value printed here is the best mean cross-validated ROC AUC
    # (scoring='roc_auc'); the label text is kept as-is for log compatibility.
    print("Testing Accuracy: {:.4}%".format(best_score * 100))
    print("\nOptimal Parameters: {}".format(best_params))

    search_results = pd.DataFrame.from_dict(search.cv_results_)

    # File name: <path>_<score digits without the dot>_<timestamp>.
    search_results.to_csv(results_dir + path + '_' +
                          str(round(best_score, 4)).replace('.', '') + '_' +
                          time.asctime().replace(' ', '_'))

    return search_results, best_score, best_params
예제 #3
0
def build_model():
    """
    Assemble the text-classification pipeline and wrap it in a grid search.

    The returned search object is NOT fitted here; the caller is expected to
    call ``fit`` on it to run the search over the parameters below.
    """
    # Text branch: token counts followed by TF-IDF weighting.
    text_pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer())
    ])
    features = FeatureUnion([
        ('text_pipeline', text_pipeline),
    ])
    # One one-vs-rest SGD classifier per output column.
    classifier = MultiOutputClassifier(OneVsRestClassifier(SGDClassifier()))

    pipeline = Pipeline([
        ('features', features),
        ('clf', classifier)
    ])

    text_prefix = 'features__text_pipeline__'
    sgd_prefix = 'clf__estimator__estimator__'
    parameters = {
        text_prefix + 'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
        text_prefix + 'tfidf__use_idf': (True, False),
        text_prefix + 'tfidf__smooth_idf': (True, False),
        'features__transformer_weights': (
            {'text_pipeline': 1},
            {'text_pipeline': 0.5},
            {'text_pipeline': 0.2}
        ),
        # NOTE(review): n_jobs controls parallelism, not model quality, so
        # searching over it triples the grid without affecting the scores.
        # Possibly another SGD parameter was intended -- confirm with author.
        sgd_prefix + 'n_jobs': [50, 100, 200],
        # Only one alpha is searched; 0.001 and 0.01 are currently disabled.
        sgd_prefix + 'alpha': [0.0001]
    }

    return GridSearchCVProgressBar(pipeline, param_grid=parameters, n_jobs=-1)
예제 #4
0
def train_learning_model_NB(learning_model, hyperparameters, X, y,
                            classifier_string):  #TFIDF FOR FACT OR OPINION
    """Grid-search ``learning_model`` on ``(X, y)`` and persist the outcome.

    A 10-fold grid search is run over ``hyperparameters``.  The refitted best
    estimator is dumped with joblib and the full ``cv_results_`` table is
    written as CSV, both under ``../Classification_models/<classifier_string>/``.

    Parameters
    ----------
    learning_model : estimator
        Estimator to tune.
    hyperparameters : dict or list of dict
        Parameter grid for the search.
    X, y
        Training data and targets.
    classifier_string : str
        Name used for the output sub-directory and the file names.

    Returns
    -------
    tuple
        ``(best_model_path, grid_df_path)``.
    """
    import os

    output_dir = '../Classification_models/' + classifier_string + '/'
    best_model_path = output_dir + classifier_string + '_' + '.pkl'
    grid_df_path = output_dir + 'grid_search_' + classifier_string + '_' + '.csv'

    # Make sure the target directory exists *before* the expensive search, so
    # a missing directory cannot throw away the results at save time.
    os.makedirs(output_dir, exist_ok=True)

    clf = GridSearchCVProgressBar(learning_model,
                                  hyperparameters,
                                  cv=10,
                                  verbose=0)
    best_model = clf.fit(X, y)
    grid_df = pd.DataFrame(best_model.cv_results_)

    grid_df.to_csv(grid_df_path)
    joblib.dump(best_model.best_estimator_, best_model_path)
    print('Saved Model!')
    print('Saved Grid Search!')
    return (best_model_path, grid_df_path)
예제 #5
0
def train_learning_model(learning_model, hyperparameters, all_d2v_models,
                         classifier_string):
    """Grid-search ``learning_model`` once per doc2vec model and save results.

    For every entry of ``all_d2v_models`` the training data is obtained from
    ``resampled_SMOTE``, a 10-fold grid search is run, and both the refitted
    best estimator (joblib) and the ``cv_results_`` table (CSV) are written
    under ``../Classification_models/<classifier_string>/``.

    Parameters
    ----------
    learning_model : estimator
        Estimator to tune.
    hyperparameters : dict or list of dict
        Parameter grid for the search.
    all_d2v_models : iterable of str
        Model identifiers; each one is passed to ``resampled_SMOTE`` and
        embedded in the output file names.
    classifier_string : str
        Name used for the output sub-directory and the file names.

    Returns
    -------
    dict
        Maps each model identifier to ``(best_model_path, grid_df_path)``.
    """
    import os

    output_dir = '../Classification_models/' + classifier_string + '/'
    # Create the target directory before any search runs so that a missing
    # directory cannot discard the results of a finished search.
    os.makedirs(output_dir, exist_ok=True)

    path_dictionary = {}
    for d2v_model in all_d2v_models:
        print('Training ' + d2v_model)
        X_resampled, y_resampled = resampled_SMOTE(d2v_model)
        clf = GridSearchCVProgressBar(learning_model,
                                      hyperparameters,
                                      cv=10,
                                      verbose=0)
        best_model = clf.fit(X_resampled, y_resampled)
        grid_df = pd.DataFrame(best_model.cv_results_)
        best_model_path = output_dir + classifier_string + '_' + d2v_model + '.pkl'
        grid_df_path = output_dir + 'grid_search_' + classifier_string + '_' + d2v_model + '.csv'
        grid_df.to_csv(grid_df_path)
        joblib.dump(best_model.best_estimator_, best_model_path)
        print('Saved Model!')
        path_dictionary[d2v_model] = (best_model_path, grid_df_path)
        print('Saved Grid Search!')
    return path_dictionary
    def optimize_hyperparams(self,
                             params_dict,
                             n_iter=10,
                             n_folds=5,
                             search_type='random'):
        """Search hyper-parameters on the transformed training data.

        :param params_dict: parameter distributions (``'random'``) or
            parameter grid (``'grid'``) handed to the search object.
        :param n_iter: number of sampled settings; only used by the
            randomized search.
        :param n_folds: value passed as ``cv`` to the search object.
        :param search_type: ``'random'`` for ``RandomizedSearchCV`` or
            ``'grid'`` for ``GridSearchCVProgressBar``.
        :return: the ``cv_results_`` of the fitted search, which is also kept
            on ``self.model_selection``.
        :raises ValueError: if ``search_type`` is not one of the two
            supported values.
        """
        # Reject unknown search types up front.  Previously neither branch
        # matched, so the method either hit an AttributeError or silently
        # re-fitted a stale search left over from an earlier call.
        if search_type not in ('random', 'grid'):
            raise ValueError(
                "search_type must be 'random' or 'grid', got {!r}".format(
                    search_type))

        tmp_classifier = LGBM_classifier(feature_names=self.model_features)

        # Set boosting type according to classifier name.
        if self.classifier_name == 'random_forest':
            tmp_classifier.set_params(**{
                'boosting': 'rf',
                'bagging_freq': 1,
                'bagging_fraction': 0.7
            })
        elif self.classifier_name == 'lgbm':
            tmp_classifier.set_params(**{'boosting': 'gbdt'})

        # refit=False: only the search results are needed here, no final
        # estimator is trained by the search itself.
        if search_type == 'random':
            self.model_selection = RandomizedSearchCV(
                estimator=tmp_classifier,
                param_distributions=params_dict,
                refit=False,
                random_state=2020,
                n_iter=n_iter,
                cv=n_folds,
                verbose=10,
                n_jobs=2)
        else:
            self.model_selection = GridSearchCVProgressBar(
                estimator=tmp_classifier,
                param_grid=params_dict,
                refit=False,
                cv=n_folds,
                verbose=10,
                n_jobs=2)

        self.model_selection.fit(self.X_train, self.y_train)

        return self.model_selection.cv_results_
예제 #7
0
def SVM_I(X, Y, grid):
    """SVM classifier.

    Grid-searches an ``svm.SVC`` (inside a pipeline whose scaling and PCA
    steps are disabled) for accuracy, writes the full ``cv_results_`` table to
    ``<CSV_ROOT>/svm_I.csv`` and prints the best configuration.

    :param X: feature table; its ``.values`` attribute is passed to ``fit``.
    :param Y: target table; ``.values.ravel()`` is passed to ``fit``.
    :param grid: currently ignored -- it is replaced by the fixed grid
        defined below.  Kept for signature compatibility.
    :return: None
    """
    print(DIVIDER)
    # define model
    mSVM = svm.SVC(random_state=RANDOM_STATE)
    # create cache
    cachedir = mkdtemp()
    try:
        # set-up pipeline: normalize, reduce components, model
        pipe = Pipeline(steps=[('scale', None), ('pca', None),
                               ('model', mSVM)],
                        memory=cachedir)
        # grid search parameters
        grid = {
            'scale': [None],
            'pca': [None],
            'model__kernel': ['rbf', 'linear'],
            'model__C': numpy.logspace(-4, 4, 5),
            'model__gamma': numpy.logspace(-4, 4, 5)
        }
        # define grid search and fit the values
        estimator = GridSearchCVProgressBar(pipe,
                                            grid,
                                            scoring='accuracy',
                                            n_jobs=-1,
                                            verbose=2)
        estimator.fit(X.values, Y.values.ravel())
        # store the results of grid search in CSV
        best_df = pandas.DataFrame.from_dict(estimator.cv_results_)
        best_df.to_csv('{0}/svm_I.csv'.format(CSV_ROOT))
        # prepare variables for printing (scores shown as percentages)
        means = 100 * estimator.cv_results_['mean_test_score']
        stds = 100 * estimator.cv_results_['std_test_score']
        params = estimator.cv_results_['params']
        i = estimator.best_index_
        print("SVM Best Results: %.2f%% (%.2f%%) with %r" %
              (means[i], stds[i], params[i]))
        print(DIVIDER)
    finally:
        # remove cache -- in a ``finally`` so the temporary directory is not
        # leaked when the search or the CSV export raises
        rmtree(cachedir)
    def _model(self,
               X_train,
               y_train,
               model_name: str = 'Logistic Regression',
               apply_reduction: bool = False) -> dict:
        """
        Trains the model via grid search and saves the pickled search object
        to ``<self.model_path>/<model_name>/<model_name>.pkl``.

        :param X_train: Training feature data
        :param y_train: Output feature data
        :param model_name: which model to use, ``'Logistic Regression'`` or
            ``'Random Forest'``
        :param apply_reduction: whether to apply PCA reduction
        :return: Classification report on the *training* data, as a dict
            (``output_dict=True``)
        :raises ValueError: if ``model_name`` is not a supported model
        """
        # Fail early with a clear message; an unknown name used to build a
        # pipeline without a classifier and failed later inside scikit-learn.
        if model_name not in ('Logistic Regression', 'Random Forest'):
            raise ValueError(
                "model_name must be 'Logistic Regression' or "
                f"'Random Forest', got {model_name!r}")

        param_grid = {}

        steps = [('scaler', StandardScaler())]

        if apply_reduction:
            steps.append(('dimension_reduction', PCA()))
            param_grid['dimension_reduction__n_components'] = [5, 10]

        if model_name == 'Logistic Regression':
            steps.append(('logistic', LogisticRegression()))
            param_grid['logistic__C'] = [
                1e-4, 1e-3, 1e-2, 1e-1, 1e1, 1e2, 1e3, 1e4
            ]
            # NOTE(review): not every penalty/solver pair below is supported
            # by LogisticRegression; how such candidates are handled depends
            # on the search's error_score setting -- confirm this is intended.
            param_grid['logistic__penalty'] = ['l1', 'l2', 'elasticnet']
            param_grid['logistic__solver'] = [
                'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'
            ]
            param_grid['logistic__multi_class'] = ['auto', 'ovr']

        else:  # 'Random Forest'
            steps.append(
                ('random_forest', RandomForestClassifier(random_state=1769)))
            param_grid['random_forest__max_depth'] = [20, 50, 100]
            param_grid['random_forest__n_estimators'] = [10, 50, 100]
            param_grid['random_forest__criterion'] = ['gini', 'entropy']
            param_grid['random_forest__min_samples_split'] = [2, 0.3]
            param_grid['random_forest__max_features'] = ['auto', 'sqrt']

        estimator = GridSearchCVProgressBar(Pipeline(steps),
                                            param_grid=param_grid,
                                            cv=3,
                                            refit=True)
        print(estimator.get_params().keys())
        estimator.fit(X_train, y_train)

        model_dir = os.path.join(self.model_path, model_name)
        os.makedirs(model_dir, exist_ok=True)

        # Context manager guarantees the file is flushed and closed; the
        # handle used to be left open.
        with open(os.path.join(model_dir, f"{model_name}.pkl"),
                  'wb') as model_file:
            pickle.dump(estimator, model_file)

        predictions = estimator.predict(X_train)
        return classification_report(y_train, predictions, output_dict=True)
class Model():
    """Feature pipeline plus an ``LGBM_classifier`` with hyper-parameter search.

    Typical call order, as implied by the attributes each method reads:
    ``transform`` (creates ``X_train``/``y_train``/``X_val``/``y_val``), then
    optionally ``optimize_hyperparams`` + ``fit_best_classifier`` or directly
    ``fit_classifier`` (creates ``classifier``), then the prediction, scoring
    and reporting helpers.
    """

    # Search types accepted by ``optimize_hyperparams``.
    _SEARCH_TYPES = ('random', 'grid')

    def __init__(self, **kwargs):
        """Store the configuration and build the (unfitted) feature pipeline.

        Recognised keyword arguments and their defaults: ``classifier``
        ('lgbm'), ``categorical_features`` / ``numerical_features`` /
        ``text_features`` / ``sequence_features`` (empty lists),
        ``scale_numerical`` (False) and ``accepts_sparse`` (True).
        """
        self.classifier_name = kwargs.get('classifier', 'lgbm')
        self.categorical_features = kwargs.get('categorical_features', [])
        self.numerical_features = kwargs.get('numerical_features', [])
        self.text_features = kwargs.get('text_features', [])
        self.sequence_features = kwargs.get('sequence_features', [])
        self.scale_numerical = kwargs.get('scale_numerical', False)
        self.accepts_sparse = kwargs.get('accepts_sparse', True)

        self.pipeline = CustomPipeline(
            categorical_features=self.categorical_features,
            numerical_features=self.numerical_features,
            text_features=self.text_features,
            sequence_features=self.sequence_features,
            accepts_sparse=self.accepts_sparse,
            scale_numerical=self.scale_numerical).build_pipeline()

    def get_model_params(self, deep=True):
        """Return the model configuration, including the classifier's params.

        Requires a fitted classifier (see ``fit_classifier``).  ``deep`` is
        accepted but not used.
        """
        return {
            'classifier': self.classifier_name,
            'classifier_params': self.get_classifier_params(),
            'categorical_features': self.categorical_features,
            'numerical_features': self.numerical_features,
            'text_features': self.text_features,
            'sequence_features': self.sequence_features,
            'scale_numerical': self.scale_numerical,
            'accepts_sparse': self.accepts_sparse
        }

    def get_classifier_params(self):
        """Return ``get_params()`` of the classifier set by ``fit_classifier``."""
        return self.classifier.get_params()

    @property
    def model_features(self):
        """Sanitised feature names produced by the feature-engineering step.

        Names are collected from every transformer in the pipeline's
        ``'feature_engineering'`` step, in order.
        """
        feature_names = []
        transformers = self.pipeline.named_steps[
            'feature_engineering'].transformer_list
        for transformer in transformers:
            # Each entry is a (name, transformer) pair.
            transformer_features = transformer[1].get_feature_names()
            feature_names.extend(transformer_features)

        # HACK: replace every character that is not ASCII alphanumeric with
        # an underscore.
        feature_names = [
            "".join(c if c.isascii() and c.isalnum() else "_" for c in str(x))
            for x in feature_names
        ]

        return feature_names

    @property
    def n_features(self):
        """Number of entries in ``model_features``."""
        return (len(self.model_features))

    def transform(self, data, transform_test=False):
        """Fit the pipeline on the training split and transform the splits.

        ``data`` must expose ``train`` and ``val`` (and ``test`` when
        ``transform_test`` is true), each with ``X`` and ``y`` attributes.
        The pipeline is fitted on ``data.train.X`` only.
        """
        print('Fitting pipeline...')
        self.pipeline.fit(data.train.X)

        print('Transforming data...')
        self.X_train = self.pipeline.transform(data.train.X)
        self.y_train = data.train.y

        self.X_val = self.pipeline.transform(data.val.X)
        self.y_val = data.val.y

        if transform_test:
            self.X_test = self.pipeline.transform(data.test.X)
            self.y_test = data.test.y

    def _new_classifier(self):
        """Return a fresh ``LGBM_classifier`` configured for ``classifier_name``."""
        classifier = LGBM_classifier(feature_names=self.model_features)

        # Set boosting type according to classifier name; any other name
        # leaves the classifier's own defaults untouched.
        if self.classifier_name == 'random_forest':
            classifier.set_params(boosting='rf',
                                  bagging_freq=1,
                                  bagging_fraction=0.7)
        elif self.classifier_name == 'lgbm':
            classifier.set_params(boosting='gbdt')

        return classifier

    def fit_classifier(self, **kwargs):
        """Create the classifier, apply ``kwargs`` as params and train it.

        Training uses ``X_train``/``y_train`` together with
        ``X_val``/``y_val``, so ``transform`` must have been called first.
        """
        self.classifier = self._new_classifier()

        # Set other parameters based on kwargs, if any.
        self.classifier.set_params(**kwargs)

        # Train classifier.
        print('Training classifier')
        self.classifier.fit(self.X_train, self.y_train, self.X_val, self.y_val)

    def fit_best_classifier(self):
        """Train the classifier with the best parameters of the last search."""
        assert hasattr(self, 'model_selection'), (
            'call optimize_hyperparams() before fit_best_classifier()')
        self.fit_classifier(**self.model_selection.best_params_)

    def predict(self, X_transf):
        """Predict with the fitted classifier on already-transformed data."""
        return self.classifier.predict(X_transf)

    def score(self, X_transf, y):
        """Score the fitted classifier on already-transformed data."""
        return self.classifier.score(X_transf, y)

    @property
    def get_feature_importance(self):
        """Dict with the classifier's feature importances and feature names."""
        return {
            'feature_importance': self.classifier.feature_importance_,
            'feature_name': self.model_features
        }

    def plot_feature_importance(self, n_features=30):
        """Horizontal bar plot of the ``n_features`` most important features."""
        importance_df = pd.DataFrame(self.get_feature_importance).sort_values(
            by='feature_importance', ascending=True)
        # Sorted ascending, so the tail holds the largest importances.
        # Columns by position: 0 = feature_importance, 1 = feature_name.
        importance_df[-n_features:].plot.barh(x=1,
                                              y=0,
                                              title='Feature Importance',
                                              legend=False,
                                              figsize=(8, 10))

    def get_performance_metrics(self):
        """Print and return training/validation scores and their relative gap.

        The scores come from ``score`` and are reported as AUC ROC.
        """
        auc_train = self.score(self.X_train, self.y_train)
        auc_val = self.score(self.X_val, self.y_val)

        print('training AUC ROC score: ', auc_train)

        print('validation AUC ROC score: ', auc_val)

        overfitting = abs(auc_train - auc_val) / auc_train
        print('relative over-fitting: ', overfitting)

        return {
            'auc_training': auc_train,
            'auc_validation': auc_val,
            'overfitting': overfitting
        }

    def optimize_hyperparams(self,
                             params_dict,
                             n_iter=10,
                             n_folds=5,
                             search_type='random'):
        """Search hyper-parameters on the transformed training data.

        :param params_dict: parameter distributions (``'random'``) or
            parameter grid (``'grid'``) handed to the search object.
        :param n_iter: number of sampled settings; only used by the
            randomized search.
        :param n_folds: value passed as ``cv`` to the search object.
        :param search_type: ``'random'`` for ``RandomizedSearchCV`` or
            ``'grid'`` for ``GridSearchCVProgressBar``.
        :return: the ``cv_results_`` of the fitted search, which is also kept
            on ``self.model_selection``.
        :raises ValueError: if ``search_type`` is not supported.
        """
        # Reject unknown search types up front.  Previously neither branch
        # matched, so the method either hit an AttributeError or silently
        # re-fitted a stale search left over from an earlier call.
        if search_type not in self._SEARCH_TYPES:
            raise ValueError(
                "search_type must be 'random' or 'grid', got {!r}".format(
                    search_type))

        tmp_classifier = self._new_classifier()

        # refit=False: only the search results are needed here; the final
        # model is trained separately by fit_best_classifier().
        if search_type == 'random':
            self.model_selection = RandomizedSearchCV(
                estimator=tmp_classifier,
                param_distributions=params_dict,
                refit=False,
                random_state=2020,
                n_iter=n_iter,
                cv=n_folds,
                verbose=10,
                n_jobs=2)
        else:
            self.model_selection = GridSearchCVProgressBar(
                estimator=tmp_classifier,
                param_grid=params_dict,
                refit=False,
                cv=n_folds,
                verbose=10,
                n_jobs=2)

        self.model_selection.fit(self.X_train, self.y_train)

        return self.model_selection.cv_results_

    def get_model_selection_results(self):
        """Return the last search's ``cv_results_`` as a DataFrame.

        The ``'params'`` entry is left out; every other key becomes a column.
        """
        results = self.model_selection.cv_results_
        results_df = pd.DataFrame(
            {key: results[key]
             for key in results if key != 'params'})
        return results_df
예제 #10
0
    'driver__low_fq_width': [0.25, 0.5, 1.],
}

###############################################################################
# Then we plug the model into GridSearchCV and we fit it.
#
# This performs a grid-search with cross-validation: First, multiple train and
# test sets are defined by the splitting strategy, as defined by the parameter
# `cv` in GridSearchCV. Then, GridSearchCV will loop over each parameter
# configuration, fitting the model on one train set and evaluating it on the
# corresponding test set.

# Plug the model and the parameter grid into a GridSearchCV estimator
# (GridSearchCVProgressBar is identical to GridSearchCV, but it adds a nice
# progress bar to monitor progress.)
# `cv=3` sets the cross-validation splitting strategy, and
# `return_train_score=False` skips computing scores on the training sets.
gscv = GridSearchCVProgressBar(model, param_grid=param_grid, cv=3,
                               return_train_score=False, verbose=1)

# Fit the grid-search. We use `MultipleArray` to put together low_sig and
# high_sig. If high_sig is None, we use low_sig for both the driver and the
# modeled signal.
# No target `y` is passed to `fit`; only the signal container is needed.
X = MultipleArray(low_sig, None)
gscv.fit(X)

###############################################################################
# Print the results of the grid search.

print("\nBest parameters set found over cross-validation:\n")
print(gscv.best_params_)

###############################################################################
# Plot the results of the grid search.