Example #1
def test_pipeline_raise_set_params_error():
    # Test pipeline raises set params error message for nested models.
    pipe = Pipeline([('cls', LinearRegression())])

    # expected error message
    error_msg = ('Invalid parameter %s for estimator %s. '
                 'Check the list of available parameters '
                 'with `estimator.get_params().keys()`.')

    assert_raise_message(ValueError,
                         error_msg % ('fake', 'Pipeline'),
                         pipe.set_params,
                         fake='nope')

    # nested model check
    assert_raise_message(ValueError,
                         error_msg % ("fake", pipe),
                         pipe.set_params,
                         fake__estimator='nope')
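A minimal companion sketch (not part of the original test; it only assumes scikit-learn and imbalanced-learn are installed) showing how to list the parameter names that set_params actually accepts:

from imblearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

pipe = Pipeline([('cls', LinearRegression())])
# Valid keys include 'cls' itself, nested ones such as 'cls__fit_intercept',
# and pipeline-level parameters like 'memory', 'steps' and 'verbose'.
print(sorted(pipe.get_params().keys()))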
Example #2
def Create_XGBoost_Model():
    learning_rates = [0.1, 0.05, 0.01]
    num_estimators = [10, 20, 30] + list(range(45, 100, 5))
    max_depths = [2**x for x in range(1, 7)]
    grid = {
        'xgbclassifier__learning_rate': learning_rates,
        'xgbclassifier__n_estimators': num_estimators,
        'xgbclassifier__max_depth': max_depths
    }
    xgb_model = xgb.XGBClassifier()
    # N_split is assumed to be defined at module level (e.g. N_split = 5)
    cv_kfold = KFold(n_splits=N_split, shuffle=True, random_state=4)
    pipeline = Pipeline([('under', RandomUnderSampler()),
                         ('xgbclassifier', xgb_model)])
    xgb_model_grid_search = GridSearchCV(estimator=pipeline,
                                         param_grid=grid,
                                         cv=cv_kfold,
                                         n_jobs=-1,
                                         verbose=4)
    return xgb_model_grid_search
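A hypothetical usage sketch for the function above. The synthetic data is invented for illustration, N_split is set here as the assumed module-level constant, and xgb, KFold, Pipeline, RandomUnderSampler and GridSearchCV are assumed to be imported as in the snippet:

from sklearn.datasets import make_classification

N_split = 5  # assumed module-level constant read by Create_XGBoost_Model
X, y = make_classification(n_samples=500, weights=[0.9], random_state=0)
search = Create_XGBoost_Model()
search.fit(X, y)
print(search.best_params_)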
Example #3
    def __init__(self, model_file: str = None) -> None:
        super().__init__()
        # pip install scikit-learn imbalanced-learn
        from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
        from imblearn.over_sampling import SMOTE
        from sklearn.linear_model import LogisticRegression
        from imblearn.pipeline import Pipeline
        self.pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('smote', SMOTE()),
            ('clf',
             LogisticRegression(
                 solver='newton-cg',
                 multi_class='multinomial',
                 random_state=42,
                 max_iter=100,
             )),
        ])
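A standalone sketch of the same text pipeline fitted on an invented toy corpus; k_neighbors is lowered here because SMOTE needs at least k_neighbors + 1 minority samples:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(k_neighbors=1)),  # tiny corpus, so only one neighbor
    ('clf', LogisticRegression(solver='newton-cg', random_state=42)),
])
texts = ["good", "great", "fine", "nice", "bad", "awful"]
labels = [1, 1, 1, 1, 0, 0]
pipeline.fit(texts, labels)
print(pipeline.predict(["good", "awful"]))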
Example #4
def balanceSampling(X_tr, y_train, up_ratio=1, dn_ratio=1):
    """
    Rebalance data by SMOTE oversampling followed by random undersampling.

    Parameters
    ----------
    up_ratio: oversampling ratio (minority/majority after SMOTE)
    dn_ratio: undersampling ratio (minority/majority after undersampling)

    """
    # sampling_strategy is the desired ratio of the minority class to the
    # majority class after resampling. Default is 1.0.
    over = SMOTE(sampling_strategy=up_ratio)
    under = RandomUnderSampler(sampling_strategy=dn_ratio)
    steps = [('over', over), ('under', under)]
    pipeline = Pipeline(steps=steps)
    X_train_sm, y_train_sm = pipeline.fit_resample(X_tr, y_train)
    
    print(X_train_sm.shape, y_train_sm.shape)
    return X_train_sm, y_train_sm
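A hypothetical call on synthetic data (SMOTE, RandomUnderSampler and the imblearn Pipeline are assumed imported as in the function above):

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=2000, weights=[0.95], random_state=0)
X_bal, y_bal = balanceSampling(X, y, up_ratio=0.5, dn_ratio=1.0)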
Example #5
File: main.py  Project: djdajing/NDSC
def _train(X, y, save_model):
    # print(y.value_counts())
    # min_sample = min(y.value_counts())
    # print("min : ", min_sample)
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    from imblearn.over_sampling import SMOTE
    # Pipeline items
    cv = CountVectorizer()  # bag-of-words representation
    tfidf = TfidfTransformer(use_idf=True)  # apply tf-idf weighting
    upsampling = SMOTE(k_neighbors=9)
    svm = SGDClassifier(penalty='l2', loss='modified_huber')

    parameters = {
        'cv__stop_words': ('english', None),  # remove stopwords or not
        'cv__max_df': (0.8, 0.9, 0.85, 0.95),  # ignore words appearing in more than this fraction of documents
        'svm__alpha': (1e-3, 1e-4),  # regularization strength
        'svm__max_iter': (5000, 10000),  # max iterations
        'svm__tol': (1e-4, 1e-3, 1e-2)  # stopping tolerance
    }

    # training
    #text_clf_svm = Pipeline([('cv', cv), ('tfidf', tfidf), ('upsampling', upsampling), ('svm', svm)])
    text_clf_svm = Pipeline([('cv', cv), ('tfidf', tfidf), ('svm', svm)])
    gs_clf = GridSearchCV(text_clf_svm,
                          parameters,
                          n_jobs=-1,
                          cv=skf.split(X, y),
                          scoring='f1_micro')

    gs_clf = gs_clf.fit(X, y)

    print "Best Parameter : ", gs_clf.best_params_
    print "F1 Score : ", gs_clf.best_score_

    print "============================================"

    # Saving model
    if save_model:
        saving_path = Utilities.construct_filepath(out_dir, [category, label],
                                                   ".model")
        pickle.dump(gs_clf.best_estimator_, open(saving_path, 'wb'))
Example #6
def confusion_matrix(data, target, category, clf, class_names, title):
    """
    Plot and save the confusion matrix for the specified classifier.

    Args:
        data (numpy.ndarray): Data samples
        target (numpy.ndarray): Data labels (target variable values)
        category (str): Specification of the type of prediction being made.
        Valid values are 'book-relevance', 'type', 'category' and 'category-broad'.
        clf (object): Classifier for which to plot the confusion matrix.
        class_names (list): List of class names
        title (str): Plot title
    """

    # Wrap the given classifier in a pipeline with robust scaling.
    clf_eval = Pipeline([('scaling', RobustScaler()), ('clf', clf)])

    # Split data into training and test sets.
    data_train, data_test, target_train, target_test = train_test_split(
        data, target, shuffle=False, test_size=0.1)

    # Fit model.
    clf_eval.fit(data_train, target_train)
    np.set_printoptions(precision=2)

    # Plot confusion matrix and save plot.
    disp = metrics.plot_confusion_matrix(clf_eval,
                                         data_test,
                                         target_test,
                                         display_labels=class_names,
                                         cmap=plt.cm.Blues,
                                         normalize='true',
                                         xticks_rotation='vertical')

    # UNCOMMENT TO SET TITLE.
    # disp.ax_.set_title("Normalized Confusion Matrix - " + title)
    disp.figure_.set_size_inches(9.0, 9.0, forward=True)
    plt.tight_layout()
    plt.savefig('../results/plots/cfm_' + category + '_' +
                title.lower().replace(' ', '_') + '.eps')
    plt.clf()
    plt.close()
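Note that metrics.plot_confusion_matrix was removed in scikit-learn 1.2. A minimal sketch of the equivalent call with the current API, reusing the names from the function above (and matplotlib.pyplot imported as plt, as the snippet assumes):

from sklearn.metrics import ConfusionMatrixDisplay

disp = ConfusionMatrixDisplay.from_estimator(clf_eval,
                                             data_test,
                                             target_test,
                                             display_labels=class_names,
                                             cmap=plt.cm.Blues,
                                             normalize='true',
                                             xticks_rotation='vertical')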
Example #7
def createPipeline(model, oversampler_type, *args, **kwargs):

    if oversampler_type == "SMOTE":
        oversampler = SMOTE(sampling_strategy="minority", random_state=0)
    elif oversampler_type == "SVMSMOTE":
        oversampler = SVMSMOTE(sampling_strategy="minority", random_state=0)
    elif oversampler_type == "RandomOverSampler":
        oversampler = RandomOverSampler(sampling_strategy="minority", random_state=0)
    else:
        raise ValueError("RAPIDS pipeline only supports 'SMOTE', 'SVMSMOTE' and 'RandomOverSampler' oversampling methods.")

    if model == "LogReg":
        from sklearn.linear_model import LogisticRegression
        clf = ("clf", LogisticRegression(random_state=0))
    elif model == "kNN":
        from sklearn.neighbors import KNeighborsClassifier
        clf = ("clf", KNeighborsClassifier())
    elif model == "SVM":
        from sklearn.svm import SVC
        clf = ("clf", SVC(random_state=0, probability=True))
    elif model == "DT":
        from sklearn.tree import DecisionTreeClassifier
        clf = ("clf", DecisionTreeClassifier(random_state=0))
    elif model == "RF":
        from sklearn.ensemble import RandomForestClassifier
        clf = ("clf", RandomForestClassifier(random_state=0))
    elif model == "GB":
        from sklearn.ensemble import GradientBoostingClassifier
        clf = ("clf", GradientBoostingClassifier(random_state=0))
    elif model == "XGBoost":
        from xgboost import XGBClassifier
        clf = ("clf", XGBClassifier(random_state=0, n_jobs=6))
    elif model == "LightGBM":
        from lightgbm import LGBMClassifier
        clf = ("clf", LGBMClassifier(objective="binary", random_state=0, n_jobs=6))
    else:
        raise ValueError("RAPIDS pipeline only supports LogReg, kNN, SVM, DT, RF, GB, XGBoost, and LightGBM algorithms for classification problems.")
    
    steps = [("sampling", oversampler), ("fs", kwargs["feature_selector"])] if "feature_selector" in kwargs.keys() else [("sampling", oversampler)]
    steps.append(clf)
    pipeline = Pipeline(steps)
    return pipeline
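A hypothetical call of createPipeline with the optional feature selector; the SelectKBest setting is invented for illustration:

from sklearn.feature_selection import SelectKBest, f_classif

pipe = createPipeline("RF", "SMOTE", feature_selector=SelectKBest(f_classif, k=10))
print(pipe.named_steps)  # sampling -> fs -> clf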
Example #8
    def model_training(self):
        pre = PreProcessing()
        print('Reading data')
        df = self.data.read_data(train=True)
        print('Starting training')
        X_train, y_train = pre.preprocess(df, train=True)
        print('Training model')
        steps = [('over', SMOTE()), ('model', CatBoostClassifier())]
        pipeline = Pipeline(steps=steps)
        pipeline.fit(X_train, y_train)
        # retrieve the fitted CatBoost step by name
        modelo = pipeline['model']
        model = {
            'model': modelo,
            'preprocessing': pre,
            'columns': pre.feature_names
        }
        print(model)
        dump(model, '../output/modelo.pkl')
        return model
Example #9
    def _validate_estimator(self, default=DecisionTreeClassifier()):
        """Check the estimator and the n_estimators attribute, and set the
        `base_estimator_` attribute."""
        if not isinstance(self.n_estimators, (numbers.Integral, np.integer)):
            raise ValueError("n_estimators must be an integer, "
                             "got {0}.".format(type(self.n_estimators)))

        if self.n_estimators <= 0:
            raise ValueError("n_estimators must be greater than zero, "
                             "got {0}.".format(self.n_estimators))

        if self.base_estimator is not None:
            base_estimator = clone(self.base_estimator)
        else:
            base_estimator = clone(default)

        # `ratio` is the legacy alias of `sampling_strategy`; it was
        # deprecated in imbalanced-learn 0.4 and removed in later releases.
        self.base_estimator_ = Pipeline([('sampler', RandomUnderSampler(
            sampling_strategy=self.sampling_strategy,
            replacement=self.replacement,
            ratio=self.ratio)), ('classifier', base_estimator)])
Example #10
File: pipe_setup.py  Project: mborysiak/ff
    def ensemble_pipe(self, pipes):
        """Create a mean ensemble pipe where individual pipes feed into 
           a mean voting ensemble model.

        Args:
            pipes (list): List of pipes that will have their outputs averaged

        Returns:
            Pipeline: Pipeline whose single step is a voting ensemble fed by the given pipes
        """
        ests = []
        for i, p in enumerate(pipes):
            ests.append((f'p{i}', p))

        if self.model_obj == 'reg':
            ensemble = VotingRegressor(estimators=ests)
        elif self.model_obj == 'class':
            ensemble = VotingClassifier(estimators=ests)

        return Pipeline([('ensemble', ensemble)])
Example #11
def under_sample_with_SMOTE(X, y):
    '''
    Rebalance the data: SMOTE oversampling of the minority class followed by
    random undersampling of the majority class
    :param X: features
    :param y: labels
    :return: the resampled X and y
    '''
    counter = collections.Counter(y)
    print(counter)
    # define pipeline
    over = SMOTE(sampling_strategy=0.1)
    under = RandomUnderSampler(sampling_strategy=0.5)
    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)
    # transform the dataset
    X, y = pipeline.fit_resample(X, y)
    # summarize the new class distribution
    counter = collections.Counter(y)
    print(counter)
    return X, y
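A hypothetical run on synthetic data (collections, SMOTE, RandomUnderSampler and Pipeline are assumed imported as above); the exact counts vary slightly because make_classification flips a small fraction of labels:

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=10000, weights=[0.99], random_state=1)
X_res, y_res = under_sample_with_SMOTE(X, y)
# prints roughly Counter({0: 9900, 1: 100}) then Counter({0: 1980, 1: 990})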
Example #12
def test_pipeline_sample_transform():
    # Test whether the pipeline works with a sampler placed between
    # two transformers.
    X, y = make_classification(n_classes=2,
                               class_sep=2,
                               weights=[0.1, 0.9],
                               n_informative=3,
                               n_redundant=1,
                               flip_y=0,
                               n_features=20,
                               n_clusters_per_class=1,
                               n_samples=5000,
                               random_state=0)

    rus = RandomUnderSampler(random_state=0)
    pca = PCA()
    pca2 = PCA()
    pipeline = Pipeline([('pca', pca), ('rus', rus), ('pca2', pca2)])

    pipeline.fit(X, y).transform(X)
Example #13
def Create_XGBoost_Model():
    learning_rates = [0.1, 0.05, 0.01]
    num_estimators = [10, 20, 30] + list(range(45, 100, 5))
    max_depths = [2**x for x in range(1, 7)]
    grid = {
        'xgbclassifier__learning_rate': learning_rates,
        'xgbclassifier__n_estimators': num_estimators,
        'xgbclassifier__max_depth': max_depths
    }
    xgb_model = xgb.XGBClassifier(objective='binary:logistic',
                                  eval_metric='logloss')
    cv_kfold = KFold(n_splits=5, shuffle=True, random_state=4)
    pipeline = Pipeline([('sample', SMOTE()), ('xgbclassifier', xgb_model)])
    xgb_model_grid_search = GridSearchCV(estimator=pipeline,
                                         param_grid=grid,
                                         scoring='roc_auc',
                                         cv=cv_kfold,
                                         n_jobs=-1,
                                         verbose=4)
    return xgb_model_grid_search
Example #14
    def _build_pipeline(self):
        """
        Build the classifier pipeline.

        Returns:
            clf: (Pipeline) Pipeline.
        """
        # assign the appropriate classifier
        if self.classifier.lower() == 'dummyclassifier':
            clf = DummyClassifier(strategy='most_frequent')
        elif self.classifier.lower() == 'decisiontreeclassifier':
            clf = DecisionTreeClassifier(
                **(self.parameters if self.mode != 'grid' else {}))
        elif self.classifier.lower() == 'gaussiannb':
            clf = GaussianNB()
        elif self.classifier.lower() == 'multinomialnb':
            clf = MultinomialNB()
        elif self.classifier.lower() == 'svc':
            clf = SVC(probability=True, **self.parameters)
        elif self.classifier.lower() == 'adaboostclassifier':
            clf = AdaBoostClassifier(
                **(self.parameters if self.mode != 'grid' else {}))
        elif self.classifier.lower() == 'randomforestclassifier':
            clf = RandomForestClassifier(
                n_jobs=-1,
                **(self.parameters if self.mode != 'grid' else {}))
        elif self.classifier.lower() == 'mlpclassifier':
            clf = MLPClassifier(
                max_iter=3000,
                **(self.parameters if self.mode != 'grid' else {}))
        else:
            raise ValueError('Invalid classifier: {}'.format(self.classifier))

        log.info('Selected classifier: %s', self.classifier)
        log.debug('Classifier info: %s', clf)

        # SMOTE over-sample
        smote = SMOTE(sampling_strategy='minority')
        clf = Pipeline([('SMOTE', smote), (self.classifier, clf)])

        return clf
Example #15
def check_oversamplers_classifiers(oversamplers, classifiers, n_runs,
                                   random_state):
    """Extract estimators and parameters grids."""

    # Extract estimators
    estimators_products = product([smpl[0:2] for smpl in oversamplers],
                                  [clf[0:2] for clf in classifiers],
                                  range(n_runs))
    estimators = [('%s|%s_%s' % (smpl_name, clf_name, run_id),
                   Pipeline([(smpl_name, smpl), (clf_name, clf)]))
                  for (smpl_name, smpl), (clf_name,
                                          clf), run_id in estimators_products]

    # Extract parameters grids
    oversamplers_param_grids = [
        {('%s__%s' % (smpl[0], par)): val
         for par, val in smpl[2].items()} if len(smpl) > 2 else {}
        for smpl in oversamplers
    ]
    classifiers_param_grids = [
        {('%s__%s' % (clf[0], par)): val
         for par, val in clf[2].items()} if len(clf) > 2 else {}
        for clf in classifiers
    ]
    param_grids_products = product(oversamplers_param_grids,
                                   classifiers_param_grids, range(n_runs))
    random_states = check_random_states(random_state, len(estimators))
    param_grids = []
    est_names, _ = zip(*estimators)
    for (oversampler_param_grid, classifier_param_grid, run_id), random_state, est_name in \
            zip(param_grids_products, random_states, est_names):
        param_grid = {}
        param_grid.update(oversampler_param_grid)
        param_grid.update(classifier_param_grid)
        param_grid = {('%s__%s' % (est_name, par)): val
                      for par, val in param_grid.items()}
        param_grid.update({'est_name': [est_name]})
        param_grid.update({'random_state': [random_state]})
        param_grids.append(param_grid)

    return {'estimators': estimators, 'param_grids': param_grids}
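A hypothetical input for the function above, showing the expected (name, estimator[, param_grid]) tuple format; it assumes the module's check_random_states helper and the imports used by the snippet are available:

from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.tree import DecisionTreeClassifier

oversamplers = [('smote', SMOTE(), {'k_neighbors': [3, 5]}),
                ('ros', RandomOverSampler())]
classifiers = [('dt', DecisionTreeClassifier(), {'max_depth': [3, 6]})]
result = check_oversamplers_classifiers(oversamplers, classifiers,
                                        n_runs=2, random_state=0)
print([name for name, _ in result['estimators']])  # e.g. 'smote|dt_0', ...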
Example #16
def hyper_paramytize_optimization(f):
    print("model with no experience with Smote STSRCOM", file=f)
    print("--------------------------------------------------------------------", file=f)
    counter = Counter(y)
    # estimate the scale_pos_weight value
    estimate = counter[0] / counter[1]
    print('Estimate: %.3f' % estimate, file=f)
    print(counter[0], file=f)
    print(counter[1], file=f)
    model = XGBClassifier(objective='binary:logistic', eval_metric='logloss')
    random = RandomUnderSampler(sampling_strategy=0.33)
    # define grid
    # weights = [1, 3, 10, 25, 30, 50, 75, 99, 100]
    # param_grid = dict(scale_pos_weight=weights)
    # param_grid = {'xgbclassifier__scale_pos_weight': weights}
    learning_rates = [0.1, 0.05, 0.01]
    max_depths = [1, 2, 3, 5, 8, 10, 14,18]
    n_estimator = range(60, 220, 40)
    weights = [1, 10, 25, 50, 75, 99, 100, 1000]
    param_grid = {'xgbclassifier__max_depth': max_depths,
                  'xgbclassifier__learning_rate': learning_rates,
                  'xgbclassifier__n_estimators': n_estimator}

    print(param_grid, file=f)
    # define evaluation procedure
    cv = StratifiedKFold(n_splits=10)
    # define grid search
    # pipeline = Pipeline([('under', random), ('xgbclassifier', model)])
    pipeline = Pipeline([('sample', SMOTE()), ('xgbclassifier', model)])
    grid = GridSearchCV(estimator=pipeline, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')
    # execute the grid search
    grid_result = grid.fit(X, y)
    # report the best configuration
    print(grid_result, file=f)
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_), file=f)
    # report all configurations
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param), file = f)
Example #17
def SMOTE_Analysis(k, o, u):
    try:
        model = DecisionTreeClassifier()
        over = SMOTE(sampling_strategy=o, k_neighbors=k, random_state=2)
        under = RandomUnderSampler(sampling_strategy=u)
        steps = [('over', over), ('under', under)]
        pipeline = Pipeline(steps=steps)
        Xn, yn = pipeline.fit_resample(X, y.ravel())
        cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
        scores = cross_val_score(model,
                                 Xn,
                                 yn,
                                 scoring='roc_auc',
                                 cv=cv,
                                 n_jobs=-1)
        score = np.mean(scores)
        print("k={}, over={}, under={}, Mean ROC AUC: {:.3f}".format(
            k, o, u, score))
        return [k, o, u]
    except Exception as e:
        # log the failing combination instead of silently swallowing it
        print("k={}, over={}, under={} failed: {}".format(k, o, u, e))
        return ""
Example #18
    def tune_model_hyperparameters(self):
        # this can be used to tune classifier hyperparameters
        pipe = Pipeline([('resample', SMOTE()),
                         ('model', RandomForestClassifier())])

        kf = StratifiedKFold(n_splits=10, shuffle=True)

        p_grid = dict(model__n_estimators=[50, 100, 200])
        grid_search = GridSearchCV(estimator=pipe,
                                   param_grid=p_grid,
                                   cv=kf,
                                   refit=True)
        grid_search.fit(self._X_pca_train, self._y_train)

        # Uncomment to inspect the best fitted pipeline, e.g. to retrieve a
        # fitted scaler if one is used:
        # best = grid_search.best_estimator_
        # print(best)
        prediction = grid_search.predict(self._X_pca_test)
        cnf_matrix = confusion_matrix(self._y_test, prediction)

        return prediction, cnf_matrix
Example #19
    def resampling(self,
                   oversample_ratio=0.3,
                   minority_num=368,
                   majority_num=10000,
                   minority_label='1.0',
                   majority_label='0.0'):
        # define resampling
        under = RandomUnderSampler(sampling_strategy={
            majority_label: majority_num,
            minority_label: minority_num
        })
        over = SMOTE(sampling_strategy=oversample_ratio)

        # define pipeline
        pipeline = Pipeline(steps=[('u', under), ('o', over)])

        X_sm, y_sm = pipeline.fit_resample(self.X, self.y)

        print('Proportion in data after resample: ', Counter(y_sm))

        return X_sm, y_sm
Example #20
def syntetic_sampling(X, y, over_sampling, under_sampling):
  """
  Apply Synthetic Minority Oversampling Technique (SMOTE)
  to tn unbalanced class

  :type X: pandas DataFrame
  :param X: Training Features

  :type y: pandas Series
  :param y: Training Features

  :return: resampled data
  :rtype: tuple
  """

  over = SMOTE(sampling_strategy=over_sampling)
  under = RandomUnderSampler(sampling_strategy=under_sampling)
  steps = [('o', over), ('u', under)]
  pipeline = Pipeline(steps=steps)

  return pipeline.fit_resample(X, y)
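A hypothetical call on synthetic data (pandas, SMOTE, RandomUnderSampler and the imblearn Pipeline are assumed imported as above):

import pandas as pd
from collections import Counter
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=5000, weights=[0.9], random_state=0)
X_res, y_res = syntetic_sampling(pd.DataFrame(X), pd.Series(y),
                                 over_sampling=0.5, under_sampling=0.8)
print(Counter(y_res))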
Example #21
def test_fit_predict_on_pipeline():
    # test that the fit_predict method is implemented on a pipeline
    # test that the fit_predict on pipeline yields same results as applying
    # transform and clustering steps separately
    iris = load_iris()
    scaler = StandardScaler()
    km = KMeans(random_state=0)
    # As pipeline doesn't clone estimators on construction,
    # it must have its own estimators
    scaler_for_pipeline = StandardScaler()
    km_for_pipeline = KMeans(random_state=0)

    # first compute the transform and clustering step separately
    scaled = scaler.fit_transform(iris.data)
    separate_pred = km.fit_predict(scaled)

    # use a pipeline to do the transform and clustering in one step
    pipe = Pipeline([("scaler", scaler_for_pipeline), ("Kmeans", km_for_pipeline)])
    pipeline_pred = pipe.fit_predict(iris.data)

    assert_array_almost_equal(pipeline_pred, separate_pred)
Example #22
def training_imbalance(descr_series, classes_codes, TFIDF_, IMB_, FS_,
                       req_percentage, CLF_, model_path):
    """ Trains models using handled setting and saves them as .sav objects.

        Parameters:
            descr_series(Series): description series;
            classes_codes(Series): series with classes' codes;
            TFIDF_: vectorizer;
            IMB_: SMOTE method;
            FS_: term-ranking method;
            req_percentage(int): percentage to be taken from the ranked list;
            CLF_: classifier;
            model_path(str): the path to the model.

    """
    transformer = feature_selection.SelectPercentile(FS_)
    clf_model = Pipeline([('tfidf', TFIDF_), ('imba', IMB_),
                          ('fs', transformer), ('clf', CLF_)])
    clf_model.set_params(fs__percentile=req_percentage).fit(
        descr_series, classes_codes)
    # use a context manager so the file handle is closed after dumping
    with open(model_path + '.sav', 'wb') as f:
        dump(clf_model, f)
Example #23
def split_smote(drug_df, drug_name):
    X = drug_df.drop([drug_name], axis=1)
    y = drug_df[drug_name]
    counter = Counter(y)
    print('Originally, the distribution of classes is: {}'.format(counter))
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.30,
                                                        random_state=42,
                                                        stratify=y)
    over = SMOTE(sampling_strategy=0.1)
    under = RandomUnderSampler(sampling_strategy=0.5)
    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)
    Xsm_train, ysm_train = pipeline.fit_resample(X_train, y_train)
    counter_balance = Counter(ysm_train)
    print(
        'After SMOTE sampling, the distribution of classes in Training set is: {}'
        .format(counter_balance))
    XSM_train = pd.DataFrame(Xsm_train, columns=X_train.columns)
    return XSM_train, ysm_train, X_test, y_test
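A hypothetical call; the DataFrame and column name are invented, and pandas, Counter, train_test_split, SMOTE, RandomUnderSampler and Pipeline are assumed imported as above:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
demo_df = pd.DataFrame(rng.normal(size=(1000, 3)), columns=['f1', 'f2', 'f3'])
demo_df['drug_x'] = (rng.random(1000) < 0.05).astype(int)  # ~5% minority
X_tr, y_tr, X_te, y_te = split_smote(demo_df, 'drug_x')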
Example #24
def cvsmote():
    X = df_small.drop(['HospID', 'SiteID', 'surgid', 'Complics', 'Mortality'],
                      axis=1)
    y = df_small['Mortality']

    steps = [('over', SMOTE()),
             ('model',
              XGBClassifier(objective='binary:logistic',
                            eval_metric='logloss'))]
    pipeline = Pipeline(steps=steps)
    # evaluate pipeline
    for scoring in ["accuracy", "roc_auc"]:
        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)  # shuffle is required when random_state is set
        scores = cross_val_score(pipeline,
                                 X,
                                 y,
                                 scoring=scoring,
                                 cv=cv,
                                 n_jobs=-1)
        print("Model", scoring, " mean=", scores.mean(), "stddev=",
              scores.std())
Example #25
def model_select():
    for nome_balanceador, balanceador in balanceadores:
        if classificador_ja_executado(nome, nome_balanceador):
            continue
        else:
            print(balanceador)
            pipeline = Pipeline([('dimension', PCA(n_components=250)),
                                 ('balance', balanceador), ('clf', modelo)])
            print("# Rodando o algoritmo %s" % nome)
            print()

            np.set_printoptions(precision=4)
            pipeline.fit(dados_completo_x, dados_completo_y)

            print("Detailed classification report:")
            print()
            print("The model is trained on the full development set.")
            print("The scores are computed on the full evaluation set.")
            print()
            y_pred = pipeline.predict(test_x)
            matriz_confusao = confusion_matrix(test_y, y_pred)
            nome_arquivo = nome + '_' + nome_balanceador + '_best_mucilage'
            plot_confusion_matrix(matriz_confusao,
                                  nome_arquivo, [1, 2, 3, 4],
                                  False,
                                  title='Confusion matrix ' + nome +
                                  ' (best parameters)')
            plot_confusion_matrix(matriz_confusao,
                                  nome_arquivo, [1, 2, 3, 4],
                                  True,
                                  title='Confusion matrix ' + nome +
                                  ', normalized')
            print('Confusion Matrix')
            print(matriz_confusao)
            print(classification_report(y_true=test_y, y_pred=y_pred,
                                        digits=4))
            y_pred = pipeline.predict_proba(test_x)
            roc_auc_aux(test_y, y_pred, nome, nome_balanceador)
            print()
            sys.stdout.flush()
Example #26
def train_validate(model,
                   preprocess,
                   param_grid,
                   X_train,
                   y_train,
                   metric='roc_auc',
                   n_iter=20):

    final_model = Pipeline([('upsampling', preprocess['upsampling']),
                            ('transform', preprocess['transform']),
                            ('classifier', model)])

    model_search = BayesSearchCV(estimator=final_model,
                                 search_spaces=param_grid,
                                 scoring=metric,
                                 n_iter=n_iter,
                                 n_jobs=-1).fit(X_train, y_train)

    print("Parameters search completed!")

    best_model = model_search.best_estimator_

    best_model_scores = cross_validate(best_model,
                                       X_train,
                                       y_train,
                                       cv=RepeatedStratifiedKFold(n_repeats=5),
                                       scoring=metric,
                                       n_jobs=-1)

    print("Cross validation on best model completed!")

    best_model_scores = best_model_scores["test_score"]

    mean_valid_score = np.round(np.mean(best_model_scores), 4)

    print("Mean validation score: ", mean_valid_score)

    done()  # assumed project-specific notification helper defined elsewhere

    return best_model, best_model_scores, mean_valid_score
Example #27
def hyper_paramitize_scale_gridSearch():
    counter = Counter(y)
    # estimate scale_pos_weight value
    estimate = counter[0] / counter[1]
    print('Estimate: %.3f' % estimate)
    print(counter[0])
    print(counter[1])
    model = XGBClassifier(objective='binary:logistic', eval_metric='logloss')
    random = RandomUnderSampler(sampling_strategy=0.33)
    # define grid
    # weights = [1, 3, 10, 25, 30, 50, 75, 99, 100]
    # param_grid = dict(scale_pos_weight=weights)
    # param_grid = {'xgbclassifier__scale_pos_weight': weights}
    learning_rates = [0.1, 0.05, 0.01]
    max_depths = [1, 2, 3, 5, 8, 10]
    param_grid = {
        'xgbclassifier__max_depth': max_depths,
        'xgbclassifier__learning_rate': learning_rates
    }
    # define evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1)
    # define grid search
    pipeline = Pipeline([('under', random), ('xgbclassifier', model)])

    grid = GridSearchCV(estimator=pipeline,
                        param_grid=param_grid,
                        n_jobs=-1,
                        cv=cv,
                        scoring='roc_auc')
    # execute the grid search
    grid_result = grid.fit(X, y)
    # report the best configuration
    print("Best: %f using %s" %
          (grid_result.best_score_, grid_result.best_params_))
    # report all configurations
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
Example #28
def test_pipeline_methods_anova_rus():
    # Test the various methods of the pipeline (anova).
    X, y = make_classification(n_classes=2,
                               class_sep=2,
                               weights=[0.1, 0.9],
                               n_informative=3,
                               n_redundant=1,
                               flip_y=0,
                               n_features=20,
                               n_clusters_per_class=1,
                               n_samples=5000,
                               random_state=0)
    # Test with RandomUnderSampling + Anova + LogisticRegression
    clf = LogisticRegression()
    rus = RandomUnderSampler(random_state=0)
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('rus', rus), ('anova', filter1), ('logistic', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example #29
def under_over_sample(X,
                      y,
                      under_samp_rate=0.15,
                      over_samp_rate=0.75,
                      random_state=42):
    under = RandomUnderSampler(
        sampling_strategy=under_samp_rate,
        random_state=random_state,
    )
    over = RandomOverSampler(sampling_strategy=over_samp_rate,
                             random_state=random_state)
    steps = [('under', under), ('over', over)]
    pipeline = Pipeline(steps=steps)

    X_res, y_res = pipeline.fit_resample(np.array(X).reshape(-1, 1), y)

    combined = pd.DataFrame(data={
        "TEXT": X_res.squeeze(),
        "OUTPUT_LABEL": y_res
    })

    return combined.fillna("")
Example #30
    def test_evaluate_pipeline(self):
        runner = CliRunner()
        pattern = "/*.joblib"
        X, y = load_dataset()

        dummy_pipeline = Pipeline(
            [("dummy_classifier", DummyClassifier(strategy="constant", constant=0))]
        )

        with tempfile.TemporaryDirectory() as destination:
            threshold = destination + "/DUMMY_threshold.json"
            train_pipeline(
                X=X,
                y=y,
                model="DUMMY",
                pipeline=dummy_pipeline,
                destination=destination,
                ignore_prints=True,
                ignore_html=True,
            )
            pipeline_path = glob.glob(destination + pattern)
            runner.invoke(
                main,
                [
                    "evaluate",
                    "--pipeline",
                    pipeline_path[0],
                    "--threshold",
                    threshold,
                    "--prefix",
                    "DUMMY",
                    "--destination",
                    destination,
                ],
            )
            files = glob.glob(destination + "/*")
            self.assertTrue(any([".png" in file for file in files]))
            self.assertTrue(any([".json" in file for file in files]))
            self.assertTrue(any([".csv" in file for file in files]))