Example #1
def predefined_ops():
    '''return dict of user-defined non-default instances of operators
    '''
    clean = {
        'clean':
        Cleaner(dtype_filter='not_datetime',
                na1='null',
                na2='mean',
                drop_uid=True),
        'cleanNA':
        Cleaner(dtype_filter='not_datetime', na1=None, na2=None),
        'cleanMean':
        Cleaner(dtype_filter='not_datetime', na1='most_frequent', na2='mean'),
        'cleanMn':
        Cleaner(dtype_filter='not_datetime', na1='missing', na2='mean'),
    }
    #
    encode = {
        'woe8': WoeEncoder(max_leaf_nodes=8),
        'woe5': WoeEncoder(max_leaf_nodes=5),
        'woeq8': WoeEncoder(q=8),
        'woeq5': WoeEncoder(q=5),
        'woeb5': WoeEncoder(bins=5),
        'woem': WoeEncoder(mono=True),
        'oht': OhtEncoder(),
        'ordi': OrdiEncoder(),

        # 'bin10': BinEncoder(bins=10, int_bins=True),  # 10 bin edges encoder
        # 'bin5': BinEncoder(bins=5, int_bins=True),  # 5 bin edges encoder
        # 'binm10': BinEncoder(max_leaf_nodes=10,
        #                      int_bins=True),  # 10 bin tree cut edges encoder
        # 'binm5': BinEncoder(max_leaf_nodes=5,
        #                     int_bins=True),  # 5 bin tree cut edges encoder
    }

    resample = {
        # over_sampling
        # under sampling controlled methods
        'runder':
        RandomUnderSampler(),
        'nearmiss':
        NearMiss(version=3),
        'pcart':
        InstanceHardnessThreshold(),
        # clean outliers
        'inlierForest':
        FunctionSampler(_outlier_rejection,
                        kw_args={
                            'method': 'IsolationForest',
                            'contamination': 0.1
                        }),
        'inlierLocal':
        FunctionSampler(_outlier_rejection,
                        kw_args={
                            'method': 'LocalOutlierFactor',
                            'contamination': 0.1
                        }),
        'inlierEllip':
        FunctionSampler(_outlier_rejection,
                        kw_args={
                            'method': 'EllipticEnvelope',
                            'contamination': 0.1
                        }),
        'inlierOsvm':
        FunctionSampler(_outlier_rejection,
                        kw_args={
                            'method': 'OneClassSVM',
                            'contamination': 0.1
                        }),
    }

    scale = {
        'stdscale': StandardScaler(),
        'minmax': MinMaxScaler(),
        'absmax': MaxAbsScaler(),
        'rscale': RobustScaler(quantile_range=(10, 90)),
        'quantile': QuantileTransformer(),  # uniform distribution
        'power': PowerTransformer(),  # Gaussian distribution
        'norm': Normalizer(),  # default L2 norm

        # scale sparse data
        'maxabs': MaxAbsScaler(),
        'stdscalesp': StandardScaler(with_mean=False),
    }
    # feature construction
    feature_c = {
        'pca': PCA(whiten=True),
        'spca': SparsePCA(n_jobs=-1),
        'ipca': IncrementalPCA(whiten=True),
        'kpca': KernelPCA(kernel='rbf', n_jobs=-1),
        'poly': PolynomialFeatures(degree=2),
        # kernel approximation
        'Nys': Nystroem(random_state=0),
        'rbf': RBFSampler(random_state=0),
        'rfembedding': RandomTreesEmbedding(n_estimators=10),
        'LDA': LinearDiscriminantAnalysis(),
        'QDA': QuadraticDiscriminantAnalysis(),
    }
    # select from model
    feature_m = {
        'fwoe':
        SelectFromModel(WoeEncoder(max_leaf_nodes=5)),
        'flog':
        SelectFromModel(LogisticRegression(penalty='l1', solver='saga',
                                           C=1e-2)),
        'fsgd':
        SelectFromModel(SGDClassifier(penalty="l1")),
        'fxgb':
        SelectFromModel(
            XGBClassifier(n_jobs=-1,
                          booster='gbtree',
                          max_depth=2,
                          n_estimators=50), ),
        'frf':
        SelectFromModel(ExtraTreesClassifier(n_estimators=50, max_depth=2)),

        # fixed number of features
        'fxgb20':
        SelectFromModel(XGBClassifier(n_jobs=-1, booster='gbtree'),
                        max_features=20),
        'frf20':
        SelectFromModel(ExtraTreesClassifier(n_estimators=100, max_depth=5),
                        max_features=20),
        'frf10':
        SelectFromModel(ExtraTreesClassifier(n_estimators=100, max_depth=5),
                        max_features=10),
        'fRFElog':
        RFE(LogisticRegression(penalty='l1', solver='saga', C=1e-2), step=0.1),
        'fRFExgb':
        RFE(XGBClassifier(n_jobs=-1, booster='gbtree'), step=0.1),
    }
    # Univariate feature selection
    feature_u = {
        'fchi2':
        GenericUnivariateSelect(chi2, 'percentile', 25),
        'fMutualclf':
        GenericUnivariateSelect(mutual_info_classif, 'percentile', 25),
        'fFclf':
        GenericUnivariateSelect(f_classif, 'percentile', 25),
    }

    imp = {
        "impXGB":
        XGBClassifier(n_jobs=-1,
                      booster='gbtree',
                      max_depth=2,
                      n_estimators=50),
        "impRF":
        ExtraTreesClassifier(n_estimators=100, max_depth=2)
    }

    instances = {}
    instances.update(**clean, **encode, **scale, **feature_c, **feature_m,
                     **feature_u, **resample, **imp)
    return instances
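A minimal usage sketch (hypothetical; it assumes the custom transformers above are importable): look up preconfigured operators by key and chain them, since imblearn's Pipeline accepts samplers as intermediate steps.

from imblearn.pipeline import Pipeline

ops = predefined_ops()
pipe = Pipeline([
    ('scale', ops['stdscale']),    # StandardScaler()
    ('resample', ops['runder']),   # RandomUnderSampler()
    ('select', ops['frf20']),      # SelectFromModel(..., max_features=20)
    ('clf', ops['impXGB']),        # XGBClassifier(...)
])
# pipe.fit(X_train, y_train)      # X_train/y_train assumed to exist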
Example #2
def test_iht_fit_sample_wrong_class_obj():
    from sklearn.cluster import KMeans
    est = KMeans()
    iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED)
    assert_raises_regex(ValueError, "Invalid parameter `estimator`",
                        iht.fit_sample, X, Y)
Example #3
def test_iht_fit_resample():
    iht = InstanceHardnessThreshold(ESTIMATOR, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_resample(X, Y)
    assert X_resampled.shape == (12, 2)
    assert y_resampled.shape == (12, )
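For context, the module-level fixtures this test relies on look roughly like the following in imbalanced-learn's test suite; the array values below are illustrative assumptions, not the actual test data.

import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

RND_SEED = 0
ESTIMATOR = GradientBoostingClassifier(random_state=RND_SEED)
X = np.random.RandomState(RND_SEED).uniform(size=(15, 2))  # illustrative
Y = np.array([0] * 3 + [1] * 12)                           # illustrative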
Example #4
    ax[0].set_title('Decision function for {}'.format(
        sampler.__class__.__name__))
    plot_resampling(X, y, sampler, ax[1])
    ax[1].set_title('Resampling using {}'.format(sampler.__class__.__name__))
fig.tight_layout()

###############################################################################
# ``InstanceHardnessThreshold`` uses the predictions of a classifier to
# exclude samples: all samples classified with a low probability are removed.

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
X, y = create_dataset(n_samples=5000,
                      weights=(0.01, 0.05, 0.94),
                      class_sep=0.8)

clf = LinearSVC().fit(X, y)
plot_decision_function(X, y, clf, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
sampler = InstanceHardnessThreshold(random_state=0,
                                    estimator=LogisticRegression(
                                        solver='lbfgs', multi_class='auto'))
clf = make_pipeline(sampler, LinearSVC())
clf.fit(X, y)
plot_decision_function(X, y, clf, ax2)
ax2.set_title('Decision function for {}'.format(sampler.__class__.__name__))
plot_resampling(X, y, sampler, ax3)
ax3.set_title('Resampling using {}'.format(sampler.__class__.__name__))
fig.tight_layout()

plt.show()
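A hand-rolled sketch of the idea behind InstanceHardnessThreshold (not the library's exact implementation): score each sample by the cross-validated probability of its true class and keep only the easiest ones.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

def iht_sketch(X, y, keep_ratio=0.5):
    # hardness = 1 - P(true class); hard samples are likely misclassified
    # (assumes integer labels 0..k-1)
    proba = cross_val_predict(LogisticRegression(), X, y,
                              cv=5, method='predict_proba')
    hardness = 1.0 - proba[np.arange(len(y)), y]
    # keep the easiest share of samples (the real sampler works per class,
    # honoring the requested sampling_strategy)
    mask = hardness <= np.quantile(hardness, keep_ratio)
    return X[mask], y[mask]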
Example #5
def test_iht_fit_resample_wrong_class_obj():
    from sklearn.cluster import KMeans
    est = KMeans()
    iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED)
    with pytest.raises(ValueError, match="Invalid parameter `estimator`"):
        iht.fit_resample(X, Y)
Example #6
def resampling_assigner(imb_technique, AA_ova_X_train, AA_ova_y_train,
                        AI_ova_X_train, AI_ova_y_train, AW_ova_X_train,
                        AW_ova_y_train, CC_ova_X_train, CC_ova_y_train,
                        QA_ova_X_train, QA_ova_y_train):
    print(imb_technique)
    if imb_technique == "ADASYN":
        AA_ada, AI_ada, AW_ada, CC_ada, QA_ada = ADASYN(), ADASYN(), ADASYN(
        ), ADASYN(), ADASYN()
        AA_X_res, AA_y_res = AA_ada.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_ada.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_ada.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_ada.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_ada.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "ALLKNN":
        AA_allknn, AI_allknn, AW_allknn, CC_allknn, QA_allknn = AllKNN(
        ), AllKNN(), AllKNN(), AllKNN(), AllKNN()
        AA_X_res, AA_y_res = AA_allknn.fit_resample(AA_ova_X_train,
                                                    AA_ova_y_train)
        AI_X_res, AI_y_res = AI_allknn.fit_resample(AI_ova_X_train,
                                                    AI_ova_y_train)
        AW_X_res, AW_y_res = AW_allknn.fit_resample(AW_ova_X_train,
                                                    AW_ova_y_train)
        CC_X_res, CC_y_res = CC_allknn.fit_resample(CC_ova_X_train,
                                                    CC_ova_y_train)
        QA_X_res, QA_y_res = QA_allknn.fit_resample(QA_ova_X_train,
                                                    QA_ova_y_train)
    elif imb_technique == "CNN":
        AA_cnn, AI_cnn, AW_cnn, CC_cnn, QA_cnn = CondensedNearestNeighbour(
        ), CondensedNearestNeighbour(), CondensedNearestNeighbour(
        ), CondensedNearestNeighbour(), CondensedNearestNeighbour()
        AA_X_res, AA_y_res = AA_cnn.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_cnn.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_cnn.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_cnn.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_cnn.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "ENN":
        AA_enn, AI_enn, AW_enn, CC_enn, QA_enn = EditedNearestNeighbours(
        ), EditedNearestNeighbours(), EditedNearestNeighbours(
        ), EditedNearestNeighbours(), EditedNearestNeighbours()
        AA_X_res, AA_y_res = AA_enn.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_enn.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_enn.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_enn.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_enn.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "IHT":
        AA_iht, AI_iht, AW_iht, CC_iht, QA_iht = InstanceHardnessThreshold(
        ), InstanceHardnessThreshold(), InstanceHardnessThreshold(
        ), InstanceHardnessThreshold(), InstanceHardnessThreshold()
        AA_X_res, AA_y_res = AA_iht.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_iht.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_iht.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_iht.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_iht.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "NCR":
        AA_ncr, AI_ncr, AW_ncr, CC_ncr, QA_ncr = NeighbourhoodCleaningRule(
        ), NeighbourhoodCleaningRule(), NeighbourhoodCleaningRule(
        ), NeighbourhoodCleaningRule(), NeighbourhoodCleaningRule()
        AA_ova_y_train = [
            0 if i == "Accepted/Assigned" else 1 for i in AA_ova_y_train
        ]
        AA_X_res, AA_y_res = AA_ncr.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_ova_y_train = [
            0 if i == "Accepted/In Progress" else 1 for i in AI_ova_y_train
        ]
        AI_X_res, AI_y_res = AI_ncr.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_ova_y_train = [
            0 if i == "Accepted/Wait" else 1 for i in AW_ova_y_train
        ]
        AW_X_res, AW_y_res = AW_ncr.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_ova_y_train = [
            0 if i == "Completed/Closed" else 1 for i in CC_ova_y_train
        ]
        CC_X_res, CC_y_res = CC_ncr.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_ova_y_train = [
            0 if i == "Queued/Awaiting Assignment" else 1
            for i in QA_ova_y_train
        ]
        QA_X_res, QA_y_res = QA_ncr.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "NM":
        AA_nm, AI_nm, AW_nm, CC_nm, QA_nm = NearMiss(), NearMiss(), NearMiss(
        ), NearMiss(), NearMiss()
        AA_X_res, AA_y_res = AA_nm.fit_resample(AA_ova_X_train, AA_ova_y_train)
        AI_X_res, AI_y_res = AI_nm.fit_resample(AI_ova_X_train, AI_ova_y_train)
        AW_X_res, AW_y_res = AW_nm.fit_resample(AW_ova_X_train, AW_ova_y_train)
        CC_X_res, CC_y_res = CC_nm.fit_resample(CC_ova_X_train, CC_ova_y_train)
        QA_X_res, QA_y_res = QA_nm.fit_resample(QA_ova_X_train, QA_ova_y_train)
    elif imb_technique == "OSS":
        AA_oss, AI_oss, AW_oss, CC_oss, QA_oss = OneSidedSelection(
        ), OneSidedSelection(), OneSidedSelection(), OneSidedSelection(
        ), OneSidedSelection()
        AA_X_res, AA_y_res = AA_oss.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_oss.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_oss.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_oss.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_oss.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "RENN":
        AA_renn, AI_renn, AW_renn, CC_renn, QA_renn = RepeatedEditedNearestNeighbours(
        ), RepeatedEditedNearestNeighbours(), RepeatedEditedNearestNeighbours(
        ), RepeatedEditedNearestNeighbours(), RepeatedEditedNearestNeighbours(
        )
        AA_X_res, AA_y_res = AA_renn.fit_resample(AA_ova_X_train,
                                                  AA_ova_y_train)
        AI_X_res, AI_y_res = AI_renn.fit_resample(AI_ova_X_train,
                                                  AI_ova_y_train)
        AW_X_res, AW_y_res = AW_renn.fit_resample(AW_ova_X_train,
                                                  AW_ova_y_train)
        CC_X_res, CC_y_res = CC_renn.fit_resample(CC_ova_X_train,
                                                  CC_ova_y_train)
        QA_X_res, QA_y_res = QA_renn.fit_resample(QA_ova_X_train,
                                                  QA_ova_y_train)
    elif imb_technique == "SMOTE":
        AA_sm, AI_sm, AW_sm, CC_sm, QA_sm = SMOTE(), SMOTE(), SMOTE(), SMOTE(
        ), SMOTE()
        AA_X_res, AA_y_res = AA_sm.fit_resample(AA_ova_X_train, AA_ova_y_train)
        AI_X_res, AI_y_res = AI_sm.fit_resample(AI_ova_X_train, AI_ova_y_train)
        AW_X_res, AW_y_res = AW_sm.fit_resample(AW_ova_X_train, AW_ova_y_train)
        CC_X_res, CC_y_res = CC_sm.fit_resample(CC_ova_X_train, CC_ova_y_train)
        QA_X_res, QA_y_res = QA_sm.fit_resample(QA_ova_X_train, QA_ova_y_train)
    elif imb_technique == "BSMOTE":
        AA_bsm, AI_bsm, AW_bsm, CC_bsm, QA_bsm = BorderlineSMOTE(
        ), BorderlineSMOTE(), BorderlineSMOTE(), BorderlineSMOTE(
        ), BorderlineSMOTE()
        AA_X_res, AA_y_res = AA_bsm.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_bsm.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_bsm.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_bsm.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_bsm.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "SMOTEENN":
        AA_smenn, AI_smenn, AW_smenn, CC_smenn, QA_smenn = SMOTEENN(
        ), SMOTEENN(), SMOTEENN(), SMOTEENN(), SMOTEENN()
        AA_X_res, AA_y_res = AA_smenn.fit_resample(AA_ova_X_train,
                                                   AA_ova_y_train)
        AI_X_res, AI_y_res = AI_smenn.fit_resample(AI_ova_X_train,
                                                   AI_ova_y_train)
        AW_X_res, AW_y_res = AW_smenn.fit_resample(AW_ova_X_train,
                                                   AW_ova_y_train)
        CC_X_res, CC_y_res = CC_smenn.fit_resample(CC_ova_X_train,
                                                   CC_ova_y_train)
        QA_X_res, QA_y_res = QA_smenn.fit_resample(QA_ova_X_train,
                                                   QA_ova_y_train)
    elif imb_technique == "SMOTETOMEK":
        AA_smtm, AI_smtm, AW_smtm, CC_smtm, QA_smtm = SMOTETomek(), SMOTETomek(
        ), SMOTETomek(), SMOTETomek(), SMOTETomek()
        AA_X_res, AA_y_res = AA_smtm.fit_resample(AA_ova_X_train,
                                                  AA_ova_y_train)
        AI_X_res, AI_y_res = AI_smtm.fit_resample(AI_ova_X_train,
                                                  AI_ova_y_train)
        AW_X_res, AW_y_res = AW_smtm.fit_resample(AW_ova_X_train,
                                                  AW_ova_y_train)
        CC_X_res, CC_y_res = CC_smtm.fit_resample(CC_ova_X_train,
                                                  CC_ova_y_train)
        QA_X_res, QA_y_res = QA_smtm.fit_resample(QA_ova_X_train,
                                                  QA_ova_y_train)
    elif imb_technique == "TOMEK":
        AA_tm, AI_tm, AW_tm, CC_tm, QA_tm = TomekLinks(), TomekLinks(
        ), TomekLinks(), TomekLinks(), TomekLinks()
        AA_X_res, AA_y_res = AA_tm.fit_resample(AA_ova_X_train, AA_ova_y_train)
        AI_X_res, AI_y_res = AI_tm.fit_resample(AI_ova_X_train, AI_ova_y_train)
        AW_X_res, AW_y_res = AW_tm.fit_resample(AW_ova_X_train, AW_ova_y_train)
        CC_X_res, CC_y_res = CC_tm.fit_resample(CC_ova_X_train, CC_ova_y_train)
        QA_X_res, QA_y_res = QA_tm.fit_resample(QA_ova_X_train, QA_ova_y_train)
    elif imb_technique == "ROS":
        AA_ros, AI_ros, AW_ros, CC_ros, QA_ros = RandomOverSampler(
        ), RandomOverSampler(), RandomOverSampler(), RandomOverSampler(
        ), RandomOverSampler()
        AA_X_res, AA_y_res = AA_ros.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_ros.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_ros.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_ros.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_ros.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "RUS":
        AA_rus, AI_rus, AW_rus, CC_rus, QA_rus = RandomUnderSampler(
        ), RandomUnderSampler(), RandomUnderSampler(), RandomUnderSampler(
        ), RandomUnderSampler()
        AA_X_res, AA_y_res = AA_rus.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_rus.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_rus.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_rus.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_rus.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    return AA_X_res, AA_y_res, AI_X_res, AI_y_res, AW_X_res, AW_y_res, CC_X_res, CC_y_res, QA_X_res, QA_y_res
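The dispatch above is highly repetitive; an equivalent, shorter sketch maps technique names to sampler classes (the NCR branch's label binarization is deliberately omitted here).

SAMPLERS = {
    "ADASYN": ADASYN, "ALLKNN": AllKNN, "CNN": CondensedNearestNeighbour,
    "ENN": EditedNearestNeighbours, "IHT": InstanceHardnessThreshold,
    "NM": NearMiss, "OSS": OneSidedSelection,
    "RENN": RepeatedEditedNearestNeighbours, "SMOTE": SMOTE,
    "BSMOTE": BorderlineSMOTE, "SMOTEENN": SMOTEENN,
    "SMOTETOMEK": SMOTETomek, "TOMEK": TomekLinks,
    "ROS": RandomOverSampler, "RUS": RandomUnderSampler,
}

def resample_all(imb_technique, *train_pairs):
    # train_pairs: (X_train, y_train) tuples; a fresh sampler per pair
    sampler_cls = SAMPLERS[imb_technique]
    out = []
    for X_tr, y_tr in train_pairs:
        X_res, y_res = sampler_cls().fit_resample(X_tr, y_tr)
        out.extend((X_res, y_res))
    return out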
Example #7
def under_sample_InstanceHardnessThreshold(train_inputs, train_targets):
    sampler = InstanceHardnessThreshold(random_state=32)
    train_inputs, train_targets = _sampler_helper(sampler, train_inputs,
                                                  train_targets)
    return train_inputs, train_targets
Example #8
                           n_clusters_per_class=1,
                           random_state=0,
                           weights=[0.65, 0.3, 0.05],
                           n_repeated=0,
                           n_redundant=0)
print('Before sampling: {}'.format(Counter(y).items()))

# Undersampling (note: ratio=/random_state= on these samplers and fit_sample()
# below follow an older imbalanced-learn API)
sampler = TomekLinks(ratio='auto', random_state=0)
sampler1 = EditedNearestNeighbours(random_state=0)
sampler2 = RepeatedEditedNearestNeighbours(random_state=0, max_iter=500)
sampler3 = AllKNN(random_state=0)
sampler4 = CondensedNearestNeighbour(random_state=0)
sampler5 = OneSidedSelection(random_state=0, n_seeds_S=5)
sampler6 = NeighbourhoodCleaningRule(random_state=0)
sampler7 = InstanceHardnessThreshold(random_state=0, cv=10)

for x in [
        sampler, sampler1, sampler2, sampler3, sampler4, sampler5, sampler6,
        sampler7
]:
    X_new, y_new = x.fit_sample(X, y)
    print('After sampling: {}'.format(Counter(y_new).items()))
    # Fit
    y_pred = SVC().fit(X_new, y_new).predict(X)
    print(accuracy_score(y, y_pred))

# Performance without resampling
y_pred = SVC().fit(X, y).predict(X)
print('acc without resampling: {}'.format(accuracy_score(y, y_pred)))
Example #9
axs = [a for ax in axs for a in ax]
for ax, sampling_strategy in zip(axs, (0,
                                       {1: 25, 0: 10},
                                       {1: 14, 0: 10},
                                       {1: 10, 0: 10})):
    if sampling_strategy == 0:
        c0, c1 = plot_resampling(ax, X_vis, y, 'Original set')
    else:
        iht = InstanceHardnessThreshold(sampling_strategy=sampling_strategy,
                                        estimator=LogisticRegression(),
                                        return_indices=True)
        X_res, y_res, idx_res = iht.fit_resample(X, y)
        X_res_vis = pca.transform(X_res)
        plot_resampling(
            ax, X_res_vis, y_res,
            'Instance Hardness Threshold ({})'.format(sampling_strategy))
        # plot samples which have been removed
        idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_res)
        c3 = ax.scatter(X_vis[idx_samples_removed, 0],
                        X_vis[idx_samples_removed, 1],
                        alpha=.2,
                        label='Removed samples')

plt.figlegend((c0, c1, c3), ('Class #0', 'Class #1', 'Removed samples'),
              loc='lower center', ncol=3, labelspacing=0.)
Example #10
def instance_hardness_threshold_optimized():
    return InstanceHardnessThreshold(estimator=GradientBoostingClassifier(),
                                     sampling_strategy='auto',
                                     random_state=0,
                                     cv=6,
                                     n_jobs=-1)
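A minimal usage sketch for the factory above (X and y are assumed to exist):

sampler = instance_hardness_threshold_optimized()
X_res, y_res = sampler.fit_resample(X, y)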
Example #11
pca = PCA(n_components=2)
X_vis = pca.fit_transform(X)

# Two subplots, unpack the axes array immediately
f, axs = plt.subplots(2, 2)

axs = [a for ax in axs for a in ax]
for ax, ratio in zip(axs, (0,
                           {1: 25, 0: 10},
                           {1: 14, 0: 10},
                           {1: 10, 0: 10})):
    if ratio == 0:
        c0, c1 = plot_resampling(ax, X_vis, y, 'Original set')
    else:
        iht = InstanceHardnessThreshold(ratio=ratio,
                                        estimator=LogisticRegression(),
                                        return_indices=True)
        X_res, y_res, idx_res = iht.fit_sample(X, y)
        X_res_vis = pca.transform(X_res)
        plot_resampling(ax, X_res_vis, y_res,
                        'Instance Hardness Threshold ({})'.format(ratio))
        # plot samples which have been removed
        idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]),
                                           idx_res)
        c3 = ax.scatter(X_vis[idx_samples_removed, 0],
                        X_vis[idx_samples_removed, 1],
                        alpha=.2, label='Removed samples')

plt.figlegend((c0, c1, c3), ('Class #0', 'Class #1', 'Removed samples'),
              loc='lower center', ncol=3, labelspacing=0.)
plt.tight_layout(pad=3)
Example #12
def instance_hardness_threshold(X, y):
    iht = InstanceHardnessThreshold(random_state=42)
    X_res, y_res = iht.fit_resample(X, y)
    return X_res, y_res
Example #13
def resampling_assigner(imb_technique, AP_ova_X_train, AP_ova_y_train,
                        PM_ova_X_train, PM_ova_y_train, SC_ova_X_train,
                        SC_ova_y_train):
    print(imb_technique)
    if imb_technique == "ADASYN":
        AP_ada, PM_ada, SC_ada = ADASYN(), ADASYN(), ADASYN()
        AP_X_res, AP_y_res = AP_ada.fit_resample(AP_ova_X_train,
                                                 AP_ova_y_train)
        PM_X_res, PM_y_res = PM_ada.fit_resample(PM_ova_X_train,
                                                 PM_ova_y_train)
        SC_X_res, SC_y_res = SC_ada.fit_resample(SC_ova_X_train,
                                                 SC_ova_y_train)
    elif imb_technique == "ALLKNN":
        AP_allknn, PM_allknn, SC_allknn = AllKNN(), AllKNN(), AllKNN()
        AP_X_res, AP_y_res = AP_allknn.fit_resample(AP_ova_X_train,
                                                    AP_ova_y_train)
        PM_X_res, PM_y_res = PM_allknn.fit_resample(PM_ova_X_train,
                                                    PM_ova_y_train)
        SC_X_res, SC_y_res = SC_allknn.fit_resample(SC_ova_X_train,
                                                    SC_ova_y_train)
    elif imb_technique == "CNN":
        AP_cnn, PM_cnn, SC_cnn = CondensedNearestNeighbour(
        ), CondensedNearestNeighbour(), CondensedNearestNeighbour()
        AP_X_res, AP_y_res = AP_cnn.fit_resample(AP_ova_X_train,
                                                 AP_ova_y_train)
        PM_X_res, PM_y_res = PM_cnn.fit_resample(PM_ova_X_train,
                                                 PM_ova_y_train)
        SC_X_res, SC_y_res = SC_cnn.fit_resample(SC_ova_X_train,
                                                 SC_ova_y_train)
    elif imb_technique == "ENN":
        AP_enn, PM_enn, SC_enn = EditedNearestNeighbours(
        ), EditedNearestNeighbours(), EditedNearestNeighbours()
        AP_X_res, AP_y_res = AP_enn.fit_resample(AP_ova_X_train,
                                                 AP_ova_y_train)
        PM_X_res, PM_y_res = PM_enn.fit_resample(PM_ova_X_train,
                                                 PM_ova_y_train)
        SC_X_res, SC_y_res = SC_enn.fit_resample(SC_ova_X_train,
                                                 SC_ova_y_train)
    elif imb_technique == "IHT":
        AP_iht, PM_iht, SC_iht = InstanceHardnessThreshold(
        ), InstanceHardnessThreshold(), InstanceHardnessThreshold()
        AP_X_res, AP_y_res = AP_iht.fit_resample(AP_ova_X_train,
                                                 AP_ova_y_train)
        PM_X_res, PM_y_res = PM_iht.fit_resample(PM_ova_X_train,
                                                 PM_ova_y_train)
        SC_X_res, SC_y_res = SC_iht.fit_resample(SC_ova_X_train,
                                                 SC_ova_y_train)
    elif imb_technique == "NCR":
        AP_ncr, PM_ncr, SC_ncr = NeighbourhoodCleaningRule(
        ), NeighbourhoodCleaningRule(), NeighbourhoodCleaningRule()
        AP_ova_y_train = [
            0 if i == "Add penalty" else 1 for i in AP_ova_y_train
        ]
        AP_X_res, AP_y_res = AP_ncr.fit_resample(AP_ova_X_train,
                                                 AP_ova_y_train)
        PM_ova_y_train = [0 if i == "Payment" else 1 for i in PM_ova_y_train]
        PM_X_res, PM_y_res = PM_ncr.fit_resample(PM_ova_X_train,
                                                 PM_ova_y_train)
        SC_ova_y_train = [
            0 if i == "Send for Credit Collection" else 1
            for i in SC_ova_y_train
        ]
        SC_X_res, SC_y_res = SC_ncr.fit_resample(SC_ova_X_train,
                                                 SC_ova_y_train)
    elif imb_technique == "NM":
        AP_nm, PM_nm, SC_nm = NearMiss(), NearMiss(), NearMiss()
        AP_X_res, AP_y_res = AP_nm.fit_resample(AP_ova_X_train, AP_ova_y_train)
        PM_X_res, PM_y_res = PM_nm.fit_resample(PM_ova_X_train, PM_ova_y_train)
        SC_X_res, SC_y_res = SC_nm.fit_resample(SC_ova_X_train, SC_ova_y_train)
    elif imb_technique == "OSS":
        AP_oss, PM_oss, SC_oss = OneSidedSelection(), OneSidedSelection(
        ), OneSidedSelection()
        AP_X_res, AP_y_res = AP_oss.fit_resample(AP_ova_X_train,
                                                 AP_ova_y_train)
        PM_X_res, PM_y_res = PM_oss.fit_resample(PM_ova_X_train,
                                                 PM_ova_y_train)
        SC_X_res, SC_y_res = SC_oss.fit_resample(SC_ova_X_train,
                                                 SC_ova_y_train)
    elif imb_technique == "RENN":
        AP_renn, PM_renn, SC_renn = RepeatedEditedNearestNeighbours(
        ), RepeatedEditedNearestNeighbours(), RepeatedEditedNearestNeighbours(
        )
        AP_X_res, AP_y_res = AP_renn.fit_resample(AP_ova_X_train,
                                                  AP_ova_y_train)
        PM_X_res, PM_y_res = PM_renn.fit_resample(PM_ova_X_train,
                                                  PM_ova_y_train)
        SC_X_res, SC_y_res = SC_renn.fit_resample(SC_ova_X_train,
                                                  SC_ova_y_train)
    elif imb_technique == "SMOTE":
        AP_sm, PM_sm, SC_sm = SMOTE(), SMOTE(), SMOTE()
        AP_X_res, AP_y_res = AP_sm.fit_resample(AP_ova_X_train, AP_ova_y_train)
        PM_X_res, PM_y_res = PM_sm.fit_resample(PM_ova_X_train, PM_ova_y_train)
        SC_X_res, SC_y_res = SC_sm.fit_resample(SC_ova_X_train, SC_ova_y_train)
    elif imb_technique == "BSMOTE":
        AP_bsm, PM_bsm, SC_bsm = BorderlineSMOTE(), BorderlineSMOTE(
        ), BorderlineSMOTE()
        AP_X_res, AP_y_res = AP_bsm.fit_resample(AP_ova_X_train,
                                                 AP_ova_y_train)
        PM_X_res, PM_y_res = PM_bsm.fit_resample(PM_ova_X_train,
                                                 PM_ova_y_train)
        SC_X_res, SC_y_res = SC_bsm.fit_resample(SC_ova_X_train,
                                                 SC_ova_y_train)
    elif imb_technique == "SMOTEENN":
        AP_smenn, PM_smenn, SC_smenn = SMOTEENN(), SMOTEENN(), SMOTEENN()
        AP_X_res, AP_y_res = AP_smenn.fit_resample(AP_ova_X_train,
                                                   AP_ova_y_train)
        PM_X_res, PM_y_res = PM_smenn.fit_resample(PM_ova_X_train,
                                                   PM_ova_y_train)
        SC_X_res, SC_y_res = SC_smenn.fit_resample(SC_ova_X_train,
                                                   SC_ova_y_train)
    elif imb_technique == "SMOTETOMEK":
        AP_smtm, PM_smtm, SC_smtm = SMOTETomek(), SMOTETomek(), SMOTETomek()
        AP_X_res, AP_y_res = AP_smtm.fit_resample(AP_ova_X_train,
                                                  AP_ova_y_train)
        PM_X_res, PM_y_res = PM_smtm.fit_resample(PM_ova_X_train,
                                                  PM_ova_y_train)
        SC_X_res, SC_y_res = SC_smtm.fit_resample(SC_ova_X_train,
                                                  SC_ova_y_train)
    elif imb_technique == "TOMEK":
        AP_tm, PM_tm, SC_tm = TomekLinks(), TomekLinks(), TomekLinks()
        AP_X_res, AP_y_res = AP_tm.fit_resample(AP_ova_X_train, AP_ova_y_train)
        PM_X_res, PM_y_res = PM_tm.fit_resample(PM_ova_X_train, PM_ova_y_train)
        SC_X_res, SC_y_res = SC_tm.fit_resample(SC_ova_X_train, SC_ova_y_train)
    elif imb_technique == "ROS":
        AP_ros, PM_ros, SC_ros = RandomOverSampler(), RandomOverSampler(
        ), RandomOverSampler()
        AP_X_res, AP_y_res = AP_ros.fit_resample(AP_ova_X_train,
                                                 AP_ova_y_train)
        PM_X_res, PM_y_res = PM_ros.fit_resample(PM_ova_X_train,
                                                 PM_ova_y_train)
        SC_X_res, SC_y_res = SC_ros.fit_resample(SC_ova_X_train,
                                                 SC_ova_y_train)
    elif imb_technique == "RUS":
        AP_rus, PM_rus, SC_rus = RandomUnderSampler(), RandomUnderSampler(
        ), RandomUnderSampler()
        AP_X_res, AP_y_res = AP_rus.fit_resample(AP_ova_X_train,
                                                 AP_ova_y_train)
        PM_X_res, PM_y_res = PM_rus.fit_resample(PM_ova_X_train,
                                                 PM_ova_y_train)
        SC_X_res, SC_y_res = SC_rus.fit_resample(SC_ova_X_train,
                                                 SC_ova_y_train)
    return AP_X_res, AP_y_res, PM_X_res, PM_y_res, SC_X_res, SC_y_res
Example #14
def run_basic_svm(X_train,
                  y_train,
                  selected_features,
                  scorers,
                  refit_scorer_name,
                  subset_share=0.1,
                  n_splits=10,
                  parameters=None):
    '''Run an extensive grid search over all parameters to find the best parameters for an SVM classifier.
    The search is performed on a subset of the training data only (default share: subset_share=0.1).
    '''

    #Create a subset to train on
    print("[Step 1]: Create a data subset")
    subset_min = 300  # Minimal subset is 300 samples.

    if subset_share * X_train.shape[0] < subset_min:
        number_of_samples = subset_min
        print("minimal number of samples used: ", number_of_samples)
    else:
        number_of_samples = subset_share * X_train.shape[0]

    X_train_subset, y_train_subset = modelutil.extract_data_subset(
        X_train, y_train, number_of_samples)
    print("Got subset sizes X train: {} and y train: {}".format(
        X_train_subset.shape, y_train_subset.shape))

    print("[Step 2]: Define test parameters")
    if parameters is None:  #If no parameters have been defined, then do full definition
        # Guides used from
        # https://www.kaggle.com/evanmiller/pipelines-gridsearch-awesome-ml-pipelines
        # Main set of parameters for the grid search run 1: Select scaler, sampler and kernel for the problem
        test_scaler = [
            StandardScaler(),
            RobustScaler(),
            QuantileTransformer(),
            Normalizer()
        ]
        test_sampling = [
            modelutil.Nosampler(),
            ClusterCentroids(),
            RandomUnderSampler(),
            NearMiss(version=1),
            EditedNearestNeighbours(),
            AllKNN(),
            CondensedNearestNeighbour(random_state=0),
            InstanceHardnessThreshold(random_state=0,
                                      estimator=LogisticRegression(
                                          solver='lbfgs', multi_class='auto')),
            SMOTE(),
            SMOTEENN(),
            SMOTETomek(),
            ADASYN()
        ]
        test_C = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]

        # gamma default parameters
        param_scale = 1 / (X_train.shape[1] * np.mean(X_train.var()))

        parameters = [
            {
                'scaler': test_scaler,
                'sampling': test_sampling,
                'feat__cols': selected_features,
                'svm__C': test_C,  # default C=1
                'svm__kernel': ['linear', 'sigmoid']
            },
            {
                'scaler': test_scaler,
                'sampling': test_sampling,
                'feat__cols': selected_features,
                'svm__C': test_C,  # default C=1
                'svm__kernel': ['poly'],
                'svm__degree': [2, 3]  # Only relevant for poly
            },
            {
                'scaler': test_scaler,
                'sampling': test_sampling,
                'feat__cols': selected_features,
                'svm__C': test_C,  # default C=1
                'svm__kernel': ['rbf'],
                'svm__gamma':
                [param_scale, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2,
                 1e3]  # Only relevant in rbf, default='auto'=1/n_features
            }
        ]

        # If there are missing values, test several imputer strategies;
        # otherwise the default imputer is sufficient
        if X_train.isna().sum().sum() > 0:
            for param_set in parameters:
                param_set['imputer__strategy'] = [
                    'mean', 'median', 'most_frequent'
                ]
            print("Missing values used. Test different imputer strategies")
        else:
            print("No missing values. No imputer necessary")

        print("Selected Parameters: ", parameters)
    else:
        print("Parameters defined in the input: ", parameters)

    # Main pipeline for the grid search
    pipe_run1 = Pipeline([('imputer',
                           SimpleImputer(missing_values=np.nan,
                                         strategy='median')),
                          ('scaler', StandardScaler()),
                          ('sampling', modelutil.Nosampler()),
                          ('feat', modelutil.ColumnExtractor(cols=None)),
                          ('svm', SVC())])

    print("Pipeline: ", pipe_run1)

    print("Stratified KFold={} used.".format(n_splits))
    skf = StratifiedKFold(n_splits=n_splits)

    params_run1 = parameters
    grid_search_run1 = GridSearchCV(pipe_run1,
                                    params_run1,
                                    verbose=1,
                                    cv=skf,
                                    scoring=scorers,
                                    refit=refit_scorer_name,
                                    return_train_score=True,
                                    iid=True,  # removed in scikit-learn 0.24; drop on newer versions
                                    n_jobs=-1).fit(X_train_subset,
                                                   y_train_subset)

    results_run1 = modelutil.generate_result_table(grid_search_run1,
                                                   params_run1,
                                                   refit_scorer_name)
    print("Result size=", results_run1.shape)
    print("Number of NaN results: {}. Replace them with 0".format(
        np.sum(results_run1['mean_test_' + refit_scorer_name].isna())))

    return grid_search_run1, params_run1, pipe_run1, results_run1
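A hypothetical invocation sketch; the scorer dict and feature subset below are assumptions, not values from the original project.

# from sklearn.metrics import f1_score, make_scorer
# scorers = {'f1': make_scorer(f1_score, average='macro')}
# grid, params, pipe, results = run_basic_svm(
#     X_train, y_train,
#     selected_features=[list(range(X_train.shape[1]))],  # hypothetical subset
#     scorers=scorers,
#     refit_scorer_name='f1')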
Example #15
data_X = imp.fit_transform(data_X)
scaler = StandardScaler()
scaler.fit(data_X)
data_X = scaler.transform(data_X)

Xtrain, Xtest, Ytrain, Ytest = train_test_split(data_X, data_Y, test_size=0.33, random_state=42)
########################################################################
########################################################################
########################################################################
samplers = [
        NearMiss(version=2, random_state=42),
        CondensedNearestNeighbour(random_state=42),  
        EditedNearestNeighbours(random_state=42),
        RepeatedEditedNearestNeighbours(random_state=42), 
        AllKNN(random_state=42),     
        InstanceHardnessThreshold(random_state=42),
        NeighbourhoodCleaningRule(random_state=42),
        OneSidedSelection(random_state=42),
        RandomUnderSampler(random_state=42),
        TomekLinks(random_state=42)
       ]
samplers_name = ['Near Miss', 'Condensed Nearest Neighbour',
             'Edited Nearest Neighbours', 'Repeated Edited Nearest Neighbours',
             'All KNN', 'Instance Hardness Threshold',
             'Neighbourhood Cleaning Rule', 'One Sided Selection',
             'Random Under Sampler', 'Tomek Links'
            ]

params = {'n_estimators': 10, 'max_depth': 3, 'subsample': 0.5,
                  'learning_rate': 0.89, 'min_samples_leaf': 1, 'random_state': 5}
Example #16
def pipe_main(pipe=None):
    '''pipeline construction using sklearn estimators; the final step
    currently supports only classifiers

    .. note::
        data flows through a pipeline consisting of the steps below:
            raw data --> clean --> encoding --> scaling --> feature construction
            --> feature selection --> resampling --> final estimator
            see scikit-learn preprocess & estimators
    parameter
    ----
    pipe - str
        - in the format 'xx_xx', where each 'xx' names a step in the pipeline,
          default None
    return
    ----
        1) pipeline instance of the chosen steps
        2) if pipe is None, a dict indicating the possible choices of 'steps'
    '''
    clean = {
        'clean':
        Split_cls(dtype_filter='not_datetime', na1='null', na2=-999),
        'cleanNA':
        Split_cls(dtype_filter='not_datetime', na1=None, na2=None),
        'cleanMean':
        Split_cls(dtype_filter='not_datetime', na1='most_frequent',
                  na2='mean'),
    }
    #
    encode = {
        'woe': Woe_encoder(max_leaf_nodes=5),
        'oht': Oht_encoder(),
        'ordi': Ordi_encoder(),
    }

    resample = {

        # over_sampling
        'rover':
        RandomOverSampler(),
        'smote':
        SMOTE(),
        'bsmote':
        BorderlineSMOTE(),
        'adasyn':
        ADASYN(),

        # under sampling controlled methods
        'runder':
        RandomUnderSampler(),
        'nearmiss':
        NearMiss(version=3),
        'pcart':
        InstanceHardnessThreshold(),

        # under sampling cleaning methods
        'tlinks':
        TomekLinks(n_jobs=-1),
        'oside':
        OneSidedSelection(n_jobs=-1),
        'cleanNN':
        NeighbourhoodCleaningRule(n_jobs=-1),
        'enn':
        EditedNearestNeighbours(n_jobs=-1),
        'ann':
        AllKNN(n_jobs=-1),
        'cnn':
        CondensedNearestNeighbour(n_jobs=-1),

        # clean outliers
        'inlierForest':
        FunctionSampler(outlier_rejection,
                        kw_args={'method': 'IsolationForest'}),
        'inlierLocal':
        FunctionSampler(outlier_rejection,
                        kw_args={'method': 'LocalOutlierFactor'}),
        'inlierEllip':
        FunctionSampler(outlier_rejection,
                        kw_args={'method': 'EllipticEnvelope'}),
        'inlierOsvm':
        FunctionSampler(outlier_rejection, kw_args={'method': 'OneClassSVM'}),
        # combine
        'smoteenn':
        SMOTEENN(),
        'smotelink':
        SMOTETomek(),
    }

    scale = {
        'stdscale': StandardScaler(),
        'maxscale': MinMaxScaler(),
        'rscale': RobustScaler(quantile_range=(10, 90)),
        'quantile': QuantileTransformer(),  # uniform distribution
        'power': PowerTransformer(),  # Gaussian distribution
        'norm': Normalizer(),  # default L2 norm

        # scale sparse data
        'maxabs': MaxAbsScaler(),
        'stdscalesp': StandardScaler(with_mean=False),
    }
    # feature construction
    feature_c = {
        'pca': PCA(whiten=True),
        'spca': SparsePCA(normalize_components=True, n_jobs=-1),
        'ipca': IncrementalPCA(whiten=True),
        'kpca': KernelPCA(kernel='rbf', n_jobs=-1),
        'poly': PolynomialFeatures(degree=2),
        'rtembedding': RandomTreesEmbedding(n_estimators=10),
        'LDA': LinearDiscriminantAnalysis(),
        'QDA': QuadraticDiscriminantAnalysis(),
    }
    # select from model
    feature_m = {
        'fwoe':
        SelectFromModel(Woe_encoder(max_leaf_nodes=5)),
        'flog':
        SelectFromModel(
            LogisticRegressionCV(penalty='l1',
                                 solver='saga',
                                 scoring='roc_auc')),
        'fsgd':
        SelectFromModel(SGDClassifier(penalty="l1")),
        'fsvm':
        SelectFromModel(LinearSVC(penalty='l1', dual=False, C=1e-2)),
        'fxgb':
        SelectFromModel(XGBClassifier(n_jobs=-1)),
        'frf':
        SelectFromModel(ExtraTreesClassifier(n_estimators=100, max_depth=5)),
        'fRFExgb':
        RFE(XGBClassifier(n_jobs=-1), step=0.1, n_features_to_select=20),
        'fRFErf':
        RFE(ExtraTreesClassifier(n_estimators=100, max_depth=5),
            step=0.3,
            n_features_to_select=20),
        'fRFElog':
        RFE(LogisticRegressionCV(penalty='l1',
                                 solver='saga',
                                 scoring='roc_auc'),
            step=0.3,
            n_features_to_select=20)
    }
    # Univariate feature selection
    feature_u = {
        'fchi2':
        GenericUnivariateSelect(chi2, 'percentile', 25),
        'fMutualclf':
        GenericUnivariateSelect(mutual_info_classif, 'percentile', 25),
        'fFclf':
        GenericUnivariateSelect(f_classif, 'percentile', 25),
    }
    # sklearn estimator
    t = all_estimators(type_filter=['classifier'])
    estimator = {}
    for i in t:
        try:
            estimator.update({i[0]: i[1]()})
        except Exception:
            continue

    estimator.update(
        dummy=DummyClassifier(),
        XGBClassifier=XGBClassifier(n_jobs=-1),
        LogisticRegressionCV=LogisticRegressionCV(scoring='roc_auc'),
        EasyEnsembleClassifier=EasyEnsembleClassifier(),
        BalancedRandomForestClassifier=BalancedRandomForestClassifier(),
        RUSBoostClassifier=RUSBoostClassifier(),
        SVC=SVC(C=0.01, gamma='auto'))

    if pipe is None:
        feature_s = {}
        feature_s.update(**feature_m, **feature_u)
        return {
            'clean': clean.keys(),
            'encoding': encode.keys(),
            'resample': resample.keys(),
            'scale': scale.keys(),
            'feature_c': feature_c.keys(),
            'feature_s': feature_s.keys(),
            'classifier': estimator.keys()
        }
    elif isinstance(pipe, str):
        l = pipe.split('_')
        all_keys_dict = {}
        all_keys_dict.update(**clean, **encode, **scale, **feature_c,
                             **feature_m, **feature_u, **estimator, **resample)
        steps = []
        for i in l:
            if all_keys_dict.get(i) is not None:
                steps.append((i, all_keys_dict.get(i)))
            else:
                raise KeyError(
                    "'{}' invalid key for sklearn estimators".format(i))
        return Pipeline(steps)

    else:
        raise ValueError("input pipe must be a string in format 'xx[_xx]'")
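A usage sketch: step keys from the dicts above are joined with underscores, e.g. cleaning, WOE encoding, random undersampling, then XGBoost.

# pipe_main() without arguments returns the available keys for each step
# pipe = pipe_main('clean_woe_runder_XGBClassifier')
# pipe.fit(X_train, y_train)   # X_train/y_train assumed to exist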
Example #17
axs = [a for ax in axs for a in ax]
for ax, sampling_strategy in zip(axs, (0,
                                       {1: 25, 0: 10},
                                       {1: 14, 0: 10},
                                       {1: 10, 0: 10})):
    if sampling_strategy == 0:
        c0, c1 = plot_resampling(ax, X_vis, y, 'Original set')
    else:
        iht = InstanceHardnessThreshold(
            sampling_strategy=sampling_strategy,
            estimator=LogisticRegression(solver='lbfgs', multi_class='auto'),
            return_indices=True)
        X_res, y_res, idx_res = iht.fit_resample(X, y)
        X_res_vis = pca.transform(X_res)
        plot_resampling(
            ax, X_res_vis, y_res,
            'Instance Hardness Threshold ({})'.format(sampling_strategy))
        # plot samples which have been removed
        idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_res)
        c3 = ax.scatter(X_vis[idx_samples_removed, 0],
                        X_vis[idx_samples_removed, 1],
                        alpha=.2,
                        label='Removed samples')

plt.figlegend((c0, c1, c3), ('Class #0', 'Class #1', 'Removed samples'),
              loc='lower center', ncol=3, labelspacing=0.)
Example #18
def fscore(params_org):
    #print(params_org)
    parambk = copy.deepcopy(params_org)
    ifError = 0
    global best, HPOalg, params_best, errorcount
    params = params_org['classifier']
    classifier = params.pop('name')
    p_random_state = params.pop('random_state')
    
    if (classifier == 'SVM'):  
        param_value= params.pop('gamma_value')
        if(params['gamma'] == "value"):
            params['gamma'] = param_value
        else:
            pass   
        clf = SVC(max_iter = 10000, cache_size= 700, random_state = p_random_state,**params)
        #max_iter=10000 and cache_size= 700 https://github.com/EpistasisLab/pennai/issues/223
        #maxvalue https://github.com/hyperopt/hyperopt-sklearn/blob/fd718c44fc440bd6e2718ec1442b1af58cafcb18/hpsklearn/components.py#L262
    elif(classifier == 'RF'):        
        clf = RandomForestClassifier(random_state = p_random_state, **params)
    elif(classifier == 'KNN'):
        p_value = params.pop('p')
        if(p_value==0):
            params['metric'] = "chebyshev"
        elif(p_value==1):
            params['metric'] = "manhattan"
        elif(p_value==2):
            params['metric'] = "euclidean"
        else:
            params['metric'] = "minkowski"
            params['p'] = p_value
        #https://github.com/hyperopt/hyperopt-sklearn/blob/fd718c44fc440bd6e2718ec1442b1af58cafcb18/hpsklearn/components.py#L302
        clf = KNeighborsClassifier(**params)
    elif(classifier == 'DTC'):        
        clf = DecisionTreeClassifier(random_state = p_random_state, **params)
    elif(classifier == 'LR'):        
        penalty_solver = params.pop('penalty_solver')
        params['penalty'] = penalty_solver.split("+")[0]
        params['solver'] = penalty_solver.split("+")[1]
        clf = LogisticRegression(random_state = p_random_state, **params)
    #resampling parameter
    p_sub_params= params_org.pop('sub')
    p_sub_type = p_sub_params.pop('type')
    sampler = p_sub_params.pop('smo_grp')
    gmean = []
    if (p_sub_type == 'SMOTE'):
        smo = SMOTE(**p_sub_params)
    elif (p_sub_type == 'ADASYN'):
        smo = ADASYN(**p_sub_params)
    elif (p_sub_type == 'BorderlineSMOTE'):
        smo = BorderlineSMOTE(**p_sub_params)
    elif (p_sub_type == 'SVMSMOTE'):
        smo = SVMSMOTE(**p_sub_params)
    elif (p_sub_type == 'SMOTENC'):
        smo = SMOTENC(**p_sub_params)
    elif (p_sub_type == 'KMeansSMOTE'):
        smo = KMeansSMOTE(**p_sub_params)
    elif (p_sub_type == 'RandomOverSampler'):
        smo = RandomOverSampler(**p_sub_params)
#Undersampling
    elif (p_sub_type == 'TomekLinks'):
        smo = TomekLinks(**p_sub_params)
    elif (p_sub_type == 'ClusterCentroids'):
        if(p_sub_params['estimator']=='KMeans'):
            p_sub_params['estimator']= KMeans(random_state = p_random_state)
        elif(p_sub_params['estimator']=='MiniBatchKMeans'):
            p_sub_params['estimator']= MiniBatchKMeans(random_state = p_random_state)
        smo = ClusterCentroids(**p_sub_params) 
    elif (p_sub_type == 'RandomUnderSampler'):
        smo = RandomUnderSampler(**p_sub_params)
    elif (p_sub_type == 'NearMiss'):
        smo = NearMiss(**p_sub_params)
    elif (p_sub_type == 'InstanceHardnessThreshold'):
        if(p_sub_params['estimator']=='knn'):
            p_sub_params['estimator']= KNeighborsClassifier()
        elif(p_sub_params['estimator']=='decision-tree'):
            p_sub_params['estimator']=DecisionTreeClassifier()
        elif(p_sub_params['estimator']=='adaboost'):
            p_sub_params['estimator']=AdaBoostClassifier()
        elif(p_sub_params['estimator']=='gradient-boosting'):
            p_sub_params['estimator']=GradientBoostingClassifier()
        elif(p_sub_params['estimator']=='linear-svm'):
            p_sub_params['estimator']=CalibratedClassifierCV(LinearSVC())
        elif(p_sub_params['estimator']=='random-forest'):
            p_sub_params['estimator']=RandomForestClassifier(n_estimators=100)
        smo = InstanceHardnessThreshold(**p_sub_params) 
    elif (p_sub_type == 'CondensedNearestNeighbour'):
        smo = CondensedNearestNeighbour(**p_sub_params)
    elif (p_sub_type == 'EditedNearestNeighbours'):
        smo = EditedNearestNeighbours(**p_sub_params)
    elif (p_sub_type == 'RepeatedEditedNearestNeighbours'):
        smo = RepeatedEditedNearestNeighbours(**p_sub_params) 
    elif (p_sub_type == 'AllKNN'):
        smo = AllKNN(**p_sub_params)
    elif (p_sub_type == 'NeighbourhoodCleaningRule'):
        smo = NeighbourhoodCleaningRule(**p_sub_params) 
    elif (p_sub_type == 'OneSidedSelection'):
        smo = OneSidedSelection(**p_sub_params)
#Combine
    elif (p_sub_type == 'SMOTEENN'):
        smo = SMOTEENN(**p_sub_params)
    elif (p_sub_type == 'SMOTETomek'):
        smo = SMOTETomek(**p_sub_params)
    e=''
    try:        
        for train, test in cv.split(X, y):
            if(p_sub_type=='NO'):
                X_smo_train, y_smo_train = X[train], y[train]
            else:
                X_smo_train, y_smo_train = smo.fit_sample(X[train], y[train])
            y_test_pred = clf.fit(X_smo_train, y_smo_train).predict(X[test])
            gm = geometric_mean_score(y[test], y_test_pred, average='binary')
            gmean.append(gm)
        mean_g=np.mean(gmean)
    except Exception as eec:
        e=eec
        mean_g = 0
        ifError = 1
        errorcount = errorcount + 1
    gm_loss = 1 - mean_g
    abc = time.time() - starttime
    if mean_g > best:
        best = mean_g
        params_best = copy.deepcopy(parambk)
    return {'loss': gm_loss,
            'mean': mean_g,
            'status': STATUS_OK,         
            # -- store other results like this
            'run_time': abc,
            'iter': iid,
            'current_best': best,
            'eval_time': time.time(),            
            'SamplingGrp': sampler,
            'SamplingType': p_sub_type,
            'ifError': ifError,
            'Error': e,
            'params' : parambk,
            'attachments':
                {'time_module': pickle.dumps(time.time)}
           }   
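A hypothetical hyperopt driver for the objective above; the search space definition is an assumption and not shown in the original.

# from hyperopt import fmin, tpe, Trials
# trials = Trials()
# best_config = fmin(fn=fscore, space=search_space,  # search_space: hypothetical
#                    algo=tpe.suggest, max_evals=100, trials=trials)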
Example #19
def f2_measure(y_true, y_pred):
    return fbeta_score(y_true, y_pred, beta=2)


#evaluate a model
def evaluate_model(X, y, model):
    #define evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    #define the model evaluation metric
    metric = make_scorer(f2_measure)
    #evaluate model
    scores = cross_val_score(model, X, y, scoring=metric, cv=cv, n_jobs=-1)
    return scores


#define the location of the dataset
full_path = 'german.csv'
#load the dataset
X, y, cat_ix, num_ix = load_dataset(full_path)
#define model to evaluate
model = LogisticRegression(solver='liblinear', class_weight='balanced')
#define the data sampling
sampling = InstanceHardnessThreshold()
# one hot encode categorical, normalize numerical
ct = ColumnTransformer([('c', OneHotEncoder(), cat_ix),
                        ('n', MinMaxScaler(), num_ix)])
# scale, then sample, then fit model
pipeline = Pipeline(steps=[('t', ct), ('s', sampling), ('m', model)])
#evaluate the model and store results
scores = evaluate_model(X, y, pipeline)
print('%.3f (%.3f)' % (mean(scores), std(scores)))
Example #20
X4,y4=SVMSMOTE().fit_resample(X,y)
X5,y5=KMeansSMOTE().fit_resample(X,y)
X6,y6=SMOTEN().fit_resample(X,y)
#X7,y7=SMOTENC().fit_resample(X,y)
X8,y8=RandomOverSampler().fit_resample(X,y)

# Undersampling algorithms
X9,y9=RandomUnderSampler().fit_resample(X,y)
X10,y10=NearMiss().fit_resample(X,y)
X11,y11=EditedNearestNeighbours().fit_resample(X,y)
X12,y12=RepeatedEditedNearestNeighbours().fit_resample(X,y)
X13,y13=AllKNN().fit_resample(X,y)
#X14,y14=CondensedNearestNeighbour().fit_resample(X,y)
X15,y15=OneSidedSelection().fit_resample(X,y)
X16,y16=NeighbourhoodCleaningRule().fit_resample(X,y)
X17,y17=InstanceHardnessThreshold().fit_resample(X,y)

#Combined over- and under-sampling techniques
X18,y18=SMOTEENN().fit_resample(X,y)
X19,y19=SMOTETomek().fit_resample(X,y)
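
A quick textual check of the class balance complements the plots below; a small sketch using only arrays created above (assuming y is the one-dimensional label vector):

from collections import Counter

# class counts before resampling vs. two of the strategies above
print('original          :', Counter(y))
print('RandomUnderSampler:', Counter(y9))
print('SMOTEENN          :', Counter(y18))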

"""Exemplo de reamostragem dos dados."""

fig, ax = plt.subplots(1,2, figsize=(20,5))
sns.countplot(x='Churn', data=pd.DataFrame(y), ax=ax[0])
sns.countplot(x='Churn', data = pd.DataFrame(y9), ax=ax[1]);

"""Separando os dados de treino e de teste."""

X_treino, X_teste, y_treino, y_teste = train_test_split(X,y, random_state=42)
X_treino1, X_teste1, y_treino1, y_teste1 = train_test_split(X1,y1, random_state=42)
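
With imbalanced labels it is usually worth stratifying such splits so both sides keep the original class ratio; a sketch of the first call with stratification enabled:

# stratify=y preserves the class proportions in both splits
X_treino, X_teste, y_treino, y_teste = train_test_split(
    X, y, stratify=y, random_state=42)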
Example #21
0
                      header=0,
                      na_values='?')

dataset.drop(dataset.columns[[26, 27]], axis=1, inplace=True)

values = dataset.values
X = values[:, 0:33]
y = values[:, 33]

labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

from sklearn.impute import SimpleImputer  # sklearn's Imputer was removed; SimpleImputer replaces it

imputer = SimpleImputer(strategy='median')
X = imputer.fit_transform(X)

iht = InstanceHardnessThreshold(random_state=12)
X = X.astype(int)
y = y.astype(int)
X, y = iht.fit_resample(X, y)  # fit_sample was renamed fit_resample in imblearn 0.4
#print('Amount of each class after under-sampling: {0}'.format(Counter(y)))

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=12)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logreg.predict(X_test)
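
The predictions above are computed but never evaluated; a one-line check, assuming plain accuracy is an acceptable first metric for this split:

print(logreg.score(X_test, y_test))  # mean accuracy on the held-out 25%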

app = Flask(__name__)
Example #22
0
def main():
    print("Connecting to ML4H DB..")

    conn = pymysql.connect(host='nightmare.cs.uct.ac.za',
                           port=3306,
                           user='******',
                           passwd='oesaerex',
                           db='ochomo001')

    print("Connected")

    cur = conn.cursor()
    print("Executing SQL query..")
    print(
        "SQL script: select person_id,tot_consultations_attended,tot_consultations_missed, gender, age, city_village, profession, last_consultation_date, last_consultation_attendance,missed_last_appointment from ML4H_consultation_defaulter_sets"
    )
    print("Retrieving all consultation features...")
    #SQL Query to pull in all consultation features and results from the Dataset
    cur.execute(
        "select person_id,tot_consultations_attended,tot_consultations_missed, gender, age, city_village, profession, last_consultation_date, last_consultation_attendance,missed_last_appointment from ML4H_consultation_defaulter_sets"
    )
    print("Executed.")

    patient_ids = []
    patients = {}
    consultation_features = []
    consultation_results = []
    occupations = []
    locations = []
    #["Consultations_Attended", "Consultations_Missed", "Sex", "Age", "Occupation", "Location","Day_of_week","Last_appointment" ]

    #load in all the consultation features into each patient and the result class
    for row in cur:
        if row[0] not in patient_ids:
            patient_ids.append(row[0])
            patients[row[0]] = (Consultation_Patient(int(row[0])))

            patients[row[0]].features[0] = int(
                row[1])  #Consultations Attended feature
            patients[row[0]].features[1] = int(
                row[2])  #Consultations Missed features

            #Patient Sex feature
            if row[3] == "M":
                patients[row[0]].features[2] = 0
            elif row[3] == "F":
                patients[row[0]].features[2] = 1

            patients[row[0]].features[3] = int(row[4])  #Patient Age feature

            patients[row[0]].features[5] = get_feature_index(
                row[5], locations)  #Patient location feature

            patients[row[0]].features[4] = get_feature_index(
                row[6], occupations)  #Patient Occupation feature

            patients[row[0]].features[6] = row[7].weekday(
            )  #Next appointment day of the week feature

            patients[row[0]].features[7] = int(
                row[8]
            )  #Whether a patient attended their last appointment feature

            consultation_features.append(
                patients[row[0]].features
            )  #Join the above together and add to the feature list
            consultation_results.append(
                row[9]
            )  #Whether patient attended their last consultation RESULT SET

    cur.close()

    conn.close()

    print(len(consultation_results))
    print(len(consultation_features))

    #Split training set and hold out set
    X_train1, X_validation1, Y_train1, Y_validation1 = model_selection.train_test_split(
        consultation_features,
        consultation_results,
        test_size=0.3,
        random_state=7)
    #Find out balances of the training sets
    print("Y_train")
    check_result_distr(Y_train1)
    print("Y_val")
    check_result_distr(Y_validation1)
    # DATA IS IMBALANCED
    # Trying to balance data appropriately - Using multiple sampler tools to see which is best
    samplers = [['ALLKNN', AllKNN()], ['NearMiss', NearMiss()],
                ['CondensedNearestNeighbour',
                 CondensedNearestNeighbour()], ['TomekLinks',
                                                TomekLinks()],
                ['NeighbourhoodCleaningRule',
                 NeighbourhoodCleaningRule()],
                ['InstanceHardnessThreshold',
                 InstanceHardnessThreshold()],
                ['RandomUnderSampler',
                 RandomUnderSampler()]]

    #Write the results of AllKNN (the best sampler) to file
    f1 = open('consultation_technique_comparison.csv', 'w')

    X_resamp, Y_resamp = samplers[0][1].fit_resample(X_train1, Y_train1)
    X_resamp_orig, Y_resamp_orig = samplers[0][1].fit_resample(
        consultation_features, consultation_results)
    results_final = apply_machine_learning_techniques(X_resamp_orig,
                                                      Y_resamp_orig, X_resamp,
                                                      Y_resamp, X_validation1,
                                                      Y_validation1)
    f1.write(',Logistic Regression' + "," +
             'K Neighbours Classifier' + "," + 'Decision Tree Classifier' +
             "," + 'Gaussian NB' + "," + 'Random Forest' + "," +
             'MLPClassifier' + "," + 'AdaBoostClassifier' + "," +
             'Support Vector Machine')
    f1.write("\n")
    f1.write("Roc " + results_final[0] + "\n")
    f1.write("Sensitivity " + results_final[1] + "\n")
    f1.write("Specificity " + results_final[2] + "\n")
    f1.write("Unseen Roc " + results_final[3])

    f1.close()

    #Write results of the original balanced dataset
    f = open('consultation_balance_comparison.csv', 'w')
    f.write("Sampler,Attended ,Missed ," + 'Logistic Regression' + ", "
            'K Neighbours Classifier' + "," + 'Decision Tree Classifier' +
            "," + 'Gaussian NB' + "," + 'Random Forrest' + "," +
            'MLPClassifier' + "," + 'AdaBoostClassifier' + "," +
            'Support Vector Machine')
    f.write("\n")
    orig_distribution = check_result_distr(Y_train1)
    orig_results = apply_machine_learning_techniques(X_train1, Y_train1,
                                                     X_validation1,
                                                     Y_validation1,
                                                     X_resamp_orig,
                                                     Y_resamp_orig)[0]
    f.write("Orig" + "," + orig_distribution + orig_results + "\n")

    #Write results of all other sampler balancing technique results
    for sampler in samplers:
        print(sampler[0])
        X_resamp, Y_resamp = sampler[1].fit_resample(X_train1, Y_train1)
        distribution = check_result_distr(Y_resamp)
        results = apply_machine_learning_techniques(X_resamp, Y_resamp,
                                                    X_validation1,
                                                    Y_validation1,
                                                    X_resamp_orig,
                                                    Y_resamp_orig)[0]

        f.write(sampler[0] + "," + distribution + results + "\n")

    f.close()
Example #23
0
def Sampling(X, y, method):
    """
    function to sample imbalanced dataset:

    Arguments:
    X -- trainset features
    y -- trainset labels
    method -- sampling method

    Return:
    X_res -- sampled trainset features
    y_res -- sampled trainset labels
    """

    #Under-sampling:
    if method == 'RandomUnderSampler':
        from imblearn.under_sampling import RandomUnderSampler
        us = RandomUnderSampler()
        X_res, y_res = us.fit_resample(X, y)

    elif method == 'TomekLinks':
        from imblearn.under_sampling import TomekLinks
        us = TomekLinks()
        X_res, y_res = us.fit_resample(X, y)

    elif method == 'OneSidedSelection':
        from imblearn.under_sampling import OneSidedSelection
        us = OneSidedSelection()
        X_res, y_res = us.fit_resample(X, y)

    elif method == 'NeighbourhoodCleaningRule':
        from imblearn.under_sampling import NeighbourhoodCleaningRule
        us = NeighbourhoodCleaningRule()
        X_res, y_res = us.fit_resample(X, y)

    elif method == 'NearMiss':
        from imblearn.under_sampling import NearMiss
        us = NearMiss()
        X_res, y_res = us.fit_resample(X, y)

    elif method == 'InstanceHardnessThreshold':
        from imblearn.under_sampling import InstanceHardnessThreshold
        us = InstanceHardnessThreshold()
        X_res, y_res = us.fit_resample(X, y)

    elif method == 'AllKNN':
        from imblearn.under_sampling import AllKNN
        us = AllKNN()
        X_res, y_res = us.fit_resample(X, y)

    elif method == 'RepeatedEditedNearestNeighbours':
        from imblearn.under_sampling import RepeatedEditedNearestNeighbours
        us = RepeatedEditedNearestNeighbours()
        X_res, y_res = us.fit_resample(X, y)

    elif method == 'EditedNearestNeighbours':
        from imblearn.under_sampling import EditedNearestNeighbours
        us = EditedNearestNeighbours()
        X_res, y_res = us.fit_resample(X, y)

    elif method == 'CondensedNearestNeighbour':
        from imblearn.under_sampling import CondensedNearestNeighbour
        us = CondensedNearestNeighbour()
        X_res, y_res = us.fit_resample(X, y)

    # Combination of over- and under-sampling:
    elif method == 'SMOTEENN':
        from imblearn.combine import SMOTEENN
        us = SMOTEENN()
        X_res, y_res = us.fit_resample(X, y)

    elif method == 'SMOTETomek':
        from imblearn.combine import SMOTETomek
        us = SMOTETomek()
        X_res, y_res = us.fit_resample(X, y)

    else:
        raise ValueError(f'Unknown sampling method: {method}')

    return X_res, y_res
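
A minimal usage sketch (X_train and y_train are placeholders for whatever split the caller holds; only the training portion should ever be resampled):

from collections import Counter

# hypothetical call: rebalance the training split, never the test split
X_res, y_res = Sampling(X_train, y_train, method='SMOTEENN')
print('before:', Counter(y_train))
print('after :', Counter(y_res))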
Example #24
0
                   X_vis[y == 0, 1],
                   label="Class #0",
                   alpha=0.5,
                   edgecolor=almost_black,
                   facecolor=palette[0],
                   linewidth=0.15)
        ax.scatter(X_vis[y == 1, 0],
                   X_vis[y == 1, 1],
                   label="Class #1",
                   alpha=0.5,
                   edgecolor=almost_black,
                   facecolor=palette[2],
                   linewidth=0.15)
        ax.set_title('Original set')
    else:
        # 'ratio' was renamed sampling_strategy and fit_sample became fit_resample in imblearn 0.4
        iht = InstanceHardnessThreshold(sampling_strategy=ratio)
        X_res, y_res = iht.fit_resample(X, y)
        X_res_vis = pca.transform(X_res)

        ax.scatter(X_res_vis[y_res == 0, 0],
                   X_res_vis[y_res == 0, 1],
                   label="Class #0",
                   alpha=.5,
                   edgecolor=almost_black,
                   facecolor=palette[0],
                   linewidth=0.15)
        ax.scatter(X_res_vis[y_res == 1, 0],
                   X_res_vis[y_res == 1, 1],
                   label="Class #1",
                   alpha=.5,
                   edgecolor=almost_black,
Example #25
0
Pick one sample of the majority class (the class to be under-sampled) and add it to set C; put the remaining samples of that class into set S;
Train a 1-NN classifier on set C and use it to classify the samples in set S;
Move the misclassified samples from set S into set C;
Repeat until no more samples are added to set C.
'''
from imblearn.under_sampling import CondensedNearestNeighbour
cnn = CondensedNearestNeighbour(random_state=0)
X_resampled, y_resampled = cnn.fit_resample(X, y)  #fit_sample was renamed fit_resample in imblearn 0.4
print(sorted(Counter(y_resampled).items()))
#Clearly, CondensedNearestNeighbour is sensitive to noise and readily adds noisy samples into set C.
#For that reason, OneSidedSelection uses the TomekLinks method to remove the noisy (majority-class) samples instead.
from imblearn.under_sampling import OneSidedSelection
oss = OneSidedSelection(random_state=0)
X_resampled, y_resampled = oss.fit_resample(X, y)
print(sorted(Counter(y_resampled).items()))
'''
NeighbourhoodCleaningRule focuses on cleaning the data rather than condensing it. The algorithm
removes the union of the samples rejected by EditedNearestNeighbours and of those misclassified by a 3-NN classifier.
'''
from imblearn.under_sampling import NeighbourhoodCleaningRule
ncr = NeighbourhoodCleaningRule()  #NCR is deterministic; its random_state argument was removed
X_resampled, y_resampled = ncr.fit_resample(X, y)
print(sorted(Counter(y_resampled).items()))

#InstanceHardnessThreshold is a rather special method: it fits a classifier on the data and removes the samples whose predicted probability falls below a threshold.
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import InstanceHardnessThreshold
iht = InstanceHardnessThreshold(random_state=0, estimator=LogisticRegression())
X_resampled, y_resampled = iht.fit_resample(X, y)
print(sorted(Counter(y_resampled).items()))
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from imblearn.over_sampling import BorderlineSMOTE  # SMOTE's old 'kind' variants are separate classes now

imbalances = [
    RandomUnderSampler(),
    TomekLinks(),
    ClusterCentroids(),
    NearMiss(version=1),
    NearMiss(version=2),
    NearMiss(version=3),
    CondensedNearestNeighbour(n_neighbors=3, n_seeds_S=51),  # size_ngh was renamed n_neighbors
    OneSidedSelection(n_neighbors=5, n_seeds_S=51),
    InstanceHardnessThreshold(),
    RandomOverSampler(sampling_strategy='auto'),  # 'ratio' was renamed sampling_strategy
    SMOTE(sampling_strategy='auto'),
    BorderlineSMOTE(kind='borderline-1'),
    BorderlineSMOTE(kind='borderline-2'),
    SMOTETomek(sampling_strategy='auto'),
    SMOTEENN(sampling_strategy='auto')
]

classifiers = [
    LogisticRegression(),
    SVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    KNeighborsClassifier(n_neighbors=5)
]
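
The two lists above imply a sampler-by-classifier grid. A minimal evaluation sketch, assuming a feature matrix X and label vector y are in scope (both placeholders here); wrapping each pair in an imblearn pipeline keeps resampling inside the training folds:

from imblearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

for sampler in imbalances:
    for clf in classifiers:
        pipe = make_pipeline(sampler, clf)  # resamples at fit time only
        scores = cross_val_score(pipe, X, y, scoring='roc_auc', cv=5)
        print(type(sampler).__name__, type(clf).__name__,
              round(scores.mean(), 3))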
Example #27
0
    def __init__(self, lemmatization=False):
        BugModel.__init__(self, lemmatization)

        self.calculate_importance = False

        self.sampler = InstanceHardnessThreshold(random_state=0)

        feature_extractors = [
            bug_features.has_str(),
            bug_features.has_regression_range(),
            bug_features.severity(),
            bug_features.keywords(),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.patches(),
            bug_features.landings(),
            bug_features.title(),
            bug_features.product(),
            bug_features.component(),
            bug_features.is_mozillian(),
            bug_features.bug_reporter(),
            bug_features.blocked_bugs_number(),
            bug_features.priority(),
            bug_features.has_cve_in_alias(),
            bug_features.comment_count(),
            bug_features.comment_length(),
            bug_features.reporter_experience(),
            bug_features.number_of_bug_dependencies(),
        ]

        cleanup_functions = [
            feature_cleanup.url(),
            feature_cleanup.fileref(),
            feature_cleanup.hex(),
            feature_cleanup.dll(),
            feature_cleanup.synonyms(),
            feature_cleanup.crash(),
        ]

        self.extraction_pipeline = Pipeline([
            (
                "bug_extractor",
                bug_features.BugExtractor(
                    feature_extractors,
                    cleanup_functions,
                    rollback=True,
                    rollback_when=self.rollback,
                ),
            ),
            (
                "union",
                ColumnTransformer([
                    ("data", DictVectorizer(), "data"),
                    ("title", self.text_vectorizer(min_df=0.0001), "title"),
                    (
                        "comments",
                        self.text_vectorizer(min_df=0.0001),
                        "comments",
                    ),
                ]),
            ),
        ])

        self.clf = xgboost.XGBClassifier(n_jobs=16)
        self.clf.set_params(predictor="cpu_predictor")
Example #28
0
X_resampled = pd.DataFrame(X_resampled)
X_resampled.columns = [
    'is_static', 'is_enum', 'uses_variables', 'call_method', 'is_interface',
    'is_local_class', 'call_external_method'
]
y_resampled = pd.DataFrame(y_resampled)
y_resampled.columns = ['is_code_smell']
undersampled_data = pd.concat([X_resampled, y_resampled], axis=1)
print("TomekLinks")
print(undersampled_data.describe())
undersampled_data.to_csv('../../dataset/LIC/LIC_TomekLinks.csv', index=False)

#InstanceHardnessThreshold was not effective here: it kept the same instances
iht = InstanceHardnessThreshold()  # return_indices was removed; the kept indices are exposed as sample_indices_
X_resampled, y_resampled = iht.fit_resample(X, Y)
idx_resampled = iht.sample_indices_
X_resampled = pd.DataFrame(X_resampled)
X_resampled.columns = [
    'is_static', 'is_enum', 'uses_variables', 'call_method', 'is_interface',
    'is_local_class', 'call_external_method'
]
y_resampled = pd.DataFrame(y_resampled)
y_resampled.columns = ['is_code_smell']
undersampled_data = pd.concat([X_resampled, y_resampled], axis=1)
print("InstanceHardnessThreshold")
print(undersampled_data.describe())
undersampled_data.to_csv('../../dataset/LIC/LIC_InstanceHardnessThreshold.csv',
                         index=False)

#NearMiss
Example #29
0
def test_iht_fit_resample_class_obj():
    est = GradientBoostingClassifier(random_state=RND_SEED)
    iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_resample(X, Y)
    assert X_resampled.shape == (12, 2)
    assert y_resampled.shape == (12, )
    plot_resampling(X, y, sampler, ax[1])
    ax[1].set_title(f"Resampling using {sampler.__class__.__name__}")
fig.tight_layout()

###############################################################################
# ``InstanceHardnessThreshold`` uses the predictions of a classifier to
# exclude samples: anything classified with a low probability is removed.

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
X, y = create_dataset(n_samples=5000,
                      weights=(0.01, 0.05, 0.94),
                      class_sep=0.8)

clf = LinearSVC().fit(X, y)
plot_decision_function(X, y, clf, ax1)
ax1.set_title(f"Linear SVC with y={Counter(y)}")
sampler = InstanceHardnessThreshold(
    random_state=0,
    estimator=LogisticRegression(solver="lbfgs", multi_class="auto"),
)
clf = make_pipeline(sampler, LinearSVC())
clf.fit(X, y)
plot_decision_function(X, y, clf, ax2)
ax2.set_title(f"Decision function for {sampler.__class__.__name__}")
plot_resampling(X, y, sampler, ax3)
ax3.set_title(f"Resampling using {sampler.__class__.__name__}")
fig.tight_layout()

plt.show()