Example #1
    def create_model_from_training_data(self):
        training_comments = []
        training_ratings = []
        print("Training classifier model..")
        for sentidata in self.training_data:
            comments = preprocess_text(sentidata.text)
            training_comments.append(comments)
            training_ratings.append(sentidata.rating)

        # discard stopwords, apply stemming, and drop words in fewer than 3 or more than half of the comments
        self.vectorizer = TfidfVectorizer(tokenizer=tokenize_and_stem,
                                          sublinear_tf=True,
                                          max_df=0.5,
                                          stop_words=mystop_words,
                                          min_df=3)
        X_train = self.vectorizer.fit_transform(training_comments).toarray()
        Y_train = np.array(training_ratings)

        # Apply SVMSMOTE to improve the ratio of the minority class
        smote_model = SVMSMOTE(sampling_strategy=0.5,
                               random_state=None,
                               k_neighbors=15,
                               m_neighbors=15,
                               out_step=.0001,
                               svm_estimator=None,
                               n_jobs=1)

        X_resampled, Y_resampled = smote_model.fit_resample(X_train, Y_train)

        model = self.get_classifier()
        model.fit(X_resampled, Y_resampled)

        return model
Example #2
    def data_oversample(self):

        x_train, x_val, x_test, y_train, y_val, y_test = self.data_sample_split(
        )

        for i in [0, 1, 2, 3, 4]:

            if i not in y_train:

                print("lesion " + str(i) +
                      " not in y_train. Redoing sample split...")
                x_train, x_val, x_test, y_train, y_val, y_test = \
                    self.data_sample_split()

        print("Presampled train dataset: %s" % Counter(y_train))

        resample = SVMSMOTE(random_state=42)  # SVMSMOTE, SMOTENC

        x_train, y_train = resample.fit_resample(x_train, y_train)

        # note: the validation set is resampled too, which inflates validation scores
        x_val, y_val = resample.fit_resample(x_val, y_val)

        print("Resampled train dataset: %s" % Counter(y_train))

        ##        x_test, y_test   = resample.fit_resample(x_test, y_test)

        return x_train, x_val, x_test, y_train, y_val, y_test
Example #3
def getData(splitData=True, useImbalancer=False, useStratify=False):
    global standard_scaler
    data = pd.read_csv(filepath_or_buffer="DataSource/binary.csv")
    X = data.values[:, 1:-1]
    rank_dummy = pd.get_dummies(data['rank'], drop_first=True).to_numpy()
    X = np.concatenate((X, rank_dummy), axis=1)
    y = data.values[:, 0].reshape(-1, 1)
    if useStratify:
        stratify = y
    else:
        stratify = None
    if splitData:
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=101,
                                                            shuffle=True,
                                                            stratify=stratify)
    else:
        X_train = X
        y_train = y
    if useImbalancer and splitData:
        tl = TomekLinks(sampling_strategy='majority')
        X_train, y_train = tl.fit_resample(X=X_train, y=y_train.ravel())
        unique, counts = np.unique(y_train, return_counts=True)
        # print("y_train\n", np.asarray((unique, counts)).T)
    if splitData:
        unique, counts = np.unique(y_test, return_counts=True)
    # print("y_test\n", np.asarray((unique, counts)).T)
    if splitData:
        return X_train, X_test, y_train.ravel(), y_test.ravel()
    else:
        return X_train, y_train.ravel()
Example #4
def svm_smote(X,
              y,
              visualize=False,
              pca2d=True,
              pca3d=True,
              tsne=True,
              pie_evr=True):
    sm = SVMSMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X, y)
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
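A minimal usage sketch for the helper above, assuming an imbalanced toy dataset built with scikit-learn's make_classification (the 9:1 class weights are illustrative, not part of the original):

from collections import Counter

from sklearn.datasets import make_classification

# Two-class problem with a 9:1 imbalance.
X, y = make_classification(n_samples=1000, n_features=10,
                           weights=[0.9, 0.1], random_state=42)
print('before:', Counter(y))
X_res, y_res = svm_smote(X, y, visualize=False)
print('after: ', Counter(y_res))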
Example #5
def svm_smote(X, y):
    """Balancing data using SVMSMOTE

    Args:
        X: Training set without Class Target
        y:Training set Class Target

    Returns:
        balanced train_x, test_x
    """
    sample = SVMSMOTE(random_state=42)
    X, y = sample.fit_resample(X, y)
    print('after balancing:', X.shape)
    return X, y
Example #6
def Predict(data, mode):
    train, test = data
    idx = test.id.values.astype(int)
    y = train.median_relevance.values

    train_query = list(
        train.apply(lambda x: '%s' % x['query_preprocessed'], axis=1))
    train_title = list(
        train.apply(lambda x: '%s' % x['product_title_preprocessed'], axis=1))

    test_query = list(
        test.apply(lambda x: '%s' % x['query_preprocessed'], axis=1))
    test_title = list(
        test.apply(lambda x: '%s' % x['product_title_preprocessed'], axis=1))

    stop_words = text.ENGLISH_STOP_WORDS.union(['http','www','img','border','color','style','padding','table','font', \
                                                'thi','inch','ha','width','height','0','1','2','3','4','5','6','7','8','9'])
    stop_words = stop_words.union(set(stopwords.words('english')))

    tfv = text.TfidfVectorizer(min_df=7,  max_features=None, strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}', \
                               ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True, stop_words=stop_words)

    tfv.fit(train_query + train_title)
    X_train = hstack([tfv.transform(train_query), tfv.transform(train_title)])
    X_test = hstack([tfv.transform(test_query), tfv.transform(test_title)])

    sim = similarlity_stack()
    if mode == 'eda':
        svd = TruncatedSVD(n_components=200)
        scl = StandardScaler(with_mean=False)
        svm = SVC(C=10,
                  gamma="auto",
                  kernel="rbf",
                  class_weight=None,
                  probability=True)
        clf = Pipeline([('FeatureUnion', FeatureUnion( [('svd', svd), ('sim', sim)] )),\
                            ('scl', scl),\
                            ('svm', svm)])
    elif mode == 'sampling':
        svd = TruncatedSVD(n_components=200)
        scl = StandardScaler(with_mean=False)
        svm = SVC(C=10,
                  gamma="auto",
                  kernel="rbf",
                  class_weight=None,
                  probability=True)
        sampling = SVMSMOTE(svm_estimator=svm, k_neighbors=4)
        clf = Pipeline([('FeatureUnion', FeatureUnion( [('svd', svd), ('sim', sim)] )),\
                                  ('scl', scl),\
                                  ('sampling', sampling),\
                                  ('svm', svm)])
    else:
        raise ValueError("mode must be 'eda' or 'sampling'")

    clf.fit(X_train, y)
    preds = clf.predict(X_test)
    pred_probas = clf.predict_proba(X_test)

    submission = pd.DataFrame({"id": idx, "prediction": preds})
    submission_probas = pd.DataFrame(pred_probas, index=idx)

    return submission, submission_probas
Example #7
def borderline_smoth_func(train_x, train_y, target):
    try:
        logger.info(
            f"counter before borderline SMOTE is: {train_y[target].value_counts()}"
        )
        # transform the dataset
        #oversample = BorderlineSMOTE()
        oversample = SVMSMOTE()
        train_x, train_y = oversample.fit_resample(train_x, train_y)
        # summarize the new class distribution
        logger.info(
            f"counter after borderline SMOTE is: {train_y[target].value_counts()}"
        )
        return train_x, train_y
    except Exception as ex:
        logger.error(f"failed to run borderline_smoth_func due to: {ex}")
Example #8
def test_svm_smote_not_svm(data):
    """Check that we raise a proper error if passing an estimator that does not
    expose a `support_` fitted attribute."""

    err_msg = "`svm_estimator` is required to exposed a `support_` fitted attribute."
    with pytest.raises(RuntimeError, match=err_msg):
        SVMSMOTE(svm_estimator=LogisticRegression()).fit_resample(*data)
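The test above expects a pytest fixture named data yielding an imbalanced (X, y) pair; a minimal stand-in fixture might look like this (sizes and class weights are illustrative assumptions):

import pytest
from sklearn.datasets import make_classification

@pytest.fixture
def data():
    # Small imbalanced (X, y) pair for the resampling test.
    return make_classification(n_samples=200, n_features=5,
                               weights=[0.8, 0.2], random_state=0)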
Example #9
def logistic_regression(lr_params,
                        train_feat,
                        train_label,
                        model,
                        test_feat,
                        test_label,
                        vec_params=None,
                        random_state=42):
    '''
    A function to model data using logistic regression with under- or over-sampling.
    '''
    if model == 'svmsmote':
        pipe = make_pipeline(CountVectorizer(**vec_params),
                             SVMSMOTE(random_state=random_state),
                             LogisticRegression(**lr_params))
    elif model == 'rus':
        pipe = make_pipeline(CountVectorizer(**vec_params),
                             RandomUnderSampler(random_state=random_state),
                             LogisticRegression(**lr_params))
    else:
        raise ValueError("model must be 'svmsmote' or 'rus'")

    pipe_fit = pipe.fit(train_feat, train_label)
    y_pred = pipe_fit.predict(test_feat)

    cnf_matrix = confusion_matrix(test_label, y_pred)

    return pipe, pipe_fit, y_pred, cnf_matrix
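A hedged usage sketch for logistic_regression on toy text data; the documents, labels, and parameter dicts below are invented for illustration, and the 'rus' branch is chosen because this sample is far too small for SVMSMOTE's nearest-neighbour search:

docs = ["good product", "bad product", "great value", "terrible buy",
        "good deal", "awful quality", "great buy", "nice product"]
labels = [1, 0, 1, 0, 1, 0, 1, 1]

pipe, pipe_fit, y_pred, cnf = logistic_regression(
    lr_params={'max_iter': 1000},
    train_feat=docs[:6], train_label=labels[:6],
    model='rus',
    test_feat=docs[6:], test_label=labels[6:],
    vec_params={'ngram_range': (1, 1)})
print(cnf)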
Example #10
def oversample(x, y, method, sampling_strategy='auto'):
    randomstate = 42
    if method == 'No Sample':
        # no resampling
        return x, y
    elif method == 'random':
        # random oversampling
        ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=randomstate)
        X_resampled, y_resampled = ros.fit_resample(x, y)
    elif method == 'SMOTE':
        # SMOTE
        X_resampled, y_resampled = SMOTE(sampling_strategy=sampling_strategy, random_state=randomstate).fit_resample(x, y)
    elif method == 'Sparse SMOTE':
        # Sparse SMOTE
        X_resampled, y_resampled = SparseSMOTE(sampling_strategy=sampling_strategy, random_state=randomstate).fit_resample(x, y)
    elif method == 'SMOTEBorderline-1':
        # BorderlineSMOTE, borderline-1 variant
        X_resampled, y_resampled = BorderlineSMOTE(sampling_strategy=sampling_strategy, kind='borderline-1', random_state=randomstate).fit_resample(x, y)
    elif method == 'SMOTEBorderline-2':
        # BorderlineSMOTE, borderline-2 variant
        X_resampled, y_resampled = BorderlineSMOTE(sampling_strategy=sampling_strategy, kind='borderline-2', random_state=randomstate).fit_resample(x, y)
    elif method == 'SVMSMOTE':
        # SVMSMOTE
        X_resampled, y_resampled = SVMSMOTE(sampling_strategy=sampling_strategy, random_state=randomstate).fit_resample(x, y)
    elif method == 'ADASYN':
        # ADASYN
        X_resampled, y_resampled = ADASYN(sampling_strategy=sampling_strategy, random_state=randomstate).fit_resample(x, y)
    elif method == 'mwmote':
        # MWMOTE
        X_resampled, y_resampled = MWMOTE.MWMOTE(x, y, N=1000, return_mode='append')
    # count the class sizes after oversampling
    # from collections import Counter
    # print(sorted(Counter(y_resampled).items()))
    return X_resampled, y_resampled
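A short sketch driving the dispatcher above (it assumes the sampling_strategy parameter added to the signature; the Sparse SMOTE and mwmote branches need the non-standard SparseSMOTE/MWMOTE implementations, so the loop sticks to the imbalanced-learn samplers):

from collections import Counter

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, weights=[0.85, 0.15], random_state=0)
for m in ['No Sample', 'random', 'SMOTE', 'SVMSMOTE', 'ADASYN']:
    X_res, y_res = oversample(X, y, method=m)
    print(m, sorted(Counter(y_res).items()))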
Example #11
    def get_sampler(self):
        sampler = None
        if self.sampler == 'random-over-sampler':
            sampler = RandomOverSampler(random_state=self.random_seed)

        elif self.sampler == 'adasyn':
            sampler = ADASYN(random_state=self.random_seed, n_jobs=self.njobs)

        elif self.sampler == 'smote':
            sampler = SMOTE(random_state=self.random_seed, n_jobs=self.njobs)

        elif self.sampler == 'svm-smote':
            sampler = SVMSMOTE(random_state=self.random_seed,
                               n_jobs=self.njobs)

        elif self.sampler == 'random-under-sampler':
            sampler = RandomUnderSampler(random_state=self.random_seed)

        elif self.sampler == 'tomek-links':
            sampler = TomekLinks(n_jobs=self.njobs)

        elif self.sampler == 'near-miss':
            sampler = NearMiss(n_jobs=self.njobs)

        elif self.sampler == 'instance-hardness':
            sampler = InstanceHardnessThreshold(random_state=self.random_seed,
                                                n_jobs=self.njobs)

        return sampler
Example #12
def svmsampler(X, y, over_pct=0.1, under_pct=1):
    over = SVMSMOTE(random_state=42, sampling_strategy=over_pct)
    under = RandomUnderSampler(random_state=42, sampling_strategy=under_pct)
    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)
    X, y = pipeline.fit_resample(X, y)
    return X, y
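A usage sketch for svmsampler on a heavily skewed dataset: over_pct=0.1 first grows the minority class to 10% of the majority, then under_pct=1.0 cuts the majority down to match (the dataset itself is an illustrative assumption):

from collections import Counter

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=2000, weights=[0.98, 0.02], random_state=1)
print('before:', Counter(y))
X_bal, y_bal = svmsampler(X, y, over_pct=0.1, under_pct=1.0)
print('after: ', Counter(y_bal))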
Example #13
def run_upsample(json_file_path, fmt_file_path):
    json_manager = JsonManager(json_file_path)

    if json_manager.get_upsample_status():
        print(f"Upsampling started using {json_file_path} and {fmt_file_path}")
        upsampled_path = json_manager.get_upsampled_path()
        constants.remove_folder_if_exists(\
         constants.UPSAMPLED_CSV_FOLDER_NAME, upsampled_path)

        hot_encoded_folder = os.fsdecode(os.path.join(\
         json_manager.get_hot_encoded_path(), \
         constants.HOT_ENCODED_CSV_FOLDER_NAME))

        hot_encoded_file = os.fsdecode(os.path.join(\
         hot_encoded_folder, \
         constants.HOT_ENCODED_CSV_FILENAME))

        hotEncoded_data = pd.read_csv(hot_encoded_file)
        features_data = pd.read_csv(hot_encoded_file, \
        usecols = list(hotEncoded_data.columns)[:-1]) # everything except label
        labels_data = pd.read_csv(hot_encoded_file, \
        usecols = [list(hotEncoded_data.columns)[-1]]) # label

        sm = SVMSMOTE(random_state=json_manager.get_random_state())
        X_res, y_res = sm.fit_resample(features_data, labels_data)
        csv_ready = np.append(X_res, y_res, axis=constants.COLUMN_AXIS)

        upsampled_folder = constants.add_folder_to_directory(\
         constants.UPSAMPLED_CSV_FOLDER_NAME, upsampled_path)

        upsampled_file_path = os.fsdecode(os.path.join(\
         upsampled_folder, constants.UPSAMPLED_CSV_FILENAME))

        if os.path.exists(upsampled_file_path):
            os.remove(upsampled_file_path)

        f = open(fmt_file_path, "r")
        fmt = f.readline()
        f.close()

        header = ','.join(str(i) for i in hotEncoded_data.columns)
        np.savetxt(upsampled_file_path, csv_ready, \
         fmt = fmt, \
         delimiter = constants.CSV_DELIMITER, \
         header = header, \
         comments='')
        print(f"Upsampling finished, results in {upsampled_file_path}")
Example #14
    def fit(self, X, Y):
        #        print('Kernel:', kernel_dict)
        train_data = np.append(X, Y.reshape(len(Y), 1), axis=1)

        if self.databalance == 'LowSampling':
            data_maj = train_data[Y == 1]  # down-sample the majority class
            data_min = train_data[Y != 1]
            index = np.random.randint(len(data_maj), size=len(data_min))
            lower_data_maj = data_maj[list(index)]
            train_data = np.append(lower_data_maj, data_min, axis=0)
            X = train_data[:, :-1]
            Y = train_data[:, -1]
            self.Y = Y

        elif self.databalance == 'UpSampling':
            X, Y = SVMSMOTE(random_state=42).fit_resample(train_data[:, :-1],\
                                       np.asarray(train_data[:, -1]))
            self.Y = Y

        else:
            self.Y = Y

        m = Y.shape[0]

        # Kernel
        if self.kernel_dict['type'] == 'RBF':
            K = Kernel.RBF(m, self.kernel_dict['sigma'])
        elif self.kernel_dict['type'] == 'LINEAR':
            K = Kernel.LINEAR(m)
        elif self.kernel_dict['type'] == 'POLY':
            K = Kernel.POLY(m, self.kernel_dict['d'])

        K.calculate(X)

        tmp1 = np.hstack((np.ones((1, 2 * m)), [[0]]))
        M_BR = K.kernelMat + np.eye(m) / (self.C * self.m_value)
        tmp2 = np.hstack((M_BR, K.kernelMat, np.ones((m, 1))))
        M_BL = K.kernelMat + np.eye(m) / (self.C * (1 - self.m_value))
        tmp3 = np.hstack((K.kernelMat, M_BL, np.ones((m, 1))))

        L = np.vstack((tmp1, tmp2, tmp3))
        R = np.ones(2 * m + 1)
        R[0] = 0
        R[m + 1:] = -1
        # solve

        solution = LA.solve(L, R)
        b = solution[-1]
        alpha = solution[:m]
        beta = solution[m:2 * m]
        print('b', b)
        #        self.gamma = gamma
        self.beta = beta
        self.alpha = alpha
        self.b = b
        self.K = K
        self.kernelMat = K.kernelMat
Example #15
    def fit(self, X, Y):
        #        print('Kernel:', self.kernel_dict)
        train_data = np.append(X, Y.reshape(len(Y), 1), axis=1)

        if self.databalance == 'LowSampling':
            data_maj = train_data[Y == 1]  # down-sample the majority class
            data_min = train_data[Y != 1]
            index = np.random.randint(len(data_maj), size=len(data_min))
            lower_data_maj = data_maj[list(index)]
            train_data = np.append(lower_data_maj, data_min, axis=0)
            X = train_data[:, :-1]
            Y = train_data[:, -1]
            self.Y = Y

        elif self.databalance == 'UpSampling':
            X, Y = SVMSMOTE(random_state=42).fit_resample(train_data[:, :-1],\
                                       np.asarray(train_data[:, -1]))
            self.Y = Y

        else:
            self.Y = Y

        m = len(Y)

        # Kernel
        if self.kernel_dict['type'] == 'RBF':
            K = Kernel.RBF(m, self.kernel_dict['sigma'])
            K.calculate(X)
        elif self.kernel_dict['type'] == 'LINEAR':
            K = Kernel.LINEAR(m)
            K.calculate(X)
        elif self.kernel_dict['type'] == 'POLY':
            K = Kernel.POLY(m, self.kernel_dict['d'])
            K.calculate(X)

        H = np.multiply(np.dot(np.matrix(Y).T, np.matrix(Y)), K.kernelMat)
        M_BR = H + np.eye(m) / (self.C)
        # Concatenate
        L_L = np.concatenate((np.matrix(0), np.matrix(Y).T), axis=0)
        L_R = np.concatenate((np.matrix(Y), M_BR), axis=0)
        L = np.concatenate((L_L, L_R), axis=1)
        R = np.ones(m + 1)
        R[0] = 0
        # solve
        b_a = LA.solve(L, R)
        b = b_a[0]
        alpha = b_a[1:]

        e = alpha / self.C

        self.alpha = alpha
        self.b = b
        self.K = K
        self.kernelMat = K.kernelMat

        return self.alpha, self.b, e
Example #16
def roc_curves(df, number_of_matches):
    number_of_matches = int(number_of_matches)
    df_played_matches = df.iloc[0:number_of_matches-1]
    classifier = LogisticRegression(max_iter=300, multi_class = 'multinomial', solver = 'saga',penalty='elasticnet',l1_ratio = .95)
    classifier = OneVsRestClassifier(classifier)
    count = 0
    Data = df_played_matches[['home_pos', 'visitor_pos', 'spi1', 'spi2', 'draw%', 'home_form', 'visitor_form', 'importance1', 'importance2', 'xG1', 'xG2']]
    Target = df_played_matches['home_result']
    y = np.asarray(Target)
    enc = LabelEncoder()
    label_encoder = enc.fit(y)
    y = label_encoder.transform(y)
    X = np.asarray(Data)
    n_classes = 3
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2,random_state=5)
    from imblearn.over_sampling import SVMSMOTE
    svm_smote = SVMSMOTE()  # avoid shadowing the class name
    columns = Data.columns
    up_sampled_X, up_sampled_y = svm_smote.fit_resample(X_train, y_train)
    up_sampled_X = pd.DataFrame(data=up_sampled_X, columns=columns)
    up_sampled_y = pd.DataFrame(data=up_sampled_y, columns=['home_result'])

    scaler = RobustScaler()
    scaler.fit(up_sampled_X)
    X_train = scaler.transform(up_sampled_X)
    X_test = scaler.transform(X_test)

    y_train = label_binarize(np.asarray(up_sampled_y), classes=[0, 1, 2])
    y_test = label_binarize(np.asarray(y_test), classes=[0, 1, 2])
    y_score = classifier.fit(X_train, y_train).predict_proba(X_test)

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    data = [fpr[2], tpr[2]]
    dataset = pd.DataFrame({'FPR': data[0], 'TPR': data[1]})
    dataset.to_csv("reticulate1.csv")
Example #17
    def over_sample(self,
                    method="BorderLine",
                    sampling_strategy="minority",
                    random_state=42,
                    k_neighbors=5,
                    n_neighbors=10,
                    kind="borderline-1"):
        """
        过采样方法
        :param method: str, option: ADASYN, BorderLine,KMeans,Random,SVM
        :param sampling_strategy:str or dict, option: 'minority','not majority','all','auto', {1:n,0:m}
        :param random_state:int
        :param k_neighbors:int
        :param n_neighbors:int
        :param kind:str, borderline-1,borderline-2
        :return:df
        """
        feature_name = self._df.columns.difference(["id",
                                                    self._target]).tolist()
        X = self._df[feature_name].values
        y = self._df[self._target].values

        print("Original label shape {}".format(Counter(y)))

        if method == "ADASYN":
            overSm = ADASYN(sampling_strategy=sampling_strategy,
                            random_state=random_state,
                            n_neighbors=k_neighbors)
        elif method == "BorderLine":
            overSm = BorderlineSMOTE(sampling_strategy=sampling_strategy,
                                     random_state=random_state,
                                     k_neighbors=k_neighbors,
                                     m_neighbors=n_neighbors,
                                     kind=kind)
        elif method == "KMeans":
            overSm = KMeansSMOTE(sampling_strategy=sampling_strategy,
                                 random_state=random_state,
                                 k_neighbors=k_neighbors)
        elif method == "Random":
            overSm = RandomOverSampler(sampling_strategy=sampling_strategy,
                                       random_state=random_state)
        elif method == "SVM":
            overSm = SVMSMOTE(sampling_strategy=sampling_strategy,
                              random_state=random_state,
                              k_neighbors=k_neighbors,
                              m_neighbors=n_neighbors,
                              out_step=0.5)
        else:
            print("不支持{}该抽样方法".format(method))
            return self._df

        X_res, y_res = overSm.fit_resample(X, y)
        print("overSample label shape {}".format(Counter(y_res)))
        _data = np.concatenate([X_res, y_res.reshape(len(X_res), 1)], axis=1)
        df_new = pd.DataFrame(data=_data,
                              columns=feature_name + [self._target])
        return df_new
Example #18
    def random_forest_param_selection(x: DataFrame, y: DataFrame = None,
                                               cv=DEFAULT_CV, metric: str = DEFAULT_METRIC,
                                               jobs: int = DEFAULT_THREAD,
                                               random_state: int = DEFAULT_RANDOM_STATE,
                                               refit: bool = DEFAULT_REFIT):
        """

        :param x:
        :param y:
        :param cv:
        :param metric:
        :param jobs:
        :param random_state:
        :param refit:
        :return:
        """

        param_grid = {
            'criterion': ['entropy', 'gini'],
            'max_depth': [80, 90],
            'max_features': ['log2', 'sqrt'],
            'min_samples_leaf': [2, 5],
            'n_estimators': [10, 150, 300, 600]
        }

        new_params = {'rf__' + k: v for k, v in param_grid.items()}
        upsampling_model = Pipeline([
            ('svmsmote', SVMSMOTE(svm_estimator=SVC(), k_neighbors=5, m_neighbors=5, n_jobs=jobs, random_state=random_state)),
            ('rf', RandomForestClassifier(random_state=random_state, warm_start=True, n_jobs=jobs))
        ])

        grid_search = ms.GridSearchCV(
            # RandomForestClassifier(random_state=random_state, warm_start=True, n_jobs=jobs),
            # param_grid=param_grid,
            upsampling_model,
            param_grid=new_params,
            scoring=metric,
            cv=cv,
            refit=refit,
            n_jobs=jobs,
            verbose=Tuning.DEFAULT_VERBOSE
        )
        grid_search.fit(x, y)

        print("Best parameters:")
        print()
        print(grid_search.best_params_)
        print()
        print("Grid scores:")
        print()
        means = grid_search.cv_results_['mean_test_score']
        stds = grid_search.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
            print("%0.4f (+/-%0.03f) for %r" % (mean, std * 2, params))
        print()

        return grid_search.best_estimator_
Example #19
def GradientBoost():
    print('Gradient Boost')
    X, Y = SVMSMOTE(random_state=42).fit_resample(x_train, y_train)
    #    clf = GradientBoostingClassifier(learning_rate=0.005, n_estimators=400,max_depth=11,\
    #                                     min_samples_leaf =70, min_samples_split =1000, \
    #                                     max_features='sqrt', subsample=1, random_state=10)
    clf = GradientBoostingClassifier()
    clf.fit(X, Y)
    ypred = clf.predict(x_test)
    Precision.precision(ypred, y_test)
Example #20
    def __get_smote(self):
        if self.algorithm == 'Borderline':
            return BorderlineSMOTE(random_state=RANDOM_STATE)
        elif self.algorithm == 'KMeans':
            return KMeansSMOTE(random_state=RANDOM_STATE,
                               kmeans_estimator=KMeans(n_clusters=20))
        elif self.algorithm == 'SVM':
            return SVMSMOTE(random_state=RANDOM_STATE)
        elif self.algorithm == 'Tomek':
            return SMOTETomek(random_state=RANDOM_STATE)

        return SMOTE(random_state=RANDOM_STATE)
Example #21
    def fit(self, X, y):

        train_data = np.append(X, y.reshape(len(y), 1), axis=1)

        clf = [[]] * self.n_estimator

        if self.databalance == 'LowSampling':
            data_maj = train_data[y == 1]  # down-sample the majority class
            data_min = train_data[y != 1]
            index = np.random.randint(len(data_maj), size=len(data_min))
            lower_data_maj = data_maj[list(index)]
            train_data = np.append(lower_data_maj, data_min, axis=0)

        elif self.databalance == 'UpSampling':
            x_train, y_train = SVMSMOTE(random_state=42).fit_resample(train_data[:, :-1],\
                                       np.asarray(train_data[:, -1]))
            train_data = np.append(x_train,
                                   y_train.reshape(len(y_train), 1),
                                   axis=1)

        for i in range(self.n_estimator):
            #sample = np.array(subsample(dataset=data[:-test_length, :], ratio=0.7))
            sample = np.array(self.subsample(dataset=train_data, ratio=0.8))
            train_data = sample
            x_train = train_data[:, :-1]
            y_train = train_data[:, -1]

            if self.kernel_dict_type == 'LINEAR':
                C = GridSearch_parametre.LS_FSVM_best(x_train,y_train,self.kernel_dict_type,\
                                                      self.param_grid,self.judgment,self.fuzzyvalue, self.r_max, self.r_min)
                kernel_dict = {'type': 'LINEAR'}

            elif self.kernel_dict_type == 'RBF':
                C,sigma = GridSearch_parametre.LS_FSVM_best(x_train,y_train,self.kernel_dict_type,\
                                                      self.param_grid,self.judgment,self.fuzzyvalue, self.r_max, self.r_min)
                kernel_dict = {'type': 'RBF', 'sigma': sigma}

            elif self.kernel_dict_type == 'POLY':
                C,d = GridSearch_parametre.LS_FSVM_best(x_train,y_train,self.kernel_dict_type,\
                                                      self.param_grid,self.judgment,self.fuzzyvalue, self.r_max, self.r_min)
                kernel_dict = {'type': 'POLY', 'd': d}

            clf[i] = LS_FSVM.LSFSVM(C, kernel_dict, self.fuzzyvalue, 'origine',
                                    self.r_max, self.r_min)
            clf[i]._mvalue(x_train, y_train)
            clf[i].fit(x_train, y_train)

        with open('LSFsvm_bagging.pkl', 'wb') as f:
            for i in range(self.n_estimator):
                pickle.dump(clf[i], f, pickle.HIGHEST_PROTOCOL)
Example #22
def get_oversampler(sampler_name, **add_params):
    sampler_name = sampler_name.lower()

    if sampler_name == 'adasyn':
        return ADASYN(**add_params)
    elif sampler_name == 'smote':
        return SMOTE(**add_params)
    elif sampler_name == 'smotenc':
        return SMOTENC(**add_params)
    elif sampler_name == 'svmsmote':
        return SVMSMOTE(**add_params)
    else:
        raise ValueError('Choose one of the predefined over-samplers: '
                         'adasyn, smote, smotenc, svmsmote')
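A quick sketch using the factory above (the dataset and random_state are illustrative):

from collections import Counter

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=400, weights=[0.9, 0.1], random_state=7)
sampler = get_oversampler('svmsmote', random_state=7)
X_res, y_res = sampler.fit_resample(X, y)
print(Counter(y), '->', Counter(y_res))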
Example #23
def _SMOTE_SVM(self):
    # Oversampling - SMOTE - Synthetic Minority Over-sampling Technique
    # print('before SMOTE df', self.x_train)
    print("before SMOTE df", self.x_train.shape)
    smote = SVMSMOTE(
        k_neighbors=5, m_neighbors=5, random_state=self.seed
    )  # sampling_strategy=0.8
    self.X_train_smote, self.y_train_smote = smote.fit_resample(
        self.x_train, self.y_train
    )
    print("X_train_SMOTE:\n", self.X_train_smote[1])

    self.x_train = pd.DataFrame(self.X_train_smote, columns=self.x_train.columns)
    self.y_train = pd.DataFrame(
        self.y_train_smote, columns=["Local Relapse Y(1) /N(0)"]
    )

    # print('len smote: \n', len(self.X_train_smote))
    print("len new x_train after smote: \n", len(self.x_train))

    number_pos_x = self.y_train.loc[self.y_train["Local Relapse Y(1) /N(0)"] == 1]
    print("number positive responses y_train:\n", len(number_pos_x))
Example #24
    def fit(self, X, y):
        """Fitting."""
        X, y = check_X_y(X, y)
        self.classes_ = unique_labels(y)

        self.X_ = X
        self.y_ = y

        minority_X = self.X_[self.y_ == 1]
        minority_y = self.y_[self.y_ == 1]
        majority_X = self.X_[self.y_ == 0]
        majority_y = self.y_[self.y_ == 0]

        for i in range(self.ensemble_size):
            self.estimators_.append(base.clone(self.base_estimator))

        for n, estimator in enumerate(self.estimators_):
            np.random.seed(self.random_state + (n * 2))
            bagXminority = minority_X[np.random.choice(
                round(minority_X.shape[0] /
                      2), len(minority_y), replace=True), :]
            bagXmajority = majority_X[np.random.choice(
                round(majority_X.shape[0] /
                      2), len(majority_y), replace=True), :]

            bagyminority = np.ones(len(minority_y)).astype('int')
            bagymajority = np.zeros(len(majority_y)).astype('int')

            train_X = np.concatenate((bagXmajority, bagXminority))
            train_y = np.concatenate((bagymajority, bagyminority))

            # unique, counts = np.unique(train_y, return_counts=True)

            if self.oversampled == "ROS":
                ovs = RandomOverSampler(random_state=self.random_state)
                train_X, train_y = ovs.fit_resample(train_X, train_y)
            elif self.oversampled == "SMOTE":
                ovs = SMOTE(random_state=self.random_state)
                train_X, train_y = ovs.fit_resample(train_X, train_y)
            elif self.oversampled == "SVMSMOTE":
                ovs = SVMSMOTE(random_state=self.random_state)
                train_X, train_y = ovs.fit_resample(train_X, train_y)
            elif self.oversampled == "B2SMOTE":
                ovs = BorderlineSMOTE(random_state=self.random_state,
                                      kind="borderline-2")
                train_X, train_y = ovs.fit_resample(train_X, train_y)

            estimator.fit(train_X, train_y)

        # Return the classifier
        return self
Example #25
def hyper_paramytize_optimization():
    print("model with no experience with Smote STSRCOM", file=f)
    print(
        "--------------------------------------------------------------------",
        file=f)
    counter = Counter(y)
    # estimate scale_pos_weight value
    estimate = counter[0] / counter[1]
    print('Estimate: %.3f' % estimate, file=f)
    print(counter[0], file=f)
    print(counter[1], file=f)
    model = XGBClassifier(objective='binary:logistic', eval_metric='logloss')
    random = RandomUnderSampler(sampling_strategy=0.33)
    # define grid
    # weights = [1,3, 10, 25,30, 50, 75, 99, 100]
    # param_grid = dict(scale_pos_weight=weights)
    # param_grid= {'xgbclassifier__scale_pos_weight': weights}
    learning_rates = [0.1, 0.05, 0.01]
    max_depths = [1, 2, 3, 5, 8, 10, 14, 18]
    n_estimator = range(60, 220, 40)
    weights = [1, 10, 25, 50, 75, 99, 100, 1000]
    param_grid = {
        'xgbclassifier__max_depth': max_depths,
        'xgbclassifier__learning_rate': learning_rates,
        'xgbclassifier__n_estimators': n_estimator,
        'xgbclassifier__scale_pos_weight': weights
    }
    print(param_grid, file=f)
    # define evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1)
    # define grid search
    # pipeline = Pipeline([('under', random), ('xgbclassifier', model)])
    pipeline = Pipeline([('sample', SVMSMOTE()), ('xgbclassifier', model)])
    grid = GridSearchCV(estimator=pipeline,
                        param_grid=param_grid,
                        n_jobs=-1,
                        cv=cv,
                        scoring='roc_auc')
    # execute the grid search
    grid_result = grid.fit(X, y)
    # report the best configuration
    print(grid_result, file=f)
    print("Best: %f using %s" %
          (grid_result.best_score_, grid_result.best_params_),
          file=f)
    # report all configurations
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param), file=f)
Example #26
def equalize_training_dataset_with_SVMSMOTE(x_train, y_train):
    from imblearn.over_sampling import SVMSMOTE

    old_shape = list(x_train.shape)
    # reshape before using the over/undersampling method
    x_tmp = np.reshape(x_train, (x_train.shape[0], -1))
    x_resampled, y_resampled = SVMSMOTE(sampling_strategy='not majority',
                                        n_jobs=8).fit_resample(x_tmp, y_train)
    print(sorted(Counter(y_resampled).items()))
    # reshape after using over/undersampling method
    old_shape[0] = x_resampled.shape[0]
    x_resampled = np.reshape(x_resampled, tuple(old_shape))

    return x_resampled, y_resampled
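A sketch exercising the reshape round-trip above with image-shaped input; the synthetic 8x8 "images" are flattened make_classification features and purely illustrative:

from collections import Counter

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=120, n_features=64,
                           weights=[0.8, 0.2], random_state=0)
x_train = X.reshape(120, 8, 8)  # pretend these are 8x8 grayscale images
x_bal, y_bal = equalize_training_dataset_with_SVMSMOTE(x_train, y)
print(x_bal.shape, sorted(Counter(y_bal).items()))  # shape restored to (n, 8, 8)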
Example #27
def over_sample(X, y, sampler="SMOTE"):
    samplers = {
        "RandomOverSampler": RandomOverSampler(),
        "ADASYN": ADASYN(),
        "SMOTE": SMOTE(),
        "BorderlineSMOTE": BorderlineSMOTE(),
        "SVMSMOTE": SVMSMOTE(),
        "SMOTENC": SMOTENC(categorical_features=[]),
    }
    sampler = samplers[sampler]

    # to resample simply call fit_resample method of sampler
    X_resampled, y_resampled = sampler.fit_resample(X, y)

    return X_resampled, y_resampled
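A usage sketch for the lookup above; note the SMOTENC entry still needs real categorical column indices before it can fit, so the call below sticks to SVMSMOTE (the dataset is illustrative):

from collections import Counter

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=600, weights=[0.9, 0.1], random_state=3)
X_res, y_res = over_sample(X, y, sampler="SVMSMOTE")
print(Counter(y), '->', Counter(y_res))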
Example #28
def runSmote(X, y, algorithm='default', split_synthetic=False, verbose=True):
    if verbose:
        log.info("Data before oversampling")
        log.info("Dataset: {0}, {1}".format(X.shape, len(y)))

    n_casos = np.count_nonzero(y == 1)
    n_controles = np.count_nonzero(y == 0)

    N = abs(n_casos - n_controles)

    if algorithm == 'Borderline':
        if verbose:
            log.info("Running Borderline Smote")
        X_novo, y_novo = BorderlineSMOTE(
            random_state=random_state).fit_resample(X, y)
    elif algorithm == 'KMeans':
        if verbose:
            log.info("Running KMeans Smote")
        X_novo, y_novo = KMeansSMOTE(
            random_state=random_state,
            kmeans_estimator=KMeans(n_clusters=20)).fit_resample(X, y)
    elif algorithm == 'SVM':
        if verbose:
            log.info("Running SVM Smote")
        X_novo, y_novo = SVMSMOTE(random_state=random_state).fit_resample(X, y)
    elif algorithm == 'Tomek':
        if verbose:
            log.info("Running Smote Tomek")
        X_novo, y_novo = SMOTETomek(random_state=random_state).fit_resample(
            X, y)
    else:
        if verbose:
            log.info("Running default Smote")
        X_novo, y_novo = SMOTE(random_state=random_state).fit_resample(X, y)

    if verbose:
        log.info("Data after oversampling")
        log.info("Dataset: {0}, {1}".format(X_novo.shape, len(y_novo)))

    if split_synthetic:
        synthetic_X = X_novo[-N:]
        synthetic_y = y_novo[-N:]

        return X, y, synthetic_X, synthetic_y
    else:
        return X_novo, y_novo, None, None
Example #29
def test_svm_smote(data):
    svm_smote = SVMSMOTE(random_state=42)
    svm_smote_nn = SVMSMOTE(random_state=42,
                            k_neighbors=NearestNeighbors(n_neighbors=6),
                            m_neighbors=NearestNeighbors(n_neighbors=11),
                            svm_estimator=SVC(gamma='scale', random_state=42))

    X_res_1, y_res_1 = svm_smote.fit_resample(*data)
    X_res_2, y_res_2 = svm_smote_nn.fit_resample(*data)

    assert_allclose(X_res_1, X_res_2)
    assert_array_equal(y_res_1, y_res_2)
Example #30
def get_oversampling_models():
    models, names = list(), list()
    # RandomOverSampler
    models.append(RandomOverSampler())
    names.append('ROS')
    # SMOTE
    models.append(SMOTE())
    names.append('SMOTE')
    # BorderlineSMOTE
    models.append(BorderlineSMOTE())
    names.append('BLSMOTE')
    # SVMSMOTE
    models.append(SVMSMOTE())
    names.append('SVMSMOTE')
    # ADASYN
    models.append(ADASYN())
    names.append('ADASYN')
    return models, names
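To compare the oversamplers returned above, a hedged evaluation sketch: each sampler is paired with a classifier inside an imbalanced-learn Pipeline, so resampling happens only on the training folds of each cross-validation split (the DecisionTreeClassifier, dataset, and ROC-AUC scoring are illustrative choices):

from imblearn.pipeline import Pipeline
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=1000, weights=[0.95, 0.05], random_state=4)
models, names = get_oversampling_models()
for model, name in zip(models, names):
    pipe = Pipeline([('sampler', model), ('clf', DecisionTreeClassifier())])
    scores = cross_val_score(pipe, X, y, scoring='roc_auc', cv=3)
    print('%s: %.3f' % (name, scores.mean()))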