Example #1
def test_fetch_error():
    with raises(ValueError, match='is not a dataset available.'):
        fetch_datasets(filter_data=tuple(['rnd']))
    with raises(ValueError, match='dataset with the ID='):
        fetch_datasets(filter_data=tuple([-1]))
    with raises(ValueError, match='dataset with the ID='):
        fetch_datasets(filter_data=tuple([100]))
    with raises(ValueError, match='value in the tuple'):
        fetch_datasets(filter_data=tuple([1.00]))
Example #2
def test_fetch_error():
    with raises(ValueError, match='is not a dataset available.'):
        fetch_datasets(filter_data=tuple(['rnd']))
    with raises(ValueError, match='dataset with the ID='):
        fetch_datasets(filter_data=tuple([-1]))
    with raises(ValueError, match='dataset with the ID='):
        fetch_datasets(filter_data=tuple([100]))
    with raises(ValueError, match='value in the tuple'):
        fetch_datasets(filter_data=tuple([1.00]))
Example #3
def __extract_binarized_imbalanced_datasets():
    for dataset_name, dataset_values in fetch_datasets().items():
        write_dataset_to_csv("./binarized-datasets/" + dataset_name + ".csv", dataset_values)

# if __name__ == '__main__':
    # __labelize_dataset("E:/python-workspace/resampler/binarized-datasets/"
    #                    "2_Class_Data_February_Cleaned_with_custom_header.csv",
    #                    "E:/python-workspace/resampler/binarized-datasets/custom_ds.csv")
    #__extract_binarized_imbalanced_datasets()
Example #4
def load_dataset(data_name):

    load_data = fetch_datasets(verbose=True)[data_name]
    print(load_data.data.shape)
    print(Counter(load_data.target))

    X = pd.DataFrame(load_data.data)
    y = pd.DataFrame(load_data.target, columns=['Label'])

    return X, y
Example #5
def load_datasets(dataset, names):
    if dataset == "wilt":
        X, y, cl_names = load_wilt()
    elif dataset == "adult":
        X, y, cl_names = load_adult()
    elif dataset == "diabetes":
        X, y, cl_names = load_diabetes()
    elif dataset == "phoneme":
        X, y, cl_names = load_phoneme()
    elif dataset == "mushroom":
        X, y, cl_names = load_mushroom()
    elif dataset == "electricity":
        X, y, cl_names = load_electricity()
    elif dataset == "speeddating":
        X, y, cl_names = load_speed_dating()
    elif dataset == "credit":
        X, y, cl_names = load_credit()
    elif dataset == "eeg_eye":
        X, y, cl_names = load_eeg_eye()
    elif dataset == "spam":
        X, y, cl_names = load_spam()
    elif dataset == "skin":
        X, y, cl_names = load_skin()
    elif dataset == "bank":
        X, y, cl_names = load_bank()
    elif dataset == "kdd":
        X, y, cl_names = load_kdd()
    elif dataset == "landsatM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "musk2":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "spliceM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "semeion_orig":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "waveformM":
        X, y, cl_names = load_mat_data(dataset)
    else:
        from imblearn import datasets

        data = datasets.fetch_datasets()[dataset]
        cl_names = [
            "feature_" + str(i) for i in range(0, data['data'].shape[1])
        ]
        X = data['data']
        y = data['target']
    y[y != 1] = 0

    names.add(dataset)
    output = []
    output.append(X.shape[0])
    output.append(X.shape[1])
    output.append(float(format(len(abs(y[y != 1])) / sum(y[y == 1]), '.2f')))

    return output
def get_dataset(dataset):
    if dataset == "wilt":
        X, y, cl_names = load_wilt()
    elif dataset == "adult":
        X, y, cl_names = load_adult()
    elif dataset == "diabetes":
        X, y, cl_names = load_diabetes()
    elif dataset == "phoneme":
        X, y, cl_names = load_phoneme()
    elif dataset == "mushroom":
        X, y, cl_names = load_mushroom()
    elif dataset == "electricity":
        X, y, cl_names = load_electricity()
    elif dataset == "speeddating":
        X, y, cl_names = load_speed_dating()
    elif dataset == "credit":
        X, y, cl_names = load_credit()
    elif dataset == "eeg_eye":
        X, y, cl_names = load_eeg_eye()
    elif dataset == "spam":
        X, y, cl_names = load_spam()
    elif dataset == "skin":
        X, y, cl_names = load_skin()
    elif dataset == "bank":
        X, y, cl_names = load_bank()
    elif dataset == "kdd":
        X, y, cl_names = load_kdd()
    elif dataset == "landsatM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "musk2":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "spliceM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "semeion_orig":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "waveformM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset not in ['bloob', 'circle', 'moon']:
        from imblearn import datasets

        data = datasets.fetch_datasets()[dataset]
        cl_names = [
            "feature_" + str(i) for i in range(0, data['data'].shape[1])
        ]
        X = data['data']
        y = data['target']
    y[y != 1] = 0

    return X, y, cl_names
Example #7
def test_documentation_example():
    """Test basic code example shown in documentation"""
    from imblearn.datasets import fetch_datasets

    datasets = fetch_datasets(filter_data=['oil'])
    X, y = datasets['oil']['data'], datasets['oil']['target']

    labels, counts = np.unique(y, return_counts=True)
    assert counts[0] > counts[1]

    kmeans_smote = KMeansSMOTE(kmeans_args={'n_clusters': 100},
                               smote_args={'k_neighbors': 10})
    X_resampled, y_resampled = kmeans_smote.fit_sample(X, y)

    labels, counts = np.unique(y_resampled, return_counts=True)
    assert counts[0] == counts[1]
Example #8
def print_examples():
    ts = fetch_datasets()['thyroid_sick']
    print(ts.data.shape)
    target_classes = sorted(Counter(ts.target).items())
    print(sorted(Counter(ts.target).items()))
    ds = load_ds('../datasets/binarized-datasets/thyroid_sic_2.data')
    labels = ['Target classes']
    healthy, sick = ([len(list(filter(lambda x: x[-1] == 0, ds)))],
                     [len(list(filter(lambda x: x[-1] == 1, ds)))])
    # healty = [target_classes[0][1]]
    # sick = [target_classes[1][1]]

    x = np.arange(len(labels))  # the label locations
    width = 0.35  # the width of the bars

    fig, ax = plt.subplots()
    rects1 = ax.bar(x - width / 2, healthy, width, label='Healthy')
    rects2 = ax.bar(x + width / 2, sick, width, label='Sick')

    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel('Number of samples')
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')
    ax.legend()

    def autolabel(rects):
        """Attach a text label above each bar in *rects*, displaying its height."""
        for rect in rects:
            height = rect.get_height()
            ax.annotate(
                '{}'.format(height),
                xy=(rect.get_x() + rect.get_width() / 2, height),
                xytext=(0, 3),  # 3 points vertical offset
                textcoords="offset points",
                ha='center',
                va='bottom')

    autolabel(rects1)
    autolabel(rects2)

    fig.tight_layout()

    plt.show()
Example #9
def trial(name, sampling_strategy, k_neighbors, n_jobs):
    setup = f'''
    from imblearn.datasets import fetch_datasets
    from imblearn.over_sampling import SMOTE
    
    sampling_strategy = '{sampling_strategy}'
    k_neighbors = {k_neighbors}
    n_jobs = {n_jobs}
    dataset = fetch_datasets()['{name}']
    X, y = dataset.data, dataset.target
    smote = SMOTE(sampling_strategy, k_neighbors=k_neighbors, n_jobs=n_jobs, random_state=0)
    '''
    setup = textwrap.dedent(setup).strip()
    t = timeit('smote.fit_resample(X, y)', setup=setup, number=100)

    dataset = fetch_datasets()[name]
    X, y = dataset.data, dataset.target
    smote = SMOTE(sampling_strategy, k_neighbors=k_neighbors, n_jobs=n_jobs, random_state=0)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    idx = -len(X)
    X_new, y_new = X_resampled[idx:], y_resampled[idx:]
    return X_new, y_new, t
Example #10
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import geometric_mean_score
from matplotlib import pyplot as plt
import seaborn as sns
from seaborn import scatterplot
from numpy import where
from collections import Counter
import numpy as np
# additional imports used further down in this snippet
import pandas as pd
import streamlit as st
from imblearn.datasets import fetch_datasets
from sklearn.model_selection import StratifiedKFold

# get_ipython().run_line_magic('matplotlib', 'inline')

st.title("Skripsiku")

name = 'pen_digits'

dataset = fetch_datasets()[name]

X = dataset.data
y = dataset.target

df = pd.concat([pd.DataFrame(X), pd.DataFrame(y)], axis=1)

st.subheader('Dataset Name :')
st.write(name)
st.write(df)

cv = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)

for train_index, test_index in cv.split(X, y):
    # st.write("Train: \n", train_index, "\nValidation:\n", test_index)
    X_train, X_test = X[train_index], X[test_index]
Example #11
def fetch(*args, **kwargs):
    return fetch_datasets(*args, download_if_missing=True, **kwargs)
Example #12
def templet(sampler_name, sample_ratio):
    """
    Template method.
    :param sampler_name: name of the sampling algorithm
    :param sample_ratio: sampling ratio
    :return:
    """
    dataset = fetch_datasets()['satimage']
    X = dataset.data
    y = dataset.target
    # start time
    start_time = time.time()
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42)
    for train, test in cv.split(X, y):
        # preprocessing
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        sb = None
        if sampler_name == 'CART':
            sb = DummySampler()
        elif sampler_name == 'SMOTE':
            sb = SMOTE(N=sample_ratio, k_neighbors=5, random_state=42)
        elif sampler_name == 'Border1':
            sb = BorderSMOTE(N=sample_ratio,
                             m_neighbors=9,
                             k_neighbors=5,
                             random_state=42,
                             kind='borderline1')
        elif sampler_name == 'Border2':
            sb = BorderSMOTE(N=sample_ratio,
                             m_neighbors=9,
                             k_neighbors=5,
                             random_state=42,
                             kind='borderline2')
        elif sampler_name == 'ADASYN':
            sb = ADASYN(bata=sample_ratio, k_neighbors=5, random_state=42)
        elif sampler_name == 'Safe-level':
            sb = SafeLevelSMOTE(N=sample_ratio, k_neighbors=5, random_state=42)
        else:
            pass
        X_res, y_res = sb.fit_sample(X_train_minmax, y[train])  # sampling
        model = tree.DecisionTreeClassifier(max_depth=8,
                                            min_samples_split=10,
                                            random_state=42)
        model.fit(X_res, y_res)
        predict = model.predict(X_test_minmax)
        probability = model.predict_proba(X_test_minmax)[:, 1]
        precision = metrics.precision_score(y[test], predict)
        recall = metrics.recall_score(y[test], predict)
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test], probability)
        gmean = geometric_mean_score(y[test], predict)
        # write2dic
        fill_dic('precision', sampler_name, sample_ratio, precision)
        fill_dic('recall', sampler_name, sample_ratio, recall)
        fill_dic('f1', sampler_name, sample_ratio, f1)
        fill_dic('auc', sampler_name, sample_ratio, auc)
        fill_dic('gmean', sampler_name, sample_ratio, gmean)
    print('%s %.1f building id transforming took %fs!' %
          (sampler_name, sample_ratio, time.time() - start_time))
        ax.text(j, i, format(cm[i, j], fmt),
                horizontalalignment="center",
                color="white" if cm[i, j] > thresh else "black")

    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')


###############################################################################
# Load an imbalanced dataset
###############################################################################
# We will load the UCI SatImage dataset, which has an imbalance ratio of 9.3:1
# (number of majority samples per minority sample). The data are then split
# into training and testing sets.

satimage = fetch_datasets()['satimage']
X, y = satimage.data, satimage.target
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=0)

###############################################################################
# Classification using a single decision tree
###############################################################################
# We train a decision tree classifier which will be used as a baseline for the
# rest of this example.

###############################################################################
# The results are reported in terms of balanced accuracy and geometric mean,
# which are metrics widely used in the literature to validate models trained
# on imbalanced data sets.
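
###############################################################################
# NOTE: the baseline-training code was truncated in this snippet. The block
# below is a minimal, illustrative sketch of the step described above (not the
# original example's code), reusing the X_train/X_test/y_train/y_test split
# defined earlier.

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import geometric_mean_score

tree_clf = DecisionTreeClassifier(random_state=0)
tree_clf.fit(X_train, y_train)
y_pred_tree = tree_clf.predict(X_test)
print('Balanced accuracy: {:.3f}'.format(balanced_accuracy_score(y_test, y_pred_tree)))
print('Geometric mean: {:.3f}'.format(geometric_mean_score(y_test, y_pred_tree)))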
Example #14
            print(f"Total train time: {clf_copy._total_train_time}")
            print(f"Total fit time: {clf_copy._fit_time}")
            print(f"Total iterations: {clf_copy._iter_count}")
            print(f"Final Sampling Probability: {clf_copy._sampling_proba}")
    # Save results
    try:
        results_df = pd.concat(
            [pd.read_csv(output_path, index_col=0),
             pd.DataFrame(results)])
    except FileNotFoundError:
        results_df = pd.DataFrame(results)
    results_df.to_csv(output_path)


if __name__ == "__main__":
    database = fetch_datasets()
    for dataset_name in DATASETS:
        for estimator_ in ESTIMATORS:
            for cost_scaling_ in COST_SCALINGS:
                try:
                    run_experiment(dataset=database[dataset_name],
                                   estimator=estimator_,
                                   cost_scaling=cost_scaling_,
                                   output_path=OUTPUT_PATH,
                                   verbose=True)
                    print(
                        f"Completed experiment on {dataset_name} dataset with {type(estimator_)} model "
                        + f"and cost scaling {cost_scaling_}")
                except BlockingIOError:
                    print(
                        f"Error running experiment on {dataset_name} dataset with {type(estimator_)} model "
Example #15
    cut_perc = 0.1
    cut_intervals = (512, 256, 128, 64)
    continuation = True

    # Nia to be used in the experiment
    evos = [
        GreyWolfOptimizer, SelfAdaptiveDifferentialEvolution, GeneticAlgorithm,
        EvolutionStrategyMpL, ParticleSwarmAlgorithm
    ]

    # Datasets
    dataset_names = [
        'libras_move', 'spectrometer', 'optical_digits', 'oil', 'ozone_level',
        'arrhythmia', 'us_crime', 'yeast_ml8'
    ]
    datasets = fetch_datasets(verbose=True)

    with open('./results/results_all5.csv', 'w') as f:
        print(
            'Algorithm,Dataset,Fold,Accuracy,Fscore,TrainingTime,NoFeatures,Solution'
        )
        print(
            'Algorithm,Dataset,Fold,Accuracy,Fscore,TrainingTime,NoFeatures,Solution',
            file=f)

        # For each dataset
        for dataset_name in dataset_names:
            dataset = datasets[dataset_name]
            scaler = MinMaxScaler()  # Scale it
            dataset.data = scaler.fit_transform(dataset.data)
            skf = StratifiedKFold(n_splits=10,
Example #16
def model(boosting_name, data_name, classifier_name, cv_name, mode):
    """
    Template method.
    :param boosting_name: ensemble method to use
    :param data_name: dataset name
    :param classifier_name: base classifier to use
    :param cv_name: cross-validation scheme
    :param mode: sampling mode
    :return:
    """
    # load the data
    if data_name in fetch_datasets().keys():
        dataset = fetch_datasets()[data_name]
        X = dataset.data
        y = dataset.target
        print(Counter(y))
    else:
        # load custom data
        df = pd.read_csv('../imbalanced_data/%s.csv' % data_name, header=None)
        array = df.values.astype(float)
        X = array[:, 0:array.shape[1] - 1]
        y = array[:, -1]
        print(Counter(y))
    base = None
    if classifier_name == 'CART':
        base = tree.DecisionTreeClassifier(max_depth=8,
                                           random_state=42,
                                           min_samples_split=10)
    elif classifier_name == 'svm':
        base = svm.SVC()
    else:
        pass
    # start time
    start_time = time.time()
    cv = None
    if cv_name == 'StratifiedKFold':
        cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    elif cv_name == 'RepeatedStratifiedKFold':
        cv = RepeatedStratifiedKFold(n_repeats=10,
                                     n_splits=10,
                                     random_state=42)
    else:
        pass
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)  # interpolation points (keep fpr/tpr consistent across folds)
    aucs = []
    for train, test in cv.split(X, y):
        # preprocessing
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        classifier = None
        if boosting_name == 'CART':
            classifier = base
        elif boosting_name == 'Bagging':
            classifier = BaggingClassifier(base_estimator=base,
                                           n_estimators=40)
        elif boosting_name == 'BalancedBagging':
            classifier = BalancedBaggingClassifier(base_estimator=base,
                                                   ratio='auto',
                                                   replacement=True,
                                                   random_state=42)
        elif boosting_name == 'Adaboost':
            classifier = AdaBoostClassifier(base_estimator=base,
                                            n_estimators=40)
        elif boosting_name == 'Random Forest':
            classifier = RandomForestClassifier(max_depth=8,
                                                min_samples_split=10,
                                                n_estimators=40,
                                                random_state=42)
        elif boosting_name == 'EasyEnsemble':
            model_under(boosting_name, X_train_minmax, y[train], X_test_minmax,
                        y[test])
            continue
        elif boosting_name == 'BalanceCascade':
            model_under(boosting_name, X_train_minmax, y[train], X_test_minmax,
                        y[test])
            continue
        elif boosting_name == 'SMOTEBoost':
            classifier = SMOTEBoost(rate=100,
                                    n_estimators=40,
                                    weak_estimator=base,
                                    random_state=42,
                                    class_dist=False)
        elif boosting_name == 'RUSBoost':
            classifier = RUSBoost(ratio=50,
                                  n_estimators=40,
                                  weak_estimator=base,
                                  random_state=42,
                                  class_dist=False)
        else:
            pass
        classifier.fit(X_train_minmax, y[train])  # fit the classifier
        predict = classifier.predict(X_test_minmax)
        probability = classifier.predict_proba(X_test_minmax)[:, 1]
        # compute metrics
        precision = metrics.precision_score(y[test], predict)
        recall = metrics.recall_score(y[test], predict)
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test], probability)
        gmean = geometric_mean_score(y[test], predict)
        accuracy = metrics.accuracy_score(y[test], predict)
        # ------------- step 6: compute the ROC / PR curve points for each fold -------------
        fpr, tpr, thresholds = metrics.roc_curve(y[test], probability)
        # interpolate mean_tpr at the mean_fpr points using scipy's interp()
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0  # why?
        roc_auc = metrics.auc(fpr, tpr)
        aucs.append(roc_auc)
        # write2dic
        fill_dic('precision', boosting_name, precision)
        fill_dic('recall', boosting_name, recall)
        fill_dic('f1', boosting_name, f1)
        fill_dic('auc', boosting_name, auc)
        fill_dic('gmean', boosting_name, gmean)

    if boosting_name != 'EasyEnsemble' and boosting_name != 'BalanceCascade':
        # write fpr and tpr to a file
        # at the 100 mean_fpr points, average the values interpolated from each fold
        mean_tpr /= cv.get_n_splits()
        # the last point of the curve is (1, 1)
        mean_tpr[-1] = 1.0
        # compute the mean AUC
        mean_auc = metrics.auc(mean_fpr, mean_tpr)

        # concatenate the mean fpr and tpr and save them to a file
        filename = './ROC/{data_name}/{mode}/{base_classifier}/{sampler}.csv'. \
            format(data_name=data_name, mode=mode, base_classifier=classifier_name, sampler=boosting_name)
        # split off the directory part of the path
        file_dir = os.path.split(filename)[0]
        # create the directory (including intermediate levels) if it does not exist
        if not os.path.isdir(file_dir):
            os.makedirs(file_dir)
        # # then check whether the file exists and create it if it does not
        # if not os.path.exists(filename):
        #     os.system(r'touch %s' % filename)
        # stack the results together
        all = np.c_[mean_fpr, mean_tpr]
        np.savetxt(filename, all, delimiter=',', fmt='%f')

    print('%s building id transforming took %fs!' %
          (boosting_name, time.time() - start_time))
Example #17
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import numpy as np
# additional imports used further down in this snippet
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.datasets import fetch_datasets


def print_results(headline, true_value, pred):
    print(headline)
    print("accuracy: {}").format(accuracy_score(true_value, pred))
    print("precision: {}").format(precision_score(true_value, pred))
    print("recall: {}").format(recall_score(true_value, pred))
    print("f1: {}").format(f1_score(true_value, pred))


data = fetch_datasets()['wine_quality']

X_train, X_test, y_train, y_test = train_test_split(data['data'],
                                                    data['target'],
                                                    random_state=2)

#build normal model
pipeline = make_pipeline(RandomForestClassifier(random_state=42))
model = pipeline.fit(X_train, y_train)
prediction = model.predict(X_test)

#build model with SMOTE
smote_pipeline = make_pipeline_imb(SMOTE(random_state=4),
                                   RandomForestClassifier(random_state=42))
smote_model = smote_pipeline.fit(X_train, y_train)
smote_prediction = smote_model.predict(X_test)
Example #18
def test():
    dic = {'recall': {'CART': [], 'SMOTE': [], 'Border1': [], 'Border2': [], 'ADASYN': [], 'Safe-level': []},
           'precision': {'CART': [], 'SMOTE': [], 'Border1': [], 'Border2': [], 'ADASYN': [], 'Safe-level': []},
           'f1': {'CART': [], 'SMOTE': [], 'Border1': [], 'Border2': [], 'ADASYN': [], 'Safe-level': []},
           'auc': {'CART': [], 'SMOTE': [], 'Border1': [], 'Border2': [], 'ADASYN': [], 'Safe-level': []},
           'gmean': {'CART': [], 'SMOTE': [], 'Border1': [], 'Border2': [], 'ADASYN': [], 'Safe-level': []}}

    results = prettytable.PrettyTable(["Classifier", "Precision", 'Recall', 'AUC', 'F-measure', 'G-mean'])

    # load the data
    dataset = fetch_datasets()['satimage']
    X = dataset.data
    y = dataset.target
    print(Counter(y))
    # random seed so that each run gives the same results
    np.random.seed(42)
    # -------------------------------------------CART----------------------------------------------------
    # start time
    start_time = time.time()
    # cross-validation for CART
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42)
    for train, test in cv.split(X, y):
        # initialize CART
        cart = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42)
        # normalization
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        # training
        cart.fit(X_train_minmax, y[train])
        # testing
        predict = cart.predict(X_test_minmax)
        probability = cart.predict_proba(X_test_minmax)

        cart_auc = metrics.roc_auc_score(y[test], probability[:, 1])
        cart_precision = metrics.precision_score(y[test], predict)
        cart_recall = metrics.recall_score(y[test], predict)
        if cart_precision == 0:
            cart_f1 = 0
        else:
            cart_f1 = 2 * (cart_precision * cart_recall) / (cart_precision + cart_recall)
        cart_gmean = geometric_mean_score(y[test], predict)
        dic['precision']['CART'].append(cart_precision)
        dic['recall']['CART'].append(cart_recall)
        dic['f1']['CART'].append(cart_f1)
        dic['auc']['CART'].append(cart_auc)
        dic['gmean']['CART'].append(cart_gmean)
    print('CART building id transforming took %fs!' % (time.time() - start_time))

    # ---------------------------------------------------SMOTE----------------------------------------------------------
    # start time
    start_time = time.time()
    # cross-validation
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=10, n_splits=10, random_state=42)
    for train, test in cv.split(X, y):
        # preprocess
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        # initialize sampler
        sb = SMOTE(N=100, k_neighbors=5, random_state=42)
        # sampling
        X_res, y_res = sb.fit_sample(X_train_minmax, y[train])
        # initialize classifier
        model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42)
        # model = svm.SVC(class_weight={1: 20})
        model.fit(X_res, y_res)
        predict = model.predict(X_test_minmax)
        probability = model.predict_proba(X_test_minmax)[:, 1]

        precision = metrics.precision_score(y[test], predict)
        recall = metrics.recall_score(y[test], predict)
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test], probability)
        gmean = geometric_mean_score(y[test], predict)
        dic['precision']['SMOTE'].append(precision)
        dic['recall']['SMOTE'].append(recall)
        dic['f1']['SMOTE'].append(f1)
        dic['auc']['SMOTE'].append(auc)
        dic['gmean']['SMOTE'].append(gmean)

    print('SMOTE building id transforming took %fs!' % (time.time() - start_time))

    # ---------------------------------------------Borderline-SMOTE1----------------------------------------------------
    # start time
    start_time = time.time()
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42)
    for train, test in cv.split(X, y):
        # preprocessing
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        # initialize the sampler
        sb = BorderSMOTE(N=100, m_neighbors=30, k_neighbors=5, random_state=42, kind='borderline1')
        # sampling
        X_res, y_res = sb.fit_sample(X_train_minmax, y[train])

        model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42)
        model.fit(X_res, y_res)
        predict = model.predict(X_test_minmax)
        probability = model.predict_proba(X_test_minmax)[:, 1]

        precision = metrics.precision_score(y[test], predict)
        recall = metrics.recall_score(y[test], predict)
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test], probability)
        gmean = geometric_mean_score(y[test], predict)
        dic['precision']['Border1'].append(precision)
        dic['recall']['Border1'].append(recall)
        dic['f1']['Border1'].append(f1)
        dic['auc']['Border1'].append(auc)
        dic['gmean']['Border1'].append(gmean)

    print('BorderSmote1 building id transforming took %fs!' % (time.time() - start_time))

    # ---------------------------------------------Borderline-SMOTE2----------------------------------------------------
    # start time
    start_time = time.time()
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42)
    for train, test in cv.split(X, y):
        # preprocessing
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        # initialize the sampler
        sb = BorderSMOTE(N=100, m_neighbors=30, k_neighbors=5, random_state=42, kind='borderline2')
        # sampling
        X_res, y_res = sb.fit_sample(X_train_minmax, y[train])

        model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42)
        model.fit(X_res, y_res)
        predict = model.predict(X_test_minmax)
        probability = model.predict_proba(X_test_minmax)[:, 1]

        precision = metrics.precision_score(y[test], predict)
        recall = metrics.recall_score(y[test], predict)
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test], probability)
        gmean = geometric_mean_score(y[test], predict)
        dic['precision']['Border2'].append(precision)
        dic['recall']['Border2'].append(recall)
        dic['f1']['Border2'].append(f1)
        dic['auc']['Border2'].append(auc)
        dic['gmean']['Border2'].append(gmean)

    print('BorderSmote2 building id transforming took %fs!' % (time.time() - start_time))

    # ---------------------------------------------ADASYN---------------------------------------------------------------
    # start time
    start_time = time.time()
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42)
    for train, test in cv.split(X, y):
        # preprocessing
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        # training
        sb = ADASYN(bata=0.1, k_neighbors=5, random_state=42)
        # prediction
        X_res, y_res = sb.fit_sample(X_train_minmax, y[train])

        model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42)
        model.fit(X_res, y_res)
        predict = model.predict(X_test_minmax)
        probability = model.predict_proba(X_test_minmax)[:, 1]

        precision = metrics.precision_score(y[test], predict)
        recall = metrics.recall_score(y[test], predict)
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test], probability)
        gmean = geometric_mean_score(y[test], predict)
        dic['precision']['ADASYN'].append(precision)
        dic['recall']['ADASYN'].append(recall)
        dic['f1']['ADASYN'].append(f1)
        dic['auc']['ADASYN'].append(auc)
        dic['gmean']['ADASYN'].append(gmean)

    print('ADASYN building id transforming took %fs!' % (time.time() - start_time))

    # ------------------------------------------------Safe-Level-SMOTE----------------------------------------------
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42)
    for train, test in cv.split(X, y):
        # preprocessing
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        # training
        sb = SafeLevelSMOTE(N=100, k_neighbors=5, random_state=42)
        # prediction
        X_res, y_res = sb.fit_sample(X_train_minmax, y[train])

        model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42)
        model.fit(X_res, y_res)
        predict = model.predict(X_test_minmax)
        probability = model.predict_proba(X_test_minmax)[:, 1]

        precision = metrics.precision_score(y[test], predict)
        recall = metrics.recall_score(y[test], predict)
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test], probability)
        gmean = geometric_mean_score(y[test], predict)
        dic['precision']['Safe-level'].append(precision)
        dic['recall']['Safe-level'].append(recall)
        dic['f1']['Safe-level'].append(f1)
        dic['auc']['Safe-level'].append(auc)
        dic['gmean']['Safe-level'].append(gmean)

    print('Safe-level building id transforming took %fs!' % (time.time() - start_time))

    # display
    results.add_row(['CART',
                     np.mean(np.array(dic['precision']['CART'])),
                     np.mean(np.array(dic['recall']['CART'])),
                     np.mean(np.array(dic['auc']['CART'])),
                     np.mean(np.array(dic['f1']['CART'])),
                     np.mean(np.array(dic['gmean']['CART']))])
    results.add_row(['SMOTE',
                     np.mean(np.array(dic['precision']['SMOTE'])),
                     np.mean(np.array(dic['recall']['SMOTE'])),
                     np.mean(np.array(dic['auc']['SMOTE'])),
                     np.mean(np.array(dic['f1']['SMOTE'])),
                     np.mean(np.array(dic['gmean']['SMOTE']))])
    results.add_row(['Border1',
                     np.mean(np.array(dic['precision']['Border1'])),
                     np.mean(np.array(dic['recall']['Border1'])),
                     np.mean(np.array(dic['auc']['Border1'])),
                     np.mean(np.array(dic['f1']['Border1'])),
                     np.mean(np.array(dic['gmean']['Border1']))])
    results.add_row(['Border2',
                     np.mean(np.array(dic['precision']['Border2'])),
                     np.mean(np.array(dic['recall']['Border2'])),
                     np.mean(np.array(dic['auc']['Border2'])),
                     np.mean(np.array(dic['f1']['Border2'])),
                     np.mean(np.array(dic['gmean']['Border2']))])
    results.add_row(['ADASYN',
                     np.mean(np.array(dic['precision']['ADASYN'])),
                     np.mean(np.array(dic['recall']['ADASYN'])),
                     np.mean(np.array(dic['auc']['ADASYN'])),
                     np.mean(np.array(dic['f1']['ADASYN'])),
                     np.mean(np.array(dic['gmean']['ADASYN']))])
    results.add_row(['Safe-level',
                     np.mean(np.array(dic['precision']['Safe-level'])),
                     np.mean(np.array(dic['recall']['Safe-level'])),
                     np.mean(np.array(dic['auc']['Safe-level'])),
                     np.mean(np.array(dic['f1']['Safe-level'])),
                     np.mean(np.array(dic['gmean']['Safe-level']))])
    print(results)
Example #19
def fetch(*args, **kwargs):
    return fetch_datasets(*args, download_if_missing=True, **kwargs)
Example #20
def test_fetch_error(filter_data, err_msg):
    with pytest.raises(ValueError, match=err_msg):
        fetch_datasets(filter_data=filter_data)
Example #21
from imblearn.datasets import fetch_datasets
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import numpy
datasets = [
    'ecoli', 'optical_digits', 'satimage', 'pen_digits', 'abalone',
    'sick_euthyroid', 'spectrometer', 'car_eval_34', 'isolet', 'us_crime',
    'yeast_ml8', 'scene', 'libras_move', 'thyroid_sick', 'coil_2000',
    'arrhythmia', 'solar_flare_m0', 'oil', 'car_eval_4', 'wine_quality',
    'letter_img', 'yeast_me2', 'webpage', 'ozone_level', 'mammography',
    'protein_homo', 'abalone_19'
]
for dataset in datasets:
    object = fetch_datasets(data_home='./data/')[dataset]
    X, y = object.data, object.target
    train_X, test_X, train_y, test_y = train_test_split(
        X, y)  # splits 75%/25% by default
    numpy.savez('./data/zendo_stable/' + dataset + '.npz',
                train_X=train_X,
                test_X=test_X,
                train_y=train_y,
                test_y=test_y)
            horizontalalignment="center",
            color="white" if cm[i, j] > thresh else "black",
        )

    ax.set_ylabel("True label")
    ax.set_xlabel("Predicted label")


###############################################################################
# Load an imbalanced dataset
###############################################################################
# We will load the UCI SatImage dataset, which has an imbalance ratio of 9.3:1
# (number of majority samples per minority sample). The data are then split
# into training and testing sets.

satimage = fetch_datasets()["satimage"]
X, y = satimage.data, satimage.target
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    random_state=0)

###############################################################################
# Classification using a single decision tree
###############################################################################
# We train a decision tree classifier which will be used as a baseline for the
# rest of this example.

###############################################################################
# The results are reported in terms of balanced accuracy and geometric mean,
# which are metrics widely used in the literature to validate models trained
# on imbalanced data sets.
Example #23
def run_eval(dataset, base_learners, methods):
    if dataset == "wilt":
        X, y, cl_names = load_wilt()
    elif dataset == "adult":
        X, y, cl_names = load_adult()
    elif dataset == "diabetes":
        X, y, cl_names = load_diabetes()
    elif dataset == "phoneme":
        X, y, cl_names = load_phoneme()
    elif dataset == "mushroom":
        X, y, cl_names = load_mushroom()
    elif dataset == "electricity":
        X, y, cl_names = load_electricity()
    elif dataset == "speeddating":
        X, y, cl_names = load_speed_dating()
    elif dataset == "credit":
        X, y, cl_names = load_credit()
    elif dataset == "eeg_eye":
        X, y, cl_names = load_eeg_eye()
    elif dataset == "spam":
        X, y, cl_names = load_spam()
    elif dataset == "skin":
        X, y, cl_names = load_skin()
    elif dataset == "bank":
        X, y, cl_names = load_bank()
    elif dataset == "kdd":
        X, y, cl_names = load_kdd()
    elif dataset == "landsatM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "musk2":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "spliceM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "semeion_orig":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "waveformM":
        X, y, cl_names = load_mat_data(dataset)
    else:
        from imblearn import datasets

        data = datasets.fetch_datasets()[dataset]
        cl_names = [
            "feature_" + str(i) for i in range(0, data['data'].shape[1])
        ]
        X = data['data']
        y = data['target']

    y[y != 1] = 0

    processes = []

    for method in methods:
        p = Process(target=train_classifier,
                    args=(X, y, base_learners, method,
                          cl_names))  # Passing the list
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

    N = len(methods)
    ind = numpy.arange(N)  # the x locations for the groups
    width = 0.35  # the width of the bars: can also be len(x) sequence

    raw_data = dict()

    for method in methods:
        with open('temp_features/' + method, 'rb') as filehandle:
            # read the data as binary data stream
            model = pickle.load(filehandle)
            # print (method, model.feature_importances_)
            raw_data[method] = model.feature_importances_
            f_num = len(model.feature_importances_)
    index = ["Feature " + str(k) for k in range(1, f_num + 1)]
    # index = ["Atrribute 1","Atrribute 2","Atrribute 3","Atrribute 4","Atrribute 5","Atrribute 6"]
    df = pd.DataFrame(raw_data, index=index)
    df = df.transpose()

    ax = df.plot.bar(stacked=True, alpha=0.75, rot=25)
    ax.set_ylabel("Feature importance")
    ax.set_xlabel("Methods")
    ax.legend(loc='center left', bbox_to_anchor=(0.1, 1.07),
              ncol=3)  # here is the magic

    ax.figure.savefig('Images/features/' + dataset + '.png',
                      bbox_inches='tight',
                      dpi=200)
            pass


if __name__ == '__main__':
    import time
    import prettytable
    from collections import Counter
    from sklearn import tree
    from sklearn import metrics
    from sklearn import preprocessing
    from imblearn.datasets import fetch_datasets
    from imblearn.metrics import geometric_mean_score
    from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold

    start_time = time.time()
    dataset = fetch_datasets()['oil']
    X = dataset.data
    y = dataset.target
    # print(Counter(y))

    cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42)
    dic = {'recall': [], 'precision': [], 'f1': [], 'auc': [], 'gmean': []}
    results = prettytable.PrettyTable(["Classifier", "Precision", 'Recall', 'F-measure', 'AUC', 'G-mean'])
    for train, test in cv.split(X, y):
        # preprocessing
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        # training
        sb = BorderSMOTE(N=100, m_neighbors=9, k_neighbors=5, random_state=42, kind='borderline1')
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


ozone = fetch_datasets()['ozone_level']
X, y = ozone.data, ozone.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

bagging = BaggingClassifier(random_state=0)
balanced_bagging = BalancedBaggingClassifier(random_state=0)

print('Class distribution of the training set: {}'.format(Counter(y_train)))

bagging.fit(X_train, y_train)
balanced_bagging.fit(X_train, y_train)

print('Class distribution of the test set: {}'.format(Counter(y_test)))

print('Classification results using a bagging classifier on imbalanced data')
y_pred_bagging = bagging.predict(X_test)
Example #26
def obtain_data(dataset_name):
    dataset = fetch_datasets()[dataset_name]
    return dataset.data, dataset.target
def run_eval(dataset, folds, iterations, baseL, methods):
    if dataset == "wilt":
        X, y, cl_names = load_wilt()
    elif dataset == "adult":
        X, y, cl_names = load_adult()
    elif dataset == "diabetes":
        X, y, cl_names = load_diabetes()
    elif dataset == "phoneme":
        X, y, cl_names = load_phoneme()
    elif dataset == "mushroom":
        X, y, cl_names = load_mushroom()
    elif dataset == "electricity":
        X, y, cl_names = load_electricity()
    elif dataset == "speeddating":
        X, y, cl_names = load_speed_dating()
    elif dataset == "credit":
        X, y, cl_names = load_credit()
    elif dataset == "eeg_eye":
        X, y, cl_names = load_eeg_eye()
    elif dataset == "spam":
        X, y, cl_names = load_spam()
    elif dataset == "skin":
        X, y, cl_names = load_skin()
    elif dataset == "bank":
        X, y, cl_names = load_bank()
    elif dataset == "kdd":
        X, y, cl_names = load_kdd()
    elif dataset == "landsatM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "musk2":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "spliceM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "semeion_orig":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "rain_aus":
        X, y, cl_names = load_rain_aus()
    elif dataset == "waveformM":
        X, y, cl_names = load_mat_data(dataset)
    else:
        from imblearn import datasets

        data = datasets.fetch_datasets()[dataset]
        cl_names = ["feature_" + str(i) for i in range(0, data['data'].shape[1])]
        X = data['data']
        y = data['target']
    y[y != 1] = 0

    unique_attr = set([i.split("?")[0] for i in cl_names])
    print(dataset + "\t" + str(len(unique_attr)) + "\t" + str(f'{sum(abs(y[y == 1])):,}') + "\t" + str(
        f'{len(abs(y[y != 1])):,}') + "\t1:" + str(format(len(abs(y[y != 1])) / sum(y[y == 1]), '.2f')))

    list_of_dicts = []
    list_of_dicts_stats = []

    for t_dict in range(0, len(methods)):
        list_of_dicts.append(defaultdict(dict))
        list_of_dicts_stats.append(defaultdict(dict))

    for weak_learners in baseL:
        for item in list_of_dicts:
            item[weak_learners] = defaultdict(list)

    for weak_learners in baseL:
        for item in list_of_dicts_stats:
            item[weak_learners] = defaultdict(list)

    for samples in range(0, iterations):

        sss = StratifiedKFold(n_splits=folds, shuffle=True, random_state=int(time.time()))
        for weak_learners in baseL:
            print("iteration=", samples, " weak learners=", weak_learners)

            # for weak_learners in baseL:
            for train_index, test_index in sss.split(X, y):

                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                processes = []
                for method in methods:
                    p = Process(target=train_and_predict,
                                args=(X_train, y_train, X_test, weak_learners, method, cl_names))
                    p.start()
                    processes.append(p)

                for p in processes:
                    p.join()

                for index, method in enumerate(methods):
                    with open('temp_preds/' + method, 'rb') as filehandle:
                        list_of_dicts[index] = update_performance_stats(
                            calculate_performance(y_test, pickle.load(filehandle)),
                            list_of_dicts[index],
                            weak_learners
                        )

                    with open('temp_preds/stats_' + method, 'rb') as filehandle:
                        list_of_dicts_stats[index] = update_resource_stats(pickle.load(filehandle),
                                                                           list_of_dicts_stats[index],
                                                                           weak_learners,
                                                                           method
                                                                           )
    plot_single_dataset(methods, list_of_dicts, "Images/Performance/" + dataset + "/", baseL)
    plot_resource_stats_time(methods, list_of_dicts_stats, "Images/Performance/" + dataset + "/Resource/", baseL)
    plot_resource_stats_scores(methods, list_of_dicts_stats, "Images/Performance/" + dataset + "/Resource/", baseL)
    return list_of_dicts, list_of_dicts_stats
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


ozone = fetch_datasets()['ozone_level']
X, y = ozone.data, ozone.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

bagging = BaggingClassifier(random_state=0)
balanced_bagging = BalancedBaggingClassifier(random_state=0)

print('Class distribution of the training set: {}'.format(Counter(y_train)))

bagging.fit(X_train, y_train)
balanced_bagging.fit(X_train, y_train)

print('Class distribution of the test set: {}'.format(Counter(y_test)))

print('Classification results using a bagging classifier on imbalanced data')
y_pred_bagging = bagging.predict(X_test)
Example #29
        return X_resampled, y_resampled


if __name__ == '__main__':
    import time
    import prettytable
    from collections import Counter
    from sklearn import tree
    from sklearn import metrics
    from sklearn import preprocessing
    from imblearn.datasets import fetch_datasets
    from imblearn.metrics import geometric_mean_score
    from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold

    start_time = time.time()
    dataset = fetch_datasets()['satimage']
    X = dataset.data
    y = dataset.target
    print(Counter(y))

    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42)
    dic = {'recall': [], 'precision': [], 'f1': [], 'auc': [], 'gmean': []}
    results = prettytable.PrettyTable(
        ["Classifier", "Precision", 'Recall', 'F-measure', 'AUC', 'G-mean'])
    for train, test in cv.split(X, y):
        # preprocessing
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        # training
Example #30
def test_fetch_error(filter_data, err_msg):
    with pytest.raises(ValueError, match=err_msg):
        fetch_datasets(filter_data=filter_data)
Example #31
def run_eval(dataset, baseL, methods):
    if dataset == "wilt":
        X, y, cl_names = load_wilt()
    elif dataset == "adult":
        X, y, cl_names = load_adult()
    elif dataset == "diabetes":
        X, y, cl_names = load_diabetes()
    elif dataset == "phoneme":
        X, y, cl_names = load_phoneme()
    elif dataset == "mushroom":
        X, y, cl_names = load_mushroom()
    elif dataset == "electricity":
        X, y, cl_names = load_electricity()
    elif dataset == "speeddating":
        X, y, cl_names = load_speed_dating()
    elif dataset == "credit":
        X, y, cl_names = load_credit()
    elif dataset == "eeg_eye":
        X, y, cl_names = load_eeg_eye()
    elif dataset == "spam":
        X, y, cl_names = load_spam()
    elif dataset == "skin":
        X, y, cl_names = load_skin()
    elif dataset == "bank":
        X, y, cl_names = load_bank()
    elif dataset == "kdd":
        X, y, cl_names = load_kdd()
    elif dataset == "landsatM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "musk2":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "spliceM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "semeion_orig":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "waveformM":
        X, y, cl_names = load_mat_data(dataset)
    else:
        from imblearn import datasets

        data = datasets.fetch_datasets()[dataset]
        cl_names = ["feature_" + str(i) for i in range(0, data['data'].shape[1])]
        X = data['data']
        y = data['target']

    y[y != 1] = 0
    print("===============-- " + dataset + " --===============")

    processes = []
    for method in methods:
        p = Process(target=train_and_predict, args=(X, y, baseL, method))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

    list_of_dicts = []

    for method in methods:
        with open('temp_preds_adaac/' + method, 'rb') as filehandle:
            list_of_dicts.append(update_stats(pickle.load(filehandle)))

    plot_amort_vs_non_amort(methods, list_of_dicts, baseL, "Images/Amort_vs_non_amort/" + dataset + "/")
    return list_of_dicts
Example #32
File: run.py (project: jhkjhkim/CUSBoost.NC)
     }
for data in dataset:
    print("dataset : ", data)
    '''
    fetch_data = fetch_datasets()[data]
    
    X = fetch_data.data
    y = fetch_data.target
    
    normalization_object = Normalizer()
    X = normalization_object.fit_transform(X)
    
    labelencoder = LabelEncoder()
    y = labelencoder.fit_transform(y)
    '''
    fetch_data = fetch_datasets()[data]
    
    
    X = fetch_data.data
    y = fetch_data.target
    
    Standard_object = StandardScaler()
    X = Standard_object.fit_transform(X)
    
    labelencoder = LabelEncoder()
    y = labelencoder.fit_transform(y)
    value, counts = np.unique(y, return_counts=True)
    
    if counts[0]>= counts[1]:
        fraction = int((counts[1]/counts[0])*100)
    else: 
Example #33
from sklearn.model_selection import KFold
from imblearn.datasets import fetch_datasets

from photonai.base import Hyperpipe, PipelineElement, OutputSettings
from photonai.optimization import FloatRange, Categorical, IntegerRange

# example of imbalanced dataset
dataset = fetch_datasets()["coil_2000"]
X, y = dataset.data, dataset.target
# class ratio: roughly 0.94 majority vs 0.06 minority

my_pipe = Hyperpipe(
    "basic_svm_pipe_no_performance",
    optimizer="random_grid_search",
    optimizer_params={"n_configurations": 10},
    metrics=["accuracy", "precision", "recall"],
    best_config_metric="recall",
    outer_cv=KFold(n_splits=3),
    inner_cv=KFold(n_splits=5),
    verbosity=1,
    output_settings=OutputSettings(project_folder="./tmp/"),
)


# ADD ELEMENTS TO YOUR PIPELINE
my_pipe += PipelineElement("StandardScaler")

my_pipe += PipelineElement(
    "PCA", hyperparameters={"n_components": IntegerRange(5, 20)}, test_disabled=True
)
def run_eval(dataset, base_learners, methods):
    if dataset == "wilt":
        X, y, cl_names = load_wilt()
    elif dataset == "adult":
        X, y, cl_names = load_adult()
    elif dataset == "diabetes":
        X, y, cl_names = load_diabetes()
    elif dataset == "phoneme":
        X, y, cl_names = load_phoneme()
    elif dataset == "mushroom":
        X, y, cl_names = load_mushroom()
    elif dataset == "electricity":
        X, y, cl_names = load_electricity()
    elif dataset == "speeddating":
        X, y, cl_names = load_speed_dating()
    elif dataset == "credit":
        X, y, cl_names = load_credit()
    elif dataset == "eeg_eye":
        X, y, cl_names = load_eeg_eye()
    elif dataset == "spam":
        X, y, cl_names = load_spam()
    elif dataset == "skin":
        X, y, cl_names = load_skin()
    elif dataset == "bank":
        X, y, cl_names = load_bank()
    elif dataset == "kdd":
        X, y, cl_names = load_kdd()
    elif dataset == "landsatM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "musk2":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "spliceM":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "semeion_orig":
        X, y, cl_names = load_mat_data(dataset)
    elif dataset == "waveformM":
        X, y, cl_names = load_mat_data(dataset)
    else:
        from imblearn import datasets

        data = datasets.fetch_datasets()[dataset]
        cl_names = ["feature_" + str(i) for i in range(0, data['data'].shape[1])]
        X = data['data']
        y = data['target']

    y[y != 1] = 0

    list_of_scores = []
    processes = []

    for method in methods:
        p = Process(target=train_classifier, args=(X, y, base_learners, method, cl_names))  # Passing the list
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

    for method in methods:
        with open('temp/' + method, 'rb') as filehandle:
            # read the data as binary data stream
            list_of_scores.append(pickle.load(filehandle))

    y[y != 1] = -1

    for idx in range(0, len(list_of_scores)):
        list_of_scores[idx] = numpy.array(list_of_scores[idx]) * y

    overall_confs = []
    positive_confs = []
    negative_confs = []

    for conf in list_of_scores:
        overall_confs.append(conf)
        positive_confs.append(conf[y == 1])
        negative_confs.append(conf[y == -1])

    num_bins = 40
    fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(20, 4))
    plt.rcParams.update({'font.size': 12})
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'dimgray', 'peru', 'hotpink', 'tomato']
    default_cycler = (cycler(color=colors) +
                      cycler(linestyle=['-', (0, (1, 1)), '--', '-.',
                                        (0, (5, 10)),
                                        (0, (5, 1)),
                                        '-', (0, (1, 1)), '--', '-.',
                                        (0, (5, 10))]))

    ax1.set_prop_cycle(default_cycler)
    ax2.set_prop_cycle(default_cycler)
    ax3.set_prop_cycle(default_cycler)

    ax1.set_title("Positive CDF")
    ax1.grid(True)
    ax1.set_xlim(-1, 1)
    ax2.set_xlim(-1, 1)
    ax3.set_xlim(-1, 1)

    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'dimgray', 'peru', 'hotpink', 'tomato', 'indigo', 'lightskyblue']

    output = defaultdict(list)

    for idx in range(0, len(positive_confs)):
        pos_conf = positive_confs[idx]
        counts_positives, bin_edges_positives = numpy.histogram(pos_conf, bins=num_bins, density=True)
        cdf_positives = numpy.cumsum(counts_positives)
        # ax1.plot(bin_edges_positives[:-1], cdf_positives / cdf_positives[-1], label=methods[idx],color=colors[idx])
        ax1.plot(bin_edges_positives[:-1], cdf_positives / cdf_positives[-1], label=methods[idx])
        output[methods[idx]].append(bin_edges_positives[:-1])
        output[methods[idx]].append(cdf_positives)

    # ax1.legend(loc='best')
    ax1.set_xlabel("Margin")

    ax1.set_ylabel("Cumulative Distribution")
    ax1.axhline(0, color='black')
    ax1.axvline(0, color='black')

    ax2.grid(True)

    ax2.axhline(0, color='black')
    ax2.axvline(0, color='black')
    ax2.set_title("Negative CDF")

    for idx in range(0, len(negative_confs)):
        if idx == 0:
            ax2.set_ylabel("Cumulative Distribution")
            ax2.set_xlabel("Margin")

        neg_conf = negative_confs[idx]
        counts_negatives, bin_edges_negatives = numpy.histogram(neg_conf, bins=num_bins, density=True)
        cdf_negatives = numpy.cumsum(counts_negatives)
        # ax2.plot(bin_edges_negatives[:-1], cdf_negatives / cdf_negatives[-1], label=methods[idx],color=colors[idx])
        ax2.plot(bin_edges_negatives[:-1], cdf_negatives / cdf_negatives[-1], label=methods[idx])
        output[methods[idx]].append(bin_edges_negatives[:-1])
        output[methods[idx]].append(cdf_negatives)

    ax3.grid(True)

    ax3.axhline(0, color='black')
    ax3.axvline(0, color='black')
    ax3.set_title("Overall CDF")

    for idx in range(0, len(negative_confs)):
        if idx == 0:
            ax3.set_ylabel("Cumulative Distribution")
            ax3.set_xlabel("Margin")

        over_conf = overall_confs[idx]
        counts_overall, bin_edges_overall = numpy.histogram(over_conf, bins=num_bins, density=True)
        cdf_overall = numpy.cumsum(counts_overall)
        # ax3.plot(bin_edges_overall[:-1], cdf_overall / cdf_overall[-1], label=methods[idx], color=colors[idx])
        ax3.plot(bin_edges_overall[:-1], cdf_overall / cdf_overall[-1], label=methods[idx])
        output[methods[idx]].append(bin_edges_overall[:-1])
        output[methods[idx]].append(cdf_overall)

    plt.legend(loc='upper center', bbox_to_anchor=(-0.7, 1.305), ncol=5)

    if not os.path.exists("Images/cdf_plots/" + dataset):
        os.makedirs("Images/cdf_plots/" + dataset)

    plt.savefig("Images/cdf_plots/" + dataset + "/cdf_" + str(base_learners) + ".png", bbox_inches='tight',
                dpi=200)
    return output
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
# additional imports used further down in this snippet
import numpy as np
import smote_variants as sv
from imblearn import datasets as imb_datasets

random_seed = 3

# ## Preparing the data

# In[2]:

np.random.seed(random_seed)

# In[3]:

libras = imb_datasets.fetch_datasets()['libras_move']
X, y = libras['data'], libras['target']

# In[4]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# ## Fitting a pipeline

# In[5]:

oversampler = sv.MulticlassOversampling(sv.distance_SMOTE())
classifier = KNeighborsClassifier(n_neighbors=5)

# In[6]:
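
# NOTE: the notebook cell with the actual fitting step was cut off above. A
# minimal sketch of "Fitting a pipeline" with the objects defined earlier
# (assuming smote_variants' MulticlassOversampling exposes a sample(X, y)
# method, as in its documented API) could look like this; it is an
# illustration, not the original notebook's code.

X_train_res, y_train_res = oversampler.sample(X_train, y_train)
classifier.fit(X_train_res, y_train_res)
print('Test accuracy: {:.3f}'.format(classifier.score(X_test, y_test)))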