# Over-sample the minority class(es) with SMOTE so the classifier trains
# on a balanced class distribution.
sm = SMOTE(k_neighbors=args.kneighbors,
           random_state=args.randomseed,
           n_jobs=-1)
# fit_sample() was removed in imbalanced-learn 0.6; fit_resample() is the
# supported name and returns the same (X_resampled, y_resampled) pair.
x_resampled, y_resampled = sm.fit_resample(X, y)

# Report the per-class sample counts after over-sampling.
np_resampled_y = np.asarray(np.unique(y_resampled, return_counts=True))
df_resampled_y = pd.DataFrame(np_resampled_y.T, columns=['Class', 'Sum'])
print("\nNumber of samples after over sampling:\n{0}\n".format(
    df_resampled_y))

# Initialize the classifier.
clf = CascadeForestClassifier(random_state=args.randomseed)
print("\nClassifier parameters:")
print(clf.get_params())
print("\nSMOTE parameters:")
print(sm.get_params())
print("\n")

# Train on the SMOTE-resampled data.
clf.fit(x_resampled, y_resampled)
# Predict on the held-out test set.
y_pred = clf.predict(X_test)

# Evaluate on the test set: multi-class report for >2 classes,
# binary report otherwise.
if num_categories > 2:
    model_evaluation(num_categories, y_test, y_pred)
else:
    bi_model_evaluation(y_test, y_pred)
end_time = time.time()  # program end time
print("\n[Finished in: {0:.6f} mins = {1:.6f} seconds]".format(
    ((end_time - start_time) / 60), (end_time - start_time)))
# Example #2
# One-hot encode (binarize) the categorical features.
# OneHotEncoder's `categorical_features` argument was deprecated in
# scikit-learn 0.20 and removed in 0.22; ColumnTransformer is the supported
# way to encode only the selected columns. Its output puts the encoded
# columns first and the passthrough remainder after, matching the layout
# the old `categorical_features=` produced.
from sklearn.compose import ColumnTransformer

OHE = ColumnTransformer(
    [('onehot',
      preprocessing.OneHotEncoder(handle_unknown='ignore'),
      categoricalAttributes)],
    remainder='passthrough',
    # Force a dense ndarray, replacing the old `.todense()` calls
    # (which returned a deprecated np.matrix).
    sparse_threshold=0.0,
).fit(X_train)
X_train = OHE.transform(X_train)
X_test = OHE.transform(X_test)

# Rebalance the class representation of the training set.
# SMOTE's `ratio` kwarg was renamed `sampling_strategy` and `fit_sample`
# was renamed `fit_resample` in imbalanced-learn 0.4; the old names were
# removed in 0.6.
sm = SMOTE(sampling_strategy='minority', random_state=seed, k_neighbors=3)
Xres, Yres = sm.fit_resample(X_train, y_train)

print("Tras el equilibrado con SMOTE:{}".format(sm.get_params()))
# Per-class instance counts in the original train and test splits
# (NOTE(review): this reports y_train/y_test, not the resampled Yres —
# presumably intentional, to show the pre-balancing distribution).
for i in np.unique(y_train):
    print("Número de instancias en la clase {}: {}  {}"
        .format(i, len(np.where(y_train == i)[0]), len(np.where(y_test == i)[0]))
    )


# Create and fit a Random Forest learning model
# for instance and feature selection

rfc = RandomForestClassifier(
    random_state=seed,
    n_estimators = 50,
    n_jobs = -1,
    max_depth = 30,
    min_samples_leaf = 10,