Example #1
parser.add_argument('-n','--n_rounds', help='Number of Boost iterations', type=int, default=2000)
parser.add_argument('-e','--eta', help='Learning rate', type=float, default=0.01)
parser.add_argument('-r','--r_seed', help='Set random seed', type=int, default=3)
parser.add_argument('-b','--minbin', help='Minimum categorical bin size', type=int, default=1)
parser.add_argument('-ct','--cat_trans', help='Category transformation method', type=str, default='std')
parser.add_argument('-cv','--cv', action='store_true')
parser.add_argument('-codetest','--codetest', action='store_true')
parser.add_argument('-getcached', '--getcached', action='store_true')
parser.add_argument('-extra', '--extra', action='store_true')
m_params = vars(parser.parse_args())

# Load data
X, y, X_sub, ids = data.load(m_params)

print("BNP Parabas: AdaBoost...\n")
clf =  AdaBoostClassifier(n_estimators=30, learning_rate=0.001, algorithm='SAMME', random_state=1)

if m_params['cv']:
	# do cross validation scoring
	kf = KFold(n_splits=4, shuffle=True, random_state=1)  # the old KFold(n, n_folds=...) API was removed from scikit-learn
	scr = np.zeros(kf.get_n_splits())
	oob_pred = np.zeros(X.shape[0])

	for i, (tr_ix, val_ix) in enumerate(kf.split(X)):
		clf.fit(X[tr_ix], y[tr_ix])
		pred = clf.predict_proba(X[val_ix])
		oob_pred[val_ix] = np.array(pred[:,1])
		scr[i] = log_loss(y[val_ix], np.array(pred[:,1]))
		print('Validation score is:', scr[i])
	print(log_loss(y, oob_pred))
	print(oob_pred[1:10])
Example #2
X_spatial_norm = norm_scaller.fit_transform(X_spatial, Y)

# combine features
X_norm = np.concatenate([X_norm, X_spatial_norm], axis=-1)
# save features to csv
np.savetxt("features.csv", X_norm, delimiter=",")
np.savetxt("labels.csv", Y, delimiter=",")


# find the optimal value for the Adaboost and Decision Tree
parameters = {
    'n_estimators': [15, 25, 50, 75],
    'learning_rate': [0.5, 0.75],
    'base_estimator__max_depth': [1, 3, 5, 7],
    'base_estimator__max_features': [.5],
    'base_estimator__max_leaf_nodes': [3, 5, 7],
}
# base estimator for AdaBoost
base_estimator = DecisionTreeClassifier(criterion="entropy",
                                        class_weight="balanced",
                                        random_state=0)
base_model = AdaBoostClassifier(base_estimator=base_estimator,
                                random_state=0)
clf = GridSearchCV(base_model, parameters)
clf.fit(X_norm, Y)
best_params = clf.best_params_


acc = []
roc_auc_values = []
mcc = []
# perform k-fold cross validation
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
fold = 0

for train_index, test_index in kf.split(X_norm, Y):
    X_train = X_norm[train_index]
    X_test = X_norm[test_index]
Example #3
# use transform (not fit_transform) so the test set reuses the training-set scaling
y_logistic_regression = logistic_regression.predict(standard_scale.transform(x_test))

print(accuracy_score(y_test, y_logistic_regression))
print(precision_score(y_test, y_logistic_regression))
print(recall_score(y_test, y_logistic_regression))
print(f1_score(y_test, y_logistic_regression))


# # Adaboost

# In[40]:


adaboost_classifier=AdaBoostClassifier(
    RandomForestClassifier(), n_estimators=200, algorithm="SAMME.R", learning_rate=0.5
)

adaboost_classifier.fit(x_train_sm, y_train_sm)


# In[41]:


# adaboost metrics

y_train_pred_adaboost=cross_val_predict(adaboost_classifier, x_train_sm, y_train_sm, cv=5)


# In[42]:
Example #4
# convert 0/1 labels to -1/+1 (the loop header was cut off in the source; restored from context)
for i in range(data.shape[0]):
    if data[i, -1] == 0:
        data[i, -1] = -1

train_data = data[0:5000, :]
test_data = data[5000:, :]
train_x = train_data[:, 0:-1]
train_y = train_data[:, -1]
test_x = test_data[:, 0:-1]
test_y = test_data[:, -1]

# train_index_list=[]
# test_index_list=[]
# kf=KFold(n_splits=5,shuffle=False)
# for train_index,test_index in kf.split(train_x):
#     train_index_list.append(train_index)
#     test_index_list.append(test_index)
#
# print(test_index_list[3])
#
# train_data=train_x[test_index_list[3],:]

# uniform initial sample weights (this matches AdaBoost's default initialisation)
w = np.ones(train_y.shape)
weight = w / train_y.shape[0]

estimator = AdaBoostClassifier()
estimator.fit(train_x, train_y, sample_weight=weight)

predict_y = estimator.predict(test_x)
score = estimator.score(test_x, test_y)
print(score)
Example #5
encoded_test_data[categorical_variables] = encoded_test_data[
    categorical_variables].apply(lambda x: d[x.name].transform(x))

independent_variables = [
    x for x in train_data.columns
    if x not in ['victim_id', 'datetime', 'criticality']
]
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score


def scorer(estimator, X, y):
    y1 = np.array(estimator.predict(X))
    score = roc_auc_score(y, y1)
    return score


from sklearn.ensemble import AdaBoostClassifier

adam = AdaBoostClassifier(learning_rate=2, n_estimators=48, random_state=0)

adam.fit(encoded_train_data[independent_variables],
         encoded_train_data['criticality'])
test_predictions = adam.predict(encoded_test_data[independent_variables])
victim_id = test_data['victim_id']
submission = pd.DataFrame({
    'victim_id': victim_id,
    'criticality': test_predictions
})
submission.to_csv('dataquest_submission4.csv', index=False)
Example #6
pipe1 = Pipeline([('pca', PCA()), ('classifier', GaussianNB())])
param = {'pca__n_components': [4, 5, 6]}
gsv = GridSearchCV(pipe1, param_grid=param, n_jobs=2, scoring='f1', cv=2)
gsv.fit(features_train, labels_train)
clf = gsv.best_estimator_
print("GausianNB with PCA fitting time: %rs" % round(time() - t0, 3))
pred = clf.predict(features_test)

t0 = time()
test_classifier(clf, my_dataset, financial_features, folds=1000)
print("GausianNB  evaluation time: %rs" % round(time() - t0, 3))
'''
Adaboost tuned for comparison with final algorithm
'''
from sklearn.tree import DecisionTreeClassifier
abc = AdaBoostClassifier(random_state=40)
data = featureFormat(my_dataset, financial_features, sort_keys=True)
labels, features = targetFeatureSplit(data)
dt = []
for i in range(6):
    dt.append(DecisionTreeClassifier(max_depth=(i + 1)))
ab_params = {'base_estimator': dt, 'n_estimators': [60, 45, 101, 10]}
t0 = time()
abt = GridSearchCV(
    abc,
    ab_params,
    scoring='f1',
)
abt = abt.fit(features_train, labels_train)
print("AdaBoost fitting time: %rs" % round(time() - t0, 3))
abc = abt.best_estimator_
Example #7
n_classes = 3
n_estimators = 30
cmap = plt.cm.RdYlBu
plot_step = 0.02  # fine step width for decision surface contours
plot_step_coarser = 0.5  # step widths for coarse classifier guesses
RANDOM_SEED = 13  # fix the seed on each iteration

# Load data
iris = load_iris()

plot_idx = 1

models = [DecisionTreeClassifier(max_depth=None),
          RandomForestClassifier(n_estimators=n_estimators),
          ExtraTreesClassifier(n_estimators=n_estimators),
          AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                             n_estimators=n_estimators)]

for pair in ([0, 1], [0, 2], [2, 3]):
    for model in models:
        # We only take the two corresponding features
        X = iris.data[:, pair]
        y = iris.target

        # Shuffle
        idx = np.arange(X.shape[0])
        np.random.seed(RANDOM_SEED)
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]

        # Standardize
Example #8
num_folds = 10
seed = 7
scoring = 'accuracy'
validation_size = 0.20
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('ADA', AdaBoostClassifier()))
models.append(('GBC', GradientBoostingClassifier()))
models.append(('RFC', RandomForestClassifier()))
models.append(('ETC', ExtraTreesClassifier()))
models.append(('SVM', SVC()))
results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)  # random_state requires shuffle=True
    cv_results = cross_val_score(model,
                                 X_train,
                                 Y_train,
                                 cv=kfold,
                                 scoring=scoring)
    results.append(cv_results)
    names.append(name)
Example #9
    rf_train_acc, rf_test_acc))
print("Precision score: ", precision_score(Y_test, predictions))
print("Recall score: ", recall_score(Y_test, predictions))
print("F1 score : ", rf_f1_score)

confusion_matrix(Y_test, predictions)
"""Model#3: Ada Boost Classifier"""

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
tree = DecisionTreeClassifier(random_state=11,
                              max_features="sqrt",  # "auto" (the original value) was removed in newer scikit-learn; "sqrt" is its equivalent for classifiers
                              class_weight="balanced",
                              max_depth=None)

model_ada = AdaBoostClassifier(base_estimator=tree)
model_ada = model_ada.fit(sequences_matrix, Y_train)
predictions = model_ada.predict(test_sequences_matrix)

ada_train_acc = accuracy_score(Y_train, model_ada.predict(sequences_matrix))
ada_test_acc = accuracy_score(Y_test, predictions)
ada_f1_score = f1_score(Y_test, predictions)  # y_true comes first
print("Accuracy score: \n a) Train : {}\n b) Test : {}".format(
    ada_train_acc, ada_test_acc))
print("Precision score: ", precision_score(Y_test, predictions))
print("Recall score: ", recall_score(Y_test, predictions))
print("F1 score : ", ada_f1_score)

confusion_matrix(Y_test, predictions)
"""Model#4: Recurrent Neural Networks"""
Example #10
def learn(fname):
    data = pd.read_csv(fname, encoding='utf-8')
    # shuffle data
    data = data.sample(frac=1).reset_index(drop=True)

    for idx, row in data.iterrows():
        if row["content"] is np.nan:
            data.drop(idx, inplace=True)
        elif isinstance(row["content"], str):
            data.at[idx, "content"] = clean_content(row["content"])  # .set_value() was removed from pandas

    # note: X holds the labels and Y the features here (the usual naming is reversed)
    X = data["label"].to_numpy()  # .as_matrix() was removed from pandas
    del data["label"]
    Y = data.to_numpy()

    X_train, X_test, y_train, y_test = train_test_split(Y,
                                                        X,
                                                        test_size=0.20,
                                                        random_state=42)

    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X_train[:, 4])
    print(X_train_counts.shape)
    X_test_counts = count_vect.transform(X_test[:, 4])
    print(X_test_counts.shape)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)
    print(X_train_tfidf.shape)
    print(X_test_tfidf.shape)

    # add punctuation features
    mnb_clf = MultinomialNB().fit(X_train_tfidf, y_train)
    ada_clf = AdaBoostClassifier(n_estimators=100).fit(X_train_tfidf, y_train)
    text_clf = SGDClassifier(loss='hinge',
                             penalty='l2',
                             alpha=1e-3,
                             max_iter=5,  # n_iter was renamed to max_iter in scikit-learn
                             random_state=42)
    svm_clf = text_clf.fit(X_train_tfidf, y_train)

    joblib.dump(tfidf_transformer, 'tfidf_transformer.pkl')
    joblib.dump(count_vect, 'count_vect.pkl')
    joblib.dump(mnb_clf, 'mnb_clf.pkl')
    joblib.dump(svm_clf, 'svm_clf.pkl')
    joblib.dump(ada_clf, 'ada_clf.pkl')

    predicted = mnb_clf.predict(X_test_tfidf)
    ada_predictions = ada_clf.predict(X_test_tfidf)
    svm_predictions = svm_clf.predict(X_test_tfidf)

    ada_score = np.mean(ada_predictions == y_test)
    mnb_score = np.mean(predicted == y_test)
    svm_score = np.mean(svm_predictions == y_test)

    print("MNB: ", mnb_score)
    print("ADA: ", ada_score)
    print("SVM: ", svm_score)

    sketchy_score = (mnb_score + ada_score + svm_score) / 3.0
    print("Sketchy score: ", sketchy_score)
Example #11
Mem_Ext = memory_usage_psutil()
print("Extra Trees Memory usage: ", Mem_Ext)

Cpu_Ext = psutil.cpu_percent()
print("Extra Trees Cpu Percent: ", Cpu_Ext)

# In[16]:

#Adaboost

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
AdaB_model = AdaBoostClassifier(
    RandomForestClassifier(n_estimators=100,
                           n_jobs=-1,
                           criterion='gini',
                           class_weight='balanced'))
AdaB_model = AdaB_model.fit(Train_SVD, Y_train)
pred_Adab = AdaB_model.predict(Test_SVD)

Acc_Adab = accuracy_score(Y_test, pred_Adab)
print("Adaboost accuracy =", Acc_Adab)

F1_Adab = f1_score(Y_test, pred_Adab, average='micro')
print("Adaboost F-1 score(micro) = ", F1_Adab)

F1W_Adab = f1_score(Y_test, pred_Adab, average='weighted')
print("Adaboost F-1 score(weighted) = ", F1W_Adab)

Time_Adab = time.perf_counter() - start_time
Example #12
    X = Z
try:
    os.remove('.'.join(args.file) + '.' + args.model + '.predict.xml')
except OSError:
    pass

dt = DecisionTreeClassifier(min_samples_split=20, random_state=99)
svr = SVR(kernel='poly', C=1e3, degree=2)
reg = linear_model.Lasso(alpha=0.1)
rf = RandomForestClassifier(max_depth=2, random_state=0)
ab = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                        n_estimators=600,
                        learning_rate=1.5,
                        algorithm="SAMME")
gnb = MultinomialNB()  #GaussianNB()
lrg = LogisticRegression()
model = dt
if args.model == 'lrg':
    model = lrg
elif args.model == 'svr':
    model = svr
elif args.model == 'rf':
    model = rf
elif args.model == 'av':
    model = ab  # was "model == ab", a no-op comparison that never selected AdaBoost
elif args.model == 'gnb':
    model = gnb
if args.semi == 'svr' and len(args.file) == 2:  # one labeled and one unlabeled
Example #13
File: Adaboost.py  Project: Merajul/Python

from sklearn.model_selection import KFold
from sklearn.ensemble import AdaBoostClassifier

cv = 5
kf = KFold(n_splits=cv, shuffle=True)


for train_index, test_index in kf.split(feature, target):
    X_train, X_test = feature[train_index], feature[test_index]
    y_train, y_test = target[train_index], target[test_index]

    clf4 = AdaBoostClassifier()
    clf4.fit(X_train, y_train)

    y_train_pred4 = clf4.predict(X_train)
    y_pred4 = clf4.predict(X_test)

    prec4, rec4, f14, acc_train4, acc_test4 = getScore(y_test, y_pred4, y_train_pred4)

    prec_sum4 = prec_sum4 + prec4
    rec_sum4 = rec_sum4 + rec4
    f14_sum = f14_sum + f14
    sum_acc_train4 = sum_acc_train4 + acc_train4
    sum_acc_test4 = sum_acc_test4 + acc_test4
    "Standard Scaler", "Normal Scaler", "MinMaxScaler", "MaxAbsScaler",
    "Kernel Centerer"
]

for preprocess, name in zip(preprocessors, preprocessors_type):

    print("-------------------------------------\n")
    print("For Preprocessor : ", name)
    print("--------------------------------------\n")

    data = preprocess.fit_transform(forestFrame.values)
    # sklearn.cross_validation was removed; model_selection.train_test_split is the replacement
    train_data, test_data, train_labels, test_labels = train_test_split(
        data, target_labels.values, test_size=0.3)

    rf = RandomForestClassifier(n_estimators=101)
    ada = AdaBoostClassifier(n_estimators=101)
    bagging = BaggingClassifier(n_estimators=101)
    gradBoost = GradientBoostingClassifier(n_estimators=101)

    classifiers = [rf, ada, bagging, gradBoost]
    classifier_names = [
        "Random Forests", "Adaboost", "Bagging", "Gradient Boost"
    ]

    for classifier, classifier_name in zip(classifiers, classifier_names):

        classifier.fit(train_data, train_labels)
        predicted_labels = classifier.predict(test_data)
        print "----------------------------------\n"
        print "Accuracy for ", classifier_name, " : ", metrics.accuracy_score(
            test_labels, predicted_labels)
Example #15
y_test = y_test[:, 0].reshape(-1)
print("BAGGING")
from sklearn.ensemble import BaggingClassifier
bg = BaggingClassifier(clf, max_samples=0.5, max_features=1.0, n_estimators=20)
bg.fit(X_train, y_train)
y_pred = bg.predict(X_test)
polat = accuracy_score(y_test, y_pred)
print("Accuracy:", polat)
# ADABOOST
# cast to integer
X_train = X_train.astype("int")
X_test = X_test.astype("int")
y_train = y_train.astype("int")
y_test = y_test.astype("int")
print("ADABOOST")
clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=10, learning_rate=0.01)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
memati = accuracy_score(y_test, y_pred)
print("Accuracy:", memati)
'''# Create Decision Tree classifier object
clf = DecisionTreeClassifier(criterion="entropy", max_depth=3)

# Train Decision Tree Classifier
clf = clf.fit(X_train, y_train)

# Predict the response for the test dataset
y_pred = clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO
'''
Example #16
# initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast")
plt.scatter(bumpy_slow, grade_slow, color="r", label="slow")  # keep bumpiness on x and grade on y, matching the axis labels
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()


# your code here!  name your classifier object clf if you want the
# visualization code (prettyPicture) to show you the decision boundary


clf = AdaBoostClassifier(n_estimators=50)

clf.fit(features_train, labels_train)

pred = clf.predict(features_test)

acc = accuracy_score(labels_test, pred)  # y_true comes first

print(clf.score(features_test, labels_test))
print(f'Accuracy: {acc}')

try:
    prettyPicture(clf, features_test, labels_test)
except NameError:
    pass
Example #17
# hf.lasso_selection(features, labels, features_list)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
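### The Pipeline note above can be made concrete with a minimal sketch; the PCA
### step and its n_components value are illustrative assumptions, not part of
### this project's code:
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline

# chain a dimensionality-reduction step and a classifier so both are fit together
pca_pipe = Pipeline(steps=[('pca', PCA(n_components=5)),
                           ('ada', AdaBoostClassifier(random_state=0))])
# pca_pipe.fit(features_train, labels_train) would then fit PCA on training data only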
random_seed = 1303
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=random_seed)
# the old StratifiedShuffleSplit(labels, n_iter=...) signature was removed;
# the splitter object below works directly as GridSearchCV's cv argument
cv = StratifiedShuffleSplit(n_splits=20,
                            test_size=0.5,
                            random_state=random_seed)

ada = AdaBoostClassifier(random_state=random_seed)
selector = RFE(ada, step=1)
pipe_ada = Pipeline(steps=[('RFE', selector), ('ada', ada)])
params_ada_gs = {
    "RFE__n_features_to_select": np.arange(11, 15, 2),
    "ada__learning_rate": np.arange(0.3, 0.7, 0.2),
    "ada__n_estimators": [50, 100]
}

# pipe_ada= Pipeline(steps=[('RFE', selector), ('ada', ada)])
# params_ada_gs = {"RFE__n_features_to_select": [15],
#                  "ada__learning_rate" : [0.5],
#                  "ada__n_estimators" : [50]
#                 }

gs = GridSearchCV(pipe_ada, params_ada_gs, scoring='f1', cv=cv)
Example #18
File: current.py  Project: skazi019/NLP
#%%
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import numpy as np

#%%
data = pd.read_csv('spambase.data').to_numpy()  # .as_matrix() was removed from pandas
np.random.shuffle(data)

X = data[:, :48]
y = data[:, -1]

#%%
Xtrain = X[:-100]
ytrain = y[:-100]
Xtest = X[-100:]
ytest = y[-100:]

#%%
model = MultinomialNB()
model.fit(Xtrain, ytrain)
print("\nAccuracy for NB: ", model.score(Xtest, ytest))

#%%
from sklearn.ensemble import AdaBoostClassifier

#%%
model = AdaBoostClassifier()
model.fit(Xtrain, ytrain)
print("Accuracy for Adaboost is: ", model.score(Xtest, ytest))
Example #19
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)  # random_state requires shuffle=True
cart = DecisionTreeClassifier()
num_trees = 100
model = BaggingClassifier(base_estimator=cart,
                          n_estimators=num_trees,
                          random_state=7)
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

# AdaBoost Classification

from sklearn.ensemble import AdaBoostClassifier
seed = 7
num_trees = 70
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

# Voting Ensemble for Classification

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

# create the sub models
estimators = []
model1 = LogisticRegression(solver='lbfgs', max_iter=10000)
Example #20
names = [
    "Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
    "Decision Tree", "Random Forest", "Neural Net", "AdaBoost", "Naive Bayes",
    "QDA"
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()
]

X, y = make_classification(n_features=2,
                           n_redundant=0,
                           n_informative=2,
                           random_state=1,
                           n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [
    make_moons(noise=0.3, random_state=0),
Example #21
from sklearn.feature_selection import SelectKBest, chi2
from imblearn.pipeline import Pipeline

from customer_review_API_lib import run_test_predictions

DIRECTORY = 'C:/Users/bergj/Documents/Geroge Mason/Courses/2019-Spring/GMU- CS 584/FinalProject/data/'

run_test_predictions(
    # the toys/apparel file names were swapped in the original call
    toys_file='{}{}'.format(DIRECTORY, 'amazon_reviews_us_Toys_v1_00.tsv'),
    apparel_file='{}{}'.format(DIRECTORY, 'amazon_reviews_us_Apparel_v1_00.tsv'),
    min_words=12,
    n_reviews=200000,
    pipeline=Pipeline([('tfidf',
                        TfidfVectorizer(norm='l2',
                                        max_df=0.6,
                                        min_df=75,
                                        ngram_range=(1, 1),
                                        stop_words='english')),
                       ('feature-extract', SelectKBest(chi2, k=20)),
                       ('clf',
                        AdaBoostClassifier(base_estimator=LogisticRegression(
                            C=0.1, class_weight='balanced',
                            solver='liblinear'),
                                           n_estimators=10,
                                           learning_rate=5))]),
    gridsearch_args=dict(param_grid={
        'clf__n_estimators': [2, 5, 10, 15],
        'clf__learning_rate': [0.1, 1, 5]
    },
                         scoring='f1'))
Example #22
        clf_mlp.fit(X_train, y_train)
        end_time = time.time() - start_time
        print(end_time)
        print("Evaluation time")
        start_time = time.time()
        predictions = clf_mlp.predict(X_test)
        end_time = time.time() - start_time
        print(end_time)
        print(classification_report(y_test, predictions))
        joblib.dump(
            clf_mlp, 'datasetB_results/' + 'MLP_logistic_sgd_' + method + '-' +
            size + '.joblib.pkl')

        from sklearn.ensemble import AdaBoostClassifier
        print("AdaBoostClassifier")
        clf_ada = AdaBoostClassifier()
        print("Training classifier")
        start_time = time.time()
        clf_ada.fit(X_train, y_train)
        end_time = time.time() - start_time
        print(end_time)
        print("Evaluation time")
        start_time = time.time()
        predictions = clf_ada.predict(X_test)
        end_time = time.time() - start_time
        print(end_time)
        print(classification_report(y_test, predictions))
        joblib.dump(
            clf_ada, 'datasetB_results/' + 'AdaBoostClassifier_' + method +
            '-' + size + '.joblib.pkl')
Example #23
})

# appending our result table (DataFrame.append was removed in pandas 2.x;
# this assumes Bagging_Meta_estimator is a pandas Series, like the metrics objects below)
result_tabulation = pd.concat(
    [result_tabulation, Bagging_Meta_estimator.to_frame().T], ignore_index=True)

# view the result table
result_tabulation

# In[183]:

# Adaboost
from sklearn.ensemble import AdaBoostClassifier

# build the model
adaboost = AdaBoostClassifier(random_state=10)
# fit the model
adaboost.fit(X_train, y_train)

# In[184]:

# predict the values
y_pred_adaboost = adaboost.predict(X_test)

# In[185]:

adaboost_metrics = pd.Series({
    'Model':
    "AdaBoost",
    'AUC Score':
    metrics.roc_auc_score(y_test, y_pred_adaboost),
Example #24
    
    
    start_time = time.time()
    print("Training the model...")
    model.fit(x_train, t_train)  # train the model

    end_time = time.time()
    print("Training finished! Elapsed:", end_time - start_time, "s")

    joblib.dump(model, save_file)  # save the model

if __name__ == "__main__":
    # adaboost
    n_est = 20
    print("Number of weak classifiers:", n_est)

    model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2, min_samples_split=20, min_samples_leaf=5),
                               algorithm="SAMME", n_estimators=n_est, learning_rate=0.5)

    dataset_dir = os.path.abspath(os.path.join(os.getcwd(), "..")) + "/model"
    save_dir = dataset_dir + '/model.pkl'

    if os.path.isfile(save_dir):  # a trained model already exists
        model = joblib.load(save_dir)  # load the model
    else:
        train_mnist(model, save_dir)  # train the model

    (x_train, t_train), (x_test, t_test) = load_mnist()  # load the training and test sets
    print("Test-set accuracy:", model.score(x_test, t_test))
Example #25
  RandomForestClassifier(criterion='entropy',
                         min_samples_split=5,
                         random_state=24)),
 ('Random Forest 2',
  RandomForestClassifier(criterion='entropy',
                         max_depth=20,
                         random_state=24)),
 ('Random Forest 3',
  RandomForestClassifier(criterion='entropy',
                         min_samples_split=20,
                         random_state=24)),
 ('Random Forest 4',
  RandomForestClassifier(criterion='entropy',
                         min_samples_split=50,
                         random_state=24)),
 ('AdaBoost', AdaBoostClassifier(n_estimators=100,
                                 random_state=24)),
 ('Perceptron',
  CalibratedClassifierCV(Perceptron(max_iter=50,
                                    tol=-np.infty,
                                    random_state=24),
                         cv=10,
                         method='isotonic')),
 ('Perceptron 2',
  CalibratedClassifierCV(Perceptron(max_iter=100,
                                    tol=-np.infty,
                                    random_state=24),
                         cv=10,
                         method='isotonic')),
 ('KNeighbors Classifier', KNeighborsClassifier(n_neighbors=5)),
 ('KNeighbors Classifier 2', KNeighborsClassifier(n_neighbors=2)),
 ('Multi-Layer Perceptron', MLPClassifier(random_state=24))]
Example #26
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier

# imports
from sklearn.model_selection import cross_val_score

from __init__ import write_log

log_file = "test_cnn_9_1.log"
# load the data: the iris dataset bundled with sklearn
iris = load_iris()
write_log(str(iris), file=log_file)
"""
AdaBoostClassifier参数解释
base_estimator:弱分类器,默认是CART分类树:DecisionTressClassifier
algorithm:在scikit-learn实现了两种AdaBoost分类算法,即SAMME和SAMME.R,
           SAMME就是原理篇介绍到的AdaBoost算法,指Discrete AdaBoost
           SAMME.R指Real AdaBoost,返回值不再是离散的类型,而是一个表示概率的实数值,算法流程见后文
                            两者的主要区别是弱分类器权重的度量,SAMME使用了分类效果作为弱分类器权重,SAMME.R使用了预测概率作为弱分类器权重。
           SAMME.R的迭代一般比SAMME快,默认算法是SAMME.R。因此,base_estimator必须使用支持概率预测的分类器。
loss:这个只在回归中用到,不解释了
n_estimator:最大迭代次数,默认50。在实际调参过程中,常常将n_estimator和学习率learning_rate一起考虑
learning_rate:每个弱分类器的权重缩减系数v。f_k(x)=f_{k-1}*a_k*G_k(x)。较小的v意味着更多的迭代次数,默认是1,也就是v不发挥作用。
另外的弱分类器的调参,弱分类器不同则参数不同,这里不详细叙述
"""
# 构建模型
clf = AdaBoostClassifier(n_estimators=100)  # 弱分类器个数设为100
scores = cross_val_score(clf, iris.data, iris.target)
print(scores.mean())
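# To see the SAMME / SAMME.R difference described above, here is a minimal sketch;
# the decision-stump base estimator and the comparison loop are illustrative, not
# from the original snippet, and recent scikit-learn releases deprecate SAMME.R:
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
stump = DecisionTreeClassifier(max_depth=1)  # supports predict_proba, as SAMME.R requires
for algorithm in ("SAMME", "SAMME.R"):
    clf_cmp = AdaBoostClassifier(stump, n_estimators=100, algorithm=algorithm)
    scores = cross_val_score(clf_cmp, iris.data, iris.target)
    print(algorithm, scores.mean())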
Example #27
y = df[DEPENDENT_VARIABLE]
X = df.drop(DEPENDENT_VARIABLE, axis=1)


from sklearn.tree import DecisionTreeClassifier
basetree = DecisionTreeClassifier(criterion="entropy")
from sklearn.feature_selection import RFE
rfe = RFE(basetree)

rfe.fit(X,y)

rfe.ranking_

rankdf = pd.DataFrame({"rank": rfe.ranking_, "feature": X.columns})

rankdf[rankdf["rank"] == 1]
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import RandomizedSearchCV , cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier , AdaBoostClassifier


X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

basetree = DecisionTreeClassifier(criterion="gini", min_samples_split=0.4)
clf = AdaBoostClassifier(n_estimators=50, learning_rate=0.5)

cross_val_score(clf, X, y, scoring="roc_auc")
Example #28
# scale the data to [-1, 1]
scaler = MinMaxScaler(feature_range=(-1, 1))
# transform the data
rescaledX = scaler.fit_transform(traindata)
# set the print precision
set_printoptions(precision=3)
print(rescaledX)
# hyperparameter tuning
num_folds = 5
seed = 7
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)  # random_state requires shuffle=True
from sklearn.model_selection import GridSearchCV
scoring = 'accuracy'
param_grid = {'n_estimators': [10, 30, 50, 70, 90, 100]}
model = AdaBoostClassifier(LogisticRegression(C=1000,
                                              multi_class='multinomial',
                                              solver='lbfgs'),
                           algorithm='SAMME.R')
# LogisticRegression(), algorithm='SAMME.R'
grid = GridSearchCV(estimator=model,
                    param_grid=param_grid,
                    scoring=scoring,
                    cv=kfold)
grid_result = grid.fit(X=rescaledX, y=trainlabel)

print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, param in cv_results:
    print('%f (%f) with %r' % (mean, std, param))
Example #29
    msg = "Taxa de acerto do vencedor entre os dois algoritmos no mundo real: {0}".format(
        taxa_de_acerto)
    print(msg)


resultados = {}

from sklearn.naive_bayes import MultinomialNB
modeloMultinomial = MultinomialNB()
resultadoMultinomial = fit_and_predict("MultinomialNB", modeloMultinomial,
                                       treino_dados, treino_marcacoes)
resultados[resultadoMultinomial] = modeloMultinomial

from sklearn.ensemble import AdaBoostClassifier
modeloAdaBoost = AdaBoostClassifier(
    random_state=0)  # removes randomness (always the same result)
resultadoAdaBoost = fit_and_predict("AdaBoostClassifier", modeloAdaBoost,
                                    treino_dados, treino_marcacoes)
resultados[resultadoAdaBoost] = modeloAdaBoost

# One-vs-Rest algorithm using LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
modeloOneVsRest = OneVsRestClassifier(
    LinearSVC(random_state=0))  # removes randomness (always the same result)
resultadoOneVsRest = fit_and_predict("OneVsRest", modeloOneVsRest,
                                     treino_dados, treino_marcacoes)
resultados[resultadoOneVsRest] = modeloOneVsRest

# One-vs-One algorithm (every pair of categories is tested against each other)
from sklearn.multiclass import OneVsOneClassifier
Example #30
if len(inpgrade1.columns) == 2:
	grade = load_data("ml_scripts/data/" + course + "/MasterTrainingData1.csv")
	X = grade[['Homework 1', 'Quiz 1 ']].values
	scaler = StandardScaler().fit(X)
	X = scaler.transform(X)
	y = grade[["Grade"]].values.ravel()
	y1 = []
	for label in y:
		if label == "Good":
			y1.append(0)
		if label == "OK":
			y1.append(1)
		if label == "High-risk":
			y1.append(2)
	# model = MLPClassifier(random_state=0, hidden_layer_sizes=(7, 20), alpha=0.0001, solver='lbfgs',max_iter=200, learning_rate = 'adaptive')
	model = AdaBoostClassifier(random_state=0, n_estimators=1000)
	model.fit(X, y1)
	chosenModels[0] = model
	
elif len(inpgrade1.columns) == 5:
	grade = load_data("ml_scripts/data/" + course + "/MasterTrainingData2.csv")
	X = grade[['Quiz 1 ', 'Quiz 2 ', 'Quiz 3', 'Homework 1', 'Homework 2']].values
	y = grade[["Grade"]].values.ravel()
	y1 = []
	for label in y:
		if label == "Good":
			y1.append(0)
		if label == "OK":
			y1.append(1)
		if label == "High-risk":
			y1.append(2)