Example #1
import math

import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss


def learn(learning_rate, X_train, y_train, X_test, y_test):
	model = GradientBoostingClassifier(
		n_estimators=250,
		verbose=True,
		random_state=241,
		learning_rate=learning_rate
		)
	model.fit(X_train, y_train)
	
	# plot scores
	test_score = [0.0] * 250
	train_score = [0.0] * 250

	for i, predictions in enumerate(model.staged_decision_function(X_test)):
		predictions = [x[0] for x in predictions.tolist()] # flatten the (n_samples, 1) decision values
		predictions = [1/(1 + math.exp(-x)) for x in predictions]
		test_score[i] = log_loss(y_test, predictions)

	for i, predictions in enumerate(model.staged_decision_function(X_train)):
		predictions = [x[0] for x in predictions.tolist()] # flatten the (n_samples, 1) decision values
		predictions = [1/(1 + math.exp(-x)) for x in predictions]
		train_score[i] = log_loss(y_train, predictions)

	plt.figure()
	plt.plot(test_score, 'r', linewidth=2)
	plt.plot(train_score, 'g', linewidth=2)
	plt.legend(['test', 'train'])
	plt.show()
	
	return train_score, test_score
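The snippet above only defines learn(); a minimal usage sketch follows, assuming the gbm-data.csv file and the 0.8/241 split used by the other examples (the driver itself is not part of the original snippet).

import pandas as pd
from sklearn.model_selection import train_test_split

# hypothetical driver: load the data, split it, and call learn() defined above
data = pd.read_csv('gbm-data.csv').values
X, y = data[:, 1:], data[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)
train_score, test_score = learn(0.2, X_train, y_train, X_test, y_test)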
Example #2
File: main.py Project: tz3/sandbox
test_deviance = {}


def sigmoid(y_pred):
    return 1 / (1 + math.e ** (-y_pred))


learning_rates = [1, 0.5, 0.3, 0.2, 0.1]
for learning_rate in learning_rates:
    model = GradientBoostingClassifier(n_estimators=250, verbose=True, random_state=241, learning_rate=learning_rate)
    model.fit(X_train, y_train)

    # compute test set deviance
    test_deviance[learning_rate] = np.zeros((250,), dtype=np.float64)

    for i, y_pred in enumerate(model.staged_decision_function(X_test)):
        # log_loss expects probabilities, so pass the decision values through a sigmoid
        test_deviance[learning_rate][i] = log_loss(y_test, sigmoid(y_pred))

    plt.plot((np.arange(test_deviance[learning_rate].shape[0]) + 1)[::5], test_deviance[learning_rate][::5],
             '-', label='learning_rate = {}'.format(learning_rate))
plt.legend(loc='upper left')
plt.xlabel('Boosting Iterations')
plt.ylabel('Test Set Deviance')

plt.show()

# 3. Starting from some iteration, how should the quality curve on the test set be
# characterized: overfitting or underfitting?
# Answer with one of the words overfitting or underfitting.
print('overfitting')
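As a rough, hedged illustration of why the answer is overfitting: the test log-loss reaches a minimum and then grows again, which one could check directly on the test_deviance curves computed above.

# sketch only: compare the tail of a test curve with its minimum
curve = test_deviance[0.2]             # e.g. the curve for learning_rate=0.2
best_iter = int(np.argmin(curve))      # iteration with the lowest test log-loss
print('overfitting' if curve[-1] > curve[best_iter] else 'underfitting')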
Example #3
X = df.iloc[:, 1:].values
y = df.iloc[:, 0].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)

for l in [1, 0.5, 0.3, 0.2, 0.1]:
    cf = GradientBoostingClassifier(n_estimators=250,
                                    verbose=True, random_state=241, learning_rate=l)
    cf.fit(X_train, y_train)

    train_loss = []
    test_loss = []

    # log loss for train set
    for stage, array in enumerate(cf.staged_decision_function(X_train)):
        # apply sigmoid function
        transformed = []
        for row in array:
            transformed.append(float(1) / (1+np.exp(-row[0])))
        # calculate metric
        score = log_loss(y_train, transformed)
        train_loss.append(score)

    # log loss for test set
    for stage, array in enumerate(cf.staged_decision_function(X_test)):
        # apply sigmoid function
        transformed = []
        for row in array:
            transformed.append(float(1) / (1+np.exp(-row[0])))
        # calculate metric
        score = log_loss(y_test, transformed)
        test_loss.append(score)
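The manual sigmoid loop above can also be written with staged_predict_proba, which already yields class probabilities at each stage; a hedged, roughly equivalent sketch for the test loss:

# sketch: staged_predict_proba returns (n_samples, 2) probabilities per stage
test_loss_alt = []
for proba in cf.staged_predict_proba(X_test):
    test_loss_alt.append(log_loss(y_test, proba[:, 1]))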
Example #4
from sklearn.ensemble import RandomForestClassifier


clfR = RandomForestClassifier(n_estimators=250,verbose=True, random_state=241)
clfR.fit(X_train,y_train)
print(log_loss(y_test, clfR.predict_proba(X_test)))

########################################


clf = GradientBoostingClassifier(n_estimators=250,verbose=True, random_state=241,learning_rate = 0.2)
clf.fit(X_train,y_train)

sdf = []
for k, y_pred in enumerate(clf.staged_decision_function(X_test)):
    # sigmoid-transform the decision values of stage k
    sdf.append([1 / (1 + math.exp(-x)) for x in y_pred.ravel()])

for k, probs in enumerate(sdf):
    print(str(k) + " " + str(log_loss(y_true=y_test, y_pred=probs)))

a = []
for probs in sdf:
    a.append(log_loss(y_true=y_test, y_pred=probs))
print(min(a))
Example #5
data = pandas.read_csv('gbm-data.csv').values
y=data[:,0]
X=data[:,1:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=24)



train_loss_learning_rate=[]
test_loss_learning_rate=[]

for learning_rate in [1, 0.5, 0.3, 0.2, 0.1]:
    GBC=GradientBoostingClassifier(learning_rate=learning_rate, n_estimators=250, verbose=True, random_state=241)
    GBC.fit(X_train,y_train)
    train_loss=[]
    test_loss=[]
    for i, y in enumerate(GBC.staged_decision_function(X_train)):
        y_ = 1 / (1 + np.exp(-y))
        train_loss.append(log_loss(y_train,y_))
    train_loss_learning_rate.append(train_loss)
    for i, y in enumerate(GBC.staged_decision_function(X_test)):
        y_ = 1 / (1 + np.exp(-y))
        test_loss.append(log_loss(y_test,y_))
    test_loss_learning_rate.append(test_loss)

a=np.array(train_loss_learning_rate)
b=np.array(test_loss_learning_rate)

farbe = ['orange','turquoise', 'blue','gray','magenta']
learning_rate = [1, 0.5, 0.3, 0.2, 0.1]

plt.figure(num=1)
Example #6
X_train, X_test, y_train, y_test = train_test_split(feature,
                                                    target,
                                                    test_size=0.8,
                                                    random_state=241)

learning_rate = [1, 0.5, 0.3, 0.2, 0.1]
loss_train = []
loss_test = []

gbc = GradientBoostingClassifier(n_estimators=250,
                                 verbose=True,
                                 random_state=241,
                                 learning_rate=0.2)
gbc.fit(X_train, y=y_train)
score_train = gbc.staged_decision_function(X_train)
score_test = gbc.staged_decision_function(X_test)
for pred in score_train:
    loss_train.append(
        log_loss(y_train,
                 [1 / (1 + math.exp(-y_pred)) for y_pred in pred]))
for pred in score_test:
    loss_test.append(
        log_loss(y_test,
                 [1 / (1 + math.exp(-y_pred)) for y_pred in pred]))

# iteration (0-based) with the lowest test log-loss
min_value = min(loss_test)
min_index = loss_test.index(min_value)

rfc = RandomForestClassifier(n_estimators=300, random_state=241)
Example #7
y = data[:,0]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)

def sigmoid(arr):
    return 1./(1. + np.exp(-arr))

# for learning_rate in [1, 0.5, 0.3, 0.2, 0.1]:
for learning_rate in [0.2, 0.1]:
    clf = GradientBoostingClassifier(n_estimators=250,
        learning_rate=learning_rate,
        # verbose=True,
        random_state=241)
    clf.fit(X_train, y_train)

    predict_train_by_iter = clf.staged_decision_function(X_train)
    predict_test_by_iter = clf.staged_decision_function(X_test)

    loss_train_by_iter = []
    loss_test_by_iter = []
    
    for predict in predict_train_by_iter:
        loss_value = log_loss(y_train, sigmoid(predict))
        loss_train_by_iter.append(loss_value)

    for predict in predict_test_by_iter:
        loss_value = log_loss(y_test, sigmoid(predict))
        loss_test_by_iter.append(loss_value)

    min_loss_index = np.argmin(loss_test_by_iter)
    print('learning_rate=%s, min_loss_value=%s, iteration(from 1)=%s' % (
        learning_rate, loss_test_by_iter[min_loss_index], min_loss_index + 1))
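Several of these examples use math.exp, which raises OverflowError for strongly negative decision values; a hedged alternative is scipy's numerically stable expit, shown here with the names from this example:

from scipy.special import expit  # numerically stable sigmoid

# sketch: the same staged test log-loss as above, written with expit
loss_test_stable = [log_loss(y_test, expit(pred))
                    for pred in clf.staged_decision_function(X_test)]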
Example #8
def sigmoid(y):
    return 1. / (1 + np.exp(-y))


for i in [1, 0.5, 0.3, 0.2, 0.1]:
    gbt = GradientBoostingClassifier(n_estimators=250,
                                     verbose=True,
                                     random_state=241,
                                     learning_rate=i)
    gbt.fit(X_train, y_train)

    train_loss = []
    test_loss = []

    for j, y_pred in enumerate(gbt.staged_decision_function(X_train)):
        train_loss.append(log_loss(y_train, sigmoid(y_pred)))

    for j, y_pred in enumerate(gbt.staged_decision_function(X_test)):
        test_loss.append(log_loss(y_test, sigmoid(y_pred)))

    min_train_loss = np.min(train_loss)
    iter_train = np.argmin(train_loss)
    min_test_loss = np.min(test_loss)
    iter_test = np.argmin(test_loss)

    print("{}:\nmin train_loss {} on iteration {}".format(
        gbt, min_train_loss, iter_train))
    print("{}:\nmin test_loss {} on iteration {}".format(
        gbt, min_test_loss, iter_test))
Example #9
X = df.drop(['Activity'], axis=1).values

# In[4]:

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.8,
                                                    random_state=241)

# In[17]:

gbm_model = GradientBoostingClassifier(n_estimators=250,
                                       learning_rate=0.2,
                                       verbose=True,
                                       random_state=241)
gbm_model.fit(X_train, y_train)

# In[18]:

arr = []
for i in gbm_model.staged_decision_function(X_test):
    arr.append(log_loss(y_test, [(1.0 / (1.0 + math.exp(-j))) for j in i]))
min(arr)

# In[27]:

rf_model = RandomForestClassifier(n_estimators=36, random_state=241)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict_proba(X_test)[:, 1]
log_loss(y_test, y_pred)
Example #10
    "predicted_den.npy")
true_density_RF = np.load(
    "/Users/lls/Documents/CODE/stored_files/shear/classification/density_only/true_den.npy"
)

pred_all = np.array([pred_i, pred_density_RF])
true_all = np.array([true_test, true_density_RF])
fpr, tpr, auc, fig = get_multiple_rocs(pred_all,
                                       true_all,
                                       labels=["GBT", "RF"])
plt.savefig(path + "roc_vs_RF.png")

# score test vs train

score_test = np.zeros(clf.n_estimators, )
for i, y_pred in enumerate(clf.staged_decision_function(testing_features)):
    score_test[i] = clf.loss_(true_test, y_pred)

score_train = np.zeros(clf.n_estimators, )
for i, y_pred in enumerate(clf.staged_decision_function(training_features)):
    score_train[i] = clf.loss_(true_train, y_pred)

score_train -= score_train[0]
score_test -= score_test[0]

plt.figure()
plt.plot(np.arange(clf.n_estimators), score_train, label="score train")
plt.plot(np.arange(clf.n_estimators), score_test, label="score test")
plt.ylabel("Loss")
plt.legend(loc="best")
plt.xlabel("N estimators")
Example #11
import pandas
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
import sklearn.metrics as met
import matplotlib.pyplot as plt

df = pandas.read_csv("gbm-data.csv")

vals = df.values

X_train, X_test, y_train, y_test = train_test_split(vals[:, 1:], vals[:, 0], test_size=0.8, random_state=241)

# for lr in [1, 0.5, 0.3, 0.2, 0.1]:
clf = GradientBoostingClassifier(learning_rate=1, n_estimators=250, verbose=False, random_state=241)
clf.fit(X_train, y_train)
sc_train = enumerate(clf.staged_decision_function(X_train))
sc_test  = enumerate(clf.staged_decision_function(X_test))
train_loss = {}
test_loss = {}
for i, y_predicted in sc_train:
    train_loss[i] = met.log_loss(y_train,1/(1+np.exp(-y_predicted)))

for i, y_predicted in sc_test:
    test_loss[i] = met.log_loss(y_test, 1/(1+np.exp(-y_predicted)))

plt.figure()
plt.plot(list(test_loss.values()), 'r', linewidth=2)
plt.plot(list(train_loss.values()), 'g', linewidth=2)
plt.legend(['test', 'train'])
plt.show()
Example #12
import math

import numpy as np
import pandas
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

data = pandas.read_csv('gbm-data.csv')
X = data.drop('Activity', axis=1)
y = data['Activity']
data = np.array(pandas.read_csv('gbm-data.csv').values)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)

for learning_rate in [0.2]:
    cls = GradientBoostingClassifier(n_estimators=250, verbose=True, random_state=241, learning_rate=learning_rate)
    cls.fit(X_train, y_train)

    print(cls.learning_rate)
    sigma_func = lambda x: 1/(1+math.e**(-x))
    sdc_train = list(cls.staged_decision_function(X_train))
    sdc_test = list(cls.staged_decision_function(X_test))
    for i in range(250):
        pred_train = list(map(sigma_func, sdc_train[i]))
        pred_test = list(map(sigma_func, sdc_test[i]))
        loss_train = log_loss(y_train, pred_train)
        loss_test = log_loss(y_test, pred_test)
        print(i, loss_train, loss_test)


clf = RandomForestClassifier(n_estimators=36, random_state=241)
clf.fit(X_train, y_train)
pred = clf.predict_proba(X_test)

print(log_loss(y_test, pred))
Example #13
y = np_data[:, 0]
X = np_data[:, 1:]
# X = data.drop('Activity', axis=1).values
# y = data.Activity.values
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.8,
                                                    random_state=241)
# for learning_rate in [1, 0.5, 0.3, 0.2, 0.1]:
for learning_rate in [0.2]:
    gbc = GradientBoostingClassifier(learning_rate=learning_rate,
                                     n_estimators=250,
                                     verbose=True,
                                     random_state=241)
    gbc.fit(X=X_train, y=y_train)
    staged_decision_train = gbc.staged_decision_function(X_test)
    # staged_decision_test = gbc.staged_decision_function(X_test)
    # yy1 = sigmoid(np.array(list(staged_decision_train)))
    # yy2 = sigmoid(staged_decision_test)
    test_loss = np.empty(250)
    for i, y_pred in enumerate(staged_decision_train):
        y_pred = 1.0 / (1.0 + np.exp(-y_pred))
        test_loss[i] = log_loss(y_test, y_pred)
    print(test_loss.max())
    if learning_rate == 0.2:
        print('learning_rate == 0.2')
        lr02_min = test_loss.min()
        lr02_idxmin = test_loss.argmin()
        print(lr02_min)
        print(lr02_idxmin)
        with open('/home/dima/lr_w5_z2_1_1.txt', 'w') as out:
Example #14
y = np.array(df['Activity'].values)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)

def sigmoid(y_pred):
    return 1 / (1 + np.exp(-y_pred))

rates = [1, 0.5, 0.3, 0.2, 0.1]

r = 0.2

clf = GradientBoostingClassifier(n_estimators=250, learning_rate=r, verbose=True, random_state=241)
clf.fit(X_train, y_train)

test_loss = [
    (i, log_loss(y_test, sigmoid(y_pred))) for i, y_pred in enumerate(clf.staged_decision_function(X_test))
]

train_loss = [
    (i, log_loss(y_train, sigmoid(y_pred))) for i, y_pred in enumerate(clf.staged_decision_function(X_train))
]

#
# plt.figure()
# plt.plot([loss for i, loss in test_loss], 'r', linewidth=2)
# plt.plot([loss for i, loss in train_loss], 'g', linewidth=2)
# plt.legend(['test', 'train'])
# plt.show();

printAndWriteAnswer(1, 'overfitting')
Example #15
def gb(data):
    X = data[data.columns.values[1:]].values
    y = data[data.columns.values[:1]].values.ravel()
    N = len(y)

    X_train, X_test, y_train, y_test = \
        cv.train_test_split(X, y,
                            test_size=0.8,
                            random_state=241)

    # ------------------------------------------------------
    # Deal with Gradient Boosting
    # ------------------------------------------------------

    # Reserve an array to store iteration with min log_loss for each learning rate
    min_iterations_train = []
    min_iterations_test = []


    # Fit Gradient Boosting Classifiers with different learning rates
    learning_rates = [1, 0.5, 0.3, 0.2, 0.1]
    for lr in learning_rates:
        print("GB learning rate = ", lr)

        # Fit the classifier
        gbclf = GradientBoostingClassifier(n_estimators=250,
                                           verbose=True,
                                           random_state=241,
                                           learning_rate=lr)
        gbclf.fit(X_train, y_train)

        # Get log_loss errors after every iteration of the Gradient Boosting
        y_train_pred = gbclf.staged_decision_function(X_train)
        log_loss_train = []
        for y_t_p in y_train_pred:
            log_loss_train.append(log_loss(y_train, 1 / (1 + np.exp(-y_t_p))))

        y_test_pred = gbclf.staged_decision_function(X_test)
        log_loss_test = []
        for y_t_p in y_test_pred:
            log_loss_test.append(log_loss(y_test, 1 / (1 + np.exp(-y_t_p))))

        # Min log-loss and the corresponding iteration
        log_loss_train_min_ind = np.argmin(log_loss_train) + 1
        log_loss_test_min_ind = np.argmin(log_loss_test) + 1
        log_loss_train_min = np.min(log_loss_train)
        log_loss_test_min = np.min(log_loss_test)
        min_iterations_train.append((log_loss_train_min, log_loss_train_min_ind))
        min_iterations_test.append((log_loss_test_min, log_loss_test_min_ind))

        # Plot the errors for both TRAIN and TEST sets (w/ the curr Learning Rate)
        plt.figure('GB learning rate: ' + str(lr))
        plt.plot(log_loss_test, 'r', linewidth=2)
        plt.plot(log_loss_train, 'g', linewidth=2)
        plt.legend(['log_loss_test', 'log_loss_train'])
        plt.draw()


    # Optimal TEST iteration for the learning rate 0.2
    print('Optimal iterations TEST vs. learning rate:')
    for t in zip(min_iterations_test, learning_rates):
        print('min: ', t[0][0], 'min_ind: ', t[0][1], 'learning rate: ', t[1])
    t = [(x[0], x[1]) for x, y in zip(min_iterations_test, learning_rates) if y == 0.2]
    opt_log_loss = t[0][0]
    opt_log_loss_ind = t[0][1]
    writefile('%0.2f %d' % (opt_log_loss, opt_log_loss_ind), 'log-loss-0.2.out')


    # ------------------------------------------------------
    # Deal with Random Forests
    # ------------------------------------------------------
    clf = RandomForestClassifier(n_estimators=opt_log_loss_ind, random_state=241)
    clf.fit(X_train, y_train)
    y_test_pred_rf = clf.predict_proba(X_test)
    log_loss_test_rf = log_loss(y_test, y_test_pred_rf)
    # log-loss over the test set using Random Forests
    writefile('%0.2f' % (log_loss_test_rf), 'log-loss-rf.out')


    return 0
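gb() calls a writefile(s, filename) helper and the usual sklearn/numpy/matplotlib imports (cv, np, plt, the classifiers, log_loss) that are not shown in the snippet; a minimal driver sketch, assuming everything lives in one script, might look like:

import pandas as pd


def writefile(s, filename):
    # hypothetical stand-in for the helper used inside gb()
    with open(filename, 'w') as f:
        f.write(s)


if __name__ == '__main__':
    gb(pd.read_csv('gbm-data.csv'))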
Example #16
plt.style.use('ggplot')

df = pd.read_csv('gbm-data.csv')
val = df.values
X = val[:,1:]
y = val[:,0]

X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.8, random_state=241)

#learning_rates = [1, 0.5, 0.3, 0.2, 0.1]
learning_rates  = [0.2]
sigmoid = lambda x: 1 / (1 + np.exp(-x))

log_loss_test = []
for l in learning_rates:
    clf = GradientBoostingClassifier(n_estimators=250, verbose=True,
                                     random_state=241,learning_rate = l)
    print('fitting...')
    clf.fit(X_train, y_train)
    print('building staged_decision_function')
    staged_dec = clf.staged_decision_function(X_test)
    for pred in staged_dec:
        y_pred = sigmoid(pred)
        log_loss_test.append(log_loss(y_test,y_pred))
best_iter = [np.argmin(log_loss_test),log_loss_test[np.argmin(log_loss_test)]]
#clf1 = RandomForestClassifier(n_estimators = 37, random_state=241)
#clf1.fit(X_train, y_train)
#prediction = clf1.predict_proba(X_test)
#res = log_loss(y_test,prediction)
#        
Example #17
df = pandas.read_csv('gbm-data.csv', index_col=None)  #1
dfa = df.values
X = dfa[:,1:]
y = dfa[:,0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)

def sigma(y_pred):
	return 1/(1 + np.exp(-y_pred))

# for rate in [1, 0.5, 0.3, 0.2, 0.1]:																				#2
for rate in [0.2]:																				#2
	print(rate)
	clf = GradientBoostingClassifier(n_estimators=250, verbose=True, random_state=241, learning_rate = rate)
	clf.fit(X_train, y_train)

	sigma_y_train = [sigma(y) for y in clf.staged_decision_function(X_train)]
	sigma_y_test  = [sigma(y) for y in clf.staged_decision_function(X_test) ]

	log_loss_train = [log_loss(y_train, y) for y in sigma_y_train]
	log_loss_test  = [log_loss(y_test , y) for y in sigma_y_test ]
	min_log_loss_test = min(log_loss_test)
	it_min_log_loss_test = log_loss_test.index(min_log_loss_test)
	print ">>>> it: ", it_min_log_loss_test, " val: ", min_log_loss_test											#4

	if rate == 0.2:																									#5
		rf = RandomForestClassifier(random_state=241, n_estimators=it_min_log_loss_test)
		rf.fit(X_train, y_train)
		tree_log_loss_test = log_loss(y_test, rf.predict_proba(X_test)[:,1])
		print ">>>>>>>> rf log_loss val: ", tree_log_loss_test

	plt.figure()
Example #18
X = data_values[:, 1:]
y = data_values[:, 0]


# Split the data into training and test sets using train_test_split
# with test_size = 0.8 and random_state = 241.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)

# 2
# Train a GradientBoostingClassifier with n_estimators=250, verbose=True, random_state=241,
# and for each learning_rate in [1, 0.5, 0.3, 0.2, 0.1] do the following:
for lr in [1, 0.5, 0.3, 0.2, 0.1]:
    clf = GradientBoostingClassifier(n_estimators=250, verbose=True, random_state=241, learning_rate=lr)
    clf.fit(X_train, y_train)

    # Use staged_decision_function to get the decision scores on the training
    # and test sets at every iteration.
    score_prediction_train = clf.staged_decision_function(X_train)
    score_prediction_test = clf.staged_decision_function(X_test)

    # Transform each staged prediction with the sigmoid function 1 / (1 + e^{-y_pred}),
    # where y_pred is the predicted decision value.
    score_prediction_train_mod = [[1 / (1 + math.exp(-x)) for x in y_pred.ravel()]
                                  for y_pred in score_prediction_train]
    score_prediction_test_mod = [[1 / (1 + math.exp(-x)) for x in y_pred.ravel()]
                                 for y_pred in score_prediction_test]

    # Compute (and plot) the log-loss values (sklearn.metrics.log_loss) on the training
    # and test sets, and find the minimum of the metric and the iteration where it is reached.
    log_loss_graph_train = [log_loss(y_train, p) for p in score_prediction_train_mod]
    log_loss_graph_test = [log_loss(y_test, p) for p in score_prediction_test_mod]

    print("%s -> min ll[train] = %s -> min ll[test] = %s"
          % (lr, min(log_loss_graph_train), min(log_loss_graph_test)))
Example #19
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    data_ar[:, 1:], data_ar[:, 0], random_state=241, test_size=0.8)

learning_rate = [1, 0.5, 0.3, 0.2, 0.1]

res = {}

plt.figure(figsize=(6, 30))

for i, lr in enumerate(learning_rate):
    classifier = GradientBoostingClassifier(learning_rate=lr,
                                            n_estimators=250,
                                            random_state=241,
                                            verbose=True)
    classifier.fit(X_train, y_train)
    train_staged_decision = classifier.staged_decision_function(X_train)
    test_staged_decision = classifier.staged_decision_function(X_test)
    sigmoid_train = [
        1 / (1 + np.exp(-y_pred)) for y_pred in train_staged_decision
    ]
    sigmoid_test = [
        1 / (1 + np.exp(-y_pred)) for y_pred in test_staged_decision
    ]
    predictions_train = classifier.predict_proba(X_train)
    predictions_test = classifier.predict_proba(X_test)
    log_loss_train = [
        metrics.log_loss(y_train, iteration_pred)
        for iteration_pred in sigmoid_train
    ]
    log_loss_test = [
        metrics.log_loss(y_test, iteration_pred)
        for iteration_pred in sigmoid_test
    ]
Example #20
#         iteration = train_loss.index(min_metric)
#     elif test_min < min_metric:
#         min_metric = test_min
#         iteration = test_loss.index(min_metric)
#     print('iter = {} val = {:.2}'.format(iteration, test_min))
#     list_of_mins.append(test_min)
    
#     plt.figure()
#     plt.plot(test_loss, 'r', linewidth=2)
#     plt.plot(train_loss, 'g', linewidth=2)
#     plt.legend(['test', 'train'])
#     plt.show()
# print('FINAL:\niter = {} val = {:.2}'.format(iteration, test_min))
# -------------------------------------------------------------------------------------------------------------------------
gbc = GradientBoostingClassifier(n_estimators=250, random_state=241, learning_rate=0.2)
gbc.fit(train_matrix, train_vec)
test_loss = []
iter_list = []
for i, pred in enumerate(gbc.staged_decision_function(test_matrix)):
    iter_list.append(i)
    test_loss.append(log_loss(test_vec, sigmoid(pred)))

test_min = np.amin(test_loss)
# prints iteration with lowest loss (loss = 0.53, iteration = 36)
# print('{:.2} {}'.format(test_min, iter_list[test_loss.index(test_min)]), end='')
# -------------------------------------------------------------------------------------------------------------------------
# find log loss of rfc's prediction using iterations found in prev task as amount of estimators
rfc = RandomForestClassifier(n_estimators=iter_list[test_loss.index(test_min)], random_state=241)
rfc.fit(train_matrix, train_vec)
print('{:.2}'.format(log_loss(test_vec, rfc.predict_proba(test_matrix))), end='') # log loss is 0.54
Example #21
for rate in [1, 0.5, 0.3, 0.2, 0.1]:

    # training the model
    clf = GradientBoostingClassifier(learning_rate=rate,
                                     n_estimators=250,
                                     verbose=True,
                                     random_state=241)
    clf.fit(X_train, y_train)

    # initializing lists for losses
    test_loss = []
    train_loss = []

    # filling them with values from stages of model's decision-making
    # using log_loss between true class and sigmoid of predicted
    for y_pred in clf.staged_decision_function(X_train):
        train_loss.append(log_loss(y_train, 1 / (1 + np.exp(-y_pred))))
    for y_pred in clf.staged_decision_function(X_test):
        test_loss.append(log_loss(y_test, 1 / (1 + np.exp(-y_pred))))

    # observing minimum losses for each rate
    print(f'''
for learning_rate={rate}
train loss: min, iteration are {min(train_loss)}, {np.argmin(train_loss)}
test loss: min, iteration are {min(test_loss)}, {np.argmin(test_loss)}
''')

    # plotting losses without blocking the loop
    plt.figure()
    plt.title(f'Train and test losses for learning_rate={rate}')
    plt.xlabel('Iteration')
Example #22
gbt_noRand05 = GradientBoostingClassifier(loss='deviance',
                                          learning_rate=0.05,
                                          n_estimators=500,
                                          subsample=1.0,
                                          min_samples_split=20,
                                          min_samples_leaf=10,
                                          max_depth=4)
# Fit the model
gbt_noRand05.fit(X_train, y_train)

niter = 500
iter = np.arange(niter) + 1
test_deviance = np.zeros((niter, ), dtype=np.float64)
# staged_decision_function: decision function at each iteration
for i, y_pred in enumerate(gbt_noRand05.staged_decision_function(X_test)):
    # clf.loss_ assumes that y_test[i] in {0, 1}
    test_deviance[i] = gbt_noRand05.loss_(y_test, y_pred)

plt.figure(figsize=(8, 6))
# Test-set error (deviance over iterations)
plt.plot(iter, test_deviance, label='Test', color='darkorange')
# minimum around iteration 100
# Training-set error (deviance over iterations)
plt.plot(iter, gbt_noRand05.train_score_, label='Apprentissage', color='navy')
# Error decrease relative to the previous model (out-of-bag improvement)
#plt.plot(iter,gbt_noRand05.oob_improvement_)
plt.legend(loc="upper right", fontsize=12)
# Predicted probabilities of class 1 (2-D array)
probas_test = gbt_noRand05.predict_proba(X_test)[:, 1]
probas_train = gbt_noRand05.predict_proba(X_train)[:, 1]
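With the class-1 probabilities computed above, a short hedged follow-up (not in the original snippet) is to compare train and test ROC AUC:

from sklearn.metrics import roc_auc_score

# sketch: discrimination on train vs. test
print("test  AUC:", roc_auc_score(y_test, probas_test))
print("train AUC:", roc_auc_score(y_train, probas_train))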
Example #23
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.8,
                                                    random_state=42)

learn_rates = [1, 0.5, 0.3, 0.2, 0.1]

for lr in learn_rates:
    clf = GradientBoostingClassifier(n_estimators=250,
                                     verbose=True,
                                     learning_rate=lr,
                                     random_state=241)
    clf.fit(X_train, y_train)
    #compute quality on training set
    train_loss = []
    for i, y_pred in enumerate(clf.staged_decision_function(X_train)):
        y_pred_sigmoid = 1.0 / (1 + np.exp(-y_pred))
        loss = log_loss(y_train, y_pred_sigmoid)
        train_loss.append(loss)

    #compute quality and find minimum loss on test set
    min_loss = [0, 10]
    test_loss = []
    for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
        y_pred_sigmoid = 1.0 / (1 + np.exp(-y_pred))
        loss = log_loss(y_test, y_pred_sigmoid)
        test_loss.append(loss)
        if loss < min_loss[1]:
            min_loss[0] = i
            min_loss[1] = loss
Example #24
y = data[:, 0]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, \
                                      random_state=42)

# choose lr (learning rate) out of [1, 0.5, 0.3, 0.2, 0.1]
lr = 0.2
print('Learning rate =', lr)

# fit gradient boosting
clf = GradientBoostingClassifier(n_estimators=250, verbose=True, \
                                     learning_rate = lr, random_state=241)
clf.fit(X_train, y_train)

# retrieve predictions on each iteration
stage_train = list(clf.staged_decision_function(X_train))
stage_test = list(clf.staged_decision_function(X_test))

# convert predictions to the probability range
for i in range(len(stage_train)):
    for j in range(len(stage_train[0])):
        stage_train[i][j] = 1 / (1 + math.exp(-stage_train[i][j]))
for i in range(len(stage_test)):
    for j in range(len(stage_test[0])):
        stage_test[i][j] = 1 / (1 + math.exp(-stage_test[i][j]))

# calculate logloss on each iteration
logloss_train  = [sklearn.metrics.log_loss(y_train, stage_train[i]) \
                     for i in range(len(stage_train))]
logloss_test   = [sklearn.metrics.log_loss(y_test, stage_test[i]) \
                     for i in range(len(stage_test))]
Example #25
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.8,
                                                    random_state=241)

iter_number = 0
learning_rates = [1, 0.5, 0.3, 0.2, 0.1]
for rate in learning_rates:
    clf = GradientBoostingClassifier(n_estimators=250,
                                     verbose=True,
                                     random_state=241,
                                     learning_rate=rate)
    clf.fit(X_train, y_train)

    sdf_train = clf.staged_decision_function(X_train)
    sdf_test = clf.staged_decision_function(X_test)

    score_train = []
    for y_pred in sdf_train:
        score_train.append(log_loss(y_train, sigm(y_pred)))

    score_test = []
    min_loss = 1
    for i, y_pred in enumerate(sdf_test):
        loss = log_loss(y_test, sigm(y_pred))
        score_test.append(loss)
        if rate == 0.2 and loss < min_loss:
            min_loss = loss
            iter_number = i
Example #26

data = pandas.read_csv('Data/gbm-data.csv')
datanp = data.values
y = datanp[:, 0]
x = datanp[:, 1:1777]  # columns 1..1776 (the feature columns)

X_train, X_test, y_train, y_test  = train_test_split(x, y, test_size=0.8, random_state=241)
test_score = dict()

for learning_rate in [0.2]:
    print("learning rate:", learning_rate)
    cls = GradientBoostingClassifier(n_estimators=250, verbose=True, random_state=241, learning_rate=learning_rate)
    cls.fit(X_train, y_train)

    for i, pred in enumerate(cls.staged_decision_function(X_test)):
        predicted = sigmoid(pred)
        test_score[i] = log_loss(y_test, predicted)


    #train_score = dict()
    #for i, pred in enumerate(cls.staged_decision_function(X_train)):
    #    train_score[i] = cls.loss_(y_train, pred)

pp.pprint(test_score)
res = min(test_score, key=test_score.get)
print(res)


cls2 = GradientBoostingClassifier(n_estimators=36, verbose=True, random_state=241)
cls2.fit(X_train, y_train)
Example #27
import matplotlib.pyplot as plt
#%matplotlib inline

data = pd.read_csv('gbm-data.csv')
y = data['Activity'].values
X = data.drop('Activity', axis = 1).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.8, random_state = 241)
    
    

learning_rate = [1, 0.5, 0.3, 0.2, 0.1] 
for i in learning_rate:
    clf = GBC(n_estimators = 250, verbose = True, random_state = 241, learning_rate = i)
    clf.fit(X_train, y_train)
    train_loss = [log_loss(y_train, 1.0/(1 + exp(-y_pred))) for y_pred in clf.staged_decision_function(X_train)]
    test_loss =  [log_loss(y_test, 1.0/(1 + exp(-y_pred))) for y_pred in clf.staged_decision_function(X_test)]
    
    plt.figure()
    plt.plot(test_loss, 'r', linewidth=2)
    plt.plot(train_loss, 'g', linewidth=2)
    plt.legend(['test', 'train'])

clf = GBC(n_estimators = 250, verbose = True, random_state = 241, learning_rate = 0.2)
clf.fit(X_train, y_train)
test_loss = [log_loss(y_test, 1.0/(1 + exp(-y_pred))) for y_pred in clf.staged_decision_function(X_test)]

with open('log-loss.txt', 'w') as f:
    f.write(str(round(min(test_loss), 2)) + ' ' + str(test_loss.index(min(test_loss))))

clf = RFC(random_state = 241, n_estimators = test_loss.index(min(test_loss)))   
Example #28
import pandas
import math
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

data = pandas.read_csv('gbm-data.csv')

train = data.drop('Activity', axis=1)
target = data['Activity']

train = train.values
target = target.values

X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.8, random_state=241)

rates = [1, 0.5, 0.3, 0.2, 0.1]

for i in rates:
	clf = GradientBoostingClassifier(n_estimators=250, verbose=True, random_state=241, learning_rate=i)
	clf.fit(X_train, y_train)
	for predict in clf.staged_decision_function(X=X_train):
		# sigmoid transform of the staged decision values
		predict = [1 / (1 + math.exp(-p)) for p in predict.ravel()]

		train_loss = log_loss(y_true=y_train, y_pred=predict)

Example #29
import matplotlib.pyplot as plt

data = pandas.read_csv('gbm-data.csv')
X = data.iloc[:, 1:].values
y = data.iloc[:, 0].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)

for learning_rate in [1, 0.5, 0.3, 0.2, 0.1]:
    clf = GradientBoostingClassifier(n_estimators=250, verbose=True, random_state=241, learning_rate=learning_rate)
    clf.fit(X_train, y_train)

    log_train = []
    log_test = []
    
    for y_pred in clf.staged_decision_function(X_train):
        log_train.append(log_loss(y_train, 1 / (1 + np.exp(-y_pred))))

    for y_pred in clf.staged_decision_function(X_test):
        log_test.append(log_loss(y_test, 1 / (1 + np.exp(-y_pred))))
    
    if learning_rate == 0.2:
        mini = min(log_test)
        ind = (log_test).index(mini)
    
    plt.figure()
    plt.plot(log_test, 'r', linewidth=2)
    plt.plot(log_train, 'g', linewidth=2)
    plt.legend(['test', 'train'])
    plt.show()
Example #30
t0 = DT.datetime.now()
model_gbc.fit(X_train, y_train3)
t1 = DT.datetime.now()
print('GBC took ' + str(t1 - t0))

z_gbc = model_gbc.predict_proba(X_test)[:, 1]

#ROC
fpr_gbc, tpr_gbc, thresh_gbc = skm.roc_curve(y_test3, z_gbc)
plt.figure(3)
plt.plot(fpr_gbc, tpr_gbc, 'r-')

# AUC
skm.auc(fpr_gbc, tpr_gbc)

# Deviance (see https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regularization.html#sphx-glr-auto-examples-ensemble-plot-gradient-boosting-regularization-py)
# compute test set deviance
test_deviance = np.zeros((params['n_estimators'], ), dtype=np.float64)

for i, y_pred in enumerate(model_gbc.staged_decision_function(X_test)):
    # clf.loss_ assumes that y_test[i] in {0, 1}
    test_deviance[i] = model_gbc.loss_(y_test3, y_pred)

plt.plot((np.arange(test_deviance.shape[0]) + 1)[::1],
         test_deviance[::1],
         '-',
         color='red',
         label=str(params))
#plt.close()
Example #31
X_train, X_test, y_train, y_test = train_test_split(features, target,
                                                    test_size=0.8,
                                                    random_state=241)


# for lr in [1, 0.5, 0.3, 0.2, 0.1]:
for lr in [0.2]:
    clf = GradientBoostingClassifier(n_estimators=250,
                                     verbose=True,
                                     random_state=241,
                                     learning_rate=lr)
    clf.fit(X_train, y_train)

    sigmoid_test_arr, sigmoid_train_arr = [], []

    train_pred = clf.staged_decision_function(X_train)
    test_pred = clf.staged_decision_function(X_test)

    test_pred_arr, train_pred_arr = [], []

    for i, val in enumerate(train_pred):
        sigmoid = 1 / (1 + np.exp(-val))
        train_pred_arr.append(log_loss(y_train, sigmoid))

    for i, val in enumerate(test_pred):
        sigmoid = 1 / (1 + np.exp(-val))
        test_pred_arr.append(log_loss(y_test, sigmoid))

    test_tuples, train_tuples = [], []

    i = 0
Example #32
Min_Loss = []

# Train GradientBoostingClassifier (n_estimators = 250, verbose = True, random_state = 241).
for lr in [1, 0.5, 0.3, 0.2, 0.1]:
    gbc = GradientBoostingClassifier(n_estimators=250,
                                     verbose=True,
                                     random_state=241,
                                     learning_rate=lr)
    gbc.fit(X_train, y_train)

    test_loss, train_loss = [], []

    # Use the staged_decision_function method to predict
    # the scores of the training and test samples at each iteration.
    # Transform the resulting prediction using the sigmoid function.
    for iter_ in gbc.staged_decision_function(X_train):
        train_loss.append(
            log_loss(y_train, [1.0 / (1 + np.exp(-x)) for x in iter_]))

    for iter_ in gbc.staged_decision_function(X_test):
        test_loss.append(
            log_loss(y_test, [1.0 / (1 + np.exp(-x)) for x in iter_]))

    Min_Loss.append(
        (test_loss[np.argmin(test_loss)], np.argmin(test_loss) + 1))

    # Calculate and plot the log-loss values on the training and test samples.
    plt.figure()
    plt.ylabel('log_loss')
    plt.xlabel('iteration')
    plt.plot(test_loss, 'r', linewidth=2)
Example #33
File: gbm.py Project: samoubiza/ML
X= data[:, 1:]
#split into train test data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)

#train
clf = GradientBoostingClassifier(n_estimators=250, verbose=True, random_state=241, learning_rate=0.2)
clf.fit(X_train, y_train)

#verify log loss


loss_on_test = []

for i, pred1 in enumerate(clf.staged_decision_function(X_test)):
##    print(i)
##    print(pred1)
##    print(y_test)
    x = log_loss(y_test, 1.0/(1.0+np.exp(-pred1)))
##    print(x)
    loss_on_test.append(x)

grd2 = clf.staged_predict_proba(X_test)

loss_on_test_proba = []

for i, pred2 in enumerate(grd2):

    loss_on_test_proba.append(log_loss(y_test, pred2))
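For the binomial deviance loss, predict_proba is the sigmoid of the decision function, so the two staged loss curves above should coincide up to floating-point noise; a quick hedged check (assuming numpy is imported as np, as above):

# sketch: the two ways of computing the staged test log-loss should agree
print(np.allclose(loss_on_test, loss_on_test_proba))
print('min test log-loss %.4f at iteration %d'
      % (np.min(loss_on_test), int(np.argmin(loss_on_test))))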
Example #34
learning_rate = [1, 0.5, 0.3, 0.2, 0.1]
for rate in learning_rate:
    
    # train the classifier
    clf = GradientBoostingClassifier(learning_rate = rate,
                                     n_estimators=250,
                                     verbose=True,
                                     random_state=241)
    clf.fit(X_train,Y_train)     
    
    # allocate arrays for the loss values
    train_loss = np.zeros(250, dtype=np.float64)
    test_loss = np.zeros(250, dtype=np.float64)
    
    # compute the loss on the training data
    for i, Y_train_pred in enumerate(clf.staged_decision_function(X_train)):
        Y_train_pred = 1 / (1 + np.exp(-Y_train_pred))
        train_loss[i] = log_loss(Y_train, Y_train_pred)
    
    # compute the loss on the test data
    for i, Y_test_pred in enumerate(clf.staged_decision_function(X_test)):
        Y_test_pred = 1 / (1 + np.exp(-Y_test_pred))
        test_loss[i] = log_loss(Y_test, Y_test_pred)
    
    # plot the curves
    plt.figure()
    plt.plot(test_loss, 'r', linewidth=2)
    plt.plot(train_loss, 'g', linewidth=2)
    plt.legend(['test', 'train'])
    plt.title('learning_rate=%f' %rate)
    
Example #35
def out(filename, s):
	f = open(filename, 'w')
	f.write(s)
	f.close()


data = pd.read_csv('gbm-data.csv', header=0).values

x_train, x_test, y_train, y_test = train_test_split(data[:, 1:], data[:, 0], test_size=0.8, random_state=241)
# for lr in [1, 0.5, 0.3, 0.2, 0.1]:
for lr in [0.2]:
	print('############## RATE %s ##########' % lr)
	clf = GradientBoostingClassifier(n_estimators=250, verbose=True, random_state=241, learning_rate=lr)
	clf.fit(x_train, y_train)
	train_score = []
	test_score = []
	for i, y_predicted in enumerate(clf.staged_decision_function(x_train)):
		train_score.append(log_loss(y_train, 1 / (1 + np.exp(-y_predicted))))
	for i, y_predicted in enumerate(clf.staged_decision_function(x_test)):
		test_score.append(log_loss(y_test, 1 / (1 + np.exp(-y_predicted))))
	plt.figure()
	plt.plot(test_score, 'g', linewidth=2)
	plt.plot(train_score, 'r', linewidth=2)
	plt.legend(['test', 'train'])
	#plt.show()
	n_iter = np.argmin(np.array(test_score))
	best = np.amin(np.array(test_score))
	res = '%.2f %d' % (best, n_iter)
	print(res)
	out('5_3.txt', res)

	clf2 = RandomForestClassifier(n_estimators=n_iter, random_state=241)
Example #36
    X_data_train = arrays[0]
    X_data_test = arrays[1]
    Y_data_train = arrays[2]
    Y_data_test = arrays[3]

    answer2_argmin = None
    answer2_value = None
    for learning_rate in [1, 0.5, 0.3, 0.2, 0.1]:
        clf = GradientBoostingClassifier(n_estimators=250, verbose=True, random_state=241,
                                         learning_rate=learning_rate)
        clf.fit(X_data_train, Y_data_train)

        train_probs = clf.predict_proba(X_data_train)
        test_probs = clf.predict_proba(X_data_test)

        train_losts = []
        for pred in clf.staged_decision_function(X_data_train):
            train_losts.append(log_loss(Y_data_train, [1 / (1 + exp(-x)) for x in pred]))
        train_losts = np.array(train_losts)

        test_losts = []
        for pred in clf.staged_decision_function(X_data_test):
            test_losts.append(log_loss(Y_data_test, [1 / (1 + exp(-x)) for x in pred]))
        test_losts = np.array(test_losts)

        figure()
        plot(test_losts, 'g', linewidth=2)
        plot(train_losts, 'r', linewidth=2)
        legend(['test', 'train'])
        savefig('image-%s.png' % learning_rate)

        if learning_rate == 0.2: