# Input columns for this section: the ten 'pca_<i>_last' columns followed by
# the R_VALUE / XR_MAX columns and the NA_* columns.
subset_cols = ['pca_{}_last'.format(i) for i in range(1, 11)] + [
    'R_VALUE_last', 'XR_MAX_last', 'NA_satellite_last', 'NA_SHARPmask_last',
    'NA_Rmask_last', 'NA_XR_MAX_last',
]

# Restrict every fold and the test set to the same columns.
fold1_subset = fold1_df[subset_cols]
fold2_subset = fold2_df[subset_cols]
fold3_subset = fold3_df[subset_cols]
testset_subset = testset_df[subset_cols]

## Logistic regression (original note: "make the logistic regression from R")
LR = LogisticRegression()
LR.fit(fold1_subset, fold1_df['label'])

# Score on fold2, using column 1 of predict_proba as the score.
my_preds = LR.predict_proba(fold2_subset)[:, 1]
true_vals = fold2_df['label']
temp = f1_scores_plot(my_preds, true_vals)  # Almost the same. Slightly below.

# Score the fold1 model on fold3 with resize=True and keep the threshold found
# at the best index (the model is NOT refit here).
my_preds = LR.predict_proba(fold3_subset)[:, 1]
true_vals = fold3_df['label']
df, best_index = f1_scores_plot(my_preds, true_vals, resize=True)  #0.653
best_threshold = df['threshold'][best_index]

### Fit on everything and predict on test
all_sets = pd.concat([fold1_subset, fold2_subset, fold3_subset])
all_labels = pd.concat([fold1_df, fold2_df, fold3_df])['label']
LR = LogisticRegression()
# Train on all three folds (all_sets / all_labels) rather than fold1 alone, so
# the final model actually matches the "fit on everything" heading.
LR.fit(all_sets, all_labels)
my_preds = LR.predict_proba(testset_subset)[:, 1]

#Save the classes
Пример #2
0
# Evaluate using the trained network object.
best_model = nn_model
# Restore the checkpointed weights that gave the lowest validation error
# during training.
best_model.load_weights(Gitlab_Path + '/Models/NN/model_val.hdf5')

### Check performance on fold3
fold3_df = load_dataframe(filename='fold3_NA_features.dat')
del fold3_df['id']
# Columns named pca...dw are log-transformed. The column list is taken from
# fold1_df here; -inf values produced by the log are mapped to 0.
dw_cols = [
    name for name in fold1_df.columns
    if name.startswith('pca') and name.endswith('dw')
]
fold3_df[dw_cols] = np.log(fold3_df[dw_cols].values)
fold3_df = fold3_df.replace([-np.inf], 0)
# Every column except 'label' is a model input.
x_test = fold3_df[[name for name in fold3_df.columns if name != 'label']]
y_test = fold3_df['label']
my_preds = best_model.predict(x_test).flatten()

#_, best_index = f1_scores_plot(my_preds,y_test,resize = False) #0.712
df, best_index = f1_scores_plot(my_preds, y_test, resize=True)  #0.653
best_threshold = df['threshold'][best_index]

## Create predictions on the test set
test_set_df = load_dataframe(filename='testSet_NA_features.dat')
del test_set_df['id']
# Same log transform as above, with the column list taken from the test frame.
dw_cols = [
    name for name in test_set_df.columns
    if name.startswith('pca') and name.endswith('dw')
]
test_set_df[dw_cols] = np.log(test_set_df[dw_cols].values)
test_set_df = test_set_df.replace([-np.inf], 0)
my_y_preds = best_model.predict(test_set_df).flatten()
# 1 where the prediction is strictly above the fold3 threshold, else 0.
classifications = (my_y_preds > best_threshold).astype(int)

my_df = pd.DataFrame({
    'Id': np.arange(1,
                    len(classifications) + 1),
    # Join the fold2 and fold3 predictions, in that order, into one array.
    preds = np.array(list(preds_fold2) + list(preds_fold3))

    ## Calculate f1_score
    # Hard labels: 1 where the prediction is strictly above 0.35, else 0.
    classes = np.zeros(len(preds), dtype=int)
    classes[preds > 0.35] = 1

    # Record this iteration's F1 score against true_values.
    f1_list.append(f1_score(y_true=true_values, y_pred=classes))

#### Scatter the collected F1 scores against their position in f1_list
plt.scatter(np.arange(len(f1_list)), f1_list)

# Elapsed time since p0
print(time() - p0)

## Ensemble the predictions
true_values = fold2_df['label']
df, best_index = f1_scores_plot(preds_ens, true_values)
# Bare lookup of the best F1 score; it only displays a value in an
# interactive session.
df['f1_score'][best_index]  #Li

### Check performance on fold3
fold3_df = load_dataframe(filename='fold3_NA_features.dat')
del fold3_df['id']
# Columns named pca...dw are log-transformed. The column list is taken from
# fold1_df here; -inf values produced by the log are mapped to 0.
dw_cols = [
    name for name in fold1_df.columns
    if name.startswith('pca') and name.endswith('dw')
]
fold3_df[dw_cols] = np.log(fold3_df[dw_cols].values)
fold3_df = fold3_df.replace([-np.inf], 0)
# Every column except 'label' is a model input.
x_test = fold3_df[[name for name in fold3_df.columns if name != 'label']]
y_test = fold3_df['label']
my_preds = clf.predict_proba(x_test)[:, 1]

# Evaluate without resizing first ...
_, best_index = f1_scores_plot(my_preds, y_test, resize=False)  #0.712

# ... then with resizing; df and best_index keep the result of this call.
df, best_index = f1_scores_plot(my_preds, y_test, resize=True)  #0.653
                           validation_steps= np.ceil( n_lines['fold3']/ batch_size), #Hvornår skal den stoppe med epoc og starte næste?
                           epochs=3)
    
    # Create a new batch generator over the fold3 data file.
    valid_gen = batch_generator(filename=Data_Path + '/fold3_NA.dat',
                                batch_size=batch_size,
                                num_features=num_features)
    
    # Predict for ceil(n_lines['fold3'] / batch_size) generator steps and
    # flatten the output to one dimension.
    preds = my_model.predict_generator(valid_gen,
                                       steps = np.ceil( n_lines['fold3']/ batch_size)
                                       ).flatten()
    
    # Labels come from the fold3 feature file.
    true_vals = load_dataframe(filename = 'fold3_NA_features.dat')['label']
    preds = preds[:len(true_vals)] # We predict 'np.ceil' steps, so the batch generator restarts; keep only as many predictions as there are labels
    
    # Evaluate twice: with the default settings and with resize=True.
    f1_scores_plot(preds,true_vals)
    f1_scores_plot(preds,true_vals,resize = True)
    
    











Пример #5
0
## Ensemble the predictions
true_values = fold2_df['label']
#df, best_index = f1_scores_plot(preds_ens,true_values)
#df['f1_score'][best_index] #Li


### Check performance on fold3

fold3_df = load_dataframe(filename='fold3_NA_features.dat')
del fold3_df['id']

# Every column except 'label' is a model input.
x_test = fold3_df[[name for name in fold3_df.columns if name != 'label']]
y_test = fold3_df['label']

# Column 1 of predict_proba is used as the score.
my_preds = clf.predict_proba(x_test)[:, 1]
_, best_index = f1_scores_plot(my_preds, y_test, resize=False)  #0.712


#save_obj(clf, Gitlab_Path + "/Models/Trees/tree_0.715")
# Replace the in-memory classifier with the previously saved one.
clf = load_obj(Gitlab_Path + "/Models/Trees/tree_0.715")

####  fit on full and save
all_sets = pd.concat([fold1_df, fold2_df, fold3_df])
# The per-fold frames are no longer needed once concatenated.
del fold1_df, fold2_df, fold3_df

testset = load_dataframe(filename='testSet_NA_features.dat')
del testset['id']

# Column 0 of all_sets is used as the target and the remaining columns as the
# features.
clf.fit(all_sets.iloc[:, 1:], all_sets.iloc[:, 0])
preds_ens = clf.predict_proba(testset)[:, 1]
# Integer zero vector with one entry per test prediction.
classes = np.zeros(preds_ens.shape[0], dtype=int)
Пример #6
0
    f1_list.append(f1_score(y_true=true_values, y_pred=classes))

# Scatter the collected F1 scores against their position in f1_list.
plt.scatter(np.arange(len(f1_list)), f1_list)

# Original note: C=0.005 looks best for the full one (presumably the full
# feature set).
LR = LogisticRegression(C=0.005, penalty='l2', solver='saga', max_iter=500)
train = pd.concat([fold1_full, fold2_full])
label = pd.concat([fold1_df, fold2_df])['label']
LR.fit(train, label)

# Score the fold1+fold2 model on fold3 (resize=False) and keep the threshold
# found at the best index.
my_preds = LR.predict_proba(fold3_full)[:, 1]
true_vals = fold3_df['label']
df, best_index = f1_scores_plot(my_preds, true_vals, resize=False)
best_threshold = df['threshold'][best_index]

# Drop the temporary training frames.
del train
del label

### Fit on everything and predict on test
all_sets = pd.concat([fold1_full, fold2_full, fold3_full])
all_labels = pd.concat([fold1_df, fold2_df, fold3_df])['label']
# Same regularisation as above, with a higher iteration cap for the final fit.
LR = LogisticRegression(C=0.005, penalty='l2', solver='saga', max_iter=2000)
LR.fit(all_sets, all_labels)
my_preds = LR.predict_proba(testset_full)[:, 1]
my_df = pd.DataFrame({
    'Id': np.arange(1,
                    len(my_preds) + 1),
    'ClassLabel': my_preds
    learning_rate=0.03,
    n_estimators=200,
    max_features=n_features,
    validation_fraction=0.3,
    n_iter_no_change=10,
    #min_samples_split = 10, #default is 1
    max_depth=4  #default is 3..?
)

# Train on fold1: column 0 is used as the target, the remaining columns as
# the features.
clf.fit(fold1_df.iloc[:, 1:], fold1_df.iloc[:, 0])
# Column 1 of predict_proba on fold2 is used as the score.
preds_ens = clf.predict_proba(fold2_df.iloc[:, 1:])[:, 1]
# Elapsed time since p0
print(time() - p0)

## Ensemble the predictions
true_values = fold2_df['label']
df, best_index = f1_scores_plot(preds_ens, true_values)
# Bare lookup of the best F1 score; it only displays a value in an
# interactive session.
df['f1_score'][best_index]  #Li

### Check performance on fold3
# Refit on fold1 and fold2 together before evaluating on fold3.
fold1_and_2 = pd.concat([fold1_df, fold2_df])
clf.fit(fold1_and_2.iloc[:, 1:], fold1_and_2.iloc[:, 0])

# Every column except 'label' is a model input.
x_test = fold3_df[[name for name in fold3_df.columns if name != 'label']]
y_test = fold3_df['label']

my_preds = clf.predict_proba(x_test)[:, 1]
df, best_index = f1_scores_plot(my_preds, y_test, resize=False)

#save_obj(clf, Gitlab_Path + "/Models/Trees/tree_0.723")
# Replace the in-memory classifier with the previously saved one.
clf = load_obj(Gitlab_Path + "/Models/Trees/tree_0.723")
    #tf.keras.callbacks.EarlyStopping(monitor='loss', patience=20),
    tf.keras.callbacks.ModelCheckpoint('./logs/model_val.hdf5',
                                       monitor='val_loss',
                                       save_best_only=True,
                                       verbose=1)
]

# Define optimization algorithm: plain SGD with step size 0.2.
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
sgd = tf.optimizers.SGD(learning_rate=0.2)

# Compile model (i.e., build compute graph) with a mean-squared-error loss
nn_model.compile(optimizer=sgd, loss='MSE')

# Training loop: 100 epochs in mini-batches of 25, validating on
# (x_val, y_val) after every epoch and invoking the callbacks defined above.
nn_model.fit(
    x_train,
    y_train,
    batch_size=25,
    epochs=100,
    validation_data=(x_val, y_val),
    validation_freq=1,
    #steps_per_epoch=x_train.shape[0],
    callbacks=callbacks)

## Score
# Score fold2 with the current LR model, using column 1 of predict_proba
# (presumably the positive-class probability) against the fold2 labels.
my_preds = LR.predict_proba(fold2_subset)[:, 1]
true_vals = fold2_df['label']
temp = f1_scores_plot(my_preds, true_vals)  #Almost the same. Slightly below

## Must beat 0.644