# Logistic-regression baseline on the "*_last" feature subset:
# fit on fold 1, check on fold 2, choose a decision threshold on fold 3,
# then refit on all three folds and score the test set.
subset_cols = [
    'pca_1_last', 'pca_2_last', 'pca_3_last', 'pca_4_last', 'pca_5_last',
    'pca_6_last', 'pca_7_last', 'pca_8_last', 'pca_9_last', 'pca_10_last',
    'R_VALUE_last', 'XR_MAX_last', 'NA_satellite_last',
    'NA_SHARPmask_last', 'NA_Rmask_last', 'NA_XR_MAX_last',
]

fold1_subset = fold1_df[subset_cols]
fold2_subset = fold2_df[subset_cols]
fold3_subset = fold3_df[subset_cols]
testset_subset = testset_df[subset_cols]

# Build the logistic regression from R (trained on fold 1 only at this stage).
LR = LogisticRegression()
LR.fit(fold1_subset, fold1_df['label'])

# Score on fold 2.
my_preds = LR.predict_proba(fold2_subset)[:, 1]
true_vals = fold2_df['label']
temp = f1_scores_plot(my_preds, true_vals)  # Almost the same. Slightly below.

# Score on fold 3 with resize=True and extract the threshold at best_index.
my_preds = LR.predict_proba(fold3_subset)[:, 1]
true_vals = fold3_df['label']
df, best_index = f1_scores_plot(my_preds, true_vals, resize=True)  # 0.653
best_threshold = df['threshold'][best_index]

# Fit on everything and predict on the test set.
all_sets = pd.concat([fold1_subset, fold2_subset, fold3_subset])
all_labels = pd.concat(
    [fold1_df['label'], fold2_df['label'], fold3_df['label']])
LR = LogisticRegression()
# Train on the concatenated folds (not fold 1 alone) so the final model
# actually uses all labelled data assembled above.
LR.fit(all_sets, all_labels)
my_preds = LR.predict_proba(testset_subset)[:, 1]
# Save the classes
# Evaluate the checkpointed network on fold 3, take the decision threshold
# from that evaluation, and classify the test set with it.
best_model = nn_model
# Restore the weights that gave the lowest validation error during training.
best_model.load_weights(Gitlab_Path + '/Models/NN/model_val.hdf5')

# --- Check performance on fold 3 ---
fold3_df = load_dataframe(filename='fold3_NA_features.dat')
del fold3_df['id']
# Columns named 'pca...dw' are log-transformed; the names are read from
# fold1_df here (and from the frame itself in the test-set section below).
dw_cols = [
    col for col in fold1_df.columns
    if col[-2:] == 'dw' and col[:3] == 'pca'
]
fold3_df[dw_cols] = np.log(fold3_df[dw_cols].values)
# Entries that became -inf under the log are mapped to 0.
fold3_df = fold3_df.replace([-np.inf], 0)

x_test = fold3_df[[col for col in fold3_df.columns if col != 'label']]
y_test = fold3_df['label']

my_preds = best_model.predict(x_test).flatten()
# _, best_index = f1_scores_plot(my_preds, y_test, resize=False)  # 0.712
df, best_index = f1_scores_plot(my_preds, y_test, resize=True)  # 0.653
best_threshold = df['threshold'][best_index]

# --- Create predictions on the test set ---
test_set_df = load_dataframe(filename='testSet_NA_features.dat')
del test_set_df['id']
dw_cols = [
    col for col in test_set_df.columns
    if col[-2:] == 'dw' and col[:3] == 'pca'
]
test_set_df[dw_cols] = np.log(test_set_df[dw_cols].values)
test_set_df = test_set_df.replace([-np.inf], 0)

my_y_preds = best_model.predict(test_set_df).flatten()
# 1 where the predicted score exceeds the fold-3 threshold, otherwise 0.
classifications = (my_y_preds > best_threshold).astype(int)

# NOTE(review): the next statement is cut off in this fragment; it is kept
# as a comment rather than completed by guesswork:
# my_df = pd.DataFrame({ 'Id': np.arange(1, len(classifications) + 1),
# NOTE(review): the first statements read like the tail of a loop that
# appends one F1 score per iteration; preds_fold2, preds_fold3, true_values,
# f1_list and p0 are defined outside this fragment -- confirm the nesting.
preds = np.array([*preds_fold2, *preds_fold3])

# Calculate the F1 score at a fixed 0.35 cut-off.
classes = np.zeros(len(preds), dtype=int)
classes[preds > 0.35] = 1
f1_list.append(f1_score(y_true=true_values, y_pred=classes))

# Plot the collected scores against their position in f1_list.
plt.scatter(np.arange(len(f1_list)), f1_list)
print(time() - p0)

# --- Ensemble the predictions ---
true_values = fold2_df['label']
df, best_index = f1_scores_plot(preds_ens, true_values)
df['f1_score'][best_index]  # Li

# --- Check performance on fold 3 ---
fold3_df = load_dataframe(filename='fold3_NA_features.dat')
del fold3_df['id']
# Columns named 'pca...dw' (names read from fold1_df) are log-transformed.
dw_cols = [
    col for col in fold1_df.columns
    if col[-2:] == 'dw' and col[:3] == 'pca'
]
fold3_df[dw_cols] = np.log(fold3_df[dw_cols].values)
# Entries that became -inf under the log are mapped to 0.
fold3_df = fold3_df.replace([-np.inf], 0)

x_test = fold3_df[[col for col in fold3_df.columns if col != 'label']]
y_test = fold3_df['label']

my_preds = clf.predict_proba(x_test)[:, 1]
# Both variants are evaluated; the second call's results are the ones kept.
_, best_index = f1_scores_plot(my_preds, y_test, resize=False)  # 0.712
df, best_index = f1_scores_plot(my_preds, y_test, resize=True)  # 0.653
# NOTE(review): this fragment opens with the last keyword arguments of a call
# whose beginning is not part of it; they cannot stand alone and are kept as
# comments:
# validation_steps= np.ceil( n_lines['fold3']/ batch_size),  # When should it stop an epoch and start the next?
# epochs=3)

# Create a new generator over fold 3 for prediction.
valid_gen = batch_generator(
    filename=Data_Path + '/fold3_NA.dat',
    batch_size=batch_size,
    num_features=num_features)

# np.ceil returns a float; the step count is cast to int because Keras
# documents `steps` as an integer number of batches.
preds = my_model.predict_generator(
    valid_gen,
    steps=int(np.ceil(n_lines['fold3'] / batch_size))).flatten()

true_vals = load_dataframe(filename='fold3_NA_features.dat')['label']
# The step count is rounded up, so (per the original author's note) the
# batch generator restarts; keep only as many predictions as there are labels.
preds = preds[:len(true_vals)]

f1_scores_plot(preds, true_vals)
f1_scores_plot(preds, true_vals, resize=True)
# --- Ensemble the predictions ---
true_values = fold2_df['label']
# Disabled fold-2 evaluation, kept for reference:
# df, best_index = f1_scores_plot(preds_ens,true_values)
# df['f1_score'][best_index] #Li

# --- Check performance on fold 3 ---
fold3_df = load_dataframe(filename='fold3_NA_features.dat')
del fold3_df['id']
x_test = fold3_df[[col for col in fold3_df.columns if col != 'label']]
y_test = fold3_df['label']

my_preds = clf.predict_proba(x_test)[:, 1]
_, best_index = f1_scores_plot(my_preds, y_test, resize=False)  # 0.712

# The stored model replaces the in-memory one from here on.
# save_obj(clf, Gitlab_Path + "/Models/Trees/tree_0.715")
clf = load_obj(Gitlab_Path + "/Models/Trees/tree_0.715")

# --- Fit on the full data and save ---
all_sets = pd.concat([fold1_df, fold2_df, fold3_df])
# Drop the per-fold frames now that they are merged.
del fold1_df
del fold2_df
del fold3_df

testset = load_dataframe(filename='testSet_NA_features.dat')
del testset['id']

# Column 0 is used as the target and the remaining columns as features.
clf.fit(all_sets.iloc[:, 1:], all_sets.iloc[:, 0])
preds_ens = clf.predict_proba(testset)[:, 1]
classes = np.zeros(len(preds_ens), dtype=int)
# NOTE(review): this first statement reads like the tail of a loop that
# collects one F1 score per candidate setting; true_values, classes and
# f1_list are defined outside this fragment -- confirm the nesting.
f1_list.append(f1_score(y_true=true_values, y_pred=classes))

# Plot the collected scores.
plt.scatter(np.arange(len(f1_list)), f1_list)

# Looks like 0.005 is best for the full one.
LR = LogisticRegression(
    penalty='l2',
    max_iter=500,
    C=0.005,
    solver='saga')
train = pd.concat([fold1_full, fold2_full], axis=0)
label = pd.concat([fold1_df['label'], fold2_df['label']], axis=0)
LR.fit(train, label)

# Score on fold 3 (resize=False) and extract the threshold at best_index.
my_preds = LR.predict_proba(fold3_full)[:, 1]
true_vals = fold3_df['label']
df, best_index = f1_scores_plot(my_preds, true_vals, resize=False)
best_threshold = df['threshold'][best_index]

# The fold-1+2 training data is no longer needed.
del train
del label

# --- Fit on everything and predict on the test set ---
all_sets = pd.concat([fold1_full, fold2_full, fold3_full])
all_labels = pd.concat(
    [fold1_df['label'], fold2_df['label'], fold3_df['label']])
LR = LogisticRegression(
    penalty='l2',
    max_iter=2000,
    C=0.005,
    solver='saga')
LR.fit(all_sets, all_labels)
my_preds = LR.predict_proba(testset_full)[:, 1]

# NOTE(review): the next statement is cut off in this fragment; it is kept
# as a comment rather than completed by guesswork:
# my_df = pd.DataFrame({ 'Id': np.arange(1, len(my_preds) + 1), 'ClassLabel': my_preds
# NOTE(review): this fragment opens with the keyword arguments of a
# constructor call whose beginning is not part of it (presumably the one
# that builds `clf`); they cannot stand alone and are kept as comments:
# learning_rate=0.03,
# n_estimators=200,
# max_features=n_features,
# validation_fraction=0.3,
# n_iter_no_change=10,
# #min_samples_split = 10, #default is 1
# max_depth=4 #default is 3..?
# )

# Train on fold 1 and predict on fold 2; column 0 is used as the target and
# the remaining columns as features.
clf.fit(fold1_df.iloc[:, 1:], fold1_df.iloc[:, 0])
preds_ens = clf.predict_proba(fold2_df.iloc[:, 1:])[:, 1]
print(time() - p0)

# --- Ensemble the predictions ---
true_values = fold2_df['label']
df, best_index = f1_scores_plot(preds_ens, true_values)
df['f1_score'][best_index]  # Li

# --- Check performance on fold 3 ---
# Refit on folds 1 and 2 together before scoring fold 3.
fold1_and_2 = pd.concat([fold1_df, fold2_df])
clf.fit(fold1_and_2.iloc[:, 1:], fold1_and_2.iloc[:, 0])

x_test = fold3_df[[col for col in fold3_df.columns if col != 'label']]
y_test = fold3_df['label']
my_preds = clf.predict_proba(x_test)[:, 1]
df, best_index = f1_scores_plot(my_preds, y_test, resize=False)

# The stored model replaces the in-memory one from here on.
# save_obj(clf, Gitlab_Path + "/Models/Trees/tree_0.723")
clf = load_obj(Gitlab_Path + "/Models/Trees/tree_0.723")
# NOTE(review): this fragment opens inside a list literal whose beginning is
# not part of it (presumably `callbacks = [`); the orphaned items cannot
# stand alone and are kept as comments:
# #tf.keras.callbacks.EarlyStopping(monitor='loss', patience=20),
# tf.keras.callbacks.ModelCheckpoint('./logs/model_val.hdf5',
#                                    monitor='val_loss',
#                                    save_best_only=True,
#                                    verbose=1)
# ]

# Define optimization algorithm.
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
sgd = tf.optimizers.SGD(learning_rate=0.2)

# Compile model (i.e., build compute graph).
nn_model.compile(optimizer=sgd, loss='MSE')

# Training loop.
nn_model.fit(
    x_train,
    y_train,
    batch_size=25,
    epochs=100,
    validation_data=(x_val, y_val),
    validation_freq=1,
    # steps_per_epoch=x_train.shape[0],
    callbacks=callbacks)

# --- Score ---
# NOTE(review): this scores `LR` on fold2_subset, not the `nn_model` trained
# above -- confirm that is intended.
my_preds = LR.predict_proba(fold2_subset)[:, 1]
true_vals = fold2_df['label']
temp = f1_scores_plot(my_preds, true_vals)  # Almost the same. Slightly below.
# Must beat 0.644