def evaluate_cotrain(modelo1, modelo2, modelo3, arquitectura1, arquitectura2,
                     arquitectura3, datos, etapa, kfold, iteracion, pipeline,
                     models_info, logs):
    """Evaluate the three top models as a co-training ensemble on test set 1.

    Each model predicts through its own generator (each backbone needs its own
    preprocessing). Per sample, the ensemble keeps the unanimous prediction;
    on any disagreement it falls back to the prediction of the most confident
    model. Ground truth is the class directory name encoded in the file path.

    Side effects: appends one ['co-train', ..., accuracy] row to ``logs`` and
    persists the log via ``save_logs``.

    Returns the ensemble accuracy as a float.
    """
    train_generator_arch1, test1_generator_arch1, STEP_SIZE_TEST1_arch1 = generadores(
        etapa, arquitectura1, datos, pipeline, False, iteracion, models_info)
    train_generator_arch2, test1_generator_arch2, STEP_SIZE_TEST1_arch2 = generadores(
        etapa, arquitectura2, datos, pipeline, False, iteracion, models_info)
    train_generator_arch3, test1_generator_arch3, STEP_SIZE_TEST1_arch3 = generadores(
        etapa, arquitectura3, datos, pipeline, False, iteracion, models_info)

    # Per-model prediction frames with Filename / Predictions / Max_Probability.
    df1 = evaluar(modelo1, train_generator_arch1, test1_generator_arch1, STEP_SIZE_TEST1_arch1)
    df2 = evaluar(modelo2, train_generator_arch2, test1_generator_arch2, STEP_SIZE_TEST1_arch2)
    df3 = evaluar(modelo3, train_generator_arch3, test1_generator_arch3, STEP_SIZE_TEST1_arch3)

    predicciones = []
    for i in range(len(df1)):
        c1 = (df1['Predictions'][i] == df2['Predictions'][i])
        c2 = (df1['Predictions'][i] == df3['Predictions'][i])
        c3 = (df2['Predictions'][i] == df3['Predictions'][i])
        if c1 and c2 and c3:  # unanimous vote (c3 is implied by c1 and c2)
            predicciones.append([df1['Filename'][i], df1['Predictions'][i]])
        else:
            # Disagreement: pick the answer of the most confident model.
            probabilidades = np.array([
                df1['Max_Probability'][i],
                df2['Max_Probability'][i],
                df3['Max_Probability'][i]
            ])
            indice_prob_max = probabilidades.argmax()
            clases = np.array([
                df1['Predictions'][i],
                df2['Predictions'][i],
                df3['Predictions'][i]
            ])
            real = np.array(
                [df1['Filename'][i], df2['Filename'][i], df3['Filename'][i]])
            # BUG FIX: the original indexed with clases.argmax(), i.e. the
            # argmax over the class LABELS themselves, which always selects the
            # largest label rather than the most confident model's prediction.
            # Use the most-confident-model index for both filename and class,
            # consistent with the other evaluate_cotrain variant in this file.
            predicciones.append([real[indice_prob_max], clases[indice_prob_max]])

    results = pd.DataFrame(predicciones, columns=["filename", "predictions"])
    # Ground truth = class directory, i.e. the second-to-last path component.
    results['filename'] = results['filename'].apply(lambda x: x.split('/')[-2])
    y_true = results['filename'].values.tolist()
    y_pred = results['predictions'].values.tolist()
    from sklearn.metrics import accuracy_score
    # Conventional (y_true, y_pred) order; accuracy is symmetric so the
    # numeric result is identical to the original call.
    co_train_accu = accuracy_score(y_true, y_pred)
    co_train_label = 'co-train'
    logs.append(
        [kfold, iteracion, co_train_label, None, None, None, co_train_accu])
    save_logs(logs, 'train', pipeline)
    return co_train_accu
def ssl_global(model_zoo, pipeline):
    """Semi-supervised co-training driver (fixed 5-batch, single-fold variant).

    For each iteration: train every backbone in ``model_zoo``, rank them by
    validation accuracy, evaluate the top-3 as a co-training ensemble, then
    pseudo-label the next unlabeled batch and grow the EL (easy/labeled) set.
    The SSL confidence threshold is re-estimated each iteration from the
    labeling statistics.

    NOTE(review): ``logs`` and ``logs_label`` are not defined here — they are
    presumably module-level accumulators initialized elsewhere; verify before
    refactoring.
    """
    numero_lotes = 5                      # number of unlabeled batches
    semi_method = 'co-training-multi'
    datos = {}
    models_info = {}
    datos["df_base"] = get_dataset(pipeline)
    datos = split_train_test(datos, pipeline)
    # Measure wall-clock time of the whole run.
    import time
    start = time.time()
    for kfold in range(1):
        for iteracion in range(numero_lotes * 1):
            print("\n######################")
            print("K-FOLD {} - ITERACION {}".format(kfold, iteracion))
            print("######################\n")
            datos = get_Fold(kfold, datos, pipeline)
            # First iteration trains on the labeled seed; later ones retrain
            # on labeled + pseudo-labeled (EL) data.
            if iteracion == 0:
                etapa = 'train'
            else:
                etapa = 'train_EL'
            print(pipeline["path_label_stats"] + str(pipeline["id"]) + '_' +
                  str(iteracion) + '.pickle')
            for model in model_zoo:
                model_memory, model_performance = training(
                    kfold, etapa, datos, model, iteracion, models_info, pipeline)
                models_info[model] = {
                    'model_memory': model_memory,
                    'model_performance': model_performance['val_acc']
                }
            # Rank architectures by validation accuracy, keep the top 3.
            df_temp = pd.DataFrame(models_info).T
            top_models = df_temp.sort_values('model_performance',
                                             ascending=False)
            top_models = top_models.reset_index()['index'].values.tolist()[:3]
            mod_top1, arch_top1 = models_info[
                top_models[0]]['model_memory'], top_models[0]
            mod_top2, arch_top2 = models_info[
                top_models[1]]['model_memory'], top_models[1]
            mod_top3, arch_top3 = models_info[
                top_models[2]]['model_memory'], top_models[2]
            print("\n")
            print(
                "Co-train: ",
                evaluate_cotrain(mod_top1, mod_top2, mod_top3, arch_top1,
                                 arch_top2, arch_top3, datos, etapa, kfold,
                                 iteracion, pipeline, models_info, logs))
            print("\n")
            if semi_method == 'supervised':
                break
            if iteracion < numero_lotes:
                # Take the next unlabeled batch; label column is a dummy '0'.
                df_batchset = datos["batch_set"][iteracion]
                df_batchset.columns = [
                    pipeline["x_col_name"], pipeline["y_col_name"]
                ]
                df_batchset[pipeline["y_col_name"]] = '0'
            else:
                # NOTE(review): iteracion ranges over [0, numero_lotes), so
                # this branch is currently unreachable; kept for the intended
                # second pass over the low-confidence (LC) pool.
                if iteracion == numero_lotes:
                    df_LC = pd.DataFrame(pipeline["LC"])
                    batch_set_LC = list(dividir_lotes(df_LC, numero_lotes))
                    # BUG FIX: the original did `for i in enumerate(...)` and
                    # then indexed the list with the (index, item) tuple,
                    # which raises TypeError. Iterate the batches directly.
                    for lote_LC in batch_set_LC:
                        print(len(lote_LC.iloc[:, 0].values.tolist()))
                    pipeline["LC"] = []
                df_batchset = pd.DataFrame([
                    batch_set_LC[int(iteracion -
                                     numero_lotes)].iloc[:, 0].values.tolist()
                ]).T
                df_batchset.columns = [pipeline["x_col_name"]]
                df_batchset[pipeline["y_col_name"]] = '0'
            datos['df_batchset'] = df_batchset
            # Pseudo-label the batch with the top-3 ensemble; EL_iter/LC_iter
            # are this iteration's confident / low-confidence samples.
            datos, EL_iter, LC_iter = labeling(etapa, mod_top1, mod_top2,
                                               mod_top3, arch_top1, arch_top2,
                                               arch_top3, datos, pipeline,
                                               iteracion, models_info)
            print("EL_iter", len(EL_iter))
            print("LC_iter", len(LC_iter))
            df_EL = pd.DataFrame(datos["EL"],
                                 columns=[
                                     pipeline["x_col_name"],
                                     pipeline["y_col_name"], 'arch_scores'
                                 ])
            df_LC = pd.DataFrame(datos["LC"],
                                 columns=[
                                     pipeline["x_col_name"],
                                     pipeline["y_col_name"], 'arch_scores'
                                 ])
            os.makedirs(pipeline["path_label_stats"].split('/')[0],
                        exist_ok=True)
            df_EL.to_pickle(pipeline["path_label_stats"] +
                            str(pipeline["id"]) + '_' + str(iteracion) +
                            '_EL.pickle')
            df_LC.to_pickle(pipeline["path_label_stats"] +
                            str(pipeline["id"]) + '_' + str(iteracion) +
                            '_LC.pickle')
            df_label_stats = label_stats(df_EL, df_LC, pipeline)
            df_label_stats.to_pickle(pipeline["path_label_stats"] +
                                     str(pipeline["id"]) + '_' +
                                     str(iteracion) + '.pickle')
            # Next round trains on the original labeled set plus all EL
            # pseudo-labels accumulated so far.
            df_train_EL = pd.concat([datos["df_train"], df_EL.iloc[:, :2]])
            datos['df_train_EL'] = df_train_EL
            try:
                print("AUTO-ESTIMATING OF SSL THRESHOLD ...")
                df_EL_stats = df_label_stats["df_EL_stats"]["df"]
                df_LC_stats = df_label_stats["df_LC_stats"]["df"]
                df_U_iter = pd.concat([df_EL_stats, df_LC_stats],
                                      ignore_index=True)
                # New threshold = mean of the per-sample mean ensemble scores.
                ssl_th = df_U_iter.describe()["arch_scores_mean"]["mean"]
                pipeline["ssl_threshold"] = ssl_th
                print("NEW SSL THRESHOLD: ", ssl_th)
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
                # still propagate; on failure keep the previous threshold.
                print("ERROR - AUTO-ESTIMATING SSL THRESHOLD")
                ssl_th = pipeline["ssl_threshold"]
                traceback.print_exc()
            logs_label.append([
                kfold, iteracion, arch_top1, arch_top2, arch_top3,
                len(EL_iter), len(LC_iter), ssl_th
            ])
            save_logs(logs_label, 'label', pipeline)
    end = time.time()
    print(end - start)
"train_epochs": 5, "batch_epochs": 5 }, "fast": { "train_epochs": 10, "batch_epochs": 10 }, "medium": { "train_epochs": 20, "batch_epochs": 20 }, "slow": { "train_epochs": 30, "batch_epochs": 30 } } logs.append([ "kfold", "iteracion", "arquitectura", "val_loss", "val_accu", "test_loss", "test_accu" ]) logs_time.append(["kfold", "iteracion", "arquitectura", "training_time"]) logs_label.append(["kfold", "iteracion", "arquitectura", "EL", "LC"]) save_logs(logs, 'train', pipeline) save_logs(logs_time, 'time', pipeline) save_logs(logs_label, 'label', pipeline) models = ['ResNet50', 'Xception', 'DenseNet169', 'InceptionV4', 'DenseNet121'] ssl_global(model_zoo=models, pipeline=pipeline)
def training(kfold, etapa, datos, architecture, iteracion, models_info, pipeline):
    """Fine-tune one backbone for the current SSL iteration and log its scores.

    Builds an augmented train generator (from ``df_train`` or ``df_train_EL``
    depending on ``etapa``), plus validation and test generators, applies the
    configured transfer-learning strategy, trains with early stopping, and
    evaluates on the validation and test splits.

    Parameters
    ----------
    kfold, iteracion : int
        Current fold / SSL iteration, used only for logging and file names.
    etapa : str
        'train' (initial supervised stage) or 'train_EL' (retrain with
        pseudo-labels).
    datos : dict
        Must hold 'df_train', 'df_val', 'df_test' and, for 'train_EL',
        'df_train_EL' dataframes.
    architecture : str
        Backbone identifier passed to ``get_model``.
    models_info : dict
        Per-architecture info from previous iterations (consumed by
        ``get_model``).
    pipeline : dict
        Experiment configuration (column names, image size, LR, epochs, ...).

    Returns
    -------
    (model_or_path, dict)
        The saved model path when pipeline['save_model'] is set, otherwise the
        in-memory fitted model; plus {'val_acc', 'test_acc'}.
    """
    start_model = time.time()
    # Backbone plus its architecture-specific preprocessing function.
    base_model, preprocess_input = get_model(architecture, iteracion,
                                             models_info, pipeline)
    model_performance = {}
    # Augmentation is applied to training batches only.
    datagen = ImageDataGenerator(
        preprocessing_function=preprocess_input,
        rotation_range=90,
        horizontal_flip=True,
        vertical_flip=True,
    )
    if etapa == 'train':
        train_generator = datagen.flow_from_dataframe(
            dataframe=datos['df_train'],
            x_col=pipeline["x_col_name"],
            y_col=pipeline["y_col_name"],
            target_size=(pipeline['img_height'], pipeline['img_width']),
            class_mode='categorical',
            batch_size=pipeline["batch_size"],
            seed=42,
            shuffle=True)
    if etapa == 'train_EL':
        # Same generator but over labeled + pseudo-labeled samples.
        train_generator = datagen.flow_from_dataframe(
            dataframe=datos['df_train_EL'],
            x_col=pipeline["x_col_name"],
            y_col=pipeline["y_col_name"],
            target_size=(pipeline['img_height'], pipeline['img_width']),
            class_mode='categorical',
            batch_size=pipeline["batch_size"],
            seed=42,
            shuffle=True)
    if len(datos['df_val']) > 0:
        # NOTE(review): valid_generator is only created when df_val is
        # non-empty, yet it is used unconditionally below — an empty df_val
        # would raise NameError. Presumably the pipeline guarantees validation
        # data; confirm before relying on the empty case.
        val_datagen = ImageDataGenerator(
            preprocessing_function=preprocess_input)
        valid_generator = val_datagen.flow_from_dataframe(
            dataframe=datos['df_val'],
            x_col=pipeline["x_col_name"],
            y_col=pipeline["y_col_name"],
            batch_size=pipeline["batch_size"],
            seed=42,
            shuffle=True,
            class_mode="categorical",
            target_size=(pipeline['img_height'], pipeline['img_width']))
    # Test data is never shuffled so predictions stay aligned with filenames.
    test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
    test_generator = test_datagen.flow_from_dataframe(
        dataframe=datos['df_test'],
        x_col=pipeline["x_col_name"],
        y_col=pipeline["y_col_name"],
        batch_size=pipeline["batch_size"],
        seed=42,
        shuffle=False,
        class_mode="categorical",
        target_size=(pipeline['img_height'], pipeline['img_width']))
    num_classes = len(datos["df_train"][pipeline["y_col_name"]].unique())
    # Transfer-learning strategy: replace the head ('classic'), partially
    # unfreeze per stage config ('soft'), or reuse the base model as-is.
    if etapa == 'train' and pipeline["transfer_learning"] == "classic":
        finetune_model = transfer_learning_classic(base_model, num_classes)
    elif pipeline["transfer_learning"] == "soft":
        finetune_model = transfer_learning_soft(
            base_model, num_classes, pipeline["stage_config"][iteracion])
    else:
        finetune_model = base_model
    # Epoch budget differs between the initial stage and EL retraining;
    # aug_factor inflates the nominal number of training images per epoch.
    if etapa == 'train':
        NUM_EPOCHS = pipeline["modality_config"][pipeline["modality"]]["train_epochs"]
        num_train_images = len(datos['df_train']) * pipeline["aug_factor"]
    if etapa == 'train_EL':
        NUM_EPOCHS = pipeline["modality_config"][pipeline["modality"]]["batch_epochs"]
        num_train_images = len(datos['df_train_EL']) * pipeline["aug_factor"]
    STEP_SIZE_TRAIN = num_train_images // train_generator.batch_size
    STEP_SIZE_VALID = valid_generator.n // valid_generator.batch_size
    STEP_SIZE_TEST = test_generator.n // test_generator.batch_size
    metrics = ['accuracy']
    loss = 'categorical_crossentropy'
    # Learning rate source depends on the transfer-learning mode.
    if pipeline["transfer_learning"] == "soft":
        LR = pipeline['LR']
    else:
        LR = pipeline["stage_config"][iteracion]['LR']
    print(f"LEARNING RATE: {LR}")
    adam = Adam(lr=float(LR))
    finetune_model.compile(adam, loss=loss, metrics=metrics)
    early = EarlyStopping(monitor='val_loss',
                          min_delta=1e-3,
                          patience=5,
                          verbose=1,
                          restore_best_weights=True)
    history = finetune_model.fit(train_generator,
                                 epochs=NUM_EPOCHS,
                                 workers=1,
                                 steps_per_epoch=STEP_SIZE_TRAIN,
                                 validation_data=valid_generator,
                                 validation_steps=STEP_SIZE_VALID,
                                 verbose=1,
                                 callbacks=[early])
    val_score = finetune_model.evaluate(valid_generator,
                                        verbose=0,
                                        steps=STEP_SIZE_VALID)
    test_score = finetune_model.evaluate(test_generator,
                                         verbose=0,
                                         steps=STEP_SIZE_TEST)
    print("Val Loss : ", val_score[0])
    print("Test Loss : ", test_score[0])
    print("Val Accuracy : ", val_score[1])
    print("Test Accuracy : ", test_score[1])
    end_model = time.time()
    time_training = end_model - start_model
    print(f"training {architecture}", time_training)
    # Logs are built fresh per call and appended to persistent storage by
    # save_logs.
    logs = []
    logs.append([kfold, iteracion, architecture, val_score[0], val_score[1],
                 test_score[0], test_score[1]])
    logs_time = []
    logs_time.append([kfold, iteracion, architecture, time_training])
    save_logs(logs, 'train', pipeline)
    save_logs(logs_time, 'time', pipeline)
    save_plots(history, kfold, iteracion, architecture, pipeline)
    model_performance['val_acc'] = val_score[1]
    model_performance['test_acc'] = test_score[1]
    if pipeline['save_model']:
        save_path_model = os.path.join(
            pipeline['save_path_model'],
            f'{kfold}_{iteracion}_{architecture}.h5')
        finetune_model.save(save_path_model)
        # (This re-assignment of 'val_acc' is redundant; it was already set
        # above.)
        model_performance['val_acc'] = val_score[1]
        # When saving, callers receive the checkpoint path instead of the
        # live model object.
        return save_path_model, model_performance
    return finetune_model, model_performance
def ssl_global(model_zoo, pipeline):
    """K-fold SSL driver: train the zoo, co-train-evaluate the top 3, label.

    For every fold and every unlabeled batch: fine-tune each backbone, rank by
    validation accuracy, evaluate the top-3 ensemble, pseudo-label the next
    batch and extend the EL training set. Persists per-iteration pickles of
    the evaluation / labeling dataframes and label statistics.

    NOTE(review): ``datos_total``, ``cotrain_list``, ``label_list``, ``logs``
    and ``logs_label`` are not defined in this function — presumably
    module-level accumulators initialized elsewhere; verify before refactoring.
    """
    datos = {}
    datos["df_base"] = get_dataset(pipeline)
    datos = split_train_test(datos, pipeline)
    # Total wall-clock time for the whole experiment.
    import time
    start = time.time()
    split_kfold = pipeline["split_kfold"]  # NOTE(review): read but unused here
    num_kfold = pipeline["num_kfold"]
    for kfold in range(num_kfold):
        models_info = {}
        datos = get_Fold(kfold, datos, pipeline)
        datos_by_fold = {"kfold": kfold, "datos": datos}
        datos_total.append(datos_by_fold)
        df_datos = pd.DataFrame(datos_total)
        datos_path = pipeline["save_path_stats"] + 'exp_' + str(
            pipeline["id"]) + '_' + str(kfold) + '_data.pkl'
        df_datos.to_pickle(datos_path)
        numero_lotes = len(datos["batch_set"])
        for iteracion in range(numero_lotes * 1):
            kfold_info = f"K-FOLD {kfold}/{num_kfold} - ITERACION {iteracion}/{numero_lotes}"
            print("\n")
            print("#" * len(kfold_info))
            print(kfold_info)
            print("#" * len(kfold_info))
            print("\n")
            print("\n")
            print(f"CLASS DISTRIBUTION - BATCH_SET {iteracion}")
            print(datos["batch_set"][iteracion].groupby(
                pipeline["y_col_name"]).count())
            print(f"OK - CLASS DISTRIBUTION - BATCH_SET {iteracion}")
            print("\n")
            # First iteration uses the labeled seed; later ones retrain on
            # labeled + pseudo-labeled (EL) data.
            if iteracion == 0:
                etapa = 'train'
            else:
                etapa = 'train_EL'
            for model in model_zoo:
                print("AUG_FACTOR: ", pipeline["aug_factor"])
                model_memory, model_performance = training(
                    kfold, etapa, datos, model, iteracion, models_info,
                    classification_metrics, pipeline)
                models_info[model] = {
                    'model_memory': model_memory,
                    'model_performance': model_performance['val_acc']
                }
            # Rank architectures by validation accuracy, keep the top 3.
            df_temp = pd.DataFrame(models_info).T
            top_models = df_temp.sort_values('model_performance',
                                             ascending=False)
            top_models = top_models.reset_index()['index'].values.tolist()[:3]
            mod_top1, arch_top1 = models_info[
                top_models[0]]['model_memory'], top_models[0]
            mod_top2, arch_top2 = models_info[
                top_models[1]]['model_memory'], top_models[1]
            mod_top3, arch_top3 = models_info[
                top_models[2]]['model_memory'], top_models[2]
            # BUG FIX: the original re-imported time and reassigned `start`
            # here, clobbering the experiment-level timer started above, so
            # the final `print(end - start)` measured only the last iteration.
            # Use a dedicated variable for the inference timer instead.
            infer_start = time.time()
            print("EVALUATING CO-TRAINING ...")
            print("\n")
            cotrain_acc, cotrain_infer_dfs = evaluate_cotrain(
                mod_top1, mod_top2, mod_top3, arch_top1, arch_top2, arch_top3,
                datos, etapa, kfold, iteracion, pipeline, models_info, logs)
            print("Co-train: ", cotrain_acc)
            df_cotrain_info = {
                "kfold": kfold,
                "iteracion": iteracion,
                "df_arch1": cotrain_infer_dfs[0],
                "df_arch2": cotrain_infer_dfs[1],
                "df_arch3": cotrain_infer_dfs[2]
            }
            cotrain_list.append(df_cotrain_info)
            df_cotrain_list = pd.DataFrame(cotrain_list)
            infer_pkl = pipeline["save_path_stats"] + 'exp_' + str(
                pipeline["id"]) + '_' + str(iteracion) + '_cotrain_eval.pkl'
            print("SAVING COTRAIN EVAL PICKLE")
            df_cotrain_list.to_pickle(infer_pkl)
            print("OK - SAVING COTRAIN EVAL PICKLE")
            print("\n")
            print("OK - EVALUATING CO-TRAINING")
            infer_time = time.time() - infer_start
            # Inference time for the whole df_test, per iteration and fold.
            print(infer_time, len(datos["df_test"]))
            logs_infer_time = []
            logs_infer_time.append([
                kfold, iteracion, 'co-train', infer_time,
                len(datos["df_test"])
            ])
            save_logs(logs_infer_time, 'infer_time', pipeline)
            print(f"GETTING BATCH_SET OF ITERATION {iteracion}...")
            # Next unlabeled batch; label column is a dummy '0' placeholder.
            df_batchset = datos["batch_set"][iteracion]
            df_batchset.columns = [
                pipeline["x_col_name"], pipeline["y_col_name"]
            ]
            df_batchset[pipeline["y_col_name"]] = '0'
            datos['df_batchset'] = df_batchset
            print("LABELING ...")
            datos, EL_iter, LC_iter, label_infer_df = labeling(
                etapa, mod_top1, mod_top2, mod_top3, arch_top1, arch_top2,
                arch_top3, datos, pipeline, iteracion, models_info)
            df_label_info = {
                "kfold": kfold,
                "iteracion": iteracion,
                "df_arch1": label_infer_df[0],
                "df_arch2": label_infer_df[1],
                "df_arch3": label_infer_df[2]
            }
            label_list.append(df_label_info)
            df_label_list = pd.DataFrame(label_list)
            label_pkl = pipeline["save_path_stats"] + 'exp_' + str(
                pipeline["id"]) + '_' + str(iteracion) + '_labeling.pkl'
            print("SAVING LABEL PICKLE")
            df_label_list.to_pickle(label_pkl)
            print("OK - SAVING LABEL PICKLE")
            print("OK - LABELING")
            print("EL_iter", len(EL_iter))
            print("LC_iter", len(LC_iter))
            df_EL = pd.DataFrame(datos["EL"],
                                 columns=[
                                     pipeline["x_col_name"],
                                     pipeline["y_col_name"], 'arch_scores'
                                 ])
            df_LC = pd.DataFrame(datos["LC"],
                                 columns=[
                                     pipeline["x_col_name"],
                                     pipeline["y_col_name"], 'arch_scores'
                                 ])
            df_EL.to_pickle(pipeline["save_path_stats"] + 'exp_' +
                            str(pipeline["id"]) + '_' + str(iteracion) +
                            '_EL.pickle')
            df_LC.to_pickle(pipeline["save_path_stats"] + 'exp_' +
                            str(pipeline["id"]) + '_' + str(iteracion) +
                            '_LC.pickle')
            df_label_stats = label_stats(df_EL, df_LC, pipeline)
            df_label_stats.to_pickle(pipeline["save_path_stats"] + 'exp_' +
                                     str(pipeline["id"]) + '_' +
                                     str(iteracion) + '_stats.pickle')
            # Next round trains on the labeled seed plus all EL pseudo-labels.
            df_train_EL = pd.concat([datos["df_train"], df_EL.iloc[:, :2]])
            datos['df_train_EL'] = df_train_EL
            ssl_th = pipeline["ssl_threshold"]
            logs_label.append([
                kfold, iteracion, arch_top1, arch_top2, arch_top3,
                len(EL_iter), len(LC_iter), ssl_th
            ])
            save_logs(logs_label, 'label', pipeline)
            # Free GPU/session memory between iterations.
            reset_keras(pipeline)
    end = time.time()
    print(end - start)
def training(kfold, etapa, datos, architecture, iteracion, models_info,
             classification_metrics, pipeline):
    """Fine-tune one backbone for the current SSL iteration (two-test variant).

    Builds augmented train / validation / two test generators from the
    dataframes in ``datos``, applies the configured transfer-learning
    strategy, trains with the configured callbacks (early stopping, LR
    reduction, checkpointing) and evaluates on both test splits.

    Parameters
    ----------
    kfold, iteracion : int
        Current fold / SSL iteration, used for file names and logs.
    etapa : str
        'train' (initial supervised stage, uses ``datos['df_train']``) or
        'train_EL' (retraining with pseudo-labels, uses
        ``datos['df_train_EL']``).
    datos : dict
        Holds 'df_train' / 'df_train_EL' / 'df_val' / 'df_test1' / 'df_test2'.
    architecture : str
        Backbone identifier passed to ``get_model``.
    models_info : dict
        Per-architecture info from previous iterations (consumed by
        ``get_model``).
    classification_metrics : callable
        Returns (precision, recall, f1, support, ...) for a fitted model on a
        test generator.
    pipeline : dict
        Experiment configuration.

    Returns
    -------
    (model_or_path, dict)
        The in-memory best model when checkpointing is enabled, otherwise the
        path the final model was saved to; plus val/test accuracies.
    """
    import time
    callbacks_finetune = []
    start_model = time.time()
    base_model, preprocess_input = get_model(architecture, iteracion,
                                             models_info, pipeline)
    model_performance = {}
    print("USING TRANSFORMATIONS FROM SSL_TRAIN")
    datagen = ImageDataGenerator(
        preprocessing_function=preprocess_input,
        rotation_range=40,
        width_shift_range=0.1,
        height_shift_range=0.1,
        shear_range=0.01,
        zoom_range=[0.9, 1.25],
        horizontal_flip=True,
        vertical_flip=False,
        fill_mode='reflect',
    )
    print("OK - USING TRANSFORMATIONS FROM SSL_TRAIN")
    if etapa == 'train':
        print("CREATING GENERATOR FOR TRAIN FROM SSL_TRAIN")
        train_generator = datagen.flow_from_dataframe(
            dataframe=datos['df_train'],
            x_col=pipeline["x_col_name"],
            y_col=pipeline["y_col_name"],
            target_size=(pipeline['img_height'], pipeline['img_width']),
            class_mode='categorical',
            batch_size=pipeline["batch_size"],
            seed=42,
            shuffle=True)
        print("OK - CREATING GENERATOR FOR TRAIN FROM SSL_TRAIN")
        print("CLASS DISTRIBUTION - TRAIN INIT")
        print(datos['df_train'].groupby(pipeline["y_col_name"]).count())
        print("OK - CLASS DISTRIBUTION - TRAIN INIT")
        y_train_unique = datos['df_train'][pipeline["y_col_name"]].unique()
        df_y_train_unique = datos['df_train'][pipeline["y_col_name"]]
        print("CLASS Y_TRAIN UNIQUE")
        print(y_train_unique)
        print("OK - CLASS Y_TRAIN UNIQUE")
    if etapa == 'train_EL':
        print("CREATING GENERATOR FOR TRAIN FROM SSL_TRAIN")
        train_generator = datagen.flow_from_dataframe(
            dataframe=datos['df_train_EL'],
            x_col=pipeline["x_col_name"],
            y_col=pipeline["y_col_name"],
            target_size=(pipeline['img_height'], pipeline['img_width']),
            class_mode='categorical',
            batch_size=pipeline["batch_size"],
            seed=42,
            shuffle=True)
        print("OK - CREATING GENERATOR FOR TRAIN_EL FROM SSL_TRAIN")
        print("CLASS DISTRIBUTION - TRAIN EL")
        print(datos['df_train_EL'].groupby(pipeline["y_col_name"]).count())
        # (Log text says "TRAIN INIT" here in the original; kept as-is.)
        print("OK - CLASS DISTRIBUTION - TRAIN INIT")
        y_train_unique = datos['df_train_EL'][pipeline["y_col_name"]].unique()
        df_y_train_unique = datos['df_train_EL'][pipeline["y_col_name"]]
    if len(datos['df_val']) > 0:
        # NOTE(review): valid_generator is used unconditionally below, so an
        # empty df_val would raise NameError — presumably the pipeline always
        # provides validation data; confirm.
        print("CREATING GENERATOR FOR VAL FROM SSL_TRAIN")
        val_datagen = ImageDataGenerator(
            preprocessing_function=preprocess_input)
        valid_generator = val_datagen.flow_from_dataframe(
            dataframe=datos['df_val'],
            x_col=pipeline["x_col_name"],
            y_col=pipeline["y_col_name"],
            batch_size=pipeline["batch_size"],
            seed=42,
            shuffle=True,
            class_mode="categorical",
            target_size=(pipeline['img_height'], pipeline['img_width']))
        print("OK - CREATING GENERATOR FOR VAL FROM SSL_TRAIN")
    # Two held-out test sets; never shuffled so predictions stay aligned with
    # filenames. Column names carry a '1'/'2' suffix in their dataframes.
    print("CREATING GENERATOR FOR TEST FROM SSL_TRAIN")
    test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
    test_generator1 = test_datagen.flow_from_dataframe(
        dataframe=datos['df_test1'],
        x_col=pipeline["x_col_name"] + '1',
        y_col=pipeline["y_col_name"] + '1',
        batch_size=pipeline["batch_size"],
        seed=42,
        shuffle=False,
        class_mode="categorical",
        target_size=(pipeline['img_height'], pipeline['img_width']))
    test_generator2 = test_datagen.flow_from_dataframe(
        dataframe=datos['df_test2'],
        x_col=pipeline["x_col_name"] + '2',
        y_col=pipeline["y_col_name"] + '2',
        batch_size=pipeline["batch_size"],
        seed=42,
        shuffle=False,
        class_mode="categorical",
        target_size=(pipeline['img_height'], pipeline['img_width']))
    print("OK - CREATING GENERATOR FOR TEST FROM SSL_TRAIN")
    num_classes = len(datos["df_train"][pipeline["y_col_name"]].unique())
    print("NUM CLASSES", num_classes)
    # Transfer-learning strategy: 'classic' replaces the head (optionally
    # restarting weights every iteration), 'soft' unfreezes per stage config.
    if pipeline["transfer_learning"] == "classic":
        if pipeline["restart_weights"]:
            print("TRANSFER LEARNING - CLASSIC + YES RESTART WEIGHTS")
            finetune_model = transfer_learning_classic(base_model, num_classes,
                                                       pipeline)
            print("OK - TRANSFER LEARNING - CLASSIC + YES RESTART WEIGHTS")
        else:
            if etapa == 'train':
                print("TRANSFER LEARNING - TRAIN + CLASSIC")
                finetune_model = transfer_learning_classic(
                    base_model, num_classes, pipeline)
                print("OK - TRANSFER LEARNING - TRAIN + CLASSIC")
            elif etapa == 'train_EL':
                print(
                    "TRANSFER LEARNING - TRAIN_EL + CLASSIC + NO RESTART WEIGHTS"
                )
                # Continue training the model from the previous iteration.
                finetune_model = base_model
                print(
                    "OK - TRANSFER LEARNING - TRAIN_EL + CLASSIC + NO RESTART WEIGHTS"
                )
    elif pipeline["transfer_learning"] == "soft":
        print("TRANSFER LEARNING - TRAIN + SOFT")
        finetune_model = transfer_learning_soft(
            base_model, num_classes, pipeline["stage_config"][iteracion])
        print("OK - TRANSFER LEARNING - TRAIN + SOFT")
    # Epochs / augmentation factor either per-stage or global.
    if pipeline["use_stage_config"]:
        NUM_EPOCHS = pipeline["stage_config"][iteracion]["train_epochs"]
        AUG_FACTOR = pipeline["stage_config"][iteracion]["aug_factor"]
    else:
        NUM_EPOCHS = pipeline["train_epochs"]
        AUG_FACTOR = pipeline["aug_factor"]
    print("\n")
    print("USING AUG_FACTOR OF: ", AUG_FACTOR)
    print("\n")
    if etapa == 'train':
        num_train_images = len(datos['df_train']) * AUG_FACTOR
    if etapa == 'train_EL':
        num_train_images = len(datos['df_train_EL']) * AUG_FACTOR
    STEP_SIZE_TRAIN = num_train_images // train_generator.batch_size
    STEP_SIZE_VALID = valid_generator.n // valid_generator.batch_size
    STEP_SIZE_TEST1 = test_generator1.n // test_generator1.batch_size
    STEP_SIZE_TEST2 = test_generator2.n // test_generator2.batch_size
    metrics = ['accuracy']
    loss = 'categorical_crossentropy'
    # Learning-rate source depends on the transfer-learning mode.
    if pipeline["transfer_learning"] == "classic":
        LR = pipeline['learning_rate']
    elif pipeline["transfer_learning"] == "soft":
        LR = pipeline["stage_config"][iteracion]['LR']
    print(f"LEARNING RATE: {LR}")
    optimizer = Adam(lr=float(LR))
    finetune_model.compile(optimizer, loss=loss, metrics=metrics)
    if pipeline["early_stopping"]:
        early = EarlyStopping(monitor='val_loss',
                              min_delta=1e-3,
                              patience=pipeline["early_stopping_patience"],
                              verbose=1,
                              restore_best_weights=True)
        callbacks_finetune.append(early)
    if pipeline["reduce_lr"]:
        print("USING REDUCE_LR")
        reduce_lr_loss = ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.1,
            patience=pipeline["reduce_lr_patience"],
            verbose=1,
            mode='min')
        callbacks_finetune.append(reduce_lr_loss)
        print(reduce_lr_loss)
        print(callbacks_finetune)
        print("OK - USING REDUCE_LR")
    if pipeline["checkpoint"]:
        mod_filename = f'{kfold}_{architecture}_{iteracion}.h5'
        mod_path = os.path.join(pipeline["save_path_models"], mod_filename)
        mcp_save = ModelCheckpoint(mod_path,
                                   save_best_only=True,
                                   monitor='val_loss',
                                   verbose=1,
                                   mode='auto',
                                   save_weights_only=False)
        callbacks_finetune.append(mcp_save)
    if len(callbacks_finetune) == 0:
        callbacks_finetune = None
    if pipeline["class_weight"]:
        # (Dropped the unused `numpy` / `sklearn.utils.class_weight` imports
        # that only served a commented-out compute_class_weight path.)
        print("CALCULATING CLASS_WEIGHTS")
        if etapa == 'train':
            class_weights = calculate_weights(datos['df_train'], pipeline)
        if etapa == 'train_EL':
            class_weights = calculate_weights(datos['df_train_EL'], pipeline)
        print("OK - CALCULATING CLASS_WEIGHTS")
        print("USING CLASS WEIGHTING")
        print(class_weights)
        print("OK - CLASS WEIGHTING")
    else:
        class_weights = None
    print("CALLBACKS FINETUNE")
    print(callbacks_finetune)
    print("OK - CALLBACKS FINETUNE")
    history = finetune_model.fit(
        train_generator,
        epochs=NUM_EPOCHS,
        workers=1,
        steps_per_epoch=STEP_SIZE_TRAIN,
        validation_data=valid_generator,
        validation_steps=STEP_SIZE_VALID,
        verbose=1,
        callbacks=callbacks_finetune,
        class_weight=class_weights,
    )
    val_score = finetune_model.evaluate(valid_generator,
                                        verbose=0,
                                        steps=STEP_SIZE_VALID)
    test1_score = finetune_model.evaluate(test_generator1,
                                          verbose=0,
                                          steps=STEP_SIZE_TEST1)
    test2_score = finetune_model.evaluate(test_generator2,
                                          verbose=0,
                                          steps=STEP_SIZE_TEST2)
    class_metrics1 = classification_metrics(finetune_model, train_generator,
                                            test_generator1, STEP_SIZE_TEST1,
                                            kfold, iteracion, architecture,
                                            pipeline)
    class_metrics2 = classification_metrics(finetune_model, train_generator,
                                            test_generator2, STEP_SIZE_TEST2,
                                            kfold, iteracion, architecture,
                                            pipeline)
    print("Val Loss : ", val_score[0])
    print("Test1 Loss : ", test1_score[0])
    print("Test2 Loss : ", test2_score[0])
    print("Val Accuracy : ", val_score[1])
    print("Test1 Accuracy : ", test1_score[1])
    print("Test2 Accuracy : ", test2_score[1])
    print(f"Test1 Precision: {class_metrics1[0]}")
    print(f"Test1 Recall: {class_metrics1[1]}")
    print(f"Test1 F1-Score: {class_metrics1[2]}")
    print(f"Test1 Support: {class_metrics1[3]}")
    print(f"Test2 Precision: {class_metrics2[0]}")
    print(f"Test2 Recall: {class_metrics2[1]}")
    print(f"Test2 F1-Score: {class_metrics2[2]}")
    print(f"Test2 Support: {class_metrics2[3]}")
    end_model = time.time()
    time_training = end_model - start_model
    print(f"training time of - {architecture}", time_training)
    if pipeline["checkpoint"]:
        from tensorflow.keras.models import load_model
        print(f"LOADING BEST MODEL FROM {mod_path}")
        model_checkpoint_finetune = load_model(mod_path, compile=True)
        # Re-score with the best checkpoint; val_score feeds the logs below.
        val_score = model_checkpoint_finetune.evaluate(valid_generator,
                                                       verbose=1,
                                                       steps=STEP_SIZE_VALID)
        # BUG FIX: the original referenced `test_generator` / `STEP_SIZE_TEST`,
        # which do not exist in this function (only test_generator1/2), so
        # enabling checkpointing raised NameError. Evaluate on test set 1.
        test_score = model_checkpoint_finetune.evaluate(test_generator1,
                                                        verbose=1,
                                                        steps=STEP_SIZE_TEST1)
        print("Val Accuracy from Best: ", val_score[1])
        print("Test Accuracy from Best: ", test_score[1])
    logs = []
    logs.append([
        kfold, iteracion, architecture, val_score[0], val_score[1],
        test1_score[0], test1_score[1], class_metrics1[0], class_metrics1[1],
        class_metrics1[2], test2_score[0], test2_score[1], class_metrics2[0],
        class_metrics2[1], class_metrics2[2]
    ])
    logs_time = []
    logs_time.append([kfold, iteracion, architecture, time_training])
    save_logs(logs, 'train', pipeline)
    save_logs(logs_time, 'time', pipeline)
    save_plots(history, kfold, iteracion, architecture, pipeline)
    model_performance['val_acc'] = val_score[1]
    model_performance['test1_acc'] = test1_score[1]
    model_performance['test2_acc'] = test2_score[1]
    exp_id = str(pipeline["id"])
    if pipeline['checkpoint']:
        # Checkpointing returns the reloaded best model directly.
        return model_checkpoint_finetune, model_performance
    save_path_model = os.path.join(
        pipeline['save_path_model'], pipeline['dataset_base'],
        f'exp_{exp_id}_{kfold}_{iteracion}_{architecture}.h5')
    start = time.time()
    print(f"SAVING MODEL ON {save_path_model}")
    finetune_model.save(save_path_model)
    print(f"OK - SAVING MODEL ON {save_path_model}")
    end = time.time()
    end_time = end - start
    print(f"TOTAL TIME TO SAVE: {end_time}")
    # (Removed a redundant re-assignment of model_performance['val_acc'] and
    # an unreachable trailing `return finetune_model, model_performance`.)
    return save_path_model, model_performance
def evaluate_cotrain(modelo1, modelo2, modelo3,
                     arquitectura1, arquitectura2, arquitectura3,
                     datos, etapa, kfold, iteracion, pipeline, models_info, logs):
    """Evaluate the top-3 models as a co-training ensemble on the test split.

    Voting rule per sample:
      * if model 1 agrees with model 2 or with model 3, model 1's prediction
        wins (tagged "democ" = democratic agreement);
      * else if models 2 and 3 agree, model 2's prediction wins ("democ");
      * otherwise the prediction of the most confident model wins ("max").

    Ground truth is the class directory name embedded in each file path.
    Appends accuracy plus precision/recall/F1/support to ``logs`` and persists
    them via ``save_logs``.

    Returns
    -------
    (float, list)
        Ensemble accuracy and the three per-model prediction dataframes.
    """
    # One (train_generator, test_generator, step_count) triple per
    # architecture — each model must be fed through its own preprocessing.
    train_generator_arch1, test_generator_arch1, STEP_SIZE_TEST_arch1 = generadores(
        etapa, arquitectura1, datos, pipeline, False, iteracion, models_info)
    train_generator_arch2, test_generator_arch2, STEP_SIZE_TEST_arch2 = generadores(
        etapa, arquitectura2, datos, pipeline, False, iteracion, models_info)
    train_generator_arch3, test_generator_arch3, STEP_SIZE_TEST_arch3 = generadores(
        etapa, arquitectura3, datos, pipeline, False, iteracion, models_info)
    # Per-model prediction frames with columns Filename / Predictions /
    # Max_Probability, as consumed below.
    df1 = evaluar(modelo1, train_generator_arch1, test_generator_arch1, STEP_SIZE_TEST_arch1)
    df2 = evaluar(modelo2, train_generator_arch2, test_generator_arch2, STEP_SIZE_TEST_arch2)
    df3 = evaluar(modelo3, train_generator_arch3, test_generator_arch3, STEP_SIZE_TEST_arch3)
    import numpy as np
    predicciones = []        # [filename, ensemble prediction] per sample
    predicciones_logs = []   # per-sample audit trail (built but not persisted here)
    for i in range(len(df1)):
        c1 = (df1['Predictions'][i] == df2['Predictions'][i])
        c2 = (df1['Predictions'][i] == df3['Predictions'][i])
        c3 = (df2['Predictions'][i] == df3['Predictions'][i])
        if c1 or c2:
            # Model 1 agrees with at least one other model.
            predicciones.append([df1['Filename'][i], df1['Predictions'][i]])
            selected = df1['Predictions'][i]
            prob_selected = df1["Max_Probability"][i]
            predicciones_logs.append([df1['Filename'][i], selected, prob_selected, "democ",
                                      df1['Predictions'][i], df1['Max_Probability'][i],
                                      df2["Predictions"][i], df2['Max_Probability'][i],
                                      df3["Predictions"][i], df3['Max_Probability'][i]])
        elif c3:
            # Models 2 and 3 agree; model 1 is the outlier.
            predicciones.append([df2['Filename'][i], df2['Predictions'][i]])
            selected = df2['Predictions'][i]
            prob_selected = df2["Max_Probability"][i]
            predicciones_logs.append([df1['Filename'][i], selected, prob_selected, "democ",
                                      df1['Predictions'][i], df1['Max_Probability'][i],
                                      df2["Predictions"][i], df2['Max_Probability'][i],
                                      df3["Predictions"][i], df3['Max_Probability'][i]])
        else:
            # Full disagreement: trust the most confident model.
            probabilidades = np.array([df1['Max_Probability'][i],
                                       df2['Max_Probability'][i],
                                       df3['Max_Probability'][i]])
            indice_prob_max = probabilidades.argmax()
            clases = np.array([df1['Predictions'][i],
                               df2['Predictions'][i],
                               df3['Predictions'][i]])
            real = np.array([df1['Filename'][i],
                             df2['Filename'][i],
                             df3['Filename'][i]])
            predicciones.append([real[indice_prob_max], clases[indice_prob_max]])
            selected = clases[indice_prob_max]
            prob_selected = probabilidades[indice_prob_max]
            predicciones_logs.append([df1['Filename'][i], selected, prob_selected, "max",
                                      df1['Predictions'][i], df1['Max_Probability'][i],
                                      df2["Predictions"][i], df2['Max_Probability'][i],
                                      df3["Predictions"][i], df3['Max_Probability'][i]])
    results = pd.DataFrame(predicciones, columns=["filename", "predictions"])
    # Ground-truth label = class directory (second-to-last path component).
    results['filename'] = results['filename'].apply(lambda x: x.split('/')[-2])
    y_true = results['filename'].values.tolist()
    y_pred = results['predictions'].values.tolist()
    labels_arch1 = (train_generator_arch1.class_indices)
    print("LABELS CO-TRAIN")
    print([*labels_arch1])
    architecture = 'co-train'
    # pipeline["metrics"] selects the sklearn averaging mode (e.g. 'macro').
    class_metrics = precision_recall_fscore_support(y_true, y_pred,
                                                    average=pipeline["metrics"])
    # TODO Bugfix Calculate Confusion Matrix: the confusion-matrix
    # computation, optional normalization, plotting and per-class accuracy
    # logging that belong here (calculate_confusion_matrix,
    # save_confusion_matrix, plot_confusion_matrix, accuracy_by_class) are
    # disabled pending a fix.
    from sklearn.metrics import accuracy_score
    co_train_accu = accuracy_score(y_true, y_pred)
    logs.append([kfold, iteracion, architecture, None, None, None, co_train_accu,
                 class_metrics[0], class_metrics[1], class_metrics[2], class_metrics[3]])
    print(f"Co-train Accuracy: {co_train_accu}")
    print(f"Co-train Precision: {class_metrics[0]}")
    print(f"Co-train Recall: {class_metrics[1]}")
    print(f"Co-train F1-Score: {class_metrics[2]}")
    print(f"Co-train Support: {class_metrics[3]}")
    save_logs(logs, 'train', pipeline)
    return co_train_accu, [df1, df2, df3]
def ssl_global(archivos, model_zoo, csvs, pipeline):
    """Semi-supervised co-training loop over balanced folds (gleasson dataset).

    Splits the labeled data into balanced folds, trains every architecture in
    `model_zoo`, keeps the top-3 by validation accuracy, evaluates their
    co-training ensemble against both pathologists' labels, and iteratively
    pseudo-labels batches of the unlabeled pool U (EL = high-confidence
    pseudo-labels, LC = low-confidence leftovers that get re-batched).

    NOTE(review): appears superseded by the later `ssl_global(model_zoo,
    pipeline)` definition in this module, which shadows this one at import
    time — confirm before relying on it.
    NOTE(review): depends on names not defined in this function (df_train,
    df_test1, df_test2, dataset, x_col_name, y_col_name, numero_lotes,
    semi_method, SEED, train_epochs, batch_epochs, early_stopping, logs_label,
    time, random, tensorflow, np) — presumably module-level globals/imports;
    verify they exist before this runs.
    """
    datos = {}
    models_info = {}
    #df_train, df_val, df_test1, df_test2 = get_data(archivos, csvs, pipeline) ACA VOY
    # Measure execution time
    #import time
    start = time.time()
    # Balanced 4-way split of the (module-level) training frame.
    fold = dividir_balanceado2(df_train, 4)
    for kfold in range(1):
        if dataset == 'gleasson':
            #import pandas as pd
            # fold[k] holds (train_paths, val_paths, train_labels, val_labels).
            df_train_58 = pd.DataFrame([fold[kfold][0], fold[kfold][2]]).T
            df_train_58.columns = [x_col_name, y_col_name]
            df_val = pd.DataFrame([fold[kfold][1], fold[kfold][3]]).T
            df_val.columns = [x_col_name, y_col_name]
            # Second split: carve the labeled train set and the unlabeled pool U.
            fold1 = dividir_balanceado2(df_train_58, 4)
            df_train = pd.DataFrame([fold1[0][1], fold1[0][3]]).T
            df_train.columns = [x_col_name, y_col_name]
            df_train.to_csv('data/train.csv', index=False)
            df_val.to_csv('data/val.csv', index=False)
            df_test1.to_csv('data/test1.csv', index=False)
            df_test2.to_csv('data/test2.csv', index=False)
            df_U = pd.DataFrame([fold1[0][0], fold1[0][2]]).T
            df_U.columns = [x_col_name, y_col_name]
        EL, LC = [], []
        print("train :", len(df_train))
        print("val :", len(df_val))
        print("u :", len(df_U))
        # Segment U into batches for labeling
        batch_set = list(dividir_lotes(df_U, numero_lotes))
        for i in range(len(batch_set)):
            print(len(batch_set[i].iloc[:, 0].values.tolist()))
        datos['df_train'] = df_train
        datos['df_val'] = df_val
        datos['df_test1'] = df_test1
        datos['df_test2'] = df_test2
        for iteracion in range(numero_lotes * 1):
            #import random
            # Re-seed every iteration for reproducibility.
            random.seed(SEED)
            np.random.seed(SEED)
            tensorflow.random.set_random_seed(SEED)
            print("\n######################")
            print("K-FOLD {} - ITERACION {}".format(kfold, iteracion))
            print("######################\n")
            if iteracion == 0:
                etapa = 'train'
            else:
                etapa = 'train_EL'
            print(pipeline["path_label_stats"] + str(pipeline["id"]) + '_' + str(iteracion) + '.pickle')
            # Train every architecture and record its best validation accuracy.
            for model in model_zoo:
                model_memory, model_performance = entrenamiento(
                    kfold, etapa, datos, model, train_epochs, batch_epochs,
                    early_stopping, iteracion, models_info, pipeline)
                models_info[model] = {
                    'model_memory': model_memory,
                    'model_performance': model_performance['val_acc']
                }
            #import pandas as pd
            # Rank models by val_acc and keep the top-3 for co-training.
            df_temp = pd.DataFrame(models_info).T
            top_models = df_temp.sort_values('model_performance', ascending=False)
            top_models = top_models.reset_index()['index'].values.tolist()[:3]
            mod_top1, arch_top1 = models_info[top_models[0]]['model_memory'], top_models[0]
            mod_top2, arch_top2 = models_info[top_models[1]]['model_memory'], top_models[1]
            mod_top3, arch_top3 = models_info[top_models[2]]['model_memory'], top_models[2]
            if dataset == 'gleasson':
                # NOTE(review): this call passes a dataset name instead of the
                # trailing `logs` argument expected by the `evaluate_cotrain`
                # defined above — confirm which overload/version is current.
                print(
                    "\nCo-train1: \n",
                    evaluate_cotrain(mod_top1, mod_top2, mod_top3, arch_top1,
                                     arch_top2, arch_top3, 'gleasson-patologo1',
                                     datos, etapa, kfold, iteracion, pipeline,
                                     models_info))
                print(
                    "\nCo-train2: \n",
                    evaluate_cotrain(mod_top1, mod_top2, mod_top3, arch_top1,
                                     arch_top2, arch_top3, 'gleasson-patologo2',
                                     datos, etapa, kfold, iteracion, pipeline,
                                     models_info))
            if semi_method == 'supervised':
                break
            # Pick the batch to pseudo-label: fresh U batches first, then the
            # re-batched low-confidence samples once U is exhausted.
            if iteracion < numero_lotes:
                df_batchset = batch_set[iteracion]
                df_batchset.columns = [x_col_name, y_col_name]
                df_batchset[y_col_name] = '0'
            else:
                if iteracion == numero_lotes:
                    df_LC = pd.DataFrame(LC)
                    batch_set_LC = list(dividir_lotes(df_LC, numero_lotes))
                    for i in range(len(batch_set_LC)):
                        print(len(batch_set_LC[i].iloc[:, 0].values.tolist()))
                    LC = []
                df_batchset = pd.DataFrame([
                    batch_set_LC[int(iteracion - numero_lotes)].iloc[:, 0].values.tolist()
                ]).T
                df_batchset.columns = [x_col_name]
                df_batchset[y_col_name] = '0'
            datos['df_batchset'] = df_batchset
            EL, LC, EL_iter, LC_iter = labeling(etapa, mod_top1, mod_top2,
                                                mod_top3, arch_top1, arch_top2,
                                                arch_top3, EL, LC, datos,
                                                pipeline, iteracion, models_info)
            #logs_label.append([kfold,iteracion,arch_top1,arch_top2,arch_top3,len(EL_iter),len(LC_iter)])
            #save_logs(logs_label,'label',pipeline)
            #df_EL = pd.DataFrame(EL, columns=[x_col_name, y_col_name, 'arch_scores'])
            #df_LC = pd.DataFrame(LC, columns=[x_col_name, y_col_name, 'arch_scores'])
            df_EL = pd.DataFrame(
                EL_iter, columns=[x_col_name, y_col_name, 'arch_scores'])  # EXP30
            df_LC = pd.DataFrame(
                LC_iter, columns=[x_col_name, y_col_name, 'arch_scores'])  # EXP30
            df_label_stats = label_stats(df_EL, df_LC)
            print(df_label_stats)
            df_label_stats.to_pickle(pipeline["path_label_stats"] +
                                     str(pipeline["id"]) + '_' +
                                     str(iteracion) + '.pickle')
            #df_train_EL = pd.concat([df_train,df_EL.iloc[:,:2]])
            df_train_EL = df_EL.iloc[:, :2].copy()  # EXP30
            #print(df_train)
            print("df_train_EL")
            print(df_train_EL)
            #print(df_EL.iloc[:,:2])
            #print(df_train_EL)
            datos['df_train_EL'] = df_train_EL
            df_EL_stats = df_label_stats["df_EL_stats"]["df"]
            df_LC_stats = df_label_stats["df_LC_stats"]["df"]
            df_U_iter = pd.concat([df_EL_stats, df_LC_stats], ignore_index=True)
            #df_U_iter.describe()["arch_scores_mean"]["25%"]
            #df_U_iter = pd.concat([df_EL,df_LC], ignore_index=True)
            #ssl_th = df_U_iter.describe()["arch_scores_mean"]["mean"] #EXP 33
            #print("df_U_describe")
            #print(f"MEAN U_{iteracion}: {ssl_th}")
            #print(df_U_iter.describe())
            #ssl_th = df_U_iter.describe()["arch_scores_mean"]["25%"]
            #print(f"MEAN U_{iteracion}: {ssl_th}")
            #print(f" P25 U_{iteracion}: {ssl_th}")
            #print(f"NUEVO UMBRAL PARA SSL: {ssl_th}")
            #pipeline["ssl_threshold"] = ssl_th
            # NOTE(review): `ssl_th` is only assigned in the commented-out block
            # above, so this append raises NameError as written — fix or restore
            # the threshold computation before using this version.
            logs_label.append([
                kfold, iteracion, arch_top1, arch_top2, arch_top3,
                len(EL_iter), len(LC_iter), ssl_th
            ])
            save_logs(logs_label, 'label', pipeline)
            #reset_keras()
            #models_info = []
    end = time.time()
    print(end - start)
def ssl_global(model_zoo, pipeline):
    """Run the supervised or semi-supervised (co-training) pipeline.

    Loads the dataset, then for each stage: trains every model in
    ``model_zoo``, keeps the top-3 by validation accuracy, evaluates their
    co-training ensemble against both pathologists, and — in semi-supervised
    mode — pseudo-labels the next batch of unlabeled data and grows the
    training set with the high-confidence samples (EL).

    Args:
        model_zoo: iterable of model identifiers understood by ``training``.
        pipeline: configuration dict (method, labeling_method, save paths,
            column names, thresholds, ...).

    Raises:
        ValueError: if pipeline["method"] / pipeline["labeling_method"] is an
            unsupported combination (previously crashed later with NameError).

    NOTE(review): relies on module-level globals (logs, logs_label,
    cotrain_list, label_list, classification_metrics) — verify they are
    initialized before this runs.
    """
    import time

    datos = get_dataset(pipeline)
    method = pipeline["method"]
    models_info = {}

    # Fix: `kfold` was previously only assigned in the supervised branch,
    # raising NameError in semi-supervised mode; default it up front.
    kfold = 0
    if method == "semi-supervised":
        datos = get_Fold(kfold, datos, pipeline)

    # Total runtime timer (kept distinct from the per-stage timer below).
    start = time.time()

    if method == "supervised":
        total_stages = 1
    elif pipeline["labeling_method"] == 'decision' and method == "semi-supervised":
        total_stages = len(datos["batch_set"])
    elif pipeline["labeling_method"] == 'democratic' and method == "semi-supervised":
        total_stages = pipeline["labeling_stages"]
    else:
        # Fix: previously fell through silently (`pass`) and crashed later
        # with NameError on `total_stages`; fail fast with a clear message.
        raise ValueError(
            f"Unsupported method/labeling_method combination: "
            f"{method!r}/{pipeline.get('labeling_method')!r}")

    for iteracion in range(total_stages * 1):
        if method == "semi-supervised":
            print("\n")
            print(f"CLASS DISTRIBUTION - BATCH_SET {iteracion}")
            if len(datos["LC"]) > 0:
                # Once a low-confidence pool exists, it becomes the set to label.
                U_set = pd.DataFrame(datos["LC"], columns=[
                    pipeline["x_col_name"], pipeline["y_col_name"], 'arch_scores'
                ])
            else:
                U_set = datos['U']
            print(U_set.groupby(pipeline["y_col_name"]).count())
            print(f"OK - CLASS DISTRIBUTION - BATCH_SET {iteracion}")
            print("\n")

        etapa = 'train' if iteracion == 0 or method == "supervised" else 'train_EL'

        # Train every architecture; remember its best validation accuracy.
        for model in model_zoo:
            print("AUG_FACTOR: ", pipeline["aug_factor"])
            model_memory, model_performance = training(kfold, etapa, datos,
                                                       model, iteracion,
                                                       models_info,
                                                       classification_metrics,
                                                       pipeline)
            models_info[model] = {
                'model_memory': model_memory,
                'model_performance': model_performance['val_acc']
            }

        # Rank models by val_acc and keep the top-3 for co-training.
        df_temp = pd.DataFrame(models_info).T
        top_models = df_temp.sort_values('model_performance', ascending=False)
        top_models = top_models.reset_index()['index'].values.tolist()[:3]
        mod_top1, arch_top1 = models_info[top_models[0]]['model_memory'], top_models[0]
        mod_top2, arch_top2 = models_info[top_models[1]]['model_memory'], top_models[1]
        mod_top3, arch_top3 = models_info[top_models[2]]['model_memory'], top_models[2]

        # Fix: use a dedicated timer so the overall `start` above is not
        # clobbered by the per-stage inference timing.
        infer_start = time.time()
        print("EVALUATING CO-TRAINING ...")
        print("\n")
        # NOTE(review): these calls pass a pathologist id plus `logs`; confirm
        # the active `evaluate_cotrain` signature accepts both.
        cotrain_acc1, cotrain_infer_dfs1 = evaluate_cotrain(
            mod_top1, mod_top2, mod_top3, arch_top1, arch_top2, arch_top3,
            datos, etapa, kfold, iteracion, pipeline, models_info,
            'patologo1', logs)
        print("Co-train - Patologo 1: ", cotrain_acc1)
        cotrain_acc2, cotrain_infer_dfs2 = evaluate_cotrain(
            mod_top1, mod_top2, mod_top3, arch_top1, arch_top2, arch_top3,
            datos, etapa, kfold, iteracion, pipeline, models_info,
            'patologo2', logs)
        print("Co-train - Patologo 2: ", cotrain_acc2)

        # Persist per-architecture inference frames for offline analysis.
        df_cotrain_info = {
            "kfold": kfold,
            "iteracion": iteracion,
            "patologo1": {
                "df_arch1": cotrain_infer_dfs1[0],
                "df_arch2": cotrain_infer_dfs1[1],
                "df_arch3": cotrain_infer_dfs1[2]
            },
            "patologo2": {
                "df_arch1": cotrain_infer_dfs2[0],
                "df_arch2": cotrain_infer_dfs2[1],
                "df_arch3": cotrain_infer_dfs2[2]
            },
        }
        cotrain_list.append(df_cotrain_info)
        df_cotrain_list = pd.DataFrame(cotrain_list)
        infer_pkl = pipeline["save_path_stats"] + 'exp_' + str(
            pipeline["id"]) + '_' + str(iteracion) + '_cotrain_eval.pkl'
        print("SAVING COTRAIN EVAL PICKLE")
        df_cotrain_list.to_pickle(infer_pkl)
        print("OK - SAVING COTRAIN EVAL PICKLE")
        print("\n")
        print("OK - EVALUATING CO-TRAINING")

        # Per-iteration inference time over the test set.
        infer_time = time.time() - infer_start
        print(infer_time, len(datos["df_test1"]))
        logs_infer_time = [[kfold, iteracion, 'co-train1', infer_time,
                            len(datos["df_test1"])]]
        save_logs(logs_infer_time, 'infer_time', pipeline)

        if method == "supervised":
            print(f"SUPERVISED METHOD COMPLETED FOR ITERATION: {iteracion}")
            continue

        print(f"GETTING BATCH_SET OF ITERATION {iteracion}...")
        print("LABELING ...")
        if pipeline["labeling_method"] == "decision":
            datos, EL_iter, LC_iter, EL_accu, LC_accu, label_infer_df = labeling(
                etapa, mod_top1, mod_top2, mod_top3, arch_top1, arch_top2,
                arch_top3, datos, pipeline, iteracion, models_info)
        elif pipeline["labeling_method"] == "democratic":
            datos, EL_iter, LC_iter, EL_accu, LC_accu, label_infer_df = labeling_v2(
                etapa, mod_top1, mod_top2, mod_top3, arch_top1, arch_top2,
                arch_top3, datos, pipeline, iteracion, models_info)

        df_label_info = {
            "kfold": kfold,
            "iteracion": iteracion,
            "df_arch1": label_infer_df[0],
            "df_arch2": label_infer_df[1],
            "df_arch3": label_infer_df[2]
        }
        label_list.append(df_label_info)
        df_label_list = pd.DataFrame(label_list)
        label_pkl = pipeline["save_path_stats"] + 'exp_' + str(
            pipeline["id"]) + '_' + str(iteracion) + '_labeling.pkl'
        print("SAVING LABEL PICKLE")
        df_label_list.to_pickle(label_pkl)
        print("OK - SAVING LABEL PICKLE")
        print("OK - LABELING")
        print("EL_iter", len(EL_iter))
        print("LC_iter", len(LC_iter))

        # Snapshot the cumulative EL/LC pools and their stats.
        cols = [pipeline["x_col_name"], pipeline["y_col_name"], 'arch_scores']
        df_EL = pd.DataFrame(datos["EL"], columns=cols)
        df_LC = pd.DataFrame(datos["LC"], columns=cols)
        stats_prefix = pipeline["save_path_stats"] + 'exp_' + str(
            pipeline["id"]) + '_' + str(iteracion)
        df_EL.to_pickle(stats_prefix + '_EL.pickle')
        df_LC.to_pickle(stats_prefix + '_LC.pickle')
        df_label_stats = label_stats(df_EL, df_LC, pipeline)
        df_label_stats.to_pickle(stats_prefix + '_stats.pickle')

        # Grow the training set with the high-confidence pseudo-labels.
        df_train_EL = pd.concat([datos["df_train"], df_EL.iloc[:, :2]])
        datos['df_train_EL'] = df_train_EL

        ssl_th = pipeline["ssl_threshold"]
        logs_label.append([kfold, iteracion, arch_top1, arch_top2, arch_top3,
                           len(EL_iter), len(LC_iter), EL_accu, LC_accu, ssl_th])
        save_logs(logs_label, 'label', pipeline)
        reset_keras(pipeline)

    end = time.time()
    print(end - start)