def evaluate_cotrain(modelo1, modelo2, modelo3, arquitectura1, arquitectura2, arquitectura3,
                     datos, etapa, kfold, iteracion, pipeline, models_info, logs):
    train_generator_arch1, test1_generator_arch1, STEP_SIZE_TEST1_arch1 = generadores(
        etapa, arquitectura1, datos, pipeline, False, iteracion, models_info)
    train_generator_arch2, test1_generator_arch2, STEP_SIZE_TEST1_arch2 = generadores(
        etapa, arquitectura2, datos, pipeline, False, iteracion, models_info)
    train_generator_arch3, test1_generator_arch3, STEP_SIZE_TEST1_arch3 = generadores(
        etapa, arquitectura3, datos, pipeline, False, iteracion, models_info)

    df1 = evaluar(modelo1, train_generator_arch1, test1_generator_arch1, STEP_SIZE_TEST1_arch1)
    df2 = evaluar(modelo2, train_generator_arch2, test1_generator_arch2, STEP_SIZE_TEST1_arch2)
    df3 = evaluar(modelo3, train_generator_arch3, test1_generator_arch3, STEP_SIZE_TEST1_arch3)

    predicciones = []
    for i in range(len(df1)):
        c1 = (df1['Predictions'][i] == df2['Predictions'][i])
        c2 = (df1['Predictions'][i] == df3['Predictions'][i])
        c3 = (df2['Predictions'][i] == df3['Predictions'][i])

        if c1 and c2 and c3:
            # All three models agree: keep the shared prediction.
            predicciones.append([df1['Filename'][i], df1['Predictions'][i]])
        else:
            # Disagreement: keep the prediction of the most confident model.
            probabilidades = np.array([
                df1['Max_Probability'][i],
                df2['Max_Probability'][i],
                df3['Max_Probability'][i]
            ])
            indice_prob_max = probabilidades.argmax()
            clases = np.array([
                df1['Predictions'][i],
                df2['Predictions'][i],
                df3['Predictions'][i]
            ])
            real = np.array(
                [df1['Filename'][i], df2['Filename'][i], df3['Filename'][i]])
            # Index both filename and class by the highest probability; using
            # clases.argmax() here would pick the largest class label instead of
            # the class of the most confident model.
            predicciones.append(
                [real[indice_prob_max], clases[indice_prob_max]])

    results = pd.DataFrame(predicciones, columns=["filename", "predictions"])
    # results['filename'] = results['filename'].apply(lambda x: x.split('/')[-1].split('_')[-1][0])
    results['filename'] = results['filename'].apply(lambda x: x.split('/')[-2])

    y_true = results['filename'].values.tolist()
    y_pred = results['predictions'].values.tolist()

    from sklearn.metrics import accuracy_score
    co_train_accu = accuracy_score(y_true, y_pred)
    co_train_label = 'co-train'

    logs.append(
        [kfold, iteracion, co_train_label, None, None, None, co_train_accu])
    save_logs(logs, 'train', pipeline)

    return co_train_accu
def labeling(etapa, modelo1, modelo2, modelo3, arquitectura1, arquitectura2, arquitectura3,
             datos, pipeline, iteracion, models_info):
    """
    Labels samples using the three co-training models.

    Args:
        etapa (str): Training stage identifier
        modelo1 (bytes): Top1 Model
        modelo2 (bytes): Top2 Model
        modelo3 (bytes): Top3 Model
        arquitectura1 (str): Top1 Arch
        arquitectura2 (str): Top2 Arch
        arquitectura3 (str): Top3 Arch
        datos (dict): Data container, including the EL (enlarged labeled)
            and LC (low confidence) sample lists
        pipeline (dict): General config
        iteracion (int): Semi-supervised stage
        models_info (list): Info about models

    Returns:
        datos (dict): Data container with the EL and LC lists updated
        EL_iter (list): Samples added to EL in this iteration
        LC_iter (list): Samples added to LC in this iteration
    """
    etiquetados_EL = 0
    etiquetados_LC = 0
    EL_iter = []
    LC_iter = []

    train_generator_arch1, batchset_generator_arch1, STEP_SIZE_BATCH_arch1 = generadores(
        etapa, arquitectura1, datos, pipeline, True, iteracion, models_info)
    train_generator_arch2, batchset_generator_arch2, STEP_SIZE_BATCH_arch2 = generadores(
        etapa, arquitectura2, datos, pipeline, True, iteracion, models_info)
    train_generator_arch3, batchset_generator_arch3, STEP_SIZE_BATCH_arch3 = generadores(
        etapa, arquitectura3, datos, pipeline, True, iteracion, models_info)

    df1 = evaluar(modelo1, train_generator_arch1, batchset_generator_arch1, STEP_SIZE_BATCH_arch1)
    df2 = evaluar(modelo2, train_generator_arch2, batchset_generator_arch2, STEP_SIZE_BATCH_arch2)
    df3 = evaluar(modelo3, train_generator_arch3, batchset_generator_arch3, STEP_SIZE_BATCH_arch3)

    for i in range(len(df1)):
        arch_scores = {}
        arch_scores[arquitectura1] = df1['Max_Probability'][i]
        arch_scores[arquitectura2] = df2['Max_Probability'][i]
        arch_scores[arquitectura3] = df3['Max_Probability'][i]

        # Each pair of models must agree on the class and both members of the
        # pair must clear the confidence threshold.
        c1 = (df1['Predictions'][i] == df2['Predictions'][i]) and (
            df1['Max_Probability'][i] > pipeline["ssl_threshold"]) and (
            df2['Max_Probability'][i] > pipeline["ssl_threshold"])
        c2 = (df1['Predictions'][i] == df3['Predictions'][i]) and (
            df1['Max_Probability'][i] > pipeline["ssl_threshold"]) and (
            df3['Max_Probability'][i] > pipeline["ssl_threshold"])
        c3 = (df2['Predictions'][i] == df3['Predictions'][i]) and (
            df2['Max_Probability'][i] > pipeline["ssl_threshold"]) and (
            df3['Max_Probability'][i] > pipeline["ssl_threshold"])

        if c1 and c2 and c3:
            # Unanimous and confident: add the sample to the enlarged labeled set.
            datos["EL"].append(
                [df1['Filename'][i], df1['Predictions'][i], arch_scores])
            EL_iter.append(
                [df1['Filename'][i], df1['Predictions'][i], arch_scores])
            etiquetados_EL += 1
        else:
            # Otherwise keep the sample in the low-confidence set.
            datos["LC"].append(
                [df1['Filename'][i], df1['Predictions'][i], arch_scores])
            LC_iter.append(
                [df1['Filename'][i], df1['Predictions'][i], arch_scores])
            etiquetados_LC += 1

    print('etiquetados EL {} LC {}'.format(etiquetados_EL, etiquetados_LC))

    return datos, EL_iter, LC_iter
def evaluate_cotrain(modelo1, modelo2, modelo3, arquitectura1, arquitectura2, arquitectura3,
                     datos, etapa, kfold, iteracion, pipeline, models_info, logs):
    train_generator_arch1, test_generator_arch1, STEP_SIZE_TEST_arch1 = generadores(
        etapa, arquitectura1, datos, pipeline, False, iteracion, models_info)
    train_generator_arch2, test_generator_arch2, STEP_SIZE_TEST_arch2 = generadores(
        etapa, arquitectura2, datos, pipeline, False, iteracion, models_info)
    train_generator_arch3, test_generator_arch3, STEP_SIZE_TEST_arch3 = generadores(
        etapa, arquitectura3, datos, pipeline, False, iteracion, models_info)

    df1 = evaluar(modelo1, train_generator_arch1, test_generator_arch1, STEP_SIZE_TEST_arch1)
    df2 = evaluar(modelo2, train_generator_arch2, test_generator_arch2, STEP_SIZE_TEST_arch2)
    df3 = evaluar(modelo3, train_generator_arch3, test_generator_arch3, STEP_SIZE_TEST_arch3)

    import numpy as np

    predicciones = []
    predicciones_logs = []

    for i in range(len(df1)):
        c1 = (df1['Predictions'][i] == df2['Predictions'][i])
        c2 = (df1['Predictions'][i] == df3['Predictions'][i])
        c3 = (df2['Predictions'][i] == df3['Predictions'][i])

        if c1 or c2:
            # Model 1 agrees with at least one other model: keep its prediction.
            predicciones.append([df1['Filename'][i], df1['Predictions'][i]])
            selected = df1['Predictions'][i]
            prob_selected = df1["Max_Probability"][i]
            predicciones_logs.append([df1['Filename'][i], selected, prob_selected, "democ",
                                      df1['Predictions'][i], df1['Max_Probability'][i],
                                      df2["Predictions"][i], df2['Max_Probability'][i],
                                      df3["Predictions"][i], df3['Max_Probability'][i]])
        elif c3:
            # Models 2 and 3 agree against model 1: keep their prediction.
            predicciones.append([df2['Filename'][i], df2['Predictions'][i]])
            selected = df2['Predictions'][i]
            prob_selected = df2["Max_Probability"][i]
            predicciones_logs.append([df1['Filename'][i], selected, prob_selected, "democ",
                                      df1['Predictions'][i], df1['Max_Probability'][i],
                                      df2["Predictions"][i], df2['Max_Probability'][i],
                                      df3["Predictions"][i], df3['Max_Probability'][i]])
        else:
            # No agreement: fall back to the most confident model.
            probabilidades = np.array([df1['Max_Probability'][i],
                                       df2['Max_Probability'][i],
                                       df3['Max_Probability'][i]])
            indice_prob_max = probabilidades.argmax()
            clases = np.array([df1['Predictions'][i], df2['Predictions'][i], df3['Predictions'][i]])
            real = np.array([df1['Filename'][i], df2['Filename'][i], df3['Filename'][i]])
            predicciones.append([real[indice_prob_max], clases[indice_prob_max]])
            selected = clases[indice_prob_max]
            prob_selected = probabilidades[indice_prob_max]
            predicciones_logs.append([df1['Filename'][i], selected, prob_selected, "max",
                                      df1['Predictions'][i], df1['Max_Probability'][i],
                                      df2["Predictions"][i], df2['Max_Probability'][i],
                                      df3["Predictions"][i], df3['Max_Probability'][i]])

    results = pd.DataFrame(predicciones, columns=["filename", "predictions"])
    # The ground-truth class is the parent directory of each file.
    results['filename'] = results['filename'].apply(lambda x: x.split('/')[-2])

    y_true = results['filename'].values.tolist()
    y_pred = results['predictions'].values.tolist()

    labels_arch1 = train_generator_arch1.class_indices
    print("LABELS CO-TRAIN")
    print([*labels_arch1])

    architecture = 'co-train'
    class_metrics = precision_recall_fscore_support(y_true, y_pred, average=pipeline["metrics"])

    # TODO Bugfix: calculate, normalize, save and plot the confusion matrix, and
    # save the per-class accuracy logs.
    # cm = calculate_confusion_matrix(y_true, y_pred)
    # save_confusion_matrix(cm, [*labels_arch1], kfold, iteracion, architecture, pipeline)
    # if pipeline["cm_normalize"]:
    #     cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    #     cm = np.round(cm, 2)
    # plot_confusion_matrix(cm, [*labels_arch1], kfold, iteracion, architecture, pipeline)
    # acc_cls = accuracy_by_class(cm, [*labels_arch1])
    # print("ACCURACY BY CLASS")
    # print(acc_cls)
    # print("LEN ACCURACY BY CLASS")
    # print(len(acc_cls))
    # logs_accBycls = []
    # logs_accBycls.append([kfold, iteracion, architecture, acc_cls])
    # save_logs(logs_accBycls, 'accBycls', pipeline)
    # plot_confusion_matrix(y_true, y_pred, [*labels_arch1], kfold, iteracion, architecture, pipeline)

    from sklearn.metrics import accuracy_score
    co_train_accu = accuracy_score(y_true, y_pred)

    logs.append([kfold, iteracion, architecture, None, None, None, co_train_accu,
                 class_metrics[0], class_metrics[1], class_metrics[2], class_metrics[3]])

    print(f"Co-train Accuracy: {co_train_accu}")
    print(f"Co-train Precision: {class_metrics[0]}")
    print(f"Co-train Recall: {class_metrics[1]}")
    print(f"Co-train F1-Score: {class_metrics[2]}")
    print(f"Co-train Support: {class_metrics[3]}")

    save_logs(logs, 'train', pipeline)

    return co_train_accu, [df1, df2, df3]
def labeling(etapa, modelo1, modelo2, modelo3, arquitectura1, arquitectura2, arquitectura3,
             datos, pipeline, iteracion, models_info):
    """
    Labels samples using the three co-training models.

    Args:
        etapa (str): Training stage identifier
        modelo1 (bytes): Top1 Model
        modelo2 (bytes): Top2 Model
        modelo3 (bytes): Top3 Model
        arquitectura1 (str): Top1 Arch
        arquitectura2 (str): Top2 Arch
        arquitectura3 (str): Top3 Arch
        datos (dict): Data container, including the EL (enlarged labeled)
            and LC (low confidence) sample lists
        pipeline (dict): General config
        iteracion (int): Semi-supervised stage
        models_info (list): Info about models

    Returns:
        datos (dict): Data container with the EL and LC lists updated
        EL_iter (list): Samples added to EL in this iteration
        LC_iter (list): Samples added to LC in this iteration
        [df1, df2, df3] (list): Per-model prediction dataframes
    """
    etiquetados_EL = 0
    etiquetados_LC = 0
    EL_iter = []
    LC_iter = []
    predicciones = []
    predicciones_logs = []

    train_generator_arch1, batchset_generator_arch1, STEP_SIZE_BATCH_arch1 = generadores(
        etapa, arquitectura1, datos, pipeline, True, iteracion, models_info)
    train_generator_arch2, batchset_generator_arch2, STEP_SIZE_BATCH_arch2 = generadores(
        etapa, arquitectura2, datos, pipeline, True, iteracion, models_info)
    train_generator_arch3, batchset_generator_arch3, STEP_SIZE_BATCH_arch3 = generadores(
        etapa, arquitectura3, datos, pipeline, True, iteracion, models_info)

    df1 = evaluar(modelo1, train_generator_arch1, batchset_generator_arch1, STEP_SIZE_BATCH_arch1)
    df2 = evaluar(modelo2, train_generator_arch2, batchset_generator_arch2, STEP_SIZE_BATCH_arch2)
    df3 = evaluar(modelo3, train_generator_arch3, batchset_generator_arch3, STEP_SIZE_BATCH_arch3)

    for i in range(len(df1)):
        arch_scores = {}
        arch_scores[arquitectura1] = df1['Max_Probability'][i]
        arch_scores[arquitectura2] = df2['Max_Probability'][i]
        arch_scores[arquitectura3] = df3['Max_Probability'][i]

        # Pairwise agreement between the three models.
        c1 = (df1['Predictions'][i] == df2['Predictions'][i])
        c2 = (df1['Predictions'][i] == df3['Predictions'][i])
        c3 = (df2['Predictions'][i] == df3['Predictions'][i])

        # Per-model confidence above the semi-supervised threshold.
        p1 = (df1['Max_Probability'][i] > pipeline["ssl_threshold"])
        p2 = (df2['Max_Probability'][i] > pipeline["ssl_threshold"])
        p3 = (df3['Max_Probability'][i] > pipeline["ssl_threshold"])

        if c1 and c2 and p1 and p2 and p3:
            # All models agree and all are confident: enlarge the labeled set (EL).
            datos["EL"].append(
                [df1['Filename'][i], df1['Predictions'][i], arch_scores])
            predicciones.append([df1['Filename'][i], df1['Predictions'][i]])
            selected = df1['Predictions'][i]
            prob_selected = df1["Max_Probability"][i]
            predicciones_logs.append([
                df1['Filename'][i], selected, prob_selected, "EL",
                df1['Predictions'][i], df1['Max_Probability'][i],
                df2["Predictions"][i], df2['Max_Probability'][i],
                df3["Predictions"][i], df3['Max_Probability'][i]
            ])
            EL_iter.append(
                [df1['Filename'][i], df1['Predictions'][i], arch_scores])
            etiquetados_EL += 1
        else:
            # Otherwise keep the sample in the low-confidence set (LC).
            datos["LC"].append(
                [df1['Filename'][i], df1['Predictions'][i], arch_scores])
            predicciones.append([df1['Filename'][i], df1['Predictions'][i]])
            selected = df1['Predictions'][i]
            prob_selected = df1["Max_Probability"][i]
            predicciones_logs.append([
                df1['Filename'][i], selected, prob_selected, "LC",
                df1['Predictions'][i], df1['Max_Probability'][i],
                df2["Predictions"][i], df2['Max_Probability'][i],
                df3["Predictions"][i], df3['Max_Probability'][i]
            ])
            LC_iter.append(
                [df1['Filename'][i], df1['Predictions'][i], arch_scores])
            etiquetados_LC += 1

    print('etiquetados EL {} LC {}'.format(etiquetados_EL, etiquetados_LC))

    import pandas as pd
    df_logs = pd.DataFrame(predicciones_logs, columns=[
        "y_true", "y_pred", "prob_pred", "type",
        "pred1", "prob1", "pred2", "prob2", "pred3", "prob3"
    ])
    # The ground-truth class is the parent directory of each file.
    df_logs["y_true"] = df_logs['y_true'].apply(lambda x: x.split('/')[-2])

    print("LABELING LOGS")
    print(df_logs)

    df_EL_logs = df_logs[(df_logs["type"] == "EL")].copy()
    df_LC_logs = df_logs[(df_logs["type"] == "LC")].copy()

    from sklearn.metrics import accuracy_score

    y_true = df_EL_logs['y_true'].values.tolist()
    y_pred = df_EL_logs['y_pred'].values.tolist()
    EL_accu = accuracy_score(y_true, y_pred)
    print("EL_ACCU: ", EL_accu)

    y_true = df_LC_logs['y_true'].values.tolist()
    y_pred = df_LC_logs['y_pred'].values.tolist()
    LC_accu = accuracy_score(y_true, y_pred)
    print("LC_ACCU: ", LC_accu)

    return datos, EL_iter, LC_iter, [df1, df2, df3]
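

# A minimal sketch of the EL/LC split rule used by labeling above: a sample is
# promoted to the enlarged labeled set (EL) only when all three models predict
# the same class and each confidence exceeds pipeline["ssl_threshold"];
# otherwise it stays in the low-confidence set (LC). The helper name
# split_sample and the example values are illustrative only.
def split_sample(pred1, prob1, pred2, prob2, pred3, prob3, ssl_threshold):
    agree = (pred1 == pred2) and (pred1 == pred3)
    confident = (prob1 > ssl_threshold) and (prob2 > ssl_threshold) and (prob3 > ssl_threshold)
    return "EL" if (agree and confident) else "LC"


# Examples with an assumed threshold of 0.9:
# split_sample("cat", 0.95, "cat", 0.93, "cat", 0.97, 0.9) -> "EL"
# split_sample("cat", 0.95, "cat", 0.93, "dog", 0.97, 0.9) -> "LC"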
def labeling_v2(etapa, modelo1, modelo2, modelo3, arquitectura1, arquitectura2, arquitectura3,
                datos, pipeline, iteracion, models_info):
    predicciones = []
    predicciones_logs = []
    etiquetados_EL = 0
    etiquetados_LC = 0
    EL_iter = []
    LC_iter = []

    train_generator_arch1, batchset_generator_arch1, STEP_SIZE_BATCH_arch1 = generadores(
        etapa, arquitectura1, datos, pipeline, True, iteracion, models_info)
    train_generator_arch2, batchset_generator_arch2, STEP_SIZE_BATCH_arch2 = generadores(
        etapa, arquitectura2, datos, pipeline, True, iteracion, models_info)
    train_generator_arch3, batchset_generator_arch3, STEP_SIZE_BATCH_arch3 = generadores(
        etapa, arquitectura3, datos, pipeline, True, iteracion, models_info)

    df1 = evaluar(modelo1, train_generator_arch1, batchset_generator_arch1, STEP_SIZE_BATCH_arch1)
    df2 = evaluar(modelo2, train_generator_arch2, batchset_generator_arch2, STEP_SIZE_BATCH_arch2)
    df3 = evaluar(modelo3, train_generator_arch3, batchset_generator_arch3, STEP_SIZE_BATCH_arch3)

    for i in range(len(df1)):
        arch_scores = {}
        arch_scores[arquitectura1] = df1['Max_Probability'][i]
        arch_scores[arquitectura2] = df2['Max_Probability'][i]
        arch_scores[arquitectura3] = df3['Max_Probability'][i]

        c1 = (df1['Predictions'][i] == df2['Predictions'][i])
        c2 = (df1['Predictions'][i] == df3['Predictions'][i])
        c3 = (df2['Predictions'][i] == df3['Predictions'][i])

        if c1 and c2:
            datos["EL"].append(
                [df1['Filename'][i], df1['Predictions'][i], arch_scores])
            predicciones.append([df1['Filename'][i], df1['Predictions'][i]])
            selected = df1['Predictions'][i]
            prob_selected = df1["Max_Probability"][i]
            predicciones_logs.append([
                df1['Filename'][i], selected, prob_selected, "EL",
                df1['Predictions'][i], df1['Max_Probability'][i],
                df2["Predictions"][i], df2['Max_Probability'][i],
                df3["Predictions"][i], df3['Max_Probability'][i]
            ])
            EL_iter.append(
                [df1['Filename'][i], df1['Predictions'][i], arch_scores])
            etiquetados_EL += 1
        elif c1 or c2:
            datos["EL"].append(
                [df1['Filename'][i], df1['Predictions'][i], arch_scores])
            predicciones.append([df1['Filename'][i], df1['Predictions'][i]])
            selected = df1['Predictions'][i]
            prob_selected = df1["Max_Probability"][i]
            predicciones_logs.append([
                df1['Filename'][i], selected, prob_selected, "EL",
                df1['Predictions'][i], df1['Max_Probability'][i],
                df2["Predictions"][i], df2['Max_Probability'][i],
                df3["Predictions"][i], df3['Max_Probability'][i]
            ])
            EL_iter.append(
                [df1['Filename'][i], df1['Predictions'][i], arch_scores])
            etiquetados_EL += 1
        elif c3:
            datos["EL"].append(
                [df1['Filename'][i], df1['Predictions'][i], arch_scores])
            predicciones.append([df2['Filename'][i], df2['Predictions'][i]])
            selected = df2['Predictions'][i]
            prob_selected = df2["Max_Probability"][i]
            predicciones_logs.append([
                df1['Filename'][i], selected, prob_selected, "EL",
                df1['Predictions'][i], df1['Max_Probability'][i],
                df2["Predictions"][i], df2['Max_Probability'][i],
                df3["Predictions"][i], df3['Max_Probability'][i]
            ])
            EL_iter.append(
                [df1['Filename'][i], df1['Predictions'][i], arch_scores])
            etiquetados_EL += 1
        else:
            datos["LC"].append(
                [df1['Filename'][i], df1['Predictions'][i], arch_scores])
            LC_iter.append(
                [df1['Filename'][i], df1['Predictions'][i], arch_scores])
            etiquetados_LC += 1
            predicciones.append([df1['Filename'][i], df1['Predictions'][i]])
            selected = df1['Predictions'][i]
            prob_selected = df1["Max_Probability"][i]
            predicciones_logs.append([
                df1['Filename'][i], selected, prob_selected, "LC",
                df1['Predictions'][i], df1['Max_Probability'][i],
                df2["Predictions"][i], df2['Max_Probability'][i],
                df3["Predictions"][i], df3['Max_Probability'][i]
            ])

    print('etiquetados EL {} LC {}'.format(etiquetados_EL, etiquetados_LC))

    import pandas as pd
    df_logs = pd.DataFrame(predicciones_logs, columns=[
        "y_true", "y_pred", "prob_pred", "type",
        "pred1", "prob1", "pred2", "prob2", "pred3", "prob3"
    ])
    df_logs["y_true"] = df_logs['y_true'].apply(lambda x: x.split('/')[-2])

    print("LABELING LOGS")
    print(df_logs)

    df_EL_logs = df_logs[(df_logs["type"] == "EL")].copy()
    df_LC_logs = df_logs[(df_logs["type"] == "LC")].copy()

    from sklearn.metrics import accuracy_score

    y_true = df_EL_logs['y_true'].values.tolist()
    y_pred = df_EL_logs['y_pred'].values.tolist()
    EL_accu = accuracy_score(y_true, y_pred)
    print("EL_ACCU: ", EL_accu)

    y_true = df_LC_logs['y_true'].values.tolist()
    y_pred = df_LC_logs['y_pred'].values.tolist()
    LC_accu = accuracy_score(y_true, y_pred)
    print("LC_ACCU: ", LC_accu)

    return datos, EL_iter, LC_iter, [df1, df2, df3]
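

# A minimal sketch of how the EL/LC accuracies printed at the end of
# labeling/labeling_v2 are derived from the predicciones_logs records. The toy
# file paths and labels below are assumptions for illustration; real records
# come from the evaluar dataframes.
if __name__ == "__main__":
    import pandas as pd
    from sklearn.metrics import accuracy_score

    toy_logs = [
        ["data/cat/img1.png", "cat", 0.95, "EL", "cat", 0.95, "cat", 0.93, "cat", 0.97],
        ["data/dog/img2.png", "cat", 0.55, "LC", "cat", 0.55, "dog", 0.52, "bird", 0.40],
    ]
    df_logs = pd.DataFrame(toy_logs, columns=[
        "y_true", "y_pred", "prob_pred", "type",
        "pred1", "prob1", "pred2", "prob2", "pred3", "prob3"
    ])
    # The ground-truth class is recovered from the parent directory of the file.
    df_logs["y_true"] = df_logs["y_true"].apply(lambda x: x.split('/')[-2])

    for subset in ("EL", "LC"):
        sub = df_logs[df_logs["type"] == subset]
        print(subset, "accuracy:", accuracy_score(sub["y_true"], sub["y_pred"]))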