Example #1
import numpy as np
import pandas as pd


def evaluate_cotrain(modelo1, modelo2, modelo3, arquitectura1, arquitectura2,
                     arquitectura3, datos, etapa, kfold, iteracion, pipeline,
                     models_info, logs):

    train_generator_arch1, test1_generator_arch1, STEP_SIZE_TEST1_arch1 = generadores(
        etapa, arquitectura1, datos, pipeline, False, iteracion, models_info)
    train_generator_arch2, test1_generator_arch2, STEP_SIZE_TEST1_arch2 = generadores(
        etapa, arquitectura2, datos, pipeline, False, iteracion, models_info)
    train_generator_arch3, test1_generator_arch3, STEP_SIZE_TEST1_arch3 = generadores(
        etapa, arquitectura3, datos, pipeline, False, iteracion, models_info)

    df1 = evaluar(modelo1, train_generator_arch1, test1_generator_arch1,
                  STEP_SIZE_TEST1_arch1)
    df2 = evaluar(modelo2, train_generator_arch2, test1_generator_arch2,
                  STEP_SIZE_TEST1_arch2)
    df3 = evaluar(modelo3, train_generator_arch3, test1_generator_arch3,
                  STEP_SIZE_TEST1_arch3)

    predicciones = []
    for i in range(len(df1)):

        c1 = (df1['Predictions'][i] == df2['Predictions'][i])
        c2 = (df1['Predictions'][i] == df3['Predictions'][i])
        c3 = (df2['Predictions'][i] == df3['Predictions'][i])

        if c1 and c2 and c3:
            predicciones.append([df1['Filename'][i], df1['Predictions'][i]])
        else:
            probabilidades = np.array([
                df1['Max_Probability'][i], df2['Max_Probability'][i],
                df3['Max_Probability'][i]
            ])
            indice_prob_max = probabilidades.argmax()

            clases = np.array([
                df1['Predictions'][i], df2['Predictions'][i],
                df3['Predictions'][i]
            ])

            real = np.array(
                [df1['Filename'][i], df2['Filename'][i], df3['Filename'][i]])
            # On disagreement, keep the prediction of the most confident
            # model (indexing by argmax over the class labels was a bug).
            predicciones.append(
                [real[indice_prob_max], clases[indice_prob_max]])

    results = pd.DataFrame(predicciones, columns=["filename", "predictions"])

    # The true class is the parent directory name in the file path.
    results['filename'] = results['filename'].apply(lambda x: x.split('/')[-2])
    y_true = results['filename'].values.tolist()
    y_pred = results['predictions'].values.tolist()

    from sklearn.metrics import accuracy_score
    co_train_accu = accuracy_score(y_true, y_pred)
    co_train_label = 'co-train'

    logs.append(
        [kfold, iteracion, co_train_label, None, None, None, co_train_accu])
    save_logs(logs, 'train', pipeline)
    return co_train_accu
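
The loop above is a three-way unanimity vote with a most-confident fallback. Below is a minimal, self-contained sketch of that resolution rule on toy data; the column names mirror the frames returned by evaluar above, while the toy values themselves are invented for illustration.

import numpy as np
import pandas as pd

# Toy predictions from three models over the same three files.
df1 = pd.DataFrame({'Filename': ['a/x.png', 'b/y.png', 'c/z.png'],
                    'Predictions': ['3', '4', '3'],
                    'Max_Probability': [0.9, 0.6, 0.7]})
df2 = pd.DataFrame({'Filename': ['a/x.png', 'b/y.png', 'c/z.png'],
                    'Predictions': ['3', '4', '4'],
                    'Max_Probability': [0.8, 0.7, 0.9]})
df3 = pd.DataFrame({'Filename': ['a/x.png', 'b/y.png', 'c/z.png'],
                    'Predictions': ['3', '5', '5'],
                    'Max_Probability': [0.7, 0.8, 0.6]})

predicciones = []
for i in range(len(df1)):
    preds = [df1['Predictions'][i], df2['Predictions'][i], df3['Predictions'][i]]
    probs = np.array([df1['Max_Probability'][i], df2['Max_Probability'][i],
                      df3['Max_Probability'][i]])
    if preds[0] == preds[1] == preds[2]:
        # Unanimous: keep the shared label.
        predicciones.append([df1['Filename'][i], preds[0]])
    else:
        # Any disagreement: keep the most confident model's label.
        k = probs.argmax()
        predicciones.append([df1['Filename'][i], preds[k]])

print(pd.DataFrame(predicciones, columns=['filename', 'predictions']))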
Example #2
import os
import time
import traceback

import pandas as pd


def ssl_global(model_zoo, pipeline):

    numero_lotes = 5
    semi_method = 'co-training-multi'

    datos = {}
    models_info = {}
    logs, logs_label = [], []  # accumulators consumed by save_logs below

    datos["df_base"] = get_dataset(pipeline)
    datos = split_train_test(datos, pipeline)

    # Measure execution time
    start = time.time()

    for kfold in range(1):
        for iteracion in range(numero_lotes * 1):

            print("\n######################")
            print("K-FOLD {} - ITERACION {}".format(kfold, iteracion))
            print("######################\n")

            datos = get_Fold(kfold, datos, pipeline)

            if iteracion == 0:
                etapa = 'train'
            else:
                etapa = 'train_EL'

            print(pipeline["path_label_stats"] + str(pipeline["id"]) + '_' +
                  str(iteracion) + '.pickle')

            for model in model_zoo:

                model_memory, model_performance = training(
                    kfold, etapa, datos, model, iteracion, models_info,
                    pipeline)

                models_info[model] = {
                    'model_memory': model_memory,
                    'model_performance': model_performance['val_acc']
                }

            df_temp = pd.DataFrame(models_info).T
            top_models = df_temp.sort_values('model_performance',
                                             ascending=False)
            top_models = top_models.reset_index()['index'].values.tolist()[:3]

            mod_top1, arch_top1 = models_info[
                top_models[0]]['model_memory'], top_models[0]
            mod_top2, arch_top2 = models_info[
                top_models[1]]['model_memory'], top_models[1]
            mod_top3, arch_top3 = models_info[
                top_models[2]]['model_memory'], top_models[2]

            print("\n")
            print(
                "Co-train: ",
                evaluate_cotrain(mod_top1, mod_top2, mod_top3, arch_top1,
                                 arch_top2, arch_top3, datos, etapa, kfold,
                                 iteracion, pipeline, models_info, logs))
            print("\n")

            if semi_method == 'supervised':
                break

            if iteracion < numero_lotes:

                df_batchset = datos["batch_set"][iteracion]
                df_batchset.columns = [
                    pipeline["x_col_name"], pipeline["y_col_name"]
                ]
                df_batchset[pipeline["y_col_name"]] = '0'
            else:
                if iteracion == numero_lotes:
                    df_LC = pd.DataFrame(pipeline["LC"])
                    batch_set_LC = list(dividir_lotes(df_LC, numero_lotes))

                    for i in range(len(batch_set_LC)):
                        print(len(batch_set_LC[i].iloc[:, 0].values.tolist()))
                    pipeline["LC"] = []

                df_batchset = pd.DataFrame([
                    batch_set_LC[int(iteracion -
                                     numero_lotes)].iloc[:, 0].values.tolist()
                ]).T
                df_batchset.columns = [pipeline["x_col_name"]]
                df_batchset[pipeline["y_col_name"]] = '0'

            datos['df_batchset'] = df_batchset

            datos, EL_iter, LC_iter = labeling(etapa, mod_top1, mod_top2,
                                               mod_top3, arch_top1, arch_top2,
                                               arch_top3, datos, pipeline,
                                               iteracion, models_info)
            print("EL_iter", len(EL_iter))
            print("LC_iter", len(LC_iter))

            df_EL = pd.DataFrame(datos["EL"],
                                 columns=[
                                     pipeline["x_col_name"],
                                     pipeline["y_col_name"], 'arch_scores'
                                 ])
            df_LC = pd.DataFrame(datos["LC"],
                                 columns=[
                                     pipeline["x_col_name"],
                                     pipeline["y_col_name"], 'arch_scores'
                                 ])

            os.makedirs(pipeline["path_label_stats"].split('/')[0],
                        exist_ok=True)

            df_EL.to_pickle(pipeline["path_label_stats"] +
                            str(pipeline["id"]) + '_' + str(iteracion) +
                            '_EL.pickle')
            df_LC.to_pickle(pipeline["path_label_stats"] +
                            str(pipeline["id"]) + '_' + str(iteracion) +
                            '_LC.pickle')

            df_label_stats = label_stats(df_EL, df_LC, pipeline)
            #print(df_label_stats)
            df_label_stats.to_pickle(pipeline["path_label_stats"] +
                                     str(pipeline["id"]) + '_' +
                                     str(iteracion) + '.pickle')

            # Training set for the next stage: original TRAIN samples plus EL
            df_train_EL = pd.concat([datos["df_train"], df_EL.iloc[:, :2]])
            datos['df_train_EL'] = df_train_EL

            try:
                print("AUTO-ESTIMATING OF SSL THRESHOLD ...")
                df_EL_stats = df_label_stats["df_EL_stats"]["df"]
                df_LC_stats = df_label_stats["df_LC_stats"]["df"]

                df_U_iter = pd.concat([df_EL_stats, df_LC_stats],
                                      ignore_index=True)

                ssl_th = df_U_iter.describe()["arch_scores_mean"]["mean"]
                pipeline["ssl_threshold"] = ssl_th
                print("NEW SSL THRESHOLD: ", ssl_th)
            except Exception:
                print("ERROR - AUTO-ESTIMATING SSL THRESHOLD")
                ssl_th = pipeline["ssl_threshold"]
                traceback.print_exc()

            # Earlier experiments (EXP 33) instead used the 25th percentile of
            # df_U_iter's arch_scores_mean as the threshold.

            logs_label.append([
                kfold, iteracion, arch_top1, arch_top2, arch_top3,
                len(EL_iter),
                len(LC_iter), ssl_th
            ])
            save_logs(logs_label, 'label', pipeline)
            #reset_keras()
            #models_info = []
    end = time.time()
    print(end - start)
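
The try block above re-estimates the SSL threshold as the mean of arch_scores_mean over the union of the EL and LC statistics frames. A minimal sketch of that estimate, assuming arch_scores_mean is a numeric column of those frames:

import pandas as pd

# Toy EL/LC stats frames; only the arch_scores_mean column matters here.
df_EL_stats = pd.DataFrame({'arch_scores_mean': [0.92, 0.88, 0.95]})
df_LC_stats = pd.DataFrame({'arch_scores_mean': [0.61, 0.55]})

df_U_iter = pd.concat([df_EL_stats, df_LC_stats], ignore_index=True)

# Same lookup as above: describe() exposes the column mean under "mean".
ssl_th = df_U_iter.describe()["arch_scores_mean"]["mean"]
print("NEW SSL THRESHOLD:", ssl_th)  # 0.782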
Example #3
        "train_epochs": 5,
        "batch_epochs": 5
    },
    "fast": {
        "train_epochs": 10,
        "batch_epochs": 10
    },
    "medium": {
        "train_epochs": 20,
        "batch_epochs": 20
    },
    "slow": {
        "train_epochs": 30,
        "batch_epochs": 30
    }
}

logs.append([
    "kfold", "iteracion", "arquitectura", "val_loss", "val_accu", "test_loss",
    "test_accu"
])
logs_time.append(["kfold", "iteracion", "arquitectura", "training_time"])
logs_label.append(["kfold", "iteracion", "arquitectura", "EL", "LC"])

save_logs(logs, 'train', pipeline)
save_logs(logs_time, 'time', pipeline)
save_logs(logs_label, 'label', pipeline)

models = ['ResNet50', 'Xception', 'DenseNet169', 'InceptionV4', 'DenseNet121']
ssl_global(model_zoo=models, pipeline=pipeline)
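
save_logs itself is not shown in these examples. Here is a minimal sketch of a compatible implementation, assuming it appends rows to one CSV per log type under a path derived from the pipeline dict; the file layout is an assumption, not the original code.

import csv
import os

def save_logs(rows, log_type, pipeline):
    # Hypothetical layout: one CSV per log type, keyed by the experiment id.
    path = os.path.join(pipeline.get("save_path_logs", "logs"),
                        f'exp_{pipeline["id"]}_{log_type}.csv')
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "a", newline="") as f:
        csv.writer(f).writerows(rows)

# Usage mirroring the header row appended above.
pipeline = {"id": 0}
save_logs([["kfold", "iteracion", "arquitectura", "training_time"]],
          "time", pipeline)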
Example #4
import os
import time

# tf.keras import paths assumed; adjust if using standalone Keras.
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator


def training(kfold, etapa, datos, architecture, iteracion, models_info, pipeline):

    start_model = time.time()
    base_model, preprocess_input = get_model(architecture, iteracion, models_info, pipeline)
    model_performance = {}

    datagen = ImageDataGenerator(
        preprocessing_function=preprocess_input,
        rotation_range=90,
        horizontal_flip=True,
        vertical_flip=True,
    )

    if etapa == 'train':
        train_generator = datagen.flow_from_dataframe(
            dataframe=datos['df_train'],
            x_col=pipeline["x_col_name"],
            y_col=pipeline["y_col_name"],
            target_size=(pipeline['img_height'], pipeline['img_width']),
            class_mode='categorical',
            batch_size=pipeline["batch_size"],
            seed=42,
            shuffle=True)

    if etapa == 'train_EL':
        train_generator = datagen.flow_from_dataframe(
            dataframe=datos['df_train_EL'],
            x_col=pipeline["x_col_name"],
            y_col=pipeline["y_col_name"],
            target_size=(pipeline['img_height'], pipeline['img_width']),
            class_mode='categorical',
            batch_size=pipeline["batch_size"],
            seed=42,
            shuffle=True)

    if len(datos['df_val']) > 0:
        val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

        valid_generator = val_datagen.flow_from_dataframe(
            dataframe=datos['df_val'],
            x_col=pipeline["x_col_name"],
            y_col=pipeline["y_col_name"],
            batch_size=pipeline["batch_size"],
            seed=42,
            shuffle=True,
            class_mode="categorical",
            target_size=(pipeline['img_height'], pipeline['img_width']))

    test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

    test_generator = test_datagen.flow_from_dataframe(
        dataframe=datos['df_test'],
        x_col=pipeline["x_col_name"],
        y_col=pipeline["y_col_name"],
        batch_size=pipeline["batch_size"],
        seed=42,
        shuffle=False,
        class_mode="categorical",
        target_size=(pipeline['img_height'], pipeline['img_width']))

    num_classes = len(datos["df_train"][pipeline["y_col_name"]].unique())

    if etapa == 'train' and pipeline["transfer_learning"] == "classic":
        finetune_model = transfer_learning_classic(base_model, num_classes)
    elif pipeline["transfer_learning"] == "soft":
        finetune_model = transfer_learning_soft(
            base_model, num_classes, pipeline["stage_config"][iteracion])
    else:
        finetune_model = base_model

    if etapa == 'train':
        NUM_EPOCHS = pipeline["modality_config"][pipeline["modality"]]["train_epochs"]
        num_train_images = len(datos['df_train']) * pipeline["aug_factor"]
    if etapa == 'train_EL':
        NUM_EPOCHS = pipeline["modality_config"][pipeline["modality"]]["batch_epochs"]
        num_train_images = len(datos['df_train_EL']) * pipeline["aug_factor"]

    STEP_SIZE_TRAIN = num_train_images // train_generator.batch_size
    STEP_SIZE_VALID = valid_generator.n // valid_generator.batch_size
    STEP_SIZE_TEST = test_generator.n // test_generator.batch_size

    metrics = ['accuracy']
    loss = 'categorical_crossentropy'

    if pipeline["transfer_learning"] == "soft":
        LR = pipeline['LR']
    else:
        LR = pipeline["stage_config"][iteracion]['LR']

    print(f"LEARNING RATE: {LR}")
    adam = Adam(lr=float(LR))
    finetune_model.compile(adam, loss=loss, metrics=metrics)

    early = EarlyStopping(monitor='val_loss',
                          min_delta=1e-3,
                          patience=5,
                          verbose=1,
                          restore_best_weights=True)

    history = finetune_model.fit(train_generator,
                                 epochs=NUM_EPOCHS,
                                 workers=1,
                                 steps_per_epoch=STEP_SIZE_TRAIN,
                                 validation_data=valid_generator,
                                 validation_steps=STEP_SIZE_VALID,
                                 verbose=1,
                                 callbacks=[early])

    val_score = finetune_model.evaluate(valid_generator, verbose=0,
                                        steps=STEP_SIZE_VALID)
    test_score = finetune_model.evaluate(test_generator, verbose=0,
                                         steps=STEP_SIZE_TEST)

    print("Val  Loss : ", val_score[0])
    print("Test Loss : ", test_score[0])
    print("Val  Accuracy : ", val_score[1])
    print("Test Accuracy : ", test_score[1])

    end_model = time.time()
    time_training = end_model - start_model
    print(f"training {architecture}",time_training)

    logs = []
    logs.append([kfold,iteracion,architecture,val_score[0],val_score[1],
            test_score[0],test_score[1]])

    logs_time = []
    logs_time.append([kfold,iteracion,architecture,time_training])

    save_logs(logs,'train',pipeline)
    save_logs(logs_time,'time',pipeline)
    save_plots(history, kfold, iteracion, architecture, pipeline)

    model_performance['val_acc'] = val_score[1]
    model_performance['test_acc'] = test_score[1]

    if pipeline['save_model']:
        save_path_model = os.path.join(
            pipeline['save_path_model'],
            f'{kfold}_{iteracion}_{architecture}.h5')
        finetune_model.save(save_path_model)
        return save_path_model, model_performance

    return finetune_model, model_performance
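
The STEP_SIZE_* values above are integer divisions of the (augmentation-inflated) sample count by the batch size, so aug_factor only stretches the nominal epoch length. A quick worked example of that arithmetic:

# Worked example of the step-size arithmetic used in training() above.
batch_size = 32
aug_factor = 4            # each real image is seen ~4x per epoch via augmentation
len_df_train = 500

num_train_images = len_df_train * aug_factor        # 2000 augmented samples
STEP_SIZE_TRAIN = num_train_images // batch_size    # 62 batches per epoch
print(STEP_SIZE_TRAIN)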
Example #5
import time

import pandas as pd


def ssl_global(model_zoo, pipeline):

    datos = {}
    datos["df_base"] = get_dataset(pipeline)
    datos = split_train_test(datos, pipeline)

    # Accumulators kept across folds/iterations
    datos_total, cotrain_list, label_list = [], [], []
    logs, logs_label = [], []

    # Measure execution time
    start = time.time()

    split_kfold = pipeline["split_kfold"]
    num_kfold = pipeline["num_kfold"]

    for kfold in range(num_kfold):

        models_info = {}
        datos = get_Fold(kfold, datos, pipeline)

        datos_by_fold = {"kfold": kfold, "datos": datos}

        datos_total.append(datos_by_fold)
        df_datos = pd.DataFrame(datos_total)
        datos_path = pipeline["save_path_stats"] + 'exp_' + str(
            pipeline["id"]) + '_' + str(kfold) + '_data.pkl'
        df_datos.to_pickle(datos_path)

        numero_lotes = len(datos["batch_set"])

        #datos["batch_set"][0]

        for iteracion in range(numero_lotes * 1):

            kfold_info = f"K-FOLD {kfold}/{num_kfold} - ITERACION {iteracion}/{numero_lotes}"
            print("\n")
            print("#" * len(kfold_info))
            print(kfold_info)
            print("#" * len(kfold_info))
            print("\n")

            print("\n")
            print(f"CLASS DISTRIBUTION - BATCH_SET {iteracion}")
            print(datos["batch_set"][iteracion].groupby(
                pipeline["y_col_name"]).count())
            print(f"OK - CLASS DISTRIBUTION - BATCH_SET {iteracion}")
            print("\n")

            if iteracion == 0:
                etapa = 'train'
            else:
                etapa = 'train_EL'

            #print(pipeline["save_path_stats"]+str(pipeline["id"])+'_'+str(iteracion)+'.pkl')

            for model in model_zoo:

                #print("##########")
                #print("AUG_FACTOR - CURRENT: ", pipeline["stage_config"][iteracion]["aug_factor"])
                #pipeline["aug_factor"] = pipeline["stage_config"][iteracion]["aug_factor"]
                print("AUG_FACTOR: ", pipeline["aug_factor"])

                model_memory, model_performance = training(
                    kfold, etapa, datos, model, iteracion, models_info,
                    classification_metrics, pipeline)

                models_info[model] = {
                    'model_memory': model_memory,
                    'model_performance': model_performance['val_acc']
                }

            df_temp = pd.DataFrame(models_info).T
            top_models = df_temp.sort_values('model_performance',
                                             ascending=False)
            top_models = top_models.reset_index()['index'].values.tolist()[:3]

            mod_top1, arch_top1 = models_info[
                top_models[0]]['model_memory'], top_models[0]
            mod_top2, arch_top2 = models_info[
                top_models[1]]['model_memory'], top_models[1]
            mod_top3, arch_top3 = models_info[
                top_models[2]]['model_memory'], top_models[2]

            #if pipeline['save_model']:
            #   mod_top1 = load_model(mod_top1, compile=True)
            #    mod_top2 = load_model(mod_top2, compile=True)
            #    mod_top3 = load_model(mod_top3, compile=True)

            # Separate timer for co-training inference; the outer `start`
            # keeps tracking total runtime.
            start_infer = time.time()

            print("EVALUATING CO-TRAINING ...")
            print("\n")
            #print("Co-train: ", evaluate_cotrain(mod_top1,mod_top2,mod_top3,arch_top1,
            #                                        arch_top2,arch_top3,datos,etapa,kfold,
            #                                        iteracion,pipeline,models_info,logs))

            cotrain_acc, cotrain_infer_dfs = evaluate_cotrain(
                mod_top1, mod_top2, mod_top3, arch_top1, arch_top2, arch_top3,
                datos, etapa, kfold, iteracion, pipeline, models_info, logs)

            print("Co-train: ", cotrain_acc)
            df_cotrain_info = {
                "kfold": kfold,
                "iteracion": iteracion,
                "df_arch1": cotrain_infer_dfs[0],
                "df_arch2": cotrain_infer_dfs[1],
                "df_arch3": cotrain_infer_dfs[2]
            }

            cotrain_list.append(df_cotrain_info)
            df_cotrain_list = pd.DataFrame(cotrain_list)
            #print(df_cotrain_list)

            infer_pkl = pipeline["save_path_stats"] + 'exp_' + str(
                pipeline["id"]) + '_' + str(iteracion) + '_cotrain_eval.pkl'

            print("SAVING COTRAIN EVAL PICKLE")
            df_cotrain_list.to_pickle(infer_pkl)
            print("OK - SAVING COTRAIN EVAL PICKLE")

            print("\n")
            print("OK - EVALUATING CO-TRAINING")

            end_infer = time.time()
            infer_time = end_infer - start_infer
            # SAVE INFER_TIME BY DF_TEST BY ITERATION AND ARCH

            print(infer_time, len(datos["df_test"]))

            logs_infer_time = []
            logs_infer_time.append([
                kfold, iteracion, 'co-train', infer_time,
                len(datos["df_test"])
            ])
            save_logs(logs_infer_time, 'infer_time', pipeline)

            print(f"GETTING BATCH_SET OF ITERATION {iteracion}...")

            df_batchset = datos["batch_set"][iteracion]
            df_batchset.columns = [
                pipeline["x_col_name"], pipeline["y_col_name"]
            ]
            df_batchset[pipeline["y_col_name"]] = '0'

            datos['df_batchset'] = df_batchset

            print("LABELING ...")

            datos, EL_iter, LC_iter, label_infer_df = labeling(
                etapa, mod_top1, mod_top2, mod_top3, arch_top1, arch_top2,
                arch_top3, datos, pipeline, iteracion, models_info)

            df_label_info = {
                "kfold": kfold,
                "iteracion": iteracion,
                "df_arch1": label_infer_df[0],
                "df_arch2": label_infer_df[1],
                "df_arch3": label_infer_df[2]
            }

            label_list.append(df_label_info)
            df_label_list = pd.DataFrame(label_list)
            #print(df_label_list)

            label_pkl = pipeline["save_path_stats"] + 'exp_' + str(
                pipeline["id"]) + '_' + str(iteracion) + '_labeling.pkl'

            print("SAVING LABEL PICKLE")
            df_label_list.to_pickle(label_pkl)
            print("OK - SAVING LABEL PICKLE")

            print("OK - LABELING")
            print("EL_iter", len(EL_iter))
            print("LC_iter", len(LC_iter))

            df_EL = pd.DataFrame(datos["EL"],
                                 columns=[
                                     pipeline["x_col_name"],
                                     pipeline["y_col_name"], 'arch_scores'
                                 ])
            df_LC = pd.DataFrame(datos["LC"],
                                 columns=[
                                     pipeline["x_col_name"],
                                     pipeline["y_col_name"], 'arch_scores'
                                 ])

            df_EL.to_pickle(pipeline["save_path_stats"] + 'exp_' +
                            str(pipeline["id"]) + '_' + str(iteracion) +
                            '_EL.pickle')
            df_LC.to_pickle(pipeline["save_path_stats"] + 'exp_' +
                            str(pipeline["id"]) + '_' + str(iteracion) +
                            '_LC.pickle')

            df_label_stats = label_stats(df_EL, df_LC, pipeline)
            #df_label_stats.to_pickle(pipeline["save_path_stats"]+'exp_'+str(pipeline["id"])+'_'+str(iteracion)+'.pickle')

            df_label_stats.to_pickle(pipeline["save_path_stats"] + 'exp_' +
                                     str(pipeline["id"]) + '_' +
                                     str(iteracion) + '_stats.pickle')

            df_train_EL = pd.concat([datos["df_train"], df_EL.iloc[:, :2]])
            datos['df_train_EL'] = df_train_EL

            ssl_th = pipeline["ssl_threshold"]

            logs_label.append([
                kfold, iteracion, arch_top1, arch_top2, arch_top3,
                len(EL_iter),
                len(LC_iter), ssl_th
            ])
            save_logs(logs_label, 'label', pipeline)

            reset_keras(pipeline)

            #if pipeline["restart_weights"]:
            #    reset_keras()

            #random.seed(SEED)
            #np.random.seed(SEED)
            #tensorflow.random.set_random_seed(SEED)

    end = time.time()
    print(end - start)
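
dividir_lotes is called throughout these examples but never defined. A minimal sketch of a compatible splitter, assuming it simply cuts a DataFrame into n roughly equal consecutive batches (numpy array_split semantics); the implementation is an assumption.

import numpy as np
import pandas as pd

def dividir_lotes(df, numero_lotes):
    # Yield numero_lotes roughly equal consecutive slices of df.
    for idx in np.array_split(np.arange(len(df)), numero_lotes):
        yield df.iloc[idx]

df_U = pd.DataFrame({"x": range(10), "y": "0"})
batch_set = list(dividir_lotes(df_U, 3))
print([len(b) for b in batch_set])  # [4, 3, 3]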
Example #6
import os

# tf.keras import paths assumed, matching the load_model import used below.
from tensorflow.keras.callbacks import (EarlyStopping, ModelCheckpoint,
                                        ReduceLROnPlateau)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator


def training(kfold, etapa, datos, architecture, iteracion, models_info,
             classification_metrics, pipeline):

    import time
    callbacks_finetune = []
    start_model = time.time()
    base_model, preprocess_input = get_model(architecture, iteracion,
                                             models_info, pipeline)
    model_performance = {}

    print("USING TRANSFORMATIONS FROM SSL_TRAIN")
    datagen = ImageDataGenerator(
        preprocessing_function=preprocess_input,
        rotation_range=40,
        width_shift_range=0.1,
        height_shift_range=0.1,
        shear_range=0.01,
        zoom_range=[0.9, 1.25],
        horizontal_flip=True,
        vertical_flip=False,
        fill_mode='reflect',
        #data_format='channels_last'
    )
    print("OK - USING TRANSFORMATIONS FROM SSL_TRAIN")

    if etapa == 'train':
        print("CREATING GENERATOR FOR TRAIN FROM SSL_TRAIN")
        train_generator = datagen.flow_from_dataframe(
            dataframe=datos['df_train'],
            x_col=pipeline["x_col_name"],
            y_col=pipeline["y_col_name"],
            target_size=(pipeline['img_height'], pipeline['img_width']),
            class_mode='categorical',
            batch_size=pipeline["batch_size"],
            seed=42,
            shuffle=True)
        print("OK - CREATING GENERATOR FOR TRAIN FROM SSL_TRAIN")

        print("CLASS DISTRIBUTION - TRAIN INIT")
        print(datos['df_train'].groupby(pipeline["y_col_name"]).count())
        print("OK - CLASS DISTRIBUTION - TRAIN INIT")

        y_train_unique = datos['df_train'][pipeline["y_col_name"]].unique()
        df_y_train_unique = datos['df_train'][pipeline["y_col_name"]]

        print("CLASS Y_TRAIN UNIQUE")
        print(y_train_unique)
        print("OK - CLASS Y_TRAIN UNIQUE")

    if etapa == 'train_EL':
        print("CREATING GENERATOR FOR TRAIN_EL FROM SSL_TRAIN")
        train_generator = datagen.flow_from_dataframe(
            dataframe=datos['df_train_EL'],
            x_col=pipeline["x_col_name"],
            y_col=pipeline["y_col_name"],
            target_size=(pipeline['img_height'], pipeline['img_width']),
            class_mode='categorical',
            batch_size=pipeline["batch_size"],
            seed=42,
            shuffle=True)
        print("OK - CREATING GENERATOR FOR TRAIN_EL FROM SSL_TRAIN")

        print("CLASS DISTRIBUTION - TRAIN EL")
        print(datos['df_train_EL'].groupby(pipeline["y_col_name"]).count())
        print("OK - CLASS DISTRIBUTION - TRAIN INIT")

        y_train_unique = datos['df_train_EL'][pipeline["y_col_name"]].unique()
        df_y_train_unique = datos['df_train_EL'][pipeline["y_col_name"]]

    if len(datos['df_val']) > 0:
        print("CREATING GENERATOR FOR VAL FROM SSL_TRAIN")
        val_datagen = ImageDataGenerator(
            preprocessing_function=preprocess_input)

        valid_generator = val_datagen.flow_from_dataframe(
            dataframe=datos['df_val'],
            x_col=pipeline["x_col_name"],
            y_col=pipeline["y_col_name"],
            batch_size=pipeline["batch_size"],
            seed=42,
            shuffle=True,
            class_mode="categorical",
            target_size=(pipeline['img_height'], pipeline['img_width']))

        print("OK - CREATING GENERATOR FOR VAL FROM SSL_TRAIN")

    print("CREATING GENERATOR FOR TEST FROM SSL_TRAIN")
    test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

    test_generator1 = test_datagen.flow_from_dataframe(
        dataframe=datos['df_test1'],
        x_col=pipeline["x_col_name"] + '1',
        y_col=pipeline["y_col_name"] + '1',
        batch_size=pipeline["batch_size"],
        seed=42,
        shuffle=False,
        class_mode="categorical",
        target_size=(pipeline['img_height'], pipeline['img_width']))

    test_generator2 = test_datagen.flow_from_dataframe(
        dataframe=datos['df_test2'],
        x_col=pipeline["x_col_name"] + '2',
        y_col=pipeline["y_col_name"] + '2',
        batch_size=pipeline["batch_size"],
        seed=42,
        shuffle=False,
        class_mode="categorical",
        target_size=(pipeline['img_height'], pipeline['img_width']))

    print("OK - CREATING GENERATOR FOR TEST FROM SSL_TRAIN")

    num_classes = len(datos["df_train"][pipeline["y_col_name"]].unique())
    print("NUM CLASSES", num_classes)

    if pipeline["transfer_learning"] == "classic":
        if pipeline["restart_weights"]:
            print("TRANSFER LEARNING - CLASSIC + YES RESTART WEIGHTS")
            finetune_model = transfer_learning_classic(base_model, num_classes,
                                                       pipeline)
            print("OK - TRANSFER LEARNING - CLASSIC + YES RESTART WEIGHTS")
        else:
            if etapa == 'train':
                print("TRANSFER LEARNING - TRAIN + CLASSIC")
                finetune_model = transfer_learning_classic(
                    base_model, num_classes, pipeline)
                print("OK - TRANSFER LEARNING - TRAIN + CLASSIC")
            elif etapa == 'train_EL':
                print(
                    "TRANSFER LEARNING - TRAIN_EL + CLASSIC + NO RESTART WEIGHTS"
                )
                finetune_model = base_model
                print(
                    "OK - TRANSFER LEARNING - TRAIN_EL + CLASSIC + NO RESTART WEIGHTS"
                )
    elif pipeline["transfer_learning"] == "soft":
        print("TRANSFER LEARNING - TRAIN + SOFT")
        finetune_model = transfer_learning_soft(
            base_model, num_classes, pipeline["stage_config"][iteracion])
        print("OK - TRANSFER LEARNING - TRAIN + SOFT")

    if pipeline["use_stage_config"]:
        NUM_EPOCHS = pipeline["stage_config"][iteracion]["train_epochs"]
        AUG_FACTOR = pipeline["stage_config"][iteracion]["aug_factor"]
    else:
        NUM_EPOCHS = pipeline["train_epochs"]
        AUG_FACTOR = pipeline["aug_factor"]
        print("\n")
        print("USING AUG_FACTOR OF: ", AUG_FACTOR)
        print("\n")

    if etapa == 'train':
        #NUM_EPOCHS = pipeline["modality_config"][pipeline["modality"]]["train_epochs"]
        num_train_images = len(datos['df_train']) * AUG_FACTOR
    if etapa == 'train_EL':
        #NUM_EPOCHS = pipeline["modality_config"][pipeline["modality"]]["batch_epochs"]
        num_train_images = len(datos['df_train_EL']) * AUG_FACTOR

    STEP_SIZE_TRAIN = num_train_images // train_generator.batch_size
    STEP_SIZE_VALID = valid_generator.n // valid_generator.batch_size
    STEP_SIZE_TEST1 = test_generator1.n // test_generator1.batch_size
    STEP_SIZE_TEST2 = test_generator2.n // test_generator2.batch_size

    metrics = ['accuracy']
    loss = 'categorical_crossentropy'

    if pipeline["transfer_learning"] == "classic":
        LR = pipeline['learning_rate']
    elif pipeline["transfer_learning"] == "soft":
        LR = pipeline["stage_config"][iteracion]['LR']

    #lr_schedule = ExponentialDecay(
    #    initial_learning_rate=1e-2,
    #    decay_steps=10000,
    #    decay_rate=0.9)

    #print(f"LEARNING RATE: {lr_schedule}")
    print(f"LEARNING RATE: {LR}")
    optimizer = Adam(lr=float(LR))
    finetune_model.compile(optimizer, loss=loss, metrics=metrics)

    if pipeline["early_stopping"]:
        early = EarlyStopping(monitor='val_loss',
                              min_delta=1e-3,
                              patience=pipeline["early_stopping_patience"],
                              verbose=1,
                              restore_best_weights=True)
        callbacks_finetune.append(early)

    if pipeline["reduce_lr"]:
        print("USING REDUCE_LR")
        reduce_lr_loss = ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.1,
            patience=pipeline["reduce_lr_patience"],
            verbose=1,
            mode='min')
        callbacks_finetune.append(reduce_lr_loss)
        print(reduce_lr_loss)
        print(callbacks_finetune)
        print("OK - USING REDUCE_LR")

    if pipeline["checkpoint"]:
        #mod_filename = f'{kfold}_{architecture}_{iteracion}_'+'{epoch:02d}-{val_loss:.2f}.hdf5'
        mod_filename = f'{kfold}_{architecture}_{iteracion}.h5'
        mod_path = os.path.join(pipeline["save_path_models"], mod_filename)
        mcp_save = ModelCheckpoint(mod_path,
                                   save_best_only=True,
                                   monitor='val_loss',
                                   verbose=1,
                                   mode='auto',
                                   save_weights_only=False)
        callbacks_finetune.append(mcp_save)

    if len(callbacks_finetune) == 0:
        callbacks_finetune = None

    if pipeline["class_weight"]:
        import numpy as np
        from sklearn.utils import class_weight

        print("CALCULATING CLASS_WEIGHTS")

        if etapa == 'train':
            class_weights = calculate_weights(datos['df_train'], pipeline)
        if etapa == 'train_EL':
            class_weights = calculate_weights(datos['df_train_EL'], pipeline)

        #class_weights = class_weight.compute_class_weight('balanced',
        #                                                np.unique(train_generator.classes),
        #                                                train_generator.classes)
        print("OK - CALCULATING CLASS_WEIGHTS")
        print("USING CLASS WEIGHTING")
        print(class_weights)
        print("OK - CLASS WEIGHTING")
    else:
        class_weights = None

    print("CALLBACKS FINETUNE")
    print(callbacks_finetune)
    print("OK - CALLBACKS FINETUNE")

    history = finetune_model.fit(
        train_generator,
        epochs=NUM_EPOCHS,
        workers=1,
        steps_per_epoch=STEP_SIZE_TRAIN,
        validation_data=valid_generator,
        validation_steps=STEP_SIZE_VALID,
        verbose=1,
        callbacks=callbacks_finetune,
        class_weight=class_weights,
    )

    val_score = finetune_model.evaluate(valid_generator,
                                        verbose=0,
                                        steps=STEP_SIZE_VALID)
    test1_score = finetune_model.evaluate(test_generator1,
                                          verbose=0,
                                          steps=STEP_SIZE_TEST1)
    test2_score = finetune_model.evaluate(test_generator2,
                                          verbose=0,
                                          steps=STEP_SIZE_TEST2)

    class_metrics1 = classification_metrics(finetune_model, train_generator,
                                            test_generator1, STEP_SIZE_TEST1,
                                            kfold, iteracion, architecture,
                                            pipeline)

    class_metrics2 = classification_metrics(finetune_model, train_generator,
                                            test_generator2, STEP_SIZE_TEST2,
                                            kfold, iteracion, architecture,
                                            pipeline)

    print("Val  Loss : ", val_score[0])
    print("Test1 Loss : ", test1_score[0])
    print("Test2 Loss : ", test2_score[0])
    print("Val  Accuracy : ", val_score[1])
    print("Test1 Accuracy : ", test1_score[1])
    print("Test2 Accuracy : ", test2_score[1])

    print(f"Test1 Precision: {class_metrics1[0]}")
    print(f"Test1 Recall: {class_metrics1[1]}")
    print(f"Test1 F1-Score: {class_metrics1[2]}")
    print(f"Test1 Support: {class_metrics1[3]}")

    print(f"Test2 Precision: {class_metrics2[0]}")
    print(f"Test2 Recall: {class_metrics2[1]}")
    print(f"Test2 F1-Score: {class_metrics2[2]}")
    print(f"Test2 Support: {class_metrics2[3]}")

    end_model = time.time()
    time_training = end_model - start_model
    print(f"training time of - {architecture}", time_training)

    if pipeline["checkpoint"]:
        from tensorflow.keras.models import load_model
        print(f"LOADING BEST MODEL FROM {mod_path}")
        model_checkpoint_finetune = load_model(mod_path, compile=True)

        val_score = model_checkpoint_finetune.evaluate(valid_generator,
                                                       verbose=1,
                                                       steps=STEP_SIZE_VALID)
        test_score = model_checkpoint_finetune.evaluate(test_generator,
                                                        verbose=1,
                                                        steps=STEP_SIZE_TEST)

        print("Val  Accuracy from Best: ", val_score[1])
        print("Test Accuracy from Best: ", test_score[1])

    logs = []
    logs.append([
        kfold, iteracion, architecture, val_score[0], val_score[1],
        test1_score[0], test1_score[1], class_metrics1[0], class_metrics1[1],
        class_metrics1[2], test2_score[0], test2_score[1], class_metrics2[0],
        class_metrics2[1], class_metrics2[2]
    ])

    logs_time = []
    logs_time.append([kfold, iteracion, architecture, time_training])

    save_logs(logs, 'train', pipeline)
    save_logs(logs_time, 'time', pipeline)
    save_plots(history, kfold, iteracion, architecture, pipeline)

    model_performance['val_acc'] = val_score[1]
    model_performance['test1_acc'] = test1_score[1]
    model_performance['test2_acc'] = test2_score[1]

    exp_id = str(pipeline["id"])

    if pipeline['checkpoint']:
        return model_checkpoint_finetune, model_performance

    # NOTE: in the original, the save block below sat unreachable after the
    # early return above; guarding it with the save_model flag (as in the
    # earlier training() example) is an assumption.
    if pipeline.get('save_model'):
        save_path_model = os.path.join(
            pipeline['save_path_model'], pipeline['dataset_base'],
            f'exp_{exp_id}_{kfold}_{iteracion}_{architecture}.h5')
        # SAVE MODELS BEST_ONLY # MODEL CHECKPOINT
        # https://stackoverflow.com/questions/48285129/saving-best-model-in-keras

        start = time.time()

        print(f"SAVING MODEL ON {save_path_model}")
        finetune_model.save(save_path_model)
        print(f"OK - SAVING MODEL ON {save_path_model}")

        end = time.time()
        print(f"TOTAL TIME TO SAVE: {end - start}")

        return save_path_model, model_performance

    return finetune_model, model_performance
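
calculate_weights is referenced above but not shown. A minimal sketch of a compatible implementation built on scikit-learn's balanced class weights; returning a dict keyed by class index in sorted label order (what Keras' class_weight argument expects, and how flow_from_dataframe orders classes) is an assumption.

import numpy as np
import pandas as pd
from sklearn.utils.class_weight import compute_class_weight

def calculate_weights(df, pipeline):
    y = df[pipeline["y_col_name"]].values
    classes = np.unique(y)  # sorted; assumed to match the generator's class order
    weights = compute_class_weight(class_weight="balanced", classes=classes, y=y)
    return {i: w for i, w in enumerate(weights)}  # {class_index: weight}

# Usage on a toy, imbalanced frame.
df = pd.DataFrame({"label": ["0"] * 8 + ["1"] * 2})
print(calculate_weights(df, {"y_col_name": "label"}))  # {0: 0.625, 1: 2.5}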
Example #7
def evaluate_cotrain(modelo1, modelo2, modelo3,
                     arquitectura1, arquitectura2, arquitectura3,
                     datos, etapa, kfold, iteracion,
                     pipeline, models_info, logs):

    train_generator_arch1, test_generator_arch1, STEP_SIZE_TEST_arch1 = generadores(
        etapa, arquitectura1, datos, pipeline, False, iteracion, models_info)
    train_generator_arch2, test_generator_arch2, STEP_SIZE_TEST_arch2 = generadores(
        etapa, arquitectura2, datos, pipeline, False, iteracion, models_info)
    train_generator_arch3, test_generator_arch3, STEP_SIZE_TEST_arch3 = generadores(
        etapa, arquitectura3, datos, pipeline, False, iteracion, models_info)

    df1 = evaluar(modelo1, train_generator_arch1, test_generator_arch1,
                  STEP_SIZE_TEST_arch1)
    df2 = evaluar(modelo2, train_generator_arch2, test_generator_arch2,
                  STEP_SIZE_TEST_arch2)
    df3 = evaluar(modelo3, train_generator_arch3, test_generator_arch3,
                  STEP_SIZE_TEST_arch3)

    import numpy as np
    import pandas as pd
    from sklearn.metrics import precision_recall_fscore_support

    predicciones = []
    predicciones_logs = []

    for i in range(len(df1)):

        c1 = (df1['Predictions'][i] == df2['Predictions'][i])
        c2 = (df1['Predictions'][i] == df3['Predictions'][i])
        c3 = (df2['Predictions'][i] == df3['Predictions'][i])

        if c1 or c2:
            # Model 1 agrees with at least one other model.
            predicciones.append([df1['Filename'][i], df1['Predictions'][i]])
            selected = df1['Predictions'][i]
            prob_selected = df1['Max_Probability'][i]
            predicciones_logs.append([df1['Filename'][i], selected, prob_selected, "democ",
                                      df1['Predictions'][i], df1['Max_Probability'][i],
                                      df2['Predictions'][i], df2['Max_Probability'][i],
                                      df3['Predictions'][i], df3['Max_Probability'][i]])
        elif c3:
            # Models 2 and 3 agree against model 1.
            predicciones.append([df2['Filename'][i], df2['Predictions'][i]])
            selected = df2['Predictions'][i]
            prob_selected = df2['Max_Probability'][i]
            predicciones_logs.append([df1['Filename'][i], selected, prob_selected, "democ",
                                      df1['Predictions'][i], df1['Max_Probability'][i],
                                      df2['Predictions'][i], df2['Max_Probability'][i],
                                      df3['Predictions'][i], df3['Max_Probability'][i]])
        else:
            # Full disagreement: keep the most confident model's prediction.
            probabilidades = np.array([df1['Max_Probability'][i],
                                       df2['Max_Probability'][i],
                                       df3['Max_Probability'][i]])
            indice_prob_max = probabilidades.argmax()

            clases = np.array([df1['Predictions'][i], df2['Predictions'][i],
                               df3['Predictions'][i]])
            real = np.array([df1['Filename'][i], df2['Filename'][i],
                             df3['Filename'][i]])

            predicciones.append([real[indice_prob_max], clases[indice_prob_max]])

            selected = clases[indice_prob_max]
            prob_selected = probabilidades[indice_prob_max]
            predicciones_logs.append([df1['Filename'][i], selected, prob_selected, "max",
                                      df1['Predictions'][i], df1['Max_Probability'][i],
                                      df2['Predictions'][i], df2['Max_Probability'][i],
                                      df3['Predictions'][i], df3['Max_Probability'][i]])
            
    results = pd.DataFrame(predicciones, columns=["filename", "predictions"])

    # The true class is the parent directory name in the file path.
    results['filename'] = results['filename'].apply(lambda x: x.split('/')[-2])
    y_true = results['filename'].values.tolist()
    y_pred = results['predictions'].values.tolist()

    labels_arch1 = train_generator_arch1.class_indices

    print("LABELS CO-TRAIN")
    print([*labels_arch1])

    architecture = 'co-train'

    class_metrics = precision_recall_fscore_support(
        y_true, y_pred, average=pipeline["metrics"])
    # TODO: fix and re-enable the confusion-matrix path here
    # (calculate_confusion_matrix, save_confusion_matrix, cm normalization,
    # plot_confusion_matrix, and per-class accuracy logging).
    
    from sklearn.metrics import accuracy_score
    co_train_accu = accuracy_score(y_true, y_pred)

    logs.append([kfold, iteracion, architecture, None, None, None, co_train_accu,
                 class_metrics[0], class_metrics[1], class_metrics[2],
                 class_metrics[3]])

    print(f"Co-train Accuracy: {co_train_accu}")
    print(f"Co-train Precision: {class_metrics[0]}")
    print(f"Co-train Recall: {class_metrics[1]}")
    print(f"Co-train F1-Score: {class_metrics[2]}")
    print(f"Co-train Support: {class_metrics[3]}")

    save_logs(logs, 'train', pipeline)
    return co_train_accu, [df1, df2, df3]
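
The co-train metrics above come straight from scikit-learn. A toy sketch of precision_recall_fscore_support under an averaged setting such as the one this pipeline passes via pipeline["metrics"]:

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

y_true = ["3", "4", "3", "5", "4"]
y_pred = ["3", "4", "5", "5", "3"]

# average="macro" is one valid value for pipeline["metrics"].
prec, rec, f1, support = precision_recall_fscore_support(
    y_true, y_pred, average="macro", zero_division=0)

print(accuracy_score(y_true, y_pred))  # 0.6
print(prec, rec, f1, support)          # support is None for averaged scores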
Example #8
def ssl_global(archivos, model_zoo, csvs, pipeline):

    datos = {}
    models_info = {}
    logs_label = []  # label-stage log accumulator
    #df_train, df_val, df_test1, df_test2 = get_data(archivos, csvs, pipeline)  # WIP: resume here

    # Measure execution time
    start = time.time()
    fold = dividir_balanceado2(df_train, 4)

    for kfold in range(1):

        if dataset == 'gleasson':
            #import pandas as pd
            df_train_58 = pd.DataFrame([fold[kfold][0], fold[kfold][2]]).T
            df_train_58.columns = [x_col_name, y_col_name]

            df_val = pd.DataFrame([fold[kfold][1], fold[kfold][3]]).T
            df_val.columns = [x_col_name, y_col_name]

            fold1 = dividir_balanceado2(df_train_58, 4)
            df_train = pd.DataFrame([fold1[0][1], fold1[0][3]]).T
            df_train.columns = [x_col_name, y_col_name]

            df_train.to_csv('data/train.csv', index=False)
            df_val.to_csv('data/val.csv', index=False)
            df_test1.to_csv('data/test1.csv', index=False)
            df_test2.to_csv('data/test2.csv', index=False)

            df_U = pd.DataFrame([fold1[0][0], fold1[0][2]]).T
            df_U.columns = [x_col_name, y_col_name]
            EL, LC = [], []

            print("train :", len(df_train))
            print("val   :", len(df_val))
            print("u     :", len(df_U))

            # Split U into batches for labeling
            batch_set = list(dividir_lotes(df_U, numero_lotes))
            for i in range(len(batch_set)):
                print(len(batch_set[i].iloc[:, 0].values.tolist()))

        datos['df_train'] = df_train
        datos['df_val'] = df_val
        datos['df_test1'] = df_test1
        datos['df_test2'] = df_test2

        for iteracion in range(numero_lotes * 1):

            #import random
            random.seed(SEED)
            np.random.seed(SEED)
            tensorflow.random.set_random_seed(SEED)

            print("\n######################")
            print("K-FOLD {} - ITERACION {}".format(kfold, iteracion))
            print("######################\n")

            if iteracion == 0:
                etapa = 'train'
            else:
                etapa = 'train_EL'

            print(pipeline["path_label_stats"] + str(pipeline["id"]) + '_' +
                  str(iteracion) + '.pickle')

            for model in model_zoo:

                model_memory, model_performance = entrenamiento(
                    kfold, etapa, datos, model, train_epochs, batch_epochs,
                    early_stopping, iteracion, models_info, pipeline)

                models_info[model] = {
                    'model_memory': model_memory,
                    'model_performance': model_performance['val_acc']
                }

            #import pandas as pd
            df_temp = pd.DataFrame(models_info).T
            top_models = df_temp.sort_values('model_performance',
                                             ascending=False)
            top_models = top_models.reset_index()['index'].values.tolist()[:3]

            mod_top1, arch_top1 = models_info[
                top_models[0]]['model_memory'], top_models[0]
            mod_top2, arch_top2 = models_info[
                top_models[1]]['model_memory'], top_models[1]
            mod_top3, arch_top3 = models_info[
                top_models[2]]['model_memory'], top_models[2]

            if dataset == 'gleasson':
                print(
                    "\nCo-train1: \n",
                    evaluate_cotrain(mod_top1, mod_top2, mod_top3, arch_top1,
                                     arch_top2, arch_top3,
                                     'gleasson-patologo1', datos, etapa, kfold,
                                     iteracion, pipeline, models_info))
                print(
                    "\nCo-train2: \n",
                    evaluate_cotrain(mod_top1, mod_top2, mod_top3, arch_top1,
                                     arch_top2, arch_top3,
                                     'gleasson-patologo2', datos, etapa, kfold,
                                     iteracion, pipeline, models_info))

            if semi_method == 'supervised':
                break

            if iteracion < numero_lotes:

                df_batchset = batch_set[iteracion]
                df_batchset.columns = [x_col_name, y_col_name]
                df_batchset[y_col_name] = '0'
            else:
                if iteracion == numero_lotes:
                    df_LC = pd.DataFrame(LC)
                    batch_set_LC = list(dividir_lotes(df_LC, numero_lotes))
                    for i in range(len(batch_set_LC)):
                        print(len(batch_set_LC[i].iloc[:, 0].values.tolist()))
                    LC = []

                df_batchset = pd.DataFrame([
                    batch_set_LC[int(iteracion -
                                     numero_lotes)].iloc[:, 0].values.tolist()
                ]).T
                df_batchset.columns = [x_col_name]
                df_batchset[y_col_name] = '0'

            datos['df_batchset'] = df_batchset

            EL, LC, EL_iter, LC_iter = labeling(etapa, mod_top1, mod_top2,
                                                mod_top3, arch_top1, arch_top2,
                                                arch_top3, EL, LC, datos,
                                                pipeline, iteracion,
                                                models_info)
            #logs_label.append([kfold,iteracion,arch_top1,arch_top2,arch_top3,len(EL_iter),len(LC_iter)])
            #save_logs(logs_label,'label',pipeline)

            #df_EL = pd.DataFrame(EL, columns=[x_col_name, y_col_name, 'arch_scores'])
            #df_LC = pd.DataFrame(LC, columns=[x_col_name, y_col_name, 'arch_scores'])

            df_EL = pd.DataFrame(
                EL_iter, columns=[x_col_name, y_col_name,
                                  'arch_scores'])  # EXP30
            df_LC = pd.DataFrame(
                LC_iter, columns=[x_col_name, y_col_name,
                                  'arch_scores'])  # EXP30

            df_label_stats = label_stats(df_EL, df_LC)
            print(df_label_stats)
            df_label_stats.to_pickle(pipeline["path_label_stats"] +
                                     str(pipeline["id"]) + '_' +
                                     str(iteracion) + '.pickle')

            #df_train_EL = pd.concat([df_train,df_EL.iloc[:,:2]])
            df_train_EL = df_EL.iloc[:, :2].copy()  # EXP30
            #print(df_train)
            print("df_train_EL")
            print(df_train_EL)
            #print(df_EL.iloc[:,:2])
            #print(df_train_EL)
            datos['df_train_EL'] = df_train_EL

            df_EL_stats = df_label_stats["df_EL_stats"]["df"]
            df_LC_stats = df_label_stats["df_LC_stats"]["df"]

            df_U_iter = pd.concat([df_EL_stats, df_LC_stats],
                                  ignore_index=True)
            # ssl_th was referenced below with its assignment commented out;
            # fall back to the configured threshold (earlier experiments, e.g.
            # EXP 33, derived it from df_U_iter's arch_scores_mean stats).
            ssl_th = pipeline["ssl_threshold"]

            logs_label.append([
                kfold, iteracion, arch_top1, arch_top2, arch_top3,
                len(EL_iter),
                len(LC_iter), ssl_th
            ])
            save_logs(logs_label, 'label', pipeline)
            #reset_keras()
            #models_info = []
    end = time.time()
    print(end - start)
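
dividir_balanceado2 builds the balanced folds used above but is not defined in these examples. A minimal sketch of a compatible stratified splitter using scikit-learn; the returned structure fold[k] = (X_train, X_val, y_train, y_val) is an assumption inferred from the fold[kfold][0..3] indexing above.

import pandas as pd
from sklearn.model_selection import StratifiedKFold

def dividir_balanceado2(df, n_splits, x_col="filename", y_col="label"):
    X, y = df[x_col].values, df[y_col].values
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    folds = []
    for train_idx, val_idx in skf.split(X, y):
        # Matches the fold[k][0..3] indexing used above.
        folds.append((X[train_idx], X[val_idx], y[train_idx], y[val_idx]))
    return folds

df = pd.DataFrame({"filename": [f"img{i}.png" for i in range(8)],
                   "label": ["3", "4"] * 4})
fold = dividir_balanceado2(df, 4)
print(len(fold[0][0]), len(fold[0][1]))  # 6 train, 2 val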
Example #9
import time

import pandas as pd


def ssl_global(model_zoo, pipeline):

    datos = get_dataset(pipeline)

    # Measure execution time
    start = time.time()

    method = pipeline["method"]

    models_info = {}
    kfold = 0  # single-fold run; get_Fold and the logs expect a fold index

    # Accumulators for logs and per-iteration inference frames
    logs, logs_label = [], []
    cotrain_list, label_list = [], []

    if method == "semi-supervised":
        datos = get_Fold(kfold, datos, pipeline)


    if method == "supervised":
        kfold = 0
        total_stages = 1  #pipeline["train_epochs"]
    elif pipeline[
            "labeling_method"] == 'decision' and method == "semi-supervised":
        total_stages = len(datos["batch_set"])
    elif pipeline[
            "labeling_method"] == 'democratic' and method == "semi-supervised":
        total_stages = pipeline["labeling_stages"]
    else:
        pass

    for iteracion in range(total_stages * 1):

        info = f"METHOD - {method} - ITERATION {iteracion}/{total_stages}"
        print("\n" + "#" * len(info))
        print(info)
        print("#" * len(info) + "\n")

        if method == "semi-supervised":
            print("\n")
            print(f"CLASS DISTRIBUTION - BATCH_SET {iteracion}")

            if len(datos["LC"]) > 0:
                U_set = pd.DataFrame(datos["LC"],
                                     columns=[
                                         pipeline["x_col_name"],
                                         pipeline["y_col_name"], 'arch_scores'
                                     ])
                #print("LABELING LOW CONFIDENCE SAMPLES (LC)")
                print(U_set.groupby(pipeline["y_col_name"]).count())
                #print("OK - LABELING LOW CONFIDENCE SAMPLES (LC)")
            else:
                U_set = datos['U']
                #print("LABELING UNLABELED SAMPLES (U)")
                print(U_set.groupby(pipeline["y_col_name"]).count())
                #print("OK - LABELING UNLABELED SAMPLES (U)")

            #print( datos["batch_set"][iteracion].groupby(pipeline["y_col_name"]).count() )
            print(f"OK - CLASS DISTRIBUTION - BATCH_SET {iteracion}")
            print("\n")

        if iteracion == 0 or method == "supervised":
            etapa = 'train'
        else:
            etapa = 'train_EL'

        for model in model_zoo:

            print("AUG_FACTOR: ", pipeline["aug_factor"])

            model_memory, model_performance = training(kfold, etapa, datos,
                                                       model, iteracion,
                                                       models_info,
                                                       classification_metrics,
                                                       pipeline)

            models_info[model] = {
                'model_memory': model_memory,
                'model_performance': model_performance['val_acc']
            }

        df_temp = pd.DataFrame(models_info).T
        top_models = df_temp.sort_values('model_performance', ascending=False)
        top_models = top_models.reset_index()['index'].values.tolist()[:3]

        mod_top1, arch_top1 = models_info[
            top_models[0]]['model_memory'], top_models[0]
        mod_top2, arch_top2 = models_info[
            top_models[1]]['model_memory'], top_models[1]
        mod_top3, arch_top3 = models_info[
            top_models[2]]['model_memory'], top_models[2]

        # Separate timer for co-training inference; the outer `start`
        # keeps tracking total runtime.
        start_infer = time.time()

        print("EVALUATING CO-TRAINING ...")
        print("\n")

        cotrain_acc1, cotrain_infer_dfs1 = evaluate_cotrain(
            mod_top1, mod_top2, mod_top3, arch_top1, arch_top2, arch_top3,
            datos, etapa, kfold, iteracion, pipeline, models_info, 'patologo1',
            logs)

        print("Co-train - Patologo 1: ", cotrain_acc1)

        cotrain_acc2, cotrain_infer_dfs2 = evaluate_cotrain(
            mod_top1, mod_top2, mod_top3, arch_top1, arch_top2, arch_top3,
            datos, etapa, kfold, iteracion, pipeline, models_info, 'patologo2',
            logs)

        print("Co-train - Patologo 2: ", cotrain_acc2)

        df_cotrain_info = {
            "kfold": kfold,
            "iteracion": iteracion,
            "patologo1": {
                "df_arch1": cotrain_infer_dfs1[0],
                "df_arch2": cotrain_infer_dfs1[1],
                "df_arch3": cotrain_infer_dfs1[2]
            },
            "patologo2": {
                "df_arch1": cotrain_infer_dfs2[0],
                "df_arch2": cotrain_infer_dfs2[1],
                "df_arch3": cotrain_infer_dfs2[2]
            },
        }

        cotrain_list.append(df_cotrain_info)
        df_cotrain_list = pd.DataFrame(cotrain_list)

        infer_pkl = pipeline["save_path_stats"] + 'exp_' + str(
            pipeline["id"]) + '_' + str(iteracion) + '_cotrain_eval.pkl'

        print("SAVING COTRAIN EVAL PICKLE")
        df_cotrain_list.to_pickle(infer_pkl)
        print("OK - SAVING COTRAIN EVAL PICKLE")

        print("\n")
        print("OK - EVALUATING CO-TRAINING")

        end_infer = time.time()
        infer_time = end_infer - start_infer

        # SAVE INFER_TIME BY DF_TEST BY ITERATION AND ARCH
        print(infer_time, len(datos["df_test1"]))

        logs_infer_time = []
        logs_infer_time.append([
            kfold, iteracion, 'co-train1', infer_time,
            len(datos["df_test1"])
        ])
        save_logs(logs_infer_time, 'infer_time', pipeline)

        if method == "supervised":
            print(f"SUPERVISED METHOD COMPLETED FOR ITERATION: {iteracion}")
            #reset_keras(pipeline)
            continue

        print(f"GETTING BATCH_SET OF ITERATION {iteracion}...")
        print("LABELING ...")

        if pipeline["labeling_method"] == "decision":
            datos, EL_iter, LC_iter, EL_accu, LC_accu, label_infer_df = labeling(
                etapa, mod_top1, mod_top2, mod_top3, arch_top1, arch_top2,
                arch_top3, datos, pipeline, iteracion, models_info)
        elif pipeline["labeling_method"] == "democratic":
            datos, EL_iter, LC_iter, EL_accu, LC_accu, label_infer_df = labeling_v2(
                etapa, mod_top1, mod_top2, mod_top3, arch_top1, arch_top2,
                arch_top3, datos, pipeline, iteracion, models_info)

        df_label_info = {
            "kfold": kfold,
            "iteracion": iteracion,
            "df_arch1": label_infer_df[0],
            "df_arch2": label_infer_df[1],
            "df_arch3": label_infer_df[2]
        }

        label_list.append(df_label_info)
        df_label_list = pd.DataFrame(label_list)

        label_pkl = pipeline["save_path_stats"] + 'exp_' + str(
            pipeline["id"]) + '_' + str(iteracion) + '_labeling.pkl'

        print("SAVING LABEL PICKLE")
        df_label_list.to_pickle(label_pkl)
        print("OK - SAVING LABEL PICKLE")

        print("OK - LABELING")
        print("EL_iter", len(EL_iter))
        print("LC_iter", len(LC_iter))

        df_EL = pd.DataFrame(datos["EL"],
                             columns=[
                                 pipeline["x_col_name"],
                                 pipeline["y_col_name"], 'arch_scores'
                             ])
        df_LC = pd.DataFrame(datos["LC"],
                             columns=[
                                 pipeline["x_col_name"],
                                 pipeline["y_col_name"], 'arch_scores'
                             ])

        df_EL.to_pickle(pipeline["save_path_stats"] + 'exp_' +
                        str(pipeline["id"]) + '_' + str(iteracion) +
                        '_EL.pickle')
        df_LC.to_pickle(pipeline["save_path_stats"] + 'exp_' +
                        str(pipeline["id"]) + '_' + str(iteracion) +
                        '_LC.pickle')

        df_label_stats = label_stats(df_EL, df_LC, pipeline)
        df_label_stats.to_pickle(pipeline["save_path_stats"] + 'exp_' +
                                 str(pipeline["id"]) + '_' + str(iteracion) +
                                 '_stats.pickle')

        df_train_EL = pd.concat([datos["df_train"], df_EL.iloc[:, :2]])
        datos['df_train_EL'] = df_train_EL

        ssl_th = pipeline["ssl_threshold"]

        logs_label.append([
            kfold, iteracion, arch_top1, arch_top2, arch_top3,
            len(EL_iter),
            len(LC_iter), EL_accu, LC_accu, ssl_th
        ])
        save_logs(logs_label, 'label', pipeline)

        reset_keras(pipeline)

    end = time.time()
    print(end - start)
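
reset_keras(pipeline) is called at the end of every iteration but not defined here. A minimal sketch of the session-reset pattern it most likely wraps, clearing the Keras graph and forcing garbage collection so successive training() calls do not accumulate memory; how the pipeline argument is used is an assumption.

import gc

def reset_keras(pipeline=None):
    # Drop the current TF graph/session so per-iteration models are freed.
    from tensorflow.keras import backend as K
    K.clear_session()
    gc.collect()
    if pipeline is not None and pipeline.get("verbose"):
        print("Keras session cleared")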