def save_model_outputs(model, _dataset, model_path):
    npy_path = model_path + 'npy/'
    create_folders(npy_path, model_path + 'model/')
    # sub-model that outputs the 'features' layer (pre-softmax), used below
    # for the logits and the distance matrices
    model_soft = Model(model.input, model.get_layer('features').output)
    # save softmax predictions
    pred = model.predict(_dataset.x_train)[:, :_dataset.num_classes]
    pred_int = np.argmax(pred, axis=1)
    np.save(npy_path + 'train_preds.npy', pred)
    np.save(npy_path + 'train_preds_int.npy', pred_int)
    pred = model.predict(_dataset.x_test)[:, :_dataset.num_classes]
    pred_int = np.argmax(pred, axis=1)
    np.save(npy_path + 'test_preds.npy', pred)
    np.save(npy_path + 'test_preds_int.npy', pred_int)
    # save logits
    logits_train = model_soft.predict(
        _dataset.x_train)[:, :_dataset.num_classes]
    logits_test = model_soft.predict(_dataset.x_test)[:, :_dataset.num_classes]
    np.save(npy_path + 'train_logits.npy', logits_train)
    np.save(npy_path + 'test_logits.npy', logits_test)
    # save confusion matrices
    cm_train = plot_cm(model, _dataset.x_train, _dataset.y_train_int(),
                       _dataset.class_names, model_path + 'train_cm.png')
    cm_test = plot_cm(model, _dataset.x_test, _dataset.y_test_int(),
                      _dataset.class_names, model_path + 'test_cm.png')
    np.save(npy_path + 'train_cm.npy', cm_train)
    np.save(npy_path + 'test_cm.npy', cm_test)
    # save distance matrices
    plot_dm(model_soft, _dataset.x_train, _dataset.y_train_int(),
            _dataset.class_names, model_path + 'train_dm.png')
    plot_dm(model_soft, _dataset.x_test, _dataset.y_test_int(),
            _dataset.class_names, model_path + 'test_dm.png')
    # save model
    plot_model(model, model_path + 'model/model.png')
    save_model(model, model_path + 'model/model')
    K.clear_session()
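
# Usage sketch for save_model_outputs (assumes a trained Keras model with a
# layer named 'features' and a dataset object as returned by get_data();
# the dataset name and path below are placeholders, and model_path must end
# with '/'):
#
#     _dataset = get_data('mnist')
#     save_model_outputs(model, _dataset, 'mnist/models/teacher/')
#
# It writes {train,test}_preds(_int).npy and {train,test}_logits.npy under
# <model_path>npy/, confusion/distance-matrix plots under <model_path>, and
# the plotted/serialized model under <model_path>model/.
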
def prep_noisylabels(dataset, folders, noise_type, noise_ratio, verbose, alpha,
                     temperature, is_dropout):
    # prepare required models first
    prep_teacher_model(dataset, verbose)
    prep_student(dataset, verbose, alpha, temperature)

    # generate and save corrupted labels for the given noise type and noise ratio
    _dataset = get_data(dataset)

    # copy the teacher model as the 'none' (no-noise) baseline
    path = str(Path(folders['logdir']).parent)
    if not os.path.isdir(path + '/none/'):
        shutil.copytree('{}/models/teacher'.format(dataset), path + '/none/')

    # generate noisy labels
    y_train_noisy, y_test_noisy, probs = get_noisy_labels(
        _dataset, noise_type, noise_ratio)
    if PLOT_NOISE:
        y_train_clean, y_test_clean = _dataset.y_train_int(
        ), _dataset.y_test_int()
        create_folders(folders['noisedir'])
        if not isfile(folders['noisedir'] + 'cmtest.png'):
            print("Noise for {} doesn't exist, creating it...".format(
                folders['noisedir']))
            # plot confused samples
            if probs is not None:
                create_folders(folders['noisedir'] + 'plots/')
                np.save(folders['noisedir'] + 'probs.npy', probs)
                _dataset_noisy = get_data(dataset,
                                          y_noisy=y_train_noisy,
                                          y_noisy_test=y_test_noisy)
                plot_confused_samples(probs,
                                      _dataset_noisy,
                                      path=folders['noisedir'] + 'plots/')
                plot_confused_samples2(probs,
                                       _dataset_noisy,
                                       path=folders['noisedir'] + 'plots/')
            # save confusion matrix
            cm = confusion_matrix(y_train_clean, y_train_noisy)
            plot_matrix(cm,
                        _dataset.class_names,
                        title='Noise ratio: {}'.format(noise_ratio))
            plt.savefig(folders['noisedir'] + 'cmtrain.png')
            cm = confusion_matrix(y_test_clean, y_test_noisy)
            plot_matrix(cm,
                        _dataset.class_names,
                        title='Noise ratio: {}'.format(noise_ratio))
            plt.savefig(folders['noisedir'] + 'cmtest.png')

    return y_train_noisy, y_test_noisy
def tensor_board(dataset, log_dir, is_embed=False, **kwargs):
    tb_dir = log_dir + 'tensorboard/'
    create_folders(tb_dir)
    with open(join(tb_dir, 'metadata.tsv'), 'w') as f:
        np.savetxt(f, dataset.y_test_int(), fmt='%d')
    if not is_embed:
        return TensorBoard(log_dir=tb_dir, **kwargs)
    else:
        return TensorBoard(log_dir=tb_dir,
                           embeddings_freq=1,
                           embeddings_layer_names=['features'],
                           embeddings_metadata='metadata.tsv',
                           embeddings_data=dataset.x_test,
                           **kwargs)
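
# Usage sketch for tensor_board (assumes the old-style Keras TensorBoard
# callback that still accepts the embeddings_* arguments):
#
#     tb = tensor_board(dataset, log_dir, is_embed=True)
#     model.fit(x_train, y_train, callbacks=[tb])
#
# With is_embed=True, activations of the 'features' layer on dataset.x_test
# are logged for the TensorBoard projector, labelled via metadata.tsv.
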
def train(dataset,
          model,
          epochs=50,
          batch_size=128,
          log_dir=None,
          callbacks=None,
          verbose=1):
    # prepare folders (only needed when logging is enabled)
    if log_dir is not None:
        create_folders(log_dir + 'model/', log_dir + 'npy/')

    # get data
    x_train, y_train, x_test, y_test = dataset.get_data()
    x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.5,
                                                      random_state=RANDOM_SEED)

    # callbacks
    mycallbacks = list(callbacks) if callbacks else []
    if log_dir is not None:
        mycallbacks.extend([
            MyLogger(log_dir, dataset),
            learning_rate_scheduler(lr_schedule, verbose=verbose),
            lr_plateau(factor=np.sqrt(0.1),
                       cooldown=0,
                       patience=5,
                       min_lr=0.5e-6,
                       verbose=verbose)
        ])
        plot_model(model, log_dir + 'model/model.png')
    # image generator
    datagen = ImageDataGenerator(width_shift_range=0.1, height_shift_range=0.1)
    # train model on clean data
    model.fit_generator(datagen.flow(x_train, y_train, batch_size=batch_size),
                        steps_per_epoch=len(x_train) // batch_size,
                        epochs=epochs,
                        validation_data=(x_val, y_val),
                        verbose=verbose,
                        callbacks=mycallbacks)

    loss, acc = model.evaluate(x_test, y_test, verbose=0)
    print('test_loss:', loss, '- test_acc:', acc)

    del mycallbacks
    gc.collect()

    return model
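
# Minimal training sketch (the dataset and model names are placeholders; see
# main() below for the actual call path):
#
#     dataset = get_data('mnist')
#     model = get_model(dataset, 'baseline')
#     model = train(dataset, model, epochs=50, batch_size=128,
#                   log_dir='mnist/logs_1/example/')
#
# Note that half of the training split is held out as validation data
# (test_size=0.5) and augmentation is limited to small width/height shifts.
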
def train_coteaching(dataset,
                     model1,
                     model2,
                     epochs=50,
                     batch_size=128,
                     log_dir=None,
                     forget_rate=0.2,
                     num_graduals=10,
                     exponent=0.2,
                     learning_rate=1e-3,
                     epoch_decay_start=30):
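    """Co-teaching training loop: per batch, each network ranks samples by its
    own cross-entropy loss, keeps the smallest-loss `remember_rate` fraction,
    and its peer is trained on that selection. The model with the higher final
    test accuracy is returned."""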
    def epoch_stats(dataset, model, epoch, logdir, csv_path=None):
        x_train, y_train_noisy, x_test, y_test = dataset.get_data()
        y_train_int = dataset.y_noisy_int()
        y_test_int = dataset.y_test_int()
        clean_index = dataset.idx_clean
        noisy_index = dataset.idx_noisy

        if csv_path is not None:
            train_loss, train_acc = model.evaluate(x_train, y_train_noisy,
                                                   verbose=0)
            test_loss, test_acc = model.evaluate(x_test, y_test, verbose=0)
            print("-%s: acc_test: %.4f - acc_mix: %.4f" %
                  (model.name, test_acc, train_acc))
            df = pd.read_csv(logdir + csv_path)
            #row = [{'epoch':epoch, 'acc':acc_mix,'loss':loss_mix, 'val_acc':acc_test, 'val_loss':loss_test,'test_acc':test_acc,'test_loss':test_loss}]
            row = [{
                'epoch': epoch,
                'acc': train_acc,
                'loss': train_loss,
                'test_acc': test_acc,
                'test_loss': test_loss
            }]
            df = pd.concat([df, pd.DataFrame(row)], ignore_index=True)
            df.to_csv(logdir + csv_path, index=False)

            xticks = np.arange(1, len(df) + 1, 1.0)
            plt.figure()
            plt.plot(xticks, df['acc'], label="acc")
            #plt.plot(xticks, df['val_acc'], label="val_acc")
            plt.plot(xticks, df['test_acc'], label="test_acc")
            plt.legend(loc='best')
            plt.xlabel('# epochs')
            plt.xticks(xticks)
            plt.title('Accuracy')
            plt.savefig(logdir + 'accuracy_{}.png'.format(model.name))

            plt.clf()
            plt.plot(xticks, df['loss'], label="loss")
            #plt.plot(xticks, df['val_loss'], label="val_loss")
            plt.plot(xticks, df['test_loss'], label="test_loss")
            plt.legend(loc='best')
            plt.xlabel('# epochs')
            plt.xticks(xticks)
            plt.title('Loss')
            plt.savefig(logdir + 'loss_{}.png'.format(model.name))
            plt.close()

    create_folders(log_dir + 'model/', log_dir + 'npy/')
    # get data
    x_train, y_train, x_test, y_test = dataset.get_data()
    model1._name = 'model1'
    model2._name = 'model2'

    # number of batches in an epoch
    num_batch_iter = int(np.ceil(x_train.shape[0] / batch_size))
    # calculate forget rates for each epoch (from the original code)
    forget_rates = np.ones(epochs) * forget_rate
    forget_rates[:num_graduals] = np.linspace(0, forget_rate**exponent,
                                              num_graduals)
    # calculate learning rates for each epoch (from the original code)
    learning_rates = [learning_rate] * epochs
    for i in range(epoch_decay_start, epochs):
        learning_rates[i] = float(epochs - i) / (
            epochs - epoch_decay_start) * learning_rate
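
    # Worked example with the defaults above (epochs=50, forget_rate=0.2,
    # num_graduals=10, exponent=0.2, learning_rate=1e-3, epoch_decay_start=30):
    #   forget_rates[:10] ramp linearly from 0.0 to 0.2**0.2 ~= 0.72,
    #   forget_rates[10:] stay at 0.2;
    #   learning_rates[i] = 1e-3 for i < 30, then (50 - i) / 20 * 1e-3
    #   (e.g. 5e-4 at i = 40, 5e-5 at i = 49).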

    if log_dir is not None:
        #logcsv_cols =['epoch','acc','loss','val_acc','val_loss','test_acc','test_loss']
        logcsv_cols = ['epoch', 'acc', 'loss', 'test_acc', 'test_loss']
        df = pd.DataFrame(columns=logcsv_cols)
        df.to_csv(log_dir + 'log1.csv', index=False)
        df.to_csv(log_dir + 'log2.csv', index=False)

    for e in range(epochs):
        # if the learning rate changed, recompile with the new rate
        if e > 0 and learning_rates[e] != learning_rates[e - 1]:
            model1.compile(loss='categorical_crossentropy',
                           optimizer=SGD(lr=learning_rates[e],
                                         decay=1e-6,
                                         momentum=0.9,
                                         nesterov=True),
                           metrics=['accuracy'])
            model2.compile(loss='categorical_crossentropy',
                           optimizer=SGD(lr=learning_rates[e],
                                         decay=1e-6,
                                         momentum=0.9,
                                         nesterov=True),
                           metrics=['accuracy'])
            #model2.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rates[e]), metrics=['accuracy'])

        remember_rate = 1 - forget_rates[e]
        print("Epoch: %d/%d; Learning rate: %.7f; n_keep: %d" %
              (e + 1, epochs, learning_rates[e], remember_rate * batch_size))

        # iterate for each batch in an epoch
        for (i, (x_batch,
                 y_batch)) in enumerate(dataset.flow_train(batch_size)):
            num_remember = int(remember_rate * len(x_batch))

            # select small-loss samples according to model 1
            y_pred = model1.predict_on_batch(x_batch)
            cross_entropy = np.sum(-y_batch * np.log(y_pred + 1e-8), axis=1)
            batch_idx1 = np.argsort(cross_entropy)[:num_remember]

            # select small-loss samples according to model 2
            y_pred = model2.predict_on_batch(x_batch)
            cross_entropy = np.sum(-y_batch * np.log(y_pred + 1e-8), axis=1)
            batch_idx2 = np.argsort(cross_entropy)[:num_remember]

            # cross-update: each model trains on the samples its peer selected
            model1.train_on_batch(x_batch[batch_idx2, :],
                                  y_batch[batch_idx2, :])
            model2.train_on_batch(x_batch[batch_idx1, :],
                                  y_batch[batch_idx1, :])

            if i >= num_batch_iter:
                break

        epoch_stats(dataset, model1, e + 1, log_dir, 'log1.csv')
        epoch_stats(dataset, model2, e + 1, log_dir, 'log2.csv')

    # choose best model
    loss1, acc1 = model1.evaluate(x_test, y_test, verbose=0)
    loss2, acc2 = model2.evaluate(x_test, y_test, verbose=0)
    if acc1 > acc2:
        os.rename(log_dir + 'log1.csv', log_dir + 'log.csv')
        return model1
    else:
        os.rename(log_dir + 'log2.csv', log_dir + 'log.csv')
        return model2
def main(dataset_name,
         model_name,
         epochs,
         batch_size,
         noise_type,
         noise_ratio,
         verbose=1,
         alpha=util.ALPHA,
         temperature=16,
         is_dropout=False,
         percentage=1):
    K.clear_session()
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    seed_everything()
    # folders to be used
    noise_base_path = 'nr_{}'.format(
        noise_ratio) if not is_dropout else 'nr_{}_do'.format(noise_ratio)
    folders = {
        'logbase':
        '{}/logs_{}/'.format(dataset_name, percentage),
        'logbase_nr':
        '{}/logs_{}/{}/{}/'.format(dataset_name, percentage, noise_base_path,
                                   model_name),
        'logdir':
        '{}/logs_{}/{}/{}/{}/'.format(dataset_name, percentage,
                                      noise_base_path, model_name, noise_type),
        'modelbase':
        '{}/models/'.format(dataset_name),
        'noisebase':
        '{}/noisylabels/'.format(dataset_name),
        'noisedir':
        '{}/noisylabels/{}/'.format(dataset_name, noise_type),
        'dataset':
        '{}/dataset'.format(dataset_name)
    }

    # if the log files already exist, don't run again
    if isfile(folders['logdir'] +
              'model/model.h5') and isfile(folders['logdir'] +
                                           'model/model.json'):
        print('Logs exist, skipping run...')
        return

    # clean empty logs if there is any
    clean_empty_logs()
    # create necessary folders
    create_folders(folders['dataset'], folders['logdir'])
    # generate noisy labels
    y_train_noisy, y_test_noisy = prep_noisylabels(dataset_name, folders,
                                                   noise_type, noise_ratio,
                                                   verbose, alpha, temperature,
                                                   is_dropout)

    # load dataset with noisy labels
    dataset = get_data(dataset_name,
                       y_noisy=y_train_noisy,
                       y_noisy_test=y_test_noisy)
    dataset.get_percentage(percentage)

    # stats before training
    print(
        'Dataset: {}, model: {}, noise_type: {}, noise_ratio: {}, epochs: {}, batch: {} , dropout: {}'
        .format(dataset.name, model_name, noise_type, noise_ratio, epochs,
                batch_size, is_dropout))
    dataset.get_stats()
    dataset.save_cm_train(folders['logdir'] + 'corrupted_data.png')

    # train model
    if model_name == 'coteaching':
        model1 = get_model(dataset, model_name, is_dropout=is_dropout)
        model2 = get_model(dataset, model_name, is_dropout=is_dropout)
        model = train_coteaching(dataset, model1, model2, epochs, batch_size,
                                 folders['logdir'])
    else:
        #cm = np.load('{}/models/xy/npy/test_cm.npy'.format(dataset_name))
        cm = dataset.get_cm_train()
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        model = get_model(dataset, model_name, cm, is_dropout=is_dropout)
        model = train(dataset,
                      model,
                      epochs,
                      batch_size,
                      folders['logdir'],
                      verbose=verbose)

    # performance analysis
    postprocess(dataset, model, noise_type, noise_ratio, folders, y_test_noisy)
    K.clear_session()
def model_checkpoint(log_dir, **kwargs):
    logdir = log_dir + 'model/'
    create_folders(logdir)
    return ModelCheckpoint(logdir + "checkpoint.hdf5", **kwargs)
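
# Usage sketch: model_checkpoint returns a standard Keras ModelCheckpoint,
# which can be passed to train() via its callbacks argument (monitor and
# save_best_only are ordinary ModelCheckpoint kwargs):
#
#     ckpt = model_checkpoint(log_dir, monitor='val_acc', save_best_only=True)
#     model = train(dataset, model, log_dir=log_dir, callbacks=[ckpt])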