예제 #1
0
def main(cube_dir, hyperband_iterations, max_epochs):
    """Run a Hyperband hyperparameter search for the AGN-finder emulator.

    Args:
        cube_dir: directory passed to ``deep_emulator.data`` to load the
            (rescaled) training and validation cubes.
        hyperband_iterations: number of full Hyperband iterations to run.
        max_epochs: per-trial epoch budget for the Hyperband oracle.

    Raises:
        SystemError: if no GPU device is available.
    """
    device_name = tf.test.gpu_device_name()
    if device_name != '/device:GPU:0':
        raise SystemError('GPU device not found')
    logging.info('Found GPU at: {}'.format(device_name))

    x, y, val_x, val_y = deep_emulator.data(cube_dir, rescale=True)

    tuner = Hyperband(
        build_model,
        objective='val_mean_absolute_error',
        hyperband_iterations=hyperband_iterations,
        max_epochs=max_epochs,
        directory='results/hyperband',
        project_name='agnfinder_5layer_dropout'
    )

    # NOTE(review): with no arguments this monitors 'val_loss' with
    # patience=0 (stop on the first non-improving epoch) — confirm that is
    # intended given the tuner objective is val_mean_absolute_error.
    early_stopping = keras.callbacks.EarlyStopping(restore_best_weights=True)

    tuner.search(
        x,
        y,
        callbacks=[early_stopping],
        validation_data=(val_x, val_y),
        batch_size=1024
    )

    tuner.results_summary()

    models = tuner.get_best_models(num_models=5)

    for n, model in enumerate(models):
        logging.info(f'Model {n}')
        # Bug fix: model.summary() prints to stdout and returns None, so
        # logging its return value only logged "None". Route the summary
        # text through the logger instead.
        model.summary(print_fn=logging.info)
예제 #2
0
def find_best_NN(x_train_main, y_train_main):
  """Tune ``build_model`` with Hyperband and return the best rebuilt model.

  Args:
      x_train_main: training inputs fed to the tuner search.
      y_train_main: training targets fed to the tuner search.

  Returns:
      An untrained Keras model built from the best hyperparameters found.
  """
  tuner = Hyperband(build_model, objective="loss", max_epochs=10, hyperband_iterations=10)
  # Bug fix: the search previously referenced undefined globals
  # x_train/y_train instead of the function's own parameters.
  tuner.search(x_train_main, y_train_main, batch_size=1, epochs=10, validation_split=0.3)
  tuner.results_summary()
  print("\n\n\n")
  print("\n\n\nHERE IS THE BEST MODEL\n\n\n")
  best_params = tuner.get_best_hyperparameters()[0]
  best_model = tuner.hypermodel.build(best_params)
  best_model.summary()
  return best_model
예제 #3
0
def find_best(x_train, y_train):
  """Search for the best architecture with Hyperband and return it untrained."""
  # Tuner that explores the architectures produced by build_model.
  tuner = Hyperband(build_model, objective="loss", max_epochs=10, hyperband_iterations=3)
  print("\n\n\n")
  # Kick off the automated hyperparameter search.
  print('[INFO] start searching')
  tuner.search(x_train, y_train, batch_size=128, epochs=10, validation_split=0.2)
  # Report the ranked trials.
  print("\n\n\nRESULTS SUMMARY")
  tuner.results_summary()
  print("\n\n\n")
  # Rebuild a fresh model from the winning hyperparameter set.
  print("\n\n\nHERE IS THE BEST MODEL\n\n\n")
  winning_params = tuner.get_best_hyperparameters()[0]
  winner = tuner.hypermodel.build(winning_params)
  winner.summary()
  return winner
예제 #4
0
def hypermodel_exec(x_train, x_test, y_train, y_test):
    """Tune HotCNNHyperModel with Hyperband and evaluate the best model.

    Args:
        x_train, y_train: data used for the tuner search (10% held out
            internally via validation_split).
        x_test, y_test: held-out data used to evaluate the winning model.

    Returns:
        Tuple ``(best_model, loss, accuracy)`` — previously these values
        were computed and silently discarded.
    """
    SEED = 17
    MAX_TRIALS = 40  # used by the (commented-out) RandomSearch alternative
    EXECUTION_PER_TRIAL = 3
    HYPERBAND_MAX_EPOCHS = 20

    NUM_CLASSES = 1  # One sigmoid neuron for binary classification
    INPUT_SHAPE = (32, 32, 3
                   )  # Depends on embedding type and bp lenght of dataset
    N_EPOCH_SEARCH = 40

    np.random.seed(SEED)

    hypermodel = HotCNNHyperModel(input_shape=INPUT_SHAPE,
                                  num_classes=NUM_CLASSES)

    # tuner = RandomSearch(
    #     hypermodel,
    #     objective='val_loss',
    #     seed=SEED,
    #     max_trials=MAX_TRIALS,
    #     executions_per_trial=EXECUTION_PER_TRIAL,
    #     directory='random_search',
    #     project_name='hot_cnn_promoter_01'
    # )

    tuner = Hyperband(hypermodel,
                      max_epochs=HYPERBAND_MAX_EPOCHS,
                      objective='val_accuracy',
                      seed=SEED,
                      executions_per_trial=EXECUTION_PER_TRIAL,
                      directory='hyperband',
                      project_name='hot_cnn_promoter_01')

    tuner.search_space_summary()

    tuner.search(x_train, y_train, epochs=N_EPOCH_SEARCH, validation_split=0.1)

    # Show a summary of the search
    tuner.results_summary()

    # Retrieve the best model.
    best_model = tuner.get_best_models(num_models=1)[0]

    # Evaluate the best model.
    loss, accuracy = best_model.evaluate(x_test, y_test)
    # Bug fix: the evaluation results were computed but never reported or
    # returned; surface them to the caller.
    print(f'Best model test loss: {loss}, test accuracy: {accuracy}')
    return best_model, loss, accuracy
                  hyperband_iterations=2,
                  directory='HyperBandTrials',
                  project_name='PressureOpti_hl')

# Defining the Early Stopping Function
early_stopping_callback = EarlyStopping(monitor='val_mape',
                                        patience=500,
                                        min_delta=1e-4,
                                        restore_best_weights=True,
                                        mode='auto',
                                        verbose=True)

tuner.search_space_summary()

# Bug fix: model.summary() prints its report and returns None, so wrapping
# it in print() only emitted an extra "None" line.
model.summary()

tuner.search(X_train,
             y_train,
             epochs=2000,
             validation_data=(X_test, y_test),
             callbacks=[early_stopping_callback])

# Save the two best models found by the search.
models = tuner.get_best_models(num_models=2)
models[0].save(
    '/mnt/IMP/Work/Thesis/NeuralNetwork/DeepLearning-RarefiedFlows/SavedModels/Pressure/HyperBand_hl/Best_Model_1'
)
models[1].save(
    '/mnt/IMP/Work/Thesis/NeuralNetwork/DeepLearning-RarefiedFlows/SavedModels/Pressure/HyperBand_hl/Best_Model_2'
)
tuner.results_summary()
예제 #6
0
                     hyperband_iterations=3)

# Print the full hyperparameter search space.
tuner_hb.search_space_summary()

# Explore the search space on the training generator, validating against
# the held-out test images/masks.
tuner_hb.search(train_generator, epochs=500, verbose=1,
                validation_data=(img_test, mask_test))

# Retrieve the winning model and its hyperparameters.
best_model = tuner_hb.get_best_models(1)[0]

best_hyperparameters = tuner_hb.get_best_hyperparameters(1)[0]
print(tuner_hb.get_best_hyperparameters(1))

# Serialize the architecture to JSON and the weights to HDF5.
architecture_json = best_model.to_json()
with open("hp_bce_all_basicUNet_model.json", "w") as arch_file:
    arch_file.write(architecture_json)
best_model.save_weights("hp_bce_all_tune_basicUNet_tuner_model.h5")

# Record the winning hyperparameters for later inspection.
with open('best_LGG_basicUNet_Param.txt', 'w') as param_file:
    print(best_hyperparameters, file=param_file)

print("Saved model to disk")
print(best_hyperparameters)
tuner_hb.results_summary()  # print the best trials
예제 #7
0
# Augmented training batches from the image/mask arrays.
train_generator = train_datagen.flow(
    img_overall_train, mask_overall_train,
    batch_size=16)

# NOTE(review): val_generator is built but never used — the search below
# validates on the raw (img_test, mask_test) arrays instead. Confirm
# whether validation was meant to go through the generator.
val_generator = train_datagen.flow(
    img_test, mask_test)


# Run the Hyperband search (tuner_hb is constructed earlier in the file).
tuner_hb.search(train_generator,epochs = 500,verbose = 1,validation_data = (img_test,mask_test))

# Winning model and hyperparameters from the completed search.
best_model = tuner_hb.get_best_models(1)[0]

best_hyperparameters = tuner_hb.get_best_hyperparameters(1)[0]
print(tuner_hb.get_best_hyperparameters(1))

# Serialize the architecture to JSON.
model_json = best_model.to_json()
with open("hp_bce_all_basicUNet_model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
best_model.save_weights("hp_bce_all_tune_basicUNet_tuner_model.h5")

# Record the winning hyperparameters for later inspection.
with open('best_LGG_basicUNet_Param.txt', 'w') as f:
    print(best_hyperparameters, file=f)


print("Saved model to disk")
print(best_hyperparameters)
tuner_hb.results_summary()
예제 #8
0
    # print(f'train_labels: \n{train_labels}')
    # '''
    # model = build_model()
    # Build tuner
    tuner = Hyperband(build_model,
                      objective='mse',
                      max_epochs=100,
                      executions_per_trial=5,
                      directory='tuning',
                      project_name='tuning')
    # Bug fix: search_space_summary() prints its report and returns None,
    # so print()-wrapping it only emitted an extra "None" line.
    tuner.search_space_summary()

    # Build tensorflow dataset
    dataset = tf.data.Dataset.from_tensor_slices(
        (train_data, train_labels)).batch(128)

    # Train Model
    tuner.search(
        dataset,
        shuffle=True,
        epochs=500,  # war 10000
        verbose=0)
    # Bug fix: get_best_models returns a list; take the top entry so
    # validate_model receives a model instance, not a one-element list.
    model = tuner.get_best_models(num_models=1)[0]
    # results_summary() also prints and returns None — call it directly.
    tuner.results_summary()
    # Save model
    # model.save('model.h5')

    validate_model(model, test_data, test_labels)
    # validate_model(model, train_data, train_labels)
    # '''
예제 #9
0
def main(variant):
    HYPERBAND_MAX_EPOCHS = 50
    EXECUTION_PER_TRIAL = 2
    SEED = 13377331
    # ,'../data/FunLines/task-1/preproc/2_concat_train.bin'
    train_path, dev_path, test_path = [
        '../data/task-1/preproc/2_concat_train.bin'
    ], ['../data/task-1/preproc/2_concat_dev.bin'
        ], ['../data/task-1/preproc/2_concat_test.bin']
    if variant == 'HUMOR':
        params = json.load(open("./lib/models/tuning/model.json", 'r'))

        train_data = load_data(train_path)
        dev_data = load_data(dev_path)
        test_data = load_data(test_path)

        features = [
            PhoneticFeature, PositionFeature, DistanceFeature, SentLenFeature,
            NellKbFeature
        ]

        train_data.AddFeatures(features)
        dev_data.AddFeatures(features)
        test_data.AddFeatures(features)

        features, train_y = train_data.GetFeatureVectors(
        ), train_data.GetGrades()
        ins = {"FeatureInput": features[:, :4]}
        i = 4
        ins["EntityInput"] = features[:, i:]

        # text = np.load('../data/task-1/train_replaced.npy', allow_pickle=True) #
        text = train_data.GetReplaced()
        ins["ReplacedInput"] = text

        # text = np.load('../data/task-1/train_edit.npy', allow_pickle=True) #
        text = train_data.GetEdits()
        ins["ReplacementInput"] = text

        # Dev data
        dev_features, dev_y = dev_data.GetFeatureVectors(), dev_data.GetGrades(
        )
        devIns = {"FeatureInput": dev_features[:, :4]}
        i = 4
        devIns["EntityInput"] = dev_features[:, i:]

        # text = np.load('../data/task-1/dev_replaced.npy', allow_pickle=True) #
        text = dev_data.GetReplaced()
        devIns["ReplacedInput"] = text

        # text = np.load('../data/task-1/dev_edit.npy', allow_pickle=True) #
        text = dev_data.GetEdits()
        devIns["ReplacementInput"] = text

        # Test data
        test_features, test_y = test_data.GetFeatureVectors(
        ), test_data.GetGrades()
        testIns = {"FeatureInput": test_features[:, :4]}
        i = 4
        testIns["EntityInput"] = test_features[:, i:]

        # text = np.load('../data/task-1/test_replaced.npy', allow_pickle=True) #
        text = test_data.GetReplaced()
        testIns["ReplacedInput"] = text

        # text = np.load('../data/task-1/test_edit.npy', allow_pickle=True) #
        text = test_data.GetEdits()
        testIns["ReplacementInput"] = text

        early = tf.keras.callbacks.EarlyStopping(
            monitor='val_root_mean_squared_error',
            min_delta=0.0001,
            patience=5,
            mode='min',
            restore_best_weights=True)

        score = []
        for i in range(10):
            model = create_HUMOR2_model(4, 25, 128, params["hyperparameters"])
            model.fit(x=ins,
                      y=train_y,
                      validation_data=(devIns, dev_y),
                      batch_size=16,
                      epochs=40,
                      shuffle=True,
                      callbacks=[early])

            preds = model.predict(testIns)
            score.append(
                mean_squared_error(test_y, round_numbers(preds),
                                   squared=False))
            print(score[i])
            del model
            # tf.reset_default_graph()
            tf.keras.backend.clear_session()
            gc.collect()
        score = np.array(score)
        print(f'{variant}: Mean: {score.mean()}, STD: {score.std()}')

    elif variant == 'HUMOR2':
        params = json.load(open("./lib/models/tuning/model2.json", 'r'))

        train_data = load_data(train_path)
        dev_data = load_data(dev_path)
        test_data = load_data(test_path)

        features = [
            PhoneticFeature, PositionFeature, DistanceFeature, SentLenFeature,
            NellKbFeature, AlbertTokenizer
        ]

        train_data.AddFeatures(features)
        dev_data.AddFeatures(features)
        test_data.AddFeatures(features)

        features, train_y = train_data.GetFeatureVectors(
        ), train_data.GetGrades()
        ins = {"FeatureInput": features[:, :4]}
        i = 4
        ins["EntityInput"] = features[:, i:i + 25]
        i += 25
        ins["input_word_ids"] = features[:, i:i + 128]
        i += 128
        ins["segment_ids"] = features[:, i:i + 128]
        i += 128
        ins["input_mask"] = features[:, i:i + 128]

        text = train_data.GetReplaced()
        ins["ReplacedInput"] = text

        text = train_data.GetEdits()
        ins["ReplacementInput"] = text

        # Dev data
        dev_features, dev_y = dev_data.GetFeatureVectors(), dev_data.GetGrades(
        )
        devIns = {"FeatureInput": dev_features[:, :4]}
        i = 4
        devIns["EntityInput"] = dev_features[:, i:i + 25]
        i += 25
        devIns["input_word_ids"] = dev_features[:, i:i + 128]
        i += 128
        devIns["segment_ids"] = dev_features[:, i:i + 128]
        i += 128
        devIns["input_mask"] = dev_features[:, i:i + 128]

        text = dev_data.GetReplaced()
        devIns["ReplacedInput"] = text

        text = dev_data.GetEdits()
        devIns["ReplacementInput"] = text

        # Test data
        test_features, test_y = test_data.GetFeatureVectors(
        ), test_data.GetGrades()
        testIns = {"FeatureInput": test_features[:, :4]}
        i = 4
        testIns["EntityInput"] = test_features[:, i:i + 25]
        i += 25
        testIns["input_word_ids"] = test_features[:, i:i + 128]
        i += 128
        testIns["segment_ids"] = test_features[:, i:i + 128]
        i += 128
        testIns["input_mask"] = test_features[:, i:i + 128]

        text = test_data.GetReplaced()
        testIns["ReplacedInput"] = text

        text = test_data.GetEdits()
        testIns["ReplacementInput"] = text

        early = tf.keras.callbacks.EarlyStopping(
            monitor='val_root_mean_squared_error',
            min_delta=0.0001,
            patience=5,
            mode='min',
            restore_best_weights=True)

        score = []
        for i in range(10):
            model = create_HUMOR2_model(4, 25, 128, params["hyperparameters"])
            model.fit(x=ins,
                      y=train_y,
                      validation_data=(devIns, dev_y),
                      batch_size=16,
                      epochs=25,
                      shuffle=True,
                      callbacks=[early])

            preds = model.predict(testIns)
            score.append(mean_squared_error(test_y, preds, squared=False))
            del model
            # tf.reset_default_graph()
            tf.keras.backend.clear_session()
            gc.collect()
        score = np.array(score)
        print(f'{variant}: Mean: {score.mean()}, STD: {score.std()}')
    elif variant == 'TASK2INFER':
        model = './headline_regression/20200308-194029-BEST/weights/final.hdf5'
        infer = Task2Inference(model,
                               '../data/task-2/preproc/2_concat_test.bin')
        infer.predict('../data/task-2/predictions/task-2-output.csv')
    elif variant == 'TESTINFER':
        preds = 'task-1-output.context.csv'
        test = load_data(test_path)
        y = test.GetGrades()
        with open(preds, 'r') as f:
            i = 0
            pred_list = []
            for line in f:
                if i == 0:
                    i = 1
                else:
                    pred_list.append(float(line.strip().split(',')[1]))
        rmse = mean_squared_error(y, np.array(pred_list), squared=False)
        print(rmse)

    elif variant == 'NAM':
        model = create_NAM_model(1, 181544, 832)
        data_path = '../data/NELL/NELLRDF.xml'
        ent_vocab = '../data/NELL/NELLWordNetVocab.txt'
        rel_vocab = '../data/NELL/NELLRelVocab.txt'
        trainer = NAMTraining(model, data_path, ent_vocab, rel_vocab)
        trainer.train(30, 2048)
        trainer.test()
    elif variant == 'TUNING':
        model = HumorTuner(4, 20)
        tuner = Hyperband(model,
                          max_epochs=HYPERBAND_MAX_EPOCHS,
                          objective=kerastuner.Objective(
                              "val_root_mean_squared_error", direction="min"),
                          seed=SEED,
                          executions_per_trial=EXECUTION_PER_TRIAL,
                          hyperband_iterations=2,
                          directory=f'tuning_hyperband',
                          project_name='ContextHumor')

        tuner.search_space_summary()

        ## Loading the data
        train_data = load_data(train_path)
        dev_data = load_data(dev_path)
        features = [
            PhoneticFeature, PositionFeature, DistanceFeature, SentLenFeature,
            NellKbFeature
        ]
        train_data.AddFeatures(features)
        dev_data.AddFeatures(features)

        features, train_y = train_data.GetFeatureVectors(
        ), train_data.GetGrades()
        ins = {"FeatureInput": features[:, :4]}
        i = 4
        ins["EntityInput"] = features[:, i:i + 20]

        ins["ReplacedInput"] = train_data.GetReplaced()
        ins["ReplacementInput"] = train_data.GetEdits()

        # Dev data
        dev_features, dev_y = dev_data.GetFeatureVectors(), dev_data.GetGrades(
        )
        devIns = {"FeatureInput": dev_features[:, :4]}
        i = 4
        devIns["EntityInput"] = dev_features[:, i:i + 20]

        devIns["ReplacedInput"] = dev_data.GetReplaced()
        devIns["ReplacementInput"] = dev_data.GetEdits()

        early = tf.keras.callbacks.EarlyStopping(
            monitor='val_root_mean_squared_error',
            min_delta=0.0005,
            patience=2,
            mode='min',
            restore_best_weights=True)

        tuner.oracle.hyperband_iterations = 2

        tuner.search(ins,
                     train_y,
                     epochs=HYPERBAND_MAX_EPOCHS,
                     batch_size=64,
                     validation_data=(devIns, dev_y),
                     callbacks=[early])

        tuner.results_summary()
    elif variant == 'TUNINGSERVER':
        params = json.load(open("./lib/models/tuning/model.json", 'r'))
        model = HumorTunerServer(4, 20, 128, params["hyperparameters"])
        tuner = Hyperband(model,
                          max_epochs=HYPERBAND_MAX_EPOCHS,
                          objective=kerastuner.Objective(
                              "val_root_mean_squared_error", direction="min"),
                          seed=SEED,
                          executions_per_trial=EXECUTION_PER_TRIAL,
                          hyperband_iterations=1,
                          directory=f'tuning_hyperband',
                          project_name='ContextHumor')

        tuner.search_space_summary()

        ## Loading the data
        train_data = load_data(train_path)
        dev_data = load_data(dev_path)
        features = [
            PhoneticFeature, PositionFeature, DistanceFeature, SentLenFeature,
            NellKbFeature, AlbertTokenizer
        ]
        train_data.AddFeatures(features)
        dev_data.AddFeatures(features)

        features, train_y = train_data.GetFeatureVectors(
        ), train_data.GetGrades()
        ins = {"FeatureInput": features[:, :4]}
        i = 4
        ins["EntityInput"] = features[:, i:i + 20]
        i += 20
        ins["input_word_ids"] = features[:, i:i + 128]
        i += 128
        ins["segment_ids"] = features[:, i:i + 128]
        i += 128
        ins["input_mask"] = features[:, i:i + 128]

        text = train_data.GetReplaced()
        ins["ReplacedInput"] = text

        text = train_data.GetEdits()
        ins["ReplacementInput"] = text

        # Dev data
        dev_features, dev_y = dev_data.GetFeatureVectors(), dev_data.GetGrades(
        )
        devIns = {"FeatureInput": dev_features[:, :4]}
        i = 4
        devIns["EntityInput"] = dev_features[:, i:i + 20]
        i += 20
        devIns["input_word_ids"] = dev_features[:, i:i + 128]
        i += 128
        devIns["segment_ids"] = dev_features[:, i:i + 128]
        i += 128
        devIns["input_mask"] = dev_features[:, i:i + 128]

        text = dev_data.GetReplaced()
        devIns["ReplacedInput"] = text

        text = dev_data.GetEdits()
        devIns["ReplacementInput"] = text

        early = tf.keras.callbacks.EarlyStopping(
            monitor='val_root_mean_squared_error',
            min_delta=0.0005,
            patience=2,
            mode='min',
            restore_best_weights=True)

        tuner.search(ins,
                     train_y,
                     epochs=HYPERBAND_MAX_EPOCHS,
                     batch_size=64,
                     validation_data=(devIns, dev_y),
                     callbacks=[early])

        tuner.results_summary()
    elif variant == 'PLOT':
        axes = [
            "feature_units1", "feature_units2", "entity_units1",
            "entity_units2", "sentence_units1", "sentence_units2",
            "sentence_units3"
        ]
        models = json.load(open("./lib/models/tuning/result_summary.json",
                                'r'))
        params = defaultdict(list)

        for model in models["top_10"]:
            t_id = model["TrialID"]
            model_param = json.load(
                open(f"./tuning_hyperband/HumorHumor/trial_{t_id}/trial.json",
                     "r"))
            for a in axes:
                params[a].append(model_param["hyperparameters"]["values"][a])
            params["score"].append(model["Score"])

        fig = go.Figure(
            data=go.Parcoords(line_color='green',
                              dimensions=list([
                                  dict(range=[8, 128],
                                       label='Feature Layer 1',
                                       values=params[axes[0]]),
                                  dict(range=[8, 128],
                                       label='Feature Layer 2',
                                       values=params[axes[1]]),
                                  dict(range=[8, 128],
                                       label='Knowledge Layer 1',
                                       values=params[axes[2]]),
                                  dict(range=[8, 128],
                                       label='Knowledge Layer 2',
                                       values=params[axes[3]]),
                                  dict(range=[32, 512],
                                       label='Word Layer 2',
                                       values=params[axes[4]]),
                                  dict(range=[32, 512],
                                       label='Word Layer 1',
                                       values=params[axes[5]]),
                                  dict(range=[8, 128],
                                       label='Word Layer 2',
                                       values=params[axes[6]]),
                                  dict(range=[0, 1],
                                       label='Root Mean Square Error',
                                       values=params["score"]),
                              ])))

        fig.show()

    elif variant == 'MultiCNN':
        train_data = load_data(train_path)
        dev_data = load_data(dev_path)
        test_data = load_data(test_path)
        with codecs.open('../data/vocab/train_vocab.json',
                         encoding='utf-8') as fp:
            vocab_dict = json.load(fp)

        max_length = longest(train_data.GetTokenizedWEdit())
        ins = {
            "TextIn":
            convert_to_index(vocab_dict, train_data.GetTokenizedWEdit(),
                             max_length)
        }
        train_y = train_data.GetGrades()
        devIns = {
            "TextIn":
            convert_to_index(vocab_dict, dev_data.GetTokenizedWEdit(),
                             max_length)
        }
        dev_y = dev_data.GetGrades()
        testIns = {
            "TextIn":
            convert_to_index(vocab_dict, test_data.GetTokenizedWEdit(),
                             max_length)
        }
        test_y = test_data.GetGrades()
        early = tf.keras.callbacks.EarlyStopping(
            monitor='val_root_mean_squared_error',
            min_delta=0.0001,
            patience=5,
            mode='min',
            restore_best_weights=True)
        lr_schedule = create_learning_rate_scheduler(max_learn_rate=1e-1,
                                                     end_learn_rate=1e-6,
                                                     warmup_epoch_count=15,
                                                     total_epoch_count=40)
        score = []
        for i in range(10):
            model = create_MultiCNN_model()
            model.fit(x=ins,
                      y=train_y,
                      validation_data=(devIns, dev_y),
                      batch_size=16,
                      epochs=40,
                      shuffle=True,
                      callbacks=[early])

            preds = model.predict(testIns)
            score.append(mean_squared_error(test_y, preds, squared=False))
            del model
            # tf.reset_default_graph()
            tf.keras.backend.clear_session()
            gc.collect()
        score = np.array(score)
        print(f'{variant}: Mean: {score.mean()}, STD: {score.std()}')

        # preds = model.predict(devIns)
        # ids = dev_data.GetIDs()
        # out = np.stack((ids, preds.flatten()), axis=-1)
        # Save the predictions to file
        # np.savetxt(f'../plots/{variant}.csv', out, header='id,pred', fmt="%d,%1.8f")

        # print(f'Mean of preds: {preds.mean()}, STD of preds: {preds.std()}, Mean of true: {dev_y.mean()}, STD of true: {dev_y.std()}')
        # bins = np.linspace(0, 3, 50)
        # plt.hist(preds, bins=bins, alpha=0.5, label="preds")
        # plt.hist(dev_y, bins=bins, alpha=0.5, label="true")
        # plt.legend(loc='upper right')
        # plt.show()
        # del model

    elif variant == 'CNN':
        # model = create_CNN_model()
        train_data = load_data(train_path)
        dev_data = load_data(dev_path)
        test_data = load_data(test_path)
        with codecs.open('../data/vocab/train_vocab.json',
                         encoding='utf-8') as fp:
            vocab_dict = json.load(fp)

        max_length = longest(train_data.GetTokenizedWEdit())
        ins = {
            "TextIn":
            convert_to_index(vocab_dict, train_data.GetTokenizedWEdit(),
                             max_length)
        }
        train_y = train_data.GetGrades()
        devIns = {
            "TextIn":
            convert_to_index(vocab_dict, dev_data.GetTokenizedWEdit(),
                             max_length)
        }
        dev_y = dev_data.GetGrades()
        testIns = {
            "TextIn":
            convert_to_index(vocab_dict, test_data.GetTokenizedWEdit(),
                             max_length)
        }
        test_y = test_data.GetGrades()
        early = tf.keras.callbacks.EarlyStopping(
            monitor='val_root_mean_squared_error',
            min_delta=0.0001,
            patience=5,
            mode='min',
            restore_best_weights=True)
        score = []
        for i in range(10):
            model = create_CNN_model()
            model.fit(x=ins,
                      y=train_y,
                      validation_data=(devIns, dev_y),
                      batch_size=16,
                      epochs=40,
                      shuffle=True,
                      callbacks=[early])

            preds = model.predict(testIns)
            score.append(mean_squared_error(test_y, preds, squared=False))
            del model
            # tf.reset_default_graph()
            tf.keras.backend.clear_session()
            gc.collect()
        score = np.array(score)
        print(f'{variant}: Mean: {score.mean()}, STD: {score.std()}')

        # preds = model.predict(devIns)
        # ids = dev_data.GetIDs()
        # out = np.stack((ids, preds.flatten()), axis=-1)
        # Save the predictions to file
        # np.savetxt(f'../plots/{variant}.csv', out, header='id,pred', fmt="%d,%1.8f")

        # print(f'Mean of preds: {preds.mean()}, STD of preds: {preds.std()}, Mean of true: {dev_y.mean()}, STD of true: {dev_y.std()}')
        # bins = np.linspace(0, 3, 50)
        # plt.hist(preds, bins=bins, alpha=0.5, label="preds")
        # plt.hist(dev_y, bins=bins, alpha=0.5, label="true")
        # plt.legend(loc='upper right')
        # plt.show()
        # del model

    elif variant == 'KBLSTM':
        train_data = load_data(train_path)
        train_data.AddFeatures([NellKbFeature])
        dev_data = load_data(dev_path)
        dev_data.AddFeatures([NellKbFeature])
        test_data = load_data(test_path)
        test_data.AddFeatures([NellKbFeature])
        with codecs.open('../data/vocab/train_vocab.json',
                         encoding='utf-8') as fp:
            vocab_dict = json.load(fp)

        max_length = longest(train_data.GetTokenizedWEdit())
        train = convert_to_index(vocab_dict, train_data.GetTokenizedWEdit(),
                                 max_length)
        ins = {"TextIn": train, "EntityInput": train_data.GetFeatureVectors()}
        train_y = train_data.GetGrades()
        dev = convert_to_index(vocab_dict, dev_data.GetTokenizedWEdit(),
                               max_length)
        devIns = {"TextIn": dev, "EntityInput": dev_data.GetFeatureVectors()}
        dev_y = dev_data.GetGrades()
        test = convert_to_index(vocab_dict, test_data.GetTokenizedWEdit(),
                                max_length)
        testIns = {
            "TextIn": test,
            "EntityInput": test_data.GetFeatureVectors()
        }
        test_y = test_data.GetGrades()
        early = tf.keras.callbacks.EarlyStopping(
            monitor='val_root_mean_squared_error',
            min_delta=0.0001,
            patience=5,
            mode='min',
            restore_best_weights=True)
        score = []
        for i in range(10):
            model = create_KBLSTM_model()
            model.fit(x=ins,
                      y=train_y,
                      validation_data=(devIns, dev_y),
                      batch_size=16,
                      epochs=40,
                      shuffle=True,
                      callbacks=[early])

            preds = model.predict(testIns)
            score.append(mean_squared_error(test_y, preds, squared=False))
            del model
            # tf.reset_default_graph()
            tf.keras.backend.clear_session()
            gc.collect()
        score = np.array(score)
        print(f'{variant}: Mean: {score.mean()}, STD: {score.std()}')

        # preds = model.predict(devIns)
        # ids = dev_data.GetIDs()
        # out = np.stack((ids, preds.flatten()), axis=-1)
        # Save the predictions to file
        # np.savetxt(f'../plots/{variant}.csv', out, header='id,pred', fmt="%d,%1.8f")

        # print(f'Mean of preds: {preds.mean()}, STD of preds: {preds.std()}, Mean of true: {dev_y.mean()}, STD of true: {dev_y.std()}')
        # bins = np.linspace(0, 3, 50)
        # plt.hist(preds, bins=bins, alpha=0.5, label="preds")
        # plt.hist(dev_y, bins=bins, alpha=0.5, label="true")
        # plt.legend(loc='upper right')
        # plt.show()
        del model

    elif variant == 'NNLM':
        train_data = load_data(train_path)
        dev_data = load_data(dev_path)
        test_data = load_data(test_path)

        ins = {"sentence_in": train_data.GetEditSentences()}
        devIns = {"sentence_in": dev_data.GetEditSentences()}
        testIns = {"sentence_in": test_data.GetEditSentences()}
        train_y = train_data.GetGrades()
        dev_y = dev_data.GetGrades()
        test_y = test_data.GetGrades()
        early = tf.keras.callbacks.EarlyStopping(
            monitor='val_root_mean_squared_error',
            min_delta=0.0001,
            patience=5,
            mode='min',
            restore_best_weights=True)
        score = []
        for i in range(10):
            model = create_BERT_model()
            model.fit(x=ins,
                      y=train_y,
                      validation_data=(devIns, dev_y),
                      batch_size=16,
                      epochs=40,
                      shuffle=True,
                      callbacks=[early])

            preds = model.predict(testIns)
            score.append(mean_squared_error(test_y, preds, squared=False))
            del model
            # tf.reset_default_graph()
            tf.keras.backend.clear_session()
            gc.collect()
        score = np.array(score)
        print(f'{variant}: Mean: {score.mean()}, STD: {score.std()}')

        # preds = model.predict(devIns)
        # ids = dev_data.GetIDs()
        # out = np.stack((ids, preds.flatten()), axis=-1)
        # Save the predictions to file
        # np.savetxt(f'../plots/{variant}.csv', out, header='id,pred', fmt="%d,%1.8f")

        # print(f'Mean of preds: {preds.mean()}, STD of preds: {preds.std()}, Mean of true: {dev_y.mean()}, STD of true: {dev_y.std()}')
        # bins = np.linspace(0, 3, 50)
        # plt.hist(preds, bins=bins, alpha=0.5, label="preds")
        # plt.hist(dev_y, bins=bins, alpha=0.5, label="true")
        # plt.legend(loc='upper right')
        # plt.show()
        # del model

    elif variant == 'LINEAR':
        train = load_data(train_path)
        dev = load_data(dev_path)

        features = [
            PhoneticFeature, PositionFeature, DistanceFeature, SentLenFeature
        ]

        train.AddFeatures(features)
        dev.AddFeatures(features)

        X, y = train.GetFeatureVectors(), train.GetGrades()
        X_dev, dev_y = dev.GetFeatureVectors(), dev.GetGrades()

        reg = LinearRegression(n_jobs=-1).fit(X, y)

        preds = reg.predict(X_dev)
        rmse = mean_squared_error(test_y, preds, squared=False)
        # ids = dev.GetIDs()
        # out = np.stack((ids, preds.flatten()), axis=-1)
        # Save the predictions to file
        # np.savetxt(f'../plots/{variant}.csv', out, header='id,pred', fmt="%d,%1.8f")

        # print(f'Mean of preds: {preds.mean()}, STD of preds: {preds.std()}, Mean of true: {dev_y.mean()}, STD of true: {dev_y.std()}')
    elif variant == 'VOCAB':
        embed_path = '../data/embeddings/numpy/headline.npy'
        print("Loading embeddings and vocab...")
        model = Word2Vec.load('../data/embeddings/headlineEmbeds.bin')
        print("Loaded embeddings...")
        with codecs.open('../data/vocab/train_vocab.funlines.json',
                         encoding='utf-8') as fp:
            vocab_dict = json.load(fp)
        print("Loaded vocab...")

        embed_matrix = np.zeros((len(vocab_dict), 300))
        i = 0
        for k, v in vocab_dict.items():
            try:
                embed_matrix[v] = model.wv.get_vector(k)
            except KeyError:
                # print(f'{k} does not exist in FastText embeddings')
                i += 1
        print(len(vocab_dict), i)
        print("Created the embedding matrix...")
        np.save(embed_path, embed_matrix)
        print("Saved the new embeddings...")
    elif variant == 'WORD2VEC':
        print("Loading data...")
        headline_paths = [
            '../data/extra_data_sarcasm/Sarcasm_Headlines_Dataset_v2.json'
        ]
        headlines = []
        for headline_path in headline_paths:
            with open(headline_path, 'r') as fp:
                for line in fp:
                    d = json.loads(line)
                    headlines.append(text_to_word_sequence(d["headline"]))

        train_data = load_data(train_path)
        print("Train model...")
        print(len(headlines))
        headlines.extend(train_data.GetTokenizedWEdit())
        print(len(headlines))
        model = Word2Vec(headlines,
                         size=300,
                         window=14,
                         workers=4,
                         min_count=1)

        vocab = list(model.wv.vocab)
        print(len(vocab))

        print("Saving model...")
        model.save('../data/embeddings/headlineEmbeds.bin')
    elif variant == 'PCA':
        model = PCA(n_components=3)
        entities = np.load('../data/NELL/embeddings/entity.npy')
        labels = load_vocab('../data/NELL/NELLWordNetVocab_proc.txt')
        top_100 = {}
        with open('../data/NELL/top_100_nell.txt', 'r') as f:
            for line in f:
                label = line.strip()
                top_100[label] = entities[labels[label]]

        # print(entities[:4])
        # print(labels[:4])

        pca_ent = model.fit_transform(list(top_100.values()))

        # create_dendrogram(list(top_100.values()), list(top_100.keys()), 'ward')

        # print(pca_ent.shape)
        # print(pca_ent[:10])
        rand_color = RandomColor()
        fig = go.Figure(data=[
            go.Scatter3d(x=pca_ent[:, 0],
                         y=pca_ent[:, 1],
                         z=pca_ent[:, 2],
                         mode='markers',
                         text=list(top_100.keys()),
                         marker=dict(size=12,
                                     color=rand_color.generate(count=100),
                                     colorscale='Viridis',
                                     opacity=0.8))
        ])

        plotly.offline.plot(fig, filename="NELLPCA.html")
    elif variant == 'MEAN':
        files = [
            'CNN.csv', 'context.csv', 'KBLSTM.csv', 'LINEAR.csv',
            'MultiCNN.csv', 'NNLM.csv', 'simple.csv'
        ]
        for f in files:
            with open(f'../plots/{f}', 'r') as fp:
                i = 0
                vals = []
                for line in fp:
                    if i == 0:
                        i += 1
                        continue
                    vals.append(float(line.strip().split(',')[1]))
            vals = np.array(vals)
            mean, std = vals.mean(), vals.std()
            print(f'{f.split(".")[0]}: Mean: {mean}, STD: {std}')
    elif variant == 'COEF':
        train = load_data(train_path)

        features = [
            PhoneticFeature, PositionFeature, DistanceFeature, SentLenFeature
        ]

        train.AddFeatures(features)

        X, y = train.GetFeatureVectors(), train.GetGrades()
        y = np.reshape(y, (-1, 1))
        print(y.shape)
        z = np.concatenate((X, y), axis=-1).T

        coef = np.corrcoef(z).round(decimals=4)

        np.savetxt("coef.csv", coef, delimiter=',')
    elif variant == 'ALBERT':
        model = create_BERT_model()
        train = load_data(train_path)
        dev = load_data(dev_path)
        test = load_data(test_path)

        features = [AlbertTokenizer]

        train.AddFeatures(features)
        dev.AddFeatures(features)
        test.AddFeatures(features)

        features, indexes = dev.GetFeatureVectors(), dev.GetIndexes()

        ins = {}
        i = 0
        ins["input_word_ids"] = features[:, i:i + 128]
        i += 128
        ins["segment_ids"] = features[:, i:i + 128]
        i += 128
        ins["input_mask"] = features[:, i:i + 128]

        preds = model.predict(ins)
        words_train = []
        for i, pred in enumerate(preds):
            words_train.append(pred[indexes[i]])
        words_train = np.array(words_train)
        print(words_train.shape)

        np.save("./dev_edit.npy", words_train)
    elif variant == 'MEDIAN':
        train_data = load_data(train_path)
        test_data = load_data(test_path)

        train_y = train_data.GetGrades()
        test_y = test_data.GetGrades()

        pred = np.mean(train_y)
        print("Median", pred)

        pred_y = np.array([pred] * len(test_y))
        rmse = mean_squared_error(test_y, pred_y, squared=False)
        print("RMSE", rmse)
# Example #10 (score: 0)
def tune():
    """Run a Hyperband hyperparameter search over the tunable RoBERTa model.

    Loads the train/validation splits via ``RobertaData``, searches the space
    defined by ``get_tunable_roberta``, prints the five best hyperparameter
    sets, then builds the best model, saves a diagram of it to
    ``best_hp_tuned_model.png`` and prints its summary.

    Side effects: writes tuner logs under ``tuner_logs/feat_roberta`` and the
    model plot PNG to the working directory.
    """
    # NOTE(review): each get_data() presumably returns an (inputs, targets)
    # pair — confirm against RobertaData; train_dataset is indexed as [0]/[1]
    # below while val_dataset is passed whole as validation_data.
    train_dataset = RobertaData(Config.train_path, 'train').get_data()
    val_dataset = RobertaData(Config.validation_path, 'val').get_data()

    # Hyperband successively halves the candidate pool; seeded for
    # reproducibility, logs kept so the search can be resumed.
    tuner = Hyperband(get_tunable_roberta,
                      objective='val_loss',
                      max_epochs=10,
                      factor=3,
                      hyperband_iterations=3,
                      seed=Config.seed,
                      directory='tuner_logs',
                      project_name='feat_roberta')

    tuner.search_space_summary()

    # Shrink the LR on plateau before giving up on the trial entirely.
    callbacks = [
        tf.keras.callbacks.ReduceLROnPlateau(patience=2, verbose=1),
        tf.keras.callbacks.EarlyStopping(patience=3, verbose=1)
    ]
    tuner.search(train_dataset[0],
                 train_dataset[1],
                 epochs=10,
                 verbose=1,
                 callbacks=callbacks,
                 batch_size=32,
                 validation_data=val_dataset)

    tuner.results_summary()

    # Report the top-5 configurations, then rebuild and visualize the best.
    best_hps: List[HyperParameters] = tuner.get_best_hyperparameters(
        num_trials=5)
    for hp in best_hps:
        print(f'{hp.values}\n')

    model = tuner.hypermodel.build(best_hps[0])
    tf.keras.utils.plot_model(model,
                              to_file='best_hp_tuned_model.png',
                              show_shapes=True,
                              show_layer_names=True,
                              expand_nested=True)
    model.summary()