Пример #1
0
def main(args):
    """Tune and evaluate a text classifier with 10-fold cross-validation.

    For every partition ``../data/rnn_cv10/data/particion<k>`` (k = 1..10) the
    training texts are tokenized and padded, an embedding matrix is loaded for
    the fold's vocabulary, a hyperparameter search is run on ``train.tsv`` and
    the best model found is scored on the fold's ``dev.tsv``. A classification
    report and confusion matrix are printed per fold, followed by the averages
    over all folds.

    Args:
        args: Parsed command-line options. ``args.model`` selects the
            architecture (``'lstm'``, ``'bilstm'``, ``'cnn'`` or
            ``'bilstm_cnn'``); ``args.trials`` is the maximum number of tuner
            trials per fold.

    Raises:
        SystemExit: With status 1 if ``args.model`` is not supported.
    """
    print("Version", tf.__version__)
    print("Device", tf.test.gpu_device_name())
    print("GPUS", tf.config.list_physical_devices('GPU'))

    # Resolve the architecture once, before any data is read, so an
    # unsupported args.model fails immediately instead of after a whole fold
    # (data + embeddings) has already been loaded.
    if args.model == 'lstm':
        build_model = buildmodel.LSTMModel
    elif args.model == 'bilstm':
        build_model = buildmodel.BiLSTMModel
    elif args.model == 'cnn':
        build_model = buildmodel.CNNModel
    elif args.model == 'bilstm_cnn':
        build_model = buildmodel.BiLSTM_CNNModel
    else:
        print("Wrong model. Please choose another one.")
        # Explicit non-zero status; the bare `exit()` helper comes from the
        # `site` module and reports success to the calling shell.
        raise SystemExit(1)

    # Data and embedding locations
    path = '../data/rnn_cv10/data/'
    emb_path = '../embeddings/embeddings-l-model.vec'

    # Fold-independent settings
    FOLDS = 10
    max_words = 10000  # Top most frequent words kept by the tokenizer
    max_seq = 75  # Size to be padded to (should be greater than the max value=70)
    EMB_DIM = 300
    LIMIT = 100000

    # Per-fold results, aggregated after the loop
    acc_per_fold = []
    loss_per_fold = []
    f1_per_fold = []
    precision_per_fold = []
    recall_per_fold = []

    for partition in range(1, FOLDS + 1):
        partition_path = os.path.join(path, 'particion' + str(partition))
        training_set = pd.read_csv(os.path.join(partition_path, 'train.tsv'),
                                   sep='\t',
                                   header=None)
        test_set = pd.read_csv(os.path.join(partition_path, 'dev.tsv'),
                               sep='\t',
                               header=None)

        # The files have no header row: column 3 is used as the text and
        # column 1 as the label.
        x_train = training_set[3]
        y_train = training_set[1]
        x_test = test_set[3]
        y_test = test_set[1]

        # Fit the tokenizer on the training texts of this fold only
        tokenizer = preprocessing.text.Tokenizer(num_words=max_words)
        tokenizer.fit_on_texts(x_train)

        # Word -> index dictionary; +1 because index 0 is used for padding
        word_index = tokenizer.word_index
        vocab_size = len(word_index) + 1

        # Transform each text into a numerical sequence and pad to max_seq
        x_train = preprocessing.sequence.pad_sequences(
            tokenizer.texts_to_sequences(x_train), maxlen=max_seq)
        x_test = preprocessing.sequence.pad_sequences(
            tokenizer.texts_to_sequences(x_test), maxlen=max_seq)

        # The embedding matrix depends on this fold's word_index, so it has
        # to be rebuilt for every fold.
        embedding_matrix = loadembeddings.load_suc(emb_path, word_index,
                                                   EMB_DIM, LIMIT)

        # Fresh metric objects for each fold
        METRICS = [
            keras.metrics.BinaryAccuracy(name='accuracy'),
            keras.metrics.Precision(name='precision'),
            keras.metrics.Recall(name='recall'),
            keras.metrics.AUC(name='auc')
        ]

        # Create a model instance for the tuner
        model = build_model(vocab_size, max_seq, embedding_matrix, EMB_DIM,
                            METRICS)

        print('----------------------------------------------------')
        print(f'Training for fold {partition} ...')

        # NOTE(review): the objective is the *training* accuracy even though
        # search() holds out a validation split below; "val_accuracy" may be
        # what is intended -- confirm against MyTuner before changing.
        tuner = MyTuner(
            model,  # Model's function name
            objective=kerastuner.Objective(
                "accuracy", direction="max"),  # Objective metric
            max_trials=args.trials,  # Maximum number of trials
            executions_per_trial=1,  # Increase this to reduce results variance
            directory='../hp_trials/',  # Directory to store the models
            project_name=args.model,  # Project name
            overwrite=True)  # Overwrite the project

        # Class weighting is currently not applied in this search.

        # Early stopping callback for fitting
        callbacks = [
            EarlyStopping(monitor='val_loss',
                          mode='min',
                          verbose=0,
                          patience=20),
        ]

        print("Searching...")
        tuner.search(x_train,
                     y_train,
                     validation_split=0.20,
                     verbose=0,
                     callbacks=callbacks)

        # Keep the best model; results_summary() prints by itself, so it is
        # not wrapped in print() (that would emit a stray "None").
        best_model = tuner.get_best_models(num_models=1)[0]
        tuner.results_summary(num_trials=1)

        # CV statistics: evaluate() returns the loss first, then the metrics;
        # index 1 is presumably 'accuracy', the first entry of METRICS.
        scores = best_model.evaluate(x_test, y_test, verbose=0)
        acc_per_fold.append(scores[1] * 100)
        loss_per_fold.append(scores[0])

        # Round the predicted probabilities to hard 0/1 labels
        y_prob = best_model.predict(np.array(x_test),
                                    batch_size=128,
                                    verbose=0)
        y_pred = np.around(y_prob, decimals=0).astype(int)

        # Calculate precision, recall and f1
        precision_per_fold.append(
            precision_score(y_test, y_pred, average="macro"))
        recall_per_fold.append(recall_score(y_test, y_pred, average="macro"))
        f1_per_fold.append(f1_score(y_test, y_pred, average="macro"))

        print('\nCLASSIFICATION REPORT\n')
        print(classification_report(y_test, y_pred))

        print('\nCONFUSION MATRIX\n')
        print(confusion_matrix(y_test, y_pred))

    print("----------------------------------------------")
    print("Average scores for all folds:")
    print(f"> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})")
    print(f"> Loss: {np.mean(loss_per_fold)}")
    print(f"> Precision macro: {np.mean(precision_per_fold)}")
    print(f"> Recall macro: {np.mean(recall_per_fold)}")
    print(f"> F1 macro: {np.mean(f1_per_fold)}")
    print("----------------------------------------------")
Пример #2
0
def main(args):
    """Tune a classifier on the HatEval split and report test performance.

    Reads ``train.tsv`` / ``test.tsv`` from ``../data/HatEval/``, tokenizes
    and pads the texts, loads an embedding matrix for the training
    vocabulary, runs a class-weighted hyperparameter search and prints the
    classification report and confusion matrix of the best model on the test
    set, followed by a summary of the run parameters.

    Args:
        args: Parsed command-line options. ``args.model`` selects the
            architecture (``'lstm'``, ``'bilstm'`` or ``'cnn'``);
            ``args.trials`` is the maximum number of tuner trials.

    Raises:
        SystemExit: With status 1 if ``args.model`` is not supported.
    """
    print("Version", tf.__version__)
    print("Device", tf.test.gpu_device_name())
    print("GPUS", tf.config.list_physical_devices('GPU'))

    # Validate the requested architecture before doing any expensive I/O.
    if args.model == 'lstm':
        build_model = buildmodel.LSTMModel
    elif args.model == 'bilstm':
        build_model = buildmodel.BiLSTMModel
    elif args.model == 'cnn':
        build_model = buildmodel.CNNModel
    else:
        print("Wrong model. Please choose another one.")
        # Explicit non-zero status; the bare `exit()` helper comes from the
        # `site` module and reports success to the calling shell.
        raise SystemExit(1)

    # Data loading.
    # To run on HaterNet instead, read '../data/HaterNet/' with
    # 'train_prep_uncased.tsv' / 'test_prep_uncased.tsv' and use the `text`
    # and `label` columns.
    path = '../data/HatEval/'
    training_set = pd.read_csv(path + 'train.tsv', sep='\t')
    test_set = pd.read_csv(path + 'test.tsv', sep='\t')

    # Column 3 is used as the text and column 1 as the label
    x_train = training_set.iloc[:, 3]
    y_train = training_set.iloc[:, 1]
    x_test = test_set.iloc[:, 3]
    y_test = test_set.iloc[:, 1]

    # Tokenize
    max_words = 10000  # Top most frequent words
    max_seq = 75  # Size to be padded to (should be greater than the max value=70)

    # Fit a tokenizer limited to the max_words most common training words
    tokenizer = preprocessing.text.Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(x_train)

    # Word -> index dictionary; +1 because index 0 is used for padding
    word_index = tokenizer.word_index
    vocab_size = len(word_index) + 1

    # Transform each text into a numerical sequence and pad to max_seq
    x_train = preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(x_train), maxlen=max_seq)
    x_test = preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(x_test), maxlen=max_seq)

    # Load embeddings (separate name so the dataset path is not clobbered)
    emb_path = '../embeddings/embeddings-l-model.vec'
    EMB_DIM = 300
    LIMIT = 100000
    embedding_matrix = loadembeddings.load_suc(emb_path, word_index, EMB_DIM,
                                               LIMIT)

    # Metrics
    METRICS = [
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc')
    ]

    # Create a model instance for the tuner
    model = build_model(vocab_size, max_seq, embedding_matrix, EMB_DIM,
                        METRICS)

    # Create the tuner
    tuner = MyTuner(
        model,  # Model's function name
        objective=kerastuner.Objective("val_accuracy",
                                       direction="max"),  # Objective metric
        max_trials=args.trials,  # Maximum number of trials
        executions_per_trial=1,  # Increase this to reduce results variance
        directory='../hp_trials/',  # Directory to store the models
        project_name=args.model,  # Project name
        overwrite=True)  # Overwrite the project

    # Balance the classes. The arguments are passed by keyword because
    # current scikit-learn releases reject the positional form.
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced', classes=np.unique(y_train), y=y_train)

    # Keyed by position in the sorted unique labels, i.e. assumes the labels
    # are 0..n-1 -- TODO confirm for new datasets.
    class_weights = dict(enumerate(class_weights))

    # Early stopping callback for fitting. A loss improves when it goes
    # *down*, so the mode must be 'min' ('max' would treat a rising
    # val_loss as progress).
    # NOTE(review): patience (20) exceeds the epoch budget (15), so this
    # callback cannot trigger with the current settings.
    callbacks = [
        EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=20)
    ]

    epochs = 15
    print("Searching...")
    tuner.search(x_train,
                 y_train,
                 epochs=epochs,
                 validation_split=0.20,
                 verbose=0,
                 callbacks=callbacks,
                 class_weight=class_weights)

    # Keep the best model; results_summary() prints by itself, so it is not
    # wrapped in print() (that would emit a stray "None").
    best_model = tuner.get_best_models(num_models=1)[0]
    tuner.results_summary(num_trials=1)

    # Round the predicted probabilities to hard 0/1 labels
    y_prob = best_model.predict(np.array(x_test), batch_size=128, verbose=1)
    y_pred = np.around(y_prob, decimals=0).astype(int)

    print('\nCLASSIFICATION REPORT\n')
    print(classification_report(y_test, y_pred, digits=4))

    print('\nCONFUSION MATRIX\n')
    print(confusion_matrix(y_test, y_pred))

    print("\nParameters used:")
    print(args.model + " model")
    print(str(args.trials) + " trials")
    print(str(epochs) + " epochs")
    print("Weight balance")
    print("No Cross-validation")
    print("Metric: val_accuracy")
Пример #3
0
def main(args):
    """Tune a classifier on HaterNet, optionally with lexicon features.

    Reads the preprocessed HaterNet train/test TSV files, tokenizes and pads
    the texts, loads an embedding matrix for the training vocabulary and runs
    a Bayesian-optimization hyperparameter search through ``CVTuner``. When a
    supported lexicon is requested, lexicon features are computed from an
    ASCII-normalized copy of the texts and fed to the model as a second
    input. The classification report and confusion matrix of the best model
    on the test set are printed, followed by the run parameters.

    Args:
        args: Parsed command-line options. ``args.lexicon`` is ``'sel'``,
            ``'liwc'`` or ``'all'`` to enable lexicon features; any other
            value (including ``None``) disables them. ``args.model``
            (``'lstm'``, ``'bilstm'`` or ``'cnn'``) selects the architecture
            when no lexicon is used and names the tuner project.
            ``args.trials`` is the maximum number of tuner trials.

    Raises:
        SystemExit: With status 1 if no lexicon is used and ``args.model``
            is not supported.
    """
    print("Version", tf.__version__)
    print("Device", tf.test.gpu_device_name())
    print("GPUS", tf.config.list_physical_devices('GPU'))

    # Supported lexicon names -> feature extractor classes
    lexicon_classes = {
        'sel': loadfeatures.SEL,
        'liwc': loadfeatures.SpanishLIWC,
        'all': loadfeatures.All,
    }
    # Single source of truth for "are lexicon features in use?". Relying on
    # the truthiness of args.lexicon later on would crash for None (string
    # concatenation) and for unknown names (lex_train never assigned).
    use_lexicon = args.lexicon in lexicon_classes

    # Data loading
    path = '../data/HaterNet/'

    training_set = pd.read_csv(path + 'train_prep_uncased.tsv', sep='\t')
    test_set = pd.read_csv(path + 'test_prep_uncased.tsv', sep='\t')

    # Texts and labels for train and test
    x_train = training_set.text
    y_train = training_set.label
    x_test = test_set.text
    y_test = test_set.label

    # Lexicon loading
    if use_lexicon:
        # Normalize the texts for the lexicon matching: NFKD-decompose and
        # drop every non-ASCII code point (this strips accents).
        norm_train = training_set.text.str.normalize('NFKD').str.encode(
            'ascii', errors='ignore').str.decode('utf-8')
        norm_test = test_set.text.str.normalize('NFKD').str.encode(
            'ascii', errors='ignore').str.decode('utf-8')

        lexicon = lexicon_classes[args.lexicon](path='../lexicons/')
        lex_train = lexicon.process(dataset=norm_train)
        lex_test = lexicon.process(dataset=norm_test)
    else:
        print("No se utilizará lexicon.")

    # Tokenize
    max_words = 10000  # Top most frequent words
    max_seq = 75  # Size to be padded to (should be greater than the max value=70)

    # Fit a tokenizer limited to the max_words most common training words
    tokenizer = preprocessing.text.Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(x_train)

    # Word -> index dictionary; +1 because index 0 is used for padding
    word_index = tokenizer.word_index
    vocab_size = len(word_index) + 1

    # Transform each text into a numerical sequence and pad to max_seq
    x_train = preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(x_train), maxlen=max_seq)
    x_test = preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(x_test), maxlen=max_seq)

    # Load embeddings (separate name so the dataset path is not clobbered)
    emb_path = '../embeddings/embeddings-l-model.vec'
    EMB_DIM = 300
    LIMIT = 100000
    embedding_matrix = loadembeddings.load_suc(emb_path, word_index, EMB_DIM,
                                               LIMIT)

    # Metrics
    METRICS = [
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc')
    ]

    # Create a model instance for the tuner. With a lexicon the features
    # model is always used and args.model only names the project.
    if use_lexicon:
        model = buildmodel.LSTMFeaturesModel(vocab_size, max_seq,
                                             embedding_matrix, EMB_DIM,
                                             args.lexicon, METRICS)
    elif args.model == 'lstm':
        model = buildmodel.LSTMModel(vocab_size, max_seq, embedding_matrix,
                                     EMB_DIM, METRICS)
    elif args.model == 'bilstm':
        model = buildmodel.BiLSTMModel(vocab_size, max_seq, embedding_matrix,
                                       EMB_DIM, METRICS)
    elif args.model == 'cnn':
        model = buildmodel.CNNModel(vocab_size, max_seq, embedding_matrix,
                                    EMB_DIM, METRICS)
    else:
        print("Wrong model. Please, choose another one.")
        # Explicit non-zero status; the bare `exit()` helper comes from the
        # `site` module and reports success to the calling shell.
        raise SystemExit(1)

    # Project name: "<model>_<lexicon>" as before when a lexicon is used,
    # plain "<model>" otherwise (avoids concatenating None).
    if use_lexicon:
        project_name = args.model + "_" + args.lexicon
    else:
        project_name = args.model

    # Create the tuner
    tuner = CVTuner(
        hypermodel=model,  # Model's function name
        oracle=kt.oracles.BayesianOptimization(
            objective=kt.Objective("accuracy",
                                   direction="max"),  # Optimizing metric
            max_trials=args.trials  # Number of trials, default=10
        ),
        directory='../hp_trials/',  # Directory to store the models
        project_name=project_name,  # Project name
        overwrite=True)  # Overwrite the project

    # Class weighting is currently not applied in this search.

    # Early stopping callback for fitting
    callbacks = [EarlyStopping(monitor='loss', verbose=1, patience=5)]

    # The features model takes the padded sequences and the lexicon
    # features as two inputs; the plain models take the sequences only.
    if use_lexicon:
        train_inputs = [x_train, lex_train]
        test_inputs = [np.array(x_test), lex_test]
    else:
        train_inputs = x_train
        test_inputs = np.array(x_test)

    print("Searching...")
    tuner.search(x=train_inputs,
                 y=y_train,
                 verbose=0,
                 callbacks=callbacks,
                 epochs=10)

    # Keep the best model; results_summary() prints by itself, so it is not
    # wrapped in print() (that would emit a stray "None").
    best_model = tuner.get_best_models(num_models=1)[0]
    tuner.results_summary(num_trials=1)

    # Round the predicted probabilities to hard 0/1 labels
    y_prob = best_model.predict(test_inputs, batch_size=128, verbose=1)
    y_pred = np.around(y_prob, decimals=0).astype(int)

    print('\nCLASSIFICATION REPORT\n')
    print(classification_report(y_test, y_pred, digits=4))

    print('\nCONFUSION MATRIX\n')
    print(confusion_matrix(y_test, y_pred))

    print("\nParameters used:")
    print(args.model + " model")
    print(str(args.trials) + " trials")
    lexicon_label = args.lexicon if use_lexicon else "no"
    print(lexicon_label + " lexicon")