def main(args):
    """Tune and evaluate a text classifier with 10-fold cross-validation.

    For each of the 10 pre-split partitions under ``../data/rnn_cv10/data/``
    the function fits a tokenizer on the training split, loads the
    pre-trained embeddings, runs a hyperparameter search with ``MyTuner``
    and evaluates the best model on the partition's ``dev.tsv``. Per-fold
    metrics are printed, followed by the averages over all folds.

    Args:
        args: Parsed command-line namespace. Reads ``args.model`` (one of
            ``'lstm'``, ``'bilstm'``, ``'cnn'``, ``'bilstm_cnn'``) and
            ``args.trials`` (maximum number of tuner trials).

    Raises:
        SystemExit: If ``args.model`` is not one of the supported names.
    """
    import sys  # Local import: only needed for the early-exit path

    print("Version", tf.__version__)
    print("Device", tf.test.gpu_device_name())
    print("GPUS", tf.config.list_physical_devices('GPU'))

    # Validate the requested architecture before any data is loaded, so a
    # typo fails immediately instead of after reading the first partition.
    model_builders = {
        'lstm': buildmodel.LSTMModel,
        'bilstm': buildmodel.BiLSTMModel,
        'cnn': buildmodel.CNNModel,
        'bilstm_cnn': buildmodel.BiLSTM_CNNModel,
    }
    if args.model not in model_builders:
        print("Wrong model. Please choose another one.")
        sys.exit(1)
    build_model = model_builders[args.model]

    # Data location and cross-validation setup
    path = '../data/rnn_cv10/data/'
    FOLDS = 10

    # Tokenization settings
    max_words = 10000  # Keep only the most frequent words
    # Padded sequence length (original note: should exceed the longest
    # tweet, reported as 70 tokens)
    max_seq = 75

    # Pre-trained embeddings
    emb_path = '../embeddings/embeddings-l-model.vec'
    EMB_DIM = 300
    LIMIT = 100000

    # Per-fold statistics
    acc_per_fold = []
    loss_per_fold = []
    f1_per_fold = []
    precision_per_fold = []
    recall_per_fold = []

    for partition in range(1, FOLDS + 1):
        partition_path = os.path.join(path, 'particion' + str(partition))
        training_set = pd.read_csv(os.path.join(partition_path, 'train.tsv'),
                                   sep='\t',
                                   header=None)
        test_set = pd.read_csv(os.path.join(partition_path, 'dev.tsv'),
                               sep='\t',
                               header=None)

        # Column 3 is used as the input text and column 1 as the label
        x_train = training_set[3]
        y_train = training_set[1]
        x_test = test_set[3]
        y_test = test_set[1]

        # Fit the tokenizer on the training split only
        tokenizer = preprocessing.text.Tokenizer(num_words=max_words)
        tokenizer.fit_on_texts(x_train)
        word_index = tokenizer.word_index
        vocab_size = len(word_index) + 1

        # Transform each tweet into a numerical sequence padded to max_seq
        x_train = preprocessing.sequence.pad_sequences(
            tokenizer.texts_to_sequences(x_train), maxlen=max_seq)
        x_test = preprocessing.sequence.pad_sequences(
            tokenizer.texts_to_sequences(x_test), maxlen=max_seq)

        # The embedding matrix depends on this fold's vocabulary
        embedding_matrix = loadembeddings.load_suc(emb_path, word_index,
                                                   EMB_DIM, LIMIT)

        # Fresh metric objects for every fold
        METRICS = [
            keras.metrics.BinaryAccuracy(name='accuracy'),
            keras.metrics.Precision(name='precision'),
            keras.metrics.Recall(name='recall'),
            keras.metrics.AUC(name='auc')
        ]

        # Create a model instance for the tuner
        model = build_model(vocab_size, max_seq, embedding_matrix, EMB_DIM,
                            METRICS)

        print('----------------------------------------------------')
        print(f'Training for fold {partition} ...')

        # NOTE(review): the objective is the *training* accuracy although a
        # validation split is passed to search(); confirm whether
        # "val_accuracy" was intended.
        tuner = MyTuner(
            model,  # Hypermodel
            objective=kerastuner.Objective("accuracy", direction="max"),
            max_trials=args.trials,  # Maximum number of trials
            executions_per_trial=1,  # Increase to reduce results variance
            directory='../hp_trials/',  # Directory to store the models
            project_name=args.model,  # Project name
            overwrite=True)  # Overwrite the project

        # NOTE: class weighting is not applied in this script.

        # Early stopping callback for fitting
        callbacks = [
            EarlyStopping(monitor='val_loss',
                          mode='min',
                          verbose=0,
                          patience=20),
        ]

        print("Searching...")
        tuner.search(x_train,
                     y_train,
                     validation_split=0.20,
                     verbose=0,
                     callbacks=callbacks)

        # Retrieve the best model; results_summary() prints its own output,
        # so it must not be wrapped in print() (that would emit "None").
        best_model = tuner.get_best_models(num_models=1)[0]
        tuner.results_summary(num_trials=1)

        # CV statistics: scores[0] is the loss; scores[1] is taken as the
        # accuracy (presumably the first entry of METRICS — depends on how
        # buildmodel compiles the model).
        scores = best_model.evaluate(x_test, y_test, verbose=0)
        acc_per_fold.append(scores[1] * 100)
        loss_per_fold.append(scores[0])

        # Turn predicted probabilities into hard 0/1 labels
        y_prob = best_model.predict(np.array(x_test),
                                    batch_size=128,
                                    verbose=0)
        y_pred = np.around(y_prob, decimals=0).astype(int)

        # Macro-averaged precision, recall and F1 for this fold
        precision_per_fold.append(
            precision_score(y_test, y_pred, average="macro"))
        recall_per_fold.append(recall_score(y_test, y_pred, average="macro"))
        f1_per_fold.append(f1_score(y_test, y_pred, average="macro"))

        print('\nCLASSIFICATION REPORT\n')
        print(classification_report(y_test, y_pred))

        print('\nCONFUSION MATRIX\n')
        print(confusion_matrix(y_test, y_pred))

    print("----------------------------------------------")
    print("Average scores for all folds:")
    print(f"> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})")
    print(f"> Loss: {np.mean(loss_per_fold)}")
    print(f"> Precision macro: {np.mean(precision_per_fold)}")
    print(f"> Recall macro: {np.mean(recall_per_fold)}")
    print(f"> F1 macro: {np.mean(f1_per_fold)}")
    print("----------------------------------------------")
def main(args):
    """Tune a text classifier on HatEval and report test-set statistics.

    Loads ``train.tsv`` / ``test.tsv`` from ``../data/HatEval/``, tokenizes
    and pads the texts, loads the pre-trained embeddings, runs a
    hyperparameter search with ``MyTuner`` using balanced class weights and
    prints the classification report and confusion matrix of the best model
    on the test set.

    Args:
        args: Parsed command-line namespace. Reads ``args.model`` (one of
            ``'lstm'``, ``'bilstm'``, ``'cnn'``) and ``args.trials``
            (maximum number of tuner trials).

    Raises:
        SystemExit: If ``args.model`` is not one of the supported names.
    """
    import sys  # Local import: only needed for the early-exit path

    print("Version", tf.__version__)
    print("Device", tf.test.gpu_device_name())
    print("GPUS", tf.config.list_physical_devices('GPU'))

    # Validate the requested architecture before any data is loaded
    model_builders = {
        'lstm': buildmodel.LSTMModel,
        'bilstm': buildmodel.BiLSTMModel,
        'cnn': buildmodel.CNNModel,
    }
    if args.model not in model_builders:
        print("Wrong model. Please choose another one.")
        sys.exit(1)

    # Data loading.
    # To run on HaterNet instead, read '../data/HaterNet/' +
    # 'train_prep_uncased.tsv' / 'test_prep_uncased.tsv' and use the
    # `text` / `label` columns.
    data_path = '../data/HatEval/'
    training_set = pd.read_csv(data_path + 'train.tsv', sep='\t')
    test_set = pd.read_csv(data_path + 'test.tsv', sep='\t')

    # Column 3 is used as the input text and column 1 as the label
    x_train = training_set.iloc[:, 3]
    y_train = training_set.iloc[:, 1]
    x_test = test_set.iloc[:, 3]
    y_test = test_set.iloc[:, 1]

    # Tokenization settings
    max_words = 10000  # Keep only the most frequent words
    # Padded sequence length (original note: should exceed the longest
    # tweet, reported as 70 tokens)
    max_seq = 75

    # Fit the tokenizer on the training set only
    tokenizer = preprocessing.text.Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(x_train)
    word_index = tokenizer.word_index
    vocab_size = len(word_index) + 1

    # Transform each tweet into a numerical sequence padded to max_seq
    x_train = preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(x_train), maxlen=max_seq)
    x_test = preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(x_test), maxlen=max_seq)

    # Load embeddings
    emb_path = '../embeddings/embeddings-l-model.vec'
    EMB_DIM = 300
    LIMIT = 100000
    embedding_matrix = loadembeddings.load_suc(emb_path, word_index, EMB_DIM,
                                               LIMIT)

    # Metrics
    METRICS = [
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc')
    ]

    # Create a model instance for the tuner
    model = model_builders[args.model](vocab_size, max_seq, embedding_matrix,
                                       EMB_DIM, METRICS)

    # Create the tuner
    tuner = MyTuner(
        model,  # Hypermodel
        objective=kerastuner.Objective("val_accuracy",
                                       direction="max"),  # Objective metric
        max_trials=args.trials,  # Maximum number of trials
        executions_per_trial=1,  # Increase to reduce results variance
        directory='../hp_trials/',  # Directory to store the models
        project_name=args.model,  # Project name
        overwrite=True)  # Overwrite the project

    # Balanced class weights. Keyword arguments are required by current
    # scikit-learn releases (`classes` and `y` are keyword-only) and are
    # also accepted by older ones.
    classes = np.unique(y_train)
    weights = class_weight.compute_class_weight(class_weight='balanced',
                                                classes=classes,
                                                y=y_train)
    # NOTE(review): enumerate() assumes the labels are exactly 0..n-1 —
    # confirm against the dataset.
    class_weights = dict(enumerate(weights))

    # Early stopping callback for fitting. A loss must be minimised, so the
    # mode is 'min' ('max' would treat a rising loss as an improvement).
    # NOTE(review): patience (20) exceeds the number of epochs (15), so this
    # callback cannot trigger with the current settings.
    callbacks = [
        EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=20)
    ]

    epochs = 15
    print("Searching...")
    tuner.search(x_train,
                 y_train,
                 epochs=epochs,
                 validation_split=0.20,
                 verbose=0,
                 callbacks=callbacks,
                 class_weight=class_weights)

    # Retrieve the best model; results_summary() prints its own output, so
    # it must not be wrapped in print() (that would emit "None").
    best_model = tuner.get_best_models(num_models=1)[0]
    tuner.results_summary(num_trials=1)

    # Turn predicted probabilities into hard 0/1 labels
    y_prob = best_model.predict(np.array(x_test), batch_size=128, verbose=1)
    y_pred = np.around(y_prob, decimals=0).astype(int)

    print('\nCLASSIFICATION REPORT\n')
    print(classification_report(y_test, y_pred, digits=4))

    print('\nCONFUSION MATRIX\n')
    print(confusion_matrix(y_test, y_pred))

    print("\nParameters used:")
    print(args.model + " model")
    print(str(args.trials) + " trials")
    print(str(epochs) + " epochs")
    print("Weight balance")
    print("No Cross-validation")
    print("Metric: val_accuracy")
def main(args):
    """Tune a (optionally lexicon-augmented) classifier on HaterNet.

    Loads the HaterNet train/test TSV files, optionally extracts lexicon
    features from accent-stripped text, tokenizes and pads the tweets, loads
    the pre-trained embeddings, runs a Bayesian hyperparameter search with
    ``CVTuner`` and prints the classification report and confusion matrix of
    the best model on the test set.

    Args:
        args: Parsed command-line namespace. Reads ``args.lexicon``
            (``'sel'``, ``'liwc'`` or ``'all'``; any other value, including
            ``None``, disables lexicon features), ``args.model`` (one of
            ``'lstm'``, ``'bilstm'``, ``'cnn'``; only validated when no
            lexicon is used) and ``args.trials`` (maximum tuner trials).

    Raises:
        SystemExit: If no lexicon is used and ``args.model`` is not one of
            the supported names.
    """
    import sys  # Local import: only needed for the early-exit path

    print("Version", tf.__version__)
    print("Device", tf.test.gpu_device_name())
    print("GPUS", tf.config.list_physical_devices('GPU'))

    lexicon_loaders = {
        'sel': loadfeatures.SEL,
        'liwc': loadfeatures.SpanishLIWC,
        'all': loadfeatures.All,
    }
    model_builders = {
        'lstm': buildmodel.LSTMModel,
        'bilstm': buildmodel.BiLSTMModel,
        'cnn': buildmodel.CNNModel,
    }

    # Lexicon features are used only for a recognised lexicon name. Testing
    # the truthiness of args.lexicon instead would send unknown names down
    # the lexicon path with no features computed.
    use_lexicon = args.lexicon in lexicon_loaders
    # Printable label that is safe even when args.lexicon is None
    lexicon_label = str(args.lexicon) if args.lexicon else "none"

    # Without a lexicon the architecture comes from args.model; validate it
    # before any data is loaded.
    if not use_lexicon and args.model not in model_builders:
        print("Wrong model. Please, choose another one.")
        sys.exit(1)

    # Data loading
    data_path = '../data/HaterNet/'
    training_set = pd.read_csv(data_path + 'train_prep_uncased.tsv', sep='\t')
    test_set = pd.read_csv(data_path + 'test_prep_uncased.tsv', sep='\t')

    # Raw texts and labels (accents are kept for the neural input)
    x_train = training_set.text
    y_train = training_set.label
    x_test = test_set.text
    y_test = test_set.label

    # Lexicon loading
    if use_lexicon:
        # Strip accents (NFKD decomposition, then drop non-ASCII bytes) so
        # the texts can be matched against the lexicon entries.
        norm_train = training_set.text.str.normalize('NFKD').str.encode(
            'ascii', errors='ignore').str.decode('utf-8')
        norm_test = test_set.text.str.normalize('NFKD').str.encode(
            'ascii', errors='ignore').str.decode('utf-8')

        lexicon = lexicon_loaders[args.lexicon](path='../lexicons/')
        lex_train = lexicon.process(dataset=norm_train)
        lex_test = lexicon.process(dataset=norm_test)
    else:
        print("No se utilizará lexicon.")

    # Tokenization settings
    max_words = 10000  # Keep only the most frequent words
    # Padded sequence length (original note: should exceed the longest
    # tweet, reported as 70 tokens)
    max_seq = 75

    # Fit the tokenizer on the training set only
    tokenizer = preprocessing.text.Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(x_train)
    word_index = tokenizer.word_index
    vocab_size = len(word_index) + 1

    # Transform each tweet into a numerical sequence padded to max_seq
    x_train = preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(x_train), maxlen=max_seq)
    x_test = preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(x_test), maxlen=max_seq)

    # Load embeddings
    emb_path = '../embeddings/embeddings-l-model.vec'
    EMB_DIM = 300
    LIMIT = 100000
    embedding_matrix = loadembeddings.load_suc(emb_path, word_index, EMB_DIM,
                                               LIMIT)

    # Metrics
    METRICS = [
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc')
    ]

    # Create a model instance for the tuner
    if use_lexicon:
        model = buildmodel.LSTMFeaturesModel(vocab_size, max_seq,
                                             embedding_matrix, EMB_DIM,
                                             args.lexicon, METRICS)
    else:
        model = model_builders[args.model](vocab_size, max_seq,
                                           embedding_matrix, EMB_DIM, METRICS)

    # Create the tuner
    tuner = CVTuner(
        hypermodel=model,
        oracle=kt.oracles.BayesianOptimization(
            objective=kt.Objective("accuracy",
                                   direction="max"),  # Optimizing metric
            max_trials=args.trials  # Number of trials
        ),
        directory='../hp_trials/',  # Directory to store the models
        project_name=args.model + "_" + lexicon_label,  # Project name
        overwrite=True)  # Overwrite the project

    # NOTE: class weighting is not applied in this script.

    # Early stopping callback for fitting
    callbacks = [EarlyStopping(monitor='loss', verbose=1, patience=5)]

    # With a lexicon the model receives two inputs: the padded token
    # sequences and the lexicon features.
    if use_lexicon:
        train_inputs = [x_train, lex_train]
        test_inputs = [np.array(x_test), lex_test]
    else:
        train_inputs = x_train
        test_inputs = np.array(x_test)

    print("Searching...")
    tuner.search(x=train_inputs,
                 y=y_train,
                 verbose=0,
                 callbacks=callbacks,
                 epochs=10)

    # Retrieve the best model; results_summary() prints its own output, so
    # it must not be wrapped in print() (that would emit "None").
    best_model = tuner.get_best_models(num_models=1)[0]
    tuner.results_summary(num_trials=1)

    # Turn predicted probabilities into hard 0/1 labels
    y_prob = best_model.predict(test_inputs, batch_size=128, verbose=1)
    y_pred = np.around(y_prob, decimals=0).astype(int)

    print('\nCLASSIFICATION REPORT\n')
    print(classification_report(y_test, y_pred, digits=4))

    print('\nCONFUSION MATRIX\n')
    print(confusion_matrix(y_test, y_pred))

    print("\nParameters used:")
    print(args.model + " model")
    print(str(args.trials) + " trials")
    print(lexicon_label + " lexicon")