def create_base_params(network_type, dataset_helper: Dataset_Helper, results_saver):
    if network_type == 'embedding':
        batch_size = hp.choice('batch_size', [64, 128])
        num_of_layers = hp.choice('num_of_layers', [1, 2])
        num_of_neurons = hp.choice('num_of_neurons', [32, 64, 128])
    else:
        batch_size = hp.choice('batch_size', [64, 128, 256])
        num_of_layers = hp.choice('num_of_layers', [1, 2, 3, 4])
        num_of_neurons = hp.choice('num_of_neurons', [32, 64, 128, 256])
    space = {
        'dataset_num': dataset_helper.dataset_position,
        'network_type': network_type,
        'topic_nums': dataset_helper.get_num_of_topics(),
        'num_of_words': hp.choice('num_of_words', [10000]),
        # 'preprocess': False,
        'max_len': hp.choice('max_len', [100, 200, 300]),
        'num_of_layers': num_of_layers,
        'num_of_neurons': num_of_neurons,
        'activation_function': hp.choice('activation_function', ['relu', 'tanh']),
        'dropouts': hp.randint('dropouts', 3),
        'dropout_values': hp.uniform('dropout_values', 0.01, 0.2),
        'epochs': 20,  # hp.randint('epochs', 20),
        'batch_size': batch_size,
        'learning_rate': hp.choice('learning_rate', [0.001, 0.01, 0.0005]),
        'optimizer': hp.choice('optimizer', ['adam', 'rmsprop']),
        'results_saver': results_saver
    }
    return space
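# Hedged usage sketch (not part of the original module): drawing a few random
# configurations from the space returned by create_base_params is a cheap sanity check of
# the ranges before spending a full fmin run on them. Only the hyperopt `sample` import is
# new here; passing results_saver=None is an assumption for illustration only.
from hyperopt.pyll.stochastic import sample

if __name__ == '__main__':
    helper = Dataset_Helper(False)
    helper.next_dataset()
    space = create_base_params('lstm', helper, results_saver=None)
    for _ in range(3):
        print(sample(space))  # each draw is one concrete hyperparameter assignment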
def __init__(self, filename, batch_size, num_of_texts, num_of_words,
             tokenizer: Tokenizer, delimeter, dataset_helper: Dataset_Helper,
             max_len=None, start_point=0, preprocess=False, preload_dataset=True,
             is_predicting=False, tokenizer_mode='binary'):
    self.filename = filename
    self.batch_size = batch_size
    self.num_of_texts = num_of_texts
    self.tokenizer = tokenizer
    self.delimeter = delimeter
    self.num_of_words = num_of_words
    self.num_of_classes = dataset_helper.get_num_of_topics()
    self.start_point = start_point
    self.max_len = max_len
    self.preprocess = preprocess
    self.preload_dataset = preload_dataset
    self.is_predicting = is_predicting
    self.dataset_helper = dataset_helper
    self.tokenizer_mode = tokenizer_mode
    self.labels = []
    self.tmp_articles = None
    self.articles = []
    if preload_dataset:
        self.load_dataset()
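# Hedged usage sketch: constructing a generator through the __init__ above. The owning
# class is assumed to be TrainingTextGenerator (imported in the scripts further down);
# the ';' delimiter is a placeholder, and the batch size / word count simply mirror values
# used elsewhere in this project. The aliaser module is assumed to re-export Tokenizer,
# as it does in the sibling scripts.
from dataset_loader.dataset_helper import Dataset_Helper
from text_generators.training_text_generator import TrainingTextGenerator
from neural_networks.aliaser import *

helper = Dataset_Helper(preprocess=False)
helper.next_dataset()
tok = Tokenizer(num_words=10000)
tok.fit_on_texts(helper.text_generator())
train_gen = TrainingTextGenerator(
    filename=helper.get_train_file_path(),          # training file of the current dataset
    batch_size=128,
    num_of_texts=helper.get_num_of_train_texts(),
    num_of_words=10000,
    tokenizer=tok,
    delimeter=';',                                  # placeholder delimiter
    dataset_helper=helper,
    tokenizer_mode='binary')
# train_gen would then typically be handed to model.fit in the training scripts below.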
def optimize_model(args):
    print(args)
    # Build the dataset and tokenizer for the dataset index chosen by hyperopt.
    datasets_helper = Dataset_Helper(False)
    datasets_helper.set_wanted_datasets([args['dataset_num']])
    datasets_helper.next_dataset()
    tokenizer = Tokenizer(num_words=args['num_of_words'])
    generator = datasets_helper.text_generator()
    tokenizer.fit_on_texts(generator)
    # Assemble and train the requested network with the sampled hyperparameters.
    optimizer = create_optimizer(args['optimizer'], args['learning_rate'])
    model = resolve_network_type(args['network_type'])
    model.set_params(args)
    model.optimizer = optimizer
    if args['network_type'] == 'embedding':
        model.tokenizer = tokenizer
    model.compile_model()
    model.fit(datasets_helper=datasets_helper, tokenizer=tokenizer, validation_count=500)
    results = model.evaluate(datasets_helper=datasets_helper, tokenizer=tokenizer)
    print(results)
    args['results_saver'].write_any(
        'logs', [get_important_params_from_args(results[1], args)], 'a')
    # Free the model and session state between trials.
    del model
    del tokenizer
    del generator
    del datasets_helper
    tf.keras.backend.clear_session()
    # hyperopt minimizes the objective, so return the negated best metric.
    return -np.amax(results[1])
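# Hedged sketch (not in the original module): fmin reports hp.choice parameters as indices,
# so hyperopt's space_eval is the standard way to map the result back to concrete values.
# The dataset index, the 'dense' network type, the tiny eval budget, and the log description
# string are illustrative assumptions.
from hyperopt import fmin, tpe, space_eval, Trials

helper = Dataset_Helper(False)
helper.set_wanted_datasets([0])
helper.next_dataset()
saver = LogWriter(log_file_desc="hyperopt-decode-demo")  # hypothetical description
space = create_base_params('dense', helper, saver)
trials = Trials()
best = fmin(optimize_model, space=space, algo=tpe.suggest, max_evals=2, trials=trials)
print(best)                     # index-encoded choices as returned by fmin
print(space_eval(space, best))  # the same choices decoded to concrete values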
from training_text_generator_RNN import Training_Text_Generator_RNN
from dataset_loader.dataset_helper import Dataset_Helper
from results_saver import LogWriter
import os
import sys
from neural_networks.aliaser import *
import tkinter as tk
from tkinter import simpledialog

file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)
root = tk.Tk()
root.withdraw()
preprocess = True
datasets_helper = Dataset_Helper(preprocess)
results_saver = LogWriter(log_file_desc=simpledialog.askstring(
    title="Test Name", prompt="Insert test name:", initialvalue='CONV_GRU_'))
results = []
num_of_words = 10000

while datasets_helper.next_dataset():
    results_saver.add_log("Starting testing dataset {}".format(
        datasets_helper.get_dataset_name()))
    validation_count = datasets_helper.get_num_of_train_texts() // 10
    tokenizer = Tokenizer(num_words=num_of_words,
                          filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                          lower=False,
                          split=' ')
    generator = datasets_helper.text_generator()
    results_saver.add_log("Starting preprocessing and tokenization.")
for param in params:
    seed(42)
    tf.random.set_seed(42)
    test_name = param[0]
    i += 1
    # config = tf.compat.v1.ConfigProto(device_count={'GPU': 1, 'CPU': 4})
    # sess = tf.compat.v1.Session(config=config)
    # tf.keras.backend.set_session(sess)
    # results_saver = LogWriter(log_file_desc="Autoencoder")
    results = []
    # mycolors = np.array([color for name, color in mcolors.XKCD_COLORS.items()])
    from sys import getsizeof
    num_of_words = 10000
    dataset_helper = Dataset_Helper(True)
    dataset_helper.set_wanted_datasets([param[1]])
    dataset_helper.next_dataset()
    num_of_topics = dataset_helper.get_num_of_topics()
    documents = dataset_helper.get_texts_as_list()
    labels = dataset_helper.get_labels(dataset_helper.get_train_file_path())
    tokenizer = Tokenizer(num_words=num_of_words)
    tokenizer.fit_on_texts(documents)
    # items = tokenizer.word_index
    reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))
    matrix = tokenizer.texts_to_matrix(documents, mode='binary')
    print(getsizeof(documents))
    print(getsizeof(tokenizer))
    print(getsizeof(matrix))
    # mydict = corpora.Dictionary([line.split() for line in documents], prune_at=num_of_words)
    # corpus = [mydict.doc2bow(line.split()) for line in documents]
import matplotlib.pyplot as plt
from training_text_generator_RNN import Training_Text_Generator_RNN
from dataset_loader.dataset_helper import Dataset_Helper
from results_saver import LogWriter
import os
import sys
from neural_networks.aliaser import *

file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)
"""config = tf.ConfigProto(device_count={'GPU': 1, 'CPU': 4})
sess = tf.Session(config=config)
keras.backend.set_session(sess)"""
datasets_helper = Dataset_Helper(preprocess=True)
datasets_helper.set_wanted_datasets([0])
results_saver = LogWriter(log_file_desc="Bidirectional-no-relu")
results = []
num_of_words = 15000

while datasets_helper.next_dataset():
    results_saver.add_log("Starting testing dataset {}".format(
        datasets_helper.get_dataset_name()))
    validation_count = 200  # datasets_helper.get_num_of_train_texts() // 10
    tokenizer = Tokenizer(num_words=num_of_words)
    # tokenizer = Tokenizer(num_words=num_of_words,
    #                       filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
    #                       lower=False, split=' ')
    generator = datasets_helper.text_generator()
    results_saver.add_log("Starting preprocessing and tokenization.")
    tokenizer.fit_on_texts(generator)
    results_saver.add_log("Done. Building model now.")
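    # Hedged sketch only -- the project's real model definition lives elsewhere. This shows
    # the kind of bidirectional recurrent classifier that the "Bidirectional-no-relu"
    # description suggests; the embedding size, GRU width, padded sequence length, and the
    # one-hot label assumption behind categorical_crossentropy are all illustrative.
    from tensorflow.keras import layers, models

    max_len = 200  # assumed padded sequence length
    model = models.Sequential([
        layers.Embedding(num_of_words, 128, input_length=max_len),
        layers.Bidirectional(layers.GRU(64, recurrent_dropout=0.1)),
        layers.Dense(datasets_helper.get_num_of_topics(), activation='softmax'),
    ])
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.summary()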
import matplotlib.pyplot as plt
from text_generators.training_text_generator_RNN import TrainingTextGeneratorRNN
from dataset_loader.dataset_helper import Dataset_Helper
from results_saver import LogWriter
from neural_networks.aliaser import *  # provides Tokenizer, as in the sibling scripts
import os
import sys
import tkinter as tk
from tkinter import simpledialog

file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)
root = tk.Tk()
root.withdraw()
preprocess = True
datasets_helper = Dataset_Helper(preprocess)
results_saver = LogWriter(log_file_desc=simpledialog.askstring(
    title="Test Name", prompt="Insert test name:", initialvalue='GRU_'))
results = []
num_of_words = 10000

while datasets_helper.next_dataset():
    results_saver.add_log("Starting testing dataset {}".format(
        datasets_helper.get_dataset_name()))
    validation_count = datasets_helper.get_num_of_train_texts() // 10
    tokenizer = Tokenizer(num_words=num_of_words,
                          filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                          lower=False,
                          split=' ')
    generator = datasets_helper.text_generator()
    results_saver.add_log("Starting preprocessing and tokenization.")
from text_generators.training_text_generator import TrainingTextGenerator
from dataset_loader.dataset_helper import Dataset_Helper
from results_saver import LogWriter, finish_dataset
import os
import sys
from neural_networks.aliaser import *
import tkinter as tk
from tkinter import simpledialog

file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)
root = tk.Tk()
root.withdraw()
preprocess = True
datasets_helper = Dataset_Helper(preprocess)
datasets_helper.set_wanted_datasets([3])
results_saver = LogWriter(log_file_desc=simpledialog.askstring(
    title="Test Name", prompt="Insert test name:", initialvalue='Dense_'))
results = []
num_of_words = 10000

while datasets_helper.next_dataset():
    results_saver.add_log("Starting testing dataset {}".format(
        datasets_helper.get_dataset_name()))
    tokenizer = Tokenizer(num_words=num_of_words)
    generator = datasets_helper.text_generator()
    results_saver.add_log("Starting preprocessing and tokenization.")
    tokenizer.fit_on_texts(generator)
    results_saver.add_log("Done. Building model now.")
    },
    ModelType.DT: {
        'max_features': max_feauters
    }
}
start_time = get_time_in_millis()
preprocess = True
models_for_test = test_model.keys()
for model in models_for_test:
    if not test_model[model]:
        continue
    log_writer = LogWriter(log_file_desc='_{}_{}'.format(
        'prep' if preprocess else 'no-prep', model.name), result_desc='Classic')
    tester = GeneralTester(log_writer, start_time)
    datasets_helper = Dataset_Helper(preprocess=preprocess)
    datasets_helper.set_wanted_datasets([0, 2, 3])
    while datasets_helper.next_dataset():
        if 'topic_count' in models_params[model]:
            models_params[model]['topic_count'] = datasets_helper.get_num_of_topics()
        topic_names = [(index, item) for index, item in enumerate(
            datasets_helper.get_dataset_topic_names())]
        tester.set_new_dataset(datasets_helper.get_num_of_topics(), topic_names)
        output_csv = []
        """for key, value in test_model.items():
            if not value:
                models_params.pop(key)"""
        log_writer.write_any("model-settings",
                             json.dumps(models_params[model]), 'w+', True)
file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)
root = tk.Tk()
root.withdraw()
test_name = simpledialog.askstring(title="Test Name",
                                   prompt="Insert test name:",
                                   initialvalue='LDATests')
# config = tf.compat.v1.ConfigProto(device_count={'GPU': 1, 'CPU': 4})
# sess = tf.compat.v1.Session(config=config)
# tf.keras.backend.set_session(sess)
# results_saver = LogWriter(log_file_desc="Autoencoder")
results = []
# mycolors = np.array([color for name, color in mcolors.XKCD_COLORS.items()])
num_of_words = 10000
dataset_helper = Dataset_Helper(True)
dataset_helper.set_wanted_datasets([2])
dataset_helper.next_dataset()
num_of_topics = dataset_helper.get_num_of_topics()
documents = dataset_helper.get_texts_as_list()
labels = dataset_helper.get_labels(dataset_helper.get_train_file_path())
tokenizer = Tokenizer(num_words=num_of_words)
tokenizer.fit_on_texts(documents)
# items = tokenizer.word_index
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))
matrix = tokenizer.texts_to_matrix(documents, mode='binary')
# mydict = corpora.Dictionary([line.split() for line in documents], prune_at=num_of_words)
# corpus = [mydict.doc2bow(line.split()) for line in documents]
# tfidf = TfidfModel(corpus)
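# Hedged sketch (not part of the original script): the commented-out gensim lines above,
# expanded into a runnable bag-of-words / TF-IDF / LDA pipeline over the same `documents`.
# Assumes gensim is installed; the LdaModel call and its passes/num_topics arguments are
# illustrative additions, not the project's tuned settings.
from gensim import corpora
from gensim.models import TfidfModel, LdaModel

mydict = corpora.Dictionary([line.split() for line in documents], prune_at=num_of_words)
corpus = [mydict.doc2bow(line.split()) for line in documents]
tfidf = TfidfModel(corpus)
lda = LdaModel(corpus, num_topics=num_of_topics, id2word=mydict, passes=2)
print(lda.show_topics(num_topics=num_of_topics, num_words=10))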
                                              ['relu', 'tanh']),
        'dropouts': hp.randint('dropouts', 3),
        'dropout_values': hp.uniform('dropout_values', 0.01, 0.2),
        'epochs': 20,  # hp.randint('epochs', 20),
        'batch_size': batch_size,
        'learning_rate': hp.choice('learning_rate', [0.001, 0.01, 0.0005]),
        'optimizer': hp.choice('optimizer', ['adam', 'rmsprop']),
        'results_saver': results_saver
    }
    return space


file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)
datasets_helper = Dataset_Helper(False)
results_saver = LogWriter(log_file_desc="hyperopt-best-param-search")
results = []
datasets_helper.set_wanted_datasets([1])
models_to_test = ['lstm', 'dense', 'embedding', 'bidi']
"""datasets_helper.next_dataset()
space = create_base_params('lstm', datasets_helper)
smpl = sample(space)
print(sample(space))"""
for model in models_to_test:
    while datasets_helper.next_dataset():
        space = create_base_params(model, datasets_helper, results_saver)
        best = fmin(optimize_model,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=30,
def finish_dataset(model, gnr, dataset_helper: Dataset_Helper, log_writer: LogWriter, history):
    log_writer.write_any('model', model.to_json(), 'w+', True)
    plot_model(model, log_writer.get_plot_path("", "model-graph"), show_shapes=True)
    # model.save_weights(log_writer.convert_name_to_file_path(dataset_helper.get_dataset_name(), 'weights', '.h5'))

    # Plot training vs. validation loss.
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    epochs = range(1, len(loss) + 1)
    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss {}'.format(dataset_helper.get_dataset_name()))
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig(log_writer.get_plot_path(dataset_helper.get_dataset_name(), "loss"))
    plt.clf()

    # Build and plot a confusion matrix when labels are not one-hot vectors.
    if not dataset_helper.vectorized_labels:
        x = []
        for i in range(len(gnr)):
            x.extend(gnr.__getitem__(i))
        x = np.array(x)
        labels = gnr.labels
        predicts = model.predict(x)
        predicts = predicts.argmax(axis=-1)
        # labels = np.array(gnr.labels[:len(predicts)])  # datasets_helper.get_labels(datasets_helper.get_test_file_path())
        # print(confusion_matrix(labels[:len(predicts)], predicts))
        cm = confusion_matrix(labels, predicts)
        # print(cm)
        plot_confusion_matrix(cm, dataset_helper.get_num_of_topics(),
                              dataset_helper.get_dataset_name(), log_writer)
        """fig = plt.figure(figsize=(dataset_helper.get_num_of_topics(), dataset_helper.get_num_of_topics()))
        ax = fig.add_subplot(111)
        cax = ax.matshow(cm)
        for (i, j), z in np.ndenumerate(cm):
            ax.text(j, i, '{:0.1f}'.format(z), ha='center', va='center')
            # bbox=dict(boxstyle='round', facecolor='white', edgecolor='0.3'))
        plt.title('Confusion matrix of the classifier')
        fig.colorbar(cax)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.savefig(log_writer.get_plot_path(dataset_helper.get_dataset_name(), 'confusion_matrix'))
        plt.clf()"""

    # Plot training vs. validation accuracy.
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    plt.plot(epochs, acc, 'bo', label='Training accuracy')
    plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
    plt.title('Training and validation accuracy {}'.format(dataset_helper.get_dataset_name()))
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.savefig(log_writer.get_plot_path(dataset_helper.get_dataset_name(), "acc"))
    plt.clf()
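# Hedged sketch of the plot_confusion_matrix helper called above -- the project's actual
# implementation is defined elsewhere. This variant simply lifts the commented-out matshow
# code inside finish_dataset into a standalone function; only the function name is new.
import numpy as np
import matplotlib.pyplot as plt


def plot_confusion_matrix_sketch(cm, num_of_topics, dataset_name, log_writer):
    fig = plt.figure(figsize=(num_of_topics, num_of_topics))
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm)
    for (i, j), z in np.ndenumerate(cm):
        ax.text(j, i, '{:0.1f}'.format(z), ha='center', va='center')
    plt.title('Confusion matrix of the classifier')
    fig.colorbar(cax)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.savefig(log_writer.get_plot_path(dataset_name, 'confusion_matrix'))
    plt.clf()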
import numpy as np
import matplotlib.pyplot as plt
from training_text_generator_RNN_embedding import Training_Text_Generator_RNN_Embedding
from dataset_loader.dataset_helper import Dataset_Helper
from results_saver import LogWriter
import os
import sys
from neural_networks.aliaser import *

file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)
datasets_helper = Dataset_Helper()
results_saver = LogWriter(
    log_file_desc="Bidirectional-recurrent-dropout-Embed-preprocessing")
results = []
num_of_words = 10000

while datasets_helper.next_dataset():
    results_saver.add_log("Starting testing dataset {}".format(
        datasets_helper.get_dataset_name()))
    validation_count = datasets_helper.get_num_of_train_texts() // 10
    tokenizer = Tokenizer(num_words=num_of_words,
                          filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                          lower=False,
                          split=' ')
    generator = datasets_helper.text_generator()
    results_saver.add_log("Starting preprocessing and tokenization.")
    tokenizer.fit_on_texts(generator)
    results_saver.add_log("Done. Building model now.")