Example #1
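    # Example #1: log the trained autoencoder's architecture, plot its training/validation
    # loss, and extract the most important words per topic from the input-side weight
    # matrix (weight_in, transposed so that each row corresponds to one topic).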
    weight_in = weight_in.transpose()
    #combined_weight = np.dot(weight_in.transpose(), weight_out)
    num_of_important_words = 20

    log_writer = LogWriter(log_file_desc='{}{}'.format(test_name, ""),
                           result_desc="NeuralTopicModel")

    log_writer.write_any('model', autoencoder.to_json(), 'w+', True)
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    epochs = range(1, len(loss) + 1)
    plt.plot(epochs, loss, 'g', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss {}'.format(
        dataset_helper.get_dataset_name()))
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig(
        log_writer.get_plot_path(dataset_helper.get_dataset_name(), "loss"))
    plt.clf()
    """topic_words_in = [sorted(topic_words,key=lambda x: x[1],reverse=True) for topic_words in topic_words_in]
    topic_words_out = [sorted(topic_words,key=lambda x: x[1],reverse=True) for topic_words in topic_words_out]
    log_writer = LogWriter(log_file_desc='LDATestsRegularize{}'.format(regularization))
    log_writer.write_2D_list('topic_words_in', topic_words_in)
    log_writer.write_2D_list('topic_words_out', topic_words_out)"""
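    # get_extremes is a project helper; judging by its arguments it returns the
    # num_of_important_words highest- (or lowest-) weighted words for each topic row
    # and logs them under the given name.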

    topic_words_in_max = get_extremes(weight_in, num_of_topics,
                                      num_of_important_words, reverse_word_map,
                                      True, 'topic_words_in_max', log_writer,
                                      dataset_helper.get_dataset_name())
Example #2
file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)
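# Example #2: for each dataset, fit a Keras Tokenizer on a streaming text generator and
# set the training hyperparameters; the Tk root window is hidden so only the test-name
# dialog (defaulting to 'CONV_GRU_') is shown.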
root = tk.Tk()
root.withdraw()

preprocess = True
datasets_helper = Dataset_Helper(preprocess)
results_saver = LogWriter(log_file_desc=simpledialog.askstring(
    title="Test Name", prompt="Insert test name:", initialvalue='CONV_GRU_'))
results = []
num_of_words = 10000

while datasets_helper.next_dataset():
    results_saver.add_log("Starting testing dataset {}".format(
        datasets_helper.get_dataset_name()))
    validation_count = datasets_helper.get_num_of_train_texts() // 10
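    # Keep only the num_of_words most frequent tokens; lowercasing is disabled and the
    # reduced filter list (no '.', ',', '!' or '?') leaves basic punctuation in place.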
    tokenizer = Tokenizer(num_words=num_of_words,
                          filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                          lower=False,
                          split=' ')
    generator = datasets_helper.text_generator()
    results_saver.add_log("Starting preprocessing and tokenization.")
    tokenizer.fit_on_texts(generator)
    results_saver.add_log("Done. Building model now.")

    batch_size = 256
    gauss_noise = 0.5
    epochs = 1
    val_split = 0.2
    val_data_count = int(datasets_helper.get_num_of_train_texts() * val_split)
Example #3
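        # Example #3: register the current dataset's topics with the tester, log the
        # model's hyperparameters, and hand the preprocessed train/test texts over for
        # evaluation.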
        topic_names = [(index, item) for index, item in enumerate(
            datasets_helper.get_dataset_topic_names())]
        tester.set_new_dataset(datasets_helper.get_num_of_topics(),
                               topic_names)
        output_csv = []
        """for key,value in test_model.items():
            if not value:
                models_params.pop(key)"""
        log_writer.write_any("model-settings",
                             json.dumps(models_params[model]), 'w+', True)
        seed = 5
        random.seed(seed)

        log_writer.add_log(
            "Starting preprocessing texts of {} for training".format(
                datasets_helper.get_dataset_name()))
        texts_for_train = datasets_helper.get_dataset(DatasetType.TRAIN)
        log_writer.add_log("Preprocessing finished")

        log_writer.add_log(
            "Starting preprocessing texts of {} for testing".format(
                datasets_helper.get_dataset_name()))
        texts_for_testing = datasets_helper.get_dataset(DatasetType.TEST)
        log_writer.add_log("Preprocessing finished")

        statistics = []
        tester.set_new_preprocess_docs(texts_for_train, texts_for_testing)
        test_params = {
            "dataset_name": datasets_helper.get_dataset_name(),
            'dataset_helper': datasets_helper
        }
Example #4
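# Example #4: prompt for a test name (default 'Dense_'), limit the run to a single
# dataset via set_wanted_datasets([3]), fit a Keras Tokenizer on each dataset and start
# building a Sequential feed-forward model.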
file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)
root = tk.Tk()
root.withdraw()

preprocess = True
datasets_helper = Dataset_Helper(preprocess)
datasets_helper.set_wanted_datasets([3])
results_saver = LogWriter(log_file_desc=simpledialog.askstring(
    title="Test Name", prompt="Insert test name:", initialvalue='Dense_'))
results = []
num_of_words = 10000

while datasets_helper.next_dataset():
    results_saver.add_log("Starting testing dataset {}".format(
        datasets_helper.get_dataset_name()))
    tokenizer = Tokenizer(num_words=num_of_words)
    generator = datasets_helper.text_generator()
    results_saver.add_log("Starting preprocessing and tokenization.")
    tokenizer.fit_on_texts(generator)
    results_saver.add_log("Done. Building model now.")

    epochs = 1
    batch_size = 256
    val_split = 0.2
    val_data_count = int(datasets_helper.get_num_of_train_texts() * val_split)
    enhanced_num_of_topics = 128
    gauss_noise = 0.5
    tokenizer_mode = 'tfidf'

    model = Sequential()
Example #5
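    # Example #5: build a reverse word index and a binary document-term matrix, train an
    # LDA model (a project wrapper, judging by the commented-out gensim.models.LdaModel
    # call), then log the top topic words and measure their coherence.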
    #items= tokenizer.word_index
    reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))
    matrix = tokenizer.texts_to_matrix(documents, mode='binary')

    num_of_important_words = 20
    log_writer = LogWriter(log_file_desc='{}{}'.format(test_name, ""),
                           result_desc="NeuralTopicModel")

    model = Lda(num_of_topics,
                num_of_important_words,
                passes=25,
                iterations=25)
    """gensim.models.LdaModel(
    doc_term_matrix,
    num_topics=num_of_topics,
    id2word=dictionary,
    passes=2,
    iterations=2)"""

    #LDA section
    model.train(documents)
    topic_words_lda = extract_important_words(model.get_topics(), True)
    print(topic_words_lda)
    log_writer.write_2D_list('topic_words_lda', topic_words_lda, 'w+')
    test_model(documents, labels, model, log_writer, 'standard_lda')
    #plot_clustering_chart(model,True,documents,log_writer,'lda',dataset_helper.get_dataset_name(),dataset_helper.get_num_of_topics())
    measureCoherence(topic_words_lda, log_writer, model.dictionary, documents,
                     'lda', dataset_helper.get_dataset_name())

    log_writer.end_logging()
Example #6
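# Example #6: read the input/output weight matrices of a trained autoencoder
# (get_weights() indices 2 and 4 here; the exact positions depend on the architecture)
# and plot its training history.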
weight_in = autoencoder.get_weights()[2]
weight_out = autoencoder.get_weights()[4]
blob = np.array([])
weight_in = weight_in.transpose()
#tst = autoencoder.get_weights()
num_of_important_words = 20
from results_saver import LogWriter

log_writer = LogWriter(log_file_desc='{}{}'.format(test_name, regularization))
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)
plt.plot(epochs, loss, 'g', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss {}'.format(
    dataset_helper.get_dataset_name()))
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.savefig(log_writer.get_plot_path(dataset_helper.get_dataset_name(),
                                     "loss"))
plt.clf()
"""topic_words_in = [sorted(topic_words,key=lambda x: x[1],reverse=True) for topic_words in topic_words_in]
topic_words_out = [sorted(topic_words,key=lambda x: x[1],reverse=True) for topic_words in topic_words_out]
log_writer = LogWriter(log_file_desc='LDATestsRegularize{}'.format(regularization))
log_writer.write_2D_list('topic_words_in', topic_words_in)
log_writer.write_2D_list('topic_words_out', topic_words_out)"""

topic_words_in_max = get_extremes(weight_in, num_of_topics,
                                  num_of_important_words, reverse_word_map,
                                  True, 'topic_words_in_max', log_writer,
                                  dataset_helper.get_dataset_name())
Example #7
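# Example #7: log the autoencoder architecture and extract the highest- and
# lowest-weighted words per topic from both the input and output weight matrices.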
#combined_weight = np.dot(weight_in.transpose(), weight_out)
num_of_important_words = 20

log_writer = LogWriter(log_file_desc='{}{}'.format(test_name, regularization))

log_writer.write_any('model', autoencoder.to_json(), 'w+', True)
"""topic_words_in = [sorted(topic_words,key=lambda x: x[1],reverse=True) for topic_words in topic_words_in]
topic_words_out = [sorted(topic_words,key=lambda x: x[1],reverse=True) for topic_words in topic_words_out]
log_writer = LogWriter(log_file_desc='LDATestsRegularize{}'.format(regularization))
log_writer.write_2D_list('topic_words_in', topic_words_in)
log_writer.write_2D_list('topic_words_out', topic_words_out)"""

topic_words_in_max = get_extremes(weight_in, num_of_topics,
                                  num_of_important_words, reverse_word_map,
                                  True, 'topic_words_in_max', log_writer,
                                  dataset_helper.get_dataset_name())
topic_words_in_min = get_extremes(weight_in, num_of_topics,
                                  num_of_important_words, reverse_word_map,
                                  False, 'topic_words_in_min', log_writer,
                                  dataset_helper.get_dataset_name())
topic_words_out_max = get_extremes(weight_out, num_of_topics,
                                   num_of_important_words, reverse_word_map,
                                   True, 'topic_words_out_max', log_writer,
                                   dataset_helper.get_dataset_name())
topic_words_out_min = get_extremes(weight_out, num_of_topics,
                                   num_of_important_words, reverse_word_map,
                                   False, 'topic_words_out_min', log_writer,
                                   dataset_helper.get_dataset_name())
#topic_words_combined = get_extremes(combined_weight, num_of_topics, num_of_important_words, reverse_word_map, False, 'topic_words_combined', log_writer, dataset_helper.get_dataset_name())
"""texts = [text.split() for text in documents]
dictionary = corpora.Dictionary(texts)
Example #8
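# Example #8: hyperparameter search with hyperopt: for every network type and dataset,
# run TPE for up to 30 evaluations and log the best configuration found.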
models_to_test = ['lstm', 'dense', 'embedding', 'bidi']
"""datasets_helper.next_dataset()
space = create_base_params('lstm',datasets_helper)
smpl = sample(space)
print(sample(space))"""
for model in models_to_test:
    while datasets_helper.next_dataset():
        space = create_base_params(model, datasets_helper, results_saver)
        best = fmin(optimize_model,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=30,
                    max_queue_len=1,
                    verbose=False)
        results_saver.add_log(
            'Best params for network type {} and dataset {} are: {}\n{}'.
            format(model, datasets_helper.get_dataset_name(), best,
                   space_eval(space, best)))
        results_saver.write_any('best_params', [
            model,
            datasets_helper.get_dataset_name(),
            space_eval(space, best)
        ], 'a')
        #results_saver.write_2D_list([[model,datasets_helper.get_dataset_name(),best]],'best_params','a')
    datasets_helper.reset_dataset_counter()
"""best_run, best_model = optim.minimize(model=test,
                                          data=[],
                                          algo=tpe.suggest,
                                          max_evals=5,
                                          trials=Trials())"""
Example #9
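# Example #9: after training, save the model architecture and graph, plot the loss and
# accuracy curves and, when the dataset helper reports non-vectorized labels, a
# confusion matrix.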
def finish_dataset(model, gnr, dataset_helper: Dataset_Helper,
                   log_writer: LogWriter, history):
    log_writer.write_any('model', model.to_json(), 'w+', True)
    plot_model(model,
               log_writer.get_plot_path("", "model-graph"),
               show_shapes=True)
    #model.save_weights(log_writer.convert_name_to_file_path(dataset_helper.get_dataset_name(),'weights','.h5'))
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    epochs = range(1, len(loss) + 1)
    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss {}'.format(
        dataset_helper.get_dataset_name()))
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig(
        log_writer.get_plot_path(dataset_helper.get_dataset_name(), "loss"))
    plt.clf()

    if not dataset_helper.vectorized_labels:
        x = []
        for i in range(len(gnr)):
            x.extend(gnr[i])
        x = np.array(x)
        labels = gnr.labels
        predicts = model.predict(x)
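        # Collapse the softmax outputs to predicted class indices before building the
        # confusion matrix.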
        predicts = predicts.argmax(axis=-1)
        #labels = np.array(gnr.labels[:len(predicts)])  # datasets_helper.get_labels(datasets_helper.get_test_file_path())
        # print(confusion_matrix(labels[:len(predicts)],predicts))

        cm = confusion_matrix(labels, predicts)
        #print(cm)
        plot_confusion_matrix(cm, dataset_helper.get_num_of_topics(),
                              dataset_helper.get_dataset_name(), log_writer)
        """fig = plt.figure(figsize=(dataset_helper.get_num_of_topics(), dataset_helper.get_num_of_topics()))
        ax = fig.add_subplot(111)
        cax = ax.matshow(cm)
        for (i, j), z in np.ndenumerate(cm):
            ax.text(j, i, '{:0.1f}'.format(z), ha='center', va='center')
            # bbox=dict(boxstyle='round', facecolor='white', edgecolor='0.3'))
        plt.title('Confusion matrix of the classifier')
        fig.colorbar(cax)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.savefig(log_writer.get_plot_path(dataset_helper.get_dataset_name(), 'confusion_matrix'))
        plt.clf()"""

    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    plt.plot(epochs, acc, 'bo', label='Training accuracy')
    plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
    plt.title('Training and validation accuracy {}'.format(
        dataset_helper.get_dataset_name()))
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.savefig(
        log_writer.get_plot_path(dataset_helper.get_dataset_name(), "acc"))
    plt.clf()