def run_image_cnn_model(model, train_data, train_labels, test_data,
                        test_labels):
    print('###  CNN  ###')
    # compile the model
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy', 'mse'])

    train_data, val_data, train_labels, val_labels = ch.get_train_test_from_data(
        train_data, train_labels)

    # TODO: detect whether the task is multi-label or not
    # train the model, validating on the held-out split
    model.fit(train_data,
              train_labels,
              epochs=5,
              validation_data=(val_data, val_labels),
              verbose=1)

    # check on the test dataset
    test_loss, test_acc, test_mse = model.evaluate(test_data, test_labels)
    # make predictions
    predictions = model.predict(test_data)
    return [test_loss, test_acc, test_mse], predictions
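
Several of these examples call ch.get_train_test_from_data to carve a validation split out of the training data; a minimal sketch of what that helper presumably does, assuming it simply wraps scikit-learn's train_test_split:

from sklearn.model_selection import train_test_split

def get_train_test_from_data(data, labels, test_size=0.2, seed=42):
    # hypothetical re-implementation of ch.get_train_test_from_data:
    # split data/labels into a larger and a smaller partition
    # (train/validation here, train/test in the other examples)
    return train_test_split(data, labels, test_size=test_size, random_state=seed)
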
def run_text_cnn_model(model, train_data, train_labels, test_data,
                       test_labels):
    print('###  CNN  ###')
    # compile the model
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy', 'mse'])

    train_data, val_data, train_labels, val_labels = ch.get_train_test_from_data(
        train_data, train_labels)

    # TODO: convert to multi-label classification
    # train_labels = train_labels[:, 0]
    # val_labels = val_labels[:, 0]

    # train the model
    history = model.fit(train_data,
                        train_labels,
                        epochs=40,
                        batch_size=512,
                        validation_data=(val_data, val_labels),
                        verbose=1)

    # check on the test dataset
    test_loss, test_acc, test_mse = model.evaluate(test_data, test_labels)
    predictions = model.predict(test_data)
    return [test_loss, test_acc, test_mse], predictions
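
run_text_cnn_model receives its network from pmh.get_text_convolutional_from_web (not shown here); a minimal Keras sketch of a comparable architecture, assuming an Embedding / Conv1D / global-max-pooling stack with a sigmoid output layer to match the binary_crossentropy loss used above:

from tensorflow import keras

def build_text_cnn(len_vocabulary, n_classes, embedding_dim=128):
    # hypothetical stand-in for pmh.get_text_convolutional_from_web
    model = keras.Sequential([
        keras.layers.Embedding(len_vocabulary, embedding_dim),
        keras.layers.Conv1D(128, 5, activation='relu'),
        keras.layers.GlobalMaxPooling1D(),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(n_classes, activation='sigmoid'),
    ])
    return model
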
Example #3
def apply_word2vec_extratrees(data_frame, classif_level, classif_type,
                              source_path):
    data_frame['text'] = data_frame.apply(
        lambda row: th.tokenize_complex_text(row['text']), axis=1)
    data_frame['classification'] = data_frame.apply(
        lambda row: th.tokenize_complex_text(row['classification']), axis=1)

    df_single_classification = ch.get_list_each_text_a_different_classification(
        data_frame)

    x = df_single_classification['text']
    y = df_single_classification['classification']

    X_train, X_test, y_train, y_test = ch.get_train_test_from_data(x, y)

    model_w2v = wmh.get_word2vec_model(X_train)

    etree_w2v = Pipeline([("word2vec vectorizer",
                           wmh.MeanEmbeddingVectorizer(model_w2v)),
                          ("extra trees", pmh.get_extra_tree())])
    etree_w2v_tfidf = Pipeline([("word2vec vectorizer",
                                 wmh.TfidfEmbeddingVectorizer(model_w2v)),
                                ("extra trees", pmh.get_extra_tree())])

    # NB: the pipeline does not support multiple targets, so every source text is
    # duplicated, once per target (see get_list_each_text_a_different_classification)
    y_pred = pmh.fit_predict_functions(etree_w2v_tfidf, X_train, y_train,
                                       X_test)

    classifier_name_0 = 'Word2Vec/TfidfEmbeddingVectorizer'
    classifier_name_1, parameters_1 = ch.get_extratree_classifier_information(
        str(etree_w2v_tfidf))
    model_name = '[all classes predictions]' + classifier_name_0 + '/' + classifier_name_1

    # this should be replaced by comparing all possible labels for each text
    # (the original data frame can be used for that)
    list_metrics = mh.calculate_metrics(model_name, y_test, y_pred)
    none_average, binary_average, micro_average, macro_average = list_metrics

    ch.save_results(classifier_name_1, list_metrics, parameters_1, model_name,
                    classif_level, classif_type, source_path)

    # NB: the pipeline does not support multiple targets, so every source text is
    # duplicated, once per target (see get_list_each_text_a_different_classification)
    y_pred = pmh.fit_predict_functions(etree_w2v, X_train, y_train, X_test)

    classifier_name_2 = 'Word2Vec/MeanEmbeddingVectorizer'
    model_name = '[all classes predictions]' + classifier_name_2 + '/' + classifier_name_1

    # this should be replaced by comparing all possible labels for each text
    # (the original data frame can be used for that)
    list_metrics = mh.calculate_metrics(model_name, y_test, y_pred)
    none_average, binary_average, micro_average, macro_average = list_metrics

    ch.save_results(classifier_name_1, list_metrics, parameters_1, model_name,
                    classif_level, classif_type, source_path)
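
wmh.MeanEmbeddingVectorizer and wmh.TfidfEmbeddingVectorizer are not shown here; a common way to implement the first one, assuming a gensim Word2Vec model, is to average the vectors of the known words in each tokenized document (the TF-IDF variant would weight each word vector by its TF-IDF score instead):

import numpy as np

class MeanEmbeddingVectorizer:
    # hypothetical sketch of wmh.MeanEmbeddingVectorizer
    def __init__(self, word2vec_model):
        self.wv = word2vec_model.wv
        self.dim = word2vec_model.vector_size

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # average the vectors of the known words in each document,
        # falling back to a zero vector when no word is in the vocabulary
        return np.array([
            np.mean([self.wv[w] for w in doc if w in self.wv], axis=0)
            if any(w in self.wv for w in doc) else np.zeros(self.dim)
            for doc in X
        ])
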
Example #4
def second_attempt_from_web(data_frame, text_vectorizer, class_vectorizer, classif_level, classif_type, dataset_location):

    root_location = fh.get_root_location('data/convolutional_outcome/')

    # Reference (Keras IMDB text-classification tutorial): load keras.datasets.imdb with
    # num_words=10000, shift every index from imdb.get_word_index() by 3 to reserve
    # <PAD>=0, <START>=1, <UNK>=2 and <UNUSED>=3, then pad train_data and test_data to
    # maxlen=256 with keras.preprocessing.sequence.pad_sequences(value=word_index["<PAD>"],
    # padding='post') so that all sequences share the same length.

    model_name = text_vectorizer+'/'+class_vectorizer+'/NN'
    standard_results = ch.apply_df_vectorizer(data_frame, text_vectorizer, class_vectorizer, model_name)
    train_data, test_data, train_labels, test_labels, classes, n_classes, vocab_processor, len_vocabulary = standard_results

    train_data, val_data, train_labels, val_labels = ch.get_train_test_from_data(train_data, train_labels)

    # print("Training entries: {}, labels: {}".format(len(train_data), len(train_labels)))
    # print("how data looks like: ", train_data[0]) # [1 14 34 0 0 0]
    # print("how labels looks like: ", train_labels[0:5]) # list of lists [[0 1 0 0 0 1 0 1], ...]

    model = pmh.get_text_convolutional_from_web(len_vocabulary, n_classes)
    metrics, predictions = pmh.run_text_cnn_model(model, train_data, train_labels, test_data, test_labels)

    classifier_name, layers = ch.get_sequential_classifier_information(model)
    mh.display_convolutional_metrics(classifier_name, metrics[0], metrics[1], metrics[2], test_labels, predictions)

    ch.save_results(classifier_name, metrics, layers, model_name, classif_level, classif_type, dataset_location)
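
The padding step summarized in the reference comment above keeps every sequence the same length; a small self-contained illustration (the values are made up, not taken from this pipeline):

from tensorflow import keras

sequences = [[1, 14, 34], [1, 7, 2, 9, 14]]
padded = keras.preprocessing.sequence.pad_sequences(sequences,
                                                    value=0,  # the <PAD> index
                                                    padding='post',
                                                    maxlen=8)
print(padded)  # every row now has length 8, zero-padded at the end
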
Example #5
def apply_multilabel_label_encoder_tfidf_classification(
        data_frame, classif_level, classif_type, source_path):
    data_frame = ch.get_list_each_text_a_different_classification(data_frame)

    temp_text, patent_ids, vectorizer = ch.apply_tfidf_vectorizer_fit_transform(
        data_frame)

    X_train_tfidf, X_test_tfidf, y_train, y_test = ch.get_train_test_from_data(
        temp_text, data_frame['classification'])

    # NB: the encoder has to map classes to the same integers for both splits
    y_train = ch.apply_label_encoder(y_train)
    y_test = ch.apply_label_encoder(y_test)

    apply_naive_bayes(X_train_tfidf, y_train, X_test_tfidf, y_test,
                      classif_level, classif_type, source_path)
    apply_svm(X_train_tfidf, y_train, X_test_tfidf, y_test, classif_level,
              classif_type, source_path)
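
For reference, a self-contained sketch of the TF-IDF / label-encoding / naive-Bayes flow that the helpers above presumably wrap (the scikit-learn calls below are an assumption about what apply_tfidf_vectorizer_fit_transform, apply_label_encoder and apply_naive_bayes do internally):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB

texts = ["rotor blade assembly", "data encryption method", "rotor hub design"]
labels = ["mechanics", "software", "mechanics"]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# fit the encoder once so train and test share the same class-to-integer mapping
encoder = LabelEncoder().fit(labels)
y = encoder.transform(labels)

clf = MultinomialNB().fit(X, y)
print(encoder.inverse_transform(clf.predict(vectorizer.transform(["blade design"]))))
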
Example #6
def train_testing_convolution(data_frame, text_vectorizer, class_vectorizer):
    save_standard_sets = True
    root_location = fh.get_root_location('data/convolutional_outcome/')

    sets_location = fh.join_paths(root_location, "model_sets")
    checkpoint_path = fh.join_paths(root_location, "model_checkpoints")
    model_path = fh.link_paths(checkpoint_path, 'convolution_model')
    weights_path = fh.link_paths(checkpoint_path, 'convolution_weights')

    # get sets
    model_name = text_vectorizer+'/'+class_vectorizer+'/NN'
    standard_results = ch.apply_df_vectorizer(data_frame, text_vectorizer, class_vectorizer, model_name)
    train_data, test_data, train_labels, test_labels, classes, n_classes, vocab_processor, len_vocabulary = standard_results
    train_data, val_data, train_labels, val_labels = ch.get_train_test_from_data(train_data, train_labels)

    # save sets
    # ch.save_sets(sets_location, train_data, test_data, val_data, train_labels, test_labels, val_labels,
    #           [classes, n_classes, vocab_processor, len_vocabulary])

    # for testing: reload previously saved sets (this overwrites the freshly vectorized data above)
    train_data, test_data, val_data, train_labels, test_labels, val_labels, _ = ch.load_sets(sets_location)

    # a label might appear only in the test/validation data, which could be a problem
    sequence_length = train_data.shape[1]
    # define the model
    model = pmh.get_cnn_test(len_vocabulary, n_classes, sequence_length)

    # calculate metrics on the validation data
    model, val_predictions = pmh.run_cnn_test(model,
                                              train_data, train_labels,
                                              val_data, val_labels,
                                              val_data, val_labels,
                                              model_path, weights_path, True)
    binary_val_predictions = mh.get_binary_0_5(val_predictions)
    print(val_labels.shape)
    print(val_predictions.shape)
    # display validation metrics
    metrics = mh.get_sequential_metrics(val_labels, val_predictions, binary_val_predictions)
    mh.display_sequential_metrics('validation convolution sequence', metrics)
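
mh.get_binary_0_5 presumably thresholds the sigmoid outputs at 0.5 before the multi-label metrics are computed; a one-function sketch under that assumption:

import numpy as np

def get_binary_0_5(predictions, threshold=0.5):
    # hypothetical re-implementation of mh.get_binary_0_5:
    # turn per-class probabilities into a 0/1 multi-label matrix
    return (np.asarray(predictions) >= threshold).astype(int)
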
Example #7
def test_LSTM_from_web(data_frame, text_vectorizer, class_vectorizer, classif_level, classif_type, dataset_location):
    print('### LSTM Doing Testing ###')

    root_location = fh.get_root_location('data/lstm_outcome/')

    nn_parameter_search_location = fh.join_paths(root_location, "nn_fhv_parameter_search")
    doc2vec_model_location = fh.link_paths(fh.join_paths(root_location, "doc2vec_model/vocab_model/"), "doc2vec_model")

    save_results = True

    sequence_size = 1
    EMBEDDING_SIZE = 150

    model_name = text_vectorizer+'/'+class_vectorizer+'/LSTM'
    results = ch.apply_df_vectorizer(data_frame, text_vectorizer, class_vectorizer, model_name)
    X_train, X_test, y_train, y_test, classes, n_classes, vocab_processor, len_vocabulary = results

    X_train, X_val, y_train, y_val = ch.get_train_test_from_data(X_train, y_train)

    training_docs_list = X_train['patent_id']
    test_docs_list = X_test['patent_id']
    val_docs_list = X_val['patent_id']

    X_data, Xv_data, Xt_data = ch.get_df_data(3, training_docs_list, val_docs_list, test_docs_list, sequence_size, EMBEDDING_SIZE, doc2vec_model_location)
    GLOBAL_VARS.DOC2VEC_MODEL_NAME, GLOBAL_VARS.MODEL_NAME = wmh.set_parameters_lstm_doc2vec(nn_parameter_search_location, classif_level, classif_type)

    NN_INPUT_NEURONS, NN_SEQUENCE_SIZE, NN_OUTPUT_NEURONS = pmh.get_lstm_shapes(X_data, n_classes)
    NN_BATCH_SIZE, PARTS_LEVEL, NN_MAX_EPOCHS, QUEUE_SIZE = pmh.get_lstm_basic_parameters()
    params = pmh.get_lstm_testing_parameters()
    lstm_output_size, w_dropout_do, u_dropout_do, stack_layers, conv_size, conv_filter_length, conv_max_pooling_length = params
    EARLY_STOPPER_MIN_DELTA, EARLY_STOPPER_PATIENCE = pmh.get_early_stopping_parameters()

    TEST_METRICS_FILENAME = '{}_level_{}_standard_nn_test_metrics_dict.pkl'

    test_metrics_dict = dict()
    test_metrics_path = fh.link_paths(fh.link_paths(nn_parameter_search_location, GLOBAL_VARS.MODEL_NAME), TEST_METRICS_FILENAME.format(classif_type, PARTS_LEVEL))

    param_results_path = fh.link_paths(fh.link_paths(nn_parameter_search_location, GLOBAL_VARS.MODEL_NAME), NN_PARAMETER_SEARCH_PREFIX.format(classif_type, classif_level, NN_BATCH_SIZE))

    param_results_dict = pickle.load(open(param_results_path, 'rb'))
    GLOBAL_VARS.NN_MODEL_NAME = 'lstm_size_{}_w-drop_{}_u-drop_{}_stack_{}_conv_{}'.format(lstm_output_size,
                                                                                            w_dropout_do,
                                                                                            u_dropout_do,
                                                                                            stack_layers,
                                                                                            str(conv_size)
                                                                                            )
    if conv_size:
        GLOBAL_VARS.NN_MODEL_NAME += '_conv-filter-length_{}_max-pooling-size_{}'.format(conv_filter_length,
                                                                                         conv_max_pooling_length)
    if GLOBAL_VARS.NN_MODEL_NAME not in param_results_dict.keys():
        raise Exception("Can't find model: {}".format(GLOBAL_VARS.NN_MODEL_NAME))

    if fh.ensure_exists_path_location(test_metrics_path):
        test_metrics_dict = pickle.load(open(test_metrics_path, 'rb'))
        if GLOBAL_VARS.NN_MODEL_NAME in test_metrics_dict.keys():
            test_metrics = test_metrics_dict[GLOBAL_VARS.NN_MODEL_NAME]
            print("Test metrics already exist for: {}".format(GLOBAL_VARS.NN_MODEL_NAME))
            print("** Test Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}".format(
                test_metrics['coverage_error'], test_metrics['average_num_of_labels'],
                test_metrics['top_1'], test_metrics['top_3'], test_metrics['top_5'],
                test_metrics['f1_micro'], test_metrics['f1_macro']))
            raise Exception("Test metrics already computed for: {}".format(GLOBAL_VARS.NN_MODEL_NAME))

    print('***************************************************************************************')
    print(GLOBAL_VARS.NN_MODEL_NAME)

    model = pmh.get_keras_rnn_model(NN_INPUT_NEURONS, NN_SEQUENCE_SIZE, NN_OUTPUT_NEURONS,
                                    lstm_output_size, w_dropout_do, u_dropout_do, stack_layers, conv_size,
                                    conv_filter_length, conv_max_pooling_length)

    # get model best weights
    weights = param_results_dict[GLOBAL_VARS.NN_MODEL_NAME]['best_weights']
    model.set_weights(weights)

    print('Evaluating on Test Data using best weights')
    _, ytp, ytp_binary = pmh.predict_generator(None, model, Xt_data, y_test, NN_BATCH_SIZE, QUEUE_SIZE, test_docs_list)

    print('Generating Test Metrics')
    test_metrics = mh.get_sequential_metrics(y_test, ytp, ytp_binary)
    mh.display_sequential_metrics(test_metrics)

    if save_results:
        classifier_name, parameters = ch.get_sequential_classifier_information(model)
        ch.save_results(classifier_name+'_LSTM', test_metrics, parameters, model_name, classif_level, classif_type, dataset_location)

        test_metrics_dict[GLOBAL_VARS.NN_MODEL_NAME] = test_metrics
        pickle.dump(test_metrics_dict, open(test_metrics_path, 'wb'))
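
pmh.get_keras_rnn_model is defined elsewhere; a hedged Keras sketch of a comparable stacked-LSTM network, assuming the parameters carry their obvious meaning (the optional Conv1D/MaxPooling front end mirrors the conv_size switch used when building GLOBAL_VARS.NN_MODEL_NAME):

from tensorflow import keras

def build_rnn_model(input_neurons, sequence_size, output_neurons,
                    lstm_output_size, w_dropout, u_dropout,
                    stack_layers=1, conv_size=None,
                    conv_filter_length=3, conv_max_pooling_length=2):
    # hypothetical stand-in for pmh.get_keras_rnn_model
    model = keras.Sequential()
    model.add(keras.layers.Input(shape=(sequence_size, input_neurons)))
    if conv_size:
        model.add(keras.layers.Conv1D(conv_size, conv_filter_length,
                                      padding='same', activation='relu'))
        model.add(keras.layers.MaxPooling1D(conv_max_pooling_length, padding='same'))
    for i in range(stack_layers):
        model.add(keras.layers.LSTM(lstm_output_size,
                                    dropout=w_dropout,
                                    recurrent_dropout=u_dropout,
                                    return_sequences=(i < stack_layers - 1)))
    model.add(keras.layers.Dense(output_neurons, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model
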
def preprocessing_data_for_fasttext(data_frame, text_vectorizer,
                                    class_vectorizer):
    root_location = fh.get_root_location('data/fasttext_outcome/')

    data_frame['text'] = data_frame['text'].replace(
        '\n', ' ', regex=True).replace('\t', ' ', regex=True)

    model_name = text_vectorizer + '/' + class_vectorizer + '/FastText'
    try:
        X_train, X_test, Y_train, Y_test, _, _, _, _ = ch.apply_df_vectorizer(
            data_frame, text_vectorizer, class_vectorizer, model_name)

        X_train, X_val, Y_train, Y_val = ch.get_train_test_from_data(
            X_train, Y_train)

        # wrap the vectorized splits in DataFrames when they are plain arrays
        if not isinstance(X_train, pd.DataFrame):
            train = pd.DataFrame(data=X_train)
            test = pd.DataFrame(data=X_test)
            val = pd.DataFrame(data=X_val)
            # test_labels = pd.DataFrame(columns=[''])
        else:
            train = X_train
            test = X_test
            val = X_val

        train.loc[:, 1] = Y_train
        test.loc[:, 1] = Y_test
        val.loc[:, 1] = Y_val

        train.drop(columns=['patent_id'], inplace=True)
        test.drop(columns=['patent_id'], inplace=True)
        val.drop(columns=['patent_id'], inplace=True)

        data_frame.to_csv(fh.link_paths(root_location, 'dataframe.csv'),
                          index=False,
                          sep=' ',
                          header=False,
                          quoting=csv.QUOTE_NONE,
                          quotechar="",
                          escapechar=" ")

        train.to_csv(fh.link_paths(root_location, 'training set.csv'),
                     index=False,
                     sep=' ',
                     header=False,
                     quoting=csv.QUOTE_NONE,
                     quotechar="",
                     escapechar=" ")
        test.to_csv(fh.link_paths(root_location, 'testing set.csv'),
                    index=False,
                    sep=',',
                    header=False,
                    quoting=csv.QUOTE_NONE,
                    quotechar="",
                    escapechar=" ")
    except Exception:
        print('a problem occurred while storing the data frames; '
              'rebuilding them as plain text/classification DataFrames instead')

        X_train, X_test, Y_train, Y_test, _, _, _, _ = ch.apply_df_vectorizer(
            data_frame, text_vectorizer, class_vectorizer, model_name)

        X_train, X_val, Y_train, Y_val = ch.get_train_test_from_data(
            X_train, Y_train)

        val = pd.DataFrame({'text': X_val, 'classification': Y_val})
        train = pd.DataFrame({'text': X_train, 'classification': Y_train})
        test = pd.DataFrame({'text': X_test, 'classification': Y_test})

        data_frame.to_csv(fh.link_paths(root_location, 'dataframe.csv'),
                          index=False,
                          sep=' ',
                          header=False,
                          quoting=csv.QUOTE_NONE,
                          quotechar="",
                          escapechar=" ")

        val.to_csv(fh.link_paths(root_location, 'validating set.csv'),
                   index=False,
                   sep=' ',
                   header=False,
                   quoting=csv.QUOTE_NONE,
                   quotechar="",
                   escapechar=" ")
        train.to_csv(fh.link_paths(root_location, 'training set.csv'),
                     index=False,
                     sep=' ',
                     header=False,
                     quoting=csv.QUOTE_NONE,
                     quotechar="",
                     escapechar=" ")
        test.to_csv(fh.link_paths(root_location, 'testing set.csv'),
                    index=False,
                    sep=',',
                    header=False,
                    quoting=csv.QUOTE_NONE,
                    quotechar="",
                    escapechar=" ")
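
fastText expects each training line to carry a __label__-prefixed class followed by the text; assuming the files written above follow that convention, supervised training and evaluation would look roughly like this (the hyperparameters are illustrative):

import fasttext

# each line of the training file should look like:
# __label__A47B rotor blade assembly for a wind turbine ...
model = fasttext.train_supervised(input='training set.csv', epoch=25, lr=0.5)

# model.test returns (number_of_samples, precision@1, recall@1)
print(model.test('testing set.csv'))
print(model.predict('a rotor blade assembly for a wind turbine'))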