Example #1

# Context assumed from the surrounding module (not shown in the snippet):
from datetime import datetime
import tensorflow as tf  # TF1-style API (InteractiveSession, global_variables_initializer)
# project-local modules/helpers also used below: data_helper, data_shaper,
# cnn, trainer, loss_functions, map_lang, build_feed_dict_func, eval_func

def train_cnn(texts,
              languages,
              labels,
              embeddings,
              parameters,
              model_serialization_path,
              emb_lang='default'):
    # preparing texts
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Preparing texts...',
          flush=True)
    texts_clean = [data_helper.clean_str(t.strip()).split() for t in texts]
    # encoding languages (full name to abbreviation)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Encoding languages (full name to abbreviation)...',
          flush=True)
    langs = [map_lang(x) for x in languages]
    # preparing training examples
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Preparing training examples...',
          flush=True)
    x_train, y_train, dist_labels = data_shaper.prep_classification(
        texts_clean,
        labels,
        embeddings,
        embeddings_language=emb_lang,
        multilingual_langs=langs,
        numbers_token='<NUM/>',
        punct_token='<PUNC/>',
        add_out_of_vocabulary_terms=False)

    # defining the CNN model
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Defining the CNN model...',
          flush=True)
    cnn_classifier = cnn.CNN(embeddings=(embeddings.emb_sizes[emb_lang],
                                         embeddings.lang_embeddings[emb_lang]),
                             num_conv_layers=parameters["num_convolutions"],
                             filters=parameters["filters"],
                             k_max_pools=parameters["k_max_pools"],
                             manual_features_size=0)
    cnn_classifier.define_model(
        len(x_train[0]),
        len(dist_labels),
        loss_functions.softmax_cross_entropy,
        len(embeddings.lang_vocabularies[emb_lang]),
        l2_reg_factor=parameters["reg_factor"],
        update_embeddings=parameters["update_embeddings"])
    cnn_classifier.define_optimization(
        learning_rate=parameters["learning_rate"])
    cnn_classifier.set_distinct_labels(dist_labels)

    # initializing a Tensorflow session
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Initializing a TensorFlow session...',
          flush=True)
    session = tf.InteractiveSession()
    session.run(tf.global_variables_initializer())

    # training the model
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Training the model...',
          flush=True)
    simp_trainer = trainer.SimpleTrainer(cnn_classifier,
                                         session,
                                         build_feed_dict_func,
                                         eval_func,
                                         configuration_func=None)
    simp_trainer.train(
        list(zip(x_train, y_train)),
        parameters["batch_size"],
        parameters["num_epochs"],
        num_epochs_not_better_end=5,
        epoch_diff_smaller_end=parameters["epoch_diff_smaller_end"],
        print_batch_losses=True,
        eval_params={"dist_labels": dist_labels})

    # storing the model
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Storing the model...',
          flush=True)
    cnn_classifier.serialize(session, model_serialization_path)
    session.close()
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Model training is done!',
          flush=True)
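
For reference, a sketch of the `parameters` dict that train_cnn reads above. The keys are the ones the snippet actually accesses; every value (and the exact structure of "filters") is an illustrative assumption, not a canonical setting.

parameters = {
    "num_convolutions": 1,            # num_conv_layers for cnn.CNN
    "filters": [100],                 # per-layer filter spec (structure assumed)
    "k_max_pools": 1,                 # k passed to k-max pooling
    "reg_factor": 0.01,               # L2 regularization factor
    "update_embeddings": False,       # whether to fine-tune the embeddings
    "learning_rate": 0.0001,
    "batch_size": 50,
    "num_epochs": 100,
    "epoch_diff_smaller_end": 0.001,  # early-stopping threshold on loss change
}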
Example #2

# Additional context for this snippet (not shown): import nltk, os; project
# helpers io_helper, scale_efficient, inverse_map_lang, get_prediction_labels.

def topically_scale(filenames,
                    texts,
                    languages,
                    embeddings,
                    model_serialization_path,
                    predictions_file_path,
                    parameters,
                    emb_lang='default',
                    stopwords=None):
    # avoid a mutable default argument
    stopwords = stopwords if stopwords is not None else []
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          "  Loading classifier...",
          flush=True)
    cnn_classifier, session = cnn.load_model(
        model_serialization_path,
        embeddings.lang_embeddings[emb_lang],
        loss_functions.softmax_cross_entropy,
        just_predict=True)
    simp_trainer = trainer.SimpleTrainer(cnn_classifier,
                                         session,
                                         build_feed_dict_func,
                                         None,
                                         configuration_func=None)

    classified_texts = {}
    items = list(zip(filenames, texts, [map_lang(x) for x in languages]))
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          "  Topically classifying texts...",
          flush=True)
    for item in items:
        fn, text, lang = item
        print(fn, flush=True)
        # split text in sentences
        sentences = nltk.sent_tokenize(text)
        sents_clean = [
            data_helper.clean_str(s.strip()).split() for s in sentences
        ]
        langs = [lang] * len(sentences)

        # preparing training examples
        x_test = data_shaper.prep_classification(
            sents_clean,
            None,
            embeddings,
            embeddings_language=emb_lang,
            multilingual_langs=langs,
            numbers_token='<NUM/>',
            punct_token='<PUNC/>',
            add_out_of_vocabulary_terms=False,
            dist_labels=cnn_classifier.dist_labels,
            max_seq_len=cnn_classifier.max_text_length)

        results = simp_trainer.test(list(zip(x_test, [None] * len(x_test))),
                                    parameters["batch_size"],
                                    batch_size_irrelevant=True,
                                    print_batches=True)

        pred_labs = get_prediction_labels(results, cnn_classifier.dist_labels)
        print("Predictions: ", flush=True)
        print(pred_labs, flush=True)

        classified_texts[fn] = list(zip(sentences, pred_labs, langs))

        print("Languages: " + str(langs), flush=True)
        print("Done with classifying: " + fn, flush=True)

    lines_to_write = []
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          "  Topical scaling...",
          flush=True)
    for l in cnn_classifier.dist_labels:
        # for each file, keep its language and concatenate the sentences
        # that were classified with the current label l
        label_filtered = [(fn, classified_texts[fn][0][2], ' '.join([
            sent_label[0] for sent_label in classified_texts[fn]
            if sent_label[1] == l
        ])) for fn in classified_texts]
        label_filtered = [x for x in label_filtered if len(x[2].strip()) > 50]
        if len(label_filtered) > 3:
            print("Topic: " + l, flush=True)
            fns = [x[0] for x in label_filtered]
            langs = [x[1] for x in label_filtered]
            filt_texts = [x[2] for x in label_filtered]

            for i in range(len(fns)):
                io_helper.write_list(
                    os.path.dirname(predictions_file_path) + "/" +
                    fns[i].split(".")[0] + "_" + l.replace(" ", "-") + ".txt",
                    [filt_texts[i]])

            label_scale = scale_efficient(fns,
                                          filt_texts,
                                          [inverse_map_lang(x) for x in langs],
                                          embeddings,
                                          None,
                                          parameters,
                                          emb_lang=emb_lang,
                                          stopwords=stopwords)
            lines_to_write.append("Scaling for class: " + l)
            lines_to_write.extend(
                [k + " " + str(label_scale[k]) for k in label_scale])
            lines_to_write.append("\n")
        else:
            msg = ("Topic: " + l + ": too few files contain text of this "
                   "topic (i.e., class) to allow scaling for it.")
            lines_to_write.append(msg)
            print(msg, flush=True)

    io_helper.write_list(predictions_file_path, lines_to_write)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Topical Scaling is done!',
          flush=True)
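
For orientation, the predictions file assembled above interleaves one "Scaling for class: <label>" header with "<filename> <score>" lines per scaled topic, and a notice line for topics with too few files. All names and scores below are made up:

Scaling for class: economy
speech_A.txt 0.83
speech_B.txt -0.41
speech_C.txt 0.12
speech_D.txt 0.05

Topic: environment: too few files contain text of this topic (i.e., class) to allow scaling for it.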
Example #3

# The snippet starts mid-function: x1s_flat, x2s_flat, ys_flat, and ys are
# prepared from `data` in the truncated part of the body. The enclosing
# definition below is reconstructed (the name matches how the function is
# passed to SimpleTrainer; the exact signature is an assumption).
def build_feed_dict_func(model, data, config, predict=False):
    drp = config[-1]
    fd = model.get_feed_dict(x1s_flat, x2s_flat, ys_flat,
                             1.0 if predict else drp)
    return fd, ys


# training parameters
max_num_epochs = 1000
num_evals_not_better_end = 30
eval_each_num_batches = 100
shuffle_data = False

simp_trainer = trainer.SimpleTrainer(None,
                                     None,
                                     build_feed_dict_func,
                                     None,
                                     configuration_func=prep_model_config,
                                     additional_results_func=None,
                                     model_serialization_path=model_path)
results = simp_trainer.grid_search(
    configs,
    train_set,
    dev_set,
    batch_size,
    max_num_epochs,
    num_devs_not_better_end=num_evals_not_better_end,
    batch_dev_perf=eval_each_num_batches,
    print_batch_losses=False,
    dev_score_maximize=False,
    print_training=True,
    shuffle_data=shuffle_data)
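
A hypothetical shape for `configs`: each entry is one hyperparameter combination that prep_model_config consumes. The only structural fact visible in this snippet is that the dropout keep-probability sits at the last position (config[-1] in build_feed_dict_func above); everything else is an assumption.

configs = [(100, 0.0001, 0.5),
           (100, 0.0001, 0.7),
           (200, 0.0001, 0.5)]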
Example #4

# Additional context for this snippet (not shown): project helpers io_helper
# and get_prediction_labels, plus the modules listed in Example #1.

def test_cnn(texts,
             languages,
             labels,
             embeddings,
             model_serialization_path,
             predictions_file_path,
             parameters,
             emb_lang='default'):
    # loading the serialized model
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Loading the serialized model...',
          flush=True)
    cnn_classifier, session = cnn.load_model(
        model_serialization_path,
        embeddings.lang_embeddings[emb_lang],
        loss_functions.softmax_cross_entropy,
        just_predict=(labels is None))

    # preparing/cleaning the texts
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Preparing/cleaning the texts...',
          flush=True)
    texts_clean = [data_helper.clean_str(t.strip()).split() for t in texts]
    # encoding languages (full name to abbreviation)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Encoding languages (full name to abbreviation)...',
          flush=True)
    langs = [map_lang(x) for x in languages]
    # preparing test examples
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Preparing test examples...',
          flush=True)
    if labels:
        x_test, y_test, dist_labels = data_shaper.prep_classification(
            texts_clean,
            labels,
            embeddings,
            embeddings_language=emb_lang,
            multilingual_langs=langs,
            numbers_token='<NUM/>',
            punct_token='<PUNC/>',
            add_out_of_vocabulary_terms=False,
            dist_labels=cnn_classifier.dist_labels,
            max_seq_len=cnn_classifier.max_text_length)
    else:
        x_test = data_shaper.prep_classification(
            texts_clean,
            labels,
            embeddings,
            embeddings_language=emb_lang,
            multilingual_langs=langs,
            numbers_token='<NUM/>',
            punct_token='<PUNC/>',
            add_out_of_vocabulary_terms=False,
            dist_labels=cnn_classifier.dist_labels,
            max_seq_len=cnn_classifier.max_text_length)

    simp_trainer = trainer.SimpleTrainer(cnn_classifier,
                                         session,
                                         build_feed_dict_func,
                                         None if not labels else eval_func,
                                         configuration_func=None)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + '  Starting test...',
          flush=True)
    results = simp_trainer.test(
        list(zip(x_test, y_test if labels else [None] * len(x_test))),
        parameters["batch_size"],
        eval_params={"dist_labels": cnn_classifier.dist_labels},
        batch_size_irrelevant=True)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Getting prediction labels...',
          flush=True)
    pred_labs = get_prediction_labels(results[0] if labels else results,
                                      cnn_classifier.dist_labels)

    if labels is None:
        io_helper.write_list(predictions_file_path, pred_labs)
    else:
        list_pairs = list(zip(pred_labs, labels))
        list_pairs.insert(0, ("Prediction", "Real label"))
        list_pairs.append(("Performance: ", str(results[1])))
        io_helper.write_list_tuples_separated(predictions_file_path,
                                              list_pairs)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Prediction is done!',
          flush=True)
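
test_cnn thus runs in two modes: with gold labels it writes (prediction, gold label) pairs plus the performance score, and with labels=None it writes bare predictions. Below is a hypothetical call for the unlabeled case; all arguments are placeholders.

test_cnn(texts,
         languages,
         labels=None,                  # predictions-only mode
         embeddings=embs,
         model_serialization_path="cnn.model",
         predictions_file_path="predictions.txt",
         parameters={"batch_size": 50})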
Example #5

# Module-level script excerpt. It assumes earlier context not shown here:
# argparse results in `args`, plus embeddings, embedding_size, dist_labels,
# and train_data. Imports used below: pickle, tensorflow as tf, and the
# project-local modules wordpair_classifier, trainer, loss_functions.

# train params
num_maps = args.slice if args.slice else 5
lay_size = args.specsize if args.specsize else 100
lr = args.learningrate if args.learningrate else 0.0001
same_encoder = True
num_lays = 1
drp = 0.5
l2_reg_fac = 0.001
act = tf.nn.tanh
noise = 0
batch_size = 50

print("Defining the model...")
mapper_layers = [lay_size] * num_lays
model = wordpair_classifier.WordPairClassifier(embeddings,
                                               embedding_size,
                                               mapper_layers,
                                               same_mlp=same_encoder,
                                               bilinear_softmax=True,
                                               num_mappings=num_maps,
                                               activation=act,
                                               num_classes=len(dist_labels),
                                               noise_std=noise,
                                               dist_labels=dist_labels)
model.define_optimization(loss_functions.softmax_cross_entropy,
                          l2_reg_fac,
                          lr,
                          loss_function_params=None)

print("Initializing a TensorFlow session...")
session = tf.InteractiveSession()
session.run(tf.global_variables_initializer())

print("Training the model...")
coach = trainer.SimpleTrainer(model,
                              session,
                              build_feed_dict_func,
                              None,
                              configuration_func=None,
                              additional_results_func=None,
                              model_serialization_path=None)
coach.train(train_data,
            batch_size,
            10000,
            num_epochs_not_better_end=20,
            epoch_diff_smaller_end=0.001,
            print_batch_losses=False)

print("Serializing the model...")
ser_path = args.output
pickle.dump(model.get_model(session), open(args.output, "wb"))

print("My work here is done, ciao bella!")