示例#1
0
文件: nlp.py 项目: umanlp/SemScale
def test_cnn(texts,
             languages,
             labels,
             embeddings,
             model_serialization_path,
             predictions_file_path,
             parameters,
             emb_lang='default'):
    """Load a serialized CNN classifier, predict labels for *texts*, and
    write predictions (plus evaluation, when gold labels are given) to
    *predictions_file_path*.

    :param texts: raw input documents (strings)
    :param languages: full language name per document (mapped via map_lang)
    :param labels: gold labels, or None for predict-only mode
    :param embeddings: object exposing a ``lang_embeddings`` dict
    :param model_serialization_path: path of the serialized CNN model
    :param predictions_file_path: destination file for the predictions
    :param parameters: dict; must contain "batch_size"
    :param emb_lang: key into ``embeddings.lang_embeddings``
    """
    # One flag for "gold labels were supplied". The original mixed
    # `labels is None` with truthiness checks (`if labels:`), which
    # disagree for an empty label list and would then crash on
    # `results[1]` below. Normalized to a single explicit check.
    has_labels = labels is not None

    # loading the serialized model
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Loading the serialized model...',
          flush=True)
    cnn_classifier, session = cnn.load_model(
        model_serialization_path,
        embeddings.lang_embeddings[emb_lang],
        loss_functions.softmax_cross_entropy,
        just_predict=not has_labels)

    # preparing/cleaning the texts
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Preparing/cleaning the texts...',
          flush=True)
    texts_clean = [data_helper.clean_str(t.strip()).split() for t in texts]

    # encoding languages (full name to abbreviation)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Encoding languages (full name to abbreviation)...',
          flush=True)
    langs = [map_lang(x) for x in languages]

    # preparing testing examples
    # (log message fixed: this is the test path, not training)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Preparing testing examples...',
          flush=True)
    if has_labels:
        x_test, y_test, dist_labels = data_shaper.prep_classification(
            texts_clean,
            labels,
            embeddings,
            embeddings_language=emb_lang,
            multilingual_langs=langs,
            numbers_token='<NUM/>',
            punct_token='<PUNC/>',
            add_out_of_vocabulary_terms=False,
            dist_labels=cnn_classifier.dist_labels,
            max_seq_len=cnn_classifier.max_text_length)
    else:
        x_test = data_shaper.prep_classification(
            texts_clean,
            labels,
            embeddings,
            embeddings_language=emb_lang,
            multilingual_langs=langs,
            numbers_token='<NUM/>',
            punct_token='<PUNC/>',
            add_out_of_vocabulary_terms=False,
            dist_labels=cnn_classifier.dist_labels,
            max_seq_len=cnn_classifier.max_text_length)

    # evaluation function only makes sense when gold labels exist
    simp_trainer = trainer.SimpleTrainer(cnn_classifier,
                                         session,
                                         build_feed_dict_func,
                                         eval_func if has_labels else None,
                                         configuration_func=None)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + '  Starting test...',
          flush=True)
    results = simp_trainer.test(
        list(zip(x_test, y_test if has_labels else [None] * len(x_test))),
        parameters["batch_size"],
        eval_params={"dist_labels": cnn_classifier.dist_labels},
        batch_size_irrelevant=True)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Getting prediction labels...',
          flush=True)
    # with an eval function, test() returns (predictions, performance);
    # without one it returns just the predictions
    pred_labs = get_prediction_labels(results[0] if has_labels else results,
                                      cnn_classifier.dist_labels)

    if not has_labels:
        io_helper.write_list(predictions_file_path, pred_labs)
    else:
        # pair predictions with gold labels and append overall performance
        list_pairs = list(zip(pred_labs, labels))
        list_pairs.insert(0, ("Prediction", "Real label"))
        list_pairs.append(("Performance: ", str(results[1])))
        io_helper.write_list_tuples_separated(predictions_file_path,
                                              list_pairs)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Prediction is done!',
          flush=True)
示例#2
0
文件: nlp.py 项目: umanlp/SemScale
def topically_scale(filenames,
                    texts,
                    languages,
                    embeddings,
                    model_serialization_path,
                    predictions_file_path,
                    parameters,
                    emb_lang='default',
                    stopwords=None):
    """Topically classify each document's sentences with a serialized CNN,
    then scale the per-topic text collections via scale_efficient.

    :param filenames: one name per document (used as keys and output names)
    :param texts: raw document texts, parallel to *filenames*
    :param languages: full language name per document (mapped via map_lang)
    :param embeddings: object exposing a ``lang_embeddings`` dict
    :param model_serialization_path: path of the serialized CNN model
    :param predictions_file_path: output file for the scaling report;
        per-topic text dumps are written to its directory
    :param parameters: dict; must contain "batch_size"
    :param emb_lang: key into ``embeddings.lang_embeddings``
    :param stopwords: optional stopword list forwarded to scale_efficient
        (defaults to an empty list)
    """
    # Avoid the mutable-default-argument pitfall of the original
    # `stopwords=[]`; callers see identical behavior.
    stopwords = [] if stopwords is None else stopwords

    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          "  Loading classifier...",
          flush=True)
    cnn_classifier, session = cnn.load_model(
        model_serialization_path,
        embeddings.lang_embeddings[emb_lang],
        loss_functions.softmax_cross_entropy,
        just_predict=True)
    simp_trainer = trainer.SimpleTrainer(cnn_classifier,
                                         session,
                                         build_feed_dict_func,
                                         None,
                                         configuration_func=None)

    classified_texts = {}
    items = list(zip(filenames, texts, [map_lang(x) for x in languages]))
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          "  Topically classifying texts...",
          flush=True)
    for fn, text, lang in items:
        print(fn, flush=True)
        # split the document into sentences and clean each one
        sentences = nltk.sent_tokenize(text)
        sents_clean = [
            data_helper.clean_str(s.strip()).split() for s in sentences
        ]
        langs = [lang] * len(sentences)

        # preparing prediction examples (no labels at test time)
        x_test = data_shaper.prep_classification(
            sents_clean,
            None,
            embeddings,
            embeddings_language=emb_lang,
            multilingual_langs=langs,
            numbers_token='<NUM/>',
            punct_token='<PUNC/>',
            add_out_of_vocabulary_terms=False,
            dist_labels=cnn_classifier.dist_labels,
            max_seq_len=cnn_classifier.max_text_length)

        results = simp_trainer.test(list(zip(x_test, [None] * len(x_test))),
                                    parameters["batch_size"],
                                    batch_size_irrelevant=True,
                                    print_batches=True)

        pred_labs = get_prediction_labels(results, cnn_classifier.dist_labels)
        print("Predictions: ", flush=True)
        print(pred_labs, flush=True)

        classified_texts[fn] = list(zip(sentences, pred_labs, langs))

        print("Languages: " + str(langs), flush=True)
        print("Done with classifying: " + fn, flush=True)

    lines_to_write = []
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          "  Topical scaling...",
          flush=True)
    for label in cnn_classifier.dist_labels:
        # For every file, concatenate the sentences predicted as `label`.
        # Documents with zero sentences are skipped: the original would
        # crash indexing classified_texts[fn][0][2] for them.
        label_filtered = [(fn, classified_texts[fn][0][2], ' '.join([
            sent_label[0] for sent_label in classified_texts[fn]
            if sent_label[1] == label
        ])) for fn in classified_texts if classified_texts[fn]]
        # keep only files with a non-trivial amount of text for this topic
        label_filtered = [x for x in label_filtered if len(x[2].strip()) > 50]
        if len(label_filtered) > 3:
            print("Topic: " + label, flush=True)
            fns = [x[0] for x in label_filtered]
            langs = [x[1] for x in label_filtered]
            filt_texts = [x[2] for x in label_filtered]

            # dump each file's topic-filtered text next to the report file;
            # os.path.join also fixes the original "" + "/" + name case
            # (a bare relative predictions path produced an absolute /name)
            out_dir = os.path.dirname(predictions_file_path)
            for i in range(len(fns)):
                out_name = (fns[i].split(".")[0] + "_" +
                            label.replace(" ", "-") + ".txt")
                io_helper.write_list(os.path.join(out_dir, out_name),
                                     [filt_texts[i]])

            label_scale = scale_efficient(fns,
                                          filt_texts,
                                          [inverse_map_lang(x) for x in langs],
                                          embeddings,
                                          None,
                                          parameters,
                                          emb_lang=emb_lang,
                                          stopwords=stopwords)
            lines_to_write.append("Scaling for class: " + label)
            lines_to_write.extend(
                [k + " " + str(label_scale[k]) for k in label_scale])
            lines_to_write.append("\n")
        else:
            # build the message once instead of duplicating the literal
            # for both the report file and stdout
            skip_msg = (
                "Topic: " + label +
                ": Insufficient number of files contains text of this topic "
                "(i.e., class) in order to allow for scaling for the topic.")
            lines_to_write.append(skip_msg)
            print(skip_msg, flush=True)

    io_helper.write_list(predictions_file_path, lines_to_write)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Topical Scaling is done!',
          flush=True)
示例#3
0
        args.output) == "":
    print("Error: Directory of the output file does not exist.")
    exit(code=1)

# Bail out when a stopword file was named but does not exist.
if args.stopwords and not os.path.isfile(args.stopwords):
    print("Error: File containing stopwords not found.")
    exit(code=1)

# Load the stopword list when given; None disables stopword filtering.
stopwords = io_helper.load_file_lines(args.stopwords) if args.stopwords else None

# Build the corpus and its word-occurrence statistics.
files = io_helper.load_all_files(args.datadir)
corp = corpus.Corpus(files)
corp.tokenize(stopwords=stopwords, freq_treshold=ft)
corp.build_occurrences()

# Fit the WordFish scaling model.
wf_scaler = scaler.WordfishScaler(corp)
wf_scaler.initialize()
wf_scaler.train(learning_rate=lr, num_iters=niter)

print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
      " WordFish scaling completed.",
      flush=True)

# One "<document>\t<position>" line per scaled item.
scale = [str(item) + "\t" + str(corp.results[item]) for item in corp.results]
io_helper.write_list(args.output, scale)