def test_cnn(texts, languages, labels, embeddings, model_serialization_path,
             predictions_file_path, parameters, emb_lang='default'):
    """Run a serialized CNN text classifier over *texts* and write predictions.

    Parameters
    ----------
    texts : list of str
        Raw input documents.
    languages : list of str
        Full language name per document; mapped to abbreviations via ``map_lang``.
    labels : list or None
        Gold labels. When ``None`` the model is loaded in predict-only mode and
        only the predicted labels are written; otherwise performance is
        evaluated and (prediction, gold) pairs are written.
    embeddings
        Pre-loaded embedding container exposing ``lang_embeddings``.
    model_serialization_path : str
        Path of the serialized CNN model to load.
    predictions_file_path : str
        Output file for predictions (and performance, if labels are given).
    parameters : dict
        Runtime parameters; ``parameters["batch_size"]`` is used for testing.
    emb_lang : str
        Key into ``embeddings.lang_embeddings`` (default ``'default'``).
    """
    # Loading the serialized model. NOTE: `labels is None` is used consistently
    # below as the predict-only switch (the original mixed `is None` with bare
    # truthiness, which mishandled an empty label list).
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          ' Loading the serialized model...', flush=True)
    cnn_classifier, session = cnn.load_model(
        model_serialization_path, embeddings.lang_embeddings[emb_lang],
        loss_functions.softmax_cross_entropy, just_predict=(labels is None))

    # Preparing/cleaning the texts.
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          ' Preparing/cleaning the texts...', flush=True)
    texts_clean = [data_helper.clean_str(t.strip()).split() for t in texts]

    # Encoding languages (full name to abbreviation).
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          ' Encoding languages (full name to abbreviation)...', flush=True)
    langs = [map_lang(x) for x in languages]

    # Preparing testing examples. (Fixed log message: the original said
    # "training" in the test path.) The shaping call is identical with or
    # without labels; only the unpacking differs.
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          ' Preparing testing examples...', flush=True)
    shaped = data_shaper.prep_classification(
        texts_clean, labels, embeddings, embeddings_language=emb_lang,
        multilingual_langs=langs, numbers_token='<NUM/>',
        punct_token='<PUNC/>', add_out_of_vocabulary_terms=False,
        dist_labels=cnn_classifier.dist_labels,
        max_seq_len=cnn_classifier.max_text_length)
    if labels is not None:
        x_test, y_test, dist_labels = shaped
    else:
        x_test = shaped

    # Evaluation function is only attached when gold labels exist.
    simp_trainer = trainer.SimpleTrainer(
        cnn_classifier, session, build_feed_dict_func,
        None if labels is None else eval_func, configuration_func=None)

    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          ' Starting test...', flush=True)
    results = simp_trainer.test(
        list(zip(x_test,
                 y_test if labels is not None else [None] * len(x_test))),
        parameters["batch_size"],
        eval_params={"dist_labels": cnn_classifier.dist_labels},
        batch_size_irrelevant=True)

    # With an eval function attached, `results` is (predictions, performance);
    # in predict-only mode it is the predictions themselves.
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          ' Getting prediction labels...', flush=True)
    pred_labs = get_prediction_labels(
        results[0] if labels is not None else results,
        cnn_classifier.dist_labels)

    if labels is None:
        io_helper.write_list(predictions_file_path, pred_labs)
    else:
        list_pairs = list(zip(pred_labs, labels))
        list_pairs.insert(0, ("Prediction", "Real label"))
        list_pairs.append(("Performance: ", str(results[1])))
        io_helper.write_list_tuples_separated(predictions_file_path,
                                              list_pairs)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          ' Prediction is done!', flush=True)
def topically_scale(filenames, texts, languages, embeddings,
                    model_serialization_path, predictions_file_path,
                    parameters, emb_lang='default', stopwords=None):
    """Topically classify each text sentence-by-sentence, then scale per topic.

    Each document is sentence-tokenized, each sentence is classified with the
    serialized CNN, and for every topic label the per-document concatenation
    of that topic's sentences is scaled with ``scale_efficient`` (when at
    least 4 documents contribute more than 50 characters of text for the
    topic). Per-topic per-document texts and the scaling results are written
    next to / into ``predictions_file_path``.

    Parameters
    ----------
    filenames, texts, languages : parallel lists describing the corpus.
    embeddings : pre-loaded embedding container exposing ``lang_embeddings``.
    model_serialization_path : str — serialized CNN classifier to load.
    predictions_file_path : str — output file for the scaling report.
    parameters : dict — ``parameters["batch_size"]`` is used for inference.
    emb_lang : str — key into ``embeddings.lang_embeddings``.
    stopwords : list of str or None — stopwords forwarded to the scaler.
    """
    # Avoid the shared-mutable-default pitfall (`stopwords=[]`) while still
    # passing an empty list downstream when the caller omits the argument.
    if stopwords is None:
        stopwords = []

    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          " Loading classifier...", flush=True)
    cnn_classifier, session = cnn.load_model(
        model_serialization_path, embeddings.lang_embeddings[emb_lang],
        loss_functions.softmax_cross_entropy, just_predict=True)
    simp_trainer = trainer.SimpleTrainer(cnn_classifier, session,
                                         build_feed_dict_func, None,
                                         configuration_func=None)

    # fn -> list of (sentence, predicted_label, lang) triples
    classified_texts = {}
    items = list(zip(filenames, texts, [map_lang(x) for x in languages]))
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          " Topically classifying texts...", flush=True)
    for fn, text, lang in items:
        print(fn, flush=True)
        # Split text into sentences and clean each one.
        sentences = nltk.sent_tokenize(text)
        sents_clean = [
            data_helper.clean_str(s.strip()).split() for s in sentences
        ]
        langs = [lang] * len(sentences)

        # Shape sentences into classifier inputs (predict-only: no labels).
        x_test = data_shaper.prep_classification(
            sents_clean, None, embeddings, embeddings_language=emb_lang,
            multilingual_langs=langs, numbers_token='<NUM/>',
            punct_token='<PUNC/>', add_out_of_vocabulary_terms=False,
            dist_labels=cnn_classifier.dist_labels,
            max_seq_len=cnn_classifier.max_text_length)
        results = simp_trainer.test(list(zip(x_test, [None] * len(x_test))),
                                    parameters["batch_size"],
                                    batch_size_irrelevant=True,
                                    print_batches=True)
        pred_labs = get_prediction_labels(results, cnn_classifier.dist_labels)
        print("Predictions: ", flush=True)
        print(pred_labs, flush=True)
        classified_texts[fn] = list(zip(sentences, pred_labs, langs))
        print("Languages: " + str(langs), flush=True)
        print("Done with classifying: " + fn, flush=True)

    lines_to_write = []
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          " Topical scaling...", flush=True)
    for l in cnn_classifier.dist_labels:
        # (filename, language, concatenated text of this topic's sentences);
        # the language of a document is taken from its first sentence triple.
        label_filtered = [(fn, classified_texts[fn][0][2], ' '.join([
            sent_label[0] for sent_label in classified_texts[fn]
            if sent_label[1] == l
        ])) for fn in classified_texts]
        # Keep only documents with a non-trivial amount of on-topic text.
        label_filtered = [x for x in label_filtered if len(x[2].strip()) > 50]
        if len(label_filtered) > 3:
            print("Topic: " + l, flush=True)
            fns = [x[0] for x in label_filtered]
            langs = [x[1] for x in label_filtered]
            filt_texts = [x[2] for x in label_filtered]
            for i in range(len(fns)):
                # os.path.join instead of string "+" — the original produced
                # an absolute path ("/x.txt") when the predictions file had
                # no directory component.
                io_helper.write_list(
                    os.path.join(
                        os.path.dirname(predictions_file_path),
                        fns[i].split(".")[0] + "_" + l.replace(" ", "-") +
                        ".txt"), [filt_texts[i]])
            label_scale = scale_efficient(fns, filt_texts,
                                          [inverse_map_lang(x) for x in langs],
                                          embeddings, None, parameters,
                                          emb_lang=emb_lang,
                                          stopwords=stopwords)
            lines_to_write.append("Scaling for class: " + l)
            lines_to_write.extend(
                [k + " " + str(label_scale[k]) for k in label_scale])
            lines_to_write.append("\n")
        else:
            # Message text kept byte-identical; deduplicated into a local.
            msg = (
                "Topic: " + l +
                ": Insufficient number of files contains text of this topic (i.e., class) in order to allow for scaling for the topic."
            )
            lines_to_write.append(msg)
            print(msg, flush=True)
    io_helper.write_list(predictions_file_path, lines_to_write)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          ' Topical Scaling is done!', flush=True)
args.output) == "": print("Error: Directory of the output file does not exist.") exit(code=1) if args.stopwords and not os.path.isfile(args.stopwords): print("Error: File containing stopwords not found.") exit(code=1) if args.stopwords: stopwords = io_helper.load_file_lines(args.stopwords) else: stopwords = None files = io_helper.load_all_files(args.datadir) corp = corpus.Corpus(files) corp.tokenize(stopwords=stopwords, freq_treshold=ft) corp.build_occurrences() wf_scaler = scaler.WordfishScaler(corp) wf_scaler.initialize() wf_scaler.train(learning_rate=lr, num_iters=niter) print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " WordFish scaling completed.", flush=True) scale = [] for x in corp.results: scale.append(str(x) + "\t" + str(corp.results[x])) io_helper.write_list(args.output, scale)