def train_cnn(texts, languages, labels, embeddings, parameters,
              model_serialization_path, emb_lang='default'):
    # preparing texts
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          ' Preparing texts...', flush=True)
    texts_clean = [data_helper.clean_str(t.strip()).split() for t in texts]

    # encoding languages (full name to abbreviation)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          ' Encoding languages (full name to abbreviation)...', flush=True)
    langs = [map_lang(x) for x in languages]

    # preparing training examples
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          ' Preparing training examples...', flush=True)
    x_train, y_train, dist_labels = data_shaper.prep_classification(
        texts_clean, labels, embeddings, embeddings_language=emb_lang,
        multilingual_langs=langs, numbers_token='<NUM/>',
        punct_token='<PUNC/>', add_out_of_vocabulary_terms=False)

    # defining the CNN model
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          ' Defining the CNN model...', flush=True)
    cnn_classifier = cnn.CNN(
        embeddings=(embeddings.emb_sizes[emb_lang],
                    embeddings.lang_embeddings[emb_lang]),
        num_conv_layers=parameters["num_convolutions"],
        filters=parameters["filters"],
        k_max_pools=parameters["k_max_pools"],
        manual_features_size=0)
    cnn_classifier.define_model(
        len(x_train[0]), len(dist_labels),
        loss_functions.softmax_cross_entropy,
        len(embeddings.lang_vocabularies[emb_lang]),
        l2_reg_factor=parameters["reg_factor"],
        update_embeddings=parameters["update_embeddings"])
    cnn_classifier.define_optimization(
        learning_rate=parameters["learning_rate"])
    cnn_classifier.set_distinct_labels(dist_labels)

    # initializing a TensorFlow session
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          ' Initializing a TensorFlow session...', flush=True)
    session = tf.InteractiveSession()
    session.run(tf.global_variables_initializer())

    # training the model
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          ' Training the model...', flush=True)
    simp_trainer = trainer.SimpleTrainer(cnn_classifier, session,
                                         build_feed_dict_func, eval_func,
                                         configuration_func=None)
    simp_trainer.train(
        list(zip(x_train, y_train)), parameters["batch_size"],
        parameters["num_epochs"], num_epochs_not_better_end=5,
        epoch_diff_smaller_end=parameters["epoch_diff_smaller_end"],
        print_batch_losses=True, eval_params={"dist_labels": dist_labels})

    # storing the model
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          ' Storing the model...', flush=True)
    cnn_classifier.serialize(session, model_serialization_path)
    session.close()
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          ' Model training is done!', flush=True)
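# Usage sketch (hypothetical values): the keys of `parameters` below are exactly
# the ones train_cnn reads; the concrete values, the serialization path, and the
# data variables are illustrative assumptions, not defaults of this repository.
#
# params = {"num_convolutions": 1, "filters": 100, "k_max_pools": 1,
#           "reg_factor": 0.01, "update_embeddings": False,
#           "learning_rate": 0.0001, "batch_size": 50, "num_epochs": 100,
#           "epoch_diff_smaller_end": 0.001}
# train_cnn(texts, languages, labels, embeddings, params,
#           "models/topic_cnn.model")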
def topically_scale(filenames, texts, languages, embeddings,
                    model_serialization_path, predictions_file_path,
                    parameters, emb_lang='default', stopwords=[]):
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          " Loading classifier...", flush=True)
    cnn_classifier, session = cnn.load_model(
        model_serialization_path, embeddings.lang_embeddings[emb_lang],
        loss_functions.softmax_cross_entropy, just_predict=True)
    simp_trainer = trainer.SimpleTrainer(cnn_classifier, session,
                                         build_feed_dict_func, None,
                                         configuration_func=None)

    classified_texts = {}
    items = list(zip(filenames, texts, [map_lang(x) for x in languages]))
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          " Topically classifying texts...", flush=True)
    for fn, text, lang in items:
        print(fn, flush=True)
        # splitting the text into sentences
        sentences = nltk.sent_tokenize(text)
        sents_clean = [data_helper.clean_str(s.strip()).split()
                       for s in sentences]
        langs = [lang] * len(sentences)

        # preparing prediction examples
        x_test = data_shaper.prep_classification(
            sents_clean, None, embeddings, embeddings_language=emb_lang,
            multilingual_langs=langs, numbers_token='<NUM/>',
            punct_token='<PUNC/>', add_out_of_vocabulary_terms=False,
            dist_labels=cnn_classifier.dist_labels,
            max_seq_len=cnn_classifier.max_text_length)

        results = simp_trainer.test(list(zip(x_test, [None] * len(x_test))),
                                    parameters["batch_size"],
                                    batch_size_irrelevant=True,
                                    print_batches=True)
        pred_labs = get_prediction_labels(results, cnn_classifier.dist_labels)
        print("Predictions: ", flush=True)
        print(pred_labs, flush=True)
        classified_texts[fn] = list(zip(sentences, pred_labs, langs))
        print("Languages: " + str(langs), flush=True)
        print("Done with classifying: " + fn, flush=True)

    lines_to_write = []
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          " Topical scaling...", flush=True)
    for l in cnn_classifier.dist_labels:
        # for each file, concatenating the sentences classified with label l
        label_filtered = [(fn, classified_texts[fn][0][2],
                           ' '.join([sent_label[0]
                                     for sent_label in classified_texts[fn]
                                     if sent_label[1] == l]))
                          for fn in classified_texts]
        label_filtered = [x for x in label_filtered if len(x[2].strip()) > 50]
        if len(label_filtered) > 3:
            print("Topic: " + l, flush=True)
            fns = [x[0] for x in label_filtered]
            langs = [x[1] for x in label_filtered]
            filt_texts = [x[2] for x in label_filtered]
            for i in range(len(fns)):
                io_helper.write_list(
                    os.path.join(os.path.dirname(predictions_file_path),
                                 fns[i].split(".")[0] + "_" +
                                 l.replace(" ", "-") + ".txt"),
                    [filt_texts[i]])
            label_scale = scale_efficient(
                fns, filt_texts, [inverse_map_lang(x) for x in langs],
                embeddings, None, parameters, emb_lang=emb_lang,
                stopwords=stopwords)
            lines_to_write.append("Scaling for class: " + l)
            lines_to_write.extend(
                [k + " " + str(label_scale[k]) for k in label_scale])
            lines_to_write.append("\n")
        else:
            msg = ("Topic: " + l + ": Too few files contain text of this "
                   "topic (i.e., class) to allow scaling for the topic.")
            lines_to_write.append(msg)
            print(msg, flush=True)

    io_helper.write_list(predictions_file_path, lines_to_write)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          ' Topical scaling is done!', flush=True)
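# Usage sketch (hypothetical values): reuses a model serialized by train_cnn.
# The paths, the stopword list, and the `params` dict (only "batch_size" is read
# directly here, plus whatever scale_efficient consumes) are illustrative
# assumptions.
#
# topically_scale(filenames, texts, languages, embeddings,
#                 "models/topic_cnn.model", "output/scaling.txt",
#                 params, stopwords=["the", "a", "an"])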
    # (tail of the feed-dict builder: the configuration's last entry is the
    # dropout keep probability; at prediction time dropout is disabled)
    drp = config[-1]
    fd = model.get_feed_dict(x1s_flat, x2s_flat, ys_flat,
                             1.0 if predict else drp)
    return fd, ys

# training parameters
max_num_epochs = 1000
num_evals_not_better_end = 30
eval_each_num_batches = 100
shuffle_data = False

simp_trainer = trainer.SimpleTrainer(None, None, build_feed_dict_func, None,
                                     configuration_func=prep_model_config,
                                     additional_results_func=None,
                                     model_serialization_path=model_path)
results = simp_trainer.grid_search(
    configs, train_set, dev_set, batch_size, max_num_epochs,
    num_devs_not_better_end=num_evals_not_better_end,
    batch_dev_perf=eval_each_num_batches, print_batch_losses=False,
    dev_score_maximize=False, print_training=True, shuffle_data=shuffle_data)
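# Configs sketch: the only contract visible above is that each candidate
# configuration ends with the dropout keep probability (config[-1]) and that
# prep_model_config consumes the remaining entries. The layout below is
# therefore an assumption for illustration, not the repository's actual grid.
#
# configs = [[lr, reg, drp]
#            for lr in [1e-3, 1e-4]
#            for reg in [0.001, 0.01]
#            for drp in [0.5, 0.7]]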
def test_cnn(texts, languages, labels, embeddings, model_serialization_path,
             predictions_file_path, parameters, emb_lang='default'):
    # loading the serialized model
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          ' Loading the serialized model...', flush=True)
    cnn_classifier, session = cnn.load_model(
        model_serialization_path, embeddings.lang_embeddings[emb_lang],
        loss_functions.softmax_cross_entropy, just_predict=(labels is None))

    # preparing/cleaning the texts
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          ' Preparing/cleaning the texts...', flush=True)
    texts_clean = [data_helper.clean_str(t.strip()).split() for t in texts]

    # encoding languages (full name to abbreviation)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          ' Encoding languages (full name to abbreviation)...', flush=True)
    langs = [map_lang(x) for x in languages]

    # preparing test examples
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          ' Preparing test examples...', flush=True)
    if labels:
        x_test, y_test, dist_labels = data_shaper.prep_classification(
            texts_clean, labels, embeddings, embeddings_language=emb_lang,
            multilingual_langs=langs, numbers_token='<NUM/>',
            punct_token='<PUNC/>', add_out_of_vocabulary_terms=False,
            dist_labels=cnn_classifier.dist_labels,
            max_seq_len=cnn_classifier.max_text_length)
    else:
        x_test = data_shaper.prep_classification(
            texts_clean, labels, embeddings, embeddings_language=emb_lang,
            multilingual_langs=langs, numbers_token='<NUM/>',
            punct_token='<PUNC/>', add_out_of_vocabulary_terms=False,
            dist_labels=cnn_classifier.dist_labels,
            max_seq_len=cnn_classifier.max_text_length)

    simp_trainer = trainer.SimpleTrainer(cnn_classifier, session,
                                         build_feed_dict_func,
                                         None if not labels else eval_func,
                                         configuration_func=None)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          ' Starting test...', flush=True)
    results = simp_trainer.test(
        list(zip(x_test, y_test if labels else [None] * len(x_test))),
        parameters["batch_size"],
        eval_params={"dist_labels": cnn_classifier.dist_labels},
        batch_size_irrelevant=True)

    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          ' Getting prediction labels...', flush=True)
    pred_labs = get_prediction_labels(results[0] if labels else results,
                                      cnn_classifier.dist_labels)
    if labels is None:
        io_helper.write_list(predictions_file_path, pred_labs)
    else:
        list_pairs = list(zip(pred_labs, labels))
        list_pairs.insert(0, ("Prediction", "Real label"))
        list_pairs.append(("Performance: ", str(results[1])))
        io_helper.write_list_tuples_separated(predictions_file_path,
                                              list_pairs)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          ' Prediction is done!', flush=True)
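# Usage sketch (hypothetical paths and variables): with gold labels test_cnn
# writes (prediction, real label) pairs plus the evaluation score; with
# labels=None it writes bare predictions.
#
# test_cnn(texts, languages, gold_labels, embeddings,
#          "models/topic_cnn.model", "output/eval.txt", {"batch_size": 50})
# test_cnn(texts, languages, None, embeddings,
#          "models/topic_cnn.model", "output/predictions.txt",
#          {"batch_size": 50})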
# training parameters (CLI arguments with fallback defaults)
num_maps = args.slice if args.slice else 5
lay_size = args.specsize if args.specsize else 100
lr = args.learningrate if args.learningrate else 0.0001
same_encoder = True
num_lays = 1
drp = 0.5
l2_reg_fac = 0.001
act = tf.nn.tanh
noise = 0
batch_size = 50

print("Defining the model...")
mapper_layers = [lay_size] * num_lays
model = wordpair_classifier.WordPairClassifier(
    embeddings, embedding_size, mapper_layers, same_mlp=same_encoder,
    bilinear_softmax=True, num_mappings=num_maps, activation=act,
    num_classes=len(dist_labels), noise_std=noise, dist_labels=dist_labels)
model.define_optimization(loss_functions.softmax_cross_entropy, l2_reg_fac,
                          lr, loss_function_params=None)

print("Initializing a TensorFlow session...")
session = tf.InteractiveSession()
session.run(tf.global_variables_initializer())

print("Training the model...")
coach = trainer.SimpleTrainer(model, session, build_feed_dict_func,
                              None, None, None, None)
coach.train(train_data, batch_size, 10000, num_epochs_not_better_end=20,
            epoch_diff_smaller_end=0.001, print_batch_losses=False)

print("Serializing the model...")
ser_path = args.output
pickle.dump(model.get_model(session), open(ser_path, "wb"))
print("My work here is done, ciao bella!")
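# Deserialization sketch: get_model(session) evidently returns the picklable
# model state, so restoring it is the mirror image of the dump above. How
# WordPairClassifier consumes the loaded object is an assumption here.
#
# with open(ser_path, "rb") as f:
#     stored_model = pickle.load(f)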