            NN_PARAMETER_SEARCH_PREFIX.format(classifications_type,
                                              NN_BATCH_SIZE)))

if load_existing_results:
    param_results_path = os.path.join(
        nn_parameter_search_location, GLOBAL_VARS.MODEL_NAME,
        NN_PARAMETER_SEARCH_PREFIX.format(classifications_type, NN_BATCH_SIZE))
    if os.path.exists(param_results_path):
        info('Loading previous results from {}'.format(param_results_path))
        # binary mode: the results were pickled as binary data
        param_results_dict = pickle.load(open(param_results_path, 'rb'))
    else:
        info('No previous results exist in {}'.format(param_results_path))

ensure_disk_location_exists(
    os.path.join(nn_parameter_search_location, GLOBAL_VARS.MODEL_NAME))

for parameters in param_sampler:
    start_time = time.time()
    # unpack the sampled hyperparameters for this search iteration
    first_hidden_layer_size = parameters['first_hidden_layer_size']
    first_hidden_layer_activation = parameters['first_hidden_layer_activation']
    second_hidden_layer_size = parameters['second_hidden_layer_size']
    second_hidden_layer_activation = parameters['second_hidden_layer_activation']
    input_dropout_do = parameters['input_dropout']
    hidden_dropout_do = parameters['hidden_dropout']
    second_hidden_dropout_do = parameters['second_hidden_dropout']

    # the model name encodes the hyperparameter combination being evaluated
    GLOBAL_VARS.NN_MODEL_NAME = (
        'nn_1st-size_{}_1st-act_{}_2nd-size_{}_2nd-act_{}'
        '_in-drop_{}_hid-drop_{}').format(
            first_hidden_layer_size, first_hidden_layer_activation,
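# `param_sampler` is not defined in this excerpt. A minimal sketch, assuming
# it comes from scikit-learn's ParameterSampler; the value lists below are
# illustrative placeholders, not the grid used in the original search:
from sklearn.model_selection import ParameterSampler

nn_param_distributions = {
    'first_hidden_layer_size': [512, 1024, 2048],
    'first_hidden_layer_activation': ['relu', 'sigmoid', 'tanh'],
    'second_hidden_layer_size': [256, 512, 1024],
    'second_hidden_layer_activation': ['relu', 'sigmoid', 'tanh'],
    'input_dropout': [0.0, 0.2, 0.5],
    'hidden_dropout': [0.0, 0.2, 0.5],
    'second_hidden_dropout': [0.0, 0.2, 0.5],
}
# draw a fixed number of random hyperparameter combinations to iterate over
param_sampler = ParameterSampler(nn_param_distributions, n_iter=20,
                                 random_state=42)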
# Validation Metrics
info('Evaluating on Validation Data')
yvp = clf.predict(Xv)
yvp_score = clf.decision_function(Xv)
validation_metrics = get_metrics(yv, yvp_score, yvp)
print "** Validation Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t " \
      "Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t " \
      "F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
          validation_metrics['coverage_error'],
          validation_metrics['average_num_of_labels'],
          validation_metrics['top_1'], validation_metrics['top_3'],
          validation_metrics['top_5'], validation_metrics['f1_micro'],
          validation_metrics['f1_macro'],
          validation_metrics['total_positive'])

# Dump the classifier and metrics
data_folder = os.path.join(svm_location, SVM_MODEL_NAME, data_type)
ensure_disk_location_exists(data_folder)
# binary mode ('wb') so the pickles are portable across platforms
pickle.dump(clf,
            open(os.path.join(data_folder,
                              CLASSIFIER_FILE.format(classifications_type)),
                 'wb'))
pickle.dump(training_metrics,
            open(os.path.join(
                data_folder,
                TRAINING_METRICS_FILENAME.format(classifications_type)), 'wb'))
pickle.dump(validation_metrics,
            open(os.path.join(data_folder,
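# `get_metrics` is defined elsewhere in the repo; this is a minimal sketch of
# the quantities reported above, assuming dense 0/1 label matrices and
# standard scikit-learn metrics. The top-k definition used here (at least one
# true label among the k highest-scoring labels) and the meaning of
# `total_positive` are assumptions, not the original implementation:
import numpy as np
from sklearn.metrics import coverage_error, f1_score


def get_metrics(y_true, y_score, y_pred):
    def top_k(k):
        # indices of the k highest-scoring labels per sample
        top = np.argsort(-y_score, axis=1)[:, :k]
        hits = [y_true[i, top[i]].any() for i in range(y_true.shape[0])]
        return float(np.mean(hits))

    return {
        'coverage_error': coverage_error(y_true, y_score),
        'average_num_of_labels': float(y_true.sum(axis=1).mean()),
        'top_1': top_k(1),
        'top_3': top_k(3),
        'top_5': top_k(5),
        'f1_micro': f1_score(y_true, y_pred, average='micro'),
        'f1_macro': f1_score(y_true, y_pred, average='macro'),
        'total_positive': int(y_true.sum()),  # assumed: total true labels
    }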
            sequence_insert_location, use_get=False)

info("Filling validation matrix")
fill_matrix(Xv_data, validation_dict, validation_docs_list,
            sequence_insert_location, use_get=True)
info("Filling test matrix")
fill_matrix(Xt_data, test_dict, test_docs_list,
            sequence_insert_location, use_get=True)

ensure_disk_location_exists(
    os.path.join(matrices_save_location, GLOBAL_VARS.DOC2VEC_RAW_MODEL_NAME))

info("Saving training matrix")
# binary mode ('wb'): np.save writes a binary .npy file
np.save(open(os.path.join(matrices_save_location,
                          GLOBAL_VARS.DOC2VEC_RAW_MODEL_NAME,
                          TRAINING_DATA_MATRIX.format(LEVEL_TO_GENERATE)),
             'wb'),
        X_data)
info("Saving validation matrix")
np.save(open(os.path.join(matrices_save_location,
                          GLOBAL_VARS.DOC2VEC_RAW_MODEL_NAME,
                          VALIDATION_DATA_MATRIX.format(LEVEL_TO_GENERATE)),
             'wb'),
        Xv_data)
info("Saving test matrix")
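# `fill_matrix` is defined elsewhere. A minimal sketch, assuming each row of
# the pre-allocated matrix receives one document's vector starting at column
# `insert_location`, and that `use_get=True` skips documents without a stored
# vector instead of raising. All of this is an assumption about the original
# helper, not a reconstruction of it:
def fill_matrix(matrix, vectors_dict, docs_list, insert_location,
                use_get=False):
    for row, doc_id in enumerate(docs_list):
        vec = vectors_dict.get(doc_id) if use_get else vectors_dict[doc_id]
        if vec is None:
            continue  # no stored vector for this document
        matrix[row, insert_location:insert_location + len(vec)] = vec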
subclasses_file = os.path.join(exports_location, "subclasses.pkl")
valid_classes_file = os.path.join(exports_location, "valid_classes.pkl")
valid_subclasses_file = os.path.join(exports_location, "valid_subclasses.pkl")
classifications_file = os.path.join(exports_location, "classifications.pkl")
doc_lengths_map_file = os.path.join(exports_location, "doc_lengths_map.pkl")
training_docs_list_file = os.path.join(exports_location,
                                       "training_docs_list.pkl")
validation_docs_list_file = os.path.join(exports_location,
                                         "validation_docs_list.pkl")
test_docs_list_file = os.path.join(exports_location, "test_docs_list.pkl")
docs_only_preprocessed_file = os.path.join(root_location, "preprocessed_data",
                                           "docs_only_file.txt")

bow_data_location = os.path.join(root_location, "bow_data")
ensure_disk_location_exists(bow_data_location)

## Load utility data
doc_classification_map = pickle.load(open(doc_classification_map_file, 'rb'))
sections = pickle.load(open(sections_file, 'rb'))
classes = pickle.load(open(classes_file, 'rb'))
subclasses = pickle.load(open(subclasses_file, 'rb'))
valid_classes = pickle.load(open(valid_classes_file, 'rb'))
valid_subclasses = pickle.load(open(valid_subclasses_file, 'rb'))
training_docs_list = pickle.load(open(training_docs_list_file, 'rb'))
validation_docs_list = pickle.load(open(validation_docs_list_file, 'rb'))
test_docs_list = pickle.load(open(test_docs_list_file, 'rb'))
classifications_index = pickle.load(open(classifications_index_file, 'rb'))
doc_count = len(doc_classification_map)
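# Illustration only (not in the original file): the loaded maps are typically
# combined into a multilabel indicator matrix for training. A sketch using
# scikit-learn's MultiLabelBinarizer; `build_label_matrix` is a hypothetical
# helper name, and the assumption is that `doc_classification_map` maps a
# doc id to its list of classification codes:
from sklearn.preprocessing import MultiLabelBinarizer


def build_label_matrix(docs_list, doc_classification_map, label_set):
    mlb = MultiLabelBinarizer(classes=sorted(label_set))
    # keep only labels that belong to the chosen label set
    labels = [[c for c in doc_classification_map[doc_id] if c in label_set]
              for doc_id in docs_list]
    return mlb.fit_transform(labels)

# e.g. Y = build_label_matrix(training_docs_list, doc_classification_map,
#                             valid_subclasses)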
exports_location = root_location + "exported_data/"
classifications_index_file = os.path.join(exports_location,
                                          "classifications_index.pkl")
doc_classification_map_file = os.path.join(exports_location,
                                           "doc_classification_map.pkl")
sections_file = os.path.join(exports_location, "sections.pkl")
valid_classes_file = os.path.join(exports_location, "valid_classes.pkl")
valid_subclasses_file = os.path.join(exports_location, "valid_subclasses.pkl")
classifications_file = os.path.join(exports_location, "classifications.pkl")
doc_lengths_map_file = os.path.join(exports_location, "doc_lengths_map.pkl")
training_docs_list_file = os.path.join(exports_location,
                                       "training_docs_list.pkl")
validation_docs_list_file = os.path.join(exports_location,
                                         "validation_docs_list.pkl")
test_docs_list_file = os.path.join(exports_location, "test_docs_list.pkl")

preprocessed_location = os.path.join(root_location, "preprocessed_data",
                                     "separated_datasets")
ensure_disk_location_exists(preprocessed_location)
training_preprocessed_files_prefix = os.path.join(
    preprocessed_location, "training_docs_data_preprocessed-")
validation_preprocessed_files_prefix = os.path.join(
    preprocessed_location, "validation_docs_data_preprocessed-")
test_preprocessed_files_prefix = os.path.join(
    preprocessed_location, "test_docs_data_preprocessed-")
docs_only_preprocessed_file = os.path.join(root_location, "preprocessed_data",
                                           "docs_only_file.txt")

doc_classification_map = pickle.load(open(doc_classification_map_file, 'rb'))
sections = pickle.load(open(sections_file, 'rb'))
valid_classes = pickle.load(open(valid_classes_file, 'rb'))
valid_subclasses = pickle.load(open(valid_subclasses_file, 'rb'))
training_docs_list = pickle.load(open(training_docs_list_file, 'rb'))
validation_docs_list = pickle.load(open(validation_docs_list_file, 'rb'))
test_docs_list = pickle.load(open(test_docs_list_file, 'rb'))
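# The repeated load pattern above could be collapsed with a small helper
# (hypothetical, not part of the original repo):
def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

# e.g. training_docs_list = load_pickle(training_docs_list_file)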
root_location = "../../data/" preprocessed_location = result = root_location + "preprocessed_data/separated_datasets/" training_preprocessed_files_prefix = os.path.join( preprocessed_location, "training_docs_data_preprocessed-") validation_preprocessed_files_prefix = os.path.join( preprocessed_location, "validation_docs_data_preprocessed-") test_preprocessed_files_prefix = os.path.join(preprocessed_location, "test_docs_data_preprocessed-") # loading vocabulary if exists doc2vec_model_save_location = os.path.join( root_location, "parameter_search_doc2vec_models_" + str(level) + '_' + model_name, "full") ensure_disk_location_exists(doc2vec_model_save_location) ensure_disk_location_exists( os.path.join(doc2vec_model_save_location, VOCAB_MODEL)) placeholder_model_name = 'doc2vec_size_{}_w_{}_type_{}_concat_{}_mean_{}_trainwords_{}_hs_{}_neg_{}_vocabsize_{}_model_{}'.format( DOC2VEC_SIZE, DOC2VEC_WINDOW, 'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow', DOC2VEC_CONCAT, DOC2VEC_MEAN, DOC2VEC_TRAIN_WORDS, DOC2VEC_HIERARCHICAL_SAMPLE, DOC2VEC_NEGATIVE_SAMPLE_SIZE, str(DOC2VEC_MAX_VOCAB_SIZE), str(level) + '_' + model_name) GLOBAL_VARS.DOC2VEC_MODEL_NAME = placeholder_model_name placeholder_model_name = os.path.join(placeholder_model_name, "epoch_{}") info("FILE " + os.path.join(doc2vec_model_save_location, VOCAB_MODEL, MODEL_PREFIX)) doc2vec_model = Doc2Vec( size=DOC2VEC_SIZE,