Example #1
            NN_PARAMETER_SEARCH_PREFIX.format(classifications_type,
                                              NN_BATCH_SIZE)))

    if load_existing_results:
        param_results_path = os.path.join(
            nn_parameter_search_location, GLOBAL_VARS.MODEL_NAME,
            NN_PARAMETER_SEARCH_PREFIX.format(classifications_type,
                                              NN_BATCH_SIZE))
        if os.path.exists(param_results_path):
            info('Loading previous results from {}'.format(param_results_path))
            param_results_dict = pickle.load(open(param_results_path, 'rb'))
        else:
            info('No previous results exist in {}'.format(param_results_path))

    ensure_disk_location_exists(
        os.path.join(nn_parameter_search_location, GLOBAL_VARS.MODEL_NAME))

    for parameters in param_sampler:
        start_time = time.time()
        first_hidden_layer_size = parameters['first_hidden_layer_size']
        first_hidden_layer_activation = parameters[
            'first_hidden_layer_activation']
        second_hidden_layer_size = parameters['second_hidden_layer_size']
        second_hidden_layer_activation = parameters[
            'second_hidden_layer_activation']
        input_dropout_do = parameters['input_dropout']
        hidden_dropout_do = parameters['hidden_dropout']
        second_hidden_dropout_do = parameters['second_hidden_dropout']

        GLOBAL_VARS.NN_MODEL_NAME = 'nn_1st-size_{}_1st-act_{}_2nd-size_{}_2nd-act_{}_in-drop_{}_hid-drop_{}'.format(
            first_hidden_layer_size, first_hidden_layer_activation,
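# --- Supplementary sketch (not from the original source) ---
# The loop above consumes `param_sampler`, which this excerpt never defines.
# One plausible way to build it is scikit-learn's ParameterSampler; the grid
# values below are illustrative assumptions, not the original search space.
from sklearn.model_selection import ParameterSampler

param_grid = {
    'first_hidden_layer_size': [256, 512, 1024],
    'first_hidden_layer_activation': ['relu', 'tanh'],
    'second_hidden_layer_size': [128, 256],
    'second_hidden_layer_activation': ['relu', 'tanh'],
    'input_dropout': [0.0, 0.2],
    'hidden_dropout': [0.0, 0.5],
    'second_hidden_dropout': [0.0, 0.5],
}
param_sampler = ParameterSampler(param_grid, n_iter=20, random_state=42)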
Example #2
# Validation Metrics
info('Evaluating on Validation Data')
yvp = clf.predict(Xv)
yvp_score = clf.decision_function(Xv)
validation_metrics = get_metrics(yv, yvp_score, yvp)
print "** Validation Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
    validation_metrics['coverage_error'],
    validation_metrics['average_num_of_labels'], validation_metrics['top_1'],
    validation_metrics['top_3'], validation_metrics['top_5'],
    validation_metrics['f1_micro'], validation_metrics['f1_macro'],
    validation_metrics['total_positive'])

# Dump the classifier and metrics
data_folder = os.path.join(svm_location, SVM_MODEL_NAME, data_type)
ensure_disk_location_exists(data_folder)
pickle.dump(
    clf,
    open(
        os.path.join(data_folder,
                     CLASSIFIER_FILE.format(classifications_type)), "wb"))
pickle.dump(
    training_metrics,
    open(
        os.path.join(data_folder,
                     TRAINING_METRICS_FILENAME.format(classifications_type)),
        "wb"))
pickle.dump(
    validation_metrics,
    open(
        os.path.join(data_folder,
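# --- Supplementary sketch (not from the original source) ---
# `get_metrics` is not shown in this excerpt. Below is a minimal, assumed
# implementation of the keys the print statement reads, built on scikit-learn;
# the top-k scores here are a plain "any true label among the top k" hit rate.
import numpy as np
from sklearn.metrics import coverage_error, f1_score

def get_metrics(y_true, y_score, y_pred):
    y_true = np.asarray(y_true)
    ranked = np.argsort(-y_score, axis=1)  # labels sorted best-first per doc
    metrics = {
        'coverage_error': coverage_error(y_true, y_score),
        'average_num_of_labels': float(y_true.sum(axis=1).mean()),
        'f1_micro': f1_score(y_true, y_pred, average='micro'),
        'f1_macro': f1_score(y_true, y_pred, average='macro'),
        'total_positive': int(np.asarray(y_pred).sum()),
    }
    for k in (1, 3, 5):
        hits = [y_true[i, ranked[i, :k]].any() for i in range(y_true.shape[0])]
        metrics['top_{}'.format(k)] = float(np.mean(hits))
    return metrics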
Example #3
                    sequence_insert_location,
                    use_get=False)
        info("Filling validation matrix")
        fill_matrix(Xv_data,
                    validation_dict,
                    validation_docs_list,
                    sequence_insert_location,
                    use_get=True)
        info("Filling test matrix")
        fill_matrix(Xt_data,
                    test_dict,
                    test_docs_list,
                    sequence_insert_location,
                    use_get=True)

ensure_disk_location_exists(
    os.path.join(matrices_save_location, GLOBAL_VARS.DOC2VEC_RAW_MODEL_NAME))
info("Saving training matrix")
np.save(
    open(
        os.path.join(matrices_save_location,
                     GLOBAL_VARS.DOC2VEC_RAW_MODEL_NAME,
                     TRAINING_DATA_MATRIX.format(LEVEL_TO_GENERATE)), "wb"),
    X_data)
info("Saving validation matrix")
np.save(
    open(
        os.path.join(matrices_save_location,
                     GLOBAL_VARS.DOC2VEC_RAW_MODEL_NAME,
                     VALIDATION_DATA_MATRIX.format(LEVEL_TO_GENERATE)), "wb"),
    Xv_data)
info("Saving test matrix")
Example #4
subclasses_file = os.path.join(exports_location, "subclasses.pkl")
valid_classes_file = os.path.join(exports_location, "valid_classes.pkl")
valid_subclasses_file = os.path.join(exports_location, "valid_subclasses.pkl")
classifications_file = os.path.join(exports_location, "classifications.pkl")
doc_lengths_map_file = os.path.join(exports_location, "doc_lengths_map.pkl")
training_docs_list_file = os.path.join(exports_location,
                                       "training_docs_list.pkl")
validation_docs_list_file = os.path.join(exports_location,
                                         "validation_docs_list.pkl")
test_docs_list_file = os.path.join(exports_location, "test_docs_list.pkl")

docs_only_preprocessed_file = os.path.join(root_location, "preprocessed_data",
                                           "docs_only_file.txt")
bow_data_location = os.path.join(root_location, "bow_data")

ensure_disk_location_exists(bow_data_location)

## Load utility data

doc_classification_map = pickle.load(open(doc_classification_map_file, 'rb'))
sections = pickle.load(open(sections_file, 'rb'))
classes = pickle.load(open(classes_file, 'rb'))
subclasses = pickle.load(open(subclasses_file, 'rb'))
valid_classes = pickle.load(open(valid_classes_file, 'rb'))
valid_subclasses = pickle.load(open(valid_subclasses_file, 'rb'))
training_docs_list = pickle.load(open(training_docs_list_file, 'rb'))
validation_docs_list = pickle.load(open(validation_docs_list_file, 'rb'))
test_docs_list = pickle.load(open(test_docs_list_file, 'rb'))
classifications_index = pickle.load(open(classifications_index_file, 'rb'))

doc_count = len(doc_classification_map)
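# --- Supplementary sketch (not from the original source) ---
# The repeated pickle loads above could be collapsed into one small helper;
# a minimal version, assuming every export is a plain pickle file under
# `exports_location`.
import os
import pickle

def load_export(name):
    with open(os.path.join(exports_location, name + '.pkl'), 'rb') as f:
        return pickle.load(f)

# e.g. training_docs_list = load_export('training_docs_list')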
Example #5
exports_location = root_location + "exported_data/"

classifications_index_file = os.path.join(exports_location, "classifications_index.pkl")
doc_classification_map_file = os.path.join(exports_location, "doc_classification_map.pkl")
sections_file = os.path.join(exports_location, "sections.pkl")
valid_classes_file = os.path.join(exports_location, "valid_classes.pkl")
valid_subclasses_file = os.path.join(exports_location, "valid_subclasses.pkl")
classifications_file = os.path.join(exports_location, "classifications.pkl")
doc_lengths_map_file = os.path.join(exports_location, "doc_lengths_map.pkl")
training_docs_list_file = os.path.join(exports_location, "training_docs_list.pkl")
validation_docs_list_file = os.path.join(exports_location, "validation_docs_list.pkl")
test_docs_list_file = os.path.join(exports_location, "test_docs_list.pkl")

preprocessed_location = os.path.join(root_location, "preprocessed_data", "separated_datasets")

ensure_disk_location_exists(preprocessed_location)

training_preprocessed_files_prefix = os.path.join(preprocessed_location, "training_docs_data_preprocessed-")
validation_preprocessed_files_prefix = os.path.join(preprocessed_location, "validation_docs_data_preprocessed-")
test_preprocessed_files_prefix = os.path.join(preprocessed_location, "test_docs_data_preprocessed-")

docs_only_preprocessed_file = os.path.join(root_location,"preprocessed_data", "docs_only_file.txt")


doc_classification_map = pickle.load(open(doc_classification_map_file, 'rb'))
sections = pickle.load(open(sections_file, 'rb'))
valid_classes = pickle.load(open(valid_classes_file, 'rb'))
valid_subclasses = pickle.load(open(valid_subclasses_file, 'rb'))
training_docs_list = pickle.load(open(training_docs_list_file, 'rb'))
validation_docs_list = pickle.load(open(validation_docs_list_file, 'rb'))
test_docs_list = pickle.load(open(test_docs_list_file, 'rb'))
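# --- Supplementary sketch (not from the original source) ---
# The trailing "-" in the *_preprocessed_files_prefix paths suggests the
# datasets are sharded as prefix + shard id. A minimal, assumed iterator over
# such shards; the exact naming scheme is a guess.
import glob

def iter_preprocessed_lines(prefix):
    for path in sorted(glob.glob(prefix + '*')):
        with open(path) as f:
            for line in f:
                yield line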
Example #6
root_location = "../../data/"

preprocessed_location = root_location + "preprocessed_data/separated_datasets/"

training_preprocessed_files_prefix = os.path.join(
    preprocessed_location, "training_docs_data_preprocessed-")
validation_preprocessed_files_prefix = os.path.join(
    preprocessed_location, "validation_docs_data_preprocessed-")
test_preprocessed_files_prefix = os.path.join(preprocessed_location,
                                              "test_docs_data_preprocessed-")

# Load the vocabulary model if it already exists
doc2vec_model_save_location = os.path.join(
    root_location,
    "parameter_search_doc2vec_models_" + str(level) + '_' + model_name, "full")
ensure_disk_location_exists(doc2vec_model_save_location)
ensure_disk_location_exists(
    os.path.join(doc2vec_model_save_location, VOCAB_MODEL))
placeholder_model_name = 'doc2vec_size_{}_w_{}_type_{}_concat_{}_mean_{}_trainwords_{}_hs_{}_neg_{}_vocabsize_{}_model_{}'.format(
    DOC2VEC_SIZE, DOC2VEC_WINDOW, 'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
    DOC2VEC_CONCAT, DOC2VEC_MEAN, DOC2VEC_TRAIN_WORDS,
    DOC2VEC_HIERARCHICAL_SAMPLE, DOC2VEC_NEGATIVE_SAMPLE_SIZE,
    str(DOC2VEC_MAX_VOCAB_SIZE),
    str(level) + '_' + model_name)

GLOBAL_VARS.DOC2VEC_MODEL_NAME = placeholder_model_name
placeholder_model_name = os.path.join(placeholder_model_name, "epoch_{}")
info("FILE " +
     os.path.join(doc2vec_model_save_location, VOCAB_MODEL, MODEL_PREFIX))
doc2vec_model = Doc2Vec(
    size=DOC2VEC_SIZE,
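# --- Supplementary sketch (not from the original source) ---
# The constructor call above uses the pre-4.0 gensim API (`size=`). Under
# gensim >= 4.0 the arguments are renamed; the mapping below is inferred from
# the format string above and is an assumption, not the original code.
from gensim.models.doc2vec import Doc2Vec

doc2vec_model = Doc2Vec(
    vector_size=DOC2VEC_SIZE,            # formerly `size`
    window=DOC2VEC_WINDOW,
    dm=DOC2VEC_TYPE,                     # 1 = distributed memory, 0 = PV-DBOW
    dm_concat=DOC2VEC_CONCAT,
    dm_mean=DOC2VEC_MEAN,
    dbow_words=DOC2VEC_TRAIN_WORDS,      # assumed mapping for "trainwords"
    hs=DOC2VEC_HIERARCHICAL_SAMPLE,
    negative=DOC2VEC_NEGATIVE_SAMPLE_SIZE,
    max_vocab_size=DOC2VEC_MAX_VOCAB_SIZE,
)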