Example #1
def main():
    """Train a logistic regression POS classifier and save it to disk."""
    DATASET_LOCATION = '../datasets/'

    POS_DATASET_NAME = 'brown_pos_dataset.hdf5'
    POS_DATASET_PATH = os.path.join(DATASET_LOCATION, POS_DATASET_NAME)

    WORD_BY_WORD_MATRIX = 'brown.word-by-word.normalized.npy'
    VECTOR_INDEX_PATH = os.path.join(DATASET_LOCATION, WORD_BY_WORD_MATRIX)

    # cap on the number of instances passed to prep_dataset
    CUTOFF = 100000

    # where the trained model will be saved
    MODELS_PATH = "../trained_models/"
    MODEL_NAME = "logistic_regression_model.pkl"
    save_path = os.path.join(MODELS_PATH, MODEL_NAME)

    # load the train and dev splits
    train_dataset = prep_dataset(POS_DATASET_PATH, VECTOR_INDEX_PATH,
                                 which_sets=['train'], cutoff=CUTOFF)
    dev_dataset = prep_dataset(POS_DATASET_PATH, VECTOR_INDEX_PATH,
                               which_sets=['dev'], cutoff=CUTOFF)

    # get the functions and params that we need for our models
    initialization_data = initialize_logistic_regression(train_dataset, dev_dataset,
                                                         learning_rate=0.1, batch_size=100)

    classifier, train_model_func, validate_model_func, n_train_batches, n_valid_batches = initialization_data

    # train the classifier
    train_model(train_model_func, n_train_batches, validate_model=validate_model_func,
                n_valid_batches=n_valid_batches, training_epochs=200)

    # persist the trained classifier
    save_model(classifier, save_path)
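
As a hedged follow-up to Example #1: save_model presumably pickles the trained classifier to save_path, so reloading and scoring it later could look like the sketch below. The 'rb' mode, the cPickle usage, and the predict() helper (the one used in Example #3) are assumptions about the project code, not part of the original example.

def load_and_score(save_path, dev_dataset):
    # hypothetical: reload the classifier that save_model() pickled to disk
    with open(save_path, 'rb') as model_file:
        classifier = cPickle.load(model_file)

    # predict() is assumed to be the same project helper used in Example #3
    dev_X, dev_y = dev_dataset
    predictions = predict(classifier, dev_X.get_value())
    gold = dev_y.get_value().astype('int32')

    acc = sum(y == p for y, p in zip(gold, predictions)) / float(len(predictions))
    print "dev ACC: {}".format(acc)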
Example #2
def main():
    """Train a denoising autoencoder on the POS windows and plot its 2-D codes."""
    DATASET_LOCATION = '../../datasets/'

    # the pos dataset consists of windows around words
    POS_DATASET_NAME = 'brown_pos_dataset.hdf5'
    POS_DATASET_PATH = os.path.join(DATASET_LOCATION, POS_DATASET_NAME)

    CORPUS_INDICES = 'brown_pos_dataset.indices'
    WORD_BY_WORD_MATRIX = 'brown.word-by-word.normalized.npy'

    VECTOR_INDEX_PATH = os.path.join(DATASET_LOCATION, WORD_BY_WORD_MATRIX)
    CUTOFF = 10000

    # load the training data
    train_dataset = prep_dataset(POS_DATASET_PATH, VECTOR_INDEX_PATH,
                                 which_sets=['train'], cutoff=CUTOFF)

    # initialize the denoising autoencoder; n_hidden=2 keeps the learned
    # codes two-dimensional so they can be scatter-plotted below
    initialization_data = initialize_dA(train_dataset, learning_rate=0.1,
                                        corruption_level=0.3, batch_size=50,
                                        n_hidden=2)

    classifier, train_model_func, validate_model_func, n_train_batches, n_valid_batches = initialization_data

    # train the autoencoder
    train_model(train_model_func, n_train_batches,
                validate_model=validate_model_func,
                n_valid_batches=n_valid_batches, training_epochs=10)

    # make a theano function to get predictions from a trained model
    training_data = theano.tensor.matrix('training_X')
    predictions = classifier.predict(training_data)
    get_predictions = theano.function([training_data], predictions)
    # get predictions and evaluate
    p = get_predictions(train_dataset[0].get_value())


    # get train_y without the cast
    train_y = prep_dataset(POS_DATASET_PATH, VECTOR_INDEX_PATH, which_sets=['train'],
                           cast_y=False, cutoff=CUTOFF)[1].get_value().astype('int32')

    CUTOFF_BEGIN = 0
    CUTOFF_END = 1000

    y_vals = train_y[CUTOFF_BEGIN:CUTOFF_END]
    norm_y_vals = y_vals / float(np.amax(y_vals))

    # add small Gaussian jitter so overlapping points remain visible
    jitter1 = np.random.normal(loc=0.0, scale=0.05, size=CUTOFF_END - CUTOFF_BEGIN)
    jitter2 = np.random.normal(loc=0.0, scale=0.05, size=CUTOFF_END - CUTOFF_BEGIN)
    x1 = p[CUTOFF_BEGIN:CUTOFF_END, 0] + jitter1
    x2 = p[CUTOFF_BEGIN:CUTOFF_END, 1] + jitter2

    # scatter the jittered 2-D codes, coloring each point by its tag id
    plt.scatter(x1, x2, c=norm_y_vals, s=20)
    plt.show()
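
A possible follow-up to Example #2: because the dA has only two hidden units, a cheap check of how much tag information the 2-D codes retain is to fit an off-the-shelf classifier on them. The lines below are a sketch that could be appended to the end of main(); they reuse p and train_y from above, and the scikit-learn imports are assumptions (Example #3 already relies on scikit-learn for confusion_matrix).

    # hypothetical probe: how separable are the 2-D dA codes by POS tag?
    from sklearn.linear_model import LogisticRegression
    from sklearn.cross_validation import train_test_split  # sklearn.model_selection in newer versions

    X_tr, X_te, y_tr, y_te = train_test_split(p, train_y, test_size=0.2, random_state=0)
    probe = LogisticRegression().fit(X_tr, y_tr)
    print "2-D code probe accuracy: {}".format(probe.score(X_te, y_te))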
Example #3
def main():
    """Train an MLP POS classifier and plot its normalized confusion matrix."""
    DATASET_LOCATION = '../datasets/'

    POS_DATASET_NAME = 'brown_pos_dataset.hdf5'
    POS_DATASET_PATH = os.path.join(DATASET_LOCATION, POS_DATASET_NAME)

    WORD_BY_WORD_MATRIX = 'brown.word-by-word.normalized.npy'
    VECTOR_INDEX_PATH = os.path.join(DATASET_LOCATION, WORD_BY_WORD_MATRIX)

    CUTOFF = 10000

    MODELS_PATH = "../trained_models/"
    MODEL_NAME = "mlp_model.pkl"
    save_path = os.path.join(MODELS_PATH, MODEL_NAME)

    train_dataset = prep_dataset(POS_DATASET_PATH, VECTOR_INDEX_PATH,
                                 which_sets=['train'], cutoff=CUTOFF)
    dev_dataset = prep_dataset(POS_DATASET_PATH, VECTOR_INDEX_PATH,
                               which_sets=['dev'], cutoff=CUTOFF)

    # initialize the MLP
    initialization_data = initialize_mlp(train_dataset, dev_dataset,
                                         learning_rate=0.01, batch_size=50)

    classifier, train_model_func, validate_model_func, n_train_batches, n_valid_batches = initialization_data

    # train the MLP model
    train_model(train_model_func, n_train_batches, validate_model=validate_model_func,
                n_valid_batches=n_valid_batches, training_epochs=10)

    # evaluate on the held-out test split (cast_y=False keeps the integer labels)
    test_dataset = prep_dataset(POS_DATASET_PATH, VECTOR_INDEX_PATH,
                                which_sets=['test'], cutoff=CUTOFF, cast_y=False)

    test_X, test_y = test_dataset
    test_y = test_y.get_value().astype('int32')
    predictions = predict(classifier, test_X.get_value())

    # load the corpus index file, which contains the idx2tag mapping
    CORPUS_INDICES = 'brown_pos_dataset.indices'
    with open(os.path.join(DATASET_LOCATION, CORPUS_INDICES), 'rb') as indices_file:
        corpus_indices = cPickle.load(indices_file)

    # map tag ids back to strings
    y_test_actual = [corpus_indices['idx2tag'][tag_idx] for tag_idx in test_y]
    y_test_hat = [corpus_indices['idx2tag'][tag_idx] for tag_idx in predictions]

    # Quick Evaluation
    acc = sum(y == p for y, p in zip(predictions, test_y)) / float(len(predictions))
    print "ACC: {}".format(acc)
    # get class names
    class_names = list(set(y_test_actual))

    # Compute confusion matrix
    cm = confusion_matrix(y_test_actual, y_test_hat, labels=class_names)

    # Normalize the confusion matrix by row (i.e by the number of samples in each class)
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.figure()
    plot_confusion_matrix(cm_normalized, class_names, title='Normalized confusion matrix')

    plt.show()
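
plot_confusion_matrix is a project helper that is not shown in these examples. The function below is only a sketch of what it could look like, based on how it is called above (cm, class_names, title) and assuming matplotlib.pyplot as plt and numpy as np are imported as in the examples; the real implementation may differ.

def plot_confusion_matrix(cm, class_names, title='Confusion matrix', cmap=plt.cm.Blues):
    # hypothetical sketch: render the (normalized) confusion matrix as an image
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation=90)
    plt.yticks(tick_marks, class_names)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')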