Example #1
import pdb

import numpy as np

# get_input, custom_fit, split_data and helper are project-level utilities
# defined elsewhere in this repository.


def train_LSTM(X, Y, model, train_split=0.8, epochs=10, batch_size=32):

    # Clinical
    SAMPLE_TYPE_cli, X_cli, Y_cli = get_input(sample_type=4,
                                              shuffle_documents=False,
                                              pad=False)

    which_model = 2
    if which_model == 2:
        custom_fit(X, Y, train_split=train_split, model=model, epochs=epochs)
        print "Clinical Data"
        custom_fit(X_cli, Y_cli, train_split=1, model=model)  # Test clinical

    elif which_model == 1:
        # Works for TYPE2 but check for others
        # Both these lines work for which_model == 1
        X_train, Y_train, X_test, Y_test = split_data(X,
                                                      Y,
                                                      train_split=train_split)
        model.fit(X_train,
                  Y_train,
                  shuffle=False,
                  nb_epoch=epochs,
                  batch_size=batch_size,
                  validation_data=(X_test, Y_test))

        # Wikipedia
        #model.evaluate(X_test, Y_test, batch_size=batch_size)
        #pred = model.predict(X_test)
        #rounded = np.round(pred)
        #result = helper.windiff_metric_NUMPY(Y_test, rounded)
        #print result

        # Clinical
        # Temporary TRUNCATION
        TRUNCATE_LEN = X_train.shape[1]
        print "NOTE: Truncating the Test dataset(clinical) from %d sentences to %d sentences." % (
            X_cli.shape[1], TRUNCATE_LEN)
        X_cli, Y_cli = X_cli[:, :TRUNCATE_LEN, :], Y_cli[:, :TRUNCATE_LEN, :]
        model.evaluate(X_cli, Y_cli, batch_size=batch_size)
        pred = model.predict(X_cli)
        rounded = np.round(pred)
        result = helper.windiff_metric_NUMPY(Y_cli,
                                             rounded,
                                             win_size=10,
                                             rounded=True)
        print result

    pdb.set_trace()
#        _, result = helper.windiff_metric_NUMPY(Y_cli, rounded, win_size=10, rounded=True)
#        print result

#pdb.set_trace()

#rounded = [round(x) for x in pred]
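
# For reference, a minimal NumPy sketch of the WindowDiff segmentation metric
# (Pevzner & Hearst, 2002) that helper.windiff_metric_NUMPY presumably computes.
# The function name and argument handling below are illustrative assumptions,
# not the project's actual implementation.
def windiff_sketch(reference, hypothesis, win_size=10):
    # reference/hypothesis: 1-D 0/1 arrays marking a boundary after each sentence
    ref = np.asarray(reference).ravel()
    hyp = np.asarray(hypothesis).ravel()
    n = len(ref)
    errors = 0
    for i in range(n - win_size):
        # count sliding windows whose boundary counts disagree
        if ref[i:i + win_size].sum() != hyp[i:i + win_size].sum():
            errors += 1
    return float(errors) / (n - win_size)
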

if __name__ == "__main__":

    # Print parameters
    print "=== SCALE_LOSS_FUN: %d, ONE_SIDE_CONTEXT_SIZE: %d ===" % (
        int(SCALE_LOSS_FUN), ONE_SIDE_CONTEXT_SIZE)
    print "NOTE: Make sure you have MIN_SENTENCES_IN_DOCUMENT >= 2*context_size + 1"

    # For which_model == 2
    SAMPLE_TYPE_wiki, X_wiki, Y_wiki, trained_sample_handler = get_input(
        sample_type=2, shuffle_documents=True, pad=False)
    NO_OF_SAMPLES, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM = X_wiki.shape[
        0], -1, X_wiki[0].shape[1]  # MAX_SEQUENCE_LENGTH is already padded

    # For which_model == 2
    # Biography data for training
    #SAMPLE_TYPE_bio, X_bio, Y_bio, trained_sample_handler = get_input(sample_type=5, shuffle_documents=False, pad=False, trained_sent2vec_model=trained_sample_handler)
    #NO_OF_SAMPLES, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM = X_bio.shape[0], -1, X_bio[0].shape[1]          #MAX_SEQUENCE_LENGTH is already padded

    # Clinical, Fiction, Wikipedia - Only for testing
    SAMPLE_TYPE_cli, X_cli, Y_cli, trained_sample_handler = get_input(
        sample_type=4,
        shuffle_documents=False,
        pad=False,
        trained_sent2vec_model=trained_sample_handler)
    SAMPLE_TYPE_fic, X_fic, Y_fic, trained_sample_handler = get_input(

Example #2

#        _, result = helper.windiff_metric_NUMPY(Y_cli, rounded, win_size=10, rounded=True)
#        print result

    pdb.set_trace()

    #rounded = [round(x) for x in pred]

if __name__ == "__main__":

    # Print parameters
    print "=== SCALE_LOSS_FUN: %d, ONE_SIDE_CONTEXT_SIZE: %d ===" % (
        int(SCALE_LOSS_FUN), ONE_SIDE_CONTEXT_SIZE)
    print "NOTE: Make sure you have MIN_SENTENCES_IN_DOCUMENT >= 2*context_size + 1"

    # For which_model == 2
    SAMPLE_TYPE_wiki, X_wiki, Y_wiki, trained_sample_handler = get_input(
        sample_type=2, shuffle_documents=True, pad=False)
    NO_OF_SAMPLES, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM = X_wiki.shape[
        0], -1, X_wiki[0].shape[1]  # MAX_SEQUENCE_LENGTH is already padded
    print "X_wiki[0].shape: ", X_wiki[0].shape
    # For which_model == 2
    # Biography data for training
    #SAMPLE_TYPE_bio, X_bio, Y_bio, trained_sample_handler = get_input(sample_type=5, shuffle_documents=False, pad=False, trained_sent2vec_model=trained_sample_handler)
    #NO_OF_SAMPLES, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM = X_bio.shape[0], -1, X_bio[0].shape[1]          #MAX_SEQUENCE_LENGTH is already padded

    # Clinical - Only for testing
    #SAMPLE_TYPE_cli, X_cli, Y_cli, trained_sample_handler = get_input(sample_type=4, shuffle_documents=False, pad=False, trained_sent2vec_model=trained_sample_handler)

    # Fiction - Only for testing
    #SAMPLE_TYPE_fic, X_fic, Y_fic, trained_sample_handler = get_input(sample_type=6, shuffle_documents=False, pad=False, trained_sent2vec_model=trained_sample_handler)

    dictionary_object = trained_sample_handler.dictionary
Example #4
        pred = model.predict(X_cli)
        rounded = np.round(pred)
        result = helper.windiff_metric_NUMPY(Y_cli,
                                             rounded,
                                             win_size=10,
                                             rounded=True)
        print result

    pdb.set_trace()

    #rounded = [round(x) for x in pred]


if __name__ == "__main__":
    # For which_model == 2
    SAMPLE_TYPE_wiki, X_wiki, Y_wiki, trained_sample_handler = get_input(
        sample_type=2, shuffle_documents=True, pad=True)
    NO_OF_SAMPLES, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM = X_wiki.shape[
        0], -1, X_wiki[0].shape[1]  # MAX_SEQUENCE_LENGTH is already padded

    # For which_model == 2
    # Biography data for training
    SAMPLE_TYPE_bio, X_bio, Y_bio, trained_sample_handler = get_input(
        sample_type=5,
        shuffle_documents=False,
        pad=True,
        trained_sent2vec_model=trained_sample_handler)
    #NO_OF_SAMPLES, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM = X_bio.shape[0], -1, X_bio[0].shape[1]          #MAX_SEQUENCE_LENGTH is already padded

    # Clinical - Only for testing
    SAMPLE_TYPE_cli, X_cli, Y_cli, trained_sample_handler = get_input(
        sample_type=4,
Example #5
#
#
#    print "macro results are"
#    print "average precision is %f" %(p/10)
#    print "average recall is %f" %(r/10)
#    print "average f1 is %f" %(f1/10)
#
#    print "micro results are"
#    print "average precision is %f" %(p1/10)
#    print "average recall is %f" %(r1/10)
#    print "average f1 is %f" %(f11/10)

if __name__ == "__main__":
    # For which_model == 2
    SAMPLE_TYPE, X, Y = get_input(sample_type=2,
                                  shuffle_documents=True,
                                  pad=False)
    NO_OF_SAMPLES, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM = X.shape[0], -1, X[
        0].shape[1]  # MAX_SEQUENCE_LENGTH is already padded

    # For which_model == 1
    #SAMPLE_TYPE, X, Y = get_input(sample_type=2, shuffle_documents=True, pad=True)
    #NO_OF_SAMPLES, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM = X.shape[0], X.shape[1], X.shape[2]          #MAX_SEQUENCE_LENGTH is already padded
    #if SAMPLE_TYPE == 1:
    #    Y = Y[:,-1].reshape((NO_OF_SAMPLES, 1))    # For LSTM
    #elif SAMPLE_TYPE == 2:
    #    # because of TimeDistributed layer :/
    #    Y = Y.reshape((NO_OF_SAMPLES, MAX_SEQUENCE_LENGTH, 1))
    #else:
    #    print "INVALID SAMPLE TYPE!"
Example #6
    print helper.windiff_metric_NUMPY(Y_test, rounded)
    pdb.set_trace()
    #rounded = [round(x) for x in predictions]  # round predictions
    #print(predictions)
    #pdb.set_trace()


def sample_data():
    # load pima indians dataset
    dataset = np.loadtxt(
        "/home/pinkesh/DATASETS/PIMA_DATASET/pima-indians-diabetes.data",
        delimiter=",")
    X = dataset[:, 0:8]
    Y = dataset[:, 8]
    return X, Y


if __name__ == "__main__":
    #X, Y = sample_data()
    SAMPLE_TYPE, X, Y = get_input(shuffle=False)

    # Split test-train data
    train_ratio = 0.8
    print 'X(train)=', X.shape[0] * train_ratio
    print 'X(test)=', X.shape[0] * (1 - train_ratio)
    train_samples = int(train_ratio * X.shape[0])

    #pdb.set_trace()
    run_neural_net(X[:train_samples + 1, :], Y[:train_samples + 1],
                   X[train_samples + 1:, :], Y[train_samples + 1:])
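
# run_neural_net is defined elsewhere in this project and is not part of the
# excerpt above. As an assumed, illustrative sketch only, a small old-style
# Keras model for the 8-feature pima-indians data returned by sample_data()
# might look like this (layer sizes and hyperparameters are placeholders):
def run_neural_net_sketch(X_train, Y_train, X_test, Y_test):
    from keras.models import Sequential
    from keras.layers import Dense

    model = Sequential()
    model.add(Dense(12, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    # nb_epoch matches the pre-Keras-2 API used in these excerpts
    model.fit(X_train, Y_train, nb_epoch=10, batch_size=32,
              validation_data=(X_test, Y_test))
    return model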