Example #1
def lstm_with_embedding_layer():
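    """Train an LSTM that uses an embedding layer on the depression data set."""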
    x_train, x_test, y_train, y_test = load_data.get_depression_data()
    y_train_one_hot = preprocessing.class_one_hot(y_train)
    embedding_matrix, word_index, tokenizer = preprocessing.get_embedding_matrix(
        x_train)
    x_train = preprocessing.vectorize_with_tokenizer(x_train, tokenizer)
    x_test = preprocessing.vectorize_with_tokenizer(x_test, tokenizer)
    model = get_lstm_model(True, word_index, embedding_matrix)
    run(x_train, x_test, y_train_one_hot, y_test, model)
Example #2
def lstm():
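    """Train an LSTM on GloVe-vectorized SMHD data (no embedding layer)."""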
    # x_train, x_test, y_train, y_test = load_data.get_depression_data()
    # x_train, x_test, y_train, y_test = load_data.get_bipolar_disorder_data()
    # x_train, y_train = load_data.get_rsdd_data(set_="train")
    # x_test, y_test = load_data.get_rsdd_data(end_index=5, set_="validation")
    x_train, y_train = load_data.get_smhd_data(set_="train")
    x_test, y_test = load_data.get_smhd_data(end_index=5, set_="validation")
    y_train_one_hot = preprocessing.class_one_hot(y_train)
    vectorize_function = preprocessing.vectorize_data_glove
    embedding_index = preprocessing.get_embeddings_index()
    print(x_train[0])
    x_train = preprocessing.add_features_and_vectorize(x_train,
                                                       vectorize_function,
                                                       embedding_index)
    x_test = preprocessing.add_features_and_vectorize(x_test,
                                                      vectorize_function,
                                                      embedding_index)
    model = get_lstm_model(use_embedding_layer=False,
                           input_shape=(x_train.shape[1], x_train.shape[2]))
    run(x_train, x_test, y_train_one_hot, y_test, model)
Example #3
def save_data(vectorize_function,
              embedding_index,
              data_per_iteration=2,
              num_of_load_iterations=2):
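    """Vectorize SMHD training data in fixed-size chunks and dump each chunk
    to .npy files so that training can later stream batches from disk."""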
    x_train_filenames = []
    y_train_filenames = []

    num_of_train_batches = 0
    for i in range(num_of_load_iterations):
        start = time()
        # x_train, y_train = load_data.get_rsdd_data(start_index=i * data_per_iteration,
        #                                            end_index=(i + 1) * data_per_iteration, set_="train")
        x0, x1, _, _, _ = load_smhd_datasets.get_smhd_data_user_level(
            start_index=i * data_per_iteration,
            end_index=(i + 1) * data_per_iteration)
        t1 = time()
        print("load:", t1 - start)
        x_train, y_train = load_smhd_datasets.prepare_binary_data(x0, x1)
        t2 = time()
        print("prepare:", t2 - t1)
        x_train = preprocessing.add_features_and_vectorize(
            x_train, vectorize_function, embedding_index)
        t3 = time()
        print("vectorize:", t3 - t2)
        y_train_one_hot = preprocessing.class_one_hot(y_train, 2)
        # print(x_train.shape)
        # print(y_train_one_hot.shape)
        np.save("x_train" + str(i) + ".npy", x_train)
        np.save("y_train" + str(i) + ".npy", y_train_one_hot)
        x_train_filenames.append("x_train" + str(i) + ".npy")
        y_train_filenames.append("y_train" + str(i) + ".npy")
        num_of_train_batches += len(x_train) // BATCH_SIZE
        end = time()
        print("one-hot + save:", end - t3)
    f = open("num_of_train_batches.txt", "w")
    f.write(str(num_of_train_batches))
    f.close()

    return x_train_filenames, y_train_filenames, num_of_train_batches
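The per-chunk .npy files written above are meant to be streamed back during training (run and run_multitask are invoked with fit_generator=True and steps_per_epoch elsewhere in this module). A minimal sketch of such a generator; the helper name batch_generator is hypothetical and not part of the original code:

import numpy as np


def batch_generator(x_filenames, y_filenames):
    # Cycle over the saved .npy chunk files forever, since Keras'
    # fit_generator expects an endless iterator of (inputs, targets) pairs.
    while True:
        for x_file, y_file in zip(x_filenames, y_filenames):
            yield np.load(x_file), np.load(y_file)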
Example #4
def multitask_memory_efficient():
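    """Load depression and bipolar-disorder data in chunks, vectorize them,
    save per-chunk .npy files and train a two-task model from those files."""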
    vectorize_function = preprocessing.vectorize_data_glove
    embedding_index = preprocessing.get_embeddings_index()

    data_per_iteration = BATCH_SIZE
    num_of_batches = TRAIN_SET_SIZE // data_per_iteration
    num_of_train_batches = 0
    x_train1_filenames = []
    y_train1_filenames = []
    x_train2_filenames = []
    y_train2_filenames = []
    for i in range(num_of_batches):
        x_train1, y_train1 = load_data.get_depression_data(
            start_index=i * data_per_iteration,
            end_index=(i + 1) * data_per_iteration,
            test_size=0)
        x_train2, y_train2 = load_data.get_bipolar_disorder_data(
            start_index=i * data_per_iteration // 2,
            skiprows_start=(i + 1) * data_per_iteration // 2,
            skiprows_end=(i + 1) * data_per_iteration // 2 + 10**7,
            nrows=data_per_iteration,
            test_size=0)

        x_train1 = preprocessing.add_features_and_vectorize(
            x_train1, vectorize_function, embedding_index)
        x_train2 = preprocessing.add_features_and_vectorize(
            x_train2, vectorize_function, embedding_index)

        x_train1 = x_train1[:len(x_train2)]
        y_train1 = y_train1[:len(y_train2)]

        np.save("x_train1_" + str(i) + ".npy", x_train1)
        y_train_one_hot1 = preprocessing.class_one_hot(y_train1, 2)
        np.save("y_train1_" + str(i) + ".npy", y_train_one_hot1)

        np.save("x_train2_" + str(i) + ".npy", x_train2)
        y_train_one_hot2 = preprocessing.class_one_hot(y_train2, 2)
        np.save("y_train2_" + str(i) + ".npy", y_train_one_hot2)

        x_train1_filenames.append("x_train1_" + str(i) + ".npy")
        y_train1_filenames.append("y_train1_" + str(i) + ".npy")
        x_train2_filenames.append("x_train2_" + str(i) + ".npy")
        y_train2_filenames.append("y_train2_" + str(i) + ".npy")
        num_of_train_batches += len(x_train1) // BATCH_SIZE

    x_test1, y_test1 = load_data.get_depression_data(start_index=0,
                                                     end_index=0,
                                                     test_size=500)

    x_test2, y_test2 = load_data.get_bipolar_disorder_data(
        start_index=num_of_batches * data_per_iteration // 2,
        skiprows_start=(num_of_batches + 1) * data_per_iteration // 2 + 250,
        skiprows_end=(num_of_batches + 1) * data_per_iteration // 2 + 10**7 +
        250,
        nrows=data_per_iteration,
        test_size=1)

    x_test1 = preprocessing.add_features_and_vectorize(x_test1,
                                                       vectorize_function,
                                                       embedding_index)
    x_test2 = preprocessing.add_features_and_vectorize(x_test2,
                                                       vectorize_function,
                                                       embedding_index)
    x_test1 = x_test1[:len(x_test2)]
    y_test1 = y_test1[:len(y_test2)]

    model = multitask1.get_multitask_model(
        (x_test1.shape[1], x_test1.shape[2]))
    multitask1.run_multitask(x_train1_filenames,
                             x_test1,
                             y_train1_filenames,
                             y_test1,
                             x_train2_filenames,
                             x_test2,
                             y_train2_filenames,
                             y_test2,
                             model,
                             fit_generator=True,
                             steps_per_epoch=num_of_train_batches)
Example #5
def multitask_smhd_memory_efficient(reload_data=True,
                                    data_per_iteration=2,
                                    num_of_load_iterations=2,
                                    num_of_train_batches=None,
                                    user_level=True):
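    """Multitask training on SMHD: builds binary control-vs-condition tasks,
    saves vectorized chunks to .npy files (or reuses previously saved files
    when reload_data is False) and trains a two-task model from those files."""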
    x_train1_filenames = []
    y_train1_filenames = []
    x_train2_filenames = []
    y_train2_filenames = []
    if reload_data:
        vectorize_function = preprocessing.vectorize_data_glove
        embedding_index = preprocessing.get_embeddings_index()

        num_of_train_batches = 0
        for i in range(num_of_load_iterations):
            if user_level:
                x0, x1, x2, x3, _ = load_smhd_datasets.get_smhd_data_user_level(
                    start_index=i * data_per_iteration,
                    end_index=(i + 1) * data_per_iteration)
            else:
                x0, x1, x2, x3, _ = load_smhd_datasets.get_smhd_data(
                    start_index=i * data_per_iteration,
                    end_index=(i + 1) * data_per_iteration)

            x_train1, y_train1 = load_smhd_datasets.prepare_binary_data(
                x0[:len(x0) // 3], x1)
            x_train2, y_train2 = load_smhd_datasets.prepare_binary_data(
                x0[len(x0) // 3:2 * len(x0) // 3], x2)
            x_train3, y_train3 = load_smhd_datasets.prepare_binary_data(
                x0[2 * len(x0) // 3:], x3)

            x_train1 = preprocessing.add_features_and_vectorize(
                x_train1, vectorize_function, embedding_index)
            x_train2 = preprocessing.add_features_and_vectorize(
                x_train2, vectorize_function, embedding_index)
            x_train3 = preprocessing.add_features_and_vectorize(
                x_train3, vectorize_function, embedding_index)

            np.save("x_train1_" + str(i) + ".npy", x_train1)
            y_train_one_hot1 = preprocessing.class_one_hot(y_train1, 2)
            np.save("y_train1_" + str(i) + ".npy", y_train_one_hot1)

            np.save("x_train2_" + str(i) + ".npy", x_train2)
            y_train_one_hot2 = preprocessing.class_one_hot(y_train2, 2)
            np.save("y_train2_" + str(i) + ".npy", y_train_one_hot2)

            np.save("x_train3_" + str(i) + ".npy", x_train3)
            y_train_one_hot3 = preprocessing.class_one_hot(y_train3, 2)
            np.save("y_train3_" + str(i) + ".npy", y_train_one_hot3)

            x_train1_filenames.append("x_train1_" + str(i) + ".npy")
            y_train1_filenames.append("y_train1_" + str(i) + ".npy")
            x_train2_filenames.append("x_train2_" + str(i) + ".npy")
            y_train2_filenames.append("y_train2_" + str(i) + ".npy")
            num_of_train_batches += len(x_train1) // BATCH_SIZE

        f = open("num_of_train_batches.txt", "w")
        f.write(num_of_train_batches)
        f.close()
        if user_level:
            x0, x1, x2, _, _ = load_smhd_datasets.get_smhd_data_user_level(
                set_='validation')
        else:
            x0, x1, x2, _, _ = load_smhd_datasets.get_smhd_data(
                set_='validation')

        x_test1, y_test1 = load_smhd_datasets.prepare_binary_data(
            x0[:len(x0) // 2], x1)
        x_test2, y_test2 = load_smhd_datasets.prepare_binary_data(
            x0[len(x0) // 2:], x2)

        x_test1 = preprocessing.add_features_and_vectorize(
            x_test1, vectorize_function, embedding_index)
        x_test2 = preprocessing.add_features_and_vectorize(
            x_test2, vectorize_function, embedding_index)
        x_test1 = x_test1[:len(x_test2)]
        y_test1 = y_test1[:len(y_test2)]
        x_test2 = x_test2[:len(x_test1)]
        y_test2 = y_test2[:len(y_test1)]
        print("test set sizes:", len(y_test1), len(y_test2))
        np.save("x_test1.npy", x_test1)
        np.save("y_test1.npy", y_test1)
        np.save("x_test2.npy", x_test2)
        np.save("y_test2.npy", y_test2)

    else:
        for i in range(num_of_load_iterations):
            x_train1_filenames.append("x_train1_" + str(i) + ".npy")
            y_train1_filenames.append("y_train1_" + str(i) + ".npy")
            x_train2_filenames.append("x_train2_" + str(i) + ".npy")
            y_train2_filenames.append("y_train2_" + str(i) + ".npy")
        if isinstance(num_of_train_batches, str):
            with open(num_of_train_batches, "r") as f:
                num_of_train_batches = int(f.read())

    x_test1 = np.load("x_test1.npy")
    y_test1 = np.load("y_test1.npy")
    x_test2 = np.load("x_test2.npy")
    y_test2 = np.load("y_test2.npy")

    model = multitask1.get_multitask_model(
        (x_test1.shape[1], x_test1.shape[2]))
    acc1, f11, acc2, f12 = multitask1.run_multitask(
        x_train1_filenames,
        x_test1,
        y_train1_filenames,
        y_test1,
        x_train2_filenames,
        x_test2,
        y_train2_filenames,
        y_test2,
        model,
        fit_generator=True,
        steps_per_epoch=num_of_train_batches)

    return acc1, f11, acc2, f12
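Once the chunks have been written by an initial run, a plausible way to re-run training without re-vectorizing is to point the function at the saved batch-count file; the argument values below are illustrative, not taken from the original code:

acc1, f11, acc2, f12 = multitask_smhd_memory_efficient(
    reload_data=False,
    data_per_iteration=2,
    num_of_load_iterations=2,
    num_of_train_batches="num_of_train_batches.txt")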
Example #6
def lstm_memory_efficient():
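    """Stream RSDD data in chunks, vectorize with GloVe, save one .npy file
    per batch and train the LSTM from a generator over those files."""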

    vectorize_function = preprocessing.vectorize_data_glove
    embedding_index = preprocessing.get_embeddings_index()

    data_per_iteration = 5  # BATCH_SIZE * 10

    count = 0
    i = -1  # incremented at the top of the loop so the chunk at index 0 is used
    x_train_filenames = []
    y_train_filenames = []
    while count < TRAIN_SET_SIZE // BATCH_SIZE:
        i += 1
        # x_train, y_train = load_data.get_depression_data(start_index=i*data_per_iteration,
        #                                                  end_index=(i+1)*data_per_iteration, test_size=0)
        # x_train, y_train = load_data.get_bipolar_disorder_data(start_index=i * data_per_iteration // 2,
        #                                                        skiprows_start=(i+1) * data_per_iteration // 2,
        #                                                       skiprows_end=(i+1) * data_per_iteration // 2 + 10**7,
        #                                                        nrows=data_per_iteration, test_size=0)
        x_train, y_train = load_data.get_rsdd_data(
            start_index=i * data_per_iteration,
            end_index=(i + 1) * data_per_iteration,
            set_="train")

        x_train = preprocessing.add_features_and_vectorize(
            x_train, vectorize_function, embedding_index)
        y_train_one_hot = preprocessing.class_one_hot(y_train, 2)
        print(x_train.shape)
        print(y_train_one_hot.shape)
        for j in range(len(x_train) // BATCH_SIZE):
            np.save("x_train" + str(count) + ".npy",
                    x_train[j * BATCH_SIZE:(j + 1) * BATCH_SIZE])
            np.save("y_train" + str(count) + ".npy",
                    y_train_one_hot[j * BATCH_SIZE:(j + 1) * BATCH_SIZE])
            x_train_filenames.append("x_train" + str(count) + ".npy")
            y_train_filenames.append("y_train" + str(count) + ".npy")
            count += 1

    # x_test, y_test = load_data.get_bipolar_disorder_data(start_index=num_of_batches * data_per_iteration // 2,
    #                                                     skiprows_start=(num_of_batches+1) * data_per_iteration // 2,
    #                                                     skiprows_end=(num_of_batches+1) * data_per_iteration // 2 + 10**7,
    #                                                     nrows=data_per_iteration, test_size=1)
    # x_test, y_test = load_data.get_depression_data(start_index=0, end_index=0, test_size=500)
    x_test, y_test = load_data.get_rsdd_data(end_index=5, set_="validation")

    x_test = preprocessing.add_features_and_vectorize(x_test,
                                                      vectorize_function,
                                                      embedding_index)
    np.save("x_test.npy", x_test)
    np.save("y_test.npy", y_test)

    x_test = np.load("x_test.npy")
    y_test = np.load("y_test.npy")

    model = get_lstm_model(use_embedding_layer=False,
                           input_shape=(x_test.shape[1], x_test.shape[2]))
    return run(x_train_filenames,
               x_test,
               y_train_filenames,
               y_test,
               model,
               True,
               steps_per_epoch=20)
Example #7
def lstm_model_hyperparameters_random_search(x, y, n_iterations=100):
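    """Random search over LSTM hyperparameters (layer size, learning rate,
    epochs, dropout, recurrent dropout, bidirectionality, activation); keeps
    the model with the best average of accuracy and macro F1."""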
    x_train, x_validation, y_train, y_validation = train_test_split(
        x, y, test_size=0.2)
    y_train = preprocessing.class_one_hot(y_train)
    best_acc = 0
    best_f1 = 0
    best_avg = 0
    best_model = None

    for _ in range(n_iterations):

        if random() < 0.5:
            nb_epoch = randint(5, 50)
        else:
            nb_epoch = randint(
                5, 30)  # keep the epoch count small so a single run does not take too long

        if random() < 0.5:
            lr = random() * 10**(randint(0, 3) - 4)
        else:
            lr = random() * 10**-3

        r = random()
        if r < 0.2:
            broj_neurona_u_sloju = randint(100, 500)
        elif r < 0.4:
            broj_neurona_u_sloju = 2**randint(2, 10)
        else:
            broj_neurona_u_sloju = EMBEDDING_DIM

        r = random()
        if r < 0.3:
            dropout_rate = r
        elif r < 0.5:
            dropout_rate = random() / 10
        elif r < 0.7:
            dropout_rate = 0.2
        else:
            dropout_rate = 0

        r = random()
        if r < 0.2:
            recurrent_dropout = r
        elif r < 0.4:
            recurrent_dropout = random() / 10
        elif r < 0.6:
            recurrent_dropout = 0.2
        elif r < 0.8:
            recurrent_dropout = 0
        else:
            recurrent_dropout = dropout_rate

        if random() < 0.5:
            bidirectional = True
        else:
            bidirectional = False

        r = random()
        if r < 0.3:
            activation = "sigmoid"
        elif r < 0.6:
            activation = "relu"
        else:
            activation = "None"

        model = Sequential()

        if bidirectional:
            model.add(
                Bidirectional(LSTM(units=broj_neurona_u_sloju,
                                   dropout=dropout_rate,
                                   recurrent_dropout=recurrent_dropout),
                              input_shape=(x_train.shape[1],
                                           x_train.shape[2])))
        else:
            model.add(
                LSTM(units=broj_neurona_u_sloju,
                     dropout=dropout_rate,
                     recurrent_dropout=recurrent_dropout,
                     input_shape=(x_train.shape[1], x_train.shape[2])))

        if activation == "sigmoid":
            model.add(Activation('sigmoid'))
        elif activation == "relu":
            model.add(Activation('relu'))

        model.add(Dense(2))
        model.add(Activation('softmax'))

        adam = Adam(lr=lr)

        model.compile(loss='categorical_crossentropy',
                      optimizer=adam,
                      metrics=['accuracy'])

        model.fit(x_train,
                  y_train,
                  nb_epoch=nb_epoch,
                  batch_size=100,
                  verbose=0)

        pred = model.predict_classes(x_validation, 100)

        acc_score = metrics.accuracy_score(y_validation, pred)
        f1_score = metrics.f1_score(y_validation, pred, average='macro')
        avg_score = (acc_score + f1_score) / 2

        # Report whenever any metric improves; keep the model with the best
        # average of accuracy and macro F1.
        if (avg_score > best_avg or acc_score > best_acc
                or f1_score > best_f1):
            print('acc_score: ', acc_score)
            print('f1_score: ', f1_score)
            print('avg_score: ', avg_score)
            print('broj_neurona: ', broj_neurona_u_sloju)
            print('lr: ', lr)
            print('nb_epoch: ', nb_epoch)
            print('recurrent_dropout: ', recurrent_dropout)
            print('bidirectional: ', bidirectional)
            print('dropout_rate: ', dropout_rate)
            print('activation: ', activation)
            print()

        if avg_score > best_avg:
            best_avg = avg_score
            best_model = model
        best_acc = max(best_acc, acc_score)
        best_f1 = max(best_f1, f1_score)

    return best_model
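A possible way to drive the search, reusing the loading and vectorization helpers from the examples above; this call is illustrative and not part of the original module:

x_train, _, y_train, _ = load_data.get_depression_data()
embedding_index = preprocessing.get_embeddings_index()
x = preprocessing.add_features_and_vectorize(
    x_train, preprocessing.vectorize_data_glove, embedding_index)
best_model = lstm_model_hyperparameters_random_search(x, y_train,
                                                      n_iterations=50)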