def multitask():
    x_train1, x_test1, y_train1, y_test1 = load_data.get_depression_data()
    x_train2, x_test2, y_train2, y_test2 = load_data.get_bipolar_disorder_data()

    # Truncate the depression split so both tasks see the same number of samples.
    x_train1 = x_train1[:len(x_train2)]
    x_test1 = x_test1[:len(x_test2)]
    y_train1 = y_train1[:len(y_train2)]
    y_test1 = y_test1[:len(y_test2)]

    vectorize_function = preprocessing.vectorize_data_glove
    embedding_index = preprocessing.get_embeddings_index()
    x_train1 = preprocessing.add_features_and_vectorize(
        x_train1, vectorize_function, embedding_index)
    x_test1 = preprocessing.add_features_and_vectorize(
        x_test1, vectorize_function, embedding_index)
    x_train2 = preprocessing.add_features_and_vectorize(
        x_train2, vectorize_function, embedding_index)
    x_test2 = preprocessing.add_features_and_vectorize(
        x_test2, vectorize_function, embedding_index)

    model = multitask1.get_multitask_model(
        (x_train1.shape[1], x_train1.shape[2]))
    multitask1.run_multitask(x_train1, x_test1, y_train1, y_test1,
                             x_train2, x_test2, y_train2, y_test2, model)

def rf_with_glove():
    x_train, x_test, y_train, y_test = load_data.get_depression_data()
    vectorize_function = preprocessing.vectorize_data_1d_glove
    embedding_index = preprocessing.get_embeddings_index()
    x_train = preprocessing.add_features_and_vectorize(
        x_train, vectorize_function, embedding_index)
    x_test = preprocessing.add_features_and_vectorize(
        x_test, vectorize_function, embedding_index)
    pred = baseline(x_train, x_test, y_train, y_test)
    print(pred)

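# For reference, a minimal sketch of what a random-forest baseline such as the
# `baseline` call above could look like. This is an assumption, not the
# project's actual implementation: the real `baseline` helper's
# hyperparameters and return value may differ.
def rf_baseline_sketch(x_train, x_test, y_train, y_test):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score, f1_score

    # Hypothetical settings; assumes binary 0/1 labels.
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(x_train, y_train)
    pred = clf.predict(x_test)
    print("accuracy:", accuracy_score(y_test, pred))
    print("f1:", f1_score(y_test, pred))
    return pred
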
def multitask_smhd():
    # Pair the first half of the control users (x0) with one condition and the
    # second half with the other, so no control sample is shared across tasks.
    x0, _, x2, x1, _ = load_smhd_datasets.get_smhd_data()
    x_train1, y_train1 = load_smhd_datasets.prepare_binary_data(
        x0[:len(x0) // 2], x1)
    x_train2, y_train2 = load_smhd_datasets.prepare_binary_data(
        x0[len(x0) // 2:], x2)
    print(y_train1)

    x0, _, x2, x1, _ = load_smhd_datasets.get_smhd_data(set_='validation')
    x_test1, y_test1 = load_smhd_datasets.prepare_binary_data(
        x0[:len(x0) // 2], x1)
    x_test2, y_test2 = load_smhd_datasets.prepare_binary_data(
        x0[len(x0) // 2:], x2)
    print(y_test2)

    # Truncate both tasks to the length of the shorter one.
    x_train1 = x_train1[:len(x_train2)]
    x_test1 = x_test1[:len(x_test2)]
    y_train1 = y_train1[:len(y_train2)]
    y_test1 = y_test1[:len(y_test2)]
    x_train2 = x_train2[:len(x_train1)]
    x_test2 = x_test2[:len(x_test1)]
    y_train2 = y_train2[:len(y_train1)]
    y_test2 = y_test2[:len(y_test1)]

    vectorize_function = preprocessing.vectorize_data_glove
    embedding_index = preprocessing.get_embeddings_index()
    x_train1 = preprocessing.add_features_and_vectorize(
        x_train1, vectorize_function, embedding_index)
    x_test1 = preprocessing.add_features_and_vectorize(
        x_test1, vectorize_function, embedding_index)
    x_train2 = preprocessing.add_features_and_vectorize(
        x_train2, vectorize_function, embedding_index)
    x_test2 = preprocessing.add_features_and_vectorize(
        x_test2, vectorize_function, embedding_index)

    model = multitask1.get_multitask_model(
        (x_test1.shape[1], x_test1.shape[2]))
    print(y_train1)
    return multitask1.run_multitask(x_train1, x_test1, y_train1, y_test1,
                                    x_train2, x_test2, y_train2, y_test2,
                                    model)

def lstm():
    # x_train, x_test, y_train, y_test = load_data.get_depression_data()
    # x_train, x_test, y_train, y_test = load_data.get_bipolar_disorder_data()
    # x_train, y_train = load_data.get_rsdd_data(set_="train")
    # x_test, y_test = load_data.get_rsdd_data(end_index=5, set_="validation")
    x_train, y_train = load_data.get_smhd_data(set_="train")
    x_test, y_test = load_data.get_smhd_data(end_index=5, set_="validation")
    y_train_one_hot = preprocessing.class_one_hot(y_train)

    vectorize_function = preprocessing.vectorize_data_glove
    embedding_index = preprocessing.get_embeddings_index()
    print(x_train[0])
    x_train = preprocessing.add_features_and_vectorize(
        x_train, vectorize_function, embedding_index)
    x_test = preprocessing.add_features_and_vectorize(
        x_test, vectorize_function, embedding_index)

    model = get_lstm_model(use_embedding_layer=False,
                           input_shape=(x_train.shape[1], x_train.shape[2]))
    run(x_train, x_test, y_train_one_hot, y_test, model)

def lstm_memory_efficient_simple(reload_data=True,
                                 data_per_iteration=2,
                                 num_of_load_iterations=2,
                                 num_of_train_batches=None):
    if reload_data:
        vectorize_function = preprocessing.vectorize_data_glove
        embedding_index = preprocessing.get_embeddings_index()
        x_train_filenames, y_train_filenames, num_of_train_batches = save_data(
            vectorize_function, embedding_index, data_per_iteration,
            num_of_load_iterations)

        x0, x1, _, _, _ = load_smhd_datasets.get_smhd_data_user_level(
            end_index=100, set_="validation")
        x_test, y_test = load_smhd_datasets.prepare_binary_data(x0, x1)
        x_test = preprocessing.add_features_and_vectorize(
            x_test, vectorize_function, embedding_index)
        np.save("x_test.npy", x_test)
        np.save("y_test.npy", y_test)
    else:
        # Reuse the batch files written by a previous run.
        x_train_filenames = []
        y_train_filenames = []
        for i in range(num_of_load_iterations):
            x_train_filenames.append("x_train" + str(i) + ".npy")
            y_train_filenames.append("y_train" + str(i) + ".npy")
        if isinstance(num_of_train_batches, str):
            # A string is treated as the path of a file holding the batch count.
            with open(num_of_train_batches, "r") as f:
                num_of_train_batches = int(f.readline())
        x_test = np.load("x_test.npy")
        y_test = np.load("y_test.npy")

    model = get_lstm_model(use_embedding_layer=False,
                           input_shape=(x_test.shape[1], x_test.shape[2]))
    return run(x_train_filenames, x_test, y_train_filenames, y_test, model,
               fit_generator=True, epochs=5,
               steps_per_epoch=num_of_train_batches)

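# A minimal sketch of the kind of generator that `run(...)` presumably builds
# when fit_generator=True: it cycles over the cached .npy batch files and
# yields one (x, y) batch per training step, so only a single batch is ever
# held in memory. The name `npy_batch_generator` is hypothetical; the actual
# generator lives inside `run`.
def npy_batch_generator(x_filenames, y_filenames):
    import itertools
    # Cycle indefinitely, as Keras fit generators are expected to do.
    for x_file, y_file in itertools.cycle(zip(x_filenames, y_filenames)):
        yield np.load(x_file), np.load(y_file)
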
def multitask_memory_efficient():
    vectorize_function = preprocessing.vectorize_data_glove
    embedding_index = preprocessing.get_embeddings_index()
    data_per_iteration = BATCH_SIZE
    num_of_batches = TRAIN_SET_SIZE // data_per_iteration
    num_of_train_batches = 0
    x_train1_filenames = []
    y_train1_filenames = []
    x_train2_filenames = []
    y_train2_filenames = []
    for i in range(num_of_batches):
        x_train1, y_train1 = load_data.get_depression_data(
            start_index=i * data_per_iteration,
            end_index=(i + 1) * data_per_iteration,
            test_size=0)
        x_train2, y_train2 = load_data.get_bipolar_disorder_data(
            start_index=i * data_per_iteration // 2,
            skiprows_start=(i + 1) * data_per_iteration // 2,
            skiprows_end=(i + 1) * data_per_iteration // 2 + 10**7,
            nrows=data_per_iteration,
            test_size=0)
        x_train1 = preprocessing.add_features_and_vectorize(
            x_train1, vectorize_function, embedding_index)
        x_train2 = preprocessing.add_features_and_vectorize(
            x_train2, vectorize_function, embedding_index)
        # Keep both tasks the same length within a chunk.
        x_train1 = x_train1[:len(x_train2)]
        y_train1 = y_train1[:len(y_train2)]

        # Cache the vectorized chunk on disk so it never has to stay in memory.
        np.save("x_train1_" + str(i) + ".npy", x_train1)
        y_train_one_hot1 = preprocessing.class_one_hot(y_train1, 2)
        np.save("y_train1_" + str(i) + ".npy", y_train_one_hot1)
        np.save("x_train2_" + str(i) + ".npy", x_train2)
        y_train_one_hot2 = preprocessing.class_one_hot(y_train2, 2)
        np.save("y_train2_" + str(i) + ".npy", y_train_one_hot2)
        x_train1_filenames.append("x_train1_" + str(i) + ".npy")
        y_train1_filenames.append("y_train1_" + str(i) + ".npy")
        x_train2_filenames.append("x_train2_" + str(i) + ".npy")
        y_train2_filenames.append("y_train2_" + str(i) + ".npy")
        num_of_train_batches += len(x_train1) // BATCH_SIZE

    x_test1, y_test1 = load_data.get_depression_data(start_index=0,
                                                     end_index=0,
                                                     test_size=500)
    x_test2, y_test2 = load_data.get_bipolar_disorder_data(
        start_index=num_of_batches * data_per_iteration // 2,
        skiprows_start=(num_of_batches + 1) * data_per_iteration // 2 + 250,
        skiprows_end=(num_of_batches + 1) * data_per_iteration // 2 + 10**7 + 250,
        nrows=data_per_iteration,
        test_size=1)
    x_test1 = preprocessing.add_features_and_vectorize(
        x_test1, vectorize_function, embedding_index)
    x_test2 = preprocessing.add_features_and_vectorize(
        x_test2, vectorize_function, embedding_index)
    x_test1 = x_test1[:len(x_test2)]
    y_test1 = y_test1[:len(y_test2)]

    model = multitask1.get_multitask_model(
        (x_test1.shape[1], x_test1.shape[2]))
    multitask1.run_multitask(x_train1_filenames, x_test1, y_train1_filenames,
                             y_test1, x_train2_filenames, x_test2,
                             y_train2_filenames, y_test2, model,
                             fit_generator=True,
                             steps_per_epoch=num_of_train_batches)

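# A minimal sketch of the paired generator that run_multitask presumably uses
# when fit_generator=True: it walks the two tasks' filename lists in lockstep
# and yields one ([x1, x2], [y1, y2]) batch per step, matching a two-input,
# two-output Keras model. The name is hypothetical; the real generator lives
# inside multitask1.
def multitask_npy_batch_generator(x1_files, y1_files, x2_files, y2_files):
    import itertools
    for x1f, y1f, x2f, y2f in itertools.cycle(
            zip(x1_files, y1_files, x2_files, y2_files)):
        yield ([np.load(x1f), np.load(x2f)],
               [np.load(y1f), np.load(y2f)])
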
def multitask_smhd_memory_efficient(reload_data=True,
                                    data_per_iteration=2,
                                    num_of_load_iterations=2,
                                    num_of_train_batches=None,
                                    user_level=True):
    x_train1_filenames = []
    y_train1_filenames = []
    x_train2_filenames = []
    y_train2_filenames = []
    if reload_data:
        vectorize_function = preprocessing.vectorize_data_glove
        embedding_index = preprocessing.get_embeddings_index()
        num_of_train_batches = 0
        for i in range(num_of_load_iterations):
            if user_level:
                x0, x1, x2, x3, _ = load_smhd_datasets.get_smhd_data_user_level(
                    start_index=i * data_per_iteration,
                    end_index=(i + 1) * data_per_iteration)
            else:
                x0, x1, x2, x3, _ = load_smhd_datasets.get_smhd_data(
                    start_index=i * data_per_iteration,
                    end_index=(i + 1) * data_per_iteration)
            # Split the controls into thirds, one third per condition.
            x_train1, y_train1 = load_smhd_datasets.prepare_binary_data(
                x0[:len(x0) // 3], x1)
            x_train2, y_train2 = load_smhd_datasets.prepare_binary_data(
                x0[len(x0) // 3:2 * len(x0) // 3], x2)
            x_train3, y_train3 = load_smhd_datasets.prepare_binary_data(
                x0[2 * len(x0) // 3:], x3)
            x_train1 = preprocessing.add_features_and_vectorize(
                x_train1, vectorize_function, embedding_index)
            x_train2 = preprocessing.add_features_and_vectorize(
                x_train2, vectorize_function, embedding_index)
            x_train3 = preprocessing.add_features_and_vectorize(
                x_train3, vectorize_function, embedding_index)
            np.save("x_train1_" + str(i) + ".npy", x_train1)
            y_train_one_hot1 = preprocessing.class_one_hot(y_train1, 2)
            np.save("y_train1_" + str(i) + ".npy", y_train_one_hot1)
            np.save("x_train2_" + str(i) + ".npy", x_train2)
            y_train_one_hot2 = preprocessing.class_one_hot(y_train2, 2)
            np.save("y_train2_" + str(i) + ".npy", y_train_one_hot2)
            # The third task is cached too, although only tasks 1 and 2 are
            # passed to run_multitask below.
            np.save("x_train3_" + str(i) + ".npy", x_train3)
            y_train_one_hot3 = preprocessing.class_one_hot(y_train3, 2)
            np.save("y_train3_" + str(i) + ".npy", y_train_one_hot3)
            x_train1_filenames.append("x_train1_" + str(i) + ".npy")
            y_train1_filenames.append("y_train1_" + str(i) + ".npy")
            x_train2_filenames.append("x_train2_" + str(i) + ".npy")
            y_train2_filenames.append("y_train2_" + str(i) + ".npy")
            num_of_train_batches += len(x_train1) // BATCH_SIZE
        with open("num_of_train_batches.txt", "w") as f:
            f.write(str(num_of_train_batches))

        if user_level:
            x0, x1, x2, _, _ = load_smhd_datasets.get_smhd_data_user_level(
                set_='validation')
        else:
            x0, x1, x2, _, _ = load_smhd_datasets.get_smhd_data(
                set_='validation')
        x_test1, y_test1 = load_smhd_datasets.prepare_binary_data(
            x0[:len(x0) // 2], x1)
        x_test2, y_test2 = load_smhd_datasets.prepare_binary_data(
            x0[len(x0) // 2:], x2)
        x_test1 = preprocessing.add_features_and_vectorize(
            x_test1, vectorize_function, embedding_index)
        x_test2 = preprocessing.add_features_and_vectorize(
            x_test2, vectorize_function, embedding_index)
        x_test1 = x_test1[:len(x_test2)]
        y_test1 = y_test1[:len(y_test2)]
        x_test2 = x_test2[:len(x_test1)]
        y_test2 = y_test2[:len(y_test1)]
        print(len(y_test1))
        print(len(y_test2))
        np.save("x_test1.npy", x_test1)
        np.save("y_test1.npy", y_test1)
        np.save("x_test2.npy", x_test2)
        np.save("y_test2.npy", y_test2)
    else:
        for i in range(num_of_load_iterations):
            x_train1_filenames.append("x_train1_" + str(i) + ".npy")
            y_train1_filenames.append("y_train1_" + str(i) + ".npy")
            x_train2_filenames.append("x_train2_" + str(i) + ".npy")
            y_train2_filenames.append("y_train2_" + str(i) + ".npy")
        if isinstance(num_of_train_batches, str):
            # A string is treated as the path of a file holding the batch count.
            with open(num_of_train_batches, "r") as f:
                num_of_train_batches = int(f.readline())
        x_test1 = np.load("x_test1.npy")
        y_test1 = np.load("y_test1.npy")
        x_test2 = np.load("x_test2.npy")
        y_test2 = np.load("y_test2.npy")

    model = multitask1.get_multitask_model(
        (x_test1.shape[1], x_test1.shape[2]))
    acc1, f11, acc2, f12 = multitask1.run_multitask(
        x_train1_filenames, x_test1, y_train1_filenames, y_test1,
        x_train2_filenames, x_test2, y_train2_filenames, y_test2, model,
        fit_generator=True, steps_per_epoch=num_of_train_batches)
    return acc1, f11, acc2, f12

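# Example of the intended two-phase workflow (hypothetical invocation): the
# first call vectorizes the data and caches the batch files plus the batch
# count; later calls skip the expensive preprocessing and read everything back.
#
# multitask_smhd_memory_efficient(reload_data=True,
#                                 data_per_iteration=2,
#                                 num_of_load_iterations=2)
# multitask_smhd_memory_efficient(reload_data=False,
#                                 num_of_load_iterations=2,
#                                 num_of_train_batches="num_of_train_batches.txt")
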
def lstm_memory_efficient():
    vectorize_function = preprocessing.vectorize_data_glove
    embedding_index = preprocessing.get_embeddings_index()
    data_per_iteration = 5  # BATCH_SIZE * 10
    count = 0
    i = 0
    x_train_filenames = []
    y_train_filenames = []
    while count < TRAIN_SET_SIZE // BATCH_SIZE:
        # x_train, y_train = load_data.get_depression_data(
        #     start_index=i * data_per_iteration,
        #     end_index=(i + 1) * data_per_iteration, test_size=0)
        # x_train, y_train = load_data.get_bipolar_disorder_data(
        #     start_index=i * data_per_iteration // 2,
        #     skiprows_start=(i + 1) * data_per_iteration // 2,
        #     skiprows_end=(i + 1) * data_per_iteration // 2 + 10**7,
        #     nrows=data_per_iteration, test_size=0)
        x_train, y_train = load_data.get_rsdd_data(
            start_index=i * data_per_iteration,
            end_index=(i + 1) * data_per_iteration,
            set_="train")
        x_train = preprocessing.add_features_and_vectorize(
            x_train, vectorize_function, embedding_index)
        y_train_one_hot = preprocessing.class_one_hot(y_train, 2)
        print(x_train.shape)
        print(y_train_one_hot.shape)
        # Split the vectorized chunk into BATCH_SIZE-sized files on disk.
        for j in range(len(x_train) // BATCH_SIZE):
            np.save("x_train" + str(count) + ".npy",
                    x_train[j * BATCH_SIZE:(j + 1) * BATCH_SIZE])
            np.save("y_train" + str(count) + ".npy",
                    y_train_one_hot[j * BATCH_SIZE:(j + 1) * BATCH_SIZE])
            x_train_filenames.append("x_train" + str(count) + ".npy")
            y_train_filenames.append("y_train" + str(count) + ".npy")
            count += 1
        # Increment after processing so the first chunk (start_index=0) is
        # included; incrementing at the top of the loop skipped it.
        i += 1

    # x_test, y_test = load_data.get_bipolar_disorder_data(
    #     start_index=num_of_batches * data_per_iteration // 2,
    #     skiprows_start=(num_of_batches + 1) * data_per_iteration // 2,
    #     skiprows_end=(num_of_batches + 1) * data_per_iteration // 2 + 10**7,
    #     nrows=data_per_iteration, test_size=1)
    # x_test, y_test = load_data.get_depression_data(start_index=0,
    #                                                end_index=0, test_size=500)
    x_test, y_test = load_data.get_rsdd_data(end_index=5, set_="validation")
    x_test = preprocessing.add_features_and_vectorize(
        x_test, vectorize_function, embedding_index)
    np.save("x_test.npy", x_test)
    np.save("y_test.npy", y_test)
    x_test = np.load("x_test.npy")
    y_test = np.load("y_test.npy")

    model = get_lstm_model(use_embedding_layer=False,
                           input_shape=(x_test.shape[1], x_test.shape[2]))
    return run(x_train_filenames, x_test, y_train_filenames, y_test, model,
               fit_generator=True, steps_per_epoch=20)
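
# Hypothetical entry point, added for illustration only; the original module
# does not specify which experiment to run.
if __name__ == "__main__":
    lstm_memory_efficient()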