def generate_train_data(data_dir, output_dir, window=5):
    # Two generators are needed because the first one is exhausted
    # by tokenizer.fit_on_texts().
    docs_gen = docs_generator(data_dir)
    docs_gen_ = docs_generator(data_dir)

    # Build the vocabulary. Note: Keras' word_index always contains the
    # full vocabulary; num_words only caps methods like texts_to_sequences.
    tokenizer = text.Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(text_generator(docs_gen))
    word2id = tokenizer.word_index
    id2word = {v: k for k, v in word2id.items()}
    vocab_size = len(word2id) + 1  # +1 for the reserved padding index 0
    print('Vocabulary Size:', vocab_size)
    print('Vocabulary Sample:', list(word2id.items())[:10])

    # Convert each line of text into a sequence of word ids.
    wids = [[word2id[w] for w in text.text_to_word_sequence(line)]
            for line in text_generator(docs_gen_)]

    # Generate (target, context, label) skip-gram training triples.
    train_data = []
    for wid in tqdm(wids, 'Generating skip gram samples:'):
        pairs, labels = skipgrams(wid, vocabulary_size=vocab_size,
                                  window_size=window)
        for pair, label in zip(pairs, labels):
            train_data.append([pair[0], pair[1], label])
    train_data = np.array(train_data)
    print(train_data[:5])

    # Persist the training triples and the vocabulary mappings.
    utils.save_numpy(train_data, os.path.join(output_dir, 'train_data.npy'))
    utils.save_pickle(wids, os.path.join(output_dir, 'word_ids_sent.pkl'))
    utils.save_pickle(word2id, os.path.join(output_dir, 'word2id.pkl'))
    utils.save_pickle(id2word, os.path.join(output_dir, 'id2word.pkl'))
    utils.save_pickle(tokenizer, os.path.join(output_dir, 'tokenizer.pkl'))
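# docs_generator and text_generator are assumed from their call sites rather
# than shown here. A minimal sketch of what they might look like (the file
# iteration details are guesses, not the actual implementation):
import os

def docs_generator(data_dir):
    # Yield one document path at a time from the data directory.
    for fname in sorted(os.listdir(data_dir)):
        yield os.path.join(data_dir, fname)

def text_generator(docs_gen):
    # Yield one stripped line of text at a time across all documents.
    for path in docs_gen:
        with open(path, 'r') as f:
            for line in f:
                yield line.strip()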
def prepare_data(input_train, input_test, num_neighbors, num_points):
    utils.pretty_print("PREPARING THE DATA...")
    scaled_laplacian_train = prepare_graph(input_train, num_neighbors, num_points)
    scaled_laplacian_test = prepare_graph(input_test, num_neighbors, num_points)
    utils.save_numpy([scaled_laplacian_train, scaled_laplacian_test],
                     ['scaled_laplacian_train', 'scaled_laplacian_test'])
    return scaled_laplacian_train, scaled_laplacian_test
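# prepare_graph is not defined in this file. A common way to produce a
# "scaled Laplacian" for graph convolutions, and a plausible reading of the
# call above, is: build a k-nearest-neighbor adjacency over each point cloud,
# form the symmetric normalized Laplacian L = I - D^{-1/2} A D^{-1/2}, then
# rescale to L_s = 2L / lambda_max - I so the spectrum lies in [-1, 1]
# (the ChebNet convention). A minimal sketch under those assumptions:
import numpy as np
from scipy.spatial.distance import cdist

def prepare_graph_sketch(point_clouds, num_neighbors, num_points):
    laplacians = np.zeros((len(point_clouds), num_points, num_points))
    for i, cloud in enumerate(point_clouds):
        dists = cdist(cloud, cloud)  # pairwise distances, (N, N)
        # k nearest neighbors per point, excluding self (column 0).
        nn = np.argsort(dists, axis=1)[:, 1:num_neighbors + 1]
        adj = np.zeros((num_points, num_points))
        rows = np.repeat(np.arange(num_points), num_neighbors)
        adj[rows, nn.ravel()] = 1.0
        adj = np.maximum(adj, adj.T)  # symmetrize
        deg = adj.sum(axis=1)
        d_inv_sqrt = np.zeros_like(deg)
        nz = deg > 0
        d_inv_sqrt[nz] = deg[nz] ** -0.5
        lap = np.eye(num_points) - d_inv_sqrt[:, None] * adj * d_inv_sqrt[None, :]
        lam_max = np.linalg.eigvalsh(lap).max()
        laplacians[i] = 2.0 * lap / lam_max - np.eye(num_points)
    return laplacians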
def load_data(num_points):
    # Load the train and test protein names, stripping trailing newlines
    # (readlines() would keep them).
    with open('../data/train.txt', 'r') as train_file:
        train_proteins = train_file.read().splitlines()
    with open('../data/test.txt', 'r') as test_file:
        test_proteins = test_file.read().splitlines()

    utils.pretty_print("LOADING THE TRAINING SET...")
    input_train, train_labels = get_atoms(train_proteins, num_points)
    utils.pretty_print("LOADING THE TESTING SET...")
    input_test, test_labels = get_atoms(test_proteins, num_points)

    utils.save_numpy(
        [input_train, train_labels, input_test, test_labels],
        ['input_train', 'train_labels', 'input_test', 'test_labels'])
    return input_train, train_labels, input_test, test_labels
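# Example of wiring the two steps together. NUM_POINTS and NUM_NEIGHBORS are
# illustrative values, not taken from the original configuration:
if __name__ == '__main__':
    NUM_POINTS = 1024
    NUM_NEIGHBORS = 8
    input_train, train_labels, input_test, test_labels = load_data(NUM_POINTS)
    scaled_laplacian_train, scaled_laplacian_test = prepare_data(
        input_train, input_test, NUM_NEIGHBORS, NUM_POINTS)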
def save_images(policy_return, old_observation, id_tmp_dir, j, step, _run, log):
    """
    Save images predicted by model to database.

    Args:
        policy_return: NamedTuple returned by actor critic. Contains:
            - predicted_obs_img: Averaged prediction
            - particle_obs_img: Predictions for each particle
        old_observation: Ground truth observation at current time
        id_tmp_dir: Working directory
        j: Current gradient update step
        step: Current step of the n_s steps of n-step A2C
        _run, log: Provided by sacred
    """
    for dt, img, p_img in zip(log['predicted_times'],
                              policy_return.predicted_obs_img,
                              policy_return.particle_obs_img):
        # Prediction averaged over particles, one file per predicted time dt.
        utils.save_numpy(
            dir=id_tmp_dir,
            name="update{}_step{}_dt{}.npy".format(j, step, dt),
            array=img.detach().cpu().numpy(),
            _run=_run)
        if log['save_particle_reconstruction']:
            # Per-particle predictions.
            utils.save_numpy(
                dir=id_tmp_dir,
                name="update{}_step{}_dt{}_particles.npy".format(j, step, dt),
                array=p_img.detach().cpu().numpy(),
                _run=_run)

    # The ground truth observation is saved once per (update, step).
    utils.save_numpy(
        dir=id_tmp_dir,
        name="update{}_step{}_obs.npy".format(j, step),
        array=old_observation.cpu().numpy(),
        _run=_run)
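# utils.save_numpy is assumed here from its keyword arguments (dir, name,
# array, _run). A plausible sketch: write the array to disk, then register
# the file with sacred's Run.add_artifact so it lands in the experiment
# database, which would match the "save to database" behavior described in
# the docstring above:
import os
import numpy as np

def save_numpy_sketch(dir, name, array, _run):
    path = os.path.join(dir, name)
    np.save(path, array)      # writes exactly to `path` (already ends in .npy)
    _run.add_artifact(path)   # attach the file to the sacred run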
# Generators
training_generator = DataGenerator(training_ids, labels, **params)
params['batch_size'] = 128  # validation runs with a larger batch size
validation_generator = DataGenerator(validation_ids, labels, **params)

# Design model
model = get_model(params, True)
model.summary()  # summary() prints itself and returns None

# Train
checkpointer = ModelCheckpoint(filepath='model.h5', verbose=2,
                               save_best_only=True, save_weights_only=False)
model.fit_generator(generator=training_generator,
                    validation_data=validation_generator,
                    callbacks=[checkpointer],
                    epochs=1)

# Predict
params['shuffle'] = False
params['augment'] = False
params['batch_size'] = 1
params['dir_path'] = '/tmp/human_atlas/testing_data/'
testing_generator = DataGenerator(testing_ids, labels=None, **params)
preds = model.predict_generator(testing_generator,
                                steps=len(testing_ids),
                                verbose=1)
save_numpy(preds)
save_preds(preds, testing_ids)
print(preds, preds.shape, len(testing_ids))
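# DataGenerator is assumed from its call sites (ids, labels, and the
# batch_size / shuffle / augment / dir_path entries of params). A minimal
# keras.utils.Sequence skeleton under those assumptions; load_image and
# augment_batch are hypothetical placeholders, not the project's actual
# helpers:
import numpy as np
from keras.utils import Sequence

class DataGeneratorSketch(Sequence):
    def __init__(self, ids, labels=None, batch_size=32, shuffle=True,
                 augment=False, dir_path='', **kwargs):
        self.ids, self.labels = ids, labels
        self.batch_size, self.shuffle = batch_size, shuffle
        self.augment, self.dir_path = augment, dir_path
        self.on_epoch_end()

    def __len__(self):
        # Number of batches per epoch.
        return int(np.ceil(len(self.ids) / self.batch_size))

    def on_epoch_end(self):
        # Reshuffle sample order between epochs when training.
        self.indexes = np.arange(len(self.ids))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __getitem__(self, index):
        idx = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        X = np.stack([load_image(self.dir_path, self.ids[i]) for i in idx])
        if self.augment:
            X = augment_batch(X)  # hypothetical augmentation helper
        if self.labels is None:   # prediction mode: no targets
            return X
        y = np.stack([self.labels[self.ids[i]] for i in idx])
        return X, y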