Example #1
# Standard-library and Keras imports assumed by this snippet; docs_generator,
# text_generator and utils are project-specific helpers defined elsewhere.
import os
import numpy as np
from tqdm import tqdm
from keras.preprocessing import text
from keras.preprocessing.sequence import skipgrams


def generate_train_data(data_dir, output_dir, window=5):
    docs_gen = docs_generator(data_dir)
    docs_gen_ = docs_generator(data_dir)
    tokenizer = text.Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(text_generator(docs_gen))

    word2id = tokenizer.word_index
    id2word = {v: k for k, v in word2id.items()}

    vocab_size = len(word2id) + 1
    print('Vocabulary Size:', vocab_size)
    print('Vocabulary Sample:', list(word2id.items())[:10])

    wids = [[word2id[w] for w in text.text_to_word_sequence(line)]
            for line in text_generator(docs_gen_)]

    train_data = []

    for wid in tqdm(wids, desc='Generating skip-gram samples'):
        pairs, labels = skipgrams(wid,
                                  vocabulary_size=vocab_size,
                                  window_size=window)

        for pair, label in zip(pairs, labels):
            train_data.append([pair[0], pair[1], label])

    train_data = np.array(train_data)
    print(train_data[:5])

    utils.save_numpy(train_data, os.path.join(output_dir, 'train_data.npy'))
    utils.save_pickle(wids, os.path.join(output_dir, 'word_ids_sent.pkl'))
    utils.save_pickle(word2id, os.path.join(output_dir, 'word2id.pkl'))
    utils.save_pickle(id2word, os.path.join(output_dir, 'id2word.pkl'))
    utils.save_pickle(tokenizer, os.path.join(output_dir, 'tokenizer.pkl'))
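The utils.save_numpy and utils.save_pickle helpers used above are project-specific and not shown on this page. A minimal sketch of what they might look like, assuming save_numpy takes the array first and the target path second (as in this example) and that save_pickle simply pickles the object to the given path:

import os
import pickle
import numpy as np

def save_numpy(array, path):
    # Ensure the parent directory exists, then persist the array with np.save.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    np.save(path, array)

def save_pickle(obj, path):
    # Serialize an arbitrary Python object (tokenizer, dicts, ...) with pickle.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f)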
Example #2
def prepare_data(input_train, input_test, num_neighbors, num_points):
    utils.pretty_print("PREPARING THE DATA...")
    scaled_laplacian_train = prepare_graph(input_train, num_neighbors,
                                           num_points)
    scaled_laplacian_test = prepare_graph(input_test, num_neighbors,
                                          num_points)
    utils.save_numpy([scaled_laplacian_train, scaled_laplacian_test],
                     ['scaled_laplacian_train', 'scaled_laplacian_test'])
    return scaled_laplacian_train, scaled_laplacian_test
Example #3
def load_data(num_points):

    # load the train and test protein names
    with open('../data/train.txt', 'r') as train_file:
        train_proteins = train_file.readlines()  # note: includes newlines
    with open('../data/test.txt', 'r') as test_file:
        test_proteins = test_file.readlines()

    utils.pretty_print("LOADING THE TRAINING SET...")
    input_train, train_labels = get_atoms(train_proteins, num_points)
    utils.pretty_print("LOADING THE TESTING SET...")
    input_test, test_labels = get_atoms(test_proteins, num_points)

    utils.save_numpy(
        [input_train, train_labels, input_test, test_labels],
        ['input_train', 'train_labels', 'input_test', 'test_labels'])
    return input_train, train_labels, input_test, test_labels
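Examples #2 and #3 call utils.save_numpy with a list of arrays and a parallel list of names rather than a single array and a path, so that project evidently uses a different signature. A minimal sketch of such a variant, assuming a fixed output directory (the directory name here is hypothetical):

import os
import numpy as np

def save_numpy(arrays, names, out_dir='../data'):
    # Save each array in `arrays` under the matching entry of `names`.
    for array, name in zip(arrays, names):
        np.save(os.path.join(out_dir, name + '.npy'), array)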
Example #4
File: main.py  Project: pitcany/hitachi
def save_images(policy_return, old_observation, id_tmp_dir, j, step, _run, log):
    """
    Save images predicted by model to database.
    Args:
        policy_return: NamedTuple returned by actor critic. Contains:
            - predicted_obs_img: Averaged prediction
            - particle_obs_img: Predictions for each particle
        old_observation: Ground truth observation at current time
        id_tmp_dir: Working directory
        j: Current gradient update step
        step: Current step of the n_s steps of n-step A2C
        _run, log: Provided by sacred
    """

    for dt, img, p_img in zip(log['predicted_times'],
                              policy_return.predicted_obs_img,
                              policy_return.particle_obs_img):
        utils.save_numpy(
            dir=id_tmp_dir,
            name="update{}_step{}_dt{}.npy".format(j, step, dt),
            array=img.detach().cpu().numpy(),
            _run=_run)

        if log['save_particle_reconstruction']:
            utils.save_numpy(
                dir=id_tmp_dir,
                name="update{}_step{}_dt{}_particles.npy".format(j, step, dt),
                array=p_img.detach().cpu().numpy(),
                _run=_run)

    # Save the ground-truth observation once per step; it does not depend on dt.
    utils.save_numpy(
        dir=id_tmp_dir,
        name="update{}_step{}_obs.npy".format(j, step),
        array=old_observation.cpu().numpy(),
        _run=_run)
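Example #4 uses yet another signature: keyword arguments dir, name and array plus a sacred _run handle. A minimal sketch of a variant that writes the file and attaches it to the sacred run as an artifact (Run.add_artifact is part of sacred's public API; everything else here is an assumption):

import os
import numpy as np

def save_numpy(dir, name, array, _run=None):
    # Write the array to dir/name and, if a sacred run is given,
    # register the resulting file as an artifact of that run.
    path = os.path.join(dir, name)
    np.save(path, array)
    if _run is not None:
        _run.add_artifact(path)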
Example #5
from keras.callbacks import ModelCheckpoint

# Generators (DataGenerator, get_model, save_numpy and save_preds are
# project-specific helpers defined elsewhere in the project)
training_generator = DataGenerator(training_ids, labels, **params)
params['batch_size'] = 128
validation_generator = DataGenerator(validation_ids, labels, **params)

# Design model
model = get_model(params, True)
model.summary()  # summary() already prints the architecture

# Train
checkpointer = ModelCheckpoint(filepath='model.h5', verbose=2,
                               save_best_only=True, save_weights_only=False)
model.fit_generator(generator=training_generator,
                    validation_data=validation_generator,
                    callbacks=[checkpointer],
                    epochs=1)

# Predict
params['shuffle'] = False
params['augment'] = False
params['batch_size'] = 1
params['dir_path'] = '/tmp/human_atlas/testing_data/'
testing_generator = DataGenerator(testing_ids, labels=None, **params)

preds = model.predict_generator(testing_generator,
                                steps=len(testing_ids),
                                verbose=1)
save_numpy(preds)
save_preds(preds, testing_ids)
print(preds, preds.shape, len(testing_ids))
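Example #5 builds its generators from a project-specific DataGenerator class that is not shown here. A common way to implement such a class is as a keras.utils.Sequence subclass whose constructor accepts batch_size, shuffle and similar options via **params; a minimal sketch under that assumption (load_sample and the label lookup are hypothetical placeholders):

import numpy as np
from keras.utils import Sequence

def load_sample(sample_id):
    # Placeholder loader; a real generator would read the image for sample_id
    # from dir_path on disk.
    return np.zeros((64, 64, 3), dtype=np.float32)

class DataGenerator(Sequence):
    def __init__(self, ids, labels=None, batch_size=32, shuffle=True, **kwargs):
        self.ids = list(ids)
        self.labels = labels
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        # Number of batches per epoch.
        return int(np.ceil(len(self.ids) / self.batch_size))

    def on_epoch_end(self):
        # Reshuffle the sample order between epochs when requested.
        self.indexes = np.arange(len(self.ids))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __getitem__(self, index):
        # Assemble one batch of inputs (and labels when available).
        batch = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        X = np.stack([load_sample(self.ids[i]) for i in batch])
        if self.labels is None:
            return X
        y = np.stack([self.labels[self.ids[i]] for i in batch])
        return X, y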