def test_shuffle_it(self):
    """Smoke test for ``shuffle_it`` applied on top of ``batch_it``.

    The integers 0..9 are batched four at a time, with padding enabled and
    ``-1`` as the padding element. The batch stream is then passed through
    ``shuffle_it`` with a buffer size of 3 and every resulting element is
    printed. The test makes no assertions; it only checks that the
    pipeline can be consumed without raising.
    """
    pad_value = -1
    numbers = list(range(10))

    batches = batch_it(numbers, size=4, padding=True, padding_elem=pad_value)
    for shuffled_batch in shuffle_it(batches, 3):
        print(shuffled_batch)
def data_pipeline(data, epochs=1, batch_size=args.batch_size, shuffle=False):
    """Turn ``data`` into a stream of unpadded batches.

    Args:
        data: the source passed to ``chunk_it``.
        epochs: when greater than 1, the chunked read is applied ``epochs``
            times through ``repeat_apply``; otherwise it is applied once.
        batch_size: number of samples per batch; chunks are read with a
            size of ``batch_size * 1000``.
        shuffle: if true, the sample stream goes through ``shuffle_it``
            using ``args.shuffle_buffer_size`` before batching.

    Returns:
        The iterator produced by ``batch_it`` with ``padding=False``.
    """
    def read_in_chunks(source):
        return chunk_it(source, chunk_size=batch_size * 1000)

    stream = (repeat_apply(read_in_chunks, data, epochs)
              if epochs > 1
              else read_in_chunks(data))

    if shuffle:
        stream = shuffle_it(stream, args.shuffle_buffer_size)

    return batch_it(stream, size=batch_size, padding=False)
def data_pipeline(hdf5_dataset, epochs=1, batch_size=args.batch_size, shuffle=args.shuffle):
    """Build the batch stream used to consume ``hdf5_dataset``.

    Args:
        hdf5_dataset: the dataset handed to ``chunk_it``.
        epochs: number of passes; values above 1 route the chunked read
            through ``repeat_apply``.
        batch_size: samples per batch; the chunk size is
            ``batch_size * 1000``.
        shuffle: if true, samples are shuffled with ``shuffle_it`` and a
            buffer of ``args.shuffle_buffer_size`` before batching.

    Returns:
        The iterator produced by ``batch_it`` with padding disabled.
    """
    def chunked(source):
        return chunk_it(source, chunk_size=batch_size * 1000)

    samples = (repeat_apply(chunked, hdf5_dataset, epochs)
               if epochs > 1
               else chunked(hdf5_dataset))

    if shuffle:
        samples = shuffle_it(samples, args.shuffle_buffer_size)

    # Batches are deliberately left unpadded: a zero-filled padding row
    # could be mistaken for a valid index and would distort the evaluation.
    batches = batch_it(samples, size=batch_size, padding=False)
    return batches
def test_chain_shuffle(self):
    """Chaining a plain and a shuffled repeated stream must keep all samples.

    Two streams are built from the same ``n_samples`` values, each one
    repeating the chunked data ``repeat`` times; the second one is also
    passed through ``shuffle_it``. After chaining them, every distinct value
    must still be present and all values must occur equally often, namely
    ``2 * repeat`` times (``repeat`` occurrences from each chained stream).
    """
    n_samples = 4
    repeat = 2
    v = np.arange(0, n_samples, 1)

    def chunk_fn(x):
        return chunk_it(x, chunk_size=2)

    # first chain is normal, second is shuffled from the two repetitions
    plain_it = repeat_apply(chunk_fn, v, repeat)
    shuffled_it = shuffle_it(repeat_apply(chunk_fn, v, repeat), buffer_size=8)
    data = list(chain_it(plain_it, shuffled_it))

    unique_data = np.unique(data)
    # bincount gives the occurrences per value; collapsing it with unique
    # leaves a single entry iff every value occurs the same number of times
    counts = np.unique(np.bincount(data))

    self.assertEqual(len(unique_data), n_samples)
    self.assertEqual(len(counts), 1)
    self.assertEqual(counts[0], 2 * repeat)
def corpus_pipeline(corpus_stream,
                    n_gram_size=args.ngram_size,
                    epochs=1,
                    batch_size=args.batch_size,
                    shuffle=args.shuffle,
                    flatten=False):
    """Corpus processing pipeline.

    Converts a corpus reader (a stream of sentences or words) into a stream
    of batches of n-grams whose words have been replaced by the ids returned
    by ``index.get_id``.

    Args:
        corpus_stream: the stream of sentences or words to process.
        n_gram_size: size of the sliding n-gram window.
        epochs: number of passes over the corpus; values above 1 wrap the
            n-gram stream with ``repeat_it``.
        batch_size: number of n-grams per batch.
        shuffle: if true, n-grams are shuffled with ``shuffle_it`` using a
            buffer of ``args.shuffle_buffer_size``.
        flatten: if true, the corpus is flattened first and the window
            slides over the resulting word stream (n-grams can cross
            sentence boundaries); otherwise windows are taken within each
            sentence and then flattened.

    Returns:
        The iterator produced by ``batch_it`` with ``padding=False``.
    """
    if not flatten:
        per_sentence = (window_it(sentence, n_gram_size) for sentence in corpus_stream)
        windows = flatten_it(per_sentence)
    else:
        windows = window_it(flatten_it(corpus_stream), n_gram_size)

    # from here on the stream yields n-grams; map every word to its id
    id_windows = ([index.get_id(word) for word in window] for window in windows)

    if epochs > 1:
        id_windows = repeat_it(id_windows, epochs)

    if shuffle:
        id_windows = shuffle_it(id_windows, args.shuffle_buffer_size)

    return batch_it(id_windows, size=batch_size, padding=False)
v = np.random.randint(2, size=[N, M]) v[..., C] = 0 v[R, C] = 1 # print("data:\n", v) # data pipeline batch_size = 1 epochs = 4 data = np.concatenate([v, labels], -1) data = repeat_it(data, 2) data = shuffle_it(iter(data), buffer_size=batch_size * 4) data = batch_it(data, batch_size) label_layer = tx.Input(1) in_layer = tx.Input(M) f1 = tx.FC(in_layer, 512, activation=tf.nn.tanh) f2 = tx.FC(f1, 512, activation=tf.nn.relu) fm = tx.Highway(f1, f2, carry_gate=True) out = tx.Linear(f2, 1) out_prob = tx.Activation(out, fn=tx.sigmoid) loss = tx.binary_cross_entropy(labels=label_layer.tensor, logits=out.tensor) model = tx.Model(run_inputs=in_layer,
def test_nce_nrp(self):
    """Smoke test: train an NRP model on a tiny repeated dataset.

    Builds a sign index over a synthetic vocabulary, constructs an ``NRP``
    model (with NCE labels when ``use_nce`` is true, id labels otherwise)
    and runs ``runner.train`` over batches drawn from six hand-written
    (context, label) rows repeated ``n`` times. No assertions are made on
    the loss; the test only checks that the training loop runs. The session
    is closed even if training raises.
    """
    vocab_size = 1000
    k = 500
    s = 8
    embed_size = 128
    nce_samples = 10
    noise_ratio = 0.1
    use_nce = True

    vocab = [str(i) for i in range(vocab_size)]

    generator = Generator(k, s)
    sign_index = TrieSignIndex(generator, vocabulary=vocab, pregen_indexes=True)
    ris = [
        sign_index.get_ri(sign_index.get_sign(i))
        for i in range(len(sign_index))
    ]
    ri_tensor = ris_to_sp_tensor_value(ri_seq=ris, dim=k, all_positive=False)
    ri_tensor_input = tx.SparseInput(n_units=k, value=ri_tensor)

    if use_nce:
        label_inputs = tx.SparseInput(k, name="target_random_indices")
    else:
        label_inputs = [
            tx.Input(1, dtype=tf.int64, name="ids"),
            tx.InputParam(dtype=tf.int32, value=vocab_size, name="vocab_size")
        ]

    eval_label_inputs = [
        tx.Input(1, dtype=tf.int64, name="ids_eval"),
        tx.InputParam(dtype=tf.int32, value=vocab_size, name="vocab_size")
    ]

    model = NRP(
        run_inputs=tx.SparseInput(n_units=k, name="random_index_inputs"),
        label_inputs=label_inputs,
        eval_label_input=eval_label_inputs,
        ctx_size=2,
        k_dim=k,
        ri_tensor_input=ri_tensor_input,  # current dictionary state
        embed_dim=embed_size,
        h_dim=128,
        num_h=1,
        h_activation=tx.relu,
        use_dropout=True,
        embed_dropout=True,
        keep_prob=0.70,
        use_nce=use_nce,
        nce_samples=nce_samples,
        nce_noise_amount=noise_ratio,
        noise_input=tx.SparseInput(k, name="noise"))

    tf.summary.histogram("embeddings", model.embeddings.weights)
    for h in model.h_layers:
        tf.summary.histogram("h", h.linear.weights)

    runner = tx.ModelRunner(model)
    runner.set_log_dir("/tmp")
    runner.log_graph()

    options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    runner.set_session(runtime_stats=True, run_options=options)

    try:
        # plain SGD with a learning rate of 0.025 and per-gradient norm clipping
        lr = tx.InputParam(value=0.025)
        runner.config_optimizer(
            tf.train.GradientDescentOptimizer(learning_rate=lr.tensor),
            optimizer_params=lr,
            global_gradient_op=False,
            gradient_op=lambda grad: tf.clip_by_norm(grad, 1.0))

        data = np.array([[0, 2], [5, 7], [9, 8], [3, 4], [1, 9], [12, 8]])
        labels = np.array([[32], [56], [12], [2], [5], [23]])

        n = 256
        batch_size = 128

        # each row is (ctx word, ctx word, label)
        rows = np.column_stack((data, labels))

        dataset = views.repeat_it([rows], n)
        dataset = views.flatten_it(dataset)
        # shuffle with a buffer of 6 elements
        dataset = views.shuffle_it(dataset, 6)
        dataset = views.batch_it(dataset, batch_size)

        # progress-bar length: the row count is taken from the data itself
        # instead of being hard-coded, rounded up to whole batches
        # (assumes the repeated, flattened stream yields n * len(rows) rows)
        total_rows = n * len(rows)
        n_batches = (total_rows + batch_size - 1) // batch_size

        for batch in tqdm(dataset, total=n_batches):
            sample = np.array(batch)

            # every column but the last holds the context word ids
            ctx = sample[:, :-1].flatten()
            ctx_ris = [sign_index.get_ri(sign_index.get_sign(i)) for i in ctx]
            ctx_ris = ris_to_sp_tensor_value(
                ctx_ris,
                dim=sign_index.feature_dim(),
                all_positive=not sign_index.generator.symmetric)

            # the last column holds the label ids (kept 2-D for the id-label path)
            lbl_ids = sample[:, -1:]
            lbl = lbl_ids.flatten()

            if use_nce:
                lbl_ris = [
                    sign_index.get_ri(sign_index.get_sign(i)) for i in lbl
                ]
                lbl_ris = ris_to_sp_tensor_value(
                    lbl_ris,
                    dim=sign_index.feature_dim(),
                    all_positive=not sign_index.generator.symmetric)

                # nce_samples noise entries are requested per label row
                noise = generate_noise(k_dim=k,
                                       batch_size=lbl_ris.dense_shape[0] * nce_samples,
                                       ratio=noise_ratio)
                runner.train(ctx_ris, [lbl_ris, noise],
                             output_loss=True,
                             write_summaries=True)
            else:
                runner.train(model_input_data=ctx_ris,
                             loss_input_data=lbl_ids,
                             output_loss=True,
                             write_summaries=True)
    finally:
        # release the session even when optimizer setup or training fails
        runner.close_session()