def store_ngrams(corpus_stream, name):
    # if flatten, n-grams are built across sentence boundaries
    if args.flatten:
        word_it = flatten_it(corpus_stream)
        ngrams = window_it(word_it, args.n)
    else:
        sentence_ngrams = (window_it(sentence, args.n) for sentence in corpus_stream)
        ngrams = flatten_it(sentence_ngrams)

    # map each n-gram of words to a list of vocabulary ids
    ngram_ids = [[vocab[w] for w in ngram] for ngram in ngrams]
    ngrams = np.array(ngram_ids)

    # store the id matrix in the open HDF5 file, one dataset per corpus split
    dataset = hdf5_file.create_dataset(name, data=ngrams, compression="gzip")
    dataset.attrs['n'] = args.n
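# --- Hedged usage sketch ------------------------------------------------------
# Self-contained illustration of what store_ngrams does end to end: slide a
# fixed-size window over tokenised sentences, map each word to its vocabulary
# id, and persist the resulting id matrix as a compressed HDF5 dataset. The
# window() helper, the toy sentences, and the output path below are stand-ins
# for this sketch only; they are not part of the original script.
import itertools

import h5py
import marisa_trie
import numpy as np


def window(seq, n):
    """Yield successive n-sized sliding windows over a sequence."""
    seq = list(seq)
    for i in range(len(seq) - n + 1):
        yield seq[i:i + n]


sentences = [["the", "cat", "sat", "<eos>"], ["the", "dog", "ran", "<eos>"]]
toy_vocab = marisa_trie.Trie(sorted({w for s in sentences for w in s}))

# n-grams within each sentence, flattened into a single stream of windows
ngram_stream = itertools.chain.from_iterable(window(s, 3) for s in sentences)
ngram_ids = np.array([[toy_vocab[w] for w in ng] for ng in ngram_stream])

with h5py.File("/tmp/ngrams_example.hdf5", mode="w") as f:
    ds = f.create_dataset("training", data=ngram_ids, compression="gzip")
    ds.attrs["n"] = 3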
def corpus_pipeline(corpus_stream,
                    n_gram_size=args.ngram_size,
                    epochs=1,
                    batch_size=args.batch_size,
                    shuffle=args.shuffle,
                    flatten=False):
    """ Corpus Processing Pipeline.

    Transforms the corpus reader -a stream of sentences or words- into a stream of n-gram batches.

    Args:
        corpus_stream: the stream of sentences or words
        n_gram_size: the size of the n-gram window
        epochs: number of epochs we want to iterate over this corpus
        batch_size: batch size for the n-gram batch
        shuffle: if true, shuffles the n-grams according to a buffer size
        flatten: if true, sliding windows are applied over a stream of words rather than within each
            sentence (n-grams can cross sentence boundaries)
    """
    if flatten:
        word_it = flatten_it(corpus_stream)
        n_grams = window_it(word_it, n_gram_size)
    else:
        sentence_n_grams = (window_it(sentence, n_gram_size) for sentence in corpus_stream)
        n_grams = flatten_it(sentence_n_grams)

    # at this point n_grams is an iterator over n-grams of words;
    # convert each n-gram into a sequence of vocabulary ids
    n_grams = ([index.get_id(w) for w in ngram] for ngram in n_grams)

    if epochs > 1:
        n_grams = repeat_it(n_grams, epochs)
    if shuffle:
        n_grams = shuffle_it(n_grams, args.shuffle_buffer_size)

    n_grams = batch_it(n_grams, size=batch_size, padding=False)
    return n_grams
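# --- Hedged sketch of the shuffle/batch stages --------------------------------
# The two stages that follow windowing in corpus_pipeline are buffer-based
# shuffling and fixed-size batching. shuffle_buffer and batch below are
# simplified stand-ins for the project's shuffle_it and batch_it iterators,
# written only to show the streaming behaviour; they are not the originals.
import random
from itertools import islice


def shuffle_buffer(iterable, buffer_size, seed=42):
    """Approximately shuffle a stream by sampling from a bounded buffer."""
    rng = random.Random(seed)
    it = iter(iterable)
    buffer = list(islice(it, buffer_size))
    for item in it:
        i = rng.randrange(len(buffer))
        yield buffer[i]
        buffer[i] = item
    rng.shuffle(buffer)
    yield from buffer


def batch(iterable, size):
    """Group a stream into lists of at most `size` items (no padding)."""
    it = iter(iterable)
    while True:
        chunk = list(islice(it, size))
        if not chunk:
            return
        yield chunk


ngram_id_stream = ([i, i + 1, i + 2] for i in range(10))
for b in batch(shuffle_buffer(ngram_id_stream, buffer_size=4), size=3):
    print(b)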
if __name__ == '__main__':
    data_path = os.path.join(os.getenv("HOME"), "data/datasets/ptb/")
    corpus = PTBReader(path=data_path, mark_eos=True)
    corpus_stats = h5py.File(os.path.join(data_path, "ptb_stats.hdf5"), mode='r')
    vocab = marisa_trie.Trie(corpus_stats["vocabulary"])

    batch_size = 4
    num_steps = 3

    # take the first 1000 words of the training set
    data = [word for word in it.take_it(it.flatten_it(corpus.training_set(1000)), 1000)]
    data = iter(c for c in it.flatten_it(data))
    print(next(data))

    # Starting from sequential data, batchify arranges the dataset into columns.
    # For instance, with the alphabet as the sequence and batch size 4, we'd get
    # ┌ a g m s ┐
    # │ b h n t │
    # │ c i o u │
    # │ d j p v │
    # │ e k q w │
    # └ f l r x ┘
    # These columns are treated as independent by the model, which means that the
    # dependence of e.g. 'g' on 'f' cannot be learned, but allows more efficient
    # batch processing.
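    # --- Hedged sketch of batchify ---------------------------------------------
    # batchify itself is not defined in this excerpt; the version below follows
    # the usual formulation implied by the comment above (trim the sequence to a
    # multiple of batch_size, then reshape so each column is a contiguous slice
    # of the data). The alphabet example reproduces the matrix drawn above.
    def batchify(sequence, batch_size):
        """Arrange a 1-D sequence into a (seq_len // batch_size, batch_size)
        matrix whose columns are contiguous, independent slices of the data."""
        seq = np.asarray(sequence)
        n_batches = len(seq) // batch_size
        seq = seq[:n_batches * batch_size]
        return seq.reshape(batch_size, n_batches).T

    letters = list("abcdefghijklmnopqrstuvwx")
    print(batchify(letters, batch_size=4))
    # column 0 is 'a'..'f', column 1 is 'g'..'l', and so on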
                    type=str, default="text8")
parser.add_argument('-vocab_limit', dest="vocab_limit", type=int, default=70000)
parser.add_argument('-unk_token', dest="unk_token", type=str, default="<UNK>")
args = parser.parse_args()

dataset = os.path.join(args.data_dir, "text8.txt")
corpus = Text8Corpus(dataset, sentence_length=1000)

# count every word occurrence in the corpus
word_counter = Counter()
for word in tqdm(iterators.flatten_it(corpus)):
    word_counter[word] += 1
print("total words ", sum(word_counter.values()))

# keep the vocab_limit most frequent words in a trie
sorted_counts = word_counter.most_common(args.vocab_limit)
word_list, _ = zip(*sorted_counts)
vocab = marisa_trie.Trie(word_list)
print("vocab size: ", len(vocab))

# re-count: words outside the vocabulary are accumulated under the UNK token
new_counter = Counter()
for word in word_counter.keys():
    if word in vocab:
        new_counter[word] += word_counter[word]
    else:
        new_counter[args.unk_token] += word_counter[word]
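# --- Hedged illustration of the frequency cutoff -------------------------------
# Self-contained toy version of the re-counting step above: words that fall
# outside the most-common limit have their counts folded into a single UNK
# entry, so the total word count is preserved. The toy counts, limit, and UNK
# token below are made up for this sketch, not taken from the text8 run.
from collections import Counter

import marisa_trie

toy_counts = Counter({"the": 5, "cat": 3, "sat": 2, "mat": 1, "zzz": 1})
limit, unk = 3, "<UNK>"

kept, _ = zip(*toy_counts.most_common(limit))
toy_vocab = marisa_trie.Trie(kept)

remapped = Counter()
for word, count in toy_counts.items():
    remapped[word if word in toy_vocab else unk] += count

assert sum(remapped.values()) == sum(toy_counts.values())
print(remapped)  # Counter({'the': 5, 'cat': 3, 'sat': 2, '<UNK>': 2})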
def test_nce_nrp(self):
    vocab_size = 1000
    k = 500
    s = 8
    embed_size = 128
    nce_samples = 10
    noise_ratio = 0.1
    use_nce = True

    vocab = [str(i) for i in range(vocab_size)]

    generator = Generator(k, s)
    sign_index = TrieSignIndex(generator, vocabulary=vocab, pregen_indexes=True)

    # random indexes for every sign currently in the index
    ris = [sign_index.get_ri(sign_index.get_sign(i)) for i in range(len(sign_index))]
    ri_tensor = ris_to_sp_tensor_value(ri_seq=ris, dim=k, all_positive=False)
    ri_tensor_input = tx.SparseInput(n_units=k, value=ri_tensor)

    if use_nce:
        label_inputs = tx.SparseInput(k, name="target_random_indices")
    else:
        label_inputs = [
            tx.Input(1, dtype=tf.int64, name="ids"),
            tx.InputParam(dtype=tf.int32, value=vocab_size, name="vocab_size")
        ]

    eval_label_inputs = [
        tx.Input(1, dtype=tf.int64, name="ids_eval"),
        tx.InputParam(dtype=tf.int32, value=vocab_size, name="vocab_size")
    ]

    model = NRP(run_inputs=tx.SparseInput(n_units=k, name="random_index_inputs"),
                label_inputs=label_inputs,
                eval_label_input=eval_label_inputs,
                ctx_size=2,
                k_dim=k,
                ri_tensor_input=ri_tensor_input,  # current dictionary state
                embed_dim=embed_size,
                h_dim=128,
                num_h=1,
                h_activation=tx.relu,
                use_dropout=True,
                embed_dropout=True,
                keep_prob=0.70,
                use_nce=use_nce,
                nce_samples=nce_samples,
                nce_noise_amount=noise_ratio,
                noise_input=tx.SparseInput(k, name="noise"))

    tf.summary.histogram("embeddings", model.embeddings.weights)
    for h in model.h_layers:
        tf.summary.histogram("h", h.linear.weights)

    runner = tx.ModelRunner(model)
    runner.set_log_dir("/tmp")
    runner.log_graph()

    options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    runner.set_session(runtime_stats=True, run_options=options)

    # SGD with lr 0.025 and per-variable gradient clipping
    lr = tx.InputParam(value=0.025)
    runner.config_optimizer(tf.train.GradientDescentOptimizer(learning_rate=lr.tensor),
                            optimizer_params=lr,
                            global_gradient_op=False,
                            gradient_op=lambda grad: tf.clip_by_norm(grad, 1.0))

    data = np.array([[0, 2], [5, 7], [9, 8], [3, 4], [1, 9], [12, 8]])
    labels = np.array([[32], [56], [12], [2], [5], [23]])

    ppl_curve = []
    n = 256
    batch_size = 128

    # repeat the toy dataset n times, shuffle with a small buffer, and batch it
    dataset = np.column_stack((data, labels))
    dataset = views.repeat_it([dataset], n)
    dataset = views.flatten_it(dataset)
    # shuffle with a buffer of 6 rows
    dataset = views.shuffle_it(dataset, 6)
    dataset = views.batch_it(dataset, batch_size)

    data_stream = dataset
    for batch in tqdm(data_stream, total=n * len(data) // batch_size):
        sample = np.array(batch)

        # context ids are all columns but the last; convert them to random indexes
        ctx = sample[:, :-1]
        ctx = ctx.flatten()
        ctx_ris = [sign_index.get_ri(sign_index.get_sign(i)) for i in ctx]
        ctx_ris = ris_to_sp_tensor_value(ctx_ris,
                                         dim=sign_index.feature_dim(),
                                         all_positive=not sign_index.generator.symmetric)

        # target ids are the last column
        lbl_ids = sample[:, -1:]
        lbl = lbl_ids.flatten()

        if use_nce:
            lbl_ris = [sign_index.get_ri(sign_index.get_sign(i)) for i in lbl]
            lbl_ris = ris_to_sp_tensor_value(lbl_ris,
                                             dim=sign_index.feature_dim(),
                                             all_positive=not sign_index.generator.symmetric)

            noise = generate_noise(k_dim=k,
                                   batch_size=lbl_ris.dense_shape[0] * nce_samples,
                                   ratio=noise_ratio)

            runner.train(ctx_ris, [lbl_ris, noise], output_loss=True, write_summaries=True)
        else:
            runner.train(model_input_data=ctx_ris,
                         loss_input_data=lbl_ids,
                         output_loss=True,
                         write_summaries=True)

    runner.close_session()
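# --- Hedged sketch of the NCE noise shape ---------------------------------------
# generate_noise belongs to the project and also takes a ratio argument that is
# not modelled here. The stand-alone function below only illustrates the kind of
# input the noise branch expects, based on how the test configures Generator(k, s):
# k-dimensional random-index vectors with s nonzero entries, half +1 and half -1,
# stacked into a (batch_size * nce_samples, k) matrix.
import numpy as np


def random_index_noise(batch_size, k_dim, s_active, seed=0):
    """Return a (batch_size, k_dim) matrix of sparse ternary random indexes."""
    rng = np.random.RandomState(seed)
    noise = np.zeros((batch_size, k_dim), dtype=np.float32)
    for row in noise:
        positions = rng.choice(k_dim, size=s_active, replace=False)
        row[positions[:s_active // 2]] = 1.0
        row[positions[s_active // 2:]] = -1.0
    return noise


sample_noise = random_index_noise(batch_size=4, k_dim=500, s_active=8)
print(sample_noise.shape, int(np.count_nonzero(sample_noise)))  # (4, 500) 32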