Example #1
def store_ngrams(corpus_stream, name):
    # if flatten, ngrams go across sentences
    if args.flatten:
        word_it = flatten_it(corpus_stream)
        ngrams = window_it(word_it, args.n)
    else:
        sentence_ngrams = (window_it(sentence, args.n)
                           for sentence in corpus_stream)
        ngrams = flatten_it(sentence_ngrams)

    # map each n-gram of words to a list of vocabulary ids
    ngram_ids = [[vocab[w] for w in ngram] for ngram in ngrams]
    ngrams = np.array(ngram_ids)
    dataset = hdf5_file.create_dataset(name, data=ngrams, compression="gzip")
    dataset.attrs['n'] = args.n
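
The window_it and flatten_it helpers used above are not shown in this example; a minimal sketch of what they are assumed to do (fixed-size sliding windows over a token stream, and flattening of nested iterables):

from collections import deque
from itertools import chain, islice

def flatten_it(nested):
    # chain every inner iterable into a single flat stream of items
    return chain.from_iterable(nested)

def window_it(iterable, n):
    # yield consecutive windows (tuples) of size n over the stream
    it = iter(iterable)
    window = deque(islice(it, n), maxlen=n)
    if len(window) == n:
        yield tuple(window)
    for item in it:
        window.append(item)
        yield tuple(window)

# e.g. list(window_it(["the", "cat", "sat", "down"], 3))
#   -> [('the', 'cat', 'sat'), ('cat', 'sat', 'down')]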
Example #2
    def corpus_pipeline(corpus_stream,
                        n_gram_size=args.ngram_size,
                        epochs=1,
                        batch_size=args.batch_size,
                        shuffle=args.shuffle,
                        flatten=False):
        """ Corpus Processing Pipeline.

        Transforms the corpus reader -a stream of sentences or words- into a stream of n-gram batches.

        Args:
            n_gram_size: the size of the n-gram window
            corpus_stream: the stream of sentences of words
            epochs: number of epochs we want to iterate over this corpus
            batch_size: batch size for the n-gram batch
            shuffle: if true, shuffles the n-grams according to a buffer size
            flatten: if true sliding windows are applied over a stream of words rather than within each sentence
            (n-grams can cross sentence boundaries)
        """

        if flatten:
            word_it = flatten_it(corpus_stream)
            n_grams = window_it(word_it, n_gram_size)
        else:
            sentence_n_grams = (window_it(sentence, n_gram_size)
                                for sentence in corpus_stream)
            n_grams = flatten_it(sentence_n_grams)

        # at this point n_grams is an iterator over n-grams of words;
        # map each word to its id in the index
        n_grams = ([index.get_id(w) for w in ngram] for ngram in n_grams)

        if epochs > 1:
            n_grams = repeat_it(n_grams, epochs)

        if shuffle:
            n_grams = shuffle_it(n_grams, args.shuffle_buffer_size)

        n_grams = batch_it(n_grams, size=batch_size, padding=False)
        return n_grams
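
A short usage sketch of the pipeline on a toy corpus (an illustration only: the toy sentences and the call below are not part of the original, and the pipeline still relies on the module-level index and args objects):

toy_corpus = [["the", "cat", "sat"], ["the", "dog", "barked"]]

# with n_gram_size=2 and flatten=False each sentence yields its own bigrams:
#   ("the", "cat"), ("cat", "sat"), ("the", "dog"), ("dog", "barked")
# with flatten=True the window also crosses the sentence boundary, adding ("sat", "the")
for batch in corpus_pipeline(toy_corpus,
                             n_gram_size=2,
                             epochs=1,
                             batch_size=2,
                             shuffle=False,
                             flatten=False):
    print(batch)  # each batch is a list of up to 2 bigrams encoded as word ids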
Example #3
if __name__ == '__main__':
    data_path = os.path.join(os.getenv("HOME"), "data/datasets/ptb/")

    corpus = PTBReader(path=data_path, mark_eos=True)
    corpus_stats = h5py.File(os.path.join(data_path, "ptb_stats.hdf5"),
                             mode='r')
    vocab = marisa_trie.Trie(corpus_stats["vocabulary"])

    batch_size = 4
    num_steps = 3

    # data = [vocab[word] for word in it.take_it(1000, it.flatten_it(corpus.training_set(1000)))]
    data = [
        word
        for word in it.take_it(it.flatten_it(corpus.training_set(1000)), 1000)
    ]
    data = iter((c for c in it.flatten_it(data)))
    print(next(data))
    # data = np.array(data)

    # Starting from sequential data, batchify arranges the dataset into columns.
    # For instance, with the alphabet as the sequence and batch size 4, we'd get
    # ┌ a g m s ┐
    # │ b h n t │
    # │ c i o u │
    # │ d j p v │
    # │ e k q w │
    # └ f l r x ┘.
    # These columns are treated as independent by the model, which means that the
    # dependence of e.g. 'g' on 'f' cannot be learned, but allows more efficient
    # batch processing.
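    # A minimal sketch of the column arrangement described above (a generic
    # batchify helper, assumed for illustration rather than taken from this codebase):
    def batchify(sequence, batch_size):
        # trim the sequence so it splits evenly into batch_size columns, then
        # read it column-wise: within a column, element i+1 follows element i
        seq = list(sequence)
        n_rows = len(seq) // batch_size
        columns = [seq[c * n_rows:(c + 1) * n_rows] for c in range(batch_size)]
        # row r of the batched view holds the r-th element of every column
        return [[col[r] for col in columns] for r in range(n_rows)]

    # e.g. batchify("abcdefghijklmnopqrstuvwx", 4) produces the 6x4 layout drawn above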
Example #4
# NOTE: the opening of this snippet (imports, the ArgumentParser setup, and the flag
# name of the first parser.add_argument call) is truncated in the source. The lines
# below are a plausible reconstruction; the '-data_dir' flag name is an assumption
# inferred from the use of args.data_dir further down. Text8Corpus and the iterators
# module come from the project's own code and their imports are omitted here.
import argparse
import os
from collections import Counter

import marisa_trie
from tqdm import tqdm

parser = argparse.ArgumentParser()
parser.add_argument('-data_dir',
                    dest="data_dir",
                    type=str,
                    default="text8")
parser.add_argument('-vocab_limit',
                    dest="vocab_limit",
                    type=int,
                    default=70000)
parser.add_argument('-unk_token', dest="unk_token", type=str, default="<UNK>")

args = parser.parse_args()

dataset = os.path.join(args.data_dir, "text8.txt")

corpus = Text8Corpus(dataset, sentence_length=1000)

word_counter = Counter()
for word in tqdm(iterators.flatten_it(corpus)):
    word_counter[word] += 1

print("total words ", sum(word_counter.values()))
sorted_counts = word_counter.most_common(args.vocab_limit)
word_list, _ = zip(*sorted_counts)

vocab = marisa_trie.Trie(word_list)
print("vocab size: ", len(vocab))

new_counter = Counter()
for word, count in word_counter.items():
    if word in vocab:
        new_counter[word] += count
    else:
        new_counter[args.unk_token] += count
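
As a follow-up illustration (not part of the original script), the trie can be used to encode text with the UNK fallback; here the UNK token is added to the key set so that it gets an id of its own:

vocab_with_unk = marisa_trie.Trie(list(word_list) + [args.unk_token])

def encode(words):
    # marisa_trie.Trie maps each stored key to a stable integer id via vocab_with_unk[w];
    # words outside the restricted vocabulary map to the id of the UNK token
    unk_id = vocab_with_unk[args.unk_token]
    return [vocab_with_unk[w] if w in vocab_with_unk else unk_id for w in words]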
Example #5
    def test_nce_nrp(self):
        vocab_size = 1000
        k = 500
        s = 8
        embed_size = 128
        nce_samples = 10
        noise_ratio = 0.1
        use_nce = True

        vocab = [str(i) for i in range(vocab_size)]

        generator = Generator(k, s)
        sign_index = TrieSignIndex(generator,
                                   vocabulary=vocab,
                                   pregen_indexes=True)
        ris = [
            sign_index.get_ri(sign_index.get_sign(i))
            for i in range(len(sign_index))
        ]
        # ris = [generator.generate() for _ in range(vocab_size)]

        ri_tensor = ris_to_sp_tensor_value(ri_seq=ris,
                                           dim=k,
                                           all_positive=False)

        ri_tensor_input = tx.SparseInput(n_units=k, value=ri_tensor)

        if use_nce:
            label_inputs = tx.SparseInput(k, name="target_random_indices")
        else:
            label_inputs = [
                tx.Input(1, dtype=tf.int64, name="ids"),
                tx.InputParam(dtype=tf.int32,
                              value=vocab_size,
                              name="vocab_size")
            ]

        eval_label_inputs = [
            tx.Input(1, dtype=tf.int64, name="ids_eval"),
            tx.InputParam(dtype=tf.int32, value=vocab_size, name="vocab_size")
        ]

        model = NRP(
            run_inputs=tx.SparseInput(n_units=k, name="random_index_inputs"),
            label_inputs=label_inputs,
            eval_label_input=eval_label_inputs,
            ctx_size=2,
            # vocab_size=vocab_size,
            k_dim=k,
            ri_tensor_input=ri_tensor_input,  # current dictionary state
            embed_dim=embed_size,
            h_dim=128,
            num_h=1,
            h_activation=tx.relu,
            use_dropout=True,
            embed_dropout=True,
            keep_prob=0.70,
            use_nce=use_nce,
            nce_samples=nce_samples,
            nce_noise_amount=noise_ratio,
            noise_input=tx.SparseInput(k, name="noise"))

        tf.summary.histogram("embeddings", model.embeddings.weights)
        for h in model.h_layers:
            tf.summary.histogram("h", h.linear.weights)

        # model.eval_tensors.append(model.train_loss_tensors[0])
        runner = tx.ModelRunner(model)
        runner.set_log_dir("/tmp")
        runner.log_graph()

        options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        # options = None
        runner.set_session(runtime_stats=True, run_options=options)

        # options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)

        # runner.config_optimizer(tf.train.GradientDescentOptimizer(learning_rate=0.005))#,
        # SGD with 0.025

        # lr = tx.InputParam(init_value=0.0002)
        lr = tx.InputParam(value=0.025)
        # runner.config_optimizer(tf.train.AdamOptimizer(learning_rate=lr.tensor, beta1=0.9), params=lr,
        runner.config_optimizer(
            tf.train.GradientDescentOptimizer(learning_rate=lr.tensor),
            optimizer_params=lr,
            global_gradient_op=False,
            # gradient_op=lambda grad: tf.clip_by_global_norm(grad, 10.0)[0])
            gradient_op=lambda grad: tf.clip_by_norm(grad, 1.0))

        data = np.array([[0, 2], [5, 7], [9, 8], [3, 4], [1, 9], [12, 8]])
        labels = np.array([[32], [56], [12], [2], [5], [23]])

        ppl_curve = []
        n = 256
        batch_size = 128

        dataset = np.column_stack((data, labels))
        # print(dataset)
        dataset = views.repeat_it([dataset], n)
        dataset = views.flatten_it(dataset)
        # shuffle with a small buffer of 6 elements
        dataset = views.shuffle_it(dataset, 6)
        dataset = views.batch_it(dataset, batch_size)

        # print(np.array(list(dataset)))
        # d = list(views.take_it(1, views.shuffle_it(d, 4)))[0]

        data_stream = dataset

        # the full stream is n repetitions of the 6-row dataset, grouped into batches
        for batch in tqdm(data_stream, total=n * len(data) // batch_size):
            sample = np.array(batch)

            # context word ids are all but the last column, flattened to a 1-d array
            ctx = sample[:, :-1].flatten()
            ctx_ris = [sign_index.get_ri(sign_index.get_sign(i)) for i in ctx]
            ctx_ris = ris_to_sp_tensor_value(
                ctx_ris,
                dim=sign_index.feature_dim(),
                all_positive=not sign_index.generator.symmetric)
            lbl_ids = sample[:, -1:]
            lbl = lbl_ids.flatten()

            if use_nce:
                lbl_ris = [
                    sign_index.get_ri(sign_index.get_sign(i)) for i in lbl
                ]
                lbl_ris = ris_to_sp_tensor_value(
                    lbl_ris,
                    dim=sign_index.feature_dim(),
                    all_positive=not sign_index.generator.symmetric)

                noise = generate_noise(k_dim=k,
                                       batch_size=lbl_ris.dense_shape[0] *
                                       nce_samples,
                                       ratio=noise_ratio)
                runner.train(ctx_ris, [lbl_ris, noise],
                             output_loss=True,
                             write_summaries=True)
            else:
                runner.train(model_input_data=ctx_ris,
                             loss_input_data=lbl_ids,
                             output_loss=True,
                             write_summaries=True)

        runner.close_session()
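
The ppl_curve list declared in the test is never filled within this snippet. A hedged sketch of how it could be populated, assuming that runner.train(..., output_loss=True) returns the batch cross-entropy loss (an assumption about the tx API, not confirmed by the source):

def track_perplexity(ppl_curve, batch_loss):
    # perplexity is the exponential of the mean cross-entropy loss
    ppl_curve.append(np.exp(batch_loss))

# e.g. inside the training loop above:
#     loss = runner.train(ctx_ris, [lbl_ris, noise], output_loss=True, write_summaries=True)
#     track_perplexity(ppl_curve, loss)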