def store_ngrams(corpus_stream, name):
    # if flatten, ngrams go across sentences
    if args.flatten:
        word_it = flatten_it(corpus_stream)
        ngrams = window_it(word_it, args.n)
    else:
        sentence_ngrams = (window_it(sentence, args.n)
                           for sentence in corpus_stream)
        ngrams = flatten_it(sentence_ngrams)

    ngram_ids = [[vocab[w] for w in ngram] for ngram in ngrams]
    ngrams = np.array(ngram_ids)
    dataset = hdf5_file.create_dataset(name, data=ngrams, compression="gzip")
    dataset.attrs['n'] = args.n
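# A small illustration (not part of the example above) of the two windowing modes
# in store_ngrams, assuming window_it/flatten_it behave as in the other examples
# (sliding windows advancing one word at a time):
from deepsign.data.iterators import flatten_it, window_it

toy_corpus = [["the", "cat", "sat"], ["on", "the", "mat"]]

# flatten=True: one flat word stream, n-grams may cross sentence boundaries (5 bigrams)
crossing = list(window_it(flatten_it(toy_corpus), 2))
# flatten=False: each sentence is windowed on its own (2 + 2 = 4 bigrams)
within = list(flatten_it(window_it(sentence, 2) for sentence in toy_corpus))

# `crossing` includes a window pairing "sat" with "on"; `within` does not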
Example #2
    def test_ngrams(self):
        sentence = "hello there mr smith welcome back to the world"
        tokens = sentence.split()
        windows = window_it(tokens, 3)
        for window in windows:
            print(window)
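        # sketch assertion (not in the original test): with a step of one token,
        # the 9-token sentence above yields 9 - 3 + 1 = 7 windows of size 3
        self.assertEqual(len(list(window_it(tokens, 3))), 7)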

        print("fewer than ngram_size sequences")
        sentence = "hello there"

        tokens = sentence.split()
        windows = list(window_it(tokens, 3))

        print(windows)

        self.assertEqual(len(windows), 0)

        for window in windows:
            print(window)
Example #3
    def corpus_pipeline(corpus_stream,
                        n_gram_size=args.ngram_size,
                        epochs=1,
                        batch_size=args.batch_size,
                        shuffle=args.shuffle,
                        flatten=False):
        """ Corpus Processing Pipeline.

        Transforms the corpus reader -a stream of sentences or words- into a stream of n-gram batches.

        Args:
            n_gram_size: the size of the n-gram window
            corpus_stream: the stream of sentences of words
            epochs: number of epochs we want to iterate over this corpus
            batch_size: batch size for the n-gram batch
            shuffle: if true, shuffles the n-grams according to a buffer size
            flatten: if true sliding windows are applied over a stream of words rather than within each sentence
            (n-grams can cross sentence boundaries)
        """

        if flatten:
            word_it = flatten_it(corpus_stream)
            n_grams = window_it(word_it, n_gram_size)
        else:
            sentence_n_grams = (window_it(sentence, n_gram_size)
                                for sentence in corpus_stream)
            n_grams = flatten_it(sentence_n_grams)

        # at this point n_grams is an iterator over n-gram word sequences;
        # map each word to its vocabulary id
        # n_grams = ([vocab[w] for w in ngram] for ngram in n_grams)
        n_grams = ([index.get_id(w) for w in ngram] for ngram in n_grams)

        if epochs > 1:
            n_grams = repeat_it(n_grams, epochs)

        if shuffle:
            n_grams = shuffle_it(n_grams, args.shuffle_buffer_size)

        n_grams = batch_it(n_grams, size=batch_size, padding=False)
        return n_grams
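    # A hedged usage sketch (not part of the original example): it assumes `index`
    # and `args` are configured by the enclosing script and that the toy words are
    # known to `index`. The pipeline then yields lists of `batch_size` n-grams,
    # each n-gram already mapped to ids via index.get_id.
    toy_corpus = [["the", "cat", "sat"], ["on", "the", "mat"]]
    for batch in corpus_pipeline(toy_corpus, n_gram_size=2, epochs=1,
                                 batch_size=2, shuffle=False, flatten=False):
        print(batch)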
Example #4
tokens = tokenizer.tokenize(sentence)
tokens = [t for t in tokens if is_token.is_word(t)]
vocab = marisa_trie.Trie(tokens)

k = 10
s = 4
seq_size = 2
embed_dim = 4
batch_size = 2
generator = Generator(k, s)

print([vocab[w] for w in vocab.keys()])
ri_dict = {vocab[word]: generator.generate() for word in vocab.keys()}
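# A hypothetical numpy sketch of the random-indexing idea behind generator.generate():
# each word id gets a k-dimensional sparse ternary vector with s non-zero entries
# (half +1, half -1). This only illustrates the kind of vector produced; it is not
# the deepsign Generator implementation.
import numpy as np

def toy_random_index(dim, num_active, seed=0):
    rng = np.random.default_rng(seed)
    ri = np.zeros(dim)
    active = rng.choice(dim, size=num_active, replace=False)
    ri[active[:num_active // 2]] = 1.0
    ri[active[num_active // 2:]] = -1.0
    return ri

# for k=10, s=4 this gives a 10-dimensional vector with two +1 and two -1 entries
print(toy_random_index(k, s))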

tokens = [vocab[w] for w in tokens]
data_it = window_it(tokens, seq_size)
data_it = batch_it(data_it, batch_size)

vocab_tensor = [ri_dict[i] for i in range(len(vocab))]
sp_ri = deepsign.data.transform.ris_to_sp_tensor_value(vocab_tensor, dim=k)

inputs = tx.Input(n_units=2)
ri_inputs = tx.gather_sparse(sp_ri, inputs.tensor)
ri_inputs = tx.TensorLayer(ri_inputs, k)

embed = tx.Lookup(ri_inputs, seq_size, [k, embed_dim])

# logits: take the embeddings and get the features for all random indexes

ri_layer = tx.TensorLayer(sp_ri, n_units=k)
logits = tx.Linear(input_layer=ri_layer,
Example #5
import os
from collections import Counter

from deepsign.data import iterators
from deepsign.data.corpora.ptb import PTBReader
from deepsign.data.iterators import window_it

home = os.getenv("HOME")
ptb_dir = home + "/data/gold_standards/ptb"

reader = PTBReader(ptb_dir)

training_set = reader.training_set()

vocab = Counter()

for sentence in training_set:
    words = sentence
    ngrams = window_it(words, 4)
    for ngram in ngrams:
        print(ngram)
    vocab.update(words)

print("total words:", sum(vocab.values()))
print("total unique words:", len(vocab.keys()))
print("100 most common:")
for w in vocab.most_common(100):
    print(w)
print("processing n-grams...")

# `corpus`, `args`, and `hdf5_file` are defined earlier in the full script
# and are not shown in this excerpt
filtered_corpus = map(lambda w: w if w in vocab else args.unk_token,
                      iterators.flatten_it(corpus))

total_words = sum(vocab.values())
n_training = int(total_words * 0.8)

# held-out words, split between evaluation and test
n_leave = total_words - n_training

n_eval = n_leave // 2
n_test = n_leave - n_eval

# 80 / 10 / 10 split
ngrams = iterators.window_it(filtered_corpus, args.n)

# TODO make this iterative: (1) consume a chunk of n-grams, (2) extend the hdf5 dataset,
# (3) write the chunk to it (a sketch follows below)
# https://stackoverflow.com/questions/34531479/writing-a-large-hdf5-dataset-using-h5py
# I also have some old examples with the wacky corpus

#ngram_ids = [list(map(lambda w: vocab[w], ngram)) for ngram in ngrams]
#ngrams = np.array(ngram_ids)
#dataset = hdf5_file.create_dataset("full", data=ngrams, compression="gzip")
#dataset.attrs['n'] = args.n

# for ngram in views.consume_it(ngrams):
#    print(ngram)
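# A hedged sketch (not from the original script) of the chunked-write TODO above:
# grow an extendable hdf5 dataset as n-grams are consumed instead of materialising
# them all in memory. `word_to_id` and `chunk_size` are hypothetical stand-ins for
# whatever id index and chunking the full script uses.
import numpy as np
from itertools import islice

word_to_id = {w: i for i, w in enumerate(vocab)}
word_to_id.setdefault(args.unk_token, len(word_to_id))

ngram_data = hdf5_file.create_dataset("ngrams", shape=(0, args.n),
                                      maxshape=(None, args.n),
                                      dtype="i8", compression="gzip")
chunk_size = 10000
while True:
    chunk = list(islice(ngrams, chunk_size))
    if not chunk:
        break
    ids = np.array([[word_to_id[w] for w in ngram] for ngram in chunk], dtype="i8")
    start = ngram_data.shape[0]
    ngram_data.resize(start + len(ids), axis=0)
    ngram_data[start:] = ids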

hdf5_file.close()