def store_ngrams(corpus_stream, name):
    # if flatten, n-grams go across sentence boundaries
    if args.flatten:
        word_it = flatten_it(corpus_stream)
        ngrams = window_it(word_it, args.n)
    else:
        sentence_ngrams = (window_it(sentence, args.n) for sentence in corpus_stream)
        ngrams = flatten_it(sentence_ngrams)

    # encode each n-gram of words as a list of vocabulary ids
    ngram_ids = [list(map(lambda w: vocab[w], ngram)) for ngram in ngrams]
    ngrams = np.array(ngram_ids)
    dataset = hdf5_file.create_dataset(name, data=ngrams, compression="gzip")
    dataset.attrs['n'] = args.n
def test_ngrams(self):
    sentence = "hello there mr smith welcome back to the world"
    tokens = sentence.split()
    windows = window_it(tokens, 3)
    for window in windows:
        print(window)

    # a sequence with fewer tokens than the n-gram size yields no windows
    print("fewer than ngram_size sequences")
    sentence = "hello there"
    tokens = sentence.split()
    windows = list(window_it(tokens, 3))
    print(windows)
    self.assertEqual(len(windows), 0)
    for window in windows:
        print(window)
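# For reference, a minimal sliding-window generator with the behaviour the test
# above exercises: a sequence shorter than the window size yields no windows at
# all. This is only a sketch, not necessarily the project's window_it.
def sliding_windows(seq, window_size):
    seq = list(seq)
    for i in range(len(seq) - window_size + 1):
        yield tuple(seq[i:i + window_size])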
def corpus_pipeline(corpus_stream,
                    n_gram_size=args.ngram_size,
                    epochs=1,
                    batch_size=args.batch_size,
                    shuffle=args.shuffle,
                    flatten=False):
    """ Corpus Processing Pipeline.

    Transforms the corpus reader -a stream of sentences or words- into a stream of n-gram batches.

    Args:
        corpus_stream: the stream of sentences or words
        n_gram_size: the size of the n-gram window
        epochs: number of epochs we want to iterate over this corpus
        batch_size: batch size for the n-gram batch
        shuffle: if true, shuffles the n-grams according to a buffer size
        flatten: if true, sliding windows are applied over a stream of words rather than within
            each sentence (n-grams can cross sentence boundaries)
    """
    if flatten:
        word_it = flatten_it(corpus_stream)
        n_grams = window_it(word_it, n_gram_size)
    else:
        sentence_n_grams = (window_it(sentence, n_gram_size) for sentence in corpus_stream)
        n_grams = flatten_it(sentence_n_grams)

    # at this point this is an n-gram iterator
    # n_grams = ([vocab[w] for w in ngram] for ngram in n_grams)
    n_grams = ([index.get_id(w) for w in ngram] for ngram in n_grams)

    if epochs > 1:
        n_grams = repeat_it(n_grams, epochs)

    if shuffle:
        n_grams = shuffle_it(n_grams, args.shuffle_buffer_size)

    n_grams = batch_it(n_grams, size=batch_size, padding=False)

    return n_grams
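# Hypothetical usage sketch for corpus_pipeline: the PTB reader, ptb_dir, args
# and the word index are assumed to be set up as in the surrounding scripts,
# and the parameter values below are illustrative only.
corpus = PTBReader(ptb_dir).training_set()
batches = corpus_pipeline(corpus,
                          n_gram_size=4,
                          epochs=1,
                          batch_size=128,
                          shuffle=True,
                          flatten=False)
for batch in batches:
    # each batch is a list of n-grams encoded as lists of word ids
    print(batch)
    break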
tokens = tokenizer.tokenize(sentence)
tokens = [t for t in tokens if is_token.is_word(t)]
vocab = marisa_trie.Trie(tokens)

k = 10
s = 4
seq_size = 2
embed_dim = 4
batch_size = 2

generator = Generator(k, s)
print([vocab[w] for w in vocab.keys()])

# one random index per vocabulary id
ri_dict = {vocab[word]: generator.generate() for word in vocab.keys()}

tokens = [vocab[w] for w in tokens]
data_it = window_it(tokens, seq_size)
data_it = batch_it(data_it, batch_size)

vocab_tensor = [ri_dict[i] for i in range(len(vocab))]
sp_ri = deepsign.data.transform.ris_to_sp_tensor_value(vocab_tensor, dim=k)

inputs = tx.Input(n_units=2)
ri_inputs = tx.gather_sparse(sp_ri, inputs.tensor)
ri_inputs = tx.TensorLayer(ri_inputs, k)
embed = tx.Lookup(ri_inputs, seq_size, [k, embed_dim])

# logits: take the embeddings and get the features for all random indexes
ri_layer = tx.TensorLayer(sp_ri, n_units=k)
logits = tx.Linear(input_layer=ri_layer,
import os
from collections import Counter

from deepsign.data.corpora.ptb import PTBReader
from deepsign.data.iterators import window_it

home = os.getenv("HOME")
ptb_dir = home + "/data/gold_standards/ptb"
reader = PTBReader(ptb_dir)

training_set = reader.training_set()

vocab = Counter()
for sentence in training_set:
    words = sentence
    ngrams = window_it(words, 4)
    for ngram in ngrams:
        print(ngram)
    vocab.update(words)

print("total words:", sum(vocab.values()))
print("total unique words:", len(vocab.keys()))
print("100 most common:")
for w in vocab.most_common(100):
    print(w)
print("processing n-grams...") filtered_corpus = map(lambda w: w if w in vocab else args.unk_token, iterators.flatten_it(corpus)) total_words = sum(word_freq) n_training = int(total_words * 0.8) # leave out n_leave = total_words - n_training n_eval = n_leave // 2 n_test = n_leave - n_eval # 80 / 10 / 10 split ngrams = iterators.window_it(filtered_corpus, args.n) # TODO iterative: 1 consume n n grams 2 extend hdf5 dataset 3 write to dataset # https://stackoverflow.com/questions/34531479/writing-a-large-hdf5-dataset-using-h5py # I also have some old examples with wacky corpus #ngram_ids = [list(map(lambda w: vocab[w], ngram)) for ngram in ngrams] #ngrams = np.array(ngram_ids) #dataset = hdf5_file.create_dataset("full", data=ngrams, compression="gzip") #dataset.attrs['n'] = args.n # for ngram in views.consume_it(ngrams): # print(ngram) hdf5_file.close()