def test_chunk_it(self):
    n_rows = 100
    data = np.arange(n_rows)
    # iterate element by element while the data is read in chunks of 3 rows
    it = chunk_it(data, n_rows, 3)
    for i in range(len(data)):
        data_j = next(it)
        self.assertEqual(data[i], data_j)
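# A minimal sketch of what chunk_it is assumed to do, inferred from the tests
# and call sites in this file: read the underlying dataset (e.g. an hdf5
# dataset) in slices of chunk_size and yield single items, so callers iterate
# element by element while I/O happens chunk by chunk. This is a hypothetical
# reference implementation, not the library's actual code.
def chunk_it_sketch(data, n_rows=None, chunk_size=1):
    n_rows = len(data) if n_rows is None else n_rows
    for start in range(0, n_rows, chunk_size):
        # one slice loads one chunk; for hdf5 datasets this is a single read
        for item in data[start:min(start + chunk_size, n_rows)]:
            yield item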
def build_vocabulary(corpus_file, output_file=None, max_sentences=0):
    input_hdf5 = h5py.File(corpus_file, 'r')
    # dataset_name = "sentences_lemmatised"
    dataset_name = "sentences"
    dataset = input_hdf5[dataset_name]

    if max_sentences > 0:
        num_sentences = min(max_sentences, len(dataset))
    else:
        num_sentences = len(dataset)

    gen = chunk_it(dataset, num_sentences, chunk_size=250)
    tokenizer = Tokenizer()
    pipe = WaCKyPipe(gen, tokenizer, filter_stop=False)

    freq = Counter()
    for tokens in tqdm(pipe, total=num_sentences):
        # tqdm.write(str(tokens))
        for token in tokens:
            normal_token = token.lower()
            freq[normal_token] += 1
    input_hdf5.close()

    tqdm.write("{0} unique words".format(len(freq)))

    # order by frequency
    freq = freq.most_common()
    for i in range(10):
        (w, f) = freq[i]
        print("{0}:{1}".format(w, f))

    if output_file is not None:
        output_hdf5 = h5py.File(output_file, 'w')
        word_ids = range(len(freq))
        # encode explicitly so that hdf5 can take an array of variable-length
        # strings and store it with a specific encoding (UTF-8 in this case)
        vocabulary = np.array([freq[i][0].encode("utf8") for i in range(len(freq))])
        dt = h5py.special_dtype(vlen=str)
        output_hdf5.create_dataset("vocabulary", data=vocabulary, dtype=dt, compression="gzip")
        print("vocabulary written")

        freq = np.array([freq[i][1] for i in range(len(freq))])
        output_hdf5.create_dataset("frequencies", data=freq, compression="gzip")
        print("frequencies written")

        output_hdf5.close()
    print("done")
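# Example call (hypothetical file names), limiting the pass to the first
# 100000 sentences; omitting output_file only prints the top-10 counts:
# build_vocabulary("wacky.hdf5", output_file="wacky_vocab.hdf5", max_sentences=100000)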
def test_repeat_fn_exhaust(self):
    n_samples = 4
    repeat = 2
    v = np.random.uniform(0, 1, [n_samples, 1])
    data_it = chunk_it(v, chunk_size=2)

    def it_fn(x):
        return iter(x)

    # data_it gets exhausted on the first pass, so it does not repeat
    data_it = repeat_apply(it_fn, data_it, repeat)
    # only n_samples items are returned instead of n_samples * repeat
    self.assertEqual(len(list(data_it)), n_samples)
def test_repeat_chunk_it(self):
    n_samples = 4
    repeat = 2
    v = np.random.uniform(0, 1, [n_samples, 1])

    def chunk_fn(x):
        return chunk_it(x, chunk_size=2)

    # chunk_fn builds a fresh iterator on each repetition, so unlike the
    # exhausted-iterator case above, all items are yielded `repeat` times
    data_it = repeat_apply(chunk_fn, v, repeat)
    self.assertEqual(len(list(data_it)), n_samples * repeat)
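# A hedged sketch of repeat_apply as the two tests above use it: call fn(data)
# `repeat` times and chain the resulting iterators. If fn builds a fresh
# iterator per call (chunk_fn) the data repeats; if it merely wraps an already
# exhausted iterator (it_fn) the later repetitions yield nothing. Assumed
# semantics inferred from the assertions, not the library's implementation.
import itertools

def repeat_apply_sketch(fn, data, repeat):
    return itertools.chain.from_iterable(fn(data) for _ in range(repeat))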
def test_chain_shuffle(self):
    n_samples = 4
    repeat = 2
    v = np.arange(0, n_samples, 1)

    def chunk_fn(x):
        return chunk_it(x, chunk_size=2)

    # first chain is in order, the second is shuffled from the two repetitions
    data_it = repeat_apply(chunk_fn, v, repeat)
    data_it = chain_it(data_it, shuffle_it(repeat_apply(chunk_fn, v, repeat), buffer_size=8))

    data = list(data_it)
    unique_data = np.unique(data)
    counts = np.unique(np.bincount(data))

    # each of the 4 values appears 4 times: 2 repetitions in each of the 2 chains
    self.assertEqual(len(unique_data), 4)
    self.assertEqual(len(counts), 1)
    self.assertEqual(counts[0], 4)
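# A hedged sketch of shuffle_it: buffer up to buffer_size items from the source
# iterator, shuffle the buffer, and yield from it. A simple buffer-by-buffer
# version is enough to make the counting assertions above hold; the real
# implementation may refill the buffer incrementally instead.
import random

def shuffle_it_sketch(iterable, buffer_size):
    buffer = []
    for item in iterable:
        buffer.append(item)
        if len(buffer) == buffer_size:
            random.shuffle(buffer)
            yield from buffer
            buffer = []
    # flush the remainder
    random.shuffle(buffer)
    yield from buffer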
def test_batch_it(self):
    num_samples = 6
    v = np.random.uniform(-1, 1, [num_samples, 2])
    padding = np.zeros([2])

    c_it = chunk_it(v, 6, chunk_size=3)
    batch_size = 4

    # with padding=True every batch is filled to batch_size with padding_elem
    b_it = batch_it(c_it, batch_size, padding=True, padding_elem=padding)
    for b in b_it:
        self.assertEqual(len(b), batch_size)

    # without padding the last batch is smaller, so the assertion inside the
    # loop fails and we check the remainder size instead
    b_it = batch_it(v, batch_size)
    last_batch = None
    try:
        for b in b_it:
            last_batch = b
            self.assertEqual(len(b), batch_size)
    except AssertionError:
        self.assertEqual(len(last_batch), num_samples % batch_size)
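# A hedged sketch of the batch_it behaviour the test relies on: group items
# into lists of batch_size; with padding=True the final partial batch is
# filled with padding_elem, otherwise it is yielded at its natural smaller
# size. Assumed semantics, not the library's actual implementation.
def batch_it_sketch(iterable, batch_size, padding=False, padding_elem=None):
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        if padding:
            batch.extend([padding_elem] * (batch_size - len(batch)))
        yield batch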
def chunk_fn(x):
    return chunk_it(x, chunk_size=batch_size * 1000)
model_file = result_dir + "model_bnc"

# ======================================================================================
# Load Corpus
# ======================================================================================
data_dir = home + "/data/gold_standards/"
corpus_file = data_dir + "wacky_6M.hdf5"

corpus_hdf5 = h5py.File(corpus_file, 'r')
corpus_dataset = corpus_hdf5["sentences"]

# iterates over lines but loads them as chunks
# n_rows = 100000
# sentences = chunk_it(corpus_dataset, n_rows=n_rows, chunk_size=20000)
n_rows = len(corpus_dataset)
sentences = chunk_it(corpus_dataset, chunk_size=100000)
pipeline = WaCKyPipe(datagen=sentences)

# ======================================================================================
# Load Vocabulary
# ======================================================================================
vocab_file = data_dir + "wacky_vocab_6M_spacy.hdf5"
vocab_hdf5 = h5py.File(vocab_file, 'r')

ri_gen = Generator(dim=k, num_active=s)
print("Loading Vocabulary...")
sign_index = TrieSignIndex(ri_gen, list(vocab_hdf5["vocabulary"][:]), pregen_indexes=False)

if subsampling:
    freq = TrieSignIndex.map_frequencies(list(vocab_hdf5["vocabulary"][:]),
                                         list(vocab_hdf5["frequencies"][:]),
    return True

def get_window_stream(pipeline):
    if subsampling:
        windows_stream = (windows(list(filter(keep_token, tokens)), window_size)
                          for tokens in pipeline)
    else:
        windows_stream = (windows(tokens, window_size) for tokens in pipeline)
    return windows_stream

try:
    sentences = chunk_it(corpus_dataset, n_rows=n_rows, chunk_size=100000)
    pipeline = BNCPipe(datagen=sentences, lemmas=args.lemmas)

    for epoch in range(args.epochs):
        print("epoch ", epoch + 1)
        i = 0
        x_samples = []
        y_samples = []

        # restart the sentence iterator for each epoch
        sentences = chunk_it(corpus_dataset, n_rows=n_rows, chunk_size=10000)
        pipeline.reload(sentences)
        window_stream = get_window_stream(pipeline)

        for windows in tqdm(window_stream, total=n_rows):
            if len(windows) > 0:
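# A hedged sketch of the windows helper used above, assuming the usual NLP
# sliding window: for each position in the token list, take up to window_size
# tokens on either side of the centre token. The real helper's return shape
# may differ; this is only illustrative.
def windows_sketch(tokens, window_size):
    result = []
    for i, target in enumerate(tokens):
        left = tokens[max(0, i - window_size):i]
        right = tokens[i + 1:i + 1 + window_size]
        result.append((left, target, right))
    return result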
def get_ngrams():
    for ngram in chunk_it(data, chunk_size=batch_size * 100):
        yield ngram