def test_bow_normalise(self):
    data = ["A", "A"]
    for s in data:
        self.sign_index.add(s)

    unique_str = set(data)
    self.assertEqual(len(self.sign_index), len(unique_str))

    # use a local name so the windows function is not shadowed inside this method
    data_windows = windows(data, window_size=1)

    # with include_target=True the target "A" and the context "A" are both counted,
    # so the unnormalised maximum is 2 and normalisation caps it at 1
    norm_bow = enc.to_bow(data_windows[0], self.sign_index, normalise=True, include_target=True)
    self.assertEqual(np.max(norm_bow), 1)

    unorm_bow = enc.to_bow(data_windows[0], self.sign_index, normalise=False, include_target=True)
    self.assertEqual(np.max(unorm_bow), 2)
def test_bow_create(self):
    data = ["A", "B", "A", "C", "A", "B"]
    for s in data:
        self.sign_index.add(s)

    unique_str = set(data)
    self.assertEqual(len(self.sign_index), len(unique_str))

    # one sliding window per token, each encoded as a bag-of-words vector
    data_windows = windows(data, window_size=1)
    vectors = [enc.to_bow(w, self.sign_index) for w in data_windows]
    self.assertEqual(len(vectors), len(data_windows))
def test_bow_ignore_order(self):
    data1 = ["A", "B"]
    data2 = ["B", "A"]
    for s1, s2 in zip(data1, data2):
        self.sign_index.add(s1)
        self.sign_index.add(s2)

    windows1 = windows(data1, window_size=1)
    windows2 = windows(data2, window_size=1)

    # bag-of-words encoding is order-invariant
    v1 = enc.to_bow(windows1[0], self.sign_index)
    v2 = enc.to_bow(windows2[0], self.sign_index)
    np_test.assert_array_equal(v1, v2)

    # the encoding is the sum of the random-index vectors of the window tokens
    a_ri = self.sign_index.get_ri("A")
    b_ri = self.sign_index.get_ri("B")
    np_test.assert_array_equal(v1 - a_ri.to_vector(), b_ri.to_vector())
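# Illustrative check (not part of the original suite): since the last assertion states
# v1 - a_ri.to_vector() == b_ri.to_vector(), the window encoding is equivalently the
# element-wise sum of the two random-index vectors. Assuming the same fixture and
# encoder API as above, the property could also be asserted directly as:
#
#     np_test.assert_array_equal(v1, a_ri.to_vector() + b_ri.to_vector())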
def text_to_ri(args):
    (fname, data_slice, window_size) = args

    input_hdf5 = h5py.File(fname, 'r')
    dataset_name = "sentences"
    dataset = input_hdf5[dataset_name]
    gen = subset_chunk_it(dataset, data_slice, chunk_size=250)

    pbar = tqdm(total=len(data_slice))
    tokenizer = Tokenizer()
    pipe = WaCKyPipe(gen, tokenizer, filter_stop=False)

    global sign_index
    ri_vectors = dict()

    for tokens in pipe:
        # get sliding windows of the given size
        s_windows = windows(tokens, window_size)

        # encode each window as a bag-of-words and accumulate it on the target's entry
        for window in s_windows:
            bow_vector = to_bow(window, sign_index)
            bow_vector = np_to_sparse(bow_vector)
            sign_id = sign_index.get_id(window.target)

            if sign_id not in ri_vectors:
                ri_vectors[sign_id] = bow_vector
            else:
                current_vector = ri_vectors[sign_id]
                ri_vectors[sign_id] = bow_vector + current_vector

        pbar.update(1)

    return ri_vectors
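# Usage sketch (assumption, not from the original script): text_to_ri takes a single
# (fname, data_slice, window_size) tuple and reads the module-level sign_index, which
# suggests it is meant to run as a worker, e.g. under multiprocessing.Pool. The helper
# below, its name, and the n_procs parameter are hypothetical; it assumes sign_index is
# already populated in the parent process so forked workers inherit it.
from multiprocessing import Pool


def text_to_ri_parallel(fname, n_rows, window_size, n_procs=4):
    # split the row range into one contiguous slice per worker (illustrative slicing)
    step = max(1, n_rows // n_procs)
    slices = [range(start, min(start + step, n_rows)) for start in range(0, n_rows, step)]
    args = [(fname, data_slice, window_size) for data_slice in slices]
    with Pool(n_procs) as pool:
        # each worker returns a {sign_id: sparse bag-of-words} dict for its slice
        return pool.map(text_to_ri, args)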
if subsampling:
    windows_stream = (windows(list(filter(keep_token, tokens)), window_size) for tokens in pipeline)
else:
    windows_stream = (windows(tokens, window_size) for tokens in pipeline)

i = 0
x_samples = []
c_samples = []

# name the loop variable so it does not shadow the windows function used in the
# generator expressions above
for sentence_windows in tqdm(windows_stream, total=n_rows):
    if len(sentence_windows) > 0:
        # one (target, ctx) pair per window
        for window in sentence_windows:
            target = sign_index.get_ri(window.target).to_vector()
            ctx = to_bow(window, sign_index, include_target=False, normalise=True)

            x_samples.append(target)
            c_samples.append(ctx)

        i += 1

        # batch size in number of sentences
        if i % batch_size == 0 and len(c_samples) > 0:
            # feed data to the model
            x = np.asmatrix(x_samples)
            y = np.asmatrix(c_samples)

            yp = y.copy()
            yp[yp < 0] = 0

            yn = y.copy()
            yn[yn > 0] = 0
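            # Note (illustration, not from the original script): the context vectors are
            # normalised sums of signed random-index codes, so y mixes positive and
            # negative entries; yp keeps the non-negative part and yn the non-positive
            # part, e.g. y = [[0.5, -0.5, 0.0]] gives yp = [[0.5, 0.0, 0.0]] and
            # yn = [[0.0, -0.5, 0.0]].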
i = 0
x_samples = []
y_samples = []

# restart the sentence iterator and refill the pipeline
sentences = chunk_it(corpus_dataset, n_rows=n_rows, chunk_size=10000)
pipeline.reload(sentences)
window_stream = get_window_stream(pipeline)

for sentence_windows in tqdm(window_stream, total=n_rows):
    if len(sentence_windows) > 0:
        # one (ctx, target) pair per window
        for window in sentence_windows:
            word_t = window.target
            ctx_ri = to_bow(window, index, include_target=False, normalise=True)
            target_vector = index.get_ri(word_t).to_dist_vector()

            x_samples.append(ctx_ri)
            y_samples.append(target_vector)

        i += 1

        # batch size in number of sentences
        if i % batch_size == 0 and len(y_samples) > 0:
            # print the current loss
            if i % 1000 == 0:
                current_loss = sess.run(loss, {
                    model.input(): x_samples,
                    labels(): y_samples,