def test_assemble_sparse_matrix(self):
    """Check that ``assemble_sparse_matrix`` rebuilds a CSR matrix from a tree.

    Two variants of the third ``data`` array are exercised: ``[0, 2, 4, 7]``
    and ``[0, 2, 2, 3]`` (whose running sum is ``[0, 2, 4, 7]``).  In both
    cases the assembled matrix is expected to expose ``indptr`` equal to
    ``[0, 2, 4, 7]`` while ``data``, ``indices``, shape and dtype match the
    inputs.
    """
    expected_indptr = [0, 2, 4, 7]
    for stored_indptr in ([0, 2, 4, 7], [0, 2, 2, 3]):
        with self.subTest(indptr=stored_indptr):
            tree = {
                "shape": (3, 10),
                "format": "csr",
                "data": [
                    numpy.arange(1, 8),
                    numpy.array([0, 4, 1, 5, 2, 3, 8]),
                    numpy.array(stored_indptr),
                ],
            }
            mat = assemble_sparse_matrix(tree)
            self.assertIsInstance(mat, csr_matrix)
            self.assertTrue((mat.data == tree["data"][0]).all())
            self.assertTrue((mat.indices == tree["data"][1]).all())
            self.assertTrue((mat.indptr == expected_indptr).all())
            self.assertEqual(mat.shape, (3, 10))
            # ``numpy.int`` was only an alias of the builtin ``int`` and has
            # been removed from NumPy; ``numpy.dtype(int)`` is the same dtype.
            self.assertEqual(mat.dtype, numpy.dtype(int))
def test_assemble_sparse_matrix_empty(self):
    """A CSR tree built from empty arrays must assemble with no nonzero entries."""
    values = numpy.array([], dtype=numpy.float32)
    column_indices = numpy.array([], dtype=numpy.int32)
    row_pointer = numpy.zeros(11, dtype=numpy.uint8)
    tree = {
        "shape": (10, 10),
        "format": "csr",
        "data": [values, column_indices, row_pointer],
    }

    assembled = assemble_sparse_matrix(tree)

    nonzero_rows = assembled.nonzero()[0]
    self.assertEqual(nonzero_rows.size, 0)
def _load_tree(self, tree):
    """Decode the ``tokens`` and ``matrix`` entries of ``tree`` and hand them to ``construct``."""
    decoded_tokens = split_strings(tree["tokens"])
    sparse_matrix = assemble_sparse_matrix(tree["matrix"])
    self.construct(tokens=decoded_tokens, matrix=sparse_matrix)
def test_preprocess(self):
    """Run ``id2vec_preprocess`` into a temporary directory and verify its output.

    The test checks, in order:

    * nothing was captured in ``out`` / ``err`` while preprocessing ran;
    * exactly five files were produced (row/col sums, row/col vocab, one shard);
    * the row and column sums/vocabularies are identical to each other and the
      vocabulary equals the tokens of the document frequencies loaded from
      ``args.docfreq_in``;
    * the shard parses with the expected TensorFlow features and its global
      row/column ids cover ``range(VOCAB)``;
    * for a fixed random sample of 128 vocabulary entries, the matrix rebuilt
      from the shard equals the one computed directly from ``args.input``.
    """
    # Imported lazily inside the test rather than at module level.
    # NOTE(review): parse_single_example / FixedLenFeature / Session are used
    # from the top-level ``tf`` namespace (TensorFlow 1.x style) - confirm the
    # pinned TensorFlow version still exposes them there.
    import tensorflow as tf
    with tempfile.TemporaryDirectory() as tmpdir:
        args = default_preprocess_params(tmpdir, VOCAB)
        # presumably out/err capture stdout/stderr - verify against captured_output
        with captured_output() as (out, err, log):
            id2vec_preprocess(args)
        self.assertFalse(out.getvalue())
        self.assertFalse(err.getvalue())
        self.assertEqual(sorted(os.listdir(tmpdir)), [
            "col_sums.txt", "col_vocab.txt", "row_sums.txt", "row_vocab.txt",
            "shard-000-000.pb"
        ])
        df = OrderedDocumentFrequencies().load(source=args.docfreq_in)
        self.assertEqual(len(df), VOCAB)
        # Row and column side files must be identical pairs.
        with open(os.path.join(tmpdir, "col_sums.txt")) as fin:
            col_sums = fin.read()
        with open(os.path.join(tmpdir, "row_sums.txt")) as fin:
            row_sums = fin.read()
        self.assertEqual(col_sums, row_sums)
        with open(os.path.join(tmpdir, "col_vocab.txt")) as fin:
            col_vocab = fin.read()
        with open(os.path.join(tmpdir, "row_vocab.txt")) as fin:
            row_vocab = fin.read()
        self.assertEqual(col_vocab, row_vocab)
        # The vocabulary file is newline-separated and must match the
        # document frequency tokens, each with a positive frequency.
        self.assertEqual(row_vocab.split("\n"), df.tokens())
        for word in row_vocab.split("\n"):
            self.assertGreater(df[word], 0)
        # Parse the single shard as one serialized example.
        with open(os.path.join(tmpdir, "shard-000-000.pb"), "rb") as fin:
            features = tf.parse_single_example(
                fin.read(),
                features={
                    "global_row": tf.FixedLenFeature([VOCAB], dtype=tf.int64),
                    "global_col": tf.FixedLenFeature([VOCAB], dtype=tf.int64),
                    "sparse_local_row": tf.VarLenFeature(dtype=tf.int64),
                    "sparse_local_col": tf.VarLenFeature(dtype=tf.int64),
                    "sparse_value": tf.VarLenFeature(dtype=tf.float32)
                })
        # Evaluate the parsed feature tensors to concrete values.
        with tf.Session() as session:
            global_row, global_col, local_row, local_col, value = session.run(
                [
                    features[n]
                    for n in ("global_row", "global_col", "sparse_local_row",
                              "sparse_local_col", "sparse_value")
                ])
        self.assertEqual(set(range(VOCAB)), set(global_row))
        self.assertEqual(set(range(VOCAB)), set(global_col))
        # Expected number of sparse entries in the shard; the value is tied to
        # the test fixture and cannot be derived from this function alone.
        nnz = 16001
        self.assertEqual(value.values.shape, (nnz, ))
        self.assertEqual(local_row.values.shape, (nnz, ))
        self.assertEqual(local_col.values.shape, (nnz, ))
        # Fixed seed so the same 128 vocabulary entries are sampled every run.
        numpy.random.seed(0)
        all_tokens = row_vocab.split("\n")
        chosen_indices = numpy.random.choice(list(range(VOCAB)), 128, replace=False)
        chosen = [all_tokens[i] for i in chosen_indices]
        # Reference 128x128 matrix, indexed by position within ``chosen``.
        freqs = numpy.zeros((len(chosen), ) * 2, dtype=int)
        index = {w: i for i, w in enumerate(chosen)}
        chosen = set(chosen)
        with asdf.open(args.input) as model:
            matrix = assemble_sparse_matrix(model.tree["matrix"]).tocsr()
            tokens = split_strings(model.tree["tokens"])
            # Indices (in the input model's token order) of the sampled tokens.
            interesting = {i for i, t in enumerate(tokens) if t in chosen}
            for y in interesting:
                row = matrix[y]
                yi = index[tokens[y]]
                # Accumulate only the entries whose column is also sampled.
                for x, v in zip(row.indices, row.data):
                    if x in interesting:
                        freqs[yi, index[tokens[x]]] += v
        # Rebuild the full matrix from the shard: local ids are mapped to
        # global ids through global_row / global_col.
        matrix = coo_matrix(
            (value.values, ([global_row[row] for row in local_row.values
                             ], [global_col[col] for col in local_col.values])),
            shape=(VOCAB, VOCAB))
        # Restrict to the sampled rows/columns and compare with the reference.
        matrix = matrix.tocsr()[chosen_indices][:, chosen_indices].todense(
        ).astype(int)
        self.assertTrue((matrix == freqs).all())
def _load_tree(self, tree: dict) -> None:
    """Decode ``tree`` and pass tokens, optional topics and the matrix to ``construct``.

    ``topics`` is forwarded as ``None`` when ``tree["topics"]`` is falsy.
    """
    tokens = split_strings(tree["tokens"])
    if tree["topics"]:
        topics = split_strings(tree["topics"])
    else:
        topics = None
    matrix = assemble_sparse_matrix(tree["matrix"])
    self.construct(tokens, topics, matrix)
def _load_tree(self, tree):
    """Assemble the sparse matrix stored under ``tree["matrix"]`` and pass it to ``construct``."""
    sparse_matrix = assemble_sparse_matrix(tree["matrix"])
    self.construct(matrix=sparse_matrix)
def _load_tree_kwargs(self, tree):
    """Return the keyword arguments decoded from ``tree``: ``repos`` and ``matrix``."""
    repos = split_strings(tree["repos"])
    matrix = assemble_sparse_matrix(tree["matrix"])
    return {"repos": repos, "matrix": matrix}