def _load_tree_kwargs(self, tree):
    """Build the keyword arguments for construct() from the asdf file tree."""
    parse = type(self).parse_bblfsh_response
    return {
        "repository": tree["repository"],
        "filenames": split_strings(tree["filenames"]),
        "uasts": [parse(response) for response in split_strings(tree["uasts"])],
    }
def _load_tree(self, tree: dict) -> None:
    """
    Attaches the needed data from the tree.

    :param tree: asdf file tree.
    :return: None
    """
    uasts = [self.parse_bblfsh_response(response)
             for response in split_strings(tree["y_uast"])]
    self.construct(
        X=split_strings(tree["X"]),
        y_text=split_strings(tree["y_text"]),
        y_pos=tree["y_pos"],
        y_uast=uasts,
    )
def test_split_bytes(self):
    # With "str": False the splitter must keep the chunks as raw bytes.
    merged = {
        "strings": numpy.array([b"abcdef"]),
        "lengths": numpy.array([1, 2, 3]),
        "str": False,
    }
    self.assertEqual(split_strings(merged), [b"a", b"bc", b"def"])
def test_empty_split_save_load_merge(self):
    # Round-trip an empty list: merge -> asdf serialize -> load -> split.
    strings = []
    merged = merge_strings(strings)
    assert_array_equal(merged["strings"], numpy.array([], dtype="S1"))
    assert_array_equal(merged["lengths"], numpy.array([], dtype=int))
    self.assertIsNone(merged["str"])
    buffer = BytesIO()
    asdf.AsdfFile(merged).write_to(buffer)
    buffer.seek(0)
    af_loaded = asdf.open(buffer)
    self.assertEqual(strings, split_strings(af_loaded.tree))
def _load_tree(self, tree):
    """Restore the embedding matrix and the token list from the asdf tree."""
    tokens = split_strings(tree["tokens"])
    # Copy so the model owns its data independently of the asdf buffers.
    self.construct(embeddings=tree["embeddings"].copy(), tokens=tokens)
def _load_tree(self, tree):
    """Restore document count, tokens and frequencies from the asdf tree."""
    tokens = split_strings(tree["tokens"])
    self.construct(docs=tree["docs"], tokens=tokens, freqs=tree["freqs"])
def _load_tree(self, tree):
    """Restore the token list and the sparse matrix from the asdf tree."""
    matrix = assemble_sparse_matrix(tree["matrix"])
    self.construct(tokens=split_strings(tree["tokens"]), matrix=matrix)
def test_preprocess(self):
    """End-to-end check of id2vec_preprocess: runs it into a temp dir and
    verifies the emitted vocabularies, sums, docfreq model and Swivel shard."""
    # TF is heavy; import lazily so the rest of the suite does not pay for it.
    import tensorflow as tf
    with tempfile.TemporaryDirectory() as tmpdir:
        args = default_preprocess_params(tmpdir, VOCAB)
        with captured_output() as (out, err, log):
            id2vec_preprocess(args)
        # The command must be silent on stdout/stderr.
        self.assertFalse(out.getvalue())
        self.assertFalse(err.getvalue())
        # Exactly these artifacts must be produced.
        self.assertEqual(sorted(os.listdir(tmpdir)), [
            "col_sums.txt", "col_vocab.txt", "row_sums.txt",
            "row_vocab.txt", "shard-000-000.pb"])
        df = OrderedDocumentFrequencies().load(source=args.docfreq_in)
        self.assertEqual(len(df), VOCAB)
        # The co-occurrence matrix is symmetric, so row/col sums and
        # vocabularies must match each other.
        with open(os.path.join(tmpdir, "col_sums.txt")) as fin:
            col_sums = fin.read()
        with open(os.path.join(tmpdir, "row_sums.txt")) as fin:
            row_sums = fin.read()
        self.assertEqual(col_sums, row_sums)
        with open(os.path.join(tmpdir, "col_vocab.txt")) as fin:
            col_vocab = fin.read()
        with open(os.path.join(tmpdir, "row_vocab.txt")) as fin:
            row_vocab = fin.read()
        self.assertEqual(col_vocab, row_vocab)
        # Vocabulary order must agree with the docfreq model, and every
        # vocabulary word must have a positive document frequency.
        self.assertEqual(row_vocab.split("\n"), df.tokens())
        for word in row_vocab.split("\n"):
            self.assertGreater(df[word], 0)
        # Parse the single Swivel protobuf shard written by preprocess.
        with open(os.path.join(tmpdir, "shard-000-000.pb"), "rb") as fin:
            features = tf.parse_single_example(
                fin.read(),
                features={
                    "global_row": tf.FixedLenFeature([VOCAB], dtype=tf.int64),
                    "global_col": tf.FixedLenFeature([VOCAB], dtype=tf.int64),
                    "sparse_local_row": tf.VarLenFeature(dtype=tf.int64),
                    "sparse_local_col": tf.VarLenFeature(dtype=tf.int64),
                    "sparse_value": tf.VarLenFeature(dtype=tf.float32)
                })
        with tf.Session() as session:
            global_row, global_col, local_row, local_col, value = session.run(
                [
                    features[n]
                    for n in ("global_row", "global_col", "sparse_local_row",
                              "sparse_local_col", "sparse_value")
                ])
        # A single shard must cover the whole vocabulary on both axes.
        self.assertEqual(set(range(VOCAB)), set(global_row))
        self.assertEqual(set(range(VOCAB)), set(global_col))
        # Expected number of nonzeros for this fixture — TODO confirm it is
        # stable across dependency versions.
        nnz = 16001
        self.assertEqual(value.values.shape, (nnz, ))
        self.assertEqual(local_row.values.shape, (nnz, ))
        self.assertEqual(local_col.values.shape, (nnz, ))
        # Cross-check a random 128-token submatrix of the shard against the
        # co-occurrences recomputed directly from the input model.
        numpy.random.seed(0)
        all_tokens = row_vocab.split("\n")
        chosen_indices = numpy.random.choice(
            list(range(VOCAB)), 128, replace=False)
        chosen = [all_tokens[i] for i in chosen_indices]
        freqs = numpy.zeros((len(chosen), ) * 2, dtype=int)
        index = {w: i for i, w in enumerate(chosen)}
        chosen = set(chosen)
        with asdf.open(args.input) as model:
            matrix = assemble_sparse_matrix(model.tree["matrix"]).tocsr()
            tokens = split_strings(model.tree["tokens"])
            interesting = {i for i, t in enumerate(tokens) if t in chosen}
            # Accumulate reference co-occurrence counts for the chosen rows.
            for y in interesting:
                row = matrix[y]
                yi = index[tokens[y]]
                for x, v in zip(row.indices, row.data):
                    if x in interesting:
                        freqs[yi, index[tokens[x]]] += v
        # Rebuild the dense matrix from the shard's sparse triples, mapping
        # local indices back to global ones.
        matrix = coo_matrix(
            (value.values,
             ([global_row[row] for row in local_row.values],
              [global_col[col] for col in local_col.values])),
            shape=(VOCAB, VOCAB))
        matrix = matrix.tocsr()[chosen_indices][:, chosen_indices].todense(
        ).astype(int)
        self.assertTrue((matrix == freqs).all())
def test_split_strings(self):
    # Without a "str" flag the splitter decodes the chunks to unicode.
    merged = {
        "strings": numpy.array([b"abcdef"]),
        "lengths": numpy.array([1, 2, 3]),
    }
    self.assertEqual(split_strings(merged), ["a", "bc", "def"])
def _load_tree(self, tree: dict) -> None:
    """Restore tokens, optional topic names and the sparse matrix."""
    tokens = split_strings(tree["tokens"])
    # Topics are optional: an empty/absent entry means unnamed topics.
    topics = split_strings(tree["topics"]) if tree["topics"] else None
    self.construct(tokens, topics, assemble_sparse_matrix(tree["matrix"]))
def _load_tree_kwargs(self, tree):
    """Map asdf tree entries to construct() keyword arguments."""
    return {
        "repos": split_strings(tree["repos"]),
        "matrix": assemble_sparse_matrix(tree["matrix"]),
    }
def _load_tree_kwargs(self, tree):
    """Extend the parent kwargs with the token list stored in the tree."""
    kwargs = super()._load_tree_kwargs(tree)
    kwargs["tokens"] = split_strings(tree["tokens"])
    return kwargs
def _load_tree_kwargs(self, tree):
    """
    Extends the parent kwargs with the source code strings.

    :param tree: asdf file tree.
    :return: dict of keyword arguments for construct().
    """
    # Zero-argument super() (Python 3) — consistent with the sibling
    # _load_tree_kwargs overrides in this file; super(Source, self) is the
    # legacy Python 2 spelling and behaves identically here.
    tree_kwargs = super()._load_tree_kwargs(tree)
    tree_kwargs["sources"] = split_strings(tree["sources"])
    return tree_kwargs