p = pth.Path(ps.dir_data) / ps.dset pv = p / ps.vocab_path p = p / kind if not p.exists(): tokenizer = encoder.tokenizer_for(ps) tp = F.Topic(ps.dset, tokenizer(reader(ps, kind))) R.dump(p / ps.dset, lambda: recorder(tp)) if kind == 'train' and not pv.exists(): R.dump(pv, lambda: [tokenizer.vocab.record()]) ds = R.dataset(p / ps.dset) return ds, feats feats = { 'context': tf.VarLenFeature(tf.int64), 'uid': tf.FixedLenFeature([], tf.string), } def recorder(topic): for _, c in topic.contexts(): yield R.example({ 'context': R.ints_feat([*c.toks]), 'uid': R.bytes_feat(c.uid), }) def reader(ps, kind): assert not ps.dset or ps.dset == 'enwik8' p = pth.Path(ps.dir_data) / ps.dset with zipfile.ZipFile(p / 'enwik8.zip') as z:
from qnarre.neura import tf
from qnarre.feeds.prep import records as R


def dset(ps, kind):
    """Return the record dataset for the `kind` split together with `feats`.

    Records live under the path `ps.dir_data / ps.dset / kind`. When that
    path does not exist, everything produced by `reader(ps, kind)` is
    collected and passed through `recorder` to `R.dump` before the dataset
    is opened with `R.dataset`.
    """
    assert ps.dset.startswith('mnist')
    root = pth.Path(ps.dir_data) / ps.dset / kind
    target = root / ps.dset
    if not root.exists():
        # Read everything up front; the callable handed to `R.dump` then
        # only wraps the already collected tuple.
        samples = tuple(reader(ps, kind))
        R.dump(target, lambda: recorder(samples))
    return R.dataset(target), feats


# Feature spec returned by `dset` next to the dataset; the keys match the
# ones written by `recorder`.
feats = {
    'int_img': tf.FixedLenFeature([28 * 28], tf.int64),
    'flt_img': tf.VarLenFeature(tf.float32),
    'int_lbl': tf.FixedLenFeature([], tf.int64),
    'str_lbl': tf.FixedLenFeature([], tf.string),
}


def recorder(vals):
    """Yield one `R.example` for each 4-tuple in `vals`.

    Each tuple is unpacked as (int image, float image, int label,
    string label) and wrapped with the matching `R.*_feat` helper.
    """
    for int_img, flt_img, int_lbl, str_lbl in vals:
        features = {
            'int_img': R.ints_feat(int_img),
            'flt_img': R.floats_feat(flt_img),
            'int_lbl': R.one_int_feat(int_lbl),
            'str_lbl': R.bytes_feat(str_lbl),
        }
        yield R.example(features)
tokenizer = encoder.tokenizer_for(ps) ts = F.Topics(tokenizer(reader(ps, kind))) for n in registry['all']: R.dump(p / n, lambda: registry[n](ts)) if kind == 'train' and not pv.exists(): R.dump(pv, lambda: [tokenizer.vocab.record()]) ds = R.dataset(p / ps.dset_subset) return ds, feats[ps.dset_subset] feats = { 'query_valid': { 'title': tf.VarLenFeature(tf.int64), 'context': tf.VarLenFeature(tf.int64), 'query': tf.VarLenFeature(tf.int64), 'valid': tf.FixedLenFeature([], tf.int64), 'uid': tf.FixedLenFeature([], tf.string), }, 'reply_spans': { 'title': tf.VarLenFeature(tf.int64), 'context': tf.VarLenFeature(tf.int64), 'query': tf.VarLenFeature(tf.int64), 'reply': tf.VarLenFeature(tf.int64), 'begin': tf.FixedLenFeature([], tf.int64), 'end': tf.FixedLenFeature([], tf.int64), 'uid': tf.FixedLenFeature([], tf.string), }, 'possibles': { 'title': tf.VarLenFeature(tf.int64), 'context': tf.VarLenFeature(tf.int64), 'query': tf.VarLenFeature(tf.int64),