예제 #1
0
파일: enwik8.py 프로젝트: quantapix/qnarre2
    p = pth.Path(ps.dir_data) / ps.dset
    pv = p / ps.vocab_path
    p = p / kind
    if not p.exists():
        tokenizer = encoder.tokenizer_for(ps)
        tp = F.Topic(ps.dset, tokenizer(reader(ps, kind)))
        R.dump(p / ps.dset, lambda: recorder(tp))
        if kind == 'train' and not pv.exists():
            R.dump(pv, lambda: [tokenizer.vocab.record()])
    ds = R.dataset(p / ps.dset)
    return ds, feats


# Feature spec for the record fields written by `recorder`: 'context' is
# declared as a variable-length int64 feature and 'uid' as a scalar
# (shape []) string feature.
feats = dict(
    context=tf.VarLenFeature(tf.int64),
    uid=tf.FixedLenFeature([], tf.string),
)


def recorder(topic):
    """Yield one `R.example` per context of `topic`.

    `topic.contexts()` is iterated as pairs; only the second element of
    each pair is used. Its `toks` are materialized into a list for the
    'context' feature and its `uid` is stored as the 'uid' feature.
    """
    for _, ctx in topic.contexts():
        features = {
            'context': R.ints_feat(list(ctx.toks)),
            'uid': R.bytes_feat(ctx.uid),
        }
        yield R.example(features)


def reader(ps, kind):
    assert not ps.dset or ps.dset == 'enwik8'
    p = pth.Path(ps.dir_data) / ps.dset
    with zipfile.ZipFile(p / 'enwik8.zip') as z:
예제 #2
0
from qnarre.neura import tf
from qnarre.feeds.prep import records as R


def dset(ps, kind):
    """Load the record dataset for `kind`, writing it first if absent.

    The records live under `<ps.dir_data>/<ps.dset>/<kind>/<ps.dset>`. When
    the `<kind>` directory does not exist yet, the output of `reader` is
    passed through `recorder` and dumped there before loading.

    Args:
        ps: parameters object; `dir_data` and `dset` are read here.
        kind: name of the split; used as a path component and forwarded
            to `reader`.

    Returns:
        A `(dataset, feats)` pair: the result of `R.dataset` and the
        module-level feature spec.

    Raises:
        AssertionError: if `ps.dset` does not start with 'mnist'.
    """
    assert ps.dset.startswith('mnist')
    root = pth.Path(ps.dir_data) / ps.dset / kind
    records = root / ps.dset
    if not root.exists():
        # Materialize the reader output once; the callable handed to
        # R.dump then builds a recorder generator over that fixed tuple.
        samples = tuple(reader(ps, kind))
        R.dump(records, lambda: recorder(samples))
    return R.dataset(records), feats


# Feature spec whose keys mirror the fields emitted by `recorder`.
# 'int_img' is fixed at 28 * 28 int64 values, 'flt_img' is declared
# variable-length float32, and both labels are scalar (shape []) features.
feats = dict(
    int_img=tf.FixedLenFeature([28 * 28], tf.int64),
    flt_img=tf.VarLenFeature(tf.float32),
    int_lbl=tf.FixedLenFeature([], tf.int64),
    str_lbl=tf.FixedLenFeature([], tf.string),
)


def recorder(vals):
    """Yield one `R.example` per 4-tuple in `vals`.

    Each item is unpacked positionally into the values stored under
    'int_img', 'flt_img', 'int_lbl' and 'str_lbl', in that order.
    """
    for int_pixels, flt_pixels, int_label, str_label in vals:
        features = {
            'int_img': R.ints_feat(int_pixels),
            'flt_img': R.floats_feat(flt_pixels),
            'int_lbl': R.one_int_feat(int_label),
            'str_lbl': R.bytes_feat(str_label),
        }
        yield R.example(features)
예제 #3
0
        tokenizer = encoder.tokenizer_for(ps)
        ts = F.Topics(tokenizer(reader(ps, kind)))
        for n in registry['all']:
            R.dump(p / n, lambda: registry[n](ts))
        if kind == 'train' and not pv.exists():
            R.dump(pv, lambda: [tokenizer.vocab.record()])
    ds = R.dataset(p / ps.dset_subset)
    return ds, feats[ps.dset_subset]


feats = {
    'query_valid': {
        'title': tf.VarLenFeature(tf.int64),
        'context': tf.VarLenFeature(tf.int64),
        'query': tf.VarLenFeature(tf.int64),
        'valid': tf.FixedLenFeature([], tf.int64),
        'uid': tf.FixedLenFeature([], tf.string),
    },
    'reply_spans': {
        'title': tf.VarLenFeature(tf.int64),
        'context': tf.VarLenFeature(tf.int64),
        'query': tf.VarLenFeature(tf.int64),
        'reply': tf.VarLenFeature(tf.int64),
        'begin': tf.FixedLenFeature([], tf.int64),
        'end': tf.FixedLenFeature([], tf.int64),
        'uid': tf.FixedLenFeature([], tf.string),
    },
    'possibles': {
        'title': tf.VarLenFeature(tf.int64),
        'context': tf.VarLenFeature(tf.int64),
        'query': tf.VarLenFeature(tf.int64),