def reader(ps, kind):
    """Yield ``F.Topic`` trees parsed from SQuAD-style ``.json.xz`` files.

    Args:
        ps: params object; reads ``ps.dir_data`` and ``ps.dset`` to locate files.
        kind: split name used to look up the file list in ``registry``.

    Yields:
        F.Topic(title, contexts) — one per entry in each file's ``data`` list.
        Each context carries normalized paragraph text and its queries; answers
        whose normalized text is not found at the recorded start offset are
        reported via ``print`` and dropped.

    Note: the original bound the paragraph loop variable to ``p``, clobbering
    the directory path and breaking the second file of ``registry[kind]``;
    locals are renamed here so the path survives the whole loop. The parameter
    ``ps`` was likewise shadowed by the plausible-answers list.
    """
    root = pth.Path(ps.dir_data) / ps.dset
    for name in registry[kind]:
        with lzma.open(root / (name + '.json.xz'), mode='rt') as f:
            for data in json.load(f)['data']:
                cs = []
                for para in data['paragraphs']:
                    ct = utils.normalize(para['context'])
                    qs = []
                    for q in para['qas']:
                        qu = q['id']
                        rs = []
                        for i, r in enumerate(q.get('answers', ())):
                            rt = utils.normalize(r['text'])
                            start = r['answer_start']
                            # Accept the answer only if the normalized text
                            # still sits exactly at the recorded offset.
                            if ct.find(rt, start) == start:
                                span = F.Span(start, start + len(rt))
                                rs.append(F.Reply(rt, span, qu + f'-r{i}'))
                            else:
                                print('Mismatched', ct[:20], rt[:20])
                        plausibles = []
                        for i, a in enumerate(q.get('plausible_answers', ())):
                            pt = utils.normalize(a['text'])
                            start = a['answer_start']
                            if ct.find(pt, start) == start:
                                span = F.Span(start, start + len(pt))
                                plausibles.append(F.Reply(pt, span, qu + f'-p{i}'))
                            else:
                                print('Mismatched', ct[:20], pt[:20])
                        qt = utils.normalize(q['question'])
                        qv = q.get('is_impossible', False)
                        qs.append(F.Query(qt, qv, qu, rs, plausibles))
                    cs.append(F.Context(ct, qs))
                tt = utils.normalize(data['title'])
                yield F.Topic(tt, cs)
def _check_roundtrip(enc, txt):
    """Assert that encoding *txt* and decoding the result reproduces it exactly.

    Each encoder call yields (token, offset, extra) triples; ``decode`` takes
    the token and offset sequences and must rebuild the original string.
    """
    toks, offs, _ = zip(*enc(txt))
    assert enc.decode(toks, offs) == txt


def test_encoders():
    """Round-trip smoke test for the char, word, BERT and GPT-2 encoders.

    First checks a short punctuation-heavy literal, then 200 consecutive
    100-word windows of the text8 corpus, and finally prints the four
    vocabulary sizes.
    """
    txt = "sf!fg dfg'sdf?dfg xcxb'sdfg!sdg 324sdf.sdfa"
    ce = encoder.CharE(ps)
    _check_roundtrip(ce, txt)
    we = encoder.WordE(ps)
    _check_roundtrip(we, txt)
    be = encoder.BertE(ps)
    ge = encoder.Gpt2E(ps)
    with zipfile.ZipFile('.data/text8/text8.zip') as z:
        with z.open('text8') as f:
            ws = utils.normalize(f.read().decode().strip())
    # Second normalize kept from the original — presumably idempotent; TODO confirm.
    ws = utils.normalize(ws).split()
    for i in range(200):
        txt = ' '.join(ws[i * 100:i * 100 + 100])
        for enc in (ce, we, be, ge):
            _check_roundtrip(enc, txt)
    print(len(ce.vocab), len(we.vocab), len(be.vocab), len(ge.vocab))
def reader(ps, kind):
    """Yield fixed-length ``F.Context`` word windows from the enwik8 corpus.

    Args:
        ps: params object; reads ``dir_data``, ``dset``, ``test_train_split``
            (percent per held-out split, default 10) and ``len_words``.
        kind: 'train', 'valid' or 'test' — selects the corresponding slice of
            the whitespace-split, normalized corpus.

    Yields:
        F.Context of ``ps.len_words`` words with a zero-padded numeric uid;
        the trailing partial window is dropped.
    """
    assert not ps.dset or ps.dset == 'enwik8'
    root = pth.Path(ps.dir_data) / ps.dset
    with zipfile.ZipFile(root / 'enwik8.zip') as z, z.open('enwik8') as f:
        words = utils.normalize(f.read().decode().strip()).split()
    pct = ps.test_train_split or 10
    n = len(words) * pct // 100
    # Layout: [......train......][valid: n][test: n]
    if kind == 'train':
        words = words[:-2 * n]
    elif kind == 'valid':
        words = words[-2 * n:-n]
    elif kind == 'test':
        words = words[-n:]
    wl = ps.len_words
    for i in range(len(words) // wl):
        yield F.Context(words[i * wl:(i + 1) * wl], uid='{:0>9d}0'.format(i))
def reader(ps, kind):
    """Yield one ``F.Topic`` per data row of the registry's ``.csv.xz`` files.

    Args:
        ps: params object; reads ``ps.dir_data`` and ``ps.dset``.
        kind: split name; selects files via ``registry`` and the row layout.
            'train' rows carry a title, a 4-column context and one query;
            other rows carry two candidate queries whose truth flag comes
            from the integer in the last column (1 or 2).

    Yields:
        F.Topic(title, [F.Context(context, queries)]) — header row skipped.
    """
    root = pth.Path(ps.dir_data) / ps.dset
    for name in registry[kind]:
        with lzma.open(root / (name + '.csv.xz'), mode='rt') as f:
            for i, row in enumerate(csv.reader(f)):
                if not i:
                    continue  # skip the header row
                row = utils.normalize(row)
                if kind == 'train':
                    tt = row[1].strip()
                    ct = ' '.join(c.strip() for c in row[2:6])
                    qs = [F.Query(row[6].strip(), True, row[0].strip())]
                else:
                    tt = ''
                    ct = ' '.join(c.strip() for c in row[1:5])
                    qu = row[0].strip()
                    label = int(row[-1])
                    qs = [
                        F.Query(row[5].strip(), label == 1, qu + '-r0'),
                        F.Query(row[6].strip(), label == 2, qu + '-r1'),
                    ]
                yield F.Topic(tt, [F.Context(ct, qs)])