def main(args):
    set_seed(args.seed)
    tk = Tokenizer(args.tokenizer)
    model = TransformerModel(
        d_model=768, d_ff=1024, dropout=.1, layers=6, heads=8, d_emb=-1,
        pad_token_id=tk.pad_id, vocab_size=tk.vocab_size)
    ds = dataset(0)
    device = torch.device(args.device)
    model.load_state_dict(torch.load(args.ckpt, map_location=device)['model'])
    ds.set_mono_ratio(args.m_ratio)

    if not os.path.exists(args.results):
        # First pass: decode one sequence at a time with beam_search and write
        # the outputs as the reference results file.
        start = timeit.default_timer()
        ds.generate(
            lambda x: [beam_search(
                model=model.to(device),
                input_sequence=torch.LongTensor(tk.tokenize(x)).to(device),
                bos_id=tk.bos_id,
                eos_id=tk.eos_id,
                beam_width=args.beam,
                device=device,
                max_seq_len=64)],
            max_input_len=64)
        end = timeit.default_timer()
        print(f'{end - start:.2f} sec')
        with open(args.results, 'w') as fout:
            fout.write('\n'.join(tk.detokenize(ds.synthetic[1:])))
    else:
        # Second pass: decode in batches with beam_search_v2, then compare its
        # outputs line by line against the reference results file.
        start = timeit.default_timer()
        ds.generate(
            lambda x: beam_search_v2(
                model=model.to(device),
                input_sequence=tk.tokenize(x),
                tokenizer=tk,
                # Batch budget: stop growing a batch once
                # (input tokens + 1.5 * output tokens) * beam width > 256 * 64.
                is_full=lambda b, nx, ny: (nx + ny * 1.5) * b > 256 * 64,
                beam_width=args.beam,
                device=device,
                max_seq_len=64),
            max_input_len=64,
            batch_size=64)
        end = timeit.default_timer()
        s = tk.detokenize(ds.synthetic[1:])
        with open(args.results + '_2', 'w') as fout:
            fout.write('\n'.join(s))
        with open(args.results) as fin:
            r = fin.readlines()
        if len(s) != len(r):
            raise Exception(f'expected {len(r)} results but got {len(s)}')
        for i, j in zip(s, r):
            if i != j.strip():
                print(f'---------------\n"{i}"\n!=\n"{j.strip()}"')
        print(f'{end - start:.2f} sec')
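# Hypothetical CLI wiring for main() above. This is a minimal sketch only: the
# flag names mirror the attributes read from `args`, but the types, defaults,
# and required flags are assumptions, not the original script's parser.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--tokenizer', type=str, required=True)
    parser.add_argument('--ckpt', type=str, required=True)
    parser.add_argument('--results', type=str, required=True)
    parser.add_argument('--m_ratio', type=float, default=1.0)
    parser.add_argument('--beam', type=int, default=4)
    parser.add_argument('--device', type=str, default='cpu')
    main(parser.parse_args())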
# `f` and `w` are carried over from earlier cells.
f = np.array(f)
print(np.exp(f[39:]).sum())
# Frequency table of sequence lengths, printed as markdown-style table rows.
l = list(map(len, w[39:]))
freq = sorted(Counter(l).items(), key=lambda x: x[0])
print('\n'.join(f'|{i}|{j}|' for i, j in freq))

# In[]
ds = NewsDataset('data/news_dataset_tag10_v2.1.db')
# ds = NewsDataset('data/wiki.db')
tk = Tokenizer('data/t2.1_c1')

# In[]
from src.utils import peek

# Sanity check: the tokenizer should round-trip a sampled record's text.
d = peek(ds.data, 1)
print(d[0][2])
print(tk.detokenize(tk.tokenize(d[0][2])))

# In[]
# Character-length distribution of the text field: min, quartiles, max.
ll = list(map(lambda x: len(x[2]), ds.data))
sl = sorted(ll)
print(sl[0])
print(sl[int(len(sl) * 0.25)])
print(sl[int(len(sl) * 0.5)])
print(sl[int(len(sl) * 0.75)])
print(sl[-1])

# In[]
# Token lengths of the two text fields of every record.
tl = []
al = []
for idx, t, a in tqdm(ds.data):
    tl.append(len(tk.tokenize(t, bos=False, eos=False)))
    al.append(len(tk.tokenize(a, bos=False, eos=False)))
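# In[]
# Follow-up sketch (not part of the original cells): summarize the token-length
# lists gathered above with the same min/quartile/max printout used for the raw
# character lengths in the earlier cell.
for name, lengths in (('tl', tl), ('al', al)):
    s = sorted(lengths)
    print(name, s[0], s[int(len(s) * 0.25)], s[int(len(s) * 0.5)],
          s[int(len(s) * 0.75)], s[-1])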
def main(args):
    if args.seed is not None:
        set_seed(args.seed)
    tk = Tokenizer(args.tokenizer)
    model = TransformerModel(
        d_model=768, d_ff=1024, dropout=0, layers=args.layer, heads=args.heads,
        d_emb=-1, pad_token_id=tk.pad_id, vocab_size=tk.vocab_size)

    if args.inseq is not None:
        # Decode a single user-supplied sequence. Note: no checkpoint is loaded
        # in this branch and the decoded result `r` is never printed or returned.
        r = beam_search_v2(model, tk.tokenize(args.inseq), tk,
                           lambda b, nx, ny: (nx + ny) * b > 128 * 64,
                           4, args.device, args.maxlen)
    else:
        if args.peek == 0:
            return
        ds = NewsDataset(args.data, args.a, args.b,
                         inplace=args.inplace, sample=args.sample, seed=args.seed)
        device = torch.device(args.device)
        if args.ckpt == 'latest':
            args.ckpt = find_latest_ckpt(args.model_dir, args.ckpt_pattern).path
        model.load_state_dict(
            torch.load(args.ckpt, map_location=device)['model'])
        model.to(device)

        # Build the evaluation batch: either peek a few random records or pick
        # the records whose ids are listed in args.aids (kept in that order).
        if len(args.aids) == 0:
            ids, inseq, outseq = ds.get_collate_fn(tk, getid=True)(
                peek(ds, args.peek, args.seed))
        else:
            data = list(filter(lambda x: x[0] in args.aids, ds.data))
            if len(data) != len(args.aids):
                raise Exception(f'only got ids {[x[0] for x in data]}')
            sdata = []
            for i in args.aids:
                for x in data:
                    if x[0] == i:
                        sdata.append(x)
            ids, inseq, outseq = ds.get_collate_fn(tk, getid=True)(sdata)

        start = timeit.default_timer()

        # Decode with every requested beam width and every requested top-k.
        preds = []
        for beam_n in args.beam:
            p = beam_search_v2(model, inseq, tk,
                               lambda b, nx, ny: (nx + ny) * b > 128 * 64,
                               beam_n, args.device, args.maxlen)
            preds.append((f'beam{beam_n}', p))
        for topk_k in args.topk:
            p = topk(model, inseq, tk, topk_k, args.device, args.maxlen)
            preds.append((f'topk{topk_k}', p))

        # One row per input: source text, each decoding strategy's output, and
        # the (truncated) reference target.
        results = []
        for idx in range(len(inseq)):
            results.append((
                tk.detokenize(inseq[idx]),
                *[tk.detokenize(p[idx]) for _, p in preds],
                tk.detokenize(outseq[idx][:args.maxlen + 1])))
        df = pd.DataFrame(results,
                          columns=['input', *[n for n, _ in preds], 'target'])
        if args.output is None:
            print(df)
        else:
            df.to_csv(args.output)
        print(f'{timeit.default_timer() - start:.2f} sec')
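# Sketch of the checkpoint-lookup helper assumed by the `args.ckpt == 'latest'`
# branch above. Only the call-site interface (two arguments, an object with a
# `.path` attribute returned) comes from this file; the globbing and the
# "largest number in the filename is the training step" rule are illustrative
# assumptions, not the project's actual find_latest_ckpt implementation.
import glob
import os
import re
from collections import namedtuple

Checkpoint = namedtuple('Checkpoint', ['step', 'path'])

def find_latest_ckpt_sketch(model_dir, ckpt_pattern):
    # Collect files matching the pattern and keep the one with the largest
    # integer embedded in its filename (treated as the training step).
    candidates = []
    for path in glob.glob(os.path.join(model_dir, ckpt_pattern)):
        nums = re.findall(r'\d+', os.path.basename(path))
        step = int(nums[-1]) if nums else -1
        candidates.append(Checkpoint(step, path))
    if not candidates:
        raise FileNotFoundError(f'no checkpoint matching {ckpt_pattern} in {model_dir}')
    return max(candidates, key=lambda c: c.step)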