def handle(srcf, rsf, h5args=h5zipargs):
    # Re-save an HDF5 file with the given compression arguments. When source
    # and result are the same file, load it fully into memory first; otherwise
    # stream group-by-group from the source into the freshly created result.
    if srcf == rsf:
        h5save(h5load(srcf, restore_list=False), rsf, h5args=h5args)
    else:
        with h5File(srcf, "r") as sfg, h5File(rsf, "w") as rfg:
            handle_group(sfg, rfg, h5args=h5args)
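# Hedged usage sketch (the file names are hypothetical; h5zipargs is the
# project's default compression setting):
#
#     handle("model.h5", "model.compressed.h5")   # copy with compression
#     handle("model.h5", "model.h5")              # recompress in place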
def handle(finput, ftarget, fvocab_i, fvocab_t, frs, minbsize=1, expand_for_mulgpu=True, bsize=max_sentences_gpu, maxpad=max_pad_tokens_sentence, maxpart=normal_tokens_vs_pad_tokens, maxtoken=max_tokens_gpu, minfreq=False, vsize=False):
    # Convert parallel text into padded int32 batches stored in an HDF5 file.
    vcbi, nwordi = ldvocab(fvocab_i, minf=minfreq, omit_vsize=vsize, vanilla=False)
    vcbt, nwordt = ldvocab(fvocab_t, minf=minfreq, omit_vsize=vsize, vanilla=False)
    # Scale the per-step budget so each of the minbsize GPUs gets a full batch.
    if expand_for_mulgpu:
        _bsize = bsize * minbsize
        _maxtoken = maxtoken * minbsize
    else:
        _bsize = bsize
        _maxtoken = maxtoken
    with h5File(frs, "w") as rsf:
        src_grp = rsf.create_group("src")
        tgt_grp = rsf.create_group("tgt")
        curd = 0
        for i_d, td in batch_padder(finput, ftarget, vcbi, vcbt, _bsize, maxpad, maxpart, _maxtoken, minbsize):
            rid = numpy.array(i_d, dtype=numpy.int32)
            rtd = numpy.array(td, dtype=numpy.int32)
            wid = str(curd)
            src_grp.create_dataset(wid, data=rid, **h5datawargs)
            tgt_grp.create_dataset(wid, data=rtd, **h5datawargs)
            curd += 1
        rsf["ndata"] = numpy.array([curd], dtype=numpy.int32)
        rsf["nword"] = numpy.array([nwordi, nwordt], dtype=numpy.int32)
    print("Number of batches: %d\nSource Vocabulary Size: %d\nTarget Vocabulary Size: %d" % (curd, nwordi, nwordt))
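# Layout of the resulting file, as written by the code above:
#
#     /src/<i>   int32 source batch i, shape (#sentences, padded length)
#     /tgt/<i>   int32 target batch i, shape (#sentences, padded length)
#     /ndata     [number of batches]
#     /nword     [source vocabulary size, target vocabulary size]
#
# Hypothetical invocation (file names are assumptions, not fixed by the code):
#
#     handle("train.src", "train.tgt", "src.vcb", "tgt.vcb", "train.h5")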
def handle(cnfg, srcmtf, decf, rsf):
    # Assemble a model from a trained base checkpoint plus a separately
    # trained decoder, re-tie the shared embeddings, and save the result.
    with h5File(cnfg.dev_data, "r") as tdf:
        nwordi, nwordt = tdf["nword"][:].tolist()
    mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes, cnfg.num_layer_fwd)
    init_model_params(mymodel)
    _tmpm = NMTBase(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes)
    _tmpm = init_model_params(_tmpm)
    _tmpm = load_model_cpu(srcmtf, _tmpm)
    mymodel.load_base(_tmpm)
    mymodel.dec = load_model_cpu(decf, mymodel.dec)
    # Loading parameters replaces the Parameter objects, so the embedding ties
    # must be re-established afterwards.
    if cnfg.share_emb:
        mymodel.dec.wemb.weight = _tmpm.enc.wemb.weight
    if cnfg.bindDecoderEmb:
        mymodel.dec.classifier.weight = mymodel.dec.wemb.weight
    _tmpm = None
    save_model(mymodel, rsf, sub_module=False, h5args=h5zipargs)
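# Hedged usage sketch (file names are hypothetical; cnfg is the project's
# configuration object):
#
#     handle(cnfg, "base_model.h5", "decoder.h5", "merged_model.h5")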
def handle(finput, ftarget, fvocab_i, fvocab_t, frs, minbsize=1, expand_for_mulgpu=True, bsize=max_sentences_gpu, maxpad=max_pad_tokens_sentence, maxpart=normal_tokens_vs_pad_tokens, maxtoken=max_tokens_gpu, minfreq=False, vsize=False):
    # As the plain variant above, but batches are additionally grouped by the
    # number of sentences they contain (batch_padder here also yields nsent).
    vcbi, nwordi = ldvocab(fvocab_i, minf=minfreq, omit_vsize=vsize, vanilla=False)
    vcbt, nwordt = ldvocab(fvocab_t, minf=minfreq, omit_vsize=vsize, vanilla=False)
    if expand_for_mulgpu:
        _bsize = bsize * minbsize
        _maxtoken = maxtoken * minbsize
    else:
        _bsize = bsize
        _maxtoken = maxtoken
    with h5File(frs, "w") as rsf:
        src_grp = rsf.create_group("src")
        tgt_grp = rsf.create_group("tgt")
        curd = {}
        for i_d, td, nsent in batch_padder(finput, ftarget, vcbi, vcbt, _bsize, maxpad, maxpart, _maxtoken, minbsize):
            rid = numpy.array(i_d, dtype=numpy.int32)
            rtd = numpy.array(td, dtype=numpy.int32)
            _nsentgid = str(nsent)
            _curd = curd.get(nsent, 0)
            # Create the per-sentence-count groups on first encounter.
            if _curd == 0:
                src_grp.create_group(_nsentgid)
                tgt_grp.create_group(_nsentgid)
            _curid = str(_curd)
            src_grp[_nsentgid].create_dataset(_curid, data=rid, **h5datawargs)
            tgt_grp[_nsentgid].create_dataset(_curid, data=rtd, **h5datawargs)
            curd[nsent] = _curd + 1
        sents, ndl = dict2pairs(curd)
        rsf["nsent"] = numpy.array(sents, dtype=numpy.int32)
        rsf["ndata"] = numpy.array(ndl, dtype=numpy.int32)
        rsf["nword"] = numpy.array([nwordi, nwordt], dtype=numpy.int32)
    print("Number of batches: %d\nSource Vocabulary Size: %d\nTarget Vocabulary Size: %d" % (sum(ndl), nwordi, nwordt))
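# Layout of the grouped file, as written by the code above:
#
#     /src/<nsent>/<i>   int32 source batch i among batches of nsent sentences
#     /tgt/<nsent>/<i>   int32 target batch i among batches of nsent sentences
#     /nsent             the distinct sentence counts (group keys)
#     /ndata             number of batches per sentence-count group
#     /nword             [source vocabulary size, target vocabulary size]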
def handle(finput, fvocab_i, fvocab_task, frs, minbsize=1, expand_for_mulgpu=True, bsize=max_sentences_gpu, maxpad=max_pad_tokens_sentence, maxpart=normal_tokens_vs_pad_tokens, maxtoken=max_tokens_gpu, minfreq=False, vsize=False):
    # Multi-task variant: source-only batches are grouped by task id, and the
    # task vocabulary is loaded verbatim (vanilla=True, no frequency cut-off).
    vcbi, nwordi = ldvocab(fvocab_i, minf=minfreq, omit_vsize=vsize, vanilla=False)
    vcbtask, nwordtask = ldvocab(fvocab_task, minf=False, omit_vsize=False, vanilla=True)
    if expand_for_mulgpu:
        _bsize = bsize * minbsize
        _maxtoken = maxtoken * minbsize
    else:
        _bsize = bsize
        _maxtoken = maxtoken
    with h5File(frs, "w") as rsf:
        curd = {}
        torder = []
        for i_d, taskd in batch_padder(finput, vcbi, vcbtask, _bsize, maxpad, maxpart, _maxtoken, minbsize):
            _str_taskd = str(taskd)
            # Create the per-task group on first encounter and remember the
            # order in which tasks appear.
            if _str_taskd in rsf:
                src_grp = rsf[_str_taskd]["src"]
            else:
                src_grp = rsf.create_group(_str_taskd).create_group("src")
                torder.append(taskd)
            rid = numpy.array(i_d, dtype=numpy.int32)
            _id = curd.get(taskd, 0)
            wid = str(_id)
            src_grp.create_dataset(wid, data=rid, **h5datawargs)
            curd[taskd] = _id + 1
        rsf["taskorder"] = numpy.array(torder, dtype=numpy.int32)
        curd = [curd[tmp] for tmp in torder]
        rsf["ndata"] = numpy.array(curd, dtype=numpy.int32)
        rsf["nword"] = numpy.array([nwordi, nwordtask], dtype=numpy.int32)
    print("Number of batches: %d\nSource Vocabulary Size: %d\nNumber of Tasks: %d" % (sum(curd), nwordi, nwordtask))
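# Layout of the multi-task file, as written by the code above:
#
#     /<task>/src/<i>   int32 source batch i of the given task
#     /taskorder        task ids in order of first appearance
#     /ndata            number of batches per task, aligned with taskorder
#     /nword            [source vocabulary size, number of tasks]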
def handle(h5f, bsize, shuf=True):
    # Simulate token-based batching over a converted data file and return the
    # average number of target tokens actually consumed per training step.
    ntoken = 0
    rsl = []
    with h5File(h5f, "r") as td:
        ntest = td["ndata"][:].item()
        tl = list(range(ntest))
        if shuf:
            shuffle(tl)
        tgt_grp = td["tgt"]
        for tid in tqdm(tl, mininterval=tqdm_mininterval):
            seq_batch = torch.from_numpy(tgt_grp[str(tid)][:])
            # Drop the leading token column and count non-pad (non-zero) tokens.
            ot = seq_batch.narrow(-1, 1, seq_batch.size(-1) - 1)
            ntoken += ot.ne(0).int().sum().item()
            if ntoken >= bsize:
                rsl.append(ntoken)
                ntoken = 0
    # Guard against an empty result when bsize exceeds the total token count.
    return sum(rsl) / float(len(rsl)) if rsl else 0.0
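# Hedged usage sketch ("train.h5" is a hypothetical file produced by the
# conversion script above; 25000 is an assumed per-step token budget):
#
#     avg_tokens_per_step = handle("train.h5", 25000)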
def handle(h5f, bsize, shuf=True):
    # Same simulation for files grouped by sentence count: walk every
    # (nsent, batch) pair and return the number of steps that a token budget
    # of bsize yields.
    with h5File(h5f, "r") as td:
        tl = [(str(nsent), str(_curd)) for nsent, ndata in zip(td["nsent"][:].tolist(), td["ndata"][:].tolist()) for _curd in range(ndata)]
        if shuf:
            shuffle(tl)
        tgt_grp = td["tgt"]
        ntoken = 0
        nstep = 0
        for nsent, i_d in tqdm(tl, mininterval=tqdm_mininterval):
            seq_batch = torch.from_numpy(tgt_grp[nsent][i_d][:])
            ot = seq_batch.narrow(-1, 1, seq_batch.size(-1) - 1)
            ntoken += ot.ne(0).int().sum().item()
            if ntoken >= bsize:
                nstep += 1
                ntoken = 0
    return nstep
def handle(h5f, bsize, shuf=True):
    # Multi-task version of the step counter: enumerate (batch index, task)
    # pairs via taskorder and count steps under the bsize token budget.
    with h5File(h5f, "r") as td:
        ntest = td["ndata"][:].tolist()
        tl = [(i, str(_task)) for _nd, _task in zip(ntest, td["taskorder"][:].tolist()) for i in range(_nd)]
        if shuf:
            shuffle(tl)
        ntoken = 0
        nstep = 0
        for tid, taskid in tqdm(tl, mininterval=tqdm_mininterval):
            seq_batch = torch.from_numpy(td[taskid]["tgt"][str(tid)][:])
            ot = seq_batch.narrow(-1, 1, seq_batch.size(-1) - 1)
            ntoken += ot.ne(0).int().sum().item()
            if ntoken >= bsize:
                nstep += 1
                ntoken = 0
    return nstep
def handle(srcf):
    with h5File(srcf, "r") as sfg:
        rs = handle_group(sfg)
    print(rs)