Example No. 1
def handle(srcf, rsf, h5args=h5zipargs):

    if srcf == rsf:
        # same file: deserialize everything, then rewrite it in place with
        # the requested HDF5 dataset arguments (the default h5zipargs name
        # suggests compressed storage)
        h5save(h5load(srcf, restore_list=False), rsf, h5args=h5args)
    else:
        # different files: copy the contents group by group
        with h5File(srcf, "r") as sfg, h5File(rsf, "w") as rfg:
            handle_group(sfg, rfg, h5args=h5args)
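
Both branches re-serialize an HDF5 file with the given dataset arguments; a hypothetical invocation (the file names are made up for illustration):

handle("corpus.h5", "corpus.h5")         # rewrite the file in place
handle("corpus.h5", "corpus_zipped.h5")  # copy group by group into a new file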
Example No. 2
def handle(finput, ftarget, fvocab_i, fvocab_t, frs, minbsize=1, expand_for_mulgpu=True, bsize=max_sentences_gpu, maxpad=max_pad_tokens_sentence, maxpart=normal_tokens_vs_pad_tokens, maxtoken=max_tokens_gpu, minfreq=False, vsize=False):
	vcbi, nwordi = ldvocab(fvocab_i, minf=minfreq, omit_vsize=vsize, vanilla=False)
	vcbt, nwordt = ldvocab(fvocab_t, minf=minfreq, omit_vsize=vsize, vanilla=False)
	if expand_for_mulgpu:
		_bsize = bsize * minbsize
		_maxtoken = maxtoken * minbsize
	else:
		_bsize = bsize
		_maxtoken = maxtoken
	with h5File(frs, "w") as rsf:
		src_grp = rsf.create_group("src")
		tgt_grp = rsf.create_group("tgt")
		curd = 0
		for i_d, td in batch_padder(finput, ftarget, vcbi, vcbt, _bsize, maxpad, maxpart, _maxtoken, minbsize):
			rid = numpy.array(i_d, dtype=numpy.int32)
			rtd = numpy.array(td, dtype=numpy.int32)
			#rld = numpy.array(ld, dtype=numpy.int32)
			wid = str(curd)
			# each padded batch becomes its own dataset, keyed by its running index
			src_grp.create_dataset(wid, data=rid, **h5datawargs)
			tgt_grp.create_dataset(wid, data=rtd, **h5datawargs)
			#rsf["l" + wid] = rld
			curd += 1
		rsf["ndata"] = numpy.array([curd], dtype=numpy.int32)
		rsf["nword"] = numpy.array([nwordi, nwordt], dtype=numpy.int32)
	print("Number of batches: %d\nSource Vocabulary Size: %d\nTarget Vocabulary Size: %d" % (curd, nwordi, nwordt,))
Example No. 3
def handle(cnfg, srcmtf, decf, rsf):

    with h5File(cnfg.dev_data, "r") as tdf:
        nwordi, nwordt = tdf["nword"][:].tolist()

    mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize,
                  cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead,
                  cache_len_default, cnfg.attn_hsize, cnfg.norm_output,
                  cnfg.bindDecoderEmb, cnfg.forbidden_indexes,
                  cnfg.num_layer_fwd)
    init_model_params(mymodel)
    _tmpm = NMTBase(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize,
                    cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead,
                    cache_len_default, cnfg.attn_hsize, cnfg.norm_output,
                    cnfg.bindDecoderEmb, cnfg.forbidden_indexes)
    _tmpm = init_model_params(_tmpm)
    _tmpm = load_model_cpu(srcmtf, _tmpm)
    # copy the base model's parameters into mymodel, then load the decoder
    # weights from decf on top of them
    mymodel.load_base(_tmpm)
    mymodel.dec = load_model_cpu(decf, mymodel.dec)
    if cnfg.share_emb:
        mymodel.dec.wemb.weight = _tmpm.enc.wemb.weight
    if cnfg.bindDecoderEmb:
        mymodel.dec.classifier.weight = mymodel.dec.wemb.weight
    # release the temporary base model
    _tmpm = None

    save_model(mymodel, rsf, sub_module=False, h5args=h5zipargs)
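
The two weight assignments near the end tie the decoder's input embedding to the encoder's embedding and to the output classifier. A minimal standalone PyTorch sketch of the same tying idea (the sizes and names here are illustrative, not this repo's API):

import torch.nn as nn

emb = nn.Embedding(32000, 512)                  # input embedding
classifier = nn.Linear(512, 32000, bias=False)  # output projection
classifier.weight = emb.weight  # share one Parameter object, not a copy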
Example No. 4
def handle(finput, ftarget, fvocab_i, fvocab_t, frs, minbsize=1, expand_for_mulgpu=True, bsize=max_sentences_gpu, maxpad=max_pad_tokens_sentence, maxpart=normal_tokens_vs_pad_tokens, maxtoken=max_tokens_gpu, minfreq=False, vsize=False):
	vcbi, nwordi = ldvocab(fvocab_i, minf=minfreq, omit_vsize=vsize, vanilla=False)
	vcbt, nwordt = ldvocab(fvocab_t, minf=minfreq, omit_vsize=vsize, vanilla=False)
	if expand_for_mulgpu:
		_bsize = bsize * minbsize
		_maxtoken = maxtoken * minbsize
	else:
		_bsize = bsize
		_maxtoken = maxtoken
	with h5File(frs, 'w') as rsf:
		src_grp = rsf.create_group("src")
		tgt_grp = rsf.create_group("tgt")
		curd = {}  # number of batches stored so far, keyed by sentence count
		for i_d, td, nsent in batch_padder(finput, ftarget, vcbi, vcbt, _bsize, maxpad, maxpart, _maxtoken, minbsize):
			rid = numpy.array(i_d, dtype=numpy.int32)
			rtd = numpy.array(td, dtype=numpy.int32)
			_nsentgid = str(nsent)
			_curd = curd.get(nsent, 0)
			# create the per-sentence-count groups on first use
			if _curd == 0:
				src_grp.create_group(_nsentgid)
				tgt_grp.create_group(_nsentgid)
			_curid = str(_curd)
			src_grp[_nsentgid].create_dataset(_curid, data=rid, **h5datawargs)
			tgt_grp[_nsentgid].create_dataset(_curid, data=rtd, **h5datawargs)
			curd[nsent] = _curd + 1
		sents, ndl = dict2pairs(curd)
		rsf["nsent"] = numpy.array(sents, dtype=numpy.int32)
		rsf["ndata"] = numpy.array(ndl, dtype=numpy.int32)
		rsf["nword"] = numpy.array([nwordi, nwordt], dtype=numpy.int32)
	print("Number of batches: %d\nSource Vocabulary Size: %d\nTarget Vocabulary Size: %d" % (sum(ndl), nwordi, nwordt,))
Example No. 5
def handle(finput,
           fvocab_i,
           fvocab_task,
           frs,
           minbsize=1,
           expand_for_mulgpu=True,
           bsize=max_sentences_gpu,
           maxpad=max_pad_tokens_sentence,
           maxpart=normal_tokens_vs_pad_tokens,
           maxtoken=max_tokens_gpu,
           minfreq=False,
           vsize=False):
    vcbi, nwordi = ldvocab(fvocab_i,
                           minf=minfreq,
                           omit_vsize=vsize,
                           vanilla=False)
    vcbtask, nwordtask = ldvocab(fvocab_task,
                                 minf=False,
                                 omit_vsize=False,
                                 vanilla=True)
    if expand_for_mulgpu:
        _bsize = bsize * minbsize
        _maxtoken = maxtoken * minbsize
    else:
        _bsize = bsize
        _maxtoken = maxtoken
    with h5File(frs, 'w') as rsf:
        curd = {}    # number of batches stored so far, keyed by task id
        torder = []  # tasks in the order they first appear
        for i_d, taskd in batch_padder(finput, vcbi, vcbtask, _bsize, maxpad,
                                       maxpart, _maxtoken, minbsize):
            _str_taskd = str(taskd)
            if _str_taskd in rsf:
                src_grp = rsf[_str_taskd]["src"]
            else:
                src_grp = rsf.create_group(_str_taskd).create_group("src")
                torder.append(taskd)
            rid = numpy.array(i_d, dtype=numpy.int32)
            _id = curd.get(taskd, 0)
            wid = str(_id)
            src_grp.create_dataset(wid, data=rid, **h5datawargs)
            curd[taskd] = _id + 1
        rsf["taskorder"] = numpy.array(torder, dtype=numpy.int32)
        curd = [curd[tmp] for tmp in torder]
        rsf["ndata"] = numpy.array(curd, dtype=numpy.int32)
        rsf["nword"] = numpy.array([nwordi, nwordtask], dtype=numpy.int32)
    print("Number of batches: %d\nSource Vocabulary Size: %d\nNumber of Tasks: %d" % (sum(curd), nwordi, nwordtask,))
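
A minimal sketch of walking the task-grouped layout written above, assuming h5py as the backing library and a hypothetical file name:

import h5py

with h5py.File("train.h5", "r") as f:
    taskorder = f["taskorder"][:].tolist()  # tasks in first-seen order
    ndata = f["ndata"][:].tolist()          # batches stored per task
    for task, nd in zip(taskorder, ndata):
        src_grp = f[str(task)]["src"]
        for i in range(nd):
            batch = src_grp[str(i)][:]  # one padded int32 source batch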
Example No. 6
def handle(h5f, bsize, shuf=True):

    ntoken = 0
    rsl = []
    with h5File(h5f, "r") as td:
        ntest = td["ndata"][:].item()
        tl = list(range(ntest))
        if shuf:
            shuffle(tl)

        tgt_grp = td["tgt"]
        for tid in tqdm(tl, mininterval=tqdm_mininterval):
            seq_batch = torch.from_numpy(tgt_grp[str(tid)][:])
            # drop the first token of each sequence, then count the remaining
            # non-zero token ids (assuming pad id 0)
            ot = seq_batch.narrow(-1, 1, seq_batch.size(-1) - 1)
            ntoken += ot.ne(0).int().sum().item()
            if ntoken >= bsize:
                rsl.append(ntoken)
                ntoken = 0

    # guard against division by zero when no step ever reached bsize tokens
    return sum(rsl) / float(len(rsl)) if rsl else 0.0
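
A hypothetical call (file name and token budget are illustrative), estimating how many target tokens an update consumes on average when batches are accumulated until at least bsize tokens are seen:

avg_tokens = handle("dev.h5", 25000)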
Example No. 7
def handle(h5f, bsize, shuf=True):

    with h5File(h5f, "r") as td:
        # one (sentence-count, batch-index) key pair per stored batch
        tl = [(str(nsent), str(_curd),)
              for nsent, ndata in zip(td["nsent"][:].tolist(), td["ndata"][:].tolist())
              for _curd in range(ndata)]
        if shuf:
            shuffle(tl)

        tgt_grp = td["tgt"]
        ntoken = 0
        nstep = 0
        for nsent, i_d in tqdm(tl, mininterval=tqdm_mininterval):
            seq_batch = torch.from_numpy(tgt_grp[nsent][i_d][:])
            ot = seq_batch.narrow(-1, 1, seq_batch.size(-1) - 1)
            ntoken += ot.ne(0).int().sum().item()
            if ntoken >= bsize:
                nstep += 1
                ntoken = 0

    return nstep
Example No. 8
def handle(h5f, bsize, shuf=True):

    with h5File(h5f, "r") as td:
        ntest = td["ndata"][:].tolist()
        # one (batch-index, task-id) pair per stored batch, following taskorder
        tl = [(i, str(_task),)
              for _nd, _task in zip(ntest, td["taskorder"][:].tolist())
              for i in range(_nd)]
        if shuf:
            shuffle(tl)

        ntoken = 0
        nstep = 0
        for tid, taskid in tqdm(tl, mininterval=tqdm_mininterval):
            seq_batch = torch.from_numpy(td[taskid]["tgt"][str(tid)][:])
            ot = seq_batch.narrow(-1, 1, seq_batch.size(-1) - 1)
            ntoken += ot.ne(0).int().sum().item()
            if ntoken >= bsize:
                nstep += 1
                ntoken = 0

    return nstep
Example No. 9
def handle(srcf):

    with h5File(srcf, "r") as sfg:
        rs = handle_group(sfg)
    print(rs)