Example #1
    def __init__(self,
                 generator,
                 criterion,
                 input_size,
                 output_size,
                 trg_word_emb=None,
                 bowMapper=None,
                 loss_norm='tokens',
                 bow_crit=None,
                 emb_crit=None,
                 rl_crit=None,
                 chunk_size=5,
                 device_ids=None):

        super(MultiGPULossCompute, self).__init__()
        if bow_crit is not None or emb_crit is not None:
            wlog('using the bow loss')
            self.init_lambda, self.max_lambda, self.warmup_steps = 0., 1., 20000
            self.lambda_step = (self.max_lambda -
                                self.init_lambda) / self.warmup_steps
            self.decay_factor = self.max_lambda * (self.warmup_steps**0.5)
        self.loss_norm = loss_norm
        self.chunk_size = chunk_size  # presumably the number of target positions processed per loss chunk
        self.device_ids = device_ids
        self.rl_crit_single = rl_crit
        # self.trg_word_emb = trg_word_emb.we
        self.output_size = output_size
        self.generator = generator
        self.criterion = criterion
        self.bowMapper = nn.parallel.replicate(
            bowMapper,
            devices=self.device_ids) if bowMapper is not None else None
Example #2
 def __init__(self, output_size):
     super(BOWLossCriterion, self).__init__()
     weight = tc.ones(output_size, requires_grad=False)
     weight[PAD] = 0  # do not predict padding, same as ignore_index
     self.crit = nn.NLLLoss(weight, ignore_index=PAD, reduction='sum')
     wlog('using the bag of words loss')
     self.sigmoid = nn.Sigmoid()
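
Only the constructor is shown above. A hypothetical forward pass for this criterion (argument names and shapes are assumptions, not from the repo) could score every gold bag-of-words token against one sentence-level distribution:

def forward(self, scores, bow):
    # scores: [batch, output_size] sentence-level logits
    # bow:    [batch, max_bow_len] gold bag-of-words ids, padded with PAD (skipped via ignore_index)
    log_prob = tc.log(self.sigmoid(scores) + 1e-20)               # log of per-word probabilities
    log_prob = log_prob.unsqueeze(1).expand(-1, bow.size(1), -1)  # broadcast over bag positions
    return self.crit(log_prob.reshape(-1, log_prob.size(-1)), bow.reshape(-1))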
Example #3
    def __init__(self, dropout_prob, n_embed, max_len=MAX_SEQ_SIZE):

        super(PositionalEncoding, self).__init__()

        # Compute the positional encodings once in log space.
        pe = tc.zeros(max_len, n_embed)
        position = tc.arange(0., max_len).unsqueeze(1)
        div_term = tc.exp(
            tc.arange(0., n_embed, 2) * -(math.log(10000.0) / n_embed))
        # keep dim 0 for padding token position encoding zero vector
        #inter_term = position.float() * div_term
        #pe[1:, 0::2] = tc.sin(inter_term)[1:]
        #pe[1:, 1::2] = tc.cos(inter_term)[1:]
        # [5000, 1] * [256] = [5000, 256]
        pe[:, 0::2] = tc.sin(position * div_term)
        pe[:, 1::2] = tc.cos(position * div_term)
        pe = pe.unsqueeze(0)  # [5000, 512] -> [1, 5000, 512]
        self.register_buffer('pe', pe)
        self.n_embed = n_embed

        wlog('\t pe: {}'.format(pe.size()))

        self.dropout = None
        if dropout_prob is not None and 0. < dropout_prob <= 1.0:
            wlog('\t with emb dropout prob = {} ...'.format(dropout_prob))
            self.dropout = nn.Dropout(p=dropout_prob)
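
The class only shows __init__ here; a minimal forward sketch (the usual way such a registered buffer is consumed, assuming x is a [batch, seq_len, n_embed] embedding output) would be:

def forward(self, x):
    x = x + self.pe[:, :x.size(1)]  # add the precomputed encodings for the first seq_len positions
    return self.dropout(x) if self.dropout is not None else x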
Example #4
def build_decoder(trg_emb):

    if wargs.encoder_type == 'gru':
        from models.gru_decoder import StackedGRUDecoder
        return StackedGRUDecoder(trg_emb=trg_emb,
                                 enc_hid_size=wargs.d_enc_hid,
                                 dec_hid_size=wargs.d_dec_hid,
                                 n_layers=wargs.n_dec_layers,
                                 attention_type=wargs.attention_type,
                                 rnn_dropout_prob=wargs.rnn_dropout,
                                 out_dropout_prob=wargs.output_dropout)
    if wargs.decoder_type == 'att':
        from models.self_att_model import SelfAttDecoder, SelfAttDecoderLayer, \
                PositionwiseFeedForward, clones
        from models.attention import MultiHeadedAttention
        c = copy.deepcopy
        attn = MultiHeadedAttention(h=wargs.n_head,
                                    d_model=wargs.d_model,
                                    dropout=wargs.att_dropout)
        wlog('clones -> {}'.format(2))
        ff = PositionwiseFeedForward(d_model=wargs.d_model,
                                     d_ff=wargs.d_ff_filter,
                                     dropout=wargs.relu_dropout)
        return SelfAttDecoder(trg_emb=trg_emb,
                              layer=SelfAttDecoderLayer(
                                  wargs.d_model,
                                  c(attn),
                                  c(attn),
                                  c(ff),
                                  dropout=wargs.residual_dropout),
                              N=wargs.n_enc_layers)
Example #5
 def __init__(self, features, eps=1e-5):
     super(LayerNorm, self).__init__()
     self.a_2 = nn.Parameter(tc.ones(features))
     wlog('*Ones init a in layernorm {}'.format(self.a_2.size()))
     self.b_2 = nn.Parameter(tc.zeros(features))
     wlog('*Zeros init b in layernorm {}'.format(self.b_2.size()))
     self.eps = eps
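
For reference, the normalization these two parameters imply is the standard per-feature layer norm; a minimal forward sketch (not copied from the repo):

def forward(self, x):
    mean = x.mean(-1, keepdim=True)
    std = x.std(-1, keepdim=True)
    return self.a_2 * (x - mean) / (std + self.eps) + self.b_2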
Example #6
def Linear(in_features, out_features, bias=True):
    m = nn.Linear(in_features, out_features, bias)
    nn.init.xavier_uniform_(m.weight)
    wlog('*Xavier init linear weight {}'.format(m.weight.size()))
    if bias is True:
        nn.init.constant_(m.bias, 0.)
        wlog('*Zeros init linear bias {}'.format(m.bias.size()))
    return m
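
Example usage (hypothetical sizes): a 512-to-2048 projection whose weight is Xavier-initialized and whose bias starts at zero.

ff_in = Linear(512, 2048, bias=True)
h = ff_in(tc.randn(8, 512))  # h has shape [8, 2048]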
Example #7
File: losser.py | Project: zhang-wen/wenmt
    def __init__(self, output_size, label_smoothing=0.1):

        super(LabelSmoothingCriterion, self).__init__()
        assert 0. < label_smoothing <= 1., 'label smoothing value should be in (0, 1]'
        wlog('NLL loss with label_smoothing: {}'.format(label_smoothing))
        # all non-true labels are uniformly set to low-confidence
        self.smoothing_value = label_smoothing / (output_size - 2)
        one_hot = tc.full((output_size, ), self.smoothing_value)
        one_hot[PAD] = 0.
        self.register_buffer('one_hot', one_hot.unsqueeze(0))
        self.confidence = 1.0 - label_smoothing
        self.label_smoothing = label_smoothing
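
Only the constructor is shown; a hypothetical forward sketch in the standard label-smoothing formulation (F is torch.nn.functional here; argument names are assumptions) would build the smoothed target distribution from the registered one_hot buffer:

def forward(self, log_prob, gold):
    # log_prob: [n_tokens, output_size] log-probabilities; gold: [n_tokens] target ids
    truth = self.one_hot.repeat(gold.size(0), 1)           # smoothing_value everywhere
    truth.scatter_(1, gold.unsqueeze(1), self.confidence)  # put the confidence mass on the gold label
    truth.masked_fill_((gold == PAD).unsqueeze(1), 0.)     # no loss on padding positions
    return F.kl_div(log_prob, truth, reduction='sum')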
Example #8
File: losser.py | Project: zjpbinary/OR-NMT
    def __init__(self,
                 input_size,
                 output_size,
                 trg_word_emb=None,
                 loss_norm='tokens',
                 label_smoothing=0.,
                 emb_loss=False,
                 bow_loss=False):

        super(Classifier, self).__init__()
        if emb_loss is True:
            assert trg_word_emb is not None, 'embedding loss needs target embedding'
            self.trg_word_emb = trg_word_emb.we
            self.euclidean_dist = nn.PairwiseDistance(p=2,
                                                      eps=1e-06,
                                                      keepdim=True)
        self.emb_loss = emb_loss
        if bow_loss is True:
            wlog('using the bag of words loss')
            self.sigmoid = nn.Sigmoid()
            #self.ctx_map_vocab = Linear(2 * input_size, output_size, bias=True)
            #self.softmax = MaskSoftmax()
        self.bow_loss = bow_loss

        self.map_vocab = Linear(input_size, output_size, bias=True)
        nn.init.normal_(self.map_vocab.weight, mean=0, std=input_size**-0.5)
        if wargs.proj_share_weight is True:
            assert input_size == wargs.d_trg_emb
            wlog('copying weights of target word embedding into classifier')
            self.map_vocab.weight = trg_word_emb.we.weight
        self.log_prob = MyLogSoftmax(wargs.self_norm_alpha)

        assert 0. <= label_smoothing <= 1., 'label smoothing value should be in [0, 1]'
        wlog('NLL loss with label_smoothing: {}'.format(label_smoothing))
        if label_smoothing == 0. or self.bow_loss is True:
            weight = tc.ones(output_size, requires_grad=False)
            weight[PAD] = 0  # do not predict padding, same as ignore_index
            self.criterion = nn.NLLLoss(weight,
                                        ignore_index=PAD,
                                        reduction='sum')
            #self.criterion = nn.NLLLoss(weight, ignore_index=PAD, size_average=False)
        if 0. < label_smoothing <= 1.:
            # all non-true labels are uniformly set to low-confidence
            self.smoothing_value = label_smoothing / (output_size - 2)
            one_hot = tc.full((output_size, ), self.smoothing_value)
            one_hot[PAD] = 0.
            self.register_buffer('one_hot', one_hot.unsqueeze(0))
            self.confidence = 1.0 - label_smoothing

        self.output_size = output_size
        self.softmax = MaskSoftmax()
        self.loss_norm = loss_norm
        self.label_smoothing = label_smoothing
Example #9
    def __init__(self,
                 n_vocab,
                 n_embed=512,
                 emb_dropout=0.,
                 position_encoding=False,
                 prefix='WordEmbedding'):

        super(WordEmbedding, self).__init__()
        wlog('WordEmbedding_{}'.format(prefix))
        self.position_encoding = position_encoding
        self.we = nn.Embedding(n_vocab, n_embed, padding_idx=PAD)
        nn.init.normal_(self.we.weight, mean=0, std=n_embed**-0.5)
        nn.init.constant_(self.we.weight[PAD], 0)
        self.n_embed = n_embed
        if position_encoding is True:
            wlog('with position emb ...')
            #self.pe = PositionalEncoding(emb_dropout, n_embed)
            #self.spe = PositionalEmbedding(MAX_SEQ_SIZE, n_embed, PAD, left_pad=False, learned=False)
        wlog('with emb dropout prob = {} ...'.format(emb_dropout))
        self.emb_dropout = emb_dropout
Example #10
    def __init__(self,
                 n_vocab,
                 n_embed=512,
                 emb_dropout=0.,
                 position_encoding=False,
                 prefix='WordEmbedding'):

        super(WordEmbedding, self).__init__()
        wlog('WordEmbedding_{}'.format(prefix))
        self.position_encoding = position_encoding
        self.we = nn.Embedding(n_vocab, n_embed, padding_idx=PAD)
        self.n_vocab = n_vocab
        nn.init.normal_(self.we.weight, mean=0, std=n_embed**-0.5)
        wlog('*Normal init word embedding weight {}'.format(
            self.we.weight.size()))
        nn.init.constant_(self.we.weight[PAD], 0)
        self.n_embed = n_embed
        if position_encoding is True:
            wlog('with position emb ...')
            self.spe = PositionalEncoding(emb_dropout, n_embed)
            self.emb_dropout = emb_dropout
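
A minimal forward sketch for this embedding (assumed, not from the repo): scale by sqrt(n_embed) as is conventional for Transformers, then let the positional-encoding module add positions and apply the embedding dropout.

def forward(self, x):
    emb = self.we(x) * (self.n_embed ** 0.5)  # x: [batch, seq_len] token ids
    if self.position_encoding is True:
        emb = self.spe(emb)                   # adds positions and applies emb dropout
    return emb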
Example #11
def domain_out(model, src_input):
    batch_count = len(src_input)
    point_every, number_every = int(math.ceil(batch_count / 100)), int(
        math.ceil(batch_count / 10))
    total_domain = []
    sent_no, words_cnt = 0, 0

    fd_attent_matrixs, trgs = None, None

    trans_start = time.time()

    for bid in range(batch_count):
        src, srcm = src_input[bid][1], src_input[bid][4]
        domain = model(src, None, srcm, None, 'IN')
        total_domain.append(domain)
        if numpy.mod(sent_no + 1, point_every) == 0: wlog('.', False)
        if numpy.mod(sent_no + 1, number_every) == 0:
            wlog('{}'.format(sent_no + 1), False)

        sent_no += 1

    wlog('Done ...')
    return total_domain
Example #12
from_pretrain_model = True
train_en_de_model = False
rl_loss = False
rl_loss_lambda = 0.1
task = 'wenmt'

# task = 'zh_en_nmt'
# task = 'back_trans_nmt'
# task = 'en_de_nmt'


def getdir():
    return os.path.abspath('.') + '/'


wlog('work dir change:{}'.format(os.chdir(sys.path[0] + '/../')))
wlog('chdir then dir{}'.format(os.getcwd()))
wlog('working on {}'.format(getdir()))
''' directory to save model, validation output and test output '''
work_dir = getdir()
word2vec_weight = work_dir + 'w2v_embedding.pt'
''' reinforce CE '''
use_reinfore_ce = True  # standard ce exp0
''' 1: reinforce CE. 2: reinforce auto-ML CE with a 2|v| vector. 3: small-matrix auto-ML CE.
4: use word2vec embedding for distance. 5: concatenate the embeddings, then map to a scalar '''
reinfore_type = 4 if use_reinfore_ce else 0

from datetime import datetime

TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S/}".format(datetime.now())
illustration = 'training_ende_model_exp6_withsmooth'
Example #13
    def __init__(self, trg_emb,
                 n_layers=6,
                 d_model=512,
                 n_head=8,
                 d_ff_filter=1024,
                 att_dropout=0.3,
                 residual_dropout=0.,
                 relu_dropout=0.,
                 self_attn_type='scaled-dot',
                 proj_share_weight=False,
                 decoder_normalize_before=False):

        wlog('Transformer decoder ========================= ')
        wlog('\ttrg_word_emb:       {}'.format(trg_emb.we.weight.size()))
        wlog('\tn_layers:           {}'.format(n_layers))
        wlog('\tn_head:             {}'.format(n_head))
        wlog('\td_model:            {}'.format(d_model))
        wlog('\td_ffn_filter:       {}'.format(d_ff_filter))
        wlog('\tatt_dropout:        {}'.format(att_dropout))
        wlog('\tresidual_dropout:   {}'.format(residual_dropout))
        wlog('\trelu_dropout:       {}'.format(relu_dropout))
        wlog('\tproj_share_weight:  {}'.format(proj_share_weight))

        super(SelfAttDecoder, self).__init__()

        self.layer_stack = nn.ModuleList([
            SelfAttDecoderLayer(d_model,
                                n_head,
                                d_ff_filter,
                                att_dropout=att_dropout,
                                residual_dropout=residual_dropout,
                                relu_dropout=relu_dropout,
                                self_attn_type=self_attn_type,
                                decoder_normalize_before=decoder_normalize_before)
            for _ in range(n_layers)])

        self.trg_word_emb = trg_emb
        if decoder_normalize_before is True:
            self.layer_norm = nn.LayerNorm(d_model, elementwise_affine=True)
        self.decoder_normalize_before = decoder_normalize_before
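
The constructor above only builds the stack; a hypothetical forward sketch (argument names assumed) embeds the target prefix, runs every SelfAttDecoderLayer against the encoder output, and applies the optional final LayerNorm when normalize-before is enabled:

def forward(self, trg, enc_output, trg_mask=None, src_mask=None):
    x = self.trg_word_emb(trg)                        # target-side embeddings (+ positions)
    for layer in self.layer_stack:
        x = layer(x, enc_output, trg_mask, src_mask)
    if self.decoder_normalize_before is True:
        x = self.layer_norm(x)                        # final norm for pre-norm decoders
    return x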
Example #14
File: main.py | Project: zjpbinary/OR-NMT
def main():

    #if wargs.ss_type is not None: assert wargs.model == 1, 'Only rnnsearch support schedule sample'
    init_dir(wargs.dir_model)
    init_dir(wargs.dir_valid)

    src = os.path.join(wargs.dir_data, '{}.{}'.format(wargs.train_prefix, wargs.train_src_suffix))
    trg = os.path.join(wargs.dir_data, '{}.{}'.format(wargs.train_prefix, wargs.train_trg_suffix))
    vocabs = {}
    wlog('\nPreparing source vocabulary from {} ... '.format(src))
    src_vocab = extract_vocab(src, wargs.src_vcb, wargs.n_src_vcb_plan,
                              wargs.max_seq_len, char=wargs.src_char)
    wlog('\nPreparing target vocabulary from {} ... '.format(trg))
    trg_vocab = extract_vocab(trg, wargs.trg_vcb, wargs.n_trg_vcb_plan, wargs.max_seq_len)
    n_src_vcb, n_trg_vcb = src_vocab.size(), trg_vocab.size()
    wlog('Vocabulary size: |source|={}, |target|={}'.format(n_src_vcb, n_trg_vcb))
    vocabs['src'], vocabs['trg'] = src_vocab, trg_vocab

    wlog('\nPreparing training set from {} and {} ... '.format(src, trg))
    trains = {}
    train_src_tlst, train_trg_tlst = wrap_data(wargs.dir_data, wargs.train_prefix,
                                               wargs.train_src_suffix, wargs.train_trg_suffix,
                                               src_vocab, trg_vocab, shuffle=True,
                                               sort_k_batches=wargs.sort_k_batches,
                                               max_seq_len=wargs.max_seq_len,
                                               char=wargs.src_char)
    '''
    list [torch.LongTensor (sentence), torch.LongTensor, torch.LongTensor, ...]
    no padding
    '''
    batch_train = Input(train_src_tlst, train_trg_tlst, wargs.batch_size,
                        batch_type=wargs.batch_type, bow=wargs.trg_bow, batch_sort=False)
    wlog('Sentence-pairs count in training data: {}'.format(len(train_src_tlst)))

    batch_valid = None
    if wargs.val_prefix is not None:
        val_src_file = os.path.join(wargs.val_tst_dir, '{}.{}'.format(wargs.val_prefix, wargs.val_src_suffix))
        val_trg_file = os.path.join(wargs.val_tst_dir, '{}.{}'.format(wargs.val_prefix, wargs.val_ref_suffix))
        wlog('\nPreparing validation set from {} and {} ... '.format(val_src_file, val_trg_file))
        valid_src_tlst, valid_trg_tlst = wrap_data(wargs.val_tst_dir, wargs.val_prefix,
                                                   wargs.val_src_suffix, wargs.val_ref_suffix,
                                                   src_vocab, trg_vocab, shuffle=False,
                                                   max_seq_len=wargs.dev_max_seq_len,
                                                   char=wargs.src_char)
        batch_valid = Input(valid_src_tlst, valid_trg_tlst, 1, batch_sort=False)

    batch_tests = None
    if wargs.tests_prefix is not None:
        assert isinstance(wargs.tests_prefix, list), 'Test files should be list.'
        init_dir(wargs.dir_tests)
        batch_tests = {}
        for prefix in wargs.tests_prefix:
            init_dir(wargs.dir_tests + '/' + prefix)
            test_file = '{}{}.{}'.format(wargs.val_tst_dir, prefix, wargs.val_src_suffix)
            wlog('\nPreparing test set from {} ... '.format(test_file))
            test_src_tlst, _ = wrap_tst_data(test_file, src_vocab, char=wargs.src_char)
            batch_tests[prefix] = Input(test_src_tlst, None, 1, batch_sort=False)
    wlog('\n## Finish to Prepare Dataset ! ##\n')

    src_emb = WordEmbedding(n_src_vcb, wargs.d_src_emb, wargs.input_dropout,
                            wargs.position_encoding, prefix='Src')
    trg_emb = WordEmbedding(n_trg_vcb, wargs.d_trg_emb, wargs.input_dropout,
                            wargs.position_encoding, prefix='Trg')
    # share the embedding matrix - preprocess with share_vocab required.
    if wargs.embs_share_weight:
        if n_src_vcb != n_trg_vcb:
            raise AssertionError('The `-share_vocab` should be set during '
                                 'preprocess if you use share_embeddings!')
        src_emb.we.weight = trg_emb.we.weight

    nmtModel = build_NMT(src_emb, trg_emb)

    if not wargs.copy_attn:
        classifier = Classifier(wargs.d_model if wargs.decoder_type == 'att' else 2 * wargs.d_enc_hid,
                                n_trg_vcb, trg_emb, loss_norm=wargs.loss_norm,
                                label_smoothing=wargs.label_smoothing,
                                emb_loss=wargs.emb_loss, bow_loss=wargs.bow_loss)
    nmtModel.decoder.classifier = classifier

    if wargs.gpu_id is not None:
        wlog('push model onto GPU {} ... '.format(wargs.gpu_id), 0)
        #nmtModel = nn.DataParallel(nmtModel, device_ids=wargs.gpu_id)
        nmtModel.to(tc.device('cuda'))
    else:
        wlog('push model onto CPU ... ', 0)
        nmtModel.to(tc.device('cpu'))
    wlog('done.')

    if wargs.pre_train is not None:
        assert os.path.exists(wargs.pre_train)
        from tools.utils import load_model
        _dict = load_model(wargs.pre_train)
        # initializing parameters of interactive attention model
        class_dict = None
        if len(_dict) == 5:
            model_dict, class_dict, eid, bid, optim = _dict
        elif len(_dict) == 4:
            model_dict, eid, bid, optim = _dict
        for name, param in nmtModel.named_parameters():
            if name in model_dict:
                param.requires_grad = not wargs.fix_pre_params
                param.data.copy_(model_dict[name])
                wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad, name))
            elif name.endswith('map_vocab.weight'):
                if class_dict is not None:
                    param.requires_grad = not wargs.fix_pre_params
                    param.data.copy_(class_dict['map_vocab.weight'])
                    wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad, name))
            elif name.endswith('map_vocab.bias'):
                if class_dict is not None:
                    param.requires_grad = not wargs.fix_pre_params
                    param.data.copy_(class_dict['map_vocab.bias'])
                    wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad, name))
            else: init_params(param, name, init_D=wargs.param_init_D, a=float(wargs.u_gain))

        wargs.start_epoch = eid + 1

    else:
        optim = Optim(wargs.opt_mode, wargs.learning_rate, wargs.max_grad_norm)
        #for n, p in nmtModel.named_parameters():
            # bias can not be initialized uniformly
            #if wargs.encoder_type != 'att' and wargs.decoder_type != 'att':
            #    init_params(p, n, init_D=wargs.param_init_D, a=float(wargs.u_gain))

    wlog(nmtModel)
    wlog(optim)
    pcnt1 = len([p for p in nmtModel.parameters()])
    pcnt2 = sum([p.nelement() for p in nmtModel.parameters()])
    wlog('parameters number: {}/{}'.format(pcnt1, pcnt2))

    wlog('\n' + '*' * 30 + ' trainable parameters ' + '*' * 30)
    for n, p in nmtModel.named_parameters():
        if p.requires_grad: wlog('{:60} : {}'.format(n, p.size()))

    optim.init_optimizer(nmtModel.parameters())

    trainer = Trainer(nmtModel, batch_train, vocabs, optim, batch_valid, batch_tests)

    trainer.train()
Example #15
File: car.py | Project: karlgeorge/wpynmt
def main():

    # Check if CUDA is available
    if cuda.is_available():
        wlog(
            'CUDA is available, specify device by gpu_id argument (i.e. gpu_id=[3])'
        )
    else:
        wlog('Warning: CUDA is not available, try CPU')

    if wargs.gpu_id:
        cuda.set_device(wargs.gpu_id[0])
        wlog('Using GPU {}'.format(wargs.gpu_id[0]))

    init_dir(wargs.dir_model)
    init_dir(wargs.dir_valid)
    '''
    train_srcD_file = wargs.dir_data + 'train.10k.zh5'
    wlog('\nPreparing source vocabulary from {} ... '.format(train_srcD_file))
    src_vocab = extract_vocab(train_srcD_file, wargs.src_dict, wargs.src_dict_size)

    train_trgD_file = wargs.dir_data + 'train.10k.en5'
    wlog('\nPreparing target vocabulary from {} ... '.format(train_trgD_file))
    trg_vocab = extract_vocab(train_trgD_file, wargs.trg_dict, wargs.trg_dict_size)

    train_src_file = wargs.dir_data + 'train.10k.zh0'
    train_trg_file = wargs.dir_data + 'train.10k.en0'
    wlog('\nPreparing training set from {} and {} ... '.format(train_src_file, train_trg_file))
    train_src_tlst, train_trg_tlst = wrap_data(train_src_file, train_trg_file, src_vocab, trg_vocab)
    #list [torch.LongTensor (sentence), torch.LongTensor, torch.LongTensor, ...], no padding
    wlog('Sentence-pairs count in training data: {}'.format(len(train_src_tlst)))
    src_vocab_size, trg_vocab_size = src_vocab.size(), trg_vocab.size()
    wlog('Vocabulary size: |source|={}, |target|={}'.format(src_vocab_size, trg_vocab_size))
    batch_train = Input(train_src_tlst, train_trg_tlst, wargs.batch_size)
    '''

    src = os.path.join(
        wargs.dir_data, '{}.{}'.format(wargs.train_prefix,
                                       wargs.train_src_suffix))
    trg = os.path.join(
        wargs.dir_data, '{}.{}'.format(wargs.train_prefix,
                                       wargs.train_trg_suffix))
    vocabs = {}
    wlog('\nPreparing source vocabulary from {} ... '.format(src))
    src_vocab = extract_vocab(src, wargs.src_dict, wargs.src_dict_size)
    wlog('\nPreparing target vocabulary from {} ... '.format(trg))
    trg_vocab = extract_vocab(trg, wargs.trg_dict, wargs.trg_dict_size)
    src_vocab_size, trg_vocab_size = src_vocab.size(), trg_vocab.size()
    wlog('Vocabulary size: |source|={}, |target|={}'.format(
        src_vocab_size, trg_vocab_size))
    vocabs['src'], vocabs['trg'] = src_vocab, trg_vocab

    wlog('\nPreparing training set from {} and {} ... '.format(src, trg))
    trains = {}
    train_src_tlst, train_trg_tlst = wrap_data(wargs.dir_data,
                                               wargs.train_prefix,
                                               wargs.train_src_suffix,
                                               wargs.train_trg_suffix,
                                               src_vocab,
                                               trg_vocab,
                                               max_seq_len=wargs.max_seq_len)
    '''
    list [torch.LongTensor (sentence), torch.LongTensor, torch.LongTensor, ...]
    no padding
    '''
    batch_train = Input(train_src_tlst,
                        train_trg_tlst,
                        wargs.batch_size,
                        batch_sort=True)
    wlog('Sentence-pairs count in training data: {}'.format(
        len(train_src_tlst)))

    batch_valid = None
    if wargs.val_prefix is not None:
        val_src_file = '{}{}.{}'.format(wargs.val_tst_dir, wargs.val_prefix,
                                        wargs.val_src_suffix)
        val_trg_file = '{}{}.{}'.format(wargs.val_tst_dir, wargs.val_prefix,
                                        wargs.val_ref_suffix)
        wlog('\nPreparing validation set from {} and {} ... '.format(
            val_src_file, val_trg_file))
        valid_src_tlst, valid_trg_tlst = wrap_data(
            wargs.val_tst_dir,
            wargs.val_prefix,
            wargs.val_src_suffix,
            wargs.val_ref_suffix,
            src_vocab,
            trg_vocab,
            shuffle=False,
            sort_data=False,
            max_seq_len=wargs.dev_max_seq_len)
        batch_valid = Input(valid_src_tlst,
                            valid_trg_tlst,
                            1,
                            volatile=True,
                            batch_sort=False)

    batch_tests = None
    if wargs.tests_prefix is not None:
        assert isinstance(wargs.tests_prefix,
                          list), 'Test files should be list.'
        init_dir(wargs.dir_tests)
        batch_tests = {}
        for prefix in wargs.tests_prefix:
            init_dir(wargs.dir_tests + '/' + prefix)
            test_file = '{}{}.{}'.format(wargs.val_tst_dir, prefix,
                                         wargs.val_src_suffix)
            wlog('\nPreparing test set from {} ... '.format(test_file))
            test_src_tlst, _ = wrap_tst_data(test_file, src_vocab)
            batch_tests[prefix] = Input(test_src_tlst,
                                        None,
                                        1,
                                        volatile=True,
                                        batch_sort=False)
    wlog('\n## Finish to Prepare Dataset ! ##\n')

    nmtModel = NMT(src_vocab_size, trg_vocab_size)
    if wargs.pre_train is not None:

        assert os.path.exists(wargs.pre_train)

        _dict = _load_model(wargs.pre_train)
        # initializing parameters of interactive attention model
        class_dict = None
        if len(_dict) == 4: model_dict, eid, bid, optim = _dict
        elif len(_dict) == 5:
            model_dict, class_dict, eid, bid, optim = _dict
        for name, param in nmtModel.named_parameters():
            if name in model_dict:
                param.requires_grad = not wargs.fix_pre_params
                param.data.copy_(model_dict[name])
                wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad,
                                                  name))
            elif name.endswith('map_vocab.weight'):
                if class_dict is not None:
                    param.requires_grad = not wargs.fix_pre_params
                    param.data.copy_(class_dict['map_vocab.weight'])
                    wlog('{:7} -> grad {}\t{}'.format('Model',
                                                      param.requires_grad,
                                                      name))
            elif name.endswith('map_vocab.bias'):
                if class_dict is not None:
                    param.requires_grad = not wargs.fix_pre_params
                    param.data.copy_(class_dict['map_vocab.bias'])
                    wlog('{:7} -> grad {}\t{}'.format('Model',
                                                      param.requires_grad,
                                                      name))
            else:
                init_params(param, name, True)

        wargs.start_epoch = eid + 1

    else:
        for n, p in nmtModel.named_parameters():
            init_params(p, n, True)
        optim = Optim(wargs.opt_mode,
                      wargs.learning_rate,
                      wargs.max_grad_norm,
                      learning_rate_decay=wargs.learning_rate_decay,
                      start_decay_from=wargs.start_decay_from,
                      last_valid_bleu=wargs.last_valid_bleu)
        optim.init_optimizer(nmtModel.parameters())

    if wargs.gpu_id:
        wlog('Push model onto GPU {} ... '.format(wargs.gpu_id), 0)
        nmtModel.cuda()
    else:
        wlog('Push model onto CPU ... ', 0)
        nmtModel.cpu()

    wlog('done.')
    wlog(nmtModel)
    wlog(optim)
    pcnt1 = len([p for p in nmtModel.parameters()])
    pcnt2 = sum([p.nelement() for p in nmtModel.parameters()])
    wlog('Parameters number: {}/{}'.format(pcnt1, pcnt2))

    trainer = Trainer(nmtModel,
                      src_vocab.idx2key,
                      trg_vocab.idx2key,
                      optim,
                      trg_vocab_size,
                      valid_data=batch_valid,
                      tests_data=batch_tests)

    # add 1000 to train
    train_all_chunks = (train_src_tlst, train_trg_tlst)
    dh = DataHisto(train_all_chunks)
    '''
    dev_src0 = wargs.dir_data + 'dev.1k.zh0'
    dev_trg0 = wargs.dir_data + 'dev.1k.en0'
    wlog('\nPreparing dev set for tuning from {} and {} ... '.format(dev_src0, dev_trg0))
    dev_src0, dev_trg0 = wrap_data(dev_src0, dev_trg0, src_vocab, trg_vocab)
    wlog(len(train_src_tlst))

    dev_src1 = wargs.dir_data + 'dev.1k.zh1'
    dev_trg1 = wargs.dir_data + 'dev.1k.en1'
    wlog('\nPreparing dev set for tuning from {} and {} ... '.format(dev_src1, dev_trg1))
    dev_src1, dev_trg1 = wrap_data(dev_src1, dev_trg1, src_vocab, trg_vocab)

    dev_src2 = wargs.dir_data + 'dev.1k.zh2'
    dev_trg2 = wargs.dir_data + 'dev.1k.en2'
    wlog('\nPreparing dev set for tuning from {} and {} ... '.format(dev_src2, dev_trg2))
    dev_src2, dev_trg2 = wrap_data(dev_src2, dev_trg2, src_vocab, trg_vocab)

    dev_src3 = wargs.dir_data + 'dev.1k.zh3'
    dev_trg3 = wargs.dir_data + 'dev.1k.en3'
    wlog('\nPreparing dev set for tuning from {} and {} ... '.format(dev_src3, dev_trg3))
    dev_src3, dev_trg3 = wrap_data(dev_src3, dev_trg3, src_vocab, trg_vocab)

    dev_src4 = wargs.dir_data + 'dev.1k.zh4'
    dev_trg4 = wargs.dir_data + 'dev.1k.en4'
    wlog('\nPreparing dev set for tuning from {} and {} ... '.format(dev_src4, dev_trg4))
    dev_src4, dev_trg4 = wrap_data(dev_src4, dev_trg4, src_vocab, trg_vocab)
    wlog(len(dev_src4+dev_src3+dev_src2+dev_src1+dev_src0))
    batch_dev = Input(dev_src4+dev_src3+dev_src2+dev_src1+dev_src0, dev_trg4+dev_trg3+dev_trg2+dev_trg1+dev_trg0, wargs.batch_size)
    '''

    batch_dev = None
    assert wargs.dev_prefix is not None, 'Requires development to tuning.'
    dev_src_file = '{}{}.{}'.format(wargs.val_tst_dir, wargs.dev_prefix,
                                    wargs.val_src_suffix)
    dev_trg_file = '{}{}.{}'.format(wargs.val_tst_dir, wargs.dev_prefix,
                                    wargs.val_ref_suffix)
    wlog('\nPreparing dev set from {} and {} ... '.format(
        dev_src_file, dev_trg_file))
    valid_src_tlst, valid_trg_tlst = wrap_data(
        wargs.val_tst_dir,
        wargs.dev_prefix,
        wargs.val_src_suffix,
        wargs.val_ref_suffix,
        src_vocab,
        trg_vocab,
        shuffle=True,
        sort_data=True,
        max_seq_len=wargs.dev_max_seq_len)
    batch_dev = Input(valid_src_tlst,
                      valid_trg_tlst,
                      wargs.batch_size,
                      batch_sort=True)

    trainer.train(dh, batch_dev, 0, merge=True, name='DH_{}'.format('dev'))
    '''
Example #16
def main():

    init_dir(wargs.dir_model)
    init_dir(wargs.dir_valid)

    vocab_data = {}
    train_srcD_file = wargs.src_vocab_from
    wlog('\nPreparing source vocabulary from {} ... '.format(train_srcD_file))
    src_vocab = extract_vocab(train_srcD_file, wargs.src_dict,
                              wargs.src_dict_size)
    vocab_data['src'] = src_vocab

    train_trgD_file = wargs.trg_vocab_from
    wlog('\nPreparing target vocabulary from {} ... '.format(train_trgD_file))
    trg_vocab = extract_vocab(train_trgD_file, wargs.trg_dict,
                              wargs.trg_dict_size)
    vocab_data['trg'] = trg_vocab

    train_src_file = wargs.train_src
    train_trg_file = wargs.train_trg
    wlog('\nPreparing training set from {} and {} ... '.format(
        train_src_file, train_trg_file))
    train_src_tlst, train_trg_tlst = wrap_data(train_src_file,
                                               train_trg_file,
                                               src_vocab,
                                               trg_vocab,
                                               max_seq_len=wargs.max_seq_len)
    '''
    list [torch.LongTensor (sentence), torch.LongTensor, torch.LongTensor, ...]
    no padding
    '''
    '''
    devs = {}
    dev_src = wargs.val_tst_dir + wargs.val_prefix + '.src'
    dev_trg = wargs.val_tst_dir + wargs.val_prefix + '.ref0'
    wlog('\nPreparing dev set for tuning from {} and {} ... '.format(dev_src, dev_trg))
    dev_src, dev_trg = wrap_data(dev_src, dev_trg, src_vocab, trg_vocab)
    devs['src'], devs['trg'] = dev_src, dev_trg
    '''

    valid_file = '{}{}.{}'.format(wargs.val_tst_dir, wargs.val_prefix,
                                  wargs.val_src_suffix)
    wlog('\nPreparing validation set from {} ... '.format(valid_file))
    valid_src_tlst, valid_src_lens = val_wrap_data(valid_file, src_vocab)

    wlog('Sentence-pairs count in training data: {}'.format(
        len(train_src_tlst)))
    src_vocab_size, trg_vocab_size = vocab_data['src'].size(
    ), vocab_data['trg'].size()
    wlog('Vocabulary size: |source|={}, |target|={}'.format(
        src_vocab_size, trg_vocab_size))

    batch_train = Input(train_src_tlst, train_trg_tlst, wargs.batch_size)
    batch_valid = Input(valid_src_tlst, None, 1, volatile=True)

    tests_data = None
    if wargs.tests_prefix is not None:
        init_dir(wargs.dir_tests)
        tests_data = {}
        for prefix in wargs.tests_prefix:
            init_dir(wargs.dir_tests + '/' + prefix)
            test_file = '{}{}.{}'.format(wargs.val_tst_dir, prefix,
                                         wargs.val_src_suffix)
            wlog('Preparing test set from {} ... '.format(test_file))
            test_src_tlst, _ = val_wrap_data(test_file, src_vocab)
            tests_data[prefix] = Input(test_src_tlst, None, 1, volatile=True)
    '''
    # lookup_table on cpu to save memory
    src_lookup_table = nn.Embedding(wargs.src_dict_size + 4,
                                    wargs.src_wemb_size, padding_idx=utils.PAD).cpu()
    trg_lookup_table = nn.Embedding(wargs.trg_dict_size + 4,
                                    wargs.trg_wemb_size, padding_idx=utils.PAD).cpu()

    wlog('Lookup table on CPU ... ')
    wlog(src_lookup_table)
    wlog(trg_lookup_table)
    '''

    sv = vocab_data['src'].idx2key
    tv = vocab_data['trg'].idx2key

    nmtModel = NMT(src_vocab_size, trg_vocab_size)
    #classifier = Classifier(wargs.out_size, trg_vocab_size,
    #                        nmtModel.decoder.trg_lookup_table if wargs.copy_trg_emb is True else None)

    if wargs.pre_train:

        assert os.path.exists(wargs.pre_train)
        _dict = _load_model(wargs.pre_train)
        # initializing parameters of interactive attention model
        class_dict = None
        if len(_dict) == 4: model_dict, eid, bid, optim = _dict
        elif len(_dict) == 5:
            model_dict, class_dict, eid, bid, optim = _dict
        for name, param in nmtModel.named_parameters():
            if name in model_dict:
                param.requires_grad = not wargs.fix_pre_params
                param.data.copy_(model_dict[name])
                wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad,
                                                  name))
            elif name.endswith('map_vocab.weight'):
                if class_dict is not None:
                    param.requires_grad = not wargs.fix_pre_params
                    param.data.copy_(class_dict['map_vocab.weight'])
                    wlog('{:7} -> grad {}\t{}'.format('Model',
                                                      param.requires_grad,
                                                      name))
            elif name.endswith('map_vocab.bias'):
                if class_dict is not None:
                    param.requires_grad = not wargs.fix_pre_params
                    param.data.copy_(class_dict['map_vocab.bias'])
                    wlog('{:7} -> grad {}\t{}'.format('Model',
                                                      param.requires_grad,
                                                      name))
            else:
                init_params(param, name, True)

        wargs.start_epoch = eid + 1

        #tor = Translator(nmtModel, sv, tv)
        #tor.trans_tests(tests_data, eid, bid)

    else:
        for n, p in nmtModel.named_parameters():
            init_params(p, n, True)
        #for n, p in classifier.named_parameters(): init_params(p, n, True)
        optim = Optim(wargs.opt_mode,
                      wargs.learning_rate,
                      wargs.max_grad_norm,
                      learning_rate_decay=wargs.learning_rate_decay,
                      start_decay_from=wargs.start_decay_from,
                      last_valid_bleu=wargs.last_valid_bleu)

    if wargs.gpu_id:
        nmtModel.cuda()
        #classifier.cuda()
        wlog('Push model onto GPU[{}] ... '.format(wargs.gpu_id[0]))
    else:
        nmtModel.cpu()
        #classifier.cpu()
        wlog('Push model onto CPU ... ')

    #nmtModel.classifier = classifier
    #nmtModel.decoder.map_vocab = classifier.map_vocab
    '''
    nmtModel.src_lookup_table = src_lookup_table
    nmtModel.trg_lookup_table = trg_lookup_table
    print nmtModel.src_lookup_table.weight.data.is_cuda

    nmtModel.classifier.init_weights(nmtModel.trg_lookup_table)
    '''

    wlog(nmtModel)
    wlog(optim)
    pcnt1 = len([p for p in nmtModel.parameters()])
    pcnt2 = sum([p.nelement() for p in nmtModel.parameters()])
    wlog('Parameters number: {}/{}'.format(pcnt1, pcnt2))

    optim.init_optimizer(nmtModel.parameters())

    #tor = Translator(nmtModel, sv, tv, wargs.search_mode)
    #tor.trans_tests(tests_data, pre_dict['epoch'], pre_dict['batch'])

    trainer = Trainer(nmtModel, batch_train, vocab_data, optim, batch_valid,
                      tests_data)

    trainer.train()
Example #17
File: main.py | Project: zjpbinary/OR-NMT
import torch as tc
from torch import cuda

import wargs
from tools.inputs_handler import *
from tools.inputs import Input
from tools.optimizer import Optim
from models.losser import Classifier
from models.embedding import WordEmbedding
from models.model_builder import build_NMT
from tools.utils import init_dir, wlog

# Check if CUDA is available
if cuda.is_available():
    wlog('CUDA is available, specify device by gpu_id argument (i.e. gpu_id=[0, 1, 2])')
else:
    wlog('Warning: CUDA is not available, train on CPU')

if wargs.gpu_id is not None:
    #cuda.set_device(wargs.gpu_id[0])
    device = tc.device('cuda:{}'.format(wargs.gpu_id[0]) if cuda.is_available() else 'cpu')
    wlog('Set device {}, will use {} GPUs {}'.format(
        wargs.gpu_id[0], len(wargs.gpu_id), wargs.gpu_id))

from trainer import *

import torch.backends.cudnn as cudnn
cudnn.benchmark = True
cudnn.enabled = True
Example #18
File: _main.py | Project: gushu333/DA4NMT
import torch as tc
from torch import cuda
import math

import wargs
from tools.inputs import Input
from tools.utils import init_dir, wlog, _load_model
from tools.optimizer import Optim
from inputs_handler import *

# Check if CUDA is available
if cuda.is_available():
    wlog(
        'CUDA is available, specify device by gpu_id argument (i.e. gpu_id=[3])'
    )
else:
    wlog('Warning: CUDA is not available, train on CPU')

if wargs.gpu_id:
    cuda.set_device(wargs.gpu_id[0])
    wlog('Using GPU {}'.format(wargs.gpu_id[0]))

from models.rnnsearch import *
from models.losser import *

from trainer import *
from translate import Translator

import torch.backends.cudnn as cudnn
cudnn.benchmark = True
cudnn.enabled = True
Example #19
from models.losser import *
#from tools.tsne import *

def encoder_state(model, src_input):
    batch_count = len(src_input)
    point_every, number_every = int(math.ceil(batch_count / 100)), int(math.ceil(batch_count / 10))
    total_state = []
    sent_no, words_cnt = 0, 0

    fd_attent_matrixs, trgs = None, None

    for bid in range(batch_count):
        src = src_input[bid][1]
        state, _, _ = model.init(src, "common")
        total_state.append(state.data)
        if numpy.mod(sent_no + 1, point_every) == 0: wlog('.', False)
        if numpy.mod(sent_no + 1, number_every) == 0: wlog('{}'.format(sent_no + 1), False)

        sent_no += 1

    wlog('Done ...')

    return total_state



if __name__ == "__main__":

    A = argparse.ArgumentParser(prog='NMT translator ... ')
    A.add_argument('--model-file', dest='model_file', help='model file')
Example #20
    Word2VecDistanceCriterion, HighDimDistance, CosineDistance
from models.embedding import WordEmbedding
from models.model_builder import build_NMT
from tools.utils import init_dir, wlog

device_ids = wargs.gpu_ids

writer = None
if wargs.use_tensorboard:
    writer = SummaryWriter(wargs.tensorboard_dir)

if device_ids is not None:
    device = tc.device(
        'cuda:{}'.format(device_ids[0]) if cuda.is_available() else 'cpu')
    wlog('Set device {}, will use {} GPUs {}'.format(device_ids[0],
                                                     len(device_ids),
                                                     device_ids))

# Check if CUDA is available
if cuda.is_available():
    wlog(
        'CUDA is available, specify device by gpu_ids argument (i.e. gpu_ids=[0, 1, 2])'
    )
else:
    if len(device_ids) > 1:
        wlog('Can not train on multi-gpus, device count: {}'.format(
            cuda.device_count()))
        sys.exit(0)
    else:
        wlog('Warning: CUDA is not available, train on CPU')
Example #21
def main():
    # if wargs.ss_type is not None: assert wargs.model == 1, 'Only rnnsearch support schedule sample'
    init_dir(wargs.dir_model)
    init_dir(wargs.dir_valid)

    src = os.path.join(
        wargs.dir_data, '{}.{}'.format(wargs.train_prefix,
                                       wargs.train_src_suffix))
    trg = os.path.join(
        wargs.dir_data, '{}.{}'.format(wargs.train_prefix,
                                       wargs.train_trg_suffix))
    src, trg = os.path.abspath(src), os.path.abspath(trg)
    vocabs = {}
    if wargs.share_vocab is False:
        wlog('\nPreparing source vocabulary from {} ... '.format(src))
        src_vocab = extract_vocab(src,
                                  wargs.src_vcb,
                                  wargs.n_src_vcb_plan,
                                  wargs.max_seq_len,
                                  char=wargs.src_char)
        wlog('\nPreparing target vocabulary from {} ... '.format(trg))
        trg_vocab = extract_vocab(trg, wargs.trg_vcb, wargs.n_trg_vcb_plan,
                                  wargs.max_seq_len)
        n_src_vcb, n_trg_vcb = src_vocab.size(), trg_vocab.size()
        wlog('Vocabulary size: |source|={}, |target|={}'.format(
            n_src_vcb, n_trg_vcb))
    else:
        wlog('\nPreparing the shared vocabulary from \n\t{}\n\t{}'.format(
            src, trg))
        trg_vocab = src_vocab = extract_vocab(src,
                                              wargs.src_vcb,
                                              wargs.n_src_vcb_plan,
                                              wargs.max_seq_len,
                                              share_vocab=True,
                                              trg_file=trg)
        n_src_vcb, n_trg_vcb = src_vocab.size(), trg_vocab.size()
        wlog('Shared vocabulary size: |vocab|={}'.format(src_vocab.size()))

    vocabs['src'], vocabs['trg'] = src_vocab, trg_vocab

    wlog('\nPreparing training set from {} and {} ... '.format(src, trg))
    trains = {}
    train_src_tlst, train_trg_tlst = wrap_data(
        wargs.dir_data,
        wargs.train_prefix,
        wargs.train_src_suffix,
        wargs.train_trg_suffix,
        src_vocab,
        trg_vocab,
        shuffle=True,
        sort_k_batches=wargs.sort_k_batches,
        max_seq_len=wargs.max_seq_len,
        char=wargs.src_char)
    '''
    list [torch.LongTensor (sentence), torch.LongTensor, torch.LongTensor, ...]
    no padding
    '''
    batch_train = Input(train_src_tlst,
                        train_trg_tlst,
                        wargs.batch_size,
                        batch_type=wargs.batch_type,
                        bow=wargs.trg_bow,
                        batch_sort=False,
                        gpu_ids=device_ids)
    wlog('Sentence-pairs count in training data: {}'.format(
        len(train_src_tlst)))

    batch_valid = None
    if wargs.val_prefix is not None:
        val_src_file = os.path.join(
            wargs.val_tst_dir, '{}.{}'.format(wargs.val_prefix,
                                              wargs.val_src_suffix))
        val_trg_file = os.path.join(
            wargs.val_tst_dir, '{}.{}'.format(wargs.val_prefix,
                                              wargs.val_ref_suffix))
        val_src_file, val_trg_file = os.path.abspath(
            val_src_file), os.path.abspath(val_trg_file)
        wlog('\nPreparing validation set from {} and {} ... '.format(
            val_src_file, val_trg_file))
        valid_src_tlst, valid_trg_tlst = wrap_data(
            wargs.val_tst_dir,
            wargs.val_prefix,
            wargs.val_src_suffix,
            wargs.val_ref_suffix,
            src_vocab,
            trg_vocab,
            shuffle=False,
            max_seq_len=wargs.dev_max_seq_len,
            char=wargs.src_char)
        batch_valid = Input(valid_src_tlst,
                            valid_trg_tlst,
                            batch_size=wargs.valid_batch_size,
                            batch_sort=False,
                            gpu_ids=device_ids)

    batch_tests = None
    if wargs.tests_prefix is not None:
        assert isinstance(wargs.tests_prefix,
                          list), 'Test files should be list.'
        init_dir(wargs.dir_tests)
        batch_tests = {}
        for prefix in wargs.tests_prefix:
            init_dir(wargs.dir_tests + '/' + prefix)
            test_file = '{}{}.{}'.format(wargs.val_tst_dir, prefix,
                                         wargs.val_src_suffix)
            test_file = os.path.abspath(test_file)
            wlog('\nPreparing test set from {} ... '.format(test_file))
            test_src_tlst, _ = wrap_tst_data(test_file,
                                             src_vocab,
                                             char=wargs.src_char)
            batch_tests[prefix] = Input(test_src_tlst,
                                        None,
                                        batch_size=wargs.test_batch_size,
                                        batch_sort=False,
                                        gpu_ids=device_ids)
    wlog('\n## Finish to Prepare Dataset ! ##\n')

    src_emb = WordEmbedding(n_src_vcb,
                            wargs.d_src_emb,
                            wargs.input_dropout,
                            wargs.position_encoding,
                            prefix='Src')
    trg_emb = WordEmbedding(n_trg_vcb,
                            wargs.d_trg_emb,
                            wargs.input_dropout,
                            wargs.position_encoding,
                            prefix='Trg')
    # share the embedding matrix between the source and target
    if wargs.share_vocab is True: src_emb.we.weight = trg_emb.we.weight

    nmtModel = build_NMT(src_emb, trg_emb)

    if device_ids is not None:
        wlog('push model onto GPU {} ... '.format(device_ids[0]), 0)
        nmtModel_par = nn.DataParallel(nmtModel, device_ids=device_ids)
        nmtModel_par.to(device)
    else:
        wlog('push model onto CPU ... ', 0)
        nmtModel.to(tc.device('cpu'))
    wlog('done.')

    if wargs.pre_train is not None:
        wlog(wargs.pre_train)
        assert os.path.exists(wargs.pre_train)
        from tools.utils import load_model
        _dict = load_model(wargs.pre_train)
        # initializing parameters of interactive attention model
        class_dict = None
        if len(_dict) == 5:
            # model_dict, e_idx, e_bidx, n_steps, optim = _dict['model'], _dict['epoch'], _dict['batch'], _dict['steps'], _dict['optim']
            model_dict, e_idx, e_bidx, n_steps, optim = _dict
        elif len(_dict) == 4:
            model_dict, e_idx, e_bidx, optim = _dict
        for name, param in nmtModel.named_parameters():
            if name in model_dict:
                param.requires_grad = not wargs.fix_pre_params
                param.data.copy_(model_dict[name])
                # wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad, name))
            elif name.endswith('map_vocab.weight'):
                if class_dict is not None:
                    param.requires_grad = not wargs.fix_pre_params
                    param.data.copy_(class_dict['map_vocab.weight'])
                    # wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad, name))
            elif name.endswith('map_vocab.bias'):
                if class_dict is not None:
                    param.requires_grad = not wargs.fix_pre_params
                    param.data.copy_(class_dict['map_vocab.bias'])
                    # wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad, name))
            else:
                init_params(param,
                            name,
                            init_D=wargs.param_init_D,
                            a=float(wargs.u_gain))

        # wargs.start_epoch = e_idx + 1
        # # do not restart the step counter
        # optim.n_current_steps = 0

    else:
        optim = Optim(wargs.opt_mode, wargs.learning_rate, wargs.max_grad_norm)
        for n, p in nmtModel.named_parameters():
            # bias can not be initialized uniformly
            if 'norm' in n:
                wlog('ignore layer norm init ...')
                continue
            if 'emb' in n:
                wlog('ignore word embedding weight init ...')
                continue
            if 'vcb_proj' in n:
                wlog('ignore vcb_proj weight init ...')
                continue
            init_params(p, n, init_D=wargs.param_init_D, a=float(wargs.u_gain))
            # if wargs.encoder_type != 'att' and wargs.decoder_type != 'att':
            #    init_params(p, n, init_D=wargs.param_init_D, a=float(wargs.u_gain))

    # wlog(nmtModel)
    wlog(optim)
    pcnt1 = len([p for p in nmtModel.parameters()])
    pcnt2 = sum([p.nelement() for p in nmtModel.parameters()])
    wlog('parameters number: {}/{}'.format(pcnt1, pcnt2))

    # wlog('\n' + '*' * 30 + ' trainable parameters ' + '*' * 30)
    # for n, p in nmtModel.named_parameters():
    #     if p.requires_grad: wlog('{:60} : {}'.format(n, p.size()))
    opt_state = None
    if wargs.pre_train:
        opt_state = optim.optimizer.state_dict()

    if wargs.use_reinfore_ce is False:
        criterion = LabelSmoothingCriterion(
            trg_emb.n_vocab, label_smoothing=wargs.label_smoothing)
    else:
        word2vec = tc.load(wargs.word2vec_weight)['w2v']
        # criterion = Word2VecDistanceCriterion(word2vec)
        criterion = CosineDistance(word2vec)

    if device_ids is not None:
        wlog('push criterion onto GPU {} ... '.format(device_ids[0]), 0)
        criterion = criterion.to(device)
        wlog('done.')
    # if wargs.reinfore_type == 0 or wargs.reinfore_type == 1:
    #     param = list(nmtModel.parameters())
    # else:
    #     param = list(nmtModel.parameters()) + list(criterion.parameters())
    param = list(nmtModel.parameters())
    optim.init_optimizer(param)

    lossCompute = MultiGPULossCompute(
        nmtModel.generator,
        criterion,
        wargs.d_model if wargs.decoder_type == 'att' else 2 * wargs.d_enc_hid,
        n_trg_vcb,
        trg_emb,
        nmtModel.bowMapper,
        loss_norm=wargs.loss_norm,
        chunk_size=wargs.chunk_size,
        device_ids=device_ids)

    trainer = Trainer(nmtModel_par, batch_train, vocabs, optim, lossCompute,
                      nmtModel, batch_valid, batch_tests, writer)

    trainer.train()
    writer.close()
Example #22
 def __init__(self, trg_emb=None):
     super(EMBLossCriterion, self).__init__()
     wlog('using the embedding-based loss')
     assert trg_emb is not None, 'embedding loss needs target embedding'
     self.trg_word_emb = trg_emb.we
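
A hypothetical forward sketch for this embedding-based loss (not taken from the repo): compare the expected target embedding under the model distribution with the gold token's embedding.

def forward(self, prob, gold, gold_mask):
    # prob: [n_tokens, n_vocab] model distribution; gold: [n_tokens] ids; gold_mask: [n_tokens] 0/1
    expected = prob @ self.trg_word_emb.weight          # expected embedding, [n_tokens, n_embed]
    gold_emb = self.trg_word_emb(gold)                  # gold embedding,     [n_tokens, n_embed]
    dist = ((expected - gold_emb) ** 2).sum(-1).sqrt()  # per-token Euclidean distance
    return (dist * gold_mask).sum()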
Example #23
from tools.utils import *
from tools.utils import init_dir, wlog, _load_model
from translate import Translator
from inputs_handler import extract_vocab, val_wrap_data, wrap_data
from models.losser import *

if __name__ == "__main__":

    A = argparse.ArgumentParser(prog='NMT translator ... ')
    A.add_argument('--model-file', dest='model_file', help='model file')

    A.add_argument('--test-file', dest='test_file', default=None,
                   help='the input test file path we will translate')
    args = A.parse_args()
    model_file = args.model_file
    wlog('Using model: {}'.format(model_file))
    from models.rnnsearch import *

    src_vocab = extract_vocab(None, wargs.src_dict)
    trg_vocab = extract_vocab(None, wargs.trg_dict)

    src_vocab_size, trg_vocab_size = src_vocab.size(), trg_vocab.size()
    wlog('Vocabulary size: |source|={}, |target|={}'.format(src_vocab_size, trg_vocab_size))

    wlog('Start decoding ... init model ... ', 0)

    nmtModel = NMT(src_vocab_size, trg_vocab_size)
    if wargs.gpu_id:
        cuda.set_device(wargs.gpu_id[0])
        nmtModel.cuda()
        wlog('Push model onto GPU[{}] ... '.format(wargs.gpu_id[0]))
Example #24
def main():

    #if wargs.ss_type is not None: assert wargs.model == 1, 'Only rnnsearch support schedule sample'
    init_dir(wargs.dir_model)
    init_dir(wargs.dir_valid)

    src = os.path.join(
        wargs.dir_data, '{}.{}'.format(wargs.train_prefix,
                                       wargs.train_src_suffix))
    trg = os.path.join(
        wargs.dir_data, '{}.{}'.format(wargs.train_prefix,
                                       wargs.train_trg_suffix))
    vocabs = {}
    wlog('\n[o/Subword] Preparing source vocabulary from {} ... '.format(src))
    src_vocab = extract_vocab(src,
                              wargs.src_dict,
                              wargs.src_dict_size,
                              wargs.max_seq_len,
                              char=wargs.src_char)
    wlog('\n[o/Subword] Preparing target vocabulary from {} ... '.format(trg))
    trg_vocab = extract_vocab(trg, wargs.trg_dict, wargs.trg_dict_size,
                              wargs.max_seq_len)
    src_vocab_size, trg_vocab_size = src_vocab.size(), trg_vocab.size()
    wlog('Vocabulary size: |source|={}, |target|={}'.format(
        src_vocab_size, trg_vocab_size))
    vocabs['src'], vocabs['trg'] = src_vocab, trg_vocab

    wlog('\nPreparing training set from {} and {} ... '.format(src, trg))
    trains = {}
    train_src_tlst, train_trg_tlst = wrap_data(wargs.dir_data,
                                               wargs.train_prefix,
                                               wargs.train_src_suffix,
                                               wargs.train_trg_suffix,
                                               src_vocab,
                                               trg_vocab,
                                               max_seq_len=wargs.max_seq_len,
                                               char=wargs.src_char)
    '''
    list [torch.LongTensor (sentence), torch.LongTensor, torch.LongTensor, ...]
    no padding
    '''
    batch_train = Input(train_src_tlst,
                        train_trg_tlst,
                        wargs.batch_size,
                        batch_sort=True)
    wlog('Sentence-pairs count in training data: {}'.format(
        len(train_src_tlst)))

    batch_valid = None
    if wargs.val_prefix is not None:
        val_src_file = '{}{}.{}'.format(wargs.val_tst_dir, wargs.val_prefix,
                                        wargs.val_src_suffix)
        val_trg_file = '{}{}.{}'.format(wargs.val_tst_dir, wargs.val_prefix,
                                        wargs.val_ref_suffix)
        wlog('\nPreparing validation set from {} and {} ... '.format(
            val_src_file, val_trg_file))
        valid_src_tlst, valid_trg_tlst = wrap_data(
            wargs.val_tst_dir,
            wargs.val_prefix,
            wargs.val_src_suffix,
            wargs.val_ref_suffix,
            src_vocab,
            trg_vocab,
            shuffle=False,
            sort_data=False,
            max_seq_len=wargs.dev_max_seq_len,
            char=wargs.src_char)
        batch_valid = Input(valid_src_tlst,
                            valid_trg_tlst,
                            1,
                            volatile=True,
                            batch_sort=False)

    batch_tests = None
    if wargs.tests_prefix is not None:
        assert isinstance(wargs.tests_prefix,
                          list), 'Test files should be a list.'
        init_dir(wargs.dir_tests)
        batch_tests = {}
        for prefix in wargs.tests_prefix:
            init_dir(wargs.dir_tests + '/' + prefix)
            test_file = '{}{}.{}'.format(wargs.val_tst_dir, prefix,
                                         wargs.val_src_suffix)
            wlog('\nPreparing test set from {} ... '.format(test_file))
            test_src_tlst, _ = wrap_tst_data(test_file,
                                             src_vocab,
                                             char=wargs.src_char)
            batch_tests[prefix] = Input(test_src_tlst,
                                        None,
                                        1,
                                        volatile=True,
                                        batch_sort=False)
    wlog('\n## Finished preparing dataset ##\n')

    nmtModel = NMT(src_vocab_size, trg_vocab_size)

    if wargs.pre_train is not None:

        assert os.path.exists(wargs.pre_train)

        _dict = _load_model(wargs.pre_train)
        # initializing parameters of interactive attention model
        class_dict = None
        if len(_dict) == 4: model_dict, eid, bid, optim = _dict
        elif len(_dict) == 5:
            model_dict, class_dict, eid, bid, optim = _dict
        for name, param in nmtModel.named_parameters():
            if name in model_dict:
                param.requires_grad = not wargs.fix_pre_params
                param.data.copy_(model_dict[name])
                wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad,
                                                  name))
            elif name.endswith('map_vocab.weight'):
                if class_dict is not None:
                    param.requires_grad = not wargs.fix_pre_params
                    param.data.copy_(class_dict['map_vocab.weight'])
                    wlog('{:7} -> grad {}\t{}'.format('Model',
                                                      param.requires_grad,
                                                      name))
            elif name.endswith('map_vocab.bias'):
                if class_dict is not None:
                    param.requires_grad = not wargs.fix_pre_params
                    param.data.copy_(class_dict['map_vocab.bias'])
                    wlog('{:7} -> grad {}\t{}'.format('Model',
                                                      param.requires_grad,
                                                      name))
            else:
                init_params(param, name, True)

        wargs.start_epoch = eid + 1

    else:
        for n, p in nmtModel.named_parameters():
            init_params(p, n, True)
        optim = Optim(wargs.opt_mode,
                      wargs.learning_rate,
                      wargs.max_grad_norm,
                      learning_rate_decay=wargs.learning_rate_decay,
                      start_decay_from=wargs.start_decay_from,
                      last_valid_bleu=wargs.last_valid_bleu,
                      model=wargs.model)

    if wargs.gpu_id is not None:
        wlog('Push model onto GPU {} ... '.format(wargs.gpu_id), 0)
        nmtModel.cuda()
    else:
        wlog('Push model onto CPU ... ', 0)
        nmtModel.cpu()

    wlog('done.')

    wlog(nmtModel)
    wlog(optim)
    pcnt1 = len([p for p in nmtModel.parameters()])
    pcnt2 = sum([p.nelement() for p in nmtModel.parameters()])
    wlog('Parameters number: {}/{}'.format(pcnt1, pcnt2))

    optim.init_optimizer(nmtModel.parameters())

    trainer = Trainer(nmtModel, batch_train, vocabs, optim, batch_valid,
                      batch_tests)

    trainer.train()
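For reference, below is a minimal sketch of the kind of wargs configuration namespace this main() reads from. Only attributes referenced in the example are listed, and every value is a made-up placeholder rather than the project's actual defaults.

from types import SimpleNamespace

# Illustrative stand-in for the project's wargs module; values are placeholders.
wargs_sketch = SimpleNamespace(
    dir_model='wmodel', dir_valid='wvalid', dir_tests='wtests',
    dir_data='data/', val_tst_dir='data/devtest/',
    train_prefix='train', train_src_suffix='src', train_trg_suffix='trg',
    val_prefix='dev', val_src_suffix='src', val_ref_suffix='ref',
    tests_prefix=['test1'],
    src_dict='data/src.dict', trg_dict='data/trg.dict',
    src_dict_size=30000, trg_dict_size=30000,
    max_seq_len=100, dev_max_seq_len=100, src_char=False,
    batch_size=80, gpu_id=None, pre_train=None, fix_pre_params=False,
    opt_mode='adam', learning_rate=0.001, max_grad_norm=5.0,
    learning_rate_decay=0.5, start_decay_from=None, last_valid_bleu=0.0,
    model=1, start_epoch=1,
)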
Example #25
    A.add_argument('--beam-size', dest='beam_size', default=wargs.beam_size, help='beamsize')
    A.add_argument('--len-norm', dest='len_norm', type=int, default=1,
                   help='During searching, whether we normalize accumulated loss by length.')

    '''

    args = A.parse_args()
    model_file = args.model_file
    '''
    search_mode = args.search_mode
    beam_size = args.beam_size
    lenNorm = args.len_norm
    '''

    if wargs.share_vocab is False:
        wlog('Start loading both vocabularies ... ')
        assert os.path.exists(wargs.src_vcb) and os.path.exists(wargs.trg_vcb), 'need vocabulary ...'
        src_vocab = extract_vocab(None, wargs.src_vcb)
        trg_vocab = extract_vocab(None, wargs.trg_vcb)
    else:
        wlog('Start loading the shared vocabulary ... ')
        assert os.path.exists(wargs.src_vcb), 'need shared vocabulary ...'
        trg_vocab = src_vocab = extract_vocab(None, wargs.src_vcb)
    n_src_vcb, n_trg_vcb = src_vocab.size(), trg_vocab.size()
    wlog('Vocabulary size: |source|={}, |target|={}'.format(n_src_vcb, n_trg_vcb))

    model_dict, e_idx, e_bidx, n_steps, optim = load_model(model_file)
    from models.embedding import WordEmbedding
    src_emb = WordEmbedding(n_src_vcb, wargs.d_src_emb,
                            position_encoding=wargs.position_encoding, prefix='Src')
    trg_emb = WordEmbedding(n_trg_vcb, wargs.d_trg_emb,
Example #26
File: _main.py  Project: gushu333/DA4NMT
def main():

    init_dir(wargs.dir_model)
    init_dir(wargs.dir_valid)

    vocab_data = {}
    train_srcD_file = wargs.src_vocab_from
    wlog('\nPreparing out of domain source vocabulary from {} ... '.format(
        train_srcD_file))
    src_vocab = extract_vocab(train_srcD_file, wargs.src_dict,
                              wargs.src_dict_size)
    #DANN
    train_srcD_file_domain = wargs.src_domain_vocab_from
    wlog('\nPreparing in domain source vocabulary from {} ...'.format(
        train_srcD_file_domain))
    src_vocab = updata_vocab(train_srcD_file_domain, src_vocab, wargs.src_dict,
                             wargs.src_dict_size)

    vocab_data['src'] = src_vocab

    train_trgD_file = wargs.trg_vocab_from
    wlog('\nPreparing out of domain target vocabulary from {} ... '.format(
        train_trgD_file))
    trg_vocab = extract_vocab(train_trgD_file, wargs.trg_dict,
                              wargs.trg_dict_size)

    #DANN
    train_trgD_file_domain = wargs.trg_domain_vocab_from
    wlog('\nPreparing in domain target vocabulary from {} ... '.format(
        train_trgD_file_domain))
    trg_vocab = updata_vocab(train_trgD_file_domain, trg_vocab, wargs.trg_dict,
                             wargs.trg_dict_size)

    vocab_data['trg'] = trg_vocab

    train_src_file = wargs.train_src
    train_trg_file = wargs.train_trg
    if wargs.fine_tune is False:
        wlog('\nPreparing out of domain training set from {} and {} ... '.
             format(train_src_file, train_trg_file))
        train_src_tlst, train_trg_tlst = wrap_data(
            train_src_file,
            train_trg_file,
            vocab_data['src'],
            vocab_data['trg'],
            max_seq_len=wargs.max_seq_len)
    else:
        wlog('\nNo out-of-domain training set ...')

    #DANN
    train_src_file_domain = wargs.train_src_domain
    train_trg_file_domain = wargs.train_trg_domain
    wlog('\nPreparing in domain training set from {} and {}...'.format(
        train_src_file_domain, train_trg_file_domain))
    train_src_tlst_domain, train_trg_tlst_domain = wrap_data(
        train_src_file_domain,
        train_trg_file_domain,
        vocab_data['src'],
        vocab_data['trg'],
        max_seq_len=wargs.max_seq_len)
    '''
    list [torch.LongTensor (sentence), torch.LongTensor, torch.LongTensor, ...]
    no padding
    '''
    valid_file = '{}{}.{}'.format(wargs.val_tst_dir, wargs.val_prefix,
                                  wargs.val_src_suffix)
    wlog('\nPreparing validation set from {} ... '.format(valid_file))
    valid_src_tlst, valid_src_lens = val_wrap_data(valid_file, src_vocab)

    if wargs.fine_tune is False:
        wlog('Out of domain Sentence-pairs count in training data: {}'.format(
            len(train_src_tlst)))
    wlog('In domain Sentence-pairs count in training data: {}'.format(
        len(train_src_tlst_domain)))

    src_vocab_size, trg_vocab_size = vocab_data['src'].size(), vocab_data['trg'].size()
    wlog('Vocabulary size: |source|={}, |target|={}'.format(
        src_vocab_size, trg_vocab_size))

    if wargs.fine_tune is False:
        batch_train = Input(train_src_tlst, train_trg_tlst, wargs.batch_size)
    else:
        batch_train = None

    batch_valid = Input(valid_src_tlst, None, 1, volatile=True)
    #DANN
    batch_train_domain = Input(train_src_tlst_domain, train_trg_tlst_domain,
                               wargs.batch_size)

    tests_data = None
    if wargs.tests_prefix is not None:
        init_dir(wargs.dir_tests)
        tests_data = {}
        for prefix in wargs.tests_prefix:
            init_dir(wargs.dir_tests + '/' + prefix)
            test_file = '{}{}.{}'.format(wargs.val_tst_dir, prefix,
                                         wargs.val_src_suffix)
            wlog('Preparing test set from {} ... '.format(test_file))
            test_src_tlst, _ = val_wrap_data(test_file, src_vocab)
            tests_data[prefix] = Input(test_src_tlst, None, 1, volatile=True)

    sv = vocab_data['src'].idx2key
    tv = vocab_data['trg'].idx2key

    nmtModel = NMT(src_vocab_size, trg_vocab_size)

    if wargs.pre_train is not None:

        assert os.path.exists(wargs.pre_train), 'Requires pre-trained model'
        _dict = _load_model(wargs.pre_train)
        # initializing parameters of interactive attention model
        class_dict = None
        if len(_dict) == 4: model_dict, eid, bid, optim = _dict
        elif len(_dict) == 5:
            model_dict, class_dict, eid, bid, optim = _dict
        for name, param in nmtModel.named_parameters():
            if name in model_dict:
                param.requires_grad = not wargs.fix_pre_params
                param.data.copy_(model_dict[name])
                wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad,
                                                  name))
            elif name.endswith('map_vocab.weight'):
                if class_dict is not None:
                    param.requires_grad = not wargs.fix_pre_params
                    param.data.copy_(class_dict['map_vocab.weight'])
                    wlog('{:7} -> grad {}\t{}'.format('Model',
                                                      param.requires_grad,
                                                      name))
            elif name.endswith('map_vocab.bias'):
                if class_dict is not None:
                    param.requires_grad = not wargs.fix_pre_params
                    param.data.copy_(class_dict['map_vocab.bias'])
                    wlog('{:7} -> grad {}\t{}'.format('Model',
                                                      param.requires_grad,
                                                      name))
            else:
                init_params(param, name, True)

        wargs.start_epoch = eid + 1
    else:
        for n, p in nmtModel.named_parameters():
            init_params(p, n, True)
        optim = Optim(wargs.opt_mode,
                      wargs.learning_rate,
                      wargs.max_grad_norm,
                      learning_rate_decay=wargs.learning_rate_decay,
                      start_decay_from=wargs.start_decay_from,
                      last_valid_bleu=wargs.last_valid_bleu)

    if wargs.gpu_id:
        nmtModel.cuda()
        wlog('Push model onto GPU[{}] ... '.format(wargs.gpu_id[0]))
    else:
        nmtModel.cpu()
        wlog('Push model onto CPU ... ')

    wlog(nmtModel)
    wlog(optim)
    pcnt1 = len([p for p in nmtModel.parameters()])
    pcnt2 = sum([p.nelement() for p in nmtModel.parameters()])
    wlog('Parameters number: {}/{}'.format(pcnt1, pcnt2))

    optim.init_optimizer(nmtModel.parameters())

    trainer = Trainer(nmtModel, batch_train, batch_train_domain, vocab_data,
                      optim, batch_valid, tests_data)

    trainer.train()
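The in-domain vocabularies above are merged into the out-of-domain ones through updata_vocab. As a rough, self-contained illustration of that idea (not the project's implementation; the dict-based vocabulary and the function names here are invented), extending a size-capped vocabulary with tokens from a second corpus could look like this:

from collections import Counter

def build_vocab(path, max_size):
    # Build a word -> index map from the most frequent tokens of one corpus.
    counts = Counter()
    with open(path, encoding='utf-8') as f:
        for line in f:
            counts.update(line.split())
    vocab = {'<pad>': 0}                      # reserve index 0 for padding
    for tok, _ in counts.most_common(max_size - 1):
        vocab[tok] = len(vocab)
    return vocab

def extend_vocab(vocab, path, max_size):
    # Add frequent in-domain tokens that are not yet in the vocabulary,
    # without exceeding the overall size cap.
    counts = Counter()
    with open(path, encoding='utf-8') as f:
        for line in f:
            counts.update(line.split())
    for tok, _ in counts.most_common():
        if len(vocab) >= max_size:
            break
        if tok not in vocab:
            vocab[tok] = len(vocab)
    return vocab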
Example #27
    A.add_argument('--beam-size', dest='beam_size', default=wargs.beam_size, help='beamsize')
    A.add_argument('--len-norm', dest='len_norm', type=int, default=1,
                   help='During searching, whether we normalize accumulated loss by length.')

    '''

    args = A.parse_args()
    model_file = args.model_file
    '''
    search_mode = args.search_mode
    beam_size = args.beam_size
    lenNorm = args.len_norm
    '''

    if wargs.share_vocab is False:
        wlog('Start loading both vocabularies ... ')
        assert os.path.exists(wargs.src_vcb) and os.path.exists(
            wargs.trg_vcb), 'need vocabulary ...'
        src_vocab = extract_vocab(None, wargs.src_vcb)
        trg_vocab = extract_vocab(None, wargs.trg_vcb)
    else:
        wlog('Start loading the shared vocabulary ... ')
        assert os.path.exists(wargs.src_vcb), 'need shared vocabulary ...'
        trg_vocab = src_vocab = extract_vocab(None, wargs.src_vcb)
    n_src_vcb, n_trg_vcb = src_vocab.size(), trg_vocab.size()
    wlog('Vocabulary size: |source|={}, |target|={}'.format(
        n_src_vcb, n_trg_vcb))

    # wv = KeyedVectors.load('word_vector_en', mmap='r')
    # voc = list(wv.vocab)
    # weight = tc.zeros(n_trg_vcb, 100)
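The commented-out lines above hint at initializing the target embeddings from pretrained gensim word vectors. A sketch of that idea follows; the vector file name 'word_vector_en' and the 100-dimensional size are taken from the comments, and the helper itself is invented for illustration:

import torch as tc
from gensim.models import KeyedVectors

def pretrained_embedding(idx2word, kv_path='word_vector_en', emb_dim=100):
    # Build a (vocab_size, emb_dim) matrix; rows for words the pretrained model
    # knows are copied over, everything else stays zero.
    wv = KeyedVectors.load(kv_path, mmap='r')
    weight = tc.zeros(len(idx2word), emb_dim)
    for idx, word in enumerate(idx2word):
        if word in wv:
            weight[idx] = tc.from_numpy(wv[word].copy())
    return weight

# hypothetical usage, assuming idx2key behaves like a list of words:
# weight = pretrained_embedding(trg_vocab.idx2key)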
Example #28
def main():

    # Check if CUDA is available
    if cuda.is_available():
        wlog('CUDA is available, specify the device via the gpu_id argument (e.g. gpu_id=[3])')
    else:
        wlog('Warning: CUDA is not available, falling back to CPU')

    if wargs.gpu_id:
        cuda.set_device(wargs.gpu_id[0])
        wlog('Using GPU {}'.format(wargs.gpu_id[0]))

    init_dir(wargs.dir_model)
    init_dir(wargs.dir_valid)
    init_dir(wargs.dir_tests)
    for prefix in wargs.tests_prefix:
        if not prefix == wargs.val_prefix: init_dir(wargs.dir_tests + '/' + prefix)

    wlog('Preparing data ... ', 0)

    train_srcD_file = wargs.dir_data + 'train.10k.zh5'
    wlog('\nPreparing source vocabulary from {} ... '.format(train_srcD_file))
    src_vocab = extract_vocab(train_srcD_file, wargs.src_dict, wargs.src_dict_size)

    train_trgD_file = wargs.dir_data + 'train.10k.en5'
    wlog('\nPreparing target vocabulary from {} ... '.format(train_trgD_file))
    trg_vocab = extract_vocab(train_trgD_file, wargs.trg_dict, wargs.trg_dict_size)

    train_src_file = wargs.dir_data + 'train.10k.zh0'
    train_trg_file = wargs.dir_data + 'train.10k.en0'
    wlog('\nPreparing training set from {} and {} ... '.format(train_src_file, train_trg_file))
    train_src_tlst, train_trg_tlst = wrap_data(train_src_file, train_trg_file, src_vocab, trg_vocab)
    #list [torch.LongTensor (sentence), torch.LongTensor, torch.LongTensor, ...], no padding
    wlog('Sentence-pairs count in training data: {}'.format(len(train_src_tlst)))
    src_vocab_size, trg_vocab_size = src_vocab.size(), trg_vocab.size()
    wlog('Vocabulary size: |source|={}, |target|={}'.format(src_vocab_size, trg_vocab_size))
    batch_train = Input(train_src_tlst, train_trg_tlst, wargs.batch_size)

    tests_data = None
    if wargs.tests_prefix is not None:
        tests_data = {}
        for prefix in wargs.tests_prefix:
            test_file = wargs.val_tst_dir + prefix + '.src'
            test_src_tlst, _ = val_wrap_data(test_file, src_vocab)
            # we select the best model using the nist03 test data
            if prefix == wargs.val_prefix:
                wlog('\nPreparing model-select set from {} ... '.format(test_file))
                batch_valid = Input(test_src_tlst, None, 1, volatile=True, prefix=prefix)
            else:
                wlog('\nPreparing test set from {} ... '.format(test_file))
                tests_data[prefix] = Input(test_src_tlst, None, 1, volatile=True)

    nmtModel = NMT()
    classifier = Classifier(wargs.out_size, trg_vocab_size)

    if wargs.pre_train:

        model_dict, class_dict, eid, bid, optim = load_pytorch_model(wargs.pre_train)
        if isinstance(optim, list): _, _, optim = optim
        # initializing parameters of interactive attention model
        for p in nmtModel.named_parameters(): p[1].data = model_dict[p[0]]
        for p in classifier.named_parameters(): p[1].data = class_dict[p[0]]
        #wargs.start_epoch = eid + 1
    else:

        for p in nmtModel.parameters(): init_params(p, uniform=True)
        for p in classifier.parameters(): init_params(p, uniform=True)
        optim = Optim(
            wargs.opt_mode, wargs.learning_rate, wargs.max_grad_norm,
            learning_rate_decay=wargs.learning_rate_decay,
            start_decay_from=wargs.start_decay_from,
            last_valid_bleu=wargs.last_valid_bleu
        )

    if wargs.gpu_id:
        wlog('Push model onto GPU ... ')
        nmtModel.cuda()
        classifier.cuda()
    else:
        wlog('Push model onto CPU ... ')
        nmtModel.cpu()
        classifier.cpu()

    nmtModel.classifier = classifier
    wlog(nmtModel)
    pcnt1 = len([p for p in nmtModel.parameters()])
    pcnt2 = sum([p.nelement() for p in nmtModel.parameters()])
    wlog('Parameters number: {}/{}'.format(pcnt1, pcnt2))

    optim.init_optimizer(nmtModel.parameters())

    #tor = Translator(nmtModel, src_vocab.idx2key, trg_vocab.idx2key)
    #tor.trans_tests(tests_data, pre_dict['epoch'], pre_dict['batch'])

    trainer = Trainer(nmtModel, src_vocab.idx2key, trg_vocab.idx2key, optim, trg_vocab_size)

    dev_src0 = wargs.dir_data + 'dev.1k.zh0'
    dev_trg0 = wargs.dir_data + 'dev.1k.en0'
    wlog('\nPreparing dev set for tuning from {} and {} ... '.format(dev_src0, dev_trg0))
    dev_src0, dev_trg0 = wrap_data(dev_src0, dev_trg0, src_vocab, trg_vocab)
    wlog(len(train_src_tlst))
    # add 1000 to train
    train_all_chunks = (train_src_tlst, train_trg_tlst)
    dh = DataHisto(train_all_chunks)

    dev_src1 = wargs.dir_data + 'dev.1k.zh1'
    dev_trg1 = wargs.dir_data + 'dev.1k.en1'
    wlog('\nPreparing dev set for tuning from {} and {} ... '.format(dev_src1, dev_trg1))
    dev_src1, dev_trg1 = wrap_data(dev_src1, dev_trg1, src_vocab, trg_vocab)

    dev_src2 = wargs.dir_data + 'dev.1k.zh2'
    dev_trg2 = wargs.dir_data + 'dev.1k.en2'
    wlog('\nPreparing dev set for tuning from {} and {} ... '.format(dev_src2, dev_trg2))
    dev_src2, dev_trg2 = wrap_data(dev_src2, dev_trg2, src_vocab, trg_vocab)

    dev_src3 = wargs.dir_data + 'dev.1k.zh3'
    dev_trg3 = wargs.dir_data + 'dev.1k.en3'
    wlog('\nPreparing dev set for tuning from {} and {} ... '.format(dev_src3, dev_trg3))
    dev_src3, dev_trg3 = wrap_data(dev_src3, dev_trg3, src_vocab, trg_vocab)

    dev_src4 = wargs.dir_data + 'dev.1k.zh4'
    dev_trg4 = wargs.dir_data + 'dev.1k.en4'
    wlog('\nPreparing dev set for tuning from {} and {} ... '.format(dev_src4, dev_trg4))
    dev_src4, dev_trg4 = wrap_data(dev_src4, dev_trg4, src_vocab, trg_vocab)
    wlog(len(dev_src4+dev_src3+dev_src2+dev_src1+dev_src0))
    dev_input = Input(dev_src4 + dev_src3 + dev_src2 + dev_src1 + dev_src0,
                      dev_trg4 + dev_trg3 + dev_trg2 + dev_trg1 + dev_trg0,
                      wargs.batch_size)
    trainer.train(dh, dev_input, 0, batch_valid, tests_data, merge=True,
                  name='DH_{}'.format('dev'))

    '''
    chunk_size = 1000
    rand_ids = tc.randperm(len(train_src_tlst))[:chunk_size * 1000]
    rand_ids = rand_ids.split(chunk_size)
    #train_chunks = [(dev_src, dev_trg)]
    train_chunks = []
    for k in range(len(rand_ids)):
        rand_id = rand_ids[k]
        chunk_src_tlst = [train_src_tlst[i] for i in rand_id]
        chunk_trg_tlst = [train_trg_tlst[i] for i in rand_id]
        #wlog('Sentence-pairs count in training data: {}'.format(len(src_samples_train)))
        #batch_train = Input(train_src_tlst, train_trg_tlst, wargs.batch_size)
        #batch_train = Input(src_samples_train, trg_samples_train, wargs.batch_size)
        train_chunks.append((chunk_src_tlst, chunk_trg_tlst))

    chunk_D0 = train_chunks[0]
    dh = DataHisto(chunk_D0)
    c0_input = Input(chunk_D0[0], chunk_D0[1], wargs.batch_size)
    trainer.train(dh, c0_input, 0, batch_valid, tests_data, merge=False, name='DH_{}'.format(0))
    for k in range(1, len(train_chunks)):
        wlog('*' * 30, False)
        wlog(' Next Data {} '.format(k), False)
        wlog('*' * 30)
        chunk_Dk = train_chunks[k]
        ck_input = Input(chunk_Dk[0], chunk_Dk[1], wargs.batch_size)
        trainer.train(dh, ck_input, k, batch_valid, tests_data, merge=True, name='DH_{}'.format(k))
        dh.add_batch_data(chunk_Dk)
    '''

    if tests_data and wargs.final_test:

        bestModel = NMT()
        classifier = Classifier(wargs.out_size, trg_vocab_size)

        assert os.path.exists(wargs.best_model)
        model_dict = tc.load(wargs.best_model)

        best_model_dict = model_dict['model']
        best_model_dict = {k: v for k, v in best_model_dict.items() if 'classifier' not in k}

        bestModel.load_state_dict(best_model_dict)
        classifier.load_state_dict(model_dict['class'])

        if wargs.gpu_id:
            wlog('Push NMT model onto GPU ... ')
            bestModel.cuda()
            classifier.cuda()
        else:
            wlog('Push NMT model onto CPU ... ')
            bestModel.cpu()
            classifier.cpu()

        bestModel.classifier = classifier

        tor = Translator(bestModel, src_vocab.idx2key, trg_vocab.idx2key)
        tor.trans_tests(tests_data, model_dict['epoch'], model_dict['batch'])
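The final-test block above expects a checkpoint holding 'model', 'class', 'epoch' and 'batch' entries, with any 'classifier' parameters filtered out of the NMT state dict before loading. A small self-contained sketch of writing and reading a checkpoint in that layout (the two nn.Linear modules and the file name are placeholders, not the project's classes) could be:

import torch as tc
import torch.nn as nn

model, classifier = nn.Linear(8, 8), nn.Linear(8, 4)   # stand-ins for NMT() / Classifier()

# save a checkpoint with the same keys this script reads back
tc.save({'model': model.state_dict(),
         'class': classifier.state_dict(),
         'epoch': 3, 'batch': 1200}, 'best_model_sketch.pt')

# reload, dropping any parameters that belong to an attached classifier
ckpt = tc.load('best_model_sketch.pt')
model_state = {k: v for k, v in ckpt['model'].items() if 'classifier' not in k}
model.load_state_dict(model_state)
classifier.load_state_dict(ckpt['class'])
print('restored from epoch {} batch {}'.format(ckpt['epoch'], ckpt['batch']))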