Пример #1
0
def prepare_vocab(model_dir, vocab_type, src_file, trg_file, vocab_size, gpu_id):
    if vocab_type == 'normal':
        src_vocab = VocabNormal()
        trg_vocab = VocabNormal()
        if os.path.isfile(model_dir + 'src_vocab.normal.pkl') and os.path.isfile(model_dir + 'trg_vocab.normal.pkl'):
            src_vocab.load(model_dir + 'src_vocab.normal.pkl')
            trg_vocab.load(model_dir + 'trg_vocab.normal.pkl')
        else:
            init_vocab = {'<pad>': 0, '<unk>': 1, '<s>': 2, '</s>': 3}
            src_vocab.build(src_file, True,  init_vocab, vocab_size)
            trg_vocab.build(trg_file, False, init_vocab, vocab_size)
            save_pickle(model_dir + 'src_vocab.normal.pkl', src_vocab.vocab)
            save_pickle(model_dir + 'trg_vocab.normal.pkl', trg_vocab.vocab)
        src_vocab.set_reverse_vocab()
        trg_vocab.set_reverse_vocab()

        sos = convert.convert_list(np.array([src_vocab.vocab['<s>']], dtype=np.int32), gpu_id)
        eos = convert.convert_list(np.array([src_vocab.vocab['</s>']], dtype=np.int32), gpu_id)

    elif vocab_type == 'subword':
        src_vocab = VocabSubword()
        trg_vocab = VocabSubword()
        if os.path.isfile(model_dir + 'src_vocab.sub.model') and os.path.isfile(model_dir + 'trg_vocab.sub.model'):
            src_vocab.load(model_dir + 'src_vocab.sub.model')
            trg_vocab.load(model_dir + 'trg_vocab.sub.model')
        else:
            src_vocab.build(src_file + '.sub', model_dir + 'src_vocab.sub', vocab_size)
            trg_vocab.build(trg_file + '.sub', model_dir + 'trg_vocab.sub', vocab_size)

        sos = convert.convert_list(np.array([src_vocab.vocab.PieceToId('<s>')], dtype=np.int32), gpu_id)
        eos = convert.convert_list(np.array([src_vocab.vocab.PieceToId('</s>')], dtype=np.int32), gpu_id)

    return src_vocab, trg_vocab, sos, eos
Пример #2
0
def main():
    args = parse_args()
    model_dir = args.model_dir
    """LOAD CONFIG FILE"""
    config_files = glob.glob(os.path.join(model_dir, '*.ini'))
    assert len(config_files) == 1, 'Put only one config file in the directory'
    config_file = config_files[0]
    config = configparser.ConfigParser()
    config.read(config_file)
    """LOGGER"""
    logger = getLogger(__name__)
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter('[%(asctime)s] %(message)s')

    sh = logging.StreamHandler()
    sh.setLevel(logging.INFO)
    sh.setFormatter(formatter)
    logger.addHandler(sh)

    log_file = model_dir + 'log.txt'
    fh = logging.FileHandler(log_file)
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    logger.info('[Training start] logging to {}'.format(log_file))
    """PARAMATER"""
    embed_size = int(config['Parameter']['embed_size'])
    hidden_size = int(config['Parameter']['hidden_size'])
    class_size = int(config['Parameter']['class_size'])
    dropout_ratio = float(config['Parameter']['dropout'])
    weight_decay = float(config['Parameter']['weight_decay'])
    gradclip = float(config['Parameter']['gradclip'])
    vocab_type = config['Parameter']['vocab_type']
    vocab_size = int(config['Parameter']['vocab_size'])
    coefficient = float(config['Parameter']['coefficient'])
    """TRINING DETAIL"""
    gpu_id = args.gpu
    n_epoch = args.epoch
    batch_size = args.batch
    interval = args.interval
    reg = False if args.type == 'l' or args.type == 's' else True
    """DATASET"""
    if args.type == 'l':
        section = 'Local'
    elif args.type == 'lr':
        section = 'Local_Reg'
    elif args.type == 's':
        section = 'Server'
    else:
        section = 'Server_Reg'
    train_src_file = config[section]['train_src_file']
    train_trg_file = config[section]['train_trg_file']
    valid_src_file = config[section]['valid_src_file']
    valid_trg_file = config[section]['valid_trg_file']
    test_src_file = config[section]['test_src_file']
    correct_txt_file = config[section]['correct_txt_file']

    train_data_size = dataset.data_size(train_src_file)
    valid_data_size = dataset.data_size(valid_src_file)
    logger.info('train size: {0}, valid size: {1}'.format(train_data_size, valid_data_size))

    if vocab_type == 'normal':
        src_vocab = dataset.VocabNormal(reg)
        trg_vocab = dataset.VocabNormal(reg)
        if os.path.isfile(model_dir + 'src_vocab.normal.pkl') and os.path.isfile(model_dir + 'trg_vocab.normal.pkl'):
            src_vocab.load(model_dir + 'src_vocab.normal.pkl')
            trg_vocab.load(model_dir + 'trg_vocab.normal.pkl')
        else:
            init_vocab = {'<pad>': 0, '<unk>': 1, '<s>': 2, '</s>': 3}
            src_vocab.build(train_src_file, True,  init_vocab, vocab_size)
            trg_vocab.build(train_trg_file, False, init_vocab, vocab_size)
            dataset.save_pickle(model_dir + 'src_vocab.normal.pkl', src_vocab.vocab)
            dataset.save_pickle(model_dir + 'trg_vocab.normal.pkl', trg_vocab.vocab)
        src_vocab.set_reverse_vocab()
        trg_vocab.set_reverse_vocab()

        sos = convert.convert_list(np.array([src_vocab.vocab['<s>']], dtype=np.int32), gpu_id)
        eos = convert.convert_list(np.array([src_vocab.vocab['</s>']], dtype=np.int32), gpu_id)

    elif vocab_type == 'subword':
        src_vocab = dataset.VocabSubword()
        trg_vocab = dataset.VocabSubword()
        if os.path.isfile(model_dir + 'src_vocab.sub.model') and os.path.isfile(model_dir + 'trg_vocab.sub.model'):
            src_vocab.load(model_dir + 'src_vocab.sub.model')
            trg_vocab.load(model_dir + 'trg_vocab.sub.model')
        else:
            src_vocab.build(train_src_file, model_dir + 'src_vocab.sub', vocab_size)
            trg_vocab.build(train_trg_file, model_dir + 'trg_vocab.sub', vocab_size)

        sos = convert.convert_list(np.array([src_vocab.vocab.PieceToId('<s>')], dtype=np.int32), gpu_id)
        eos = convert.convert_list(np.array([src_vocab.vocab.PieceToId('</s>')], dtype=np.int32), gpu_id)

    src_vocab_size = len(src_vocab.vocab)
    trg_vocab_size = len(trg_vocab.vocab)
    logger.info('src_vocab size: {}, trg_vocab size: {}'.format(src_vocab_size, trg_vocab_size))

    train_iter = dataset.Iterator(train_src_file, train_trg_file, src_vocab, trg_vocab, batch_size, sort=True, shuffle=True, reg=reg)
    # train_iter = dataset.Iterator(train_src_file, train_trg_file, src_vocab, trg_vocab, batch_size, sort=False, shuffle=False, reg=reg)
    valid_iter = dataset.Iterator(valid_src_file, valid_trg_file, src_vocab, trg_vocab, batch_size, sort=False, shuffle=False, reg=reg)
    evaluater = evaluate.Evaluate(correct_txt_file)
    test_iter = dataset.Iterator(test_src_file, test_src_file, src_vocab, trg_vocab, batch_size, sort=False, shuffle=False)
    """MODEL"""
    if reg:
        class_size = 1
        model = MultiReg(src_vocab_size, trg_vocab_size, embed_size, hidden_size, class_size, dropout_ratio, coefficient)
    else:
        model = Multi(src_vocab_size, trg_vocab_size, embed_size, hidden_size, class_size, dropout_ratio, coefficient)
    """OPTIMIZER"""
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(gradclip))
    optimizer.add_hook(chainer.optimizer.WeightDecay(weight_decay))
    """GPU"""
    if gpu_id >= 0:
        logger.info('Use GPU')
        chainer.cuda.get_device_from_id(gpu_id).use()
        model.to_gpu()
    """TRAIN"""
    sum_loss = 0
    loss_dic = {}
    result = []
    for epoch in range(1, n_epoch + 1):
        for i, batch in enumerate(train_iter.generate(), start=1):
            try:
                batch = convert.convert(batch, gpu_id)
                loss = optimizer.target(*batch)
                sum_loss += loss.data
                optimizer.target.cleargrads()
                loss.backward()
                optimizer.update()

                if i % interval == 0:
                    logger.info('E{} ## iteration:{}, loss:{}'.format(epoch, i, sum_loss))
                    sum_loss = 0

            except Exception as e:
                logger.info(traceback.format_exc())
                logger.info('iteration: {}'.format(i))
                for b in batch[0]:
                    for bb in b:
                        logger.info(src_vocab.id2word(bb))
        chainer.serializers.save_npz(model_dir + 'model_epoch_{}.npz'.format(epoch), model)

        """EVALUATE"""
        valid_loss = 0
        for batch in valid_iter.generate():
            batch = convert.convert(batch, gpu_id)
            with chainer.no_backprop_mode(), chainer.using_config('train', False):
                valid_loss += optimizer.target(*batch).data
        logger.info('E{} ## val loss:{}'.format(epoch, valid_loss))
        loss_dic[epoch] = valid_loss

        """TEST"""
        outputs = []
        labels = []
        for i, batch in enumerate(test_iter.generate(), start=1):
            batch = convert.convert(batch, gpu_id)
            with chainer.no_backprop_mode(), chainer.using_config('train', False):
                output, label = model.predict(batch[0], sos, eos)
            # for o, l in zip(output, label):
            #     o = chainer.cuda.to_cpu(o)
            #     outputs.append(trg_vocab.id2word(o))
            #     labels.append(l)
            for l in label:
                labels.append(l)
        rank_list = evaluater.rank(labels)
        s_rate, s_count = evaluater.single(rank_list)
        m_rate, m_count = evaluater.multiple(rank_list)
        logger.info('E{} ## s: {} | {}'.format(epoch, ' '.join(x for x in s_rate), ' '.join(x for x in s_count)))
        logger.info('E{} ## m: {} | {}'.format(epoch, ' '.join(x for x in m_rate), ' '.join(x for x in m_count)))

        # with open(model_dir + 'model_epoch_{}.hypo'.format(epoch), 'w')as f:
        #     [f.write(o + '\n') for o in outputs]
        with open(model_dir + 'model_epoch_{}.attn'.format(epoch), 'w')as f:
            [f.write('{}\n'.format(l)) for l in labels]

        result.append('{},{},{},{}'.format(epoch, valid_loss, s_rate[-1], m_rate[-1]))

    """MODEL SAVE"""
    best_epoch = min(loss_dic, key=(lambda x: loss_dic[x]))
    logger.info('best_epoch:{0}'.format(best_epoch))
    chainer.serializers.save_npz(model_dir + 'best_model.npz', model)

    with open(model_dir + 'result.csv', 'w')as f:
        f.write('epoch,valid_loss,single,multiple\n')
        [f.write(r + '\n') for r in result]
Пример #3
0
 def convert(self):
     paths = file_paths(self.src_entry.get())
     convert_list(paths, self.src_entry.get(), self.dst_entry.get(),
                  self.bitrate_entry.get())
Пример #4
0
def main():
    args = parse_args()
    model_file = args.model_file
    model_dir = re.search(r'(.*/)', model_file).group(1)
    """LOAD CONFIG FILE"""
    config_files = glob.glob(os.path.join(model_dir, '*.ini'))
    assert len(config_files) == 1, 'Put only one config file in the directory'
    config_file = config_files[0]
    config = configparser.ConfigParser()
    config.read(config_file)
    """LOGGER"""
    logger = getLogger(__name__)
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter('[%(asctime)s] %(message)s')

    sh = logging.StreamHandler()
    sh.setLevel(logging.INFO)
    sh.setFormatter(formatter)
    logger.addHandler(sh)

    log_file = model_dir + 'test_log.txt'
    fh = logging.FileHandler(log_file)
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    logger.info('[Test start] logging to {}'.format(log_file))
    """PARAMATER"""
    embed_size = int(config['Parameter']['embed_size'])
    hidden_size = int(config['Parameter']['hidden_size'])
    class_size = int(config['Parameter']['class_size'])
    dropout_ratio = float(config['Parameter']['dropout'])
    coefficient = float(config['Parameter']['coefficient'])
    """TEST DETAIL"""
    gpu_id = args.gpu
    batch_size = args.batch
    data = model_dir.split('/')[-2].split('_')

    model_type = data[0]
    if 'normal' in data[1]:
        vocab_type = 'normal'
    else:
        vocab_type = 'subword'
    if data[2] == 's':
        data_path = 'server'
    else:
        data_path = 'local'
    """DATASET"""
    test_src_file = config[data_path]['test_src_file']
    row_score_file = config[data_path]['row_score_file']
    row_score = dataset.load_score_file(row_score_file)

    test_data_size = dataset.data_size(test_src_file)
    logger.info('test size: {}'.format(test_data_size))
    if vocab_type == 'normal':
        src_vocab = dataset.VocabNormal()
        src_vocab.load(model_dir + 'src_vocab.normal.pkl')
        src_vocab.set_reverse_vocab()
        trg_vocab = dataset.VocabNormal()
        trg_vocab.load(model_dir + 'trg_vocab.normal.pkl')
        trg_vocab.set_reverse_vocab()

        sos = convert.convert_list(
            np.array([src_vocab.vocab['<s>']], dtype=np.int32), gpu_id)
        eos = convert.convert_list(
            np.array([src_vocab.vocab['</s>']], dtype=np.int32), gpu_id)

    elif vocab_type == 'subword':
        src_vocab = dataset.VocabSubword()
        src_vocab.load(model_dir + 'src_vocab.sub.model')
        trg_vocab = dataset.VocabSubword()
        trg_vocab.load(model_dir + 'trg_vocab.sub.model')

        sos = convert.convert_list(
            np.array([src_vocab.vocab.PieceToId('<s>')], dtype=np.int32),
            gpu_id)
        eos = convert.convert_list(
            np.array([src_vocab.vocab.PieceToId('</s>')], dtype=np.int32),
            gpu_id)

    src_vocab_size = len(src_vocab.vocab)
    trg_vocab_size = len(trg_vocab.vocab)
    logger.info('src_vocab size: {}, trg_vocab size: {}'.format(
        src_vocab_size, trg_vocab_size))

    test_iter = dataset.Iterator(test_src_file,
                                 test_src_file,
                                 src_vocab,
                                 trg_vocab,
                                 batch_size,
                                 sort=False,
                                 shuffle=False)

    gridsearcher = gridsearch.GridSearch(test_src_file)
    """MODEL"""
    if model_type == 'multi':
        model = model_reg.Multi(src_vocab_size, trg_vocab_size, embed_size,
                                hidden_size, class_size, dropout_ratio,
                                coefficient)
    elif model_type in ['label', 'pretrain']:
        model = model_reg.Label(src_vocab_size, trg_vocab_size, embed_size,
                                hidden_size, class_size, dropout_ratio)
    else:
        model = model_reg.EncoderDecoder(src_vocab_size, trg_vocab_size,
                                         embed_size, hidden_size,
                                         dropout_ratio)

    chainer.serializers.load_npz(model_file, model)
    """GPU"""
    if gpu_id >= 0:
        logger.info('Use GPU')
        chainer.cuda.get_device_from_id(gpu_id).use()
        model.to_gpu()
    """TEST"""
    epoch = 'T'
    outputs = []
    labels = []
    alignments = []
    for i, batch in enumerate(test_iter.generate(), start=1):
        with chainer.no_backprop_mode(), chainer.using_config('train', False):
            output, label, align = model.predict(batch[0], sos, eos)
        for o in output:
            outputs.append(trg_vocab.id2word(chainer.cuda.to_cpu(o)))
        for l in label:
            labels.append(chainer.cuda.to_cpu(l))
        for a in align:
            alignments.append(chainer.cuda.to_cpu(a))

    model_file = model_file[:-3]
    if model_type == 'multi':
        score = gridsearcher.gridsearch(labels, alignments)
        logger.info('E{} ## {}'.format(epoch, score[0]))
        logger.info('E{} ## {}'.format(epoch, score[1]))
        with open(model_file + 'label.T', 'w') as f:
            [f.write('{}\n'.format(l)) for l in labels]
        with open(model_file + '.hypo.T', 'w') as f:
            [f.write(o + '\n') for o in outputs]
        with open(model_file + '.align.T', 'w') as f:
            [f.write('{}\n'.format(a)) for a in alignments]

    elif model_type in ['label', 'pretrain']:
        score = gridsearcher.gridsearch(labels, alignments)
        logger.info('E{} ## {}'.format(epoch, score[0]))
        logger.info('E{} ## {}'.format(epoch, score[1]))
        with open(model_file + 'label.T', 'w') as f:
            [f.write('{}\n'.format(l)) for l in labels]

    else:
        score = gridsearcher.gridsearch(row_score, alignments)
        logger.info('E{} ## {}'.format(epoch, score[0]))
        logger.info('E{} ## {}'.format(epoch, score[1]))
        with open(model_file + '.hypo.T', 'w') as f:
            [f.write(o + '\n') for o in outputs]
        with open(model_file + '.align.T', 'w') as f:
            [f.write('{}\n'.format(a)) for a in alignments]