def main():
    args = parse_args()
    config = configparser.ConfigParser()

    """ARGS DETAIL"""
    config_file = args.config_file
    batch_size = args.batch
    n_epoch = args.epoch
    pretrain_epoch = args.pretrain_epoch
    gpu_id = args.gpu
    model_type = args.model
    pretrain_w2v = args.pretrain_w2v
    data_path = args.data_path
    load_model = args.load_model

    """DIR PREPARE"""
    config.read(config_file)
    vocab_size = int(config['Parameter']['vocab_size'])
    coefficient = float(config['Parameter']['coefficient'])
    shuffle_data = bool(config['Parameter']['shuffle'])

    if pretrain_w2v:
        vocab_size = 'p' + str(vocab_size)

    if model_type == 'multi':
        if shuffle_data:
            base_dir = './pseudo_{}_{}_{}_c{}_shuffle/'.format(
                model_type, vocab_size, data_path[0], coefficient)
        else:
            base_dir = './pseudo_{}_{}_{}_c{}/'.format(
                model_type, vocab_size, data_path[0], coefficient)
    else:
        if shuffle_data:
            base_dir = './pseudo_{}_{}_{}_shuffle/'.format(
                model_type, vocab_size, data_path[0])
        else:
            base_dir = './pseudo_{}_{}_{}/'.format(
                model_type, vocab_size, data_path[0])
    model_save_dir = base_dir

    if not os.path.exists(base_dir):
        os.mkdir(base_dir)
        shutil.copyfile(config_file, base_dir + config_file)
    config_file = base_dir + config_file
    config.read(config_file)

    """PARAMETER"""
    embed_size = int(config['Parameter']['embed_size'])
    hidden_size = int(config['Parameter']['hidden_size'])
    class_size = int(config['Parameter']['class_size'])
    dropout_ratio = float(config['Parameter']['dropout'])
    weight_decay = float(config['Parameter']['weight_decay'])
    gradclip = float(config['Parameter']['gradclip'])
    vocab_size = int(config['Parameter']['vocab_size'])
    valid_num = int(config['Parameter']['valid_num'])
    shuffle_data = bool(config['Parameter']['shuffle'])

    """LOGGER"""
    log_file = model_save_dir + 'log.txt'
    logger = dataset.prepare_logger(log_file)
    logger.info(args)  # log the parsed arguments
    logger.info('[Training start] logging to {}'.format(log_file))

    """DATASET"""
    train_src_file = config[data_path]['train_src_file']
    train_trg_file = config[data_path]['train_trg_file']
    valid_src_file = config[data_path]['valid_src_file']
    valid_trg_file = config[data_path]['valid_trg_file']
    test_src_file = config[data_path]['single_src_file']
    test_trg_file = config[data_path]['single_trg_file']
    src_w2v_file = config[data_path]['src_w2v_file']
    trg_w2v_file = config[data_path]['trg_w2v_file']

    train_data = dataset.load_label_corpus_file(train_src_file, train_trg_file)
    qa_data_sub_lit = dataset.split_valid_data(train_data, valid_num)
    valid_data = dataset.load_label_corpus_file(valid_src_file, valid_trg_file)
    test_data = dataset.load_label_corpus_file(test_src_file, test_trg_file)
    test_data_sub_lit = dataset.split_valid_data(test_data, valid_num)

    """VOCABULARY"""
    src_vocab, trg_vocab, sos, eos = dataset.prepare_vocab(
        base_dir, train_data, vocab_size, gpu_id)
    src_vocab_size = len(src_vocab.vocab)
    trg_vocab_size = len(trg_vocab.vocab)

    src_initialW, trg_initialW = None, None
    if pretrain_w2v:
        w2v = word2vec.Word2Vec()
        src_initialW, vector_size, src_match_word_count = w2v.make_initialW(
            src_vocab.vocab, src_w2v_file)
        trg_initialW, vector_size, trg_match_word_count = w2v.make_initialW(
            trg_vocab.vocab, trg_w2v_file)
        logger.info(
            'Initialize w2v embedding. Match: src {}/{}, trg {}/{}'.format(
                src_match_word_count, src_vocab_size,
                trg_match_word_count, trg_vocab_size))

    logger.info('src_vocab size: {}, trg_vocab size: {}'.format(
        src_vocab_size, trg_vocab_size))

    evaluater = evaluate.Evaluate()

    """GPU"""
    if gpu_id >= 0:
        logger.info('Use GPU')
        chainer.cuda.get_device_from_id(gpu_id).use()
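    # Cross-validation over `valid_num` folds: each fold gets its own
    # sub-directory (valid{ite}/), the pseudo QA corpus and the single-document
    # corpus are re-split for that fold, a fresh model is built (and, for the
    # 'pretrain' setting, first pre-trained on the in-domain training split),
    # trained on batches mixed from both corpora, and the epoch with the best
    # dev score is kept as that fold's result.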
    # Bind the network classes before the fold loop; the name `model` is
    # rebound to a network instance inside the loop below.
    Multi, Label, EncoderDecoder = model.Multi, model.Label, model.EncoderDecoder

    cross_valid_result = []
    for ite in range(1, valid_num + 1):
        model_valid_dir = base_dir + 'valid{}/'.format(ite)
        if not os.path.exists(model_valid_dir):
            os.mkdir(model_valid_dir)

        qa_train_data, qa_dev_data, qa_test_data = dataset.separate_train_dev_test(
            qa_data_sub_lit, ite)
        train_data, dev_data, test_data = dataset.separate_train_dev_test(
            test_data_sub_lit, ite)
        test_data_id = [t['id'] for t in test_data]

        qa_iter = dataset.Iterator(qa_train_data, src_vocab, trg_vocab,
                                   batch_size, gpu_id, sort=True, shuffle=True)
        valid_iter = dataset.Iterator(valid_data, src_vocab, trg_vocab,
                                      batch_size, gpu_id, sort=False,
                                      shuffle=False)
        train_iter = dataset.Iterator(train_data, src_vocab, trg_vocab,
                                      batch_size, gpu_id, sort=True,
                                      shuffle=True)
        dev_iter = dataset.Iterator(dev_data, src_vocab, trg_vocab,
                                    batch_size, gpu_id, sort=False,
                                    shuffle=False)
        test_iter = dataset.Iterator(test_data, src_vocab, trg_vocab,
                                     batch_size, gpu_id, sort=False,
                                     shuffle=False)

        qa_size = len(qa_train_data)
        train_size = len(train_data)
        logger.info('V{} ## QA:{}, train:{}, dev:{}, test:{}'.format(
            ite, qa_size, train_size, len(dev_data), len(test_data)))

        """MODEL"""
        if model_type == 'multi':
            model = Multi(src_vocab_size, trg_vocab_size, embed_size,
                          hidden_size, class_size, dropout_ratio, coefficient,
                          src_initialW, trg_initialW)
        elif model_type in ['label', 'pretrain']:
            model = Label(src_vocab_size, trg_vocab_size, embed_size,
                          hidden_size, class_size, dropout_ratio,
                          src_initialW, trg_initialW)
        else:
            model = EncoderDecoder(src_vocab_size, trg_vocab_size, embed_size,
                                   hidden_size, dropout_ratio, src_initialW,
                                   trg_initialW)
        if gpu_id >= 0:
            model.to_gpu()

        """OPTIMIZER"""
        optimizer = chainer.optimizers.Adam()
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.GradientClipping(gradclip))
        optimizer.add_hook(chainer.optimizer.WeightDecay(weight_decay))

        """PRETRAIN"""
        if model_type == 'pretrain' and load_model is None:
            logger.info('Pre-train start')
            pretrain_loss_dic = {}
            for epoch in range(1, pretrain_epoch + 1):
                train_loss = 0
                for i, batch in enumerate(train_iter.generate(), start=1):
                    try:
                        loss = model.pretrain(*batch)
                        train_loss += loss.data
                        optimizer.target.cleargrads()
                        loss.backward()
                        optimizer.update()
                    except Exception as e:
                        logger.info('P{} ## train iter: {}, {}'.format(
                            epoch, i, e))
                chainer.serializers.save_npz(
                    model_save_dir + 'p_model_epoch_{}.npz'.format(epoch),
                    model)

                """EVALUATE"""
                valid_loss = 0
                for batch in valid_iter.generate():
                    with chainer.no_backprop_mode(), chainer.using_config(
                            'train', False):
                        valid_loss += model.pretrain(*batch).data
                logger.info('P{} ## train loss: {}, val loss:{}'.format(
                    epoch, train_loss, valid_loss))
                pretrain_loss_dic[epoch] = valid_loss

            """MODEL SAVE & LOAD"""
            best_epoch = min(pretrain_loss_dic,
                             key=(lambda x: pretrain_loss_dic[x]))
            logger.info('best_epoch:{}, val loss: {}'.format(
                best_epoch, pretrain_loss_dic[best_epoch]))
            shutil.copyfile(
                model_save_dir + 'p_model_epoch_{}.npz'.format(best_epoch),
                model_save_dir + 'p_best_model.npz')
            logger.info('Pre-train finish')

        if load_model:
            logger.info('load model: {}'.format(load_model))
            chainer.serializers.load_npz(base_dir + load_model, model)
        """TRAIN"""
        epoch_info = {}
        for epoch in range(1, n_epoch + 1):
            train_loss = 0
            mix_train_iter = dataset.MixIterator(qa_iter, train_iter, seed=0,
                                                 shuffle=shuffle_data)
            for i, batch in enumerate(mix_train_iter.generate(), start=1):
                try:
                    loss = optimizer.target(*batch[0])
                    train_loss += loss.data
                    optimizer.target.cleargrads()
                    loss.backward()
                    optimizer.update()
                except Exception as e:
                    logger.info('V{} ## E{} ## train iter: {}, {}'.format(
                        ite, epoch, i, e))
            chainer.serializers.save_npz(
                model_valid_dir + 'model_epoch_{}.npz'.format(epoch), model)

            """DEV"""
            labels, alignments = [], []
            for i, batch in enumerate(dev_iter.generate(), start=1):
                try:
                    with chainer.no_backprop_mode(), chainer.using_config(
                            'train', False):
                        _, label, align = model.predict(batch[0], sos, eos)
                except Exception as e:
                    logger.info('V{} ## E{} ## dev iter: {}, {}'.format(
                        ite, epoch, i, e))
                if model_type == 'multi':
                    for l, a in zip(label, align):
                        labels.append(chainer.cuda.to_cpu(l))
                        alignments.append(chainer.cuda.to_cpu(a))
                elif model_type in ['label', 'pretrain']:
                    for l in label:
                        labels.append(chainer.cuda.to_cpu(l))
                else:
                    for a in align:
                        alignments.append(chainer.cuda.to_cpu(a))
            best_param_dic = evaluater.param_search(labels, alignments,
                                                    dev_data)
            param = max(best_param_dic,
                        key=lambda x: best_param_dic[x]['macro'])
            init, mix = evaluate.key_to_param(param)
            dev_score = round(best_param_dic[param]['macro'], 3)

            """TEST"""
            outputs, labels, alignments = [], [], []
            for i, batch in enumerate(test_iter.generate(), start=1):
                try:
                    with chainer.no_backprop_mode(), chainer.using_config(
                            'train', False):
                        output, label, align = model.predict(
                            batch[0], sos, eos)
                except Exception as e:
                    logger.info('V{} ## E{} ## test iter: {}, {}'.format(
                        ite, epoch, i, e))
                if model_type == 'multi':
                    for l, a in zip(label, align):
                        labels.append(chainer.cuda.to_cpu(l))
                        alignments.append(chainer.cuda.to_cpu(a))
                elif model_type in ['label', 'pretrain']:
                    for l in label:
                        labels.append(chainer.cuda.to_cpu(l))
                else:
                    for a in align:
                        alignments.append(chainer.cuda.to_cpu(a))
            rate, count, tf_lit, macro, micro = evaluater.eval_param(
                labels, alignments, test_data, init, mix)
            test_macro_score = round(macro, 3)
            test_micro_score = round(micro, 3)
            logger.info(
                'V{} ## E{} ## loss: {}, dev: {}, param: {}, micro: {}, macro: {}'
                .format(ite, epoch, train_loss, dev_score, param,
                        test_micro_score, test_macro_score))
            epoch_info[epoch] = {
                'id': test_data_id,
                'label': labels,
                'align': alignments,
                'hypo': outputs,
                'epoch': epoch,
                'dev_score': dev_score,
                'param': param,
                'rate': rate,
                'count': count,
                'tf': tf_lit,
                'macro': test_macro_score,
                'micro': test_micro_score
            }
            dataset.save_output(model_valid_dir, epoch_info[epoch])

        """MODEL SAVE"""
        best_epoch = max(epoch_info,
                         key=(lambda x: epoch_info[x]['dev_score']))
        cross_valid_result.append(epoch_info[best_epoch])
        logger.info(
            'V{} ## best_epoch: {}, dev: {}, micro: {}, macro: {}'.format(
                ite, best_epoch, epoch_info[best_epoch]['dev_score'],
                epoch_info[best_epoch]['micro'],
                epoch_info[best_epoch]['macro']))
        shutil.copyfile(
            model_valid_dir + 'model_epoch_{}.npz'.format(best_epoch),
            model_valid_dir + 'best_model.npz')
        logger.info('')
    ave_dev_score, ave_macro_score, ave_micro_score = 0, 0, 0
    ave_test_score = [0 for _ in range(len(cross_valid_result[0]['rate']))]
    id_total, label_total, align_total, tf_total = [], [], [], []
    for v, r in enumerate(cross_valid_result, start=1):
        ave_dev_score += r['dev_score']
        ave_macro_score += r['macro']
        ave_micro_score += r['micro']
        for i, rate in enumerate(r['rate']):
            ave_test_score[i] += rate
        logger.info(' {}: e{}, {}\tdev: {}, micro: {}, macro: {} {}'.format(
            v, r['epoch'], r['param'], r['dev_score'], r['micro'],
            r['macro'], dataset.float_to_str(r['rate'])))
        id_total.extend(r['id'])
        label_total.extend(r['label'])
        align_total.extend(r['align'])
        tf_total.extend(r['tf'])
    ave_dev_score = round(ave_dev_score / valid_num, 3)
    ave_macro_score = round(ave_macro_score / valid_num, 3)
    ave_micro_score = round(ave_micro_score / valid_num, 3)
    ave_test_score = [
        ave_test_score[i] / valid_num for i in range(len(ave_test_score))
    ]
    logger.info('dev: {}, micro: {}, macro: {} {}'.format(
        ave_dev_score, ave_micro_score, ave_macro_score,
        dataset.float_to_str(ave_test_score)))

    label, align, tf = dataset.sort_multi_list(id_total, label_total,
                                               align_total, tf_total)
    dataset.save_list(base_dir + 'label.txt', label)
    dataset.save_list(base_dir + 'align.txt', align)
    dataset.save_list(base_dir + 'tf.txt', tf)
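# ---------------------------------------------------------------------------
# Illustration of the cross-validation split convention assumed above.
# `dataset.split_valid_data` and `dataset.separate_train_dev_test` are the
# project's own helpers; the two functions below are only a minimal sketch of
# the rotation implied by the `order = {1: [4, 5], ...}` table in the merge
# script (fold 1 -> dev sub-list 4, test sub-list 5, and so on), not the
# actual implementation.


def _split_into_sublists_sketch(data, valid_num):
    """Split `data` into `valid_num` roughly equal, contiguous sub-lists."""
    size = -(-len(data) // valid_num)  # ceiling division
    return [data[k * size:(k + 1) * size] for k in range(valid_num)]


def _rotate_train_dev_test_sketch(sub_lists, ite):
    """Return (train, dev, test) for 1-indexed fold `ite`.

    With 5 sub-lists, fold 1 holds out sub-list 4 for dev and sub-list 5 for
    test, matching order[1] == [4, 5]; the remaining sub-lists form train.
    """
    n = len(sub_lists)
    dev_idx = (ite + 2) % n   # 0-indexed position of the dev sub-list
    test_idx = (ite + 3) % n  # 0-indexed position of the test sub-list
    train = [x for k, sub in enumerate(sub_lists)
             if k not in (dev_idx, test_idx) for x in sub]
    return train, sub_lists[dev_idx], sub_lists[test_idx]


# Example: splitting range(10) into 5 sub-lists, fold 1 trains on items 0-5,
# uses [6, 7] as the dev sub-list and [8, 9] as the test sub-list.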
def main(): """ model1: label model2: encdec を指定する """ args = parse_args() model_name1 = args.label_model model_dir1 = re.search(r'^(.*/)', model_name1).group(1) model_name2 = args.encdec_model model_dir2 = re.search(r'^(.*/)', model_name2).group(1) valid_type = args.valid # 結果保存用ディレクトリ作成 output_dir = model_dir1 + model_dir2 if not os.path.exists(output_dir): os.mkdir(output_dir) # 評価データ準備 config = configparser.ConfigParser() config_files = glob.glob(os.path.join(model_dir1, '*.ini')) config.read(config_files[0]) valid_num = int(config['Parameter']['valid_num']) test_src_file = config['server']['single_src_file'] test_trg_file = config['server']['single_trg_file'] data = dataset.load_label_corpus_file(test_src_file, test_trg_file) data_sub_lit = dataset.split_valid_data(data, valid_num) evaluater = evaluate.Evaluate() result_dic = {} # validファイルに分割されている時 if valid_type == 'TT': """ model1: validファイルあり model2: validファイルあり """ model_file_num = len( glob.glob(os.path.join(model_dir1, 'valid1/model_epoch_*.npz'))) label_dic = {} align_dic = {} for i in range(1, model_file_num + 1): label_dic[i] = [] align_dic[i] = [] for valid in [2, 3, 4, 5, 1]: label, _ = dataset.load_score_file( model_dir1 + 'valid{}/model_epoch_{}'.format(valid, i)) label_dic[i].append(label) _, align = dataset.load_score_file( model_dir2 + 'valid{}/model_epoch_{}'.format(valid, i)) align_dic[i].append(align) order = {1: [4, 5], 2: [5, 1], 3: [1, 2], 4: [2, 3], 5: [3, 4]} for i in tqdm(range(1, model_file_num + 1)): for j in range(1, model_file_num + 1): info = [] for ite, v in order.items(): _, dev_data, test_data = dataset.separate_train_dev_test( data_sub_lit, ite) dev_label = label_dic[i][v[0] - 1] test_label = label_dic[i][v[1] - 1] dev_align = align_dic[j][v[0] - 1] test_align = align_dic[j][v[1] - 1] best_param_dic = evaluater.param_search( dev_label, dev_align, dev_data) param = max(best_param_dic, key=lambda x: best_param_dic[x]['macro']) init, mix = evaluate.key_to_param(param) dev_score = round(best_param_dic[param]['macro'], 3) rate, count, tf_lit, macro, micro = evaluater.eval_param( test_label, test_align, test_data, init, mix) test_macro_score = round(macro, 3) test_micro_score = round(micro, 3) info.append({ 'dev_score': dev_score, 'param': param, 'macro': test_macro_score, 'micro': test_micro_score, 'tf': tf_lit }) ave_dev_score, ave_macro_score, ave_micro_score = 0, 0, 0 param = [] tf_lit = [] for v, r in enumerate(info, start=1): ave_dev_score += r['dev_score'] ave_macro_score += r['macro'] ave_micro_score += r['micro'] param.append(r['param']) tf_lit.extend(r['tf']) ave_dev_score = round(ave_dev_score / valid_num, 3) ave_macro_score = round(ave_macro_score / valid_num, 3) ave_micro_score = round(ave_micro_score / valid_num, 3) key = 'label{}_enc{}'.format(i, j) result_dic[key] = { 'dev': ave_dev_score, 'micro': ave_micro_score, 'macro': ave_macro_score, 'param': ' '.join(param), 'tf': tf_lit } best_score = max(result_dic, key=lambda x: result_dic[x]['dev']) with open(output_dir + 'merge.txt', 'w') as f: [ f.write('{}: {}\n'.format(k, v)) for k, v in sorted(result_dic.items()) ] f.write('best score\n{}: {}\n'.format(best_score, result_dic[best_score])) with open(output_dir + 'tf.txt', 'w') as f: [f.write(r + '\n') for r in result_dic[best_score]['tf']] elif valid_type == 'FF': """ model1: validファイルなし model2: validファイルなし """ model_file_num = len( glob.glob(os.path.join(model_dir1, 'model_epoch_*.npz'))) for i in tqdm(range(1, model_file_num + 1)): label, _ = dataset.load_score_file(model_dir1 + 
    # When the model outputs are split into per-fold valid directories
    if valid_type == 'TT':
        """
        model1: has valid directories
        model2: has valid directories
        """
        model_file_num = len(
            glob.glob(os.path.join(model_dir1, 'valid1/model_epoch_*.npz')))
        label_dic = {}
        align_dic = {}
        for i in range(1, model_file_num + 1):
            label_dic[i] = []
            align_dic[i] = []
            for valid in [2, 3, 4, 5, 1]:
                label, _ = dataset.load_score_file(
                    model_dir1 + 'valid{}/model_epoch_{}'.format(valid, i))
                label_dic[i].append(label)
                _, align = dataset.load_score_file(
                    model_dir2 + 'valid{}/model_epoch_{}'.format(valid, i))
                align_dic[i].append(align)

        order = {1: [4, 5], 2: [5, 1], 3: [1, 2], 4: [2, 3], 5: [3, 4]}
        for i in tqdm(range(1, model_file_num + 1)):
            for j in range(1, model_file_num + 1):
                info = []
                for ite, v in order.items():
                    _, dev_data, test_data = dataset.separate_train_dev_test(
                        data_sub_lit, ite)
                    dev_label = label_dic[i][v[0] - 1]
                    test_label = label_dic[i][v[1] - 1]
                    dev_align = align_dic[j][v[0] - 1]
                    test_align = align_dic[j][v[1] - 1]

                    best_param_dic = evaluater.param_search(
                        dev_label, dev_align, dev_data)
                    param = max(best_param_dic,
                                key=lambda x: best_param_dic[x]['macro'])
                    init, mix = evaluate.key_to_param(param)
                    dev_score = round(best_param_dic[param]['macro'], 3)

                    rate, count, tf_lit, macro, micro = evaluater.eval_param(
                        test_label, test_align, test_data, init, mix)
                    test_macro_score = round(macro, 3)
                    test_micro_score = round(micro, 3)
                    info.append({
                        'dev_score': dev_score,
                        'param': param,
                        'macro': test_macro_score,
                        'micro': test_micro_score,
                        'tf': tf_lit
                    })

                ave_dev_score, ave_macro_score, ave_micro_score = 0, 0, 0
                param = []
                tf_lit = []
                for v, r in enumerate(info, start=1):
                    ave_dev_score += r['dev_score']
                    ave_macro_score += r['macro']
                    ave_micro_score += r['micro']
                    param.append(r['param'])
                    tf_lit.extend(r['tf'])
                ave_dev_score = round(ave_dev_score / valid_num, 3)
                ave_macro_score = round(ave_macro_score / valid_num, 3)
                ave_micro_score = round(ave_micro_score / valid_num, 3)

                key = 'label{}_enc{}'.format(i, j)
                result_dic[key] = {
                    'dev': ave_dev_score,
                    'micro': ave_micro_score,
                    'macro': ave_macro_score,
                    'param': ' '.join(param),
                    'tf': tf_lit
                }

        best_score = max(result_dic, key=lambda x: result_dic[x]['dev'])
        with open(output_dir + 'merge.txt', 'w') as f:
            for k, v in sorted(result_dic.items()):
                f.write('{}: {}\n'.format(k, v))
            f.write('best score\n{}: {}\n'.format(best_score,
                                                  result_dic[best_score]))
        with open(output_dir + 'tf.txt', 'w') as f:
            for r in result_dic[best_score]['tf']:
                f.write(r + '\n')

    elif valid_type == 'FF':
        """
        model1: no valid directories
        model2: no valid directories
        """
        model_file_num = len(
            glob.glob(os.path.join(model_dir1, 'model_epoch_*.npz')))
        for i in tqdm(range(1, model_file_num + 1)):
            label, _ = dataset.load_score_file(
                model_dir1 + 'model_epoch_{}'.format(i))
            label_sub_lit = dataset.split_valid_data(label, valid_num)
            for j in range(1, model_file_num + 1):
                _, align = dataset.load_score_file(
                    model_dir2 + 'model_epoch_{}'.format(j))
                align_sub_lit = dataset.split_valid_data(align, valid_num)
                info = []
                for ite in range(1, valid_num + 1):
                    _, dev_data, test_data = dataset.separate_train_dev_test(
                        data_sub_lit, ite)
                    _, dev_label, test_label = dataset.separate_train_dev_test(
                        label_sub_lit, ite)
                    _, dev_align, test_align = dataset.separate_train_dev_test(
                        align_sub_lit, ite)

                    best_param_dic = evaluater.param_search(
                        dev_label, dev_align, dev_data)
                    param = max(best_param_dic,
                                key=lambda x: best_param_dic[x]['macro'])
                    init, mix = evaluate.key_to_param(param)
                    dev_score = round(best_param_dic[param]['macro'], 3)

                    rate, count, tf_lit, macro, micro = evaluater.eval_param(
                        test_label, test_align, test_data, init, mix)
                    test_macro_score = round(macro, 3)
                    test_micro_score = round(micro, 3)
                    info.append({
                        'dev_score': dev_score,
                        'param': param,
                        'macro': test_macro_score,
                        'micro': test_micro_score,
                        'tf': tf_lit
                    })

                ave_dev_score, ave_macro_score, ave_micro_score = 0, 0, 0
                param = []
                tf_lit = []
                for v, r in enumerate(info, start=1):
                    ave_dev_score += r['dev_score']
                    ave_macro_score += r['macro']
                    ave_micro_score += r['micro']
                    param.append(r['param'])
                    tf_lit.extend(r['tf'])
                ave_dev_score = round(ave_dev_score / valid_num, 3)
                ave_macro_score = round(ave_macro_score / valid_num, 3)
                ave_micro_score = round(ave_micro_score / valid_num, 3)

                key = 'label{}_enc{}'.format(i, j)
                result_dic[key] = {
                    'dev': ave_dev_score,
                    'micro': ave_micro_score,
                    'macro': ave_macro_score,
                    'param': ' '.join(param),
                    'tf': tf_lit
                }

        best_score = max(result_dic, key=lambda x: result_dic[x]['dev'])
        with open(output_dir + 'merge.txt', 'w') as f:
            for k, v in sorted(result_dic.items()):
                f.write('{}: {}\n'.format(k, v))
            f.write('best score\n{}: {}\n'.format(best_score,
                                                  result_dic[best_score]))
        with open(output_dir + 'tf.txt', 'w') as f:
            for r in result_dic[best_score]['tf']:
                f.write(r + '\n')
    elif valid_type == 'TF':
        """
        model1: has valid directories
        model2: no valid directories
        """
        model_file_num = len(
            glob.glob(os.path.join(model_dir1, 'valid1/model_epoch_*.npz')))
        label_dic = {}
        for i in range(1, model_file_num + 1):
            label_dic[i] = []
            for valid in [2, 3, 4, 5, 1]:
                label, _ = dataset.load_score_file(
                    model_dir1 + 'valid{}/model_epoch_{}'.format(valid, i))
                label_dic[i].append(label)
        # Cache the encdec alignments per epoch so that each (i, j) pair
        # below is scored with epoch j's alignments.
        align_dic = {}
        for j in range(1, model_file_num + 1):
            _, align = dataset.load_score_file(
                model_dir2 + 'model_epoch_{}'.format(j))
            align_dic[j] = dataset.split_valid_data(align, valid_num)

        # Fold-to-split indices for 5-fold cross-validation:
        # order[1] == [4, 5] means that on fold 1 the 4th sub-list is used
        # for validation and the 5th sub-list for testing.
        order = {1: [4, 5], 2: [5, 1], 3: [1, 2], 4: [2, 3], 5: [3, 4]}
        for i in tqdm(range(1, model_file_num + 1)):
            for j in range(1, model_file_num + 1):
                info = []
                for ite, v in order.items():
                    _, dev_data, test_data = dataset.separate_train_dev_test(
                        data_sub_lit, ite)
                    dev_label = label_dic[i][v[0] - 1]
                    test_label = label_dic[i][v[1] - 1]
                    _, dev_align, test_align = dataset.separate_train_dev_test(
                        align_dic[j], ite)

                    best_param_dic = evaluater.param_search(
                        dev_label, dev_align, dev_data)
                    param = max(best_param_dic,
                                key=lambda x: best_param_dic[x]['macro'])
                    init, mix = evaluate.key_to_param(param)
                    dev_score = round(best_param_dic[param]['macro'], 3)

                    rate, count, tf_lit, macro, micro = evaluater.eval_param(
                        test_label, test_align, test_data, init, mix)
                    test_macro_score = round(macro, 3)
                    test_micro_score = round(micro, 3)
                    info.append({
                        'dev_score': dev_score,
                        'param': param,
                        'macro': test_macro_score,
                        'micro': test_micro_score,
                        'tf': tf_lit
                    })

                ave_dev_score, ave_macro_score, ave_micro_score = 0, 0, 0
                param = []
                tf_lit = []
                for v, r in enumerate(info, start=1):
                    ave_dev_score += r['dev_score']
                    ave_macro_score += r['macro']
                    ave_micro_score += r['micro']
                    param.append(r['param'])
                    tf_lit.extend(r['tf'])
                ave_dev_score = round(ave_dev_score / valid_num, 3)
                ave_macro_score = round(ave_macro_score / valid_num, 3)
                ave_micro_score = round(ave_micro_score / valid_num, 3)

                key = 'label{}_enc{}'.format(i, j)
                result_dic[key] = {
                    'dev': ave_dev_score,
                    'micro': ave_micro_score,
                    'macro': ave_macro_score,
                    'param': ' '.join(param),
                    'tf': tf_lit
                }

        best_score = max(result_dic, key=lambda x: result_dic[x]['dev'])
        with open(output_dir + 'merge.txt', 'w') as f:
            for k, v in sorted(result_dic.items()):
                f.write('{}: {}\n'.format(k, v))
            f.write('best score\n{}: {}\n'.format(best_score,
                                                  result_dic[best_score]))
        with open(output_dir + 'tf.txt', 'w') as f:
            for r in result_dic[best_score]['tf']:
                f.write(r + '\n')
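# ---------------------------------------------------------------------------
# Minimal sketch of the command-line interface the merge script above expects
# from parse_args(). The real parser is defined elsewhere in the repository;
# the function below is an illustrative assumption that only covers the three
# attributes read above (label_model, encdec_model, valid).


def parse_args_sketch():
    import argparse
    parser = argparse.ArgumentParser(
        description='Merge label-model and encdec-model scores')
    parser.add_argument('--label_model',
                        help='path to a saved label model (its directory is '
                             'searched for *.ini and model_epoch_*.npz files)')
    parser.add_argument('--encdec_model',
                        help='path to a saved encoder-decoder model')
    parser.add_argument('--valid', choices=['TT', 'FF', 'TF'],
                        help='whether model1/model2 were trained with '
                             'per-fold valid directories')
    return parser.parse_args()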