def do_make_data(config): # raw_input("Press Enter to Continue 222") save_prefix_dir, save_prefix_fn = os.path.split(config.data.save_prefix) ensure_path(save_prefix_dir) config_fn = config.data.save_prefix + ".data.config" voc_fn = config.data.save_prefix + ".voc" data_fn = config.data.save_prefix + ".data.json.gz" # valid_data_fn = config.save_prefix + "." + config.model + ".valid.data.npz" # voc_fn_src = config.save_prefix + ".src.voc" # voc_fn_tgt = config.save_prefix + ".tgt.voc" files_that_will_be_created = [config_fn, voc_fn, data_fn] if config.processing.bpe_src is not None: bpe_data_file_src = config.data.save_prefix + ".src.bpe" files_that_will_be_created.append(bpe_data_file_src) if config.processing.bpe_tgt is not None: bpe_data_file_tgt = config.data.save_prefix + ".tgt.bpe" files_that_will_be_created.append(bpe_data_file_tgt) if config.processing.joint_bpe is not None: bpe_data_file_joint = config.data.save_prefix + ".joint.bpe" files_that_will_be_created.append(bpe_data_file_joint) already_existing_files = [] for filename in files_that_will_be_created: # , valid_data_fn]: if os.path.exists(filename): already_existing_files.append(filename) if len(already_existing_files ) > 0 and not config.processing.force_overwrite: print "Warning: existing files are going to be replaced: ", already_existing_files raw_input("Press Enter to Continue") if config.processing.use_voc is not None: log.info("loading voc from %s" % config.processing.use_voc) # src_voc, tgt_voc = json.load(open(config.use_voc)) # src_pp = processors.load_pp_from_data(json.load(open(src_voc))) # tgt_pp = IndexingPrePostProcessor.make_from_serializable(tgt_voc) bi_idx = processors.load_pp_pair_from_file(config.processing.use_voc) else: bi_idx = processors.BiIndexingPrePostProcessor( voc_limit1=config.processing.src_voc_size, voc_limit2=config.processing.tgt_voc_size) pp = processors.BiProcessorChain() if config.processing.source_char_conversion is not None: log.info("using source char conversion %s", config.processing.source_char_conversion) char_conv_dic = json.load( open(config.processing.source_char_conversion)) pp.add_src_processor( processors.SourceCharacterConverter(char_conv_dic)) if config.processing.latin_tgt: pp.add_tgt_processor( processors.LatinScriptProcess(config.processing.latin_type)) if config.processing.latin_src: pp.add_src_processor( processors.LatinScriptProcess(config.processing.latin_type)) pp.add_src_processor( processors.SimpleSegmenter( config.processing.src_segmentation_type)) if config.processing.bpe_src is not None: pp.add_src_processor( processors.BPEProcessing(bpe_data_file=bpe_data_file_src, symbols=config.processing.bpe_src, separator="._@@@")) pp.add_tgt_processor( processors.SimpleSegmenter( config.processing.tgt_segmentation_type)) if config.processing.bpe_tgt is not None: pp.add_tgt_processor( processors.BPEProcessing(bpe_data_file=bpe_data_file_tgt, symbols=config.processing.bpe_tgt, separator="._@@@")) if config.processing.joint_bpe is not None: pp.add_biprocessor( processors.JointBPEBiProcessor( bpe_data_file=bpe_data_file_joint, symbols=config.processing.joint_bpe, separator="._@@@")) bi_idx.add_preprocessor(pp) def load_data(src_fn, tgt_fn, max_nb_ex=None, infos_dict=None): training_data, stats_src, stats_tgt = processors.build_dataset_pp( src_fn, tgt_fn, bi_idx, max_nb_ex=max_nb_ex) log.info("src data stats:\n%s", stats_src.make_report()) log.info("tgt data stats:\n%s", stats_tgt.make_report()) if infos_dict is not None: infos_dict["src"] = stats_src.report_as_obj() infos_dict["tgt"] = stats_tgt.report_as_obj() return training_data infos = collections.OrderedDict() infos["train"] = collections.OrderedDict() log.info("loading training data from %s and %s" % (config.data.src_fn, config.data.tgt_fn)) training_data = load_data(config.data.src_fn, config.data.tgt_fn, max_nb_ex=config.data.max_nb_ex, infos_dict=infos["train"]) dev_data = None if config.data.dev_src is not None: log.info("loading dev data from %s and %s" % (config.data.dev_src, config.data.dev_tgt)) infos["dev"] = collections.OrderedDict() dev_data = load_data(config.data.dev_src, config.data.dev_tgt, infos_dict=infos["dev"]) test_data = None if config.data.test_src is not None: log.info("loading test data from %s and %s" % (config.data.test_src, config.data.test_tgt)) infos["test"] = collections.OrderedDict() test_data = load_data(config.data.test_src, config.data.test_tgt, infos_dict=infos["test"]) config.insert_section("infos", infos, even_if_readonly=True, keep_at_bottom="metadata", overwrite=False) # if config.shuffle: # log.info("shuffling data") # if config.enable_fast_shuffle: # shuffle_in_unison_faster(data_input, data_target) # else: # data_input, data_target = shuffle_in_unison(data_input, data_target) log.info("saving config to %s" % config_fn) config.save_to(config_fn) # json.dump(config.__dict__, open(config_fn, "w"), # indent=2, separators=(',', ': ')) log.info("saving voc to %s" % voc_fn) processors.save_pp_pair_to_file(bi_idx, voc_fn) # json.dump([src_pp.to_serializable(), tgt_pp.to_serializable()], # open(voc_fn, "w"), indent=2, separators=(',', ': ')) log.info("saving train_data to %s" % data_fn) data_all = {"train": training_data} if test_data is not None: data_all["test"] = test_data if dev_data is not None: data_all["dev"] = dev_data json.dump(data_all, gzip.open(data_fn, "wb"), indent=2, separators=(',', ': '))
def do_eval(config_eval): src_fn = config_eval.process.src_fn tgt_fn = config_eval.output.tgt_fn mode = config_eval.method.mode gpu = config_eval.process.gpu dest_fn = config_eval.process.dest_fn mb_size = config_eval.process.mb_size nb_steps = config_eval.method.nb_steps nb_steps_ratio = config_eval.method.nb_steps_ratio max_nb_ex = config_eval.process.max_nb_ex nbest_to_rescore = config_eval.output.nbest_to_rescore nbest = config_eval.output.nbest beam_width = config_eval.method.beam_width beam_pruning_margin = config_eval.method.beam_pruning_margin beam_score_length_normalization = config_eval.method.beam_score_length_normalization beam_score_length_normalization_strength = config_eval.method.beam_score_length_normalization_strength beam_score_coverage_penalty = config_eval.beam_score_coverage_penalty beam_score_coverage_penalty_strength = config_eval.beam_score_coverage_penalty_strength always_consider_eos_and_placeholders = config_eval.method.always_consider_eos_and_placeholders if config_eval.process.force_placeholders: # making it default for now always_consider_eos_and_placeholders = True post_score_length_normalization = config_eval.method.post_score_length_normalization post_score_length_normalization_strength = config_eval.method.post_score_length_normalization_strength groundhog = config_eval.method.groundhog tgt_unk_id = config_eval.output.tgt_unk_id force_finish = config_eval.method.force_finish prob_space_combination = config_eval.method.prob_space_combination generate_attention_html = config_eval.output.generate_attention_html rich_output_filename = config_eval.output.rich_output_filename ref = config_eval.output.ref dic = config_eval.output.dic normalize_unicode_unk = config_eval.output.normalize_unicode_unk attempt_to_relocate_unk_source = config_eval.output.attempt_to_relocate_unk_source remove_unk = config_eval.output.remove_unk post_score_coverage_penalty = config_eval.method.post_score_coverage_penalty post_score_coverage_penalty_strength = config_eval.method.post_score_coverage_penalty_strength time_start = time.perf_counter() astar_params = beam_search.AStarParams( astar_batch_size=config_eval.method.astar_batch_size, astar_max_queue_size=config_eval.method.astar_max_queue_size, astar_prune_margin=config_eval.method.astar_prune_margin, astar_prune_ratio=config_eval.method.astar_prune_ratio, length_normalization_exponent=config_eval.method. astar_length_normalization_exponent, length_normalization_constant=config_eval.method. astar_length_normalization_constant, astar_priority_eval_string=config_eval.method. astar_priority_eval_string, max_length_diff=config_eval.method.astar_max_length_diff) make_constraints_dict = None if config_eval.process.server is None: encdec_list, eos_idx, src_indexer, tgt_indexer, reverse_encdec, model_infos_list = create_encdec( config_eval) eval_dir_placeholder = "@eval@/" if dest_fn.startswith(eval_dir_placeholder): if config_eval.trained_model is not None: training_model_filename = config_eval.trained_model else: if len(config_eval.process.load_model_config) == 0: log.error("Cannot detect value for $eval$ placeholder") sys.exit(1) training_model_filename = config_eval.process.load_model_config[ 0] eval_dir = os.path.join(os.path.dirname(training_model_filename), "eval") dest_fn = os.path.join(eval_dir, dest_fn[len(eval_dir_placeholder):]) log.info("$eval$ detected. dest_fn is: %s ", dest_fn) ensure_path(eval_dir) if src_fn is None: (dev_src_from_config, dev_tgt_from_config, test_src_from_config, test_tgt_from_config ) = get_src_tgt_dev_from_config_eval(config_eval) if test_src_from_config is None: log.error( "Could not find value for source text, either on command line or in config files" ) sys.exit(1) log.info("using files from config as src:%s", test_src_from_config) src_fn = test_src_from_config if ref is None: log.info("using files from config as ref:%s", test_tgt_from_config) ref = test_tgt_from_config if config_eval.process.force_placeholders: if make_constraints_dict is None: make_constraints_dict = {} make_constraints_dict[ "ph_constraint"] = placeholder_constraints_builder( src_indexer, tgt_indexer, units_placeholders=config_eval.process.units_placeholders) if config_eval.process.bilingual_dic_for_reranking: if make_constraints_dict is None: make_constraints_dict = {} print("**making ja en dic") ja_en_search, en_ja_search = dictionnary_handling.load_search_trie( config_eval.process.bilingual_dic_for_reranking, config_eval.process.invert_bilingual_dic_for_reranking) print("**define constraints") make_constraints_dict[ "dic_constraint"] = dictionnary_handling.make_constraint( ja_en_search, en_ja_search, tgt_indexer) elif False: re_word = re.compile(r"[A-Za-z]+") re_digits = re.compile(r"\d+") def unsegment(s): res = [] for w in s.split(" "): if w.startswith("▁"): w = " " + w[1:] res.append(w) return "".join(res) def make_constraints(src, src_seq): line_src = unsegment(src) line_src = unicodedata.normalize('NFKC', line_src) word_list = [ word for word in re_word.findall(line_src) if len(word) > 3 ] digit_list = [ digit for digit in re_digits.findall(line_src) if len(digit) > 2 ] if len(word_list) == 0 and len(digit_list) == 0: def constraint_fn(tgt_seq): return 1 else: def constraint_fn(tgt_seq): tgt = tgt_indexer.deconvert(tgt_seq) line_tgt = unsegment(tgt) line_tgt = unicodedata.normalize('NFKC', line_tgt) matched_word = 0 for word in word_list: if word in line_ref: matched_word += 1 matched_digit = 0 for digit in digit_list: if digit in line_ref: matched_digit += 1 if matched_word == len( word_list) and matched_digit == len( digit_list): return 1 else: return (matched_word + matched_digit) / ( len(word_list) + len(digit_list)) return constraint_fn else: make_constraints_dict = None log.info("opening source file %s" % src_fn) preprocessed_input = build_dataset_one_side_pp( src_fn, src_pp=src_indexer, max_nb_ex=max_nb_ex, make_constraints_dict=make_constraints_dict) if make_constraints_dict is not None: src_data, stats_src_pp, constraints_list = preprocessed_input else: src_data, stats_src_pp = preprocessed_input constraints_list = None log.info("src data stats:\n%s", stats_src_pp.make_report()) translation_infos = OrderedNamespace() translation_infos["src"] = src_fn translation_infos["tgt"] = tgt_fn translation_infos["ref"] = ref for num_model, model_infos in enumerate(model_infos_list): translation_infos["model%i" % num_model] = model_infos if dest_fn is not None: save_eval_config_fn = dest_fn + ".eval.init.config.json" log.info("Saving initial eval config to %s" % save_eval_config_fn) config_eval.save_to(save_eval_config_fn) # log.info("%i sentences loaded" % make_data_infos.nb_ex) # log.info("#tokens src: %i of which %i (%f%%) are unknown"%(make_data_infos.total_token, # make_data_infos.total_count_unk, # float(make_data_infos.total_count_unk * 100) / # make_data_infos.total_token)) tgt_data = None if tgt_fn is not None: log.info("opening target file %s" % tgt_fn) tgt_data, stats_tgt_pp = build_dataset_one_side_pp(tgt_fn, src_pp=tgt_indexer, max_nb_ex=max_nb_ex) log.info("tgt data stats:\n%s", stats_tgt_pp.make_report()) # log.info("%i sentences loaded"%make_data_infos.nb_ex) # log.info("#tokens src: %i of which %i (%f%%) are unknown"%(make_data_infos.total_token, # make_data_infos.total_count_unk, # float(make_data_infos.total_count_unk * 100) / # make_data_infos.total_token)) # translations = greedy_batch_translate(encdec, eos_idx, src_data, batch_size = mb_size, gpu = args.gpu) time_all_loaded = time.perf_counter() if mode == "translate": log.info("writing translation of to %s" % dest_fn) with cuda.get_device_from_id(gpu): assert len(encdec_list) == 1 translations = greedy_batch_translate( encdec_list[0], eos_idx, src_data, batch_size=mb_size, gpu=gpu, nb_steps=nb_steps, use_chainerx=config_eval.process.use_chainerx) out = io.open(dest_fn, "wt", encoding="utf8") for t in translations: if t[-1] == eos_idx: t = t[:-1] ct = tgt_indexer.deconvert(t, unk_tag="#T_UNK#") # ct = convert_idx_to_string(t, tgt_voc + ["#T_UNK#"]) out.write(ct + "\n") elif mode == "beam_search" or mode == "eval_bleu" or mode == "astar_search" or mode == "astar_eval_bleu": if config_eval.process.server is not None: from nmt_chainer.translation.server import do_start_server do_start_server(config_eval) else: def translate_closure(beam_width, nb_steps_ratio): beam_search_params = beam_search.BeamSearchParams( beam_width=beam_width, beam_pruning_margin=beam_pruning_margin, beam_score_coverage_penalty=beam_score_coverage_penalty, beam_score_coverage_penalty_strength= beam_score_coverage_penalty_strength, beam_score_length_normalization= beam_score_length_normalization, beam_score_length_normalization_strength= beam_score_length_normalization_strength, force_finish=force_finish, use_unfinished_translation_if_none_found=True, always_consider_eos_and_placeholders= always_consider_eos_and_placeholders) translate_to_file_with_beam_search( dest_fn, gpu, encdec_list, eos_idx, src_data, beam_search_params=beam_search_params, nb_steps=nb_steps, nb_steps_ratio=nb_steps_ratio, post_score_length_normalization= post_score_length_normalization, post_score_length_normalization_strength= post_score_length_normalization_strength, post_score_coverage_penalty=post_score_coverage_penalty, post_score_coverage_penalty_strength= post_score_coverage_penalty_strength, groundhog=groundhog, tgt_unk_id=tgt_unk_id, tgt_indexer=tgt_indexer, prob_space_combination=prob_space_combination, reverse_encdec=reverse_encdec, generate_attention_html=generate_attention_html, src_indexer=src_indexer, rich_output_filename=rich_output_filename, unprocessed_output_filename=dest_fn + ".unprocessed", nbest=nbest, constraints_fn_list=constraints_list, use_astar=(mode == "astar_search" or mode == "astar_eval_bleu"), astar_params=astar_params, use_chainerx=config_eval.process.use_chainerx) translation_infos["dest"] = dest_fn translation_infos["unprocessed"] = dest_fn + ".unprocessed" if mode == "eval_bleu" or mode == "astar_eval_bleu": if ref is not None: bc = bleu_computer.get_bc_from_files(ref, dest_fn) print("bleu before unk replace:", bc) translation_infos["bleu"] = bc.bleu() translation_infos["bleu_infos"] = str(bc) else: print("bleu before unk replace: No Ref Provided") from nmt_chainer.utilities import replace_tgt_unk replace_tgt_unk.replace_unk( dest_fn, src_fn, dest_fn + ".unk_replaced", dic, remove_unk, normalize_unicode_unk, attempt_to_relocate_unk_source) translation_infos[ "unk_replaced"] = dest_fn + ".unk_replaced" if ref is not None: bc = bleu_computer.get_bc_from_files( ref, dest_fn + ".unk_replaced") print("bleu after unk replace:", bc) translation_infos["post_unk_bleu"] = bc.bleu() translation_infos["post_unk_bleu_infos"] = str(bc) else: print("bleu before unk replace: No Ref Provided") return -bc.bleu() else: return None if config_eval.process.do_hyper_param_search is not None: study_filename, study_name, n_trials = do_hyper_param_search n_trials = int(n_trials) import optuna def objective(trial): nb_steps_ratio = trial.suggest_uniform( 'nb_steps_ratio', 0.9, 3.5) beam_width = trial.suggest_int("beam_width", 2, 50) return translate_closure(beam_width, nb_steps_ratio) study = optuna.create_study(study_name=study_name, storage="sqlite:///" + study_filename) study.optimize(objective, n_trials=n_trials) print(study.best_params) print(study.best_value) print(study.best_trial) else: # hyperparams optim translate_closure(beam_width, nb_steps_ratio) elif mode == "translate_attn": log.info("writing translation + attention as html to %s" % dest_fn) with cuda.get_device_from_id(gpu): assert len(encdec_list) == 1 translations, attn_all = greedy_batch_translate( encdec_list[0], eos_idx, src_data, batch_size=mb_size, gpu=gpu, get_attention=True, nb_steps=nb_steps, use_chainerx=config_eval.process.use_chainerx) # tgt_voc_with_unk = tgt_voc + ["#T_UNK#"] # src_voc_with_unk = src_voc + ["#S_UNK#"] assert len(translations) == len(src_data) assert len(attn_all) == len(src_data) attn_vis = AttentionVisualizer() for num_t in six.moves.range(len(src_data)): src_idx_list = src_data[num_t] tgt_idx_list = translations[num_t][:-1] attn = attn_all[num_t] # assert len(attn) == len(tgt_idx_list) src_w = src_indexer.deconvert_swallow( src_idx_list, unk_tag="#S_UNK#") + ["SUM_ATTN"] tgt_w = tgt_indexer.deconvert_swallow(tgt_idx_list, unk_tag="#T_UNK#") # src_w = [src_voc_with_unk[idx] for idx in src_idx_list] + ["SUM_ATTN"] # tgt_w = [tgt_voc_with_unk[idx] for idx in tgt_idx_list] # for j in six.moves.range(len(tgt_idx_list)): # tgt_idx_list.append(tgt_voc_with_unk[t_and_attn[j][0]]) # # print([src_voc_with_unk[idx] for idx in src_idx_list], tgt_idx_list) attn_vis.add_plot(src_w, tgt_w, attn) attn_vis.make_plot(dest_fn) elif mode == "align": import nmt_chainer.utilities.visualisation as visualisation assert tgt_data is not None assert len(tgt_data) == len(src_data) log.info("writing alignment as html to %s" % dest_fn) with cuda.get_device_from_id(gpu): assert len(encdec_list) == 1 loss, attn_all = batch_align( encdec_list[0], eos_idx, list(six.moves.zip(src_data, tgt_data)), batch_size=mb_size, gpu=gpu, use_chainerx=config_eval.process.use_chainerx) # tgt_voc_with_unk = tgt_voc + ["#T_UNK#"] # src_voc_with_unk = src_voc + ["#S_UNK#"] assert len(attn_all) == len(src_data) plots_list = [] for num_t in six.moves.range(len(src_data)): src_idx_list = src_data[num_t] tgt_idx_list = tgt_data[num_t] attn = attn_all[num_t] # assert len(attn) == len(tgt_idx_list) alignment = np.zeros((len(src_idx_list) + 1, len(tgt_idx_list))) sum_al = [0] * len(tgt_idx_list) for i in six.moves.range(len(src_idx_list)): for j in six.moves.range(len(tgt_idx_list)): alignment[i, j] = attn[j][i] sum_al[j] += alignment[i, j] for j in six.moves.range(len(tgt_idx_list)): alignment[len(src_idx_list), j] = sum_al[j] src_w = src_indexer.deconvert_swallow( src_idx_list, unk_tag="#S_UNK#") + ["SUM_ATTN"] tgt_w = tgt_indexer.deconvert_swallow(tgt_idx_list, unk_tag="#T_UNK#") # src_w = [src_voc_with_unk[idx] for idx in src_idx_list] + ["SUM_ATTN"] # tgt_w = [tgt_voc_with_unk[idx] for idx in tgt_idx_list] # for j in six.moves.range(len(tgt_idx_list)): # tgt_idx_list.append(tgt_voc_with_unk[t_and_attn[j][0]]) # # print([src_voc_with_unk[idx] for idx in src_idx_list], tgt_idx_list) p1 = visualisation.make_alignment_figure(src_w, tgt_w, alignment) # p2 = visualisation.make_alignment_figure( # [src_voc_with_unk[idx] for idx in src_idx_list], tgt_idx_list, alignment) plots_list.append(p1) p_all = visualisation.Column(*plots_list) visualisation.output_file(dest_fn) visualisation.show(p_all) # for t in translations_with_attn: # for x, attn in t: # print(x, attn) # out.write(convert_idx_to_string([x for x, attn in t], tgt_voc + ["#T_UNK#"]) + "\n") elif mode == "score_nbest": log.info("opening nbest file %s" % nbest_to_rescore) nbest_f = io.open(nbest_to_rescore, 'rt', encoding="utf8") nbest_list = [[]] for line in nbest_f: line = line.strip().split("|||") num_src = int(line[0].strip()) if num_src >= len(nbest_list): assert num_src == len(nbest_list) if max_nb_ex is not None and num_src >= max_nb_ex: break nbest_list.append([]) else: assert num_src == len(nbest_list) - 1 sentence = line[1].strip() nbest_list[-1].append(sentence.split(" ")) log.info("found nbest lists for %i source sentences" % len(nbest_list)) nbest_converted, make_data_infos = make_data.build_dataset_for_nbest_list_scoring( tgt_indexer, nbest_list) log.info("total %i sentences loaded" % make_data_infos.nb_ex) log.info("#tokens src: %i of which %i (%f%%) are unknown" % (make_data_infos.total_token, make_data_infos.total_count_unk, float(make_data_infos.total_count_unk * 100) / make_data_infos.total_token)) if len(nbest_list) != len(src_data[:max_nb_ex]): log.warn("mismatch in lengths nbest vs src : %i != %i" % (len(nbest_list), len(src_data[:max_nb_ex]))) assert len(nbest_list) == len(src_data[:max_nb_ex]) log.info("starting scoring") from nmt_chainer.utilities import utils res = [] for num in six.moves.range(len(nbest_converted)): if num % 200 == 0: print(num, file=sys.stderr) elif num % 50 == 0: print("*", file=sys.stderr) res.append([]) src, tgt_list = src_data[num], nbest_converted[num] src_batch, src_mask = utils.make_batch_src([src], gpu=gpu, volatile="on") assert len(encdec_list) == 1 scorer = encdec_list[0].nbest_scorer(src_batch, src_mask) nb_batches = (len(tgt_list) + mb_size - 1) // mb_size for num_batch in six.moves.range(nb_batches): tgt_batch, arg_sort = utils.make_batch_tgt( tgt_list[num_batch * nb_batches:(num_batch + 1) * nb_batches], eos_idx=eos_idx, gpu=gpu, volatile="on", need_arg_sort=True) scores, attn = scorer(tgt_batch) scores, _ = scores scores = scores.data assert len(arg_sort) == len(scores) de_sorted_scores = [None] * len(scores) for xpos in six.moves.range(len(arg_sort)): original_pos = arg_sort[xpos] de_sorted_scores[original_pos] = scores[xpos] res[-1] += de_sorted_scores print('', file=sys.stderr) log.info("writing scores to %s" % dest_fn) out = io.open(dest_fn, "wt", encoding="utf8") for num in six.moves.range(len(res)): for score in res[num]: out.write("%i %f\n" % (num, score)) time_end = time.perf_counter() translation_infos["loading_time"] = time_all_loaded - time_start translation_infos["translation_time"] = time_end - time_all_loaded translation_infos["total_time"] = time_end - time_start if dest_fn is not None: config_eval_session = config_eval.copy(readonly=False) config_eval_session.add_section("translation_infos", keep_at_bottom="metadata") config_eval_session["translation_infos"] = translation_infos config_eval_session.set_metadata_modified_time() save_eval_config_fn = dest_fn + ".eval.config.json" log.info("Saving eval config to %s" % save_eval_config_fn) config_eval_session.save_to(save_eval_config_fn)
def do_train(config_training): src_indexer, tgt_indexer = load_voc_and_update_training_config(config_training) save_prefix = config_training.training_management.save_prefix output_files_dict = {} output_files_dict["train_config"] = save_prefix + ".train.config" output_files_dict["model_ckpt"] = save_prefix + ".model." + "ckpt" + ".npz" output_files_dict["model_final"] = save_prefix + \ ".model." + "final" + ".npz" output_files_dict["model_best"] = save_prefix + ".model." + "best" + ".npz" output_files_dict["model_best_loss"] = save_prefix + ".model." + "best_loss" + ".npz" # output_files_dict["model_ckpt_config"] = save_prefix + ".model." + "ckpt" + ".config" # output_files_dict["model_final_config"] = save_prefix + ".model." + "final" + ".config" # output_files_dict["model_best_config"] = save_prefix + ".model." + "best" + ".config" # output_files_dict["model_best_loss_config"] = save_prefix + ".model." + "best_loss" + ".config" output_files_dict["test_translation_output"] = save_prefix + ".test.out" output_files_dict["test_src_output"] = save_prefix + ".test.src.out" output_files_dict["dev_translation_output"] = save_prefix + ".dev.out" output_files_dict["dev_src_output"] = save_prefix + ".dev.src.out" output_files_dict["valid_translation_output"] = save_prefix + ".valid.out" output_files_dict["valid_src_output"] = save_prefix + ".valid.src.out" output_files_dict["sqlite_db"] = save_prefix + ".result.sqlite" output_files_dict["optimizer_ckpt"] = save_prefix + ".optimizer." + "ckpt" + ".npz" output_files_dict["optimizer_final"] = save_prefix + ".optimizer." + "final" + ".npz" save_prefix_dir, save_prefix_fn = os.path.split(save_prefix) ensure_path(save_prefix_dir) already_existing_files = [] for key_info, filename in output_files_dict.iteritems(): # , valid_data_fn]: if os.path.exists(filename): already_existing_files.append(filename) if len(already_existing_files) > 0: print "Warning: existing files are going to be replaced / updated: ", already_existing_files if not config_training.training_management.force_overwrite: raw_input("Press Enter to Continue") save_train_config_fn = output_files_dict["train_config"] log.info("Saving training config to %s" % save_train_config_fn) config_training.save_to(save_train_config_fn) # json.dump(config_training, open(save_train_config_fn, "w"), indent=2, separators=(',', ': ')) Vi = len(src_indexer) # + UNK Vo = len(tgt_indexer) # + UNK eos_idx = Vo data_fn = config_training.data.data_fn log.info("loading training data from %s" % data_fn) training_data_all = json.load(gzip.open(data_fn, "rb")) training_data = training_data_all["train"] log.info("loaded %i sentences as training data" % len(training_data)) if "test" in training_data_all: test_data = training_data_all["test"] log.info("Found test data: %i sentences" % len(test_data)) else: test_data = None log.info("No test data found") if "dev" in training_data_all: dev_data = training_data_all["dev"] log.info("Found dev data: %i sentences" % len(dev_data)) else: dev_data = None log.info("No dev data found") if "valid" in training_data_all: valid_data = training_data_all["valid"] log.info("Found valid data: %i sentences" % len(valid_data)) else: valid_data = None log.info("No valid data found") max_src_tgt_length = config_training.training_management.max_src_tgt_length if max_src_tgt_length is not None: log.info("filtering sentences of length larger than %i" % (max_src_tgt_length)) filtered_training_data = [] nb_filtered = 0 for src, tgt in training_data: if len(src) <= max_src_tgt_length and len( tgt) <= max_src_tgt_length: filtered_training_data.append((src, tgt)) else: nb_filtered += 1 log.info("filtered %i sentences of length larger than %i" % (nb_filtered, max_src_tgt_length)) training_data = filtered_training_data if not config_training.training.no_shuffle_of_training_data: log.info("shuffling") import random random.shuffle(training_data) log.info("done") encdec, _, _, _ = create_encdec_and_indexers_from_config_dict(config_training, src_indexer=src_indexer, tgt_indexer=tgt_indexer, load_config_model="if_exists" if config_training.training_management.resume else "no") # create_encdec_from_config_dict(config_training.model, src_indexer, tgt_indexer, # load_config_model = "if_exists" if config_training.training_management.resume else "no") # if config_training.training_management.resume: # if "model_parameters" not in config_training: # log.error("cannot find model parameters in config file") # if config_training.model_parameters.type == "model": # model_filename = config_training.model_parameters.filename # log.info("resuming from model parameters %s" % model_filename) # serializers.load_npz(model_filename, encdec) if config_training.training_management.load_model is not None: log.info("loading model parameters from %s", config_training.training_management.load_model) serializers.load_npz(config_training.training_management.load_model, encdec) gpu = config_training.training_management.gpu if gpu is not None: encdec = encdec.to_gpu(gpu) if config_training.training.optimizer == "adadelta": optimizer = optimizers.AdaDelta() elif config_training.training.optimizer == "adam": optimizer = optimizers.Adam() elif config_training.training.optimizer == "adagrad": optimizer = optimizers.AdaGrad(lr=config_training.training.learning_rate) elif config_training.training.optimizer == "sgd": optimizer = optimizers.SGD(lr=config_training.training.learning_rate) elif config_training.training.optimizer == "momentum": optimizer = optimizers.MomentumSGD(lr=config_training.training.learning_rate, momentum=config_training.training.momentum) elif config_training.training.optimizer == "nesterov": optimizer = optimizers.NesterovAG(lr=config_training.training.learning_rate, momentum=config_training.training.momentum) elif config_training.training.optimizer == "rmsprop": optimizer = optimizers.RMSprop(lr=config_training.training.learning_rate) elif config_training.training.optimizer == "rmspropgraves": optimizer = optimizers.RMSpropGraves(lr=config_training.training.learning_rate, momentum=config_training.training.momentum) else: raise NotImplemented with cuda.get_device(gpu): optimizer.setup(encdec) if config_training.training.l2_gradient_clipping is not None and config_training.training.l2_gradient_clipping > 0: optimizer.add_hook(chainer.optimizer.GradientClipping( config_training.training.l2_gradient_clipping)) if config_training.training.hard_gradient_clipping is not None and config_training.training.hard_gradient_clipping > 0: optimizer.add_hook(chainer.optimizer.GradientHardClipping( *config_training.training.hard_gradient_clipping)) if config_training.training.weight_decay is not None: optimizer.add_hook( chainer.optimizer.WeightDecay( config_training.training.weight_decay)) if config_training.training_management.load_optimizer_state is not None: with cuda.get_device(gpu): log.info("loading optimizer parameters from %s", config_training.training_management.load_optimizer_state) serializers.load_npz(config_training.training_management.load_optimizer_state, optimizer) if config_training.training_management.timer_hook: timer_hook = profiling_tools.MyTimerHook else: import contextlib @contextlib.contextmanager def timer_hook(): yield import training_chainer with cuda.get_device(gpu): with timer_hook() as timer_infos: if config_training.training_management.max_nb_iters is not None: stop_trigger = ( config_training.training_management.max_nb_iters, "iteration") if config_training.training_management.max_nb_epochs is not None: log.warn( "max_nb_iters and max_nb_epochs both specified. Only max_nb_iters will be considered.") elif config_training.training_management.max_nb_epochs is not None: stop_trigger = ( config_training.training_management.max_nb_epochs, "epoch") else: stop_trigger = None training_chainer.train_on_data_chainer(encdec, optimizer, training_data, output_files_dict, src_indexer, tgt_indexer, eos_idx=eos_idx, config_training=config_training, stop_trigger=stop_trigger, test_data=test_data, dev_data=dev_data, valid_data=valid_data )
def do_recap(args): data_dir = os.path.join(args.target_dir, "data") train_dir = os.path.join(args.target_dir, "train") eval_dir = os.path.join(args.target_dir, "eval") ensure_path(args.target_dir) ensure_path(data_dir) ensure_path(train_dir) ensure_path(eval_dir) index = open(os.path.join(args.target_dir, "index.html"), "w") index.write("<html><body>") data_urlname_list = defaultdict(list) train_urlname_list = defaultdict(list) data_config_fn_list = [] eval_config_fn_list = [] itdir = os.walk(args.source_dir) data_to_train = defaultdict(list) train_to_data = {} for current_dir, dirs, files in itdir: for fn in files: if fn.endswith(train_config_suffix): fn_full = os.path.join(current_dir, fn) urlname, data_prefix, time_last_exp, infos, description = process_train_config( fn_full, train_dir) data_to_train[data_prefix].append(urlname) train_to_data[urlname] = data_prefix train_urlname_list[data_prefix].append( (time_last_exp, urlname, infos, description)) elif fn.endswith(data_config_suffix): fn_full = os.path.join(current_dir, fn) data_config_fn_list.append(fn_full) elif fn.endswith(eval_config_suffix): fn_full = os.path.join(current_dir, fn) eval_config_fn_list.append(fn_full) else: pass data_to_srctgt = {} for fn_full in data_config_fn_list: urlname, data_prefix, src_tgt_fn, time_config_created, src_voc_size, tgt_voc_size = process_data_config( fn_full, data_dir, data_to_train) data_urlname_list[src_tgt_fn].append( (time_config_created, urlname, data_prefix, src_voc_size, tgt_voc_size)) data_to_srctgt[data_prefix] = src_tgt_fn index.write("<h1>DATA</h1><p>") for src_tgt_fn, urlname_list in data_urlname_list.iteritems(): index.write("<h3>** src: %s | tgt: %s **</h3>" % src_tgt_fn) urlname_list.sort(reverse=True) for time_config_created, urlname, data_prefix, src_voc_size, tgt_voc_size in urlname_list: index.write('%s s:%i t:%i \t<a href = "data/%s">%s</a><p/>' % (time.ctime(time_config_created), src_voc_size, tgt_voc_size, urlname, data_prefix)) train_urlname_list_src_tgt = defaultdict(list) for data_path, urlname_list in train_urlname_list.iteritems(): if data_path in data_to_srctgt: train_urlname_list_src_tgt[ data_to_srctgt[data_path]] += urlname_list else: train_urlname_list_src_tgt[("unk", "unk")] += urlname_list current_time = time.time() index.write("<h1>TRAIN</h1><p>") for src_tgt_fn in sorted( train_urlname_list_src_tgt.keys(), key=lambda x: max(train_urlname_list_src_tgt[x][i][0] for i in xrange( len(train_urlname_list_src_tgt[x]))), reverse=True): urlname_list = train_urlname_list_src_tgt[src_tgt_fn] index.write("<h3>** src: %s | tgt: %s **</h3>" % src_tgt_fn) urlname_list.sort(reverse=True) for time_last_exp, urlname, infos, description in urlname_list: if abs(time_last_exp - current_time) < 3000: recently_updated = True else: recently_updated = False if recently_updated: timestring = "<b>%s [RCT]</b>" % time.ctime(time_last_exp) else: timestring = "%s" % time.ctime(time_last_exp) index.write( '%s <a href = "train/%s">%s</a> [%s]<p/>' % (timestring, urlname, urlname.split(dir_sep)[-1], description)) if infos is not None: for key in sorted(infos.keys()): index.write("%s : %r ||| " % (key, infos[key])) index.write("<p>") index.write("<h1>EVAL</h1><p>") for fn_full in eval_config_fn_list: urlname, desc = process_eval_config(fn_full, eval_dir) index.write('<a href = "eval/%s">%s</a> <b>%f</b> %s [%i]<p/>' % (urlname, urlname.split(dir_sep)[-1], desc["bleu"], desc["description_training"], desc["nb_models_used"])) index.write("</body></html>")
def do_eval(config_eval): src_fn = config_eval.process.src_fn tgt_fn = config_eval.output.tgt_fn mode = config_eval.method.mode gpu = config_eval.process.gpu dest_fn = config_eval.process.dest_fn mb_size = config_eval.process.mb_size nb_steps = config_eval.method.nb_steps nb_steps_ratio = config_eval.method.nb_steps_ratio max_nb_ex = config_eval.process.max_nb_ex nbest_to_rescore = config_eval.output.nbest_to_rescore nbest = config_eval.output.nbest beam_width = config_eval.method.beam_width beam_pruning_margin = config_eval.method.beam_pruning_margin beam_score_length_normalization = config_eval.method.beam_score_length_normalization beam_score_length_normalization_strength = config_eval.method.beam_score_length_normalization_strength beam_score_coverage_penalty = config_eval.beam_score_coverage_penalty beam_score_coverage_penalty_strength = config_eval.beam_score_coverage_penalty_strength post_score_length_normalization = config_eval.method.post_score_length_normalization post_score_length_normalization_strength = config_eval.method.post_score_length_normalization_strength groundhog = config_eval.method.groundhog tgt_unk_id = config_eval.output.tgt_unk_id force_finish = config_eval.method.force_finish prob_space_combination = config_eval.method.prob_space_combination generate_attention_html = config_eval.output.generate_attention_html rich_output_filename = config_eval.output.rich_output_filename ref = config_eval.output.ref dic = config_eval.output.dic normalize_unicode_unk = config_eval.output.normalize_unicode_unk attempt_to_relocate_unk_source = config_eval.output.attempt_to_relocate_unk_source remove_unk = config_eval.output.remove_unk post_score_coverage_penalty = config_eval.method.post_score_coverage_penalty post_score_coverage_penalty_strength = config_eval.method.post_score_coverage_penalty_strength time_start = time.clock() encdec_list, eos_idx, src_indexer, tgt_indexer, reverse_encdec, model_infos_list = create_encdec(config_eval) if config_eval.process.server is None: eval_dir_placeholder = "@eval@/" if dest_fn.startswith(eval_dir_placeholder): if config_eval.trained_model is not None: training_model_filename = config_eval.trained_model else: if len(config_eval.process.load_model_config) == 0: log.error("Cannot detect value for $eval$ placeholder") sys.exit(1) training_model_filename = config_eval.process.load_model_config[0] eval_dir = os.path.join(os.path.dirname(training_model_filename), "eval") dest_fn = os.path.join(eval_dir, dest_fn[len(eval_dir_placeholder):]) log.info("$eval$ detected. dest_fn is: %s ", dest_fn) ensure_path(eval_dir) if src_fn is None: (dev_src_from_config, dev_tgt_from_config, test_src_from_config, test_tgt_from_config) = get_src_tgt_dev_from_config_eval(config_eval) if test_src_from_config is None: log.error("Could not find value for source text, either on command line or in config files") sys.exit(1) log.info("using files from config as src:%s", test_src_from_config) src_fn = test_src_from_config if ref is None: log.info("using files from config as ref:%s", test_tgt_from_config) ref = test_tgt_from_config log.info("opening source file %s" % src_fn) src_data, stats_src_pp = build_dataset_one_side_pp(src_fn, src_pp=src_indexer, max_nb_ex=max_nb_ex) log.info("src data stats:\n%s", stats_src_pp.make_report()) if dest_fn is not None: save_eval_config_fn = dest_fn + ".eval.init.config.json" log.info("Saving initial eval config to %s" % save_eval_config_fn) config_eval.save_to(save_eval_config_fn) translation_infos = OrderedNamespace() # log.info("%i sentences loaded" % make_data_infos.nb_ex) # log.info("#tokens src: %i of which %i (%f%%) are unknown"%(make_data_infos.total_token, # make_data_infos.total_count_unk, # float(make_data_infos.total_count_unk * 100) / # make_data_infos.total_token)) tgt_data = None if tgt_fn is not None: log.info("opening target file %s" % tgt_fn) tgt_data, stats_tgt_pp = build_dataset_one_side_pp(tgt_fn, src_pp=tgt_indexer, max_nb_ex=max_nb_ex) log.info("tgt data stats:\n%s", stats_tgt_pp.make_report()) # log.info("%i sentences loaded"%make_data_infos.nb_ex) # log.info("#tokens src: %i of which %i (%f%%) are unknown"%(make_data_infos.total_token, # make_data_infos.total_count_unk, # float(make_data_infos.total_count_unk * 100) / # make_data_infos.total_token)) # translations = greedy_batch_translate(encdec, eos_idx, src_data, batch_size = mb_size, gpu = args.gpu) translation_infos["src"] = src_fn translation_infos["tgt"] = tgt_fn translation_infos["ref"] = ref for num_model, model_infos in enumerate(model_infos_list): translation_infos["model%i" % num_model] = model_infos time_all_loaded = time.clock() if mode == "translate": log.info("writing translation of to %s" % dest_fn) with cuda.get_device(gpu): assert len(encdec_list) == 1 translations = greedy_batch_translate( encdec_list[0], eos_idx, src_data, batch_size=mb_size, gpu=gpu, nb_steps=nb_steps) out = codecs.open(dest_fn, "w", encoding="utf8") for t in translations: if t[-1] == eos_idx: t = t[:-1] ct = tgt_indexer.deconvert(t, unk_tag="#T_UNK#") # ct = convert_idx_to_string(t, tgt_voc + ["#T_UNK#"]) out.write(ct + "\n") elif mode == "beam_search" or mode == "eval_bleu": if config_eval.process.server is not None: from nmt_chainer.translation.server import do_start_server do_start_server(config_eval) else: translate_to_file_with_beam_search(dest_fn, gpu, encdec_list, eos_idx, src_data, beam_width=beam_width, beam_pruning_margin=beam_pruning_margin, beam_score_coverage_penalty=beam_score_coverage_penalty, beam_score_coverage_penalty_strength=beam_score_coverage_penalty_strength, nb_steps=nb_steps, nb_steps_ratio=nb_steps_ratio, beam_score_length_normalization=beam_score_length_normalization, beam_score_length_normalization_strength=beam_score_length_normalization_strength, post_score_length_normalization=post_score_length_normalization, post_score_length_normalization_strength=post_score_length_normalization_strength, post_score_coverage_penalty=post_score_coverage_penalty, post_score_coverage_penalty_strength=post_score_coverage_penalty_strength, groundhog=groundhog, tgt_unk_id=tgt_unk_id, tgt_indexer=tgt_indexer, force_finish=force_finish, prob_space_combination=prob_space_combination, reverse_encdec=reverse_encdec, generate_attention_html=generate_attention_html, src_indexer=src_indexer, rich_output_filename=rich_output_filename, use_unfinished_translation_if_none_found=True, unprocessed_output_filename=dest_fn + ".unprocessed", nbest=nbest) translation_infos["dest"] = dest_fn translation_infos["unprocessed"] = dest_fn + ".unprocessed" if mode == "eval_bleu": if ref is not None: bc = bleu_computer.get_bc_from_files(ref, dest_fn) print "bleu before unk replace:", bc translation_infos["bleu"] = bc.bleu() translation_infos["bleu_infos"] = str(bc) else: print "bleu before unk replace: No Ref Provided" from nmt_chainer.utilities import replace_tgt_unk replace_tgt_unk.replace_unk(dest_fn, src_fn, dest_fn + ".unk_replaced", dic, remove_unk, normalize_unicode_unk, attempt_to_relocate_unk_source) translation_infos["unk_replaced"] = dest_fn + ".unk_replaced" if ref is not None: bc = bleu_computer.get_bc_from_files(ref, dest_fn + ".unk_replaced") print "bleu after unk replace:", bc translation_infos["post_unk_bleu"] = bc.bleu() translation_infos["post_unk_bleu_infos"] = str(bc) else: print "bleu before unk replace: No Ref Provided" elif mode == "translate_attn": log.info("writing translation + attention as html to %s" % dest_fn) with cuda.get_device(gpu): assert len(encdec_list) == 1 translations, attn_all = greedy_batch_translate( encdec_list[0], eos_idx, src_data, batch_size=mb_size, gpu=gpu, get_attention=True, nb_steps=nb_steps) # tgt_voc_with_unk = tgt_voc + ["#T_UNK#"] # src_voc_with_unk = src_voc + ["#S_UNK#"] assert len(translations) == len(src_data) assert len(attn_all) == len(src_data) attn_vis = AttentionVisualizer() for num_t in xrange(len(src_data)): src_idx_list = src_data[num_t] tgt_idx_list = translations[num_t][:-1] attn = attn_all[num_t] # assert len(attn) == len(tgt_idx_list) src_w = src_indexer.deconvert_swallow(src_idx_list, unk_tag="#S_UNK#") + ["SUM_ATTN"] tgt_w = tgt_indexer.deconvert_swallow(tgt_idx_list, unk_tag="#T_UNK#") # src_w = [src_voc_with_unk[idx] for idx in src_idx_list] + ["SUM_ATTN"] # tgt_w = [tgt_voc_with_unk[idx] for idx in tgt_idx_list] # for j in xrange(len(tgt_idx_list)): # tgt_idx_list.append(tgt_voc_with_unk[t_and_attn[j][0]]) # # print [src_voc_with_unk[idx] for idx in src_idx_list], tgt_idx_list attn_vis.add_plot(src_w, tgt_w, attn) attn_vis.make_plot(dest_fn) elif mode == "align": import nmt_chainer.utilities.visualisation as visualisation assert tgt_data is not None assert len(tgt_data) == len(src_data) log.info("writing alignment as html to %s" % dest_fn) with cuda.get_device(gpu): assert len(encdec_list) == 1 loss, attn_all = batch_align( encdec_list[0], eos_idx, zip(src_data, tgt_data), batch_size=mb_size, gpu=gpu) # tgt_voc_with_unk = tgt_voc + ["#T_UNK#"] # src_voc_with_unk = src_voc + ["#S_UNK#"] assert len(attn_all) == len(src_data) plots_list = [] for num_t in xrange(len(src_data)): src_idx_list = src_data[num_t] tgt_idx_list = tgt_data[num_t] attn = attn_all[num_t] # assert len(attn) == len(tgt_idx_list) alignment = np.zeros((len(src_idx_list) + 1, len(tgt_idx_list))) sum_al = [0] * len(tgt_idx_list) for i in xrange(len(src_idx_list)): for j in xrange(len(tgt_idx_list)): alignment[i, j] = attn[j][i] sum_al[j] += alignment[i, j] for j in xrange(len(tgt_idx_list)): alignment[len(src_idx_list), j] = sum_al[j] src_w = src_indexer.deconvert_swallow(src_idx_list, unk_tag="#S_UNK#") + ["SUM_ATTN"] tgt_w = tgt_indexer.deconvert_swallow(tgt_idx_list, unk_tag="#T_UNK#") # src_w = [src_voc_with_unk[idx] for idx in src_idx_list] + ["SUM_ATTN"] # tgt_w = [tgt_voc_with_unk[idx] for idx in tgt_idx_list] # for j in xrange(len(tgt_idx_list)): # tgt_idx_list.append(tgt_voc_with_unk[t_and_attn[j][0]]) # # print [src_voc_with_unk[idx] for idx in src_idx_list], tgt_idx_list p1 = visualisation.make_alignment_figure( src_w, tgt_w, alignment) # p2 = visualisation.make_alignment_figure( # [src_voc_with_unk[idx] for idx in src_idx_list], tgt_idx_list, alignment) plots_list.append(p1) p_all = visualisation.Column(*plots_list) visualisation.output_file(dest_fn) visualisation.show(p_all) # for t in translations_with_attn: # for x, attn in t: # print x, attn # out.write(convert_idx_to_string([x for x, attn in t], tgt_voc + ["#T_UNK#"]) + "\n") elif mode == "score_nbest": log.info("opening nbest file %s" % nbest_to_rescore) nbest_f = codecs.open(nbest_to_rescore, encoding="utf8") nbest_list = [[]] for line in nbest_f: line = line.strip().split("|||") num_src = int(line[0].strip()) if num_src >= len(nbest_list): assert num_src == len(nbest_list) if max_nb_ex is not None and num_src >= max_nb_ex: break nbest_list.append([]) else: assert num_src == len(nbest_list) - 1 sentence = line[1].strip() nbest_list[-1].append(sentence.split(" ")) log.info("found nbest lists for %i source sentences" % len(nbest_list)) nbest_converted, make_data_infos = make_data.build_dataset_for_nbest_list_scoring(tgt_indexer, nbest_list) log.info("total %i sentences loaded" % make_data_infos.nb_ex) log.info("#tokens src: %i of which %i (%f%%) are unknown" % (make_data_infos.total_token, make_data_infos.total_count_unk, float(make_data_infos.total_count_unk * 100) / make_data_infos.total_token)) if len(nbest_list) != len(src_data[:max_nb_ex]): log.warn("mismatch in lengths nbest vs src : %i != %i" % (len(nbest_list), len(src_data[:max_nb_ex]))) assert len(nbest_list) == len(src_data[:max_nb_ex]) log.info("starting scoring") from nmt_chainer.utilities import utils res = [] for num in xrange(len(nbest_converted)): if num % 200 == 0: print >>sys.stderr, num, elif num % 50 == 0: print >>sys.stderr, "*", res.append([]) src, tgt_list = src_data[num], nbest_converted[num] src_batch, src_mask = utils.make_batch_src([src], gpu=gpu, volatile="on") assert len(encdec_list) == 1 scorer = encdec_list[0].nbest_scorer(src_batch, src_mask) nb_batches = (len(tgt_list) + mb_size - 1) / mb_size for num_batch in xrange(nb_batches): tgt_batch, arg_sort = utils.make_batch_tgt(tgt_list[num_batch * nb_batches: (num_batch + 1) * nb_batches], eos_idx=eos_idx, gpu=gpu, volatile="on", need_arg_sort=True) scores, attn = scorer(tgt_batch) scores, _ = scores scores = scores.data assert len(arg_sort) == len(scores) de_sorted_scores = [None] * len(scores) for xpos in xrange(len(arg_sort)): original_pos = arg_sort[xpos] de_sorted_scores[original_pos] = scores[xpos] res[-1] += de_sorted_scores print >>sys.stderr log.info("writing scores to %s" % dest_fn) out = codecs.open(dest_fn, "w", encoding="utf8") for num in xrange(len(res)): for score in res[num]: out.write("%i %f\n" % (num, score)) time_end = time.clock() translation_infos["loading_time"] = time_all_loaded - time_start translation_infos["translation_time"] = time_end - time_all_loaded translation_infos["total_time"] = time_end - time_start if dest_fn is not None: config_eval_session = config_eval.copy(readonly=False) config_eval_session.add_section("translation_infos", keep_at_bottom="metadata") config_eval_session["translation_infos"] = translation_infos config_eval_session.set_metadata_modified_time() save_eval_config_fn = dest_fn + ".eval.config.json" log.info("Saving eval config to %s" % save_eval_config_fn) config_eval_session.save_to(save_eval_config_fn)