def load_lexicon(self):
  logger.info("Loading lexicon from file: " + self.lexicon_file)
  assert self.src_vocab.frozen
  assert self.trg_vocab.frozen
  lexicon = [{} for _ in range(len(self.src_vocab))]
  with open(self.lexicon_file, encoding='utf-8') as fp:
    for line in fp:
      try:
        trg, src, prob = line.rstrip().split()
      except ValueError:
        logger.warning("Failed to parse 'trg src prob' from: " + line.strip())
        continue
      trg_id = self.trg_vocab.convert(trg)
      src_id = self.src_vocab.convert(src)
      lexicon[src_id][trg_id] = float(prob)
  # Assign the remaining probability mass to the unknown word
  for i in range(len(lexicon)):
    sum_prob = sum(lexicon[i].values())
    if sum_prob < 1.0:
      lexicon[i][self.trg_vocab.convert(self.trg_vocab.unk_token)] = 1.0 - sum_prob
  # Override special tokens
  src_unk_id = self.src_vocab.convert(self.src_vocab.unk_token)
  trg_unk_id = self.trg_vocab.convert(self.trg_vocab.unk_token)
  lexicon[self.src_vocab.SS] = {self.trg_vocab.SS: 1.0}
  lexicon[self.src_vocab.ES] = {self.trg_vocab.ES: 1.0}
  # TODO(philip30): Not sure if this is intended
  lexicon[src_unk_id] = {trg_unk_id: 1.0}
  return lexicon
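# The returned lexicon maps each source word id to a dict {trg_id: prob} whose
# values sum to 1.0, with any leftover mass assigned to the unknown word. A
# minimal lookup sketch (hypothetical helper, not part of this class):
def lexicon_prob(lexicon, src_id, trg_id, trg_unk_id):
  """Return p(trg|src); unseen target words fall back to the unknown-word mass."""
  row = lexicon[src_id]
  return row.get(trg_id, row.get(trg_unk_id, 0.0))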
def print_cg_conditional() -> None:
  if settings.PRINT_CG_ON_ERROR:
    if xnmt.backend_dynet:
      import dynet as dy
      dy.print_text_graphviz()
    else:
      logger.warning("CG printing not implemented with Torch backend")
def calc_attention(self, state: dy.Expression) -> dy.Expression:
  logger.warning("BilinearAttender does not currently perform masking, which may harm training results.")
  Wa = dy.parameter(self.pWa)
  scores = (dy.transpose(state) * Wa) * self.I
  normalized = dy.softmax(scores)
  self.attention_vecs.append(normalized)
  return dy.transpose(normalized)
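# The bilinear score for decoder state s, learned matrix Wa, and encoder
# matrix I is score_j = s^T Wa I_j, normalized by a softmax over source
# positions. A standalone numpy sketch (illustrative shapes, not xnmt API):
import numpy as np

def bilinear_attention(s, Wa, I):
  """s: (d_state,), Wa: (d_state, d_enc), I: (d_enc, src_len)."""
  scores = s @ Wa @ I                # (src_len,) unnormalized scores
  e = np.exp(scores - scores.max())  # numerically stable softmax
  return e / e.sum()                 # attention weights over source positions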
def update(self) -> None:
  """
  Update the parameters.
  """
  try:
    if not (self.skip_noisy and self._check_gradients_noisy()):
      self.optimizer.update()
    else:
      logger.info("skipping noisy update")
  except RuntimeError:
    logger.warning("Failed to perform update. Skipping example and clearing gradients.")
    for subcol in ParamManager.param_col.subcols.values():
      for param in subcol.parameters_list():
        param.scale_gradient(0)
def update(self) -> None:
  """
  Update the parameters.
  """
  self.global_step += 1
  if settings.USE_TENSORBOARD:
    tee.tensorboard_writer.add_scalars(name="lr", tag_scalar_dict={"lr": self.optimizer.learning_rate},
                                       global_step=self.global_step)
    if not self.skip_noisy:
      tee.tensorboard_writer.add_scalars(name="grad", tag_scalar_dict={"norm": np.exp(self.grad_log_norm())},
                                         global_step=self.global_step)
  try:
    if not (self.skip_noisy and self.check_gradients_noisy()):
      self.optimizer.update()
    else:
      logger.info("skipping noisy update")
  except RuntimeError:
    logger.warning("Failed to perform update. Skipping example and clearing gradients.")
    for subcol in ParamManager.param_col.subcols.values():
      for param in subcol.parameters_list():
        param.scale_gradient(0)
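# One way to implement the noisy-gradient check used above is to treat the log
# gradient norm as an outlier relative to its recent history. A minimal sketch
# under that assumption (illustrative window/threshold, not xnmt's exact rule):
import numpy as np

class NoisyGradFilter:
  """Flag updates whose log gradient norm deviates strongly from recent history."""
  def __init__(self, window=100, num_std=4.0):
    self.history = []
    self.window = window
    self.num_std = num_std

  def is_noisy(self, grad_log_norm: float) -> bool:
    past = self.history[-self.window:]
    self.history.append(grad_log_norm)
    if len(past) < 10:  # too little history to judge; accept the update
      return False
    mean, std = np.mean(past), np.std(past)
    return abs(grad_log_norm - mean) > self.num_std * std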
def main(overwrite_args=None):
  with tee.Tee(), tee.Tee(error=True):
    argparser = argparse.ArgumentParser()
    argparser.add_argument("--dynet-mem", type=str)
    argparser.add_argument("--dynet-seed", type=int, help="set random seed for DyNet and XNMT.")
    argparser.add_argument("--dynet-autobatch", type=int)
    argparser.add_argument("--dynet-devices", type=str)
    argparser.add_argument("--dynet-viz", action='store_true', help="use visualization")
    argparser.add_argument("--dynet-gpu", action='store_true', help="use GPU acceleration")
    argparser.add_argument("--dynet-gpu-ids", type=int)
    argparser.add_argument("--dynet-gpus", type=int)
    argparser.add_argument("--dynet-weight-decay", type=float)
    argparser.add_argument("--dynet-profiling", type=int)
    argparser.add_argument("--settings", type=str, default="standard",
                           help="settings (standard, debug, or unittest); "
                                "must be given in '=' syntax, e.g. --settings=standard")
    argparser.add_argument("experiments_file")
    argparser.add_argument("experiment_name", nargs='*', help="Run only the specified experiments")
    argparser.set_defaults(generate_doc=False)
    args = argparser.parse_args(overwrite_args)

    if args.dynet_seed:
      random.seed(args.dynet_seed)
      np.random.seed(args.dynet_seed)

    if args.dynet_gpu:
      if settings.CHECK_VALIDITY:
        settings.CHECK_VALIDITY = False
        log_preamble("disabling CHECK_VALIDITY because it is not supported on GPU currently", logging.WARNING)

    config_experiment_names = YamlPreloader.experiment_names_from_file(args.experiments_file)

    results = []

    # Check ahead of time that all experiments exist, to avoid bad surprises
    experiment_names = args.experiment_name or config_experiment_names

    if args.experiment_name:
      nonexistent = set(experiment_names).difference(config_experiment_names)
      if len(nonexistent) != 0:
        raise Exception("Experiments {} do not exist".format(",".join(list(nonexistent))))

    log_preamble(f"running XNMT revision {tee.get_git_revision()} on {socket.gethostname()} "
                 f"on {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    for experiment_name in experiment_names:
      ParamManager.init_param_col()

      uninitialized_exp_args = YamlPreloader.preload_experiment_from_file(args.experiments_file, experiment_name)

      logger.info(f"=> Running {experiment_name}")

      glob_args = uninitialized_exp_args.data.exp_global
      log_file = glob_args.log_file

      if os.path.isfile(log_file) and not settings.OVERWRITE_LOG:
        logger.warning(f"log file {log_file} already exists, skipping experiment; "
                       f"please delete the log file by hand if you want to overwrite it "
                       f"(or activate OVERWRITE_LOG, by either setting the environment variable OVERWRITE_LOG=1, "
                       f"specifying --settings=debug, or changing xnmt.settings.Standard.OVERWRITE_LOG manually)")
        continue

      tee.set_out_file(log_file)

      model_file = glob_args.model_file

      uninitialized_exp_args.data.exp_global.commandline_args = args

      # Create the model
      experiment = initialize_if_needed(uninitialized_exp_args)
      ParamManager.param_col.model_file = experiment.exp_global.model_file
      ParamManager.param_col.save_num_checkpoints = experiment.exp_global.save_num_checkpoints
      ParamManager.populate()

      # Run the experiment
      eval_scores = experiment(save_fct=lambda: save_to_file(model_file, experiment, ParamManager.param_col))
      results.append((experiment_name, eval_scores))
      print_results(results)

      tee.unset_out_file()
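# Example invocation (hypothetical entry-point, file, and experiment names):
#
#   python xnmt_run_experiments.py --dynet-gpu --settings=debug experiments.yaml exp1 exp2
#
# The positional arguments select the YAML config and, optionally, a subset of
# the experiments it defines; with no names given, all experiments are run.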
def on_html_report(self, context=None):
  logger.warning("Unimplemented html report for retriever!")
  idx, src_words, scores, kbest = self.html_input
  html = etree.Element('html')
  # TODO(philip30): Write the logic of the retriever html report here
  return html
def __call__(self, generator, src_file=None, trg_file=None, candidate_id_file=None):
  """
  Args:
    generator (GeneratorModel): the model to be used
    src_file (str): path of input src file to be translated
    trg_file (str): path of file where trg translations will be written
    candidate_id_file (str): if we are doing something like retrieval where we select from fixed candidates,
                             sometimes we want to limit our candidates to a certain subset of the full set.
                             This setting allows us to do this.
  """
  args = dict(src_file=src_file or self.src_file,
              trg_file=trg_file or self.trg_file,
              ref_file=self.ref_file,
              max_src_len=self.max_src_len,
              post_process=self.post_process,
              candidate_id_file=candidate_id_file,
              report_path=self.report_path,
              report_type=self.report_type,
              beam=self.beam,
              max_len=self.max_len,
              len_norm_type=self.len_norm_type,
              mode=self.mode)

  is_reporting = issubclass(generator.__class__, Reportable) and args["report_path"] is not None

  # Corpus
  src_corpus = list(generator.src_reader.read_sents(args["src_file"]))

  # Get the reference if it exists and is necessary
  if args["mode"] in ("forced", "forceddebug", "score"):
    if args["ref_file"] is None:
      raise RuntimeError("When performing {} decoding, must specify reference file".format(args["mode"]))
    score_src_corpus = []
    ref_corpus = []
    with open(args["ref_file"], "r", encoding="utf-8") as fp:
      for line in fp:
        if args["mode"] == "score":
          nbest = line.split("|||")
          assert len(nbest) > 1, "When performing scoring, ref_file must have nbest format 'index ||| hypothesis'"
          src_index = int(nbest[0].strip())
          assert src_index < len(src_corpus), \
            "The src_file has only {} instances, nbest file has invalid src_index {}".format(len(src_corpus), src_index)
          score_src_corpus.append(src_corpus[src_index])
          trg_input = generator.trg_reader.read_sent(nbest[1].strip())
        else:
          trg_input = generator.trg_reader.read_sent(line)
        ref_corpus.append(trg_input)
    if args["mode"] == "score":
      src_corpus = score_src_corpus
    elif self.max_len and any(len(s) > self.max_len for s in ref_corpus):
      logger.warning("Forced decoding with some targets being longer than max_len. "
                     "Increase max_len to avoid unexpected behavior.")
  else:
    ref_corpus = None

  # Vocab
  src_vocab = generator.src_reader.vocab if hasattr(generator.src_reader, "vocab") else None
  trg_vocab = generator.trg_reader.vocab if hasattr(generator.trg_reader, "vocab") else None

  # Perform initialization
  generator.set_train(False)
  generator.initialize_generator(**args)

  if hasattr(generator, "set_post_processor"):
    generator.set_post_processor(self.get_output_processor())
  if hasattr(generator, "set_trg_vocab"):
    generator.set_trg_vocab(trg_vocab)
  if hasattr(generator, "set_reporting_src_vocab"):
    generator.set_reporting_src_vocab(src_vocab)

  if is_reporting:
    generator.set_report_resource("src_vocab", src_vocab)
    generator.set_report_resource("trg_vocab", trg_vocab)

  # If we're debugging, calculate the loss for each target sentence
  ref_scores = None
  if args["mode"] in ('forceddebug', 'score'):
    some_batcher = xnmt.batcher.InOrderBatcher(32)  # Arbitrary
    if not isinstance(some_batcher, xnmt.batcher.InOrderBatcher):
      raise ValueError(f"forceddebug requires InOrderBatcher, got: {some_batcher}")
    batched_src, batched_ref = some_batcher.pack(src_corpus, ref_corpus)
    ref_scores = []
    for src, ref in zip(batched_src, batched_ref):
      dy.renew_cg(immediate_compute=settings.IMMEDIATE_COMPUTE, check_validity=settings.CHECK_VALIDITY)
      loss_expr = generator.calc_loss(src, ref, loss_calculator=LossCalculator())
      if isinstance(loss_expr.value(), Iterable):
        ref_scores.extend(loss_expr.value())
      else:
        ref_scores.append(loss_expr.value())
    ref_scores = [-x for x in ref_scores]

  # Make the parent directory if necessary
  make_parent_dir(args["trg_file"])

  # Perform generation of output
  if args["mode"] != 'score':
    with open(args["trg_file"], 'wt', encoding='utf-8') as fp:  # Saving the translated output to a trg file
      src_ret = []
      for i, src in enumerate(src_corpus):
        # This is necessary when the batcher does some sort of pre-processing, e.g.
        # when the batcher pads to a particular number of dimensions
        if self.batcher:
          self.batcher.add_single_batch(src_curr=[src], trg_curr=None, src_ret=src_ret, trg_ret=None)
          src = src_ret.pop()[0]
        # Do the decoding
        if args["max_src_len"] is not None and len(src) > args["max_src_len"]:
          output_txt = NO_DECODING_ATTEMPTED
        else:
          dy.renew_cg(immediate_compute=settings.IMMEDIATE_COMPUTE, check_validity=settings.CHECK_VALIDITY)
          ref_ids = ref_corpus[i] if ref_corpus is not None else None
          output = generator.generate_output(src, i, forced_trg_ids=ref_ids)
          # If debugging forced decoding, make sure it matches
          if ref_scores is not None and (abs(output[0].score - ref_scores[i]) / abs(ref_scores[i])) > 1e-5:
            logger.error(f'Forced decoding score {output[0].score} and loss {ref_scores[i]} do not match at sentence {i}')
          output_txt = output[0].plaintext
        # Printing to trg file
        fp.write(f"{output_txt}\n")
  else:
    with open(args["trg_file"], 'wt', encoding='utf-8') as fp:
      with open(args["ref_file"], "r", encoding="utf-8") as nbest_fp:
        for nbest, score in zip(nbest_fp, ref_scores):
          fp.write("{} ||| score={}\n".format(nbest.strip(), score))
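# In "score" mode, ref_file is an nbest list pairing a 0-based index into
# src_corpus with a hypothesis, separated by "|||", as the assertion above
# requires. Illustrative content (hypothetical sentences):
#
#   0 ||| the cat sat on the mat
#   0 ||| a cat sat on the mat
#   1 ||| he went home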
def generate_output(self, translator, initial_state, src_length=None, forced_trg_ids=None):
  # TODO(philip30): can only do single decoding, not batched
  assert forced_trg_ids is None or self.beam_size == 1
  if forced_trg_ids is not None and forced_trg_ids.sent_len() > self.max_len:
    logger.warning("Forced decoding with a target longer than max_len. "
                   "Increase max_len to avoid unexpected behavior.")

  active_hyp = [self.Hypothesis(0, None, None, None)]
  completed_hyp = []
  for length in range(self.max_len):
    if len(completed_hyp) >= self.beam_size:
      break
    # Expand hyp
    new_set = []
    for hyp in active_hyp:
      if length > 0:
        prev_word = hyp.word
        prev_state = hyp.output.state
      else:
        prev_word = None
        prev_state = initial_state
      if prev_word == Vocab.ES:
        completed_hyp.append(hyp)
        continue
      current_output = translator.generate_one_step(prev_word, prev_state)
      score = current_output.logsoftmax.npvalue().transpose()
      if self.scores_proc:
        self.scores_proc(score)
      # Next words
      if forced_trg_ids is None:
        top_words = np.argpartition(score, max(-len(score), -self.beam_size))[-self.beam_size:]
      else:
        top_words = [forced_trg_ids[length]]
      # Queue next states
      for cur_word in top_words:
        new_score = self.len_norm.normalize_partial_topk(hyp.score, score[cur_word], length + 1)
        new_set.append(self.Hypothesis(new_score, current_output, hyp, cur_word))
    # Keep the top hypotheses for the next step
    active_hyp = sorted(new_set, key=lambda x: x.score, reverse=True)[:self.beam_size]

  # If no hypothesis reached </s>, fall back to the active hypotheses
  if len(completed_hyp) == 0:
    completed_hyp = active_hyp

  # Length normalization
  normalized_scores = self.len_norm.normalize_completed(completed_hyp, src_length)
  hyp_and_score = sorted(list(zip(completed_hyp, normalized_scores)), key=lambda x: x[1], reverse=True)
  if self.one_best:
    hyp_and_score = [hyp_and_score[0]]

  # Backtracking + packing outputs
  results = []
  for end_hyp, score in hyp_and_score:
    logsoftmaxes = []
    word_ids = []
    attentions = []
    states = []
    current = end_hyp
    while current.parent is not None:
      word_ids.append(current.word)
      attentions.append(current.output.attention)
      # TODO(philip30): This should probably be uncommented.
      # These 2 statements are an overhead because they are needed only for reinforce and minrisk.
      # Furthermore, the attentions are only needed for the report.
      # We should have a global flag to indicate whether this is needed or not;
      # the flag would be set when certain objects are instantiated.
      #logsoftmaxes.append(dy.pick(current.output.logsoftmax, current.word))
      #states.append(translator.get_nobp_state(current.output.state))
      current = current.parent
    results.append(SearchOutput([list(reversed(word_ids))], [list(reversed(attentions))], [score],
                                list(reversed(logsoftmaxes)), list(reversed(states)), None))
  return results
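# np.argpartition, as used above, finds the indices of the k largest scores in
# O(V) without a full sort; the resulting slice is unordered, which is fine
# because hypotheses are re-sorted afterwards. The max(-len(score), -k) guard
# handles beams larger than the vocabulary. A standalone sketch:
import numpy as np

scores = np.array([0.1, 2.3, -0.5, 1.7, 0.9])
k = 2
top_k = np.argpartition(scores, max(-len(scores), -k))[-k:]  # indices of the k best, unordered
print(set(top_k.tolist()))  # {1, 3}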
def main(overwrite_args: Optional[Sequence[str]] = None) -> None:
  with tee.Tee(), tee.Tee(error=True):
    argparser = argparse.ArgumentParser()
    utils.add_backend_argparse(argparser)
    argparser.add_argument("--settings", type=str, default="standard",
                           help="settings (standard, debug, or unittest); "
                                "must be given in '=' syntax, e.g. --settings=standard")
    argparser.add_argument("--resume", action='store_true',
                           help="whether a saved experiment is being resumed, and "
                                "locations of output files should be re-used.")
    argparser.add_argument("--backend", type=str, default="dynet", help="backend (dynet or torch)")
    argparser.add_argument("experiments_file")
    argparser.add_argument("experiment_name", nargs='*', help="Run only the specified experiments")
    argparser.set_defaults(generate_doc=False)
    args = argparser.parse_args(overwrite_args)

    if xnmt.backend_dynet and args.dynet_seed:
      args.seed = args.dynet_seed
    if getattr(args, "seed", None):
      random.seed(args.seed)
      np.random.seed(args.seed)
      if xnmt.backend_torch:
        torch.manual_seed(args.seed)

    if xnmt.backend_dynet and args.dynet_gpu and settings.CHECK_VALIDITY:
      settings.CHECK_VALIDITY = False
      log_preamble("disabling CHECK_VALIDITY because it is not supported in the DyNet/GPU setting", logging.WARNING)

    config_experiment_names = YamlPreloader.experiment_names_from_file(args.experiments_file)

    results = []

    # Check ahead of time that all experiments exist, to avoid bad surprises
    experiment_names = args.experiment_name or config_experiment_names

    if args.experiment_name:
      nonexistent = set(experiment_names).difference(config_experiment_names)
      if len(nonexistent) != 0:
        raise Exception("Experiments {} do not exist".format(",".join(list(nonexistent))))

    log_preamble(f"running XNMT revision {tee.get_git_revision()} on {socket.gethostname()} "
                 f"with {'DyNet' if xnmt.backend_dynet else 'PyTorch'} "
                 f"on {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    for experiment_name in experiment_names:
      ParamManager.init_param_col()

      uninitialized_exp_args = YamlPreloader.preload_experiment_from_file(args.experiments_file, experiment_name,
                                                                          resume=args.resume)

      logger.info(f"=> Running {experiment_name}")

      glob_args = uninitialized_exp_args.data.exp_global
      log_file = glob_args.log_file

      if not settings.OVERWRITE_LOG:
        log_files_exist = []
        if os.path.isfile(log_file):
          log_files_exist.append(log_file)
        if os.path.isdir(log_file + ".tb"):
          log_files_exist.append(log_file + ".tb/")
        if log_files_exist:
          logger.warning(f"log file(s) {' '.join(log_files_exist)} already exist, skipping experiment; "
                         f"please delete the log file(s) by hand if you want to overwrite them "
                         f"(or activate OVERWRITE_LOG, by either setting the environment variable OVERWRITE_LOG=1, "
                         f"specifying --settings=debug, or changing xnmt.settings.Standard.OVERWRITE_LOG manually)")
          continue
      elif settings.OVERWRITE_LOG and os.path.isdir(log_file + ".tb"):
        shutil.rmtree(log_file + ".tb/")  # remove tensorboard logs from a previous run that is being overwritten

      tee.set_out_file(log_file, exp_name=experiment_name)

      try:
        model_file = glob_args.model_file

        uninitialized_exp_args.data.exp_global.commandline_args = vars(args)

        # Create the model
        experiment = initialize_if_needed(uninitialized_exp_args)
        ParamManager.param_col.model_file = experiment.exp_global.model_file
        ParamManager.param_col.save_num_checkpoints = experiment.exp_global.save_num_checkpoints
        ParamManager.populate()

        # Run the experiment
        eval_scores = experiment(save_fct=lambda: save_to_file(model_file, experiment))
        results.append((experiment_name, eval_scores))
        print_results(results)
      except Exception as e:
        file_logger.error(traceback.format_exc())
        raise e
      finally:
        tee.unset_out_file()