def load_lexicon(self):
  logger.info("Loading lexicon from file: " + self.lexicon_file)
  assert self.src_vocab.frozen
  assert self.trg_vocab.frozen
  lexicon = [{} for _ in range(len(self.src_vocab))]
  with open(self.lexicon_file, encoding='utf-8') as fp:
    for line in fp:
      try:
        trg, src, prob = line.rstrip().split()
      except ValueError:
        logger.warning("Failed to parse 'trg src prob' from: " + line.strip())
        continue
      trg_id = self.trg_vocab.convert(trg)
      src_id = self.src_vocab.convert(src)
      lexicon[src_id][trg_id] = float(prob)
  # Setting the rest of the weight to the unknown word
  for i in range(len(lexicon)):
    sum_prob = sum(lexicon[i].values())
    if sum_prob < 1.0:
      lexicon[i][self.trg_vocab.convert(self.trg_vocab.unk_token)] = 1.0 - sum_prob
  # Overriding special tokens
  src_unk_id = self.src_vocab.convert(self.src_vocab.unk_token)
  trg_unk_id = self.trg_vocab.convert(self.trg_vocab.unk_token)
  lexicon[self.src_vocab.SS] = {self.trg_vocab.SS: 1.0}
  lexicon[self.src_vocab.ES] = {self.trg_vocab.ES: 1.0}
  # TODO(philip30): Not sure if this is intended
  lexicon[src_unk_id] = {trg_unk_id: 1.0}
  return lexicon

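# Usage sketch (not from the source; file contents and the UNK token name are
# assumptions): load_lexicon above expects a whitespace-separated "trg src prob"
# file, e.g.
#
#   house haus 0.9
#   home haus 0.05
#   cat katze 1.0
#
# A minimal standalone demonstration of the step that shifts leftover
# probability mass onto the unknown word:
lexicon = {"haus": {"house": 0.9, "home": 0.05}, "katze": {"cat": 1.0}}
UNK = "<unk>"  # hypothetical unknown-word token
for src_word, probs in lexicon.items():
  leftover = 1.0 - sum(probs.values())
  if leftover > 0.0:
    probs[UNK] = leftover
print(lexicon["haus"])  # {'house': 0.9, 'home': 0.05, '<unk>': ~0.05}
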
def __init__(self,
             filename,
             emb_dim=Ref("exp_global.default_layer_dim"),
             weight_noise=Ref("exp_global.weight_noise", default=0.0),
             word_dropout=0.0,
             fix_norm=None,
             vocab=None,
             yaml_path=None,
             src_reader=Ref("model.src_reader", default=None),
             trg_reader=Ref("model.trg_reader", default=None)):
  self.emb_dim = emb_dim
  self.weight_noise = weight_noise
  self.word_dropout = word_dropout
  self.word_id_mask = None
  self.train = False
  self.fix_norm = fix_norm
  self.pretrained_filename = filename
  param_collection = ParamManager.my_params(self)
  self.vocab = self.choose_vocab(vocab, yaml_path, src_reader, trg_reader)
  self.vocab_size = len(self.vocab)  # use the resolved vocab, not the (possibly None) argument
  self.save_processed_arg("vocab", self.vocab)
  with open(self.pretrained_filename, encoding='utf-8') as embeddings_file:
    total_embs, in_vocab, missing, initial_embeddings = \
      self._read_fasttext_embeddings(self.vocab, embeddings_file)
  self.embeddings = param_collection.lookup_parameters_from_numpy(initial_embeddings)
  logger.info(f"{in_vocab} vocabulary matches out of {total_embs} total embeddings; "
              f"{missing} vocabulary words without a pretrained embedding out of {self.vocab_size}")

def read_sents(self, filename, filter_ids=None):
  with h5py.File(filename, "r") as hf:
    h5_keys = sorted(hf.keys(), key=lambda x: int(x))
    if filter_ids is not None:
      filter_ids = sorted(filter_ids)
      h5_keys = [h5_keys[i] for i in filter_ids]
      h5_keys.sort(key=lambda x: int(x))
    for sent_no, key in enumerate(h5_keys):
      inp = hf[key][:]
      if self.transpose:
        inp = inp.transpose()
      sub_inp = inp[self.feat_from:self.feat_to:self.feat_skip,
                    :self.timestep_truncate:self.timestep_skip]
      if sub_inp.size < inp.size:
        inp = np.empty_like(sub_inp)
        np.copyto(inp, sub_inp)
      else:
        inp = sub_inp
      if sent_no % 1000 == 999:
        logger.info(f"Read {sent_no+1} lines ({float(sent_no+1)/len(h5_keys)*100:.2f}%) of {filename} at {key}")
      yield ArraySentence(idx=filter_ids[sent_no] if filter_ids else sent_no,
                          nparr=inp)

def perform_inference(self,
                      generator: 'models.GeneratorModel',
                      src_file: str = None,
                      trg_file: str = None) -> None:
  """
  Perform inference.

  Args:
    generator: the model to be used
    src_file: path of input src file to be translated
    trg_file: path of file where trg translations will be written
  """
  src_file = src_file or self.src_file
  trg_file = trg_file or self.trg_file
  utils.make_parent_dir(trg_file)
  logger.info(f'Performing inference on {src_file}')
  ref_corpus, src_corpus = self._read_corpus(generator, src_file, mode=self.mode, ref_file=self.ref_file)
  event_trigger.set_train(False)
  ref_scores = None
  if self.mode == 'score':
    ref_scores = self._compute_losses(generator, ref_corpus, src_corpus, self.max_num_sents)
    self._write_rescored_output(ref_scores, self.ref_file, trg_file)
  if self.mode == 'forceddebug':
    ref_scores = self._compute_losses(generator, ref_corpus, src_corpus, self.max_num_sents)
  if self.mode != 'score':
    self._generate_output(generator=generator,
                          forced_ref_corpus=ref_corpus,
                          assert_scores=ref_scores,
                          src_corpus=src_corpus,
                          trg_file=trg_file,
                          batcher=self.batcher,
                          max_src_len=self.max_src_len)

def read_sents(self, filename, filter_ids=None):
  npzFile = np.load(filename, mmap_mode=None if filter_ids is None else "r")
  npzKeys = sorted(npzFile.files, key=lambda x: int(x.split('_')[-1]))
  if filter_ids is not None:
    filter_ids = sorted(filter_ids)
    npzKeys = [npzKeys[i] for i in filter_ids]
    npzKeys.sort(key=lambda x: int(x.split('_')[-1]))
  for sent_no, key in enumerate(npzKeys):
    inp = npzFile[key]
    if self.transpose:
      inp = inp.transpose()
    sub_inp = inp[self.feat_from:self.feat_to:self.feat_skip,
                  :self.timestep_truncate:self.timestep_skip]
    if sub_inp.size < inp.size:
      inp = np.empty_like(sub_inp)
      np.copyto(inp, sub_inp)
    else:
      inp = sub_inp
    if sent_no % 1000 == 999:
      logger.info(f"Read {sent_no+1} lines ({float(sent_no+1)/len(npzKeys)*100:.2f}%) of {filename} at {key}")
    yield ArraySentence(idx=filter_ids[sent_no] if filter_ids else sent_no,
                        nparr=inp)
  npzFile.close()

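# Standalone sketch (shapes and slice parameters are illustrative assumptions)
# of the feature/timestep subsetting shared by both read_sents variants above.
# Copying the strided view into a fresh array detaches it from the much larger
# backing buffer, which matters when the source is a memory-mapped .npz or an
# open HDF5 file.
import numpy as np

inp = np.arange(40 * 100).reshape(40, 100)  # (features, timesteps)
sub_inp = inp[0:40:2, :50:5]                # every 2nd feature, every 5th of the first 50 steps
if sub_inp.size < inp.size:
  out = np.empty_like(sub_inp)
  np.copyto(out, sub_inp)                   # detach from inp's memory
else:
  out = sub_inp
print(out.shape)  # (20, 10)
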
def xnmt_evaluate(ref_file: OneOrSeveral[str],
                  hyp_file: OneOrSeveral[str],
                  evaluators: Sequence[Evaluator],
                  desc: Any = None) -> Sequence[EvalScore]:
  """Returns the eval scores (e.g. BLEU) of the hyp sents using the reference trg sents

  Args:
    ref_file: path of the reference file
    hyp_file: path of the hypothesis trg file
    evaluators: Evaluation metrics. Can be a list of evaluator objects, or a shortcut string
    desc: descriptive string passed on to evaluators
  """
  hyp_postprocess = lambda line: line.split()
  ref_postprocess = lambda line: line.split()
  ref_corpus = read_data(ref_file, post_process=ref_postprocess)
  hyp_corpus = read_data(hyp_file, post_process=hyp_postprocess)
  len_before = len(hyp_corpus)
  ref_corpus, hyp_corpus = zip(*filter(lambda x: NO_DECODING_ATTEMPTED not in x[1],
                                       zip(ref_corpus, hyp_corpus)))
  if len(ref_corpus) < len_before:
    logger.info(f"> ignoring {len_before - len(ref_corpus)} out of {len_before} test sentences.")
  return [evaluator.evaluate(ref_corpus, hyp_corpus, desc=desc) for evaluator in evaluators]

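# Standalone sketch of the filtering step above (the marker value is an
# assumption standing in for xnmt's NO_DECODING_ATTEMPTED constant): pairs
# whose hypothesis contains the marker are dropped before scoring.
NO_DECODING_ATTEMPTED = "@@NO_DECODING_ATTEMPTED@@"
ref_corpus = [["a", "cat"], ["a", "dog"]]
hyp_corpus = [["a", "cat"], [NO_DECODING_ATTEMPTED]]
ref_corpus, hyp_corpus = zip(*filter(lambda x: NO_DECODING_ATTEMPTED not in x[1],
                                     zip(ref_corpus, hyp_corpus)))
print(len(hyp_corpus))  # 1: the pair with the failed hypothesis was removed
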
def report(self) -> None:
  this_report_time = time.time()
  self.last_report_sents_since_start = self.training_task.training_state.sents_since_start
  self.fractional_epoch = (self.training_task.training_state.epoch_num - 1) \
                          + self.training_task.training_state.sents_into_epoch / self.training_task.cur_num_sentences()
  dev_time = self.time_tracker.get_and_reset()
  utils.log_readable_and_tensorboard(template=DevLossTracker.REPORT_TEMPLATE_DEV,
                                     args={self.dev_score.metric_name(): self.dev_score.value()},
                                     n_iter=self.fractional_epoch,
                                     data_name="dev",
                                     task_name=self.name,
                                     score=self.dev_score,
                                     time=utils.format_time(this_report_time - self.start_time))
  for score in self.aux_scores:
    utils.log_readable_and_tensorboard(template=DevLossTracker.REPORT_TEMPLATE_DEV_AUX,
                                       args={score.metric_name(): score.value()},
                                       n_iter=self.fractional_epoch,
                                       data_name="dev",
                                       task_name=self.name,
                                       score=score)
  # extra is a logger.info() kwarg, not a template field
  logger.info(DevLossTracker.REPORT_TEMPLATE_TIME_NEEDED.format(time_needed=utils.format_time(dev_time)),
              extra={"task_name": self.name})
  self.aux_scores = []

def _augment_data_next_epoch(self):
  """
  This is run in the background if reload_command is given to prepare data for the next epoch.
  """
  augment_command = self.reload_command
  if self._augmentation_handle is None:
    # first run
    self._augmentation_handle = Popen(augment_command + " --epoch %d" % self.training_state.epoch_num, shell=True)
    self._augmentation_handle.wait()

  self._augmentation_handle.poll()
  retcode = self._augmentation_handle.returncode
  if retcode is not None:
    if self.training_state.epoch_num > 0:
      logger.info('using reloaded data')
    # reload the data
    self.src_data, self.trg_data, self.src_batches, self.trg_batches = \
      xnmt.input_reader.read_parallel_corpus(self.model.src_reader, self.model.trg_reader,
                                             self.src_file, self.trg_file,
                                             batcher=self.batcher, sample_sents=self.sample_train_sents,
                                             max_num_sents=self.max_num_train_sents,
                                             max_src_len=self.max_src_len, max_trg_len=self.max_trg_len)
    # restart data generation
    self._augmentation_handle = Popen(augment_command + " --epoch %d" % self.training_state.epoch_num, shell=True)
  else:
    logger.info('new data set is not ready yet, using data from last epoch.')

def update(self):
  self.steps += 1
  decay = (self.dim ** (-0.5)) * np.min([self.steps ** (-0.5), self.steps * (self.warmup_steps ** (-1.5))])
  self.optimizer.learning_rate = 1. * decay
  super().update()
  if self.steps % 200 == 0:
    logger.info('> Optimizer Logging')
    logger.info('  Steps=%d, learning_rate=%.2e' % (self.steps, self.optimizer.learning_rate))

def __init__(self,
             emb_dim: int = Ref("exp_global.default_layer_dim"),
             vocab_size: Optional[int] = None,
             vocab: Optional[vocabs.Vocab] = None,
             yaml_path: Path = Path(''),
             src_reader: Optional[input_readers.InputReader] = Ref("model.src_reader", default=None),
             trg_reader: Optional[input_readers.InputReader] = Ref("model.trg_reader", default=None),
             is_dense: bool = False,
             param_init: pinit.ParamInitializer = Ref("exp_global.param_init", default=bare(pinit.GlorotInitializer)),
             bias_init: pinit.ParamInitializer = Ref("exp_global.bias_init", default=bare(pinit.ZeroInitializer)),
             init_fastext: Optional[str] = None,
             weight_noise: float = Ref("exp_global.weight_noise", default=0.0),
             fix_norm: Optional[float] = None):
  super().__init__(emb_dim=emb_dim, weight_noise=weight_noise, fix_norm=fix_norm)
  # Embedding Parameters
  pcol = param_collections.ParamManager.my_params(self)
  self.vocab_size = self.choose_vocab_size(vocab_size, vocab, yaml_path, src_reader, trg_reader)
  emb_mtr_dim = (self.vocab_size, self.emb_dim)
  if init_fastext is not None:
    logger.info("Setting Dense to False because of init_fastext")
    is_dense = False
  if not is_dense:
    if init_fastext is not None:
      self.embeddings = pcol.lookup_parameters_from_numpy(self._read_fasttext_embeddings(vocab, init_fastext))
    else:
      self.embeddings = pcol.add_lookup_parameters(emb_mtr_dim,
                                                   init=param_init.initializer(emb_mtr_dim, is_lookup=True))
  else:
    self.embeddings = pcol.add_parameters(emb_mtr_dim,
                                          init=param_init.initializer(emb_mtr_dim, is_lookup=True))
    self.bias = pcol.add_parameters((self.vocab_size,), init=bias_init.initializer((self.vocab_size,)))
  # Model States
  self.is_dense = is_dense
  self.train = False
  self.save_processed_arg("vocab_size", self.vocab_size)

def update(self) -> None:
  self.steps += 1
  if self.warmup_steps:
    decay = (self.dim ** (-0.5)) * np.min([self.steps ** (-0.5), self.steps * (self.warmup_steps ** (-1.5))])
  else:
    decay = (self.dim ** (-0.5)) * self.steps ** (-0.5)
  self.lr_factor = 1. * decay
  super().update()
  if self.steps % 200 == 0:
    logger.info('> Optimizer Logging')
    logger.info(f'  Steps={self.steps}, learning_rate={self.lr_factor:.2e}')

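# Standalone check of the Noam-style schedule implemented by both update()
# variants above (dim and warmup_steps values are illustrative assumptions):
# the factor grows linearly for warmup_steps, peaks exactly at
# step == warmup_steps, then decays with the inverse square root of the step.
def noam_factor(step, dim=512, warmup_steps=4000):
  return (dim ** -0.5) * min(step ** -0.5, step * warmup_steps ** -1.5)

for step in (100, 4000, 16000):
  print(step, f"{noam_factor(step):.2e}")  # rises, peaks, then decays
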
def __init__(self, tasks: List[PreprocTask] = [], overwrite: bool = False):
  logger.info("> Preprocessing")
  for task in tasks:
    # Sanity check
    if len(task.in_files) != len(task.out_files):
      raise RuntimeError("Length of in_files and out_files in preprocessor must be identical")
    task.run_preproc_task(overwrite=overwrite)

def _read_fasttext_embeddings(self, vocab: vocabs.Vocab, init_fastext):
  """
  Reads FastText embeddings from a file. Also prints stats about the loaded embeddings for sanity checking.

  Args:
    vocab: a `Vocab` object containing the vocabulary for the experiment
    init_fastext: path to the embeddings file. The embeddings must be in FastText text format.

  Returns:
    The embeddings array, with a random (Glorot-style) vector for each vocabulary word
    that has no pretrained embedding.
  """
  with open(init_fastext, encoding='utf-8') as embeddings_file_handle:
    _, dimension = next(embeddings_file_handle).split()
    if int(dimension) != self.emb_dim:
      raise Exception(f"An embedding size of {self.emb_dim} was specified, "
                      f"but the pretrained embeddings have size {dimension}")
    # Poor man's Glorot initializer for missing embeddings
    bound = np.sqrt(6 / (self.vocab_size + self.emb_dim))
    total_embs = 0
    in_vocab = 0
    missing = 0
    embeddings = np.empty((self.vocab_size, self.emb_dim), dtype='float')
    found = np.zeros(self.vocab_size, dtype='bool_')
    for line in embeddings_file_handle:
      total_embs += 1
      word, vals = line.strip().split(' ', 1)
      if word in vocab.w2i:
        in_vocab += 1
        index = vocab.w2i[word]
        embeddings[index] = np.fromstring(vals, sep=" ")
        found[index] = True
    for i in range(self.vocab_size):
      if not found[i]:
        missing += 1
        embeddings[i] = np.random.uniform(-bound, bound, self.emb_dim)
    logger.info(f"{in_vocab} vocabulary matches out of {total_embs} total embeddings; "
                f"{missing} vocabulary words without a pretrained embedding out of {self.vocab_size}")
    return embeddings

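# Sketch of the FastText text format the reader above consumes (contents are
# hypothetical): a "num_words dim" header, then one word per line followed by
# its space-separated vector components. io.StringIO stands in for the file.
import io
import numpy as np

fasttext_text = "2 4\nhouse 0.1 -0.2 0.3 0.4\ncat 0.5 0.6 -0.7 0.8\n"
fh = io.StringIO(fasttext_text)
_, dimension = next(fh).split()
for line in fh:
  word, vals = line.strip().split(' ', 1)
  # modern equivalent of the reader's np.fromstring(vals, sep=" "), which is
  # deprecated in recent NumPy versions
  vec = np.array(vals.split(), dtype=float)
  assert vec.shape == (int(dimension),)
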
def populate() -> None:
  """
  Populate the parameter collections.

  Searches the given data paths and loads parameter collections if they exist,
  otherwise leaves parameters in their randomly initialized state.
  """
  assert ParamManager.initialized, "must call ParamManager.init_param_col() first"
  populated_subcols = []
  for subcol_name in ParamManager.param_col.subcols:
    for load_path in ParamManager.load_paths:
      data_file = os.path.join(load_path, subcol_name)
      if os.path.isfile(data_file):
        ParamManager.param_col.load_subcol_from_data_file(subcol_name, data_file)
        populated_subcols.append(subcol_name)
  if len(ParamManager.param_col.subcols) == len(populated_subcols):
    logger.info("> populated neural network parameters of all components from given data files")
  elif len(populated_subcols) == 0:
    logger.info("> use randomly initialized neural network parameters for all components")
  else:
    logger.info(f"> populated a subset of neural network parameters from given data files: {populated_subcols}.\n"
                f"  Did not populate {ParamManager.param_col.subcols.keys() - set(populated_subcols)}.\n"
                f"  If partial population was not intended, likely the unpopulated component or its owner "
                f"does not adhere to the Serializable protocol correctly, see documentation:\n"
                f"  http://xnmt.readthedocs.io/en/latest/writing_xnmt_classes.html#using-serializable-subcomponents")
  logger.info(f"  neural network param count: {ParamManager.param_col.parameter_count()}")

def __call__(self, save_fct):
  """
  Launch training loop, followed by final evaluation.
  """
  eval_scores = ["Not evaluated"]
  if self.status != "done":
    if self.train:
      logger.info("> Training")
      self.train.run_training(save_fct=save_fct)
      logger.info('reverting learned weights to best checkpoint..')
      try:
        ParamManager.param_col.revert_to_best_model()
      except RevertingUnsavedModelException:
        pass

    evaluate_args = self.evaluate
    if evaluate_args:
      logger.info("> Performing final evaluation")
      eval_scores = []
      for evaluator in evaluate_args:
        eval_score = evaluator.eval()
        if type(eval_score) == list:
          eval_scores.extend(eval_score)
        else:
          eval_scores.append(eval_score)

    self.save_processed_arg("status", "done")
    save_fct()
  else:
    logger.info("Experiment already finished, skipping.")

  return eval_scores

def checkpoint(self, control_learning_schedule=True):
  """
  Performs a dev checkpoint.

  Args:
    control_learning_schedule: If False, only evaluate dev data.
                               If True, also perform model saving, LR decay etc. if needed.
  Returns:
    True if the model needs saving, False otherwise
  """
  ret = False
  self.logger.new_dev()

  # Perform evaluation
  if self.dev_tasks and len(self.dev_tasks) > 0:
    dev_scores = []
    for dev_task in self.dev_tasks:
      dev_score, dev_word_cnt = dev_task.eval()
      if type(dev_score) == list:
        dev_scores.extend(dev_score)
      else:
        dev_scores.append(dev_score)
    # TODO: This is passing "1" for the number of words, as this is not implemented yet
    self.logger.set_dev_score(dev_word_cnt, dev_scores[0])
    for dev_score in dev_scores[1:]:
      self.logger.report_auxiliary_score(dev_score)

  # Control the learning schedule
  if control_learning_schedule:
    logger.info("> Checkpoint")
    # Write out the model if it's the best one
    if self.logger.report_dev_and_check_model():
      ret = True
      self.training_state.cur_attempt = 0
    else:
      # otherwise: learning rate decay / early stopping
      self.training_state.cur_attempt += 1
      if self.lr_decay < 1.0:
        should_decay = False
        if (self.initial_patience is None or self.training_state.num_times_lr_decayed > 0) \
                and self.training_state.cur_attempt >= self.patience:
          should_decay = True
        if self.initial_patience is not None and self.training_state.num_times_lr_decayed == 0 \
                and self.training_state.cur_attempt >= self.initial_patience:
          should_decay = True
        if should_decay:
          self.training_state.num_times_lr_decayed += 1
          if self.training_state.num_times_lr_decayed > self.lr_decay_times:
            logger.info('  Early stopping')
            self.early_stopping_reached = True
          else:
            self.training_state.cur_attempt = 0
            self.trainer.learning_rate *= self.lr_decay
            logger.info('  new learning rate: %s' % self.trainer.learning_rate)
            if self.restart_trainer:
              logger.info('  restarting trainer and reverting learned weights to best checkpoint..')
              self.trainer.restart()
              ParamManager.param_col.revert_to_best_model()
  return ret

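# Arithmetic sketch of the decay/early-stopping schedule above (all numbers
# are illustrative assumptions): with lr_decay=0.5 and lr_decay_times=3, a run
# whose dev score never improves halves the learning rate three times and then
# stops early.
lr, lr_decay, lr_decay_times, num_times_lr_decayed = 0.1, 0.5, 3, 0
while True:
  num_times_lr_decayed += 1
  if num_times_lr_decayed > lr_decay_times:
    print("early stopping")
    break
  lr *= lr_decay
  print(f"new learning rate: {lr}")  # 0.05, 0.025, 0.0125, then early stopping
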
def __init__(self,
             exp_global: Optional[ExpGlobal] = bare(ExpGlobal),
             preproc: Optional[PreprocRunner] = None,
             model: Optional[GeneratorModel] = None,
             train: Optional[TrainingRegimen] = None,
             evaluate: Optional[List[EvalTask]] = None,
             random_search_report: Optional[dict] = None) -> None:
  self.exp_global = exp_global
  self.preproc = preproc
  self.model = model
  self.train = train
  self.evaluate = evaluate

  if random_search_report:
    logger.info(f"> instantiated random parameter search: {random_search_report}")

def update(self) -> None:
  self.global_step += 1
  if self.rescale_grads:
    torch.nn.utils.clip_grad_norm_(ParamManager.global_collection().parameters(), self.rescale_grads)
  self.scheduler.step()
  if settings.USE_TENSORBOARD:
    tee.tensorboard_writer.add_scalars(name="lr",
                                       tag_scalar_dict={"lr": self.learning_rate * self.lr_factor},
                                       global_step=self.global_step)
    if not self.skip_noisy:
      tee.tensorboard_writer.add_scalars(name="grad",
                                         tag_scalar_dict={"norm": np.exp(self.grad_log_norm())},
                                         global_step=self.global_step)
  if not (self.skip_noisy and self.check_gradients_noisy()):
    self.optimizer.step()
  else:
    logger.info("skipping noisy update")

def log_readable_and_tensorboard(template: str,
                                 args: MutableMapping,
                                 n_iter: numbers.Real,
                                 data_name: str,
                                 task_name: Optional[str] = None,
                                 **kwargs) -> None:
  log_args = dict(args)
  log_args["data_name"] = data_name
  log_args["epoch"] = n_iter
  log_args.update(kwargs)
  if task_name:
    log_args["task_name"] = task_name
  logger.info(template.format(**log_args), extra=log_args)
  from xnmt.tee import tensorboard_writer
  tensorboard_writer.add_scalars(f"{task_name}/{data_name}" if task_name else data_name, args, n_iter)

def log_readable_and_tensorboard(template, args, n_iter, data_name, task_name=None, **kwargs):
  log_args = dict(args)
  log_args["data_name"] = data_name
  log_args["epoch"] = n_iter
  log_args.update(kwargs)
  if task_name:
    log_args["task_name"] = task_name
  logger.info(template.format(**log_args), extra=log_args)
  from xnmt.tee import tensorboard_writer
  tensorboard_writer.add_scalars(f"{task_name}/{data_name}" if task_name else data_name, args, n_iter)

def update(self) -> None:
  """
  Update the parameters.
  """
  try:
    if not (self.skip_noisy and self._check_gradients_noisy()):
      self.optimizer.update()
    else:
      logger.info("skipping noisy update")
  except RuntimeError:
    logger.warning("Failed to perform update. Skipping example and clearing gradients.")
    for subcol in ParamManager.param_col.subcols.values():
      for param in subcol.parameters_list():
        param.scale_gradient(0)

def read_parallel_corpus(src_reader: InputReader,
                         trg_reader: InputReader,
                         src_file: str,
                         trg_file: str,
                         batcher: xnmt.batcher.Batcher = None,
                         sample_sents=None,
                         max_num_sents=None,
                         max_src_len=None,
                         max_trg_len=None):
  """
  A utility function to read a parallel corpus.

  Args:
    src_reader (InputReader):
    trg_reader (InputReader):
    src_file (str):
    trg_file (str):
    batcher (Batcher):
    sample_sents (int): if not None, denotes the number of sents that should be randomly chosen from all available sents.
    max_num_sents (int): if not None, read only the first this many sents
    max_src_len (int): skip pair if src side is too long
    max_trg_len (int): skip pair if trg side is too long

  Returns:
    A tuple of (src_data, trg_data, src_batches, trg_batches) where ``*_batches = *_data`` if ``batcher=None``
  """
  src_data = []
  trg_data = []
  if sample_sents:
    logger.info(f"Starting to read {sample_sents} parallel sentences of {src_file} and {trg_file}")
    src_len = src_reader.count_sents(src_file)
    trg_len = trg_reader.count_sents(trg_file)
    if src_len != trg_len:
      raise RuntimeError(f"training src sentences don't match trg sentences: {src_len} != {trg_len}!")
    if max_num_sents and max_num_sents < src_len:
      src_len = trg_len = max_num_sents
    filter_ids = np.random.choice(src_len, sample_sents, replace=False)
  else:
    logger.info(f"Starting to read {src_file} and {trg_file}")
    filter_ids = None
    src_len, trg_len = 0, 0
  src_train_iterator = src_reader.read_sents(src_file, filter_ids)
  trg_train_iterator = trg_reader.read_sents(trg_file, filter_ids)
  for src_sent, trg_sent in zip_longest(src_train_iterator, trg_train_iterator):
    if src_sent is None or trg_sent is None:
      raise RuntimeError(f"training src sentences don't match trg sentences: "
                         f"{src_len or src_reader.count_sents(src_file)} != "
                         f"{trg_len or trg_reader.count_sents(trg_file)}!")
    if max_num_sents and (max_num_sents <= len(src_data)):
      break
    src_len_ok = max_src_len is None or src_sent.sent_len() <= max_src_len
    trg_len_ok = max_trg_len is None or trg_sent.sent_len() <= max_trg_len
    if src_len_ok and trg_len_ok:
      src_data.append(src_sent)
      trg_data.append(trg_sent)

  logger.info(f"Done reading {src_file} and {trg_file}. Packing into batches.")

  # Pack batches
  if batcher is not None:
    src_batches, trg_batches = batcher.pack(src_data, trg_data)
  else:
    src_batches, trg_batches = src_data, trg_data

  logger.info("Done packing batches.")
  return src_data, trg_data, src_batches, trg_batches

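# Standalone sketch of the length-mismatch guard above (data is illustrative):
# zip_longest pads the shorter iterator with None, so seeing None on either
# side means the parallel files contain different numbers of sentences.
from itertools import zip_longest

src_sents = ["ein haus", "eine katze"]
trg_sents = ["a house"]
for src_sent, trg_sent in zip_longest(src_sents, trg_sents):
  if src_sent is None or trg_sent is None:
    print("mismatch: src sentences don't match trg sentences")
    break
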
def __init__(self,
             src_reader: input_readers.InputReader,
             trg_reader: input_readers.InputReader,
             src_embedder: embedders.Embedder = bare(embedders.LookupEmbedder),
             encoder: recurrent.UniLSTMSeqTransducer = bare(recurrent.UniLSTMSeqTransducer),
             attender: attenders.Attender = bare(attenders.MlpAttender),
             decoder: decoders.Decoder = bare(decoders.AutoRegressiveDecoder),
             inference: inferences.AutoRegressiveInference = bare(inferences.AutoRegressiveInference),
             truncate_dec_batches: bool = False,
             policy_network: Optional[PolicyNetwork] = None,
             policy_train_oracle=False,
             policy_test_oracle=False,
             policy_sample=False,
             read_before_write=False) -> None:
  super().__init__(src_reader=src_reader,
                   trg_reader=trg_reader,
                   encoder=encoder,
                   attender=attender,
                   src_embedder=src_embedder,
                   decoder=decoder,
                   inference=inference,
                   truncate_dec_batches=truncate_dec_batches)
  policy_network = self.add_serializable_component("policy_network", policy_network, lambda: policy_network)
  PolicyConditionedModel.__init__(self, policy_network, policy_train_oracle, policy_test_oracle)
  self.policy_sample = policy_sample
  self.read_before_write = read_before_write

  if self.read_before_write:
    logger.info("Setting looking oracle to always false in SimultTranslator for 'read_before_write'")
    self.policy_train_oracle = False
    self.policy_test_oracle = False

  self.outputs = []
  self.decoder_states = []
  self.model_states = []

def __init__(self,
             name: str,
             exp_global: Optional[ExpGlobal] = bare(ExpGlobal),
             preproc: Optional[preproc.PreprocRunner] = None,
             model: Optional[models_base.TrainableModel] = None,
             train: Optional[regimens.TrainingRegimen] = None,
             evaluate: Optional[List[eval_tasks.EvalTask]] = None,
             random_search_report: Optional[dict] = None,
             status: Optional[str] = None) -> None:
  self.name = name
  self.exp_global = exp_global
  self.preproc = preproc
  self.model = model
  self.train = train
  self.evaluate = evaluate
  self.status = status

  if random_search_report:
    logger.info(f"> instantiated random parameter search: {random_search_report}")

def log_readable_and_tensorboard(template: str,
                                 args: MutableMapping,
                                 n_iter: numbers.Integral,
                                 fractional_epoch: numbers.Real,
                                 data_name: str,
                                 task_name: Optional[str] = None,
                                 **kwargs) -> None:
  log_args = dict(args)
  log_args["data_name"] = data_name
  log_args["epoch"] = fractional_epoch
  log_args["n_iter"] = n_iter
  log_args.update(kwargs)
  if task_name:
    log_args["task_name"] = task_name
  logger.info(template.format(**log_args), extra=log_args)
  if settings.USE_TENSORBOARD:
    from xnmt.tee import tensorboard_writer
    if tensorboard_writer.writer is not None:
      tensorboard_writer.add_scalars(f"{task_name}/{data_name}" if task_name else data_name, args, n_iter)

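# Sketch of the tensorboard side of the loggers above, using torch's standard
# SummaryWriter as a stand-in for xnmt.tee's tensorboard_writer wrapper (tag
# names and values are assumptions):
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="runs/demo")  # hypothetical log directory
# groups the scalars under "task1/dev", mirroring the f-string tag above
writer.add_scalars("task1/dev", {"loss": 3.2, "bleu": 21.4}, global_step=1000)
writer.close()
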
def update(self) -> None:
  """
  Update the parameters.
  """
  self.global_step += 1
  if settings.USE_TENSORBOARD:
    tee.tensorboard_writer.add_scalars(name="lr",
                                       tag_scalar_dict={"lr": self.optimizer.learning_rate},
                                       global_step=self.global_step)
    if not self.skip_noisy:
      tee.tensorboard_writer.add_scalars(name="grad",
                                         tag_scalar_dict={"norm": np.exp(self.grad_log_norm())},
                                         global_step=self.global_step)
  try:
    if not (self.skip_noisy and self.check_gradients_noisy()):
      self.optimizer.update()
    else:
      logger.info("skipping noisy update")
  except RuntimeError:
    logger.warning("Failed to perform update. Skipping example and clearing gradients.")
    for subcol in ParamManager.param_col.subcols.values():
      for param in subcol.parameters_list():
        param.scale_gradient(0)

def populate() -> None:
  """
  Populate the parameter collections.

  Searches the given data paths and loads parameter collections if they exist,
  otherwise leaves parameters in their randomly initialized state.
  """
  assert ParamManager.initialized, "must call ParamManager.init_param_col() first"
  populated_subcols = []
  for subcol_name in ParamManager.param_col.subcols:
    for load_path in ParamManager.load_paths:
      data_file = os.path.join(load_path, subcol_name)
      if os.path.isfile(data_file):
        ParamManager.param_col.load_subcol_from_data_file(subcol_name, data_file)
        populated_subcols.append(subcol_name)
  if len(ParamManager.param_col.subcols) == len(populated_subcols):
    logger.info("> populated DyNet weights of all components from given data files")
  elif len(populated_subcols) == 0:
    logger.info("> use randomly initialized DyNet weights for all components")
  else:
    logger.info(f"> populated a subset of DyNet weights from given data files: {populated_subcols}.\n"
                f"  Did not populate {ParamManager.param_col.subcols.keys() - set(populated_subcols)}.\n"
                f"  (Note: if partial population was not intended, likely the unpopulated component or its owner "
                f"does not adhere to the Serializable protocol correctly, see documentation).")

def extract_to(self, in_file: str, out_file: str):
  output_file = open(out_file, "w")
  counter, num_node_sum1, num_edge_sum1, num_node_sum2, num_edge_sum2 = 0, 0, 0, 0, 0
  with open(in_file) as f:
    for line in f:
      graph = LatticeFromPlfExtractor._Lattice()
      graph.read_plf_line(line)
      graph.insert_initial_node()
      graph.insert_final_node()
      graph.forward()
      graph2 = LatticeFromPlfExtractor._Lattice.convert_to_node_labeled_lattice(graph)
      if len(graph2.nodes) == 1:
        graph2.insert_initial_node()
      serial = graph2.serialize_to_string()
      output_file.write(serial + "\n")
      counter += 1
      num_node_sum1 += len(graph.nodes)
      num_node_sum2 += len(graph2.nodes)
      num_edge_sum1 += len(graph.edges)
      num_edge_sum2 += len(graph2.edges)
      if counter % 1000 == 0:
        logger.info(f"finished {counter} lattices.")
  output_file.close()
  logger.info(f"avg # nodes, # edges for edge-labeled lattices: "
              f"{float(num_node_sum1) / counter}, {float(num_edge_sum1) / counter}")
  logger.info(f"avg # nodes, # edges for node-labeled lattices: "
              f"{float(num_node_sum2) / counter}, {float(num_edge_sum2) / counter}")

def report_dev_and_check_model(self):
  """
  Print dev testing report and check whether the dev loss is the best seen so far.

  Returns:
    True if the dev loss is the best so far and the model therefore requires saving
  """
  this_report_time = time.time()
  sent_num = self.eval_dev_every if self.eval_dev_every != 0 else self.total_train_sent
  self.sent_num_not_report_dev = self.sent_num_not_report_dev % sent_num
  self.fractional_epoch = (self.epoch_num - 1) + self.sent_num / self.total_train_sent
  self.log_readable_and_structured(LossTracker.REPORT_TEMPLATE_DEV,
                                   {"key": "dev_loss",
                                    "epoch": self.fractional_epoch,
                                    "score": self.dev_score,
                                    "words": self.dev_words,
                                    "words_per_sec": self.dev_words / (this_report_time - self.dev_start_time),
                                    "time": self.format_time(this_report_time - self.start_time)})
  save_model = True
  if self.best_dev_score is not None:
    save_model = self.dev_score.better_than(self.best_dev_score)
  if save_model:
    self.best_dev_score = self.dev_score
    logger.info(f"Epoch {self.fractional_epoch:.4f}: best dev score, writing out model")
  return save_model

def __call__(self, save_fct):
  """
  Launch training loop, followed by final evaluation.
  """
  eval_scores = "Not evaluated"
  if self.train:
    logger.info("> Training")
    self.train.run_training(save_fct=save_fct)
    logger.info('reverting learned weights to best checkpoint..')
    ParamManager.param_col.revert_to_best_model()

  evaluate_args = self.evaluate
  if evaluate_args:
    logger.info("> Performing final evaluation")
    eval_scores = []
    for evaluator in evaluate_args:
      eval_score, _ = evaluator.eval()
      if type(eval_score) == list:
        eval_scores.extend(eval_score)
      else:
        eval_scores.append(eval_score)

  return eval_scores