Example #1
 def load_lexicon(self):
     logger.info("Loading lexicon from file: " + self.lexicon_file)
     assert self.src_vocab.frozen
     assert self.trg_vocab.frozen
     lexicon = [{} for _ in range(len(self.src_vocab))]
     with open(self.lexicon_file, encoding='utf-8') as fp:
         for line in fp:
             try:
                 trg, src, prob = line.rstrip().split()
             except ValueError:
                 logger.warning("Failed to parse 'trg src prob' from: " +
                                line.strip())
                 continue
             trg_id = self.trg_vocab.convert(trg)
             src_id = self.src_vocab.convert(src)
             lexicon[src_id][trg_id] = float(prob)
     # Setting the rest of the weight to the unknown word
     for i in range(len(lexicon)):
         sum_prob = sum(lexicon[i].values())
         if sum_prob < 1.0:
             lexicon[i][self.trg_vocab.convert(
                 self.trg_vocab.unk_token)] = 1.0 - sum_prob
     # Overriding special tokens
     src_unk_id = self.src_vocab.convert(self.src_vocab.unk_token)
     trg_unk_id = self.trg_vocab.convert(self.trg_vocab.unk_token)
     lexicon[self.src_vocab.SS] = {self.trg_vocab.SS: 1.0}
     lexicon[self.src_vocab.ES] = {self.trg_vocab.ES: 1.0}
     # TODO(philip30): Not sure if this is intended
     lexicon[src_unk_id] = {trg_unk_id: 1.0}
     return lexicon
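
The lexicon file parsed above holds whitespace-separated "trg src prob" triples, one per line, with any leftover probability mass assigned to the unknown target token. A minimal standalone sketch of parsing one such line (values illustrative, not taken from any real lexicon):

# Hypothetical lexicon line in the "trg src prob" format expected by load_lexicon.
line = "house Haus 0.8\n"
trg, src, prob = line.rstrip().split()
assert (trg, src, float(prob)) == ("house", "Haus", 0.8)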
Example #2
  def __init__(self,
               filename,
               emb_dim=Ref("exp_global.default_layer_dim"),
               weight_noise=Ref("exp_global.weight_noise", default=0.0),
               word_dropout=0.0,
               fix_norm=None,
               vocab=None,
               yaml_path=None,
               src_reader = Ref("model.src_reader", default=None),
               trg_reader = Ref("model.trg_reader", default=None)):
    self.emb_dim = emb_dim
    self.weight_noise = weight_noise
    self.word_dropout = word_dropout
    self.word_id_mask = None
    self.train = False
    self.fix_norm = fix_norm
    self.pretrained_filename = filename
    param_collection = ParamManager.my_params(self)
    self.vocab = self.choose_vocab(vocab, yaml_path, src_reader, trg_reader)
    # Use the resolved vocab: the `vocab` argument may be None and inferred from yaml_path or the readers.
    self.vocab_size = len(self.vocab)
    self.save_processed_arg("vocab", self.vocab)
    with open(self.pretrained_filename, encoding='utf-8') as embeddings_file:
      total_embs, in_vocab, missing, initial_embeddings = self._read_fasttext_embeddings(self.vocab, embeddings_file)
    self.embeddings = param_collection.lookup_parameters_from_numpy(initial_embeddings)

    logger.info(f"{in_vocab} vocabulary matches out of {total_embs} total embeddings; "
                f"{missing} vocabulary words without a pretrained embedding out of {self.vocab_size}")
Example #3
    def read_sents(self, filename, filter_ids=None):
        with h5py.File(filename, "r") as hf:
            h5_keys = sorted(hf.keys(), key=lambda x: int(x))
            if filter_ids is not None:
                filter_ids = sorted(filter_ids)
                h5_keys = [h5_keys[i] for i in filter_ids]
                h5_keys.sort(key=lambda x: int(x))
            for sent_no, key in enumerate(h5_keys):
                inp = hf[key][:]
                if self.transpose:
                    inp = inp.transpose()

                sub_inp = inp[self.feat_from:self.feat_to:self.feat_skip,
                              :self.timestep_truncate:self.timestep_skip]
                if sub_inp.size < inp.size:
                    inp = np.empty_like(sub_inp)
                    np.copyto(inp, sub_inp)
                else:
                    inp = sub_inp

                if sent_no % 1000 == 999:
                    logger.info(
                        f"Read {sent_no+1} lines ({float(sent_no+1)/len(h5_keys)*100:.2f}%) of {filename} at {key}"
                    )
                yield ArraySentence(
                    idx=filter_ids[sent_no] if filter_ids else sent_no,
                    nparr=inp)
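
The two-axis slice above selects a feature range with a stride and truncates/strides the timestep axis. A standalone NumPy illustration, with hypothetical values for the slice parameters:

import numpy as np

inp = np.arange(40).reshape(8, 5)  # (features, timesteps)
# e.g. feat_from=1, feat_to=7, feat_skip=2, timestep_truncate=4, timestep_skip=2
sub_inp = inp[1:7:2, :4:2]
assert sub_inp.shape == (3, 2)     # features 1, 3, 5 and timesteps 0, 2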
Example #4
  def perform_inference(self, generator: 'models.GeneratorModel', src_file: str = None, trg_file: str = None) \
          -> None:
    """
    Perform inference.

    Args:
      generator: the model to be used
      src_file: path of input src file to be translated
      trg_file: path of file where trg translations will be written
    """
    src_file = src_file or self.src_file
    trg_file = trg_file or self.trg_file
    utils.make_parent_dir(trg_file)

    logger.info(f'Performing inference on {src_file}')

    ref_corpus, src_corpus = self._read_corpus(generator, src_file, mode=self.mode, ref_file=self.ref_file)

    event_trigger.set_train(False)

    ref_scores = None
    if self.mode == 'score':
      ref_scores = self._compute_losses(generator, ref_corpus, src_corpus, self.max_num_sents)
      self._write_rescored_output(ref_scores, self.ref_file, trg_file)

    if self.mode == 'forceddebug':
      ref_scores = self._compute_losses(generator, ref_corpus, src_corpus, self.max_num_sents)

    if self.mode != 'score':
      self._generate_output(generator=generator, forced_ref_corpus=ref_corpus, assert_scores=ref_scores,
                            src_corpus=src_corpus, trg_file=trg_file, batcher=self.batcher,
                            max_src_len=self.max_src_len)
Example #5
    def read_sents(self, filename, filter_ids=None):
        npzFile = np.load(filename,
                          mmap_mode=None if filter_ids is None else "r")
        npzKeys = sorted(npzFile.files, key=lambda x: int(x.split('_')[-1]))
        if filter_ids is not None:
            filter_ids = sorted(filter_ids)
            npzKeys = [npzKeys[i] for i in filter_ids]
            npzKeys.sort(key=lambda x: int(x.split('_')[-1]))
        for sent_no, key in enumerate(npzKeys):
            inp = npzFile[key]
            if self.transpose:
                inp = inp.transpose()

            sub_inp = inp[self.feat_from:self.feat_to:self.feat_skip,
                          :self.timestep_truncate:self.timestep_skip]
            if sub_inp.size < inp.size:
                inp = np.empty_like(sub_inp)
                np.copyto(inp, sub_inp)
            else:
                inp = sub_inp

            if sent_no % 1000 == 999:
                logger.info(
                    f"Read {sent_no+1} lines ({float(sent_no+1)/len(npzKeys)*100:.2f}%) of {filename} at {key}"
                )
            yield ArraySentence(
                idx=filter_ids[sent_no] if filter_ids else sent_no, nparr=inp)
        npzFile.close()
Example #6
def xnmt_evaluate(ref_file: OneOrSeveral[str],
                  hyp_file: OneOrSeveral[str],
                  evaluators: Sequence[Evaluator],
                  desc: Any = None) -> Sequence[EvalScore]:
    """"Returns the eval score (e.g. BLEU) of the hyp sents using reference trg sents

  Args:
    ref_file: path of the reference file
    hyp_file: path of the hypothesis trg file
    evaluators: Evaluation metrics. Can be a list of evaluator objects, or a shortcut string
    desc: descriptive string passed on to evaluators
  """
    hyp_postprocess = lambda line: line.split()
    ref_postprocess = lambda line: line.split()

    ref_corpus = read_data(ref_file, post_process=ref_postprocess)
    hyp_corpus = read_data(hyp_file, post_process=hyp_postprocess)
    len_before = len(hyp_corpus)
    ref_corpus, hyp_corpus = zip(
        *filter(lambda x: NO_DECODING_ATTEMPTED not in x[1],
                zip(ref_corpus, hyp_corpus)))
    if len(ref_corpus) < len_before:
        logger.info(
            f"> ignoring {len_before - len(ref_corpus)} out of {len_before} test sentences."
        )

    return [
        evaluator.evaluate(ref_corpus, hyp_corpus, desc=desc)
        for evaluator in evaluators
    ]
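
A hypothetical invocation (evaluator construction elided; the bleu_evaluator name is illustrative, not part of the example):

# Sketch only: assumes an Evaluator instance, e.g. for BLEU, has been constructed elsewhere.
scores = xnmt_evaluate(ref_file="test.ref", hyp_file="test.hyp",
                       evaluators=[bleu_evaluator])
print(scores[0])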
Example #7
 def report(self) -> None:
     this_report_time = time.time()
     self.last_report_sents_since_start = self.training_task.training_state.sents_since_start
     self.fractional_epoch = (self.training_task.training_state.epoch_num - 1) \
                             + self.training_task.training_state.sents_into_epoch / self.training_task.cur_num_sentences()
     dev_time = self.time_tracker.get_and_reset()
     utils.log_readable_and_tensorboard(
         template=DevLossTracker.REPORT_TEMPLATE_DEV,
         args={self.dev_score.metric_name(): self.dev_score.value()},
         n_iter=self.fractional_epoch,
         data_name="dev",
         task_name=self.name,
         score=self.dev_score,
         time=utils.format_time(this_report_time - self.start_time))
     for score in self.aux_scores:
         utils.log_readable_and_tensorboard(
             template=DevLossTracker.REPORT_TEMPLATE_DEV_AUX,
             args={score.metric_name(): score.value()},
             n_iter=self.fractional_epoch,
             data_name="dev",
             task_name=self.name,
             score=score)
     logger.info(
         DevLossTracker.REPORT_TEMPLATE_TIME_NEEDED.format(
             time_needed=utils.format_time(dev_time)),
         extra={"task_name": self.name})
     self.aux_scores = []
Example #8
  def _augment_data_next_epoch(self):
    """
    This is run in the background if reload_command is given, to prepare data for the next epoch.
    """
    augment_command = self.reload_command
    if self._augmentation_handle is None:
      # first run
      self._augmentation_handle = Popen(augment_command + " --epoch %d" % self.training_state.epoch_num, shell=True)
      self._augmentation_handle.wait()

    self._augmentation_handle.poll()
    retcode = self._augmentation_handle.returncode
    if retcode is not None:
      if self.training_state.epoch_num > 0:
        logger.info('using reloaded data')
      # reload the data
      self.src_data, self.trg_data, self.src_batches, self.trg_batches = \
          xnmt.input_reader.read_parallel_corpus(self.model.src_reader, self.model.trg_reader,
                                          self.src_file, self.trg_file,
                                          batcher=self.batcher, sample_sents=self.sample_train_sents,
                                          max_num_sents=self.max_num_train_sents,
                                          max_src_len=self.max_src_len, max_trg_len=self.max_trg_len)
      # restart data generation
      self._augmentation_handle = Popen(augment_command + " --epoch %d" % self.training_state.epoch_num, shell=True)
    else:
      logger.info('new data set is not ready yet, using data from last epoch.')
Example #9
    def update(self):
        self.steps += 1
        decay = (self.dim**(-0.5)) * np.min(
            [self.steps**(-0.5), self.steps * (self.warmup_steps**(-1.5))])
        self.optimizer.learning_rate = 1. * decay
        super().update()

        if self.steps % 200 == 0:
            logger.info('> Optimizer Logging')
            logger.info('  Steps=%d, learning_rate=%.2e' %
                        (self.steps, self.optimizer.learning_rate))
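
This update() implements the inverse-square-root ("Noam") learning-rate schedule from the Transformer paper: lr = dim^-0.5 * min(step^-0.5, step * warmup_steps^-1.5), i.e. a linear warmup followed by decay proportional to step^-0.5. A standalone sketch of the same formula (dim and warmup_steps values hypothetical):

import math

def noam_lr(step, dim=512, warmup_steps=4000):
    # Linear warmup for warmup_steps steps, then decay proportional to step ** -0.5.
    return (dim ** -0.5) * min(step ** -0.5, step * warmup_steps ** -1.5)

assert math.isclose(noam_lr(4000), (512 ** -0.5) * (4000 ** -0.5))  # peak at the warmup boundary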
Example #10
    def __init__(self,
                 emb_dim: int = Ref("exp_global.default_layer_dim"),
                 vocab_size: Optional[int] = None,
                 vocab: Optional[vocabs.Vocab] = None,
                 yaml_path: Path = Path(''),
                 src_reader: Optional[input_readers.InputReader] = Ref(
                     "model.src_reader", default=None),
                 trg_reader: Optional[input_readers.InputReader] = Ref(
                     "model.trg_reader", default=None),
                 is_dense: bool = False,
                 param_init: pinit.ParamInitializer = Ref(
                     "exp_global.param_init",
                     default=bare(pinit.GlorotInitializer)),
                 bias_init: pinit.ParamInitializer = Ref(
                     "exp_global.bias_init",
                     default=bare(pinit.ZeroInitializer)),
                 init_fastext: Optional[str] = None,
                 weight_noise: float = Ref("exp_global.weight_noise",
                                           default=0.0),
                 fix_norm: Optional[float] = None):
        super().__init__(emb_dim=emb_dim,
                         weight_noise=weight_noise,
                         fix_norm=fix_norm)
        # Embedding Parameters
        pcol = param_collections.ParamManager.my_params(self)
        self.vocab_size = self.choose_vocab_size(vocab_size, vocab, yaml_path,
                                                 src_reader, trg_reader)
        emb_mtr_dim = (self.vocab_size, self.emb_dim)

        if init_fastext is not None:
            logger.info("Setting Dense to False because of init_fastext")
            is_dense = False

        if not is_dense:
            if init_fastext is not None:
                self.embeddings = pcol.lookup_parameters_from_numpy(
                    self._read_fasttext_embeddings(vocab, init_fastext))
            else:
                self.embeddings = pcol.add_lookup_parameters(
                    emb_mtr_dim,
                    init=param_init.initializer(emb_mtr_dim, is_lookup=True))
        else:
            self.embeddings = pcol.add_parameters(emb_mtr_dim,
                                                  init=param_init.initializer(
                                                      emb_mtr_dim,
                                                      is_lookup=True))
            self.bias = pcol.add_parameters((self.vocab_size, ),
                                            init=bias_init.initializer(
                                                (self.vocab_size, )))

        # Model States
        self.is_dense = is_dense
        self.train = False
        self.save_processed_arg("vocab_size", self.vocab_size)
Example #11
  def update(self) -> None:
    self.steps += 1
    if self.warmup_steps:
      decay = (self.dim ** (-0.5)) * np.min([self.steps ** (-0.5), self.steps * (self.warmup_steps ** (-1.5))])
    else:
      decay = (self.dim ** (-0.5)) * self.steps ** (-0.5)
    self.lr_factor = 1. * decay
    super().update()

    if self.steps % 200 == 0:
      logger.info('> Optimizer Logging')
      logger.info(f'  Steps={self.steps}, learning_rate={self.lr_factor:.2e}')
Example #12
    def __init__(self, tasks: List[PreprocTask] = [], overwrite: bool = False):
        logger.info("> Preprocessing")

        for task in tasks:
            # Sanity check
            if len(task.in_files) != len(task.out_files):
                raise RuntimeError(
                    "Length of in_files and out_files in preprocessor must be identical"
                )

            task.run_preproc_task(overwrite=overwrite)
Example #13
    def _read_fasttext_embeddings(self, vocab: vocabs.Vocab, init_fastext):
        """
    Reads FastText embeddings from a file. Also prints stats about the loaded embeddings for sanity checking.

    Args:
      vocab: a `Vocab` object containing the vocabulary for the experiment
      embeddings_file_handle: A file handle on the embeddings file. The embeddings must be in FastText text
                              format.
    Returns:
      tuple: A tuple of (total number of embeddings read, # embeddings that match vocabulary words, # vocabulary words
     without a matching embedding, embeddings array).
    """
        with open(init_fastext, encoding='utf-8') as embeddings_file_handle:
            _, dimension = next(embeddings_file_handle).split()
            if int(dimension) != self.emb_dim:
                raise Exception(
                    f"An embedding size of {self.emb_dim} was specified, but the pretrained embeddings have size {dimension}"
                )

            # Poor man's Glorot initializer for missing embeddings
            bound = np.sqrt(6 / (self.vocab_size + self.emb_dim))

            total_embs = 0
            in_vocab = 0
            missing = 0

            embeddings = np.empty((self.vocab_size, self.emb_dim),
                                  dtype='float')
            found = np.zeros(self.vocab_size, dtype='bool_')

            for line in embeddings_file_handle:
                total_embs += 1
                word, vals = line.strip().split(' ', 1)
                if word in vocab.w2i:
                    in_vocab += 1
                    index = vocab.w2i[word]
                    embeddings[index] = np.fromstring(vals, sep=" ")
                    found[index] = True

            for i in range(self.vocab_size):
                if not found[i]:
                    missing += 1
                    embeddings[i] = np.random.uniform(-bound, bound,
                                                      self.emb_dim)

            logger.info(
                f"{in_vocab} vocabulary matches out of {total_embs} total embeddings; "
                f"{missing} vocabulary words without a pretrained embedding out of {self.vocab_size}"
            )

        return embeddings
Example #14
    @staticmethod
    def populate() -> None:
        """
        Populate the parameter collections.

        Searches the given data paths and loads parameter collections if they exist; otherwise parameters are left in
        their randomly initialized state.
        """
        assert ParamManager.initialized, "must call ParamManager.init_param_col() first"
        populated_subcols = []
        for subcol_name in ParamManager.param_col.subcols:
            for load_path in ParamManager.load_paths:
                data_file = os.path.join(load_path, subcol_name)
                if os.path.isfile(data_file):
                    ParamManager.param_col.load_subcol_from_data_file(
                        subcol_name, data_file)
                    populated_subcols.append(subcol_name)
        if len(ParamManager.param_col.subcols) == len(populated_subcols):
            logger.info(
                "> populated neural network parameters of all components from given data files"
            )
        elif len(populated_subcols) == 0:
            logger.info(
                "> using randomly initialized neural network parameters for all components"
            )
        else:
            logger.info(
                f"> populated a subset of neural network parameters from given data files: {populated_subcols}.\n"
                f"  Did not populate {ParamManager.param_col.subcols.keys() - set(populated_subcols)}.\n"
                f"  If partial population was not intended, likely the unpopulated component or its owner"
                f"   does not adhere to the Serializable protocol correctly, see documentation:\n"
                f"   http://xnmt.readthedocs.io/en/latest/writing_xnmt_classes.html#using-serializable-subcomponents"
            )
        logger.info(
            f"  neural network param count: {ParamManager.param_col.parameter_count()}"
        )
Example #15
    def __call__(self, save_fct):
        """
    Launch training loop, followed by final evaluation.
    """
        eval_scores = ["Not evaluated"]
        if self.status != "done":
            if self.train:
                logger.info("> Training")
                self.train.run_training(save_fct=save_fct)
                logger.info('reverting learned weights to best checkpoint..')
                try:
                    ParamManager.param_col.revert_to_best_model()
                except RevertingUnsavedModelException:
                    pass

            evaluate_args = self.evaluate
            if evaluate_args:
                logger.info("> Performing final evaluation")
                eval_scores = []
                for evaluator in evaluate_args:
                    eval_score = evaluator.eval()
                    if isinstance(eval_score, list):
                        eval_scores.extend(eval_score)
                    else:
                        eval_scores.append(eval_score)

            self.save_processed_arg("status", "done")
            save_fct()
        else:
            logger.info("Experiment already finished, skipping.")

        return eval_scores
Example #16
  def checkpoint(self, control_learning_schedule=True):
    """
    Performs a dev checkpoint.

    Args:
      control_learning_schedule: If False, only evaluate dev data.
                                 If True, also perform model saving, LR decay etc. if needed.
    Returns:
      True if the model needs saving, False otherwise.
    """
    ret = False
    self.logger.new_dev()

    # Perform evaluation
    if self.dev_tasks and len(self.dev_tasks) > 0:
      dev_scores = []
      for dev_task in self.dev_tasks:
        dev_score, dev_word_cnt = dev_task.eval()
        if isinstance(dev_score, list):
          dev_scores.extend(dev_score)
        else:
          dev_scores.append(dev_score)
      # TODO: This is passing "1" for the number of words, as this is not implemented yet
      self.logger.set_dev_score(dev_word_cnt, dev_scores[0])
      for dev_score in dev_scores[1:]:
        self.logger.report_auxiliary_score(dev_score)

    # Control the learning schedule
    if control_learning_schedule:
      logger.info("> Checkpoint")
      # Write out the model if it's the best one
      if self.logger.report_dev_and_check_model():
        ret = True
        self.training_state.cur_attempt = 0
      else:
        # otherwise: learning rate decay / early stopping
        self.training_state.cur_attempt += 1
        if self.lr_decay < 1.0:
          should_decay = False
          if (self.initial_patience is None or self.training_state.num_times_lr_decayed>0) \
                  and self.training_state.cur_attempt >= self.patience:
            should_decay = True
          if self.initial_patience is not None and self.training_state.num_times_lr_decayed==0 \
                  and self.training_state.cur_attempt >= self.initial_patience:
            should_decay = True
          if should_decay:
            self.training_state.num_times_lr_decayed += 1
            if self.training_state.num_times_lr_decayed > self.lr_decay_times:
              logger.info('  Early stopping')
              self.early_stopping_reached = True
            else:
              self.training_state.cur_attempt = 0
              self.trainer.learning_rate *= self.lr_decay
              logger.info('  new learning rate: %s' % self.trainer.learning_rate)
              if self.restart_trainer:
                logger.info('  restarting trainer and reverting learned weights to best checkpoint..')
                self.trainer.restart()
                ParamManager.param_col.revert_to_best_model()

    return ret
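
The decay condition above distinguishes the first decay, governed by initial_patience when that is set, from all later decays, governed by patience. A standalone paraphrase of that decision rule (function name hypothetical):

def should_decay_lr(cur_attempt, patience, initial_patience, num_times_lr_decayed):
    # Before the first decay, an explicit initial_patience takes precedence over patience.
    if initial_patience is not None and num_times_lr_decayed == 0:
        return cur_attempt >= initial_patience
    return cur_attempt >= patience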
Example #17
  def __init__(self,
               exp_global: Optional[ExpGlobal] = bare(ExpGlobal),
               preproc: Optional[PreprocRunner] = None,
               model: Optional[GeneratorModel] = None,
               train: Optional[TrainingRegimen] = None,
               evaluate: Optional[List[EvalTask]] = None,
               random_search_report: Optional[dict] = None) -> None:
    self.exp_global = exp_global
    self.preproc = preproc
    self.model = model
    self.train = train
    self.evaluate = evaluate

    if random_search_report:
      logger.info(f"> instantiated random parameter search: {random_search_report}")
Example #18
 def update(self) -> None:
   self.global_step += 1
   if self.rescale_grads:
     torch.nn.utils.clip_grad_norm_(ParamManager.global_collection().parameters(), self.rescale_grads)
   self.scheduler.step()
   if settings.USE_TENSORBOARD:
     tee.tensorboard_writer.add_scalars(name="lr", tag_scalar_dict={"lr": self.learning_rate * self.lr_factor},
                                        global_step=self.global_step)
     if not self.skip_noisy:
       tee.tensorboard_writer.add_scalars(name="grad", tag_scalar_dict={"norm": np.exp(self.grad_log_norm())},
                                          global_step=self.global_step)
   if not (self.skip_noisy and self.check_gradients_noisy()):
     self.optimizer.step()
   else:
     logger.info("skipping noisy update")
Example #19
def log_readable_and_tensorboard(template: str,
                                 args: MutableMapping,
                                 n_iter: numbers.Real,
                                 data_name: str,
                                 task_name: Optional[str] = None,
                                 **kwargs) -> None:
    log_args = dict(args)
    log_args["data_name"] = data_name
    log_args["epoch"] = n_iter
    log_args.update(kwargs)
    if task_name: log_args["task_name"] = task_name
    logger.info(template.format(**log_args), extra=log_args)

    from xnmt.tee import tensorboard_writer
    tensorboard_writer.add_scalars(
        f"{task_name}/{data_name}" if task_name else data_name, args, n_iter)
Example #20
def log_readable_and_tensorboard(template,
                                 args,
                                 n_iter,
                                 data_name,
                                 task_name=None,
                                 **kwargs):
    log_args = dict(args)
    log_args["data_name"] = data_name
    log_args["epoch"] = n_iter
    log_args.update(kwargs)
    if task_name: log_args["task_name"] = task_name
    logger.info(template.format(**log_args), extra=log_args)

    from xnmt.tee import tensorboard_writer
    tensorboard_writer.add_scalars(
        f"{task_name}/{data_name}" if task_name else data_name, args, n_iter)
Example #21
 def update(self) -> None:
     """
 Update the parameters.
 """
     try:
         if not (self.skip_noisy and self._check_gradients_noisy()):
             self.optimizer.update()
         else:
             logger.info("skipping noisy update")
     except RuntimeError:
         logger.warning(
             "Failed to perform update. Skipping example and clearing gradients."
         )
         for subcol in ParamManager.param_col.subcols.values():
             for param in subcol.parameters_list():
                 param.scale_gradient(0)
Example #22
def read_parallel_corpus(src_reader: InputReader, trg_reader: InputReader, src_file: str, trg_file: str,
                         batcher: xnmt.batcher.Batcher = None, sample_sents=None, max_num_sents=None, max_src_len=None, max_trg_len=None):
  """
  A utility function to read a parallel corpus.

  Args:
    src_reader (InputReader):
    trg_reader (InputReader):
    src_file (str):
    trg_file (str):
    batcher (Batcher):
    sample_sents (int): if not None, denotes the number of sents that should be randomly chosen from all available sents.
    max_num_sents (int): if not None, read only the first this many sents
    max_src_len (int): skip pair if src side is too long
    max_trg_len (int): skip pair if trg side is too long

  Returns:
    A tuple of (src_data, trg_data, src_batches, trg_batches) where ``*_batches = *_data`` if ``batcher=None``
  """
  src_data = []
  trg_data = []
  if sample_sents:
    logger.info(f"Starting to read {sample_sents} parallel sentences of {src_file} and {trg_file}")
    src_len = src_reader.count_sents(src_file)
    trg_len = trg_reader.count_sents(trg_file)
    if src_len != trg_len: raise RuntimeError(f"training src sentences don't match trg sentences: {src_len} != {trg_len}!")
    if max_num_sents and max_num_sents < src_len: src_len = trg_len = max_num_sents
    filter_ids = np.random.choice(src_len, sample_sents, replace=False)
  else:
    logger.info(f"Starting to read {src_file} and {trg_file}")
    filter_ids = None
    src_len, trg_len = 0, 0
  src_train_iterator = src_reader.read_sents(src_file, filter_ids)
  trg_train_iterator = trg_reader.read_sents(trg_file, filter_ids)
  for src_sent, trg_sent in zip_longest(src_train_iterator, trg_train_iterator):
    if src_sent is None or trg_sent is None:
      raise RuntimeError(f"training src sentences don't match trg sentences: {src_len or src_reader.count_sents(src_file)} != {trg_len or trg_reader.count_sents(trg_file)}!")
    if max_num_sents and (max_num_sents <= len(src_data)):
      break
    src_len_ok = max_src_len is None or src_sent.sent_len() <= max_src_len
    trg_len_ok = max_trg_len is None or trg_sent.sent_len() <= max_trg_len
    if src_len_ok and trg_len_ok:
      src_data.append(src_sent)
      trg_data.append(trg_sent)

  logger.info(f"Done reading {src_file} and {trg_file}. Packing into batches.")

  # Pack batches
  if batcher is not None:
    src_batches, trg_batches = batcher.pack(src_data, trg_data)
  else:
    src_batches, trg_batches = src_data, trg_data

  logger.info(f"Done packing batches.")

  return src_data, trg_data, src_batches, trg_batches
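
A hypothetical call (reader and batcher construction elided; the file names and my_batcher variable are illustrative):

# Sketch only: assumes src_reader/trg_reader are InputReader instances and my_batcher is a Batcher.
src_data, trg_data, src_batches, trg_batches = read_parallel_corpus(
    src_reader, trg_reader, "train.src", "train.trg",
    batcher=my_batcher, max_src_len=80, max_trg_len=80)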
Example #23
    def __init__(self,
                 src_reader: input_readers.InputReader,
                 trg_reader: input_readers.InputReader,
                 src_embedder: embedders.Embedder = bare(
                     embedders.LookupEmbedder),
                 encoder: recurrent.UniLSTMSeqTransducer = bare(
                     recurrent.UniLSTMSeqTransducer),
                 attender: attenders.Attender = bare(attenders.MlpAttender),
                 decoder: decoders.Decoder = bare(
                     decoders.AutoRegressiveDecoder),
                 inference: inferences.AutoRegressiveInference = bare(
                     inferences.AutoRegressiveInference),
                 truncate_dec_batches: bool = False,
                 policy_network: Optional[PolicyNetwork] = None,
                 policy_train_oracle=False,
                 policy_test_oracle=False,
                 policy_sample=False,
                 read_before_write=False) -> None:
        super().__init__(src_reader=src_reader,
                         trg_reader=trg_reader,
                         encoder=encoder,
                         attender=attender,
                         src_embedder=src_embedder,
                         decoder=decoder,
                         inference=inference,
                         truncate_dec_batches=truncate_dec_batches)
        policy_network = self.add_serializable_component(
            "policy_network", policy_network, lambda: policy_network)
        PolicyConditionedModel.__init__(self, policy_network,
                                        policy_train_oracle,
                                        policy_test_oracle)
        self.policy_sample = policy_sample
        self.read_before_write = read_before_write

        if self.read_before_write:
            logger.info(
                "Setting looking oracle to always false in SimultTranslator for 'read_before_write'"
            )
            self.policy_train_oracle = False
            self.policy_test_oracle = False

        self.outputs = []
        self.decoder_states = []
        self.model_states = []
Example #24
  def __init__(self,
               name: str,
               exp_global: Optional[ExpGlobal] = bare(ExpGlobal),
               preproc: Optional[preproc.PreprocRunner] = None,
               model: Optional[models_base.TrainableModel] = None,
               train: Optional[regimens.TrainingRegimen] = None,
               evaluate: Optional[List[eval_tasks.EvalTask]] = None,
               random_search_report: Optional[dict] = None,
               status: Optional[str] = None) -> None:
    self.name = name
    self.exp_global = exp_global
    self.preproc = preproc
    self.model = model
    self.train = train
    self.evaluate = evaluate
    self.status = status

    if random_search_report:
      logger.info(f"> instantiated random parameter search: {random_search_report}")
Example #25
def log_readable_and_tensorboard(template: str,
                                 args: MutableMapping,
                                 n_iter: numbers.Integral,
                                 fractional_epoch: numbers.Real,
                                 data_name: str,
                                 task_name: Optional[str] = None,
                                 **kwargs) -> None:
    log_args = dict(args)
    log_args["data_name"] = data_name
    log_args["epoch"] = fractional_epoch
    log_args["n_iter"] = n_iter
    log_args.update(kwargs)
    if task_name: log_args["task_name"] = task_name
    logger.info(template.format(**log_args), extra=log_args)

    if settings.USE_TENSORBOARD:
        from xnmt.tee import tensorboard_writer
        if tensorboard_writer.writer is not None:
            tensorboard_writer.add_scalars(
                f"{task_name}/{data_name}" if task_name else data_name, args,
                n_iter)
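
A hypothetical call, assuming a template that consumes the keys populated above (template text illustrative):

log_readable_and_tensorboard(
    template="Epoch {epoch:.2f} [{data_name}]: loss={loss:.3f}",
    args={"loss": 3.21},
    n_iter=1500,
    fractional_epoch=0.75,
    data_name="train")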
Example #26
 def update(self) -> None:
   """
   Update the parameters.
   """
   self.global_step += 1
   if settings.USE_TENSORBOARD:
     tee.tensorboard_writer.add_scalars(name="lr", tag_scalar_dict={"lr": self.optimizer.learning_rate},
                                        global_step=self.global_step)
     if not self.skip_noisy:
      tee.tensorboard_writer.add_scalars(name="grad", tag_scalar_dict={"norm": np.exp(self.grad_log_norm())},
                                         global_step=self.global_step)
   try:
     if not (self.skip_noisy and self.check_gradients_noisy()):
       self.optimizer.update()
     else:
       logger.info("skipping noisy update")
   except RuntimeError:
     logger.warning("Failed to perform update. Skipping example and clearing gradients.")
     for subcol in ParamManager.param_col.subcols.values():
       for param in subcol.parameters_list():
         param.scale_gradient(0)
Example #27
    @staticmethod
    def populate() -> None:
        """
        Populate the parameter collections.

        Searches the given data paths and loads parameter collections if they exist; otherwise parameters are left in
        their randomly initialized state.
        """
        assert ParamManager.initialized, "must call ParamManager.init_param_col() first"
        populated_subcols = []
        for subcol_name in ParamManager.param_col.subcols:
            for load_path in ParamManager.load_paths:
                data_file = os.path.join(load_path, subcol_name)
                if os.path.isfile(data_file):
                    ParamManager.param_col.load_subcol_from_data_file(
                        subcol_name, data_file)
                    populated_subcols.append(subcol_name)
        if len(ParamManager.param_col.subcols) == len(populated_subcols):
            logger.info(
                "> populated DyNet weights of all components from given data files"
            )
        elif len(populated_subcols) == 0:
            logger.info(
                "> using randomly initialized DyNet weights for all components")
        else:
            logger.info(
                f"> populated a subset of DyNet weights from given data files: {populated_subcols}.\n"
                f"  Did not populate {ParamManager.param_col.subcols.keys() - set(populated_subcols)}.\n"
                f"  (Note: if partial population was not intended, likely the unpopulated component or its owner"
                f"   does not adhere to the Serializable protocol correctly, see documentation)."
            )
Example #28
    def extract_to(self, in_file: str, out_file: str):

        output_file = open(out_file, "w")

        counter, num_node_sum1, num_edge_sum1, num_node_sum2, num_edge_sum2 = 0, 0, 0, 0, 0
        with open(in_file) as f:
            for line in f:
                graph = LatticeFromPlfExtractor._Lattice()
                graph.read_plf_line(line)
                graph.insert_initial_node()
                graph.insert_final_node()
                graph.forward()
                graph2 = LatticeFromPlfExtractor._Lattice.convert_to_node_labeled_lattice(
                    graph)
                if len(graph2.nodes) == 1:
                    graph2.insert_initial_node()
                serial = graph2.serialize_to_string()
                output_file.write(serial + "\n")
                counter += 1
                num_node_sum1 += len(graph.nodes)
                num_node_sum2 += len(graph2.nodes)
                num_edge_sum1 += len(graph.edges)
                num_edge_sum2 += len(graph2.edges)
                if counter % 1000 == 0:
                    logger.info(f"finished {counter} lattices.")

        output_file.close()

        logger.info(
            f"avg # nodes, # edges for edge-labeled lattices: {float(num_node_sum1) / counter}, {float(num_edge_sum1) / counter}"
        )
        logger.info(
            f"avg # nodes, # edges for node-labeled lattices: {float(num_node_sum2) / counter}, {float(num_edge_sum2) / counter}"
        )
Example #29
    def report_dev_and_check_model(self):
        """
    Print dev testing report and check whether the dev loss is the best seen so far.

    Return:
      True if the dev loss is the best and required save operations
    """
        this_report_time = time.time()
        sent_num = self.eval_dev_every if self.eval_dev_every != 0 else self.total_train_sent
        self.sent_num_not_report_dev = self.sent_num_not_report_dev % sent_num
        self.fractional_epoch = (self.epoch_num -
                                 1) + self.sent_num / self.total_train_sent
        self.log_readable_and_structured(
            LossTracker.REPORT_TEMPLATE_DEV, {
                "key": "dev_loss",
                "epoch": self.fractional_epoch,
                "score": self.dev_score,
                "words": self.dev_words,
                "words_per_sec": self.dev_words / (this_report_time - self.dev_start_time),
                "time": self.format_time(this_report_time - self.start_time),
            })

        save_model = True
        if self.best_dev_score is not None:
            save_model = self.dev_score.better_than(self.best_dev_score)
        if save_model:
            self.best_dev_score = self.dev_score
            logger.info(
                f"Epoch {self.fractional_epoch:.4f}: best dev score, writing out model"
            )
        return save_model
Example #30
    def __call__(self, save_fct):
        """
    Launch training loop, followed by final evaluation.
    """
        eval_scores = "Not evaluated"
        if self.train:
            logger.info("> Training")
            self.train.run_training(save_fct=save_fct)
            logger.info('reverting learned weights to best checkpoint..')
            ParamManager.param_col.revert_to_best_model()

        evaluate_args = self.evaluate
        if evaluate_args:
            logger.info("> Performing final evaluation")
            eval_scores = []
            for evaluator in evaluate_args:
                eval_score, _ = evaluator.eval()
                if isinstance(eval_score, list):
                    eval_scores.extend(eval_score)
                else:
                    eval_scores.append(eval_score)

        return eval_scores