Exemplo n.º 1
0
 def on_end_inference(self):
     """Dump the collected sentences and write a compare-mt report.

     Nothing happens when ``self.hyp_sents`` is empty. Otherwise:

     1. ``self.ref_sents`` and ``self.hyp_sents`` are written, one stripped
        sentence per line, to ``<report_path>/tmp/compare-mt.ref`` and
        ``<report_path>/tmp/compare-mt.out``.
     2. ``compare_mt.main`` is called with those files plus the configured
        options, and the lines it returns are written to
        ``<report_path>/compare-mt.txt``.
     3. The hypothesis, reference and source buffers are reset to empty lists.

     All files are written as UTF-8 so that non-ASCII sentences do not depend
     on the locale's default encoding.
     """
     if not self.hyp_sents:
         return

     ref_filename = f"{self.report_path}/tmp/compare-mt.ref"
     out_filename = f"{self.report_path}/tmp/compare-mt.out"
     # Both temporary files share one directory, so a single call covers them
     # (make_parent_dir presumably creates the missing parent directory).
     util.make_parent_dir(out_filename)
     for filename, sents in ((ref_filename, self.ref_sents),
                             (out_filename, self.hyp_sents)):
         # NOTE(review): assumes compare_mt reads its inputs as UTF-8 - confirm.
         with open(filename, "w", encoding="utf-8") as fout:
             fout.writelines(f"{sent.strip()}\n" for sent in sents)

     import xnmt.thirdparty.comparemt.compare_mt as compare_mt
     args = util.ArgClass(ref_file=ref_filename,
                          out_file=out_filename,
                          out2_file=self.out2_file,
                          train_file=self.train_file,
                          train_counts=self.train_counts,
                          alpha=self.alpha,
                          ngram=self.ngram,
                          ngram_size=self.ngram_size,
                          sent_size=self.sent_size)
     out_lines = compare_mt.main(args)

     report_filename = f"{self.report_path}/compare-mt.txt"
     util.make_parent_dir(report_filename)
     with open(report_filename, "w", encoding="utf-8") as fout:
         fout.writelines(f"{line}\n" for line in out_lines)

     self.hyp_sents, self.ref_sents, self.src_sents = [], [], []
Exemplo n.º 2
0
  def perform_inference(self, generator: 'model_base.GeneratorModel', src_file: str = None, trg_file: str = None,
                        ref_file_to_report=None):
    """
    Run inference in the configured mode and signal the end of inference.

    In 'score' mode the reference losses are computed and the rescored output is written to the target file.
    In every other mode output is generated; in 'forceddebug' mode the reference losses are computed first
    and handed to the generation step as ``assert_scores``.

    Args:
      generator: the model to be used
      src_file: path of input src file to be translated; ``self.src_file`` is used when not given
      trg_file: path of file where trg translations will be written; ``self.trg_file`` is used when not given
      ref_file_to_report: passed through unchanged to ``_generate_output`` (unused in 'score' mode)
    """
    if not src_file:
      src_file = self.src_file
    if not trg_file:
      trg_file = self.trg_file
    util.make_parent_dir(trg_file)

    logger.info(f'Performing inference on {src_file}')

    ref_corpus, src_corpus = self._read_corpus(generator, src_file, mode=self.mode, ref_file=self.ref_file)

    generator.set_train(False)

    is_scoring = self.mode == 'score'
    # Losses are needed both for rescoring and for checking forced decoding.
    if is_scoring or self.mode == 'forceddebug':
      ref_scores = self._compute_losses(generator, ref_corpus, src_corpus, self.max_num_sents)
    else:
      ref_scores = None

    if is_scoring:
      self._write_rescored_output(ref_scores, self.ref_file, trg_file)
    else:
      self._generate_output(generator=generator, forced_ref_corpus=ref_corpus, assert_scores=ref_scores,
                            src_corpus=src_corpus, trg_file=trg_file, batcher=self.batcher,
                            max_src_len=self.max_src_len, ref_file_to_report=ref_file_to_report)
    self.end_inference()
Exemplo n.º 3
0
 def write_html(self) -> None:
     """Pretty-print the accumulated HTML and save it as a report file.

     The entries of ``self.html_contents`` are joined with newlines, parsed
     by ``bs`` using the ``"lxml"`` parser, prettified, and written as UTF-8
     to ``<report_path>/<report_name>.html``.
     """
     markup = bs("\n".join(self.html_contents), "lxml").prettify()
     target = f"{self.report_path}/{self.report_name}.html"
     util.make_parent_dir(target)
     with open(target, 'w', encoding='utf-8') as html_file:
         html_file.write(markup)
Exemplo n.º 4
0
 def on_end_inference(self):
     """Produce the charcut HTML report for the collected sentences.

     Nothing happens when ``self.hyp_sents`` is empty. Otherwise the
     hypothesis, reference and source sentences are aligned with
     ``charcut.load_input_segs`` and passed to ``charcut.run_on`` together
     with the output path ``<report_path>/charcut.html`` and the configured
     ``match_size`` / ``alt_norm``; afterwards the three buffers are reset
     to empty lists.
     """
     if not self.hyp_sents:
         return

     html_filename = f"{self.report_path}/charcut.html"
     util.make_parent_dir(html_filename)
     charcut_args = util.ArgClass(html_output_file=html_filename,
                                  match_size=self.match_size,
                                  alt_norm=self.alt_norm)
     segments = charcut.load_input_segs(cand_segs=self.hyp_sents,
                                        ref_segs=self.ref_sents,
                                        src_segs=self.src_sents)
     charcut.run_on(segments, charcut_args)

     self.hyp_sents = []
     self.ref_sents = []
     self.src_sents = []
Exemplo n.º 5
0
def set_out_file(out_file):
    """Attach file handlers so that log output is written to ``out_file``.

    ``unset_out_file()`` is called first (presumably to detach handlers from
    an earlier call). Two handlers are then created, both opened in write
    mode so existing files are truncated:

    * ``out_file`` receives ``logger`` records at ``settings.LOG_LEVEL_FILE``
      formatted with ``MainFormatter``;
    * ``out_file + ".yaml"`` receives ``yaml_logger`` records at ``DEBUG``
      level formatted with ``YamlFormatter``.

    Args:
        out_file: path of the main log file.
    """
    unset_out_file()
    make_parent_dir(out_file)

    fh = logging.FileHandler(out_file, mode='w')
    fh.setLevel(settings.LOG_LEVEL_FILE)
    fh.setFormatter(MainFormatter())
    logger.addHandler(fh)

    yaml_fh = logging.FileHandler(f"{out_file}.yaml", mode='w')
    # The level used to be set twice with the same value; once is enough.
    yaml_fh.setLevel(logging.DEBUG)
    yaml_fh.setFormatter(YamlFormatter())
    yaml_logger.addHandler(yaml_fh)
Exemplo n.º 6
0
    def __init__(self,
                 path,
                 train_files,
                 vocab_size,
                 overwrite=False,
                 model_prefix='sentpiece',
                 output_format='piece',
                 model_type='bpe',
                 encode_extra_options=None,
                 decode_extra_options=None):
        """
        Set up a tokenizer that drives the sentencepiece command-line tools.

        ``spm_train`` (looked up in ``path``) is run unless both
        ``<model_prefix>.model`` and ``<model_prefix>.vocab`` already exist
        and ``overwrite`` is false, so an existing model is kept even if the
        training parameters have changed. The ``spm_encode`` command line is
        then stored in ``self.tokenizer_command``.

        Args:
          path: directory containing the ``spm_train`` / ``spm_encode`` executables
          train_files: training file paths, passed comma-joined as ``--input``
          vocab_size: value for ``--vocab_size``
          overwrite: retrain even if the model and vocab files exist
          model_prefix: value for ``--model_prefix``; also locates the ``.model`` / ``.vocab`` files
          output_format: value for ``--output_format``; also stored as ``self.input_format``
          model_type: value for ``--model_type``
          encode_extra_options: if given, appended as ``--extra_options=...`` to the encode command
          decode_extra_options: if given, stored as ``--extra_options=...`` in ``self.decode_extra_options``
        """

        def as_extra_option(options):
            # A falsy value means "no extra options" and yields no argument.
            return ['--extra_options=' + options] if options else []

        self.sentpiece_path = path
        self.model_prefix = model_prefix
        self.output_format = output_format
        self.input_format = output_format
        self.encode_extra_options = as_extra_option(encode_extra_options)
        self.decode_extra_options = as_extra_option(decode_extra_options)

        make_parent_dir(model_prefix)

        model_file = self.model_prefix + '.model'
        already_trained = (os.path.exists(model_file)
                           and os.path.exists(self.model_prefix + '.vocab'))
        if not already_trained or overwrite:
            train_command = [
                os.path.join(path, 'spm_train'),
                '--input=' + ','.join(train_files),
                '--model_prefix=' + str(model_prefix),
                '--vocab_size=' + str(vocab_size),
                '--model_type=' + str(model_type),
            ]
            # The exit status of the training process is not inspected.
            subprocess.call(train_command)

        encode_command = [
            os.path.join(self.sentpiece_path, 'spm_encode'),
            '--model=' + model_file,
            '--output_format=' + self.output_format,
        ]
        self.tokenizer_command = encode_command + self.encode_extra_options
Exemplo n.º 7
0
    def create_report(self, segment_actions, src_vocab, src, **kwargs):
        """Append the segmentation of one source sequence to ``segment.txt``.

        On first use the report file ``<report_path>/segment.txt`` is opened
        (as UTF-8) and kept in ``self.report_fp`` for later calls. The source
        items are looked up in ``src_vocab``, grouped into words according to
        the segment end positions, and written as one line of
        space-separated words.

        Args:
            segment_actions: nested sequence; ``segment_actions[0][0]`` is
                read as the list of inclusive end indices of the segments.
            src_vocab: indexable object mapping each element of ``src`` to
                its token.
            src: iterable of source items to look up in ``src_vocab``.
            **kwargs: accepted but not used.
        """
        if self.report_fp is None:
            report_path = self.report_path + "/segment.txt"
            util.make_parent_dir(report_path)
            # NOTE(review): the handle is never closed in this method -
            # confirm it is closed elsewhere.
            self.report_fp = open(report_path, "w", encoding="utf-8")

        actions = segment_actions[0][0]
        tokens = [src_vocab[x] for x in src]
        words = []
        start = 0
        for end in actions:
            # Join the tokens themselves. Joining ``str(list)`` would emit the
            # list's repr (e.g. "['a', 'b']") instead of the word "ab".
            words.append("".join(str(tok) for tok in tokens[start:end + 1]))
            start = end + 1
        print(" ".join(words), file=self.report_fp)
Exemplo n.º 8
0
def plot_attention(src_words, trg_words, attention_matrix, file_name, size_x = 8.0, size_y = 8.0):
  """Draw an attention matrix as a heatmap and save the figure to a file.

  The source side may be either a list of words (used as y-axis labels) or a
  numpy array, which is treated as speech features: in that case the y-axis
  labels are omitted and the features are drawn by ``plot_speech_features``
  in an extra panel to the left of the heatmap.

  Args:
    src_words: a list of words in the source, or a numpy array of speech features
    trg_words: a list of target words
    attention_matrix: a two-dimensional numpy array of values between zero and one,
      where rows correspond to source words, and columns correspond to target words
    file_name: the name of the file to which we write the attention
    size_x: width of the main plot
    size_y: height of the plot
  """
  # unidecode is applied to every label (presumably to transliterate it to ASCII)
  trg_words = [unidecode(w) for w in trg_words]
  src_is_speech = isinstance(src_words, np.ndarray)
  # total number of label characters, used below to pick font size and dpi
  max_len = len(''.join(trg_words))
  if not src_is_speech:
    # note: the source length is measured before unidecode is applied
    max_len = max(max_len, len(''.join(src_words)))
    src_words = [unidecode(w) for w in src_words]
  # shrink the font for long sentences
  # NOTE(review): matplotlib.rc is not reset afterwards; presumably the font size persists for later plots - confirm
  if max_len>150: matplotlib.rc('font', size=5)
  elif max_len>50: matplotlib.rc('font', size=7)
  dpi = 100 if max_len <= 150 else 150
  # speech input gets a second, narrow column (width ratio 1 : size_x) and one extra unit of figure width
  fig, axs = plt.subplots(nrows=1, ncols=2 if src_is_speech else 1,
                          figsize=(size_x+(1.0 if src_is_speech else 0.0), size_y),
                          gridspec_kw = {'width_ratios':[1, size_x]} if src_is_speech else None)
  # the heatmap goes into the right-hand axis when there are two columns
  ax = axs[1] if src_is_speech else axs
  # put the major ticks at the middle of each cell
  ax.set_xticks(np.arange(attention_matrix.shape[1]) + 0.5, minor=False)
  ax.set_yticks(np.arange(attention_matrix.shape[0]) + 0.5, minor=False)
  ax.invert_yaxis()
  # NOTE(review): plt.yticks acts on pyplot's current axes; presumably that is `ax` here - confirm
  if src_is_speech: plt.yticks([], [])

  # label axes by words
  ax.set_xticklabels(trg_words, minor=False)
  if not src_is_speech: ax.set_yticklabels(src_words, minor=False)
  ax.xaxis.tick_top()

  # draw the heatmap
  # (drawn through the pyplot interface rather than `ax`; color scale fixed to [0, 1])
  plt.pcolor(attention_matrix, cmap=plt.cm.Blues, vmin=0, vmax=1)
  plt.colorbar()

  if src_is_speech:
    # draw the speech features into the left-hand axis
    ax = axs[0]
    plot_speech_features(feature_matrix=src_words, ax=ax, dpi=dpi)
    fig.tight_layout()

  util.make_parent_dir(file_name)
  plt.savefig(file_name, dpi=dpi)
  plt.close()
Exemplo n.º 9
0
 def run_preproc_task(self, overwrite=False):
     """Tokenize every input file into its corresponding output file.

     Each entry of ``self.specs`` maps a ``"filenum"`` (a file index, or
     ``"all"`` as the fallback) to a sequence of tokenizers. For each
     input/output pair the tokenizers are chained via ``tokenize_stream``
     and every resulting line is written followed by a newline.

     Args:
         overwrite: if true, regenerate output files that already exist;
             otherwise existing output files are skipped.

     Raises:
         KeyError: if a file has neither its own spec nor an ``"all"`` spec.
     """
     tokenizers = {
         my_opts["filenum"]: list(my_opts["tokenizers"])
         for my_opts in self.specs
     }
     for file_num, (in_file, out_file) in enumerate(
             zip(self.in_files, self.out_files)):
         if not overwrite and os.path.isfile(out_file):
             continue
         make_parent_dir(out_file)
         # Resolve the fallback lazily: ``dict.get(k, d["all"])`` evaluates
         # ``d["all"]`` first and fails when only per-file specs are given.
         if file_num in tokenizers:
             my_tokenizers = tokenizers[file_num]
         else:
             my_tokenizers = tokenizers["all"]
         # Open the input first so that a missing input file does not leave
         # behind an empty output file that later runs would skip.
         with open(in_file, "r", encoding='utf-8') as in_stream, \
              open(out_file, "w", encoding='utf-8') as out_stream:
             stream = in_stream
             for tokenizer in my_tokenizers:
                 stream = tokenizer.tokenize_stream(stream)
             for line in stream:
                 out_stream.write(f"{line}\n")
Exemplo n.º 10
0
 def run_preproc_task(self, overwrite=False):
     """Normalize every input file line by line into its output file.

     Each entry of ``self.specs`` maps a ``"filenum"`` (a file index, or
     ``"all"`` as the fallback) to the normalizers built by
     ``Normalizer.from_spec``. Every input line is stripped, passed through
     the normalizers in order, and written followed by a newline.

     Args:
         overwrite: if true, regenerate output files that already exist;
             otherwise existing output files are skipped.

     Raises:
         KeyError: if a file has neither its own spec nor an ``"all"`` spec.
     """
     normalizers = {
         my_opts["filenum"]: Normalizer.from_spec(my_opts["spec"])
         for my_opts in self.specs
     }
     for i, (in_file,
             out_file) in enumerate(zip(self.in_files, self.out_files)):
         if not overwrite and os.path.isfile(out_file):
             continue
         make_parent_dir(out_file)
         # Resolve the fallback lazily: ``dict.get(k, d["all"])`` evaluates
         # ``d["all"]`` first and fails when only per-file specs are given.
         if i in normalizers:
             my_normalizers = normalizers[i]
         else:
             my_normalizers = normalizers["all"]
         # Open the input first so that a missing input file does not leave
         # behind an empty output file that later runs would skip.
         with open(in_file, "r", encoding='utf-8') as in_stream, \
              open(out_file, "w", encoding='utf-8') as out_stream:
             for line in in_stream:
                 line = line.strip()
                 for normalizer in my_normalizers:
                     line = normalizer.normalize(line)
                 out_stream.write(line + "\n")
Exemplo n.º 11
0
    def __init__(self,
                 path,
                 train_files,
                 vocab_size,
                 overwrite=False,
                 model_prefix='sentpiece',
                 output_format='piece',
                 model_type='bpe',
                 hard_vocab_limit=True,
                 encode_extra_options=None,
                 decode_extra_options=None):
        """
        Store the configuration of a sentencepiece tokenizer.

        No training happens in this constructor: the training arguments are
        only assembled into ``self.sentpiece_train_args`` and
        ``self.sentpiece_processor`` starts out as ``None``.

        Args:
          path: stored as ``self.sentpiece_path`` (see the TODO below)
          train_files: training file paths, passed comma-joined as ``--input``
          vocab_size: value for ``--vocab_size``
          overwrite: stored as ``self.overwrite``
          model_prefix: value for ``--model_prefix``
          output_format: stored as both ``self.output_format`` and ``self.input_format``
          model_type: value for ``--model_type``
          hard_vocab_limit: value for ``--hard_vocab_limit`` (lower-cased string form)
          encode_extra_options: if given, stored as ``['--extra_options=...']``
          decode_extra_options: if given, stored as ``['--extra_options=...']``
        """
        # Not used in this method; presumably imported here so that a missing
        # sentencepiece installation is reported at construction time.
        import sentencepiece as spm

        def as_extra_option(options):
            # A falsy value means "no extra options" and yields no argument.
            return ['--extra_options=' + options] if options else []

        # TODO: deprecate the path argument
        self.sentpiece_path = path
        self.model_prefix = model_prefix
        self.output_format = output_format
        self.input_format = output_format
        self.overwrite = overwrite
        self.encode_extra_options = as_extra_option(encode_extra_options)
        self.decode_extra_options = as_extra_option(decode_extra_options)

        make_parent_dir(model_prefix)
        train_options = (
            ('input', ','.join(train_files)),
            ('model_prefix', str(model_prefix)),
            ('vocab_size', str(vocab_size)),
            ('hard_vocab_limit', str(hard_vocab_limit).lower()),
            ('model_type', str(model_type)),
        )
        self.sentpiece_train_args = [
            '--' + name + '=' + value for name, value in train_options
        ]

        self.sentpiece_processor = None
Exemplo n.º 12
0
 def run_preproc_task(self, overwrite=False):
     """Build a filtered vocabulary file for every input file.

     Each entry of ``self.specs`` maps a ``"filenum"`` (a file index, or
     ``"all"`` as the fallback) to the filters built by
     ``VocabFilterer.from_spec``. For each input file the
     whitespace-separated words are counted, the count dictionary is passed
     through the filters in order, and the remaining words are written one
     per line.

     Args:
         overwrite: if true, regenerate output files that already exist;
             otherwise existing output files are skipped.

     Raises:
         KeyError: if a file has neither its own spec nor an ``"all"`` spec.
     """
     filters = {
         my_opts["filenum"]: VocabFilterer.from_spec(my_opts["spec"])
         for my_opts in self.specs
     }
     for i, (in_file,
             out_file) in enumerate(zip(self.in_files, self.out_files)):
         if not overwrite and os.path.isfile(out_file):
             continue
         make_parent_dir(out_file)
         # Resolve the fallback lazily: ``dict.get(k, d["all"])`` evaluates
         # ``d["all"]`` first and fails when only per-file specs are given.
         if i in filters:
             my_filters = filters[i]
         else:
             my_filters = filters["all"]
         # Open the input first so that a missing input file does not leave
         # behind an empty output file that later runs would skip.
         with open(in_file, "r", encoding='utf-8') as in_stream, \
              open(out_file, "w", encoding='utf-8') as out_stream:
             vocab = {}
             for line in in_stream:
                 for word in line.strip().split():
                     vocab[word] = vocab.get(word, 0) + 1
             for my_filter in my_filters:
                 vocab = my_filter.filter(vocab)
             for word in vocab:
                 out_stream.write(word + "\n")
Exemplo n.º 13
0
def set_out_file(out_file):
  """
  Set the file to log to.

  ``unset_out_file()`` is called first (presumably to detach handlers from an
  earlier call). ``out_file`` is then (re)created with the lines of
  ``_preamble_content``, and a file handler is attached to ``logger`` with
  ``logging.FileHandler``'s default mode, so records follow the preamble.
  A second handler writes ``yaml_logger`` records at ``DEBUG`` level to
  ``out_file + ".yaml"``, which is truncated.

  Args:
    out_file: file name
  """
  unset_out_file()
  make_parent_dir(out_file)
  with open(out_file, mode="w") as f_out:
    f_out.writelines(f"{line}\n" for line in _preamble_content)

  fh = logging.FileHandler(out_file)
  fh.setLevel(settings.LOG_LEVEL_FILE)
  fh.setFormatter(MainFormatter())
  logger.addHandler(fh)

  yaml_fh = logging.FileHandler(f"{out_file}.yaml", mode='w')
  # The level used to be set twice with the same value; once is enough.
  yaml_fh.setLevel(logging.DEBUG)
  yaml_fh.setFormatter(YamlFormatter())
  yaml_logger.addHandler(yaml_fh)
Exemplo n.º 14
0
def plot_speech_features(feature_matrix, file_name=None, vertical = True, ax=None, length = 8.0, dpi=100):
  """Draw a speech feature matrix as a color mesh with the axes hidden.

  Args:
    feature_matrix: a two-dimensional array-like with a ``.T`` attribute; colors are scaled to the fixed range [-1, 1]
    file_name: if given, the current pyplot figure is saved to this file and closed; if not given, the plt context
      will be left un-closed
    vertical: if True, the matrix is transposed before drawing
    ax: if given, draw on this matplotlib axis; otherwise create a new figure of size (1.0, length) and draw via pyplot
    length: figure length (if ax is not given)
    dpi: resolution passed to ``plt.savefig`` (only used if file_name is given)
  """
  canvas = ax
  if not canvas:
    # no axis supplied: open a fresh figure and draw through pyplot itself
    plt.subplots(figsize=(1.0, length))
    canvas = plt
  data = feature_matrix.T if vertical else feature_matrix
  canvas.pcolor(data, cmap=plt.cm.jet, vmin=-1, vmax=1)
  canvas.axis('off')
  if file_name is None:
    return
  util.make_parent_dir(file_name)
  plt.savefig(file_name, dpi=dpi)
  plt.close()
Exemplo n.º 15
0
    def __call__(self,
                 generator,
                 src_file=None,
                 trg_file=None,
                 candidate_id_file=None):
        """
        Run inference with ``generator`` and write the result to the target file.

        Behavior depends on ``self.mode``:

        * ``"forced"`` / ``"forceddebug"``: the reference file is read and its sentences are passed
          as ``forced_trg_ids`` during generation. In ``"forceddebug"`` the loss of every reference is
          also computed and compared against the score of the generated output; mismatches are logged
          as errors.
        * ``"score"``: the reference file must be in n-best format ``index ||| hypothesis``. No output
          is generated; every n-best line is written back with `` ||| score=<score>`` appended.
        * any other mode: plain generation without references.

        Args:
          generator (GeneratorModel): the model to be used
          src_file (str): path of input src file to be translated; falls back to ``self.src_file``
          trg_file (str): path of file where trg translations will be written; falls back to ``self.trg_file``
          candidate_id_file (str): if we are doing something like retrieval where we select from fixed candidates, sometimes we want to limit our candidates to a certain subset of the full set. this setting allows us to do this.

        Raises:
          RuntimeError: if the mode requires a reference file and ``self.ref_file`` is None.
          AssertionError: in ``"score"`` mode, if a reference line is not in n-best format or its
            source index is not smaller than the number of source sentences.
        """
        # Collected into one dict because it is forwarded wholesale to generator.initialize_generator below.
        args = dict(src_file=src_file or self.src_file,
                    trg_file=trg_file or self.trg_file,
                    ref_file=self.ref_file,
                    max_src_len=self.max_src_len,
                    post_process=self.post_process,
                    candidate_id_file=candidate_id_file,
                    report_path=self.report_path,
                    report_type=self.report_type,
                    beam=self.beam,
                    max_len=self.max_len,
                    len_norm_type=self.len_norm_type,
                    mode=self.mode)

        # Reporting is enabled only for Reportable generators when a report path is configured.
        is_reporting = issubclass(
            generator.__class__,
            Reportable) and args["report_path"] is not None
        # Corpus
        src_corpus = list(generator.src_reader.read_sents(args["src_file"]))
        # Get reference if it exists and is necessary
        if args["mode"] == "forced" or args["mode"] == "forceddebug" or args[
                "mode"] == "score":
            # NOTE(review): `== None` works here but `is None` is the idiomatic identity check.
            if args["ref_file"] == None:
                raise RuntimeError(
                    "When performing {} decoding, must specify reference file".
                    format(args["mode"]))
            score_src_corpus = []
            ref_corpus = []
            with open(args["ref_file"], "r", encoding="utf-8") as fp:
                for line in fp:
                    if args["mode"] == "score":
                        # n-best line: the part before the first "|||" is the source sentence index,
                        # the part after it is the hypothesis to score.
                        nbest = line.split("|||")
                        assert len(
                            nbest
                        ) > 1, "When performing scoring, ref_file must have nbest format 'index ||| hypothesis'"
                        src_index = int(nbest[0].strip())
                        assert src_index < len(
                            src_corpus
                        ), "The src_file has only {} instances, nbest file has invalid src_index {}".format(
                            len(src_corpus), src_index)
                        # Repeat the source sentence once per hypothesis so both lists stay aligned.
                        score_src_corpus.append(src_corpus[src_index])
                        trg_input = generator.trg_reader.read_sent(
                            nbest[1].strip())
                    else:
                        trg_input = generator.trg_reader.read_sent(line)
                    ref_corpus.append(trg_input)
            if args["mode"] == "score":
                src_corpus = score_src_corpus
            else:
                if self.max_len and any(
                        len(s) > self.max_len for s in ref_corpus):
                    logger.warning(
                        "Forced decoding with some targets being longer than max_len. Increase max_len to avoid unexpected behavior."
                    )
        else:
            ref_corpus = None
        # Vocab
        # Readers without a `vocab` attribute yield None here.
        src_vocab = generator.src_reader.vocab if hasattr(
            generator.src_reader, "vocab") else None
        trg_vocab = generator.trg_reader.vocab if hasattr(
            generator.trg_reader, "vocab") else None
        # Perform initialization
        generator.set_train(False)
        generator.initialize_generator(**args)

        # Optional generator hooks: each is called only if the generator provides it.
        if hasattr(generator, "set_post_processor"):
            generator.set_post_processor(self.get_output_processor())
        if hasattr(generator, "set_trg_vocab"):
            generator.set_trg_vocab(trg_vocab)
        if hasattr(generator, "set_reporting_src_vocab"):
            generator.set_reporting_src_vocab(src_vocab)

        if is_reporting:
            generator.set_report_resource("src_vocab", src_vocab)
            generator.set_report_resource("trg_vocab", trg_vocab)

        # If we're debugging, calculate the loss for each target sentence
        ref_scores = None
        if args["mode"] == 'forceddebug' or args["mode"] == 'score':
            some_batcher = xnmt.batcher.InOrderBatcher(32)  # Arbitrary
            # NOTE(review): some_batcher was constructed as an InOrderBatcher on the line above,
            # so this check cannot fail as written.
            if not isinstance(some_batcher, xnmt.batcher.InOrderBatcher):
                raise ValueError(
                    f"forceddebug requires InOrderBatcher, got: {some_batcher}"
                )
            batched_src, batched_ref = some_batcher.pack(
                src_corpus, ref_corpus)
            ref_scores = []
            for src, ref in zip(batched_src, batched_ref):
                dy.renew_cg(immediate_compute=settings.IMMEDIATE_COMPUTE,
                            check_validity=settings.CHECK_VALIDITY)
                loss_expr = generator.calc_loss(
                    src, ref, loss_calculator=LossCalculator())
                # The loss value may be a single number or an iterable of per-sentence values.
                # NOTE(review): loss_expr.value() is evaluated twice per batch.
                if isinstance(loss_expr.value(), Iterable):
                    ref_scores.extend(loss_expr.value())
                else:
                    ref_scores.append(loss_expr.value())
            # Scores are the negated losses.
            ref_scores = [-x for x in ref_scores]

        # Make the parent directory if necessary
        make_parent_dir(args["trg_file"])

        # Perform generation of output
        if args["mode"] != 'score':
            with open(args["trg_file"], 'wt', encoding='utf-8'
                      ) as fp:  # Saving the translated output to a trg file
                src_ret = []
                for i, src in enumerate(src_corpus):
                    # This is necessary when the batcher does some sort of pre-processing, e.g.
                    # when the batcher pads to a particular number of dimensions
                    if self.batcher:
                        self.batcher.add_single_batch(src_curr=[src],
                                                      trg_curr=None,
                                                      src_ret=src_ret,
                                                      trg_ret=None)
                        src = src_ret.pop()[0]
                    # Do the decoding
                    # Sources longer than max_src_len are not decoded; a placeholder line is written instead.
                    if args["max_src_len"] is not None and len(
                            src) > args["max_src_len"]:
                        output_txt = NO_DECODING_ATTEMPTED
                    else:
                        dy.renew_cg(
                            immediate_compute=settings.IMMEDIATE_COMPUTE,
                            check_validity=settings.CHECK_VALIDITY)
                        ref_ids = ref_corpus[i] if ref_corpus != None else None
                        output = generator.generate_output(
                            src, i, forced_trg_ids=ref_ids)
                        # If debugging forced decoding, make sure it matches
                        # (relative difference above 1e-5 counts as a mismatch)
                        # NOTE(review): assuming plain float scores, a reference score of exactly 0
                        # would raise ZeroDivisionError here - confirm whether that can occur.
                        if ref_scores != None and (
                                abs(output[0].score - ref_scores[i]) /
                                abs(ref_scores[i])) > 1e-5:
                            logger.error(
                                f'Forced decoding score {output[0].score} and loss {ref_scores[i]} do not match at sentence {i}'
                            )
                        output_txt = output[0].plaintext
                    # Printing to trg file
                    fp.write(f"{output_txt}\n")
        else:
            # Score mode: re-read the n-best file and append the score to every line.
            with open(args["trg_file"], 'wt', encoding='utf-8') as fp:
                with open(args["ref_file"], "r", encoding="utf-8") as nbest_fp:
                    for nbest, score in zip(nbest_fp, ref_scores):
                        fp.write("{} ||| score={}\n".format(
                            nbest.strip(), score))
Exemplo n.º 16
0
 def run_preproc_task(self, overwrite=False):
     """Run the extractor stored in ``self.specs`` on every input file.

     For each input/output pair ``extract_to(in_file, out_file)`` is called,
     unless the output file already exists and ``overwrite`` is false.

     Args:
         overwrite: if true, regenerate output files that already exist.
     """
     extractor = self.specs
     for src_path, dst_path in zip(self.in_files, self.out_files):
         if not overwrite and os.path.isfile(dst_path):
             continue
         make_parent_dir(dst_path)
         extractor.extract_to(src_path, dst_path)
Exemplo n.º 17
0
 def setUp(self):
   """Register the YAML representer for DummyClass and prepare the output directory."""
   yaml.add_representer(DummyClass, xnmt.init_representer)
   out_dir = "test/tmp"
   self.out_dir = out_dir
   # "asdf" is only a placeholder file name; presumably make_parent_dir then
   # creates out_dir itself.
   util.make_parent_dir(out_dir + "/asdf")