Example #1
 def wrapper(obj, *args, **kwargs):
     try:
         return f(obj, *args, **kwargs), f.__name__
     except Exception:
         # Use a %s placeholder; extra logger.error arguments are treated as
         # format arguments, not concatenated onto the message.
         logger.error("Error handling xnmt event at object: %s",
                      obj.__class__.__name__)
         raise
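Here `f` and `logger` are free variables, so this wrapper only makes sense inside a decorator. A minimal sketch of that enclosing context, assuming a decorator named `handle_xnmt_event` (the name and the logging setup are assumptions, not xnmt's actual code):

 import functools
 import logging

 logger = logging.getLogger("xnmt")

 def handle_xnmt_event(f):  # hypothetical decorator that supplies `f`
     @functools.wraps(f)
     def wrapper(obj, *args, **kwargs):
         try:
             return f(obj, *args, **kwargs), f.__name__
         except Exception:
             logger.error("Error handling xnmt event at object: %s",
                          obj.__class__.__name__)
             raise
     return wrapper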
Example #2
 def read_sents(self, filename: str, filter_ids: Sequence[numbers.Integral] = None):
    # Build one tree sentence from the accumulated token lines
   def emit_tree(idx, lines):
     nodes = {}
     edge_list = []
     for node_id, form, lemma, pos, feat, head, deprel in lines:
       nodes[node_id] = sent.SyntaxTreeNode(node_id=node_id, value=form, head=pos)
     for node_id, form, lemma, pos, feat, head, deprel in lines:
       if head != 0 and deprel != "ROOT":
         edge_list.append(HyperEdge(head, [node_id], None, deprel))
     return sent.RNNGSequenceSentence(idx,
                                      HyperGraph(edge_list, nodes),
                                      self.surface_vocab,
                                      self.nt_vocab,
                                      all_surfaces=True)
   idx = 0
   lines = []
   # Loop all lines in the file
   with open(filename) as fp:
     for line in fp:
       line = line.strip()
        if len(line) == 0:
          # A blank line ends the current sentence; skip runs of blank lines.
          if lines:
            yield emit_tree(idx, lines)
            lines.clear()
            idx += 1
        else:
          try:
            node_id, form, lemma, pos, feat, head, deprel = line.split()
            lines.append((int(node_id), form, lemma, pos, feat, int(head), deprel))
          except ValueError:
            logger.error("Bad line: %s", line)
     if len(lines) != 0:
       yield emit_tree(idx, lines)
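This reader expects a CoNLL-style file: one token per line with seven whitespace-separated columns (id, form, lemma, pos, feat, head, deprel), sentences separated by blank lines, and head 0 marking the root. An illustrative input (token values are made up):

 1 The the DT _ 2 det
 2 cat cat NN _ 3 nsubj
 3 sat sit VB _ 0 ROOT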
Example #3
 def evaluate(self, ref, hyp, desc=None):
   try:
     from xnmt.cython import xnmt_cython
   except ImportError:
     logger.error("Fast BLEU evaluation requires the xnmt cython installation step. "
                  "Please check the documentation.")
     raise
   return xnmt_cython.bleu_sentence(self.ngram, self.smooth, ref, hyp)
Example #4
 def evaluate_one_sent(self, ref, hyp):
   try:
     from xnmt.cython import xnmt_cython
   except ImportError:
     logger.error("Fast BLEU evaluation requires the xnmt cython installation step. "
                  "Please check the documentation.")
     raise
   if len(ref) == 0 or len(hyp) == 0: return 0
   return xnmt_cython.bleu_sentence(self.ngram, self.smooth, ref, hyp)
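xnmt_cython.bleu_sentence is implemented in Cython for speed. As a rough pure-Python reference for what it presumably computes (an assumption on my part: smoothed sentence-level BLEU with `smooth` added to each n-gram precision's numerator and denominator, valid for smooth > 0):

 import math
 from collections import Counter

 def bleu_sentence_py(ngram, smooth, ref, hyp):
     # ref/hyp are token lists; smooth > 0 keeps every log argument positive.
     if not hyp:
         return 0.0
     log_prec = 0.0
     for n in range(1, ngram + 1):
         ref_counts = Counter(tuple(ref[i:i + n]) for i in range(len(ref) - n + 1))
         hyp_grams = [tuple(hyp[i:i + n]) for i in range(len(hyp) - n + 1)]
         matches = sum(min(c, ref_counts[g]) for g, c in Counter(hyp_grams).items())
         log_prec += math.log((matches + smooth) / (len(hyp_grams) + smooth)) / ngram
     bp = min(1.0, math.exp(1 - len(ref) / len(hyp)))  # brevity penalty
     return bp * math.exp(log_prec)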
Example #5
    def read_sents(self,
                   filename: str,
                   filter_ids: Sequence[numbers.Integral] = None):
        # Build one tree sentence (with an appended EOS node) from the token lines
        def emit_tree(idx, lines):
            nodes = {}
            edge_list = []
            max_node = -1
            for node_id, form, lemma, pos, feat, head, deprel in lines:
                nodes[node_id] = sent.SyntaxTreeNode(node_id=node_id,
                                                     value=form,
                                                     head=pos)
                max_node = max(max_node, node_id)
            nodes[max_node + 1] = sent.SyntaxTreeNode(
                node_id=max_node + 1,
                value=vocabs.Vocab.ES_STR,
                head=vocabs.Vocab.ES_STR)
            root = -1
            for node_id, form, lemma, pos, feat, head, deprel in lines:
                if head == 0:  # head 0 marks the root token
                    root = node_id
                else:
                    edge_list.append(HyperEdge(head, [node_id], None, deprel))
            # Attach the appended EOS node under the root.
            edge_list.append(
                HyperEdge(root, [max_node + 1], None, vocabs.Vocab.ES_STR))
            return sent.DepTreeRNNGSequenceSentence(
                idx,
                score=None,
                graph=HyperGraph(edge_list, nodes),
                surface_vocab=self.value_vocab,
                nt_vocab=self.node_vocab,
                edge_vocab=self.edge_vocab,
                all_surfaces=True,
                output_procs=self.output_procs)

        idx = 0
        lines = []
        # Loop all lines in the file
        with open(filename) as fp:
            for line in fp:
                line = line.strip()
                if len(line) == 0:
                    # A blank line ends the current sentence; skip blank runs.
                    if lines:
                        yield emit_tree(idx, lines)
                        lines.clear()
                        idx += 1
                else:
                    try:
                        (node_id, form, lemma, pos, feat,
                         head, deprel) = line.split("\t")
                        lines.append((int(node_id), form, lemma, pos, feat,
                                      int(head), deprel))
                    except ValueError:
                        logger.error("Bad line: %s", line)
                        raise
            if len(lines) != 0:
                yield emit_tree(idx, lines)
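Compared with Example #2, this variant splits each line on tab characters rather than arbitrary whitespace, re-raises on malformed lines instead of skipping them, and appends an explicit end-of-sentence (ES_STR) node under the root. The columns are the same seven-field layout shown after Example #2, just tab-delimited:

 # One tab-separated input line (columns: id, form, lemma, pos, feat, head, deprel):
 line = "1\tThe\tthe\tDT\t_\t2\tdet"
 print(line.split("\t"))  # ['1', 'The', 'the', 'DT', '_', '2', 'det']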
Example #6
 def sparse_to_dense(self, actions, length):
     try:
         from xnmt.cython import xnmt_cython
     except ImportError as e:
         logger.error(
             "sparse_to_dense requires the xnmt cython installation step. "
             "Please check the documentation.")
         raise RuntimeError("xnmt cython module not available") from e
     batch_dense = []
     for batch_action in actions:
         batch_dense.append(
             xnmt_cython.dense_from_sparse(batch_action, length))
     return np.array(batch_dense).transpose()
Example #7
 def sparse_to_dense(self, actions, length):
     try:
         from xnmt.cython import xnmt_cython
     except ImportError as e:
         logger.error(
             "sparse_to_dense requires the xnmt cython installation step. "
             "Please check the documentation.")
         raise RuntimeError("xnmt cython module not available") from e
     dense_actions = []
     for sample_actions in actions:
         batch_dense = []
         for batch_action in sample_actions:
             batch_dense.append(
                 xnmt_cython.dense_from_sparse(batch_action, length))
         dense_actions.append(batch_dense)
     arr = np.array(dense_actions)  # (sample, batch, length)
     return np.rollaxis(arr, 2)
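Both sparse_to_dense variants defer to xnmt_cython.dense_from_sparse. A pure-Python sketch of what that call is assumed to do (an assumption: it expands a list of action indices into a 0/1 vector of the given length; the Cython version is simply faster):

 def dense_from_sparse_py(sparse_actions, length):
     # Assumed semantics: set a 1 at each index listed in sparse_actions.
     dense = [0] * length
     for idx in sparse_actions:
         dense[idx] = 1
     return dense

Under that reading, Example #6 transposes (batch, length) to (length, batch), and Example #7's np.rollaxis(arr, 2) moves the length axis to the front, yielding (length, sample, batch).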
Example #8
 def __exit__(self, et, ev, traceback):
     if et is not None:  # exception occurred
         logger.error("------ Error Report ------")
         for key, val in self.args.items():
             logger.error(f"*** {key} ***")
             if callable(val):
                 val()  # callables are expected to log their own report
             else:
                 logger.error(str(val))
Example #9
 def __exit__(self, et, ev, traceback):
     if et is not None:  # exception occurred
         logger.error("------ Error Report ------")
         for key, val in self.args.items():
             logger.error(f"*** {key} ***")
             if callable(val):
                 val()
             elif batchers.is_batched(val):
                 # Log each sentence in the batch via the logger rather than
                 # print, so the whole report stays in one stream.
                 for sent in val:
                     if hasattr(sent, "idx"):
                         logger.error("{:>10}. {}".format(sent.idx,
                                                          str(sent)[:100]))
                     else:
                         logger.error(str(sent))
             else:
                 logger.error(str(val))
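These __exit__ hooks suggest a context manager that dumps a report only when an exception escapes the block. A minimal sketch of the surrounding class and its use (the class name and constructor are assumptions, not xnmt's actual API):

 import logging

 logger = logging.getLogger("xnmt")

 class ErrorReport:  # hypothetical wrapper; xnmt's actual class may differ
     def __init__(self, **args):
         self.args = args  # values may be plain objects or zero-arg callables

     def __enter__(self):
         return self

     def __exit__(self, et, ev, traceback):
         if et is not None:  # exception occurred
             logger.error("------ Error Report ------")
             for key, val in self.args.items():
                 logger.error(f"*** {key} ***")
                 if callable(val):
                     val()
                 else:
                     logger.error(str(val))
         # returning None lets the exception propagate

 # usage: the report is emitted only if the block raises
 with ErrorReport(step="decode", batch_size=32):
     pass  # ... code that may raise ...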
Example #10
    def __call__(self,
                 generator,
                 src_file=None,
                 trg_file=None,
                 candidate_id_file=None):
        """
    Args:
      generator (GeneratorModel): the model to be used
      src_file (str): path of input src file to be translated
      trg_file (str): path of file where trg translatons will be written
      candidate_id_file (str): if we are doing something like retrieval where we select from fixed candidates, sometimes we want to limit our candidates to a certain subset of the full set. this setting allows us to do this.
    """
        args = dict(src_file=src_file or self.src_file,
                    trg_file=trg_file or self.trg_file,
                    ref_file=self.ref_file,
                    max_src_len=self.max_src_len,
                    post_process=self.post_process,
                    candidate_id_file=candidate_id_file,
                    report_path=self.report_path,
                    report_type=self.report_type,
                    beam=self.beam,
                    max_len=self.max_len,
                    len_norm_type=self.len_norm_type,
                    mode=self.mode)

        is_reporting = (isinstance(generator, Reportable)
                        and args["report_path"] is not None)
        # Corpus
        src_corpus = list(generator.src_reader.read_sents(args["src_file"]))
        # Get reference if it exists and is necessary
        if args["mode"] == "forced" or args["mode"] == "forceddebug" or args[
                "mode"] == "score":
            if args["ref_file"] == None:
                raise RuntimeError(
                    "When performing {} decoding, must specify reference file".
                    format(args["mode"]))
            score_src_corpus = []
            ref_corpus = []
            with open(args["ref_file"], "r", encoding="utf-8") as fp:
                for line in fp:
                    if args["mode"] == "score":
                        nbest = line.split("|||")
                        assert len(
                            nbest
                        ) > 1, "When performing scoring, ref_file must have nbest format 'index ||| hypothesis'"
                        src_index = int(nbest[0].strip())
                        assert src_index < len(
                            src_corpus
                        ), "The src_file has only {} instances, nbest file has invalid src_index {}".format(
                            len(src_corpus), src_index)
                        score_src_corpus.append(src_corpus[src_index])
                        trg_input = generator.trg_reader.read_sent(
                            nbest[1].strip())
                    else:
                        trg_input = generator.trg_reader.read_sent(line)
                    ref_corpus.append(trg_input)
            if args["mode"] == "score":
                src_corpus = score_src_corpus
            else:
                if self.max_len and any(len(s) > self.max_len for s in ref_corpus):
                    logger.warning("Forced decoding with some targets being longer than max_len. "
                                   "Increase max_len to avoid unexpected behavior.")
        else:
            ref_corpus = None
        # Vocab
        src_vocab = getattr(generator.src_reader, "vocab", None)
        trg_vocab = getattr(generator.trg_reader, "vocab", None)
        # Perform initialization
        generator.set_train(False)
        generator.initialize_generator(**args)

        if hasattr(generator, "set_post_processor"):
            generator.set_post_processor(self.get_output_processor())
        if hasattr(generator, "set_trg_vocab"):
            generator.set_trg_vocab(trg_vocab)
        if hasattr(generator, "set_reporting_src_vocab"):
            generator.set_reporting_src_vocab(src_vocab)

        if is_reporting:
            generator.set_report_resource("src_vocab", src_vocab)
            generator.set_report_resource("trg_vocab", trg_vocab)

        # If we're debugging, calculate the loss for each target sentence
        ref_scores = None
        if args["mode"] == 'forceddebug' or args["mode"] == 'score':
            some_batcher = xnmt.batcher.InOrderBatcher(32)  # Arbitrary
            if not isinstance(some_batcher, xnmt.batcher.InOrderBatcher):
                raise ValueError(
                    f"forceddebug requires InOrderBatcher, got: {some_batcher}"
                )
            batched_src, batched_ref = some_batcher.pack(
                src_corpus, ref_corpus)
            ref_scores = []
            for src, ref in zip(batched_src, batched_ref):
                dy.renew_cg(immediate_compute=settings.IMMEDIATE_COMPUTE,
                            check_validity=settings.CHECK_VALIDITY)
                loss_expr = generator.calc_loss(
                    src, ref, loss_calculator=LossCalculator())
                if isinstance(loss_expr.value(), Iterable):
                    ref_scores.extend(loss_expr.value())
                else:
                    ref_scores.append(loss_expr.value())
            ref_scores = [-x for x in ref_scores]

        # Make the parent directory if necessary
        make_parent_dir(args["trg_file"])

        # Perform generation of output
        if args["mode"] != 'score':
            with open(args["trg_file"], 'wt', encoding='utf-8'
                      ) as fp:  # Saving the translated output to a trg file
                src_ret = []
                for i, src in enumerate(src_corpus):
                    # This is necessary when the batcher does some sort of pre-processing, e.g.
                    # when the batcher pads to a particular number of dimensions
                    if self.batcher:
                        self.batcher.add_single_batch(src_curr=[src],
                                                      trg_curr=None,
                                                      src_ret=src_ret,
                                                      trg_ret=None)
                        src = src_ret.pop()[0]
                    # Do the decoding
                    if args["max_src_len"] is not None and len(
                            src) > args["max_src_len"]:
                        output_txt = NO_DECODING_ATTEMPTED
                    else:
                        dy.renew_cg(
                            immediate_compute=settings.IMMEDIATE_COMPUTE,
                            check_validity=settings.CHECK_VALIDITY)
                        ref_ids = ref_corpus[i] if ref_corpus is not None else None
                        output = generator.generate_output(
                            src, i, forced_trg_ids=ref_ids)
                        # If debugging forced decoding, make sure it matches
                        if ref_scores is not None and (
                                abs(output[0].score - ref_scores[i]) /
                                abs(ref_scores[i])) > 1e-5:
                            logger.error(
                                f'Forced decoding score {output[0].score} and loss {ref_scores[i]} do not match at sentence {i}'
                            )
                        output_txt = output[0].plaintext
                    # Printing to trg file
                    fp.write(f"{output_txt}\n")
        else:
            with open(args["trg_file"], 'wt', encoding='utf-8') as fp:
                with open(args["ref_file"], "r", encoding="utf-8") as nbest_fp:
                    for nbest, score in zip(nbest_fp, ref_scores):
                        fp.write("{} ||| score={}\n".format(
                            nbest.strip(), score))
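In score mode, ref_file must be in the nbest format the assertion above describes, with the index pointing into src_file. An illustrative snippet (hypotheses are made up):

 0 ||| a first hypothesis for source sentence 0
 0 ||| an alternative hypothesis for source sentence 0
 1 ||| a hypothesis for source sentence 1

The output file then pairs each nbest line with its model score, e.g. "0 ||| a first hypothesis for source sentence 0 ||| score=-12.3".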