Example #1
    def process_txt_sentences(self, txt_sentences, strip_eos=True):
        ret_tgt = []
        for sentence_txt in txt_sentences:
            sentence = normalize_string(sentence_txt)
            if self.normalize_sal_entities:
                sentence, e2, rlookup = normalize_sal_entities(sentence, "")
            else:
                rlookup = None
            if self.reorder_numbered_placeholders:
                sentence, _, rlookup2 = reorder_numbered_placeholders(
                    sentence, "")
                # Merge the lookups inside this branch so rlookup2 is only
                # referenced when it has actually been defined
                if rlookup is None:
                    rlookup = rlookup2
                else:
                    rlookup.update(rlookup2)
            gen_tgt_toks, _ = self.seq2seq.generate(sentence)
            # Drop empty tokens occasionally emitted by the generator
            gen_tgt_toks = [t for t in gen_tgt_toks if len(t) > 0]
            if strip_eos and gen_tgt_toks and gen_tgt_toks[-1] == '<EOS>':
                gen_tgt_toks = gen_tgt_toks[:-1]
            if self.normalize_sal_entities or self.reorder_numbered_placeholders:
                gen_tgt_toks = reinsert_from_lookup(gen_tgt_toks, rlookup)
            ret_str = " ".join(gen_tgt_toks)
            if self.convert_to_json:
                if self.output_lang_name == 'sexp':
                    ret_str = convert_sexp2json(ret_str)
                elif self.output_lang_name == 'pn':
                    ret_str = convert_pn2json(ret_str)
                    # Alternative: ret_str = convert_sexp2json(convert_pn2sexp(ret_str))
            ret_tgt.append(ret_str)
        return ret_tgt
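
A minimal usage sketch, assuming `translator` is an instance of the class defining process_txt_sentences with a trained self.seq2seq model attached; the instance name, flag settings, and input sentences are hypothetical.

# Hypothetical wiring; these flags mirror the attributes read above
translator.normalize_sal_entities = True
translator.reorder_numbered_placeholders = True
translator.convert_to_json = True
translator.output_lang_name = 'sexp'

outputs = translator.process_txt_sentences(
    ["turn on the light in room 3", "set the thermostat to 70"],
    strip_eos=True)
for out in outputs:
    print(out)  # one target string (token string or JSON) per input sentence
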
Example #2
def inner_fn(l):
    # Split one tab-separated source/target pair, normalize both strings, and
    # optionally normalize entities / renumber placeholders before returning.
    # The two flags are expected to come from the enclosing scope.
    normed_pairs = [normalize_string(s) for s in l.split('\t')]
    pair_str1, pair_str2 = normed_pairs[0], normed_pairs[1]
    if normalize_sal_entities:
        pair_str1, pair_str2, _ = normalize_sal_entities(
            pair_str1, pair_str2)
    if reorder_numplaceholders:
        pair_str1, pair_str2, _ = reorder_numbered_placeholders(
            pair_str1, pair_str2)
    return [pair_str1, pair_str2]
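
A minimal call sketch, assuming inner_fn is a closure whose enclosing scope supplies the normalize_sal_entities and reorder_numplaceholders flags; the sample line below is hypothetical.

# One tab-separated natural-language / target-form pair (hypothetical values)
sample_line = "turn on the light in room 3\t( turn-on ( light room-3 ) )"
src, tgt = inner_fn(sample_line)
print(src)
print(tgt)
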
Example #3
 def process_sentences(self, sentence_dicts):
     accum_cst = []
     for sentence_dict in sentence_dicts:
         try:
             sid = sentence_dict['id']
             if 'sentence' in sentence_dict:
                 # Old format
                 raw_sentence = sentence_dict['sentence']
             else:
                 # New format
                 raw_sentence = sentence_dict['new-text']
             if self.verbose:
                 print("sid={}, raw={}".format(sid, raw_sentence))
             # For each sentence, identify the IDs first, normalize them, and then replace them
             sentence = normalize_string(raw_sentence)
             if self.normalize_sal_entities:
                 sentence, e2, rlookup = normalize_sal_entities(
                     sentence, "")
             else:
                 rlookup = {}
             if self.reorder_numbered_placeholders:
                 sentence, _, rlookup2 = reorder_numbered_placeholders(
                     sentence, "")
                 rlookup.update(rlookup2)
             if self.verbose:
                 print("Placeholder checks done")
             gen_cst_toks, _ = self.seq2seq.generate(sentence)
             if self.verbose:
                 print("... generated")
             # Hack to fix empty tokens introduced before '!=' (inequality) functions
             gen_cst_toks = [t for t in gen_cst_toks if len(t) > 0]
             if self.normalize_sal_entities or self.reorder_numbered_placeholders:
                 print("RLookup={}".format(rlookup))
                 gen_cst_toks = reinsert_from_lookup(gen_cst_toks, rlookup)
             if self.output_lang_name.lower() == "json" or self.convert_to_json:
                 gen_str_form = " ".join(gen_cst_toks).replace(
                     "<EOS>", "").replace("<SOS>", "")
             else:
                 # Target language is not JSON: convert the generated tokens
                 # to a JSON-style dict string before parsing
                 gen_str_form = gen_toks2dict(gen_cst_toks)
             if self.convert_to_json:
                 if self.output_lang_name == 'sexp':
                     gen_str_form = convert_sexp2json(gen_str_form)
                 elif self.output_lang_name == 'pn':
                     gen_str_form = convert_pn2json(gen_str_form)
             gen_cst_dict = json.loads(gen_str_form)
             if self.verbose:
                 print("... remapping done")
             if self.include_normed_forms:
                 accum_cst.append({
                     "id": sid,
                     "nl": raw_sentence,
                     "cst": gen_cst_dict,
                     "normed_form": sentence
                 })
             else:
                 accum_cst.append({
                     "id": sid,
                     "nl": raw_sentence,
                     "cst": gen_cst_dict
                 })
         except json.JSONDecodeError:
             print("Invalid JSON! Value={}".format(gen_str_form))
             if self.include_normed_forms:
                 accum_cst.append({
                     "id": sid,
                     "error": "Invalid JSON",
                     "normed_form": sentence,
                     "nl": raw_sentence,
                     "cst_attempted": gen_str_form
                 })
             else:
                 accum_cst.append({
                     "id": sid,
                     "error": "Invalid JSON, value={}".format(gen_str_form)
                 })
     #     except:
     #         if self.include_normed_forms:
     #             accum_cst.append({ "id" : sid, "error" : "Unhandled exception", "normed_form" : sentence })
     #         else:
     #             accum_cst.append({ "id" : sid, "error" : "Unhandled exception, e={}".format(sys.exc_info()) })
     root_elt = {"sentences": accum_cst}
     return json.dumps(root_elt, indent=3)
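
A minimal invocation sketch, assuming `parser` is an instance of the class defining process_sentences; the instance name and sentence texts are hypothetical, while the 'id', 'new-text', and 'sentence' keys follow the two input formats handled above.

# Hypothetical inputs covering both accepted formats
sentence_dicts = [
    {"id": "s1", "new-text": "turn on the light in room 3"},
    {"id": "s2", "sentence": "set the thermostat to 70"},
]
result_json = parser.process_sentences(sentence_dicts)
print(result_json)  # {"sentences": [{"id": "s1", "nl": ..., "cst": {...}}, ...]}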