def process_txt_sentences(self, txt_sentences, strip_eos=True):
    """Translate raw text sentences into target-language strings.

    Each sentence is normalized with ``normalize_string``, optionally has
    its SAL entities and numbered placeholders rewritten (controlled by
    ``self.normalize_sal_entities`` and
    ``self.reorder_numbered_placeholders``), is run through
    ``self.seq2seq.generate``, and then has the replaced values put back
    into the generated tokens via ``reinsert_from_lookup``.

    Args:
        txt_sentences: Iterable of raw sentence strings.
        strip_eos: When true, drop a trailing ``'<EOS>'`` token from the
            generated output.

    Returns:
        A list with one output string per input sentence, in input order.
        When ``self.convert_to_json`` is set and ``self.output_lang_name``
        is ``'sexp'`` or ``'pn'``, the string is passed through the
        matching ``convert_*2json`` helper first.
    """
    ret_tgt = []
    for sentence_txt in txt_sentences:
        sentence = normalize_string(sentence_txt)

        # Collect the reverse-lookup tables produced by whichever
        # normalization passes are enabled.
        rlookup = None
        if self.normalize_sal_entities:
            sentence, _, rlookup = normalize_sal_entities(sentence, "")
        if self.reorder_numbered_placeholders:
            sentence, _, rlookup2 = reorder_numbered_placeholders(
                sentence, "")
            if rlookup is None:
                rlookup = rlookup2
            else:
                rlookup.update(rlookup2)

        gen_tgt_toks, _ = self.seq2seq.generate(sentence)
        # Fix to remove double empties.
        gen_tgt_toks = [t for t in gen_tgt_toks if len(t) > 0]

        # Guard against an empty token list: indexing [-1] on it would
        # raise IndexError.
        if strip_eos and gen_tgt_toks and gen_tgt_toks[-1] == '<EOS>':
            gen_tgt_toks = gen_tgt_toks[:-1]

        if self.normalize_sal_entities or self.reorder_numbered_placeholders:
            gen_tgt_toks = reinsert_from_lookup(gen_tgt_toks, rlookup)

        ret_str = " ".join(gen_tgt_toks)
        if self.convert_to_json:
            if self.output_lang_name == 'sexp':
                ret_str = convert_sexp2json(ret_str)
            elif self.output_lang_name == 'pn':
                ret_str = convert_pn2json(ret_str)
        ret_tgt.append(ret_str)
    return ret_tgt
def inner_fn(l):
    """Turn one tab-separated line into a normalized two-element list.

    Every tab-delimited field of ``l`` is passed through
    ``normalize_string``; only the first two fields are kept. They are then
    optionally rewritten by ``normalize_sal_entities`` and
    ``reorder_numbered_placeholders``.

    Returns:
        ``[first, second]`` after all enabled rewrites.
    """
    # NOTE(review): `normalize_sal_entities` is both truth-tested and
    # called below, and `reorder_numplaceholders` is a free variable. Both
    # resolve in a scope not visible here -- confirm that no enclosing flag
    # shadows the callable.
    fields = list(map(normalize_string, l.split('\t')))
    first = fields[0]
    second = fields[1]

    if normalize_sal_entities:
        first, second, _lookup = normalize_sal_entities(first, second)

    if reorder_numplaceholders:
        first, second, _lookup = reorder_numbered_placeholders(first, second)

    return [first, second]
def process_sentences(self, sentence_dicts):
    """Generate a CST for each sentence dict and return the results as JSON.

    Each entry is read for its ``'id'`` and for the raw text, taken from
    ``'sentence'`` (old format) when present and from ``'new-text'`` (new
    format) otherwise. The text is normalized, optionally rewritten by
    ``normalize_sal_entities`` / ``reorder_numbered_placeholders`` (per the
    instance flags of the same names), run through
    ``self.seq2seq.generate``, and the result is parsed with
    ``json.loads``.

    Args:
        sentence_dicts: Iterable of mappings as described above.

    Returns:
        A JSON string (``indent=3``) of the form ``{"sentences": [...]}``.
        A successful entry holds ``id``, ``nl`` and ``cst`` (plus
        ``normed_form`` when ``self.include_normed_forms`` is set). When
        the generated text is not valid JSON, an error record is appended
        in its place.

    Only ``JSONDecodeError`` is handled; any other exception (for example
    a missing key in an entry) propagates to the caller.
    """
    accum_cst = []
    for sentence_dict in sentence_dicts:
        try:
            sid = sentence_dict['id']
            if 'sentence' in sentence_dict:
                # Old format
                raw_sentence = sentence_dict['sentence']
            else:
                # New format
                raw_sentence = sentence_dict['new-text']
            if self.verbose:
                print("sid={}, raw={}".format(sid, raw_sentence))

            # For each sentence, identify the IDs first, normalize them,
            # and then replace them after generation.
            sentence = normalize_string(raw_sentence)
            if self.normalize_sal_entities:
                sentence, _, rlookup = normalize_sal_entities(sentence, "")
            else:
                rlookup = {}
            if self.reorder_numbered_placeholders:
                sentence, _, rlookup2 = reorder_numbered_placeholders(
                    sentence, "")
                rlookup.update(rlookup2)
            if self.verbose:
                print("Placeholder checks done")

            gen_cst_toks, _ = self.seq2seq.generate(sentence)
            if self.verbose:
                print("... generated")

            # Hack to fix empty tokens introduced before '!=' (inequality)
            # functions.
            gen_cst_toks = [t for t in gen_cst_toks if len(t) > 0]

            if self.normalize_sal_entities or self.reorder_numbered_placeholders:
                # Diagnostic output is gated on verbose like every other
                # progress message in this method.
                if self.verbose:
                    print("RLookup={}".format(rlookup))
                gen_cst_toks = reinsert_from_lookup(gen_cst_toks, rlookup)

            if self.output_lang_name.lower() == "json" or self.convert_to_json:
                gen_str_form = " ".join(gen_cst_toks).replace(
                    "<EOS>", "").replace("<SOS>", "")
            else:
                # Apply JSON-ification only if the target is not JSON
                gen_str_form = gen_toks2dict(gen_cst_toks)

            if self.convert_to_json:
                if self.output_lang_name == 'sexp':
                    gen_str_form = convert_sexp2json(gen_str_form)
                elif self.output_lang_name == 'pn':
                    # NOTE(review): this still calls convert_pn2pn, which
                    # the original author marked as a debugging stand-in
                    # for convert_pn2json -- confirm which one is intended
                    # before json.loads is applied to the result.
                    gen_str_form = convert_pn2pn(gen_str_form)
                    if self.verbose:
                        print(gen_str_form)

            gen_cst_dict = json.loads(gen_str_form)
            if self.verbose:
                print("... remapping done")

            if self.include_normed_forms:
                accum_cst.append({
                    "id": sid,
                    "nl": raw_sentence,
                    "cst": gen_cst_dict,
                    "normed_form": sentence
                })
            else:
                accum_cst.append({
                    "id": sid,
                    "nl": raw_sentence,
                    "cst": gen_cst_dict
                })
        except JSONDecodeError:
            # Keep going: record the failure for this sentence instead of
            # aborting the whole batch.
            print("Invalid JSON! Value={}".format(gen_str_form))
            if self.include_normed_forms:
                accum_cst.append({
                    "id": sid,
                    "error": "Invalid JSON",
                    "normed_form": sentence,
                    "nl": raw_sentence,
                    "cst_attempted": gen_str_form
                })
            else:
                accum_cst.append({
                    "id": sid,
                    "error": "Invalid JSON, value={}".format(gen_str_form)
                })

    root_elt = {"sentences": accum_cst}
    return json.dumps(root_elt, indent=3)