def __call__(self, s_txt, t_txt):
    """Transfer XML tags from the source text into the target text.

    Three cases:
      * source has no tags -> target returned unchanged;
      * source and target have the same number of tags -> target tags are
        replaced one-by-one (in order) with the source tags;
      * otherwise -> both sides are tokenized and a simple positional
        token alignment decides where the source tags are inserted into
        the target.

    :param s_txt: source-language text, possibly containing XML tags
    :param t_txt: target-language text
    :return: target text with the source tags transferred into it
    """
    s_tags = XmlUtils.extract_tags(s_txt)
    if not s_tags:
        return t_txt
    t_tags = XmlUtils.extract_tags(t_txt)
    # Number of tags is equal - just replace one by one
    if len(s_tags) == len(t_tags):
        for s_tag, t_tag in zip(s_tags, t_tags):
            t_txt = t_txt.replace(t_tag, s_tag, 1)
        return t_txt
    else:
        # Tokenize source with tags replaced by placeholders so they survive
        # tokenization as standalone tokens.
        s_toks = TMTextProcessors.tokenizer(self.langs[0]).tokenizer.process(
            XmlUtils.replace_tags(
                XmlUtils.fix_tags(s_txt)[0],
                adjacent_space_placeholder=XmlUtils.SPACE_PLACEHOLDER)).split()
        # TODO: s_universal = self._preprocess(s_toks, self.langs[0])
        # Strip all tags from target text before tokenizing it
        t_toks = TMTextProcessors.tokenizer(self.langs[1]).tokenizer.process(
            XmlUtils.strip_tags(t_txt)).split()
        # TODO: t_universal = self._preprocess(t_toks, self.langs[1])
        t_toks_new = []
        # Iterate over tokenized source and target text and apply a simple
        # alignment algorithm (by token position). Insert source tags at the
        # aligned places in the target text.
        ti = 0
        for si in range(0, len(s_toks)):
            if s_toks[si] == XmlUtils.TAG_PLACEHOLDER:
                t_toks_new.append(s_tags.pop(0))
            elif s_toks[si] == XmlUtils.SPACE_PLACEHOLDER:
                t_toks_new.append(XmlUtils.SPACE_PLACEHOLDER)
            elif ti < len(t_toks):
                t_toks_new.append(t_toks[ti])
                ti += 1
            else:
                break  # source is longer than target, stop here
        # Append remaining target tokens
        if ti < len(t_toks):
            t_toks_new += t_toks[ti:]
        # If not all tags have been aligned, just concatenate remaining ones
        # to the end
        if s_tags:
            t_toks_new += s_tags
        # Join tokenized text into string.
        # TODO: implement as a part of TMTokenizer class (language-dependent)
        # return self.tok[1].join(t_toks_new)
        ttext_with_tags = XmlUtils.join_tags(
            ' '.join(t_toks_new),
            '(</?[^<>]+/?>)([^<>]+)(</?[^<>]+/?>)'
        )  # --> join words with tags <b> this </b> --> <b>this</b>
        # Handle whitespaces which are adjacent to tags
        # (raw strings: '\s' in a plain literal is an invalid escape sequence)
        ttext_with_tags = re.sub(r'\s+<', '<', ttext_with_tags)
        ttext_with_tags = re.sub(r'>\s+', '>', ttext_with_tags)
        # The placeholder is a literal token, not a pattern -- use str.replace
        # so any regex metacharacters in it cannot misfire.
        ttext_with_tags = ttext_with_tags.replace(XmlUtils.SPACE_PLACEHOLDER, '')
        return ttext_with_tags
def _preprocess(self, text, lang):
    """Build a query dict with tokenized, POS-tagged, universal-POS and
    regex-preprocessed representations of *text*.

    :param text: input segment, possibly containing XML tags
    :param lang: language code used for POS tagging / universal mapping
    :return: dict with keys 'query', 'tokenizer', 'pos', 'universal',
             'query_re'
    """
    dic_query = {}
    s_tags = XmlUtils.extract_tags(text)
    if not s_tags:
        dic_query['query'] = text
    else:
        dic_query['query'] = XmlUtils.strip_tags(text)  # split tag to do the match
    # NOTE(review): tokenization and the regex pre-process below use
    # self.src_lang while POS tagging uses the *lang* parameter -- confirm
    # this asymmetry is intentional (it matters when lang != self.src_lang).
    dic_query['tokenizer'] = TMUtilsMatching.pre_process(
        dic_query['query'], self.src_lang, 'tokenizer', {})
    dic_query['pos'] = TMUtilsMatching.pre_process(
        dic_query['tokenizer'], lang, 'pos_tagger', {})
    dic_query['universal'] = TMUtilsMatching.segment_2_universal(
        dic_query['tokenizer'].lower(), dic_query['pos'], lang)  # universal_text[0]
    # NOTE(review): the assignment below immediately overwrites the
    # 'universal' value computed above with the raw POS result, making the
    # segment_2_universal call a dead store unless it has side effects --
    # confirm which value is intended.
    dic_query['universal'] = dic_query['pos']
    regex_class = TMRegexMatch(
        self.src_lang, self.tgt_lang)  # Class to improve fuzzy match
    dic_query['query_re'] = TMUtilsMatching.pre_process(
        dic_query['tokenizer'], self.src_lang, 'reg_exp', regex_class.re_pp)
    return dic_query
def __call__(self, s_txt, t_txt):
    """Transfer XML tags from source to target using POS-based alignment.

    Pipeline: simplify the source tags to parser-safe names (<T1>, </T1>, ...),
    POS-tag both sides, predict a target token index for each tag from the
    IOB-style POS context, then place the original tags at those indexes.

    :param s_txt: source text, possibly containing XML tags
    :param t_txt: target text (any tags in it are stripped first)
    :return: target text with tags inserted, or None if placement failed
    """
    # Extract source tags to be transferred: ['<X[1]>', '</X[1]>']
    s_tags = XmlUtils.extract_tags(s_txt)
    if not s_tags:
        return t_txt
    # Remove any tags from the target
    t_txt = XmlUtils.strip_tags(t_txt)
    # Rename tags to avoid problems in XML parser:
    # I have <X[1]>a dog</X[1]> ---> I have <T1>a dog</T1>
    s_txt_fixed = XmlUtils.simplify_tags(s_txt)
    s_tags_fixed = XmlUtils.extract_tags(s_txt_fixed)
    # Keep mapping of fixed tags to original tags for the final recovery:
    # tags_map = {'<T1>': '<X[1]>', '</T1>': '</X[1]>'}
    # Explicit raise instead of assert: asserts are stripped under -O, and a
    # count mismatch here would silently mis-map tags below.
    if len(s_tags_fixed) != len(s_tags):
        raise ValueError(
            "Tag simplification changed the number of tags: {} vs {}".format(
                len(s_tags_fixed), len(s_tags)))
    tags_map = dict(zip(s_tags_fixed, s_tags))
    # Run POS tagging (before, replace XML tags with a placeholder in the
    # source text):
    # I chase <T1>a dog</T1> --> I chase ELASTICTMTAG a dog ELASTICTMTAG
    # --> I/NOUN have/VERB ELASTICTMTAG/NOUN a/DET dog/NOUN ELASTICTMTAG/NOUN
    s_pos = self.pos_taggers[0].tag_segments(
        [XmlUtils.replace_tags(s_txt_fixed)])[0]
    t_pos = self.pos_taggers[1].tag_segments([t_txt])[0]
    # Recover fixed tags:
    # I,NOUN have,VERB ELASTICTMTAG,NOUN a,DET dog,NOUN ELASTICTMTAG,NOUN
    # ---> NOUN VERB <T1> DET NOUN </T1>
    s_pos_with_tags, s_pos = XmlUtils.recover_tags_pos(s_pos, s_tags_fixed)
    # For each tag (T1, T2 etc.), remove other tags and run prediction
    # algorithm, based on IOB tags. Return value is a map of tags to their
    # correspondent indexes in target (tokenized) text.
    tag2t_index = self.tags2indexes(s_tags_fixed, s_pos_with_tags, s_pos,
                                    [t[1] for t in t_pos])
    # Place tags at predicted indexes in the target text
    t_txt_with_tags = self.place_tags(s_tags_fixed, tag2t_index, tags_map,
                                      t_pos)
    if not t_txt_with_tags:
        return None
    # TODO: join using language-specific "joiner" (opposite of tokenizer)
    return " ".join(t_txt_with_tags)
def tags2string_xml_tags(self, text, text_pos):
    """Concatenate POS tags into a string, re-inserting XML tags from *text*.

    :param text: original text, used only to extract its XML tags
    :param text_pos: sequence of (word, pos) pairs; entries whose word is
        the tag placeholder stand for an XML tag
    :return: space-joined string of POS tags with XML tags interleaved
    """
    tags = XmlUtils.extract_tags(text)
    # If no XML tags found, just return concatenated POS tags.
    # (tags2string is only needed on this path, so it is no longer computed
    # unconditionally as before.)
    if not tags:
        return self.tags2string(text_pos)
    pos = []
    # Concatenate POS tags and XML tags into the string
    for word_pos in text_pos:
        if not word_pos:
            continue  # guard: empty entry would raise IndexError below
        if word_pos[0] == XmlUtils.TAG_PLACEHOLDER:
            pos.append(tags.pop(0))
        elif len(word_pos) < 2:
            continue  # malformed (word-only) entry -- skip
        else:
            pos.append(word_pos[1])
    return " ".join(pos)
def tags2string_iob_tags(self, text, text_pos):
    """Build IOB representations for the XML tags found in *text*.

    NOTE(review): when *text* has no XML tags this returns a plain POS
    string, otherwise a list of IOB entries -- confirm callers expect this
    mixed return type.

    :param text: original text, used only to extract its XML tags
    :param text_pos: sequence of (word, pos) pairs; entries whose word is
        the tag placeholder stand for an XML tag
    :return: POS string (no tags) or list of IOB entries (tags present)
    """
    tags = XmlUtils.extract_tags(text)
    # If no XML tags found, just return concatenated POS tags.
    # (tags2string is only needed on this path, so it is no longer computed
    # unconditionally as before.)
    if not tags:
        return self.tags2string(text_pos)
    # Concatenate POS tags and XML tags into one token sequence
    pos = []
    for word_pos in text_pos:
        if not word_pos:
            continue  # guard: empty entry would raise IndexError below
        if word_pos[0] == XmlUtils.TAG_PLACEHOLDER:
            pos.append(tags.pop(0))
        elif len(word_pos) < 2:
            continue  # malformed (word-only) entry -- skip
        else:
            pos.append(word_pos[1])
    # Convert each self-closing tag token into its IOB representation
    iobs = []
    for w in pos:
        if self.is_self_closing_tag(w):
            iob = self.tag2iob(pos, w)
            if iob:
                iobs.append(iob)
    return iobs