Example #1
    def __call__(self, s_txt, t_txt):
        s_tags = XmlUtils.extract_tags(s_txt)
        if not s_tags: return t_txt

        t_tags = XmlUtils.extract_tags(t_txt)
        # Same number of tags on both sides - just replace them one by one
        if len(s_tags) == len(t_tags):
            for s_tag, t_tag in zip(s_tags, t_tags):
                t_txt = t_txt.replace(t_tag, s_tag, 1)
            return t_txt
        else:
            s_toks = TMTextProcessors.tokenizer(self.langs[0]).tokenizer.process(
                XmlUtils.replace_tags(
                    XmlUtils.fix_tags(s_txt)[0],
                    adjacent_space_placeholder=XmlUtils.SPACE_PLACEHOLDER)).split()
            # TODO: s_universal = self._preprocess(s_toks, self.langs[0])
            # Strip all tags from target text before tokenizing it
            t_toks = TMTextProcessors.tokenizer(self.langs[1]).tokenizer.process(
                XmlUtils.strip_tags(t_txt)).split()
            # TODO: t_universal = self._preprocess(t_toks, self.langs[1])
            t_toks_new = []
            # Iterate over the tokenized source and target text and apply a simple token-by-token alignment.
            # Insert the source tags at the aligned positions in the target text.
            ti = 0
            for si in range(len(s_toks)):
                if s_toks[si] == XmlUtils.TAG_PLACEHOLDER:
                    t_toks_new.append(s_tags.pop(0))
                elif s_toks[si] == XmlUtils.SPACE_PLACEHOLDER:
                    t_toks_new.append(XmlUtils.SPACE_PLACEHOLDER)
                elif ti < len(t_toks):
                    t_toks_new.append(t_toks[ti])
                    ti += 1
                else:
                    break  # source is longer than target, stop here
            # Append remaining target tokens
            if ti < len(t_toks): t_toks_new += t_toks[ti:]
            # If not all tags have been aligned, just concatenate the remaining ones at the end
            if s_tags: t_toks_new += s_tags
        # Join tokenized text into a string. TODO: implement as part of the TMTokenizer class (language-dependent)
        # return self.tok[1].join(t_toks_new)
        ttext_with_tags = XmlUtils.join_tags(
            ' '.join(t_toks_new), '(</?[^<>]+/?>)([^<>]+)(</?[^<>]+/?>)'
        )  # join words with tags: <b> this </b> --> <b>this</b>
        # Remove whitespace adjacent to tags, then drop the space placeholders
        ttext_with_tags = re.sub(r'\s+<', '<', ttext_with_tags)
        ttext_with_tags = re.sub(r'>\s+', '>', ttext_with_tags)
        ttext_with_tags = ttext_with_tags.replace(XmlUtils.SPACE_PLACEHOLDER, '')
        return ttext_with_tags
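
A self-contained sketch of the positional alignment used in the unequal-tag-count branch above, with a plain whitespace split standing in for TMTextProcessors and a regex standing in for XmlUtils.extract_tags (both stand-ins are assumptions, not the project's API):

import re

TAG_RE = re.compile(r'</?[^<>]+/?>')

def transfer_tags_by_position(s_txt, t_txt):
    # No tags in the source: nothing to transfer.
    if not TAG_RE.search(s_txt):
        return t_txt
    # Isolate tags as standalone tokens in the source; strip them from the target.
    s_toks = TAG_RE.sub(lambda m: ' ' + m.group(0) + ' ', s_txt).split()
    t_toks = TAG_RE.sub('', t_txt).split()
    out, ti = [], 0
    for tok in s_toks:
        if TAG_RE.fullmatch(tok):
            out.append(tok)           # a tag keeps its source token position
        elif ti < len(t_toks):
            out.append(t_toks[ti])    # otherwise consume the next target token
            ti += 1
    out += t_toks[ti:]                # leftover target tokens go at the end
    return ' '.join(out)

print(transfer_tags_by_position('I have <b>a dog</b>', 'Ich habe einen Hund'))
# Ich habe <b> einen Hund </b>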
Example #2
    def _preprocess(self, text, lang):
        dic_query = {}
        s_tags = XmlUtils.extract_tags(text)
        if not s_tags:
            dic_query['query'] = text
        else:
            # Strip the tags so the match is done on plain text
            dic_query['query'] = XmlUtils.strip_tags(text)

        dic_query['tokenizer'] = TMUtilsMatching.pre_process(
            dic_query['query'], lang, 'tokenizer', {})
        dic_query['pos'] = TMUtilsMatching.pre_process(
            dic_query['tokenizer'], lang, 'pos_tagger', {})
        dic_query['universal'] = TMUtilsMatching.segment_2_universal(
            dic_query['tokenizer'].lower(), dic_query['pos'], lang)  # universal_text[0]

        # Class to improve fuzzy matching with regular expressions
        regex_class = TMRegexMatch(self.src_lang, self.tgt_lang)
        dic_query['query_re'] = TMUtilsMatching.pre_process(
            dic_query['tokenizer'], lang, 'reg_exp', regex_class.re_pp)
        return dic_query
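
The returned dic_query bundles every view of the query that later matching stages consume. A hedged sketch of its shape; the field values are illustrative, not output of the real pipeline:

dic_query = {
    'query':     'I have a dog',        # tag-stripped query text
    'tokenizer': 'I have a dog',        # tokenized form
    'pos':       'PRON VERB DET NOUN',  # POS-tagger output
    'universal': 'PRON VERB DET NOUN',  # universal-POS view of the query
    'query_re':  'I have a dog',        # regex-normalized form for fuzzy matching
}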
Example #3
    def __call__(self, s_txt, t_txt):
        # Extract source tags to be transferred: ['<X[1]>', '</X[1]>']
        print("Source text: {}".format(s_txt))
        s_tags = XmlUtils.extract_tags(s_txt)
        print("Source tags: {}".format(s_tags))
        if not s_tags: return t_txt
        # Remove any tags from the target
        t_txt = XmlUtils.strip_tags(t_txt)

        # Rename tags to avoid problems in XML parser
        # I have <X[1]>a dog</X[1]> ---> I have <T1>a dog</T1>
        s_txt_fixed = XmlUtils.simplify_tags(s_txt)
        s_tags_fixed = XmlUtils.extract_tags(s_txt_fixed)
        print("Fixed source tags: {}".format(s_tags_fixed))
        # Keep mapping of fixed tags to original tags for the final recovery:
        # tags_map = {'<T1>: '<X[1]>', '</T1>': '</X[1]>'}
        assert len(s_tags_fixed) == len(s_tags)
        tags_map = dict(zip(s_tags_fixed, s_tags))
        print("Tags map: {}".format(tags_map))

        # Run POS tagging (first replacing XML tags with a placeholder in the source text):
        # I have <T1>a dog</T1> --> I have ELASTICTMTAG a dog ELASTICTMTAG
        # --> I/NOUN have/VERB ELASTICTMTAG/NOUN a/DET dog/NOUN ELASTICTMTAG/NOUN
        s_pos = self.pos_taggers[0].tag_segments(
            [XmlUtils.replace_tags(s_txt_fixed)])[0]
        t_pos = self.pos_taggers[1].tag_segments([t_txt])[0]
        # Recover fixed tags:
        # I,NOUN have,VERB ELASTICTMTAG,NOUN a,DET dog,NOUN ELASTICTMTAG,NOUN
        # ---> NOUN VERB <T1> DET NOUN </T1>
        s_pos_with_tags, s_pos = XmlUtils.recover_tags_pos(s_pos, s_tags_fixed)
        print("S_POS_WITH_TAGS: {}, S_POS: {}, T_POS: {}".format(
            s_pos_with_tags, s_pos, t_pos))
        # For each tag (T1, T2, etc.), remove the other tags and run the prediction algorithm based on IOB tags.
        # The return value maps each tag to its corresponding index in the target (tokenized) text.
        tag2t_index = self.tags2indexes(s_tags_fixed, s_pos_with_tags, s_pos,
                                        [t[1] for t in t_pos])

        # Place tags at predicted indexes in the target text
        t_txt_with_tags = self.place_tags(s_tags_fixed, tag2t_index, tags_map,
                                          t_pos)
        if not t_txt_with_tags: return None
        # TODO: join using language-specific "joiner" (opposite of tokenizer)
        return " ".join(t_txt_with_tags)
Example #4
    def tags2string_xml_tags(self, text, text_pos):
        pos_str = self.tags2string(text_pos)
        # If no XML tags found, just return concatenated POS tags
        tags = XmlUtils.extract_tags(text)
        if not tags: return pos_str
        pos = []

        for word_pos in text_pos:
            # Concatenate the POS tags and XML tags into the string
            if word_pos[0] == XmlUtils.TAG_PLACEHOLDER:
                pos.append(tags.pop(0))
            elif len(word_pos) < 2:
                continue
            else:
                pos.append(word_pos[1])

        return " ".join(pos)
Example #5
    def tags2string_iob_tags(self, text, text_pos):
        pos_str = self.tags2string(text_pos)
        # If no XML tags found, just return concatenated POS tags
        tags = XmlUtils.extract_tags(text)
        if not tags: return pos_str
        pos = []

        for word_pos in text_pos:
            # Concatenate the POS tags and XML tags into the string
            if word_pos[0] == XmlUtils.TAG_PLACEHOLDER:
                pos.append(tags.pop(0))
            elif len(word_pos) < 2:
                continue
            else:
                pos.append(word_pos[1])

        iobs = []
        for w in pos:
            if self.is_self_closing_tag(w):
                iob = self.tag2iob(pos, w)
                if iob:
                    iobs.append(iob)
        return iobs
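
tag2iob and is_self_closing_tag are not shown in this snippet; a minimal sketch of the underlying IOB idea, labeling each POS token as B/I/O relative to one tag pair (function and argument names are assumptions):

def pos_to_iob(pos_with_tags, open_tag, close_tag):
    # ['NOUN', 'VERB', '<T1>', 'DET', 'NOUN', '</T1>'] -> ['O', 'O', 'B', 'I']
    iob, inside, first = [], False, False
    for tok in pos_with_tags:
        if tok == open_tag:
            inside, first = True, True
        elif tok == close_tag:
            inside = False
        elif inside:
            iob.append('B' if first else 'I')  # first token in the span is B, the rest are I
            first = False
        else:
            iob.append('O')
    return iob

print(pos_to_iob(['NOUN', 'VERB', '<T1>', 'DET', 'NOUN', '</T1>'], '<T1>', '</T1>'))
# ['O', 'O', 'B', 'I']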