Пример #1
0
    def __call__(self, s_txt, t_txt):
        """Project the tags of the source segment onto the target segment.

        Args:
            s_txt: source text, possibly containing XML tags.
            t_txt: target text, possibly containing XML tags.

        Returns:
            The target text carrying the source tags. When the source has no
            tags the target is returned unchanged. When both sides have the
            same number of tags, each target tag is overwritten in place by
            the source tag at the same position. Otherwise the target is
            stripped of tags, both sides are tokenized, and source tags are
            inserted at the token positions they occupy in the source.
        """
        s_tags = XmlUtils.extract_tags(s_txt)
        if not s_tags:
            return t_txt

        t_tags = XmlUtils.extract_tags(t_txt)
        # Number of tags is equal - just replace one by one
        if len(s_tags) == len(t_tags):
            # Scan left to right and only search past the previous replacement.
            # Chained str.replace(..., 1) always hits the FIRST occurrence, so
            # it could re-replace a source tag that was just inserted whenever
            # the same tag text appears on both sides at different positions.
            # Assumes extract_tags returns tags in document order - TODO confirm.
            pieces = []
            cursor = 0
            for s_tag, t_tag in zip(s_tags, t_tags):
                found = t_txt.find(t_tag, cursor)
                if found < 0:
                    # Tag text not present past the cursor: leave as is
                    continue
                pieces.append(t_txt[cursor:found])
                pieces.append(s_tag)
                cursor = found + len(t_tag)
            pieces.append(t_txt[cursor:])
            return ''.join(pieces)

        # Tokenize the source with its tags (and spaces adjacent to tags)
        # replaced by placeholders, so that tag positions survive tokenization
        s_toks = TMTextProcessors.tokenizer(self.langs[0]).tokenizer.process(
            XmlUtils.replace_tags(
                XmlUtils.fix_tags(s_txt)[0],
                adjacent_space_placeholder=XmlUtils.SPACE_PLACEHOLDER)).split()
        # TODO: s_universal = self._preprocess(s_toks, self.langs[0])
        # Strip all tags from target text before tokenizing it
        t_toks = TMTextProcessors.tokenizer(self.langs[1]).tokenizer.process(
            XmlUtils.strip_tags(t_txt)).split()
        # TODO: t_universal = self._preprocess(t_toks, self.langs[1])

        # Iterate over the tokenized source and apply a simple alignment
        # algorithm (by token): every ordinary source token consumes one target
        # token, every placeholder emits a source tag / space placeholder.
        t_toks_new = []
        ti = 0        # next target token to consume
        tag_i = 0     # next source tag to emit
        for s_tok in s_toks:
            if s_tok == XmlUtils.TAG_PLACEHOLDER:
                # The source is tokenized after fix_tags() while the tags were
                # extracted before it, so there may be more placeholders than
                # tags; ignore the surplus instead of raising IndexError.
                if tag_i < len(s_tags):
                    t_toks_new.append(s_tags[tag_i])
                    tag_i += 1
            elif s_tok == XmlUtils.SPACE_PLACEHOLDER:
                t_toks_new.append(XmlUtils.SPACE_PLACEHOLDER)
            elif ti < len(t_toks):
                t_toks_new.append(t_toks[ti])
                ti += 1
            else:
                break  # source is longer than target, stop here
        # Append remaining target tokens
        t_toks_new += t_toks[ti:]
        # If not all tags have been aligned, just concatenate remaining ones to the end
        t_toks_new += s_tags[tag_i:]

        # Join tokenized text into string. TODO: implement as a part of TMTokenizer class (language-dependent)
        ttext_with_tags = XmlUtils.join_tags(
            ' '.join(t_toks_new), '(</?[^<>]+/?>)([^<>]+)(</?[^<>]+/?>)'
        )  # --> join words with tags <b> this </b> --> <b>this</b>
        # Handle whitespaces which are adjacent to tags
        ttext_with_tags = re.sub(r'\s+<', '<', ttext_with_tags)
        ttext_with_tags = re.sub(r'>\s+', '>', ttext_with_tags)
        # NOTE(review): the placeholder is used as a regex pattern here; if it
        # can contain regex metacharacters it should be escaped - confirm
        # against XmlUtils.SPACE_PLACEHOLDER.
        ttext_with_tags = re.sub(XmlUtils.SPACE_PLACEHOLDER, '',
                                 ttext_with_tags)
        return ttext_with_tags
Пример #2
0
    def process(self, text):
        """Rename the XML tags in ``text``, dropping all tags if that fails.

        Args:
            text: segment text that may contain XML tags.

        Returns:
            ``text`` itself when it contains nothing matching ``<.*>``.
            Otherwise the result of ``XmlUtils.rename_tags`` applied to the
            tag-fixed text. If renaming raises, or the parser error log holds
            a tag-name-mismatch / tag-not-finished error, the second value
            returned by ``XmlUtils.fix_tags`` (presumably the text with tags
            stripped) is returned instead.
        """
        # No tag-like content at all - nothing to do
        if re.search("<.*>", text) is None:
            return text

        original = text
        fixed, stripped = XmlUtils.fix_tags(text)
        # Parser errors which might create problems in TM; any of them makes
        # us remove all tags at once
        fatal_errors = ('ERR_TAG_NAME_MISMATCH', 'ERR_TAG_NOT_FINISHED')
        try:
            renamed = XmlUtils.rename_tags(fixed)
            # Presumably rename_tags feeds self.parser, so its error log is
            # inspected only after the call - verify against XmlUtils.
            for entry in self.parser.error_log:
                if entry.type_name not in fatal_errors:
                    continue
                logging.warning(
                    "Failed to parse segment text into XML: '{}' reason: {}. Removing tags instead"
                    .format(original, entry))
                return stripped
        except Exception as err:
            # Best effort: any failure degrades to the tag-free text
            logging.warning(
                "Failed to rename tags in {}, reason: {}. Removing tags instead: {}"
                .format(original, err, stripped))
            return stripped
        return renamed