Example #1
0
 def process(self, text):
     """Tokenize *text* with the NLTK word tokenizer for this processor's language.

     The NLTK model name is derived from the model path configured for
     ``self.language``; after tokenization, adjacent tag fragments (if any)
     are re-joined so markup stays intact.
     """
     # Model path looks like '<dir>/<dir>/<model>.<ext>' -> take '<model>'.
     model_path = self.models.get(self.language)
     model_name = model_path.split('/')[2].split('.')[0]
     tokenized = ' '.join(self.tokenizer.word_tokenize(text, model_name))
     # Re-join tag markup if the tokenized text contains tags.
     if re.search(TOK_PATTERN, tokenized):
         return XmlUtils.join_tags(tokenized, JOIN_PATTERN)
     return tokenized
Example #2
0
    def __call__(self, s_txt, t_txt):
        """Transfer XML/HTML tags from source text *s_txt* into target text *t_txt*.

        If both texts carry the same number of tags, target tags are replaced
        by source tags one-by-one. Otherwise both texts are tokenized and the
        source tags are inserted into the target at positions found by a
        simple token-level alignment. Returns the target text with tags
        applied.
        """
        s_tags = XmlUtils.extract_tags(s_txt)
        if not s_tags:
            return t_txt

        t_tags = XmlUtils.extract_tags(t_txt)
        # Number of tags is equal - just replace one by one
        if len(s_tags) == len(t_tags):
            for s_tag, t_tag in zip(s_tags, t_tags):
                t_txt = t_txt.replace(t_tag, s_tag, 1)
            return t_txt

        # Tag counts differ: tokenize both sides and align token-by-token.
        s_toks = TMTextProcessors.tokenizer(
            self.langs[0]).tokenizer.process(
                XmlUtils.replace_tags(XmlUtils.fix_tags(s_txt)[0],
                                      adjacent_space_placeholder=XmlUtils.
                                      SPACE_PLACEHOLDER)).split()
        # TODO: s_universal = self._preprocess(s_toks, self.langs[0])
        # Strip all tags from target text before tokenizing it
        t_toks = TMTextProcessors.tokenizer(
            self.langs[1]).tokenizer.process(
                XmlUtils.strip_tags(t_txt)).split()
        # TODO: t_universal = self._preprocess(t_toks, self.langs[1])
        t_toks_new = []
        # Iterate over tokenized source and target text and apply a simple
        # alignment algorithm (by token): placeholders emit tags/markers,
        # ordinary source tokens each consume one aligned target token.
        ti = 0
        for s_tok in s_toks:
            if s_tok == XmlUtils.TAG_PLACEHOLDER:
                t_toks_new.append(s_tags.pop(0))
            elif s_tok == XmlUtils.SPACE_PLACEHOLDER:
                t_toks_new.append(XmlUtils.SPACE_PLACEHOLDER)
            elif ti < len(t_toks):
                t_toks_new.append(t_toks[ti])
                ti += 1
            else:
                break  # source is longer than target, stop here
        # Append remaining target tokens
        if ti < len(t_toks):
            t_toks_new += t_toks[ti:]
        # If not all tags have been aligned, just concatenate remaining ones
        # to the end
        if s_tags:
            t_toks_new += s_tags

        # Join tokenized text into string. TODO: implement as a part of TMTokenizer class (language-dependent)
        # return self.tok[1].join(t_toks_new)
        ttext_with_tags = XmlUtils.join_tags(
            ' '.join(t_toks_new), '(</?[^<>]+/?>)([^<>]+)(</?[^<>]+/?>)'
        )  # --> join words with tags <b> this </b> --> <b>this</b>
        # Handle whitespaces which are adjacent to tags (raw strings avoid
        # invalid-escape warnings for \s).
        ttext_with_tags = re.sub(r'\s+<', '<', ttext_with_tags)
        ttext_with_tags = re.sub(r'>\s+', '>', ttext_with_tags)
        # The placeholder is a literal token, not a regex pattern, so use
        # str.replace rather than re.sub to avoid metacharacter surprises.
        ttext_with_tags = ttext_with_tags.replace(XmlUtils.SPACE_PLACEHOLDER,
                                                  '')
        return ttext_with_tags
Example #3
0
    def process(self, text):
        """Tokenize *text* by piping it through the external tokenizer process.

        The tokenizer command (``self.args``) reads UTF-8 text on stdin and
        writes tokenized text on stdout. Tag fragments, if present, are
        re-joined after tokenization.
        """
        # Use Popen as a context manager so the pipes are closed and the
        # child is reaped even if communicate() raises. communicate() already
        # waits for process exit, so the original extra wait() was redundant.
        with subprocess.Popen(self.args,
                              stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE) as tokenizer:
            tok_sents, _ = tokenizer.communicate(input=text.encode('utf8'))
        text = tok_sents.decode("utf-8").strip('\n')

        if re.search(TOK_PATTERN, text):  # Check if the text has tags
            text = XmlUtils.join_tags(text, JOIN_PATTERN)
        return text
Example #4
0
 def process(self, sentences):
     """Segment *sentences* with the TM tokenizer, preserving tag markup.

     Trailing newlines from the segmenter are stripped; if the result
     contains tag fragments they are re-joined.
     """
     segmented = self.tm_tokenize.segment(sentences).strip('\n')
     # Re-join tag fragments only when the segmented text contains tags.
     has_tags = re.search(TOK_PATTERN, segmented) is not None
     if has_tags:
         segmented = XmlUtils.join_tags(segmented, JOIN_PATTERN)
     return segmented
Example #5
0
 def process(self, text):
     """Tokenize *text* with NLTK's wordpunct tokenizer, preserving tag markup."""
     tokens = self.tokenizer.wordpunct_tokenize(text)
     result = ' '.join(tokens)
     # Re-join tag fragments if the tokenized text contains tags.
     if re.search(TOK_PATTERN, result):
         return XmlUtils.join_tags(result, JOIN_PATTERN)
     return result