def _break_and_wrap(text_to_tokenize_match):
    """Tokenize a matched Chinese text span and wrap each token in HTML.

    Each token is rendered as a ``<span class="chinese-word">`` containing
    two inner spans: the word's pinyin and the original characters.
    Words are created on the fly (with translation) if they don't exist yet.

    :param text_to_tokenize_match: a regex match object whose group(0) is
        the Chinese text to tokenize.
    :return: the HTML string for all tokens, concatenated in order.
    """
    text_to_tokenize = text_to_tokenize_match.group(0)
    # Collect fragments in a list and join once — avoids quadratic
    # string concatenation in the loop.
    fragments = []
    for token in Languages.tokenize(Languages.chinese.value, text_to_tokenize):
        # get_or_create_with_translator returns (word, created); we only need the word.
        word_zh = WordZH.get_or_create_with_translator(word=token)[0]
        fragments.append(
            f'<span class="chinese-word"><span>{word_zh.pinyin}</span>'
            f'<span>{token}</span></span>'
        )
    return "".join(fragments)
def auto_tokenize(self):
    """
    Tokenize the business text into words, create their objects if
    necessary and link the business text to them.

    Only applied to Chinese texts; for other languages this is a no-op.
    Existing word links are cleared first, then rebuilt with an ordinal
    recording each token's position in the text.
    """
    word_model = to_word_model(self.language)
    if word_model == WordZH:
        tokens = Languages.tokenize(self.language, self.text)
        # Drop any previous links before re-linking in token order.
        self.words_zh.clear()
        for ordinal, token in enumerate(tokens):
            # get_or_create_with_translator returns (word, created).
            word_object = word_model.get_or_create_with_translator(word=token)[0]
            # Manager.create() already persists the row; the original
            # called .save() on the result too, issuing a redundant
            # second write per token.
            BusinessTextWordZH.objects.create(
                text=self, word=word_object, ordinal=ordinal
            )