import os

from somajo import SoMaJo


def split(list_of_text, thread_number, TMP_DIR):
    """
    Splits text into sentences.
    Writes one sentence per line with a leading space (for BPE).
    Every document is separated by a blank line.
    """
    print(os.path.join(TMP_DIR, "Splitted_{:05d}.txt".format(thread_number)))
    outF = open(os.path.join(TMP_DIR, "Splitted_{:05d}.txt".format(thread_number)), "w")
    tokenizer = SoMaJo("de_CMC", split_camel_case=True)
    for part in list_of_text:
        sentences = tokenizer.tokenize_text([part])
        for sentence in sentences:
            output = ""
            for token in sentence:
                if (token.space_after and not token.last_in_sentence
                        and not token.first_in_sentence):
                    output += token.text + ' '
                elif token.first_in_sentence:
                    output += ' ' + token.text + ' '
                else:
                    output += token.text
            outF.write(output)
            outF.write("\n")
        outF.write("\n")
    outF.close()
    return thread_number
def __init__(self, language, processes=None):
    from somajo import SoMaJo

    tokenizer_type = {"de": "de_CMC", "en": "en_PTB"}[language]
    self.tokenizer = SoMaJo(tokenizer_type, split_camel_case=True, split_sentences=True)
class TestSentenceSplitter(unittest.TestCase):
    """Tests for SoMaJo's sentence splitting."""

    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = SoMaJo("de_CMC", split_camel_case=True, split_sentences=True)

    def _equal(self, raw, tokenized_sentences):
        """Compare the tokenized sentences of `raw` with the expected sentences."""
        sentences = self.tokenizer.tokenize_text([raw])
        sentences = [" ".join([t.text for t in s]) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)

    def _equal_xml(self, raw, tokenized_sentences):
        """Compare the tokenized sentences of the XML input `raw` with the expected sentences."""
        eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
        eos_tags = set(eos_tags)
        sentences = self.tokenizer.tokenize_xml(raw, eos_tags)
        sentences = [" ".join([t.text for t in s]) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)

    def _equal_xml_strip(self, raw, tokenized_sentences):
        """Like _equal_xml, but with the markup stripped from the output."""
        eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
        eos_tags = set(eos_tags)
        sentences = self.tokenizer.tokenize_xml(raw, eos_tags, strip_tags=True)
        sentences = [" ".join([t.text for t in s]) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)
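    # Hedged example (not from the original test suite): a concrete test that
    # would sit inside TestSentenceSplitter and use the _equal helper above.
    # The sample sentence and its expected split are illustrative.
    def test_two_sentences(self):
        self._equal("Das ist ein Satz. Das ist noch einer.",
                    ["Das ist ein Satz .", "Das ist noch einer ."])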
from somajo import SoMaJo


def tokenize(text):
    tokenizer = SoMaJo(language="de_CMC")
    for i in range(len(text)):
        text[i] = text[i].split()
        tok = tokenizer.tokenize_text(text[i])
        tok_sent = []
        for sent in tok:
            for word in sent:
                tok_sent.append(word.text)
        text[i] = tok_sent
def tokenizer(self, text):
    tokenizer = SoMaJo("en_PTB")
    tokenized_object = tokenizer.tokenize_text([text])

    sentences = []
    types = []
    for sent in tokenized_object:
        sentence = []
        for token in sent:
            sentence.append(token.text)
            types.append(token.token_class)
        sentences.append(sentence)

    self.output['tokens'] = sentences
    self.output['types'] = types
    return sentences, types
from somajo import SoMaJo


def replace_hashtags_tokenizer(text):
    tokenizer = SoMaJo("de_CMC", split_camel_case=True)
    for i in range(len(text)):
        line = text[i].split()
        for j in range(len(line)):
            if line[j].startswith('#'):
                hashtag = []
                line[j] = line[j].replace('#', "")
                hashtag.append(line[j])
                tok_hashtag = tokenizer.tokenize_text(hashtag)
                for tok in tok_hashtag:
                    for t in tok:
                        print(t.text)
        text[i] = " ".join(line)
    return text
class SoMaJoSentenceTokenizer(Tokenizer):
    def __init__(self, model_name: str):
        super().__init__()
        self.tokenizer = SoMaJo(model_name)

    def tokenize(self, text: str) -> List[str]:
        out_sentences = []
        sentences = list(self.tokenizer.tokenize_text([text]))
        for i, sentence in enumerate(sentences):
            text = ""
            for token in sentence:
                if "SpaceAfter=No" in token.extra_info:
                    whitespace = ""
                else:
                    whitespace = " "
                text += token.text + whitespace
            if i == len(sentences) - 1:
                text = text.rstrip()
            out_sentences.append(text)
        return out_sentences
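# A minimal usage sketch for the class above (not part of the original source);
# the model name "de_CMC" and the sample text are assumptions for illustration.
splitter = SoMaJoSentenceTokenizer("de_CMC")
for sent in splitter.tokenize("Das ist ein Satz. Und hier noch einer."):
    print(sent)  # one detokenized sentence per line, spacing restored via extra_info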
def main():
    args = arguments()
    n_tokens = 0
    n_sentences = 0
    t0 = time.perf_counter()
    is_xml = False
    if args.xml or args.tag is not None:
        is_xml = True
    tokenizer = SoMaJo(args.language,
                       split_camel_case=args.split_camel_case,
                       split_sentences=args.split_sentences,
                       xml_sentences=args.sentence_tag)
    if is_xml:
        eos_tags = args.tag
        if eos_tags is None:
            eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
        eos_tags = set(eos_tags)
        chunks = tokenizer.tokenize_xml_file(args.FILE, eos_tags,
                                             strip_tags=args.strip_tags,
                                             parallel=args.parallel)
    else:
        chunks = tokenizer.tokenize_text_file(args.FILE,
                                              args.paragraph_separator,
                                              parallel=args.parallel)
    for chunk in chunks:
        n_sentences += 1
        for token in chunk:
            output = token.text
            if not token.markup:
                n_tokens += 1
                if args.token_classes:
                    output += "\t" + token.token_class
                if args.extra_info:
                    output += "\t" + token.extra_info
            print(output)
        if args.split_sentences and args.sentence_tag is None:
            print()
    t1 = time.perf_counter()
    if args.split_sentences:
        logging.info("Tokenized %d tokens (%d sentences) in %d seconds (%d tokens/s)"
                     % (n_tokens, n_sentences, t1 - t0, n_tokens / (t1 - t0)))
    else:
        logging.info("Tokenized %d tokens in %d seconds (%d tokens/s)"
                     % (n_tokens, t1 - t0, n_tokens / (t1 - t0)))
def make(self, prerequisite_data):
    paragraphs = prerequisite_data['paragraph']
    tokenizer = SoMaJo("de_CMC", split_camel_case=True)
    # tokenize_text returns a generator of sentences, so materialize it
    # before indexing/counting
    sentences = list(tokenizer.tokenize_text(paragraphs))
    tokens = []
    sentence_alignment = []
    for i, s in enumerate(sentences):
        tokens += [token.text for token in s]
        sentence_alignment += [i] * len(s)
    return {
        'token-somajo': tokens,
        'sentence-somajo': sentence_alignment,
        'token': tokens,
        'sentence': sentence_alignment
    }
def __init__(self, alpha, stemming=True, split_compounds=True, minimal_mode=False):
    self.tokenizer = SoMaJo('de_CMC')
    self.sentence_splitter = SentenceSplitter(is_tuple=False)
    self.alpha = alpha
    self.stemming = stemming
    self.split_compounds = split_compounds
    self.stemmer = SnowballStemmer('german')
    self.minimal_mode = minimal_mode
    self.base_path = pathlib.Path(__file__).parent.absolute()

    self.remove_chars = ['²', '³', '“', '„', ',', '†', '‚', '‘', '–']
    self.remove_chars.extend(list(string.punctuation))
    self.replace_chars = [('ss', 'ß'), ('ä', 'ae'), ('ü', 'ue'), ('ö', 'oe')]

    self.stop = set()
    with open(os.path.join(self.base_path, 'data', 'GermanST_utf8.txt'), 'r') as f:
        for line in f:
            self.stop.add(line.strip())

    if not minimal_mode:
        self.smart_stop = set()
        with open(os.path.join(self.base_path, 'data', 'smart_stop.txt'), 'r') as f:
            for line in f:
                word = line.strip().lower()
                self.smart_stop.add(word)
                for replace_char in self.replace_chars:
                    word = word.replace(replace_char[0], replace_char[1])

        self.lemmas = {}
        with open(os.path.join(self.base_path, 'data',
                               'baseforms_by_projekt_deutscher_wortschatz.txt'), 'r') as f:
            for line in f:
                l = line.strip().split('\t')
                l[0] = l[0].strip().lower()
                l[1] = l[1].strip().lower()
                for replace_char in self.replace_chars:
                    l[0] = l[0].replace(replace_char[0], replace_char[1])
                    l[1] = l[1].replace(replace_char[0], replace_char[1])
                self.lemmas[l[0]] = l[1]
class SoMaJoTokenizer(Tokenizer):
    def __init__(self, language, processes=None):
        from somajo import SoMaJo

        tokenizer_type = {"de": "de_CMC", "en": "en_PTB"}[language]
        self.tokenizer = SoMaJo(tokenizer_type, split_camel_case=True, split_sentences=True)

    def _tokenize_text(self, text):
        sentences = []
        if len(text) == 0:
            return sentences

        for sentence in self.tokenizer.tokenize_text([text]):
            sentences.append([
                Token(token.text, " " if token.space_after else "")
                for token in sentence
            ])

        if not text[-1].isspace():
            sentences[-1][-1] = Token(sentences[-1][-1].text, "")

        return sentences

    def split(self, texts, verbose=False):
        bar = None
        if verbose:
            from tqdm.auto import tqdm
            bar = tqdm(total=len(texts))

        # pool.imap leaks memory for some reason
        for sentences in map(self._tokenize_text, texts):
            yield sentences
            if verbose:
                bar.update(1)
class SoMaJoWordTokenizer(Tokenizer):
    def __init__(self, model_name: str):
        super().__init__()
        self.tokenizer = SoMaJo(model_name, split_sentences=False)

    def tokenize(self, text: str) -> List[str]:
        out_tokens = []
        tokens = next(self.tokenizer.tokenize_text([text]))
        for i, token in enumerate(tokens):
            if "SpaceAfter=No" in token.extra_info or i == len(tokens) - 1:
                whitespace = ""
            else:
                whitespace = " "

            # sometimes sample more spaces than one space so the model learns to deal with it
            while random.random() < 0.05:
                whitespace += " "

            out_tokens.append(token.text + whitespace)

        return [x for x in out_tokens if len(x) > 0]
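# Hedged usage sketch for the word tokenizer above; "en_PTB" is an assumed
# model name. Because split_sentences=False, tokenize_text() yields a single
# token list, which is why next() in tokenize() is sufficient.
word_tokenizer = SoMaJoWordTokenizer("en_PTB")
print(word_tokenizer.tokenize("Don't panic!"))  # tokens with sampled trailing whitespace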
from somajo import SoMaJo


def SentenceSplit(text):
    tokenizer = SoMaJo("de_CMC")
    # tokenize_text expects a list of paragraphs and returns a generator of
    # sentences, each a list of Token objects
    tokens = tokenizer.tokenize_text(text)
    return tokens
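# Illustrative only (not part of the original source): consuming the generator
# returned above; the input paragraph is made up.
for sentence in SentenceSplit(["Erster Satz. Zweiter Satz."]):
    print(" ".join(token.text for token in sentence))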
class GeRouge:
    """
    Computes ROUGE scores on German texts.

    Args:
        alpha: Weighting factor of Recall and Precision. Between 0 and 1.
        stemming: Boolean. Defines whether stemming is used or not.
        split_compounds: Boolean. Defines whether compound words are split or not.
        minimal_mode: Boolean. Skip time consuming steps for quick calculation.
            TODO: specify what exactly is skipped.
    """

    def __init__(self, alpha, stemming=True, split_compounds=True, minimal_mode=False):
        self.tokenizer = SoMaJo('de_CMC')
        self.sentence_splitter = SentenceSplitter(is_tuple=False)
        self.alpha = alpha
        self.stemming = stemming
        self.split_compounds = split_compounds
        self.stemmer = SnowballStemmer('german')
        self.minimal_mode = minimal_mode
        self.base_path = pathlib.Path(__file__).parent.absolute()

        self.remove_chars = ['²', '³', '“', '„', ',', '†', '‚', '‘', '–']
        self.remove_chars.extend(list(string.punctuation))
        self.replace_chars = [('ss', 'ß'), ('ä', 'ae'), ('ü', 'ue'), ('ö', 'oe')]

        self.stop = set()
        with open(os.path.join(self.base_path, 'data', 'GermanST_utf8.txt'), 'r') as f:
            for line in f:
                self.stop.add(line.strip())

        if not minimal_mode:
            self.smart_stop = set()
            with open(os.path.join(self.base_path, 'data', 'smart_stop.txt'), 'r') as f:
                for line in f:
                    word = line.strip().lower()
                    self.smart_stop.add(word)
                    for replace_char in self.replace_chars:
                        word = word.replace(replace_char[0], replace_char[1])

            self.lemmas = {}
            with open(os.path.join(self.base_path, 'data',
                                   'baseforms_by_projekt_deutscher_wortschatz.txt'), 'r') as f:
                for line in f:
                    l = line.strip().split('\t')
                    l[0] = l[0].strip().lower()
                    l[1] = l[1].strip().lower()
                    for replace_char in self.replace_chars:
                        l[0] = l[0].replace(replace_char[0], replace_char[1])
                        l[1] = l[1].replace(replace_char[0], replace_char[1])
                    self.lemmas[l[0]] = l[1]

    def tokenize_sents(self, text):
        # SoMaJo now splits sentences simultaneously
        sents = list(self.tokenizer.tokenize_text([text]))
        length = sum([len(sent) for sent in sents])
        transformed_sents = [list(self.transform_sent(sent)) for sent in sents]
        transformed_sents = [[token for token in sent if token is not None and token != '']
                             for sent in transformed_sents]
        return transformed_sents, length

    @staticmethod
    def create_ngrams(transformed_sents, n=1):
        ngrams_sents = [ngram_splitter(sent, n) for sent in transformed_sents if len(sent) >= n]
        ngrams = set([token for sent in ngrams_sents for token in sent])
        return ngrams

    def transform_sent(self, sent):
        for token in sent:
            token, splitted = self.transform_token(token.text)
            if splitted:
                for partial_token in token:
                    yield partial_token
            else:
                yield token

    def transform_token(self, token):
        if not self.minimal_mode and token.lower().strip() in self.lemmas:
            token = self.lemmas[token.lower().strip()]
        compound_candidates = self.split_compound(token)
        if self.split_compounds and compound_candidates is not None and compound_candidates[0][0] > 0.5 and \
                compound_candidates[0][1] != token:
            return_tokens = []
            for token in compound_candidates[0][1:]:
                if len(token) > 0:
                    tokens, splitted = self.transform_token(token)
                    if splitted:
                        return_tokens.extend(tokens)
                    else:
                        return_tokens.append(tokens)
            return return_tokens, True
        else:
            token = token.lower().strip()
            for remove_char in self.remove_chars:
                token = token.replace(remove_char, '')
            for replace_char in self.replace_chars:
                token = token.replace(replace_char[0], replace_char[1])
            if (token in self.stop
                    or (not self.minimal_mode and token in self.smart_stop)
                    or bool(re.search(r'\d', token))):
                token = ''
            elif self.stemming:
                if not self.minimal_mode and token in self.lemmas:
                    token = self.lemmas[token]
                token = self.stemmer.stem(token)

        return token, False

    def rouge_n(self, reference, summary, ngrams=(1, 2)):
        """
        Computes Rouge-N scores based on n-grams.

        :param reference: Ground truth summary.
        :param summary: Generated prediction summary.
        :param ngrams: For which n-grams to calculate scores. Can be arbitrarily many.
        :return: List of (precision, recall, F1) tuples for each individual n-gram length.
        """
        reference_tokenized, reference_length = self.tokenize_sents(reference)
        summary_tokenized, summary_length = self.tokenize_sents(summary)
        return self.rouge_n_partial(reference_tokenized, summary_tokenized, ngrams)

    def rouge_l(self, reference, summary):
        """
        Calculates Rouge-L based on the longest common sub-sequence.

        :param reference: Ground truth summary.
        :param summary: Generated prediction summary.
        :return: Tuple with (precision, recall, F1) values for Rouge-L.
        """
        reference_tokenized, _ = self.tokenize_sents(reference)
        summary_tokenized, _ = self.tokenize_sents(summary)
        return self.computeL(summary_tokenized, reference_tokenized)

    def rouge_n_partial(self, reference_tokenized, summary_tokenized, ngrams):
        rouge_n = []
        for n in ngrams:
            if n < 1:
                rouge_n.append((0, 0, 0))
                continue
            reference = self.create_ngrams(reference_tokenized, n=n)
            summary = self.create_ngrams(summary_tokenized, n=n)
            if len(reference) == 0 or len(summary) == 0:
                rouge_n.append((0, 0, 0))
                continue
            matches = sum(
                [sum([ngram_reference == ngram_summary for ngram_summary in summary])
                 for ngram_reference in reference])
            rouge_p = matches / len(summary)
            rouge_r = matches / len(reference)
            denominator = (rouge_r * self.alpha) + (rouge_p * (1 - self.alpha))
            if denominator != 0:
                rouge_f1 = (rouge_p * rouge_r) / denominator
            else:
                rouge_f1 = 0.0
            rouge_n.append((rouge_p, rouge_r, rouge_f1))
        return rouge_n

    def computeL(self, sys, ref):
        unionLCS = set()
        ref_size = sum([len(l) for l in ref])
        sys_size = sum([len(l) for l in sys])
        for r in ref:
            for s in sys:
                seq1 = GeRouge.lcs(r, s)
                seq2 = GeRouge.lcs(s, r)
                seq = seq1 if len(seq1) > len(seq2) else seq2
                unionLCS.update(seq)
        if ref_size > 0:
            rouge_r = len(unionLCS) / ref_size
        else:
            rouge_r = 0
        if sys_size > 0:
            rouge_p = len(unionLCS) / sys_size
        else:
            rouge_p = 0
        denominator = (rouge_r * self.alpha) + (rouge_p * (1 - self.alpha))
        if denominator != 0:
            rouge_f1 = (rouge_p * rouge_r) / denominator
        else:
            rouge_f1 = 0.0
        return rouge_p, rouge_r, rouge_f1

    @staticmethod
    def split_compound(word: str):
        """
        Code adapted from: https://github.com/dtuggener/CharSplit
        Return list of possible splits, best first
        :param word: Word to be split
        :return: List of all splits
        """
        word = word.lower()

        # If there is a hyphen in the word, return part of the word behind the last hyphen
        if '-' in word:
            return [[1., '-'.join((word.split('-'))[:-1]).title(), word.split('-')[-1].title()]]

        scores = []  # Score for each possible split position

        # Iterate through characters, start at the fourth character, go to 3rd last
        for n in range(3, len(word) - 2):
            pre_slice = word[:n]

            # Cut off Fugen-S
            if pre_slice.endswith('ts') or pre_slice.endswith('gs') or pre_slice.endswith('ks') \
                    or pre_slice.endswith('hls') or pre_slice.endswith('ns'):
                if len(word[:n - 1]) > 2:
                    pre_slice = word[:n - 1]

            # Start, in, and end probabilities
            pre_slice_prob = []
            in_slice_prob = []
            start_slice_prob = []

            # Extract all ngrams
            for k in range(len(word) + 1, 2, -1):
                # Probability of first compound, given by its ending prob
                if pre_slice_prob == [] and k <= len(pre_slice):
                    end_ngram = pre_slice[-k:]  # Look backwards
                    pre_slice_prob.append(ngram_probs.suffix.get(end_ngram, -1))  # Punish unlikely pre_slice end_ngram

                # Probability of ngram in word, if high, split unlikely
                in_ngram = word[n:n + k]
                in_slice_prob.append(ngram_probs.infix.get(in_ngram, 1))  # Favor ngrams not occurring within words

                # Probability of word starting
                if start_slice_prob == []:
                    ngram = word[n:n + k]
                    # Cut off Fugen-S
                    if ngram.endswith('ts') or ngram.endswith('gs') or ngram.endswith('ks') \
                            or ngram.endswith('hls') or ngram.endswith('ns'):
                        if len(ngram[:-1]) > 2:
                            ngram = ngram[:-1]
                    start_slice_prob.append(ngram_probs.prefix.get(ngram, -1))

            if pre_slice_prob == [] or start_slice_prob == []:
                continue

            start_slice_prob = max(start_slice_prob)
            pre_slice_prob = max(pre_slice_prob)  # Highest, best preslice
            in_slice_prob = min(in_slice_prob)  # Lowest, punish splitting of good ingrams
            score = start_slice_prob - in_slice_prob + pre_slice_prob
            scores.append([score, word[:n].title(), word[n:].title()])

        scores.sort(reverse=True)
        if scores == []:
            scores = [[0, word.title(), word.title()]]
        return sorted(scores, reverse=True)

    @staticmethod
    def lcs(a, b):
        lcsWords = []
        start = 0
        for word1 in a:
            for i in range(start, len(b)):
                word2 = b[i]
                if word1 == word2:
                    lcsWords.append(word2)
                    start = i + 1
        return lcsWords
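# Illustrative only (not from the original project): computing ROUGE-N and
# ROUGE-L with the GeRouge class above. The texts are made up, and the data
# files read in __init__ (plus the ngram_probs module used by split_compound)
# must be available for this to run.
rouge = GeRouge(alpha=0.5)
reference = "Der Bundestag hat das Gesetz verabschiedet."
summary = "Das Gesetz wurde vom Bundestag verabschiedet."
print(rouge.rouge_n(reference, summary, ngrams=(1, 2)))  # [(p, r, f1), (p, r, f1)]
print(rouge.rouge_l(reference, summary))                 # (p, r, f1)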
def __init__(self, model_name: str):
    super().__init__()
    self.tokenizer = SoMaJo(model_name)
            or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False


# =============================================================================
# SoMaJo taken from https://github.com/tsproisl/SoMaJo
# =============================================================================
if False:
    from tqdm import tqdm

    sen_out = []
    tokenizer = SoMaJo("de_CMC", split_camel_case=True)
    for part in tqdm(raw_text):
        sentences = tokenizer.tokenize_text([part])
        for sentence in sentences:
            word_list = [token.text for token in sentence]
            output = " ".join(word_list[:-1])
            output += word_list[-1]
            sen_out.append(output)

    _is_punctuation(raw_text[-1][-1])

    stripped = []
    for index, part in tqdm(enumerate(sen_out)):
        reordered = ""
        for char in part:
            if not _is_punctuation(char):
def setUp(self):
    """Necessary preparations"""
    self.tokenizer = SoMaJo("de_CMC", split_camel_case=True, split_sentences=True)
def __init__(self, model_name: str):
    super().__init__()
    self.tokenizer = SoMaJo(model_name, split_sentences=False)
import gzip
import orjson
from somajo import SoMaJo
from tqdm import tqdm
import argparse

tokenizer = SoMaJo("de_CMC")


# see https://github.com/tsproisl/SoMaJo/issues/17
def detokenize(tokens):
    out = []
    for token in tokens:
        if token.original_spelling is not None:
            out.append(token.original_spelling)
        else:
            out.append(token.text)
        if token.space_after:
            out.append(" ")
    return "".join(out)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('filename')
    args = parser.parse_args()
    input_filename = args.filename
from somajo import SoMaJo
import os
import re
from multiprocessing import Pool, cpu_count

INPUT_DIR = "../data/wiki/"
OUTPUT_DIR = "../output/wiki/"

tokenizer = SoMaJo("de_CMC")
html_tag_patten = re.compile('<[^<>]+>')


# see https://github.com/tsproisl/SoMaJo/issues/17
def detokenize(tokens):
    out = []
    for token in tokens:
        if token.original_spelling is not None:
            out.append(token.original_spelling)
        else:
            out.append(token.text)
        if token.space_after:
            out.append(" ")
    return "".join(out)


def is_doc_start_line(line):
    return line.startswith('<doc')
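# Hedged round-trip sketch for detokenize() above: whitespace is restored from
# original_spelling / space_after (see the linked SoMaJo issue). The sample
# paragraph is illustrative only.
for sentence in tokenizer.tokenize_text(["Das ist ein Test."]):
    print(detokenize(sentence))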
class TokenizedBaseDocument(BaseDocument):
    _tokenizers: ClassVar[Mapping[str, SoMaJo]] = {
        "en": SoMaJo("en_PTB", split_sentences=False),
        "de": SoMaJo("de_CMC", split_sentences=False),
    }
    _lang_callback: ClassVar[Callable[[Mapping[str, object]], str]] = lambda _: "en"
    _text_field_map: ClassVar[Mapping[str, object]]

    @classmethod
    def _make_text_field_map(
        cls, document_cls: _T_DocumentMeta,
    ) -> Mapping[str, object]:
        text_field_map: MutableMapping[str, object] = {}
        mapping = document_cls._doc_type.mapping
        for field_name in mapping:
            field = mapping[field_name]
            if isinstance(field, Text):
                text_field_map[field_name] = True
            elif isinstance(field, Object):
                inner_class = field._doc_class
                inner_text_field_map = cls._make_text_field_map(inner_class)
                if inner_text_field_map:
                    text_field_map[field_name] = inner_text_field_map
        return text_field_map

    @classmethod
    @overrides
    def prepare_doc_dict(cls, doc_dict: MutableMapping[str, object]) -> None:
        super().prepare_doc_dict(doc_dict)

        lang = cls._lang_callback(doc_dict)
        if lang not in cls._tokenizers.keys():
            _LOGGER.error(
                "No tokenizer available for language '{}'. Defaulting to '{}'. "
                "Available languages: {}",
                lang,
                "en",
                ", ".join(cls._tokenizers.keys()),
            )
            lang = "en"

        cls._tokenize_doc_dict(doc_dict, cls._text_field_map, lang)

    @classmethod
    def _tokenize_doc_dict(
        cls,
        doc_dict: MutableMapping[str, object],
        text_field_map: Mapping[str, object],
        lang: str,
    ) -> None:
        for field_name, text_field_or_childs in text_field_map.items():
            # text_field_or_childs is either True or a mapping
            value = doc_dict.get(field_name)
            if not value:
                continue
            elif text_field_or_childs is True:
                (
                    doc_dict[field_name],
                    doc_dict[field_name + "_orig"],
                    doc_dict[field_name + "_tokens"],
                ) = cls._tokenize(checked_cast(str, value), lang)
            elif isinstance(value, MutableMapping):
                cls._tokenize_doc_dict(
                    value, cast(Mapping[str, object], text_field_or_childs), lang
                )
            elif isinstance(value, Sequence):
                for v in value:
                    cls._tokenize_doc_dict(
                        v, cast(Mapping[str, object], text_field_or_childs), lang
                    )
            else:
                raise ValueError(
                    f"Value for Object-field needs to be either a Mapping or a "
                    f"Sequence. The value was: {value}"
                )

    @classmethod
    def _tokenize(cls, text_orig: str, lang: str) -> Tuple[str, str, Sequence[str]]:
        text = text_orig.strip()
        text = normalize("NFKC", text)
        if not text:
            return "", "", []

        try:
            text = str(html.fromstring(text).text_content())
        except LxmlError:
            _LOGGER.warning(
                "lxml HTML parsing failed. Skipping it for this document.",
                exc_info=True,
            )
        if not text:
            return "", "", []

        tokens = [
            token.text.lower()
            for token in next(cls._tokenizers[lang].tokenize_text([text]))
            if (token.token_class not in ["URL", "symbol"])
        ]
        return " ".join(tokens), text_orig, tokens