import os

from somajo import SoMaJo


def split(list_of_text, thread_number, TMP_DIR):
    """
    Splits text into sentences.
    Writes one sentence per line with a leading space (for BPE).
    Every document is separated by a blank line.
    """
    print(os.path.join(TMP_DIR, "Splitted_{:05d}.txt".format(thread_number)))
    outF = open(os.path.join(TMP_DIR, "Splitted_{:05d}.txt".format(thread_number)), "w")
    tokenizer = SoMaJo("de_CMC", split_camel_case=True)
    for part in list_of_text:
        sentences = tokenizer.tokenize_text([part])
        for sentence in sentences:
            output = ""
            for token in sentence:
                if (token.space_after and not token.last_in_sentence
                        and not token.first_in_sentence):
                    output += token.text + ' '
                elif token.first_in_sentence:
                    output += ' ' + token.text + ' '
                else:
                    output += token.text
            outF.write(output)
            outF.write("\n")
        outF.write("\n")
    outF.close()
    return thread_number
def __init__(self, language, processes=None):
    from somajo import SoMaJo

    tokenizer_type = {"de": "de_CMC", "en": "en_PTB"}[language]
    self.tokenizer = SoMaJo(tokenizer_type, split_camel_case=True, split_sentences=True)
class TestSentenceSplitter(unittest.TestCase):
    """Tests for SoMaJo's sentence splitting."""

    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = SoMaJo("de_CMC", split_camel_case=True, split_sentences=True)

    def _equal(self, raw, tokenized_sentences):
        """Compare the tokenized sentences of `raw` with the expected sentences."""
        sentences = self.tokenizer.tokenize_text([raw])
        sentences = [" ".join([t.text for t in s]) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)

    def _equal_xml(self, raw, tokenized_sentences):
        """Compare the tokenized sentences of the XML input `raw` with the expected sentences."""
        eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
        eos_tags = set(eos_tags)
        sentences = self.tokenizer.tokenize_xml(raw, eos_tags)
        sentences = [" ".join([t.text for t in s]) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)

    def _equal_xml_strip(self, raw, tokenized_sentences):
        """Like _equal_xml, but with the markup stripped from the output."""
        eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
        eos_tags = set(eos_tags)
        sentences = self.tokenizer.tokenize_xml(raw, eos_tags, strip_tags=True)
        sentences = [" ".join([t.text for t in s]) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)
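    # Hedged example (not from the original test suite): a concrete test that
    # would sit inside TestSentenceSplitter and use the _equal helper above.
    # The sample sentence and its expected split are illustrative.
    def test_two_sentences(self):
        self._equal("Das ist ein Satz. Das ist noch einer.",
                    ["Das ist ein Satz .", "Das ist noch einer ."])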
from somajo import SoMaJo


def tokenize(text):
    tokenizer = SoMaJo(language="de_CMC")
    for i in range(len(text)):
        text[i] = text[i].split()
        tok = tokenizer.tokenize_text(text[i])
        tok_sent = []
        for sent in tok:
            for word in sent:
                tok_sent.append(word.text)
        text[i] = tok_sent
def tokenizer(self, text):
    tokenizer = SoMaJo("en_PTB")
    tokenized_object = tokenizer.tokenize_text([text])

    sentences = []
    types = []
    for sent in tokenized_object:
        sentence = []
        for token in sent:
            sentence.append(token.text)
            types.append(token.token_class)
        sentences.append(sentence)

    self.output['tokens'] = sentences
    self.output['types'] = types
    return sentences, types
from somajo import SoMaJo


def replace_hashtags_tokenizer(text):
    tokenizer = SoMaJo("de_CMC", split_camel_case=True)
    for i in range(len(text)):
        line = text[i].split()
        for j in range(len(line)):
            if line[j].startswith('#'):
                hashtag = []
                line[j] = line[j].replace('#', "")
                hashtag.append(line[j])
                tok_hashtag = tokenizer.tokenize_text(hashtag)
                for tok in tok_hashtag:
                    for t in tok:
                        print(t.text)
        text[i] = " ".join(line)
    return text
class SoMaJoSentenceTokenizer(Tokenizer):
    def __init__(self, model_name: str):
        super().__init__()
        self.tokenizer = SoMaJo(model_name)

    def tokenize(self, text: str) -> List[str]:
        out_sentences = []
        sentences = list(self.tokenizer.tokenize_text([text]))
        for i, sentence in enumerate(sentences):
            text = ""
            for token in sentence:
                if "SpaceAfter=No" in token.extra_info:
                    whitespace = ""
                else:
                    whitespace = " "
                text += token.text + whitespace
            if i == len(sentences) - 1:
                text = text.rstrip()
            out_sentences.append(text)
        return out_sentences
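# A minimal usage sketch for the class above (not part of the original source);
# the model name "de_CMC" and the sample text are assumptions for illustration.
splitter = SoMaJoSentenceTokenizer("de_CMC")
for sent in splitter.tokenize("Das ist ein Satz. Und hier noch einer."):
    print(sent)  # one detokenized sentence per line, spacing restored via extra_info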
def main():
    args = arguments()
    n_tokens = 0
    n_sentences = 0
    t0 = time.perf_counter()
    is_xml = False
    if args.xml or args.tag is not None:
        is_xml = True
    tokenizer = SoMaJo(args.language,
                       split_camel_case=args.split_camel_case,
                       split_sentences=args.split_sentences,
                       xml_sentences=args.sentence_tag)
    if is_xml:
        eos_tags = args.tag
        if eos_tags is None:
            eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
        eos_tags = set(eos_tags)
        chunks = tokenizer.tokenize_xml_file(args.FILE, eos_tags,
                                             strip_tags=args.strip_tags,
                                             parallel=args.parallel)
    else:
        chunks = tokenizer.tokenize_text_file(args.FILE,
                                              args.paragraph_separator,
                                              parallel=args.parallel)
    for chunk in chunks:
        n_sentences += 1
        for token in chunk:
            output = token.text
            if not token.markup:
                n_tokens += 1
                if args.token_classes:
                    output += "\t" + token.token_class
                if args.extra_info:
                    output += "\t" + token.extra_info
            print(output)
        if args.split_sentences and args.sentence_tag is None:
            print()
    t1 = time.perf_counter()
    if args.split_sentences:
        logging.info("Tokenized %d tokens (%d sentences) in %d seconds (%d tokens/s)"
                     % (n_tokens, n_sentences, t1 - t0, n_tokens / (t1 - t0)))
    else:
        logging.info("Tokenized %d tokens in %d seconds (%d tokens/s)"
                     % (n_tokens, t1 - t0, n_tokens / (t1 - t0)))
def make(self, prerequisite_data):
    paragraphs = prerequisite_data['paragraph']
    tokenizer = SoMaJo("de_CMC", split_camel_case=True)
    # tokenize_text returns a generator of sentences, so materialize it
    # before indexing/counting
    sentences = list(tokenizer.tokenize_text(paragraphs))
    tokens = []
    sentence_alignment = []
    for i, s in enumerate(sentences):
        tokens += [token.text for token in s]
        sentence_alignment += [i] * len(s)
    return {
        'token-somajo': tokens,
        'sentence-somajo': sentence_alignment,
        'token': tokens,
        'sentence': sentence_alignment
    }
def __init__(self, alpha, stemming=True, split_compounds=True, minimal_mode=False):
    self.tokenizer = SoMaJo('de_CMC')
    self.sentence_splitter = SentenceSplitter(is_tuple=False)
    self.alpha = alpha
    self.stemming = stemming
    self.split_compounds = split_compounds
    self.stemmer = SnowballStemmer('german')
    self.minimal_mode = minimal_mode
    self.base_path = pathlib.Path(__file__).parent.absolute()

    self.remove_chars = ['²', '³', '“', '„', ',', '†', '‚', '‘', '–']
    self.remove_chars.extend(list(string.punctuation))
    self.replace_chars = [('ss', 'ß'), ('ä', 'ae'), ('ü', 'ue'), ('ö', 'oe')]

    self.stop = set()
    with open(os.path.join(self.base_path, 'data', 'GermanST_utf8.txt'), 'r') as f:
        for line in f:
            self.stop.add(line.strip())

    if not minimal_mode:
        self.smart_stop = set()
        with open(os.path.join(self.base_path, 'data', 'smart_stop.txt'), 'r') as f:
            for line in f:
                word = line.strip().lower()
                self.smart_stop.add(word)
                for replace_char in self.replace_chars:
                    word = word.replace(replace_char[0], replace_char[1])

        self.lemmas = {}
        with open(os.path.join(self.base_path, 'data',
                               'baseforms_by_projekt_deutscher_wortschatz.txt'), 'r') as f:
            for line in f:
                l = line.strip().split('\t')
                l[0] = l[0].strip().lower()
                l[1] = l[1].strip().lower()
                for replace_char in self.replace_chars:
                    l[0] = l[0].replace(replace_char[0], replace_char[1])
                    l[1] = l[1].replace(replace_char[0], replace_char[1])
                self.lemmas[l[0]] = l[1]
class SoMaJoTokenizer(Tokenizer):
    def __init__(self, language, processes=None):
        from somajo import SoMaJo

        tokenizer_type = {"de": "de_CMC", "en": "en_PTB"}[language]
        self.tokenizer = SoMaJo(tokenizer_type, split_camel_case=True, split_sentences=True)

    def _tokenize_text(self, text):
        sentences = []
        if len(text) == 0:
            return sentences

        for sentence in self.tokenizer.tokenize_text([text]):
            sentences.append([
                Token(token.text, " " if token.space_after else "")
                for token in sentence
            ])

        if not text[-1].isspace():
            sentences[-1][-1] = Token(sentences[-1][-1].text, "")

        return sentences

    def split(self, texts, verbose=False):
        bar = None
        if verbose:
            from tqdm.auto import tqdm
            bar = tqdm(total=len(texts))

        # pool.imap leaks memory for some reason
        for sentences in map(self._tokenize_text, texts):
            yield sentences
            if verbose:
                bar.update(1)
class SoMaJoWordTokenizer(Tokenizer):
    def __init__(self, model_name: str):
        super().__init__()
        self.tokenizer = SoMaJo(model_name, split_sentences=False)

    def tokenize(self, text: str) -> List[str]:
        out_tokens = []
        tokens = next(self.tokenizer.tokenize_text([text]))
        for i, token in enumerate(tokens):
            if "SpaceAfter=No" in token.extra_info or i == len(tokens) - 1:
                whitespace = ""
            else:
                whitespace = " "

            # sometimes sample more spaces than one space so the model learns to deal with it
            while random.random() < 0.05:
                whitespace += " "

            out_tokens.append(token.text + whitespace)

        return [x for x in out_tokens if len(x) > 0]
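# Hedged usage sketch for the word tokenizer above; "en_PTB" is an assumed
# model name. Because split_sentences=False, tokenize_text() yields a single
# token list, which is why next() in tokenize() is sufficient.
word_tokenizer = SoMaJoWordTokenizer("en_PTB")
print(word_tokenizer.tokenize("Don't panic!"))  # tokens with sampled trailing whitespace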
from somajo import SoMaJo


def SentenceSplit(text):
    tokenizer = SoMaJo("de_CMC")
    # tokenize_text expects a list of paragraphs and returns a generator of
    # sentences, each a list of Token objects
    tokens = tokenizer.tokenize_text(text)
    return tokens
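# Illustrative only (not part of the original source): consuming the generator
# returned above; the input paragraph is made up.
for sentence in SentenceSplit(["Erster Satz. Zweiter Satz."]):
    print(" ".join(token.text for token in sentence))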
class GeRouge:
    """
    Computes ROUGE scores on German texts.

    Args:
        alpha: Weighting factor of Recall and Precision. Between 0 and 1.
        stemming: Boolean. Defines whether stemming is used or not.
        split_compounds: Boolean. Defines whether compound words are split or not.
        minimal_mode: Boolean. Skip time consuming steps for quick calculation.
            TODO: specify what exactly is skipped.
    """

    def __init__(self, alpha, stemming=True, split_compounds=True, minimal_mode=False):
        self.tokenizer = SoMaJo('de_CMC')
        self.sentence_splitter = SentenceSplitter(is_tuple=False)
        self.alpha = alpha
        self.stemming = stemming
        self.split_compounds = split_compounds
        self.stemmer = SnowballStemmer('german')
        self.minimal_mode = minimal_mode
        self.base_path = pathlib.Path(__file__).parent.absolute()

        self.remove_chars = ['²', '³', '“', '„', ',', '†', '‚', '‘', '–']
        self.remove_chars.extend(list(string.punctuation))
        self.replace_chars = [('ss', 'ß'), ('ä', 'ae'), ('ü', 'ue'), ('ö', 'oe')]

        self.stop = set()
        with open(os.path.join(self.base_path, 'data', 'GermanST_utf8.txt'), 'r') as f:
            for line in f:
                self.stop.add(line.strip())

        if not minimal_mode:
            self.smart_stop = set()
            with open(os.path.join(self.base_path, 'data', 'smart_stop.txt'), 'r') as f:
                for line in f:
                    word = line.strip().lower()
                    self.smart_stop.add(word)
                    for replace_char in self.replace_chars:
                        word = word.replace(replace_char[0], replace_char[1])

            self.lemmas = {}
            with open(os.path.join(self.base_path, 'data',
                                   'baseforms_by_projekt_deutscher_wortschatz.txt'), 'r') as f:
                for line in f:
                    l = line.strip().split('\t')
                    l[0] = l[0].strip().lower()
                    l[1] = l[1].strip().lower()
                    for replace_char in self.replace_chars:
                        l[0] = l[0].replace(replace_char[0], replace_char[1])
                        l[1] = l[1].replace(replace_char[0], replace_char[1])
                    self.lemmas[l[0]] = l[1]

    def tokenize_sents(self, text):
        # SoMaJo now splits sentences simultaneously
        sents = list(self.tokenizer.tokenize_text([text]))
        length = sum([len(sent) for sent in sents])
        transformed_sents = [list(self.transform_sent(sent)) for sent in sents]
        transformed_sents = [[token for token in sent if token is not None and token != '']
                             for sent in transformed_sents]
        return transformed_sents, length

    @staticmethod
    def create_ngrams(transformed_sents, n=1):
        ngrams_sents = [ngram_splitter(sent, n) for sent in transformed_sents if len(sent) >= n]
        ngrams = set([token for sent in ngrams_sents for token in sent])
        return ngrams

    def transform_sent(self, sent):
        for token in sent:
            token, splitted = self.transform_token(token.text)
            if splitted:
                for partial_token in token:
                    yield partial_token
            else:
                yield token

    def transform_token(self, token):
        if not self.minimal_mode and token.lower().strip() in self.lemmas:
            token = self.lemmas[token.lower().strip()]
        compound_candidates = self.split_compound(token)
        if self.split_compounds and compound_candidates is not None and compound_candidates[0][0] > 0.5 and \
                compound_candidates[0][1] != token:
            return_tokens = []
            for token in compound_candidates[0][1:]:
                if len(token) > 0:
                    tokens, splitted = self.transform_token(token)
                    if splitted:
                        return_tokens.extend(tokens)
                    else:
                        return_tokens.append(tokens)
            return return_tokens, True
        else:
            token = token.lower().strip()
            for remove_char in self.remove_chars:
                token = token.replace(remove_char, '')
            for replace_char in self.replace_chars:
                token = token.replace(replace_char[0], replace_char[1])
            if (token in self.stop
                    or (not self.minimal_mode and token in self.smart_stop)
                    or bool(re.search(r'\d', token))):
                token = ''
            elif self.stemming:
                if not self.minimal_mode and token in self.lemmas:
                    token = self.lemmas[token]
                token = self.stemmer.stem(token)

        return token, False

    def rouge_n(self, reference, summary, ngrams=(1, 2)):
        """
        Computes Rouge-N scores based on n-grams.

        :param reference: Ground truth summary.
        :param summary: Generated prediction summary.
        :param ngrams: For which n-grams to calculate scores. Can be arbitrarily many.
        :return: List of (precision, recall, F1) tuples for each individual n-gram length.
        """
        reference_tokenized, reference_length = self.tokenize_sents(reference)
        summary_tokenized, summary_length = self.tokenize_sents(summary)
        return self.rouge_n_partial(reference_tokenized, summary_tokenized, ngrams)

    def rouge_l(self, reference, summary):
        """
        Calculates Rouge-L based on the longest common sub-sequence.

        :param reference: Ground truth summary.
        :param summary: Generated prediction summary.
        :return: Tuple with (precision, recall, F1) values for Rouge-L.
        """
        reference_tokenized, _ = self.tokenize_sents(reference)
        summary_tokenized, _ = self.tokenize_sents(summary)
        return self.computeL(summary_tokenized, reference_tokenized)

    def rouge_n_partial(self, reference_tokenized, summary_tokenized, ngrams):
        rouge_n = []
        for n in ngrams:
            if n < 1:
                rouge_n.append((0, 0, 0))
                continue
            reference = self.create_ngrams(reference_tokenized, n=n)
            summary = self.create_ngrams(summary_tokenized, n=n)
            if len(reference) == 0 or len(summary) == 0:
                rouge_n.append((0, 0, 0))
                continue
            matches = sum(
                [sum([ngram_reference == ngram_summary for ngram_summary in summary])
                 for ngram_reference in reference])
            rouge_p = matches / len(summary)
            rouge_r = matches / len(reference)
            denominator = (rouge_r * self.alpha) + (rouge_p * (1 - self.alpha))
            if denominator != 0:
                rouge_f1 = (rouge_p * rouge_r) / denominator
            else:
                rouge_f1 = 0.0
            rouge_n.append((rouge_p, rouge_r, rouge_f1))
        return rouge_n

    def computeL(self, sys, ref):
        unionLCS = set()
        ref_size = sum([len(l) for l in ref])
        sys_size = sum([len(l) for l in sys])
        for r in ref:
            for s in sys:
                seq1 = GeRouge.lcs(r, s)
                seq2 = GeRouge.lcs(s, r)
                seq = seq1 if len(seq1) > len(seq2) else seq2
                unionLCS.update(seq)
        if ref_size > 0:
            rouge_r = len(unionLCS) / ref_size
        else:
            rouge_r = 0
        if sys_size > 0:
            rouge_p = len(unionLCS) / sys_size
        else:
            rouge_p = 0
        denominator = (rouge_r * self.alpha) + (rouge_p * (1 - self.alpha))
        if denominator != 0:
            rouge_f1 = (rouge_p * rouge_r) / denominator
        else:
            rouge_f1 = 0.0
        return rouge_p, rouge_r, rouge_f1

    @staticmethod
    def split_compound(word: str):
        """
        Code adapted from: https://github.com/dtuggener/CharSplit
        Return list of possible splits, best first
        :param word: Word to be split
        :return: List of all splits
        """
        word = word.lower()

        # If there is a hyphen in the word, return part of the word behind the last hyphen
        if '-' in word:
            return [[1., '-'.join((word.split('-'))[:-1]).title(), word.split('-')[-1].title()]]

        scores = []  # Score for each possible split position

        # Iterate through characters, start at the fourth character, go to 3rd last
        for n in range(3, len(word) - 2):
            pre_slice = word[:n]

            # Cut off Fugen-S
            if pre_slice.endswith('ts') or pre_slice.endswith('gs') or pre_slice.endswith('ks') \
                    or pre_slice.endswith('hls') or pre_slice.endswith('ns'):
                if len(word[:n - 1]) > 2:
                    pre_slice = word[:n - 1]

            # Start, in, and end probabilities
            pre_slice_prob = []
            in_slice_prob = []
            start_slice_prob = []

            # Extract all ngrams
            for k in range(len(word) + 1, 2, -1):
                # Probability of first compound, given by its ending prob
                if pre_slice_prob == [] and k <= len(pre_slice):
                    end_ngram = pre_slice[-k:]  # Look backwards
                    pre_slice_prob.append(ngram_probs.suffix.get(end_ngram, -1))  # Punish unlikely pre_slice end_ngram

                # Probability of ngram in word, if high, split unlikely
                in_ngram = word[n:n + k]
                in_slice_prob.append(ngram_probs.infix.get(in_ngram, 1))  # Favor ngrams not occurring within words

                # Probability of word starting
                if start_slice_prob == []:
                    ngram = word[n:n + k]
                    # Cut off Fugen-S
                    if ngram.endswith('ts') or ngram.endswith('gs') or ngram.endswith('ks') \
                            or ngram.endswith('hls') or ngram.endswith('ns'):
                        if len(ngram[:-1]) > 2:
                            ngram = ngram[:-1]
                    start_slice_prob.append(ngram_probs.prefix.get(ngram, -1))

            if pre_slice_prob == [] or start_slice_prob == []:
                continue

            start_slice_prob = max(start_slice_prob)
            pre_slice_prob = max(pre_slice_prob)  # Highest, best preslice
            in_slice_prob = min(in_slice_prob)  # Lowest, punish splitting of good ingrams
            score = start_slice_prob - in_slice_prob + pre_slice_prob
            scores.append([score, word[:n].title(), word[n:].title()])

        scores.sort(reverse=True)
        if scores == []:
            scores = [[0, word.title(), word.title()]]
        return sorted(scores, reverse=True)

    @staticmethod
    def lcs(a, b):
        lcsWords = []
        start = 0
        for word1 in a:
            for i in range(start, len(b)):
                word2 = b[i]
                if word1 == word2:
                    lcsWords.append(word2)
                    start = i + 1
        return lcsWords
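# Illustrative only (not from the original project): computing ROUGE-N and
# ROUGE-L with the GeRouge class above. The texts are made up, and the data
# files read in __init__ (plus the ngram_probs module used by split_compound)
# must be available for this to run.
rouge = GeRouge(alpha=0.5)
reference = "Der Bundestag hat das Gesetz verabschiedet."
summary = "Das Gesetz wurde vom Bundestag verabschiedet."
print(rouge.rouge_n(reference, summary, ngrams=(1, 2)))  # [(p, r, f1), (p, r, f1)]
print(rouge.rouge_l(reference, summary))                 # (p, r, f1)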
def __init__(self, model_name: str):
    super().__init__()
    self.tokenizer = SoMaJo(model_name)
            or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False


# =============================================================================
# SoMaJo taken from https://github.com/tsproisl/SoMaJo
# =============================================================================
if False:
    from tqdm import tqdm

    sen_out = []
    tokenizer = SoMaJo("de_CMC", split_camel_case=True)
    for part in tqdm(raw_text):
        sentences = tokenizer.tokenize_text([part])
        for sentence in sentences:
            word_list = [token.text for token in sentence]
            output = " ".join(word_list[:-1])
            output += word_list[-1]
            sen_out.append(output)

    _is_punctuation(raw_text[-1][-1])

    stripped = []
    for index, part in tqdm(enumerate(sen_out)):
        reordered = ""
        for char in part:
            if not _is_punctuation(char):
def setUp(self):
    """Necessary preparations"""
    self.tokenizer = SoMaJo("de_CMC", split_camel_case=True, split_sentences=True)
def __init__(self, model_name: str):
    super().__init__()
    self.tokenizer = SoMaJo(model_name, split_sentences=False)
import gzip
import orjson
from somajo import SoMaJo
from tqdm import tqdm
import argparse

tokenizer = SoMaJo("de_CMC")


# see https://github.com/tsproisl/SoMaJo/issues/17
def detokenize(tokens):
    out = []
    for token in tokens:
        if token.original_spelling is not None:
            out.append(token.original_spelling)
        else:
            out.append(token.text)
        if token.space_after:
            out.append(" ")
    return "".join(out)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('filename')
    args = parser.parse_args()
    input_filename = args.filename
from somajo import SoMaJo
import os
import re
from multiprocessing import Pool, cpu_count

INPUT_DIR = "../data/wiki/"
OUTPUT_DIR = "../output/wiki/"

tokenizer = SoMaJo("de_CMC")
html_tag_patten = re.compile('<[^<>]+>')


# see https://github.com/tsproisl/SoMaJo/issues/17
def detokenize(tokens):
    out = []
    for token in tokens:
        if token.original_spelling is not None:
            out.append(token.original_spelling)
        else:
            out.append(token.text)
        if token.space_after:
            out.append(" ")
    return "".join(out)


def is_doc_start_line(line):
    return line.startswith('<doc')
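# Hedged round-trip sketch for detokenize() above: whitespace is restored from
# original_spelling / space_after (see the linked SoMaJo issue). The sample
# paragraph is illustrative only.
for sentence in tokenizer.tokenize_text(["Das ist ein Test."]):
    print(detokenize(sentence))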
class TokenizedBaseDocument(BaseDocument):
    _tokenizers: ClassVar[Mapping[str, SoMaJo]] = {
        "en": SoMaJo("en_PTB", split_sentences=False),
        "de": SoMaJo("de_CMC", split_sentences=False),
    }
    _lang_callback: ClassVar[Callable[[Mapping[str, object]], str]] = lambda _: "en"
    _text_field_map: ClassVar[Mapping[str, object]]

    @classmethod
    def _make_text_field_map(
        cls, document_cls: _T_DocumentMeta,
    ) -> Mapping[str, object]:
        text_field_map: MutableMapping[str, object] = {}
        mapping = document_cls._doc_type.mapping
        for field_name in mapping:
            field = mapping[field_name]
            if isinstance(field, Text):
                text_field_map[field_name] = True
            elif isinstance(field, Object):
                inner_class = field._doc_class
                inner_text_field_map = cls._make_text_field_map(inner_class)
                if inner_text_field_map:
                    text_field_map[field_name] = inner_text_field_map
        return text_field_map

    @classmethod
    @overrides
    def prepare_doc_dict(cls, doc_dict: MutableMapping[str, object]) -> None:
        super().prepare_doc_dict(doc_dict)

        lang = cls._lang_callback(doc_dict)
        if lang not in cls._tokenizers.keys():
            _LOGGER.error(
                "No tokenizer available for language '{}'. Defaulting to '{}'. "
                "Available languages: {}",
                lang,
                "en",
                ", ".join(cls._tokenizers.keys()),
            )
            lang = "en"

        cls._tokenize_doc_dict(doc_dict, cls._text_field_map, lang)

    @classmethod
    def _tokenize_doc_dict(
        cls,
        doc_dict: MutableMapping[str, object],
        text_field_map: Mapping[str, object],
        lang: str,
    ) -> None:
        for field_name, text_field_or_childs in text_field_map.items():
            # text_field_or_childs is either True or a mapping
            value = doc_dict.get(field_name)
            if not value:
                continue
            elif text_field_or_childs is True:
                (
                    doc_dict[field_name],
                    doc_dict[field_name + "_orig"],
                    doc_dict[field_name + "_tokens"],
                ) = cls._tokenize(checked_cast(str, value), lang)
            elif isinstance(value, MutableMapping):
                cls._tokenize_doc_dict(
                    value, cast(Mapping[str, object], text_field_or_childs), lang
                )
            elif isinstance(value, Sequence):
                for v in value:
                    cls._tokenize_doc_dict(
                        v, cast(Mapping[str, object], text_field_or_childs), lang
                    )
            else:
                raise ValueError(
                    f"Value for Object-field needs to be either a Mapping or a "
                    f"Sequence. The value was: {value}"
                )

    @classmethod
    def _tokenize(cls, text_orig: str, lang: str) -> Tuple[str, str, Sequence[str]]:
        text = text_orig.strip()
        text = normalize("NFKC", text)
        if not text:
            return "", "", []

        try:
            text = str(html.fromstring(text).text_content())
        except LxmlError:
            _LOGGER.warning(
                "lxml HTML parsing failed. Skipping it for this document.",
                exc_info=True,
            )
        if not text:
            return "", "", []

        tokens = [
            token.text.lower()
            for token in next(cls._tokenizers[lang].tokenize_text([text]))
            if (token.token_class not in ["URL", "symbol"])
        ]
        return " ".join(tokens), text_orig, tokens