def split(list_of_text, thread_number, TMP_DIR):
    """Sentence-split documents and write them to a per-thread text file.

    Every document in ``list_of_text`` is tokenized with SoMaJo. Each
    sentence is written on its own line with a leading space (for BPE), and
    each document is followed by an empty line.

    Args:
        list_of_text: Iterable of raw document strings.
        thread_number: Integer used to build the output file name
            ``Splitted_<thread_number, zero-padded to 5 digits>.txt``.
        TMP_DIR: Directory in which the output file is created.

    Returns:
        ``thread_number``, unchanged.
    """
    out_path = os.path.join(TMP_DIR, "Splitted_{:05d}.txt".format(thread_number))
    print(out_path)
    # The context manager guarantees the file is flushed and closed, even
    # when tokenization raises part-way through.
    with open(out_path, "w") as outF:
        tokenizer = SoMaJo("de_CMC", split_camel_case=True)
        for part in list_of_text:
            for sentence in tokenizer.tokenize_text([part]):
                pieces = []
                for token in sentence:
                    if (token.space_after and not token.last_in_sentence
                            and not token.first_in_sentence):
                        pieces.append(token.text + ' ')
                    elif token.first_in_sentence:
                        # The first token carries the leading space and is
                        # always followed by a space.
                        pieces.append(' ' + token.text + ' ')
                    else:
                        pieces.append(token.text)
                outF.write("".join(pieces))
                outF.write("\n")
            # An empty line separates documents.
            outF.write("\n")
    return thread_number
class TestSentenceSplitter(unittest.TestCase):
    """Fixture and comparison helpers for sentence-splitter test cases."""

    def setUp(self):
        """Create the tokenizer used by the comparison helpers."""
        self.tokenizer = SoMaJo("de_CMC", split_camel_case=True, split_sentences=True)

    def _equal(self, raw, tokenized_sentences):
        """Tokenize ``raw`` as a single paragraph and compare the result.

        Each sentence is rendered as its token texts joined by single spaces
        and the list of renderings must equal ``tokenized_sentences``.
        """
        rendered = []
        for sentence in self.tokenizer.tokenize_text([raw]):
            rendered.append(" ".join(token.text for token in sentence))
        self.assertEqual(rendered, tokenized_sentences)

    def _equal_xml(self, raw, tokenized_sentences):
        """Tokenize ``raw`` as XML and compare the rendered sentences.

        The listed tag names are passed to ``tokenize_xml`` as ``eos_tags``.
        """
        eos_tags = set("title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split())
        rendered = []
        for sentence in self.tokenizer.tokenize_xml(raw, eos_tags):
            rendered.append(" ".join(token.text for token in sentence))
        self.assertEqual(rendered, tokenized_sentences)

    def _equal_xml_strip(self, raw, tokenized_sentences):
        """Same as ``_equal_xml`` but calls ``tokenize_xml`` with ``strip_tags=True``."""
        eos_tags = set("title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split())
        rendered = []
        for sentence in self.tokenizer.tokenize_xml(raw, eos_tags, strip_tags=True):
            rendered.append(" ".join(token.text for token in sentence))
        self.assertEqual(rendered, tokenized_sentences)
class SoMaJoSentenceTokenizer(Tokenizer):
    """Sentence splitter that delegates to a SoMaJo model."""

    def __init__(self, model_name: str):
        super().__init__()
        self.tokenizer = SoMaJo(model_name)

    def tokenize(self, text: str) -> List[str]:
        """Split ``text`` into sentence strings.

        Every token is followed by a single space unless its ``extra_info``
        contains ``"SpaceAfter=No"``. Only the final sentence has its
        trailing whitespace stripped.
        """
        sentences = list(self.tokenizer.tokenize_text([text]))
        final_index = len(sentences) - 1

        rendered = []
        for index, sentence in enumerate(sentences):
            joined = "".join(
                token.text + ("" if "SpaceAfter=No" in token.extra_info else " ")
                for token in sentence
            )
            rendered.append(joined.rstrip() if index == final_index else joined)
        return rendered
def tokenize(text):
    """Replace every entry of ``text`` in place by its SoMaJo token strings.

    Each entry is first split on whitespace; the resulting pieces are handed
    to ``tokenize_text`` as the paragraph list, and the entry is replaced by
    the flat list of token texts over all returned sentences. Nothing is
    returned; ``text`` itself is modified.
    """
    somajo = SoMaJo(language="de_CMC")
    for index, entry in enumerate(text):
        pieces = entry.split()
        # The slot holds the whitespace-split pieces while tokenization runs.
        text[index] = pieces
        text[index] = [
            token.text
            for sentence in somajo.tokenize_text(pieces)
            for token in sentence
        ]
def tokenizer(self, text):
    """Tokenize ``text`` with SoMaJo's ``en_PTB`` model.

    Stores the results in ``self.output['tokens']`` and
    ``self.output['types']`` and returns them.

    Returns:
        A pair ``(sentences, types)``: ``sentences`` is a list with one list
        of token texts per sentence; ``types`` is a single flat list of the
        ``token_class`` of every token, in order, across all sentences.
    """
    somajo = SoMaJo("en_PTB")
    sentences = []
    types = []
    for sent in somajo.tokenize_text([text]):
        sentences.append([token.text for token in sent])
        types.extend(token.token_class for token in sent)

    self.output['tokens'] = sentences
    self.output['types'] = types
    return sentences, types
def replace_hashtags_tokenizer(text):
    """Strip ``#`` from hashtag words in every line of ``text`` (in place).

    Each line is split on whitespace. For every word that starts with ``#``,
    all ``#`` characters are removed, the result is tokenized with SoMaJo
    (camel-case splitting enabled) and the token texts are printed. The
    tokenized form is only printed; the line keeps the hashtag word with
    ``#`` removed. Lines are re-joined with single spaces.

    Returns:
        The same ``text`` object, after modification.
    """
    somajo = SoMaJo("de_CMC", split_camel_case=True)
    for row, raw_line in enumerate(text):
        words = raw_line.split()
        for col, word in enumerate(words):
            if not word.startswith('#'):
                continue
            words[col] = word.replace('#', "")
            for sentence in somajo.tokenize_text([words[col]]):
                for token in sentence:
                    print(token.text)
        text[row] = " ".join(words)
    return text
def make(self, prerequisite_data):
    """Tokenize the paragraphs and align every token with its sentence.

    Args:
        prerequisite_data: Mapping whose ``'paragraph'`` entry is passed to
            SoMaJo's ``tokenize_text`` as the paragraph list.

    Returns:
        A dict with the token texts under ``'token-somajo'`` and ``'token'``
        and, under ``'sentence-somajo'`` and ``'sentence'``, a list of the
        same length giving the 0-based sentence index of each token. Both
        token keys share one list object, as do both sentence keys.
    """
    paragraphs = prerequisite_data['paragraph']
    tokenizer = SoMaJo("de_CMC", split_camel_case=True)

    tokens = []
    sentence_alignment = []
    # tokenize_text is documented to return an iterator, which has no len();
    # enumerate numbers the sentences without needing one.
    for index, sentence in enumerate(tokenizer.tokenize_text(paragraphs)):
        tokens += [token.text for token in sentence]
        sentence_alignment += [index] * len(sentence)

    return {
        'token-somajo': tokens,
        'sentence-somajo': sentence_alignment,
        'token': tokens,
        'sentence': sentence_alignment
    }
class SoMaJoTokenizer(Tokenizer):
    """Tokenizer that splits texts into sentences of ``Token`` objects via SoMaJo."""

    def __init__(self, language, processes=None):
        """Create the SoMaJo tokenizer for ``language``.

        Args:
            language: ``"de"`` or ``"en"``; any other value raises ``KeyError``.
            processes: Accepted for interface compatibility; not used here.
        """
        from somajo import SoMaJo

        tokenizer_type = {"de": "de_CMC", "en": "en_PTB"}[language]
        self.tokenizer = SoMaJo(tokenizer_type, split_camel_case=True, split_sentences=True)

    def _tokenize_text(self, text):
        """Return ``text`` as a list of sentences, each a list of ``Token``.

        Every token carries ``" "`` as trailing whitespace when SoMaJo
        reports ``space_after`` and ``""`` otherwise. If ``text`` does not
        end in whitespace, the very last token gets ``""``.
        """
        sentences = []
        if not text:
            return sentences

        for sentence in self.tokenizer.tokenize_text([text]):
            sentences.append([
                Token(token.text, " " if token.space_after else "")
                for token in sentence
            ])

        # Guard against the tokenizer yielding nothing for a non-empty text,
        # which would otherwise raise IndexError here.
        if sentences and sentences[-1] and not text[-1].isspace():
            sentences[-1][-1] = Token(sentences[-1][-1].text, "")
        return sentences

    def split(self, texts, verbose=False):
        """Yield the tokenized sentences of every text in ``texts``, in order.

        Args:
            texts: Sized collection of strings.
            verbose: When true, show a tqdm progress bar.
        """
        bar = None
        if verbose:
            from tqdm.auto import tqdm

            bar = tqdm(total=len(texts))

        try:
            # pool.imap leaks memory for some reason
            for sentences in map(self._tokenize_text, texts):
                yield sentences
                if bar is not None:
                    bar.update(1)
        finally:
            # Close the bar even when the consumer stops iterating early.
            if bar is not None:
                bar.close()
class SoMaJoWordTokenizer(Tokenizer):
    """Word tokenizer that delegates to a SoMaJo model without sentence splitting."""

    def __init__(self, model_name: str):
        super().__init__()
        self.tokenizer = SoMaJo(model_name, split_sentences=False)

    def tokenize(self, text: str) -> List[str]:
        """Split ``text`` into token strings that include their trailing whitespace.

        A token is followed by one space unless its ``extra_info`` contains
        ``"SpaceAfter=No"`` or it is the last token. On top of that, extra
        spaces are appended at random (each with probability 0.05) so the
        model learns to deal with runs of spaces.

        Returns:
            The non-empty token strings; an empty list when the tokenizer
            yields nothing for ``text``.
        """
        # Only the first chunk is used. The default keeps StopIteration from
        # escaping when the tokenizer yields nothing.
        tokens = next(self.tokenizer.tokenize_text([text]), [])
        last_index = len(tokens) - 1

        out_tokens = []
        for i, token in enumerate(tokens):
            if "SpaceAfter=No" in token.extra_info or i == last_index:
                whitespace = ""
            else:
                whitespace = " "

            # sometimes sample more spaces than one space so the model learns to deal with it
            while random.random() < 0.05:
                whitespace += " "

            out_tokens.append(token.text + whitespace)

        return [piece for piece in out_tokens if piece]
class GeRouge:
    """
    Computes ROUGE scores on German texts.

    Args:
        alpha: Weighting factor of Recall and Precision. Between 0 and 1.
        stemming: Boolean. Defines whether stemming is used or not.
        split_compounds: Boolean. Defines whether compound words are split or not.
        minimal_mode: Boolean. Skip time consuming steps for quick calculation:
            the smart stop list and the lemma table are neither loaded nor used.
    """

    def __init__(self, alpha, stemming=True, split_compounds=True, minimal_mode=False):
        self.tokenizer = SoMaJo('de_CMC')
        self.sentence_splitter = SentenceSplitter(is_tuple=False)
        self.alpha = alpha
        self.stemming = stemming
        self.split_compounds = split_compounds
        self.stemmer = SnowballStemmer('german')
        self.minimal_mode = minimal_mode
        self.base_path = pathlib.Path(__file__).parent.absolute()

        self.remove_chars = ['²', '³', '“', '„', ',', '†', '‚', '‘', '–']
        self.remove_chars.extend(list(string.punctuation))
        self.replace_chars = [('ss', 'ß'), ('ä', 'ae'), ('ü', 'ue'), ('ö', 'oe')]

        # NOTE(review): these stop words are stored as read, while tokens are
        # compared after normalization; entries containing umlauts or "ss"
        # may therefore never match. Confirm whether that is intended.
        self.stop = set()
        # The file name states the encoding, so do not rely on the platform default.
        with open(os.path.join(self.base_path, 'data', 'GermanST_utf8.txt'), 'r', encoding='utf-8') as f:
            for line in f:
                self.stop.add(line.strip())

        if not minimal_mode:
            # Store the normalized form: transform_token compares normalized
            # tokens against this set, so raw entries would never match.
            with open(os.path.join(self.base_path, 'data', 'smart_stop.txt'), 'r') as f:
                self.smart_stop = self._normalized_words(f)

            self.lemmas = {}
            with open(os.path.join(self.base_path, 'data', 'baseforms_by_projekt_deutscher_wortschatz.txt'), 'r') as f:
                for line in f:
                    fields = line.strip().split('\t')
                    form = self._normalize(fields[0].strip().lower())
                    base = self._normalize(fields[1].strip().lower())
                    self.lemmas[form] = base

    def _normalize(self, text):
        """Apply the ``replace_chars`` substitutions to ``text``, in order."""
        for old, new in self.replace_chars:
            text = text.replace(old, new)
        return text

    def _normalized_words(self, lines):
        """Return the set of stripped, lower-cased and normalized entries of ``lines``."""
        return {self._normalize(line.strip().lower()) for line in lines}

    def tokenize_sents(self, text):
        """
        Tokenizes and transforms a text.

        :param text: Raw text.
        :return: Tuple of (list of sentences, each a list of non-empty transformed tokens;
                 number of tokens before transformation).
        """
        # SoMaJo now splits sentences simultaneously
        sents = list(self.tokenizer.tokenize_text([text]))
        length = sum(len(sent) for sent in sents)
        transformed_sents = [list(self.transform_sent(sent)) for sent in sents]
        transformed_sents = [[token for token in sent if token is not None and token != '']
                             for sent in transformed_sents]
        return transformed_sents, length

    @staticmethod
    def create_ngrams(transformed_sents, n=1):
        """Return the set of n-grams over all sentences that have at least ``n`` tokens."""
        ngrams_sents = [ngram_splitter(sent, n) for sent in transformed_sents if len(sent) >= n]
        return {token for sent in ngrams_sents for token in sent}

    def transform_sent(self, sent):
        """Yield the transformed tokens of one sentence; compound parts are yielded one by one."""
        for token in sent:
            token, splitted = self.transform_token(token.text)
            if splitted:
                for partial_token in token:
                    yield partial_token
            else:
                yield token

    def transform_token(self, token):
        """
        Normalizes a single token.

        :param token: Token text.
        :return: ``(list_of_tokens, True)`` when the token was split as a compound, otherwise
                 ``(token, False)``; the token is ``''`` for stop words and tokens containing a digit.
        """
        if not self.minimal_mode and token.lower().strip() in self.lemmas:
            token = self.lemmas[token.lower().strip()]

        # Only score split positions when compound splitting is enabled.
        compound_candidates = self.split_compound(token) if self.split_compounds else None
        if compound_candidates is not None and compound_candidates[0][0] > 0.5 and \
                compound_candidates[0][1] != token:
            return_tokens = []
            for part in compound_candidates[0][1:]:
                if len(part) > 0:
                    tokens, splitted = self.transform_token(part)
                    if splitted:
                        return_tokens.extend(tokens)
                    else:
                        return_tokens.append(tokens)
            return return_tokens, True

        token = token.lower().strip()
        for remove_char in self.remove_chars:
            token = token.replace(remove_char, '')
        token = self._normalize(token)

        if (token in self.stop or (not self.minimal_mode and token in self.smart_stop)
                or bool(re.search(r'\d', token))):
            token = ''
        elif self.stemming:
            if not self.minimal_mode and token in self.lemmas:
                token = self.lemmas[token]
            token = self.stemmer.stem(token)
        return token, False

    def rouge_n(self, reference, summary, ngrams=(1, 2)):
        """
        Computes Rouge-N scores based on n-grams.

        :param reference: Ground truth summary.
        :param summary: Generated prediction summary.
        :param ngrams: For which n-grams to calculate scores. Can be arbitrarily many.
        :return: List of (precision, recall, F1) tuples for each individual n-gram length.
        """
        reference_tokenized, _ = self.tokenize_sents(reference)
        summary_tokenized, _ = self.tokenize_sents(summary)
        return self.rouge_n_partial(reference_tokenized, summary_tokenized, ngrams)

    def rouge_l(self, reference, summary):
        """
        Calculates Rouge-L based on the longest common sub-sequence.

        :param reference: Ground truth summary.
        :param summary: Generated prediction summary.
        :return: Tuple with (precision, recall, F1) values for Rouge-L.
        """
        reference_tokenized, _ = self.tokenize_sents(reference)
        summary_tokenized, _ = self.tokenize_sents(summary)
        return self.computeL(summary_tokenized, reference_tokenized)

    def rouge_n_partial(self, reference_tokenized, summary_tokenized, ngrams):
        """
        Computes Rouge-N scores on already tokenized input.

        :param reference_tokenized: Reference as list of token lists.
        :param summary_tokenized: Summary as list of token lists.
        :param ngrams: N-gram lengths; lengths below 1 score (0, 0, 0).
        :return: List of (precision, recall, F1) tuples, one per entry of ``ngrams``.
        """
        rouge_n = []
        for n in ngrams:
            if n < 1:
                rouge_n.append((0, 0, 0))
                continue

            reference = self.create_ngrams(reference_tokenized, n=n)
            summary = self.create_ngrams(summary_tokenized, n=n)
            if len(reference) == 0 or len(summary) == 0:
                rouge_n.append((0, 0, 0))
                continue

            # Both sides are sets, so the number of equal pairs is the size
            # of their intersection.
            matches = len(reference & summary)
            rouge_p = matches / len(summary)
            rouge_r = matches / len(reference)
            denominator = (rouge_r * self.alpha) + (rouge_p * (1 - self.alpha))
            if denominator != 0:
                rouge_f1 = (rouge_p * rouge_r) / denominator
            else:
                rouge_f1 = 0.0
            rouge_n.append((rouge_p, rouge_r, rouge_f1))
        return rouge_n

    def computeL(self, sys, ref):
        """
        Computes the Rouge-L tuple from tokenized sentences.

        :param sys: System summary as list of token lists.
        :param ref: Reference as list of token lists.
        :return: Tuple with (precision, recall, F1).
        """
        unionLCS = set()
        ref_size = sum(len(l) for l in ref)
        sys_size = sum(len(l) for l in sys)

        for r in ref:
            for s in sys:
                seq1 = GeRouge.lcs(r, s)
                seq2 = GeRouge.lcs(s, r)
                # Keep the longer of the two directions (the second on ties).
                seq = seq1 if len(seq1) > len(seq2) else seq2
                unionLCS.update(seq)

        rouge_r = len(unionLCS) / ref_size if ref_size > 0 else 0
        rouge_p = len(unionLCS) / sys_size if sys_size > 0 else 0

        denominator = (rouge_r * self.alpha) + (rouge_p * (1 - self.alpha))
        if denominator != 0:
            rouge_f1 = (rouge_p * rouge_r) / denominator
        else:
            rouge_f1 = 0.0
        return rouge_p, rouge_r, rouge_f1

    @staticmethod
    def split_compound(word: str):
        """
        Code adapted from: https://github.com/dtuggener/CharSplit
        Return list of possible splits, best first

        :param word: Word to be split
        :return: List of all splits, each as [score, first part, second part]
        """
        word = word.lower()

        # If there is a hyphen in the word, split at the last hyphen
        if '-' in word:
            parts = word.split('-')
            return [[1., '-'.join(parts[:-1]).title(), parts[-1].title()]]

        fugen_s = ('ts', 'gs', 'ks', 'hls', 'ns')
        scores = []  # Score for each possible split position

        # Iterate through characters, start at fourth character, go to 3rd last
        for n in range(3, len(word) - 2):
            pre_slice = word[:n]

            # Cut off Fugen-S
            if pre_slice.endswith(fugen_s) and len(word[:n - 1]) > 2:
                pre_slice = word[:n - 1]

            # Start, in, and end probabilities
            pre_slice_prob = []
            in_slice_prob = []
            start_slice_prob = []

            # Extract all ngrams
            for k in range(len(word) + 1, 2, -1):
                # Probability of first compound, given by its ending prob
                if pre_slice_prob == [] and k <= len(pre_slice):
                    end_ngram = pre_slice[-k:]  # Look backwards
                    # Punish unlikely pre_slice end_ngram
                    pre_slice_prob.append(ngram_probs.suffix.get(end_ngram, -1))

                # Probability of ngram in word, if high, split unlikely
                in_ngram = word[n:n + k]
                # Favor ngrams not occurring within words
                in_slice_prob.append(ngram_probs.infix.get(in_ngram, 1))

                # Probability of word starting
                if start_slice_prob == []:
                    ngram = word[n:n + k]
                    # Cut Fugen-S
                    if ngram.endswith(fugen_s) and len(ngram[:-1]) > 2:
                        ngram = ngram[:-1]
                    start_slice_prob.append(ngram_probs.prefix.get(ngram, -1))

            if pre_slice_prob == [] or start_slice_prob == []:
                continue

            start_slice_prob = max(start_slice_prob)
            pre_slice_prob = max(pre_slice_prob)  # Highest, best preslice
            in_slice_prob = min(in_slice_prob)  # Lowest, punish splitting of good ingrams
            score = start_slice_prob - in_slice_prob + pre_slice_prob
            scores.append([score, word[:n].title(), word[n:].title()])

        if not scores:
            return [[0, word.title(), word.title()]]
        scores.sort(reverse=True)
        return scores

    @staticmethod
    def lcs(a, b):
        """
        Greedy common-subsequence scan, not an exact longest common subsequence.

        For every word of ``a``, ``b`` is scanned from the current start position to its end;
        every equal word is collected and the start position moves behind the last match.

        :param a: List of tokens.
        :param b: List of tokens.
        :return: List of the matched tokens, in match order.
        """
        lcsWords = []
        start = 0
        for word1 in a:
            for i in range(start, len(b)):
                word2 = b[i]
                if word1 == word2:
                    lcsWords.append(word2)
                    start = i + 1
        return lcsWords
cat = unicodedata.category(char) if cat.startswith("P"): return True return False # ============================================================================= # SoMaJo taken from https://github.com/tsproisl/SoMaJo # ============================================================================= if False: from tqdm import tqdm sen_out = [] tokenizer = SoMaJo("de_CMC", split_camel_case=True) for part in tqdm(raw_text): sentences = tokenizer.tokenize_text([part]) for sentence in sentences: word_list = [token.text for token in sentence] output = " ".join(word_list[:-1]) output += word_list[-1] sen_out.append(output) _is_punctuation(raw_text[-1][-1]) stripped = [] for index, part in tqdm(enumerate(sen_out)): reordered = "" for char in part: if not _is_punctuation(char): reordered += char else:
if __name__ == '__main__':
    # Command line: a single positional argument naming the gzipped input file.
    parser = argparse.ArgumentParser()
    parser.add_argument('filename')
    args = parser.parse_args()
    input_filename = args.filename

    # Each input line is a JSON record; the sentences of German records are
    # written one per line to "<input>-out.gz".
    with gzip.open(input_filename, 'r') as f, \
            gzip.open(input_filename + '-out.gz', 'wt') as output_file, \
            tqdm(total=2980314) as pbar:
        for line in f:
            pbar.update(1)
            line_dict = orjson.loads(line)
            content = line_dict['raw_content']
            language = line_dict['language']

            if language != 'de':
                # Non-German records are only echoed to stdout.
                print('###################')
                print(language)
                print(content)
                continue

            for s in tokenizer.tokenize_text([content], parallel=1):
                sentence_string = detokenize(s)
                output_file.write(sentence_string + '\n')
            # split documents?
            # output_file.write('\n')
def SentenceSplit(text):
    """Run SoMaJo's ``de_CMC`` tokenizer over ``text``.

    ``text`` is passed unchanged to ``tokenize_text`` as its paragraph
    argument, and whatever that call returns is handed back to the caller.
    """
    return SoMaJo("de_CMC").tokenize_text(text)