def split(list_of_text, thread_number, TMP_DIR):
    """Splits the text into sentences.

    Writes the sentences line by line with a leading space (for BPE).
    Every document is separated by an empty line.
    """
    print(os.path.join(TMP_DIR, "Splitted_{:05d}.txt".format(thread_number)))
    outF = open(
        os.path.join(TMP_DIR, "Splitted_{:05d}.txt".format(thread_number)), "w")
    tokenizer = SoMaJo("de_CMC", split_camel_case=True)
    for part in list_of_text:
        sentences = tokenizer.tokenize_text([part])
        for sentence in sentences:
            output = ""
            for token in sentence:
                if (token.space_after and not token.last_in_sentence
                        and not token.first_in_sentence):
                    output += token.text + ' '
                elif token.first_in_sentence:
                    output += ' ' + token.text + ' '
                else:
                    output += token.text
            outF.write(output)
            outF.write("\n")
        outF.write("\n")
    return thread_number
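A minimal usage sketch for the split() helper above, assuming os and SoMaJo are importable in the same module; the sample documents and the temporary directory are illustrative only.

# Hypothetical usage of split(); the sample documents are made up.
import os
import tempfile

from somajo import SoMaJo  # split() expects SoMaJo and os in module scope

docs = ["Das ist ein Satz. Hier kommt noch einer.", "Ein zweites Dokument."]
with tempfile.TemporaryDirectory() as tmp_dir:
    split(docs, thread_number=0, TMP_DIR=tmp_dir)
    with open(os.path.join(tmp_dir, "Splitted_00000.txt")) as f:
        print(f.read())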
def __init__(self, language, processes=None):
    from somajo import SoMaJo

    tokenizer_type = {"de": "de_CMC", "en": "en_PTB"}[language]
    self.tokenizer = SoMaJo(tokenizer_type,
                            split_camel_case=True,
                            split_sentences=True)
def tokenize(text):
    tokenizer = SoMaJo(language="de_CMC")
    for i in range(len(text)):
        # Pre-split on whitespace; each word is then passed to SoMaJo as
        # its own paragraph.
        text[i] = text[i].split()
        tok = tokenizer.tokenize_text(text[i])
        # Flatten all sentences into one list of token strings.
        tok_sent = []
        for sent in tok:
            for word in sent:
                tok_sent.append(word.text)
        # Replace the original string with its token list (in place).
        text[i] = tok_sent
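A small, hypothetical call of the in-place tokenize() above; the German sample sentences are made up.

# Hypothetical usage of tokenize(); the input list is modified in place.
from somajo import SoMaJo  # tokenize() expects SoMaJo in module scope

texts = ["Das ist ein Test.", "Noch ein kurzer Satz."]
tokenize(texts)
print(texts[0])  # e.g. ['Das', 'ist', 'ein', 'Test', '.']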
def tokenizer(self, text):
    tokenizer = SoMaJo("en_PTB")
    tokenized_object = tokenizer.tokenize_text([text])
    sentences = []
    types = []
    for sent in tokenized_object:
        sentence = []
        for token in sent:
            sentence.append(token.text)
            types.append(token.token_class)
        sentences.append(sentence)
    self.output['tokens'] = sentences
    self.output['types'] = types
    return sentences, types
def replace_hashtags_tokenizer(text):
    tokenizer = SoMaJo("de_CMC", split_camel_case=True)
    for i in range(len(text)):
        line = text[i].split()
        for j in range(len(line)):
            if line[j].startswith('#'):
                # Strip the '#' and tokenize the remaining hashtag text;
                # the tokens are only printed, not written back.
                hashtag = []
                line[j] = line[j].replace('#', "")
                hashtag.append(line[j])
                tok_hashtag = tokenizer.tokenize_text(hashtag)
                for tok in tok_hashtag:
                    for t in tok:
                        print(t.text)
        text[i] = " ".join(line)
    return text
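A quick illustrative call of replace_hashtags_tokenizer(); the sample tweet is made up. The function prints the hashtag tokens as a side effect and returns the text with '#' removed.

# Hypothetical usage of replace_hashtags_tokenizer().
from somajo import SoMaJo  # the function expects SoMaJo in module scope

tweets = ["Ich lerne gerade #MachineLearning mit Python."]
cleaned = replace_hashtags_tokenizer(tweets)
print(cleaned)  # ['Ich lerne gerade MachineLearning mit Python.']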
def main():
    args = arguments()
    n_tokens = 0
    n_sentences = 0
    t0 = time.perf_counter()
    is_xml = False
    if args.xml or args.tag is not None:
        is_xml = True
    tokenizer = SoMaJo(args.language,
                       split_camel_case=args.split_camel_case,
                       split_sentences=args.split_sentences,
                       xml_sentences=args.sentence_tag)
    if is_xml:
        eos_tags = args.tag
        if eos_tags is None:
            eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
        eos_tags = set(eos_tags)
        chunks = tokenizer.tokenize_xml_file(args.FILE,
                                             eos_tags,
                                             strip_tags=args.strip_tags,
                                             parallel=args.parallel)
    else:
        chunks = tokenizer.tokenize_text_file(args.FILE,
                                              args.paragraph_separator,
                                              parallel=args.parallel)
    for chunk in chunks:
        n_sentences += 1
        for token in chunk:
            output = token.text
            if not token.markup:
                n_tokens += 1
            if args.token_classes:
                output += "\t" + token.token_class
            if args.extra_info:
                output += "\t" + token.extra_info
            print(output)
        if args.split_sentences and args.sentence_tag is None:
            print()
    t1 = time.perf_counter()
    if args.split_sentences:
        logging.info(
            "Tokenized %d tokens (%d sentences) in %d seconds (%d tokens/s)" %
            (n_tokens, n_sentences, t1 - t0, n_tokens / (t1 - t0)))
    else:
        logging.info("Tokenized %d tokens in %d seconds (%d tokens/s)" %
                     (n_tokens, t1 - t0, n_tokens / (t1 - t0)))
def make(self, prerequisite_data):
    paragraphs = prerequisite_data['paragraph']
    tokenizer = SoMaJo("de_CMC", split_camel_case=True)
    sentences = tokenizer.tokenize_text(paragraphs)
    tokens = []
    sentence_alignment = []
    # tokenize_text returns a generator, so enumerate it instead of
    # calling len() on it.
    for i, s in enumerate(sentences):
        tokens += [token.text for token in s]
        sentence_alignment += [i] * len(s)
    return {
        'token-somajo': tokens,
        'sentence-somajo': sentence_alignment,
        'token': tokens,
        'sentence': sentence_alignment
    }
def __init__(self, alpha, stemming=True, split_compounds=True, minimal_mode=False):
    self.tokenizer = SoMaJo('de_CMC')
    self.sentence_splitter = SentenceSplitter(is_tuple=False)
    self.alpha = alpha
    self.stemming = stemming
    self.split_compounds = split_compounds
    self.stemmer = SnowballStemmer('german')
    self.minimal_mode = minimal_mode
    self.base_path = pathlib.Path(__file__).parent.absolute()
    self.remove_chars = ['²', '³', '“', '„', ',', '†', '‚', '‘', '–']
    self.remove_chars.extend(list(string.punctuation))
    self.replace_chars = [('ss', 'ß'), ('ä', 'ae'), ('ü', 'ue'), ('ö', 'oe')]
    self.stop = set()
    with open(os.path.join(self.base_path, 'data', 'GermanST_utf8.txt'), 'r') as f:
        for line in f:
            self.stop.add(line.strip())
    if not minimal_mode:
        self.smart_stop = set()
        with open(os.path.join(self.base_path, 'data', 'smart_stop.txt'), 'r') as f:
            for line in f:
                word = line.strip().lower()
                self.smart_stop.add(word)
                for replace_char in self.replace_chars:
                    word = word.replace(replace_char[0], replace_char[1])
        self.lemmas = {}
        with open(os.path.join(self.base_path, 'data',
                               'baseforms_by_projekt_deutscher_wortschatz.txt'),
                  'r') as f:
            for line in f:
                l = line.strip().split('\t')
                l[0] = l[0].strip().lower()
                l[1] = l[1].strip().lower()
                for replace_char in self.replace_chars:
                    l[0] = l[0].replace(replace_char[0], replace_char[1])
                    l[1] = l[1].replace(replace_char[0], replace_char[1])
                self.lemmas[l[0]] = l[1]
# (Fragment) Tail of a BERT-style _is_punctuation(char) helper; cp = ord(char).
            or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False


# =============================================================================
# SoMaJo taken from https://github.com/tsproisl/SoMaJo
# =============================================================================
if False:
    from tqdm import tqdm

    sen_out = []
    tokenizer = SoMaJo("de_CMC", split_camel_case=True)
    for part in tqdm(raw_text):
        sentences = tokenizer.tokenize_text([part])
        for sentence in sentences:
            word_list = [token.text for token in sentence]
            output = " ".join(word_list[:-1])
            output += word_list[-1]
            sen_out.append(output)
    _is_punctuation(raw_text[-1][-1])

    stripped = []
    for index, part in tqdm(enumerate(sen_out)):
        reordered = ""
        for char in part:
            if not _is_punctuation(char):
def setUp(self):
    """Necessary preparations."""
    self.tokenizer = SoMaJo("de_CMC",
                            split_camel_case=True,
                            split_sentences=True)
import gzip

import orjson
from somajo import SoMaJo
from tqdm import tqdm

import argparse

tokenizer = SoMaJo("de_CMC")


# see https://github.com/tsproisl/SoMaJo/issues/17
def detokenize(tokens):
    out = []
    for token in tokens:
        if token.original_spelling is not None:
            out.append(token.original_spelling)
        else:
            out.append(token.text)
        if token.space_after:
            out.append(" ")
    return "".join(out)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('filename')
    args = parser.parse_args()
    input_filename = args.filename
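A round-trip sketch for detokenize() above, using the module-level tokenizer; the sample sentence is made up and the reconstruction relies on each token's text/original_spelling and space_after.

# Hypothetical round trip: tokenize a paragraph, then rebuild each sentence's
# surface string from the token metadata.
for sentence in tokenizer.tokenize_text(["Das ist ein Test, oder?"]):
    print([t.text for t in sentence])
    print(detokenize(sentence))  # should closely reproduce the input sentence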
def SentenceSplit(text):
    # `text` must be an iterable of paragraph strings; tokenize_text returns
    # a generator of sentences, each a list of Token objects.
    tokenizer = SoMaJo("de_CMC")
    tokens = tokenizer.tokenize_text(text)
    return tokens
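A brief, hypothetical call of SentenceSplit(); the argument is a list of paragraph strings and the returned generator is consumed here to print one sentence per line.

# Hypothetical usage; SentenceSplit() returns a lazy generator of sentences.
from somajo import SoMaJo  # SentenceSplit() expects SoMaJo in module scope

for sentence in SentenceSplit(["Erster Satz. Zweiter Satz."]):
    print(" ".join(token.text for token in sentence))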
def __init__(self, model_name: str):
    super().__init__()
    self.tokenizer = SoMaJo(model_name, split_sentences=False)
def __init__(self, model_name: str):
    super().__init__()
    self.tokenizer = SoMaJo(model_name)
class TokenizedBaseDocument(BaseDocument):
    _tokenizers: ClassVar[Mapping[str, SoMaJo]] = {
        "en": SoMaJo("en_PTB", split_sentences=False),
        "de": SoMaJo("de_CMC", split_sentences=False),
    }
    _lang_callback: ClassVar[Callable[[Mapping[str, object]], str]] = lambda _: "en"
    _text_field_map: ClassVar[Mapping[str, object]]

    @classmethod
    def _make_text_field_map(
        cls,
        document_cls: _T_DocumentMeta,
    ) -> Mapping[str, object]:
        text_field_map: MutableMapping[str, object] = {}
        mapping = document_cls._doc_type.mapping
        for field_name in mapping:
            field = mapping[field_name]
            if isinstance(field, Text):
                text_field_map[field_name] = True
            elif isinstance(field, Object):
                inner_class = field._doc_class
                inner_text_field_map = cls._make_text_field_map(inner_class)
                if inner_text_field_map:
                    text_field_map[field_name] = inner_text_field_map
        return text_field_map

    @classmethod
    @overrides
    def prepare_doc_dict(cls, doc_dict: MutableMapping[str, object]) -> None:
        super().prepare_doc_dict(doc_dict)
        lang = cls._lang_callback(doc_dict)
        if lang not in cls._tokenizers.keys():
            _LOGGER.error(
                "No tokenizer available for language '{}'. Defaulting to '{}'. "
                "Available languages: {}",
                lang,
                "en",
                ", ".join(cls._tokenizers.keys()),
            )
            lang = "en"
        cls._tokenize_doc_dict(doc_dict, cls._text_field_map, lang)

    @classmethod
    def _tokenize_doc_dict(
        cls,
        doc_dict: MutableMapping[str, object],
        text_field_map: Mapping[str, object],
        lang: str,
    ) -> None:
        for field_name, text_field_or_childs in text_field_map.items():
            # text_field_or_childs is either True or a mapping
            value = doc_dict.get(field_name)
            if not value:
                continue
            elif text_field_or_childs is True:
                (
                    doc_dict[field_name],
                    doc_dict[field_name + "_orig"],
                    doc_dict[field_name + "_tokens"],
                ) = cls._tokenize(checked_cast(str, value), lang)
            elif isinstance(value, MutableMapping):
                cls._tokenize_doc_dict(
                    value, cast(Mapping[str, object], text_field_or_childs), lang
                )
            elif isinstance(value, Sequence):
                for v in value:
                    cls._tokenize_doc_dict(
                        v, cast(Mapping[str, object], text_field_or_childs), lang
                    )
            else:
                raise ValueError(
                    f"Value for Object-field needs to be either a Mapping or a "
                    f"Sequence. The value was: {value}"
                )

    @classmethod
    def _tokenize(cls, text_orig: str, lang: str) -> Tuple[str, str, Sequence[str]]:
        text = text_orig.strip()
        text = normalize("NFKC", text)
        if not text:
            return "", "", []
        try:
            text = str(html.fromstring(text).text_content())
        except LxmlError:
            _LOGGER.warning(
                "lxml HTML parsing failed. Skipping it for this document.",
                exc_info=True,
            )
        if not text:
            return "", "", []
        tokens = [
            token.text.lower()
            for token in next(cls._tokenizers[lang].tokenize_text([text]))
            if (token.token_class not in ["URL", "symbol"])
        ]
        return " ".join(tokens), text_orig, tokens