def __init__(self):
    self._word_tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)
    self._sentence_splitter = SentenceSplitter()
def __init__(self, language='en'):
    self.language = language
    if language == 'en':
        self.tokenizer = PunktSentenceTokenizer()
    elif language == 'de':
        self.tokenizer = SentenceSplitter(is_tuple=False)
    else:
        raise NotImplementedError
def main():
    args = arguments()
    n_tokens = 0
    t0 = time.perf_counter()
    is_xml = False
    if args.xml or args.tag is not None:
        is_xml = True
    tokenizer = Tokenizer(args.split_camel_case, args.token_classes,
                          args.extra_info, args.language)
    sentence_splitter = SentenceSplitter(args.token_classes or args.extra_info,
                                         args.language)
    if is_xml:
        # XML input is tokenized as a whole; parallel processing is not supported here.
        if args.parallel > 1:
            logging.warning("Parallel tokenization of XML files is currently not supported.")
        eos_tags = args.tag
        if eos_tags is None:
            eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
        eos_tags = set(eos_tags)
        tokenized_paragraphs = [tokenizer.tokenize_xml(args.FILE)]
        if args.split_sentences:
            tokenized_paragraphs = list(
                sentence_splitter.split_xml(tokenized_paragraphs[0], eos_tags))
    else:
        # Plain text input: split into paragraphs, optionally tokenize in parallel.
        if args.paragraph_separator == "empty_lines":
            paragraphs = utils.get_paragraphs(args.FILE)
        elif args.paragraph_separator == "single_newlines":
            paragraphs = (line for line in args.FILE if line.strip() != "")
        if args.parallel > 1:
            pool = multiprocessing.Pool(min(args.parallel, multiprocessing.cpu_count()))
            tokenized_paragraphs = pool.imap(tokenizer.tokenize, paragraphs, 250)
        else:
            tokenized_paragraphs = map(tokenizer.tokenize, paragraphs)
        tokenized_paragraphs = (tp for tp in tokenized_paragraphs if tp)
        if args.split_sentences:
            tokenized_paragraphs = map(sentence_splitter.split, tokenized_paragraphs)
            tokenized_paragraphs = (s for tp in tokenized_paragraphs for s in tp)
    if args.token_classes or args.extra_info:
        if is_xml:
            tokenized_paragraphs = ([(l[0],) if l[1] is None else l for l in tp]
                                    for tp in tokenized_paragraphs)
        tokenized_paragraphs = (["\t".join(t) for t in tp]
                                for tp in tokenized_paragraphs)
    for tp in tokenized_paragraphs:
        n_tokens += len(tp)
        print("\n".join(tp), "\n", sep="")
    t1 = time.perf_counter()
    logging.info("Tokenized %d tokens in %d seconds (%d tokens/s)" %
                 (n_tokens, t1 - t0, n_tokens / (t1 - t0)))
def SentenceSplit(text):
    tokenizer = Tokenizer(split_camel_case=False, token_classes=False, extra_info=False)
    tokens = tokenizer.tokenize(text)
    sentence_splitter = SentenceSplitter(is_tuple=False)
    sentences = sentence_splitter.split(tokens)
    return sentences
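A minimal usage sketch for the `SentenceSplit` helper above, assuming `Tokenizer` and `SentenceSplitter` are imported from somajo as in the surrounding examples; the German sample text is purely illustrative.

# Illustrative call of SentenceSplit; the sample sentence is made up for this sketch.
for sentence in SentenceSplit("Das ist ein Satz. Das ist noch einer."):
    # each sentence comes back as a list of token strings
    print(" ".join(sentence))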
def get_sents(texts):
    tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)
    sentence_splitter = SentenceSplitter(is_tuple=False)
    results = []
    for text in texts:
        # text = clean(text, lang='de', lower=False)
        tokens = tokenizer.tokenize_paragraph(text)
        sentences = sentence_splitter.split(tokens)
        cleaned = [clean(' '.join(s), no_urls=True, no_digits=True, no_punct=True,
                         no_line_breaks=True, lang='de')
                   for s in sentences]
        results.append(cleaned)
    return results
class SentenceTokenizer(object):

    def __init__(self, language='en'):
        self.language = language
        if language == 'en':
            self.tokenizer = PunktSentenceTokenizer()
        elif language == 'de':
            self.tokenizer = SentenceSplitter(is_tuple=False)
        else:
            raise NotImplementedError

    def tokenize(self, sentences):
        if self.language == 'en':
            return self.tokenizer.tokenize(sentences)
        else:
            return self.tokenizer.split(sentences)
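A hedged usage sketch for the `SentenceTokenizer` wrapper above. Note the asymmetry it hides: NLTK's `PunktSentenceTokenizer.tokenize` takes raw text, while somajo's `SentenceSplitter.split` expects an already tokenized list, so the German path needs pre-tokenized input. The sample inputs are illustrative.

# English: raw text in, list of sentence strings out.
st_en = SentenceTokenizer(language='en')
print(st_en.tokenize("This is a sentence. Here is another one."))

# German: a flat token list in, a list of token lists (one per sentence) out.
st_de = SentenceTokenizer(language='de')
print(st_de.tokenize(["Das", "ist", "ein", "Satz", ".", "Hier", "noch", "einer", "."]))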
def main():
    args = arguments()
    n_tokens = 0
    t0 = time.perf_counter()
    tokenizer = Tokenizer(args.split_camel_case, args.token_classes, args.extra_info)
    sentence_splitter = SentenceSplitter(args.token_classes or args.extra_info)
    if args.paragraph_separator == "empty_lines":
        paragraphs = get_paragraphs(args.FILE)
    elif args.paragraph_separator == "single_newlines":
        paragraphs = (line for line in args.FILE if line.strip() != "")
    if args.parallel > 1:
        pool = multiprocessing.Pool(min(args.parallel, multiprocessing.cpu_count()))
        tokenized_paragraphs = pool.imap(tokenizer.tokenize, paragraphs, 250)
    else:
        tokenized_paragraphs = map(tokenizer.tokenize, paragraphs)
    tokenized_paragraphs = (tp for tp in tokenized_paragraphs if tp)
    if args.split_sentences:
        tokenized_paragraphs = map(sentence_splitter.split, tokenized_paragraphs)
        tokenized_paragraphs = (s for tp in tokenized_paragraphs for s in tp)
    if args.token_classes or args.extra_info:
        tokenized_paragraphs = (["\t".join(t) for t in tp]
                                for tp in tokenized_paragraphs)
    for tp in tokenized_paragraphs:
        n_tokens += len(tp)
        print("\n".join(tp), "\n", sep="")
    t1 = time.perf_counter()
    logging.info("Tokenized %d tokens in %d seconds (%d tokens/s)" %
                 (n_tokens, t1 - t0, n_tokens / (t1 - t0)))
class NERTokenizer:

    def __init__(self):
        self._word_tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)
        self._sentence_splitter = SentenceSplitter()

    def parse_text(self, text):
        tokens = self._word_tokenizer.tokenize_paragraph(text)
        sentences_tokenized = self._sentence_splitter.split(tokens)
        sentences = []
        for sen in sentences_tokenized:
            # strip internal spaces from tokens and skip empty sentences
            sen = [tok.replace(" ", "") for tok in sen]
            if len(sen) == 0:
                continue
            sentences.append((sen, []))
        return sentences
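A short usage sketch for `NERTokenizer.parse_text`; the sample text is illustrative, and the second element of each returned tuple is the (initially empty) label list.

ner_tokenizer = NERTokenizer()
for tokens, labels in ner_tokenizer.parse_text("Angela Merkel besuchte Paris. Danach flog sie weiter."):
    # tokens: whitespace-free token strings of one sentence; labels: empty placeholder list
    print(tokens, labels)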
class TestSentenceSplitterPretokenized(unittest.TestCase):

    def setUp(self):
        """Necessary preparations"""
        self.sentence_splitter = SentenceSplitter(language="de_CMC")

    def _equal(self, tokens, tokenized_sentences):
        sentences = self.sentence_splitter.split(tokens.split())
        self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences])

    def _equal_xml(self, tokens, tokenized_sentences):
        eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
        eos_tags = set(eos_tags)
        sentences = self.sentence_splitter.split_xml(tokens.split(), eos_tags)
        self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences])
def splitSentTokenIdx(text):
    # generate tokens from text:
    tokens = tokenSplit(text)
    # sort to sentences:
    sentence_splitter = SentenceSplitter(is_tuple=False)
    sentences = sentence_splitter.split(tokens)
    # add start and end indexes of token in text:
    endIdxUpdate = 0
    sents_idxd = []
    for sent in sentences:
        tokens_idxd = []
        for token in sent:
            startIdx = text.find(token, endIdxUpdate)
            endIdx = startIdx + len(token)
            if startIdx != -1:
                endIdxUpdate = endIdx
            tokens_idxd.append((token, startIdx, endIdx))
        sents_idxd.append(tokens_idxd)
    return sents_idxd
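A hedged sketch of how `splitSentTokenIdx` might be exercised; `tokenSplit` is not defined above, so the stand-in below is an assumption that it simply wraps somajo's `Tokenizer.tokenize`.

# Hypothetical stand-in for the tokenSplit helper used above (not part of the original code).
def tokenSplit(text):
    return Tokenizer(split_camel_case=False, token_classes=False, extra_info=False).tokenize(text)

for sent in splitSentTokenIdx("Erster Satz. Zweiter Satz."):
    for token, start, end in sent:
        print(token, start, end)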
class TestSentenceSplitter(unittest.TestCase):

    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)
        self.sentence_splitter = SentenceSplitter()

    def _equal(self, raw, tokenized_sentences):
        tokens = self.tokenizer.tokenize(raw)
        sentences = self.sentence_splitter.split(tokens)
        sentences = [" ".join(s) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)

    def _equal_xml(self, raw, tokenized_sentences):
        eos_tags = "title h1 h2 h3 h4 h5 h6 p br div ol ul dl table".split()
        eos_tags = set(eos_tags)
        tokens = self.tokenizer.tokenize(raw)
        sentences = self.sentence_splitter.split_xml(tokens, eos_tags)
        sentences = [" ".join(s) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)
class TestSentenceSplitter(unittest.TestCase):

    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)
        self.sentence_splitter = SentenceSplitter()

    def _equal(self, raw, tokenized_sentences):
        tokens = self.tokenizer.tokenize(raw)
        sentences = self.sentence_splitter.split(tokens)
        sentences = [" ".join(s) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)
def __init__(self, alpha, stemming=True, split_compounds=True, minimal_mode=False):
    self.tokenizer = SoMaJo('de_CMC')
    self.sentence_splitter = SentenceSplitter(is_tuple=False)
    self.alpha = alpha
    self.stemming = stemming
    self.split_compounds = split_compounds
    self.stemmer = SnowballStemmer('german')
    self.minimal_mode = minimal_mode
    self.base_path = pathlib.Path(__file__).parent.absolute()
    self.remove_chars = ['²', '³', '“', '„', ',', '†', '‚', '‘', '–']
    self.remove_chars.extend(list(string.punctuation))
    self.replace_chars = [('ss', 'ß'), ('ä', 'ae'), ('ü', 'ue'), ('ö', 'oe')]
    # stopword list
    self.stop = set()
    with open(os.path.join(self.base_path, 'data', 'GermanST_utf8.txt'), 'r') as f:
        for line in f:
            self.stop.add(line.strip())
    if not minimal_mode:
        self.smart_stop = set()
        with open(os.path.join(self.base_path, 'data', 'smart_stop.txt'), 'r') as f:
            for line in f:
                word = line.strip().lower()
                self.smart_stop.add(word)
                for replace_char in self.replace_chars:
                    word = word.replace(replace_char[0], replace_char[1])
        # lemma lookup table: word form -> base form
        self.lemmas = {}
        with open(os.path.join(self.base_path, 'data',
                               'baseforms_by_projekt_deutscher_wortschatz.txt'), 'r') as f:
            for line in f:
                l = line.strip().split('\t')
                l[0] = l[0].strip().lower()
                l[1] = l[1].strip().lower()
                for replace_char in self.replace_chars:
                    l[0] = l[0].replace(replace_char[0], replace_char[1])
                    l[1] = l[1].replace(replace_char[0], replace_char[1])
                self.lemmas[l[0]] = l[1]
def myprocessor(myinput):
    tokenizer = Tokenizer(language="de")
    sentsplitter = SentenceSplitter(language="de")
    tokenized = tokenizer.tokenize_paragraph(myinput)
    sentsplit = sentsplitter.split(tokenized)
    return sentsplit
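A minimal usage sketch for `myprocessor`; the paragraph below is an illustrative sample, not part of the original code.

paragraph = "Heute scheint die Sonne. Morgen regnet es vielleicht."
for sentence in myprocessor(paragraph):
    # each sentence is a list of token strings
    print(" ".join(sentence))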
def read_clef(clef_file):
    with open(clef_file, 'r') as f:
        sentence_splitter = SentenceSplitter()
        docs = []
        segments = []
        text_part = []
        header = None
        urls = []

        def make_segement():
            nonlocal docs, segments, text_part
            if len(text_part) == 0:
                return
            tmp = None
            # noinspection PyBroadException
            try:
                tmp = pd.read_csv(StringIO(header + "".join(text_part)),
                                  sep='\t', comment='#', quoting=3)
            except:
                import ipdb
                ipdb.set_trace()
            tmp = tmp.reset_index().rename(columns={'index': 'TOKEN_ID'})
            tmp['url_id'] = len(docs)
            tmp['segment_id'] = len(segments)
            segments.append(tmp)
            text_part = []

        def make_doc():
            nonlocal docs, segments, sentence_splitter
            doc = pd.concat(segments)
            # renumber tokens per sentence as determined by the sentence splitter
            sentences = sentence_splitter.split(doc.TOKEN.astype(str).to_list())
            doc['TOKEN_ID'] = [i for s in sentences for i in range(len(s))]
            docs.append(doc)
            segments = []

        for line in tqdm(f):
            if header is None:
                header = "\t".join(line.split()) + '\n'
                continue
            if not line.startswith('#'):
                text_part.append(line)
            if re.match(r'#\s+segment_iiif_link\s+=.*', line):
                make_segement()
            if re.match(r'#\s+document_id\s+=.*', line):
                make_segement()
                urls.append(line)
                if len(segments) > 0:
                    make_doc()

        make_segement()
        make_doc()
        return urls, pd.concat(docs).reset_index(drop=True)
def setUp(self):
    """Necessary preparations"""
    self.sentence_splitter = SentenceSplitter(language="de_CMC")
def setUp(self):
    """Necessary preparations"""
    self.tokenizer = Tokenizer(split_camel_case=True)
    self.sentence_splitter = SentenceSplitter()