from somajo import Tokenizer


def process_text_line(line):
    tokenizer = Tokenizer()
    tokens = tokenizer.tokenize(line)
    # sentence_splitter = SentenceSplitter()
    # sentences = sentence_splitter.split(tokens)
    # Sentence splitting is disabled, so treat the whole line as a single "sentence".
    sentences = [tokens]
    result = []
    for s in sentences:
        if PROCESS_DISCUSSION:
            s = remove_discussion_suffix(s)
        if len(s) >= 4:
            sentence_string = " ".join(s)
            if PROCESS_DISCUSSION:
                # check if this line still contains a dirty comment:
                if "( CEST )" not in sentence_string and "( CET )" not in sentence_string:
                    result.append(sentence_string)
            else:
                result.append(sentence_string)
    return result
import unittest

from somajo import Tokenizer


class TestTokenizer(unittest.TestCase):
    """Test cases for word tokenization."""

    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)

    def _equal(self, raw, tokenized):
        """Assert that tokenizing raw yields the whitespace-separated tokens in tokenized."""
        self.assertEqual(self.tokenizer.tokenize(raw), tokenized.split())

    def _equal_xml(self, raw, tokenized):
        """Assert that the XML-aware tokenization of raw yields the whitespace-separated tokens in tokenized."""
        self.assertEqual(self.tokenizer.tokenize_xml(raw, is_file=False), tokenized.split())

    def _fail_means_improvement(self, raw, tokenized):
        """Document a known shortcoming: if this assertion fails, the tokenizer has improved."""
        self.assertNotEqual(self.tokenizer.tokenize(raw), tokenized.split())
import unittest

from somajo import Tokenizer


class TestTokenizerExtra(unittest.TestCase):
    """Test cases for tokenization with extra_info=True."""

    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True, extra_info=True)

    def _equal(self, raw, tokenized):
        """Assert that the tokens (ignoring the extra info) match the whitespace-separated tokens in tokenized."""
        tokens, extra_info = zip(*self.tokenizer.tokenize(raw))
        self.assertEqual(list(tokens), tokenized.split())
from somajo import Tokenizer, SentenceSplitter


def SentenceSplit(text):
    """Tokenize text and split it into sentences (a list of token lists)."""
    tokenizer = Tokenizer(split_camel_case=False, token_classes=False, extra_info=False)
    tokens = tokenizer.tokenize(text)
    sentence_splitter = SentenceSplitter(is_tuple=False)
    sentences = sentence_splitter.split(tokens)
    return sentences
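# A minimal usage sketch for SentenceSplit (the sample text is made up); assuming the
# somajo 1.x API used above, split() returns one list of tokens per sentence:
sentences = SentenceSplit("Das ist ein Satz. Das ist noch ein Satz.")
for sentence in sentences:
    print(" ".join(sentence))
# Das ist ein Satz .
# Das ist noch ein Satz .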
from somajo import Tokenizer


def build_list(filename):
    """Read one gazetteer entry per line and return the set of tokenized entries."""
    tokenizer = Tokenizer(split_camel_case=False, token_classes=False, extra_info=False)
    gazetteers = set()
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            gazetteers.add(' '.join(tokenizer.tokenize(line.strip())))
    print('read {}'.format(filename))
    return gazetteers
import unittest

from somajo import Tokenizer, SentenceSplitter


class TestSentenceSplitter(unittest.TestCase):
    """Test cases for sentence splitting."""

    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)
        self.sentence_splitter = SentenceSplitter()

    def _equal(self, raw, tokenized_sentences):
        """Assert that raw is split into the given list of sentence strings."""
        tokens = self.tokenizer.tokenize(raw)
        sentences = self.sentence_splitter.split(tokens)
        sentences = [" ".join(s) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)

    def _equal_xml(self, raw, tokenized_sentences):
        """Assert that raw XML is split into the given list of sentence strings."""
        eos_tags = "title h1 h2 h3 h4 h5 h6 p br div ol ul dl table".split()
        eos_tags = set(eos_tags)
        # Use the XML-aware tokenizer so that split_xml sees the markup tokens.
        tokens = self.tokenizer.tokenize_xml(raw, is_file=False)
        sentences = self.sentence_splitter.split_xml(tokens, eos_tags)
        sentences = [" ".join(s) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)
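    # Hypothetical example of how the helpers above would be used inside the class
    # (the sentences and the expected tokenization are made up for illustration):
    def test_two_sentences(self):
        self._equal("Das ist ein Satz. Das ist noch ein Satz.",
                    ["Das ist ein Satz .", "Das ist noch ein Satz ."])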
from somajo import Tokenizer


class WordTokenizer(object):
    """Language-dependent word tokenizer: a Treebank-style tokenizer for English,
    SoMaJo for German."""

    def __init__(self, language='en'):
        self.language = language
        if language == 'en':
            self.tokenizer = TreebankTokenizer()
        elif language == 'de':
            self.tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)
        else:
            raise NotImplementedError

    def tokenize(self, sentence):
        return self.tokenizer.tokenize(sentence)
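# A short usage sketch for the German branch of the wrapper above (the sentence is made up;
# with token_classes=False and extra_info=False, tokenize returns plain strings):
de_tokenizer = WordTokenizer(language='de')
print(de_tokenizer.tokenize("Der Tokenizer funktioniert gut."))
# ['Der', 'Tokenizer', 'funktioniert', 'gut', '.']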
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from pydash import py_
from somajo import Tokenizer

# Authorize against the Google Sheets API and open the worksheet with the expressions.
scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']
credentials = ServiceAccountCredentials.from_json_keyfile_name('easy-deutsch.json', scope)
gc = gspread.authorize(credentials)
sheet = gc.open("Deutsch Wörter").worksheet('Expressions')

tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)

# Keep non-empty rows, drop empty cells, capitalize the German expression and tokenize it.
data = py_(sheet.get_all_values()).filter(lambda r: r[0]).map(
    lambda r: py_.compact(r)
).map(
    lambda r: [py_.capitalize(r[0], strict=False), *r[1:]]
).map(
    lambda r, i: dict(id=i, de=r[0], low=r[0].lower(),
                      tokens=tokenizer.tokenize(r[0].lower()), rest=r[1:])
).value()

# Build an inverted index that maps each token (longer than one character)
# to the ids of the expressions containing it.
token_index = {}
for tokens in py_.pluck(data, 'tokens'):
    for token in tokens:
        if len(token) <= 1:
            continue
        t = token.lower()
        if t not in token_index:
            token_index[t] = dict(
                key=t,
                ids=py_(data).filter(lambda d: t in d['tokens']).pluck('id').value()
            )
from somajo import Tokenizer


def tokenSplit(text):
    """Tokenize text and return the list of tokens."""
    tokenizer = Tokenizer(split_camel_case=False, token_classes=False, extra_info=False)
    tokens = tokenizer.tokenize(text)
    return tokens
import io
import json
from collections import Counter

from nltk.corpus import stopwords
from somajo import Tokenizer

tokenizer = Tokenizer(split_camel_case=False, token_classes=True)
count_all = Counter()
count_hashtags = Counter()

# German stop words: a project-specific list plus NLTK's built-in list.
twStop = set(io.open('resources/german_stopwords.txt', encoding='utf-8').read().splitlines())
stop = set(stopwords.words('german'))

with io.open("data/fluechtlinge.json", encoding='utf-8') as jsonFile:
    for line in jsonFile:
        tweet = json.loads(line)
        # Normalize umlauts so that variant spellings are counted together.
        text = tweet['text'].replace('ö', 'oe').replace('ä', 'ae').replace('ü', 'ue')
        # Count regular words (minus stop words) and hashtags separately.
        regular = [token.token for token in tokenizer.tokenize(text.lower())
                   if token.token_class == "regular" and token.token not in twStop]
        hashtag = [token.token for token in tokenizer.tokenize(tweet['text'].lower())
                   if token.token_class == "hashtag"]
        count_all.update(regular)
        count_hashtags.update(hashtag)
        tokens = tokenizer.tokenize(tweet['text'])
        print(text)
        for token in tokenizer.tokenize(text):
            print(token.token + " ist " + token.token_class)

print("Häufigste Worte: ")
for word in count_all.most_common(10):
    print(word)