# Requires `import re`; `build_sentences` and `MAX_TOKEN` are presumably
# defined in the surrounding tokenizer module.
def run_tokenization(options, filename, non_capitalized=None):
    with open(filename, "r", encoding="utf-8") as input_file:
        data = input_file.read()
    if options.skip_tokenization:
        # Input is already tokenized: blank lines separate sentences,
        # one token per line.
        sentences = [
            sentence.split('\n')
            for sentence in data.split('\n\n')
            if sentence.strip()
        ]
    elif options.skip_segmentation:
        # Input is already segmented: one sentence per line, tokenize only.
        sentences = [
            build_sentences(line, segment=False)
            for line in data.split('\n')
            if line.strip()
        ]
    else:
        if non_capitalized is None:
            # Heuristic: if sentence-final punctuation is followed by a
            # lowercase letter far more often than by an uppercase one,
            # assume sentence starts are not capitalized in this text.
            n_capitalized = len(re.findall(r'[\.!?] +[A-ZÅÄÖ]', data))
            n_non_capitalized = len(re.findall(r'[\.!?] +[a-zåäö]', data))
            non_capitalized = n_non_capitalized > 5 * n_capitalized
        sentences = build_sentences(data, non_capitalized=non_capitalized)
    # Drop over-long tokens, then drop sentences that became empty.
    sentences = list(
        filter(bool,
               [[token for token in sentence if len(token) <= MAX_TOKEN]
                for sentence in sentences]))
    return sentences
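# --- Usage sketch (not in the original): assumes run_tokenization above is
# in scope and that `options` only needs the two boolean flags it checks.
import argparse
import tempfile

options = argparse.Namespace(skip_tokenization=False, skip_segmentation=False)
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False,
                                 encoding="utf-8") as f:
    f.write("Jag skriver text. Och mer text.\n")
    path = f.name
# Expected, given the tests below:
# [['Jag', 'skriver', 'text', '.'], ['Och', 'mer', 'text', '.']]
print(run_tokenization(options, path))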
def test_abbreviations(self):
    test = "Jag skickar räkningen p.g.a. ditt inköp"
    expected = [["Jag", "skickar", "räkningen", "p.g.a.", "ditt", "inköp"]]
    self.assertEqual(list(tokenizer.build_sentences(test)), expected)
    test = "Vi har bl a svamp"
    expected = [["Vi", "har", "bl.a.", "svamp"]]
    self.assertEqual(list(tokenizer.build_sentences(test)), expected)
def run_tokenization(options, filename):
    with open(filename, "r", encoding="utf-8") as input_file:
        data = input_file.read()
    if options.skip_tokenization:
        sentences = [
            sentence.split('\n')
            for sentence in data.split('\n\n')
            if sentence.strip()]
    else:
        sentences = build_sentences(data)
    return sentences
def run_tokenization(options, filename):
    with open(filename, "r", encoding="utf-8") as input_file:
        data = input_file.read()
    if options.skip_tokenization:
        sentences = [
            sentence.split('\n')
            for sentence in data.split('\n\n')
            if sentence.strip()
        ]
    else:
        sentences = build_sentences(data)
    sentences = list(
        filter(bool,
               [[token for token in sentence if len(token) <= MAX_TOKEN]
                for sentence in sentences]))
    return sentences
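# Illustration (not in the original) of the filtering step this version adds:
# tokens longer than MAX_TOKEN are dropped, and filter(bool, ...) then removes
# sentences left empty. The MAX_TOKEN value here is made up for the demo.
MAX_TOKEN = 4
sentences = [["hej", "hopp"], ["abcdefgh"]]
filtered = list(
    filter(bool,
           [[token for token in sentence if len(token) <= MAX_TOKEN]
            for sentence in sentences]))
assert filtered == [["hej", "hopp"]]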
def test_single_word(self):
    test = "hej"
    expected = [["hej"]]
    self.assertEqual(list(tokenizer.build_sentences(test)), expected)
def test_numeric(self):
    test = "Temperatur: 21.0 grader"
    expected = [["Temperatur", ":", "21.0", "grader"]]
    self.assertEqual(list(tokenizer.build_sentences(test)), expected)
def test_empty_string(self):
    test = ""
    expected = []
    self.assertEqual(list(tokenizer.build_sentences(test)), expected)
def test_smileys(self):
    test = "Jag säger :) och :( samtidigt"
    expected = [["Jag", "säger", ":)", "och", ":(", "samtidigt"]]
    self.assertEqual(list(tokenizer.build_sentences(test)), expected)
def test_paragraphs_without_punctuation(self):
    # A blank line separates the two paragraphs; without it there is no
    # punctuation to segment on.
    test = """Första meningen

Andra meningen"""
    expected = [["Första", "meningen"], ["Andra", "meningen"]]
    self.assertEqual(list(tokenizer.build_sentences(test)), expected)
def test_two_sentences(self):
    test = "Jag skriver text. Och mer text."
    expected = [["Jag", "skriver", "text", "."], ["Och", "mer", "text", "."]]
    self.assertEqual(list(tokenizer.build_sentences(test)), expected)
def test_sentence(self):
    test = "hej hopp."
    expected = [["hej", "hopp", "."]]
    self.assertEqual(list(tokenizer.build_sentences(test)), expected)
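# The test methods above presumably live in a unittest.TestCase; a minimal
# harness sketch (the class name and `tokenizer` import are assumptions
# inferred from the calls in the tests):
import unittest
import tokenizer

class TestTokenizer(unittest.TestCase):
    def test_single_word(self):
        self.assertEqual(list(tokenizer.build_sentences("hej")), [["hej"]])

if __name__ == "__main__":
    unittest.main()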