Example #1
import re

def run_tokenization(options, filename, non_capitalized=None):
    # build_sentences and MAX_TOKEN are assumed to be defined elsewhere in
    # the tokenizer module.
    with open(filename, "r", encoding="utf-8") as input_file:
        data = input_file.read()

        if options.skip_tokenization:
            # Input is already tokenized: one token per line, blank lines
            # separate sentences.
            sentences = [
                sentence.split('\n') for sentence in data.split('\n\n')
                if sentence.strip()
            ]
        elif options.skip_segmentation:
            # Input is already segmented: one sentence per line.
            sentences = [
                build_sentences(line, segment=False)
                for line in data.split('\n') if line.strip()
            ]
        else:
            if non_capitalized is None:
                # Heuristic: if sentence-final punctuation is followed by a
                # lower-case letter far more often than by an upper-case one,
                # treat the text as non-capitalized.
                n_capitalized = len(re.findall(r'[\.!?] +[A-ZÅÄÖ]', data))
                n_non_capitalized = len(re.findall(r'[\.!?] +[a-zåäö]', data))
                non_capitalized = n_non_capitalized > 5 * n_capitalized
            sentences = build_sentences(data, non_capitalized=non_capitalized)

    # Drop over-long tokens and any sentences left empty as a result.
    sentences = list(
        filter(bool, [[token for token in sentence if len(token) <= MAX_TOKEN]
                      for sentence in sentences]))
    return sentences
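For context, a minimal driver sketch showing how run_tokenization might be invoked. Only the options attributes skip_tokenization and skip_segmentation appear in the source; the argparse flag spellings and the positional filename argument are assumptions for illustration.

# Hypothetical command-line wiring; only run_tokenization itself comes from
# the example above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--skip-tokenization", action="store_true")
parser.add_argument("--skip-segmentation", action="store_true")
parser.add_argument("filename")
options = parser.parse_args()

for sentence in run_tokenization(options, options.filename):
    print(" ".join(sentence))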
Example #2
    def test_abbreviations(self):
        test = "Jag skickar räkningen p.g.a. ditt inköp"
        expected = [["Jag", "skickar", "räkningen", "p.g.a.", "ditt", "inköp"]]
        self.assertEqual(list(tokenizer.build_sentences(test)), expected)

        test = "Vi har bl a svamp"
        expected = [["Vi", "har", "bl.a.", "svamp"]]
        self.assertEqual(list(tokenizer.build_sentences(test)), expected)
Example #3
def run_tokenization(options, filename):
    with open(filename, "r", encoding="utf-8") as input_file:
        data = input_file.read()

        if options.skip_tokenization:
            sentences = [
                sentence.split('\n')
                for sentence in data.split('\n\n')
                if sentence.strip()]
        else:
            sentences = build_sentences(data)

    return sentences
Example #4
def run_tokenization(options, filename):
    with open(filename, "r", encoding="utf-8") as input_file:
        data = input_file.read()

        if options.skip_tokenization:
            sentences = [
                sentence.split('\n')
                for sentence in data.split('\n\n')
                if sentence.strip()
            ]
        else:
            sentences = build_sentences(data)

    sentences = list(filter(bool,
        [[token for token in sentence if len(token) <= MAX_TOKEN]
            for sentence in sentences]))
    return sentences
Example #5
    def test_single_word(self):
        test = "hej"
        expected = [["hej"]]
        self.assertEqual(list(tokenizer.build_sentences(test)), expected)
Example #6
    def test_numeric(self):
        test = "Temperatur: 21.0 grader"
        expected = [["Temperatur", ":", "21.0", "grader"]]
        self.assertEqual(list(tokenizer.build_sentences(test)), expected)
Example #7
    def test_empty_string(self):
        test = ""
        expected = []
        self.assertEqual(list(tokenizer.build_sentences(test)), expected)
Example #8
    def test_smileys(self):
        test = "Jag säger :) och :( samtidigt"
        expected = [["Jag", "säger", ":)", "och", ":(", "samtidigt"]]
        self.assertEqual(list(tokenizer.build_sentences(test)), expected)
Example #9
    def test_paragraphs_without_punctuation(self):
        test = """Första meningen

        Andra meningen"""
        expected = [["Första", "meningen"], ["Andra", "meningen"]]
        self.assertEqual(list(tokenizer.build_sentences(test)), expected)
Example #10
    def test_two_sentences(self):
        test = "Jag skriver text. Och mer text."
        expected = [["Jag", "skriver", "text", "."],
                    ["Och", "mer", "text", "."]]
        self.assertEqual(list(tokenizer.build_sentences(test)), expected)
Example #11
    def test_sentence(self):
        test = "hej hopp."
        expected = [["hej", "hopp", "."]]
        self.assertEqual(list(tokenizer.build_sentences(test)), expected)
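The test methods in these examples call self.assertEqual and tokenizer.build_sentences, so they presumably sit inside a unittest.TestCase subclass. A minimal harness sketch, assuming the module under test is importable as tokenizer; the class name TestBuildSentences is illustrative.

import unittest

import tokenizer  # module under test; the import path is an assumption


class TestBuildSentences(unittest.TestCase):
    # The test_* methods from the examples above would live here.
    def test_sentence(self):
        test = "hej hopp."
        expected = [["hej", "hopp", "."]]
        self.assertEqual(list(tokenizer.build_sentences(test)), expected)


if __name__ == "__main__":
    unittest.main()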