Example #1
    def test_sentencier_en_new_lines(self):
        """
        New lines are also treated as separators.
        """
        sentencizer = Sentencizer()
        text = 'It is a sunny day!!!! When Andy comes back,\n' \
               'we are going to the zoo.'
        crafted_chunk_list = sentencizer.craft(text, 0)
        self.assertEqual(len(crafted_chunk_list), 3)
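For reference, a minimal regex sketch (not Sentencizer's actual implementation) that treats runs of sentence punctuation and newlines as separators reproduces the three chunks asserted above:

import re

text = 'It is a sunny day!!!! When Andy comes back,\nwe are going to the zoo.'
# Split on runs of sentence punctuation or newlines, then drop empty pieces.
chunks = [c.strip() for c in re.split(r'[.!?\n]+', text) if c.strip()]
print(chunks)  # ['It is a sunny day', 'When Andy comes back,', 'we are going to the zoo']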
Example #2
    def test_sentencier_en_float_numbers(self):
        """
        Separators in float numbers, URLs, emails, and abbreviations
        (like 'Mr.') are not taken into account.
        """
        sentencizer = Sentencizer()
        text = 'With a 0.99 probability this sentence will be ' \
               'tokenized in 2 sentences.'
        crafted_chunk_list = sentencizer.craft(text, 0)
        self.assertEqual(len(crafted_chunk_list), 2)
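Note that the assertion above expects two chunks: a plain punctuation split breaks at the decimal point in 0.99, which is exactly what the self-referential test text predicts. A sketch of such a naive split (not the library's real logic):

import re

text = 'With a 0.99 probability this sentence will be tokenized in 2 sentences.'
chunks = [c.strip() for c in re.split(r'[.!?]+', text) if c.strip()]
print(chunks)  # ['With a 0', '99 probability this sentence will be tokenized in 2 sentences']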
Example #3
    def test_sentencier_en_trim_spaces(self):
        """
        Trimming all spaces at the beginning and end of the chunks.
        Keeping extra spaces inside chunks.
        Ignoring chunks with only spaces.
        """
        sentencizer = Sentencizer()
        buffer = b'  This ,  text is...  . Amazing !!'
        chunks = [i["text"] for i in sentencizer.craft(buffer, 0)]
        self.assertListEqual(chunks, ["This ,  text is", "Amazing"])
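The three trimming rules can be sketched in a few lines, assuming the crafter decodes UTF-8 byte input before splitting (an assumption; the real implementation is not shown here):

import re

buffer = b'  This ,  text is...  . Amazing !!'
text = buffer.decode('utf-8')        # assumed decoding step for bytes input
pieces = re.split(r'[.!?]+', text)   # naive separator split, as above
chunks = [p.strip() for p in pieces] # trim spaces at both ends only
chunks = [c for c in chunks if c]    # drop pieces that were all spaces
print(chunks)  # ['This ,  text is', 'Amazing']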
Example #4
import timeit

def main(data_repeat=100, repeat=3):
    """
    :param data_repeat: Number of times to repeat the byte array.
    :param repeat: Number of times to repeat the test to get a more exact average time.
    """
    # Setup.
    raw_bytes = data_loader.get_bytes(repeat=data_repeat)
    sentencizer = Sentencizer()
    # Test execution.
    timer = timeit.Timer(lambda: sentencizer.craft(raw_bytes, 0))
    times = timer.repeat(repeat, 1)
    # Show results.
    mb = len(raw_bytes) / 2**20
    time_mean = sum(times) / repeat
    print(f"{mb:.2f}MB read {repeat} times.")
    print(f"Average per iteration: {time_mean:.6f}.")
Example #5
    def test_sentencier_en_trim_spaces(self):
        """
        Trimming all spaces at the beginning and end of the chunks.
        Keeping extra spaces inside chunks.
        Ignoring chunks with only spaces.
        """
        sentencizer = Sentencizer()
        text = '  This ,  text is...  . Amazing !!'
        crafted_chunk_list = sentencizer.craft(text, 0)
        chunks = [i['text'] for i in crafted_chunk_list]
        locs = [i['location'] for i in crafted_chunk_list]
        self.assertListEqual(chunks, ["This ,  text is...", "Amazing"])
        self.assertEqual(text[locs[0][0]:locs[0][1]], '  This ,  text is...')
        self.assertEqual(text[locs[1][0]:locs[1][1]], ' Amazing')

        def validate(req):
            self.assertEqual(req.docs[0].chunks[0].text, 'This ,  text is...')
            self.assertEqual(req.docs[0].chunks[1].text, 'Amazing')

        f = Flow().add(yaml_path='!Sentencizer')
        with f:
            f.index_lines(['  This ,  text is...  . Amazing !!'], output_fn=validate, callback_on_body=True)
Example #6
    def test_sentencier_en(self):
        sentencizer = Sentencizer()
        raw_bytes = b'It is a sunny day!!!! When Andy comes back, we are going to the zoo.'
        crafted_chunk_list = sentencizer.craft(raw_bytes, 0)
        self.assertEqual(len(crafted_chunk_list), 2)
Example #7
    def test_sentencier_cn(self):
        sentencizer = Sentencizer()
        raw_bytes = '今天是个大晴天!安迪回来以后,我们准备去动物园。'.encode('utf8')
        crafted_chunk_list = sentencizer.craft(raw_bytes, 0)
        self.assertEqual(len(crafted_chunk_list), 2)
Example #8
    def test_sentencier_cn(self):
        sentencizer = Sentencizer()
        text = '今天是个大晴天!安迪回来以后,我们准备去动物园。'
        crafted_chunk_list = sentencizer.craft(text, 0)
        self.assertEqual(len(crafted_chunk_list), 2)
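Splitting Chinese text requires treating full-width punctuation such as '!' and '。' as separators; a minimal sketch of that idea (not the library's actual rule set):

import re

text = '今天是个大晴天!安迪回来以后,我们准备去动物园。'
# Include the full-width exclamation mark and the ideographic full stop.
chunks = [c for c in re.split(r'[!?。.!?]+', text) if c]
print(chunks)  # ['今天是个大晴天', '安迪回来以后,我们准备去动物园']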
Example #9
    def test_sentencier_cn(self):
        sentencizer = Sentencizer()
        text = '今天是个大晴天!安迪回来以后,我们准备去动物园。'
        crafted_chunk_list = sentencizer.craft(text, 0)
        # Sentencizer does not work for Chinese because string.printable
        # does not contain Chinese characters.
        self.assertEqual(len(crafted_chunk_list), 0)
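The limitation noted in the comment is easy to verify: `string.printable` covers only ASCII characters, so any filter based on it discards CJK text entirely:

import string

print('今' in string.printable)  # False: CJK characters are not ASCII
print('a' in string.printable)   # True
print(string.printable)          # digits, ASCII letters, punctuation, whitespace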