def test_sentencier_en_new_lines(self):
    """Newline characters act as sentence separators as well."""
    sentencizer = Sentencizer()
    text = ('It is a sunny day!!!! When Andy comes back,\n'
            'we are going to the zoo.')
    # '!!!!' and '\n' and '.' each end a sentence -> three chunks.
    result = sentencizer.craft(text, 0)
    self.assertEqual(len(result), 3)
def test_sentencier_en_float_numbers(self):
    """Dots inside float numbers, URLs, emails and abbreviations
    (like 'Mr.') are not taken into account as separators.
    """
    sentencizer = Sentencizer()
    text = ('With a 0.99 probability this sentence will be '
            'tokenized in 2 sentences.')
    # The '.' in '0.99' must not split; only real sentence ends count.
    result = sentencizer.craft(text, 0)
    self.assertEqual(len(result), 2)
def test_sentencier_en_trim_spaces(self):
    """Spaces at the beginning and end of each chunk are trimmed,
    extra spaces inside chunks are kept, and chunks containing only
    spaces are ignored.
    """
    sentencizer = Sentencizer()
    raw = b' This , text is... . Amazing !!'
    extracted = [chunk["text"] for chunk in sentencizer.craft(raw, 0)]
    self.assertListEqual(extracted, ["This , text is", "Amazing"])
def test_sentencier_en_trim_spaces(self):
    """Trimming all spaces at the beginning and end of the chunks.
    Keeping extra spaces inside chunks. Ignoring chunks with only spaces.
    """
    sentencizer = Sentencizer()
    text = ' This , text is... . Amazing !!'
    # Craft once and reuse the result; the original called craft() twice
    # (once for texts, once for locations), doubling the work.
    crafted = sentencizer.craft(text, 0)
    chunks = [c['text'] for c in crafted]
    locs = [c['location'] for c in crafted]
    self.assertListEqual(chunks, ["This , text is...", "Amazing"])
    # Locations index into the original (untrimmed) text.
    self.assertEqual(text[locs[0][0]:locs[0][1]], ' This , text is...')
    self.assertEqual(text[locs[1][0]:locs[1][1]], ' Amazing')

    def validate(req):
        # End-to-end check: chunks arriving through the Flow are trimmed.
        self.assertEqual(req.docs[0].chunks[0].text, 'This , text is...')
        self.assertEqual(req.docs[0].chunks[1].text, 'Amazing')

    f = Flow().add(yaml_path='!Sentencizer')
    with f:
        f.index_lines([' This , text is... . Amazing !!'],
                      output_fn=validate, callback_on_body=True)
def main(data_repeat=100, repeat=3):
    """Benchmark ``Sentencizer.craft`` on a repeated byte payload.

    :param data_repeat: Number of times do you want to repeat the byte array.
    :param repeat: Number of times to repeat the test to get a more exact
        average time.
    """
    # Setup: load the payload once and build a single crafter instance.
    raw_bytes = data_loader.get_bytes(repeat=data_repeat)
    sentencizer = Sentencizer()

    # Run the crafting `repeat` times, one call per timing sample.
    bench = timeit.Timer(lambda: sentencizer.craft(raw_bytes, 0))
    times = bench.repeat(repeat, 1)

    # Report payload size and mean wall-clock time per iteration.
    mb = len(raw_bytes) / 2**20
    time_mean = sum(times) / repeat
    print(f"{mb:.2f}MB read {repeat} times.")
    print(f"Average per iteration: {time_mean:.6f}.")
def test_sentencier_en(self):
    """English bytes input splits into one chunk per sentence."""
    sentencizer = Sentencizer()
    payload = b'It is a sunny day!!!! When Andy comes back, we are going to the zoo.'
    result = sentencizer.craft(payload, 0)
    self.assertEqual(len(result), 2)
def test_sentencier_cn(self):
    """Chinese UTF-8 bytes split on the full-width separators."""
    sentencizer = Sentencizer()
    payload = '今天是个大晴天!安迪回来以后,我们准备去动物园。'.encode('utf8')
    result = sentencizer.craft(payload, 0)
    self.assertEqual(len(result), 2)
def test_sentencier_cn(self):
    """Chinese str input splits on the full-width separators."""
    sentencizer = Sentencizer()
    text = '今天是个大晴天!安迪回来以后,我们准备去动物园。'
    result = sentencizer.craft(text, 0)
    self.assertEqual(len(result), 2)
def test_sentencier_cn(self):
    """Chinese text currently yields no chunks (documented limitation)."""
    sentencizer = Sentencizer()
    text = '今天是个大晴天!安迪回来以后,我们准备去动物园。'
    result = sentencizer.craft(text, 0)
    # Sentencizer does not work for Chinese because string.printable
    # does not contain Chinese characters, so nothing is extracted.
    self.assertEqual(len(result), 0)