def test_split_join_tagged_text_en(self):
    """Round-trip every English tagged-text fixture through split and join.

    For each file under ``<data dir>/tagged_texts/en`` the test:

    1. reads the file as UTF-8 and runs ``strip_tag_spaces`` on it;
    2. runs ``remove_tags`` to get the plain text and its tags;
    3. splits the text with ``split_text(text, "en", tags)``;
    4. checks that no sentence has leading or trailing whitespace;
    5. checks that ``insert_tags`` can be called on every sentence with
       its own tag list without raising;
    6. joins the sentences back and checks that text and tags are
       unchanged;
    7. re-inserts the tags and runs ``unstrip_tag_spaces``, checking that
       the result equals the content originally read from the file.
    """
    directory = os.path.join(get_data_dir(), "tagged_texts", "en")
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # subtest index stable between runs and machines.
    for i, fn in enumerate(sorted(os.listdir(directory))):
        # Include the file name so that a failing subtest identifies the
        # offending fixture directly.
        with self.subTest(i=i, fn=fn):
            # Context manager guarantees the handle is closed even when an
            # assertion below fails.
            with codecs.open(os.path.join(directory, fn), encoding="utf-8") as fixture:
                sanitized_text = fixture.read()

            tagged_text, tags_skeleton = strip_tag_spaces(sanitized_text)
            text, tags = remove_tags(tagged_text)
            sentences, skeleton, list_tags = split_text(text, "en", tags)

            for sentence in sentences:
                self.assertEqual(
                    sentence.strip(),
                    sentence,
                    "Error sentences have extra spaces on edges:\n%s\n%s\n"
                    % (sentence, sentence.strip()),
                )

            # All sentences have to be able to be added tags; the return
            # value is irrelevant here, only the absence of an exception.
            for sentence, stags in izip(sentences, list_tags):
                insert_tags(sentence, stags)

            new_text, new_tags = join_text(sentences, skeleton, list_tags)
            msg = "\noriginal:%s\nnew:%s\noriginal_tags:%s\tsentences:%s\nskeleton:-%s-\n%s" % (
                text, new_text, tags, sentences, skeleton, list_tags)
            self.assertEqual(text, new_text, msg)
            self.assertEqual(tags, new_tags, msg)

            new_tagged_text = insert_tags(new_text, new_tags)
            self.assertEqual(new_tagged_text, tagged_text)

            new_unsanitized_text = unstrip_tag_spaces(new_tagged_text, tags_skeleton)
            self.assertEqual(sanitized_text, new_unsanitized_text, sanitized_text)