def test_split_join_tagged_text_en(self):
    """Round-trip every English tagged-text fixture through split/join.

    For each file under ``tagged_texts/en``: strip tag spacing, remove
    tags, split into sentences, then join back and verify that the text,
    tags, tagged text, and the unsanitized (space-restored) form are all
    reproduced exactly.
    """
    directory = get_data_dir() + "/tagged_texts/en"
    for i, fn in enumerate(os.listdir(directory)):
        with self.subTest(i=i):
            # Use a context manager so the file handle is closed
            # deterministically (the original relied on GC finalization).
            with codecs.open("%s/%s" % (directory, fn), encoding="utf-8") as f:
                sanitized_text = f.read()
            tagged_text, tags_skeleton = strip_tag_spaces(sanitized_text)
            text, tags = remove_tags(tagged_text)
            sentences, skeleton, list_tags = split_text(text, "en", tags)
            # Split sentences must not carry leading/trailing whitespace.
            for sentence in sentences:
                self.assertEqual(
                    sentence.strip(), sentence,
                    "Error sentences have extra spaces on edges:\n%s\n%s\n"
                    % (sentence, sentence.strip()))
            # All sentences have to be able to be added tags
            for sentence, stags in izip(sentences, list_tags):
                insert_tags(sentence, stags)
            new_text, new_tags = join_text(sentences, skeleton, list_tags)
            msg = ("\noriginal:%s\nnew:%s\noriginal_tags:%s\tsentences:%s\n"
                   "skeleton:-%s-\n%s"
                   % (text, new_text, tags, sentences, skeleton, list_tags))
            self.assertEqual(text, new_text, msg)
            self.assertEqual(tags, new_tags, msg)
            new_tagged_text = insert_tags(new_text, new_tags)
            self.assertEqual(new_tagged_text, tagged_text)
            new_unsanitized_text = unstrip_tag_spaces(new_tagged_text,
                                                      tags_skeleton)
            self.assertEqual(sanitized_text, new_unsanitized_text,
                             sanitized_text)
def test_correct_tuples(self):
    """Check tag projection from source to target via word alignments.

    Each case is ``[tagged_source, word_alignment, expected_tagged_target]``;
    the test strips tags, tokenizes both sides, projects the source tags
    through the alignment, and verifies the re-tagged target matches.
    """
    strs = [
        # No tags
        [u'a b', [[0, 0], [1, 1]], u'1 2'],
        [u'a b', [[0, 1], [1, 0]], u'2 1'],
        # Single Tag
        [u'a<a/> b', [[0, 0], [1, 1]], u'1<a/> 2'],
        [u'a<a/> b', [[0, 1], [1, 0]], u'2 1<a/>'],
        [u'a <a/>b', [[0, 0], [1, 1]], u'1 <a/>2'],
        [u'<a/>a b', [[0, 0], [1, 1]], u'<a/>1 2'],
        [u'<b/><a/>a b', [[0, 0], [1, 1]], u'<b/><a/>1 2'],
        [u'a b<a/>', [[0, 0], [1, 1]], u'1 2<a/>'],
        [u'a b<b/><a/>', [[0, 0], [1, 1]], u'1 2<b/><a/>'],
        [u'<a/><b/>a b, <c/>c <d/>d e <e/>f<f/>',
         [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6]],
         u'<a/><b/>1 2, <c/>3 <d/>4 5 <e/>6<f/>'],
        [u'<a/><b/>aaaaaaaa, <c/><d/>bbbbbbbb <e/>cccccccccc dddd eeeeee<f/><g/>',
         [[0, 0], [1, 1], [2, 2], [3, 3], [4, 7], [5, 5]],
         u'<a/><b/>obrigado, <c/><d/>maricela <e/>cccccccccc del Centro de ayuda<f/><g/>'],
        [u'<a/><b/>Peishan<c/>', [[0, 0]], u'<a/><b/>Peishan<c/>'],
        [u'<a/>3.', [[0, 0], [1, 1]], u'<a/>3.'],
        # Test words more than one char
        [u'<x>ccc dd</x>', [[0, 0], [1, 1]], u'<x>aa ffff</x>'],
        [u'a b <x>c d</x>.',
         [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]],
         u'1 2 <x>3 4</x>.'],
        [u'aaa bbbb <x>cccc dddd</x>.',
         [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]],
         u'11 222 <x>3333 444444</x>.'],
        # punct
        [u'<x>ccc dd.</x>', [[0, 0], [1, 1], [2, 2]], u'<x>aa ffff.</x>'],
        [u'<x>ccc dd</x>.', [[0, 0], [1, 1], [2, 2]], u'<x>aa ffff</x>.'],
        # Empty Tags
        [u'a<a></a> b', [[0, 0], [1, 1]], u'1<a></a> 2'],
        [u'a<a></a> b', [[0, 1], [1, 0]], u'2 1<a></a>'],
        ['Looking<b></b> at the screenshot that you have attached, can I ask if you are having this problem on the mobile website?',
         [[0, 0], [1, 0], [2, 1], [3, 2], [5, 3], [4, 4], [6, 5], [7, 6],
          [9, 7], [10, 8], [11, 9], [12, 10], [13, 10], [13, 11], [14, 10],
          [15, 12], [16, 13], [17, 14], [18, 14], [19, 14], [20, 15], [21, 16]],
         'Olhando<b></b> para o ecrashot que tem em anexo, posso pedir, se estiver a ter este problema no site movel?'],
        ['Looking <b></b>at the screenshot that you have attached, can I ask if you are having this problem on the mobile website?',
         [[0, 0], [1, 0], [2, 1], [3, 2], [5, 3], [4, 4], [6, 5], [7, 6],
          [9, 7], [10, 8], [11, 9], [12, 10], [13, 10], [13, 11], [14, 10],
          [15, 12], [16, 13], [17, 14], [18, 14], [19, 14], [20, 15], [21, 16]],
         'Olhando <b></b>para o ecrashot que tem em anexo, posso pedir, se estiver a ter este problema no site movel?'],
        # Two Tags
        [u'<a>a <i>b </i></a>', [[0, 0], [1, 1]], u'<a>1 <i>2 </i></a>'],
        [u'<a>a <i>b </i></a>', [[0, 1], [1, 0]], u'<a><i>2 </i>1 </a>'],
        [u'<a> a<i> b</i></a>', [[0, 1], [1, 0]], u'<a><i> 2</i> 1</a>'],
        [u'<a> a<s/><i> b</i></a>', [[0, 1], [1, 0]], u'<a><i> 2</i> 1<s/></a>'],
        # Nested tags
        ["<a>a<i> b<b> c</b> d</i> e</a>",
         [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]],
         "<a>a<i> b<b> c</b> d</i> e</a>"],
        ["<a>a<i> b<b> c</b> d</i> e</a>",
         [[0, 0], [1, 1], [2, 3], [3, 2], [4, 4]],
         "<a>a<i> b d<b> c</b></i> e</a>"],
        ["<a>a<i> b<b> c</b> d</i> e</a>",
         [[0, 0], [1, 1], [2, 2], [3, 4], [4, 3]],
         "<a>a<i> b<b> c</b> e d</i></a>"],
        ['<a/><b><c/><d><e><f/><g/><h/></e></d></b>Looking at the screenshot that you have attached, can I ask if you are having this problem on the mobile website?',
         [[0, 0], [1, 0], [2, 1], [3, 2], [5, 3], [4, 4], [6, 5], [7, 6],
          [9, 7], [10, 8], [11, 9], [12, 10], [13, 10], [13, 11], [14, 10],
          [15, 12], [16, 13], [17, 14], [18, 14], [19, 14], [20, 15], [21, 16]],
         '<a/><b><c/><d><e><f/><g/><h/></e></d></b>Olhando para o ecrashot que tem em anexo, posso pedir, se estiver a ter este problema no site movel?'],
        ['Looking<b><c/><a></a> at</b> the screenshot that you have attached, can I ask if you are having this problem on the mobile website?',
         [[0, 0], [1, 0], [2, 1], [3, 2], [5, 3], [4, 4], [6, 5], [7, 6],
          [9, 7], [10, 8], [11, 9], [12, 10], [13, 10], [13, 11], [14, 10],
          [15, 12], [16, 13], [17, 14], [18, 14], [19, 14], [20, 15], [21, 16]],
         '<b>Olhando<c/><a></a></b> para o ecrashot que tem em anexo, posso pedir, se estiver a ter este problema no site movel?'],
        ['Looking <b><c/><a></a></b>at the screenshot that you have attached, can I ask if you are having this problem on the mobile website?',
         [[0, 0], [1, 0], [2, 1], [3, 2], [5, 3], [4, 4], [6, 5], [7, 6],
          [9, 7], [10, 8], [11, 9], [12, 10], [13, 10], [13, 11], [14, 10],
          [15, 12], [16, 13], [17, 14], [18, 14], [19, 14], [20, 15], [21, 16]],
         'Olhando <b><c/><a></a></b>para o ecrashot que tem em anexo, posso pedir, se estiver a ter este problema no site movel?'],
        ['<a/><b/>All the best, <c/><d/>Anjali <e/><f>Change.org </f>Centro De Apoio<g/><h/>',
         [[0, 1], [1, 1], [2, 0], [2, 2], [2, 3], [3, 4], [4, 5], [6, 7],
          [7, 8], [8, 9], [5, 6]],
         '<a/><b/>Com os melhores cumprimentos, <c/><d/>Anjali <e/><f>Change.org </f>Centro De Apoio<g/><h/>'],
        ['<p>You can only link a <strong>personal</strong> Facebook profile to your Pinterest account. We don\'t support Facebook business pages.</p>',
         [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [6, 7], [7, 5], [8, 8],
          [9, 9], [10, 12], [11, 10], [12, 13]],
         '<p>Usted puede unico enlace un perfil de <strong>Facebook</strong> a su cuenta de Pinterest.</p>'],
        # Overwrite invalid result in html/xml
        ['<a>First link1</a> <b>Second link2</b>',
         [[0, 0], [1, 1], [2, 1], [3, 3]],
         '<a>First <b>link1 Second link2</b></a>'],
        ['<a>Cash out</a><b/><c> I cant receive my cash out via PayPal </c><d><e>alternatives in Turkey</e>',
         [[0, 0], [2, 2], [3, 1], [3, 2], [4, 3], [5, 4], [6, 5], [8, 6],
          [8, 7], [8, 8], [9, 9], [10, 10], [11, 11], [12, 12]],
         '<a>Cobrar<c> no puedo<b/> recibir mi pago a traves de PayPal </c></a><d><e>alternativas en Turquia</e>'],
    ]
    for i, (tagged_text, wa, truth) in enumerate(strs):
        with self.subTest(i=i):
            clean_text, tags = remove_tags(tagged_text)
            # The original computed remove_tags(truth) twice under two
            # names (clean_trg_txt / trg_clean_text); do it once.
            clean_trg_txt, _ = remove_tags(truth)
            src_tok = tokenize_string(clean_text)
            trg_tok = tokenize_string(clean_trg_txt)
            pp = phrase_tuples(tagged_text, src_tok, tags, clean_text)
            tp = compute_target_covering_words(pp, wa, trg_tok, clean_trg_txt)
            t_tags = generate_target_dict(tags, tp)
            t_text = insert_tags(clean_trg_txt, t_tags)
            self.assertEqual(truth, t_text,
                             "\nWanted:%s\nGot:%s\n" % (truth, t_text))
def test_neste_examples(self):
    """Project tags through a word alignment on nested-tag examples.

    Assumptions: the text can always be tokenized on spaces, and the
    target side reuses the same characters (token spans) as the source.
    """
    cases = [
        ["<a><i>a b</i></a>", [[0, 0], [1, 1]], "<a><i>a b</i></a>"],
        ["<a><i>a b</i></a>", [[0, 1], [1, 0]], "<a><i>b a</i></a>"],
        ["<a>c<i>a b</i></a>", [[0, 0], [1, 2], [2, 1]],
         "<a>c<i>b a</i></a>"],
        ["<a>c<i>a b</i> d</a>", [[0, 0], [1, 2], [2, 1], [3, 3]],
         "<a>c<i>b a</i> d</a>"],
        ["<a>c<i>a b</i> e d</a>", [[0, 0], [1, 2], [2, 1], [3, 4], [4, 3]],
         "<a>c<i>b a</i> d e</a>"],
        ["<a>a<i> b<b> c</b> d</i> e</a>",
         [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]],
         "<a>a<i> b<b> c</b> d</i> e</a>"],
        ["<a>a<i> b</i></a>", [[0, 0], [1, 1]], "<a>a<i> b</i></a>"],
    ]
    # nested tags
    # "<a><i>a</i></a> --> <a><i>1</i></a>"
    # "<a>a<i>b</i></a> --> <a>a<i>b</i></a>"
    # "<a>a<i>b</i>c</a> --> <a>a<i>b</i>c</a>"
    for case_idx, (src_tagged, alignment, expected) in enumerate(cases):
        with self.subTest(i=case_idx):
            plain, src_tags = remove_tags(src_tagged)
            src_words = plain.split(" ")
            src_spans = split_with_indices(plain, " ")
            # Per the docstring assumption, target spans mirror the source.
            trg_spans = src_spans
            trg_words = get_target_tokens(src_words, alignment)
            trg_tags = align_tags(src_tagged, plain, src_words, src_spans,
                                  trg_spans, src_tags, trg_words,
                                  wrd_align=alignment)
            rebuilt = insert_tags(" ".join(trg_words), trg_tags)
            self.assertEqual(rebuilt, expected,
                             "\n%s\nTruth:\n%s\nGot:\n%s"
                             % (src_tagged, expected, rebuilt))