def test_split_join_tagged_text_en(self):
        """Round-trip every English tagged-text fixture through split and join.

        For each file under ``<data dir>/tagged_texts/en`` the test:

        1. strips tag spaces and removes the tags,
        2. splits the plain text into sentences with language ``"en"``,
        3. checks that no sentence has leading or trailing whitespace,
        4. checks that tags can be inserted into every single sentence,
        5. joins the sentences again and checks that text, tags, tagged
           text and the original file content are all reproduced exactly.
        """
        directory = get_data_dir()+"/tagged_texts/en"
        for i, fn in enumerate(os.listdir(directory)):

            # os.listdir() order is arbitrary, so the index alone does not
            # identify a fixture; report the file name as well.
            with self.subTest(i=i, fn=fn):
                # Use a context manager so the fixture file is closed right
                # away instead of leaking one open handle per fixture.
                with codecs.open("%s/%s" % (directory, fn), encoding="utf-8") as fixture:
                    sanitized_text = fixture.read()
                tagged_text, tags_skeleton = strip_tag_spaces(sanitized_text)
                text, tags = remove_tags(tagged_text)
                sentences, skeleton, list_tags = split_text(text, "en", tags)
                for sentence in sentences:
                    self.assertEqual(sentence.strip(),sentence, "Error sentences have extra spaces on edges:\n%s\n%s\n" % (sentence, sentence.strip()))
                # All sentences have to be able to be added tags
                for sentence, stags in izip(sentences, list_tags):
                    insert_tags(sentence, stags)

                # Joining must give back exactly what was split.
                new_text, new_tags = join_text(sentences,skeleton,list_tags)
                msg = "\noriginal:%s\nnew:%s\noriginal_tags:%s\tsentences:%s\nskeleton:-%s-\n%s" % (text,new_text,tags,sentences, skeleton, list_tags)
                self.assertEqual(text,new_text,msg)
                self.assertEqual(tags, new_tags,msg)
                # Re-inserting tags and tag spaces must reproduce the file.
                new_tagged_text = insert_tags(new_text,new_tags)
                self.assertEqual(new_tagged_text, tagged_text)
                new_unsanitized_text = unstrip_tag_spaces(new_tagged_text, tags_skeleton)
                self.assertEqual(sanitized_text, new_unsanitized_text, sanitized_text)
    def test_correct_tuples(self):
        """Check tag placement on a target text for a table of examples.

        Every entry of ``strs`` is ``[tagged_source, word_alignment,
        expected_tagged_target]``.  For each entry the tags are removed from
        both source and expected target, both plain texts are tokenized, and
        the source tags are carried over to the target through
        ``phrase_tuples`` -> ``compute_target_covering_words`` ->
        ``generate_target_dict``.  Inserting the resulting tags into the plain
        target text must give back the expected tagged target.

        NOTE(review): the alignment entries look like ``[source_token_index,
        target_token_index]`` pairs -- confirm against
        ``compute_target_covering_words``.
        """

        strs = [
            # No tags
            [u'a b',
             [[0, 0], [1, 1]],
             u'1 2'],
            [u'a b',
             [[0, 1], [1, 0]],
             u'2 1'],

            # Single Tag
            [u'a<a/> b',
             [[0, 0], [1, 1]],
             u'1<a/> 2'],
            [u'a<a/> b',
             [[0, 1], [1, 0]],
             u'2 1<a/>'],
            [u'a <a/>b',
             [[0, 0], [1, 1]],
             u'1 <a/>2'],
            [u'<a/>a b',
             [[0, 0], [1, 1]],
             u'<a/>1 2'],
            [u'<b/><a/>a b',
             [[0, 0], [1, 1]],
             u'<b/><a/>1 2'],
            [u'a b<a/>',
             [[0, 0], [1, 1]],
             u'1 2<a/>'],
            [u'a b<b/><a/>',
             [[0, 0], [1, 1]],
             u'1 2<b/><a/>'],
            [u'<a/><b/>a b, <c/>c <d/>d e <e/>f<f/>',
             [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6]],
             u'<a/><b/>1 2, <c/>3 <d/>4 5 <e/>6<f/>'],
            [u'<a/><b/>aaaaaaaa, <c/><d/>bbbbbbbb <e/>cccccccccc dddd eeeeee<f/><g/>',
             [[0, 0], [1, 1], [2, 2], [3, 3], [4, 7], [5, 5]],
             u'<a/><b/>obrigado, <c/><d/>maricela <e/>cccccccccc del Centro de ayuda<f/><g/>'],
            [u'<a/><b/>Peishan<c/>',
             [[0, 0]],
             u'<a/><b/>Peishan<c/>'],
            [u'<a/>3.',
             [[0, 0], [1, 1]],
             u'<a/>3.'],

            # Test words more than one char
            [u'<x>ccc dd</x>',
             [[0, 0], [1, 1]],
             u'<x>aa ffff</x>'],
            [u'a b <x>c d</x>.',
             [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]],
             u'1 2 <x>3 4</x>.'],
            [u'aaa bbbb <x>cccc dddd</x>.',
             [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]],
             u'11 222 <x>3333 444444</x>.'],

            # punct
            [u'<x>ccc dd.</x>',
             [[0, 0], [1, 1], [2, 2]],
             u'<x>aa ffff.</x>'],
            [u'<x>ccc dd</x>.',
             [[0, 0], [1, 1], [2, 2]],
             u'<x>aa ffff</x>.'],

            # Empty Tags
            [u'a<a></a> b',
             [[0, 0], [1, 1]],
             u'1<a></a> 2'],
            [u'a<a></a> b',
             [[0, 1], [1, 0]],
             u'2 1<a></a>'],
            ['Looking<b></b> at the screenshot that you have attached, can I ask if you are having this problem on the mobile website?',
             [[0, 0], [1, 0], [2, 1], [3, 2], [5, 3], [4, 4], [6, 5], [7, 6], [9, 7], [10, 8], [11, 9], [12, 10], [13, 10], [13, 11], [14, 10], [15, 12], [16, 13], [17, 14], [18, 14], [19, 14], [20, 15], [21, 16]],
             'Olhando<b></b> para o ecrashot que tem em anexo, posso pedir, se estiver a ter este problema no site movel?'],
            ['Looking <b></b>at the screenshot that you have attached, can I ask if you are having this problem on the mobile website?',
             [[0, 0], [1, 0], [2, 1], [3, 2], [5, 3], [4, 4], [6, 5], [7, 6], [9, 7], [10, 8], [11, 9], [12, 10], [13, 10], [13, 11], [14, 10], [15, 12], [16, 13], [17, 14], [18, 14], [19, 14], [20, 15], [21, 16]],
             'Olhando <b></b>para o ecrashot que tem em anexo, posso pedir, se estiver a ter este problema no site movel?'],

            # Two Tags
            [u'<a>a <i>b </i></a>',
             [[0, 0], [1, 1]],
             u'<a>1 <i>2 </i></a>'],
            [u'<a>a <i>b </i></a>',
             [[0, 1], [1, 0]],
             u'<a><i>2 </i>1 </a>'],
            [u'<a> a<i> b</i></a>',
             [[0, 1], [1, 0]],
             u'<a><i> 2</i> 1</a>'],

            [u'<a> a<s/><i> b</i></a>',
             [[0, 1], [1, 0]],
             u'<a><i> 2</i> 1<s/></a>'],

            # Nested tags
            ["<a>a<i> b<b> c</b> d</i> e</a>",
             [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]],
             "<a>a<i> b<b> c</b> d</i> e</a>"],
            ["<a>a<i> b<b> c</b> d</i> e</a>",
             [[0, 0], [1, 1], [2, 3], [3, 2], [4, 4]],
             "<a>a<i> b d<b> c</b></i> e</a>"],
            ["<a>a<i> b<b> c</b> d</i> e</a>",
             [[0, 0], [1, 1], [2, 2], [3, 4], [4, 3]],
             "<a>a<i> b<b> c</b> e d</i></a>"],
            ['<a/><b><c/><d><e><f/><g/><h/></e></d></b>Looking at the screenshot that you have attached, can I ask if you are having this problem on the mobile website?',
             [[0, 0], [1, 0], [2, 1], [3, 2], [5, 3], [4, 4], [6, 5], [7, 6], [9, 7], [10, 8], [11, 9], [12, 10], [13, 10], [13, 11], [14, 10], [15, 12], [16, 13], [17, 14], [18, 14], [19, 14], [20, 15], [21, 16]],
             '<a/><b><c/><d><e><f/><g/><h/></e></d></b>Olhando para o ecrashot que tem em anexo, posso pedir, se estiver a ter este problema no site movel?'],
            ['Looking<b><c/><a></a> at</b> the screenshot that you have attached, can I ask if you are having this problem on the mobile website?',
             [[0, 0], [1, 0], [2, 1], [3, 2], [5, 3], [4, 4], [6, 5], [7, 6], [9, 7], [10, 8], [11, 9], [12, 10], [13, 10], [13, 11], [14, 10], [15, 12], [16, 13], [17, 14], [18, 14], [19, 14], [20, 15], [21, 16]],
             '<b>Olhando<c/><a></a></b> para o ecrashot que tem em anexo, posso pedir, se estiver a ter este problema no site movel?'],
            ['Looking <b><c/><a></a></b>at the screenshot that you have attached, can I ask if you are having this problem on the mobile website?',
             [[0, 0], [1, 0], [2, 1], [3, 2], [5, 3], [4, 4], [6, 5], [7, 6], [9, 7], [10, 8], [11, 9], [12, 10], [13, 10], [13, 11], [14, 10], [15, 12], [16, 13], [17, 14], [18, 14], [19, 14], [20, 15], [21, 16]],
             'Olhando <b><c/><a></a></b>para o ecrashot que tem em anexo, posso pedir, se estiver a ter este problema no site movel?'],
            ['<a/><b/>All the best, <c/><d/>Anjali <e/><f>Change.org </f>Centro De Apoio<g/><h/>',
             [[0, 1], [1, 1], [2, 0], [2, 2], [2, 3], [3, 4], [4, 5], [6, 7], [7, 8], [8, 9], [5, 6]],
             '<a/><b/>Com os melhores cumprimentos, <c/><d/>Anjali <e/><f>Change.org </f>Centro De Apoio<g/><h/>'],

            ['<p>You can only link a <strong>personal</strong> Facebook profile to your Pinterest account. We don\'t support Facebook business pages.</p>',
             [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [6, 7], [7, 5], [8, 8], [9, 9], [10, 12], [11, 10], [12, 13]],
             '<p>Usted puede unico enlace un perfil de <strong>Facebook</strong> a su cuenta de Pinterest.</p>'],


            # Overwrite invalid result in html/xml
            ['<a>First link1</a> <b>Second link2</b>',
             [[0, 0], [1, 1], [2, 1], [3, 3]],
             '<a>First <b>link1 Second link2</b></a>'],
            ['<a>Cash out</a><b/><c> I cant receive my cash out via PayPal </c><d><e>alternatives in Turkey</e>',
             [[0, 0], [2, 2], [3, 1], [3, 2], [4, 3], [5, 4], [6, 5], [8, 6], [8, 7], [8, 8], [9, 9], [10, 10], [11, 11], [12, 12]],
             '<a>Cobrar<c> no puedo<b/> recibir mi pago a traves de PayPal </c></a><d><e>alternativas en Turquia</e>'],

            ]

        for i, (tagged_text, wa, truth) in enumerate(strs):
            with self.subTest(i=i):
                # Plain source text plus its tags; only the plain text of the
                # expected target is needed (its tags are what we recompute).
                clean_text, tags = remove_tags(tagged_text)
                clean_trg_txt, _ = remove_tags(truth)

                src_tok = tokenize_string(clean_text)
                trg_tok = tokenize_string(clean_trg_txt)

                # The plain target text is computed once above and reused
                # here; the original stripped the tags from `truth` twice.
                pp = phrase_tuples(tagged_text, src_tok, tags, clean_text)
                tp = compute_target_covering_words(pp, wa, trg_tok,
                                                   clean_trg_txt)

                t_tags = generate_target_dict(tags, tp)
                t_text = insert_tags(clean_trg_txt, t_tags)

                self.assertEqual(truth, t_text,
                                 "\nWanted:%s\nGot:%s\n" % (truth, t_text))
    def test_neste_examples(self):
        """Align nested tags for simple, space-separated examples.

        Simplifying assumptions made by this test:
          * tokens can always be obtained by splitting on a single space;
          * the target consists of the same tokens as the source, so the
            source token spans are also passed as the target token spans.

        Each case is ``[tagged_source, word_alignment, expected_target]``.
        """

        cases = [
            ["<a><i>a b</i></a>",
             [[0, 0], [1, 1]],
             "<a><i>a b</i></a>"],
            ["<a><i>a b</i></a>",
             [[0, 1], [1, 0]],
             "<a><i>b a</i></a>"],
            ["<a>c<i>a b</i></a>",
             [[0, 0], [1, 2], [2, 1]],
             "<a>c<i>b a</i></a>"],
            ["<a>c<i>a b</i> d</a>",
             [[0, 0], [1, 2], [2, 1], [3, 3]],
             "<a>c<i>b a</i> d</a>"],
            ["<a>c<i>a b</i> e d</a>",
             [[0, 0], [1, 2], [2, 1], [3, 4], [4, 3]],
             "<a>c<i>b a</i> d e</a>"],
            ["<a>a<i> b<b> c</b> d</i> e</a>",
             [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]],
             "<a>a<i> b<b> c</b> d</i> e</a>"],
            ["<a>a<i> b</i></a>",
             [[0, 0], [1, 1]],
             "<a>a<i> b</i></a>"],
        ]

        # Further nested-tag shapes noted here but not part of the table:
        #   "<a><i>a</i></a>      --> <a><i>1</i></a>"
        #   "<a>a<i>b</i></a>     --> <a>a<i>b</i></a>"
        #   "<a>a<i>b</i>c</a>    --> <a>a<i>b</i>c</a>"

        for idx, case in enumerate(cases):
            source_tagged, alignment, expected = case
            with self.subTest(i=idx):
                plain, source_tags = remove_tags(source_tagged)
                source_words = plain.split(" ")
                # Source and target share the very same token spans.
                source_spans = split_with_indices(plain, " ")
                target_spans = source_spans
                target_words = get_target_tokens(source_words, alignment)

                target_tags = align_tags(
                    source_tagged, plain, source_words,
                    source_spans, target_spans,
                    source_tags, target_words,
                    wrd_align=alignment)

                # Rebuild the target sentence and put the aligned tags back.
                result = insert_tags(" ".join(target_words), target_tags)

                failure_msg = "\n%s\nTruth:\n%s\nGot:\n%s" % (
                    source_tagged, expected, result)
                self.assertEqual(result, expected, failure_msg)