def test_preserve_annotations_en(self):
    """Round-trip a long tagged English text through split/join.

    Verifies that sentence splitting strips no inner content, that
    joining reproduces the exact text and tags, and that the tags
    distributed over sentences account for every original tag.
    """
    tagged_text = u''''Hello Jaime,<br/><br/>You have provide the link to your petition.  <br/><span class="notranslate" origval="0">https://www.change.org/p/firm%C3%A1-para-que-cristina-fern%C3%A1ndez-de-kirchner-revoque-la-designaci%C3%B3n-de-la-hija-de-agust%C3%ADn-rossi-como-directora-en-el-banco-naci%C3%B3n?recruiter=49166865&utm_source=share_petition&utm_medium=email&utm_campaign=share_email_responsive</span><br/><br/>This link however looks very long. I suggest you cuztomize the link so that it will be easier to copy and share with your contacts via e-mail.<br/><br/>You can shorten the URL by customizing the headline. Here\'s how you can do it: <br/> <br/>- Login to your Change.org account<br/>- Select the petition in question, select edit<br/>- Scroll down to the option \xe2\x80\x9cCustomize your headline for sharing\xe2\x80\x9d.<br/> <br/>If want an even shorter link, you can use a third party site, such as:<span class="notranslate" origval="1"> http://tinyurl.com/ </span>or<span class="notranslate" origval="2"> http://goo.gl/.</span><br/> <br/>Please let us know if you need further help.<br/><br/>Best Wishes,<br/><br/>Maria<br/>Change.org Help Center\n'''
    text, tags = remove_tags(tagged_text)
    sentences, skeleton, list_tags = split_text(text, "en", tags)
    # No sentence may carry leading/trailing whitespace.
    for sent in sentences:
        self.assertEqual(sent.strip(), sent, "Error sentences have extra spaces on edges:\n%s\n%s\n" % (sent, sent.strip()))
    rebuilt_text, rebuilt_tags = join_text(sentences, skeleton, list_tags)
    self.assertEqual(text, rebuilt_text)
    self.assertEqual(tags, rebuilt_tags)
    # Per-sentence tag counts must sum to the original tag count.
    total_sentence_tags = sum(self._nr_tags(entry) for entry in list_tags)
    self.assertEqual(total_sentence_tags, self._nr_tags(tags))
 def test_preserve_annotations_small_en(self):
     tagged_text = u''''Hello Jaime,<br/><br/>You have provide the link to your petition. <br/><br/>Best Wishes,<br/><br/>Maria<br/>Change.org Help Center\n'''
     text, tags = remove_tags(tagged_text)
     sentences, skeleton, list_tags = split_text(text, "en", tags)
     for sentence in sentences:
                 self.assertEqual(sentence.strip(),sentence, "Error sentences have extra spaces on edges:\n%s\n%s\n" % (sentence, sentence.strip()))
     new_text, new_tags = join_text(sentences,skeleton,list_tags)
     self.assertEqual(text,new_text)
     self.assertEqual(tags, new_tags)
     nr_list_tags = sum([self._nr_tags(entry) for entry in list_tags])
     self.assertEqual(nr_list_tags,self._nr_tags(tags))
# Example #3
def word_count(text, lang=None):
    """Count the words in *text*, ignoring markup tags.

    URLs, emails, dates, phone numbers and money amounts are each
    collapsed into a single placeholder token so they count as one
    word.  For a known non-Asian language a plain word count is
    returned; otherwise every Asian character counts as one word in
    addition to the remaining non-Asian words.

    :param text: possibly-tagged input text.
    :param lang: language code, or ``None`` when the language is unknown.
    :return: integer word count.
    """
    # Strip markup so tags do not inflate the count.
    text_no_tags, _ = remove_tags(text)

    # Collapse urls, emails and other breakable elements into
    # single placeholder tokens.
    text_no_tags = replace_url(u'url', text_no_tags)
    text_no_tags = replace_email(u'email', text_no_tags)
    text_no_tags = replace_date(u'date', text_no_tags)
    text_no_tags = replace_phone(u'phone', text_no_tags)
    text_no_tags = replace_money(u'money', text_no_tags)

    if lang is not None and lang not in asian_languages:
        # Known non-Asian language: a plain word count suffices.
        return word_count_aux(text_no_tags)

    # Unknown or Asian language: every Asian character is one word;
    # the remaining characters are counted as regular words.
    asian_chars = sum(is_asian(x) for x in text_no_tags)
    non_asian_words = "".join(filter_jchars(c) for c in text_no_tags)
    words = word_count_aux(non_asian_words)
    return words + asian_chars
def inline_to_annotation(inline_text,
                         wrapper_funcs=None,
                         final_wrapper=None):
    """
    Converts inline text into plain text plus annotations.

    :param inline_text: text containing inline wrappers and markup tags.
    :param wrapper_funcs: callables that remove wrappers from the text,
        each returning ``(text, wrapper_annotations)``; defaults to
        ``[default_wrapper_func]``.
    :param final_wrapper: optional callable applied to the cleaned text
        and collected wrappers for a final adjustment pass.
    :return: ``(text, markup, wrappers)`` tuple.
    """
    # Avoid the mutable-default-argument pitfall: resolve the default
    # wrapper list inside the call instead of sharing one list object.
    if wrapper_funcs is None:
        wrapper_funcs = [default_wrapper_func]

    # Remove wrappers, accumulating their annotations.
    wrappers = []
    for wrapper_func in wrapper_funcs:
        inline_text, wrappers_aux = wrapper_func(inline_text)
        wrappers.extend(wrappers_aux)

    wrappers = clear_annotations_list(wrappers)

    # Remove markup tags
    text, markup = remove_tags(inline_text)

    if final_wrapper:
        wrappers = final_wrapper(text, wrappers)

    wrappers = clear_annotations_list(wrappers)

    return text, markup, wrappers
    def test_split_join_tagged_text_en(self):
        """Round-trip every tagged English sample file.

        For each file: strip tag spaces, remove tags, split into
        sentences, re-join, re-insert tags, and verify the original
        sanitized text is recovered exactly.
        """
        directory = get_data_dir()+"/tagged_texts/en"
        for i, fn in enumerate(os.listdir(directory)):

            with self.subTest(i=i):
                # Use a context manager so the file handle is closed
                # deterministically instead of leaked.
                with codecs.open("%s/%s" % (directory, fn), encoding="utf-8") as fh:
                    sanitized_text = fh.read()
                tagged_text, tags_skeleton = strip_tag_spaces(sanitized_text)
                text, tags = remove_tags(tagged_text)
                sentences, skeleton, list_tags = split_text(text, "en", tags)
                for sentence in sentences:
                    self.assertEqual(sentence.strip(),sentence, "Error sentences have extra spaces on edges:\n%s\n%s\n" % (sentence, sentence.strip()))
                # All sentences have to be able to be added tags.
                # `zip` (not the Python-2-only itertools.izip) keeps this
                # compatible with the Python 3 unittest.subTest used above.
                for sentence, stags in zip(sentences, list_tags):
                    insert_tags(sentence, stags)

                new_text, new_tags = join_text(sentences,skeleton,list_tags)
                msg = "\noriginal:%s\nnew:%s\noriginal_tags:%s\tsentences:%s\nskeleton:-%s-\n%s" % (text,new_text,tags,sentences, skeleton, list_tags)
                self.assertEqual(text,new_text,msg)
                self.assertEqual(tags, new_tags,msg)
                new_tagged_text = insert_tags(new_text,new_tags)
                self.assertEqual(new_tagged_text, tagged_text)
                new_unsanitized_text = unstrip_tag_spaces(new_tagged_text, tags_skeleton)
                self.assertEqual(sanitized_text, new_unsanitized_text, sanitized_text)
    def test_correct_tuples(self):
        """Verify tag projection from source to target via word alignments.

        Each case is ``[tagged_source, word_alignment, expected_tagged_target]``
        where the alignment is a list of ``[src_index, trg_index]`` pairs.
        The test removes tags, computes phrase tuples and target coverings,
        projects the tags onto the target text, and compares against the
        expected tagged target.
        """
        strs = [
            # No tags
            [u'a b',
             [[0, 0], [1, 1]],
             u'1 2'],
            [u'a b',
             [[0, 1], [1, 0]],
             u'2 1'],

            # Single Tag
            [u'a<a/> b',
             [[0, 0], [1, 1]],
             u'1<a/> 2'],
            [u'a<a/> b',
             [[0, 1], [1, 0]],
             u'2 1<a/>'],
            [u'a <a/>b',
             [[0, 0], [1, 1]],
             u'1 <a/>2'],
            [u'<a/>a b',
             [[0, 0], [1, 1]],
             u'<a/>1 2'],
            [u'<b/><a/>a b',
             [[0, 0], [1, 1]],
             u'<b/><a/>1 2'],
            [u'a b<a/>',
             [[0, 0], [1, 1]],
             u'1 2<a/>'],
            [u'a b<b/><a/>',
             [[0, 0], [1, 1]],
             u'1 2<b/><a/>'],
            [u'<a/><b/>a b, <c/>c <d/>d e <e/>f<f/>',
             [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6]],
             u'<a/><b/>1 2, <c/>3 <d/>4 5 <e/>6<f/>'],
            [u'<a/><b/>aaaaaaaa, <c/><d/>bbbbbbbb <e/>cccccccccc dddd eeeeee<f/><g/>',
             [[0, 0], [1, 1], [2, 2], [3, 3], [4, 7], [5, 5]],
             u'<a/><b/>obrigado, <c/><d/>maricela <e/>cccccccccc del Centro de ayuda<f/><g/>'],
            [u'<a/><b/>Peishan<c/>',
             [[0, 0]],
             u'<a/><b/>Peishan<c/>'],
            [u'<a/>3.',
             [[0, 0], [1, 1]],
             u'<a/>3.'],

            # Test words more than one char
            [u'<x>ccc dd</x>',
             [[0, 0], [1, 1]],
             u'<x>aa ffff</x>'],
            [u'a b <x>c d</x>.',
             [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]],
             u'1 2 <x>3 4</x>.'],
            [u'aaa bbbb <x>cccc dddd</x>.',
             [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]],
             u'11 222 <x>3333 444444</x>.'],

            # punct
            [u'<x>ccc dd.</x>',
             [[0, 0], [1, 1], [2, 2]],
             u'<x>aa ffff.</x>'],
            [u'<x>ccc dd</x>.',
             [[0, 0], [1, 1], [2, 2]],
             u'<x>aa ffff</x>.'],

            # Empty Tags
            [u'a<a></a> b',
             [[0, 0], [1, 1]],
             u'1<a></a> 2'],
            [u'a<a></a> b',
             [[0, 1], [1, 0]],
             u'2 1<a></a>'],
            ['Looking<b></b> at the screenshot that you have attached, can I ask if you are having this problem on the mobile website?',
             [[0, 0], [1, 0], [2, 1], [3, 2], [5, 3], [4, 4], [6, 5], [7, 6], [9, 7], [10, 8], [11, 9], [12, 10], [13, 10], [13, 11], [14, 10], [15, 12], [16, 13], [17, 14], [18, 14], [19, 14], [20, 15], [21, 16]],
             'Olhando<b></b> para o ecrashot que tem em anexo, posso pedir, se estiver a ter este problema no site movel?'],
            ['Looking <b></b>at the screenshot that you have attached, can I ask if you are having this problem on the mobile website?',
             [[0, 0], [1, 0], [2, 1], [3, 2], [5, 3], [4, 4], [6, 5], [7, 6], [9, 7], [10, 8], [11, 9], [12, 10], [13, 10], [13, 11], [14, 10], [15, 12], [16, 13], [17, 14], [18, 14], [19, 14], [20, 15], [21, 16]],
             'Olhando <b></b>para o ecrashot que tem em anexo, posso pedir, se estiver a ter este problema no site movel?'],

            # Two Tags
            [u'<a>a <i>b </i></a>',
             [[0, 0], [1, 1]],
             u'<a>1 <i>2 </i></a>'],
            [u'<a>a <i>b </i></a>',
             [[0, 1], [1, 0]],
             u'<a><i>2 </i>1 </a>'],
            [u'<a> a<i> b</i></a>',
             [[0, 1], [1, 0]],
             u'<a><i> 2</i> 1</a>'],

            [u'<a> a<s/><i> b</i></a>',
             [[0, 1], [1, 0]],
             u'<a><i> 2</i> 1<s/></a>'],

            # Nested tags
            ["<a>a<i> b<b> c</b> d</i> e</a>",
             [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]],
             "<a>a<i> b<b> c</b> d</i> e</a>"],
            ["<a>a<i> b<b> c</b> d</i> e</a>",
             [[0, 0], [1, 1], [2, 3], [3, 2], [4, 4]],
             "<a>a<i> b d<b> c</b></i> e</a>"],
            ["<a>a<i> b<b> c</b> d</i> e</a>",
             [[0, 0], [1, 1], [2, 2], [3, 4], [4, 3]],
             "<a>a<i> b<b> c</b> e d</i></a>"],
            ['<a/><b><c/><d><e><f/><g/><h/></e></d></b>Looking at the screenshot that you have attached, can I ask if you are having this problem on the mobile website?',
             [[0, 0], [1, 0], [2, 1], [3, 2], [5, 3], [4, 4], [6, 5], [7, 6], [9, 7], [10, 8], [11, 9], [12, 10], [13, 10], [13, 11], [14, 10], [15, 12], [16, 13], [17, 14], [18, 14], [19, 14], [20, 15], [21, 16]],
             '<a/><b><c/><d><e><f/><g/><h/></e></d></b>Olhando para o ecrashot que tem em anexo, posso pedir, se estiver a ter este problema no site movel?'],
            ['Looking<b><c/><a></a> at</b> the screenshot that you have attached, can I ask if you are having this problem on the mobile website?',
             [[0, 0], [1, 0], [2, 1], [3, 2], [5, 3], [4, 4], [6, 5], [7, 6], [9, 7], [10, 8], [11, 9], [12, 10], [13, 10], [13, 11], [14, 10], [15, 12], [16, 13], [17, 14], [18, 14], [19, 14], [20, 15], [21, 16]],
             '<b>Olhando<c/><a></a></b> para o ecrashot que tem em anexo, posso pedir, se estiver a ter este problema no site movel?'],
            ['Looking <b><c/><a></a></b>at the screenshot that you have attached, can I ask if you are having this problem on the mobile website?',
             [[0, 0], [1, 0], [2, 1], [3, 2], [5, 3], [4, 4], [6, 5], [7, 6], [9, 7], [10, 8], [11, 9], [12, 10], [13, 10], [13, 11], [14, 10], [15, 12], [16, 13], [17, 14], [18, 14], [19, 14], [20, 15], [21, 16]],
             'Olhando <b><c/><a></a></b>para o ecrashot que tem em anexo, posso pedir, se estiver a ter este problema no site movel?'],
            ['<a/><b/>All the best, <c/><d/>Anjali <e/><f>Change.org </f>Centro De Apoio<g/><h/>',
             [[0, 1], [1, 1], [2, 0], [2, 2], [2, 3], [3, 4], [4, 5], [6, 7], [7, 8], [8, 9], [5, 6]],
             '<a/><b/>Com os melhores cumprimentos, <c/><d/>Anjali <e/><f>Change.org </f>Centro De Apoio<g/><h/>'],

            ['<p>You can only link a <strong>personal</strong> Facebook profile to your Pinterest account. We don\'t support Facebook business pages.</p>',
             [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [6, 7], [7, 5], [8, 8], [9, 9], [10, 12], [11, 10], [12, 13]],
             '<p>Usted puede unico enlace un perfil de <strong>Facebook</strong> a su cuenta de Pinterest.</p>'],


             # Overwrite invalid result in html/xml
             ['<a>First link1</a> <b>Second link2</b>',
             [[0, 0], [1, 1], [2, 1], [3, 3]],
             '<a>First <b>link1 Second link2</b></a>'],
             ['<a>Cash out</a><b/><c> I cant receive my cash out via PayPal </c><d><e>alternatives in Turkey</e>',
             [[0, 0], [2, 2], [3, 1], [3, 2], [4, 3], [5, 4], [6, 5], [8, 6], [8, 7], [8, 8], [9, 9], [10, 10], [11, 11], [12, 12]],
             '<a>Cobrar<c> no puedo<b/> recibir mi pago a traves de PayPal </c></a><d><e>alternativas en Turquia</e>'],

            ]

        for i, (tagged_text, wa, truth) in enumerate(strs):
            with self.subTest(i=i):
                clean_text, tags = remove_tags(tagged_text)
                # Compute the clean target text once (the original code
                # called remove_tags(truth) twice into two variables).
                clean_trg_txt, _ = remove_tags(truth)

                src_tok = tokenize_string(clean_text)
                trg_tok = tokenize_string(clean_trg_txt)

                pp = phrase_tuples(tagged_text, src_tok, tags, clean_text)
                tp = compute_target_covering_words(pp, wa, trg_tok,
                                                   clean_trg_txt)

                t_tags = generate_target_dict(tags, tp)
                t_text = insert_tags(clean_trg_txt, t_tags)

                self.assertEqual(truth, t_text,
                                 "\nWanted:%s\nGot:%s\n" % (truth, t_text))
    def test_neste_examples(self):
        """Check tag alignment on nested-tag examples.

        Assumes tokenization on spaces and that source characters are
        preserved on the target side.  Each case is
        ``[tagged_source, word_alignment, expected_tagged_target]``.
        """

        examples = [
            ["<a><i>a b</i></a>",
             [[0, 0], [1, 1]],
             "<a><i>a b</i></a>"],
            ["<a><i>a b</i></a>",
             [[0, 1], [1, 0]],
             "<a><i>b a</i></a>"],
            ["<a>c<i>a b</i></a>",
             [[0, 0], [1, 2], [2, 1]],
             "<a>c<i>b a</i></a>"],
            ["<a>c<i>a b</i> d</a>",
             [[0, 0], [1, 2], [2, 1], [3, 3]],
             "<a>c<i>b a</i> d</a>"],
            ["<a>c<i>a b</i> e d</a>",
             [[0, 0], [1, 2], [2, 1], [3, 4], [4, 3]],
            "<a>c<i>b a</i> d e</a>"],
            ["<a>a<i> b<b> c</b> d</i> e</a>",
             [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]],
             "<a>a<i> b<b> c</b> d</i> e</a>"],
            ["<a>a<i> b</i></a>",
             [[0, 0], [1, 1]],
             "<a>a<i> b</i></a>"],


        ]

        for case_idx, (tagged_src, wrd_align, expected) in enumerate(examples):
            with self.subTest(i=case_idx):
                # Strip tags, tokenize on spaces, and reorder tokens
                # according to the word alignment.
                plain_src, src_tags = remove_tags(tagged_src)
                src_tokens = plain_src.split(" ")
                src_spans = split_with_indices(plain_src, " ")
                trg_spans = src_spans
                trg_tokens = get_target_tokens(src_tokens, wrd_align)

                # Project the source tags onto the reordered target tokens.
                trg_tags = align_tags(tagged_src,
                                      plain_src,
                                      src_tokens,
                                      src_spans,
                                      trg_spans,
                                      src_tags,
                                      trg_tokens,
                                      wrd_align=wrd_align)

                plain_trg = " ".join(trg_tokens)
                tagged_trg = insert_tags(plain_trg, trg_tags)

                self.assertEqual(tagged_trg, expected,
                                 "\n%s\nTruth:\n%s\nGot:\n%s" % (
                                     tagged_src, expected,
                                     tagged_trg))