예제 #1
0
    def test_storify_embedded_video_links(self):
        """ lalibre parser handles an article mixing an embedded storify with embedded videos """
        with open(os.path.join(DATA_ROOT, "links_storify_video_links.html")) as html_file:
            article, _ = lalibre.extract_article_data(html_file)
            actual_links = article.links

            # One flat list, ordered: audio, sidebar box, bottom box, embedded media, in-text.
            expected_links = [
                # audio
                make_tagged_url("http://podcast.lalibre.be/articles/audio_llb_774524_1351533121.mp3", u"""Ecoutez Georges Dallemagne dans Les Flingueurs de l'info sur Twizz Radio""", {'audio', 'sidebar box', 'internal site', 'embedded'}),
                # sidebar box
                make_tagged_url("http://galeries.lalibre.be/album/actumonde/ouragansandy/15_21_01_171104928_624846-01-07.jpg/", u"""Les USA sur le pied de guerre avant le passage de Sandy""", {'sidebar box', 'image gallery', 'internal'}),
                make_tagged_url("#embed_pos1", u"""Retrouvez les photos et les vidéos de l'ouragan""", {'internal', 'sidebar box', 'anchor'}),
                make_tagged_url("#embed_pos2", u"""Vidéo: Sandy vu de l'espace""", {'internal', 'sidebar box', 'anchor'}),
                make_tagged_url("#embed_pos3", u"""Sandy menace 50 millions d'Américains""", {'internal', 'sidebar box', 'anchor'}),
                # bottom box
                make_tagged_url("/societe/planete/article/774682/comment-choisit-on-le-nom-des-tempetes.html", u"""Comment choisit-on le nom des tempêtes?""", {'bottom box', 'internal'}),
                make_tagged_url("/actu/international/article/774709/nouveau-numero-d-appel-d-urgence-pour-les-belges-aux-etats-unis.html", u"""Nouveau numéro d'appel d'urgence pour les Belges aux Etats-Unis""", {'bottom box', 'internal'}),
                make_tagged_url("http://galeries.lalibre.be/album/actumonde/ouragansandy/15_21_01_171104928_624846-01-07.jpg/", u"""Les USA sur le pied de guerre avant le passage de Sandy""", {'bottom box', 'internal', 'image gallery'}),
                # embedded media
                make_tagged_url("http://storify.com/pocket_pau/l-ouragan-sandy-menace-les-etats-unis", u"""View the story "L'ouragan Sandy menace les Etats-Unis" on Storify""", {'external', 'embedded', 'script'}),
                make_tagged_url("http://www.ustream.tv/embed/recorded/26471477?v=3&wmode=direct", u"""http://www.ustream.tv/embed/recorded/26471477?v=3&wmode=direct""", {'embedded', 'external', 'iframe'}),
                make_tagged_url("http://sa.kewego.com/swf/kp.swf?language_code=fr&width=510&height=383&playerKey=bf195c8ba4f5&configKey=&suffix=&sig=b5224f57c4cs&autostart=false", u"""Sandy menace 50 millions d'Américains""", {'kplayer', 'video', 'external', 'embedded'}),
                # in-text (plaintext)
                make_tagged_url("lalibre.be", u"lalibre.be", {'in text', 'plaintext'}),
            ]

            assert_taggedURLs_equals(expected_links, actual_links)
예제 #2
0
 def test_many_embedded_videos_and_links(self):
     """ The 7sur7.be parser extracts and tags a large number of embedded videos from a single article"""
     with open(os.path.join(DATA_ROOT, "many_embedded_videos_and_links.html")) as html_file:
         article, _ = septsursept.extract_article_data(html_file)
         actual_links = article.links
         expected_links = [
             # in-text links
             make_tagged_url("http://www.7sur7.be/7s7/fr/1748/Open-d-Australie/article/detail/1381644/2012/01/18/Baghdatis-fracasse-4-raquettes-en-25-secondes-video.dhtml", u"""Marcos Baghdatis - Stanislas Wawrinka, Australian Open 2012""", {'internal', 'in text'}),
             make_tagged_url("http://www.7sur7.be/7s7/fr/1513/tennis/article/detail/227704/2008/04/02/Furax-Youzhny-s-ouvre-le-crane-avec-sa-raquette.dhtml", u"""Mikhail Youzhny - Nicolas Almagro, Miami, 2008""", {'internal', 'in text'}),
             make_tagged_url("http://www.7sur7.be/7s7/fr/1513/tennis/article/detail/1109699/2010/05/25/Verdasco-a-Gasquet-Su-puta-madre.dhtml", u"""Fernando Verdasco - Richard Gasquet, Open de Nice, 2010""", {'internal', 'in text'}),
             # bottom box
             make_tagged_url("http://www.7sur7.be/newslettersports", u"""Inscrivez-vous à la newsletter sports de 7sur7 et recevez chaque jour les dernières infos sports""", {'bottom box', 'internal'}),
             # sidebar box
             make_tagged_url("/7s7/fr/1513/tennis/article/detail/1489797/2012/08/22/Darcis-se-paye-Roddick.dhtml", u"""Darcis se paye Roddick""", {'internal', 'sidebar box'}),
             make_tagged_url("/7s7/fr/1513/tennis/article/detail/1489306/2012/08/22/Wickmayer-eliminee-a-Dallas-Darcis-au-3e-tour-a-Salem.dhtml", u"""Wickmayer éliminée à Dallas, Darcis au 3e tour à Salem""", {'internal', 'sidebar box'}),
             make_tagged_url("/7s7/fr/1513/tennis/article/detail/1489235/2012/08/21/Goffin-affrontera-Kubot-en-1-8e-de-finale-a-Winston-Salem.dhtml", u"""Goffin affrontera Kubot en 1/8e de finale à Winston-Salem""", {'internal', 'sidebar box'}),
             make_tagged_url("/7s7/fr/1481/Home/26/Hors-jeu/actualite/index.dhtml", u"""Hors-jeu""", {'internal', 'sidebar box', 'keyword'}),
             # embedded youtube players
             make_tagged_url("http://www.youtube.com/embed/Oe6uLXaAnhQ/?wmode=opaque", u"""http://www.youtube.com/embed/Oe6uLXaAnhQ/?wmode=opaque""", {'external', 'embedded'}),
             make_tagged_url("http://www.youtube.com/embed/lKRaOgL6_-c/?wmode=opaque", u"""http://www.youtube.com/embed/lKRaOgL6_-c/?wmode=opaque""", {'external', 'embedded'}),
             make_tagged_url("http://www.youtube.com/embed/QqrCuIB76gs/?wmode=opaque", u"""http://www.youtube.com/embed/QqrCuIB76gs/?wmode=opaque""", {'external', 'embedded'}),
             make_tagged_url("http://www.youtube.com/embed/g7kS68T6ptA/?wmode=opaque", u"""http://www.youtube.com/embed/g7kS68T6ptA/?wmode=opaque""", {'external', 'embedded'}),
             make_tagged_url("http://www.youtube.com/embed/ekQ_Ja02gTY/?wmode=opaque", u"""http://www.youtube.com/embed/ekQ_Ja02gTY/?wmode=opaque""", {'external', 'embedded'}),
             make_tagged_url("http://www.youtube.com/embed/fi-CgSO9Evw/?wmode=opaque", u"""http://www.youtube.com/embed/fi-CgSO9Evw/?wmode=opaque""", {'external', 'embedded'}),
             make_tagged_url("http://www.youtube.com/embed/YQ2ssjDKWvk/?wmode=opaque", u"""http://www.youtube.com/embed/YQ2ssjDKWvk/?wmode=opaque""", {'external', 'embedded'}),
             make_tagged_url("http://www.youtube.com/embed/bnREpkrIhRM/?wmode=opaque", u"""http://www.youtube.com/embed/bnREpkrIhRM/?wmode=opaque""", {'external', 'embedded'}),
             make_tagged_url("http://www.youtube.com/embed/C8Nyc9jzSDg/?wmode=opaque", u"""http://www.youtube.com/embed/C8Nyc9jzSDg/?wmode=opaque""", {'external', 'embedded'}),
             make_tagged_url("http://www.youtube.com/embed/FaaezNd7ykg/?wmode=opaque", u"""http://www.youtube.com/embed/FaaezNd7ykg/?wmode=opaque""", {'external', 'embedded'}),
         ]
         assert_taggedURLs_equals(expected_links, actual_links)
예제 #3
0
    def test_storify_sidebox_bottom_links(self):
        """ lalibre parser extracts an embedded storify link alongside sidebar and bottom box links """
        with open(os.path.join(DATA_ROOT, "links_storify_sidebox_bottom_links.html")) as html_file:
            article, _ = lalibre.extract_article_data(html_file)
            actual_links = article.links

            # One flat list, ordered: sidebar box, bottom box, embedded media.
            expected_links = [
                # sidebar box
                make_tagged_url("#embed_pos1", u'''Le storify dédié à "Chevaux et baïonnettes"''', {'internal', 'sidebar box', 'anchor'}),
                make_tagged_url("/actu/usa-2012/article/773103/obama-depeint-un-romney-incompetent-en-politique-etrangere.html", u"""Obama dépeint un Romney incompétent en politique étrangère""", {'internal', 'sidebar box'}),
                make_tagged_url("/actu/usa-2012/article/773295/coup-de-mou-pour-un-dirigeable-de-mitt-romney.html", u"""Coup de mou pour un dirigeable de Mitt Romney""", {'internal', 'sidebar box'}),
                make_tagged_url("/actu/usa-2012/article/773089/obama-et-romney-a-egalite-avant-le-dernier-debat.html", u"""Obama et Romney à égalité avant le dernier débat""", {'internal', 'sidebar box'}),
                make_tagged_url("/actu/usa-2012/article/772885/reelu-obama-devrait-continuer-la-guerre-secrete-contre-al-qaida.html", u"""Réélu, Obama devrait continuer la "guerre secrète" contre Al-Qaïda""", {'internal', 'sidebar box'}),
                # bottom box
                make_tagged_url("/actu/usa-2012/article/773103/obama-depeint-un-romney-incompetent-en-politique-etrangere.html", u"""Obama dépeint un Romney incompétent en politique étrangère""", {'bottom box', 'internal'}),
                make_tagged_url("/actu/usa-2012/article/773089/obama-et-romney-a-egalite-avant-le-dernier-debat.html", u"""Obama et Romney à égalité avant le dernier débat""", {'bottom box', 'internal'}),
                make_tagged_url("/actu/usa-2012/article/772885/reelu-obama-devrait-continuer-la-guerre-secrete-contre-al-qaida.html", u"""Réélu, Obama devrait continuer la "guerre secrète" contre Al-Qaïda""", {'bottom box', 'internal'}),
                make_tagged_url("/actu/usa-2012/article/773295/coup-de-mou-pour-un-dirigeable-de-mitt-romney.html", u"""Coup de mou pour un dirigeable de Mitt Romney""", {'bottom box', 'internal'}),
                make_tagged_url("/actu/usa-2012/article/773304/mais-qu-ont-elles-de-si-interessant-ces-notes.html", u"""Mais qu'ont-elles de si intéressant ces notes?""", {'bottom box', 'internal'}),
                make_tagged_url("/actu/usa-2012/article/773547/usa-virgil-goode-le-petit-candidat-qui-pourrait-jouer-un-grand-role.html", u"""USA : Virgil Goode, le petit candidat qui pourrait jouer un grand rôle""", {'bottom box', 'internal'}),
                make_tagged_url("/actu/usa-2012/article/773578/la-tournee-effrenee-de-barack-obama.html", u"""La tournée effrénée de Barack Obama""", {'bottom box', 'internal'}),
                make_tagged_url("/actu/usa-2012/article/773807/obama-a-t-il-traite-romney-de-bullshitter.html", u"""Obama a-t-il traité Romney de "bullshitter" ?""", {'bottom box', 'internal'}),
                make_tagged_url("/actu/usa-2012/article/774012/que-peut-on-attendre-d-un-deuxieme-mandat-de-barack-obama.html", u"""Que peut-on attendre d’un deuxième mandat de Barack Obama ?""", {'bottom box', 'internal'}),
                # embedded media
                make_tagged_url("http://storify.com/pocket_pau/chyevaux-et-baionnettes-invites-surprises-du-derni", u"""View the story "Chevaux et baïonnettes, invités surprises du dernier débat " on Storify""", {'external', 'embedded', 'script'}),
            ]

            assert_taggedURLs_equals(expected_links, actual_links)
예제 #4
0
    def test_embedded_tweet(self):
        """ lalibre parser extracts and tags an embedded tweet, plus the sidebar and bottom box links around it """
        with open(os.path.join(DATA_ROOT, "embedded_tweet.html")) as html_file:
            article, _ = lalibre.extract_article_data(html_file)
            actual_links = article.links
            expected_links = [
                # embedded tweet
                make_tagged_url("https://twitter.com/JohnnySjh/status/282908799470292993", u"""https://twitter.com/JohnnySjh/status/282908799470292993""", {'tweet', 'embedded media', 'external'}),
                # sidebar box
                make_tagged_url("/culture/people/article/785865/depardieu-apparait-en-chaise-roulante.html", u"""Depardieu apparaît... en chaise roulante""", {'internal', 'sidebar box'}),
                make_tagged_url("/culture/people/article/784586/vous-soutenez-gerard-depardieu.html", u"""Vous soutenez Gérard Depardieu""", {'internal', 'sidebar box'}),
                make_tagged_url("/actu/international/article/785820/hollande-si-on-aime-la-france-on-doit-la-servir.html", u'''Hollande : "Si on aime la France, on doit la servir"''', {'internal', 'sidebar box'}),
                make_tagged_url("/actu/international/article/785045/gerard-depardieu-il-va-s-embeter-en-belgique-juge-cohn-bendit.html", u"""Gérard Depardieu? "Il va s'embêter" en Belgique juge Cohn-Bendit""", {'internal', 'sidebar box'}),
                make_tagged_url("/actu/international/article/784814/depardieu-qu-il-retourne-au-cinema-muet.html", u'''Depardieu, "Qu'il retourne au cinéma muet"''', {'internal', 'sidebar box'}),
                make_tagged_url("/actu/international/article/784820/riches-ils-ont-quitte-la-france.html", u"""Riches, ils ont quitté la France""", {'internal', 'sidebar box'}),
                make_tagged_url("/actu/international/article/784891/edito-minable.html", u"""Édito : Minable... ?""", {'internal', 'sidebar box'}),
                # bottom box
                make_tagged_url("/culture/people/article/785865/depardieu-apparait-en-chaise-roulante.html", u"""Depardieu apparaît... en chaise roulante""", {'bottom box', 'internal'}),
                make_tagged_url("/culture/people/article/784586/vous-soutenez-gerard-depardieu.html", u"""Vous soutenez Gérard Depardieu""", {'bottom box', 'internal'}),
                make_tagged_url("/actu/international/article/785820/hollande-si-on-aime-la-france-on-doit-la-servir.html", u'''Hollande : "Si on aime la France, on doit la servir"''', {'bottom box', 'internal'}),
                make_tagged_url("/actu/international/article/785045/gerard-depardieu-il-va-s-embeter-en-belgique-juge-cohn-bendit.html", u"""Gérard Depardieu? "Il va s'embêter" en Belgique juge Cohn-Bendit""", {'bottom box', 'internal'}),
                make_tagged_url("/actu/international/article/784814/depardieu-qu-il-retourne-au-cinema-muet.html", u'''Depardieu, "Qu'il retourne au cinéma muet"''', {'bottom box', 'internal'}),
                make_tagged_url("/actu/international/article/784820/riches-ils-ont-quitte-la-france.html", u"""Riches, ils ont quitté la France""", {'bottom box', 'internal'}),
                make_tagged_url("/actu/international/article/784891/edito-minable.html", u"""Édito : Minable... ?""", {'bottom box', 'internal'}),
                make_tagged_url("/actu/international/article/787374/taxe-a-75-depardieu-reste-en-belgique.html", u"""Taxe à 75%: Depardieu reste en Belgique""", {'bottom box', 'internal'}),
                make_tagged_url("/societe/cyber/article/788995/twitter-veut-le-feu-vert-de-la-justice-pour-denoncer-les-racistes.html", u"""Twitter veut le feu vert de la justice pour dénoncer les racistes""", {'bottom box', 'internal'}),
            ]

            assert_taggedURLs_equals(expected_links, actual_links)
예제 #5
0
 def test_links_intext_overload(self):
     """ lalibre parser copes with an article containing many plaintext links"""
     with open(os.path.join(DATA_ROOT, "links_intext_overload.html")) as html_file:
         article, _ = lalibre.extract_article_data(html_file)
         actual_links = article.links
         expected_links = [
             # plaintext urls found in the text
             make_tagged_url("www.nyx.com", u"""www.nyx.com""", {'plaintext', 'external', 'in text'}),
             make_tagged_url("europeanequities.nyx.com", u"""europeanequities.nyx.com""", {'plaintext', 'external', 'in text'}),
             make_tagged_url("www.bourse.be", u"""www.bourse.be""", {'plaintext', 'external', 'in text'}),
             make_tagged_url("www.beurs.be", u"""www.beurs.be""", {'plaintext', 'external', 'in text'}),
             make_tagged_url("bourse.be", u"""bourse.be""", {'plaintext', 'external', 'in text'}),
             make_tagged_url("http://www.londonstockexchange.com", u"""http://www.londonstockexchange.com""", {'plaintext', 'external', 'in text'}),
             make_tagged_url("http://www.six-swiss-exchange.com/", u"""http://www.six-swiss-exchange.com/""", {'plaintext', 'external', 'in text'}),
             make_tagged_url("http://deutsche-boerse.com", u"""http://deutsche-boerse.com""", {'plaintext', 'external', 'in text'}),
             # sidebar box
             make_tagged_url("/economie/actualite/article/754828/le-jeu-video-sans-console-via-belgacom.html", u"""Le jeu vidéo sans console via Belgacom""", {'internal', 'sidebar box'}),
             make_tagged_url("/economie/actualite/article/753635/suivre-les-cours-de-bourse-a-la-plage-gare-aux-plongeons.html", u"""Suivre les cours de Bourse à la plage ? Gare aux plongeons !""", {'internal', 'sidebar box'}),
             make_tagged_url("/economie/actualite/article/752413/travailler-en-vacances-une-autre-facon-de-garder-la-ligne.html", u"""Travailler en vacances : une autre façon de garder la ligne !""", {'internal', 'sidebar box'}),
             # bottom box
             make_tagged_url("/economie/actualite/article/754828/le-jeu-video-sans-console-via-belgacom.html", u"""Le jeu vidéo sans console via Belgacom""", {'bottom box', 'internal'}),
             make_tagged_url("/economie/actualite/article/753635/suivre-les-cours-de-bourse-a-la-plage-gare-aux-plongeons.html", u"""Suivre les cours de Bourse à la plage ? Gare aux plongeons !""", {'bottom box', 'internal'}),
             make_tagged_url("/economie/actualite/article/752413/travailler-en-vacances-une-autre-facon-de-garder-la-ligne.html", u"""Travailler en vacances : une autre façon de garder la ligne !""", {'bottom box', 'internal'}),
             make_tagged_url("/economie/actualite/article/755981/la-grece-lance-une-bataille-diplomatique.html", u"""La Grèce lance une bataille diplomatique""", {'bottom box', 'internal'}),
             make_tagged_url("/economie/actualite/article/755996/apple-roi-de-la-bourse-us.html", u"""Apple, roi de la bourse US""", {'bottom box', 'internal'}),
         ]
         assert_taggedURLs_equals(expected_links, actual_links)
예제 #6
0
    def test_embedded_videos_links(self):
        """ lalibre parser handles an article containing several embedded videos """
        with open(os.path.join(DATA_ROOT, "links_embedded_videos.html")) as html_file:
            article, _ = lalibre.extract_article_data(html_file)
            actual_links = article.links

            # One flat list, ordered: sidebar box, bottom box, embedded media.
            expected_links = [
                # sidebar box
                make_tagged_url("#embed_pos1", u"""L'incroyable but de Mexès face à Anderlecht""", {'internal', 'sidebar box', 'anchor'}),
                make_tagged_url("http://infosports.lalibre.be/football/ligue-des-champions/phase-de-groupes/groupe-f/rencontre/209126/anderlecht-ac-milan/direct", u"""Revivez la rencontre Anderlecht-Milan""", {'sidebar box', 'internal site'}),
                make_tagged_url("http://betfirst.dhnet.be", u"""Faites vos paris sportifs""", {'sidebar box', 'external'}),
                make_tagged_url("http://infosports.lalibre.be/football/ligue-des-champions/phase-de-groupes/groupe-c/resultats", u"""Les résultats et classements de Ligue des Champions""", {'sidebar box', 'internal site'}),
                make_tagged_url("#embed_pos5", u'''Deschacht : "pas d'excuse, je devais marquer"''', {'internal', 'sidebar box', 'anchor'}),
                make_tagged_url("#embed_pos3", u'''Proto : "jamais pris un but comme ça"''', {'internal', 'sidebar box', 'anchor'}),
                # bottom box
                make_tagged_url("/sports/football/article/779271/genk-qualifie-un-match-avant-la-fin.html", u"""Genk qualifié un match avant la fin""", {'bottom box', 'internal'}),
                make_tagged_url("/sports/football/article/779281/bruges-sort-sans-gloire-de-l-europa-league-1-2.html", u"""Bruges sort sans gloire de l'Europa League (1-2)""", {'bottom box', 'internal'}),
                make_tagged_url("/sports/football/article/780016/jamais-l-horizon-mauve-n-aura-paru-aussi-degage.html", u"""Jamais l’horizon mauve n’aura paru aussi dégagé""", {'bottom box', 'internal'}),
                make_tagged_url("http://infosports.lalibre.be/football/ligue-des-champions/phase-de-groupes/groupe-f/rencontre/209126/anderlecht-ac-milan/direct", u"""Revivez la rencontre Anderlecht-Milan""", {'bottom box', 'internal site'}),
                make_tagged_url("http://betfirst.dhnet.be", u"""Faites vos paris sportifs""", {'bottom box', 'external'}),
                make_tagged_url("http://infosports.lalibre.be/football/ligue-des-champions/phase-de-groupes/groupe-c/resultats", u"""Les résultats et classements de Ligue des Champions""", {'bottom box', 'internal site'}),
                # embedded media
                make_tagged_url("http://www.youtube.com/embed/C8Z3yoIfUqc", u"""http://www.youtube.com/embed/C8Z3yoIfUqc""", {'embedded', 'external', 'iframe'}),
                make_tagged_url("http://sa.kewego.com/swf/kp.swf?language_code=fr&width=510&height=383&playerKey=bf195c8ba4f5&configKey=&suffix=&sig=b10774aee0es&autostart=false", u"""__NO_TITLE__""", {'kplayer', 'video', 'external', 'embedded'}),
                make_tagged_url("http://sa.kewego.com/swf/kp.swf?language_code=fr&width=510&height=383&playerKey=bf195c8ba4f5&configKey=&suffix=&sig=ca8cdb85890s&autostart=false", u"""__NO_TITLE__""", {'kplayer', 'video', 'external', 'embedded'}),
                make_tagged_url("http://sa.kewego.com/swf/kp.swf?language_code=fr&width=510&height=383&playerKey=bf195c8ba4f5&configKey=&suffix=&sig=b79fc6ccc1bs&autostart=false", u"""__NO_TITLE__""", {'kplayer', 'video', 'external', 'embedded'}),
                make_tagged_url("http://sa.kewego.com/swf/kp.swf?language_code=fr&width=510&height=383&playerKey=bf195c8ba4f5&configKey=&suffix=&sig=1b2b29b8280s&autostart=false", u"""__NO_TITLE__""", {'kplayer', 'video', 'external', 'embedded'}),
                make_tagged_url("http://sa.kewego.com/swf/kp.swf?language_code=fr&width=510&height=383&playerKey=bf195c8ba4f5&configKey=&suffix=&sig=d68d7e19b04s&autostart=false", u"""__NO_TITLE__""", {'kplayer', 'video', 'external', 'embedded'}),
            ]

            assert_taggedURLs_equals(expected_links, actual_links)
예제 #7
0
 def test_same_owner(self):
     """ The 7sur7.be parser extracts and tags links to 'same owner' sites, among other links"""
     with open(os.path.join(DATA_ROOT, "same_owner.html")) as html_file:
         article, _ = septsursept.extract_article_data(html_file)
         actual_links = article.links
         expected_links = [
             # in-text links
             make_tagged_url("http://www.7sur7.be/7s7/fr/1745/Standard/article/detail/1506809/2012/09/25/Jelle-Van-Damme-menace-De-Ceuninck-Fais-attention-toi.dhtml", u"""Si la tension est clairement montée entre Jelle Van Damme et Benjamin Deceuninck hier soir à Mouscron""", {'internal', 'in text'}),
             make_tagged_url("http://www.7sur7.be/7s7/fr/1745/Standard/article/detail/1506951/2012/09/26/Deceuninck-Je-ne-me-suis-pas-senti-menace.dhtml", u'''Le journaliste de la RTBF ne s'est pas "senti menacé"''', {'internal', 'in text'}),
             make_tagged_url("http://www.standard.be/multimedia/videos/details-video/~itv-jelle-van-damme.htm?lng=fr#.UGMIPnI4SSo", u'''"Standard TV".''', {'external', 'in text'}),
             make_tagged_url("http://www.standard.be/multimedia/videos/details-video/~itv-jelle-van-damme.htm?lng=fr#.UGMIPnI4SSo", u"""__GHOST_LINK__""", {'external', 'in text'}),
             # bottom box
             make_tagged_url("/7s7/fr/1509/Football-Belge/article/detail/1506791/2012/09/25/Le-Standard-est-passe-tout-pres-de-la-catastrophe.dhtml", u"""Le Standard est passé tout près de la catastrophe""", {'bottom box', 'internal'}),
             make_tagged_url("/7s7/fr/1745/Standard/article/detail/1506809/2012/09/25/Jelle-Van-Damme-menace-De-Ceuninck-Fais-attention-toi.dhtml", u'''Jelle Van Damme menace De Ceuninck: "Fais attention, toi!"''', {'bottom box', 'internal'}),
             make_tagged_url("/7s7/fr/1745/Standard/article/detail/1506881/2012/09/26/Ron-Jans-Oui-cette-qualification-me-soulage.dhtml", u'''Ron Jans: "Oui, cette qualification me soulage"''', {'bottom box', 'internal'}),
             make_tagged_url("/7s7/fr/1745/Standard/article/detail/1506951/2012/09/26/Deceuninck-Je-ne-me-suis-pas-senti-menace.dhtml", u'''Deceuninck: "Je ne me suis pas senti menacé"''', {'bottom box', 'internal'}),
             make_tagged_url("/7s7/fr/1509/Football-Belge/article/detail/1507218/2012/09/26/Le-parquet-propose-deux-rencontres-a-Batshuayi.dhtml", u"""Le parquet propose deux rencontres à Batshuayi""", {'bottom box', 'internal'}),
             make_tagged_url("http://www.11dor.be", u"""Jouez avec: Le 11 d'Or et gagnez 25.000 euro!""", {'bottom box', 'external', 'same owner'}),
             make_tagged_url("http://www.7sur7.be/newslettersports", u"""Inscrivez-vous à la newsletter sports de 7sur7 et recevez chaque jour les dernières infos sports""", {'bottom box', 'internal'}),
             # sidebar box
             make_tagged_url("/7s7/fr/1509/Football-Belge/article/detail/1506791/2012/09/25/Le-Standard-est-passe-tout-pres-de-la-catastrophe.dhtml", u"""Le Standard est passé tout près de la catastrophe""", {'internal', 'sidebar box'}),
             make_tagged_url("/7s7/fr/1745/Standard/article/detail/1506809/2012/09/25/Jelle-Van-Damme-menace-De-Ceuninck-Fais-attention-toi.dhtml", u'''Jelle Van Damme menace De Ceuninck: "Fais attention, toi!"''', {'internal', 'sidebar box'}),
             make_tagged_url("/7s7/fr/1745/Standard/article/detail/1506881/2012/09/26/Ron-Jans-Oui-cette-qualification-me-soulage.dhtml", u'''Ron Jans: "Oui, cette qualification me soulage"''', {'internal', 'sidebar box'}),
             make_tagged_url("/7s7/fr/1481/Home/932/Ligue-Jupiler/actualite/index.dhtml", u"""Ligue Jupiler""", {'internal', 'sidebar box', 'keyword'}),
             make_tagged_url("/7s7/fr/1481/Home/493/Standard/actualite/index.dhtml", u"""Standard""", {'internal', 'sidebar box', 'keyword'}),
         ]
         assert_taggedURLs_equals(expected_links, actual_links)
예제 #8
0
 def test_links_embedded_thumbnails(self):
     """ sudinfo parser does not report the embedded gallery images of the 'medias' box as links"""
     with open(os.path.join(DATA_ROOT, "links_embedded_thumbnails.html")) as html_file:
         article, _ = sudinfo.extract_article_data(html_file)
         actual_links = article.links
         # No link at all is expected for this fixture.
         expected_links = []
         assert_taggedURLs_equals(expected_links, actual_links)
예제 #9
0
 def test_no_links(self):
     """ sudinfo parser yields an empty link list for an article without any link. """
     with open(os.path.join(DATA_ROOT, "no_links.html")) as html_file:
         article, _ = sudinfo.extract_article_data(html_file)
         actual_links = article.links
         # The fixture contains no link, so nothing should be extracted.
         expected_links = []
         assert_taggedURLs_equals(expected_links, actual_links)
예제 #10
0
 def test_embedded_storify_top_box(self):
     """ lesoir_new parser extracts the storify embed from the 'embedded_storify_top_box' fixture and tags it as 'storify' """
     with open(os.path.join(DATA_ROOT, "embedded_storify_top_box.html")) as html_file:
         article, _ = lesoir_new.extract_article_data(html_file)
         actual_links = article.links
         # The storify embed is the only link expected; its title is the url itself.
         expected_links = [
             make_tagged_url("http://storify.com/lesoir/conference-de-presse-de-l-eurogroupe-sur-le-me", u"""http://storify.com/lesoir/conference-de-presse-de-l-eurogroupe-sur-le-me""", {'external', 'embedded', 'storify'}),
         ]
         assert_taggedURLs_equals(expected_links, actual_links)
예제 #11
0
 def test_links_old_flowplayer(self):
     """[BACKWARDS] lavenir parser tags an embedded flowplayer video as an embedded video, using placeholder url/title and the 'unfinished' tag"""
     with open(os.path.join(DATA_ROOT, "links_old_flowplayer.html")) as html_file:
         article, _ = lavenir.extract_article_data(html_file)
         actual_links = article.links
         # Url and title are placeholders: the real video url is not extracted.
         expected_links = [
             make_tagged_url("__EMBEDDED_VIDEO_URL__", u"""__EMBEDDED_VIDEO_TITLE__""", {u'unfinished', 'video', 'external', 'embedded', 'flowplayer'}),
         ]
         assert_taggedURLs_equals(expected_links, actual_links)
예제 #12
0
 def test_links_intext_not_plaintext(self):
     """ sudinfo parser reports an in-text url a single time, without a duplicate 'plaintext' entry"""
     with open(os.path.join(DATA_ROOT, "links_intext_not_plaintext.html")) as html_file:
         article, _ = sudinfo.extract_article_data(html_file)
         actual_links = article.links
         # Exactly one entry is expected, and it does not carry the 'plaintext' tag.
         expected_links = [
             make_tagged_url("http://secourslux.blogs.sudinfo.be", u"""http://secourslux.blogs.sudinfo.be""", {'in text', 'internal', 'jblog'}),
         ]
         assert_taggedURLs_equals(expected_links, actual_links)
예제 #13
0
 def test_in_text_link_extraction(self):
     """ sudinfo parser extracts in-text links and tags them """
     with open(os.path.join(DATA_ROOT, "in_text_link_extraction.html")) as html_file:
         article, _ = sudinfo.extract_article_data(html_file)
         actual_links = article.links
         # A single external in-text link is expected.
         expected_links = [
             make_tagged_url("http://www.sporza.be/cm/sporza/videozone/MG_programmas/MG_Extra_Time_GNMA/1.1450385?utm_medium=twitter&utm_source=dlvr.it", u"""Cliquez ici pour consulter la vidéo capturée par nos confrères de Sporza.""", {'external', 'in text'}),
         ]
         assert_taggedURLs_equals(expected_links, actual_links)
예제 #14
0
 def test_plaintext_links_tagging(self):
     """ Sudpresse parser applies the 'plaintext' tag to plaintext links."""
     with open(os.path.join(DATA_ROOT, "plaintext_links_tagging.html")) as html_file:
         article, _ = sudpresse.extract_article_data(html_file)
         actual_links = article.links
         # For a plaintext link the title is the url text itself.
         expected_links = [
             make_tagged_url("http://expo-guide.com", u"""http://expo-guide.com""", {'plaintext', 'external', 'in text'}),
         ]
         assert_taggedURLs_equals(expected_links, actual_links)
예제 #15
0
 def test_links_embedded_youtube(self):
     """ sudinfo parser extracts the link to a youtube video shown inside the article media gallery"""
     with open(os.path.join(DATA_ROOT, "links_embedded_youtube.html")) as html_file:
         article, _ = sudinfo.extract_article_data(html_file)
         actual_links = article.links
         # The video carries no title, hence the __NO_TITLE__ placeholder.
         expected_links = [
             make_tagged_url("http://www.youtube.com/watch?v=4tkHmGycfz4", u"""__NO_TITLE__""", {'youtube', 'video', 'external', 'embedded'}),
         ]
         assert_taggedURLs_equals(expected_links, actual_links)
예제 #16
0
 def test_new_links_special_in_text(self):
     """ lavenir [new template] parser can extract in-text links, even when they appear to sit inside a bolded paragraph"""
     with open(os.path.join(DATA_ROOT, "new_links_special_in_text.html")) as html_file:
         article, _ = lavenir.extract_article_data(html_file)
         actual_links = article.links
         # One in-text link plus one keyword link are expected.
         expected_links = [
             make_tagged_url("http://tech.lavenir.net/ge/visite_touristique.kmz", u"""Pour découvrir ce parcours en mode 3D avec photographies, cliquez sur ce lien""", {'internal', 'internal site', 'in text'}),
             make_tagged_url("/sports/jogging", u"""Jogging""", {'internal', 'keyword'}),
         ]
         assert_taggedURLs_equals(expected_links, actual_links)
예제 #17
0
 def test_embedded_video_extraction(self):
     """ sudinfo parser extracts and tags a video embedded at the bottom of an article. """
     with open(os.path.join(DATA_ROOT, "embedded_video_extraction.html")) as html_file:
         article, _ = sudinfo.extract_article_data(html_file)
         actual_links = article.links
         # The embedded video comes first, followed by one sidebar box link.
         expected_links = [
             make_tagged_url(u"http://api.kewego.com/video/getHTML5Thumbnail/?playerKey=7b7e2d7a9682&sig=5a5a3d9f57ds", u"""http://api.kewego.com/video/getHTML5Thumbnail/?playerKey=7b7e2d7a9682&sig=5a5a3d9f57ds""", {'video', 'external', 'embedded', 'bottom'}),
             make_tagged_url(u"/338194/article/regions/tournai/2012-02-29/prostitution-“dodo-la-saumure”-va-demander-l’acquittement-sur-tout-jeudi-devant", u"""Prostitution: “Dodo la Saumure” va demander l’acquittement sur tout jeudi devant la justice""", {'internal', 'sidebar box'}),
         ]
         assert_taggedURLs_equals(expected_links, actual_links)
예제 #18
0
 def test_embedded_scribble_live(self):
     """ lesoir_new parser extracts and correctly tags an embedded scribble live """
     fixture_path = os.path.join(DATA_ROOT, "embedded_scribble_live.html")
     with open(fixture_path) as fixture:
         # Only the parsed article matters here; the raw html is discarded.
         article, _ = lesoir_new.extract_article_data(fixture)
         actual_links = article.links
         expected_links = [
             make_tagged_url("http://football.lesoir.be/jupiler-pro-league/resultats", u"""Tous les résultats et classements""", {'internal', 'sidebar box', 'internal site'}),
             make_tagged_url("http://embed.scribblelive.com/Embed/v5.aspx?Id=86477&ThemeId=7346", u"""http://embed.scribblelive.com/Embed/v5.aspx?Id=86477&ThemeId=7346""", {'iframe', 'external', 'embedded'}),
         ]
         assert_taggedURLs_equals(expected_links, actual_links)
예제 #19
0
 def test_intro_type1(self):
     """ lesoir_new can extract intro (checked through the links found in the intro_type1 fixture) """
     fixture_path = os.path.join(DATA_ROOT, "intro_type1.html")
     with open(fixture_path) as fixture:
         # Only the parsed article matters here; the raw html is discarded.
         article, _ = lesoir_new.extract_article_data(fixture)
         actual_links = article.links
         expected_links = [
             make_tagged_url("http://www.lacapitale.be/674531/article/actualite/politique/2013-03-01/didier-reynders-veut-mettre-nos-imams-sous-controle", u"""dans un entretien donné aux journaux SudPresse""", {'same owner', 'external', 'in text'}),
             make_tagged_url("http://www.lacapitale.be/674531/article/actualite/politique/2013-03-01/didier-reynders-veut-mettre-nos-imams-sous-controle", u"""Didier Reynders veut mettre nos imams sous contrôle (SudPresse)""", {'sidebar box', 'external', 'same owner'}),
         ]
         assert_taggedURLs_equals(expected_links, actual_links)
예제 #20
0
 def test_kplayer_without_title(self):
     """ lesoir_new parser yields the expected links for the kplayer_without_title fixture, including an embedded kplayer tagged with __NO_TITLE__ """
     fixture_path = os.path.join(DATA_ROOT, "kplayer_without_title.html")
     with open(fixture_path) as fixture:
         # Only the parsed article matters here; the raw html is discarded.
         article, _ = lesoir_new.extract_article_data(fixture)
         actual_links = article.links
         expected_links = [
             make_tagged_url("http://soirmag.lesoir.be/search/node/gandolfi", u"""Dans toutes ses interviews""", {'internal', 'internal site', 'in text'}),
             make_tagged_url("http://soirmag.lesoir.be/search/node/gandolfi", u"""Les articles sur Barbara Gandolfi sur SoirMag""", {'internal', 'sidebar box', 'internal site'}),
             make_tagged_url("http://sll.kewego.com/swf/p3/epix.swf?language_code=fr&playerKey=5ff3260def2a&skinKey=6624e00d250s&sig=d09800d9f8as&autostart=false&advertise=true", u"""__NO_TITLE__""", {'kplayer', 'external', 'embedded', 'top box'}),
         ]
         assert_taggedURLs_equals(expected_links, actual_links)
예제 #21
0
 def test_intro_type2(self):
     """ lesoir_new can extract another type of intro (checked through the links found in the intro_type2 fixture) """
     fixture_path = os.path.join(DATA_ROOT, "intro_type2.html")
     with open(fixture_path) as fixture:
         # Only the parsed article matters here; the raw html is discarded.
         article, _ = lesoir_new.extract_article_data(fixture)
         actual_links = article.links
         expected_links = [
             make_tagged_url("http://www.lesoir.be/191377/article/culture/cinema/2013-02-16/berlinale-%C2%ABthe-broken-circle-breakdown%C2%BB-remporte-prix-du-public", u"""Berlinale: «The Broken Circle Breakdown» remporte le prix du Public""", {'internal', 'sidebar box'}),
             make_tagged_url("http://www.youtube.com/watch?v=ZtoCo9pJ2yU", u"""http://www.youtube.com/watch?v=ZtoCo9pJ2yU""", {'video', 'external', 'embedded', 'top box'}),
         ]
         assert_taggedURLs_equals(expected_links, actual_links)
예제 #22
0
 def test_new_links_ignore_photosets(self):
     """ lavenir [new template] parser ignores photosets: only the keyword links are expected """
     fixture_path = os.path.join(DATA_ROOT, "new_links_ignore_photosets.html")
     with open(fixture_path) as fixture:
         # Only the parsed article matters here; the raw html is discarded.
         article, _ = lavenir.extract_article_data(fixture)
         actual_links = article.links
         expected_links = [
             make_tagged_url("/sports/cyclisme", u"""Cyclisme""", {'internal', 'keyword'}),
             make_tagged_url("http://www.lavenir.net/filinfo/sports", u"""Sports""", {'internal', 'keyword'}),
         ]
         assert_taggedURLs_equals(expected_links, actual_links)
예제 #23
0
 def test_links_embedded_kewego_gallery(self):
     """ sudinfo parser extracts kewego videos from the article media gallery """
     fixture_path = os.path.join(DATA_ROOT, "links_embedded_kewego_gallery.html")
     with open(fixture_path) as fixture:
         # Only the parsed article matters here; the raw html is discarded.
         article, _ = sudinfo.extract_article_data(fixture)
         actual_links = article.links
         expected_links = [
             make_tagged_url("http://portfolio.sudpresse.be/main.php?g2_itemId=992521", u"""Une belle après-midi à Bleid""", {'internal', 'sidebar box', 'gallery'}),
             make_tagged_url("http://sll.kewego.com/swf/p3/epix.swf?language_code=fr&playerKey=7b7e2d7a9682&skinKey=a07930e183e6&sig=054c411daa8s&autostart=0&advertise=true", u"""__NO_TITLE__""", {'kewego', 'video', 'external', 'embedded'}),
         ]
         assert_taggedURLs_equals(expected_links, actual_links)
예제 #24
0
 def test_same_owner_tagging(self):
     """ Sudpresse parser puts the 'same owner' tag on the links of the same_owner_tagging fixture """
     fixture_path = os.path.join(DATA_ROOT, "same_owner_tagging.html")
     with open(fixture_path) as fixture:
         # Only the parsed article matters here; the raw html is discarded.
         article, _ = sudpresse.extract_article_data(fixture)
         actual_links = article.links
         expected_links = [
             make_tagged_url("http://www.nordeclair.fr/Actualite/Depeches/2012/02/13/dujardin-a-lille.shtml", u"""Nos confrères de Nord Eclair France """, {'same owner', 'external', 'in text'}),
             make_tagged_url("http://www.nordeclair.fr/Actualite/Depeches/2012/02/13/dujardin-a-lille.shtml", u"""Voir sur le site nordeclair.fr""", {'external', 'same owner', 'sidebar box'}),
         ]
         assert_taggedURLs_equals(expected_links, actual_links)
예제 #25
0
 def test_embedded_ustream(self):
     """ lesoir_new parser yields the expected links for the embedded_ustream fixture, including an embedded video tagged 'unfinished' with neither url nor title """
     fixture_path = os.path.join(DATA_ROOT, "embedded_ustream.html")
     with open(fixture_path) as fixture:
         # Only the parsed article matters here; the raw html is discarded.
         article, _ = lesoir_new.extract_article_data(fixture)
         actual_links = article.links
         expected_links = [
             make_tagged_url("http://www.lesoir.be/187345/article/economie/2013-02-11/gaz-schiste-menace-pour-belgique", u"""notre dossier sur le gaz de schiste""", {'internal', 'in text'}),
             make_tagged_url("http://sll.kewego.com/swf/p3/epix.swf?language_code=fr&playerKey=5ff3260def2a&skinKey=6624e00d250s&sig=ed3b67b4053s&autostart=false&advertise=true", u"""__NO_TITLE__""", {'kplayer', 'video', 'external', 'embedded', 'top box'}),
             make_tagged_url("__NO_URL__", u"""__NO_TITLE__""", {'video', u'unfinished', 'embedded'}),
         ]
         assert_taggedURLs_equals(expected_links, actual_links)
예제 #26
0
 def test_intext_links_tagging(self):
     """ Sudpresse parser puts the 'in text' tag on the links of the intext_links_tagging fixture """
     fixture_path = os.path.join(DATA_ROOT, "intext_links_tagging.html")
     with open(fixture_path) as fixture:
         # Only the parsed article matters here; the raw html is discarded.
         article, _ = sudpresse.extract_article_data(fixture)
         actual_links = article.links
         expected_links = [
             make_tagged_url("http://www.lameuse.be/vervietois2011", u"""www.lameuse.be/vervietois2011""", {'same owner', 'external', 'in text'}),
             make_tagged_url("http://verviers.lameuse.be", u"""http://verviers.lameuse.be""", {'same owner', 'external', 'in text'}),
         ]
         assert_taggedURLs_equals(expected_links, actual_links)
예제 #27
0
 def test_links_new_ignore_animated_gifs_in_video_div(self):
     """ lavenir [new template] parser ignores <img> elements located where a video should have been. It also works for animated gif files. Which are pronounced 'jif', btw."""
     fixture_path = os.path.join(DATA_ROOT, "links_new_ignore_animated_gifs_in_video_div.html")
     with open(fixture_path) as fixture:
         # Only the parsed article matters here; the raw html is discarded.
         article, _ = lavenir.extract_article_data(fixture)
         actual_links = article.links
         expected_links = [
             make_tagged_url("http://www.lavenir.net/filinfo/sports", u"""Sports""", {'internal', 'keyword'}),
             make_tagged_url("/sports/football/premierleague", u"""Premier League""", {'internal', 'keyword'}),
         ]
         assert_taggedURLs_equals(expected_links, actual_links)
예제 #28
0
 def test_in_text_same_owner(self):
     """ sudinfo parser extracts and tags in-text and sidebar links pointing to same owner sites """
     fixture_path = os.path.join(DATA_ROOT, "in_text_same_owner.html")
     with open(fixture_path) as fixture:
         # Only the parsed article matters here; the raw html is discarded.
         article, _ = sudinfo.extract_article_data(fixture)
         actual_links = article.links
         expected_links = [
             make_tagged_url("http://www.lesoir.be/sports/football/2012-05-31/en-combien-de-temps-hazard-gagne-t-il-votre-salaire-918967.php", u"""Le Soir.be""", {'same owner', 'external', 'in text'}),
             make_tagged_url("http://www.lesoir.be/sports/football/2012-05-31/en-combien-de-temps-hazard-gagne-t-il-votre-salaire-918967.php", u"""En combien de temps, Eden Hazard gagne votre salaire?""", {'sidebar box', 'external', 'same owner'}),
             make_tagged_url("slate.fr", u"""slate.fr""", {'in text', 'plaintext', 'external'}),
         ]
         assert_taggedURLs_equals(expected_links, actual_links)
예제 #29
0
 def test_links_new_ignore_images_in_video_div(self):
     """ lavenir [new template] parser ignores <img> elements located where a video should have been """
     fixture_path = os.path.join(DATA_ROOT, "links_new_ignore_images_in_video_div.html")
     with open(fixture_path) as fixture:
         # Only the parsed article matters here; the raw html is discarded.
         article, _ = lavenir.extract_article_data(fixture)
         actual_links = article.links
         expected_links = [
             make_tagged_url("http://www.lavenir.net/buzz/insolite", u"""Insolite""", {'internal', 'keyword'}),
             make_tagged_url("http://www.lavenir.net/filinfo/sports", u"""Sports""", {'internal', 'keyword'}),
             make_tagged_url("/sports/football", u"""Football""", {'internal', 'keyword'}),
         ]
         assert_taggedURLs_equals(expected_links, actual_links)
예제 #30
0
 def test_new_links_soccer_video_from_hungary(self):
     """ lavenir [new template] parser loves hungarian soccer videos (an embedded videa.hu video is expected among the links) """
     fixture_path = os.path.join(DATA_ROOT, "new_links_soccer_video_from_hungary.html")
     with open(fixture_path) as fixture:
         # Only the parsed article matters here; the raw html is discarded.
         article, _ = lavenir.extract_article_data(fixture)
         actual_links = article.links
         expected_links = [
             make_tagged_url("http://videa.hu/videok/sport/as-roma-3-1-genoa-alessio-romagnoli-francesco-totti-a7Mhqa5118CHtLlG", u"""szólj hozzá: AS Roma 3-1 Genoa MATCH HIGHLIGHTS""", {'video', 'external', 'embedded'}),
             make_tagged_url("http://www.lavenir.net/filinfo/sports", u"""Sports""", {'internal', 'keyword'}),
             make_tagged_url("/sports/football/serie-a", u"""Serie A""", {'internal', 'keyword'}),
         ]
         assert_taggedURLs_equals(expected_links, actual_links)