예제 #1
0
def test_fix_common_url_mistakes():
    """fix_common_url_mistakes() repairs typical malformed URLs, and doing so is idempotent."""
    test_cases = [
        # "http://http://"
        ('http://http://www.al-monitor.com/pulse', 'http://www.al-monitor.com/pulse'),

        # With only one slash ("http:/www.")
        ('http:/www.theinquirer.net/inquirer/news/2322928/net-neutrality-rules-lie-in-tatters-as-fcc-overruled',
         'http://www.theinquirer.net/inquirer/news/2322928/net-neutrality-rules-lie-in-tatters-as-fcc-overruled'),

        # missing / before ?
        ('http://foo.bar?baz=bat', 'http://foo.bar/?baz=bat'),

        # Whitespace
        ('  http://blogs.perl.org/users/domm/2010/11/posting-utf8-data-using-lwpuseragent.html  ',
         'http://blogs.perl.org/users/domm/2010/11/posting-utf8-data-using-lwpuseragent.html'),

        # Missing port
        ('https://www.gpo.gov:/fdsys/pkg/PLAW-107publ289/pdf/PLAW-107publ289.pdf',
         'https://www.gpo.gov/fdsys/pkg/PLAW-107publ289/pdf/PLAW-107publ289.pdf'),
    ]

    for broken_url, expected_url in test_cases:
        # Fix once
        repaired_once = mc_url.fix_common_url_mistakes(broken_url)
        assert mc_url.urls_are_equal(url1=repaired_once, url2=expected_url)

        # Fixing an already-fixed URL must not change it further
        repaired_twice = mc_url.fix_common_url_mistakes(repaired_once)
        assert mc_url.urls_are_equal(url1=repaired_twice, url2=expected_url)
예제 #2
0
def test_fix_common_url_mistakes():
    """fix_common_url_mistakes() repairs typical malformed URLs, and doing so is idempotent."""
    test_cases = [
        # "http://http://"
        ('http://http://www.al-monitor.com/pulse', 'http://www.al-monitor.com/pulse'),

        # With only one slash ("http:/www.")
        ('http:/www.theinquirer.net/inquirer/news/2322928/net-neutrality-rules-lie-in-tatters-as-fcc-overruled',
         'http://www.theinquirer.net/inquirer/news/2322928/net-neutrality-rules-lie-in-tatters-as-fcc-overruled'),

        # missing / before ?
        ('http://foo.bar?baz=bat', 'http://foo.bar/?baz=bat'),

        # Whitespace
        ('  http://blogs.perl.org/users/domm/2010/11/posting-utf8-data-using-lwpuseragent.html  ',
         'http://blogs.perl.org/users/domm/2010/11/posting-utf8-data-using-lwpuseragent.html'),

        # Missing port
        ('https://www.gpo.gov:/fdsys/pkg/PLAW-107publ289/pdf/PLAW-107publ289.pdf',
         'https://www.gpo.gov/fdsys/pkg/PLAW-107publ289/pdf/PLAW-107publ289.pdf'),

        # Non-URLencoded space
        ('http://www.ldeo.columbia.edu/~peter/ site/Home.html',
         'http://www.ldeo.columbia.edu/~peter/%20site/Home.html'),
    ]

    for broken_url, expected_url in test_cases:
        # Fix once
        repaired_once = mc_url.fix_common_url_mistakes(broken_url)
        assert mc_url.urls_are_equal(url1=repaired_once, url2=expected_url)

        # Fixing an already-fixed URL must not change it further
        repaired_twice = mc_url.fix_common_url_mistakes(repaired_once)
        assert mc_url.urls_are_equal(url1=repaired_twice, url2=expected_url)
예제 #3
0
def test_target_request_from_linkis_com_url():
    """target_request_from_linkis_com_url() should extract the archived target URL
    from a linkis.com page, whichever supported markup variant carries it, and
    return None for non-linkis.com URLs."""
    # linkis.com <meta>
    assert urls_are_equal(
        url1=target_request_from_linkis_com_url(
            content='<meta property="og:url" content="http://og.url/test"',
            archive_site_url='https://linkis.com/foo.com/ASDF'
        ).url(),
        url2='http://og.url/test',
    )

    # linkis.com YouTube
    assert urls_are_equal(
        url1=target_request_from_linkis_com_url(
            content='<a class="js-youtube-ln-event" href="http://you.tube/test"',
            archive_site_url='https://linkis.com/foo.com/ASDF'
        ).url(),
        url2='http://you.tube/test',
    )

    # 'linkis.com <iframe>'
    assert urls_are_equal(
        url1=target_request_from_linkis_com_url(
            content='<iframe id="source_site" src="http://source.site/test"',
            archive_site_url='https://linkis.com/foo.com/ASDF'
        ).url(),
        url2='http://source.site/test',
    )

    # linkis.com JavaScript
    # Raw string: "\/" is not a recognized Python escape sequence (SyntaxWarning on
    # Python 3.12+, eventually a SyntaxError); the raw literal keeps the
    # JSON-escaped slashes verbatim, identical at runtime to the old literal.
    assert urls_are_equal(
        url1=target_request_from_linkis_com_url(
            content=r'"longUrl":"http:\/\/java.script\/test"',
            archive_site_url='https://linkis.com/foo.com/ASDF'
        ).url(),
        url2='http://java.script/test',
    )

    # linkis.com with non-matching URL
    assert target_request_from_linkis_com_url(
        content='<meta property="og:url" content="http://og.url/test"',
        archive_site_url='https://bar.com/foo/bar'
    ) is None
def test_target_request_from_linkis_com_url():
    """Each supported linkis.com markup variant should yield the archived target URL."""
    linkis_page_url = 'https://linkis.com/foo.com/ASDF'

    # linkis.com <meta>
    meta_request = target_request_from_linkis_com_url(
        content='<meta property="og:url" content="http://og.url/test"',
        archive_site_url=linkis_page_url,
    )
    assert urls_are_equal(url1=meta_request.url(), url2='http://og.url/test')

    # linkis.com YouTube
    youtube_request = target_request_from_linkis_com_url(
        content='<a class="js-youtube-ln-event" href="http://you.tube/test"',
        archive_site_url=linkis_page_url,
    )
    assert urls_are_equal(url1=youtube_request.url(), url2='http://you.tube/test')

    # 'linkis.com <iframe>'
    iframe_request = target_request_from_linkis_com_url(
        content='<iframe id="source_site" src="http://source.site/test"',
        archive_site_url=linkis_page_url,
    )
    assert urls_are_equal(url1=iframe_request.url(), url2='http://source.site/test')

    # linkis.com JavaScript
    js_request = target_request_from_linkis_com_url(
        content=r'"longUrl":"http:\/\/java.script\/test"',
        archive_site_url=linkis_page_url,
    )
    assert urls_are_equal(url1=js_request.url(), url2='http://java.script/test')

    # linkis.com with non-matching URL
    assert target_request_from_linkis_com_url(
        content='<meta property="og:url" content="http://og.url/test"',
        archive_site_url='https://bar.com/foo/bar',
    ) is None
예제 #5
0
def test_target_request_from_alarabiya_url():
    """target_request_from_alarabiya_url() should replay the JavaScript-set cookie
    against the original article URL, and return None for non-Alarabiya URLs."""
    # Alarabiya URL
    cookie_name = 'YPF8827340282Jdskjhfiw_928937459182JAX666'
    cookie_value = '78.60.231.222'
    page_content = """

        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html>
        <head>
        <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
        <meta http-equiv="Content-Script-Type" content="text/javascript">
        <script type="text/javascript">

        // ...

        setCookie('%(cookie_name)s', '%(cookie_value)s', 10);

        // ...

        </script>
        </head>
        <body>
        <noscript>This site requires JavaScript and Cookies to be enabled. Please change your browser settings or
        upgrade your browser.</noscript>
        </body>
        </html>

    """ % {'cookie_name': cookie_name, 'cookie_value': cookie_value}

    article_url = (
        'https://english.alarabiya.net/en/News/middle-east/2017/07/21/Israel-bars-Muslim-men-under-50-from-'
        'entering-Al-Aqsa-for-Friday-prayers.html')

    extracted_request = target_request_from_alarabiya_url(
        content=page_content, archive_site_url=article_url)

    # Re-requests the same URL...
    assert urls_are_equal(url1=extracted_request.url(), url2=article_url)
    # ...but with the cookie the page's JavaScript would have set
    assert extracted_request.header('Cookie') == '%s=%s' % (cookie_name, cookie_value)

    # Non-Alarabiya URL
    assert target_request_from_alarabiya_url(
        content=page_content,
        archive_site_url='http://some-other-url.com/') is None
예제 #6
0
def test_target_request_from_archive_org_url():
    """target_request_from_archive_org_url() should strip the Wayback Machine prefix."""
    original_url = 'http://www.john-daly.com/hockey/hockey.htm'

    # archive.org
    wayback_request = target_request_from_archive_org_url(
        content=None,
        archive_site_url='https://web.archive.org/web/20150204024130/http://www.john-daly.com/hockey/hockey.htm',
    )
    assert urls_are_equal(url1=wayback_request.url(), url2=original_url)

    # archive.org with non-matching URL
    assert target_request_from_archive_org_url(content=None, archive_site_url=original_url) is None
def test_target_request_from_archive_org_url():
    """target_request_from_archive_org_url() should strip the Wayback Machine prefix."""
    target_url = 'http://www.john-daly.com/hockey/hockey.htm'
    snapshot_url = 'https://web.archive.org/web/20150204024130/http://www.john-daly.com/hockey/hockey.htm'

    # archive.org
    unwrapped = target_request_from_archive_org_url(content=None, archive_site_url=snapshot_url)
    assert urls_are_equal(url1=unwrapped.url(), url2=target_url)

    # archive.org with non-matching URL
    assert target_request_from_archive_org_url(content=None, archive_site_url=target_url) is None
예제 #8
0
def test_target_request_from_archive_is_url():
    """target_request_from_archive_is_url() should recover the original URL from archive.is pages."""
    # archive.is
    archived_page = """
                <link rel="canonical" href="https://archive.is/20170201/https://bar.com/foo/bar">
            """
    extracted = target_request_from_archive_is_url(
        content=archived_page,
        archive_site_url='https://archive.is/20170201/https://bar.com/foo/bar',
    )
    assert urls_are_equal(url1=extracted.url(), url2='https://bar.com/foo/bar')

    # archive.is with non-matching URL
    other_page = """
            <link rel="canonical" href="https://archive.is/20170201/https://bar.com/foo/bar">
        """
    assert target_request_from_archive_is_url(
        content=other_page,
        archive_site_url='https://bar.com/foo/bar',
    ) is None
def test_target_request_from_archive_is_url():
    """target_request_from_archive_is_url() should recover the original URL from archive.is pages."""
    # archive.is
    canonical_markup = """
                <link rel="canonical" href="https://archive.is/20170201/https://bar.com/foo/bar">
            """
    request = target_request_from_archive_is_url(
        content=canonical_markup,
        archive_site_url='https://archive.is/20170201/https://bar.com/foo/bar',
    )
    assert urls_are_equal(url1=request.url(), url2='https://bar.com/foo/bar')

    # archive.is with non-matching URL
    non_archive_markup = """
            <link rel="canonical" href="https://archive.is/20170201/https://bar.com/foo/bar">
        """
    no_match = target_request_from_archive_is_url(
        content=non_archive_markup,
        archive_site_url='https://bar.com/foo/bar',
    )
    assert no_match is None
예제 #10
0
def test_target_request_from_meta_refresh_url():
    """target_request_from_meta_refresh_url() should follow a <meta> refresh target."""
    # <meta> refresh
    refresh_page = """
                <HTML>
                <HEAD>
                    <TITLE>This is a test</TITLE>
                    <META HTTP-EQUIV="content-type" CONTENT="text/html; charset=UTF-8">
                    <META HTTP-EQUIV="refresh" CONTENT="0; URL=http://example.com/">
                </HEAD>
                <BODY>
                    <P>This is a test.</P>
                </BODY>
                </HTML>
            """
    refresh_request = target_request_from_meta_refresh_url(
        content=refresh_page,
        archive_site_url='http://example2.com/',
    )
    assert urls_are_equal(url1=refresh_request.url(), url2='http://example.com/')
def test_target_request_from_alarabiya_url():
    """target_request_from_alarabiya_url() should replay the JavaScript-set cookie
    against the original article URL, and return None for non-Alarabiya URLs."""
    # Alarabiya URL
    expected_cookie_name = 'YPF8827340282Jdskjhfiw_928937459182JAX666'
    expected_cookie_value = '78.60.231.222'
    html_with_cookie = """

        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html>
        <head>
        <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
        <meta http-equiv="Content-Script-Type" content="text/javascript">
        <script type="text/javascript">

        // ...

        setCookie('%(cookie_name)s', '%(cookie_value)s', 10);

        // ...

        </script>
        </head>
        <body>
        <noscript>This site requires JavaScript and Cookies to be enabled. Please change your browser settings or
        upgrade your browser.</noscript>
        </body>
        </html>

    """ % {
        'cookie_name': expected_cookie_name,
        'cookie_value': expected_cookie_value,
    }

    article_url = ('https://english.alarabiya.net/en/News/middle-east/2017/07/21/Israel-bars-Muslim-men-under-50-from-'
                   'entering-Al-Aqsa-for-Friday-prayers.html')

    replay_request = target_request_from_alarabiya_url(content=html_with_cookie, archive_site_url=article_url)

    # Same URL is re-requested, now carrying the JavaScript-set cookie
    assert urls_are_equal(url1=replay_request.url(), url2=article_url)
    assert replay_request.header('Cookie') == '%s=%s' % (expected_cookie_name, expected_cookie_value)

    # Non-Alarabiya URL
    assert target_request_from_alarabiya_url(
        content=html_with_cookie,
        archive_site_url='http://some-other-url.com/',
    ) is None
def test_target_request_from_meta_refresh_url():
    """target_request_from_meta_refresh_url() should follow a <meta> refresh target."""
    # <meta> refresh
    html_with_refresh = """
                <HTML>
                <HEAD>
                    <TITLE>This is a test</TITLE>
                    <META HTTP-EQUIV="content-type" CONTENT="text/html; charset=UTF-8">
                    <META HTTP-EQUIV="refresh" CONTENT="0; URL=http://example.com/">
                </HEAD>
                <BODY>
                    <P>This is a test.</P>
                </BODY>
                </HTML>
            """
    extracted = target_request_from_meta_refresh_url(
        content=html_with_refresh,
        archive_site_url='http://example2.com/',
    )
    assert urls_are_equal(url1=extracted.url(), url2='http://example.com/')
예제 #13
0
    def __get_follow_http_html_redirects_follow_redirects(self,
                                                          response_: Response,
                                                          meta_redirects_left: int) -> Union[Response, None]:
        """Follow HTML-level (non-HTTP) redirects found in a successful response's content.

        Tries each known HTML redirect extractor (<meta refresh>, archive.org,
        archive.is, linkis.com, alarabiya.net) against the response body; if one
        matches and points to a different URL, fetches that URL, links the new
        response chain back to this response via set_previous(), and recurses
        with one fewer allowed meta redirect.

        :param response_: response whose content should be scanned for HTML redirects
        :param meta_redirects_left: how many more HTML redirects may still be followed
        :return: the final Response, or None if the initial request was unsuccessful
        :raises McGetFollowHTTPHTMLRedirectsException: if response_ is None or the
            initial response of the redirect chain can't be located
        """

        from mediawords.util.web.user_agent.html_redirects import (
            target_request_from_meta_refresh_url,
            target_request_from_archive_org_url,
            target_request_from_archive_is_url,
            target_request_from_linkis_com_url,
            target_request_from_alarabiya_url,
        )

        if response_ is None:
            raise McGetFollowHTTPHTMLRedirectsException("Response is None.")

        if response_.is_success():

            base_url = get_base_url(response_.request().url())

            # Extractors are tried in order; the first one that returns a request wins
            html_redirect_functions = [
                target_request_from_meta_refresh_url,
                target_request_from_archive_org_url,
                target_request_from_archive_is_url,
                target_request_from_linkis_com_url,
                target_request_from_alarabiya_url,
            ]
            for html_redirect_function in html_redirect_functions:
                request_after_meta_redirect = html_redirect_function(
                    content=response_.decoded_content(),
                    archive_site_url=base_url,
                )
                if request_after_meta_redirect is not None:
                    log.warning(
                        "meta redirect from %s: %s" % (html_redirect_function, request_after_meta_redirect.url()))
                    # Only follow the redirect if it actually points somewhere else
                    if not urls_are_equal(url1=response_.request().url(), url2=request_after_meta_redirect.url()):

                        log.debug("URL after HTML redirects: %s" % request_after_meta_redirect.url())

                        orig_redirect_response = self.request(request=request_after_meta_redirect)
                        redirect_response = orig_redirect_response

                        # Response might have its previous() already set due to HTTP redirects,
                        # so we have to find the initial response first
                        previous = None
                        for x in range(self.max_redirect() + 1):
                            previous = redirect_response.previous()
                            if previous is None:
                                break
                            redirect_response = previous

                        # Loop never reached the chain's start within max_redirect() hops
                        if previous is not None:
                            raise McGetFollowHTTPHTMLRedirectsException(
                                "Can't find the initial redirected response; URL: %s" %
                                request_after_meta_redirect.url()
                            )

                        log.debug("Setting previous of URL %(url)s to %(previous_url)s" % {
                            'url': redirect_response.request().url(),
                            'previous_url': response_.request().url(),
                        })
                        # Splice the new chain onto the response that contained the HTML redirect
                        redirect_response.set_previous(response_)

                        meta_redirects_left = meta_redirects_left - 1

                        # Recurse: the fetched page might itself contain another HTML redirect
                        return self.__get_follow_http_html_redirects(
                            response_=orig_redirect_response,
                            meta_redirects_left=meta_redirects_left,
                        )

            # No <meta /> refresh, the current URL is the final one
            return response_

        else:
            log.debug("Request to %s was unsuccessful: %s" % (response_.request().url(), response_.status_line(),))

            # Return the original URL and give up
            return None
예제 #14
0
def test_http_hash_server():
    """End-to-end test of HashServer: static pages, redirects, path normalization,
    HTTP basic auth, custom callbacks and POST handling against a live local server."""
    port = random_unused_port()
    base_url = 'http://localhost:%d' % port

    def __simple_callback(request: HashServer.Request) -> Union[str, bytes]:
        # Echoes the request's properties back as a JSON document
        r = ""
        r += "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: application/json; charset=UTF-8\r\n"
        r += "\r\n"
        r += json.dumps({
            'name': 'callback',
            'method': request.method(),
            'url': request.url(),
            'content-type': request.content_type(),
            'params': request.query_params(),
            'cookies': request.cookies(),
        })
        return str.encode(r)

    # noinspection PyUnusedLocal
    def __callback_cookie_redirect(request: HashServer.Request) -> str:
        # 302 redirect that also sets a cookie
        r = ""
        r += "HTTP/1.0 302 Moved Temporarily\r\n"
        r += "Content-Type: text/html; charset=UTF-8\r\n"
        r += "Location: /check_cookie\r\n"
        r += "Set-Cookie: test_cookie=I'm a cookie and I know it!\r\n"
        r += "\r\n"
        r += "Redirecting to the cookie check page..."
        return r

    def __callback_post(request: HashServer.Request) -> Union[str, bytes]:
        # Echoes the POSTed body back as JSON
        r = ""
        r += "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: application/json; charset=UTF-8\r\n"
        r += "\r\n"
        r += json.dumps({
            'name': 'callback_post',
            'post_data': request.content(),
        })
        return str.encode(r)

    # Page map deliberately mixes str / bytes keys and values to exercise decoding
    pages = {
        '/': 'home',
        '/foo': b'foo',
        '/bar': 'bar ąą',
        '/foo-bar': {b'redirect': b'/bar'},
        '/localhost': {'redirect': "http://localhost:%d/" % port},
        b'/127-foo': {b'redirect': "http://127.0.0.1:%d/foo" % port},
        '/auth': {b'auth': b'foo:bar', b'content': b"foo bar \xf0\x90\x28\xbc"},
        '/404': {b'content': b'not found', b'http_status_code': 404},
        '/callback': {b'callback': __simple_callback},

        # Test setting cookies, redirects
        '/callback_cookie_redirect': {'callback': __callback_cookie_redirect},

        # POST data
        '/callback_post': {'callback': __callback_post},
    }

    hs = HashServer(port=port, pages=pages)
    assert hs

    # Server must be started explicitly before any request is made
    hs.start()

    assert tcp_port_is_open(port=port)

    # Static pages and redirects
    assert str(requests.get('%s/' % base_url).text) == 'home'
    assert str(requests.get('%s/foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar' % base_url).text) == 'bar ąą'
    assert str(requests.get('%s/foo-bar' % base_url).text) == 'bar ąą'
    assert str(requests.get('%s/localhost' % base_url).text) == 'home'
    assert str(requests.get('%s/127-foo' % base_url).text) == 'foo'

    # Path normalization
    assert str(requests.get('%s//' % base_url).text) == 'home'
    assert str(requests.get('%s///' % base_url).text) == 'home'
    assert str(requests.get('%s/something/../' % base_url).text) == 'home'
    assert str(requests.get('%s/something/..//' % base_url).text) == 'home'
    assert str(requests.get('%s/something/..///' % base_url).text) == 'home'
    assert str(requests.get('%s/foo/' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo//' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo///' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo/' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo//' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo///' % base_url).text) == 'foo'

    # Custom callback: query params and cookies are echoed back
    response_json = requests.get('%s/callback?a=b&c=d' % base_url, cookies={'cookie_name': 'cookie_value'}).json()
    assert response_json == {
        'name': 'callback',
        'method': 'GET',
        'url': 'http://localhost:%d/callback?a=b&c=d' % port,
        'content-type': None,
        'params': {
            'a': 'b',
            'c': 'd',
        },
        'cookies': {
            'cookie_name': 'cookie_value',
        },
    }

    # Cookie-setting redirect (don't follow it so the 302 itself can be inspected)
    response = requests.get('%s/callback_cookie_redirect' % base_url, allow_redirects=False)
    assert response.status_code == 302
    assert response.headers['Location'] == '/check_cookie'

    response = requests.get("%s/404" % base_url)
    assert response.status_code == HTTPStatus.NOT_FOUND.value
    assert 'Not Found' in response.reason

    # HTTP basic auth: wrong / missing credentials are rejected
    auth_url = "%s/auth" % base_url

    assert requests.get(auth_url).status_code == HTTPStatus.UNAUTHORIZED
    assert requests.get(auth_url, auth=('foo', 'foo')).status_code == HTTPStatus.UNAUTHORIZED

    response = requests.get(auth_url, auth=('foo', 'bar'))
    assert response.status_code == HTTPStatus.OK
    assert response.content == b"foo bar \xf0\x90\x28\xbc"

    # page_url() resolves a configured path (dropping the query string) and
    # raises for unknown paths
    assert urls_are_equal(url1=hs.page_url('/callback?a=b&c=d'), url2='http://localhost:%d/callback' % port)
    with pytest.raises(McHashServerException):
        hs.page_url('/does-not-exist')

    # POST body is echoed back by the callback
    response_json = requests.post('%s/callback_post' % base_url, data='abc=def').json()
    assert response_json == {
        'name': 'callback_post',
        'post_data': 'abc=def',
    }

    hs.stop()
예제 #15
0
def test_urls_are_equal():
    """urls_are_equal() should compare URLs after normalization (case, slashes, default ports)."""
    # Invalid input
    for url1, url2 in ((None, None), (None, 'https://web.mit.edu/'), ('https://web.mit.edu/', None)):
        with pytest.raises(mc_url.McURLsAreEqualException):
            # noinspection PyTypeChecker
            mc_url.urls_are_equal(url1=url1, url2=url2)

    # Not URLs
    assert mc_url.urls_are_equal(url1='Not an URL.', url2='Not an URL.') is False

    funky_url = ('http://Las%20Vegas%20mass%20shooting%20raises%20new%20'
                 'doubts%20about%20safety%20of%20live%20entertainment')
    assert mc_url.urls_are_equal(url1=funky_url, url2=funky_url) is False

    # Host case, redundant slashes and explicit default ports don't matter
    https_variants = [
        'https://web.mit.edu/',
        'https://WEB.MIT.EDU/',
        'https://WEB.MIT.EDU//',
        'https://WEB.MIT.EDU:443',
        'https://WEB.MIT.EDU:443/',
        'https://WEB.MIT.EDU:443//',
    ]
    for variant in https_variants:
        assert mc_url.urls_are_equal(url1='https://web.mit.edu/', url2=variant) is True
    assert mc_url.urls_are_equal(url1='http://web.mit.edu/', url2='http://WEB.MIT.EDU:80//') is True

    # A different path is never equal
    assert mc_url.urls_are_equal(url1='https://web.mit.edu/', url2='https://WEB.MIT.EDU:443//page') is False
예제 #16
0
    def test_get_topic_url_variants(self):
        """all_url_variants() should return the URL itself plus every merged-story URL,
        topic link URL / redirect URL, and alternate link URL reachable from it."""
        media = create_test_story_stack(
            db=self.db(),
            data={
                'A': {
                    'B': [1, 2, 3],
                    'C': [4, 5, 6],
                },
                'D': {
                    'E': [7, 8, 9],
                }
            }
        )

        story_1 = media['A']['feeds']['B']['stories']['1']
        story_2 = media['A']['feeds']['B']['stories']['2']
        story_3 = media['A']['feeds']['B']['stories']['3']
        story_4 = media['A']['feeds']['C']['stories']['4']

        # Merge chain: story_3 -> story_2 -> story_1
        self.db().query("""
            INSERT INTO topic_merged_stories_map (source_stories_id, target_stories_id)
            VALUES (%(source_stories_id)s, %(target_stories_id)s)
        """, {
            'source_stories_id': story_2['stories_id'],
            'target_stories_id': story_1['stories_id'],
        })

        self.db().query("""
            INSERT INTO topic_merged_stories_map (source_stories_id, target_stories_id)
            VALUES (%(source_stories_id)s, %(target_stories_id)s)
        """, {
            'source_stories_id': story_3['stories_id'],
            'target_stories_id': story_2['stories_id'],
        })

        self.db().create(
            table='tag_sets',
            insert_hash={'name': 'foo'},
        )

        topic = create_test_topic(db=self.db(), label='foo')

        # Add stories 1, 2, 3 and 4 to the topic; story_4 links to the others
        self.db().create(
            table='topic_stories',
            insert_hash={
                'topics_id': topic['topics_id'],
                'stories_id': story_4['stories_id'],
            }
        )

        self.db().create(
            table='topic_stories',
            insert_hash={
                'topics_id': topic['topics_id'],
                'stories_id': story_1['stories_id'],
            }
        )

        # Link to story_1 with both a plain URL and a redirect URL
        self.db().create(
            table='topic_links',
            insert_hash={
                'topics_id': topic['topics_id'],
                'stories_id': story_4['stories_id'],
                'ref_stories_id': story_1['stories_id'],
                'url': story_1['url'],
                'redirect_url': story_1['url'] + "/redirect_url",
            }
        )

        self.db().create(
            table='topic_stories',
            insert_hash={
                'topics_id': topic['topics_id'],
                'stories_id': story_2['stories_id'],
            }
        )

        # Link to story_2 with both a plain URL and a redirect URL
        self.db().create(
            table='topic_links',
            insert_hash={
                'topics_id': topic['topics_id'],
                'stories_id': story_4['stories_id'],
                'ref_stories_id': story_2['stories_id'],
                'url': story_2['url'],
                'redirect_url': story_2['url'] + "/redirect_url",
            }
        )

        self.db().create(
            table='topic_stories',
            insert_hash={
                'topics_id': topic['topics_id'],
                'stories_id': story_3['stories_id']
            }
        )

        # Link to story_3 with an alternate URL only (no redirect)
        self.db().create(
            table='topic_links',
            insert_hash={
                'topics_id': topic['topics_id'],
                'stories_id': story_4['stories_id'],
                'ref_stories_id': story_3['stories_id'],
                'url': story_3['url'] + '/alternate',
            }
        )

        # Query with extra cruft appended; variants should still be found
        test_url = story_1['url'] + self.CRUFT

        expected_urls = {
            story_1['url'],
            story_1['url'] + self.CRUFT,
            story_2['url'],
            story_1['url'] + "/redirect_url",
            story_2['url'] + "/redirect_url",
            story_3['url'],
            story_3['url'] + "/alternate",
        }

        url_variants = all_url_variants(db=self.db(), url=test_url)

        assert len(expected_urls) == len(url_variants)

        # Compare pairwise after sorting (order of variants is unspecified)
        sorted_expected_urls = sorted(expected_urls)
        sorted_url_variants = sorted(url_variants)

        for i in range(len(sorted_expected_urls)):
            assert urls_are_equal(url1=sorted_expected_urls[i], url2=sorted_url_variants[i])
예제 #17
0
def test_http_hash_server():
    """End-to-end test of HashServer: static pages, redirects, path normalization,
    HTTP basic auth, custom callbacks and POST handling against a live local server."""
    port = random_unused_port()
    base_url = 'http://localhost:%d' % port

    def __simple_callback(request: HashServer.Request) -> Union[str, bytes]:
        # Echoes the request's properties back as a JSON document
        r = ""
        r += "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: application/json; charset=UTF-8\r\n"
        r += "\r\n"
        r += json.dumps({
            'name': 'callback',
            'method': request.method(),
            'url': request.url(),
            'content-type': request.content_type(),
            'params': request.query_params(),
            'cookies': request.cookies(),
        })
        return str.encode(r)

    # noinspection PyUnusedLocal
    def __callback_cookie_redirect(request: HashServer.Request) -> str:
        # 302 redirect that also sets a cookie
        r = ""
        r += "HTTP/1.0 302 Moved Temporarily\r\n"
        r += "Content-Type: text/html; charset=UTF-8\r\n"
        r += "Location: /check_cookie\r\n"
        r += "Set-Cookie: test_cookie=I'm a cookie and I know it!\r\n"
        r += "\r\n"
        r += "Redirecting to the cookie check page..."
        return r

    def __callback_post(request: HashServer.Request) -> Union[str, bytes]:
        # Echoes the POSTed body back as JSON
        r = ""
        r += "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: application/json; charset=UTF-8\r\n"
        r += "\r\n"
        r += json.dumps({
            'name': 'callback_post',
            'post_data': request.content(),
        })
        return str.encode(r)

    # Page map deliberately mixes str / bytes keys and values to exercise decoding
    pages = {
        '/': 'home',
        '/foo': b'foo',
        '/bar': 'bar ąą',
        '/foo-bar': {
            b'redirect': b'/bar'
        },
        '/localhost': {
            'redirect': "http://localhost:%d/" % port
        },
        b'/127-foo': {
            b'redirect': "http://127.0.0.1:%d/foo" % port
        },
        '/auth': {
            b'auth': b'foo:bar',
            b'content': b"foo bar \xf0\x90\x28\xbc"
        },
        '/404': {
            b'content': b'not found',
            b'http_status_code': 404
        },
        '/callback': {
            b'callback': __simple_callback
        },

        # Test setting cookies, redirects
        '/callback_cookie_redirect': {
            'callback': __callback_cookie_redirect
        },

        # POST data
        '/callback_post': {
            'callback': __callback_post
        },
    }

    hs = HashServer(port=port, pages=pages)
    assert hs

    # Server must be started explicitly before any request is made
    hs.start()

    assert tcp_port_is_open(port=port)

    # Static pages and redirects
    assert str(requests.get('%s/' % base_url).text) == 'home'
    assert str(requests.get('%s/foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar' % base_url).text) == 'bar ąą'
    assert str(requests.get('%s/foo-bar' % base_url).text) == 'bar ąą'
    assert str(requests.get('%s/localhost' % base_url).text) == 'home'
    assert str(requests.get('%s/127-foo' % base_url).text) == 'foo'

    # Path normalization
    assert str(requests.get('%s//' % base_url).text) == 'home'
    assert str(requests.get('%s///' % base_url).text) == 'home'
    assert str(requests.get('%s/something/../' % base_url).text) == 'home'
    assert str(requests.get('%s/something/..//' % base_url).text) == 'home'
    assert str(requests.get('%s/something/..///' % base_url).text) == 'home'
    assert str(requests.get('%s/foo/' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo//' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo///' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo/' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo//' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo///' % base_url).text) == 'foo'

    # Custom callback: query params and cookies are echoed back
    response_json = requests.get('%s/callback?a=b&c=d' % base_url,
                                 cookies={
                                     'cookie_name': 'cookie_value'
                                 }).json()
    assert response_json == {
        'name': 'callback',
        'method': 'GET',
        'url': 'http://localhost:%d/callback?a=b&c=d' % port,
        'content-type': None,
        'params': {
            'a': 'b',
            'c': 'd',
        },
        'cookies': {
            'cookie_name': 'cookie_value',
        },
    }

    # Cookie-setting redirect (don't follow it so the 302 itself can be inspected)
    response = requests.get('%s/callback_cookie_redirect' % base_url,
                            allow_redirects=False)
    assert response.status_code == 302
    assert response.headers['Location'] == '/check_cookie'

    response = requests.get("%s/404" % base_url)
    assert response.status_code == HTTPStatus.NOT_FOUND.value
    assert 'Not Found' in response.reason

    # HTTP basic auth: wrong / missing credentials are rejected
    auth_url = "%s/auth" % base_url

    assert requests.get(auth_url).status_code == HTTPStatus.UNAUTHORIZED
    assert requests.get(auth_url,
                        auth=('foo',
                              'foo')).status_code == HTTPStatus.UNAUTHORIZED

    response = requests.get(auth_url, auth=('foo', 'bar'))
    assert response.status_code == HTTPStatus.OK
    assert response.content == b"foo bar \xf0\x90\x28\xbc"

    # page_url() resolves a configured path (dropping the query string) and
    # raises for unknown paths
    assert urls_are_equal(url1=hs.page_url('/callback?a=b&c=d'),
                          url2='http://localhost:%d/callback' % port)
    with pytest.raises(McHashServerException):
        hs.page_url('/does-not-exist')

    # POST body is echoed back by the callback
    response_json = requests.post('%s/callback_post' % base_url,
                                  data='abc=def').json()
    assert response_json == {
        'name': 'callback_post',
        'post_data': 'abc=def',
    }

    hs.stop()
예제 #18
0
def test_urls_are_equal():
    """Test mc_url.urls_are_equal() against invalid, non-URL and equivalent inputs."""

    # Passing None for either side (or both) must raise, not return a bool.
    for bad_url1, bad_url2 in [
        (None, None),
        (None, 'https://web.mit.edu/'),
        ('https://web.mit.edu/', None),
    ]:
        with pytest.raises(mc_url.McURLsAreEqualException):
            # noinspection PyTypeChecker
            mc_url.urls_are_equal(url1=bad_url1, url2=bad_url2)

    # Strings that aren't URLs never compare as equal, even to themselves.
    assert mc_url.urls_are_equal(url1='Not an URL.',
                                 url2='Not an URL.') is False

    funky_url = ('http://Las%20Vegas%20mass%20shooting%20raises%20new%20'
                 'doubts%20about%20safety%20of%20live%20entertainment')
    assert mc_url.urls_are_equal(url1=funky_url, url2=funky_url) is False

    # Hostname case, redundant trailing slashes and the scheme's default
    # port must all be treated as equivalent.
    for https_variant in [
        'https://web.mit.edu/',
        'https://WEB.MIT.EDU/',
        'https://WEB.MIT.EDU//',
        'https://WEB.MIT.EDU:443',
        'https://WEB.MIT.EDU:443/',
        'https://WEB.MIT.EDU:443//',
    ]:
        assert mc_url.urls_are_equal(url1='https://web.mit.edu/',
                                     url2=https_variant) is True
    assert mc_url.urls_are_equal(url1='http://web.mit.edu/',
                                 url2='http://WEB.MIT.EDU:80//') is True

    # A different path is a different URL.
    assert mc_url.urls_are_equal(url1='https://web.mit.edu/',
                                 url2='https://WEB.MIT.EDU:443//page') is False
예제 #19
0
    def test_get_topic_url_variants(self):
        """all_url_variants() should collect every known URL variant of a story:
        the original URL, cruft-suffixed URL, merged-story URLs, topic link
        URLs and their redirect targets."""

        media = create_test_story_stack(db=self.db(),
                                        data={
                                            'A': {
                                                'B': [1, 2, 3],
                                                'C': [4, 5, 6],
                                            },
                                            'D': {
                                                'E': [7, 8, 9],
                                            }
                                        })

        feed_b_stories = media['A']['feeds']['B']['stories']
        s1 = feed_b_stories['1']
        s2 = feed_b_stories['2']
        s3 = feed_b_stories['3']
        s4 = media['A']['feeds']['C']['stories']['4']

        merged_sql = """
            INSERT INTO topic_merged_stories_map (source_stories_id, target_stories_id)
            VALUES (%(source_stories_id)s, %(target_stories_id)s)
        """

        # Chain of merges: story 2 was merged into story 1, story 3 into story 2.
        for source_story, target_story in ((s2, s1), (s3, s2)):
            self.db().query(merged_sql, {
                'source_stories_id': source_story['stories_id'],
                'target_stories_id': target_story['stories_id'],
            })

        self.db().create(
            table='tag_sets',
            insert_hash={'name': 'foo'},
        )

        topic = create_test_topic(db=self.db(), label='foo')

        def add_topic_story(story):
            # Associate a story with the test topic.
            self.db().create(table='topic_stories',
                             insert_hash={
                                 'topics_id': topic['topics_id'],
                                 'stories_id': story['stories_id'],
                             })

        def add_topic_link(ref_story, url, redirect_url=None):
            # Record a topic link from story 4 to ref_story, optionally with
            # the URL it redirected to.
            link = {
                'topics_id': topic['topics_id'],
                'stories_id': s4['stories_id'],
                'ref_stories_id': ref_story['stories_id'],
                'url': url,
            }
            if redirect_url is not None:
                link['redirect_url'] = redirect_url
            self.db().create(table='topic_links', insert_hash=link)

        add_topic_story(s4)
        add_topic_story(s1)
        add_topic_link(s1, s1['url'], s1['url'] + "/redirect_url")
        add_topic_story(s2)
        add_topic_link(s2, s2['url'], s2['url'] + "/redirect_url")
        add_topic_story(s3)
        add_topic_link(s3, s3['url'] + '/alternate')

        test_url = s1['url'] + self.CRUFT

        expected_urls = {
            s1['url'],
            s1['url'] + self.CRUFT,
            s2['url'],
            s1['url'] + "/redirect_url",
            s2['url'] + "/redirect_url",
            s3['url'],
            s3['url'] + "/alternate",
        }

        url_variants = all_url_variants(db=self.db(), url=test_url)

        assert len(expected_urls) == len(url_variants)

        # Compare pairwise after sorting; URL comparison has to be
        # normalization-aware, hence urls_are_equal() instead of ==.
        for expected_url, actual_url in zip(sorted(expected_urls),
                                            sorted(url_variants)):
            assert urls_are_equal(url1=expected_url, url2=actual_url)
예제 #20
0
        def __inner_follow_redirects(
                response_: Response,
                meta_redirects_left: int) -> Union[Response, None]:
            """Follow HTML-level redirects (<meta refresh> and archive-site
            wrappers) found in the body of a successful response.

            Returns the final Response after recursively following at most
            meta_redirects_left such redirects, or None when the request
            itself was unsuccessful.
            """

            # Function-scope import; presumably to avoid a circular import at
            # module load time — TODO confirm.
            from mediawords.util.web.user_agent.html_redirects import (
                target_request_from_meta_refresh_url,
                target_request_from_archive_org_url,
                target_request_from_archive_is_url,
                target_request_from_linkis_com_url,
                target_request_from_alarabiya_url,
            )

            if response_ is None:
                raise McGetFollowHTTPHTMLRedirectsException(
                    "Response is None.")

            if response_.is_success():

                # Base URL for resolving relative redirect targets.
                base_url = get_base_url(response_.request().url())

                # Each function inspects the page content and returns a new
                # Request when it recognizes its redirect pattern, else None.
                html_redirect_functions = [
                    target_request_from_meta_refresh_url,
                    target_request_from_archive_org_url,
                    target_request_from_archive_is_url,
                    target_request_from_linkis_com_url,
                    target_request_from_alarabiya_url,
                ]
                for html_redirect_function in html_redirect_functions:
                    request_after_meta_redirect = html_redirect_function(
                        content=response_.decoded_content(),
                        archive_site_url=base_url,
                    )
                    if request_after_meta_redirect is not None:
                        # Only follow the redirect if it points somewhere
                        # other than the current URL; otherwise we could
                        # loop on a self-refreshing page.
                        if not urls_are_equal(
                                url1=response_.request().url(),
                                url2=request_after_meta_redirect.url()):

                            log.debug("URL after HTML redirects: %s" %
                                      request_after_meta_redirect.url())

                            orig_redirect_response = self.request(
                                request=request_after_meta_redirect)
                            redirect_response = orig_redirect_response

                            # Response might have its previous() already set due to HTTP redirects,
                            # so we have to find the initial response first
                            previous = None
                            for x in range(self.max_redirect() + 1):
                                previous = redirect_response.previous()
                                if previous is None:
                                    break
                                redirect_response = previous

                            # Chain longer than max_redirect(): give up
                            # rather than walking it indefinitely.
                            if previous is not None:
                                raise McGetFollowHTTPHTMLRedirectsException(
                                    "Can't find the initial redirected response; URL: %s"
                                    % request_after_meta_redirect.url())

                            log.debug(
                                "Setting previous of URL %(url)s to %(previous_url)s"
                                % {
                                    'url': redirect_response.request().url(),
                                    'previous_url': response_.request().url(),
                                })
                            # Link the new redirect chain back to the
                            # response that triggered it.
                            redirect_response.set_previous(response_)

                            meta_redirects_left = meta_redirects_left - 1

                            # NOTE(review): recurses via __inner() from the
                            # enclosing scope (not visible in this chunk) —
                            # presumably a dispatcher that alternates between
                            # HTTP- and HTML-redirect handling; confirm
                            # against the full file.
                            return __inner(
                                response_=orig_redirect_response,
                                meta_redirects_left=meta_redirects_left,
                            )

                # No <meta /> refresh, the current URL is the final one
                return response_

            else:
                log.debug("Request to %s was unsuccessful: %s" % (
                    response_.request().url(),
                    response_.status_line(),
                ))

                # Return the original URL and give up
                return None