def test_fix_common_url_mistakes():
    """fix_common_url_mistakes() repairs typical malformed URLs, and fixing is idempotent."""
    urls = {
        # "http://http://"
        'http://http://www.al-monitor.com/pulse': 'http://www.al-monitor.com/pulse',

        # With only one slash ("http:/www.")
        'http:/www.theinquirer.net/inquirer/news/2322928/net-neutrality-rules-lie-in-tatters-as-fcc-overruled':
            'http://www.theinquirer.net/inquirer/news/2322928/net-neutrality-rules-lie-in-tatters-as-fcc-overruled',

        # missing / before ?
        'http://foo.bar?baz=bat': 'http://foo.bar/?baz=bat',

        # Whitespace
        ' http://blogs.perl.org/users/domm/2010/11/posting-utf8-data-using-lwpuseragent.html ':
            'http://blogs.perl.org/users/domm/2010/11/posting-utf8-data-using-lwpuseragent.html',

        # Missing port
        'https://www.gpo.gov:/fdsys/pkg/PLAW-107publ289/pdf/PLAW-107publ289.pdf':
            'https://www.gpo.gov/fdsys/pkg/PLAW-107publ289/pdf/PLAW-107publ289.pdf',
    }

    for broken_url, expected_url in urls.items():
        # A single fixing pass must produce the expected URL
        fixed_once = mc_url.fix_common_url_mistakes(broken_url)
        assert mc_url.urls_are_equal(url1=fixed_once, url2=expected_url)

        # Fixing an already-fixed URL must not change it any further
        fixed_twice = mc_url.fix_common_url_mistakes(fixed_once)
        assert mc_url.urls_are_equal(url1=fixed_twice, url2=expected_url)
def test_fix_common_url_mistakes():
    """fix_common_url_mistakes() repairs typical malformed URLs; applying it twice is a no-op."""
    test_cases = {
        # "http://http://"
        'http://http://www.al-monitor.com/pulse': 'http://www.al-monitor.com/pulse',

        # With only one slash ("http:/www.")
        'http:/www.theinquirer.net/inquirer/news/2322928/net-neutrality-rules-lie-in-tatters-as-fcc-overruled':
            'http://www.theinquirer.net/inquirer/news/2322928/net-neutrality-rules-lie-in-tatters-as-fcc-overruled',

        # missing / before ?
        'http://foo.bar?baz=bat': 'http://foo.bar/?baz=bat',

        # Whitespace
        ' http://blogs.perl.org/users/domm/2010/11/posting-utf8-data-using-lwpuseragent.html ':
            'http://blogs.perl.org/users/domm/2010/11/posting-utf8-data-using-lwpuseragent.html',

        # Missing port
        'https://www.gpo.gov:/fdsys/pkg/PLAW-107publ289/pdf/PLAW-107publ289.pdf':
            'https://www.gpo.gov/fdsys/pkg/PLAW-107publ289/pdf/PLAW-107publ289.pdf',

        # Non-URLencoded space
        'http://www.ldeo.columbia.edu/~peter/ site/Home.html':
            'http://www.ldeo.columbia.edu/~peter/%20site/Home.html',
    }

    for orig_url, fixed_url in test_cases.items():
        # Fix once
        assert mc_url.urls_are_equal(url1=mc_url.fix_common_url_mistakes(orig_url), url2=fixed_url)

        # Try fixing the same URL twice -- the result must be stable
        assert mc_url.urls_are_equal(
            url1=mc_url.fix_common_url_mistakes(mc_url.fix_common_url_mistakes(orig_url)),
            url2=fixed_url,
        )
def test_target_request_from_linkis_com_url():
    """target_request_from_linkis_com_url() extracts the target URL from linkis.com archive pages.

    Covers the four extraction strategies (<meta og:url>, YouTube anchor, <iframe>,
    JavaScript "longUrl") plus the non-linkis.com rejection path.
    """
    # linkis.com <meta>
    assert urls_are_equal(
        url1=target_request_from_linkis_com_url(
            content='<meta property="og:url" content="http://og.url/test"',
            archive_site_url='https://linkis.com/foo.com/ASDF'
        ).url(),
        url2='http://og.url/test',
    )

    # linkis.com YouTube
    assert urls_are_equal(
        url1=target_request_from_linkis_com_url(
            content='<a class="js-youtube-ln-event" href="http://you.tube/test"',
            archive_site_url='https://linkis.com/foo.com/ASDF'
        ).url(),
        url2='http://you.tube/test',
    )

    # 'linkis.com <iframe>'
    assert urls_are_equal(
        url1=target_request_from_linkis_com_url(
            content='<iframe id="source_site" src="http://source.site/test"',
            archive_site_url='https://linkis.com/foo.com/ASDF'
        ).url(),
        url2='http://source.site/test',
    )

    # linkis.com JavaScript
    # FIX: use a raw string -- "\/" is an invalid escape sequence in a plain string
    # literal (DeprecationWarning today, SyntaxError in future Python versions);
    # the content must contain literal backslash-escaped slashes.
    assert urls_are_equal(
        url1=target_request_from_linkis_com_url(
            content=r'"longUrl":"http:\/\/java.script\/test"',
            archive_site_url='https://linkis.com/foo.com/ASDF'
        ).url(),
        url2='http://java.script/test',
    )

    # linkis.com with non-matching URL
    assert target_request_from_linkis_com_url(
        content='<meta property="og:url" content="http://og.url/test"',
        archive_site_url='https://bar.com/foo/bar'
    ) is None
def test_target_request_from_linkis_com_url():
    """target_request_from_linkis_com_url() extracts the target URL from linkis.com archive pages."""
    linkis_url = 'https://linkis.com/foo.com/ASDF'

    # Content snippet -> expected extracted URL, one entry per extraction strategy
    extraction_cases = [
        # linkis.com <meta>
        ('<meta property="og:url" content="http://og.url/test"', 'http://og.url/test'),
        # linkis.com YouTube
        ('<a class="js-youtube-ln-event" href="http://you.tube/test"', 'http://you.tube/test'),
        # 'linkis.com <iframe>'
        ('<iframe id="source_site" src="http://source.site/test"', 'http://source.site/test'),
        # linkis.com JavaScript (raw string: content carries literal backslash-escaped slashes)
        (r'"longUrl":"http:\/\/java.script\/test"', 'http://java.script/test'),
    ]

    for page_content, expected_url in extraction_cases:
        extracted_request = target_request_from_linkis_com_url(
            content=page_content,
            archive_site_url=linkis_url,
        )
        assert urls_are_equal(url1=extracted_request.url(), url2=expected_url)

    # linkis.com with non-matching URL
    assert target_request_from_linkis_com_url(
        content='<meta property="og:url" content="http://og.url/test"',
        archive_site_url='https://bar.com/foo/bar'
    ) is None
def test_target_request_from_alarabiya_url():
    """target_request_from_alarabiya_url() replays the JavaScript-set cookie as a request header."""
    # Alarabiya URL
    test_cookie_name = 'YPF8827340282Jdskjhfiw_928937459182JAX666'
    test_cookie_value = '78.60.231.222'
    test_content = """
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html>
        <head>
            <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
            <meta http-equiv="Content-Script-Type" content="text/javascript">
            <script type="text/javascript">
                // ...
                setCookie('%(cookie_name)s', '%(cookie_value)s', 10);
                // ...
            </script>
        </head>
        <body>
            <noscript>This site requires JavaScript and Cookies to be enabled. Please change your browser settings or
            upgrade your browser.</noscript>
        </body>
        </html>
    """ % {
        'cookie_name': test_cookie_name,
        'cookie_value': test_cookie_value,
    }

    test_url = ('https://english.alarabiya.net/en/News/middle-east/2017/07/21/Israel-bars-Muslim-men-under-50-from-'
                'entering-Al-Aqsa-for-Friday-prayers.html')

    test_target_request = target_request_from_alarabiya_url(content=test_content, archive_site_url=test_url)

    # The re-request must target the same URL but carry the scraped cookie
    assert urls_are_equal(url1=test_target_request.url(), url2=test_url)
    assert test_target_request.header('Cookie') == "%s=%s" % (test_cookie_name, test_cookie_value,)

    # Non-Alarabiya URL
    assert target_request_from_alarabiya_url(
        content=test_content,
        archive_site_url='http://some-other-url.com/'
    ) is None
def test_target_request_from_archive_org_url():
    """target_request_from_archive_org_url() unwraps Wayback Machine snapshot URLs."""
    wayback_url = 'https://web.archive.org/web/20150204024130/http://www.john-daly.com/hockey/hockey.htm'
    original_url = 'http://www.john-daly.com/hockey/hockey.htm'

    # archive.org
    unwrapped_request = target_request_from_archive_org_url(content=None, archive_site_url=wayback_url)
    assert urls_are_equal(url1=unwrapped_request.url(), url2=original_url)

    # archive.org with non-matching URL
    assert target_request_from_archive_org_url(content=None, archive_site_url=original_url) is None
def test_target_request_from_archive_org_url():
    """target_request_from_archive_org_url() strips the web.archive.org wrapper from snapshot URLs."""
    # archive.org
    assert urls_are_equal(
        url1=target_request_from_archive_org_url(
            content=None,
            archive_site_url='https://web.archive.org/web/20150204024130/http://www.john-daly.com/hockey/hockey.htm'
        ).url(),
        url2='http://www.john-daly.com/hockey/hockey.htm',
    )

    # archive.org with non-matching URL
    assert target_request_from_archive_org_url(
        content=None,
        archive_site_url='http://www.john-daly.com/hockey/hockey.htm'
    ) is None
def test_target_request_from_archive_is_url():
    """target_request_from_archive_is_url() unwraps archive.is snapshots via <link rel="canonical">."""
    canonical_content = """
        <link rel="canonical" href="https://archive.is/20170201/https://bar.com/foo/bar">
    """

    # archive.is
    unwrapped_request = target_request_from_archive_is_url(
        content=canonical_content,
        archive_site_url='https://archive.is/20170201/https://bar.com/foo/bar',
    )
    assert urls_are_equal(url1=unwrapped_request.url(), url2='https://bar.com/foo/bar')

    # archive.is with non-matching URL
    assert target_request_from_archive_is_url(
        content=canonical_content,
        archive_site_url='https://bar.com/foo/bar',
    ) is None
def test_target_request_from_archive_is_url():
    """target_request_from_archive_is_url() extracts the archived URL from archive.is pages."""
    # archive.is
    assert urls_are_equal(
        url1=target_request_from_archive_is_url(
            content="""
                <link rel="canonical" href="https://archive.is/20170201/https://bar.com/foo/bar">
            """,
            archive_site_url='https://archive.is/20170201/https://bar.com/foo/bar'
        ).url(),
        url2='https://bar.com/foo/bar',
    )

    # archive.is with non-matching URL
    assert target_request_from_archive_is_url(
        content="""
            <link rel="canonical" href="https://archive.is/20170201/https://bar.com/foo/bar">
        """,
        archive_site_url='https://bar.com/foo/bar'
    ) is None
def test_target_request_from_meta_refresh_url():
    """target_request_from_meta_refresh_url() follows an HTML <meta http-equiv="refresh"> redirect."""
    refresh_page = """
        <HTML>
        <HEAD>
            <TITLE>This is a test</TITLE>
            <META HTTP-EQUIV="content-type" CONTENT="text/html; charset=UTF-8">
            <META HTTP-EQUIV="refresh" CONTENT="0; URL=http://example.com/">
        </HEAD>
        <BODY>
            <P>This is a test.</P>
        </BODY>
        </HTML>
    """

    # <meta> refresh
    refresh_request = target_request_from_meta_refresh_url(
        content=refresh_page,
        archive_site_url='http://example2.com/',
    )
    assert urls_are_equal(url1=refresh_request.url(), url2='http://example.com/')
def test_target_request_from_alarabiya_url():
    """target_request_from_alarabiya_url() re-requests the page with the JavaScript-set cookie attached."""
    # Alarabiya URL
    test_cookie_name = 'YPF8827340282Jdskjhfiw_928937459182JAX666'
    test_cookie_value = '78.60.231.222'
    test_content = """
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html>
        <head>
            <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
            <meta http-equiv="Content-Script-Type" content="text/javascript">
            <script type="text/javascript">
                // ...
                setCookie('%(cookie_name)s', '%(cookie_value)s', 10);
                // ...
            </script>
        </head>
        <body>
            <noscript>This site requires JavaScript and Cookies to be enabled. Please change your browser settings or
            upgrade your browser.</noscript>
        </body>
        </html>
    """ % {
        'cookie_name': test_cookie_name,
        'cookie_value': test_cookie_value,
    }

    test_url = (
        'https://english.alarabiya.net/en/News/middle-east/2017/07/21/Israel-bars-Muslim-men-under-50-from-'
        'entering-Al-Aqsa-for-Friday-prayers.html'
    )

    test_target_request = target_request_from_alarabiya_url(
        content=test_content,
        archive_site_url=test_url,
    )

    assert urls_are_equal(url1=test_target_request.url(), url2=test_url)
    assert test_target_request.header('Cookie') == "%s=%s" % (
        test_cookie_name,
        test_cookie_value,
    )

    # Non-Alarabiya URL
    assert target_request_from_alarabiya_url(
        content=test_content,
        archive_site_url='http://some-other-url.com/',
    ) is None
def test_target_request_from_meta_refresh_url():
    """target_request_from_meta_refresh_url() extracts the URL from a <meta http-equiv="refresh"> tag."""
    # <meta> refresh
    assert urls_are_equal(
        url1=target_request_from_meta_refresh_url(
            content="""
                <HTML>
                <HEAD>
                    <TITLE>This is a test</TITLE>
                    <META HTTP-EQUIV="content-type" CONTENT="text/html; charset=UTF-8">
                    <META HTTP-EQUIV="refresh" CONTENT="0; URL=http://example.com/">
                </HEAD>
                <BODY>
                    <P>This is a test.</P>
                </BODY>
                </HTML>
            """,
            archive_site_url='http://example2.com/'
        ).url(),
        url2='http://example.com/',
    )
def __get_follow_http_html_redirects_follow_redirects(self,
                                                      response_: Response,
                                                      meta_redirects_left: int) -> Union[Response, None]:
    """Follow HTML-level redirects (<meta refresh>, archive-site wrappers) of a response.

    Recurses via self.__get_follow_http_html_redirects() after each HTML redirect hop;
    returns the final Response, or None when the given response is unsuccessful.
    Raises McGetFollowHTTPHTMLRedirectsException on a None response or when the
    initial response of a redirect chain cannot be located.
    """
    from mediawords.util.web.user_agent.html_redirects import (
        target_request_from_meta_refresh_url,
        target_request_from_archive_org_url,
        target_request_from_archive_is_url,
        target_request_from_linkis_com_url,
        target_request_from_alarabiya_url,
    )

    if response_ is None:
        raise McGetFollowHTTPHTMLRedirectsException("Response is None.")

    if not response_.is_success():
        log.debug("Request to %s was unsuccessful: %s" % (response_.request().url(), response_.status_line(),))
        # Return the original URL and give up
        return None

    base_url = get_base_url(response_.request().url())

    # One extractor per supported HTML redirect flavor; first match wins
    html_redirect_functions = [
        target_request_from_meta_refresh_url,
        target_request_from_archive_org_url,
        target_request_from_archive_is_url,
        target_request_from_linkis_com_url,
        target_request_from_alarabiya_url,
    ]
    for redirect_fn in html_redirect_functions:
        request_after_meta_redirect = redirect_fn(
            content=response_.decoded_content(),
            archive_site_url=base_url,
        )
        if request_after_meta_redirect is None:
            continue

        log.warning("meta redirect from %s: %s" % (redirect_fn, request_after_meta_redirect.url()))

        # A redirect back to the same URL is not followed; try the next extractor
        if urls_are_equal(url1=response_.request().url(), url2=request_after_meta_redirect.url()):
            continue

        log.debug("URL after HTML redirects: %s" % request_after_meta_redirect.url())

        orig_redirect_response = self.request(request=request_after_meta_redirect)
        redirect_response = orig_redirect_response

        # Response might have its previous() already set due to HTTP redirects,
        # so we have to find the initial response first
        previous = None
        for _ in range(self.max_redirect() + 1):
            previous = redirect_response.previous()
            if previous is None:
                break
            redirect_response = previous

        if previous is not None:
            raise McGetFollowHTTPHTMLRedirectsException(
                "Can't find the initial redirected response; URL: %s" % request_after_meta_redirect.url()
            )

        log.debug("Setting previous of URL %(url)s to %(previous_url)s" % {
            'url': redirect_response.request().url(),
            'previous_url': response_.request().url(),
        })
        redirect_response.set_previous(response_)

        meta_redirects_left = meta_redirects_left - 1

        return self.__get_follow_http_html_redirects(
            response_=orig_redirect_response,
            meta_redirects_left=meta_redirects_left,
        )

    # No <meta /> refresh, the current URL is the final one
    return response_
def test_http_hash_server():
    """Exercise HashServer: static pages, redirects, auth, callbacks, path normalization."""
    port = random_unused_port()
    base_url = 'http://localhost:%d' % port

    def __simple_callback(request: HashServer.Request) -> Union[str, bytes]:
        # Echo back request metadata as a JSON body
        r = "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: application/json; charset=UTF-8\r\n"
        r += "\r\n"
        r += json.dumps({
            'name': 'callback',
            'method': request.method(),
            'url': request.url(),
            'content-type': request.content_type(),
            'params': request.query_params(),
            'cookies': request.cookies(),
        })
        return str.encode(r)

    # noinspection PyUnusedLocal
    def __callback_cookie_redirect(request: HashServer.Request) -> str:
        # Set a cookie, then redirect
        r = "HTTP/1.0 302 Moved Temporarily\r\n"
        r += "Content-Type: text/html; charset=UTF-8\r\n"
        r += "Location: /check_cookie\r\n"
        r += "Set-Cookie: test_cookie=I'm a cookie and I know it!\r\n"
        r += "\r\n"
        r += "Redirecting to the cookie check page..."
        return r

    def __callback_post(request: HashServer.Request) -> Union[str, bytes]:
        # Echo back POSTed body as JSON
        r = "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: application/json; charset=UTF-8\r\n"
        r += "\r\n"
        r += json.dumps({
            'name': 'callback_post',
            'post_data': request.content(),
        })
        return str.encode(r)

    pages = {
        '/': 'home',
        '/foo': b'foo',
        '/bar': 'bar ąą',
        '/foo-bar': {b'redirect': b'/bar'},
        '/localhost': {'redirect': "http://localhost:%d/" % port},
        b'/127-foo': {b'redirect': "http://127.0.0.1:%d/foo" % port},
        '/auth': {b'auth': b'foo:bar', b'content': b"foo bar \xf0\x90\x28\xbc"},
        '/404': {b'content': b'not found', b'http_status_code': 404},
        '/callback': {b'callback': __simple_callback},

        # Test setting cookies, redirects
        '/callback_cookie_redirect': {'callback': __callback_cookie_redirect},

        # POST data
        '/callback_post': {'callback': __callback_post},
    }

    hs = HashServer(port=port, pages=pages)
    assert hs
    hs.start()
    assert tcp_port_is_open(port=port)

    # Static pages and redirects
    assert str(requests.get('%s/' % base_url).text) == 'home'
    assert str(requests.get('%s/foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar' % base_url).text) == 'bar ąą'
    assert str(requests.get('%s/foo-bar' % base_url).text) == 'bar ąą'
    assert str(requests.get('%s/localhost' % base_url).text) == 'home'
    assert str(requests.get('%s/127-foo' % base_url).text) == 'foo'

    # Path normalization: all of these resolve to the home page...
    for home_path in ('//', '///', '/something/../', '/something/..//', '/something/..///'):
        assert str(requests.get(base_url + home_path).text) == 'home'

    # ...and all of these resolve to /foo
    for foo_path in ('/foo/', '/foo//', '/foo///', '/foo',
                     '/bar/../foo', '/bar/../foo/', '/bar/../foo//', '/bar/../foo///'):
        assert str(requests.get(base_url + foo_path).text) == 'foo'

    # Callback echoes request metadata
    response_json = requests.get('%s/callback?a=b&c=d' % base_url, cookies={'cookie_name': 'cookie_value'}).json()
    assert response_json == {
        'name': 'callback',
        'method': 'GET',
        'url': 'http://localhost:%d/callback?a=b&c=d' % port,
        'content-type': None,
        'params': {
            'a': 'b',
            'c': 'd',
        },
        'cookies': {
            'cookie_name': 'cookie_value',
        },
    }

    # Cookie-setting redirect (not followed)
    response = requests.get('%s/callback_cookie_redirect' % base_url, allow_redirects=False)
    assert response.status_code == 302
    assert response.headers['Location'] == '/check_cookie'

    # Custom HTTP status code
    response = requests.get("%s/404" % base_url)
    assert response.status_code == HTTPStatus.NOT_FOUND.value
    assert 'Not Found' in response.reason

    # Basic auth
    auth_url = "%s/auth" % base_url
    assert requests.get(auth_url).status_code == HTTPStatus.UNAUTHORIZED
    assert requests.get(auth_url, auth=('foo', 'foo')).status_code == HTTPStatus.UNAUTHORIZED
    response = requests.get(auth_url, auth=('foo', 'bar'))
    assert response.status_code == HTTPStatus.OK
    assert response.content == b"foo bar \xf0\x90\x28\xbc"

    # page_url() strips the query string and rejects unknown paths
    assert urls_are_equal(url1=hs.page_url('/callback?a=b&c=d'), url2='http://localhost:%d/callback' % port)
    with pytest.raises(McHashServerException):
        hs.page_url('/does-not-exist')

    # POST callback echoes the body
    response_json = requests.post('%s/callback_post' % base_url, data='abc=def').json()
    assert response_json == {
        'name': 'callback_post',
        'post_data': 'abc=def',
    }

    hs.stop()
def test_urls_are_equal():
    """urls_are_equal() normalizes host case, default ports and trailing slashes before comparing."""
    # Invalid input
    with pytest.raises(mc_url.McURLsAreEqualException):
        # noinspection PyTypeChecker
        mc_url.urls_are_equal(url1=None, url2=None)
    with pytest.raises(mc_url.McURLsAreEqualException):
        # noinspection PyTypeChecker
        mc_url.urls_are_equal(url1=None, url2='https://web.mit.edu/')
    with pytest.raises(mc_url.McURLsAreEqualException):
        # noinspection PyTypeChecker
        mc_url.urls_are_equal(url1='https://web.mit.edu/', url2=None)

    # Not URLs
    assert mc_url.urls_are_equal(url1='Not an URL.', url2='Not an URL.') is False
    funky_url = ('http://Las%20Vegas%20mass%20shooting%20raises%20new%20'
                 'doubts%20about%20safety%20of%20live%20entertainment')
    assert mc_url.urls_are_equal(url1=funky_url, url2=funky_url) is False

    # Equivalent URL pairs: host case, default port, trailing slashes
    equal_pairs = [
        ('https://web.mit.edu/', 'https://web.mit.edu/'),
        ('https://web.mit.edu/', 'https://WEB.MIT.EDU/'),
        ('https://web.mit.edu/', 'https://WEB.MIT.EDU//'),
        ('https://web.mit.edu/', 'https://WEB.MIT.EDU:443'),
        ('https://web.mit.edu/', 'https://WEB.MIT.EDU:443/'),
        ('https://web.mit.edu/', 'https://WEB.MIT.EDU:443//'),
        ('http://web.mit.edu/', 'http://WEB.MIT.EDU:80//'),
    ]
    for left_url, right_url in equal_pairs:
        assert mc_url.urls_are_equal(url1=left_url, url2=right_url) is True

    # Different paths are never equal
    assert mc_url.urls_are_equal(url1='https://web.mit.edu/', url2='https://WEB.MIT.EDU:443//page') is False
def test_get_topic_url_variants(self):
    """all_url_variants() collects merged-story and topic-link URL variants of a story URL."""
    media = create_test_story_stack(
        db=self.db(),
        data={
            'A': {
                'B': [1, 2, 3],
                'C': [4, 5, 6],
            },
            'D': {
                'E': [7, 8, 9],
            },
        },
    )

    story_1 = media['A']['feeds']['B']['stories']['1']
    story_2 = media['A']['feeds']['B']['stories']['2']
    story_3 = media['A']['feeds']['B']['stories']['3']
    story_4 = media['A']['feeds']['C']['stories']['4']

    # Merge chain: story_3 -> story_2 -> story_1
    for source_story, target_story in ((story_2, story_1), (story_3, story_2)):
        self.db().query("""
            INSERT INTO topic_merged_stories_map (source_stories_id, target_stories_id)
            VALUES (%(source_stories_id)s, %(target_stories_id)s)
        """, {
            'source_stories_id': source_story['stories_id'],
            'target_stories_id': target_story['stories_id'],
        })

    self.db().create(table='tag_sets', insert_hash={'name': 'foo'})

    topic = create_test_topic(db=self.db(), label='foo')

    self.db().create(table='topic_stories', insert_hash={
        'topics_id': topic['topics_id'],
        'stories_id': story_4['stories_id'],
    })
    self.db().create(table='topic_stories', insert_hash={
        'topics_id': topic['topics_id'],
        'stories_id': story_1['stories_id'],
    })
    self.db().create(table='topic_links', insert_hash={
        'topics_id': topic['topics_id'],
        'stories_id': story_4['stories_id'],
        'ref_stories_id': story_1['stories_id'],
        'url': story_1['url'],
        'redirect_url': story_1['url'] + "/redirect_url",
    })
    self.db().create(table='topic_stories', insert_hash={
        'topics_id': topic['topics_id'],
        'stories_id': story_2['stories_id'],
    })
    self.db().create(table='topic_links', insert_hash={
        'topics_id': topic['topics_id'],
        'stories_id': story_4['stories_id'],
        'ref_stories_id': story_2['stories_id'],
        'url': story_2['url'],
        'redirect_url': story_2['url'] + "/redirect_url",
    })
    self.db().create(table='topic_stories', insert_hash={
        'topics_id': topic['topics_id'],
        'stories_id': story_3['stories_id'],
    })
    self.db().create(table='topic_links', insert_hash={
        'topics_id': topic['topics_id'],
        'stories_id': story_4['stories_id'],
        'ref_stories_id': story_3['stories_id'],
        'url': story_3['url'] + '/alternate',
    })

    test_url = story_1['url'] + self.CRUFT

    expected_urls = {
        story_1['url'],
        story_1['url'] + self.CRUFT,
        story_2['url'],
        story_1['url'] + "/redirect_url",
        story_2['url'] + "/redirect_url",
        story_3['url'],
        story_3['url'] + "/alternate",
    }

    url_variants = all_url_variants(db=self.db(), url=test_url)

    assert len(expected_urls) == len(url_variants)
    for expected_url, actual_url in zip(sorted(expected_urls), sorted(url_variants)):
        assert urls_are_equal(url1=expected_url, url2=actual_url)
def test_http_hash_server():
    """End-to-end HashServer test: pages, redirects, authentication and callbacks."""
    port = random_unused_port()
    base_url = 'http://localhost:%d' % port

    def __simple_callback(request: HashServer.Request) -> Union[str, bytes]:
        # JSON echo of the incoming request's metadata
        response_lines = [
            "HTTP/1.0 200 OK\r\n",
            "Content-Type: application/json; charset=UTF-8\r\n",
            "\r\n",
            json.dumps({
                'name': 'callback',
                'method': request.method(),
                'url': request.url(),
                'content-type': request.content_type(),
                'params': request.query_params(),
                'cookies': request.cookies(),
            }),
        ]
        return str.encode("".join(response_lines))

    # noinspection PyUnusedLocal
    def __callback_cookie_redirect(request: HashServer.Request) -> str:
        # Sets a cookie, then issues a 302 redirect
        return (
            "HTTP/1.0 302 Moved Temporarily\r\n"
            "Content-Type: text/html; charset=UTF-8\r\n"
            "Location: /check_cookie\r\n"
            "Set-Cookie: test_cookie=I'm a cookie and I know it!\r\n"
            "\r\n"
            "Redirecting to the cookie check page..."
        )

    def __callback_post(request: HashServer.Request) -> Union[str, bytes]:
        # JSON echo of the POSTed body
        response_lines = [
            "HTTP/1.0 200 OK\r\n",
            "Content-Type: application/json; charset=UTF-8\r\n",
            "\r\n",
            json.dumps({
                'name': 'callback_post',
                'post_data': request.content(),
            }),
        ]
        return str.encode("".join(response_lines))

    pages = {
        '/': 'home',
        '/foo': b'foo',
        '/bar': 'bar ąą',
        '/foo-bar': {b'redirect': b'/bar'},
        '/localhost': {'redirect': "http://localhost:%d/" % port},
        b'/127-foo': {b'redirect': "http://127.0.0.1:%d/foo" % port},
        '/auth': {b'auth': b'foo:bar', b'content': b"foo bar \xf0\x90\x28\xbc"},
        '/404': {b'content': b'not found', b'http_status_code': 404},
        '/callback': {b'callback': __simple_callback},

        # Test setting cookies, redirects
        '/callback_cookie_redirect': {'callback': __callback_cookie_redirect},

        # POST data
        '/callback_post': {'callback': __callback_post},
    }

    hs = HashServer(port=port, pages=pages)
    assert hs
    hs.start()
    assert tcp_port_is_open(port=port)

    assert str(requests.get('%s/' % base_url).text) == 'home'
    assert str(requests.get('%s/foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar' % base_url).text) == 'bar ąą'
    assert str(requests.get('%s/foo-bar' % base_url).text) == 'bar ąą'
    assert str(requests.get('%s/localhost' % base_url).text) == 'home'
    assert str(requests.get('%s/127-foo' % base_url).text) == 'foo'

    # Path normalization
    assert str(requests.get('%s//' % base_url).text) == 'home'
    assert str(requests.get('%s///' % base_url).text) == 'home'
    assert str(requests.get('%s/something/../' % base_url).text) == 'home'
    assert str(requests.get('%s/something/..//' % base_url).text) == 'home'
    assert str(requests.get('%s/something/..///' % base_url).text) == 'home'
    assert str(requests.get('%s/foo/' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo//' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo///' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo/' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo//' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo///' % base_url).text) == 'foo'

    response_json = requests.get(
        '%s/callback?a=b&c=d' % base_url,
        cookies={'cookie_name': 'cookie_value'},
    ).json()
    assert response_json == {
        'name': 'callback',
        'method': 'GET',
        'url': 'http://localhost:%d/callback?a=b&c=d' % port,
        'content-type': None,
        'params': {
            'a': 'b',
            'c': 'd',
        },
        'cookies': {
            'cookie_name': 'cookie_value',
        },
    }

    response = requests.get('%s/callback_cookie_redirect' % base_url, allow_redirects=False)
    assert response.status_code == 302
    assert response.headers['Location'] == '/check_cookie'

    response = requests.get("%s/404" % base_url)
    assert response.status_code == HTTPStatus.NOT_FOUND.value
    assert 'Not Found' in response.reason

    auth_url = "%s/auth" % base_url
    assert requests.get(auth_url).status_code == HTTPStatus.UNAUTHORIZED
    assert requests.get(auth_url, auth=('foo', 'foo')).status_code == HTTPStatus.UNAUTHORIZED
    response = requests.get(auth_url, auth=('foo', 'bar'))
    assert response.status_code == HTTPStatus.OK
    assert response.content == b"foo bar \xf0\x90\x28\xbc"

    assert urls_are_equal(url1=hs.page_url('/callback?a=b&c=d'), url2='http://localhost:%d/callback' % port)
    with pytest.raises(McHashServerException):
        hs.page_url('/does-not-exist')

    response_json = requests.post('%s/callback_post' % base_url, data='abc=def').json()
    assert response_json == {
        'name': 'callback_post',
        'post_data': 'abc=def',
    }

    hs.stop()
def test_urls_are_equal():
    """urls_are_equal() treats URLs as equal modulo host case, default ports and trailing slashes."""
    # Invalid input must raise, not return False
    for bad_url1, bad_url2 in ((None, None), (None, 'https://web.mit.edu/'), ('https://web.mit.edu/', None)):
        with pytest.raises(mc_url.McURLsAreEqualException):
            # noinspection PyTypeChecker
            mc_url.urls_are_equal(url1=bad_url1, url2=bad_url2)

    # Not URLs
    assert mc_url.urls_are_equal(url1='Not an URL.', url2='Not an URL.') is False
    funky_url = ('http://Las%20Vegas%20mass%20shooting%20raises%20new%20'
                 'doubts%20about%20safety%20of%20live%20entertainment')
    assert mc_url.urls_are_equal(url1=funky_url, url2=funky_url) is False

    # Identity and normalization
    assert mc_url.urls_are_equal(url1='https://web.mit.edu/', url2='https://web.mit.edu/') is True
    assert mc_url.urls_are_equal(url1='https://web.mit.edu/', url2='https://WEB.MIT.EDU/') is True
    assert mc_url.urls_are_equal(url1='https://web.mit.edu/', url2='https://WEB.MIT.EDU//') is True
    assert mc_url.urls_are_equal(url1='https://web.mit.edu/', url2='https://WEB.MIT.EDU:443') is True
    assert mc_url.urls_are_equal(url1='https://web.mit.edu/', url2='https://WEB.MIT.EDU:443/') is True
    assert mc_url.urls_are_equal(url1='https://web.mit.edu/', url2='https://WEB.MIT.EDU:443//') is True
    assert mc_url.urls_are_equal(url1='http://web.mit.edu/', url2='http://WEB.MIT.EDU:80//') is True

    # Different paths are never equal
    assert mc_url.urls_are_equal(url1='https://web.mit.edu/', url2='https://WEB.MIT.EDU:443//page') is False
def test_get_topic_url_variants(self):
    """all_url_variants() returns every known variant (merges, redirects, topic links) of a story URL."""
    media = create_test_story_stack(db=self.db(), data={
        'A': {
            'B': [1, 2, 3],
            'C': [4, 5, 6],
        },
        'D': {
            'E': [7, 8, 9],
        },
    })

    story_1 = media['A']['feeds']['B']['stories']['1']
    story_2 = media['A']['feeds']['B']['stories']['2']
    story_3 = media['A']['feeds']['B']['stories']['3']
    story_4 = media['A']['feeds']['C']['stories']['4']

    # Merge story_2 into story_1...
    self.db().query(
        """
        INSERT INTO topic_merged_stories_map (source_stories_id, target_stories_id)
        VALUES (%(source_stories_id)s, %(target_stories_id)s)
        """, {
            'source_stories_id': story_2['stories_id'],
            'target_stories_id': story_1['stories_id'],
        })

    # ...and story_3 into story_2
    self.db().query(
        """
        INSERT INTO topic_merged_stories_map (source_stories_id, target_stories_id)
        VALUES (%(source_stories_id)s, %(target_stories_id)s)
        """, {
            'source_stories_id': story_3['stories_id'],
            'target_stories_id': story_2['stories_id'],
        })

    self.db().create(
        table='tag_sets',
        insert_hash={'name': 'foo'},
    )

    topic = create_test_topic(db=self.db(), label='foo')

    self.db().create(table='topic_stories', insert_hash={
        'topics_id': topic['topics_id'],
        'stories_id': story_4['stories_id'],
    })
    self.db().create(table='topic_stories', insert_hash={
        'topics_id': topic['topics_id'],
        'stories_id': story_1['stories_id'],
    })
    self.db().create(table='topic_links', insert_hash={
        'topics_id': topic['topics_id'],
        'stories_id': story_4['stories_id'],
        'ref_stories_id': story_1['stories_id'],
        'url': story_1['url'],
        'redirect_url': story_1['url'] + "/redirect_url",
    })
    self.db().create(table='topic_stories', insert_hash={
        'topics_id': topic['topics_id'],
        'stories_id': story_2['stories_id'],
    })
    self.db().create(table='topic_links', insert_hash={
        'topics_id': topic['topics_id'],
        'stories_id': story_4['stories_id'],
        'ref_stories_id': story_2['stories_id'],
        'url': story_2['url'],
        'redirect_url': story_2['url'] + "/redirect_url",
    })
    self.db().create(table='topic_stories', insert_hash={
        'topics_id': topic['topics_id'],
        'stories_id': story_3['stories_id'],
    })
    self.db().create(table='topic_links', insert_hash={
        'topics_id': topic['topics_id'],
        'stories_id': story_4['stories_id'],
        'ref_stories_id': story_3['stories_id'],
        'url': story_3['url'] + '/alternate',
    })

    test_url = story_1['url'] + self.CRUFT

    expected_urls = {
        story_1['url'],
        story_1['url'] + self.CRUFT,
        story_2['url'],
        story_1['url'] + "/redirect_url",
        story_2['url'] + "/redirect_url",
        story_3['url'],
        story_3['url'] + "/alternate",
    }

    url_variants = all_url_variants(db=self.db(), url=test_url)

    assert len(expected_urls) == len(url_variants)
    sorted_expected_urls = sorted(expected_urls)
    sorted_url_variants = sorted(url_variants)
    for i in range(len(sorted_expected_urls)):
        assert urls_are_equal(url1=sorted_expected_urls[i], url2=sorted_url_variants[i])
def __inner_follow_redirects(response_: Response, meta_redirects_left: int) -> Union[Response, None]:
    """Recursively follow HTML-level redirects (<meta refresh>, archive-site wrappers).

    Returns the final Response, or None when the given response is unsuccessful.
    Raises McGetFollowHTTPHTMLRedirectsException on a None response or when the
    initial response of an HTTP redirect chain cannot be located.
    """
    from mediawords.util.web.user_agent.html_redirects import (
        target_request_from_meta_refresh_url,
        target_request_from_archive_org_url,
        target_request_from_archive_is_url,
        target_request_from_linkis_com_url,
        target_request_from_alarabiya_url,
    )

    if response_ is None:
        raise McGetFollowHTTPHTMLRedirectsException("Response is None.")

    if response_.is_success():

        base_url = get_base_url(response_.request().url())

        # One extractor per supported HTML redirect flavor; first match wins
        html_redirect_functions = [
            target_request_from_meta_refresh_url,
            target_request_from_archive_org_url,
            target_request_from_archive_is_url,
            target_request_from_linkis_com_url,
            target_request_from_alarabiya_url,
        ]
        for html_redirect_function in html_redirect_functions:
            request_after_meta_redirect = html_redirect_function(
                content=response_.decoded_content(),
                archive_site_url=base_url,
            )
            if request_after_meta_redirect is not None:
                # A redirect back to the same URL is not followed
                if not urls_are_equal(url1=response_.request().url(), url2=request_after_meta_redirect.url()):

                    log.debug("URL after HTML redirects: %s" % request_after_meta_redirect.url())

                    orig_redirect_response = self.request(request=request_after_meta_redirect)
                    redirect_response = orig_redirect_response

                    # Response might have its previous() already set due to HTTP redirects,
                    # so we have to find the initial response first
                    previous = None
                    for x in range(self.max_redirect() + 1):
                        previous = redirect_response.previous()
                        if previous is None:
                            break
                        redirect_response = previous

                    if previous is not None:
                        raise McGetFollowHTTPHTMLRedirectsException(
                            "Can't find the initial redirected response; URL: %s" %
                            request_after_meta_redirect.url()
                        )

                    log.debug(
                        "Setting previous of URL %(url)s to %(previous_url)s" % {
                            'url': redirect_response.request().url(),
                            'previous_url': response_.request().url(),
                        })
                    redirect_response.set_previous(response_)

                    meta_redirects_left = meta_redirects_left - 1

                    # BUG FIX: recurse into this very function by its own name; the
                    # original code called an undefined name "__inner", which would
                    # raise NameError at runtime (name mangling keeps the two names
                    # distinct even inside a class body)
                    return __inner_follow_redirects(
                        response_=orig_redirect_response,
                        meta_redirects_left=meta_redirects_left,
                    )

        # No <meta /> refresh, the current URL is the final one
        return response_

    else:
        log.debug("Request to %s was unsuccessful: %s" % (
            response_.request().url(),
            response_.status_line(),
        ))

        # Return the original URL and give up
        return None