def test_gzip_bad_input():
    """gzip() / gunzip() must raise their specific exceptions on invalid input."""
    # gzip() only accepts bytes; None must be rejected.
    with pytest.raises(McGzipException):
        # noinspection PyTypeChecker
        gzip(None)

    # gunzip() must reject None, empty payloads, and non-Gzip data alike.
    for invalid_input in (None, b'', b'No way this is valid Gzip data'):
        with pytest.raises(McGunzipException):
            # noinspection PyTypeChecker
            gunzip(invalid_input)
def _compress_data_for_method(data: Union[bytes, str], compression_method: Compression) -> bytes:
    """Compress data."""
    # Guard clauses: normalize str -> UTF-8 bytes, reject everything else.
    if data is None:
        raise McKeyValueStoreCompressionException("Data is None.")
    if isinstance(data, str):
        data = data.encode('utf-8')
    if not isinstance(data, bytes):
        raise McKeyValueStoreCompressionException(
            "Data is not str or bytes: %s" % str(data))

    # Early returns instead of an elif chain; NONE passes data through untouched.
    if compression_method == KeyValueStore.Compression.NONE:
        return data
    if compression_method == KeyValueStore.Compression.GZIP:
        return gzip(data)
    if compression_method == KeyValueStore.Compression.BZIP2:
        return bzip2(data)

    raise McKeyValueStoreCompressionException(
        "Invalid compression method: %s" % compression_method)
def __inner_test_gzip(data_: bytes) -> None:
    """Round-trip *data_* through gzip()/gunzip() and verify the payload survives."""
    compressed = gzip(data_)

    # Compression must yield non-empty bytes that differ from the input.
    assert len(compressed) > 0
    assert isinstance(compressed, bytes)
    assert compressed != data_

    # Decompression must restore the exact original payload.
    restored = gunzip(compressed)
    assert restored == data_
def store_model(self, model_data: bytes) -> int:
    """Gzip *model_data*, insert it into snap.word2vec_models, and return the new row ID.

    :param model_data: serialized model as raw bytes.
    :return: primary key (snap_word2vec_models_id) of the inserted row.
    """
    # Compress the serialized model before storing it.
    raw_data = gzip(model_data)

    insert_result = self.__db.query("""
        INSERT INTO snap.word2vec_models (topics_id, snapshots_id, raw_data)
        VALUES (%(topics_id)s, %(snapshots_id)s, %(raw_data)s)
        RETURNING snap_word2vec_models_id
    """, {
        'topics_id': self.__topics_id,
        'snapshots_id': self.__snapshots_id,
        'raw_data': raw_data,
    })

    # RETURNING yields exactly one row holding the generated primary key.
    return insert_result.flat()[0]
def _compress_data_for_method(data: Union[bytes, str], compression_method: Compression) -> bytes:
    """Compress data."""
    if data is None:
        raise McKeyValueStoreCompressionException("Data is None.")

    # Accept str by encoding to UTF-8; anything else besides bytes is an error.
    if isinstance(data, str):
        data = data.encode('utf-8')
    if not isinstance(data, bytes):
        raise McKeyValueStoreCompressionException("Data is not str or bytes: %s" % str(data))

    # Pick the codec; NONE leaves the payload as-is.
    if compression_method == KeyValueStore.Compression.NONE:
        compressed = data
    elif compression_method == KeyValueStore.Compression.GZIP:
        compressed = gzip(data)
    elif compression_method == KeyValueStore.Compression.BZIP2:
        compressed = bzip2(data)
    else:
        raise McKeyValueStoreCompressionException("Invalid compression method: %s" % compression_method)

    return compressed
def test_sitemap_tree_for_homepage_huge_sitemap(self):
    """Test sitemap_tree_for_homepage() with a huge sitemap (mostly for profiling).

    Serves a gzipped 1000-page sitemap from a local HashServer and verifies
    that every page ends up in the parsed tree.
    """
    page_count = 1000

    # Template for a single <url> entry; formatted once per page below.
    # Fixed: the title must contain the *entities* &lt;foo&gt; (the test is
    # about HTML entity decoding) — a raw "<foo>" would make the XML ill-formed.
    url_entry_template = """
        <url>
            <loc>{base_url}/news/page_{x}.html</loc>

            <!-- Element present but empty -->
            <lastmod />

            <!-- Some other XML namespace -->
            <xhtml:link rel="alternate"
                        media="only screen and (max-width: 640px)"
                        href="{base_url}/news/page_{x}.html?mobile=1" />

            <news:news>
                <news:publication>
                    <news:name>{publication_name}</news:name>
                    <news:language>{publication_language}</news:language>
                </news:publication>
                <news:publication_date>{publication_date}</news:publication_date>
                <news:title>Foo &lt;foo&gt;</news:title>    <!-- HTML entity decoding -->
            </news:news>
        </url>
    """

    # Build the whole document with a single join(): repeated "+=" on a str
    # is quadratic, which matters in a profiling test with 1000 iterations.
    sitemap_xml = """<?xml version="1.0" encoding="UTF-8"?>
    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
            xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"
            xmlns:xhtml="http://www.w3.org/1999/xhtml">
    """ + ''.join(
        url_entry_template.format(
            x=x,
            base_url=self.__test_url,
            publication_name=self.TEST_PUBLICATION_NAME,
            publication_language=self.TEST_PUBLICATION_LANGUAGE,
            publication_date=self.TEST_DATE_STR,
        )
        for x in range(page_count)
    ) + "</urlset>"

    pages = {
        '/': 'This is a homepage.',
        '/robots.txt': {
            'header': 'Content-Type: text/plain',
            'content': textwrap.dedent("""
                User-agent: *
                Disallow: /whatever

                Sitemap: {base_url}/sitemap.xml.gz
            """.format(base_url=self.__test_url)).strip(),
        },
        '/sitemap.xml.gz': {
            'header': 'Content-Type: application/x-gzip',
            # NOTE: gzip() is the project's compression helper, not the stdlib module.
            'content': gzip(sitemap_xml),
        },
    }

    hs = HashServer(port=self.__test_port, pages=pages)
    hs.start()
    try:
        actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)
    finally:
        # Fixed: always stop the test HTTP server, even if parsing raises;
        # otherwise a failure here leaks the port for subsequent tests.
        hs.stop()

    assert len(actual_sitemap_tree.all_pages()) == page_count
def test_sitemap_tree_for_homepage_plain_text(self):
    """Test sitemap_tree_for_homepage() with plain text sitemaps."""
    # robots.txt advertises two plain-text sitemaps: one served uncompressed,
    # one gzipped but *without* a .gz extension (content-type only).
    robots_txt_body = textwrap.dedent("""
        User-agent: *
        Disallow: /whatever

        Sitemap: {base_url}/sitemap_1.txt
        Sitemap: {base_url}/sitemap_2.txt.dat
    """.format(base_url=self.__test_url)).strip()

    # Plain text uncompressed sitemap
    sitemap_1_body = textwrap.dedent("""
        {base_url}/news/foo.html
        {base_url}/news/bar.html

        Some other stuff which totally doesn't look like an URL
    """.format(base_url=self.__test_url)).strip()

    # Plain text compressed sitemap without .gz extension
    sitemap_2_body = gzip(textwrap.dedent("""
        {base_url}/news/bar.html
        {base_url}/news/baz.html
    """.format(base_url=self.__test_url)).strip())

    pages = {
        '/': 'This is a homepage.',
        '/robots.txt': {
            'header': 'Content-Type: text/plain',
            'content': robots_txt_body,
        },
        '/sitemap_1.txt': {
            'content': sitemap_1_body,
        },
        '/sitemap_2.txt.dat': {
            'header': 'Content-Type: application/x-gzip',
            'content': sitemap_2_body,
        },
    }

    hs = HashServer(port=self.__test_port, pages=pages)
    hs.start()
    actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)
    hs.stop()

    assert isinstance(actual_sitemap_tree, IndexRobotsTxtSitemap)
    assert len(actual_sitemap_tree.sub_sitemaps) == 2

    sitemap_1, sitemap_2 = actual_sitemap_tree.sub_sitemaps
    assert isinstance(sitemap_1, PagesTextSitemap)
    assert len(sitemap_1.pages) == 2
    assert isinstance(sitemap_2, PagesTextSitemap)
    assert len(sitemap_2.pages) == 2

    # bar.html appears in both sitemaps, so only three unique pages remain.
    pages = actual_sitemap_tree.all_pages()
    assert len(pages) == 3
    print(pages)
    for page_path in ('/news/foo.html', '/news/bar.html', '/news/baz.html'):
        assert SitemapPage(url='{}{}'.format(self.__test_url, page_path)) in pages
def test_sitemap_tree_for_homepage_gzip(self):
    """Test sitemap_tree_for_homepage() with gzipped sitemaps."""
    # Two gzipped sitemaps are served: one identified only by its .gz extension,
    # the other only by its Content-Type header. Both must be transparently
    # gunzipped by the fetcher.
    pages = {
        '/': 'This is a homepage.',
        '/robots.txt': {
            'header': 'Content-Type: text/plain',
            'content': textwrap.dedent("""
                User-agent: *
                Disallow: /whatever

                Sitemap: {base_url}/sitemap_1.gz
                Sitemap: {base_url}/sitemap_2.dat
            """.format(base_url=self.__test_url)).strip(),
        },

        # Gzipped sitemap without correct HTTP header but with .gz extension
        # NOTE(review): the <news:title> below holds a raw "<foo>", which is not
        # well-formed XML; the adjacent "HTML entity decoding" comment suggests
        # the original read "Foo &lt;foo&gt;" — verify against upstream history.
        '/sitemap_1.gz': {
            'content': gzip(
                textwrap.dedent("""
                    <?xml version="1.0" encoding="UTF-8"?>
                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                            xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">
                        <url>
                            <loc>{base_url}/news/foo.html</loc>
                            <news:news>
                                <news:publication>
                                    <news:name>{publication_name}</news:name>
                                    <news:language>{publication_language}</news:language>
                                </news:publication>
                                <news:publication_date>{publication_date}</news:publication_date>
                                <news:title>Foo <foo></news:title> <!-- HTML entity decoding -->
                            </news:news>
                        </url>
                    </urlset>
                """.format(
                    base_url=self.__test_url,
                    publication_name=self.TEST_PUBLICATION_NAME,
                    publication_language=self.TEST_PUBLICATION_LANGUAGE,
                    publication_date=self.TEST_DATE_STR,
                )).strip()),
        },

        # Gzipped sitemap with correct HTTP header but without .gz extension
        '/sitemap_2.dat': {
            'header': 'Content-Type: application/x-gzip',
            'content': gzip(
                textwrap.dedent("""
                    <?xml version="1.0" encoding="UTF-8"?>
                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                            xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">
                        <url>
                            <loc>{base_url}/news/baz.html</loc>
                            <news:news>
                                <news:publication>
                                    <news:name>{publication_name}</news:name>
                                    <news:language>{publication_language}</news:language>
                                </news:publication>
                                <news:publication_date>{publication_date}</news:publication_date>
                                <news:title><![CDATA[Bąž]]></news:title> <!-- CDATA and UTF-8 -->
                            </news:news>
                        </url>
                    </urlset>
                """.format(
                    base_url=self.__test_url,
                    publication_name=self.TEST_PUBLICATION_NAME,
                    publication_language=self.TEST_PUBLICATION_LANGUAGE,
                    publication_date=self.TEST_DATE_STR,
                )).strip()),
        },
    }

    hs = HashServer(port=self.__test_port, pages=pages)
    hs.start()
    actual_sitemap_tree = sitemap_tree_for_homepage(
        homepage_url=self.__test_url)
    hs.stop()

    # Don't do an in-depth check, we just need to make sure that gunzip works
    assert isinstance(actual_sitemap_tree, IndexRobotsTxtSitemap)
    assert len(actual_sitemap_tree.sub_sitemaps) == 2

    sitemap_1 = actual_sitemap_tree.sub_sitemaps[0]
    assert isinstance(sitemap_1, PagesXMLSitemap)
    assert len(sitemap_1.pages) == 1

    sitemap_2 = actual_sitemap_tree.sub_sitemaps[1]
    assert isinstance(sitemap_2, PagesXMLSitemap)
    assert len(sitemap_2.pages) == 1
def __inner_test_wrong_algorithm(data_: bytes) -> None:
    """Verify that decompressing with the wrong algorithm raises that codec's exception."""
    # Gzip-compressed data must not bunzip2().
    with pytest.raises(McBunzip2Exception):
        gzip_compressed = gzip(data_)
        bunzip2(gzip_compressed)

    # Bzip2-compressed data must not gunzip().
    with pytest.raises(McGunzipException):
        bzip2_compressed = bzip2(data_)
        gunzip(bzip2_compressed)
def test_sitemap_tree_for_homepage_huge_sitemap(self):
    """Test sitemap_tree_for_homepage() with a huge sitemap (mostly for profiling).

    Publishes a gzipped sitemap with 1000 <url> entries and asserts the whole
    set is parsed back out of the tree.
    """
    page_count = 1000

    # Per-page <url> fragment. Fixed: use the entities &lt;foo&gt; in the title
    # (that is what the "HTML entity decoding" check needs); a literal "<foo>"
    # would leave the document ill-formed XML.
    url_entry = """
        <url>
            <loc>{base_url}/news/page_{x}.html</loc>

            <!-- Element present but empty -->
            <lastmod />

            <!-- Some other XML namespace -->
            <xhtml:link rel="alternate"
                        media="only screen and (max-width: 640px)"
                        href="{base_url}/news/page_{x}.html?mobile=1" />

            <news:news>
                <news:publication>
                    <news:name>{publication_name}</news:name>
                    <news:language>{publication_language}</news:language>
                </news:publication>
                <news:publication_date>{publication_date}</news:publication_date>
                <news:title>Foo &lt;foo&gt;</news:title>    <!-- HTML entity decoding -->
            </news:news>
        </url>
    """

    header = """<?xml version="1.0" encoding="UTF-8"?>
    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
            xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"
            xmlns:xhtml="http://www.w3.org/1999/xhtml">
    """

    # Fixed: assemble once via join() instead of O(n^2) "+=" in a loop —
    # this test exists for profiling, so the fixture build should be linear.
    entries = [
        url_entry.format(
            x=x,
            base_url=self.__test_url,
            publication_name=self.TEST_PUBLICATION_NAME,
            publication_language=self.TEST_PUBLICATION_LANGUAGE,
            publication_date=self.TEST_DATE_STR,
        )
        for x in range(page_count)
    ]
    sitemap_xml = header + ''.join(entries) + "</urlset>"

    pages = {
        '/': 'This is a homepage.',
        '/robots.txt': {
            'header': 'Content-Type: text/plain',
            'content': textwrap.dedent("""
                User-agent: *
                Disallow: /whatever

                Sitemap: {base_url}/sitemap.xml.gz
            """.format(base_url=self.__test_url)).strip(),
        },
        '/sitemap.xml.gz': {
            'header': 'Content-Type: application/x-gzip',
            # gzip() is the project's helper (bytes in, bytes out), not the stdlib module.
            'content': gzip(sitemap_xml),
        },
    }

    hs = HashServer(port=self.__test_port, pages=pages)
    hs.start()
    try:
        actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)
    finally:
        # Fixed: stop the server even when parsing fails so the port is released.
        hs.stop()

    assert len(actual_sitemap_tree.all_pages()) == page_count
def test_sitemap_tree_for_homepage_plain_text(self):
    """Test sitemap_tree_for_homepage() with plain text sitemaps."""
    # Fixture pages served by the local HashServer; robots.txt points at two
    # plain-text sitemaps (one uncompressed, one gzipped without .gz extension).
    pages = {
        '/': 'This is a homepage.',
        '/robots.txt': {
            'header': 'Content-Type: text/plain',
            'content': textwrap.dedent("""
                User-agent: *
                Disallow: /whatever

                Sitemap: {base_url}/sitemap_1.txt
                Sitemap: {base_url}/sitemap_2.txt.dat
            """.format(base_url=self.__test_url)).strip(),
        },

        # Plain text uncompressed sitemap
        '/sitemap_1.txt': {
            'content': textwrap.dedent("""
                {base_url}/news/foo.html
                {base_url}/news/bar.html

                Some other stuff which totally doesn't look like an URL
            """.format(base_url=self.__test_url)).strip(),
        },

        # Plain text compressed sitemap without .gz extension; only the
        # Content-Type header identifies it as gzipped.
        '/sitemap_2.txt.dat': {
            'header': 'Content-Type: application/x-gzip',
            'content': gzip(textwrap.dedent("""
                {base_url}/news/bar.html
                {base_url}/news/baz.html
            """.format(base_url=self.__test_url)).strip()),
        },
    }

    hs = HashServer(port=self.__test_port, pages=pages)
    hs.start()
    actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)
    hs.stop()

    # robots.txt-driven tree: one sub-sitemap per "Sitemap:" line.
    assert isinstance(actual_sitemap_tree, IndexRobotsTxtSitemap)
    assert len(actual_sitemap_tree.sub_sitemaps) == 2

    sitemap_1 = actual_sitemap_tree.sub_sitemaps[0]
    assert isinstance(sitemap_1, PagesTextSitemap)
    assert len(sitemap_1.pages) == 2

    sitemap_2 = actual_sitemap_tree.sub_sitemaps[1]
    assert isinstance(sitemap_2, PagesTextSitemap)
    assert len(sitemap_2.pages) == 2

    # bar.html is listed in both sitemaps, so only three unique pages overall.
    pages = actual_sitemap_tree.all_pages()
    assert len(pages) == 3
    print(pages)
    assert SitemapPage(url='{}/news/foo.html'.format(self.__test_url)) in pages
    assert SitemapPage(url='{}/news/bar.html'.format(self.__test_url)) in pages
    assert SitemapPage(url='{}/news/baz.html'.format(self.__test_url)) in pages
def test_sitemap_tree_for_homepage_gzip(self):
    """Test sitemap_tree_for_homepage() with gzipped sitemaps."""
    # Two gzipped sitemaps: one recognized only by its .gz extension, the
    # other only by its Content-Type header; both must gunzip transparently.
    pages = {
        '/': 'This is a homepage.',
        '/robots.txt': {
            'header': 'Content-Type: text/plain',
            'content': textwrap.dedent("""
                User-agent: *
                Disallow: /whatever

                Sitemap: {base_url}/sitemap_1.gz
                Sitemap: {base_url}/sitemap_2.dat
            """.format(base_url=self.__test_url)).strip(),
        },

        # Gzipped sitemap without correct HTTP header but with .gz extension
        # NOTE(review): the <news:title> below carries a raw "<foo>", which is
        # not well-formed XML; the "HTML entity decoding" comment suggests the
        # original source read "Foo &lt;foo&gt;" — verify against upstream.
        '/sitemap_1.gz': {
            'content': gzip(textwrap.dedent("""
                <?xml version="1.0" encoding="UTF-8"?>
                <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                        xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">
                    <url>
                        <loc>{base_url}/news/foo.html</loc>
                        <news:news>
                            <news:publication>
                                <news:name>{publication_name}</news:name>
                                <news:language>{publication_language}</news:language>
                            </news:publication>
                            <news:publication_date>{publication_date}</news:publication_date>
                            <news:title>Foo <foo></news:title> <!-- HTML entity decoding -->
                        </news:news>
                    </url>
                </urlset>
            """.format(
                base_url=self.__test_url,
                publication_name=self.TEST_PUBLICATION_NAME,
                publication_language=self.TEST_PUBLICATION_LANGUAGE,
                publication_date=self.TEST_DATE_STR,
            )).strip()),
        },

        # Gzipped sitemap with correct HTTP header but without .gz extension
        '/sitemap_2.dat': {
            'header': 'Content-Type: application/x-gzip',
            'content': gzip(textwrap.dedent("""
                <?xml version="1.0" encoding="UTF-8"?>
                <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                        xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">
                    <url>
                        <loc>{base_url}/news/baz.html</loc>
                        <news:news>
                            <news:publication>
                                <news:name>{publication_name}</news:name>
                                <news:language>{publication_language}</news:language>
                            </news:publication>
                            <news:publication_date>{publication_date}</news:publication_date>
                            <news:title><![CDATA[Bąž]]></news:title> <!-- CDATA and UTF-8 -->
                        </news:news>
                    </url>
                </urlset>
            """.format(
                base_url=self.__test_url,
                publication_name=self.TEST_PUBLICATION_NAME,
                publication_language=self.TEST_PUBLICATION_LANGUAGE,
                publication_date=self.TEST_DATE_STR,
            )).strip()),
        },
    }

    hs = HashServer(port=self.__test_port, pages=pages)
    hs.start()
    actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)
    hs.stop()

    # Don't do an in-depth check, we just need to make sure that gunzip works
    assert isinstance(actual_sitemap_tree, IndexRobotsTxtSitemap)
    assert len(actual_sitemap_tree.sub_sitemaps) == 2

    sitemap_1 = actual_sitemap_tree.sub_sitemaps[0]
    assert isinstance(sitemap_1, PagesXMLSitemap)
    assert len(sitemap_1.pages) == 1

    sitemap_2 = actual_sitemap_tree.sub_sitemaps[1]
    assert isinstance(sitemap_2, PagesXMLSitemap)
    assert len(sitemap_2.pages) == 1