def test_javascript_saving(http_params, xpi, server):
    """Verify that script content is stored under its sha256 digest.

    Visits the HTTP test page with ``save_content = "script"`` and then checks
    that every entry in the LevelDB content store is keyed by the sha256 of
    its value, and that the set of stored keys is exactly the expected one.

    ``xpi`` and ``server`` are requested as fixtures only; the body never
    references them.
    """
    test_url = utilities.BASE_TEST_URL + "/http_test_page.html"

    manager_params, browser_params = http_params()
    for params in browser_params:
        params.http_instrument = True
        params.save_content = "script"

    ldb_path = Path(manager_params.data_directory) / "content.ldb"
    manager = task_manager.TaskManager(
        manager_params,
        browser_params,
        SQLiteStorageProvider(
            db_path=manager_params.data_directory / "crawl-data.sqlite"
        ),
        LevelDbProvider(db_path=ldb_path),
    )
    manager.get(url=test_url, sleep=1)
    manager.close()

    # Hashes are crossed off as they are encountered in the content store.
    remaining = {
        "0110c0521088c74f179615cd7c404816816126fa657550032f75ede67a66c7cc",
        "b34744034cd61e139f85f6c4c92464927bed8343a7ac08acf9fb3c6796f80f08",
    }
    for raw_key, blob in db_utils.get_content(ldb_path):
        stored_hash = raw_key.decode("ascii").lower()
        computed_hash = sha256(blob).hexdigest().lower()
        # The stored key must be the sha256 of the stored content.
        assert computed_hash == stored_hash
        assert stored_hash in remaining
        remaining.remove(stored_hash)

    # Every expected hash must have been seen.
    assert not remaining
def test_document_saving(http_params, xpi, server):
    """Verify that document content is stored under its sha256 digest.

    Visits the HTTP test page with ``save_content = "main_frame,sub_frame"``
    and then checks that every entry in the LevelDB content store is keyed by
    the sha256 of its value, and that the stored keys match the expected set.

    ``xpi`` and ``server`` are requested as fixtures only; the body never
    references them.
    """
    test_url = utilities.BASE_TEST_URL + "/http_test_page.html"
    # Hashes are crossed off as they are encountered in the content store.
    pending_hashes = {
        "2390eceab422db15bc45940b7e042e83e6cbd5f279f57e714bc4ad6cded7f966",
        "25343f42d9ffa5c082745f775b172db87d6e14dfbc3160b48669e06d727bfc8d",
    }

    manager_params, browser_params = http_params()
    for config in browser_params:
        config.http_instrument = True
        config.save_content = "main_frame,sub_frame"

    sqlite_path = manager_params.data_directory / "crawl-data.sqlite"
    structured_storage = SQLiteStorageProvider(db_path=sqlite_path)
    ldb_path = Path(manager_params.data_directory) / "content.ldb"
    unstructured_storage = LevelDbProvider(db_path=ldb_path)

    manager = task_manager.TaskManager(
        manager_params, browser_params, structured_storage, unstructured_storage
    )
    manager.get(url=test_url, sleep=1)
    manager.close()

    for key_bytes, body in db_utils.get_content(ldb_path):
        key_hex = key_bytes.decode("ascii").lower()
        digest = sha256(body).hexdigest().lower()
        # The stored key must be the sha256 of the stored content.
        assert digest == key_hex
        assert key_hex in pending_hashes
        pending_hashes.remove(key_hex)

    # Every expected hash must have been seen.
    assert not pending_hashes
def unstructured_provider(
    request: Any, tmp_path_factory: Any
) -> UnstructuredStorageProvider:
    """Build the unstructured storage provider selected by ``request.param``.

    Args:
        request: Object carrying the selected provider in ``param``; asserted
            to be a ``FixtureRequest`` before an invalid value is reported.
        tmp_path_factory: Factory whose ``mktemp`` supplies a fresh directory
            for the disk-backed providers.

    Returns:
        A ``MemoryUnstructuredProvider``, a ``LevelDbProvider`` pointing at
        ``content.ldb`` inside a new temp directory, or a
        ``LocalGzipProvider`` rooted at a new temp directory.

    Any other ``request.param`` is reported through ``request.raiseerror``.
    """
    selected = request.param

    if selected == memory_unstructured:
        return MemoryUnstructuredProvider()

    if selected == leveldb:
        storage_dir = tmp_path_factory.mktemp(leveldb)
        return LevelDbProvider(storage_dir / "content.ldb")

    if selected == local_gzip:
        storage_dir = tmp_path_factory.mktemp(local_gzip)
        return LocalGzipProvider(storage_dir)

    # See https://github.com/pytest-dev/pytest/issues/8073 for why `request`
    # can't be type annotated; the assert narrows the type instead.
    assert isinstance(request, FixtureRequest)
    request.raiseerror("invalid internal test config")
def test_content_saving(http_params, xpi, server):
    """Verify that saved response bodies match the files they were served from.

    Visits the HTTP test page with ``save_content = True``. For each row of
    ``http_responses`` (skipping URLs containing ``MAGIC_REDIRECT`` or
    ``404``) the file at the URL's path under ``BASE_PATH`` is read, its
    sha256 is compared with the row's ``content_hash``, and the bytes are
    compared with the LevelDB entry stored under that hash.

    ``xpi`` and ``server`` are requested as fixtures only; the body never
    references them.
    """
    test_url = utilities.BASE_TEST_URL + "/http_test_page.html"

    manager_params, browser_params = http_params()
    for params in browser_params:
        params.http_instrument = True
        params.save_content = True

    db = manager_params.data_directory / "crawl-data.sqlite"
    ldb_path = Path(manager_params.data_directory) / "content.ldb"
    manager = task_manager.TaskManager(
        manager_params,
        browser_params,
        SQLiteStorageProvider(db_path=db),
        LevelDbProvider(db_path=ldb_path),
    )
    manager.get(url=test_url, sleep=1)
    manager.close()

    # Map sha256 hex digest -> bytes read from the on-disk test files.
    disk_content = {}
    for row in db_utils.query_db(db, "SELECT * FROM http_responses;"):
        url = row["url"]
        if "MAGIC_REDIRECT" in url or "404" in url:
            continue
        # Drop the leading "/" so the URL path joins beneath BASE_PATH.
        relative_path = urlparse(url).path[1:]
        with open(os.path.join(BASE_PATH, relative_path), "rb") as source_file:
            file_bytes = source_file.read()
        digest = sha256(file_bytes).hexdigest()
        assert digest == row["content_hash"]
        disk_content[digest] = file_bytes

    # Map decoded LevelDB key -> stored bytes.
    ldb_content = {
        key.decode("ascii"): stored
        for key, stored in db_utils.get_content(ldb_path)
    }

    for digest, file_bytes in disk_content.items():
        assert file_bytes == ldb_content[digest]