示例#1
0
def test_javascript_saving(http_params, xpi, server):
    """Check that javascript content is saved and hashed correctly.

    Every browser is configured with HTTP instrumentation on and
    ``save_content = "script"``, the HTTP test page is visited once, and
    the LevelDB content store is then compared against a fixed set of
    SHA-256 digests.

    ``xpi`` and ``server`` are not referenced in the body; presumably they
    are requested only for their fixture setup -- confirm in conftest.
    """
    # Digests that must each appear exactly once in the content store.
    remaining_hashes = {
        "0110c0521088c74f179615cd7c404816816126fa657550032f75ede67a66c7cc",
        "b34744034cd61e139f85f6c4c92464927bed8343a7ac08acf9fb3c6796f80f08",
    }
    page_url = utilities.BASE_TEST_URL + "/http_test_page.html"

    manager_params, browser_params = http_params()
    for params in browser_params:
        params.http_instrument = True
        params.save_content = "script"

    sqlite_path = manager_params.data_directory / "crawl-data.sqlite"
    ldb_path = Path(manager_params.data_directory) / "content.ldb"
    manager = task_manager.TaskManager(
        manager_params,
        browser_params,
        SQLiteStorageProvider(db_path=sqlite_path),
        LevelDbProvider(db_path=ldb_path),
    )
    manager.get(url=page_url, sleep=1)
    manager.close()

    for raw_key, body in db_utils.get_content(ldb_path):
        stored_hash = raw_key.decode("ascii").lower()
        computed_hash = sha256(body).hexdigest().lower()
        # The storage key must be the sha256 of the stored content.
        assert computed_hash == stored_hash
        assert stored_hash in remaining_hashes
        remaining_hashes.remove(stored_hash)
    # All expected hashes have been seen.
    assert not remaining_hashes
示例#2
0
def test_document_saving(http_params, xpi, server):
    """Check that document content is saved and hashed correctly.

    Every browser is configured with HTTP instrumentation on and
    ``save_content = "main_frame,sub_frame"``, the HTTP test page is
    visited once, and the LevelDB content store is then compared against
    a fixed set of SHA-256 digests.

    ``xpi`` and ``server`` are not referenced in the body; presumably they
    are requested only for their fixture setup -- confirm in conftest.
    """
    page_url = utilities.BASE_TEST_URL + "/http_test_page.html"
    # Digests that must each appear exactly once in the content store.
    remaining_hashes = {
        "2390eceab422db15bc45940b7e042e83e6cbd5f279f57e714bc4ad6cded7f966",
        "25343f42d9ffa5c082745f775b172db87d6e14dfbc3160b48669e06d727bfc8d",
    }

    manager_params, browser_params = http_params()
    for params in browser_params:
        params.http_instrument = True
        params.save_content = "main_frame,sub_frame"

    sqlite_path = manager_params.data_directory / "crawl-data.sqlite"
    ldb_path = Path(manager_params.data_directory) / "content.ldb"
    manager = task_manager.TaskManager(
        manager_params,
        browser_params,
        SQLiteStorageProvider(db_path=sqlite_path),
        LevelDbProvider(db_path=ldb_path),
    )
    manager.get(url=page_url, sleep=1)
    manager.close()

    for raw_key, body in db_utils.get_content(ldb_path):
        stored_hash = raw_key.decode("ascii").lower()
        computed_hash = sha256(body).hexdigest().lower()
        # The storage key must be the sha256 of the stored content.
        assert computed_hash == stored_hash
        assert stored_hash in remaining_hashes
        remaining_hashes.remove(stored_hash)
    # All expected hashes have been seen.
    assert not remaining_hashes
示例#3
0
def unstructured_provider(
        request: Any, tmp_path_factory: Any) -> UnstructuredStorageProvider:
    """Build the unstructured storage provider selected by ``request.param``.

    ``request.param`` is compared against the ``memory_unstructured``,
    ``leveldb`` and ``local_gzip`` names defined outside this block. The
    two on-disk providers get a fresh directory from ``tmp_path_factory``.
    Any other value is reported through ``request.raiseerror``.
    """
    selected = request.param

    if selected == memory_unstructured:
        return MemoryUnstructuredProvider()

    if selected == leveldb:
        return LevelDbProvider(
            tmp_path_factory.mktemp(leveldb) / "content.ldb")

    if selected == local_gzip:
        return LocalGzipProvider(tmp_path_factory.mktemp(local_gzip))

    # ``request`` is annotated as Any, so narrow it at runtime instead; see
    # https://github.com/pytest-dev/pytest/issues/8073 for why this can't
    # be type annotated.
    assert isinstance(request, FixtureRequest)
    request.raiseerror("invalid internal test config")
示例#4
0
def test_content_saving(http_params, xpi, server):
    """Check that content is saved and hashed correctly.

    With ``save_content = True`` on every browser, the HTTP test page is
    visited once. Each row of ``http_responses`` (except URLs containing
    ``MAGIC_REDIRECT`` or ``404``) is mapped to a file under ``BASE_PATH``;
    the file's SHA-256 must equal the row's ``content_hash`` and its bytes
    must equal the LevelDB entry stored under that hash.

    ``xpi`` and ``server`` are not referenced in the body; presumably they
    are requested only for their fixture setup -- confirm in conftest.
    """
    page_url = utilities.BASE_TEST_URL + "/http_test_page.html"

    manager_params, browser_params = http_params()
    for params in browser_params:
        params.http_instrument = True
        params.save_content = True

    sqlite_path = manager_params.data_directory / "crawl-data.sqlite"
    ldb_path = Path(manager_params.data_directory) / "content.ldb"
    manager = task_manager.TaskManager(
        manager_params,
        browser_params,
        SQLiteStorageProvider(db_path=sqlite_path),
        LevelDbProvider(db_path=ldb_path),
    )
    manager.get(url=page_url, sleep=1)
    manager.close()

    # Hash the on-disk source of every recorded response and compare it
    # with the hash stored in the structured database.
    expected_by_hash = {}
    responses = db_utils.query_db(sqlite_path, "SELECT * FROM http_responses;")
    for response in responses:
        url = response["url"]
        if "MAGIC_REDIRECT" in url or "404" in url:
            continue
        # Drop the first character of the URL path (the leading "/" for an
        # absolute path) so the join stays under BASE_PATH.
        relative_path = urlparse(url).path[1:]
        with open(os.path.join(BASE_PATH, relative_path), "rb") as source:
            file_bytes = source.read()
        digest = sha256(file_bytes).hexdigest()
        assert digest == response["content_hash"]
        expected_by_hash[digest] = file_bytes

    stored_by_hash = {
        raw_key.decode("ascii"): body
        for raw_key, body in db_utils.get_content(ldb_path)
    }

    # Every file served from disk must be stored verbatim under its hash.
    for digest, file_bytes in expected_by_hash.items():
        assert file_bytes == stored_by_hash[digest]