def test_loader_incremental(swh_storage, requests_mock_datadir):
    """Ensure a second visit does not download artifacts already
    downloaded by the previous visit.

    """
    loader = NixGuixLoader(swh_storage, sources_url)
    load_status = loader.load()

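    # second visit: the artifact ingested during the first visit should not
    # be downloaded again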
    loader.load()
    assert load_status == {
        "status": "eventful",
        "snapshot_id": SNAPSHOT1.id.hex()
    }

    assert_last_visit_matches(
        swh_storage,
        sources_url,
        status="partial",
        type="nixguix",
        snapshot=SNAPSHOT1.id,
    )

    check_snapshot(SNAPSHOT1, storage=swh_storage)

    urls = [
        m.url
        for m in requests_mock_datadir.request_history
        if m.url == "https://github.com/owner-1/repository-1/revision-1.tgz"
    ]
    # The artifact
    # 'https://github.com/owner-1/repository-1/revision-1.tgz' is only
    # fetched once across the two visits
    assert len(urls) == 1


def test_eoferror(swh_storage, requests_mock_datadir):
    """Load a truncated, invalid archive so that the uncompress function
    raises EOFError. We then check that a snapshot is created anyway,
    meaning this error is handled gracefully.

    """
    sources = (
        "https://nix-community.github.io/nixpkgs-swh/sources-EOFError.json"  # noqa
    )
    loader = NixGuixLoader(swh_storage, sources)
    loader.load()

    expected_snapshot = Snapshot(
        id=hash_to_bytes("4257fa2350168c6bfec726a06452ea27a2c0cb33"),
        branches={
            b"evaluation":
            SnapshotBranch(
                target=hash_to_bytes(
                    "cc4e04c26672dd74e5fd0fecb78b435fb55368f7"),
                target_type=TargetType.REVISION,
            ),
        },
    )

    check_snapshot(expected_snapshot, storage=swh_storage)


def test_nixguix_url_not_found(swh_storage, requests_mock_datadir):
    """When the sources url cannot be read, the visit is marked as not_found.

    Here the sources url does not exist, so requests_mock_datadir returns a
    404, and NotFound is raised within the package loader's main loop.

    This leaves the load with status "failed" and the visit status set to
    "not_found".

    """
    unknown_url = "https://non-existing-url/"
    loader = NixGuixLoader(swh_storage, unknown_url)
    # the failure happens during the sources retrieval step
    load_status = loader.load()

    assert load_status == {"status": "failed"}

    assert_last_visit_matches(swh_storage,
                              unknown_url,
                              status="not_found",
                              type="nixguix",
                              snapshot=None)

    assert len(requests_mock_datadir.request_history) == 1
    assert requests_mock_datadir.request_history[0].url == unknown_url


def test_evaluation_branch(swh_storage, requests_mock_datadir):
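    """The evaluation branch is expected to be part of the snapshot
    (SNAPSHOT1) created by the load."""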
    loader = NixGuixLoader(swh_storage, sources_url)
    res = loader.load()
    assert res["status"] == "eventful"

    assert_last_visit_matches(
        swh_storage,
        sources_url,
        status="partial",
        type="nixguix",
        snapshot=SNAPSHOT1.id,
    )

    check_snapshot(SNAPSHOT1, storage=swh_storage)


def test_nixguix_url_with_decoding_error(swh_storage, requests_mock_datadir):
    """On a decoding error while reading the sources url, the visit is marked
    as failed.

    requests_mock_datadir intercepts the request to sources_url. Since the
    file exists, it returns a 200 with the file content. As file.txt is not
    json, decoding fails with a JSONDecodeError, failing the visit.

    """
    sources_url = "https://example.com/file.txt"
    loader = NixGuixLoader(swh_storage, sources_url)
    load_status = loader.load()

    assert load_status == {"status": "failed"}

    assert_last_visit_matches(swh_storage,
                              sources_url,
                              status="failed",
                              type="nixguix",
                              snapshot=None)

    assert len(requests_mock_datadir.request_history) == 1
    assert requests_mock_datadir.request_history[0].url == sources_url


def test_uncompress_failure(swh_storage, requests_mock_datadir):
    """Non-tarball files are currently not supported, so the uncompress
    function fails on such files.

    However, even in this failure case (triggered here by the url
    https://example.com/file.txt), a snapshot and a visit must still be
    created, with a "partial" status since not all files are archived.

    """
    loader = NixGuixLoader(swh_storage, sources_url)
    loader_status = loader.load()

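    # sanity check: the non-tarball url is indeed part of the sources the
    # loader processed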
    sources = loader.supported_sources()["sources"]
    urls = [s["urls"][0] for s in sources]
    assert "https://example.com/file.txt" in urls
    assert loader_status["status"] == "eventful"

    # The visit is partial because urls pointing to non-tarball files
    # are not handled yet
    assert_last_visit_matches(swh_storage,
                              sources_url,
                              status="partial",
                              type="nixguix")


def test_raise_exception(swh_storage, requests_mock_datadir, mocker):
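    """Artifact download failures must not abort the whole visit: the load
    remains eventful and yields SNAPSHOT1, with a partial visit status.

    fake_download (defined elsewhere in this module) simulates download
    failures when patched in place of the real download function.

    """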
    mock_download = mocker.patch("swh.loader.package.loader.download")
    mock_download.side_effect = fake_download

    loader = NixGuixLoader(swh_storage, sources_url)
    res = loader.load()

    assert res == {
        "status": "eventful",
        "snapshot_id": SNAPSHOT1.id.hex(),
    }

    # The visit is partial because some artifact downloads failed
    assert_last_visit_matches(
        swh_storage,
        sources_url,
        status="partial",
        type="nixguix",
        snapshot=SNAPSHOT1.id,
    )

    check_snapshot(SNAPSHOT1, storage=swh_storage)

    assert len(mock_download.mock_calls) == 3


def test_load_nixguix_one_common_artifact_from_other_loader(
        swh_storage, datadir, requests_mock_datadir_visits, caplog):
    """Misformatted revision should be caught and logged, then loading continues"""
    caplog.set_level(logging.ERROR, "swh.loader.package.nixguix.loader")

    # 1. First ingest the common artifact with, for example, the archive
    # loader
    gnu_url = "https://ftp.gnu.org/gnu/8sync/"
    release = "0.1.0"
    artifact_url = f"https://ftp.gnu.org/gnu/8sync/8sync-{release}.tar.gz"
    gnu_artifacts = [{
        "time": 944729610,
        "url": artifact_url,
        "length": 221837,
        "filename": f"8sync-{release}.tar.gz",
        "version": release,
    }]
    archive_loader = ArchiveLoader(swh_storage,
                                   url=gnu_url,
                                   artifacts=gnu_artifacts)
    actual_load_status = archive_loader.load()
    expected_snapshot_id = "9efecc835e8f99254934f256b5301b94f348fd17"
    assert actual_load_status["status"] == "eventful"
    assert actual_load_status["snapshot_id"] == expected_snapshot_id  # noqa

    assert_last_visit_matches(
        archive_loader.storage,
        gnu_url,
        status="full",
        type="tar",
        snapshot=hash_to_bytes(expected_snapshot_id),
    )

    # 2. Then ingest with the nixguix loader which lists the same artifact within its
    # sources.json

    # ensure test setup is ok
    data_sources = os.path.join(datadir, "https_nix-community.github.io",
                                "nixpkgs-swh_sources_special.json")
    with open(data_sources) as f:
        all_sources = json.load(f)
    found = False
    for source in all_sources["sources"]:
        if source["urls"][0] == artifact_url:
            found = True
    # the assertion must run after the loop; nested inside the `if` branch it
    # could never fail, since it only executed once the artifact was found
    assert (
        found is True
    ), f"test setup error: {artifact_url} must be in {data_sources}"

    # the nixguix load succeeds and produces a full visit with a snapshot
    sources_url = "https://nix-community.github.io/nixpkgs-swh/sources_special.json"
    loader = NixGuixLoader(swh_storage, sources_url)
    actual_load_status2 = loader.load()
    assert actual_load_status2["status"] == "eventful"

    snapshot_id = actual_load_status2["snapshot_id"]

    assert_last_visit_matches(
        swh_storage,
        sources_url,
        status="full",
        type="nixguix",
        snapshot=hash_to_bytes(snapshot_id),
    )

    snapshot = snapshot_get_all_branches(swh_storage,
                                         hash_to_bytes(snapshot_id))
    assert snapshot


def test_loader_two_visits(swh_storage, requests_mock_datadir_visits):
    """Ensure there is only one origin but two visits, each with its own
    snapshot.

    The first visit creates a snapshot containing one tarball. The
    second visit creates a snapshot containing the same tarball and
    another tarball.

    """
    loader = NixGuixLoader(swh_storage, sources_url)
    load_status = loader.load()
    assert load_status == {
        "status": "eventful",
        "snapshot_id": SNAPSHOT1.id.hex()
    }

    assert_last_visit_matches(
        swh_storage,
        sources_url,
        status="partial",
        type="nixguix",
        snapshot=SNAPSHOT1.id,
    )

    check_snapshot(SNAPSHOT1, storage=swh_storage)

    stats = get_stats(swh_storage)
    assert {
        "content": 1,
        "directory": 3,
        "origin": 1,
        "origin_visit": 1,
        "release": 2,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

    loader = NixGuixLoader(swh_storage, sources_url)
    load_status = loader.load()
    expected_snapshot_id_hex = "c1983a0a3f647548e1fb92f30339da6848fe9f7a"
    expected_snapshot_id = hash_to_bytes(expected_snapshot_id_hex)
    assert load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id_hex,
    }

    assert_last_visit_matches(
        swh_storage,
        sources_url,
        status="partial",
        type="nixguix",
        snapshot=expected_snapshot_id,
    )

    # This ensures visits are incremental. Indeed, when an url is requested
    # a second time, the requests_mock_datadir_visits fixture serves the
    # file suffixed with `_visit1`.
    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"evaluation":
            SnapshotBranch(
                target=hash_to_bytes(
                    "602140776b2ce6c9159bcf52ada73a297c063d5e"),
                target_type=TargetType.REVISION,
            ),
            b"https://github.com/owner-1/repository-1/revision-1.tgz":
            SnapshotBranch(
                target=hash_to_bytes(
                    "df7811b9644ed8ef088e2e7add62ed32b0bab15f"),
                target_type=TargetType.RELEASE,
            ),
            b"https://github.com/owner-2/repository-1/revision-1.tgz":
            SnapshotBranch(
                target=hash_to_bytes(
                    "5cc0115cd643902b837cb6cfbc9f5865bc5a7cb2"),
                target_type=TargetType.RELEASE,
            ),
        },
    )
    check_snapshot(expected_snapshot, storage=swh_storage)

    stats = get_stats(swh_storage)
    assert {
        "content": 2,
        "directory": 5,
        "origin": 1,
        "origin_visit": 2,
        "release": 3,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 2,
    } == stats


def test_loader_one_visit(swh_storage, requests_mock_datadir, raw_sources):
    loader = NixGuixLoader(swh_storage, sources_url)
    load_status = loader.load()
    expected_snapshot_id = SNAPSHOT1.id
    expected_snapshot_id_hex = expected_snapshot_id.hex()
    assert load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id_hex,
    }

    release_id = SNAPSHOT1.branches[
        b"https://github.com/owner-1/repository-1/revision-1.tgz"].target
    check_snapshot(SNAPSHOT1, storage=swh_storage)

    assert swh_storage.release_get([release_id])[0] == Release(
        id=release_id,
        name=b"https://github.com/owner-1/repository-1/revision-1.tgz",
        message=None,
        target=hash_to_bytes("4de2e07d3742718d928e974b8a4c721b9f7b33bf"),
        target_type=ObjectType.DIRECTORY,
        synthetic=True,
        author=Person.from_fullname(b""),
        date=None,
    )

    stats = get_stats(swh_storage)
    assert {
        "content": 1,
        "directory": 3,
        "origin": 1,
        "origin_visit": 1,
        "release": 2,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

    # The visit is partial because urls pointing to non-tarball files
    # are not handled yet
    assert_last_visit_matches(swh_storage,
                              sources_url,
                              status="partial",
                              type="nixguix")

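    # the raw sources.json must also be recorded as extrinsic metadata
    # targeting the snapshot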
    visit_status = origin_get_latest_visit_status(swh_storage, sources_url)
    snapshot_swhid = ExtendedSWHID(object_type=ExtendedObjectType.SNAPSHOT,
                                   object_id=visit_status.snapshot)
    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url=sources_url,
    )
    expected_metadata = [
        RawExtrinsicMetadata(
            target=snapshot_swhid,
            authority=metadata_authority,
            fetcher=MetadataFetcher(
                name="swh.loader.package.nixguix.loader.NixGuixLoader",
                version=__version__,
            ),
            discovery_date=loader.visit_date,
            format="nixguix-sources-json",
            metadata=raw_sources,
            origin=sources_url,
        )
    ]
    assert swh_storage.raw_extrinsic_metadata_get(
        snapshot_swhid,
        metadata_authority,
    ) == PagedResult(
        next_page_token=None,
        results=expected_metadata,
    )


def load_nixguix(**kwargs):
    """Load functional (e.g. guix/nix) package"""
    return NixGuixLoader.from_configfile(**kwargs).load()
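

# A minimal usage sketch, assuming a loader configuration file is reachable
# by from_configfile and that the url points at a sources.json file (the url
# below is hypothetical):
#
#   load_nixguix(url="https://nix-community.github.io/nixpkgs-swh/sources.json")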