def test_archive_visit_with_no_artifact_found(swh_storage,
                                              requests_mock_datadir):
    """A visit whose artifact cannot be fetched ends up uneventful with a
    partial visit status, yet still records an (empty) snapshot."""
    origin_url = URL
    missing_artifact_url = "https://ftp.g.o/unknown/8sync-0.1.0.tar.gz"
    loader = ArchiveLoader(
        swh_storage,
        origin_url,
        artifacts=[
            {
                "time": 944729610,
                "url": missing_artifact_url,  # unknown artifact
                "length": 221837,
                "filename": "8sync-0.1.0.tar.gz",
                "version": "0.1.0",
            }
        ],
    )

    load_status = loader.load()
    assert load_status["status"] == "uneventful"
    assert load_status["snapshot_id"] is not None

    # nothing was ingested besides the origin, its visit and the snapshot
    expected_stats = {
        "content": 0,
        "directory": 0,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    }
    assert get_stats(swh_storage) == expected_stats

    assert_last_visit_matches(swh_storage, origin_url, status="partial", type="tar")
def test_archive_visit_no_time_for_tarball(swh_storage, requests_mock_datadir):
    """Artifacts lacking a "time" entry must still load successfully."""
    # deep-copy so the shared GNU_ARTIFACTS fixture is not mutated
    artifacts = copy.deepcopy(GNU_ARTIFACTS)
    for entry in artifacts:
        entry["time"] = None

    loader = ArchiveLoader(swh_storage, URL, artifacts=artifacts)

    load_status = loader.load()
    assert load_status["status"] == "eventful"

    assert_last_visit_matches(swh_storage, URL, status="full", type="tar")
def test_archive_snapshot_append(swh_storage, requests_mock_datadir):
    """Two successive loads with snapshot_append=True accumulate one
    release branch per artifact, HEAD always aliasing the newest one."""
    # first loading with a first artifact
    first_artifact = GNU_ARTIFACTS[0]
    loader = ArchiveLoader(swh_storage, URL, [first_artifact], snapshot_append=True)
    status = loader.load()
    assert status["status"] == "eventful"
    assert status["snapshot_id"] is not None
    assert_last_visit_matches(swh_storage, URL, status="full", type="tar")

    # check expected snapshot: HEAD plus one release branch
    first_branch = f"releases/{first_artifact['version']}".encode()
    snapshot = loader.last_snapshot()
    assert len(snapshot.branches) == 2
    assert b"HEAD" in snapshot.branches
    assert first_branch in snapshot.branches
    assert snapshot.branches[b"HEAD"].target == first_branch

    # second loading with a second artifact
    second_artifact = GNU_ARTIFACTS[1]
    loader = ArchiveLoader(swh_storage, URL, [second_artifact], snapshot_append=True)
    status = loader.load()
    assert status["status"] == "eventful"
    assert status["snapshot_id"] is not None
    assert_last_visit_matches(swh_storage, URL, status="full", type="tar")

    # check expected snapshot: the first branch is preserved, a new branch
    # appears and HEAD now aliases the newest release
    second_branch = f"releases/{second_artifact['version']}".encode()
    snapshot = loader.last_snapshot()
    assert len(snapshot.branches) == 3
    assert b"HEAD" in snapshot.branches
    assert second_branch in snapshot.branches
    assert first_branch in snapshot.branches
    assert snapshot.branches[b"HEAD"].target == second_branch
def test_archive_2_visits_without_change_not_gnu(swh_storage,
                                                 requests_mock_datadir):
    """Load a project archive (not gnu) ends up with 1 snapshot"""
    url = "https://something.else.org/8sync/"
    # this is not a gnu artifact
    artifacts = [
        {
            "time": "1999-12-09T09:53:30+00:00",  # it's also not a timestamp
            "sha256":
            "d5d1051e59b2be6f065a9fc6aedd3a391e44d0274b78b9bb4e2b57a09134dbe4",  # noqa
            # keep a gnu artifact reference to avoid adding other test files
            "url": "https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz",
            "length": 238466,
            "filename": "8sync-0.2.0.tar.gz",
            "version": "0.2.0",
        }
    ]

    # Here the loader defines the id_keys to use for existence in the snapshot
    loader = ArchiveLoader(
        swh_storage,
        url,
        artifacts=artifacts,
        extid_manifest_format="$sha256 $length $url",
    )

    first_status = loader.load()
    assert first_status["status"] == "eventful"
    assert first_status["snapshot_id"] is not None
    assert_last_visit_matches(swh_storage, url, status="full", type="tar")

    # a second, unchanged visit is uneventful and reuses the same snapshot
    second_status = loader.load()
    assert second_status["status"] == "uneventful"
    assert second_status["snapshot_id"] == first_status["snapshot_id"]
    assert_last_visit_matches(swh_storage, url, status="full", type="tar")

    # the tarball itself was only fetched once across both visits
    fetched = [
        m.url for m in requests_mock_datadir.request_history
        if m.url.startswith("https://ftp.gnu.org")
    ]
    assert len(fetched) == 1
def test_archive_not_gzipped_tarball(swh_storage, requests_mock,
                                     not_gzipped_tarball_bytes):
    """Check that a tarball erroneously marked as gzip compressed can still
    be downloaded and processed.

    """
    filename = "not_gzipped_tarball.tar.gz"
    # BUGFIX: the url must embed the artifact filename; the previous line was
    # an f-string with no placeholder containing a literal "(unknown)"
    # (scraping artifact), which also left `filename` unused in the url.
    url = f"https://example.org/ftp/{filename}"
    requests_mock.get(
        url,
        [
            # first response: decoding the mislabeled gzip stream raises...
            {
                "exc": ContentDecodingError,
            },
            # ...then the loader retries and gets the raw (uncompressed) bytes
            {
                "body": BytesIO(not_gzipped_tarball_bytes),
            },
        ],
    )
    loader = ArchiveLoader(
        swh_storage,
        url,
        artifacts=[{
            "time": 944729610,
            "url": url,
            "length": 221837,
            "filename": filename,
            "version": "0.1.0",
        }],
    )

    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"
    assert actual_load_status["snapshot_id"] is not None

    # the snapshot holds HEAD plus the release branch for version 0.1.0
    snapshot = loader.last_snapshot()
    assert len(snapshot.branches) == 2
    assert b"releases/0.1.0" in snapshot.branches
def test_archive_2_visits_without_change(swh_storage, requests_mock_datadir):
    """With no prior visit, load a gnu project ends up with 1 snapshot"""
    origin_url = URL
    loader = ArchiveLoader(swh_storage, origin_url, artifacts=GNU_ARTIFACTS[:1])

    first_status = loader.load()
    assert first_status["status"] == "eventful"
    assert first_status["snapshot_id"] is not None

    assert_last_visit_matches(swh_storage, origin_url, status="full", type="tar")

    # a second, unchanged visit is uneventful and targets the same snapshot
    second_status = loader.load()
    assert second_status["status"] == "uneventful"
    assert second_status["snapshot_id"] is not None
    assert first_status["snapshot_id"] == second_status["snapshot_id"]

    assert_last_visit_matches(swh_storage, origin_url, status="full", type="tar")

    # the tarball was downloaded only once across the two visits
    fetched = [
        m.url for m in requests_mock_datadir.request_history
        if m.url.startswith("https://ftp.gnu.org")
    ]
    assert len(fetched) == 1
def test_archive_snapshot_append_branch_override(swh_storage,
                                                 requests_mock_datadir):
    """Re-loading the same release version with a different tarball must
    override the existing release branch target in the appended snapshot."""
    # first loading for a first artifact
    base_artifact = GNU_ARTIFACTS[0]
    loader = ArchiveLoader(swh_storage, URL, [base_artifact], snapshot_append=True)
    status = loader.load()
    assert status["status"] == "eventful"
    assert status["snapshot_id"] is not None
    assert_last_visit_matches(swh_storage, URL, status="full", type="tar")

    # check expected snapshot and remember the branch target
    snapshot = loader.last_snapshot()
    assert len(snapshot.branches) == 2
    branch_name = f"releases/{base_artifact['version']}".encode()
    assert branch_name in snapshot.branches
    first_target = snapshot.branches[branch_name].target

    # second loading for a second artifact with same version as the first one
    # but with different tarball content
    overriding_artifact = dict(base_artifact)
    overriding_artifact["url"] = GNU_ARTIFACTS[1]["url"]
    overriding_artifact["time"] = GNU_ARTIFACTS[1]["time"]
    overriding_artifact["length"] = GNU_ARTIFACTS[1]["length"]
    loader = ArchiveLoader(
        swh_storage, URL, [overriding_artifact], snapshot_append=True)
    status = loader.load()
    assert status["status"] == "eventful"
    assert status["snapshot_id"] is not None
    assert_last_visit_matches(swh_storage, URL, status="full", type="tar")

    # same branch count and name as before, but the target changed
    snapshot = loader.last_snapshot()
    assert len(snapshot.branches) == 2
    assert branch_name in snapshot.branches
    second_target = snapshot.branches[branch_name].target

    assert first_target != second_target
# Example #8
def load_archive_files(**kwargs):
    """Load archive's artifacts (e.g gnu, etc...)

    Builds an ArchiveLoader from the configuration file and runs it,
    returning the load status dict.
    """
    return ArchiveLoader.from_configfile(**kwargs).load()
def test_load_nixguix_one_common_artifact_from_other_loader(
        swh_storage, datadir, requests_mock_datadir_visits, caplog):
    """Misformatted revision should be caught and logged, then loading continues"""
    caplog.set_level(logging.ERROR, "swh.loader.package.nixguix.loader")

    # 1. first ingest with for example the archive loader
    gnu_url = "https://ftp.gnu.org/gnu/8sync/"
    release = "0.1.0"
    artifact_url = f"https://ftp.gnu.org/gnu/8sync/8sync-{release}.tar.gz"
    gnu_artifacts = [{
        "time": 944729610,
        "url": artifact_url,
        "length": 221837,
        "filename": f"8sync-{release}.tar.gz",
        "version": release,
    }]
    archive_loader = ArchiveLoader(swh_storage,
                                   url=gnu_url,
                                   artifacts=gnu_artifacts)
    actual_load_status = archive_loader.load()
    expected_snapshot_id = "9efecc835e8f99254934f256b5301b94f348fd17"
    assert actual_load_status["status"] == "eventful"
    assert actual_load_status["snapshot_id"] == expected_snapshot_id  # noqa

    assert_last_visit_matches(
        archive_loader.storage,
        gnu_url,
        status="full",
        type="tar",
        snapshot=hash_to_bytes(expected_snapshot_id),
    )

    # 2. Then ingest with the nixguix loader which lists the same artifact within its
    # sources.json

    # ensure test setup is ok: the artifact ingested above must be listed in
    # the nixguix sources file
    data_sources = os.path.join(datadir, "https_nix-community.github.io",
                                "nixpkgs-swh_sources_special.json")
    # use a context manager instead of a leaked open() handle
    with open(data_sources) as f:
        all_sources = json.load(f)
    found = any(
        source["urls"][0] == artifact_url
        for source in all_sources["sources"]
    )
    # BUGFIX: the assertion must run AFTER the scan; it was previously
    # indented inside the matching branch, so a missing artifact silently
    # skipped the setup check instead of failing it.
    assert found is True, (
        f"test setup error: {artifact_url} must be in {data_sources}")

    # first visit with a snapshot, ok
    sources_url = "https://nix-community.github.io/nixpkgs-swh/sources_special.json"
    loader = NixGuixLoader(swh_storage, sources_url)
    actual_load_status2 = loader.load()
    assert actual_load_status2["status"] == "eventful"

    snapshot_id = actual_load_status2["snapshot_id"]

    assert_last_visit_matches(
        swh_storage,
        sources_url,
        status="full",
        type="nixguix",
        snapshot=hash_to_bytes(snapshot_id),
    )

    snapshot = snapshot_get_all_branches(swh_storage,
                                         hash_to_bytes(snapshot_id))
    assert snapshot
def test_archive_2_visits_with_new_artifact(swh_storage,
                                            requests_mock_datadir):
    """With no prior visit, load a gnu project ends up with 1 snapshot"""
    url = URL
    first_artifact = GNU_ARTIFACTS[0]
    loader = ArchiveLoader(swh_storage, url, [first_artifact])

    first_status = loader.load()
    assert first_status["status"] == "eventful"
    assert first_status["snapshot_id"] is not None

    assert_last_visit_matches(swh_storage, url, status="full", type="tar")

    stats = get_stats(swh_storage)
    assert stats == {
        "content": len(_expected_new_contents_first_visit),
        "directory": len(_expected_new_directories_first_visit),
        "origin": 1,
        "origin_visit": 1,
        "release": len(_expected_new_releases_first_visit),
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    }

    # the first artifact was fetched exactly once
    gnu_urls = [
        m.url for m in requests_mock_datadir.request_history
        if m.url.startswith("https://ftp.gnu.org")
    ]
    assert len(gnu_urls) == 1

    second_artifact = GNU_ARTIFACTS[1]

    loader2 = ArchiveLoader(swh_storage, url, [first_artifact, second_artifact])
    assert stats == get_stats(swh_storage)  # ensure we share the storage

    second_status = loader2.load()
    assert second_status["status"] == "eventful"
    assert second_status["snapshot_id"] is not None

    # second visit adds the new artifact's objects on top of the first ones
    assert get_stats(swh_storage) == {
        "content": len(_expected_new_contents_first_visit) + 14,
        "directory": len(_expected_new_directories_first_visit) + 8,
        "origin": 1,
        "origin_visit": 1 + 1,
        "release": len(_expected_new_releases_first_visit) + 1,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1 + 1,
    }

    assert_last_visit_matches(swh_storage, url, status="full", type="tar")

    gnu_urls = [
        m.url for m in requests_mock_datadir.request_history
        if m.url.startswith("https://ftp.gnu.org")
    ]
    # 1 artifact (2nd time no modification) + 1 new artifact
    assert len(gnu_urls) == 2
def test_archive_visit_with_release_artifact_no_prior_visit(
        swh_storage, requests_mock_datadir):
    """With no prior visit, load a gnu project ends up with 1 snapshot"""
    loader = ArchiveLoader(swh_storage, URL, artifacts=GNU_ARTIFACTS[:1])

    load_status = loader.load()
    assert load_status["status"] == "eventful"

    first_snapshot_id = hash_to_bytes(
        "9efecc835e8f99254934f256b5301b94f348fd17")
    assert load_status["snapshot_id"] == hash_to_hex(first_snapshot_id)

    assert_last_visit_matches(swh_storage, URL, status="full", type="tar")

    expected_stats = {
        "content": len(_expected_new_contents_first_visit),
        "directory": len(_expected_new_directories_first_visit),
        "origin": 1,
        "origin_visit": 1,
        "release": len(_expected_new_releases_first_visit),
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    }
    assert get_stats(swh_storage) == expected_stats

    # the snapshot holds the release branch plus a HEAD alias to it
    release_id = hash_to_bytes(list(_expected_new_releases_first_visit)[0])
    check_snapshot(
        Snapshot(
            id=first_snapshot_id,
            branches={
                b"HEAD": SnapshotBranch(
                    target_type=TargetType.ALIAS,
                    target=b"releases/0.1.0",
                ),
                b"releases/0.1.0": SnapshotBranch(
                    target_type=TargetType.RELEASE,
                    target=release_id,
                ),
            },
        ),
        swh_storage,
    )

    expected_release = Release(
        id=release_id,
        name=b"0.1.0",
        message=(b"Synthetic release for archive at "
                 b"https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz\n"),
        target=hash_to_bytes("3aebc29ed1fccc4a6f2f2010fb8e57882406b528"),
        target_type=ObjectType.DIRECTORY,
        synthetic=True,
        author=Person.from_fullname(b""),
        date=TimestampWithTimezone.from_datetime(
            datetime.datetime(
                1999, 12, 9, 8, 53, 30, tzinfo=datetime.timezone.utc)),
    )
    assert swh_storage.release_get([release_id])[0] == expected_release

    # every expected content, directory and release is now in the storage
    assert not list(swh_storage.content_missing_per_sha1(
        map(hash_to_bytes, _expected_new_contents_first_visit)))
    assert not list(swh_storage.directory_missing(
        map(hash_to_bytes, _expected_new_directories_first_visit)))
    assert not list(swh_storage.release_missing(
        map(hash_to_bytes, _expected_new_releases_first_visit)))