Example #1
def test_multi_hash_path(hash_test_data):
    with tempfile.NamedTemporaryFile(delete=False) as f:
        f.write(hash_test_data.data)

    hashes = MultiHash.from_path(f.name).digest()
    os.remove(f.name)

    assert hash_test_data.checksums == hashes
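
Several of the examples below use alternative constructors (from_data, from_file) over the same kind of bytes; a minimal sketch of how they relate (assuming only swh.model.hashutil is available):

import io

from swh.model.hashutil import MultiHash

data = b"some content"

# from_data hashes an in-memory byte string; from_file streams a file-like
# object and needs the total length up front (the git-style sha1_git hash
# includes the length in its header).
from_data_digests = MultiHash.from_data(data).digest()
from_file_digests = MultiHash.from_file(io.BytesIO(data), length=len(data)).digest()
assert from_data_digests == from_file_digests
# from_path(filename), used in the test above, behaves like from_file over the
# file's contents.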
Example #2
    def content_add(self, content: List[Content]) -> Dict[str, int]:
        for cont in content:
            hashes = MultiHash.from_data(cont.data).digest()
            if hashes != cont.hashes():
                raise StorageArgumentException(
                    f"Object has hashes {cont.hashes()}, but they should be {hashes}"
                )
        return self.storage.content_add(content)
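
A minimal sketch of the invariant this proxy enforces, assuming swh.model.model.Content.from_data (which fills in the checksums from the given data):

from swh.model.hashutil import MultiHash
from swh.model.model import Content

cont = Content.from_data(b"foo")
# The proxy recomputes the digests from cont.data and compares them with the
# checksums carried by the model object itself.
assert MultiHash.from_data(cont.data).digest() == cont.hashes()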
Example #3
def test_multi_hash_file_hexdigest_with_md5(hash_test_data):
    fobj = io.BytesIO(hash_test_data.data)
    length = len(hash_test_data.data)
    checksums = MultiHash.from_file(fobj,
                                    hash_names=DEFAULT_ALGORITHMS | {"md5"},
                                    length=length).hexdigest()
    md5sum = {"md5": hashlib.md5(hash_test_data.data).hexdigest()}
    assert checksums == {**hash_test_data.hex_checksums, **md5sum}
Example #4
def _init_content(uuid):
    """Given a uuid, initialize a content

    """
    return {
        "id": MultiHash.from_data(uuid.bytes, {"sha1"}).digest()["sha1"],
        "indexer_configuration_id": 1,
    }
Example #5
    def download(self, url):
        """Download the remote tarball url locally.

        Args:
            url (str): Url (file or http*)

        Raises:
            ValueError in case of failing to query

        Returns:
            Tuple of local (filepath, hashes of filepath)

        """
        url_parsed = urlparse(url)
        if url_parsed.scheme == 'file':
            path = url_parsed.path
            response = LocalResponse(path)
            length = os.path.getsize(path)
        else:
            response = self.session.get(url, **self.params, stream=True)
            if response.status_code != 200:
                raise ValueError("Fail to query '%s'. Reason: %s" % (
                    url, response.status_code))
            length = int(response.headers['content-length'])

        filepath = os.path.join(self.temp_directory, os.path.basename(url))

        h = MultiHash(length=length)
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
                h.update(chunk)
                f.write(chunk)

        actual_length = os.path.getsize(filepath)
        if length != actual_length:
            raise ValueError('Error when checking size: %s != %s' % (
                length, actual_length))

        hashes = {
            'length': length,
            **h.hexdigest()
        }
        return filepath, hashes
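
The streaming pattern above (a MultiHash instance fed chunk by chunk via update()) also works outside of download(); a minimal sketch with an in-memory chunk list standing in for the HTTP response:

from swh.model.hashutil import MultiHash

chunks = [b"some ", b"tarball ", b"bytes"]  # stand-in for response.iter_content()
length = sum(len(chunk) for chunk in chunks)

# The length must be known up front because sha1_git hashes a header
# containing the total size before the data itself.
h = MultiHash(length=length)
for chunk in chunks:
    h.update(chunk)

hashes = {"length": length, **h.hexdigest()}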
Example #6
def test_multi_hash_data_with_length(hash_test_data):
    expected_checksums = hash_test_data.checksums.copy()
    expected_checksums["length"] = len(hash_test_data.data)

    algos = set(["length"]).union(hashutil.DEFAULT_ALGORITHMS)
    checksums = MultiHash.from_data(hash_test_data.data,
                                    hash_names=algos).digest()

    assert checksums == expected_checksums
    assert "length" in checksums
Example #7
def dulwich_blob_to_content_id(obj: ShaFile) -> Dict[str, Any]:
    """Convert a dulwich blob to a Software Heritage content id"""
    if obj.type_name != b"blob":
        raise ValueError("Argument is not a blob.")
    blob = cast(Blob, obj)

    size = blob.raw_length()
    data = blob.as_raw_string()
    hashes = MultiHash.from_data(data, DEFAULT_ALGORITHMS).digest()
    if hashes["sha1_git"] != blob.sha().digest():
        raise HashMismatch(
            f"Expected Content hash to be {blob.sha().digest().hex()}, "
            f"got {hashes['sha1_git'].hex()}")
    hashes["length"] = size
    return hashes
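
The sha1_git digest is git's own object id for a blob, i.e. SHA-1 over a "blob <length>\0" header followed by the data, which is why it can be compared with blob.sha().digest() above; a minimal sketch using only hashlib:

import hashlib

from swh.model.hashutil import MultiHash

data = b"foo"
git_blob = (b"blob %d\x00" % len(data)) + data
assert (MultiHash.from_data(data, {"sha1_git"}).digest()["sha1_git"]
        == hashlib.sha1(git_blob).digest())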
Example #8
def _gen_skipped_contents(n=10):
    # we do not use the hypothesis strategy here because this does not play well with
    # pytest fixtures, and it makes test execution very slow
    algos = DEFAULT_ALGORITHMS | {"length"}
    now = datetime.datetime.now(tz=UTC)
    return [
        _updated(
            MultiHash.from_data(data=f"foo{i}".encode(),
                                hash_names=algos).digest(),
            {
                "status": "absent",
                "reason": "why not",
                "origin": f"https://somewhere/{i}",
                "ctime": now,
            },
        ) for i in range(n)
    ]
Example #9
    def add_file(self, path, sha1=None):
        path_parts = path.split(b"/")
        sha1 = (
            hash_to_bytes(sha1) if sha1 else MultiHash.from_data(path).digest()["sha1"]
        )
        if len(path_parts) == 1:
            self["entry_idx"][path] = len(self["entries"])
            self["entries"].append(
                {
                    "target": sha1,
                    "name": path,
                    "perms": DentryPerms.content,
                    "type": "file",
                }
            )
        else:
            if path_parts[0] not in self["entry_idx"]:
                self["entry_idx"][path_parts[0]] = len(self["entries"])
                self["entries"].append(DirectoryModel(path_parts[0]))
            if path_parts[1]:
                dir_idx = self["entry_idx"][path_parts[0]]
                self["entries"][dir_idx].add_file(b"/".join(path_parts[1:]), sha1)
Example #10
            "debian-package": "python3-pygments",
            "max_content_size": 10240,
        },
    },
    {
        "tool_name": "nomos",
        "tool_version": "3.1.0rc2-31-ga2cbb8c",
        "tool_configuration": {
            "command_line": "nomossa <filepath>"
        },
    },
]

MIMETYPE_OBJECTS = [
    {
        "id": MultiHash.from_data(uuid1().bytes, {"sha1"}).digest()["sha1"],
        "mimetype": mt,
        "encoding": enc,
        # 'indexer_configuration_id' will be added after TOOLS get registered
    } for mt in MIMETYPES for enc in ENCODINGS
]

LICENSES = [
    b"3DFX",
    b"BSD",
    b"GPL",
    b"Apache2",
    b"MIT",
]

FOSSOLOGY_LICENSES = [
Example #11
    Snapshot,
    SnapshotBranch,
    TargetType,
    Timestamp,
    TimestampWithTimezone,
)
from swh.model.swhids import ExtendedSWHID

UTC = datetime.timezone.utc

CONTENTS = [
    Content(
        length=4,
        data=f"foo{i}".encode(),
        status="visible",
        **MultiHash.from_data(f"foo{i}".encode()).digest(),
    ) for i in range(10)
] + [
    Content(
        length=14,
        data=f"forbidden foo{i}".encode(),
        status="hidden",
        **MultiHash.from_data(f"forbidden foo{i}".encode()).digest(),
    ) for i in range(10)
]

SKIPPED_CONTENTS = [
    SkippedContent(
        length=4,
        status="absent",
        reason=f"because chr({i}) != '*'",
Example #12
def download(
    url: str,
    dest: str,
    hashes: Dict = {},
    filename: Optional[str] = None,
    auth: Optional[Tuple[str, str]] = None,
    extra_request_headers: Optional[Dict[str, str]] = None,
) -> Tuple[str, Dict]:
    """Download a remote tarball from url, uncompresses and computes swh hashes
       on it.

    Args:
        url: Artifact uri to fetch, uncompress and hash
        dest: Directory to write the archive to
        hashes: Dict of expected hashes (key is the hash algo) for the artifact
            to download (those hashes are expected to be hex string)
        auth: Optional tuple of login/password (for http authentication
            service, e.g. deposit)

    Raises:
        ValueError in case of any error when fetching/computing (length,
        checksums mismatched...)

    Returns:
        Tuple of local (filepath, hashes of filepath)

    """
    params = copy.deepcopy(DEFAULT_PARAMS)
    if auth is not None:
        params["auth"] = auth
    if extra_request_headers is not None:
        params["headers"].update(extra_request_headers)
    # so the connection does not hang indefinitely (read/connection timeout)
    timeout = params.get("timeout", 60)

    if url.startswith("ftp://"):
        response = urlopen(url, timeout=timeout)
        chunks = (response.read(HASH_BLOCK_SIZE) for _ in itertools.count())
        response_data = itertools.takewhile(bool, chunks)
    else:
        response = requests.get(url, **params, timeout=timeout, stream=True)
        if response.status_code != 200:
            raise ValueError("Fail to query '%s'. Reason: %s" %
                             (url, response.status_code))
        # update URL to response one as requests follow redirection by default
        # on GET requests
        url = response.url
        # try to extract filename from content-disposition header if available
        if filename is None and "content-disposition" in response.headers:
            filename = _content_disposition_filename(
                response.headers["content-disposition"])
        response_data = response.iter_content(chunk_size=HASH_BLOCK_SIZE)

    filename = filename if filename else os.path.basename(urlsplit(url).path)

    logger.debug("filename: %s", filename)
    filepath = os.path.join(dest, filename)
    logger.debug("filepath: %s", filepath)

    h = MultiHash(hash_names=DOWNLOAD_HASHES | set(hashes.keys()))
    with open(filepath, "wb") as f:
        for chunk in response_data:
            h.update(chunk)
            f.write(chunk)

    response.close()

    # Also check the expected hashes if provided
    if hashes:
        actual_hashes = h.hexdigest()
        for algo_hash in hashes.keys():
            actual_digest = actual_hashes[algo_hash]
            expected_digest = hashes[algo_hash]
            if actual_digest != expected_digest:
                raise ValueError("Failure when fetching %s. "
                                 "Checksum mismatched: %s != %s" %
                                 (url, expected_digest, actual_digest))

    computed_hashes = h.hexdigest()
    length = computed_hashes.pop("length")
    extrinsic_metadata = {
        "length": length,
        "filename": filename,
        "checksums": computed_hashes,
        "url": url,
    }

    logger.debug("extrinsic_metadata", extrinsic_metadata)

    return filepath, extrinsic_metadata
Example #13
def test_multi_hash_file_bytehexdigest(hash_test_data):
    fobj = io.BytesIO(hash_test_data.data)
    length = len(hash_test_data.data)
    checksums = MultiHash.from_file(fobj, length=length).bytehexdigest()
    assert checksums == hash_test_data.bytehex_checksums
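
digest(), hexdigest() and bytehexdigest() expose the same checksums in three encodings (raw bytes, hex str, hex bytes respectively, judging from the fixtures used in these tests); a minimal sketch:

from swh.model.hashutil import MultiHash

h = MultiHash.from_data(b"foo", hash_names={"sha1"})
raw = h.digest()["sha1"]              # raw digest bytes
hexstr = h.hexdigest()["sha1"]        # lowercase hex string
hexbytes = h.bytehexdigest()["sha1"]  # the same hex, as ASCII bytes

assert hexstr == raw.hex()
assert hexbytes == hexstr.encode("ascii")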
Example #14
def test_multi_hash_data_unknown_hash(hash_test_data):
    with pytest.raises(ValueError,
                       match="Unexpected hashing algorithm.*unknown-hash"):
        MultiHash.from_data(hash_test_data.data, ["unknown-hash"])
Example #15
def test_multi_hash_data(hash_test_data):
    checksums = MultiHash.from_data(hash_test_data.data).digest()
    assert checksums == hash_test_data.checksums
    assert "length" not in checksums
Example #16
def test_multi_hash_file_missing_length(hash_test_data):
    fobj = io.BytesIO(hash_test_data.data)
    with pytest.raises(ValueError, match="Missing length"):
        MultiHash.from_file(fobj, hash_names=["sha1_git"])
Example #17
def origin_url_to_sha1(origin_url: str) -> bytes:
    """Convert an origin URL to a sha1. Encodes URL to utf-8."""
    return MultiHash.from_data(origin_url.encode("utf-8"), {"sha1"}).digest()["sha1"]
Example #18
def test_multi_hash_file(hash_test_data):
    fobj = io.BytesIO(hash_test_data.data)

    checksums = MultiHash.from_file(fobj,
                                    length=len(hash_test_data.data)).digest()
    assert checksums == hash_test_data.checksums