def test_multi_hash_path(hash_test_data):
    with tempfile.NamedTemporaryFile(delete=False) as f:
        f.write(hash_test_data.data)

    hashes = MultiHash.from_path(f.name).digest()
    os.remove(f.name)
    assert hash_test_data.checksums == hashes
def content_add(self, content: List[Content]) -> Dict[str, int]:
    for cont in content:
        hashes = MultiHash.from_data(cont.data).digest()
        if hashes != cont.hashes():
            raise StorageArgumentException(
                f"Object has hashes {cont.hashes()}, but they should be {hashes}"
            )
    return self.storage.content_add(content)
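# Illustrative sketch of the validation idea in content_add() above (not from the
# original sources; the payloads below are made up): recompute the multihash of a
# payload and compare it against the hashes claimed for it.
from swh.model.hashutil import MultiHash

data = b"some payload"
claimed = MultiHash.from_data(b"a different payload").digest()  # deliberately wrong
recomputed = MultiHash.from_data(data).digest()
if recomputed != claimed:
    print("hash mismatch detected")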
def test_multi_hash_file_hexdigest_with_md5(hash_test_data):
    fobj = io.BytesIO(hash_test_data.data)
    length = len(hash_test_data.data)
    checksums = MultiHash.from_file(
        fobj, hash_names=DEFAULT_ALGORITHMS | {"md5"}, length=length
    ).hexdigest()
    md5sum = {"md5": hashlib.md5(hash_test_data.data).hexdigest()}
    assert checksums == {**hash_test_data.hex_checksums, **md5sum}
def _init_content(uuid):
    """Given a uuid, initialize a content"""
    return {
        "id": MultiHash.from_data(uuid.bytes, {"sha1"}).digest()["sha1"],
        "indexer_configuration_id": 1,
    }
def download(self, url):
    """Download the remote tarball url locally.

    Args:
        url (str): Url (file or http*)

    Raises:
        ValueError in case of failing to query

    Returns:
        Tuple of local (filepath, hashes of filepath)

    """
    url_parsed = urlparse(url)
    if url_parsed.scheme == 'file':
        path = url_parsed.path
        response = LocalResponse(path)
        length = os.path.getsize(path)
    else:
        response = self.session.get(url, **self.params, stream=True)
        if response.status_code != 200:
            raise ValueError("Fail to query '%s'. Reason: %s" % (
                url, response.status_code))
        length = int(response.headers['content-length'])

    filepath = os.path.join(self.temp_directory, os.path.basename(url))

    h = MultiHash(length=length)
    with open(filepath, 'wb') as f:
        for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
            h.update(chunk)
            f.write(chunk)

    actual_length = os.path.getsize(filepath)
    if length != actual_length:
        raise ValueError('Error when checking size: %s != %s' % (
            length, actual_length))

    hashes = {
        'length': length,
        **h.hexdigest()
    }
    return filepath, hashes
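# Hypothetical companion sketch (not from the original sources): incremental hashing
# of a local file in fixed-size chunks, mirroring the streaming pattern of download()
# above. CHUNK_SIZE and hash_local_file are illustrative names, not library API.
import os

from swh.model.hashutil import MultiHash

CHUNK_SIZE = 65536  # assumed block size, analogous to HASH_BLOCK_SIZE


def hash_local_file(path):
    """Return hex checksums plus length for the file at ``path``."""
    length = os.path.getsize(path)
    h = MultiHash(length=length)
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(CHUNK_SIZE), b""):
            h.update(chunk)
    return {"length": length, **h.hexdigest()}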
def test_multi_hash_data_with_length(hash_test_data):
    expected_checksums = hash_test_data.checksums.copy()
    expected_checksums["length"] = len(hash_test_data.data)

    algos = set(["length"]).union(hashutil.DEFAULT_ALGORITHMS)
    checksums = MultiHash.from_data(hash_test_data.data, hash_names=algos).digest()

    assert checksums == expected_checksums
    assert "length" in checksums
def dulwich_blob_to_content_id(obj: ShaFile) -> Dict[str, Any]:
    """Convert a dulwich blob to a Software Heritage content id"""
    if obj.type_name != b"blob":
        raise ValueError("Argument is not a blob.")
    blob = cast(Blob, obj)

    size = blob.raw_length()
    data = blob.as_raw_string()
    hashes = MultiHash.from_data(data, DEFAULT_ALGORITHMS).digest()
    if hashes["sha1_git"] != blob.sha().digest():
        raise HashMismatch(
            f"Expected Content hash to be {blob.sha().digest().hex()}, "
            f"got {hashes['sha1_git'].hex()}"
        )
    hashes["length"] = size
    return hashes
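# Illustrative check (an assumption about the sha1_git convention, not taken from the
# code above): the "sha1_git" entry computed by MultiHash should equal the git blob
# object id, i.e. SHA-1 over the header b"blob <size>\x00" followed by the raw data.
import hashlib

from swh.model.hashutil import MultiHash

data = b"hello world\n"
expected = hashlib.sha1(b"blob %d\x00" % len(data) + data).digest()
assert MultiHash.from_data(data).digest()["sha1_git"] == expected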
def _gen_skipped_contents(n=10):
    # we do not use the hypothesis strategy here because this does not play well with
    # pytest fixtures, and it makes test execution very slow
    algos = DEFAULT_ALGORITHMS | {"length"}
    now = datetime.datetime.now(tz=UTC)
    return [
        _updated(
            MultiHash.from_data(data=f"foo{i}".encode(), hash_names=algos).digest(),
            {
                "status": "absent",
                "reason": "why not",
                "origin": f"https://somewhere/{i}",
                "ctime": now,
            },
        )
        for i in range(n)
    ]
def add_file(self, path, sha1=None):
    path_parts = path.split(b"/")
    sha1 = (
        hash_to_bytes(sha1) if sha1 else MultiHash.from_data(path).digest()["sha1"]
    )

    if len(path_parts) == 1:
        self["entry_idx"][path] = len(self["entries"])
        self["entries"].append(
            {
                "target": sha1,
                "name": path,
                "perms": DentryPerms.content,
                "type": "file",
            }
        )
    else:
        if not path_parts[0] in self["entry_idx"]:
            self["entry_idx"][path_parts[0]] = len(self["entries"])
            self["entries"].append(DirectoryModel(path_parts[0]))
        if path_parts[1]:
            dir_idx = self["entry_idx"][path_parts[0]]
            self["entries"][dir_idx].add_file(b"/".join(path_parts[1:]), sha1)
"debian-package": "python3-pygments", "max_content_size": 10240, }, }, { "tool_name": "nomos", "tool_version": "3.1.0rc2-31-ga2cbb8c", "tool_configuration": { "command_line": "nomossa <filepath>" }, }, ] MIMETYPE_OBJECTS = [ { "id": MultiHash.from_data(uuid1().bytes, {"sha1"}).digest()["sha1"], "mimetype": mt, "encoding": enc, # 'indexer_configuration_id' will be added after TOOLS get registered } for mt in MIMETYPES for enc in ENCODINGS ] LICENSES = [ b"3DFX", b"BSD", b"GPL", b"Apache2", b"MIT", ] FOSSOLOGY_LICENSES = [
    Snapshot,
    SnapshotBranch,
    TargetType,
    Timestamp,
    TimestampWithTimezone,
)
from swh.model.swhids import ExtendedSWHID

UTC = datetime.timezone.utc

CONTENTS = [
    Content(
        length=4,
        data=f"foo{i}".encode(),
        status="visible",
        **MultiHash.from_data(f"foo{i}".encode()).digest(),
    )
    for i in range(10)
] + [
    Content(
        length=14,
        data=f"forbidden foo{i}".encode(),
        status="hidden",
        **MultiHash.from_data(f"forbidden foo{i}".encode()).digest(),
    )
    for i in range(10)
]

SKIPPED_CONTENTS = [
    SkippedContent(
        length=4,
        status="absent",
        reason=f"because chr({i}) != '*'",
def download(
    url: str,
    dest: str,
    hashes: Dict = {},
    filename: Optional[str] = None,
    auth: Optional[Tuple[str, str]] = None,
    extra_request_headers: Optional[Dict[str, str]] = None,
) -> Tuple[str, Dict]:
    """Download a remote tarball from url, uncompress it and compute swh hashes on it.

    Args:
        url: Artifact uri to fetch, uncompress and hash
        dest: Directory to write the archive to
        hashes: Dict of expected hashes (key is the hash algo) for the artifact
            to download (those hashes are expected to be hex string)
        auth: Optional tuple of login/password (for http authentication
            service, e.g. deposit)

    Raises:
        ValueError in case of any error when fetching/computing (length,
        checksums mismatched...)

    Returns:
        Tuple of local (filepath, hashes of filepath)

    """
    params = copy.deepcopy(DEFAULT_PARAMS)
    if auth is not None:
        params["auth"] = auth
    if extra_request_headers is not None:
        params["headers"].update(extra_request_headers)
    # so the connection does not hang indefinitely (read/connection timeout)
    timeout = params.get("timeout", 60)

    if url.startswith("ftp://"):
        response = urlopen(url, timeout=timeout)
        chunks = (response.read(HASH_BLOCK_SIZE) for _ in itertools.count())
        response_data = itertools.takewhile(bool, chunks)
    else:
        response = requests.get(url, **params, timeout=timeout, stream=True)
        if response.status_code != 200:
            raise ValueError(
                "Fail to query '%s'. Reason: %s" % (url, response.status_code)
            )
        # update URL to response one as requests follow redirection by default
        # on GET requests
        url = response.url
        # try to extract filename from content-disposition header if available
        if filename is None and "content-disposition" in response.headers:
            filename = _content_disposition_filename(
                response.headers["content-disposition"]
            )
        response_data = response.iter_content(chunk_size=HASH_BLOCK_SIZE)

    filename = filename if filename else os.path.basename(urlsplit(url).path)

    logger.debug("filename: %s", filename)
    filepath = os.path.join(dest, filename)
    logger.debug("filepath: %s", filepath)

    h = MultiHash(hash_names=DOWNLOAD_HASHES | set(hashes.keys()))
    with open(filepath, "wb") as f:
        for chunk in response_data:
            h.update(chunk)
            f.write(chunk)

    response.close()

    # Also check the expected hashes if provided
    if hashes:
        actual_hashes = h.hexdigest()
        for algo_hash in hashes.keys():
            actual_digest = actual_hashes[algo_hash]
            expected_digest = hashes[algo_hash]
            if actual_digest != expected_digest:
                raise ValueError(
                    "Failure when fetching %s. Checksum mismatched: %s != %s"
                    % (url, expected_digest, actual_digest)
                )

    computed_hashes = h.hexdigest()
    length = computed_hashes.pop("length")
    extrinsic_metadata = {
        "length": length,
        "filename": filename,
        "checksums": computed_hashes,
        "url": url,
    }

    logger.debug("extrinsic_metadata: %s", extrinsic_metadata)

    return filepath, extrinsic_metadata
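# Hypothetical call site for download() above; the URL and destination are
# placeholders, not real artifacts. Passing hashes={"sha256": "<hex digest>"} would
# additionally verify the downloaded bytes against that expected checksum.
import tempfile

dest = tempfile.mkdtemp()
filepath, info = download("https://example.org/artifacts/foo-1.0.tar.gz", dest)
print(info["filename"], info["length"], info["checksums"])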
def test_multi_hash_file_bytehexdigest(hash_test_data):
    fobj = io.BytesIO(hash_test_data.data)
    length = len(hash_test_data.data)
    checksums = MultiHash.from_file(fobj, length=length).bytehexdigest()
    assert checksums == hash_test_data.bytehex_checksums
def test_multi_hash_data_unknown_hash(hash_test_data):
    with pytest.raises(ValueError, match="Unexpected hashing algorithm.*unknown-hash"):
        MultiHash.from_data(hash_test_data.data, ["unknown-hash"])
def test_multi_hash_data(hash_test_data):
    checksums = MultiHash.from_data(hash_test_data.data).digest()
    assert checksums == hash_test_data.checksums
    assert "length" not in checksums
def test_multi_hash_file_missing_length(hash_test_data):
    fobj = io.BytesIO(hash_test_data.data)
    with pytest.raises(ValueError, match="Missing length"):
        MultiHash.from_file(fobj, hash_names=["sha1_git"])
def origin_url_to_sha1(origin_url: str) -> bytes:
    """Convert an origin URL to a sha1. Encodes URL to utf-8."""
    return MultiHash.from_data(origin_url.encode("utf-8"), {"sha1"}).digest()["sha1"]
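# Usage sketch for origin_url_to_sha1() above; the origin URL is illustrative. The
# result is the raw 20-byte sha1 of the UTF-8 encoded URL.
digest = origin_url_to_sha1("https://example.org/user/repo")
assert isinstance(digest, bytes) and len(digest) == 20
print(digest.hex())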
def test_multi_hash_file(hash_test_data):
    fobj = io.BytesIO(hash_test_data.data)
    checksums = MultiHash.from_file(fobj, length=len(hash_test_data.data)).digest()
    assert checksums == hash_test_data.checksums