def non_ancestor_revisions():
    """
    Hypothesis strategy returning a pair of revisions ingested into the
    test archive with no ancestor relation.
    """
    # get a dfs revisions walker for one of the origins
    # loaded into the test archive
    revisions_walker = _get_origin_dfs_revisions_walker()
    merge_revs = []
    children = defaultdict(list)
    # get all merge revisions
    for rev in revisions_walker:
        if len(rev['parents']) > 1:
            merge_revs.append(rev)
        for rev_p in rev['parents']:
            children[rev_p].append(rev['id'])
    # find a merge revision whose parents have a unique child revision
    random.shuffle(merge_revs)
    selected_revs = None
    for merge_rev in merge_revs:
        if all(len(children[rev_p]) == 1
               for rev_p in merge_rev['parents']):
            selected_revs = merge_rev['parents']

    return just({
        'sha1_git_root': hash_to_hex(selected_revs[0]),
        'sha1_git': hash_to_hex(selected_revs[1])
    })
def ancestor_revisions():
    """
    Hypothesis strategy returning a pair of revisions ingested into the
    test archive with an ancestor relation.
    """
    # get a dfs revisions walker for one of the origins
    # loaded into the test archive
    revisions_walker = _get_origin_dfs_revisions_walker()
    master_revisions = []
    children = defaultdict(list)
    init_rev_found = False
    # get revisions only authored in the master branch
    for rev in revisions_walker:
        for rev_p in rev['parents']:
            children[rev_p].append(rev['id'])
        if not init_rev_found:
            master_revisions.append(rev)
        if not rev['parents']:
            init_rev_found = True

    # head revision
    root_rev = master_revisions[0]
    # pick a random revision, different from head, only authored
    # in the master branch
    ancestor_rev_idx = random.choice(
        list(range(1, len(master_revisions) - 1)))
    ancestor_rev = master_revisions[ancestor_rev_idx]
    ancestor_child_revs = children[ancestor_rev['id']]

    return just({
        'sha1_git_root': hash_to_hex(root_rev['id']),
        'sha1_git': hash_to_hex(ancestor_rev['id']),
        'children': [hash_to_hex(r) for r in ancestor_child_revs]
    })
def _check_revision_divergence(self, rev: int, dir_id: bytes) -> None:
    """Check for hash revision computation divergence.

    The rationale behind this is that svn can trigger unknown edge cases
    (mixed CRLF, svn properties, etc...). Those are not always easy to
    spot. Adding a regular check will help spot potential missing edge
    cases.

    Args:
        rev: The actual revision we are computing from
        dir_id: The actual directory for the given revision

    Raises:
        ValueError if a hash divergence is detected

    """
    self.log.debug("Checking hash computations on revision %s...", rev)
    checked_dir_id = self.swh_revision_hash_tree_at_svn_revision(rev)
    if checked_dir_id != dir_id:
        err = (
            "Hash tree computation divergence detected "
            "(%s != %s), stopping!"
            % (
                hashutil.hash_to_hex(dir_id),
                hashutil.hash_to_hex(checked_dir_id),
            )
        )
        raise ValueError(err)
def get_contents_error(self, content_ids, source_storage):
    """Return the error associated to each content, when any.

    Check the given contents on the given storage. If an error is
    detected, it is reported through the returned dict.

    Args:
        content_ids ([sha1]): list of content ids to check
        source_storage (str): the source storage holding the contents
            to check.

    Returns:
        a dict mapping {content_id -> error_status} for each content_id
        with an error. The `error_status` result may be 'missing' or
        'corrupted'.

    """
    content_status = {}
    storage = self.objstorages[source_storage]
    for content_id in content_ids:
        try:
            storage.check(content_id)
        except Error:
            content_status[content_id] = 'corrupted'
            logger.error('%s corrupted!' %
                         hashutil.hash_to_hex(content_id))
        except ObjNotFoundError:
            content_status[content_id] = 'missing'
            logger.error('%s missing!' %
                         hashutil.hash_to_hex(content_id))

    return content_status
def _lookup_origin_visits(*args, **kwargs):
    if kwargs["last_visit"] is None:
        return [
            {
                "visit": 1,
                "date": "2017-05-06T00:59:10+00:00",
                "status": "full",
                "snapshot": hash_to_hex(snapshots[0].id),
                "type": "git",
            },
            {
                "visit": 2,
                "date": "2017-08-06T00:59:10+00:00",
                "status": "full",
                "snapshot": hash_to_hex(snapshots[1].id),
                "type": "git",
            },
        ]
    else:
        return [
            {
                "visit": 3,
                "date": "2017-09-06T00:59:10+00:00",
                "status": "full",
                "snapshot": hash_to_hex(snapshots[2].id),
                "type": "git",
            }
        ]
def dulwich_tag_to_release(obj: ShaFile) -> Release:
    if obj.type_name != b"tag":
        raise ValueError("Argument is not a tag.")
    tag = cast(Tag, obj)

    tagger_timezone = None
    # FIXME: _parse_message is a private function from Dulwich.
    for (field, value) in _parse_message(tag.as_raw_chunks()):
        if field == b"tagger":
            m = AUTHORSHIP_LINE_RE.match(value)
            if m:
                tagger_timezone = m.group("timezone")

    target_type, target = tag.object
    if tag.tagger:
        author: Optional[Person] = parse_author(tag.tagger)
        if tag.tag_time is None:
            date = None
        else:
            date = dulwich_tsinfo_to_timestamp(
                tag.tag_time,
                tag.tag_timezone,
                tag._tag_timezone_neg_utc,
                tagger_timezone,
            )
    else:
        author = date = None

    message = tag.message
    if tag.signature:
        message += tag.signature

    rel = Release(
        id=tag.sha().digest(),
        author=author,
        date=date,
        name=tag.name,
        target=bytes.fromhex(target.decode()),
        target_type=DULWICH_OBJECT_TYPES[target_type.type_name],
        message=message,
        metadata=None,
        synthetic=False,
    )

    if rel.compute_hash() != rel.id:
        expected_id = rel.id
        actual_id = rel.compute_hash()
        logger.warning(
            "Expected release to have id %s, but got %s. Recording raw_manifest.",
            hash_to_hex(expected_id),
            hash_to_hex(actual_id),
        )
        raw_string = tag.as_raw_string()
        rel = attr.evolve(
            rel, raw_manifest=git_object_header("tag", len(raw_string)) + raw_string
        )

    check_id(rel)
    return rel
def process_svn_revisions(
    self, svnrepo, revision_start, revision_end
) -> Iterator[
    Tuple[List[Content], List[SkippedContent], List[Directory], Revision]
]:
    """Process svn revisions from revision_start to revision_end.

    At each svn revision, apply new diffs and simultaneously compute swh
    hashes. This yields those computed swh hashes as a tuple (contents,
    skipped_contents, directories, revision).

    Note that at every `self.check_revision`, a supplementary check takes
    place to check for hash-tree divergence (related T570).

    Yields:
        tuple (contents, skipped_contents, directories, revision) of
        model objects

    Raises:
        ValueError in case of a hash divergence detection

    """
    gen_revs = svnrepo.swh_hash_data_per_revision(revision_start, revision_end)
    parents = (self.latest_revision.id,) if self.latest_revision is not None else ()
    count = 0
    for rev, commit, new_objects, root_directory in gen_revs:
        count += 1
        # Send the associated contents/directories
        _contents, _skipped_contents, _directories = new_objects

        # compute the fs tree's checksums
        dir_id = root_directory.hash
        swh_revision = self.build_swh_revision(rev, commit, dir_id, parents)

        self.log.debug(
            "rev: %s, swhrev: %s, dir: %s",
            rev,
            hashutil.hash_to_hex(swh_revision.id),
            hashutil.hash_to_hex(dir_id),
        )

        if (
            self.check_revision
            and self.check_revision != 0
            and count % self.check_revision == 0
        ):
            self._check_revision_divergence(rev, dir_id)

        parents = (swh_revision.id,)

        yield _contents, _skipped_contents, _directories, swh_revision

    if not self.debug and self.svnrepo:
        # clean directory where revisions were replayed to gain some disk
        # space before the post_load operation
        self.svnrepo.clean_fs(self.svnrepo.local_url)
def get_object(objstorage, obj_id):
    try:
        with statsd.timed(CONTENT_DURATION_METRIC, tags={"request": "get"}):
            obj = objstorage.get(obj_id)
            logger.debug("retrieved %(obj_id)s", {"obj_id": hash_to_hex(obj_id)})
        return obj
    except ObjNotFoundError:
        logger.error(
            "Failed to retrieve %(obj_id)s: object not found",
            {"obj_id": hash_to_hex(obj_id)},
        )
        raise
    except Exception as exc:
        raise ReplayError(obj_id=obj_id, exc=exc) from None
def run(self):
    """Do the copy on the backup storage.

    Run the archiver copier in order to copy the required content into
    the current destination. The content which corresponds to the sha1
    in self.content_ids will be fetched from the master_storage and
    then copied into the backup object storage.

    Returns:
        A boolean indicating whether the whole content has been copied.

    """
    try:
        for content_id in self.content_ids:
            try:
                content = self.source.get(content_id)
            except ObjNotFoundError:
                logging.error('content %s not found' %
                              hashutil.hash_to_hex(content_id))
                continue
            self.destination.add(content, content_id)
    except Exception as e:
        logger.exception('Problem during copy: %s' % e)
        return False
    return True
def test_data_to_content(self):
    for filename, content in self.contents.items():
        conv_content = Content.from_bytes(
            mode=content["mode"], data=content["data"]
        )
        self.assertContentEqual(conv_content, content)
        self.assertIn(hash_to_hex(conv_content.hash), repr(conv_content))
def test_lookup_revision_msg_ok(archive_data, new_revision):
    archive_data.revision_add([new_revision])
    revision_message = archive.lookup_revision_message(
        hash_to_hex(new_revision.id))
    assert revision_message == {"message": new_revision.message}
def test_lookup_directory_with_revision_unknown_content(
        archive_data, new_revision):
    unknown_content_ = random_content()

    dir_path = "README.md"

    # A directory that points to unknown content
    dir = Directory(
        entries=(
            DirectoryEntry(
                name=bytes(dir_path.encode("utf-8")),
                type="file",
                target=hash_to_bytes(unknown_content_["sha1_git"]),
                perms=DentryPerms.content,
            ),
        )
    )

    # Create a revision that points to a directory
    # which points to unknown content
    new_revision = new_revision.to_dict()
    new_revision["directory"] = dir.id
    del new_revision["id"]
    new_revision = Revision.from_dict(new_revision)

    # Add the directory and revision in mem
    archive_data.directory_add([dir])
    archive_data.revision_add([new_revision])

    new_revision_id = hash_to_hex(new_revision.id)
    with pytest.raises(NotFoundExc) as e:
        archive.lookup_directory_with_revision(new_revision_id, dir_path)
    assert e.match("Content not found for revision %s" % new_revision_id)
def test_directory_simple(self, git_loader, cook_extract_directory):
    repo = TestRepo()
    with repo as rp:
        (rp / "file").write_text(TEST_CONTENT)
        (rp / "executable").write_bytes(TEST_EXECUTABLE)
        (rp / "executable").chmod(0o755)
        (rp / "link").symlink_to("file")
        (rp / "dir1/dir2").mkdir(parents=True)
        (rp / "dir1/dir2/file").write_text(TEST_CONTENT)
        c = repo.commit()
        loader = git_loader(str(rp))
        loader.load()
        obj_id_hex = repo.repo[c].tree.decode()
        obj_id = hashutil.hash_to_bytes(obj_id_hex)
        swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=obj_id)

    with cook_extract_directory(loader.storage, swhid) as p:
        assert (p / "file").stat().st_mode == 0o100644
        assert (p / "file").read_text() == TEST_CONTENT
        assert (p / "executable").stat().st_mode == 0o100755
        assert (p / "executable").read_bytes() == TEST_EXECUTABLE
        assert (p / "link").is_symlink()
        assert os.readlink(str(p / "link")) == "file"
        assert (p / "dir1/dir2/file").stat().st_mode == 0o100644
        assert (p / "dir1/dir2/file").read_text() == TEST_CONTENT

        directory = from_disk.Directory.from_disk(path=bytes(p))
        assert obj_id_hex == hashutil.hash_to_hex(directory.hash)
def content_archive_update(self, content_id, archive_id,
                           new_status=None, cur=None):
    """Update the status of an archived content and set its mtime to now.

    Change the mtime of an archived content for the given archive and
    set its mtime to the current time.

    Args:
        content_id (str): content sha1
        archive_id (str): name of the archive
        new_status (str): one of 'missing', 'present' or 'ongoing'.
            This status will replace the previous one. If not given,
            the function only changes the mtime of the content for the
            given archive.

    """
    if not self.__logfile:
        self.open_logfile()

    print(time.time(), archive_id, new_status,
          hashutil.hash_to_hex(content_id), file=self.__logfile)
def index(
    self, id: Sha1, data: Optional[bytes] = None, **kwargs
) -> List[ContentLicenseRow]:
    """Index sha1s' content and store result.

    Args:
        id (bytes): content's identifier
        data (bytes): raw content associated to the content id

    Returns:
        A list of ContentLicenseRow, each with:

        - id (bytes): content's identifier (sha1)
        - license (bytes): detected license
        - indexer_configuration_id (int): tool used to compute the output

    """
    assert data is not None
    with write_to_temp(
        filename=hashutil.hash_to_hex(id),  # use the id as pathname
        data=data,
        working_directory=self.working_directory,
    ) as content_path:
        properties = compute_license(path=content_path)
    return [
        ContentLicenseRow(
            id=id,
            indexer_configuration_id=self.tool["id"],
            license=license,
        )
        for license in properties["licenses"]
    ]
def check_snapshot(expected_snapshot, storage):
    """Check for snapshot match.

    Provide the hashes as hexadecimal, the conversion is done
    within the method.

    Args:
        expected_snapshot (dict): full snapshot with hex ids
        storage (Storage): expected storage

    """
    expected_snapshot_id = expected_snapshot["id"]
    expected_branches = expected_snapshot["branches"]
    snap = snapshot_get_all_branches(
        storage, hash_to_bytes(expected_snapshot_id))
    if snap is None:
        # display known snapshots instead if possible
        if hasattr(storage, "_snapshots"):  # in-mem storage
            from pprint import pprint
            for snap_id, (_snap, _) in storage._snapshots.items():
                snapd = _snap.to_dict()
                snapd["id"] = hash_to_hex(snapd["id"])
                branches = {
                    branch.decode("utf-8"): decode_target(target)
                    for branch, target in snapd["branches"].items()
                }
                snapd["branches"] = branches
                pprint(snapd)
        raise AssertionError("Snapshot is not found")

    branches = {
        branch_name.decode("utf-8"): decode_target(branch)
        for branch_name, branch in snap["branches"].items()
    }
    assert expected_branches == branches
def convert_hashes_bytes(v):
    """If v is a hash given as bytes, return it converted to its
    hexadecimal representation; otherwise return it unchanged.

    """
    if isinstance(v, bytes):
        return hashutil.hash_to_hex(v)
    return v
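# Illustrative usage sketch for convert_hashes_bytes (not part of the original
# module). It assumes only the function above and the Python stdlib; the
# sha1 digest below is a hypothetical example value. Bytes digests are
# hex-encoded, anything else is passed through unchanged.
def _demo_convert_hashes_bytes():
    import hashlib

    digest = hashlib.sha1(b"example content").digest()  # 20 raw bytes
    assert convert_hashes_bytes(digest) == digest.hex()  # bytes -> hex string
    assert convert_hashes_bytes("already-hex") == "already-hex"  # passthrough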
def test_revision_metadata_display(archive_data, client, directory, person, date):
    metadata = {"foo": "bar"}
    revision = Revision(
        directory=hash_to_bytes(directory),
        author=person,
        committer=person,
        message=b"commit message",
        date=TimestampWithTimezone.from_datetime(date),
        committer_date=TimestampWithTimezone.from_datetime(date),
        synthetic=False,
        type=RevisionType.GIT,
        metadata=metadata,
    )
    archive_data.revision_add([revision])

    url = reverse("browse-revision", url_args={"sha1_git": hash_to_hex(revision.id)})

    resp = check_html_get_response(
        client, url, status_code=200, template_used="browse/revision.html"
    )
    assert_contains(resp, "swh-metadata-popover")
    assert_contains(resp, escape(json.dumps(metadata, indent=4)))
def _object_path(self, obj_id):
    """Get the full path to an object"""
    hex_obj_id = hashutil.hash_to_hex(obj_id)
    if self.path_prefix:
        return self.path_prefix + hex_obj_id
    else:
        return hex_obj_id
def content_raw(request, query_string):
    """Django view that produces a raw display of a content identified
    by its hash value.

    The url that points to it is
    :http:get:`/browse/content/[(algo_hash):](hash)/raw/`
    """
    re_encode = bool(strtobool(request.GET.get("re_encode", "false")))
    algo, checksum = query.parse_hash(query_string)
    checksum = hash_to_hex(checksum)
    content_data = request_content(query_string, max_size=None, re_encode=re_encode)

    filename = request.GET.get("filename", None)
    if not filename:
        filename = "%s_%s" % (algo, checksum)

    if (
        content_data["mimetype"].startswith("text/")
        or content_data["mimetype"] == "inode/x-empty"
    ):
        response = HttpResponse(content_data["raw_data"], content_type="text/plain")
        response["Content-disposition"] = "filename=%s" % filename
    else:
        response = HttpResponse(
            content_data["raw_data"], content_type="application/octet-stream"
        )
        response["Content-disposition"] = "attachment; filename=%s" % filename
    return response
def _create_revisions(self, revs_data):
    """Create the revisions in the tree as broken symlinks to the
    target identifier."""
    for file_data in revs_data:
        path = os.path.join(self.root, file_data["path"])
        target = hashutil.hash_to_hex(file_data["target"])
        self._create_file(path, target, mode=DentryPerms.symlink)
async def compute_entries(self) -> AsyncIterator[FuseEntry]:
    history = await self.fuse.get_history(self.history_swhid)

    if self.prefix:
        root_path = self.get_relative_root_path()
        for swhid in history:
            if swhid.object_id.startswith(hash_to_bytes(self.prefix)):
                yield self.create_child(
                    FuseSymlinkEntry,
                    name=str(swhid),
                    target=Path(root_path, f"archive/{swhid}"),
                )
    # Create sharded directories
    else:
        sharded_dirs = set()
        for swhid in history:
            next_prefix = hash_to_hex(swhid.object_id)[: self.SHARDING_LENGTH]
            if next_prefix not in sharded_dirs:
                sharded_dirs.add(next_prefix)
                yield self.create_child(
                    RevisionHistoryShardByHash,
                    name=next_prefix,
                    mode=int(EntryMode.RDONLY_DIR),
                    prefix=next_prefix,
                    history_swhid=self.history_swhid,
                )
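# Minimal stdlib-only sketch (not from the original code) of the sharding
# idea used in compute_entries above: hex object ids are grouped under
# directories named after their first few hex characters. The function name,
# the sharding_length default and the input list are assumptions made for
# illustration only.
def _demo_shard_by_hash_prefix(hex_ids, sharding_length=2):
    from collections import defaultdict

    shards = defaultdict(list)
    for hex_id in hex_ids:
        # e.g. "9f3a..." is filed under the "9f" shard when sharding_length == 2
        shards[hex_id[:sharding_length]].append(hex_id)
    return dict(shards)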
def __init__(self, algo, hash_id, colliding_contents):
    self.algo = algo
    self.hash_id = hash_to_hex(hash_id)
    self.colliding_contents = [content_hex_hashes(c) for c in colliding_contents]
    super().__init__(self.algo, self.hash_id, self.colliding_contents)
def content_raw(request, query_string):
    """Django view that produces a raw display of a content identified
    by its hash value.

    The url that points to it is
    :http:get:`/browse/content/[(algo_hash):](hash)/raw/`
    """  # noqa
    try:
        reencode = bool(strtobool(request.GET.get('reencode', 'false')))
        algo, checksum = query.parse_hash(query_string)
        checksum = hash_to_hex(checksum)
        content_data = request_content(query_string, max_size=None,
                                       reencode=reencode)
    except Exception as exc:
        return handle_view_exception(request, exc)

    filename = request.GET.get('filename', None)
    if not filename:
        filename = '%s_%s' % (algo, checksum)

    if content_data['mimetype'].startswith('text/') or \
            content_data['mimetype'] == 'inode/x-empty':
        response = HttpResponse(content_data['raw_data'],
                                content_type="text/plain")
        response['Content-disposition'] = 'filename=%s' % filename
    else:
        response = HttpResponse(content_data['raw_data'],
                                content_type='application/octet-stream')
        response['Content-disposition'] = 'attachment; filename=%s' % filename
    return response
def put_object(objstorage, obj_id, obj):
    try:
        with statsd.timed(CONTENT_DURATION_METRIC, tags={"request": "put"}):
            obj = objstorage.add(obj, obj_id, check_presence=False)
            logger.debug("stored %(obj_id)s", {"obj_id": hash_to_hex(obj_id)})
    except Exception as exc:
        raise ReplayError(obj_id=obj_id, exc=exc) from None
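# Hedged sketch (not part of the original module) of how get_object and
# put_object above could be chained to copy a batch of objects between two
# object storages. The src, dst and obj_ids names are hypothetical; error
# handling simply reuses what the two helpers already do.
def _demo_copy_objects(src, dst, obj_ids):
    copied = 0
    for obj_id in obj_ids:
        try:
            obj = get_object(src, obj_id)
        except ObjNotFoundError:
            # get_object already logged the missing object; skip it
            continue
        put_object(dst, obj_id, obj)
        copied += 1
    return copied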
def load_directory(self, obj_id: Sha1Git, raw_manifest: Optional[bytes]) -> None:
    # Load the directory
    entries_it: Optional[Iterable[DirectoryEntry]] = stream_results_optional(
        self.storage.directory_get_entries, obj_id
    )

    if entries_it is None:
        logger.error("Missing swh:1:dir:%s, ignoring.", hash_to_hex(obj_id))
        return

    directory = Directory(
        id=obj_id, entries=tuple(entries_it), raw_manifest=raw_manifest
    )
    git_object = raw_manifest or git_objects.directory_git_object(directory)
    self.write_object(obj_id, git_object)

    # Add children to the stack
    entry_loaders: Dict[str, Optional[List[Sha1Git]]] = {
        "file": self._cnt_stack,
        "dir": self._dir_stack,
        "rev": None,  # Do not include submodule targets (rejected by git-fsck)
    }
    for entry in directory.entries:
        stack = entry_loaders[entry.type]
        if stack is not None:
            self._push(stack, [entry.target])
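# Stdlib-only sketch (an assumption for illustration, not the original code)
# of the dispatch pattern used at the end of load_directory: entries are
# routed to per-type stacks and submodule ("rev") targets are deliberately
# dropped. Entries here are plain (type, target) tuples rather than
# swh.model DirectoryEntry instances.
def _demo_dispatch_entries(entries):
    cnt_stack, dir_stack = [], []
    loaders = {"file": cnt_stack, "dir": dir_stack, "rev": None}
    for entry_type, target in entries:
        stack = loaders[entry_type]
        if stack is not None:
            stack.append(target)
    return cnt_stack, dir_stack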
def lookup_missing_hashes(grouped_swhids: Dict[str, List[bytes]]) -> Set[str]:
    """Lookup missing Software Heritage persistent identifier hashes,
    using batch processing.

    Args:
        grouped_swhids: a dictionary mapping object types (keys) to
            object hashes (values)

    Returns:
        A set of the hexadecimal hashes not found in the storage
    """
    missing_hashes = []

    for obj_type, obj_ids in grouped_swhids.items():
        if obj_type == CONTENT:
            missing_hashes.append(storage.content_missing_per_sha1_git(obj_ids))
        elif obj_type == DIRECTORY:
            missing_hashes.append(storage.directory_missing(obj_ids))
        elif obj_type == REVISION:
            missing_hashes.append(storage.revision_missing(obj_ids))
        elif obj_type == RELEASE:
            missing_hashes.append(storage.release_missing(obj_ids))
        elif obj_type == SNAPSHOT:
            missing_hashes.append(storage.snapshot_missing(obj_ids))

    missing = set(
        map(lambda x: hashutil.hash_to_hex(x), itertools.chain(*missing_hashes))
    )

    return missing
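# Small stdlib-only sketch (an illustration, not original code) of the
# flattening done at the end of lookup_missing_hashes: several iterables of
# missing byte ids are chained into a single set of hex strings. bytes.hex()
# stands in for hashutil.hash_to_hex on plain digests.
def _demo_flatten_missing(missing_per_type):
    import itertools

    return {missing_id.hex() for missing_id in itertools.chain(*missing_per_type)}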
def search_origin_metadata(
    fulltext: str, limit: int = 50
) -> Iterable[OriginMetadataInfo]:
    """Search for origins whose metadata match a provided string pattern.

    Args:
        fulltext: the string pattern to search for in origin metadata
        limit: the maximum number of found origins to return

    Returns:
        Iterable of origin metadata information for existing origins
    """
    results = []
    if search and config.get_config()["metadata_search_backend"] == "swh-search":
        page_result = search.origin_search(
            metadata_pattern=fulltext,
            limit=limit,
        )
        matches = idx_storage.origin_intrinsic_metadata_get(
            [r["url"] for r in page_result.results]
        )
    else:
        matches = idx_storage.origin_intrinsic_metadata_search_fulltext(
            conjunction=[fulltext], limit=limit
        )

    matches = [match.to_dict() for match in matches]
    origins = storage.origin_get([match["id"] for match in matches])
    for origin, match in zip(origins, matches):
        if not origin:
            continue
        match["from_revision"] = hashutil.hash_to_hex(match["from_revision"])
        del match["id"]
        results.append(OriginMetadataInfo(url=origin.url, metadata=match))

    return results
def check_revisions_ordering(
    mocker,
    rev_walker_type,
    expected_result,
    truncated_history,
    revisions_list=_revisions_list,
):
    storage = mocker.patch("swh.storage.postgresql.storage.Storage")

    if not truncated_history:
        storage.revision_log.return_value = revisions_list
    else:
        revs_lists_truncated = [
            None if hash_to_hex(rev["id"]) == _rev_missing else rev
            for rev in revisions_list
        ]
        storage.revision_log.return_value = revs_lists_truncated

    revs_walker = get_revisions_walker(
        rev_walker_type, storage, hash_to_bytes(_rev_start)
    )

    assert list(map(hash_to_bytes, expected_result)) == [
        rev["id"] for rev in revs_walker
    ]

    assert revs_walker.is_history_truncated() == truncated_history

    if truncated_history:
        missing_revs = revs_walker.missing_revisions()
        assert missing_revs == {hash_to_bytes(_rev_missing)}
    else:
        assert revs_walker.missing_revisions() == set()
def _corrupt_content(self, obj_id):
    """Make the given content invalid."""
    hex_obj_id = hashutil.hash_to_hex(obj_id)
    file_path = self.checker.objstorage._obj_path(hex_obj_id)
    with gzip.open(file_path, 'wb') as f:
        f.write(b'Unexpected content')