Example #1
def non_ancestor_revisions():
    """
    Hypothesis strategy returning a pair of revisions ingested into the
    test archive with no ancestor relation.
    """
    # get a dfs revisions walker for one of the origins
    # loaded into the test archive
    revisions_walker = _get_origin_dfs_revisions_walker()
    merge_revs = []
    children = defaultdict(list)
    # get all merge revisions
    for rev in revisions_walker:
        if len(rev['parents']) > 1:
            merge_revs.append(rev)
        for rev_p in rev['parents']:
            children[rev_p].append(rev['id'])
    # find a merge revision whose parents have a unique child revision
    random.shuffle(merge_revs)
    selected_revs = None
    for merge_rev in merge_revs:
        if all(len(children[rev_p]) == 1 for rev_p in merge_rev['parents']):
            selected_revs = merge_rev['parents']

    return just({
        'sha1_git_root': hash_to_hex(selected_revs[0]),
        'sha1_git': hash_to_hex(selected_revs[1])
    })
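For context, a strategy like this is typically consumed through Hypothesis' `@given` decorator. A minimal sketch, assuming the strategy above is importable (the test body below is hypothetical, not part of the original example):

# Hypothetical usage sketch: consuming the strategy above in a Hypothesis test.
from hypothesis import given

@given(non_ancestor_revisions())
def test_revisions_not_related(revisions):
    # the strategy yields a dict with hex-encoded revision ids
    assert revisions["sha1_git_root"] != revisions["sha1_git"]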
Example #2
def ancestor_revisions():
    """
    Hypothesis strategy returning a pair of revisions ingested into the
    test archive with an ancestor relation.
    """
    # get a dfs revisions walker for one of the origins
    # loaded into the test archive
    revisions_walker = _get_origin_dfs_revisions_walker()
    master_revisions = []
    children = defaultdict(list)
    init_rev_found = False
    # get revisions only authored in the master branch
    for rev in revisions_walker:
        for rev_p in rev['parents']:
            children[rev_p].append(rev['id'])
        if not init_rev_found:
            master_revisions.append(rev)
        if not rev['parents']:
            init_rev_found = True

    # head revision
    root_rev = master_revisions[0]
    # pick a random revision, different from head, only authored
    # in the master branch
    ancestor_rev_idx = random.choice(list(range(1, len(master_revisions) - 1)))
    ancestor_rev = master_revisions[ancestor_rev_idx]
    ancestor_child_revs = children[ancestor_rev['id']]

    return just({
        'sha1_git_root': hash_to_hex(root_rev['id']),
        'sha1_git': hash_to_hex(ancestor_rev['id']),
        'children': [hash_to_hex(r) for r in ancestor_child_revs]
    })
Example #3
    def _check_revision_divergence(self, rev: int, dir_id: bytes) -> None:
        """Check for hash revision computation divergence.

           The rationale behind this is that svn can trigger unknown edge cases (mixed
           CRLF, svn properties, etc.). Those are not always easy to spot. Adding a
           regular check helps to spot potential missing edge cases.

        Args:
            rev: the svn revision being checked
            dir_id: the expected directory hash for that revision

        Raises:
            ValueError: if a hash divergence is detected

        """

        self.log.debug("Checking hash computations on revision %s...", rev)
        checked_dir_id = self.swh_revision_hash_tree_at_svn_revision(rev)
        if checked_dir_id != dir_id:
            err = (
                "Hash tree computation divergence detected "
                "(%s != %s), stopping!"
                % (
                    hashutil.hash_to_hex(dir_id),
                    hashutil.hash_to_hex(checked_dir_id),
                )
            )
            raise ValueError(err)
Example #4
    def get_contents_error(self, content_ids, source_storage):
        """Indicates what is the error associated to a content when needed

        Check the given content on the given storage. If an error is detected,
        it will be reported through the returned dict.

        Args:
            content_ids ([sha1]): list of content ids to check
            source_storage (str): the source storage holding the
                contents to check.

        Returns:
            a dict mapping {content_id -> error_status} for each content_id
            with an error. The `error_status` value is either 'missing' or
            'corrupted'.

        """
        content_status = {}
        storage = self.objstorages[source_storage]
        for content_id in content_ids:
            try:
                storage.check(content_id)
            except Error:
                content_status[content_id] = 'corrupted'
                logger.error('%s corrupted!' % hashutil.hash_to_hex(
                    content_id))
            except ObjNotFoundError:
                content_status[content_id] = 'missing'
                logger.error('%s missing!' % hashutil.hash_to_hex(content_id))

        return content_status
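A hedged usage sketch of the method above; the checker instance, content ids and storage name are illustrative assumptions:

# Hypothetical usage sketch for get_contents_error; the names below are assumptions.
errors = checker.get_contents_error(content_ids, source_storage="main")
for content_id, status in errors.items():
    # status is either 'missing' or 'corrupted'
    print(hashutil.hash_to_hex(content_id), status)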
Example #5
    def _lookup_origin_visits(*args, **kwargs):
        if kwargs["last_visit"] is None:
            return [
                {
                    "visit": 1,
                    "date": "2017-05-06T00:59:10+00:00",
                    "status": "full",
                    "snapshot": hash_to_hex(snapshots[0].id),
                    "type": "git",
                },
                {
                    "visit": 2,
                    "date": "2017-08-06T00:59:10+00:00",
                    "status": "full",
                    "snapshot": hash_to_hex(snapshots[1].id),
                    "type": "git",
                },
            ]
        else:
            return [
                {
                    "visit": 3,
                    "date": "2017-09-06T00:59:10+00:00",
                    "status": "full",
                    "snapshot": hash_to_hex(snapshots[2].id),
                    "type": "git",
                },
            ]
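A stub like this is usually wired into a test through pytest-mock; a minimal sketch, where the patched dotted path is an assumption for illustration only:

# Hypothetical sketch: using the stub above as the side effect of a mock.
mock_visits = mocker.patch("swh.web.common.origin.lookup_origin_visits")
mock_visits.side_effect = _lookup_origin_visits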
Example #6
def dulwich_tag_to_release(obj: ShaFile) -> Release:
    if obj.type_name != b"tag":
        raise ValueError("Argument is not a tag.")
    tag = cast(Tag, obj)

    tagger_timezone = None
    # FIXME: _parse_message is a private function from Dulwich.
    for (field, value) in _parse_message(tag.as_raw_chunks()):
        if field == b"tagger":
            m = AUTHORSHIP_LINE_RE.match(value)
            if m:
                tagger_timezone = m.group("timezone")

    target_type, target = tag.object
    if tag.tagger:
        author: Optional[Person] = parse_author(tag.tagger)
        if tag.tag_time is None:
            date = None
        else:
            date = dulwich_tsinfo_to_timestamp(
                tag.tag_time,
                tag.tag_timezone,
                tag._tag_timezone_neg_utc,
                tagger_timezone,
            )
    else:
        author = date = None

    message = tag.message
    if tag.signature:
        message += tag.signature

    rel = Release(
        id=tag.sha().digest(),
        author=author,
        date=date,
        name=tag.name,
        target=bytes.fromhex(target.decode()),
        target_type=DULWICH_OBJECT_TYPES[target_type.type_name],
        message=message,
        metadata=None,
        synthetic=False,
    )

    if rel.compute_hash() != rel.id:
        expected_id = rel.id
        actual_id = rel.compute_hash()
        logger.warning(
            "Expected release to have id %s, but got %s. Recording raw_manifest.",
            hash_to_hex(expected_id),
            hash_to_hex(actual_id),
        )
        raw_string = tag.as_raw_string()
        rel = attr.evolve(
            rel,
            raw_manifest=git_object_header("tag", len(raw_string)) +
            raw_string)

    check_id(rel)
    return rel
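A short sketch of how the converter could be fed from a Dulwich repository; the repository path and tag ref name are assumptions:

# Hypothetical usage sketch; repository path and tag ref are assumptions.
from dulwich.repo import Repo

repo = Repo("/path/to/some-repo")
tag_obj = repo[repo.refs[b"refs/tags/v1.0.0"]]
release = dulwich_tag_to_release(tag_obj)
print(hash_to_hex(release.id))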
Example #7
    def process_svn_revisions(
        self, svnrepo, revision_start, revision_end
    ) -> Iterator[
        Tuple[List[Content], List[SkippedContent], List[Directory], Revision]
    ]:
        """Process svn revisions from revision_start to revision_end.

        At each svn revision, apply the new diffs and simultaneously
        compute the swh hashes.  This yields the computed swh objects as
        a tuple (contents, skipped_contents, directories, revision).

        Note that every `self.check_revision` revisions, a supplementary
        check takes place to detect hash-tree divergence (related to
        T570).

        Yields:
            tuple (contents, skipped_contents, directories, revision) of
            the objects computed for each svn revision

        Raises:
            ValueError: in case of a hash divergence detection

        """
        gen_revs = svnrepo.swh_hash_data_per_revision(revision_start, revision_end)
        parents = (self.latest_revision.id,) if self.latest_revision is not None else ()
        count = 0
        for rev, commit, new_objects, root_directory in gen_revs:
            count += 1
            # Send the associated contents/directories
            _contents, _skipped_contents, _directories = new_objects

            # compute the fs tree's checksums
            dir_id = root_directory.hash
            swh_revision = self.build_swh_revision(rev, commit, dir_id, parents)

            self.log.debug(
                "rev: %s, swhrev: %s, dir: %s",
                rev,
                hashutil.hash_to_hex(swh_revision.id),
                hashutil.hash_to_hex(dir_id),
            )

            if (
                self.check_revision
                and self.check_revision != 0
                and count % self.check_revision == 0
            ):
                self._check_revision_divergence(rev, dir_id)

            parents = (swh_revision.id,)

            yield _contents, _skipped_contents, _directories, swh_revision

        if not self.debug and self.svnrepo:
            # clean directory where revisions were replayed to gain some disk space
            # before the post_load operation
            self.svnrepo.clean_fs(self.svnrepo.local_url)
Example #8
def get_object(objstorage, obj_id):
    try:
        with statsd.timed(CONTENT_DURATION_METRIC, tags={"request": "get"}):
            obj = objstorage.get(obj_id)
            logger.debug("retrieved %(obj_id)s",
                         {"obj_id": hash_to_hex(obj_id)})
        return obj
    except ObjNotFoundError:
        logger.error(
            "Failed to retrieve %(obj_id)s: object not found",
            {"obj_id": hash_to_hex(obj_id)},
        )
        raise
    except Exception as exc:
        raise ReplayError(obj_id=obj_id, exc=exc) from None
Example #9
    def run(self):
        """ Do the copy on the backup storage.

        Run the archiver copier in order to copy the required content
        into the current destination.
        The content which corresponds to the sha1 in self.content_ids
        will be fetched from the master_storage and then copied into
        the backup object storage.

        Returns:
            A boolean indicating whether all the contents have been copied.
        """
        try:
            for content_id in self.content_ids:
                try:
                    content = self.source.get(content_id)
                except ObjNotFoundError:
                    logging.error('content %s not found' %
                                  hashutil.hash_to_hex(content_id))
                    continue
                self.destination.add(content, content_id)
        except Exception as e:
            logger.exception('Problem during copy: %s' % e)
            return False
        return True
Example #10
    def test_data_to_content(self):
        for filename, content in self.contents.items():
            conv_content = Content.from_bytes(
                mode=content["mode"], data=content["data"]
            )
            self.assertContentEqual(conv_content, content)
            self.assertIn(hash_to_hex(conv_content.hash), repr(conv_content))
Example #11
def test_lookup_revision_msg_ok(archive_data, new_revision):
    archive_data.revision_add([new_revision])

    revision_message = archive.lookup_revision_message(
        hash_to_hex(new_revision.id))

    assert revision_message == {"message": new_revision.message}
Example #12
def test_lookup_directory_with_revision_unknown_content(
        archive_data, new_revision):
    unknown_content_ = random_content()

    dir_path = "README.md"

    # A directory that points to unknown content
    dir = Directory(entries=(DirectoryEntry(
        name=bytes(dir_path.encode("utf-8")),
        type="file",
        target=hash_to_bytes(unknown_content_["sha1_git"]),
        perms=DentryPerms.content,
    ), ))

    # Create a revision that points to a directory
    # Which points to unknown content
    new_revision = new_revision.to_dict()
    new_revision["directory"] = dir.id
    del new_revision["id"]
    new_revision = Revision.from_dict(new_revision)

    # Add the directory and revision in mem
    archive_data.directory_add([dir])
    archive_data.revision_add([new_revision])
    new_revision_id = hash_to_hex(new_revision.id)
    with pytest.raises(NotFoundExc) as e:
        archive.lookup_directory_with_revision(new_revision_id, dir_path)
    assert e.match("Content not found for revision %s" % new_revision_id)
Example #13
    def test_directory_simple(self, git_loader, cook_extract_directory):
        repo = TestRepo()
        with repo as rp:
            (rp / "file").write_text(TEST_CONTENT)
            (rp / "executable").write_bytes(TEST_EXECUTABLE)
            (rp / "executable").chmod(0o755)
            (rp / "link").symlink_to("file")
            (rp / "dir1/dir2").mkdir(parents=True)
            (rp / "dir1/dir2/file").write_text(TEST_CONTENT)
            c = repo.commit()
            loader = git_loader(str(rp))
            loader.load()

            obj_id_hex = repo.repo[c].tree.decode()
            obj_id = hashutil.hash_to_bytes(obj_id_hex)
            swhid = CoreSWHID(object_type=ObjectType.DIRECTORY,
                              object_id=obj_id)

        with cook_extract_directory(loader.storage, swhid) as p:
            assert (p / "file").stat().st_mode == 0o100644
            assert (p / "file").read_text() == TEST_CONTENT
            assert (p / "executable").stat().st_mode == 0o100755
            assert (p / "executable").read_bytes() == TEST_EXECUTABLE
            assert (p / "link").is_symlink()
            assert os.readlink(str(p / "link")) == "file"
            assert (p / "dir1/dir2/file").stat().st_mode == 0o100644
            assert (p / "dir1/dir2/file").read_text() == TEST_CONTENT

            directory = from_disk.Directory.from_disk(path=bytes(p))
            assert obj_id_hex == hashutil.hash_to_hex(directory.hash)
Example #14
    def content_archive_update(self,
                               content_id,
                               archive_id,
                               new_status=None,
                               cur=None):
        """ Update the status of an archive content and set its mtime to now

        Change the status of an archived content for the given archive and set
        its mtime to the current time.

        Args:
            content_id (str): content sha1
            archive_id (str): name of the archive
            new_status (str): one of 'missing', 'present' or 'ongoing'.
                This status will replace the previous one. If not given,
                the function only changes the mtime of the content for the
                given archive.
        """
        if not self.__logfile:
            self.open_logfile()

        print(time.time(),
              archive_id,
              new_status,
              hashutil.hash_to_hex(content_id),
              file=self.__logfile)
Example #15
    def index(self,
              id: Sha1,
              data: Optional[bytes] = None,
              **kwargs) -> List[ContentLicenseRow]:
        """Index sha1s' content and store result.

        Args:
            id (bytes): content's identifier
            data (bytes): raw content associated to the content id

        Returns:
            A list of ContentLicenseRow objects, each with:

            - id (bytes): content's identifier (sha1)
            - license (bytes): license in bytes
            - path (bytes): path
            - indexer_configuration_id (int): tool used to compute the output

        """
        assert data is not None
        with write_to_temp(
                filename=hashutil.hash_to_hex(id),  # use the id as pathname
                data=data,
                working_directory=self.working_directory,
        ) as content_path:
            properties = compute_license(path=content_path)
        return [
            ContentLicenseRow(
                id=id,
                indexer_configuration_id=self.tool["id"],
                license=license,
            ) for license in properties["licenses"]
        ]
Example #16
def check_snapshot(expected_snapshot, storage):
    """Check for snapshot match.

    Provide the hashes as hexadecimal, the conversion is done
    within the method.

    Args:
        expected_snapshot (dict): full snapshot with hex ids
        storage (Storage): expected storage

    """
    expected_snapshot_id = expected_snapshot["id"]
    expected_branches = expected_snapshot["branches"]
    snap = snapshot_get_all_branches(storage, hash_to_bytes(expected_snapshot_id))
    if snap is None:
        # display known snapshots instead if possible
        if hasattr(storage, "_snapshots"):  # in-mem storage
            from pprint import pprint

            for snap_id, (_snap, _) in storage._snapshots.items():
                snapd = _snap.to_dict()
                snapd["id"] = hash_to_hex(snapd["id"])
                branches = {
                    branch.decode("utf-8"): decode_target(target)
                    for branch, target in snapd["branches"].items()
                }
                snapd["branches"] = branches
                pprint(snapd)
        raise AssertionError("Snapshot is not found")

    branches = {
        branch_name.decode("utf-8"): decode_target(branch)
        for branch_name, branch in snap["branches"].items()
    }
    }
    assert expected_branches == branches
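A hedged usage sketch of `check_snapshot`; the snapshot id, branch target and storage object below are made up for illustration:

# Hypothetical usage sketch; ids and the `loader.storage` reference are illustrative only.
expected_snapshot = {
    "id": "9ca9e75279904fd5b52b2080814546d660dbdbca",
    "branches": {
        "HEAD": {"target": "a" * 40, "target_type": "revision"},
    },
}
check_snapshot(expected_snapshot, loader.storage)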
Example #17
    def convert_hashes_bytes(v):
        """v is supposedly a hash as bytes, returns it converted in hex.

        """
        if isinstance(v, bytes):
            return hashutil.hash_to_hex(v)
        return v
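A short illustration of the helper above; the sample record is made up:

# Hypothetical illustration for convert_hashes_bytes; the record below is made up.
record = {
    "id": bytes.fromhex("9ca9e75279904fd5b52b2080814546d660dbdbca"),
    "status": "visible",
}
converted = {k: convert_hashes_bytes(v) for k, v in record.items()}
# converted["id"] is now the hex string; non-bytes values are left untouched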
Example #18
def test_revision_metadata_display(archive_data, client, directory, person,
                                   date):
    metadata = {"foo": "bar"}
    revision = Revision(
        directory=hash_to_bytes(directory),
        author=person,
        committer=person,
        message=b"commit message",
        date=TimestampWithTimezone.from_datetime(date),
        committer_date=TimestampWithTimezone.from_datetime(date),
        synthetic=False,
        type=RevisionType.GIT,
        metadata=metadata,
    )
    archive_data.revision_add([revision])

    url = reverse("browse-revision",
                  url_args={"sha1_git": hash_to_hex(revision.id)})

    resp = check_html_get_response(client,
                                   url,
                                   status_code=200,
                                   template_used="browse/revision.html")
    assert_contains(resp, "swh-metadata-popover")
    assert_contains(resp, escape(json.dumps(metadata, indent=4)))
Example #19
    def _object_path(self, obj_id):
        """Get the full path to an object"""
        hex_obj_id = hashutil.hash_to_hex(obj_id)
        if self.path_prefix:
            return self.path_prefix + hex_obj_id
        else:
            return hex_obj_id
Example #20
def content_raw(request, query_string):
    """Django view that produces a raw display of a content identified
    by its hash value.

    The url that points to it is
    :http:get:`/browse/content/[(algo_hash):](hash)/raw/`
    """
    re_encode = bool(strtobool(request.GET.get("re_encode", "false")))
    algo, checksum = query.parse_hash(query_string)
    checksum = hash_to_hex(checksum)
    content_data = request_content(query_string,
                                   max_size=None,
                                   re_encode=re_encode)

    filename = request.GET.get("filename", None)
    if not filename:
        filename = "%s_%s" % (algo, checksum)

    if (content_data["mimetype"].startswith("text/")
            or content_data["mimetype"] == "inode/x-empty"):
        response = HttpResponse(content_data["raw_data"],
                                content_type="text/plain")
        response["Content-disposition"] = "filename=%s" % filename
    else:
        response = HttpResponse(content_data["raw_data"],
                                content_type="application/octet-stream")
        response["Content-disposition"] = "attachment; filename=%s" % filename
    return response
Example #21
    def _create_revisions(self, revs_data):
        """Create the revisions in the tree as broken symlinks to the target
        identifier."""
        for file_data in revs_data:
            path = os.path.join(self.root, file_data["path"])
            target = hashutil.hash_to_hex(file_data["target"])
            self._create_file(path, target, mode=DentryPerms.symlink)
Example #22
    async def compute_entries(self) -> AsyncIterator[FuseEntry]:
        history = await self.fuse.get_history(self.history_swhid)

        if self.prefix:
            root_path = self.get_relative_root_path()
            for swhid in history:
                if swhid.object_id.startswith(hash_to_bytes(self.prefix)):
                    yield self.create_child(
                        FuseSymlinkEntry,
                        name=str(swhid),
                        target=Path(root_path, f"archive/{swhid}"),
                    )
        # Create sharded directories
        else:
            sharded_dirs = set()
            for swhid in history:
                next_prefix = hash_to_hex(
                    swhid.object_id)[:self.SHARDING_LENGTH]
                if next_prefix not in sharded_dirs:
                    sharded_dirs.add(next_prefix)
                    yield self.create_child(
                        RevisionHistoryShardByHash,
                        name=next_prefix,
                        mode=int(EntryMode.RDONLY_DIR),
                        prefix=next_prefix,
                        history_swhid=self.history_swhid,
                    )
Example #23
    def __init__(self, algo, hash_id, colliding_contents):
        self.algo = algo
        self.hash_id = hash_to_hex(hash_id)
        self.colliding_contents = [
            content_hex_hashes(c) for c in colliding_contents
        ]
        super().__init__(self.algo, self.hash_id, self.colliding_contents)
Example #24
def content_raw(request, query_string):
    """Django view that produces a raw display of a content identified
    by its hash value.

    The url that points to it is :http:get:`/browse/content/[(algo_hash):](hash)/raw/`
    """ # noqa
    try:
        reencode = bool(strtobool(request.GET.get('reencode', 'false')))
        algo, checksum = query.parse_hash(query_string)
        checksum = hash_to_hex(checksum)
        content_data = request_content(query_string,
                                       max_size=None,
                                       reencode=reencode)
    except Exception as exc:
        return handle_view_exception(request, exc)

    filename = request.GET.get('filename', None)
    if not filename:
        filename = '%s_%s' % (algo, checksum)

    if content_data['mimetype'].startswith('text/') or \
       content_data['mimetype'] == 'inode/x-empty':
        response = HttpResponse(content_data['raw_data'],
                                content_type="text/plain")
        response['Content-disposition'] = 'filename=%s' % filename
    else:
        response = HttpResponse(content_data['raw_data'],
                                content_type='application/octet-stream')
        response['Content-disposition'] = 'attachment; filename=%s' % filename
    return response
Example #25
def put_object(objstorage, obj_id, obj):
    try:
        with statsd.timed(CONTENT_DURATION_METRIC, tags={"request": "put"}):
            obj = objstorage.add(obj, obj_id, check_presence=False)
            logger.debug("stored %(obj_id)s", {"obj_id": hash_to_hex(obj_id)})
    except Exception as exc:
        raise ReplayError(obj_id=obj_id, exc=exc) from None
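Together with `get_object` from Example #8, this helper can be combined to copy a single object between two object storages. A hedged sketch; the storages and the object id are assumptions:

# Hypothetical sketch combining get_object (Example #8) and put_object.
from swh.objstorage.exc import ObjNotFoundError

obj_id = bytes.fromhex("94a9ed024d3859793618152ea559a168bbcbb5e2")
try:
    obj = get_object(src_objstorage, obj_id)
except ObjNotFoundError:
    obj = None
if obj is not None:
    put_object(dst_objstorage, obj_id, obj)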
Example #26
    def load_directory(self, obj_id: Sha1Git,
                       raw_manifest: Optional[bytes]) -> None:
        # Load the directory
        entries_it: Optional[
            Iterable[DirectoryEntry]] = stream_results_optional(
                self.storage.directory_get_entries, obj_id)

        if entries_it is None:
            logger.error("Missing swh:1:dir:%s, ignoring.",
                         hash_to_hex(obj_id))
            return

        directory = Directory(id=obj_id,
                              entries=tuple(entries_it),
                              raw_manifest=raw_manifest)
        git_object = raw_manifest or git_objects.directory_git_object(
            directory)
        self.write_object(obj_id, git_object)

        # Add children to the stack
        entry_loaders: Dict[str, Optional[List[Sha1Git]]] = {
            "file": self._cnt_stack,
            "dir": self._dir_stack,
            "rev":
            None,  # Do not include submodule targets (rejected by git-fsck)
        }
        for entry in directory.entries:
            stack = entry_loaders[entry.type]
            if stack is not None:
                self._push(stack, [entry.target])
Example #27
def lookup_missing_hashes(grouped_swhids: Dict[str, List[bytes]]) -> Set[str]:
    """Lookup missing Software Heritage persistent identifier hash, using
    batch processing.

    Args:
        grouped_swhids: a dictionary mapping object types to lists of
            object hashes

    Returns:
        A set of hexadecimal hashes not found in the storage
    """
    missing_hashes = []

    for obj_type, obj_ids in grouped_swhids.items():
        if obj_type == CONTENT:
            missing_hashes.append(
                storage.content_missing_per_sha1_git(obj_ids))
        elif obj_type == DIRECTORY:
            missing_hashes.append(storage.directory_missing(obj_ids))
        elif obj_type == REVISION:
            missing_hashes.append(storage.revision_missing(obj_ids))
        elif obj_type == RELEASE:
            missing_hashes.append(storage.release_missing(obj_ids))
        elif obj_type == SNAPSHOT:
            missing_hashes.append(storage.snapshot_missing(obj_ids))

    missing = set(
        map(lambda x: hashutil.hash_to_hex(x),
            itertools.chain(*missing_hashes)))

    return missing
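A minimal usage sketch for `lookup_missing_hashes`; the object ids are made up and CONTENT/REVISION are the object-type constants used above:

# Hypothetical usage sketch; the object ids below are made up.
grouped_swhids = {
    CONTENT: [bytes.fromhex("0" * 40)],
    REVISION: [bytes.fromhex("f" * 40)],
}
missing = lookup_missing_hashes(grouped_swhids)
# `missing` is a set of hex strings for the objects absent from the archive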
Example #28
def search_origin_metadata(fulltext: str,
                           limit: int = 50) -> Iterable[OriginMetadataInfo]:
    """Search for origins whose metadata match a provided string pattern.

    Args:
        fulltext: the string pattern to search for in origin metadata
        limit: the maximum number of found origins to return

    Returns:
        Iterable of origin metadata information for existing origins

    """
    results = []
    if search and config.get_config(
    )["metadata_search_backend"] == "swh-search":
        page_result = search.origin_search(
            metadata_pattern=fulltext,
            limit=limit,
        )
        matches = idx_storage.origin_intrinsic_metadata_get(
            [r["url"] for r in page_result.results])
    else:
        matches = idx_storage.origin_intrinsic_metadata_search_fulltext(
            conjunction=[fulltext], limit=limit)

    matches = [match.to_dict() for match in matches]
    origins = storage.origin_get([match["id"] for match in matches])
    for origin, match in zip(origins, matches):
        if not origin:
            continue
        match["from_revision"] = hashutil.hash_to_hex(match["from_revision"])
        del match["id"]
        results.append(OriginMetadataInfo(url=origin.url, metadata=match))

    return results
Example #29
def check_revisions_ordering(
    mocker,
    rev_walker_type,
    expected_result,
    truncated_history,
    revisions_list=_revisions_list,
):
    storage = mocker.patch("swh.storage.postgresql.storage.Storage")

    if not truncated_history:
        storage.revision_log.return_value = revisions_list
    else:
        revs_lists_truncated = [
            None if hash_to_hex(rev["id"]) == _rev_missing else rev
            for rev in revisions_list
        ]

        storage.revision_log.return_value = revs_lists_truncated

    revs_walker = get_revisions_walker(
        rev_walker_type, storage, hash_to_bytes(_rev_start)
    )

    assert list(map(hash_to_bytes, expected_result)) == [
        rev["id"] for rev in revs_walker
    ]

    assert revs_walker.is_history_truncated() == truncated_history

    if truncated_history:
        missing_revs = revs_walker.missing_revisions()
        assert missing_revs == {hash_to_bytes(_rev_missing)}
    else:
        assert revs_walker.missing_revisions() == set()
Example #30
    def _corrupt_content(self, obj_id):
        """Make the given content invalid."""
        hex_obj_id = hashutil.hash_to_hex(obj_id)
        file_path = self.checker.objstorage._obj_path(hex_obj_id)
        with gzip.open(file_path, 'wb') as f:
            f.write(b'Unexpected content')