Example #1
    def test_revision_identifier(self):
        self.assertEqual(
            Revision.from_dict(self.revision).id,
            self.revision["id"],
        )
        self.assertEqual(
            Revision.from_dict(remove_id(self.revision)).id,
            self.revision["id"],
        )
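`remove_id` is a test helper that is not part of these snippets. A plausible reconstruction, assuming it only strips the `id` key so that `Revision.from_dict` has to recompute the identifier from the intrinsic fields:

def remove_id(d: dict) -> dict:
    # hypothetical helper: return a copy of the dict without its "id" key
    d = d.copy()
    d.pop("id", None)
    return d

The identity checks here and in Examples #11 to #16 all rely on the same property: when the `id` key is absent, the model computes the identifier from the object's other fields, and the tests assert that this recomputed identifier matches the known one.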
Example #2
def test_lookup_directory_with_revision_unknown_content(
        archive_data, new_revision):
    unknown_content_ = random_content()

    dir_path = "README.md"

    # A directory that points to unknown content
    dir = Directory(
        entries=(
            DirectoryEntry(
                name=dir_path.encode("utf-8"),
                type="file",
                target=hash_to_bytes(unknown_content_["sha1_git"]),
                perms=DentryPerms.content,
            ),
        )
    )

    # Create a revision that points to a directory
    # which in turn points to the unknown content
    new_revision = new_revision.to_dict()
    new_revision["directory"] = dir.id
    del new_revision["id"]
    new_revision = Revision.from_dict(new_revision)

    # Add the directory and revision to the in-memory test archive
    archive_data.directory_add([dir])
    archive_data.revision_add([new_revision])
    new_revision_id = hash_to_hex(new_revision.id)
    with pytest.raises(NotFoundExc) as e:
        archive.lookup_directory_with_revision(new_revision_id, dir_path)
    assert e.match("Content not found for revision %s" % new_revision_id)
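`random_content` is a fixture helper that is also not shown. A hypothetical stand-in, assuming all the test needs is a set of checksums that are almost surely absent from the archive:

import os

from swh.model.hashutil import MultiHash

def random_content():
    # hash random bytes; the resulting checksums (including "sha1_git")
    # are, with overwhelming probability, unknown to the test archive
    return MultiHash.from_data(os.urandom(64)).hexdigest()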
Example #3
def custom_deserializer(object_type, msg):
    assert object_type == "revision"
    obj = kafka_to_value(msg)
    # filter out the first revision
    if obj["id"] == revisions[0].id:
        return None
    return Revision.from_dict(obj)
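A hedged sketch of how such a deserializer is typically wired in, assuming the surrounding test passes it to `JournalClient` through its `value_deserializer` parameter and that messages deserialized to `None` are dropped by the client:

client = JournalClient(
    brokers=[kafka_server],
    group_id=kafka_consumer_group,
    prefix=kafka_prefix,
    stop_on_eof=True,
    value_deserializer=custom_deserializer,
)
worker_fn = MagicMock()  # as in Example #8
client.process(worker_fn)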
Example #4
    def test_revision_metadata_indexer_single_root_dir(self):
        metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG)
        fill_obj_storage(metadata_indexer.objstorage)
        fill_storage(metadata_indexer.storage)

        # Add a parent directory, that is the only directory at the root
        # of the revision
        rev = REVISION
        assert rev.directory == DIRECTORY2.id

        directory = Directory(
            entries=(
                DirectoryEntry(
                    name=b"foobar-1.0.0", type="dir", target=rev.directory, perms=16384,
                ),
            ),
        )
        assert directory.id is not None
        metadata_indexer.storage.directory_add([directory])

        new_rev_dict = {**rev.to_dict(), "directory": directory.id}
        new_rev_dict.pop("id")
        new_rev = Revision.from_dict(new_rev_dict)
        metadata_indexer.storage.revision_add([new_rev])

        tool = metadata_indexer.idx_storage.indexer_configuration_get(
            {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
        )
        assert tool is not None

        metadata_indexer.idx_storage.content_metadata_add(
            [
                ContentMetadataRow(
                    id=DIRECTORY2.entries[0].target,
                    indexer_configuration_id=tool["id"],
                    metadata=YARN_PARSER_METADATA,
                )
            ]
        )

        metadata_indexer.run([new_rev.id])

        results = list(
            metadata_indexer.idx_storage.revision_intrinsic_metadata_get([new_rev.id])
        )

        expected_results = [
            RevisionIntrinsicMetadataRow(
                id=new_rev.id,
                tool=TRANSLATOR_TOOL,
                metadata=YARN_PARSER_METADATA,
                mappings=["npm"],
            )
        ]

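        # The indexer storage assigns the tool id at registration time; drop
        # it so the rows compare equal to the expected ones, which are built
        # from TRANSLATOR_TOOL without an id.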
        for result in results:
            del result.tool["id"]

        # then
        self.assertEqual(results, expected_results)
Example #5
def test_lookup_revision_invalid_msg(archive_data, new_revision):
    new_revision = new_revision.to_dict()
    new_revision["message"] = b"elegant fix for bug \xff"
    archive_data.revision_add([Revision.from_dict(new_revision)])

    revision = archive.lookup_revision(hash_to_hex(new_revision["id"]))
    assert revision["message"] == "elegant fix for bug \\xff"
    assert revision["decoding_failures"] == ["message"]
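The two assertions above hinge on the message not being valid UTF-8: the lookup is expected to decode it with backslash escapes and report the field in `decoding_failures`. A minimal sketch of that behavior (an illustration, not the actual swh-web implementation):

message = b"elegant fix for bug \xff"
try:
    decoded = message.decode("utf-8")
    failures = []
except UnicodeDecodeError:
    # \xff cannot be decoded as UTF-8: escape it and record the failure
    decoded = message.decode("utf-8", errors="backslashreplace")
    failures = ["message"]

assert decoded == "elegant fix for bug \\xff"
assert failures == ["message"]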
Example #6
def test_build_swh_revision_default():
    """This should build the swh revision, including the extra headers
    about the origin repository.

    """
    dir_id = hash_to_bytes("d6e08e19159f77983242877c373c75222d5ae9dd")
    date = TimestampWithTimezone(
        timestamp=Timestamp(seconds=1088108379, microseconds=0),
        offset_bytes=b"+0000",
    )
    actual_rev = converters.build_swh_revision(
        repo_uuid=b"uuid",
        dir_id=dir_id,
        commit={
            "author_name": Person(
                name=b"theo", email=b"theo@uuid", fullname=b"theo <theo@uuid>"
            ),
            "message": b"commit message",
            "author_date": date,
        },
        rev=10,
        parents=(),
    )

    expected_rev = Revision.from_dict({
        "date": date.to_dict(),
        "committer_date": date.to_dict(),
        "type": "svn",
        "directory": dir_id,
        "message": b"commit message",
        "author": {
            "name": b"theo",
            "email": b"theo@uuid",
            "fullname": b"theo <theo@uuid>",
        },
        "committer": {
            "name": b"theo",
            "email": b"theo@uuid",
            "fullname": b"theo <theo@uuid>",
        },
        "synthetic": True,
        "extra_headers": (
            (b"svn_repo_uuid", b"uuid"),
            (b"svn_revision", b"10"),
        ),
        "parents": (),
    })

    assert actual_rev == expected_rev
Example #7
    def push_revision_subgraph(self, obj_id: Sha1Git) -> None:
        """Fetches the graph of revisions induced by the given ``obj_id`` and adds
        them to ``self._rev_stack``.

        If swh-graph is not available, this requires fetching the revisions themselves,
        so they are directly loaded instead."""
        loaded_from_graph = False

        if self.graph:
            from swh.graph.client import GraphArgumentException

            # First, try to cook using swh-graph, as it is more efficient than
            # swh-storage for querying the history
            obj_swhid = CoreSWHID(
                object_type=ObjectType.REVISION,
                object_id=obj_id,
            )
            try:
                revision_ids = (swhid.object_id for swhid in map(
                    CoreSWHID.from_string,
                    self.graph.visit_nodes(str(obj_swhid), edges="rev:rev"),
                ))
                self._push(self._rev_stack, revision_ids)
            except GraphArgumentException as e:
                logger.info(
                    "Revision %s not found in swh-graph, falling back to fetching "
                    "history using swh-storage. %s",
                    hash_to_hex(obj_id),
                    e.args[0],
                )
            else:
                loaded_from_graph = True

        if not loaded_from_graph:
            # If swh-graph is not available, or the revision is not yet in
            # swh-graph, fall back to self.storage.revision_log.
            # self.storage.revision_log also gives us the full revisions,
            # so we load them right now instead of just pushing them on the stack.
            walker = DFSRevisionsWalker(self.storage,
                                        obj_id,
                                        state=self._walker_state,
                                        ignore_displayname=True)
            for revision in walker:
                self.write_revision_node(Revision.from_dict(revision))
                self.nb_loaded += 1
                self._push(self._dir_stack, [revision["directory"]])
            # Save the state, so the next call to the walker won't return the same
            # revisions
            self._walker_state = walker.export_state()
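The `_push` helper is not shown in this excerpt. A hedged sketch, assuming it deduplicates object ids against a `self._seen` set before extending the target stack (the real cooker may differ):

    def _push(self, stack, obj_ids):
        # push ids that were not seen before; obj_ids must be an iterable
        # of ids, never a single bytes value
        assert not isinstance(obj_ids, bytes)
        new_ids = [id_ for id_ in obj_ids if id_ not in self._seen]
        self._seen.update(new_ids)
        stack.extend(new_ids)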
Example #8
def test_client_stop_after_objects(
    kafka_prefix: str, kafka_consumer_group: str, kafka_server: str, count: int
):
    producer = Producer(
        {
            "bootstrap.servers": kafka_server,
            "client.id": "test producer",
            "acks": "all",
        }
    )

    # Fill Kafka
    revisions = cast(List[Revision], TEST_OBJECTS["revision"])
    for rev in revisions:
        producer.produce(
            topic=kafka_prefix + ".revision",
            key=rev.id,
            value=value_to_kafka(rev.to_dict()),
        )
    producer.flush()

    client = JournalClient(
        brokers=[kafka_server],
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_on_eof=False,
        stop_after_objects=count,
    )

    worker_fn = MagicMock()
    client.process(worker_fn)

    # The code below is not pretty, but it is needed: we are dealing with
    # dicts (which are unhashable, so no set) whose values may be lists in
    # one call and tuples in another, and we do not know in advance how many
    # times worker_fn will be called while the topic is consumed.
    worker_fn.assert_called()
    revs = []  # list of (unique) rev dicts we got from the client
    for call in worker_fn.call_args_list:
        callrevs = call[0][0]["revision"]
        for rev in callrevs:
            assert Revision.from_dict(rev) in revisions
            if rev not in revs:
                revs.append(rev)
    assert len(revs) == count
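The membership test `Revision.from_dict(rev) in revisions` works because swh.model objects use value-based equality, so a dict round-tripped through `to_dict`/`from_dict` compares equal to the original object:

rev = revisions[0]
assert Revision.from_dict(rev.to_dict()) == rev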
Example #9
def identify_revision(
    hg: Hg,
    rev: Optional[bytes] = None,
    node_id_2_swhid: Optional[Dict[bytes, CoreSWHID]] = None,
) -> Iterator[RevisionIdentity]:
    """Return the repository revision identities.

    Args:
        hg: A `Hg` repository instance
        rev: An optional revision or Mercurial revset (see `hg help revsets`).
            If not provided, all the repository revisions are processed.
        node_id_2_swhid: An optional cache mapping hg node ids to SWHIDs.
            It will be updated in place with new mappings.
    """
    from swh.model.model import Revision

    if node_id_2_swhid is None:
        node_id_2_swhid = {}

    for revision in hg.log(rev):
        data = revision.to_dict()

        hg.up(revision.node_id)
        directory_swhid = identify_directory(hg.root())
        data["directory"] = directory_swhid.object_id

        parents = []
        for parent in data["parents"]:
            if parent not in node_id_2_swhid:
                parent_revision = next(
                    identify_revision(hg, parent, node_id_2_swhid))
                node_id_2_swhid[parent] = parent_revision.swhid
            assert node_id_2_swhid[parent].object_type == ObjectType.REVISION
            parents.append(node_id_2_swhid[parent].object_id)
        data["parents"] = parents

        revision_swhid = Revision.from_dict(data).swhid()
        node_id_2_swhid[revision.node_id] = revision_swhid

        yield RevisionIdentity(
            swhid=revision_swhid,
            node_id=revision.node_id,
            directory_swhid=directory_swhid,
        )
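A hypothetical usage sketch; the `Hg` constructor call and the repository path below are assumptions, not something the snippet above defines:

hg = Hg("/path/to/repo")  # assumed constructor; see the `Hg` class for the real API
cache = {}
for identity in identify_revision(hg, node_id_2_swhid=cache):
    print(identity.swhid, identity.node_id.hex())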
Example #10
def test_kafka_writer_anonymized(
    kafka_prefix: str,
    kafka_server: str,
    consumer: Consumer,
    privileged_object_types: Iterable[str],
):
    writer = KafkaJournalWriter(
        brokers=[kafka_server],
        client_id="kafka_writer",
        prefix=kafka_prefix,
        value_sanitizer=model_object_dict_sanitizer,
        anonymize=True,
    )

    expected_messages = 0

    for object_type, objects in TEST_OBJECTS.items():
        writer.write_additions(object_type, objects)
        expected_messages += len(objects)
        if object_type in privileged_object_types:
            expected_messages += len(objects)

    consumed_messages = consume_messages(consumer, kafka_prefix, expected_messages)
    assert_all_objects_consumed(consumed_messages, exclude=["revision", "release"])

    for key, obj_dict in consumed_messages["revision"]:
        obj = Revision.from_dict(obj_dict)
        for person in (obj.author, obj.committer):
            assert (len(person.fullname) == 32 and person.name is None
                    and person.email is None)
    for key, obj_dict in consumed_messages["release"]:
        obj = Release.from_dict(obj_dict)
        # author is optional for release
        if obj.author is None:
            continue
        for person in (obj.author, ):
            assert (len(person.fullname) == 32 and person.name is None
                    and person.email is None)
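The 32-byte `fullname` asserted above is consistent with anonymization replacing each person's fullname by its SHA256 digest (an assumption about the anonymizer, not verified here):

import hashlib

fullname = b"theo <theo@uuid>"
# a sha256 digest is exactly 32 bytes, matching the length checked above
assert len(hashlib.sha256(fullname).digest()) == 32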
Example #11
    def test_revision_identifier_only_fullname(self):
        self.assertEqual(
            Revision.from_dict(remove_id(self.revision_only_fullname)).id,
            self.revision_only_fullname["id"],
        )
Example #12
    def test_revision_identifier_empty_message(self):
        self.assertEqual(
            Revision.from_dict(remove_id(self.revision_empty_message)).id,
            self.revision_empty_message["id"],
        )
Example #13
    def test_revision_identifier_with_gpgsig(self):
        self.assertEqual(
            Revision.from_dict(remove_id(self.revision_with_gpgsig)).id,
            self.revision_with_gpgsig["id"],
        )
Example #14
    def test_revision_identifier_with_extra_headers(self):
        self.assertEqual(
            Revision.from_dict(remove_id(self.revision_with_extra_headers)).id,
            self.revision_with_extra_headers["id"],
        )
Example #15
    def test_revision_identifier_synthetic(self):
        self.assertEqual(
            Revision.from_dict(remove_id(self.synthetic_revision)).id,
            self.synthetic_revision["id"],
        )
Example #16
    def test_revision_identifier_none_metadata(self):
        self.assertEqual(
            Revision.from_dict(remove_id(self.revision_none_metadata)).id,
            self.revision_none_metadata["id"],
        )