Example #1
def test_client(kafka_prefix: str, kafka_consumer_group: str, kafka_server: str):
    producer = Producer(
        {
            "bootstrap.servers": kafka_server,
            "client.id": "test producer",
            "acks": "all",
        }
    )

    # Fill Kafka
    producer.produce(
        topic=kafka_prefix + ".revision",
        key=REV["id"],
        value=value_to_kafka(REV),
    )
    producer.flush()

    client = JournalClient(
        brokers=[kafka_server],
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_on_eof=True,
    )
    worker_fn = MagicMock()
    client.process(worker_fn)

    worker_fn.assert_called_once_with({"revision": [REV]})
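In real use, worker_fn is not a mock: the client calls it with a dict mapping each object type to a list of deserialized objects, which is what the assertion above checks. A minimal concrete worker, as a sketch (the counting logic is illustrative, not part of the original test):

from collections import Counter

counts: Counter = Counter()

def counting_worker_fn(objects):
    # `objects` maps an object type such as "revision" to a list of
    # deserialized values, e.g. {"revision": [REV]} in the test above
    for object_type, values in objects.items():
        counts[object_type] += len(values)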
Example #2
def test_replay_content(kafka_server, kafka_prefix, kafka_consumer_group):
    objstorage1 = get_objstorage(cls="memory")
    objstorage2 = get_objstorage(cls="memory")

    writer = get_journal_writer(
        cls="kafka",
        brokers=[kafka_server],
        client_id="kafka_writer",
        prefix=kafka_prefix,
        anonymize=False,
    )

    for content in CONTENTS:
        objstorage1.add(content.data)
        writer.write_addition("content", content)

    replayer = JournalClient(
        brokers=kafka_server,
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_on_eof=True,
        # stop_after_objects=len(objects),
    )

    worker_fn = functools.partial(
        process_replay_objects_content,
        src=objstorage1,
        dst=objstorage2,
    )
    replayer.process(worker_fn)
    # only contents with status "visible" will be copied to objstorage2
    expected_objstorage_state = {
        c.sha1: c.data
        for c in CONTENTS if c.status == "visible"
    }

    assert expected_objstorage_state == objstorage2.state
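CONTENTS itself is defined elsewhere in the test suite; the attributes used above (.data, .sha1, .status) match swh.model's Content objects, so a hypothetical stand-in could look like:

from swh.model.model import Content

# Hypothetical stand-in for the CONTENTS constant used above; only the
# "visible" entry should end up in objstorage2.
CONTENTS = [
    Content.from_data(b"some visible data", status="visible"),
    Content.from_data(b"some hidden data", status="hidden"),
]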
Example #3
def test_client_batch_size(
    kafka_prefix: str,
    kafka_consumer_group: str,
    kafka_server: str,
    batch_size: int,
):
    num_objects = 2 * batch_size + 1
    assert num_objects < 256, "Too many objects, generation will fail"

    producer = Producer(
        {
            "bootstrap.servers": kafka_server,
            "client.id": "test producer",
            "acks": "all",
        }
    )

    contents = [Content.from_data(bytes([i])) for i in range(num_objects)]

    # Fill Kafka
    for content in contents:
        producer.produce(
            topic=kafka_prefix + ".content",
            key=key_to_kafka(content.sha1),
            value=value_to_kafka(content.to_dict()),
        )

    producer.flush()

    client = JournalClient(
        brokers=[kafka_server],
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_on_eof=True,
        batch_size=batch_size,
    )

    collected_output: List[Dict] = []

    def worker_fn(objects):
        received = objects["content"]
        assert len(received) <= batch_size
        collected_output.extend(received)

    client.process(worker_fn)

    expected_output = [content.to_dict() for content in contents]
    assert len(collected_output) == len(expected_output)

    for output in collected_output:
        assert output in expected_output
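The batch_size argument is supplied by the test harness rather than hard-coded; one plausible way to provide it, assuming pytest parametrization (the chosen values are illustrative and keep num_objects = 2 * batch_size + 1 under the 256-object limit asserted above):

import pytest

@pytest.mark.parametrize("batch_size", [1, 5, 100])
def test_client_batch_size(
    kafka_prefix: str,
    kafka_consumer_group: str,
    kafka_server: str,
    batch_size: int,
):
    ...  # body as in the example above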
Example #4
def test_client_subscribe_one_topic(
    kafka_producer: Producer, kafka_prefix: str, kafka_server_base: str
):
    client = JournalClient(
        brokers=[kafka_server_base],
        group_id="whatever",
        prefix=kafka_prefix,
        stop_on_eof=True,
        object_types=["else"],
    )
    assert client.subscription == [f"{kafka_prefix}.else"]

    worker_fn = MagicMock()
    client.process(worker_fn)
    worker_fn.assert_called_once_with({"else": ["value2"]})
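The kafka_producer fixture is not shown here; for the assertions above (and in Example #10) to hold, it must have filled the .something and .else topics under kafka_prefix with "value1" and "value2". A hedged sketch of such a fixture, assuming the same serializer used elsewhere in these examples:

import pytest
from confluent_kafka import Producer
from swh.journal.serializers import value_to_kafka

@pytest.fixture
def kafka_producer(kafka_prefix: str, kafka_server_base: str):
    # Hypothetical fixture: pre-fill the two topics the subscribe tests read
    producer = Producer(
        {
            "bootstrap.servers": kafka_server_base,
            "client.id": "test producer",
            "acks": "all",
        }
    )
    producer.produce(
        topic=kafka_prefix + ".something",
        key=b"key1",
        value=value_to_kafka("value1"),
    )
    producer.produce(
        topic=kafka_prefix + ".else",
        key=b"key2",
        value=value_to_kafka("value2"),
    )
    producer.flush()
    return producer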
Example #5
def test_client_with_deserializer(
    kafka_prefix: str, kafka_consumer_group: str, kafka_server: str
):
    producer = Producer(
        {
            "bootstrap.servers": kafka_server,
            "client.id": "test producer",
            "acks": "all",
        }
    )

    # Fill Kafka
    revisions = cast(List[Revision], TEST_OBJECTS["revision"])
    for rev in revisions:
        producer.produce(
            topic=kafka_prefix + ".revision",
            key=rev.id,
            value=value_to_kafka(rev.to_dict()),
        )
    producer.flush()

    def custom_deserializer(object_type, msg):
        assert object_type == "revision"
        obj = kafka_to_value(msg)
        # filter the first revision
        if obj["id"] == revisions[0].id:
            return None
        return Revision.from_dict(obj)

    client = JournalClient(
        brokers=[kafka_server],
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_on_eof=True,
        value_deserializer=custom_deserializer,
    )
    worker_fn = MagicMock()
    client.process(worker_fn)

    # a commit seems to be needed to prevent a race condition in which
    # worker_fn has not been called yet at this point (exact cause unclear)
    client.consumer.commit()

    # Check the first revision has not been passed to worker_fn
    processed_revisions = set(worker_fn.call_args[0][0]["revision"])
    assert revisions[0] not in processed_revisions
    assert all(rev in processed_revisions for rev in revisions[1:])
Example #6
def test_client_stop_after_objects(
    kafka_prefix: str, kafka_consumer_group: str, kafka_server: str, count: int
):
    producer = Producer(
        {
            "bootstrap.servers": kafka_server,
            "client.id": "test producer",
            "acks": "all",
        }
    )

    # Fill Kafka
    revisions = cast(List[Revision], TEST_OBJECTS["revision"])
    for rev in revisions:
        producer.produce(
            topic=kafka_prefix + ".revision",
            key=rev.id,
            value=value_to_kafka(rev.to_dict()),
        )
    producer.flush()

    client = JournalClient(
        brokers=[kafka_server],
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_on_eof=False,
        stop_after_objects=count,
    )

    worker_fn = MagicMock()
    client.process(worker_fn)

    # The code below is not pretty, but it is needed: we are dealing with
    # dicts (which cannot go in a set, and whose values may be lists or
    # tuples), and we do not know in advance how many times worker_fn will
    # be called while the topic is consumed...
    worker_fn.assert_called()
    revs = []  # list of (unique) rev dicts we got from the client
    for call in worker_fn.call_args_list:
        callrevs = call[0][0]["revision"]
        for rev in callrevs:
            assert Revision.from_dict(rev) in revisions
            if rev not in revs:
                revs.append(rev)
    assert len(revs) == count
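Since swh.model objects are frozen (and therefore hashable), the dict juggling above could arguably be replaced by collecting Revision instances in a set; a sketch of that alternative:

# Hypothetical alternative to the dedup loop above, using hashable
# Revision model objects instead of raw dicts.
seen = {
    Revision.from_dict(rev)
    for call in worker_fn.call_args_list
    for rev in call[0][0]["revision"]
}
assert seen <= set(revisions)
assert len(seen) == count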
Example #7
def test_client_subscriptions_with_anonymized_topics(
    kafka_prefix: str, kafka_consumer_group: str, kafka_server_base: str
):
    producer = Producer(
        {
            "bootstrap.servers": kafka_server_base,
            "client.id": "test producer",
            "acks": "all",
        }
    )

    # Fill Kafka with a revision object on both the regular prefix (which
    # normally carries anonymized objects in this setup) and the privileged one
    producer.produce(
        topic=kafka_prefix + ".revision",
        key=REV["id"],
        value=value_to_kafka(REV),
    )
    producer.produce(
        topic=kafka_prefix + "_privileged.revision",
        key=REV["id"],
        value=value_to_kafka(REV),
    )
    producer.flush()

    # without privileged "channels" activated on the client side
    client = JournalClient(
        brokers=[kafka_server_base],
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_on_eof=True,
        privileged=False,
    )
    # we only subscribed to "standard" topics
    assert client.subscription == [kafka_prefix + ".revision"]

    # with privileged "channels" activated on the client side
    client = JournalClient(
        brokers=[kafka_server_base],
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        privileged=True,
    )
    # we only subscribed to "privileged" topics
    assert client.subscription == [kafka_prefix + "_privileged.revision"]
Example #8
def test_client_subscribe_absent_prefix(
    kafka_producer: Producer, kafka_prefix: str, kafka_server_base: str
):
    with pytest.raises(ValueError):
        JournalClient(
            brokers=[kafka_server_base],
            group_id="whatever",
            prefix="wrong.prefix",
            stop_on_eof=True,
        )
    with pytest.raises(ValueError):
        JournalClient(
            brokers=[kafka_server_base],
            group_id="whatever",
            prefix="wrong.prefix",
            stop_on_eof=True,
            object_types=["else"],
        )
Example #9
def test_client_subscriptions_without_anonymized_topics(
    kafka_prefix: str, kafka_consumer_group: str, kafka_server_base: str
):
    producer = Producer(
        {
            "bootstrap.servers": kafka_server_base,
            "client.id": "test producer",
            "acks": "all",
        }
    )

    # Fill Kafka with revision objects only on the standard prefix
    producer.produce(
        topic=kafka_prefix + ".revision",
        key=REV["id"],
        value=value_to_kafka(REV),
    )
    producer.flush()

    # without privileged channel activated on the client side
    client = JournalClient(
        brokers=[kafka_server_base],
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_on_eof=True,
        privileged=False,
    )
    # we only subscribed to the standard prefix
    assert client.subscription == [kafka_prefix + ".revision"]

    # with privileged channel activated on the client side
    client = JournalClient(
        brokers=[kafka_server_base],
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_on_eof=True,
        privileged=True,
    )
    # we are also subscribed only to the standard prefix, since there is no
    # privileged prefix on the kafka broker
    assert client.subscription == [kafka_prefix + ".revision"]
Example #10
def test_client_subscribe_all(
    kafka_producer: Producer, kafka_prefix: str, kafka_server_base: str
):
    client = JournalClient(
        brokers=[kafka_server_base],
        group_id="whatever",
        prefix=kafka_prefix,
        stop_on_eof=True,
    )
    assert set(client.subscription) == {
        f"{kafka_prefix}.something",
        f"{kafka_prefix}.else",
    }

    worker_fn = MagicMock()
    client.process(worker_fn)
    worker_fn.assert_called_once_with(
        {
            "something": ["value1"],
            "else": ["value2"],
        }
    )
Example #11
def replayer_storage_and_client(kafka_prefix: str, kafka_consumer_group: str,
                                kafka_server: str):
    journal_writer_config = {
        "cls": "kafka",
        "brokers": [kafka_server],
        "client_id": "kafka_writer",
        "prefix": kafka_prefix,
    }
    storage_config: Dict[str, Any] = {
        "cls": "memory",
        "journal_writer": journal_writer_config,
    }
    storage = get_storage(**storage_config)
    deserializer = ModelObjectDeserializer()
    replayer = JournalClient(
        brokers=kafka_server,
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_on_eof=True,
        value_deserializer=deserializer.convert,
    )

    yield storage, replayer
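Since this helper yields its result, it is presumably meant to be registered as a pytest fixture; a hypothetical registration and use (names are illustrative):

import pytest

@pytest.fixture
def replayer_env(kafka_prefix, kafka_consumer_group, kafka_server):
    # Delegate to the generator above so pytest handles setup/teardown
    yield from replayer_storage_and_client(
        kafka_prefix, kafka_consumer_group, kafka_server
    )

def test_with_replayer(replayer_env):
    storage, replayer = replayer_env
    # write objects through `storage` (its journal_writer feeds Kafka),
    # then call replayer.process(worker_fn) to consume them back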
Example #12
def test_replay_statsd(kafka_server, kafka_prefix, kafka_consumer_group,
                       statsd):
    objstorage1 = get_objstorage(cls="memory")
    objstorage2 = get_objstorage(cls="memory")

    writer = get_journal_writer(
        cls="kafka",
        brokers=[kafka_server],
        client_id="kafka_writer",
        prefix=kafka_prefix,
        anonymize=False,
    )

    # Fill the source objstorage with a bunch of content objects. Ideally
    # there would be 2 content objects for each possible replaying decision
    # (i.e. skipped, excluded, in_dst, not_in_src, failed and copied); only
    # the first four are exercised here (see the TODO below):
    # contents[0:2] are properly copied
    # contents[2:4] are excluded
    # contents[4:6] are already in dst
    # contents[6:8] are hidden (and thus skipped)
    contents = [
        Content.from_data(f"foo{i}".encode(),
                          status="hidden" if 6 <= i < 8 else "visible")
        for i in range(8)
    ]

    for content in contents:
        objstorage1.add(content.data)
        writer.write_addition("content", content)
    excluded = [c.sha1 for c in contents[2:4]]

    def exclude_fn(cnt_d):
        return cnt_d["sha1"] in excluded

    for content in contents[4:6]:
        objstorage2.add(content.data)

    replayer = JournalClient(
        brokers=kafka_server,
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_on_eof=True,
        # stop_after_objects=len(objects),
    )

    worker_fn = functools.partial(
        process_replay_objects_content,
        src=objstorage1,
        dst=objstorage2,
        exclude_fn=exclude_fn,
    )
    replayer.process(worker_fn)

    # Replayed objects come in no particular order, so statsd reports are not
    # sorted according to contents; we simply count the expected occurrences
    # of each statsd message.
    prefix = "swh_content_replayer"
    expected_reports = {
        # 4 because 2 for the copied objects + 2 for the in_dst ones
        f"^{prefix}_retries_total:1[|]c[|]#attempt:1,operation:obj_in_objstorage$": 4,
        f"^{prefix}_retries_total:1[|]c[|]#attempt:1,operation:get_object$": 2,
        f"^{prefix}_retries_total:1[|]c[|]#attempt:1,operation:put_object$": 2,
        f"^{prefix}_duration_seconds:[0-9]+[.][0-9]+[|]ms[|]#request:get$": 2,
        f"^{prefix}_duration_seconds:[0-9]+[.][0-9]+[|]ms[|]#request:put$": 2,
        f"^{prefix}_bytes:4[|]c$": 2,
    }
    decisions = ("copied", "skipped", "excluded", "in_dst", "not_in_src",
                 "failed")
    decision_re = (
        "^swh_content_replayer_operations_total:1[|]c[|]#decision:(?P<decision>"
        + "|".join(decisions) + ")(?P<extras>,.+)?$")

    operations = dict.fromkeys(decisions, 0)
    reports = dict.fromkeys(expected_reports, 0)

    for report in (r.decode() for r in statsd.socket.payloads):
        m = re.match(decision_re, report)
        if m:
            operations[m.group("decision")] += 1
        else:
            for expected in expected_reports:
                m = re.match(expected, report)
                if m:
                    reports[expected] += 1

    assert reports == expected_reports

    assert operations["skipped"] == 2
    assert operations["excluded"] == 2
    assert operations["in_dst"] == 2
    assert operations["copied"] == 2
    # TODO: exercise the not_in_src and failed decisions as well
    assert operations["not_in_src"] == 0
    assert operations["failed"] == 0
Example #13
def test_backfiller(
    swh_storage_backend_config,
    kafka_prefix: str,
    kafka_consumer_group: str,
    kafka_server: str,
    caplog,
):
    prefix1 = f"{kafka_prefix}-1"
    prefix2 = f"{kafka_prefix}-2"

    journal1 = {
        "cls": "kafka",
        "brokers": [kafka_server],
        "client_id": "kafka_writer-1",
        "prefix": prefix1,
    }
    swh_storage_backend_config["journal_writer"] = journal1
    storage = get_storage(**swh_storage_backend_config)
    # fill the storage and the journal (under prefix1)
    for object_type, objects in TEST_OBJECTS.items():
        method = getattr(storage, object_type + "_add")
        method(objects)

    # now apply the backfiller on the storage to fill the journal under prefix2
    backfiller_config = {
        "journal_writer": {
            "brokers": [kafka_server],
            "client_id": "kafka_writer-2",
            "prefix": prefix2,
        },
        "storage": swh_storage_backend_config,
    }

    # Backfilling
    backfiller = JournalBackfiller(backfiller_config)
    for object_type in TEST_OBJECTS:
        backfiller.run(object_type, None, None)

    # Trace log messages for unhandled object types in the replayer
    caplog.set_level(logging.DEBUG, "swh.storage.replay")

    # now check that the journal contents are the same under both prefixes;
    # use the replayer scaffolding to fill storages, to make it a bit easier
    # Replaying #1
    deserializer = ModelObjectDeserializer()
    sto1 = get_storage(cls="memory")
    replayer1 = JournalClient(
        brokers=kafka_server,
        group_id=f"{kafka_consumer_group}-1",
        prefix=prefix1,
        stop_on_eof=True,
        value_deserializer=deserializer.convert,
    )

    worker_fn1 = functools.partial(process_replay_objects, storage=sto1)
    replayer1.process(worker_fn1)

    # Replaying #2
    sto2 = get_storage(cls="memory")
    replayer2 = JournalClient(
        brokers=kafka_server,
        group_id=f"{kafka_consumer_group}-2",
        prefix=prefix2,
        stop_on_eof=True,
        value_deserializer=deserializer.convert,
    )
    worker_fn2 = functools.partial(process_replay_objects, storage=sto2)
    replayer2.process(worker_fn2)

    # Compare storages
    assert isinstance(sto1, InMemoryStorage)  # needed to help mypy
    assert isinstance(sto2, InMemoryStorage)
    check_replayed(sto1, sto2)

    for record in caplog.records:
        assert (
            "this should not happen" not in record.message
        ), "Replayer ignored some message types, see captured logging"
Example #14
def test_storage_replay_anonymized(
    kafka_prefix: str,
    kafka_consumer_group: str,
    kafka_server: str,
    privileged: bool,
):
    """Optimal replayer scenario.

    This:
    - writes objects to the topic
    - replayer consumes objects from the topic and replays them

    This tests the behavior with both a privileged and non-privileged replayer
    """
    writer_config = {
        "cls": "kafka",
        "brokers": [kafka_server],
        "client_id": "kafka_writer",
        "prefix": kafka_prefix,
        "anonymize": True,
    }
    src_config: Dict[str, Any] = {
        "cls": "memory",
        "journal_writer": writer_config
    }

    storage = get_storage(**src_config)

    # Fill the src storage
    nb_sent = 0
    for obj_type, objs in TEST_OBJECTS.items():
        if obj_type in ("origin_visit", "origin_visit_status"):
            # these are unrelated to what we want to test here
            continue
        method = getattr(storage, obj_type + "_add")
        method(objs)
        nb_sent += len(objs)

    # Fill a destination storage from Kafka, potentially using privileged topics
    dst_storage = get_storage(cls="memory")
    # we cannot validate an anonymized replay
    deserializer = ModelObjectDeserializer(validate=False)
    replayer = JournalClient(
        brokers=kafka_server,
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_after_objects=nb_sent,
        privileged=privileged,
        value_deserializer=deserializer.convert,
    )
    worker_fn = functools.partial(process_replay_objects, storage=dst_storage)

    nb_inserted = replayer.process(worker_fn)
    replayer.consumer.commit()

    assert nb_sent == nb_inserted
    # Check the contents of the destination storage, and whether the anonymization was
    # properly used
    assert isinstance(storage, InMemoryStorage)  # needed to help mypy
    assert isinstance(dst_storage, InMemoryStorage)
    check_replayed(storage, dst_storage, expected_anonymized=not privileged)
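The privileged flag is, like batch_size in Example #3, supplied by the harness; assuming pytest, a boolean parametrization would run the scenario both with and without access to the non-anonymized topics (illustrative):

import pytest

@pytest.mark.parametrize("privileged", [True, False])
def test_storage_replay_anonymized(
    kafka_prefix: str,
    kafka_consumer_group: str,
    kafka_server: str,
    privileged: bool,
):
    ...  # body as in the example above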