Example #1
# Imports for this snippet (the kafkatest module paths are assumed from the
# Kafka system-test layout of the Copycat era; adjust if your tree differs):
import hashlib
import json
import subprocess

from ducktape.mark import parametrize
from ducktape.utils.util import wait_until

from kafkatest.services.console_consumer import ConsoleConsumer
from kafkatest.services.copycat import CopycatStandaloneService
from kafkatest.tests.kafka_test import KafkaTest


class CopycatStandaloneFileTest(KafkaTest):
    """
    Simple test of Copycat that produces data from a file in one Copycat
    standalone process and consumes it in another, validating that the
    output is identical to the input.
    """

    INPUT_FILE = "/mnt/copycat.input"
    OUTPUT_FILE = "/mnt/copycat.output"

    OFFSETS_FILE = "/mnt/copycat.offsets"

    TOPIC = "test"

    FIRST_INPUT_LIST = ["foo", "bar", "baz"]
    FIRST_INPUT = "\n".join(FIRST_INPUT_LIST) + "\n"
    SECOND_INPUT_LIST = ["razz", "ma", "tazz"]
    SECOND_INPUT = "\n".join(SECOND_INPUT_LIST) + "\n"

    SCHEMA = {"type": "string", "optional": False}

    def __init__(self, test_context):
        super(CopycatStandaloneFileTest, self).__init__(
            test_context,
            num_zk=1,
            num_brokers=1,
            topics={'test': {
                'partitions': 1,
                'replication-factor': 1
            }})

        self.source = CopycatStandaloneService(
            test_context, self.kafka, [self.INPUT_FILE, self.OFFSETS_FILE])
        self.sink = CopycatStandaloneService(
            test_context, self.kafka, [self.OUTPUT_FILE, self.OFFSETS_FILE])
        self.consumer_validator = ConsoleConsumer(test_context,
                                                  1,
                                                  self.kafka,
                                                  self.TOPIC,
                                                  consumer_timeout_ms=1000)

    @parametrize(converter="org.apache.kafka.copycat.json.JsonConverter",
                 schemas=True)
    @parametrize(converter="org.apache.kafka.copycat.json.JsonConverter",
                 schemas=False)
    @parametrize(converter="org.apache.kafka.copycat.storage.StringConverter",
                 schemas=None)
    def test_file_source_and_sink(
            self,
            converter="org.apache.kafka.copycat.json.JsonConverter",
            schemas=True):
        assert converter is not None, "converter type must be set"
        # Template parameters
        self.key_converter = converter
        self.value_converter = converter
        self.schemas = schemas

        self.source.set_configs(
            self.render("copycat-standalone.properties"),
            [self.render("copycat-file-source.properties")])
        self.sink.set_configs(self.render("copycat-standalone.properties"),
                              [self.render("copycat-file-sink.properties")])

        self.source.start()
        self.sink.start()

        # Appending data to the input file on the source node should produce
        # new records in the topic and, in turn, new output on the sink node.
        self.source.node.account.ssh("echo -e -n " + repr(self.FIRST_INPUT) +
                                     " >> " + self.INPUT_FILE)
        wait_until(
            lambda: self.validate_output(self.FIRST_INPUT),
            timeout_sec=60,
            err_msg=
            "Data added to input file was not seen in the output file in a reasonable amount of time."
        )

        # Restarting both should result in them picking up where they left off,
        # only processing new data.
        self.source.restart()
        self.sink.restart()

        self.source.node.account.ssh("echo -e -n " + repr(self.SECOND_INPUT) +
                                     " >> " + self.INPUT_FILE)
        wait_until(
            lambda: self.validate_output(self.FIRST_INPUT + self.SECOND_INPUT),
            timeout_sec=60,
            err_msg=
            "Sink output file never converged to the same state as the input file"
        )

        # Validate the format of the data in the Kafka topic
        self.consumer_validator.run()
        expected = json.dumps([
            line if not self.schemas else {
                "schema": self.SCHEMA,
                "payload": line
            } for line in self.FIRST_INPUT_LIST + self.SECOND_INPUT_LIST
        ])
        decoder = (json.loads if converter.endswith("JsonConverter") else str)
        actual = json.dumps(
            [decoder(x) for x in self.consumer_validator.messages_consumed[1]])
        assert expected == actual, "Expected %s but saw %s in Kafka" % (
            expected, actual)

    def validate_output(self, value):
        try:
            # md5sum prints "<digest>  <path>"; keep only the digest field
            # and compare it against the md5 of the expected file contents.
            output_hash = list(
                self.sink.node.account.ssh_capture(
                    "md5sum " + self.OUTPUT_FILE))[0].strip().split()[0]
            return output_hash == hashlib.md5(value).hexdigest()
        except subprocess.CalledProcessError:
            # md5sum fails while the output file does not exist yet; treat
            # that as "not converged" and let wait_until keep polling.
            return False
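
The expected/actual comparison at the end of test_file_source_and_sink hinges
on the wire format each converter produces. A minimal per-record sketch of
that difference, assuming a single input line "foo" (the envelope shape
mirrors the test's own SCHEMA constant; nothing here calls the real
converters):

import json

SCHEMA = {"type": "string", "optional": False}

def expected_message(line, schemas):
    # JsonConverter with schemas enabled wraps every payload in a
    # schema/payload envelope; otherwise the payload travels bare.
    if schemas:
        return {"schema": SCHEMA, "payload": line}
    return line

print(json.dumps(expected_message("foo", schemas=True)))
# {"schema": {"type": "string", "optional": false}, "payload": "foo"}
print(json.dumps(expected_message("foo", schemas=False)))
# "foo"

This is also why the test picks json.loads as the decoder only for
JsonConverter: StringConverter messages are plain strings and need no parsing.
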
Example #2
# Imports for this snippet (same hedged kafkatest module paths as Example #1):
import hashlib
import subprocess

from ducktape.utils.util import wait_until

from kafkatest.services.copycat import CopycatStandaloneService
from kafkatest.tests.kafka_test import KafkaTest


class CopycatStandaloneFileTest(KafkaTest):
    """
    Simple test of Copycat that produces data from a file in one Copycat
    standalone process and consumes it in another, validating that the
    output is identical to the input.
    """

    INPUT_FILE = "/mnt/copycat.input"
    OUTPUT_FILE = "/mnt/copycat.output"

    OFFSETS_FILE = "/mnt/copycat.offsets"

    FIRST_INPUT = "foo\nbar\nbaz\n"
    SECOND_INPUT = "razz\nma\ntazz\n"

    def __init__(self, test_context):
        super(CopycatStandaloneFileTest, self).__init__(
            test_context,
            num_zk=1,
            num_brokers=1,
            topics={'test': {
                'partitions': 1,
                'replication-factor': 1
            }})

        self.source = CopycatStandaloneService(
            test_context, self.kafka, [self.INPUT_FILE, self.OFFSETS_FILE])
        self.sink = CopycatStandaloneService(
            test_context, self.kafka, [self.OUTPUT_FILE, self.OFFSETS_FILE])

    def test_file_source_and_sink(self):
        # Render the worker and connector configs; both must be set before
        # the services are started.
        self.source.set_configs(self.render("copycat-standalone.properties"),
                                self.render("copycat-file-source.properties"))
        self.sink.set_configs(self.render("copycat-standalone.properties"),
                              self.render("copycat-file-sink.properties"))

        self.source.start()
        self.sink.start()

        # Appending data to the input file on the source node should produce
        # new records in the topic and, in turn, new output on the sink node.
        self.source.node.account.ssh("echo -e -n " + repr(self.FIRST_INPUT) +
                                     " >> " + self.INPUT_FILE)
        wait_until(
            lambda: self.validate_output(self.FIRST_INPUT),
            timeout_sec=60,
            err_msg=
            "Data added to input file was not seen in the output file in a reasonable amount of time."
        )

        # Restarting both should result in them picking up where they left off,
        # only processing new data.
        self.source.restart()
        self.sink.restart()

        self.source.node.account.ssh("echo -e -n " + repr(self.SECOND_INPUT) +
                                     " >> " + self.INPUT_FILE)
        wait_until(
            lambda: self.validate_output(self.FIRST_INPUT + self.SECOND_INPUT),
            timeout_sec=60,
            err_msg=
            "Sink output file never converged to the same state as the input file"
        )

    def validate_output(self, value):
        try:
            # md5sum prints "<digest>  <path>"; keep only the digest field
            # and compare it against the md5 of the expected file contents.
            output_hash = list(
                self.sink.node.account.ssh_capture(
                    "md5sum " + self.OUTPUT_FILE))[0].strip().split()[0]
            return output_hash == hashlib.md5(value).hexdigest()
        except subprocess.CalledProcessError:
            # md5sum fails while the output file does not exist yet; treat
            # that as "not converged" and let wait_until keep polling.
            return False
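
Both versions converge on the same trick in validate_output: hash the
expected contents locally and compare against the first field of the remote
md5sum output. A standalone sketch of that comparison (pure Python, no ssh;
the .encode() call is only needed because Python 3's hashlib wants bytes,
whereas this Python 2-era test hashed the str directly):

import hashlib

FIRST_INPUT = "foo\nbar\nbaz\n"
SECOND_INPUT = "razz\nma\ntazz\n"

def local_md5(text):
    # Digest of the file contents we expect the sink to have written.
    return hashlib.md5(text.encode("utf-8")).hexdigest()

# md5sum prints "<digest>  <path>"; the test keeps only the digest field.
fake_md5sum_line = local_md5(FIRST_INPUT + SECOND_INPUT) + "  /mnt/copycat.output"
remote_digest = fake_md5sum_line.strip().split()[0]
assert remote_digest == local_md5(FIRST_INPUT + SECOND_INPUT)

Because the sink only appends, the second wait_until can simply hash the
concatenation of both inputs and wait for the output file to converge to it.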