Example No. 1
def _run(persistent_data,
         res_ops,
         command,
         ops=(),
         initial=None,
         sources=1,
         partition_multiplier=1,
         validate_output=True,
         sender_mps=1000,
         sender_interval=0.01):
    host = '127.0.0.1'
    sinks = 1
    sink_mode = 'framed'
    batch_size = int(sender_mps * sender_interval)
    logging.debug("batch_size is {}".format(batch_size))

    if not isinstance(ops, (list, tuple)):
        raise TypeError("ops must be a list or tuple of operations")

    # If no initial workers value is given, determine the minimum number
    # required at the start so that the cluster never goes below 1 worker.
    # If a number is given, then verify it is sufficient.
    if ops:
        if isinstance(ops[0], Recover):
            raise ValueError("The first operation cannot be Recover")
        lowest = lowest_point(ops)
        if lowest < 1:
            min_workers = abs(lowest) + 1
        else:
            min_workers = 1
        if isinstance(initial, int):
            logging.debug('initial: {}'.format(initial))
            logging.debug('min: {}'.format(min_workers))
            assert (initial >= min_workers)
            workers = initial
        else:
            workers = min_workers
    else:  # no ops given: the test only exercises setup with initial workers
        assert initial is not None and initial > 0
        workers = initial

    logging.info("Initial cluster size: {}".format(workers))

    # Create the sequence generator and the reader; partition_multiplier
    # controls how many partitions are created per worker
    msg = MultiSequenceGenerator(base_parts=workers * partition_multiplier - 1)

    # Start cluster
    logging.debug("Creating cluster")
    with Cluster(command=command,
                 host=host,
                 sources=sources,
                 workers=workers,
                 sinks=sinks,
                 sink_mode=sink_mode,
                 persistent_data=persistent_data) as cluster:

        # start senders
        for s in range(sources):
            sender = Sender(cluster.source_addrs[0],
                            Reader(msg),
                            batch_size=batch_size,
                            interval=sender_interval,
                            reconnect=True)
            cluster.add_sender(sender, start=True)

        # let the senders send some data first
        time.sleep(1)

        # loop over ops, keeping the result and passing it to the next op
        res = None
        assert (not cluster.get_crashed_workers())
        for op in ops:
            res_ops.append(op)
            logging.info("Executing: {}".format(op))
            res = op.apply(cluster, res)
            assert (not cluster.get_crashed_workers())

        # Wait a full second for things to calm down
        time.sleep(1)

        # If using external senders, wait for them to stop cleanly
        if cluster.senders:
            # Tell the multi-sequence-sender to stop
            msg.stop()

            # wait for senders to reach the end of their readers and stop
            for s in cluster.senders:
                cluster.wait_for_sender(s)

            # Verify all sender sequences caught up to the stop value
            stop_value = max(msg.seqs)
            t0 = time.time()
            while True:
                try:
                    assert len(msg.seqs) == msg.seqs.count(stop_value)
                    break
                except AssertionError:
                    if time.time() - t0 > 2:
                        logging.error("msg.seqs aren't all equal: {}".format(
                            msg.seqs))
                        raise
                time.sleep(0.1)

            # Create await_values for the sink based on the stop values
            # from the multi-sequence generator
            await_values = []
            for part, val in enumerate(msg.seqs):
                key = '{:07d}'.format(part).encode()
                data = '[{},{},{},{}]'.format(
                    *[val - x for x in range(3, -1, -1)]).encode()
                await_values.append((key, data))
            cluster.sink_await(values=await_values, func=parse_sink_value)

        logging.info("Completion condition achieved. Shutting down cluster.")

        # Save the sink data to a file, then use the validator to check
        # it in at-least-once mode
        if validate_output:
            # TODO: move to validations.py
            out_file = os.path.join(cluster.res_dir, 'received.txt')
            cluster.sinks[0].save(out_file)

            # Validate captured output
            logging.info("Validating output")
            # With no external senders the app used an internal source,
            # so no expected end value is passed to the validator
            if cluster.senders:
                cmd_validate = (
                    'validator -i {out_file} -e {expect} -a'.format(
                        out_file=out_file, expect=stop_value))
            else:
                cmd_validate = ('validator -i {out_file} -a'.format(
                    out_file=out_file))
            res = run_shell_cmd(cmd_validate)
            try:
                assert res.success
                logging.info("Validation successful")
            except AssertionError:
                raise AssertionError('Validation failed with the following '
                                     'error:\n{}'.format(res.output))

        # Validate worker actually underwent recovery
        if cluster.restarted_workers:
            # TODO: move to validations.py
            logging.info("Validating recovery")
            pattern = "RESILIENCE\: Replayed \d+ entries from recovery log file\."
            for r in cluster.restarted_workers:
                stdout = r.get_output()
                try:
                    assert (re.search(pattern, stdout) is not None)
                    logging.info("{} recovered successfully".format(r.name))
                except AssertionError:
                    raise AssertionError(
                        'Worker {} does not appear to have performed '
                        'recovery as expected.'.format(r.name))
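
The helper lowest_point(ops) used above computes how far the worker count can fall over the course of the operation sequence. A minimal sketch of such a helper, assuming each operation exposes a signed worker delta through a hypothetical delta attribute (the real helper's interface may differ):

def lowest_point(ops):
    # Track the running total of worker-count changes and return the
    # lowest value it reaches; ops without a delta contribute 0.
    total = 0
    lowest = 0
    for op in ops:
        total += getattr(op, 'delta', 0)
        lowest = min(lowest, total)
    return lowest

For example, a shrink of 2 followed by a grow of 1 gives a lowest point of -2, so _run would require at least abs(-2) + 1 = 3 initial workers.
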
Example No. 2
def _test_log_rotation_external_trigger_no_recovery(command):
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    expect = 2000
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0.encode(),
                    struct.pack('>I', len(last_value_1)) + last_value_1.encode())

    setup_resilience_path(res_dir)

    command = '''{} \
        --log-rotation \
        --stop-pause {}
    '''.format(command, STOP_THE_WORLD_PAUSE)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        num_ports = sources + 3 * workers
        ports = get_port_values(num=num_ports, host=host)
        input_ports = ports[:sources]
        # Each worker gets a group of 3 ports
        worker_ports = [ports[sources:][i:i + 3]
                        for i in range(0, len(ports[sources:]), 3)]
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs, metrics_port,
                      res_dir, workers, worker_ports)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host,
                        input_ports[0],
                        reader,
                        batch_size=100,
                        interval=0.05)
        sender.start()

        time.sleep(0.5)
        # Trigger log rotation with an external message. We assume the
        # third port in each worker's group is its external channel port.
        external_port = worker_ports[1][2]
        cmd_external_trigger = ('external_sender -e {}:{} -t rotate-log -m '
                                'worker1'.format(host, external_port))

        res = run_shell_cmd(cmd_external_trigger)
        try:
            assert (res.success)
        except AssertionError:
            raise AssertionError('External rotation trigger failed with '
                                 'the error:\n{}'.format(res.output))

        # wait until sender completes (~1 second)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Wait for the expected final values at the sink, then stop
        # runners and sink
        stopper = SinkAwaitValue(sink, await_values, AWAIT_TIMEOUT)
        stopper.start()
        stopper.join()
        if stopper.error:
            for r in runners:
                print(r.name)
                print(r.get_output())
                print('---')
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        print('sink.data size:', len(sink.data))

        # Save the sink data to a file, then use the validator to check
        # it in at-least-once mode
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        res = run_shell_cmd(cmd_validate)
        try:
            assert res.success
        except AssertionError:
            print(runners[0].name)
            print(runners[0].get_output())
            print('---')
            print(runners[1].name)
            print(runners[1].get_output())
            print('---')
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(res.output))

        # Validate all workers underwent log rotation
        for r in runners[1:]:
            stdout = r.get_output()
            try:
                assert (re.search(log_rotated_pattern, stdout, re.M | re.S)
                        is not None)
            except AssertionError:
                raise AssertionError('Worker %r does not appear to have '
                                     'performed log rotation as expected. '
                                     'The pattern %r is missing from the '
                                     'worker output included below.\n'
                                     'STDOUT\n---\n%s\n---\n' %
                                     (r.name, log_rotated_pattern, stdout))
    finally:
        for r in runners:
            r.stop()
        clean_resilience_path(res_dir)
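
The log-rotation examples all build await_values by length-framing the expected final sink strings with a 4-byte big-endian prefix. A standalone illustration of that framing, using the same expect = 2000 as above:

import struct

expect = 2000
# range(6, -2, -2) yields 6, 4, 2, 0, so the even stream's final window
# is [1994,1996,1998,2000] (and the odd stream's is [1993,...,1999]).
last_value_0 = '[{}]'.format(','.join(str(expect - v) for v in range(6, -2, -2)))
assert last_value_0 == '[1994,1996,1998,2000]'
framed = struct.pack('>I', len(last_value_0)) + last_value_0.encode()
assert framed[:4] == b'\x00\x00\x00\x15'  # 21-byte payload follows
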
Example No. 3
def _test_log_rotation_file_size_trigger_recovery(command):
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    expect = 2000
    event_log_file_size = 50000
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0.encode(),
                    struct.pack('>I', len(last_value_1)) + last_value_1.encode())

    setup_resilience_path(res_dir)

    command = '''{} \
        --log-rotation \
        --stop-pause {}
    '''.format(command, STOP_THE_WORLD_PAUSE)
    alt_block = '--event-log-file-size {}'.format(event_log_file_size)
    alt_func = lambda x: x > 0

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        num_ports = sources + 3 * workers
        ports = get_port_values(num=num_ports, host=host)
        input_ports = ports[:sources]
        # Each worker gets a group of 3 ports
        worker_ports = [ports[sources:][i:i + 3]
                        for i in range(0, len(ports[sources:]), 3)]
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs, metrics_port,
                      res_dir, workers, worker_ports, alt_block, alt_func)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host,
                        input_ports[0],
                        reader,
                        batch_size=100,
                        interval=0.05)
        sender.start()

        # Wait for runner to complete a log rotation
        log_rotated_checker = RunnerChecker(runners[1],
                                            log_rotated_patterns,
                                            timeout=AWAIT_TIMEOUT)
        log_rotated_checker.start()
        log_rotated_checker.join()
        if log_rotated_checker.error:
            raise log_rotated_checker.error

        # stop the worker in a non-graceful fashion so it doesn't remove
        # recovery files
        runners[-1].kill()

        # Restart the worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # wait until sender completes (~1 second)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Wait for the expected final values at the sink, then stop
        # runners and sink
        stopper = SinkAwaitValue(sink, await_values, AWAIT_TIMEOUT)
        stopper.start()
        stopper.join()
        if stopper.error:
            for r in runners:
                print(r.name)
                print(r.get_output())
                print('---')
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        print('sink.data size:', len(sink.data))

        # Save the sink data to a file, then use the validator to check
        # it in at-least-once mode
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        res = run_shell_cmd(cmd_validate)
        try:
            assert res.success
        except AssertionError:
            print(runners[-1].name)
            print(runners[-1].get_output())
            print('---')
            print(runners[-2].name)
            print(runners[-2].get_output())
            print('---')
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(res.output))

        # Validate the non-initializer worker underwent log rotation
        i, r = 1, runners[1]
        stdout = r.get_output()
        try:
            assert (re.search(log_rotated_pattern, stdout, re.M | re.S)
                    is not None)
        except AssertionError:
            raise AssertionError('Worker %d.%r does not appear to have '
                                 'performed log rotation as expected. '
                                 'The pattern %r is missing from the '
                                 'worker output included below.\n'
                                 'STDOUT\n---\n%s\n---\n' %
                                 (i, r.name, log_rotated_pattern, stdout))

        # Validate worker actually underwent recovery
        pattern = "RESILIENCE\: Replayed \d+ entries from recovery log file\."
        stdout = runners[-1].get_output()
        try:
            assert (re.search(pattern, stdout) is not None)
        except AssertionError:
            raise AssertionError('Worker does not appear to have performed '
                                 'recovery as expected. Worker output is '
                                 'included below.\nSTDOUT\n---\n%s' % stdout)
    finally:
        for r in runners:
            r.stop()
        clean_resilience_path(res_dir)
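
Both recovery checks search a restarted worker's stdout for the same resilience replay message. That logic can be distilled into a small standalone helper (a hypothetical assert_recovered; the examples inline the check instead):

import re

# Marker a recovered worker prints after replaying its event log.
RECOVERY_PATTERN = r"RESILIENCE: Replayed \d+ entries from recovery log file\."

def assert_recovered(name, stdout):
    # Raise a descriptive error if the recovery marker is absent.
    if re.search(RECOVERY_PATTERN, stdout) is None:
        raise AssertionError('Worker {} does not appear to have performed '
                             'recovery as expected.'.format(name))

Called as assert_recovered(runners[-1].name, runners[-1].get_output()), it would replace the try/except blocks above.
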
Example No. 4
def _run(command, runner_data=None):
    # Use a fresh list per call rather than a shared mutable default
    if runner_data is None:
        runner_data = []
    host = '127.0.0.1'
    sources = 1
    sinks = 1
    sink_mode = 'framed'
    workers = 2
    expect = 2000
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0.encode(),
                    struct.pack('>I', len(last_value_1)) + last_value_1.encode())

    # Start cluster
    with Cluster(command=command,
                 host=host,
                 sources=sources,
                 workers=workers,
                 sinks=sinks,
                 sink_mode=sink_mode,
                 runner_data=runner_data) as cluster:
        # Create sender
        logging.debug("Creating sender")
        sender = Sender(cluster.source_addrs[0],
                        Reader(sequence_generator(expect)),
                        batch_size=100,
                        interval=0.05,
                        reconnect=True)
        cluster.add_sender(sender, start=True)

        # wait for some data to go through the system
        time.sleep(0.2)

        # stop worker in a non-graceful fashion so that recovery files
        # aren't removed
        logging.debug("Killing worker")
        killed = cluster.kill_worker(worker=-1)

        # Restart the worker
        logging.debug("Restarting worker")
        cluster.restart_worker(killed)

        # wait until sender completes (~1 second)
        logging.debug("Waiting for sender to complete")
        cluster.wait_for_sender()

        # Wait for the expected final values to arrive at the sink
        logging.debug("Waiting for sink to complete")
        cluster.sink_await(await_values)

        # stop the cluster
        logging.debug("Stopping cluster")
        cluster.stop_cluster()

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(cluster.res_dir, 'received.txt')
        cluster.sinks[0].save(out_file, mode='giles')

        # Validate captured output
        logging.debug("Validating output")
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        res = run_shell_cmd(cmd_validate)
        try:
            assert (res.success)
        except AssertionError:
            raise AssertionError('Output validation failed with the following '
                                 'error:\n{}'.format(res.output))

        # Validate worker actually underwent recovery
        logging.debug("Validating recovery from worker stdout")
        pattern = "RESILIENCE\: Replayed \d+ entries from recovery log file\."
        try:
            assert (re.search(pattern, cluster.runners[-1].get_output())
                    is not None)
        except AssertionError:
            raise AssertionError("Worker does not appear to have performed "
                                 "recovery as expected.")