Example #1
def _autoscale_sequence(command, ops=[1], cycles=1, initial=None):
    host = '127.0.0.1'
    sources = 1

    if isinstance(ops, int):
        ops = [ops]

    # If no initial workers value is given, determine the minimum number
    # required at the start so that the cluster never goes below 1 worker.
    # If a number is given, then verify it is sufficient.
    bottom = min(min(compact_sign(ops * cycles)), sum(ops * cycles))
    if bottom < 1:
        min_workers = abs(bottom) + 1
    else:
        min_workers = 1
    if isinstance(initial, int):
        assert (initial >= min_workers)
        workers = initial
    else:
        workers = min_workers

    batch_size = 10
    interval = 0.05
    msgs_per_sec = int(batch_size / interval)
    base_time = 10  # Seconds
    cycle_time = 10  # seconds
    expect_time = base_time + cycle_time * cycles  # seconds
    expect = expect_time * msgs_per_sec
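    # e.g. with the defaults (cycles=1): msgs_per_sec = 200,
    # expect_time = 20, so expect = 4000 messages in total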
    sender_timeout = expect_time + 10  # seconds
    join_timeout = 200
    runner_join_timeout = 30

    res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    setup_resilience_path(res_dir)

    steps = []

    runners = []
    try:
        try:
            # Create sink, metrics, reader, sender
            sink = Sink(host)
            metrics = Metrics(host)
            lowercase2 = [a + b for a in lowercase for b in lowercase]
            char_gen = cycle(lowercase2)
            chars = [next(char_gen) for i in range(expect)]
            expected = Counter(chars)

            reader = Reader(iter_generator(chars,
                                           lambda s: pack('>2sI', s, 1)))

            await_values = [
                pack('>I2sQ', 10, c, v) for c, v in expected.items()
            ]

            # Start sink and metrics, and get their connection info
            sink.start()
            sink_host, sink_port = sink.get_connection_info()
            outputs = '{}:{}'.format(sink_host, sink_port)

            metrics.start()
            metrics_host, metrics_port = metrics.get_connection_info()
            time.sleep(0.05)

            num_ports = sources + 3 + (2 * (workers - 1))
            ports = get_port_values(num=num_ports, host=host)
            (input_ports, (control_port, data_port, external_port),
             worker_ports) = (ports[:sources], ports[sources:sources + 3],
                              zip(ports[sources + 3:][::2],
                                  ports[sources + 3:][1::2]))
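            # e.g. sources=1, workers=3: ports[0] is the input port, ports[1:4]
            # are the control/data/external ports, and the remaining 4 ports
            # pair up, one (port, port) pair per non-initializer worker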
            inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

            start_runners(runners, command, host, inputs, outputs,
                          metrics_port, control_port, external_port, data_port,
                          res_dir, workers, worker_ports)

            # Wait for first runner (initializer) to report application ready
            runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
            runner_ready_checker.start()
            runner_ready_checker.join()
            if runner_ready_checker.error:
                raise runner_ready_checker.error

            # Get initial partition data
            partitions = query_partitions(host, external_port)
            # Verify all workers start with partitions
            assert (map(
                len, partitions['state_partitions']
                ['letter-state'].values()).count(0) == 0)

            # start sender
            sender = Sender(host,
                            input_ports[0],
                            reader,
                            batch_size=batch_size,
                            interval=interval)
            sender.start()

            time.sleep(2)

            # Perform autoscale cycles
            start_froms = {r: 0 for r in runners}
            for cyc in range(cycles):
                for joiners in ops:
                    steps.append(joiners)
                    joined = []
                    left = []
                    if joiners > 0:
                        # create a new worker and have it join
                        new_ports = get_port_values(num=(joiners * 2),
                                                    host=host,
                                                    base_port=25000)
                        joiner_ports = zip(new_ports[::2], new_ports[1::2])
                        for i in range(joiners):
                            add_runner(runners, command, host, inputs, outputs,
                                       metrics_port, control_port,
                                       external_port, data_port, res_dir,
                                       joiners, *joiner_ports[i])
                            joined.append(runners[-1])
                            start_froms[runners[-1]] = 0

                        patterns_i = ([
                            re.escape('***Worker {} attempting to join the '
                                      'cluster. Sent necessary information.***'
                                      .format(r.name)) for r in joined
                        ] + [
                            re.escape('Migrating partitions to {}'.format(
                                r.name)) for r in joined
                        ] + [
                            re.escape('--All new workers have acked migration '
                                      'batch complete'),
                            re.escape('~~~Resuming message processing.~~~')
                        ])
                        patterns_j = [
                            re.escape('***Successfully joined cluster!***'),
                            re.escape('~~~Resuming message processing.~~~')
                        ]

                        # Wait for runners to complete joining
                        join_checkers = []
                        join_checkers.append(
                            RunnerChecker(runners[0],
                                          patterns_i,
                                          timeout=join_timeout,
                                          start_from=start_froms[runners[0]]))
                        for runner in joined:
                            join_checkers.append(
                                RunnerChecker(runner,
                                              patterns_j,
                                              timeout=join_timeout,
                                              start_from=start_froms[runner]))
                        for jc in join_checkers:
                            jc.start()
                        for jc in join_checkers:
                            jc.join()
                            if jc.error:
                                outputs = runners_output_format(runners)
                                raise AutoscaleTimeoutError(
                                    "'{}' timed out on JOIN in {} "
                                    "seconds. The cluster had the following outputs:\n===\n{}"
                                    .format(jc.runner_name, jc.timeout,
                                            outputs),
                                    as_error=jc.error,
                                    as_steps=steps)

                    elif joiners < 0:  # negative ops indicate leavers
                        # choose the most recent, still-alive runners to leave
                        leavers = abs(joiners)
                        idx = 1
                        while len(left) < leavers and idx < len(runners):
                            if runners[-idx].is_alive():
                                left.append(runners[-idx])
                            idx += 1
                        if len(left) < leavers:
                            raise AutoscaleTestError(
                                "Not enough workers left to "
                                "shrink! {} requested but "
                                "only {} live non-initializer"
                                "workers found!".format(joiners, len(left)))

                        # Create the checkers

                        initializer = [runners[0]]
                        remaining = [
                            r for r in runners
                            if r.is_alive() and r not in initializer + left
                        ]

                        patterns_i = ([
                            r'ExternalChannelConnectNotifier: initializer: '
                            r'server closed', r'Saving topology!',
                            r'Saving worker names to file: .*?initializer.'
                            r'workers'
                        ] + [
                            re.escape(
                                r'LocalTopology._save_worker_names: {}'.format(
                                    r.name)) for r in initializer + remaining
                        ] + [
                            re.escape(r'~~~Initiating shrink~~~'),
                            re.escape(r'-- Remaining workers:')
                        ] + [
                            re.escape(r'-- -- {}'.format(r.name))
                            for r in initializer + remaining
                        ] + [
                            re.escape(r'~~~Stopping message processing for '
                                      r'leaving workers.~~~'),
                            re.escape(r'~~~Resuming message processing.~~~')
                        ])

                        patterns_r = ([
                            re.escape(
                                r'Control Ch: Received Mute Request from initializer'
                            ),
                            re.escape(
                                r'~~~Stopping message processing for leaving workers.~~~'
                            ),
                            re.escape(
                                r'DataChannelConnectNotifier: server closed'),
                            re.escape(
                                r'ControlSenderConnectNotifier: server closed'
                            ),
                            re.escape(r'BoundaryNotify: closed'),
                            re.escape(
                                r'Control Ch: Received Unmute Request from initializer'
                            ),
                            re.escape(r'~~~Resuming message processing.~~~'),
                            re.escape(r'Shutting down OutgoingBoundary'),
                            re.escape(r'Shutting down ControlConnection')
                        ])
                        patterns_r_per = [
                            r'ControlChannelConnectNotifier:{}: server closed'
                        ]

                        patterns_l = ([
                            re.escape(
                                r'Control Ch: Received Mute Request from {}'.
                                format(r.name))
                            for r in initializer + remaining
                        ] + [
                            re.escape(
                                r'Migrating all partitions to {} remaining '
                                r'workers'.format(
                                    len(initializer + remaining))),
                            r'\^\^Migrating \d+ steps to {} workers'.format(
                                len(initializer + remaining))
                        ] + [
                            r'\^\^Migrating step \d+ to outgoing '
                            r'boundary {}/[0-9a-f]{{12}}'.format(r.name)
                            for r in initializer + remaining
                        ] + [
                            re.escape(
                                r'--Sending leaving worker done migration msg to cluster'
                            ),
                            re.escape(
                                r'Connections: Finished shutdown procedure.'),
                            re.escape(r'Shutting down ControlConnection'),
                            re.escape(r'Shutting down TCPSink'),
                            re.escape(r'Shutting down DataReceiver'),
                            re.escape(
                                r'Shutting down ReconnectingMetricsSink'),
                            re.escape(r'Shutting down OutgoingBoundary'),
                            re.escape(r'Shutting down Startup...'),
                            re.escape(r'Shutting down DataChannel'),
                            re.escape(r'metrics connection closed'),
                            re.escape(r'TCPSink connection closed'),
                            re.escape(
                                r'ControlChannelConnectNotifier: server closed'
                            )
                        ])
                        patterns_l_per = []

                        left_checkers = []

                        # initializer STDOUT checker
                        left_checkers.append(
                            RunnerChecker(
                                initializer[0],
                                patterns_i,
                                timeout=join_timeout,
                                start_from=start_froms[initializer[0]]))

                        # remaining workers STDOUT checkers
                        for runner in remaining:
                            left_checkers.append(
                                RunnerChecker(runner,
                                              patterns_r + [
                                                  p.format(runner.name)
                                                  for p in patterns_r_per
                                              ],
                                              timeout=join_timeout,
                                              start_from=start_froms[runner]))

                        # leaving workers STDOUT checkers
                        for runner in left:
                            left_checkers.append(
                                RunnerChecker(runner,
                                              patterns_l + [
                                                  p.format(runner.name)
                                                  for p in patterns_l_per
                                              ],
                                              timeout=join_timeout,
                                              start_from=start_froms[runner]))
                        for lc in left_checkers:
                            lc.start()

                        # Send the shrink command
                        send_shrink_cmd(host,
                                        external_port,
                                        names=[r.name for r in left])

                        # Wait for output checkers to join
                        for lc in left_checkers:
                            lc.join()
                            if lc.error:
                                outputs = runners_output_format(runners)
                                raise AutoscaleTimeoutError(
                                    "'{}' timed out on SHRINK in {} "
                                    "seconds. The cluster had the following outputs:\n===\n{}"
                                    .format(lc.runner_name, lc.timeout,
                                            outputs),
                                    as_error=lc.error,
                                    as_steps=steps)

                    else:  # Handle the 0 case as a noop
                        continue

                    start_froms = {r: r.tell() for r in runners}

                    # Validate autoscale via partition query
                    try:
                        partitions = query_partitions(host, external_port)
                        phase_validate_partitions(
                            runners,
                            partitions,
                            joined=[r.name for r in joined],
                            left=[r.name for r in left])
                    except Exception as err:
                        print(
                            'error validating {} have joined and {} have left'.
                            format([r.name for r in joined],
                                   [r.name for r in left]))
                        raise err

            # wait until sender completes (~10 seconds)
            sender.join(sender_timeout)
            if sender.error:
                raise sender.error
            if sender.is_alive():
                sender.stop()
                raise TimeoutError('Sender did not complete in the expected '
                                   'period')

            # Use Sink value to determine when to stop runners and sink
            stopper = SinkAwaitValue(sink, await_values, 30)
            stopper.start()
            stopper.join()
            if stopper.error:
                print('sink.data', len(sink.data))
                print('await_values', len(await_values))
                raise stopper.error

            # stop application workers
            for r in runners:
                r.stop()

            # Stop sink
            sink.stop()

            # Stop metrics
            metrics.stop()

            # validate output
            phase_validate_output(runners, sink, expected)

        finally:
            for r in runners:
                r.stop()
            # Wait on runners to finish waiting on their subprocesses to exit
            for r in runners:
                r.join(runner_join_timeout)
            alive = []
            for r in runners:
                if r.is_alive():
                    alive.append(r)
            for r in runners:
                ec = r.poll()
                if ec != 0:
                    print('Worker {!r} exited with return code {}'.format(
                        r.name, ec))
                    print('Its last 5 log lines were:')
                    print('\n'.join(r.get_output().splitlines()[-5:]))
                    print()
            if alive:
                alive_names = ', '.join((r.name for r in alive))
                outputs = runners_output_format(runners)
                for a in alive:
                    a.kill()
            clean_resilience_path(res_dir)
            if alive:
                raise PipelineTestError(
                    "Runners [{}] failed to exit cleanly after"
                    " {} seconds.\n"
                    "Runner outputs are attached below:"
                    "\n===\n{}".format(alive_names, runner_join_timeout,
                                       outputs))

    except Exception as err:
        if not hasattr(err, 'as_steps'):
            err.as_steps = steps
        raise err
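
A minimal sketch of how this helper might be driven by a test wrapper; the 'sequence_window' command below is a placeholder for whatever application binary is under test:

def test_autoscale_grow_and_shrink():
    # Hypothetical wrapper: grow by two workers, then shrink by two, twice.
    _autoscale_sequence('sequence_window', ops=[2, -2], cycles=2)
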
Example #2
def _test_log_rotation_external_trigger_recovery(command):
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = '/tmp/res-data'
    expect = 2000
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)
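    # For expect=2000 the length-prefixed payloads above decode to
    # '[1994,1996,1998,2000]' and '[1993,1995,1997,1999]' (presumably the
    # final windows emitted for the even and odd sequences)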

    setup_resilience_path(res_dir)

    command = '''{} \
        --log-rotation \
        --stop-pause {}
    '''.format(command, STOP_THE_WORLD_PAUSE)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        input_ports, control_port, external_port, data_port = (get_port_values(
            host, sources))
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs, metrics_port,
                      control_port, external_port, data_port, res_dir, workers)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host,
                        input_ports[0],
                        reader,
                        batch_size=100,
                        interval=0.05)
        sender.start()

        time.sleep(0.5)
        # Trigger log rotation with external message
        cmd_external_trigger = ('external_sender -e {}:{} -t rotate-log -m '
                                'worker1'.format(host, external_port))

        success, stdout, retcode, cmd = ex_validate(cmd_external_trigger)
        try:
            assert (success)
        except AssertionError:
            raise AssertionError('External rotation trigger failed with '
                                 'the error:\n{}'.format(stdout))

        # Check for log rotation
        log_rotated_checker = RunnerChecker(runners[1],
                                            log_rotated_patterns,
                                            timeout=AWAIT_TIMEOUT)
        log_rotated_checker.start()
        log_rotated_checker.join()
        if log_rotated_checker.error:
            raise log_rotated_checker.error

        # stop worker in a non-graceful fashion so that recovery files
        # aren't removed
        runners[-1].kill()

        ## restart worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # wait until sender completes (~1 second)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Use Sink value to determine when to stop runners and sink
        stopper = SinkAwaitValue(sink, await_values, AWAIT_TIMEOUT)
        stopper.start()
        stopper.join()
        if stopper.error:
            for r in runners:
                print r.name
                print r.get_output()[0]
                print '---'
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        print 'sink.data size: ', len(sink.data)

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        success, stdout, retcode, cmd = ex_validate(cmd_validate)
        try:
            assert (success)
        except AssertionError:
            print runners[0].name
            print runners[0].get_output()[0]
            print '---'
            print runners[1].name
            print runners[1].get_output()[0]
            print '---'
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(stdout))

        # Validate that worker1 underwent log rotation
        r = runners[1]
        stdout, stderr = r.get_output()
        try:
            assert (re.search(log_rotated_pattern, stdout, re.M | re.S)
                    is not None)
        except AssertionError:
            raise AssertionError('Worker %d.%r does not appear to have '
                                 'performed log rotation as expected.'
                                 ' The pattern %r '
                                 'is missing from the Worker output '
                                 'included below.\nSTDOUT\n---\n%s\n'
                                 '---\n' %
                                 (1, r.name, log_rotated_pattern, stdout))
        # Validate worker actually underwent recovery
        pattern = "RESILIENCE\: Replayed \d+ entries from recovery log file\."
        stdout, stderr = runners[-1].get_output()
        try:
            assert (re.search(pattern, stdout) is not None)
        except AssertionError:
            raise AssertionError(
                'Worker %d.%r does not appear to have '
                'performed recovery as expected. Worker '
                'output is '
                'included below.\nSTDOUT\n---\n%s\n---\n'
                'STDERR\n---\n%s' %
                (len(runners) - 1, runners[-1].name, stdout, stderr))
    finally:
        for r in runners:
            r.stop()
Example #3
def _test_log_rotation_file_size_trigger_recovery(command):
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    expect = 2000
    event_log_file_size = 50000
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    command = '''{} \
        --log-rotation \
        --stop-pause {}
    '''.format(command, STOP_THE_WORLD_PAUSE)
    alt_block = '--event-log-file-size {}'.format(event_log_file_size)
    alt_func = lambda x: x > 0

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        num_ports = sources + 3 * workers
        ports = get_port_values(num=num_ports, host=host)
        (input_ports, worker_ports) = (ports[:sources], [
            ports[sources:][i:i + 3] for i in range(0, len(ports[sources:]), 3)
        ])
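        # e.g. sources=1, workers=2: ports[0] is the input port and the
        # remaining 6 ports are grouped into one triple per worker
        # (presumably control, data, and external for each)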
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs, metrics_port,
                      res_dir, workers, worker_ports, alt_block, alt_func)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host,
                        input_ports[0],
                        reader,
                        batch_size=100,
                        interval=0.05)
        sender.start()

        # Wait for runner to complete a log rotation
        log_rotated_checker = RunnerChecker(runners[1],
                                            log_rotated_patterns,
                                            timeout=AWAIT_TIMEOUT)
        log_rotated_checker.start()
        log_rotated_checker.join()
        if log_rotated_checker.error:
            raise log_rotated_checker.error

        # stop the worker in a non-graceful fashion so it doesn't remove
        # recovery files
        runners[-1].kill()

        ## restart worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # wait until sender completes (~1 second)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Use Sink value to determine when to stop runners and sink
        stopper = SinkAwaitValue(sink, await_values, AWAIT_TIMEOUT)
        stopper.start()
        stopper.join()
        if stopper.error:
            for r in runners:
                print r.name
                print r.get_output()
                print '---'
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        print 'sink.data size: ', len(sink.data)

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        res = run_shell_cmd(cmd_validate)
        try:
            assert (res.success)
        except AssertionError:
            print runners[-1].name
            print runners[-1].get_output()
            print '---'
            print runners[-2].name
            print runners[-2].get_output()
            print '---'
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(res.output))

        # Validate the non-initializer worker underwent log rotation
        i, r = 1, runners[1]
        stdout = r.get_output()
        try:
            assert (re.search(log_rotated_pattern, stdout, re.M | re.S)
                    is not None)
        except AssertionError:
            raise AssertionError('Worker %d.%r does not appear to have '
                                 'performed log rotation as expected.'
                                 ' The pattern %r '
                                 'is missing from the Worker output '
                                 'included below.\nSTDOUT\n---\n%s\n'
                                 '---\n' %
                                 (i, r.name, log_rotated_pattern, stdout))

        # Validate worker actually underwent recovery
        pattern = "RESILIENCE\: Replayed \d+ entries from recovery log file\."
        stdout = runners[-1].get_output()
        try:
            assert (re.search(pattern, stdout) is not None)
        except AssertionError:
            raise AssertionError('Worker does not appear to have performed '
                                 'recovery as expected. Worker output is '
                                 'included below.\nSTDOUT\n---\n%s' % stdout)
    finally:
        for r in runners:
            r.stop()
        clean_resilience_path(res_dir)
Example #4
def _test_autoscale_grow(command):
    host = '127.0.0.1'
    sources = 1
    workers = 1
    res_dir = '/tmp/res-data'
    expect = 2000
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))

    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    patterns_i = [
        re.escape(r'***Worker worker1 attempting to join the '
                  r'cluster. Sent necessary information.***'),
        re.escape(r'Migrating partitions to worker1'),
        re.escape(r'--All new workers have acked migration '
                  r'batch complete'),
        re.escape(r'~~~Resuming message processing.~~~')
    ]
    patterns_w = [
        re.escape(r'***Successfully joined cluster!***'),
        re.escape(r'~~~Resuming message processing.~~~')
    ]

    setup_resilience_path(res_dir)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader1 = Reader(sequence_generator(expect - 1000))
        reader2 = Reader(sequence_generator(expect, 1000))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        input_ports, control_port, external_port, data_port = (get_port_values(
            host, sources))
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs, metrics_port,
                      control_port, external_port, data_port, res_dir, workers)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender1 (0,1000]
        sender1 = Sender(host,
                         input_ports[0],
                         reader1,
                         batch_size=10,
                         interval=0.05)
        sender1.start()

        # wait until sender1 completes (~5 seconds)
        sender1.join(30)
        if sender1.error:
            raise sender1.error
        if sender1.is_alive():
            sender1.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # create a new worker and have it join
        add_runner(runners, command, host, inputs, outputs, metrics_port,
                   control_port, external_port, data_port, res_dir, workers)

        # Wait for the initializer and the joining worker to complete the join
        join_checker_i = RunnerChecker(runners[0], patterns_i, timeout=30)
        join_checker_w = RunnerChecker(runners[1], patterns_w, timeout=30)
        join_checker_i.start()
        join_checker_w.start()
        join_checker_i.join()
        if join_checker_i.error:
            print('worker output:')
            print(runners[1].get_output()[0])
            raise join_checker_i.error
        join_checker_w.join()
        if join_checker_w.error:
            print('initializer output:')
            print(runners[0].get_output()[0])
            raise join_checker_w.error

        # Start sender2 (1000, 2000]
        sender2 = Sender(host,
                         input_ports[0],
                         reader2,
                         batch_size=10,
                         interval=0.05)
        sender2.start()

        # wait until sender2 completes (~5 seconds)
        sender2.join(30)
        if sender2.error:
            raise sender2.error
        if sender2.is_alive():
            sender2.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Use Sink value to determine when to stop runners and sink
        stopper = SinkAwaitValue(sink, await_values, 30)
        stopper.start()
        stopper.join()
        if stopper.error:
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        print 'sink.data size: ', len(sink.data)

        # Stop metrics
        metrics.stop()

        # parse metrics data and validate the cluster has grown from 1 to 2
        # workers
        mp = MetricsParser()
        mp.load_string_list(metrics.data)
        mp.parse()
        # Now confirm that there are computations in worker1's metrics
        app_key = mp.data.keys()[0]  # 'metrics:Sequence Window Printer'
        worker_metrics = [
            v for v in mp.data[app_key].get('worker1', []) if v[0] == 'metrics'
        ]
        # Verify there is at least one entry for a computation with a nonzero
        # total value
        print('worker_metrics', worker_metrics)
        filtered = filter(
            lambda v:
            (v[1]['metric_category'] == 'computation' and v[1]['total'] > 0),
            worker_metrics)
        print('filtered', filtered)
        assert (len(filtered) > 0)

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        success, stdout, retcode, cmd = ex_validate(cmd_validate)
        try:
            assert (success)
        except AssertionError:
            print runners[-1].get_output()[0]
            print '---'
            print runners[-2].get_output()[0]
            print '---'
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(stdout))

    finally:
        for r in runners:
            r.stop()
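
Each of these helpers appears to be invoked by a thin test wrapper that supplies the command for the application under test; a minimal sketch, again using a hypothetical 'sequence_window' binary:

def test_log_rotation_external_trigger_recovery():
    _test_log_rotation_external_trigger_recovery('sequence_window')

def test_log_rotation_file_size_trigger_recovery():
    _test_log_rotation_file_size_trigger_recovery('sequence_window')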