def _run(persistent_data, res_ops, command, ops=None, initial=None,
         sources=1, partition_multiplier=1, validate_output=True,
         sender_mps=1000, sender_interval=0.01):
    host = '127.0.0.1'
    sinks = 1
    sink_mode = 'framed'
    batch_size = int(sender_mps * sender_interval)
    logging.debug("batch_size is {}".format(batch_size))

    # Use a None sentinel to avoid a shared mutable default argument
    if ops is None:
        ops = []
    if not isinstance(ops, (list, tuple)):
        raise TypeError("ops must be a list or tuple of operations")

    # If no initial workers value is given, determine the minimum number
    # required at the start so that the cluster never goes below 1 worker.
    # If a number is given, then verify it is sufficient.
    if ops:
        if isinstance(ops[0], Recover):
            raise ValueError("The first operation cannot be Recover")
        lowest = lowest_point(ops)
        if lowest < 1:
            min_workers = abs(lowest) + 1
        else:
            min_workers = 1
        if isinstance(initial, int):
            logging.debug('initial: {}'.format(initial))
            logging.debug('min: {}'.format(min_workers))
            assert(initial >= min_workers)
            workers = initial
        else:
            workers = min_workers
    else:  # Test is only for setup using initial workers
        assert(initial > 0)
        workers = initial

    logging.info("Initial cluster size: {}".format(workers))

    # NOTE: this overrides the partition_multiplier parameter above
    partition_multiplier = 5  # Used in partition count creation
    # create the sequence generator and the reader
    msg = MultiSequenceGenerator(base_parts=workers * partition_multiplier - 1)

    # Start cluster
    logging.debug("Creating cluster")
    with Cluster(command=command, host=host, sources=sources,
                 workers=workers, sinks=sinks, sink_mode=sink_mode,
                 persistent_data=persistent_data) as cluster:

        # start senders
        for s in range(sources):
            sender = Sender(cluster.source_addrs[0], Reader(msg),
                            batch_size=batch_size,
                            interval=sender_interval, reconnect=True)
            cluster.add_sender(sender, start=True)

        # let the senders send some data first
        time.sleep(1)

        # loop over ops, keeping the result and passing it to the next op
        res = None
        assert(not cluster.get_crashed_workers())
        for op in ops:
            res_ops.append(op)
            logging.info("Executing: {}".format(op))
            res = op.apply(cluster, res)
            assert(not cluster.get_crashed_workers())

        # Wait a full second for things to calm down
        time.sleep(1)

        # If using external senders, wait for them to stop cleanly
        if cluster.senders:
            # Tell the multi-sequence-sender to stop
            msg.stop()

            # wait for senders to reach the end of their readers and stop
            for s in cluster.senders:
                cluster.wait_for_sender(s)

            # Validate that all sender sequences caught up to the same
            # stop value
            stop_value = max(msg.seqs)
            t0 = time.time()
            while True:
                try:
                    assert(len(msg.seqs) == msg.seqs.count(stop_value))
                    break
                except AssertionError:
                    if time.time() - t0 > 2:
                        logging.error("msg.seqs aren't all equal: {}"
                                      .format(msg.seqs))
                        raise
                time.sleep(0.1)

            # Create await_values for the sink based on the stop values
            # from the multi sequence generator, e.g. for part 3 with
            # stop value 1000: (b'0000003', b'[997,998,999,1000]')
            await_values = []
            for part, val in enumerate(msg.seqs):
                key = '{:07d}'.format(part).encode()
                data = '[{},{},{},{}]'.format(
                    *[val - x for x in range(3, -1, -1)]).encode()
                await_values.append((key, data))
            cluster.sink_await(values=await_values, func=parse_sink_value)

        logging.info("Completion condition achieved. "
                     "Shutting down cluster.")

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        if validate_output:
            # TODO: move to validations.py
            out_file = os.path.join(cluster.res_dir, 'received.txt')
            cluster.sinks[0].save(out_file)

            # Validate captured output
            logging.info("Validating output")
            # if senders == 0, using internal source
            if cluster.senders:
                cmd_validate = ('validator -i {out_file} -e {expect} -a'
                                .format(out_file=out_file,
                                        expect=stop_value))
            else:
                cmd_validate = ('validator -i {out_file} -a'
                                .format(out_file=out_file))
            res = run_shell_cmd(cmd_validate)
            try:
                assert(res.success)
                logging.info("Validation successful")
            except AssertionError:
                raise AssertionError('Validation failed with the following '
                                     'error:\n{}'.format(res.output))

        # Validate that restarted workers actually underwent recovery
        if cluster.restarted_workers:
            # TODO: move to validations.py
            logging.info("Validating recovery")
            pattern = r"RESILIENCE: Replayed \d+ entries from recovery log file\."
            for r in cluster.restarted_workers:
                stdout = r.get_output()
                try:
                    assert(re.search(pattern, stdout) is not None)
                    logging.info("{} recovered successfully".format(r.name))
                except AssertionError:
                    raise AssertionError(
                        'Worker {} does not appear to have performed '
                        'recovery as expected.'.format(r.name))
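
# For reference, a minimal sketch of what the `lowest_point` helper used
# above is assumed to compute: the lowest cumulative worker-count delta
# reached while applying the ops in order, so that _run can size the
# initial cluster to never drop below 1 worker. The `size_delta()` method
# is a hypothetical name for illustration; the real ops API is defined
# elsewhere in the test harness.
def _lowest_point_sketch(ops):
    lowest = 0
    current = 0
    for op in ops:
        current += op.size_delta()  # hypothetical: e.g. +1 grow, -1 shrink
        lowest = min(lowest, current)
    return lowest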

def _test_log_rotation_external_trigger_no_recovery(command):
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    expect = 2000
    # e.g. for expect=2000: last_value_0 == '[1994,1996,1998,2000]'
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    # frame each expected value as a 4-byte big-endian length + payload
    await_values = (struct.pack('>I', len(last_value_0))
                    + last_value_0.encode(),
                    struct.pack('>I', len(last_value_1))
                    + last_value_1.encode())

    setup_resilience_path(res_dir)

    command = '''{} \
    --log-rotation \
    --stop-pause {}
    '''.format(command, STOP_THE_WORLD_PAUSE)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        num_ports = sources + 3 * workers
        ports = get_port_values(num=num_ports, host=host)
        (input_ports, worker_ports) = (
            ports[:sources],
            [ports[sources:][i:i + 3]
             for i in range(0, len(ports[sources:]), 3)])
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs,
                      metrics_port, res_dir, workers, worker_ports)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host, input_ports[0], reader, batch_size=100,
                        interval=0.05)
        sender.start()
        time.sleep(0.5)

        # Trigger log rotation with external message.
        # NOTE: external_port was left undefined in the original; this
        # assumes the third port in worker1's port group is its external
        # channel port.
        external_port = worker_ports[1][2]
        cmd_external_trigger = ('external_sender -e {}:{} -t rotate-log -m '
                                'worker1'.format(host, external_port))
        res = run_shell_cmd(cmd_external_trigger)
        try:
            assert(res.success)
        except AssertionError:
            raise AssertionError('External rotation trigger failed with '
                                 'the error:\n{}'.format(res.output))

        # wait until sender completes (~1 second)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Wait for the expected final values to arrive at the sink before
        # stopping runners and sink
        stopper = SinkAwaitValue(sink, await_values, AWAIT_TIMEOUT)
        stopper.start()
        stopper.join()
        if stopper.error:
            for r in runners:
                print(r.name)
                print(r.get_output())
                print('---')
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        print('sink.data size: {}'.format(len(sink.data)))

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'
                        .format(out_file=out_file, expect=expect))
        res = run_shell_cmd(cmd_validate)
        try:
            assert(res.success)
        except AssertionError:
            print(runners[0].name)
            print(runners[0].get_output())
            print('---')
            print(runners[1].name)
            print(runners[1].get_output())
            print('---')
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(res.output))

        # Validate all workers underwent log rotation
        for r in runners[1:]:
            stdout = r.get_output()
            try:
                assert(re.search(log_rotated_pattern, stdout,
                                 re.M | re.S) is not None)
            except AssertionError:
                raise AssertionError('Worker %r does not appear to have '
                                     'performed log rotation as expected. '
                                     'The pattern %r is missing from the '
                                     'Worker output included below.'
                                     '\nSTDOUT\n---\n%s\n---\n'
                                     % (r.name, log_rotated_pattern, stdout))
    finally:
        for r in runners:
            r.stop()
        clean_resilience_path(res_dir)
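
# The await_values built above are framed records: a 4-byte big-endian
# length header followed by the payload, which appears to match the
# 'giles' sink file format used by sink.save. A minimal decoder sketch
# for such a byte stream, included for illustration only:
def _unframe_sketch(buf):
    """Split 4-byte big-endian length-prefixed records out of a buffer."""
    payloads = []
    offset = 0
    while offset + 4 <= len(buf):
        (length,) = struct.unpack_from('>I', buf, offset)
        offset += 4
        payloads.append(buf[offset:offset + length])
        offset += length
    return payloads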

def _test_log_rotation_file_size_trigger_recovery(command):
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    expect = 2000
    event_log_file_size = 50000
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    # frame each expected value as a 4-byte big-endian length + payload
    await_values = (struct.pack('>I', len(last_value_0))
                    + last_value_0.encode(),
                    struct.pack('>I', len(last_value_1))
                    + last_value_1.encode())

    setup_resilience_path(res_dir)

    command = '''{} \
    --log-rotation \
    --stop-pause {}
    '''.format(command, STOP_THE_WORLD_PAUSE)

    # Only apply the event log file size limit to workers selected by
    # alt_func (every worker except initializer 0)
    alt_block = '--event-log-file-size {}'.format(event_log_file_size)
    alt_func = lambda x: x > 0

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        num_ports = sources + 3 * workers
        ports = get_port_values(num=num_ports, host=host)
        (input_ports, worker_ports) = (
            ports[:sources],
            [ports[sources:][i:i + 3]
             for i in range(0, len(ports[sources:]), 3)])
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs,
                      metrics_port, res_dir, workers, worker_ports,
                      alt_block, alt_func)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host, input_ports[0], reader, batch_size=100,
                        interval=0.05)
        sender.start()

        # Wait for runner to complete a log rotation
        log_rotated_checker = RunnerChecker(runners[1], log_rotated_patterns,
                                            timeout=AWAIT_TIMEOUT)
        log_rotated_checker.start()
        log_rotated_checker.join()
        if log_rotated_checker.error:
            raise log_rotated_checker.error

        # stop the worker in a non-graceful fashion so it doesn't remove
        # recovery files
        runners[-1].kill()

        ## restart worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # wait until sender completes (~1 second)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Wait for the expected final values to arrive at the sink before
        # stopping runners and sink
        stopper = SinkAwaitValue(sink, await_values, AWAIT_TIMEOUT)
        stopper.start()
        stopper.join()
        if stopper.error:
            for r in runners:
                print(r.name)
                print(r.get_output())
                print('---')
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        print('sink.data size: {}'.format(len(sink.data)))

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'
                        .format(out_file=out_file, expect=expect))
        res = run_shell_cmd(cmd_validate)
        try:
            assert(res.success)
        except AssertionError:
            print(runners[-1].name)
            print(runners[-1].get_output())
            print('---')
            print(runners[-2].name)
            print(runners[-2].get_output())
            print('---')
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(res.output))

        # Validate worker underwent log rotation, but not initializer
        i, r = 1, runners[1]
        stdout = r.get_output()
        try:
            assert(re.search(log_rotated_pattern, stdout,
                             re.M | re.S) is not None)
        except AssertionError:
            raise AssertionError('Worker %d.%r does not appear to have '
                                 'performed log rotation as expected. '
                                 'The pattern %r is missing from the '
                                 'Worker output included below.'
                                 '\nSTDOUT\n---\n%s\n---\n'
                                 % (i, r.name, log_rotated_pattern, stdout))

        # Validate the restarted worker actually underwent recovery
        pattern = r"RESILIENCE: Replayed \d+ entries from recovery log file\."
        stdout = runners[-1].get_output()
        try:
            assert(re.search(pattern, stdout) is not None)
        except AssertionError:
            raise AssertionError('Worker does not appear to have performed '
                                 'recovery as expected. Worker output is '
                                 'included below.\nSTDOUT\n---\n%s' % stdout)
    finally:
        for r in runners:
            r.stop()
        clean_resilience_path(res_dir)
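
# The alt_block/alt_func pair passed to start_runners above appears to
# attach the --event-log-file-size flag only to workers whose index
# satisfies alt_func (here: every worker except initializer 0), which is
# why only runners[1] is expected to rotate its log. A sketch of that
# assumed dispatch; start_runners' real internals are defined elsewhere
# in the test harness.
def _apply_alt_block_sketch(base_command, worker_index, alt_block=None,
                            alt_func=lambda i: False):
    if alt_block and alt_func(worker_index):
        return '{} {}'.format(base_command, alt_block)
    return base_command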

def _run(command, runner_data=None):
    host = '127.0.0.1'
    sources = 1
    sinks = 1
    sink_mode = 'framed'
    workers = 2
    expect = 2000
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    # frame each expected value as a 4-byte big-endian length + payload
    await_values = (struct.pack('>I', len(last_value_0))
                    + last_value_0.encode(),
                    struct.pack('>I', len(last_value_1))
                    + last_value_1.encode())

    # Use a None sentinel to avoid a shared mutable default argument;
    # callers may pass a list to collect per-runner data for debugging
    if runner_data is None:
        runner_data = []

    # Start cluster
    with Cluster(command=command, host=host, sources=sources,
                 workers=workers, sinks=sinks, sink_mode=sink_mode,
                 runner_data=runner_data) as cluster:

        # Create sender
        logging.debug("Creating sender")
        sender = Sender(cluster.source_addrs[0],
                        Reader(sequence_generator(expect)),
                        batch_size=100, interval=0.05, reconnect=True)
        cluster.add_sender(sender, start=True)

        # wait for some data to go through the system
        time.sleep(0.2)

        # stop worker in a non-graceful fashion so that recovery files
        # aren't removed
        logging.debug("Killing worker")
        killed = cluster.kill_worker(worker=-1)

        ## restart worker
        logging.debug("Restarting worker")
        cluster.restart_worker(killed)

        # wait until sender completes (~1 second)
        logging.debug("Waiting for sender to complete")
        cluster.wait_for_sender()

        # Wait for the last sent value expected at the worker
        logging.debug("Waiting for sink to complete")
        cluster.sink_await(await_values)

        # stop the cluster
        logging.debug("Stopping cluster")
        cluster.stop_cluster()

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(cluster.res_dir, 'received.txt')
        cluster.sinks[0].save(out_file, mode='giles')

        # Validate captured output
        logging.debug("Validating output")
        cmd_validate = ('validator -i {out_file} -e {expect} -a'
                        .format(out_file=out_file, expect=expect))
        res = run_shell_cmd(cmd_validate)
        try:
            assert(res.success)
        except AssertionError:
            raise AssertionError('Output validation failed with the '
                                 'following error:\n{}'.format(res.output))

        # Validate worker actually underwent recovery
        logging.debug("Validating recovery from worker stdout")
        pattern = r"RESILIENCE: Replayed \d+ entries from recovery log file\."
        try:
            assert(re.search(pattern,
                             cluster.runners[-1].get_output()) is not None)
        except AssertionError:
            raise AssertionError("Worker does not appear to have performed "
                                 "recovery as expected.")
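
# A hypothetical caller for the recovery helper above, showing the
# intended use of the runner_data parameter. The command string is an
# assumption for illustration, not part of this module.
def _example_recovery_test():
    runner_data = []
    try:
        _run(command='machida --application-module sequence',
             runner_data=runner_data)
    except Exception:
        # Dump any collected runner data to aid post-mortem debugging
        for rd in runner_data:
            logging.error(rd)
        raise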