def test_shutdown_host():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile('^node-[0-9]+-server$'))
    assert len(candidate_tasks) != 0, 'Could not find a node to shut down'
    # Cassandra nodes should never share a machine
    assert len(candidate_tasks) == len(set([task.host for task in candidate_tasks])), \
        'Expected candidate tasks to all be on different hosts: {}'.format(candidate_tasks)
    # Just pick the first one from the list
    replace_task = candidate_tasks[0]

    replace_pod_name = replace_task.name[:-len('-server')]

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_task.host)

    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    'pod replace {}'.format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # Find the new version of the task. Note that the old one may still be present/'running' as
    # Mesos might not have acknowledged the agent's death.
    new_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == replace_task.name and task.id != replace_task.id
    ][0]
    log.info('Checking that the original pod has moved to a new agent:\n'
             'old={}\nnew={}'.format(replace_task, new_task))
    assert replace_task.agent != new_task.agent

def test_shutdown_host():
    replace_task = sdk_tasks.get_task_avoiding_scheduler(
        config.SERVICE_NAME, re.compile('^(hello|world)-[0-9]+-server$'))
    assert replace_task is not None, 'Could not find a node to shut down'

    replace_pod_name = replace_task.name[:-len('-server')]

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_task.host)

    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    'pod replace {}'.format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # Find the new version of the task. Note that the old one may still be present/'running' as
    # Mesos might not have acknowledged the agent's death.
    new_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == replace_task.name and task.id != replace_task.id
    ][0]
    log.info('Checking that the original pod has moved to a new agent:\n'
             'old={}\nnew={}'.format(replace_task, new_task))
    assert replace_task.agent != new_task.agent

def test_shutdown_host():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile("^(hello|world)-[0-9]+-server$")
    )
    assert len(candidate_tasks) != 0, "Could not find a node to shut down"

    # Pick the host of the first task from the above list, then get ALL tasks which may be located
    # on that host. We'll need to 'pod replace' all of them.
    replace_hostname = candidate_tasks[0].host
    replace_tasks = [task for task in candidate_tasks if task.host == replace_hostname]
    log.info(
        "Tasks on host {} to be replaced after shutdown: {}".format(replace_hostname, replace_tasks)
    )

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_agents.shutdown_agent(replace_hostname)
    # Reserved resources on this agent are expected to appear as orphaned in Mesos state.
    # Tell our uninstall validation to ignore orphaned resources coming from this agent.
    sdk_install.ignore_dead_agent(replace_hostname)

    # Get pod name from task name: "hello-0-server" => "hello-0"
    replace_pods = set([task.name[: -len("-server")] for task in replace_tasks])
    assert len(replace_pods) == len(
        replace_tasks
    ), "Expected one task per pod in tasks to replace: {}".format(replace_tasks)

    for pod_name in replace_pods:
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod replace {}".format(pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # For each task affected by the shutdown, find the new version of it, and check that it moved.
    # Note that the old version on the dead agent may still be present/'running' as
    # Mesos might not have fully acknowledged the agent's death.
    new_tasks = sdk_tasks.get_summary()
    for replaced_task in replace_tasks:
        new_task = [
            task
            for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id
        ][0]
        log.info(
            "Checking affected task has moved to a new agent:\n"
            "old={}\nnew={}".format(replaced_task, new_task)
        )
        assert replaced_task.agent_id != new_task.agent_id

def test_auto_replace_on_drain():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        service_name, re.compile("^(master|data|coordinator)-[0-9]+-node$")
    )

    log.info("Candidate tasks: {}".format(candidate_tasks))
    assert len(candidate_tasks) != 0, "Could not find a node to drain"

    # Pick the host of the first task from the above list
    replace_agent_id = candidate_tasks[0].agent_id
    replace_tasks = [task for task in candidate_tasks if task.agent_id == replace_agent_id]
    log.info(
        "Tasks on agent {} to be replaced after drain: {}".format(replace_agent_id, replace_tasks)
    )
    sdk_agents.drain_agent(replace_agent_id)

    sdk_plan.wait_for_kicked_off_recovery(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)

    new_tasks = sdk_tasks.get_summary()

    for replaced_task in replace_tasks:
        new_task = [
            task
            for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id
        ][0]
        log.info(
            "Checking affected task has moved to a new agent:\n"
            "old={}\nnew={}".format(replaced_task, new_task)
        )
        assert replaced_task.agent_id != new_task.agent_id

    # Reactivate the drained agent, otherwise uninstall plans will be halted for portworx
    sdk_agents.reactivate_agent(replace_agent_id)

def test_auto_replace_on_decommission():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile("^(hello|world)-[0-9]+-server$")
    )
    assert len(candidate_tasks) != 0, "Could not find a node to decommission"

    # Pick the host of the first task from the above list
    replace_agent_id = candidate_tasks[0].agent_id
    replace_tasks = [task for task in candidate_tasks if task.agent_id == replace_agent_id]
    log.info(
        "Tasks on agent {} to be replaced after decommission: {}".format(
            replace_agent_id, replace_tasks
        )
    )
    sdk_agents.decommission_agent(replace_agent_id)

    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)

    new_tasks = sdk_tasks.get_summary()

    for replaced_task in replace_tasks:
        new_task = [
            task
            for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id
        ][0]
        log.info(
            "Checking affected task has moved to a new agent:\n"
            "old={}\nnew={}".format(replaced_task, new_task)
        )
        assert replaced_task.agent_id != new_task.agent_id

def get_task_host(task_name):
    _, out, _ = sdk_cmd.run_cli("task {} --json".format(task_name))
    tasks_json = json.loads(out)
    matching_tasks = list(filter(lambda t: t["name"] == task_name, tasks_json))
    assert len(matching_tasks) == 1, "Duplicate tasks found with same name : [{}]".format(tasks_json)
    task_info = matching_tasks.pop()

    host = None
    for label in task_info["labels"]:
        if label["key"] == "offer_hostname":
            host = label["value"]
            break
    if host is None:
        raise Exception("offer_hostname label is not present!: {}".format(task_info))

    # Validation: Check that label matches summary returned by CLI
    for task in sdk_tasks.get_summary():
        if task.name == task_name:
            if task.host == host:
                # OK!
                return host
            else:
                # CLI's hostname doesn't match the TaskInfo labels. Bug!
                raise Exception(
                    "offer_hostname label [{}] doesn't match CLI output [{}]\nTask:\n{}".format(
                        host, task.host, task_info
                    )
                )
    # Unable to find desired task in CLI!
    raise Exception("Unable to find task named {} in CLI".format(task_name))

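# A small usage sketch (hypothetical, not from the source; names are reused from
# other snippets in this section): the host returned by get_task_host() is the
# value that test_node_replace_replaces_node() feeds into a Marathon UNLIKE
# placement constraint so the replacement pod lands on a different machine.
current_host = get_task_host('node-2-server')
placement_constraint = '[["hostname", "UNLIKE", "{}"]]'.format(current_host)
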
def get_completed_task_id(task_name):
    task_ids = [t.id for t in sdk_tasks.get_summary(with_completed=True, task_name=task_name)]
    # Mesos returns newest task first:
    return task_ids[0] if task_ids else None

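# A minimal sketch (hypothetical helper, not from the source): since Mesos
# returns the newest task first, get_completed_task_id() can be paired with a
# polling loop to detect that a task was relaunched under a new id after a
# restart or replace. Helper name and timeout are assumptions for illustration.
import time

def wait_for_task_relaunch(task_name, old_task_id, timeout_seconds=600):
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        current_id = get_completed_task_id(task_name)
        if current_id is not None and current_id != old_task_id:
            return current_id
        time.sleep(5)
    raise Exception('Task {} was not relaunched within {}s'.format(task_name, timeout_seconds))
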
def test_node_replace_replaces_node():
    replace_task = [
        task for task in sdk_tasks.get_summary() if task.name == 'node-2-server'
    ][0]
    log.info('avoid host for task {}'.format(replace_task))

    replace_pod_name = replace_task.name[:-len('-server')]

    # Update the placement constraints so the new node doesn't end up on the same host
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    original_constraint = marathon_config['env']['PLACEMENT_CONSTRAINT']
    try:
        marathon_config['env']['PLACEMENT_CONSTRAINT'] = '[["hostname", "UNLIKE", "{}"]]'.format(
            replace_task.host)
        sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

        # start replace and wait for it to finish
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                        'pod replace {}'.format(replace_pod_name))
        sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
        sdk_plan.wait_for_completed_recovery(
            config.SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)
    finally:
        # revert to prior placement setting before proceeding with tests: avoid getting stuck.
        marathon_config['env']['PLACEMENT_CONSTRAINT'] = original_constraint
        sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)
        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

def get_task_host(task_name):
    out = sdk_cmd.run_cli('task {} --json'.format(task_name))
    task_info = json.loads(out)[0]

    host = None
    for label in task_info['labels']:
        if label['key'] == 'offer_hostname':
            host = label['value']
            break
    if host is None:
        raise Exception("offer_hostname label is not present!: {}".format(task_info))

    # Validation: Check that label matches summary returned by CLI
    for task in sdk_tasks.get_summary():
        if task.name == task_name:
            if task.host == host:
                # OK!
                return host
            else:
                # CLI's hostname doesn't match the TaskInfo labels. Bug!
                raise Exception(
                    "offer_hostname label {} doesn't match CLI output!\nTask:\n{}".format(
                        host, task_info))
    # Unable to find desired task in CLI!
    raise Exception("Unable to find task named {} in CLI".format(task_name))

def handle_test_setup(item: pytest.Item):
    '''Does some initialization at the start of a test.
    This should be called in a pytest_runtest_setup() hook.
    See also handle_failed_test() which must be called from a
    pytest_runtest_makereport() hookimpl hook.'''

    # Check if we're entering a new test suite.
    global _testlogs_test_index
    global _testlogs_current_test_suite
    test_suite = get_test_suite_name(item)
    if test_suite != _testlogs_current_test_suite:
        # New test suite:
        # 1 Store all the task ids which already exist as of this point.
        _testlogs_current_test_suite = test_suite
        global _testlogs_ignored_task_ids
        _testlogs_ignored_task_ids = _testlogs_ignored_task_ids.union([
            task.id for task in sdk_tasks.get_summary(with_completed=True)])
        log.info('Entering new test suite {}: {} preexisting tasks will be ignored on test failure.'.format(
            test_suite, len(_testlogs_ignored_task_ids)))
        # 2 Reset the test index.
        _testlogs_test_index = 0
        # 3 Remove any prior logs for the test suite.
        test_log_dir = _test_suite_artifact_directory(item)
        if os.path.exists(test_log_dir):
            log.info('Deleting existing test suite logs: {}/'.format(test_log_dir))
            shutil.rmtree(test_log_dir)

    # Increment the test index (to 1, if this is a new suite)
    _testlogs_test_index += 1

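# A conftest.py wiring sketch (an assumption based on the docstring above; the
# sdk_diag module name is assumed): delegate the per-test setup hook to the
# helper so suite tracking happens before every test.
import sdk_diag

def pytest_runtest_setup(item):
    sdk_diag.handle_test_setup(item)
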
def test_node_replace_replaces_node():
    replace_task = [
        task for task in sdk_tasks.get_summary() if task.name == "node-2-server"
    ][0]
    log.info("avoid host for task {}".format(replace_task))

    replace_pod_name = replace_task.name[:-len("-server")]

    # Update the placement constraints so the new node doesn't end up on the same host
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    original_constraint = marathon_config["env"]["PLACEMENT_CONSTRAINT"]
    try:
        marathon_config["env"]["PLACEMENT_CONSTRAINT"] = '[["hostname", "UNLIKE", "{}"]]'.format(
            replace_task.host)
        sdk_marathon.update_app(marathon_config)

        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

        # start replace and wait for it to finish
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                        "pod replace {}".format(replace_pod_name))
        sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
        sdk_plan.wait_for_completed_recovery(
            config.SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)
    finally:
        # revert to prior placement setting before proceeding with tests: avoid getting stuck.
        marathon_config["env"]["PLACEMENT_CONSTRAINT"] = original_constraint
        sdk_marathon.update_app(marathon_config)
        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

def handle_test_report(item: pytest.Item, result):  # _pytest.runner.TestReport
    '''Collects information from the cluster following a failed test.
    This should be called in a hookimpl fixture.
    See also handle_test_setup() which must be called in a pytest_runtest_setup() hook.'''
    if not result.failed:
        return  # passed, nothing to do

    # Fetch all state from all currently-installed services.
    # We do this retrieval first in order to be closer to the actual test failure.
    # Services may still be installed when e.g. we're still in the middle of a test suite.
    service_names = list(
        filter(
            lambda name: name != sdk_package_registry.PACKAGE_REGISTRY_SERVICE_NAME,
            sdk_install.get_installed_service_names()))
    if len(service_names) > 0:
        log.info('Fetching plans for {} services that are currently installed: {}'.format(
            len(service_names), ', '.join(service_names)))
        for service_name in service_names:
            try:
                _dump_scheduler(item, service_name)
            except Exception:
                log.exception('Plan collection from service {} failed!'.format(service_name))

    # Fetch all logs from tasks created since the last failure, or since the start of the suite.
    global _testlogs_ignored_task_ids
    new_task_ids = [
        task.id for task in sdk_tasks.get_summary(with_completed=True)
        if task.id not in _testlogs_ignored_task_ids
    ]
    _testlogs_ignored_task_ids = _testlogs_ignored_task_ids.union(new_task_ids)

    # Enforce limit on how many tasks we will fetch logs from, to avoid unbounded log fetching.
    if len(new_task_ids) > _testlogs_task_id_limit:
        log.warning('Truncating list of {} new tasks to size {} to avoid fetching logs forever: {}'.format(
            len(new_task_ids), _testlogs_task_id_limit, new_task_ids))
        del new_task_ids[_testlogs_task_id_limit:]

    try:
        log.info('Fetching logs for {} tasks launched in this suite since last failure: {}'.format(
            len(new_task_ids), ', '.join(new_task_ids)))
        _dump_task_logs(item, new_task_ids)
    except Exception:
        log.exception('Task log collection failed!')

    try:
        log.info('Fetching mesos state:')
        _dump_mesos_state(item)
    except Exception:
        log.exception('Mesos state collection failed!')

    try:
        log.info('Creating/fetching cluster diagnostics bundle:')
        _dump_diagnostics_bundle(item)
    except Exception:
        log.exception('Diagnostics bundle creation failed')

    log.info('Post-failure collection complete')

def test_launch_task_with_multiple_ports():
    sdk_install.install(
        config.PACKAGE_NAME,
        config.SERVICE_NAME,
        0,
        additional_options={"service": {"yaml": "multiport"}},
    )
    assert sdk_tasks.get_summary(
        with_completed=True, task_name="multiport-0-server"
    ), "Unable to find matching completed task"

def handle_test_report(item: pytest.Item, result):  # _pytest.runner.TestReport
    '''Collects information from the cluster following a failed test.
    This should be called in a hookimpl fixture.
    See also handle_test_setup() which must be called in a pytest_runtest_setup() hook.'''
    if not result.failed:
        return  # passed, nothing to do

    # Fetch all plans from all currently-installed services.
    # We do this retrieval first in order to be closer to the actual test failure.
    # Services may still be installed when e.g. we're still in the middle of a test suite.
    service_names = sdk_install.get_installed_service_names()
    if len(service_names) > 0:
        log.info('Fetching plans for {} services that are currently installed: {}'.format(
            len(service_names), ', '.join(service_names)))
        for service_name in service_names:
            try:
                _dump_plans(item, service_name)
            except Exception:
                log.exception('Plan collection from service {} failed!'.format(service_name))

    # Fetch all logs from tasks created since the last failure, or since the start of the suite.
    global _testlogs_ignored_task_ids
    new_task_ids = [task.id for task in sdk_tasks.get_summary(with_completed=True)
                    if task.id not in _testlogs_ignored_task_ids]
    _testlogs_ignored_task_ids = _testlogs_ignored_task_ids.union(new_task_ids)

    # Enforce limit on how many tasks we will fetch logs from, to avoid unbounded log fetching.
    if len(new_task_ids) > _testlogs_task_id_limit:
        log.warning('Truncating list of {} new tasks to size {} to avoid fetching logs forever: {}'.format(
            len(new_task_ids), _testlogs_task_id_limit, new_task_ids))
        del new_task_ids[_testlogs_task_id_limit:]

    try:
        log.info('Fetching logs for {} tasks launched in this suite since last failure: {}'.format(
            len(new_task_ids), ', '.join(new_task_ids)))
        _dump_task_logs(item, new_task_ids)
    except Exception:
        log.exception('Task log collection failed!')

    try:
        log.info('Fetching mesos state:')
        _dump_mesos_state(item)
    except Exception:
        log.exception('Mesos state collection failed!')

    try:
        log.info('Creating/fetching cluster diagnostics bundle:')
        _dump_diagnostics_bundle(item)
    except Exception:
        log.exception('Diagnostics bundle creation failed')

    log.info('Post-failure collection complete')

# The 'outcome = yield' pattern below only works when this hook is registered
# as a hookwrapper, so the standard pytest decorator is required here:
@pytest.hookimpl(tryfirst=True, hookwrapper=True)
def pytest_runtest_makereport(item, call):
    '''Hook to run after every test, before any other post-test hooks.
    See also:
    https://docs.pytest.org/en/latest/example/simple.html#making-test-result-information-available-in-fixtures
    '''
    # Execute all other hooks to obtain the report object, then a report attribute for each phase of
    # a call, which can be "setup", "call", "teardown".
    # Subsequent fixtures can get the reports off of the request object like: `request.rep_setup.failed`.
    outcome = yield
    rep = outcome.get_result()
    setattr(item, "rep_" + rep.when, rep)

    # Handle failures. Must be done here and not in a fixture in order to
    # properly handle post-yield fixture teardown failures.
    if rep.failed:
        # Fetch all logs from tasks created since the last failure, or since the start of the suite.
        global testlogs_ignored_task_ids
        new_task_ids = [task.id for task in sdk_tasks.get_summary(with_completed=True)
                        if task.id not in testlogs_ignored_task_ids]
        testlogs_ignored_task_ids = testlogs_ignored_task_ids.union(new_task_ids)

        # Enforce limit on how many tasks we will fetch logs from, to avoid unbounded log fetching.
        if len(new_task_ids) > testlogs_task_id_limit:
            log.warning('Truncating list of {} new tasks to size {} to avoid fetching logs forever: {}'.format(
                len(new_task_ids), testlogs_task_id_limit, new_task_ids))
            del new_task_ids[testlogs_task_id_limit:]

        log.info('Test {} failed in {} phase.'.format(item.name, rep.when))

        try:
            log.info('Fetching logs for {} tasks launched in this suite since last failure: {}'.format(
                len(new_task_ids), new_task_ids))
            dump_task_logs(item, new_task_ids)
        except Exception:
            log.exception('Task log collection failed!')

        try:
            log.info('Fetching mesos state')
            dump_mesos_state(item)
        except Exception:
            log.exception('Mesos state collection failed!')

        try:
            log.info('Creating/fetching cluster diagnostics bundle')
            get_diagnostics_bundle(item)
        except Exception:
            log.exception("Diagnostics bundle creation failed")

        log.info('Post-failure collection complete')

def pytest_runtest_setup(item):
    '''Hook to run before every test.'''
    # Inject header at start of test, following automatic "path/to/test_file.py::test_name":
    # Don't do this when running in teamcity, where it's redundant.
    if not teamcity.is_running_under_teamcity():
        print('''
==========
======= START: {}::{}
=========='''.format(sdk_utils.get_test_suite_name(item), item.name))

    # Check if we're entering a new test suite.
    global testlogs_test_index
    global testlogs_current_test_suite
    test_suite = sdk_utils.get_test_suite_name(item)
    if test_suite != testlogs_current_test_suite:
        # New test suite:
        # 1 Store all the task ids which already exist as of this point.
        testlogs_current_test_suite = test_suite
        global testlogs_ignored_task_ids
        testlogs_ignored_task_ids = testlogs_ignored_task_ids.union([
            task.id for task in sdk_tasks.get_summary(with_completed=True)])
        log.info('Entering new test suite {}: {} preexisting tasks will be ignored on test failure.'.format(
            test_suite, len(testlogs_ignored_task_ids)))
        # 2 Reset the test index.
        testlogs_test_index = 0
        # 3 Remove any prior logs for the test suite.
        test_log_dir = sdk_utils.get_test_suite_log_directory(item)
        if os.path.exists(test_log_dir):
            log.info('Deleting existing test suite logs: {}/'.format(test_log_dir))
            shutil.rmtree(test_log_dir)

    # Increment the test index (to 1, if this is a new suite), and pass the value to sdk_utils for use internally.
    testlogs_test_index += 1
    sdk_utils.set_test_index(testlogs_test_index)

    min_version_mark = item.get_marker('dcos_min_version')
    if min_version_mark:
        min_version = min_version_mark.args[0]
        message = 'Feature only supported in DC/OS {} and up'.format(min_version)
        if 'reason' in min_version_mark.kwargs:
            message += ': {}'.format(min_version_mark.kwargs['reason'])
        if sdk_utils.dcos_version_less_than(min_version):
            pytest.skip(message)

def test_node_replace_replaces_node():
    replace_task = [
        task for task in sdk_tasks.get_summary() if task.name == 'node-2-server'
    ][0]
    log.info('avoid host for task {}'.format(replace_task))

    replace_pod_name = replace_task.name[:-len('-server')]

    # Update the placement constraints so the new node doesn't end up on the same host
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    marathon_config['env']['PLACEMENT_CONSTRAINT'] = '[["hostname", "UNLIKE", "{}"]]'.format(
        replace_task.host)
    sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

    # start replace and wait for it to finish
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    'pod replace {}'.format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(
        config.SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)

def get_scheduler_task_id(service_name: str) -> Optional[str]:
    # Returns the scheduler task's id, or None if no matching task was found.
    for task in sdk_tasks.get_summary():
        if task.name == service_name:
            return task.id
    return None

def handle_test_report(item: pytest.Item, result: runner.TestReport) -> None:
    """Collects information from the cluster following a failed test.
    This should be called in a hookimpl fixture.
    See also handle_test_setup() which must be called in a pytest_runtest_setup() hook."""
    if not result.failed or os.environ.get('DISABLE_DIAG'):
        return  # passed, nothing to do, or diagnostics collection disabled

    # Fetch all state from all currently-installed services.
    # We do this retrieval first in order to be closer to the actual test failure.
    # Services may still be installed when e.g. we're still in the middle of a test suite.
    service_names = list(
        filter(
            lambda name: name != sdk_package_registry.PACKAGE_REGISTRY_SERVICE_NAME,
            sdk_install.get_installed_service_names().union(_whitelisted_service_names(item)),
        )
    )
    if len(service_names) > 0:
        log.info(
            "Fetching plans for {} services that are currently installed: {}".format(
                len(service_names), ", ".join(service_names)
            )
        )
        for service_name in service_names:
            try:
                # Skip thread retrieval if plan retrieval fails:
                _dump_plans(item, service_name)
                _dump_threads(item, service_name)
            except Exception:
                log.exception("Plan/thread collection from service {} failed!".format(service_name))

    # Fetch all logs from tasks created since the last failure, or since the start of the suite.
    global _testlogs_ignored_task_ids
    new_task_ids = [
        task.id
        for task in sdk_tasks.get_summary(with_completed=True)
        if task.id not in _testlogs_ignored_task_ids
    ]
    _testlogs_ignored_task_ids = _testlogs_ignored_task_ids.union(new_task_ids)

    # Enforce limit on how many tasks we will fetch logs from, to avoid unbounded log fetching.
    if len(new_task_ids) > _testlogs_task_id_limit:
        log.warning(
            "Truncating list of {} new tasks to size {} to avoid fetching logs forever: {}".format(
                len(new_task_ids), _testlogs_task_id_limit, new_task_ids
            )
        )
        del new_task_ids[_testlogs_task_id_limit:]

    try:
        log.info(
            "Fetching logs for {} tasks launched in this suite since last failure: {}".format(
                len(new_task_ids), ", ".join(new_task_ids)
            )
        )
        _dump_task_logs(item, new_task_ids)
    except Exception:
        log.exception("Task log collection failed!")

    try:
        log.info("Fetching mesos state:")
        _dump_mesos_state(item)
    except Exception:
        log.exception("Mesos state collection failed!")

    try:
        log.info("Creating/fetching cluster diagnostics bundle:")
        _dump_diagnostics_bundle(item)
    except Exception:
        log.exception("Diagnostics bundle creation failed")

    log.info("Post-failure collection complete")

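# Companion conftest.py sketch (an assumption based on the docstring above; the
# sdk_diag module name is assumed): register a hookwrapper that hands each
# phase's report to handle_test_report() after all other hooks have produced it.
import pytest
import sdk_diag

@pytest.hookimpl(tryfirst=True, hookwrapper=True)
def pytest_runtest_makereport(item, call):
    outcome = yield
    sdk_diag.handle_test_report(item, outcome.get_result())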