def testGetIsolatedDataForStepNotOnlyFailure(self, mock_fn):
  """Isolated data for every task of the step is returned, not just failures."""
  mock_fn.return_value = [
      SwarmingTaskData(item) for item in _SAMPLE_BUILD_STEP_DATA
  ]

  data = swarming.GetIsolatedDataForStep(
      'm', 'b', 223, 'unit_tests', None, only_failure=False)

  isolated_server = waterfall_config.GetSwarmingSettings().get(
      'isolated_server')
  expected_data = [{
      'digest': digest,
      'namespace': 'default-gzip',
      'isolatedserver': isolated_server,
  } for digest in ('isolatedhashunittests', 'isolatedhashunittests1')]
  self.assertEqual(sorted(expected_data), sorted(data))
def testGetIsolatedDataForStepNotOnlyFailure(self):
  """Isolated data for all tasks is returned when only_failure is False."""
  master_name = 'm'
  builder_name = 'b'
  build_number = 223
  step_name = 'unit_tests'
  self.http_client._SetResponseForGetRequestSwarmingList(
      master_name, builder_name, build_number, step_name)

  data = swarming_util.GetIsolatedDataForStep(
      master_name, builder_name, build_number, step_name, self.http_client,
      only_failure=False)

  isolated_server = waterfall_config.GetSwarmingSettings().get(
      'isolated_server')
  expected_data = [{
      'digest': digest,
      'namespace': 'default-gzip',
      'isolatedserver': isolated_server,
  } for digest in ('isolatedhashunittests', 'isolatedhashunittests1')]
  self.assertEqual(sorted(expected_data), sorted(data))
def _BotsAvailableForTask(step_metadata):
  """Check if there are available bots for a swarming task's dimensions.

  Args:
    step_metadata (dict): Info about a step to determine the bot's dimensions
        to query Swarming with about bot availability.

  Returns:
    (bool): Whether or not there are enough bots available to trigger the task
        immediately.
  """
  if not step_metadata:
    return False

  min_available_bots = waterfall_config.GetSwarmingSettings().get(
      'minimum_number_of_available_bots',
      flake_constants.DEFAULT_MINIMUM_NUMBER_AVAILABLE_BOTS)
  min_available_rate = waterfall_config.GetSwarmingSettings().get(
      'minimum_percentage_of_available_bots',
      flake_constants.DEFAULT_MINIMUM_PERCENTAGE_AVAILABLE_BOTS)

  bot_counts = swarming_util.GetBotCounts(
      swarming.SwarmingHost(), step_metadata.get('dimensions'),
      FinditHttpClient)
  # The -1 sentinel avoids a ZeroDivisionError when no count is reported;
  # the resulting negative rate then fails the threshold check below.
  total = bot_counts.count or -1
  available = bot_counts.available or 0

  return (available > min_available_bots and
          float(available) / total > min_available_rate)
def TriggerSwarmingTask(run_swarming_task_parameters, runner_id):
  """Triggers a swarming rerun for the given tests in a given build."""
  master_name, builder_name, build_number = (
      run_swarming_task_parameters.build_key.GetParts())
  step_name = run_swarming_task_parameters.step_name
  tests = run_swarming_task_parameters.tests
  assert tests, 'No tests to trigger swarming task for.'

  client = FinditHttpClient()

  # Step 1: look up the Swarming task this rerun should be based on.
  ref_task_id, ref_request = swarming.GetReferredSwarmingTaskRequestInfo(
      master_name, builder_name, build_number, step_name, client)

  # Step 2: build a new request from the reference, overriding the rerun
  # parameters (notably the number of iterations).
  rerun_iterations = waterfall_config.GetSwarmingSettings().get(
      'iterations_to_rerun')
  rerun_request = CreateNewSwarmingTaskRequest(
      runner_id, ref_task_id, ref_request, master_name, builder_name,
      build_number, step_name, tests, rerun_iterations)

  # Step 3: trigger the new Swarming task to re-run the failed tests.
  task_id, _ = swarming_util.TriggerSwarmingTask(swarming.SwarmingHost(),
                                                 rerun_request, client)

  # Step 4: record the triggered task.
  if task_id:  # pragma: no branch.
    OnSwarmingTaskTriggered(master_name, builder_name, build_number, step_name,
                            tests, task_id, rerun_iterations, rerun_request)

  return task_id
def TriggerSwarmingTask(request, http_client):
  """Triggers a new Swarming task for the given request.

  The Swarming task priority will be overwritten, and extra tags might be
  added.

  Args:
    request (SwarmingTaskRequest): A Swarming task request.
    http_client (RetryHttpClient): An http client with automatic retry.
  """
  settings = waterfall_config.GetSwarmingSettings()

  # Use a priority much lower than CQ for now (CQ's priority is 30).
  # Later we might use a higher priority -- a lower value here.
  # Note: the smaller value, the higher priority.
  request.priority = max(100, settings.get('default_request_priority'))
  request.expiration_secs = settings.get('request_expiration_hours') * 60 * 60
  request.tags.extend(['findit:1', 'project:Chromium', 'purpose:post-commit'])

  url = 'https://%s/_ah/api/swarming/v1/tasks/new' % settings.get(
      'server_host')
  response_data, error = _SendRequestToServer(url, http_client,
                                              request.Serialize())
  if error:
    return None, error
  return json.loads(response_data)['task_id'], None
def testRetrieveShardedTestResultsFromIsolatedServer(self):
  """Results from three shards are collected and merged."""
  settings = waterfall_config.GetSwarmingSettings()
  shards = ('shard1', 'shard2', 'shard3')
  isolated_data = [{
      'digest': '%s_isolated' % shard,
      'namespace': 'default-gzip',
      'isolatedserver': settings.get('isolated_server'),
  } for shard in shards]
  storage_url = settings.get('isolated_storage_url')
  for shard in shards:
    self.http_client._SetResponseForPostRequest('%s_isolated' % shard)
    self.http_client._SetResponseForPostRequest('%s_url' % shard)
    self.http_client._SetResponseForGetRequestIsolated(
        'https://%s/default-gzip/%s' % (storage_url, shard), shard)

  result = swarming_util.RetrieveShardedTestResultsFromIsolatedServer(
      isolated_data, self.http_client)

  expected_results_file = os.path.join(
      os.path.dirname(__file__), 'data', 'expected_collect_results')
  with open(expected_results_file, 'r') as f:
    expected_result = json.loads(f.read())
  self.assertEqual(expected_result, result)
def GetSwarmingTaskResultById(task_id, http_client):
  """Gets swarming result, checks state and returns outputs ref if needed."""
  url = 'https://%s/_ah/api/swarming/v1/task/%s/result' % (
      waterfall_config.GetSwarmingSettings().get('server_host'), task_id)
  data, error = _SendRequestToServer(url, http_client)
  # On error the caller still gets an (empty) dict plus the error details.
  json_data = json.loads(data) if not error else {}
  return json_data, error
def _BotsAvailableForTask(self, step_metadata):
  """Check if there are available bots for this task's dimensions."""
  if not step_metadata:
    return False

  min_count = waterfall_config.GetSwarmingSettings().get(
      'minimum_number_of_available_bots', _MINIMUM_NUMBER_BOT)
  min_rate = waterfall_config.GetSwarmingSettings().get(
      'minimum_percentage_of_available_bots', _MINIMUM_PERCENT_BOT)

  bot_counts = swarming_util.GetSwarmingBotCounts(
      step_metadata.get('dimensions'), HttpClientAppengine())
  # The -1 sentinel prevents division by zero when no count is reported;
  # the negative rate it produces fails the threshold check below.
  total = bot_counts.get('count') or -1
  available = bot_counts.get('available', 0)

  return (available > min_count and
          float(available) / total > min_rate)
def testRetrieveShardedTestResultsFromIsolatedServerSingleShard(self):
  """A single shard's results are returned as-is."""
  settings = waterfall_config.GetSwarmingSettings()
  isolated_data = [{
      'digest': 'shard1_isolated',
      'namespace': 'default-gzip',
      'isolatedserver': settings.get('isolated_server'),
  }]
  self.http_client._SetResponseForPostRequest('shard1_isolated')
  self.http_client._SetResponseForPostRequest('shard1_url')
  self.http_client._SetResponseForGetRequestIsolated(
      'https://%s/default-gzip/shard1' % settings.get('isolated_storage_url'),
      'shard1')

  result = swarming_util.RetrieveShardedTestResultsFromIsolatedServer(
      isolated_data, self.http_client)

  expected_result = json.loads(
      zlib.decompress(self.http_client._GetData('isolated', 'shard1')))
  self.assertEqual(expected_result, result)
def GetSwarmingTaskRequest(task_id, http_client):
  """Returns an instance of SwarmingTaskRequest representing the given task."""
  url = 'https://%s/_ah/api/swarming/v1/task/%s/request' % (
      waterfall_config.GetSwarmingSettings().get('server_host'), task_id)
  content, error = _SendRequestToServer(url, http_client)

  # TODO(lijeffrey): Handle/report error in calling functions.
  if error:
    return None
  return SwarmingTaskRequest.Deserialize(json.loads(content))
def testRetrieveShardedTestResultsFromIsolatedServerFailed(self):
  """None is returned when the isolated download fails."""
  isolated_data = [{
      'digest': 'shard1_isolated',
      'namespace': 'default-gzip',
      'isolatedserver': waterfall_config.GetSwarmingSettings().get(
          'isolated_server'),
  }]
  # No responses are stubbed on the http client, so the download fails.
  result = swarming_util.RetrieveShardedTestResultsFromIsolatedServer(
      isolated_data, self.http_client)
  self.assertIsNone(result)
def testDownloadTestResults(self):
  """Test results are downloaded and decompressed from the isolated server."""
  settings = waterfall_config.GetSwarmingSettings()
  isolated_data = {
      'digest': 'shard1_isolated',
      'namespace': 'default-gzip',
      'isolatedserver': settings.get('isolated_server'),
  }
  storage_url = settings.get('isolated_storage_url')
  self.http_client._SetResponseForPostRequest('shard1_isolated')
  self.http_client._SetResponseForPostRequest('shard1_url')
  self.http_client._SetResponseForGetRequestIsolated(
      'https://%s/default-gzip/shard1' % storage_url, 'shard1')

  result, error = swarming_util._DownloadTestResults(isolated_data,
                                                     self.http_client)

  expected_result = json.loads(
      zlib.decompress(self.http_client._GetData('isolated', 'shard1')))
  self.assertEqual(expected_result, result)
  self.assertIsNone(error)
def testGetSwarmingTaskFailureLog(self):
  """The failure log is fetched via the task's outputs_ref."""
  settings = waterfall_config.GetSwarmingSettings()
  outputs_ref = {
      'isolatedserver': settings.get('isolated_server'),
      'namespace': 'default-gzip',
      'isolated': 'shard1_isolated',
  }
  self.http_client._SetResponseForPostRequest('shard1_isolated')
  self.http_client._SetResponseForPostRequest('shard1_url')
  self.http_client._SetResponseForGetRequestIsolated(
      'https://%s/default-gzip/shard1' % settings.get('isolated_storage_url'),
      'shard1')

  result, error = swarming_util.GetSwarmingTaskFailureLog(
      outputs_ref, self.http_client)

  expected_result = json.loads(
      zlib.decompress(self.http_client._GetData('isolated', 'shard1')))
  self.assertEqual(expected_result, result)
  self.assertIsNone(error)
def testDownloadTestResultsFailedForFileUrl(self):
  """An error is returned when fetching the file url fails."""
  isolated_data = {
      'digest': 'shard1_isolated',
      'namespace': 'default-gzip',
      'isolatedserver': waterfall_config.GetSwarmingSettings().get(
          'isolated_server'),
  }
  # Only the first POST (hash lookup) is stubbed, so the follow-up request
  # for the file url fails.
  self.http_client._SetResponseForPostRequest('shard1_isolated')

  result, error = swarming_util._DownloadTestResults(isolated_data,
                                                     self.http_client)
  self.assertIsNone(result)
  self.assertIsNotNone(error)
def testDownloadTestResultsFailedForSecondHash(self):
  """An error is returned when the digest cannot be resolved."""
  isolated_data = {
      'digest': 'not found',
      'namespace': 'default-gzip',
      'isolatedserver': waterfall_config.GetSwarmingSettings().get(
          'isolated_server'),
  }

  result, error = swarming_util._DownloadTestResults(isolated_data,
                                                     self.http_client)
  self.assertIsNone(result)
  self.assertIsNotNone(error)
def testGetIsolatedOutputForTask(self):
  """The isolated output of a task is fetched by task id."""
  task_id = '2944afa502297110'
  storage_url = waterfall_config.GetSwarmingSettings().get(
      'isolated_storage_url')
  self.http_client._SetResponseForGetRequestSwarmingResult(task_id)
  self.http_client._SetResponseForPostRequest('shard1_isolated')
  self.http_client._SetResponseForPostRequest('shard1_url')
  self.http_client._SetResponseForGetRequestIsolated(
      'https://%s/default-gzip/shard1' % storage_url, 'shard1')

  result = swarming_util.GetIsolatedOutputForTask(task_id, self.http_client)

  expected_result = json.loads(
      zlib.decompress(self.http_client._GetData('isolated', 'shard1')))
  self.assertEqual(expected_result, result)
def ListSwarmingTasksDataByTags(
    master_name, builder_name, build_number, http_client,
    additional_tag_filters=None):
  """Downloads tasks data from swarming server.

  Args:
    master_name(str): Value of the master tag.
    builder_name(str): Value of the buildername tag.
    build_number(int): Value of the buildnumber tag.
    http_client(RetryHttpClient): The http client to send HTTPs requests.
    additional_tag_filters(dict): More tag filters to be added.
  """
  base_url = ('https://%s/_ah/api/swarming/v1/tasks/'
              'list?tags=%s&tags=%s&tags=%s') % (
                  waterfall_config.GetSwarmingSettings().get('server_host'),
                  urllib.quote('master:%s' % master_name),
                  urllib.quote('buildername:%s' % builder_name),
                  urllib.quote('buildnumber:%d' % build_number))
  for tag_name, tag_value in (additional_tag_filters or {}).iteritems():
    base_url += '&tags=%s' % urllib.quote('%s:%s' % (tag_name, tag_value))

  items = []
  cursor = None
  while True:
    # Follow the server-supplied cursor until the listing is exhausted.
    url = base_url if not cursor else (
        base_url + '&cursor=%s' % urllib.quote(cursor))
    new_data, _ = _SendRequestToServer(url, http_client)

    # TODO(lijeffrey): handle error in calling functions.
    if not new_data:
      break

    page = json.loads(new_data)
    items.extend(page.get('items') or [])
    cursor = page.get('cursor')
    if not cursor:
      break

  return items
def check_task_completion():
  # Nested helper: decides, based on the enclosing scope's polling state
  # (task, data, task_completed, deadline, etc. are closure variables),
  # whether the pipeline is done, timed out, or should keep polling.
  if task_completed and data is not None:
    # Task finished: backfill any timestamps not yet recorded from the
    # swarming result payload, persist, and complete the pipeline.
    task.created_time = (task.created_time or
                         self._ConvertDateTime(data.get('created_ts')))
    task.started_time = (task.started_time or
                         self._ConvertDateTime(data.get('started_ts')))
    task.completed_time = (task.completed_time or
                           self._ConvertDateTime(data.get('completed_ts')))
    task.put()
    pipeline_result = self._GetPipelineResult(
        step_name, step_name_no_platform, task)
    self.complete(pipeline_result)
  elif time.time() > deadline:  # pragma: no cover
    # Timeout.
    # Updates status as ERROR.
    task.status = analysis_status.ERROR
    task.error = {
        'code': swarming_util.TIMED_OUT,
        'message': 'Process swarming task result timed out'
    }
    task.put()
    timeout_hours = waterfall_config.GetSwarmingSettings().get(
        'task_timeout_hours')
    logging.error('Swarming task timed out after %d hours.' % timeout_hours)
    # Complete the pipeline even on timeout so it does not hang forever.
    pipeline_result = self._GetPipelineResult(
        step_name, step_name_no_platform, task)
    self.complete(pipeline_result)
  else:
    # Not done yet: stash the (possibly updated) polling state so the next
    # callback resumes from where this one left off.
    self.last_params = {
        'task_id': task_id,
        'step_name': step_name,
        'call_args': call_args,
        'deadline': deadline,
        'server_query_interval_seconds': server_query_interval_seconds,
        'task_started': task_started,
        'task_completed': task_completed,
        'step_name_no_platform': step_name_no_platform,
    }
    # Update the stored callback url with possibly modified params.
    new_callback_url = self.get_callback_url(callback_params=json.dumps(
        self.last_params))
    if task.callback_url != new_callback_url:  # pragma: no cover
      task.callback_url = new_callback_url
      task.put()
def testGetSwarmingSettings(self):
  """The default swarming settings contain the expected keys and values."""
  expected_settings = {
      'server_host': 'chromium-swarm.appspot.com',
      'default_request_priority': 150,
      'request_expiration_hours': 20,
      'server_query_interval_seconds': 60,
      'task_timeout_hours': 23,
      'isolated_server': 'https://isolateserver.appspot.com',
      'isolated_storage_url': 'isolateserver.storage.googleapis.com',
      'iterations_to_rerun': 10,
      'get_swarming_task_id_timeout_seconds': 300,
      'get_swarming_task_id_wait_seconds': 10,
      'server_retry_timeout_hours': 2,
      'maximum_server_contact_retry_interval_seconds': 300,  # 5 minutes.
      'should_retry_server': False,
      'minimum_number_of_available_bots': 5,
      'minimum_percentage_of_available_bots': 0.1,
  }
  self.assertEqual(expected_settings,
                   waterfall_config.GetSwarmingSettings())
def testGetSwarmingTaskResultById(self):
  """A completed task's state and outputs_ref are returned."""
  task_id = '2944afa502297110'
  self.http_client._SetResponseForGetRequestSwarmingResult(task_id)

  data, error = swarming_util.GetSwarmingTaskResultById(task_id,
                                                        self.http_client)

  expected_outputs_ref = {
      'isolatedserver': waterfall_config.GetSwarmingSettings().get(
          'isolated_server'),
      'namespace': 'default-gzip',
      'isolated': 'shard1_isolated',
  }
  self.assertEqual('COMPLETED', data['state'])
  self.assertEqual(expected_outputs_ref, data['outputs_ref'])
  self.assertIsNone(error)
def GetTaskIdFromSwarmingTaskEntity(urlsafe_task_key):
  """Gets swarming task id from SwarmingTask. Waits and polls if needed."""
  settings = waterfall_config.GetSwarmingSettings()
  poll_interval = settings.get('get_swarming_task_id_wait_seconds')
  deadline = time.time() + settings.get(
      'get_swarming_task_id_timeout_seconds')

  while time.time() < deadline:
    entity = ndb.Key(urlsafe=urlsafe_task_key).get()
    if not entity:
      raise Exception('Swarming task was deleted unexpectedly!')
    if entity.task_id:
      return entity.task_id
    # Wait for the existing pipeline to start the Swarming task.
    time.sleep(poll_interval)

  raise Exception('Timed out waiting for task_id.')
def _GetSwarmingTaskId(self, *args):
  """Polls the SwarmingTask entity until its task_id is populated."""
  settings = waterfall_config.GetSwarmingSettings()
  poll_interval = settings.get('get_swarming_task_id_wait_seconds')
  deadline = time.time() + settings.get(
      'get_swarming_task_id_timeout_seconds')

  while time.time() < deadline:
    swarming_task = self._GetSwarmingTask(*args)
    if not swarming_task:  # pragma: no cover. Pipeline will retry.
      raise Exception('Swarming task was deleted unexpectedly!')
    if swarming_task.task_id:
      return swarming_task.task_id
    # Wait for the existing pipeline to start the Swarming task.
    time.sleep(poll_interval)

  raise Exception('Time out!')  # pragma: no cover. Pipeline will retry.
def TriggerSwarmingTask(request, http_client):
  """Triggers a new Swarming task for the given request.

  The Swarming task priority will be overwritten, and extra tags might be
  added.

  Args:
    request (SwarmingTaskRequest): A Swarming task request.
    http_client (RetryHttpClient): An http client with automatic retry.
  """
  settings = waterfall_config.GetSwarmingSettings()

  # Use a priority much lower than CQ for now (CQ's priority is 30).
  # Later we might use a higher priority -- a lower value here.
  # Note: the smaller value, the higher priority.
  request.priority = str(
      max(100, settings.get('default_request_priority')))
  request.expiration_secs = str(
      settings.get('request_expiration_hours') * 60 * 60)
  request.tags.extend(['findit:1', 'project:Chromium', 'purpose:post-commit'])

  return swarming_util.TriggerSwarmingTask(SwarmingHost(), request,
                                           http_client)
def GetSwarmingBotCounts(dimensions, http_client):
  """Gets number of swarming bots for certain dimensions.

  Args:
    dimensions (dict): A dict of dimensions.
    http_client (HttpClient): The httpclient object with which to make the
      server calls.
  Returns:
    bot_counts (dict): Dict of numbers of available swarming bots.
  """
  if not dimensions:
    return {}

  host = waterfall_config.GetSwarmingSettings().get('server_host')
  # Url looks like 'https://chromium-swarm.appspot.com/_ah/api/swarming/v1/bots
  # /count?dimensions=os:Windows-7-SP1&dimensions=cpu:x86-64'
  query = '&dimensions='.join(
      '%s:%s' % (name, value) for name, value in dimensions.iteritems())
  url = 'https://%s/_ah/api/swarming/v1/bots/count?dimensions=%s' % (host,
                                                                     query)

  content, error = _SendRequestToServer(url, http_client)
  if error or not content:
    return {}

  content_data = json.loads(content)
  counts = {
      key: int(content_data.get(key, 0))
      for key in ('busy', 'count', 'dead', 'quarantined')
  }
  # Bots neither busy, dead, nor quarantined are considered available.
  counts['available'] = (counts['count'] - counts['busy'] - counts['dead'] -
                         counts['quarantined'])
  return counts
def _GenerateSwarmingTasksData(failure_result_map):
  """Collects info for all related swarming tasks.

  Args:
    failure_result_map (dict): Maps step name to either a str build key (for
        non-swarming steps) or a dict mapping test name to build key.

  Returns:
    A dict as below:
    {
        'step1': {
            'swarming_tasks': {
                'm/b/121': {
                    'task_info': {
                        'status': 'Completed',
                        'task_id': 'task1',
                        'task_url': ('https://chromium-swarm.appspot.com/user'
                                     '/task/task1')
                    },
                    'all_tests': ['test2', 'test3', 'test4'],
                    'reliable_tests': ['test2'],
                    'flaky_tests': ['test3', 'test4']
                }
            }
        },
        'step2': {
            'swarming_tasks': {
                'm/b/121': {
                    'task_info': {
                        'status': 'Pending'
                    },
                    'all_tests': ['test1']
                }
            }
        },
        'step3': {
            'swarming_tasks': {
                'm/b/121': {
                    'task_info': {
                        'status': 'No swarming rerun found'
                    },
                    'all_tests': ['test1']
                }
            }
        }
    }
  """
  # Three-level nesting: step_name -> 'swarming_tasks' -> build key -> info.
  tasks_info = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
  swarming_server = waterfall_config.GetSwarmingSettings()['server_host']

  for step_name, failure in failure_result_map.iteritems():
    step_tasks_info = tasks_info[step_name]['swarming_tasks']
    if isinstance(failure, dict):
      # Only swarming test failures have swarming re-runs.
      # Distinct build keys referenced by this step's failed tests.
      swarming_task_keys = set(failure.values())
      for key in swarming_task_keys:
        task_dict = step_tasks_info[key]
        referred_build_keys = BaseBuildModel.GetBuildInfoFromBuildKey(key)
        task = WfSwarmingTask.Get(*referred_build_keys, step_name=step_name)
        all_tests = _GetAllTestsForASwarmingTask(key, failure)
        task_dict['all_tests'] = all_tests

        if not task:
          # In case task got manually removed from data store.
          task_info = {'status': result_status.NO_SWARMING_TASK_FOUND}
        else:
          task_info = {'status': task.status}

          # Get the step name without platform.
          # This value should have been saved in task.parameters;
          # in case of no such value saved, split the step_name.
          task_dict['ref_name'] = (
              step_name.split()[0]
              if not task.parameters or not task.parameters.get('ref_name')
              else task.parameters['ref_name'])

          if task.task_id:
            # Swarming rerun has started.
            task_info['task_id'] = task.task_id
            task_info['task_url'] = 'https://%s/user/task/%s' % (
                swarming_server, task.task_id)

          if task.classified_tests:
            # Swarming rerun has completed.
            # Use its result to get reliable and flaky tests.
            # If task has not completed, there will be no try job yet,
            # the result will be grouped in unclassified failures temporarily.
            reliable_tests = task.classified_tests.get('reliable_tests', [])
            task_dict['reliable_tests'] = [
                test for test in reliable_tests if test in all_tests
            ]
            flaky_tests = task.classified_tests.get('flaky_tests', [])
            task_dict['flaky_tests'] = [
                test for test in flaky_tests if test in all_tests
            ]

        task_dict['task_info'] = task_info
    else:
      # Non-swarming step: no rerun is possible for it.
      step_tasks_info[failure] = {
          'task_info': {
              'status': result_status.NON_SWARMING_NO_RERUN
          }
      }

  return tasks_info
def _SendRequestToServer(url, http_client, post_data=None):
  """Sends GET/POST request to arbitrary url and returns response content.

  Because the Swarming and Isolated servers that _SendRequestToServer tries to
  contact are prone to outages, exceptions trying to reach them may occur thus
  this method should retry. We want to monitor and document these occurrences
  even if the request eventually succeeds after retrying, with the last error
  encountered being the one that is reported.

  Args:
    url (str): The url to send the request to.
    http_client (HttpClient): The httpclient object with which to make the
      server calls.
    post_data (dict): Data/params to send with the request, if any.

  Returns:
    content (dict), error (dict): The content from the server and the last
      error encountered trying to retrieve it.
  """
  headers = {'Authorization': 'Bearer ' + auth_util.GetAuthToken()}
  swarming_settings = waterfall_config.GetSwarmingSettings()
  should_retry = swarming_settings.get('should_retry_server')
  timeout_seconds = (
      swarming_settings.get('server_retry_timeout_hours') * 60 * 60)
  maximum_retry_interval = swarming_settings.get(
      'maximum_server_contact_retry_interval_seconds')
  deadline = time.time() + timeout_seconds
  retry_backoff = 60
  tries = 1
  # Tracks the most recent error across attempts; note it is deliberately
  # not reset between iterations so the last error is what gets reported.
  error = None

  if post_data:
    # Presence of post_data switches the request from GET to POST below.
    post_data = json.dumps(post_data, sort_keys=True, separators=(',', ':'))
    headers['Content-Type'] = 'application/json; charset=UTF-8'
    headers['Content-Length'] = len(post_data)

  while True:
    try:
      if post_data:
        status_code, content = http_client.Post(url, post_data,
                                                headers=headers)
      else:
        status_code, content = http_client.Get(url, headers=headers)
    except ConnectionClosedError as e:
      error = {
          'code': URLFETCH_CONNECTION_CLOSED_ERROR,
          'message': e.message
      }
      _OnConnectionFailed(url, 'ConnectionClosedError')
    except DeadlineExceededError as e:
      error = {
          'code': URLFETCH_DEADLINE_EXCEEDED_ERROR,
          'message': e.message
      }
      _OnConnectionFailed(url, 'DeadlineExceededError')
    except DownloadError as e:
      error = {
          'code': URLFETCH_DOWNLOAD_ERROR,
          'message': e.message
      }
      _OnConnectionFailed(url, 'DownloadError')
    except Exception as e:  # pragma: no cover
      logging.error(
          'An unknown exception occurred that need to be monitored: %s',
          e.message)
      error = {
          'code': UNKNOWN,
          'message': e.message
      }
      _OnConnectionFailed(url, 'Unknown Exception')

    # NOTE(review): when an exception was raised above, status_code is unset
    # here; the short-circuit on the truthy `error` keeps this safe.
    if error or status_code != 200:
      # The retry upon 50x (501 excluded) is automatically handled in the
      # underlying http_client.
      # By default, it retries 5 times with exponential backoff.
      error = error or {
          'code': EXCEEDED_MAX_RETRIES_ERROR,
          'message': 'Max retries exceeded trying to reach %s' % url
      }
      logging.error(error['message'])
    else:
      # Even if the call is successful, still return the last error
      # encountered.
      return content, error

    if should_retry and time.time() < deadline:  # pragma: no cover
      # Wait, then retry if applicable.
      wait_time = _GetBackoffSeconds(retry_backoff, tries,
                                     maximum_retry_interval)
      logging.info('Retrying connection to %s in %d seconds', url, wait_time)
      time.sleep(wait_time)
      tries += 1
    else:
      if should_retry:
        # Indicate in the error that the retry timeout was reached.
        error['retry_timeout'] = True
      break

  logging.error('Failed to get an adequate response from %s. No data could be '
                'retrieved', url)
  return None, error
def run(self, master_name, builder_name, build_number, step_name,
        task_id=None, *args):
  """Monitors a swarming task.

  Args:
    master_name (str): The master name.
    builder_name (str): The builder name.
    build_number (str): The build number.
    step_name (str): The failed test step name.
    task_id (str): The task id to query the swarming server on the progresss
      of a swarming task.
  """
  call_args = self._GetArgs(master_name, builder_name, build_number,
                            step_name, *args)
  task = self._GetSwarmingTask(*call_args)
  task_id = task_id or task.task_id

  if not task_id:
    # The swarming task encountered an error when being triggered.
    if not task.error:  # pragma no branch
      task.error = {
          'error': 'Undetected error in swarming task. No task id found!',
          'message': 'Undetected error in swarming task. No task id found!'
      }
      task.put()
    return

  # Check to make this method idempotent.
  if task.callback_url and self.pipeline_id in task.callback_url:
    return

  timeout_hours = waterfall_config.GetSwarmingSettings().get(
      'task_timeout_hours')
  deadline = time.time() + timeout_hours * 60 * 60
  server_query_interval_seconds = waterfall_config.GetSwarmingSettings().get(
      'server_query_interval_seconds')
  # Initial polling state; carried through callbacks via last_params.
  task_started = False
  task_completed = False
  step_name_no_platform = None

  if task_id.lower() in (NO_TASK, NO_TASK_EXCEPTION):  # pragma: no branch
    # This situation happens in flake analysis: if the step with flaky test
    # didn't exist in checked build or the build had exception so the step
    # with flaky test didn't run at all, we should skip the build.
    has_valid_artifact = task_id != NO_TASK_EXCEPTION
    task.task_id = None
    task.status = analysis_status.SKIPPED
    task.put()
    self._UpdateMasterFlakeAnalysis(
        *call_args,
        pass_rate=-1,
        flake_swarming_task=task,
        has_valid_artifact=has_valid_artifact)
    self.complete(self._GetPipelineResult(
        step_name, step_name_no_platform, task))
    return

  self.last_params = {
      'task_id': task_id,
      'step_name': step_name,
      'call_args': call_args,
      'deadline': deadline,
      'server_query_interval_seconds': server_query_interval_seconds,
      'task_started': task_started,
      'task_completed': task_completed,
      'step_name_no_platform': step_name_no_platform,
  }

  task.callback_url = self.get_callback_url(callback_params=json.dumps(
      self.last_params))
  task.callback_target = appengine_util.GetTargetNameForModule(
      constants.WATERFALL_BACKEND)
  task.put()

  # Guarantee one callback 10 minutes after the deadline to clean up even if
  # Swarming fails to call us back.
  self.delay_callback(
      (timeout_hours * 60 + 10) * 60,
      self.last_params,
      name=task_id + '_cleanup_task')

  # Run immediately in case the task already went from scheduled to started.
  self.callback(callback_params=self.last_params)
def testGetIsolatedDataForFailedBuild(self):
  """Isolated data is attached to failed test steps but not compile."""
  master_name = 'm'
  builder_name = 'b'
  build_number = 223
  failed_steps = {
      step: {'current_failure': 2, 'first_failure': 0}
      for step in ('a_tests', 'unit_tests', 'compile')
  }
  self.http_client._SetResponseForGetRequestSwarmingList(
      master_name, builder_name, build_number)

  result = swarming_util.GetIsolatedDataForFailedBuild(
      master_name, builder_name, build_number, failed_steps, self.http_client)

  isolated_server = waterfall_config.GetSwarmingSettings().get(
      'isolated_server')
  expected_failed_steps = {
      'a_tests': {
          'current_failure': 2,
          'first_failure': 0,
          'list_isolated_data': [{
              'digest': 'isolatedhashatests',
              'namespace': 'default-gzip',
              'isolatedserver': isolated_server,
          }]
      },
      'unit_tests': {
          'current_failure': 2,
          'first_failure': 0,
          'list_isolated_data': [{
              'digest': 'isolatedhashunittests1',
              'namespace': 'default-gzip',
              'isolatedserver': isolated_server,
          }]
      },
      'compile': {
          'current_failure': 2,
          'first_failure': 0
      },
  }

  # A WfStep entity should only exist for test steps, not for compile.
  for step_name in failed_steps:
    step = WfStep.Get(master_name, builder_name, build_number, step_name)
    if step_name == 'compile':
      self.assertIsNone(step)
    else:
      self.assertIsNotNone(step)

  self.assertTrue(result)
  self.assertEqual(expected_failed_steps, failed_steps)
def _GetIterationsToRerun(self):
  """Returns the configured number of iterations to rerun a swarming task."""
  swarming_settings = waterfall_config.GetSwarmingSettings()
  return swarming_settings.get('iterations_to_rerun')
def TimeoutSeconds(self):
  """Returns the swarming task timeout in seconds (defaults to 24 hours)."""
  hours = waterfall_config.GetSwarmingSettings().get('task_timeout_hours', 24)
  return hours * 3600