def trigger_by_manifest(swarming, manifest):
  """Given a task manifest, triggers it for execution on swarming.

  Args:
    swarming: URL of a swarming service.
    manifest: instance of Manifest.

  Returns:
    tuple(Task id, priority) on success. tuple(None, None) on failure.
  """
  logging.info('Triggering: %s', manifest.task_name)
  manifest_text = manifest.to_json()
  result = net.url_read(swarming + '/test', data={'request': manifest_text})
  if not result:
    on_error.report('Failed to trigger task %s' % manifest.task_name)
    return None, None
  try:
    data = json.loads(result)
  except (ValueError, TypeError):
    msg = '\n'.join((
        'Failed to trigger task %s' % manifest.task_name,
        'Manifest: %s' % manifest_text,
        'Bad response: %s' % result))
    on_error.report(msg)
    return None, None
  if not data:
    return None, None
  return data['test_keys'][0]['test_key'], data['priority']
def trigger_by_manifest(swarming, manifest):
  """Given a task manifest, triggers it for execution on swarming.

  Args:
    swarming: URL of a swarming service.
    manifest: instance of Manifest.

  Returns:
    tuple(Task id, priority) on success. tuple(None, None) on failure.
  """
  logging.info('Triggering: %s', manifest.task_name)
  manifest_text = manifest.to_json()
  result = net.url_read(swarming + '/test', data={'request': manifest_text})
  if not result:
    tools.report_error('Failed to trigger task %s' % manifest.task_name)
    # Keep the (task id, priority) tuple contract on this failure path too.
    return None, None
  try:
    data = json.loads(result)
  except (ValueError, TypeError) as e:
    msg = '\n'.join((
        'Failed to trigger task %s' % manifest.task_name,
        'Manifest: %s' % manifest_text,
        'Bad response: %s' % result,
        str(e)))
    tools.report_error(msg)
    return None, None
  if not data:
    return None, None
  return data['test_keys'][0]['test_key'], data['priority']
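# A minimal usage sketch, not from the original source, for the
# trigger_by_manifest() variants above. The server URL is hypothetical and
# `manifest` is assumed to be an instance of the module's own Manifest class.
def example_trigger(manifest):
  task_id, priority = trigger_by_manifest(
      'https://swarming.example.com', manifest)
  if task_id is None:
    # The failure was already reported inside trigger_by_manifest().
    return 1
  logging.info('Task %s triggered at priority %s', task_id, priority)
  return 0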
def retrieve_results(base_url, test_key, timeout, should_stop):
  """Retrieves results for a single test_key."""
  assert isinstance(timeout, float), timeout
  params = [('r', test_key)]
  result_url = '%s/get_result?%s' % (base_url, urllib.urlencode(params))
  start = now()
  while True:
    if timeout and (now() - start) >= timeout:
      logging.error('retrieve_results(%s) timed out', base_url)
      return {}
    # Do retries ourselves.
    response = net.url_read(result_url, retry_404=False, retry_50x=False)
    if response is None:
      # Aggressively poll for results. Do not use retry_404 so
      # should_stop is polled more often.
      remaining = min(5, timeout - (now() - start)) if timeout else 5
      if remaining > 0:
        if should_stop.get():
          return {}
        net.sleep_before_retry(1, remaining)
    else:
      try:
        data = json.loads(response) or {}
      except (ValueError, TypeError):
        logging.warning(
            'Received corrupted data for test_key %s. Retrying.', test_key)
      else:
        if data['output']:
          return data
    if should_stop.get():
      return {}
def retrieve_results(base_url, task_key, timeout, should_stop,
                     output_collector):
  """Retrieves results for a single task_key.

  Returns a dict with results on success or None on failure or timeout.
  """
  assert isinstance(timeout, float), timeout
  params = [("r", task_key)]
  result_url = "%s/get_result?%s" % (base_url, urllib.urlencode(params))
  started = now()
  deadline = started + timeout if timeout else None
  attempt = 0

  while not should_stop.is_set():
    attempt += 1

    # Waiting for too long -> give up.
    current_time = now()
    if deadline and current_time >= deadline:
      logging.error("retrieve_results(%s) timed out on attempt %d",
                    base_url, attempt)
      return None

    # Do not spin too fast. Spin faster at the beginning though.
    # Start with 1 sec delay and for each 30 sec of waiting add another second
    # of delay, until hitting 15 sec ceiling.
    if attempt > 1:
      max_delay = min(15, 1 + (current_time - started) / 30.0)
      delay = min(max_delay, deadline - current_time) if deadline else max_delay
      if delay > 0:
        logging.debug("Waiting %.1f sec before retrying", delay)
        should_stop.wait(delay)
        if should_stop.is_set():
          return None

    # Disable internal retries in net.url_read, since we are doing retries
    # ourselves. Do not use retry_404 so should_stop is polled more often.
    response = net.url_read(result_url, retry_404=False, retry_50x=False)

    # Request failed. Try again.
    if response is None:
      continue

    # Got some response, ensure it is JSON dict, retry if not.
    try:
      result = json.loads(response) or {}
      if not isinstance(result, dict):
        raise ValueError()
    except (ValueError, TypeError):
      logging.warning(
          "Received corrupted or invalid data for task_key %s, retrying: %r",
          task_key, response)
      continue

    # Swarming server uses non-empty 'output' value as a flag that task has
    # finished. How to wait for tasks that produce no output is a mystery.
    if result.get("output"):
      # Record the result, try to fetch attached output files (if any).
      if output_collector:
        # TODO(vadimsh): Respect |should_stop| and |deadline| when fetching.
        output_collector.process_shard_result(result)
      return result
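# A minimal sketch, not original code, of driving the retrieve_results()
# variant above. The .is_set()/.wait() calls imply should_stop is a
# threading.Event; the URL, task key and 5 minute timeout are hypothetical.
import threading

def example_wait_for_result():
  should_stop = threading.Event()
  result = retrieve_results(
      'https://swarming.example.com', '12ab34cd', 300.0, should_stop, None)
  if result is None:
    logging.error('Gave up: timeout, stop signal or persistent bad responses')
  return result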
def download_data(root_dir, files):
  """Downloads and expands the zip files enumerated in the test run data."""
  for data_url, _ in files:
    logging.info('Downloading: %s', data_url)
    content = net.url_read(data_url)
    if content is None:
      raise Exception('Failed to download %s' % data_url)
    with zipfile.ZipFile(StringIO.StringIO(content)) as zip_file:
      zip_file.extractall(root_dir)
def calculate_version(url):
  """Retrieves the swarm_bot code and returns the SHA-1 for it."""
  # Cannot use url_open() since zipfile requires .seek().
  archive = zipfile.ZipFile(StringIO.StringIO(net.url_read(url)))
  # See
  # https://code.google.com/p/swarming/source/browse/services/swarming/common/bot_archive.py
  d = hashlib.sha1()
  for f in archive.namelist():
    d.update(archive.read(f))
  return d.hexdigest()
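# A self-contained sketch (stdlib only, made-up archive contents) checking the
# hashing scheme used by calculate_version() above: the SHA-1 covers the bytes
# of every member, in namelist() order, so any changed file changes the
# computed version.
def example_zip_sha1():
  buf = StringIO.StringIO()
  with zipfile.ZipFile(buf, 'w') as z:
    z.writestr('swarm_bot/a.py', 'print 1\n')
    z.writestr('swarm_bot/b.py', 'print 2\n')
  archive = zipfile.ZipFile(StringIO.StringIO(buf.getvalue()))
  d = hashlib.sha1()
  for f in archive.namelist():
    d.update(archive.read(f))
  return d.hexdigest()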
def test_url_read(self):
  # Successfully reads the data.
  self.mock(
      net, "url_open",
      lambda url, **_kwargs: net.HttpResponse.get_fake_response("111", url))
  self.assertEqual(net.url_read("https://fake_url.com/test"), "111")

  # Respects url_open connection errors.
  self.mock(net, "url_open", lambda _url, **_kwargs: None)
  self.assertIsNone(net.url_read("https://fake_url.com/test"))

  # Respects read timeout errors.
  def timeouting_http_response(url):
    def read_mock(_size=None):
      raise net.TimeoutError()
    response = net.HttpResponse.get_fake_response("", url)
    self.mock(response, "read", read_mock)
    return response

  self.mock(
      net, "url_open", lambda url, **_kwargs: timeouting_http_response(url))
  self.assertIsNone(net.url_read("https://fake_url.com/test"))
def CMDreproduce(parser, args):
  """Runs a task locally that was triggered on the server.

  This runs locally the same commands that were run on the bot. The data
  downloaded will be in a subdirectory named 'work' of the current working
  directory.
  """
  options, args = parser.parse_args(args)
  if len(args) != 1:
    parser.error('Must specify exactly one task id.')

  url = options.swarming + '/swarming/api/v1/client/task/%s/request' % args[0]
  request = net.url_read_json(url)
  if not request:
    print >> sys.stderr, 'Failed to retrieve request data for the task'
    return 1

  if not os.path.isdir('work'):
    os.mkdir('work')

  swarming_host = urlparse.urlparse(options.swarming).netloc
  properties = request['properties']
  for data_url, _ in properties['data']:
    assert data_url.startswith('https://'), data_url
    data_host = urlparse.urlparse(data_url).netloc
    if data_host != swarming_host:
      auth.ensure_logged_in('https://' + data_host)
    content = net.url_read(data_url)
    if content is None:
      print >> sys.stderr, 'Failed to download %s' % data_url
      return 1
    with zipfile.ZipFile(StringIO.StringIO(content)) as zip_file:
      zip_file.extractall('work')

  env = None
  if properties['env']:
    env = os.environ.copy()
    logging.info('env: %r', properties['env'])
    env.update(
        (k.encode('utf-8'), v.encode('utf-8'))
        for k, v in properties['env'].iteritems())

  exit_code = 0
  for cmd in properties['commands']:
    try:
      c = subprocess.call(cmd, env=env, cwd='work')
    except OSError as e:
      print >> sys.stderr, 'Failed to run: %s' % ' '.join(cmd)
      print >> sys.stderr, str(e)
      c = 1
    if not exit_code:
      exit_code = c
  return exit_code
def url_read(self, resource, **kwargs):
  url = self.url + resource
  if kwargs.get('data') is None:
    # No XSRF token for GET.
    return net.url_read(url, **kwargs)
  if self.need_refresh():
    self.refresh_token()
  resp = self._url_read_post(url, **kwargs)
  if resp is None:
    raise Error('Failed to connect to %s; %s' % (url, self.expiration))
  return resp
def test_url_read(self):
  # Successfully reads the data.
  self.mock(
      net, 'url_open',
      lambda url, **_kwargs: net_utils.make_fake_response('111', url))
  self.assertEqual(net.url_read('https://fake_url.com/test'), '111')

  # Respects url_open connection errors.
  self.mock(net, 'url_open', lambda _url, **_kwargs: None)
  self.assertIsNone(net.url_read('https://fake_url.com/test'))

  # Respects read timeout errors.
  def timeouting_http_response(url):
    def read_mock(_size=None):
      raise net.TimeoutError()
    response = net_utils.make_fake_response('', url)
    self.mock(response, 'read', read_mock)
    return response

  self.mock(
      net, 'url_open', lambda url, **_kwargs: timeouting_http_response(url))
  self.assertIsNone(net.url_read('https://fake_url.com/test'))
def process_manifest(
    swarming, isolate_server, namespace, isolated_hash, task_name, shards,
    dimensions, env, working_dir, verbose, profile, priority, algo):
  """Processes the manifest file and sends off the swarming task request."""
  try:
    manifest = Manifest(
        isolate_server=isolate_server,
        namespace=namespace,
        isolated_hash=isolated_hash,
        task_name=task_name,
        shards=shards,
        dimensions=dimensions,
        env=env,
        working_dir=working_dir,
        verbose=verbose,
        profile=profile,
        priority=priority,
        algo=algo)
  except ValueError as e:
    tools.report_error('Unable to process %s: %s' % (task_name, e))
    return 1

  chromium_setup(manifest)

  logging.info('Zipping up files...')
  if not zip_and_upload(manifest):
    return 1

  logging.info('Server: %s', swarming)
  logging.info('Task name: %s', task_name)
  trigger_url = swarming + '/test'
  manifest_text = manifest.to_json()
  result = net.url_read(trigger_url, data={'request': manifest_text})
  if not result:
    tools.report_error(
        'Failed to trigger task %s\n%s' % (task_name, trigger_url))
    return 1
  try:
    json.loads(result)
  except (ValueError, TypeError) as e:
    msg = '\n'.join((
        'Failed to trigger task %s' % task_name,
        'Manifest: %s' % manifest_text,
        'Bad response: %s' % result,
        str(e)))
    tools.report_error(msg)
    return 1
  return 0
def process_manifest(swarming, isolate_server, namespace, isolated_hash,
                     task_name, extra_args, shards, dimensions, env,
                     working_dir, deadline, verbose, profile, priority):
  """Processes the manifest file and sends off the swarming task request."""
  try:
    manifest = Manifest(
        isolate_server=isolate_server,
        namespace=namespace,
        isolated_hash=isolated_hash,
        task_name=task_name,
        extra_args=extra_args,
        shards=shards,
        dimensions=dimensions,
        env=env,
        working_dir=working_dir,
        deadline=deadline,
        verbose=verbose,
        profile=profile,
        priority=priority)
  except ValueError as e:
    tools.report_error('Unable to process %s: %s' % (task_name, e))
    return 1

  chromium_setup(manifest)

  logging.info('Zipping up files...')
  if not zip_and_upload(manifest):
    return 1

  logging.info('Server: %s', swarming)
  logging.info('Task name: %s', task_name)
  trigger_url = swarming + '/test'
  manifest_text = manifest.to_json()
  result = net.url_read(trigger_url, data={'request': manifest_text})
  if not result:
    tools.report_error(
        'Failed to trigger task %s\n%s' % (task_name, trigger_url))
    return 1
  try:
    json.loads(result)
  except (ValueError, TypeError) as e:
    msg = '\n'.join((
        'Failed to trigger task %s' % task_name,
        'Manifest: %s' % manifest_text,
        'Bad response: %s' % result,
        str(e)))
    tools.report_error(msg)
    return 1
  return 0
def url_read(self, resource, **kwargs):
  url = self.url + resource
  if kwargs.get('data') is None:
    # No XSRF token for GET.
    return net.url_read(url, **kwargs)
  if not self.token:
    self.token = self.refresh_token()
  resp = self._url_read_post(url, **kwargs)
  if resp is None:
    # This includes 403 because the XSRF token expired. Renew the token.
    # TODO(maruel): It'd be great if it were transparent.
    self.refresh_token()
    resp = self._url_read_post(url, **kwargs)
    if resp is None:
      raise Error('Failed to connect to %s' % url)
  return resp
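# A minimal usage sketch (hypothetical resources, not original code) for the
# XSRF-aware url_read() methods above: GETs bypass the token entirely, while
# POSTs lazily fetch a token and, in the second variant, retry once after a
# failed POST in case the token expired.
def example_xsrf_client(client):
  state = client.url_read('/state')               # GET, no XSRF token.
  ack = client.url_read('/event', data={'e': 1})  # POST, token + one retry.
  return state, ack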
def get_task_keys(swarm_base_url, task_name):
  """Returns the Swarming task key for each shard of task_name."""
  key_data = urllib.urlencode([("name", task_name)])
  url = "%s/get_matching_test_cases?%s" % (swarm_base_url, key_data)

  for _ in net.retry_loop(max_attempts=net.URL_OPEN_MAX_ATTEMPTS):
    result = net.url_read(url, retry_404=True)
    if result is None:
      raise Failure(
          "Error: Unable to find any task with the name, %s, on swarming "
          "server" % task_name)

    # TODO(maruel): Compare exact string.
    if "No matching" in result:
      logging.warning("Unable to find any task with the name, %s, on swarming "
                      "server" % task_name)
      continue
    return json.loads(result)

  raise Failure(
      "Error: Unable to find any task with the name, %s, on swarming server"
      % task_name)
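# A minimal sketch (hypothetical server URL and task name) of calling
# get_task_keys() above and turning its Failure exception into a soft error;
# Failure and sys are assumed to be available in the module.
def example_find_shards():
  try:
    return get_task_keys('https://swarming.example.com', 'load-test-0')
  except Failure as e:
    print >> sys.stderr, str(e)
    return []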
def CMDis_fine(_args):
  """Just reports that the code doesn't throw.

  That ensures that the bot has minimal viability before transferring control
  to it. For now, it just imports bot_main and sends a ping request to the
  server, but later it'll check the config, etc.
  """
  # pylint: disable=unused-variable
  from bot_code import bot_main
  from config import bot_config

  resp = net.url_read(
      bot_main.get_config()['server'] + '/swarming/api/v1/bot/server_ping')
  if resp is None:
    logging.error('No response from server_ping')
    return 1
  return 0
def calculate_version(url):
  """Retrieves the swarm_bot code and returns the SHA-1 for it."""
  # Cannot use url_open() since zipfile requires .seek().
  archive = zipfile.ZipFile(StringIO.StringIO(net.url_read(url)))
  # See
  # https://code.google.com/p/swarming/source/browse/src/common/version.py?repo=swarming-server
  files = (
      'slave_machine.py',
      'swarm_bot/local_test_runner.py',
      'common/__init__.py',
      'common/swarm_constants.py',
      'common/version.py',
      'common/test_request_message.py',
      'common/url_helper.py',
  )
  d = hashlib.sha1()
  for f in files:
    d.update(archive.read(f))
  return d.hexdigest()
def get_test_keys(swarm_base_url, test_name):
  """Returns the Swarm test key for each shard of test_name."""
  key_data = urllib.urlencode([('name', test_name)])
  url = '%s/get_matching_test_cases?%s' % (swarm_base_url, key_data)

  for _ in net.retry_loop(max_attempts=net.URL_OPEN_MAX_ATTEMPTS):
    result = net.url_read(url, retry_404=True)
    if result is None:
      raise Failure(
          'Error: Unable to find any tests with the name, %s, on swarm server'
          % test_name)

    # TODO(maruel): Compare exact string.
    if 'No matching' in result:
      logging.warning('Unable to find any tests with the name, %s, on swarm '
                      'server' % test_name)
      continue
    return json.loads(result)

  raise Failure(
      'Error: Unable to find any tests with the name, %s, on swarm server'
      % test_name)
def _do_push(self, push_state, content):
  """Uploads isolated file to the URL.

  Used only for storing files, not for API calls. Can be overridden in
  subclasses.

  Args:
    push_state: an _IsolateServicePushState instance for the item being
        uploaded.
    content: an iterable that yields 'str' chunks.
  """
  # A cheesy way to avoid memcpy of (possibly huge) file, until streaming
  # upload support is implemented.
  if isinstance(content, list) and len(content) == 1:
    content = content[0]
  else:
    content = ''.join(content)

  # DB upload.
  if not push_state.finalize_url:
    url = '%s/%s' % (self._base_url, push_state.upload_url)
    content = base64.b64encode(content)
    data = {
        'upload_ticket': push_state.preupload_status['upload_ticket'],
        'content': content,
    }
    response = net.url_read_json(url=url, data=data)
    return response is not None and response['ok']

  # Upload to GS.
  url = push_state.upload_url
  response = net.url_read(
      content_type='application/octet-stream',
      data=content,
      method='PUT',
      headers={'Cache-Control': 'public, max-age=31536000'},
      url=url)
  return response is not None
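# A minimal sketch (assumed helper, inferred from _do_push() above) of the
# JSON body used on the DB-upload path: the content is base64-encoded and sent
# together with the preupload ticket.
def example_db_upload_payload(upload_ticket, content):
  return {
      'upload_ticket': upload_ticket,
      'content': base64.b64encode(content),
  }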
def _url_read_post(self, url, **kwargs):
  headers = (kwargs.pop('headers', None) or {}).copy()
  headers['X-XSRF-Token'] = self.token
  return net.url_read(url, headers=headers, **kwargs)
def retrieve_results(base_url, shard_index, task_key, timeout, should_stop,
                     output_collector):
  """Retrieves results for a single task_key.

  Returns:
    <result dict> on success.
    None on failure.
  """
  assert isinstance(timeout, float), timeout
  params = [('r', task_key)]
  result_url = '%s/get_result?%s' % (base_url, urllib.urlencode(params))
  started = now()
  deadline = started + timeout if timeout else None
  attempt = 0

  while not should_stop.is_set():
    attempt += 1

    # Waiting for too long -> give up.
    current_time = now()
    if deadline and current_time >= deadline:
      logging.error('retrieve_results(%s) timed out on attempt %d',
                    base_url, attempt)
      return None

    # Do not spin too fast. Spin faster at the beginning though.
    # Start with 1 sec delay and for each 30 sec of waiting add another second
    # of delay, until hitting 15 sec ceiling.
    if attempt > 1:
      max_delay = min(15, 1 + (current_time - started) / 30.0)
      delay = min(max_delay, deadline - current_time) if deadline else max_delay
      if delay > 0:
        logging.debug('Waiting %.1f sec before retrying', delay)
        should_stop.wait(delay)
        if should_stop.is_set():
          return None

    # Disable internal retries in net.url_read, since we are doing retries
    # ourselves. Do not use retry_404 so should_stop is polled more often.
    response = net.url_read(result_url, retry_404=False, retry_50x=False)

    # Request failed. Try again.
    if response is None:
      continue

    # Got some response, ensure it is JSON dict, retry if not.
    try:
      result = json.loads(response) or {}
      if not isinstance(result, dict):
        raise ValueError()
    except (ValueError, TypeError):
      logging.warning(
          'Received corrupted or invalid data for task_key %s, retrying: %r',
          task_key, response)
      continue

    # Swarming server uses non-empty 'output' value as a flag that task has
    # finished. How to wait for tasks that produce no output is a mystery.
    if result.get('output'):
      # Record the result, try to fetch attached output files (if any).
      if output_collector:
        # TODO(vadimsh): Respect |should_stop| and |deadline| when fetching.
        output_collector.process_shard_result(shard_index, result)
      return result
def process_manifest(
    swarming,
    isolate_server,
    namespace,
    isolated_hash,
    task_name,
    extra_args,
    shards,
    dimensions,
    env,
    working_dir,
    deadline,
    verbose,
    profile,
    priority,
):
  """Processes the manifest file and sends off the swarming task request."""
  try:
    manifest = Manifest(
        isolate_server=isolate_server,
        namespace=namespace,
        isolated_hash=isolated_hash,
        task_name=task_name,
        extra_args=extra_args,
        shards=shards,
        dimensions=dimensions,
        env=env,
        working_dir=working_dir,
        deadline=deadline,
        verbose=verbose,
        profile=profile,
        priority=priority,
    )
  except ValueError as e:
    tools.report_error("Unable to process %s: %s" % (task_name, e))
    return 1

  chromium_setup(manifest)

  logging.info("Zipping up files...")
  if not zip_and_upload(manifest):
    return 1

  logging.info("Server: %s", swarming)
  logging.info("Task name: %s", task_name)
  trigger_url = swarming + "/test"
  manifest_text = manifest.to_json()
  result = net.url_read(trigger_url, data={"request": manifest_text})
  if not result:
    tools.report_error("Failed to trigger task %s\n%s" % (task_name, trigger_url))
    return 1
  try:
    json.loads(result)
  except (ValueError, TypeError) as e:
    msg = "\n".join(
        (
            "Failed to trigger task %s" % task_name,
            "Manifest: %s" % manifest_text,
            "Bad response: %s" % result,
            str(e),
        )
    )
    tools.report_error(msg)
    return 1
  return 0
def calculate_version(url):
  """Retrieves the swarm_bot code and returns the SHA-1 for it."""
  # Cannot use url_open() since zipfile requires .seek().
  return generate_version(StringIO.StringIO(net.url_read(url)))
def _run(self):
  """Polls the server and fake execution."""
  try:
    self._progress.update_item('%d alive' % self._index, bots=1)
    while True:
      if self._kill_event.is_set():
        return
      data = {'attributes': json.dumps(self._attributes)}
      request = net.url_open(self._swarming + '/poll_for_test', data=data)
      if request is None:
        self._events.put('poll_for_test_empty')
        continue
      start = time.time()
      try:
        manifest = json.load(request)
      except ValueError:
        self._progress.update_item('Failed to poll')
        self._events.put('poll_for_test_invalid')
        continue

      commands = [c['function'] for c in manifest.get('commands', [])]
      if not commands:
        # Nothing to run.
        self._events.put('sleep')
        time.sleep(manifest['come_back'])
        continue

      if commands == ['UpdateSlave']:
        # Calculate the proper SHA-1 and loop again.
        # This could happen if the Swarming server is upgraded while this
        # script runs.
        self._attributes['version'] = calculate_version(
            manifest['commands'][0]['args'])
        self._events.put('update_slave')
        continue

      if commands != ['RunManifest']:
        self._progress.update_item(
            'Unexpected RPC call %s\n%s' % (commands, manifest))
        self._events.put('unknown_rpc')
        break

      store_cmd = manifest['commands'][0]
      if not isinstance(store_cmd['args'], unicode):
        self._progress.update_item('Unexpected RPC manifest\n%s' % manifest)
        self._events.put('unknown_args')
        break

      result_url = manifest['result_url']
      test_run = json.loads(store_cmd['args'])
      if result_url != test_run['result_url']:
        self._progress.update_item(
            'Unexpected result url: %s != %s' %
            (result_url, test_run['result_url']))
        self._events.put('invalid_result_url')
        break

      ping_url = test_run['ping_url']
      ping_delay = test_run['ping_delay']
      self._progress.update_item('%d processing' % self._index, processing=1)

      # Fake activity and send pings as requested.
      while True:
        remaining = max(0, (start + self._duration) - time.time())
        if remaining > ping_delay:
          # Include empty data to ensure the request is a POST request.
          result = net.url_read(ping_url, data={})
          assert result == 'Success.', result
          remaining = max(0, (start + self._duration) - time.time())
        if not remaining:
          break
        time.sleep(remaining)

      # In the old API, r=<task_id>&id=<bot_id> is passed as the url.
      data = {
          'o': TASK_OUTPUT,
          'x': '0',
      }
      result = net.url_read(manifest['result_url'], data=data)
      self._progress.update_item(
          '%d processed' % self._index, processing=-1, processed=1)
      if not result:
        self._events.put('result_url_fail')
      else:
        assert result == 'Successfully update the runner results.', result
        self._events.put(time.time() - start)
  finally:
    try:
      # Unregister itself. Otherwise the server will have tons of fake slaves
      # that the admin will have to remove manually.
      response = net.url_open(
          self._swarming + '/delete_machine_stats',
          data=[('r', self._bot_id)])
      if not response:
        self._events.put('failed_unregister')
      else:
        response.read()
    finally:
      self._progress.update_item('%d quit' % self._index, bots=-1)
def run_bot(arg_error):
  """Runs the bot until it reboots, self-updates, or a signal is received.

  When a signal is received, simply exit.
  """
  quit_bit = threading.Event()
  def handler(sig, _):
    logging.info('Got signal %s', sig)
    quit_bit.set()

  # TODO(maruel): Set quit_bit when stdin is closed on Windows.

  with subprocess42.set_signal_handler(subprocess42.STOP_SIGNALS, handler):
    config = get_config()
    try:
      # First thing is to get an arbitrary url. This also ensures the network
      # is up and running, which is necessary before trying to get the FQDN
      # below.
      resp = net.url_read(
          config['server'] + '/swarming/api/v1/bot/server_ping')
      if resp is None:
        logging.error('No response from server_ping')
    except Exception as e:
      # url_read() already traps pretty much every exception. This except
      # clause is kept there "just in case".
      logging.exception('server_ping threw')

    if quit_bit.is_set():
      logging.info('Early quit 1')
      return 0

    # If this fails, there's hardly anything that can be done, the bot can't
    # even get to the point to be able to self-update.
    botobj = get_bot()
    resp = net.url_read_json(
        botobj.server + '/swarming/api/v1/bot/handshake',
        data=botobj._attributes)
    if not resp:
      logging.error('Failed to contact for handshake')
    else:
      logging.info('Connected to %s', resp.get('server_version'))
      if resp.get('bot_version') != botobj._attributes['version']:
        logging.warning(
            'Found out we\'ll need to update: server said %s; we\'re %s',
            resp.get('bot_version'), botobj._attributes['version'])

    if arg_error:
      botobj.post_error('Bootstrapping error: %s' % arg_error)

    if quit_bit.is_set():
      logging.info('Early quit 2')
      return 0

    clean_isolated_cache(botobj)
    call_hook(botobj, 'on_bot_startup')

    if quit_bit.is_set():
      logging.info('Early quit 3')
      return 0

    # This environment variable is accessible to the tasks executed by this
    # bot.
    os.environ['SWARMING_BOT_ID'] = botobj.id.encode('utf-8')

    # Remove the 'work' directory if present, as not removing it may cause the
    # bot to stay quarantined and not be able to get out of this state.
    work_dir = os.path.join(botobj.base_dir, 'work')
    try:
      if os.path.isdir(work_dir):
        file_path.rmtree(work_dir)
    except Exception as e:
      botobj.post_error('Failed to remove work: %s' % e)

    consecutive_sleeps = 0
    while not quit_bit.is_set():
      try:
        botobj.update_dimensions(get_dimensions(botobj))
        botobj.update_state(get_state(botobj, consecutive_sleeps))
        did_something = poll_server(botobj, quit_bit)
        if did_something:
          consecutive_sleeps = 0
        else:
          consecutive_sleeps += 1
      except Exception as e:
        logging.exception('poll_server failed')
        msg = '%s\n%s' % (e, traceback.format_exc()[-2048:])
        botobj.post_error(msg)
        consecutive_sleeps = 0
    logging.info('Quitting')

    # Tell the server we are going away.
    botobj.post_event('bot_shutdown', 'Signal was received')
    botobj.cancel_all_timers()
    return 0
def _run(self):
  try:
    self._progress.update_item('%d alive' % self._index, bots=1)
    while True:
      if self._kill_event.is_set():
        return
      data = {'attributes': json.dumps(self._attributes)}
      request = net.url_open(self._swarming + '/poll_for_test', data=data)
      if request is None:
        self._events.put('poll_for_test_empty')
        continue
      start = time.time()
      try:
        manifest = json.load(request)
      except ValueError:
        self._progress.update_item('Failed to poll')
        self._events.put('poll_for_test_invalid')
        continue

      commands = [c['function'] for c in manifest.get('commands', [])]
      if not commands:
        # Nothing to run.
        self._events.put('sleep')
        time.sleep(manifest['come_back'])
        continue

      if commands == ['UpdateSlave']:
        # Calculate the proper SHA-1 and loop again.
        # This could happen if the Swarming server is upgraded while this
        # script runs.
        self._attributes['version'] = calculate_version(
            manifest['commands'][0]['args'])
        self._events.put('update_slave')
        continue

      if commands != ['StoreFiles', 'RunCommands']:
        self._progress.update_item(
            'Unexpected RPC call %s\n%s' % (commands, manifest))
        self._events.put('unknown_rpc')
        break

      # The normal way Swarming works is that it 'stores' a test_run.swarm
      # file and then defer control to swarm_bot/local_test_runner.py.
      store_cmd = manifest['commands'][0]
      assert len(store_cmd['args']) == 1, store_cmd['args']
      filepath, filename, test_run_content = store_cmd['args'][0]
      assert filepath == ''
      assert filename == 'test_run.swarm'
      assert 'local_test_runner.py' in manifest['commands'][1]['args'][0], (
          manifest['commands'][1])
      result_url = manifest['result_url']
      test_run = json.loads(test_run_content)
      assert result_url == test_run['result_url']
      ping_url = test_run['ping_url']
      ping_delay = test_run['ping_delay']
      self._progress.update_item('%d processing' % self._index, processing=1)

      # Fake activity and send pings as requested.
      while True:
        remaining = max(0, (start + self._duration) - time.time())
        if remaining > ping_delay:
          # Include empty data to ensure the request is a POST request.
          result = net.url_read(ping_url, data={})
          assert result == 'Success.', result
          remaining = max(0, (start + self._duration) - time.time())
        if not remaining:
          break
        time.sleep(remaining)

      data = {
          'c': test_run['configuration']['config_name'],
          'n': test_run['test_run_name'],
          'o': False,
          'result_output': TASK_OUTPUT,
          's': True,
          'x': '0',
      }
      result = net.url_read(manifest['result_url'], data=data)
      self._progress.update_item(
          '%d processed' % self._index, processing=-1, processed=1)
      if not result:
        self._events.put('result_url_fail')
      else:
        assert result == 'Successfully update the runner results.', result
        self._events.put(time.time() - start)
  finally:
    try:
      # Unregister itself. Otherwise the server will have tons of fake slaves
      # that the admin will have to remove manually.
      response = net.url_open(
          self._swarming + '/delete_machine_stats',
          data=[('r', self._machine_id)])
      if not response:
        self._events.put('failed_unregister')
      else:
        response.read()
    finally:
      self._progress.update_item('%d quit' % self._index, bots=-1)
def ping(self):
  """Unlike all other methods, this one isn't authenticated."""
  resp = net.url_read(self._server + '/swarming/api/v1/bot/server_ping')
  if resp is None:
    logging.error('No response from server_ping')
def process_manifest(
    file_hash_or_isolated, test_name, shards, test_filter, slave_os,
    working_dir, isolate_server, swarming, verbose, profile, priority, algo):
  """Process the manifest file and send off the swarm test request.

  Optionally archives an .isolated file.
  """
  if file_hash_or_isolated.endswith('.isolated'):
    file_hash = archive(
        file_hash_or_isolated, isolate_server, slave_os, algo, verbose)
    if not file_hash:
      print >> sys.stderr, 'Archival failure %s' % file_hash_or_isolated
      return 1
  elif isolateserver.is_valid_hash(file_hash_or_isolated, algo):
    file_hash = file_hash_or_isolated
  else:
    print >> sys.stderr, 'Invalid hash %s' % file_hash_or_isolated
    return 1

  try:
    manifest = Manifest(
        file_hash,
        test_name,
        shards,
        test_filter,
        PLATFORM_MAPPING_SWARMING[slave_os],
        working_dir,
        isolate_server,
        verbose,
        profile,
        priority,
        algo)
  except ValueError as e:
    print >> sys.stderr, 'Unable to process %s: %s' % (test_name, e)
    return 1

  chromium_setup(manifest)

  # Zip up relevant files.
  print('Zipping up files...')
  if not manifest.zip_and_upload():
    return 1

  # Send test requests off to swarm.
  print('Sending test requests to swarm.')
  print('Server: %s' % swarming)
  print('Job name: %s' % test_name)
  test_url = swarming + '/test'
  manifest_text = manifest.to_json()
  result = net.url_read(test_url, data={'request': manifest_text})
  if not result:
    print >> sys.stderr, 'Failed to send test for %s\n%s' % (
        test_name, test_url)
    return 1
  try:
    json.loads(result)
  except (ValueError, TypeError) as e:
    print >> sys.stderr, 'Failed to send test for %s' % test_name
    print >> sys.stderr, 'Manifest: %s' % manifest_text
    print >> sys.stderr, 'Bad response: %s' % result
    print >> sys.stderr, str(e)
    return 1
  return 0
def trigger_task(swarming_url, dimensions, sleep_time, output_size, progress,
                 unique, timeout, index):
  """Triggers a Swarming job and collects results.

  Returns the total amount of time to run a task remotely, including all the
  overhead.
  """
  name = 'load-test-%d-%s' % (index, unique)
  start = time.time()

  logging.info('trigger')
  manifest = swarming.Manifest(
      isolate_server='http://localhost:1',
      namespace='dummy-isolate',
      isolated_hash=1,
      task_name=name,
      extra_args=[],
      env={},
      dimensions=dimensions,
      deadline=int(timeout - TIMEOUT_OVERHEAD),
      verbose=False,
      profile=False,
      priority=100)
  cmd = [
      'python', '-c',
      'import time; print(\'1\'*%s); time.sleep(%d); print(\'Back\')' %
      (output_size, sleep_time)
  ]
  manifest.add_task('echo stuff', cmd)
  data = {'request': manifest.to_json()}
  response = net.url_read(swarming_url + '/test', data=data)
  if response is None:
    # Failed to trigger. Return a failure.
    return 'failed_trigger'

  result = json.loads(response)
  # Old API uses hardcoded config name. New API doesn't have concept of config
  # name so it uses the task name. Ignore this detail.
  test_keys = []
  for key in result['test_keys']:
    key.pop('config_name')
    test_keys.append(key.pop('test_key'))
    assert re.match('[0-9a-f]+', test_keys[-1]), test_keys
  expected = {
      u'priority': 100,
      u'test_case_name': unicode(name),
      u'test_keys': [{
          u'num_instances': 1,
          u'instance_index': 0,
      }],
  }
  assert result == expected, '\n%s\n%s' % (result, expected)

  progress.update_item('%5d' % index, processing=1)
  try:
    logging.info('collect')
    new_test_keys = swarming.get_task_keys(swarming_url, name)
    if not new_test_keys:
      return 'no_test_keys'
    assert test_keys == new_test_keys, (test_keys, new_test_keys)

    out = [
        output for _index, output in swarming.yield_results(
            swarming_url, test_keys, timeout, None, False, None, False, True)
    ]
    if not out:
      return 'no_result'
    for item in out:
      item.pop('machine_tag')
      item.pop('machine_id')
      # TODO(maruel): Assert output even when run on a real bot.
      _out_actual = item.pop('output')
      # assert out_actual == swarming_load_test_bot.TASK_OUTPUT, out_actual
    expected = [{
        u'config_instance_index': 0,
        u'exit_codes': u'0',
        u'num_config_instances': 1,
    }]
    assert out == expected, '\n%s\n%s' % (out, expected)
    return time.time() - start
  finally:
    progress.update_item('%5d - done' % index, processing=-1, processed=1)
def _run(self):
  """Polls the server and fake execution."""
  try:
    self._progress.update_item('%d alive' % self._index, bots=1)
    while True:
      if self._kill_event.is_set():
        return
      data = {'attributes': json.dumps(self._attributes)}
      request = net.url_read(self._swarming + '/poll_for_test', data=data)
      if request is None:
        self._events.put('poll_for_test_empty')
        continue
      start = time.time()
      try:
        manifest = json.loads(request)
      except ValueError:
        self._progress.update_item('Failed to poll')
        self._events.put('poll_for_test_invalid')
        continue

      commands = [c['function'] for c in manifest.get('commands', [])]
      if not commands:
        # Nothing to run.
        self._events.put('sleep')
        time.sleep(manifest['come_back'])
        continue

      if commands == ['UpdateSlave']:
        # Calculate the proper SHA-1 and loop again.
        # This could happen if the Swarming server is upgraded while this
        # script runs.
        self._attributes['version'] = calculate_version(
            manifest['commands'][0]['args'])
        self._events.put('update_slave')
        continue

      if commands != ['RunManifest']:
        self._progress.update_item(
            'Unexpected RPC call %s\n%s' % (commands, manifest))
        self._events.put('unknown_rpc')
        break

      store_cmd = manifest['commands'][0]
      if not isinstance(store_cmd['args'], unicode):
        self._progress.update_item('Unexpected RPC manifest\n%s' % manifest)
        self._events.put('unknown_args')
        break

      result_url = manifest['result_url']
      test_run = json.loads(store_cmd['args'])
      if result_url != test_run['result_url']:
        self._progress.update_item(
            'Unexpected result url: %s != %s' %
            (result_url, test_run['result_url']))
        self._events.put('invalid_result_url')
        break

      ping_url = test_run['ping_url']
      ping_delay = test_run['ping_delay']
      self._progress.update_item('%d processing' % self._index, processing=1)

      # Fake activity and send pings as requested.
      while True:
        remaining = max(0, (start + self._duration) - time.time())
        if remaining > ping_delay:
          # Include empty data to ensure the request is a POST request.
          result = net.url_read(ping_url, data={})
          assert result == 'Success.', result
          remaining = max(0, (start + self._duration) - time.time())
        if not remaining:
          break
        time.sleep(remaining)

      # In the old API, r=<task_id>&id=<bot_id> is passed as the url.
      data = {
          'o': TASK_OUTPUT,
          'x': '0',
      }
      result = net.url_read(manifest['result_url'], data=data)
      self._progress.update_item(
          '%d processed' % self._index, processing=-1, processed=1)
      if not result:
        self._events.put('result_url_fail')
      else:
        assert result == 'Successfully update the runner results.', result
        self._events.put(time.time() - start)
  finally:
    try:
      # Unregister itself. Otherwise the server will have tons of fake bots
      # that the admin will have to remove manually.
      response = net.url_read(
          self._swarming + '/delete_machine_stats',
          data=[('r', self._bot_id)])
      if response is None:
        self._events.put('failed_unregister')
    finally:
      self._progress.update_item('%d quit' % self._index, bots=-1)
def run_bot(arg_error):
  """Runs the bot until it reboots, self-updates, or a signal is received.

  When a signal is received, simply exit.
  """
  quit_bit = threading.Event()
  def handler(sig, _):
    logging.info('Got signal %s', sig)
    quit_bit.set()

  # TODO(maruel): Set quit_bit when stdin is closed on Windows.

  with subprocess42.set_signal_handler(subprocess42.STOP_SIGNALS, handler):
    config = get_config()
    try:
      # First thing is to get an arbitrary url. This also ensures the network
      # is up and running, which is necessary before trying to get the FQDN
      # below.
      resp = net.url_read(
          config['server'] + '/swarming/api/v1/bot/server_ping')
      if resp is None:
        logging.error('No response from server_ping')
    except Exception as e:
      # url_read() already traps pretty much every exception. This except
      # clause is kept there "just in case".
      logging.exception('server_ping threw')

    if quit_bit.is_set():
      logging.info('Early quit 1')
      return 0

    # If this fails, there's hardly anything that can be done, the bot can't
    # even get to the point to be able to self-update.
    botobj = get_bot()
    resp = net.url_read_json(
        botobj.server + '/swarming/api/v1/bot/handshake',
        data=botobj._attributes)
    if not resp:
      logging.error('Failed to contact for handshake')
    else:
      logging.info('Connected to %s', resp.get('server_version'))
      if resp.get('bot_version') != botobj._attributes['version']:
        logging.warning(
            'Found out we\'ll need to update: server said %s; we\'re %s',
            resp.get('bot_version'), botobj._attributes['version'])

    if arg_error:
      botobj.post_error('Bootstrapping error: %s' % arg_error)

    if quit_bit.is_set():
      logging.info('Early quit 2')
      return 0

    clean_isolated_cache(botobj)
    call_hook(botobj, 'on_bot_startup')

    if quit_bit.is_set():
      logging.info('Early quit 3')
      return 0

    # This environment variable is accessible to the tasks executed by this
    # bot.
    os.environ['SWARMING_BOT_ID'] = botobj.id.encode('utf-8')

    # Remove the 'work' directory if present, as not removing it may cause the
    # bot to stay quarantined and not be able to get out of this state.
    work_dir = os.path.join(botobj.base_dir, 'work')
    try:
      if os.path.isdir(work_dir):
        file_path.rmtree(work_dir)
    except Exception as e:
      botobj.post_error('Failed to remove work: %s' % e)

    consecutive_sleeps = 0
    while not quit_bit.is_set():
      try:
        botobj.update_dimensions(get_dimensions(botobj))
        botobj.update_state(get_state(botobj, consecutive_sleeps))
        did_something = poll_server(botobj, quit_bit)
        if did_something:
          consecutive_sleeps = 0
        else:
          consecutive_sleeps += 1
      except Exception as e:
        logging.exception('poll_server failed')
        msg = '%s\n%s' % (e, traceback.format_exc()[-2048:])
        botobj.post_error(msg)
        consecutive_sleeps = 0
    logging.info('Quitting')

    # Tell the server we are going away.
    botobj.post_event('bot_shutdown', 'Signal was received')
    botobj.cancel_all_timers()
    return 0