def main():
  init_logging()
  logging.info('Worker started')
  signal.signal(signal.SIGTERM, sig_handler)
  signal.signal(signal.SIGINT, sig_handler)

  while not sigterm.is_set():
    logging.debug('Starting poll cycle')
    try:
      worker_loop()
      req('PUT',
          '%s/workers/%s.json' % (DB, WORKER_NAME),
          body=make_worker_obj('IDLE'))
    except Exception:
      # A failed poll cycle must not kill the worker; log and keep polling.
      logging.error('Exception in worker loop:\n%s', traceback.format_exc())
    if sigterm.is_set():
      break
    # Synchronize sleeping with the wall clock, so that all VMs wake up at
    # the same time. See comment on distributing load above in this file.
    poll_time_sec = 5
    time.sleep(poll_time_sec - (time.time() % poll_time_sec))

  # The use case here is the VM being terminated by the GCE infrastructure.
  # We mark the worker as terminated and the job as cancelled so we don't wait
  # forever for it.
  logging.warning('Exiting the worker loop, got signal: %s', sigterm.is_set())
  req('PUT',
      '%s/workers/%s.json' % (DB, WORKER_NAME),
      body=make_worker_obj('TERMINATED'))

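# A minimal sketch of the module-level pieces main() relies on, inferred from
# how they are used above: |sigterm| is assumed to be a threading.Event that
# the signal handler sets so the poll loop exits on its next check. The real
# definitions live outside this excerpt and may differ.
import logging
import threading

sigterm = threading.Event()

def sig_handler(sig, _frame):
  logging.warning('Worker got signal %s, shutting down gracefully', sig)
  sigterm.set()  # Checked by the while-loop in main() on every cycle.
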
def main():
  init_logging()
  logging.info('Worker started')
  signal.signal(signal.SIGTERM, sig_handler)
  signal.signal(signal.SIGINT, sig_handler)

  while not sigterm.is_set():
    logging.debug('Starting poll cycle')
    try:
      worker_loop()
      req('PUT',
          '%s/workers/%s.json' % (DB, WORKER_NAME),
          body=make_worker_obj('IDLE'))
    except Exception:
      logging.error('Exception in worker loop:\n%s', traceback.format_exc())
    if sigterm.is_set():
      break
    time.sleep(5)

  # The use case here is the VM being terminated by the GCE infrastructure.
  # We mark the worker as terminated and the job as cancelled so we don't wait
  # forever for it.
  logging.warning('Exiting the worker loop, got signal: %s', sigterm.is_set())
  req('PUT',
      '%s/workers/%s.json' % (DB, WORKER_NAME),
      body=make_worker_obj('TERMINATED'))

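# A minimal sketch of the req() helper used above, assuming DB points at a
# Firebase-style REST endpoint that accepts JSON bodies. The real helper is
# defined outside this excerpt and likely also handles authentication and
# retries; this version is illustrative only.
import json
import urllib.request

def req(method, url, body=None):
  data = json.dumps(body).encode('utf-8') if body is not None else None
  request = urllib.request.Request(url, data=data, method=method)
  request.add_header('Content-Type', 'application/json')
  with urllib.request.urlopen(request) as resp:
    return json.loads(resp.read().decode('utf-8'))
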
def main():
  init_logging()
  signal.alarm(WATCHDOG_SEC)
  mimetypes.add_type('application/wasm', '.wasm')

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--rm',
      action='store_true',
      help='Removes the directory after uploading')
  parser.add_argument(
      '--job-id',
      type=str,
      required=True,
      help='The Perfetto CI job ID to tie this upload to')
  args = parser.parse_args()
  job_id = args.job_id
  dirpath = os.path.join(os.getenv('ARTIFACTS_DIR', default=os.curdir), job_id)
  if not os.path.isdir(dirpath):
    logging.error('Directory not found: %s', dirpath)
    return 1

  total_size = 0
  uploads = 0
  failures = 0
  files = list_files(dirpath)
  pool = ThreadPool(processes=10)
  for upl_size in pool.imap_unordered(upload_one_file_with_retries, files):
    uploads += 1 if upl_size >= 0 else 0
    failures += 1 if upl_size < 0 else 0
    total_size += max(upl_size, 0)
  logging.info('Uploaded artifacts for %s: %d files, %d failures, %d KB',
               job_id, uploads, failures, total_size / 1e3)
  scan_and_upload_perf_folder(job_id, dirpath)
  if args.rm:
    subprocess.call(['sudo', 'rm', '-rf', dirpath])
  return 0

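# A hypothetical sketch of upload_one_file_with_retries(), inferred from how
# main() consumes its return value: the uploaded size in bytes (>= 0) on
# success, a negative value on failure. The real implementation and the
# upload_one_file() helper it wraps are outside this excerpt.
import logging
import time

def upload_one_file_with_retries(fpath, attempts=3):
  for attempt in range(attempts):
    try:
      return upload_one_file(fpath)  # Assumed to return the size in bytes.
    except OSError as ex:
      logging.warning('Upload of %s failed (%s), retrying', fpath, ex)
      time.sleep(2 ** attempt)  # Exponential backoff between attempts.
  return -1  # Counted as a failure by the accounting loop in main().
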
def main():
  init_logging()
  mimetypes.add_type('application/wasm', '.wasm')
  logging.info('Artifacts uploader started')
  pool = ThreadPool(processes=32)
  while True:
    scan_and_upload_artifacts(pool, remove_after_upload='--rm' in sys.argv)
    time.sleep(RESCAN_PERIOD_SEC)

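# A minimal sketch of scan_and_upload_artifacts(), assuming the contract
# implied by the loop above: walk ARTIFACTS_DIR, fan the files out to the
# thread pool, and optionally delete each job directory once uploaded. All
# names below are illustrative; the real function is not part of this excerpt.
import os
import shutil

def scan_and_upload_artifacts(pool, remove_after_upload=False):
  root = os.getenv('ARTIFACTS_DIR', os.curdir)
  for job_id in os.listdir(root):
    dirpath = os.path.join(root, job_id)
    if not os.path.isdir(dirpath):
      continue
    files = [os.path.join(dp, f) for dp, _, fs in os.walk(dirpath) for f in fs]
    pool.map(upload_one_file_with_retries, files)  # Blocks until all done.
    if remove_after_upload:
      shutil.rmtree(dirpath, ignore_errors=True)
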
def main(argv):
  init_logging()
  if len(argv) != 2:
    print('Usage: %s job_id' % argv[0])
    return 1

  job_id = argv[1]
  res = 42

  # The container name will be worker-N-sandbox.
  container = socket.gethostname() + '-sandbox'

  # Remove stale jobs, if any.
  subprocess.call(['sudo', 'docker', 'rm', '-f', container])

  q = queue.Queue()

  # Unlike in native programs, signal handlers in Python aren't truly async:
  # they are queued and dispatched on the main thread. Hence we need to keep
  # the main thread responsive to react to signals. This is required to handle
  # timeouts and the graceful termination of the worker container, which
  # dispatches a SIGTERM on stop.
  def sig_handler(sig, _):
    logging.warning('Job runner got signal %s, terminating job %s', sig,
                    job_id)
    subprocess.call(['sudo', 'docker', 'kill', container])
    os._exit(1)  # sys.exit throws a SystemExit exception, _exit really exits.

  signal.signal(signal.SIGTERM, sig_handler)

  log_thd = threading.Thread(target=log_thread, args=(job_id, q))
  log_thd.start()

  # SYS_PTRACE is required for gtest death tests and LSan.
  cmd = [
      'sudo', 'docker', 'run', '--name', container, '--hostname', container,
      '--cap-add', 'SYS_PTRACE', '--rm', '--env',
      'PERFETTO_TEST_JOB=%s' % job_id, '--tmpfs', '/tmp:exec'
  ]

  # Propagate environment variables coming from the job config.
  for kv in [kv for kv in os.environ.items() if kv[0].startswith('PERFETTO_')]:
    cmd += ['--env', '%s=%s' % kv]

  # Use the tmpfs mount created by gce-startup-script.sh, if present. The
  # problem is that Docker doesn't allow to both override the tmpfs-size and
  # prevent the "-o noexec". In turn, the default tmpfs-size depends on the
  # host's physical memory size.
  if os.getenv('SANDBOX_TMP'):
    cmd += ['-v', '%s:/ci/ramdisk' % os.getenv('SANDBOX_TMP')]
  else:
    cmd += ['--tmpfs', '/ci/ramdisk:exec']

  # Rationale for the conditional branches below: when running in the real GCE
  # environment, gce-startup-script.sh mounts these directories in the right
  # locations, so that they are shared between all workers. When running the
  # worker container outside of GCE (i.e. for local testing) we leave them
  # empty; the VOLUME directive in the Dockerfile will cause docker to
  # automatically mount a scratch volume for them. This allows the CI
  # containers to be tested without doing the work of gce-startup-script.sh.
  if os.getenv('SHARED_WORKER_CACHE'):
    cmd += ['--volume=%s:/ci/cache' % os.getenv('SHARED_WORKER_CACHE')]

  artifacts_dir = None
  if os.getenv('ARTIFACTS_DIR'):
    artifacts_dir = os.path.join(os.getenv('ARTIFACTS_DIR'), job_id)
    subprocess.call(['sudo', 'rm', '-rf', artifacts_dir])
    os.mkdir(artifacts_dir)
    cmd += ['--volume=%s:/ci/artifacts' % artifacts_dir]

  cmd += os.getenv('SANDBOX_NETWORK_ARGS', '').split()
  cmd += [SANDBOX_IMG]

  logging.info('Starting %s', ' '.join(cmd))
  proc = subprocess.Popen(
      cmd,
      stdin=open(os.devnull),
      stdout=subprocess.PIPE,
      stderr=subprocess.STDOUT,
      bufsize=65536)
  stdout = ''
  tstart = time.time()
  while True:
    ms_elapsed = int((time.time() - tstart) * 1000)
    stdout += read_nonblock(proc.stdout)

    # stdout/err pipes are not atomic w.r.t. '\n'. Extract whole lines into
    # |lines| and keep the last partial line (-1) in the |stdout| buffer.
    lines = stdout.split('\n')
    stdout = lines[-1]
    lines = lines[:-1]

    # Each line gets a key of the form <time-from-start>-<counter>. |counter|
    # is relative to the batch and is only used to disambiguate lines fetched
    # at the same time, preserving their ordering.
    batch = {}
    for counter, line in enumerate(lines):
      batch['%06x-%04x' % (ms_elapsed, counter)] = line
    if batch:
      q.put(batch)

    if proc.poll() is not None:
      res = proc.returncode
      logging.info('Job subprocess terminated with code %s', res)
      break

    # Large sleeps favour batching in the log uploader.
    # Small sleeps favour responsiveness of the signal handler.
    time.sleep(1)

  q.put(None)  # EOF marker.
  log_thd.join()

  if artifacts_dir:
    artifacts_uploader = os.path.join(CUR_DIR, 'artifacts_uploader.py')
    cmd = ['setsid', artifacts_uploader, '--job-id=%s' % job_id, '--rm']
    subprocess.call(cmd)

  return res

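# A minimal sketch of the read_nonblock() helper used in the polling loop,
# assuming it drains whatever is currently buffered in the pipe and returns ''
# when no data is available, so the loop never blocks and stays responsive to
# signals. The real helper is outside this excerpt; this version flips the
# pipe into non-blocking mode via fcntl.
import fcntl
import os

def read_nonblock(fileobj):
  fd = fileobj.fileno()
  flags = fcntl.fcntl(fd, fcntl.F_GETFL)
  fcntl.fcntl(fd, fcntl.F_SETFL, flags | os.O_NONBLOCK)
  try:
    data = os.read(fd, 65536)
  except BlockingIOError:  # EAGAIN: nothing to read right now.
    return ''
  return data.decode('utf-8', errors='replace')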