def control_send_cmd(worker_name, cmd, control_queue, **kwargs):
    body = {"control": {"cmd": cmd}}
    if kwargs:
        body["control"]["args"] = kwargs

    job_id = Job.add("control::worker::%s" % worker_name, body, [control_queue])
    return job_id
def queue_job(jobs_set, queue, body, control_queues):
    timeout = body.get("options", {}).get("timeout")
    if timeout:
        job_id = Job.add(queue, body, control_queues, retry=timeout)
    else:
        job_id = Job.add(queue, body, control_queues)

    parent = body.get("parent")
    if parent:
        body = {
            "parent": parent,
            "subjob": job_id,
            "unique": os.environ.get("DWQ_JOB_UNIQUE"),
        }
        Job.add(control_queues[0], body, None)
    else:
        jobs_set.add(job_id)

    return job_id
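# Illustrative sketch only (not part of the dwq API): the approximate shape of a
# job body as consumed by queue_job() above and by the workers further below.
# The field names are taken from the accesses in this file; the concrete body is
# normally built by create_body(), which is defined elsewhere. All values here
# are hypothetical placeholders.
EXAMPLE_JOB_BODY = {
    "repo": "https://example.org/some/repo.git",    # hypothetical repository URL
    "commit": "deadbeef",                           # hypothetical commit hash
    "command": "make test",                         # hypothetical command to run
    "options": {"timeout": 300, "max_retries": 2},  # optional
    "env": {"EXTRA_VAR": "1"},                      # optional extra environment
    "control_queues": ["control::0.123"],           # queue(s) for result/status jobs
    # "parent": "<parent job id>",                  # set only for subjobs
}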
def main():
    Disque.connect(["localhost:7711"])

    # fall back to the "default" queue if no queue was given on the command line
    queues = sys.argv[1] if len(sys.argv) > 1 else "default"

    try:
        while True:
            jobs = Job.wait(queues, count=16)
            for job in jobs:
                print(json.dumps(job, sort_keys=True, indent=4))
    except KeyboardInterrupt:
        pass
def live():
    global result_dict
    Disque.connect(["localhost:7711"])

    http_root = os.environ.get("CI_BUILD_HTTP_ROOT", "")

    queue = sys.argv[1]
    prnum = sys.argv[2]

    last_update = 0
    maxfailed = 20
    failed_jobs = []
    nfailed = 0

    try:
        post_status({"status": "setting up build"}, prnum, [], "")
        while True:
            _list = Job.wait(queue, count=16)
            for _status in _list:
                job = _status.get("job")
                if job:
                    filename = save_job_result(job)
                    if filename and not has_passed(job):
                        nfailed += 1
                        jobname = job_name(job)
                        if jobname == "static_tests":
                            failed_jobs = [(filename, jobname)] + failed_jobs
                        elif nfailed <= maxfailed:
                            failed_jobs.append((filename, job_name(job)))
                        failed_jobs = failed_jobs[:maxfailed]
                        if nfailed > maxfailed:
                            failed_jobs.append(
                                (None, "(%s more failed jobs)" % (nfailed - maxfailed))
                            )

                if _status.get("status", "") == "done":
                    post_status(None, prnum, failed_jobs, http_root)
                    return

                now = time.time()
                if now - last_update > 0.5:
                    post_status(_status, prnum, failed_jobs, http_root)
                    last_update = now
    except KeyboardInterrupt:
        pass
def control_cmd(nodes, cmd, **kwargs):
    control_queue = "control::%s" % str(random.random())

    job_ids = []
    for node in nodes:
        print('dwqm: sending "%s" command to node "%s"' % (cmd, node))
        job_id = control_send_cmd(node, cmd, control_queue, **kwargs)
        job_ids.append(job_id)

    while job_ids:
        for job in Job.wait(control_queue, count=len(job_ids)):
            job_id = job["job_id"]
            job_ids.remove(job_id)
            try:
                print("%s:" % job["result"]["worker"], job["result"].get("output"))
            except KeyError:
                pass
def drain(queues):
    if not queues:
        print("dwqm: drain: no queues given.")
        sys.exit(1)

    Disque.connect(["localhost:7711"])
    disque = Disque.get()

    try:
        while True:
            jobs = Job.get(queues, count=1024, nohang=True)
            if not jobs:
                return

            job_ids = []
            for job in jobs:
                job_ids.append(job.job_id)

            disque.fast_ack(*job_ids)
    except KeyboardInterrupt:
        pass
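# Hedged usage sketch (not part of dwqm itself): how the management helpers above
# could be combined. The node names, the "pause" command string, and the queue
# name are hypothetical placeholders, not an exhaustive list of supported
# commands.
def example_control_session():
    Disque.connect(["localhost:7711"])
    # send a (hypothetical) "pause" control command to two workers and wait
    # for their replies:
    control_cmd(["worker1", "worker2"], "pause")
    # drop everything currently queued on a (hypothetical) scratch queue:
    drain(["scratch-queue"])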
def worker(n, cmd_server_pool, gitjobdir, args, working_set):
    global active_event
    global shutdown

    print("worker %2i: started" % n)

    buildnum = 0
    while not shutdown:
        try:
            if not shutdown and not Disque.connected():
                time.sleep(1)
                continue

            while not shutdown:
                active_event.wait()
                jobs = Job.get(args.queues)
                for job in jobs:
                    if shutdown:
                        job.nack()
                        continue

                    if job.additional_deliveries > 2:
                        error = "too many deliveries (usual reason: timeout)"
                        vprint(2, "worker %2i: %s" % (n, error))
                        job.done(
                            {
                                "status": "error",
                                "output": "dwqw: %s\n" % error,
                                "worker": args.name,
                                "runtime": 0,
                                "body": job.body,
                            }
                        )
                        continue

                    buildnum += 1
                    working_set.add(job.job_id)
                    before = time.time()
                    vprint(
                        2,
                        "worker %2i: got job %s from queue %s"
                        % (n, job.job_id, job.queue_name),
                    )

                    try:
                        repo = job.body["repo"]
                        commit = job.body["commit"]
                        command = job.body["command"]
                    except KeyError:
                        vprint(2, "worker %2i: invalid job json body" % n)
                        job.done(
                            {
                                "status": "error",
                                "output": "worker.py: invalid job description",
                            }
                        )
                        continue

                    vprint(2, 'worker %2i: command="%s"' % (n, command))

                    exclusive = None
                    try:
                        options = job.body.get("options") or {}
                        if (options.get("jobdir") or "") == "exclusive":
                            exclusive = str(random.random())
                    except KeyError:
                        pass

                    unique = random.random()

                    _env = os.environ.copy()
                    try:
                        _env.update(job.body["env"])
                    except KeyError:
                        pass

                    _env.update(
                        {
                            "DWQ_REPO": repo,
                            "DWQ_COMMIT": commit,
                            "DWQ_QUEUE": job.queue_name,
                            "DWQ_WORKER": args.name,
                            "DWQ_WORKER_BUILDNUM": str(buildnum),
                            "DWQ_WORKER_THREAD": str(n),
                            "DWQ_JOBID": job.job_id,
                            "DWQ_JOB_UNIQUE": str(unique),
                            "DWQ_CONTROL_QUEUE": job.body.get("control_queues")[0],
                        }
                    )

                    workdir = None
                    workdir_error = None
                    try:
                        try:
                            workdir = gitjobdir.get(
                                repo, commit, exclusive=exclusive or str(n)
                            )
                        except subprocess.CalledProcessError as e:
                            workdir_error = (
                                "dwqw: error getting jobdir. output: \n"
                                + e.output.decode("utf-8")
                            )

                        if not workdir:
                            if job.nacks < options.get("max_retries", 2):
                                job.nack()
                                vprint(
                                    1,
                                    "worker %2i: error getting job dir, requeueing job"
                                    % n,
                                )
                            else:
                                job.done(
                                    {
                                        "status": "error",
                                        "output": workdir_error
                                        or "dwqw: error getting jobdir\n",
                                        "worker": args.name,
                                        "runtime": 0,
                                        "body": job.body,
                                    }
                                )
                                vprint(
                                    1,
                                    "worker %2i: cannot get job dir, erroring job" % n,
                                )
                                working_set.discard(job.job_id)
                            continue

                        util.write_files(options.get("files"), workdir)

                        # assets
                        asset_dir = os.path.join(
                            workdir, "assets", "%s:%s" % (hash(job.job_id), str(unique))
                        )
                        _env.update({"DWQ_ASSETS": asset_dir})

                        timeout = options.get("timeout", 300)

                        if timeout >= 8:
                            # send explicit nack before disque times us out,
                            # but only if the original timeout is not too small
                            timeout -= 2

                        handle = cmd_server_pool.runcmd(
                            command,
                            cwd=workdir,
                            shell=True,
                            env=_env,
                            start_new_session=True,
                        )
                        output, result = handle.wait(timeout=timeout)
                        if handle.timeout:
                            result = "timeout"
                            output = "dwqw: command timed out\n"

                        if (result not in {0, "0", "pass"}) and job.nacks < options.get(
                            "max_retries", 2
                        ):
                            vprint(
                                2,
                                "worker %2i: command:" % n,
                                command,
                                "result:",
                                result,
                                "nacks:",
                                job.nacks,
                                "re-queueing.",
                            )
                            job.nack()
                        else:
                            runtime = time.time() - before

                            options = job.body.get("options")
                            if options:
                                options.pop("files", None)
                                # remove options from body if it is now empty
                                if not options:
                                    job.body.pop("options", None)

                            _result = {
                                "status": result,
                                "output": output,
                                "worker": args.name,
                                "runtime": runtime,
                                "body": job.body,
                                "unique": str(unique),
                            }

                            # pack assets
                            try:
                                asset_files = os.listdir(asset_dir)
                                if asset_files:
                                    _result.update(
                                        {
                                            "assets": util.gen_file_data(
                                                asset_files, asset_dir
                                            )
                                        }
                                    )
                                    shutil.rmtree(asset_dir, ignore_errors=True)
                            except FileNotFoundError:
                                pass

                            job.done(_result)
                            vprint(
                                2,
                                "worker %2i: command:" % n,
                                command,
                                "result:",
                                result,
                                "runtime: %.1fs" % runtime,
                            )
                            working_set.discard(job.job_id)
                    except Exception as e:
                        if workdir:
                            gitjobdir.release(workdir)
                        raise e

                    gitjobdir.release(workdir)
        except Exception:
            vprint(1, "worker %2i: uncaught exception" % n)
            traceback.print_exc()
            time.sleep(10)
            vprint(1, "worker %2i: restarting worker" % n)
def main():
    global shutdown
    global verbose
    global active_event

    args = parse_args()

    verbose = args.verbose - args.quiet

    cmd_server_pool = cmdserver.CmdServerPool(args.jobs)

    signal.signal(signal.SIGTERM, sigterm_handler)

    _dir = "/tmp/dwq.%s" % str(random.random())
    gitjobdir = GitJobDir(_dir, args.jobs)

    servers = ["localhost:7711"]
    try:
        Disque.connect(servers)
        vprint(1, "dwqw: connected.")
    except RedisError:
        pass

    working_set = SyncSet()

    for n in range(1, args.jobs + 1):
        threading.Thread(
            target=worker,
            args=(n, cmd_server_pool, gitjobdir, args, working_set),
            daemon=True,
        ).start()

    active_event.set()

    try:
        while True:
            if not Disque.connected():
                try:
                    vprint(1, "dwqw: connecting...")
                    Disque.connect(servers)
                    vprint(1, "dwqw: connected.")
                except RedisError:
                    time.sleep(1)
                    continue

            try:
                control_jobs = Job.get(["control::worker::%s" % args.name])
                for job in control_jobs or []:
                    handle_control_job(args, job)
            except RedisError:
                pass
    except (KeyboardInterrupt, SystemExit):
        vprint(1, "dwqw: shutting down")
        shutdown = True
        cmd_server_pool.destroy()

        vprint(1, "dwqw: nack'ing jobs")
        jobs = working_set.empty()
        d = Disque.get()
        if jobs:
            d.nack_job(*jobs)

        vprint(1, "dwqw: cleaning up job directories")
        gitjobdir.cleanup()
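# Hedged sketch: SyncSet is used above as the workers' shared working set, but is
# defined elsewhere in dwq. Based on the calls made in this file (add(), discard(),
# empty()), a minimal thread-safe implementation could look like the following;
# the real class may differ. threading is already imported at module level (it is
# used by main() above).
class SyncSet:
    def __init__(self):
        self._lock = threading.Lock()
        self._set = set()

    def add(self, item):
        with self._lock:
            self._set.add(item)

    def discard(self, item):
        with self._lock:
            self._set.discard(item)

    def empty(self):
        # return the current contents and clear the set atomically
        with self._lock:
            items, self._set = self._set, set()
            return items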
def main():
    global verbose
    args = parse_args()

    if (args.repo is None) ^ (args.commit is None):
        print("dwqc: error: both repo and commit must be specified!", file=sys.stderr)
        sys.exit(1)

    signal.signal(signal.SIGTERM, sigterm_handler)

    job_queue = args.queue

    Disque.connect([args.disque_url])

    if args.subjob:
        try:
            control_queue = os.environ["DWQ_CONTROL_QUEUE"]
        except KeyError:
            print(
                "dwqc: error: --subjob specified, but DWQ_CONTROL_QUEUE unset.",
                file=sys.stderr,
            )
            sys.exit(1)
        try:
            parent_jobid = os.environ["DWQ_JOBID"]
        except KeyError:
            print(
                "dwqc: error: --subjob specified, but DWQ_JOBID unset.",
                file=sys.stderr,
            )
            sys.exit(1)
    else:
        control_queue = "control::%s" % random.random()
        parent_jobid = None

    verbose = args.verbose

    if args.progress or args.report:
        start_time = time.time()

    if args.report:
        Job.add(args.report, {"status": "collecting jobs"})

    try:
        file_data = util.gen_file_data(args.file)
    except util.GenFileDataException as e:
        print("dwqc: error processing --file argument:", e, file=sys.stderr)
        sys.exit(1)

    base_options = {}
    if args.exclusive_jobdir:
        base_options.update({"jobdir": "exclusive"})
    if file_data:
        base_options["files"] = file_data
    if args.tries != 3:
        if args.tries < 1:
            print("dwqc: error: --tries < 1!", file=sys.stderr)
            sys.exit(1)
        base_options["max_retries"] = args.tries - 1
    if args.timeout != 300:
        if args.timeout <= 0:
            print("dwqc: error: --timeout <= 0!", file=sys.stderr)
            sys.exit(1)
        base_options["timeout"] = args.timeout

    result_list = []
    try:
        jobs = set()
        batch = []

        if args.command and not args.stdin:
            options = base_options
            queue_job(
                jobs,
                job_queue,
                create_body(args, args.command, options, parent_jobid),
                [control_queue],
            )
        else:
            jobs_read = 0
            vprint("dwqc: reading jobs from stdin")
            for line in sys.stdin:
                line = line.rstrip()
                if args.stdin and args.command:
                    cmdargs = line.split(" ")
                    command = args.command
                    for i in range(0, len(cmdargs)):
                        command = command.replace("${%i}" % (i + 1), cmdargs[i])
                    command = command.replace("${0}", line)
                else:
                    command = line

                tmp = command.split("###")
                command = tmp[0]
                options = {}
                options.update(base_options)
                if len(tmp) > 1:
                    command = command.rstrip()
                    try:
                        options.update(json.loads(tmp[1]))
                    except json.decoder.JSONDecodeError:
                        vprint(
                            "dwqc: invalid option JSON. Skipping job.",
                            file=sys.stderr,
                        )
                        continue

                _job_queue = options.get("queue", job_queue)

                if args.batch:
                    batch.append(
                        (
                            _job_queue,
                            create_body(args, command, options, parent_jobid),
                            [control_queue],
                        )
                    )
                else:
                    job_id = queue_job(
                        jobs,
                        _job_queue,
                        create_body(args, command, options, parent_jobid),
                        [control_queue],
                    )
                    vprint(
                        'dwqc: job %s command="%s" sent to queue %s.'
                        % (job_id, command, _job_queue)
                    )
                    if args.progress:
                        print("")

                if args.progress or args.report:
                    jobs_read += 1
                    elapsed = time.time() - start_time
                    if args.progress:
                        print(
                            "\033[F\033[K[%s] %s jobs read"
                            % (nicetime(elapsed), jobs_read),
                            end="\r",
                        )
                    # if args.report:
                    #     Job.add(args.report,
                    #             {"status": "collecting jobs", "total": jobs_read})

        _time = ""
        if args.batch and args.stdin:
            before = time.time()
            vprint("dwqc: sending jobs")
            for _tuple in batch:
                queue_job(jobs, *_tuple)
            _time = "(took %s)" % nicetime(time.time() - before)

        if args.report:
            Job.add(args.report, {"status": "sending jobs"})

        if args.stdin:
            vprint("dwqc: all jobs sent.", _time)

        if args.subjob:
            if args.report:
                Job.add(args.report, {"status": "done"})
            return

        if args.progress:
            vprint("")

        unexpected = {}
        early_subjobs = []
        total = len(jobs)
        done = 0
        failed = 0
        failed_expected = 0
        passed = 0
        subjobs = {}

        while jobs:
            _early_subjobs = early_subjobs or None
            early_subjobs = []

            for job in _early_subjobs or Job.wait(control_queue, count=128):
                # print(json.dumps(job, sort_keys=True, indent=4))
                subjob = job.get("subjob")
                if subjob:
                    parent = job.get("parent")
                    unique = job.get("unique")
                    _dict = dict_dictadd(subjobs, parent)
                    dict_addset(_dict, unique, subjob)
                else:
                    try:
                        job_id = job["job_id"]
                        jobs.remove(job_id)
                        done += 1

                        # if args.progress:
                        #     vprint("\033[F\033[K", end="")
                        # vprint("dwqc: job %s done. result=%s"
                        #        % (job["job_id"], job["result"]["status"]))
                        if not args.quiet:
                            if args.progress:
                                print("\033[K", end="")
                            print(job["result"]["output"], end="")

                        _has_passed = job["result"]["status"] in {0, "0", "pass"}
                        if _has_passed:
                            passed += 1
                            handle_assets(job, args)
                        else:
                            failed += 1
                            try:
                                if job["result"]["body"]["options"]["fail_ok"]:
                                    failed_expected += 1
                            except KeyError:
                                pass

                        if args.outfile:
                            result_list.append(job)

                        # collect subjobs started by this job instance,
                        # add them to the waitlist
                        unique = job["result"]["unique"]
                        _subjobs = subjobs.get(job_id, {}).get(unique, [])
                        for subjob_id in _subjobs:
                            try:
                                early_subjobs.append(unexpected.pop(subjob_id))
                            except KeyError:
                                pass
                            finally:
                                jobs.add(subjob_id)

                        total += len(_subjobs)

                        if args.progress or args.report:
                            elapsed = time.time() - start_time
                            per_job = elapsed / done
                            eta = (total - done) * per_job

                            if args.progress:
                                print(
                                    "\r\033[K[%s] %s/%s jobs done (%s passed, %s failed.) "
                                    "ETA:"
                                    % (nicetime(elapsed), done, total, passed, failed),
                                    nicetime(eta),
                                    end="\r",
                                )
                            if args.report:
                                Job.add(
                                    args.report,
                                    {
                                        "status": "working",
                                        "elapsed": elapsed,
                                        "eta": eta,
                                        "total": total,
                                        "passed": passed,
                                        "failed": failed,
                                        "job": job,
                                    },
                                )

                        if not _has_passed:
                            if failed > failed_expected:
                                if (failed - failed_expected) > args.maxfail:
                                    print(
                                        "dwqc: more than %i jobs failed. Exiting."
                                        % args.maxfail,
                                        file=sys.stderr,
                                    )
                                    sys.exit(1)
                    except KeyError:
                        unexpected[job_id] = job

        if args.outfile:
            args.outfile.write(json.dumps(result_list))

        if args.progress:
            print("")
    except (KeyboardInterrupt, SystemExit):
        print("dwqc: cancelling...")
        Job.cancel_all(jobs)
        if args.report:
            Job.add(args.report, {"status": "canceled"})
        sys.exit(1)

    if args.report:
        Job.add(args.report, {"status": "done"})

    if failed > failed_expected:
        sys.exit(1)
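# Hedged sketch: dict_dictadd() and dict_addset() are small helpers used in the
# subjob bookkeeping above but defined elsewhere. From their usage here (a dict
# of per-parent dicts mapping a "unique" id to a collection of subjob ids),
# minimal versions might look like this; the real helpers may use a list instead
# of a set for the inner collection.
def dict_dictadd(d, key):
    # return d[key], creating an empty dict if it is missing
    return d.setdefault(key, {})


def dict_addset(d, key, value):
    # add value to the set stored at d[key], creating the set if it is missing
    d.setdefault(key, set()).add(value)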
def worker(n, cmd_server_pool, gitjobdir, args, working_set):
    global active_event
    global shutdown

    worker_str = f"dwqw@{args.name}.{n}"

    print(f"{worker_str}: started")

    buildnum = 0
    while not shutdown:
        try:
            if not shutdown and not Disque.connected():
                time.sleep(1)
                continue

            while not shutdown:
                active_event.wait()
                jobs = Job.get(args.queues)
                for job in jobs:
                    if shutdown or not active_event.is_set():
                        job.nack()
                        continue

                    if job.additional_deliveries > 2:
                        error = "too many deliveries (usual reason: timeout)"
                        vprint(2, f"{worker_str}: {error}")
                        job.done(
                            {
                                "status": "error",
                                "output": f"{worker_str}: {error}\n",
                                "worker": args.name,
                                "runtime": 0,
                                "body": job.body,
                            }
                        )
                        continue

                    buildnum += 1
                    working_set.add(job.job_id)
                    before = time.time()
                    vprint(
                        2,
                        f"{worker_str}: got job {job.job_id} from queue {job.queue_name}",
                    )

                    try:
                        command = job.body["command"]
                    except KeyError:
                        vprint(2, f"{worker_str}: invalid job json body")
                        job.done(
                            {
                                "status": "error",
                                "output": f'{worker_str}: invalid job body: "{job.body}"',
                            }
                        )
                        continue

                    vprint(2, f'{worker_str}: command="{command}"')

                    repo = None
                    commit = None
                    try:
                        repo = job.body["repo"]
                        commit = job.body["commit"]
                    except KeyError:
                        pass

                    if (repo is None) ^ (commit is None):
                        vprint(
                            2,
                            f"{worker_str}: invalid job json body,"
                            " only one of repo and commit specified",
                        )
                        job.done(
                            {
                                "status": "error",
                                "output": f'{worker_str}: invalid job body: "{job.body}"',
                            }
                        )
                        continue

                    exclusive = None
                    try:
                        options = job.body.get("options") or {}
                        if (options.get("jobdir") or "") == "exclusive":
                            exclusive = str(random.random())
                    except KeyError:
                        pass

                    unique = random.random()

                    _env = os.environ.copy()
                    try:
                        _env.update(job.body["env"])
                    except KeyError:
                        pass

                    _env.update(
                        {
                            "DWQ_QUEUE": job.queue_name,
                            "DWQ_WORKER": args.name,
                            "DWQ_WORKER_BUILDNUM": str(buildnum),
                            "DWQ_WORKER_THREAD": str(n),
                            "DWQ_JOBID": job.job_id,
                            "DWQ_JOB_UNIQUE": str(unique),
                            "DWQ_CONTROL_QUEUE": job.body.get("control_queues")[0],
                        }
                    )

                    workdir = None
                    workdir_output = None
                    workdir_error = None
                    try:
                        if repo is not None:
                            _env.update(
                                {
                                    "DWQ_REPO": repo,
                                    "DWQ_COMMIT": commit,
                                }
                            )
                            try:
                                (workdir, workdir_output) = gitjobdir.get(
                                    repo, commit, exclusive=exclusive or str(n)
                                )
                            except CalledProcessError as e:
                                workdir_error = (
                                    f"{worker_str}: error getting jobdir. output:\n"
                                    + e.output.decode("utf-8", "backslashreplace")
                                )

                            if not workdir:
                                if job.nacks < options.get("max_retries", 2):
                                    job.nack()
                                    vprint(
                                        1,
                                        f"{worker_str}: error getting job dir,"
                                        " requeueing job",
                                    )
                                else:
                                    job.done(
                                        {
                                            "status": "error",
                                            "output": workdir_error
                                            or f"{worker_str}: error getting jobdir\n",
                                            "worker": args.name,
                                            "runtime": 0,
                                            "body": job.body,
                                        }
                                    )
                                    vprint(
                                        1,
                                        f"{worker_str}: cannot get job dir, erroring job",
                                    )
                                    working_set.discard(job.job_id)
                                continue
                        else:
                            workdir = "/tmp"

                        workdir_done_at = time.time()

                        files = options.get("files")
                        util.write_files(files, workdir)
                        write_files_done_at = time.time()

                        # assets
                        asset_dir = os.path.join(
                            workdir, "assets", "%s:%s" % (hash(job.job_id), str(unique))
                        )
                        _env.update({"DWQ_ASSETS": asset_dir})

                        timeout = options.get("timeout", 300)

                        # subtract time used for checkout and job files
                        timeout -= time.time() - before

                        # be sure to time out a bit earlier, so transmit/network
                        # delays don't make disque time out the job itself
                        timeout -= 10

                        command_start_at = time.time()
                        if timeout > 0:
                            try:
                                res = run(
                                    command,
                                    stdout=PIPE,
                                    stderr=STDOUT,
                                    cwd=workdir,
                                    shell=True,
                                    env=_env,
                                    start_new_session=True,
                                    timeout=timeout,
                                )
                                result = res.returncode
                                output = res.stdout.decode("utf-8", "backslashreplace")
                            except TimeoutExpired as e:
                                result = "timeout"
                                decoded = e.output.decode("utf-8", "backslashreplace")
                                output = f"{decoded}{worker_str}: error: timed out\n"
                        else:
                            result = "timeout"
                            output = (
                                f"{worker_str}: command timed out while setting up job\n"
                            )
                        command_done_at = time.time()

                        if (result not in {0, "0", "pass"}) and job.nacks < options.get(
                            "max_retries", 2
                        ):
                            vprint(
                                2,
                                f"{worker_str}: command:",
                                command,
                                "result:",
                                result,
                                "nacks:",
                                job.nacks,
                                "re-queueing.",
                            )
                            job.nack()
                        else:
                            cmd_runtime = command_done_at - command_start_at
                            workdir_setup_time = workdir_done_at - before
                            write_files_time = write_files_done_at - workdir_done_at

                            options = job.body.get("options")
                            if options:
                                options.pop("files", None)
                                # remove options from body if it is now empty
                                if not options:
                                    job.body.pop("options", None)

                            _result = {
                                "status": result,
                                "output": output,
                                "worker": args.name,
                                "body": job.body,
                                "unique": str(unique),
                                "times": {
                                    "cmd_runtime": cmd_runtime,
                                },
                            }

                            if files:
                                _result["times"]["write_files"] = write_files_time

                            if workdir_output:
                                _result["workdir_output"] = workdir_output.decode(
                                    "utf-8", "backslashreplace"
                                )
                                _result["times"]["workdir_setup"] = workdir_setup_time

                            # pack assets
                            try:
                                asset_files = os.listdir(asset_dir)
                                if asset_files:
                                    before_assets = time.time()
                                    _result.update(
                                        {
                                            "assets": util.gen_file_data(
                                                asset_files, asset_dir
                                            )
                                        }
                                    )
                                    shutil.rmtree(asset_dir, ignore_errors=True)
                                    _result["times"]["assets"] = (
                                        time.time() - before_assets
                                    )
                            except FileNotFoundError:
                                pass

                            runtime = time.time() - before
                            _result["runtime"] = runtime
                            job.done(_result)

                            vprint(
                                2,
                                f"{worker_str}: command:",
                                command,
                                "result:",
                                result,
                                "runtime: %.1fs" % runtime,
                            )
                            working_set.discard(job.job_id)
                    except Exception as e:
                        if workdir and repo:
                            gitjobdir.release(workdir)
                        raise e

                    if repo:
                        gitjobdir.release(workdir)
        except Exception:
            print(f"{worker_str}: uncaught exception")
            traceback.print_exc()
            time.sleep(2)
            print(f"{worker_str}: restarting worker")
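# Hedged sketch: vprint() as used by the worker above takes a verbosity level
# followed by regular print() arguments, gated on the module-level `verbose`
# value set in main(). This is an assumption based on the call sites in this
# file; the real helper lives elsewhere in dwq (and the client-side dwqc uses a
# different, level-less vprint()).
def vprint(level, *args, **kwargs):
    if verbose >= level:
        print(*args, **kwargs)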