def get_worker_jobs(self, queue, worker_type, worker):
    # TODO: need to get worker-group...
    return utils.get_jsonc(
        "https://firefox-ci-tc.services.mozilla.com/api/queue/v1/provisioners/%s/worker-types/%s/workers/%s/%s"
        % (self.provisioner, queue, worker_type, worker),
        self.verbosity,
    )
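
# A minimal standalone sketch of the call above (hypothetical helper, not part
# of this class): the Taskcluster queue endpoint takes its path segments in the
# order provisionerId / workerType / workerGroup / workerId, which is why the
# TODO above notes that the worker-group still needs to be threaded through.
def _example_fetch_worker(provisioner, worker_type, worker_group, worker_id):
    import json
    import urllib.request

    url = (
        "https://firefox-ci-tc.services.mozilla.com/api/queue/v1/"
        "provisioners/%s/worker-types/%s/workers/%s/%s"
        % (provisioner, worker_type, worker_group, worker_id)
    )
    # urlopen raises on HTTP errors; callers should handle URLError/HTTPError
    with urllib.request.urlopen(url) as response:
        return json.load(response)
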
def get_worker_types(self, provisioner):
    # e.g. /api/queue/v1/provisioners/proj-autophone/worker-types?limit=100
    return utils.get_jsonc(
        "https://firefox-ci-tc.services.mozilla.com/api/queue/v1/provisioners/%s/worker-types?limit=100"
        % provisioner,
        self.verbosity,
    )
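
# For reference, the worker-types listing returns JSON shaped roughly like the
# abbreviated sketch below (field names from the Taskcluster queue API; the
# real payload carries more metadata per entry):
#
#   {
#     "workerTypes": [
#       {"provisionerId": "proj-autophone", "workerType": "gecko-t-ap-unit-p2", ...},
#       ...
#     ],
#     "continuationToken": "..."   # present only when there are more results
#   }
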
def set_current_worker_types(self):
    # get the queues with data
    url = (
        "https://firefox-ci-tc.services.mozilla.com/api/queue/v1/provisioners/%s/worker-types?limit=%s"
        % ("proj-autophone", MAX_WORKER_TYPES)
    )
    result = utils.get_jsonc(url, self.verbosity)
    for item in result["workerTypes"]:
        self.tc_current_worker_types.append(item["workerType"])
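
# The listing above is capped at MAX_WORKER_TYPES results. A hedged sketch of
# draining the full listing via the API's continuationToken, assuming the
# deployment returns one when results are truncated (hypothetical helper;
# utils.get_jsonc is the module's own fetch wrapper, used as above):
def _example_all_worker_types(provisioner, verbosity=0):
    worker_types = []
    base_url = (
        "https://firefox-ci-tc.services.mozilla.com/api/queue/v1/provisioners/%s/worker-types?limit=100"
        % provisioner
    )
    url = base_url
    while True:
        result = utils.get_jsonc(url, verbosity)
        for item in result.get("workerTypes", []):
            worker_types.append(item["workerType"])
        token = result.get("continuationToken")
        if not token:
            return worker_types
        # follow the pagination token until the listing is exhausted
        url = "%s&continuationToken=%s" % (base_url, token)
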
def set_queue_counts(self):
    for queue in self.devicepool_queues_and_workers:
        url = (
            "https://firefox-ci-tc.services.mozilla.com/api/queue/v1/pending/%s/%s"
            % ("proj-autophone", queue)
        )
        json_result = utils.get_jsonc(url, self.verbosity)
        if "pendingTasks" in json_result:
            self.tc_queue_counts[queue] = json_result["pendingTasks"]
        else:
            logger.warning("failed to get counts for queue '%s'", queue)
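
# The pending-count endpoint returns a small object, roughly (abbreviated):
#
#   {"provisionerId": "proj-autophone", "workerType": "<queue>", "pendingTasks": 3}
#
# which is why set_queue_counts() only needs the "pendingTasks" field.
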
def simple_worker_report(self, worker_type, worker_prefix="packet-", worker_count=60):
    url = (
        "https://firefox-ci-tc.services.mozilla.com/api/queue/v1/provisioners/%s/worker-types/%s/workers?limit=100"
        % (self.provisioner, worker_type)
    )
    try:
        workers_result = utils.get_jsonc(url, self.verbosity)
    except Exception as e:
        # fall back to an empty result so the report still prints
        workers_result = {}
        print(e)

    expected_workers = ["%s%s" % (worker_prefix, i) for i in range(worker_count)]
    seen_workers = []
    if "workers" in workers_result:
        for item in workers_result["workers"]:
            seen_workers.append(item["workerId"])

    missing = set(expected_workers) - set(seen_workers)
    print("missing workers (%s): %s" % (len(missing), sorted(missing)))
    print("%s workers total" % worker_count)
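
# Hypothetical usage sketch for the report above (the missing ids shown are
# illustrative, not real data):
#
#   self.simple_worker_report("gecko-t-ap-unit-p2", worker_prefix="packet-", worker_count=60)
#
# would print something like:
#
#   missing workers (2): ['packet-12', 'packet-37']
#   60 workers total
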
def set_current_workers(self):
    # get the workers and a count of workers per worker type
    for item in self.tc_current_worker_types:
        url = (
            "https://firefox-ci-tc.services.mozilla.com/api/queue/v1/provisioners/%s/worker-types/%s/workers?limit=%s"
            % ("proj-autophone", item, MAX_WORKER_COUNT)
        )
        json_result = utils.get_jsonc(url, self.verbosity)
        if self.verbosity > 2:
            print("")
            print("%s (%s)" % (item, url))
            self.pp.pprint(json_result)

        # tc can sometimes return empty results for this query; retry a few times
        retries_left = 2
        while json_result["workers"] == [] and retries_left > 0:
            json_result = utils.get_jsonc(url, self.verbosity)
            retries_left -= 1

        self.tc_workers[item] = []
        for worker in json_result["workers"]:
            self.tc_workers[item].append(worker["workerId"])
            # TODO: store the quarantine data, not just the worker id
            if "quarantineUntil" in worker:
                self.quarantined_workers.append(worker["workerId"])
            if "latestTask" not in worker:
                # worker has no latestTask... brand new or tc restart?
                # TODO: eventually alert if this persists
                continue
            task_status_url = (
                "https://firefox-ci-tc.services.mozilla.com/api/queue/v1/task/%s/status"
                % worker["latestTask"]["taskId"]
            )
            json_result2 = utils.get_jsonc(task_status_url, self.verbosity)
            if self.verbosity > 2:
                print("%s result2: " % worker["workerId"])
                self.pp.pprint(json_result2)

            # if a quarantined host's last job is old, it will have expired
            # and we can't look at it
            if json_result2.get("code") == "ResourceNotFound":
                continue

            # look at the last run for the task (it could have been rescheduled)
            strange_result = True
            try:
                if "runs" in json_result2.get("status", {}):
                    # workers that just started won't have a 'started' field yet
                    strange_result = False
                    last_run = json_result2["status"]["runs"][-1]
                    if "started" in last_run:
                        started_time = last_run["started"]
                        worker_id = worker["workerId"]
                        previous = self.tc_current_worker_last_started.get(worker_id)
                        # keep the most recent start time seen for this worker
                        if previous is None or previous < started_time:
                            self.tc_current_worker_last_started[worker_id] = started_time
            except (KeyError, IndexError):
                # an empty 'runs' list raises IndexError here; fall through
                pass
            if strange_result:
                logger.warning(
                    "strange json_result2 for worker %s: %s"
                    % (worker["workerId"], json_result2)
                )
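
# The empty-result retry above is a simple bounded loop. A reusable sketch of
# the same pattern (hypothetical helper; the sleep between attempts is an
# assumption, not something the method above does):
def _example_get_with_retries(url, verbosity, attempts=3, delay_seconds=1):
    import time

    result = utils.get_jsonc(url, verbosity)
    while result.get("workers") == [] and attempts > 1:
        time.sleep(delay_seconds)
        result = utils.get_jsonc(url, verbosity)
        attempts -= 1
    return result
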
def main(self, provisioner, worker_type, worker_id):
    # TODO: show when the worker last started a task (taskStarted in TC)
    # - aws metal pools have quarantined nodes that were deleted but never
    #   drop off of the worker data
    start = timer()
    worker_count = 0
    working_count = 0
    # TODO: for this calculation, should we use a count of hosts that are reporting (vs all)?
    sr_total = 0

    ## host mode
    if worker_type and worker_id:
        worker_count = 1
        self.get_pending_tasks_multi([worker_type])
        url = (
            "https://firefox-ci-tc.services.mozilla.com/api/queue/v1/provisioners/%s/worker-types/%s/workers?limit=5"
            % (self.provisioner, worker_type)
        )
        worker_group_result = utils.get_jsonc(url, self.verbosity)
        if len(worker_group_result["workers"]) == 0:
            print("%s.%s: %s" % (worker_type, worker_id, "no data"))
            return
        worker_group = worker_group_result["workers"][0]["workerGroup"]
        _worker, res_obj, _e = self.device_fitness_report(
            worker_type, worker_group, worker_id
        )
        res_obj["worker_id"] = worker_id
        sr_total += res_obj["sr"]
        print(
            "%s.%s"
            % (worker_type, self.format_workertype_fitness_report_result(res_obj))
        )
    else:
        ### queue mode
        if worker_type:
            worker_types = [worker_type]
        ### provisioner mode
        else:
            worker_types_result = self.get_worker_types(provisioner)
            worker_types = []
            if "workerTypes" in worker_types_result:
                # note: don't shadow the provisioner argument with the loop variable
                for wt_entry in worker_types_result["workerTypes"]:
                    worker_types.append(wt_entry["workerType"])
            else:
                logger.warning("error fetching workerTypes, results are incomplete!")
        self.get_pending_tasks_multi(worker_types)
        # TODO: process and then display? padding of worker_id is not consistent
        # for a whole-provisioner report, because we haven't scanned the
        # potentially longest worker_ids when we display the first worker_group's data.
        for a_worker_type in worker_types:
            wt, res_obj, _e = self.workertype_fitness_report(a_worker_type)
            for item in res_obj:
                worker_count += 1
                sr_total += item["sr"]
                if item.get("state") and "working" in item.get("state"):
                    working_count += 1
                # in only-show-alerting mode, skip workers without alerts
                if self.args.only_show_alerting and "alerts" not in item:
                    continue
                print(
                    "%s.%s"
                    % (wt, self.format_workertype_fitness_report_result(item))
                )

    # guard against divide-by-zero (happens on request failures)
    if worker_count:
        # TODO: show alerting count
        print(
            "%s workers queried in %s seconds (%s working), average SR %s%%"
            % (
                worker_count,
                round(timer() - start, 2),
                working_count,
                round(sr_total / worker_count * 100, 2),
            )
        )
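
# Usage sketch for the three report modes above (argument values are
# illustrative; gecko-t-ap-unit-p2 appears elsewhere in this module, and the
# worker id is hypothetical):
#
#   self.main("proj-autophone", None, None)                          # provisioner mode
#   self.main("proj-autophone", "gecko-t-ap-unit-p2", None)          # queue mode
#   self.main("proj-autophone", "gecko-t-ap-unit-p2", "pixel2-01")   # host mode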