def homepage_results(request):
    # Calculate games_per_minute from current machines
    games_per_minute = 0.0
    machines = request.rundb.get_machines()
    for machine in machines:
        machine['last_updated'] = delta_date(machine['last_updated'])
        if machine['nps'] != 0:
            games_per_minute += (
                (machine['nps'] / 1600000.0)
                * (60.0 / estimate_game_duration(machine['run']['args']['tc']))
                * (int(machine['concurrency'])
                   // machine['run']['args'].get('threads', 1)))
    machines.reverse()
    # Get updated results for unfinished runs + finished runs
    (runs, pending_hours, cores, nps) = request.rundb.aggregate_unfinished_runs()
    return {
        **get_paginated_finished_runs(request),
        'runs': runs,
        'machines': machines,
        'pending_hours': '%.1f' % (pending_hours),
        'cores': cores,
        'nps': nps,
        'games_per_minute': int(games_per_minute),
    }
def homepage_results(request):
    # Calculate games_per_minute from current machines
    games_per_minute = 0.0
    machines = request.rundb.get_machines()
    for machine in machines:
        machine["last_updated"] = delta_date(machine["last_updated"])
        if machine["nps"] != 0:
            games_per_minute += (
                (machine["nps"] / 1200000.0)
                * (60.0 / estimate_game_duration(machine["run"]["args"]["tc"]))
                * (int(machine["concurrency"])
                   // machine["run"]["args"].get("threads", 1)))
    machines.reverse()
    # Get updated results for unfinished runs + finished runs
    (runs, pending_hours, cores, nps) = request.rundb.aggregate_unfinished_runs()
    return {
        **get_paginated_finished_runs(request),
        "runs": runs,
        "machines": machines,
        "pending_hours": "%.1f" % (pending_hours),
        "cores": cores,
        "nps": nps,
        "games_per_minute": int(games_per_minute),
    }
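
# Illustration, not part of fishtest: a minimal sketch of the games_per_minute
# estimate computed above. The 1200000.0 divisor is the calibration constant
# from the variant directly above (the other variant uses 1600000.0), and
# _estimate_game_duration_stub is a hypothetical stand-in for fishtest's
# estimate_game_duration, assuming a "base+increment" time control and roughly
# 40 moves per game.
def _estimate_game_duration_stub(tc):
    base, inc = (float(x) for x in tc.split("+"))
    return base + 40 * inc  # seconds per game, assumed model


def _games_per_minute(nps, tc, concurrency, threads=1):
    # speed relative to the reference machine * games per minute at this tc
    # * number of game instances the worker can run in parallel
    return ((nps / 1200000.0)
            * (60.0 / _estimate_game_duration_stub(tc))
            * (concurrency // threads))


# A 2.4 Mnps, 8-core worker on a single-threaded "10+0.1" test:
print(_games_per_minute(2400000, "10+0.1", 8))  # ~68.57
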
def build_users(machines, info):
    for machine in machines:
        games_per_hour = (
            (machine["nps"] / 1080000.0)
            * (3600.0 / estimate_game_duration(machine["run"]["args"]["tc"]))
            * (int(machine["concurrency"])
               // machine["run"]["args"].get("threads", 1))
        )
        info[machine["username"]]["games_per_hour"] += games_per_hour

    users = []
    for u in info.keys():
        user = info[u]
        try:
            # A string "last_updated" has already been formatted for display;
            # use the raw "task_last_updated" timestamp in that case.
            if isinstance(user["last_updated"], str):
                diff = diff_date(user["task_last_updated"])
            else:
                diff = diff_date(user["last_updated"])
            user["diff"] = diff.total_seconds()
            user["last_updated"] = delta_date(diff)
        except Exception:
            pass
        users.append(user)

    # Only keep users who have contributed games or submitted tests.
    users = [u for u in users if u["games"] > 0 or u["tests"] > 0]
    return users
def calc_itp(self, run):
    itp = run["args"]["throughput"]
    if itp < 1:
        itp = 1
    elif itp > 500:
        itp = 500
    itp *= math.sqrt(
        estimate_game_duration(run["args"]["tc"]) / estimate_game_duration("10+0.1")
    )
    itp *= math.sqrt(run["args"]["threads"])
    if "sprt" not in run["args"]:
        itp *= 0.5
    else:
        llr = run["args"]["sprt"].get("llr", 0)
        itp *= (5 + llr) / 5
    run["args"]["itp"] = itp
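
# Illustration, not part of fishtest: the "itp" scaling from calc_itp above,
# extracted as a pure function so the individual factors are visible. _gd is
# a hypothetical stand-in for estimate_game_duration (base + 40 * increment
# seconds), not the real implementation.
import math


def _gd(tc):
    base, inc = (float(x) for x in tc.split("+"))
    return base + 40 * inc


def _itp(throughput, tc, threads, llr=None):
    itp = min(max(throughput, 1), 500)          # clamp to [1, 500]
    itp *= math.sqrt(_gd(tc) / _gd("10+0.1"))   # longer tc => higher itp
    itp *= math.sqrt(threads)                   # more threads => higher itp
    # SPRT runs are weighted up as their log-likelihood ratio grows;
    # fixed-length runs get half weight.
    itp *= 0.5 if llr is None else (5 + llr) / 5
    return itp


print(_itp(100, "10+0.1", 1, llr=0))  # 100.0: the reference configuration
print(_itp(100, "10+0.1", 1))         # 50.0: non-SPRT runs are halved
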
def process_run(run, info, deltas=None):
    global skip
    if deltas and (skip or str(run["_id"]) in deltas):
        skip = True
        return
    if deltas is not None and str(run["_id"]) in new_deltas:
        print("Warning: skipping repeated run!")
        return
    if "username" in run["args"]:
        username = run["args"]["username"]
        if username not in info:
            print("not in info: ", username)
            return
        else:
            info[username]["tests"] += 1
    tc = estimate_game_duration(run["args"]["tc"])
    for task in run["tasks"]:
        if "worker_info" not in task:
            continue
        username = task["worker_info"].get("username", None)
        if username is None:
            continue
        if username not in info:
            print("not in info: ", username)
            continue
        if "stats" in task:
            stats = task["stats"]
            num_games = stats["wins"] + stats["losses"] + stats["draws"]
        else:
            num_games = 0
        try:
            info[username]["last_updated"] = max(
                task["last_updated"], info[username]["last_updated"]
            )
            info[username]["task_last_updated"] = max(
                task["last_updated"], info[username]["last_updated"]
            )
        except Exception:
            # First task seen for this user: no previous timestamp to compare.
            info[username]["last_updated"] = task["last_updated"]
        info[username]["cpu_hours"] += float(
            num_games * int(run["args"].get("threads", 1)) * tc / (60 * 60)
        )
        info[username]["games"] += num_games
    if deltas is not None:
        new_deltas.update({str(run["_id"]): None})
def worker_cap(self, run, worker_info):
    # Estimate how many games a worker will be able to run
    # during the time interval determined by "self.task_duration".
    # Make sure the result is properly quantized and not zero.
    game_time = estimate_game_duration(run["args"]["tc"])
    concurrency = worker_info["concurrency"] // run["args"]["threads"]
    assert concurrency >= 1
    games = self.task_duration / game_time * concurrency
    if "sprt" in run["args"]:
        batch_size = 2 * run["args"]["sprt"].get("batch_size", 1)
        games = max(batch_size, batch_size * int(games / batch_size + 1 / 2))
    else:
        games = max(2, 2 * int(games / 2 + 1 / 2))
    return games
def process_run(run, info, deltas=None):
    global skip
    if deltas and (skip or str(run['_id']) in deltas):
        skip = True
        return
    if deltas is not None and str(run['_id']) in new_deltas:
        print('Warning: skipping repeated run!')
        return
    if 'username' in run['args']:
        username = run['args']['username']
        if username not in info:
            print('not in info: ', username)
            return
        else:
            info[username]['tests'] += 1
    tc = estimate_game_duration(run['args']['tc'])
    for task in run['tasks']:
        if 'worker_info' not in task:
            continue
        username = task['worker_info'].get('username', None)
        if username is None:
            continue
        if username not in info:
            print('not in info: ', username)
            continue
        if 'stats' in task:
            stats = task['stats']
            num_games = stats['wins'] + stats['losses'] + stats['draws']
        else:
            num_games = 0
        try:
            info[username]['last_updated'] = max(
                task['last_updated'], info[username]['last_updated'])
        except Exception:
            # First task seen for this user: no previous timestamp to compare.
            info[username]['last_updated'] = task['last_updated']
        info[username]['cpu_hours'] += float(
            num_games * int(run['args'].get('threads', 1)) * tc / (60 * 60))
        info[username]['games'] += num_games
    if deltas is not None:
        new_deltas.update({str(run['_id']): None})
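
# Illustration, not part of fishtest: a worked number for the cpu_hours
# credit in process_run above. _gd is the hypothetical duration model from
# the earlier sketches, standing in for estimate_game_duration.
def _gd(tc):
    base, inc = (float(x) for x in tc.split("+"))
    return base + 40 * inc


num_games, threads = 500, 1
# 500 single-threaded games at "10+0.1" (~14 s each under the assumed model):
print(num_games * threads * _gd("10+0.1") / (60 * 60))  # ~1.94 cpu hours
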
def worker_cap(self, run, worker_info):
    # Estimate how many games a worker will be able to run
    # during the time interval determined by "self.task_duration".
    # Make sure the result is properly quantized and not zero.
    game_time = estimate_game_duration(run["args"]["tc"])
    concurrency = worker_info["concurrency"] // run["args"]["threads"]
    assert concurrency >= 1
    # As more tasks get done (>1000), make new tasks longer to avoid
    # accumulating many small tasks in long-running tests.
    scale_duration = 1 + (len(run["tasks"]) // 1000) ** 2
    games = self.task_duration * scale_duration / game_time * concurrency
    if "sprt" in run["args"]:
        batch_size = 2 * run["args"]["sprt"].get("batch_size", 1)
        games = max(batch_size, batch_size * int(games / batch_size + 1 / 2))
    else:
        games = max(2, 2 * int(games / 2 + 1 / 2))
    return games
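
# Illustration, not part of fishtest: the quantization used by both
# worker_cap variants above. The raw game estimate is rounded to the nearest
# multiple of the batch size but never below one full batch, so a worker
# always receives a non-zero, batch-aligned number of games.
def _quantize(games, batch_size):
    return max(batch_size, batch_size * int(games / batch_size + 1 / 2))


print(_quantize(10.4, 4))  # 12: rounds up to the nearest multiple of 4
print(_quantize(9.9, 4))   # 8:  rounds down
print(_quantize(0.3, 2))   # 2:  never below one batch (non-SPRT batch is 2)
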
def build_users(machines, info):
    for machine in machines:
        games_per_hour = (machine['nps'] / 1600000.0) * (
            3600.0 / estimate_game_duration(machine['run']['args']['tc'])) * (
                int(machine['concurrency'])
                // machine['run']['args'].get('threads', 1))
        info[machine['username']]['games_per_hour'] += games_per_hour

    users = []
    for u in info.keys():
        user = info[u]
        try:
            user['last_updated'] = delta_date(user['last_updated'])
        except Exception:
            pass
        users.append(user)

    # Only keep users who have contributed games or submitted tests.
    users = [u for u in users if u['games'] > 0 or u['tests'] > 0]
    return users
def sync_request_task(self, worker_info):
    unique_key = worker_info["unique_key"]

    # We get the list of unfinished runs.
    # To limit db access the list is cached for 60 seconds.
    if time.time() > self.task_time + 60:
        if DEBUG:
            print("Refresh queue", flush=True)
        self.task_runs = []
        for r in self.get_unfinished_runs_id():
            run = self.get_run(r["_id"])
            self.sum_cores(run)
            self.calc_itp(run)
            self.task_runs.append(run)
        self.task_time = time.time()

    # We sort the list of unfinished runs according to priority.
    # Note that because of the caching, the properties of the
    # runs may have changed, so resorting is necessary.
    # Changes can be created by the code below or else in update_task().
    # Note that update_task() uses the same objects as here
    # (they are not copies).
    last_run_id = self.worker_runs.get(unique_key, {}).get("last_run", None)

    def priority(run):  # lower is better
        return (
            -run["args"]["priority"],
            # Try to find a new run for this worker.
            run["_id"] == last_run_id,
            run["cores"] / run["args"]["itp"] * 100.0,
            -run["args"]["itp"],
            run["_id"],
        )

    self.task_runs.sort(key=priority)

    # We go through the list of unfinished runs to see if the worker
    # has reached the number of allowed connections from the same ip
    # address.
    connections = 0
    for run in self.task_runs:
        for task in run["tasks"]:
            if (task["active"]
                    and task["worker_info"]["remote_addr"]
                    == worker_info["remote_addr"]):
                connections = connections + 1
    if connections >= self.userdb.get_machine_limit(worker_info["username"]):
        return {"task_waiting": False, "hit_machine_limit": True}

    # Collect some data about the worker that will be used below.
    # Memory
    max_threads = int(worker_info["concurrency"])
    min_threads = int(worker_info.get("min_threads", 1))
    max_memory = int(worker_info.get("max_memory", 0))

    # Is the worker near the github api limit?
    if "rate" in worker_info:
        rate = worker_info["rate"]
        near_github_api_limit = rate["remaining"] <= 2 * math.sqrt(rate["limit"])
    else:
        near_github_api_limit = False

    # Now go through the sorted list of unfinished runs.
    # We will add a task to the first run that is suitable.
    run_found = False
    for run in self.task_runs:
        if run["finished"]:
            continue
        if not run["approved"]:
            continue
        if run["args"]["threads"] > max_threads:
            continue
        if run["args"]["threads"] < min_threads:
            continue

        # Check if there aren't already enough workers
        # working on this run.
        committed_games = 0
        for task in run["tasks"]:
            if not task["active"]:
                if "stats" in task:
                    stats = task["stats"]
                    committed_games += (stats["wins"] + stats["losses"]
                                        + stats["draws"])
            else:
                committed_games += task["num_games"]
        remaining = run["args"]["num_games"] - committed_games
        if remaining <= 0:
            continue

        # We check if the worker has reserved enough memory.
        need_tt = 0
        need_base = 0
        if max_memory > 0:

            def get_hash(s):
                h = re.search("Hash=([0-9]+)", s)
                if h:
                    return int(h.group(1))
                return 0

            need_tt += get_hash(run["args"]["new_options"])
            need_tt += get_hash(run["args"]["base_options"])
            need_tt *= max_threads // run["args"]["threads"]
            # Estimate another 70MB per process for the net (40) and other
            # things besides hash.
            need_base = 2 * 70 * (max_threads // run["args"]["threads"])

            if need_base + need_tt > max_memory:
                continue

        # Github API limit...
        if near_github_api_limit:
            have_binary = (unique_key in self.worker_runs
                           and run["_id"] in self.worker_runs[unique_key])
            if not have_binary:
                continue

        # To avoid time losses in the case of large concurrency and short TC,
        # probably due to cutechess-cli as discussed in issue #822,
        # assign linux workers to LTC or multi-threaded jobs,
        # and windows workers only to LTC jobs.
        if max_threads >= 32:
            if "windows" in worker_info["uname"].lower():
                short_tc = (estimate_game_duration(run["args"]["tc"])
                            <= estimate_game_duration("55+0.5"))
            else:
                short_tc = (estimate_game_duration(run["args"]["tc"])
                            * run["args"]["threads"]
                            <= estimate_game_duration("30+0.3"))
            if short_tc:
                continue

        # Limit the number of cores.
        # Currently this is only done for spsa.
        if "spsa" in run["args"]:
            limit_cores = 40000 / math.sqrt(len(run["args"]["spsa"]["params"]))
        else:
            limit_cores = 1000000  # infinity

        cores = 0
        core_limit_reached = False
        for task in run["tasks"]:
            if task["active"]:
                cores += task["worker_info"]["concurrency"]
                if cores > limit_cores:
                    core_limit_reached = True
                    break
        if core_limit_reached:
            continue

        # If we make it here, it means we have found a run
        # suitable for a new task.
        run_found = True
        break

    # If there is no suitable run, tell the worker.
    if not run_found:
        return {"task_waiting": False}

    # Now we create a new task for this run.
    opening_offset = 0
    for task in run["tasks"]:
        opening_offset += task["num_games"]

    task_size = min(self.worker_cap(run, worker_info), remaining)
    task = {
        "num_games": task_size,
        "active": True,
        "worker_info": worker_info,
        "last_updated": datetime.utcnow(),
        "start": opening_offset,
        "stats": {
            "wins": 0,
            "losses": 0,
            "draws": 0,
            "pentanomial": 5 * [0],
        },
    }
    run["tasks"].append(task)
    task_id = len(run["tasks"]) - 1

    run["cores"] += task["worker_info"]["concurrency"]
    self.buffer(run, False)

    # Cache some data. Currently we record the id's
    # the worker has seen, as well as the last id that was seen.
    # Note that "worker_runs" is empty after a server restart.
    if unique_key not in self.worker_runs:
        self.worker_runs[unique_key] = {}
    if run["_id"] not in self.worker_runs[unique_key]:
        self.worker_runs[unique_key][run["_id"]] = True
    self.worker_runs[unique_key]["last_run"] = run["_id"]

    if DEBUG:
        print(
            "Allocate run: https://tests.stockfishchess.org/tests/view/{} "
            "task_id: {} to {}/{} Stats: {}".format(
                run["_id"],
                task_id,
                worker_info["username"],
                unique_key,
                run["tasks"][task_id]["stats"],
            ),
            flush=True,
        )

    return {"run": run, "task_id": task_id}
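
# Illustration, not part of fishtest: a worked number for the memory check in
# sync_request_task above. get_hash is copied from the code; the worker and
# option values are made up for the example.
import re


def get_hash(s):
    h = re.search("Hash=([0-9]+)", s)
    if h:
        return int(h.group(1))
    return 0


max_threads, run_threads = 16, 1  # hypothetical 16-core worker, 1-thread run
need_tt = (get_hash("Hash=64") + get_hash("Hash=64")) * (max_threads // run_threads)
need_base = 2 * 70 * (max_threads // run_threads)  # ~70MB per process, 2 engines
print(need_tt, need_base, need_tt + need_base)  # 2048 2240 4288 (MB needed)
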
def sync_request_task(self, worker_info):
    if time.time() > self.task_time + 60:
        self.task_runs = []
        for r in self.get_unfinished_runs_id():
            run = self.get_run(r["_id"])
            self.sum_cores(run)
            self.calc_itp(run)
            self.task_runs.append(run)
        self.task_runs.sort(key=lambda r: (
            -r["args"]["priority"],
            r["cores"] / r["args"]["itp"] * 100.0,
            -r["args"]["itp"],
            r["_id"],
        ))
        self.task_time = time.time()

    max_threads = int(worker_info["concurrency"])
    min_threads = int(worker_info.get("min_threads", 1))
    max_memory = int(worker_info.get("max_memory", 0))

    # We need to allocate a new task, but first check we don't have the same
    # machine already running, because multiple connections are not allowed.
    connections = 0
    for run in self.task_runs:
        for task in run["tasks"]:
            if (task["active"]
                    and task["worker_info"]["remote_addr"]
                    == worker_info["remote_addr"]):
                connections = connections + 1
    # Allow a few connections, for multiple computers on the same IP.
    if connections >= self.userdb.get_machine_limit(worker_info["username"]):
        return {"task_waiting": False, "hit_machine_limit": True}

    # Limit worker Github API calls.
    if "rate" in worker_info:
        rate = worker_info["rate"]
        limit = rate["remaining"] <= 2 * math.sqrt(rate["limit"])
    else:
        limit = False

    worker_key = worker_info["unique_key"]

    # Get a new task that matches the worker requirements.
    run_found = False
    for run in self.task_runs:
        # Compute the required TT memory.
        need_tt = 0
        if max_memory > 0:

            def get_hash(s):
                h = re.search("Hash=([0-9]+)", s)
                if h:
                    return int(h.group(1))
                return 0

            need_tt += get_hash(run["args"]["new_options"])
            need_tt += get_hash(run["args"]["base_options"])
            need_tt *= max_threads // run["args"]["threads"]

        if (run["approved"]
                and (not limit
                     or (worker_key in self.worker_runs
                         and run["_id"] in self.worker_runs[worker_key]))
                and run["args"]["threads"] <= max_threads
                and run["args"]["threads"] >= min_threads
                and need_tt <= max_memory
                # To avoid time losses in the case of large concurrency and
                # short TC, probably due to cutechess-cli as discussed in
                # issue #822, assign those workers to LTC or
                # multi-threaded jobs.
                and (max_threads < 32
                     or estimate_game_duration(run["args"]["tc"])
                     * run["args"]["threads"]
                     > estimate_game_duration("30+0.3"))):
            task_id = -1
            cores = 0
            if "spsa" in run["args"]:
                limit_cores = 40000 / math.sqrt(len(run["args"]["spsa"]["params"]))
            else:
                limit_cores = 1000000  # No limit for SPRT
            for task in run["tasks"]:
                if task["active"]:
                    cores += task["worker_info"]["concurrency"]
                    if cores > limit_cores:
                        break
                task_id = task_id + 1
                if not task["active"] and task["pending"]:
                    task["worker_info"] = worker_info
                    task["last_updated"] = datetime.utcnow()
                    task["active"] = True
                    run_found = True
                    break
            if run_found:
                break

    if not run_found:
        return {"task_waiting": False}

    self.sum_cores(run)
    self.task_runs.sort(key=lambda r: (
        -r["args"]["priority"],
        r["cores"] / r["args"]["itp"] * 100.0,
        -r["args"]["itp"],
        r["_id"],
    ))

    self.buffer(run, False)

    # Update worker_runs (compiled tests).
    if worker_key not in self.worker_runs:
        self.worker_runs[worker_key] = {}
    if run["_id"] not in self.worker_runs[worker_key]:
        self.worker_runs[worker_key][run["_id"]] = True

    if "stats" not in run["tasks"][task_id]:
        run["tasks"][task_id]["stats"] = {
            "wins": 0,
            "losses": 0,
            "draws": 0,
            "pentanomial": 5 * [0],
        }
    return {"run": run, "task_id": task_id}
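
# Illustration, not part of fishtest: the run ordering used by both
# sync_request_task variants. Runs are sorted by descending priority, then by
# cores already allocated per unit of itp (least served first), then by
# descending itp, with _id as a stable tie-breaker. The run dicts are minimal
# stand-ins for real run documents.
runs = [
    {"_id": 1, "cores": 200, "args": {"priority": 0, "itp": 100}},
    {"_id": 2, "cores": 50, "args": {"priority": 0, "itp": 100}},
    {"_id": 3, "cores": 999, "args": {"priority": 1, "itp": 100}},
]
runs.sort(key=lambda r: (
    -r["args"]["priority"],
    r["cores"] / r["args"]["itp"] * 100.0,
    -r["args"]["itp"],
    r["_id"],
))
print([r["_id"] for r in runs])  # [3, 2, 1]
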