def __init__(self, db_name='fishtest_new'): # MongoDB server is assumed to be on the same machine, if not user should use # ssh with port forwarding to access the remote host. self.conn = MongoClient(os.getenv('FISHTEST_HOST') or 'localhost') self.db = self.conn[db_name] self.userdb = UserDb(self.db) self.actiondb = ActionDb(self.db) self.runs = self.db['runs'] self.chunk_size = 1000
class RunDb: def __init__(self, db_name='fishtest_new'): # MongoDB server is assumed to be on the same machine, if not user should use # ssh with port forwarding to access the remote host. self.conn = MongoClient(os.getenv('FISHTEST_HOST') or 'localhost') self.db = self.conn[db_name] self.userdb = UserDb(self.db) self.actiondb = ActionDb(self.db) self.regressiondb = RegressionDb(self.db) self.runs = self.db['runs'] self.chunk_size = 1000 def build_indices(self): self.runs.ensure_index([('finished', ASCENDING), ('last_updated', DESCENDING)]) def generate_tasks(self, num_games): tasks = [] remaining = num_games while remaining > 0: task_size = min(self.chunk_size, remaining) tasks.append({ 'num_games': task_size, 'pending': True, 'active': False, }) remaining -= task_size return tasks def new_run(self, base_tag, new_tag, num_games, tc, book, book_depth, threads, base_options, new_options, info='', resolved_base='', resolved_new='', msg_base='', msg_new='', base_signature='', new_signature='', regression_test=False, start_time=None, sprt=None, spsa=None, username=None, tests_repo=None, throughput=1000, priority=0): if start_time == None: start_time = datetime.utcnow() run_args = { 'base_tag': base_tag, 'new_tag': new_tag, 'num_games': num_games, 'tc': tc, 'book': book, 'book_depth': book_depth, 'threads': threads, 'regression_test': regression_test, 'resolved_base': resolved_base, 'resolved_new': resolved_new, 'msg_base': msg_base, 'msg_new': msg_new, 'base_options': base_options, 'new_options': new_options, 'info': info, 'base_signature': base_signature, 'new_signature': new_signature, 'username': username, 'tests_repo': tests_repo, 'throughput': throughput, 'priority': priority, 'internal_priority': - time.mktime(start_time.timetuple()), } if sprt != None: run_args['sprt'] = sprt if spsa != None: run_args['spsa'] = spsa new_run = { 'args': run_args, 'start_time': start_time, 'last_updated': start_time, # Will be filled in by tasks, indexed by task-id 'tasks': self.generate_tasks(num_games), # Aggregated results 'results': { 'wins': 0, 'losses': 0, 'draws': 0 }, 'results_stale': False, 'finished': False, 'approved': False, 'approver': '', } # Check for an existing approval matching the git commit SHAs def get_approval(sha): q = { '$or': [{ 'args.resolved_base': sha }, { 'args.resolved_new': sha }], 'approved': True } return self.runs.find_one(q) base_approval = get_approval(resolved_base) new_approval = get_approval(resolved_new) allow_auto = username in ['mcostalba', 'jkiiski', 'glinscott', 'lbraesch'] if base_approval != None and new_approval != None and allow_auto: new_run['approved'] = True new_run['approver'] = new_approval['approver'] return self.runs.insert(new_run) def get_machines(self): machines = [] for run in self.runs.find({'tasks': {'$elemMatch': {'active': True}}}): for task in run['tasks']: if task['active']: machine = copy.copy(task['worker_info']) machine['last_updated'] = task.get('last_updated', None) machine['run'] = run machine['nps'] = task.get('nps', 0) # TODO(glinscott): Temporary - remove once worker version >= 41 if not isinstance(machine['uname'], basestring): machine['uname'] = machine['uname'][0] + machine['uname'][2] machines.append(machine) return machines def get_run(self, id): return self.runs.find_one({'_id': ObjectId(id)}) def get_run_to_build(self): return self.runs.find_one({'binaries_url': {'$exists': False}, 'finished': False, 'deleted': {'$exists': False}}) def get_runs(self): return list(self.get_unfinished_runs()) + self.get_finished_runs()[0] def 
get_unfinished_runs(self): return self.runs.find({'finished': False}, sort=[('last_updated', DESCENDING), ('start_time', DESCENDING)]) def get_finished_runs(self, skip=0, limit=0, username=''): q = {'finished': True, 'deleted': {'$exists': False}} if len(username) > 0: q['args.username'] = username c = self.runs.find(q, skip=skip, limit=limit, sort=[('last_updated', DESCENDING)]) return (list(c), c.count()) def get_results(self, run): if not run['results_stale']: return run['results'] results = { 'wins': 0, 'losses': 0, 'draws': 0, 'crashes': 0, 'time_losses':0 } for task in run['tasks']: if 'stats' in task: stats = task['stats'] results['wins'] += stats['wins'] results['losses'] += stats['losses'] results['draws'] += stats['draws'] results['crashes'] += stats['crashes'] results['time_losses'] += stats.get('time_losses', 0) if 'sprt' in run['args'] and 'state' in run['args']['sprt']: results['sprt'] = run['args']['sprt']['state'] run['results_stale'] = False run['results'] = results self.runs.save(run) return results def request_task(self, worker_info): # Check for blocked user or ip if self.userdb.is_blocked(worker_info): return {'task_waiting': False} max_threads = int(worker_info['concurrency']) exclusion_list = [] # Does this worker have a task already? If so, just hand that back existing_run = self.runs.find_one({'tasks': {'$elemMatch': {'active': True, 'worker_info': worker_info}}}) if existing_run != None and existing_run['_id'] not in exclusion_list: for task_id, task in enumerate(existing_run['tasks']): if task['active'] and task['worker_info'] == worker_info: if task['pending']: return {'run': existing_run, 'task_id': task_id} else: # Don't hand back tasks that have been marked as no longer pending task['active'] = False self.runs.save(existing_run) # We need to allocate a new task, but first check we don't have the same # machine already running because multiple connections are not allowed. remote_addr = worker_info['remote_addr'] machines = self.get_machines() connections = sum([int(m.get('remote_addr','') == remote_addr) for m in machines]) # Allow a few connections, for multiple computers on same IP if connections >= self.userdb.get_machine_limit(worker_info['username']): return {'task_waiting': False, 'hit_machine_limit': True} # Ok, we get a new task that does not require more threads than available concurrency q = { 'new': True, 'query': { '$and': [ {'tasks': {'$elemMatch': {'active': False, 'pending': True}}}, {'args.threads': { '$lte': max_threads }}, {'_id': { '$nin': exclusion_list}}, {'approved': True}]}, 'sort': [('args.priority', DESCENDING), ('args.internal_priority', DESCENDING), ('_id', ASCENDING)], 'update': { '$set': { 'tasks.$.active': True, 'tasks.$.last_updated': datetime.utcnow(), 'tasks.$.worker_info': worker_info, } } } run = self.runs.find_and_modify(**q) if run == None: return {'task_waiting': False} # Find the task we have just activated: the one with the highest 'last_updated' latest_time = datetime.min for idx, task in enumerate(run['tasks']): if 'last_updated' in task and task['last_updated'] > latest_time: latest_time = task['last_updated'] task_id = idx # Recalculate internal priority based on task start date and throughput # Formula: - second_since_epoch - played_and_allocated_tasks * 3600 * 1000 / games_throughput # With default value 'throughput = 1000', this means that the priority is unchanged as long as we play at rate '1000 games / hour'. 
if (run['args']['throughput'] != None and run['args']['throughput'] != 0): run['args']['internal_priority'] = - time.mktime(run['start_time'].timetuple()) - task_id * 3600 * 1000 / run['args']['throughput'] self.runs.save(run) return {'run': run, 'task_id': task_id} def update_task(self, run_id, task_id, stats, nps, spsa): run = self.get_run(run_id) if task_id >= len(run['tasks']): return {'task_alive': False} task = run['tasks'][task_id] if not task['active'] or not task['pending']: return {'task_alive': False} # Guard against incorrect results num_games = stats['wins'] + stats['losses'] + stats['draws'] if 'stats' in task and num_games < task['stats']['wins'] + task['stats']['losses'] + task['stats']['draws']: return {'task_alive': False} task['stats'] = stats task['nps'] = nps if num_games >= task['num_games']: task['active'] = False task['pending'] = False update_time = datetime.utcnow() task['last_updated'] = update_time run['last_updated'] = update_time run['results_stale'] = True # Update spsa results if 'spsa' in run['args'] and spsa['wins'] + spsa['losses'] + spsa['draws'] == spsa['num_games']: self.update_spsa(run, spsa) self.runs.save(run) # Check if SPRT stopping is enabled if 'sprt' in run['args']: sprt = run['args']['sprt'] sprt_stats = stat_util.SPRT(self.get_results(run), elo0=sprt['elo0'], alpha=sprt['alpha'], elo1=sprt['elo1'], beta=sprt['beta'], drawelo=sprt['drawelo']) if sprt_stats['finished']: run['args']['sprt']['state'] = sprt_stats['state'] self.runs.save(run) self.stop_run(run_id) return {'task_alive': task['active']} def failed_task(self, run_id, task_id): run = self.get_run(run_id) if task_id >= len(run['tasks']): return {'task_alive': False} task = run['tasks'][task_id] if not task['active'] or not task['pending']: return {'task_alive': False} # Mark the task as inactive: it will be rescheduled task['active'] = False self.runs.save(run) return {} def stop_run(self, run_id): run = self.get_run(run_id) prune_idx = len(run['tasks']) for idx, task in enumerate(run['tasks']): is_active = task['active'] task['active'] = False task['pending'] = False if 'stats' not in task and not is_active: prune_idx = min(idx, prune_idx) else: prune_idx = idx + 1 # Truncate the empty tasks if prune_idx < len(run['tasks']): del run['tasks'][prune_idx:] self.runs.save(run) return {} def approve_run(self, run_id, approver): run = self.get_run(run_id) # Can't self approve if run['args']['username'] == approver: return False run['approved'] = True run['approver'] = approver self.runs.save(run) return True def request_spsa(self, run_id, task_id): run = self.get_run(run_id) if task_id >= len(run['tasks']): return {'task_alive': False} task = run['tasks'][task_id] if not task['active'] or not task['pending']: return {'task_alive': False} result = { 'task_alive': True, 'w_params': [], 'b_params': [], } spsa = run['args']['spsa'] # Increment the iter counter spsa['iter'] += 1 self.runs.save(run) # Generate the next set of tuning parameters for param in spsa['params']: a = param['a'] / (spsa['A'] + spsa['iter']) ** spsa['alpha'] c = param['c'] / spsa['iter'] ** spsa['gamma'] R = a / c ** 2 flip = 1 if bool(random.getrandbits(1)) else -1 result['w_params'].append({ 'name': param['name'], 'value': min(max(param['theta'] + c * flip, param['min']), param['max']), 'R': R, 'c': c, 'flip': flip, }) result['b_params'].append({ 'name': param['name'], 'value': min(max(param['theta'] - c * flip, param['min']), param['max']), }) return result def update_spsa(self, run, spsa_results): spsa = 
run['args']['spsa'] spsa['iter'] += int(spsa_results['num_games'] / 2) - 1 # Update the current theta based on the results from the worker # Worker wins/losses are always in terms of w_params result = spsa_results['wins'] - spsa_results['losses'] summary = [] for idx, param in enumerate(spsa['params']): R = spsa_results['w_params'][idx]['R'] c = spsa_results['w_params'][idx]['c'] flip = spsa_results['w_params'][idx]['flip'] param['theta'] = min(max(param['theta'] + R * c * result * flip, param['min']), param['max']) summary.append({ 'theta': param['theta'], 'R': R, 'c': c, }) # Every 100 iterations, record the parameters if 'param_history' not in spsa: spsa['param_history'] = [] if len(spsa['param_history']) < spsa['iter'] / 100: spsa['param_history'].append(summary)
class RunDb: def __init__(self, db_name='fishtest_new'): # MongoDB server is assumed to be on the same machine, if not user should use # ssh with port forwarding to access the remote host. self.conn = MongoClient(os.getenv('FISHTEST_HOST') or 'localhost') self.db = self.conn[db_name] self.userdb = UserDb(self.db) self.actiondb = ActionDb(self.db) self.pgndb = self.db['pgns'] self.runs = self.db['runs'] self.old_runs = self.db['old_runs'] self.deltas = self.db['deltas'] self.chunk_size = 250 def build_indices(self): self.runs.ensure_index([('finished', ASCENDING), ('last_updated', DESCENDING)]) self.pgndb.ensure_index([('run_id', ASCENDING)]) def generate_tasks(self, num_games): tasks = [] remaining = num_games while remaining > 0: task_size = min(self.chunk_size, remaining) tasks.append({ 'num_games': task_size, 'pending': True, 'active': False, }) remaining -= task_size return tasks def new_run(self, base_tag, new_tag, num_games, tc, book, book_depth, threads, base_options, new_options, info='', resolved_base='', resolved_new='', msg_base='', msg_new='', base_signature='', new_signature='', start_time=None, sprt=None, spsa=None, username=None, tests_repo=None, auto_purge=True, throughput=3000, priority=0): if start_time == None: start_time = datetime.utcnow() run_args = { 'base_tag': base_tag, 'new_tag': new_tag, 'num_games': num_games, 'tc': tc, 'book': book, 'book_depth': book_depth, 'threads': threads, 'resolved_base': resolved_base, 'resolved_new': resolved_new, 'msg_base': msg_base, 'msg_new': msg_new, 'base_options': base_options, 'new_options': new_options, 'info': info, 'base_signature': base_signature, 'new_signature': new_signature, 'username': username, 'tests_repo': tests_repo, 'auto_purge': auto_purge, 'throughput': throughput, 'priority': priority, 'internal_priority': -time.mktime(start_time.timetuple()), } if sprt != None: run_args['sprt'] = sprt if spsa != None: run_args['spsa'] = spsa new_run = { 'args': run_args, 'start_time': start_time, 'last_updated': start_time, # Will be filled in by tasks, indexed by task-id 'tasks': self.generate_tasks(num_games), # Aggregated results 'results': { 'wins': 0, 'losses': 0, 'draws': 0 }, 'results_stale': False, 'finished': False, 'approved': False, 'approver': '', } return self.runs.insert(new_run) def get_machines(self): machines = [] for run in self.runs.find({'tasks': {'$elemMatch': {'active': True}}}): for task in run['tasks']: if task['active']: machine = copy.copy(task['worker_info']) machine['last_updated'] = task.get('last_updated', None) machine['run'] = run machine['nps'] = task.get('nps', 0) machines.append(machine) return machines def get_pgn(self, id): id = id.split('.')[0] # strip .pgn pgn = self.pgndb.find_one({'run_id': id}) if pgn: return zlib.decompress(pgn['pgn_zip']).decode() return None def get_pgn_100(self, skip): return [ p['run_id'] for p in self.pgndb.find( skip=skip, limit=100, sort=[('_id', DESCENDING)]) ] # Cache runs run_cache = {} run_cache_lock = threading.Lock() run_cache_write_lock = threading.Lock() timer = None def get_run(self, id): with self.run_cache_lock: id = str(id) if id in self.run_cache: self.run_cache[id]['rtime'] = time.time() return self.run_cache[id]['run'] run = self.runs.find_one({'_id': ObjectId(id)}) if not run: run = self.old_runs.find_one({'_id': ObjectId(id)}) self.run_cache[id] = { 'rtime': time.time(), 'ftime': time.time(), 'run': run, 'dirty': False } return run def buffer(self, run, flush): with self.run_cache_lock: if self.timer is None: self.timer = threading.Timer(1.0, 
self.flush_buffers) self.timer.start() id = str(run['_id']) if flush: self.run_cache[id] = { 'dirty': False, 'rtime': time.time(), 'ftime': time.time(), 'run': run } with self.run_cache_write_lock: self.runs.save(run) else: if id in self.run_cache: ftime = self.run_cache[id]['ftime'] else: ftime = time.time() self.run_cache[id] = { 'dirty': True, 'rtime': time.time(), 'ftime': ftime, 'run': run } def stop(self): with self.run_cache_lock: self.timer = None time.sleep(1.1) def flush_buffers(self): with self.run_cache_lock: if self.timer is None: return now = time.time() old = now + 1 oldest = None for id in self.run_cache.keys(): if not self.run_cache[id]['dirty']: if self.run_cache[id]['rtime'] < now - 60: del self.run_cache[id] elif self.run_cache[id]['ftime'] < old: old = self.run_cache[id]['ftime'] oldest = id if not oldest is None: if int(now) % 60 == 0: self.scavenge(self.run_cache[oldest]['run']) with self.run_cache_write_lock: self.runs.save(self.run_cache[oldest]['run']) self.run_cache[oldest]['dirty'] = False self.run_cache[oldest]['ftime'] = time.time() self.timer = threading.Timer(1.0, self.flush_buffers) self.timer.start() def scavenge(self, run): old = datetime.utcnow() - timedelta(minutes=30) for task in run['tasks']: if task['active'] and task['last_updated'] < old: task['active'] = False def get_runs(self): return list(self.get_unfinished_runs()) + self.get_finished_runs()[0] def get_unfinished_runs(self): with self.run_cache_write_lock: return self.runs.find({'finished': False}, sort=[('last_updated', DESCENDING), ('start_time', DESCENDING)]) def get_finished_runs(self, skip=0, limit=0, username='', success_only=False, ltc_only=False): q = {'finished': True, 'deleted': {'$exists': False}} if len(username) > 0: q['args.username'] = username if ltc_only: q['args.tc'] = {'$regex': '^([4-9][0-9])|([1-9][0-9][0-9])'} if success_only: # This is unfortunate, but the only way we have of telling if a run was successful or # not currently is the color! q['results_info.style'] = '#44EB44' c = self.runs.find(q, skip=skip, limit=limit, sort=[('last_updated', DESCENDING)]) result = [list(c), c.count()] if limit != 0 and len(result[0]) != limit: c = self.old_runs.find(q, skip=max(0, skip - c.count()), limit=limit - len(result[0])) result[0] += list(c) result[1] += c.count() else: result[1] += self.old_runs.find(q).count() return result def get_results(self, run, save_run=True): if not run['results_stale']: return run['results'] results = { 'wins': 0, 'losses': 0, 'draws': 0, 'crashes': 0, 'time_losses': 0 } for task in run['tasks']: if 'stats' in task: stats = task['stats'] results['wins'] += stats['wins'] results['losses'] += stats['losses'] results['draws'] += stats['draws'] results['crashes'] += stats['crashes'] results['time_losses'] += stats.get('time_losses', 0) if 'sprt' in run['args'] and 'state' in run['args']['sprt']: results['sprt'] = run['args']['sprt']['state'] run['results_stale'] = False run['results'] = results if save_run: self.buffer(run, True) return results def recalc_prio(self, run, task_id=None): if task_id is None: task_id = -1 for task in run['tasks']: task_id = task_id + 1 if not task['active'] and task['pending']: break # Recalculate internal priority based on task start date and throughput # Formula: - second_since_epoch - played_and_allocated_tasks * 3600 * chunk_size / games_throughput # With default value 'throughput = 3000', this means that the priority is unchanged as long as # we play at rate '3000 games / hour'. 
if (run['args']['throughput'] != None and run['args']['throughput'] != 0): run['args']['internal_priority'] = - time.mktime(run['start_time'].timetuple()) - \ task_id * 3600 * self.chunk_size * run['args']['threads'] / run['args']['throughput'] # Limit concurrent request_task task_lock = threading.Lock() task_semaphore = threading.Semaphore(4) task_time = 0 task_runs = None def request_task(self, worker_info): if self.task_semaphore.acquire(False): try: with self.task_lock: return self.sync_request_task(worker_info) finally: self.task_semaphore.release() else: return {'task_waiting': False} def sync_request_task(self, worker_info): if time.time() > self.task_time + 60: self.task_runs = [] for r in self.get_unfinished_runs(): self.task_runs.append(r) self.task_runs.sort(key=lambda r: (-r['args']['priority'], -r[ 'args']['internal_priority'], r['_id'])) self.task_time = time.time() max_threads = int(worker_info['concurrency']) min_threads = int(worker_info.get('min_threads', 1)) max_memory = int(worker_info.get('max_memory', 0)) exclusion_list = [] # We need to allocate a new task, but first check we don't have the same # machine already running because multiple connections are not allowed. connections = 0 for run in self.task_runs: for task in run['tasks']: if task['active'] and task['worker_info'][ 'remote_addr'] == worker_info['remote_addr']: connections = connections + 1 # Allow a few connections, for multiple computers on same IP if connections >= self.userdb.get_machine_limit( worker_info['username']): return {'task_waiting': False, 'hit_machine_limit': True} # Get a new task that matches the worker requirements run_found = False for runt in self.task_runs: run = self.get_run(runt['_id']) # compute required TT memory need_tt = 0 if max_memory > 0: def get_hash(s): h = re.search('Hash=([0-9]+)', s) if h: return int(h.group(1)) return 0 need_tt += get_hash(run['args']['new_options']) need_tt += get_hash(run['args']['base_options']) need_tt *= max_threads // run['args']['threads'] if run['_id'] not in exclusion_list and run['approved'] \ and run['args']['threads'] <= max_threads \ and run['args']['threads'] >= min_threads \ and need_tt <= max_memory: task_id = -1 for task in run['tasks']: task_id = task_id + 1 if not task['active'] and task['pending']: task['worker_info'] = worker_info task['last_updated'] = datetime.utcnow() task['active'] = True run_found = True break if run_found: break if not run_found: return {'task_waiting': False} self.recalc_prio(run, task_id) self.buffer(run, False) for runt in self.task_runs: if runt['_id'] == run['_id']: runt['args']['internal_priority'] = run['args'][ 'internal_priority'] self.task_runs.sort(key=lambda r: (-r['args']['priority'], -r[ 'args']['internal_priority'], r['_id'])) break return {'run': run, 'task_id': task_id} # Create a lock for each active run run_lock = threading.Lock() active_runs = {} purge_count = 0 def active_run_lock(self, id): with self.run_lock: self.purge_count = self.purge_count + 1 if self.purge_count > 100000: old = time.time() - 10000 self.active_runs = dict( (k, v) for k, v in self.active_runs.iteritems() if v['time'] >= old) self.purge_count = 0 if id in self.active_runs: active_lock = self.active_runs[id]['lock'] self.active_runs[id]['time'] = time.time() else: active_lock = threading.Lock() self.active_runs[id] = { 'time': time.time(), 'lock': active_lock } return active_lock def update_task(self, run_id, task_id, stats, nps, spsa, username): lock = self.active_run_lock(str(run_id)) with lock: return 
self.sync_update_task(run_id, task_id, stats, nps, spsa, username) def sync_update_task(self, run_id, task_id, stats, nps, spsa, username): run = self.get_run(run_id) if task_id >= len(run['tasks']): return {'task_alive': False} task = run['tasks'][task_id] if not task['active'] or not task['pending']: return {'task_alive': False} if task['worker_info']['username'] != username: print('Update_task: Non matching username: '******'task_alive': False} # Guard against incorrect results count_games = lambda d: d['wins'] + d['losses'] + d['draws'] num_games = count_games(stats) old_num_games = count_games( task['stats']) if 'stats' in task else num_games spsa_games = count_games(spsa) if 'spsa' in run['args'] else 0 if num_games < old_num_games \ or (spsa_games > 0 and num_games <= 0) \ or (spsa_games > 0 and 'stats' in task and num_games <= old_num_games): return {'task_alive': False} flush = False task['stats'] = stats task['nps'] = nps if num_games >= task['num_games']: task[ 'pending'] = False # Make pending False before making active false to prevent race in request_task task['active'] = False flush = True update_time = datetime.utcnow() task['last_updated'] = update_time run['last_updated'] = update_time run['results_stale'] = True # Update spsa results if 'spsa' in run['args'] and spsa_games == spsa['num_games']: self.update_spsa(task['worker_info']['unique_key'], run, spsa) # Check if SPRT stopping is enabled if 'sprt' in run['args']: sprt = run['args']['sprt'] sprt_stats = fishtest.stat_util.SPRT(self.get_results(run, False), elo0=sprt['elo0'], alpha=sprt['alpha'], elo1=sprt['elo1'], beta=sprt['beta'], drawelo=sprt['drawelo']) if sprt_stats['finished']: run['args']['sprt']['state'] = sprt_stats['state'] self.stop_run(run_id, run) flush = True self.buffer(run, flush) return {'task_alive': task['active']} def upload_pgn(self, run_id, pgn_zip): self.pgndb.insert({'run_id': run_id, 'pgn_zip': Binary(pgn_zip)}) return {} def failed_task(self, run_id, task_id): run = self.get_run(run_id) if task_id >= len(run['tasks']): return {'task_alive': False} task = run['tasks'][task_id] if not task['active'] or not task['pending']: return {'task_alive': False} # Mark the task as inactive: it will be rescheduled task['active'] = False self.buffer(run, True) return {} def stop_run(self, run_id, run=None): self.clear_params(run_id) save_it = False if run is None: run = self.get_run(run_id) save_it = True prune_idx = len(run['tasks']) for idx, task in enumerate(run['tasks']): is_active = task['active'] task[ 'pending'] = False # Make pending False before making active false to prevent race in request_task task['active'] = False if 'stats' not in task and not is_active: prune_idx = min(idx, prune_idx) else: prune_idx = idx + 1 # Truncate the empty tasks if prune_idx < len(run['tasks']): del run['tasks'][prune_idx:] if save_it: self.buffer(run, True) self.task_time = 0 return {} def approve_run(self, run_id, approver): run = self.get_run(run_id) # Can't self approve if run['args']['username'] == approver: return False run['approved'] = True run['approver'] = approver self.buffer(run, True) self.task_time = 0 return True def spsa_param_clip_round(self, param, increment, clipping, rounding): value = 0.0 if clipping == 'old': value = min(max(param['theta'] + increment, param['min']), param['max']) else: #clipping == 'careful': inc = min(abs(increment), abs(param['theta'] - param['min']) / 2, abs(param['theta'] - param['max']) / 2) if inc > 0: inc_sgn = 0 if increment == 0 else increment / abs(increment) value = 
param['theta'] + inc_sgn * inc else: #revert to old behavior to bounce off boundary value = min(max(param['theta'] + increment, param['min']), param['max']) #'deterministic' rounding calls round() inside the worker. #'randomized' says 4.p should be 5 with probability p, 4 with probability 1-p, # and is continuous (albeit after expectation) unlike round(). if rounding == 'randomized': fl = math.floor( value) #greatest integer <= value, thus works for negative. if random.uniform(0, 1) < value - fl: value = fl + 1 else: value = fl return value # Store SPSA parameters for each worker spsa_params = {} def store_params(self, run_id, worker, params): run_id = str(run_id) if not run_id in self.spsa_params: self.spsa_params[run_id] = {} self.spsa_params[run_id][worker] = params def get_params(self, run_id, worker): run_id = str(run_id) if not run_id in self.spsa_params: # Should only happen after server restart return self.generate_spsa(self.get_run(run_id))['w_params'] return self.spsa_params[run_id][worker] def clear_params(self, run_id): run_id = str(run_id) if run_id in self.spsa_params: del self.spsa_params[run_id] def request_spsa(self, run_id, task_id): run = self.get_run(run_id) if task_id >= len(run['tasks']): return {'task_alive': False} task = run['tasks'][task_id] if not task['active'] or not task['pending']: return {'task_alive': False} result = self.generate_spsa(run) self.store_params(run['_id'], task['worker_info']['unique_key'], result['w_params']) return result def generate_spsa(self, run): result = { 'task_alive': True, 'w_params': [], 'b_params': [], } spsa = run['args']['spsa'] if 'clipping' not in spsa: spsa['clipping'] = 'old' if 'rounding' not in spsa: spsa['rounding'] = 'deterministic' # Generate the next set of tuning parameters iter_local = spsa[ 'iter'] + 1 #assume at least one completed, and avoid division by zero for param in spsa['params']: a = param['a'] / (spsa['A'] + iter_local)**spsa['alpha'] c = param['c'] / iter_local**spsa['gamma'] R = a / c**2 flip = 1 if bool(random.getrandbits(1)) else -1 result['w_params'].append({ 'name': param['name'], 'value': self.spsa_param_clip_round(param, c * flip, spsa['clipping'], spsa['rounding']), 'R': R, 'c': c, 'flip': flip, }) result['b_params'].append({ 'name': param['name'], 'value': self.spsa_param_clip_round(param, -c * flip, spsa['clipping'], spsa['rounding']), }) return result def update_spsa(self, worker, run, spsa_results): spsa = run['args']['spsa'] if 'clipping' not in spsa: spsa['clipping'] = 'old' spsa['iter'] += int(spsa_results['num_games'] / 2) # Store the history every 'freq' iterations. # More tuned parameters result in a lower update frequency, # so that the required storage (performance) remains constant. if 'param_history' not in spsa: spsa['param_history'] = [] L = len(spsa['params']) freq = L * 25 if freq < 100: freq = 100 maxlen = 250000 / freq grow_summary = len(spsa['param_history']) < min( maxlen, spsa['iter'] / freq) # Update the current theta based on the results from the worker # Worker wins/losses are always in terms of w_params result = spsa_results['wins'] - spsa_results['losses'] summary = [] w_params = self.get_params(run['_id'], worker) for idx, param in enumerate(spsa['params']): R = w_params[idx]['R'] c = w_params[idx]['c'] flip = w_params[idx]['flip'] param['theta'] = self.spsa_param_clip_round( param, R * c * result * flip, spsa['clipping'], 'deterministic') if grow_summary: summary.append({ 'theta': param['theta'], 'R': R, 'c': c, }) if grow_summary: spsa['param_history'].append(summary)
class RunDb: def __init__(self, db_name='fishtest_new', clop_socket=None): # MongoDB server is assumed to be on the same machine, if not user should use # ssh with port forwarding to access the remote host. self.conn = MongoClient(os.getenv('FISHTEST_HOST') or 'localhost') self.db = self.conn[db_name] self.userdb = UserDb(self.db) self.clopdb = ClopDb(self.db, clop_socket) self.actiondb = ActionDb(self.db) self.runs = self.db['runs'] self.chunk_size = 1000 def build_indices(self): self.runs.ensure_index([('finished', ASCENDING), ('last_updated', DESCENDING)]) def generate_tasks(self, num_games): tasks = [] remaining = num_games while remaining > 0: task_size = min(self.chunk_size, remaining) tasks.append({ 'num_games': task_size, 'pending': True, 'active': False, }) remaining -= task_size return tasks def new_run(self, base_tag, new_tag, num_games, tc, book, book_depth, threads, base_options, new_options, info='', resolved_base='', resolved_new='', msg_base='', msg_new='', base_signature='', new_signature='', regression_test=False, start_time=None, sprt=None, clop=None, username=None, tests_repo=None, priority=0): if start_time == None: start_time = datetime.utcnow() run_args = { 'base_tag': base_tag, 'new_tag': new_tag, 'num_games': num_games, 'tc': tc, 'book': book, 'book_depth': book_depth, 'threads': threads, 'regression_test': regression_test, 'resolved_base': resolved_base, 'resolved_new': resolved_new, 'msg_base': msg_base, 'msg_new': msg_new, 'base_options': base_options, 'new_options': new_options, 'info': info, 'base_signature': base_signature, 'new_signature': new_signature, 'username': username, 'tests_repo': tests_repo, 'priority': priority, } if sprt != None: run_args['sprt'] = sprt if clop != None: run_args['clop'] = clop new_run = { 'args': run_args, 'start_time': start_time, 'last_updated': start_time, # Will be filled in by tasks, indexed by task-id 'tasks': self.generate_tasks(num_games), # Aggregated results 'results': { 'wins': 0, 'losses': 0, 'draws': 0 }, 'results_stale': False, 'finished': False, 'approved': False, 'approver': '', } # Check for an existing approval matching the git commit SHAs def get_approval(sha): q = { '$or': [{ 'args.resolved_base': sha }, { 'args.resolved_new': sha }], 'approved': True } return self.runs.find_one(q) base_approval = get_approval(resolved_base) new_approval = get_approval(resolved_new) if base_approval != None and new_approval != None: new_run['approved'] = True new_run['approver'] = new_approval['approver'] return self.runs.insert(new_run) def get_machines(self): machines = [] for run in self.runs.find({'tasks': {'$elemMatch': {'active': True}}}): for task in run['tasks']: if task['active']: machine = copy.copy(task['worker_info']) machine['last_updated'] = task.get('last_updated', None) machine['run'] = run machine['nps'] = task.get('nps', 0) # TODO(glinscott): Temporary - remove once worker version >= 41 if not isinstance(machine['uname'], basestring): machine['uname'] = machine['uname'][0] + machine['uname'][2] machines.append(machine) return machines def get_run(self, id): return self.runs.find_one({'_id': ObjectId(id)}) def get_run_to_build(self): return self.runs.find_one({'binaries_url': {'$exists': False}, 'finished': False, 'deleted': {'$exists': False}}) def get_runs(self): return list(self.get_unfinished_runs()) + self.get_finished_runs()[0] def get_unfinished_runs(self): return self.runs.find({'finished': False}, sort=[('last_updated', DESCENDING), ('start_time', DESCENDING)]) def get_finished_runs(self, skip=0, 
limit=0, username=''): q = {'finished': True, 'deleted': {'$exists': False}} if len(username) > 0: q['args.username'] = username c = self.runs.find(q, skip=skip, limit=limit, sort=[('last_updated', DESCENDING)]) return (list(c), c.count()) def get_clop_exclusion_list(self, minimum): exclusion_list = [] for run in self.runs.find({'args.clop': {'$exists': True}, 'finished': False, 'deleted': {'$exists': False}}): available_games = 0 for game in self.clopdb.get_games(run['_id']): if len(game['task_id']) == 0: available_games += 1 active = False for task in run['tasks']: if task['active']: active = True if available_games < minimum and active: exclusion_list.append(run['_id']) return exclusion_list def get_results(self, run): if not run['results_stale']: return run['results'] results = { 'wins': 0, 'losses': 0, 'draws': 0, 'crashes': 0, 'time_losses':0 } for task in run['tasks']: if 'stats' in task: stats = task['stats'] results['wins'] += stats['wins'] results['losses'] += stats['losses'] results['draws'] += stats['draws'] results['crashes'] += stats['crashes'] results['time_losses'] += stats.get('time_losses', 0) if 'sprt' in run['args'] and 'state' in run['args']['sprt']: results['sprt'] = run['args']['sprt']['state'] run['results_stale'] = False run['results'] = results self.runs.save(run) return results def request_task(self, worker_info): # Check for blocked user or ip if self.userdb.is_blocked(worker_info): return {'task_waiting': False} # Build list of CLOP runs that are already almost full max_threads = int(worker_info['concurrency']) if 'Windows' in worker_info['uname']: exclusion_list = [r['_id'] for r in self.runs.find({'args.clop': {'$exists': True}, 'finished': False, 'deleted': {'$exists': False}})] else: exclusion_list = self.get_clop_exclusion_list(2 + max_threads) # Does this worker have a task already? If so, just hand that back existing_run = self.runs.find_one({'tasks': {'$elemMatch': {'active': True, 'worker_info': worker_info}}}) if existing_run != None and existing_run['_id'] not in exclusion_list: for task_id, task in enumerate(existing_run['tasks']): if task['active'] and task['worker_info'] == worker_info: if task['pending']: return {'run': existing_run, 'task_id': task_id} else: # Don't hand back tasks that have been marked as no longer pending task['active'] = False self.runs.save(existing_run) # We need to allocate a new task, but first check we don't have the same # machine already running because multiple connections are not allowed. 
remote_addr = worker_info['remote_addr'] machines = self.get_machines() connections = sum([int(m.get('remote_addr','') == remote_addr) for m in machines]) # Allow a few connections, for multiple computers on same IP if connections >= self.userdb.get_machine_limit(worker_info['username']): return {'task_waiting': False, 'hit_machine_limit': True} # Ok, we get a new task that does not require more threads than available concurrency q = { 'new': True, 'query': { '$and': [ {'tasks': {'$elemMatch': {'active': False, 'pending': True}}}, {'args.threads': { '$lte': max_threads }}, {'_id': { '$nin': exclusion_list}}, {'approved': True}]}, 'sort': [('args.priority', DESCENDING), ('_id', ASCENDING)], 'update': { '$set': { 'tasks.$.active': True, 'tasks.$.last_updated': datetime.utcnow(), 'tasks.$.worker_info': worker_info, } } } run = self.runs.find_and_modify(**q) if run == None: return {'task_waiting': False} # Find the task we have just activated: the one with the highest 'last_updated' latest_time = datetime.min for idx, task in enumerate(run['tasks']): if 'last_updated' in task and task['last_updated'] > latest_time: latest_time = task['last_updated'] task_id = idx # Lower priority of long running tests if task_id > 40 and 'sprt' in run['args'] and run['args']['priority'] == 0: run['args']['priority'] = -1 self.runs.save(run) # Lower priority of LTC long running tests if task_id > 50 and 'sprt' in run['args'] and parse_tc(run['args']['tc']) > 100: run['args']['priority'] = -3 self.runs.save(run) return {'run': run, 'task_id': task_id} def update_task(self, run_id, task_id, stats, nps, clop): run = self.get_run(run_id) if task_id >= len(run['tasks']): return {'task_alive': False} task = run['tasks'][task_id] if not task['active'] or not task['pending']: return {'task_alive': False} # Guard against incorrect results num_games = stats['wins'] + stats['losses'] + stats['draws'] if 'stats' in task and num_games < task['stats']['wins'] + task['stats']['losses'] + task['stats']['draws']: return {'task_alive': False} task['stats'] = stats task['nps'] = nps if num_games >= task['num_games']: task['active'] = False task['pending'] = False update_time = datetime.utcnow() task['last_updated'] = update_time run['last_updated'] = update_time run['results_stale'] = True self.runs.save(run) # Check if SPRT stopping is enabled if 'sprt' in run['args']: sprt = run['args']['sprt'] sprt_stats = stat_util.SPRT(self.get_results(run), elo0=sprt['elo0'], alpha=sprt['alpha'], elo1=sprt['elo1'], beta=sprt['beta'], drawelo=sprt['drawelo']) if sprt_stats['finished']: run['args']['sprt']['state'] = sprt_stats['state'] self.runs.save(run) self.stop_run(run_id) # Update clop results if 'clop' in run['args'] and len(clop['game_id']) > 0: self.clopdb.write_result(clop['game_id'], clop['game_result']) if not task['active']: self.clopdb.stop_games(run_id, task_id) return {'task_alive': task['active']} def failed_task(self, run_id, task_id): run = self.get_run(run_id) if task_id >= len(run['tasks']): return {'task_alive': False} task = run['tasks'][task_id] if not task['active'] or not task['pending']: return {'task_alive': False} # Mark the task as inactive: it will be rescheduled task['active'] = False self.runs.save(run) if 'clop' in run['args']: self.clopdb.stop_games(run_id, task_id) return {} def stop_run(self, run_id): run = self.get_run(run_id) prune_idx = len(run['tasks']) for idx, task in enumerate(run['tasks']): is_active = task['active'] task['active'] = False task['pending'] = False if 'stats' not in task and not 
is_active: prune_idx = min(idx, prune_idx) else: prune_idx = idx + 1 # Truncate the empty tasks if prune_idx < len(run['tasks']): del run['tasks'][prune_idx:] self.runs.save(run) return {} def approve_run(self, run_id, approver): run = self.get_run(run_id) # Can't self approve if run['args']['username'] == approver: return False run['approved'] = True run['approver'] = approver self.runs.save(run) return True
class RunDb: def __init__(self, db_name='fishtest_new'): # MongoDB server is assumed to be on the same machine, if not user should use # ssh with port forwarding to access the remote host. self.conn = MongoClient(os.getenv('FISHTEST_HOST') or 'localhost') self.db = self.conn[db_name] self.userdb = UserDb(self.db) self.actiondb = ActionDb(self.db) self.pgndb = self.db['pgns'] self.runs = self.db['runs'] self.old_runs = self.db['old_runs'] self.chunk_size = 250 def build_indices(self): self.runs.ensure_index([('finished', ASCENDING), ('last_updated', DESCENDING)]) self.pgndb.ensure_index([('run_id', ASCENDING)]) def generate_tasks(self, num_games): tasks = [] remaining = num_games while remaining > 0: task_size = min(self.chunk_size, remaining) tasks.append({ 'num_games': task_size, 'pending': True, 'active': False, }) remaining -= task_size return tasks def new_run(self, base_tag, new_tag, num_games, tc, book, book_depth, threads, base_options, new_options, info='', resolved_base='', resolved_new='', msg_base='', msg_new='', base_signature='', new_signature='', start_time=None, sprt=None, spsa=None, username=None, tests_repo=None, auto_purge=True, throughput=3000, priority=0): if start_time == None: start_time = datetime.utcnow() run_args = { 'base_tag': base_tag, 'new_tag': new_tag, 'num_games': num_games, 'tc': tc, 'book': book, 'book_depth': book_depth, 'threads': threads, 'resolved_base': resolved_base, 'resolved_new': resolved_new, 'msg_base': msg_base, 'msg_new': msg_new, 'base_options': base_options, 'new_options': new_options, 'info': info, 'base_signature': base_signature, 'new_signature': new_signature, 'username': username, 'tests_repo': tests_repo, 'auto_purge': auto_purge, 'throughput': throughput, 'priority': priority, 'internal_priority': - time.mktime(start_time.timetuple()), } if sprt != None: run_args['sprt'] = sprt if spsa != None: run_args['spsa'] = spsa new_run = { 'args': run_args, 'start_time': start_time, 'last_updated': start_time, # Will be filled in by tasks, indexed by task-id 'tasks': self.generate_tasks(num_games), # Aggregated results 'results': { 'wins': 0, 'losses': 0, 'draws': 0 }, 'results_stale': False, 'finished': False, 'approved': False, 'approver': '', } return self.runs.insert(new_run) def get_machines(self): machines = [] for run in self.runs.find({'tasks': {'$elemMatch': {'active': True}}}): for task in run['tasks']: if task['active']: machine = copy.copy(task['worker_info']) machine['last_updated'] = task.get('last_updated', None) machine['run'] = run machine['nps'] = task.get('nps', 0) machines.append(machine) return machines def get_pgn(self, id): id = id.split('.')[0] # strip .pgn pgn = self.pgndb.find_one({'run_id': id}) if pgn: return zlib.decompress(pgn['pgn_zip']).decode() return None def get_pgn_100(self, skip): return [p['run_id'] for p in self.pgndb.find(skip=skip, limit=100, sort=[('_id',DESCENDING)])] # Cache runs run_cache = {} run_cache_lock = threading.Lock() run_cache_write_lock = threading.Lock() timer = None def get_run(self, id): with self.run_cache_lock: id = str(id) if id in self.run_cache: self.run_cache[id]['rtime'] = time.time() return self.run_cache[id]['run'] run = self.runs.find_one({'_id': ObjectId(id)}) if not run: run = self.old_runs.find_one({'_id': ObjectId(id)}) self.run_cache[id] = { 'rtime': time.time(), 'ftime': time.time(), 'run': run, 'dirty': False } return run def buffer(self, run, flush): with self.run_cache_lock: if self.timer is None: self.timer = threading.Timer(1.0, self.flush_buffers) 
self.timer.start() id = str(run['_id']) if flush: self.run_cache[id] = { 'dirty': False, 'rtime': time.time(), 'ftime': time.time(), 'run': run } with self.run_cache_write_lock: self.runs.save(run) else: if id in self.run_cache: ftime = self.run_cache[id]['ftime'] else: ftime = time.time() self.run_cache[id] = { 'dirty': True, 'rtime': time.time(), 'ftime': ftime, 'run': run } def stop(self): with self.run_cache_lock: self.timer = None time.sleep(1.1) def flush_buffers(self): with self.run_cache_lock: if self.timer is None: return now = time.time() old = now + 1 oldest = None for id in self.run_cache.keys(): if not self.run_cache[id]['dirty']: if self.run_cache[id]['rtime'] < now - 60: del self.run_cache[id] elif self.run_cache[id]['ftime'] < old: old = self.run_cache[id]['ftime'] oldest = id if not oldest is None: if int(now) % 60 == 0: self.scavenge(self.run_cache[oldest]['run']) with self.run_cache_write_lock: self.runs.save(self.run_cache[oldest]['run']) self.run_cache[oldest]['dirty'] = False self.run_cache[oldest]['ftime'] = time.time() self.timer = threading.Timer(1.0, self.flush_buffers) self.timer.start() def scavenge(self, run): old = datetime.utcnow() - timedelta(minutes=30) for task in run['tasks']: if task['active'] and task['last_updated'] < old: task['active'] = False def get_runs(self): return list(self.get_unfinished_runs()) + self.get_finished_runs()[0] def get_unfinished_runs(self): with self.run_cache_write_lock: return self.runs.find({'finished': False}, sort=[('last_updated', DESCENDING), ('start_time', DESCENDING)]) def get_finished_runs(self, skip=0, limit=0, username='', success_only=False, ltc_only=False): q = {'finished': True, 'deleted': {'$exists': False}} if len(username) > 0: q['args.username'] = username if ltc_only: q['args.tc'] = {'$regex':'^([4-9][0-9])|([1-9][0-9][0-9])'} if success_only: # This is unfortunate, but the only way we have of telling if a run was successful or # not currently is the color! q['results_info.style'] = '#44EB44' c = self.runs.find(q, skip=skip, limit=limit, sort=[('last_updated', DESCENDING)]) result = [list(c), c.count()] if limit != 0 and len(result[0]) != limit: c = self.old_runs.find(q, skip=max(0, skip-c.count()), limit=limit-len(result[0])) result[0] += list(c) result[1] += c.count() else: result[1] += self.old_runs.find(q).count() return result def get_results(self, run, save_run=True): if not run['results_stale']: return run['results'] results = { 'wins': 0, 'losses': 0, 'draws': 0, 'crashes': 0, 'time_losses':0 } for task in run['tasks']: if 'stats' in task: stats = task['stats'] results['wins'] += stats['wins'] results['losses'] += stats['losses'] results['draws'] += stats['draws'] results['crashes'] += stats['crashes'] results['time_losses'] += stats.get('time_losses', 0) if 'sprt' in run['args'] and 'state' in run['args']['sprt']: results['sprt'] = run['args']['sprt']['state'] run['results_stale'] = False run['results'] = results if save_run: self.buffer(run, True) return results def recalc_prio(self, run, task_id=None): if task_id is None: task_id = -1 for task in run['tasks']: task_id = task_id + 1 if not task['active'] and task['pending']: break # Recalculate internal priority based on task start date and throughput # Formula: - second_since_epoch - played_and_allocated_tasks * 3600 * chunk_size / games_throughput # With default value 'throughput = 3000', this means that the priority is unchanged as long as # we play at rate '3000 games / hour'. 
if (run['args']['throughput'] != None and run['args']['throughput'] != 0): run['args']['internal_priority'] = - time.mktime(run['start_time'].timetuple()) - \ task_id * 3600 * self.chunk_size * run['args']['threads'] / run['args']['throughput'] # Limit concurrent request_task task_lock = threading.Lock() task_semaphore = threading.Semaphore(4) task_time = 0 task_runs = None def request_task(self, worker_info): if self.task_semaphore.acquire(False): try: with self.task_lock: return self.sync_request_task(worker_info) finally: self.task_semaphore.release() else: return {'task_waiting': False} def sync_request_task(self, worker_info): if time.time() > self.task_time + 60: self.task_runs = [] for r in self.get_unfinished_runs(): self.task_runs.append(r) self.task_runs.sort(key=lambda r: (-r['args']['priority'], -r['args']['internal_priority'], r['_id'])) self.task_time = time.time() max_threads = int(worker_info['concurrency']) min_threads = int(worker_info.get('min_threads', 1)) max_memory = int(worker_info.get('max_memory', 0)) exclusion_list = [] # We need to allocate a new task, but first check we don't have the same # machine already running because multiple connections are not allowed. connections = 0 for run in self.task_runs: for task in run['tasks']: if task['active'] and task['worker_info']['remote_addr'] == worker_info['remote_addr']: connections = connections + 1 # Allow a few connections, for multiple computers on same IP if connections >= self.userdb.get_machine_limit(worker_info['username']): return {'task_waiting': False, 'hit_machine_limit': True} # Get a new task that matches the worker requirements run_found = False for runt in self.task_runs: run = self.get_run(runt['_id']) # compute required TT memory need_tt = 0 if max_memory > 0: def get_hash(s): h = re.search('Hash=([0-9]+)', s) if h: return int(h.group(1)) return 0 need_tt += get_hash(run['args']['new_options']) need_tt += get_hash(run['args']['base_options']) need_tt *= max_threads // run['args']['threads'] if run['_id'] not in exclusion_list and run['approved'] \ and run['args']['threads'] <= max_threads \ and run['args']['threads'] >= min_threads \ and need_tt <= max_memory: task_id = -1 for task in run['tasks']: task_id = task_id + 1 if not task['active'] and task['pending']: task['worker_info'] = worker_info task['last_updated'] = datetime.utcnow() task['active'] = True run_found = True break if run_found: break if not run_found: return {'task_waiting': False} self.recalc_prio(run, task_id) self.buffer(run, False) for runt in self.task_runs: if runt['_id'] == run['_id']: runt['args']['internal_priority'] = run['args']['internal_priority'] self.task_runs.sort(key=lambda r: (-r['args']['priority'], -r['args']['internal_priority'], r['_id'])) break return {'run': run, 'task_id': task_id} # Create a lock for each active run run_lock = threading.Lock() active_runs = {} purge_count = 0 def active_run_lock(self, id): with self.run_lock: self.purge_count = self.purge_count + 1 if self.purge_count > 100000: old = time.time() - 10000 self.active_runs = dict((k,v) for k, v in self.active_runs.iteritems() if v['time'] >= old) self.purge_count = 0 if id in self.active_runs: active_lock = self.active_runs[id]['lock'] self.active_runs[id]['time'] = time.time() else: active_lock = threading.Lock() self.active_runs[id] = { 'time': time.time(), 'lock': active_lock } return active_lock def update_task(self, run_id, task_id, stats, nps, spsa, username): lock = self.active_run_lock(str(run_id)) with lock: return 
self.sync_update_task(run_id, task_id, stats, nps, spsa, username) def sync_update_task(self, run_id, task_id, stats, nps, spsa, username): run = self.get_run(run_id) if task_id >= len(run['tasks']): return {'task_alive': False} task = run['tasks'][task_id] if not task['active'] or not task['pending']: return {'task_alive': False} if task['worker_info']['username'] != username: print('Update_task: Non matching username: '******'task_alive': False} # Guard against incorrect results count_games = lambda d: d['wins'] + d['losses'] + d['draws'] num_games = count_games(stats) old_num_games = count_games(task['stats']) if 'stats' in task else num_games spsa_games = count_games(spsa) if 'spsa' in run['args'] else 0 if num_games < old_num_games \ or (spsa_games > 0 and num_games <= 0) \ or (spsa_games > 0 and 'stats' in task and num_games <= old_num_games): return {'task_alive': False} flush = False task['stats'] = stats task['nps'] = nps if num_games >= task['num_games']: task['pending'] = False # Make pending False before making active false to prevent race in request_task task['active'] = False flush = True update_time = datetime.utcnow() task['last_updated'] = update_time run['last_updated'] = update_time run['results_stale'] = True # Update spsa results if 'spsa' in run['args'] and spsa_games == spsa['num_games']: self.update_spsa(task['worker_info']['unique_key'], run, spsa) # Check if SPRT stopping is enabled if 'sprt' in run['args']: sprt = run['args']['sprt'] sprt_stats = fishtest.stat_util.SPRT(self.get_results(run, False), elo0=sprt['elo0'], alpha=sprt['alpha'], elo1=sprt['elo1'], beta=sprt['beta'], drawelo=sprt['drawelo']) if sprt_stats['finished']: run['args']['sprt']['state'] = sprt_stats['state'] self.stop_run(run_id, run) flush = True self.buffer(run, flush) return {'task_alive': task['active']} def upload_pgn(self, run_id, pgn_zip): self.pgndb.insert({'run_id': run_id, 'pgn_zip': Binary(pgn_zip)}) return {} def failed_task(self, run_id, task_id): run = self.get_run(run_id) if task_id >= len(run['tasks']): return {'task_alive': False} task = run['tasks'][task_id] if not task['active'] or not task['pending']: return {'task_alive': False} # Mark the task as inactive: it will be rescheduled task['active'] = False self.buffer(run, True) return {} def stop_run(self, run_id, run=None): self.clear_params(run_id) save_it = False if run is None: run = self.get_run(run_id) save_it = True prune_idx = len(run['tasks']) for idx, task in enumerate(run['tasks']): is_active = task['active'] task['pending'] = False # Make pending False before making active false to prevent race in request_task task['active'] = False if 'stats' not in task and not is_active: prune_idx = min(idx, prune_idx) else: prune_idx = idx + 1 # Truncate the empty tasks if prune_idx < len(run['tasks']): del run['tasks'][prune_idx:] if save_it: self.buffer(run, True) self.task_time = 0 return {} def approve_run(self, run_id, approver): run = self.get_run(run_id) # Can't self approve if run['args']['username'] == approver: return False run['approved'] = True run['approver'] = approver self.buffer(run, True) self.task_time = 0 return True def spsa_param_clip_round(self, param, increment, clipping, rounding): value = 0.0 if clipping == 'old': value = min(max(param['theta'] + increment, param['min']), param['max']) else: #clipping == 'careful': inc = min(abs(increment), abs(param['theta'] - param['min']) / 2, abs(param['theta'] - param['max']) / 2) if inc > 0: inc_sgn = 0 if increment == 0 else increment / abs(increment) value = 
param['theta'] + inc_sgn * inc else: #revert to old behavior to bounce off boundary value = min(max(param['theta'] + increment, param['min']), param['max']) #'deterministic' rounding calls round() inside the worker. #'randomized' says 4.p should be 5 with probability p, 4 with probability 1-p, # and is continuous (albeit after expectation) unlike round(). if rounding == 'randomized': fl = math.floor(value) #greatest integer <= value, thus works for negative. if random.uniform(0,1) < value - fl: value = fl + 1 else: value = fl return value # Store SPSA parameters for each worker spsa_params = {} def store_params(self, run_id, worker, params): run_id = str(run_id) if not run_id in self.spsa_params: self.spsa_params[run_id] = {} self.spsa_params[run_id][worker] = params def get_params(self, run_id, worker): run_id = str(run_id) if not run_id in self.spsa_params: # Should only happen after server restart return self.generate_spsa(self.get_run(run_id))['w_params'] return self.spsa_params[run_id][worker] def clear_params(self, run_id): run_id = str(run_id) if run_id in self.spsa_params: del self.spsa_params[run_id] def request_spsa(self, run_id, task_id): run = self.get_run(run_id) if task_id >= len(run['tasks']): return {'task_alive': False} task = run['tasks'][task_id] if not task['active'] or not task['pending']: return {'task_alive': False} result = self.generate_spsa(run) self.store_params(run['_id'], task['worker_info']['unique_key'], result['w_params']) return result def generate_spsa(self, run): result = { 'task_alive': True, 'w_params': [], 'b_params': [], } spsa = run['args']['spsa'] if 'clipping' not in spsa: spsa['clipping'] = 'old' if 'rounding' not in spsa: spsa['rounding'] = 'deterministic' # Generate the next set of tuning parameters iter_local = spsa['iter'] + 1 #assume at least one completed, and avoid division by zero for param in spsa['params']: a = param['a'] / (spsa['A'] + iter_local) ** spsa['alpha'] c = param['c'] / iter_local ** spsa['gamma'] R = a / c ** 2 flip = 1 if bool(random.getrandbits(1)) else -1 result['w_params'].append({ 'name': param['name'], 'value': self.spsa_param_clip_round(param, c * flip, spsa['clipping'], spsa['rounding']), 'R': R, 'c': c, 'flip': flip, }) result['b_params'].append({ 'name': param['name'], 'value': self.spsa_param_clip_round(param, -c * flip, spsa['clipping'], spsa['rounding']), }) return result def update_spsa(self, worker, run, spsa_results): spsa = run['args']['spsa'] if 'clipping' not in spsa: spsa['clipping'] = 'old' spsa['iter'] += int(spsa_results['num_games'] / 2) # Store the history every 'freq' iterations. # More tuned parameters result in a lower update frequency, # so that the required storage (performance) remains constant. 
if 'param_history' not in spsa: spsa['param_history'] = [] L = len(spsa['params']) freq = L * 25 if freq < 100: freq = 100 maxlen = 250000 / freq grow_summary = len(spsa['param_history']) < min(maxlen, spsa['iter'] / freq) # Update the current theta based on the results from the worker # Worker wins/losses are always in terms of w_params result = spsa_results['wins'] - spsa_results['losses'] summary = [] w_params = self.get_params(run['_id'], worker) for idx, param in enumerate(spsa['params']): R = w_params[idx]['R'] c = w_params[idx]['c'] flip = w_params[idx]['flip'] param['theta'] = self.spsa_param_clip_round(param, R * c * result * flip, spsa['clipping'], 'deterministic') if grow_summary: summary.append({ 'theta': param['theta'], 'R': R, 'c': c, }) if grow_summary: spsa['param_history'].append(summary)
class RunDb:
  def __init__(self, db_name='fishtest_new'):
    # The MongoDB server is assumed to be on the same machine; if not, the user
    # should use ssh with port forwarding to access the remote host.
    self.conn = MongoClient(os.getenv('FISHTEST_HOST') or 'localhost')
    self.db = self.conn[db_name]
    self.userdb = UserDb(self.db)
    self.actiondb = ActionDb(self.db)
    self.regressiondb = RegressionDb(self.db)
    self.runs = self.db['runs']
    self.old_runs = self.db['old_runs']
    self.chunk_size = 1000

  def build_indices(self):
    self.runs.ensure_index([('finished', ASCENDING), ('last_updated', DESCENDING)])

  def generate_tasks(self, num_games):
    tasks = []
    remaining = num_games
    while remaining > 0:
      task_size = min(self.chunk_size, remaining)
      tasks.append({
        'num_games': task_size,
        'pending': True,
        'active': False,
      })
      remaining -= task_size
    return tasks

  def new_run(self, base_tag, new_tag, num_games, tc, book, book_depth, threads,
              base_options, new_options, info='',
              resolved_base='', resolved_new='', msg_base='', msg_new='',
              base_signature='', new_signature='', regression_test=False,
              start_time=None, sprt=None, spsa=None, username=None,
              tests_repo=None, auto_purge=True, throughput=1000, priority=0):
    if start_time is None:
      start_time = datetime.utcnow()

    run_args = {
      'base_tag': base_tag,
      'new_tag': new_tag,
      'num_games': num_games,
      'tc': tc,
      'book': book,
      'book_depth': book_depth,
      'threads': threads,
      'regression_test': regression_test,
      'resolved_base': resolved_base,
      'resolved_new': resolved_new,
      'msg_base': msg_base,
      'msg_new': msg_new,
      'base_options': base_options,
      'new_options': new_options,
      'info': info,
      'base_signature': base_signature,
      'new_signature': new_signature,
      'username': username,
      'tests_repo': tests_repo,
      'auto_purge': auto_purge,
      'throughput': throughput,
      'priority': priority,
      'internal_priority': -time.mktime(start_time.timetuple()),
    }

    if sprt is not None:
      run_args['sprt'] = sprt
    if spsa is not None:
      run_args['spsa'] = spsa

    new_run = {
      'args': run_args,
      'start_time': start_time,
      'last_updated': start_time,
      # Will be filled in by tasks, indexed by task-id
      'tasks': self.generate_tasks(num_games),
      # Aggregated results
      'results': {'wins': 0, 'losses': 0, 'draws': 0},
      'results_stale': False,
      'finished': False,
      'approved': False,
      'approver': '',
    }

    # Check for an existing approval matching the git commit SHAs
    def get_approval(sha):
      q = {
        '$or': [{'args.resolved_base': sha}, {'args.resolved_new': sha}],
        'approved': True,
      }
      return self.runs.find_one(q)

    base_approval = get_approval(resolved_base)
    new_approval = get_approval(resolved_new)
    allow_auto = username in ['mcostalba', 'jkiiski', 'glinscott', 'lbraesch']
    if base_approval is not None and new_approval is not None and allow_auto:
      new_run['approved'] = True
      new_run['approver'] = new_approval['approver']

    return self.runs.insert(new_run)

  def get_machines(self):
    machines = []
    for run in self.runs.find({'tasks': {'$elemMatch': {'active': True}}}):
      for task in run['tasks']:
        if task['active']:
          machine = copy.copy(task['worker_info'])
          machine['last_updated'] = task.get('last_updated', None)
          machine['run'] = run
          machine['nps'] = task.get('nps', 0)
          # TODO(glinscott): Temporary - remove once worker version >= 41
          if not isinstance(machine['uname'], basestring):
            machine['uname'] = machine['uname'][0] + machine['uname'][2]
          machines.append(machine)
    return machines

  # Cache runs
  run_cache = {}
  run_cache_lock = threading.Lock()
  timer = None

  def get_run(self, id):
    with self.run_cache_lock:
      id = str(id)
      if id in self.run_cache:
        self.run_cache[id]['rtime'] = time.time()
        return self.run_cache[id]['run']
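      # Cache miss: fall back to the live 'runs' collection, then to the archived
      # 'old_runs' collection.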
      run = self.runs.find_one({'_id': ObjectId(id)})
      if not run:
        run = self.old_runs.find_one({'_id': ObjectId(id)})
      self.run_cache[id] = {
        'rtime': time.time(),
        'ftime': time.time(),
        'run': run,
        'dirty': False,
      }
      return run

  def buffer(self, run, flush):
    # Write-behind cache: flush=True saves the run immediately, otherwise the run
    # is only marked dirty and written out later by flush_buffers().
    with self.run_cache_lock:
      if self.timer is None:
        self.timer = threading.Timer(1.0, self.flush_buffers)
        self.timer.start()
      id = str(run['_id'])
      if flush:
        self.run_cache[id] = {
          'dirty': False,
          'rtime': time.time(),
          'ftime': time.time(),
          'run': run,
        }
        self.runs.save(run)
      else:
        if id in self.run_cache:
          ftime = self.run_cache[id]['ftime']
        else:
          ftime = time.time()
        self.run_cache[id] = {
          'dirty': True,
          'rtime': time.time(),
          'ftime': ftime,
          'run': run,
        }

  def flush_buffers(self):
    with self.run_cache_lock:
      now = time.time()
      old = now + 1
      oldest = None
      for id in self.run_cache.keys():
        if not self.run_cache[id]['dirty']:
          # Evict clean entries that have not been read in the last 10 seconds
          if self.run_cache[id]['rtime'] < now - 10:
            del self.run_cache[id]
        elif self.run_cache[id]['ftime'] < old:
          # Track the dirty entry that has waited longest for a flush
          old = self.run_cache[id]['ftime']
          oldest = id
      if oldest is not None:
        # Roughly once a minute, also deactivate stale tasks before saving
        if int(now) % 60 == 0:
          self.scavenge(self.run_cache[oldest]['run'])
        self.runs.save(self.run_cache[oldest]['run'])
        self.run_cache[oldest]['dirty'] = False
        self.run_cache[oldest]['ftime'] = time.time()
      # Re-arm the one-second flush timer
      self.timer = threading.Timer(1.0, self.flush_buffers)
      self.timer.start()

  def scavenge(self, run):
    # Deactivate tasks whose worker has not reported for 30 minutes
    old = datetime.utcnow() - timedelta(minutes=30)
    for task in run['tasks']:
      if task['active'] and task['last_updated'] < old:
        task['active'] = False

  def get_run_to_build(self):
    return self.runs.find_one({
      'binaries_url': {'$exists': False},
      'finished': False,
      'deleted': {'$exists': False},
    })

  def get_runs(self):
    return list(self.get_unfinished_runs()) + self.get_finished_runs()[0]

  def get_unfinished_runs(self):
    with self.run_lock:
      return self.runs.find({'finished': False},
                            sort=[('last_updated', DESCENDING),
                                  ('start_time', DESCENDING)])

  def get_finished_runs(self, skip=0, limit=0, username='', success_only=False):
    q = {'finished': True, 'deleted': {'$exists': False}}
    if len(username) > 0:
      q['args.username'] = username
    if success_only:
      # This is unfortunate, but the only way we currently have of telling
      # whether a run was successful is the color!
      q['results_info.style'] = '#44EB44'
    c = self.runs.find(q, skip=skip, limit=limit,
                       sort=[('last_updated', DESCENDING)])
    result = [list(c), c.count()]
    if limit != 0 and len(result[0]) != limit:
      c = self.old_runs.find(q, skip=max(0, skip - c.count()),
                             limit=limit - len(result[0]))
      result[0] += list(c)
      result[1] += c.count()
    else:
      result[1] += self.old_runs.find(q).count()
    return result

  def get_results(self, run, save_run=True):
    if not run['results_stale']:
      return run['results']

    results = {'wins': 0, 'losses': 0, 'draws': 0, 'crashes': 0, 'time_losses': 0}
    for task in run['tasks']:
      if 'stats' in task:
        stats = task['stats']
        results['wins'] += stats['wins']
        results['losses'] += stats['losses']
        results['draws'] += stats['draws']
        results['crashes'] += stats['crashes']
        results['time_losses'] += stats.get('time_losses', 0)

    if 'sprt' in run['args'] and 'state' in run['args']['sprt']:
      results['sprt'] = run['args']['sprt']['state']

    run['results_stale'] = False
    run['results'] = results
    if save_run:
      self.buffer(run, True)

    return results

  # Limit concurrent request_task
  task_lock = threading.Lock()
  task_time = 0
  task_runs = None
  worker_task = {}

  def request_task(self, worker_info):
    with self.task_lock:
      return self.sync_request_task(worker_info)

  def sync_request_task(self, worker_info):
    # Check for blocked user or ip
    if self.userdb.is_blocked(worker_info):
      return {'task_waiting': False}

    if time.time() > self.task_time + 60:
      self.task_runs = []
      for r in self.get_unfinished_runs():
        self.task_runs.append(r)
      self.task_runs.sort(key=lambda r: (-r['args']['priority'],
                                         -r['args']['internal_priority'],
                                         r['_id']))
      self.task_time = time.time()

    max_threads = int(worker_info['concurrency'])
    exclusion_list = []

    # Does this worker have a task already? If so, just hand that back
    for runt in self.task_runs:
      key = worker_info['unique_key']
      if key in self.worker_task and self.worker_task[key] == runt['_id']:
        run = self.get_run(runt['_id'])
        task_id = -1
        for task in run['tasks']:
          task_id = task_id + 1
          if task['active'] and task['worker_info'] == worker_info:
            if task['pending']:
              return {'run': run, 'task_id': task_id}
            else:
              # Don't hand back tasks that have been marked as no longer pending
              task['active'] = False
              self.buffer(run, True)

    # We need to allocate a new task, but first check we don't have the same
    # machine already running because multiple connections are not allowed.
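    # Illustrative example (the limit value is hypothetical): if
    # userdb.get_machine_limit() returns 3 for this user and three active tasks
    # already carry this worker's remote_addr, the check below refuses the request
    # with 'hit_machine_limit'.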
    connections = 0
    for run in self.task_runs:
      for task in run['tasks']:
        if task['active'] and task['worker_info']['remote_addr'] == worker_info['remote_addr']:
          connections = connections + 1

    # Allow a few connections, for multiple computers on the same IP
    if connections >= self.userdb.get_machine_limit(worker_info['username']):
      return {'task_waiting': False, 'hit_machine_limit': True}

    # Ok, get a new task that does not require more threads than the available concurrency
    run_found = False
    for runt in self.task_runs:
      run = self.get_run(runt['_id'])
      if run['_id'] not in exclusion_list and run['approved']:
        task_id = -1
        for task in run['tasks']:
          task_id = task_id + 1
          if not task['active'] and task['pending'] and run['args']['threads'] <= max_threads:
            task['active'] = True
            task['last_updated'] = datetime.utcnow()
            task['worker_info'] = worker_info
            run_found = True
            break
      if run_found:
        break

    if not run_found:
      return {'task_waiting': False}

    # Recalculate the internal priority based on the run start date and throughput.
    # Formula: -seconds_since_epoch - allocated_tasks * 3600 * chunk_size * threads / throughput
    # With the default 'throughput = 1000' the priority is unchanged as long as the
    # run plays at a rate of 1000 games / hour.
    if run['args']['throughput'] is not None and run['args']['throughput'] != 0:
      run['args']['internal_priority'] = \
          -time.mktime(run['start_time'].timetuple()) \
          - task_id * 3600 * self.chunk_size * run['args']['threads'] / run['args']['throughput']

    self.buffer(run, True)
    self.worker_task[worker_info['unique_key']] = run['_id']
    return {'run': run, 'task_id': task_id}

  # Create a lock for each active run
  run_lock = threading.Lock()
  active_runs = {}
  purge_count = 0

  def active_run_lock(self, id):
    with self.run_lock:
      self.purge_count = self.purge_count + 1
      if self.purge_count > 100000:
        old = time.time() - 10000
        self.active_runs = dict((k, v) for k, v in self.active_runs.iteritems()
                                if v['time'] >= old)
        self.purge_count = 0
      if id in self.active_runs:
        active_lock = self.active_runs[id]['lock']
        self.active_runs[id]['time'] = time.time()
      else:
        active_lock = threading.Lock()
        self.active_runs[id] = {'time': time.time(), 'lock': active_lock}
      return active_lock

  def update_task(self, run_id, task_id, stats, nps, spsa):
    lock = self.active_run_lock(str(run_id))
    with lock:
      return self.sync_update_task(run_id, task_id, stats, nps, spsa)

  def sync_update_task(self, run_id, task_id, stats, nps, spsa):
    run = self.get_run(run_id)
    if task_id >= len(run['tasks']):
      return {'task_alive': False}
    task = run['tasks'][task_id]
    if not task['active'] or not task['pending']:
      return {'task_alive': False}

    # Guard against incorrect results
    count_games = lambda d: d['wins'] + d['losses'] + d['draws']
    num_games = count_games(stats)
    old_num_games = count_games(task['stats']) if 'stats' in task else num_games
    spsa_games = count_games(spsa) if 'spsa' in run['args'] else 0
    if num_games < old_num_games \
        or (spsa_games > 0 and num_games <= 0) \
        or (spsa_games > 0 and 'stats' in task and num_games <= old_num_games):
      return {'task_alive': False}

    flush = False
    task['stats'] = stats
    task['nps'] = nps
    if num_games >= task['num_games']:
      task['active'] = False
      task['pending'] = False
      flush = True

    update_time = datetime.utcnow()
    task['last_updated'] = update_time
    run['last_updated'] = update_time
    run['results_stale'] = True

    # Update spsa results
    if 'spsa' in run['args'] and spsa_games == spsa['num_games']:
      self.update_spsa(run, spsa)

    # Check if SPRT stopping is enabled
    if 'sprt' in run['args']:
      sprt = run['args']['sprt']
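      # stat_util.SPRT evaluates the sequential probability ratio test on the
      # aggregated W/L/D results: elo0 and elo1 are the Elo values of the two
      # hypotheses, alpha and beta the allowed error probabilities, and drawelo
      # the draw-model parameter; 'finished' and 'state' report whether a bound
      # was crossed and which hypothesis was accepted.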
      sprt_stats = stat_util.SPRT(self.get_results(run, False),
                                  elo0=sprt['elo0'], alpha=sprt['alpha'],
                                  elo1=sprt['elo1'], beta=sprt['beta'],
                                  drawelo=sprt['drawelo'])
      if sprt_stats['finished']:
        run['args']['sprt']['state'] = sprt_stats['state']
        self.stop_run(run_id, run)
        flush = True

    if ('spsa' not in run['args'] or spsa_games == spsa['num_games']
        or num_games >= task['num_games'] or len(spsa['w_params']) < 20):
      self.buffer(run, flush)

    return {'task_alive': task['active']}

  def failed_task(self, run_id, task_id):
    run = self.get_run(run_id)
    if task_id >= len(run['tasks']):
      return {'task_alive': False}
    task = run['tasks'][task_id]
    if not task['active'] or not task['pending']:
      return {'task_alive': False}

    # Mark the task as inactive: it will be rescheduled
    task['active'] = False
    self.buffer(run, True)
    return {}

  def stop_run(self, run_id, run=None):
    save_it = False
    if run is None:
      run = self.get_run(run_id)
      save_it = True

    prune_idx = len(run['tasks'])
    for idx, task in enumerate(run['tasks']):
      is_active = task['active']
      task['active'] = False
      task['pending'] = False
      if 'stats' not in task and not is_active:
        prune_idx = min(idx, prune_idx)
      else:
        prune_idx = idx + 1

    # Truncate the empty tasks
    if prune_idx < len(run['tasks']):
      del run['tasks'][prune_idx:]

    if save_it:
      self.buffer(run, True)
      self.task_time = 0
    return {}

  def approve_run(self, run_id, approver):
    run = self.get_run(run_id)
    # Can't self approve
    if run['args']['username'] == approver:
      return False
    run['approved'] = True
    run['approver'] = approver
    self.buffer(run, True)
    self.task_time = 0
    return True

  def spsa_param_clip_round(self, param, increment, clipping, rounding):
    value = 0.0
    if clipping == 'old':
      value = min(max(param['theta'] + increment, param['min']), param['max'])
    else:  # clipping == 'careful'
      inc = min(abs(increment),
                abs(param['theta'] - param['min']) / 2,
                abs(param['theta'] - param['max']) / 2)
      if inc > 0:
        inc_sgn = 0 if increment == 0 else increment / abs(increment)
        value = param['theta'] + inc_sgn * inc
      else:
        # revert to old behavior to bounce off boundary
        value = min(max(param['theta'] + increment, param['min']), param['max'])

    # 'deterministic' rounding calls round() inside the worker.
    # 'randomized' says 4.p should be 5 with probability p, 4 with probability 1-p,
    # and is continuous (albeit after expectation) unlike round().
    if rounding == 'randomized':
      fl = math.floor(value)  # greatest integer <= value, thus works for negative
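      # e.g. value = 4.3: fl = 4, so the branch below yields 5 with probability 0.3
      # and 4 with probability 0.7, keeping the expectation at 4.3.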
      if random.uniform(0, 1) < value - fl:
        value = fl + 1
      else:
        value = fl

    return value

  def request_spsa(self, run_id, task_id):
    run = self.get_run(run_id)
    if task_id >= len(run['tasks']):
      return {'task_alive': False}
    task = run['tasks'][task_id]
    if not task['active'] or not task['pending']:
      return {'task_alive': False}

    result = {
      'task_alive': True,
      'w_params': [],
      'b_params': [],
    }

    spsa = run['args']['spsa']
    if 'clipping' not in spsa:
      spsa['clipping'] = 'old'
    if 'rounding' not in spsa:
      spsa['rounding'] = 'deterministic'

    # Generate the next set of tuning parameters
    iter_local = spsa['iter'] + 1  # assume at least one completed, and avoid division by zero
    for param in spsa['params']:
      a = param['a'] / (spsa['A'] + iter_local) ** spsa['alpha']
      c = param['c'] / iter_local ** spsa['gamma']
      R = a / c ** 2
      flip = 1 if bool(random.getrandbits(1)) else -1
      result['w_params'].append({
        'name': param['name'],
        'value': self.spsa_param_clip_round(param, c * flip, spsa['clipping'], spsa['rounding']),
        'R': R,
        'c': c,
        'flip': flip,
      })
      result['b_params'].append({
        'name': param['name'],
        'value': self.spsa_param_clip_round(param, -c * flip, spsa['clipping'], spsa['rounding']),
      })

    return result

  def update_spsa(self, run, spsa_results):
    spsa = run['args']['spsa']
    if 'clipping' not in spsa:
      spsa['clipping'] = 'old'

    spsa['iter'] += int(spsa_results['num_games'] / 2)

    # Update the current theta based on the results from the worker.
    # Worker wins/losses are always in terms of w_params.
    result = spsa_results['wins'] - spsa_results['losses']
    summary = []
    for idx, param in enumerate(spsa['params']):
      R = spsa_results['w_params'][idx]['R']
      c = spsa_results['w_params'][idx]['c']
      flip = spsa_results['w_params'][idx]['flip']
      param['theta'] = self.spsa_param_clip_round(param, R * c * result * flip,
                                                  spsa['clipping'], 'deterministic')
      summary.append({
        'theta': param['theta'],
        'R': R,
        'c': c,
      })

    # Record the parameters every 100/1000/10000 iterations, depending on the
    # number of tuned parameters, and stop growing the history at 5001/201/41
    # snapshots (roughly 500k/200k/410k iterations).
    if 'param_history' not in spsa:
      spsa['param_history'] = []
    if len(spsa['params']) < 20:
      freq = 100
      maxlen = 5001
    elif len(spsa['params']) < 50:
      freq = 1000
      maxlen = 201
    else:
      freq = 10000
      maxlen = 41
    if len(spsa['param_history']) < maxlen:
      if len(spsa['param_history']) < spsa['iter'] / freq:
        spsa['param_history'].append(summary)
    elif len(spsa['param_history']) > maxlen:
      spsa['param_history'] = spsa['param_history'][-maxlen:]
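  # Typical SPSA round trip, reconstructed from the methods above (the worker-side
  # behaviour is an assumption, not shown in this file): the worker calls
  # request_spsa(run_id, task_id) to receive w_params and b_params, plays
  # spsa['num_games'] games between the two parameter sets, and reports the outcome
  # via update_task(..., spsa={...}), where the spsa dict echoes back num_games,
  # wins, losses, draws and the w_params (with their R, c and flip), which
  # update_spsa() above uses to move each theta.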