def foo(uid, a, b, c, d, e=1, epoch=0, experiment_name=NAMESPACE, client=None):
    result = a + 2 * b - c**2 + d + e

    if client is None:
        client = new_client(URI, DATABASE)

    # Push one metric message per epoch
    for i in range(epoch + 1):
        data = {'obj': i + result, 'valid': i + result, 'uid': uid, 'epoch': i}
        client.push(METRIC_QUEUE, experiment_name, data, mtype=METRIC_ITEM)

    return result + i
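# Hedged usage sketch, not part of the snippet above: calling foo() pushes one
# metric message per epoch and returns the computed value. URI, DATABASE,
# NAMESPACE and METRIC_QUEUE are assumed to be module-level constants, as in
# the function above; the pop() pattern mirrors test_user_pass further down.
result = foo(uid='trial-0', a=1, b=2, c=3, d=4, epoch=2)
consumer = new_client(URI, DATABASE)
message = consumer.pop(METRIC_QUEUE, NAMESPACE)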
def run(uri, database, namespace, function, num_experiments, num_repro,
        objective, variables, defaults, params, resumable,
        sleep_time=60, save_dir='.'):
    client = new_client(uri, database)

    defaults.update(dict(list(variables.items()) + list(params.items())))

    configs = generate(num_experiments, num_repro, objective,
                       list(sorted(variables)), defaults, resumable)

    register(client, function, namespace, configs)

    wait(client, namespace, sleep=sleep_time)

    data = fetch_results(client, namespace, configs, list(sorted(variables)),
                         params, defaults)

    save_results(namespace, data, save_dir)

    test(data, num_experiments, num_repro, objective, variables, resumable)
def run(uri, database, namespace, function, fidelity, space, count, variables,
        plot_filename, objective, defaults, save_dir='.', sleep_time=60,
        register=True):
    if fidelity is None:
        fidelity = Fidelity(1, 1, name='epoch').to_dict()

    defaults.update(variables)

    config = {
        'name': 'random_search',
        'fidelity': fidelity,
        'space': space,
        'count': count
    }

    client = new_client(uri, database)

    if not is_registered(client, namespace) and register:
        register_hpo(client, namespace, function, config, defaults=defaults)

    while not is_hpo_completed(client, namespace):
        print_status(client, namespace)
        time.sleep(sleep_time)

    # Get the result of the HPO
    print('HPO is done')
    data = fetch_hpo_valid_curves(client, namespace,
                                  list(sorted(variables.keys())))

    save_results(namespace, data, save_dir)

    plot(space, objective, data, plot_filename, model_seed=1)
def my_important_transaction():
    # Nothing is done here, every call on the RecordQueue is lazy
    client = RecordQueue()
    client.do_something_1()
    time.sleep(5)
    client.do_something_2()

    # Our transaction actually runs here, in a single pass
    client.execute(FakeClient(new_client(URI, DATABASE)))
def __init__(self, uri=None, database=None, experiment=None, client=None):
    if ERROR is not None:
        raise ERROR

    self.experiment = experiment

    if client is None:
        client = new_client(uri, database)

    self.client = client
    self.uid = None
def __enter__(self):
    self.server = new_server(uri=self.uri, database=DATABASE)

    try:
        self.server.start(wait=True)
    except Exception as e:
        self.server.stop()
        shutil.rmtree('/tmp/queue/', ignore_errors=True)
        raise e

    self.client = new_client(self.uri, DATABASE, 'client-test')
    self.monitor = self.client.monitor()
    return self
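# Hedged sketch of a matching __exit__, which is not shown above: assuming the
# server object exposes stop() and that /tmp/queue/ holds the broker's on-disk
# state (as in the error path of __enter__), tear both down when the context
# closes.
def __exit__(self, exc_type, exc_value, traceback):
    self.server.stop()
    shutil.rmtree('/tmp/queue/', ignore_errors=True)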
def test_hpo_serializable(model_type):
    namespace = 'test-robo-' + model_type
    n_init = 2
    count = 10

    # First run using a remote worker, where serialization is necessary
    # and where the HPO is resumed between each branin call
    hpo = build_robo(model_type, n_init=n_init, count=count)

    namespace = 'test_hpo_serializable'

    hpo = {
        'hpo': make_remote_call(HPOptimizer, **hpo.kwargs),
        'hpo_state': None,
        'work': make_remote_call(branin),
        'experiment': namespace
    }
    client = new_client(URI, DATABASE)
    client.push(WORK_QUEUE, namespace, message=hpo, mtype=HPO_ITEM)

    worker = TrialWorker(URI, DATABASE, 0, None)
    worker.max_retry = 0
    worker.timeout = 1
    worker.run()

    messages = client.monitor().unread_messages(RESULT_QUEUE, namespace)
    for m in messages:
        if m.mtype == HPO_ITEM:
            break

    assert m.mtype == HPO_ITEM, 'HPO not completed'
    worker_hpo = build_robo(model_type)
    worker_hpo.load_state_dict(m.message['hpo_state'])
    assert len(worker_hpo.trials) == count

    # Then run locally, where the BO is never resumed
    local_hpo = build_robo(model_type, n_init=n_init, count=count)

    i = 0
    best = float('inf')
    while local_hpo.remaining() and i < local_hpo.hpo.count:
        samples = local_hpo.suggest()
        for sample in samples:
            z = branin(**sample)
            local_hpo.observe(sample['uid'], z)
            best = min(z, best)
        i += 1

    assert i == local_hpo.hpo.count

    # Although the remote worker was resumed many times, it should give the
    # same results as the local one, which was executed in a single run.
    assert worker_hpo.trials == local_hpo.trials
def test_check_sigkill_nothing_happened(signal):
    client = new_client(URI, DATABASE)
    client.db[QUEUE].drop()
    client.push(QUEUE, NAMESPACE, {'my_work': 0})

    p = Process(target=my_important_transaction)
    p.start()
    time.sleep(1)
    os.kill(p.pid, signal)

    # Nothing was done, the process died before the transaction
    assert not client.monitor().messages(QUEUE, NAMESPACE)[0].read
    assert not client.monitor().messages(QUEUE, NAMESPACE)[0].actioned
def __init__(self, queue_uri, database, namespace, worker_id, work_queue,
             result_queue=None):
    self.uri = queue_uri
    self.namespace = namespace
    self.client: MessageQueue = new_client(queue_uri, database)
    self.running = False
    self.work_id = worker_id
    self.broker = None
    self.work_queue = work_queue
    self.result_queue = result_queue
    self.context = {}
    self.client.name = f'worker-{self.work_id}'
    self.namespaced = True
    self.timeout = 5 * 60
    self.max_retry = 3
    self.dispatcher = {
        SHUTDOWN: self.shutdown_worker
    }
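# Hedged sketch (hypothetical helper, not part of the class above): the
# dispatcher maps a message type to its handler, so a receive loop could route
# incoming messages roughly like this, failing loudly on unknown types.
def dispatch(self, message):
    handler = self.dispatcher.get(message.mtype)
    if handler is None:
        raise RuntimeError(f'no handler registered for mtype={message.mtype}')
    return handler(message)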
def test_check_sigterm_everything_finished(signal):
    client = new_client(URI, DATABASE)
    client.db[QUEUE].drop()
    client.push(QUEUE, NAMESPACE, {'my_work': 0})

    p = Process(target=my_important_transaction)
    p.start()
    time.sleep(10)

    # Kill during the transaction
    os.kill(p.pid, signal)

    # The process should not die until the transaction is finished
    p.join()

    assert client.monitor().messages(QUEUE, NAMESPACE)[0].read
    assert client.monitor().messages(QUEUE, NAMESPACE)[0].actioned
def run(uri, database, namespace, function, objective, medians, defaults,
        variables, params, num_experiments, add_reference, sleep_time=60,
        save_dir='.'):
    if num_experiments is None:
        num_experiments = 20

    client = new_client(uri, database)

    defaults.update(dict(list(variables.items()) + list(params.items())))

    # First pass: vary the `medians` dimensions, without the reference run
    configs = generate(range(num_experiments), medians, defaults,
                       add_reference=False)
    register(client, function, namespace, configs)

    wait(client, namespace, sleep=sleep_time)

    data = fetch_results(client, namespace, configs, medians, params, defaults)

    # Use the medians of the first pass as the new defaults
    defaults.update(get_medians(data, medians, objective))

    # Second pass: vary the requested sources of variation
    new_configs = generate(range(num_experiments), variables, defaults,
                           add_reference=add_reference)
    register(client, function, namespace, new_configs)

    wait(client, namespace, sleep=5)

    configs.update(new_configs)

    data = fetch_results(client, namespace, configs, variables, params,
                         defaults)

    save_results(namespace, data, save_dir)
def __init__(self, uri, database, experiment, clean=False, launch_server=False):
    self.database = database
    self.uri = uri
    self.broker = None

    # Start a message broker
    if launch_server:
        self.broker = new_server(uri, database)
        self.broker.start()

    self.client = new_client(uri, database)
    self.client.name = 'group-leader'
    self.experiment = experiment
    self.workers = []

    if clean:
        self.clear_queue()
def __init__(self, hpo, rank, uri, experiment,
             database=option('olympus.database', 'olympus')):
    self.hpo = hpo
    self.experiment = experiment
    self.client = new_client(uri, database)
    self.current_message = None

    # Check that the HPO is not already finished
    state = self._fetch_final_state()
    if state is not None:
        raise ExperimentFinished(
            f'Experiment `{experiment}` is finished, change the experiment name')

    # The first worker queues the HPO
    if rank == 0:
        self._queue_hpo()

    # Broadcast that one worker is joining
    self.client.push(RESULT_QUEUE, self.experiment, {}, mtype=WORKER_JOIN)
def main(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--uri', default='mongo://127.0.0.1:27017', type=str)
    parser.add_argument('--database', default='olympus', type=str)
    parser.add_argument('--namespace', type=str)
    options = parser.parse_args(argv)

    client = new_client(options.uri, options.database)

    if options.namespace is None:
        print('Found')
        print(client.db[METRIC_QUEUE].count())
        print(client.db[WORK_QUEUE].count())
        print(client.db[RESULT_QUEUE].count())

        stats = client.db[WORK_QUEUE].aggregate([
            {'$project': {'namespace': 1}},
            {'$group': {'_id': '$namespace'}},
        ])

        stats = sorted(doc['_id'] for doc in stats)

        if not stats:
            print(f'No namespace found for {options.namespace}')
            return 0

        print('\n'.join(stats))

        output = input(
            'Do you want to delete all matching namespaces above? (y/n): ')
        if output != 'y':
            print('Cancel purge')
            return

        client.db[METRIC_QUEUE].drop()
        client.db[WORK_QUEUE].drop()
        client.db[RESULT_QUEUE].drop()

        print(client.db[METRIC_QUEUE].count())
        print(client.db[WORK_QUEUE].count())
        print(client.db[RESULT_QUEUE].count())
    else:
        query = {
            'namespace': {
                '$regex': re.compile(f"^{options.namespace}", re.IGNORECASE)
            }
        }
        stats = client.db[WORK_QUEUE].aggregate([
            {'$match': query},
            {'$project': {'namespace': 1}},
            {'$group': {'_id': '$namespace'}},
        ])

        stats = sorted(doc['_id'] for doc in stats)

        if not stats:
            print(f'No namespace found for {options.namespace}')
            return 0

        print('\n'.join(stats))

        output = input(
            'Do you want to delete all matching namespaces above? (y/n): ')
        if output != 'y':
            print('Cancel purge')
            return

        print('Found')
        print(client.db[METRIC_QUEUE].count(query))
        print(client.db[WORK_QUEUE].count(query))
        print(client.db[RESULT_QUEUE].count(query))

        client.db[METRIC_QUEUE].remove(query)
        client.db[WORK_QUEUE].remove(query)
        client.db[RESULT_QUEUE].remove(query)

        print('Now there is')
        print(client.db[METRIC_QUEUE].count(query))
        print(client.db[WORK_QUEUE].count(query))
        print(client.db[RESULT_QUEUE].count(query))
def test_user_pass():
    uri = f'mongo://*****:*****@127.0.0.1:27017'
    client = new_client(uri, 'test')

    client.push(QUEUE, NAMESPACE, 'test')
    _ = client.pop(QUEUE, NAMESPACE)
def client():
    return new_client(URI, DATABASE)
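# Hedged note: if the helper above is meant as a pytest fixture (an
# assumption, since no decorator appears in the snippet), it would typically
# be registered like this.
import pytest

@pytest.fixture
def client():
    return new_client(URI, DATABASE)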
def main(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--uri', default='mongo://127.0.0.1:27017', type=str)
    parser.add_argument('--database', default='olympus', type=str)
    parser.add_argument('--namespace', default=None, type=str)
    parser.add_argument('--test-only', action='store_true')
    parser.add_argument('--show-errors', action='store_true')
    args = parser.parse_args(argv)

    set_verbose_level(3)

    client = new_client(args.uri, args.database)

    query = {
        'namespace': {
            '$regex': re.compile(f"^{args.namespace}", re.IGNORECASE)
        }
    }
    stats = client.db[WORK_QUEUE].aggregate([
        {'$match': query},
        {'$project': {'namespace': 1}},
        {'$group': {'_id': '$namespace'}},
    ])

    stats = sorted(doc['_id'] for doc in stats)

    if not stats:
        print(f'No namespace found for {args.namespace}')
        return 0

    if len(stats) > 1:
        print('\n'.join(stats))
        print('All these namespaces were found.')
        namespaces = stats
    else:
        namespaces = [args.namespace]

    for namespace in namespaces:
        print()
        print(namespace)
        if args.show_errors:
            show_errors(client, namespace, HPO_ITEM)
            show_errors(client, namespace, WORK_ITEM)

        repair_hpo_duplicates(client, namespace, test_only=args.test_only)
        repair_trials_duplicates(client, namespace, test_only=args.test_only)
        repair_hpo_lost_results(client, args.uri, args.database, namespace,
                                test_only=args.test_only)
        failover_broken(client, namespace, test_only=args.test_only)
def run(uri, database, namespace, function, num_experiments, num_simuls,
        fidelity, space, objective, variables, defaults, num_replicates=None,
        sleep_time=60, do_full_train=False, save_dir='.', seed=1,
        register=True, rep_types=REP_TYPES):
    hpo_budget = 100
    surrogate_budget = 200

    if num_replicates is None:
        num_replicates = num_experiments

    # We use 200 trials to fit the surrogate models (surrogate_budget is 200)
    # but we only need 100 for the ideal runs (hpo_budget is 100).
    # Therefore, since num_simuls is at most half of num_experiments, we can
    # run only (num_experiments / 2) HPO runs and use the first 100 trials and
    # the last 100 trials as two separate ideal runs.
    # This is possible because we are using random search.
    assert (num_experiments % 2) == 0
    assert num_simuls <= (num_experiments / 2)

    num_ideal = num_experiments // 2

    hpo = 'random_search'

    # TODO: for each repetition, vary all sources of variation;
    #       when one HPO is done, create all biased runs and simulations.
    if fidelity is None:
        fidelity = Fidelity(1, 1, name='epoch').to_dict()

    client = new_client(uri, database)

    configs = generate_hpos(
        list(range(num_ideal)), [hpo], surrogate_budget,
        fidelity, space, namespace, defaults)

    to_replicate = get_configs_to_replicate(configs, num_simuls)

    reset_pool_size(configs['random_search'])
    randomize_seeds(configs['random_search'], variables, seed)

    variable_names = list(sorted(variables.keys()))

    hpo_stats = fetch_all_hpo_stats(client, namespace)

    namespaces = register_hpos(
        client, namespace, function, configs, defaults, hpo_stats,
        register=register)
    remainings = namespaces

    data_hpo = defaultdict(dict)
    all_replicates = dict(random_search=dict())
    while sum(remainings.values(), []):
        print_status(client, namespace, namespaces)
        hpos_ready, remainings = fetch_hpos_valid_curves(
            client, remainings, variable_names, data_hpo)

        ready_configs = get_ready_configs(hpos_ready, configs, to_replicate)

        replicates = generate_replicates(
            ready_configs, data_hpo, variables, objective, hpo_budget,
            num_replicates, early_stopping=False, rep_types=rep_types)

        # Default to False so the sleep below still works when registration
        # is disabled
        registered_replicates = False
        if register:
            registered_replicates = register_all_replicates(
                client, function, namespace, replicates)

        if replicates.get('random_search'):
            all_replicates['random_search'].update(replicates['random_search'])

        if sum(remainings.values(), []) and not registered_replicates:
            time.sleep(sleep_time)

    wait(client, namespace, sleep=sleep_time)

    data_replicates = fetch_hpos_replicates(
        client, configs, all_replicates, variable_names, space, rep_types)

    # Save valid results
    data = consolidate_results(data_hpo, data_replicates, rep_types)
    save_results(namespace, data, save_dir)
def client():
    return new_client('mongo://127.0.0.1:27017', 'olympus')
def run(uri, database, namespace, function, num_experiments, budget, fidelity,
        space, objective, variables, defaults, sleep_time=60,
        do_full_train=False, save_dir='.', partial=False, register=True):
    # TODO: Add hyperband
    hpos = ['grid_search', 'nudged_grid_search', 'noisy_grid_search',
            'random_search', 'bayesopt']

    if fidelity is None:
        fidelity = Fidelity(1, 1, name='epoch').to_dict()

    # TODO: Add back when hyperband is implemented
    # if fidelity['min'] == fidelity['max']:
    #     hpos.remove(hpos.index('hyperband'))

    if num_experiments is None:
        num_experiments = 2

    client = new_client(uri, database)

    hpo_stats = fetch_all_hpo_stats(client, namespace)

    configs = generate_hpos(
        list(range(num_experiments)), hpos, budget,
        fidelity, space, namespace, defaults)

    variable_names = list(sorted(variables.keys()))

    if partial:
        namespaces = defaultdict(list)
        for hpo, hpo_configs in configs.items():
            for hpo_namespace, config in hpo_configs.items():
                namespaces[hpo].append(hpo_namespace)

        data = defaultdict(dict)
        fetch_hpos_valid_curves(client, namespaces, variable_names, data,
                                partial=True)

        data = consolidate_results(data)
        save_results(namespace, data, save_dir)
        return

    namespaces = register_hpos(
        client, namespace, function, configs,
        dict(list(variables.items()) + list(defaults.items())),
        hpo_stats, register)
    remainings = namespaces

    print_status(client, namespace, namespaces)

    data = defaultdict(dict)
    while sum(remainings.values(), []):
        hpos_ready, remainings = fetch_hpos_valid_curves(
            client, remainings, variable_names, data)

        # TODO: Implement the full-train part
        if do_full_train:
            configs = generate_tests(data, defaults, registered)
            new_registered_tests = register_tests(client, namespace, function,
                                                  configs)

        if not sum(hpos_ready.values(), []):
            print_status(client, namespace, namespaces)
            time.sleep(sleep_time)

    # Save valid results
    data = consolidate_results(data)
    save_results(namespace, data, save_dir)

    if not do_full_train:
        return

    # TODO: Implement the full-train part
    wait(completed)  # take the sum of all hpo_namespaces

    # NOTE & TODO: This should follow the same format as valid results, but we
    # need to make sure the ordering of trials in the mapping is the same.
    data = fetch_results(client, namespace, namespaces)

    # Save test results
    save_results(namespace, data, save_dir)