def observe(self, hpo):
    """Drain the result queue and forward every result to the HPO.

    Messages are popped one at a time until the queue is empty.
    ``RESULT_ITEM`` messages are passed to ``hpo.observe``, ``WORKER_JOIN``
    and ``WORKER_LEFT`` adjust ``self.worker_count`` and anything else is
    only logged. A result whose trial is unknown to the HPO is logged and
    left un-actioned; every other message is marked as actioned.

    Returns the number of results that were successfully observed.
    """
    debug('observe')
    new_results = 0

    while True:
        m = self.pop_result()
        if m is None:
            break

        if m.mtype == RESULT_ITEM:
            info(f'HPO {self.experiment} observed {m.message[0]["uid"]}')
            try:
                hpo.observe(m.message[0], m.message[1])
            except TrialDoesNotExist as e:
                warning(f'Could not observe trial: {e}')
                # Leave the message un-actioned and move on to the next one
                continue
            else:
                new_results += 1

        elif m.mtype == WORKER_JOIN:
            self.worker_count += 1

        elif m.mtype == WORKER_LEFT:
            self.worker_count -= 1

        else:
            debug(f'Received: {m}')

        self.future_client.mark_actioned(RESULT_QUEUE, m)

    return new_results
def run_hpo(self, message, _):
    """Run one HPO step for the namespace of ``message``.

    The HPO is rebuilt from ``message.message['hpo']``, restored from
    ``'hpo_state'`` when present, and stepped once through an ``HPOManager``.
    The per-namespace backoff counter is reset when new trials were queued
    and incremented (capped at 8) otherwise.

    Returns the operations recorded by the manager: the future work that has
    to be done before this task can be marked as complete.
    """
    state = message.message
    namespace = message.namespace
    info(f'Starting (hpo: {namespace})')

    # Instantiate the HPO and restore its previous state, if any
    hpo = exec_remote_call(state['hpo'])
    hpo_state = state.get('hpo_state')
    if hpo_state is not None:
        hpo.load_state_dict(hpo_state)

    current_backoff = self.backoff.get(namespace, 0)
    manager = HPOManager(self.client, state, current_backoff)
    new_results, new_trials = manager.step(hpo)

    # Cap to 5 minutes sleep (2 ** 8)
    self.backoff[namespace] = (
        0 if new_trials else min(self.backoff.get(namespace, 0) + 1, 8)
    )

    info(
        f'HPO read (results: {new_results}) and queued (trials: {new_trials})'
    )

    return manager.recorded_operations()
def __init__(self, uri, database, id, experiment=None, hpo_allowed=True,
             work_allowed=True, log_capture=False):
    """Set up a worker that reads ``WORK_QUEUE`` and writes ``RESULT_QUEUE``.

    ``work_allowed`` and ``hpo_allowed`` control whether handlers for
    ``WORK_ITEM`` and ``HPO_ITEM`` messages are registered. ``log_capture``
    is forwarded to ``self.client.capture``. When no experiment is given the
    ``SHUTDOWN`` handler is replaced by one that only prints a notice.
    """
    super(TrialWorker, self).__init__(
        uri, database, experiment, id, WORK_QUEUE, RESULT_QUEUE)

    self.namespaced = experiment is not None
    self.client.capture = log_capture

    # Message handlers
    if work_allowed:
        self.new_handler(WORK_ITEM, self.run_trial)
    if hpo_allowed:
        self.new_handler(HPO_ITEM, self.run_hpo)
    self.new_handler(WORKER_JOIN, self.ignore_message)

    # Tunables read from the options
    self.timeout = option('worker.timeout', 5 * 60, type=int)
    self.max_retry = option('worker.max_retry', 3, type=int)
    self.backoff = {}

    # Disable shutting down when receiving shut down
    if not self.namespaced:
        info(f'Disabling message shutdown because {experiment}')

        def _ignore_shutdown(*args, **kwargs):
            return print('ignoring shutdown signal')

        self.dispatcher[SHUTDOWN] = _ignore_shutdown
def suggest(self, depth=0):
    """Pop an item from the work queue, blocking until a trial is available.

    ``HPO_ITEM`` messages are executed through ``self.run_hpo`` and
    unsupported messages are logged; in both cases the queue is polled
    again. The retries are done in a loop rather than by calling
    ``suggest`` recursively, so a long wait cannot exhaust the interpreter
    recursion limit.

    Args:
        depth: number of messages already consumed without getting a trial;
            any value above zero makes the call sleep one second before
            polling.

    Returns:
        A single-element list holding the kwargs of the popped ``WORK_ITEM``.

    Raises:
        OptimizationIsDone: when a ``SHUTDOWN`` message is received, after
            ``WORKER_LEFT`` has been pushed to the result queue.
    """
    # NOTE: there is no cap on the number of retries; a cap raising
    # WaitingForTrials after 10 attempts used to live here but is disabled.
    while True:
        if depth > 0:
            time.sleep(1)

        # Poll until the queue hands us a message
        m = None
        while m is None:
            m = self.client.pop(WORK_QUEUE, self.experiment)

            if m is None:
                time.sleep(0.001)

        if m.mtype == HPO_ITEM:
            self.run_hpo(m)

        elif m.mtype == WORK_ITEM:
            self.current_message = m
            return [m.message['kwargs']]

        elif m.mtype == SHUTDOWN:
            self.client.push(RESULT_QUEUE, self.experiment, {}, mtype=WORKER_LEFT)
            raise OptimizationIsDone()

        else:
            info(f'Received unsupported message {m}')

        depth += 1
def result(self):
    """Return the HPO result computed from the final stored HPO state.

    Returns None (after logging) when ``_fetch_final_state`` finds nothing.
    """
    final = self._fetch_final_state()

    if final is not None:
        payload = final.message
        self.hpo.load_state_dict(payload['hpo_state'])
        return self.hpo.result()

    info('No HPO_ITEM message found')
    return None
def single_gpu_launch(task_name, script_args, job_env, device_id, rank, world_size, port):
    """Launch the task for a given GPU.

    The child is started from an argument list (no shell), so a script path,
    interpreter path or argument containing spaces or shell metacharacters is
    passed through untouched. The GPU is selected by setting
    ``CUDA_VISIBLE_DEVICES`` in the child's environment.

    Args:
        task_name: name (without ``.py``) of the script located next to this file.
        script_args: iterable of command line arguments, one token per item.
        job_env: environment mapping for the child; None inherits the current
            environment.
        device_id: value exported as ``CUDA_VISIBLE_DEVICES``.
        rank, world_size, port: not used by this launcher.

    Returns:
        The ``subprocess.Popen`` handle of the launched script.
    """
    info(f'Launching job on (device: {device_id})')

    script = f'{os.path.dirname(__file__)}/{task_name}.py'

    # Copy so that the caller's mapping (or os.environ) is never modified
    env = dict(os.environ if job_env is None else job_env)
    env['CUDA_VISIBLE_DEVICES'] = str(device_id)

    cmd = [sys.executable, '-u', script]
    cmd.extend(script_args)

    return subprocess.Popen(cmd, env=env)
def launch_workers(self, count, namespaced=True):
    """Start ``count`` async workers and keep them in ``self.workers``.

    Workers are bound to ``self.experiment`` unless ``namespaced`` is false,
    in which case they are created without a namespace.
    """
    info('starting workers')

    experiment = self.experiment
    namespace = experiment if namespaced else None

    for w in range(count):
        worker = TrialWorker.async_worker(self.uri, self.database, w, namespace)
        self.workers.append(worker)
def run_trial(self, message, context):
    """Run a trial and return its result.

    ``experiment_name`` (from ``context['namespace']``) and ``client`` are
    injected into the trial kwargs for the duration of the call only. They
    are removed again even when the trial raises, so the message never keeps
    a reference to the live client.

    Returns:
        A ``(kwargs, result)`` tuple: the trial kwargs without the injected
        entries and the value returned by the remote call.
    """
    state = message.message
    uid = state['kwargs']['uid']
    info(f'Starting (trial: {uid})')

    state['kwargs']['experiment_name'] = context['namespace']
    state['kwargs']['client'] = self.client

    try:
        result = exec_remote_call(state)
    finally:
        # Tolerate a missing key so that the cleanup can never mask the
        # exception raised by the trial itself
        state['kwargs'].pop('experiment_name', None)
        state['kwargs'].pop('client', None)

    info(f'Finished (trial: {uid}) with (objective: {result:.5f})')
    return state['kwargs'], result
def split(datasets, data_size, seed, ratio, index, balanced):
    """Return the split already defined by the dataset itself.

    The first ``train_size`` indices are the training set, the next
    ``valid_size`` the validation set and the rest the test set. The
    remaining arguments are accepted but not used.
    """
    n_train = datasets.train_size
    n_valid = datasets.valid_size
    n_test = datasets.test_size
    n_points = len(datasets)

    # Boundaries between the three consecutive index ranges
    valid_start = n_train
    test_start = n_train + n_valid

    assert n_points == test_start + n_test

    info('Using the original split')

    train_range = range(valid_start)
    valid_range = range(valid_start, test_start)
    test_range = range(test_start, n_points)
    return Split(train=train_range, valid=valid_range, test=test_range)
def local_multigpu_launch(task_name, script_args, job_env, device_id, rank, world_size, port):
    """Launch the task using multiple GPUs.

    The child is started from an argument list (no shell), so a script path,
    interpreter path or argument containing spaces or shell metacharacters is
    passed through untouched. The GPU is selected by setting
    ``CUDA_VISIBLE_DEVICES`` in the child's environment.

    Args:
        task_name: name (without ``.py``) of the script located next to this file.
        script_args: iterable of command line arguments, one token per item;
            appended after the distributed options.
        job_env: environment mapping for the child; None inherits the current
            environment.
        device_id: value exported as ``CUDA_VISIBLE_DEVICES``.
        rank: forwarded to the script as ``--rank``.
        world_size: forwarded to the script as ``--world-size``.
        port: local port used to build the ``--dist-url`` value.

    Returns:
        The ``subprocess.Popen`` handle of the launched script.
    """
    info(f'Launching job on (device: {device_id})')

    script = f'{os.path.dirname(__file__)}/{task_name}.py'

    # Copy so that the caller's mapping (or os.environ) is never modified
    env = dict(os.environ if job_env is None else job_env)
    env['CUDA_VISIBLE_DEVICES'] = str(device_id)

    cmd = [
        sys.executable, '-u', script,
        '--rank', str(rank),
        '--world-size', str(world_size),
        '--dist-url', f'nccl:tcp://localhost:{port}',
    ]
    cmd.extend(script_args)

    return subprocess.Popen(cmd, env=env)
def safe_load(self, name, device):
    """Handles a few common exceptions for you and returns None if a file is not found.

    Returns:
        Whatever ``self.load`` returns, or None when the file does not exist
        or loading failed with a ``RuntimeError`` that is not a device
        mismatch. Such a failure is logged instead of being dropped silently.

    Raises:
        KeyboardInterrupt: when the state was saved on a GPU and the current
            machine is CPU-only.
    """
    try:
        return self.load(name, device=device)

    except FileNotFoundError:
        info(f'State file {name} was not found')

    except RuntimeError as e:
        # This error happens when there is a mismatch between save device and current device
        if 'CPU-only machine' in str(e):
            raise KeyboardInterrupt('Job got scheduled on bad node.') from e

        # Any other failure keeps returning None, but leaves a trace in the logs
        info(f'State file {name} could not be loaded: {e}')

    return None
def kill_idle_worker(self, hpo):
    """Queue one ``SHUTDOWN`` message per worker that exceeds the remaining work.

    One spare worker is kept on top of ``hpo.remaining()``.
    """
    remaining = hpo.remaining()
    worker = self.worker_count

    # Keep a spare worker
    excess = worker - (remaining + 1)
    kill_worker = 0 if excess < 0 else excess

    info(
        f'killing {kill_worker} workers because (worker: {worker}) > (remaining: {remaining}) '
    )

    for _ in range(kill_worker):
        self.future_client.push(WORK_QUEUE, self.experiment, {}, mtype=SHUTDOWN)
def save(self, task):
    """Checkpoint ``task``, at most once per ``self.time_buffer`` seconds.

    The state is ``state_dict(task)`` plus the RNG states stored under
    ``'rng'``. It is written to storage under ``self.uid`` only when more
    than ``self.time_buffer`` seconds have elapsed since ``self.last_save``;
    otherwise it is kept in ``self.pending`` together with its ``is_best``
    flag. When ``self.keep_best`` is set, the best state seen so far is kept
    under a separate storage name (``self.best_name``).

    Raises:
        BadCheckpoint: if ``self.uid`` is None.
    """
    if self.uid is None:
        raise BadCheckpoint('No uid was given cannot save state')

    was_saved = False
    state = state_dict(task)
    state['rng'] = get_rng_states()

    # Was enough time passed since last save
    now = datetime.utcnow()
    elapsed = now - self.last_save
    should_save = elapsed.total_seconds() > self.time_buffer

    # Is it the best model we have seen so far
    # (without a keep_best criterion every state counts as the best)
    is_best = True
    if self.keep_best is not None:
        is_best = self.keep_best(task.metrics.value())

    # NOTE(review): 'rng' was just added above, so if state_dict returns a plain
    # dict this condition is always true and the else branch never runs -- confirm
    if state:
        # Current model is not the best and we did not save the last model in a different path
        # (which is the best right now)
        # So we need to move the last state so it does not get overridden by current state
        if not is_best and self.best_name is None:
            info(f'Saving best ({self.keep_best.metric}: {self.keep_best.best})')
            self.best_name = self.new_best_name()

            # presumably save_pending reports whether a pending state was written -- verify
            was_pending = self.save_pending()
            if not was_pending:
                self.storage.rename(self.uid, self.best_name)

        if should_save:
            was_saved = self.storage.save(self.uid, state)
            self.save_pending()
            self.pending = None
            self.last_save = datetime.utcnow()
        else:
            # Too early to write again: keep the state around as pending
            self.save_pending()
            self.pending = (is_best, state)

        # we have a new best and the best was saved as with a different filename
        # So we need to change both the best state and the latest state
        if is_best and self.best_name is not None:
            info(f'New best ({self.keep_best.metric}: {self.keep_best.best})')

            self.storage.remove(self.best_name)
            self.best_name = self.new_best_name()

            was_pending = self.save_pending()
            if not was_pending:
                self.storage.copyfile(self.uid, self.best_name)

    else:
        warning('The state dictionary was empty!')

    if was_saved:
        info('Checkpoint saved')
        return

    info('Skipped Checkpoint')
def run_hpo(self, message):
    """Rebuild the HPO described by ``message`` and run one manager step.

    The HPO is stored in ``self.hpo`` and restored from ``'hpo_state'`` when
    the message carries one.
    """
    state = message.message

    # Instantiate HPO
    self.hpo = exec_remote_call(state['hpo'])

    saved_state = state.get('hpo_state')
    if saved_state is not None:
        self.hpo.load_state_dict(saved_state)

    manager = HPOManager(self.client, state)
    counts = manager.step(self.hpo)
    new_results, new_trials = counts

    info(
        f'HPO read (results: {new_results}) and queued (trials: {new_trials})'
    )
def suggest(self, hpo):
    """Queue one work item per trial suggested by the HPO.

    Each work item is a deep copy of ``self.work`` whose ``'kwargs'`` are
    replaced by the trial. Returns the number of trials queued, or 0 when
    ``_maybe_suggest`` returns None.
    """
    debug('suggest')

    trials = self._maybe_suggest(hpo, **self.work['kwargs'])

    if trials is not None:
        for trial in trials:
            new_work = copy.deepcopy(self.work)
            new_work['kwargs'] = trial

            info(f'HPO {self.experiment} suggested {trial["uid"]}')
            self.future_client.push(
                WORK_QUEUE, self.experiment, new_work, mtype=WORK_ITEM)

        return len(trials)

    return 0
def __call__(self, input_size, output_size, attention_probs_dropout_prob, hidden_dropout_prob):
    """Build a ``bert-base-uncased`` model with the given dropout rates.

    The pretrained files are cached in the folder given by the
    ``model.cache`` option. ``input_size`` and ``output_size`` are accepted
    but not used; the configuration is created with two labels.
    """
    pretrained_name = 'bert-base-uncased'

    cache_dir = option('model.cache', '/tmp/olympus/cache')
    info('model cache folder: {}'.format(cache_dir))

    config = BertConfig.from_pretrained(
        pretrained_name,
        num_labels=2,
        finetuning_task=self.task,
        cache_dir=cache_dir)

    # Override the dropout rates of the pretrained configuration
    config.attention_probs_dropout_prob = attention_probs_dropout_prob
    config.hidden_dropout_prob = hidden_dropout_prob

    return BertWrapper.from_pretrained(
        pretrained_name,
        from_tf=False,
        config=config,
        cache_dir=cache_dir)
def step(self, hpo):
    """Observe new results, suggest new trials and re-queue the HPO.

    When the HPO is done the workers are shut down, the HPO is queued on the
    result queue and ``(0, 0)`` is returned. Otherwise idle workers are
    killed, the call sleeps ``2 ** self.backoff`` seconds if no trial was
    suggested, and the HPO is queued again when the state holds an
    ``'hpo_state'`` entry.

    Returns a ``(new_results, new_trials)`` tuple.
    """
    new_results = self.observe(hpo)
    new_trials = self.suggest(hpo)

    if hpo.is_done():
        self.shutdown()
        # Queue the HPO but this time in the result queue
        self.queue_hpo(hpo, RESULT_QUEUE)
        return 0, 0

    self.kill_idle_worker(hpo)

    nothing_suggested = new_trials == 0
    if nothing_suggested:
        info(f'HPO sleeping {2 ** self.backoff} seconds')
        time.sleep(2 ** self.backoff)

    if 'hpo_state' in self.state:
        self.queue_hpo(hpo)

    return new_results, new_trials
def build(input_size, output_size):
    """Build a MobileNetV2 adapted to the given input size.

    Supported input sizes are ``(1, 28, 28)``, ``(3, 32, 32)`` and
    ``(3, 64, 64)``.

    Raises:
        ValueError: if no architecture is defined for ``input_size``.
    """
    cfg = [[1, 16, 1, 1],
           [6, 24, 2, 1],
           [6, 32, 3, 2],
           [6, 64, 4, 2],
           [6, 96, 3, 1],
           [6, 160, 3, 2],
           [6, 320, 1, 1]]

    if input_size == (1, 28, 28):
        info('Using MobileNetV2 architecture for MNIST')
        conv = {'kernel_size': 3, 'stride': 1, 'padding': 1}
        avgpool = {'kernel_size': 4}

    elif input_size == (3, 32, 32):
        info('Using MobileNetV2 architecture for CIFAR10/100')
        conv = {'kernel_size': 3, 'stride': 1, 'padding': 1}
        avgpool = {'kernel_size': 4}

    elif input_size == (3, 64, 64):
        info('Using MobileNetV2 architecture for TinyImageNet')
        conv = {'kernel_size': 3, 'stride': 2, 'padding': 1}
        avgpool = {'kernel_size': 2}
        # Larger images: change the last entry of the second block to 2
        cfg[1][-1] = 2

    # TODO: Add support for ImageNet
    else:
        # Fail with a clear message instead of an UnboundLocalError on `conv`
        raise ValueError(
            'There is no MobileNetV2 architecture for an input size {}'.format(
                input_size))

    return MobileNetV2(cfg, input_size, num_classes=output_size, conv=conv, avgpool=avgpool)
def __init__(self, layers, input_size, num_classes, batch_norm):
    """Build the VGG feature extractor and classifier for ``input_size``.

    Supported input sizes are ``(1, 28, 28)``, ``(3, 32, 32)`` and
    ``(3, 64, 64)``; any other size raises ``ValueError``.
    """
    super(VGG, self).__init__()

    # in_features: input width of the classifier
    # hidden: width of its hidden layers, None for a single linear layer
    if input_size == (1, 28, 28):
        info('Using VGG architecture for MNIST')
        in_features, hidden = 512, None
        layers = layers[:-1]  # Drop last maxpool

    elif input_size == (3, 32, 32):
        info('Using VGG architecture for CIFAR10/100')
        in_features, hidden = 512, None

    elif input_size == (3, 64, 64):
        info('Using VGG architecture for TinyImageNet')
        in_features, hidden = 2048, 1024

    # TODO: Add support for ImageNet
    else:
        raise ValueError(
            'There is no VGG architecture for an input size {}'.format(
                input_size))

    self.features = self.make_layers(input_size[0], layers, batch_norm)

    if not hidden:
        self.classifier = nn.Linear(in_features, num_classes)
    else:
        stack = [
            nn.Linear(in_features, hidden),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(hidden, hidden),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(hidden, num_classes),
        ]
        self.classifier = nn.Sequential(*stack)

    self._initialize_weights()
def __init__(self, input_size, num_classes):
    """Build a LeNet for the given input size.

    Args:
        input_size: ``(channels, height, width)``; one of ``(1, 28, 28)``,
            ``(3, 32, 32)`` or ``(3, 64, 64)``.
        num_classes: number of outputs, or a shape whose product is the
            number of outputs.

    Raises:
        ValueError: if no architecture is defined for ``input_size``.
    """
    super(LeNet, self).__init__()

    if not isinstance(num_classes, int):
        # numpy.product was removed in NumPy 2.0; numpy.prod is the same function
        num_classes = numpy.prod(num_classes)

    n_channels = input_size[0]
    size = tuple(input_size)

    # Only the pooling window and the flattened feature size depend on the
    # input size; the layers themselves are identical for every dataset.
    if size == (1, 28, 28):
        info('Using LeNet architecture for MNIST')
        pool, n_features = 2, 50 * 4 * 4

    elif size == (3, 32, 32):
        info('Using LeNet architecture for CIFAR10/100')
        pool, n_features = 2, 50 * 5 * 5

    elif size == (3, 64, 64):
        info('Using LeNet architecture for TinyImageNet')
        pool, n_features = 3, 50 * 5 * 5

    else:
        raise ValueError(
            'There is no LeNet architecture for an input size {}'.format(
                input_size))

    self.conv1 = nn.Conv2d(n_channels, 20, 5, 1)
    self.pool1 = nn.MaxPool2d(pool, pool)
    self.conv2 = nn.Conv2d(20, 50, 5, 1)
    self.pool2 = nn.MaxPool2d(pool, pool)
    self.fc1 = nn.Linear(n_features, 500)
    self.fc2 = nn.Linear(500, num_classes)
def build(block, cfg, input_size, output_size):
    """Build a ResNet adapted to the given input size.

    Args:
        block: residual block forwarded to ``ResNet``.
        cfg: layer configuration forwarded to ``ResNet``.
        input_size: one of ``(1, 28, 28)``, ``(3, 32, 32)`` or ``(3, 64, 64)``.
        output_size: number of outputs, or a shape whose product is the
            number of outputs.

    Raises:
        ValueError: if no architecture is defined for ``input_size``.
    """
    if not isinstance(output_size, int):
        # numpy.product was removed in NumPy 2.0; numpy.prod is the same function
        output_size = numpy.prod(output_size)

    # The log messages name the model actually built below (ResNet)
    if input_size == (1, 28, 28):
        info('Using ResNet architecture for MNIST')
        conv = {'kernel_size': 3, 'stride': 1, 'padding': 1}
        avgpool = {'kernel_size': 4}
        maxpool = {}

    elif input_size == (3, 32, 32):
        info('Using ResNet architecture for CIFAR10/100')
        conv = {'kernel_size': 3, 'stride': 1, 'padding': 1}
        avgpool = {'kernel_size': 4}
        maxpool = {}

    elif input_size == (3, 64, 64):
        info('Using ResNet architecture for TinyImageNet')
        conv = {'kernel_size': 7, 'stride': 2, 'padding': 3}
        avgpool = {'kernel_size': 2}
        maxpool = {'kernel_size': 3, 'stride': 2, 'padding': 1}

    # TODO: Add Resnet for ImageNet (3, 224, 224)!
    else:
        # Fail with a clear message instead of an UnboundLocalError on `conv`
        raise ValueError(
            'There is no ResNet architecture for an input size {}'.format(
                input_size))

    model = ResNet(block, cfg, input_size=input_size, conv=conv, maxpool=maxpool,
                   avgpool=avgpool, num_classes=output_size)
    return model
def on_new_trial(self, task, step, parameters, uid):
    """On new trial try to resume the new trial.

    The trial id is ``parameters['uid']`` when present, else ``uid``, else an
    id derived from the task class name and the parameters. If a state is
    stored under that id the task and RNG states are restored; otherwise the
    trial metadata is saved and, when ``self.save_init`` is set, the initial
    task state is saved under ``init_<uid>``.
    """
    # Make a unique id for resuming
    self.uid = parameters.get('uid', uid)
    if self.uid is None:
        self.uid = unique_trial_id(task.__class__.__name__, parameters)

    state = self.storage.safe_load(self.uid, device=task.device)

    if state is not None:
        # A previous state exists: resume from it
        set_rng_states(state['rng'])
        load_state_dict(task, state)
        info(f'Resuming (trial_id: {self.uid})')
        return

    # No previous state: this is a fresh trial
    meta = {'parameters': parameters, 'task': type(task).__name__}
    self.storage.save_meta(self.uid, meta)
    info(f'Starting a new (trial_id: {self.uid})')

    if self.save_init:
        # The RNG states are not part of the initial state
        initial_state = state_dict(task)
        self.storage.save(f'init_{self.uid}', initial_state)
def build(block, cfg, input_size, output_size):
    """Build a PreActResNet adapted to the given input size.

    Args:
        block: residual block forwarded to ``PreActResNet``.
        cfg: layer configuration forwarded to ``PreActResNet``.
        input_size: one of ``(1, 28, 28)``, ``(3, 32, 32)`` or ``(3, 64, 64)``.
        output_size: forwarded to ``PreActResNet`` as ``num_classes``.

    Raises:
        ValueError: if no architecture is defined for ``input_size``.
    """
    if input_size == (1, 28, 28):
        info('Using PreActResNet architecture for MNIST')
        conv = {'kernel_size': 3, 'stride': 1, 'padding': 1}
        avgpool = {'kernel_size': 4}
        maxpool = {}

    elif input_size == (3, 32, 32):
        info('Using PreActResNet architecture for CIFAR10/100')
        conv = {'kernel_size': 3, 'stride': 1, 'padding': 1}
        avgpool = {'kernel_size': 4}
        maxpool = {}

    elif input_size == (3, 64, 64):
        info('Using PreActResNet architecture for TinyImageNet')
        conv = {'kernel_size': 7, 'stride': 2, 'padding': 3}
        avgpool = {'kernel_size': 2}
        maxpool = {'kernel_size': 3, 'stride': 2, 'padding': 1}

    else:
        # Fail with a clear message instead of an UnboundLocalError on `conv`
        raise ValueError(
            'There is no PreActResNet architecture for an input size {}'.format(
                input_size))

    return PreActResNet(block, cfg, input_size=input_size, num_classes=output_size,
                        conv=conv, maxpool=maxpool, avgpool=avgpool)
def wait(self):
    """Join and close every worker in ``self.workers``, logging each one."""
    pending = self.workers

    for w in pending:
        # Block until the worker is finished, then close it
        w.join()
        w.close()

        info(f'joining worker{w}')