def server_handle_child_message(
        msg_output, controller, mi_info, options, n_idle, curr_iter):
    """
    Petridish server handles the return message of a forked process
    that watches over a child job.
    """
    log_dir_root = logger.get_logger_dir()
    q_parent, q_hallu = controller.q_parent, controller.q_hallu
    model_str, model_iter, _parent_iter, search_depth = msg_output

    # Record performance in the main log
    jr = parse_remote_stop_file(_mi_to_dn(log_dir_root, model_iter))
    if jr is None:
        # job failure: reap the virtual resource and move on.
        logger.info('Failed mi={}'.format(model_iter))
        return curr_iter
    fp, ve, te = jr['fp'], jr['ve'], jr['te']
    logger.info('CHILD : mi={} val_err={} test_err={} Gflops={}'.format(
        model_iter, ve, te, fp * 1e-9))
    mi_info[model_iter].ve = ve
    mi_info[model_iter].fp = fp

    if (search_depth // 2 < options.max_growth and
            (options.search_max_flops is None or
             fp < options.search_max_flops)):
        controller.add_one_to_queue(q_parent, mi_info, model_iter, None)

    if q_parent.size() > 0:
        # choose a parent.
        pqe = controller.choose_parent(q_parent, mi_info)
        model_str, model_iter, _parent_iter, search_depth = pqe
        logger.info('PARENT : mi={}'.format(model_iter))
        # Create hallucinations on the parent
        net_info_parent = net_info_from_str(model_str)
        n_hallu_per_parent = max(
            1, min(controller.n_hallu_per_parent_on_idle, n_idle))
        for _ in range(n_hallu_per_parent):
            net_info = copy.deepcopy(net_info_parent)
            hallus = net_info.sample_hallucinations(
                layer_ops=controller.valid_operations,
                merge_ops=controller.merge_operations,
                prob_at_layer=None,
                min_num_hallus=options.n_hallus_per_init,
                hallu_input_choice=options.hallu_input_choice)
            net_info = net_info.add_hallucinations(
                hallus,
                final_merge_op=controller.hallu_final_merge_op,
                stop_gradient_val=controller.stop_gradient_val,
                hallu_gate_layer=controller.hallu_gate_layer)
            # Update mi_info
            curr_iter += 1
            hallu_str = net_info.to_str()
            mi_info.append(ModelSearchInfo(
                curr_iter, model_iter, search_depth + 1,
                None, None, hallu_str))
            controller.add_one_to_queue(
                q_hallu, mi_info, curr_iter, net_info)
    return curr_iter
def server_handle_child_message_soft_vs_hard(
        msg_output, controller, mi_info, options, n_idle, curr_iter):
    """
    Special replacement of server_handle_child_message for experimenting
    on soft init vs. hard init. This is for experiment only.

    TODO reuse code with regular server_handle_child_message?
    """
    log_dir_root = logger.get_logger_dir()
    q_parent, q_hallu = controller.q_parent, controller.q_hallu
    model_str, model_iter, _parent_iter, search_depth = msg_output

    # Record performance in the main log
    jr = parse_remote_stop_file(_mi_to_dn(log_dir_root, model_iter))
    if jr is None:
        # job failure: reap the virtual resource and move on.
        logger.info('Failed mi={}'.format(model_iter))
        return curr_iter
    fp, ve, te = jr['fp'], jr['ve'], jr['te']
    logger.info('CHILD : mi={} val_err={} test_err={} Gflops={}'.format(
        model_iter, ve, te, fp * 1e-9))
    mi_info[model_iter].ve = ve
    mi_info[model_iter].fp = fp

    if search_depth > 0:
        return curr_iter

    # for soft vs hard experiment, only root generates hallu.
    controller.n_hallu_per_parent_on_idle = 1
    controller.add_one_to_queue(q_parent, mi_info, model_iter, None)

    if q_parent.size() > 0:
        # choose a parent.
        pqe = controller.choose_parent(q_parent, mi_info)
        model_str, model_iter, _parent_iter, search_depth = pqe
        logger.info('PARENT : mi={}'.format(model_iter))
        # Create hallucinations on the parent
        net_info_parent = net_info_from_str(model_str)
        # this experiment only creates one hallu set from the root
        hallus = net_info_parent.sample_hallucinations(
            layer_ops=controller.valid_operations,
            merge_ops=controller.merge_operations,
            prob_at_layer=None,
            min_num_hallus=options.n_hallus_per_init,
            hallu_input_choice=options.hallu_input_choice)
        for netmorph_method in ['hard', 'soft']:
            controller.set_netmorph_method(netmorph_method)
            net_info = copy.deepcopy(net_info_parent)
            net_info = net_info.add_hallucinations(
                hallus,
                final_merge_op=controller.hallu_final_merge_op,
                stop_gradient_val=controller.stop_gradient_val,
                hallu_gate_layer=controller.hallu_gate_layer)
            # Update mi_info
            curr_iter += 1
            hallu_str = net_info.to_str()
            mi_info.append(ModelSearchInfo(
                curr_iter, model_iter, search_depth + 1,
                None, None, hallu_str))
            controller.add_one_to_queue(
                q_hallu, mi_info, curr_iter, net_info)
    return curr_iter
def server_main(
        controller, options,
        hallu_handle=None, child_handle=None, critic_handle=None):
    """
    Server entrance/main.
    """
    model_options_base = options
    log_dir_root = logger.get_logger_dir()
    model_dir_root = options.model_dir
    (
        mi_info, ipc, qname_to_pool, philly_wa,
        curr_iter, critic_iter, n_recv, n_last_train, n_last_mi_save
    ) = server_init(controller, options)

    # useful aliases:
    (q_hallu, q_child) = (controller.q_hallu, controller.q_child)

    # message handlers
    hallu_handle = (
        hallu_handle if hallu_handle else server_handle_hallu_message)
    child_handle = (
        child_handle if child_handle else server_handle_child_message)
    critic_handle = (
        critic_handle if critic_handle else server_handle_critic_message)

    # server main loop
    while ipc.pools.has_active() or q_child.size() > 0 or q_hallu.size() > 0:
        # Launch child/hallu sleepers.
        for job_type, queue in zip(
                [TRAIN_HALLU, TRAIN_MODEL], [q_hallu, q_child]):
            # Populate workers until either the active pool is full
            # or the queue is empty.
            while ipc.pools.has_idle(job_type) and queue.size() > 0:
                model_str, model_iter, parent_iter, search_depth = queue.pop()
                # Log the pop order of models. Important for analysis.
                logger.info("mi={} pi={} sd={}".format(
                    model_iter, parent_iter, search_depth))
                logger.info("LayerInfoList is :\n{}".format(model_str))
                model_options = copy.deepcopy(model_options_base)
                model_options.net_info = net_info_from_str(model_str)
                fork_and_train_model(
                    ipc=ipc,
                    options=model_options,
                    log_dir=_mi_to_dn(log_dir_root, model_iter),
                    child_dir=_mi_to_dn(model_dir_root, model_iter),
                    prev_dir=_mi_to_dn(model_dir_root, parent_iter),
                    model_str=model_str,
                    model_iter=model_iter,
                    parent_iter=parent_iter,
                    search_depth=search_depth,
                    job_type=job_type)

        # Launch critic sleepers.
        for qname in [q_child.name, q_hallu.name]:
            _n_new = n_recv[qname] - n_last_train[qname]
            _train_every = controller.controller_train_every
            if _n_new >= _train_every:
                pool = qname_to_pool[qname]
                if ipc.pools.has_idle(pool):
                    n_last_train[qname] = n_recv[qname]
                    ci = critic_iter[qname] = 1 + critic_iter[qname]
                    logger.info('Train critic {} ci={} ...'.format(qname, ci))
                    fork_and_train_critic(
                        ipc=ipc,
                        ctrl=controller,
                        data_dir=options.data_dir,
                        crawl_dirs=log_dir_root,
                        log_dir=_ci_to_dn(log_dir_root, ci, qname),
                        model_dir=_ci_to_dn(model_dir_root, ci, qname),
                        prev_dir=_ci_to_dn(model_dir_root, ci - 1, qname),
                        critic_iter=ci,
                        queue_name=qname,
                        pool=pool)
                    logger.info('...Train critic launched')

        logger.info('Listening for message...')
        msg_output, job_type = ipc.get_finished_message()
        if job_type == TRAIN_HALLU:
            n_recv[q_hallu.name] += 1
            curr_iter = hallu_handle(
                msg_output=msg_output, controller=controller,
                mi_info=mi_info, options=options, curr_iter=curr_iter)
        elif job_type == TRAIN_MODEL:
            n_recv[q_child.name] += 1
            n_idle = ipc.pools.num_idle(TRAIN_HALLU)
            curr_iter = child_handle(
                msg_output=msg_output, controller=controller,
                mi_info=mi_info, options=options,
                n_idle=n_idle, curr_iter=curr_iter)
        elif job_type in [
                TRAIN_CRITIC_MODEL, TRAIN_CRITIC_HALLU, TRAIN_CRITIC_PARENT]:
            critic_handle(
                msg_output=msg_output, controller=controller,
                mi_info=mi_info, options=options)

        ## Periodic log/heartbeat and exits.
        n_finished = n_recv[q_child.name] + n_recv[q_hallu.name]
        philly_wa.new_heart_beat(cnt=n_finished)
        philly_wa.print_progress_percent()
        # Save mi_info periodically for training the critic,
        # post-processing, and recovery.
        np.savez(_mi_info_save_fn(log_dir_root), mi_info=mi_info)
        # We have explored enough models. Quit now.
        if n_finished >= options.max_exploration:
            break
    # end while (server main loop)
    logger.info(
        "Exiting server main. n_recv[hallu]={} n_recv[child]={}".format(
            n_recv[q_hallu.name], n_recv[q_child.name]))
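# Usage sketch (illustration only, not part of the server): server_main takes
# pluggable message handlers, so an experiment such as the soft-vs-hard init
# comparison above can be run by swapping in its child handler while keeping
# the default hallucination and critic handlers. The `controller` and
# `options` objects are assumed to be constructed by the launcher that also
# provides server_init.
#
#   # default search
#   server_main(controller, options)
#
#   # soft-init vs. hard-init experiment
#   server_main(
#       controller, options,
#       child_handle=server_handle_child_message_soft_vs_hard)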
def server_handle_hallu_message(
        msg_output, controller, mi_info, options, curr_iter):
    """
    Petridish server handles the return message of a forked process
    that watches over a hallucination job.
    """
    log_dir_root = logger.get_logger_dir()
    q_child = controller.q_child
    model_str, model_iter, _parent_iter, search_depth = msg_output

    # Record performance in the main log
    jr = parse_remote_stop_file(_mi_to_dn(log_dir_root, model_iter))
    if jr is None:
        # job failure: reap the virtual resource and move on.
        logger.info('Failed mi={}'.format(model_iter))
        return curr_iter
    (fp, ve, te, hallu_stats, l_op_indices, l_op_omega) = (
        jr['fp'], jr['ve'], jr['te'],
        jr['l_stats'], jr['l_op_indices'], jr['l_op_omega']
    )
    logger.info(
        ("HALLU : mi={} val_err={} test_err={} "
         "Gflops={} hallu_stats={}").format(
            model_iter, ve, te, fp * 1e-9, hallu_stats))
    mi_info[model_iter].ve = ve
    mi_info[model_iter].fp = fp

    ## Compute hallucination-related info in net_info
    net_info = net_info_from_str(model_str)
    hallu_locs = net_info.contained_hallucination()  # contained
    hallu_indices = net_info.sorted_hallu_indices(hallu_locs)
    # feature selection based on params
    l_fs_ops, l_fs_omega = feature_selection_cutoff(
        l_op_indices, l_op_omega, options)
    separated_hallu_info = net_info.separate_hallu_info_by_cname(
        hallu_locs, hallu_indices, l_fs_ops, l_fs_omega)

    ## Select subsets of hallucinations to add to child models
    l_selected = []
    # sort by -cos(grad, hallu) for the indices 0, 1, 2, ..., n_hallu - 1.
    processed_stats = [
        process_hallu_stats_for_critic_feat([stats])
        for stats in hallu_stats]
    logger.info('processed_stats={}'.format(processed_stats))
    logger.info('separated_hallu_info={}'.format(separated_hallu_info))

    # greedy select with gradient boosting
    l_greedy_selected = []
    if options.n_greed_select_per_init:
        greedy_order = sorted(
            range(len(hallu_indices)),
            key=lambda i: -processed_stats[i][0])
        min_select = options.n_hallus_per_select
        max_select = max(min_select, len(hallu_indices) // 2)
        for selected_len in range(min_select, max_select + 1):
            selected = greedy_order[:selected_len]
            l_greedy_selected.append(selected)
        n_greedy_select = len(l_greedy_selected)
        if n_greedy_select > options.n_greed_select_per_init:
            # choose randomly among the greedy prefixes
            l_greedy_selected = list(np.random.choice(
                l_greedy_selected, options.n_greed_select_per_init,
                replace=False))

    # randomly select subsets
    l_random_selected = []
    if options.n_rand_select_per_init:
        # also try some random samples
        l_random_selected = online_sampling(
            itertools.combinations(
                range(len(hallu_indices)), options.n_hallus_per_select),
            options.n_rand_select_per_init)
        np.random.shuffle(l_random_selected)
    l_selected = l_greedy_selected + l_random_selected

    ## For each selected subset of hallucinations, make a model for q_child.
    # Since more recent ones tend to be better,
    # we insert in reverse order, so the greedy ones are inserted later.
    for selected in reversed(l_selected):
        # new model description
        child_info = copy.deepcopy(net_info)
        l_hi = [hallu_indices[s] for s in selected]
        child_info = child_info.select_hallucination(
            l_hi, separated_hallu_info)
        # Compute initialization stat
        stat = process_hallu_stats_for_critic_feat(
            [hallu_stats[s] for s in selected])
        # update mi_info
        curr_iter += 1
        child_str = child_info.to_str()
        mi_info.append(ModelSearchInfo(
            curr_iter, model_iter, search_depth + 1,
            None, None, child_str, stat))
        controller.add_one_to_queue(
            q_child, mi_info, curr_iter, child_info)
    return curr_iter
def full_recovery(
        self, prev_log_root, log_root, prev_model_root, model_root,
        q_parent, q_hallu, q_child, mi_info):
    # 0. Assume that the controller predictor is loaded.
    # 1. Load the old mi_info.
    # 2. Split mi_info into models for q_parent, q_hallu, and q_child.
    # 3. Copy/link the finished log_dir/model_dir for entries of mi_info
    #    that have a val_err.
    # 4. Add to q_hallu and q_child the entries that are not finished.
    # 5. Add to q_parent the models that are finished (and are not hallus).
    # 6. TODO Compute counter variables like n_recv and friends.
    def _is_hallu(info):
        return info.sd % 2 == 1

    def _is_finished(info):
        return info.ve is not None and info.ve < 1.0

    prev_mi_info_npz = _mi_info_save_fn(prev_log_root)
    mi_info_npz = _mi_info_save_fn(log_root)
    if os.path.exists(mi_info_npz):
        # The current trial already has some mi_info, so load from it instead.
        # This happens on local runs because they don't have a trial id.
        # It also happens on preemption on philly, which does not advance
        # the trial id.
        prev_mi_info_npz = mi_info_npz
        prev_model_root = model_root
        prev_log_root = log_root
    if not os.path.exists(prev_mi_info_npz):
        # Nothing to load. Return False to let the caller know.
        return False
    mi_info.extend(np.load(prev_mi_info_npz, encoding='bytes')['mi_info'])
    if mi_info_npz != prev_mi_info_npz:
        os.rename(prev_mi_info_npz, mi_info_npz)

    all_mi_in_log = set(_all_mi(prev_log_root))
    all_mi_in_model = set(_all_mi(prev_model_root))
    for info in mi_info:
        mi = info.mi
        if mi in all_mi_in_log:
            all_mi_in_log.remove(mi)
        if mi in all_mi_in_model:
            all_mi_in_model.remove(mi)
        # TODO use heapify instead of inserting one by one...
        old_log_dir = _mi_to_dn(prev_log_root, mi)
        old_model_dir = _mi_to_dn(prev_model_root, mi)
        queue = None
        if not _is_finished(info):
            queue = q_hallu if _is_hallu(info) else q_child
            # Remove partial models/logs to avoid confusion.
            if os.path.exists(old_model_dir):
                shutil.rmtree(old_model_dir)
            if os.path.exists(old_log_dir):
                shutil.rmtree(old_log_dir)
            logger.info("Recover: mi={} queue={}".format(mi, queue.name))
        else:
            # copy logs
            new_log_dir = _mi_to_dn(log_root, mi)
            if new_log_dir != old_log_dir:
                shutil.copytree(old_log_dir, new_log_dir)
            # copy models
            new_model_dir = _mi_to_dn(model_root, mi)
            if new_model_dir != old_model_dir:
                shutil.copytree(old_model_dir, new_model_dir)
            queue = None if _is_hallu(info) else q_parent
            qname = "" if queue is None else queue.name
            # It's important to log val_err for later analysis.
            logger.info("Recover: mi={} val_err={} queue={}".format(
                mi, info.ve, qname))
        if queue is not None:
            self.controller.add_one_to_queue(queue, mi_info, mi, None)
    # end for each info

    # Remove old mi that have a log or model but are not in mi_info.
    for mi in all_mi_in_log:
        old_log_dir = _mi_to_dn(prev_log_root, mi)
        if os.path.exists(old_log_dir):
            shutil.rmtree(old_log_dir)
    for mi in all_mi_in_model:
        old_model_dir = _mi_to_dn(prev_model_root, mi)
        if os.path.exists(old_model_dir):
            shutil.rmtree(old_model_dir)

    if self.n_models_to_recover is not None:
        # TODO compute priority and do nsmallest instead of trimming.
        q_parent.keep_top_k(self.n_models_to_recover)
        logger.info("full_recovery: trim q_parent to size {}".format(
            self.n_models_to_recover))
    # recovered successfully
    return True
def partial_recovery(
        self, prev_log_root, log_root, prev_model_root, model_root,
        q_parent, q_hallu, q_child, mi_info):
    """
    deprecated DO NOT USE

    prev_log_root (str) : root of the previous log, e.g., on philly:
        xxx/app_id/logs/2/petridish_main
    log_root (str) : current root of the log
    prev_model_root (str) : previous model root, e.g., on philly:
        xxx/app_id/models/2
    model_root (str) : current model root
    q_parent (PetridishQueue) : see PetridishController.init_queues
    q_hallu (PetridishQueue) :
    q_child (PetridishQueue) :
    mi_info (list) : list of ModelSearchInfo
    """
    old_npz = _mi_info_save_fn(prev_log_root)
    old_mi_info = list(np.load(old_npz, encoding='bytes')['mi_info'])

    def _is_hallu(info):
        return info.sd % 2 == 1

    def _is_finished(info):
        return info.ve is not None and info.ve <= 1.0

    min_ve_info = None
    min_ve = None
    for info in old_mi_info:
        if not _is_finished(info):
            continue
        if min_ve is None or (info.ve <= 1.0 and info.ve < min_ve):
            min_ve = info.ve
            min_ve_info = info
    if min_ve is None:
        # nothing finished. start regularly
        return

    # get the path from the best model back to the root.
    info = min_ve_info
    l_info = [info]
    while info.pi != info.mi:
        info = old_mi_info[info.pi]
        l_info.append(info)

    curr_iter = -1
    for info in reversed(l_info):
        curr_iter += 1
        # copy logs into the new dir
        old_log_dir = _mi_to_dn(prev_log_root, info.mi)
        new_log_dir = _mi_to_dn(log_root, curr_iter)
        shutil.copytree(old_log_dir, new_log_dir)
        # copy models
        old_model_dir = _mi_to_dn(prev_model_root, info.mi)
        new_model_dir = _mi_to_dn(model_root, curr_iter)
        shutil.copytree(old_model_dir, new_model_dir)
        info.mi = curr_iter
        info.pi = max(curr_iter - 1, 0)
        mi_info.append(info)

    info = mi_info[-1]
    queue = q_hallu if _is_hallu(info) else q_child
    queue.add(
        model_str=info.mstr,
        model_iter=info.mi,
        parent_iter=info.pi,
        search_depth=info.sd,
        priority=info.ve)