def __init__(self, test_case=None):
    self.parser = None
    self.config = Config()
    self.driver = None
    if test_case is not None:
        self.browser = test_case.BROWSER
        self.headless = test_case.HEAD_LESS
        logger.config_logger(test_case.__name__, self.browser)
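
# A minimal sketch of a test case this constructor could consume: it only
# needs BROWSER and HEAD_LESS class attributes plus a __name__. The class
# names below are hypothetical; the owning framework class is not shown above.
class ChromeSmokeTest:
    BROWSER = "chrome"
    HEAD_LESS = True

# framework = Framework(test_case=ChromeSmokeTest)  # `Framework` is illustrative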

def __init__(self, threadID: int, threadName: str, tqueue: queue.Queue,
             host: str = "localhost", port: str = "5555"):
    threading.Thread.__init__(self)
    # Threading variables
    self._threadID = threadID
    self._threadName = threadName
    self.__tqueue = tqueue
    self.__stoprequest = threading.Event()
    self.__logger = config_logger(name='request_handler',
                                  filepath='./logs/rhandler.log')
    # ZMQ context
    self.__host = host
    self.__port = port
    self.__context = zmq.Context()
    self._server = self.__context.socket(zmq.REP)
    self._server.connect("tcp://%s:%s" % (self.__host, self.__port))
    self.__poller = zmq.Poller()
    self.__poller.register(self._server, zmq.POLLIN)
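
# Note that the REP socket connect()s rather than bind()s, so the remote peer
# is assumed to bind the matching endpoint (the environment config below lists
# 'client': 'tcp://*:5555'). A minimal sketch of such a peer, using standard
# pyzmq calls; the payload values are made up:
import zmq

ctx = zmq.Context()
peer = ctx.socket(zmq.REQ)
peer.bind("tcp://*:5555")           # the RequestHandler connects here
peer.send_multipart([b"42", b"0"])  # e.g. a StreamID plus one more field
reply = peer.recv_multipart()       # blocks until the handler replies
print(reply)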

def __init__(self, threadID: int, threadName: str, queue: queue.Queue,
             host: str = "localhost", port: str = "5556"):
    # host/port are accepted to match the Collector(...) call in agent();
    # the ZMQ subscriber setup that would use them is not shown here.
    # NOTE: the parameter `queue` shadows the stdlib queue module in this method.
    threading.Thread.__init__(self)
    # Threading variables
    self._threadID = threadID
    self._threadName = threadName
    self._queue = queue
    self._host = host
    self._port = port
    self._stoprequest = threading.Event()
    self.__logger = config_logger(name=self._threadName,
                                  filepath='./logs/{}.log'.format(self._threadName))

def environment(bdw_paths: mp.Array, stop_env: mp.Event, end_of_run: mp.Event):
    rhostname = 'mininet' + '@' + SSH_HOST

    config = {
        'server': 'ipc:///tmp/zmq',
        'client': 'tcp://*:5555',
        'publisher': 'tcp://*:5556',
        'subscriber': 'ipc:///tmp/pubsub'
    }

    logger = config_logger('environment', filepath='./logs/environment.log')
    env = Environment(bdw_paths, logger=logger, mconfig=config,
                      remoteHostname=rhostname)

    # Let's measure how long each env run takes
    while not stop_env.is_set():
        # Only the agent can unblock this loop, after a training batch has been completed
        while not end_of_run.is_set():
            try:
                # update environment config from session
                if env.updateEnvironment() == -1:
                    stop_env.set()
                    end_of_run.set()
                    break

                # run a single session & measure
                # -------------------
                now = time.time()
                env.run()
                end = time.time()
                # -------------------

                diff = int(end - now)
                logger.debug("Time to execute one run: {}s".format(diff))

                end_of_run.set()  # set end_of_run so our agent knows
                # env.spawn_middleware()  # restart middleware
            except Exception as ex:
                logger.error(ex)
                break
        time.sleep(0.1)

    env.close()
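
# A self-contained sketch of the stop_env / end_of_run handshake above, with a
# stand-in for the real Environment (everything here is illustrative):
import multiprocessing as mp
import time

def fake_env(stop_env, end_of_run):
    while not stop_env.is_set():
        while not end_of_run.is_set():
            time.sleep(0.2)        # stand-in for env.run()
            end_of_run.set()       # tell the agent this run has finished
        time.sleep(0.1)            # wait for the agent to clear the flag

if __name__ == "__main__":
    stop_env, end_of_run = mp.Event(), mp.Event()
    proc = mp.Process(target=fake_env, args=(stop_env, end_of_run))
    proc.start()
    end_of_run.wait()              # agent side: block until the run ends
    stop_env.set()                 # request shutdown...
    end_of_run.clear()             # ...then allow the env loop to re-check
    proc.join()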

def train_net(config):
    pGen, pKv, pRpn, pRoi, pBbox, pDataset, pModel, pOpt, pTest, \
        transform, data_name, label_name, metric_list = config.get_config(is_train=True)

    pGen = patch_config_as_nothrow(pGen)
    pKv = patch_config_as_nothrow(pKv)
    pRpn = patch_config_as_nothrow(pRpn)
    pRoi = patch_config_as_nothrow(pRoi)
    pBbox = patch_config_as_nothrow(pBbox)
    pDataset = patch_config_as_nothrow(pDataset)
    pModel = patch_config_as_nothrow(pModel)
    pOpt = patch_config_as_nothrow(pOpt)
    pTest = patch_config_as_nothrow(pTest)

    ctx = [mx.gpu(int(i)) for i in pKv.gpus]
    pretrain_prefix = pModel.pretrain.prefix
    pretrain_epoch = pModel.pretrain.epoch
    prefix = pGen.name
    save_path = os.path.join("experiments", prefix)
    begin_epoch = pOpt.schedule.begin_epoch
    end_epoch = pOpt.schedule.end_epoch
    lr_iter = pOpt.schedule.lr_iter

    # only rank == 0 prints all debug info
    kvstore_type = "dist_sync" if os.environ.get("DMLC_ROLE") == "worker" else pKv.kvstore
    kv = mx.kvstore.create(kvstore_type)
    rank = kv.rank

    # for distributed training using a shared file system
    os.makedirs(save_path, exist_ok=True)

    from utils.logger import config_logger
    config_logger(os.path.join(save_path, "log.txt"))

    model_prefix = os.path.join(save_path, "checkpoint")

    # set up logger
    logger = logging.getLogger()

    sym = pModel.train_symbol

    # setup multi-gpu
    input_batch_size = pKv.batch_image * len(ctx)

    # print config
    # if rank == 0:
    #     logger.info(pprint.pformat(config))

    # load dataset and prepare imdb for training
    image_sets = pDataset.image_set
    roidbs = [
        pkl.load(open("data/cache/{}.roidb".format(i), "rb"), encoding="latin1")
        for i in image_sets
    ]
    roidb = reduce(lambda x, y: x + y, roidbs)
    # filter empty images
    roidb = [rec for rec in roidb if rec["gt_bbox"].shape[0] > 0]
    # add flipped roi records
    flipped_roidb = []
    for rec in roidb:
        new_rec = rec.copy()
        new_rec["flipped"] = True
        flipped_roidb.append(new_rec)
    roidb = roidb + flipped_roidb

    from core.detection_input import AnchorLoader
    train_data = AnchorLoader(roidb=roidb,
                              transform=transform,
                              data_name=data_name,
                              label_name=label_name,
                              batch_size=input_batch_size,
                              shuffle=True,
                              kv=kv,
                              num_worker=pGen.loader_worker or 12,
                              num_collector=pGen.loader_collector or 1,
                              worker_queue_depth=2,
                              collector_queue_depth=2)

    # infer shapes
    worker_data_shape = dict(train_data.provide_data + train_data.provide_label)
    for key in worker_data_shape:
        worker_data_shape[key] = (pKv.batch_image,) + worker_data_shape[key][1:]
    arg_shape, _, aux_shape = sym.infer_shape(**worker_data_shape)

    _, out_shape, _ = sym.get_internals().infer_shape(**worker_data_shape)
    out_shape_dict = list(zip(sym.get_internals().list_outputs(), out_shape))
    _, out_shape, _ = sym.infer_shape(**worker_data_shape)
    terminal_out_shape_dict = zip(sym.list_outputs(), out_shape)

    if rank == 0:
        logger.info('parameter shape')
        logger.info(pprint.pformat(
            [i for i in out_shape_dict if not i[0].endswith('output')]))
        logger.info('intermediate output shape')
        logger.info(pprint.pformat(
            [i for i in out_shape_dict if i[0].endswith('output')]))
        logger.info('terminal output shape')
        logger.info(pprint.pformat([i for i in terminal_out_shape_dict]))

    # memonger
    if pModel.memonger:
        last_block = pModel.memonger_until or ""
        if rank == 0:
            logger.info("do memonger up to {}".format(last_block))
        type_dict = {k: np.float32 for k in worker_data_shape}
        sym = search_plan_to_layer(sym, last_block, 1000,
                                   type_dict=type_dict, **worker_data_shape)

    # load and initialize params
    if pOpt.schedule.begin_epoch != 0:
        arg_params, aux_params = load_checkpoint(model_prefix, begin_epoch)
    elif pModel.from_scratch:
        arg_params, aux_params = dict(), dict()
    else:
        arg_params, aux_params = load_checkpoint(pretrain_prefix, pretrain_epoch)

    if pModel.process_weight is not None:
        pModel.process_weight(sym, arg_params, aux_params)

    '''
    There are some conflicts between `mergebn` and `attach_quantized_node` in
    graph_optimize.py when mergebn runs ahead of attach_quantized_node,
    such as `Symbol.ComposeKeyword`.
    '''
    if pModel.QuantizeTrainingParam is not None and pModel.QuantizeTrainingParam.quantize_flag:
        pQuant = pModel.QuantizeTrainingParam
        assert not pGen.fp16, "current quantize training only supports fp32 mode."
        from utils.graph_optimize import attach_quantize_node
        _, out_shape, _ = sym.get_internals().infer_shape(**worker_data_shape)
        out_shape_dictionary = dict(zip(sym.get_internals().list_outputs(), out_shape))
        sym = attach_quantize_node(sym, out_shape_dictionary,
                                   pQuant.WeightQuantizeParam,
                                   pQuant.ActQuantizeParam,
                                   pQuant.quantized_op)

    # merge batch normalization to save memory in fixed-bn training
    from utils.graph_optimize import merge_bn
    sym, arg_params, aux_params = merge_bn(sym, arg_params, aux_params)

    if pModel.random:
        import time
        mx.random.seed(int(time.time()))
        np.random.seed(int(time.time()))

    init = mx.init.Xavier(factor_type="in", rnd_type='gaussian', magnitude=2)
    init.set_verbosity(verbose=True)

    # create solver
    fixed_param = pModel.pretrain.fixed_param
    excluded_param = pModel.pretrain.excluded_param
    data_names = [k[0] for k in train_data.provide_data]
    label_names = [k[0] for k in train_data.provide_label]

    if pModel.teacher_param:
        from models.KD.utils import create_teacher_module
        from models.KD.detection_module import KDDetModule
        t_mod, t_label_name, t_label_shape = create_teacher_module(
            pModel.teacher_param, worker_data_shape, input_batch_size,
            ctx, rank, logger)
        mod = KDDetModule(sym,
                          teacher_module=t_mod,
                          teacher_label_names=t_label_name,
                          teacher_label_shapes=t_label_shape,
                          data_names=data_names,
                          label_names=label_names,
                          logger=logger,
                          context=ctx,
                          fixed_param=fixed_param,
                          excluded_param=excluded_param)
    else:
        mod = DetModule(sym,
                        data_names=data_names,
                        label_names=label_names,
                        logger=logger,
                        context=ctx,
                        fixed_param=fixed_param,
                        excluded_param=excluded_param)

    eval_metrics = mx.metric.CompositeEvalMetric(metric_list)

    # callback
    batch_end_callback = [
        callback.Speedometer(train_data.batch_size, frequent=pGen.log_frequency)
    ]
    batch_end_callback += pModel.batch_end_callbacks or []
    epoch_end_callback = callback.do_checkpoint(model_prefix)
    sym.save(model_prefix + ".json")

    # decide learning rate
    lr_mode = pOpt.optimizer.lr_mode or 'step'
    base_lr = pOpt.optimizer.lr * kv.num_workers
    lr_factor = pOpt.schedule.lr_factor or 0.1

    iter_per_epoch = len(train_data) // input_batch_size
    total_iter = iter_per_epoch * (end_epoch - begin_epoch)
    lr_iter = [total_iter + it if it < 0 else it for it in lr_iter]
    lr_iter = [it // kv.num_workers for it in lr_iter]
    lr_iter = [it - iter_per_epoch * begin_epoch for it in lr_iter]
    lr_iter_discount = [it for it in lr_iter if it > 0]
    current_lr = base_lr * (lr_factor ** (len(lr_iter) - len(lr_iter_discount)))
    if rank == 0:
        logging.info('total iter {}'.format(total_iter))
        logging.info('lr {}, lr_iters {}'.format(current_lr, lr_iter_discount))
        logging.info('lr mode: {}'.format(lr_mode))

    if pOpt.warmup and pOpt.schedule.begin_epoch == 0:
        if rank == 0:
            logging.info('warmup lr {}, warmup step {}'.format(
                pOpt.warmup.lr, pOpt.warmup.iter))
        if lr_mode == 'step':
            lr_scheduler = WarmupMultiFactorScheduler(
                step=lr_iter_discount,
                factor=lr_factor,
                warmup=True,
                warmup_type=pOpt.warmup.type,
                warmup_lr=pOpt.warmup.lr,
                warmup_step=pOpt.warmup.iter)
        elif lr_mode == 'cosine':
            warmup_lr_scheduler = AdvancedLRScheduler(mode='linear',
                                                      base_lr=pOpt.warmup.lr,
                                                      target_lr=base_lr,
                                                      niters=pOpt.warmup.iter)
            cosine_lr_scheduler = AdvancedLRScheduler(
                mode='cosine',
                base_lr=base_lr,
                target_lr=0,
                niters=(iter_per_epoch * (end_epoch - begin_epoch)) - pOpt.warmup.iter)
            lr_scheduler = LRSequential([warmup_lr_scheduler, cosine_lr_scheduler])
        else:
            raise NotImplementedError
    else:
        if lr_mode == 'step':
            lr_scheduler = WarmupMultiFactorScheduler(step=lr_iter_discount,
                                                      factor=lr_factor)
        elif lr_mode == 'cosine':
            lr_scheduler = AdvancedLRScheduler(mode='cosine',
                                               base_lr=base_lr,
                                               target_lr=0,
                                               niters=iter_per_epoch * (end_epoch - begin_epoch))
        else:
            lr_scheduler = None

    # optimizer
    optimizer_params = dict(momentum=pOpt.optimizer.momentum,
                            wd=pOpt.optimizer.wd,
                            learning_rate=current_lr,
                            lr_scheduler=lr_scheduler,
                            rescale_grad=1.0 / (len(ctx) * kv.num_workers),
                            clip_gradient=pOpt.optimizer.clip_gradient)
    if pKv.fp16:
        optimizer_params['multi_precision'] = True
        optimizer_params['rescale_grad'] /= 128.0

    profile = pGen.profile or False
    if profile:
        mx.profiler.set_config(profile_all=True,
                               filename=os.path.join(save_path, "profile.json"))

    # train
    mod.fit(train_data=train_data,
            eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            kvstore=kv,
            optimizer=pOpt.optimizer.type,
            optimizer_params=optimizer_params,
            initializer=init,
            allow_missing=True,
            arg_params=arg_params,
            aux_params=aux_params,
            begin_epoch=begin_epoch,
            num_epoch=end_epoch,
            profile=profile)

    logging.info("Training has finished")

    time.sleep(10)
    logging.info("Exiting")
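
# A worked sketch of the lr_iter arithmetic in train_net above, with made-up
# numbers (step mode, a single worker). Decay points may be given as negative
# offsets from the end of training:
iter_per_epoch, begin_epoch, end_epoch = 5000, 0, 6
base_lr, lr_factor = 0.01, 0.1
lr_iter = [-10000, -5000]          # decay 10k and 5k iterations before the end

total_iter = iter_per_epoch * (end_epoch - begin_epoch)           # 30000
lr_iter = [total_iter + it if it < 0 else it for it in lr_iter]   # [20000, 25000]
lr_iter = [it - iter_per_epoch * begin_epoch for it in lr_iter]   # unchanged here
lr_iter_discount = [it for it in lr_iter if it > 0]               # [20000, 25000]
current_lr = base_lr * lr_factor ** (len(lr_iter) - len(lr_iter_discount))  # 0.01

# Resuming at epoch 4 with absolute decay points [20000, 25000]: the first
# boundary is already behind the resume point, so the starting lr is
# discounted once.
begin_epoch = 4
lr_iter = [it - iter_per_epoch * begin_epoch for it in [20000, 25000]]  # [0, 5000]
lr_iter_discount = [it for it in lr_iter if it > 0]                     # [5000]
current_lr = base_lr * lr_factor ** (len(lr_iter) - len(lr_iter_discount))  # 0.001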

    assert boxes.shape[1] == 4
    for i in range(len(boxes)):
        x1, y1, x2, y2 = boxes[i]
        coords = (x1, y1), x2 - x1 + 1, y2 - y1 + 1
        c = color if color else npr.random(3)
        rect_kwargs = dict(fill=False, edgecolor=c, linewidth=2)
        ax.add_patch(plt.Rectangle(*coords, **rect_kwargs))
        if labels is not None:
            text_kwargs = dict(size='small', color='white',
                               bbox=dict(facecolor=c, alpha=0.5, pad=0.15))
            ax.text(x1 - 2, y1 - 2, labels[i], **text_kwargs)


if __name__ == "__main__":
    config_logger()
    # parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, help='deploy prototxt file')
    parser.add_argument('--weights', type=str, help='model weights file')
    parser.add_argument('--gpu', default=0, type=int,
                        help='gpu id, -1 for cpu')
    parser.add_argument('--conf-thresh', default=0.6, type=float,
                        help='detection confidence threshold')
    parser.add_argument('--nms-thresh', default=0.5, type=float,
                        help='detection nms threshold')
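
# Example invocation (the script and file names below are illustrative only):
#
#   python demo.py --model deploy.prototxt --weights net.caffemodel \
#       --gpu 0 --conf-thresh 0.6 --nms-thresh 0.5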

def agent():
    np.random.seed(RANDOM_SEED)

    # Create results path
    if not os.path.exists(SUMMARY_DIR):
        os.makedirs(SUMMARY_DIR)

    # Spawn request handler
    tqueue = queue.Queue(1)
    rhandler = RequestHandler(1, "rhandler-thread", tqueue=tqueue,
                              host=SSH_HOST, port='5555')
    rhandler.start()

    # Spawn collector thread
    cqueue = queue.Queue(0)
    collector = Collector(2, "collector-thread", queue=cqueue,
                          host=SSH_HOST, port='5556')
    collector.start()

    # Spawn environment as a separate process -- not a thread
    bdw_paths = mp.Array('i', 2)
    stop_env = mp.Event()
    end_of_run = mp.Event()
    env = mp.Process(target=environment,
                     args=(bdw_paths, stop_env, end_of_run))
    env.start()

    # keep a record of threads and processes
    tp_list = [rhandler, collector, env]

    # Main training loop
    logger = config_logger('agent', './logs/agent.log')
    logger.info("Run Agent until training stops...")

    with tf.Session() as sess, open(LOG_FILE, 'w') as log_file:
        actor = a3c.ActorNetwork(sess,
                                 state_dim=[S_INFO, S_LEN],
                                 action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess,
                                   state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)  # training monitor
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model is not None:  # nn_model is the path to the file
            saver.restore(sess, nn_model)
            print("Model restored.")

        epoch = EPOCH
        time_stamp = 0

        path = DEFAULT_PATH

        action_vec = np.zeros(A_DIM)
        action_vec[path] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [action_vec]
        r_batch = []
        entropy_record = []

        actor_gradient_batch = []
        critic_gradient_batch = []

        list_states = []
        while not end_of_run.is_set():
            # Get a scheduling request from the rhandler thread
            request, ev1 = get_request(tqueue, logger, end_of_run=end_of_run)

            # end of iterations -> exit loop -> save -> bb
            if stop_env.is_set():
                break

            if request is None and end_of_run.is_set():
                logger.info("END_OF_RUN => BATCH UPDATE")

                # get all stream_info from the collector's queue
                stream_info = []
                with cqueue.mutex:
                    for elem in list(cqueue.queue):
                        stream_info.append(elem)
                    # clear the queue
                    cqueue.queue.clear()

                # Validate, then proceed to the next run
                # logger.info("len(list_states) {} == len(stream_info) {}".format(len(list_states), len(stream_info)))
                if len(list_states) != len(stream_info) or len(list_states) == 0:
                    entropy_record = []
                    del s_batch[:]
                    del a_batch[:]
                    del r_batch[:]
                    stream_info.clear()
                    list_states.clear()
                    end_of_run.clear()
                    time.sleep(0.01)
                    continue

                # Re-order rewards
                stream_info = arrangeStateStreamsInfo(list_states, stream_info)
                list_ids = [stream['StreamID'] for stream in stream_info]
                logger.info("all unique: {}".format(allUnique(list_ids, debug=True)))

                # for i, stream in enumerate(stream_info):
                #     logger.info(stream)
                #     logger.info(list_states[i])  # print this, index based

                # Calculate a reward for each stream
                completion_times = []
                for index, stream in enumerate(stream_info):
                    path1_smoothed_RTT, path1_bandwidth, path1_packets, \
                        path1_retransmissions, path1_losses, \
                        path2_smoothed_RTT, path2_bandwidth, path2_packets, \
                        path2_retransmissions, path2_losses, \
                        = getTrainingVariables(list_states[index])

                    normalized_bwd_path0 = (bdw_paths[0] - 1.0) / (100.0 - 1.0)
                    normalized_bwd_path1 = (bdw_paths[1] - 1.0) / (100.0 - 1.0)
                    normalized_srtt_path0 = ((path1_smoothed_RTT * 1000.0) - 1.0) / 120.0
                    normalized_srtt_path1 = ((path2_smoothed_RTT * 1000.0) - 1.0) / 120.0
                    normalized_loss_path0 = ((path1_retransmissions + path1_losses) - 0.0) / 20.0
                    normalized_loss_path1 = ((path2_retransmissions + path2_losses) - 0.0) / 20.0

                    # aggr_bdw = normalized_bwd_path0 + normalized_bwd_path1
                    aggr_srtt = normalized_srtt_path0 + normalized_srtt_path1
                    aggr_loss = normalized_loss_path0 + normalized_loss_path1

                    reward = (a_batch[index][0] * normalized_bwd_path0 +
                              a_batch[index][1] * normalized_bwd_path1) \
                        - stream['CompletionTime'] \
                        - (0.8 * aggr_srtt) - (1.0 * aggr_loss)
                    r_batch.append(reward)
                    completion_times.append(stream['CompletionTime'])

                # If s_batch is longer than r_batch (e.g. stream 0 yielded no
                # reward), pad r_batch with a leading 0 so the two line up
                tmp_s_batch = np.stack(s_batch[:], axis=0)
                tmp_r_batch = np.vstack(r_batch[:])
                if tmp_s_batch.shape[0] > tmp_r_batch.shape[0]:
                    logger.debug("s_batch({}) > r_batch({})".format(
                        tmp_s_batch.shape[0], tmp_r_batch.shape[0]))
                    logger.debug(tmp_s_batch[0])
                    r_batch.insert(0, 0)

                # Save metrics for debugging:
                # log time_stamp, bit_rate, buffer_size, reward
                for index, stream in enumerate(stream_info):
                    path1_smoothed_RTT, path1_bandwidth, path1_packets, \
                        path1_retransmissions, path1_losses, \
                        path2_smoothed_RTT, path2_bandwidth, path2_packets, \
                        path2_retransmissions, path2_losses, \
                        = getTrainingVariables(list_states[index])
                    log_file.write(str(time_stamp) + '\t' +
                                   str(PATHS[path]) + '\t' +
                                   str(bdw_paths[0]) + '\t' +
                                   str(bdw_paths[1]) + '\t' +
                                   str(path1_smoothed_RTT) + '\t' +
                                   str(path2_smoothed_RTT) + '\t' +
                                   str(path1_retransmissions + path1_losses) + '\t' +
                                   str(path2_retransmissions + path2_losses) + '\t' +
                                   str(stream['CompletionTime']) + '\t' +
                                   str(stream['Path']) + '\n')
                    log_file.flush()
                    time_stamp += 1

                # Single training step
                # --------------------------------------------------------------
                actor_gradient, critic_gradient, td_batch = \
                    a3c.compute_gradients(s_batch=np.stack(s_batch[1:], axis=0),  # ignore the first chunk
                                          a_batch=np.vstack(a_batch[1:]),         # since we don't have
                                          r_batch=np.vstack(r_batch[1:]),         # control over it
                                          terminal=True,
                                          actor=actor,
                                          critic=critic)
                td_loss = np.mean(td_batch)

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                logger.debug("====")
                logger.debug("Epoch: {}".format(epoch))
                msg = "TD_loss: {}, Avg_reward: {}, Avg_entropy: {}".format(
                    td_loss, np.mean(r_batch[1:]), np.mean(entropy_record[1:]))
                logger.debug(msg)
                logger.debug("====")
                # --------------------------------------------------------------

                # Print summary for tensorflow
                # --------------------------------------------------------------
                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]: td_loss,
                                           summary_vars[1]: np.mean(r_batch),
                                           summary_vars[2]: np.mean(entropy_record),
                                           summary_vars[3]: np.mean(completion_times)
                                       })
                writer.add_summary(summary_str, epoch)
                writer.flush()
                # --------------------------------------------------------------

                # Update gradients
                if len(actor_gradient_batch) >= GRADIENT_BATCH_SIZE:
                    assert len(actor_gradient_batch) == len(critic_gradient_batch)
                    for i in range(len(actor_gradient_batch)):
                        actor.apply_gradients(actor_gradient_batch[i])
                        critic.apply_gradients(critic_gradient_batch[i])

                epoch += 1
                if epoch % MODEL_SAVE_INTERVAL == 0:
                    save_path = saver.save(sess,
                                           SUMMARY_DIR + "/nn_model_ep_" +
                                           str(epoch) + ".ckpt")

                entropy_record = []

                # Clear all before proceeding to the next run
                del s_batch[:]
                del a_batch[:]
                del r_batch[:]
                stream_info.clear()
                list_states.clear()
                end_of_run.clear()
            else:
                ev1.set()  # let `producer` (rh) know we received the request
                list_states.append(request)

                # The bandwidth metrics coming from MPQUIC are not correct:
                # constant values that never get updated
                path1_smoothed_RTT, path1_bandwidth, path1_packets, \
                    path1_retransmissions, path1_losses, \
                    path2_smoothed_RTT, path2_bandwidth, path2_packets, \
                    path2_retransmissions, path2_losses, \
                    = getTrainingVariables(request)

                time_stamp += 1  # in ms
                last_path = path

                # retrieve the previous state
                if len(s_batch) == 0:
                    state = np.zeros((S_INFO, S_LEN))
                else:
                    state = np.array(s_batch[-1], copy=True)

                # dequeue history record
                state = np.roll(state, -1, axis=1)

                # these should be S_INFO number of terms
                state[0, -1] = (bdw_paths[0] - 1.0) / (100.0 - 1.0)  # bandwidth path1
                state[1, -1] = (bdw_paths[1] - 1.0) / (100.0 - 1.0)  # bandwidth path2
                state[2, -1] = ((path1_smoothed_RTT * 1000.0) - 1.0) / 120.0  # max RTT so far: 120ms
                state[3, -1] = ((path2_smoothed_RTT * 1000.0) - 1.0) / 120.0
                state[4, -1] = ((path1_retransmissions + path1_losses) - 0.0) / 20.0
                state[5, -1] = ((path2_retransmissions + path2_losses) - 0.0) / 20.0

                s_batch.append(state)

                action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
                action_cumsum = np.cumsum(action_prob)
                path = (action_cumsum >
                        np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()

                action_vec = np.zeros(A_DIM)
                action_vec[path] = 1
                a_batch.append(action_vec)
                logger.debug("PATH: {}".format(path))

                entropy_record.append(a3c.compute_entropy(action_prob[0]))

                # prepare the response
                response = [request['StreamID'], PATHS[path]]
                response = [str(r).encode('utf-8') for r in response]
                ev2 = threading.Event()
                put_response((response, ev2), tqueue, logger)
                ev2.wait()  # block until `consumer` (i.e. rh) receives the response

    # send kill signal to all
    stop_env.set()
    rhandler.stophandler()
    collector.stophandler()

    # wait for the threads and the process to finish gracefully...
    for tp in tp_list:
        tp.join()
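
# A quick numeric sketch of the reward computed in the batch update above,
# with made-up measurements (bandwidth normalized over [1, 100] Mbps, SRTT
# against a 120 ms ceiling, retransmissions+losses against 20):
bdw_paths = [50, 20]                  # Mbps per path
action = [1, 0]                       # one-hot: the agent picked path 1
completion_time = 0.8                 # seconds, from the stream info
srtt = [0.040, 0.040]                 # seconds
loss = [2, 2]                         # retransmissions + losses per path

norm_bwd = [(b - 1.0) / (100.0 - 1.0) for b in bdw_paths]      # [~0.49, ~0.19]
aggr_srtt = sum(((s * 1000.0) - 1.0) / 120.0 for s in srtt)    # ~0.65
aggr_loss = sum(l / 20.0 for l in loss)                        # 0.2

reward = (action[0] * norm_bwd[0] + action[1] * norm_bwd[1]) \
    - completion_time - (0.8 * aggr_srtt) - (1.0 * aggr_loss)  # ~ -1.03
print(reward)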

def train_net(config):
    pGen, pKv, pRpn, pRoi, pBbox, pDataset, pModel, pOpt, pTest, \
        transform, data_name, label_name, metric_list = config.get_config(is_train=True)

    ctx = [mx.gpu(int(i)) for i in pKv.gpus]
    pretrain_prefix = pModel.pretrain.prefix
    pretrain_epoch = pModel.pretrain.epoch
    prefix = pGen.name
    save_path = os.path.join("experiments", prefix)
    begin_epoch = pOpt.schedule.begin_epoch
    end_epoch = pOpt.schedule.end_epoch
    lr_iter = pOpt.schedule.lr_iter

    # only rank == 0 prints all debug info
    kvstore_type = "dist_sync" if os.environ.get("DMLC_ROLE") == "worker" else pKv.kvstore
    kv = mx.kvstore.create(kvstore_type)
    rank = kv.rank

    # for distributed training using a shared file system
    if rank == 0:
        if not os.path.exists(save_path):
            os.makedirs(save_path)

    from utils.logger import config_logger
    config_logger(os.path.join(save_path, "log.txt"))

    model_prefix = os.path.join(save_path, "checkpoint")

    # set up logger
    logger = logging.getLogger()

    sym = pModel.train_symbol

    # setup multi-gpu
    input_batch_size = pKv.batch_image * len(ctx)

    # print config
    # if rank == 0:
    #     logger.info(pprint.pformat(config))

    # load dataset and prepare imdb for training
    image_sets = pDataset.image_set
    roidbs = [
        pkl.load(open("data/cache/{}.roidb".format(i), "rb"), encoding="latin1")
        for i in image_sets
    ]
    roidb = reduce(lambda x, y: x + y, roidbs)
    # filter empty images
    roidb = [rec for rec in roidb if rec["gt_bbox"].shape[0] > 0]
    # add flipped roi records
    flipped_roidb = []
    for rec in roidb:
        new_rec = rec.copy()
        new_rec["flipped"] = True
        flipped_roidb.append(new_rec)
    roidb = roidb + flipped_roidb

    from core.detection_input import AnchorLoader
    train_data = AnchorLoader(roidb=roidb,
                              transform=transform,
                              data_name=data_name,
                              label_name=label_name,
                              batch_size=input_batch_size,
                              shuffle=True,
                              kv=kv)

    # infer shapes
    worker_data_shape = dict(train_data.provide_data + train_data.provide_label)
    for key in worker_data_shape:
        worker_data_shape[key] = (pKv.batch_image,) + worker_data_shape[key][1:]
    arg_shape, _, aux_shape = sym.infer_shape(**worker_data_shape)

    _, out_shape, _ = sym.get_internals().infer_shape(**worker_data_shape)
    out_shape_dict = list(zip(sym.get_internals().list_outputs(), out_shape))
    _, out_shape, _ = sym.infer_shape(**worker_data_shape)
    terminal_out_shape_dict = zip(sym.list_outputs(), out_shape)

    if rank == 0:
        logger.info('parameter shape')
        logger.info(pprint.pformat(
            [i for i in out_shape_dict if not i[0].endswith('output')]))
        logger.info('intermediate output shape')
        logger.info(pprint.pformat(
            [i for i in out_shape_dict if i[0].endswith('output')]))
        logger.info('terminal output shape')
        logger.info(pprint.pformat([i for i in terminal_out_shape_dict]))

    # memonger
    if pModel.memonger:
        last_block = pModel.memonger_until or ""
        if rank == 0:
            logger.info("do memonger up to {}".format(last_block))
        type_dict = {k: np.float32 for k in worker_data_shape}
        sym = search_plan_to_layer(sym, last_block, 1000,
                                   type_dict=type_dict, **worker_data_shape)

    # load and initialize params
    if pOpt.schedule.begin_epoch != 0:
        arg_params, aux_params = load_checkpoint(model_prefix, begin_epoch)
    elif pModel.from_scratch:
        arg_params, aux_params = dict(), dict()
    else:
        arg_params, aux_params = load_checkpoint(pretrain_prefix, pretrain_epoch)

    try:
        pModel.process_weight(sym, arg_params, aux_params)
    except AttributeError:
        pass

    if pModel.random:
        import time
        mx.random.seed(int(time.time()))
        np.random.seed(int(time.time()))

    init = mx.init.Xavier(factor_type="in", rnd_type='gaussian', magnitude=2)
    init.set_verbosity(verbose=True)

    # create solver
    fixed_param_prefix = pModel.pretrain.fixed_param
    data_names = [k[0] for k in train_data.provide_data]
    label_names = [k[0] for k in train_data.provide_label]
    mod = DetModule(sym,
                    data_names=data_names,
                    label_names=label_names,
                    logger=logger,
                    context=ctx,
                    fixed_param_prefix=fixed_param_prefix)
    eval_metrics = mx.metric.CompositeEvalMetric(metric_list)

    # callback
    batch_end_callback = callback.Speedometer(train_data.batch_size,
                                              frequent=pGen.log_frequency)
    epoch_end_callback = callback.do_checkpoint(model_prefix)
    sym.save(model_prefix + ".json")

    # decide learning rate
    base_lr = pOpt.optimizer.lr * kv.num_workers
    lr_factor = 0.1
    iter_per_epoch = len(train_data) // input_batch_size
    lr_iter = [it // kv.num_workers for it in lr_iter]
    lr_iter = [it - iter_per_epoch * begin_epoch for it in lr_iter]
    lr_iter_discount = [it for it in lr_iter if it > 0]
    current_lr = base_lr * (lr_factor ** (len(lr_iter) - len(lr_iter_discount)))
    if rank == 0:
        logging.info('total iter {}'.format(iter_per_epoch * (end_epoch - begin_epoch)))
        logging.info('lr {}, lr_iters {}'.format(current_lr, lr_iter_discount))

    if pOpt.warmup is not None and pOpt.schedule.begin_epoch == 0:
        if rank == 0:
            logging.info('warmup lr {}, warmup step {}'.format(
                pOpt.warmup.lr, pOpt.warmup.iter))
        lr_scheduler = WarmupMultiFactorScheduler(step=lr_iter_discount,
                                                  factor=lr_factor,
                                                  warmup=True,
                                                  warmup_type=pOpt.warmup.type,
                                                  warmup_lr=pOpt.warmup.lr,
                                                  warmup_step=pOpt.warmup.iter)
    else:
        if len(lr_iter_discount) > 0:
            lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iter_discount,
                                                                lr_factor)
        else:
            lr_scheduler = None

    # optimizer
    optimizer_params = dict(momentum=pOpt.optimizer.momentum,
                            wd=pOpt.optimizer.wd,
                            learning_rate=current_lr,
                            lr_scheduler=lr_scheduler,
                            rescale_grad=1.0 / (len(pKv.gpus) * kv.num_workers),
                            clip_gradient=pOpt.optimizer.clip_gradient)
    if pKv.fp16:
        optimizer_params['multi_precision'] = True
        optimizer_params['rescale_grad'] /= 128.0

    # train
    mod.fit(train_data=train_data,
            eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            kvstore=kv,
            optimizer=pOpt.optimizer.type,
            optimizer_params=optimizer_params,
            initializer=init,
            allow_missing=True,
            arg_params=arg_params,
            aux_params=aux_params,
            begin_epoch=begin_epoch,
            num_epoch=end_epoch)

    logging.info("Training has finished")
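
# A small sketch of the gradient rescaling configured above, with made-up
# sizes. Dividing by (gpus * workers) turns the kvstore aggregation into an
# average rather than a sum; the extra /128 in fp16 mode assumes the loss was
# scaled up by a static factor of 128 so small gradients stay representable.
num_gpus, num_workers = 8, 2
rescale_grad = 1.0 / (num_gpus * num_workers)   # 0.0625
fp16 = True
if fp16:
    rescale_grad /= 128.0                       # undo the static loss scaling
print(rescale_grad)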