def prepare_solver_config(self):
    """Build the optimizer settings and store them on ``self.solver_config``.

    The resulting dict holds:

    * ``gpu_counts`` -- the core count reported by ``utils.get_cores()`` when
      GPUs are in use, otherwise 0;
    * ``optimizer`` -- always ``"SGD"``;
    * ``optimizer_params`` -- momentum, weight decay (``wd``) and learning
      rate read through ``params.get_value`` (defaults 0.9, 0.0005, 0.01),
      plus a ``MultiFactorScheduler`` built from a fixed step list with
      ``factor=0.1``.
    """
    gpu_enabled, core_count = utils.get_cores()

    # The batch sizes are not needed for the solver settings; the call and
    # the three-way unpacking are kept so behaviour stays exactly the same.
    _batch, _actual_batch, _val_batch = utils.get_batch_size()

    # Steps at which the scheduler applies its factor.
    lr_decay_steps = [550, 1650, 2750, 4400]

    momentum = params.get_value("momentum", default=0.9)
    weight_decay = params.get_value("wd", default=0.0005)
    base_lr = params.get_value("learning_rate", default=0.01)
    scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_decay_steps, factor=0.1)

    if gpu_enabled:
        device_count = core_count
    else:
        device_count = 0

    self.solver_config = {
        "gpu_counts": device_count,
        # fit_args
        "optimizer": "SGD",
        "optimizer_params": {
            "momentum": momentum,
            "wd": weight_decay,
            "learning_rate": base_lr,
            "lr_scheduler": scheduler,
        },
    }
def prepare_train_config(self):
    """Assemble the training settings and store them on ``self.train_config``.

    Values are read through the AVA-SDK ``params`` helper, which can be used
    in two ways:

    1) fetch every setting, e.g.
           param_dict = params.get_all()
           value1 = param_dict["key1"]
    2) fetch a single value, e.g.
           value1 = params.get_value("key1", default=1)
    """
    checkpoint_every = params.get_value(
        "intervals.snapshotIntervalEpochs", default=1)
    epoch_limit = params.get_value("stopCondition.maxEpochs", default=3)
    use_rand_crop = params.get_value("inputTransform.randomCrop", default=True)
    use_rand_mirror = params.get_value(
        "inputTransform.randomMirror", default=True)

    # The validation batch size returned by the helper is deliberately
    # discarded: validation uses the training (actual) batch size instead.
    base_batch, total_batch, _ = utils.get_batch_size()
    validation_batch = total_batch

    # A small fixed crop size is used here rather than utils.get_crop_size().
    crop_height = crop_width = 16

    # Monitoring / snapshot setup.
    checkpoint_prefix = self.train_ins.get_snapshot_base_path() + "/snapshot"
    kvstore = mx.kvstore.create("device")
    # Disabled: per-rank snapshot prefix.
    #     rank = int(kv.rank)
    #     if rank > 0:
    #         snapshot_prefix += "-%s" % rank

    # Batch granularity at which metrics are printed / reported.
    report_every_n_batches = 10

    # AVA-SDK mxnet monitor callback initialisation.
    on_batch_end = self.train_ins.get_monitor_callback(
        "mxnet",
        batch_size=total_batch,
        batch_freq=report_every_n_batches)

    # mxnet default epoch callback, followed by the AVA-SDK one.
    default_checkpoint_cb = mx.callback.do_checkpoint(
        checkpoint_prefix, checkpoint_every)
    sdk_epoch_cb = self.train_ins.get_epoch_end_callback(
        "mxnet", epoch_interval=checkpoint_every, other_files=[])
    on_epoch_end = [default_checkpoint_cb, sdk_epoch_cb]

    # Training settings; users may adjust these as needed.
    config = {
        "input_data_shape": (CROP_CHANNELS, crop_height, crop_width),
        "rand_crop": use_rand_crop,
        "rand_mirror": use_rand_mirror,
        "batch_size": base_batch,
        "actual_batch_size": total_batch,
        "val_batch_size": validation_batch,
    }
    # fit_args
    config.update({
        # AVA-SDK supplies the list of mxnet metrics.
        "eval_metric": mxnet_monitor.full_mxnet_metrics(),
        "epoch_end_callback": on_epoch_end,
        "batch_end_callback": on_batch_end,
        "kvstore": kvstore,
        "num_epoch": epoch_limit,
    })
    self.train_config = config
def prepare_train_config(self):
    """Assemble the training settings and store them on ``self.train_config``.

    Values are read through the AVA-SDK ``params`` helper, which can be used
    in two ways:

    1) fetch every setting, e.g.
           param_dict = params.get_all()
           value1 = param_dict["key1"]
    2) fetch a single value, e.g.
           value1 = params.get_value("key1", default=1)

    The stored dict contains the data-iterator settings (input shape, random
    crop/mirror flags, batch sizes) and the ``fit`` arguments (metrics,
    callbacks, kvstore, number of epochs).
    """
    snapshot_interval_epochs = params.get_value(
        "snapshot_interval_epochs", default=1)
    max_epochs = params.get_value("max_epochs", default=3)
    rand_crop = params.get_value("rand_crop", default=True)
    rand_mirror = params.get_value("rand_mirror", default=True)

    # NOTE(review): every value returned here is superseded by the explicit
    # parameters below; the call is kept only in case the helper has side
    # effects -- TODO confirm and remove.
    utils.get_batch_size()

    batch_size = params.get_value("batchSize", default=8)
    use_gpu, cores = utils.get_cores()
    logger.info("Cores GPU=%s, count=%d", use_gpu, cores)

    # On GPU the effective batch is the configured batch times the core count.
    actual_batch_size = batch_size * cores if use_gpu else batch_size

    # Validation always uses the effective training batch size, so a separate
    # "valBatchSize" parameter is not consulted.
    val_batch_size = actual_batch_size

    # Crop size must match the model's input requirements. No default is
    # supplied -- presumably both keys must be configured; verify what
    # params.get_value yields for a missing key.
    crop_w = params.get_value("crop_w")
    crop_h = params.get_value("crop_h")

    # Monitoring / snapshot setup.
    snapshot_prefix = self.train_ins.get_snapshot_base_path() + "/snapshot"
    kv_store = "device"
    kv = mx.kvstore.create(kv_store)
    # Disabled: per-rank snapshot prefix.
    #     rank = int(kv.rank)
    #     if rank > 0:
    #         snapshot_prefix += "-%s" % rank

    batch_freq = 10  # batch granularity for printing / reporting metrics

    # Convert to float BEFORE dividing: with integer division the quotient
    # would already be truncated by the time it reaches the ceil helper,
    # under-counting the batches per epoch whenever the sample count is not
    # an exact multiple of the batch size.
    batch_of_epoch = utils.ceil_by_level(
        float(utils.get_sampleset_num()) / actual_batch_size)

    # AVA-SDK mxnet monitor callback initialisation.
    batch_end_cb = self.train_ins.get_monitor_callback(
        "mxnet",
        batch_size=actual_batch_size,
        batch_freq=batch_freq)
    epoch_end_cb = [
        # mxnet default epoch callback
        mx.callback.do_checkpoint(snapshot_prefix, snapshot_interval_epochs),
        self.train_ins.get_epoch_end_callback(
            "mxnet",
            batch_of_epoch=batch_of_epoch,
            epoch_interval=snapshot_interval_epochs,
            other_files=[]),
    ]

    # Training settings; users may adjust these as needed.
    self.train_config = {
        "input_data_shape": (CROP_CHANNELS, crop_h, crop_w),
        "rand_crop": rand_crop,
        "rand_mirror": rand_mirror,
        "batch_size": batch_size,
        "actual_batch_size": actual_batch_size,
        "val_batch_size": val_batch_size,
        # fit_args
        # AVA-SDK supplies the list of mxnet metrics.
        "eval_metric": mxnet_monitor.full_mxnet_metrics(),
        "epoch_end_callback": epoch_end_cb,
        "batch_end_callback": batch_end_cb,
        "kvstore": kv,
        "num_epoch": max_epochs,
    }