def run(args):
    """Assemble the AvatarSynthModel training config and launch training.

    Restores from ``args.load_path`` when given, then trains with a
    replicated multi-GPU trainer (one tower per GPU, minimum one tower).
    """
    gpu_count = get_nr_gpu()
    # Guarantee at least one tower so CPU-only machines still train.
    tower_count = max(gpu_count, 1)

    config = get_config(args, AvatarSynthModel(args), gpu_count, tower_count)
    if args.load_path:
        # Warm-start / resume from a previously saved checkpoint.
        config.session_init = SaverRestore(args.load_path)

    launch_train_with_config(config, SyncMultiGPUTrainerReplicated(tower_count))
def train(self, args):
    """Train the model configured by *args*, then export a compact graph.

    Args:
        args: parsed CLI namespace; must provide ``save`` (output
            directory) and ``gpu`` (device spec accepted by ``change_gpu``).

    Side effects: stores ``args`` on ``self``, creates ``args.save`` if
    missing, and on success writes ``compact.pb`` into it. Training
    failures are logged (traceback printed) rather than propagated.
    """
    self.args = args
    # exist_ok avoids the race between an existence check and makedirs.
    os.makedirs(self.args.save, exist_ok=True)

    with change_gpu(self.args.gpu):
        train_df = self._dataflow()
        # Hoisted: the original called get_num_gpu() three times.
        num_gpu = get_num_gpu()
        trainer = (SimpleTrainer() if num_gpu <= 1
                   else SyncMultiGPUTrainerReplicated(num_gpu))
        print("Found %d gpus. Using trainer:" % num_gpu, trainer)
        # Setup callbacks
        self._default_callbacks()
        try:
            launch_train_with_config(
                self.pred_config(self.args, train_df, self.callbacks),
                trainer)
        except Exception:
            # Deliberate best-effort: report the failure, don't crash the
            # caller. The exported model is simply skipped in that case.
            traceback.print_exc()
        else:
            # If everything worked, save a compacted model.
            self.export(os.path.join(self.args.save, "compact.pb"))
"Note that it's best to keep per-GPU batch size in [32, 64] to obtain the best accuracy." "Pretrained models listed in README were trained with batch=32x8.") parser.add_argument('--mode', choices=['resnet', 'preact', 'se'], help='variants of resnet to use', default='resnet') args = parser.parse_args() if args.gpu: os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu model = Model(args.depth, args.mode) model.data_format = args.data_format if args.eval: batch = 128 # something that can run on one gpu ds = get_data('val', batch) eval_on_ILSVRC12(model, get_model_loader(args.load), ds) else: if args.fake: logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd') else: logger.set_logger_dir( os.path.join('train_log', 'imagenet-{}-d{}'.format(args.mode, args.depth))) config = get_config(model, fake=args.fake) if args.load: config.session_init = get_model_loader(args.load) trainer = SyncMultiGPUTrainerReplicated(max(get_nr_gpu(), 1)) launch_train_with_config(config, trainer)
temp = temp[keys[i]] temp[keys[-1]] = value # set GPU machine if config['gpu'] in [None, 'None', '']: os.environ['CUDA_VISIBLE_DEVICES'] = '' num_gpu = 0 else: os.environ['CUDA_VISIBLE_DEVICES'] = config['gpu'] num_gpu = max(get_num_gpu(), 1) config['num_gpu'] = num_gpu # set log directory if config['logdir'] in [None, 'None', '']: logger.auto_set_dir() else: logger.set_logger_dir('train_log/' + config['logdir'], action='d') # save configuration with open(logger.get_logger_dir() + '/config.json', 'w') as outfile: json.dump(config, outfile) # get train config train_config = get_train_config(config) # train the model if num_gpu > 1: launch_train_with_config(train_config, SyncMultiGPUTrainerReplicated(num_gpu)) else: launch_train_with_config(train_config, SimpleTrainer())
model=model, data=data, callbacks=callbacks, steps_per_epoch=100 if args.fake else 1281167 // args.batch, session_init=init, max_epoch=100, ) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--data', help='imagenet data dir') parser.add_argument('--load', required=True, help='path to pre-trained model') parser.add_argument('--fake', help='use FakeData to debug or benchmark this model', action='store_true') parser.add_argument('--batch', default=256, type=int, help='total batch size') parser.add_argument('--logdir') args = parser.parse_args() model = LinearModel() if args.fake: logger.set_logger_dir('fake_train_log', 'd') else: if args.logdir is None: args.logdir = './moco_lincls' logger.set_logger_dir(args.logdir) config = get_config(model) trainer = SyncMultiGPUTrainerReplicated(get_num_gpu()) launch_train_with_config(config, trainer)
def train(args, cfg):
    """Run pneumothorax-segmentation training with tensorpack.

    Builds the dataflow, copies the config file into the output directory
    (asking before skipping an existing one), assembles callbacks and a
    learning-rate schedule, then launches a single- or multi-GPU trainer.

    Args:
        args: CLI namespace (mode, dirs, config path, resume info, ...).
        cfg: parsed experiment config (batch size, lr_schedule, network, ...).
    """
    out_dirs = gen_outdirs(args, "tp")
    output_dir, out_res_dir = out_dirs["output_dir"], out_dirs["out_res_dir"]
    df = PneuSegDF(args.mode, out_res_dir, args.train_dir, args.testset_dir,
                   args.min_num_workers, cfg)
    # At least one "GPU" (tower) even on CPU-only machines.
    num_gpu = max(get_num_gpu(), 1)
    ds = df.prepared(num_gpu, cfg.batch_size)
    # Avoid overwritting config file
    if os.path.exists(pj(output_dir, os.path.basename(args.config))):
        # Blocks interactively so the user acknowledges the stale config.
        input(
            "Config file will NOT be overwritten. Press Enter to continue...")
    else:
        shutil.copy(args.config, output_dir)
    logger.set_logger_dir(pj(output_dir, "log"))
    callback_list = [
        # PeriodicCallback overwritten the frequency of what's wrapped
        PeriodicCallback(ModelSaver(50, checkpoint_dir=output_dir),
                         every_k_epochs=1),
        GPUUtilizationTracker(),
        # Merge summaries every step in debug mode, never otherwise.
        MergeAllSummaries(1 if args.train_debug else 0),
        # ProgressBar(["Loss"])
    ]
    if cfg.network["norm_layer"] == "BN_layers":
        callback_list.append(BN_layers_update())
    if cfg.lr_schedule["type"] == "epoch_wise_constant":
        # Piecewise-constant LR; divided by num_gpu — presumably to
        # compensate for gradient aggregation across towers (TODO confirm).
        schedule = [(ep, lr / num_gpu) for ep, lr in zip(
            [0] + cfg.lr_schedule["epoch_to_drop_lr"], cfg.lr_schedule["lr"])]
        callback_list.append(
            ScheduledHyperParamSetter("learning_rate", schedule))
    elif cfg.lr_schedule["type"] == "halved":
        # Start at init_lr, then every `period` epochs (from
        # first_epoch2drop) derive the next LR from the previously
        # scheduled one, divided by decay_rate * num_gpu.
        schedule = [(0, cfg.lr_schedule["init_lr"])]
        for i in range(cfg.lr_schedule["first_epoch2drop"], cfg.max_epoch,
                       cfg.lr_schedule["period"]):
            # Index maps epoch i back to the schedule entry added in the
            # previous iteration (entry 0 at i == first_epoch2drop).
            schedule.append(
                (i, schedule[int((i - cfg.lr_schedule["first_epoch2drop"])
                                 / cfg.lr_schedule["period"])][1]
                 / (cfg.lr_schedule["decay_rate"] * num_gpu)))
        print(schedule)
        callback_list.append(
            ScheduledHyperParamSetter("learning_rate", schedule))
    # +1 so a partial final batch still counts as a step.
    steps_per_epoch = len(ds) // num_gpu + 1
    train_cfg = TrainConfig(
        model=Tensorpack_model(cfg, steps_per_epoch),
        data=QueueInput(ds),
        steps_per_epoch=steps_per_epoch,
        callbacks=callback_list,
        monitors=[
            # ScalarPrinter(True, whitelist=["Loss", "LR"]),
            ScalarPrinter(True),
            # ScalarPrinter(),
            TFEventWriter(),
            # JSONWriter()
        ],
        max_epoch=cfg.max_epoch,
        session_init=SmartInit(args.resume),
        starting_epoch=args.resume_epoch)
    launch_train_with_config(
        train_cfg,
        SyncMultiGPUTrainerReplicated(num_gpu)
        if num_gpu > 1 else SimpleTrainer())