def do_train(model, image_size=224, buffer_size=2000):
    batch = args.batch
    total_batch = batch * hvd.size()

    if args.fake:
        data = FakeData(
            [[batch, image_size, image_size, 3], [batch]], 1000,
            random=False, dtype=['uint8', 'int32'])
        data = StagingInput(QueueInput(data))
        callbacks = []
        steps_per_epoch = 50
    else:
        logger.info("#Tower: {}; Batch size per tower: {}".format(
            hvd.size(), batch))
        zmq_addr = 'ipc://@imagenet-train-b{}-p{}'.format(batch, args.port)
        if args.no_zmq_ops:
            dataflow = RemoteDataZMQ(zmq_addr, hwm=buffer_size, bind=False)
            data = QueueInput(dataflow)
        else:
            data = ZMQInput(zmq_addr, 30, bind=False)
        data = StagingInput(data)

        steps_per_epoch = int(np.round(1281167 / total_batch))

    BASE_LR = 0.1 * (total_batch // 256)
    """
    ImageNet in 1 Hour, Sec 2.1:
    Linear Scaling Rule: When the minibatch size is multiplied by k,
    multiply the learning rate by k.
    """
    logger.info("Base LR: {}".format(BASE_LR))
    callbacks = [
        ModelSaver(max_to_keep=10),
        EstimatedTimeLeft(),
        ScheduledHyperParamSetter(
            'learning_rate',
            [(0, BASE_LR), (35, BASE_LR * 1e-1),
             (70, BASE_LR * 1e-2), (95, BASE_LR * 1e-3)])
    ]
    """
    Feature Denoising, Sec 5:
    Our models are trained for a total of 110 epochs; we decrease the
    learning rate by 10× at the 35-th, 70-th, and 95-th epoch
    """
    max_epoch = 110

    if BASE_LR > 0.1:
        callbacks.append(
            ScheduledHyperParamSetter(
                'learning_rate',
                [(0, 0.1), (5 * steps_per_epoch, BASE_LR)],
                interp='linear', step_based=True))
        """
        ImageNet in 1 Hour, Sec 2.2:
        we start from a learning rate of η and increment it by a constant
        amount at each iteration such that it reaches η̂ = kη after 5 epochs
        """

    if not args.fake:
        # Add distributed evaluation for the various attackers we care about.
        def add_eval_callback(name, attacker, condition):
            cb = create_eval_callback(
                name,
                model.get_inference_func(attacker),
                # always eval in the last 2 epochs no matter what
                lambda epoch_num: condition(epoch_num) or epoch_num > max_epoch - 2,
                image_size=image_size)
            callbacks.append(cb)

        add_eval_callback('eval-clean', NoOpAttacker(), lambda e: True)
        add_eval_callback(
            'eval-10step',
            PGDAttacker(10, args.attack_epsilon, args.attack_step_size),
            lambda e: True)
        add_eval_callback(
            'eval-50step',
            PGDAttacker(50, args.attack_epsilon, args.attack_step_size),
            lambda e: e % 20 == 0)
        add_eval_callback(
            'eval-100step',
            PGDAttacker(100, args.attack_epsilon, args.attack_step_size),
            lambda e: e % 10 == 0 or e > max_epoch - 5)
        for k in [20, 30, 40, 60, 70, 80, 90]:
            add_eval_callback(
                'eval-{}step'.format(k),
                PGDAttacker(k, args.attack_epsilon, args.attack_step_size),
                lambda e: False)

    trainer = HorovodTrainer(average=True)
    trainer.setup_graph(model.get_input_signature(), data,
                        model.build_graph, model.get_optimizer)
    trainer.train_with_defaults(
        callbacks=callbacks,
        steps_per_epoch=steps_per_epoch,
        session_init=SmartInit(args.load),
        max_epoch=max_epoch,
        starting_epoch=args.starting_epoch)
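# Worked example of the schedules above (illustration only, assuming 64 towers
# with batch=32 per tower): total_batch = 2048, so BASE_LR = 0.1 * (2048 // 256)
# = 0.8 and steps_per_epoch = round(1281167 / 2048) = 626. Since BASE_LR > 0.1,
# the warmup callback ramps the learning rate linearly from 0.1 to 0.8 over the
# first 5 * 626 = 3130 steps; afterwards the epoch-based schedule decays it 10x
# at epochs 35, 70 and 95.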
                        choices=[50, 101, 152])
    parser.add_argument('--arch', help='Name of architectures defined in nets.py',
                        default='ResNet')
    args = parser.parse_args()

    # Define model
    model = getattr(nets, args.arch + 'Model')(args)

    # Define attacker
    if args.attack_iter == 0 or args.eval_directory:
        attacker = NoOpAttacker()
    else:
        attacker = PGDAttacker(
            args.attack_iter, args.attack_epsilon, args.attack_step_size,
            prob_start_from_clean=0.2 if not args.eval else 0.0)
        if args.use_fp16xla:
            attacker.USE_FP16 = True
            attacker.USE_XLA = True  # False if args.arch.endswith("Dither") else True
    model.set_attacker(attacker)

    os.system("nvidia-smi")
    hvd.init()

    gpu_thread_count = 2
    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
    os.environ['TF_GPU_THREAD_COUNT'] = str(gpu_thread_count)
    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
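    # The environment variables above are standard TensorFlow performance
    # knobs: 'gpu_private' gives each GPU its own dedicated kernel-launch
    # threads, TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT selects a faster
    # cuDNN batch-norm kernel, and TF_ENABLE_WINOGRAD_NONFUSED enables the
    # non-fused Winograd convolution algorithms.
    #
    # Example multi-GPU launch (a sketch: the flag names are inferred from
    # the args.* fields used above and may differ from the actual parser):
    #   horovodrun -np 8 python main.py --batch 32 \
    #       --attack-iter 30 --attack-epsilon 16.0 --attack-step-size 1.0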