model_cls = None if args.densenet_version == 'atv2': model_cls = AnytimeFCDenseNetV2 elif args.densenet_version == 'atv1': model_cls = AnytimeFCDenseNet(AnytimeLogDenseNetV1) elif args.densenet_version == 'loglog': model_cls = AnytimeFCDenseNet(AnytimeLogLogDenseNet) elif args.densenet_version == 'c2f': model_cls = AnytimeFCNCoarseToFine side = 360 elif args.densenet_version == 'dense': model_cls = FCDensenet logger.set_log_root(log_root=args.log_dir) logger.auto_set_dir(action='k') fs.set_dataset_path(args.data_dir) ## # Store a philly_operation.txt in root log dir # every run will check it if the script is run on philly # philly_operation.txt should contain the same step that is current running # if it does not exit, the default is written there (train) if args.is_philly: philly_operation_fn = os.path.join(args.log_dir, 'philly_operation.txt') if not os.path.exists(philly_operation_fn): with open(philly_operation_fn, 'wt') as fout: fout.write(args.operation) else: with open(philly_operation_fn, 'rt') as fin: philly_operation = fin.read().strip() if philly_operation != args.operation:
def get_training_params(model_cls, args, is_training=True):
    """Set data-set specific params and build the dataflows and model.

    Mutates ``args`` in place with data-set specific settings
    (``num_classes``, ``input_size``, ``max_epoch``, ``steps_per_epoch``,
    ``gradprocs``, ...) and constructs the training or evaluation dataflows.

    Args:
        model_cls: model class; instantiated as ``model_cls(args)``.
        args: parsed argument namespace; heavily mutated by this function.
        is_training: when True, build the training dataflow/callbacks;
            otherwise build the evaluation dataflow and output specs.

    Returns:
        If ``is_training``:
            (model, args, starting_epoch, lr_schedule, ds_train,
             insrc_train, train_cbs)
        else:
            (model, args, ds_val, insrc_val, output_names, output_funcs)

    Raises:
        Exception: if ``args.ds_name`` is not a known dataset.
    """
    model = None
    ds_train, ds_val, insrc_train, insrc_val = None, None, None, None
    args.steps_per_epoch = None
    lr_schedule = None
    has_cbs_init = False
    train_cbs = []
    val_cbs = []
    output_names = None
    output_funcs = None
    # Global batch size scales with the number of GPUs; the initial learning
    # rate scales linearly with the global batch size.
    args.batch_size = scale_int_val_with_gpu(args.batch_size_per_gpu, args.nr_gpu)
    args.init_lr = args.init_lr_per_sample * args.batch_size

    if args.ds_name in ('cifar10', 'cifar100'):
        if args.ds_name == 'cifar10':
            args.num_classes = 10
        else:
            args.num_classes = 100
        args.regularize_coef = 'decay'
        args.input_size = 32
        fs.set_dataset_path(path=args.data_dir, auto_download=False)
        get_data = cifar.get_cifar_augmented_data
        if is_training:
            ds_train = get_data('train', args, do_multiprocess=True,
                do_validation=args.do_validation, shuffle=True)
            if args.training_type == 'darts_cifar':
                # DARTS-style cifar training overrides lr / regularization.
                args.init_lr = 0.025
                args.regularize_coef = 'const'
                args.regularize_const = 3e-4
            if args.child_train_from_scratch and args.job_type == 'remote_child':
                max_epoch = args.init_model_epoch
            else:
                max_epoch = args.max_train_model_epoch
            # Epochs are divided across GPUs (ceiling division).
            max_epoch = (max_epoch + args.nr_gpu - 1) // args.nr_gpu
            args.lr_decay_method = 'cosine'
            args.gradprocs = [GlobalNormClip(5)]
            args.max_epoch = max_epoch
            args.steps_per_epoch = ds_train.size()
        if args.do_remote_child_inf_runner or not is_training:
            ds_val = get_data('test', args, do_multiprocess=False,
                do_validation=args.do_validation, shuffle=False)

    elif args.ds_name == 'ilsvrc' or args.ds_name == 'imagenet':
        args.num_classes = 1000
        args.input_size = 224
        # Transfer uint8 data and normalize (mean/std) on the GPU.
        args.do_mean_std_gpu_process = True
        args.input_type = 'uint8'
        args.mean = imagenet.ilsvrc_mean
        args.std = imagenet.ilsvrc_std
        #args.s_type = 'imagenet' # make sure to check this...
        get_data = imagenet.get_ilsvrc_augmented_data
        if is_training:
            ds_train = get_data('train', args, do_multiprocess=True,
                is_train=True, shuffle=True)
            if args.child_train_from_scratch and args.job_type == 'remote_child':
                max_epoch = 100
            else:
                max_epoch = args.max_train_model_epoch
            args.max_epoch = (max_epoch + args.nr_gpu - 1) // args.nr_gpu
            args.steps_per_epoch = ds_train.size()
            args = imagenet.training_params_update(args)
            args.gradprocs = [GlobalNormClip(5)]
        if args.do_remote_child_inf_runner or not is_training:
            ds_val = get_data('val', args, do_multiprocess=True,
                is_train=False, shuffle=True)

    elif args.ds_name == 'tiny_imagenet':
        # Fix data-set specific params.
        args.num_classes = 200
        args.input_size = 64
        # Transfer uint8 data and cast to float on the GPU.
        args.do_mean_std_gpu_process = True
        args.input_type = 'uint8'
        args.mean = get_augmented_data.ilsvrc_mean
        args.std = get_augmented_data.ilsvrc_std
        args.s_type = 'conv7'
        args.b_type = 'bottleneck'
        # Training params.
        args.regularize_coef = 'const'
        fs.set_dataset_path(path=args.data_dir, auto_download=False)
        get_data = tiny_imagenet.get_tiny_imagenet_augmented_data
        if is_training:
            ds_train = get_data('train', args, do_multiprocess=True,
                shuffle=True, is_train=True)
            if args.child_train_from_scratch and args.job_type == 'remote_child':
                max_epoch = 100
            else:
                max_epoch = args.max_train_model_epoch
            args.max_epoch = (max_epoch + args.nr_gpu - 1) // args.nr_gpu
            args.steps_per_epoch = ds_train.size()
            args = imagenet.training_params_update(args)
            args.gradprocs = [GlobalNormClip(10)]
        if args.do_remote_child_inf_runner or not is_training:
            ds_val = get_data('val', args, do_multiprocess=True, is_train=False)

    elif downsampled_imagenet.is_ds_name_downsampled_imagenet(args.ds_name):
        args.num_classes = 1000
        args.input_size = downsampled_imagenet.ds_name_to_input_size(
            args.ds_name)
        args.regularize_coef = 'decay'
        args.b_type = 'bottleneck'
        get_data = downsampled_imagenet.get_downsampled_imagenet_augmented_data
        if is_training:
            ds_train = get_data('train', args, do_multiprocess=True,
                shuffle=True, do_validation=args.do_validation)
            if args.child_train_from_scratch and args.job_type == 'remote_child':
                max_epoch = 100
            else:
                max_epoch = args.max_train_model_epoch
            args.max_epoch = (max_epoch + args.nr_gpu - 1) // args.nr_gpu
            args.steps_per_epoch = ds_train.size()
            args = imagenet.training_params_update(args)
            args.gradprocs = [GlobalNormClip(10)]
        if args.do_remote_child_inf_runner or not is_training:
            ds_val = get_data('val', args, do_multiprocess=True,
                do_validation=args.do_validation)

    elif args.ds_name == 'speech_commands':
        args.regularize_coef = 'const'
        # All trained words plus the "silence" and "unknown" classes.
        args.num_classes = len(speech_commands.DEFAULT_TRAIN_WORDS) + 2
        get_data = speech_commands.get_augmented_speech_commands_data
        if is_training:
            ds_train = get_data('train', args, do_multiprocess=True, shuffle=True)
            if args.child_train_from_scratch and args.job_type == 'remote_child':
                max_epoch = 90
            else:
                max_epoch = args.max_train_model_epoch
            args.max_epoch = (max_epoch + args.nr_gpu - 1) // args.nr_gpu
            args.steps_per_epoch = ds_train.size()
            args = imagenet.training_params_update(args)
            args.gradprocs = [GlobalNormClip(10)]
        if args.do_remote_child_inf_runner or not is_training:
            val_split = 'val' if args.do_validation else 'test'
            ds_val = get_data(val_split, args, do_multiprocess=False, shuffle=False)

    elif args.ds_name == 'svhn':
        args.num_classes = 10
        args.regularize_coef = 'decay'
        args.input_size = 32
        fs.set_dataset_path(path=args.data_dir, auto_download=False)
        get_data = get_augmented_data.get_svhn_augmented_data
        ## Training model
        if is_training:
            ds_train = get_data('train', args, do_multiprocess=True, shuffle=True)
            if args.child_train_from_scratch and args.job_type == 'remote_child':
                max_epoch = 60
            else:
                max_epoch = 12
            args.max_epoch = (max_epoch + args.nr_gpu - 1) // args.nr_gpu
            args.steps_per_epoch = ds_train.size()
            args.lr_decay_method = 'cosine'
            args.gradprocs = [GlobalNormClip(5)]
        if args.do_remote_child_inf_runner or not is_training:
            ds_val = get_data('test', args, do_multiprocess=False, shuffle=False)

    elif args.ds_name.startswith('openml_'):
        # Dataset index is encoded in the name, e.g. 'openml_31' -> 31.
        int_start = args.ds_name.find('_') + 1
        dataset_idx = int(args.ds_name[int_start:])
        # Some arg protection in case these are used in the future
        #assert not hasattr(args, 'mlp_input_types') and not hasattr(args, 'mlp_input_dims')
        (l_ds, args.mlp_input_types, args.mlp_input_dims, n_data,
         args.num_classes, args.mlp_feat_means,
         args.mlp_feat_stds) = openml.get_openml_dataflow(
            dataset_idx, args.data_dir, splits=['train', 'val'],
            do_validation=args.do_validation)
        ds_train = preprocess_data_flow(l_ds['train'], args, True)
        ds_val = preprocess_data_flow(l_ds['val'], args, False)
        logger.info("Dataset {} has {} samples and {} dims".format(\
            args.ds_name, n_data, len(args.mlp_input_types)))
        if is_training:
            # FIX: this assignment was duplicated in the original.
            lr = float(args.init_lr)
            if args.child_train_from_scratch and args.job_type == 'remote_child':
                max_epoch = args.init_model_epoch
            else:
                max_epoch = args.max_train_model_epoch
            args.max_epoch = (max_epoch + args.nr_gpu - 1) // args.nr_gpu
            args.steps_per_epoch = ds_train.size()
            max_epoch = args.max_epoch
            # Step-wise decay: 10x drops at 1/2, 3/4 and 7/8 of training.
            lr_schedule = [(1, lr), (max_epoch // 2, lr * 1e-1),
                (max_epoch * 3 // 4, lr * 1e-2),
                (max_epoch * 7 // 8, lr * 1e-3)]
        if args.do_remote_child_inf_runner or not is_training:
            # NOTE(review): the third positional arg is True here but False
            # for the ds_val built above -- looks suspicious; confirm whether
            # the validation flow should use the training preprocessing.
            ds_val = preprocess_data_flow(l_ds['val'], args, True)

    elif args.ds_name in ('inat', 'inat100', 'inat1000', 'inat2017_1000'):
        inat_lmdb_dir = None
        inat_year = '2018'
        if args.ds_name == 'inat':
            args.num_classes = 8142
            n_allow = None
        elif args.ds_name == 'inat100':
            args.num_classes = 100
            n_allow = 100
        elif args.ds_name == 'inat1000':
            args.num_classes = 1000
            n_allow = 1000
        elif args.ds_name == 'inat2017_1000':
            args.num_classes = 1000
            n_allow = 1000
            inat_year = '2017'
            inat_lmdb_dir = 'inat2017_data/lmdb'
        args.input_size = 224
        args.do_mean_std_gpu_process = True
        args.input_type = 'uint8'
        args.mean = inat.image_mean
        args.std = inat.image_std
        get_data = inat.get_inat_augmented_data
        if is_training:
            ds_train = get_data('train', args, lmdb_dir=inat_lmdb_dir,
                year=inat_year, do_multiprocess=True,
                do_validation=args.do_validation, is_train=True,
                shuffle=True, n_allow=n_allow)
            if args.child_train_from_scratch and args.job_type == 'remote_child':
                max_epoch = 100
            else:
                max_epoch = args.max_train_model_epoch
            args.max_epoch = (max_epoch + args.nr_gpu - 1) // args.nr_gpu
            args.steps_per_epoch = ds_train.size()
            args = imagenet.training_params_update(args)
            args.gradprocs = [GlobalNormClip(5)]
        if args.do_remote_child_inf_runner or not is_training:
            ds_val = get_data('val', args, lmdb_dir=inat_lmdb_dir,
                year=inat_year, do_multiprocess=True,
                do_validation=args.do_validation, is_train=False,
                shuffle=True, n_allow=n_allow)

    elif args.ds_name == 'ptb':
        ptb_data_dir = os.path.join(args.data_dir, 'ptb_data')
        args.input_type = 'int32'
        args.search_cell_based = False
        # force single gpu for now.
        args.nr_gpu = 1
        args, local_args = ptb.training_params_update(args)
        # evaluation/testing batch size change.
        if not is_training:
            # FIX: original had identical 64 in both validation/test arms.
            args.batch_size_per_gpu = 64
        # update global batch size.
        args.batch_size = args.batch_size_per_gpu * args.nr_gpu
        args.init_lr = args.init_lr_per_sample * args.batch_size
        if is_training:
            var_size = not args.model_rnn_has_static_len
            ds_train = ptb.PennTreeBankDataFlow('train', ptb_data_dir,
                args.batch_size, args.model_rnn_max_len, var_size=var_size)
            args.steps_per_epoch = ds_train.size()
            args.model_rnn_vocab_size = ds_train.vocab_size
            if args.child_train_from_scratch and args.job_type == 'remote_child':
                max_epoch = 100
            else:
                max_epoch = args.max_train_model_epoch
            args.max_epoch = (max_epoch + args.nr_gpu - 1) // args.nr_gpu
            args.gradprocs = [GlobalNormClip(local_args.grad_clip)]
            # We need to construct the model now, as some ops require the
            # graph to be up (shifting states and reset states).
            model = model_cls(args)
            ptb.ptb_training_cbs(model, args, ptb_data_dir, train_cbs)
            has_cbs_init = True
        if args.do_remote_child_inf_runner or not is_training:
            ds_val = ptb.PennTreeBankDataFlow(
                'valid' if args.do_validation else 'test', ptb_data_dir,
                args.batch_size, args.model_rnn_max_len, var_size=False)
            args.model_rnn_vocab_size = ds_val.vocab_size
            model = model_cls(args)
            # Testing set up: log loss of each sample.
            output_names = [
                #model.inference_update_tensor(name_only=True) + ':0',
                'avg_batch_cost:0',
                'seq_len:0',
                'per_seq_sum_logloss:0',
            ]
            # The averaging is done automatically over the batches.
            # We need to average over the time.
            # We exponentiate per prediction logloss for the perplexity score.
            output_funcs = [
                #None,
                lambda x: x * args.batch_size,
                lambda x: x * args.batch_size,
                lambda x: np.exp(x / args.model_rnn_max_len),
            ]

    else:
        raise Exception("Unknown dataset {}".format(args.ds_name))

    # Computing epochs / steps / reading init learning rate.
    # Last section that may affect the args.
    args.max_train_steps = None
    if is_training:
        args.candidate_gate_eps = 1.0 / args.steps_per_epoch / args.batch_size
        starting_epoch = 1
        if args.model_dir is not None:
            # Resume: recover the epoch (and lr) from the latest checkpoint.
            ckpt = tf.train.latest_checkpoint(args.model_dir)
            if ckpt:
                starting_epoch = ann_app_utils.grep_starting_epoch(
                    ckpt, args.steps_per_epoch)
                if lr_schedule:
                    args.init_lr = ann_app_utils.grep_init_lr(starting_epoch,
                        lr_schedule)
        if args.debug_child_max_epoch:
            args.max_epoch = args.debug_child_max_epoch
        if args.debug_steps_per_epoch:
            args.steps_per_epoch = args.debug_steps_per_epoch
            starting_epoch = 1
        logger.info("Start at epoch {} with learning rate {}".format(
            starting_epoch, args.init_lr))
        args.max_train_steps = args.steps_per_epoch * args.max_epoch
        if model is not None:
            model.options.max_train_steps = args.max_train_steps
    if model is None:
        # if the dataset specific does not init the model, we init it here
        model = model_cls(args)
    # From now on args should be const.

    if is_training:
        if not has_cbs_init:
            if ds_val and args.debug_steps_per_epoch:
                ds_val = FixedSizeData(ds_val, args.debug_steps_per_epoch)
            train_cbs.extend(
                _inference_runner_train_cbs(args, ds_val, insrc_val, val_cbs))
        return (model, args, starting_epoch, lr_schedule, ds_train,
            insrc_train, train_cbs)
    else:
        if output_names is None:
            output_names = _inference_output_names(args)
            output_funcs = [None] * len(output_names)
        return (model, args, ds_val, insrc_val, output_names, output_funcs)
def cifar_svhn_train_or_test(args, model_cls):
    """Train or evaluate ``model_cls`` on cifar10/cifar100/svhn.

    args : parsed arguments; ``args.evaluate`` is a comma-separated list of
        split names -- when non-empty this function evaluates and returns,
        otherwise it launches training.
    model_cls : class name of the model to run; instantiated as
        ``model_cls(INPUT_SIZE, args)``.
    return : None (side effects only: evaluation logs or a training run).
    """
    log_init(args, model_cls)
    # generate a list of none-empty strings for specifying the splits
    args.evaluate = list(filter(bool, args.evaluate.split(',')))
    do_eval = len(args.evaluate) > 0
    evaluate = evaluate_cifar_svhn
    ## Set dataset-network specific assert/info
    if args.ds_name == 'cifar10' or args.ds_name == 'cifar100':
        if args.ds_name == 'cifar10':
            args.num_classes = 10
        else:
            args.num_classes = 100
        args.regularize_coef = 'decay'
        INPUT_SIZE = 32
        fs.set_dataset_path(path=args.data_dir, auto_download=False)
        get_data = get_augmented_data.get_cifar_augmented_data
        ds_train = get_data('train', args, do_multiprocess=not do_eval,
            do_validation=args.do_validation)
        ds_val = get_data('test', args, do_multiprocess=False,
            do_validation=args.do_validation)
        # Step-wise lr decay schedule for cifar (epoch, lr) pairs.
        lr_schedule = \
            [(1, 0.1), (140, 0.01), (210, 0.001), (250, 0.0002)]
        max_epoch = 300
        if do_eval:
            for eval_name in args.evaluate:
                if eval_name == 'train':
                    ds = ds_train
                elif eval_name == 'test':
                    ds = ds_val
                # NOTE(review): an unknown split name here would reuse the
                # previous ``ds`` (or raise NameError on the first
                # iteration) -- confirm callers only pass 'train'/'test'.
                evaluate(model_cls, ds, eval_name)
            return
    elif args.ds_name == 'svhn':
        args.num_classes = 10
        args.regularize_coef = 'decay'
        INPUT_SIZE = 32
        fs.set_dataset_path(path=args.data_dir, auto_download=False)
        get_data = get_augmented_data.get_svhn_augmented_data
        if do_eval:
            # 'train' for svhn implies also evaluating the 'extra' split.
            if 'train' in args.evaluate:
                args.evaluate.append('extra')
            for eval_name in args.evaluate:
                ds = get_data(eval_name, args, do_multiprocess=False)
                evaluate(model_cls, ds, eval_name)
            return
        ## Training model
        ds_train = get_data('train', args, do_multiprocess=True)
        ds_val = get_data('test', args, do_multiprocess=False)
        lr_schedule = \
            [(1, 0.1), (15, 0.01), (30, 0.001), (45, 0.0002)]
        max_epoch = 60
    # svhn/cifar are small enough so that we restart from scratch if interrupted.
    steps_per_epoch = ds_train.size() // args.nr_gpu
    # Recover resume point and its learning rate from the checkpoint path.
    starting_epoch = grep_starting_epoch(args.load, steps_per_epoch)
    logger.info("The starting epoch is {}".format(starting_epoch))
    args.init_lr = grep_init_lr(starting_epoch, lr_schedule)
    logger.info("The starting learning rate is {}".format(args.init_lr))
    model = model_cls(INPUT_SIZE, args)
    classification_cbs = model.compute_classification_callbacks()
    loss_select_cbs = model.compute_loss_select_callbacks()
    config = TrainConfig(
        dataflow=ds_train,
        callbacks=[
            ModelSaver(checkpoint_dir=args.model_dir,
                max_to_keep=2, keep_checkpoint_every_n_hours=100),
            InferenceRunner(ds_val,
                [ScalarStats('cost')] + classification_cbs),
            ScheduledHyperParamSetter('learning_rate', lr_schedule),
            HumanHyperParamSetter('learning_rate')
        ] + loss_select_cbs,
        model=model,
        monitors=[JSONWriter(), ScalarPrinter()],
        steps_per_epoch=steps_per_epoch,
        max_epoch=max_epoch,
        starting_epoch=starting_epoch
    )
    # Restore weights only if the checkpoint file actually exists.
    if args.load and os.path.exists(args.load):
        config.session_init = SaverRestore(args.load)
    launch_train_with_config(config,
        SyncMultiGPUTrainerParameterServer(args.nr_gpu))
# Script tail: copy parsed args into module-level config globals, set up
# logging/GPU selection, then build the train config and launch training.
# NOTE(review): STOP_GRADIENTS, NUM_CLASSES, FUNC_TYPE, etc. are globals
# defined outside this chunk -- presumably set earlier from argparse.
DO_VALID = args.do_validation
EXP_BASE = args.base
OPTIMAL_AT = args.opt_at
print("TF version: {}".format(tf.__version__))
if STOP_GRADIENTS:
    # Full stop-gradient implies partial stop-gradient with zero gamma.
    STOP_GRADIENTS_PARTIAL = True
    SG_GAMMA = 0.0
if args.gpu:
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
logger.set_log_root(log_root=args.log_dir)
logger.auto_set_dir()
fs.set_dataset_path(path=args.data_dir, auto_download=False)
MODEL_DIR = args.model_dir
# One-line dump of the full hyper-parameter configuration for the log.
logger.info("On Dataset CIFAR{}, Parameters: f= {}, n= {}, w= {}, c= {}, s= {}, batch_size= {}, stopgrad= {}, stopgradpartial= {}, sg_gamma= {}, rand_loss_selector= {}, exp_gamma= {}, sum_rand_ratio= {} do_validation= {} exp_base= {} opt_at= {}".format(\
    NUM_CLASSES, FUNC_TYPE, NUM_UNITS, WIDTH, INIT_CHANNEL, \
    NUM_UNITS_PER_STACK, BATCH_SIZE, STOP_GRADIENTS, \
    STOP_GRADIENTS_PARTIAL, SG_GAMMA, \
    args.samloss, EXP3_GAMMA, SUM_RAND_RATIO, DO_VALID, \
    EXP_BASE, OPTIMAL_AT))
config = get_config()
if args.load:
    config.session_init = SaverRestore(args.load)
if args.gpu:
    # One tower per visible GPU.
    config.nr_tower = len(args.gpu.split(','))
SyncMultiGPUTrainer(config).train()