Example #1
 def _get_optimizer(self):
     lr = symbolic_functions.get_scalar_var('learning_rate',
                                            5e-3,
                                            summary=True)
     opt = tf.train.AdamOptimizer(lr, epsilon=1e-3)
     return optimizer.apply_grad_processors(
         opt, [GlobalNormClip(5), SummaryGradient()])
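All of the examples on this page use tensorpack's gradient-processor utilities on top of TF1 graph mode. A minimal sketch of the imports these snippets assume (module paths as in tensorpack's tfutils package; verify against the tensorpack version in use):

# Sketch of the imports the snippets on this page rely on
# (tensorpack + TF1 graph mode; check against your tensorpack version).
import tensorflow as tf
from tensorpack.tfutils import optimizer, symbolic_functions
from tensorpack.tfutils.gradproc import GlobalNormClip, SummaryGradient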
Example #2
File: dr4.py  Project: voidiak/MTRE
 def optimizer(self):
     lr = tf.get_variable("learning_rate",
                          initializer=0.001,
                          trainable=False)
     opt = tf.train.AdamOptimizer(lr)
     return optimizer.apply_grad_processors(
         opt, [GlobalNormClip(5), SummaryGradient()])
Example #3
 def optimizer(self):
     lr = tf.get_variable('learning_rate',
                          initializer=5e-3,
                          trainable=False)
     opt = tf.train.AdamOptimizer(lr, epsilon=1e-3)
     return optimizer.apply_grad_processors(
         opt, [GlobalNormClip(5), SummaryGradient()])
Example #4
 def get_optimizer(self):
     lr = tf.get_variable('learning_rate',
                          initializer=2e-3,
                          trainable=False)
     opt = tf.train.AdamOptimizer(lr)
     # clip gradients by their global norm before applying them
     return optimizer.apply_grad_processors(opt, [GlobalNormClip(5)])
Example #5
 def optimizer(self):
     lr = tf.get_variable('learning_rate',
                          initializer=2e-2,
                          trainable=False)
     opt = tf.train.AdamOptimizer(lr)
     return optimizer.apply_grad_processors(opt,
                                            [GlobalNormClip(GRADIENT_CLIP)])
Example #6
File: edr4.py  Project: voidiak/MTRE
 def optimizer(self):
     lr = tf.get_variable("learning_rate",
                          initializer=self.params.lr,
                          trainable=False)
     opt = tf.train.AdamOptimizer(lr)
     return optimizer.apply_grad_processors(opt, [GlobalNormClip(5)])
Example #7
def get_training_params(model_cls, args, is_training=True):
    """
    Dataset-specific params. Modify args for the specific dataset.
    """
    model = None
    ds_train, ds_val, insrc_train, insrc_val = None, None, None, None
    args.steps_per_epoch = None
    lr_schedule = None
    has_cbs_init = False
    train_cbs = []
    val_cbs = []
    output_names = None
    output_funcs = None

    args.batch_size = scale_int_val_with_gpu(args.batch_size_per_gpu,
                                             args.nr_gpu)
    args.init_lr = args.init_lr_per_sample * args.batch_size
    if args.ds_name == 'cifar10' or args.ds_name == 'cifar100':
        if args.ds_name == 'cifar10':
            args.num_classes = 10
        else:
            args.num_classes = 100
        args.regularize_coef = 'decay'
        args.input_size = 32
        fs.set_dataset_path(path=args.data_dir, auto_download=False)
        get_data = cifar.get_cifar_augmented_data

        if is_training:
            ds_train = get_data('train',
                                args,
                                do_multiprocess=True,
                                do_validation=args.do_validation,
                                shuffle=True)
            if args.training_type == 'darts_cifar':
                args.init_lr = 0.025
                args.regularize_coef = 'const'
                args.regularize_const = 3e-4

            lr = float(args.init_lr)
            if args.child_train_from_scratch and args.job_type == 'remote_child':
                max_epoch = args.init_model_epoch
            else:
                max_epoch = args.max_train_model_epoch
            max_epoch = (max_epoch + args.nr_gpu - 1) // args.nr_gpu
            args.lr_decay_method = 'cosine'
            args.gradprocs = [GlobalNormClip(5)]
            args.max_epoch = max_epoch
            args.steps_per_epoch = ds_train.size()

        if args.do_remote_child_inf_runner or not is_training:
            ds_val = get_data('test',
                              args,
                              do_multiprocess=False,
                              do_validation=args.do_validation,
                              shuffle=False)

    elif args.ds_name == 'ilsvrc' or args.ds_name == 'imagenet':
        args.num_classes = 1000
        args.input_size = 224

        args.do_mean_std_gpu_process = True
        args.input_type = 'uint8'
        args.mean = imagenet.ilsvrc_mean
        args.std = imagenet.ilsvrc_std
        #args.s_type = 'imagenet' # make sure to check this...

        get_data = imagenet.get_ilsvrc_augmented_data
        if is_training:
            ds_train = get_data('train',
                                args,
                                do_multiprocess=True,
                                is_train=True,
                                shuffle=True)
            if args.child_train_from_scratch and args.job_type == 'remote_child':
                max_epoch = 100
            else:
                max_epoch = args.max_train_model_epoch
            args.max_epoch = (max_epoch + args.nr_gpu - 1) // args.nr_gpu
            args.steps_per_epoch = ds_train.size()
            args = imagenet.training_params_update(args)
            args.gradprocs = [GlobalNormClip(5)]

        if args.do_remote_child_inf_runner or not is_training:
            ds_val = get_data('val',
                              args,
                              do_multiprocess=True,
                              is_train=False,
                              shuffle=True)

    elif args.ds_name == 'tiny_imagenet':
        # fix data-set specific params
        args.num_classes = 200
        args.input_size = 64

        # transfer uint8 data and cast to float in gpu
        args.do_mean_std_gpu_process = True
        args.input_type = 'uint8'
        args.mean = get_augmented_data.ilsvrc_mean
        args.std = get_augmented_data.ilsvrc_std
        args.s_type = 'conv7'
        args.b_type = 'bottleneck'

        # training params
        args.regularize_coef = 'const'
        fs.set_dataset_path(path=args.data_dir, auto_download=False)
        get_data = tiny_imagenet.get_tiny_imagenet_augmented_data

        if is_training:
            ds_train = get_data('train',
                                args,
                                do_multiprocess=True,
                                shuffle=True,
                                is_train=True)
            if args.child_train_from_scratch and args.job_type == 'remote_child':
                max_epoch = 100
            else:
                max_epoch = args.max_train_model_epoch
            args.max_epoch = (max_epoch + args.nr_gpu - 1) // args.nr_gpu
            args.steps_per_epoch = ds_train.size()
            args = imagenet.training_params_update(args)
            args.gradprocs = [GlobalNormClip(10)]

        if args.do_remote_child_inf_runner or not is_training:
            ds_val = get_data('val',
                              args,
                              do_multiprocess=True,
                              is_train=False)

    elif downsampled_imagenet.is_ds_name_downsampled_imagenet(args.ds_name):
        args.num_classes = 1000
        args.input_size = downsampled_imagenet.ds_name_to_input_size(
            args.ds_name)
        args.regularize_coef = 'decay'
        args.b_type = 'bottleneck'

        get_data = downsampled_imagenet.get_downsampled_imagenet_augmented_data
        if is_training:
            ds_train = get_data('train',
                                args,
                                do_multiprocess=True,
                                shuffle=True,
                                do_validation=args.do_validation)

            if args.child_train_from_scratch and args.job_type == 'remote_child':
                max_epoch = 100
            else:
                max_epoch = args.max_train_model_epoch
            args.max_epoch = (max_epoch + args.nr_gpu - 1) // args.nr_gpu
            args.steps_per_epoch = ds_train.size()
            args = imagenet.training_params_update(args)
            args.gradprocs = [GlobalNormClip(10)]

        if args.do_remote_child_inf_runner or not is_training:
            ds_val = get_data('val',
                              args,
                              do_multiprocess=True,
                              do_validation=args.do_validation)

    elif args.ds_name == 'speech_commands':
        args.regularize_coef = 'const'
        args.num_classes = len(speech_commands.DEFAULT_TRAIN_WORDS) + 2
        get_data = speech_commands.get_augmented_speech_commands_data

        if is_training:
            ds_train = get_data('train',
                                args,
                                do_multiprocess=True,
                                shuffle=True)
            if args.child_train_from_scratch and args.job_type == 'remote_child':
                max_epoch = 90
            else:
                max_epoch = args.max_train_model_epoch
            args.max_epoch = (max_epoch + args.nr_gpu - 1) // args.nr_gpu
            args.steps_per_epoch = ds_train.size()
            args = imagenet.training_params_update(args)
            args.gradprocs = [GlobalNormClip(10)]

        if args.do_remote_child_inf_runner or not is_training:
            val_split = 'val' if args.do_validation else 'test'
            ds_val = get_data(val_split,
                              args,
                              do_multiprocess=False,
                              shuffle=False)

    elif args.ds_name == 'svhn':
        args.num_classes = 10
        args.regularize_coef = 'decay'
        args.input_size = 32
        fs.set_dataset_path(path=args.data_dir, auto_download=False)
        get_data = get_augmented_data.get_svhn_augmented_data

        ## Training model
        if is_training:
            ds_train = get_data('train',
                                args,
                                do_multiprocess=True,
                                shuffle=True)
            if args.child_train_from_scratch and args.job_type == 'remote_child':
                max_epoch = 60
            else:
                max_epoch = 12
            args.max_epoch = (max_epoch + args.nr_gpu - 1) // args.nr_gpu
            args.steps_per_epoch = ds_train.size()
            args.lr_decay_method = 'cosine'
            args.gradprocs = [GlobalNormClip(5)]

        if args.do_remote_child_inf_runner or not is_training:
            ds_val = get_data('test',
                              args,
                              do_multiprocess=False,
                              shuffle=False)

    elif args.ds_name.startswith('openml_'):
        int_start = args.ds_name.find('_') + 1
        dataset_idx = int(args.ds_name[int_start:])
        # Some arg protection in case these are used in the future
        #assert not hasattr(args, 'mlp_input_types') and not hasattr(args, 'mlp_input_dims')
        (l_ds, args.mlp_input_types, args.mlp_input_dims, n_data,
         args.num_classes, args.mlp_feat_means,
         args.mlp_feat_stds) = openml.get_openml_dataflow(
             dataset_idx,
             args.data_dir,
             splits=['train', 'val'],
             do_validation=args.do_validation)
        ds_train = preprocess_data_flow(l_ds['train'], args, True)
        ds_val = preprocess_data_flow(l_ds['val'], args, False)
        logger.info("Dataset {} has {} samples and {} dims".format(\
            args.ds_name, n_data, len(args.mlp_input_types)))

        if is_training:
            lr = float(args.init_lr)
            if args.child_train_from_scratch and args.job_type == 'remote_child':
                max_epoch = args.init_model_epoch
            else:
                max_epoch = args.max_train_model_epoch

            args.max_epoch = (max_epoch + args.nr_gpu - 1) // args.nr_gpu
            args.steps_per_epoch = ds_train.size()
            max_epoch = args.max_epoch
            lr_schedule = [(1, lr), (max_epoch // 2, lr * 1e-1),
                           (max_epoch * 3 // 4, lr * 1e-2),
                           (max_epoch * 7 // 8, lr * 1e-3)]

        if args.do_remote_child_inf_runner or not is_training:
            ds_val = preprocess_data_flow(l_ds['val'], args, True)

    elif args.ds_name in ('inat', 'inat100', 'inat1000', 'inat2017_1000'):
        inat_lmdb_dir = None
        inat_year = '2018'

        if args.ds_name == 'inat':
            args.num_classes = 8142
            n_allow = None
        elif args.ds_name == 'inat100':
            args.num_classes = 100
            n_allow = 100
        elif args.ds_name == 'inat1000':
            args.num_classes = 1000
            n_allow = 1000
        elif args.ds_name == 'inat2017_1000':
            args.num_classes = 1000
            n_allow = 1000
            inat_year = '2017'
            inat_lmdb_dir = 'inat2017_data/lmdb'
        args.input_size = 224

        args.do_mean_std_gpu_process = True
        args.input_type = 'uint8'
        args.mean = inat.image_mean
        args.std = inat.image_std

        get_data = inat.get_inat_augmented_data
        if is_training:
            ds_train = get_data('train',
                                args,
                                lmdb_dir=inat_lmdb_dir,
                                year=inat_year,
                                do_multiprocess=True,
                                do_validation=args.do_validation,
                                is_train=True,
                                shuffle=True,
                                n_allow=n_allow)
            if args.child_train_from_scratch and args.job_type == 'remote_child':
                max_epoch = 100
            else:
                max_epoch = args.max_train_model_epoch
            args.max_epoch = (max_epoch + args.nr_gpu - 1) // args.nr_gpu
            args.steps_per_epoch = ds_train.size()
            args = imagenet.training_params_update(args)
            args.gradprocs = [GlobalNormClip(5)]

        if args.do_remote_child_inf_runner or not is_training:
            ds_val = get_data('val',
                              args,
                              lmdb_dir=inat_lmdb_dir,
                              year=inat_year,
                              do_multiprocess=True,
                              do_validation=args.do_validation,
                              is_train=False,
                              shuffle=True,
                              n_allow=n_allow)

    elif args.ds_name == 'ptb':
        ptb_data_dir = os.path.join(args.data_dir, 'ptb_data')
        args.input_type = 'int32'
        args.search_cell_based = False
        # force single gpu for now.
        args.nr_gpu = 1
        args, local_args = ptb.training_params_update(args)

        # evaluation/testing batch size change.
        if not is_training:
            # same batch size whether validating or testing
            args.batch_size_per_gpu = 64
        # update global batch size.
        args.batch_size = args.batch_size_per_gpu * args.nr_gpu
        args.init_lr = args.init_lr_per_sample * args.batch_size

        if is_training:
            var_size = not args.model_rnn_has_static_len
            ds_train = ptb.PennTreeBankDataFlow('train',
                                                ptb_data_dir,
                                                args.batch_size,
                                                args.model_rnn_max_len,
                                                var_size=var_size)
            args.steps_per_epoch = ds_train.size()
            args.model_rnn_vocab_size = ds_train.vocab_size
            if args.child_train_from_scratch and args.job_type == 'remote_child':
                max_epoch = 100
            else:
                max_epoch = args.max_train_model_epoch
            args.max_epoch = (max_epoch + args.nr_gpu - 1) // args.nr_gpu
            args.gradprocs = [GlobalNormClip(local_args.grad_clip)]

            model = model_cls(args)
            # Compute some callbacks for training.
            # We need to construct the model now, as some ops require
            # the graph to be built (shifting and resetting states).
            ptb.ptb_training_cbs(model, args, ptb_data_dir, train_cbs)
            has_cbs_init = True

        if args.do_remote_child_inf_runner or not is_training:
            ds_val = ptb.PennTreeBankDataFlow(
                'valid' if args.do_validation else 'test',
                ptb_data_dir,
                args.batch_size,
                args.model_rnn_max_len,
                var_size=False)
            args.model_rnn_vocab_size = ds_val.vocab_size

            model = model_cls(args)
            # testing set up.
            # log loss of each sample
            output_names = [
                #model.inference_update_tensor(name_only=True) + ':0',
                'avg_batch_cost:0',
                'seq_len:0',
                'per_seq_sum_logloss:0',
            ]
            # The averaging is done automatically over the batches;
            # we still need to average over time.
            # We exponentiate the per-prediction logloss to get the perplexity score.
            output_funcs = [
                #None,
                lambda x: x * args.batch_size,
                lambda x: x * args.batch_size,
                lambda x: np.exp(x / args.model_rnn_max_len),
            ]

    else:
        raise Exception("Unknown dataset {}".format(args.ds_name))

    # computing epochs / steps / reading init learning rate.
    # Last section that may affect the args
    args.max_train_steps = None
    if is_training:
        args.candidate_gate_eps = 1.0 / args.steps_per_epoch / args.batch_size
        starting_epoch = 1
        if args.model_dir is not None:
            ckpt = tf.train.latest_checkpoint(args.model_dir)
            if ckpt:
                starting_epoch = ann_app_utils.grep_starting_epoch(
                    ckpt, args.steps_per_epoch)
        if lr_schedule:
            args.init_lr = ann_app_utils.grep_init_lr(starting_epoch,
                                                      lr_schedule)
        if args.debug_child_max_epoch:
            args.max_epoch = args.debug_child_max_epoch
        if args.debug_steps_per_epoch:
            args.steps_per_epoch = args.debug_steps_per_epoch
            starting_epoch = 1
        logger.info("Start at epoch {} with learning rate {}".format(
            starting_epoch, args.init_lr))
        args.max_train_steps = args.steps_per_epoch * args.max_epoch
        if model is not None:
            model.options.max_train_steps = args.max_train_steps

    if model is None:
        # if the dataset-specific setup did not init the model, init it here
        model = model_cls(args)
    # From now on args should be const.

    if is_training:
        if not has_cbs_init:
            if ds_val and args.debug_steps_per_epoch:
                ds_val = FixedSizeData(ds_val, args.debug_steps_per_epoch)
            train_cbs.extend(
                _inference_runner_train_cbs(args, ds_val, insrc_val, val_cbs))
        return (model, args, starting_epoch, lr_schedule, ds_train,
                insrc_train, train_cbs)

    else:
        if output_names is None:
            output_names = _inference_output_names(args)
            output_funcs = [None] * len(output_names)
        return (model, args, ds_val, insrc_val, output_names, output_funcs)
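Two recurring pieces of arithmetic in Example #7 deserve a note: (max_epoch + args.nr_gpu - 1) // args.nr_gpu is an integer ceiling division that shrinks the epoch budget when training is spread over several GPUs, and lr_schedule is a list of (epoch, learning_rate) breakpoints. A small worked sketch with illustrative values (not taken from the source):

# Illustrative values only: 100 nominal epochs spread over 3 GPUs.
max_epoch, nr_gpu, lr = 100, 3, 0.1
scaled = (max_epoch + nr_gpu - 1) // nr_gpu   # ceil(100 / 3) = 34
# Piecewise-constant schedule in the same shape as Example #7:
lr_schedule = [(1, lr), (scaled // 2, lr * 1e-1),
               (scaled * 3 // 4, lr * 1e-2),
               (scaled * 7 // 8, lr * 1e-3)]
# -> [(1, 0.1), (17, 0.01), (25, 0.001), (29, 0.0001)] up to float rounding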
Example #8
 def get_gradient_processor(self):
     return [GlobalNormClip(400)]
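For reference, GlobalNormClip(400) clips all gradients jointly so that their global norm does not exceed 400. A rough raw-TF1 equivalent of optimizer.apply_grad_processors(opt, [GlobalNormClip(400)]) is sketched below; loss and opt are assumed placeholders, and this is an illustration rather than tensorpack's actual implementation:

# Sketch only; assumes a scalar `loss` tensor and an existing `opt` optimizer.
grads_and_vars = opt.compute_gradients(loss)
grads, tvars = zip(*grads_and_vars)
# In practice, filter out None gradients before clipping.
clipped, _ = tf.clip_by_global_norm(list(grads), clip_norm=400.0)
train_op = opt.apply_gradients(zip(clipped, tvars))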