Example No. 1
import os
import pathlib

from mlsteam import stparams


def main():
    # train_run() is defined later on this page (Example No. 7).
    repeat = stparams.get_value('repeat', None)
    output_dir = stparams.get_value(
        'output_dir',
        stparams.get_value('backup',
                           '/mlsteam/data/yolo/model_weights/trained'))
    pretrained_weights = stparams.get_value('weights_file', None)

    if not os.path.exists(output_dir):
        pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    if repeat:
        repeat = int(repeat)
        for i in range(1, repeat + 1):
            if i == 1:
                pretrained = pretrained_weights
            else:
                pretrained = '%s/mls_final_run_%d.weights' % (output_dir,
                                                              i - 1)

            trained = '%s/mls_final_run_%d.weights' % (output_dir, i)
            train_run(pretrained, output_dir, trained)
    else:
        train_run(pretrained_weights, output_dir)
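Every example on this page reads run parameters through mlsteam's stparams.get_value(name, default), falling back to the default when the parameter is not set. The snippet below is only an illustrative stand-in for that behaviour, not the mlsteam implementation; it fakes the parameter store with environment variables so the lookup-with-default pattern can be exercised locally.

import os

def get_value(name, default=None):
    # Hypothetical stand-in for mlsteam.stparams.get_value: look the parameter
    # up (here in environment variables) and fall back to the supplied default
    # when it is not set.
    raw = os.environ.get('MLSTEAM_PARAM_' + name.upper())
    return raw if raw is not None else default

# Mirrors the usage above, e.g.:
# output_dir = get_value('output_dir', get_value('backup', '/tmp/trained'))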
Example No. 2
    def _experiment_fn(run_config, hparams):
        """Returns an Experiment."""
        # Create estimator.
        train_input_fn = functools.partial(
            input_fn,
            data_dir,
            subset='train',
            num_shards=num_gpus,
            batch_size=hparams.train_batch_size,
            use_distortion_for_training=use_distortion_for_training)

        eval_input_fn = functools.partial(input_fn,
                                          data_dir,
                                          subset='eval',
                                          batch_size=hparams.eval_batch_size,
                                          num_shards=num_gpus)

        num_eval_examples = cifar10.Cifar10DataSet.num_examples_per_epoch(
            'eval')
        if num_eval_examples % hparams.eval_batch_size != 0:
            raise ValueError(
                'validation set size must be multiple of eval_batch_size')

        train_steps = stparams.get_value('train_steps', hparams.train_steps)
        eval_steps = num_eval_examples // hparams.eval_batch_size

        classifier = tf.estimator.Estimator(model_fn=get_model_fn(
            num_gpus, variable_strategy, run_config.num_worker_replicas or 1),
                                            config=run_config,
                                            params=hparams)

        # Create experiment.
        return tf.contrib.learn.Experiment(classifier,
                                           train_input_fn=train_input_fn,
                                           eval_input_fn=eval_input_fn,
                                           train_steps=train_steps,
                                           eval_steps=eval_steps)
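The excerpt ends at the Experiment construction. In the TensorFlow 1.x contrib workflow, an experiment function like this is typically handed to learn_runner, which calls it with a RunConfig and HParams and then drives the train/evaluate schedule. Below is a minimal sketch of such a driver under that assumption; the actual wiring in the surrounding script is not shown in the excerpt, and run_experiment, model_dir and hparams_dict are illustrative names.

import tensorflow as tf

def run_experiment(experiment_fn, model_dir, hparams_dict):
    # learn_runner invokes experiment_fn(run_config, hparams) and runs the
    # returned Experiment (TensorFlow 1.x contrib API).
    run_config = tf.contrib.learn.RunConfig(model_dir=model_dir)
    tf.contrib.learn.learn_runner.run(
        experiment_fn,
        run_config=run_config,
        hparams=tf.contrib.training.HParams(**hparams_dict))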
Example No. 3
            trainer.test()


if __name__ == "__main__":
    parser = ArgumentParser()

    # PROGRAM level args
    parser.add_argument("--data_dir", type=str, default="/mlsteam/data/cifar10")
    parser.add_argument("--download_weights", type=int, default=0, choices=[0, 1])
    parser.add_argument("--test_phase", type=int, default=0, choices=[0, 1])
    parser.add_argument("--dev", type=int, default=0, choices=[0, 1])
    parser.add_argument(
        "--logger", type=str, default="tensorboard", choices=["tensorboard", "wandb"]
    )

    # TRAINER args
    parser.add_argument("--classifier", type=str, default=stparams.get_value("network", "mobilenet_v2"))
    parser.add_argument("--pretrained", type=int, default=0, choices=[0, 1])

    parser.add_argument("--precision", type=int, default=32, choices=[16, 32])
    parser.add_argument("--batch_size", type=int, default=128)
    parser.add_argument("--max_epochs", type=int, default=stparams.get_value("num_epochs", 100))
    parser.add_argument("--num_workers", type=int, default=8)
    parser.add_argument("--gpu_id", type=str, default="0")

    parser.add_argument("--learning_rate", type=float, default=1e-2)
    parser.add_argument("--weight_decay", type=float, default=1e-2)

    args = parser.parse_args()
    main(args)
Example No. 4
from keras import optimizers
from keras.models import Sequential
from keras.preprocessing.image import ImageDataGenerator
import numpy as np
# import mlsteam function
from mlsteam import stparams

# step 1: load data

img_width = 150
img_height = 150

###### Params code ######
train_data_dir = '/mlsteam/input/train'
valid_data_dir = '/mlsteam/input/validation'
batch_size = stparams.get_value("batch_size", 128)
validation_batch_size = stparams.get_value("validation_batch_size", 128)
num_epochs = stparams.get_value("num_epochs", 30)
########

datagen = ImageDataGenerator(rescale = 1./255)

train_generator = datagen.flow_from_directory(
    directory=train_data_dir,
    target_size=(img_width, img_height),
    classes=['dogs', 'cats'],
    class_mode='binary',
    batch_size=batch_size)

validation_generator = datagen.flow_from_directory(
    directory=valid_data_dir,
    target_size=(img_width, img_height),
    classes=['dogs', 'cats'],
    class_mode='binary',
    batch_size=validation_batch_size)
Example No. 5
    def _resnet_model_fn(features, labels, mode, params):
        """Resnet model body.

    Support single host, one or more GPU training. Parameter distribution can
    be either one of the following scheme.
    1. CPU is the parameter server and manages gradient updates.
    2. Parameters are distributed evenly across all GPUs, and the first GPU
       manages gradient updates.

    Args:
      features: a list of tensors, one for each tower
      labels: a list of tensors, one for each tower
      mode: ModeKeys.TRAIN or EVAL
      params: Hyperparameters suitable for tuning
    Returns:
      A EstimatorSpec object.
    """
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        weight_decay = params.weight_decay
        momentum = params.momentum

        tower_features = features
        tower_labels = labels
        tower_losses = []
        tower_gradvars = []
        tower_preds = []

        # channels first (NCHW) is normally optimal on GPU and channels last (NHWC)
        # on CPU. The exception is Intel MKL on CPU which is optimal with
        # channels_last.
        data_format = params.data_format
        if not data_format:
            if num_gpus == 0:
                data_format = 'channels_last'
            else:
                data_format = 'channels_first'

        if num_gpus == 0:
            num_devices = 1
            device_type = 'cpu'
        else:
            num_devices = num_gpus
            device_type = 'gpu'

        for i in range(num_devices):
            worker_device = '/{}:{}'.format(device_type, i)
            if variable_strategy == 'CPU':
                device_setter = cifar10_utils.local_device_setter(
                    worker_device=worker_device)
            elif variable_strategy == 'GPU':
                device_setter = cifar10_utils.local_device_setter(
                    ps_device_type='gpu',
                    worker_device=worker_device,
                    ps_strategy=tf.contrib.training.
                    GreedyLoadBalancingStrategy(
                        num_gpus, tf.contrib.training.byte_size_load_fn))
            with tf.variable_scope('resnet', reuse=bool(i != 0)):
                with tf.name_scope('tower_%d' % i) as name_scope:
                    with tf.device(device_setter):
                        loss, gradvars, preds = _tower_fn(
                            is_training, weight_decay, tower_features[i],
                            tower_labels[i], data_format, params.num_layers,
                            params.batch_norm_decay, params.batch_norm_epsilon)
                        tower_losses.append(loss)
                        tower_gradvars.append(gradvars)
                        tower_preds.append(preds)
                        if i == 0:
                            # Only trigger batch_norm moving mean and variance update from
                            # the 1st tower. Ideally, we should grab the updates from all
                            # towers but these stats accumulate extremely fast so we can
                            # ignore the other stats from the other towers without
                            # significant detriment.
                            update_ops = tf.get_collection(
                                tf.GraphKeys.UPDATE_OPS, name_scope)

        # Now compute global loss and gradients.
        gradvars = []
        with tf.name_scope('gradient_averaging'):
            all_grads = {}
            for grad, var in itertools.chain(*tower_gradvars):
                if grad is not None:
                    all_grads.setdefault(var, []).append(grad)
            for var, grads in six.iteritems(all_grads):
                # Average gradients on the same device as the variables
                # to which they apply.
                with tf.device(var.device):
                    if len(grads) == 1:
                        avg_grad = grads[0]
                    else:
                        avg_grad = tf.multiply(tf.add_n(grads),
                                               1. / len(grads))
                gradvars.append((avg_grad, var))

        # Device that runs the ops to apply global gradient updates.
        consolidation_device = '/gpu:0' if variable_strategy == 'GPU' else '/cpu:0'
        with tf.device(consolidation_device):
            # Suggested learning rate scheduling from
            # https://github.com/ppwwyyxx/tensorpack/blob/master/examples/ResNet/cifar10-resnet.py#L155
            num_batches_per_epoch = cifar10.Cifar10DataSet.num_examples_per_epoch(
                'train') // (stparams.get_value(
                    'train_bs', params.train_batch_size) * num_workers)
            boundaries = [
                num_batches_per_epoch * x
                for x in np.array([82, 123, 300], dtype=np.int64)
            ]
            staged_lr = [
                params.learning_rate * x for x in [1, 0.1, 0.01, 0.002]
            ]

            learning_rate = tf.train.piecewise_constant(
                tf.train.get_global_step(), boundaries, staged_lr)

            loss = tf.reduce_mean(tower_losses, name='loss')

            examples_sec_hook = cifar10_utils.ExamplesPerSecondHook(
                stparams.get_value('train_bs', params.train_batch_size),
                every_n_steps=10)

            tensors_to_log = {'learning_rate': learning_rate, 'loss': loss}

            log_hook = cifar10_utils.LogHook(tensors_to_log)

            logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                                      every_n_iter=100)

            train_hooks = [logging_hook, examples_sec_hook, log_hook]

            optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                                   momentum=momentum)

            if params.sync:
                optimizer = tf.train.SyncReplicasOptimizer(
                    optimizer, replicas_to_aggregate=num_workers)
                sync_replicas_hook = optimizer.make_session_run_hook(
                    params.is_chief)
                train_hooks.append(sync_replicas_hook)

            # Create single grouped train op
            train_op = [
                optimizer.apply_gradients(
                    gradvars, global_step=tf.train.get_global_step())
            ]
            train_op.extend(update_ops)
            train_op = tf.group(*train_op)

            predictions = {
                'classes':
                tf.concat([p['classes'] for p in tower_preds], axis=0),
                'probabilities':
                tf.concat([p['probabilities'] for p in tower_preds], axis=0)
            }
            stacked_labels = tf.concat(labels, axis=0)
            metrics = {
                'accuracy':
                tf.metrics.accuracy(stacked_labels, predictions['classes'])
            }

        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=predictions,
                                          loss=loss,
                                          train_op=train_op,
                                          training_hooks=train_hooks,
                                          eval_metric_ops=metrics)
Example No. 6
def write_model_cfg(output_cfg=DEFAULT_MODEL_CFG, **kwargs):
    if stparams.get_value('version') == 'tiny':
        model_file = MODEL_CFG_TINY_TEMPLATE
    else:
        model_file = MODEL_CFG_ORIG_TEMPLATE
    return write_cfg(model_file, output_cfg, **kwargs)
Example No. 7
def train_run(pretrained_weights, output_dir, output_weights=None):
    # read obj names get num class
    train_dir = stparams.get_value('train_dir',
                                   '/mlsteam/data/yolo/training_data/yolo')
    param_names = stparams.get_value('names', '/mlsteam/data/yolo/obj.names')
    image_dir, names_file = get_input_dirs(train_dir, param_names)
    max_batches = int(stparams.get_value('max_batches', 500))
    number_classes = num_classes(names_file)
    filters = int((number_classes + 5) * 3)

    num_epoch_save = '1,000' if max_batches < 10000 else '10,000'
    log_summary(f"Number of classes: {number_classes}")
    log_summary(f"Output weight policy: every {num_epoch_save} batches")
    log_summary("   if max_batches < 10,000, save weights every 1,000 batches")
    log_summary(
        "   if max_batches >= 10,000, save weights every 10,000 batches")

    # Prepare train_list.txt
    train_list = os.path.join(train_dir, TRAIN_LIST_FILENAME)
    ensure_img_list(train_list, image_dir,
                    stparams.get_value('image_exts', 'jpg;png').split(';'))

    # Prepare valid_list.txt if valid_dir present
    valid_dir = stparams.get_value('valid_dir', None)
    valid_list = os.path.join(train_dir, VALID_LIST_FILENAME)
    if not os.path.exists(valid_list):
        print("Validate list not exist, try to scan valid_dir {}".format(
            valid_dir))
        if valid_dir in ['', None]:
            print("Parameter valid_dir not specify! skip validation")
            valid_list = 'no_valid_list'
        else:
            image_dir, names_file = get_input_dirs(valid_dir, param_names)
            valid_list = ensure_img_list(valid_list, image_dir)

    cfg_dir = os.path.join(output_dir, 'cfg')
    if not os.path.exists(cfg_dir):
        os.makedirs(cfg_dir)

    model_cfg = write_model_cfg(
        output_cfg=os.path.join(cfg_dir, MODEL_CFG_NAME),
        batch=stparams.get_value('batch', 64),
        subdivisions=stparams.get_value('subdivisions', 64),
        learning_rate=stparams.get_value('learning_rate', 0.001),
        max_batches=max_batches,
        steps=stparams.get_value('steps', '400, 450').replace(
            ';', ','),  # comma (,) is a reserved character in mlsteam parameters
        scales=stparams.get_value('scales', '.1, .1').replace(
            ';', ','),  # comma (,) is a reserved character in mlsteam parameters
        num_classes=number_classes,
        filters=filters,  # num_mask *(num_class+5) = 3*(1+5)
    )
    data_cfg = os.path.join(train_dir, DATA_CFG_NAME)
    data_cfg = write_data_cfg(output_cfg=data_cfg,
                              num_classes=number_classes,
                              train_list=train_list,
                              valid_list=valid_list,
                              names=names_file,
                              backup=output_dir,
                              eval=stparams.get_value('eval', 'coco'))

    # prepare config for inference
    write_inf_cfg(output_cfg=os.path.join(cfg_dir, DATA_CFG_NAME),
                  num_classes=number_classes,
                  names=os.path.join('cfg', os.path.basename(names_file)))
    copyfile(names_file, os.path.join(cfg_dir, os.path.basename(names_file)))

    train(data_cfg, model_cfg, pretrained_weights=pretrained_weights)

    if output_weights:
        weights_path = '%s/%s_final.weights' % (
            output_dir, os.path.basename(model_cfg).split('.')[0])
        copyfile(weights_path, output_weights)