def main(argv):
    """Parse command-line flags and launch training through learn_runner.

    Flags whose destination starts with ``hp_`` are collected into a
    hyperparameter dict (with that prefix stripped). All remaining flags
    except ``job_dir`` are forwarded to ``experiment_fn_with_params`` as
    keyword arguments; ``job_dir`` becomes the output directory.

    Args:
        argv: Not read by this function; ``parse_args()`` is called without
            arguments, so argparse takes the flags from ``sys.argv``.
    """
    cli = argparse.ArgumentParser()
    # Cloud ML Engine requires a --job-dir argument: it says where checkpoints
    # should be saved. Additional user arguments go after an empty "--" arg:
    # gcloud ml-engine jobs submit training jobXXX --job-dir=... --ml-engine-args -- --user-args
    cli.add_argument('--job-dir', default="checkpoints", help='GCS or local path where to store training checkpoints')
    cli.add_argument('--data-dir', default="data", help='Where training data will be loaded and unzipped')

    # Notes from previous runs:
    # no batch norm: lr 0.002-0.0002-2000 is ok, over 10000 iterations (final accuracy 0.9937 loss 2.39 job156)
    # batch norm: lr 0.02-0.0001-600 conv 16-32-64 trains in 3000 iterations
    #   (final accuracy was recorded as "0.0.8849", which looks like a typo - TODO confirm; loss 1.466 job 159)
    hyperparameter_flags = (
        ('--hp-lr0', float, 0.02, 'Hyperparameter: initial (max) learning rate'),
        ('--hp-lr1', float, 0.0001, 'Hyperparameter: target (min) learning rate'),
        ('--hp-lr2', float, 600, 'Hyperparameter: learning rate decay speed in steps. Learning rate decays by exp(-1) every N steps.'),
        ('--hp-dropout', float, 0.3, 'Hyperparameter: dropout rate on dense layers.'),
        ('--hp-conv1', int, 6, 'Hyperparameter: depth of first convolutional layer.'),
        ('--hp-conv2', int, 12, 'Hyperparameter: depth of second convolutional layer.'),
        ('--hp-conv3', int, 24, 'Hyperparameter: depth of third convolutional layer.'),
        ('--hp-bnexp', float, 0.993, 'Hyperparameter: exponential decay for batch norm moving averages.'),
        ('--hp-iterations', int, 10000, 'Hyperparameter: number of training iterations.'),
    )
    for flag, flag_type, flag_default, flag_help in hyperparameter_flags:
        cli.add_argument(flag, default=flag_default, type=flag_type, help=flag_help)

    # Split the parsed flags into hyperparameters (prefix removed) and the rest.
    hparams = {}
    otherargs = {}
    for name, value in vars(cli.parse_args()).items():
        if name.startswith('hp_'):
            hparams[name[3:]] = value
        else:
            otherargs[name] = value

    logging.log(logging.INFO, "Hyperparameters:" + str(sorted(hparams.items())))

    output_dir = otherargs.pop('job_dir')

    # learn_runner wants an experiment function taking only the output
    # directory; the other command-line values are captured by this closure.
    def experiment_fn(output_dir):
        return experiment_fn_with_params(output_dir, hparams, **otherargs)

    # Compatibility warning: learn_runner is currently in contrib. It will move in TF 1.2
    tf.contrib.learn.learn_runner.run(experiment_fn, output_dir)
def log(level, message, *args):
  """Logs `message % args` to the summary writer and to the logging module.

  The TensorBoard event is only written when `level` passes the
  tensorboard_logging verbosity threshold. The call to the logging module
  happens regardless of that check, because the two verbosities are
  independent.

  Args:
    level: The verbosity level to use. Must be one of
      tensorboard_logging.{DEBUG, INFO, WARN, ERROR, FATAL}.
    message: The message template to use.
    *args: Arguments to interpolate to the message template, if any.

  Raises:
    ValueError: If `level` is not a valid logging level.
    RuntimeError: If the `SummaryWriter` to use has not been set.
  """
  writer_never_set = _summary_writer is _sentinel_summary_writer
  if writer_never_set:
    raise RuntimeError('Must call set_summary_writer before doing any '
                       'logging from tensorboard_logging')
  _check_verbosity(level)

  severity = _LEVEL_PROTO_MAP[level]
  threshold = _LEVEL_PROTO_MAP[_verbosity]
  if severity >= threshold:
    text = message % args
    record = event_pb2.LogMessage(level=severity, message=text)
    event = event_pb2.Event(wall_time=time.time(), log_message=record)

    # The writer may be falsy (but not the sentinel); then nothing is written.
    if _summary_writer:
      _summary_writer.add_event(event)

  # Always forward to the platform logger, using the un-interpolated template.
  logging.log(_PLATFORM_LOGGING_LEVEL_MAP[level], message, *args)
示例#3
0
def log(level, message, *args):
    """Conditionally write `message % args` as a TensorBoard log event.

    tensorboard_logging verbosity and logging verbosity are separate: the
    message is always handed to the logging module, whether or not it passes
    the tensorboard_logging verbosity check.

    Args:
        level: The verbosity level to use. Must be one of
            tensorboard_logging.{DEBUG, INFO, WARN, ERROR, FATAL}.
        message: The message template to use.
        *args: Arguments to interpolate to the message template, if any.

    Raises:
        ValueError: If `level` is not a valid logging level.
        RuntimeError: If the `SummaryWriter` to use has not been set.
    """
    if _summary_writer is _sentinel_summary_writer:
        raise RuntimeError('Must call set_summary_writer before doing any '
                           'logging from tensorboard_logging')
    _check_verbosity(level)

    proto_level = _LEVEL_PROTO_MAP[level]
    passes_verbosity = proto_level >= _LEVEL_PROTO_MAP[_verbosity]
    if passes_verbosity:
        # The event is built even when no writer is attached, so a bad
        # template/args combination still raises here.
        event = event_pb2.Event(
            log_message=event_pb2.LogMessage(level=proto_level,
                                             message=message % args),
            wall_time=time.time())
        if _summary_writer:
            _summary_writer.add_event(event)

    platform_level = _PLATFORM_LOGGING_LEVEL_MAP[level]
    logging.log(platform_level, message, *args)
def run_data_generation(data, output_dir, record_batch_size, shuffle_buf, tiles_per_gt_roi, rnd_distmax, rnd_orientation, is_eval):
    """Generate image tiles from the files under `data` and store them as TFRecords.

    Each batch pulled from the dataset is written to its own file in
    `output_dir`, named "<record_batch_size>tiles<batch number>_<basename>.tfrecord",
    where the basename comes from the first file name of the batch. Each tile
    is stored as a JPEG together with its non-empty target ROIs.

    Args:
        data: Folder passed to `load_file_list` to find image/ROI file pairs.
        output_dir: Folder receiving the .tfrecord files.
        record_batch_size: Batch size handed to the dataset builders; also
            used in the output file names.
        shuffle_buf: Shuffle buffer size (training dataset only).
        tiles_per_gt_roi: Forwarded to the training dataset builder only.
        rnd_distmax: Forwarded to the training dataset builder only.
        rnd_orientation: Forwarded to the training dataset builder only.
        is_eval: Selects the eval dataset builder instead of the training one.

    Returns:
        None. Returns early, after logging, when no image files are found.
    """
    img_filelist, roi_filelist = load_file_list(data)

    # sanity check: nothing to do without input files
    if len(img_filelist) == 0:
        logging.log(logging.INFO, "No image/json pairs found in folder {}. Skipping.".format(data))
        return
    logging.log(logging.INFO, "Generating {} data.".format("eval" if is_eval else "training"))

    # dummy args only used in YOLO box assignments, which will be discarded anyway
    # TODO: refactor these outside of the generate_slice function
    yolo_cfg = YOLOConfig(grid_nn = 16, cell_n = 2, cell_swarm = True, cell_grow = 1.0)

    if is_eval:
        dataset = init_eval_dataset_from_images(img_filelist, roi_filelist, record_batch_size, yolo_cfg)
    else:
        dataset = init_train_dataset_from_images(img_filelist, roi_filelist, record_batch_size, shuffle_buf, yolo_cfg,
                                                 False, rnd_orientation, tiles_per_gt_roi, rnd_distmax)  # False = no rnd hue

    dataset = dataset.repeat(1)

    # --- TF graph: fetch one batch and JPEG-encode every tile in it ---
    features, labels = dataset.make_one_shot_iterator().get_next()

    def encode_tile(image_bytes):
        return tf.image.encode_jpeg(image_bytes, optimize_size=True, chroma_downsampling=False)

    encoded_jpegs = tf.map_fn(encode_tile, features['image'], dtype=tf.string)
    # labels['target_rois'] has shape [n_tiles, MAX_TARGET_ROIS_PER_TILE, 4]
    fetches = [encoded_jpegs, labels['target_rois'], labels['fnames']]
    # --- end of TF graph ---

    batch_no = 0
    with tf.Session() as sess:
        while True:
            try:
                jpegs, rois_batch, fnames = sess.run(fetches)
            except (tf.errors.OutOfRangeError, tf.errors.NotFoundError):
                # Either error ends generation without further reporting.
                break
            batch_no += 1

            # The output file is named after the first file name in the batch.
            first_fname = fnames[0]
            stem, _ = os.path.splitext(os.path.basename(first_fname.decode("utf-8")))
            record_path = os.path.join(output_dir, "{}tiles{:06}_{}.tfrecord".format(record_batch_size, batch_no, stem))

            with tf.python_io.TFRecordWriter(record_path) as writer:
                for jpeg, tile_rois in zip(jpegs, rois_batch):
                    # roi format is x1y1x2y2: keep only boxes with non-zero width and height
                    kept_rois = [roi for roi in tile_rois
                                 if abs(roi[2]-roi[0]) > 0 and abs(roi[3]-roi[1]) > 0]
                    flat_rois = np.reshape(np.array(kept_rois, np.float32), [-1]).tolist()
                    write_tfrecord_features(writer, jpeg, flat_rois, first_fname)  # write TFRecord
示例#5
0
def batch_filter_by_bool(rois, mask, max_n):
    """Run `filter_by_bool_remove` on every (rois, mask) pair of a batch.

    Args:
        rois: Batched ROI tensor, mapped over its first dimension.
        mask: Batched mask tensor, mapped in lockstep with `rois`.
        max_n: Passed to `filter_by_bool_remove` as `max_n`; also the size of
            the second dimension of the returned ROI tensor.

    Returns:
        A pair (rois, overflow). `rois` is the mapped result reshaped to
        [-1, max_n, 4]. `overflow` is, per batch entry, the number of
        non-zero mask entries above `max_n`, floored at 0.
    """
    nonzero_per_entry = tf.count_nonzero(mask, axis=1)
    overflow = tf.maximum(nonzero_per_entry - max_n, 0)

    def filter_one(rois__mask):
        one_rois, one_mask = rois__mask
        return filter_by_bool_remove(one_rois, one_mask, max_n=max_n)

    filtered = tf.map_fn(filter_one, (rois, mask), dtype=tf.float32)  # shape[batch,max_n, 4]
    # Tensorflow needs a hint about the shape
    filtered = tf.reshape(filtered, [-1, max_n, 4])
    logging.log(logging.INFO, filtered)
    return filtered, overflow
def datagen_main(argv):
    """Command-line entry point for generating training and eval tiles.

    Parses the flags, checks that both output folders already exist, then
    runs `run_data_generation` twice: once for the training data and once for
    the "<data>_eval" folder, writing into "<output-dir>_eval".

    Args:
        argv: Not read by this function; ``parse_args()`` is called without
            arguments, so argparse takes the flags from ``sys.argv``.

    Raises:
        SystemExit: With code -1 when either output folder does not exist.
    """
    # Local import so this function does not rely on the file's import block.
    import sys

    parser = argparse.ArgumentParser()

    # Only the exact string 'True' parses as True; anything else is False.
    def str2bool(v): return v=='True'

    parser.add_argument('--job-dir', default="checkpoints", help='Not used in datagen mode but required by ML engine')
    parser.add_argument('--data', default="sample_data/USGS_public_domain_airports", help='Path to data file (can be on Google cloud storage gs://...)')
    parser.add_argument('--output-dir', default="tilecache", help='Folder where generated training and eval tiles will be stored (can be on Google cloud storage gs://...)')
    parser.add_argument('--record-batch-size', default=100, type=int, help='How many tiles per TFRecord file in the output')
    parser.add_argument('--shuffle-buf', default=10000, type=int, help='Size of the shuffle buffer for shuffling tiles. 0 to disable shuffling.')
    parser.add_argument('--hp-data-tiles-per-gt-roi', default=100, type=int, help='Data generation hyperparameter: number of training tiles generated around each ground truth ROI')
    parser.add_argument('--hp-data-rnd-distmax', default=2.0, type=float, help='Data generation hyperparameter: training tiles selection max random distance from ground truth ROI (always 2.0 for eval tiles)')
    parser.add_argument('--hp-data-rnd-orientation', default=True, type=str2bool, help='Data generation hyperparameter: data augmentation by rotating and flipping tiles.')
    args = parser.parse_args()

    # Eval inputs and outputs live in sibling folders with an "_eval" suffix.
    data_eval = args.data + "_eval"
    output_dir_eval = args.output_dir + "_eval"
    if not gcsfile.file_exists(args.output_dir) or not gcsfile.file_exists(output_dir_eval):
        logging.log(logging.ERROR, "Error: both the output path \"{}\" and the eval "
                                   "output path \"{}\" must exist. Please create them "
                                   "before starting data generation.".format(args.output_dir, output_dir_eval))
        # sys.exit instead of the `exit` builtin: the latter is injected by the
        # `site` module for interactive use and is not guaranteed to exist.
        sys.exit(-1)

    logging.log(logging.INFO, "Training data path: " + args.data)
    logging.log(logging.INFO, "Eval data path: " + data_eval)
    logging.log(logging.INFO, "Command-line parameters only affect training data generation. "
                              "Eval data is generated with hard-coded parameters so as to offer "
                              "a consistent evaluation benchmark.")

    rnd_distmax = args.hp_data_rnd_distmax
    tiles_per_gt_roi = args.hp_data_tiles_per_gt_roi
    rnd_orientation = args.hp_data_rnd_orientation

    # training and eval data generation
    run_data_generation(args.data, args.output_dir, args.record_batch_size, args.shuffle_buf, tiles_per_gt_roi, rnd_distmax, rnd_orientation, is_eval=False)
    run_data_generation(data_eval, output_dir_eval, args.record_batch_size, args.shuffle_buf, tiles_per_gt_roi, rnd_distmax, rnd_orientation, is_eval=True)
示例#7
0
def load_data(path):
    """Load the gzipped, pickled planesnet file and split it into test/train sets.

    The pickle must hold a dict with 'data' (images) and 'labels'. Images are
    reshaped to [batch, y, x, rgb] with 20x20 pixels and left in their stored
    integer format (no float conversion). The data is shuffled with a fixed
    seed (this reseeds NumPy's global random state) and the first part is
    used as the test set.

    Args:
        path: File to read through `gcsfile.FileIO` (loads from GCS if it is
            a gs:// path, locally otherwise).

    Returns:
        Tuple (test_images, test_labels, train_images, train_labels).
    """
    with gcsfile.FileIO(path, 'rb') as raw_file:
        with gzip.GzipFile(fileobj=raw_file, mode='rb') as unzipped:
            # NOTE(review): pickle.load can execute arbitrary code; only use
            # this on data files from a trusted source.
            planesnet = pickle.load(unzipped)

    # unpack dictionary ('locations' and 'scene_ids' are not used)
    images = planesnet['data']
    labels = np.array(planesnet['labels'])
    assert len(images) == len(labels)
    logging.log(logging.INFO, "Loaded data file " + path)

    # Images come as one flat array of ints: color planes first and, inside
    # each plane, first row first. Reshaping to [batch, 3, 20, 20] indexes as
    # [batch, rgb, y, x]; the transpose then yields [batch, y, x, rgb].
    images = np.reshape(images, (-1, 3, 20, 20), order="C")
    images = np.transpose(images, (0, 2, 3, 1))

    # shuffle the data, always with the same permutation
    np.random.seed(0)
    n = len(images)
    shuffled_order = np.random.permutation(n)
    images = images[shuffled_order]
    labels = labels[shuffled_order]

    # partition: one tenth of the data for testing, clamped to [5000, 10000]
    test_size = min(max(n // 10, 5000), 10000)
    return (images[:test_size], labels[:test_size],
            images[test_size:], labels[test_size:])
def main(argv):
    """Parse command-line flags, then train and evaluate the MNIST estimator.

    Flags whose destination starts with ``hp_`` form the hyperparameter dict
    handed to the estimator as ``params`` (prefix stripped). ``--job-dir`` is
    the model directory and ``--data-dir`` is passed to ``load_mnist_data``.

    Args:
        argv: Not read by this function; ``parse_args()`` is called without
            arguments, so argparse takes the flags from ``sys.argv``.
    """
    parser = argparse.ArgumentParser()
    # Cloud ML Engine requires a --job-dir argument: it says where checkpoints
    # should be saved. Additional user arguments go after an empty "--" arg:
    # gcloud ml-engine jobs submit training jobXXX --job-dir=... --ml-engine-args -- --user-args
    parser.add_argument('--job-dir', default="checkpoints", help='GCS or local path where to store training checkpoints')
    parser.add_argument('--data-dir', default="data", help='Where training data will be loaded and unzipped')

    # Notes from previous runs:
    # no batch norm: lr 0.002-0.0002-2000 is ok, over 10000 iterations (final accuracy 0.9937 loss 2.39 job156)
    # batch norm: lr 0.02-0.0001-600 conv 16-32-64 trains in 3000 iteration (final accuracy 0.9949 loss 1.466 job 159)
    hyperparameter_flags = (
        ('--hp-lr0', float, 0.02, 'Hyperparameter: initial (max) learning rate'),
        ('--hp-lr1', float, 0.0001, 'Hyperparameter: target (min) learning rate'),
        ('--hp-lr2', float, 600, 'Hyperparameter: learning rate decay speed in steps. Learning rate decays by exp(-1) every N steps.'),
        ('--hp-dropout', float, 0.3, 'Hyperparameter: dropout rate on dense layers.'),
        ('--hp-conv1', int, 6, 'Hyperparameter: depth of first convolutional layer.'),
        ('--hp-conv2', int, 12, 'Hyperparameter: depth of second convolutional layer.'),
        ('--hp-conv3', int, 24, 'Hyperparameter: depth of third convolutional layer.'),
        ('--hp-bnexp', float, 0.993, 'Hyperparameter: exponential decay for batch norm moving averages.'),
        ('--hp-iterations', int, 3000, 'Hyperparameter: number of training iterations.'),
    )
    for flag, flag_type, flag_default, flag_help in hyperparameter_flags:
        parser.add_argument(flag, default=flag_default, type=flag_type, help=flag_help)

    # Split the parsed flags into hyperparameters (prefix removed) and the rest.
    hparams = {}
    otherargs = {}
    for name, value in vars(parser.parse_args()).items():
        if name.startswith('hp_'):
            hparams[name[3:]] = value
        else:
            otherargs[name] = value

    logging.log(logging.INFO, "Hyperparameters:" + str(sorted(hparams.items())))

    data_dir = otherargs['data_dir']
    job_dir = otherargs.pop('job_dir')

    train_images_file, train_labels_file, test_images_file, test_labels_file = load_mnist_data(data_dir)

    def train_input_fn():
        return train_data_input_fn(train_images_file, train_labels_file)

    def eval_input_fn():
        return eval_data_input_fn(test_images_file, test_labels_file)

    # Summaries every 10 steps, checkpoints every 200 steps.
    training_config = tf.estimator.RunConfig(model_dir=job_dir, save_summary_steps=10, save_checkpoints_steps=200)
    estimator = tf.estimator.Estimator(model_fn=conv_model, model_dir=job_dir, params=hparams, config=training_config)
    train_spec = tf.estimator.TrainSpec(train_input_fn, max_steps=hparams['iterations'])
    export_latest = tf.estimator.LatestExporter("mnist-model",serving_input_receiver_fn=serving_input_fn)
    eval_spec = tf.estimator.EvalSpec(eval_input_fn, steps=1, exporters=export_latest, throttle_secs=60)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
示例#9
0
def main(argv):
    """Entry point: read flags, build the MNIST estimator, train and evaluate.

    Every ``--hp-*`` flag ends up (without its ``hp_`` prefix) in the
    ``params`` dict of the estimator. ``--job-dir`` is the model directory;
    ``--data-dir`` is handed to ``load_mnist_data``.

    Args:
        argv: Not read by this function; ``parse_args()`` is called without
            arguments, so argparse takes the flags from ``sys.argv``.
    """
    parser = argparse.ArgumentParser()

    def add_hp(name, default, kind, description):
        """Register one typed hyperparameter flag named --hp-<name>."""
        parser.add_argument('--hp-' + name, default=default, type=kind, help=description)

    # Cloud ML Engine requires a --job-dir argument: it says where checkpoints
    # should be saved. Additional user arguments go after an empty "--" arg:
    # gcloud ml-engine jobs submit training jobXXX --job-dir=... --ml-engine-args -- --user-args
    parser.add_argument('--job-dir', default="checkpoints", help='GCS or local path where to store training checkpoints')
    parser.add_argument('--data-dir', default="data", help='Where training data will be loaded and unzipped')

    # Notes from previous runs:
    # no batch norm: lr 0.002-0.0002-2000 is ok, over 10000 iterations (final accuracy 0.9937 loss 2.39 job156)
    # batch norm: lr 0.02-0.0001-600 conv 16-32-64 trains in 3000 iteration (final accuracy 0.9949 loss 1.466 job 159)
    add_hp('lr0', 0.02, float, 'Hyperparameter: initial (max) learning rate')
    add_hp('lr1', 0.0001, float, 'Hyperparameter: target (min) learning rate')
    add_hp('lr2', 600, float, 'Hyperparameter: learning rate decay speed in steps. Learning rate decays by exp(-1) every N steps.')
    add_hp('dropout', 0.3, float, 'Hyperparameter: dropout rate on dense layers.')
    add_hp('conv1', 6, int, 'Hyperparameter: depth of first convolutional layer.')
    add_hp('conv2', 12, int, 'Hyperparameter: depth of second convolutional layer.')
    add_hp('conv3', 24, int, 'Hyperparameter: depth of third convolutional layer.')
    add_hp('bnexp', 0.993, float, 'Hyperparameter: exponential decay for batch norm moving averages.')
    add_hp('iterations', 3000, int, 'Hyperparameter: number of training iterations.')

    parsed = vars(parser.parse_args())
    hp_items = [(key, val) for key, val in parsed.items() if key.startswith('hp_')]
    hparams = {key[3:]: val for key, val in hp_items}
    otherargs = {key: val for key, val in parsed.items() if not key.startswith('hp_')}

    logging.log(logging.INFO, "Hyperparameters:" + str(sorted(hparams.items())))

    data_dir = otherargs['data_dir']
    job_dir = otherargs.pop('job_dir')

    mnist_files = load_mnist_data(data_dir)
    train_images_file, train_labels_file, test_images_file, test_labels_file = mnist_files

    def train_input_fn():
        return train_data_input_fn(train_images_file, train_labels_file)

    def eval_input_fn():
        return eval_data_input_fn(test_images_file, test_labels_file)

    training_config = tf.estimator.RunConfig(
        model_dir=job_dir,
        save_summary_steps=10,
        save_checkpoints_steps=200)
    estimator = tf.estimator.Estimator(
        model_fn=conv_model,
        model_dir=job_dir,
        params=hparams,
        config=training_config)
    train_spec = tf.estimator.TrainSpec(train_input_fn, max_steps=hparams['iterations'])
    # The exporter is attached to the eval spec and uses serving_input_fn.
    export_latest = tf.estimator.LatestExporter(
        "mnist-model",
        serving_input_receiver_fn=serving_input_fn)
    eval_spec = tf.estimator.EvalSpec(
        eval_input_fn,
        steps=1,
        exporters=export_latest,
        throttle_secs=60)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
示例#10
0
def start_training(output_dir, hparams, data, tiledata, **kwargs):
    """Train, evaluate and export the model with a TPUEstimator.

    Exactly one of `data` (folder of full aerial imagery) and `tiledata`
    (folder of TFRecord tiles) must be a non-empty string. Eval data is read
    from the same location with an "_eval" suffix. If neither or both are
    supplied, an error is logged and the function returns without training.

    Args:
        output_dir: Model directory; the SavedModel is exported to its
            "planespotting" sub-folder.
        hparams: Hyperparameter dict. Also passed to the estimator as `params`.
        data: Path to the aerial imagery folder, or "".
        tiledata: Path to the TFRecord tiles folder, or "".
        **kwargs: 'tpu', 'tpu_zone' and 'gcp_project' are read from here when
            hparams['use_tpu'] is truthy.
    """
    # YOLO configuration for ROI assignments; eval uses a cell_grow of 1.0
    grid_nn, cell_n, cell_swarm = hparams["grid_nn"], hparams["cell_n"], hparams["cell_swarm"]
    yolo_cfg = datagen.YOLOConfig(grid_nn, cell_n, cell_swarm, hparams["cell_grow"])
    eval_yolo_cfg = datagen.YOLOConfig(grid_nn, cell_n, cell_swarm, 1.0)

    # data source selection: full aerial imagery or TFRecords containing individual 256x256 tiles
    from_tfrecords = tiledata != "" and data == ""
    from_imagery = data != "" and tiledata == ""
    if not (from_tfrecords or from_imagery):
        logging.log(
            logging.ERROR,
            "One and only one of parameters 'data' and 'tiledata' must be supplied."
        )
        return

    if from_tfrecords:  # training from tfrecords
        tfrec_filelist = gcsfile.get_matching_files(tiledata + "/*.tfrecord")

        def train_data_input_fn(params):
            return datagen.train_dataset_from_tfrecords(
                tfrec_filelist, params['batch_size'], hparams["shuffle_buf"],
                yolo_cfg, hparams["data_rnd_hue"],
                hparams["data_rnd_orientation"], hparams["data_cache_n_epochs"])

        tfrec_filelist_eval = gcsfile.get_matching_files(tiledata + "_eval" + "/*.tfrecord")

        def eval_data_input_fn(params):
            return datagen.eval_dataset_from_tfrecords(
                tfrec_filelist_eval, params['batch_size'], eval_yolo_cfg)
    else:  # training from aerial imagery directly
        img_filelist, roi_filelist = datagen.load_file_list(data)

        def train_data_input_fn(params):
            return datagen.train_dataset_from_images(
                img_filelist, roi_filelist, params['batch_size'],
                hparams["shuffle_buf"], yolo_cfg, hparams["data_rnd_hue"],
                hparams["data_rnd_orientation"], hparams["data_tiles_per_gt_roi"],
                hparams["data_rnd_distmax"], hparams["data_cache_n_epochs"])

        img_filelist_eval, roi_filelist_eval = datagen.load_file_list(data + "_eval")

        def eval_data_input_fn(params):
            return datagen.eval_dataset_from_images(
                img_filelist_eval, roi_filelist_eval, params['batch_size'],
                eval_yolo_cfg)

    # Estimator configuration.
    # History: an earlier, non-TPU variant of this function used
    # tf.estimator.RunConfig / tf.estimator.Estimator with TrainSpec, EvalSpec
    # (LatestExporter "planespotting", exports_to_keep=1, throttle_secs=60) and
    # tf.estimator.train_and_evaluate. In that variant start_delay_secs=1 was
    # confirmed not to work (plane533 for ex.). A MirroredStrategy for
    # multi-GPU machines was also experimented with.
    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False)
    tpu_config = tf.contrib.tpu.TPUConfig(hparams['tpu_iterations'], 8)  # 8 cores in a TPU board
    if hparams['use_tpu']:
        cluster = tf.contrib.cluster_resolver.TPUClusterResolver(
            kwargs['tpu'], kwargs['tpu_zone'], kwargs['gcp_project'])
    else:
        cluster = None
    training_config = tf.contrib.tpu.RunConfig(
        model_dir=output_dir,
        session_config=session_config,
        tpu_config=tpu_config,
        cluster=cluster)

    # export_to_tpu=False: we do not need the TPU graph in the exported model
    # since we will be serving it from CPUs/GPUs. Also, without
    # export_to_tpu=False, TPUEstimator.export_saved_model crashes (TF1.12 and earlier)
    estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=model.model_fn,
        model_dir=output_dir,
        params=hparams,
        train_batch_size=hparams['batch'],
        eval_batch_size=hparams['batch'],  # TPU constraint: batch sizes must be the same (?)
        config=training_config,
        use_tpu=hparams['use_tpu'],
        export_to_tpu=False)

    # Train in chunks of at most TPU_EVAL_EVERY_STEPS steps and evaluate after
    # each chunk (a single eval at the end when iterations <= 10000).
    TPU_EVAL_EVERY_STEPS = 10000
    total_steps = hparams["iterations"]
    n_rounds = int(math.ceil(total_steps * 1.0 / TPU_EVAL_EVERY_STEPS))
    for round_idx in range(n_rounds):
        steps_left = total_steps - TPU_EVAL_EVERY_STEPS * round_idx
        estimator.train(train_data_input_fn,
                        steps=min(TPU_EVAL_EVERY_STEPS, steps_left))
        estimator.evaluate(input_fn=eval_data_input_fn,
                           steps=hparams['eval_iterations'])

    export_dir = os.path.join(output_dir, "planespotting")
    estimator.export_savedmodel(export_dir, serving_input_fn)
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import tensorflow as tf
from tensorflow.python.platform import tf_logging as logging
from tensorflow.examples.tutorials.mnist import input_data as mnist_data
import argparse
import math
import sys
# Module-level side effect at import time: set TensorFlow logging to INFO
# and log the TensorFlow version in use.
logging.set_verbosity(logging.INFO)
logging.log(logging.INFO, "Tensorflow version " + tf.__version__)

#
# To run this: see README.md
#

# Called when the model is deployed for online predictions on Cloud ML Engine.
def serving_input_fn():
    """Build the ServingInputReceiver used when the model is exported.

    The receiver exposes a single float32 placeholder named 'image' with
    shape [None, 28, 28]. The same dict is used as both the model features
    and the receiver tensors, so no transformation is applied.

    Returns:
        A tf.estimator.export.ServingInputReceiver.
    """
    image_placeholder = tf.placeholder(tf.float32, [None, 28, 28])
    receiver_tensors = {'image': image_placeholder}
    # Data received from the API call could be transformed here before being
    # handed to the model as features; currently it is passed through as-is.
    return tf.estimator.export.ServingInputReceiver(receiver_tensors, receiver_tensors)


# In memory training data for this simple case.
# When data is too large to fit in memory, use Tensorflow queues.
示例#12
0
def log(msg):
    """Send `msg` to TensorFlow logging and to the 'DeepBugHunter' logger.

    Args:
        msg: The message to log.
    """
    # FATAL is used so the message shows up at any TF logging level.
    tf_logging.log(tf_logging.FATAL, msg)
    app_logger = logging.getLogger('DeepBugHunter')
    app_logger.info(msg)
示例#13
0
def main(argv):
    """Parse command-line flags and run a tf.contrib.learn Experiment.

    Flags whose destination starts with ``hp_`` are collected into the
    hyperparameter dict (prefix stripped). ``--job-dir`` is the output
    directory; the remaining flags (here ``data``) are forwarded to the
    experiment function as keyword arguments.

    Args:
        argv: Not read by this function; ``parse_args()`` is called without
            arguments, so argparse takes the flags from ``sys.argv``.
    """
    # Checkpoint every 500 steps instead of on a timer.
    training_config = tf.contrib.learn.RunConfig(save_checkpoints_secs=None,
                                                 save_checkpoints_steps=500)
    # Bug, exports_to_keep=None is necessary, otherwise this crashes under Python 3
    export_strategy = tf.contrib.learn.utils.saved_model_export_utils.make_export_strategy(
        serving_input_fn=serving_input_fn, exports_to_keep=None)

    # The Experiment is an Estimator with data loading functions and other parameters
    def experiment_fn_with_params(output_dir, hparams, data, **kwargs):
        """Load the data file `data` and build the Experiment for `output_dir`."""
        test_images, test_labels, train_images, train_labels = load_data(data)
        train_steps = hparams["iterations"]

        estimator = tf.estimator.Estimator(model_fn=model.model_fn,
                                           model_dir=output_dir,
                                           config=training_config,
                                           params=hparams)

        def train_input_fn():
            return train_data_input_fn(train_images, train_labels)

        def eval_input_fn():
            return eval_data_input_fn(test_images, test_labels)

        # Compatibility warning: Experiment will move out of contrib in 1.4
        return tf.contrib.learn.Experiment(
            estimator=estimator,
            train_input_fn=train_input_fn,
            eval_input_fn=eval_input_fn,
            train_steps=train_steps,
            eval_steps=1,
            min_eval_frequency=100,
            export_strategies=export_strategy)

    parser = argparse.ArgumentParser()
    # mandatory arguments format for ML Engine:
    # gcloud ml-engine jobs submit training jobXXX --job-dir=... --ml-engine-args -- --user-args
    parser.add_argument('--job-dir', default="checkpoints",
                        help='GCS or local path where to store training checkpoints')
    parser.add_argument('--data', default="planesnet32K.pklz",
                        help='Path to data file (can be on Google cloud storage gs://...)')

    # (flag, type, default, help); a type of None is argparse's default and
    # leaves the value as a string.
    hyperparameter_flags = (
        ('--hp-iterations', int, 80000, 'Hyperparameter: number of training iterations'),
        ('--hp-lr0', float, 0.01, 'Hyperparameter: initial (max) learning rate'),
        ('--hp-lr1', float, 0.0001, 'Hyperparameter: target (min) learning rate'),
        ('--hp-lr2', float, 800, 'Hyperparameter: learning rate decay speed in steps. Learning rate decays by exp(-1) every N steps.'),
        ('--hp-dropout', float, 0.3, 'Hyperparameter: dropout rate on dense layers.'),
        ('--hp-filter-sizes', None, 'S', 'Hyperparameter: convolutional filter sizes S, M, L.'),
        ('--hp-conv1', int, 16, 'Hyperparameter: depth of first convolutional layer. Depth then doubles at each layer.'),
        ('--hp-bnexp', float, 0.993, 'Hyperparameter: exponential decay for batch norm moving averages.'),
        ('--hp-dense', int, 80, 'Hyperparameter: size of the dense layer'),
    )
    for flag, flag_type, flag_default, flag_help in hyperparameter_flags:
        parser.add_argument(flag, default=flag_default, type=flag_type, help=flag_help)

    # Split the parsed flags into hyperparameters (prefix removed) and the rest.
    hparams = {}
    otherargs = {}
    for name, value in vars(parser.parse_args()).items():
        if name.startswith('hp_'):
            hparams[name[3:]] = value
        else:
            otherargs[name] = value

    logging.log(logging.INFO,
                "Hyperparameters:" + str(sorted(hparams.items())))
    logging.log(logging.INFO,
                "Other parameters:" + str(sorted(otherargs.items())))

    output_dir = otherargs.pop('job_dir')

    # learn_runner calls this with the output directory only; everything else
    # is captured from the enclosing scope.
    def experiment_fn(output_dir):
        return experiment_fn_with_params(output_dir, hparams, **otherargs)

    tf.contrib.learn.learn_runner.run(experiment_fn, output_dir)
示例#14
0
def main(argv):
    """Parse the command-line flags and launch YOLO training.

    Every flag whose name starts with ``--hp-`` is collected, with the
    ``hp_`` prefix stripped, into the hyperparameter dict. ``--job-dir``
    becomes the output directory and all remaining flags are forwarded to
    ``start_training`` as keyword arguments.

    Args:
        argv: not read by this function; ``parser.parse_args()`` is called
            without arguments and therefore parses ``sys.argv``.
    """
    parser = argparse.ArgumentParser()

    # mandatory arguments format for ML Engine:
    # gcloud ml-engine jobs submit training jobXXX --job-dir=... --ml-engine-args -- --user-args
    def str2bool(v):
        """Convert a command-line token into a bool.

        Accepts the usual spellings case-insensitively. The previous
        implementation (``v == 'True'``) silently turned 'true', '1', 'yes'
        or any typo into False; unknown tokens are now rejected so that
        argparse reports a usage error instead.
        """
        if isinstance(v, bool):
            return v
        token = str(v).strip().lower()
        if token in ('true', 't', 'yes', 'y', '1'):
            return True
        if token in ('false', 'f', 'no', 'n', '0'):
            return False
        raise argparse.ArgumentTypeError(
            'Boolean value expected (True or False), got {!r}'.format(v))

    parser.add_argument(
        '--job-dir',
        default="checkpoints",
        help='GCS or local path where to store training checkpoints')
    parser.add_argument(
        '--data',
        default="",
        help=
        'Path to training data folder containing full-scale aerial imagery (can be on Google cloud storage gs://...). Eval data should be in a folder with the same name and and _eval suffix.'
    )
    parser.add_argument(
        '--tiledata',
        default="",
        help=
        'Path to training data folder containing image tiles (can be on Google cloud storage gs://...). Eval data should be in a folder with the same name and and _eval suffix.'
    )
    parser.add_argument(
        '--hp-iterations',
        default=25000,
        type=int,
        help='Hyperparameter: number of training iterations')
    parser.add_argument(
        '--hp-batch-size',
        default=10,
        type=int,
        help='Hyperparameter: training batch size')
    parser.add_argument(
        '--hp-eval-batch-size',
        default=32,
        type=int,
        help='Hyperparameter: evaluation batch size')
    # eval dataset is 8380 tiles (262 batches of 32) - larger batch will OOM.
    parser.add_argument(
        '--hp-eval-iterations',
        default=262,
        type=int,
        help='Hyperparameter: eval iterations')
    parser.add_argument(
        '--hp-shuffle-buf',
        default=10000,
        type=int,
        help='Hyperparameter: data shuffle buffer size')
    parser.add_argument(
        '--hp-layers',
        default=12,
        type=int,
        help='Hyperparameter: number of layers')
    parser.add_argument(
        '--hp-first-layer-filter-size',
        default=6,
        type=int,
        help='Hyperparameter: filter size in first layer')
    parser.add_argument(
        '--hp-first-layer-filter-stride',
        default=2,
        type=int,
        help='Hyperparameter: filter stride in first layer')
    parser.add_argument(
        '--hp-first-layer-filter-depth',
        default=32,
        type=int,
        help=
        'Hyperparameter: the number of filters in the first and last layers')
    parser.add_argument(
        '--hp-depth-increment',
        default=5,
        type=int,
        help=
        'Hyperparameter: increment the decrement filter depth by this amount between first and last layer'
    )
    parser.add_argument(
        '--hp-grid-nn',
        default=16,
        type=int,
        help='Hyperparameter: size of YOLO grid: grid-nn x grid-nn')
    parser.add_argument(
        '--hp-cell-n',
        default=2,
        type=int,
        help='Hyperparameter: number of ROIs detected per YOLO grid cell')
    parser.add_argument(
        '--hp-cell-swarm',
        default=True,
        type=str2bool,
        help=
        'Hyperparameter: ground truth ROIs selection algorithm. The better swarm algorithm is only implemented for cell_n=2'
    )
    parser.add_argument(
        '--hp-cell-grow',
        default=1.3,
        type=float,
        help=
        'Hyperparameter: ROIs allowed to be cetered beyond grid cell by this factor'
    )
    parser.add_argument(
        '--hp-lr0',
        default=0.01,
        type=float,
        help='Hyperparameter: initial (max) learning rate')
    parser.add_argument(
        '--hp-lr1',
        default=0.0001,
        type=float,
        help='Hyperparameter: target (min) learning rate')
    parser.add_argument(
        '--hp-lr2',
        default=3000,
        type=float,
        help=
        'Hyperparameter: learning rate decay period in steps. Only used when the decay type is "exponential". For "cosine-restarts", the first decay period is always iterations/8.'
    )
    parser.add_argument(
        '--hp-decay-type',
        default="exponential",
        choices=["exponential", "cosine-restarts"],
        help=
        'Hyperparameter: learning rate decay type. "exponential" (default) or "cosine-restarts".'
    )
    parser.add_argument(
        '--hp-decay-restarts',
        default=3,
        type=int,
        choices=range(0, 6),
        help=
        'Hyperparameter: learning rate decay restarts over the entire training. Only used when decay-type is "cosine-restarts". The learning rate always decays to its min value at the end of "iterations" and the first restart is always at iterations/8.'
    )
    parser.add_argument(
        '--hp-decay-restart-height',
        default=0.99,
        type=float,
        help=
        'Hyperparameter: learning rate restart value as a fraction of the previous max learning rate. Only used when decay-type is "cosine-restarts"'
    )
    parser.add_argument(
        '--hp-dropout',
        default=0.0,
        type=float,
        help=
        'Hyperparameter: dropout rate. It should be between 0.0 and 0.5. 0.0 for no dropout.'
    )
    parser.add_argument(
        '--hp-spatial-dropout',
        default=True,
        type=str2bool,
        help=
        'Hyperparameter: dropout type, spatial or ordinary. Spatial works better in convolutional networks.'
    )
    parser.add_argument(
        '--hp-bnexp',
        default=0.993,
        type=float,
        help='Hyperparameter: exponential decay for batch norm moving averages.'
    )
    parser.add_argument(
        '--hp-lw1',
        default=1,
        type=float,
        help='Hyperparameter: loss weight LW1')
    parser.add_argument(
        '--hp-lw2',
        default=3,
        type=float,
        help='Hyperparameter: loss weight LW2')
    parser.add_argument(
        '--hp-lw3',
        default=30,
        type=float,
        help='Hyperparameter: loss weight LW3')
    # hyperparameters for training data generation when training from large photos directly. They do not affect test data.
    parser.add_argument(
        '--hp-data-tiles-per-gt-roi',
        default=166,
        type=int,
        help=
        'Data generation hyperparameter: number of training tiles generated around each ground truth ROI'
    )
    parser.add_argument(
        '--hp-data-rnd-distmax',
        default=2.0,
        type=float,
        help=
        'Data generation hyperparameter: training tiles selection max random distance from ground truth ROI (always 2.0 for eval tiles)'
    )
    parser.add_argument(
        '--hp-data-rnd-hue',
        default=True,
        type=str2bool,
        help=
        'Data generation hyperparameter: data augmentation with random hue on training images'
    )
    parser.add_argument(
        '--hp-data-rnd-orientation',
        default=True,
        type=str2bool,
        help=
        'Data generation hyperparameter: data augmentation by rotating and flipping tiles.'
    )
    parser.add_argument(
        '--hp-data-cache-n-epochs',
        default=0,
        type=int,
        help=
        'Generate random data variations for n epochs then cache and reuse.')

    args = parser.parse_args()
    arguments = vars(args)

    # argparse stores "--hp-foo-bar" as "hp_foo_bar": strip the "hp_" prefix
    # for hyperparameters, keep everything else under its own name.
    hparams = {k[3:]: v for k, v in arguments.items() if k.startswith('hp_')}
    otherargs = {k: v for k, v in arguments.items() if not k.startswith('hp_')}

    logging.log(logging.INFO,
                "Hyperparameters:" + str(sorted(hparams.items())))
    logging.log(logging.INFO,
                "Other parameters:" + str(sorted(otherargs.items())))

    # job_dir is passed positionally as the output directory, so it must not
    # also appear in the forwarded keyword arguments.
    output_dir = otherargs.pop('job_dir')
    start_training(output_dir, hparams, **otherargs)
def PrintAndLog(msg, lvl=tf_logging.INFO):
    """Send ``msg`` to ``tf_logging`` at level ``lvl`` (INFO by default), then print it."""
    tf_logging.log(lvl, msg)
    print(msg)
def YOLO_head(x, mode, params, info, grid_nn, cell_n):
    """YOLO (You Look Only Once) bounding box head.

    Divides each image into a grid_nn x grid_nn grid and predicts cell_n
    bounding boxes (x, y, w, h) plus a two-class confidence per grid cell.

    Args:
        x: feature map produced by the model core.
        mode: estimator mode, forwarded to conv1x1_batch_norm.
        params: hyperparameter dict, forwarded to conv1x1_batch_norm;
            "cell_grow" is read here to scale the x/y offsets.
        info: layer statistics accumulator, threaded through _layer_stats.
        grid_nn: grid size; must be 48 (enforced by an assert).
        cell_n: number of boxes predicted per grid cell.

    Returns:
        Tuple (box_x, box_y, box_w, box_h, box_c, box_c_logits, info) where
        box_c is the softmax of box_c_logits.
    """
    assert grid_nn == 48
    # Average pooling down to the grid size.
    # For grid_nn=48 the window is 1 with stride 1, i.e. no pooling.
    window = 48 // grid_nn
    pooled = tf.layers.average_pooling2d(x,
                                         pool_size=window,
                                         strides=window,
                                         padding="valid")

    info = _layer_stats(info, "YOLO head, avg pool", pooled, 0, 0)

    # TODO: idea: batch norm may be bad on this layer
    # TODO: try with a deeper layer as well
    # TODO: try a filtered convolution instead of pooling2d, maybe info from cell sides should be weighted differently
    # Six equal slices along the channel axis: one each for x, y, w, h and
    # two for the class confidence.
    raw_x, raw_y, raw_w, raw_h, raw_c0, raw_c1 = tf.split(pooled, 6, axis=-1)

    # tanh for x and y, sigmoid for w and h; each 1x1 convolution outputs
    # cell_n channels (depth=cell_n).
    box_x = tf.nn.tanh(conv1x1_batch_norm(raw_x, mode, params, depth=cell_n))
    box_y = tf.nn.tanh(conv1x1_batch_norm(raw_y, mode, params, depth=cell_n))
    box_w = tf.nn.sigmoid(
        conv1x1_batch_norm(raw_w, mode, params, depth=cell_n))
    box_h = tf.nn.sigmoid(
        conv1x1_batch_norm(raw_h, mode, params, depth=cell_n))

    raw_c = tf.concat([raw_c0, raw_c1], axis=-1)
    # no batch norm before softmax
    # TODO: really no batch norm here ? What kind of batch norm could work ?
    # depth is cell_n*2: 2 = number of classes, object or not object
    flat_logits = conv1x1(raw_c, depth=cell_n * 2)
    box_all = tf.concat([box_x, box_y, box_w, box_h, flat_logits], axis=-1)

    # Four identical x/y/w/h convolutions plus the classification one.
    head_weights = (4 * _count_conv_weights(raw_x, box_x, 1) +
                    _count_conv_weights(raw_c, flat_logits, 1))
    info = _layer_stats(info, "YOLO head, box XYWHC", box_all, 1,
                        head_weights)

    # One pair of class logits per predicted box, then softmax over the pair.
    box_c_logits = tf.reshape(flat_logits, [-1, grid_nn, grid_nn, cell_n, 2])
    box_c = tf.nn.softmax(box_c_logits)

    # Leave some breathing room so that rois from adjacent cells can reach
    # into this one. This prevents training from punishing cells that do see
    # an object but are not assigned any because it is centered in an
    # adjacent cell very close to the limit. A ground truth box that is
    # slightly off could change cell ownership while not changing anything
    # about the underlying pixels.
    box_x = box_x * 1.0 * params["cell_grow"]
    box_y = box_y * 1.0 * params["cell_grow"]

    for tensor in (pooled, box_x, box_y, box_w, box_h, box_c, box_c_logits):
        logging.log(logging.INFO, tensor)

    return box_x, box_y, box_w, box_h, box_c, box_c_logits, info
def model_fn(features, labels, mode, params):
    """Estimator model function: network, YOLO head, loss, metrics and debug summaries.

    Args:
        features: dict whose "image" entry holds uint8 pixels; it is
            rescaled to floats and reshaped to [-1, 768, 768, 3].
        labels: dict providing "target_rois" and "yolo_target_rois"; only
            read when mode is not PREDICT.
        mode: a tf.estimator.ModeKeys value.
        params: hyperparameter dict. "grid_nn", "cell_n", "lw1", "lw2" and
            "lw3" are read here; the dict is also forwarded to the model
            core, the YOLO head and learn_rate_decay.

    Returns:
        A tf.estimator.EstimatorSpec. loss and train_op are None in PREDICT
        mode; eval_metric_ops is only populated in EVAL mode.
    """
    tile_size = 768
    max_detected_rois = 50
    detection_threshold = 0.5  # ship "detected" if predicted C>0.5

    # YOLO parameters
    grid_nn = params["grid_nn"]  # each tile is divided into a grid_nn x grid_nn grid
    cell_n = params["cell_n"]  # each grid cell predicts cell_n bounding boxes.
    boxes_per_tile = grid_nn*grid_nn*cell_n
    info = None

    def _flat_tile_rois(w, h):
        # Stack x, y, w, h per box, convert from grid-cell to tile
        # coordinates, normalise by the tile size and flatten the grid.
        rois = tf.stack([box_x, box_y, w, h], axis=-1)
        rois = box.grid_cell_to_tile_coords(rois, grid_nn, tile_size) / tile_size
        return tf.reshape(rois, [-1, boxes_per_tile, 4])

    # model inputs: input image format is uint8 with range 0 to 255
    images = tf.to_float(features["image"]) / 255.0
    images = tf.reshape(images, [-1, tile_size, tile_size, 3])

    # The model itself is here
    net, info = model_core_squeezenet12(images, mode, params, info)
    logging.debug(images.shape)

    # YOLO head: predicts bounding boxes around ships
    box_x, box_y, box_w, box_h, box_c, box_c_logits, info = layer.YOLO_head(
        net, mode, params, info, grid_nn, cell_n)

    # Debug: print the model structure
    if mode == tf.estimator.ModeKeys.TRAIN:
        logging.log(logging.INFO, info["description"])
        logging.log(logging.INFO,
                    "NN {} layers / {:,d} total weights".format(info["layers"], info["weights"]))

    # Index 1 of the last axis is used as the "is a ship" confidence.
    ship_confidence = box_c[:, :, :, :, 1]
    detected_w = tf.where(tf.greater(ship_confidence, detection_threshold),
                          box_w, tf.zeros_like(box_w))
    detected_h = tf.where(tf.greater(ship_confidence, detection_threshold),
                          box_h, tf.zeros_like(box_w))

    # all rois with confidence factors
    predicted_rois = _flat_tile_rois(box_w, box_h)
    predicted_c = tf.reshape(ship_confidence, [-1, boxes_per_tile])

    # only the rois where a ship was detected
    detected_rois = _flat_tile_rois(detected_w, detected_h)
    detected_rois, detected_rois_overflow = box.remove_empty_rois(
        detected_rois, max_detected_rois)

    loss = train_op = eval_metrics = None
    if mode != tf.estimator.ModeKeys.PREDICT:
        is_eval = mode == tf.estimator.ModeKeys.EVAL

        # Ground truth boxes. Used to compute IOU accuracy and display debug ground truth boxes.
        target_rois = labels["target_rois"]
        # Ground truth boxes assigned to YOLO grid cells. Used to compute loss.
        yolo_targets = labels["yolo_target_rois"]
        target_x, target_y, target_w, target_h = tf.unstack(yolo_targets, axis=-1)

        # target probability is 1 if there is a corresponding target box, 0 otherwise
        target_is_ship = tf.greater(target_w, 0.0001)
        target_is_ship_onehot = tf.one_hot(tf.cast(target_is_ship, tf.int32), 2,
                                           dtype=tf.float32)
        ship_mask = tf.cast(target_is_ship, tf.float32)

        # Mistakes and correct detections for visualisation and debugging.
        # This is computed against the ground truth boxes assigned to YOLO grid cells.
        mistakes, size_correct, position_correct, all_correct = box.compute_mistakes(
            box_x, box_y, box_w, box_h, ship_confidence,
            target_x, target_y, target_w, target_h, target_is_ship, grid_nn)
        debug_img = imgdbg.debug_image(
            images, mistakes, target_rois, predicted_rois, predicted_c,
            size_correct, position_correct, all_correct,
            grid_nn, cell_n, tile_size)

        if is_eval:
            iou_accuracy = box.compute_safe_IOU(
                target_rois, detected_rois, detected_rois_overflow, tile_size)

        # Loss function: position and size errors only count for cells that
        # own a ground truth box.
        squared_dx = tf.square(box_x - target_x)
        squared_dy = tf.square(box_y - target_y)
        position_loss = tf.reduce_mean(ship_mask * (squared_dx + squared_dy))
        width_term = ship_mask * tf.square(box_w - target_w) * 2
        height_term = ship_mask * tf.square(box_h - target_h) * 2
        size_loss = tf.reduce_mean(width_term + height_term)
        obj_loss = tf.losses.softmax_cross_entropy(target_is_ship_onehot, box_c_logits)

        # YOLO trick: weights the different losses differently
        loss_weight_total = (params['lw1'] + params['lw2'] + params['lw3']) * 1.0  # 1.0 to force conversion to float
        w_obj_loss = obj_loss*(params['lw1'] / loss_weight_total)
        w_position_loss = position_loss*(params['lw2'] / loss_weight_total)
        w_size_loss = size_loss*(params['lw3'] / loss_weight_total)
        loss = w_position_loss + w_size_loss + w_obj_loss

        nb_mistakes = tf.reduce_sum(mistakes)

        lr = learn_rate_decay(tf.train.get_or_create_global_step(), params)
        optimizer = tf.train.AdamOptimizer(lr)
        train_op = tf.contrib.training.create_train_op(loss, optimizer)

        if is_eval:
            # metrics removed from training mode because they are not yet supported with MirroredStrategy
            eval_metrics = {
                "position_error": tf.metrics.mean(w_position_loss),
                "size_error": tf.metrics.mean(w_size_loss),
                "ship_cross_entropy_error": tf.metrics.mean(w_obj_loss),
                "mistakes": tf.metrics.mean(nb_mistakes),
                'IOU': tf.metrics.mean(iou_accuracy),
            }

        # Tensorboard summaries for debugging
        for tag, value in (("position_error", w_position_loss),
                           ("size_error", w_size_loss),
                           ("ship_cross_entropy_error", w_obj_loss),
                           ("loss", loss)):
            tf.summary.scalar(tag, value)
        tf.summary.image("input_image", debug_img, max_outputs=20)
        tf.summary.scalar("learning_rate", lr)
        # a summary on iou_accuracy would be nice but it goes Out Of Memory

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions={"rois": predicted_rois, "rois_confidence": predicted_c},  # name these fields as you like
        loss=loss,
        train_op=train_op,
        eval_metric_ops=eval_metrics,
        export_outputs={
            'classes': tf.estimator.export.PredictOutput(
                {"rois": predicted_rois, "rois_confidence": predicted_c})
        })
示例#18
0
def model_fn(features, labels, mode, params):
    """Estimator model function: network, YOLO head, loss, metrics and debug summaries.

    Boxes are described by (x, y, w) in this variant.

    Args:
        features: dict whose "image" entry holds uint8 pixels in 0..255.
        labels: dict providing "count", "target_rois" and
            "yolo_target_rois"; only read when mode is not PREDICT.
        mode: a tf.estimator.ModeKeys value.
        params: hyperparameter dict. "grid_nn", "cell_n", "lw1", "lw2" and
            "lw3" are read here; the dict is also forwarded to the model
            core, the YOLO head and learn_rate_decay.

    Returns:
        A tf.estimator.EstimatorSpec. loss and train_op are None in PREDICT
        mode; eval_metric_ops is only populated in EVAL mode. The exported
        "rois" go through box.swap_xy, the "predictions" ones do not.
    """
    detection_threshold = 0.5  # plane "detected" if predicted C>0.5

    # YOLO parameters
    grid_nn = params["grid_nn"]  # each tile is divided into a grid_nn x grid_nn grid
    cell_n = params["cell_n"]  # each grid cell predicts cell_n bounding boxes.
    boxes_per_tile = grid_nn*grid_nn*cell_n
    info = None

    def _flat_tile_rois(w):
        # Stack x, y, w per box, convert from grid-cell to tile coordinates,
        # normalise by the tile size and flatten the grid into 4-value rois.
        rois = tf.stack([box_x, box_y, w], axis=-1)
        rois = box.grid_cell_to_tile_coords(rois, grid_nn, settings.TILE_SIZE) / settings.TILE_SIZE
        return tf.reshape(rois, [-1, boxes_per_tile, 4])

    # model inputs: input image format is uint8 with range 0 to 255
    images = tf.to_float(features["image"]) / 255.0

    # The model itself is here
    net, info = model_core_configurable_squeezenet(images, mode, params, info)

    # YOLO head: predicts bounding boxes around airplanes
    box_x, box_y, box_w, box_c, box_c_logits, info = layer.YOLO_head(
        net, mode, params, info, grid_nn, cell_n)

    # Debug: print the model structure
    if mode == tf.estimator.ModeKeys.TRAIN:
        logging.log(logging.INFO, info["description"])
        logging.log(logging.INFO,
                    "NN {} layers / {:,d} total weights".format(info["layers"], info["weights"]))

    # TODO: refactor predicted_rois and predicted_c (or keep it to keep the code compatible with confidence factor implem?)
    # with the current softmax implementation, confidence factors are either 0 or 1.
    plane_confidence = tf.cast(tf.argmax(box_c, axis=-1), dtype=tf.float32)
    detected_w = tf.where(tf.greater(plane_confidence, detection_threshold),
                          box_w, tf.zeros_like(box_w))

    # all rois with confidence factors
    predicted_rois = _flat_tile_rois(box_w)
    predicted_c = tf.reshape(plane_confidence, [-1, boxes_per_tile])

    # only the rois where a plane was detected
    detected_rois = _flat_tile_rois(detected_w)
    detected_rois, detected_rois_overflow = box.remove_empty_rois(
        detected_rois, settings.MAX_DETECTED_ROIS_PER_TILE)

    loss = train_op = eval_metrics = None
    if mode != tf.estimator.ModeKeys.PREDICT:
        is_eval = mode == tf.estimator.ModeKeys.EVAL

        # Target labels
        _unused_count = labels["count"]  # not used
        # Ground truth boxes. Used to compute IOU accuracy and display debug ground truth boxes.
        target_rois = labels["target_rois"]
        # Ground truth boxes assigned to YOLO grid cells. Used to compute loss.
        yolo_targets = labels["yolo_target_rois"]
        target_x, target_y, target_w = tf.unstack(yolo_targets, 3, axis=-1)

        # target probability is 1 if there is a corresponding target box, 0 otherwise
        target_is_plane = tf.greater(target_w, 0.0001)
        target_is_plane_onehot = tf.one_hot(tf.cast(target_is_plane, tf.int32), 2,
                                            dtype=tf.float32)
        plane_mask = tf.cast(target_is_plane, tf.float32)

        # Mistakes and correct detections for visualisation and debugging.
        # This is computed against the ground truth boxes assigned to YOLO grid cells.
        mistakes, size_correct, position_correct, all_correct = box.compute_mistakes(
            box_x, box_y, box_w, plane_confidence,
            target_x, target_y, target_w, target_is_plane, grid_nn)
        # Debug image for logging in Tensorboard.
        debug_img = imgdbg.debug_image(
            images, mistakes, target_rois, predicted_rois, predicted_c,
            size_correct, position_correct, all_correct,
            grid_nn, cell_n, settings.TILE_SIZE)

        # IOU (Intersection Over Union) accuracy
        # IOU computation removed from training mode because it used an op not yet supported with MirroredStrategy
        if is_eval:
            iou_accuracy = box.compute_safe_IOU(
                target_rois, detected_rois, detected_rois_overflow, settings.TILE_SIZE)

        # Improvement ideas and experiment results
        # 1) YOLO trick: take square root of predicted size for loss so as not to drown errors on small boxes: tested, no benefit
        # 2) if only one plane in cell, teach all cell_n detectors to detect it: implemented in box.n_experimental_roi_selection_strategy, beneficial
        # 3) TODO: try two or more grids, shifted by 1/2 cell size: this could make it easier to have cells detect planes in their center, if that is an actual problem they have (no idea)
        # 4) try using TC instead of TC_ in position loss and size loss: tested, no benefit
        # 5) TODO: one run without batch norm for comparison
        # 6) TODO: add dropout: tested, weird results: eval accuracy goes up significantly but model performs worse in real life. Probably not enough training data.
        # 7) TODO: idea, compute detection box loss against all ROIs, not just assigned ROIs: if a neighboring cell detects something that aligns well with ground truth, no reason to penalise
        # 8) TODO: add tile rotations, tile color inversion (data augmentation)

        # Loss function: position and size errors only count for cells that
        # own a ground truth box.
        squared_dx = tf.square(box_x - target_x)
        squared_dy = tf.square(box_y - target_y)
        position_loss = tf.reduce_mean(plane_mask * (squared_dx + squared_dy))
        size_loss = tf.reduce_mean(plane_mask * tf.square(box_w - target_w) * 2)
        obj_loss = tf.losses.softmax_cross_entropy(target_is_plane_onehot, box_c_logits)

        # YOLO trick: weights the different losses differently
        loss_weight_total = (params['lw1'] + params['lw2'] + params['lw3']) * 1.0  # 1.0 to force conversion to float
        w_obj_loss = obj_loss*(params['lw1'] / loss_weight_total)
        w_position_loss = position_loss*(params['lw2'] / loss_weight_total)
        w_size_loss = size_loss*(params['lw3'] / loss_weight_total)
        loss = w_position_loss + w_size_loss + w_obj_loss

        # total number of mistakes, summed over the batch
        nb_mistakes = tf.reduce_sum(mistakes)

        lr = learn_rate_decay(tf.train.get_or_create_global_step(), params)
        optimizer = tf.train.AdamOptimizer(lr)
        train_op = tf.contrib.training.create_train_op(loss, optimizer)

        if is_eval:
            # metrics removed from training mode because they are not yet supported with MirroredStrategy
            eval_metrics = {
                "position_error": tf.metrics.mean(w_position_loss),
                "size_error": tf.metrics.mean(w_size_loss),
                "plane_cross_entropy_error": tf.metrics.mean(w_obj_loss),
                "mistakes": tf.metrics.mean(nb_mistakes),
                'IOU': tf.metrics.mean(iou_accuracy),
            }

        # Tensorboard summaries for debugging
        for tag, value in (("position_error", w_position_loss),
                           ("size_error", w_size_loss),
                           ("plane_cross_entropy_error", w_obj_loss),
                           ("loss", loss),
                           ("mistakes", nb_mistakes),
                           ("learning_rate", lr)):
            tf.summary.scalar(tag, value)
        tf.summary.image("input_image", debug_img, max_outputs=20)
        # a summary on iou_accuracy would be nice but it goes Out Of Memory

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions={"rois": predicted_rois, "rois_confidence": predicted_c},  # name these fields as you like
        loss=loss,
        train_op=train_op,
        eval_metric_ops=eval_metrics,
        export_outputs={
            'classes': tf.estimator.export.PredictOutput({
                # TODO: the visualisation GUI was coded for swapped coordinates y1 x1 y2 x2
                "rois": box.swap_xy(predicted_rois),
                # TODO: remove legacy C
                "rois_confidence": predicted_c,
            })
        })
示例#19
0
Main file for training the YOLO (You Look Only Once) detection model"""

import os
import sys
import json
import argparse
import tensorflow as tf
from tensorflow.python.client import device_lib as tf_devices
from tensorflow.python.lib.io import file_io as gcsfile
from tensorflow.python.platform import tf_logging as logging

from trainer_yolo import model
from trainer_yolo import datagen

# Module import side effect: set the TensorFlow logger to INFO verbosity and
# log the TensorFlow version in use.
logging.set_verbosity(logging.INFO)
logging.log(logging.INFO, "Tensorflow version " + tf.__version__)


def get_available_gpus():
    """Return the names of all local devices whose device type is 'GPU'."""
    gpu_names = []
    for device in tf_devices.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names


# input function for base64 encoded JPEG in JSON
# Called when the model is deployed for online predictions on Cloud ML Engine.
def serving_input_fn():

    # input expects a list of jpeg images

    input_bytes = {
        'image_bytes': tf.placeholder(
示例#20
0
def main(argv):
    """Parse the command-line flags and run the experiment through learn_runner.

    Flags starting with ``--hp-`` become hyperparameters (``hp_`` prefix
    stripped), ``--job-dir`` becomes the output directory and the remaining
    flags are forwarded to ``experiment_fn_with_params`` as keyword
    arguments.

    Args:
        argv: not read by this function; ``parse_args()`` is called without
            arguments and therefore parses ``sys.argv``.
    """
    # You must accept a --job-dir argument when running on Cloud ML Engine. It specifies where checkpoints
    # should be saved. You can define additional user arguments which will have to be specified after
    # an empty arg -- on the command line:
    # gcloud ml-engine jobs submit training jobXXX --job-dir=... --ml-engine-args -- --user-args

    # no batch norm: lr 0.002-0.0002-2000 is ok, over 10000 iterations (final accuracy 0.9937 loss 2.39 job156)
    # batch norm: lr 0.02-0.0001-600 conv 16-32-64 trains in 3000 iteration (final accuracy 0.0.8849 loss 1.466 job 159)
    flag_specs = [
        ('--job-dir', {
            'default': "checkpoints",
            'help': 'GCS or local path where to store training checkpoints',
        }),
        ('--data-dir', {
            'default': "data",
            'help': 'Where training data will be loaded and unzipped',
        }),
        ('--hp-lr0', {
            'default': 0.02,
            'type': float,
            'help': 'Hyperparameter: initial (max) learning rate',
        }),
        ('--hp-lr1', {
            'default': 0.0001,
            'type': float,
            'help': 'Hyperparameter: target (min) learning rate',
        }),
        ('--hp-lr2', {
            'default': 600,
            'type': float,
            'help': 'Hyperparameter: learning rate decay speed in steps. Learning rate decays by exp(-1) every N steps.',
        }),
        ('--hp-dropout', {
            'default': 0.3,
            'type': float,
            'help': 'Hyperparameter: dropout rate on dense layers.',
        }),
        ('--hp-conv1', {
            'default': 6,
            'type': int,
            'help': 'Hyperparameter: depth of first convolutional layer.',
        }),
        ('--hp-conv2', {
            'default': 12,
            'type': int,
            'help': 'Hyperparameter: depth of second convolutional layer.',
        }),
        ('--hp-conv3', {
            'default': 24,
            'type': int,
            'help': 'Hyperparameter: depth of third convolutional layer.',
        }),
        ('--hp-bnexp', {
            'default': 0.993,
            'type': float,
            'help': 'Hyperparameter: exponential decay for batch norm moving averages.',
        }),
        ('--hp-iterations', {
            'default': 10000,
            'type': int,
            'help': 'Hyperparameter: number of training iterations.',
        }),
    ]

    parser = argparse.ArgumentParser()
    for flag, options in flag_specs:
        parser.add_argument(flag, **options)
    arguments = vars(parser.parse_args())

    # argparse stores "--hp-foo" as "hp_foo": strip the "hp_" prefix for
    # hyperparameters, keep everything else under its own name.
    hparams = {}
    otherargs = {}
    for key, value in arguments.items():
        if key.startswith('hp_'):
            hparams[key[3:]] = value
        else:
            otherargs[key] = value

    logging.log(logging.INFO,
                "Hyperparameters:" + str(sorted(hparams.items())))

    output_dir = otherargs.pop('job_dir')

    # learn_runner needs an experiment function with a single parameter: the output directory.
    # Here we pass additional command line arguments through a closure.
    def experiment_fn(output_dir):
        return experiment_fn_with_params(output_dir, hparams, **otherargs)

    # Compatibility warning: learn_runner is currently in contrib. It will move in TF 1.2
    tf.contrib.learn.learn_runner.run(experiment_fn, output_dir)
示例#21
0
def start_training(output_dir, hparams, data, tiledata, **kwargs):
    """Build the input functions and the Estimator, then run train_and_evaluate.

    Args:
        output_dir: directory handed to the Estimator and its RunConfig as
            ``model_dir``.
        hparams: dict of hyperparameters. It is indexed by key here
            ("grid_nn", "cell_n", "cell_swarm", "cell_grow", "batch_size",
            "eval_batch_size", "shuffle_buf", "data_rnd_hue",
            "data_rnd_orientation", "data_cache_n_epochs", "iterations",
            "eval_iterations" and, for image input, "data_tiles_per_gt_roi"
            and "data_rnd_distmax") and also forwarded whole to the model
            function as ``params``.
        data: location of the aerial imagery, or "" when unused. Evaluation
            files are listed from ``data + "_eval"``.
        tiledata: location of the ``*.tfrecord`` tiles, or "" when unused.
            Evaluation files are listed from ``tiledata + "_eval"``.
        **kwargs: accepted so that callers can pass extra arguments; not used.

    Returns:
        None. When not exactly one of ``data`` / ``tiledata`` is non-empty,
        an error is logged and the function returns without training.
    """

    # YOLO configuration for ROI assignments
    yolo_cfg = datagen.YOLOConfig(hparams["grid_nn"], hparams["cell_n"],
                                  hparams["cell_swarm"], hparams["cell_grow"])
    # Evaluation uses the same settings except for a fixed last argument of 1.0
    # in place of hparams["cell_grow"].
    eval_yolo_cfg = datagen.YOLOConfig(hparams["grid_nn"], hparams["cell_n"],
                                       hparams["cell_swarm"], 1.0)

    # data source selection: full aerial imagery or TFRecords containing individual 256x256 tiles
    if tiledata != "" and data == "":  # training from tfrecords
        tfrec_filelist = gcsfile.get_matching_files(tiledata + "/*.tfrecord")

        def train_data_input_fn():
            return datagen.train_dataset_from_tfrecords(
                tfrec_filelist, hparams["batch_size"], hparams["shuffle_buf"],
                yolo_cfg, hparams["data_rnd_hue"],
                hparams["data_rnd_orientation"],
                hparams["data_cache_n_epochs"])

        tfrec_filelist_eval = gcsfile.get_matching_files(tiledata + "_eval" +
                                                         "/*.tfrecord")

        def eval_data_input_fn():
            return datagen.eval_dataset_from_tfrecords(
                tfrec_filelist_eval, hparams["eval_batch_size"], eval_yolo_cfg)

    elif data != "" and tiledata == "":  # training from aerial imagery directly
        img_filelist, roi_filelist = datagen.load_file_list(data)

        def train_data_input_fn():
            return datagen.train_dataset_from_images(
                img_filelist, roi_filelist, hparams["batch_size"],
                hparams["shuffle_buf"], yolo_cfg, hparams["data_rnd_hue"],
                hparams["data_rnd_orientation"],
                hparams["data_tiles_per_gt_roi"],
                hparams["data_rnd_distmax"], hparams["data_cache_n_epochs"])

        img_filelist_eval, roi_filelist_eval = datagen.load_file_list(
            data + "_eval")

        def eval_data_input_fn():
            return datagen.eval_dataset_from_images(
                img_filelist_eval, roi_filelist_eval,
                hparams["eval_batch_size"], eval_yolo_cfg)

    else:
        logging.log(
            logging.ERROR,
            "One and only one of parameters 'data' and 'tiledata' must be supplied."
        )
        return

    # Estimator configuration
    export_latest = tf.estimator.LatestExporter(
        name="planespotting",
        serving_input_receiver_fn=serving_input_fn,
        exports_to_keep=1)

    train_spec = tf.estimator.TrainSpec(input_fn=train_data_input_fn,
                                        max_steps=hparams["iterations"])

    eval_spec = tf.estimator.EvalSpec(
        input_fn=eval_data_input_fn,
        steps=hparams['eval_iterations'],
        exporters=export_latest,
        start_delay_secs=1,  # Confirmed: this does not work (plane533 for ex.)
        throttle_secs=60)

    # Device filters to prevent unwanted communications between nodes
    # This is necessary for now for running distributed jobs on ML Engine
    # If running long evaluations, workers can be done before master and in that case ML Engine crashes.
    # These device filters prevent unwanted communications from happening and will prevent the crash.
    # This code should be folded into Estimator in Tensorflow v1.9
    tf_config = json.loads(os.environ.get('TF_CONFIG', '{}'))
    # A missing or empty 'task' entry (e.g. a local run) means no device filters.
    task = tf_config.get('task') or {}
    task_type = task.get('type')
    config = None
    if task_type == 'master':
        config = tf.ConfigProto(device_filters=['/job:ps', '/job:master'])
    elif task_type == 'worker':
        config = tf.ConfigProto(device_filters=[
            '/job:ps',
            '/job:worker/task:%d' % task['index']
        ])
    # end of temporary fix code for distributed training on ML Engine

    # Experimental distribution strategy if running on a machine with multiple GPUs
    # The GPU list is queried once and reused for both the log line and the decision.
    gpus = get_available_gpus()
    logging.log(logging.INFO, "GPUs found: " + str(gpus))
    distribution = (tf.contrib.distribute.MirroredStrategy()
                    if len(gpus) > 1 else None)

    training_config = tf.estimator.RunConfig(
        model_dir=output_dir,
        save_summary_steps=100,
        save_checkpoints_steps=2000,
        keep_checkpoint_max=1,
        train_distribute=distribution,
        session_config=config)  # device filters set here

    estimator = tf.estimator.Estimator(model_fn=model.model_fn,
                                       model_dir=output_dir,
                                       config=training_config,
                                       params=hparams)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
# Example #22
0
def main(argv):
    """Parse command-line arguments, then train and evaluate the MNIST model.

    The parsed arguments (plus a "batch_size" entry copied from --batch) are
    passed to the model function as ``params``.

    Args:
        argv: not used; argparse reads the arguments from ``sys.argv``.
    """
    parser = argparse.ArgumentParser()
    # You must accept a --job-dir argument when running on Cloud ML Engine. It specifies where checkpoints
    # should be saved. You can define additional user arguments which will have to be specified after
    # an empty arg -- on the command line:
    # gcloud ml-engine jobs submit training jobXXX --job-dir=... --ml-engine-args -- --user-args

    # no batch norm: lr 0.002-0.0002-2000 is ok, over 10000 iterations (final accuracy 0.9937 loss 2.39 job156)
    # batch norm: lr 0.02-0.0001-600 conv 16-32-64 trains in 3000 iteration (final accuracy 0.9949 loss 1.466 job 159)

    def str2bool(v):
        """Convert a command-line string to a bool.

        'true', '1' and 'yes' (any letter case) are True; anything else is
        False. The exact strings 'True' and 'False' behave as before.
        """
        return v.strip().lower() in ('true', '1', 'yes')

    parser.add_argument('--job-dir', default="checkpoints",
                        help='GCS or local path where to store training checkpoints')
    parser.add_argument('--data-dir', default="data",
                        help='Where training data will be loaded and unzipped')
    parser.add_argument('--lr0', default=0.02, type=float,
                        help='Hyperparameter: initial (max) learning rate')
    parser.add_argument('--lr1', default=0.0001, type=float,
                        help='Hyperparameter: target (min) learning rate')
    parser.add_argument('--lr2', default=600, type=float,
                        help='Hyperparameter: learning rate decay speed in steps. Learning rate decays by exp(-1) every N steps.')
    parser.add_argument('--dropout', default=0.3, type=float,
                        help='Hyperparameter: dropout rate on dense layers.')
    parser.add_argument('--conv1', default=6, type=int,
                        help='Hyperparameter: depth of first convolutional layer.')
    parser.add_argument('--conv2', default=12, type=int,
                        help='Hyperparameter: depth of second convolutional layer.')
    parser.add_argument('--conv3', default=24, type=int,
                        help='Hyperparameter: depth of third convolutional layer.')
    parser.add_argument('--bnexp', default=0.993, type=float,
                        help='Hyperparameter: exponential decay for batch norm moving averages.')
    parser.add_argument('--iterations', default=5000, type=int,
                        help='Hyperparameter: number of training iterations.')
    parser.add_argument('--eval-iterations', default=10, type=int,
                        help='Hyperparameter: number of evaluation iterations.')
    parser.add_argument('--batch', default=1024, type=int,
                        help='Global batch size (1/8th of this is the real batch size on one TPU)')
    parser.add_argument('--use-tpu', default=False, type=str2bool,
                        help='Using a TPU or not')
    parser.add_argument('--tpu-iterations', default=100, type=int,
                        help='Iterations per call to the TPU')
    # TPUEstimator also adds the following parameters internally - do not use them
    parser.add_argument('--tpu', default=None,
                        help='(internal) ML Engine uses this argument to pass the IP address of the TPU')
    parser.add_argument('--tpu-zone', default=None,
                        help='(internal) GCP zone where to provision the TPUs')
    parser.add_argument('--gcp-project', default=None,
                        help='(internal) GCP project where to provision the TPUs')
    #parser.add_argument('--batch-size', default=None, help='(internal) Global batch size on TPUs')
    args = parser.parse_args()

    logging.log(logging.INFO, "Parameters:" + str(args))

    train_images_file, train_labels_file, test_images_file, test_labels_file = load_mnist_data(args.data_dir)

    def train_input_fn(params):
        return train_data_input_fn(train_images_file, train_labels_file, params)

    def eval_input_fn(params):
        return eval_data_input_fn(test_images_file, test_labels_file, params)

    # TPU variant, kept for reference:
    # training_config = tf.contrib.tpu.RunConfig(
    #     cluster=tf.contrib.cluster_resolver.TPUClusterResolver(args.tpu, args.tpu_zone,args.gcp_project) \
    #         if args.use_tpu else None,
    #     model_dir=args.job_dir,
    #     session_config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True),
    #     tpu_config=tf.contrib.tpu.TPUConfig(args.tpu_iterations, 8)
    # )

    training_config = tf.estimator.RunConfig(model_dir=args.job_dir,
                                             save_summary_steps=100,
                                             save_checkpoints_steps=500,
                                             keep_checkpoint_max=1)
    # estimator = tf.contrib.tpu.TPUEstimator(model_fn=conv_model2, model_dir=args.job_dir, params=args.__dict__,
    #                                         train_batch_size=args.batch,
    #                                         eval_batch_size=args.batch,
    #                                         config=training_config, use_tpu=args.use_tpu)

    # Work on a copy so that adding "batch_size" does not alter the parsed namespace.
    params = dict(vars(args))
    params["batch_size"] = args.batch
    estimator = tf.estimator.Estimator(model_fn=conv_model2,
                                       model_dir=args.job_dir,
                                       params=params,
                                       config=training_config)

    # NOTE(review): --iterations is parsed but not applied. max_steps=None is
    # kept from the original code; the capped alternative is the line below.
    #train_spec = tf.estimator.TrainSpec(train_input_fn, max_steps=args.iterations)
    train_spec = tf.estimator.TrainSpec(train_input_fn, max_steps=None)
    export_latest = tf.estimator.LatestExporter("mnist-model",
                                                serving_input_receiver_fn=serving_input_fn)
    # Honour --eval-iterations (default 10) instead of a hard-coded step count.
    eval_spec = tf.estimator.EvalSpec(eval_input_fn,
                                      steps=args.eval_iterations,
                                      exporters=export_latest,
                                      throttle_secs=2)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)