Example #1
def main(args):
    if args.dataset.lower() == "cityscapes":
        use_coarse = (args.extra is not None
                      and "coarse" in args.extra[0].lower())
        dataset = datasets.Cityscapes(use_coarse)
    elif args.dataset.lower() == "freiburg":
        modalities = args.extra
        dataset = datasets.Freiburg(modalities)
    elif args.dataset.lower() == "vistas":
        dataset = datasets.Vistas()
    else:
        raise ValueError("Invalid argument \"dataset\": %s" % args.dataset[0])

    if os.path.exists(args.data_dir):
        dataset_paths = dataset.file_associations(args.data_dir)
    else:
        raise ValueError("Dataset path does not exist\n%s\n" % args.data_dir)

    if not os.path.exists(args.output_dir):
        sys.stdout.write("Directory \"%s\" does not exist. "
                         % args.output_dir)
        sys.stdout.write("Do you want to create it? [y/N] ")
        sys.stdout.flush()
        user_input = sys.stdin.read(1)
        if not user_input or user_input.lower() != "y":
            sys.exit(0)
        else:
            os.makedirs(args.output_dir)

    for split in dataset_paths.keys():
        # Create directory for the split
        split_path = os.path.join(args.output_dir, split)
        if not os.path.exists(split_path):
            os.mkdir(split_path)
        p = multiprocessing.Pool()
        try:
            # Progress bar
            examples = list(dataset_paths[split].items())
            _record_example = partial(record_example,
                                      scale=args.scale_factor,
                                      dataset=dataset,
                                      split_path=split_path)
            if show_progress:
                example_iter = iter(tqdm(
                    p.imap_unordered(_record_example, examples),
                    total=len(examples), desc="%-7s" % split))
            else:
                example_iter = p.imap_unordered(_record_example, examples)
            # Retrieve a single prototype feature dict from the first result
            features = next(example_iter)
            # Multiprocess the rest
            for _ in example_iter:
                pass
        finally:
            p.close()
    # Write the feature keys so that the record contents can be
    # reconstructed dynamically when reading the records back.
    meta_file = os.path.join(args.output_dir, "meta.txt")
    with open(meta_file, "w") as f:
        f.write("\n".join(features.keys()))
Example #2
def main(args):
    tf.logging.set_verbosity(tf.logging.ERROR)
    dataset = None
    if args.dataset.lower() == "cityscapes":
        dataset = datasets.Cityscapes(args.use_coarse)
    elif args.dataset.lower() == "freiburg":
        dataset = datasets.Freiburg(args.modalities)
    elif args.dataset.lower() == "vistas":
        dataset = datasets.Vistas()
    elif args.dataset.lower() == "generic":
        dataset = datasets.Generic(args.image_dir, args.label_dir)
    else:
        raise ValueError("Dataset \"%s\" not supported." % args.dataset)

    if not os.path.exists(args.output_dir):
        sys.stdout.write("Directory \"%s\" does not exist. "
                         % args.output_dir)
        sys.stdout.write("Do you want to create it? [y/N] ")
        sys.stdout.flush()
        user_input = sys.stdin.read(1)
        if not user_input or user_input.lower() != "y":
            sys.exit(0)
        else:
            os.makedirs(args.output_dir)

    file_associations = dataset.file_associations(args.data_dir)
    sess = tf.Session()
    for split in file_associations:
        # Create path to split
        split_path = os.path.join(args.output_dir, split)
        if not os.path.exists(split_path):
            os.mkdir(split_path)

        # Create generator and retrieve the length
        generator, output_len = generator_from_file_associations(file_associations[split])
        # Create dataset from generator
        tf_dataset = tf.data.Dataset.from_generator(generator,
                output_types=(tf.string, tf.string),
                output_shapes=(output_len, output_len))
        # Add fixed arguments @dataset / @split_path to map functions
        _read_images = lambda x, y: read_images(x, y, dataset, args.width)
        _tf_write_serialized_example = lambda x, y, z, u: \
                tf_write_serialized_example(x, y, z, u, split_path)
        # Map the above functions
        tf_dataset = tf_dataset.map(_read_images,
                num_parallel_calls=_NUM_CPUS-1)
        tf_dataset = tf_dataset.map(_tf_write_serialized_example,
                num_parallel_calls=_NUM_CPUS-1)
        tf_dataset = tf_dataset.batch(_NUM_CPUS-1)
        # Create iterator
        _iter = tf_dataset.make_one_shot_iterator()
        _next = _iter.get_next()
        # Run over all examples
        with tqdm.tqdm(total=len(file_associations[split]),
                       ascii=" #",
                       desc="%-6s" % split,
                       dynamic_ncols=True) as pbar:
            while True:
                try:
                    filenames = sess.run(_next)
                    pbar.update(len(filenames))
                except tf.errors.OutOfRangeError:
                    break
    sess.close()
    return 0
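
The mapped helper tf_write_serialized_example is not shown above. Since writing a file is a Python side effect, it is presumably a tf.py_func wrapper along these lines; the argument layout and feature keys below are guesses, not the project's actual API:

import os
import tensorflow as tf

def tf_write_serialized_example(example_id, image_data, label_data, shape,
                                split_path):
    def _write(example_id, image_data, label_data):
        # Build a minimal Example from the raw encodings (keys assumed).
        features = tf.train.Features(feature={
            "image/data": tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[image_data])),
            "label": tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[label_data])),
        })
        example = tf.train.Example(features=features)
        filename = example_id.decode("utf-8") + ".tfrecord"
        with tf.io.TFRecordWriter(os.path.join(split_path, filename)) as f:
            f.write(example.SerializeToString())
        # Return the id so the pipeline yields one element per example.
        return example_id

    return tf.py_func(_write, [example_id, image_data, label_data], tf.string)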
Example #3
def main(args):
    dataset = None
    scale_factor = args.scale_factor
    scale_factor_image = None
    scale_factor_label = None

    if args.dataset.lower() == "cityscapes":
        use_coarse = (args.extra is not None
                      and "coarse" in args.extra[0].lower())
        dataset = datasets.Cityscapes(use_coarse)
    elif args.dataset.lower() == "freiburg":
        modalities = args.extra
        dataset = datasets.Freiburg(modalities)
    elif args.dataset.lower() == "vistas":
        dataset = datasets.Vistas()
    else:
        raise ValueError("Invalid argument \"dataset\": %s" % args.dataset)

    ################ Build Tensorflow Graph ##################
    input_filename = tf.placeholder(dtype=tf.string)
    file_contents = tf.read_file(input_filename)
    # Separate heads for decoding png or jpg
    image_decoding = tf.image.decode_image(file_contents)
    label_decoding = tf.image.decode_image(file_contents)
    # Get the shape of image / labels to assert them equal
    image_shape = tf.shape(image_decoding)
    label_shape = tf.shape(label_decoding)
    if args.width is not None:
        scale_factor_image = image_shape[1] / args.width
        scale_factor_label = label_shape[1] / args.width
    if scale_factor_image is not None:
        scale_factors_image = tf.stack(
            [scale_factor_image, scale_factor_image, 1])
        scale_factors_label = tf.stack(
            [scale_factor_label, scale_factor_label, 1])
        # Compute rescaled shapes
        image_shape = tf.cast(
            tf.round(tf.cast(image_shape, tf.float64) / scale_factors_image),
            tf.int32)
        label_shape = tf.cast(
            tf.round(tf.cast(label_shape, tf.float64) / scale_factors_label),
            tf.int32)

        image_decoding = tf.image.resize_nearest_neighbor(
            tf.expand_dims(image_decoding, axis=0), image_shape[:-1])
        label_decoding = tf.image.resize_nearest_neighbor(
            tf.expand_dims(label_decoding, axis=0), label_shape[:-1])
        image_decoding = tf.squeeze(image_decoding, axis=0)
        label_decoding = tf.squeeze(label_decoding, axis=0)
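        # NOTE nearest-neighbor resizing is essential for the label image:
        # an interpolating resize would blend class ids into invalid values.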
    image_encoding = tf.cond(
        tf.strings.regex_full_match(input_filename, r".+\.png$"),
        true_fn=lambda: tf.image.encode_png(image_decoding),
        false_fn=lambda: tf.image.encode_jpeg(image_decoding))
    # Remapping of labels (can only be png)
    embedding = tf.constant(dataset.embedding, dtype=tf.uint8)
    label_remapped = tf.gather_nd(embedding, tf.cast(label_decoding, tf.int32))
    label_remapped = tf.expand_dims(label_remapped, axis=-1)
    label_encoding = tf.image.encode_png(label_remapped)
    # In order to convert tiff to png
    tif_input_image = tf.placeholder(tf.uint8, shape=[None, None, None])
    tif_png_encoding = tf.image.encode_png(tif_input_image)
    ##########################################################

    if os.path.exists(args.data_dir):
        dataset_paths = dataset.file_associations(args.data_dir)
    else:
        raise ValueError("Dataset path does not exist\n%s\n" % args.data_dir)

    if not os.path.exists(args.output_dir):
        sys.stdout.write("Directory \"%s\" does not exist. " % args.output_dir)
        sys.stdout.write("Do you want to create it? [y/N] ")
        sys.stdout.flush()
        user_input = sys.stdin.read(1)
        if not user_input or user_input.lower() != "y":
            sys.exit(0)
        else:
            os.makedirs(args.output_dir)

    # Create session on CPU
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = ""
    sess = tf.Session(config=config)
    # Write records for each split
    for split in dataset_paths.keys():
        # Create directory for the split
        split_path = os.path.join(args.output_dir, split)
        if not os.path.exists(split_path):
            os.mkdir(split_path)
        # Progress bar
        if show_progress:
            example_iter = tqdm(list(dataset_paths[split].items()),
                                desc="%-7s" % split,
                                ascii=True,
                                dynamic_ncols=True)
        else:
            example_iter = list(dataset_paths[split].items())
        # Iterate over all examples in split and gather samples in
        # separate records
        for example in example_iter:
            # example = [str(ID), dict({str(type): str(path)})]
            features = {}
            shapes = []
            for _type in example[1].keys():
                # Only "label" key need to be treated differently all other
                # is assumed to contain image data (rgb/nir/depthmap)
                path = example[1][_type]
                ext = path.split(".")[-1]  # path extension
                if "label" in _type:  # label data
                    # Check file extension
                    if ext != "png":
                        raise ValueError(
                            "The label images need to be png files! "
                            "Got \"%s\"" % ext)
                    label, shape = sess.run(
                        fetches=[label_encoding, label_shape],
                        feed_dict={input_filename: path})
                    features["label"] = _bytes_feature(label)
                else:  # image data
                    # Handle the different file extensions separately
                    if ext == "tif" or ext == "tiff":
                        # read image and convert to png
                        ext = "png"
                        # Read image as is (iscolor=-1)
                        image = cv2.imread(path, -1)
                        shape = image.shape
                        if len(shape) == 3 and shape[-1] == 3:
                            # Opencv defaults to BGR whereas Tensorflow RGB
                            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                        elif len(shape) == 2:
                            image = np.expand_dims(image, axis=-1)
                        image = sess.run(tif_png_encoding,
                                         feed_dict={tif_input_image: image})
                    elif ext == "png" or ext == "jpg" or ext == "jpeg":
                        image, shape = sess.run(
                            fetches=[image_encoding, image_shape],
                            feed_dict={input_filename: path})
                    else:
                        raise ValueError("Unsupported image format \"%s\"" %
                                         ext)
                    if len(shape) == 3:
                        channels = shape[2]
                    else:
                        channels = 1
                    # note that @_type/data is the raw image encoding
                    features[_type + "/channels"] = _int64_feature(channels)
                    features[_type + "/data"] = _bytes_feature(image)
                    features[_type + "/encoding"] = _bytes_feature(ext)
                shapes.append(shape)
            # END for _type in example[1].keys()
            # Check that shapes are consistent
            for i in range(1, len(shapes)):
                if shapes[i][0] != shapes[i-1][0] or \
                   shapes[i][1] != shapes[i-1][1]:
                    raise ValueError("Image dimensions does not match label.\n"
                                     "Got: %s" % shapes)
            # Add shape info to the feature. Note that channels are already
            # added and the label image is assumed to be a single-channel png.
            features["height"] = _int64_feature(shape[0])
            features["width"] = _int64_feature(shape[1])
            features["id"] = _bytes_feature(example[0])
            # Construct feature example
            tf_features = tf.train.Features(feature=features)
            tf_example = tf.train.Example(features=tf_features)
            filename = example[0] + ".tfrecord"
            with tf.io.TFRecordWriter(os.path.join(split_path, filename)) as f:
                f.write(tf_example.SerializeToString())
    # Write the feature keys so that the record contents can be
    # reconstructed dynamically when reading the records back.
    meta_file = os.path.join(args.output_dir, "meta.txt")
    with open(meta_file, "w") as f:
        f.write("\n".join(features.keys()))
Example #4
def main(args, logger):
    # Retrieve training parameters for convenience
    params   = args.params               # All parameters
    hparams  = params["hyperparams"]     # Hyperparameters
    alparams = params["active_learning"] # Active learning parameters
    state = None # State dict
    # Define state and config filenames
    state_filename  = os.path.join(args.log_dir, "state.json")
    config_filename = os.path.join(args.log_dir, "config.json")
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
        # Dump parameter config
        with open(config_filename, "w+") as f:
            json.dump(params, f, indent=4)

    # Retrieve dataset specific object
    if args.dataset == "cityscapes":
        dataset = datasets.Cityscapes(coarse=args.coarse)
        test_examples_glob = os.path.join(args.data_dir, "val", "*.tfrecord")
    elif args.dataset == "freiburg":
        dataset = datasets.Freiburg()
        test_examples_glob = os.path.join(args.data_dir, "test", "*.tfrecord")
    elif args.dataset == "vistas":
        dataset = datasets.Vistas()
        test_examples_glob = os.path.join(args.data_dir, "val", "*.tfrecord")
    else:
        raise NotImplementedError("Dataset \"%s\" not supported" % args.dataset)

    # Prepare dataset example file paths.
    train_examples_glob = os.path.join(args.data_dir, "train", "*.tfrecord")

    if not os.path.exists(state_filename):
        # Initialize state
        # Resolve example filenames
        train_val_examples = np.sort(np.array(glob.glob(train_examples_glob)))
        # Pick examples from training set to use for validation
        val_examples   = train_val_examples[:alparams["num_validation"]]
        # Use the rest as training examples
        train_examples = train_val_examples[alparams["num_validation"]:]

        # Use annotated test set, NOTE: cityscapes validation set
        test_examples  = np.array(glob.glob(test_examples_glob))

        # Draw random train examples and mark as annotated
        train_indices  = np.arange(len(train_examples), dtype=np.int32)
        np.random.shuffle(train_indices)

        initially_labelled = alparams["num_initially_labelled"]
        if initially_labelled < 0:
            # Use rest of labelled examples
            initially_labelled = len(train_examples)

        # Possibly add actually unlabelled examples
        no_label_indices = np.empty(0, dtype=str)
        if args.unlabelled is not None:
            no_label_glob     = os.path.join(args.unlabelled, "*.tfrecord")
            no_label_examples = glob.glob(no_label_glob)
            no_label_indices  = np.arange(
                len(train_indices), len(train_indices)+len(no_label_examples)
            )
            train_examples = np.concatenate((train_examples,
                                             no_label_examples))
            train_indices = np.concatenate((train_indices, no_label_indices))

        labelled = train_indices[:initially_labelled]
        unlabelled = train_indices[initially_labelled:]
        del train_indices

        # Setup initial state
        state = {
            "checkpoint" : None, # Keep track of latest checkpoint.
            "iteration"  : 0,
            "dataset" : {
                "train" : {
                    "filenames"  : list(train_examples),
                    "labelled"   : labelled.tolist(),
                    "unlabelled" : unlabelled.tolist(),
                    "no_label"   : no_label_indices.tolist()
                },
                "val"   : {
                    "filenames" : list(val_examples)
                },
                "test"  : {
                    "filenames" : list(test_examples)
                }
            }
        }
        with open(state_filename, "w+") as f:
            json.dump(state, f, indent=2)

    else:
        # Load state
        with open(state_filename, "r") as f:
            state = json.load(f)
        # Extract filename properties
        train_examples   = np.array(state["dataset"]["train"]["filenames"])
        val_examples     = np.array(state["dataset"]["val"]["filenames"])
        test_examples    = np.array(state["dataset"]["test"]["filenames"])
        labelled         = np.array(state["dataset"]["train"]["labelled"])
        unlabelled       = np.array(state["dataset"]["train"]["unlabelled"])
        no_label_indices = np.array(state["dataset"]["train"]["no_label"])

    train_input_labelled = np.full_like(train_examples, False, dtype=bool)
    train_input_labelled[labelled] = True
    train_input_indices = np.arange(len(train_examples))

    with tf.device("/device:CPU:0"):
        with tf.name_scope("Datasets"):
            # Create input placeholders
            train_input = tt.input.NumpyCapsule()
            train_input.filenames = train_examples
            train_input.labelled = train_input_labelled
            train_input.indices   = train_input_indices

            val_input = tt.input.NumpyCapsule()
            val_input.filenames = val_examples
            test_input = tt.input.NumpyCapsule()
            test_input.filenames = test_examples

            # Setup input pipelines
            train_input_stage = tt.input.InputStage(
                input_shape=[params["network"]["input"]["height"],
                             params["network"]["input"]["width"]])
            # Validation AND Test input stage
            val_input_stage  = tt.input.InputStage(
                input_shape=[params["network"]["input"]["height"],
                             params["network"]["input"]["width"]])

            # Add datasets
            train_input_stage.add_dataset_from_placeholders(
                "train", train_input.filenames,
                train_input.labelled, train_input.indices,
                batch_size=params["batch_size"],
                augment=True)
            # Validation set
            val_input_stage.add_dataset_from_placeholders(
                "val", val_input.filenames,
                batch_size=params["batch_size"])
            # Test set
            val_input_stage.add_dataset_from_placeholders(
                "test", test_input.filenames,
                batch_size=params["batch_size"])
            # Calculate number of batches in each iterator.
            # NOTE @train_batches is needed by the learning rate decay below.
            train_batches = (len(train_examples) - 1)//params["batch_size"] + 1
            val_batches   = (len(val_examples) - 1)//params["batch_size"] + 1
            test_batches  = (len(test_examples) - 1)//params["batch_size"] + 1

            # Get iterator outputs
            train_image_raw, train_image, train_label, train_mask, \
                train_labelled, train_index = train_input_stage.get_output()
            val_image, val_label, val_mask = val_input_stage.get_output()

        # Create step variables
        with tf.variable_scope("StepCounters"):
            global_step = tf.Variable(0, dtype=tf.int64,
                                      trainable=False, name="GlobalStep")
            local_step  = tf.Variable(0, dtype=tf.int64,
                                      trainable=False, name="LocalStep")
            global_step_op = tf.assign_add(global_step, local_step)
            epoch_step  = tf.Variable(0, trainable=False, name="EpochStep")
            epoch_step_inc = tf.assign_add(epoch_step, 1)

    # Build training- and validation network
    regularization = {"drop_rates": hparams["dropout_rates"]}
    if hparams["weight_reg"]["L2"] > 0.0 \
       or hparams["weight_reg"]["L1"] > 0.0:
        regularization = {
            "weight_regularization" : tf.keras.regularizers.l1_l2(
                                          l1=hparams["weight_reg"]["L1"],
                                          l2=hparams["weight_reg"]["L2"]),
            "regularization_scaling" : hparams["weight_reg"]["glorot_scaling"],
        }

    # Initialize networks
    train_net = models.ENet(
        dataset.num_classes,
        **regularization
    )
    val_net = models.ENet(dataset.num_classes)

    with tf.device("/device:GPU:0"):
        # Build graph for training
        train_logits  = train_net(train_image, training=True)
        # Compute predictions: use @train_pred for metrics and
        # @pseudo_label for pseudo_annotation process.
        train_pred    = tf.math.argmax(train_logits, axis=-1,
                                       name="TrainPredictions")

        with tf.name_scope("PseudoAnnotation"):
            # Build ops one more time without dropout.
            pseudo_logits = train_net(train_image_raw, training=False)
            # Just make sure not to propagate gradients a second time.
            pseudo_logits = tf.stop_gradient(pseudo_logits)
            pseudo_label  = tf.math.argmax(pseudo_logits, axis=-1,
                                           name="TrainPredictions")
            pseudo_label = tf.cast(pseudo_label, tf.uint8)

            # Configure on-line high confidence pseudo labeling.
            pseudo_prob   = tf.nn.softmax(pseudo_logits, axis=-1, name="TrainProb")
            if alparams["measure"] == "entropy":
                # Reduce entropy over last dimension.
                # Compute prediction entropy
                entropy = - pseudo_prob * tf.math.log(pseudo_prob+EPSILON)
                entropy = tf.math.reduce_sum(entropy, axis=-1)
                # Convert the logarithm base to units of the number of
                # classes. NOTE this makes the metric independent of the
                # number of classes and bounds the range to [0, 1].
                log_base = tf.math.log(np.float32(dataset.num_classes))
                entropy = entropy / log_base
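                # Sanity check: a uniform prediction p=1/C has entropy
                # log(C), so the normalized entropy is exactly 1.0 and the
                # derived confidence below is 0.0.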
                # Convert entropy to confidence
                pseudo_confidence = 1.0 - entropy
            elif alparams["measure"] == "margin":
                # Difference between the two largest entries in last dimension.
                values, indices = tf.math.top_k(pseudo_prob, k=2)
                pseudo_confidence = values[:,:,:,0] - values[:,:,:,1]
            elif alparams["measure"] == "confidence":
                # Reduce max over last dimension.
                pseudo_confidence = tf.math.reduce_max(pseudo_prob, axis=-1)
            else:
                raise NotImplementedError("Uncertainty function not implemented.")
            pseudo_mean_confidence = tf.reduce_mean(
                tf.cast(pseudo_confidence, tf.float64),
                axis=(1,2))
            # Pseudo annotate high-confidence unlabeled example pixels
            pseudo_mask = tf.where(tf.math.less(pseudo_confidence, alparams["threshold"]),
                                   tf.zeros_like(pseudo_label,
                                                 dtype=train_label.dtype),
                                   tf.ones_like(pseudo_label,
                                                dtype=train_label.dtype))
            # Pseudo annotation logic (think of it as @tf.cond mapped
            # over the batch dimension)
            train_label = tf.where(train_labelled, train_label,
                                   pseudo_label, name="MaybeGenLabel")
            train_mask  = tf.where(train_labelled, train_mask,
                                   pseudo_mask, name="MaybeGenMask")

    with tf.device("/device:GPU:1"):
        # Build validation network.
        val_logits = val_net(val_image, training=False)
        val_pred   = tf.math.argmax(val_logits, axis=-1,
                                    name="ValidationPredictions")

    # Build cost function
    with tf.name_scope("Cost"):
        with tf.device("/device:GPU:0"):
            # Establish loss function
            if hparams["softmax"]["multiscale"]:
                loss, loss_weights = \
                    tt.losses.multiscale_masked_softmax_cross_entropy(
                        train_label,
                        train_net.endpoint_outputs[0],
                        train_mask, dataset.num_classes,
                        weight=hparams["softmax"]["loginverse_scaling"],
                        label_smoothing=hparams["softmax"]["label_smoothing"],
                        scope="XEntropy")
                # NOTE: this will make @loss_weights checkpointed
                train_net.loss_scale_weights = loss_weights
            else:
                loss = tt.losses.masked_softmax_cross_entropy(
                    train_label,
                    train_logits,
                    train_mask, dataset.num_classes,
                    weight=hparams["softmax"]["loginverse_scaling"],
                    label_smoothing=hparams["softmax"]["label_smoothing"],
                    scope="XEntropy")
            cost = loss
            # Add regularization to cost function
            if len(train_net.losses) > 0:
                regularization_loss = tf.math.add_n(train_net.losses, name="Regularization")
                cost += tf.cast(regularization_loss, dtype=tf.float64)

            # Setup learning rate
            learning_rate = hparams["learning_rate"]
            if hparams["learning_rate_decay"] > 0.0:
                # Inverse time learning_rate if lr_decay specified
                learning_rate = tf.train.inverse_time_decay(
                    learning_rate, local_step,
                    decay_steps=train_batches,
                    decay_rate=hparams["learning_rate_decay"])

            # Create optimization procedure
            optimizer = tf.train.AdamOptimizer(learning_rate, **hparams["optimizer"]["kwargs"])

            # Create training op
            train_op  = optimizer.minimize(cost, global_step=local_step,
                                           name="TrainOp")
        # END tf.device("/device:GPU:0")
    # END tf.name_scope("Cost")

    # Create summary operations for training and validation network
    with tf.name_scope("Summary"):
        # Create colormap for image summaries
        colormap = tf.constant(dataset.colormap, dtype=tf.uint8,
                               name="Colormap")
        # Create metric evaluation and summaries
        with tf.device("/device:GPU:0"):
            with tf.name_scope("TrainMetrics"):
                # Create metrics object for training network.
                train_metrics = tt.metrics.Metrics(train_pred, train_label,
                                                   dataset.num_classes, train_mask)
                # Get Tensorflow update op.
                metric_update_op = train_metrics.get_update_op()
                # Get Tensorflow summary operations.
                metric_summaries = train_metrics.get_summaries()

            train_summary_iter = tf.summary.merge(
                [
                    # Summaries run at each iteration.
                    tf.summary.scalar("CrossEntropyLoss", loss,
                                      family="Losses"),
                    tf.summary.scalar("TotalCost", cost,
                                      family="Losses"),
                    tf.summary.scalar("LearningRate", learning_rate,
                                      family="Losses")
                ], name="IterationSummaries"
               )

            with tf.control_dependencies([metric_update_op]):
                train_summary_epoch = tf.summary.merge(
                    [
                        # Summaries run at epoch boundaries.
                        metric_summaries["Metrics"],
                        metric_summaries["ConfusionMat"]
                    ], name="EpochSummaries"
                   )

            train_image_summary = tf.summary.merge(
                [
                    tf.summary.image(
                        "PseudoLabel/input",
                        train_image_raw,
                        family="PseudoLabel"
                    ),
                    tf.summary.image(
                        "PseudoLabel/confidence",
                        tf.expand_dims(pseudo_confidence, axis=-1),
                        family="PseudoLabel"
                    ),
                    tf.summary.image(
                        "PseudoLabel", 
                        tf.gather(dataset.colormap,
                                  tf.cast(pseudo_label*pseudo_mask \
                                  + (1 - pseudo_mask)*255,
                                  tf.int32)),
                        family="PseudoLabel"
                    )
                ]
            )
        # Create metric evaluation and summaries
        with tf.device("/device:GPU:1"):
            with tf.name_scope("ValidationTestMetrics"):
                # Create metrics object
                val_metrics = tt.metrics.Metrics(val_pred, val_label,
                                                 dataset.num_classes, val_mask)
                # Get update tensorflow ops
                val_metric_update_op = val_metrics.get_update_op()
                # Get metric summaries
                val_metric_summaries = val_metrics.get_summaries()

                with tf.control_dependencies([val_metric_update_op]):
                    val_metric_summary = tf.summary.merge(
                        [
                            # "Expensive" summaries run at epoch boundaries.
                            val_metric_summaries["Metrics"],
                            val_metric_summaries["ClassMetrics"],
                            val_metric_summaries["ConfusionMat"]
                        ], name="EpochSummaries"
                    )
                    val_image_summary = tf.summary.merge(
                        [
                            tf.summary.image("Input", val_image),
                            tf.summary.image("Label", tf.gather(
                                colormap, tf.cast(val_label + 255*(1-val_mask),
                                                  tf.int32))),
                            tf.summary.image("Predictions", tf.gather(
                                colormap, tf.cast(val_pred, tf.int32)))
                        ]
                    )
                    val_summary_epoch = val_metric_summary
                    test_summary_epoch = tf.summary.merge([
                        val_metric_summary,
                        val_image_summary
                        ]
                    )
        conf_summary_ph = tf.placeholder(tf.float64, shape=[None])
        conf_summary = tf.summary.histogram("ConfidenceDistribution",
                                            conf_summary_ph)
    # END name_scope("Summary")

    # Create session with soft device placement
    #     - some ops need to run on the CPU
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        logger.debug("Initializing variables...")
        sess.run(tf.global_variables_initializer())


        # Create checkpoint object
        with tf.name_scope("Checkpoint"):
            checkpoint = tf.train.Checkpoint(model=train_net,
                                             epoch=epoch_step,
                                             step=global_step,
                                             optimizer=optimizer)
            checkpoint_name = os.path.join(args.log_dir, "model")
            if args.checkpoint is not None:
                # CMDline checkpoint given
                ckpt = args.checkpoint
                if os.path.isdir(ckpt):
                    ckpt = tf.train.latest_checkpoint(ckpt)
                if ckpt is None:
                    logger.error("Checkpoint path \"%s\" is invalid.")
                    return 1
                logger.info("Resuming from checkpoint \"%s\"" % ckpt)
                status = checkpoint.restore(ckpt)
                if tf.__version__ < "1.14.0":
                    status.assert_existing_objects_matched()
                else:
                    status.expect_partial()
                status.initialize_or_restore(sess)
                if args.reinitialize_output:
                    sess.run(train_net.Final.kernel.initializer)

            elif state["checkpoint"] != None:
                # Try to restore from checkpoint in logdir
                ckpt = state["checkpoint"]
                logger.info("Resuming from checkpoint \"%s\"" % ckpt)
                status = checkpoint.restore(ckpt)
                if tf.__version__ < "1.14.0":
                    status.assert_existing_objects_matched()
                else:
                    status.expect_partial()
                status.initialize_or_restore(sess)

            with tf.name_scope("UpdateValidationWeights"):
                update_val_op = []
                for i in range(len(val_net.layers)):
                    for j in range(len(val_net.layers[i].variables)):
                        update_val_op.append(
                            tf.assign(val_net.layers[i].variables[j],
                                      train_net.layers[i].variables[j]))
                update_val_op = tf.group(update_val_op)

        ckpt_manager = tt.checkpoint_manager.CheckpointManager(checkpoint,
                                                               args.log_dir)
        # END scope Checkpoint
        # Prepare global fetches dict
        fetches = {
            "train" : {
                "iteration" : {
                    "step"     : global_step_op,
                    "summary"  : train_summary_iter,
                    "train_op" : train_op,
                    "update"   : metric_update_op,
                    "updates"  : train_net.updates
                },
                "epoch"     : {
                    "step"     : epoch_step,
                    "summary"  : train_summary_epoch,
                    "summary/image" : train_image_summary
                }
            },
            "val"   : { # Validation and test fetches
                "iteration" : {
                    "update"   : val_metric_update_op
                },
                "epoch"     : {
                    "step"     : epoch_step,
                    "MeanIoU"  : val_metrics.metrics["MeanIoU"],
                    "summary"  : val_summary_epoch,
                    # Also add image summary; it is only written to the
                    # event file every N epochs.
                    "summary/image" : val_image_summary
                }
            },
            "test" : {
                "iteration" : {"update"  : val_metric_update_op},
                "epoch"     : {"summary" : test_summary_epoch}
            }
        }

        # Train loop (until convergence) -> Pick unlabeled examples -> test_loop
        def train_loop(summary_writer):
            """
            Train loop closure.
            Runs the training loop until no improvement is seen in
            @params["epochs"] epochs before returning.
            """
            # Number of epochs before @no_improvement_count starts counting
            _initial_grace_period = alparams["epochs/warm_up"]
            best_ckpt             = state["checkpoint"]
            best_mean_iou         = 0.0
            log_subdir            = summary_writer.get_logdir()
            run_name              = os.path.basename(log_subdir)
            checkpoint_prefix     = os.path.join(log_subdir, "model")
            num_iter_per_epoch    = np.maximum(train_input.size,
                                              val_input.size)
            no_improvement_count = 0
            while no_improvement_count < params["epochs"] \
                or _initial_grace_period >= 0:
                _initial_grace_period -= 1
                # Increment in-graph epoch counter.
                epoch = sess.run(epoch_step_inc)

                # Prepare inner loop iterator
                _iter = range(0, num_iter_per_epoch, params["batch_size"])
                if show_progress:
                    _iter = tqdm.tqdm(_iter, desc="%s[%d]" % (run_name, epoch),
                                      dynamic_ncols=True,
                                      ascii=True,
                                      postfix={"NIC": no_improvement_count})

                # Initialize iterators
                train_input_stage.init_iterator(
                    "train", sess, train_input.feed_dict)
                val_input_stage.init_iterator(
                    "val", sess, val_input.feed_dict)

                # Reset confusion matrices
                train_metrics.reset_metrics(sess)
                val_metrics.reset_metrics(sess)

                # Prepare iteration fetches
                _fetches = {
                    "train" : {"iteration" : fetches["train"]["iteration"]},
                    "val"   : {"iteration" : fetches["val"]["iteration"]}
                }
                # Update validation network weights
                sess.run(update_val_op)

                try:
                    for i in _iter:
                        if train_input.size-params["batch_size"] <= i < train_input.size:
                            # Fetches for last training iteration.
                            _fetches["train"]["epoch"] = fetches["train"]["epoch"]
                        if val_input.size-params["batch_size"] <= i < val_input.size:
                            _fetches["val"]["epoch"] = fetches["val"]["epoch"]

                        # Run fetches
                        results = sess.run(_fetches)

                        if "train" in results.keys():
                            # Add iteration summary
                            summary_writer.add_summary(
                                results["train"]["iteration"]["summary"],
                                results["train"]["iteration"]["step"])

                            # Maybe add epoch summary
                            if "epoch" in results["train"].keys():
                                summary_writer.add_summary(
                                    results["train"]["epoch"]["summary"],
                                    results["train"]["epoch"]["step"]
                                )
                                if results["train"]["epoch"]["step"] % 100 == 0:
                                    summary_writer.add_summary(
                                        results["train"]["epoch"]["summary/image"],
                                        results["train"]["epoch"]["step"]
                                    )
                                # Pop fetches to prevent OutOfRangeError due to
                                # asymmetric train/val input sizes.
                                _fetches.pop("train")

                        if "val" in results.keys() and \
                           "epoch" in results["val"].keys():
                            # Add summaries to event log.
                            summary_writer.add_summary(
                                results["val"]["epoch"]["summary"],
                                results["val"]["epoch"]["step"]
                            )
                            if results["val"]["epoch"]["step"] % 100 == 0:
                                # Only report image summary every 100th epoch.
                                summary_writer.add_summary(
                                    results["val"]["epoch"]["summary/image"],
                                    results["val"]["epoch"]["step"]
                                )
                            # Check if MeanIoU improved and
                            # update counter and best
                            if results["val"]["epoch"]["MeanIoU"] > best_mean_iou:
                                best_mean_iou = results["val"]["epoch"]["MeanIoU"]
                                # Update checkpoint file used for
                                # @tf.train.latest_checkpoint to point at
                                # current best.
                                _ckpt_name = ckpt_manager.commit(
                                    checkpoint_prefix, sess)
                                if _ckpt_name != "":
                                    best_ckpt = _ckpt_name
                                # Reset counter
                                no_improvement_count = 0
                            else:
                                # Result has not improved, increment counter.
                                no_improvement_count += 1
                                if no_improvement_count >= params["epochs"] and \
                                   _initial_grace_period < 0:
                                    _iter.close()
                                    break
                            if show_progress:
                                _iter.set_postfix(NIC=no_improvement_count)
                            # Pop fetches to prevent OutOfRangeError due to
                            # asymmetric train/val input sizes.
                            _fetches.pop("val")
                        # END "maybe add epoch summary"
                except tf.errors.OutOfRangeError:
                    logger.error("Out of range error. Attempting to continue.")

                summary_writer.flush()
                ckpt_manager.cache(sess)
            # END while no_improvement_count < params["epochs"]
            return best_ckpt

        def test_loop(summary_writer):
            """
            Test loop closure.
            """
            _step = len(labelled)
            # Initialize validation input stage with test set
            val_input_stage.init_iterator("test", sess, test_input.feed_dict)
            _iter = range(0, test_input.size, params["batch_size"])
            if show_progress:
                _iter = tqdm.tqdm(_iter, desc="test[%d]" % (_step),
                                  ascii=True,
                                  dynamic_ncols=True)
            summary_proto = None
            val_metrics.reset_metrics(sess)
            try:
                for i in _iter:
                    # Accumulate confusion matrix
                    if i < test_input.size - params["batch_size"]:
                        sess.run(fetches["test"]["iteration"]["update"])
                    else:
                        # Run summary operation last iteration
                        _, summary_proto = sess.run([fetches["test"]["iteration"]["update"],
                                                     fetches["test"]["epoch"]["summary"]])
            except tf.errors.OutOfRangeError:
                pass
            # Add summary with number of labelled examples as step.
            # NOTE this only runs on each major iteration.
            summary_writer.add_summary(
                summary_proto, _step
            )

        def rank_confidence():
            # Allocate array to store all confidence scores
            num_examples = len(state["dataset"]["train"]["filenames"])
            confidence = np.zeros(num_examples, dtype=np.float32)
            # Initialize input stage
            train_input_stage.init_iterator("train", sess,
                                            train_input.feed_dict)
            _iter = range(0, train_input.size, params["batch_size"])
            if show_progress:
                _iter = tqdm.tqdm(_iter, desc="ranking[%d]" % len(labelled),
                                  ascii=True,
                                  dynamic_ncols=True)
            try:
                for i in _iter:
                    # Loop over all examples and compute confidence
                    batch_confidence, batch_indices = sess.run(
                        [pseudo_mean_confidence, train_index])
                    # Add to list of confidence
                    confidence[batch_indices] = batch_confidence
            except tf.errors.OutOfRangeError:
                pass

            # Filter out labelled examples
            unlabelled_confidence = confidence[unlabelled]

            selection_size = np.minimum(len(unlabelled),
                                        alparams["selection_size"])
            # Get the lowest confidence indices of unlabelled subset
            example_indices = np.argpartition(unlabelled_confidence,
                                              selection_size)
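            # argpartition guarantees the @selection_size smallest scores
            # occupy the first @selection_size slots (in arbitrary order).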
            example_indices = example_indices[:selection_size]
            # Convert to indices into all filenames list
            low_conf_examples = unlabelled[example_indices]
            return low_conf_examples, unlabelled_confidence

        checkpoint_path = state["checkpoint"]
        # Only add graph to first event file
        _graph = sess.graph if checkpoint_path is None else None
        with tf.summary.FileWriter(args.log_dir, graph=_graph) as test_writer:
            iterations = alparams["iterations"]
            if iterations < 0:
                # Iterate until all data is consumed
                iterations = np.ceil(len(unlabelled)
                                     / float(alparams["selection_size"]))
                logger.info("Iteration count: %d" % iterations)

            while state["iteration"] < iterations:
                # Step 1: train_loop
                train_input.set_indices(labelled)

                if state["iteration"] == 0:
                    # Pretrain
                    log_subdir = os.path.join(args.log_dir, "pretrain")
                    # Only use labelled subset
                else:
                    # Any other iteration
                    log_subdir = os.path.join(args.log_dir, "iter-%d" %
                                              state["iteration"])
                    # Sample from the unlabelled set
                    p = alparams["pseudo_labelling_proportion"]
                    sample_size = int(len(labelled)*p/(1-p))
                    sample_size = np.minimum(sample_size, len(unlabelled))
                    train_input.set_sample_size(sample_size)

                # Create subdir if it doesn't exist
                if not os.path.exists(log_subdir):
                    os.mkdir(log_subdir)

                # Change checkpoint manager directory
                ckpt_manager.chdir(log_subdir)
                with tf.summary.FileWriter(log_subdir) as train_val_writer:
                    # Enter train loop
                    try:
                        checkpoint_path = train_loop(train_val_writer)
                    except KeyboardInterrupt as exception:
                        # Quickly store state
                        if ckpt_manager.latest_checkpoint != "":
                            state["checkpoint"] = ckpt_manager.latest_checkpoint
                        with open(state_filename, "w") as f:
                            json.dump(state, f, indent=2)
                            f.truncate()
                        raise exception


                # Reload best checkpoint
                status = checkpoint.restore(checkpoint_path)
                status.run_restore_ops(sess)
                sess.run(update_val_op)

                # Step 2: test_loop
                if test_input.size > 0:
                    # This step may be omitted on deployment
                    test_loop(test_writer)

                # Step 3: Find low confidence examples
                # Reset train_input to use all examples for ranking
                train_input.set_indices()
                if alparams["selection_size"] > 0:
                    low_conf_examples, unlabelled_conf = rank_confidence()
                    _hist_summary = sess.run(conf_summary,
                                             {conf_summary_ph: 
                                              unlabelled_conf})
                    test_writer.add_summary(_hist_summary, state["iteration"])
                else:
                    # A non-positive selection_size means examples are drawn
                    # randomly; |selection_size| of them are sampled.
                    selection_size = np.minimum(alparams["selection_size"],
                                                len(unlabelled))
                    if selection_size != 0:
                        low_conf_examples = np.random.choice(
                            unlabelled, np.abs(alparams["selection_size"]))
                    else:
                        low_conf_examples = []

                # (maybe) Pause for user to annotate
                to_annotate_indices = no_label_indices[np.isin(
                    no_label_indices, low_conf_examples)]

                while len(to_annotate_indices) > 0:
                    to_annotate = train_examples[to_annotate_indices]
                    to_annotate_basename = [os.path.basename(f)
                                            for f in to_annotate]
                    # Poll user for filenames of annotated examples
                    logger.info("Please annotate the following examples:\n%s" %
                                "\n".join(to_annotate_basename))
                    filenames = tkinter.filedialog.askopenfilename(
                        multiple=1,
                        filetypes=(("TFRecord", "*.tfrecord"),))

                    hit = [] # List of matching filename indices
                    for filename in filenames:
                        basename = os.path.basename(filename)
                        idx = -1
                        for i in range(len(to_annotate)):
                            if to_annotate[i].endswith(basename):
                                idx = i
                                break
                        if idx != -1:
                            # Update state filenames
                            train_examples[to_annotate_indices[idx]] = filename
                            hit.append(idx)
                        else:
                            logger.info("Unrecognized filepath: %s" % filename)
                    # Remove matched paths
                    to_annotate_indices = np.delete(to_annotate_indices, hit)


                # Remove annotated examples from unlabelled set
                no_label_indices = no_label_indices[np.isin(no_label_indices,
                                                             low_conf_examples,
                                                             invert=True)]


                logger.info(
                    "Moving following examples to labelled set:\n%s" %
                    "\n".join(train_examples[low_conf_examples].tolist())
                )
                # First make the update to the input stage before
                # committing the state change
                train_input_labelled[low_conf_examples] = True
                train_input.labelled = train_input_labelled


                # Step 4: Update state information
                labelled = np.append(labelled, low_conf_examples)
                unlabelled = unlabelled[np.isin(unlabelled, low_conf_examples,
                                            assume_unique=True, invert=True)]
                state["dataset"]["train"]["filenames"] = train_examples.tolist()
                state["dataset"]["train"]["labelled"] = labelled.tolist()
                state["dataset"]["train"]["unlabelled"] = unlabelled.tolist()
                state["iteration"] += 1
                state["checkpoint"] = checkpoint_path
                # Dump updated state
                with open(state_filename, "w") as f:
                    json.dump(state, f, indent=2)
                    f.truncate()
    return 0
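
The loss helpers tt.losses.masked_softmax_cross_entropy and its multiscale variant used above are project-specific and not shown on this page. A minimal sketch of what the masked variant presumably computes, ignoring the weight and label_smoothing options:

import tensorflow as tf

def masked_softmax_cross_entropy(labels, logits, mask, num_classes):
    # labels: [N, H, W] integer class ids; logits: [N, H, W, C];
    # mask: [N, H, W] with 1 for annotated pixels, 0 for ignored ones.
    one_hot = tf.one_hot(tf.cast(labels, tf.int32), num_classes)
    xent = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=one_hot, logits=logits)
    mask = tf.cast(mask, xent.dtype)
    # Average the per-pixel loss over annotated pixels only.
    return tf.reduce_sum(xent * mask) / tf.maximum(tf.reduce_sum(mask), 1.0)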
Example #5
def main(args):
    # Retrieve dataset specific object
    if args.dataset == "cityscapes":
        dataset = datasets.Cityscapes(coarse=args.coarse)
    elif args.dataset == "freiburg":
        dataset = datasets.Freiburg()
    elif args.dataset == "vistas":
        dataset = datasets.Vistas()
    else:
        raise NotImplementedError("Dataset \"%s\" not supported" %
                                  args.dataset)
    # Gather train and validation paths
    train_paths = os.path.join(args.data_dir, "train")
    val_paths = os.path.join(args.data_dir, "val")
    # Retrieve training parameters
    params = args.params
    hparams = params["hyperparams"]

    with tf.device("/device:CPU:0"):
        with tf.name_scope("Datasets"):
            # Setup input pipelines
            train_input = tt.input.InputStage(input_shape=[
                params["network"]["input"]["height"],
                params["network"]["input"]["width"]
            ])
            val_input = tt.input.InputStage(input_shape=[
                params["network"]["input"]["height"],
                params["network"]["input"]["width"]
            ])

            # Add datasets
            train_examples = train_input.add_dataset(
                "train",
                train_paths,
                batch_size=params["batch_size"],
                epochs=1,
                augment=True)
            val_examples = val_input.add_dataset(
                "val", val_paths, batch_size=params["batch_size"], epochs=1)
            # Calculate number of batches
            train_batches = (train_examples - 1) // params["batch_size"] + 1
            val_batches = (val_examples - 1) // params["batch_size"] + 1

            # Get iterator outputs
            _, train_image, train_label, train_mask = train_input.get_output()
            val_image, val_label, val_mask = val_input.get_output()

        # Create step variables
        with tf.variable_scope("StepCounters"):
            # I'll use one local (to this run) and a global step that
            # will be checkpointed in order to run various schedules on
            # the learning rate decay policy.
            global_step = tf.Variable(0,
                                      dtype=tf.int64,
                                      trainable=False,
                                      name="GlobalStep")
            local_step = tf.Variable(0,
                                     dtype=tf.int64,
                                     trainable=False,
                                     name="LocalStep")
            global_step_op = global_step + local_step
            epoch_step = tf.Variable(0, trainable=False, name="EpochStep")
            epoch_step_inc = tf.assign_add(epoch_step, 1, name="EpochStepInc")

    regularization = {}
    if hparams["weight_reg"]["L2"] > 0.0 \
        or hparams["weight_reg"]["L1"] > 0.0:
        regularization = {
            "weight_regularization":
            tf.keras.regularizers.l1_l2(l1=hparams["weight_reg"]["L1"],
                                        l2=hparams["weight_reg"]["L2"]),
            "regularization_scaling":
            hparams["weight_reg"]["glorot_scaling"]
        }
    # Build training and validation network and get prediction output
    train_net = models.ENet(dataset.num_classes, **regularization)
    val_net = models.ENet(dataset.num_classes)
    with tf.device("/device:GPU:0"):
        train_logits = train_net(train_image, training=True)
        train_pred = tf.math.argmax(train_logits,
                                    axis=-1,
                                    name="TrainPredictions")

    with tf.device("/device:GPU:1"):
        val_logits = val_net(val_image, training=False)
        val_pred = tf.math.argmax(val_logits,
                                  axis=-1,
                                  name="ValidationPredictions")

    # Build cost function
    with tf.name_scope("Cost"):
        with tf.device("/device:GPU:0"):
            # Establish loss function
            if hparams["softmax"]["multiscale"]:
                loss, loss_weights = \
                    tt.losses.multiscale_masked_softmax_cross_entropy(
                        train_label,
                        train_net.endpoint_outputs[0],
                        train_mask, dataset.num_classes,
                        weight=hparams["softmax"]["loginverse_scaling"],
                        label_smoothing=hparams["softmax"]["label_smoothing"],
                        scope="XEntropy")
                # NOTE: this will make @loss_weights checkpointed
                train_net.loss_scale_weights = loss_weights
            else:
                loss = tt.losses.masked_softmax_cross_entropy(
                    train_label,
                    train_logits,
                    train_mask,
                    dataset.num_classes,
                    weight=hparams["softmax"]["loginverse_scaling"],
                    label_smoothing=hparams["softmax"]["label_smoothing"],
                    scope="XEntropy")
            cost = loss
            # Add regularization to cost function
            if len(train_net.losses) > 0:
                regularization_loss = tf.math.add_n(train_net.losses,
                                                    name="Regularization")
                cost += tf.cast(regularization_loss, dtype=tf.float64)

            # Setup learning rate
            learning_rate = hparams["learning_rate"]
            if hparams["learning_rate_decay"] > 0.0:
                # Apply inverse-time decay when a decay rate is specified
                learning_rate = tf.train.inverse_time_decay(
                    learning_rate,
                    local_step,
                    decay_steps=train_batches,
                    decay_rate=hparams["learning_rate_decay"])
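            # With inverse-time decay the rate at step s is
            #   learning_rate / (1 + decay_rate * s / decay_steps);
            # driving it with local_step restarts the schedule each run.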

            # Create optimization procedure
            optimizer = tf.train.AdamOptimizer(
                learning_rate, **hparams["optimizer"]["kwargs"])

            # Create training op
            train_op = optimizer.minimize(cost,
                                          global_step=local_step,
                                          name="TrainOp")
            # NOTE: batchnorm moving averages and metrics are not tied to
            # train_op; they are updated through the extra "updates" and
            # "update" fetches run on every training iteration.

    # Create summary operations for training and validation network
    with tf.name_scope("Summary"):
        # Create colormap for image summaries
        colormap = tf.constant(dataset.colormap,
                               dtype=tf.uint8,
                               name="Colormap")
        # Create metric evaluation and summaries
        with tf.device("/device:GPU:0"):
            with tf.name_scope("TrainMetrics"):
                train_metrics = tt.metrics.Metrics(train_pred, train_label,
                                                   dataset.num_classes,
                                                   train_mask)
                metric_update_op = train_metrics.get_update_op()
                metric_summaries = train_metrics.get_summaries()

            train_summary_iter = tf.summary.merge(
                [
                    tf.summary.scalar("CrossEntropyLoss", loss,
                                      family="Losses"),
                    tf.summary.scalar("TotalCost", cost, family="Losses"),
                    tf.summary.scalar("LearningRate", learning_rate,
                                      family="Losses")
                ],
                name="IterationSummaries")
            with tf.control_dependencies([metric_update_op]):
                train_summary_epoch = tf.summary.merge(
                    [
                        metric_summaries["Metrics"],
                        metric_summaries["ConfusionMat"],
                    ],
                    name="EpochSummaries")

        # Create metric evaluation and summaries
        with tf.device("/device:GPU:1"):
            with tf.name_scope("ValidationMetrics"):
                val_metrics = tt.metrics.Metrics(val_pred, val_label,
                                                 dataset.num_classes, val_mask)
                val_metric_update_op = val_metrics.get_update_op()
                val_metric_summaries = val_metrics.get_summaries()

                with tf.control_dependencies([val_metric_update_op]):
                    val_summary_epoch = tf.summary.merge(
                        [
                            val_metric_summaries["Metrics"],
                            val_metric_summaries["ClassMetrics"],
                            val_metric_summaries["ConfusionMat"],
                            tf.summary.image("Input", val_image),
                            # Masked-out pixels are shifted to index 255,
                            # which is assumed to map to the dataset's
                            # "void" entry in the colormap.
                            tf.summary.image(
                                "Label",
                                tf.gather(
                                    colormap,
                                    tf.cast(val_label + 255 * (1 - val_mask),
                                            tf.int32))),
                            tf.summary.image(
                                "Predictions",
                                tf.gather(colormap,
                                          tf.cast(val_pred, tf.int32)))
                        ],
                        name="EpochSummaries")
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
        # Dump the parameter configuration (params); only written when
        # the log directory is first created
        with open(os.path.join(args.log_dir, "config.json"), "w+") as f:
            json.dump(params, f, indent=4, sort_keys=True)

    # Create session with soft device placement
    #     - some ops need to run on the CPU
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=sess_config) as sess:
        # Initialize/restore model variables
        logger.debug("Initializing model...")
        sess.run(tf.global_variables_initializer())
        # Create summary writer objects
        summary_writer = tf.summary.FileWriter(args.log_dir, graph=sess.graph)

        # Create checkpoint object
        with tf.name_scope("Checkpoint"):
            checkpoint = tf.train.Checkpoint(model=train_net,
                                             epoch=epoch_step,
                                             step=global_step,
                                             optimizer=optimizer)
            checkpoint_name = os.path.join(args.log_dir, "model")

            # Resolve which checkpoint (if any) to restore from: an
            # explicit args.checkpoint takes precedence over the latest
            # checkpoint in the log directory.
            ckpt = None
            if args.checkpoint is not None:
                ckpt = args.checkpoint
                if os.path.isdir(ckpt):
                    ckpt = tf.train.latest_checkpoint(ckpt)
                if ckpt is None:
                    logger.error("Checkpoint path \"%s\" is invalid."
                                 % args.checkpoint)
                    return 1
            else:
                ckpt = tf.train.latest_checkpoint(args.log_dir)

            if ckpt is not None:
                logger.info("Resuming from checkpoint \"%s\"" % ckpt)
                status = checkpoint.restore(ckpt)
                # Compare version components numerically; a plain string
                # comparison mishandles e.g. "1.9.0" vs "1.14.0".
                if tuple(int(v) for v in tf.__version__.split(".")[:2]) \
                        < (1, 14):
                    status.assert_existing_objects_matched()
                else:
                    status.expect_partial()
                status.initialize_or_restore(sess)

            with tf.name_scope("UpdateValidationWeights"):
                update_val_op = []
                for i in range(len(val_net.layers)):
                    for j in range(len(val_net.layers[i].variables)):
                        update_val_op.append(
                            tf.assign(val_net.layers[i].variables[j],
                                      train_net.layers[i].variables[j]))
                update_val_op = tf.group(update_val_op)
        # END scope Checkpoint

        # Prepare fetches
        fetches = {
            "train": {
                "iteration": {
                    "step": global_step_op,
                    "summary": train_summary_iter,
                    "train_op": train_op,
                    "update": metric_update_op,
                    "updates": train_net.updates
                },
                "epoch": {
                    "step": epoch_step,
                    "summary": train_summary_epoch
                }
            },
            "val": {
                "iteration": {
                    "update": val_metric_update_op
                },
                "epoch": {
                    "step": epoch_step,
                    "summary": val_summary_epoch
                }
            }
        }
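        # "iteration" fetches run every batch; "epoch" fetches (metric
        # summaries) are attached dynamically on each split's last batch
        # so they are evaluated exactly once per epoch.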
        #run_options  = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        #run_metadata = tf.RunMetadata()
        logger.info("Starting training loop...")
        results = {}
        for epoch in range(1, params["epochs"] + 1):
            # Create iterator counter to track progress
            _iter = range(0, train_batches)
            if show_progress:
                _iter = tqdm.tqdm(_iter,
                                  desc="train[%3d/%3d]" %
                                  (epoch, params["epochs"]),
                                  ascii=True,
                                  dynamic_ncols=True)
            # Initialize input stage
            train_input.init_iterator("train", sess)
            val_input.init_iterator("val", sess)
            # Initialize or update validation network
            sess.run(update_val_op)
            # Reset for another round
            train_metrics.reset_metrics(sess)
            val_metrics.reset_metrics(sess)
            # Prepare initial fetches
            _fetches = {
                "train": {
                    "iteration": fetches["train"]["iteration"]
                },
                "val": {
                    "iteration": fetches["val"]["iteration"]
                }
            }

            for i in _iter:
                try:
                    # Dynamically update fetches
                    if i == train_batches - 1:
                        _fetches["train"]["epoch"] = fetches["train"]["epoch"]
                    if i == val_batches - 1:
                        _fetches["val"]["epoch"] = fetches["val"]["epoch"]
                    elif i == val_batches:
                        summary_writer.add_summary(
                            results["val"]["epoch"]["summary"],
                            results["val"]["epoch"]["step"])
                        _fetches.pop("val")
                    # Execute fetches
                    results = sess.run(
                        _fetches
                        #,options=run_options,
                        #run_metadata=run_metadata
                    )
                except tf.errors.OutOfRangeError:
                    # The single-epoch iterators raise this once they are
                    # exhausted; the previous results are reused below.
                    pass
                # Update summaries
                summary_writer.add_summary(
                    results["train"]["iteration"]["summary"],
                    results["train"]["iteration"]["step"])
                #summary_writer.add_run_metadata(run_metadata, "step=%d" % i)

            # Update epoch counter
            _epoch = sess.run(epoch_step_inc)

            # Update epoch summaries
            summary_writer.add_summary(results["train"]["epoch"]["summary"],
                                       results["train"]["epoch"]["step"])
            summary_writer.flush()
            # Save checkpoint
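            # (tf.train.Checkpoint.save appends an incrementing counter,
            #  writing model-1, model-2, ... under args.log_dir)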
            checkpoint.save(checkpoint_name, sess)

        ### FINAL VALIDATION ###
        _fetches = {"val": {"iteration": fetches["val"]["iteration"]}}
        _iter = range(0, val_batches)
        if show_progress:
            _iter = tqdm.tqdm(_iter,
                              desc="val[%3d/%3d]" %
                              (params["epochs"], params["epochs"]))
        # Re-initialize the validation iterator and sync weights from
        # the final training state
        val_input.init_iterator("val", sess)
        sess.run(update_val_op)
        for i in _iter:
            try:
                if i >= val_batches - 1:
                    _fetches["val"]["epoch"] = fetches["val"]["epoch"]
                results = sess.run(_fetches)
            except tf.errors.OutOfRangeError:
                pass
        # Add final validation summary update
        summary_writer.add_summary(results["val"]["epoch"]["summary"],
                                   results["val"]["epoch"]["step"])
        # Close summary file
        summary_writer.close()
        logger.info("Training successfully finished %d epochs" %
                    params["epochs"])
    return 0