def main(args):
    if args.dataset.lower() == "cityscapes":
        use_coarse = (args.extra is not None
                      and "coarse" in args.extra[0].lower())
        dataset = datasets.Cityscapes(use_coarse)
    elif args.dataset.lower() == "freiburg":
        modalities = None if args.extra is None else args.extra
        dataset = datasets.Freiburg(modalities)
    elif args.dataset.lower() == "vistas":
        dataset = datasets.Vistas()
    else:
        raise ValueError("Invalid argument \"dataset\": %s" % args.dataset)

    if os.path.exists(args.data_dir):
        dataset_paths = dataset.file_associations(args.data_dir)
    else:
        raise ValueError("Dataset path does not exist\n%s\n" % args.data_dir)

    if not os.path.exists(args.output_dir):
        sys.stdout.write("Directory \"%s\" does not exist. " % args.output_dir)
        sys.stdout.write("Do you want to create it? [y/N] ")
        sys.stdout.flush()
        user_input = sys.stdin.read(1)
        if user_input.lower()[0] != "y":
            sys.exit(0)
        else:
            os.makedirs(args.output_dir)

    for split in dataset_paths.keys():
        # Create directory for the split
        split_path = os.path.join(args.output_dir, split)
        if not os.path.exists(split_path):
            os.mkdir(split_path)
        try:
            p = multiprocessing.Pool()
            examples = list(dataset_paths[split].items())
            _record_example = partial(record_example,
                                      scale=args.scale_factor,
                                      dataset=dataset,
                                      split_path=split_path)
            # Progress bar
            if show_progress:
                example_iter = tqdm(
                    p.imap_unordered(_record_example, examples),
                    total=len(examples),
                    desc="%-7s" % split).__iter__()
            else:
                example_iter = p.imap_unordered(_record_example, examples)
            # Retrieve a single prototype feature dict ...
            features = next(example_iter)
            # ... and let the pool process the remaining examples
            for _ in example_iter:
                pass
        finally:
            p.close()

    # Write the feature keys so the record contents can be reconstructed
    # dynamically when reading the records back.
    meta_file = os.path.join(args.output_dir, "meta.txt")
    with open(meta_file, "w") as f:
        f.write("\n".join(features.keys()))
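
# The record-writing paths in these scripts rely on small tf.train.Feature
# wrappers that are referenced but not shown here. A minimal sketch of what
# @_bytes_feature / @_int64_feature are assumed to look like (the actual
# implementations may differ, e.g. in how strings are encoded):
def _bytes_feature(value):
    """Wrap a bytes/str value in a tf.train.Feature."""
    if isinstance(value, str):
        value = value.encode("utf-8")
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    """Wrap an integer value in a tf.train.Feature."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[int(value)]))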
def main(args):
    tf.logging.set_verbosity(tf.logging.ERROR)
    dataset = None
    if args.dataset.lower() == "cityscapes":
        dataset = datasets.Cityscapes(args.use_coarse)
    elif args.dataset.lower() == "freiburg":
        dataset = datasets.Freiburg(args.modalities)
    elif args.dataset.lower() == "vistas":
        dataset = datasets.Vistas()
    elif args.dataset.lower() == "generic":
        dataset = datasets.Generic(args.image_dir, args.label_dir)
    else:
        raise ValueError("Dataset \"%s\" not supported." % args.dataset)

    if not os.path.exists(args.output_dir):
        sys.stdout.write("Directory \"%s\" does not exist. " % args.output_dir)
        sys.stdout.write("Do you want to create it? [y/N] ")
        sys.stdout.flush()
        user_input = sys.stdin.read(1)
        if user_input.lower()[0] != "y":
            sys.exit(0)
        else:
            os.makedirs(args.output_dir)

    file_associations = dataset.file_associations(args.data_dir)
    sess = tf.Session()
    for split in file_associations:
        # Create path to split
        split_path = os.path.join(args.output_dir, split)
        if not os.path.exists(split_path):
            os.mkdir(split_path)
        # Create generator and retrieve the length
        generator, output_len = generator_from_file_associations(
            file_associations[split])
        # Create dataset from generator
        tf_dataset = tf.data.Dataset.from_generator(
            generator,
            output_types=(tf.string, tf.string),
            output_shapes=(output_len, output_len))
        # Bind the fixed arguments @dataset / @split_path to the map functions
        _read_images = lambda x, y: read_images(x, y, dataset, args.width)
        _tf_write_serialized_example = lambda x, y, z, u: \
            tf_write_serialized_example(x, y, z, u, split_path)
        # Map the above functions
        tf_dataset = tf_dataset.map(_read_images,
                                    num_parallel_calls=_NUM_CPUS - 1)
        tf_dataset = tf_dataset.map(_tf_write_serialized_example,
                                    num_parallel_calls=_NUM_CPUS - 1)
        tf_dataset = tf_dataset.batch(_NUM_CPUS - 1)
        # Create iterator
        _iter = tf_dataset.make_one_shot_iterator()
        _next = _iter.get_next()
        # Run over all examples
        with tqdm.tqdm(total=len(file_associations[split]),
                       ascii=" #",
                       desc="%-6s" % split,
                       dynamic_ncols=True) as pbar:
            while True:
                try:
                    filenames = sess.run(_next)
                    pbar.update(len(filenames))
                except tf.errors.OutOfRangeError:
                    break
    sess.close()
    return 0
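
# @generator_from_file_associations is referenced above but not defined in
# this file. A minimal sketch of its assumed behavior, given that each split
# maps example IDs to {modality: path} dicts and that the generator must
# yield two equal-length string vectors per example (the real helper may do
# additional bookkeeping):
def generator_from_file_associations(split_associations):
    # Fix the modality ordering so every yielded tuple lines up.
    keys = sorted(next(iter(split_associations.values())).keys())
    def _generator():
        for example_id, paths in split_associations.items():
            yield ([example_id] * len(keys),
                   [paths[k] for k in keys])
    return _generator, len(keys)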
def main(args):
    dataset = None
    scale_factor = args.scale_factor
    scale_factor_image = None
    scale_factor_label = None
    if args.dataset.lower() == "cityscapes":
        use_coarse = (args.extra is not None
                      and "coarse" in args.extra[0].lower())
        dataset = datasets.Cityscapes(use_coarse)
    elif args.dataset.lower() == "freiburg":
        modalities = None if args.extra is None else args.extra
        dataset = datasets.Freiburg(modalities)
    elif args.dataset.lower() == "vistas":
        dataset = datasets.Vistas()
    else:
        raise ValueError("Invalid argument \"dataset\": %s" % args.dataset)

    ################ Build Tensorflow Graph ##################
    input_filename = tf.placeholder(dtype=tf.string)
    file_contents = tf.read_file(input_filename)
    # Separate heads for decoding png or jpg
    image_decoding = tf.image.decode_image(file_contents)
    label_decoding = tf.image.decode_image(file_contents)
    # Get the shapes of image / label in order to assert them equal
    image_shape = tf.shape(image_decoding)
    label_shape = tf.shape(label_decoding)
    if args.width is not None:
        scale_factor_image = image_shape[1] / args.width
        scale_factor_label = label_shape[1] / args.width
    if scale_factor_image is not None:
        scale_factors_image = tf.stack(
            [scale_factor_image, scale_factor_image, 1])
        scale_factors_label = tf.stack(
            [scale_factor_label, scale_factor_label, 1])
        # Compute rescaled shapes
        image_shape = tf.cast(
            tf.round(tf.cast(image_shape, tf.float64) / scale_factors_image),
            tf.int32)
        label_shape = tf.cast(
            tf.round(tf.cast(label_shape, tf.float64) / scale_factors_label),
            tf.int32)
        image_decoding = tf.image.resize_nearest_neighbor(
            tf.expand_dims(image_decoding, axis=0), image_shape[:-1])
        label_decoding = tf.image.resize_nearest_neighbor(
            tf.expand_dims(label_decoding, axis=0), label_shape[:-1])
        image_decoding = tf.squeeze(image_decoding, axis=0)
        label_decoding = tf.squeeze(label_decoding, axis=0)
    image_encoding = tf.cond(
        tf.strings.regex_full_match(input_filename, r".+\.png$"),
        true_fn=lambda: tf.image.encode_png(image_decoding),
        false_fn=lambda: tf.image.encode_jpeg(image_decoding))
    # Remapping of labels (can only be png)
    embedding = tf.constant(dataset.embedding, dtype=tf.uint8)
    label_remapped = tf.gather_nd(embedding,
                                  tf.cast(label_decoding, tf.int32))
    label_remapped = tf.expand_dims(label_remapped, axis=-1)
    label_encoding = tf.image.encode_png(label_remapped)
    # In order to convert tiff to png
    tif_input_image = tf.placeholder(tf.uint8, shape=[None, None, None])
    tif_png_encoding = tf.image.encode_png(tif_input_image)
    ##########################################################

    if os.path.exists(args.data_dir):
        dataset_paths = dataset.file_associations(args.data_dir)
    else:
        raise ValueError("Dataset path does not exist\n%s\n" % args.data_dir)

    if not os.path.exists(args.output_dir):
        sys.stdout.write("Directory \"%s\" does not exist. " % args.output_dir)
        sys.stdout.write("Do you want to create it? [y/N] ")
        sys.stdout.flush()
        user_input = sys.stdin.read(1)
        if user_input.lower()[0] != "y":
            sys.exit(0)
        else:
            os.makedirs(args.output_dir)

    # Create session on CPU
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = ""
    sess = tf.Session(config=config)

    # Write records for each split
    for split in dataset_paths.keys():
        # Create directory for the split
        split_path = os.path.join(args.output_dir, split)
        if not os.path.exists(split_path):
            os.mkdir(split_path)
        # Progress bar
        if show_progress:
            example_iter = tqdm(list(dataset_paths[split].items()),
                                desc="%-7s" % split,
                                ascii=True,
                                dynamic_ncols=True)
        else:
            example_iter = list(dataset_paths[split].items())
        # Iterate over all examples in the split and gather the samples in
        # separate records.
        for example in example_iter:
            # example = (str(ID), dict({str(type): str(path)}))
            features = {}
            shapes = []
            for _type in example[1].keys():
                # Only the "label" key needs special treatment; everything
                # else is assumed to contain image data (rgb/nir/depthmap).
                path = example[1][_type]
                ext = path.split(".")[-1]  # path extension
                if "label" in _type:  # label data
                    # Check file extension
                    if ext != "png":
                        raise ValueError(
                            "The label images need to be png files! "
                            "Got \"%s\"" % ext)
                    label, shape = sess.run(
                        fetches=[label_encoding, label_shape],
                        feed_dict={input_filename: path})
                    features["label"] = _bytes_feature(label)
                else:  # image data
                    # Handle the different file extensions separately
                    if ext == "tif" or ext == "tiff":
                        # Read the image and convert it to png
                        ext = "png"
                        # Read the image as is (iscolor=-1)
                        image = cv2.imread(path, -1)
                        shape = image.shape
                        if len(shape) == 3 and shape[-1] == 3:
                            # OpenCV defaults to BGR whereas Tensorflow
                            # uses RGB
                            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                        elif len(shape) == 2:
                            image = np.expand_dims(image, axis=-1)
                        image = sess.run(tif_png_encoding,
                                         feed_dict={tif_input_image: image})
                    elif ext == "png" or ext == "jpg" or ext == "jpeg":
                        image, shape = sess.run(
                            fetches=[image_encoding, image_shape],
                            feed_dict={input_filename: path})
                    else:
                        raise ValueError(
                            "Unsupported image format \"%s\"" % ext)
                    if len(shape) == 3:
                        channels = shape[2]
                    else:
                        channels = 1
                    # Note that @_type/data is the raw image encoding
                    features[_type + "/channels"] = _int64_feature(channels)
                    features[_type + "/data"] = _bytes_feature(image)
                    features[_type + "/encoding"] = _bytes_feature(ext)
                shapes.append(shape)
            # END for _type in example[1].keys()

            # Check that the shapes are consistent
            for i in range(1, len(shapes)):
                if shapes[i][0] != shapes[i-1][0] or \
                   shapes[i][1] != shapes[i-1][1]:
                    raise ValueError("Image dimensions do not match the "
                                     "label.\nGot: %s" % shapes)
            # Add shape info to the feature dict. Note that the channels are
            # already added, and the label image is assumed to be a single
            # channel png image.
            features["height"] = _int64_feature(shape[0])
            features["width"] = _int64_feature(shape[1])
            features["id"] = _bytes_feature(example[0])

            # Construct the feature example
            tf_features = tf.train.Features(feature=features)
            tf_example = tf.train.Example(features=tf_features)
            filename = example[0] + ".tfrecord"
            with tf.io.TFRecordWriter(
                    os.path.join(split_path, filename)) as f:
                f.write(tf_example.SerializeToString())

    # Write the feature keys so the record contents can be reconstructed
    # dynamically when reading the records back.
    meta_file = os.path.join(args.output_dir, "meta.txt")
    with open(meta_file, "w") as f:
        f.write("\n".join(features.keys()))
def main(args, logger):
    # Retrieve training parameters for convenience
    params = args.params                   # All parameters
    hparams = params["hyperparams"]        # Hyperparameters
    alparams = params["active_learning"]   # Active learning parameters
    state = None                           # State dict

    # Define state and config filenames
    state_filename = os.path.join(args.log_dir, "state.json")
    config_filename = os.path.join(args.log_dir, "config.json")
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    # Dump parameter config
    with open(config_filename, "w+") as f:
        json.dump(params, f, indent=4)

    # Retrieve the dataset-specific object
    if args.dataset == "cityscapes":
        dataset = datasets.Cityscapes(coarse=args.coarse)
        test_examples_glob = os.path.join(args.data_dir, "val", "*.tfrecord")
    elif args.dataset == "freiburg":
        dataset = datasets.Freiburg()
        test_examples_glob = os.path.join(args.data_dir, "test", "*.tfrecord")
    elif args.dataset == "vistas":
        dataset = datasets.Vistas()
        test_examples_glob = os.path.join(args.data_dir, "val", "*.tfrecord")
    else:
        raise NotImplementedError("Dataset \"%s\" not supported"
                                  % args.dataset)

    # Prepare dataset example file paths.
    train_examples_glob = os.path.join(args.data_dir, "train", "*.tfrecord")

    if not os.path.exists(state_filename):
        # Initialize state
        # Resolve example filenames
        train_val_examples = np.sort(np.array(glob.glob(train_examples_glob)))
        # Pick examples from the training set to use for validation
        val_examples = train_val_examples[:alparams["num_validation"]]
        # Use the rest as training examples
        train_examples = train_val_examples[alparams["num_validation"]:]
        # Use the annotated test set. NOTE: the cityscapes validation set.
        test_examples = np.array(glob.glob(test_examples_glob))

        # Draw random train examples and mark them as annotated
        train_indices = np.arange(len(train_examples), dtype=np.int32)
        np.random.shuffle(train_indices)

        initially_labelled = alparams["num_initially_labelled"]
        if initially_labelled < 0:
            # Use the rest of the labelled examples
            initially_labelled = len(train_examples)

        # Possibly add actually unlabelled examples
        no_label_indices = np.empty(0, dtype=str)
        if args.unlabelled is not None:
            no_label_glob = os.path.join(args.unlabelled, "*.tfrecord")
            no_label_examples = glob.glob(no_label_glob)
            no_label_indices = np.arange(
                len(train_indices),
                len(train_indices) + len(no_label_examples))
            train_examples = np.concatenate((train_examples,
                                             no_label_examples))
            train_indices = np.concatenate((train_indices, no_label_indices))

        labelled = train_indices[:initially_labelled]
        unlabelled = train_indices[initially_labelled:]
        del train_indices

        # Setup initial state
        state = {
            "checkpoint": None,  # Keep track of the latest checkpoint.
            "iteration": 0,
            "dataset": {
                "train": {
                    "filenames": list(train_examples),
                    "labelled": labelled.tolist(),
                    "unlabelled": unlabelled.tolist(),
                    "no_label": no_label_indices.tolist()
                },
                "val": {
                    "filenames": list(val_examples)
                },
                "test": {
                    "filenames": list(test_examples)
                }
            }
        }
        with open(state_filename, "w+") as f:
            json.dump(state, f, indent=2)
    else:
        # Load state
        with open(state_filename, "r") as f:
            state = json.load(f)
        # Extract filename properties
        train_examples = np.array(state["dataset"]["train"]["filenames"])
        val_examples = np.array(state["dataset"]["val"]["filenames"])
        test_examples = np.array(state["dataset"]["test"]["filenames"])
        labelled = np.array(state["dataset"]["train"]["labelled"])
        unlabelled = np.array(state["dataset"]["train"]["unlabelled"])
        no_label_indices = np.array(state["dataset"]["train"]["no_label"])

    train_input_labelled = np.full_like(train_examples, False, dtype=bool)
    train_input_labelled[labelled] = True
    train_input_indices = np.arange(len(train_examples))

    with tf.device("/device:CPU:0"):
        with tf.name_scope("Datasets"):
            # Create input placeholders
            train_input = tt.input.NumpyCapsule()
            train_input.filenames = train_examples
            train_input.labelled = train_input_labelled
            train_input.indices = train_input_indices

            val_input = tt.input.NumpyCapsule()
            val_input.filenames = val_examples
            test_input = tt.input.NumpyCapsule()
            test_input.filenames = test_examples

            # Setup input pipelines
            train_input_stage = tt.input.InputStage(
                input_shape=[params["network"]["input"]["height"],
                             params["network"]["input"]["width"]])
            # Validation AND test input stage
            val_input_stage = tt.input.InputStage(
                input_shape=[params["network"]["input"]["height"],
                             params["network"]["input"]["width"]])

            # Add datasets
            train_input_stage.add_dataset_from_placeholders(
                "train", train_input.filenames,
                train_input.labelled, train_input.indices,
                batch_size=params["batch_size"],
                augment=True)
            # Validation set
            val_input_stage.add_dataset_from_placeholders(
                "val", val_input.filenames,
                batch_size=params["batch_size"])
            # Test set
            val_input_stage.add_dataset_from_placeholders(
                "test", test_input.filenames,
                batch_size=params["batch_size"])
            # Calculate the number of batches in each iterator
            val_batches = (len(val_examples) - 1) // params["batch_size"] + 1
            test_batches = (len(test_examples) - 1) // params["batch_size"] + 1

            # Get iterator outputs
            train_image_raw, train_image, train_label, train_mask, \
                train_labelled, train_index = train_input_stage.get_output()
            val_image, val_label, val_mask = val_input_stage.get_output()

        # Create step variables
        with tf.variable_scope("StepCounters"):
            global_step = tf.Variable(0, dtype=tf.int64,
                                      trainable=False, name="GlobalStep")
            local_step = tf.Variable(0, dtype=tf.int64,
                                     trainable=False, name="LocalStep")
            global_step_op = tf.assign_add(global_step, local_step)
            epoch_step = tf.Variable(0, trainable=False, name="EpochStep")
            epoch_step_inc = tf.assign_add(epoch_step, 1)

    # Build the training and validation networks
    regularization = {"drop_rates": hparams["dropout_rates"]}
    if hparams["weight_reg"]["L2"] > 0.0 \
            or hparams["weight_reg"]["L1"] > 0.0:
        regularization = {
            "weight_regularization": tf.keras.regularizers.l1_l2(
                l1=hparams["weight_reg"]["L1"],
                l2=hparams["weight_reg"]["L2"]),
            "regularization_scaling": hparams["weight_reg"]["glorot_scaling"],
        }

    # Initialize networks
    train_net = models.ENet(dataset.num_classes, **regularization)
    val_net = models.ENet(dataset.num_classes)

    with tf.device("/device:GPU:0"):
        # Build graph for training
        train_logits = train_net(train_image, training=True)
        # Compute predictions: use @train_pred for metrics and
        # @pseudo_label for the pseudo-annotation process.
        train_pred = tf.math.argmax(train_logits, axis=-1,
                                    name="TrainPredictions")

        with tf.name_scope("PseudoAnnotation"):
            # Build the ops one more time without dropout.
            pseudo_logits = train_net(train_image_raw, training=False)
            # Make sure not to propagate gradients a second time.
            pseudo_logits = tf.stop_gradient(pseudo_logits)
            pseudo_label = tf.math.argmax(pseudo_logits, axis=-1,
                                          name="TrainPredictions")
            pseudo_label = tf.cast(pseudo_label, tf.uint8)

            # Configure online high-confidence pseudo-labelling.
            pseudo_prob = tf.nn.softmax(pseudo_logits, axis=-1,
                                        name="TrainProb")
            if alparams["measure"] == "entropy":
                # Compute the prediction entropy, reduced over the last
                # dimension.
                entropy = -pseudo_prob * tf.math.log(pseudo_prob + EPSILON)
                entropy = tf.math.reduce_sum(entropy, axis=-1)
                # Convert the logarithm base to units of the number of
                # classes. NOTE: this makes the metric independent of the
                # number of classes and bounds the range to [0,1].
                log_base = tf.math.log(np.float32(dataset.num_classes))
                entropy = entropy / log_base
                # Convert entropy to confidence
                pseudo_confidence = 1.0 - entropy
            elif alparams["measure"] == "margin":
                # Difference between the two largest entries in the last
                # dimension.
                values, indices = tf.math.top_k(pseudo_prob, k=2)
                pseudo_confidence = values[:, :, :, 0] - values[:, :, :, 1]
            elif alparams["measure"] == "confidence":
                # Reduce max over the last dimension.
                pseudo_confidence = tf.math.reduce_max(pseudo_prob, axis=-1)
            else:
                raise NotImplementedError(
                    "Uncertainty function not implemented.")
            pseudo_mean_confidence = tf.reduce_mean(
                tf.cast(pseudo_confidence, tf.float64),
                axis=(1, 2))
            # Pseudo-annotate high-confidence unlabelled example pixels
            pseudo_mask = tf.where(
                tf.math.less(pseudo_confidence, alparams["threshold"]),
                tf.zeros_like(pseudo_label, dtype=train_label.dtype),
                tf.ones_like(pseudo_label, dtype=train_label.dtype))
            # Pseudo-annotation logic (think of it as @tf.cond mapped
            # over the batch dimension)
            train_label = tf.where(train_labelled, train_label,
                                   pseudo_label, name="MaybeGenLabel")
            train_mask = tf.where(train_labelled, train_mask,
                                  pseudo_mask, name="MaybeGenMask")

    with tf.device("/device:GPU:1"):
        # Build the validation network.
        val_logits = val_net(val_image, training=False)
        val_pred = tf.math.argmax(val_logits, axis=-1,
                                  name="ValidationPredictions")

    # Build the cost function
    with tf.name_scope("Cost"):
        with tf.device("/device:GPU:0"):
            # Establish the loss function
            if hparams["softmax"]["multiscale"]:
                loss, loss_weights = \
                    tt.losses.multiscale_masked_softmax_cross_entropy(
                        train_label,
                        train_net.endpoint_outputs[0],
                        train_mask, dataset.num_classes,
                        weight=hparams["softmax"]["loginverse_scaling"],
                        label_smoothing=hparams["softmax"]["label_smoothing"],
                        scope="XEntropy")
                # NOTE: this makes @loss_weights checkpointed
                train_net.loss_scale_weights = loss_weights
            else:
                loss = tt.losses.masked_softmax_cross_entropy(
                    train_label,
                    train_logits,
                    train_mask, dataset.num_classes,
                    weight=hparams["softmax"]["loginverse_scaling"],
                    label_smoothing=hparams["softmax"]["label_smoothing"],
                    scope="XEntropy")
            cost = loss
            # Add regularization to the cost function
            if len(train_net.losses) > 0:
                regularization_loss = tf.math.add_n(train_net.losses,
                                                    name="Regularization")
                cost += tf.cast(regularization_loss, dtype=tf.float64)

            # Setup the learning rate
            learning_rate = hparams["learning_rate"]
            if hparams["learning_rate_decay"] > 0.0:
                # Inverse-time learning rate if lr_decay is specified
                learning_rate = tf.train.inverse_time_decay(
                    learning_rate, local_step,
                    decay_steps=train_batches,
                    decay_rate=hparams["learning_rate_decay"])

            # Create the optimization procedure
            optimizer = tf.train.AdamOptimizer(
                learning_rate, **hparams["optimizer"]["kwargs"])
            # Create the training op
            train_op = optimizer.minimize(cost, global_step=local_step,
                                          name="TrainOp")
        # END tf.device("/device:GPU:0")
    # END tf.name_scope("Cost")

    # Create summary operations for the training and validation networks
    with tf.name_scope("Summary"):
        # Create colormap for image summaries
        colormap = tf.constant(dataset.colormap, dtype=tf.uint8,
                               name="Colormap")
        # Create metric evaluation and summaries
        with tf.device("/device:GPU:0"):
            with tf.name_scope("TrainMetrics"):
                # Create the metrics object for the training network.
                train_metrics = tt.metrics.Metrics(train_pred, train_label,
                                                   dataset.num_classes,
                                                   train_mask)
                # Get the Tensorflow update op.
                metric_update_op = train_metrics.get_update_op()
                # Get the Tensorflow summary operations.
                metric_summaries = train_metrics.get_summaries()

            train_summary_iter = tf.summary.merge(
                [
                    # Summaries run at each iteration.
                    tf.summary.scalar("CrossEntropyLoss", loss,
                                      family="Losses"),
                    tf.summary.scalar("TotalCost", cost,
                                      family="Losses"),
                    tf.summary.scalar("LearningRate", learning_rate,
                                      family="Losses")
                ], name="IterationSummaries"
            )
            with tf.control_dependencies([metric_update_op]):
                train_summary_epoch = tf.summary.merge(
                    [
                        # Summaries run at epoch boundaries.
                        metric_summaries["Metrics"],
                        metric_summaries["ConfusionMat"]
                    ], name="EpochSummaries"
                )

            train_image_summary = tf.summary.merge(
                [
                    tf.summary.image(
                        "PseudoLabel/input",
                        train_image_raw,
                        family="PseudoLabel"
                    ),
                    tf.summary.image(
                        "PseudoLabel/confidence",
                        tf.expand_dims(pseudo_confidence, axis=-1),
                        family="PseudoLabel"
                    ),
                    tf.summary.image(
                        "PseudoLabel",
                        tf.gather(
                            dataset.colormap,
                            tf.cast(pseudo_label * pseudo_mask
                                    + (1 - pseudo_mask) * 255,
                                    tf.int32)),
                        family="PseudoLabel"
                    )
                ]
            )
        # Create metric evaluation and summaries
        with tf.device("/device:GPU:1"):
            with tf.name_scope("ValidationTestMetrics"):
                # Create the metrics object
                val_metrics = tt.metrics.Metrics(val_pred, val_label,
                                                 dataset.num_classes,
                                                 val_mask)
                # Get the update tensorflow ops
                val_metric_update_op = val_metrics.get_update_op()
                # Get the metric summaries
                val_metric_summaries = val_metrics.get_summaries()

                with tf.control_dependencies([val_metric_update_op]):
                    val_metric_summary = tf.summary.merge(
                        [
                            # "Expensive" summaries run at epoch boundaries.
                            val_metric_summaries["Metrics"],
                            val_metric_summaries["ClassMetrics"],
                            val_metric_summaries["ConfusionMat"]
                        ], name="EpochSummaries"
                    )
                    val_image_summary = tf.summary.merge(
                        [
                            tf.summary.image("Input", val_image),
                            tf.summary.image("Label", tf.gather(
                                colormap,
                                tf.cast(val_label + 255 * (1 - val_mask),
                                        tf.int32))),
                            tf.summary.image("Predictions", tf.gather(
                                colormap, tf.cast(val_pred, tf.int32)))
                        ]
                    )
                    val_summary_epoch = val_metric_summary
                    test_summary_epoch = tf.summary.merge([
                        val_metric_summary,
                        val_image_summary
                    ])
        conf_summary_ph = tf.placeholder(tf.float64, shape=[None])
        conf_summary = tf.summary.histogram("ConfidenceDistribution",
                                            conf_summary_ph)
    # END name_scope("Summary")

    # Create a session with soft device placement
    # - some ops need to run on the CPU
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        logger.debug("Initializing variables...")
        sess.run(tf.global_variables_initializer())

        # Create checkpoint object
        with tf.name_scope("Checkpoint"):
            checkpoint = tf.train.Checkpoint(model=train_net,
                                             epoch=epoch_step,
                                             step=global_step,
                                             optimizer=optimizer)
            checkpoint_name = os.path.join(args.log_dir, "model")

            if args.checkpoint is not None:
                # CMDline checkpoint given
                ckpt = args.checkpoint
                if os.path.isdir(ckpt):
                    ckpt = tf.train.latest_checkpoint(ckpt)
                if ckpt is None:
                    logger.error("Checkpoint path \"%s\" is invalid."
                                 % args.checkpoint)
                    return 1
                logger.info("Resuming from checkpoint \"%s\"" % ckpt)
                status = checkpoint.restore(ckpt)
                if tf.__version__ < "1.14.0":
                    status.assert_existing_objects_matched()
                else:
                    status.expect_partial()
                status.initialize_or_restore(sess)
                if args.reinitialize_output:
                    sess.run(train_net.Final.kernel.initializer)
            elif state["checkpoint"] is not None:
                # Try to restore from the checkpoint in the logdir
                ckpt = state["checkpoint"]
                logger.info("Resuming from checkpoint \"%s\"" % ckpt)
                status = checkpoint.restore(ckpt)
                if tf.__version__ < "1.14.0":
                    status.assert_existing_objects_matched()
                else:
                    status.expect_partial()
                status.initialize_or_restore(sess)

            with tf.name_scope("UpdateValidationWeights"):
                update_val_op = []
                for i in range(len(val_net.layers)):
                    for j in range(len(val_net.layers[i].variables)):
                        update_val_op.append(
                            tf.assign(val_net.layers[i].variables[j],
                                      train_net.layers[i].variables[j]))
                update_val_op = tf.group(update_val_op)

            ckpt_manager = tt.checkpoint_manager.CheckpointManager(
                checkpoint, args.log_dir)
        # END scope Checkpoint

        # Prepare global fetches dict
        fetches = {
            "train": {
                "iteration": {
                    "step": global_step_op,
                    "summary": train_summary_iter,
                    "train_op": train_op,
                    "update": metric_update_op,
                    "updates": train_net.updates
                },
                "epoch": {
                    "step": epoch_step,
                    "summary": train_summary_epoch,
                    "summary/image": train_image_summary
                }
            },
            "val": {  # Validation and test fetches
                "iteration": {
                    "update": val_metric_update_op
                },
                "epoch": {
                    "step": epoch_step,
                    "MeanIoU": val_metrics.metrics["MeanIoU"],
                    "summary": val_summary_epoch,
                    # Also add the image summary; however it is only added
                    # to the writer every N epochs.
                    "summary/image": val_image_summary
                }
            },
            "test": {
                "iteration": {"update": val_metric_update_op},
                "epoch": {"summary": test_summary_epoch}
            }
        }

        # Train loop (until convergence) -> pick unlabelled examples
        # -> test loop
        def train_loop(summary_writer):
            """
            Train loop closure.
            Runs the training loop until no improvement is seen for
            @params["epochs"] epochs before returning.
            """
            # How many epochs until counting @no_improvement
            _initial_grace_period = alparams["epochs/warm_up"]
            best_ckpt = state["checkpoint"]
            best_mean_iou = 0.0
            log_subdir = summary_writer.get_logdir()
            run_name = os.path.basename(log_subdir)
            checkpoint_prefix = os.path.join(log_subdir, "model")
            num_iter_per_epoch = np.maximum(train_input.size,
                                            val_input.size)
            no_improvement_count = 0
            while no_improvement_count < params["epochs"] \
                    or _initial_grace_period >= 0:
                _initial_grace_period -= 1
                # Increment the in-graph epoch counter.
                epoch = sess.run(epoch_step_inc)

                # Prepare the inner loop iterator
                _iter = range(0, num_iter_per_epoch, params["batch_size"])
                if show_progress:
                    _iter = tqdm.tqdm(_iter,
                                      desc="%s[%d]" % (run_name, epoch),
                                      dynamic_ncols=True,
                                      ascii=True,
                                      postfix={"NIC": no_improvement_count})

                # Initialize the iterators
                train_input_stage.init_iterator(
                    "train", sess, train_input.feed_dict)
                val_input_stage.init_iterator(
                    "val", sess, val_input.feed_dict)

                # Reset the confusion matrices
                train_metrics.reset_metrics(sess)
                val_metrics.reset_metrics(sess)

                # Prepare the iteration fetches
                _fetches = {
                    "train": {"iteration": fetches["train"]["iteration"]},
                    "val": {"iteration": fetches["val"]["iteration"]}
                }
                # Update the validation network weights
                sess.run(update_val_op)

                try:
                    for i in _iter:
                        if train_input.size - params["batch_size"] <= i \
                                < train_input.size:
                            # Fetches for the last training iteration.
                            _fetches["train"]["epoch"] = \
                                fetches["train"]["epoch"]
                        if val_input.size - params["batch_size"] <= i \
                                < val_input.size:
                            _fetches["val"]["epoch"] = \
                                fetches["val"]["epoch"]

                        # Run the fetches
                        results = sess.run(_fetches)

                        if "train" in results.keys():
                            # Add the iteration summary
                            summary_writer.add_summary(
                                results["train"]["iteration"]["summary"],
                                results["train"]["iteration"]["step"])
                            # Maybe add the epoch summary
                            if "epoch" in results["train"].keys():
                                summary_writer.add_summary(
                                    results["train"]["epoch"]["summary"],
                                    results["train"]["epoch"]["step"])
                                if results["train"]["epoch"]["step"] \
                                        % 100 == 0:
                                    summary_writer.add_summary(
                                        results["train"]["epoch"]
                                               ["summary/image"],
                                        results["train"]["epoch"]["step"])
                                # Pop fetches to prevent an OutOfRangeError
                                # due to the asymmetric train/val input
                                # sizes.
                                _fetches.pop("train")

                        if "val" in results.keys() and \
                                "epoch" in results["val"].keys():
                            # Add the summaries to the event log.
                            summary_writer.add_summary(
                                results["val"]["epoch"]["summary"],
                                results["val"]["epoch"]["step"])
                            if results["val"]["epoch"]["step"] % 100 == 0:
                                # Only report the image summary every 100th
                                # epoch.
                                summary_writer.add_summary(
                                    results["val"]["epoch"]["summary/image"],
                                    results["val"]["epoch"]["step"])
                            # Check if MeanIoU improved, and update the
                            # counter and best checkpoint accordingly
                            if results["val"]["epoch"]["MeanIoU"] \
                                    > best_mean_iou:
                                best_mean_iou = \
                                    results["val"]["epoch"]["MeanIoU"]
                                # Update the checkpoint file used by
                                # @tf.train.latest_checkpoint to point at
                                # the current best.
                                _ckpt_name = ckpt_manager.commit(
                                    checkpoint_prefix, sess)
                                if _ckpt_name != "":
                                    best_ckpt = _ckpt_name
                                # Reset the counter
                                no_improvement_count = 0
                            else:
                                # The result has not improved; increment
                                # the counter.
                                no_improvement_count += 1
                                if no_improvement_count >= params["epochs"] \
                                        and _initial_grace_period < 0:
                                    if show_progress:
                                        _iter.close()
                                    break
                            if show_progress:
                                _iter.set_postfix(NIC=no_improvement_count)
                            # Pop fetches to prevent an OutOfRangeError due
                            # to the asymmetric train/val input sizes.
                            _fetches.pop("val")
                        # END "maybe add epoch summary"
                except tf.errors.OutOfRangeError:
                    logger.error("Out of range error. "
                                 "Attempting to continue.")

                summary_writer.flush()
                ckpt_manager.cache(sess)
            # END while no_improvement_count < params["epochs"]
            return best_ckpt

        def test_loop(summary_writer):
            """
            Test loop closure.
            """
            _step = len(labelled)
            # Initialize the validation input stage with the test set
            val_input_stage.init_iterator("test", sess, test_input.feed_dict)
            _iter = range(0, test_input.size, params["batch_size"])
            if show_progress:
                _iter = tqdm.tqdm(_iter, desc="test[%d]" % _step,
                                  ascii=True, dynamic_ncols=True)
            summary_proto = None
            val_metrics.reset_metrics(sess)
            try:
                for i in _iter:
                    # Accumulate the confusion matrix
                    if i < test_input.size - params["batch_size"]:
                        sess.run(fetches["test"]["iteration"]["update"])
                    else:
                        # Run the summary operation on the last iteration
                        _, summary_proto = sess.run(
                            [fetches["test"]["iteration"]["update"],
                             fetches["test"]["epoch"]["summary"]])
            except tf.errors.OutOfRangeError:
                pass
            # Add the summary with the number of labelled examples as the
            # step. NOTE: this only runs on each major iteration.
            summary_writer.add_summary(summary_proto, _step)

        def rank_confidence():
            # Allocate an array to store all confidence scores
            num_examples = len(state["dataset"]["train"]["filenames"])
            confidence = np.zeros(num_examples, dtype=np.float32)
            # Initialize the input stage
            train_input_stage.init_iterator("train", sess,
                                            train_input.feed_dict)
            _iter = range(0, train_input.size, params["batch_size"])
            if show_progress:
                _iter = tqdm.tqdm(_iter,
                                  desc="ranking[%d]" % len(labelled),
                                  ascii=True, dynamic_ncols=True)
            try:
                for i in _iter:
                    # Loop over all examples and compute the confidence
                    batch_confidence, batch_indices = sess.run(
                        [pseudo_mean_confidence, train_index])
                    # Add to the list of confidences
                    confidence[batch_indices] = batch_confidence
            except tf.errors.OutOfRangeError:
                pass

            # Filter out labelled examples
            unlabelled_confidence = confidence[unlabelled]

            selection_size = np.minimum(len(unlabelled),
                                        alparams["selection_size"])
            # Get the lowest-confidence indices of the unlabelled subset
            example_indices = np.argpartition(unlabelled_confidence,
                                              selection_size)
            example_indices = example_indices[:selection_size]
            # Convert to indices into the list of all filenames
            low_conf_examples = unlabelled[example_indices]
            return low_conf_examples, unlabelled_confidence

        checkpoint_path = state["checkpoint"]
        # Only add the graph to the first event file
        _graph = sess.graph if checkpoint_path is None else None
        with tf.summary.FileWriter(args.log_dir, graph=_graph) as test_writer:
            iterations = alparams["iterations"]
            if iterations < 0:
                # Iterate until all data is consumed
                iterations = np.ceil(len(unlabelled)
                                     / float(alparams["selection_size"]))
                logger.info("Iteration count: %d" % iterations)

            while state["iteration"] < iterations:
                # Step 1: train_loop
                train_input.set_indices(labelled)

                if state["iteration"] == 0:
                    # Pretrain: only use the labelled subset
                    log_subdir = os.path.join(args.log_dir, "pretrain")
                else:
                    # Any other iteration
                    log_subdir = os.path.join(args.log_dir, "iter-%d"
                                              % state["iteration"])
                    # Sample from the unlabelled set
                    p = alparams["pseudo_labelling_proportion"]
                    sample_size = int(len(labelled) * p / (1 - p))
                    sample_size = np.minimum(sample_size, len(unlabelled))
                    train_input.set_sample_size(sample_size)

                # Create the subdir if it doesn't exist
                if not os.path.exists(log_subdir):
                    os.mkdir(log_subdir)

                # Change the checkpoint manager directory
                ckpt_manager.chdir(log_subdir)
                with tf.summary.FileWriter(log_subdir) as train_val_writer:
                    # Enter the train loop
                    try:
                        checkpoint_path = train_loop(train_val_writer)
                    except KeyboardInterrupt as exception:
                        # Quickly store the state
                        if ckpt_manager.latest_checkpoint != "":
                            state["checkpoint"] = \
                                ckpt_manager.latest_checkpoint
                        with open(state_filename, "w") as f:
                            json.dump(state, f, indent=2)
                            f.truncate()
                        raise exception

                # Reload the best checkpoint
                status = checkpoint.restore(checkpoint_path)
                status.run_restore_ops(sess)
                sess.run(update_val_op)

                # Step 2: test_loop
                if test_input.size > 0:
                    # This step may be omitted on deployment
                    test_loop(test_writer)

                # Step 3: Find low-confidence examples
                # Reset train_input to use all examples for ranking
                train_input.set_indices()
                if alparams["selection_size"] > 0:
                    low_conf_examples, unlabelled_conf = rank_confidence()
                    _hist_summary = sess.run(
                        conf_summary,
                        {conf_summary_ph: unlabelled_conf})
                    test_writer.add_summary(_hist_summary,
                                            state["iteration"])
                else:
                    # Draw examples randomly
                    selection_size = np.minimum(alparams["selection_size"],
                                                len(unlabelled.tolist()))
                    if selection_size != 0:
                        low_conf_examples = np.random.choice(
                            unlabelled, np.abs(alparams["selection_size"]))
                    else:
                        low_conf_examples = []

                # (Maybe) pause for the user to annotate
                to_annotate_indices = no_label_indices[np.isin(
                    no_label_indices, low_conf_examples)]
                while len(to_annotate_indices) > 0:
                    to_annotate = train_examples[to_annotate_indices]
                    # Poll the user for filenames of annotated examples
                    to_annotate_basename = np.array(
                        [os.path.basename(f) for f in to_annotate])
                    logger.info("Please annotate the following examples:\n%s"
                                % "\n".join(to_annotate_basename.tolist()))
                    filenames = tkinter.filedialog.askopenfilename(
                        multiple=1,
                        filetypes=(("TFRecord", "*.tfrecord"),))

                    hit = []  # List of matching filename indices
                    for filename in filenames:
                        basename = os.path.basename(filename)
                        idx = -1
                        for i in range(len(to_annotate)):
                            if to_annotate[i].endswith(basename):
                                idx = i
                                break
                        if idx != -1:
                            # Update the state filenames
                            train_examples[to_annotate_indices[idx]] = \
                                filename
                            hit.append(idx)
                        else:
                            logger.info("Unrecognized filepath: %s"
                                        % filename)
                    # Remove matched paths
                    to_annotate_indices = np.delete(to_annotate_indices, hit)

                # Remove annotated examples from the unlabelled set
                no_label_indices = no_label_indices[
                    np.isin(no_label_indices, low_conf_examples,
                            invert=True)]

                logger.info(
                    "Moving the following examples to the labelled set:\n%s"
                    % "\n".join(train_examples[low_conf_examples].tolist()))

                # First make the update to the input stage before
                # committing the state change
                train_input_labelled[low_conf_examples] = True
                train_input.labelled = train_input_labelled

                # Step 4: Update state information
                labelled = np.append(labelled, low_conf_examples)
                unlabelled = unlabelled[np.isin(unlabelled,
                                                low_conf_examples,
                                                assume_unique=True,
                                                invert=True)]

                state["dataset"]["train"]["filenames"] = \
                    train_examples.tolist()
                state["dataset"]["train"]["labelled"] = labelled.tolist()
                state["dataset"]["train"]["unlabelled"] = \
                    unlabelled.tolist()
                state["iteration"] += 1
                state["checkpoint"] = checkpoint_path
                # Dump the updated state
                with open(state_filename, "w") as f:
                    json.dump(state, f, indent=2)
                    f.truncate()
    return 0
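
# For reference, the three acquisition measures used in the pseudo-labelling
# graph above all reduce a softmax volume to a per-pixel confidence score.
# A NumPy sketch equivalent to the in-graph ops, assuming @prob has shape
# [batch, height, width, classes] (illustrative only, not used by the
# script):
def pixel_confidence(prob, measure="entropy", epsilon=1e-12):
    if measure == "entropy":
        # Normalized entropy in [0, 1], converted to confidence.
        entropy = -np.sum(prob * np.log(prob + epsilon), axis=-1)
        return 1.0 - entropy / np.log(prob.shape[-1])
    elif measure == "margin":
        # Difference between the two largest class probabilities.
        top2 = np.partition(prob, -2, axis=-1)[..., -2:]
        return top2[..., 1] - top2[..., 0]
    elif measure == "confidence":
        # Probability of the most likely class.
        return np.max(prob, axis=-1)
    raise NotImplementedError("Uncertainty function not implemented.")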
def main(args):
    # Retrieve the dataset-specific object
    if args.dataset == "cityscapes":
        dataset = datasets.Cityscapes(coarse=args.coarse)
    elif args.dataset == "freiburg":
        dataset = datasets.Freiburg()
    elif args.dataset == "vistas":
        dataset = datasets.Vistas()
    else:
        raise NotImplementedError("Dataset \"%s\" not supported"
                                  % args.dataset)

    # Gather the train and validation paths
    train_paths = os.path.join(args.data_dir, "train")
    val_paths = os.path.join(args.data_dir, "val")

    # Retrieve training parameters
    params = args.params
    hparams = params["hyperparams"]

    with tf.device("/device:CPU:0"):
        with tf.name_scope("Datasets"):
            # Setup input pipelines
            train_input = tt.input.InputStage(
                input_shape=[params["network"]["input"]["height"],
                             params["network"]["input"]["width"]])
            val_input = tt.input.InputStage(
                input_shape=[params["network"]["input"]["height"],
                             params["network"]["input"]["width"]])

            # Add datasets
            train_examples = train_input.add_dataset(
                "train", train_paths,
                batch_size=params["batch_size"],
                epochs=1, augment=True)
            val_examples = val_input.add_dataset(
                "val", val_paths,
                batch_size=params["batch_size"],
                epochs=1)

            # Calculate the number of batches
            train_batches = (train_examples - 1) // params["batch_size"] + 1
            val_batches = (val_examples - 1) // params["batch_size"] + 1

            # Get iterator outputs
            _, train_image, train_label, train_mask = \
                train_input.get_output()
            val_image, val_label, val_mask = val_input.get_output()

        # Create step variables
        with tf.variable_scope("StepCounters"):
            # NOTE: one local step (for this run) and one global step that
            # is checkpointed, so that various schedules can be run on the
            # learning rate decay policy.
            global_step = tf.Variable(0, dtype=tf.int64,
                                      trainable=False, name="GlobalStep")
            local_step = tf.Variable(0, dtype=tf.int64,
                                     trainable=False, name="LocalStep")
            global_step_op = global_step + local_step
            epoch_step = tf.Variable(0, trainable=False, name="EpochStep")
            epoch_step_inc = tf.assign_add(epoch_step, 1,
                                           name="EpochStepInc")

    regularization = {}
    if hparams["weight_reg"]["L2"] > 0.0 \
            or hparams["weight_reg"]["L1"] > 0.0:
        regularization = {
            "weight_regularization": tf.keras.regularizers.l1_l2(
                l1=hparams["weight_reg"]["L1"],
                l2=hparams["weight_reg"]["L2"]),
            "regularization_scaling": hparams["weight_reg"]["glorot_scaling"]
        }

    # Build the training and validation networks and get prediction outputs
    train_net = models.ENet(dataset.num_classes, **regularization)
    val_net = models.ENet(dataset.num_classes)
    with tf.device("/device:GPU:0"):
        train_logits = train_net(train_image, training=True)
        train_pred = tf.math.argmax(train_logits, axis=-1,
                                    name="TrainPredictions")
    with tf.device("/device:GPU:1"):
        val_logits = val_net(val_image, training=False)
        val_pred = tf.math.argmax(val_logits, axis=-1,
                                  name="ValidationPredictions")

    # Build the cost function
    with tf.name_scope("Cost"):
        with tf.device("/device:GPU:0"):
            # Establish the loss function
            if hparams["softmax"]["multiscale"]:
                loss, loss_weights = \
                    tt.losses.multiscale_masked_softmax_cross_entropy(
                        train_label,
                        train_net.endpoint_outputs[0],
                        train_mask, dataset.num_classes,
                        weight=hparams["softmax"]["loginverse_scaling"],
                        label_smoothing=hparams["softmax"]["label_smoothing"],
                        scope="XEntropy")
                # NOTE: this makes @loss_weights checkpointed
                train_net.loss_scale_weights = loss_weights
            else:
                loss = tt.losses.masked_softmax_cross_entropy(
                    train_label,
                    train_logits,
                    train_mask, dataset.num_classes,
                    weight=hparams["softmax"]["loginverse_scaling"],
                    label_smoothing=hparams["softmax"]["label_smoothing"],
                    scope="XEntropy")
            cost = loss
            # Add regularization to the cost function
            if len(train_net.losses) > 0:
                regularization_loss = tf.math.add_n(train_net.losses,
                                                    name="Regularization")
                cost += tf.cast(regularization_loss, dtype=tf.float64)

            # Setup the learning rate
            learning_rate = hparams["learning_rate"]
            if hparams["learning_rate_decay"] > 0.0:
                # Inverse-time learning rate if lr_decay is specified
                learning_rate = tf.train.inverse_time_decay(
                    learning_rate, local_step,
                    decay_steps=train_batches,
                    decay_rate=hparams["learning_rate_decay"])

            # Create the optimization procedure
            optimizer = tf.train.AdamOptimizer(
                learning_rate, **hparams["optimizer"]["kwargs"])
            # Create the training op
            train_op = optimizer.minimize(cost, global_step=local_step,
                                          name="TrainOp")
            # NOTE: Make sure to update the batchnorm parameters and the
            # metrics for each training iteration.

    # Create summary operations for the training and validation networks
    with tf.name_scope("Summary"):
        # Create colormap for image summaries
        colormap = tf.constant(dataset.colormap, dtype=tf.uint8,
                               name="Colormap")
        # Create metric evaluation and summaries
        with tf.device("/device:GPU:0"):
            with tf.name_scope("TrainMetrics"):
                train_metrics = tt.metrics.Metrics(train_pred, train_label,
                                                   dataset.num_classes,
                                                   train_mask)
                metric_update_op = train_metrics.get_update_op()
                metric_summaries = train_metrics.get_summaries()

            train_summary_iter = tf.summary.merge([
                tf.summary.scalar("CrossEntropyLoss", loss,
                                  family="Losses"),
                tf.summary.scalar("TotalCost", cost, family="Losses"),
                tf.summary.scalar("LearningRate", learning_rate,
                                  family="Losses")
            ], name="IterationSummaries")

            with tf.control_dependencies([metric_update_op]):
                train_summary_epoch = tf.summary.merge([
                    metric_summaries["Metrics"],
                    metric_summaries["ConfusionMat"],
                ], name="EpochSummaries")

        # Create metric evaluation and summaries
        with tf.device("/device:GPU:1"):
            with tf.name_scope("ValidationMetrics"):
                val_metrics = tt.metrics.Metrics(val_pred, val_label,
                                                 dataset.num_classes,
                                                 val_mask)
                val_metric_update_op = val_metrics.get_update_op()
                val_metric_summaries = val_metrics.get_summaries()

                with tf.control_dependencies([val_metric_update_op]):
                    val_summary_epoch = tf.summary.merge([
                        val_metric_summaries["Metrics"],
                        val_metric_summaries["ClassMetrics"],
                        val_metric_summaries["ConfusionMat"],
                        tf.summary.image("Input", val_image),
                        tf.summary.image("Label", tf.gather(
                            colormap,
                            tf.cast(val_label + 255 * (1 - val_mask),
                                    tf.int32))),
                        tf.summary.image("Predictions", tf.gather(
                            colormap, tf.cast(val_pred, tf.int32)))
                    ], name="EpochSummaries")

    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    # Dump parameter configuration (args)
    with open(os.path.join(args.log_dir, "config.json"), "w+") as f:
        json.dump(params, f, indent=4, sort_keys=True)

    # Create a session with soft device placement
    # - some ops need to run on the CPU
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=sess_config) as sess:
        # Initialize/restore model variables
        logger.debug("Initializing model...")
        sess.run(tf.global_variables_initializer())
        # Create summary writer objects
        summary_writer = tf.summary.FileWriter(args.log_dir,
                                               graph=sess.graph)

        # Create checkpoint object
        with tf.name_scope("Checkpoint"):
            checkpoint = tf.train.Checkpoint(model=train_net,
                                             epoch=epoch_step,
                                             step=global_step,
                                             optimizer=optimizer)
            checkpoint_name = os.path.join(args.log_dir, "model")

            if args.checkpoint is not None:
                # CMDline checkpoint given
                ckpt = args.checkpoint
                if os.path.isdir(ckpt):
                    ckpt = tf.train.latest_checkpoint(ckpt)
                if ckpt is None:
                    logger.error("Checkpoint path \"%s\" is invalid."
                                 % args.checkpoint)
                    return 1
                logger.info("Resuming from checkpoint \"%s\"" % ckpt)
                status = checkpoint.restore(ckpt)
                if tf.__version__ < "1.14.0":
                    status.assert_existing_objects_matched()
                else:
                    status.expect_partial()
                status.initialize_or_restore(sess)
            elif tf.train.latest_checkpoint(args.log_dir) is not None:
                # Try to restore from a checkpoint in the logdir
                ckpt = tf.train.latest_checkpoint(args.log_dir)
                logger.info("Resuming from checkpoint \"%s\"" % ckpt)
                status = checkpoint.restore(ckpt)
                if tf.__version__ < "1.14.0":
                    status.assert_existing_objects_matched()
                else:
                    status.expect_partial()
                status.initialize_or_restore(sess)

            with tf.name_scope("UpdateValidationWeights"):
                update_val_op = []
                for i in range(len(val_net.layers)):
                    for j in range(len(val_net.layers[i].variables)):
                        update_val_op.append(
                            tf.assign(val_net.layers[i].variables[j],
                                      train_net.layers[i].variables[j]))
                update_val_op = tf.group(update_val_op)
        # END scope Checkpoint

        # Prepare fetches
        fetches = {
            "train": {
                "iteration": {
                    "step": global_step_op,
                    "summary": train_summary_iter,
                    "train_op": train_op,
                    "update": metric_update_op,
                    "updates": train_net.updates
                },
                "epoch": {
                    "step": epoch_step,
                    "summary": train_summary_epoch
                }
            },
            "val": {
                "iteration": {
                    "update": val_metric_update_op
                },
                "epoch": {
                    "step": epoch_step,
                    "summary": val_summary_epoch
                }
            }
        }

        # run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        # run_metadata = tf.RunMetadata()
        logger.info("Starting training loop...")
        results = {}
        for epoch in range(1, params["epochs"] + 1):
            # Create an iterator counter to track progress
            _iter = range(0, train_batches)
            if show_progress:
                _iter = tqdm.tqdm(_iter,
                                  desc="train[%3d/%3d]"
                                       % (epoch, params["epochs"]),
                                  ascii=True,
                                  dynamic_ncols=True)

            # Initialize the input stages
            train_input.init_iterator("train", sess)
            val_input.init_iterator("val", sess)
            # Initialize or update the validation network
            sess.run(update_val_op)
            # Reset the metrics for another round
            train_metrics.reset_metrics(sess)
            val_metrics.reset_metrics(sess)

            # Prepare the initial fetches
            _fetches = {
                "train": {"iteration": fetches["train"]["iteration"]},
                "val": {"iteration": fetches["val"]["iteration"]}
            }

            for i in _iter:
                try:
                    # Dynamically update the fetches
                    if i == train_batches - 1:
                        _fetches["train"]["epoch"] = \
                            fetches["train"]["epoch"]
                    if i == val_batches - 1:
                        _fetches["val"]["epoch"] = fetches["val"]["epoch"]
                    elif i == val_batches:
                        summary_writer.add_summary(
                            results["val"]["epoch"]["summary"],
                            results["val"]["epoch"]["step"])
                        _fetches.pop("val")
                    # Execute the fetches
                    results = sess.run(
                        _fetches
                        # ,options=run_options,
                        # run_metadata=run_metadata
                    )
                except tf.errors.OutOfRangeError:
                    pass
                # Update iteration summaries
                summary_writer.add_summary(
                    results["train"]["iteration"]["summary"],
                    results["train"]["iteration"]["step"])
                # summary_writer.add_run_metadata(run_metadata,
                #                                 "step=%d" % i)

            # Update the epoch counter
            _epoch = sess.run(epoch_step_inc)

            # Update epoch summaries
            summary_writer.add_summary(
                results["train"]["epoch"]["summary"],
                results["train"]["epoch"]["step"])
            summary_writer.flush()
            # Save checkpoint
            checkpoint.save(checkpoint_name, sess)

        ### FINAL VALIDATION ###
        _fetches = {"val": {"iteration": fetches["val"]["iteration"]}}
        _iter = range(0, val_batches)
        if show_progress:
            _iter = tqdm.tqdm(_iter,
                              desc="val[%3d/%3d]" % (params["epochs"],
                                                     params["epochs"]))
        # Re-initialize the validation network
        val_input.init_iterator("val", sess)
        sess.run(update_val_op)
        for i in _iter:
            try:
                if i >= val_batches - 1:
                    _fetches["val"]["epoch"] = fetches["val"]["epoch"]
                results = sess.run(_fetches)
            except tf.errors.OutOfRangeError:
                pass
        # Add the final validation summary update
        summary_writer.add_summary(results["val"]["epoch"]["summary"],
                                   results["val"]["epoch"]["step"])
        # Close the summary file
        summary_writer.close()

    logger.info("Training successfully finished %d epochs"
                % params["epochs"])
    return 0
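
# The training scripts above depend on @tt.losses.masked_softmax_cross_entropy,
# which is not shown in this file. A minimal sketch of a masked softmax
# cross-entropy in the same TF 1.x style, ignoring the optional weighting and
# label-smoothing arguments the real helper supports (hypothetical name, for
# illustration only):
def masked_softmax_cross_entropy_sketch(labels, logits, mask, num_classes):
    # One-hot encode the integer labels and compute per-pixel cross-entropy.
    onehot = tf.one_hot(tf.cast(labels, tf.int32), num_classes)
    xentropy = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=onehot, logits=logits)
    # Zero out ignored (mask == 0) pixels and average over the rest.
    mask = tf.cast(mask, xentropy.dtype)
    return tf.reduce_sum(xentropy * mask) / tf.maximum(
        tf.reduce_sum(mask), 1.0)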