def compute_map(labels_and_predictions, val_json_file):
  """Use model predictions to compute mAP.

  The evaluation code is largely copied from the MLPerf reference
  implementation. While it is possible to write the evaluation as a tensor
  metric and use Estimator.evaluate(), this approach was selected for
  simplicity and ease of debugging.
  """
  with tf.gfile.Open(val_json_file, "r") as f:
    annotation_data = json.load(f)

  predictions = []
  mlperf_log.ssd_print(
      key=mlperf_log.NMS_THRESHOLD, value=ssd_constants.OVERLAP_CRITERIA)
  mlperf_log.ssd_print(
      key=mlperf_log.NMS_MAX_DETECTIONS, value=ssd_constants.MAX_NUM_EVAL_BOXES)
  for example in labels_and_predictions:
    pred_box = example["pred_box"]
    pred_scores = example["pred_scores"]
    indices = example['indices']
    loc, label, prob = decode_single(
        pred_box, pred_scores, indices, ssd_constants.OVERLAP_CRITERIA,
        ssd_constants.MAX_NUM_EVAL_BOXES, ssd_constants.MAX_NUM_EVAL_BOXES)

    htot, wtot, _ = example[ssd_constants.RAW_SHAPE]
    for loc_, label_, prob_ in zip(loc, label, prob):
      # Ordering convention differs, hence [1], [0] rather than [0], [1]
      predictions.append([
          int(example[ssd_constants.SOURCE_ID]),
          loc_[1] * wtot, loc_[0] * htot,
          (loc_[3] - loc_[1]) * wtot, (loc_[2] - loc_[0]) * htot,
          prob_,
          ssd_constants.CLASS_INV_MAP[label_]
      ])

  if val_json_file.startswith("gs://"):
    _, local_val_json = tempfile.mkstemp(suffix=".json")
    tf.gfile.Remove(local_val_json)
    tf.gfile.Copy(val_json_file, local_val_json)
    atexit.register(tf.gfile.Remove, local_val_json)
  else:
    local_val_json = val_json_file

  cocoGt = COCO(local_val_json)
  cocoDt = cocoGt.loadRes(np.array(predictions))
  E = COCOeval(cocoGt, cocoDt, iouType='bbox')
  E.evaluate()
  E.accumulate()
  E.summarize()
  print("Current AP: {:.5f}".format(E.stats[0]))
  metric_names = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'ARmax1',
                  'ARmax10', 'ARmax100', 'ARs', 'ARm', 'ARl']

  # Prefix with "COCO" to group in TensorBoard.
  return {"COCO/" + key: value for key, value in zip(metric_names, E.stats)}
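# The rows appended to `predictions` above follow the Nx7 layout that
# pycocotools' COCO.loadRes accepts for box detections:
# [image_id, xmin, ymin, width, height, score, category_id] in pixel units,
# which is why the normalized [ymin, xmin, ymax, xmax] locations are reordered
# and rescaled. A minimal sketch of that conversion with toy values (plain
# NumPy, not the repo's decode path):

import numpy as np

loc = np.array([0.25, 0.10, 0.75, 0.60])   # normalized [ymin, xmin, ymax, xmax]
htot, wtot = 480, 640                      # original image height and width
image_id, score, category_id = 42, 0.9, 1  # hypothetical values

row = [image_id,
       loc[1] * wtot,              # xmin
       loc[0] * htot,              # ymin
       (loc[3] - loc[1]) * wtot,   # width
       (loc[2] - loc[0]) * htot,   # height
       score,
       category_id]
print(row)  # [42, 64.0, 120.0, 320.0, 240.0, 0.9, 1]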
def concat_outputs(cls_outputs, box_outputs):
  """Concatenate predictions into a single tensor.

  This function takes the dicts of class and box prediction tensors and
  concatenates them into a single tensor for comparison with the ground truth
  boxes and class labels.

  Args:
    cls_outputs: an OrderedDict with keys representing levels and values
      representing logits in [batch_size, height, width,
      num_anchors * num_classes].
    box_outputs: an OrderedDict with keys representing levels and values
      representing box regression targets in [batch_size, height, width,
      num_anchors * 4].

  Returns:
    concatenated cls_outputs and box_outputs.
  """
  assert set(cls_outputs.keys()) == set(box_outputs.keys())

  # This sort matters. The labels assume a certain order based on
  # ssd_constants.FEATURE_SIZES, and this sort matches that convention.
  keys = sorted(cls_outputs.keys())
  batch_size = int(cls_outputs[keys[0]].shape[0])
  flat_cls = []
  flat_box = []

  mlperf_log.ssd_print(key=mlperf_log.FEATURE_SIZES,
                       value=ssd_constants.FEATURE_SIZES)

  for i, k in enumerate(keys):
    # TODO(taylorrobie): confirm that this reshape, transpose, reshape is
    # correct.
    scale = ssd_constants.FEATURE_SIZES[i]
    split_shape = (ssd_constants.NUM_DEFAULTS[i], ssd_constants.NUM_CLASSES)
    assert cls_outputs[k].shape[3] == split_shape[0] * split_shape[1]
    intermediate_shape = (batch_size, scale, scale) + split_shape
    final_shape = (batch_size, scale ** 2 * split_shape[0], split_shape[1])
    flat_cls.append(
        tf.reshape(
            tf.transpose(tf.reshape(cls_outputs[k], intermediate_shape),
                         (0, 3, 1, 2, 4)), final_shape))

    split_shape = (ssd_constants.NUM_DEFAULTS[i], 4)
    assert box_outputs[k].shape[3] == split_shape[0] * split_shape[1]
    intermediate_shape = (batch_size, scale, scale) + split_shape
    final_shape = (batch_size, scale ** 2 * split_shape[0], split_shape[1])
    flat_box.append(
        tf.reshape(
            tf.transpose(tf.reshape(box_outputs[k], intermediate_shape),
                         (0, 3, 1, 2, 4)), final_shape))

  return tf.concat(flat_cls, axis=1), tf.concat(flat_box, axis=1)
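# The reshape -> transpose -> reshape above flattens each level so that anchors
# vary slowest and spatial position fastest. A minimal NumPy sketch of the same
# index bookkeeping with toy sizes (stand-ins for FEATURE_SIZES, NUM_DEFAULTS
# and NUM_CLASSES, not the model's tensors):

import numpy as np

batch, scale, num_anchors, num_classes = 2, 3, 4, 5
x = np.arange(batch * scale * scale * num_anchors * num_classes).reshape(
    batch, scale, scale, num_anchors * num_classes)

# Same reshape -> transpose -> reshape that concat_outputs applies per level.
inter = x.reshape(batch, scale, scale, num_anchors, num_classes)
flat = inter.transpose(0, 3, 1, 2, 4).reshape(
    batch, scale * scale * num_anchors, num_classes)

# Element (b, y, x, a, c) lands at flattened row a*scale*scale + y*scale + x.
b, y, col, a, c = 1, 2, 1, 3, 4
assert flat[b, a * scale * scale + y * scale + col, c] == inter[b, y, col, a, c]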
def resnet_v1(resnet_depth, params, data_format='channels_last'):
  """Returns the ResNet model for a given size and number of output classes."""
  model_params = {34: {'block': residual_block, 'layers': [3, 4, 6, 3]}}

  if resnet_depth not in model_params:
    raise ValueError('Not a valid resnet_depth:', resnet_depth)

  resnet_params = model_params[resnet_depth]
  mlperf_log.ssd_print(key=mlperf_log.BACKBONE,
                       value='resnet{}'.format(resnet_depth))
  return resnet_v1_generator(resnet_params['block'], resnet_params['layers'],
                             params, data_format)
def normalize_image(image):
  """Normalize the image to zero mean and unit variance."""
  mlperf_log.ssd_print(key=mlperf_log.DATA_NORMALIZATION_MEAN,
                       value=ssd_constants.NORMALIZATION_MEAN)
  mlperf_log.ssd_print(key=mlperf_log.DATA_NORMALIZATION_STD,
                       value=ssd_constants.NORMALIZATION_STD)
  image -= tf.constant(
      ssd_constants.NORMALIZATION_MEAN)[tf.newaxis, tf.newaxis, :]
  image /= tf.constant(
      ssd_constants.NORMALIZATION_STD)[tf.newaxis, tf.newaxis, :]
  return image
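# The two ops above broadcast a length-3 channel vector over an HWC image. A
# quick NumPy equivalent, assuming the image is already scaled to [0, 1] and
# using the ImageNet mean/std that the PyTorch transformer below logs (the
# actual ssd_constants values are not shown in this section):

import numpy as np

mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
std = np.array([0.229, 0.224, 0.225], dtype=np.float32)

image = np.random.rand(300, 300, 3).astype(np.float32)  # HWC in [0, 1]
normalized = ((image - mean[np.newaxis, np.newaxis, :])
              / std[np.newaxis, np.newaxis, :])
print(normalized.shape)  # (300, 300, 3), each channel zero-centered and scaled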
def dboxes300_coco():
    figsize = 300
    feat_size = [38, 19, 10, 5, 3, 1]
    mlperf_log.ssd_print(key=mlperf_log.FEATURE_SIZES, value=feat_size)

    steps = [8, 16, 32, 64, 100, 300]
    mlperf_log.ssd_print(key=mlperf_log.STEPS, value=steps)

    # use the scales here: https://github.com/amdegroot/ssd.pytorch/blob/master/data/config.py
    scales = [21, 45, 99, 153, 207, 261, 315]
    mlperf_log.ssd_print(key=mlperf_log.SCALES, value=scales)

    aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
    mlperf_log.ssd_print(key=mlperf_log.ASPECT_RATIOS, value=aspect_ratios)

    dboxes = DefaultBoxes(figsize, feat_size, steps, scales, aspect_ratios)
    mlperf_log.ssd_print(key=mlperf_log.NUM_DEFAULTS,
                         value=len(dboxes.default_boxes))
    return dboxes
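# For reference, the default-box count this function logs follows directly from
# feat_size and aspect_ratios: every cell gets two "square" boxes (scale s_k and
# sqrt(s_k * s_{k+1})) plus two boxes per extra aspect ratio (the ratio and its
# reciprocal). A short sketch of that arithmetic for the SSD300 configuration
# above:

feat_size = [38, 19, 10, 5, 3, 1]
aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]

num_defaults = [2 + 2 * len(ar) for ar in aspect_ratios]  # [4, 6, 6, 6, 4, 4]
total = sum(f * f * n for f, n in zip(feat_size, num_defaults))
print(total)  # 8732 default boxes across all feature maps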
def __init__(self):
    self.sample_options = (
        # Do nothing
        None,
        # min IoU, max IoU
        (0.1, None),
        (0.3, None),
        (0.5, None),
        (0.7, None),
        (0.9, None),
        # no IoU requirements
        (None, None),
    )
    # Implementation uses 1 iteration to find a possible candidate, this
    # was shown to produce the same mAP as using more iterations.
    self.num_cropping_iterations = 1
    mlperf_log.ssd_print(key=mlperf_log.NUM_CROPPING_ITERATIONS,
                         value=self.num_cropping_iterations)
def __init__(self, dboxes, size=(300, 300), val=False):
    # define vgg16 mean
    self.size = size
    self.val = val

    self.dboxes_ = dboxes  # DefaultBoxes300()
    self.encoder = Encoder(self.dboxes_)

    self.crop = SSDCropping()
    self.img_trans = transforms.Compose([
        transforms.Resize(self.size),
        # transforms.Resize((300, 300)),
        # transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.125, contrast=0.5,
                               saturation=0.5, hue=0.05),
        transforms.ToTensor()
        # LightingNoice(),
    ])
    self.hflip = RandomHorizontalFlip()

    # All PyTorch tensors will be normalized.
    # https://discuss.pytorch.org/t/how-to-preprocess-input-for-pre-trained-networks/683
    normalization_mean = [0.485, 0.456, 0.406]
    normalization_std = [0.229, 0.224, 0.225]
    mlperf_log.ssd_print(key=mlperf_log.DATA_NORMALIZATION_MEAN,
                         value=normalization_mean)
    mlperf_log.ssd_print(key=mlperf_log.DATA_NORMALIZATION_STD,
                         value=normalization_std)
    self.normalize = transforms.Normalize(mean=normalization_mean,
                                          std=normalization_std)
    # self.normalize = transforms.Normalize(mean=[104.0, 117.0, 123.0],
    #                                       std=[1.0, 1.0, 1.0])

    self.trans_val = transforms.Compose([
        transforms.Resize(self.size),
        transforms.ToTensor(),
        # ToTensor(),
        self.normalize,
    ])
def __init__(self, label_num, backbone='resnet34',
             model_path="./resnet34-333f7ec4.pth"):
    super(SSD300, self).__init__()

    self.label_num = label_num

    if backbone == 'resnet34':
        self.model = ResNet34()
        mlperf_log.ssd_print(key=mlperf_log.BACKBONE, value='resnet34')
        out_channels = 256
        out_size = 38
        self.out_chan = [out_channels, 512, 512, 256, 256, 256]
        mlperf_log.ssd_print(key=mlperf_log.LOC_CONF_OUT_CHANNELS,
                             value=self.out_chan)
    else:
        raise ValueError('Invalid backbone chosen')

    self._build_additional_features(out_size, self.out_chan)

    # after l2norm, conv7, conv8_2, conv9_2, conv10_2, conv11_2
    # classifier 1, 2, 3, 4, 5, 6
    self.num_defaults = [4, 6, 6, 6, 4, 4]
    mlperf_log.ssd_print(key=mlperf_log.NUM_DEFAULTS_PER_CELL,
                         value=self.num_defaults)
    self.loc = []
    self.conf = []

    for nd, oc in zip(self.num_defaults, self.out_chan):
        # self.loc.append(bf16cutfp_mod())
        self.loc.append(nn.Conv2d(oc, nd * 4, kernel_size=3, padding=1))
        # self.loc.append(bf16cutbp_mod())
        # self.conf.append(bf16cutfp_mod())
        self.conf.append(
            nn.Conv2d(oc, nd * label_num, kernel_size=3, padding=1))
        # self.conf.append(bf16cutbp_mod())

    self.loc = nn.ModuleList(self.loc)
    self.conf = nn.ModuleList(self.conf)
    # initialize all weights
    self._init_weights()
def main():
    args = parse_args()

    if not os.path.isdir('./models'):
        os.mkdir('./models')

    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    torch.backends.cudnn.benchmark = True

    # start timing here
    mlperf_log.ssd_print(key=mlperf_log.RUN_START)

    success = train300_mlperf_coco(args)

    # end timing here
    mlperf_log.ssd_print(key=mlperf_log.RUN_STOP, value={"success": success})
    mlperf_log.ssd_print(key=mlperf_log.RUN_FINAL)
def __init__(self):
  mlperf_log.ssd_print(key=mlperf_log.STEPS, value=ssd_constants.STEPS)
  fk = ssd_constants.IMAGE_SIZE / np.array(ssd_constants.STEPS)

  self.default_boxes = []
  mlperf_log.ssd_print(key=mlperf_log.ASPECT_RATIOS,
                       value=ssd_constants.ASPECT_RATIOS)
  # size of feature and number of feature
  for idx, feature_size in enumerate(ssd_constants.FEATURE_SIZES):
    sk1 = ssd_constants.SCALES[idx] / ssd_constants.IMAGE_SIZE
    sk2 = ssd_constants.SCALES[idx + 1] / ssd_constants.IMAGE_SIZE
    sk3 = math.sqrt(sk1 * sk2)
    all_sizes = [(sk1, sk1), (sk3, sk3)]

    for alpha in ssd_constants.ASPECT_RATIOS[idx]:
      w, h = sk1 * math.sqrt(alpha), sk1 / math.sqrt(alpha)
      all_sizes.append((w, h))
      all_sizes.append((h, w))

    assert len(all_sizes) == ssd_constants.NUM_DEFAULTS[idx]

    for w, h in all_sizes:
      for i, j in it.product(range(feature_size), repeat=2):
        cx, cy = (j + 0.5) / fk[idx], (i + 0.5) / fk[idx]
        box = tuple(np.clip(k, 0, 1) for k in (cy, cx, h, w))
        self.default_boxes.append(box)

  mlperf_log.ssd_print(key=mlperf_log.NUM_DEFAULTS,
                       value=len(self.default_boxes))
  assert len(self.default_boxes) == ssd_constants.NUM_SSD_BOXES

  def to_ltrb(cy, cx, h, w):
    return cy - h / 2, cx - w / 2, cy + h / 2, cx + w / 2

  # For IoU calculation
  self.default_boxes_ltrb = tuple(to_ltrb(*i) for i in self.default_boxes)
def train300_mlperf_coco(args):
    from coco import COCO

    # Check that GPUs are actually available
    if not torch.cuda.is_available():
        print("Error. No GPU available.")
        return False

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    input_size = 300
    train_trans = SSDTransformer(dboxes, (input_size, input_size), val=False)
    val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_SIZE, value=input_size)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)

    train_pipe = COCOPipeline(args.batch_size, train_coco_root, train_annotate,
                              dboxes, args.seed)
    train_pipe.build()
    train_loader = DALIGenericIterator(train_pipe,
                                       ["images", "boxes", "labels"],
                                       train_pipe.epoch_size("Reader"))

    mlperf_log.ssd_print(key=mlperf_log.INPUT_SHARD, value=None)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_ORDER)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.batch_size)

    ssd300 = SSD300(train_coco.labelnum)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        od = torch.load(args.checkpoint)
        ssd300.load_state_dict(od["model"])
    ssd300.train()
    ssd300.cuda()
    loss_func = Loss(dboxes)
    loss_func.cuda()

    current_lr = 1e-3
    current_momentum = 0.9
    current_weight_decay = 5e-4
    optim = torch.optim.SGD(ssd300.parameters(), lr=current_lr,
                            momentum=current_momentum,
                            weight_decay=current_weight_decay)
    mlperf_log.ssd_print(key=mlperf_log.OPT_NAME, value="SGD")
    mlperf_log.ssd_print(key=mlperf_log.OPT_LR, value=current_lr)
    mlperf_log.ssd_print(key=mlperf_log.OPT_MOMENTUM, value=current_momentum)
    mlperf_log.ssd_print(key=mlperf_log.OPT_WEIGHT_DECAY,
                         value=current_weight_decay)

    print("epoch", "nbatch", "loss")

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v: k for k, v in val_coco.label_map.items()}

    mean, std = generate_mean_std()

    data_perf = AverageMeter()
    batch_perf = AverageMeter()
    end = time.time()
    train_start = end

    mlperf_log.ssd_print(key=mlperf_log.TRAIN_LOOP)
    for epoch in range(args.epochs):
        mlperf_log.ssd_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        for nbatch, data in enumerate(train_loader):
            img = data[0]["images"]
            bbox = data[0]["boxes"]
            label = data[0]["labels"]

            boxes_in_batch = len(label.nonzero())
            if boxes_in_batch == 0:
                print("No labels in batch")
                continue

            label = label.type(torch.cuda.LongTensor)

            img = Variable(img, requires_grad=True)
            trans_bbox = bbox.transpose(1, 2).contiguous()
            gloc, glabel = Variable(trans_bbox, requires_grad=False), \
                           Variable(label, requires_grad=False)

            data_perf.update(time.time() - end)

            if iter_num == 160000:
                current_lr = 1e-4
                print("")
                print("lr decay step #1")
                for param_group in optim.param_groups:
                    param_group['lr'] = current_lr
                mlperf_log.ssd_print(key=mlperf_log.OPT_LR, value=current_lr)

            if iter_num == 200000:
                current_lr = 1e-5
                print("")
                print("lr decay step #2")
                for param_group in optim.param_groups:
                    param_group['lr'] = current_lr
                mlperf_log.ssd_print(key=mlperf_log.OPT_LR, value=current_lr)

            ploc, plabel = ssd300(img)
            loss = loss_func(ploc, plabel, gloc, glabel)

            if not np.isinf(loss.item()):
                avg_loss = 0.999 * avg_loss + 0.001 * loss.item()

            optim.zero_grad()
            loss.backward()
            optim.step()

            batch_perf.update(time.time() - end)

            if iter_num in args.evaluation:
                if not args.no_save:
                    print("")
                    print("saving model...")
                    torch.save({"model": ssd300.state_dict(),
                                "label_map": train_coco.label_info},
                               "./models/iter_{}.pt".format(iter_num))

                try:
                    if coco_eval(ssd300, val_coco, cocoGt, encoder, inv_map,
                                 args.threshold, epoch, iter_num):
                        return True
                except Exception:
                    print("Eval error on iteration {0}".format(iter_num))

            print("Iteration: {:6d}, Loss function: {:5.3f}, "
                  "Average Loss: {:.3f}, Data perf: {:3f} img/sec, "
                  "Batch perf: {:3f} img/sec, Avg Data perf: {:3f} img/sec, "
                  "Avg Batch perf: {:3f} img/sec"
                  .format(iter_num, loss.item(), avg_loss,
                          args.batch_size / data_perf.val,
                          args.batch_size / batch_perf.val,
                          args.batch_size / data_perf.avg,
                          args.batch_size / batch_perf.avg), end="\r")

            end = time.time()
            iter_num += 1

            if iter_num == 10 and epoch == 0:
                data_perf.reset()
                batch_perf.reset()

        train_loader.reset()

    print("\n\n")
    print("Training end: Data perf: {:3f} img/sec, "
          "Batch perf: {:3f} img/sec, Total time: {:3f} sec"
          .format(args.batch_size / data_perf.avg,
                  args.batch_size / batch_perf.avg,
                  time.time() - train_start))
    return False
def __init__(self, p=0.5):
    self.p = p
    mlperf_log.ssd_print(key=mlperf_log.RANDOM_FLIP_PROBABILITY, value=self.p)
def ssd(features, params, is_training_bn=False):
  """SSD classification and regression model."""
  # upward layers
  with tf.variable_scope('resnet%s' % ssd_constants.RESNET_DEPTH):
    resnet_fn = resnet_v1(ssd_constants.RESNET_DEPTH, params)
    _, _, u4, _ = resnet_fn(features, is_training_bn)

  with tf.variable_scope('ssd'):
    feats = {}
    # output channels for mlperf logging.
    out_channels = [256]
    feats[3] = u4
    feats[4] = tf.layers.conv2d(feats[3], filters=256, kernel_size=(1, 1),
                                padding='same', activation=tf.nn.relu,
                                name='block7-conv1x1')
    feats[4] = tf.layers.conv2d(feats[4], filters=512, strides=(2, 2),
                                kernel_size=(3, 3), padding='same',
                                activation=tf.nn.relu, name='block7-conv3x3')
    out_channels.append(512)
    feats[5] = tf.layers.conv2d(feats[4], filters=256, kernel_size=(1, 1),
                                padding='same', activation=tf.nn.relu,
                                name='block8-conv1x1')
    feats[5] = tf.layers.conv2d(feats[5], filters=512, strides=(2, 2),
                                kernel_size=(3, 3), padding='same',
                                activation=tf.nn.relu, name='block8-conv3x3')
    out_channels.append(512)
    feats[6] = tf.layers.conv2d(feats[5], filters=128, kernel_size=(1, 1),
                                padding='same', activation=tf.nn.relu,
                                name='block9-conv1x1')
    feats[6] = tf.layers.conv2d(feats[6], filters=256, strides=(2, 2),
                                kernel_size=(3, 3), padding='same',
                                activation=tf.nn.relu, name='block9-conv3x3')
    out_channels.append(256)
    feats[7] = tf.layers.conv2d(feats[6], filters=128, kernel_size=(1, 1),
                                padding='same', activation=tf.nn.relu,
                                name='block10-conv1x1')
    feats[7] = tf.layers.conv2d(feats[7], filters=256, kernel_size=(3, 3),
                                padding='valid', activation=tf.nn.relu,
                                name='block10-conv3x3')
    out_channels.append(256)
    feats[8] = tf.layers.conv2d(feats[7], filters=128, kernel_size=(1, 1),
                                padding='same', activation=tf.nn.relu,
                                name='block11-conv1x1')
    feats[8] = tf.layers.conv2d(feats[8], filters=256, kernel_size=(3, 3),
                                padding='valid', activation=tf.nn.relu,
                                name='block11-conv3x3')
    out_channels.append(256)
    mlperf_log.ssd_print(key=mlperf_log.LOC_CONF_OUT_CHANNELS,
                         value=out_channels)

    class_outputs = {}
    box_outputs = {}
    min_level = ssd_constants.MIN_LEVEL
    max_level = ssd_constants.MAX_LEVEL
    num_classes = ssd_constants.NUM_CLASSES

    mlperf_log.ssd_print(key=mlperf_log.NUM_DEFAULTS_PER_CELL,
                         value=ssd_constants.NUM_DEFAULTS)
    with tf.variable_scope('class_net', reuse=tf.AUTO_REUSE):
      for level in range(min_level, max_level + 1):
        class_outputs[level] = class_net(feats[level], level, num_classes)

    with tf.variable_scope('box_net', reuse=tf.AUTO_REUSE):
      for level in range(min_level, max_level + 1):
        box_outputs[level] = box_net(feats[level], level)

  return class_outputs, box_outputs
def main(argv):
  del argv  # Unused.
  global SUCCESS
  print(FLAGS.model_dir)
  if FLAGS.model_dir:
    print(FLAGS.model_dir)
  else:
    print(FLAGS.training_file_pattern)
    raise Exception('No model dir')

  # Check data path
  if FLAGS.mode in ('train',
                    'train_and_eval') and FLAGS.training_file_pattern is None:
    raise RuntimeError('You must specify --training_file_pattern for training.')
  if FLAGS.mode in ('eval', 'train_and_eval', 'eval_once'):
    if FLAGS.validation_file_pattern is None:
      raise RuntimeError('You must specify --validation_file_pattern '
                         'for evaluation.')
    if FLAGS.val_json_file is None:
      raise RuntimeError('You must specify --val_json_file for evaluation.')

  run_config, params = construct_run_config(FLAGS.iterations_per_loop)

  if FLAGS.mode != 'eval' and FLAGS.mode != 'eval_once':
    if params['train_with_low_level_api']:
      params['batch_size'] = FLAGS.train_batch_size // FLAGS.num_shards
      trunner = train_low_level_runner.TrainLowLevelRunner(
          iterations=FLAGS.iterations_per_loop)
      input_fn = dataloader.SSDInputReader(
          FLAGS.training_file_pattern,
          params['transpose_input'],
          is_training=True,
          use_fake_data=FLAGS.use_fake_data)
      mlperf_log.ssd_print(key=mlperf_log.RUN_START)
      trunner.initialize(input_fn, ssd_model.ssd_model_fn, params)
    else:
      mlperf_log.ssd_print(key=mlperf_log.RUN_START)

  if FLAGS.mode in ('eval', 'train_and_eval', 'eval_once'):
    if params['eval_with_low_level_api']:
      params['batch_size'] = FLAGS.eval_batch_size // FLAGS.num_shards
      erunner = eval_low_level_runner.EvalLowLevelRunner(
          eval_steps=int(FLAGS.eval_samples / FLAGS.eval_batch_size))
      input_fn = dataloader.SSDInputReader(
          FLAGS.validation_file_pattern,
          is_training=False,
          use_fake_data=FLAGS.use_fake_data)
      erunner.initialize(input_fn, params)
      erunner.build_model(ssd_model.ssd_model_fn, params)

  # TPU Estimator
  if FLAGS.mode == 'train':
    if params['train_with_low_level_api']:
      train_steps = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                        FLAGS.train_batch_size)
      mlperf_log.ssd_print(key=mlperf_log.TRAIN_LOOP)
      mlperf_log.ssd_print(key=mlperf_log.TRAIN_EPOCH, value=0)
      trunner.train(train_steps)
      trunner.shutdown()
    else:
      if FLAGS.device == 'gpu':
        params['dataset_num_shards'] = 1
        params['dataset_index'] = 0
        train_params = dict(params)
        train_params['batch_size'] = FLAGS.train_batch_size
        train_estimator = tf.estimator.Estimator(
            model_fn=ssd_model.ssd_model_fn,
            model_dir=FLAGS.model_dir,
            config=run_config,
            params=train_params)
      else:
        train_estimator = tpu_estimator.TPUEstimator(
            model_fn=ssd_model.ssd_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            config=run_config,
            params=params)

      tf.logging.info(params)
      mlperf_log.ssd_print(key=mlperf_log.TRAIN_LOOP)
      mlperf_log.ssd_print(key=mlperf_log.TRAIN_EPOCH, value=0)
      hooks = []
      if FLAGS.use_async_checkpoint:
        hooks.append(
            async_checkpoint.AsyncCheckpointSaverHook(
                checkpoint_dir=FLAGS.model_dir,
                save_steps=max(100, FLAGS.iterations_per_loop)))

      train_estimator.train(
          input_fn=dataloader.SSDInputReader(
              FLAGS.training_file_pattern,
              params['transpose_input'],
              is_training=True,
              use_fake_data=FLAGS.use_fake_data),
          steps=int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size),
          hooks=hooks)

      if FLAGS.eval_after_training:
        eval_estimator = tpu_estimator.TPUEstimator(
            model_fn=ssd_model.ssd_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            predict_batch_size=FLAGS.eval_batch_size,
            config=run_config,
            params=params)

        predictions = list(
            eval_estimator.predict(
                input_fn=dataloader.SSDInputReader(
                    FLAGS.validation_file_pattern,
                    is_training=False,
                    use_fake_data=FLAGS.use_fake_data)))
        eval_results = coco_metric.compute_map(predictions, FLAGS.val_json_file)
        tf.logging.info('Eval results: %s' % eval_results)

  elif FLAGS.mode == 'train_and_eval':
    output_dir = os.path.join(FLAGS.model_dir, 'eval')
    tf.gfile.MakeDirs(output_dir)
    # Summary writer writes out eval metrics.
    summary_writer = tf.summary.FileWriter(output_dir)

    current_step = 0
    mlperf_log.ssd_print(key=mlperf_log.TRAIN_LOOP)
    threads = []
    for eval_step in ssd_constants.EVAL_STEPS:
      # Compute the actual eval steps based on the actual train_batch_size.
      steps = int(eval_step * ssd_constants.DEFAULT_BATCH_SIZE /
                  FLAGS.train_batch_size)
      current_epoch = current_step // params['steps_per_epoch']
      # TODO(wangtao): figure out how to log for each epoch.
      mlperf_log.ssd_print(key=mlperf_log.TRAIN_EPOCH, value=current_epoch)

      tf.logging.info('Starting training cycle for %d steps.' % steps)
      if params['train_with_low_level_api']:
        trunner.train(steps)
      else:
        run_config, params = construct_run_config(steps)
        if FLAGS.device == 'gpu':
          train_params = dict(params)
          train_params['batch_size'] = FLAGS.train_batch_size
          train_estimator = tf.estimator.Estimator(
              model_fn=ssd_model.ssd_model_fn,
              model_dir=FLAGS.model_dir,
              config=run_config,
              params=train_params)
        else:
          train_estimator = tpu_estimator.TPUEstimator(
              model_fn=ssd_model.ssd_model_fn,
              use_tpu=FLAGS.use_tpu,
              train_batch_size=FLAGS.train_batch_size,
              config=run_config,
              params=params)

        tf.logging.info(params)
        train_estimator.train(
            input_fn=dataloader.SSDInputReader(
                FLAGS.training_file_pattern,
                params['transpose_input'],
                is_training=True,
                use_fake_data=FLAGS.use_fake_data),
            steps=steps)

      if SUCCESS:
        break

      current_step = current_step + steps
      current_epoch = current_step // params['steps_per_epoch']
      tf.logging.info('Starting evaluation cycle at step %d.' % current_step)
      mlperf_log.ssd_print(key=mlperf_log.EVAL_START, value=current_epoch)

      # Run evaluation at the given step.
      if params['eval_with_low_level_api']:
        predictions = list(erunner.predict())
      else:
        if FLAGS.device == 'gpu':
          eval_params = dict(params)
          eval_params['batch_size'] = FLAGS.eval_batch_size
          eval_estimator = tf.estimator.Estimator(
              model_fn=ssd_model.ssd_model_fn,
              model_dir=FLAGS.model_dir,
              config=run_config,
              params=eval_params)
        else:
          eval_estimator = tpu_estimator.TPUEstimator(
              model_fn=ssd_model.ssd_model_fn,
              use_tpu=FLAGS.use_tpu,
              train_batch_size=FLAGS.train_batch_size,
              predict_batch_size=FLAGS.eval_batch_size,
              config=run_config,
              params=params)

        predictions = list(
            eval_estimator.predict(
                input_fn=dataloader.SSDInputReader(
                    FLAGS.validation_file_pattern,
                    is_training=False,
                    use_fake_data=FLAGS.use_fake_data)))

      t = threading.Thread(
          target=coco_eval,
          args=(predictions, current_epoch, current_step, summary_writer))
      threads.append(t)
      t.start()

    trunner.shutdown()

    for t in threads:
      t.join()

    # success is a string right now as boolean is not JSON serializable.
    if not SUCCESS:
      mlperf_log.ssd_print(key=mlperf_log.RUN_STOP, value={'success': 'false'})
      mlperf_log.ssd_print(key=mlperf_log.RUN_FINAL)
    summary_writer.close()

  elif FLAGS.mode == 'eval':
    if not params['eval_with_low_level_api']:
      if FLAGS.device == 'gpu':
        eval_params = dict(params)
        eval_params['batch_size'] = FLAGS.eval_batch_size
        eval_estimator = tf.estimator.Estimator(
            model_fn=ssd_model.ssd_model_fn,
            model_dir=FLAGS.model_dir,
            config=run_config,
            params=eval_params)
      else:
        eval_estimator = tpu_estimator.TPUEstimator(
            model_fn=ssd_model.ssd_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            predict_batch_size=FLAGS.eval_batch_size,
            config=run_config,
            params=params)

    output_dir = os.path.join(FLAGS.model_dir, 'eval')
    tf.gfile.MakeDirs(output_dir)
    # Summary writer writes out eval metrics.
    summary_writer = tf.summary.FileWriter(output_dir)

    eval_steps = np.cumsum(ssd_constants.EVAL_STEPS).tolist()
    eval_epochs = [
        steps * ssd_constants.DEFAULT_BATCH_SIZE / FLAGS.train_batch_size //
        params['steps_per_epoch'] for steps in eval_steps
    ]
    # For 8x8 slices and above.
    if FLAGS.train_batch_size >= 4096:
      eval_epochs = [i * 2 for i in eval_epochs]
    tf.logging.info('Eval epochs: %s' % eval_epochs)

    # Run evaluation when there's a new checkpoint.
    threads = []
    count = 1
    for ckpt in next_checkpoint(FLAGS.model_dir):
      print("current count is {}\n".format(count))
      count += 1
      if SUCCESS:
        break

      current_step = int(os.path.basename(ckpt).split('-')[1])
      current_epoch = current_step // params['steps_per_epoch']
      tf.logging.info('current step: %s' % current_step)
      tf.logging.info('current epoch: %s' % current_epoch)
      if (not params['eval_every_checkpoint'] and
          current_epoch not in eval_epochs):
        continue

      tf.logging.info('Starting to evaluate.')
      try:
        mlperf_log.ssd_print(key=mlperf_log.EVAL_START, value=current_epoch)
        if params['eval_with_low_level_api']:
          predictions = list(erunner.predict(checkpoint_path=ckpt))
        else:
          predictions = list(
              eval_estimator.predict(
                  checkpoint_path=ckpt,
                  input_fn=dataloader.SSDInputReader(
                      FLAGS.validation_file_pattern,
                      is_training=False,
                      use_fake_data=FLAGS.use_fake_data)))

        t = threading.Thread(
            target=coco_eval,
            args=(predictions, current_epoch, current_step, summary_writer))
        threads.append(t)
        t.start()

        # Terminate eval job when final checkpoint is reached.
        total_step = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                         FLAGS.train_batch_size)
        if current_step >= total_step:
          tf.logging.info('Evaluation finished after training step %d' %
                          current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint' %
                        ckpt)

    for t in threads:
      t.join()

    if not SUCCESS:
      mlperf_log.ssd_print(key=mlperf_log.RUN_STOP, value={'success': 'false'})
      mlperf_log.ssd_print(key=mlperf_log.RUN_FINAL)
    summary_writer.close()

  elif FLAGS.mode == 'eval_once':
    if not params['eval_with_low_level_api']:
      eval_estimator = tpu_estimator.TPUEstimator(
          model_fn=ssd_model.ssd_model_fn,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          predict_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=params)

    output_dir = os.path.join(FLAGS.model_dir, 'eval')
    tf.gfile.MakeDirs(output_dir)
    # Summary writer writes out eval metrics.
    summary_writer = tf.summary.FileWriter(output_dir)

    # Run evaluation when there's a new checkpoint.
    for ckpt in next_checkpoint(FLAGS.model_dir):
      current_step = int(os.path.basename(ckpt).split('-')[1])
      current_epoch = current_step // params['steps_per_epoch']
      print('current epoch: %s' % current_epoch)
      if FLAGS.eval_epoch < current_epoch:
        break
      if FLAGS.eval_epoch > current_epoch:
        continue

      tf.logging.info('Starting to evaluate.')
      try:
        mlperf_log.ssd_print(key=mlperf_log.EVAL_START, value=current_epoch)
        if params['eval_with_low_level_api']:
          predictions = list(erunner.predict(checkpoint_path=ckpt))
        else:
          predictions = list(
              eval_estimator.predict(
                  checkpoint_path=ckpt,
                  input_fn=dataloader.SSDInputReader(
                      FLAGS.validation_file_pattern,
                      is_training=False,
                      use_fake_data=FLAGS.use_fake_data)))
        coco_eval(predictions, current_epoch, current_step, summary_writer)

        # Terminate eval job when final checkpoint is reached.
        total_step = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                         FLAGS.train_batch_size)
        if current_step >= total_step:
          if not SUCCESS:
            mlperf_log.ssd_print(key=mlperf_log.RUN_STOP,
                                 value={'success': 'false'})
            mlperf_log.ssd_print(key=mlperf_log.RUN_FINAL)
          print('Evaluation finished after training step %d' % current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        print('Checkpoint %s no longer exists, skipping checkpoint' % ckpt)

    print('%d ending' % FLAGS.eval_epoch)
    summary_writer.close()
def coco_eval(predictions, current_epoch, current_step, summary_writer):
  """Call the coco library to get the eval metrics."""
  global SUCCESS
  eval_results = coco_metric.compute_map(predictions, FLAGS.val_json_file)
  mlperf_log.ssd_print(key=mlperf_log.EVAL_STOP, value=current_epoch)
  mlperf_log.ssd_print(key=mlperf_log.EVAL_SIZE, value=FLAGS.eval_samples)
  mlperf_log.ssd_print(key=mlperf_log.EVAL_ACCURACY,
                       value={
                           'epoch': current_epoch,
                           'value': eval_results['COCO/AP']
                       })
  mlperf_log.ssd_print(key=mlperf_log.EVAL_TARGET,
                       value=ssd_constants.EVAL_TARGET)
  mlperf_log.ssd_print(key=mlperf_log.EVAL_ITERATION_ACCURACY,
                       value={
                           'iteration': current_step,
                           'value': eval_results['COCO/AP']
                       })
  print("The coco AP is: {}\n".format(eval_results['COCO/AP']))
  if eval_results['COCO/AP'] >= ssd_constants.EVAL_TARGET and not SUCCESS:
    mlperf_log.ssd_print(key=mlperf_log.RUN_STOP, value={'success': 'true'})
    mlperf_log.ssd_print(key=mlperf_log.RUN_FINAL)
    SUCCESS = True

  tf.logging.info('Eval results: %s' % eval_results)

  # Write out eval results for the checkpoint.
  with tf.Graph().as_default():
    summaries = []
    for metric in eval_results:
      summaries.append(
          tf.Summary.Value(tag=metric, simple_value=eval_results[metric]))
    tf_summary = tf.Summary(value=list(summaries))
    summary_writer.add_summary(tf_summary, current_step)
def coco_eval(model, coco, cocoGt, encoder, inv_map, threshold,
              epoch, iteration, use_cuda=True):
    from pycocotools.cocoeval import COCOeval
    print("")
    model.eval()
    if use_cuda:
        model.cuda()
    ret = []

    overlap_threshold = 0.50
    nms_max_detections = 200
    mlperf_log.ssd_print(key=mlperf_log.NMS_THRESHOLD, value=overlap_threshold)
    mlperf_log.ssd_print(key=mlperf_log.NMS_MAX_DETECTIONS,
                         value=nms_max_detections)

    mlperf_log.ssd_print(key=mlperf_log.EVAL_START, value=epoch)

    start = time.time()
    for idx, image_id in enumerate(coco.img_keys):
        img, (htot, wtot), _, _ = coco[idx]

        with torch.no_grad():
            print("Parsing image: {}/{}".format(idx + 1, len(coco)), end="\r")
            inp = img.unsqueeze(0)
            if use_cuda:
                inp = inp.cuda()
            ploc, plabel = model(inp)

            try:
                result = encoder.decode_batch(ploc, plabel, overlap_threshold,
                                              nms_max_detections)[0]
            except Exception:
                # raise
                print("")
                print("No object detected in idx: {}".format(idx))
                continue

            loc, label, prob = [r.cpu().numpy() for r in result]
            for loc_, label_, prob_ in zip(loc, label, prob):
                ret.append([image_id,
                            loc_[0] * wtot,
                            loc_[1] * htot,
                            (loc_[2] - loc_[0]) * wtot,
                            (loc_[3] - loc_[1]) * htot,
                            prob_,
                            inv_map[label_]])
    print("")
    print("Predicting Ended, total time: {:.2f} s".format(time.time() - start))

    cocoDt = cocoGt.loadRes(np.array(ret))

    E = COCOeval(cocoGt, cocoDt, iouType='bbox')
    E.evaluate()
    E.accumulate()
    E.summarize()
    print("Current AP: {:.5f} AP goal: {:.5f}".format(E.stats[0], threshold))

    # put your model back into training mode
    model.train()

    current_accuracy = E.stats[0]
    mlperf_log.ssd_print(key=mlperf_log.EVAL_SIZE, value=idx + 1)
    mlperf_log.ssd_print(key=mlperf_log.EVAL_ACCURACY,
                         value={"epoch": epoch, "value": current_accuracy})
    mlperf_log.ssd_print(key=mlperf_log.EVAL_ITERATION_ACCURACY,
                         value={"iteration": iteration,
                                "value": current_accuracy})
    mlperf_log.ssd_print(key=mlperf_log.EVAL_TARGET, value=threshold)
    mlperf_log.ssd_print(key=mlperf_log.EVAL_STOP, value=epoch)
    return current_accuracy >= threshold
    # Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ]
def _model_fn(features, labels, mode, params, model):
  """Model definition for the SSD model based on ResNet-50.

  Args:
    features: the input image tensor with shape [batch_size, height, width, 3].
      The height and width are fixed and equal.
    labels: the input labels in a dictionary. The labels include class targets
      and box targets which are dense label maps. The labels are generated from
      the get_input_fn function in data/dataloader.py.
    mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
    params: the dictionary that defines the hyperparameters of the model. The
      default settings are in the default_hparams function in this file.
    model: the SSD model that outputs class logits and box regression outputs.

  Returns:
    spec: the EstimatorSpec or TPUEstimatorSpec to run training, evaluation,
      or prediction.
  """
  if mode == tf.estimator.ModeKeys.PREDICT:
    labels = features
    features = labels.pop('image')

  # Manually apply the double transpose trick for training data.
  if params['transpose_input'] and mode != tf.estimator.ModeKeys.PREDICT:
    features = tf.transpose(features, [3, 0, 1, 2])
    labels[ssd_constants.BOXES] = tf.transpose(labels[ssd_constants.BOXES],
                                               [2, 0, 1])
    labels[ssd_constants.CLASSES] = tf.transpose(labels[ssd_constants.CLASSES],
                                                 [2, 0, 1])

  # Normalize the image to zero mean and unit variance.
  mlperf_log.ssd_print(key=mlperf_log.DATA_NORMALIZATION_MEAN,
                       value=ssd_constants.NORMALIZATION_MEAN)
  mlperf_log.ssd_print(key=mlperf_log.DATA_NORMALIZATION_STD,
                       value=ssd_constants.NORMALIZATION_STD)
  features -= tf.constant(ssd_constants.NORMALIZATION_MEAN, shape=[1, 1, 3],
                          dtype=features.dtype)
  features /= tf.constant(ssd_constants.NORMALIZATION_STD, shape=[1, 1, 3],
                          dtype=features.dtype)

  def _model_outputs():
    return model(features, params,
                 is_training_bn=(mode == tf.estimator.ModeKeys.TRAIN))

  if params['use_bfloat16']:
    with bfloat16.bfloat16_scope():
      cls_outputs, box_outputs = _model_outputs()
      levels = cls_outputs.keys()
      for level in levels:
        cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
        box_outputs[level] = tf.cast(box_outputs[level], tf.float32)
  else:
    cls_outputs, box_outputs = _model_outputs()
    levels = cls_outputs.keys()

  # First check if it is in PREDICT mode.
  if mode == tf.estimator.ModeKeys.PREDICT:
    flattened_cls, flattened_box = concat_outputs(cls_outputs, box_outputs)
    mlperf_log.ssd_print(key=mlperf_log.SCALES,
                         value=ssd_constants.BOX_CODER_SCALES)
    ssd_box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
        scale_factors=ssd_constants.BOX_CODER_SCALES)

    anchors = box_list.BoxList(
        tf.convert_to_tensor(dataloader.DefaultBoxes()('ltrb')))

    decoded_boxes = box_coder.batch_decode(encoded_boxes=flattened_box,
                                           box_coder=ssd_box_coder,
                                           anchors=anchors)

    pred_scores = tf.nn.softmax(flattened_cls, axis=2)

    pred_scores, indices = select_top_k_scores(pred_scores,
                                               ssd_constants.MAX_NUM_EVAL_BOXES)

    predictions = dict(
        labels,
        indices=indices,
        pred_scores=pred_scores,
        pred_box=decoded_boxes,
    )

    if params['visualize_dataloader']:
      # this is for inference visualization.
      predictions['image'] = features

    if params['use_tpu']:
      return tpu_estimator.TPUEstimatorSpec(mode=mode, predictions=predictions)
    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

  # Load pretrained model from checkpoint.
  if params['resnet_checkpoint'] and mode == tf.estimator.ModeKeys.TRAIN:

    def scaffold_fn():
      """Loads pretrained model through scaffold function."""
      tf.train.init_from_checkpoint(
          params['resnet_checkpoint'],
          {'/': 'resnet%s/' % ssd_constants.RESNET_DEPTH})
      return tf.train.Scaffold()
  else:
    scaffold_fn = None

  # Set up training loss and learning rate.
  update_learning_rate_schedule_parameters(params)
  global_step = tf.train.get_or_create_global_step()
  learning_rate = learning_rate_schedule(params, global_step)
  mlperf_log.ssd_print(key=mlperf_log.OPT_LR, deferred=True)
  # cls_loss and box_loss are for logging; only total_loss is optimized.
  total_loss, cls_loss, box_loss = detection_loss(cls_outputs, box_outputs,
                                                  labels)

  total_loss += params['weight_decay'] * tf.add_n(
      [tf.nn.l2_loss(v) for v in tf.trainable_variables()])

  host_call = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    optimizer = tf.train.MomentumOptimizer(learning_rate,
                                           momentum=ssd_constants.MOMENTUM)
    if params['use_tpu']:
      optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

    mlperf_log.ssd_print(key=mlperf_log.OPT_NAME,
                         value='tf.train.MomentumOptimizer')
    # TODO(wangtao): figure out how to log learning rate.
    # mlperf_log.ssd_print(key=mlperf_log.OPT_LR, value=learning_rate)
    mlperf_log.ssd_print(key=mlperf_log.OPT_MOMENTUM,
                         value=ssd_constants.MOMENTUM)
    mlperf_log.ssd_print(key=mlperf_log.OPT_WEIGHT_DECAY,
                         value=params['weight_decay'])

    # Batch norm requires update_ops to be added as a train_op dependency.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    if params['device'] == 'gpu':
      # GPU uses tf.group to avoid dependency overhead on update_ops; also,
      # multi-GPU requires a different EstimatorSpec class object.
      train_op = tf.group(optimizer.minimize(total_loss, global_step),
                          update_ops)
      return model_fn_lib.EstimatorSpec(mode=mode,
                                        loss=total_loss,
                                        train_op=train_op,
                                        scaffold=scaffold_fn())
    else:
      with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(total_loss, global_step)

    if params['use_host_call']:

      def host_call_fn(global_step, total_loss, cls_loss, box_loss,
                       learning_rate):
        """Training host call. Creates scalar summaries for training metrics.

        This function is executed on the CPU and should not directly reference
        any Tensors in the rest of the `model_fn`. To pass Tensors from the
        model to the `metric_fn`, provide them as part of the `host_call`. See
        https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
        for more information.

        Arguments should match the list of `Tensor` objects passed as the
        second element in the tuple passed to `host_call`.

        Args:
          global_step: `Tensor` with shape `[batch, ]` for the global_step.
          total_loss: `Tensor` with shape `[batch, ]` for the training loss.
          cls_loss: `Tensor` with shape `[batch, ]` for the training cls loss.
          box_loss: `Tensor` with shape `[batch, ]` for the training box loss.
          learning_rate: `Tensor` with shape `[batch, ]` for the learning_rate.

        Returns:
          List of summary ops to run on the CPU host.
        """
        # Outfeed supports int32 but global_step is expected to be int64.
        global_step = tf.reduce_mean(global_step)
        # Host call fns are executed FLAGS.iterations_per_loop times after one
        # TPU loop is finished; setting max_queue to the same value as the
        # number of iterations makes the summary writer flush the data to
        # storage only once per loop.
        with (tf.contrib.summary.create_file_writer(
            params['model_dir'],
            max_queue=params['iterations_per_loop']).as_default()):
          with tf.contrib.summary.always_record_summaries():
            tf.contrib.summary.scalar('total_loss', tf.reduce_mean(total_loss),
                                      step=global_step)
            tf.contrib.summary.scalar('cls_loss', tf.reduce_mean(cls_loss),
                                      step=global_step)
            tf.contrib.summary.scalar('box_loss', tf.reduce_mean(box_loss),
                                      step=global_step)
            tf.contrib.summary.scalar('learning_rate',
                                      tf.reduce_mean(learning_rate),
                                      step=global_step)

            return tf.contrib.summary.all_summary_ops()

      # To log the loss, current learning rate, and epoch for TensorBoard, the
      # summary op needs to be run on the host CPU via host_call. host_call
      # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
      # dimension. These Tensors are implicitly concatenated to
      # [params['batch_size']].
      global_step_t = tf.reshape(global_step, [1])
      total_loss_t = tf.reshape(total_loss, [1])
      cls_loss_t = tf.reshape(cls_loss, [1])
      box_loss_t = tf.reshape(box_loss, [1])
      learning_rate_t = tf.reshape(learning_rate, [1])
      host_call = (host_call_fn,
                   [global_step_t, total_loss_t, cls_loss_t, box_loss_t,
                    learning_rate_t])
  else:
    train_op = None

  eval_metrics = None
  if mode == tf.estimator.ModeKeys.EVAL:
    raise NotImplementedError

  return tpu_estimator.TPUEstimatorSpec(mode=mode,
                                        loss=total_loss,
                                        train_op=train_op,
                                        host_call=host_call,
                                        eval_metrics=eval_metrics,
                                        scaffold_fn=scaffold_fn)
def train300_mlperf_coco(args):
    from coco import COCO

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    input_size = 300
    train_trans = SSDTransformer(dboxes, (input_size, input_size), val=False)
    val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_SIZE, value=input_size)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)

    # print("Number of labels: {}".format(train_coco.labelnum))
    train_dataloader = DataLoader(train_coco,
                                  batch_size=args.batch_size,
                                  shuffle=True,   # set shuffle=True in DataLoader
                                  num_workers=4)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_SHARD, value=None)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_ORDER)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.batch_size)

    ssd300 = SSD300(train_coco.labelnum)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        od = torch.load(args.checkpoint)
        ssd300.load_state_dict(od["model"])
    ssd300.train()
    if use_cuda:
        ssd300.cuda()
    loss_func = Loss(dboxes)
    if use_cuda:
        loss_func.cuda()

    current_lr = 1e-3
    current_momentum = 0.9
    current_weight_decay = 5e-4
    optim = torch.optim.SGD(ssd300.parameters(), lr=current_lr,
                            momentum=current_momentum,
                            weight_decay=current_weight_decay)
    mlperf_log.ssd_print(key=mlperf_log.OPT_NAME, value="SGD")
    mlperf_log.ssd_print(key=mlperf_log.OPT_LR, value=current_lr)
    mlperf_log.ssd_print(key=mlperf_log.OPT_MOMENTUM, value=current_momentum)
    mlperf_log.ssd_print(key=mlperf_log.OPT_WEIGHT_DECAY,
                         value=current_weight_decay)

    print("epoch", "nbatch", "loss")

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v: k for k, v in val_coco.label_map.items()}

    mlperf_log.ssd_print(key=mlperf_log.TRAIN_LOOP)
    for epoch in range(args.epochs):
        mlperf_log.ssd_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        for nbatch, (img, img_size, bbox, label) in enumerate(train_dataloader):
            if iter_num == 160000:
                current_lr = 1e-4
                print("")
                print("lr decay step #1")
                for param_group in optim.param_groups:
                    param_group['lr'] = current_lr
                mlperf_log.ssd_print(key=mlperf_log.OPT_LR, value=current_lr)

            if iter_num == 200000:
                current_lr = 1e-5
                print("")
                print("lr decay step #2")
                for param_group in optim.param_groups:
                    param_group['lr'] = current_lr
                mlperf_log.ssd_print(key=mlperf_log.OPT_LR, value=current_lr)

            if use_cuda:
                img = img.cuda()
            img = Variable(img, requires_grad=True)
            ploc, plabel = ssd300(img)
            trans_bbox = bbox.transpose(1, 2).contiguous()
            if use_cuda:
                trans_bbox = trans_bbox.cuda()
                label = label.cuda()
            gloc, glabel = Variable(trans_bbox, requires_grad=False), \
                           Variable(label, requires_grad=False)
            loss = loss_func(ploc, plabel, gloc, glabel)

            if not np.isinf(loss.item()):
                avg_loss = 0.999 * avg_loss + 0.001 * loss.item()
            print("Iteration: {:6d}, Loss function: {:5.3f}, "
                  "Average Loss: {:.3f}"
                  .format(iter_num, loss.item(), avg_loss), end="\r")
            optim.zero_grad()
            loss.backward()
            optim.step()

            if iter_num in args.evaluation:
                if not args.no_save:
                    print("")
                    print("saving model...")
                    torch.save({"model": ssd300.state_dict(),
                                "label_map": train_coco.label_info},
                               "./models/iter_{}.pt".format(iter_num))

                if coco_eval(ssd300, val_coco, cocoGt, encoder, inv_map,
                             args.threshold, epoch, iter_num):
                    return True

            iter_num += 1

    return False
def ssd_print(*args, **kwargs):
    barrier()
    if get_rank() == 0:
        kwargs['stack_offset'] = 2
        mlperf_log.ssd_print(*args, **kwargs)
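# This wrapper synchronizes all workers and then logs only from rank 0, so a
# multi-process run emits each compliance tag exactly once. As an illustration
# of the same gating pattern written directly against torch.distributed (the
# repo's barrier() and get_rank() helpers are not shown in this section;
# log_once is a hypothetical name, not part of the reference code):

import torch.distributed as dist


def log_once(log_fn, *args, **kwargs):
    """Call log_fn on rank 0 only; other ranks wait at the barrier first."""
    if dist.is_available() and dist.is_initialized():
        dist.barrier()             # keep workers roughly in step
        if dist.get_rank() != 0:
            return
    log_fn(*args, **kwargs)        # single-process runs always log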
def __call__(self, params):
  example_decoder = tf_example_decoder.TfExampleDecoder()

  def _parse_example(data):
    with tf.name_scope('augmentation'):
      source_id = data['source_id']
      image = tf.image.convert_image_dtype(data['image'], dtype=tf.float32)
      raw_shape = tf.shape(image)
      boxes = data['groundtruth_boxes']
      classes = tf.reshape(data['groundtruth_classes'], [-1, 1])

      # Only 80 of the 90 COCO classes are used.
      class_map = tf.convert_to_tensor(ssd_constants.CLASS_MAP)
      classes = tf.gather(class_map, classes)
      classes = tf.cast(classes, dtype=tf.float32)

      if self._is_training:
        image, boxes, classes = ssd_crop(image, boxes, classes)

        # random_horizontal_flip() is hard coded to flip with 50% chance.
        mlperf_log.ssd_print(key=mlperf_log.RANDOM_FLIP_PROBABILITY, value=0.5)
        image, boxes = preprocessor.random_horizontal_flip(
            image=image, boxes=boxes)

        # TODO(shibow): Investigate the parameters for color jitter.
        image = color_jitter(image, brightness=0.125, contrast=0.5,
                             saturation=0.5, hue=0.05)

        image = normalize_image(image)

        if params['use_bfloat16']:
          image = tf.cast(image, dtype=tf.bfloat16)

        encoded_classes, encoded_boxes, num_matched_boxes = encode_labels(
            boxes, classes)

        # TODO(taylorrobie): Check that this cast is valid.
        encoded_classes = tf.cast(encoded_classes, tf.int32)

        labels = {
            ssd_constants.NUM_MATCHED_BOXES: num_matched_boxes,
            ssd_constants.BOXES: encoded_boxes,
            ssd_constants.CLASSES: encoded_classes,
        }

        # This is for dataloader visualization; actual model doesn't use this.
        if params['visualize_dataloader']:
          box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
              scale_factors=ssd_constants.BOX_CODER_SCALES)
          decoded_boxes = tf.expand_dims(
              box_coder.decode(
                  rel_codes=tf.squeeze(encoded_boxes),
                  anchors=box_list.BoxList(
                      tf.convert_to_tensor(DefaultBoxes()('ltrb')))).get(),
              axis=0)
          labels['decoded_boxes'] = tf.squeeze(decoded_boxes)

        return image, labels

      else:
        mlperf_log.ssd_print(key=mlperf_log.INPUT_SIZE,
                             value=ssd_constants.IMAGE_SIZE)
        image = tf.image.resize_images(
            image[tf.newaxis, :, :, :],
            size=(ssd_constants.IMAGE_SIZE,
                  ssd_constants.IMAGE_SIZE))[0, :, :, :]

        image = normalize_image(image)

        if params['use_bfloat16']:
          image = tf.cast(image, dtype=tf.bfloat16)

        def trim_and_pad(inp_tensor, dim_1):
          """Limit the number of boxes, and pad if necessary."""
          inp_tensor = inp_tensor[:ssd_constants.MAX_NUM_EVAL_BOXES]
          num_pad = ssd_constants.MAX_NUM_EVAL_BOXES - tf.shape(inp_tensor)[0]
          inp_tensor = tf.pad(inp_tensor, [[0, num_pad], [0, 0]])
          return tf.reshape(inp_tensor,
                            [ssd_constants.MAX_NUM_EVAL_BOXES, dim_1])

        boxes, classes = trim_and_pad(boxes, 4), trim_and_pad(classes, 1)

        return {
            ssd_constants.IMAGE: image,
            ssd_constants.BOXES: boxes,
            ssd_constants.CLASSES: classes,
            ssd_constants.SOURCE_ID: tf.string_to_number(source_id, tf.int32),
            ssd_constants.RAW_SHAPE: raw_shape,
        }

  batch_size = params['batch_size']
  dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=False)
  mlperf_log.ssd_print(key=mlperf_log.INPUT_ORDER)
  mlperf_log.ssd_print(key=mlperf_log.INPUT_BATCH_SIZE, value=batch_size)

  if self._is_training:
    dataset = dataset.shard(
        params['context'].num_hosts,
        params['context'].current_input_fn_deployment()[1])
    dataset = dataset.shuffle(tf.to_int64(256 / params['context'].num_hosts))

  # Prefetch data from files.
  def _prefetch_dataset(filename):
    dataset = tf.data.TFRecordDataset(filename).prefetch(1)
    return dataset

  dataset = dataset.apply(
      tf.contrib.data.parallel_interleave(_prefetch_dataset,
                                          cycle_length=32,
                                          sloppy=self._is_training))

  # Parse the fetched records to input tensors for model function.
  dataset = dataset.map(example_decoder.decode, num_parallel_calls=64)

  if self._is_training:
    dataset = dataset.map(
        # pylint: disable=g-long-lambda
        lambda data: (data,
                      tf.greater(tf.shape(data['groundtruth_boxes'])[0], 0)),
        num_parallel_calls=64)
    dataset = dataset.filter(lambda data, pred: pred)
    dataset = dataset.prefetch(batch_size * 64)
    dataset = dataset.cache().apply(tf.contrib.data.shuffle_and_repeat(64))
    dataset = dataset.prefetch(batch_size * 64)
    dataset = dataset.apply(
        tf.contrib.data.map_and_batch(lambda data, _: _parse_example(data),
                                      batch_size=batch_size,
                                      drop_remainder=True,
                                      num_parallel_calls=128))
  else:
    dataset = dataset.prefetch(batch_size * 64)
    dataset = dataset.apply(
        tf.contrib.data.map_and_batch(_parse_example,
                                      batch_size=batch_size,
                                      drop_remainder=True,
                                      num_parallel_calls=128))

  # Manually apply the double transpose trick for training data.
  def _transpose_dataset(image, labels):
    image = tf.transpose(image, [1, 2, 3, 0])
    labels[ssd_constants.BOXES] = tf.transpose(labels[ssd_constants.BOXES],
                                               [1, 2, 0])
    labels[ssd_constants.CLASSES] = tf.transpose(labels[ssd_constants.CLASSES],
                                                 [1, 2, 0])
    return image, labels

  if self._transpose_input and self._is_training:
    dataset = dataset.map(_transpose_dataset, num_parallel_calls=128)

  dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)

  return dataset
def crop_proposal():
  mlperf_log.ssd_print(key=mlperf_log.NUM_CROPPING_ITERATIONS,
                       value=ssd_constants.NUM_CROP_PASSES)
  rand_vec = lambda minval, maxval: tf.random_uniform(
      shape=(ssd_constants.NUM_CROP_PASSES, 1), minval=minval, maxval=maxval,
      dtype=tf.float32)

  width, height = rand_vec(0.3, 1), rand_vec(0.3, 1)
  left, top = rand_vec(0, 1 - width), rand_vec(0, 1 - height)

  right = left + width
  bottom = top + height

  ltrb = tf.concat([left, top, right, bottom], axis=1)

  min_iou = tf.random_shuffle(ssd_constants.CROP_MIN_IOU_CHOICES)[0]
  ious = calc_iou_tensor(ltrb, boxes)

  # discard any bboxes whose center is not in the cropped image
  xc, yc = [
      tf.tile(0.5 * (boxes[:, i + 0] + boxes[:, i + 2])[tf.newaxis, :],
              (ssd_constants.NUM_CROP_PASSES, 1)) for i in range(2)
  ]

  masks = tf.reduce_all(
      tf.stack([
          tf.greater(xc, tf.tile(left, (1, num_boxes))),
          tf.less(xc, tf.tile(right, (1, num_boxes))),
          tf.greater(yc, tf.tile(top, (1, num_boxes))),
          tf.less(yc, tf.tile(bottom, (1, num_boxes))),
      ], axis=2), axis=2)

  # Checks whether a crop is valid: aspect ratio between 1:2 and 2:1, all
  # boxes above the sampled min IoU, and at least one box center retained.
  valid_aspect = tf.logical_and(tf.less(height / width, 2),
                                tf.less(width / height, 2))
  valid_ious = tf.reduce_all(tf.greater(ious, min_iou), axis=1, keepdims=True)
  valid_masks = tf.reduce_any(masks, axis=1, keepdims=True)

  valid_all = tf.cast(
      tf.reduce_all(tf.concat([valid_aspect, valid_ious, valid_masks], axis=1),
                    axis=1), tf.int32)

  # One indexed, as zero is needed for the case of no matches.
  index = tf.range(1, 1 + ssd_constants.NUM_CROP_PASSES, dtype=tf.int32)

  # Either one-hot, or zeros if there is no valid crop.
  selection = tf.equal(tf.reduce_max(index * valid_all), index)

  use_crop = tf.reduce_any(selection)
  output_ltrb = tf.reduce_sum(
      tf.multiply(ltrb,
                  tf.tile(tf.cast(selection, tf.float32)[:, tf.newaxis],
                          (1, 4))),
      axis=0)
  output_masks = tf.reduce_any(
      tf.logical_and(masks, tf.tile(selection[:, tf.newaxis], (1, num_boxes))),
      axis=0)

  return use_crop, output_ltrb, output_masks
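# The crop filter above keeps a candidate only if every ground-truth box clears
# the sampled min-IoU threshold. calc_iou_tensor itself is not shown in this
# section; below is a minimal NumPy sketch of pairwise IoU between crops and
# boxes, both taken as normalized [left, top, right, bottom] (an illustration
# under that assumption, not the repo's implementation):

import numpy as np


def pairwise_iou(crops, boxes):
    """IoU between each crop and each box; shapes (C, 4), (B, 4) -> (C, B)."""
    lt = np.maximum(crops[:, None, :2], boxes[None, :, :2])  # intersection top-left
    rb = np.minimum(crops[:, None, 2:], boxes[None, :, 2:])  # intersection bottom-right
    wh = np.clip(rb - lt, 0.0, None)
    inter = wh[..., 0] * wh[..., 1]
    area_c = (crops[:, 2] - crops[:, 0]) * (crops[:, 3] - crops[:, 1])
    area_b = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    return inter / (area_c[:, None] + area_b[None, :] - inter)


crops = np.array([[0.0, 0.0, 0.5, 0.5], [0.25, 0.25, 1.0, 1.0]])
boxes = np.array([[0.0, 0.0, 1.0, 1.0]])
print(pairwise_iou(crops, boxes))  # [[0.25], [0.5625]]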