def detect():
    """Run the detector on every VOC2012 JPEG and display the detections.

    Builds the RetinaNet graph for one 512x512x3 image, restores the
    hard-coded checkpoint and, for each file matched by the glob pattern,
    prints the path and the inference time, keeps the predictions whose
    score exceeds 0.3 and passes them to ``visual.display_instances_title``.
    Images that OpenCV cannot read are skipped.
    """
    image_pattern = '/media/dsl/20d6b919-92e1-4489-b2be-a092290668e4/VOCdevkit/VOCdevkit/VOC2012/JPEGImages/*.jpg'
    checkpoint_path = '/home/dsl/all_check/face_detect/resnet50_pasc/model.ckpt-199863'
    score_threshold = 0.3

    config.batch_size = 1
    imgs = tf.placeholder(shape=(1, 512, 512, 3), dtype=tf.float32)
    pred_loc, pred_confs, vbs = retinanet.model(imgs, config)
    box, score, pp = predict(imgs, pred_loc, pred_confs, vbs, config.Config)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        saver.restore(sess, checkpoint_path)
        for ip in glob.glob(image_pattern):
            print(ip)
            img = cv2.imread(ip)
            if img is None:
                # cv2.imread returns None instead of raising when a file
                # cannot be decoded; skip it rather than crash in cvtColor.
                print('skipping unreadable image: ' + ip)
                continue
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            org, window, scale, padding, crop = utils.resize_image(
                img, min_dim=512, max_dim=512)
            # Subtract the per-channel constants and add the batch axis.
            img = org - [123.15, 115.90, 103.06]
            img = np.expand_dims(img, axis=0)

            t = time.time()
            bx, sc, p = sess.run([box, score, pp], feed_dict={imgs: img})
            print(time.time() - t)

            bxx = []
            cls = []
            scores = []
            for one_box, one_score, one_cls in zip(bx, sc, p):
                if one_score > score_threshold:
                    bxx.append(one_box)
                    cls.append(one_cls)
                    scores.append(one_score)

            if bxx:
                # Boxes are scaled by 512 before display -- presumably they
                # are normalised to the network input size; TODO confirm.
                visual.display_instances_title(
                    org,
                    np.asarray(bxx) * 512,
                    class_ids=np.asarray(cls),
                    class_names=config.VOC_CLASSES,
                    scores=scores)
def video():
    """Run the detector on a video file and show annotated frames.

    Builds the RetinaNet graph for one 512x512x3 image, restores the
    hard-coded checkpoint and reads frames from the hard-coded video path.
    Each frame is resized, run through the network, and the predictions
    scoring above 0.4 are drawn on the original frame together with an FPS
    counter.  The loop ends when the stream runs out of frames or when the
    user presses ``q``; the capture and the OpenCV windows are always
    released on exit.
    """
    score_threshold = 0.4
    min_dim = config.Config['min_dim']

    config.batch_size = 1
    ig = tf.placeholder(shape=(1, 512, 512, 3), dtype=tf.float32)
    pred_loc, pred_confs, vbs = retinanet.model(ig, config)
    box, score, pp = predict(ig, pred_loc, pred_confs, vbs, config.Config)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        saver.restore(sess, '/home/dsl/all_check/face_detect/resnet50/model.ckpt-18756')
        cap = cv2.VideoCapture('/media/dsl/20d6b919-92e1-4489-b2be-a092290668e4/face_detect/jijing.mp4')
        # cap = cv2.VideoCapture(0)
        cap.set(cv2.CAP_PROP_FRAME_WIDTH, 320 * 3)
        cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 320 * 3)
        try:
            while True:
                ret, frame = cap.read()
                if not ret:
                    # No frame was returned (e.g. end of the file).
                    # Retrying here would spin forever, so stop.
                    break

                img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                org, window, scale, padding, crop = utils.resize_image(
                    img, min_dim=min_dim, max_dim=min_dim)
                img = org - [123.15, 115.90, 103.06]
                img = np.expand_dims(img, axis=0)

                t = time.time()
                bx, sc, p = sess.run([box, score, pp], feed_dict={ig: img})
                elapsed = time.time() - t
                # Keep one decimal place; guard against a zero-length
                # interval from a coarse clock.
                fps = int(1 / elapsed * 10) / 10.0 if elapsed > 0 else 0.0
                cv2.putText(frame, 'fps:' + str(fps), (10, 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 255),
                            lineType=cv2.LINE_AA)

                bxx = []
                cls = []
                scores = []
                for one_box, one_score, one_cls in zip(bx, sc, p):
                    if one_score > score_threshold:
                        bxx.append(one_box)
                        cls.append(one_cls)
                        scores.append(one_score)

                if bxx:
                    finbox = utils.revert_image(scale, padding, min_dim,
                                                np.asarray(bxx))
                    for ix, s in enumerate(finbox):
                        # OpenCV drawing calls need plain integer pixel
                        # coordinates.
                        x1, y1, x2, y2 = int(s[0]), int(s[1]), int(s[2]), int(s[3])
                        cv2.rectangle(frame, pt1=(x1, y1), pt2=(x2, y2),
                                      color=(0, 255, 0), thickness=2)
                        cv2.putText(frame,
                                    config.VOC_CLASSES[cls[ix]] + '_' + str(scores[ix])[0:4],
                                    (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 1.0,
                                    (0, 0, 255), lineType=cv2.LINE_AA)

                cv2.imshow('fram', frame)
                if cv2.waitKeyEx(1) & 0xFF == ord('q'):
                    break
            print('ss')
        finally:
            cap.release()
            cv2.destroyAllWindows()
def train():
    """Train the RetinaNet model for 200000 iterations.

    Creates placeholders for the image batch and the per-anchor location and
    class targets, builds the model and its loss, and optimises it with
    momentum SGD under a staircase exponential learning-rate decay.
    Variables whose name contains ``resnet_v2_50`` (excluding the optimizer's
    ``Momentum`` slots) are restored from ``config.check_dir`` when the
    supervisor initialises the session.  Every 10 global steps the losses are
    printed and the merged summaries are written.
    """
    min_dim = config.Config['min_dim']
    img = tf.placeholder(shape=[config.batch_size, min_dim, min_dim, 3],
                         dtype=tf.float32)
    # ig = AddCoords(x_dim=512,y_dim=512)(img)
    # Total anchor count: squared feature-map size times the configured
    # aspect count, summed over the five levels.
    anchors_num = sum(
        config.Config['feature_maps'][s] ** 2 * config.Config['aspect_num'][s]
        for s in range(5))

    loc = tf.placeholder(shape=[config.batch_size, anchors_num, 4], dtype=tf.float32)
    conf = tf.placeholder(shape=[config.batch_size, anchors_num], dtype=tf.float32)

    pred_loc, pred_confs, vbs = retinanet.model(img, config)
    train_tensors = get_loss(conf, loc, pred_loc, pred_confs, config)

    gen = data_gen.get_batch_inception(batch_size=config.batch_size,
                                       image_size=min_dim,
                                       max_detect=50)

    global_step = slim.get_or_create_global_step()
    lr = tf.train.exponential_decay(
        learning_rate=0.001,
        global_step=global_step,
        decay_steps=40000,
        decay_rate=0.7,
        staircase=True)
    tf.summary.scalar('lr', lr)
    sum_op = tf.summary.merge_all()

    optimizer = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)
    train_op = slim.learning.create_train_op(train_tensors, optimizer)

    # Select the backbone variables to restore from the checkpoint.
    vbs = []
    for s in slim.get_variables():
        print(s.name)
        if 'resnet_v2_50' in s.name and 'Momentum' not in s.name:
            print(s.name)
            vbs.append(s)
    saver = tf.train.Saver(vbs)

    def restore(sess):
        saver.restore(sess, config.check_dir)

    sv = tf.train.Supervisor(logdir=config.save_dir, summary_op=None, init_fn=restore)

    with sv.managed_session() as sess:
        for iteration in range(200000):
            # Rolling row of asterisks as a cheap progress indicator.
            print(' ' + ' '.join(['*'] * (iteration % 10)))

            # NOTE(review): `q` is not defined in this function while `gen`
            # is never consumed -- presumably `q` is a module-level queue;
            # confirm against the rest of the file.
            images, true_box, true_label = q.get()
            loct, conft = np_utils.get_loc_conf(true_box, true_label,
                                                batch_size=config.batch_size,
                                                cfg=config.Config)
            feed_dict = {img: images, loc: loct, conf: conft}

            # `step` is the global step fetched from the graph; the loop
            # counter has its own name so the two are not confused.
            ls, step = sess.run([train_op, global_step], feed_dict=feed_dict)

            if step % 10 == 0:
                print('step:' + str(step) + ' ' + 'class_loss:' + str(ls[0]) + ' ' + 'loc_loss:' + str(ls[1]))
                summaries = sess.run(sum_op, feed_dict=feed_dict)
                sv.summary_computed(sess, summaries)
def main(args=None):
    """Evaluate a VRD model (detector + relationships + attributes).

    Parses the command line, builds the OpenImages VRD dataset and the model,
    loads the three checkpoints and walks the data loader one image at a
    time.  With ``--store_detections`` the raw model outputs are written to a
    shelf next to the relationship checkpoint and no evaluation is run; with
    ``--load_detections`` the outputs are read back from that shelf instead
    of running the model.  Otherwise (and when loading) attribute and
    relationship predictions above ``attr_thresh`` / ``rel_thresh`` are
    collected and passed to ``dataset_val.evaluate``.

    Args:
        args: optional list of command-line arguments; ``None`` makes
            argparse read ``sys.argv``.

    Raises:
        AssertionError: if a checkpoint path is missing, or both
            ``--load_detections`` and ``--store_detections`` are given.
        ValueError: if ``--dataset`` is not ``openimages``.
    """
    import shelve  # local import: only needed for the detections cache

    parser = argparse.ArgumentParser(description='Simple training script for training a RetinaNet network.')
    parser.add_argument('--dataset', help='Dataset type; only openimages is supported.')
    parser.add_argument('--data_path', help='Path to COCO directory')
    parser.add_argument('--csv_classes', help='Path to file containing class list (see readme)')
    parser.add_argument('--csv_val', help='Path to file containing validation annotations (optional, see readme)')
    parser.add_argument('--net', help='Network to use', default='fasterrcnn')
    parser.add_argument('--set', help='Set on which evaluation will be performed', default='validation')
    parser.add_argument('--store_detections', action='store_true', default=False,
                        help='Cache all detections with very low threshold in order to enable filtering after extraction')
    parser.add_argument('--load_detections', action='store_true', default=False,
                        help='Load cached detections')
    parser.add_argument('--model_rel', help='Path to model (.pt) file for relationships.', default=None)
    parser.add_argument('--model_attr', help='Path to model (.pt) file for attributes.', default=None)
    parser.add_argument('--model_detector', help='Path to model (.pt) file for the detector.')
    parser.add_argument('--depth', help='Resnet depth, must be one of 18, 34, 50, 101, 152', type=int, default=50)

    parser = parser.parse_args(args)

    assert parser.model_rel is not None and parser.model_attr is not None and parser.model_detector is not None, \
        'Models snapshots have to be specified!'
    assert not (parser.load_detections and parser.store_detections)

    # Cached detections and evaluation output live next to the
    # relationships checkpoint.
    det_output_path = os.path.split(parser.model_rel)[0]

    if parser.dataset == 'openimages':
        dataset_val = OidDatasetVRD(parser.data_path, subset=parser.set, transform=Compose([ToTensor()]))
    else:
        raise ValueError('Dataset type not understood (must be openimages), exiting.')

    # sampler_val = AspectRatioBasedSampler(dataset_val, batch_size=1, drop_last=False)
    dataloader_val = DataLoader(dataset_val, num_workers=1, collate_fn=collate_fn, batch_size=1)

    # Create the model
    detector = create_detection_model(dataset_val.num_classes(), parser, box_score_thresh=thres)
    model = VRD(detector, dataset=dataset_val,
                train_relationships=parser.model_rel is not None,
                train_attributes=parser.model_attr is not None,
                max_objects=max_objects)

    # Load the detector
    checkpoint = torch.load(parser.model_detector, map_location=lambda storage, loc: storage)
    weights = checkpoint['model']
    weights = {k.replace('module.', ''): v for k, v in weights.items()}
    model.detector.load_state_dict(weights)
    print('Detector correctly loaded!')

    # Load the relationships and attributes heads, if needed
    if parser.model_rel:
        checkpoint = torch.load(parser.model_rel, map_location=lambda storage, loc: storage)
        weights = checkpoint['model_rel']
        weights = {k.replace('module.', ''): v for k, v in weights.items()}
        model.relationships_net.load_state_dict(weights)
        print('Relationships correctly loaded!')

    if parser.model_attr:
        checkpoint = torch.load(parser.model_attr, map_location=lambda storage, loc: storage)
        weights = checkpoint['model_attr']
        weights = {k.replace('module.', ''): v for k, v in weights.items()}
        model.attributes_net.load_state_dict(weights)
        print('Attributes correctly loaded!')

    if use_gpu:
        model = model.cuda()

    model.eval()

    all_detections = []
    detections_db = None
    if parser.load_detections or parser.store_detections:
        print('Opening detections database file...')
        flag = 'r' if parser.load_detections else 'c'
        db_path = os.path.join(det_output_path, 'cached_detections_detthr{}.db'.format(thres))
        detections_db = shelve.open(db_path, flag=flag)

    try:
        for idx, data in enumerate(tqdm.tqdm(dataloader_val)):
            if parser.load_detections:
                (scores, classification, boxes, relationships, rel_scores,
                 attributes, attr_scores) = detections_db[str(idx)]
            else:
                with torch.no_grad():
                    images, targets = data

                    # targets = [{k: v.cuda() for k, v in t.items()} for t in targets]
                    if use_gpu:
                        input_images = [image.cuda().float() for image in images]
                    else:
                        input_images = [image.float() for image in images]
                    # TODO: adapt retinanet output to the one by torchvision 0.3
                    outputs = model(input_images)
                    outputs = [{k: v.cpu() for k, v in t.items()} for t in outputs]

                output = outputs[0]  # take the only batch
                scores = output['scores']
                classification = output['labels']
                boxes = output['boxes']
                relationships = output['relationships']
                rel_scores = output['relationships_scores']
                attributes = output['attributes']
                attr_scores = output['attributes_scores']

            if parser.store_detections:
                # Caching pass: persist the raw outputs and skip collection.
                detections_db[str(idx)] = [scores, classification, boxes, relationships,
                                           rel_scores, attributes, attr_scores]
                continue

            subj_boxes_out = []
            subj_labels_out = []
            obj_boxes_out = []
            obj_labels_out = []
            rel_labels_out = []
            rel_scores_out = []

            if len(boxes) != 0:
                # Collect objects and attributes
                for j in range(attributes.shape[0]):
                    bbox = boxes[j, :4]
                    # TODO: only the top rank attribute is considered, generalize better!
                    attr = 0
                    if parser.model_attr is not None and attr_scores[j, 0] > attr_thresh:
                        attr = attributes[j, 0].item()
                    # We add an 'is' relation. 'is' relation is mapped to relation index of -1.
                    if attr != 0:
                        subj_boxes_out.append(bbox)
                        obj_boxes_out.append(bbox)
                        rel_labels_out.append(-1)
                        rel_scores_out.append(attr_scores[j, 0])
                        subj_labels_out.append(int(classification[j]))
                        obj_labels_out.append(attr)

                # Collect relationships
                for s_ind in range(relationships.shape[0]):
                    for o_ind in range(relationships.shape[1]):
                        rel = 0
                        if rel_scores[s_ind, o_ind] > rel_thresh:
                            rel = relationships[s_ind, o_ind].item()
                        if rel != 0:
                            subj_boxes_out.append(boxes[s_ind, :4])
                            obj_boxes_out.append(boxes[o_ind, :4])
                            rel_labels_out.append(rel)
                            rel_scores_out.append(rel_scores[s_ind, o_ind])
                            subj_labels_out.append(int(classification[s_ind]))
                            obj_labels_out.append(int(classification[o_ind]))

            all_detections.append([idx, subj_boxes_out, subj_labels_out, obj_boxes_out,
                                   obj_labels_out, rel_labels_out, rel_scores_out])
    finally:
        # Closing the shelf is what guarantees stored entries reach disk.
        if detections_db is not None:
            detections_db.close()

    if not parser.store_detections:
        print('Evaluating...')
        # TODO: add identification parameter to evaluate so that detections from different checkpoints are not overwritten
        dataset_val.evaluate(all_detections, det_output_path,
                             file_identifier='{}_relthr{}_attrthr{}_detthr{}'.format(
                                 parser.set, rel_thresh, attr_thresh, thres))
    print('DONE!')
def main(args=None):
    """Visualise detector predictions on the OpenImages validation subset.

    Parses the command line, builds the dataset and the detection model,
    loads the checkpoint given by ``--model`` and, for every image from the
    data loader, draws the boxes scoring above the module-level ``thres``
    with their class names and shows the image in an OpenCV window until a
    key is pressed.

    Args:
        args: optional list of command-line arguments; ``None`` makes
            argparse read ``sys.argv``.

    Raises:
        NotImplementedError: if ``--dataset`` is ``coco`` or ``csv``.
        ValueError: if ``--dataset`` is any other unrecognised value.
    """
    parser = argparse.ArgumentParser(
        description='Simple training script for training a RetinaNet network.')

    parser.add_argument('--dataset',
                        help='Dataset type, must be one of csv or coco.')
    parser.add_argument('--data_path', help='Path to COCO directory')
    parser.add_argument('--csv_classes',
                        help='Path to file containing class list (see readme)')
    parser.add_argument(
        '--csv_val',
        help='Path to file containing validation annotations (optional, see readme)')
    parser.add_argument('--net', help='Network to use', default='fasterrcnn')
    parser.add_argument('--model', help='Path to model (.pt) file.')
    parser.add_argument(
        '--depth',
        help='Resnet depth, must be one of 18, 34, 50, 101, 152',
        type=int,
        default=50)

    parser = parser.parse_args(args)

    # The coco and csv datasets are recognised but not supported by this
    # script; the construction code that used to follow each raise could
    # never execute and has been removed.
    if parser.dataset in ('coco', 'csv'):
        raise NotImplementedError()
    elif parser.dataset == 'openimages':
        dataset_val = OidDataset(parser.data_path,
                                 subset='validation',
                                 transform=Compose([ToTensor()]))
    else:
        raise ValueError(
            'Dataset type not understood (must be csv or coco), exiting.')

    sampler_val = AspectRatioBasedSampler(dataset_val,
                                          batch_size=1,
                                          drop_last=False)
    dataloader_val = DataLoader(dataset_val,
                                num_workers=1,
                                collate_fn=collate_fn,
                                batch_sampler=sampler_val)

    # Create the model
    model = create_detection_model(dataset_val.num_classes(), parser)

    checkpoint = torch.load(parser.model,
                            map_location=lambda storage, loc: storage)
    weights = checkpoint['model']
    # Drop the 'module.' prefix from the checkpoint keys.
    weights = {k.replace('module.', ''): v for k, v in weights.items()}
    model.load_state_dict(weights)

    if use_gpu:
        model = model.cuda()

    model.eval()

    def draw_caption(image, box, caption):
        """Write ``caption`` just above the top-left corner of ``box``."""
        b = np.array(box).astype(int)
        origin = (b[0], b[1] - 10)
        # Thick black text under thin white text gives an outlined label.
        cv2.putText(image, caption, origin, cv2.FONT_HERSHEY_PLAIN, 1,
                    (0, 0, 0), 2)
        cv2.putText(image, caption, origin, cv2.FONT_HERSHEY_PLAIN, 1,
                    (255, 255, 255), 1)

    for data in dataloader_val:
        with torch.no_grad():
            st = time.time()
            images, targets = data

            # targets = [{k: v.cuda() for k, v in t.items()} for t in targets]
            if use_gpu:
                input_images = [image.cuda().float() for image in images]
            else:
                input_images = [image.float() for image in images]
            # TODO: adapt retinanet output to the one by torchvision 0.3
            outputs = model(input_images)
            outputs = [{k: v.cpu() for k, v in t.items()} for t in outputs]

            output = outputs[0]  # take the only batch
            scores = output['scores']
            classification = output['labels']
            transformed_anchors = output['boxes']
            # from here, interface to the code already written in the original repo

            print('Elapsed time: {}'.format(time.time() - st))
            idxs = np.where(scores > thres)

            # Scale by 255, clamp to [0, 255], move channels last and swap
            # the channel order for OpenCV.
            img = np.array(255 * images[0]).copy()
            img[img < 0] = 0
            img[img > 255] = 255
            img = np.transpose(img, (1, 2, 0))
            img = cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_BGR2RGB)

            for det in idxs[0]:
                bbox = transformed_anchors[det, :]
                x1 = int(bbox[0])
                y1 = int(bbox[1])
                x2 = int(bbox[2])
                y2 = int(bbox[3])
                label_name = dataset_val.labels[int(classification[det])]
                draw_caption(img, (x1, y1, x2, y2), label_name)
                cv2.rectangle(img, (x1, y1), (x2, y2),
                              color=(0, 0, 255),
                              thickness=2)
                print('Detection: ' + label_name)

            cv2.imshow('img', img)
            cv2.waitKey(0)
def detect():
    """Score the detector image by image on the VOC2007 test split.

    Builds the RetinaNet graph for one 512x512x3 image, restores the
    hard-coded checkpoint and iterates over the image ids listed in the test
    split file.  For each image the ground truth is parsed from its XML
    annotation, predictions scoring above 0.4 are mapped back to the original
    image with ``utils.revert_image`` and divided by the image width/height,
    and ``eval_utils.compute_ap`` is called.  The per-image AP, precisions and
    the running mean AP are printed, and the detections are displayed.
    Images without any kept detection are skipped and do not contribute to
    the running mean.
    """
    rt = '/media/dsl/20d6b919-92e1-4489-b2be-a092290668e4/VOCdevkit/VOCdevkit/VOC2007/'
    dts = '/media/dsl/20d6b919-92e1-4489-b2be-a092290668e4/VOCdevkit/VOCdevkit/VOC2007/ImageSets/Main/test.txt'
    score_threshold = 0.4

    config.batch_size = 1
    ig = tf.placeholder(shape=(1, 512, 512, 3), dtype=tf.float32)
    pred_loc, pred_confs, vbs = retinanet.model(ig, config)
    box, score, pp = predict(ig, pred_loc, pred_confs, vbs, config)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        saver.restore(
            sess,
            '/home/dsl/all_check/face_detect/resnet50_pasc/model.ckpt-199863')
        with open(dts) as f:
            total_aps = []
            for line in f:
                img_id = line.strip()
                if not img_id:
                    continue  # tolerate blank lines in the split file

                img_path = os.path.join(rt, 'JPEGImages', img_id + '.jpg')
                img = cv2.imread(img_path)
                if img is None:
                    # cv2.imread returns None for a missing/corrupt file.
                    print('skipping unreadable image: ' + img_path)
                    continue

                height, width, channels = img.shape
                gt_box, gt_cls = parse_rec(
                    os.path.join(rt, 'Annotations', img_id + '.xml'), height,
                    width)

                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                org, window, scale, padding, crop = utils.resize_image(
                    img, min_dim=512, max_dim=512)
                # NOTE(review): this rescales pixels to [-1, 1], whereas the
                # other detect() in this file subtracts per-channel constants
                # for the same checkpoint path -- confirm which one matches
                # how the checkpoint was trained.
                img = (org / 255.0 - 0.5) * 2
                img = np.expand_dims(img, axis=0)

                bx, sc, p = sess.run([box, score, pp], feed_dict={ig: img})

                bxx = []
                cls = []
                scores = []
                for one_box, one_score, one_cls in zip(bx, sc, p):
                    if one_score > score_threshold:
                        bxx.append(one_box)
                        cls.append(one_cls)
                        scores.append(one_score)

                if not cls:
                    continue

                finbox = utils.revert_image(scale, padding,
                                            config.Config['min_dim'],
                                            np.asarray(bxx))
                finbox = np.asarray(finbox, np.float32)
                # Divide columns 0/2 by the width and 1/3 by the height.
                finbox[:, 0] = finbox[:, 0] * 1.0 / width
                finbox[:, 1] = finbox[:, 1] * 1.0 / height
                finbox[:, 2] = finbox[:, 2] * 1.0 / width
                finbox[:, 3] = finbox[:, 3] * 1.0 / height

                mAP, precisions, recalls, overlaps = eval_utils.compute_ap(
                    gt_boxes=np.asarray(gt_box),
                    gt_class_ids=np.asarray(gt_cls),
                    pred_boxes=finbox,
                    pred_class_ids=np.asarray(cls),
                    pred_scores=np.asarray(scores))
                print(mAP)
                print(precisions)
                total_aps.append(mAP)
                print(sum(total_aps) / len(total_aps))

                visual.display_instances_title(org,
                                               np.asarray(bxx) * 512,
                                               class_ids=np.asarray(cls),
                                               class_names=config.VOC_CLASSES,
                                               scores=scores)
def main(args=None):
    """Visualise VRD predictions (objects, attributes, relationships).

    Parses the command line, builds the OpenImages VRD dataset and the model,
    loads the detector checkpoint and, when given, the relationship and
    attribute checkpoints.  For every image from the (shuffled) data loader
    it draws each object box with its class name and, if its top attribute
    scores above ``attr_thresh``, the attribute name; relationships scoring
    above ``rel_thresh`` are drawn as labelled arrows between the top-left
    corners of subject and object.  Each image is shown in an OpenCV window
    until a key is pressed.

    Args:
        args: optional list of command-line arguments; ``None`` makes
            argparse read ``sys.argv``.

    Raises:
        ValueError: if ``--dataset`` is not ``openimages``.
    """
    parser = argparse.ArgumentParser(
        description='Simple training script for training a RetinaNet network.')

    parser.add_argument('--dataset',
                        help='Dataset type; only openimages is supported.')
    parser.add_argument('--data_path', help='Path to COCO directory')
    parser.add_argument('--csv_classes',
                        help='Path to file containing class list (see readme)')
    parser.add_argument(
        '--csv_val',
        help='Path to file containing validation annotations (optional, see readme)')
    parser.add_argument('--net', help='Network to use', default='fasterrcnn')
    parser.add_argument('--set',
                        help='Set on which evaluation will be performed',
                        default='validation')
    parser.add_argument('--model_rel',
                        help='Path to model (.pt) file for relationships.',
                        default=None)
    parser.add_argument('--model_attr',
                        help='Path to model (.pt) file for attributes.',
                        default=None)
    parser.add_argument('--model_detector',
                        help='Path to model (.pt) file for the detector.')
    parser.add_argument(
        '--depth',
        help='Resnet depth, must be one of 18, 34, 50, 101, 152',
        type=int,
        default=50)

    parser = parser.parse_args(args)

    if parser.dataset == 'openimages':
        dataset_val = OidDatasetVRD(parser.data_path,
                                    subset=parser.set,
                                    transform=Compose([ToTensor()]))
    else:
        raise ValueError(
            'Dataset type not understood (must be openimages), exiting.')

    # sampler_val = AspectRatioBasedSampler(dataset_val, batch_size=1, drop_last=False)
    dataloader_val = DataLoader(dataset_val,
                                num_workers=1,
                                collate_fn=collate_fn,
                                batch_size=1,
                                shuffle=True)

    # Create the model
    detector = create_detection_model(dataset_val.num_classes(),
                                      parser,
                                      box_score_thresh=thres)
    model = VRD(detector,
                dataset=dataset_val,
                train_relationships=parser.model_rel is not None,
                train_attributes=parser.model_attr is not None,
                max_objects=max_objects)

    # Load the detector
    checkpoint = torch.load(parser.model_detector,
                            map_location=lambda storage, loc: storage)
    weights = checkpoint['model']
    weights = {k.replace('module.', ''): v for k, v in weights.items()}
    model.detector.load_state_dict(weights)
    print('Detector correctly loaded!')

    has_rel = bool(parser.model_rel)
    has_attr = bool(parser.model_attr)

    # Load the relationships and attributes heads, if needed
    if has_rel:
        checkpoint = torch.load(parser.model_rel,
                                map_location=lambda storage, loc: storage)
        weights = checkpoint['model_rel']
        weights = {k.replace('module.', ''): v for k, v in weights.items()}
        model.relationships_net.load_state_dict(weights)
        print('Relationships correctly loaded!')

    if has_attr:
        checkpoint = torch.load(parser.model_attr,
                                map_location=lambda storage, loc: storage)
        weights = checkpoint['model_attr']
        weights = {k.replace('module.', ''): v for k, v in weights.items()}
        model.attributes_net.load_state_dict(weights)
        print('Attributes correctly loaded!')

    if use_gpu:
        model = model.cuda()

    model.eval()

    def draw_object_bb(image, box, caption):
        """Draw ``box`` on ``image`` with ``caption`` above its corner."""
        b = np.array(box).astype(int)
        x1, y1, x2, y2 = int(b[0]), int(b[1]), int(b[2]), int(b[3])
        # Thick black text under thin white text gives an outlined label.
        cv2.putText(image, caption, (x1, y1 - 10), cv2.FONT_HERSHEY_PLAIN,
                    1, (0, 0, 0), 2)
        cv2.putText(image, caption, (x1, y1 - 10), cv2.FONT_HERSHEY_PLAIN,
                    1, (255, 255, 255), 1)
        # Draw on the image that was passed in, not on an outer variable.
        cv2.rectangle(image, (x1, y1), (x2, y2),
                      color=(0, 0, 255),
                      thickness=2)

    def draw_relationship(image, subj, obj, rel_name):
        """Draw an arrow from ``subj`` to ``obj`` labelled ``rel_name``."""
        # OpenCV needs plain integer pixel coordinates; true division of
        # the midpoint would yield floats.
        sx, sy = int(subj[0]), int(subj[1])
        ox, oy = int(obj[0]), int(obj[1])
        cv2.arrowedLine(image, (sx, sy), (ox, oy), (255, 0, 0), 2,
                        tipLength=0.02)
        cv2.putText(image, rel_name, ((sx + ox) // 2, (sy + oy) // 2),
                    cv2.FONT_HERSHEY_PLAIN, 1, (0, 255, 0), 2)

    for data in dataloader_val:
        with torch.no_grad():
            st = time.time()
            images, targets = data

            # targets = [{k: v.cuda() for k, v in t.items()} for t in targets]
            if use_gpu:
                input_images = [image.cuda().float() for image in images]
            else:
                input_images = [image.float() for image in images]
            # TODO: adapt retinanet output to the one by torchvision 0.3
            outputs = model(input_images)
            outputs = [{k: v.cpu() for k, v in t.items()} for t in outputs]

            output = outputs[0]  # take the only batch
            scores = output['scores']
            classification = output['labels']
            boxes = output['boxes']
            if has_rel:
                relationships = output['relationships']
                rel_scores = output['relationships_scores']
            if has_attr:
                attributes = output['attributes']
                attr_scores = output['attributes_scores']
            # from here, interface to the code already written in the original repo

            print('Elapsed time: {}'.format(time.time() - st))

            # Scale by 255, clamp to [0, 255], move channels last and swap
            # the channel order for OpenCV.
            img = np.array(255 * images[0]).copy()
            img[img < 0] = 0
            img[img > 255] = 255
            img = np.transpose(img, (1, 2, 0))
            img = cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_BGR2RGB)

            if len(boxes) != 0:
                # Draw objects.  Without an attribute model there is no
                # `attributes` tensor, so fall back to the number of boxes.
                num_objects = attributes.shape[0] if has_attr else len(boxes)
                for j in range(num_objects):
                    bbox = boxes[j, :4].int()
                    # TODO: only the top rank attribute is considered, generalize better!
                    attr = 0
                    if has_attr and attr_scores[j, 0] > attr_thresh:
                        attr = attributes[j, 0].item()
                    label_name = dataset_val.labels[int(classification[j])]
                    attr_name = ': ' + dataset_val.attr_id_to_labels[attr] if attr != 0 else ''
                    draw_object_bb(img, bbox, label_name + attr_name)
                    print('Detection: ' + label_name)

                # Draw relationships
                if has_rel:
                    for s_ind in range(relationships.shape[0]):
                        for o_ind in range(relationships.shape[1]):
                            rel = 0
                            if rel_scores[s_ind, o_ind] > rel_thresh:
                                rel = relationships[s_ind, o_ind].item()
                            if rel != 0:
                                subj = boxes[s_ind, :4].int()
                                obj = boxes[o_ind, :4].int()
                                rel_name = dataset_val.rel_id_to_labels[rel]
                                draw_relationship(img, subj, obj, rel_name)

            cv2.imshow('img', img)
            cv2.waitKey(0)
def main(args=None):
    """Run the detector over a dataset subset and evaluate the detections.

    Parses the command line, builds the dataset and the detection model,
    loads the checkpoint given by ``--model`` and runs the model on every
    image from the data loader.  For each image the boxes, labels and scores
    whose score exceeds the module-level ``det_thres`` are packed as
    ``[idx, bboxes, labels, scores]``; the full list is handed to
    ``dataset.evaluate`` with the checkpoint's directory as output path.

    Args:
        args: optional list of command-line arguments; ``None`` makes
            argparse read ``sys.argv``.

    Raises:
        NotImplementedError: if ``--dataset`` is ``coco`` or ``csv``.
        ValueError: if ``--dataset`` is any other unrecognised value.
    """
    parser = argparse.ArgumentParser(
        description='Simple training script for training a RetinaNet network.')

    parser.add_argument('--dataset',
                        help='Dataset type, must be one of csv or coco.')
    parser.add_argument('--data_path', help='Path to COCO directory')
    parser.add_argument('--csv_classes',
                        help='Path to file containing class list (see readme)')
    parser.add_argument(
        '--csv_val',
        help='Path to file containing validation annotations (optional, see readme)')
    parser.add_argument('--net', help='Network to use', default='fasterrcnn')
    parser.add_argument('--set',
                        help='Set on which evaluation will be performed',
                        default='validation')
    parser.add_argument('--model', help='Path to model (.pt) file.')
    parser.add_argument(
        '--depth',
        help='Resnet depth, must be one of 18, 34, 50, 101, 152',
        type=int,
        default=50)

    parser = parser.parse_args(args)

    # The coco and csv datasets are recognised but not supported by this
    # script; the construction code that used to follow each raise could
    # never execute and has been removed.
    if parser.dataset in ('coco', 'csv'):
        raise NotImplementedError()
    elif parser.dataset == 'openimages':
        dataset = OidDataset(parser.data_path,
                             subset=parser.set,
                             transform=Compose([ToTensor()]))
    else:
        raise ValueError(
            'Dataset type not understood (must be csv or coco), exiting.')

    # sampler_val = AspectRatioBasedSampler(dataset_val, batch_size=1, drop_last=False)
    dataloader = DataLoader(dataset,
                            num_workers=1,
                            collate_fn=collate_fn,
                            shuffle=False)

    # Create the model
    model = create_detection_model(dataset.num_classes(), parser)

    checkpoint = torch.load(parser.model,
                            map_location=lambda storage, loc: storage)
    weights = checkpoint['model']
    # Drop the 'module.' prefix from the checkpoint keys.
    weights = {k.replace('module.', ''): v for k, v in weights.items()}
    model.load_state_dict(weights)

    if use_gpu:
        model = model.cuda()

    model.eval()

    all_detections = []
    # Evaluation output is written next to the checkpoint.
    det_output_path = os.path.split(parser.model)[0]

    for idx, data in enumerate(tqdm.tqdm(dataloader)):
        with torch.no_grad():
            images, targets = data

            # targets = [{k: v.cuda() for k, v in t.items()} for t in targets]
            if use_gpu:
                input_images = [image.cuda().float() for image in images]
            else:
                input_images = [image.float() for image in images]
            outputs = model(input_images)
            outputs = [{k: v.cpu() for k, v in t.items()} for t in outputs]

            output = outputs[0]  # take the only batch
            scores = output['scores']
            classification = output['labels']
            transformed_anchors = output['boxes']
            # from here, interface to the code already written in the original repo

            # TODO: 0.5 should be a parameter in a configuration file.. that hopefully should be created and handled..
            # Select detections with a boolean mask.  Indexing the index
            # array with itself is only correct when the kept detections
            # are exactly the first rows.
            keep = scores > det_thres
            bboxes = transformed_anchors[keep, :].cpu().numpy()
            labels = classification[keep].cpu().numpy()
            scores = scores[keep].cpu().numpy()

            all_detections.append([idx, bboxes, labels, scores])

    print('Evaluating...')
    # TODO: add identification parameter to evaluate so that detections from different checkpoints are not overwritten
    dataset.evaluate(all_detections,
                     det_output_path,
                     file_identifier='{}_IoU{}'.format(parser.set, det_thres))
    print('DONE!')