def getGTLabels(myRel, GTMeta, cfg):
    labels = []
    for relID, relGT in GTMeta['rels'].items():
        prsIoU = utils.get_iou(myRel['prsBB'], relGT['prsBB'])
        objIoU = utils.get_iou(myRel['objBB'], relGT['objBB'])
        if prsIoU > cfg.minIoU and objIoU > cfg.minIoU:
            labels.append(relGT['label'])
    return labels
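# A minimal sketch of the utils.get_iou referenced above, assuming boxes are
# dicts with 'xmin'/'ymin'/'xmax'/'ymax' keys as used throughout these helpers.
# The name and box convention are inferred from the call sites, not confirmed
# against the original utils module.
def get_iou_dict_sketch(bb1, bb2):
    # Intersection rectangle.
    xmin = max(bb1['xmin'], bb2['xmin'])
    ymin = max(bb1['ymin'], bb2['ymin'])
    xmax = min(bb1['xmax'], bb2['xmax'])
    ymax = min(bb1['ymax'], bb2['ymax'])
    if xmax <= xmin or ymax <= ymin:
        return 0.0
    inter = (xmax - xmin) * (ymax - ymin)
    area1 = (bb1['xmax'] - bb1['xmin']) * (bb1['ymax'] - bb1['ymin'])
    area2 = (bb2['xmax'] - bb2['xmin']) * (bb2['ymax'] - bb2['ymin'])
    return inter / float(area1 + area2 - inter)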
def calculate_iou(m_name, gp_id, r_dict):
    checkpoint_path = checkpoint_folder + '/' + m_name
    criterion = nn.CrossEntropyLoss()
    model = load_model(checkpoint_path, num_labels, gp_id)
    average_iou = []
    correct = 0
    num_img = 0
    for folder_name in subfolders:
        label = int(folder_name.split('.')[0]) - 1
        target_tensor = torch.tensor([label])
        image_names = os.listdir(val_folder_path + '/' + folder_name)
        for img_name in image_names:
            img_path = val_folder_path + '/' + folder_name + '/' + img_name
            x1_gt, y1_gt, x2_gt, y2_gt = bbox.get_bbox_from_path(img_path)
            gt = [x1_gt, y1_gt, x2_gt, y2_gt]
            prediction, grad = predict(model, img_path, transform, criterion,
                                       target_tensor, height, width,
                                       num_channels, gp_id)
            pred = bounding_box_grad(grad)
            overlap = check_overlap(pred, gt)
            if overlap:
                iou = get_iou(pred, gt)
            else:
                iou = 0.0
            average_iou.append(iou)
            num_img += 1
            if prediction == label and iou >= 0.5:
                correct += 1
    # print(f'{m_name}: {np.mean(average_iou)}')
    loc_acc = float(correct) / float(num_img)
    avg_iou = np.mean(average_iou)
    r_dict[m_name] = [avg_iou, loc_acc]
def cut_img(model, file, realoutput=None):
    fullpath = os.path.join(test_dir, file)
    print(fullpath)
    img_origin = cv.imdecode(np.fromfile(fullpath, dtype=np.uint8), cv.IMREAD_COLOR)
    h, w, c = img_origin.shape  # OpenCV images are (height, width, channels)
    img = cv.resize(img_origin, (im_size, im_size))
    img = img[..., ::-1]  # BGR -> RGB
    img = transforms.ToPILImage()(img)
    img = transformer(img)
    with torch.no_grad():
        output = model(torch.unsqueeze(img, 0).to(device))
    output = output.reshape(4, -1)
    output = output.cpu().numpy()
    output = output * [w, h]  # scale normalized (x, y) points back to pixels
    output = output.reshape(-1)
    output = sort_four_dot(output)
    img = draw_bboxes2(img_origin, output)
    cv.imwrite('{}_out.jpg'.format(test_dir + '/image/' + file), img)
    img2 = cut_and_adjust_img(img, output)
    cv.imwrite('{}_adjust.jpg'.format(test_dir + '/image/' + file), img2)
    iou = None
    if realoutput is not None:  # was `realdots`, which is undefined here
        iou = get_iou(output, realoutput)
        print(iou)
        img0 = draw_bboxes2(img_origin, realoutput, 'g')
        cv.imwrite('{}_real.jpg'.format(test_dir + '/image/' + file), img0)
    return iou
def add_features(df):
    df['iou'] = df.apply(lambda row: get_iou(row), axis=1)
    df['size1'] = df.apply(lambda row: (row.XMax1 - row.XMin1) * (row.YMax1 - row.YMin1), axis=1)
    df['size2'] = df.apply(lambda row: (row.XMax2 - row.XMin2) * (row.YMax2 - row.YMin2), axis=1)
    df['xcenter1'] = df.apply(lambda row: (row.XMax1 + row.XMin1) / 2, axis=1)
    df['xcenter2'] = df.apply(lambda row: (row.XMax2 + row.XMin2) / 2, axis=1)
    df['ycenter1'] = df.apply(lambda row: (row.YMax1 + row.YMin1) / 2, axis=1)
    df['ycenter2'] = df.apply(lambda row: (row.YMax2 + row.YMin2) / 2, axis=1)
    df['aspect1'] = df.apply(lambda row: (row.XMax1 - row.XMin1) / (row.YMax1 - row.YMin1 + 1e-6), axis=1)
    df['aspect2'] = df.apply(lambda row: (row.XMax2 - row.XMin2) / (row.YMax2 - row.YMin2 + 1e-6), axis=1)
    df['xcenterdiff'] = df.apply(lambda row: ((row.XMax1 + row.XMin1) - (row.XMax2 + row.XMin2)) / 2, axis=1)
    df['ycenterdiff'] = df.apply(lambda row: ((row.YMax1 + row.YMin1) - (row.YMax2 + row.YMin2)) / 2, axis=1)
    return df
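# The row-wise .apply calls above are convenient but slow on large frames; a
# sketch of the same geometric features computed with vectorized column
# arithmetic (same XMin1/XMax1/... column names assumed; the iou column is
# left to the per-row get_iou above, whose implementation is not shown here).
def add_features_vectorized(df):
    df['size1'] = (df.XMax1 - df.XMin1) * (df.YMax1 - df.YMin1)
    df['size2'] = (df.XMax2 - df.XMin2) * (df.YMax2 - df.YMin2)
    df['xcenter1'] = (df.XMax1 + df.XMin1) / 2
    df['xcenter2'] = (df.XMax2 + df.XMin2) / 2
    df['ycenter1'] = (df.YMax1 + df.YMin1) / 2
    df['ycenter2'] = (df.YMax2 + df.YMin2) / 2
    df['aspect1'] = (df.XMax1 - df.XMin1) / (df.YMax1 - df.YMin1 + 1e-6)
    df['aspect2'] = (df.XMax2 - df.XMin2) / (df.YMax2 - df.YMin2 + 1e-6)
    df['xcenterdiff'] = df.xcenter1 - df.xcenter2
    df['ycenterdiff'] = df.ycenter1 - df.ycenter2
    return df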
def calculate_iou(m_name, gp_id):
    checkpoint_path = checkpoint_folder + '/' + m_name
    criterion = nn.CrossEntropyLoss()
    model = load_model(checkpoint_path, num_labels, gp_id)
    average_iou = []
    for folder_name in subfolders:
        label = int(folder_name.split('.')[0]) - 1
        target_tensor = torch.tensor([label])
        image_names = os.listdir(test_folder_path + '/' + folder_name)
        for img_name in image_names:
            img_path = test_folder_path + '/' + folder_name + '/' + img_name
            x1_gt, y1_gt, x2_gt, y2_gt = bbox.get_bbox_from_path(img_path)
            gt = [x1_gt, y1_gt, x2_gt, y2_gt]
            grad = predict(model, img_path, transform, criterion,
                           target_tensor, height, width, num_channels, gp_id)
            pred = bounding_box_grad(grad)
            overlap = check_overlap(pred, gt)
            if overlap:
                iou = get_iou(pred, gt)
            else:
                iou = 0.0
            average_iou.append(iou)
    return np.average(average_iou)
def calculate_loc(m_name, gp_id):
    checkpoint_path = checkpoint_folder + '/' + m_name
    criterion = nn.CrossEntropyLoss()
    model = load_model(checkpoint_path, num_labels, gp_id)
    correct = 0
    total = 0
    for folder_name in subfolders:
        label = int(folder_name.split('.')[0]) - 1
        target_tensor = torch.tensor([label])
        image_names = os.listdir(test_folder_path + '/' + folder_name)
        for img_name in image_names:
            img_path = test_folder_path + '/' + folder_name + '/' + img_name
            x1_gt, y1_gt, x2_gt, y2_gt = bbox.get_bbox_from_path(img_path)
            gt = [x1_gt, y1_gt, x2_gt, y2_gt]
            prediction, grad = predict(model, img_path, transform, criterion,
                                       target_tensor, height, width,
                                       num_channels, gp_id)
            pred = bounding_box_grad(grad)
            overlap = check_overlap(pred, gt)
            if overlap:
                iou = get_iou(pred, gt)
            else:
                iou = 0.0
            total += 1
            if prediction == label and iou >= 0.5:
                correct += 1
    acc = float(correct) / float(total)
    return acc
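# calculate_iou/calculate_loc above guard get_iou with check_overlap; a
# minimal sketch of such a helper for [x1, y1, x2, y2] boxes. The signature
# is inferred from the call sites, not from the original source.
def check_overlap_sketch(pred, gt):
    # Boxes overlap iff neither lies entirely to one side of the other.
    return not (pred[2] <= gt[0] or gt[2] <= pred[0] or
                pred[3] <= gt[1] or gt[3] <= pred[1])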
def __call__(self, gt, pred):
    gt_egg, gt_pan = gt
    pred_egg, pred_pan = pred
    # Apply sigmoid and threshold by 0.5
    pred_egg = (torch.sigmoid(pred_egg) >= 0.5).type(pred_egg.dtype)
    pred_pan = (torch.sigmoid(pred_pan) >= 0.5).type(pred_pan.dtype)
    egg_iou = get_iou(gt_egg, pred_egg)
    pan_iou = get_iou(gt_pan, pred_pan)
    self.running_iou += (egg_iou.item() + pan_iou.item()) / 2
    self.running_samples += 1
    return self.running_iou / self.running_samples
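# In this metric, get_iou is applied to thresholded segmentation masks rather
# than boxes, and its result supports .item(); a sketch of a mask IoU under
# that assumption (name and epsilon are hypothetical, not from the source).
import torch

def mask_iou_sketch(gt, pred, eps=1e-6):
    # Element-wise product counts pixels that are 1 in both masks; returns a
    # 0-dim tensor so that .item() works as in the caller above.
    intersection = (gt * pred).sum()
    union = gt.sum() + pred.sum() - intersection
    return (intersection + eps) / (union + eps)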
def train_model(model, optimizer, train_dataloader, val_dataloader, criterion,
                regression_criterion, num_epochs=1):
    best_model_wts = copy.deepcopy(model.state_dict())
    best_mIoU = 0.0
    loss_history = []
    iou_history = []
    for epoch in range(num_epochs):
        running_loss = 0.0
        running_iou = 0.0
        for inputs, labels in train_dataloader:
            inputs = inputs.to(device)
            target_bbox = labels[:, :4].to(device)
            target_class = labels[:, 4].to(device)
            optimizer.zero_grad()
            pred_class, pred_bbox = model(inputs)
            classification_loss = criterion(pred_class.squeeze(), target_class)
            regression_loss = regression_criterion(pred_bbox, target_bbox).sqrt()
            loss = 0.5 * classification_loss + regression_loss
            loss.backward()
            optimizer.step()
            running_loss += loss.data
        for inputs, labels in val_dataloader:
            inputs = inputs.to(device)
            target_bbox = labels[:, :4].to(device)
            target_class = labels[:, 4].to(device)
            with torch.no_grad():
                pred_class, pred_bbox = model(inputs)
                mIoU = get_iou(pred_bbox, target_bbox, inputs)
            running_iou += mIoU
        mean_loss = running_loss / len(train_dataloader)
        mIoU = running_iou / len(val_dataloader)
        loss_history.append(mean_loss)
        iou_history.append(mIoU)
        print(f'Epoch {epoch}/{num_epochs - 1}. Loss: {mean_loss:.4f}. mIoU: {mIoU:.4f}')
        if mIoU > best_mIoU:
            best_mIoU = float(mIoU.cpu())
            best_model_wts = copy.deepcopy(model.state_dict())
    model.load_state_dict(best_model_wts)
    return model, loss_history, iou_history
def calculate_yolo_metric(sess, X, pred, data_files, batch_size=100):
    count = 0
    iou_total = 0
    wrong_area = 0
    correct_g = 0
    correct_a = 0
    img_iter = single_np_datapoint_generator(data_files)
    for imgs, outs in group_iterable_into_list(img_iter, batch_size, 2):
        preds = sess.run(pred, feed_dict={X: imgs})
        for c in range(len(imgs)):
            out = outs[c]
            o = preds[c]
            orig = np.argwhere(out[:, :, 0] > 0.6)
            # assuming only one face
            y_o, x_o, h_o, w_o, g_o, a_o = get_prediction(out, orig[0], give_prob=False)
            found = np.argwhere(o[:, :, 0] > 0.7)
            blist = []
            probs = []
            for i in found:
                y, x, h, w, g, a = get_prediction(o, i, give_prob=False)
                blist.append((x, y, w, h, g, a))
                probs.append(o[i[0], i[1], 0])
            filtered = non_max_suppress(blist, probs)
            max_iou = -1
            o_box = {'x1': x_o, 'x2': x_o + w_o, 'y1': y_o, 'y2': y_o + h_o}
            g_p, a_p = '', ''
            for x, y, w, h, g, a in filtered:
                iou = get_iou(o_box, {'x1': x, 'x2': x + w, 'y1': y, 'y2': y + h})
                if iou > 0.2 and iou > max_iou:
                    if max_iou > 0:
                        wrong_area += 1.0 - max_iou
                    max_iou = iou
                    g_p = g
                    a_p = a
                else:
                    wrong_area += 1.0 - iou
            if max_iou > 0:
                iou_total += max_iou
            if g_o == g_p:
                correct_g += 1.0
            if a_o == a_p:
                correct_a += 1.0
            count += 1
    print("Processed", count, "images")
    return {
        "total_count": count,
        "average_iou": iou_total / count,
        "average_false_iou": wrong_area / count,
        "gender_accuracy": correct_g / count,
        "age_accuracy": correct_a / count
    }
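# non_max_suppress above is presumably a greedy NMS that keeps the
# highest-probability box and drops overlapping lower-probability ones; a
# sketch of that idea using the dict-keyed get_iou from this file. The helper
# name and the 0.5 suppression threshold are assumptions, not from the source.
def non_max_suppress_sketch(blist, probs, overlap_thresh=0.5):
    # Visit candidates in descending probability order.
    order = sorted(range(len(blist)), key=lambda i: probs[i], reverse=True)
    kept = []
    for i in order:
        x, y, w, h = blist[i][:4]
        box = {'x1': x, 'x2': x + w, 'y1': y, 'y2': y + h}
        if all(get_iou(box, k) <= overlap_thresh for _, k in kept):
            kept.append((blist[i], box))
    return [b for b, _ in kept]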
def drawOverlapRois(img, rois, imageMeta, imageDims, cfg, obj_mapping):
    import filters_helper as helper
    import utils

    f, spl = plt.subplots(1)
    spl.axis('off')
    spl.imshow(img)
    bboxes = []
    gta = helper.normalizeGTboxes(imageMeta['objects'],
                                  scale=imageDims['scale'],
                                  rpn_stride=cfg.rpn_stride)
    inv_obj_mapping = {x: key for key, x in obj_mapping.items()}
    for roi in rois:
        (xmin, ymin, width, height) = roi[0:4]
        label = int(roi[5])
        prop = roi[4]
        rt = {'xmin': xmin, 'ymin': ymin,
              'xmax': xmin + width, 'ymax': ymin + height}
        best_iou = 0.0
        for bbidx, gt in enumerate(gta):
            gt_label = obj_mapping[gt['label']]
            if label != gt_label:
                continue
            curr_iou = utils.get_iou(gt, rt)
            if curr_iou > best_iou:
                # print(curr_iou)
                best_iou = curr_iou
        if best_iou >= 0.5:
            c = 'red'
            print('Pos. label:', inv_obj_mapping[label], prop, best_iou)
        elif best_iou >= 0:
            c = 'blue'
            continue  # negatives are skipped; the print below is unreachable
            print('Neg. label:', inv_obj_mapping[label], prop, best_iou)
        else:
            continue
        bb = {key: x * cfg.rpn_stride for key, x in rt.items()}
        bbox = np.copy(drawBoundingBox(bb))
        spl.plot(bbox[0, :], bbox[1, :], c=c)
        bboxes.append(bb)
    f.subplots_adjust(left=0.02, right=0.98, top=0.98, bottom=0.02)
def gen_neg_img(img, boxes, nums=50):
    """Generate negative (non-face) crops.

    :param img: source image
    :param boxes: face boxes, shape=(-1, 4)
    :param nums: total number of crops to generate
    :return: yields (cropped image, label, offsets); returns the count produced
    """
    h, w, _ = img.shape
    have = 0
    while nums:
        # random crop
        size = npr.randint(12, max(min(h, w) // 2, 13))
        x = npr.randint(0, w - size)
        y = npr.randint(0, h - size)
        iou = get_iou(np.array([x, y, size, size]), boxes)
        if np.max(iou) < 0.3:
            yield img[y:y+size, x:x+size], 0, [0.] * 4
            have += 1
        # # crop around the boxes
        # size = npr.randint(boxes[:, 2:]*0.7-1, boxes[:, 2:]*1.3)
        # size = np.maximum([12, 12], size)
        # point = npr.randint(boxes[:, :2]-size*0.3-1, boxes[:, :2]+size*0.3)
        # point = np.maximum([0, 0], point)
        # crop_box = np.hstack([point, size])
        # for box in crop_box:
        #     if box[0] + box[2] >= w or box[1] + box[3] >= h:
        #         continue
        #     iou = get_iou(box, boxes)
        #     if np.max(iou) >= 0.3:
        #         continue
        #     x, y, ww, hh = box
        #     yield img[y:y+hh, x:x+ww], 0, [0.] * 4
        #     have += 1
        nums -= 1
    return have
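# gen_neg_img, gen_hard and gen_crop all call get_iou(box, boxes) with a
# single [x, y, w, h] box against an (N, 4) array and expect an array of IoUs
# back; a vectorized sketch under that convention (inferred from the call
# sites, not the original helper).
import numpy as np

def get_iou_xywh_sketch(box, boxes):
    x1 = np.maximum(box[0], boxes[:, 0])
    y1 = np.maximum(box[1], boxes[:, 1])
    x2 = np.minimum(box[0] + box[2], boxes[:, 0] + boxes[:, 2])
    y2 = np.minimum(box[1] + box[3], boxes[:, 1] + boxes[:, 3])
    inter = np.maximum(0, x2 - x1) * np.maximum(0, y2 - y1)
    union = box[2] * box[3] + boxes[:, 2] * boxes[:, 3] - inter
    return inter / np.maximum(union, 1e-10)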
def get_pair(self):
    self.result_pair = {}
    for i, true_box in enumerate(self.true_result.all_box):
        max_iou = 0
        pair = -1
        for j, pre_box in enumerate(self.pre_result.connect_result):
            iou = get_iou(pre_box.bbox, true_box.bbox)
            if iou > max_iou:
                max_iou = iou
                pair = j
        if max_iou > 0.5:
            self.result_pair[i] = pair
        else:
            self.result_pair[i] = -1
    return self.result_pair
def visual_img(model):
    with open(pickle_file, 'rb') as file:
        data = pickle.load(file)
    samples = [item for item in data]
    samples = random.sample(samples, img_num)
    imgs = torch.zeros([img_num, 3, im_size, im_size], dtype=torch.float)
    ensure_folder('images')
    origin_pts = []
    for i in range(img_num):
        sample = samples[i]
        fullpath = sample['fullpath']
        raw = cv.imread(fullpath)
        raw = cv.resize(raw, (im_size, im_size))
        img = raw[..., ::-1]  # RGB
        img = transforms.ToPILImage()(img)
        img = transformer(img)
        imgs[i] = img
        cv.imwrite('images/{}_img.jpg'.format(i), raw)
        # print(sample['pts'])
        raw = draw_bboxes2(raw, sample['pts'], thick=3)
        origin_pts.append(sample['pts'])
        cv.imwrite('images/{}_true.jpg'.format(i), raw)
    with torch.no_grad():
        outputs = model(imgs.to(device))
    iou_sum = 0
    for i in range(img_num):
        output = outputs[i].cpu().numpy()
        output = output * im_size
        # print('output: ' + str(output))
        # print('output.shape: ' + str(output.shape))
        img = cv.imread('images/{}_img.jpg'.format(i))
        # print(output)
        img = draw_bboxes2(img, output, thick=3)
        iou_sum += get_iou(origin_pts[i], output)
        cv.imwrite('images/{}_out.jpg'.format(i), img)
    return iou_sum / img_num
def gen_hard(net, img, boxes):
    if net not in DETECTORS:
        DETECTORS[net] = Detector(0.5, 0.5, net=net)
    detector = DETECTORS[net]
    bbox, _ = detector.predict(img, 20, net=net)
    crop_nums = [0] * 3
    for box in bbox:
        crop_img = img[box[1]: box[3], box[0]: box[2]]
        box[2] -= box[0]
        box[3] -= box[1]
        iou = get_iou(box, boxes)
        max_ind = np.argmax(iou)
        if iou[max_ind] < 0.3:
            if crop_nums[0] < 50:
                yield crop_img, 0, [0.] * 4
                crop_nums[0] += 1
            continue
        if iou[max_ind] < 0.4:
            continue
        true_box = boxes[max_ind]
        x1, y1, w, h = true_box
        offset_x1 = (x1 - box[0]) / float(box[2])
        offset_y1 = (y1 - box[1]) / float(box[3])
        offset_x2 = (x1 + w - box[0] - box[2]) / float(box[2])
        offset_y2 = (y1 + h - box[1] - box[3]) / float(box[3])
        if iou[max_ind] >= 0.65:
            yield crop_img, 1, (offset_x1, offset_y1, offset_x2, offset_y2)
            crop_nums[1] += 1
        else:
            yield crop_img, -1, (offset_x1, offset_y1, offset_x2, offset_y2)
            crop_nums[2] += 1
def drawOverlapAnchors(img, anchors, imageMeta, imageDims, cfg):
    import filters_helper as helper
    import utils

    f, spl = plt.subplots(1)
    spl.axis('off')
    spl.imshow(img)
    bboxes = []
    gta = helper.normalizeGTboxes(imageMeta['objects'],
                                  scale=imageDims['scale'],
                                  rpn_stride=cfg.rpn_stride)
    for anchor in anchors:
        (xmin, ymin, width, height) = anchor[0:4]
        rt = {'xmin': xmin, 'ymin': ymin,
              'xmax': xmin + width, 'ymax': ymin + height}
        best_iou = 0.0
        for bbidx, gt in enumerate(gta):
            curr_iou = utils.get_iou(gt, rt)
            if curr_iou > best_iou:
                # print(curr_iou)
                best_iou = curr_iou
        if best_iou >= 0.5:
            c = 'red'
        else:
            c = 'blue'
            continue  # only positive anchors are drawn
        bb = {key: x * cfg.rpn_stride for key, x in rt.items()}
        bbox = drawBoundingBox(bb)
        spl.plot(bbox[0, :], bbox[1, :], c=c)
        bboxes.append(bb)
    f.subplots_adjust(left=0.02, right=0.98, top=0.98, bottom=0.02)
# -----------------
inputs, labels = Variable(inputs, requires_grad=True), Variable(labels)
if gpu_id >= 0:
    inputs, labels = inputs.cuda(), labels.cuda()
with torch.no_grad():
    outputs = net.forward(inputs)
predictions = torch.max(outputs, 1)[1]
loss = criterion(outputs, labels, size_average=False, batch_average=True)
running_loss_val += loss.item()
total_iou += utils.get_iou(predictions, labels)

# Print loss and MIoU
if ii % num_img_val == num_img_val - 1:
    miou = total_iou / (ii * valBatch + inputs.data.shape[0])
    running_loss_val = running_loss_val / num_img_val
    print('Validation')
    print('[Epoch: %d, numImages: %5d]' % (epoch, ii * valBatch + inputs.data.shape[0]))
    print('Loss: %f' % running_loss_val)
    print('MIoU: %f\n' % miou)
    running_loss_val = 0
def createTargets(bboxes, imageMeta, imageDims, class_mapping, cfg):
    # out: rois [{1}, {...}, (1,ymin,xmin,ymax,xmax)]
    # out: labels [{1}, {...}, {nb_object_classes}]
    # out: deltas [{1}, {...}, (dx,dy,dw,dh) * (nb_object_classes-1)]

    #############################
    ########## Image ############
    #############################
    gt_bboxes = imageMeta['objects']
    scale = imageDims['scale']
    # shape = imageDims['shape']

    #############################
    ###### Set Parameters #######
    #############################
    rpn_stride = cfg.rpn_stride
    detection_max_overlap = cfg.detection_max_overlap
    detection_min_overlap = cfg.detection_min_overlap

    #############################
    #### Initialize matrices ####
    #############################
    x_roi = []
    y_class_num = []
    y_class_regr_coords = []
    y_class_regr_label = []
    IoUs = []  # for debugging only

    #############################
    ##### Ground truth boxes ####
    #############################
    gta = helper.normalizeGTboxes(gt_bboxes, scale=scale, rpn_stride=rpn_stride)

    #############################
    #### Ground truth objects ###
    #############################
    for ix in range(bboxes.shape[0]):
        (xmin, ymin, width, height, prop) = bboxes[ix, :5]
        # xmin = int(round(xmin))
        # ymin = int(round(ymin))
        # xmax = int(round(xmax))
        # ymax = int(round(ymax))
        rt = {'xmin': xmin, 'ymin': ymin,
              'xmax': xmin + width, 'ymax': ymin + height}

        best_iou = 0.0
        best_bbox = -1
        for bbidx, gt in enumerate(gta):
            curr_iou = utils.get_iou(gt, rt)
            if curr_iou > best_iou:
                # print(curr_iou)
                best_iou = curr_iou
                best_bbox = bbidx

        if best_iou < detection_min_overlap:
            continue
        else:
            x_roi.append([xmin, ymin, width, height, prop])
            IoUs.append(best_iou)
            if detection_min_overlap <= best_iou < detection_max_overlap:
                # hard negative example
                cls_name = 'bg'
            elif detection_max_overlap <= best_iou:
                cls_name = gt_bboxes[best_bbox]['label']
                tx, ty, tw, th = helper.get_GT_deltas(gta[best_bbox], rt)
                # bxmin, bymin, bw, bh = helper.apply_regr([xmin,ymin,width,height], [tx,ty,tw,th])
                # print(best_iou)
                # print('rt',rt['xmin'], rt['ymin'], rt['xmax'], rt['ymax'])
                # print('gt',gta[best_bbox]['xmin'], gta[best_bbox]['ymin'], gta[best_bbox]['xmax'], gta[best_bbox]['ymax'])
                # print('bb',bxmin, bymin, bxmin + bw, bymin + bh)
            else:
                print('roi = {}'.format(best_iou))
                raise RuntimeError

        # Classification ground truth
        class_num = class_mapping[cls_name]
        class_label = len(class_mapping) * [0]
        class_label[class_num] = 1
        y_class_num.append(copy.deepcopy(class_label))

        # Regression ground truth
        coords = [0] * 4 * (len(class_mapping) - 1)
        labels = [0] * 4 * (len(class_mapping) - 1)
        if cls_name != 'bg':
            label_pos = 4 * (class_num - 1)
            sx, sy, sw, sh = cfg.det_regr_std
            coords[label_pos:4 + label_pos] = [tx * sx, ty * sy, tw * sw, th * sh]
            labels[label_pos:4 + label_pos] = [1, 1, 1, 1]
            y_class_regr_coords.append(copy.deepcopy(coords))
            y_class_regr_label.append(copy.deepcopy(labels))
        else:
            y_class_regr_coords.append(copy.deepcopy(coords))
            y_class_regr_label.append(copy.deepcopy(labels))

    if len(x_roi) == 0:
        # print('x roi none')
        return None, None, None, None

    rois = np.array(x_roi)
    y_class_regr_label = np.array(y_class_regr_label)
    y_class_regr_coords = np.array(y_class_regr_coords)
    true_labels = np.array(y_class_num)
    true_boxes = np.concatenate([y_class_regr_label, y_class_regr_coords], axis=1)
    return np.expand_dims(rois, axis=0), np.expand_dims(true_labels, axis=0), \
        np.expand_dims(true_boxes, axis=0), IoUs
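# helper.get_GT_deltas above is presumably the standard Faster R-CNN
# box-regression parameterization from a ground-truth box gt and a region rt
# (both dicts with xmin/ymin/xmax/ymax); a sketch of that parameterization,
# offered as an assumption since the original helper is not shown.
import numpy as np

def get_GT_deltas_sketch(gt, rt):
    # Region (anchor/ROI) center and size.
    cx, cy = (rt['xmin'] + rt['xmax']) / 2.0, (rt['ymin'] + rt['ymax']) / 2.0
    w, h = rt['xmax'] - rt['xmin'], rt['ymax'] - rt['ymin']
    # Ground-truth center and size.
    gcx, gcy = (gt['xmin'] + gt['xmax']) / 2.0, (gt['ymin'] + gt['ymax']) / 2.0
    gw, gh = gt['xmax'] - gt['xmin'], gt['ymax'] - gt['ymin']
    tx = (gcx - cx) / w
    ty = (gcy - cy) / h
    tw = np.log(gw / w)
    th = np.log(gh / h)
    return tx, ty, tw, th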
def getBoundingBoxes(imagesMeta, objects, labels):
    newImagesMeta = {}
    imagesBadOnes = {}
    noImage = 0
    total = 0
    for imageID, imageMeta in imagesMeta.items():
        try:
            root = ET.parse(url + 'bbox/' + imageID + '.xml').getroot()
        except FileNotFoundError as e:
            print("missing", imageID)
            continue
        relsObj = imageMeta['rels']
        relsTmp = []
        relsObjBad = []
        relsPrsBad = []
        persons = []

        # Add objects
        for elem in root:
            if elem.tag != "object":
                continue
            # BB name
            objID = elem.find('name').text
            if objID not in objects.keys():
                continue
            objName = objects[objID]
            ## BB coordinates
            bbXML = elem.find('bndbox')
            xmin = int(bbXML.find('xmin').text)
            xmax = int(bbXML.find('xmax').text)
            ymin = int(bbXML.find('ymin').text)
            ymax = int(bbXML.find('ymax').text)
            bb = {'xmin': xmin, 'xmax': xmax, 'ymin': ymin, 'ymax': ymax}
            if objName == 'person':
                persons.append(bb)
            else:
                # Meta relation
                if objName in relsObj.keys():
                    pred = relsObj[objName]
                    label = labels[pred + objName]
                    relsTmp.append({'labels': [label],
                                    'names': [{'pred': pred, 'obj': objName}],
                                    'objBB': bb})
                else:
                    # `pred` is undefined on this branch; record an empty
                    # predicate, matching the bad-person entries below
                    relsObjBad.append({'pred': '', 'name': objName, 'bb': bb})
            total += 1

        # Choose best person boxes
        bestPrs = np.array([[0.0, 0.0, None] for i in range(len(relsTmp))])
        prsIdx = 0
        for perBB in persons:
            IoUs = np.zeros([len(relsTmp), 2])
            relIdx = 0
            for rel in relsTmp:
                objBB = rel['objBB']
                IoUPsy = utils.get_iou(objBB, perBB, False)
                IoU = utils.get_iou(objBB, perBB)
                IoUs[relIdx, :] = [IoUPsy, IoU]
                relIdx += 1
            bestIdx = np.argmax(IoUs[:, 0])
            if IoUs[bestIdx, 0] > bestPrs[bestIdx, 0] or \
                    (IoUs[bestIdx, 0] == bestPrs[bestIdx, 0] and
                     IoUs[bestIdx, 1] > bestPrs[bestIdx, 1]):
                bestPrs[bestIdx, :] = np.array(
                    [IoUs[bestIdx, 0], IoUs[bestIdx, 1], prsIdx])
            prsIdx += 1

        # Add best persons
        relsFinal = {}
        relIdx = 0
        objGood = False
        perGood = True
        for [bestIoUPsy, _, prsIdx] in bestPrs:
            if bestIoUPsy > 0.1:
                relTmp = relsTmp[relIdx]
                relTmp['prsBB'] = persons[int(prsIdx)]
                relTmp['prsID'] = int(prsIdx)
                relsFinal[relIdx] = relTmp
            else:
                perGood = False
            relIdx += 1
        objGood = True

        # Add bad persons
        bestPrsIdx = bestPrs[:, 2]
        bestPrsIdx = bestPrsIdx[bestPrsIdx != np.array(None)]
        bestPrsIdx = bestPrsIdx.astype(int)
        for i in range(len(persons)):
            if i not in bestPrsIdx:
                relsPrsBad.append({'pred': '', 'name': 'person', 'bb': persons[i]})

        if not objGood or not perGood:
            continue
        imageMeta['rels'] = relsFinal
        newImagesMeta[imageID] = imageMeta
        imagesBadOnes[imageID] = [relsObjBad, relsPrsBad]
        noImage += 1
    # print(badOnes)
    print(total)
    return newImagesMeta, imagesBadOnes
def detect_pedrestrian(img, pedestrians_bounding_boxes, sliding_window_parameters,
                       classifier_svm, grayscale=False, must_normalize=True):
    """Runs a sliding window over the given image and draws the areas where a
    person was detected."""
    height, width = len(img), len(img[1])
    # block_heigth, block_width = SLIDING_WINDOW_SIZE
    block_width, block_heigth = sliding_window_parameters
    y = 0
    plt.imshow(img, cmap='gray')
    hogs_to_hard_mining = []
    # stride_y, stride_x = SLIDING_WINDOW_STRIDE
    stride_y, stride_x = int(SLIDING_WINDOW_STRIDE[0] / 2), int(SLIDING_WINDOW_STRIDE[1] / 2)

    # Precision/recall bookkeeping
    total_pedestrian = len(pedestrians_bounding_boxes)
    pedrestrian_predected = pedrestrian_success = 0

    # Convert to grayscale
    if grayscale:
        img = grayscaled_img(img)  # HOGs can only be computed on grayscale images

    # Normalize
    if must_normalize:
        img = normalize_img(img)

    # Start running the sliding window
    while y < height:
        x = 0
        while x < width:
            try:
                # Extract a subregion/sub-image
                sub_img = img[y:y + block_heigth, x:x + block_width, :]
            except IndexError:
                # Images without RGB channels can raise this error
                sub_img = img[y:y + block_heigth, x:x + block_width]
            finally:
                sub_img = resize(sub_img)

            sub_img_hog = hog(sub_img, block_norm='L2-Hys', transform_sqrt=True)
            predictions = classifier_svm.predict([sub_img_hog])
            if SHOW_IMG and DRAW_SLIDING_WINDOW:
                draw_rectangle(x, y, block_width, block_heigth, 'yellow')

            # Look for the false positives!
            if predictions[0] == 1:
                pedrestrian_predected += 1
                img_box = [x, y, x + block_width, y + block_heigth]

                # Check whether it has a worthwhile IoU with any of the
                # declared pedestrian bounding boxes
                must_be_added = False
                intersects_with_pedestrian = False
                for pedestrian_bounding_box in pedestrians_bounding_boxes:
                    # Draw the bounding box if requested
                    if SHOW_IMG and DRAW_PEDRESTRIAN_BOUNDING_BOX:
                        draw_rectangle(pedestrian_bounding_box[0],
                                       pedestrian_bounding_box[1],
                                       pedestrian_bounding_box[2],
                                       pedestrian_bounding_box[3])
                    box_to_iou = [
                        pedestrian_bounding_box[0],
                        pedestrian_bounding_box[1],
                        pedestrian_bounding_box[0] + pedestrian_bounding_box[2],
                        pedestrian_bounding_box[1] + pedestrian_bounding_box[3]
                    ]

                    # Compute the IoU
                    iou = utils.get_iou(img_box, box_to_iou)
                    # print("iou", iou)

                    # If it is below the configured IoU limit, consider it
                    # for hard negative mining
                    if iou < IOU_limit:
                        must_be_added = True
                    else:
                        # Draw correctly detected pedestrians
                        # draw_rectangle(x, y, block_width, block_heigth, '#008744')
                        draw_rectangle(x, y, block_width, block_heigth, 'blue')
                        intersects_with_pedestrian = True

                if not intersects_with_pedestrian and must_be_added:
                    # If it is not on top of a person and is a false positive,
                    # draw it in red ('#d62d20') on the image and keep it for
                    # hard negative mining
                    draw_rectangle(x, y, block_width, block_heigth, '#d62d20')
                    hogs_to_hard_mining.append(sub_img_hog)  # store for hard negative mining
                else:
                    pedrestrian_success += 1
            x += stride_x
        y += stride_y

    if SHOW_IMG:
        plt.title("Bounding boxes in black. Sliding window in yellow. "
                  "False positives in red. Positives in green.")
        plt.show()  # Show the image
    return hogs_to_hard_mining, total_pedestrian, pedrestrian_predected, pedrestrian_success
def matching_cascade(tracks, detections, kalman_filter, label_index, age=3,
                     init_age=3, gating_threshold=9.4877, iou_threshold=0.3):
    """Matching cascade.

    tracking list: [1, 2, ..., N]
    detection list: [1, 2, ..., M]

    1. Match by Mahalanobis distance.
    2. If a tracker is tentative, its max age is 3; if age > 3, delete it.
    3. If a tracker has been matched, its max age is 30 and its age is reset
       to 0, and the locations of the matched bounding boxes are updated.
       If it has not been matched, its age is incremented by 1.
    4. How are targets matched? Predict with the linear model:

       tracking list (previous frame) -----> tracking list (current frame)
                                                          |
                                        if min(distance) < 9.4877
                                                          v
       update target by matched detection <----- compute Mahalanobis distance

    Args:
        tracks: a list of trackers [x, y, a, h]
        detections: a list of detections [x, y, a, h]
        kalman_filter: KalmanFilter object
        label_index: the label that is monotonically increased
        age: the max age of confirmed trackers
        init_age: the max age of tentative (unconfirmed) trackers
        gating_threshold: the threshold on the Mahalanobis distance
        iou_threshold: IoU threshold for IoU matching

    Returns:
        new_tracks
    """
    num_trackers = len(tracks)
    delete_index = []
    # start tracking
    for i in range(num_trackers):
        tracker = tracks[i]
        # the last frame's optimal estimate
        mean = tracker.mean
        cov = tracker.cov
        measure = tracker.measurement
        # predict the current estimate with the transition matrix
        mean_pred, cov_pred = kalman_filter.predict(mean, cov)
        tracker.update(mean_pred, cov_pred, measure)
        # age = age + 1
        tracker.predict()
        if len(detections) > 0:
            if tracker.tentative and tracker.age <= init_age:
                maha_distances = kalman_filter.maha_distance(mean_pred, cov_pred, detections)
                min_distance = np.min(maha_distances)
                min_arg = np.argmin(maha_distances)
                if min_distance <= gating_threshold:
                    # 1. set tracker.tentative = False and age = 0
                    # 2. update distribution and measurement
                    # 3. delete this detection from detections
                    # 4. label this target
                    tracker.matching()
                    # update the prediction with the Kalman filter
                    new_mean, new_cov = kalman_filter.update(mean_pred, cov_pred,
                                                             detections[min_arg])
                    tracker.update(new_mean, new_cov, detections[min_arg])
                    detections.pop(min_arg)
                    # set label
                    label_index += 1
                    tracker.label(label_index)
            elif (not tracker.tentative) and tracker.age <= age:
                maha_distances = kalman_filter.maha_distance(mean_pred, cov_pred, detections)
                min_distance = np.min(maha_distances)
                min_arg = np.argmin(maha_distances)
                if min_distance <= gating_threshold:
                    # 1. set tracker.tentative = False and age = 0
                    # 2. update distribution and measurement
                    # 3. delete this detection from detections
                    tracker.matching()
                    # update the prediction with the Kalman filter
                    new_mean, new_cov = kalman_filter.update(mean_pred, cov_pred,
                                                             detections[min_arg])
                    tracker.update(new_mean, new_cov, detections[min_arg])
                    detections.pop(min_arg)
        if tracker.tentative and tracker.age > init_age:
            delete_index.append(i)
        if (not tracker.tentative) and tracker.age > age:
            delete_index.append(i)

    # delete trackers
    new_tracks = []
    delete_set = set(delete_index)
    total_set = set(np.arange(num_trackers))
    remain_set = total_set - delete_set
    for k in remain_set:
        new_tracks.append(tracks[k])

    # IoU association on the set of unconfirmed and unmatched tracks of age n = 1
    for j, tracker in enumerate(new_tracks):
        tentative = tracker.tentative
        age = tracker.age
        mean_ = tracker.mean
        cov_ = tracker.cov
        if tentative and age == 1:
            if len(detections) > 0:
                tracker_measure = tracker.measurement
                ious = get_iou(tracker_measure, detections)
                max_iou = np.max(ious)
                max_arg = np.argmax(ious)
                if max_iou >= iou_threshold:
                    tracker.matching()
                    # update the prediction with the Kalman filter
                    new_mean, new_cov = kalman_filter.update(mean_, cov_,
                                                             detections[max_arg])
                    tracker.update(new_mean, new_cov, detections[max_arg])
                    detections.pop(max_arg)
                    # set label
                    label_index += 1
                    tracker.label(label_index)

    # initialize unmatched detections
    if len(detections) > 0:
        for t, detection in enumerate(detections):
            mean_init, cov_init = kalman_filter.initiate(detection)
            new_tracker = create_tracker(mean_init, cov_init, detection)
            new_tracks.append(new_tracker)
    return new_tracks, label_index
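# The default gating_threshold of 9.4877 above is the 0.95 quantile of a
# chi-square distribution with 4 degrees of freedom (one per measurement
# dimension [x, y, a, h]); a squared Mahalanobis distance under a correct
# Gaussian model follows that distribution. It can be reproduced with scipy:
from scipy.stats import chi2

gating_threshold = chi2.ppf(0.95, df=4)  # ~= 9.4877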
def createTargets(imageMeta, imageDims, cfg):
    # in: imageMeta
    # out: non-reduced targets

    #############################
    ########## Image ############
    #############################
    bboxes = imageMeta['objects']
    scale = imageDims['scale']
    reduced_shape = imageDims['redux_shape']
    image_height = reduced_shape[0]
    image_width = reduced_shape[1]

    #############################
    ###### Set Parameters #######
    #############################
    rpn_stride = cfg.rpn_stride
    output_width = int(image_width / rpn_stride)
    output_height = int(image_height / rpn_stride)
    anchor_sizes = cfg.anchor_sizes
    anchor_ratios = cfg.anchor_ratios
    num_anchors = len(anchor_sizes) * len(anchor_ratios)
    rpn_min_overlap = cfg.rpn_min_overlap
    rpn_max_overlap = cfg.rpn_max_overlap

    #############################
    #### Initialize matrices ####
    #############################
    y_rpn_overlap = np.zeros((output_height, output_width, num_anchors))
    y_is_box_valid = np.zeros((output_height, output_width, num_anchors))
    y_rpn_regr = np.zeros((output_height, output_width, num_anchors * 4))
    y_rpn_ancs = np.zeros((output_height, output_width, num_anchors * 4))

    num_bboxes = len(bboxes)
    num_anchors_for_gtbox = np.zeros(num_bboxes).astype(int)
    best_anchor_for_gtbox = -1 * np.ones((num_bboxes, 4)).astype(int)
    best_iou_for_gtbox = np.zeros(num_bboxes).astype(np.float32)
    best_x_for_gtbox = np.zeros((num_bboxes, 4)).astype(int)
    best_dx_for_gtbox = np.zeros((num_bboxes, 4)).astype(np.float32)

    #############################
    ##### Ground truth boxes ####
    #############################
    gta = helper.normalizeGTboxes(bboxes, scale=scale, roundoff=False)
    # draw.drawHOI(image, gta[0,:], gta[0,:])

    #############################
    # Map ground truth 2 anchor #
    #############################
    for anchor_size_idx in range(len(anchor_sizes)):
        for anchor_ratio_idx in range(len(anchor_ratios)):
            # w_anc = anchor_sizes[anchor_size_idx] * anchor_ratios[anchor_ratio_idx][0]
            # h_anc = anchor_sizes[anchor_size_idx] * anchor_ratios[anchor_ratio_idx][1]
            size_ratio = cfg.rpn_stride ** 2
            w = np.round(np.sqrt(size_ratio / anchor_ratios[anchor_ratio_idx]))
            h = w * anchor_ratios[anchor_ratio_idx]
            w_anc = w * anchor_sizes[anchor_size_idx]
            h_anc = h * anchor_sizes[anchor_size_idx]

            for ix in range(output_width):
                xmin_anc = float(rpn_stride) * (ix + 0.5) - w_anc / 2
                xmax_anc = float(rpn_stride) * (ix + 0.5) + w_anc / 2
                if xmin_anc < 0 or xmax_anc > image_width - 1:
                    continue
                for jy in range(output_height):
                    ymin_anc = float(rpn_stride) * (jy + 0.5) - h_anc / 2
                    ymax_anc = float(rpn_stride) * (jy + 0.5) + h_anc / 2
                    if ymin_anc < 0 or ymax_anc > image_height - 1:
                        continue

                    bbox_type = 'neg'
                    best_iou_for_loc = 0.0
                    at = {'xmin': xmin_anc, 'ymin': ymin_anc,
                          'xmax': xmax_anc, 'ymax': ymax_anc}
                    # print((rpn_stride*(ix+0.5), rpn_stride*(jy+0.5)), anchor_sizes[anchor_size_idx], anchor_ratios[anchor_ratio_idx])

                    for gtidx in range(num_bboxes):
                        gt = gta[gtidx]
                        curr_iou = utils.get_iou(gt, at)
                        if curr_iou > best_iou_for_gtbox[gtidx] or curr_iou > rpn_max_overlap:
                            tx, ty, tw, th = helper.get_GT_deltas(gt, at)
                            # bxmin, bymin, bw, bh = helper.apply_regr([at['xmin'],at['ymin'],at['xmax']-at['xmin'],at['ymax']-at['ymin']], [tx,ty,tw,th])
                            # print(curr_iou)
                            # print('at',at['xmin'], at['ymin'], at['xmax'], at['ymax'])
                            # print('gt',gt['xmin'], gt['ymin'], gt['xmax'], gt['ymax'])
                            # print('bb',bxmin, bymin, bxmin + bw, bymin + bh)

                        # all GT boxes should be mapped to an anchor box, so we
                        # keep track of which anchor box was best
                        if curr_iou > best_iou_for_gtbox[gtidx]:
                            best_anchor_for_gtbox[gtidx] = [jy, ix, anchor_ratio_idx,
                                                            anchor_size_idx]
                            best_iou_for_gtbox[gtidx] = curr_iou
                            best_x_for_gtbox[gtidx, :] = [at['xmin'], at['xmax'],
                                                          at['ymin'], at['ymax']]
                            best_dx_for_gtbox[gtidx, :] = [tx, ty, tw, th]

                        # we set the anchor to positive if the IOU is >0.7 (it
                        # does not matter if there was another better box, it
                        # just indicates overlap)
                        if curr_iou > rpn_max_overlap:
                            # print(curr_iou, at)
                            # print(anchor_sizes[anchor_size_idx], anchor_ratios[anchor_ratio_idx])
                            bbox_type = 'pos'
                            num_anchors_for_gtbox[gtidx] += 1
                            # we update the regression layer target if this IOU
                            # is the best for the current (x,y) and anchor position
                            if curr_iou > best_iou_for_loc:
                                best_iou_for_loc = curr_iou
                                best_regr = (tx, ty, tw, th)

                        # if the IOU is >0.3 and <0.7, it is ambiguous and not
                        # included in the objective
                        if rpn_min_overlap < curr_iou < rpn_max_overlap:
                            # gray zone between neg and pos
                            if bbox_type != 'pos':
                                bbox_type = 'neutral'

                    # turn outputs on or off depending on IoUs
                    anc_idx = (anchor_ratio_idx + len(anchor_ratios) * anchor_size_idx)
                    if bbox_type == 'neg':
                        y_is_box_valid[jy, ix, anc_idx] = 1
                        y_rpn_overlap[jy, ix, anc_idx] = 0
                    elif bbox_type == 'neutral':
                        y_is_box_valid[jy, ix, anc_idx] = 0
                        y_rpn_overlap[jy, ix, anc_idx] = 0
                    elif bbox_type == 'pos':
                        y_is_box_valid[jy, ix, anc_idx] = 1
                        y_rpn_overlap[jy, ix, anc_idx] = 1
                        y_rpn_regr[jy, ix, 4 * anc_idx:4 * anc_idx + 4] = best_regr
                        y_rpn_ancs[jy, ix, 4 * anc_idx:4 * anc_idx + 4] = [
                            xmin_anc, ymin_anc, xmax_anc - xmin_anc, ymax_anc - ymin_anc
                        ]

    #############################
    ##### Ensure GT Anchors #####
    #############################
    # we ensure that every bbox has at least one positive RPN region
    for idx in range(num_anchors_for_gtbox.shape[0]):
        # print('anchors', idx)
        if num_anchors_for_gtbox[idx] == 0:
            # no box with an IOU greater than zero ...
            # print('no anchors', idx, gta[idx])
            if best_anchor_for_gtbox[idx, 0] == -1:
                continue
            anc_idx = best_anchor_for_gtbox[idx, 2] + \
                len(anchor_ratios) * best_anchor_for_gtbox[idx, 3]
            y_is_box_valid[best_anchor_for_gtbox[idx, 0],
                           best_anchor_for_gtbox[idx, 1], anc_idx] = 1
            y_rpn_overlap[best_anchor_for_gtbox[idx, 0],
                          best_anchor_for_gtbox[idx, 1], anc_idx] = 1
            y_rpn_regr[best_anchor_for_gtbox[idx, 0],
                       best_anchor_for_gtbox[idx, 1],
                       4 * anc_idx:4 * anc_idx + 4] = best_dx_for_gtbox[idx, :]

    # y_rpn_overlap = np.transpose(y_rpn_overlap, (2, 0, 1))
    y_rpn_overlap = np.expand_dims(y_rpn_overlap, axis=0)
    # y_is_box_valid = np.transpose(y_is_box_valid, (2, 0, 1))
    y_is_box_valid = np.expand_dims(y_is_box_valid, axis=0)
    # y_rpn_regr = np.transpose(y_rpn_regr, (2, 0, 1))
    y_rpn_regr = np.expand_dims(y_rpn_regr, axis=0)

    return [np.copy(y_rpn_overlap), np.copy(y_rpn_regr), np.copy(y_is_box_valid)]
def model_fn(features, labels, mode, params):
    """Model function."""
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    query, len_q, ref, len_r = features
    batch_size = tf.shape(query)[0]

    # Video feature aggregation (Sec. 3.1).
    cell = tf.nn.rnn_cell.BasicLSTMCell(params.mem_dim)
    with tf.variable_scope('video_lstm', reuse=tf.AUTO_REUSE):
        out1, state1 = tf.nn.dynamic_rnn(cell, query, len_q, dtype=tf.float32)
        out2, state2 = tf.nn.dynamic_rnn(cell, ref, len_r, dtype=tf.float32)
    out1 = slim.dropout(out1, keep_prob=params.keep_prob, is_training=is_training)
    out2 = slim.dropout(out2, keep_prob=params.keep_prob, is_training=is_training)

    # Matching (Sec. 3.2).
    forward = tf.nn.rnn_cell.BasicLSTMCell(params.att_dim, name='forward')
    forward = MatchCellWrapper(forward, out1, len_q)
    backward = tf.nn.rnn_cell.BasicLSTMCell(params.att_dim, name='backward')
    backward = MatchCellWrapper(backward, out1, len_q, reuse=tf.AUTO_REUSE)
    with tf.variable_scope('att'):
        forward_out, forward_state = tf.nn.dynamic_rnn(forward, out2, len_r,
                                                       dtype=tf.float32)
        out2_reverse = tf.reverse_sequence(out2, len_r, 1, 0)
        backward_out, backward_state = tf.nn.dynamic_rnn(backward, out2_reverse,
                                                         len_r, dtype=tf.float32)
        backward_out = tf.reverse_sequence(backward_out, len_r, 1, 0)
    h = tf.concat([forward_out, backward_out], axis=2, name='concat_H')
    h = slim.dropout(h, keep_prob=params.keep_prob + 0.2, is_training=is_training)

    # Localization (Sec. 3.3).
    pointer = tf.nn.rnn_cell.BasicLSTMCell(params.att_dim)
    maxlen = tf.shape(h)[1]
    with tf.variable_scope('pointer'):
        point_out, _ = tf.nn.dynamic_rnn(pointer, h, len_r, dtype=tf.float32)
    logits = slim.fully_connected(point_out, 4, activation_fn=None, scope='loc')

    # Make predictions.
    def map_body(x):
        logits = x[0]
        length = x[1]
        logits = logits[:length]
        prob = tf.nn.log_softmax(logits, axis=1)
        prob = tf.transpose(prob)
        initial_it = tf.constant(0, dtype=tf.int32)
        initial_idx_ta = tf.TensorArray(tf.int32, size=0, dynamic_size=True,
                                        element_shape=[])
        initial_val_ta = tf.TensorArray(tf.float32, size=0, dynamic_size=True,
                                        element_shape=[])

        def cond(it, *unused):
            # Limits the predicted segment length to at most 64 frames.
            return it < tf.minimum(length, 64)

        def while_body(it, idx_ta, val_ta):
            # Eq. (11) is implemented here.
            total = tf.cond(tf.equal(it, 0),
                            lambda: tf.reduce_sum(prob[:2], axis=0),
                            lambda: prob[0, :-it] + prob[1, it:])

            def get_inside():
                score = tf.tile(prob[2, None, :], [it, 1])
                score = tf.reverse_sequence(score, tf.zeros([it], tf.int32) + length, 1, 0)
                score = tf.reverse_sequence(score, length - tf.range(it), 1, 0)
                score = score[:, :-it]
                score = tf.reduce_mean(score, axis=0)
                return score

            ave = tf.cond(tf.equal(it, 0), lambda: prob[2], get_inside)
            total += ave
            idx = tf.argmax(total, output_type=tf.int32, name='max1')
            idx_ta = idx_ta.write(it, idx)
            val_ta = val_ta.write(it, total[idx])
            it += 1
            return it, idx_ta, val_ta

        res = tf.while_loop(cond, while_body,
                            [initial_it, initial_idx_ta, initial_val_ta])
        final_idx = res[1].stack()
        final_val = res[2].stack()
        idx = tf.argmax(final_val, output_type=tf.int32)
        pred = tf.stack([final_idx[idx], final_idx[idx] + idx + 1])
        return pred

    predictions = tf.map_fn(map_body, [logits, len_r], tf.int32)
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Loss computation.
    idx = tf.stack([tf.range(batch_size), labels[:, 0]], axis=1)
    label_st = tf.scatter_nd(idx, tf.ones(batch_size), [batch_size, maxlen])
    idx = tf.stack([tf.range(batch_size), labels[:, 1] - 1], axis=1)
    label_en = tf.scatter_nd(idx, tf.ones(batch_size), [batch_size, maxlen])
    inside_t = tf.sequence_mask(labels[:, 1] - labels[:, 0], maxlen)
    inside_t = tf.reverse_sequence(inside_t, labels[:, 1], 1, 0)
    outside = tf.logical_not(inside_t)
    inside_t = tf.to_float(inside_t)
    outside = tf.to_float(outside)
    label = tf.stack([label_st, label_en, inside_t, outside], axis=2)  # Eq. (10)
    heavy = tf.reduce_sum(label[:, :, :2], axis=-1) > 0.9
    heavy = tf.to_float(heavy) * 9 + 1
    label = label / tf.reduce_sum(label, axis=2, keepdims=True)
    loss = tf.nn.softmax_cross_entropy_with_logits(labels=label, logits=logits)
    loss *= heavy
    mask = tf.sequence_mask(len_r, maxlen)
    loss = tf.boolean_mask(loss, mask)
    loss = tf.reduce_mean(loss)
    model_params = tf.trainable_variables()
    weights = [i for i in model_params if 'bias' not in i.name]
    loss += params.weight_decay * tf.add_n([tf.nn.l2_loss(v) for v in weights])

    # Optimization.
    gradients = tf.gradients(loss, model_params)
    clipped_gradients, gradient_norm = tf.clip_by_global_norm(
        gradients, params.max_gradient_norm)
    tf.summary.scalar('grad_norm', gradient_norm)
    tf.summary.scalar('clipped_gradient', tf.global_norm(clipped_gradients))
    # boundaries = [200, 400, 600]
    # staged_lr = [params.learning_rate * x for x in [1, 0.1, 0.01, 0.002]]
    # learning_rate = tf.train.piecewise_constant(tf.train.get_global_step(),
    #                                             boundaries, staged_lr)
    # tf.summary.scalar('learning_rate', learning_rate)
    tensors_to_log = {
        'loss': loss,
        'step': tf.train.get_global_step(),
        'len_q': tf.shape(features[0])[1],
        'len_r': tf.shape(features[2])[1]
    }
    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=10)
    train_hooks = [logging_hook]
    optimizer = tf.train.AdamOptimizer(params.learning_rate)
    if is_training:
        train_op = optimizer.apply_gradients(
            zip(clipped_gradients, model_params), tf.train.get_global_step())
    else:
        train_op = None

    # Evaluation.
    iou = get_iou(predictions, labels)
    metrics = get_eval_metric(iou)
    for variable in tf.trainable_variables():
        tf.summary.histogram(variable.op.name, variable)
    return tf.estimator.EstimatorSpec(mode=mode,
                                      predictions=predictions,
                                      loss=loss,
                                      train_op=train_op,
                                      training_hooks=train_hooks,
                                      eval_metric_ops=metrics)
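# get_iou in the evaluation step above compares predicted [start, end) frame
# index pairs against the label spans; a sketch of that 1-D temporal IoU in
# the same TF 1.x style (the original implementation is not shown, so this is
# an assumption about its behavior).
import tensorflow as tf

def temporal_iou_sketch(predictions, labels):
    # predictions, labels: int32 tensors of shape (batch, 2) as [start, end).
    start = tf.maximum(predictions[:, 0], labels[:, 0])
    end = tf.minimum(predictions[:, 1], labels[:, 1])
    inter = tf.maximum(end - start, 0)
    union = (predictions[:, 1] - predictions[:, 0]) + \
            (labels[:, 1] - labels[:, 0]) - inter
    return tf.to_float(inter) / tf.to_float(union)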
def run_training_loop(model, optimizer, scheduler, device, train_loader,
                      test_loader, criterion_classification,
                      criterion_localization, epochs):
    logger.info('Start training')
    for epoch in range(epochs):
        logger.debug(f'Epoch {epoch + 1}')
        model.train()
        running_loss_ce = 0.0
        running_loss_mse = 0.0
        for i, data in enumerate(train_loader):
            images, labels, bboxes_gt = data
            bboxes_gt = torch.stack(bboxes_gt, dim=1)
            images = images.to(device)
            labels = labels.to(device)
            bboxes_gt = bboxes_gt.to(device)
            optimizer.zero_grad()
            logits, bboxes = model(images)
            loss_ce = criterion_classification(logits, labels)
            loss_mse = criterion_localization(bboxes, bboxes_gt.float()) * 50
            loss = loss_ce + loss_mse
            loss.backward()
            optimizer.step()
            running_loss_ce += loss_ce.item()
            running_loss_mse += loss_mse.item()
            print_every = 5
            if (i + 1) % print_every == 0:
                running_loss_ce = running_loss_ce / print_every
                running_loss_mse = running_loss_mse / print_every
                logger.debug(f'[{epoch + 1}, {i + 1}] '
                             f'loss_ce: {running_loss_ce:.3f}, '
                             f'loss_mse {running_loss_mse:.3f}')
                running_loss_ce = 0
                running_loss_mse = 0
        scheduler.step()

        correct = 0
        total = 0
        iou = 0
        model.eval()
        with torch.no_grad():
            for data in test_loader:
                test_images, test_labels, test_bboxes = data
                test_images = test_images.to(device)
                outputs = model(test_images)
                _, predicted = torch.max(outputs[0].cpu().data, 1)
                total += test_labels.size(0)
                correct += (predicted == test_labels).sum().item()
                bbox_gt = [a.item() for a in test_bboxes]
                bbox = outputs[1].cpu().data.numpy().flatten()
                iou += utils.get_iou(bbox_gt, bbox)
        iou = iou / total
        accuracy = 100 * correct / total
        logger.info(f'Test -- Accuracy: {accuracy:.4f}, IoU: {iou:.4f}')
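# utils.get_iou in the evaluation loop above (and in get_VDG further below)
# receives plain [x1, y1, x2, y2] sequences; a sketch for that convention,
# distinct from the dict-keyed variant sketched earlier. The helper name is
# hypothetical.
def get_iou_xyxy_sketch(a, b):
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    return inter / max(area_a + area_b - inter, 1e-10)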
def gen_crop(img, boxes, landmark=None, display=False):
    height, width, _ = img.shape
    # counts per category: pos, part, neg, landmark
    crop_nums = [0] * 4
    # first randomly generate some negative crops
    if landmark is None:
        crop_nums[2] += yield from gen_neg_img(img, boxes)
    # generate samples of each category around every face box
    for box in boxes:
        x1, y1, w, h = box
        # skip small faces
        if min(w, h) < 20 or x1 < 0 or y1 < 0:
            continue
        if landmark is not None:
            yield img[y1: y1+h, x1: x1+w], -2, (*[0.] * 4, (landmark - box[:2]) / box[3:])
        # sample crops around each box
        for i in range(15):
            size = npr.randint(int(min(w, h) * 0.8), np.ceil(1.2 * max(w, h)))
            delta_x = npr.randint(-w * 0.2, w * 0.2)
            delta_y = npr.randint(-h * 0.2, h * 0.2)
            nx1 = int(max(x1 + w / 2 + delta_x - size / 2, 0))
            ny1 = int(max(y1 + h / 2 + delta_y - size / 2, 0))
            nx2 = nx1 + size
            ny2 = ny1 + size
            if nx2 > width or ny2 > height:
                continue
            crop_box = np.array([nx1, ny1, size, size])
            # crop
            cropped_im = img[ny1: ny2, nx1: nx2]
            box_ = box.reshape(1, -1)
            iou = get_iou(crop_box, box_)
            if iou < 0.4:
                continue
            # offsets relative to the ground-truth box
            offset_x1 = (x1 - nx1) / size
            offset_y1 = (y1 - ny1) / size
            offset_x2 = (x1 + w - nx1 - size) / size
            offset_y2 = (y1 + h - ny1 - size) / size
            if iou >= 0.65:
                if landmark is None:
                    yield cropped_im, 1, (offset_x1, offset_y1, offset_x2, offset_y2)
                    crop_nums[0] += 1  # was crop_box[0], which corrupted the crop coordinates
                else:
                    marks = (landmark - crop_box[:2]) / crop_box[3:]
                    yield cropped_im, -2, (offset_x1, offset_y1, offset_x2, offset_y2, marks)
                    crop_nums[3] += 1  # was crop_box[3]
            elif landmark is None and iou >= 0.4:
                yield cropped_im, -1, (offset_x1, offset_y1, offset_x2, offset_y2)
                crop_nums[1] += 1  # was crop_box[1]
    if display:
        print("pos: %d part: %d neg: %d landmark: %d" % tuple(crop_nums))
def get_VDG(box1, box2):
    """Visual Dependency Grammar.

    input: 2 bboxes
    output: spatial relation between the 2 boxes

    size of image 785 x 1024 (width = 1024, height = 785)
    """
    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
    box1_overlay = utils.percent_overlap(box1, box2)
    box2_overlay = utils.percent_overlap(box2, box1)
    iou = utils.get_iou(box1, box2)
    if box2_overlay > 0.9:
        # box2 is in box1 --> box1 - '13':'covering' - box2
        predicate = 'covering'
        encode = '13'
        return (predicate, encode)
    if box1_overlay > 0.5:
        # box1 - '31':'on' - box2
        predicate = 'on'
        encode = '31'
        return (predicate, encode)
    box_np1 = np.asarray(box1)
    box_np2 = np.asarray(box2)
    centroid1 = np.asarray([(box1[0] + box1[2]) / 2, (box1[1] + box1[3]) / 2])
    centroid2 = np.asarray([(box2[0] + box2[2]) / 2, (box2[1] + box2[3]) / 2])
    vec_centroid = centroid2 - centroid1
    vec_anchor = np.asarray([1, 0])
    unit_vector_1 = vec_centroid / np.linalg.norm(vec_centroid)
    unit_vector_2 = vec_anchor / np.linalg.norm(vec_anchor)
    dot_product = np.dot(unit_vector_1, unit_vector_2)  # cos(alpha)
    thres_cos_1 = np.cos(45 * np.pi / 180)
    thres_cos_2 = np.cos(135 * np.pi / 180)
    if dot_product > thres_cos_1 or dot_product < thres_cos_2:
        # beside or opposite
        if np.abs(centroid1[0] - centroid2[0]) / 1024 > 0.7:
            # and different sizes of the 2 objects (needs implementing)
            predicate = 'across'  # '2'
            encode = '2'
            return (predicate, encode)
        else:
            predicate = 'near'  # '29'
            encode = '29'
            return (predicate, encode)
    if thres_cos_2 <= dot_product <= thres_cos_1:
        # below or above
        if box1_overlay < 0.1 and box2_overlay < 0.1:
            if centroid1[1] > centroid2[1] and box1[1] > centroid2[1]:
                predicate = 'under'  # '43'
                encode = '43'
                return (predicate, encode)
            if centroid1[1] <= centroid2[1] and box1[3] <= centroid2[1]:
                predicate = 'above'  # '1'
                encode = '1'
                return (predicate, encode)
            else:
                predicate = 'near'  # '29'
                encode = '29'
                return (predicate, encode)
        else:
            predicate = 'near'  # '29'
            encode = '29'
            return (predicate, encode)
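# utils.percent_overlap above is read, from the "box2 is in box1" comment, as
# the fraction of the first argument's area covered by the second; a sketch
# under that reading (the original helper is not shown, so this is an
# assumption).
def percent_overlap_sketch(a, b):
    # Fraction of a's area covered by b, for [x1, y1, x2, y2] boxes.
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    return inter / area_a if area_a > 0 else 0.0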
def combineSimilarBBs(imagesMeta, labels, minIoU=0.4):
    new_imagesMeta = {}
    for imageID, imageMeta in imagesMeta.items():
        # print('ID', imageID)
        data = {'prsBB': [], 'objBB': [], 'labels': []}
        nb_rels = 0
        for relID, rel in imageMeta['rels'].items():
            data['prsBB'].append(rel['prsBB'])
            data['objBB'].append(rel['objBB'])
            data['labels'].append(rel['label'])
            nb_rels += 1
        for key in ['prsBB', 'objBB']:
            bbData = data[key]
            similars = np.array([i for i in range(nb_rels)], dtype=int)  # np.int is deprecated
            already_taken = []
            disabled = []
            while True:
                should_I_stay_or_should_I_go = 'go'
                for fstID, fstBB in enumerate(bbData[0:-1]):
                    if fstID in already_taken:
                        continue
                    for secID, secBB in enumerate(bbData[fstID + 1:]):
                        secID += fstID + 1
                        # print(data['labels'])
                        # print(data['labels'][fstID])
                        fstNames = labels[data['labels'][fstID]]
                        secNames = labels[data['labels'][secID]]
                        if key == 'objBB' and \
                                (fstNames['pred'] == secNames['pred'] or
                                 fstNames['obj'] != secNames['obj']):
                            # print(fstNames, secNames)
                            continue
                        if secID in disabled:
                            continue
                        # print('bbs', key, fstBB, secBB)
                        # print(utils.get_iou(fstBB, secBB, weight=True))
                        if utils.get_iou(fstBB, secBB, weight=True) > minIoU:
                            if similars[secID] != secID:
                                similars[similars == fstID] = similars[secID]
                                already_taken.append(fstID)
                            else:
                                similars[similars == secID] = fstID
                                already_taken.append(secID)
                            should_I_stay_or_should_I_go = 'stay'
                if should_I_stay_or_should_I_go == 'go':
                    # converged
                    break
            new_bbData = [{} for i in range(len(bbData))]
            tmp_conn = []
            for sim in similars:
                bb = bbData[sim]
                if sim in tmp_conn:
                    meanBB = new_bbData[sim]
                    meanBB = utils.meanBB(meanBB, bb)
                else:
                    meanBB = bb
                new_bbData[sim] = meanBB
            bbData = new_bbData
            disabled = already_taken
            data[key] = bbData
            data[key + 'sims'] = similars
        tmp_rels = {}
        for relID in range(nb_rels):
            prsIdx = data['prsBBsims'][relID]
            objIdx = data['objBBsims'][relID]
            label = data['labels'][relID]
            if imageID == 'HICO_test2015_00000007.jpg':
                print(prsIdx, objIdx, label)
            if prsIdx not in tmp_rels:
                tmp_rels[prsIdx] = {}
            if objIdx not in tmp_rels[prsIdx]:
                tmp_rels[prsIdx][objIdx] = []
            tmp_rels[prsIdx][objIdx].append(label)
        rels = {}
        relID = 0
        for prsIdx, sub_rels in tmp_rels.items():
            for objIdx, insLabels in sub_rels.items():
                prsBB = data['prsBB'][prsIdx]
                objBB = data['objBB'][objIdx]
                rel = {'prsBB': prsBB, 'objBB': objBB, 'labels': insLabels}
                rels[relID] = rel
                relID += 1
        new_imagesMeta[imageID] = {'imageName': imageMeta['imageName'], 'rels': rels}
    return new_imagesMeta
def __getitem__(self, idx):
    out_dict = {}
    out_dict['args'] = self.args
    datum = self.data[idx]
    # uid = datum['uid']
    # out_dict['uid'] = uid
    # test = 'test' in datum['annot_id']
    # out_dict['is_test'] = test

    ###### Image ######
    if self.args.use_vision:
        img_id = datum['image_id']
        out_dict['img_id'] = img_id
        # img_path = coco_img_dir.joinpath(datum['img_fn'])
        # assert img_path.exists()
        # out_dict['img_path'] = img_path
        # source = self.img_ids_to_source[img_id]
        source = self.split
        f = self.source_to_h5[source]
        if isinstance(f, Path):
            f = h5py.File(f, 'r')
            self.source_to_h5[source] = f
        img_h = f[f'{img_id}/img_h'][()]
        img_w = f[f'{img_id}/img_w'][()]
        # pred_boxes = f[f'{img_id}/boxes']
        boxes = f[f'{img_id}/boxes'][:self.args.n_boxes]

        # shuffle box order
        if self.args.shuffle_boxes and self.mode == 'train':
            box_indices = np.arange(len(boxes))
            np.random.shuffle(box_indices)
            boxes = boxes[box_indices]

        n_boxes = len(boxes)
        out_dict['n_boxes'] = n_boxes

        ref_box = datum['refBox']
        ref_box = xywh_to_xyxy(np.array([ref_box]))
        ious = get_iou(torch.tensor(boxes, dtype=torch.float),
                       torch.tensor(ref_box, dtype=torch.float))
        threshold = 0.5
        scores = ious.detach().numpy().flatten()
        scores[scores < threshold] = 0
        scores = scores.astype(np.float64)
        exists_target = scores.sum() > 0
        if exists_target:
            correct_indices = np.nonzero(scores)[0].tolist()
            prob = scores / scores.sum()
            choice = np.random.multinomial(1, prob).argmax()
        else:
            correct_indices = []
            choice = -100

        # Normalize the boxes (to 0 ~ 1)
        boxes[:, (0, 2)] /= img_w
        boxes[:, (1, 3)] /= img_h
        np.testing.assert_array_less(boxes, 1 + 1e-5)
        # np.testing.assert_array_less(boxes, 1+5e-2)
        np.testing.assert_array_less(-boxes, 0 + 1e-5)
        boxes = torch.from_numpy(boxes)
        # assert boxes.size() == (36, 4), (boxes.size(),
        #     datum['img_id'], gt_boxes.shape, pred_boxes.shape)
        boxes.clamp_(min=0.0, max=1.0)
        out_dict['boxes'] = boxes

        feats = f[f'{img_id}/features'][:self.args.n_boxes]
        if self.args.shuffle_boxes and self.mode == 'train':
            feats = feats[box_indices]
        feats = torch.from_numpy(feats)
        out_dict['vis_feats'] = feats
        out_dict['boxes'] = boxes

    ###### Text ######
    sent = datum['caption']
    # prefix = "refer expressions:"
    prefix = "visual grounding:"
    # prefix = "grounding:"
    input_text = f'{prefix} {sent}'
    if exists_target:
        if self.args.vis_pointer:
            all_target_ids = correct_indices
            target_text = ''
        else:
            target_text = f'<vis_extra_id_{choice}>'
            all_target_ids = self.tokenizer.convert_tokens_to_ids(
                [f'<vis_extra_id_{idx}>' for idx in correct_indices])
    else:
        if self.args.vis_pointer:
            all_target_ids = []
            target_text = ''
        else:
            target_text = ''
            all_target_ids = []
    out_dict['exists_target'] = exists_target
    out_dict['iou'] = ious
    out_dict['target'] = choice
    out_dict['all_targets'] = correct_indices
    out_dict['all_target_ids'] = all_target_ids

    input_ids = self.tokenizer.encode(input_text,
                                      max_length=self.args.max_text_length,
                                      truncation=True)
    target_ids = self.tokenizer.encode(target_text,
                                       max_length=self.args.max_text_length,
                                       truncation=True)
    out_dict['input_ids'] = torch.LongTensor(input_ids)
    out_dict['input_length'] = len(input_ids)
    out_dict['target_ids'] = torch.LongTensor(target_ids)
    out_dict['target_length'] = len(target_ids)
    out_dict['input_text'] = input_text
    out_dict['target_text'] = target_text
    return out_dict
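# get_iou in __getitem__ above takes an (n_boxes, 4) tensor and a (1, 4)
# reference box and is flattened like a matrix of pairwise IoUs; a sketch of
# such a pairwise torch IoU for xyxy boxes (the original helper is not shown,
# so the broadcast semantics are an assumption).
import torch

def pairwise_iou_sketch(boxes1, boxes2):
    # boxes1: (N, 4), boxes2: (M, 4) in xyxy; returns an (N, M) IoU matrix.
    lt = torch.max(boxes1[:, None, :2], boxes2[None, :, :2])  # (N, M, 2)
    rb = torch.min(boxes1[:, None, 2:], boxes2[None, :, 2:])  # (N, M, 2)
    wh = (rb - lt).clamp(min=0)
    inter = wh[..., 0] * wh[..., 1]
    area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
    area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
    return inter / (area1[:, None] + area2[None, :] - inter)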