Example #1
def main():

    args = parse_args()
    cfg.set_args(args.gpu_ids)
    cudnn.fastest = True
    cudnn.benchmark = True
    cudnn.deterministic = False
    cudnn.enabled = True

    tester = Tester(args.backbone)
    tester._make_batch_generator()

    for epoch in range(args.model_epoch[0], args.model_epoch[1]):

        tester._make_model(epoch)

        preds = []

        with torch.no_grad():
            for itr, input_img in enumerate(tqdm(tester.batch_generator)):

                # forward
                coord_out = tester.model(input_img)

                if cfg.flip_test:
                    flipped_input_img = flip(input_img, dims=3)
                    flipped_coord_out = tester.model(flipped_input_img)
                    flipped_coord_out[:, :, 0] = cfg.output_shape[1] - flipped_coord_out[:, :, 0] - 1
                    for pair in tester.flip_pairs:
                        flipped_coord_out[:, pair[0], :], flipped_coord_out[:, pair[1], :] = \
                            flipped_coord_out[:, pair[1], :].clone(), flipped_coord_out[:, pair[0], :].clone()
                    coord_out = (coord_out + flipped_coord_out) / 2.

                vis = False
                if vis:
                    filename = str(itr)
                    tmpimg = input_img[0].cpu().numpy()
                    tmpimg = tmpimg * np.array(cfg.pixel_std).reshape(
                        3, 1, 1) + np.array(cfg.pixel_mean).reshape(3, 1, 1)
                    tmpimg = tmpimg.astype(np.uint8)
                    tmpimg = tmpimg[::-1, :, :]
                    tmpimg = np.transpose(tmpimg, (1, 2, 0)).copy()
                    tmpkps = np.zeros((3, tester.joint_num))
                    tmpkps[:2, :] = coord_out[0, :, :2].cpu().numpy().transpose(1, 0) \
                        / cfg.output_shape[0] * cfg.input_shape[0]
                    tmpkps[2, :] = 1
                    tmpimg = vis_keypoints(tmpimg, tmpkps, tester.skeleton)
                    cv2.imwrite(filename + '_output.jpg', tmpimg)

                coord_out = coord_out.cpu().numpy()
                preds.append(coord_out)

        # evaluate
        preds = np.concatenate(preds, axis=0)
        tester._evaluate(preds, cfg.result_dir)
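The flip-test branch above assumes a flip helper that mirrors the NCHW input batch along its width axis (dims=3). A minimal sketch under that assumption (the repo's own utility may differ):

import torch

def flip(tensor, dims):
    # Mirror the tensor along the given dimension (width = 3 for an NCHW batch),
    # as used by the flip-test augmentation above.
    return torch.flip(tensor, dims=[dims])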
Example #2
    def evaluate(self, preds, result_dir):
        
        print('Evaluation start...')
        gts = self.data
        sample_num = len(preds)
        joint_num = self.original_joint_num
 
        pred_2d_save = {}
        pred_3d_save = {}
        for n in range(sample_num):
            
            gt = gts[n]
            f = gt['f']
            c = gt['c']
            bbox = gt['bbox']
            gt_3d_root = gt['root_cam']
            img_name = gt['img_path'].split('/')
            img_name = img_name[-2] + '_' + img_name[-1].split('.')[0] # e.g., TS1_img_0001
            
            # restore coordinates to original space
            pred_2d_kpt = preds[n].copy()
            # only consider eval_joint
            pred_2d_kpt = np.take(pred_2d_kpt, self.eval_joint, axis=0)
            pred_2d_kpt[:,0] = pred_2d_kpt[:,0] / cfg.output_shape[1] * bbox[2] + bbox[0]
            pred_2d_kpt[:,1] = pred_2d_kpt[:,1] / cfg.output_shape[0] * bbox[3] + bbox[1]
            pred_2d_kpt[:,2] = (pred_2d_kpt[:,2] / cfg.depth_dim * 2 - 1) * (cfg.bbox_3d_shape[0]/2) + gt_3d_root[2]

            # 2d kpt save
            if img_name in pred_2d_save:
                pred_2d_save[img_name].append(pred_2d_kpt[:,:2])
            else:
                pred_2d_save[img_name] = [pred_2d_kpt[:,:2]]

            vis = False
            if vis:
                cvimg = cv2.imread(gt['img_path'], cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
                filename = str(random.randrange(1,500))
                tmpimg = cvimg.copy().astype(np.uint8)
                tmpkps = np.zeros((3,joint_num))
                tmpkps[0,:], tmpkps[1,:] = pred_2d_kpt[:,0], pred_2d_kpt[:,1]
                tmpkps[2,:] = 1
                tmpimg = vis_keypoints(tmpimg, tmpkps, self.skeleton)
                cv2.imwrite(filename + '_output.jpg', tmpimg)

            # back project to camera coordinate system
            pred_3d_kpt = pixel2cam(pred_2d_kpt, f, c)
            
            # 3d kpt save
            if img_name in pred_3d_save:
                pred_3d_save[img_name].append(pred_3d_kpt)
            else:
                pred_3d_save[img_name] = [pred_3d_kpt]
        
        output_path = osp.join(result_dir,'preds_2d_kpt_mupots.mat')
        sio.savemat(output_path, pred_2d_save)
        print("Testing result is saved at " + output_path)
        output_path = osp.join(result_dir,'preds_3d_kpt_mupots.mat')
        sio.savemat(output_path, pred_3d_save)
        print("Testing result is saved at " + output_path)
Example #3
    def evaluate(self, preds, result_dir):
        
        print('Evaluation start...')
        gts = self.data
        assert len(gts) == len(preds)
        sample_num = len(gts)
        joint_num = self.joint_num
        
        pred_save = []
        for n in range(sample_num):
            gt = gts[n]
            image_id = gt['img_id']
            f = gt['f']
            c = gt['c']
            bbox = gt['bbox']
            gt_3d_root = gt['root_cam']
            gt_3d_kpt = gt['joint_cam']
            gt_vis = gt['joint_vis']
            
            # restore coordinates to original space
            pre_2d_kpt = preds[n].copy()
            pre_2d_kpt[:,0], pre_2d_kpt[:,1], pre_2d_kpt[:,2] = warp_coord_to_original(pre_2d_kpt, bbox, gt_3d_root)
            
            vis = False
            if vis:
                cvimg = cv2.imread(gt['img_path'], cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
                filename = str(random.randrange(1,500))
                tmpimg = cvimg.copy().astype(np.uint8)
                tmpkps = np.zeros((3,joint_num))
                tmpkps[0,:], tmpkps[1,:] = pre_2d_kpt[:,0], pre_2d_kpt[:,1]
                tmpkps[2,:] = 1
                tmpimg = vis_keypoints(tmpimg, tmpkps, self.skeleton)
                cv2.imwrite(filename + '_output.jpg', tmpimg)

            # back project to camera coordinate system
            pred_3d_kpt = np.zeros((joint_num,3))
            pred_3d_kpt[:,0], pred_3d_kpt[:,1], pred_3d_kpt[:,2] = pixel2cam(pre_2d_kpt, f, c)

            # root joint alignment
            pred_3d_kpt = pred_3d_kpt - pred_3d_kpt[self.root_idx]
            gt_3d_kpt  = gt_3d_kpt - gt_3d_kpt[self.root_idx]

            # rigid alignment for PA MPJPE (protocol #1)
            pred_3d_kpt_align = rigid_align(pred_3d_kpt, gt_3d_kpt)
            
            # exclude thorax
            pred_3d_kpt = np.take(pred_3d_kpt, self.eval_joint, axis=0)
            pred_3d_kpt_align = np.take(pred_3d_kpt_align, self.eval_joint, axis=0)

            # prediction save
            pred_save.append({'image_id': image_id, 'joint_cam': pred_3d_kpt.tolist(), 'joint_cam_aligned': pred_3d_kpt_align.tolist(), 'bbox': bbox.tolist(), 'root_cam': gt_3d_root.tolist()}) # joint_cams are root-relative

        output_path = osp.join(result_dir, 'bbox_root_pose_human36m_output.json')
        with open(output_path, 'w') as f:
            json.dump(pred_save, f)
        print("Test result is saved at " + output_path)

        calculate_score(output_path, self.annot_dir, self.subject_list)
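The warp_coord_to_original call above is assumed to perform the same mapping that Example #2 writes out inline: heatmap-space x/y are rescaled into the original image via the person bounding box, and the discretized depth bin is mapped to metric depth around the root joint depth. A sketch under that assumption (cfg is the same config object used throughout):

import numpy as np

def warp_coord_to_original(coord_out, bbox, gt_3d_root):
    # Heatmap-space x/y -> original image pixels via the person bounding box;
    # discretized depth bin -> metric depth centered on the root joint depth.
    x = coord_out[:, 0] / cfg.output_shape[1] * bbox[2] + bbox[0]
    y = coord_out[:, 1] / cfg.output_shape[0] * bbox[3] + bbox[1]
    z = (coord_out[:, 2] / cfg.depth_dim * 2 - 1) * (cfg.bbox_3d_shape[0] / 2) + gt_3d_root[2]
    return x, y, z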
Example #4
def show_pose(input_patch, img_patch, coord_out, skeleton, save_path, frame):
    tmpimg = input_patch[0].cpu().numpy()
    tmpimg = tmpimg * np.array(cfg.pixel_std).reshape(3,1,1) + np.array(cfg.pixel_mean).reshape(3,1,1)
    tmpimg = (tmpimg).astype(np.uint8)
    tmpimg = tmpimg[::-1, :, :]
    tmpimg = np.transpose(tmpimg,(1,2,0)).copy()
    tmpkps = np.zeros((3,18))
    tmpkps[:2,:] = coord_out[0,:,:2].transpose(1,0) / cfg.output_shape[0] * cfg.input_shape[0]
    tmpkps[2,:] = 1
    tmpimg = vis_keypoints(tmpimg, tmpkps, skeleton)
    tmpimg = cv2.resize(tmpimg,(img_patch.shape[1],img_patch.shape[0]))
    file_name = save_path+'\\{0}.png'.format(str(frame).zfill(2))
    cv2.imwrite(file_name, tmpimg)
Example #5
def train(loader,
          ds_rd,
          model,
          criterion,
          optimizer,
          epoch,
          n_iter=-1,
          logger=None,
          opts=None,
          visualizer=None):
    '''
    Iterate through one epoch; return rst {'losses', 'accs'}, each a list that can be used outside for logging/updating.
    :param loader: data loader
    :param ds_rd: dataset reader (provides joint count, normalization stats, color maps)
    :param model: network to train
    :param criterion: loss function
    :param optimizer: optimizer
    :param epoch: current epoch index (for printing)
    :param n_iter: number of iterations to run; -1 for all iterations
    :param logger: logger for progress messages
    :param opts: additional options/controls
    :param visualizer: optional visualizer for intermediate results
    :return: dict {'losses': list, 'accs': list}
    '''
    batch_time = ut.AverageMeter()
    data_time = ut.AverageMeter()
    losses = ut.AverageMeter()
    acc = ut.AverageMeter()

    # switch to train mode
    model.train()
    end = time.time()
    li_loss = []
    li_acc = []
    for i, inp_dct in enumerate(loader):
        # get items
        if i >= n_iter and n_iter > 0:  # break if iter is set and i is greater than that
            break
        input = inp_dct['pch']
        target = inp_dct['hms']  # 14 x 64 x 1??
        target_weight = inp_dct['joints_vis']

        # measure data loading time     weight, visible or not
        data_time.update(time.time() - end)

        # compute output
        outputs = model(input)  # no need to cuda it?

        target = target.cuda(non_blocking=True)
        target_weight = target_weight.cuda(non_blocking=True)

        if isinstance(outputs, list):  # list multiple stage version
            loss = criterion(outputs[0], target, target_weight)
            for output in outputs[1:]:
                loss += criterion(output, target, target_weight)
        else:
            output = outputs
            loss = criterion(output, target, target_weight)

        # compute gradient and do update step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure accuracy and record loss
        losses.update(loss.item(), input.size(0))
        _, avg_acc, cnt, pred = accuracy(
            output.detach().cpu().numpy(),
            target.detach().cpu().numpy()
        )  # computed on heatmaps directly (distances normalized by 1/10 of the heatmap size, PCK@0.5); cnt: number of samples, pred: predicted coords
        acc.update(avg_acc, cnt)  # keep average acc

        if visualizer and 0 == i % opts.update_html_freq:  # update current result, get vis dict
            n_jt = ds_rd.joint_num_ori
            mod0 = opts.mod_src[0]
            mean = ds_rd.means[mod0]
            std = ds_rd.stds[mod0]
            img_patch_vis = ut.ts2cv2(
                input[0], mean,
                std)  # convert tensor to a CV BGR image; mean/std de-normalization and detach happen inside
            # pseudo change
            cm = getattr(cv2, ds_rd.dct_clrMap[mod0])
            img_patch_vis = cv2.applyColorMap(img_patch_vis,
                                              cm)[..., ::-1]  # RGB

            # get pred
            pred2d_patch = np.ones((n_jt, 3))  # 3rd for  vis
            pred2d_patch[:, :2] = pred[0] / opts.out_shp[0] * opts.sz_pch[1]
            img_skel = vis.vis_keypoints(img_patch_vis, pred2d_patch,
                                         ds_rd.skels_idx)

            hm_gt = target[0].cpu().detach().numpy().sum(axis=0)  # HXW
            hm_gt = ut.normImg(hm_gt)

            hm_pred = output[0].detach().cpu().numpy().sum(axis=0)
            hm_pred = ut.normImg(hm_pred)
            img_cb = vis.hconcat_resize([img_skel, hm_gt, hm_pred])
            vis_dict = {'img_cb': img_cb}
            visualizer.display_current_results(vis_dict, epoch, False)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % opts.print_freq == 0:
            msg = 'Epoch: [{0}][{1}/{2}]\t' \
                  'Time {batch_time.val:.3f}s ({batch_time.avg:.3f}s)\t' \
                  'Speed {speed:.1f} samples/s\t' \
                  'Data {data_time.val:.3f}s ({data_time.avg:.3f}s)\t' \
                  'Loss {loss.val:.5f} ({loss.avg:.5f})\t' \
                  'Accuracy {acc.val:.3f} ({acc.avg:.3f})'.format(
             epoch, i, len(loader), batch_time=batch_time,
             speed=input.size(0) / batch_time.val,
             data_time=data_time, loss=losses, acc=acc)
            logger.info(msg)
            li_loss.append(losses.val)  # the current loss
            li_acc.append(acc.val)

    return {'losses': li_loss, 'accs': li_acc}
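The running loss/accuracy bookkeeping above relies on ut.AverageMeter. A minimal sketch of the interface it is assumed to provide (val, avg, update):

class AverageMeter:
    # Tracks the latest value and a running average, as used for batch_time,
    # data_time, losses and acc in the training loop above.
    def __init__(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count if self.count else 0.0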
Example #6
    def evaluate(self, preds, result_dir):

        print('Evaluation start...')
        gts = self.data
        assert len(gts) == len(preds)
        sample_num = len(gts)

        pred_save = []
        error = np.zeros((sample_num, self.joint_num - 1))  # joint error
        error_action = [[] for _ in range(len(self.action_name))
                        ]  # error for each sequence
        for n in range(sample_num):
            gt = gts[n]
            image_id = gt['img_id']
            f = gt['f']
            c = gt['c']
            bbox = gt['bbox']
            gt_3d_root = gt['root_cam']
            gt_3d_kpt = gt['joint_cam']
            gt_vis = gt['joint_vis']

            # restore coordinates to original space
            pred_2d_kpt = preds[n].copy()
            pred_2d_kpt[:, 0] = pred_2d_kpt[:, 0] / cfg.output_shape[1] * bbox[2] + bbox[0]
            pred_2d_kpt[:, 1] = pred_2d_kpt[:, 1] / cfg.output_shape[0] * bbox[3] + bbox[1]
            pred_2d_kpt[:, 2] = (pred_2d_kpt[:, 2] / cfg.depth_dim * 2 - 1) * (cfg.bbox_3d_shape[0] / 2) + gt_3d_root[2]

            vis = False
            if vis:
                cvimg = cv2.imread(
                    gt['img_path'],
                    cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
                filename = str(random.randrange(1, 500))
                tmpimg = cvimg.copy().astype(np.uint8)
                tmpkps = np.zeros((3, self.joint_num))
                tmpkps[0, :], tmpkps[1, :] = pred_2d_kpt[:, 0], pred_2d_kpt[:, 1]
                tmpkps[2, :] = 1
                tmpimg = vis_keypoints(tmpimg, tmpkps, self.skeleton)
                cv2.imwrite(filename + '_output.jpg', tmpimg)

            # back project to camera coordinate system
            pred_3d_kpt = pixel2cam(pred_2d_kpt, f, c)

            # root joint alignment
            pred_3d_kpt = pred_3d_kpt - pred_3d_kpt[self.root_idx]
            gt_3d_kpt = gt_3d_kpt - gt_3d_kpt[self.root_idx]

            if self.protocol == 1:
                # rigid alignment for PA MPJPE (protocol #1)
                pred_3d_kpt = rigid_align(pred_3d_kpt, gt_3d_kpt)

            # exclude thorax
            pred_3d_kpt = np.take(pred_3d_kpt, self.eval_joint, axis=0)
            gt_3d_kpt = np.take(gt_3d_kpt, self.eval_joint, axis=0)

            # error calculate
            error[n] = np.sqrt(np.sum((pred_3d_kpt - gt_3d_kpt)**2, 1))
            img_name = gt['img_path']
            action_idx = int(img_name[img_name.find('act') +
                                      4:img_name.find('act') + 6]) - 2
            error_action[action_idx].append(error[n].copy())

            # prediction save
            pred_save.append({
                'image_id': image_id,
                'joint_cam': pred_3d_kpt.tolist(),
                'bbox': bbox.tolist(),
                'root_cam': gt_3d_root.tolist()
            })  # joint_cam is root-relative coordinate

        # total error
        tot_err = np.mean(error)
        metric = 'PA MPJPE' if self.protocol == 1 else 'MPJPE'
        eval_summary = 'Protocol ' + str(
            self.protocol) + ' error (' + metric + ') >> tot: %.2f\n' % (
                tot_err)

        # error for each action
        for i in range(len(error_action)):
            err = np.mean(np.array(error_action[i]))
            eval_summary += (self.action_name[i] + ': %.2f ' % err)

        print(eval_summary)

        # prediction save
        output_path = osp.join(result_dir,
                               'bbox_root_pose_human36m_output.json')
        with open(output_path, 'w') as f:
            json.dump(pred_save, f)
        print("Test result is saved at " + output_path)
Example #7
    pose_3d[:, 0] = pose_3d[:, 0] / cfg.output_shape[1] * cfg.input_shape[1]
    pose_3d[:, 1] = pose_3d[:, 1] / cfg.output_shape[0] * cfg.input_shape[0]
    pose_3d_xy1 = np.concatenate(
        (pose_3d[:, :2], np.ones_like(pose_3d[:, :1])), 1)
    img2bb_trans_001 = np.concatenate(
        (img2bb_trans, np.array([0, 0, 1]).reshape(1, 3)))
    pose_3d[:, :2] = np.dot(np.linalg.inv(img2bb_trans_001),
                            pose_3d_xy1.transpose(1, 0)).transpose(1, 0)[:, :2]
    output_pose_2d_list.append(pose_3d[:, :2].copy())

    # root-relative discretized depth -> absolute continuous depth
    pose_3d[:, 2] = (pose_3d[:, 2] / cfg.depth_dim * 2 -
                     1) * (cfg.bbox_3d_shape[0] / 2) + root_depth_list[n]
    pose_3d = pixel2cam(pose_3d, focal, princpt)
    output_pose_3d_list.append(pose_3d.copy())

# visualize 2d poses
vis_img = original_img.copy()
for n in range(person_num):
    vis_kps = np.zeros((3, joint_num))
    vis_kps[0, :] = output_pose_2d_list[n][:, 0]
    vis_kps[1, :] = output_pose_2d_list[n][:, 1]
    vis_kps[2, :] = 1
    vis_img = vis_keypoints(vis_img, vis_kps, skeleton)
cv2.imwrite('output_pose_2d.jpg', vis_img)

# visualize 3d poses
vis_kps = np.array(output_pose_3d_list)
vis_3d_multiple_skeleton(vis_kps, np.ones_like(vis_kps), skeleton,
                         'output_pose_3d (x,y,z: camera-centered. mm.)')
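pixel2cam, used here and in the evaluation code above, back-projects (x, y) pixel coordinates plus metric depth into camera space with a pinhole model. Some examples unpack three return values while others keep a single (J, 3) array, so the repos evidently ship slightly different variants; a sketch of the stacked-array form:

import numpy as np

def pixel2cam(pixel_coord, f, c):
    # (x_px, y_px, depth) -> camera-centered (X, Y, Z); f = (fx, fy), c = (cx, cy).
    x = (pixel_coord[:, 0] - c[0]) / f[0] * pixel_coord[:, 2]
    y = (pixel_coord[:, 1] - c[1]) / f[1] * pixel_coord[:, 2]
    z = pixel_coord[:, 2]
    return np.stack((x, y, z), axis=1)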
Example #8
    def evaluate(self, preds, result_dir):

        print()
        print('Evaluation start...')

        gts = self.load_data()

        assert len(gts) == len(preds)

        sample_num = len(gts)
        joint_num = self.joint_num

        p1_error = np.zeros(
            (sample_num, joint_num, 3))  # PA MPJPE (protocol #1 metric)
        p2_error = np.zeros(
            (sample_num, joint_num, 3))  # MPJPE (protocol #2 metric)
        p1_error_action = [[] for _ in range(len(self.action_idx))
                           ]  # PA MPJPE for each action
        p2_error_action = [[] for _ in range(len(self.action_idx))
                           ]  # MPJPE error for each action
        pred_to_save = []
        for n in range(sample_num):

            gt = gts[n]
            f = gt['f']
            c = gt['c']
            bbox = gt['bbox']
            gt_3d_center = gt['center_cam']
            gt_3d_kpt = gt['joint_cam']
            gt_vis = gt['joint_vis'].copy()

            # restore coordinates to original space
            pre_2d_kpt = preds[n].copy()
            pre_2d_kpt[:, 0], pre_2d_kpt[:, 1], pre_2d_kpt[:, 2] = warp_coord_to_original(
                pre_2d_kpt, bbox, gt_3d_center)

            vis = False
            if vis:
                cvimg = cv2.imread(
                    gt['img_path'],
                    cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
                filename = str(random.randrange(1, 500))
                tmpimg = cvimg.copy().astype(np.uint8)
                tmpkps = np.zeros((3, joint_num))
                tmpkps[0, :], tmpkps[1, :] = pre_2d_kpt[:, 0], pre_2d_kpt[:, 1]
                tmpkps[2, :] = 1
                tmpimg = vis_keypoints(tmpimg, tmpkps, self.skeleton)
                cv2.imwrite(osp.join(cfg.vis_dir, filename + '_output.jpg'),
                            tmpimg)

            # back project to camera coordinate system
            pre_3d_kpt = np.zeros((joint_num, 3))
            pre_3d_kpt[:, 0], pre_3d_kpt[:, 1], pre_3d_kpt[:, 2] = pixel2cam(
                pre_2d_kpt, f, c)

            vis = False
            if vis:
                vis_3d_skeleton(pre_3d_kpt, gt_vis, self.skeleton, filename)

            # root joint alignment
            pre_3d_kpt = pre_3d_kpt - pre_3d_kpt[self.root_idx]
            gt_3d_kpt = gt_3d_kpt - gt_3d_kpt[self.root_idx]

            # rigid alignment for PA MPJPE (protocol #1)
            pre_3d_kpt_align = rigid_align(pre_3d_kpt, gt_3d_kpt)

            # prediction save
            pred_to_save.append({
                'pred': pre_3d_kpt,
                'align_pred': pre_3d_kpt_align,
                'gt': gt_3d_kpt
            })

            # error save
            p1_error[n] = np.power(pre_3d_kpt_align - gt_3d_kpt,
                                   2)  # PA MPJPE (protocol #1)
            p2_error[n] = np.power(pre_3d_kpt - gt_3d_kpt,
                                   2)  # MPJPE (protocol #2)

            img_name = gt['img_path']
            action_idx = int(img_name[img_name.find('act') +
                                      4:img_name.find('act') + 6]) - 2
            p1_error_action[action_idx].append(p1_error[n].copy())
            p2_error_action[action_idx].append(p2_error[n].copy())

        # total error calculate
        p1_error = np.take(p1_error, self.eval_joint, axis=1)
        p2_error = np.take(p2_error, self.eval_joint, axis=1)
        p1_error = np.mean(np.power(np.sum(p1_error, axis=2), 0.5))
        p2_error = np.mean(np.power(np.sum(p2_error, axis=2), 0.5))

        p1_eval_summary = 'Protocol #1 error (PA MPJPE) >> %.2f' % (p1_error)
        p2_eval_summary = 'Protocol #2 error (MPJPE) >> %.2f' % (p2_error)
        print()
        print(p1_eval_summary)
        print(p2_eval_summary)

        # error for each action calculate
        p1_action_eval_summary = 'Protocol #1 error (PA MPJPE) for each action: \n'
        for i in range(len(p1_error_action)):
            err = np.array(p1_error_action[i])
            err = np.take(err, self.eval_joint, axis=1)
            err = np.mean(np.power(np.sum(err, axis=2), 0.5))

            action_name = self.action_name[i]
            p1_action_eval_summary += (action_name + ': %.2f\n' % err)

        p2_action_eval_summary = 'Protocol #2 error (MPJPE) for each action: \n'
        for i in range(len(p2_error_action)):
            err = np.array(p2_error_action[i])
            err = np.take(err, self.eval_joint, axis=1)
            err = np.mean(np.power(np.sum(err, axis=2), 0.5))

            action_name = self.action_name[i]
            p2_action_eval_summary += (action_name + ': %.2f\n' % err)
        print()
        print(p1_action_eval_summary)
        print(p2_action_eval_summary)

        # result save
        f_pred_3d_kpt = open(osp.join(result_dir, 'pred_3d_kpt.txt'), 'w')
        f_pred_3d_kpt_align = open(
            osp.join(result_dir, 'pred_3d_kpt_align.txt'), 'w')
        f_gt_3d_kpt = open(osp.join(result_dir, 'gt_3d_kpt.txt'), 'w')
        for i in range(len(pred_to_save)):
            for j in range(joint_num):
                for k in range(3):
                    f_pred_3d_kpt.write('%.3f ' %
                                        pred_to_save[i]['pred'][j][k])
                    f_pred_3d_kpt_align.write(
                        '%.3f ' % pred_to_save[i]['align_pred'][j][k])
                    f_gt_3d_kpt.write('%.3f ' % pred_to_save[i]['gt'][j][k])
            f_pred_3d_kpt.write('\n')
            f_pred_3d_kpt_align.write('\n')
            f_gt_3d_kpt.write('\n')
        f_pred_3d_kpt.close()
        f_pred_3d_kpt_align.close()
        f_gt_3d_kpt.close()

        f_eval_result = open(osp.join(result_dir, 'eval_result.txt'), 'w')
        f_eval_result.write(p1_eval_summary)
        f_eval_result.write('\n')
        f_eval_result.write(p2_eval_summary)
        f_eval_result.write('\n')
        f_eval_result.write(p1_action_eval_summary)
        f_eval_result.write('\n')
        f_eval_result.write(p2_action_eval_summary)
        f_eval_result.write('\n')
        f_eval_result.close()
Example #9
    def visualize(self, preds, filename, img_path, inv_trans):
        joint_valid = [1.0]*21 + [1.0]*21  # set to 0.0 if that hand is not present; right-hand joints come first in the output
        focal = [1500, 1500] # x-axis, y-axis
        princpt = [256/2, 256/2] 
        root_joint_idx = {'right': 20, 'left': 41}

        preds_joint_coord, preds_rel_root_depth, preds_hand_type = preds['joint_coord'], preds['rel_root_depth'], preds['hand_type']

        print(preds_hand_type)


        # img = load_image(img_path)
        
        # inv_trans = augmentation(img, 
        #                         np.array(bbox), 
        #                         preds_joint_coord, 
        #                         joint_valid, 
        #                         preds_hand_type, 
        #                         'test', 
        #                         None)
        
        pred_joint_coord_img = preds_joint_coord[0].copy()
        pred_joint_coord_img[:,0] = pred_joint_coord_img[:,0]/cfg.output_hm_shape[2]*cfg.input_img_shape[1]
        pred_joint_coord_img[:,1] = pred_joint_coord_img[:,1]/cfg.output_hm_shape[1]*cfg.input_img_shape[0]
        for j in range(21*2):
            pred_joint_coord_img[j,:2] = trans_point2d(pred_joint_coord_img[j,:2], inv_trans)
        pred_joint_coord_img[:,2] = (pred_joint_coord_img[:,2]/cfg.output_hm_shape[0] * 2 - 1) * (cfg.bbox_3d_size/2)

        # if preds_hand_type[0][0] == 0.9 and preds_hand_type[0][1] == 0.9:  # adjust the threshold to run this part when both hands are present
        #     pred_rel_root_depth = (preds_rel_root_depth[0]/cfg.output_root_hm_shape * 2 - 1) * (cfg.bbox_3d_size_root/2)

        #     pred_left_root_img = pred_joint_coord_img[root_joint_idx['left']].copy()
        #     pred_left_root_img[2] +=  pred_rel_root_depth
        #     pred_left_root_cam = pixel2cam(pred_left_root_img[None,:], focal, princpt)[0]

        #     pred_right_root_img = pred_joint_coord_img[root_joint_idx['right']].copy()
        #     pred_right_root_cam = pixel2cam(pred_right_root_img[None,:], focal, princpt)[0]
            
        #     pred_rel_root = pred_left_root_cam - pred_right_root_cam

        # pred_joint_coord_cam = pixel2cam(pred_joint_coord_img, focal, princpt)
        
        # joint_type = {'right': np.arange(0,21), 'left': np.arange(21,21*2)}
        # for h in ('right', 'left'):
        #     pred_joint_coord_cam[joint_type[h]] = pred_joint_coord_cam[joint_type[h]] - pred_joint_coord_cam[root_joint_idx[h],None,:]

        # print("^^^^^^^^^^^^^^^^^^^^^^^^^^^")
        # print(pred_joint_coord_cam.shape)
        
        print(img_path)
        cvimg = cv2.imread(img_path, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
        _img = cvimg[:,:,::-1].transpose(2,0,1)
        vis_kps = pred_joint_coord_img.copy()
        vis_valid = joint_valid.copy()
        
        filename = img_path.replace(".jpg", "2D.jpg").replace("main/custom_data", "custom_output")
        vis_keypoints(_img, vis_kps, joint_valid, self.skeleton, filename)
        filename = img_path.replace(".jpg", "3D.jpg").replace("main/custom_data", "custom_output")
        vis_3d_keypoints(pred_joint_coord_img, joint_valid, self.skeleton, filename)

        print("Finished Processing Image!!!!" + "\n")
Example #10
# handedness
joint_valid = np.zeros((joint_num * 2), dtype=np.float32)
right_exist = False
if hand_type[0] > 0.5:
    right_exist = True
    joint_valid[joint_type['right']] = 1
left_exist = False
if hand_type[1] > 0.5:
    left_exist = True
    joint_valid[joint_type['left']] = 1

print('Right hand exist: ' + str(right_exist) + ' Left hand exist: ' +
      str(left_exist))

# visualize joint coord in 2D space
filename = 'result_2d.jpg'
vis_img = original_img.copy()[:, :, ::-1].transpose(2, 0, 1)
vis_img = vis_keypoints(vis_img,
                        joint_coord,
                        joint_valid,
                        skeleton,
                        filename,
                        save_path='.')

# visualize joint coord in 3D space
# The 3D coordinate in here consists of x,y pixel and z root-relative depth.
# To make x, y, and z in real units (e.g., mm), you need to know the camera intrinsics and the root depth.
# The root depth can be obtained from RootNet (https://github.com/mks0601/3DMPPE_ROOTNET_RELEASE)
filename = 'result_3d'
vis_3d_keypoints(joint_coord, joint_valid, skeleton, filename)
Example #11
def main():
    video_list = ['Cant stop the feeling - Justin Timberlake - Easy Dance for Kids', 'Dance like yo daddy', 'Danny Ocean - Baby I Wont', 'Si una vez - If I Once', 'Vaiven - MegaMix']
    for video in video_list:
        video_dir = 'dance_videos\\' + video + '.mp4'
        beat_dir = video_dir.strip('mp4') + 'npy'

        cudnn.fastest = True
        cudnn.benchmark = True
        cudnn.deterministic = False
        cudnn.enabled = True

        time_0 = time.time()
        tester = Tester(24)

        ##loading 3D pose estimation model
        tester._make_model()

        time_1 = time.time()
        print('loading integral pose model elapse:',round(time_1-time_0,2),'s')

        ##loading yolo detector
        detector = YOLOv3( model_def="3DMPPE_POSENET_RELEASE\\common\\detectors\\yolo\\config\\yolov3.cfg",
                            class_path="3DMPPE_POSENET_RELEASE\\common\\detectors\\yolo\\data\\coco.names",
                            weights_path="3DMPPE_POSENET_RELEASE\\common\\detectors\\yolo\\weights\\yolov3.weights",
                            classes=('person',),
                            max_batch_size=16,
                            device=torch.device('cuda:{}'.format(cfg.gpu_ids[0])))
        print('loading yolo elapse:',round(time.time()-time_1,2),'s')
        skeleton = ( (0, 7), (7, 8), (8, 9), (9, 10), (8, 11), (11, 12), (12, 13), (8, 14), (14, 15), (15, 16), (0, 1), (1, 2), (2, 3), (0, 4), (4, 5), (5, 6) )
        #fig = plt.figure(figsize=(10,10)) 
        transform = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize(mean=cfg.pixel_mean, std=cfg.pixel_std)]
                                )
        
        if not os.path.exists(video_dir.strip('mp4')+'wav'):
            videoclip = VideoFileClip(video_dir)
            audioclip = videoclip.audio
            audioclip.write_audiofile(video_dir.strip('mp4')+'wav')

        video = cv2.VideoCapture(video_dir)
        if not os.path.exists(beat_dir):
            time_2 = time.time()
            videoclip = VideoFileClip(video_dir)
            audioclip = videoclip.audio
            beat_activation = RNNBeatProcessor()(video_dir.strip('mp4')+'wav')
            processor = DBNBeatTrackingProcessor(fps=100)
            beats = processor(beat_activation)
            frames_at_beat = (beats/audioclip.duration*video.get(cv2.CAP_PROP_FRAME_COUNT)).astype(int)
            print('extracting beat sequence elapse:', round(time.time()-time_2, 2), 's')
            np.save(beat_dir, frames_at_beat)
        frames_at_beat = np.load(beat_dir).tolist()

        ##########################################
        dance_primitives_dir = '.\\danceprimitives_trial'
        if not os.path.exists(dance_primitives_dir):
            os.mkdir(dance_primitives_dir)
        motion_index = len(os.listdir(dance_primitives_dir))
        for i in range(len(frames_at_beat)-1):

            motion_dir = os.path.join(dance_primitives_dir, '{0}'.format(str(motion_index).zfill(5)))
            if not os.path.exists(motion_dir):
                os.mkdir(motion_dir)

            start = frames_at_beat[i]
            end = frames_at_beat[i+1]
            dance_primitive = np.empty((0, 17*3)) # for motion control
            #dance_primitive_norm = np.empty((0, 17*3)) # for motion clustering
            video.set(1, start)
            jump_flag = 0
            frame = 0
            with torch.no_grad():
                time_start = time.time()
                while True:
                    current_frame = video.get(cv2.CAP_PROP_POS_FRAMES)
                    ret_val, raw_image = video.read()
                    if current_frame == end:
                        break
                    ##using yolo to get human bounding box
                    input_img = raw_image.copy()
                    detections = detector.predict_single(input_img)
                    if detections is None or detections.size()[0] == 0:
                        jump_flag = 1
                        break
                    last_conf = 0
                    for i, (x1_pred, y1_pred, x2_pred, y2_pred, conf, cls_conf, cls_pred) in enumerate(detections):
                        if conf.item() > last_conf:
                            x1 = max(int(round(x1_pred.item())) - 40, 0)
                            x2 = min(int(round(x2_pred.item())) + 40, input_img.shape[1]-1)
                            y1 = max(int(round(y1_pred.item())) - 20, 0)
                            y2 = min(int(round(y2_pred.item())) + 20, input_img.shape[0]-1)   #for getting a larger bounding box to cover the full body, in order to get more accurate pose
                            last_conf = conf.item()
                    img_patch = (input_img[y1:y2, x1:x2, ::-1]).copy().astype(np.float32)
                    ##using ResPoseNet to get 3D human pose
                    input_patch = cv2.resize(img_patch,(cfg.input_shape))
                    input_patch = transform(input_patch).unsqueeze(0)
                    coord_out = tester.model(input_patch).cpu().numpy()  # dimension: 1 x 18 x 3, where '3' refers to x, z, y in sequence
                    #show_pose(input_patch, img_patch, coord_out, skeleton, motion_dir, frame)
                    coord_out_resize = coord_out * np.array([img_patch.shape[1]/cfg.input_shape[1], img_patch.shape[0]/cfg.input_shape[0], 1]) #transform to original scale
                    coord_out = coord_out_resize[:, :-1, :]  # drop the keypoint for "thorax"
                    #coord_out_norm = (coord_out-np.mean(coord_out, axis=1))/np.std(coord_out, axis=1)
                    dance_primitive = np.vstack((dance_primitive, np.reshape(coord_out[0], -1)))
                    #dance_primitive_norm = np.vstack((dance_primitive_norm, np.reshape(coord_out_norm[0], -1)))
                    frame += 1
                print('Processing Time Elapse:', round(time.time()-time_start,2), 's')

            if jump_flag == 1:
                continue

            #norm_sample = np.empty((0, 17*3))
            #num_sample = 10
            #print(dance_primitive_norm.shape[0])
            #sample_step = (dance_primitive_norm.shape[0]-1)/(num_sample-1)
            #for i in range(num_sample):
            #    norm_sample = np.vstack((norm_sample, dance_primitive_norm[round(i * sample_step)]))
            
            #print(norm_sample.shape)
            print(dance_primitive.shape)
            #np.save(os.path.join(motion_dir, 'dance_motion_normlized_'+ str(motion_index)), norm_sample)
            np.save(os.path.join(motion_dir, 'dance_motion_'+ str(motion_index)), dance_primitive)

            motion_index+=1



    ###########################################
    sys.exit()
    video.set(1, interval[0])
    frame=0
    next_beat = 0
    last_beat = 0
    num_beat = 0
    num_frame_between_beats = []
    with torch.no_grad():
        while True:
            time_start = time.time()
            current_frame = video.get(cv2.CAP_PROP_POS_FRAMES)
            ret_val, raw_image = video.read()
            if current_frame == interval[1]:
                break
            input_img = raw_image.copy()
            ##using yolo to get human bounding box
            detections = detector.predict_single(input_img)
            # if not detections.cpu().numpy().all():
            #     detections = (0,0,input_img.shape[1],input_img.shape[0],1,1)
            #     print('not detected')

            if detections is None:
                detections = np.array([[0,0,input_img.shape[1],input_img.shape[0],1,1,1]])
                print('not detected')
            elif detections.size()[0] == 0:
                detections = np.array([[0,0,input_img.shape[1],input_img.shape[0],1,1,1]])
                print('not detected')
            last_conf = 0
            last_last_conf = 0
            for i, (x1_pred, y1_pred, x2_pred, y2_pred, conf, cls_conf, cls_pred) in enumerate(detections):
                if conf.item() > last_conf:
                    x1 = int(round(x1_pred.item())) - 40
                    x2 = int(round(x2_pred.item())) + 40
                    y1 = int(round(y1_pred.item())) - 20
                    y2 = int(round(y2_pred.item())) + 20    #for getting a larger bounding box to cover the full body, in order to get more accurate pose
                    last_last_conf = last_conf
                    last_conf = conf.item()
                print(last_conf, last_last_conf)
                if last_last_conf != 0:
                    sys.exit()
            #print(x1, x2, y1, y2, last_conf)
            img_patch = (input_img[y1:y2, x1:x2, ::-1]).copy().astype(np.float32)
            input_patch = cv2.resize(img_patch,(cfg.input_shape))

            input_patch = transform(input_patch).unsqueeze(0)
            coord_out = tester.model(input_patch)
            print('Running model time:',round(time.time()-time_start,2),'s')

            motion['frame'][frame] = {}
            if frame+interval[0] in frames_at_beat:
                motion['frame'][frame]['next_beat'] = 0
                motion['frame'][frame]['last_beat'] = 0
                #frames_at_beat.remove(frame)
                next_beat = frames_at_beat.index(frame+interval[0]) + 1
                last_beat = frames_at_beat.index(frame+interval[0])
                num_beat += 1
                num_frame_between_beats.append(frames_at_beat[next_beat] - frames_at_beat[last_beat])
                print('Record key frame with beat:', current_frame)
            else:
                motion['frame'][frame]['next_beat'] = frames_at_beat[next_beat] - (frame+interval[0])
                motion['frame'][frame]['last_beat'] = (frame+interval[0]) - frames_at_beat[last_beat]

            coord_out = coord_out.cpu().numpy()
            coord_out_resize = coord_out * np.array([img_patch.shape[1]/cfg.input_shape[1], img_patch.shape[0]/cfg.input_shape[0], 1])

            for idx in range(coord_out_resize.shape[1]-1):
                motion['frame'][frame][idx]=(coord_out_resize[0][idx][0].item(), coord_out_resize[0][idx][2].item(), coord_out_resize[0][idx][1].item())
            
            vis = True
            vis_3d = False
            if vis:
                    tmpimg = input_patch[0].cpu().numpy()
                    tmpimg = tmpimg * np.array(cfg.pixel_std).reshape(3,1,1) + np.array(cfg.pixel_mean).reshape(3,1,1)
                    tmpimg = (tmpimg).astype(np.uint8)
                    tmpimg = tmpimg[::-1, :, :]
                    tmpimg = np.transpose(tmpimg,(1,2,0)).copy()
                    tmpkps = np.zeros((3,18))
                    tmpkps[:2,:] = coord_out[0,:,:2].transpose(1,0) / cfg.output_shape[0] * cfg.input_shape[0]
                    tmpkps[2,:] = 1
                    tmpimg = vis_keypoints(tmpimg, tmpkps, skeleton)
                    tmpimg = cv2.resize(tmpimg,(img_patch.shape[1],img_patch.shape[0]))
                    file_name = pose_save_dir+'\\{0}.png'.format(str(frame).zfill(4))
                    cv2.imwrite(file_name, tmpimg)
            if vis_3d:
                #coord_out = coord_out.cpu().numpy()
                #coord_out = coord_out * np.array([img_patch.shape[1]/cfg.input_shape[1], img_patch.shape[0]/cfg.input_shape[0], 1])
                pred=coord_out_resize.squeeze() #remove first batch dimension

                ax=plt.subplot('121',projection='3d')
                plt.axis('off')
                show3D_pose(pred,ax,skeleton,radius=40)
                file_name = pose_save_dir + '\\{0}.png'.format(str(frame).zfill(4))
                plt.savefig(file_name)
                # cv2.imwrite(file_name, tmpimg)

            frame+=1
            print('Processing Frame:',round(time.time()-time_start,2),'s')

        motion['feature']['fpb'] = np.mean(num_frame_between_beats)
        if REDU:
            motion_base[len(motion_base)-1] = motion
        else:
            motion_base[len(motion_base)] = motion
        #with open(motion_base_dir, 'w') as f:
        #    json.dump(motion_base, f)
    print('done with', num_beat + 1, 'beats! (This should be even for a normal dance)')
    print('num_frame between beats:')
    print(num_frame_between_beats)
Example #12
    def __getitem__(self, index):

        if self.multiple_db:
            db_idx = index // max([len(db) for db in self.db])

            joint_num = self.joint_num[db_idx]
            skeleton = self.skeleton[db_idx]
            lr_skeleton = self.lr_skeleton[0]
            flip_pairs = self.flip_pairs[db_idx]
            joints_have_depth = self.joints_have_depth[db_idx]

            ref_joints_name = self.joints_name[0]
            joints_name = self.joints_name[db_idx]

            item_idx = index % max([len(db)
                                    for db in self.db]) % len(self.db[db_idx])
            data = copy.deepcopy(self.db[db_idx][item_idx])

        else:
            joint_num = self.joint_num
            skeleton = self.skeleton
            lr_skeleton = self.lr_skeleton
            flip_pairs = self.flip_pairs
            joints_have_depth = self.joints_have_depth

            data = copy.deepcopy(self.db[index])

        bbox = data['bbox']
        joint_img = data['joint_img']
        joint_vis = data['joint_vis']

        # 1. load image
        cvimg = cv2.imread(data['img_path'],
                           cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
        if not isinstance(cvimg, np.ndarray):
            raise IOError("Fail to read %s" % data['img_path'])
        img_height, img_width, img_channels = cvimg.shape

        # 2. get augmentation params
        if self.do_augment:
            scale, rot, do_flip, color_scale = get_aug_config()
        else:
            scale, rot, do_flip, color_scale = 1.0, 0, False, [1.0, 1.0, 1.0]

        # 3. crop patch from img and perform data augmentation (flip, scale, rot, color scale)
        img_patch, trans = generate_patch_image(cvimg, bbox, do_flip, scale,
                                                rot)
        for i in range(img_channels):
            img_patch[:, :, i] = np.clip(img_patch[:, :, i] * color_scale[i],
                                         0, 255)

        # 4. generate patch joint ground truth
        # flip joints and apply Affine Transform on joints
        if do_flip:
            joint_img[:, 0] = img_width - joint_img[:, 0] - 1
            for pair in flip_pairs:
                joint_img[pair[0], :], joint_img[pair[1], :] = joint_img[
                    pair[1], :], joint_img[pair[0], :].copy()
                joint_vis[pair[0], :], joint_vis[pair[1], :] = joint_vis[
                    pair[1], :], joint_vis[pair[0], :].copy()

        for i in range(len(joint_img)):
            joint_img[i, 0:2] = trans_point2d(joint_img[i, 0:2], trans)
            joint_img[i, 2] /= (
                cfg.bbox_3d_shape[0] / 2. * scale
            )  # expect depth lies in -bbox_3d_shape[0]/2 ~ bbox_3d_shape[0]/2 -> -1.0 ~ 1.0
            joint_img[i, 2] = (joint_img[i, 2] + 1.0) / 2.  # 0~1 normalize
            joint_vis[i] *= (
                            (joint_img[i,0] >= 0) & \
                            (joint_img[i,0] < cfg.input_shape[1]) & \
                            (joint_img[i,1] >= 0) & \
                            (joint_img[i,1] < cfg.input_shape[0]) & \
                            (joint_img[i,2] >= 0) & \
                            (joint_img[i,2] < 1)
                            )

        vis = False
        if vis:
            filename = str(random.randrange(1, 500))
            tmpimg = img_patch.copy().astype(np.uint8)
            tmpkps = np.zeros((3, joint_num))
            tmpkps[:2, :] = joint_img[:, :2].transpose(1, 0)
            tmpkps[2, :] = joint_vis[:, 0]
            tmpimg = vis_keypoints(tmpimg, tmpkps, skeleton)
            cv2.imwrite(osp.join(cfg.vis_dir, filename + '_gt.jpg'), tmpimg)

        vis = False
        if vis:
            vis_3d_skeleton(joint_img, joint_vis, skeleton, filename)

        # change coordinates to output space
        joint_img[:, 0] = joint_img[:, 0] / cfg.input_shape[1] * cfg.output_shape[1]
        joint_img[:, 1] = joint_img[:, 1] / cfg.input_shape[0] * cfg.output_shape[0]
        joint_img[:, 2] = joint_img[:, 2] * cfg.depth_dim

        # change joint coord, vis to reference dataset. 0th db is reference dataset
        if self.multiple_db:
            joint_img = transform_joint_to_other_db(joint_img, joints_name,
                                                    ref_joints_name)
            joint_vis = transform_joint_to_other_db(joint_vis, joints_name,
                                                    ref_joints_name)

        if self.is_train:
            img_patch = self.transform(img_patch)
            joint_img = joint_img.astype(np.float32)
            joint_vis = (joint_vis > 0).astype(np.float32)
            joints_have_depth = np.array([joints_have_depth]).astype(np.float32)

            return img_patch, joint_img, joint_vis, joints_have_depth
        else:
            img_patch = self.transform(img_patch)
            return img_patch
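In the multi-dataset branch above, transform_joint_to_other_db remaps joints from the current dataset's layout to the reference (0th) dataset's layout by matching joint names. A sketch of that remapping, assuming src_name/dst_name are tuples of joint-name strings:

import numpy as np

def transform_joint_to_other_db(src_joint, src_name, dst_name):
    # Copy each source joint into the slot with the same name in the destination
    # layout; joints that do not exist in the source stay zero.
    dst_joint = np.zeros((len(dst_name), src_joint.shape[1]), dtype=np.float32)
    for src_idx, name in enumerate(src_name):
        if name in dst_name:
            dst_joint[dst_name.index(name)] = src_joint[src_idx]
    return dst_joint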
Example #13
    def evaluate(self, preds):

        print() 
        print('Evaluation start...')

        gts = self.datalist
        preds_joint_coord, preds_rel_root_depth, preds_hand_type, inv_trans = preds['joint_coord'], preds['rel_root_depth'], preds['hand_type'], preds['inv_trans']
        assert len(gts) == len(preds_joint_coord)
        sample_num = len(gts)
        
        mpjpe = [[] for _ in range(self.joint_num)] # treat right and left hands identically
        acc_hand_cls = 0
        for n in range(sample_num):
            data = gts[n]
            bbox, cam_param, joint, gt_hand_type = data['bbox'], data['cam_param'], data['joint'], data['hand_type']
            focal = cam_param['focal']
            princpt = cam_param['princpt']
            gt_joint_coord = joint['cam_coord']
            joint_valid = joint['valid']

            # restore coordinates to original space
            pred_joint_coord_img = preds_joint_coord[n].copy()
            pred_joint_coord_img[:,0] = pred_joint_coord_img[:,0]/cfg.output_hm_shape[2]*cfg.input_img_shape[1]
            pred_joint_coord_img[:,1] = pred_joint_coord_img[:,1]/cfg.output_hm_shape[1]*cfg.input_img_shape[0]
            for j in range(self.joint_num*2):
                pred_joint_coord_img[j,:2] = trans_point2d(pred_joint_coord_img[j,:2],inv_trans[n])
            pred_joint_coord_img[:,2] = (pred_joint_coord_img[:,2]/cfg.output_hm_shape[0] * 2 - 1) * (cfg.bbox_3d_size/2)
            pred_joint_coord_img[:,2] = pred_joint_coord_img[:,2] + data['abs_depth']

            # back project to camera coordinate system
            pred_joint_coord_cam = pixel2cam(pred_joint_coord_img, focal, princpt)

            # root joint alignment
            pred_joint_coord_cam[self.joint_type['right']] = pred_joint_coord_cam[self.joint_type['right']] - pred_joint_coord_cam[self.root_joint_idx['right'],None,:]
            pred_joint_coord_cam[self.joint_type['left']] = pred_joint_coord_cam[self.joint_type['left']] - pred_joint_coord_cam[self.root_joint_idx['left'],None,:]
            gt_joint_coord[self.joint_type['right']] = gt_joint_coord[self.joint_type['right']] - gt_joint_coord[self.root_joint_idx['right'],None,:]
            gt_joint_coord[self.joint_type['left']] = gt_joint_coord[self.joint_type['left']] - gt_joint_coord[self.root_joint_idx['left'],None,:]
            
            # select right or left hand using groundtruth hand type
            pred_joint_coord_cam = pred_joint_coord_cam[self.joint_type[gt_hand_type]]
            gt_joint_coord = gt_joint_coord[self.joint_type[gt_hand_type]]
            joint_valid = joint_valid[self.joint_type[gt_hand_type]]

            # mpjpe save
            for j in range(self.joint_num):
                if joint_valid[j]:
                    mpjpe[j].append(np.sqrt(np.sum((pred_joint_coord_cam[j] - gt_joint_coord[j])**2)))
            
            if gt_hand_type == 'right' and preds_hand_type[n][0] > 0.5 and preds_hand_type[n][1] < 0.5:
                acc_hand_cls += 1
            elif gt_hand_type == 'left' and preds_hand_type[n][0] < 0.5 and preds_hand_type[n][1] > 0.5:
                acc_hand_cls += 1

            vis = False
            if vis:
                img_path = data['img_path']
                cvimg = cv2.imread(img_path, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
                _img = cvimg[:,:,::-1].transpose(2,0,1)
                vis_kps = pred_joint_coord_img.copy()
                vis_valid = joint_valid.copy()
                filename = 'out_' + str(n) + '.jpg'
                vis_keypoints(_img, vis_kps, vis_valid, self.skeleton[:self.joint_num], filename)

            vis = False
            if vis:
                filename = 'out_' + str(n) + '_3d.png'
                vis_3d_keypoints(pred_joint_coord_cam, joint_valid, self.skeleton[:self.joint_num], filename)

        print('Handedness accuracy: ' + str(acc_hand_cls / sample_num))

        eval_summary = 'MPJPE for each joint: \n'
        for j in range(self.joint_num):
            mpjpe[j] = np.mean(np.stack(mpjpe[j]))
            joint_name = self.skeleton[j]['name']
            eval_summary += (joint_name + ': %.2f, ' % mpjpe[j])
        print(eval_summary)
        print('MPJPE: %.2f' % (np.mean(mpjpe)))
Example #14
def main():
    video_dir = 'dance_videos\\Danny Ocean - Baby I Wont.mp4'
    beat_dir = video_dir.strip('mp4') + 'npy'
    interval = [32, 36]  #in second
    REDU = True

    motion_base_dir = 'MyNao\\motion_base\\motion_base.json'
    if not os.path.exists(motion_base_dir):
        motion_base = {}
        with open(motion_base_dir, 'w') as f:
            json.dump(motion_base, f)
    with open(motion_base_dir, 'r') as f:
        motion_base = json.load(f)
    if REDU:
        pose_save_dir = 'MyNao\\motion_glance\\' + str(len(motion_base) - 1)
    else:
        pose_save_dir = 'MyNao\\motion_glance\\' + str(len(motion_base))
    if not os.path.exists(pose_save_dir):
        os.mkdir(pose_save_dir)

    motion = {}
    motion['feature'] = {}
    motion['feature']['bps'] = [None]
    motion['feature']['symmetric'] = False
    motion['feature']['repeat'] = True
    motion['frame'] = {}
    #args = parse_args()
    #cfg.set_args(args.gpu_ids)
    cudnn.fastest = True
    cudnn.benchmark = True
    cudnn.deterministic = False
    cudnn.enabled = True

    time_0 = time.time()
    tester = Tester(24)

    ##loading 3D pose estimation model
    tester._make_model()

    time_1 = time.time()
    print('loading integral pose model elapse:', round(time_1 - time_0, 2),
          's')

    ##loading yolo detector
    detector = YOLOv3(
        model_def="3DMPPE_POSENET_RELEASE\\common\\detectors\\yolo\\config\\yolov3.cfg",
        class_path="3DMPPE_POSENET_RELEASE\\common\\detectors\\yolo\\data\\coco.names",
        weights_path="3DMPPE_POSENET_RELEASE\\common\\detectors\\yolo\\weights\\yolov3.weights",
        classes=('person', ),
        max_batch_size=16,
        device=torch.device('cuda:{}'.format(cfg.gpu_ids[0])))
    print('loading yolo elapse:', round(time.time() - time_1, 2), 's')
    skeleton = ((0, 7), (7, 8), (8, 9), (9, 10), (8, 11), (11, 12), (12, 13),
                (8, 14), (14, 15), (15, 16), (0, 1), (1, 2), (2, 3), (0, 4),
                (4, 5), (5, 6))
    fig = plt.figure(figsize=(10, 10))
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=cfg.pixel_mean, std=cfg.pixel_std)
    ])
    ##load model

    if not os.path.exists(video_dir.strip('mp4') + 'wav'):
        videoclip = VideoFileClip(video_dir)
        audioclip = videoclip.audio
        audioclip.write_audiofile(video_dir.strip('mp4') + 'wav')

    video = cv2.VideoCapture(video_dir)
    if not os.path.exists(beat_dir):
        time_2 = time.time()
        videoclip = VideoFileClip(video_dir)
        audioclip = videoclip.audio
        beat_activation = RNNBeatProcessor()(video_dir.strip('mp4') + 'wav')
        processor = DBNBeatTrackingProcessor(fps=100)
        beats = processor(beat_activation)
        frames_at_beat = (beats / audioclip.duration *
                          video.get(cv2.CAP_PROP_FRAME_COUNT)).astype(int)
        print('extracting beat sequence elapse:',
              round(time.time() - time_2, 2), 's')
        np.save(beat_dir, frames_at_beat)
    frames_at_beat = np.load(beat_dir).tolist()

    for beat in frames_at_beat:
        if interval[0] * video.get(cv2.CAP_PROP_FPS) > beat:
            continue
        else:
            interval[0] = beat
            break
    for beat in frames_at_beat:
        if interval[1] * video.get(cv2.CAP_PROP_FPS) > beat:
            continue
        else:
            interval[1] = beat
            break

    video.set(1, interval[0])
    frame = 0
    next_beat = 0
    last_beat = 0
    num_beat = 0
    num_frame_between_beats = []
    with torch.no_grad():
        while True:
            time_start = time.time()
            current_frame = video.get(cv2.CAP_PROP_POS_FRAMES)
            ret_val, raw_image = video.read()
            if current_frame == interval[1]:
                break
            input_img = raw_image.copy()
            ##using yolo to get human bounding box
            detections = detector.predict_single(input_img)
            # if not detections.cpu().numpy().all():
            #     detections = (0,0,input_img.shape[1],input_img.shape[0],1,1)
            #     print('not detected')

            if detections is None:
                detections = np.array(
                    [[0, 0, input_img.shape[1], input_img.shape[0], 1, 1, 1]])
                print('not detected')
            elif detections.size()[0] == 0:
                detections = np.array(
                    [[0, 0, input_img.shape[1], input_img.shape[0], 1, 1, 1]])
                print('not detected')
            last_conf = 0
            last_last_conf = 0
            for i, (x1_pred, y1_pred, x2_pred, y2_pred, conf, cls_conf,
                    cls_pred) in enumerate(detections):
                if conf.item() > last_conf:
                    x1 = int(round(x1_pred.item())) - 40
                    x2 = int(round(x2_pred.item())) + 40
                    y1 = int(round(y1_pred.item())) - 20
                    y2 = int(
                        round(y2_pred.item())
                    ) + 20  #for getting a larger bounding box to cover the full body, in order to get more accurate pose
                    last_last_conf = last_conf
                    last_conf = conf.item()
                print(last_conf, last_last_conf)
                if last_last_conf != 0:
                    sys.exit()
            #print(x1, x2, y1, y2, last_conf)
            img_patch = (input_img[y1:y2,
                                   x1:x2, ::-1]).copy().astype(np.float32)
            input_patch = cv2.resize(img_patch, (cfg.input_shape))

            input_patch = transform(input_patch).unsqueeze(0)
            coord_out = tester.model(input_patch)
            print('Running model time:', round(time.time() - time_start, 2),
                  's')

            motion['frame'][frame] = {}
            if frame + interval[0] in frames_at_beat:
                motion['frame'][frame]['next_beat'] = 0
                motion['frame'][frame]['last_beat'] = 0
                #frames_at_beat.remove(frame)
                next_beat = frames_at_beat.index(frame + interval[0]) + 1
                last_beat = frames_at_beat.index(frame + interval[0])
                num_beat += 1
                num_frame_between_beats.append(frames_at_beat[next_beat] -
                                               frames_at_beat[last_beat])
                print('Record key frame with beat:', current_frame)
            else:
                motion['frame'][frame]['next_beat'] = frames_at_beat[
                    next_beat] - (frame + interval[0])
                motion['frame'][frame]['last_beat'] = (
                    frame + interval[0]) - frames_at_beat[last_beat]

            coord_out = coord_out.cpu().numpy()
            coord_out_resize = coord_out * np.array([
                img_patch.shape[1] / cfg.input_shape[1],
                img_patch.shape[0] / cfg.input_shape[0], 1
            ])

            for idx in range(coord_out_resize.shape[1] - 1):
                motion['frame'][frame][idx] = (
                    coord_out_resize[0][idx][0].item(),
                    coord_out_resize[0][idx][2].item(),
                    coord_out_resize[0][idx][1].item())

            vis = True
            vis_3d = False
            if vis:
                tmpimg = input_patch[0].cpu().numpy()
                tmpimg = tmpimg * np.array(cfg.pixel_std).reshape(
                    3, 1, 1) + np.array(cfg.pixel_mean).reshape(3, 1, 1)
                tmpimg = (tmpimg).astype(np.uint8)
                tmpimg = tmpimg[::-1, :, :]
                tmpimg = np.transpose(tmpimg, (1, 2, 0)).copy()
                tmpkps = np.zeros((3, 18))
                tmpkps[:2, :] = coord_out[0, :, :2].transpose(
                    1, 0) / cfg.output_shape[0] * cfg.input_shape[0]
                tmpkps[2, :] = 1
                tmpimg = vis_keypoints(tmpimg, tmpkps, skeleton)
                tmpimg = cv2.resize(tmpimg,
                                    (img_patch.shape[1], img_patch.shape[0]))
                file_name = pose_save_dir + '\\{0}.png'.format(
                    str(frame).zfill(4))
                cv2.imwrite(file_name, tmpimg)
            if vis_3d:
                #coord_out = coord_out.cpu().numpy()
                #coord_out = coord_out * np.array([img_patch.shape[1]/cfg.input_shape[1], img_patch.shape[0]/cfg.input_shape[0], 1])
                pred = coord_out_resize.squeeze()  # remove the batch dimension

                ax = plt.subplot(121, projection='3d')
                plt.axis('off')
                show3D_pose(pred, ax, skeleton, radius=40)
                file_name = pose_save_dir + '\\{0}.png'.format(
                    str(frame).zfill(4))
                plt.savefig(file_name)
                # cv2.imwrite(file_name, tmpimg)

            frame += 1
            print('Processing Frame:', round(time.time() - time_start, 2), 's')

        motion['feature']['fpb'] = np.mean(num_frame_between_beats)
        if REDU:
            motion_base[len(motion_base) - 1] = motion
        else:
            motion_base[len(motion_base)] = motion
        #with open(motion_base_dir, 'w') as f:
        #    json.dump(motion_base, f)
    print('done with', num_beat + 1,
          'beats! (This should be even for a normal dance)')
    print('num_frame between beats:')
    print(num_frame_between_beats)
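
The loop above assumes frames_at_beat, a list of video frame indices that fall on musical beats, has already been computed. A minimal sketch of one way to derive it, using librosa beat tracking and the clip's frame rate; the audio path, the librosa dependency, and the helper name are assumptions, not part of the original example.

import cv2
import librosa

def beats_to_frames(audio_path, video_path):
    # detect beat times (in seconds) in the audio track
    y, sr = librosa.load(audio_path, sr=None)
    _, beat_times = librosa.beat.beat_track(y=y, sr=sr, units='time')
    # convert beat times into video frame indices via the frame rate
    fps = cv2.VideoCapture(video_path).get(cv2.CAP_PROP_FPS)
    return [int(round(t * fps)) for t in beat_times]

# frames_at_beat = beats_to_frames('dance_audio.wav', 'test_sample.mp4')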
Example #15
0
    def evaluate(self, preds):

        print() 
        print('Evaluation start...')

        gts = self.datalist
        preds_joint_coord, preds_rel_root_depth, preds_hand_type, inv_trans = preds['joint_coord'], preds['rel_root_depth'], preds['hand_type'], preds['inv_trans']
        assert len(gts) == len(preds_joint_coord)
        sample_num = len(gts)
        
        mpjpe_sh = [[] for _ in range(self.joint_num*2)]
        mpjpe_ih = [[] for _ in range(self.joint_num*2)]
        mrrpe = []
        acc_hand_cls = 0; hand_cls_cnt = 0;
        for n in range(sample_num):
            data = gts[n]
            bbox, cam_param, joint, gt_hand_type, hand_type_valid = data['bbox'], data['cam_param'], data['joint'], data['hand_type'], data['hand_type_valid']
            focal = cam_param['focal']
            princpt = cam_param['princpt']
            gt_joint_coord = joint['cam_coord']
            joint_valid = joint['valid']
            
            # restore xy coordinates to original image space
            pred_joint_coord_img = preds_joint_coord[n].copy()
            pred_joint_coord_img[:,0] = pred_joint_coord_img[:,0]/cfg.output_hm_shape[2]*cfg.input_img_shape[1]
            pred_joint_coord_img[:,1] = pred_joint_coord_img[:,1]/cfg.output_hm_shape[1]*cfg.input_img_shape[0]
            for j in range(self.joint_num*2):
                pred_joint_coord_img[j,:2] = trans_point2d(pred_joint_coord_img[j,:2],inv_trans[n])
            # restore depth to original camera space
            pred_joint_coord_img[:,2] = (pred_joint_coord_img[:,2]/cfg.output_hm_shape[0] * 2 - 1) * (cfg.bbox_3d_size/2)
 
            # mrrpe
            if gt_hand_type == 'interacting' and joint_valid[self.root_joint_idx['left']] and joint_valid[self.root_joint_idx['right']]:
                pred_rel_root_depth = (preds_rel_root_depth[n]/cfg.output_root_hm_shape * 2 - 1) * (cfg.bbox_3d_size_root/2)

                pred_left_root_img = pred_joint_coord_img[self.root_joint_idx['left']].copy()
                pred_left_root_img[2] += data['abs_depth']['right'] + pred_rel_root_depth
                pred_left_root_cam = pixel2cam(pred_left_root_img[None,:], focal, princpt)[0]

                pred_right_root_img = pred_joint_coord_img[self.root_joint_idx['right']].copy()
                pred_right_root_img[2] += data['abs_depth']['right']
                pred_right_root_cam = pixel2cam(pred_right_root_img[None,:], focal, princpt)[0]
                
                pred_rel_root = pred_left_root_cam - pred_right_root_cam
                gt_rel_root = gt_joint_coord[self.root_joint_idx['left']] - gt_joint_coord[self.root_joint_idx['right']]
                mrrpe.append(float(np.sqrt(np.sum((pred_rel_root - gt_rel_root)**2))))

           
            # add root joint depth
            pred_joint_coord_img[self.joint_type['right'],2] += data['abs_depth']['right']
            pred_joint_coord_img[self.joint_type['left'],2] += data['abs_depth']['left']

            # back project to camera coordinate system
            pred_joint_coord_cam = pixel2cam(pred_joint_coord_img, focal, princpt)

            # root joint alignment
            for h in ('right', 'left'):
                pred_joint_coord_cam[self.joint_type[h]] = pred_joint_coord_cam[self.joint_type[h]] - pred_joint_coord_cam[self.root_joint_idx[h],None,:]
                gt_joint_coord[self.joint_type[h]] = gt_joint_coord[self.joint_type[h]] - gt_joint_coord[self.root_joint_idx[h],None,:]
            
            # mpjpe
            for j in range(self.joint_num*2):
                if joint_valid[j]:
                    if gt_hand_type == 'right' or gt_hand_type == 'left':
                        mpjpe_sh[j].append(np.sqrt(np.sum((pred_joint_coord_cam[j] - gt_joint_coord[j])**2)))
                    else:
                        mpjpe_ih[j].append(np.sqrt(np.sum((pred_joint_coord_cam[j] - gt_joint_coord[j])**2)))

            # handedness accuracy
            if hand_type_valid:
                if gt_hand_type == 'right' and preds_hand_type[n][0] > 0.5 and preds_hand_type[n][1] < 0.5:
                    acc_hand_cls += 1
                elif gt_hand_type == 'left' and preds_hand_type[n][0] < 0.5 and preds_hand_type[n][1] > 0.5:
                    acc_hand_cls += 1
                elif gt_hand_type == 'interacting' and preds_hand_type[n][0] > 0.5 and preds_hand_type[n][1] > 0.5:
                    acc_hand_cls += 1
                hand_cls_cnt += 1

            vis = False
            if vis:
                img_path = data['img_path']
                cvimg = cv2.imread(img_path, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
                _img = cvimg[:,:,::-1].transpose(2,0,1)
                vis_kps = pred_joint_coord_img.copy()
                vis_valid = joint_valid.copy()
                capture = str(data['capture'])
                cam = str(data['cam'])
                frame = str(data['frame'])
                filename = 'out_' + str(n) + '_' + gt_hand_type + '.jpg'
                vis_keypoints(_img, vis_kps, vis_valid, self.skeleton, filename)

            vis = False
            if vis:
                filename = 'out_' + str(n) + '_3d.jpg'
                vis_3d_keypoints(pred_joint_coord_cam, joint_valid, self.skeleton, filename)
        

        if hand_cls_cnt > 0: print('Handedness accuracy: ' + str(acc_hand_cls / hand_cls_cnt))
        if len(mrrpe) > 0: print('MRRPE: ' + str(sum(mrrpe)/len(mrrpe)))
        print()
 
        tot_err = []
        eval_summary = 'MPJPE for each joint: \n'
        for j in range(self.joint_num*2):
            tot_err_j = np.mean(np.concatenate((np.stack(mpjpe_sh[j]), np.stack(mpjpe_ih[j]))))
            joint_name = self.skeleton[j]['name']
            eval_summary += (joint_name + ': %.2f, ' % tot_err_j)
            tot_err.append(tot_err_j)
        print(eval_summary)
        print('MPJPE for all hand sequences: %.2f' % (np.mean(tot_err)))
        print()

        eval_summary = 'MPJPE for each joint: \n'
        for j in range(self.joint_num*2):
            mpjpe_sh[j] = np.mean(np.stack(mpjpe_sh[j]))
            joint_name = self.skeleton[j]['name']
            eval_summary += (joint_name + ': %.2f, ' % mpjpe_sh[j])
        print(eval_summary)
        print('MPJPE for single hand sequences: %.2f' % (np.mean(mpjpe_sh)))
        print()

        eval_summary = 'MPJPE for each joint: \n'
        for j in range(self.joint_num*2):
            mpjpe_ih[j] = np.mean(np.stack(mpjpe_ih[j]))
            joint_name = self.skeleton[j]['name']
            eval_summary += (joint_name + ': %.2f, ' % mpjpe_ih[j])
        print(eval_summary)
        print('MPJPE for interacting hand sequences: %.2f' % (np.mean(mpjpe_ih)))
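
The per-joint error above is plain Euclidean distance after root alignment. A standalone NumPy sketch of the same root-relative MPJPE computation for a single sample; the array and argument names are illustrative, not taken from the code above.

import numpy as np

def root_relative_mpjpe(pred_cam, gt_cam, root_idx=0, valid=None):
    # express both poses relative to their root joint
    pred = pred_cam - pred_cam[root_idx:root_idx + 1]
    gt = gt_cam - gt_cam[root_idx:root_idx + 1]
    # per-joint Euclidean distance, optionally masked by joint validity
    err = np.sqrt(np.sum((pred - gt) ** 2, axis=1))
    if valid is not None:
        err = err[valid.astype(bool)]
    return err.mean()

# pred_cam, gt_cam: (J, 3) arrays of joint coordinates in camera space (e.g. mm)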
Example #16
0
def main():

    args = parse_args()
    cfg.set_args(args.gpu_ids)
    cudnn.fastest = True
    cudnn.benchmark = True
    cudnn.deterministic = False
    cudnn.enabled = True

    tester = Tester(24)

    # The pretrained model is assumed to use the Human3.6M keypoint set and skeleton; change these if a different model is used.
    tester.joint_num = 18
    tester.skeleton = (
        (0, 7), (7, 8), (8, 9), (9, 10), (8, 11), (11, 12), (12, 13), (8, 14), (14, 15), (15, 16), (0, 1), (1, 2), (2, 3),
        (0, 4), (4, 5), (5, 6))
    #tester._make_batch_generator()
    ##load 3D pose estimation model
    tester._make_model()

    ##loading yolo detector
    """detector = YOLOv3( model_def="/data1/cx/project/3DMPPE_POSENET_RELEASE/common/detectors/yolo/config/yolov3.cfg",
                       class_path="/data1/cx/project/3DMPPE_POSENET_RELEASE/common/detectors/yolo/data/coco.names",
                       weights_path="/data1/cx/project/3DMPPE_POSENET_RELEASE/common/detectors/yolo/weights/yolov3.weights",
                       classes=('person',),
                       max_batch_size=16,
                       device=torch.device('cuda:{}'.format(cfg.gpu_ids[0])))"""

    preds = []
    with torch.no_grad():
        itr = 1 # iteration
        imgpath = 'D:\\Robot Dance\\AI-Project-Portfolio\\3DMPPE_POSENET_RELEASE\\image\\test.jpg'
        # imgpath = '/data1/cx/project/3DMPPE_POSENET_RELEASE/image/test.jpg'
        bbox = (0, 0, 256, 256)
        input_img = prepare_input(imgpath, bbox)
        # forward
        coord_out = tester.model(input_img)
        print('coord out shape:', coord_out.shape)

        vis = True
        if vis:
            filename = str(itr)
            tmpimg = input_img[0].cpu().numpy()
            tmpimg = tmpimg * np.array(cfg.pixel_std).reshape(3,1,1) + np.array(cfg.pixel_mean).reshape(3,1,1)
            tmpimg = tmpimg.astype(np.uint8)
            tmpimg = tmpimg[::-1, :, :]
            tmpimg = np.transpose(tmpimg,(1,2,0)).copy()
            tmpkps = np.zeros((3,tester.joint_num))
            tmpkps[:2,:] = coord_out[0,:,:2].cpu().numpy().transpose(1,0) / cfg.output_shape[0] * cfg.input_shape[0]
            tmpkps[2,:] = 1
            tmpimg = vis_keypoints(tmpimg, tmpkps, tester.skeleton)
            cv2.imwrite(filename + '_output.jpg', tmpimg)
            # cv2.imshow('img', tmpimg)
            # cv2.waitKey(0)
            # cv2.destroyAllWindows()

        coord_out = coord_out.cpu().numpy()

        vis_3d = True
        if vis_3d:
            pred = coord_out.squeeze()  # remove the batch dimension
            plot_3dkeypoints(pred, tester.skeleton)
        preds.append(coord_out)
            
    # evaluate
    preds = np.concatenate(preds, axis=0)
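
prepare_input() is called above but not defined anywhere in this excerpt. A minimal guess at what it does, based on the preprocessing the other demos in this document use; the bbox interpretation and every line here are assumptions, not the original implementation.

from config import cfg  # the repo's config module, as used by the demos above
import cv2
import numpy as np
import torchvision.transforms as transforms

def prepare_input(img_path, bbox):
    # load the image, crop the bounding box (treated here as x1, y1, x2, y2),
    # resize to the network input resolution and normalise like the demos above
    img = cv2.imread(img_path, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
    x1, y1, x2, y2 = bbox
    patch = img[y1:y2, x1:x2, ::-1].copy().astype(np.float32)
    patch = cv2.resize(patch, cfg.input_shape)
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=cfg.pixel_mean, std=cfg.pixel_std)
    ])
    return transform(patch).unsqueeze(0)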
def main():

    args = parse_args()
    cfg.set_args(args.gpu_ids)
    cudnn.fastest = True
    cudnn.benchmark = True
    cudnn.deterministic = False
    cudnn.enabled = True

    time_0 = time.time()
    tester = Tester(24)

    ##loading 3D pose estimation model
    tester._make_model()

    time_1 = time.time()
    print('loading integral pose model elapse:', round(time_1 - time_0, 2),
          's')

    ##loading yolo detector
    detector = YOLOv3(
        model_def=
        "D:\\Robot Dance\\AI-Project-Portfolio\\3DMPPE_POSENET_RELEASE\\common\\detectors\\yolo\\config\\yolov3.cfg",
        class_path=
        "D:\\Robot Dance\\AI-Project-Portfolio\\3DMPPE_POSENET_RELEASE\\common\\detectors\\yolo\\data\\coco.names",
        weights_path=
        "D:\\Robot Dance\\AI-Project-Portfolio\\3DMPPE_POSENET_RELEASE\\common\\detectors\\yolo\\weights\\yolov3.weights",
        classes=('person', ),
        max_batch_size=16,
        device=torch.device('cuda:{}'.format(cfg.gpu_ids[0])))
    print('loading yolo elapse:', round(time.time() - time_1, 2), 's')
    skeleton = ((0, 7), (7, 8), (8, 9), (9, 10), (8, 11), (11, 12), (12, 13),
                (8, 14), (14, 15), (15, 16), (0, 1), (1, 2), (2, 3), (0, 4),
                (4, 5), (5, 6))
    fig = plt.figure(figsize=(10, 10))
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=cfg.pixel_mean, std=cfg.pixel_std)
    ])
    ##load model

    preds = []
    if args.demo == 'image':

        image_path = 'D:\\Robot Dance\\AI-Project-Portfolio\\3DMPPE_POSENET_RELEASE\\image\\test_sample_20200711.jpg'
        raw_image = cv2.imread(
            image_path, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
        input_img = raw_image.copy()

        ##using yolo to get human bounding box
        detections = detector.predict_single(input_img)
        # if not detections.cpu().numpy():
        #     detections = (0,0,input_img.shape[1],input_img.shape[0],1,1)
        for i, (x1, y1, x2, y2, conf, cls_conf,
                cls_pred) in enumerate(detections):
            x1 = int(round(x1.item()))
            x2 = int(round(x2.item()))
            y1 = int(round(y1.item()))
            y2 = int(round(y2.item()))
        img_patch = (input_img[y1:y2, x1:x2, ::-1]).copy().astype(np.float32)
        input_patch = cv2.resize(img_patch, (cfg.input_shape))

        with torch.no_grad():
            # forward
            input_patch = transform(input_patch).unsqueeze(0)
            coord_out = tester.model(input_patch)

            vis = True
            itr = 3
            if vis:
                filename = str(itr)
                tmpimg = input_patch[0].cpu().numpy()
                tmpimg = tmpimg * np.array(cfg.pixel_std).reshape(
                    3, 1, 1) + np.array(cfg.pixel_mean).reshape(3, 1, 1)
                tmpimg = (tmpimg).astype(np.uint8)
                tmpimg = tmpimg[::-1, :, :]
                tmpimg = np.transpose(tmpimg, (1, 2, 0)).copy()
                tmpkps = np.zeros((3, 18))
                tmpkps[:2, :] = coord_out[0, :, :2].cpu().numpy().transpose(
                    1, 0) / cfg.output_shape[0] * cfg.input_shape[0]
                tmpkps[2, :] = 1
                tmpimg = vis_keypoints(tmpimg, tmpkps, skeleton)
                tmpimg = cv2.resize(tmpimg,
                                    (img_patch.shape[1], img_patch.shape[0]))
                cv2.imwrite(filename + '_output.jpg', tmpimg)
            # print(tmpkps)

            coord_out = coord_out.cpu().numpy()
            coord_out = coord_out * np.array([
                img_patch.shape[1] / cfg.input_shape[1],
                img_patch.shape[0] / cfg.input_shape[0], 1
            ])
            #coord_out = (coord_out - np.mean(coord_out, axis = 1))/np.std(coord_out, axis = 1)
            coord_out = coord_out[:, :-1, :]
            print(coord_out)
            print(coord_out.shape)
            np.save('coord_out_0711.npy', coord_out)
            vis_3d = True
            if vis_3d:
                pred = coord_out.squeeze()  #remove first batch dimension
                plot_3dkeypoints(pred, skeleton)
            motion_frame = {}
            for idx in range(coord_out.shape[1] - 1):
                motion_frame[idx] = (coord_out[0][idx][0].item(),
                                     coord_out[0][idx][2].item(),
                                     coord_out[0][idx][1].item())
        with open('motionFrame.json', 'w') as f:
            json.dump(motion_frame, f)
    elif args.demo == 'video':
        video = cv2.VideoCapture(
            'D:\\Robot Dance\\AI-Project-Portfolio\\dance_videos\\test_sample.mp4'
        )
        ret_val, image = video.read()
        motion = {}
        frame = 0
        with torch.no_grad():
            while True:
                time_start = time.time()
                ret_val, raw_image = video.read()
                if not ret_val:
                    break
                input_img = raw_image.copy()
                ##using yolo to get human bounding box
                detections = detector.predict_single(input_img)
                # if not detections.cpu().numpy().all():
                #     detections = (0,0,input_img.shape[1],input_img.shape[0],1,1)
                #     print('not detected')

                if detections is None:
                    detections = np.array([[
                        0, 0, input_img.shape[1], input_img.shape[0], 1, 1, 1
                    ]])
                    print('not detected')
                elif detections.size()[0] == 0:
                    detections = np.array([[
                        0, 0, input_img.shape[1], input_img.shape[0], 1, 1, 1
                    ]])
                    print('not detected')
                for i, (x1, y1, x2, y2, conf, cls_conf,
                        cls_pred) in enumerate(detections):
                    x1 = int(round(x1.item()))
                    x2 = int(round(x2.item()))
                    y1 = int(round(y1.item()))
                    y2 = int(round(y2.item()))
                img_patch = (input_img[y1:y2,
                                       x1:x2, ::-1]).copy().astype(np.float32)
                input_patch = cv2.resize(img_patch, (cfg.input_shape))

                input_patch = transform(input_patch).unsqueeze(0)
                coord_out = tester.model(input_patch)
                print('Running model time:', round(time.time() - time_start,
                                                   2), 's')

                motion[frame] = {}
                for idx in range(coord_out.shape[1] - 1):
                    motion[frame][idx] = (coord_out[0][idx][0].item(),
                                          coord_out[0][idx][2].item(),
                                          coord_out[0][idx][1].item())

                vis = True
                vis_3d = False

                if vis:
                    tmpimg = input_patch[0].cpu().numpy()
                    tmpimg = tmpimg * np.array(cfg.pixel_std).reshape(
                        3, 1, 1) + np.array(cfg.pixel_mean).reshape(3, 1, 1)
                    tmpimg = (tmpimg).astype(np.uint8)
                    tmpimg = tmpimg[::-1, :, :]
                    tmpimg = np.transpose(tmpimg, (1, 2, 0)).copy()
                    tmpkps = np.zeros((3, 18))
                    tmpkps[:2, :] = coord_out[
                        0, :, :2].cpu().numpy().transpose(
                            1, 0) / cfg.output_shape[0] * cfg.input_shape[0]
                    tmpkps[2, :] = 1
                    tmpimg = vis_keypoints(tmpimg, tmpkps, skeleton)
                    tmpimg = cv2.resize(
                        tmpimg, (img_patch.shape[1], img_patch.shape[0]))
                    file_name = '../demo_result/{0}.png'.format(
                        str(frame).zfill(4))
                    cv2.imwrite(file_name, tmpimg)
                    frame += 1
                if vis_3d:
                    coord_out = coord_out.cpu().numpy()
                    coord_out = coord_out * np.array([
                        img_patch.shape[1] / cfg.input_shape[1],
                        img_patch.shape[0] / cfg.input_shape[0], 1
                    ])
                    pred = coord_out.squeeze()  #remove first batch dimension

                    ax = plt.subplot(121, projection='3d')
                    plt.axis('off')
                    show3D_pose(pred, ax, skeleton, radius=40)

                    # plot_3dkeypoints(pred,skeleton)
                    file_name = '../test_sample_result/{0}.png'.format(
                        str(frame).zfill(4))
                    plt.savefig(file_name)
                    # cv2.imwrite(file_name, tmpimg)
                    frame += 1
                print('Processing Frame:', round(time.time() - time_start, 2),
                      's')

            with open('motionTest.json', 'w') as f:
                json.dump(motion, f)
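
The video branch above writes one annotated PNG per frame into ../demo_result. A minimal sketch of stitching those images back into a clip with OpenCV; the output filename and fps value are assumptions.

import glob
import cv2

frame_paths = sorted(glob.glob('../demo_result/*.png'))
first = cv2.imread(frame_paths[0])
h, w = first.shape[:2]
writer = cv2.VideoWriter('../demo_result/pose_demo.mp4',
                         cv2.VideoWriter_fourcc(*'mp4v'), 30.0, (w, h))
for path in frame_paths:
    img = cv2.imread(path)
    # frames were resized per-bbox, so force them to the writer's size
    if img.shape[:2] != (h, w):
        img = cv2.resize(img, (w, h))
    writer.write(img)
writer.release()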
Example #18
0
def main():

    args = parse_args()
    cfg.set_args(args.gpu_ids)
    cudnn.fastest = True
    cudnn.benchmark = True
    cudnn.deterministic = False
    cudnn.enabled = True

    tester = Tester(args.test_epoch)
    tester._make_batch_generator()
    tester._make_model()

    preds = []
    tmpp = 0  # batch counter so the loop can stop early for a quick visual check
    with torch.no_grad():
        for itr, input_img in enumerate(tqdm(tester.batch_generator)):

            # forward
            coord_out = tester.model(input_img)

            if cfg.flip_test:
                flipped_input_img = flip(input_img, dims=3)
                flipped_coord_out = tester.model(flipped_input_img)
                flipped_coord_out[:, :, 0] = cfg.output_shape[
                    1] - flipped_coord_out[:, :, 0] - 1
                for pair in tester.flip_pairs:
                    flipped_coord_out[:, pair[0], :], flipped_coord_out[:, pair[
                        1], :] = flipped_coord_out[:, pair[1], :].clone(
                        ), flipped_coord_out[:, pair[0], :].clone()
                coord_out = (coord_out + flipped_coord_out) / 2.

            vis = True
            if vis:
                filename = str(itr)
                tmpimg = input_img[0].cpu().numpy()
                tmpimg = tmpimg * np.array(cfg.pixel_std).reshape(
                    3, 1, 1) + np.array(cfg.pixel_mean).reshape(3, 1, 1)
                tmpimg = tmpimg.astype(np.uint8)
                tmpimg = tmpimg[::-1, :, :]
                tmpimg = np.transpose(tmpimg, (1, 2, 0)).copy()
                tmpkps = np.zeros((3, tester.joint_num))
                tmpkps[:2, :] = coord_out[0, :, :2].cpu().numpy().transpose(
                    1, 0) / cfg.output_shape[0] * cfg.input_shape[0]
                tmpkps[2, :] = 1
                tmpimg = vis_keypoints(tmpimg, tmpkps, tester.skeleton)
                # print(tmpkps)
                cv2.imwrite(filename + '_output.jpg', tmpimg)

            coord_out = coord_out.cpu().numpy()
            preds.append(coord_out)
            tmpp += 1
            if tmpp == 5: break
    # evaluate
    preds = np.concatenate(preds, axis=0)
    # print(preds.shape)

    # The Tester builds the test set in common/base.py,
    # then calls that dataset's evaluate() (e.g. data/MSCOCO/MSCOCO.py),
    # which visualises the keypoints with the helpers in common/utils/vis.py.

    tester._evaluate(preds, cfg.result_dir)
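
The flip-test block above mirrors the input, runs the model again, un-mirrors the x coordinates, swaps the left/right joint pairs, and averages the two predictions. A sketch of the same idea factored into a helper; flip(), the model, flip_pairs and cfg.output_shape are the objects used in the example, while the helper name itself is illustrative.

def flip_test_average(model, input_img, flip_pairs, out_width):
    # flip() is the tensor-flip utility already imported by the example above
    coord = model(input_img)
    flipped = model(flip(input_img, dims=3))
    # mirror the x coordinate back into the original frame
    flipped[:, :, 0] = out_width - flipped[:, :, 0] - 1
    # swap symmetric (left/right) joints
    for a, b in flip_pairs:
        flipped[:, a, :], flipped[:, b, :] = (flipped[:, b, :].clone(),
                                              flipped[:, a, :].clone())
    # average the original and the un-flipped predictions
    return (coord + flipped) / 2.

# usage, mirroring the block in the example above:
# coord_out = flip_test_average(tester.model, input_img,
#                               tester.flip_pairs, cfg.output_shape[1])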