Example #1
def _get_image_blob(roidb, scale_inds):
    """Builds an input blob from the images in the roidb at the specified
  scales.
  """
    num_images = len(roidb)

    processed_ims = []
    im_scales = []
    for i in range(num_images):
        #im = cv2.imread(roidb[i]['image'])
        im = imread(roidb[i]['image'])

        if len(im.shape) == 2:
            im = im[:, :, np.newaxis]
            im = np.concatenate((im, im, im), axis=2)
        # flip the channels from RGB to BGR, since the original code used
        # cv2.imread, which loads images in BGR order
        im = im[:, :, ::-1]

        if roidb[i]['flipped']:
            im = im[:, ::-1, :]
        target_size = cfg.TRAIN.SCALES[scale_inds[i]]
        im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS, target_size,
                                        cfg.TRAIN.MAX_SIZE)
        im_scales.append(im_scale)
        processed_ims.append(im)

    # Create a blob to hold the input images
    blob = im_list_to_blob(processed_ims)

    return blob, im_scales
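All of these examples lean on prep_im_for_blob from the py-faster-rcnn lineage. For reference, a minimal sketch of what that helper does, assuming the (im, pixel_means, target_size, max_size) signature used above; treat it as an illustration, not the canonical implementation:

import cv2
import numpy as np

def prep_im_for_blob_sketch(im, pixel_means, target_size, max_size):
    """Mean-subtract and rescale an image so its shorter side equals
    target_size, capping the longer side at max_size (a sketch)."""
    im = im.astype(np.float32, copy=False)
    im -= pixel_means  # subtract per-channel (BGR) means
    im_size_min = np.min(im.shape[0:2])
    im_size_max = np.max(im.shape[0:2])
    im_scale = float(target_size) / float(im_size_min)
    # keep the longer side from exceeding max_size
    if max_size > 0 and np.round(im_scale * im_size_max) > max_size:
        im_scale = float(max_size) / float(im_size_max)
    im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale,
                    interpolation=cv2.INTER_LINEAR)
    return im, im_scale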
Example #2
def roi_data(image, target_size=240):
    """ Prepare the input of faster rcnn for detecting objects """
    # flip the channels from RGB to BGR, since the original code used cv2,
    # which loads images in BGR order
    # image = image[:, :, ::-1]

    # Pixel mean values (BGR order) as a (1, 1, 3) array
    # We use the same pixel mean for all networks even though it's not exactly what
    # they were trained with
    pixel_means = np.array([[[102.9801, 115.9465, 122.7717]]])
    image, im_scale = prep_im_for_blob(im=image,
                                       pixel_means=pixel_means,
                                       target_size=target_size,
                                       max_size=0,
                                       normalize=None)
    im_info = np.array([image.shape[0], image.shape[1], im_scale],
                       dtype=np.float32)

    # numpy to tensor
    image = torch.Tensor(image.astype(np.float32))
    image = image.permute(2, 0, 1)
    im_info = torch.Tensor(im_info)
    gt_boxes = torch.ones(5, dtype=torch.float32)
    num_boxes = torch.LongTensor([0])

    image.requires_grad = False
    im_info.requires_grad = False
    gt_boxes.requires_grad = False
    num_boxes.requires_grad = False

    return image, im_info, gt_boxes, num_boxes
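A hypothetical usage of roi_data: the fasterRCNN model name and its forward signature below are assumptions for illustration, not part of the snippet above.

from imageio import imread

image = imread('demo.jpg')  # RGB uint8 array of shape (H, W, 3)
im_data, im_info, gt_boxes, num_boxes = roi_data(image)

# add the batch dimension a detector typically expects
im_data = im_data.unsqueeze(0)    # [1, 3, H, W]
im_info = im_info.unsqueeze(0)    # [1, 3]
gt_boxes = gt_boxes.unsqueeze(0)  # [1, 5]

# rois, cls_prob, bbox_pred, *_ = fasterRCNN(im_data, im_info, gt_boxes, num_boxes)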
Example #3
def _get_image_blob(roidb, scale_inds):
    """Builds an input blob from the images in the roidb at the specified
    scales.
    """
    num_images = len(roidb)  # num_images = 1
    processed_ims = []
    im_scales = []
    for i in range(num_images):
        im = imread(roidb[i]['image'])
        if len(im.shape) == 2:
            im = im[:, :, np.newaxis]
            im = np.concatenate((im, im, im), axis=2)
        # flip the channels from RGB to BGR, since the original code used
        # cv2.imread, which loads images in BGR order
        im = im[:, :, ::-1]
        if roidb[i]['flipped']:
            im = im[:, ::-1, :]  # horizontally flip the image
        target_size = cfg.TRAIN.SCALES[scale_inds[i]]
        im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS, target_size,
                                        cfg.TRAIN.MAX_SIZE)
        # im_scale = target_size / float(im_size_min): the factor that scales
        # the original image's shorter side up to the training size (e.g. 600)
        im_scales.append(im_scale)
        processed_ims.append(im)

    # Create a blob to hold the input images
    blob = im_list_to_blob(processed_ims)
    # return the blob with shape [1, h, w, c]; im_scales holds each image's
    # resize factor
    return blob, im_scales
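im_list_to_blob, used throughout these examples, pads the processed images to a common size and stacks them into a single 4-D array. A minimal sketch matching the [N, H, W, C] layout described in the comment above (again an illustration, not the canonical code):

import numpy as np

def im_list_to_blob_sketch(ims):
    """Stack a list of HxWx3 float images into one [N, H_max, W_max, 3] blob,
    zero-padding smaller images at the bottom/right."""
    max_shape = np.array([im.shape for im in ims]).max(axis=0)
    blob = np.zeros((len(ims), max_shape[0], max_shape[1], 3),
                    dtype=np.float32)
    for i, im in enumerate(ims):
        blob[i, 0:im.shape[0], 0:im.shape[1], :] = im
    return blob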
Example #4
def get_evaluate_batch(self, im_path, index):
    # Sample random scales to use for each image in this batch

    # Get the input image blob, formatted for caffe
    # im_blob, im_scales = _get_image_blob(roidb, random_scale_inds)
    im = imread(im_path)

    if len(im.shape) == 2:
        im = im[:, :, np.newaxis]
        im = np.concatenate((im, im, im), axis=2)
    # flip the channels from RGB to BGR, since the original code used
    # cv2.imread, which loads images in BGR order
    im = im[:, :, ::-1]

    target_size = cfg.TRAIN.SCALES[0]
    im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS, target_size,
                                    cfg.TRAIN.MAX_SIZE)
    im_blob = im_list_to_blob([im])
    blobs = {'data': im_blob}

    # gt boxes: (x1, y1, x2, y2, cls)
    gt_boxes = np.empty((0, 5), dtype=np.float32)
    blobs['gt_boxes'] = gt_boxes
    blobs['im_info'] = np.array([[im.shape[0], im.shape[1], im_scale]],
                                dtype=np.float32)
    blobs['img_id'] = index
    return blobs
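im_info carries [scaled_height, scaled_width, im_scale], and im_scale is what maps detections back to the original image. A small sketch of that common postprocessing step (an assumption about downstream use, not part of the snippet above):

import numpy as np

def boxes_to_original_scale(pred_boxes, im_info):
    """Map (x1, y1, x2, y2) boxes predicted on the resized image back to
    original-image coordinates by undoing the resize factor."""
    im_scale = im_info[2]
    return pred_boxes / im_scale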
Example #5
def prepare_im_func(prefix, random_idx, frame_idx, flipped):
    frame_path = os.path.join(prefix,
                              'image_' + str(frame_idx).zfill(5) + '.jpg')
    frame = cv2.imread(frame_path)
    # process the boundary frame
    if frame is None:
        frames = sorted(os.listdir(prefix))
        frame_path = os.path.join(prefix, frames[-1])
        frame = cv2.imread(frame_path)

    frame = prep_im_for_blob(frame, cfg.PIXEL_MEANS,
                             tuple(cfg.TRAIN.FRAME_SIZE[::-1]),
                             cfg.TRAIN.CROP_SIZE, random_idx)

    if flipped:
        frame = frame[:, ::-1, :]

    if DEBUG:
        cv2.imshow('frame', frame / 255.0)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    return frame
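Note that this example passes five arguments, so it relies on a video-oriented variant of prep_im_for_blob (as in R-C3D-style code) rather than the four-argument image version above. A rough sketch of what such a variant plausibly does, assuming target_size is a (width, height) tuple and random_idx is an (x, y) crop offset; treat this as an assumption, not the library's actual code:

import cv2
import numpy as np

def prep_im_for_blob_video_sketch(im, pixel_means, target_size, crop_size,
                                  random_idx):
    """Mean-subtract, resize the frame to target_size (w, h), then take a
    crop_size x crop_size crop at offset (x, y) = random_idx (a sketch)."""
    im = im.astype(np.float32, copy=False)
    im -= pixel_means
    im = cv2.resize(im, target_size, interpolation=cv2.INTER_LINEAR)
    x, y = random_idx
    return im[y:y + crop_size, x:x + crop_size, :]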
Example #6
def _get_video_blob(roidb, scale_inds, phase='train'):
    # e.g. called as ([{'gt_classes': array([18.]), 'bg_name': ...}], [0], 'train')
    """Builds an input blob from the videos in the roidb at the specified
    scales.
    """
    processed_videos = []

    for i, item in enumerate(roidb):
        # item is a dict like {'gt_classes': array([18.]), 'bg_name': ...}
        # just one scale is implemented
        video_length = cfg.TRAIN.LENGTH[scale_inds[0]]  # video_length = 512
        video = np.zeros((video_length, cfg.TRAIN.CROP_SIZE,
                          cfg.TRAIN.CROP_SIZE, 3))  # (512, 112, 112, 3)
        j = 0

        if phase == 'train':
            # random crop offsets, e.g. [np.random.randint(59), np.random.randint(16)]
            random_idx = [
                np.random.randint(cfg.TRAIN.FRAME_SIZE[1] - cfg.TRAIN.CROP_SIZE),
                np.random.randint(cfg.TRAIN.FRAME_SIZE[0] - cfg.TRAIN.CROP_SIZE)
            ]
            # TODO: data augmentation
            #image_w, image_h, crop_w, crop_h = cfg.TRAIN.FRAME_SIZE[1], cfg.TRAIN.FRAME_SIZE[0], cfg.TRAIN.CROP_SIZE, cfg.TRAIN.CROP_SIZE
            #offsets = GroupMultiScaleCrop.fill_fix_offset(False, image_w, image_h, crop_w, crop_h)
            #random_idx = offsets[ npr.choice(len(offsets)) ]
        else:
            random_idx = [
                int((cfg.TRAIN.FRAME_SIZE[1] - cfg.TRAIN.CROP_SIZE) / 2),
                int((cfg.TRAIN.FRAME_SIZE[0] - cfg.TRAIN.CROP_SIZE) / 2)
            ]

        if DEBUG:
            # note: 'offsets' only exists if the commented-out block above is enabled
            print("random_idx: {}".format(random_idx))

        video_info = item['frames'][0]  # e.g. [0, 1317, 2085, 1]
        step = video_info[3] if cfg.INPUT == 'video' else 1  # step = 1
        # absolute path of the directory holding this video's frames
        prefix = item['fg_name'] if video_info[0] else item['bg_name']

        if cfg.TEMP_SPARSE_SAMPLING:
            if phase == 'train':
                segment_offsets = npr.randint(
                    step, size=len(range(video_info[1], video_info[2], step)))
            else:
                segment_offsets = np.zeros(
                    len(range(video_info[1], video_info[2], step))) + step // 2
        else:  # this is the branch taken (TEMP_SPARSE_SAMPLING is False)
            # one zero offset per frame of the temporal segment
            segment_offsets = np.zeros(
                len(range(video_info[1], video_info[2], step)))

        # iterate over the frame indices within this temporal segment
        for i, idx in enumerate(range(video_info[1], video_info[2], step)):
            frame_idx = int(segment_offsets[i] + idx + 1)  # 1-based frame index
            # absolute path of this frame's image file
            frame_path = os.path.join(
                prefix, 'image_' + str(frame_idx).zfill(5) + '.jpg')
            frame = cv2.imread(frame_path)  # read the frame image
            # process the boundary frame: if it is missing, fall back to the
            # last frame in the directory (filenames sort in ascending order)
            if frame is None:
                frames = sorted(os.listdir(prefix))
                frame_path = os.path.join(prefix, frames[-1])
                frame = cv2.imread(frame_path)
            frame = prep_im_for_blob(frame, cfg.PIXEL_MEANS,
                                     tuple(cfg.TRAIN.FRAME_SIZE[::-1]),
                                     cfg.TRAIN.CROP_SIZE, random_idx)
            if item['flipped']:  # skipped here (flipped=False)
                frame = frame[:, ::-1, :]

            if DEBUG:  # skipped here (DEBUG=False)
                cv2.imshow('frame', frame / 255.0)
                cv2.waitKey(0)
                cv2.destroyAllWindows()

            # store the frame; so far video holds
            # [video_info[2] - video_info[1], 112, 112, 3] valid frames
            video[j] = frame
            j = j + 1

        # if fewer than video_length frames were read, repeat the last frame
        # to pad video up to its final shape [512, 112, 112, 3]
        video[j:video_length] = video[j - 1]

        # append each video to form the batch; here roidb holds a single item,
        # so processed_videos has shape [1, 512, 112, 112, 3]
        processed_videos.append(video)

    # Create a blob to hold the input images, dimension trans CLHW
    blob = video_list_to_blob(processed_videos)

    return blob  # blob shape: [batch_size, 3, 512, 112, 112]
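Finally, video_list_to_blob stacks the per-video arrays and moves the channel axis ahead of the temporal axis to produce the [N, C, L, H, W] blob mentioned in the comments. A minimal sketch under those assumptions:

import numpy as np

def video_list_to_blob_sketch(videos):
    """Stack [L, H, W, C] videos into one array, then transpose to
    [N, C, L, H, W] for a 3-D ConvNet (a sketch, not the canonical code)."""
    blob = np.stack(videos).astype(np.float32)  # [N, L, H, W, C]
    return blob.transpose(0, 4, 1, 2, 3)        # [N, C, L, H, W]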