Example #1
def get_next_image_crops(images, labels, dd, noisy_box, mirrored, real_motion, network_outs):
    if network_outs is not None:
        # Network outputs are scaled by 10 (see the label scaling below); undo that
        # and map the prediction from crop coordinates back to image coordinates.
        xyxy_pred = network_outs.squeeze() / 10
        output_box = bb_util.from_crop_coordinate_system(xyxy_pred, noisy_box, CROP_PAD, 1)
        bbox_prev = output_box
    elif dd == 0:
        bbox_prev = labels[dd]
    else:
        bbox_prev = labels[dd - 1]

    bbox_on = labels[dd]
    if dd == 0:
        noisy_box = bbox_on.copy()
    elif not real_motion and network_outs is None:
        noisy_box = add_noise(bbox_on, bbox_on, images[0].shape[1], images[0].shape[0])
    else:
        noisy_box = fix_bbox_intersection(bbox_prev, bbox_on)

    image0 = im_util.get_cropped_input(images[max(dd - 1, 0)], bbox_prev, CROP_PAD, CROP_SIZE)[0]

    image1 = im_util.get_cropped_input(images[dd], noisy_box, CROP_PAD, CROP_SIZE)[0]

    shifted_bbox = bb_util.to_crop_coordinate_system(bbox_on, noisy_box, CROP_PAD, 1)
    shifted_bbox_xywh = bb_util.xyxy_to_xywh(shifted_bbox)
    xywh_labels = shifted_bbox_xywh
    xyxy_labels = bb_util.xywh_to_xyxy(xywh_labels) * 10
    return image0, image1, xyxy_labels, noisy_box
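A minimal driver sketch for the helper above: it replays a labeled sequence and feeds each prediction back in through network_outs. The run_network call is a hypothetical stand-in for the forward pass on the crop pair; images, labels, and the surrounding bb_util/im_util/CROP_* names are assumed to be in scope exactly as in the example.

network_outs = None
noisy_box = None
for dd in range(len(images)):
    image0, image1, xyxy_labels, noisy_box = get_next_image_crops(
        images, labels, dd, noisy_box, mirrored=False,
        real_motion=False, network_outs=network_outs)
    # run_network is a hypothetical stand-in for a forward pass on the crop pair;
    # its raw output (scaled by 10, in crop coordinates) is fed back in next iteration.
    network_outs = run_network(image0, image1)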
Example #2
    def returnConvLayers(self, bbox, image, starting_boxes=None):
        start_time = time.time()

        if isinstance(image, str):
            # OpenCV reads BGR; reverse the channel axis to get RGB.
            image = cv2.imread(image)[:, :, ::-1]
        else:
            image = image.copy()

        image_read_time = time.time() - start_time


        croppedInput0, pastBBoxPadded = im_util.get_cropped_input(
            image, bbox, CROP_PAD, CROP_SIZE)
        # Debug visualization leftover; needs a cv2.waitKey call to actually render.
        # cv2.imshow('', croppedInput0)
        input = np.tile(croppedInput0[np.newaxis, ...], (2, 1, 1, 1))
        feed_dict = {
            self.imagePlaceholder: input,
        }

        convFeatures = self.sess.run([self.conv_layers], feed_dict=feed_dict)
        return convFeatures
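A hedged usage sketch for returnConvLayers, assuming the method lives on a tracker object whose constructor has already built self.sess, self.imagePlaceholder, and self.conv_layers; the Re3Tracker name and the file path below are illustrative only.

tracker = Re3Tracker()  # hypothetical constructor that builds the graph and session
bbox = [50, 50, 200, 200]  # (x1, y1, x2, y2) in image coordinates
conv_features = tracker.returnConvLayers(bbox, 'frame_0001.jpg')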
Example #3
    def track(self, unique_id, image, starting_box=None):
        start_time = time.time()

        if isinstance(image, str):
            image = cv2.imread(image)[:, :, ::-1]
        else:
            image = image.copy()

        image_read_time = time.time() - start_time

        if starting_box is not None:
            lstmState = [np.zeros((1, LSTM_SIZE)) for _ in range(4)]
            pastBBox = np.array(starting_box)  # Convert to a numpy array (copying) so the caller's box is not shared.
            prevImage = image
            originalFeatures = None
            forwardCount = 0
        elif unique_id in self.tracked_data:
            lstmState, pastBBox, prevImage, originalFeatures, forwardCount = self.tracked_data[unique_id]
        else:
            raise Exception('Unique_id %s with no initial bounding box' % unique_id)

        self._profiler.start(self._re3_crop_profiler)
        croppedInput0, pastBBoxPadded = im_util.get_cropped_input(prevImage, pastBBox, CROP_PAD, CROP_SIZE)
        croppedInput1, _ = im_util.get_cropped_input(image, pastBBox, CROP_PAD, CROP_SIZE)
        self._profiler.stop(self._re3_crop_profiler)

        feed_dict = {
            self.imagePlaceholder: [croppedInput0, croppedInput1],
            self.prevLstmState: lstmState,
            self.batch_size: 1,
        }
        self._profiler.start(self._re3_sess_profiler)
        rawOutput, s1, s2 = self.sess.run([self.outputs, self.state1, self.state2], feed_dict=feed_dict)
        lstmState = [s1[0], s1[1], s2[0], s2[1]]
        self._profiler.stop(self._re3_sess_profiler)
        if forwardCount == 0:
            originalFeatures = [s1[0], s1[1], s2[0], s2[1]]

        # prevImage = image

        # Shift output box to full image coordinate system.
        outputBox = bb_util.from_crop_coordinate_system(rawOutput.squeeze() / 10.0, pastBBoxPadded, 1, 1)

        if forwardCount > 0 and forwardCount % MAX_TRACK_LENGTH == 0:
            self._profiler.start(self._re3_crop2_profiler)
            croppedInput, _ = im_util.get_cropped_input(image, outputBox, CROP_PAD, CROP_SIZE)
            input = np.tile(croppedInput[np.newaxis, ...], (2, 1, 1, 1))
            self._profiler.stop(self._re3_crop2_profiler)
            feed_dict = {
                self.imagePlaceholder: input,
                self.prevLstmState: originalFeatures,
                self.batch_size: 1,
            }
            self._profiler.start(self._re3_sess2_profiler)
            rawOutput, s1, s2 = self.sess.run([self.outputs, self.state1, self.state2], feed_dict=feed_dict)
            self._profiler.stop(self._re3_sess2_profiler)
            lstmState = [s1[0], s1[1], s2[0], s2[1]]

        forwardCount += 1
        self.total_forward_count += 1

        if starting_box is not None:
            # Use label if it's given
            outputBox = np.array(starting_box)

        self.tracked_data[unique_id] = (lstmState, outputBox, image, originalFeatures, forwardCount)
        end_time = time.time()
        if self.total_forward_count > 0:
            self.time += (end_time - start_time - image_read_time)
        if SPEED_OUTPUT and self.total_forward_count % 100 == 0:
            print('Current tracking speed:   %.3f FPS' % (1 / (end_time - start_time - image_read_time)))
            print('Current image read speed: %.3f FPS' % (1 / (image_read_time)))
            print('Mean tracking speed:      %.3f FPS\n' % (self.total_forward_count / max(.00001, self.time)))
        return outputBox
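A per-frame usage sketch for track(): the first call supplies starting_box, and later calls pass only the unique id so the stored state in tracked_data is reused. The Re3Tracker name and frame paths are assumptions.

tracker = Re3Tracker()  # hypothetical constructor
frame_paths = ['0001.jpg', '0002.jpg', '0003.jpg']
# First frame: register the object with its initial (x1, y1, x2, y2) box.
box = tracker.track('obj0', frame_paths[0], starting_box=[10, 20, 110, 140])
for path in frame_paths[1:]:
    box = tracker.track('obj0', path)  # predicted xyxy location in this frame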
Example #4
    def multi_track(self, unique_ids, image, starting_boxes=None):
        start_time = time.time()
        assert isinstance(unique_ids, list), 'unique_ids must be a list for multi_track'
        assert len(unique_ids) > 1, 'unique_ids must contain at least 2 elements; use track() for a single object'

        if isinstance(image, str):
            image = cv2.imread(image)[:, :, ::-1]
        else:
            image = image.copy()

        image_read_time = time.time() - start_time

        # Get inputs for each track.
        images = []
        lstmStates = [[] for _ in range(4)]
        pastBBoxesPadded = []
        if starting_boxes is None:
            starting_boxes = dict()
        for unique_id in unique_ids:
            if unique_id in starting_boxes:
                lstmState = [np.zeros((1, LSTM_SIZE)) for _ in range(4)]
                # Convert to a numpy array (copying) so the caller's box is not shared.
                pastBBox = np.array(starting_boxes[unique_id])
                prevImage = image
                originalFeatures = None
                forwardCount = 0
                self.tracked_data[unique_id] = (lstmState, pastBBox, image, originalFeatures, forwardCount)
            elif unique_id in self.tracked_data:
                lstmState, pastBBox, prevImage, originalFeatures, forwardCount = self.tracked_data[unique_id]
            else:
                raise Exception('Unique_id %s with no initial bounding box' % unique_id)

            self._profiler.start(self._re3_crop_profiler)
            croppedInput0, pastBBoxPadded = im_util.get_cropped_input(prevImage, pastBBox, CROP_PAD, CROP_SIZE)
            croppedInput1, _ = im_util.get_cropped_input(image, pastBBox, CROP_PAD, CROP_SIZE)
            self._profiler.stop(self._re3_crop_profiler)
            pastBBoxesPadded.append(pastBBoxPadded)
            images.extend([croppedInput0, croppedInput1])
            for ss, state in enumerate(lstmState):
                lstmStates[ss].append(state.squeeze())

        lstmStateArrays = []
        for state in lstmStates:
            lstmStateArrays.append(np.array(state))

        feed_dict = {
            self.imagePlaceholder: images,
            self.prevLstmState: lstmStateArrays,
            self.batch_size: len(images) // 2
        }
        self._profiler.start(self._re3_sess_profiler)
        rawOutput, s1, s2 = self.sess.run([self.outputs, self.state1, self.state2], feed_dict=feed_dict)
        self._profiler.stop(self._re3_sess_profiler)
        outputBoxes = np.zeros((len(unique_ids), 4))
        for uu, unique_id in enumerate(unique_ids):
            lstmState, pastBBox, prevImage, originalFeatures, forwardCount = self.tracked_data[unique_id]
            lstmState = [s1[0][[uu], :], s1[1][[uu], :], s2[0][[uu], :], s2[1][[uu], :]]
            if forwardCount == 0:
                originalFeatures = [s1[0][[uu], :], s1[1][[uu], :], s2[0][[uu], :], s2[1][[uu], :]]

            # prevImage = image

            # Shift output box to full image coordinate system.
            pastBBoxPadded = pastBBoxesPadded[uu]
            outputBox = bb_util.from_crop_coordinate_system(rawOutput[uu, :].squeeze() / 10.0, pastBBoxPadded, 1, 1)

            if forwardCount > 0 and forwardCount % MAX_TRACK_LENGTH == 0:
                self._profiler.start(self._re3_crop2_profiler)
                croppedInput, _ = im_util.get_cropped_input(image, outputBox, CROP_PAD, CROP_SIZE)
                input = np.tile(croppedInput[np.newaxis, ...], (2, 1, 1, 1))
                self._profiler.stop(self._re3_crop2_profiler)
                feed_dict = {
                    self.imagePlaceholder: input,
                    self.prevLstmState: originalFeatures,
                    self.batch_size: 1,
                }
                self._profiler.start(self._re3_sess2_profiler)
                _, s1_new, s2_new = self.sess.run([self.outputs, self.state1, self.state2], feed_dict=feed_dict)
                self._profiler.stop(self._re3_sess2_profiler)
                lstmState = [s1_new[0], s1_new[1], s2_new[0], s2_new[1]]

            forwardCount += 1
            self.total_forward_count += 1

            if unique_id in starting_boxes:
                # Use label if it's given
                outputBox = np.array(starting_boxes[unique_id])

            outputBoxes[uu, :] = outputBox
            self.tracked_data[unique_id] = (lstmState, outputBox, image, originalFeatures, forwardCount)
        end_time = time.time()
        if self.total_forward_count > 0:
            self.time += (end_time - start_time - image_read_time)
        if SPEED_OUTPUT and self.total_forward_count % 100 == 0:
            print('Current tracking speed per object: %.3f FPS' % (
                        len(unique_ids) / (end_time - start_time - image_read_time)))
            print('Current tracking speed per frame:  %.3f FPS' % (1 / (end_time - start_time - image_read_time)))
            print('Current image read speed:          %.3f FPS' % (1 / (image_read_time)))
            print('Mean tracking speed per object:    %.3f FPS\n' % (self.total_forward_count / max(.00001, self.time)))
        return outputBoxes
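The multi-object variant is driven the same way; a sketch assuming the same hypothetical tracker object, with both tracks initialized from a dict of starting boxes on the first frame.

ids = ['car', 'person']
boxes = tracker.multi_track(ids, 'frame_0001.jpg',
                            starting_boxes={'car': [5, 5, 60, 40],
                                            'person': [100, 30, 140, 150]})
for path in ['frame_0002.jpg', 'frame_0003.jpg']:
    boxes = tracker.multi_track(ids, path)  # (N, 4) array of xyxy boxes, one row per id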
Example #5
    def track(self, unique_id, image, starting_box=None):
        convFeatures1 = []
        start_time = time.time()

        if isinstance(image, str):
            image = cv2.imread(image)[:, :, ::-1]
        else:
            image = image.copy()

        image_read_time = time.time() - start_time

        if starting_box is not None:
            lstmState = [np.zeros((1, LSTM_SIZE)) for _ in range(4)]
            print("starting_box>>")
            pastBBox = np.array(starting_box)  # Convert to a numpy array (copying) so the caller's box is not shared.
            prevImage = image
            originalFeatures = None
            forwardCount = 0
            convFeatures = None
        elif unique_id in self.tracked_data:
            lstmState, pastBBox, prevImage, originalFeatures, forwardCount, convFeatures = self.tracked_data[
                unique_id]
        else:
            raise Exception('Unique_id %s with no initial bounding box' %
                            unique_id)

        croppedInput0, pastBBoxPadded = im_util.get_cropped_input(
            prevImage, pastBBox, CROP_PAD, CROP_SIZE)
        croppedInput1, _ = im_util.get_cropped_input(image, pastBBox, CROP_PAD,
                                                     CROP_SIZE)

        feed_dict = {
            self.imagePlaceholder: [croppedInput0, croppedInput1],
            self.prevLstmState: lstmState,
            self.batch_size: 1,
        }
        rawOutput, s1, s2, convFeatures = self.sess.run(
            [self.outputs, self.state1, self.state2, self.conv_layers1],
            feed_dict=feed_dict)
        convFeatures1 = self.sess.run([self.conv_layers], feed_dict=feed_dict)
        lstmState = [s1[0], s1[1], s2[0], s2[1]]
        if forwardCount == 0:
            originalFeatures = [s1[0], s1[1], s2[0], s2[1]]

        prevImage = image

        # Shift output box to full image coordinate system.

        outputBox = bb_util.from_crop_coordinate_system(
            rawOutput.squeeze() / 10.0, pastBBoxPadded, 1, 1)

        if forwardCount > 0 and forwardCount % MAX_TRACK_LENGTH == 0:
            croppedInput, _ = im_util.get_cropped_input(
                image, outputBox, CROP_PAD, CROP_SIZE)
            input = np.tile(croppedInput[np.newaxis, ...], (2, 1, 1, 1))
            feed_dict = {
                self.imagePlaceholder: input,
                self.prevLstmState: originalFeatures,
                self.batch_size: 1,
            }
            rawOutput, s1, s2, convFeatures = self.sess.run(
                [self.outputs, self.state1, self.state2, self.conv_layers1],
                feed_dict=feed_dict)
            convFeatures1 = self.sess.run([self.conv_layers],
                                          feed_dict=feed_dict)
            lstmState = [s1[0], s1[1], s2[0], s2[1]]

        forwardCount += 1
        self.total_forward_count += 1

        if starting_box is not None:
            # Use label if it's given
            outputBox = np.array(starting_box)

        self.tracked_data[unique_id] = (lstmState, outputBox, image,
                                        originalFeatures, forwardCount,
                                        convFeatures1)
        end_time = time.time()
        if self.total_forward_count > 0:
            self.time += (end_time - start_time - image_read_time)

        return outputBox, self.tracked_data
Example #6
    def get_data_sequence(self):
        try:
            # Preallocate the space for the images and labels.
            tImage = np.zeros((self.delta, 2, CROP_SIZE, CROP_SIZE, 3),
                              dtype=np.uint8)
            xywhLabels = np.zeros((self.delta, 4), dtype=np.float32)

            mirrored = random.random() < 0.5
            useSimulator = random.random() < USE_SIMULATOR
            gtType = random.random()
            realMotion = random.random() < REAL_MOTION_PROB

            # Initialize first frame (give the network context).
            if useSimulator:
                # Initialize the simulation and run through a few frames.
                trackingObj, trackedObjects, background = simulator.create_new_track(
                )
                for _ in range(random.randint(0, 200)):
                    simulator.step(trackedObjects)
                    bbox = trackingObj.get_object_box()
                    occlusion = simulator.measure_occlusion(
                        bbox, trackingObj.occluder_boxes, cropPad=1)
                    if occlusion > .2:
                        break
                for _ in range(1000):
                    bbox = trackingObj.get_object_box()
                    occlusion = simulator.measure_occlusion(
                        bbox, trackingObj.occluder_boxes, cropPad=1)
                    if occlusion < 0.01:
                        break
                    simulator.step(trackedObjects)
                initBox = trackingObj.get_object_box()
                if self.debug:
                    images = [
                        simulator.get_image_for_frame(trackedObjects,
                                                      background)
                    ]
                else:
                    images = [np.zeros((SIMULATION_HEIGHT, SIMULATION_WIDTH))]

            else:
                # Read a new data sequence from batch cache and get the ground truth.
                (batchKey, images) = self.getData()
                gtKey = batchKey
                imageIndex = self.key_lookup[gtKey]
                initBox = self.datasets[gtKey[0]][imageIndex, :4].copy()
            if self.debug:
                bboxes = []
                cropBBoxes = []

            # bboxPrev starts at the initial box and is the best guess (or gt) for the image0 location.
            # noisyBox holds the bboxPrev estimate plus some noise.
            bboxPrev = initBox
            lstmState = None

            for dd in range(self.delta):
                # bboxOn is the gt location in image1
                if useSimulator:
                    bboxOn = trackingObj.get_object_box()
                else:
                    newKey = list(gtKey)
                    newKey[3] += dd
                    newKey = tuple(newKey)
                    imageIndex = self.key_lookup[newKey]
                    bboxOn = self.datasets[newKey[0]][imageIndex, :4].copy()
                if dd == 0:
                    noisyBox = bboxOn.copy()
                elif not realMotion and not useSimulator and gtType >= USE_NETWORK_PROB:
                    noisyBox = self.add_noise(bboxOn, bboxOn,
                                              images[0].shape[1],
                                              images[0].shape[0])
                else:
                    noisyBox = self.fix_bbox_intersection(
                        bboxPrev, bboxOn, images[0].shape[1],
                        images[0].shape[0])

                if useSimulator:
                    patch = simulator.render_patch(bboxPrev, background,
                                                   trackedObjects)
                    tImage[dd, 0, ...] = patch
                    if dd > 0:
                        simulator.step(trackedObjects)
                        bboxOn = trackingObj.get_object_box()
                        noisyBox = self.fix_bbox_intersection(
                            bboxPrev, bboxOn, images[0].shape[1],
                            images[0].shape[0])
                else:
                    tImage[dd, 0, ...] = im_util.get_cropped_input(
                        images[max(dd - 1, 0)], bboxPrev, CROP_PAD,
                        CROP_SIZE)[0]

                if useSimulator:
                    patch = simulator.render_patch(noisyBox, background,
                                                   trackedObjects)
                    tImage[dd, 1, ...] = patch
                    if self.debug:
                        images.append(
                            simulator.get_image_for_frame(
                                trackedObjects, background))
                else:
                    tImage[dd, 1, ...] = im_util.get_cropped_input(
                        images[dd], noisyBox, CROP_PAD, CROP_SIZE)[0]

                shiftedBBox = bb_util.to_crop_coordinate_system(
                    bboxOn, noisyBox, CROP_PAD, 1)
                shiftedBBoxXYWH = bb_util.xyxy_to_xywh(shiftedBBox)
                xywhLabels[dd, :] = shiftedBBoxXYWH

                if gtType < USE_NETWORK_PROB:
                    # Run through a single forward pass to get the next box estimate.
                    if dd < self.delta - 1:
                        if dd == 0:
                            lstmState = self.initialLstmState

                        feed_dict = {
                            self.forwardNetworkImagePlaceholder: tImage[dd,
                                                                        ...],
                            self.prevLstmState: lstmState
                        }
                        networkOuts, s1, s2 = self.sess.run(
                            [self.networkOutputs, self.state1, self.state2],
                            feed_dict=feed_dict)
                        lstmState = (s1[0], s1[1], s2[0], s2[1])

                        xyxyPred = networkOuts.squeeze() / 10
                        outputBox = bb_util.from_crop_coordinate_system(
                            xyxyPred, noisyBox, CROP_PAD, 1)

                        bboxPrev = outputBox
                        if self.debug:
                            bboxes.append(outputBox)
                            cropBBoxes.append(xyxyPred)
                else:
                    bboxPrev = bboxOn

                if self.debug:
                    # Look at the inputs to make sure they are correct.
                    image0 = tImage[dd, 0, ...].copy()
                    image1 = tImage[dd, 1, ...].copy()

                    xyxyLabel = bb_util.xywh_to_xyxy(
                        xywhLabels[dd, :].squeeze())
                    print('xyxy raw', xyxyLabel, 'actual',
                          xyxyLabel * CROP_PAD)
                    label = np.zeros((CROP_PAD, CROP_PAD))
                    drawing.drawRect(label, xyxyLabel * CROP_PAD, 0, 1)
                    drawing.drawRect(
                        image0,
                        bb_util.xywh_to_xyxy(np.full((4, 1), .5) * CROP_SIZE),
                        2, [255, 0, 0])
                    bigImage0 = images[max(dd - 1, 0)].copy()
                    bigImage1 = images[dd].copy()
                    if dd < len(cropBBoxes):
                        drawing.drawRect(bigImage1, bboxes[dd], 5, [255, 0, 0])
                        drawing.drawRect(image1, cropBBoxes[dd] * CROP_SIZE, 1,
                                         [0, 255, 0])
                        print('pred raw', cropBBoxes[dd], 'actual',
                              cropBBoxes[dd] * CROP_PAD)
                    print('\n')

                    label[0, 0] = 1
                    label[0, 1] = 0
                    plots = [bigImage0, bigImage1, image0, image1]
                    subplot = drawing.subplot(plots,
                                              2,
                                              2,
                                              outputWidth=OUTPUT_WIDTH,
                                              outputHeight=OUTPUT_HEIGHT,
                                              border=5)
                    cv2.imshow('debug', subplot[:, :, ::-1])
                    cv2.waitKey(1)

            if mirrored:
                tImage = np.fliplr(tImage.transpose(2, 3, 4, 0, 1)).transpose(
                    3, 4, 0, 1, 2)
                xywhLabels[..., 0] = 1 - xywhLabels[..., 0]

            tImage = tImage.reshape([self.delta * 2] + list(tImage.shape[2:]))
            xyxyLabels = bb_util.xywh_to_xyxy(xywhLabels.T).T * 10
            xyxyLabels = xyxyLabels.astype(np.float32)
            return tImage, xyxyLabels
        except Exception:
            # Debugging aid: dump the traceback and drop into pdb so the failure
            # can be inspected instead of silently ending the sequence.
            import traceback
            traceback.print_exc()
            import pdb
            pdb.set_trace()
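The mirroring step near the end of get_data_sequence can be checked in isolation. A small numpy sketch (shapes chosen arbitrarily) confirming that the transpose / fliplr / transpose round trip flips only the crop width axis, and that the matching label change is mirroring the x center with 1 - x.

import numpy as np

delta, crop = 2, 4
t = np.arange(delta * 2 * crop * crop * 3).reshape(delta, 2, crop, crop, 3)
flipped = np.fliplr(t.transpose(2, 3, 4, 0, 1)).transpose(3, 4, 0, 1, 2)
# Width is axis 3 of the (delta, 2, H, W, 3) array; only that axis is reversed.
assert np.array_equal(flipped, t[:, :, :, ::-1, :])

xywh = np.array([0.3, 0.5, 0.2, 0.4])  # normalized (cx, cy, w, h)
xywh[0] = 1 - xywh[0]                  # mirrored center x: 0.3 -> 0.7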
Example #7
    def track(self, unique_id, image, starting_box=None):
        start_time = time.time()

        if isinstance(image, str):
            image = cv2.imread(image)[:, :, ::-1]
        else:
            image = image.copy()

        image_read_time = time.time() - start_time

        if starting_box is not None:
            lstm_state = None
            past_bbox = np.array(starting_box)  # Convert to a numpy array (copying) so the caller's box is not shared.
            prev_image = image
            original_features = None
            forward_count = 0
        elif unique_id in self.tracked_data:
            lstm_state, past_bbox, prev_image, original_features, forward_count = self.tracked_data[unique_id]
        else:
            raise Exception("Unique_id %s with no initial bounding box" % unique_id)

        cropped_input0, past_b_box_padded = im_util.get_cropped_input(prev_image, past_bbox, CROP_PAD, CROP_SIZE)
        cropped_input1, _ = im_util.get_cropped_input(image, past_bbox, CROP_PAD, CROP_SIZE)

        image_input = pt_util.from_numpy((np.stack([cropped_input0, cropped_input1])))
        raw_output = self.network(image_input, lstm_state)
        raw_output = pt_util.to_numpy_array(raw_output)
        lstm_state = self.network.lstm_state
        if forward_count == 0:
            original_features = [var.clone().detach() for var in self.network.lstm_state]

        prev_image = image

        # Shift output box to full image coordinate system.
        output_box = bb_util.from_crop_coordinate_system(raw_output.squeeze() / 10.0, past_b_box_padded, 1, 1)
        if forward_count > 0 and forward_count % MAX_TRACK_LENGTH == 0:
            cropped_input, _ = im_util.get_cropped_input(image, output_box, CROP_PAD, CROP_SIZE)
            image_input = pt_util.from_numpy(np.tile(cropped_input[np.newaxis, ...], (2, 1, 1, 1)))
            self.network(image_input, original_features)
            lstm_state = self.network.lstm_state

        forward_count += 1
        self.total_forward_count += 1

        if starting_box is not None:
            # Use label if it's given
            output_box = np.array(starting_box)

        self.tracked_data[unique_id] = (lstm_state, output_box, image, original_features, forward_count)
        end_time = time.time()
        if self.total_forward_count > 0:
            self.t_time += end_time - start_time - image_read_time
        if SPEED_OUTPUT and self.total_forward_count % 100 == 0:
            print("Current tracking speed:   %.3f FPS" % (1 / (end_time - start_time - image_read_time)))
            print("Current image read speed: %.3f FPS" % (1 / (image_read_time)))
            print("Mean tracking speed:      %.3f FPS\n" % (self.total_forward_count / max(0.00001, self.t_time)))
        return output_box