def __init__(self, paths_file, loader=default_loader, transform=None, target_transform=None,
             frac_train=0.8, visibility=0.3, min_width=0, min_height=0):
    """
    Collects all image paths and bounding boxes of the videos listed in paths_file.
    """
    self.loader = loader
    # always convert to a tensor; only prepend the user transform if one was given
    if transform is not None:
        self.transform = trans.Compose([transform, trans.ToTensor()])
    else:
        self.transform = trans.ToTensor()
    self.target_transform = target_transform
    paths = motu.parse_videos_file(paths_file)
    current_index = 0
    used_index = [0, 2, 3, 4, 5]  # keep the frame index and the four box parameters
    self.all_gt = np.zeros([0, 5])
    self.all_imagepaths = []
    self.num_classes = 80
    for path in paths:
        gt, img, info = motu.get_gt_img_inf(path)
        modified_length = int(info.seqLength * frac_train)
        gt = gt[gt[:, 2] < info.imWidth]   # remove boxes that start right of the image
        gt = gt[gt[:, 3] < info.imHeight]  # remove boxes that start below the image
        neg_ids_x = np.where(gt[:, 2] < 0)
        neg_ids_y = np.where(gt[:, 3] < 0)
        pos_ids_x = np.where(gt[:, 2] + gt[:, 4] >= info.imWidth)
        pos_ids_y = np.where(gt[:, 3] + gt[:, 5] >= info.imHeight)
        # if we move the top left corner into the image we must shrink the width accordingly
        gt[neg_ids_x, 4] += gt[neg_ids_x, 2]
        gt[neg_ids_x, 2] = 0
        gt[neg_ids_y, 5] += gt[neg_ids_y, 3]  # same for the height (don't know if y < 0 actually occurs)
        gt[neg_ids_y, 3] = 0
        gt[pos_ids_x, 4] = info.imWidth - gt[pos_ids_x, 2] - 1   # equal would also be bad
        gt[pos_ids_y, 5] = info.imHeight - gt[pos_ids_y, 3] - 1
        # filter rows only after all index-based corrections, otherwise the
        # precomputed index arrays would point at the wrong rows
        gt = gt[gt[:, 4] > 0]  # make sure the width stayed positive
        gt = gt[gt[:, 5] > 0]  # make sure the height stayed positive
        gt = gt[gt[:, 8] > visibility, :]  # keep only sufficiently visible boxes
        gt = gt[gt[:, 5] > min_height, :]
        gt = gt[gt[:, 4] > min_width, :]
        gt = motu.transform_bb_to_centered(gt)
        gt = motu.filter_person(gt)
        gt = motu.filter_frames(gt, 0, modified_length)
        gt = gt[:, used_index]
        gt[:, 0] += current_index  # make the frame index unique across videos
        current_index += modified_length
        self.all_gt = np.concatenate((self.all_gt, gt), 0)
        self.all_imagepaths += img[:modified_length]
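# The clipping above is easy to get wrong, so here is a minimal, self-contained
# sketch of the same logic as a standalone helper (the name `clip_boxes` is
# hypothetical and not part of the project code). It assumes a NumPy array with
# rows of the form [frame, id, x, y, w, h, ...] in pixel coordinates.
def clip_boxes(gt, im_width, im_height):
    gt = gt.copy()
    # shift top-left corners that lie outside the image back inside and shrink
    # the width/height by the same amount
    neg_x = gt[:, 2] < 0
    gt[neg_x, 4] += gt[neg_x, 2]
    gt[neg_x, 2] = 0
    neg_y = gt[:, 3] < 0
    gt[neg_y, 5] += gt[neg_y, 3]
    gt[neg_y, 3] = 0
    # clamp boxes that extend past the right/bottom border
    over_x = gt[:, 2] + gt[:, 4] >= im_width
    gt[over_x, 4] = im_width - gt[over_x, 2] - 1
    over_y = gt[:, 3] + gt[:, 5] >= im_height
    gt[over_y, 5] = im_height - gt[over_y, 3] - 1
    # drop boxes whose width or height became non-positive
    return gt[(gt[:, 4] > 0) & (gt[:, 5] > 0)]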
def __getitem__(self, index):
    """
    Loads the image for the given frame index and selects the matching bounding boxes.
    """
    target = motu.filter_gt(self.all_gt, index)
    target = torch.from_numpy(target[:, 1:])  # drop the frame index, keep the box parameters
    sample = self.loader(self.all_imagepaths[index])
    width, height = sample.size  # PIL image size is (width, height)
    if self.transform is not None:
        sample = self.transform(sample)
    # after ToTensor the sample is a C x H x W tensor
    if height != sample.shape[1] or width != sample.shape[2]:
        height_scale = height / sample.shape[1]
        width_scale = width / sample.shape[2]
        if self.target_transform is not None:
            target = trans.Compose([
                trans.Lambda(lambda x: motu.resize_bb(x, height_scale, width_scale)),
                self.target_transform
            ])(target)
        else:
            target = motu.resize_bb(target, height_scale, width_scale)
    elif self.target_transform is not None:
        target = self.target_transform(target)
    return sample, target
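# Sketch of the rescaling that __getitem__ applies when the transform changes
# the image size: centered boxes [cx, cy, w, h] are divided by the respective
# scale factor (scale = original size / new size). `resize_boxes` is a
# hypothetical stand-in for motu.resize_bb, written only to make the
# convention explicit.
def resize_boxes(boxes, height_scale, width_scale):
    boxes = boxes.clone()
    boxes[:, [0, 2]] /= width_scale   # cx and w follow the horizontal scale
    boxes[:, [1, 3]] /= height_scale  # cy and h follow the vertical scale
    return boxes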
if not os.path.isdir(log_path):
    os.makedirs(log_path)
batch_size = 1  # only works with batch size 1
cuda = False  # careful: can use a lot of GPU memory
save_every_image = False  # if True, all images are additionally saved as .png
anchors = [(0.43, 1.715), (0.745625, 3.645), (1.24375, 5.9325), (2.5, 12.24), (6.1225, 22.41375)]
net = YoloLSTM(batch_size, image_size=832, anchors=anchors)
net.load_snapshot(snapshot_path)
net.eval()
if cuda:
    net.cuda()
net.reinit_lstm(batch_size)
imgs = []
_, img_paths, _ = motu.get_gt_img_inf(path)
with torch.no_grad():
    # run inference on the last 20% of the frames (the validation split)
    for img_path in img_paths[int(len(img_paths) * .8):]:
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (net.image_size, net.image_size))
        img = image.copy()  # keep an unnormalized copy for visualization
        # HWC uint8 -> 1 x C x H x W float32 tensor
        image = torch.from_numpy(np.transpose(image.astype(np.float32), (2, 0, 1))).unsqueeze(0)
        if cuda:
            image = image.cuda()
        logits = net(image)
        if cuda:
            logits = logits.cpu()
        boxes = post_processing(logits, net.image_size, net.anchors, 0.25, 0.5)
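# The per-frame preprocessing above, factored into a helper for readability
# (hypothetical name, not part of the project code): BGR -> RGB, resize to the
# network input size, and convert HWC uint8 to a 1 x C x H x W float tensor.
def load_frame(img_path, size):
    image = cv2.imread(img_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = cv2.resize(image, (size, size))
    tensor = torch.from_numpy(np.transpose(image.astype(np.float32), (2, 0, 1))).unsqueeze(0)
    return image, tensor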
def __init__(self, paths_file, seq_length=20, new_height=416, new_width=416, step=5, valid_ratio=0.2,
             use_only_first_video=False):
    """
    :param paths_file: path to the file which lists the paths of the images and labels
    :param seq_length: length of the sequences to be generated
    :param new_height: resize images to this height
    :param new_width: resize images to this width
    :param step: step between the start frames of consecutive sequences
    :param valid_ratio: fraction of frames used for validation, i.e. by default the last 20% of each video
    :param use_only_first_video: for debugging; only the first video is used for dataset construction
    """
    # the frame rate is not constant (05 SDP has 14 fps, 02 SDP has 30 fps) -> could be a problem
    # the motion itself is also a problem, especially if it changes during the prediction horizon
    # crop negative boxes (maybe not necessary)
    self.seq_length = seq_length
    self.im_size = (new_height, new_width)
    paths = motu.parse_videos_file(paths_file)
    # intermediate format: video -> frame -> boxes with id
    videos_train = {i: {"gt": [], "path": ""} for i in range(len(paths))}
    videos_valid = {i: {"gt": [], "path": ""} for i in range(len(paths))}
    frame_offsets = []
    # statistics accumulators (the first element is a dummy and is dropped later)
    all_traj_lengths = np.zeros(1)
    all_displacements = np.zeros(1)
    for i, path in enumerate(paths):
        if i != 0 and use_only_first_video:
            break
        # gt format: [frame, id, 4 box parameters, ...]
        gt, info = motu.get_gt_info(path)
        num_frames = info.seqLength
        # remove everything that is not a pedestrian (classes 1 and 7)
        cond = np.vstack([(gt[:, 7] == 1).reshape(1, -1), (gt[:, 7] == 7).reshape(1, -1)]).T
        gt = gt[np.where(np.any(cond, axis=1))]
        # pedestrian ids are now discontinuous, which makes the stats calculation harder
        # -> compute a correction and apply it
        ids = gt[:, 1]
        ids_diff = ids[1:] - ids[:-1]  # a first attempt with np.convolve didn't work
        ids_diff[ids_diff != 0] -= 1   # remove the natural +1 step between consecutive ids
        correct = np.cumsum(ids_diff)
        gt[1:, 1] -= correct           # assume the first sample has the correct id
        gt[:, 1] -= gt[0, 1] - 1       # the first id may not be 1, so shift ids to start at 1
        # fix negative / oversized boxes (not sure it is really important; there really are boxes that are too big)
        neg_ids_x = np.where(gt[:, 2] < 0)
        neg_ids_y = np.where(gt[:, 3] < 0)
        pos_ids_x = np.where(gt[:, 2] + gt[:, 4] >= info.imWidth)
        pos_ids_y = np.where(gt[:, 3] + gt[:, 5] >= info.imHeight)
        # if we move the top left corner into the image we must shrink the width accordingly
        gt[neg_ids_x, 4] += gt[neg_ids_x, 2]
        gt[neg_ids_x, 2] = 0
        gt[neg_ids_y, 5] += gt[neg_ids_y, 3]  # same for the height (don't know if y < 0 actually occurs)
        gt[neg_ids_y, 3] = 0
        gt[pos_ids_x, 4] = info.imWidth - gt[pos_ids_x, 2] - 1   # equal would also be bad
        gt[pos_ids_y, 5] = info.imHeight - gt[pos_ids_y, 3] - 1
        # resize boxes and normalize data
        height_scale = info.imHeight / new_height
        width_scale = info.imWidth / new_width
        gt = motu.transform_bb_to_centered(gt)
        gt[:, [2, 4]] /= width_scale
        gt[:, [3, 5]] /= height_scale
        # compute stats, i.e. trajectory length and displacement (mean, std, max, min)
        max_ped_id = int(np.max(gt[:, 1]))  # ids start at 1
        traj_lengths = np.zeros(max_ped_id)
        displacements = np.zeros(max_ped_id)
        for ped_id in range(max_ped_id):
            ids = np.where(gt[:, 1] == ped_id + 1)[0]
            # trajectory length = number of frames in which this pedestrian appears
            traj_lengths[ped_id] = ids.shape[0]
            # displacement from the first to the last observed position
            start = gt[ids[0], 2:4]
            end = gt[ids[-1], 2:4]
            # if np.sqrt(np.sum(np.square(start - end))) > 416:
            #     print(start, end)
            displacements[ped_id] = np.sqrt(np.sum(np.square(start - end)))
        all_traj_lengths = np.concatenate([all_traj_lengths, traj_lengths])
        all_displacements = np.concatenate([all_displacements, displacements])
        # build the intermediate structure; leave a 10 frame gap between train and valid data
        train_test_split_idx = int(num_frames * (1.0 - valid_ratio))
        frame_offsets.append(train_test_split_idx)
        num_train_frames = train_test_split_idx - 10
        num_valid_frames = num_frames - train_test_split_idx
        videos_train[i]["gt"] = [None for j in range(num_train_frames)]
        videos_train[i]["path"] = path + "img1/"
        for j in range(num_train_frames):
            videos_train[i]["gt"][j] = gt[np.where(gt[:, 0] == j)]
        videos_valid[i]["gt"] = [None for j in range(num_valid_frames)]
        videos_valid[i]["path"] = path + "img1/"
        for j in range(num_valid_frames):
            videos_valid[i]["gt"][j] = gt[np.where(gt[:, 0] == j + train_test_split_idx)]
    # goal format: [batch_size, seq_index, (id, bb)]
    # self.sequences contains train + valid; each element is [seq_length, 120, 5]
    # for debugging, self.frame_paths contains the source path of the corresponding image
    self.sequences, self.frame_paths = self._intermediate_to_final(videos_train, self.seq_length, step,
                                                                   remaining_peds_idx=self.seq_length // 2)
    self.valid_begin = len(self.sequences)  # needed to split the data with data.Subset
    valid_seq, valid_frames = self._intermediate_to_final(videos_valid, self.seq_length, step, frame_offsets,
                                                          remaining_peds_idx=self.seq_length // 2)
    self.sequences += valid_seq
    self.frame_paths += valid_frames
    # print stats
    all_traj_lengths = all_traj_lengths[1:]  # first element is the dummy
    print("Mean trajectory length: {}".format(np.mean(all_traj_lengths)))
    print("Deviation of trajectory length: {}".format(np.std(all_traj_lengths)))
    print("Max trajectory length: {}".format(np.max(all_traj_lengths)))
    print("Min trajectory length: {}".format(np.min(all_traj_lengths)))
    all_displacements = all_displacements[1:]
    print("Mean displacement: {}".format(np.mean(all_displacements)))
    print("Deviation of displacements: {}".format(np.std(all_displacements)))
    print("Max displacement: {}".format(np.max(all_displacements)))
    print("Min displacement: {}".format(np.min(all_displacements)))
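# Worked example of the id correction used above: after filtering out
# non-pedestrian rows the ids become discontinuous (e.g. 3, 5, 9), and the
# cumulative-sum correction maps them onto consecutive ids starting at 1 while
# preserving order. The numbers below are made up for illustration.
ids = np.array([3., 3., 5., 5., 5., 9., 9.])
ids_diff = ids[1:] - ids[:-1]       # 0, 2, 0, 0, 4, 0
ids_diff[ids_diff != 0] -= 1        # keep a step of exactly 1 between distinct ids
correct = np.cumsum(ids_diff)       # 0, 1, 1, 1, 4, 4
ids[1:] -= correct                  # 3, 3, 4, 4, 4, 5, 5
ids -= ids[0] - 1                   # 1, 1, 2, 2, 2, 3, 3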
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument('-filelist', default='Mot17_test_single.txt', help='path to filelist')
    parser.add_argument('-output_dir', default='generated_anchors/anchors', type=str,
                        help='output anchor directory')
    parser.add_argument('-num_clusters', default=0, type=int, help='number of clusters')
    # note: the parser above is currently not used; the values below are hard-coded for local runs
    filelist = 'Mot17_test_single.txt'
    output_dir = 'anchors'
    arg_num_clusters = 0
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    paths_file = filelist
    paths = motu.parse_videos_file(paths_file)
    all_boxes = np.zeros([0, 2])
    for i, path in enumerate(paths):
        # gt format: [frame, id, 4 box parameters, ...]
        path = '../' + path  # temporary workaround until the dataset is available locally
        # get ground truth (gt)
        gt, info = motu.get_gt_info(path)
        num_frames = info.seqLength
        # remove everything that is not a pedestrian (classes 1 and 7)
        cond = np.vstack([(gt[:, 7] == 1).reshape(1, -1), (gt[:, 7] == 7).reshape(1, -1)]).T
        gt = gt[np.where(np.any(cond, axis=1))]
        # fix negative / oversized boxes (not sure it is really important; there are boxes that are too big)
        neg_ids_x = np.where(gt[:, 2] < 0)
        neg_ids_y = np.where(gt[:, 3] < 0)
        pos_ids_x = np.where(gt[:, 2] + gt[:, 4] >= info.imWidth)
        pos_ids_y = np.where(gt[:, 3] + gt[:, 5] >= info.imHeight)
        # if we move the top left corner into the image we must shrink the width accordingly
        gt[neg_ids_x, 4] += gt[neg_ids_x, 2]
        gt[neg_ids_x, 2] = 0
        gt[neg_ids_y, 5] += gt[neg_ids_y, 3]  # same for the height (don't know if y < 0 actually occurs)
        gt[neg_ids_y, 3] = 0
        gt[pos_ids_x, 4] = info.imWidth - gt[pos_ids_x, 2] - 1   # equal would also be bad
        gt[pos_ids_y, 5] = info.imHeight - gt[pos_ids_y, 3] - 1
        # normalise boxes to the network input resolution
        height_scale = info.imHeight / height_in_cfg_file
        width_scale = info.imWidth / width_in_cfg_file
        gt[:, [2, 4]] /= width_scale
        gt[:, [3, 5]] /= height_scale
        all_boxes = np.concatenate([all_boxes, gt[:, [4, 5]]])
    annotation_dims = all_boxes
    eps = 0.005
    if arg_num_clusters == 0:
        for num_clusters in range(1, 11):  # generate anchor sets for 1 through 10 clusters
            anchor_file = join(output_dir, 'anchors%d.txt' % num_clusters)
            indices = [random.randrange(annotation_dims.shape[0]) for i in range(num_clusters)]
            centroids = annotation_dims[indices]
            kmeans(annotation_dims, centroids, eps, anchor_file)
            print('centroids.shape', centroids.shape)
    else:
        anchor_file = join(output_dir, 'anchors%d.txt' % arg_num_clusters)
        indices = [random.randrange(annotation_dims.shape[0]) for i in range(arg_num_clusters)]
        centroids = annotation_dims[indices]
        kmeans(annotation_dims, centroids, eps, anchor_file)
        print('centroids.shape', centroids.shape)
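# For context, anchor clustering for YOLO-style detectors usually measures the
# distance between a box and a centroid as 1 - IoU of their (width, height)
# pairs, as if both boxes shared the same center. This is only an illustrative
# sketch and not the project's kmeans implementation.
def iou_wh(boxes, centroids):
    # boxes: [N, 2] and centroids: [K, 2], both as (w, h)
    inter = np.minimum(boxes[:, None, 0], centroids[None, :, 0]) * \
            np.minimum(boxes[:, None, 1], centroids[None, :, 1])
    union = boxes[:, None, 0] * boxes[:, None, 1] + \
            centroids[None, :, 0] * centroids[None, :, 1] - inter
    return inter / union  # [N, K]

def nearest_centroid(boxes, centroids):
    # assign each box to the centroid with the smallest (1 - IoU) distance
    return np.argmin(1.0 - iou_wh(boxes, centroids), axis=1)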