def __init__(self, root, list_file, train, transform, input_size):
    '''Index a detection dataset described by a plain-text list file.

    Every non-blank line of ``list_file`` is whitespace separated:
    ``<image name> [<xmin> <ymin> <xmax> <ymax> <class>] ...``

    Args:
      root: (str) directory to images.
      list_file: (str) path to index file.
      train: (boolean) train or test.
      transform: ([transforms]) image transforms.
      input_size: (int) model input size.
    '''
    self.root = root
    self.train = train
    self.transform = transform
    self.input_size = input_size

    self.fnames = []
    self.boxes = []
    self.labels = []

    self.encoder = DataEncoder()

    with open(list_file) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # A blank line (typically a trailing newline) holds no
                # sample; indexing fields[0] on it would raise IndexError.
                continue

            self.fnames.append(fields[0])
            # Annotations follow the name in groups of five values; an
            # incomplete trailing group is ignored by the floor division.
            num_boxes = (len(fields) - 1) // 5
            box = []
            label = []
            for i in range(num_boxes):
                xmin, ymin, xmax, ymax, c = fields[1 + 5 * i:6 + 5 * i]
                box.append(
                    [float(xmin), float(ymin), float(xmax), float(ymax)])
                label.append(int(c))
            self.boxes.append(torch.Tensor(box))
            self.labels.append(torch.LongTensor(label))

    # Count parsed samples rather than raw lines so the reported size
    # always agrees with the per-sample lists.
    self.num_samples = len(self.fnames)
class ListDataset(data.Dataset):
    '''Object-detection dataset indexed by a plain-text list file.

    Every non-blank line of the list file is whitespace separated:
    ``<image name> [<xmin> <ymin> <xmax> <ymax> <class>] ...``
    '''

    def __init__(self, root, list_file, train, transform, input_size):
        '''
        Args:
          root: (str) directory to images.
          list_file: (str) path to index file.
          train: (boolean) train or test.
          transform: ([transforms]) image transforms.
          input_size: (int) model input size.
        '''
        self.root = root
        self.train = train
        self.transform = transform
        self.input_size = input_size

        self.encoder = DataEncoder()

        self.fnames, self.boxes, self.labels = self._load_index(list_file)
        # Count parsed samples rather than raw lines so __len__ always
        # agrees with the per-sample lists.
        self.num_samples = len(self.fnames)

    @staticmethod
    def _load_index(list_file):
        '''Parse the list file into parallel name / box / label lists.

        Blank lines are skipped; they used to raise IndexError.

        Returns:
          fnames: (list of str) image names.
          boxes: (list of tensor) one float tensor per image.
          labels: (list of tensor) one long tensor per image.
        '''
        fnames, boxes, labels = [], [], []
        with open(list_file) as f:
            for line in f:
                fields = line.split()
                if not fields:
                    continue
                fnames.append(fields[0])
                # Annotations come in groups of five after the name; an
                # incomplete trailing group is ignored.
                num_boxes = (len(fields) - 1) // 5
                box = []
                label = []
                for i in range(num_boxes):
                    xmin, ymin, xmax, ymax, c = fields[1 + 5 * i:6 + 5 * i]
                    box.append(
                        [float(xmin), float(ymin), float(xmax), float(ymax)])
                    label.append(int(c))
                boxes.append(torch.Tensor(box))
                labels.append(torch.LongTensor(label))
        return fnames, boxes, labels

    def __getitem__(self, idx):
        '''Load one image together with its (un-encoded) annotations.

        Args:
          idx: (int) image index.

        Returns:
          img: transformed image (whatever ``self.transform`` returns).
          boxes: (tensor) boxes after augmentation / resizing.
          labels: (tensor) class labels as stored for this image.
        '''
        # Load image and boxes.
        fname = self.fnames[idx]
        img = Image.open(os.path.join(self.root, fname))
        if img.mode != 'RGB':
            img = img.convert('RGB')

        # Clone so the helpers below never touch the stored annotations.
        boxes = self.boxes[idx].clone()
        labels = self.labels[idx]
        size = self.input_size

        # Data augmentation.
        if self.train:
            img, boxes = random_flip(img, boxes)
            img, boxes = random_crop(img, boxes)
            img, boxes = resize(img, boxes, (size, size))
        else:
            img, boxes = resize(img, boxes, size)
            img, boxes = center_crop(img, boxes, (size, size))

        img = self.transform(img)
        return img, boxes, labels

    def collate_fn(self, batch):
        '''Stack images and encode targets for one batch.

        Each image is copied into an ``input_size`` x ``input_size`` slot;
        no padding is performed here.

        Args:
          batch: (list) of (img, boxes, labels) tuples from __getitem__.

        Returns:
          stacked images, stacked loc_targets, stacked cls_targets.
        '''
        imgs = [x[0] for x in batch]
        boxes = [x[1] for x in batch]
        labels = [x[2] for x in batch]

        h = w = self.input_size
        num_imgs = len(imgs)
        inputs = torch.zeros(num_imgs, 3, h, w)

        loc_targets = []
        cls_targets = []
        for i in range(num_imgs):
            inputs[i] = imgs[i]
            loc_target, cls_target = self.encoder.encode(
                boxes[i], labels[i], input_size=(w, h))
            loc_targets.append(loc_target)
            cls_targets.append(cls_target)
        return inputs, torch.stack(loc_targets), torch.stack(cls_targets)

    def __len__(self):
        return self.num_samples
def test_eval():
    """Print top-1 identification accuracy of ``id_net`` on a held-out split.

    The CSV at ``./../face_a/train.csv`` is read as ``<image name>,<id>``
    rows. The first image of every identity in ``range(2874)`` becomes the
    gallery; the first 400 remaining images are the probes. Each image is
    embedded as the concatenation of the network output for the aligned
    image and for its horizontal flip, and a probe is assigned the identity
    of the gallery entry with the highest cosine similarity.

    Raises:
        ValueError: if an identity in ``range(2874)`` has no row in the CSV.
    """
    num_ids = 2874
    num_valid = 400
    root = "./../face_a/train"
    list_file = "./../face_a/train.csv"

    def extract_feature(name):
        # Embedding = [net(image), net(flipped image)] concatenated on dim 1.
        img = Image.open(name).convert('RGB')
        img = alignment(img)
        img, img_ = transform(img), transform(F.hflip(img))
        img = Variable(img.unsqueeze(0).cuda(), volatile=True)
        img_ = Variable(img_.unsqueeze(0).cuda(), volatile=True)
        return torch.cat((id_net(img), id_net(img_)), 1).data.cpu()[0]

    id_net.eval()

    # Close the CSV deterministically instead of leaking the handle.
    with open(list_file, 'r') as f:
        rows = list(csv.reader(f))
    fnames = [os.path.join(root, row[0]) for row in rows]
    ids = [int(row[1]) for row in rows]

    # Move the first image of each identity out of the pool into the gallery.
    ids_list = list(range(num_ids))
    im_name_list = []
    for identity in ids_list:
        seq_num = ids.index(identity)
        im_name_list.append(fnames.pop(seq_num))
        del ids[seq_num]

    im_name_valid = fnames[:num_valid]
    ids_valid = ids[:num_valid]

    eval_list_feature = torch.zeros(len(ids_list), 1024)
    for i, name in enumerate(im_name_list):
        print(i)
        eval_list_feature[i, :] = extract_feature(name)

    predicted = []
    for name in im_name_valid:
        face_feature = extract_feature(name)
        dis = []
        for gallery_counter in range(eval_list_feature.size(0)):
            f1 = eval_list_feature[gallery_counter, :]
            cos_dis = f1.dot(face_feature) / (
                f1.norm() * face_feature.norm() + 1e-5)
            dis.append(float(cos_dis))
        # Keep predictions as ints: the ground-truth ids are ints, and the
        # previous str-vs-int comparison could never match (accuracy 0).
        predicted.append(ids_list[dis.index(max(dis))])

    acc_counter = sum(
        1 for guess, truth in zip(predicted, ids_valid) if guess == truth)
    # Divide by the real number of probes, which may be below num_valid.
    print(acc_counter / float(len(ids_valid)) if ids_valid else 0.0)
# Single-image inference demo: run the traced model on one photo and draw
# the anchor box selected by the arg-max of its output.
image = Image.open('IMG_3321.JPG').convert('RGB')
image = image.resize((1280, 960))
# Keep an un-normalised copy of the resized image to draw on.
img = image.copy()
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])
print(image.size)
image = transform(image)

# forward
loc_preds = traced_script_module(image.unsqueeze(0).cuda())
# Flat index of the largest output value; used below to pick an anchor.
loc_preds = loc_preds.argmax()
print(loc_preds)

encoder = DataEncoder()
ref_table = encoder._get_anchor_boxes(torch.Tensor([1280, 960]))
boxes = [ref_table[loc_preds]]
box = boxes[0]
print(boxes)

# Presumably the anchor is (cx, cy, w, h) and is turned into
# (xmin, ymin, xmax, ymax) here -- TODO confirm against DataEncoder.
# NOTE(review): `box` is indexed out of `ref_table` without a clone, so these
# assignments may also modify the anchor table itself -- confirm.
box[0] = (box[0] - box[2] / 2)
box[1] = (box[1] - box[3] / 2)
box[2] = (box[2] + box[0])
box[3] = (box[3] + box[1])
print(ref_table[215999])

draw = ImageDraw.Draw(img)
draw.rectangle(list(box), outline='red')
transforms.ToTensor(), transforms.Normalize(cfg.mean, cfg.std) ] if cfg.scale is not None: train_transform_list.insert(0, transforms.Scale(cfg.scale)) train_transform = transforms.Compose(train_transform_list) val_transform = transforms.Compose( [transforms.ToTensor(), transforms.Normalize(cfg.mean, cfg.std)]) trainset = VocLikeDataset(image_dir=cfg.image_dir, annotation_dir=cfg.annotation_dir, imageset_fn=cfg.train_imageset_fn, image_ext=cfg.image_ext, classes=cfg.classes, encoder=DataEncoder(), transform=train_transform) valset = VocLikeDataset(image_dir=cfg.image_dir, annotation_dir=cfg.annotation_dir, imageset_fn=cfg.val_imageset_fn, image_ext=cfg.image_ext, classes=cfg.classes, encoder=DataEncoder(), transform=val_transform) trainloader = torch.utils.data.DataLoader(trainset, batch_size=cfg.batch_size, shuffle=True, num_workers=cfg.num_workers, collate_fn=trainset.collate_fn) valloader = torch.utils.data.DataLoader(valset, batch_size=cfg.batch_size,
class BottleLoader(Dataset):
    """Dataset of ``<name>.jpg`` images with ``<name><json_suffix>.json`` labels.

    Each annotation file holds a list of groups; every group has an ``id``
    (the class) and a ``data`` list of objects carrying a ``boundingBox``
    with ``X``, ``Y``, ``Width`` and ``Height``.
    """

    def __init__(self, dir, encoder, json_suffix='', transform=None,
                 val=False):
        """
        Args:
            dir: directory holding the images and annotation files.
            encoder: object whose ``encode(boxes, labels, input_size=...)``
                is used by :meth:`collate_fn`.
            json_suffix: text inserted between the image stem and ``.json``.
            transform: optional callable applied to each example dict.
            val: when true, :meth:`collate_fn` also returns image sizes.
        """
        self.dir = dir
        self.json_suffix = json_suffix
        self.transform = transform
        self.encoder = encoder
        self.val = val

        ext = '.jpg'
        # Match on the real extension (not a substring) and sort, because
        # listdir() returns entries in arbitrary order.
        stems = sorted(
            path.join(self.dir, f[:-len(ext)])
            for f in listdir(self.dir) if f.endswith(ext))
        self.impath = [f'{stem}{ext}' for stem in stems]
        self.annotations = [f'{stem}{self.json_suffix}.json' for stem in stems]

        labelset = set()
        for p in self.annotations:
            with open(p, 'r') as f:
                labelset.update(group['id'] for group in json.load(f))
        # Set iteration order is not stable between runs, so enumerate a
        # sorted view to give every class the same index on every run. The
        # type name in the key keeps mixed int/str ids sortable.
        ordered = sorted(labelset, key=lambda k: (type(k).__name__, k))
        self.label_index = {label: i for i, label in enumerate(ordered)}

    def annotate(self, fname, imsize):
        """Read one annotation file and return its ``BoundingBox`` list.

        Args:
            fname: path of the JSON annotation file.
            imsize: two-item sequence; items 0 and 1 are passed to
                ``BoundingBox`` after the four box coordinates.
        """
        boxes = []
        with open(fname, 'r') as f:
            groups = json.load(f)
        for group in groups:
            for obj in group['data']:
                bb = obj['boundingBox']
                boxes.append(
                    BoundingBox(
                        bb['X'],
                        bb['Y'] + bb['Height'],
                        bb['X'] + bb['Width'],
                        bb['Y'],
                        imsize[0], imsize[1],
                        self.label_index[group['id']]))
        return boxes

    def __getitem__(self, index):
        """Return ``{'image', 'boxes'}`` for one sample, after ``transform``."""
        impath = self.impath[index]
        annotation = self.annotations[index]
        image = Image.open(impath)
        boxes = self.annotate(annotation, image.size)
        example = {'image': image, 'boxes': boxes}
        if self.transform:
            example = self.transform(example)
        return example

    def __len__(self):
        return len(self.impath)

    def collate_fn(self, batch):
        """Zero-pad images to a common size and encode the targets.

        Args:
            batch: list of example dicts with ``image``, ``boxes`` and
                ``labels`` entries (``labels`` is not produced by
                ``__getitem__`` itself, so presumably ``transform`` adds it).

        Returns:
            ``(inputs, loc_targets, cls_targets)``, or
            ``(inputs, img_sizes, loc_targets, cls_targets)`` when ``val``.
        """
        imgs = [example['image'] for example in batch]
        boxes = [example['boxes'] for example in batch]
        labels = [example['labels'] for example in batch]
        img_sizes = [img.size()[1:] for img in imgs]

        max_h = max(im.size(1) for im in imgs)
        max_w = max(im.size(2) for im in imgs)
        num_imgs = len(imgs)
        inputs = torch.zeros(num_imgs, 3, max_h, max_w)

        loc_targets = []
        cls_targets = []
        for i in range(num_imgs):
            im = imgs[i]
            imh, imw = im.size(1), im.size(2)
            # Images are placed top-left; the rest of the slot stays zero.
            inputs[i, :, :imh, :imw] = im
            loc_target, cls_target = self.encoder.encode(
                boxes[i], labels[i], input_size=(max_w, max_h))
            loc_targets.append(loc_target)
            cls_targets.append(cls_target)

        if not self.val:
            return inputs, torch.stack(loc_targets), torch.stack(cls_targets)
        return inputs, img_sizes, torch.stack(loc_targets), torch.stack(
            cls_targets)
class ListDataset(data.Dataset):
    '''Detection dataset whose images are resized by their shorter side.

    Every non-blank line of the list file is whitespace separated. The
    first three fields are the image name and two values that are not used
    here; the rest are groups of
    ``<xmin> <ymin> <xmax> <ymax> <class>``.
    '''

    def __init__(self, root, list_file, train, transform, input_size,
                 max_size):
        '''
        Args:
          root: (str) directory to images.
          list_file: (str) path to index file.
          train: (boolean) train or test.
          transform: ([transforms]) image transforms.
          input_size: (int) image shorter side size.
          max_size: (int) maximum image longer side size.
        '''
        self.root = root
        self.train = train
        self.transform = transform
        self.input_size = input_size
        self.max_size = max_size

        self.data_encoder = DataEncoder()

        self.fnames, self.boxes, self.labels = self._load_index(list_file)
        # Count parsed samples rather than raw lines so __len__ always
        # agrees with the per-sample lists.
        self.num_samples = len(self.fnames)

    @staticmethod
    def _load_index(list_file):
        '''Parse the list file into parallel name / box / label lists.

        Blank lines are skipped; they used to raise IndexError.
        '''
        fnames, boxes, labels = [], [], []
        with open(list_file) as f:
            for line in f:
                fields = line.split()
                if not fields:
                    continue
                fnames.append(fields[0])
                # Boxes start at field 3, in groups of five; an incomplete
                # trailing group is ignored.
                num_boxes = (len(fields) - 3) // 5
                box = []
                label = []
                for i in range(num_boxes):
                    xmin, ymin, xmax, ymax, c = fields[3 + 5 * i:8 + 5 * i]
                    box.append(
                        [float(xmin), float(ymin), float(xmax), float(ymax)])
                    label.append(int(c))
                boxes.append(torch.Tensor(box))
                labels.append(torch.LongTensor(label))
        return fnames, boxes, labels

    def __getitem__(self, idx):
        '''Load one image together with its (un-encoded) annotations.

        Args:
          idx: (int) image index.

        Returns:
          img: transformed image (whatever ``self.transform`` returns).
          boxes: (tensor) boxes scaled to the resized image.
          labels: (tensor) class labels as stored for this image.
        '''
        # Load image and bbox locations.
        fname = self.fnames[idx]
        img = Image.open(os.path.join(self.root, fname))
        # Work on a copy: random_flip and the scaling below modify the
        # tensor in place, which previously rewrote the stored annotations
        # on every access (boxes drifted further each epoch).
        boxes = self.boxes[idx].clone()
        labels = self.labels[idx]

        # Data augmentation while training.
        if self.train:
            img, boxes = self.random_flip(img, boxes)

        img, im_scale = self.resize(img)
        boxes *= im_scale
        img = self.transform(img)
        return img, boxes, labels

    def resize(self, img):
        '''Resize the image shorter side to input_size.

        The scale is reduced when needed so that the longer side does not
        exceed ``max_size``.

        Args:
          img: (PIL.Image) image.

        Returns:
          (PIL.Image) resized image.
          (float) image scale.

        Reference:
          https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/utils/blob.py
        '''
        im_size_min = min(img.size)
        im_size_max = max(img.size)
        im_scale = float(self.input_size) / float(im_size_min)
        if round(im_scale * im_size_max) > self.max_size:
            # limit the longer side to MAX_SIZE
            im_scale = float(self.max_size) / float(im_size_max)
        w = int(img.width * im_scale)
        h = int(img.height * im_scale)
        return img.resize((w, h)), im_scale

    def random_flip(self, img, boxes):
        '''Randomly flip the image and adjust the bbox locations.

        For bbox (xmin, ymin, xmax, ymax), the flipped bbox is:
        (w-xmax, ymin, w-xmin, ymax). ``boxes`` is modified in place.

        Args:
          img: (PIL.Image) image.
          boxes: (tensor) bbox locations, sized [#obj, 4].

        Returns:
          img: (PIL.Image) randomly flipped image.
          boxes: (tensor) randomly flipped bbox locations, sized [#obj, 4].
        '''
        if random.random() < 0.5:
            img = img.transpose(Image.FLIP_LEFT_RIGHT)
            w = img.width
            xmin = w - boxes[:, 2]
            xmax = w - boxes[:, 0]
            boxes[:, 0] = xmin
            boxes[:, 2] = xmax
        return img, boxes

    def collate_fn(self, batch):
        '''Pad images and encode targets.

        As for images are of different sizes, we need to pad them to the
        same size.

        Args:
          batch: (list) of (img, boxes, labels) tuples from __getitem__.

        Returns:
          padded images, stacked loc_targets, stacked cls_targets.

        Reference:
          https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/utils/blob.py
        '''
        imgs = [x[0] for x in batch]
        boxes = [x[1] for x in batch]
        labels = [x[2] for x in batch]

        # Plain ints keep the sizes usable both as tensor dimensions and as
        # the encoder's input_size.
        max_h = max(im.size(1) for im in imgs)
        max_w = max(im.size(2) for im in imgs)
        num_imgs = len(imgs)
        inputs = torch.zeros(num_imgs, 3, max_h, max_w)

        loc_targets = []
        cls_targets = []
        for i in range(num_imgs):
            im = imgs[i]
            imh, imw = im.size(1), im.size(2)
            # Images are placed top-left; the rest of the slot stays zero.
            inputs[i, :, :imh, :imw] = im

            # Encode data.
            loc_target, cls_target = self.data_encoder.encode(
                boxes[i], labels[i], input_size=(max_h, max_w))
            loc_targets.append(loc_target)
            cls_targets.append(cls_target)
        return inputs, torch.stack(loc_targets), torch.stack(cls_targets)

    def __len__(self):
        return self.num_samples
class ListDataset(data.Dataset):
    '''Grayscale detection dataset that returns already-encoded targets.

    Every non-blank line of the list file is whitespace separated:
    ``<image name> <num objects> [<xmin> <ymin> <xmax> <ymax> <class>] ...``
    '''
    img_size = InputImgSize

    def __init__(self, root, list_file, train, transform):
        '''
        Args:
          root: (str) directory to images.
          list_file: (str) path to index file.
          train: (boolean) train or test.
          transform: ([transforms]) image transforms.
        '''
        self.root = root
        self.train = train
        self.transform = transform

        self.fnames = []
        self.boxes = []
        self.labels = []

        self.data_encoder = DataEncoder()

        with open(list_file) as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # A blank line (typically a trailing newline) holds no
                    # sample; indexing it would raise IndexError.
                    continue

                self.fnames.append(fields[0])
                num_objs = int(fields[1])
                box = []
                label = []
                for i in range(num_objs):
                    base = 2 + 5 * i
                    box.append([float(fields[base]),
                                float(fields[base + 1]),
                                float(fields[base + 2]),
                                float(fields[base + 3])])
                    label.append(int(fields[base + 4]))
                self.boxes.append(torch.Tensor(box))
                self.labels.append(torch.LongTensor(label))

        # Count parsed samples rather than raw lines so __len__ always
        # agrees with the per-sample lists.
        self.num_samples = len(self.fnames)

    def __getitem__(self, idx):
        '''Load a image, and encode its bbox locations and class labels.

        Args:
          idx: (int) image index.

        Returns:
          img: transformed image (whatever ``self.transform`` returns).
          loc_target: location targets from ``data_encoder.encode``.
          conf_target: label targets from ``data_encoder.encode``.
        '''
        # Load image and bbox locations; the image is read as single-channel
        # grayscale ('L').
        fname = self.fnames[idx]
        img = Image.open(os.path.join(self.root, fname)).convert('L')
        boxes = self.boxes[idx].clone()
        labels = self.labels[idx]

        # NOTE: training-time augmentation (random flip / random_crop) is
        # currently disabled, so self.train has no effect here.

        # Scale bbox locations to [0,1].
        w, h = img.size
        boxes /= torch.Tensor([w, h, w, h]).expand_as(boxes)

        img = img.resize((self.img_size, self.img_size))
        img = self.transform(img)

        # Encode loc & conf targets.
        loc_target, conf_target = self.data_encoder.encode(boxes, labels)
        return img, loc_target, conf_target

    def random_crop(self, img, boxes, labels):
        '''Randomly crop the image and adjust the bbox locations.

        A minimum IoU is drawn at random (``None`` returns the input
        unchanged). Up to 100 candidate crops are then tried; a crop is
        accepted when at least one box centre lies inside it and every such
        box reaches the minimum IoU with the crop. If all attempts fail a
        new minimum IoU is drawn.

        For more details, see 'Chapter2.2: Data augmentation' of the paper.

        Args:
          img: (PIL.Image) image.
          boxes: (tensor) bbox locations, sized [#obj, 4].
          labels: (tensor) bbox labels, sized [#obj,].

        Returns:
          img: (PIL.Image) cropped image.
          selected_boxes: (tensor) selected bbox locations.
          labels: (tensor) selected bbox labels.
        '''
        imw, imh = img.size
        while True:
            min_iou = random.choice([None, 0.1, 0.3, 0.5, 0.7, 0.9])
            if min_iou is None:
                return img, boxes, labels

            for _ in range(100):
                w = random.randrange(int(0.1*imw), imw)
                h = random.randrange(int(0.1*imh), imh)

                # Reject crops with an aspect ratio outside [1/2, 2].
                if h > 2*w or w > 2*h:
                    continue

                x = random.randrange(imw - w)
                y = random.randrange(imh - h)
                roi = torch.Tensor([[x, y, x+w, y+h]])

                center = (boxes[:,:2] + boxes[:,2:]) / 2  # [N,2]
                roi2 = roi.expand(len(center), 4)  # [N,4]
                mask = (center > roi2[:,:2]) & (center < roi2[:,2:])  # [N,2]
                mask = mask[:,0] & mask[:,1]  # [N,]
                if not mask.any():
                    continue

                selected_boxes = boxes.index_select(0, mask.nonzero().squeeze(1))

                iou = self.data_encoder.iou(selected_boxes, roi)
                if iou.min() < min_iou:
                    continue

                img = img.crop((x, y, x+w, y+h))
                # Shift into crop coordinates and clip to the crop.
                selected_boxes[:,0].add_(-x).clamp_(min=0, max=w)
                selected_boxes[:,1].add_(-y).clamp_(min=0, max=h)
                selected_boxes[:,2].add_(-x).clamp_(min=0, max=w)
                selected_boxes[:,3].add_(-y).clamp_(min=0, max=h)
                return img, selected_boxes, labels[mask]

    def __len__(self):
        return self.num_samples
class ListDataset(data.Dataset):
    '''Detection dataset whose images are stored as three separate planes.

    Every non-blank line of the list file is whitespace separated:
    ``<image name> [<xmin> <ymin> <xmax> <ymax> <class>] ...``
    Each sample is assembled from ``<name>_a.jpg``, ``<name>_b.jpg`` and
    ``<name>_c.jpg`` found under ``<root>/<first two chars of name>/``.
    '''

    def __init__(self, root, list_file, train, transform, input_size):
        '''
        Args:
          root: (str) directory to images. ".data"
          list_file: (str) path to index file.
            '.data/find_star_split/find_star_train_bbx_gt.txt'
          train: (boolean) train or test.
          transform: ([transforms]) image transforms.
          input_size: (int) model input size. (800 * 800)
        '''
        self.root = root
        self.train = train
        self.transform = transform
        self.input_size = input_size

        self.encoder = DataEncoder()

        # self.fnames stores the image names.
        self.fnames, self.boxes, self.labels = self._load_index(list_file)
        # Count parsed samples rather than raw lines so __len__ always
        # agrees with the per-sample lists.
        self.num_samples = len(self.fnames)

    @staticmethod
    def _load_index(list_file):
        '''Parse the list file into parallel name / box / label lists.

        Blank lines are skipped; they used to raise IndexError.
        '''
        fnames, boxes, labels = [], [], []
        with open(list_file) as f:
            for line in f:
                fields = line.split()
                if not fields:
                    continue
                fnames.append(fields[0])
                # Annotations come in groups of five after the name; an
                # incomplete trailing group is ignored.
                num_boxes = (len(fields) - 1) // 5
                box = []
                label = []
                for i in range(num_boxes):
                    xmin, ymin, xmax, ymax, c = fields[1 + 5 * i:6 + 5 * i]
                    box.append(
                        [float(xmin), float(ymin), float(xmax), float(ymax)])
                    label.append(int(c))
                boxes.append(torch.Tensor(box))
                labels.append(torch.LongTensor(label))
        return fnames, boxes, labels

    def __getitem__(self, idx):
        '''Load one image together with its (un-encoded) annotations.

        Args:
          idx: (int) image index.

        Returns:
          img: transformed image (whatever ``self.transform`` returns).
          boxes: (tensor) boxes after augmentation / resizing.
          labels: (tensor) class labels as stored for this image.
        '''
        # Load image and boxes. The path is the same for train and test.
        fname = self.fnames[idx]
        prefix_name = fname[:2]
        image_path = self.root + '/' + prefix_name + '/' + fname

        # The three files become the R, G and B bands of one image.
        img_a = Image.open(image_path + '_a.jpg')
        img_b = Image.open(image_path + '_b.jpg')
        img_c = Image.open(image_path + '_c.jpg')
        img = Image.merge('RGB', (img_a, img_b, img_c))

        # Clone so the helpers below never touch the stored annotations.
        boxes = self.boxes[idx].clone()
        labels = self.labels[idx]
        size = self.input_size

        # Data augmentation.
        if self.train:
            img, boxes = random_flip(img, boxes)
            img, boxes = random_crop(img, boxes)
            img, boxes = resize(img, boxes, (size, size))
        else:
            img, boxes = resize(img, boxes, size)
            img, boxes = center_crop(img, boxes, (size, size))

        img = self.transform(img)
        return img, boxes, labels

    def collate_fn(self, batch):
        '''Stack images and encode targets for one batch.

        Each image is copied into an ``input_size`` x ``input_size`` slot;
        no padding is performed here.

        Args:
          batch: (list) of (img, boxes, labels) tuples from __getitem__.

        Returns:
          stacked images, stacked loc_targets, stacked cls_targets.
        '''
        imgs = [x[0] for x in batch]
        boxes = [x[1] for x in batch]
        labels = [x[2] for x in batch]

        h = w = self.input_size
        num_imgs = len(imgs)
        inputs = torch.zeros(num_imgs, 3, h, w)

        loc_targets = []
        cls_targets = []
        for i in range(num_imgs):
            inputs[i] = imgs[i]
            loc_target, cls_target = self.encoder.encode(
                boxes[i], labels[i], input_size=(w, h))
            loc_targets.append(loc_target)
            cls_targets.append(cls_target)
        return inputs, torch.stack(loc_targets), torch.stack(cls_targets)

    def __len__(self):
        return self.num_samples
# NOTE(review): this block was recovered from whitespace-collapsed text; the
# nesting below is a reconstruction and should be confirmed against the
# original file. The code uses Python 2 print statements.
class ListDataset(data.Dataset):
    '''Face-detection dataset that yields fixed-size square training crops.

    Each line of the list file is whitespace separated:
    ``<image name> <num faces> [<x> <y> <w> <h> <class>] ...``;
    boxes are stored as (x, y, x + w, y + h).
    '''

    def __init__(self, list_file, root=None, train=True, transform=None,
                 image_size=96, small_threshold=5, big_threshold=60,
                 setmin=6, setmax=50, fm_size=None, ac_size=None,
                 ac_density=None, stride=4, offset=12):
        '''
        Args:
          list_file: path to the index file.
          root: string prepended to every image name (plain concatenation
            in __getitem__ / random_getim).
          train: enables the crop / colour / flip augmentation.
          transform: iterable of callables applied to the image in order.
          image_size: side length asserted for the final square image.
          small_threshold, big_threshold: exclusive bounds on the width and
            height of boxes that are kept after cropping.
          setmin, setmax: box-size bounds used to pick the target face and
            the random rescale factor.
          fm_size, ac_size, ac_density, stride, offset: forwarded unchanged
            to DataEncoder.
        '''
        print('data init')
        self.image_size = image_size
        self.root = root
        self.train = train
        self.transform = transform
        self.fnames = []
        self.boxes = []
        self.labels = []
        self.small_threshold = float(
            small_threshold)  #img_48:8,45,10,40 img_36:8,36,10,35
        self.big_threshold = float(big_threshold)
        self.data_encoder = DataEncoder(img_size=image_size,
                                        fm_size=fm_size,
                                        ac_size=ac_size,
                                        ac_density=ac_density,
                                        stride=stride,
                                        offset=offset)
        self.setmin = setmin
        self.setmax = setmax

        with open(list_file) as f:
            lines = f.readlines()

        for line in lines:
            splited = line.strip().split()
            self.fnames.append(splited[0])
            num_faces = int(splited[1])
            box = []
            label = []
            for i in range(num_faces):
                x = float(splited[2 + 5 * i])
                y = float(splited[3 + 5 * i])
                w = float(splited[4 + 5 * i])
                h = float(splited[5 + 5 * i])
                c = int(splited[6 + 5 * i])
                # Stored as corner coordinates, not (x, y, w, h).
                box.append([x, y, x + w, y + h])
                label.append(c)
            self.boxes.append(torch.Tensor(box))
            self.labels.append(torch.LongTensor(label))
        self.num_samples = len(self.boxes)

    def __getitem__(self, idx):
        '''Return (img, loc_target, conf_target) for one sample.

        If the requested image cannot be read, or has no usable box, a
        random other index is drawn until one works, so the returned sample
        may not correspond to ``idx``.
        '''
        while True:
            fname = self.fnames[idx]
            # NOTE(review): root and fname are concatenated, not joined, so
            # root presumably has to end with a path separator -- confirm.
            img = cv2.imread(os.path.join(self.root + fname))
            if img is None:
                idx = random.randrange(0, self.num_samples)
                continue
            imh, imw, _ = img.shape
            boxes = self.boxes[idx].clone()
            labels = self.labels[idx].clone()
            boxwh = boxes[:, 2:] - boxes[:, :2]
            center = (boxes[:, :2] + boxes[:, 2:]) / 2.
            # boxar = boxwh[:,0] * boxwh[:,1]
            ratio = boxwh.max(1)[0] / boxwh.min(1)[0]
            # Usable boxes: both sides >= setmin, aspect ratio below
            # setmax / setmin, and centre strictly inside the image.
            mask = (boxwh[:, 0] >= self.setmin) & (boxwh[:, 1] >= self.setmin) & (
                ratio < float(self.setmax) / self.setmin) & (
                    center[:, 0] > 0) & (center[:, 0] < imw - 1) & (
                        center[:, 1] > 0) & (center[:, 1] < imh - 1)
            if mask.any():
                break
            else:
                idx = random.randrange(0, self.num_samples)

        if self.train:
            # Pick a random usable box; the crop is built around it.
            while True:
                bbox_idx = random.randint(0, boxwh.size(0) - 1)
                # area = boxwh[bbox_idx][0]*boxwh[bbox_idx][1]
                # if area >= self.setmin**2:
                if mask[bbox_idx]:
                    break

            # if area > self.setmax**2:
            if max(boxwh[bbox_idx][0], boxwh[bbox_idx][1]) > self.setmax:
                # Target face longer than setmax: rescale the whole image by
                # a random factor drawn between fct_min and fct_max.
                oh, ow, _ = img.shape
                fct_min = self.setmin / min(boxwh[bbox_idx][0],
                                            boxwh[bbox_idx][1])
                fct_max = self.setmax / max(boxwh[bbox_idx][0],
                                            boxwh[bbox_idx][1])
                # tgt_size = random.randint(self.setmin, self.setmax)
                # factor = tgt_size / math.sqrt(area)
                # factor = tgt_size / max(boxwh[bbox_idx][0], boxwh[bbox_idx][1])
                factor = random.uniform(fct_min, fct_max)
                img = cv2.resize(img, (0, 0), fx=factor, fy=factor)
                h, w, _ = img.shape
                # Use the realised size ratio (after rounding by cv2).
                boxes *= torch.Tensor([
                    float(w) / ow,
                    float(h) / oh,
                    float(w) / ow,
                    float(h) / oh
                ]).expand_as(boxes)
                new_center = (boxes[:, :2] + boxes[:, 2:]) / 2
                tmp = (new_center[:, 0] > 0) & (new_center[:, 0] < w) & (
                    new_center[:, 1] > 0) & (new_center[:, 1] < h)
                if not tmp.any():
                    # Debug dump before the assertion below fires.
                    print 'center:', center
                    print imw, imh
                    print 'new_center:', new_center
                    print w, h
                assert tmp.any()
            else:
                h, w, _ = img.shape
                center = (boxes[:, :2] + boxes[:, 2:]) / 2
                tmp = (center[:, 0] > 0) & (center[:, 0] < w - 1) & (
                    center[:, 1] > 0) & (center[:, 1] < h - 1)
                if not tmp.any():
                    print 'center:', center
                    print w, h
                assert tmp.any()

            boxwh = boxes[:, 2:] - boxes[:, :2]
            new_mask = (boxwh[:, 0] > self.small_threshold) & (
                boxwh[:, 1] > self.small_threshold) & (
                    boxwh[:, 0] < self.big_threshold) & (boxwh[:, 1] <
                                                         self.big_threshold)
            if not new_mask.any():
                print boxes
            assert new_mask.any()

            # Bring the image to image_size x image_size: pad when both
            # sides are too small, crop when both are large enough, and pad
            # then crop otherwise.
            if max(h, w) < self.image_size:
                img, boxes, labels = self.supple_filter(img, boxes, labels)
            elif h >= self.image_size and w >= self.image_size:
                img, boxes, labels = self.random_crop(img, boxes, labels,
                                                      bbox_idx)
            else:
                img, boxes, labels = self.supple(img, boxes, labels)
                img, boxes, labels = self.random_crop(img, boxes, labels,
                                                      bbox_idx)

            # Photometric augmentation; contrast is applied either second
            # or last, chosen at random.
            if random.random() < 0.5:
                img = self.random_bright(img)
                img = self.random_contrast(img)
                img = self.random_saturation(img)
                img = self.random_hue(img)
            else:
                img = self.random_bright(img)
                img = self.random_saturation(img)
                img = self.random_hue(img)
                img = self.random_contrast(img)
            img, boxes = self.random_flip(img, boxes)

        boxwh = boxes[:, 2:] - boxes[:, :2]
        # print('boxwh', boxwh)

        h, w, _ = img.shape
        assert (h == w and h == self.image_size)
        # img = cv2.resize(img,(self.image_size,self.image_size))
        boxes_wh = boxes[:, 2:] - boxes[:, :2]
        if ((boxes_wh[:, 0] == 0) | (boxes_wh[:, 1] == 0)).any():
            print boxes
        # save_path = '/home/michael/data/tmp/wider_acn/'
        # cv2.imwrite(save_path+'%d_old.jpg'%idx, img)
        # self.visual(img, boxes, idx)
        # cv2.imwrite(save_path+'%d_new.jpg'%idx, img)
        # print 'idx:', idx
        # print 'boxes:', boxes
        # print 'label:', labels

        # Normalise box coordinates by the image size.
        boxes /= torch.Tensor([w, h, w, h]).expand_as(boxes)
        for t in self.transform:
            img = t(img)

        loc_target, conf_target = self.data_encoder.encode(idx, boxes, labels)
        return img, loc_target, conf_target

    def random_getim(self):
        '''Return (img, boxes, labels) of a randomly chosen sample.'''
        idx = random.randrange(0, self.num_samples)
        fname = self.fnames[idx]
        img = cv2.imread(os.path.join(self.root + fname))
        boxes = self.boxes[idx].clone()
        labels = self.labels[idx]
        return img, boxes, labels

    def __len__(self):
        return self.num_samples

    def random_flip(self, im, boxes):
        '''Mirror the image left-right with probability 0.5.

        ``boxes`` is updated in place when the flip happens.
        '''
        if random.random() < 0.5:
            im_lr = np.fliplr(im).copy()
            h, w, _ = im.shape
            xmin = w - boxes[:, 2]
            xmax = w - boxes[:, 0]
            boxes[:, 0] = xmin
            boxes[:, 2] = xmax
            return im_lr, boxes
        return im, boxes

    def visual(self, im, boxes, idx):
        '''Debug helper: draw the boxes on ``im`` and write it to disk.'''
        save_path = '/home/michael/data/tmp/wider_acn/%d.jpg' % idx
        for j, (box) in enumerate(boxes):
            x1 = int(box[0])
            x2 = int(box[2])
            y1 = int(box[1])
            y2 = int(box[3])
            cv2.rectangle(im, (x1, y1 + 2), (x2, y2), (0, 255, 0), 2)
        cv2.imwrite(save_path, im)

    def supple(self, im, boxes, labels):
        '''Zero-pad the bottom / right so each side is >= image_size.'''
        h, w, _ = im.shape
        im = cv2.copyMakeBorder(im,
                                0,
                                max(0, self.image_size - h),
                                0,
                                max(0, self.image_size - w),
                                cv2.BORDER_CONSTANT,
                                value=0)
        return im, boxes, labels

    def supple_filter(self, im, boxes, labels):
        '''Zero-pad like supple(), then keep only boxes within thresholds.'''
        h, w, _ = im.shape
        im = cv2.copyMakeBorder(im,
                                0,
                                max(0, self.image_size - h),
                                0,
                                max(0, self.image_size - w),
                                cv2.BORDER_CONSTANT,
                                value=0)
        boxwh = boxes[:, 2:] - boxes[:, :2]
        mask = (boxwh[:, 0] > self.small_threshold) & (
            boxwh[:, 1] > self.small_threshold) & (
                boxwh[:, 0] < self.big_threshold) & (boxwh[:, 1] <
                                                     self.big_threshold)
        if not mask.any():
            print boxes
        assert mask.any()
        selected_boxes = boxes.index_select(0, mask.nonzero().squeeze(1))
        selected_labels = labels.index_select(0, mask.nonzero().squeeze(1))
        return im, selected_boxes, selected_labels

    def random_crop(self, im, boxes, labels, bbox_idx):
        '''Cut an image_size square chosen relative to box ``bbox_idx``.

        Boxes whose centre falls in the crop are shifted into crop
        coordinates (not clamped) and then filtered by the small / big
        thresholds.

        Returns:
          (cropped image, kept boxes, kept labels).
        '''
        imh, imw, _ = im.shape
        w = self.image_size
        h = w
        tgt_box = boxes[bbox_idx]
        #print 'tgt:', tgt_box
        # Choose the crop origin per axis from the target box position.
        if tgt_box[0] <= 0 or imw == w:
            x = 0
        elif tgt_box[2] >= imw:
            x = imw - 1 - w
        else:
            x_min = int(max(0, tgt_box[2] - w))
            x_max = int(min(tgt_box[0], imw - w))
            x = random.randint(x_min, x_max)
        if tgt_box[1] <= 0 or imh == h:
            y = 0
        elif tgt_box[3] >= imh:
            y = imh - 1 - h
        else:
            y_min = int(max(0, tgt_box[3] - h))
            y_max = int(min(tgt_box[1], imh - h))
            y = random.randint(y_min, y_max)
        #print 'xy:', x, y
        roi = torch.Tensor([[x, y, x + w, y + h]])
        center = (boxes[:, :2] + boxes[:, 2:]) / 2
        roi2 = roi.expand(len(center), 4)
        mask = (center > roi2[:, :2]) & (center < roi2[:, 2:] + 1)
        mask = mask[:, 0] & mask[:, 1]
        if not mask.any():
            print 'roi:', roi
            print 'center:', center
            print 'box:', boxes
            print 'img:', imw, imh
        assert mask.any()
        selected_boxes = boxes.index_select(0, mask.nonzero().squeeze(1))
        img = im[y:y + h, x:x + w, :]
        tmph, tmpw, _ = img.shape
        if tmph != tmpw:
            print tgt_box[0], tgt_box[2], x, y, imw, imh, tmph, tmpw
        assert tmph == tmpw
        selected_boxes[:, 0].add_(-x)  #.clamp_(min=0, max=w)
        selected_boxes[:, 1].add_(-y)  #.clamp_(min=0, max=h)
        selected_boxes[:, 2].add_(-x)  #.clamp_(min=0, max=w)
        selected_boxes[:, 3].add_(-y)  #.clamp_(min=0, max=h)
        #print selected_boxes
        boxwh = selected_boxes[:, 2:] - selected_boxes[:, :2]
        mask = (boxwh[:, 0] > self.small_threshold) & (
            boxwh[:, 1] > self.small_threshold) & (
                boxwh[:, 0] < self.big_threshold) & (boxwh[:, 1] <
                                                     self.big_threshold)
        if not mask.any():
            print selected_boxes
            print 'boxes:', boxes
            print 'roi:', roi
            print 'center:', center
            print 'idx:', bbox_idx
            print 'img:', imw, imh
            cv2.imwrite('wrong.jpg', img)
        assert mask.any()
        selected_boxes_selected = selected_boxes.index_select(
            0, mask.nonzero().squeeze(1))
        # NOTE(review): this second mask is computed over selected_boxes but
        # is applied to the unfiltered `labels`; the two only line up if
        # every box passed the centre test above -- confirm intent.
        selected_labels = labels.index_select(0, mask.nonzero().squeeze(1))
        return img, selected_boxes_selected, selected_labels

    def random_bright(self, im, delta=32):
        '''With probability 0.5 add a random offset in [-delta, delta).'''
        if random.random() > 0.5:
            im = im + random.randrange(-delta, delta)
            im = im.clip(min=0, max=255).astype(np.uint8)
        return im

    def random_contrast(self, im):
        '''With probability 0.5 multiply pixels by a factor in [0.5, 1.5].'''
        if random.random() > 0.5:
            alpha = random.uniform(0.5, 1.5)
            im = im * alpha
            im = im.clip(min=0, max=255).astype(np.uint8)
        return im

    def random_saturation(self, im):
        '''With probability 0.5 scale the HSV saturation channel.'''
        if random.random() > 0.5:
            alpha = random.uniform(0.5, 1.5)
            hsv_im = cv2.cvtColor(im, cv2.COLOR_BGR2HSV)
            hsv_im = hsv_im * [1.0, alpha, 1.0]
            hsv_im = hsv_im.clip(min=0, max=255).astype(np.uint8)
            im = cv2.cvtColor(hsv_im, cv2.COLOR_HSV2BGR)
        return im

    def random_hue(self, im, delta=18):
        '''With probability 0.5 shift the HSV hue channel.'''
        if random.random() > 0.5:
            alpha = random.randrange(-delta, delta)
            hsv_im = cv2.cvtColor(im, cv2.COLOR_BGR2HSV)
            hsv_im = hsv_im + [alpha, 0, 0]
            # The clip to [0, 179] is applied to all three channels.
            hsv_im = hsv_im.clip(min=0, max=179).astype(np.uint8)
            im = cv2.cvtColor(hsv_im, cv2.COLOR_HSV2BGR)
        return im

    def testGet(self, idx):
        '''Debug helper: load a sample, dump images to disk, return raw boxes.

        Writes 'test_encoder_source.jpg' and a file named after the sample
        into the current working directory.
        '''
        fname = self.fnames[idx]
        img = cv2.imread(os.path.join(self.root, fname))
        cv2.imwrite('test_encoder_source.jpg', img)
        boxes = self.boxes[idx].clone()
        # print(boxes)
        labels = self.labels[idx].clone()

        for box in boxes:
            cv2.rectangle(img, (int(box[0]), int(box[1])),
                          (int(box[2]), int(box[3])), (0, 0, 255))
        cv2.imwrite(fname, img)

        if self.train:
            # NOTE(review): random_crop() is declared with a required
            # bbox_idx argument that is not passed here -- confirm; as
            # written this call looks like it raises TypeError.
            img, boxes, labels = self.random_crop(img, boxes, labels)
            img = self.random_bright(img)
            img, boxes = self.random_flip(img, boxes)

        h, w, _ = img.shape
        boxes /= torch.Tensor([w, h, w, h]).expand_as(boxes)

        img = cv2.resize(img, (self.image_size, self.image_size))
        for t in self.transform:
            img = t(img)

        print(idx, fname, boxes)

        return img, boxes, labels
# Training set-up: wrap the model for multi-GPU use, build the optimizer,
# the encoder and the TensorBoard writer, then start iterating batches.
print("gpu available : ", torch.cuda.is_available())
print("num_gpus : ", torch.cuda.device_count())

# Set data parallel training
# NOTE(review): device_ids is hard-coded to four GPUs (0-3).
net = torch.nn.DataParallel(net, device_ids=[0,1,2,3])
net.cuda()

# Training
print("==>training start...")
net.train()

# Freeze BN layer for pre-trained backbone
# (called after net.train(), on the wrapped module via `.module`)
net.module.freeze_bn()

# Set optimizer -- SGD or Adam
optimizer = optim.SGD(net.parameters(), lr=cur_lr, momentum=0.9, weight_decay=1e-4) #optim.Adam(net.parameters(), lr=cur_lr)

# Encode anchor to each feature maps
encoder = DataEncoder(cls_thresh=0.5, nms_thresh=0.2)

# Tensorboard visualize recorder
writer = SummaryWriter(logdir=args.logdir)

lossest = 1
save_lossest = False

t0 = time.time()

# The epoch bound of 10000 is only a cap; the loop stops once `iteration`
# exceeds args.max_iter (checked once per epoch).
for epoch in range(start_epoch, 10000):
    if iteration > args.max_iter:
        break

    for inputs, loc_targets, cls_targets in trainloader:
        # prepare data and cls & loc label
        inputs = Variable(inputs.cuda())
        loc_targets = Variable(loc_targets.cuda())
        cls_targets = Variable(cls_targets.cuda())
class ListDataset(data.Dataset):
    '''Scene-text detection dataset backed by SynthText / ICDAR / MLT files.

    Every sample is an image path plus an ``[N, 8]`` float32 array of
    quadrilaterals ``(x0, y0, x1, y1, x2, y2, x3, y3)`` and an ``[N]`` array
    of class ids (always 1).  Several datasets can be combined by naming
    more than one of them in ``dataset``.
    '''

    def __init__(self, root, dataset, train, transform, input_size,
                 multi_scale=False):
        '''
        Args:
          root: (str) DB root directory.
          dataset: (str) Dataset name(dir).
          train: (boolean) train or test.
          transform: ([transforms]) image transforms.
          input_size: (int) model input size.
          multi_scale: (bool) use multi-scale training or not.
        '''
        self.root = root
        self.train = train
        self.transform = transform
        self.input_size = input_size

        self.fnames = []
        self.boxes = []
        self.labels = []
        # Kept as a public attribute; always mirrors len(self.fnames).
        self.num_samples = 0

        self.multi_scale = multi_scale
        self.MULTI_SCALES = [
            608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960
        ]  # step1, 2
        # self.MULTI_SCALES = [960, 992, 1024, 1056, 1088, 1120, 1152, 1184,
        #                      1216, 1248, 1280]  # step3
        self.encoder = DataEncoder()

        if "SynthText" in dataset:
            self.get_SynthText()
        if "ICDAR2015" in dataset:
            self.get_ICDAR2015()
        if "MLT" in dataset:
            self.get_MLT()
        if "ICDAR2013" in dataset:
            self.get_ICDAR2013()

    def __getitem__(self, idx):
        '''Load image.

        Args:
          idx: (int) dataset index.

        Returns:
          dict with keys "image" (RGB array), "boxes" (copy of the stored
          quads) and "labels" (stored label array).

        Raises:
          IOError: if the image file cannot be read.
        '''
        fname = self.fnames[idx]
        # The loaders already store paths that include ``root``; joining
        # ``root`` a second time only worked for absolute roots.
        path = fname if os.path.exists(fname) else os.path.join(
            self.root, fname)
        img = cv2.imread(path)
        if img is None:
            raise IOError("cannot read image: %s" % path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        boxes = self.boxes[idx].copy()
        labels = self.labels[idx]
        return {"image": img, "boxes": boxes, "labels": labels}

    def collate_fn(self, batch):
        '''bbox encode and make batch

        Args:
          batch: (dict list) images, boxes and labels

        Returns:
          batch_images, batch_loc, batch_cls
        '''
        size = self.input_size
        if self.multi_scale:
            # One random input size per batch for multi-scale training.
            size = random.choice(self.MULTI_SCALES)

        inputs = torch.zeros(len(batch), 3, size, size)
        loc_targets = []
        cls_targets = []
        for n, sample in enumerate(batch):
            img, boxes, labels = self.transform(size=size)(
                sample['image'], sample['boxes'], sample['labels'])
            inputs[n] = img
            loc_target, cls_target = self.encoder.encode(
                boxes, labels, input_size=(size, size))
            loc_targets.append(loc_target)
            cls_targets.append(cls_target)
        return inputs, torch.stack(loc_targets), torch.stack(cls_targets)

    def __len__(self):
        # Counting the stored samples stays correct when several datasets
        # are combined and when no dataset name matched at all.
        return len(self.fnames)

    def get_SynthText(self):
        '''Append every SynthText image listed in ``gt.mat``.'''
        import scipy.io as sio

        data_dir = os.path.join(self.root, 'SynthText/train/')
        gt = sio.loadmat(data_dir + 'gt.mat')
        dataset_size = gt['imnames'].shape[1]
        img_files = gt['imnames'][0]
        word_boxes = gt['wordBB'][0]
        print("Training on SynthText : ", dataset_size)

        for i in range(dataset_size):
            label = word_boxes[i]
            if label.ndim != 3:
                # A single word is stored as (2, 4); add the word axis.
                label = label[:, :, np.newaxis]
            # (xy, corner, word) -> (word, corner, xy) -> [N, 8] rows of
            # x0, y0, x1, y1, x2, y2, x3, y3.
            quads = label.transpose(2, 1, 0).reshape(-1, 8)

            self.fnames.append(data_dir + str(img_files[i][0]))
            self.boxes.append(np.array(quads, dtype=np.float32))
            self.labels.append(np.array([1] * len(quads)))
        self.num_samples = len(self.fnames)

    def get_ICDAR2015(self):
        '''Append the ICDAR2015 split selected by ``self.train``.'''
        self._load_gt_dir(
            'ICDAR2015_Incidental', 'ICDAR2015',
            lambda line: self._parse_quad_line(line, 9))

    def get_MLT(self):
        '''Append the MLT split selected by ``self.train``.'''
        self._load_gt_dir(
            'MLT', 'MLT', lambda line: self._parse_quad_line(line, 10))

    def get_ICDAR2013(self):
        '''Append the ICDAR2013 split selected by ``self.train``.'''
        self._load_gt_dir(
            'ICDAR2013_FOCUSED', 'ICDAR2013', self._parse_rect_line)

    def _load_gt_dir(self, dir_name, title, parse_line):
        '''Load ``<root>/<dir_name>/<mode>/{name.jpg, gt_name.txt}`` pairs.

        ``parse_line`` maps one ground-truth line to a list of 8 ints, or
        ``None`` when the line must be ignored.  Images left without any
        usable box are skipped.
        '''
        mode = 'train' if self.train else 'test'
        # List the split that is actually loaded (the file names below are
        # built from ``mode``), in sorted order so indices are reproducible.
        split_dir = os.path.join(self.root, dir_name, mode)
        names = sorted(
            f[:-4] for f in os.listdir(split_dir) if f.endswith('.jpg'))
        print(mode, "ing on " + title + " : ", len(names))

        for name in names:
            gt_path = os.path.join(split_dir, 'gt_%s.txt' % name)
            with open(gt_path) as label_file:
                quads = [q for q in map(parse_line, label_file)
                         if q is not None]
            if not quads:
                continue
            self.fnames.append(os.path.join(split_dir, name + '.jpg'))
            self.boxes.append(np.array(quads, dtype=np.float32))
            self.labels.append(np.array([1] * len(quads)))
        self.num_samples = len(self.fnames)

    @staticmethod
    def _parse_quad_line(line, n_fields):
        '''Parse ``x0,y0,...,x3,y3[,lang],text``; None for blank/"###" lines.'''
        if not line.strip():
            return None
        fields = line.split(",")[:n_fields]
        if len(fields) < n_fields:
            raise ValueError("malformed ground-truth line: %r" % line)
        if "###" in fields[-1]:  # "do not care" region
            return None
        coords = fields[:8]
        try:
            first = int(coords[0])
        except ValueError:
            # The first value of a file may carry a leading BOM character.
            first = int(coords[0][1:])
        return [first] + [int(p) for p in coords[1:]]

    @staticmethod
    def _parse_rect_line(line):
        '''Parse ``xmin ymin xmax ymax ...`` into a clockwise quad.'''
        if not line.strip():
            return None
        xmin, ymin, xmax, ymax = [int(p) for p in line.split(" ")[:4]]
        return [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax]
def prediction(bin_of_images, checkpoint_dir, minimum_idx, result_dir):
    '''Run a saved network over images and write boxes, overlays and masks.

    For every image three files are written into ``result_dir``: a
    tab-separated ``.result`` text file with one ``x0 y0 x1 y1 label`` row
    per decoded box, the input image with the boxes drawn in red, and an
    8-bit ``.png`` built from channel 1 of the soft-maxed mask output.

    Args:
      bin_of_images: (iterable of str) image paths using "/" separators.
      checkpoint_dir: (str) directory holding ``ckpt-<minimum_idx>.pth``.
      minimum_idx: index used to build the checkpoint file name.
      result_dir: (str) existing directory that receives the outputs.
    '''
    print('Loading model..')
    ckpt_path = checkpoint_dir + "/ckpt-" + str(minimum_idx) + ".pth"
    if torch.cuda.is_available():
        load_pth = torch.load(ckpt_path)
    else:
        # Remap CUDA tensors onto the CPU when no GPU is present.
        load_pth = torch.load(ckpt_path,
                              map_location=lambda storage, loc: storage)

    valid_loss = load_pth['loss']
    print("valid loss : " + str(valid_loss))

    num_classes = load_pth['num_classes']
    num_batch = load_pth['batch']
    num_crops = load_pth['crops']
    print("num. batch : " + str(num_batch))
    print("num. crops : " + str(num_crops))

    net = load_sstdnet(num_classes=num_classes, using_pretrained=False)
    net.load_state_dict(load_pth['net'])
    net.eval()

    transform = transforms.Compose([transforms.ToTensor()])

    for img_file in bin_of_images:
        img = Image.open(img_file)
        w = img.width
        h = img.height

        print('Predicting : ' + img_file)
        x = transform(img)
        x = x.unsqueeze(0)
        # NOTE(review): ``volatile`` is ignored by recent PyTorch releases;
        # kept because the PyTorch version this file targets is not visible.
        x = Variable(x, volatile=True)
        loc_preds, cls_preds, mask_pred = net(x)

        # print('Decoding..')
        encoder = DataEncoder()
        boxes, labels = encoder.decode(loc_preds.data.squeeze(),
                                       cls_preds.data.squeeze(), (w, h))

        draw = ImageDraw.Draw(img)
        img_file_name = img_file.split("/")[-1]
        txt_file_name = img_file_name.replace(".jpg", ".result")

        # ``with`` closes the result file even if drawing or writing fails.
        with open(result_dir + "/" + txt_file_name, 'w') as result_txt:
            for result_idx in range(len(boxes)):
                box = boxes[result_idx]
                draw.rectangle(list(box), outline='red')
                row = (box[0], box[1], box[2], box[3], labels[result_idx])
                result_txt.write("\t".join(str(v) for v in row) + "\n")

        img.save(result_dir + "/" + img_file_name)

        # Explicit channel axis: same axis the implicit rule picks for a
        # 4-D input, without the deprecation warning.
        mask_pred = F.softmax(mask_pred, dim=1)
        mask_data = mask_pred.data.numpy()
        mask_data = mask_data[:, 1:2, :, :]
        mask_data = np.squeeze(mask_data)

        mask_img = Image.fromarray(np.uint8(mask_data * 255.), 'L')
        mask_img.save(result_dir + "//" +
                      img_file_name.replace(".jpg", ".png"))
class ListDataset(data.Dataset):
    '''Detection dataset read from a ``fname,class,xmin,ymin,xmax,ymax`` CSV.

    All rows sharing a file name are grouped into one sample.  Class names
    are mapped to their index in ``classes``.
    '''

    classes = [
        "articulated_truck", "bicycle", "bus", "car", "motorcycle",
        'motorized_vehicle', "non-motorized_vehicle", "pedestrian",
        "pickup_truck", "single_unit_truck", "work_van"
    ]
    n_class = len(classes)

    def __init__(self, root, list_file, train, transform, input_size,
                 max_size):
        '''
        Args:
          root: (str) directory to images.
          list_file: (str) path to index file.
          train: (boolean) train or test.
          transform: ([transforms]) image transforms.
          input_size: (int) side length images are resized to.
          max_size: (int) maximum image longer side size (stored only).

        Raises:
          ValueError: if a row names a class that is not in ``classes`` or
            does not have exactly six comma-separated fields.
        '''
        self.root = root
        self.train = train
        self.transform = transform
        self.input_size = input_size
        self.max_size = max_size

        self.fnames = []
        self.boxes = []
        self.labels = []

        self.data_encoder = DataEncoder()

        with open(list_file) as f:
            lines = f.readlines()
        # Number of annotation rows (not images); kept for compatibility.
        self.num_samples = len(lines)

        grouped = defaultdict(lambda: {'box': [], 'label': []})
        for line in lines:
            line = line.strip()
            if not line:  # tolerate blank / trailing lines
                continue
            fname, c, xmin, ymin, xmax, ymax = line.split(',')
            # list.index raises ValueError for unknown names by itself.
            lab = self.classes.index(c)
            grouped[fname]['box'].append(
                [float(xmin), float(ymin), float(xmax), float(ymax)])
            grouped[fname]['label'].append(lab)

        for name, vals in grouped.items():
            self.fnames.append(name + '.jpg')
            self.boxes.append(torch.Tensor(vals['box']))
            self.labels.append(torch.LongTensor(vals['label']))

    def __getitem__(self, idx):
        '''Load image.

        Args:
          idx: (int) image index.

        Returns:
          img: (tensor) transformed image.
          boxes: (tensor) resized object boxes, sized [#obj, 4].
          labels: (tensor) class indices, sized [#obj].
        '''
        fname = self.fnames[idx]
        img = Image.open(os.path.join(self.root, fname))
        if img.mode != 'RGB':
            # collate_fn allocates 3-channel batches.
            img = img.convert('RGB')
        boxes = self.boxes[idx]
        labels = self.labels[idx]

        # Data augmentation while training.
        if self.train:
            img, boxes = self.random_flip(img, boxes)
            img, boxes = self.scale_jitter(img, boxes)

        img, boxes = self.resize(img, boxes)
        img = self.transform(img)
        return img, boxes, labels

    def resize(self, img, boxes):
        '''Resize the image to input_size x input_size.

        Args:
          img: (PIL.Image) image.
          boxes: (tensor) object boxes, sized [#obj, 4].

        Returns:
          (PIL.Image) resized image.
          (tensor) resized object boxes (a new tensor).
        '''
        w = h = self.input_size
        ws = 1.0 * w / img.width
        hs = 1.0 * h / img.height
        scale = torch.Tensor([ws, hs, ws, hs])
        return img.resize((w, h)), scale * boxes

    def random_flip(self, img, boxes):
        '''Randomly flip the image and adjust the boxes.

        For box (xmin, ymin, xmax, ymax), the flipped box is:
        (w-xmax, ymin, w-xmin, ymax).

        Args:
          img: (PIL.Image) image.
          boxes: (tensor) object boxes, sized [#obj, 4].

        Returns:
          img: (PIL.Image) randomly flipped image.
          boxes: (tensor) randomly flipped boxes, sized [#obj, 4].
            The input tensor is never modified.
        '''
        if random.random() < 0.5:
            img = img.transpose(Image.FLIP_LEFT_RIGHT)
            w = img.width
            # Work on a copy: the caller passes the tensor stored in
            # self.boxes, which must survive across epochs.
            boxes = boxes.clone()
            xmin = w - boxes[:, 2]
            xmax = w - boxes[:, 0]
            boxes[:, 0] = xmin
            boxes[:, 2] = xmax
        return img, boxes

    def scale_jitter(self, img, boxes):
        '''Scale image width and height independently by [3/4, 4/3].

        Args:
          img: (PIL.Image) image.
          boxes: (tensor) object boxes, sized [#obj, 4].

        Returns:
          img: (PIL.Image) scaled image.
          boxes: (tensor) scaled object boxes, sized [#obj, 4].
            The input tensor is never modified.
        '''
        imw, imh = img.size
        sw = random.uniform(3 / 4., 4 / 3.)
        sh = random.uniform(3 / 4., 4 / 3.)
        w = int(imw * sw)
        h = int(imh * sh)
        img = img.resize((w, h))
        boxes = boxes.clone()  # keep the stored annotations intact
        boxes[:, ::2] *= sw
        boxes[:, 1::2] *= sh
        return img, boxes

    def collate_fn(self, batch):
        '''Pad images and encode targets.

        As for images are of different sizes, we need to pad them to the
        same size.

        Args:
          batch: (list) of (image, boxes, labels) samples.

        Returns:
          padded images, stacked loc_targets, stacked cls_targets.

        Reference:
          https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/utils/blob.py
        '''
        imgs = [x[0] for x in batch]
        boxes = [x[1] for x in batch]
        labels = [x[2] for x in batch]

        max_h = max([im.size(1) for im in imgs])
        max_w = max([im.size(2) for im in imgs])
        num_imgs = len(imgs)
        inputs = torch.zeros(num_imgs, 3, max_h, max_w)

        loc_targets = []
        cls_targets = []
        for i in range(num_imgs):
            im = imgs[i]
            imh, imw = im.size(1), im.size(2)
            # Top-left aligned; the remainder stays zero padding.
            inputs[i, :, :imh, :imw] = im

            # Encode data.
            loc_target, cls_target = self.data_encoder.encode(
                boxes[i], labels[i], input_size=(max_w, max_h),
                train=self.train)
            loc_targets.append(loc_target)
            cls_targets.append(cls_target)
        return inputs, torch.stack(loc_targets), torch.stack(cls_targets)

    def __len__(self):
        return len(self.fnames)
def main():
    '''Visualise RetinaNet detections on the first 100 test images.

    Loads ``./<weight_path>/retina_<epoch>.pth``, runs every listed image
    through the network at ``scale`` x ``scale`` resolution and writes a
    copy with ground-truth boxes (blue in BGR) and predicted boxes (green)
    to ``<weight_path>/test_img/val_epoch_<epoch>/``.

    Sets the module-level ``device`` as a side effect.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', '-data', type=str, default='VOC',
                        choices=('VOC', 'COCO'))
    parser.add_argument('--loss_fn', '-loss', type=str, default='sigmoid',
                        choices=('sigmoid', 'softmax'))
    parser.add_argument('--epoch', '-e', type=str, default='None')
    parser.add_argument('--debug', '-d', type=str, default='False')
    parser.add_argument('--weight_path', '-w', type=str, default='None')
    args = parser.parse_args()

    scale = 600
    use_cuda = torch.cuda.is_available()
    save_path = args.weight_path

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])

    voc_names = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
                 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
                 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa',
                 'train', 'tvmonitor']
    softmax_head = args.loss_fn == 'softmax'
    if args.data == "VOC":
        test_root = '/media/NAS/dataset/PASCALVOC/VOCdevkit/07+12/test.txt'
        # The softmax head reserves index 0 for background.
        class_names = ['background'] + voc_names if softmax_head else voc_names
        num_classes = len(class_names)
    else:
        test_root = '/media/NAS/dataset/COCO/minival2014/test.txt'
        # No COCO name table is available here; indices are drawn instead.
        class_names = None
        num_classes = 81 if softmax_head else 80

    def class_text(index):
        if class_names is None:
            return str(index)
        return class_names[index]

    global device
    device = torch.device("cuda" if use_cuda else "cpu")

    print('Loading model..')
    weights = './{}/retina_{}.pth'.format(args.weight_path, args.epoch)
    model = RetinaNet(num_classes)
    checkpoint = torch.load(weights, map_location=device)
    # DataParallel is applied on every device so parameter names match a
    # checkpoint saved from the wrapped model; placement goes through
    # ``device`` instead of unconditional .cuda() calls.
    model = torch.nn.DataParallel(model).to(device)
    model.load_state_dict(checkpoint['state_dict'])

    print('\nTest')
    with open(test_root, 'r') as file:
        lines = file.readlines()

    encoder = DataEncoder(args.loss_fn)
    model.eval()

    out_dir = save_path + '/test_img/val_epoch_{}'.format(args.epoch)
    os.makedirs(out_dir, exist_ok=True)

    with torch.no_grad():  # inference only
        for img_idx in lines[:100]:
            img_path = img_idx.rstrip()
            labelpath = (img_path.replace('images', 'labels')
                         .replace('JPEGImages', 'labels')
                         .replace('.jpg', '.txt')
                         .replace('.png', '.txt'))
            img = Image.open(img_path).convert('RGB')
            label = load_label(labelpath, img)

            input_img = transform(img.resize((scale, scale)))
            data = torch.zeros(1, 3, input_img.shape[1], input_img.shape[2])
            data[0] = input_img
            inputs = data.to(device)

            loc_preds_split, cls_preds_split = model(inputs)
            loc_preds_nms, cls_preds_nms, score = encoder.decode(
                loc_preds_split, cls_preds_split, data.shape, data[0].shape,
                0)

            # Output file name is the last 10 characters of the image path.
            image_id = img_path[-10:]

            if score.shape[0] != 0:
                box_preds = loc_preds_nms.cpu().numpy().astype(int).tolist()
                class_ids = cls_preds_nms.cpu().numpy().astype(int).tolist()
                score_preds = score.cpu().numpy().astype(str).tolist()
            else:
                box_preds = []
                class_ids = []
                score_preds = []

            new_img = cv2.imread(img_path)
            img_h, img_w = new_img.shape[0], new_img.shape[1]

            # Ground-truth rows: the code reads column 0 as the class and
            # columns 1-4 as xmin, ymin, xmax, ymax.
            for i in range(int(label.shape[0])):
                coor_min = (int(label[i][1]), int(label[i][2]))
                coor_max = (int(label[i][3]), int(label[i][4]))
                cls = int(label[i][0])
                cv2.rectangle(new_img, coor_min, coor_max, (250, 0, 0), 2)
                cv2.putText(new_img, class_text(cls) + ' | ' + 'GT',
                            (coor_min[0] + 5, coor_min[1] - 5),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.2, (255, 255, 255),
                            1, cv2.LINE_AA)

            for idx, box_pred in enumerate(box_preds):
                # Clamp negatives, then map from network input coordinates
                # back to the original image size.
                xmin, ymin, xmax, ymax = [
                    max(0, int(v)) for v in box_pred[:4]
                ]
                box_pred_min = (int(xmin * img_w / scale),
                                int(ymin * img_h / scale))
                box_pred_max = (int(xmax * img_w / scale),
                                int(ymax * img_h / scale))
                cls_name = class_text(class_ids[idx])
                conf = score_preds[idx][:4]
                cv2.rectangle(new_img, box_pred_min, box_pred_max,
                              (0, 250, 0), 2)
                cv2.putText(new_img, cls_name + ' | ' + conf,
                            (box_pred_min[0] + 5, box_pred_min[1] - 5),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.3, (255, 255, 255),
                            1, cv2.LINE_AA)

            cv2.imwrite(out_dir + '/' + image_id, new_img)
            print(image_id)
def _prediction_frames(scan, locs, best_ious):
    '''Build one single-row DataFrame per predicted box.'''
    return [
        pd.DataFrame(data={
            "scan_id": scan,
            "z": locs[i, 0],
            "y": locs[i, 1],
            "x": locs[i, 2],
            "iou": best_ious[i]
        }, index=["0"])
        for i in range(locs.size(0))
    ]


def iter_scan(scan, scan_array, patient_df, net, cube_size=64, stride=50,
              iou=0.01):
    '''Slide a cube over a scan, detect per cube and score against GT.

    The volume is tiled with ``cube_size``-sided cubes every ``stride``
    voxels (zero-padded at the borders).  Detections of the two classes
    ("ais", "mia") are gathered in scan coordinates, reduced with
    ``box_nms`` and matched with ``find_best_pred``.

    Args:
      scan: identifier written to the ``scan_id`` column.
      scan_array: 3-D array indexed (z, y, x).
      patient_df: annotations for this scan, passed to ``annotation``.
      net: network returning ``(locs, clss)`` for a [1, 1, D, H, W] cube.
      cube_size: (int) cube edge length.
      stride: (int) step between cube origins.
      iou: unused; kept for interface compatibility.

    Returns:
      ais_count, mia_count: results of ``find_best_pred`` (or
        ``np.zeros(3)`` when the class has no ground truth).
      scan_df: DataFrame with columns scan_id, z, y, x, iou.
    '''
    scan_df = pd.DataFrame(columns=["scan_id", "z", "y", "x", "iou"])
    row_frames = []
    start_time = time.time()

    gt_boxes, gt_labels = annotation(patient_df)
    ais_gt_boxes, mia_gt_boxes = split_class(gt_boxes, gt_labels)

    # Uninitialised first rows act as placeholders for torch.cat; they are
    # sliced off again before NMS.
    ais_locs = torch.FloatTensor(1, 6)
    ais_probs = torch.FloatTensor(1)
    mia_locs = torch.FloatTensor(1, 6)
    mia_probs = torch.FloatTensor(1)

    for z in range(0, scan_array.shape[0], stride):
        for y in range(0, scan_array.shape[1], stride):
            for x in range(0, scan_array.shape[2], stride):
                start_coord = torch.FloatTensor([z, y, x])
                zmax = min(z + cube_size, scan_array.shape[0])
                ymax = min(y + cube_size, scan_array.shape[1])
                xmax = min(x + cube_size, scan_array.shape[2])

                # Zero-pad cubes that run past the volume border.
                cube_sample = np.zeros((cube_size, cube_size, cube_size),
                                       dtype=np.float32)
                cube_sample[:(zmax - z), :(ymax - y), :(xmax - x)] = \
                    scan_array[z:zmax, y:ymax, x:xmax]
                # Add batch and channel axes.
                cube_sample = cube_sample[np.newaxis, np.newaxis]
                input_cube = Variable(torch.from_numpy(cube_sample).cuda())

                locs, clss = net(input_cube)
                locs = locs.data.cpu().squeeze()
                clss = clss.data.cpu().squeeze()

                (ais_boxes, ais_scores, ais_labels,
                 mia_boxes, mia_scores, mia_labels) = DataEncoder().decode(
                     locs, clss, [cube_size, cube_size, cube_size])

                # The isinstance checks treat an int as "nothing decoded".
                if not isinstance(ais_boxes, int):
                    ais_boxes = calc_scan_coord(ais_boxes, start_coord)
                    ais_locs = torch.cat([ais_locs, ais_boxes], 0)
                    ais_probs = torch.cat([ais_probs, ais_scores], 0)
                if not isinstance(mia_boxes, int):
                    mia_boxes = calc_scan_coord(mia_boxes, start_coord)
                    mia_locs = torch.cat([mia_locs, mia_boxes], 0)
                    mia_probs = torch.cat([mia_probs, mia_scores], 0)

    end_time = time.time()
    run_time = end_time - start_time
    print(run_time)

    if not isinstance(ais_gt_boxes, int):
        ais_locs = ais_locs[1:, :]
        ais_probs = ais_probs[1:]
        ais_keep = box_nms(ais_locs, ais_probs)
        ais_locs = ais_locs[ais_keep]
        ais_probs = ais_probs[ais_keep]
        ais_count, best_ious = find_best_pred(ais_gt_boxes, ais_locs)
        ais_locs = change_box_order(ais_locs, "zyxzyx2zyxdhw")
        row_frames.extend(_prediction_frames(scan, ais_locs, best_ious))
    else:
        ais_count = np.zeros(3)

    if not isinstance(mia_gt_boxes, int):
        mia_locs = mia_locs[1:, :]
        mia_probs = mia_probs[1:]
        mia_keep = box_nms(mia_locs, mia_probs)
        mia_locs = mia_locs[mia_keep]
        mia_probs = mia_probs[mia_keep]
        mia_count, best_ious = find_best_pred(mia_gt_boxes, mia_locs)
        # NOTE(review): unlike the AIS branch, these boxes are not passed
        # through change_box_order before being recorded - confirm whether
        # that asymmetry is intended.
        row_frames.extend(_prediction_frames(scan, mia_locs, best_ious))
    else:
        mia_count = np.zeros(3)

    if row_frames:
        # One concat instead of repeated DataFrame.append (quadratic, and
        # removed in pandas 2.0).
        scan_df = pd.concat(row_frames, ignore_index=True)
    return ais_count, mia_count, scan_df
class SSD_Core:
    '''Wraps an SSD300 network for single-image symbol detection.'''

    def __init__(self):
        '''Load the label table, the checkpoint and the input transform.

        Reads ``./label.txt`` (first space-separated token of every line is
        the symbol name) and the checkpoint named by the module-level
        ``args.resuming_model``.
        '''
        with open('./label.txt') as f:
            self.dictindex = [
                line.replace('\n', '').split(' ')[0] for line in f
            ]

        # Load model
        self.net = SSD300()
        checkpoint = torch.load(args.resuming_model)
        self.net.load_state_dict(checkpoint['net'])
        self.net.eval()

        self.data_encoder = DataEncoder()
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.485, 0.456, 0.406),
                                 std=(0.229, 0.224, 0.225))
        ])

    def generatePrediction(self, imgpath, outname):
        '''Detect symbols in one image and save an annotated copy.

        Args:
          imgpath: (str) path of the image to analyse.
          outname: (str) file name written below ``./temp/``.

        Returns:
          (str) ``"null <n>"`` followed, for every box, by
          ``" x0 y0 x1 y1 label"`` where coordinates are scaled by
          ``InputImgSize`` and ``label`` is the decoded label minus one.
        '''
        # Load test image
        # NOTE(review): the image is converted to single-channel 'L' but
        # normalised with 3-channel statistics - confirm this matches the
        # torchvision version and network in use.
        img = Image.open(imgpath).convert('L')
        img1 = self.transform(img.resize((InputImgSize, InputImgSize)))

        # Forward
        loc, conf = self.net(Variable(img1[None, :, :, :], volatile=True))

        # Decode; softmax over the last axis of the 2-D score matrix, which
        # is what the implicit rule selected.
        boxes, labels, scores = self.data_encoder.decode(
            loc.data.squeeze(0), F.softmax(conf.squeeze(0), dim=1).data)

        draw = ImageDraw.Draw(img)
        num_boxes = len(boxes)
        boxes_np = boxes.numpy() * InputImgSize  # new array, not a view
        labels_np = labels.numpy()
        # Load the font once, and only when something will be drawn.
        font = ImageFont.truetype("./font/arial.ttf") if num_boxes else None

        parts = ['null ' + str(num_boxes)]
        for i in range(num_boxes):
            class_idx = int(labels_np[i][0]) - 1
            parts.extend(str(int(v)) for v in boxes_np[i][:4])
            parts.append(str(class_idx))

            # Scale the normalised box to the original image for drawing.
            boxes[i][::2] *= img.width
            boxes[i][1::2] *= img.height
            corners = boxes[i].tolist()
            draw.rectangle(corners, outline='red')
            draw.text((corners[0], corners[1]), self.dictindex[class_idx],
                      font=font)

        img.save('./temp/' + outname)
        return ' '.join(parts)
class ImageDataset(data.Dataset):
    '''Images addressed by id, optionally with boxes from ``bbox_dict``.'''

    def __init__(self, img_ids, img_dir, bbox_dict, has_label=True):
        '''
        Args:
          img_ids: (list) image ids; files are ``<img_dir>/<id>.jpg``.
          img_dir: (str) directory holding the images.
          bbox_dict: (dict) id -> iterable of entries whose element 0 is
            the label and element 1 the box.
          has_label: (bool) build box/label targets when True.

        Raises:
          ValueError: if ``has_label`` and an id is missing in bbox_dict.
        '''
        self.input_size = settings.IMG_SZ
        self.img_ids = img_ids
        self.img_dir = img_dir
        self.num = len(img_ids)
        self.bbox_dict = bbox_dict
        self.has_label = has_label
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406],
                                 [0.229, 0.224, 0.225])
        ])

        self.boxes = []
        self.labels = []
        self.encoder = DataEncoder()

        if has_label:
            for img_id in self.img_ids:
                if img_id not in self.bbox_dict:
                    raise ValueError('No bbox: {}'.format(img_id))
                entries = self.bbox_dict[img_id]
                box = [entry[1] for entry in entries]
                label = [entry[0] for entry in entries]
                # Stored boxes are multiplied by the input size.
                # NOTE(review): presumably they are normalised to [0, 1] in
                # bbox_dict - confirm against the producer.
                self.boxes.append(torch.Tensor(box) * self.input_size)
                self.labels.append(torch.LongTensor(label))

    def __getitem__(self, index):
        '''Return ``(img, boxes, labels)``, or ``[img]`` without labels.

        Raises:
          IOError: if the image file cannot be read.
        '''
        fn = os.path.join(self.img_dir, '{}.jpg'.format(self.img_ids[index]))
        img = cv2.imread(fn)
        if img is None:
            # cv2.imread signals failure by returning None; fail here with
            # the file name instead of deep inside the transform.
            raise IOError('cannot read image: {}'.format(fn))
        img = self.transform(img)

        if self.has_label:
            return img, self.boxes[index], self.labels[index]
        return [img]

    def __len__(self):
        return self.num

    def collate_fn(self, batch):
        """Encode targets.

        Args:
          batch: (list) of samples as returned by ``__getitem__``.

        Returns:
          images, stacked bbox_targets, stacked clf_targets when labels are
          available; otherwise only the image batch.
        """
        h = w = self.input_size
        inputs = torch.zeros(len(batch), 3, h, w)
        for i, sample in enumerate(batch):
            inputs[i] = sample[0]

        if not self.has_label:
            return inputs

        loc_targets = []
        cls_targets = []
        for sample in batch:
            loc_target, cls_target = self.encoder.encode(
                sample[1], sample[2], input_size=(w, h))
            loc_targets.append(loc_target)
            cls_targets.append(cls_target)
        return inputs, torch.stack(loc_targets), torch.stack(cls_targets)
class ListDataset(data.Dataset):
    '''Detection dataset read from a whitespace-separated index file.

    Every line is ``fname xmin ymin xmax ymax class [xmin ymin ...]``.
    '''

    def __init__(self, root, list_file, train, transform, input_size):
        '''
        Args:
          root: (str) directory to images.
          list_file: (str) path to index file.
          train: (boolean) train or test.
          transform: ([transforms]) image transforms.
          input_size: (int) model input size.
        '''
        self.root = root
        self.train = train
        self.transform = transform
        self.input_size = input_size

        self.fnames = []
        self.boxes = []
        self.labels = []

        self.encoder = DataEncoder()

        with open(list_file) as f:
            for line in f:
                parsed = self._parse_line(line)
                if parsed is None:
                    # Blank line (e.g. trailing newline at end of file).
                    continue
                fname, box, label = parsed
                self.fnames.append(fname)
                self.boxes.append(torch.Tensor(box))
                self.labels.append(torch.LongTensor(label))
        # Count usable samples so every index below __len__ is valid.
        self.num_samples = len(self.fnames)

    @staticmethod
    def _parse_line(line):
        '''Split one index line into (fname, boxes, labels); None if blank.

        Trailing tokens that do not form a complete group of five are
        ignored.
        '''
        tokens = line.strip().split()
        if not tokens:
            return None
        num_boxes = (len(tokens) - 1) // 5
        box = []
        label = []
        for i in range(num_boxes):
            xmin, ymin, xmax, ymax, c = tokens[1 + 5 * i:6 + 5 * i]
            box.append(
                [float(xmin), float(ymin), float(xmax), float(ymax)])
            label.append(int(c))
        return tokens[0], box, label

    def __getitem__(self, idx):
        '''Load image.

        Args:
          idx: (int) image index.

        Returns:
          img: (tensor) transformed image.
          boxes: (tensor) augmented object boxes.
          labels: (tensor) class labels.
        '''
        # Load image and boxes.
        fname = self.fnames[idx]
        img = Image.open(os.path.join(self.root, fname))
        if img.mode != 'RGB':
            img = img.convert('RGB')

        # Clone so augmentation never touches the stored annotations.
        boxes = self.boxes[idx].clone()
        labels = self.labels[idx]
        size = self.input_size

        # Data augmentation.
        if self.train:
            # img, boxes = random_flip(img, boxes)
            img, boxes = random_crop(img, boxes)
            img, boxes = resize(img, boxes, (size, size))
        else:
            img, boxes = resize(img, boxes, size)
            img, boxes = center_crop(img, boxes, (size, size))

        img = self.transform(img)
        return img, boxes, labels

    def collate_fn(self, batch):
        '''Stack images and encode targets.

        Args:
          batch: (list) of (image, boxes, labels) samples.

        Returns:
          images, stacked loc_targets, stacked cls_targets.
        '''
        h = w = self.input_size
        inputs = torch.zeros(len(batch), 3, h, w)

        loc_targets = []
        cls_targets = []
        for i, (img, boxes, labels) in enumerate(
                (x[0], x[1], x[2]) for x in batch):
            inputs[i] = img
            loc_target, cls_target = self.encoder.encode(
                boxes, labels, input_size=(w, h))
            loc_targets.append(loc_target)
            cls_targets.append(cls_target)
        return inputs, torch.stack(loc_targets), torch.stack(cls_targets)

    def __len__(self):
        return self.num_samples
def main():
    '''Train DSOD on the list datasets named in ``cfg``.

    Parses the command line, builds loaders, model and optimizer,
    optionally resumes from ``./checkpoint/ckpt_<NNN>.pth``, then runs the
    train/test loop and saves a checkpoint every 10 epochs.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('--batchSz', type=int, default=1, help='batch size')
    parser.add_argument('--nEpochs', type=int, default=300,
                        help='number of epoch to end training')
    parser.add_argument('--lr', type=float, default=1e-5,
                        help='learning rate')
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--wd', type=float, default=5e-4,
                        help='weight decay')
    # parser.add_argument('--save')
    # parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--opt', type=str, default='sgd',
                        choices=('sgd', 'adam', 'rmsprop'))
    parser.add_argument('--resume', '-r', action='store_true',
                        help='resume from checkpoint')
    parser.add_argument('--resume_from', type=int, default=220,
                        help='resume from which checkpoint')
    parser.add_argument('--visdom', '-v', action='store_true',
                        help='use visdom for training visualization')
    args = parser.parse_args()

    use_cuda = torch.cuda.is_available()
    best_loss = float('inf')  # best test loss
    start_epoch = 0  # start from epoch 0 for last epoch

    normMean = [0.485, 0.456, 0.406]
    normStd = [0.229, 0.224, 0.225]
    normTransform = transforms.Normalize(normMean, normStd)

    trainTransform = transforms.Compose([
        transforms.Scale((300, 300)),
        transforms.ToTensor(),
        normTransform
    ])
    testTransform = transforms.Compose([
        transforms.Scale((300, 300)),
        transforms.ToTensor(),
        normTransform
    ])

    # Data
    kwargs = {'num_workers': 4, 'pin_memory': True} if use_cuda else {}
    trainset = ListDataset(root=cfg.img_root, list_file=cfg.label_train,
                           train=True, transform=trainTransform)
    trainLoader = DataLoader(trainset, batch_size=args.batchSz,
                             shuffle=True, **kwargs)
    testset = ListDataset(root=cfg.img_root, list_file=cfg.label_test,
                          train=False, transform=testTransform)
    testLoader = DataLoader(testset, batch_size=args.batchSz,
                            shuffle=False, **kwargs)

    # Model
    net = DSOD(growthRate=48, reduction=1)
    if args.resume:
        print('==> Resuming from checkpoint...')
        checkpoint = torch.load(
            './checkpoint/ckpt_{:03d}.pth'.format(args.resume_from))
        net.load_state_dict(checkpoint['net'])
        best_loss = checkpoint['loss']
        start_epoch = checkpoint['epoch'] + 1
        print('Previours_epoch: {}, best_loss: {}'.format(
            start_epoch - 1, best_loss))
    else:
        print('==> Initializing weight...')

        def init_weights(m):
            if isinstance(m, nn.Conv2d):
                init.xavier_uniform(m.weight.data)
                # m.bias.data.zero_()
        net.apply(init_weights)

    print(' + Number of params: {}'.format(
        sum([p.data.nelement() for p in net.parameters()])))
    if use_cuda:
        # Moved once, before the optimizer captures the parameters.
        net = net.cuda()
        cudnn.benchmark = True

    # Adam and RMSprop start from their library-default learning rate;
    # only SGD receives --lr and --momentum here.
    if args.opt == 'sgd':
        optimizer = optim.SGD(net.parameters(), lr=args.lr,
                              momentum=args.momentum, weight_decay=args.wd)
    elif args.opt == 'adam':
        optimizer = optim.Adam(net.parameters(), weight_decay=args.wd)
    elif args.opt == 'rmsprop':
        optimizer = optim.RMSprop(net.parameters(), weight_decay=args.wd)

    criterion = MultiBoxLoss()

    if args.visdom:
        import visdom
        viz = visdom.Visdom()
        training_plot = viz.line(
            X=torch.zeros((1,)).cpu(),
            Y=torch.zeros((1, 3)).cpu(),
            opts=dict(
                xlabel='Epoch',
                ylabel='Loss',
                title='Epoch DSOD Training Loss',
                legend=['Loc Loss', 'Conf Loss', 'Loss']
            )
        )
        testing_plot = viz.line(
            X=torch.zeros((1,)).cpu(),
            Y=torch.zeros((1, 3)).cpu(),
            opts=dict(
                xlabel='Epoch',
                ylabel='Loss',
                title='Epoch DSOD Testing Loss',
                legend=['Loc Loss', 'Conf Loss', 'Loss']
            )
        )
        testing_image = viz.image(
            np.ones((3, 300, 300)),
            opts=dict(caption='Random Testing Image'))

    # TODO: save training data on log file
    # trainF = open(os.path.join(args.save, 'train.csv'), 'w')
    # testF = open(os.path.join(args.save, 'test.csv'), 'w')

    for epoch in range(start_epoch, start_epoch + args.nEpochs + 1):
        adjust_opt(args.opt, optimizer, epoch)
        # NOTE(review): ``viz=None`` is passed even when --visdom created a
        # client above - confirm how train/test obtain their plot handles.
        train(epoch, net, trainLoader, optimizer, criterion, use_cuda,
              args.visdom, viz=None)
        test(epoch, net, testLoader, optimizer, criterion, use_cuda,
             args.visdom, viz=None)

        if epoch % 10 == 0:
            # NOTE(review): ``test_loss`` is not assigned anywhere in this
            # function; this only works if it exists as a module-level
            # name - confirm, or capture the return value of test().
            state = {
                'net': net.state_dict(),
                'loss': test_loss,
                'epoch': epoch
            }
            # exist_ok avoids the check-then-create race of isdir + mkdir.
            os.makedirs('checkpoint', exist_ok=True)
            torch.save(state, './checkpoint/ckpt_{:03d}.pth'.format(epoch))
class VESSELBboxDataset:
    '''Ship-detection dataset reading JPEG images and text annotations.

    Images are read from ``<data_dir>/JPEGImages/<id>.jpg`` and annotations
    from ``<data_dir>/ground-truth/<id>.txt``. The last four
    whitespace-separated fields of every annotation line are parsed as
    integers and interpreted by ``load_annotations`` as ``x, y, w, h``.
    '''

    def __init__(self, split='trainval',
                 data_dir="/media/nasir/Drive1/datasets/SAR/SAR-Ship-Dataset"):
        '''
        Args:
          split: (str) 'trainval' selects the first 40000 ids, any other
            value selects the remaining ones.
          data_dir: (str) dataset root directory.
        '''
        # glob returns files in arbitrary order; sort so that the
        # trainval/test partition is the same on every run and machine.
        paths = sorted(glob.glob(f'{data_dir}/JPEGImages/*.jpg'))
        ids = [os.path.splitext(os.path.basename(x))[0] for x in paths]
        if split == 'trainval':
            self.ids = ids[0: 40000]
        else:
            self.ids = ids[40000:]

        self.input_size = 256
        self.encoder = DataEncoder()
        self.data_dir = data_dir
        self.label_names = ['ship']
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.485,0.456,0.406), (0.229,0.224,0.225))
        ])

    def __len__(self):
        return len(self.ids)

    def str2int(self, a):
        '''Convert an iterable of numeric strings to a list of ints.'''
        return [int(x) for x in a]

    def extract_boxes(self, fname):
        '''Parse an annotation file into a list of 4-int boxes.

        Args:
          fname: (str) path to the annotation text file.

        Returns:
          (list) one ``[a, b, c, d]`` int list per non-blank line, taken from
          the last four whitespace-separated fields of the line.
        '''
        boxes = []
        with open(fname) as f:
            for line in f:
                fields = line.split()
                # Blank lines (e.g. a trailing newline) carry no box.
                if not fields:
                    continue
                boxes.append(self.str2int(fields[-4:]))
        return boxes

    def __getitem__(self, i):
        '''Returns the i-th example.

        Args:
          i: (int) The index of the example.

        Returns:
          (dict) ``{'img': img, 'annot': annot}`` where ``img`` is the RGB
          image after ``self.transform`` and ``annot`` is the ``[N, 5]``
          array produced by ``load_annotations``.
        '''
        id_ = self.ids[i]
        anno_file = os.path.join(self.data_dir, 'ground-truth', id_ + '.txt')
        img_file = os.path.join(self.data_dir, 'JPEGImages', id_ + '.jpg')

        img = Image.open(img_file).convert('RGB')
        img = self.transform(img)
        annot = self.load_annotations(self.extract_boxes(anno_file))
        return {'img': img, 'annot': annot}

    def load_annotations(self, bboxes):
        '''Convert ``[x, y, w, h]`` boxes to ``[x1, y1, x2, y2, label]`` rows.

        Args:
          bboxes: (list) of 4-element boxes.

        Returns:
          (ndarray) float array sized [N, 5]; columns 2 and 3 hold
          ``x + w`` and ``y + h`` and the label column is always 0
          (the single 'ship' class).
        '''
        annotations = np.zeros((len(bboxes), 5))
        if len(bboxes) == 0:
            return annotations

        annotations[:, :4] = np.asarray(bboxes, dtype=np.float64)
        annotations[:, 2] = annotations[:, 0] + annotations[:, 2]
        annotations[:, 3] = annotations[:, 1] + annotations[:, 3]
        return annotations

    @staticmethod
    def _unpack_sample(sample):
        '''Split one sample into (img, boxes, labels).

        Accepts the dict returned by ``__getitem__`` as well as a plain
        ``(img, boxes, labels)`` sequence.
        '''
        if isinstance(sample, dict):
            annot = torch.as_tensor(sample['annot'], dtype=torch.float32)
            annot = annot.reshape(-1, 5)
            return sample['img'], annot[:, :4], annot[:, 4].long()
        return sample[0], sample[1], sample[2]

    def collate_fn(self, batch):
        '''Stack images and encode targets.

        Args:
          batch: (list) of samples, either dicts as returned by
            ``__getitem__`` or ``(img, boxes, labels)`` sequences.

        Returns:
          stacked images, stacked loc_targets, stacked cls_targets.
        '''
        samples = [self._unpack_sample(x) for x in batch]
        imgs = [s[0] for s in samples]
        boxes = [s[1] for s in samples]
        labels = [s[2] for s in samples]

        h = w = self.input_size
        num_imgs = len(imgs)
        # Every image must already be 3 x input_size x input_size.
        inputs = torch.zeros(num_imgs, 3, h, w)

        loc_targets = []
        cls_targets = []
        for i in range(num_imgs):
            inputs[i] = imgs[i]
            loc_target, cls_target = self.encoder.encode(boxes[i], labels[i], input_size=(w,h))
            loc_targets.append(loc_target)
            cls_targets.append(cls_target)
        return inputs, torch.stack(loc_targets), torch.stack(cls_targets)
class jsonDataset(data.Dataset):
    '''Detection dataset built from a JSON ground-truth file.

    Every value of the JSON dict is a list whose first element provides
    ``image_path``, ``boxes`` (``[xmin, ymin, xmax, ymax]`` per object) and
    ``labels`` (the class name is the first element of each label entry).
    Invalid boxes are reported on stdout and skipped. When ``num_crops`` is
    positive, fixed-size crops around the objects are computed once at
    construction time and every crop becomes one sample.
    '''

    def __init__(self, path, classes, transform, input_image_size, num_crops,
                 fpn_level, is_norm_reg_target, radius, view_image=False,
                 min_cols=1, min_rows=1):
        '''Index the ground-truth file and optionally pre-compute crops.

        Args:
          path: (str) path to the JSON ground-truth file.
          classes: (sequence of str) class names; class ``i`` gets label
            ``i + 1`` because 0 is the background class.
          transform: callable invoked as ``transform(image=..., bboxes=...)``
            that returns a mapping with 'image' and 'bboxes'.
          input_image_size: model input size, forwarded to DataEncoder;
            indexed as ``(rows, cols)`` when cropping is enabled.
          num_crops: (int) crops attempted per object box; ``<= 0`` disables
            cropping.
          fpn_level: forwarded to DataEncoder.
          is_norm_reg_target: forwarded to DataEncoder.
          radius: forwarded to ``DataEncoder.encode`` in ``collate_fn``.
          view_image: (bool) when True, ``__getitem__`` writes a debug image
            with the boxes drawn into the "crop_test" directory.
          min_cols: minimum box width to keep.
          min_rows: minimum box height to keep.

        Raises:
          ValueError: if an image referenced by the ground truth cannot be
            read.
        '''
        self.path = path
        self.classes = classes
        self.transform = transform
        self.input_size = input_image_size
        self.num_crops = num_crops
        self.view_img = view_image
        self.fpn_level = fpn_level
        self.is_norm_reg_target = is_norm_reg_target
        self.radius = radius

        self.fnames = list()
        self.offsets = list()
        self.boxes = list()
        self.labels = list()

        self.num_classes = len(self.classes)

        self.label_map = dict()
        self.class_idx_map = dict()
        # 0 is background class
        for idx in range(0, self.num_classes):
            self.label_map[self.classes[idx]] = idx + 1
            self.class_idx_map[idx + 1] = self.classes[idx]

        self.data_encoder = DataEncoder(
            image_size=self.input_size,
            num_classes=self.num_classes + 1,
            fpn_level=self.fpn_level,
            is_norm_reg_target=self.is_norm_reg_target)

        # Close the ground-truth file as soon as it is parsed.
        with open(self.path, 'r') as fp_read:
            gt_dict = json.load(fp_read)

        all_boxes = list()
        all_labels = list()
        all_img_path = list()
        # (rows, cols) per kept image, so images are decoded only once.
        all_img_sizes = list()

        # read gt files
        for gt_key in gt_dict:
            gt_data = gt_dict[gt_key][0]

            box = list()
            label = list()

            num_boxes = len(gt_data['labels'])

            img = cv2.imread(gt_data['image_path'])
            if img is None:
                raise ValueError('failed to read image: ' +
                                 str(gt_data['image_path']))
            img_rows = img.shape[0]
            img_cols = img.shape[1]

            for iter_box in range(0, num_boxes):
                xmin = gt_data['boxes'][iter_box][0]
                ymin = gt_data['boxes'][iter_box][1]
                xmax = gt_data['boxes'][iter_box][2]
                ymax = gt_data['boxes'][iter_box][3]
                rows = ymax - ymin
                cols = xmax - xmin

                if xmin < 0 or ymin < 0:
                    print('negative coordinate: [xmin: ' + str(xmin) +
                          ', ymin: ' + str(ymin) + ']')
                    print(gt_data['image_path'])
                    continue

                if xmax > img_cols or ymax > img_rows:
                    print('over maximum size: [xmax: ' + str(xmax) +
                          ', ymax: ' + str(ymax) + ']')
                    print(gt_data['image_path'])
                    continue

                if cols < min_cols:
                    print('cols is lower than ' + str(min_cols) + ': [' +
                          str(xmin) + ', ' + str(ymin) + ', ' + str(xmax) +
                          ', ' + str(ymax) + '] ' +
                          str(gt_data['image_path']))
                    continue

                if rows < min_rows:
                    print('rows is lower than ' + str(min_rows) + ': [' +
                          str(xmin) + ', ' + str(ymin) + ', ' + str(xmax) +
                          ', ' + str(ymax) + '] ' +
                          str(gt_data['image_path']))
                    continue

                class_name = gt_data['labels'][iter_box][0]
                if class_name not in self.label_map:
                    print('weired class name: ' + class_name)
                    print(gt_data['image_path'])
                    continue

                class_idx = self.label_map[class_name]
                box.append(
                    [float(xmin), float(ymin), float(xmax), float(ymax)])
                label.append(int(class_idx))

            if len(box) == 0 or len(label) == 0:
                print('none of object exist in the image: ' +
                      gt_data['image_path'])
                continue

            all_boxes.append(box)
            all_labels.append(label)
            all_img_path.append(gt_data['image_path'])
            all_img_sizes.append((img_rows, img_cols))

        if len(all_boxes) == len(all_labels) and len(all_boxes) == len(
                all_img_path):
            num_images = len(all_img_path)
        else:
            print('num. of boxes: ' + str(len(all_boxes)))
            print('num. of labels: ' + str(len(all_labels)))
            print('num. of paths: ' + str(len(all_img_path)))
            raise ValueError(
                'num. of elements are different(all boxes, all_labels, all_img_path)'
            )

        if num_crops <= 0:
            for idx in range(0, num_images, 1):
                self.fnames.append(all_img_path[idx])
                self.boxes.append(
                    torch.tensor(all_boxes[idx], dtype=torch.float32))
                self.labels.append(
                    torch.tensor(all_labels[idx], dtype=torch.int64))
        else:
            for idx in range(0, num_images, 1):
                img_rows, img_cols = all_img_sizes[idx]

                offsets, crop_boxes, crop_labels = self._do_crop(
                    ori_img_rows=img_rows,
                    ori_img_cols=img_cols,
                    target_img_size=self.input_size,
                    boxes=all_boxes[idx],
                    labels=all_labels[idx])

                for offset, boxes_in_crop, labels_in_crop in zip(
                        offsets, crop_boxes, crop_labels):
                    self.fnames.append(all_img_path[idx])
                    self.offsets.append(offset)
                    self.boxes.append(
                        torch.tensor(boxes_in_crop, dtype=torch.float32))
                    self.labels.append(
                        torch.tensor(labels_in_crop, dtype=torch.int64))

        self.num_samples = len(self.fnames)

    def __getitem__(self, idx):
        '''Load, optionally crop, and transform one sample.

        Args:
          idx: (int) sample index.

        Returns:
          img: transformed image as returned by ``self.transform``.
          boxes: (tensor) float32 boxes after the transform.
          labels: (tensor) int64 labels after the transform.
          fname: (str) path of the source image.

        Raises:
          ValueError: if a stored crop offset is negative or a stored box
            lies outside the crop.
        '''
        # Load image and boxes.
        fname = self.fnames[idx]
        boxes = self.boxes[idx]
        labels = self.labels[idx]
        img = cv2.imread(fname)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        if self.num_crops > 0:
            offset = self.offsets[idx]
            crop_rect = (int(offset[0]), int(offset[1]),
                         int(offset[0] + self.input_size[1]),
                         int(offset[1] + self.input_size[0]))

            if offset[0] < 0 or offset[1] < 0:
                raise ValueError("negative offset!")

            for box in boxes:
                if box[0] < 0 or box[1] < 0 or box[2] > self.input_size[
                        1] or box[3] > self.input_size[0]:
                    raise ValueError("negative box coordinate!")

            img = img[crop_rect[1]:crop_rect[3], crop_rect[0]:crop_rect[2]]

        # The label rides along as the last element of each box so that the
        # transform keeps boxes and labels in sync.
        bboxes = [
            bbox.tolist() + [label.item()]
            for bbox, label in zip(boxes, labels)
        ]
        augmented = self.transform(image=img, bboxes=bboxes)
        img = augmented['image']
        boxes = augmented['bboxes']
        boxes = [list(bbox) for bbox in boxes]
        labels = [bbox.pop() for bbox in boxes]

        if self.view_img is True:
            # Convert the CHW tensor to an HWC uint8 array first; OpenCV
            # colour conversion and drawing operate on such arrays.
            np_img = img.numpy()
            np_img = np.transpose(np_img, (1, 2, 0))
            np_img = np.uint8(np_img * 255)
            np_img = cv2.cvtColor(np_img, cv2.COLOR_RGB2BGR)
            np_img = np.ascontiguousarray(np_img)
            for idx_box, box in enumerate(boxes):
                cv2.rectangle(np_img, (int(box[0]), int(box[1])),
                              (int(box[2]), int(box[3])), (0, 255, 0))
                class_idx = labels[idx_box]
                text_size = cv2.getTextSize(self.class_idx_map[class_idx],
                                            cv2.FONT_HERSHEY_PLAIN, 1, 1)
                cv2.putText(np_img, self.class_idx_map[class_idx],
                            (int(box[0]), int(box[1]) - text_size[1]),
                            cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 255), 1)
            cv2.imwrite(os.path.join("crop_test", str(idx) + ".jpg"), np_img)

        boxes = torch.tensor(boxes, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.int64)

        return img, boxes, labels, fname

    def __len__(self):
        return self.num_samples

    def _do_crop(self, ori_img_rows, ori_img_cols, target_img_size, boxes,
                 labels):
        '''Compute crop windows around every box of one image.

        For each box up to ``self.num_crops`` windows are attempted, each
        anchored differently (top-left, top-right, bottom-left, bottom-right,
        center) with a random margin. The anchor order is shuffled once per
        call. A window is kept only if it lies inside the image and contains
        at least one box.

        Args:
          ori_img_rows: (int) image height.
          ori_img_cols: (int) image width.
          target_img_size: crop size indexed as ``(rows, cols)``.
          boxes: (list) of ``[xmin, ymin, xmax, ymax]``.
          labels: (list) labels, one per box.

        Returns:
          output_offsets: (list) ``[offset_x, offset_y]`` per kept window.
          output_boxes: (list) boxes inside each window, in window
            coordinates.
          output_labels: (list) labels matching ``output_boxes``.
        '''
        if len(boxes) != len(labels):
            print("error occur: Random crop")

        rand_indices = [0, 1, 2, 3, 4]
        np.random.shuffle(rand_indices)

        crop_rows = target_img_size[0]
        crop_cols = target_img_size[1]

        output_offsets = []
        output_boxes = []
        output_labels = []

        for box in boxes:
            # box coordinate from 1. not 0.
            xmin = box[0]
            ymin = box[1]
            xmax = box[2]
            ymax = box[3]
            width = (xmax - xmin) + 1
            height = (ymax - ymin) + 1

            if width < 0 or height < 0:
                print("negative width/height")
                continue

            for iter_crop in range(0, self.num_crops, 1):
                # Only five anchor modes exist; asking for more crops than
                # that cannot produce additional windows.
                if iter_crop >= len(rand_indices):
                    print("exceed possible crop num")
                    break

                rand_idx = rand_indices[iter_crop]
                margin = np.random.randint(16, 128, size=1)[0]

                if rand_idx == 4:
                    # center
                    # NOTE(review): randint's upper bound is exclusive, so
                    # the direction is -1 or 0, never +1 - confirm intent.
                    rand_direction = np.random.randint(-1, 1, size=1)[0]
                    offset_x = (xmin - ((crop_cols - width) / 2) -
                                1) + (rand_direction * margin)
                    offset_y = (ymin - ((crop_rows - height) / 2) -
                                1) + (rand_direction * margin)
                else:
                    # 0: top-left, 1: top-right, 2: bottom-left,
                    # 3: bottom-right
                    if rand_idx in (0, 2):
                        offset_x = xmin - 1 - margin
                    else:
                        offset_x = xmin - (crop_cols - width) - 1 + margin
                    if rand_idx in (0, 1):
                        offset_y = ymin - 1 - margin
                    else:
                        offset_y = ymin - (crop_rows - height) - 1 + margin

                crop_maxx = offset_x + crop_cols
                crop_maxy = offset_y + crop_rows

                if crop_maxx > ori_img_cols - 1 or crop_maxy > ori_img_rows - 1:
                    continue
                if offset_x < 0 or offset_y < 0:
                    continue

                crop_rect = [offset_x, offset_y, crop_cols, crop_rows]

                in_boxes, in_labels = self._find_boxes_in_crop(
                    crop_rect, boxes, labels)

                if len(in_boxes) == 0:
                    continue

                output_offsets.append([offset_x, offset_y])
                output_boxes.append(in_boxes)
                output_labels.append(in_labels)

        return output_offsets, output_boxes, output_labels

    def _find_boxes_in_crop(self, crop_rect, boxes, labels):
        '''Collect the boxes (and labels) that fall inside ``crop_rect``.

        Args:
          crop_rect: ``[minx, miny, width, height]`` of the crop window.
          boxes: (list) of ``[xmin, ymin, xmax, ymax]`` in image coordinates.
          labels: (list) labels, one per box.

        Returns:
          boxes in crop coordinates and their labels.
        '''
        num_boxes = len(boxes)
        if num_boxes != len(labels):
            print("error occur: Random crop")

        boxes_in_crop = []
        labels_in_crop = []
        for idx in range(0, num_boxes, 1):
            box_in_crop, label, is_contain = self._find_box_in_crop(
                crop_rect, boxes[idx], labels[idx])

            if is_contain is True:
                boxes_in_crop.append(box_in_crop)
                labels_in_crop.append(label)

        return boxes_in_crop, labels_in_crop

    def _find_box_in_crop(self, rect, box, label):
        '''Translate one box into crop coordinates if enough of it is inside.

        A box may stick out of the crop by less than 30% of its own
        width/height on each side; the overhanging part is clipped.

        Args:
          rect: ``[minx, miny, width, height]`` of the crop window.
          box: ``[xmin, ymin, xmax, ymax]`` in image coordinates.
          label: label of the box, returned unchanged.

        Returns:
          ``(box_in_rect, label, True)`` when the box is kept, otherwise
          ``([], label, False)``.
        '''
        rect_minx = rect[0]
        rect_miny = rect[1]
        rect_width = rect[2]
        rect_height = rect[3]

        box_minx = box[0]
        box_miny = box[1]
        box_maxx = box[2]
        box_maxy = box[3]
        box_width = (box_maxx - box_minx) + 1
        box_height = (box_maxy - box_miny) + 1

        # Negative tolerances: how far a box edge may lie outside the crop.
        occlusion_ratio = 0.3
        occlusion_width = int(box_width * occlusion_ratio) * -1
        occlusion_height = int(box_height * occlusion_ratio) * -1

        box_in_crop_minx = box_minx - rect_minx
        if box_in_crop_minx <= occlusion_width or box_in_crop_minx >= rect_width:
            return [], label, False

        box_in_crop_miny = box_miny - rect_miny
        if box_in_crop_miny <= occlusion_height or box_in_crop_miny >= rect_height:
            return [], label, False

        box_in_crop_maxx = box_maxx - rect_minx
        if rect_width - box_in_crop_maxx <= occlusion_width or box_in_crop_maxx <= 0:
            return [], label, False

        box_in_crop_maxy = box_maxy - rect_miny
        if rect_height - box_in_crop_maxy <= occlusion_height or box_in_crop_maxy <= 0:
            return [], label, False

        # Clip the part that overhangs the crop window.
        if box_in_crop_minx < 0:
            box_in_crop_minx = 0
        if box_in_crop_miny < 0:
            box_in_crop_miny = 0
        if rect_width - box_in_crop_maxx < 0:
            box_in_crop_maxx = rect_width - 1
        if rect_height - box_in_crop_maxy < 0:
            box_in_crop_maxy = rect_height - 1

        box_in_rect = [
            box_in_crop_minx, box_in_crop_miny, box_in_crop_maxx,
            box_in_crop_maxy
        ]
        return box_in_rect, label, True

    def collate_fn(self, batch):
        '''Pad images into one batch tensor and encode the targets.

        Args:
          batch: (list) of ``(img, boxes, labels, fname)`` samples.

        Returns:
          padded images, stacked loc_targets, stacked cls_targets,
          stacked center_targets, and the list of image paths.

        Raises:
          ValueError: if ``self.input_size`` is neither an int nor a tuple.
        '''
        imgs = [x[0] for x in batch]
        boxes = [x[1] for x in batch]
        labels = [x[2] for x in batch]
        paths = [x[3] for x in batch]

        num_imgs = len(imgs)

        if isinstance(self.input_size, int) is True:
            inputs = torch.zeros(
                [num_imgs, 3, self.input_size, self.input_size],
                dtype=torch.float32)
        elif isinstance(self.input_size, tuple) is True:
            inputs = torch.zeros(
                [num_imgs, 3, self.input_size[0], self.input_size[1]],
                dtype=torch.float32)
        else:
            raise ValueError('input size should be int or tuple of ints')

        loc_targets = list()
        cls_targets = list()
        center_targets = list()

        for i in range(num_imgs):
            im = imgs[i]
            imh, imw = im.size(1), im.size(2)
            # Images smaller than the batch canvas are zero-padded.
            inputs[i, :, :imh, :imw] = im

            # Encode data.
            loc_target, cls_target, center_target = self.data_encoder.encode(
                boxes[i], labels[i], radius=self.radius)

            loc_targets.append(loc_target)
            cls_targets.append(cls_target)
            center_targets.append(center_target)

        return inputs, \
               torch.stack(loc_targets, dim=0), \
               torch.stack(cls_targets, dim=0), \
               torch.stack(center_targets, dim=0), \
               paths
# Single-image inference demo: load a RetinaNet checkpoint, run it on one
# image resized to 600x600, decode the predictions and draw the boxes.
print('Loading model..')
net = RetinaNet()
net.load_state_dict(torch.load('./checkpoint/params.pth'))
net.eval()

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])

print('Loading image..')
# Force three channels: the normalization above is defined for RGB only.
img = Image.open('./image/000001.jpg').convert('RGB')
w = h = 600
img = img.resize((w, h))

print('Predicting..')
x = transform(img)
x = x.unsqueeze(0)
# `Variable(x, volatile=True)` no longer disables autograd; no_grad is the
# supported way to run inference without building a graph.
with torch.no_grad():
    loc_preds, cls_preds = net(x)

print('Decoding..')
encoder = DataEncoder()
boxes, labels = encoder.decode(loc_preds.data.squeeze(),
                               cls_preds.data.squeeze(), (w, h))

draw = ImageDraw.Draw(img)
for box in boxes:
    draw.rectangle(list(box), outline='red')
img.show()
def __init__(self, path, classes, transform, input_image_size, num_crops,
             fpn_level, is_norm_reg_target, radius, view_image=False,
             min_cols=1, min_rows=1):
    '''Index the ground-truth file and optionally pre-compute crops.

    Every value of the JSON dict is a list whose first element provides
    ``image_path``, ``boxes`` (``[xmin, ymin, xmax, ymax]`` per object) and
    ``labels`` (the class name is the first element of each label entry).
    Invalid boxes are reported on stdout and skipped.

    Args:
      path: (str) path to the JSON ground-truth file.
      classes: (sequence of str) class names; class ``i`` gets label
        ``i + 1`` because 0 is the background class.
      transform: image/box transform, stored on the instance.
      input_image_size: model input size, forwarded to DataEncoder and to
        ``self._do_crop`` as the crop size.
      num_crops: (int) ``<= 0`` disables cropping; otherwise crops are
        computed with ``self._do_crop`` and each crop becomes one sample.
      fpn_level: forwarded to DataEncoder.
      is_norm_reg_target: forwarded to DataEncoder.
      radius: stored on the instance.
      view_image: (bool) stored as ``self.view_img``.
      min_cols: minimum box width to keep.
      min_rows: minimum box height to keep.

    Raises:
      ValueError: if an image referenced by the ground truth cannot be
        read.
    '''
    self.path = path
    self.classes = classes
    self.transform = transform
    self.input_size = input_image_size
    self.num_crops = num_crops
    self.view_img = view_image
    self.fpn_level = fpn_level
    self.is_norm_reg_target = is_norm_reg_target
    self.radius = radius

    self.fnames = list()
    self.offsets = list()
    self.boxes = list()
    self.labels = list()

    self.num_classes = len(self.classes)

    self.label_map = dict()
    self.class_idx_map = dict()
    # 0 is background class
    for idx in range(0, self.num_classes):
        self.label_map[self.classes[idx]] = idx + 1
        self.class_idx_map[idx + 1] = self.classes[idx]

    self.data_encoder = DataEncoder(
        image_size=self.input_size,
        num_classes=self.num_classes + 1,
        fpn_level=self.fpn_level,
        is_norm_reg_target=self.is_norm_reg_target)

    # Close the ground-truth file as soon as it is parsed.
    with open(self.path, 'r') as fp_read:
        gt_dict = json.load(fp_read)

    all_boxes = list()
    all_labels = list()
    all_img_path = list()
    # (rows, cols) per kept image, so images are decoded only once.
    all_img_sizes = list()

    # read gt files
    for gt_key in gt_dict:
        gt_data = gt_dict[gt_key][0]

        box = list()
        label = list()

        num_boxes = len(gt_data['labels'])

        img = cv2.imread(gt_data['image_path'])
        if img is None:
            raise ValueError('failed to read image: ' +
                             str(gt_data['image_path']))
        img_rows = img.shape[0]
        img_cols = img.shape[1]

        for iter_box in range(0, num_boxes):
            xmin = gt_data['boxes'][iter_box][0]
            ymin = gt_data['boxes'][iter_box][1]
            xmax = gt_data['boxes'][iter_box][2]
            ymax = gt_data['boxes'][iter_box][3]
            rows = ymax - ymin
            cols = xmax - xmin

            if xmin < 0 or ymin < 0:
                print('negative coordinate: [xmin: ' + str(xmin) +
                      ', ymin: ' + str(ymin) + ']')
                print(gt_data['image_path'])
                continue

            if xmax > img_cols or ymax > img_rows:
                print('over maximum size: [xmax: ' + str(xmax) +
                      ', ymax: ' + str(ymax) + ']')
                print(gt_data['image_path'])
                continue

            if cols < min_cols:
                print('cols is lower than ' + str(min_cols) + ': [' +
                      str(xmin) + ', ' + str(ymin) + ', ' + str(xmax) +
                      ', ' + str(ymax) + '] ' +
                      str(gt_data['image_path']))
                continue

            if rows < min_rows:
                print('rows is lower than ' + str(min_rows) + ': [' +
                      str(xmin) + ', ' + str(ymin) + ', ' + str(xmax) +
                      ', ' + str(ymax) + '] ' +
                      str(gt_data['image_path']))
                continue

            class_name = gt_data['labels'][iter_box][0]
            if class_name not in self.label_map:
                print('weired class name: ' + class_name)
                print(gt_data['image_path'])
                continue

            class_idx = self.label_map[class_name]
            box.append(
                [float(xmin), float(ymin), float(xmax), float(ymax)])
            label.append(int(class_idx))

        if len(box) == 0 or len(label) == 0:
            print('none of object exist in the image: ' +
                  gt_data['image_path'])
            continue

        all_boxes.append(box)
        all_labels.append(label)
        all_img_path.append(gt_data['image_path'])
        all_img_sizes.append((img_rows, img_cols))

    if len(all_boxes) == len(all_labels) and len(all_boxes) == len(
            all_img_path):
        num_images = len(all_img_path)
    else:
        print('num. of boxes: ' + str(len(all_boxes)))
        print('num. of labels: ' + str(len(all_labels)))
        print('num. of paths: ' + str(len(all_img_path)))
        raise ValueError(
            'num. of elements are different(all boxes, all_labels, all_img_path)'
        )

    if num_crops <= 0:
        for idx in range(0, num_images, 1):
            self.fnames.append(all_img_path[idx])
            self.boxes.append(
                torch.tensor(all_boxes[idx], dtype=torch.float32))
            self.labels.append(
                torch.tensor(all_labels[idx], dtype=torch.int64))
    else:
        for idx in range(0, num_images, 1):
            img_rows, img_cols = all_img_sizes[idx]

            offsets, crop_boxes, crop_labels = self._do_crop(
                ori_img_rows=img_rows,
                ori_img_cols=img_cols,
                target_img_size=self.input_size,
                boxes=all_boxes[idx],
                labels=all_labels[idx])

            for offset, boxes_in_crop, labels_in_crop in zip(
                    offsets, crop_boxes, crop_labels):
                self.fnames.append(all_img_path[idx])
                self.offsets.append(offset)
                self.boxes.append(
                    torch.tensor(boxes_in_crop, dtype=torch.float32))
                self.labels.append(
                    torch.tensor(labels_in_crop, dtype=torch.int64))

    self.num_samples = len(self.fnames)
class ListDataset(data.Dataset):
    '''Detection dataset reading one comma-separated annotation file per image.

    Every annotation line starts with ``xmin, ymin, w, h``; the class label
    is the sixth field. The image for annotation file ``name.ext`` is
    ``<root>/name.jpg``.
    '''
    img_size = 300

    def __init__(self, root, list_file, train, transform):
        '''
        Args:
          root: (str) ditectory to images.
          list_file: (str) directory holding the annotation files.
          train: (boolean) train or test.
          transform: ([transforms]) image transforms.
        '''
        self.root = root
        self.train = train
        self.transform = transform

        self.fnames = []
        self.boxes = []
        self.labels = []

        self.data_encoder = DataEncoder()
        self.num_samples = 0

        for i in os.listdir(list_file):
            self.num_samples += 1
            self.fnames.append(i)
            box, labels = self._parse_annotation_file(
                os.path.join(list_file, i))
            # view(-1, 4) keeps the [#obj, 4] layout even when a file holds
            # no objects.
            self.boxes.append(torch.Tensor(box).view(-1, 4))
            self.labels.append(torch.LongTensor(labels))

    @staticmethod
    def _parse_annotation_file(path):
        '''Parse one annotation file into corner-format boxes and labels.

        Args:
          path: (str) path to the annotation file.

        Returns:
          box: (list) of ``[xmin, ymin, xmin + w, ymin + h]`` floats.
          labels: (list) of int class labels.
        '''
        box = []
        labels = []
        with open(path) as f:
            for line in f:
                line = line.strip()
                # Skip blank lines instead of relying on a trailing newline.
                if not line:
                    continue
                fields = line.split(",")
                xmin = float(fields[0])
                ymin = float(fields[1])
                w = float(fields[2])
                h = float(fields[3])
                box.append([xmin, ymin, xmin + w, ymin + h])
                labels.append(int(fields[5]))
        return box, labels

    def __getitem__(self, idx):
        '''Load a image, and encode its bbox locations and class labels.

        Args:
          idx: (int) image index.

        Returns:
          img: (tensor) image tensor.
          loc_target: (tensor) location targets.
          conf_target: (tensor) label targets.
        '''
        # Load image and bbox locations.
        fname = self.fnames[idx]
        img = cv2.imread(os.path.join(self.root, fname[:-4] + ".jpg"))

        boxes = self.boxes[idx].clone()
        labels = self.labels[idx]

        # Data augmentation while training.
        if self.train:
            img, boxes = self.random_flip(img, boxes)
            img, boxes, labels = self.random_crop(img, boxes, labels)

        # Scale bbox locaitons to [0,1].
        w, h = img.shape[1], img.shape[0]
        boxes /= torch.Tensor([w, h, w, h]).expand_as(boxes)

        img = cv2.resize(img, (self.img_size, self.img_size))
        img = self.transform(img)

        # Encode loc & conf targets.
        loc_target, conf_target = self.data_encoder.encode(boxes, labels)
        return img, loc_target, conf_target

    def random_flip(self, img, boxes):
        '''Randomly flip the image and adjust the bbox locations.

        For bbox (xmin, ymin, xmax, ymax), the flipped bbox is:
        (w-xmax, ymin, w-xmin, ymax).

        Args:
          img: (ndarray) image.
          boxes: (tensor) bbox locations, sized [#obj, 4]; modified in place
            when the flip happens.

        Returns:
          img: (ndarray) randomly flipped image.
          boxes: (tensor) randomly flipped bbox locations, sized [#obj, 4].
        '''
        if random.random() < 0.5:
            img = cv2.flip(img, 1)
            w = img.shape[1]
            xmin = w - boxes[:, 2]
            xmax = w - boxes[:, 0]
            boxes[:, 0] = xmin
            boxes[:, 2] = xmax
        return img, boxes

    def random_crop(self, img, boxes, labels):
        '''Randomly crop the image and adjust the bbox locations.

        For more details, see 'Chapter2.2: Data augmentation' of the paper.

        Args:
          img: (ndarray) image.
          boxes: (tensor) bbox locations, sized [#obj, 4].
          labels: (tensor) bbox labels, sized [#obj,].

        Returns:
          img: (ndarray) cropped image.
          selected_boxes: (tensor) selected bbox locations.
          labels: (tensor) selected bbox labels.
        '''
        imw, imh = img.shape[1], img.shape[0]
        while True:
            # None means "return the image uncropped".
            min_iou = random.choice([None, 0.1, 0.3, 0.5, 0.7, 0.9])
            if min_iou is None:
                return img, boxes, labels

            for _ in range(100):
                w = random.randrange(int(0.1 * imw), imw)
                h = random.randrange(int(0.1 * imh), imh)

                # Reject degenerate or too elongated windows.
                if h > 2 * w or w > 2 * h or h < 1 or w < 1:
                    continue

                x = random.randrange(imw - w)
                y = random.randrange(imh - h)
                roi = torch.Tensor([[x, y, x + w, y + h]])

                # Keep the boxes whose center lies inside the window.
                center = (boxes[:, :2] + boxes[:, 2:]) / 2  # [N,2]
                roi2 = roi.expand(len(center), 4)  # [N,4]
                mask = (center > roi2[:, :2]) & (center < roi2[:, 2:])  # [N,2]
                mask = mask[:, 0] & mask[:, 1]  # [N,]
                if not mask.any():
                    continue

                selected_boxes = boxes.index_select(0, mask.nonzero().squeeze(1))

                iou = self.data_encoder.iou(selected_boxes, roi)
                if iou.min() < min_iou:
                    continue

                img = img[y:y + h, x:x + w, :]
                selected_boxes[:, 0].add_(-x).clamp_(min=0, max=w)
                selected_boxes[:, 1].add_(-y).clamp_(min=0, max=h)
                selected_boxes[:, 2].add_(-x).clamp_(min=0, max=w)
                selected_boxes[:, 3].add_(-y).clamp_(min=0, max=h)
                return img, selected_boxes, labels[mask]

    def __len__(self):
        return self.num_samples
class ListDataset(data.Dataset):
    '''Dataset pairing every label file under ``root`` with its image.

    The image path is derived from the label path by replacing "labels"
    with "image" and ".txt" with ".jpg".
    '''

    def __init__(self, root, train, transform, input_size):
        '''
        Args:
          root: (str) directory that is globbed for label files.
          train: (boolean) enables random flip and random crop.
          transform: ([transforms]) image transforms.
          input_size: (int) model input size.
        '''
        self.root = root
        self.train = train
        self.transform = transform
        self.input_size = input_size

        self.fnames = []
        self.boxes = []
        self.labels = []

        self.encoder = DataEncoder()

        label_files = glob.glob("%s/*.*" % self.root)
        label_files.sort()
        self._labpath = label_files

        self._imgpath = []
        for label_file in self._labpath:
            image_file = label_file.replace("labels", "image")
            self._imgpath.append(image_file.replace(".txt", ".jpg"))

    def __getitem__(self, index):
        '''Load one image together with its boxes and labels.

        Args:
          index: (int) sample index.

        Returns:
          img: transformed image.
          boxes: (tensor) columns 1..4 of the label file, after augmentation
            and resizing.
          labels: (tensor) column 0 of the label file.
          fname: (str) image file name up to its first dot.
        '''
        img_path = self._imgpath[index].rstrip()
        base_name = img_path.split('/')[-1]
        fname = base_name.split('.')[0]

        img = Image.open(img_path)
        if img.mode != 'RGB':
            img = img.convert('RGB')

        # Each row of the label file is: label followed by four box values.
        label_path = self._labpath[index].rstrip()
        targets = np.loadtxt(label_path).reshape(-1, 5)
        boxes = torch.Tensor(targets[:, 1:])
        labels = torch.LongTensor(targets[:, 0])

        side = self.input_size

        # Augment only while training; resizing happens in both modes.
        if self.train:
            img, boxes = random_flip(img, boxes)
            img, boxes = random_crop(img, boxes)
        img, boxes = resize(img, boxes, (side, side))

        img = self.transform(img)
        return img, boxes, labels, fname

    def collate_fn(self, batch):
        '''Stack the images of a batch and encode their targets.

        Args:
          batch: (list) of (img, boxes, labels, fname) samples.

        Returns:
          stacked images, stacked loc_targets, stacked cls_targets and the
          list of file names.
        '''
        imgs = [sample[0] for sample in batch]
        boxes = [sample[1] for sample in batch]
        labels = [sample[2] for sample in batch]
        fname = [sample[3] for sample in batch]

        h = w = self.input_size
        inputs = torch.zeros(len(imgs), 3, h, w)

        loc_targets = []
        cls_targets = []
        for i, (img, img_boxes, img_labels) in enumerate(
                zip(imgs, boxes, labels)):
            inputs[i] = img
            loc_target, cls_target = self.encoder.encode(
                img_boxes, img_labels, input_size=(w, h))
            loc_targets.append(loc_target)
            cls_targets.append(cls_target)

        stacked_loc = torch.stack(loc_targets)
        stacked_cls = torch.stack(cls_targets)
        return inputs, stacked_loc, stacked_cls, fname

    def __len__(self):
        return len(self._labpath)
from torch.autograd import Variable from encoder import DataEncoder import arcface_loss2 from cosface_loss import MarginCosineProduct cudnn.benchmark = True id_net = Idnet(classnum=2874) id_net = torch.nn.DataParallel(id_net, device_ids=[0]) id_net.load_state_dict(torch.load("./arcface_id_net-data_addition-epoch-20-acc0.pth")) id_net.cuda() #net.load_state_dict(torch.load("./trained model/originalFAN_model.pth")) #net.eval() coder = DataEncoder() detector = dlib.get_frontal_face_detector() predicter_path = "./model/shape_predictor_5_face_landmarks.dat" sp = dlib.shape_predictor(predicter_path) transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.485,0.456,0.406), (0.229,0.224,0.225)) ]) def KFold(n=6000, n_folds=10): folds = [] base = list(range(n)) for i in range(n_folds):
def train():
    """Run the detector training loop driven by the command-line arguments.

    Builds the training set and loader, loads the pretrained weights
    (optionally followed by the ``--resume`` checkpoint), then runs SGD
    until ``iteration`` exceeds ``args.max_iter``. Along the way it:

    * logs losses, a preview image with decoded boxes, the input size and
      the learning rate to TensorBoard every 20 iterations;
    * saves a checkpoint every ``args.save_interval`` iterations;
    * adjusts the learning rate whenever ``iteration`` hits ``stepvalues``;
    * optionally launches ``eval.py`` in a subprocess and logs its score.

    Requires CUDA and ``args.focal_loss``; both are asserted up front.
    """
    args = parse_args()

    assert torch.cuda.is_available(), 'Error: CUDA not found!'
    assert args.focal_loss, "OHEM + ce_loss is not working... :("

    # makedirs(exist_ok=True) handles nested paths and avoids the
    # exists()/mkdir() race of the check-then-create pattern.
    os.makedirs(args.save_folder, exist_ok=True)
    os.makedirs(args.logdir, exist_ok=True)

    def checkpoint_path(step):
        # Join instead of concatenating so the file lands inside
        # save_folder even when the argument has no trailing separator.
        return os.path.join(args.save_folder, 'ckpt_' + repr(step) + '.pth')

    ###########################################################################
    # Data
    ###########################################################################

    print('==> Preparing data..')
    trainset = ListDataset(root='/mnt/9C5E1A4D5E1A2116/datasets/',
                           dataset=args.dataset,
                           train=True,
                           transform=Augmentation_traininig,
                           input_size=args.input_size,
                           multi_scale=args.multi_scale)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              num_workers=args.num_workers,
                                              collate_fn=trainset.collate_fn)

    ###########################################################################
    # Training detail options
    ###########################################################################

    # Iterations at which the learning rate is adjusted.
    stepvalues = (10000, 20000, 30000, 40000, 50000) if args.dataset in ["SynthText"] \
        else (2000, 4000, 6000, 8000, 10000)
    start_epoch = 0  # start from epoch 0 or last epoch
    iteration = 0
    cur_lr = args.lr
    # Used only to undo the input normalisation for the preview image.
    mean = (0.485, 0.456, 0.406)
    std = (0.229, 0.224, 0.225)
    step_index = 0
    pEval = None  # handle of the running evaluation subprocess, if any

    ###########################################################################
    # Model
    ###########################################################################

    # set model (focal_loss vs OHEM_CE loss)
    if args.focal_loss:
        imagenet_pretrain = 'weights/retinanet_se50.pth'
        criterion = FocalLoss()
        num_classes = 1
    else:
        imagenet_pretrain = 'weights/retinanet_se50_OHEM.pth'
        criterion = OHEM_loss()
        num_classes = 2

    net = RetinaNet(num_classes)

    # Restore model weights
    net.load_state_dict(torch.load(imagenet_pretrain))

    if args.resume:
        print('==> Resuming from checkpoint..', args.resume)
        checkpoint = torch.load(args.resume)
        net.load_state_dict(checkpoint['net'])
        # NOTE: only the network weights are restored. The checkpoint also
        # stores 'epoch', 'iteration', 'lr', 'step_index' and 'optimizer',
        # but those are deliberately left at their fresh values here.

    print("multi_scale : ", args.multi_scale)
    print("input_size : ", args.input_size)
    print("stepvalues : ", stepvalues)
    print("start_epoch : ", start_epoch)
    print("iteration : ", iteration)
    print("cur_lr : ", cur_lr)
    print("step_index : ", step_index)
    print("num_gpus : ", torch.cuda.device_count())

    # Data parallelism for multi-gpu training
    net = torch.nn.DataParallel(net,
                                device_ids=range(torch.cuda.device_count()))
    net.cuda()

    # Put model in training mode and freeze batch norm.
    net.train()
    net.module.freeze_bn()  # you must freeze batchnorm

    ###########################################################################
    # Optimizer
    ###########################################################################

    optimizer = optim.SGD(net.parameters(),
                          lr=cur_lr,
                          momentum=0.9,
                          weight_decay=1e-4)

    ###########################################################################
    # Utils
    ###########################################################################

    encoder = DataEncoder()
    writer = SummaryWriter(log_dir=args.logdir)

    ###########################################################################
    # Training loop
    ###########################################################################

    t0 = time.time()

    try:
        for epoch in range(start_epoch, 10000):
            if iteration > args.max_iter:
                break

            for inputs, loc_targets, cls_targets in trainloader:
                inputs = Variable(inputs.cuda())
                loc_targets = Variable(loc_targets.cuda())
                cls_targets = Variable(cls_targets.cuda())

                optimizer.zero_grad()
                loc_preds, cls_preds = net(inputs)

                loc_loss, cls_loss = criterion(loc_preds, loc_targets,
                                               cls_preds, cls_targets)
                loss = loc_loss + cls_loss
                loss.backward()
                optimizer.step()

                if iteration % 20 == 0:
                    t1 = time.time()
                    print(
                        'iter ' + repr(iteration) + ' (epoch ' + repr(epoch) +
                        ') || loss: %.4f || l loc_loss: %.4f || l cls_loss: %.4f (Time : %.1f)'
                        % (loss.sum().item(), loc_loss.sum().item(),
                           cls_loss.sum().item(), (t1 - t0)))

                    writer.add_scalar('loc_loss', loc_loss.sum().item(),
                                      iteration)
                    writer.add_scalar('cls_loss', cls_loss.sum().item(),
                                      iteration)
                    writer.add_scalar('loss', loss.sum().item(), iteration)

                    # show inference image in tensorboard:
                    # CHW -> HWC, undo the normalisation, back to uint8.
                    infer_img = np.transpose(inputs[0].cpu().numpy(),
                                             (1, 2, 0))
                    infer_img *= std
                    infer_img += mean
                    infer_img *= 255.
                    infer_img = np.clip(infer_img, 0, 255)
                    infer_img = infer_img.astype(np.uint8)
                    h, w, _ = infer_img.shape

                    boxes, labels, scores = encoder.decode(
                        loc_preds[0], cls_preds[0], (w, h))
                    # Each box becomes 4 (x, y) points for cv2.polylines.
                    boxes = boxes.reshape(-1, 4, 2).astype(np.int32)

                    if boxes.shape[0] != 0:
                        infer_img = cv2.polylines(infer_img.copy(), boxes,
                                                  True, (0, 255, 0), 4)

                    writer.add_image('image',
                                     infer_img,
                                     iteration,
                                     dataformats="HWC")
                    writer.add_scalar('input_size', h, iteration)
                    writer.add_scalar('learning_rate', cur_lr, iteration)

                    # Restart the timer so the printed time covers the
                    # span since the previous log line.
                    t0 = time.time()

                if iteration % args.save_interval == 0 and iteration > 0:
                    print('Saving state, iter : ', iteration)
                    state = {
                        'net': net.module.state_dict(),
                        "optimizer": optimizer.state_dict(),
                        'iteration': iteration,
                        'epoch': epoch,
                        'lr': cur_lr,
                        'step_index': step_index
                    }
                    model_file = checkpoint_path(iteration)
                    torch.save(state, model_file)

                if iteration in stepvalues:
                    step_index += 1
                    cur_lr = adjust_learning_rate(cur_lr, optimizer,
                                                  args.gamma, step_index)

                if iteration > args.max_iter:
                    break

                if args.evaluation and iteration % args.eval_step == 0:
                    try:
                        if pEval is None:
                            print(
                                "Evaluation started at iteration {} on IC15..."
                                .format(iteration))
                            eval_cmd = "CUDA_VISIBLE_DEVICES=" + str(args.eval_device) + \
                                " python eval.py" + \
                                " --tune_from=" + checkpoint_path(iteration) + \
                                " --input_size=1024" + \
                                " --output_zip=result_temp1"

                            pEval = Popen(eval_cmd,
                                          shell=True,
                                          stdout=PIPE,
                                          stderr=PIPE)

                        elif pEval.poll() is not None:
                            # The previous evaluation has exited: parse the
                            # score out of its stdout and log it.
                            (scorestring, stderrdata) = pEval.communicate()

                            hmean = float(
                                str(scorestring).strip().split(":")[3].split(",")
                                [0].split("}")[0].strip())

                            writer.add_scalar('test_hmean', hmean, iteration)

                            print("test_hmean for {}-th iter : {:.4f}".format(
                                iteration, hmean))

                            if pEval is not None:
                                pEval.kill()
                            pEval = None

                    except Exception as e:
                        # Evaluation is best-effort; never abort training.
                        print("exception happened in evaluation ", e)
                        if pEval is not None:
                            pEval.kill()
                        pEval = None

                iteration += 1
    finally:
        # Flush and release the event file even if training is interrupted.
        writer.close()
img = Image.open(image_path).convert('RGB') img1 = img.resize((300, 300)) transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) ]) img1 = transform(img1) if use_cuda: img1 = img1.cuda() loc, conf = net(Variable(img1[None, :, :, :], volatile=True)) # Forward loc = loc.cpu() conf = conf.cpu() #print(loc, conf) data_encoder = DataEncoder() # Decode boxes, labels, scores = data_encoder.decode( loc.data.squeeze(0), F.softmax(conf.squeeze(0)).data) draw = ImageDraw.Draw(img) #draw.rectangle(list(box), outline='blue') #draw.rectangle(ground_truth_box, outline='blue') fnt = ImageFont.truetype('Pillow/Tests/fonts/FreeMono.ttf', 40) #img.show() for box in boxes: box[::2] *= img.width box[1::2] *= img.height box = list(box) x1_org = image[1] y1_org = image[2] x2_org = image[3]
default='ICDAR2015', type=str, help='evaluation dataset') args = parser.parse_args() net = RetinaNet() net = net.cuda() # load checkpoint checkpoint = torch.load(args.tune_from) net.load_state_dict(checkpoint['net']) net.eval() encoder = DataEncoder(args.cls_thresh, args.nms_thresh) # test image path & list img_dir = "/root/DB/ICDAR2015_Incidental/test/" if args.dataset in [ "ICDAR2015" ] else "/root/DB/ICDAR2013_FOCUSED/test/" val_list = [im for im in os.listdir(img_dir) if "jpg" in im] if not os.path.exists(args.output_zip): os.mkdir(args.output_zip) # save results dir & zip eval_dir = "/root/Detector/ocr_evaluation/code/icdar/4_incidental_scene_text/1_TextLocalization/1_IoU/" if args.dataset in ["ICDAR2015"] \ else "/root/Detector/ocr_evaluation/code/icdar/2_focused_scene_text/1_TextLocalization/1_ICDAR2013/" result_zip = zipfile.ZipFile(eval_dir + args.output_zip, 'w')
num_classes = len(target_classes) net = load_model(num_classes=num_classes, fpn_level=5, basenet=config['params']['base'], is_pretrained_base=False, is_norm_reg_target=config['params']['norm_reg_target'], centerness_with_loc=config['params']['centerness_on_reg'], is_train=False) net = net.to(device) net.eval() data_encoder = DataEncoder( image_size=img_size, num_classes=num_classes + 1, fpn_level=5, is_norm_reg_target=config['params']['norm_reg_target']) ckpt = torch.load(os.path.join(config['model']['exp_path'], 'best.pth'), map_location=device) weights = utils._load_weights(ckpt['net']) missing_keys = net.load_state_dict(weights, strict=False) print(missing_keys) class_idx_map = dict() for idx in range(0, num_classes): class_idx_map[idx + 1] = target_classes[idx] img_paths = list() for (path, _, files) in os.walk(opt.imgs):
class ListDataset(data.Dataset):
    '''Detection dataset built from a directory of per-image label files.

    Every file matched by ``<root>/*.*`` is treated as a label file. The
    matching image path is derived from the label path by replacing
    "labels" with "image" and ".txt" with ".jpg". Images are read with
    OpenCV and augmented with ``SSDAugmentation`` (train) or
    ``BaseTransform`` (test).
    '''

    # Per-channel divisors applied after pixels are divided by 255.
    _CHANNEL_STD = (0.229, 0.224, 0.225)

    def __init__(self, root, train, transform, input_size):
        '''
        Args:
          root: (str) directory that is globbed for label files.
          train: (boolean) train or test.
          transform: ([transforms]) stored, but not applied by __getitem__.
          input_size: (int) model input size.
        '''
        self.root = root
        self.train = train
        self.transform = transform
        self.input_size = input_size

        # Not populated by this class; kept so the attributes still exist.
        self.fnames = []
        self.boxes = []
        self.labels = []

        self.encoder = DataEncoder()

        self._labpath = sorted(glob.glob("%s/*.*" % self.root))
        self._imgpath = [
            path.replace("labels", "image").replace(".txt", ".jpg")
            for path in self._labpath
        ]

    @staticmethod
    def _image_to_tensor(img):
        '''Convert an HWC array with reversed channel order to a CHW tensor.

        The last axis is reordered (2, 1, 0), i.e. BGR -> RGB for an OpenCV
        image, values are divided by 255 and then each channel is divided
        by its entry in ``_CHANNEL_STD``.
        '''
        img = img[:, :, (2, 1, 0)]
        tensor = torch.from_numpy(img).permute(2, 0, 1)
        tensor = tensor / 255
        for channel, std in enumerate(ListDataset._CHANNEL_STD):
            tensor[channel, :, :] = tensor[channel, :, :] / std
        return tensor

    @staticmethod
    def _boxes_to_pixels(boxes, width, height):
        '''Scale box columns in place: 0 and 2 by width, 1 and 3 by height.'''
        boxes[:, 0] = width * boxes[:, 0]
        boxes[:, 1] = height * boxes[:, 1]
        boxes[:, 2] = width * boxes[:, 2]
        boxes[:, 3] = height * boxes[:, 3]
        return boxes

    def __getitem__(self, index):
        '''Load one image together with its boxes and labels.

        Args:
          index: (int) sample index.

        Returns:
          img: (tensor) CHW image tensor.
          boxes: (tensor) boxes scaled to the augmented image size.
          labels: (tensor) class labels.
          fname: (str) image file name up to its first '.'.

        Raises:
          IOError: if OpenCV cannot read the image file.
        '''
        img_path = self._imgpath[index].rstrip()
        fname = img_path.split('/')[-1].split('.')[0]

        img = cv2.imread(img_path)
        # cv2.imread signals failure by returning None instead of raising.
        if img is None:
            raise IOError("Could not read image: %s" % img_path)
        h, w, _ = img.shape

        label_path = self._labpath[index].rstrip()
        # Each row holds 5 numbers: the label followed by 4 box values.
        targets = np.loadtxt(label_path).reshape(-1, 5)
        # Divide the box columns by the original image width / height
        # before handing them to the augmentation.
        targets[:, 1] = (targets[:, 1]) / w
        targets[:, 2] = (targets[:, 2]) / h
        targets[:, 3] = (targets[:, 3]) / w
        targets[:, 4] = (targets[:, 4]) / h

        size = self.input_size
        # Train and test differ only in which augmentation is applied.
        if self.train:
            augmentation = SSDAugmentation(size=size)
        else:
            augmentation = BaseTransform(size=size)
        img, boxe, labels = augmentation(img, targets[:, 1:], targets[:, 0])

        img = self._image_to_tensor(img)
        _, h1, w1 = img.shape
        # Scale the boxes by the size of the augmented image.
        boxe = self._boxes_to_pixels(boxe, w1, h1)

        boxes = torch.Tensor(boxe)
        labels = torch.LongTensor(labels)
        return img, boxes, labels, fname

    def collate_fn(self, batch):
        '''Stack images and encode the targets of a batch.

        Args:
          batch: (list) of (img, boxes, labels, fname) samples.

        Returns:
          inputs: (tensor) images, sized [N, 3, input_size, input_size].
          loc_targets: (tensor) stacked location targets from the encoder.
          cls_targets: (tensor) stacked class targets from the encoder.
          fname: (list) file names, in batch order.
        '''
        imgs = [x[0] for x in batch]
        boxes = [x[1] for x in batch]
        labels = [x[2] for x in batch]
        fname = [x[3] for x in batch]

        h = w = self.input_size
        num_imgs = len(imgs)
        inputs = torch.zeros(num_imgs, 3, h, w)

        loc_targets = []
        cls_targets = []
        for i in range(num_imgs):
            inputs[i] = imgs[i]
            loc_target, cls_target = self.encoder.encode(boxes[i],
                                                         labels[i],
                                                         input_size=(w, h))
            loc_targets.append(loc_target)
            cls_targets.append(cls_target)
        return inputs, torch.stack(loc_targets), torch.stack(
            cls_targets), fname

    def __len__(self):
        '''Number of label files found under ``root``.'''
        return len(self._labpath)