def test_dataset_create():
    """Create a test dataset from files under 'datasrc' and register it.

    Reads request params (ratio, name, task_id, description), samples
    ``ratio`` of the labelled images for the given task, parses their
    labels and registers the result via ``storage.register_test_dataset``.

    Returns:
        dict: {'id': registered test dataset id,
               'test_data': {'img': [...], 'target': [...]},
               'class_map': label mapping for the task}
    """
    req_params = request.params  # Receive params here.
    ratio = float(req_params.ratio)
    dataset_name = str(req_params.name)
    task_id = int(req_params.task_id)
    description = str(req_params.description)

    root = pathlib.Path('datasrc')
    img_dir = root / 'img'
    label_dir = root / 'label'

    assert img_dir.exists(), \
        "The directory 'datasrc/img' is not found in current working directory."
    # Fixed message: this assert checks 'datasrc/label', not 'datasrc/label/detection'.
    assert label_dir.exists(), \
        "The directory 'datasrc/label' is not found in current working directory."

    file_names = [name.relative_to(img_dir) for name in img_dir.iterdir()
                  if name.is_file()]

    # Initialize outputs up front so task branches that fill nothing
    # (e.g. SEGMENTATION, not implemented yet) don't raise NameError below.
    img_files = []
    parsed_target = []
    class_map = {}

    if task_id == Task.CLASSIFICATION.value:
        classification_label_dir = DATASET_LABEL_CLASSIFICATION_DIR
        # target maps file name -> label; class_map describes the labels.
        target, class_map = parse_txt_classification(
            str(classification_label_dir / "target.txt"))
        target_file_list = list(target.keys())
        file_names = [p for p in file_names
                      if (img_dir / p).is_file() and (p.name in target_file_list)]
        # Randomly keep `ratio` of the labelled images.
        n_imgs = len(file_names)
        perm = np.random.permutation(n_imgs)
        file_names = [file_names[index] for index in perm[:int(n_imgs * ratio)]]
        img_files = [str(img_dir / name) for name in file_names]
        parsed_target = [target[name.name] for name in file_names]
    elif task_id == Task.DETECTION.value:
        detection_label_dir = DATASET_LABEL_DETECTION_DIR
        # Keep only images that have a matching xml annotation file.
        file_names = [p for p in file_names
                      if (img_dir / p).is_file() and
                      ((detection_label_dir / p.name).with_suffix(".xml")).is_file()]
        n_imgs = len(file_names)
        perm = np.random.permutation(n_imgs)
        file_names = [file_names[index] for index in perm[:int(n_imgs * ratio)]]
        img_files = [str(img_dir / name) for name in file_names]
        xml_files = [str(detection_label_dir / name.with_suffix('.xml'))
                     for name in file_names]
        parsed_target, class_map = parse_xml_detection(xml_files, num_thread=8)
    elif task_id == Task.SEGMENTATION.value:
        # TODO: segmentation test datasets are not supported yet.
        pass

    test_data = {
        "img": img_files,
        "target": parsed_target,
    }
    test_dataset_id = storage.register_test_dataset(
        task_id, dataset_name, description, test_data)
    return {
        'id': test_dataset_id,
        'test_data': test_data,
        'class_map': class_map,
    }
def create_dist(self, filename_list, train=True):
    """Build an ImageDistributor from a list of data file names.

    Image file name and label file name must match (same stem). Pairs
    missing either the image or the xml label are skipped with a console
    message.

    Data formats are below.

    .. code-block :: python

        # image path list
        [path_to_img1, path_to_img2, ...]

        # annotation list
        [
            [   # Annotations of each image.
                {"box": [x, y, w, h], "name": "dog", "class": 1},
                {"box": [x, y, w, h], "name": "cat", "class": 0},
            ],
            [
                {"box": [x, y, w, h], "name": "cat", "class": 0},
            ],
            ...
        ]

    Args:
        filename_list(list): [filename1, filename2, ...]
        train(bool): If it's true, augmentation will be added to distributor.

    Returns:
        (ImageDistributor): ImageDistributor object with augmentation.
    """
    images = []
    labels = []
    for fname in filename_list:
        stem = os.path.splitext(fname)[0]
        image_file = os.path.join(DATASRC_IMG, fname)
        label_file = os.path.join(DATASRC_LABEL, stem + ".xml")
        # Skip (and report) entries missing either side of the pair.
        if not (os.path.exists(image_file) and os.path.exists(label_file)):
            print("{} not found.".format(stem))
            continue
        images.append(image_file)
        labels.append(label_file)

    annotations, _ = parse_xml_detection(labels)

    if not train:
        return ImageDistributor(images, annotations)

    # Training distributors get the standard augmentation pipeline.
    aug = Augmentation([
        Shift(min(self.imsize[0] // 10, 20), min(self.imsize[1] // 10, 20)),
        Flip(),
        Rotate(),
        WhiteNoise(),
        ContrastNorm([0.5, 1.0])
    ])
    return ImageDistributor(images, annotations, augmentation=aug)
def test_detection_model_implementation(algo):
    """Verify that a detection model class implements the required API.

    Checks, in order:
      1. The class can be instantiated with no arguments (required so a
         fresh model can be created before loading trained weights).
      2. Required methods exist with the expected argument names, order,
         and default-value types.
      3. Required attributes appear in ``algo.SERIALIZED``.
      4. ``fit`` and ``predict`` run on a tiny 2-image dataset.

    Args:
        algo: Detection model class (e.g. Yolov2, SSD) to check.

    Raises:
        Exception: If the model cannot be instantiated without arguments.
        ValueError: If a required argument is missing from a method.
        AssertionError: If a method/attribute is missing, a default has the
            wrong type, or arguments are in the wrong order.
    """
    # 1. Check if the model can be instantiated giving nothing.
    try:
        model = algo()
    except Exception as e:
        # Models must be constructible with no arguments for use with
        # trained weights; chain the original failure for debugging.
        raise Exception("Model must be initializable without any argument.") from e

    methods = {k: v for k, v in inspect.getmembers(model)
               if inspect.ismethod(v)}

    # 2. Check function names and their arguments.
    # Plain strings are positional arguments (order-checked); [name, type]
    # pairs are keyword arguments whose default value's type must match.
    method_list = {
        "__init__": [["class_map", type(None)], ["imsize", tuple],
                     ["load_pretrained_weight", bool],
                     ["train_whole_network", bool]],
        "loss": ["x", "y"],
        "fit": [
            "train_img_path_list",
            "train_annotation_list",
            ["valid_img_path_list", type(None)],
            ["valid_annotation_list", type(None)],
            ["epoch", int],
            ["batch_size", int],
            ["augmentation", type(None)],
            ["callback_end_epoch", type(None)]
        ],
        "get_bbox": ["z"],
        "predict": [
            "img_list", ["batch_size", type(1)],
            "score_threshold", "nms_threshold"
        ],
        "get_optimizer": [
            ["current_loss", type(None)],
            ["current_epoch", type(None)],
            ["total_epoch", type(None)],
            ["current_batch", type(None)],
            ["total_batch", type(None)],
            ["avg_valid_loss_list", type(None)],
        ],
        "preprocess": ["x"],
        "build_data": [],
        "regularize": [],
    }

    for k, v in method_list.items():
        last_checked_index = -1
        assert k in methods
        # inspect.getargspec was deprecated and removed in Python 3.11;
        # getfullargspec exposes the same args/defaults fields.
        args = inspect.getfullargspec(getattr(model, k))
        for i, a in enumerate(v):
            if isinstance(a, list):
                try:
                    index = args.args.index(a[0])
                except ValueError:
                    raise ValueError(
                        "Argument '{}' is not implemented.".format(a[0]))
                # defaults align with the tail of args; map index into defaults.
                assert a[1] == type(args.defaults[index - (len(args.args) - len(args.defaults))]), \
                    "Default argument type miss matched."
            else:
                try:
                    index = args.args.index(a)
                except ValueError:
                    raise ValueError(
                        "Argument '{}' is not implemented.".format(a))
                assert index > last_checked_index, \
                    "The order of arguments are not correct in {}.".format(k)
                last_checked_index = index

    # 3. Check serializable attributes.
    serializables = ["class_map", "imsize", "num_class"]
    for s in serializables:
        assert s in algo.SERIALIZED

    # 4. Check fit function on a tiny 2-image dataset.
    test_imgs = [
        "voc.jpg",
        "voc.jpg",
    ]
    test_xmls = ["voc.xml", "voc.xml"]
    test_annotation, class_map = parse_xml_detection(test_xmls)
    if algo is Yolov2:
        # Yolo needs anchor.
        model = algo(class_map, anchor=AnchorYolov2([[0.2, 0.3]], (224, 224)))
    else:
        model = algo(class_map)
    model.fit(test_imgs, test_annotation, test_imgs, test_annotation,
              batch_size=2, epoch=2)
    # Predict
    model.predict(test_imgs)
for p in sorted(os.listdir(img_path))[:8 * 100] ] lbl_list = [ os.path.join(label_path, p) for p in sorted(os.listdir(label_path))[:8 * 100] ] valid_img_list = [ os.path.join(img_path, p) for p in sorted(os.listdir(img_path))[8 * 300:10 * 300] ] valid_lbl_list = [ os.path.join(label_path, p) for p in sorted(os.listdir(label_path))[8 * 300:10 * 300] ] annotation_list, class_map = parse_xml_detection(lbl_list) valid_annotation_list, _ = parse_xml_detection(valid_lbl_list) def callback(epoch, model, train_loss, valid_loss): plt.clf() path = random.choice(valid_img_list) plt.imshow(draw_box(path, model.predict(path))) plt.tight_layout() plt.savefig("img%04d.png" % (epoch)) ssd = SSD(class_map, load_pretrained_weight=True, train_whole_network=False) ssd.fit(img_list, annotation_list, valid_img_list,
def load_dataset_split_detail():
    """Split the 'datasrc' detection images into train/valid and report stats.

    Reads request params (name, ratio, u_id, description, delete_id),
    splits the labelled images into train/valid sets by ``ratio``, counts
    annotated class tags on each side, caches the split in
    ``confirm_dataset`` keyed by client id, and returns a JSON response.

    Returns:
        Response: JSON body with the split summary on success, or with
        an ``error_msg`` field on failure.
    """
    try:
        datasrc = pathlib.Path(DATASRC_DIR)
        imgdir = datasrc / pathlib.Path("img")
        xmldir = datasrc / pathlib.Path("label")

        name = urllib.parse.unquote(request.params.name, encoding='utf-8')
        ratio = float(request.params.ratio)
        client_id = request.params.u_id
        description = urllib.parse.unquote(request.params.description,
                                           encoding='utf-8')

        # If this is a re-confirmation (2nd time), drop the previously
        # cached split for that id.
        if request.params.delete_id:
            del confirm_dataset[request.params.delete_id]

        # Evict the oldest cached split to bound memory use.
        if len(confirm_dataset) > 100:
            confirm_dataset.popitem(False)

        # Search image files; keep only images that have an xml label.
        imgs = (p.relative_to(imgdir) for p in imgdir.iterdir() if p.is_file())
        imgs = set([
            img for img in imgs
            if (xmldir / img).with_suffix('.xml').is_file()
        ])
        assert len(
            imgs
        ) > 0, "Image not found in directory. Please set images to 'datasrc/img' directory and xml files to 'datasrc/label' directory."

        # Split files into trains and validations.
        # random.sample requires a sequence; sampling directly from a set
        # raises TypeError on Python 3.11+, so materialize a list first.
        n_imgs = len(imgs)
        trains = set(random.sample(list(imgs), int(ratio * n_imgs)))
        valids = imgs - trains

        # Build filename lists of train/valid images.
        train_imgs = [str(img) for img in trains]
        valid_imgs = [str(img) for img in valids]

        # NOTE(review): this permutation is independent of the
        # trains/valids split above, so the annotations parsed below may
        # not correspond to the images in train_imgs/valid_imgs. They are
        # only used for aggregate tag counts -- confirm this is intended.
        perm = np.random.permutation(int(n_imgs))
        perm_train, perm_valid = np.split(perm, [int(n_imgs * ratio)])
        imgs = list(imgs)
        parsed_train_img_names = [
            str(imgs[idx]).split('.')[0] for idx in perm_train
        ]
        parsed_valid_img_names = [
            str(imgs[idx]).split('.')[0] for idx in perm_valid
        ]

        parsed_train, train_class_map = parse_xml_detection([
            str(path) for path in xmldir.iterdir()
            if str(path).split('/')[-1].split('.')[0] in parsed_train_img_names
        ], num_thread=8)
        parsed_valid, valid_class_map = parse_xml_detection([
            str(path) for path in xmldir.iterdir()
            if str(path).split('/')[-1].split('.')[0] in parsed_valid_img_names
        ], num_thread=8)

        # Insert detailed informations.
        train_num = len(train_imgs)
        valid_num = len(valid_imgs)
        class_tag_list = []
        # Count, per class name, how many images have that class as their
        # FIRST annotated object.
        # NOTE(review): only annotation [0] of each image is inspected --
        # confirm whether all objects should be counted instead.
        train_tag_count = {}
        for i in range(len(parsed_train)):
            for j in range(len(train_class_map)):
                if parsed_train[i][0].get('name') == train_class_map[j]:
                    if train_class_map[j] not in train_tag_count:
                        train_tag_count[train_class_map[j]] = 1
                    else:
                        train_tag_count[train_class_map[j]] += 1
        valid_tag_count = {}
        for i in range(len(parsed_valid)):
            for j in range(len(valid_class_map)):
                if parsed_valid[i][0].get('name') == valid_class_map[j]:
                    if valid_class_map[j] not in valid_tag_count:
                        valid_tag_count[valid_class_map[j]] = 1
                    else:
                        valid_tag_count[valid_class_map[j]] += 1
        for tags in sorted(train_tag_count.keys()):
            class_tag_list.append({
                "tags": tags,
                "train": train_tag_count.get(tags),
                "valid": valid_tag_count.get(tags)
            })

        # Save datasplit setting so a later request can materialize it.
        confirm_dataset[client_id] = {
            "name": name,
            "ratio": ratio,
            "description": description,
            "train_imgs": train_imgs,
            "valid_imgs": valid_imgs,
            "class_maps": train_class_map,
            "class_tag_list": class_tag_list
        }
        body = json.dumps({
            "total": n_imgs,
            "id": client_id,
            "description": description,
            "train_image_num": train_num,
            "valid_image_num": valid_num,
            "class_tag_list": class_tag_list,
            "train_imgs": train_imgs,
            "valid_imgs": valid_imgs,
        })
        return create_response(body)
    except Exception as e:
        traceback.print_exc()
        body = json.dumps({"error_msg": e.args[0]})
        return create_response(body)
def test_dataset_confirm():
    """Create and register a test dataset, returning summary statistics.

    Reads request params (ratio, name, task_id, description), samples
    ``ratio`` of the labelled images for the given task, registers the
    result via ``storage.register_test_dataset`` and returns per-class
    statistics for client-side confirmation.

    Returns:
        dict: class map, image counts and class/test ratios of the
        registered test dataset.
    """
    req_params = request.params  # Receive params here.
    ratio = float(req_params.ratio)
    # urllib.parse.unquote handles multi-byte (URL-encoded) input.
    dataset_name = str(urllib.parse.unquote(req_params.name, encoding='utf-8'))
    task_id = int(req_params.task_id)
    description = str(urllib.parse.unquote(req_params.description,
                                           encoding='utf-8'))

    root = pathlib.Path('datasrc')
    img_dir = root / 'img'
    label_dir = root / 'label'

    assert img_dir.exists(), \
        "The directory 'datasrc/img' is not found in current working directory."
    # Fixed message: this assert checks 'datasrc/label', not 'datasrc/label/detection'.
    assert label_dir.exists(), \
        "The directory 'datasrc/label' is not found in current working directory."

    file_names = [name.relative_to(img_dir) for name in img_dir.iterdir()
                  if name.is_file()]

    # Initialize ALL outputs up front so task branches that fill nothing
    # (e.g. SEGMENTATION, not implemented yet) don't raise NameError, and
    # so test_tag_num.tolist() below works for every task (the original
    # plain-list init broke non-detection tasks with AttributeError).
    class_map = {}
    n_imgs = 0
    img_files = []
    parsed_target = []
    test_tag_num = np.array([], dtype=int)

    if task_id == Task.CLASSIFICATION.value:
        classification_label_dir = DATASET_LABEL_CLASSIFICATION_DIR
        # target maps file name -> label; class_map describes the labels.
        target, class_map = parse_txt_classification(
            str(classification_label_dir / "target.txt"))
        target_file_list = list(target.keys())
        file_names = [p for p in file_names
                      if (img_dir / p).is_file() and (p.name in target_file_list)]
        # Randomly keep `ratio` of the labelled images.
        n_imgs = len(file_names)
        perm = np.random.permutation(n_imgs)
        file_names = [file_names[index] for index in perm[:int(n_imgs * ratio)]]
        img_files = [str(img_dir / name) for name in file_names]
        parsed_target = [target[name.name] for name in file_names]
    elif task_id == Task.DETECTION.value:
        detection_label_dir = DATASET_LABEL_DETECTION_DIR
        # Keep only images that have a matching xml annotation file.
        file_names = [p for p in file_names
                      if (img_dir / p).is_file() and
                      ((detection_label_dir / p.name).with_suffix(".xml")).is_file()]
        n_imgs = len(file_names)
        perm = np.random.permutation(n_imgs)
        file_names = [file_names[index] for index in perm[:int(n_imgs * ratio)]]
        img_files = [str(img_dir / name) for name in file_names]
        xml_files = [str(detection_label_dir / name.with_suffix('.xml'))
                     for name in file_names]
        parsed_target, class_map = parse_xml_detection(xml_files, num_thread=8)
    elif task_id == Task.SEGMENTATION.value:
        # TODO: segmentation test datasets are not supported yet.
        pass

    test_data = {
        "img": img_files,
        "target": parsed_target,
    }
    test_dataset_id = storage.register_test_dataset(
        task_id, dataset_name, description, test_data)

    if task_id == Task.DETECTION.value:
        # Histogram over the first annotated object's class of each image.
        # NOTE(review): only annotation [0] is counted per image -- confirm
        # whether every object should be included.
        test_tag_list = []
        for i in range(len(parsed_target)):
            test_tag_list.append(parsed_target[i][0].get('class'))
        test_tag_num, _ = np.histogram(
            test_tag_list, bins=list(range(len(class_map) + 1)))

    class_info = {
        "test_dataset_name": dataset_name,
        "class_map": class_map,
        "other_imgs": (n_imgs - len(img_files)),
        "test_imgs": len(img_files),
        "class_ratio": test_tag_num.tolist(),
        "test_ratio": ratio,
    }
    return class_info
def dataset_confirm():
    """Build a train/valid dataset split from 'datasrc' and cache it.

    Reads request params (hash, ratio, name, test_dataset_id, task_id,
    description), collects labelled images for the given task, splits
    them into train/valid by ``ratio``, computes per-class statistics and
    stores the full dataset under ``temp_dataset[dataset_hash]``.

    Returns:
        dict: The dataset description sent back to the client -- same as
        the cached entry but without the heavy 'train_data' payload.
    """
    global temp_dataset
    req_params = request.params
    dataset_hash = req_params.hash
    # Receive params here.
    ratio = float(req_params.ratio)
    # Correspondence for multi byte input data
    dataset_name = str(urllib.parse.unquote(req_params.name, encoding='utf-8'))
    # Empty string means "no test dataset"; -1 is the sentinel id.
    test_dataset_id = int(
        req_params.test_dataset_id if req_params.test_dataset_id != '' else '-1')
    task_id = int(req_params.task_id)
    description = str(urllib.parse.unquote(req_params.description,
                                           encoding='utf-8'))

    assert len(dataset_name) <= DATASET_NAME_MAX_LENGTH, \
        "Dataset name is too long. Please set the name length <= {}".format(DATASET_NAME_MAX_LENGTH)
    assert len(description) <= DATASET_DESCRIPTION_MAX_LENGTH, \
        "Dataset description is too long. Please set the description length <= {}".format(
            DATASET_DESCRIPTION_MAX_LENGTH)

    root = pathlib.Path('datasrc')
    img_dir = root / 'img'
    # NOTE(review): label_dir is built but never checked or used below.
    label_dir = root / 'label'

    assert img_dir.exists(), \
        "The directory 'datasrc/img' is\ not found in current working directory."

    file_names = set([name.relative_to(img_dir)
                      for name in img_dir.iterdir() if name.is_file()])

    if test_dataset_id > 0:
        # Exclude images already used by the selected test dataset.
        test_dataset = storage.fetch_test_dataset(test_dataset_id)
        test_dataset = set([pathlib.Path(test_path).relative_to(img_dir)
                            for test_path in test_dataset['data']['img']])
        # Remove test files.
        file_names = file_names - test_dataset

    if task_id == Task.CLASSIFICATION.value:
        # Load data for Classification
        # Checks
        # 1. The class name file existence and format.
        classification_label_dir = DATASET_LABEL_CLASSIFICATION_DIR
        class_labe_path = classification_label_dir / "target.txt"
        assert classification_label_dir.exists(), \
            "target.txt was not found in the directory {}".format(str(classification_label_dir))
        # target maps file name -> label; class_map describes the labels.
        target, class_map = parse_txt_classification(str(class_labe_path))
        target_file_list = list(target.keys())
        file_names = [p for p in file_names
                      if (img_dir / p).is_file() and (p.name in target_file_list)]
        img_files = [str(img_dir / name) for name in file_names]
        parsed_target = [target[name.name] for name in file_names]
    elif task_id == Task.DETECTION.value:
        # Keep only images that have a matching xml annotation file.
        detection_label_dir = DATASET_LABEL_DETECTION_DIR
        file_names = [p for p in file_names
                      if (img_dir / p).is_file() and
                      ((detection_label_dir / p.name).with_suffix(".xml")).is_file()]
        img_files = [str(img_dir / name) for name in file_names]
        xml_files = [str(detection_label_dir / name.with_suffix('.xml'))
                     for name in file_names]
        parsed_target, class_map = parse_xml_detection(xml_files, num_thread=8)
    elif task_id == Task.SEGMENTATION.value:
        # Keep only images that have a .jpg or .png label image.
        segmentation_label_dir = DATASET_LABEL_SEGMENTATION_DIR
        file_names = [p for p in file_names
                      if (img_dir / p).is_file() and
                      any([((segmentation_label_dir / p.name).with_suffix(suf)).is_file()
                           for suf in [".jpg", ".png"]])]
        img_files = [str(img_dir / name) for name in file_names]
        # NOTE(review): labels are always referenced with a .png suffix even
        # though .jpg labels pass the filter above -- confirm intent.
        parsed_target = [str(segmentation_label_dir / name.with_suffix(".png"))
                         for name in file_names]
        class_map = parse_classmap_file(str(segmentation_label_dir / "class_map.txt"))

    # Split into train and valid.
    n_imgs = len(file_names)
    perm = np.random.permutation(n_imgs)
    train_img, valid_img = np.split(
        np.array([img_files[index] for index in perm]), [int(ratio * n_imgs)])
    train_img = train_img.tolist()
    valid_img = valid_img.tolist()
    # Validation image sizes are returned to the client alongside targets.
    valid_img_size = [list(Image.open(i).size) for i in valid_img]
    # Apply the SAME permutation to the targets so pairs stay aligned.
    train_target, valid_target = np.split(
        np.array([parsed_target[index] for index in perm]), [int(ratio * n_imgs)])
    train_target = train_target.tolist()
    valid_target = valid_target.tolist()

    # Load test Dataset if exists.
    # NOTE(review): the fetched test_dataset is unused and both branches
    # yield the same empty test_ratio -- looks like unfinished code.
    if test_dataset_id > 0:
        test_dataset = storage.fetch_test_dataset(test_dataset_id)
        test_ratio = []
    else:
        test_ratio = []

    # Dataset Information: per-class tag histograms for each split.
    if task_id == Task.CLASSIFICATION.value:
        train_tag_num, _ = np.histogram(train_target,
                                        bins=list(range(len(class_map) + 1)))
        valid_tag_num, _ = np.histogram(valid_target,
                                        bins=list(range(len(class_map) + 1)))
    elif task_id == Task.DETECTION.value:
        # Collect the class id of every annotated object in each split.
        train_tag_list = []
        valid_tag_list = []
        for i in range(len(train_target)):
            for j in range(len(train_target[i])):
                train_tag_list.append(train_target[i][j].get('class'))
        for i in range(len(valid_target)):
            for j in range(len(valid_target[i])):
                valid_tag_list.append(valid_target[i][j].get('class'))
        train_tag_num, _ = np.histogram(train_tag_list,
                                        bins=list(range(len(class_map) + 1)))
        valid_tag_num, _ = np.histogram(valid_tag_list,
                                        bins=list(range(len(class_map) + 1)))
    elif task_id == Task.SEGMENTATION.value:
        # Per-class pixel counts computed from label images (8 threads).
        train_tag_num = parse_image_segmentation(train_target, len(class_map), 8)
        valid_tag_num = parse_image_segmentation(valid_target, len(class_map), 8)

    class_info = {
        "class_map": class_map,
        "class_ratio": ((train_tag_num + valid_tag_num) / np.sum(train_tag_num + valid_tag_num)).tolist(),
        "train_ratio": (train_tag_num / (train_tag_num + valid_tag_num)).tolist(),
        "valid_ratio": (valid_tag_num / (train_tag_num + valid_tag_num)).tolist(),
        "test_ratio": test_ratio,
        "train_img_num": len(train_img),
        "valid_img_num": len(valid_img),
        # NOTE(review): hard-coded placeholder value -- confirm intent.
        "test_img_num": 1,
    }

    # Register
    train_data = {
        'img': train_img,
        'target': train_target
    }
    valid_data = {
        'img': valid_img,
        'target': valid_target,
        'size': valid_img_size,
    }
    dataset = {
        "task_id": task_id,
        "dataset_name": dataset_name,
        "description": description,
        "ratio": ratio,
        "train_data": train_data,
        "valid_data": valid_data,
        "class_map": class_map,
        "test_dataset_id": test_dataset_id,
        "class_info": class_info
    }
    # Cache the full dataset under the client-supplied hash.
    temp_dataset[dataset_hash] = dataset

    # Client doesn't need 'train_data'
    return_dataset = {
        "task_id": task_id,
        "dataset_name": dataset_name,
        "description": description,
        "ratio": ratio,
        # "train_data": train_data,
        "valid_data": valid_data,
        "class_map": class_map,
        "test_dataset_id": test_dataset_id,
        "class_info": class_info
    }
    return return_dataset
def fetch_detection_dataset_voc_2007(split_validation=True):
    """Download (if necessary) and parse the Pascal VOC 2007 detection dataset.

    Args:
        split_validation (boolean): Whether or not split validation data.

    Returns:
        If ``split_validation`` is True:
            (list): Annotation list of training data.
            (list): Image path list of training data.
            (list): Annotation list of validation data.
            (list): Image path list of validation data.
        Otherwise:
            (list): Annotation list of all data.
            (list): Image path list of all data.

        Each annotation is a list of dictionaries, one per object, with
        keys 'box' and 'name'. The structure is below.

        .. code-block :: python

            [
                [ # Objects of 1st image.
                    {'box': [x(float), y, w, h], 'name': class_name(string),
                     'size': (x, y)(list of float), 'class': id(int)},
                    ...
                ],
                [ # Objects of 2nd image.
                    ...
                ]
            ]
    """
    voc_2007_url = "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar"
    voc_2007_tar = "VOCtrainval_06-Nov-2007.tar"
    image_voc_2007 = "VOCdevkit/VOC2007/JPEGImages/"
    label_voc_2007 = "VOCdevkit/VOC2007/Annotations/"

    # Download and unpack only when not already present on disk.
    if not os.path.exists("VOCdevkit/VOC2007"):
        if not os.path.exists(voc_2007_tar):
            download(voc_2007_url)
        with tarfile.open(voc_2007_tar) as tar:
            tar.extractall()

    # Read the official train/val image id lists. Use context managers so
    # the file handles are closed deterministically (the original left
    # them open until garbage collection).
    with open("VOCdevkit/VOC2007/ImageSets/Main/train.txt") as f:
        train_voc_2007 = [line.strip() for line in f.readlines()]
    with open("VOCdevkit/VOC2007/ImageSets/Main/val.txt") as f:
        valid_voc_2007 = [line.strip() for line in f.readlines()]

    train_image_path_list = []
    train_label_path_list = []
    valid_image_path_list = []
    valid_label_path_list = []

    # Use training dataset of VOC2007 as training data.
    for path in train_voc_2007:
        train_image_path_list.append(
            os.path.join(image_voc_2007, path + '.jpg'))
        train_label_path_list.append(
            os.path.join(label_voc_2007, path + '.xml'))

    # Use validation dataset of VOC2007 as validation data.
    for path in valid_voc_2007:
        valid_image_path_list.append(
            os.path.join(image_voc_2007, path + '.jpg'))
        valid_label_path_list.append(
            os.path.join(label_voc_2007, path + '.xml'))

    if split_validation:
        train_annotation_list, _ = parse_xml_detection(train_label_path_list)
        valid_annotation_list, _ = parse_xml_detection(valid_label_path_list)
        return train_annotation_list, train_image_path_list, \
            valid_annotation_list, valid_image_path_list
    else:
        # Merge train and validation into a single dataset.
        train_label_path_list.extend(valid_label_path_list)
        train_image_path_list.extend(valid_image_path_list)
        annotation_list, _ = parse_xml_detection(train_label_path_list)
        return annotation_list, train_image_path_list
def fetch_detection_dataset_pets(split_validation=True, test_size=0.2):
    """Download (if necessary) and parse the Oxford-IIIT Pets detection dataset.

    Args:
        split_validation (boolean): Whether or not split validation data.
        test_size (float): Proportion of the validation split when
            ``split_validation`` is True.

    Returns:
        If ``split_validation`` is False:
            (list): Image path list.
            (list): Annotation list.
        Otherwise:
            (list): Image path list of training data.
            (list): Annotation list of training data.
            (list): Image path list of validation data.
            (list): Annotation list of validation data.

        Each annotation is a list of dictionaries, one per object, with
        keys 'box' and 'name'. The structure is below.

        .. code-block :: python

            [
                [ # Objects of 1st image.
                    {'box': [x(float), y, w, h], 'name': class_name(string),
                     'size': (x, y)(list of float), 'class': id(int)},
                    ...
                ],
                [ # Objects of 2nd image.
                    ...
                ]
            ]
    """
    pets_image_url = "http://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz"
    pets_label_url = "http://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz"
    pets_image_tar = "images.tar.gz"
    pets_label_tar = "annotations.tar.gz"
    setting = {
        "image": [pets_image_url, pets_image_tar],
        "label": [pets_label_url, pets_label_tar]
    }

    for url, path in setting.values():
        if not os.path.exists(path):
            download(url)
        with tarfile.open(path) as tar:
            tar.extractall(path="pets")

    # Keep only names that have BOTH an image and an xml annotation.
    xml_list = os.listdir("pets/annotations/xmls")
    xml_name_list = [os.path.splitext(name)[0] for name in xml_list]
    image_path_list = os.listdir("pets/images")
    image_name_list = [os.path.splitext(name)[0] for name in image_path_list]
    name_list = list(set(image_name_list) & set(xml_name_list))

    xml_list = [
        os.path.join("pets/annotations/xmls", name + ".xml")
        for name in name_list
    ]
    annotation_list, _ = parse_xml_detection(xml_list)
    image_path_list = [
        os.path.join("pets/images", name + ".jpg") for name in name_list
    ]

    if not split_validation:
        return image_path_list, annotation_list

    image_path_list = np.array(image_path_list)
    # dtype=object: per-image annotation lists can have different lengths,
    # and NumPy >= 1.24 refuses to build ragged arrays implicitly.
    annotation_list = np.array(annotation_list, dtype=object)
    # Fixed off-by-one: the original permuted shape[0] - 1 indices, which
    # silently dropped one sample from every split.
    indices = np.random.permutation(image_path_list.shape[0])
    threshold = int(np.round(test_size * image_path_list.shape[0]))
    train_index, test_index = indices[threshold:], indices[:threshold]
    train_annotation_list = annotation_list[train_index]
    valid_annotation_list = annotation_list[test_index]
    train_image_path_list = image_path_list[train_index]
    valid_image_path_list = image_path_list[test_index]
    return list(train_image_path_list), list(train_annotation_list), list(
        valid_image_path_list), list(valid_annotation_list)