def _extract_file(file_path: pathlib.Path) -> pathlib.Path:
    """Extract ``file_path`` and return the folder it was extracted into."""
    save_folder = file_path.parent
    if file_path.suffix not in ['.zip', '.tar', '.gz', '.tgz']:
        return save_folder
    reader = core.create_reader(file_path)
    # Only extract entries that are not already present in the save folder.
    compressed_files = set(reader.list_files())
    existed_files = set(core.create_reader(save_folder).list_files())
    uncompressed_files = compressed_files.difference(existed_files)
    if len(uncompressed_files):
        logging.info(
            f'Extracting {str(file_path)} to {str(save_folder.resolve())}')
        for p in tqdm.tqdm(uncompressed_files):
            out = save_folder / p
            if not out.parent.exists():
                out.parent.mkdir(parents=True)
            with out.open('wb') as f:
                f.write(reader.open(p).read())
    return save_folder
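# Usage sketch for _extract_file (the archive path below is hypothetical, and the
# package's `core` module plus a local archive are required, so this is kept as a
# comment rather than executable code). Extraction is effectively idempotent:
# entries already present next to the archive are skipped on a repeat call.
#
#   archive = pathlib.Path('data/some-download.zip')
#   folder = _extract_file(archive)   # extracts into archive.parent
#   folder = _extract_file(archive)   # second call skips files already on disk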
def stanford_dogs():
    reader = core.create_reader(
        'https://www.kaggle.com/jessicali9530/stanford-dogs-dataset')
    images = reader.list_images()
    entries = []
    for img in images:
        xml_fp = 'annotations/Annotation/' + img.parent.name + '/' + img.stem
        for label in object_detection.parse_voc_annotation(reader.open(xml_fp)):
            label.file_path = str(img)
            entries.append(label)
    return Dataset(pd.DataFrame(entries), reader)
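# Illustrative sketch (not part of the dataset code): in the Kaggle Stanford Dogs
# archive each image has a matching VOC-style XML stored without a file extension,
# which is what the path arithmetic above relies on. The breed folder and file
# name below are example values, and this helper is never called by the library.
def _demo_stanford_dogs_annotation_path():
    img = pathlib.Path('images/Images/n02085620-Chihuahua/n02085620_7.jpg')
    xml_fp = 'annotations/Annotation/' + img.parent.name + '/' + img.stem
    assert xml_fp == 'annotations/Annotation/n02085620-Chihuahua/n02085620_7'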
@classmethod
def from_label_func(cls, data_path: Union[str, Sequence[str]],
                    label_func: Callable[[pathlib.Path], Optional[pathlib.Path]],
                    pixel_to_class_func: Callable[[core.Reader], Dict[Sequence[int], str]]):
    reader = core.create_reader(data_path)
    all_image_paths = reader.list_images()
    pairs = []
    for p in all_image_paths:
        label_file_path = label_func(p)
        if label_file_path:
            pairs.append({'file_path': p, 'label_file_path': label_file_path})
    pixel_to_class = pixel_to_class_func(reader)
    return Dataset(pd.DataFrame(pairs), reader, pixel_to_class)
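# Illustrative callables for the segmentation from_label_func above. The folder
# layout ('images'/'masks'), the mask suffix, and the color-to-class mapping are
# assumptions made for this sketch, not requirements of the library; they only
# show the shapes the two callables are expected to have.
def _demo_mask_label_func(p: pathlib.Path) -> Optional[pathlib.Path]:
    # Map 'images/xxx.jpg' to 'masks/xxx.png'; return None to skip the image.
    if p.parent.name != 'images':
        return None
    return p.parent.parent / 'masks' / (p.stem + '.png')

def _demo_pixel_to_class_func(reader: core.Reader) -> Dict[Sequence[int], str]:
    # A fixed RGB-value -> class-name mapping; a real implementation might read
    # this from a palette file exposed by the reader instead.
    return {(0, 0, 0): 'background', (255, 255, 255): 'foreground'}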
def wheat():
    reader = core.create_reader('https://www.kaggle.com/c/global-wheat-detection')
    df = pd.read_csv(reader.open('train.csv'))
    bbox = df.bbox.str.split(',', expand=True)
    xmin = bbox[0].str.strip('[ ').astype(float) / df.width
    ymin = bbox[1].str.strip(' ').astype(float) / df.height
    df = pd.DataFrame({
        'file_path': 'train/' + df.image_id + '.jpg',
        'xmin': xmin,
        'ymin': ymin,
        'xmax': bbox[2].str.strip(' ').astype(float) / df.width + xmin,
        'ymax': bbox[3].str.strip(' ]').astype(float) / df.height + ymin,
        'class_name': df.source})
    return Dataset(df, reader)
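# The Global Wheat Detection CSV stores boxes as '[xmin, ymin, width, height]'
# strings in pixel units; the code above strips the brackets, divides by the
# image size, and converts width/height into xmax/ymax. A standalone check of
# that arithmetic on a made-up row (illustrative only, never called):
def _demo_wheat_bbox_parsing():
    df = pd.DataFrame({'bbox': ['[834.0, 222.0, 56.0, 36.0]'],
                       'width': [1024], 'height': [1024]})
    bbox = df.bbox.str.split(',', expand=True)
    xmin = bbox[0].str.strip('[ ').astype(float) / df.width
    ymin = bbox[1].str.strip(' ').astype(float) / df.height
    xmax = bbox[2].str.strip(' ').astype(float) / df.width + xmin
    ymax = bbox[3].str.strip(' ]').astype(float) / df.height + ymin
    assert abs(xmax[0] - (834.0 + 56.0) / 1024) < 1e-6
    assert abs(ymax[0] - (222.0 + 36.0) / 1024) < 1e-6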
def read_csv(data_path: Union[str, Sequence[str]], columns=None):
    header = 0 if columns else 'infer'
    reader = core.create_reader(data_path)
    filenames = [
        p.replace('#', '/').replace('?select=', '/').replace('+', ' ').split('/')[-1]
        for p in core.listify(data_path)
    ]
    dfs = [
        pd.read_csv(reader.open(f), header=header, names=columns)
        for f in filenames
    ]
    df = dfs[0] if len(dfs) == 1 else pd.concat(dfs, axis=0, ignore_index=True)
    return df, reader
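# The filename recovery above appears to turn Kaggle-style links (for example the
# '?select=<file>' form) into the bare CSV name inside the download; this is
# inferred from the string operations in the code, not from documented behavior.
# A standalone check of the same transform on an illustrative URL:
def _demo_read_csv_filename():
    p = 'https://www.kaggle.com/c/titanic?select=train.csv'
    name = p.replace('#', '/').replace('?select=', '/').replace('+', ' ').split('/')[-1]
    assert name == 'train.csv'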
@classmethod
def from_voc(cls, data_path: Union[str, Sequence[str]],
             image_folders: str, annotation_folders: str):
    """Create a dataset when data are stored in the VOC format.

    :param data_path: Either a URL or a local path. For the former, data will
        be downloaded automatically.
    :param image_folders: The folder (or folders) containing the example images.
    :param annotation_folders: The folder (or folders) containing the matching
        VOC XML annotations, in the same order as ``image_folders``.
    :return: The created dataset.
    """
    reader = core.create_reader(data_path)
    dfs = []
    for image_folder, annotation_folder in zip(
            core.listify(image_folders), core.listify(annotation_folders)):
        dfs.append(_parse_voc(reader, image_folder, annotation_folder))
    df = pd.concat(dfs, axis=0, ignore_index=True)
    return cls(df, reader)
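# Usage sketch for from_voc (kept as a comment because it needs the data on disk).
# The local path is hypothetical, 'JPEGImages' and 'Annotations' are the standard
# PASCAL VOC folder names, and the owning class is assumed to be the Dataset used
# elsewhere in this module:
#
#   ds = Dataset.from_voc('data/VOCdevkit/VOC2012',
#                         image_folders='JPEGImages',
#                         annotation_folders='Annotations')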
@classmethod
def from_label_func(cls, data_path: Union[str, Sequence[str]],
                    label_func: Callable[[pathlib.Path], Optional[str]]) -> 'Dataset':
    """Create a dataset from a function that maps an image path to its class name.

    :param data_path: Either a URL or a local path. For the former, data will
        be downloaded automatically.
    :param label_func: A function that takes an image path (an instance of
        :class:`pathlib.Path`) and returns a string class name, or ``None`` to
        skip this image.
    :return: The created dataset.
    """
    reader = core.create_reader(data_path)
    entries = []
    for file_path in reader.list_images():
        lbl = label_func(file_path)
        if lbl:
            entries.append({'file_path': file_path, 'class_name': lbl})
    df = pd.DataFrame(entries)
    return cls(df, reader)
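# Illustrative label function for the classification from_label_func above: label
# each image by its parent folder name, and return None for anything outside a
# 'train' subtree so those images are skipped. The folder-per-class layout is an
# assumption for this sketch, not something the library requires.
def _demo_folder_label_func(path: pathlib.Path) -> Optional[str]:
    return path.parent.name if 'train' in path.parts else None

# With the data available, the dataset would then be built roughly as follows
# (the URL is a placeholder and the owning class is assumed to be Dataset):
#
#   ds = Dataset.from_label_func('https://example.com/my-images.zip',
#                                _demo_folder_label_func)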