Example #1
def remove(ds_name, data_path=None):
    """Remove a downloaded coco dataset."""
    path = Path(URLs.path(c_key='data'))/ds_name if data_path is None else Path(data_path)/ds_name
    if path.is_dir():
        rmtree(path)
        print(f"{path} removed.")
    else:
        print(f"No dataset '{path}' found.")
Example #2
def get_path_df(ds_name, data_path=None):
    """Get path and dataframe of a downloaded coco dataset."""
    path = Path(URLs.path(c_key='data'))/ds_name if data_path is None else Path(data_path)/ds_name
    if path.is_dir():
        if (path/"df_train.csv").is_file():
            return (path, pd.read_csv(path/"df_train.csv"))
        else:
            print(f"No dataframe found in {path}")
    else:
        print(f"No dataset '{path}' found.")
        print("Create a dataset first with CocoData.create(ds_name, cat_list) or list available datasets with CocoData.ls()")
Example #3
def preprocess_audio_folder(path, folders=None, output_dir=None, **kwargs):
    "Preprocess audio files in `path`, caching the results to `output_dir`"
    path = Path(path)
    fnames = get_audio_files(path, recurse=True, folders=folders)
    output_dir = Path(ifnone(output_dir, path.parent / f"{path.name}_cached"))
    output_dir.mkdir(exist_ok=True)

    pp = PreprocessAudio(**kwargs)

    for fil in fnames:
        out = output_dir / fil.relative_to(path)
        out.parent.mkdir(parents=True, exist_ok=True)  # mirror any subfolder structure in the cache
        aud = pp(fil)
        save_audio(str(out), aud, aud.sr)
    return output_dir
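A usage sketch; `folders` restricts the search to the named subdirectories, and any `**kwargs` are forwarded to `PreprocessAudio` (the paths and folder names are hypothetical):

out_dir = preprocess_audio_folder("data/audio", folders=["train", "valid"])
print(f"cached files written to {out_dir}")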
Example #4
def _create_dataframe(path, cat_list, with_mask):
    print("Creating Dataframe...")
    path_images = path/"images"
    path_masks = path/"masks"
    dfs = []

    img_id2fn = {int(Path(fn).stem):fn for fn in path_images.ls()}
    img_ids = list(img_id2fn)
    idx2cat = {e['id']:e['name'] for e in CocoData.coco.loadCats(CocoData.coco.getCatIds())}

    for img_id in progress_bar(img_ids):
        annos = CocoData.coco.loadAnns(CocoData.coco.getAnnIds(imgIds=img_id))
        # remove annotations of other labels
        annos = [a for a in annos if idx2cat[a["category_id"]] in cat_list]
        # sort by area, largest first (sorting the list directly avoids dropping
        # annotations that happen to share the same area)
        annos = sorted(annos, key=lambda a: a["area"], reverse=True)

        n_objs = len(annos)
        if n_objs == 0: continue  # no objects of interest in this image

        # COCO bboxes come as [x_min, y_min, width, height]; convert to corner format
        df_x_mins = [a["bbox"][0] for a in annos]
        df_y_mins = [a["bbox"][1] for a in annos]
        widths = [a["bbox"][2] for a in annos]
        heights = [a["bbox"][3] for a in annos]
        df_x_maxs = [x + w for x, w in zip(df_x_mins, widths)]
        df_y_maxs = [y + h for y, h in zip(df_y_mins, heights)]
        df_class_names = [idx2cat[a["category_id"]] for a in annos]

        df_img_id = [img_id] * n_objs
        img_path = img_id2fn[img_id]
        df_img_path = [str(img_path)] * n_objs

        if with_mask:
            # save one binary mask per object, always as png
            df_mask_path = []
            df_obj_ids = list(range(n_objs))
            for o_id in df_obj_ids:
                mask = CocoData.coco.annToMask(annos[o_id])
                mask_path = path_masks/f"{img_path.stem}_{o_id}.png"
                Image.fromarray(mask).save(mask_path)
                df_mask_path.append(str(mask_path))

            df = pd.DataFrame({"image_id":df_img_id, "image_path":df_img_path,
                               "mask_path":df_mask_path, "object_id":df_obj_ids,
                               "x_min":df_x_mins, "y_min":df_y_mins, "x_max":df_x_maxs, "y_max":df_y_maxs,
                               "class_name":df_class_names})
        else:
            df = pd.DataFrame({"image_id":df_img_id, "image_path":df_img_path,
                               "x_min":df_x_mins, "y_min":df_y_mins, "x_max":df_x_maxs, "y_max":df_y_maxs,
                               "class_name":df_class_names})

        dfs.append(df)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    df_train = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
    df_train.to_csv(str(path/"df_train.csv"), index=False)
    return df_train
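The dataframe is in long format: one row per object, with image_id and image_path repeated for every object in an image. A sketch of pulling all boxes for a single image back out (column names as produced above):

df = pd.read_csv(path/"df_train.csv")
first = df[df.image_id == df.image_id.iloc[0]]
bboxes = first[["x_min", "y_min", "x_max", "y_max"]].values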
Example #5
def _download_annotation_file(path):
    print("Downloading annotation files...")
    url = 'http://images.cocodataset.org/annotations/annotations_trainval2017.zip'
    zip_fn = path/'annotations_trainval2017.zip'
    with open(zip_fn, 'wb') as f:  # avoid shadowing the builtin `zip`
        f.write(urlopen(url).read())
    with ZipFile(zip_fn) as zf:
        zf.extractall(path=str(path))
    zip_fn.unlink()
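`urlopen(url).read()` buffers the whole archive (a few hundred MB) in memory before writing it out; a streaming variant using only the standard library would be:

import shutil
from urllib.request import urlopen

with urlopen(url) as resp, open(zip_fn, 'wb') as f:
    shutil.copyfileobj(resp, f)  # copy in chunks instead of one big read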
Example #6
# classmethod on CocoData (note the `cls` first argument)
def create(cls, ds_name, cat_list, data_path=None, with_mask=False, max_images=1000, remove_crowded=True):
    """Create a new coco dataset with the categories defined in `cat_list`, optionally with masks.
    You can specify the path where the dataset is stored; by default it uses fastai's data path, like `untar_data`."""

    path = Path(URLs.path(c_key='data'))/ds_name if data_path is None else Path(data_path)/ds_name
    path_images = path/"images"
    path_masks = path/"masks"

    if path.is_dir():
        print(f"Dataset {ds_name} already exists: {path}")
        return cls.get_path_df(ds_name, data_path=data_path)

    # create folders
    print("Creating folders.")
    path.mkdir(exist_ok=False, parents=True)
    path_images.mkdir()
    if with_mask: path_masks.mkdir()

    # download annotation files
    annotations = 'annotations/instances_train2017.json'
    if not (path/annotations).is_file():
        cls._download_annotation_file(path)
    if not (path/annotations).is_file():
        print("Download was not successful. No annotation file found.")
        return
    cls.coco = COCO(annotation_file=str(path/annotations))

    # download images
    cls._download_images(cat_list, path_images, max_images, remove_crowded)

    # create dataframe
    df = cls._create_dataframe(path, cat_list, with_mask)

    return path, df
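An end-to-end usage sketch; the dataset name and category list are example values, and `create` returns None when the annotation download fails:

res = CocoData.create("coco-person", ["person"], with_mask=True, max_images=500)
if res is not None:
    path, df = res
    print(df.class_name.value_counts())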
Example #7
def tar_extract_at_filename(fname, dest):
    "Extract `fname` to `dest`/`fname.name` folder using `tarfile`"
    dest = Path(dest) / Path(fname).with_suffix("").name
    with tarfile.open(fname, "r:gz") as tf:  # context manager closes the archive
        tf.extractall(dest)
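Note that `with_suffix("")` strips only the final suffix, so a `.tar.gz` archive keeps its `.tar` part in the folder name. A usage sketch with a hypothetical archive:

tar_extract_at_filename("downloads/images.tar.gz", "data")
# extracts into data/images.tar/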
Example #8
def ls(data_path=None):
    """List all available datasets."""
    path = Path(URLs.path(c_key='data')) if data_path is None else Path(data_path)
    if path.is_dir():
        return list(path.ls())
    else:
        print(f"Path {path} does not exist.")
Example #9
warnings.filterwarnings("ignore",
                        category=UserWarning,
                        module="torch.nn.functional")

# https://github.com/dmlc/xgboost/issues/1715
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

# training is very, very slow
os.environ["OMP_NUM_THREADS"] = "1"

## helpful way to initially get folders
# import split_folders
# split_folders.ratio('<path>', output='<path>/split', seed=1337, ratio=(.8, .2)) # uses default values
# sys.exit()

path = Path("data/CNN/-released/split")

################################################################################
# fastai uses databunches
################################################################################
data = (
    ImageList.from_folder(path / "train")
    .split_by_rand_pct(0.1, seed=33)
    .label_from_folder()
    # .add_test_folder('..'/path/'test')
    .transform(
        get_transforms(do_flip=True, flip_vert=True),
        size=150,
        resize_method=ResizeMethod.SQUISH,
        padding_mode="zeros",
    )
    .databunch(bs=64)
    .normalize(imagenet_stats)
)
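A quick sanity check on the resulting DataBunch, using standard fastai v1 calls:

data.show_batch(rows=3, figsize=(8, 8))
print(data.classes, data.c)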