def get_cat_dogs_dataset(
    dirs: str = "/app/data/data_cat_dogs/*",
    extension: str = "*.jpg",
    test_size: float = 0.2,
    random_state: int = 42,
    tag_file_path: tp.Optional[str] = None,
) -> tp.Tuple[tp.Dict[str, tp.Any], tp.Dict[str, tp.Any], int]:
    """Assemble train/validation records and the class count for a dataset.

    The pipeline is delegated to ``utils``: the dataset is collected with
    ``utils.create_dataset``, turned into a dataframe with ``"class"`` and
    ``"filepath"`` columns, labelled through the mapping produced by
    ``utils.get_dataset_labeling`` (stored in a ``"label"`` column), and
    finally split with ``utils.split_dataframe_train_test``.

    Args:
        dirs: Forwarded to ``utils.create_dataset`` as ``dirs``.
        extension: Forwarded to ``utils.create_dataset`` as ``extension``.
        test_size: Forwarded to ``utils.split_dataframe_train_test``.
        random_state: Forwarded to ``utils.split_dataframe_train_test``.
        tag_file_path: When not ``None``, the tag-to-label mapping is dumped
            as JSON to this path (the file is opened in ``"w"`` mode).

    Returns:
        A 3-tuple of ``(train, valid, num_classes)`` where ``train`` and
        ``valid`` are the results of ``.to_dict("records")`` on the two
        split parts and ``num_classes`` is the size of the tag-to-label
        mapping.
    """
    # NOTE(review): if the split parts are pandas DataFrames,
    # ``to_dict("records")`` yields a list of dicts rather than the single
    # dict the return annotation declares -- confirm against callers.
    samples = utils.create_dataset(dirs=dirs, extension=extension)
    frame = utils.create_dataframe(samples, columns=["class", "filepath"])
    label_by_tag = utils.get_dataset_labeling(frame, "class")

    # Persist the mapping before any further processing, as the caller asked.
    if tag_file_path is not None:
        with open(tag_file_path, "w") as tag_file:
            json.dump(label_by_tag, tag_file)

    labeled_frame = utils.map_dataframe(
        frame,
        tag_column="class",
        class_column="label",
        tag2class=label_by_tag,
        verbose=False,
    )
    train_part, valid_part = utils.split_dataframe_train_test(
        labeled_frame,
        test_size=test_size,
        random_state=random_state,
    )

    train_records = train_part.to_dict("records")
    valid_records = valid_part.to_dict("records")
    num_classes = len(label_by_tag)
    return train_records, valid_records, num_classes
def main(args, _=None):
    """Compute a tag-to-label mapping for a dataset and write it as JSON.

    The input table is read from ``args.in_csv`` when that is set; otherwise
    it is built from ``args.in_dir`` via ``_prepare_df_from_dirs``. If
    ``args.tag_delim`` is set, the table is passed through ``separate_tags``
    first. The mapping returned by ``get_dataset_labeling`` is written to
    ``args.out_labeling`` and, when ``args.out_dataset`` is set, the table
    itself is saved there as CSV without the index.

    Args:
        args: Namespace-like object exposing ``in_csv``, ``in_dir``,
            ``tag_column``, ``recursive`` (read only on the ``in_dir``
            path), ``tag_delim``, ``out_labeling`` and ``out_dataset``.
        _: Unused; accepted so callers may pass a second positional value.

    Raises:
        ValueError: If neither ``args.in_csv`` nor ``args.in_dir`` is set.
    """
    if args.in_csv is not None:
        df = pd.read_csv(args.in_csv)
    elif args.in_dir is not None:
        df = _prepare_df_from_dirs(
            args.in_dir,
            args.tag_column,
            recursive=args.recursive,
        )
    else:
        # ValueError is still an Exception, so existing broad handlers keep
        # working, but the failure now explains itself.
        raise ValueError(
            "No input source given: either in_csv or in_dir must be set"
        )

    if args.tag_delim is not None:
        df = separate_tags(
            df,
            tag_column=args.tag_column,
            tag_delim=args.tag_delim,
        )

    tag2lbl = get_dataset_labeling(df, args.tag_column)
    print("Num classes: ", len(tag2lbl))

    with open(args.out_labeling, "w") as fout:
        json.dump(tag2lbl, fout, indent=4)

    if args.out_dataset is not None:
        df.to_csv(args.out_dataset, index=False)