def main(dataset: str, split: int):
    """Split the train metadata of *dataset* (for *split*) into train and dev
    sets and persist each as a JSON file under the work root.
    """
    train, dev, _, _ = php.get_meta_paths(dataset, split)
    env.LOGGER.info(f'Splitting {train} into train and dev...')
    meta_train, meta_dev = _split_train_dev(train)
    # Write both DataFrames keyed by their index.
    meta_train.to_json(ct.WORK_ROOT / train, orient='index')
    meta_dev.to_json(ct.WORK_ROOT / dev, orient='index')
    env.LOGGER.info('...Done')
def main(dataset: str, split: int):
    """Extract JPEG images for every metadata row of the train, dev and test
    splits of *dataset*.

    Sets the module-level DATA_ROOT_DIR so the per-row worker can resolve
    paths.
    """
    global DATA_ROOT_DIR
    DATA_ROOT_DIR = php.get_data_root_path(dataset)
    train, dev, _, test = php.get_meta_paths(dataset, split)
    for meta_path in (train, dev, test):
        env.LOGGER.info(f'Extracting jpeg images from {meta_path.as_posix()}...')
        meta = ghp.read_meta(meta_path)
        # Drain the iterator; the extraction happens as a side effect of
        # each parallel task.
        for _ in ghp.parallel.execute(_extract_jpeg, list(meta.iterrows()), 1):
            pass
        env.LOGGER.info('...Done')
def main(dataset: str, split: int):
    """Augment the metadata of the train, dev and test splits of *dataset*
    with extra columns and write each augmented DataFrame back as JSON.

    Sets the module-level DATA_ROOT_DIR so the per-row worker can resolve
    paths.
    """
    global DATA_ROOT_DIR
    DATA_ROOT_DIR = php.get_data_root_path(dataset)
    train, dev, _, test = php.get_meta_paths(dataset, split)
    for meta_path in (train, dev, test):
        env.LOGGER.info(f'Augmenting metadata at {meta_path.as_posix()}...')
        meta = ghp.read_meta(meta_path)
        add_columns(meta)
        # Each parallel task returns the (index, row) pair it augmented;
        # fold the results back into the DataFrame.
        for index, row in ghp.parallel.execute(_augment_meta, list(meta.iterrows()), 1):
            meta.loc[index] = row
        meta.to_json(ct.WORK_ROOT / meta_path, orient='index')
        env.LOGGER.info('...Done')
def main(dataset: str, split: int):
    """Gather min/mean/max statistics for the length, height and width
    columns of the merged metadata of *dataset* and store them.
    """
    _, _, merged_meta_path, _ = php.get_meta_paths(dataset, split)
    stats_path = php.get_stats_path(dataset, split)
    env.LOGGER.info(f'Gathering size statistics for the {dataset} dataset...')
    meta = ghp.read_meta(merged_meta_path)
    # Build the same column-major key order as before:
    # min_length, mean_length, max_length, min_height, ...
    stats = {
        f'{agg}_{column}': [getattr(meta[column], agg)()]
        for column in ('length', 'height', 'width')
        for agg in ('min', 'mean', 'max')
    }
    _store_stats(stats, stats_path)
    env.LOGGER.info('...Done.')
def main(dataset: str, split: int):
    """Merge the train and dev metadata DataFrames of *dataset* into one
    merged metadata file.
    """
    env.LOGGER.info(f'Merging {dataset} train and dev DataFrames into one...')
    train, dev, merged, _ = php.get_meta_paths(dataset, split)
    merge_meta(train, dev, merged)
    env.LOGGER.info('...Done.')