def _split_generators(self, dl_manager: tfds.download.DownloadManager):
  """Returns SplitGenerators.

  All archives and annotation files must already be present in
  `dl_manager.manual_dir`; they are extracted here and routed to
  `_generate_examples` per split.
  """
  manual_dir = dl_manager.manual_dir
  # Register the frame archive and annotation file for every split.
  sources = {}
  for split in ('test', 'train', 'valid'):
    sources[f'{split}_all_frames'] = manual_dir / f'{split}_all_frames.zip'
    sources[f'{split}_annotations'] = manual_dir / f'{split}.json'
  extracted = dl_manager.extract(sources)

  def _examples_for(split):
    """Builds the example generator for one split from its extracted files."""
    return self._generate_examples(
        annotations=extracted[f'{split}_annotations'],
        all_frames=(extracted[f'{split}_all_frames'] /
                    f'{split}_all_frames' / 'JPEGImages'),
    )

  return {
      tfds.Split.TRAIN: _examples_for('train'),
      tfds.Split.VALIDATION: _examples_for('valid'),
      tfds.Split.TEST: _examples_for('test'),
  }
def _maybe_prepare_manual_data(dl_manager: tfds.download.DownloadManager):
  """Return paths to the manually downloaded data if it is available.

  Args:
    dl_manager: Download manager whose `manual_dir` is scanned for the
      manually downloaded archives.

  Returns:
    The result of `dl_manager.extract` on the two matched archives
    (train, validation), or `(None, None)` if any archive is missing.

  Raises:
    ValueError: If more than one file matches one of the patterns.
  """
  # The file has a different name each time it is downloaded.
  manually_downloaded_files = [
      '1_AVA_HACS_TRAIN_*.zip',
      '2_AVA_HACS_VAL_*.zip',
  ]
  files = []
  for pattern in manually_downloaded_files:
    # `list(...)` instead of a copying comprehension (ruff C416).
    matches = list(dl_manager.manual_dir.glob(pattern))
    if not matches:
      # No manually downloaded files.
      return None, None
    if len(matches) > 1:
      raise ValueError(f'Unexpected multiple files matching pattern: '
                       f'{pattern} inside {os.fspath(dl_manager.manual_dir)}. '
                       f'There should only be one file matching this pattern.')
    files.append(matches[0])
  return dl_manager.extract(files)
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
  """Returns SplitGenerators.

  Downloads the top-level archive, then extracts the per-language tarball
  selected by the builder config and points each split at its subdirectory.
  """
  root = dl_manager.download_and_extract(URL)
  language_archive = os.path.join(
      root, self.builder_config.language + ".tar.gz")
  data_dir = dl_manager.extract(language_archive)
  # (split name, directory inside the language archive)
  split_subdirs = [
      (tfds.Split.VALIDATION, "dev"),
      (tfds.Split.TEST, "test"),
      (tfds.Split.TRAIN, "train"),
  ]
  return [
      tfds.core.SplitGenerator(
          name=split,
          gen_kwargs={"filepath": os.path.join(data_dir, subdir)},
      )
      for split, subdir in split_subdirs
  ]
def _split_generators(
    self, dl_manager: tfds.download.DownloadManager
) -> List[tfds.core.SplitGenerator]:
  """Returns SplitGenerators.

  Extracts the manually downloaded challenge archive, reads its metadata
  CSV, and collects one record (selected features + full-text JSON path)
  per paper that has both full text and an abstract.
  """
  archive = os.path.join(dl_manager.manual_dir,
                         "CORD-19-research-challenge.zip")
  extracted_path = dl_manager.extract(archive)
  pd = tfds.core.lazy_imports.pandas
  # Missing CSV cells become "" so feature values are always strings.
  metadata = pd.read_csv(os.path.join(extracted_path, "metadata.csv")).fillna("")

  def _make_record(row):
    """Builds one example record with the path to its full-text JSON."""
    record = {key: row[key] for key in _ADDITIONAL_FEATURES + [_ABSTRACT]}
    subdir = row["full_text_file"]
    # The archive nests each collection directory inside itself.
    record["path"] = os.path.join(extracted_path, subdir, subdir,
                                  row[_SHA] + ".json")
    return record

  data_paths = [
      _make_record(row)
      for _, row in metadata.iterrows()
      if row["has_full_text"] and _has_abstract(row) and row["full_text_file"]
  ]
  return [
      tfds.core.SplitGenerator(
          name=tfds.Split.TRAIN,
          gen_kwargs={"data_paths": data_paths},
      )
  ]
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
  """Returns SplitGenerators.

  The validation and test splits either come from their official archives
  or, when the builder config defines a `split_*_data_range`, are carved
  out of the training data (in which case they reuse the training
  archives and the range subsamples the training videos).
  """
  manually_downloaded_files = {
      'train_all_frames': dl_manager.manual_dir / 'train_all_frames.zip',
      'train_annotations': dl_manager.manual_dir / 'train.json',
  }

  # Training always uses the training archives; a non-None range merely
  # subsamples it. (The original if/else collapsed to a direct read: the
  # else-branch assigned None, which is the attribute's value then anyway.)
  train_data_range = self.builder_config.split_train_data_range

  def _setup_split(split, data_range):
    """Registers the source files for `split` given its custom data range.

    With a custom range the split is a subsample of the training data, so
    it reuses the training archives; otherwise the official archives for
    the split are registered.
    """
    if data_range is not None:
      manually_downloaded_files[f'{split}_all_frames'] = (
          manually_downloaded_files['train_all_frames'])
      manually_downloaded_files[f'{split}_annotations'] = (
          manually_downloaded_files['train_annotations'])
    else:
      manually_downloaded_files[f'{split}_all_frames'] = (
          dl_manager.manual_dir / f'{split}_all_frames.zip')
      manually_downloaded_files[f'{split}_annotations'] = (
          dl_manager.manual_dir / f'{split}.json')

  val_data_range = self.builder_config.split_val_data_range
  _setup_split('valid', val_data_range)
  test_data_range = self.builder_config.split_test_data_range
  _setup_split('test', test_data_range)

  extracted_files = dl_manager.extract(manually_downloaded_files)

  # The directory inside each archive must match the archive registered
  # above. BUGFIX: use `is not None` here as well — the original used
  # truthiness, so a non-None but falsy range would have selected the
  # training archive while pointing at the official split's directory.
  val_dir = ('train_all_frames' if val_data_range is not None
             else 'valid_all_frames')
  test_dir = ('train_all_frames' if test_data_range is not None
              else 'test_all_frames')

  return {
      tfds.Split.TRAIN: self._generate_examples(
          annotations=extracted_files['train_annotations'],
          all_frames=extracted_files['train_all_frames'] /
          'train_all_frames' / 'JPEGImages',
          video_range_to_use=train_data_range,
      ),
      tfds.Split.VALIDATION: self._generate_examples(
          annotations=extracted_files['valid_annotations'],
          all_frames=extracted_files['valid_all_frames'] / val_dir /
          'JPEGImages',
          video_range_to_use=val_data_range,
      ),
      tfds.Split.TEST: self._generate_examples(
          annotations=extracted_files['test_annotations'],
          all_frames=extracted_files['test_all_frames'] / test_dir /
          'JPEGImages',
          video_range_to_use=test_data_range,
      ),
  }
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
  """Returns SplitGenerators.

  Extracts the manually downloaded `train_1.zip` archive and yields the
  single training split from its `train_1` directory.
  """
  extracted = dl_manager.extract(dl_manager.manual_dir / 'train_1.zip')
  return {'train': self._generate_examples(extracted / 'train_1')}