def _split_generators(self, dl_manager):
    """Download the archive and define the train/validation/test splits.

    A single tar archive holds all three CSV files; each split receives its
    own archive iterator so it can stream the file it needs.
    """
    archive = dl_manager.download(_URL)
    split_to_csv = [
        (datasets.Split.TRAIN, "empatheticdialogues/train.csv"),
        (datasets.Split.VALIDATION, "empatheticdialogues/valid.csv"),
        (datasets.Split.TEST, "empatheticdialogues/test.csv"),
    ]
    return [
        datasets.SplitGenerator(
            name=split_name,
            # Passed straight to _generate_examples.
            gen_kwargs={
                "files": dl_manager.iter_archive(archive),
                "split_file": csv_path,
            },
        )
        for split_name, csv_path in split_to_csv
    ]
def _split_generators(self, dl_manager):
    """Download and extract the corpus, then define its three splits."""
    extracted_dir = dl_manager.download_and_extract(_URL)
    split_specs = [
        (datasets.Split.TRAIN, "macmorpho-train.txt", "train"),
        (datasets.Split.TEST, "macmorpho-test.txt", "test"),
        (datasets.Split.VALIDATION, "macmorpho-dev.txt", "dev"),
    ]
    return [
        datasets.SplitGenerator(
            name=split_name,
            gen_kwargs={
                "filepath": os.path.join(extracted_dir, filename),
                "split": split_tag,
            },
        )
        for split_name, filename, split_tag in split_specs
    ]
def _split_generators(self, dl_manager):
    """Download/extract the data file and expose a single train split."""
    extracted = dl_manager.download_and_extract(_DATA_URL)
    data_file = f"{extracted}/mapdata.dat"
    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={"split": "train", "data_file": data_file},
        ),
    ]
def _split_generators(self, dl_manager):
    """Download the configured CSV and expose it as a single train split."""
    config_urls = _URLs[self.config.name]
    extracted = dl_manager.download_and_extract(config_urls)
    csv_path = os.path.join(extracted, "gutenberg_time_phrases.csv")
    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            # Passed straight to _generate_examples.
            gen_kwargs={"filepath": csv_path, "split": "train"},
        )
    ]
def _split_generators(self, dl_manager):
    """Download the archive and wire each split to its data file."""
    extracted = dl_manager.download_and_extract(self._DOWNLOAD_URL)
    base_dir = os.path.join(extracted, "data")
    split_files = [
        (datasets.Split.TRAIN, self._TRAIN_FILE),
        (datasets.Split.VALIDATION, self._VAL_FILE),
        (datasets.Split.TEST, self._TEST_FILE),
    ]
    return [
        datasets.SplitGenerator(
            name=split_name,
            gen_kwargs={"filepath": os.path.join(base_dir, filename)},
        )
        for split_name, filename in split_files
    ]
def _split_generators(self, dl_manager):
    """Locate the manually downloaded data and expose it as the train split.

    Raises:
        FileNotFoundError: if the manual data directory does not exist.
    """
    path_to_manual_file = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))
    if not os.path.exists(path_to_manual_file):
        # Fix: the message previously ended with a stray ")" after the
        # instructions placeholder.
        raise FileNotFoundError(
            "{} does not exist. Make sure you insert a manual dir via "
            "`datasets.load_dataset('msr_zhen_translation_parity', data_dir=...)`. "
            "Manual download instructions: {}".format(
                path_to_manual_file, self.manual_download_instructions
            )
        )
    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN, gen_kwargs={"path": path_to_manual_file}
        )
    ]
def _split_generators(self, dl_manager):
    """Download/extract the archive and define train/test/validation splits."""
    extracted_dir = dl_manager.download_and_extract(_URL)
    split_files = [
        (datasets.Split.TRAIN, "train.csv"),
        (datasets.Split.TEST, "test.csv"),
        (datasets.Split.VALIDATION, "dev.csv"),
    ]
    return [
        datasets.SplitGenerator(
            name=split_name,
            # Passed straight to _generate_examples.
            gen_kwargs={"filepath": os.path.join(extracted_dir, filename)},
        )
        for split_name, filename in split_files
    ]
def _split_generators(self, dl_manager):
    """Build split generators for the selected configuration.

    For the "manual" and "auto" configs the download result is a mapping
    keyed by split name and one generator is produced per key; every other
    config yields a single "full" split.
    """
    config_urls = _URLs[self.config.name]
    downloaded = dl_manager.download_and_extract(config_urls)
    if self.config.name not in ["manual", "auto"]:
        return [
            datasets.SplitGenerator(
                name="full",
                gen_kwargs={"filepaths": downloaded, "split": "full"},
            )
        ]
    return [
        datasets.SplitGenerator(
            name=split_name,
            gen_kwargs={"filepaths": downloaded, "split": split_name},
        )
        for split_name in downloaded
    ]
def _split_generators(self, dl_manager):
    """Download the data file and expose it as a single train split."""
    downloaded_path = dl_manager.download(_URL)
    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            # Fix: os.path.join() with a single argument was a no-op, so the
            # downloaded path is now passed straight through.
            gen_kwargs={
                "filepath": downloaded_path,
                "split": "train",
            },
        ),
    ]
def _split_generators(self, dl_manager):
    """Download the outer archive, then extract one inner tarball per split."""
    outer_dir = dl_manager.download_and_extract(_URL)
    split_types = ["train", "val", "test"]
    # Each split lives in its own .tar.gz inside the top-level archive.
    inner_archives = {
        name: os.path.join(outer_dir, "bigPatentData", name + ".tar.gz")
        for name in split_types
    }
    extracted = dl_manager.extract(inner_archives)
    split_dirs = {name: os.path.join(extracted[name], name) for name in split_types}
    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN, gen_kwargs={"path": split_dirs["train"]}
        ),
        datasets.SplitGenerator(
            name=datasets.Split.VALIDATION, gen_kwargs={"path": split_dirs["val"]}
        ),
        datasets.SplitGenerator(
            name=datasets.Split.TEST, gen_kwargs={"path": split_dirs["test"]}
        ),
    ]
def _split_generators(self, dl_manager):
    """Extract the nested archive and expose its file as a train split."""
    outer_dir = dl_manager.download_and_extract(_URL)
    # The downloaded archive contains a second compressed file that must
    # itself be extracted before use.
    inner_dir = dl_manager.extract(os.path.join(outer_dir, _FILE_NAME_ZIP))
    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={
                "filepath": os.path.join(inner_dir, _FILE_NAME),
                "split": "train",
            },
        ),
    ]
def _split_generators(self, dl_manager):
    """Validate the config name, then return the single train split.

    Raises:
        ValueError: if the configuration is not "crime-and-punish".
    """
    if self.config.name != "crime-and-punish":
        raise ValueError("{} does not exist".format(self.config.name))
    data_file = dl_manager.download_and_extract(self.config.data_url)
    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={"data_file": data_file, "split": "train"},
        ),
    ]
def _split_generators(self, dl_manager):
    """Download the three data files and map each one to its split."""
    sources = {
        "train": f"{_URL}{_TRAINING_FILE}",
        "dev": f"{_URL}{_DEV_FILE}",
        "test": f"{_URL}{_TEST_FILE}",
    }
    local_files = dl_manager.download_and_extract(sources)
    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={"filepath": local_files["train"], "split": "train"},
        ),
        datasets.SplitGenerator(
            name=datasets.Split.VALIDATION,
            gen_kwargs={"filepath": local_files["dev"], "split": "validation"},
        ),
        datasets.SplitGenerator(
            name=datasets.Split.TEST,
            gen_kwargs={"filepath": local_files["test"], "split": "test"},
        ),
    ]
def _split_generators(self, dl_manager):
    """Download the text/annotation file pair for each split."""
    sources = {
        split: [f"{_URL}{directory}{_TEXT_FILE}", f"{_URL}{directory}{_ANNOTATIONS_FILE}"]
        for split, directory in zip(
            ["train", "dev", "test"], [_TRAIN_DIR, _DEV_DIR, _TEST_DIR]
        )
    }
    local_files = dl_manager.download_and_extract(sources)
    split_keys = [
        (datasets.Split.TRAIN, "train"),
        (datasets.Split.VALIDATION, "dev"),
        (datasets.Split.TEST, "test"),
    ]
    return [
        datasets.SplitGenerator(
            name=split_name,
            # Element 0 is the text file, element 1 its annotations file.
            gen_kwargs={
                "txt_path": local_files[key][0],
                "ann_path": local_files[key][1],
            },
        )
        for split_name, key in split_keys
    ]
def _split_generators(self, dl_manager):
    """Download the three data files and map each to its split."""
    sources = {
        "train": f"{_URL}{_TRAINING_FILE}",
        "dev": f"{_URL}{_DEV_FILE}",
        "test": f"{_URL}{_TEST_FILE}",
    }
    local_files = dl_manager.download_and_extract(sources)
    split_keys = [
        (datasets.Split.TRAIN, "train"),
        (datasets.Split.TEST, "test"),
        (datasets.Split.VALIDATION, "dev"),
    ]
    return [
        datasets.SplitGenerator(
            name=split_name, gen_kwargs={"filepath": local_files[key]}
        )
        for split_name, key in split_keys
    ]
def _split_generators(self, dl_manager):
    """Download/extract the data and expose one train split."""
    extracted_path = dl_manager.download_and_extract(_URL)
    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={"input_file": extracted_path, "split": "train"},
        ),
    ]
def _split_generators(self, dl_manager):
    """Download the spreadsheet and expose it as a single train split."""
    extracted_dir = dl_manager.download_and_extract(_URLs)
    xlsx_path = os.path.join(extracted_dir, "FakeNewsData", "Fake News Stories.xlsx")
    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            # Passed straight to _generate_examples.
            gen_kwargs={"filepath": xlsx_path},
        )
    ]
def _split_generators(self, dl_manager):
    """Download the gloss and text files and expose one train split."""
    gloss_file, text_file = dl_manager.download([_GLOSS_URL, _TEXT_URL])
    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={"gloss_path": gloss_file, "text_path": text_file},
        )
    ]
def _split_generators(self, dl_manager):
    """Download the corpus and point the train split at its CSV file."""
    extracted_dir = dl_manager.download_and_extract(_URL)
    csv_path = os.path.join(extracted_dir, "fakenews", "full.csv")
    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={"filepath": csv_path, "split": "train"},
        )
    ]
def _split_generators(self, dl_manager):
    """Locate the manually downloaded CSV and expose it as the train split.

    Raises:
        FileNotFoundError: if the expected file is missing from manual_dir.
    """
    path_to_manual_file = os.path.join(
        os.path.abspath(os.path.expanduser(dl_manager.manual_dir)), _FILENAME
    )
    if not os.path.exists(path_to_manual_file):
        # Fix: the message previously ended with a stray ")" after the
        # instructions placeholder.
        raise FileNotFoundError(
            "{} does not exist. Make sure you insert a manual dir via "
            "`datasets.load_dataset('times_of_india_news_headlines', data_dir=...)` "
            "that includes a file name {}. Manual download instructions: {}".format(
                path_to_manual_file, _FILENAME, self.manual_download_instructions
            )
        )
    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN, gen_kwargs={"path": path_to_manual_file}
        )
    ]
def _split_generators(self, dl_manager):
    """Download SNLI and map each .txt file to its split."""
    extracted = dl_manager.download_and_extract(_DATA_URL)
    snli_dir = os.path.join(extracted, "snli_1.0")
    # Keep the test/validation/train ordering of the original definition.
    split_files = [
        (datasets.Split.TEST, "snli_1.0_test.txt"),
        (datasets.Split.VALIDATION, "snli_1.0_dev.txt"),
        (datasets.Split.TRAIN, "snli_1.0_train.txt"),
    ]
    return [
        datasets.SplitGenerator(
            name=split_name,
            gen_kwargs={"filepath": os.path.join(snli_dir, filename)},
        )
        for split_name, filename in split_files
    ]
def _mnli_split_generator(name, data_dir, split, matched):
    """Build one MNLI SplitGenerator for a matched or mismatched TSV file."""
    variant = "matched" if matched else "mismatched"
    tsv_path = os.path.join(data_dir, "%s_%s.tsv" % (split, variant))
    return datasets.SplitGenerator(
        name=name,
        gen_kwargs={
            "data_file": tsv_path,
            "split": split,
            # mrpc_files is always None for MNLI splits.
            "mrpc_files": None,
        },
    )
def _split_generators(self, dl_manager):
    """Return generators for the Spanish or Dutch files, per the config."""
    # Config name "es" selects the Spanish files; anything else, the Dutch ones.
    if self.config.name == "es":
        train_file, dev_file, test_file = _ES_TRAINING_FILE, _ES_DEV_FILE, _ES_TEST_FILE
    else:
        train_file, dev_file, test_file = _NL_TRAINING_FILE, _NL_DEV_FILE, _NL_TEST_FILE
    sources = {
        "train": f"{_URL}{train_file}",
        "dev": f"{_URL}{dev_file}",
        "test": f"{_URL}{test_file}",
    }
    local_files = dl_manager.download_and_extract(sources)
    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN, gen_kwargs={"filepath": local_files["train"]}
        ),
        datasets.SplitGenerator(
            name=datasets.Split.VALIDATION, gen_kwargs={"filepath": local_files["dev"]}
        ),
        datasets.SplitGenerator(
            name=datasets.Split.TEST, gen_kwargs={"filepath": local_files["test"]}
        ),
    ]
def _split_generators(self, dl_manager):
    """Point each split at a CSV under the local _PATH directory."""
    # Data is read from the current working directory, not downloaded.
    base_dir = os.path.join(pathlib.Path().absolute(), _PATH)
    split_files = [
        (datasets.Split.TRAIN, "ROCStory.train.csv"),
        (datasets.Split.VALIDATION, "ROCStory.validation.csv"),
        (datasets.Split.TEST, "ROCStory.test.csv"),
    ]
    return [
        datasets.SplitGenerator(
            name=split_name,
            gen_kwargs={"path": os.path.join(base_dir, filename)},
        )
        for split_name, filename in split_files
    ]
def _split_generators(self, dl_manager):
    """Download the per-split URLs and wire up train/test/validation."""
    local_files = dl_manager.download_and_extract(_URLs)
    split_keys = [
        (datasets.Split.TRAIN, "train"),
        (datasets.Split.TEST, "test"),
        (datasets.Split.VALIDATION, "dev"),
    ]
    return [
        datasets.SplitGenerator(
            name=split_name,
            gen_kwargs={"filepath": local_files[key]},
        )
        for split_name, key in split_keys
    ]
def _split_generators(self, dl_manager):
    """Stream each split's file out of the downloaded archive."""
    archive = dl_manager.download(_URL)
    generators = []
    for split_name in SPLIT_FILENAMES:
        generators.append(
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={
                    "filepath": "mocha/" + SPLIT_FILENAMES[split_name],
                    "split": split_name,
                    # Each split gets its own archive iterator.
                    "files": dl_manager.iter_archive(archive),
                },
            )
        )
    return generators
def _split_generators(self, dl_manager):
    """Handle string, list and dict shapes of self.config.data_files.

    Raises:
        ValueError: when no data files were provided.
    """
    if not self.config.data_files:
        raise ValueError(
            f"At least one data file must be specified, but got data_files={self.config.data_files}"
        )
    resolved = dl_manager.download_and_extract(self.config.data_files)
    if isinstance(resolved, (str, list, tuple)):
        # Unnamed case: normalize a lone path into a one-element list and
        # treat everything as the train split.
        train_files = [resolved] if isinstance(resolved, str) else resolved
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN, gen_kwargs={"files": train_files}
            )
        ]
    # Dict case: one named split per key, values normalized to lists.
    split_generators = []
    for split_name, split_files in resolved.items():
        if isinstance(split_files, str):
            split_files = [split_files]
        split_generators.append(
            datasets.SplitGenerator(name=split_name, gen_kwargs={"files": split_files})
        )
    return split_generators
def _split_generators(self, dl_manager):
    """Download the e-SNLI CSVs; the train split spans two files."""
    local_files = dl_manager.download_and_extract(
        {
            "train": [_URL + "esnli_train_1.csv", _URL + "esnli_train_2.csv"],
            "validation": [_URL + "esnli_dev.csv"],
            "test": [_URL + "esnli_test.csv"],
        }
    )
    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={"files": local_files["train"]},
        ),
        datasets.SplitGenerator(
            name=datasets.Split.VALIDATION,
            gen_kwargs={"files": local_files["validation"]},
        ),
        datasets.SplitGenerator(
            name=datasets.Split.TEST,
            gen_kwargs={"files": local_files["test"]},
        ),
    ]
def _split_generators(self, dl_manager):
    """Download the TMX file; one train split for the configured language pair."""
    extracted_dir = dl_manager.download_and_extract(_DATA_URL)
    tmx_path = os.path.join(extracted_dir, "ECDC-TM", "ECDC.tmx")
    source_lang, target_lang = self.config.language_pair
    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={
                "filepath": tmx_path,
                "source_language": source_lang,
                "target_language": target_lang,
            },
        ),
    ]
def _split_generators(self, dl_manager):
    """Download the NER archive and point each split at its file."""
    downloaded = dl_manager.download_and_extract({"biobert_ner_datasets": _URL})
    dataset_directory = os.path.join(
        downloaded["biobert_ner_datasets"], _BIOBERT_NER_DATASET_DIRECTORY
    )
    split_files = [
        (datasets.Split.TRAIN, _TRAINING_FILE),
        (datasets.Split.VALIDATION, _DEV_FILE),
        (datasets.Split.TEST, _TEST_FILE),
    ]
    return [
        datasets.SplitGenerator(
            name=split_name,
            gen_kwargs={"filepath": f"{dataset_directory}/{filename}"},
        )
        for split_name, filename in split_files
    ]