def _split_generators(self, dl_manager: tfds.download.DownloadManager): if not self.builder_config.labeled: num_shard = 5 files_to_download = {} for i in range(num_shard): files_to_download.update({ f"{i}": f"https://raw.githubusercontent.com/kocohub/korean-hate-speech/f8d05dce2b22007bb149e5139c0060c68ad8f94b/unlabeled/unlabeled_comments_{i + 1}.txt", f"{i}_title": f"https://raw.githubusercontent.com/kocohub/korean-hate-speech/f8d05dce2b22007bb149e5139c0060c68ad8f94b/news_title/unlabeled_comments.news_title_{i + 1}.txt", }) downloaded_files = dl_manager.download(files_to_download) return { "train": itertools.chain(*[ self._generate_examples(downloaded_files[f"{i}"], downloaded_files[f"{i}_title"], str(i)) for i in range(num_shard) ]) } downloaded_files = dl_manager.download({ "train": "https://raw.githubusercontent.com/kocohub/korean-hate-speech/f8d05dce2b22007bb149e5139c0060c68ad8f94b/labeled/train.tsv", "train_title": "https://raw.githubusercontent.com/kocohub/korean-hate-speech/f8d05dce2b22007bb149e5139c0060c68ad8f94b/news_title/train.news_title.txt", "dev": "https://raw.githubusercontent.com/kocohub/korean-hate-speech/f8d05dce2b22007bb149e5139c0060c68ad8f94b/labeled/dev.tsv", "dev_title": "https://raw.githubusercontent.com/kocohub/korean-hate-speech/f8d05dce2b22007bb149e5139c0060c68ad8f94b/news_title/dev.news_title.txt", "dev": "https://raw.githubusercontent.com/kocohub/korean-hate-speech/f8d05dce2b22007bb149e5139c0060c68ad8f94b/labeled/dev.tsv", "dev_title": "https://raw.githubusercontent.com/kocohub/korean-hate-speech/f8d05dce2b22007bb149e5139c0060c68ad8f94b/news_title/dev.news_title.txt", "test": "https://raw.githubusercontent.com/kocohub/korean-hate-speech/f8d05dce2b22007bb149e5139c0060c68ad8f94b/test.no_label.tsv", "test_title": "https://raw.githubusercontent.com/kocohub/korean-hate-speech/f8d05dce2b22007bb149e5139c0060c68ad8f94b/news_title/test.news_title.txt", }) return { "train": self._generate_examples(downloaded_files["train"], 
downloaded_files["train_title"], "train", with_label=True), "dev": self._generate_examples(downloaded_files["dev"], downloaded_files["dev_title"], "dev", with_label=True), "test": self._generate_examples(downloaded_files["test"], downloaded_files["test_title"], "test"), }
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    index_path = dl_manager.download(INDEX_URL)
    with open(index_path, "r", encoding="utf-8") as f:
        index_data = json.load(f)

    # Collect the per-example fields that should never be downloaded for the
    # current config, then strip them from every index entry in one pass.
    dropped = ["transcript", "format"]  # HTML pages are never needed
    if not self._builder_config.include_video:
        dropped += ["video_a", "video_b", "video_c"]
    if self._builder_config.include_pose != "openpose":
        dropped.append("openpose")
    if self._builder_config.include_pose != "holistic":
        dropped += ["holistic_a", "holistic_b"]
    for datum in index_data.values():
        for field in dropped:
            del datum[field]

    # Download every remaining (non-None) URL; the url->url dict makes the
    # download manager return a matching url->local-path mapping.
    urls = {
        url: url
        for datum in index_data.values()
        for url in datum.values()
        if url is not None
    }
    local_paths = dl_manager.download(urls)

    # Replace remote URLs by their local copies, keeping None entries as-is.
    processed_data = {}
    for _id, datum in index_data.items():
        processed_data[_id] = {
            k: (local_paths[v] if v is not None else None)
            for k, v in datum.items()
        }

    return [
        tfds.core.SplitGenerator(name=tfds.Split.TRAIN,
                                 gen_kwargs={"data": processed_data})
    ]
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    # Resolve each archive name against the base URL and download the four
    # files (images + labels for train and test) in a single batch.
    files = dl_manager.download({
        "train_data": urllib.parse.urljoin(_URL, _TRAIN_IMAGES_FILENAME),
        "train_labels": urllib.parse.urljoin(_URL, _TRAIN_LABELS_FILENAME),
        "test_data": urllib.parse.urljoin(_URL, _TEST_IMAGES_FILENAME),
        "test_labels": urllib.parse.urljoin(_URL, _TEST_LABELS_FILENAME),
    })
    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TRAIN,
            gen_kwargs=dict(num_examples=_TRAIN_EXAMPLES,
                            images_path=files["train_data"],
                            label_path=files["train_labels"])),
        tfds.core.SplitGenerator(
            name=tfds.Split.TEST,
            gen_kwargs=dict(num_examples=_TEST_EXAMPLES,
                            images_path=files["test_data"],
                            label_path=files["test_labels"])),
    ]
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    data_index_path = dl_manager.download(_INDEX_URL)
    with GFile(data_index_path, "r") as f:
        data = json.load(f)

    # When videos are requested, download them and rewrite each instance's
    # "video" field to the local copy (None if that video_id has no path).
    if self._builder_config.include_video:
        paths = self._download_videos(data, dl_manager)
        for datum in data:
            for instance in datum["instances"]:
                instance["video"] = paths.get(instance["video_id"])

    # Only openpose poses ship as a downloadable archive for this dataset.
    pose_path = None
    if self._builder_config.include_pose == "openpose":
        pose_path = dl_manager.download_and_extract(
            _POSE_URLS[self._builder_config.include_pose])

    return {
        "train": self._generate_examples(data, pose_path, "train"),
        "validation": self._generate_examples(data, pose_path, "val"),
        "test": self._generate_examples(data, pose_path, "test"),
    }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): if self.builder_config.v == 1: splits = dl_manager.download({ "train": "https://raw.githubusercontent.com/korquad/korquad.github.io/918f5229639203d741045fdcdbb7462c602887da/dataset/KorQuAD_v1.0_train.json", "dev": "https://raw.githubusercontent.com/korquad/korquad.github.io/918f5229639203d741045fdcdbb7462c602887da/dataset/KorQuAD_v1.0_dev.json", }) return { "train": self._generate_examples(splits["train"]), "dev": self._generate_examples(splits["dev"]), } splits = dl_manager.download_and_extract({ "train": [ f"https://raw.githubusercontent.com/korquad/korquad.github.io/918f5229639203d741045fdcdbb7462c602887da/dataset/KorQuAD_2.1/train/KorQuAD_2.1_train_{i:02}.zip" for i in range(13) ], "dev": [ f"https://raw.githubusercontent.com/korquad/korquad.github.io/918f5229639203d741045fdcdbb7462c602887da/dataset/KorQuAD_2.1/dev/KorQuAD_2.1_dev_{i:02}.zip" for i in range(2) ], }) return { "train": itertools.chain.from_iterable( [self._generate_examples(i) for i in splits["train"]]), "dev": itertools.chain.from_iterable( [self._generate_examples(i) for i in splits["dev"]]), }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): url = f"https://raw.githubusercontent.com/naver/nlp-challenge/a51654472e0da75cd37c6e73ffe583db78e68323/missions/{self.builder_config.name}/data/train/train_data" train_file = dl_manager.download(url) return { "train": self._generate_examples(train_file), }
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    # The two files are requested together and unpacked in the same order
    # they were listed (the original code relies on this as well).
    paths = dl_manager.download([_GLOSS_URL, _TEXT_URL])
    gloss_path, text_path = paths
    return {"train": self._generate_examples(gloss_path, text_path)}
def _split_generators(self, dl_manager: tfds.download.DownloadManager): train_split = dl_manager.download( "https://raw.githubusercontent.com/warnikchow/paraKQC/c16270fe6c2e888af07e7cb043248ad31d8a6f9c/data/paraKQC_v1.txt" ) return { "train": self._generate_examples(train_split), }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): train_filepath = dl_manager.download( "https://raw.githubusercontent.com/songys/Chatbot_data/a22e508811b5040eead0be5a89c27ef3780d4e82/ChatbotData%20.csv" ) return { "train": self._generate_examples(train_filepath), }
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    # TODO(netflix_shows): flesh out the download logic and split definitions.
    path = dl_manager.download(_URL)
    # Everything lands in a single TRAIN split for now.
    return [
        tfds.core.SplitGenerator(name=tfds.Split.TRAIN,
                                 gen_kwargs={"path": path}),
    ]
def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" # TODO(bitcoin_prediction_dataset): Downloads the data and defines the splits path = dl_manager.download( 'https://raw.githubusercontent.com/johann-su/ai_tradebot/main/data/financial_data/price_prediction_dataset.csv' ) # TODO(bitcoin_prediction_dataset): Returns the Dict[split names, Iterator[Key, Example]] return { 'train': self._generate_examples(path), }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): files = dl_manager.download( [ f"https://raw.githubusercontent.com/theeluwin/sci-news-sum-kr-50/aca0583651503c1cdfa8ef0bc2ef0976250a33ca/data/{index:02d}.json" for index in range(1, 51) ] ) return { "dev": self._generate_examples(files), }
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Downloads every corpus file named in the bundled filelist.txt.

    filelist.txt ships next to this module; per the upstream note it was
    generated with `ls | sort > filelist.txt` and holds one corpus file
    name per line.
    """
    filelist_path = os.path.join(os.path.dirname(__file__), "filelist.txt")
    with tf.io.gfile.GFile(filelist_path) as f:
        filenames = f.readlines()
    base = ("https://raw.githubusercontent.com/kmounlp/NER/"
            "1e557de738b8e6215c7cacac116e735518c0f680/"
            "말뭉치%20-%20형태소_개체명")
    train_files = dl_manager.download(
        [f"{base}/{name.strip()}" for name in filenames])
    return {"train": self._generate_examples(train_files)}
def _split_generators(self, dl_manager: tfds.download.DownloadManager): splits = dl_manager.download({ "train": "https://raw.githubusercontent.com/e9t/nsmc/cc0670e872d4ac27bfe36c87456783004b39ef6c/ratings_train.txt", "test": "https://raw.githubusercontent.com/e9t/nsmc/cc0670e872d4ac27bfe36c87456783004b39ef6c/ratings_test.txt", }) return { "train": self._generate_examples(splits["train"]), "test": self._generate_examples(splits["test"]), }
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators for the three Pile splits."""
    # NOTE(review): SSL verification is disabled for the download host;
    # this is inherited from the original code — confirm it is still needed.
    dl_manager.verify_ssl = False
    dl_paths = dl_manager.download(_URLS['the_pile'])
    return [
        tfds.core.SplitGenerator(name=split_name,
                                 gen_kwargs={"paths": dl_paths[key]})
        for split_name, key in ((tfds.Split.TRAIN, 'train'),
                                (tfds.Split.VALIDATION, 'validation'),
                                (tfds.Split.TEST, 'test'))
    ]
def _split_generators(self, dl_manager: tfds.download.DownloadManager): splits = dl_manager.download( { "train": "https://raw.githubusercontent.com/KLUE-benchmark/KLUE/ab22cd5cfdd6b527a9a4e2d177f9dacb85ddde2c/klue_benchmark/klue-nli-v1/klue-nli-v1_train.json", "dev": "https://raw.githubusercontent.com/KLUE-benchmark/KLUE/ab22cd5cfdd6b527a9a4e2d177f9dacb85ddde2c/klue_benchmark/klue-nli-v1/klue-nli-v1_dev.json", } ) return { "train": self._generate_examples(splits["train"]), "dev": self._generate_examples(splits["dev"]), }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): splits = dl_manager.download({ "train": "https://raw.githubusercontent.com/kakaobrain/KorNLUDatasets/0df0fe7d496eb61b092e022e238c2230b29f1cbc/KorSTS/sts-train.tsv", "dev": "https://raw.githubusercontent.com/kakaobrain/KorNLUDatasets/0df0fe7d496eb61b092e022e238c2230b29f1cbc/KorSTS/sts-dev.tsv", "test": "https://raw.githubusercontent.com/kakaobrain/KorNLUDatasets/0df0fe7d496eb61b092e022e238c2230b29f1cbc/KorSTS/sts-test.tsv", }) return { "train": self._generate_examples(splits["train"]), "dev": self._generate_examples(splits["dev"]), "test": self._generate_examples(splits["test"]), }
def _split_generators(
        self, dl_manager: tfds.download.DownloadManager
) -> List[tfds.core.SplitGenerator]:
    """Download the dataset archive and define the (single) train split."""
    dataset_files = dl_manager.download({
        "dataset_archive": urllib.parse.urljoin(self.URL, _GTA_DATA_FILENAME),
    })
    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TRAIN,
            gen_kwargs=dict(
                dataset_archive_path=dataset_files["dataset_archive"]),
        ),
    ]
def _download_and_extract_multipart( self, dl_manager: tfds.download.DownloadManager, url: str, parts: int, pwd: str = None): """Download and extract multipart zip file""" # Write OpenPose disclaimer if self._builder_config.include_pose == "openpose": print(_OPENPOSE_DISCLAIMER) # Make sure not already downloaded dirname = get_dl_dirname(url) output_path = os.path.join(dl_manager._download_dir, dirname) output_path_extracted = os.path.join(dl_manager._extract_dir, dirname) print("output_path", output_path) print("output_path_extracted", output_path_extracted) if not os.path.isfile(output_path): parts = [url + f".{i + 1:03}" for i in range(parts)] files = dl_manager.download(parts) # Cat parts to single file with open(output_path, "ab") as cat_file: for f in files: with open(f, "rb") as z: cat_file.write(z.read()) if not os.path.isdir(output_path_extracted): # Extract file os.makedirs(output_path_extracted) pwd_bytes = bytes(pwd, "utf-8") if pwd is not None else None with ZipFile(output_path, "r") as zip_obj: # Loop over each file for file in tqdm(iterable=zip_obj.namelist(), total=len(zip_obj.namelist())): zip_obj.extract(member=file, path=output_path_extracted, pwd=pwd_bytes) return output_path_extracted
def _split_generators(self, dl_manager: tfds.download.DownloadManager): splits = dl_manager.download({ "train": "https://raw.githubusercontent.com/songys/Question_pair/e84b6f0e784c10c6a22cbbc7b1e415b901baa877/train.txt", "test": "https://raw.githubusercontent.com/songys/Question_pair/e84b6f0e784c10c6a22cbbc7b1e415b901baa877/test.txt", "validation": "https://raw.githubusercontent.com/songys/Question_pair/e84b6f0e784c10c6a22cbbc7b1e415b901baa877/validation.txt", }) return { "train": self._generate_examples(splits["train"], split_name="train"), "test": self._generate_examples(splits["test"], split_name="test"), "validation": self._generate_examples(splits["validation"], split_name="validation"), }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): splits = dl_manager.download({ "mnli_train": "https://raw.githubusercontent.com/kakaobrain/KorNLUDatasets/0df0fe7d496eb61b092e022e238c2230b29f1cbc/KorNLI/multinli.train.ko.tsv", "snli_train": "https://raw.githubusercontent.com/kakaobrain/KorNLUDatasets/0df0fe7d496eb61b092e022e238c2230b29f1cbc/KorNLI/snli_1.0_train.ko.tsv", "xnli_dev": "https://raw.githubusercontent.com/kakaobrain/KorNLUDatasets/0df0fe7d496eb61b092e022e238c2230b29f1cbc/KorNLI/xnli.dev.ko.tsv", "xnli_test": "https://raw.githubusercontent.com/kakaobrain/KorNLUDatasets/0df0fe7d496eb61b092e022e238c2230b29f1cbc/KorNLI/xnli.test.ko.tsv", }) return { "mnli_train": self._generate_examples(splits["mnli_train"], prefix="mnli_train"), "snli_train": self._generate_examples(splits["snli_train"], prefix="snli_train"), "xnli_dev": self._generate_examples(splits["xnli_dev"], prefix="xnli_dev"), "xnli_test": self._generate_examples(splits["xnli_test"], prefix="xnli_test"), }
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Downloads the sub-corpus selected by the builder config.

    Five configurations are supported: Korean-English news, JHE,
    Korean-French (jim), North-Korean-English news, and the bible corpus.
    The news-style corpora ship as tar.gz archives (download_and_extract);
    the others are plain parallel text files (download).

    Returns:
        Dict mapping split name to an example generator.
    """
    repo = ("https://raw.githubusercontent.com/jungyeul/"
            "korean-parallel-corpora/"
            "07883d4cae4e309dfde8c48c5f37ebea0b59574e")
    if self.builder_config.name == NEWS_NAME:
        splits = dl_manager.download_and_extract({
            "train": f"{repo}/korean-english-news-v1/korean-english-park.train.tar.gz",
            "dev": f"{repo}/korean-english-news-v1/korean-english-park.dev.tar.gz",
            "test": f"{repo}/korean-english-news-v1/korean-english-park.test.tar.gz",
        })
        return {
            split: self._generate_examples(
                splits[split] / f"korean-english-park.{split}.ko",
                splits[split] / f"korean-english-park.{split}.en",
                src_name="ko",
                tgt_name="en",
            )
            for split in ("train", "dev", "test")
        }
    if self.builder_config.name == JHE_NAME:
        splits = dl_manager.download({
            "dev.ko": f"{repo}/korean-english-jhe/jhe-koen-dev.ko",
            "dev.en": f"{repo}/korean-english-jhe/jhe-koen-dev.en",
            # BUG FIX: "eval.ko" previously pointed at the English file
            # (jhe-koen-eval.en), so the eval split got English on both
            # sides; it now fetches the Korean side.
            "eval.ko": f"{repo}/korean-english-jhe/jhe-koen-eval.ko",
            "eval.en": f"{repo}/korean-english-jhe/jhe-koen-eval.en",
        })
        return {
            "dev": self._generate_examples(splits["dev.ko"],
                                           splits["dev.en"],
                                           src_name="ko", tgt_name="en"),
            "eval": self._generate_examples(splits["eval.ko"],
                                            splits["eval.en"],
                                            src_name="ko", tgt_name="en"),
        }
    if self.builder_config.name == FRENCH_JIM_NAME:
        splits = dl_manager.download_and_extract({
            "train": f"{repo}/korean-french-jim-v1/korean-french-park-v1.train.tar.gz",
            "test": f"{repo}/korean-french-jim-v1/korean-french-park-v1.test.tar.gz",
        })
        return {
            split: self._generate_examples(
                splits[split] / f"korean-french-park-v1.{split}.ko",
                splits[split] / f"korean-french-park-v1.{split}.fr",
                src_name="ko",
                tgt_name="fr",
            )
            for split in ("train", "test")
        }
    if self.builder_config.name == NORTH_KOREAN_NEWS_NAME:
        splits = dl_manager.download({
            f"{split}.{ext}": (
                f"{repo}/northkorean-english-news-v1/"
                f"northkorean-english.{split}.{ext}")
            for split in ("dev", "test")
            for ext in ("nk", "en")
        })
        return {
            split: self._generate_examples(splits[f"{split}.nk"],
                                           splits[f"{split}.en"],
                                           src_name="nk", tgt_name="en")
            for split in ("dev", "test")
        }
    if self.builder_config.name == BIBLE_NAME:
        # Upstream names the Korean bible file "kr" rather than "ko".
        splits = dl_manager.download({
            "train.ko": f"{repo}/bible/bible-all.kr.txt",
            "train.en": f"{repo}/bible/bible-all.en.txt",
        })
        return {
            "train": self._generate_examples(splits["train.ko"],
                                             splits["train.en"],
                                             src_name="ko", tgt_name="en")
        }
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    # Train/validation label files are always downloaded; no test label file
    # is fetched here (test_labels stays None).
    train_labels = dl_manager.download(_TRAIN_LABELS)
    valid_labels = dl_manager.download(_VALID_LABELS)
    test_labels = None  # NOTE(review): presumably test labels are not released — confirm.
    # Load videos if needed.  Each split's videos ship as a multipart,
    # password-protected archive; the helper returns the extraction dir.
    if self._builder_config.include_video:
        train_parts = self._download_and_extract_multipart(
            dl_manager,
            url=_TRAIN_VIDEOS,
            parts=18,
            pwd=self.train_decryption_key)
        train_videos = os.path.join(train_parts, "train")
        valid_parts = self._download_and_extract_multipart(
            dl_manager,
            url=_VALID_VIDEOS,
            parts=3,
            pwd=self.valid_decryption_key)
        valid_videos = os.path.join(valid_parts, "val")
        test_parts = self._download_and_extract_multipart(
            dl_manager,
            url=_TEST_VIDEOS,
            parts=3,
            pwd=self.test_decryption_key)
        test_videos = os.path.join(test_parts, "test")
    else:
        train_videos = valid_videos = test_videos = None
    # Load poses if needed.
    # NOTE(review): `path.join` below (vs `os.path.join` above) assumes
    # `from os import path` at module level — verify the import block.
    if self._builder_config.include_pose is not None:
        pose_path = dl_manager.download_and_extract(
            _POSE_URLS[self._builder_config.include_pose])
        train_pose_path = path.join(pose_path,
                                    self._builder_config.include_pose,
                                    "train")
        valid_pose_path = path.join(pose_path,
                                    self._builder_config.include_pose,
                                    "validation")
        test_pose_path = path.join(pose_path,
                                   self._builder_config.include_pose,
                                   "test")
    else:
        train_pose_path = valid_pose_path = test_pose_path = None
    splits = [
        tfds.core.SplitGenerator(
            name=tfds.Split.TRAIN,
            gen_kwargs={
                "videos_path": train_videos,
                "poses_path": train_pose_path,
                "labels_path": train_labels
            },
        ),
        tfds.core.SplitGenerator(
            name=tfds.Split.VALIDATION,
            gen_kwargs={
                "videos_path": valid_videos,
                "poses_path": valid_pose_path,
                "labels_path": valid_labels
            },
        )
    ]
    # Only emit a TEST split when at least one test artifact (videos, poses,
    # or labels) exists; with the code above, test_labels is always None, so
    # this hinges on the video/pose config.
    if test_videos is not None or test_pose_path is not None or test_labels is not None:
        splits.append(
            tfds.core.SplitGenerator(
                name=tfds.Split.TEST,
                gen_kwargs={
                    "videos_path": test_videos,
                    "poses_path": test_pose_path,
                    "labels_path": test_labels,
                },
            ))
    return splits