"https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfM1BxdkxVaTY2bWs", "test_urls": "https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_test.txt", "train_urls": "https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_train.txt", "val_urls": "https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_val.txt", # pylint: enable=line-too-long } _HIGHLIGHTS = "highlights" _ARTICLE = "article" _SUPPORTED_VERSIONS = [ # Using cased version. datasets.Version("3.0.0", "Using cased version."), # Same data as 0.0.2 datasets.Version("1.0.0", ""), # Having the model predict newline separators makes it easier to evaluate # using summary-level ROUGE. datasets.Version("2.0.0", "Separate target sentences with newline."), ] _DEFAULT_VERSION = datasets.Version("3.0.0", "Using cased version.") class CnnDailymailConfig(datasets.BuilderConfig): """BuilderConfig for CnnDailymail.""" def __init__(self, **kwargs): """BuilderConfig for CnnDailymail.
class FewRel(datasets.GeneratorBasedBuilder):
    """The FewRelDataset."""

    VERSION = datasets.Version("1.0.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="default",
            version=VERSION,
            description="This covers the entire FewRel dataset.",
        ),
    ]

    def _info(self):
        features = datasets.Features(
            {
                "relation": datasets.Value("string"),
                "tokens": datasets.Sequence(datasets.Value("string")),
                "head": {
                    "text": datasets.Value("string"),
                    "type": datasets.Value("string"),
                    "indices": datasets.Sequence(datasets.Sequence(datasets.Value("int64"))),
                },
                "tail": {
                    "text": datasets.Value("string"),
                    "type": datasets.Value("string"),
                    "indices": datasets.Sequence(datasets.Sequence(datasets.Value("int64"))),
                },
                "names": datasets.Sequence(datasets.Value("string")),
                # These are the features of your dataset like images, labels ...
            }
        )
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # This defines the different columns of the dataset and their types
            features=features,  # Here we define them above because they are different between the two configurations
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage=_HOMEPAGE,
            # License for the dataset if available
            license=_LICENSE,
            # Citation for the dataset
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        data_dir = dl_manager.download_and_extract(_URLs)
        return [
            datasets.SplitGenerator(
                name=datasets.Split(key),
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": data_dir[key],
                    "pid2name": data_dir["pid2name"],
                    "return_names": key in ["train_wiki", "val_wiki", "val_nyt"],
                },
            )
            for key in data_dir.keys()
            if key != "pid2name"
        ]

    def _generate_examples(self, filepath, pid2name, return_names):
        """Yields examples."""
        pid2name_dict = {}
        with open(pid2name, encoding="utf-8") as f:
            data = json.load(f)
            for key in list(data.keys()):
                name_1 = data[key][0]
                name_2 = data[key][1]
                pid2name_dict[key] = [name_1, name_2]
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)
            if isinstance(data, dict):
                id_ = 0
                for key in list(data.keys()):
                    for items in data[key]:
                        tokens = items["tokens"]
                        h_0 = items["h"][0]
                        h_1 = items["h"][1]
                        h_2 = items["h"][2]
                        t_0 = items["t"][0]
                        t_1 = items["t"][1]
                        t_2 = items["t"][2]
                        id_ += 1
                        yield id_, {
                            "relation": key,
                            "tokens": tokens,
                            "head": {"text": h_0, "type": h_1, "indices": h_2},
                            "tail": {"text": t_0, "type": t_1, "indices": t_2},
                            "names": pid2name_dict[key] if return_names else [key],
                        }
            else:  # For `pubmed_unsupervised.json`
                id_ = 0
                for items in data:
                    tokens = items["tokens"]
                    h_0 = items["h"][0]
                    h_1 = items["h"][1]
                    h_2 = items["h"][2]
                    t_0 = items["t"][0]
                    t_1 = items["t"][1]
                    t_2 = items["t"][2]
                    id_ += 1
                    yield id_, {
                        "relation": "",
                        "tokens": tokens,
                        "head": {"text": h_0, "type": h_1, "indices": h_2},
                        "tail": {"text": t_0, "type": t_1, "indices": t_2},
                        "names": [""],
                    }
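# ---------------------------------------------------------------------------
# Usage sketch (not part of the loading script above). It assumes the builder
# is registered under the dataset name "few_rel" and that the "train_wiki"
# split named in _split_generators is available; both are assumptions made for
# illustration only.
if __name__ == "__main__":
    from datasets import load_dataset

    few_rel = load_dataset("few_rel", split="train_wiki")
    example = few_rel[0]
    # "indices" is a sequence of sequences: a mention may cover several token runs.
    head_tokens = [example["tokens"][i] for span in example["head"]["indices"] for i in span]
    print(example["relation"], example["names"], head_tokens)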
class Piaf(datasets.GeneratorBasedBuilder):
    """The Piaf Question Answering Dataset. Version 1.0."""

    BUILDER_CONFIGS = [
        PiafConfig(
            name="plain_text",
            version=datasets.Version("1.0.0", ""),
            description="Plain text",
        ),
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "title": datasets.Value("string"),
                    "context": datasets.Value("string"),
                    "question": datasets.Value("string"),
                    "answers": datasets.features.Sequence(
                        {
                            "text": datasets.Value("string"),
                            "answer_start": datasets.Value("int32"),
                        }
                    ),
                }
            ),
            # No default supervised_keys (as we have to pass both question
            # and context as input).
            supervised_keys=None,
            homepage="https://piaf.etalab.studio",
            citation=_CITATION,
            task_templates=[
                QuestionAnsweringExtractive(
                    question_column="question",
                    context_column="context",
                    answers_column="answers",
                )
            ],
        )

    def _split_generators(self, dl_manager):
        urls_to_download = _URLS
        downloaded_files = dl_manager.download_and_extract(urls_to_download)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": downloaded_files["train"]},
            ),
        ]

    def _generate_examples(self, filepath):
        """This function returns the examples in the raw (text) form."""
        logger.info("generating examples from = %s", filepath)
        with open(filepath, encoding="utf-8") as f:
            dataset = json.load(f)
            for article in dataset["data"]:
                title = article.get("title", "").strip()
                for paragraph in article["paragraphs"]:
                    context = paragraph["context"].strip()
                    for qa in paragraph["qas"]:
                        question = qa["question"].strip()
                        id_ = qa["id"]
                        answer_starts = [answer["answer_start"] for answer in qa["answers"]]
                        answers = [answer["text"].strip() for answer in qa["answers"]]
                        # Features currently used are "context", "question", and "answers".
                        # Others are extracted here for the ease of future expansions.
                        yield id_, {
                            "title": title,
                            "context": context,
                            "question": question,
                            "id": id_,
                            "answers": {
                                "answer_start": answer_starts,
                                "text": answers,
                            },
                        }
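# ---------------------------------------------------------------------------
# Usage sketch (illustration only): the builder above only defines a TRAIN
# split, so that is the only split available at load time. It assumes the
# script is registered under the dataset name "piaf".
if __name__ == "__main__":
    from datasets import load_dataset

    piaf = load_dataset("piaf", split="train")
    sample = piaf[0]
    # "answers" is a sequence feature, so "text" holds a list of answer strings.
    print(sample["question"], "->", sample["answers"]["text"])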
class Glucose(datasets.GeneratorBasedBuilder):
    """GLUCOSE: GeneraLized and COntextualized Story Explanations, is a novel
    conceptual framework and dataset for commonsense reasoning."""

    VERSION = datasets.Version("1.1.0")
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="glucose", description="Main dataset"),
    ]

    def _info(self):
        feature_dict = {
            "experiment_id": datasets.Value("string"),
            "story_id": datasets.Value("string"),
            # The train set contains only one ID in numeric form
            "worker_id": datasets.Value("int64"),
            # The test set contains several IDs in string form
            "worker_ids": datasets.Value("string"),
            "submission_time_normalized": datasets.Value("string"),
            "worker_quality_assessment": datasets.Value("int64"),
            "selected_sentence_index": datasets.Value("int64"),
            "story": datasets.Value("string"),
            "selected_sentence": datasets.Value("string"),
            "number_filled_in": datasets.Value("int64"),
        }
        for i in range(1, 11):
            feature_dict[f"{i}_specificNL"] = datasets.Value("string")
            feature_dict[f"{i}_specificStructured"] = datasets.Value("string")
            feature_dict[f"{i}_generalNL"] = datasets.Value("string")
            feature_dict[f"{i}_generalStructured"] = datasets.Value("string")
        features = datasets.Features(feature_dict)
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        train_url = _URLs[self.config.name]["train"]
        test_url = _URLs[self.config.name]["test"]
        train_data = dl_manager.download_and_extract(train_url)
        test_data = dl_manager.download_and_extract(test_url)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": os.path.join(train_data, "GLUCOSE_training_data_final.csv"),
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"filepath": test_data, "split": "test"},
            ),
        ]

    def _generate_examples(self, filepath, split):
        with open(filepath, encoding="utf8") as f:
            data = csv.reader(f)
            # Skip the CSV header row.
            next(data)
            for id_, row in enumerate(data):
                if split == "train":
                    yield id_, train_dict_from_row(row)
                else:
                    yield id_, test_dict_from_row(row)
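# ---------------------------------------------------------------------------
# Hypothetical sketch of the row-to-dict conversion used above: the real
# train_dict_from_row / test_dict_from_row helpers are defined elsewhere in the
# script and are not reproduced here. The column-order argument and the -1
# sentinel are assumptions made purely for illustration, not the documented
# GLUCOSE CSV layout.
def _dict_from_row_sketch(row, column_names):
    """Map one CSV row onto the feature dictionary declared in _info()."""
    example = dict(zip(column_names, row))
    # Integer-typed features must be cast; empty cells fall back to -1.
    for key in ("worker_id", "worker_quality_assessment",
                "selected_sentence_index", "number_filled_in"):
        if key in example:
            example[key] = int(example[key]) if example[key] else -1
    return example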
class Scicite(datasets.GeneratorBasedBuilder):
    """This is a dataset for classifying citation intents in academic papers."""

    VERSION = datasets.Version("1.0.0")

    def _info(self):
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # datasets.features.FeatureConnectors
            features=datasets.Features(
                {
                    "string": datasets.Value("string"),
                    "sectionName": datasets.Value("string"),
                    "label": datasets.features.ClassLabel(names=["method", "background", "result"]),
                    "citingPaperId": datasets.Value("string"),
                    "citedPaperId": datasets.Value("string"),
                    "excerpt_index": datasets.Value("int32"),
                    "isKeyCitation": datasets.Value("bool"),
                    "label2": datasets.features.ClassLabel(
                        names=["supportive", "not_supportive", "cant_determine", "none"]
                    ),
                    "citeEnd": datasets.Value("int64"),
                    "citeStart": datasets.Value("int64"),
                    "source": datasets.features.ClassLabel(names=_SOURCE_NAMES),
                    "label_confidence": datasets.Value("float32"),
                    "label2_confidence": datasets.Value("float32"),
                    "id": datasets.Value("string"),
                }
            ),
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage="https://github.com/allenai/scicite",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        archive = dl_manager.download(
            "https://s3-us-west-2.amazonaws.com/ai2-s2-research/scicite/scicite.tar.gz"
        )
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": "/".join(["scicite", "train.jsonl"]),
                    "files": dl_manager.iter_archive(archive),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filepath": "/".join(["scicite", "dev.jsonl"]),
                    "files": dl_manager.iter_archive(archive),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath": "/".join(["scicite", "test.jsonl"]),
                    "files": dl_manager.iter_archive(archive),
                },
            ),
        ]

    def _generate_examples(self, filepath, files):
        """Yields examples."""
        for path, f in files:
            if path == filepath:
                unique_ids = {}
                for line in f:
                    d = json.loads(line.decode("utf-8"))
                    unique_id = str(d["unique_id"])
                    if unique_id in unique_ids:
                        continue
                    unique_ids[unique_id] = True
                    yield unique_id, {
                        "string": d["string"],
                        "label": str(d["label"]),
                        "sectionName": str(d["sectionName"]),
                        "citingPaperId": str(d["citingPaperId"]),
                        "citedPaperId": str(d["citedPaperId"]),
                        "excerpt_index": int(d["excerpt_index"]),
                        "isKeyCitation": bool(d["isKeyCitation"]),
                        "label2": str(d.get("label2", "none")),
                        "citeEnd": _safe_int(d["citeEnd"]),
                        "citeStart": _safe_int(d["citeStart"]),
                        "source": str(d["source"]),
                        "label_confidence": float(d.get("label_confidence", 0.0)),
                        "label2_confidence": float(d.get("label2_confidence", 0.0)),
                        "id": str(d["id"]),
                    }
                break
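# ---------------------------------------------------------------------------
# Sketch of the _safe_int helper referenced above, which is defined elsewhere
# in the script and not shown in this excerpt. The -1 sentinel is an
# assumption for illustration: citeEnd / citeStart are occasionally missing or
# NaN in the raw JSONL, and the int64 feature needs some placeholder value.
import math


def _safe_int_sketch(value):
    """Coerce a possibly-missing citation offset to int, using -1 as sentinel."""
    if value is None:
        return -1
    if isinstance(value, float) and math.isnan(value):
        return -1
    return int(value)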
class ArabicPosDialect(datasets.GeneratorBasedBuilder):
    """POS-tagged Arabic tweets in four major dialects."""

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIG_CLASS = ArabicPosDialectConfig
    BUILDER_CONFIGS = [
        ArabicPosDialectConfig(
            name=dialect,
            dialect=dialect,
            description="A set of 350 tweets in the {} dialect of Arabic that have been manually segmented and POS tagged.".format(
                dialect
            ),
        )
        for dialect in _DIALECTS
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "fold": datasets.Value("int32"),
                    "subfold": datasets.Value("string"),
                    "words": datasets.Sequence(datasets.Value("string")),
                    "segments": datasets.Sequence(datasets.Value("string")),
                    "pos_tags": datasets.Sequence(datasets.Value("string")),
                }
            ),
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            homepage="https://alt.qcri.org/resources/da_resources/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # dl_manager is a datasets.download.DownloadManager that can be used to
        # download and extract URLs
        urls_to_download = {
            dialect: _URL + "seg_plus_pos_{}.txt".format(dialect) for dialect in _DIALECTS
        }
        dl_dir = dl_manager.download_and_extract(urls_to_download)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepath": dl_dir[self.config.dialect]},
            )
        ]

    def _generate_examples(self, filepath):
        """Yields examples in the raw (text) form."""
        with open(filepath, encoding="utf-8") as csv_file:
            reader = csv.DictReader(csv_file, delimiter="\t", quoting=csv.QUOTE_NONE)
            fold = -1
            subfold = ""
            words = []
            segments = []
            pos_tags = []
            curr_sent = -1
            for idx, row in enumerate(reader):
                # first example
                if fold == -1:
                    fold = row["Fold"]
                    subfold = row["SubFold"]
                    curr_sent = int(row["SentID"])
                if int(row["SentID"]) != curr_sent:
                    # A new sentence starts: emit the one collected so far.
                    yield curr_sent, {
                        "fold": int(fold),
                        "subfold": subfold,
                        "words": words,
                        "segments": segments,
                        "pos_tags": pos_tags,
                    }
                    fold = row["Fold"]
                    subfold = row["SubFold"]
                    words = [row["Word"]]
                    segments = [row["Segmentation"]]
                    pos_tags = [row["POS"]]
                    curr_sent = int(row["SentID"])
                else:
                    words.append(row["Word"])
                    segments.append(row["Segmentation"])
                    pos_tags.append(row["POS"])
            # last example
            yield curr_sent, {
                "fold": int(fold),
                "subfold": subfold,
                "words": words,
                "segments": segments,
                "pos_tags": pos_tags,
            }
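# ---------------------------------------------------------------------------
# Illustration only: a tiny in-memory file in the column layout the reader
# above expects. The header names (Fold, SubFold, SentID, Word, Segmentation,
# POS) are taken from the dict keys used in _generate_examples; the row values
# themselves are made up.
if __name__ == "__main__":
    import csv
    import io

    _SAMPLE_TSV = (
        "Fold\tSubFold\tSentID\tWord\tSegmentation\tPOS\n"
        "1\tA\t1\tكتب\tكتب\tV\n"
    )
    for row in csv.DictReader(io.StringIO(_SAMPLE_TSV), delimiter="\t", quoting=csv.QUOTE_NONE):
        print(row["SentID"], row["Word"], row["POS"])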
class Reclor(datasets.GeneratorBasedBuilder):
    """ReClor: a reading comprehension dataset requiring logical reasoning."""

    VERSION = datasets.Version("0.1.0")

    @property
    def manual_download_instructions(self):
        return """\
    To use ReClor you need to download it manually. Please go to its homepage (http://whyu.me/reclor/),
    fill the Google form and you will receive a download link and a password to extract it.
    Please extract all files in one folder and use the folder path in
    datasets.load_dataset('reclor', data_dir='path/to/folder/folder_name')
    """

    def _info(self):
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # datasets.features.FeatureConnectors
            features=datasets.Features(
                {
                    # These are the features of your dataset like images, labels ...
                    "context": datasets.Value("string"),
                    "question": datasets.Value("string"),
                    "answers": datasets.features.Sequence(datasets.Value("string")),
                    "label": datasets.Value("string"),
                    "id_string": datasets.Value("string"),
                }
            ),
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage="http://whyu.me/reclor/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # dl_manager is a datasets.download.DownloadManager that can be used to
        # download and extract URLs
        data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))
        if not os.path.exists(data_dir):
            raise FileNotFoundError(
                "{} does not exist. Make sure you insert a manual dir via `datasets.load_dataset('reclor', data_dir=...)` that includes files unzipped from the reclor zip. Manual download instructions: {}".format(
                    data_dir, self.manual_download_instructions
                )
            )
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepath": os.path.join(data_dir, "train.json")},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepath": os.path.join(data_dir, "test.json")},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepath": os.path.join(data_dir, "val.json")},
            ),
        ]

    def _generate_examples(self, filepath):
        """Yields examples."""
        # Yields (key, example) tuples from the dataset
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)
            for id_, row in enumerate(data):
                yield id_, {
                    "context": row["context"],
                    "question": row["question"],
                    "answers": row["answers"],
                    # The blind test set ships without labels, hence the fallback.
                    "label": str(row.get("label", "")),
                    "id_string": row["id_string"],
                }
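# ---------------------------------------------------------------------------
# Usage sketch (illustration only): ReClor requires a manual download, so the
# extracted folder has to be passed via data_dir, as the instructions above
# describe. The path below is a placeholder, not a real location.
if __name__ == "__main__":
    from datasets import load_dataset

    reclor = load_dataset("reclor", data_dir="path/to/extracted/reclor")
    print(reclor["train"][0]["question"])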
class SquadV1Pt(datasets.GeneratorBasedBuilder):
    """SQuAD v1.1 translated to Portuguese (squad-v1.1-pt)."""

    VERSION = datasets.Version("1.1.0")

    def _info(self):
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # datasets.features.FeatureConnectors
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "title": datasets.Value("string"),
                    "context": datasets.Value("string"),
                    "question": datasets.Value("string"),
                    "answers": datasets.features.Sequence(
                        {
                            "text": datasets.Value("string"),
                            "answer_start": datasets.Value("int32"),
                        }
                    ),
                    # These are the features of your dataset like images, labels ...
                }
            ),
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage="https://github.com/nunorc/squad-v1.1-pt",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # dl_manager is a datasets.download.DownloadManager that can be used to
        # download and extract URLs
        urls_to_download = _URLS
        downloaded_files = dl_manager.download_and_extract(urls_to_download)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": downloaded_files["train"]},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={"filepath": downloaded_files["dev"]},
            ),
        ]

    def _generate_examples(self, filepath):
        """Yields examples."""
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)
            for example in data["data"]:
                title = example.get("title", "").strip()
                for paragraph in example["paragraphs"]:
                    context = paragraph["context"].strip()
                    for qa in paragraph["qas"]:
                        question = qa["question"].strip()
                        id_ = qa["id"]
                        answer_starts = [answer["answer_start"] for answer in qa["answers"]]
                        answers = [answer["text"].strip() for answer in qa["answers"]]
                        yield id_, {
                            "title": title,
                            "context": context,
                            "question": question,
                            "id": id_,
                            "answers": {
                                "answer_start": answer_starts,
                                "text": answers,
                            },
                        }
class Spider(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("1.0.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="spider",
            version=VERSION,
            description="Spider: A Large-Scale Human-Labeled Dataset for Text-to-SQL Tasks",
        ),
    ]

    def _info(self):
        features = datasets.Features(
            {
                "db_id": datasets.Value("string"),
                "query": datasets.Value("string"),
                "question": datasets.Value("string"),
                "query_toks": datasets.features.Sequence(datasets.Value("string")),
                "query_toks_no_value": datasets.features.Sequence(datasets.Value("string")),
                "question_toks": datasets.features.Sequence(datasets.Value("string")),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        downloaded_filepath = dl_manager.download_and_extract(_URL)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "data_filepath": os.path.join(downloaded_filepath, "spider/train_spider.json"),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "data_filepath": os.path.join(downloaded_filepath, "spider/dev.json"),
                },
            ),
        ]

    def _generate_examples(self, data_filepath):
        """This function returns the examples in the raw (text) form."""
        logger.info("generating examples from = %s", data_filepath)
        with open(data_filepath, encoding="utf-8") as f:
            spider = json.load(f)
            for idx, sample in enumerate(spider):
                yield idx, {
                    "db_id": sample["db_id"],
                    "query": sample["query"],
                    "question": sample["question"],
                    "query_toks": sample["query_toks"],
                    "query_toks_no_value": sample["query_toks_no_value"],
                    "question_toks": sample["question_toks"],
                }
class DailyDialog(datasets.GeneratorBasedBuilder):
    """DailyDialog: A Manually Labelled Multi-turn Dialogue Dataset"""

    VERSION = datasets.Version("1.0.0")

    __EOU__ = "__eou__"

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "dialog": datasets.features.Sequence(datasets.Value("string")),
                    "act": datasets.features.Sequence(datasets.ClassLabel(names=list(act_label.values()))),
                    "emotion": datasets.features.Sequence(datasets.ClassLabel(names=list(emotion_label.values()))),
                }
            ),
            supervised_keys=None,
            homepage="http://yanran.li/dailydialog",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager):
        """Returns SplitGenerators."""
        # dl_manager is a datasets.download.DownloadManager that can be used to
        # download and extract URLs
        dl_dir = dl_manager.download_and_extract(_URL)
        data_dir = os.path.join(dl_dir, "ijcnlp_dailydialog")

        # The splits are nested inside the zip
        for name in ("train", "validation", "test"):
            zip_fpath = os.path.join(data_dir, f"{name}.zip")
            with ZipFile(zip_fpath) as zip_file:
                zip_file.extractall(path=data_dir)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "file_path": os.path.join(data_dir, "train", "dialogues_train.txt"),
                    "act_path": os.path.join(data_dir, "train", "dialogues_act_train.txt"),
                    "emotion_path": os.path.join(data_dir, "train", "dialogues_emotion_train.txt"),
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "file_path": os.path.join(data_dir, "test", "dialogues_test.txt"),
                    "act_path": os.path.join(data_dir, "test", "dialogues_act_test.txt"),
                    "emotion_path": os.path.join(data_dir, "test", "dialogues_emotion_test.txt"),
                    "split": "test",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "file_path": os.path.join(data_dir, "validation", "dialogues_validation.txt"),
                    "act_path": os.path.join(data_dir, "validation", "dialogues_act_validation.txt"),
                    "emotion_path": os.path.join(data_dir, "validation", "dialogues_emotion_validation.txt"),
                    "split": "dev",
                },
            ),
        ]

    def _generate_examples(self, file_path, act_path, emotion_path, split):
        """Yields examples."""
        # Yields (key, example) tuples from the dataset
        with open(file_path, "r", encoding="utf-8") as f, open(act_path, "r", encoding="utf-8") as act_file, open(
            emotion_path, "r", encoding="utf-8"
        ) as emotion_file:
            for i, (line_f, line_act, line_emotion) in enumerate(zip(f, act_file, emotion_file)):
                if len(line_f.strip()) == 0:
                    break
                dialog = line_f.split(self.__EOU__)[:-1]
                acts = line_act.split(" ")[:-1]
                emotions = line_emotion.split(" ")[:-1]

                assert len(dialog) == len(acts) == len(emotions), "Different turns btw dialogue & emotion & action"

                yield f"{split}-{i}", {
                    "dialog": dialog,
                    "act": [act_label[x] for x in acts],
                    "emotion": [emotion_label[x] for x in emotions],
                }
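# ---------------------------------------------------------------------------
# Sketch of the label maps referenced above: act_label and emotion_label are
# defined elsewhere in the script and are not shown in this excerpt. The
# numeric scheme below follows the one described in the DailyDialog paper;
# treat the exact strings as an assumption made for illustration.
_ACT_LABEL_SKETCH = {
    "1": "inform",
    "2": "question",
    "3": "directive",
    "4": "commissive",
}
_EMOTION_LABEL_SKETCH = {
    "0": "no_emotion",
    "1": "anger",
    "2": "disgust",
    "3": "fear",
    "4": "happiness",
    "5": "sadness",
    "6": "surprise",
}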
class WebNlg(datasets.GeneratorBasedBuilder):
    """The WebNLG corpus"""

    VERSION = datasets.Version("3.0.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="webnlg_challenge_2017", description="WebNLG Challenge 2017 data, covers 10 DBpedia categories."
        ),
        datasets.BuilderConfig(name="release_v1", description="Covers 15 DBpedia categories."),
        datasets.BuilderConfig(
            name="release_v2", description="Includes release_v1 and test data from the WebNLG challenge."
        ),
        datasets.BuilderConfig(
            name="release_v2_constrained",
            description="Same data as v2, the split into train/dev/test is more challenging.",
        ),
        datasets.BuilderConfig(name="release_v2.1", description="5,667 texts from v2 were cleaned."),
        datasets.BuilderConfig(
            name="release_v2.1_constrained",
            description="Same data as v2.1, the split into train/dev/test is more challenging.",
        ),
        datasets.BuilderConfig(
            name="release_v3.0_en", description="WebNLG+ data used in the WebNLG challenge 2020. English."
        ),
        datasets.BuilderConfig(
            name="release_v3.0_ru", description="WebNLG+ data used in the WebNLG challenge 2020. Russian."
        ),
    ]

    def _info(self):
        features = datasets.Features(
            {
                "category": datasets.Value("string"),
                "size": datasets.Value("int32"),
                "eid": datasets.Value("string"),
                "original_triple_sets": datasets.Sequence(
                    {"otriple_set": datasets.Sequence(datasets.Value("string"))}
                ),
                "modified_triple_sets": datasets.Sequence(
                    {"mtriple_set": datasets.Sequence(datasets.Value("string"))}
                ),
                "shape": datasets.Value("string"),
                "shape_type": datasets.Value("string"),
                "lex": datasets.Sequence(
                    {
                        "comment": datasets.Value("string"),
                        "lid": datasets.Value("string"),
                        "text": datasets.Value("string"),
                        "lang": datasets.Value("string"),
                    }
                ),
                "test_category": datasets.Value("string"),
                "dbpedia_links": datasets.Sequence(datasets.Value("string")),
                "links": datasets.Sequence(datasets.Value("string")),
            }
        )
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # This defines the different columns of the dataset and their types
            features=features,  # Here we define them above because they are different between the two configurations
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage="https://webnlg-challenge.loria.fr/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        data_dir = dl_manager.download_and_extract(_URL)
        return [
            datasets.SplitGenerator(
                name=spl,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filedirs": [
                        os.path.join(data_dir, "webnlg-dataset-master", dir_suf) for dir_suf in dir_suffix_list
                    ],
                },
            )
            for spl, dir_suffix_list in _FILE_PATHS[self.config.name].items()
        ]

    def _generate_examples(self, filedirs):
        """Yields examples."""
        id_ = 0
        for xml_location in filedirs:
            for xml_file in sorted(glob(pjoin(xml_location, "*.xml"))):
                # windows may use backslashes so we first need to replace them with slashes
                xml_file_path_with_slashes = "/".join(Path(xml_file).parts)
                if (
                    "webnlg_challenge_2017/test" in xml_file_path_with_slashes
                    or "release_v3.0/en/test" in xml_file_path_with_slashes
                    or "release_v3.0/ru/test" in xml_file_path_with_slashes
                ):
                    test_cat = xml_file_path_with_slashes.split("/")[-1][:-4]
                else:
                    test_cat = ""
                for exple_dict in xml_file_to_examples(xml_file):
                    exple_dict["test_category"] = test_cat
                    id_ += 1
                    yield id_, exple_dict
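# ---------------------------------------------------------------------------
# Usage sketch (illustration only): the builder above exposes one config per
# WebNLG release, so a config name has to be picked at load time. It assumes
# the script is registered under the name "web_nlg" and that the split names
# in _FILE_PATHS include a "dev" split for release_v3.0_en; both are
# assumptions.
if __name__ == "__main__":
    from datasets import load_dataset

    web_nlg = load_dataset("web_nlg", "release_v3.0_en", split="dev")
    entry = web_nlg[0]
    # "lex" is a sequence feature, so each sub-field comes back as a list.
    print(entry["category"], entry["lex"]["text"][:1])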
class Xsum(datasets.GeneratorBasedBuilder):
    """Extreme Summarization (XSum) Dataset."""

    # Version 1.2.0 expands coverage, includes ids, and removes web contents.
    VERSION = datasets.Version("1.2.0")

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    _DOCUMENT: datasets.Value("string"),
                    _SUMMARY: datasets.Value("string"),
                    _ID: datasets.Value("string"),
                }
            ),
            supervised_keys=(_DOCUMENT, _SUMMARY),
            homepage="https://github.com/EdinburghNLP/XSum/tree/master/XSum-Dataset",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        files_to_download = {"data": _URL_DATA, "splits": _URL_SPLITS}
        downloaded_files = dl_manager.download_and_extract(files_to_download)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "split_path": downloaded_files["splits"],
                    "split_name": "train",
                    "data_dir": os.path.join(downloaded_files["data"], "bbc-summary-data"),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "split_path": downloaded_files["splits"],
                    "split_name": "validation",
                    "data_dir": os.path.join(downloaded_files["data"], "bbc-summary-data"),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "split_path": downloaded_files["splits"],
                    "split_name": "test",
                    "data_dir": os.path.join(downloaded_files["data"], "bbc-summary-data"),
                },
            ),
        ]

    def _generate_examples(self, split_path, split_name, data_dir):
        """Yields examples."""
        with open(split_path, "r", encoding="utf-8") as f:
            split_ids = json.load(f)
        for i in split_ids[split_name]:
            with open(os.path.join(data_dir, i + ".summary"), "r", encoding="utf-8") as f:
                text = "".join(
                    [line for line in f.readlines() if line not in _REMOVE_LINES and line.strip()]
                )
                # Each file follows below format:
                # [SN]URL[SN]
                # http://somelink
                #
                # [SN]TITLE[SN]
                # some intro
                #
                # [SN]FIRST-SENTENCE[SN]
                # some intro
                #
                # [SN]RESTBODY[SN]
                # text line.
                # another text line.
                # "another text line."

                # According to the following issue, FIRST-SENTENCE
                # is the reference summary and TITLE is unused:
                # https://github.com/EdinburghNLP/XSum/issues/22
                segs = text.split("[SN]")
                yield i, {_DOCUMENT: segs[8].strip(), _SUMMARY: segs[6].strip(), _ID: i}
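# ---------------------------------------------------------------------------
# Worked example (illustration only) of the "[SN]" indexing used above: after
# the non-empty lines are joined, splitting on "[SN]" leaves the first
# sentence (the reference summary) at index 6 and the rest of the body at
# index 8. The sample text below is made up.
if __name__ == "__main__":
    _SAMPLE = (
        "[SN]URL[SN]\nhttp://example.invalid/story\n"
        "[SN]TITLE[SN]\nSome title\n"
        "[SN]FIRST-SENTENCE[SN]\nOne-sentence summary.\n"
        "[SN]RESTBODY[SN]\nFirst body line.\nSecond body line.\n"
    )
    segs = _SAMPLE.split("[SN]")
    assert segs[6].strip() == "One-sentence summary."
    assert segs[8].strip().startswith("First body line.")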
class S2orc(datasets.GeneratorBasedBuilder):
    """Semantic Scholar's records for research papers published in all fields"""

    VERSION = datasets.Version("1.1.0")

    def _info(self):
        features = datasets.Features(
            {
                "id": datasets.Value("string"),
                "title": datasets.Value("string"),
                "paperAbstract": datasets.Value("string"),
                "entities": datasets.Sequence(datasets.Value("string")),
                "s2Url": datasets.Value("string"),
                "pdfUrls": datasets.Sequence(datasets.Value("string")),
                "s2PdfUrl": datasets.Value("string"),
                "authors": [
                    {
                        "name": datasets.Value("string"),
                        "ids": datasets.Sequence(datasets.Value("string")),
                    },
                ],
                "inCitations": datasets.Sequence(datasets.Value("string")),
                "outCitations": datasets.Sequence(datasets.Value("string")),
                "fieldsOfStudy": datasets.Sequence(datasets.Value("string")),
                "year": datasets.Value("int32"),
                "venue": datasets.Value("string"),
                "journalName": datasets.Value("string"),
                "journalVolume": datasets.Value("string"),
                "journalPages": datasets.Value("string"),
                "sources": datasets.Sequence(datasets.Value("string")),
                "doi": datasets.Value("string"),
                "doiUrl": datasets.Value("string"),
                "pmid": datasets.Value("string"),
                "magId": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # This defines the different columns of the dataset and their types
            features=features,  # Here we define them above because they are different between the two configurations
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage=_HOMEPAGE,
            # License for the dataset if available
            license=_LICENSE,
            # Citation for the dataset
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        _MANIFEST_URL = _ROOT_URL + "manifest.txt"
        manifest_file = dl_manager.download_and_extract(_MANIFEST_URL)
        with open(manifest_file, "r", encoding="utf-8") as f:
            train_names = f.read().splitlines()
        # files are of the form 's2-corpus-*.gz'
        r = re.compile("(?s:s2\\-corpus\\-.*\\.gz)\\Z")
        train_names = list(filter(r.match, train_names))
        train_filepaths = dl_manager.download_and_extract([_ROOT_URL + x for x in train_names])
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepaths": train_filepaths,
                    "split": "train",
                },
            ),
        ]

    def _generate_examples(self, filepaths, split):
        """Yields examples."""
        # Use a single running counter so keys stay unique across corpus files.
        id_ = 0
        for train_files in filepaths:
            with open(train_files, encoding="utf-8") as f:
                for row in f:
                    data = json.loads(row)
                    if not isinstance(data["year"], int):
                        data["year"] = -1
                    yield id_, data
                    id_ += 1
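# ---------------------------------------------------------------------------
# Illustration only: the regex above keeps manifest entries of the form
# "s2-corpus-*.gz" and drops everything else. The sample names below are made
# up to show the filter's behaviour.
if __name__ == "__main__":
    import re

    _r = re.compile("(?s:s2\\-corpus\\-.*\\.gz)\\Z")
    names = ["s2-corpus-000.gz", "manifest.txt", "license.txt", "s2-corpus-123.gz"]
    print(list(filter(_r.match, names)))  # ['s2-corpus-000.gz', 's2-corpus-123.gz']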