Example #1
    "https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfM1BxdkxVaTY2bWs",
    "test_urls":
    "https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_test.txt",
    "train_urls":
    "https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_train.txt",
    "val_urls":
    "https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_val.txt",
    # pylint: enable=line-too-long
}

_HIGHLIGHTS = "highlights"
_ARTICLE = "article"

_SUPPORTED_VERSIONS = [
    # Using cased version.
    datasets.Version("3.0.0", "Using cased version."),
    # Same data as 0.0.2
    datasets.Version("1.0.0", ""),
    # Having the model predict newline separators makes it easier to evaluate
    # using summary-level ROUGE.
    datasets.Version("2.0.0", "Separate target sentences with newline."),
]

_DEFAULT_VERSION = datasets.Version("3.0.0", "Using cased version.")


class CnnDailymailConfig(datasets.BuilderConfig):
    """BuilderConfig for CnnDailymail."""
    def __init__(self, **kwargs):
        """BuilderConfig for CnnDailymail.
Example #2
class FewRel(datasets.GeneratorBasedBuilder):
    """The FewRelDataset."""

    VERSION = datasets.Version("1.0.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="default",
            version=VERSION,
            description="This covers the entire FewRel dataset."),
    ]

    def _info(self):
        features = datasets.Features({
            "relation":
            datasets.Value("string"),
            "tokens":
            datasets.Sequence(datasets.Value("string")),
            "head": {
                "text":
                datasets.Value("string"),
                "type":
                datasets.Value("string"),
                "indices":
                datasets.Sequence(datasets.Sequence(datasets.Value("int64"))),
            },
            "tail": {
                "text":
                datasets.Value("string"),
                "type":
                datasets.Value("string"),
                "indices":
                datasets.Sequence(datasets.Sequence(datasets.Value("int64"))),
            },
            "names":
            datasets.Sequence(datasets.Value("string"))
            # These are the features of your dataset like images, labels ...
        })

        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # This defines the different columns of the dataset and their types
            features=features,  # defined above in this method
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage=_HOMEPAGE,
            # License for the dataset if available
            license=_LICENSE,
            # Citation for the dataset
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        data_dir = dl_manager.download_and_extract(_URLs)
        return [
            datasets.SplitGenerator(
                name=datasets.Split(key),
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": data_dir[key],
                    "pid2name": data_dir["pid2name"],
                    "return_names": key
                    in ["train_wiki", "val_wiki", "val_nyt"],
                },
            ) for key in data_dir.keys() if key != "pid2name"
        ]

    def _generate_examples(self, filepath, pid2name, return_names):
        """ Yields examples. """
        pid2name_dict = {}
        with open(pid2name, encoding="utf-8") as f:
            data = json.load(f)
        for key in list(data.keys()):
            name_1 = data[key][0]
            name_2 = data[key][1]
            pid2name_dict[key] = [name_1, name_2]

        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, dict):
            id = 0
            for key in list(data.keys()):
                for items in data[key]:
                    tokens = items["tokens"]
                    h_0 = items["h"][0]
                    h_1 = items["h"][1]
                    h_2 = items["h"][2]
                    t_0 = items["t"][0]
                    t_1 = items["t"][1]
                    t_2 = items["t"][2]
                    id += 1
                    yield id, {
                        "relation": key,
                        "tokens": tokens,
                        "head": {
                            "text": h_0,
                            "type": h_1,
                            "indices": h_2
                        },
                        "tail": {
                            "text": t_0,
                            "type": t_1,
                            "indices": t_2
                        },
                        "names": pid2name_dict[key] if return_names else [
                            key,
                        ],
                    }
        else:  # For `pubmed_unsupervised.json`
            id = 0
            for items in data:
                tokens = items["tokens"]
                h_0 = items["h"][0]
                h_1 = items["h"][1]
                h_2 = items["h"][2]
                t_0 = items["t"][0]
                t_1 = items["t"][1]
                t_2 = items["t"][2]
                id += 1
                yield id, {
                    "relation": "",
                    "tokens": tokens,
                    "head": {
                        "text": h_0,
                        "type": h_1,
                        "indices": h_2
                    },
                    "tail": {
                        "text": t_0,
                        "type": t_1,
                        "indices": t_2
                    },
                    "names": [
                        "",
                    ],
                }
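For orientation, a minimal usage sketch for a builder like FewRel above; the dataset name "few_rel" and the split name "train_wiki" are assumptions inferred from the split keys handled in _split_generators, not guaranteed by this excerpt:

from datasets import load_dataset

# Hypothetical invocation; adjust the name or path to wherever this builder script lives.
ds = load_dataset("few_rel", "default", split="train_wiki")
example = ds[0]
print(example["relation"], example["names"], example["head"]["text"])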
Example #3
class Piaf(datasets.GeneratorBasedBuilder):
    """The Piaf Question Answering Dataset. Version 1.0."""

    BUILDER_CONFIGS = [
        PiafConfig(
            name="plain_text",
            version=datasets.Version("1.0.0", ""),
            description="Plain text",
        ),
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({
                "id":
                datasets.Value("string"),
                "title":
                datasets.Value("string"),
                "context":
                datasets.Value("string"),
                "question":
                datasets.Value("string"),
                "answers":
                datasets.features.Sequence({
                    "text":
                    datasets.Value("string"),
                    "answer_start":
                    datasets.Value("int32"),
                }),
            }),
            # No default supervised_keys (as we have to pass both question
            # and context as input).
            supervised_keys=None,
            homepage="https://piaf.etalab.studio",
            citation=_CITATION,
            task_templates=[
                QuestionAnsweringExtractive(question_column="question",
                                            context_column="context",
                                            answers_column="answers")
            ],
        )

    def _split_generators(self, dl_manager):
        urls_to_download = _URLS
        downloaded_files = dl_manager.download_and_extract(urls_to_download)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": downloaded_files["train"]}),
        ]

    def _generate_examples(self, filepath):
        """This function returns the examples in the raw (text) form."""
        logger.info("generating examples from = %s", filepath)
        with open(filepath, encoding="utf-8") as f:
            dataset = json.load(f)
            for article in dataset["data"]:
                title = article.get("title", "").strip()
                for paragraph in article["paragraphs"]:
                    context = paragraph["context"].strip()
                    for qa in paragraph["qas"]:
                        question = qa["question"].strip()
                        id_ = qa["id"]

                        answer_starts = [
                            answer["answer_start"] for answer in qa["answers"]
                        ]
                        answers = [
                            answer["text"].strip() for answer in qa["answers"]
                        ]

                        # Features currently used are "context", "question", and "answers".
                        # Others are extracted here for the ease of future expansions.
                        yield id_, {
                            "title": title,
                            "context": context,
                            "question": question,
                            "id": id_,
                            "answers": {
                                "answer_start": answer_starts,
                                "text": answers,
                            },
                        }
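As a reading aid: the Sequence over a dict declared for "answers" in _info is materialized as a dict of parallel lists, which is exactly the shape _generate_examples yields. One illustrative example (all values invented):

piaf_like_example = {
    "id": "p1",
    "title": "Andorre",
    "context": "L'Andorre est un petit pays situé dans les Pyrénées.",
    "question": "Où se situe l'Andorre ?",
    "answers": {
        "text": ["dans les Pyrénées"],
        "answer_start": [34],  # character offset of the answer span inside "context"
    },
}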
Example #4
class Glucose(datasets.GeneratorBasedBuilder):
    """GLUCOSE: GeneraLized and COntextualized Story Explanations, is a novel conceptual framework and dataset for commonsense reasoning."""

    VERSION = datasets.Version("1.1.0")
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="glucose", description="Main dataset"),
    ]

    def _info(self):
        feature_dict = {
            "experiment_id": datasets.Value("string"),
            "story_id": datasets.Value("string"),
            # The train set contains only one ID in numeric form
            "worker_id": datasets.Value("int64"),
            # The test set contains several IDs in string form
            "worker_ids": datasets.Value("string"),
            "submission_time_normalized": datasets.Value("string"),
            "worker_quality_assessment": datasets.Value("int64"),
            "selected_sentence_index": datasets.Value("int64"),
            "story": datasets.Value("string"),
            "selected_sentence": datasets.Value("string"),
            "number_filled_in": datasets.Value("int64"),
        }
        for i in range(1, 11):
            feature_dict[f"{i}_specificNL"] = datasets.Value("string")
            feature_dict[f"{i}_specificStructured"] = datasets.Value("string")
            feature_dict[f"{i}_generalNL"] = datasets.Value("string")
            feature_dict[f"{i}_generalStructured"] = datasets.Value("string")
        features = datasets.Features(feature_dict)
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        train_url = _URLs[self.config.name]["train"]
        test_url = _URLs[self.config.name]["test"]
        train_data = dl_manager.download_and_extract(train_url)
        test_data = dl_manager.download_and_extract(test_url)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath":
                    os.path.join(train_data,
                                 "GLUCOSE_training_data_final.csv"),
                    "split":
                    "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath": test_data,
                    "split": "test"
                },
            ),
        ]

    def _generate_examples(self, filepath, split):
        with open(filepath, encoding="utf8") as f:
            data = csv.reader(f)
            next(data)
            for id_, row in enumerate(data):
                if split == "train":
                    yield id_, train_dict_from_row(row)
                else:
                    yield id_, test_dict_from_row(row)
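The helpers train_dict_from_row and test_dict_from_row are referenced above but not shown in this excerpt. A hypothetical sketch of the shape such a helper could take, assuming one CSV column per feature in the same order as feature_dict; the column order is an assumption, not verified against the real file:

def train_dict_from_row_sketch(row):
    # Hypothetical mapping from a raw CSV row (a list of strings) to the
    # features declared in _info(); column positions are assumed.
    out = {
        "experiment_id": row[0],
        "story_id": row[1],
        "worker_id": int(row[2]),
        "worker_ids": "",  # only populated in the test set
        "submission_time_normalized": row[3],
        "worker_quality_assessment": int(row[4]),
        "selected_sentence_index": int(row[5]),
        "story": row[6],
        "selected_sentence": row[7],
        "number_filled_in": int(row[8]),
    }
    for i in range(1, 11):
        base = 9 + (i - 1) * 4
        out[f"{i}_specificNL"] = row[base]
        out[f"{i}_specificStructured"] = row[base + 1]
        out[f"{i}_generalNL"] = row[base + 2]
        out[f"{i}_generalStructured"] = row[base + 3]
    return out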
Example #5
class Scicite(datasets.GeneratorBasedBuilder):
    """This is a dataset for classifying citation intents in academic papers."""

    VERSION = datasets.Version("1.0.0")

    def _info(self):
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # datasets.features.FeatureConnectors
            features=datasets.Features({
                "string":
                datasets.Value("string"),
                "sectionName":
                datasets.Value("string"),
                "label":
                datasets.features.ClassLabel(
                    names=["method", "background", "result"]),
                "citingPaperId":
                datasets.Value("string"),
                "citedPaperId":
                datasets.Value("string"),
                "excerpt_index":
                datasets.Value("int32"),
                "isKeyCitation":
                datasets.Value("bool"),
                "label2":
                datasets.features.ClassLabel(names=[
                    "supportive", "not_supportive", "cant_determine", "none"
                ]),
                "citeEnd":
                datasets.Value("int64"),
                "citeStart":
                datasets.Value("int64"),
                "source":
                datasets.features.ClassLabel(names=_SOURCE_NAMES),
                "label_confidence":
                datasets.Value("float32"),
                "label2_confidence":
                datasets.Value("float32"),
                "id":
                datasets.Value("string"),
            }),
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage="https://github.com/allenai/scicite",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        archive = dl_manager.download(
            "https://s3-us-west-2.amazonaws.com/ai2-s2-research/scicite/scicite.tar.gz"
        )
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": "/".join(["scicite", "train.jsonl"]),
                    "files": dl_manager.iter_archive(archive),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filepath": "/".join(["scicite", "dev.jsonl"]),
                    "files": dl_manager.iter_archive(archive)
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath": "/".join(["scicite", "test.jsonl"]),
                    "files": dl_manager.iter_archive(archive),
                },
            ),
        ]

    def _generate_examples(self, filepath, files):
        """Yields examples."""
        for path, f in files:
            if path == filepath:
                unique_ids = {}
                for line in f:
                    d = json.loads(line.decode("utf-8"))
                    unique_id = str(d["unique_id"])
                    if unique_id in unique_ids:
                        continue
                    unique_ids[unique_id] = True
                    yield unique_id, {
                        "string": d["string"],
                        "label": str(d["label"]),
                        "sectionName": str(d["sectionName"]),
                        "citingPaperId": str(d["citingPaperId"]),
                        "citedPaperId": str(d["citedPaperId"]),
                        "excerpt_index": int(d["excerpt_index"]),
                        "isKeyCitation": bool(d["isKeyCitation"]),
                        "label2": str(d.get("label2", "none")),
                        "citeEnd": _safe_int(d["citeEnd"]),
                        "citeStart": _safe_int(d["citeStart"]),
                        "source": str(d["source"]),
                        "label_confidence":
                        float(d.get("label_confidence", 0.0)),
                        "label2_confidence":
                        float(d.get("label2_confidence", 0.0)),
                        "id": str(d["id"]),
                    }
                break
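_safe_int is used for "citeEnd" and "citeStart" above but is not defined in this excerpt. A plausible reconstruction (an assumption, not the original helper): fall back to -1 when the value is missing or not numeric:

def _safe_int(value):
    # Hypothetical reconstruction; the real helper is not shown in this excerpt.
    try:
        return int(value)
    except (TypeError, ValueError):
        return -1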
Example #6
class ArabicPosDialect(datasets.GeneratorBasedBuilder):
    """POS-tagged Arabic tweets in four major dialects."""

    VERSION = datasets.Version("1.1.0")
    BUILDER_CONFIG_CLASS = ArabicPosDialectConfig
    BUILDER_CONFIGS = [
        ArabicPosDialectConfig(
            name=dialect,
            dialect=dialect,
            description=("A set of 350 tweets in the {} dialect of Arabic that "
                         "have been manually segmented and POS tagged.").format(dialect),
        ) for dialect in _DIALECTS
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({
                "fold":
                datasets.Value("int32"),
                "subfold":
                datasets.Value("string"),
                "words":
                datasets.Sequence(datasets.Value("string")),
                "segments":
                datasets.Sequence(datasets.Value("string")),
                "pos_tags":
                datasets.Sequence(datasets.Value("string")),
            }),
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            homepage="https://alt.qcri.org/resources/da_resources/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # TODO: Downloads the data and defines the splits
        # dl_manager is a datasets.download.DownloadManager that can be used to
        # download and extract URLs
        urls_to_download = {
            dialect: _URL + "seg_plus_pos_{}.txt".format(dialect)
            for dialect in _DIALECTS
        }
        dl_dir = dl_manager.download_and_extract(urls_to_download)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepath": dl_dir[self.config.dialect]},
            )
        ]

    def _generate_examples(self, filepath):
        """ Yields examples in the raw (text) form. """
        with open(filepath, encoding="utf-8") as csv_file:
            reader = csv.DictReader(csv_file,
                                    delimiter="\t",
                                    quoting=csv.QUOTE_NONE)
            fold = -1
            subfold = ""
            words = []
            segments = []
            pos_tags = []
            curr_sent = -1
            for idx, row in enumerate(reader):
                # first example
                if fold == -1:
                    fold = row["Fold"]
                    subfold = row["SubFold"]
                    curr_sent = int(row["SentID"])
                if int(row["SentID"]) != curr_sent:
                    yield curr_sent, {
                        "fold": fold,
                        "subfold": subfold,
                        "words": words,
                        "segments": segments,
                        "pos_tags": pos_tags,
                    }
                    fold = row["Fold"]
                    subfold = row["SubFold"]
                    words = [row["Word"]]
                    segments = [row["Segmentation"]]
                    pos_tags = [row["POS"]]
                    curr_sent = int(row["SentID"])
                else:
                    words.append(row["Word"])
                    segments.append(row["Segmentation"])
                    pos_tags.append(row["POS"])
            # last example
            yield curr_sent, {
                "fold": fold,
                "subfold": subfold,
                "words": words,
                "segments": segments,
                "pos_tags": pos_tags,
            }
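The reader above expects a tab-separated file with a header row providing at least the columns Fold, SubFold, SentID, Word, Segmentation and POS, one token per line, where consecutive rows sharing a SentID form one tweet. A tiny self-contained parsing sketch on invented content:

import csv
import io

# Invented two-token fragment in the assumed layout (tab-separated).
sample = ("Fold\tSubFold\tSentID\tWord\tSegmentation\tPOS\n"
          "1\tA\t0\tكلمة\tكلمة\tNOUN\n"
          "1\tA\t0\tثانية\tثاني+ة\tADJ\n")
reader = csv.DictReader(io.StringIO(sample), delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
    print(row["SentID"], row["Word"], row["Segmentation"], row["POS"])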
Example #7
class Reclor(datasets.GeneratorBasedBuilder):
    """TODO(reclor): Short description of my dataset."""

    # TODO(reclor): Set up version.
    VERSION = datasets.Version("0.1.0")

    @property
    def manual_download_instructions(self):
        return """\
  to use ReClor you need to download it manually. Please go to its homepage (http://whyu.me/reclor/) fill the google
  form and you will receive a download link and a password to extract it.Please extract all files in one folder and use the path folder in datasets.load_dataset('reclor', data_dir='path/to/folder/folder_name')
  """

    def _info(self):
        # TODO(reclor): Specifies the datasets.DatasetInfo object
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # datasets.features.FeatureConnectors
            features=datasets.Features(
                {
                    # These are the features of your dataset like images, labels ...
                    "context": datasets.Value("string"),
                    "question": datasets.Value("string"),
                    "answers": datasets.features.Sequence(datasets.Value("string")),
                    "label": datasets.Value("string"),
                    "id_string": datasets.Value("string"),
                }
            ),
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage="http://whyu.me/reclor/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # TODO(reclor): Downloads the data and defines the splits
        # dl_manager is a datasets.download.DownloadManager that can be used to
        # download and extract URLs
        data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))

        if not os.path.exists(data_dir):
            raise FileNotFoundError(
                "{} does not exist. Make sure you insert a manual dir via `datasets.load_dataset('wikihow', data_dir=...)` that includes files unzipped from the reclor zip. Manual download instructions: {}".format(
                    data_dir, self.manual_download_instructions
                )
            )
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepath": os.path.join(data_dir, "train.json")},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepath": os.path.join(data_dir, "test.json")},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepath": os.path.join(data_dir, "val.json")},
            ),
        ]

    def _generate_examples(self, filepath):
        """Yields examples."""
        # TODO(reclor): Yields (key, example) tuples from the dataset
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)
            for id_, row in enumerate(data):
                yield id_, {
                    "context": row["context"],
                    "question": row["question"],
                    "answers": row["answers"],
                    "label": str(row.get("label", "")),
                    "id_string": row["id_string"],
                }
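Because ReClor requires a manual download, loading it looks roughly like this; the local folder path is a placeholder:

from datasets import load_dataset

# Hypothetical path to the manually downloaded and extracted ReClor files.
reclor = load_dataset("reclor", data_dir="/path/to/reclor_data")
print(reclor["train"][0]["question"])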
Example #8
class SquadV1Pt(datasets.GeneratorBasedBuilder):
    """TODO(squad_v1_pt): Short description of my dataset."""

    # TODO(squad_v1_pt): Set up version.
    VERSION = datasets.Version("1.1.0")

    def _info(self):
        # TODO(squad_v1_pt): Specifies the datasets.DatasetInfo object
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # datasets.features.FeatureConnectors
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "title": datasets.Value("string"),
                    "context": datasets.Value("string"),
                    "question": datasets.Value("string"),
                    "answers": datasets.features.Sequence(
                        {
                            "text": datasets.Value("string"),
                            "answer_start": datasets.Value("int32"),
                        }
                    ),
                    # These are the features of your dataset like images, labels ...
                }
            ),
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage="https://github.com/nunorc/squad-v1.1-pt",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # TODO(squad_v1_pt): Downloads the data and defines the splits
        # dl_manager is a datasets.download.DownloadManager that can be used to
        # download and extract URLs
        urls_to_download = _URLS
        downloaded_files = dl_manager.download_and_extract(urls_to_download)

        return [
            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["train"]}),
            datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": downloaded_files["dev"]}),
        ]

    def _generate_examples(self, filepath):
        """Yields examples."""
        # TODO(squad_v1_pt): Yields (key, example) tuples from the dataset
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)
            for example in data["data"]:
                title = example.get("title", "").strip()
                for paragraph in example["paragraphs"]:
                    context = paragraph["context"].strip()
                    for qa in paragraph["qas"]:
                        question = qa["question"].strip()
                        id_ = qa["id"]

                        answer_starts = [answer["answer_start"] for answer in qa["answers"]]
                        answers = [answer["text"].strip() for answer in qa["answers"]]

                        yield id_, {
                            "title": title,
                            "context": context,
                            "question": question,
                            "id": id_,
                            "answers": {
                                "answer_start": answer_starts,
                                "text": answers,
                            },
                        }
Example #9
class Spider(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("1.0.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="spider",
            version=VERSION,
            description="Spider: A Large-Scale Human-Labeled Dataset for Text-to-SQL Tasks",
        ),
    ]

    def _info(self):
        features = datasets.Features({
            "db_id":
            datasets.Value("string"),
            "query":
            datasets.Value("string"),
            "question":
            datasets.Value("string"),
            "query_toks":
            datasets.features.Sequence(datasets.Value("string")),
            "query_toks_no_value":
            datasets.features.Sequence(datasets.Value("string")),
            "question_toks":
            datasets.features.Sequence(datasets.Value("string")),
        })
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        downloaded_filepath = dl_manager.download_and_extract(_URL)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "data_filepath":
                    os.path.join(downloaded_filepath,
                                 "spider/train_spider.json"),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "data_filepath":
                    os.path.join(downloaded_filepath, "spider/dev.json"),
                },
            ),
        ]

    def _generate_examples(self, data_filepath):
        """This function returns the examples in the raw (text) form."""
        logger.info("generating examples from = %s", data_filepath)
        with open(data_filepath, encoding="utf-8") as f:
            spider = json.load(f)
            for idx, sample in enumerate(spider):
                yield idx, {
                    "db_id": sample["db_id"],
                    "query": sample["query"],
                    "question": sample["question"],
                    "query_toks": sample["query_toks"],
                    "query_toks_no_value": sample["query_toks_no_value"],
                    "question_toks": sample["question_toks"],
                }
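For reference, the shape of one yielded Spider example; the concrete values below are illustrative only:

spider_like_example = {
    "db_id": "concert_singer",
    "query": "SELECT count(*) FROM singer",
    "question": "How many singers do we have?",
    "query_toks": ["SELECT", "count", "(", "*", ")", "FROM", "singer"],
    "query_toks_no_value": ["select", "count", "(", "*", ")", "from", "singer"],
    "question_toks": ["How", "many", "singers", "do", "we", "have", "?"],
}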
Example #10
class DailyDialog(datasets.GeneratorBasedBuilder):
    """DailyDialog: A Manually Labelled Multi-turn Dialogue Dataset"""

    VERSION = datasets.Version("1.0.0")

    __EOU__ = "__eou__"

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "dialog": datasets.features.Sequence(datasets.Value("string")),
                    "act": datasets.features.Sequence(datasets.ClassLabel(names=list(act_label.values()))),
                    "emotion": datasets.features.Sequence(datasets.ClassLabel(names=list(emotion_label.values()))),
                }
            ),
            supervised_keys=None,
            homepage="http://yanran.li/dailydialog",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager):
        """Returns SplitGenerators."""
        # dl_manager is a datasets.download.DownloadManager that can be used to
        # download and extract URLs
        dl_dir = dl_manager.download_and_extract(_URL)
        data_dir = os.path.join(dl_dir, "ijcnlp_dailydialog")

        # The splits are nested inside the zip
        for name in ("train", "validation", "test"):
            zip_fpath = os.path.join(data_dir, f"{name}.zip")
            with ZipFile(zip_fpath) as zip_file:
                zip_file.extractall(path=data_dir)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "file_path": os.path.join(data_dir, "train", "dialogues_train.txt"),
                    "act_path": os.path.join(data_dir, "train", "dialogues_act_train.txt"),
                    "emotion_path": os.path.join(data_dir, "train", "dialogues_emotion_train.txt"),
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "file_path": os.path.join(data_dir, "test", "dialogues_test.txt"),
                    "act_path": os.path.join(data_dir, "test", "dialogues_act_test.txt"),
                    "emotion_path": os.path.join(data_dir, "test", "dialogues_emotion_test.txt"),
                    "split": "test",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "file_path": os.path.join(data_dir, "validation", "dialogues_validation.txt"),
                    "act_path": os.path.join(data_dir, "validation", "dialogues_act_validation.txt"),
                    "emotion_path": os.path.join(data_dir, "validation", "dialogues_emotion_validation.txt"),
                    "split": "dev",
                },
            ),
        ]

    def _generate_examples(self, file_path, act_path, emotion_path, split):
        """ Yields examples. """
        # Yields (key, example) tuples from the dataset
        with open(file_path, "r", encoding="utf-8") as f, open(act_path, "r", encoding="utf-8") as act, open(
            emotion_path, "r", encoding="utf-8"
        ) as emotion:
            for i, (line_f, line_act, line_emotion) in enumerate(zip(f, act, emotion)):
                if len(line_f.strip()) == 0:
                    break
                dialog = line_f.split(self.__EOU__)[:-1]
                acts = line_act.split(" ")[:-1]
                emotions = line_emotion.split(" ")[:-1]

                assert len(dialog) == len(acts) == len(emotions), "Different number of turns between dialog, act and emotion files"

                yield f"{split}-{i}", {
                    "dialog": dialog,
                    "act": [act_label[x] for x in acts],
                    "emotion": [emotion_label[x] for x in emotions],
                }
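The three files consumed above are line-aligned: each dialogue line uses __eou__ as an utterance separator (with a trailing separator), and the act and emotion files carry one space-separated label per utterance (with a trailing space). A small self-contained parsing sketch on invented content:

EOU = "__eou__"

# Invented, line-aligned content in the assumed layout.
dialog_line = "Hi , how are you ? __eou__ Fine , thanks . __eou__ \n"
act_line = "2 1 \n"
emotion_line = "0 4 \n"

dialog = dialog_line.split(EOU)[:-1]
acts = act_line.split(" ")[:-1]
emotions = emotion_line.split(" ")[:-1]
assert len(dialog) == len(acts) == len(emotions) == 2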
Example #11
class WebNlg(datasets.GeneratorBasedBuilder):
    """The WebNLG corpus"""

    VERSION = datasets.Version("3.0.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="webnlg_challenge_2017", description="WebNLG Challenge 2017 data, covers 10 DBpedia categories."
        ),
        datasets.BuilderConfig(name="release_v1", description="Covers 15 DBpedia categories."),
        datasets.BuilderConfig(
            name="release_v2", description="Includes release_v1 and test data from the WebNLG challenge."
        ),
        datasets.BuilderConfig(
            name="release_v2_constrained",
            description="Same data as v2, the split into train/dev/test is more challenging.",
        ),
        datasets.BuilderConfig(name="release_v2.1", description="5,667 texts from v2 were cleaned."),
        datasets.BuilderConfig(
            name="release_v2.1_constrained",
            description="Same data as v2.1, the split into train/dev/test is more challenging.",
        ),
        datasets.BuilderConfig(
            name="release_v3.0_en", description="WebNLG+ data used in the WebNLG challenge 2020. English."
        ),
        datasets.BuilderConfig(
            name="release_v3.0_ru", description="WebNLG+ data used in the WebNLG challenge 2020. Russian."
        ),
    ]

    def _info(self):
        features = datasets.Features(
            {
                "category": datasets.Value("string"),
                "size": datasets.Value("int32"),
                "eid": datasets.Value("string"),
                "original_triple_sets": datasets.Sequence(
                    {"otriple_set": datasets.Sequence(datasets.Value("string"))}
                ),
                "modified_triple_sets": datasets.Sequence(
                    {"mtriple_set": datasets.Sequence(datasets.Value("string"))}
                ),
                "shape": datasets.Value("string"),
                "shape_type": datasets.Value("string"),
                "lex": datasets.Sequence(
                    {
                        "comment": datasets.Value("string"),
                        "lid": datasets.Value("string"),
                        "text": datasets.Value("string"),
                        "lang": datasets.Value("string"),
                    }
                ),
                "test_category": datasets.Value("string"),
                "dbpedia_links": datasets.Sequence(datasets.Value("string")),
                "links": datasets.Sequence(datasets.Value("string")),
            }
        )
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # This defines the different columns of the dataset and their types
            features=features,  # defined above; shared by all configurations
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage="https://webnlg-challenge.loria.fr/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        data_dir = dl_manager.download_and_extract(_URL)
        return [
            datasets.SplitGenerator(
                name=spl,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filedirs": [
                        os.path.join(data_dir, "webnlg-dataset-master", dir_suf) for dir_suf in dir_suffix_list
                    ],
                },
            )
            for spl, dir_suffix_list in _FILE_PATHS[self.config.name].items()
        ]

    def _generate_examples(self, filedirs):
        """ Yields examples. """

        id_ = 0
        for xml_location in filedirs:
            for xml_file in sorted(glob(pjoin(xml_location, "*.xml"))):
                # windows may use backslashes so we first need to replace them with slashes
                xml_file_path_with_slashes = "/".join(Path(xml_file).parts)
                if (
                    "webnlg_challenge_2017/test" in xml_file_path_with_slashes
                    or "release_v3.0/en/test" in xml_file_path_with_slashes
                    or "release_v3.0/ru/test" in xml_file_path_with_slashes
                ):
                    test_cat = xml_file_path_with_slashes.split("/")[-1][:-4]
                else:
                    test_cat = ""
                for exple_dict in xml_file_to_examples(xml_file):
                    exple_dict["test_category"] = test_cat
                    id_ += 1
                    yield id_, exple_dict
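The helper xml_file_to_examples is referenced above but not included in this excerpt. A simplified, hypothetical reader for WebNLG-style XML that recovers only a subset of the fields used above; the tag and attribute names are assumptions based on the public WebNLG releases:

import xml.etree.ElementTree as ET

def xml_file_to_examples_sketch(xml_path):
    # Hypothetical and simplified; the real helper likely recovers more fields.
    root = ET.parse(xml_path).getroot()
    for entry in root.iter("entry"):
        yield {
            "category": entry.attrib.get("category", ""),
            "size": int(entry.attrib.get("size", "0")),
            "eid": entry.attrib.get("eid", ""),
            "modified_triple_sets": [
                {"mtriple_set": [m.text for m in mts.iter("mtriple")]}
                for mts in entry.iter("modifiedtripleset")
            ],
            "lex": [
                {
                    "comment": lex.attrib.get("comment", ""),
                    "lid": lex.attrib.get("lid", ""),
                    "text": lex.text or "",
                    "lang": lex.attrib.get("lang", ""),
                }
                for lex in entry.iter("lex")
            ],
        }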
Example #12
class Xsum(datasets.GeneratorBasedBuilder):
    """Extreme Summarization (XSum) Dataset."""

    # Version 1.2.0 expands coverage, includes ids, and removes web contents.
    VERSION = datasets.Version("1.2.0")

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({
                _DOCUMENT: datasets.Value("string"),
                _SUMMARY: datasets.Value("string"),
                _ID: datasets.Value("string"),
            }),
            supervised_keys=(_DOCUMENT, _SUMMARY),
            homepage="https://github.com/EdinburghNLP/XSum/tree/master/XSum-Dataset",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""

        files_to_download = {"data": _URL_DATA, "splits": _URL_SPLITS}
        downloaded_files = dl_manager.download_and_extract(files_to_download)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "split_path":
                    downloaded_files["splits"],
                    "split_name":
                    "train",
                    "data_dir":
                    os.path.join(downloaded_files["data"], "bbc-summary-data"),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "split_path":
                    downloaded_files["splits"],
                    "split_name":
                    "validation",
                    "data_dir":
                    os.path.join(downloaded_files["data"], "bbc-summary-data"),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "split_path":
                    downloaded_files["splits"],
                    "split_name":
                    "test",
                    "data_dir":
                    os.path.join(downloaded_files["data"], "bbc-summary-data"),
                },
            ),
        ]

    def _generate_examples(self, split_path, split_name, data_dir):
        """Yields examples."""

        with open(split_path, "r", encoding="utf-8") as f:
            split_ids = json.load(f)

        for i in split_ids[split_name]:
            with open(os.path.join(data_dir, i + ".summary"),
                      "r",
                      encoding="utf-8") as f:
                text = "".join([
                    line for line in f.readlines()
                    if line not in _REMOVE_LINES and line.strip()
                ])
                # Each file follows below format:
                # [SN]URL[SN]
                # http://somelink
                #
                # [SN]TITLE[SN]
                # some intro
                #
                # [SN]FIRST-SENTENCE[SN]
                # some intro
                #
                # [SN]RESTBODY[SN]
                # text line.
                # another text line.
                # "another text line."

                # According to the following issue, FIRST-SENTENCE
                # is the reference summary and TITLE is unused:
                # https://github.com/EdinburghNLP/XSum/issues/22
                segs = text.split("[SN]")
                yield i, {
                    _DOCUMENT: segs[8].strip(),
                    _SUMMARY: segs[6].strip(),
                    _ID: i
                }
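Why segs[6] and segs[8]: splitting on "[SN]" leaves the section names at odd indices and the section bodies at the following even indices, so the FIRST-SENTENCE body (the summary) lands at index 6 and the RESTBODY body (the document) at index 8. A tiny check on an invented file:

# Invented miniature .summary file in the layout sketched in the comments above.
text = ("[SN]URL[SN]\nhttp://example.invalid\n\n"
        "[SN]TITLE[SN]\nSome title\n\n"
        "[SN]FIRST-SENTENCE[SN]\nA one-sentence summary.\n\n"
        "[SN]RESTBODY[SN]\nFirst body line.\nSecond body line.\n")
segs = text.split("[SN]")
assert segs[5] == "FIRST-SENTENCE" and segs[7] == "RESTBODY"
print(segs[6].strip())  # the summary
print(segs[8].strip())  # the document body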
Example #13
class S2orc(datasets.GeneratorBasedBuilder):
    """Semantic Scholar's records for research papers published in all fields"""

    VERSION = datasets.Version("1.1.0")

    def _info(self):
        features = datasets.Features({
            "id":
            datasets.Value("string"),
            "title":
            datasets.Value("string"),
            "paperAbstract":
            datasets.Value("string"),
            "entities":
            datasets.Sequence(datasets.Value("string")),
            "s2Url":
            datasets.Value("string"),
            "pdfUrls":
            datasets.Sequence(datasets.Value("string")),
            "s2PdfUrl":
            datasets.Value("string"),
            "authors": [
                {
                    "name": datasets.Value("string"),
                    "ids": datasets.Sequence(datasets.Value("string")),
                },
            ],
            "inCitations":
            datasets.Sequence(datasets.Value("string")),
            "outCitations":
            datasets.Sequence(datasets.Value("string")),
            "fieldsOfStudy":
            datasets.Sequence(datasets.Value("string")),
            "year":
            datasets.Value("int32"),
            "venue":
            datasets.Value("string"),
            "journalName":
            datasets.Value("string"),
            "journalVolume":
            datasets.Value("string"),
            "journalPages":
            datasets.Value("string"),
            "sources":
            datasets.Sequence(datasets.Value("string")),
            "doi":
            datasets.Value("string"),
            "doiUrl":
            datasets.Value("string"),
            "pmid":
            datasets.Value("string"),
            "magId":
            datasets.Value("string"),
        })
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # This defines the different columns of the dataset and their types
            features=features,  # defined above in this method
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage=_HOMEPAGE,
            # License for the dataset if available
            license=_LICENSE,
            # Citation for the dataset
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        _MANIFEST_URL = _ROOT_URL + "manifest.txt"
        manifest_file = dl_manager.download_and_extract(_MANIFEST_URL)

        with open(manifest_file, "r", encoding="utf-8") as f:
            train_names = f.read().splitlines()

        # Keep only files of the form 's2-corpus-*.gz'.
        r = re.compile(r"(?s:s2\-corpus\-.*\.gz)\Z")
        train_names = list(filter(r.match, train_names))

        train_filepaths = dl_manager.download_and_extract(
            [_ROOT_URL + x for x in train_names])

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepaths": train_filepaths,
                    "split": "train",
                },
            ),
        ]

    def _generate_examples(self, filepaths, split):
        """ Yields examples. """
        for train_files in filepaths:
            with open(train_files, encoding="utf-8") as f:
                for id_, row in enumerate(f):
                    data = json.loads(row)
                    if type(data["year"]) != int:
                        data["year"] = -1
                    yield id_, data
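A small check of the manifest filter used in _split_generators, run on invented file names; the regex keeps only names of the form s2-corpus-*.gz:

import re

r = re.compile(r"(?s:s2\-corpus\-.*\.gz)\Z")
names = ["manifest.txt", "s2-corpus-000.gz", "s2-corpus-001.gz", "license.txt"]
print(list(filter(r.match, names)))  # ['s2-corpus-000.gz', 's2-corpus-001.gz']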