Example #1
class Esnli(nlp.GeneratorBasedBuilder):
    """e-SNLI: Natural Language Inference with Natural Language Explanations corpus."""

    # Version History
    # 0.0.2 Added explanation_2, explanation_3 fields which exist in the dev/test
    # splits only.
    # 0.0.1 Initial version
    BUILDER_CONFIGS = [
        nlp.BuilderConfig(name="plain_text", version=nlp.Version("0.0.2"), description="Plain text import of e-SNLI",)
    ]

    def _info(self):
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(
                {
                    "premise": nlp.Value("string"),
                    "hypothesis": nlp.Value("string"),
                    "label": nlp.features.ClassLabel(names=["entailment", "neutral", "contradiction"]),
                    "explanation_1": nlp.Value("string"),
                    "explanation_2": nlp.Value("string"),
                    "explanation_3": nlp.Value("string"),
                }
            ),
            supervised_keys=None,
            homepage="https://github.com/OanaMariaCamburu/e-SNLI",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""

        files = dl_manager.download_and_extract(
            {
                "train": [os.path.join(_URL, "esnli_train_1.csv"), os.path.join(_URL, "esnli_train_2.csv")],
                "validation": [os.path.join(_URL, "esnli_dev.csv")],
                "test": [os.path.join(_URL, "esnli_test.csv")],
            }
        )

        return [
            nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"files": files["train"]},),
            nlp.SplitGenerator(name=nlp.Split.VALIDATION, gen_kwargs={"files": files["validation"]},),
            nlp.SplitGenerator(name=nlp.Split.TEST, gen_kwargs={"files": files["test"]},),
        ]

    def _generate_examples(self, files):
        """Yields examples."""
        for filepath in files:
            with open(filepath, encoding="utf-8") as f:
                reader = csv.DictReader(f)
                for row in reader:
                    yield row["pairID"], {
                        "premise": row["Sentence1"],
                        "hypothesis": row["Sentence2"],
                        "label": row["gold_label"],
                        "explanation_1": row["Explanation_1"],
                        "explanation_2": row.get("Explanation_2", ""),
                        "explanation_3": row.get("Explanation_3", ""),
                    }
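Usage note (not part of the original script): a minimal sketch of how a GeneratorBasedBuilder like this is typically consumed through `nlp.load_dataset`. The dataset name "esnli" and the config name "plain_text" are assumptions based on the builder above.

import nlp

# Hedged usage sketch: load the e-SNLI builder defined above and inspect one
# training example. The registered name "esnli" is an assumption.
dataset = nlp.load_dataset("esnli", "plain_text")
print(dataset["train"][0]["premise"])
print(dataset["train"][0]["label"])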
Example #2
class Newsroom(nlp.GeneratorBasedBuilder):
    """NEWSROOM Dataset."""

    VERSION = nlp.Version("1.0.0")

    @property
    def manual_download_instructions(self):
        return """\
  You should download the dataset from http://lil.nlp.cornell.edu/newsroom/
  The webpage requires registration.
  To extract the .tar file run `tar -zxvf complete.tar`. To decompress the
  .gz files run `gunzip train.json.gz`, etc.
  After extracting, rename the files to train.jsonl, dev.jsonl and
  test.jsonl and place them in a directory of your choice, which will be
  used as the manual_dir, e.g. `~/.manual_dirs/newsroom`.
  Newsroom can then be loaded via:
  `nlp.load_dataset("newsroom", data_dir="~/.manual_dirs/newsroom")`.
  """

    def _info(self):
        features = {k: nlp.Value("string") for k in [_DOCUMENT, _SUMMARY] + _ADDITIONAL_TEXT_FEATURES}
        features.update({k: nlp.Value("float32") for k in _ADDITIONAL_FLOAT_FEATURES})
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(features),
            supervised_keys=(_DOCUMENT, _SUMMARY),
            homepage="http://lil.nlp.cornell.edu/newsroom/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))
        if not os.path.exists(data_dir):
            raise FileNotFoundError(
                "{} does not exist. Make sure you insert a manual dir via `nlp.load_dataset('newsroom', data_dir=...)` that includes files unzipped from the reclor zip. Manual download instructions: {}".format(
                    data_dir, self.manual_download_instructions
                )
            )
        return [
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN, gen_kwargs={"input_file": os.path.join(data_dir, "train.jsonl")},
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION, gen_kwargs={"input_file": os.path.join(data_dir, "dev.jsonl")},
            ),
            nlp.SplitGenerator(name=nlp.Split.TEST, gen_kwargs={"input_file": os.path.join(data_dir, "test.jsonl")},),
        ]

    def _generate_examples(self, input_file=None):
        """Yields examples."""
        with open(input_file) as f:
            for i, line in enumerate(f):
                d = json.loads(line)
                # fields are "url", "archive", "title", "date", "text",
                #  "compression_bin", "density_bin", "summary", "density",
                #  "compression', "coverage", "coverage_bin",
                yield i, {
                    k: d[k] for k in [_DOCUMENT, _SUMMARY] + _ADDITIONAL_TEXT_FEATURES + _ADDITIONAL_FLOAT_FEATURES
                }
Example #3
    def __init__(self,
                 features,
                 data_url,
                 citation,
                 url,
                 label_classes=("False", "True"),
                 **kwargs):
        """BuilderConfig for SuperGLUE.

    Args:
      features: `list[string]`, list of the features that will appear in the
        feature dict. Should not include "label".
      data_url: `string`, url to download the zip file from.
      citation: `string`, citation for the data set.
      url: `string`, url for information about the data set.
      label_classes: `list[string]`, the list of classes for the label if the
        label is present as a string. Non-string labels will be cast to either
        'False' or 'True'.
      **kwargs: keyword arguments forwarded to super.
    """
        # Version history:
        # 1.0.2: Fixed non-determinism in ReCoRD.
        # 1.0.1: Change from the pre-release trial version of SuperGLUE (v1.9) to
        #        the full release (v2.0).
        # 1.0.0: S3 (new shuffling, sharding and slicing mechanism).
        # 0.0.2: Initial version.
        super(SuperGlueConfig, self).__init__(version=nlp.Version("1.0.2"),
                                              **kwargs)
        self.features = features
        self.label_classes = label_classes
        self.data_url = data_url
        self.citation = citation
        self.url = url
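Usage note (not part of the original script): a minimal sketch of how the constructor above would be used to declare a config for BUILDER_CONFIGS. The task name, feature names, URLs and description below are illustrative placeholders, not values taken from the actual SuperGLUE script.

# Hypothetical config instance; all literal values here are placeholders.
boolq_config = SuperGlueConfig(
    name="boolq",
    description="BoolQ: yes/no questions paired with Wikipedia passages.",
    features=["question", "passage"],
    data_url="https://example.com/BoolQ.zip",
    citation="...",
    url="https://github.com/google-research-datasets/boolean-questions",
)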
Example #4
class QaZre(nlp.GeneratorBasedBuilder):
    """QA-ZRE: Reducing relation extraction to simple reading comprehension questions"""

    VERSION = nlp.Version("0.1.0")

    def _info(self):
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(
                {
                    "relation": nlp.Value("string"),
                    "question": nlp.Value("string"),
                    "subject": nlp.Value("string"),
                    "context": nlp.Value("string"),
                    "answers": nlp.features.Sequence(nlp.Value("string")),
                }
            ),
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage="http://nlp.cs.washington.edu/zeroshot",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        dl_dir = dl_manager.download_and_extract(_DATA_URL)
        dl_dir = os.path.join(dl_dir, "relation_splits")

        return [
            nlp.SplitGenerator(
                name=nlp.Split.TEST,
                gen_kwargs={"filepaths": [os.path.join(dl_dir, "test." + str(i)) for i in range(10)],},
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION,
                gen_kwargs={"filepaths": [os.path.join(dl_dir, "dev." + str(i)) for i in range(10)],},
            ),
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN,
                gen_kwargs={"filepaths": [os.path.join(dl_dir, "train." + str(i)) for i in range(10)],},
            ),
        ]

    def _generate_examples(self, filepaths):
        """Yields examples."""

        for filepath in filepaths:
            with open(filepath) as f:
                data = csv.reader(f, delimiter="\t")
                for idx, row in enumerate(data):
                    yield idx, {
                        "relation": row[0],
                        "question": row[1],
                        "subject": row[2],
                        "context": row[3],
                        "answers": row[4:],
                    }
Example #5
class Boolq(nlp.GeneratorBasedBuilder):
    """TODO(boolq): Short description of my dataset."""

    # TODO(boolq): Set up version.
    VERSION = nlp.Version("0.1.0")

    def _info(self):
        # TODO(boolq): Specifies the nlp.DatasetInfo object
        return nlp.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # nlp.features.FeatureConnectors
            features=nlp.Features(
                {
                    "question": nlp.Value("string"),
                    "answer": nlp.Value("bool"),
                    "passage": nlp.Value("string")
                    # These are the features of your dataset like images, labels ...
                }
            ),
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage="https://github.com/google-research-datasets/boolean-questions",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # TODO(boolq): Downloads the data and defines the splits
        # dl_manager is a nlp.download.DownloadManager that can be used to
        # download and extract URLs
        urls_to_download = {
            "train": os.path.join(_URL, _TRAIN_FILE_NAME),
            "dev": os.path.join(_URL, _DEV_FILE_NAME),
        }
        downloaded_files = dl_manager.download_custom(urls_to_download, tf.io.gfile.copy)

        return [
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["train"]}
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION,
                gen_kwargs={"filepath": downloaded_files["dev"]},
            ),
        ]

    def _generate_examples(self, filepath):
        """Yields examples."""
        # TODO(boolq): Yields (key, example) tuples from the dataset
        with open(filepath) as f:
            for id_, row in enumerate(f):
                data = json.loads(row)
                question = data["question"]
                answer = data["answer"]
                passage = data["passage"]
                yield id_, {"question": question, "answer": answer, "passage": passage}
Example #6
class ParaCrawl(nlp.GeneratorBasedBuilder):
    """ParaCrawl machine translation dataset."""

    # Version history:
    # 1.0.0: S3 (new shuffling, sharding and slicing mechanism).
    # 0.1.0: Initial version.
    BUILDER_CONFIGS = [
        # The version below does not refer to the version of the released
        # database. It only indicates the version of the TFDS integration.
        ParaCrawlConfig(  # pylint: disable=g-complex-comprehension
            target_language=target_language,
            version=nlp.Version("1.0.0"),
        ) for target_language in _target_languages()
    ]

    def _info(self):
        target_language = self.config.target_language
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features({
                "translation":
                nlp.features.Translation(languages=("en", target_language))
            }),
            supervised_keys=("en", target_language),
            homepage=_BENCHMARK_URL,
            citation=_CITATION,
        )

    def _vocab_text_gen(self, files, language):
        for _, ex in self._generate_examples(**files):
            yield ex[language]

    def _split_generators(self, dl_manager):
        # Download the data file.
        data_file = dl_manager.download_and_extract(
            {"data_file": self.config.data_url})

        # Return the single split of the data.
        return [nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs=data_file)]

    def _generate_examples(self, data_file):
        """This function returns the examples in the raw (text) form."""
        target_language = self.config.target_language

        with open(data_file, encoding="utf-8") as f:
            for idx, line in enumerate(f):
                line_parts = line.strip().split("\t")
                if len(line_parts) != 2:
                    msg = ("Wrong data format in line {}. The line '{}' does "
                           "not have exactly one delimiter.").format(
                               idx, line)
                    raise ValueError(msg)
                source, target = line_parts[0].strip(), line_parts[1].strip()
                yield idx, {
                    "translation": {
                        "en": source,
                        target_language: target
                    }
                }
Example #7
    def __init__(self, **kwargs):
        """BuilderConfig for DoQA.

        Args:
          **kwargs: keyword arguments forwarded to super.
        """

        super(DoqaConfig, self).__init__(version=nlp.Version("2.1.0", "New split API (https://tensorflow.org/datasets/splits)"), **kwargs)
Example #8
    def __init__(self, data_url, **kwargs):
        """BuilderConfig for SearchQA

        Args:
          **kwargs: keyword arguments forwarded to super.
        """
        super(SearchQaConfig, self).__init__(version=nlp.Version("1.0.0", "New split API (https://tensorflow.org/datasets/splits)"), **kwargs)
        self.data_url = data_url
Example #9
    def __init__(self, **kwargs):
        """BuilderConfig for Art.

        Args:
          **kwargs: keyword arguments forwarded to super.
        """
        super(ArtConfig, self).__init__(
            version=nlp.Version("0.1.0", "New split API (https://tensorflow.org/datasets/splits)"), **kwargs
        )
Example #10
    def __init__(self, data_url, **kwargs):
        """BuilderConfig for Wikitext

    Args:
      data_url: `string`, url to the dataset (word or raw level)
      **kwargs: keyword arguments forwarded to super.
    """
        super(WikitextConfig, self).__init__(version=nlp.Version("1.0.0",), **kwargs)
        self.data_url = data_url
Example #11
    def __init__(self, **kwargs):
        """BuilderConfig for KorNLI.

    Args:
      **kwargs: keyword arguments forwarded to super.
    """
        super(KorNLIConfig, self).__init__(version=nlp.Version("1.0.0"), **kwargs)
Example #12
    def __init__(self, **kwargs):
        """BuilderConfig for Assertion.

        Args:
          **kwargs: keyword arguments forwarded to super.
        """
        super(AssertionConfig, self).__init__(
            version=nlp.Version("1.0.0", "New split API (https://tensorflow.org/datasets/splits)"), **kwargs
        )
Example #13
    def __init__(self, data_url, **kwargs):
        """BuilderConfig for BlogAuthorship

        Args:
          data_url: `string`, url to the dataset (word or raw level)
          **kwargs: keyword arguments forwarded to super.
        """
        super(CrimeAndPunishConfig, self).__init__(version=nlp.Version("1.0.0",), **kwargs)
        self.data_url = data_url
Example #14
class Snli(nlp.GeneratorBasedBuilder):
    """The Stanford Natural Language Inference (SNLI) Corpus."""

    BUILDER_CONFIGS = [
        nlp.BuilderConfig(
            name="plain_text",
            version=nlp.Version("1.0.0", "New split API (https://tensorflow.org/datasets/splits)"),
            description="Plain text import of SNLI",
        )
    ]

    def _info(self):
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(
                {
                    "premise": nlp.Value("string"),
                    "hypothesis": nlp.Value("string"),
                    "label": nlp.features.ClassLabel(names=["entailment", "neutral", "contradiction"]),
                }
            ),
            # No default supervised_keys (as we have to pass both premise
            # and hypothesis as input).
            supervised_keys=None,
            homepage="https://nlp.stanford.edu/projects/snli/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        dl_dir = dl_manager.download_and_extract(_DATA_URL)
        data_dir = os.path.join(dl_dir, "snli_1.0")
        return [
            nlp.SplitGenerator(
                name=nlp.Split.TEST, gen_kwargs={"filepath": os.path.join(data_dir, "snli_1.0_test.txt")}
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION, gen_kwargs={"filepath": os.path.join(data_dir, "snli_1.0_dev.txt")}
            ),
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN, gen_kwargs={"filepath": os.path.join(data_dir, "snli_1.0_train.txt")}
            ),
        ]

    def _generate_examples(self, filepath):
        """This function returns the examples in the raw (text) form."""
        with open(filepath) as f:
            reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
            for idx, row in enumerate(reader):
                label = -1 if row["gold_label"] == "-" else row["gold_label"]
                yield idx, {
                    "premise": row["sentence1"],
                    "hypothesis": row["sentence2"],
                    "label": label,
                }
Example #15
    def __init__(self, **kwargs):
        """BuilderConfig for KILTTasks.

        Args:
          **kwargs: keyword arguments forwarded to super.
        """
        super(KILTTasksConfig, self).__init__(version=nlp.Version(
            "1.0.0", "KILT tasks training and evaluation data"),
                                              **kwargs)
Example #16
    def __init__(self, **kwargs):
        """BuilderConfig for KILTWikipedia.

        Args:
          **kwargs: keyword arguments forwarded to super.
        """
        super(KILTWikipediaConfig, self).__init__(
            version=nlp.Version("1.0.0", "Wikipedia pre-processed for KILT"), **kwargs
        )
Example #17
    def __init__(self, **kwargs):
        """

        Args:
            data_dir: directory for the given language dataset
            **kwargs: keyword arguments forwarded to super.
        """
        super(XcopaConfig, self).__init__(version=nlp.Version(
            "1.0.0", "New split API (https://tensorflow.org/datasets/splits)"),
                                          **kwargs)
Example #18
    def __init__(self, data_url, **kwargs):
        """BuilderConfig for MLQA

        Args:
          data_url: `string`, url to the dataset
          **kwargs: keyword arguments forwarded to super.
        """
        super(MlqaConfig, self).__init__(version=nlp.Version("1.0.0", ),
                                         **kwargs)
        self.data_url = data_url
Example #19
class CrimeAndPunish(nlp.GeneratorBasedBuilder):

    VERSION = nlp.Version("0.1.0")
    BUILDER_CONFIGS = [
        CrimeAndPunishConfig(
            name="crime-and-punish",
            data_url=_DATA_URL,
            description="word level dataset. No processing is needed other than replacing newlines with <eos> tokens.",
        ),
    ]

    def _info(self):
        return nlp.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # nlp.features.FeatureConnectors
            features=nlp.Features({"line": nlp.Value("string"),}),
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            homepage=_URL,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""

        if self.config.name == "crime-and-punish":
            data = dl_manager.download_and_extract(self.config.data_url)

            return [
                nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"data_file": data, "split": "train"},),
            ]
        else:
            raise ValueError("{} does not exist".format(self.config.name))

    def _generate_examples(self, data_file, split):
        """Yields examples, skipping the Project Gutenberg front and back matter."""
        with open(data_file, "rb") as f:
            id_counter = 0
            add_text = False
            crime_and_punishment_occ_counter = 0

            for line in f:
                line = line.decode("UTF-8")
                # The novel itself starts after the third occurrence of the
                # title line and ends at the Project Gutenberg footer.
                if "CRIME AND PUNISHMENT" in line:
                    crime_and_punishment_occ_counter += 1
                    add_text = crime_and_punishment_occ_counter == 3
                if "End of Project" in line:
                    add_text = False

                if add_text is True:
                    result = {"line": line}
                    id_counter += 1
                    yield id_counter, result
Example #20
class TedMultiTranslate(nlp.GeneratorBasedBuilder):
    """TED talk multilingual data set."""

    BUILDER_CONFIGS = [
        nlp.BuilderConfig(
            name="plain_text",
            version=nlp.Version("1.0.0", "New split API (https://tensorflow.org/datasets/splits)"),
            description="Plain text import of multilingual TED talk translations",
        )
    ]

    def _info(self):
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(
                {
                    "translations": nlp.features.TranslationVariableLanguages(languages=_LANGUAGES),
                    "talk_name": nlp.Value("string"),
                }
            ),
            homepage="https://github.com/neulab/word-embeddings-for-nmt",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        dl_dir = dl_manager.download_and_extract(_DATA_URL)

        return [
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN, gen_kwargs={"data_file": os.path.join(dl_dir, "all_talks_train.tsv")}
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION, gen_kwargs={"data_file": os.path.join(dl_dir, "all_talks_dev.tsv")}
            ),
            nlp.SplitGenerator(
                name=nlp.Split.TEST, gen_kwargs={"data_file": os.path.join(dl_dir, "all_talks_test.tsv")}
            ),
        ]

    def _generate_examples(self, data_file):
        """This function returns the examples in the raw (text) form."""
        with open(data_file) as f:
            reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
            for idx, row in enumerate(reader):
                # Everything in the row except for 'talk_name' will be a translation.
                # Missing/incomplete translations will contain the string "__NULL__" or
                # "_ _ NULL _ _".
                yield idx, {
                    "translations": {
                        lang: text
                        for lang, text in six.iteritems(row)
                        if lang != "talk_name" and _is_translation_complete(text)
                    },
                    "talk_name": row["talk_name"],
                }
Example #21
    def __init__(self, lang, **kwargs):
        """

        Args:
            lang: string, language for the input text
            **kwargs: keyword arguments forwarded to super.
        """
        super(XquadConfig, self).__init__(version=nlp.Version(
            "1.0.0", "New split API (https://tensorflow.org/datasets/splits)"),
                                          **kwargs)
        self.lang = lang
Example #22
    def __init__(self, features, **kwargs):
        """BuilderConfig for UbuntuDialogsCorpus.

    Args:
      features: `list[string]`, list of the features that will appear in the
        feature dict.
      **kwargs: keyword arguments forwarded to super.
    """

        super(UbuntuDialogsCorpusConfig,
              self).__init__(version=nlp.Version("2.0.0"), **kwargs)
        self.features = features
Example #23
class Gap(nlp.GeneratorBasedBuilder):
    """GAP is a gender-balanced dataset.

  It contains 8,908 coreference-labeled pairs
  of (ambiguous pronoun, antecedent name), sampled from Wikipedia.
  """

    VERSION = nlp.Version("0.1.0")

    def _info(self):
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(
                {
                    "ID": nlp.Value("string"),
                    "Text": nlp.Value("string"),
                    "Pronoun": nlp.Value("string"),
                    "Pronoun-offset": nlp.Value("int32"),
                    "A": nlp.Value("string"),
                    "A-offset": nlp.Value("int32"),
                    "A-coref": nlp.Value("bool"),
                    "B": nlp.Value("string"),
                    "B-offset": nlp.Value("int32"),
                    "B-coref": nlp.Value("bool"),
                    "URL": nlp.Value("string"),
                }
            ),
            supervised_keys=None,
            homepage="https://github.com/google-research-datasets/gap-coreference",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        directory = dl_manager.download_and_extract(
            {"train": _TRAINURL, "validation": _VALIDATIONURL, "test": _TESTURL}
        )
        return [
            nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"filepath": directory["train"]},),
            nlp.SplitGenerator(name=nlp.Split.VALIDATION, gen_kwargs={"filepath": directory["validation"]},),
            nlp.SplitGenerator(name=nlp.Split.TEST, gen_kwargs={"filepath": directory["test"]},),
        ]

    def _generate_examples(self, filepath):
        """Yields examples."""
        with open(filepath, encoding="utf-8") as tsvfile:
            reader = csv.DictReader(tsvfile, dialect="excel-tab")
            for i, row in enumerate(reader):
                row["A-coref"] = bool(row["A-coref"])
                row["B-coref"] = bool(row["B-coref"])
                row["A-offset"] = int(row["A-offset"])
                row["B-offset"] = int(row["B-offset"])
                row["Pronoun-offset"] = int(row["Pronoun-offset"])
                yield i, row
Example #24
    def __init__(self, filename=None, **kwargs):
        """BuilderConfig for Wikihow.

    Args:
      filename: filename of different configs for the dataset.
      **kwargs: keyword arguments forwarded to super.
    """
        # 1.1.0 remove sentence breaker <S> and </S> in summary.
        super(ScientificPapersConfig,
              self).__init__(version=nlp.Version("1.1.1"), **kwargs)
        self.filename = filename
Example #25
    def __init__(self, filename=None, **kwargs):
        """BuilderConfig for Wikihow.

    Args:
      filename: filename of different configs for the dataset.
      **kwargs: keyword arguments forwarded to super.
    """
        # Version 1.1.0 remove empty document and summary strings.
        # Version 1.2.0 add train validation test split, add cleaning & filtering.
        super(WikihowConfig, self).__init__(version=nlp.Version("1.2.0"), **kwargs)
        self.filename = filename
Example #26
    def __init__(self, summary_key=None, **kwargs):
        """BuilderConfig for RedditTifu.

    Args:
      summary_key: key string of summary in downloaded json file.
      **kwargs: keyword arguments forwarded to super.
    """
        # Version 1.1.0 remove empty document and summary strings.
        super(RedditTifuConfig, self).__init__(version=nlp.Version("1.1.0"),
                                               **kwargs)
        self.summary_key = summary_key
Example #27
class DefinitePronounResolution(nlp.GeneratorBasedBuilder):
    """The Definite Pronoun Resolution Dataset."""

    BUILDER_CONFIGS = [
        nlp.BuilderConfig(
            name="plain_text",
            version=nlp.Version("1.0.0", "New split API (https://tensorflow.org/datasets/splits)"),
            description="Plain text import of the Definite Pronoun Resolution Dataset.",  # pylint: disable=line-too-long
        )
    ]

    def _info(self):
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(
                {
                    "sentence": nlp.Value("string"),
                    "pronoun": nlp.Value("string"),
                    "candidates": nlp.features.Sequence(nlp.Value("string"), length=2),
                    "label": nlp.features.ClassLabel(num_classes=2),
                }
            ),
            supervised_keys=("sentence", "label"),
            homepage="http://www.hlt.utdallas.edu/~vince/data/emnlp12/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        files = dl_manager.download_and_extract(
            {"train": _DATA_URL_PATTERN.format("train"), "test": _DATA_URL_PATTERN.format("test"),}
        )
        return [
            nlp.SplitGenerator(name=nlp.Split.TEST, gen_kwargs={"filepath": files["test"]}),
            nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"filepath": files["train"]}),
        ]

    def _generate_examples(self, filepath):
        with open(filepath) as f:
            line_num = -1
            while True:
                line_num += 1
                sentence = f.readline().strip()
                pronoun = f.readline().strip()
                candidates = [c.strip() for c in f.readline().strip().split(",")]
                correct = f.readline().strip()
                f.readline()
                if not sentence:
                    break
                yield line_num, {
                    "sentence": sentence,
                    "pronoun": pronoun,
                    "candidates": candidates,
                    "label": candidates.index(correct),
                }
Example #28
    def __init__(self, data_size, **kwargs):
        """

        Args:
            data_size: the size of the training set we want to us (xs, s, m, l, xl)
            **kwargs: keyword arguments forwarded to super.
        """
        super(WinograndeConfig, self).__init__(version=nlp.Version(
            "1.0.0", "New split API (https://tensorflow.org/datasets/splits)"),
                                               **kwargs)
        self.data_size = data_size
Example #29
    def __init__(self, data_url, balanced=False, **kwargs):
        """

        Args:
            balanced: to specify if we want to load the balanced file or the full file
            **kwargs: keyword arguments forwarded to super.
        """
        super(DiscofuseConfig, self).__init__(
            version=nlp.Version("1.0.0", "New split API (https://tensorflow.org/datasets/splits)"), **kwargs
        )
        self.balanced = balanced
        self.data_url = data_url
Example #30
def _generate_builder_configs():
  """Generate configs with different subsets of mathematics dataset."""
  configs = []
  for module in sorted(set(_MODULES)):
    configs.append(
        nlp.BuilderConfig(
            name=module,
            version=nlp.Version("1.0.0"),
            description=_DESCRIPTION,
        ))

  return configs
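Usage note (not part of the original script): the generated configs are typically attached to the builder class; a minimal sketch follows, with the builder class name assumed.

# Hypothetical wiring: the class name is an assumption; only the
# BUILDER_CONFIGS assignment reflects how the helper above is normally used.
class MathematicsDataset(nlp.GeneratorBasedBuilder):
  """Mathematics dataset builder exposing one config per module."""

  BUILDER_CONFIGS = _generate_builder_configs()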