Exemplo n.º 1
0
    def _info(self):
        """Build the ``datasets.DatasetInfo`` for the selected configuration.

        Configurations whose name starts with ``"documents"`` expose the
        ``Domain``, ``Source_URL`` and ``Target_URL`` string columns alongside
        the translation pair. Every other configuration exposes the
        translation pair together with a ``LASER_similarity`` float.

        Returns:
            datasets.DatasetInfo: description, features, homepage, license and
            citation of the dataset; ``supervised_keys`` is left as ``None``.
        """
        # Both schemas pair "en_XX" with the language code of the configuration.
        language_pair = ("en_XX", self.config.language_code)

        # ``startswith`` keeps the check correct even if the prefix literal is
        # ever edited, unlike a hard-coded slice length.
        if self.config.name.startswith("documents"):
            features = datasets.Features(
                {
                    "Domain": datasets.Value("string"),
                    "Source_URL": datasets.Value("string"),
                    "Target_URL": datasets.Value("string"),
                    "translation": datasets.Translation(languages=language_pair),
                }
            )
        else:
            features = datasets.Features(
                {
                    "translation": datasets.Translation(languages=language_pair),
                    "LASER_similarity": datasets.Value("float"),
                }
            )

        return datasets.DatasetInfo(
            # Description shown on the datasets page.
            description=_DESCRIPTION,
            # Column names and types; they differ per configuration (see above).
            features=features,
            supervised_keys=None,
            # Homepage of the dataset for documentation.
            homepage=_HOMEPAGE,
            # License for the dataset, if available.
            license=_LICENSE,
            # Citation for the dataset.
            citation=_CITATION,
        )
Exemplo n.º 2
0
 def _info(self):
     """Return the ``datasets.DatasetInfo`` matching the selected configuration.

     Four configuration names are recognised: ``"monolingual_raw"``,
     ``"parallel_raw"``, ``"parallel"`` and ``"monolingual"``. Each one maps
     to its own feature schema.

     Returns:
         datasets.DatasetInfo: description, features, homepage, license and
         citation of the dataset; ``supervised_keys`` is left as ``None``.

     Raises:
         ValueError: if ``self.config.name`` is none of the names above.
     """
     config_name = self.config.name

     if config_name == "monolingual_raw":
         features = datasets.Features({
             "text_sentence": datasets.Value("string"),
             "text_title": datasets.Value("string"),
             "speaker": datasets.Value("string"),
             "date": datasets.Value("int32"),
             "type": datasets.Value("string"),
             "dialect": datasets.Value("string"),
         })
     elif config_name == "parallel_raw":
         features = datasets.Features({
             # Stored as a string because it doesn't always map to a number.
             "line_number": datasets.Value("string"),
             "sentence_pair": datasets.Translation(languages=["en", "chr"]),
             "text_title": datasets.Value("string"),
             "speaker": datasets.Value("string"),
             "date": datasets.Value("int32"),
             "type": datasets.Value("string"),
             "dialect": datasets.Value("string"),
         })
     elif config_name == "parallel":
         features = datasets.Features({
             "sentence_pair": datasets.Translation(languages=["en", "chr"]),
         })
     elif config_name == "monolingual":
         features = datasets.Features({
             "sentence": datasets.Value("string"),
         })
     else:
         # Fail with a clear message instead of an UnboundLocalError on
         # ``features`` further down.
         raise ValueError(f"Unknown configuration name: {config_name!r}")

     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=features,
         supervised_keys=None,
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
     )
Exemplo n.º 3
0
 def _info(self):
     """Describe the dataset for the configured language.

     With the ``"all_languages"`` configuration the premise is a
     ``datasets.Translation`` and the hypothesis a
     ``datasets.TranslationVariableLanguages``, both over ``_LANGUAGES``.
     For any other language both are plain strings. The label is always
     one of ``entailment``, ``neutral`` or ``contradiction``.
     """
     if self.config.language == "all_languages":
         premise_feature = datasets.Translation(languages=_LANGUAGES)
         hypothesis_feature = datasets.TranslationVariableLanguages(
             languages=_LANGUAGES)
     else:
         premise_feature = datasets.Value("string")
         hypothesis_feature = datasets.Value("string")

     label_feature = datasets.ClassLabel(
         names=["entailment", "neutral", "contradiction"])

     schema = datasets.Features({
         "premise": premise_feature,
         "hypothesis": hypothesis_feature,
         "label": label_feature,
     })

     # supervised_keys stays None: premise and hypothesis are both inputs,
     # so there is no single (input, target) pair to designate.
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=schema,
         supervised_keys=None,
         homepage="https://www.nyu.edu/projects/bowman/xnli/",
         citation=_CITATION,
     )
Exemplo n.º 4
0
    def _info(self):
        """Describe the dataset: translation pair, word tags, post-edit, HTER, alignments.

        The translation languages are taken from ``self.config.src_lg`` and
        ``self.config.tgt_lg``. ``src_tags`` and ``mt_tags`` are sequences of
        ``BAD``/``OK`` class labels.
        """
        language_pair = (self.config.src_lg, self.config.tgt_lg)

        columns = {
            "translation": datasets.Translation(languages=language_pair),
            "src_tags": datasets.Sequence(datasets.ClassLabel(names=["BAD", "OK"])),
            "mt_tags": datasets.Sequence(datasets.ClassLabel(names=["BAD", "OK"])),
            "pe": datasets.Value("string"),
            "hter": datasets.Value("float32"),
            # Nested sequence of int32 values.
            "alignments": datasets.Sequence(datasets.Sequence(datasets.Value("int32"))),
        }

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(columns),
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )
    def _info(self):
        """Describe the dataset: segment id, translation pair, scores and model outputs.

        The translation languages are taken from ``self.config.src_lg`` and
        ``self.config.tgt_lg``.
        """
        language_pair = (self.config.src_lg, self.config.tgt_lg)

        columns = {
            "segid": datasets.Value("int32"),
            "translation": datasets.Translation(languages=language_pair),
            "scores": datasets.Sequence(datasets.Value("float32")),
            "mean": datasets.Value("float32"),
            "z_scores": datasets.Sequence(datasets.Value("float32")),
            "z_mean": datasets.Value("float32"),
            "model_score": datasets.Value("float32"),
            "doc_id": datasets.Value("string"),
            "nmt_output": datasets.Value("string"),
            "word_probas": datasets.Sequence(datasets.Value("float32")),
        }

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(columns),
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )
Exemplo n.º 6
0
 def _info(self):
     """Describe the dataset: an id, per-language metadata and a translation pair.

     The two languages come from ``self.config.lang1`` and
     ``self.config.lang2``; they key the ``subtitleId`` and ``sentenceIds``
     metadata entries and define the translation feature.
     """
     lang_pair = (self.config.lang1, self.config.lang2)

     # One entry per language, in (lang1, lang2) order.
     subtitle_ids = {lang: datasets.Value("uint32") for lang in lang_pair}
     sentence_ids = {
         lang: datasets.Sequence(datasets.Value("uint32"))
         for lang in lang_pair
     }

     meta = {
         "year": datasets.Value("uint32"),
         "imdbId": datasets.Value("uint32"),
         "subtitleId": subtitle_ids,
         "sentenceIds": sentence_ids,
     }

     schema = datasets.Features({
         "id": datasets.Value("string"),
         "meta": meta,
         "translation": datasets.Translation(languages=lang_pair),
     })

     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=schema,
         supervised_keys=None,
         homepage=_HOMEPAGE_URL,
         citation=_CITATION,
     )
Exemplo n.º 7
0
 def _info(self):
     """Describe the dataset: a string id plus a translation pair.

     The translation languages are ``self.config.lang1`` and
     ``self.config.lang2``.
     """
     lang_pair = (self.config.lang1, self.config.lang2)
     schema = datasets.Features({
         "id": datasets.Value("string"),
         "translation": datasets.Translation(languages=lang_pair),
     })
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=schema,
         supervised_keys=None,
         homepage=_HOMEPAGE_URL,
         citation=_CITATION,
     )
Exemplo n.º 8
0
    def _info(self):
        """Return the ``datasets.DatasetInfo`` holding the dataset's metadata and feature types.

        The only feature is a translation pair over ``self.config.lang1`` and
        ``self.config.lang2``.
        """
        lang_pair = (self.config.lang1, self.config.lang2)
        schema = datasets.Features(
            {"translation": datasets.Translation(languages=lang_pair)}
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=schema,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )
Exemplo n.º 9
0
 def _info(self):
     """Describe the dataset: an ``en``/``hi_en`` translation pair plus per-record metadata.

     ``uid1response`` and ``uid2response`` share the same nested layout: a
     sequence of int64 values under ``response`` and a string ``type``.
     """

     def response_struct():
         # Fresh dict on every call so the two columns never share objects.
         return {
             "response": datasets.Sequence(datasets.Value("int64")),
             "type": datasets.Value("string"),
         }

     columns = {
         "date": datasets.Value("string"),
         "docIdx": datasets.Value("int64"),
         "translation": datasets.Translation(languages=["en", "hi_en"]),
         "uid": datasets.Value("string"),
         "utcTimestamp": datasets.Value("string"),
         "rating": datasets.Value("int64"),
         "status": datasets.Value("int64"),
         "uid1LogInTime": datasets.Value("string"),
         "uid1LogOutTime": datasets.Value("string"),
         "uid1response": response_struct(),
         "uid2response": response_struct(),
         "user2_id": datasets.Value("string"),
         "whoSawDoc": datasets.Sequence(datasets.Value("string")),
         "wikiDocumentIdx": datasets.Value("int64"),
     }

     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features(columns),
         supervised_keys=None,
         homepage=_HOMEPAGE,
         citation=_CITATION,
     )
Exemplo n.º 10
0
 def _info(self):
     """Describe the dataset: an ``en``/``fr`` sentence pair, two labels and two rationales.

     ``label`` is ``divergent`` or ``equivalent``; ``all_labels`` is one of
     three classes. ``rationale_en`` and ``rationale_fr`` are sequences of
     int32 values.
     """
     binary_names = ["divergent", "equivalent"]
     fine_grained_names = [
         "unrelated",
         "some_meaning_difference",
         "no_meaning_difference",
     ]

     columns = {
         "sentence_pair": datasets.Translation(languages=["en", "fr"]),
         "label": datasets.features.ClassLabel(names=binary_names),
         "all_labels": datasets.features.ClassLabel(names=fine_grained_names),
         "rationale_en": datasets.features.Sequence(datasets.Value("int32")),
         "rationale_fr": datasets.features.Sequence(datasets.Value("int32")),
     }

     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features(columns),
         supervised_keys=None,
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
     )