def _info(self):
    """Describe the dataset schema for the selected configuration.

    Configurations whose name starts with ``"documents"`` expose the
    ``Domain``, ``Source_URL`` and ``Target_URL`` strings together with the
    translation pair. Every other configuration exposes the translation
    pair and a ``LASER_similarity`` float.

    Returns:
        datasets.DatasetInfo: description, features, homepage, license and
        citation, with ``supervised_keys`` set to ``None``.
    """
    is_documents_config = self.config.name.startswith("documents")

    # Both schemas pair "en_XX" with the configured language code.
    translation = datasets.Translation(languages=("en_XX", self.config.language_code))

    if is_documents_config:
        schema = {
            "Domain": datasets.Value("string"),
            "Source_URL": datasets.Value("string"),
            "Target_URL": datasets.Value("string"),
            "translation": translation,
        }
    else:
        schema = {
            "translation": translation,
            "LASER_similarity": datasets.Value("float"),
        }

    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features(schema),
        supervised_keys=None,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
    )
def _info(self):
    """Build the ``datasets.DatasetInfo`` for the selected configuration.

    The feature schema depends on ``self.config.name``:

    * ``"monolingual_raw"``: ``text_sentence`` plus the ``text_title``,
      ``speaker``, ``date``, ``type`` and ``dialect`` metadata columns.
    * ``"parallel_raw"``: ``line_number``, an ``en``/``chr``
      ``sentence_pair`` and the same metadata columns.
    * ``"parallel"``: only the ``en``/``chr`` ``sentence_pair``.
    * ``"monolingual"``: only a ``sentence`` string.

    Returns:
        datasets.DatasetInfo: description, features, homepage, license and
        citation, with ``supervised_keys`` set to ``None``.

    Raises:
        ValueError: if ``self.config.name`` is not one of the four names
            listed above.
    """
    config_name = self.config.name

    if config_name == "monolingual_raw":
        features = datasets.Features(
            {
                "text_sentence": datasets.Value("string"),
                "text_title": datasets.Value("string"),
                "speaker": datasets.Value("string"),
                "date": datasets.Value("int32"),
                "type": datasets.Value("string"),
                "dialect": datasets.Value("string"),
            }
        )
    elif config_name == "parallel_raw":
        features = datasets.Features(
            {
                "line_number": datasets.Value("string"),  # doesn't always map to a number
                "sentence_pair": datasets.Translation(languages=["en", "chr"]),
                "text_title": datasets.Value("string"),
                "speaker": datasets.Value("string"),
                "date": datasets.Value("int32"),
                "type": datasets.Value("string"),
                "dialect": datasets.Value("string"),
            }
        )
    elif config_name == "parallel":
        features = datasets.Features(
            {
                "sentence_pair": datasets.Translation(languages=["en", "chr"]),
            }
        )
    elif config_name == "monolingual":
        features = datasets.Features(
            {
                "sentence": datasets.Value("string"),
            }
        )
    else:
        # Without this branch an unknown name would leave ``features``
        # unbound and surface as an opaque UnboundLocalError further down.
        raise ValueError(
            f"Unknown configuration name: {config_name!r}. Expected one of: "
            "'monolingual_raw', 'parallel_raw', 'parallel', 'monolingual'."
        )

    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        supervised_keys=None,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
    )
def _info(self):
    """Describe the dataset schema for the configured language.

    When ``self.config.language`` is ``"all_languages"`` the premise is a
    ``Translation`` and the hypothesis a ``TranslationVariableLanguages``
    over ``_LANGUAGES``; otherwise both are plain strings. The ``label``
    column is the same three-way class label in either case.

    Returns:
        datasets.DatasetInfo: description, features, homepage and citation,
        with ``supervised_keys`` set to ``None``.
    """
    covers_all_languages = self.config.language == "all_languages"

    if covers_all_languages:
        premise = datasets.Translation(languages=_LANGUAGES)
        hypothesis = datasets.TranslationVariableLanguages(languages=_LANGUAGES)
    else:
        premise = datasets.Value("string")
        hypothesis = datasets.Value("string")

    schema = {
        "premise": premise,
        "hypothesis": hypothesis,
        "label": datasets.ClassLabel(names=["entailment", "neutral", "contradiction"]),
    }

    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features(schema),
        # Premise and hypothesis are both inputs, so there is no default
        # (input, target) pair to declare.
        supervised_keys=None,
        homepage="https://www.nyu.edu/projects/bowman/xnli/",
        citation=_CITATION,
    )
def _info(self):
    """Describe the dataset schema for the configured language pair.

    The schema holds the ``translation`` pair (``self.config.src_lg``,
    ``self.config.tgt_lg``), ``BAD``/``OK`` tag sequences for the source
    and the MT output, the ``pe`` string, the ``hter`` float and the
    ``alignments`` as a sequence of int32 sequences.

    Returns:
        datasets.DatasetInfo: description, features, homepage, license and
        citation, with ``supervised_keys`` set to ``None``.
    """
    language_pair = (self.config.src_lg, self.config.tgt_lg)
    tag_names = ("BAD", "OK")

    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features(
            {
                "translation": datasets.Translation(languages=language_pair),
                # Each tag column gets its own ClassLabel and its own names list.
                "src_tags": datasets.Sequence(datasets.ClassLabel(names=list(tag_names))),
                "mt_tags": datasets.Sequence(datasets.ClassLabel(names=list(tag_names))),
                "pe": datasets.Value("string"),
                "hter": datasets.Value("float32"),
                "alignments": datasets.Sequence(datasets.Sequence(datasets.Value("int32"))),
            }
        ),
        supervised_keys=None,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
    )
def _info(self):
    """Describe the dataset schema for the configured language pair.

    The schema holds a segment id, the ``translation`` pair
    (``self.config.src_lg``, ``self.config.tgt_lg``), float32 score columns
    (``scores``, ``mean``, ``z_scores``, ``z_mean``, ``model_score``,
    ``word_probas``) and the ``doc_id`` and ``nmt_output`` strings.

    Returns:
        datasets.DatasetInfo: description, features, homepage, license and
        citation, with ``supervised_keys`` set to ``None``.
    """

    def float32():
        # A fresh feature object per column; nothing is shared between columns.
        return datasets.Value("float32")

    def float32_list():
        return datasets.Sequence(float32())

    language_pair = (self.config.src_lg, self.config.tgt_lg)
    schema = {
        "segid": datasets.Value("int32"),
        "translation": datasets.Translation(languages=language_pair),
        "scores": float32_list(),
        "mean": float32(),
        "z_scores": float32_list(),
        "z_mean": float32(),
        "model_score": float32(),
        "doc_id": datasets.Value("string"),
        "nmt_output": datasets.Value("string"),
        "word_probas": float32_list(),
    }

    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features(schema),
        supervised_keys=None,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
    )
def _info(self):
    """Describe the dataset schema for the configured language pair.

    Each example has a string ``id``, a ``meta`` record (``year``,
    ``imdbId`` and per-language ``subtitleId`` / ``sentenceIds`` entries
    keyed by ``self.config.lang1`` and ``self.config.lang2``) and the
    ``translation`` pair for those two languages.

    Returns:
        datasets.DatasetInfo: description, features, homepage and citation,
        with ``supervised_keys`` set to ``None``.
    """
    languages = (self.config.lang1, self.config.lang2)

    meta = {
        "year": datasets.Value("uint32"),
        "imdbId": datasets.Value("uint32"),
        # One entry per configured language, in (lang1, lang2) order.
        "subtitleId": {lang: datasets.Value("uint32") for lang in languages},
        "sentenceIds": {lang: datasets.Sequence(datasets.Value("uint32")) for lang in languages},
    }
    features = datasets.Features(
        {
            "id": datasets.Value("string"),
            "meta": meta,
            "translation": datasets.Translation(languages=languages),
        },
    )

    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        supervised_keys=None,
        homepage=_HOMEPAGE_URL,
        citation=_CITATION,
    )
def _info(self):
    """Describe the dataset schema for the configured language pair.

    Each example has a string ``id`` and a ``translation`` pair for
    ``self.config.lang1`` and ``self.config.lang2``.

    Returns:
        datasets.DatasetInfo: description, features, homepage and citation,
        with ``supervised_keys`` set to ``None``.
    """
    language_pair = (self.config.lang1, self.config.lang2)
    features = datasets.Features(
        {
            "id": datasets.Value("string"),
            "translation": datasets.Translation(languages=language_pair),
        },
    )

    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        supervised_keys=None,
        homepage=_HOMEPAGE_URL,
        citation=_CITATION,
    )
def _info(self):
    """Return the ``datasets.DatasetInfo`` describing this dataset.

    The only feature is a ``translation`` pair for ``self.config.lang1``
    and ``self.config.lang2``.

    Returns:
        datasets.DatasetInfo: description, features, homepage, license and
        citation, with ``supervised_keys`` set to ``None``.
    """
    language_pair = (self.config.lang1, self.config.lang2)

    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features(
            {
                "translation": datasets.Translation(languages=language_pair),
            }
        ),
        supervised_keys=None,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
    )
def _info(self):
    """Describe the dataset schema.

    Besides the ``en``/``hi_en`` ``translation`` pair, each example carries
    string and int64 metadata columns and two per-user response records
    (``uid1response`` and ``uid2response``), each made of an int64
    ``response`` sequence and a string ``type``.

    Returns:
        datasets.DatasetInfo: description, features, homepage and citation,
        with ``supervised_keys`` set to ``None``.
    """

    def response_record():
        # Built fresh on every call so the two user columns share no objects.
        return {
            "response": datasets.Sequence(datasets.Value("int64")),
            "type": datasets.Value("string"),
        }

    schema = {
        "date": datasets.Value("string"),
        "docIdx": datasets.Value("int64"),
        "translation": datasets.Translation(languages=["en", "hi_en"]),
        "uid": datasets.Value("string"),
        "utcTimestamp": datasets.Value("string"),
        "rating": datasets.Value("int64"),
        "status": datasets.Value("int64"),
        "uid1LogInTime": datasets.Value("string"),
        "uid1LogOutTime": datasets.Value("string"),
        "uid1response": response_record(),
        "uid2response": response_record(),
        "user2_id": datasets.Value("string"),
        "whoSawDoc": datasets.Sequence(datasets.Value("string")),
        "wikiDocumentIdx": datasets.Value("int64"),
    }

    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features(schema),
        supervised_keys=None,
        homepage=_HOMEPAGE,
        citation=_CITATION,
    )
def _info(self):
    """Describe the dataset schema.

    Each example has an ``en``/``fr`` ``sentence_pair``, a two-way
    ``label`` (``divergent`` / ``equivalent``), a three-way ``all_labels``
    class label and two int32 sequences, ``rationale_en`` and
    ``rationale_fr``.

    Returns:
        datasets.DatasetInfo: description, features, homepage, license and
        citation, with ``supervised_keys`` set to ``None``.
    """
    feature_types = datasets.features

    schema = {
        "sentence_pair": datasets.Translation(languages=["en", "fr"]),
        "label": feature_types.ClassLabel(names=["divergent", "equivalent"]),
        "all_labels": feature_types.ClassLabel(
            names=["unrelated", "some_meaning_difference", "no_meaning_difference"]
        ),
        "rationale_en": feature_types.Sequence(datasets.Value("int32")),
        "rationale_fr": feature_types.Sequence(datasets.Value("int32")),
    }

    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features(schema),
        supervised_keys=None,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
    )