Example #1
File: glue.py Project: worm1271/nlp
 def _info(self):
     if self.config_name not in [
             "sst2", "mnli", "mnli_mismatched", "mnli_matched", "cola",
             "stsb", "mrpc", "qqp", "qnli", "rte", "wnli", "hans"
     ]:
         raise KeyError(
             'You should supply a configuration name selected in '
             '["sst2", "mnli", "mnli_mismatched", "mnli_matched", '
             '"cola", "stsb", "mrpc", "qqp", "qnli", "rte", "wnli", "hans"]'
         )
     return nlp.MetricInfo(
         description=_DESCRIPTION,
         citation=_CITATION,
         inputs_description=_KWARGS_DESCRIPTION,
         features=nlp.Features({
             'predictions':
             nlp.Value(
                 'int64' if self.config_name != 'stsb' else 'float32'),
             'references':
             nlp.Value(
                 'int64' if self.config_name != 'stsb' else 'float32'),
         }),
         codebase_urls=[],
         reference_urls=[],
         format='numpy')
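Note (not from the source file): `config_name` is set when the metric is loaded, so the check above fires on an unknown configuration. A minimal usage sketch, assuming the library's standard `load_metric` entry point; the values are invented for illustration:

import nlp

# The second argument becomes `self.config_name` inside `_info`.
metric = nlp.load_metric("glue", "mrpc")
# "mrpc" reports accuracy and F1; predictions/references invented here.
score = metric.compute(predictions=[0, 1, 1], references=[0, 1, 0])
print(score)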
Example #2
 def _info(self):
     return nlp.DatasetInfo(
         # This is the description that will appear on the datasets page.
         description=_DESCRIPTION,
         # nlp.features.FeatureConnectors
         features=nlp.Features(
             {
                 "string": nlp.Value("string"),
                 "sectionName": nlp.Value("string"),
                 "label": nlp.features.ClassLabel(names=["method", "background", "result"]),
                 "citingPaperId": nlp.Value("string"),
                 "citedPaperId": nlp.Value("string"),
                 "excerpt_index": nlp.Value("int32"),
                 "isKeyCitation": nlp.Value("bool"),
                 "label2": nlp.features.ClassLabel(
                     names=["supportive", "not_supportive", "cant_determine", "none"]
                 ),
                 "citeEnd": nlp.Value("int64"),
                 "citeStart": nlp.Value("int64"),
                 "source": nlp.features.ClassLabel(names=_SOURCE_NAMES),
                 "label_confidence": nlp.Value("float32"),
                 "label2_confidence": nlp.Value("float32"),
                 "id": nlp.Value("string"),
             }
         ),
         # If there's a common (input, target) tuple from the features,
         # specify them here. They'll be used if as_supervised=True in
         # builder.as_dataset.
         supervised_keys=None,
         # Homepage of the dataset for documentation
         homepage="https://github.com/allenai/scicite",
         citation=_CITATION,
     )
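A side note on the `ClassLabel` features above (an illustration, not part of the source): they store integer ids and expose the mapping to the listed names.

import nlp

# Sketch: ClassLabel converts between string names and integer ids.
label = nlp.features.ClassLabel(names=["method", "background", "result"])
assert label.num_classes == 3
assert label.str2int("background") == 1
assert label.int2str(2) == "result"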
Example #3
 def _info(self):
     return nlp.DatasetInfo(
         description=_DESCRIPTION,
         features=nlp.Features({
             'whoTarget': nlp.Value("string"),
             'intentYN': nlp.Value("string"),
             'sexYN': nlp.Value("string"),
             'sexReason': nlp.Value("string"),
             'offensiveYN': nlp.Value("string"),
             'annotatorGender': nlp.Value("string"),
             'annotatorMinority': nlp.Value("string"),
             'sexPhrase': nlp.Value("string"),
             'speakerMinorityYN': nlp.Value("string"),
             'WorkerId': nlp.Value("string"),
             'HITId': nlp.Value("string"),
             'annotatorPolitics': nlp.Value("string"),
             'annotatorRace': nlp.Value("string"),
             'annotatorAge': nlp.Value("string"),
             'post': nlp.Value("string"),
             'targetMinority': nlp.Value("string"),
             'targetCategory': nlp.Value("string"),
             'targetStereotype': nlp.Value("string")
         }),
         # No default supervised_keys (as we have to pass both premise
         # and hypothesis as input).
         supervised_keys=None,
         homepage=
         "https://homes.cs.washington.edu/~msap/social-bias-frames/",
         citation=_CITATION,
     )
Example #4
File: qangaroo.py Project: zedauna/nlp-1
 def _info(self):
     # TODO(qangaroo): Specifies the nlp.DatasetInfo object
     return nlp.DatasetInfo(
         # This is the description that will appear on the datasets page.
         description=_DESCRIPTION,
         # nlp.features.FeatureConnectors
         features=nlp.Features({
             # These are the features of your dataset like images, labels ...
             "query":
             nlp.Value("string"),
             "supports":
             nlp.features.Sequence({"support": nlp.Value("string")}),
             "candidates":
             nlp.features.Sequence({"candidate": nlp.Value("string")}),
             "answer":
             nlp.Value("string"),
             "id":
             nlp.Value("string")
             # These are the features of your dataset like images, labels ...
         }),
         # If there's a common (input, target) tuple from the features,
         # specify them here. They'll be used if as_supervised=True in
         # builder.as_dataset.
         supervised_keys=None,
         # Homepage of the dataset for documentation
         homepage="http://qangaroo.cs.ucl.ac.uk/index.html",
         citation=_CITATION,
     )
Example #5
File: arcd.py Project: lukewheless/NLP
 def _info(self):
     return nlp.DatasetInfo(
         description=_DESCRIPTION,
         features=nlp.Features({
             "id":
             nlp.Value("string"),
             "title":
             nlp.Value("string"),
             "context":
             nlp.Value("string"),
             "question":
             nlp.Value("string"),
             "answers":
             nlp.features.Sequence({
                 "text": nlp.Value("string"),
                 "answer_start": nlp.Value("int32")
             }),
         }),
         # No default supervised_keys (as we have to pass both question
         # and context as input).
         supervised_keys=None,
         homepage=
         "https://github.com/husseinmozannar/SOQAL/tree/master/data",
         citation=_CITATION,
     )
Example #6
File: mlsum.py Project: vinayya/nlp
    def _info(self):
        return nlp.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # nlp.features.FeatureConnectors
            features=nlp.Features(
                {
                    "text": nlp.Value("string"),
                    "summary": nlp.Value("string"),
                    "topic": nlp.Value("string"),
                    "url": nlp.Value("string"),
                    "title": nlp.Value("string"),
                    "date":nlp.Value("string")
                    # These are the features of your dataset like images, labels ...
                }
            ),
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage="",
            citation=_CITATION,
        )
Example #7
 def _info(self):
   # TODO(mlqa): Specifies the nlp.DatasetInfo object
   return nlp.DatasetInfo(
       # This is the description that will appear on the datasets page.
       description=_DESCRIPTION,
       # nlp.features.FeatureConnectors
       features=nlp.Features({
           'context': nlp.Value('string'),
            'questions': nlp.features.Sequence({
                'question': nlp.Value('string')
            }),
           'answers': nlp.features.Sequence({
               "text": nlp.Value('string'),
               "answer_start": nlp.Value('int32'),
           }),
           'ids': nlp.features.Sequence({
                'idx': nlp.Value('string')
           })
           # These are the features of your dataset like images, labels ...
       }),
       # If there's a common (input, target) tuple from the features,
       # specify them here. They'll be used if as_supervised=True in
       # builder.as_dataset.
       supervised_keys=None,
       # Homepage of the dataset for documentation
       homepage='https://github.com/facebookresearch/MLQA',
       citation=_CITATION,
   )
Example #8
 def _info(self):
     # TODO(openBookQA): Specifies the nlp.DatasetInfo object
     return nlp.DatasetInfo(
         # This is the description that will appear on the datasets page.
         description=_DESCRIPTION,
         # nlp.features.FeatureConnectors
         features=nlp.Features({
             # These are the features of your dataset like images, labels ...
             "id":
             nlp.Value("string"),
             "question_stem":
             nlp.Value("string"),
             "choices":
             nlp.features.Sequence({
                 "text": nlp.Value("string"),
                 "label": nlp.Value("string")
             }),
             "answerKey":
             nlp.Value("string"),
         }),
         # If there's a common (input, target) tuple from the features,
         # specify them here. They'll be used if as_supervised=True in
         # builder.as_dataset.
         supervised_keys=None,
         # Homepage of the dataset for documentation
         homepage="https://allenai.org/data/open-book-qa",
         citation=_CITATION,
     )
Example #9
 def _info(self):
     return nlp.DatasetInfo(
         description=_DESCRIPTION,
         features=nlp.Features({
             "q_id":
             nlp.Value("string"),
             "title":
             nlp.Value("string"),
             "selftext":
             nlp.Value("string"),
             "document":
             nlp.Value("string"),
             "subreddit":
             nlp.Value("string"),
             "answers":
             nlp.features.Sequence({
                 "a_id": nlp.Value("string"),
                 "text": nlp.Value("string"),
                 "score": nlp.Value("int32")
             }),
             "title_urls":
             nlp.features.Sequence(nlp.Value("string")),
             "selftext_urls":
             nlp.features.Sequence(nlp.Value("string")),
             "answers_urls":
             nlp.features.Sequence(nlp.Value("string")),
         }),
         supervised_keys=None,
         homepage="https://facebookresearch.github.io/ELI5/explore.html",
         citation=_CITATION,
     )
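Worth noting here (values below are invented): `Sequence(nlp.Value(...))` holds a plain list, while `Sequence({...})` is encoded as a single dict of parallel lists rather than a list of dicts. A record matching the schema above would be shaped roughly like:

# Hypothetical ELI5-style record; every value is made up.
record = {
    "q_id": "abc123",
    "title": "Why is the sky blue?",
    "selftext": "",
    "document": "",
    "subreddit": "explainlikeimfive",
    # Sequence of a dict -> one dict of parallel lists.
    "answers": {"a_id": ["c1", "c2"], "text": ["Rayleigh scattering ...", "..."], "score": [42, 7]},
    # Sequence of a plain Value -> an ordinary list.
    "title_urls": [],
    "selftext_urls": [],
    "answers_urls": ["https://example.com"],
}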
Example #10
 def _info(self):
     # TODO(discofuse): Specifies the nlp.DatasetInfo object
     return nlp.DatasetInfo(
         # This is the description that will appear on the datasets page.
         description=_DESCRIPTION,
         # nlp.features.FeatureConnectors
         features=nlp.Features(
             {
                 "connective_string": nlp.Value("string"),
                 "discourse_type": nlp.Value("string"),
                 "coherent_second_sentence": nlp.Value("string"),
                 "has_coref_type_pronoun": nlp.Value("float32"),
                 "incoherent_first_sentence": nlp.Value("string"),
                 "incoherent_second_sentence": nlp.Value("string"),
                 "has_coref_type_nominal": nlp.Value("float32"),
                 "coherent_first_sentence": nlp.Value("string"),
                 # These are the features of your dataset like images, labels ...
             }
         ),
         # If there's a common (input, target) tuple from the features,
         # specify them here. They'll be used if as_supervised=True in
         # builder.as_dataset.
         supervised_keys=None,
         # Homepage of the dataset for documentation
         homepage="https://github.com/google-research-datasets/discofuse",
         citation=_CITATION,
     )
Example #11
 def _info(self):
     # TODO(lc_quad): Specifies the nlp.DatasetInfo object
     return nlp.DatasetInfo(
         # This is the description that will appear on the datasets page.
         description=_DESCRIPTION,
         # nlp.features.FeatureConnectors
         features=nlp.Features({
             "NNQT_question": nlp.Value("string"),
             "uid": nlp.Value("int32"),
             "subgraph": nlp.Value("string"),
             "template_index": nlp.Value("int32"),
             "question": nlp.Value("string"),
             "sparql_wikidata": nlp.Value("string"),
             "sparql_dbpedia18": nlp.Value("string"),
             "template": nlp.Value("string"),
             # "template_id": nlp.Value('string'),
             "paraphrased_question": nlp.Value("string")
             # These are the features of your dataset like images, labels ...
         }),
         # If there's a common (input, target) tuple from the features,
         # specify them here. They'll be used if as_supervised=True in
         # builder.as_dataset.
         supervised_keys=None,
         # Homepage of the dataset for documentation
         homepage="http://lc-quad.sda.tech/",
         citation=_CITATION,
     )
Example #12
 def _info(self):
     return nlp.DatasetInfo(
         description=_DESCRIPTION + "\n" + self.config.description,
         features=nlp.Features({
             "category":
             nlp.Value("string"),
             "air_date":
             nlp.Value("string"),
             "question":
             nlp.Value("string"),
             "value":
             nlp.Value("string"),
             "answer":
             nlp.Value("string"),
             "round":
             nlp.Value("string"),
             "category":
             nlp.Value("string"),
             "show_number":
             nlp.Value("int32"),
             "search_results":
             nlp.features.Sequence({
                 "urls": nlp.Value("string"),
                 "snippets": nlp.Value("string"),
                 "titles": nlp.Value("string"),
                 "related_links": nlp.Value("string"),
             })
             # These are the features of your dataset like images, labels ...
         }),
         homepage="https://github.com/nyu-dl/dl4ir-searchQA",
         citation=_CITATION,
     )
Example #13
 def _info(self):
     # TODO(jeopardy): Specifies the nlp.DatasetInfo object
     return nlp.DatasetInfo(
         # This is the description that will appear on the datasets page.
         description=_DESCRIPTION,
         # nlp.features.FeatureConnectors
         features=nlp.Features(
             {
                 "category": nlp.Value("string"),
                 "air_date": nlp.Value("string"),
                 "question": nlp.Value("string"),
                 "value": nlp.Value("int32"),
                 "answer": nlp.Value("string"),
                 "round": nlp.Value("string"),
                 "category": nlp.Value("string"),
                 "show_number": nlp.Value("int32"),
                 # These are the features of your dataset like images, labels ...
             }
         ),
         # If there's a common (input, target) tuple from the features,
         # specify them here. They'll be used if as_supervised=True in
         # builder.as_dataset.
         supervised_keys=None,
         # Homepage of the dataset for documentation
         homepage=_URL,
         citation=_CITATION,
     )
Example #14
 def _info(self):
     return nlp.DatasetInfo(
         description=_DESCRIPTION,
         features=nlp.Features({
             "id":
             nlp.Value("string"),
             "title":
             nlp.Value("string"),
             "context":
             nlp.Value("string"),
             "question":
             nlp.Value("string"),
             "answers":
             nlp.features.Sequence({
                 "text": nlp.Value("string"),
                 "answer_start": nlp.Value("int32"),
             }),
         }),
         # No default supervised_keys (as we have to pass both question
         # and context as input).
         supervised_keys=None,
         homepage=
         "https://modestyachts.github.io/squadshifts-website/index.html",
         citation=_CITATION,
     )
Example #15
 def _info(self):
     # TODO(squad_it): Specifies the nlp.DatasetInfo object
     return nlp.DatasetInfo(
         # This is the description that will appear on the datasets page.
         description=_DESCRIPTION,
         # nlp.features.FeatureConnectors
         features=nlp.Features(
             {
                 "id": nlp.Value("string"),
                 "context": nlp.Value("string"),
                 "question": nlp.Value("string"),
                 "answers": nlp.features.Sequence(
                     {"text": nlp.Value("string"), "answer_start": nlp.Value("int32"),}
                 ),
                 # These are the features of your dataset like images, labels ...
             }
         ),
         # If there's a common (input, target) tuple from the features,
         # specify them here. They'll be used if as_supervised=True in
         # builder.as_dataset.
         supervised_keys=None,
         # Homepage of the dataset for documentation
         homepage="https://github.com/crux82/squad-it",
         citation=_CITATION,
     )
Example #16
 def _info(self):
   # TODO(cosmos_qa): Specifies the nlp.DatasetInfo object
   return nlp.DatasetInfo(
       # This is the description that will appear on the datasets page.
       description=_DESCRIPTION,
       # nlp.features.FeatureConnectors
       features=nlp.Features({
           'id': nlp.Value('string'),
           'context': nlp.Value('string'),
           'question': nlp.Value('string'),
           'answer0': nlp.Value('string'),
           'answer1': nlp.Value('string'),
            'answer2': nlp.Value('string'),
           'answer3': nlp.Value('string'),
           'label': nlp.Value('int32')
           # These are the features of your dataset like images, labels ...
       }),
       # If there's a common (input, target) tuple from the features,
       # specify them here. They'll be used if as_supervised=True in
       # builder.as_dataset.
       supervised_keys=None,
       # Homepage of the dataset for documentation
       homepage='https://wilburone.github.io/cosmos/',
       citation=_CITATION,
   )
Example #17
File: qasc.py Project: lukewheless/NLP
  def _info(self):
    # TODO(qasc): Specifies the nlp.DatasetInfo object
    return nlp.DatasetInfo(
        # This is the description that will appear on the datasets page.
        description=_DESCRIPTION,
        # nlp.features.FeatureConnectors
        features=nlp.Features({
            'id': nlp.Value('string'),
            'question': nlp.Value('string'),
            'choices': nlp.features.Sequence({
                'text': nlp.Value('string'),
                'label': nlp.Value('string')
            }),
            'answerKey': nlp.Value('string'),
            'fact1': nlp.Value('string'),
            'fact2': nlp.Value('string'),
            'combinedfact': nlp.Value('string'),
             'formatted_question': nlp.Value('string'),
            # These are the features of your dataset like images, labels ...
        }),
        # If there's a common (input, target) tuple from the features,
        # specify them here. They'll be used if as_supervised=True in
        # builder.as_dataset.
        supervised_keys=None,
        # Homepage of the dataset for documentation
        homepage='https://allenai.org/data/qasc',
        citation=_CITATION,
    )
Example #18
    def _info(self):
        # TODO(xtreme): Specifies the nlp.DatasetInfo object
        features = {text_feature: nlp.Value("string") for text_feature in six.iterkeys(self.config.text_features)}
        if "answers" in features.keys():
            features["answers"] = nlp.features.Sequence(
                {"answer_start": nlp.Value("int32"), "text": nlp.Value("string")}
            )
        if self.config.name.startswith("PAWS-X"):
            features["label"] = nlp.Value("string")
        if self.config.name == "XNLI":
            features["gold_label"] = nlp.Value("string")

        return nlp.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=self.config.description + "\n" + _DESCRIPTION,
            # nlp.features.FeatureConnectors
            features=nlp.Features(
                features
                # These are the features of your dataset like images, labels ...
            ),
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage="https://github.com/google-research/xtreme" + "\t" + self.config.url,
            citation=self.config.citation + "\n" + _CITATION,
        )
Example #19
 def _info(self):
     # TODO(empathetic_dialogues): Specifies the nlp.DatasetInfo object
     return nlp.DatasetInfo(
         # This is the description that will appear on the datasets page.
         description=_DESCRIPTION,
         # nlp.features.FeatureConnectors
         features=nlp.Features({
             'conv_id': nlp.Value('string'),
             'utterance_idx': nlp.Value('int32'),
             'context': nlp.Value('string'),
             'prompt': nlp.Value('string'),
             'speaker_idx': nlp.Value('int32'),
             'utterance': nlp.Value('string'),
             'selfeval': nlp.Value('string'),
             'tags': nlp.Value('string')
             # These are the features of your dataset like images, labels ...
         }),
         # If there's a common (input, target) tuple from the features,
         # specify them here. They'll be used if as_supervised=True in
         # builder.as_dataset.
         supervised_keys=None,
         # Homepage of the dataset for documentation
         homepage='https://github.com/facebookresearch/EmpatheticDialogues',
         citation=_CITATION,
     )
Example #20
File: xsum.py Project: mayurnewase/nlp
 def _info(self):
     return nlp.DatasetInfo(
         description=_DESCRIPTION,
         features=nlp.Features({_DOCUMENT: nlp.Value("string"), _SUMMARY: nlp.Value("string"),}),
         supervised_keys=(_DOCUMENT, _SUMMARY),
         homepage="https://github.com/EdinburghNLP/XSum/tree/master/XSum-Dataset",
         citation=_CITATION,
     )
Example #21
 def _info(self):
     return nlp.DatasetInfo(
         description=_DESCRIPTION,
         features=nlp.Features({_DOCUMENT: nlp.Value("string"), _SUMMARY: nlp.Value("string")}),
         supervised_keys=(_DOCUMENT, _SUMMARY),
         homepage="https://github.com/ryanzhumich/AESLC",
         citation=_CITATION,
     )
Example #22
import os

import nlp
from nlp.arrow_writer import ArrowWriter


def write_flattened_sequence(feats, dummy_data, tmp_dir):
    my_features = nlp.Features(feats)
    writer = ArrowWriter(features=my_features,
                         path=os.path.join(tmp_dir, "beta.arrow"))
    for key, record in dummy_data:
        example = my_features.encode_example(record)
        writer.write(example)
    num_examples, num_bytes = writer.finalize()
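A hypothetical invocation of this helper (feature names and dummy values are invented; `dummy_data` pairs a key with a full example dict that `encode_example` can consume):

import tempfile

import nlp

feats = {"answers": nlp.features.Sequence({"text": nlp.Value("string")})}
dummy_data = [
    ("example-0", {"answers": {"text": ["yes", "no"]}}),
    ("example-1", {"answers": {"text": ["maybe"]}}),
]

with tempfile.TemporaryDirectory() as tmp_dir:
    write_flattened_sequence(feats, dummy_data, tmp_dir)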
Example #23
 def _info(self):
     return nlp.DatasetInfo(
         description=_DESCRIPTION,
         features=nlp.Features(
             {
                 "id": nlp.Value("string"),
                 "text": nlp.Value("string"),
                 "title": nlp.Value("string"),
                 "embeddings": nlp.Sequence(nlp.Value("float32")),
             }
         )
         if self.config.with_embeddings
         else nlp.Features({"id": nlp.Value("string"), "text": nlp.Value("string"), "title": nlp.Value("string")}),
         supervised_keys=None,
         homepage="https://github.com/facebookresearch/DPR",
         citation=_CITATION,
     )
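The `self.config.with_embeddings` flag implies a custom builder config; a sketch of what such a config class might look like (the class name and default are assumptions, only the attribute name comes from the code above):

import nlp

class DprConfig(nlp.BuilderConfig):
    """Hypothetical config carrying the `with_embeddings` flag read in `_info`."""

    def __init__(self, with_embeddings=True, **kwargs):
        super().__init__(**kwargs)
        self.with_embeddings = with_embeddings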
Example #24
 def _info(self):
     return nlp.DatasetInfo(
         description=_DESCRIPTION,
         features=nlp.Features({"text": nlp.Value("string"), "label": nlp.features.ClassLabel(names=["1", "2"]),}),
         supervised_keys=None,
         homepage="https://course.fast.ai/datasets",
         citation=_CITATION,
     )
Example #25
File: bookcorpus.py Project: vinayya/nlp
 def _info(self):
     return nlp.DatasetInfo(
         description=_DESCRIPTION,
         features=nlp.Features({"text": nlp.Value("string"),}),
         supervised_keys=None,
         homepage="https://yknzhu.wixsite.com/mbweb",
         citation=_CITATION,
     )
Example #26
File: lm1b.py Project: vinayya/nlp
 def _info(self):
     return nlp.DatasetInfo(
         description=_DESCRIPTION,
         features=nlp.Features({"text": nlp.Value("string")}),
         supervised_keys=("text", "text"),
         homepage="http://www.statmt.org/lm-benchmark/",
         citation=_CITATION,
     )
Example #27
 def _info(self):
     return nlp.DatasetInfo(
         description=_DESCRIPTION,
         features=nlp.Features({"title": nlp.Value("string"), "text": nlp.Value("string"),}),
         # No default supervised_keys.
         supervised_keys=None,
         homepage="https://dumps.wikimedia.org",
         citation=_CITATION,
     )
Example #28
 def _info(self):
     return nlp.DatasetInfo(
         description=_DESCRIPTION,
         features=nlp.Features({"text": nlp.Value("string")}),
         supervised_keys=None,
         homepage=
         "https://github.com/karpathy/char-rnn/blob/master/data/tinyshakespeare/input.txt",
         citation=_CITATION,
     )
Example #29
def benchmark_map_filter():
    # `SPEED_TEST_N_EXAMPLES`, `RESULTS_FILE_PATH`, `generate_example_dataset`,
    # and the timed `map`/`filter` helpers are defined elsewhere in the
    # benchmark module; `map` and `filter` shadow the Python builtins here.
    times = {"num examples": SPEED_TEST_N_EXAMPLES}
    with tempfile.TemporaryDirectory() as tmp_dir:
        features = nlp.Features({
            "text": nlp.Value("string"),
            "numbers": nlp.Value("float32")
        })
        dataset = generate_example_dataset(os.path.join(
            tmp_dir, "dataset.arrow"),
                                           features,
                                           num_examples=SPEED_TEST_N_EXAMPLES)

        tokenizer = transformers.AutoTokenizer.from_pretrained(
            "bert-base-cased", use_fast=True)

        def tokenize(examples):
            return tokenizer(examples["text"])

        times["map identity"] = map(dataset)

        times["map identity batched"] = map(dataset, batched=True)

        times["map no-op batched"] = map(dataset,
                                         function=lambda x: None,
                                         batched=True)

        with dataset.formatted_as(type="numpy"):
            times["map no-op batched numpy"] = map(dataset,
                                                   function=lambda x: None,
                                                   batched=True)

        with dataset.formatted_as(type="pandas"):
            times["map no-op batched pandas"] = map(dataset,
                                                    function=lambda x: None,
                                                    batched=True)

        with dataset.formatted_as(type="torch", columns="numbers"):
            times["map no-op batched pytorch"] = map(dataset,
                                                     function=lambda x: None,
                                                     batched=True)

        with dataset.formatted_as(type="tensorflow", columns="numbers"):
            times["map no-op batched tensorflow"] = map(
                dataset, function=lambda x: None, batched=True)

        times["map fast-tokenizer batched"] = map(dataset,
                                                  function=tokenize,
                                                  batched=True)

        times["filter"] = filter(dataset)

        # Activate later when the tokenizer supports batched inputs
        # with dataset.formatted_as(type='numpy'):
        #     times[func.__name__ + " fast-tokenizer batched numpy"] = func(dataset, function=tokenize, batched=True)

    with open(RESULTS_FILE_PATH, "wb") as f:
        f.write(json.dumps(times).encode("utf-8"))
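Since `map` and `filter` shadow the builtins above, they are presumably module-level helpers that time the corresponding `Dataset` methods. A sketch of what such a helper could look like (the decorator name and return convention are assumptions):

import functools
import timeit

def get_duration(func):
    # Assumed helper: run `func` once, return elapsed wall-clock seconds.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = timeit.default_timer()
        func(*args, **kwargs)
        return timeit.default_timer() - start
    return wrapper

@get_duration
def map(dataset, **kwargs):  # intentionally shadows the builtin
    dataset.map(**kwargs)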
Example #30
 def _info(self):
     return nlp.DatasetInfo(
         # nlp.features.FeatureConnectors
         features=nlp.Features({
             "buggy": nlp.Value("string"),
             "fixed": nlp.Value("string")
         }),
         supervised_keys=None,
     )