Example No. 1
 def _info(self):
     return nlp.DatasetInfo(
         description=_DESCRIPTION,
         features=nlp.Features(
             {
                 "text": nlp.Value("string"),
                 "topics": nlp.Sequence(nlp.Value("string")),
                 "lewis_split": nlp.Value("string"),
                 "cgis_split": nlp.Value("string"),
                 "old_id": nlp.Value("string"),
                 "new_id": nlp.Value("string"),
                 "places": nlp.Sequence(nlp.Value("string")),
                 "people": nlp.Sequence(nlp.Value("string")),
                 "orgs": nlp.Sequence(nlp.Value("string")),
                 "exchanges": nlp.Sequence(nlp.Value("string")),
                 "date": nlp.Value("string"),
                 "title": nlp.Value("string"),
             }
         ),
         # No default supervised_keys.
         supervised_keys=None,
         homepage="https://kdd.ics.uci.edu/databases/reuters21578/reuters21578.html",
         citation=_CITATION,
     )
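
The Features mapping declared in _info doubles as the dataset's schema: every example the builder yields is validated and cast against it before being written to Arrow. A minimal sketch of that round trip, assuming the old `nlp` library (the predecessor of `datasets`) and its Features.encode_example method; the sample values are made up:

import nlp

features = nlp.Features({
    "text": nlp.Value("string"),
    "topics": nlp.Sequence(nlp.Value("string")),
})

# encode_example casts a raw Python dict into the declared Arrow-backed types
encoded = features.encode_example({"text": "Oil prices rose.", "topics": ["crude", "nat-gas"]})
print(encoded)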
Example No. 2
    def _info(self):
        features = {
            feature: nlp.Value("string")
            for feature in self.config.features
        }
        if self.config.name == 'task1':
            features["id"] = nlp.Value("int64")
            features["text"] = nlp.Value("string")
            features["url"] = nlp.Value("string")
            features["label"] = nlp.ClassLabel(names=["0", "1"])
        elif self.config.name == 'task2':
            features["id"] = nlp.Value("int64")
            features["label"] = nlp.ClassLabel(names=["0", "1"])
            features["last"] = nlp.Value("bool")
            features["sent_num"] = nlp.Value("int64")
            features["sentence"] = nlp.Value("string")
        elif self.config.name in ['task3_document', 'task3_sentence']:
            features['token'] = nlp.Sequence(nlp.Value("string"))
            features['label'] = nlp.Sequence(
                nlp.ClassLabel(names=[
                    'B-etime', 'B-fname', 'B-loc', 'B-organizer',
                    'B-participant', 'B-place', 'B-target', 'B-trigger',
                    'I-etime', 'I-fname', 'I-loc', 'I-organizer',
                    'I-participant', 'I-place', 'I-target', 'I-trigger', 'O'
                ]))
        else:
            raise ValueError(f"Invalid task name: {self.config.name}")

        return nlp.DatasetInfo(features=nlp.Features(features))
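
ClassLabel is the one feature above that changes how values are stored: the declared names map to integer ids on disk. A short sketch of that mapping, assuming the same `nlp` library:

import nlp

label = nlp.ClassLabel(names=["0", "1"])
print(label.str2int("1"))  # -> 1, the integer id that is stored
print(label.int2str(0))    # -> "0", the human-readable name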
Example No. 3
 def _info(self):
     return nlp.MetricInfo(
         description=_DESCRIPTION,
         citation=_CITATION,
         inputs_description=_KWARGS_DESCRIPTION,
         features=nlp.Features({
             'predictions': nlp.Sequence(nlp.Value('string', id='token'), id='sequence'),
             'references': nlp.Sequence(nlp.Sequence(nlp.Value('string', id='token'), id='sequence'), id='references'),
         }),
         codebase_urls=["https://github.com/tensorflow/nmt/blob/master/nmt/scripts/bleu.py"],
         reference_urls=["https://en.wikipedia.org/wiki/BLEU",
                         "https://towardsdatascience.com/evaluating-text-output-in-nlp-bleu-at-your-own-risk-e8609665a213"]
     )
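
Note the nesting of the references feature: each prediction is a single token sequence, while each reference entry is a sequence of token sequences, so one prediction can be scored against several references. Hypothetical inputs matching that declared shape:

predictions = [["the", "cat", "sat"]]                        # one tokenized hypothesis
references = [[["the", "cat", "sat"], ["a", "cat", "sat"]]]  # two references for it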
Example No. 4
 def _info(self):
     return nlp.MetricInfo(
         description=_DESCRIPTION,
         citation=_CITATION,
         homepage="https://github.com/chakki-works/seqeval",
         inputs_description=_KWARGS_DESCRIPTION,
         features=nlp.Features({
             'predictions':
             nlp.Sequence(nlp.Value('string', id='label'), id='sequence'),
             'references':
             nlp.Sequence(nlp.Value('string', id='label'), id='sequence'),
         }),
         codebase_urls=["https://github.com/chakki-works/seqeval"],
         reference_urls=["https://github.com/chakki-works/seqeval"])
Example No. 5
 def _info(self):
     return nlp.DatasetInfo(
         description=_DESCRIPTION,
         features=nlp.Features(
             {
                 "id": nlp.Value("string"),
                 "tokens": nlp.Sequence(nlp.Value("string")),
                 "labels": nlp.Sequence(nlp.Value("string")),
             }
         ),
         supervised_keys=None,
         homepage="http://noisy-text.github.io/2017/emerging-rare-entities.html",
         citation=_CITATION,
     )
Example No. 6
 def _info(self):
     return nlp.MetricInfo(
         description=_DESCRIPTION,
         citation=_CITATION,
         inputs_description=_KWARGS_DESCRIPTION,
         features=nlp.Features({
             'predictions':
             nlp.Sequence(nlp.Value('string', id='token'), id='sequence'),
             'references':
             nlp.Sequence(nlp.Sequence(nlp.Value('string', id='token'),
                                       id='sequence'),
                          id='references'),
         }),
         codebase_urls=["https://github.com/cnap/gec-ranking"],
         reference_urls=["https://github.com/cnap/gec-ranking"])
Example No. 7
 def test_nested_features(self):
     expected_num_examples = len(get_test_nested_examples())
     with tempfile.TemporaryDirectory() as tmp_cache_dir:
         builder = NestedBeamDataset(cache_dir=tmp_cache_dir,
                                     beam_runner="DirectRunner")
         builder.download_and_prepare()
         self.assertTrue(
             os.path.exists(
                 os.path.join(tmp_cache_dir, "nested_beam_dataset",
                              "default", "0.0.0",
                              "nested_beam_dataset-train.arrow")))
         self.assertDictEqual(
             builder.info.features,
             nlp.Features({"a": nlp.Sequence({"b": nlp.Value("string")})}))
         dset = builder.as_dataset()
         self.assertEqual(dset["train"].num_rows, expected_num_examples)
         self.assertEqual(dset["train"].info.splits["train"].num_examples,
                          expected_num_examples)
         self.assertDictEqual(dset["train"][0],
                              get_test_nested_examples()[0][1])
         self.assertDictEqual(
             dset["train"][expected_num_examples - 1],
             get_test_nested_examples()[expected_num_examples - 1][1])
         self.assertTrue(
             os.path.exists(
                 os.path.join(tmp_cache_dir, "nested_beam_dataset",
                              "default", "0.0.0", "dataset_info.json")))
Example No. 8
 def _info(self):
     return nlp.DatasetInfo(
         description=_DESCRIPTION,
         features=nlp.Features(
             {
                 "id": nlp.Value("string"),
                 "source": nlp.Value("string"),
                 "tokens": nlp.Sequence(nlp.Value("string")),
                 "labels": nlp.Sequence(nlp.Value("string")),
                 "nested-labels": nlp.Sequence(nlp.Value("string")),
             }
         ),
         supervised_keys=None,
         homepage="https://sites.google.com/site/germeval2014ner/",
         citation=_CITATION,
     )
Example No. 9
 def _info(self):
     return nlp.DatasetInfo(
         features=nlp.Features(
             {"a": nlp.Sequence({"b": nlp.Value("string")})}),
         # No default supervised_keys.
         supervised_keys=None,
     )
Example No. 10
    def _info(self):
        # TODO(xtreme): Specify the nlp.DatasetInfo object
        features = {
            text_feature: nlp.Value("string")
            for text_feature in six.iterkeys(self.config.text_features)
        }
        if "answers" in features.keys():
            features["answers"] = nlp.features.Sequence({
                "answer_start":
                nlp.Value("int32"),
                "text":
                nlp.Value("string")
            })
        if self.config.name.startswith("PAWS-X"):
            features["label"] = nlp.Value("string")
        if self.config.name == "XNLI":
            features["gold_label"] = nlp.Value("string")

        if self.config.name.startswith("PAN-X"):
            features = nlp.Features({
                "words":
                nlp.Sequence(nlp.Value("string")),
                "ner_tags":
                nlp.Sequence(nlp.Value("string")),
                "langs":
                nlp.Sequence(nlp.Value("string")),
            })
        return nlp.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=self.config.description + "\n" + _DESCRIPTION,
            # nlp.features.FeatureConnectors
            features=nlp.Features(
                features
                # These are the features of your dataset like images, labels ...
            ),
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage="https://github.com/google-research/xtreme" + "\t" +
            self.config.url,
            citation=self.config.citation + "\n" + _CITATION,
        )
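
A hedged usage sketch: the config name passed to load_dataset selects which of the branches above builds the schema (this assumes network access; XNLI is one of the real XTREME config names):

import nlp

xnli = nlp.load_dataset("xtreme", "XNLI")
print(xnli)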
Example No. 11
 def _info(self):
     return nlp.MetricInfo(
         description=_DESCRIPTION,
         citation=_CITATION,
         homepage="https://github.com/Tiiiger/bert_score",
         inputs_description=_KWARGS_DESCRIPTION,
         features=nlp.Features({
             'predictions': nlp.Value('string', id='sequence'),
             'references': nlp.Sequence(nlp.Value('string', id='sequence'), id='references'),
         }),
         codebase_urls=["https://github.com/Tiiiger/bert_score"],
         reference_urls=["https://github.com/Tiiiger/bert_score",
                         "https://arxiv.org/abs/1904.09675"]
     )
Example No. 12
 def _info(self):
     return nlp.DatasetInfo(
         description=_DESCRIPTION,
         features=nlp.Features(
             {
                 "id": nlp.Value("string"),
                 "text": nlp.Value("string"),
                 "title": nlp.Value("string"),
                 "embeddings": nlp.Sequence(nlp.Value("float32")),
             }
         )
         if self.config.with_embeddings
         else nlp.Features({"id": nlp.Value("string"), "text": nlp.Value("string"), "title": nlp.Value("string")}),
         supervised_keys=None,
         homepage="https://github.com/facebookresearch/DPR",
         citation=_CITATION,
     )
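
The same conditional-schema pattern, pulled out of the inline ternary above into a plain helper for readability (a sketch; build_features is a hypothetical name, the column set is the one declared above):

import nlp

def build_features(with_embeddings):
    features = {
        "id": nlp.Value("string"),
        "text": nlp.Value("string"),
        "title": nlp.Value("string"),
    }
    if with_embeddings:
        # the embeddings column only exists for the with-embeddings config
        features["embeddings"] = nlp.Sequence(nlp.Value("float32"))
    return nlp.Features(features)

print(build_features(True))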
Example No. 13
    def test_benchmark_speed(self):
        times = {}
        read_functions = (
            read_unformated,
            read_formatted_as_numpy,
            read_batch_unformated,
            read_batch_formatted_as_numpy,
            read_col_unformated,
            read_col_formatted_as_numpy,
        )
        with tempfile.TemporaryDirectory() as tmp_dir:
            feats = nlp.Features(
                {"image": Array2D(SPEED_TEST_SHAPE, dtype="float32")})
            data = generate_examples(features=feats,
                                     num_examples=SPEED_TEST_N_EXAMPLES)
            write_func = write_array2d
            times[write_func.__name__] = write_func(feats, data, tmp_dir)
            for read_func in read_functions:
                times[read_func.__name__ + " after " +
                      write_func.__name__] = read_func(feats, tmp_dir)

        with tempfile.TemporaryDirectory() as tmp_dir:
            feats = nlp.Features({
                "image":
                nlp.Sequence(
                    nlp.Sequence(nlp.Value("float32"), SPEED_TEST_SHAPE[1]),
                    SPEED_TEST_SHAPE[0])
            })
            data = generate_examples(features=feats,
                                     num_examples=SPEED_TEST_N_EXAMPLES)
            write_func = write_nested_sequence
            times[write_func.__name__] = write_func(feats, data, tmp_dir)
            for read_func in read_functions:
                times[read_func.__name__ + " after " +
                      write_func.__name__] = read_func(feats, tmp_dir)

        with tempfile.TemporaryDirectory() as tmp_dir:
            feats = nlp.Features({
                "image":
                nlp.Sequence(nlp.Value("float32"),
                             SPEED_TEST_SHAPE[0] * SPEED_TEST_SHAPE[1])
            })
            data = generate_examples(features=feats,
                                     num_examples=SPEED_TEST_N_EXAMPLES)
            write_func = write_flattened_sequence
            times[write_func.__name__] = write_func(feats, data, tmp_dir)
            for read_func in read_functions:
                times[read_func.__name__ + " after " +
                      write_func.__name__] = read_func(feats, tmp_dir)

        benchmark_df = pd.DataFrame.from_dict(times,
                                              orient="index",
                                              columns=["time"]).sort_index()
        warn("Speed benchmark:\n" + str(benchmark_df))
        self.assertGreater(
            times["write_nested_sequence"], times["write_array2d"] * 10
        )  # At least 10 times faster (it is supposed to be ~25 times faster)
        self.assertGreater(
            times["read_batch_formatted_as_numpy after write_nested_sequence"],
            times["read_batch_formatted_as_numpy after write_array2d"],
        )  # At least somewhat faster (it is supposed to be ~2 times faster)
        self.assertGreater(
            times["read_batch_unformated after write_nested_sequence"],
            times["read_batch_formatted_as_numpy after write_array2d"] * 5,
        )  # At least 5 times faster (it is supposed to be ~10 times faster)
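
The three schemas this benchmark compares, side by side; the fixed-shape Array2D extension type is what makes the first one so much cheaper to write. A sketch with a hypothetical shape (the Array2D import path is an assumption about where the library exposes it):

import nlp
from nlp.features import Array2D  # assumed import path

SHAPE = (8, 8)
as_array2d = nlp.Features({"image": Array2D(SHAPE, dtype="float32")})
as_nested = nlp.Features({"image": nlp.Sequence(nlp.Sequence(nlp.Value("float32"), SHAPE[1]), SHAPE[0])})
as_flat = nlp.Features({"image": nlp.Sequence(nlp.Value("float32"), SHAPE[0] * SHAPE[1])})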
Example No. 14
def benchmark_iterating():
    times = {"num examples": SPEED_TEST_N_EXAMPLES}
    functions = [
        (read, {
            "length": SMALL_TEST
        }),
        (read, {
            "length": SPEED_TEST_N_EXAMPLES
        }),
        (read_batch, {
            "length": SPEED_TEST_N_EXAMPLES,
            "batch_size": 10
        }),
        (read_batch, {
            "length": SPEED_TEST_N_EXAMPLES,
            "batch_size": 100
        }),
        (read_batch, {
            "length": SPEED_TEST_N_EXAMPLES,
            "batch_size": 1_000
        }),
        (read_formatted, {
            "type": "numpy",
            "length": SMALL_TEST
        }),
        (read_formatted, {
            "type": "pandas",
            "length": SMALL_TEST
        }),
        (read_formatted, {
            "type": "torch",
            "length": SMALL_TEST
        }),
        (read_formatted, {
            "type": "tensorflow",
            "length": SMALL_TEST
        }),
        (read_formatted_batch, {
            "type": "numpy",
            "length": SMALL_TEST,
            "batch_size": 10
        }),
        (read_formatted_batch, {
            "type": "numpy",
            "length": SMALL_TEST,
            "batch_size": 1_000
        }),
    ]

    functions_shuffled = [
        (read, {
            "length": SMALL_TEST
        }),
        (read, {
            "length": SPEED_TEST_N_EXAMPLES
        }),
        (read_batch, {
            "length": SPEED_TEST_N_EXAMPLES,
            "batch_size": 10
        }),
        (read_batch, {
            "length": SPEED_TEST_N_EXAMPLES,
            "batch_size": 100
        }),
        (read_batch, {
            "length": SPEED_TEST_N_EXAMPLES,
            "batch_size": 1_000
        }),
        (read_formatted, {
            "type": "numpy",
            "length": SMALL_TEST
        }),
        (read_formatted_batch, {
            "type": "numpy",
            "length": SMALL_TEST,
            "batch_size": 10
        }),
        (read_formatted_batch, {
            "type": "numpy",
            "length": SMALL_TEST,
            "batch_size": 1_000
        }),
    ]
    with tempfile.TemporaryDirectory() as tmp_dir:
        print("generating dataset")
        features = nlp.Features({
            "list": nlp.Sequence(nlp.Value("float32")),
            "numbers": nlp.Value("float32")
        })
        dataset = generate_example_dataset(
            os.path.join(tmp_dir, "dataset.arrow"),
            features,
            num_examples=SPEED_TEST_N_EXAMPLES,
            seq_shapes={"list": (100, )},
        )
        print("first set of iterations")
        for func, kwargs in functions:
            print(func.__name__, str(kwargs))
            times[func.__name__ + " " +
                  " ".join(str(v) for v in kwargs.values())] = func(
                      dataset, **kwargs)

        print("shuffling dataset")
        dataset = dataset.shuffle()
        print("Second set of iterations (after shuffling")
        for func, kwargs in functions_shuffled:
            print("shuffled ", func.__name__, str(kwargs))
            times["shuffled " + func.__name__ + " " +
                  " ".join(str(v) for v in kwargs.values())] = func(
                      dataset, **kwargs)

    with open(RESULTS_FILE_PATH, "wb") as f:
        f.write(json.dumps(times).encode("utf-8"))
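
What the read_formatted helpers exercise is the dataset's format state: set_format changes the container type that __getitem__ returns. A minimal sketch, assuming Dataset.from_dict is available as in later releases of the library:

import nlp

dset = nlp.Dataset.from_dict({"list": [[1.0] * 100] * 10, "numbers": [1.0] * 10})
dset.set_format(type="numpy", columns=["list"])
print(type(dset[0:10]["list"]))  # numpy.ndarray instead of a plain Python list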
Example No. 15
 def _info(self):
     if self.config.gameplay_scenario == "original":
         return nlp.DatasetInfo(
             # This is the description that will appear on the datasets page.
             description=self._DESCRIPTION,
             # nlp.features.FeatureConnectors
             features=nlp.Features(
                 {
                     "id": nlp.Value("int32"),
                     "target_id": nlp.Value("int32"),
                     "timestamp": nlp.Value("string"),
                     "status": nlp.Value("string"),
                     "image": {
                         "id": nlp.Value("int32"),
                         "file_name": nlp.Value("string"),
                         "flickr_url": nlp.Value("string"),
                         "coco_url": nlp.Value("string"),
                         "height": nlp.Value("int32"),
                         "width": nlp.Value("int32"),
                         "vg_id": nlp.Value("int32"),
                         "vg_url": nlp.Value("string"),
                     },
                     "qas": nlp.features.Sequence(
                         {"question": nlp.Value("string"), "answer": nlp.Value("string"), "id": nlp.Value("int32")}
                     ),
                     "objects": nlp.features.Sequence(
                         {
                             "id": nlp.Value("int32"),
                             "bbox": nlp.Sequence(nlp.Value("float32"), length=4),
                             "category": nlp.Value("string"),
                             "area": nlp.Value("float32"),
                             "category_id": nlp.Value("int32"),
                             "segment": nlp.features.Sequence(nlp.features.Sequence(nlp.Value("float32"))),
                         }
                     ),
                 }
             ),
             # If there's a common (input, target) tuple from the features,
             # specify them here. They'll be used if as_supervised=True in
             # builder.as_dataset.
             supervised_keys=None,
             # Homepage of the dataset for documentation
             homepage="https://compguesswhat.github.io/",
             citation=self._CITATION,
         )
     elif self.config.gameplay_scenario == "zero_shot":
         return nlp.DatasetInfo(
             # This is the description that will appear on the datasets page.
             description=self._DESCRIPTION,
             # nlp.features.FeatureConnectors
             features=nlp.Features(
                 {
                     "id": nlp.Value("int32"),
                     "target_id": nlp.Value("string"),
                     "status": nlp.Value("string"),
                     "image": {
                         "id": nlp.Value("int32"),
                         "file_name": nlp.Value("string"),
                         "coco_url": nlp.Value("string"),
                         "height": nlp.Value("int32"),
                         "width": nlp.Value("int32"),
                         "license": nlp.Value("int32"),
                         "open_images_id": nlp.Value("string"),
                         "date_captured": nlp.Value("string"),
                     },
                     "objects": nlp.features.Sequence(
                         {
                             "id": nlp.Value("string"),
                             "bbox": nlp.Sequence(nlp.Value("float32"), length=4),
                             "category": nlp.Value("string"),
                             "area": nlp.Value("float32"),
                             "category_id": nlp.Value("int32"),
                             "IsOccluded": nlp.Value("int32"),
                             "IsTruncated": nlp.Value("int32"),
                             "segment": nlp.features.Sequence(
                                 {
                                     "MaskPath": nlp.Value("string"),
                                     "LabelName": nlp.Value("string"),
                                     "BoxID": nlp.Value("string"),
                                     "BoxXMin": nlp.Value("string"),
                                     "BoxXMax": nlp.Value("string"),
                                     "BoxYMin": nlp.Value("string"),
                                     "BoxYMax": nlp.Value("string"),
                                     "PredictedIoU": nlp.Value("string"),
                                     "Clicks": nlp.Value("string"),
                                 }
                             ),
                         }
                     ),
                 }
             ),
             # If there's a common (input, target) tuple from the features,
             # specify them here. They'll be used if as_supervised=True in
             # builder.as_dataset.
             supervised_keys=None,
             # Homepage of the dataset for documentation
             homepage="https://compguesswhat.github.io/",
             citation=self._CITATION,
         )
Example No. 16
class Cord19Docrel(nlp.GeneratorBasedBuilder):
    """CORD-19 document relation dataset."""

    BUILDER_CONFIGS = [
        Cord19DocrelConfig(
            name="docs",
            description="document text and meta data",
            features={
                "doi": nlp.Value("string"),
                "cord19_id": nlp.Value("string"),
                "s2_id": nlp.Value("string"),
                "title": nlp.Value("string"),
                "abstract": nlp.Value("string"),
                "arxivId": nlp.Value("string"),
                "venue": nlp.Value("string"),
                "year": nlp.Value("int16"),
                "citations_count": nlp.Value("int32"),
                "references_count": nlp.Value("int32"),
                "authors": nlp.Sequence(nlp.Value('string', id='author_name')),
            },
            data_url=DATA_URL,
        ),
        Cord19DocrelConfig(
            name="relations",
            description=" relation data",
            features={
                DOC_A_COL: nlp.Value("string"),
                DOC_B_COL: nlp.Value("string"),
                LABEL_COL: nlp.Sequence(nlp.Value('string', id='label'))
            },
            data_url=DATA_URL,
        ),
    ]

    def _info(self):
        return nlp.DatasetInfo(
            description=_DESCRIPTION + self.config.description,
            features=nlp.Features(self.config.features),
            homepage=_HOMEPAGE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        arch_path = dl_manager.download_and_extract(self.config.data_url)

        if "relations" in self.config.name:
            train_file = "train.csv"
            test_file = "test.csv"

            generators = []

            for k in [1, 2, 3, 4]:
                folds_path = os.path.join(arch_path, 'folds', str(k))
                generators += [
                    nlp.SplitGenerator(name=get_train_split(k),
                                       gen_kwargs={
                                           'filepath':
                                           os.path.join(
                                               folds_path, train_file)
                                       }),
                    nlp.SplitGenerator(name=get_test_split(k),
                                       gen_kwargs={
                                           'filepath':
                                           os.path.join(folds_path, test_file)
                                       })
                ]
            return generators

        elif "docs" in self.config.name:
            # docs
            docs_file = os.path.join(arch_path, "docs.jsonl")

            return [
                nlp.SplitGenerator(name=nlp.Split('docs'),
                                   gen_kwargs={"filepath": docs_file}),
            ]
        else:
            raise ValueError(f"Unsupported config name: {self.config.name}")

    @staticmethod
    def get_dict_value(d, key, default=None):
        if key in d:
            return d[key]
        else:
            return default

    def _generate_examples(self, filepath):
        """Generate docs + rel examples."""

        if "relations" in self.config.name:
            df = csv.read_csv(filepath).to_pandas()

            for idx, row in df.iterrows():
                yield idx, {
                    DOC_A_COL: row[DOC_A_COL],
                    DOC_B_COL: row[DOC_B_COL],
                    LABEL_COL: row[LABEL_COL].split(','),
                }

        elif self.config.name == "docs":

            with open(filepath, 'r') as f:
                for i, line in enumerate(f):
                    doc = json.loads(line)

                    yield i, {
                        'doi':
                        str(self.get_dict_value(
                            doc, 'doi')),  # cast to str otherwise float
                        'cord19_id':
                        self.get_dict_value(doc, 'cord19_id'),
                        's2_id':
                        self.get_dict_value(doc, 's2_id'),
                        'title':
                        self.get_dict_value(doc, 'title'),
                        'abstract':
                        self.get_dict_value(doc, 'abstract'),
                        'arxivId':
                        self.get_dict_value(doc, 'arxivId'),
                        'venue':
                        str(self.get_dict_value(doc, 'venue') or ''),
                        'year':
                        int(self.get_dict_value(doc, 'year', 0) or 0),
                        'citations_count':
                        int(
                            self.get_dict_value(doc, 'citations_count', 0)
                            or 0),
                        'references_count':
                        int(
                            self.get_dict_value(doc, 'references_count', 0)
                            or 0),
                        'authors':
                        self.get_dict_value(doc, 'authors', []),
                    }
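
A hedged usage sketch for a custom builder like this one, assuming the script is saved locally as cord19_docrel.py (the filename is an assumption; split="docs" matches the SplitGenerator defined above):

import nlp

docs = nlp.load_dataset("./cord19_docrel.py", "docs", split="docs")
print(docs.features["authors"])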
Example No. 17
class AclDocrel(nlp.GeneratorBasedBuilder):
    """ACL anthology document relation dataset."""

    BUILDER_CONFIGS = [
        AclDocrelConfig(
            name="docs",
            description="document text and meta data",
            features={
                "s2_id": nlp.Value("string"),
                "title": nlp.Value("string"),
                "abstract": nlp.Value("string"),
                "arxivId": nlp.Value("string"),
                "doi": nlp.Value("string"),
                "venue": nlp.Value("string"),
                "year": nlp.Value("int16"),
                "citations_count": nlp.Value("int32"),
                "references_count": nlp.Value("int32"),
                "authors": nlp.Sequence(nlp.Value('string', id='author_name')),
            },
            data_url=DATA_URL,
        ),
        AclDocrelConfig(
            name="relations",
            description=" relation data",
            features={
                "from_s2_id": nlp.Value("string"),
                "to_s2_id": nlp.Value("string"),
                "label": nlp.Sequence(nlp.Value('string', id='label'))
            },
            data_url=DATA_URL,
        ),
    ]

    def _info(self):
        return nlp.DatasetInfo(
            description=_DESCRIPTION + self.config.description,
            features=nlp.Features(self.config.features),
            homepage=_HOMEPAGE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        arch_path = dl_manager.download_and_extract(self.config.data_url)

        if self.config.name == "relations":
            train_file = "train.csv"
            test_file = "test.csv"

            generators = []

            for k in [1, 2, 3, 4]:
                folds_path = os.path.join(arch_path, 'folds', str(k))
                generators += [
                    nlp.SplitGenerator(name=get_train_split(k),
                                       gen_kwargs={
                                           'filepath':
                                           os.path.join(
                                               folds_path, train_file)
                                       }),
                    nlp.SplitGenerator(name=get_test_split(k),
                                       gen_kwargs={
                                           'filepath':
                                           os.path.join(folds_path, test_file)
                                       })
                ]
            return generators

        elif self.config.name == "docs":
            # docs
            docs_file = os.path.join(arch_path, "docs.jsonl")

            return [
                nlp.SplitGenerator(name=nlp.Split('docs'),
                                   gen_kwargs={"filepath": docs_file}),
            ]
        else:
            raise ValueError(f"Unsupported config name: {self.config.name}")

    @staticmethod
    def get_s2_value(s2, key, default=None):
        if key in s2:
            return s2[key]
        else:
            return default

    def _generate_examples(self, filepath):
        """Generate docs + rel examples."""

        if self.config.name == "relations":
            df = csv.read_csv(filepath).to_pandas()

            for idx, row in df.iterrows():
                yield idx, dict(from_s2_id=row['from_s2_id'],
                                to_s2_id=row['to_s2_id'],
                                label=row['label'].split(','))

        elif self.config.name == "docs":

            with open(filepath, 'r') as f:
                for i, line in enumerate(f):
                    s2 = json.loads(line)

                    yield i, {
                        's2_id':
                        self.get_s2_value(s2, 'paperId'),
                        'title':
                        self.get_s2_value(s2, 'title'),
                        'abstract':
                        self.get_s2_value(s2, 'abstract'),
                        'doi':
                        self.get_s2_value(s2, 'doi'),
                        'arxivId':
                        self.get_s2_value(s2, 'arxivId'),
                        'venue':
                        self.get_s2_value(s2, 'venue'),
                        'year':
                        self.get_s2_value(s2, 'year', 0),
                        'citations_count':
                        len(self.get_s2_value(s2, 'citations', [])),
                        'references_count':
                        len(self.get_s2_value(s2, 'references', [])),
                        'authors': [
                            a['name']
                            for a in self.get_s2_value(s2, 'authors', [])
                            if 'name' in a
                        ],
                    }