def test_filter_datasets_by_language(self):
    """Filtering by one or several languages returns datasets carrying the matching tags."""
    _api = HfApi()
    f = DatasetFilter(languages="en")
    datasets = _api.list_datasets(filter=f)
    self.assertGreater(len(datasets), 0)
    # assertIn produces a diagnostic failure message showing the actual tags.
    self.assertIn("languages:en", datasets[0].tags)
    # A tuple of languages should act as an AND filter: results carry both tags.
    args = DatasetSearchArguments()
    f = DatasetFilter(languages=(args.languages.en, args.languages.fr))
    datasets = _api.list_datasets(filter=f)
    self.assertGreater(len(datasets), 0)
    self.assertIn("languages:en", datasets[0].tags)
    self.assertIn("languages:fr", datasets[0].tags)
def test_filter_datasets_with_cardData(self):
    """cardData=True attaches card data to results; the default call leaves it unset."""
    _api = HfApi()
    datasets = _api.list_datasets(cardData=True)
    # Generator expressions (not list comprehensions) — sum/all consume lazily.
    # At least one dataset must carry card data when explicitly requested.
    self.assertGreater(
        sum(getattr(dataset, "cardData", None) is not None for dataset in datasets),
        0,
    )
    datasets = _api.list_datasets()
    # Without cardData=True, no dataset should have the attribute populated.
    self.assertTrue(
        all(getattr(dataset, "cardData", None) is None for dataset in datasets)
    )
def test_list_datasets_full(self):
    """full=True yields a large listing of DatasetInfo objects, some with card data."""
    api = HfApi()
    all_datasets = api.list_datasets(full=True)
    self.assertGreater(len(all_datasets), 100)
    first = all_datasets[0]
    self.assertIsInstance(first, DatasetInfo)
    self.assertTrue(any(d.cardData for d in all_datasets))
def test_filter_datasets_by_task_ids(self):
    """Filtering by a task id returns datasets tagged with that task id."""
    _api = HfApi()
    f = DatasetFilter(task_ids="automatic-speech-recognition")
    datasets = _api.list_datasets(filter=f)
    self.assertGreater(len(datasets), 0)
    # assertIn produces a diagnostic failure message showing the actual tags.
    self.assertIn("task_ids:automatic-speech-recognition", datasets[0].tags)
def test_filter_datasets_by_task_categories(self):
    """Filtering by a task category returns datasets tagged with that category."""
    _api = HfApi()
    f = DatasetFilter(task_categories="audio-classification")
    datasets = _api.list_datasets(filter=f)
    self.assertGreater(len(datasets), 0)
    # assertIn produces a diagnostic failure message showing the actual tags.
    self.assertIn("task_categories:audio-classification", datasets[0].tags)
def test_filter_datasets_by_author_and_name(self):
    """Combining author and dataset_name narrows the listing to a single dataset."""
    _api = HfApi()
    f = DatasetFilter(author="huggingface", dataset_name="DataMeasurementsFiles")
    datasets = _api.list_datasets(filter=f)
    self.assertEqual(len(datasets), 1)
    # assertIn over assertTrue("x" in y): failures show the actual values.
    self.assertIn("huggingface", datasets[0].author)
    self.assertIn("DataMeasurementsFiles", datasets[0].id)
def test_list_datasets_search(self):
    """Free-text search for 'wikipedia' returns many DatasetInfo results."""
    api = HfApi()
    results = api.list_datasets(search="wikipedia")
    self.assertGreater(len(results), 10)
    self.assertIsInstance(results[0], DatasetInfo)
def test_list_datasets_author(self):
    """Listing by author returns multiple DatasetInfo results for 'huggingface'."""
    api = HfApi()
    results = api.list_datasets(author="huggingface")
    self.assertGreater(len(results), 1)
    self.assertIsInstance(results[0], DatasetInfo)
def test_filter_datasets_by_size_categories(self):
    """Filtering by a size category returns datasets tagged with that category."""
    _api = HfApi()
    f = DatasetFilter(size_categories="100K<n<1M")
    datasets = _api.list_datasets(filter=f)
    self.assertGreater(len(datasets), 0)
    # assertIn produces a diagnostic failure message showing the actual tags.
    self.assertIn("size_categories:100K<n<1M", datasets[0].tags)
def test_filter_datasets_by_multilinguality(self):
    """Filtering by multilinguality returns datasets tagged with that value."""
    _api = HfApi()
    f = DatasetFilter(multilinguality="yes")
    datasets = _api.list_datasets(filter=f)
    self.assertGreater(len(datasets), 0)
    # assertIn produces a diagnostic failure message showing the actual tags.
    self.assertIn("multilinguality:yes", datasets[0].tags)
def test_filter_datasets_by_language_creator(self):
    """Filtering by language creator returns datasets tagged with that creator."""
    _api = HfApi()
    f = DatasetFilter(language_creators="crowdsourced")
    datasets = _api.list_datasets(filter=f)
    self.assertGreater(len(datasets), 0)
    # assertIn produces a diagnostic failure message showing the actual tags.
    self.assertIn("language_creators:crowdsourced", datasets[0].tags)
def test_filter_datasets_by_benchmark(self):
    """Filtering by benchmark returns datasets tagged with that benchmark."""
    _api = HfApi()
    f = DatasetFilter(benchmark="raft")
    datasets = _api.list_datasets(filter=f)
    self.assertGreater(len(datasets), 0)
    # assertIn produces a diagnostic failure message showing the actual tags.
    self.assertIn("benchmark:raft", datasets[0].tags)
def test_list_datasets(self):
    """The unfiltered listing returns a large collection of DatasetInfo objects."""
    api = HfApi()
    results = api.list_datasets()
    self.assertGreater(len(results), 100)
    self.assertIsInstance(results[0], DatasetInfo)
def test_staging_list_datasets(self):
    """Smoke test: listing datasets against the staging endpoint must not raise."""
    staging_api = HfApi(endpoint=ENDPOINT_STAGING)
    staging_api.list_datasets()