def test_input_huggingface_datasets_with_no_split(dataset_configs):
    """Expect ``ValueError`` when the dataset is consumed without a ``split``.

    The ``adversarial`` entry of ``dataset_configs`` supplies the dataset path
    and config name; no ``split`` keyword is passed.
    """
    cfg = dataset_configs['adversarial']
    with pytest.raises(ValueError):
        docs = from_huggingface_datasets(
            cfg['dataset_path'],
            name=cfg['name'],
        )
        # Drain the iterable inside the ``raises`` block so an error raised
        # during iteration (not only during the call) is captured as well.
        for _doc in docs:
            pass
def test_input_huggingface_datasets_with_tweet_dataset(dataset_configs):
    """Load the ``tweet_eval`` dataset and check the first yielded item.

    The first item must be a ``Document`` with a truthy ``text`` attribute.
    """
    cfg = dataset_configs['tweet_eval']
    docs = list(
        from_huggingface_datasets(
            cfg['dataset_path'],
            name=cfg['name'],
            split=cfg['split'],
        )
    )

    first = docs[0]
    assert isinstance(first, Document)
    assert first.text
def test_input_huggingface_datasets_with_filter_fields_and_no_resolver(
    dataset_configs,
):
    """Expect ``ValueError`` for ``filter_fields=True`` without a resolver.

    The call passes neither ``field_resolver`` nor ``split``; only the dataset
    path, the config name and ``filter_fields=True``.
    """
    cfg = dataset_configs['adversarial']
    with pytest.raises(ValueError):
        docs = from_huggingface_datasets(
            cfg['dataset_path'],
            name=cfg['name'],
            filter_fields=True,
        )
        # Consume the iterable inside the ``raises`` block so an error raised
        # during iteration is captured too.
        for _doc in docs:
            pass
def test_input_huggingface_datasets_with_field_resolver(dataset_configs):
    """Load the ``adversarial`` dataset with a ``question`` -> ``text`` resolver.

    The first yielded item must be a ``Document`` with truthy ``text`` and a
    ``title`` key present in its ``tags``.
    """
    cfg = dataset_configs['adversarial']
    docs = list(
        from_huggingface_datasets(
            cfg['dataset_path'],
            field_resolver={'question': 'text'},
            name=cfg['name'],
            split=cfg['split'],
        )
    )

    first = docs[0]
    assert isinstance(first, Document)
    assert first.text
    assert 'title' in first.tags
def test_input_huggingface_datasets_from_csv_file(dataset_configs):
    """Load documents from ``docs.csv`` through the ``csv`` dataset builder.

    Expects exactly two documents; the first must carry the resolved question
    as ``text`` and a ``source`` tag equal to ``'testsrc'``.
    """
    # ``dataset_configs`` is not read here; it stays in the signature so the
    # fixture request is unchanged.
    docs = list(
        from_huggingface_datasets(
            'csv',
            field_resolver={'question': 'text'},
            data_files='docs.csv',
            split='train',
        )
    )

    assert len(docs) == 2
    first = docs[0]
    assert isinstance(first, Document)
    assert first.text == 'What are the symptoms?'
    assert first.tags['source'] == 'testsrc'
def test_client_huggingface_datasets(protocol, mocker, func_name):
    """Send a Hugging Face dataset through a ``Flow`` and check the callback.

    The Flow method named by ``func_name`` is invoked with two documents from
    the ``adversarial_qa`` test split, and the ``on_done`` mock must have been
    called exactly once.
    """
    with Flow(protocol=protocol).add() as flow:
        on_done = mocker.Mock()
        # Resolve the client method first, then build the input generator,
        # matching the evaluation order of a single call expression.
        send = getattr(flow, f'{func_name}')
        docs = from_huggingface_datasets(
            dataset_path='adversarial_qa',
            size=2,
            name='adversarialQA',
            split='test',
            field_resolver={'question': 'text'},
        )
        send(docs, on_done=on_done)
        on_done.assert_called_once()
def test_input_huggingface_datasets_from_path(dataset_configs, size, sampling_rate):
    """Load the ``adversarial`` dataset with ``size`` and ``sampling_rate``.

    When ``size`` is given, exactly that many documents must be produced; in
    every case the first item must be a ``Document``.
    """
    cfg = dataset_configs['adversarial']
    docs = [
        *from_huggingface_datasets(
            cfg['dataset_path'],
            size=size,
            name=cfg['name'],
            sampling_rate=sampling_rate,
            split=cfg['split'],
        )
    ]

    if size is not None:
        assert len(docs) == size

    assert isinstance(docs[0], Document)