def test_copy_not_null_rows_cli(tmpdir, synthetic_dataset):
    """Run the copy CLI with ``--not-null-fields`` and check rows were dropped.

    The dataset is copied into a fresh directory under ``tmpdir`` with
    ``--not-null-fields string_array_nullable``; the copy is then read back
    and must contain strictly fewer rows than the source dataset.
    """
    destination = os.path.join(tmpdir.strpath, 'copied_data')
    target_url = 'file://' + destination

    cli_args = [synthetic_dataset.url, target_url]
    cli_args += ['--not-null-fields', 'string_array_nullable']
    _main(cli_args)

    with make_reader(target_url, num_epochs=1) as reader:
        copied_rows = list(reader)

    # The copy must be strictly smaller than the source.
    source_row_count = len(synthetic_dataset.data)
    assert len(copied_rows) < source_row_count
def test_copy_some_fields_with_repartition_cli(tmpdir, synthetic_dataset):
    """Run the copy CLI with a field regex and a partition count of one.

    Verifies that exactly one ``part-*`` file is written to the target
    directory and that the copied dataset's schema contains only ``id``.
    """
    target_path = os.path.join(tmpdir.strpath, 'copied_data')
    target_url = 'file://' + target_path

    cli_args = [
        synthetic_dataset.url,
        target_url,
        '--field-regex', r'\bid\b',
        '--partition-count', '1',
    ]
    _main(cli_args)

    # Check repartitioning: a single part file is expected
    part_files = glob.glob(os.path.join(target_path, 'part-*'))
    assert 1 == len(part_files)

    # Check that the regex filter worked
    with make_reader(target_url, num_epochs=1) as reader:
        copied_field_names = list(reader.schema.fields.keys())
    assert copied_field_names == ['id']
def test_copy_and_overwrite_cli(tmpdir, synthetic_dataset):
    """Copy a dataset via the CLI, verify its rows, then exercise overwrite.

    Steps:
      1. Copy the dataset and compare every copied row with the source row
         that has the same ``id``.
      2. Copy again into the same target and expect ``AnalysisException``
         with a message matching ``'already exists'``.
      3. Copy once more with ``--overwrite``; this call must not raise.
    """
    # NOTE(review): sibling tests in this file build the URL with 'file://';
    # three slashes are used here - confirm this is intentional.
    target_url = 'file:///' + os.path.join(tmpdir.strpath, 'copied_data')
    copy_args = [synthetic_dataset.url, target_url]

    _main(copy_args)

    with make_reader(target_url, num_epochs=1) as reader:
        for copied_row in reader:
            actual = copied_row._asdict()
            row_id = actual['id']
            # First source row carrying the same id as the copied row
            expected = next(source_row
                            for source_row in synthetic_dataset.data
                            if source_row['id'] == row_id)
            np.testing.assert_equal(actual, expected)

    # Copying onto an existing target without '--overwrite' must fail
    with pytest.raises(AnalysisException, match='already exists'):
        _main(copy_args)

    _main(copy_args + ['--overwrite'])