def test_sort_reverse_many_rows():
    from dataflows import Flow, sort_rows
    f = Flow(
        ({'a': i, 'b': i % 5} for i in range(1000)),
        sort_rows(key='{b}{a}', reverse=True, batch_size=0),
    )
    results, _, _ = f.results()
    results = results[0]
    assert results[0:2] == [{'a': 999, 'b': 4}, {'a': 994, 'b': 4}]
    assert results[998:1000] == [{'a': 100, 'b': 0}, {'a': 0, 'b': 0}]

def test_fullouter_join_dump_different_keys():
    from dataflows import Flow, join, dump_to_path
    from decimal import Decimal
    data1 = [
        {'col1': 1.531, 'col2': 'hello'},
        {'col1': 1.132, 'col2': 'goodbye'},
    ]
    data2 = [
        {'colA': 1.531, 'colB': '123'},
        {'colA': 1.132, 'colB': 1.132},
    ]
    f = Flow(
        data1,
        data2,
        join(
            'res_1', ['col1'],
            'res_2', ['colA'],
            {'col2': {'name': 'col2', 'aggregate': 'first'}},
            mode='full-outer',
        ),
        dump_to_path(out_path='out/test_join_dump'),
    )
    results = f.results()[0][0]
    assert results == [
        {'colA': Decimal('1.531'), 'col2': 'hello', 'colB': '123'},
        {'colA': Decimal('1.132'), 'col2': 'goodbye', 'colB': 1.132},
    ]

def test_rename_resource():
    from dataflows import Flow, printer, PackageWrapper, ResourceWrapper

    def rename(package: PackageWrapper):
        package.pkg.descriptor['resources'][0]['name'] = 'renamed'
        yield package.pkg
        res_iter = iter(package)
        first: ResourceWrapper = next(res_iter)
        yield first.it
        yield from package

    f = Flow(({'a': x} for x in range(10)), rename, printer())
    results, dp, stats = f.results()
    print(dp.descriptor)
    assert dp.descriptor['resources'][0]['name'] == 'renamed'

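# Several tests below (test_find_replace, test_add_computed_field,
# test_unpivot, test_add_metadata) reference a module-level `data` fixture
# that is defined elsewhere in the suite and not shown in this section.
# Judging from their assertions, a minimal definition consistent with all of
# them would be:
data = [
    {'x': 1, 'y': 'a'},
    {'x': 2, 'y': 'b'},
    {'x': 3, 'y': 'c'},
]
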
def test_find_replace():
    from dataflows import Flow, find_replace
    f = Flow(
        data,
        find_replace([
            dict(name='y', patterns=[
                dict(find='a', replace='Apple'),
                dict(find='b', replace='Banana'),
                dict(find='c', replace='Coconut'),
            ])
        ]),
    )
    results, _, _ = f.results()
    y = [r['y'] for r in results[0]]
    assert y == ['Apple', 'Banana', 'Coconut']

def test_set_type_resources():
    from dataflows import Flow, set_type, validate
    f = Flow(
        [dict(a=str(i)) for i in range(10)],
        [dict(b=str(i)) for i in range(10)],
        [dict(c='0_' + str(i)) for i in range(10)],
        set_type('a', resources='res_[1]', type='integer'),
        set_type('b', resources=['res_2'], type='integer'),
        set_type('[cd]', resources=-1, type='number', groupChar='_'),
        validate(),
    )
    results, dp, stats = f.results()
    print(dp.descriptor)
    assert results[0][1]['a'] == 1
    assert results[1][3]['b'] == 3
    assert results[2][8]['c'] == 8.0

def test_update_resource():
    from dataflows import Flow, printer, update_resource
    f = Flow(
        *[({k: x} for x in range(10)) for k in 'abcdef'],
        update_resource(['res_1', 'res_3', 'res_5'], source='thewild'),
        printer(),
    )
    results, dp, stats = f.results()
    print(dp.descriptor)
    assert dp.descriptor['resources'][0]['source'] == 'thewild'
    assert dp.descriptor['resources'][2]['source'] == 'thewild'
    assert dp.descriptor['resources'][4]['source'] == 'thewild'

def test_add_computed_field():
    from dataflows import Flow, add_computed_field
    f = Flow(
        data,
        add_computed_field([
            dict(source=['x', 'x'], target='xx', operation='multiply'),
            dict(target='f', operation='format', with_='{y} - {x}'),
        ]),
    )
    results, dp, stats = f.results()
    results = list(results[0])
    xx = [x['xx'] for x in results]
    f = [x['f'] for x in results]
    assert xx == [1, 4, 9]
    assert f == ['a - 1', 'b - 2', 'c - 3']

def test_filter_rows_callable():
    from dataflows import Flow, filter_rows
    f = Flow(
        [
            {'a': 1, 'b': 3},
            {'a': 2, 'b': 3},
            {'a': 1, 'b': 4},
            {'a': 2, 'b': 4},
        ],
        filter_rows(condition=lambda row: row['a'] > 1 and row['b'] < 4),
    )
    results, _, _ = f.results()
    assert results[0][0] == dict(a=2, b=3)
    assert len(results[0]) == 1
    assert len(results) == 1

def test_example_1():
    from dataflows import Flow

    data = [
        {'data': 'Hello'},
        {'data': 'World'},
    ]

    def lowerData(row):
        row['data'] = row['data'].lower()

    f = Flow(
        data,
        lowerData,
    )
    data, *_ = f.results()

def test_concatenate_multifield():
    from dataflows import Flow, concatenate
    f = Flow(
        [
            {'a': 1, 'b': 2, 'c': None},
            {'a': 2, 'b': None, 'c': 3},
            {'a': 3, 'c': 4},
            {'a': 3, 'b': 6, 'c': 4},
        ],
        concatenate({
            'f1': ['a'],
            'f2': ['b', 'c'],
        }),
    )
    results, _, _ = f.results()
    assert results[0] == [
        {'f1': 1, 'f2': 2},
        {'f1': 2, 'f2': 3},
        {'f1': 3, 'f2': 4},
        {'f1': 3, 'f2': 4},
    ]

def test_filter_rows():
    from dataflows import Flow, filter_rows
    f = Flow(
        [
            {'a': 1, 'b': 3},
            {'a': 2, 'b': 3},
            {'a': 1, 'b': 4},
            {'a': 2, 'b': 4},
        ],
        filter_rows(equals=[dict(a=1)]),
        filter_rows(not_equals=[dict(b=3)]),
    )
    results, _, _ = f.results()
    assert results[0][0] == dict(a=1, b=4)
    assert len(results[0]) == 1
    assert len(results) == 1

def test_duplicate():
    from dataflows import Flow, duplicate
    a = [
        {'a': 1, 'b': 3},
        {'a': 2, 'b': 3},
        {'a': 3, 'b': 1},
        {'a': 4, 'b': 1},
    ]
    f = Flow(
        a,
        duplicate(),
    )
    results, _, _ = f.results()
    assert list(results[0]) == a
    assert list(results[1]) == a

def test_load_limit_rows():
    from dataflows import Flow, load
    flow = Flow(load('data/beatles.csv', limit_rows=3))
    data = flow.results()[0]
    assert data == [[
        {'name': 'john', 'instrument': 'guitar'},
        {'name': 'paul', 'instrument': 'bass'},
        {'name': 'george', 'instrument': 'guitar'},
    ]]

def test_delete_fields_regex():
    from dataflows import Flow, load, delete_fields
    flow = Flow(
        load('data/regex.csv'),
        delete_fields(['temperature (24h)'], regex=False),
    )
    data = flow.results()[0]
    assert data == [[
        {'city': 'london'},
        {'city': 'paris'},
        {'city': 'rome'},
    ]]

def test_add_field():
    from dataflows import Flow, add_field
    f = Flow(
        (dict(a=i) for i in range(3)),
        add_field('b', 'string', 'b'),
        add_field('c', 'number'),
        add_field('d', 'boolean', title='mybool'),
    )
    results, dp, _ = f.results()
    assert results == [[
        {'a': 0, 'b': 'b', 'c': None, 'd': None},
        {'a': 1, 'b': 'b', 'c': None, 'd': None},
        {'a': 2, 'b': 'b', 'c': None, 'd': None},
    ]]
    assert dp.descriptor == {
        'profile': 'data-package',
        'resources': [{
            'name': 'res_1',
            'path': 'res_1.csv',
            'profile': 'tabular-data-resource',
            'schema': {
                'fields': [
                    {'format': 'default', 'name': 'a', 'type': 'integer'},
                    {'format': 'default', 'name': 'b', 'type': 'string'},
                    {'format': 'default', 'name': 'c', 'type': 'number'},
                    {'format': 'default', 'name': 'd', 'title': 'mybool', 'type': 'boolean'},
                ],
                'missingValues': [''],
            },
        }],
    }

def test_unpivot():
    from dataflows import Flow, unpivot
    f = Flow(
        data,
        unpivot(
            [
                dict(name='x', keys=dict(field='x-value')),
                dict(name='y', keys=dict(field='y-value')),
            ],
            [dict(name='field', type='string')],
            dict(name='the-value', type='any'),
        ),
    )
    results, _, _ = f.results()
    assert results[0] == [
        dict(zip(['field', 'the-value'], r))
        for r in [
            ['x-value', 1],
            ['y-value', 'a'],
            ['x-value', 2],
            ['y-value', 'b'],
            ['x-value', 3],
            ['y-value', 'c'],
        ]
    ]

def test_sort_rows():
    from dataflows import Flow, sort_rows
    f = Flow(
        [
            {'a': 1, 'b': 3},
            {'a': 2, 'b': 3},
            {'a': 3, 'b': 1},
            {'a': 4, 'b': 1},
        ],
        sort_rows(key='{b}{a}'),
    )
    results, _, _ = f.results()
    assert list(results[0]) == [
        {'a': 3, 'b': 1},
        {'a': 4, 'b': 1},
        {'a': 1, 'b': 3},
        {'a': 2, 'b': 3},
    ]

def test_add_computed_field_func():
    from dataflows import Flow, add_computed_field

    data = [dict(x=i) for i in range(3)]

    f = Flow(
        data,
        add_computed_field([
            dict(target=dict(name='sq', type='integer'), operation=lambda row: row['x'] ** 2),
            dict(target='f', operation='format', with_='{x} - {x}'),
        ]),
    )
    results, *_ = f.results()
    results = list(results[0])
    assert results == [
        dict(x=0, sq=0, f='0 - 0'),
        dict(x=1, sq=1, f='1 - 1'),
        dict(x=2, sq=4, f='2 - 2'),
    ]

def test_join_full_outer():
    from dataflows import Flow, load, join
    flow = Flow(
        load('data/population.csv'),
        load('data/cities.csv'),
        join(
            source_name='population',
            source_key=['id'],
            target_name='cities',
            target_key=['id'],
            fields={'population': {'name': 'population'}},
            mode='full-outer',
        ),
    )
    data = flow.results()[0]
    assert data == [[
        {'id': 1, 'city': 'london', 'population': 8},
        {'id': 2, 'city': 'paris', 'population': 2},
        {'id': 3, 'city': 'rome', 'population': None},
        {'id': 4, 'city': None, 'population': 3},
    ]]

def test_set_type_regex():
    from dataflows import Flow, load, set_type
    flow = Flow(
        load('data/regex.csv'),
        set_type('city', type='string'),
        set_type('temperature (24h)', type='integer', regex=False),
    )
    data = flow.results()[0]
    assert data == [[
        {'city': 'london', 'temperature (24h)': 23},
        {'city': 'paris', 'temperature (24h)': 26},
        {'city': 'rome', 'temperature (24h)': 21},
    ]]

def test_deduplicate():
    from dataflows import Flow, deduplicate, set_primary_key
    a = [
        {'a': 1, 'b': 3, 'c': 'First'},
        {'a': 2, 'b': 3, 'c': 'First'},
        {'a': 1, 'b': 3, 'c': '!First'},
        {'a': 1, 'b': 2, 'c': 'First'},
        {'a': 2, 'b': 3, 'c': '!First'},
    ]
    f = Flow(
        a,
        set_primary_key(['a', 'b']),
        deduplicate(),
    )
    results, _, _ = f.results()
    assert set(x['c'] for x in results[0]) == {'First'}

def test_add_metadata():
    from dataflows import Flow, add_metadata
    f = Flow(data, add_metadata(author='Adam Kariv'))
    _, dp, _ = f.results()
    assert dp.descriptor['author'] == 'Adam Kariv'

def test_unpivot_any_resources():
    from dataflows import Flow, unpivot, validate
    data1 = [
        dict(
            [('name', 'ike{}'.format(i))] +
            [(str(year), year + i) for year in range(1990, 2020, 10)]
        )
        for i in range(5)
    ]
    data2 = [
        dict(
            [('city', 'mike{}'.format(i))] +
            [(str(year), year + i) for year in range(2050, 2080, 10)]
        )
        for i in range(5)
    ]
    f = Flow(
        data1,
        data2,
        unpivot(
            [dict(name='([0-9]+)', keys=dict(year='\\1'))],
            [dict(name='year', type='integer')],
            dict(name='amount', type='integer'),
        ),
        validate(),
    )
    results, _, _ = f.results()
    assert results[0] == [
        dict(zip(['name', 'year', 'amount'], r))
        for r in [
            ['ike0', 1990, 1990], ['ike0', 2000, 2000], ['ike0', 2010, 2010],
            ['ike1', 1990, 1991], ['ike1', 2000, 2001], ['ike1', 2010, 2011],
            ['ike2', 1990, 1992], ['ike2', 2000, 2002], ['ike2', 2010, 2012],
            ['ike3', 1990, 1993], ['ike3', 2000, 2003], ['ike3', 2010, 2013],
            ['ike4', 1990, 1994], ['ike4', 2000, 2004], ['ike4', 2010, 2014],
        ]
    ]
    assert results[1] == [
        dict(zip(['city', 'year', 'amount'], r))
        for r in [
            ['mike0', 2050, 2050], ['mike0', 2060, 2060], ['mike0', 2070, 2070],
            ['mike1', 2050, 2051], ['mike1', 2060, 2061], ['mike1', 2070, 2071],
            ['mike2', 2050, 2052], ['mike2', 2060, 2062], ['mike2', 2070, 2072],
            ['mike3', 2050, 2053], ['mike3', 2060, 2063], ['mike3', 2070, 2073],
            ['mike4', 2050, 2054], ['mike4', 2060, 2064], ['mike4', 2070, 2074],
        ]
    ]

def test_sort_rows_number():
    from dataflows import Flow, sort_rows
    f = Flow(
        [
            {'a': 0.1}, {'a': -3}, {'a': -4}, {'a': 10}, {'a': 8},
            {'a': 0}, {'a': -1000000}, {'a': 1000000}, {'a': -0.1},
            {'a': -0.2}, {'a': 0.2}, {'a': -1000001}, {'a': 1000001},
            {'a': 6}, {'a': -10}, {'a': -0.001}, {'a': 0.001},
            {'a': 1}, {'a': -1},
        ],
        sort_rows(key='{a}'),
    )
    results, _, _ = f.results()
    assert list(results[0]) == [
        {'a': -1000001}, {'a': -1000000}, {'a': -10}, {'a': -4},
        {'a': -3}, {'a': -1}, {'a': -0.2}, {'a': -0.1}, {'a': -0.001},
        {'a': 0}, {'a': 0.001}, {'a': 0.1}, {'a': 0.2}, {'a': 1},
        {'a': 6}, {'a': 8}, {'a': 10}, {'a': 1000000}, {'a': 1000001},
    ]

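# The two example tests below call a `country_population` generator that is
# defined elsewhere and not shown in this section. A minimal, hypothetical
# stand-in consistent with test_example_4 (which casts 'population' to a
# number with groupChar=',') would yield rows such as:
def country_population():
    # Placeholder values -- the real helper may pull live data instead.
    yield {'name': 'China', 'population': '1,394,000,000'}
    yield {'name': 'India', 'population': '1,326,000,000'}
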
def test_example_4():
    from dataflows import Flow, set_type
    f = Flow(
        country_population(),
        set_type('population', type='number', groupChar=','),
    )
    data, dp, _ = f.results()

from pprint import pprint
from dataflows import Flow, load, unpivot

# Select the fields to unpivot: any column matching 'treatment_<letter>'
unpivoting_fields = [{'name': r'treatment_(\w)', 'keys': {'treatment': r'\1'}}]

# The newly created key column will be 'treatment' with type 'string':
extra_keys = [{'name': 'treatment', 'type': 'string'}]

# And values will be placed in the 'result' column with type 'string':
extra_value = {'name': 'result', 'type': 'string'}

# Run the flow
flow = Flow(
    load('layouts/wide.csv'),
    unpivot(unpivoting_fields, extra_keys, extra_value),
)
results, package, stats = flow.results()

print('[Data]\n')
pprint(results[0])
print('\n[Meta]\n')
pprint(package.descriptor)

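# For reference, a hypothetical wide-layout input that the pattern
# r'treatment_(\w)' above would match -- the actual 'layouts/wide.csv' is
# not shown in this document:
#
#     name,treatment_a,treatment_b
#     john,yes,no
#     mary,no,yes
#
# Unpivoting it yields one (treatment, result) pair per matched column:
#
#     {'name': 'john', 'treatment': 'a', 'result': 'yes'}
#     {'name': 'john', 'treatment': 'b', 'result': 'no'}
#     ...
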
def test_example_3():
    from dataflows import Flow
    f = Flow(
        country_population(),
    )
    data, *_ = f.results()

def test_force_temporal_format():
    import datetime
    from dataflows import Flow, load, dump_to_path

    # Dump
    Flow(
        load('data/temporal.csv', name='temporal', override_fields={
            'datetime': {'type': 'datetime', 'outputFormat': '%y|%m|%d %H|%M|%S'},
            'date': {'outputFormat': '%y|%m|%d'},
            'time': {'outputFormat': '%H|%M|%S'},
        }),
        dump_to_path('out/force_temporal_format', temporal_format_property='outputFormat'),
    ).process()

    # Load
    flow = Flow(load('out/force_temporal_format/datapackage.json'))
    data, package, stats = flow.results()

    # Assert
    assert package.descriptor['resources'][0]['schema'] == {
        'fields': [
            {'format': 'default', 'name': 'event', 'type': 'string'},
            {'format': '%y|%m|%d %H|%M|%S', 'name': 'datetime', 'type': 'datetime'},
            {'format': '%y|%m|%d', 'name': 'date', 'type': 'date'},
            {'format': '%H|%M|%S', 'name': 'time', 'type': 'time'},
        ],
        'missingValues': [''],
    }
    assert data == [[{
        'event': 'start',
        'datetime': datetime.datetime(2015, 1, 2, 15, 30, 45),
        'date': datetime.date(2015, 1, 2),
        'time': datetime.time(15, 30, 45),
    }, {
        'event': 'end',
        'datetime': datetime.datetime(2016, 6, 25, 8, 10, 4),
        'date': datetime.date(2016, 6, 25),
        'time': datetime.time(8, 10, 4),
    }]]

def test_load_duplicate_headers():
    import pytest
    from dataflows import Flow, load
    flow = Flow(
        load('data/duplicate_headers.csv'),
    )
    with pytest.raises(ValueError) as excinfo:
        flow.results()
    assert 'duplicate headers' in str(excinfo.value)

# Assumed module-level imports for this pipeline excerpt; the surrounding
# module is also expected to provide the Google Drive helpers `get_client`,
# `list_files` and `get_file`, as well as the `ignore` error handler passed
# to `load`.
import os
import logging
from collections import defaultdict
from dataflows import Flow, load, update_resource, dump_to_path, printer


def flow(parameters, *_):
    files_dump_to_path = parameters['files_dump_to_path']
    data_dump_to_path = parameters.get('data_dump_to_path')

    def _download_gdrive_data():
        stats = defaultdict(int)
        file_sources = parameters['file_sources']
        folder_id = parameters['google_drive_csv_folder_id']
        files_dir = os.path.join(files_dump_to_path, 'files')
        os.makedirs(files_dir, exist_ok=True)
        client = get_client()

        # Index previously downloaded files so that unchanged ones
        # (same id and version) are not fetched again.
        existing_files = {}
        if os.path.exists(os.path.join(files_dump_to_path, 'datapackage.json')):
            for row in Flow(
                    load(os.path.join(files_dump_to_path, 'datapackage.json'))
            ).results()[0][0]:
                existing_files[row['name']] = row

        for id, name, version in list_files(client, folder_id):
            source = file_sources.get(name)
            if source:
                assert name.endswith('.csv'), 'only csv file sources are supported'
                stats['relevant_source_files'] += 1
                row = {
                    'id': id,
                    'name': name,
                    'version': version,
                    'source': source,
                    'resource_name': '%s__%s' % (source, stats['relevant_source_files']),
                }
                yield row
                if (os.path.exists(os.path.join(files_dump_to_path, 'files', name))
                        and name in existing_files
                        and existing_files[name]['id'] == id
                        and existing_files[name]['version'] == version):
                    logging.info('existing file, will not redownload: %s' % name)
                else:
                    logging.info('downloading file: %s' % name)
                    get_file(client, id, os.path.join(files_dump_to_path, 'files', name))

        if stats['relevant_source_files'] != len(file_sources):
            raise Exception('source files mismatch')

    # First pass: download the relevant files and dump their metadata.
    files_flow = Flow(
        _download_gdrive_data(),
        update_resource(-1, name='gdrive_data_files',
                        path='gdrive_data_files.csv',
                        **{'dpp:streaming': True}),
        dump_to_path(files_dump_to_path),
        printer(),
    )

    # Second pass: load each downloaded csv as its own resource.
    data_flow_args = []
    for file_row in files_flow.results()[0][0]:
        data_flow_args += [
            load(os.path.join(files_dump_to_path, 'files', file_row['name']),
                 strip=False,
                 infer_strategy=load.INFER_STRINGS,
                 deduplicate_headers=True,
                 cast_strategy=load.CAST_TO_STRINGS,
                 on_error=ignore,
                 limit_rows=parameters.get('limit_rows'),
                 encoding='utf-8'),
            update_resource(-1, name=file_row['resource_name'],
                            path=file_row['name'],
                            **{'dpp:streaming': True}),
        ]
    if data_dump_to_path:
        data_flow_args += [dump_to_path(data_dump_to_path)]
    return Flow(*data_flow_args)
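
# A hypothetical invocation sketch (not part of the original module): in
# datapackage-pipelines this `flow` entry point would normally receive its
# `parameters` from pipeline-spec.yaml. The values below are placeholders.
if __name__ == '__main__':
    flow({
        'files_dump_to_path': 'out/gdrive_files',
        'data_dump_to_path': 'out/gdrive_data',
        'google_drive_csv_folder_id': '<folder-id>',
        'file_sources': {'example.csv': 'example'},
        'limit_rows': None,
    }).process()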