def test_unpivot_any_resources():
    """Unpivot applies per-resource: two resources with different key
    fields ('name' vs 'city') and different year columns are each melted
    into (key, year, amount) rows."""
    from dataflows import unpivot, validate

    rows_by_name = [
        dict([('name', 'ike{}'.format(i))] +
             [(str(year), year + i) for year in range(1990, 2020, 10)])
        for i in range(5)
    ]
    rows_by_city = [
        dict([('city', 'mike{}'.format(i))] +
             [(str(year), year + i) for year in range(2050, 2080, 10)])
        for i in range(5)
    ]
    flow = Flow(
        rows_by_name,
        rows_by_city,
        unpivot([dict(name='([0-9]+)', keys=dict(year='\\1'))],
                [dict(name='year', type='integer')],
                dict(name='amount', type='integer')),
        validate(),
    )
    results, _, _ = flow.results()

    # Each source row expands to one output row per year column,
    # preserving row order then column order.
    assert results[0] == [
        dict(name='ike{}'.format(i), year=year, amount=year + i)
        for i in range(5)
        for year in range(1990, 2020, 10)
    ]
    assert results[1] == [
        dict(city='mike{}'.format(i), year=year, amount=year + i)
        for i in range(5)
        for year in range(2050, 2080, 10)
    ]
def AFRR_Data():
    """Fetch the latest aFRR reserve records for DK1 from energidataservice.dk,
    unpivot the up/down activation columns into long form, attach prices, and
    dump the result as a datapackage under 'afrr_data'."""
    # Activation columns to melt into (product, amount) pairs.
    activation_fields = [
        dict(name='aFRR_DownActivated', keys=dict(product='aFRR_DownActivated')),
        dict(name='aFRR_UpActivated', keys=dict(product='aFRR_UpActivated')),
    ]
    product_key = [dict(name='product', type='string')]
    amount_value = dict(name='amount', type='number')

    steps = [
        # Load input - 'datastore_search_sql' API, last 1000 rows:
        load(
            'https://api.energidataservice.dk/datastore_search_sql?sql=select%20*%20from%20afrrreservesdk1%20order%20by%20"HourUTC"%20desc%20limit%201000',
            format="json",
            property="result.records",
            name="fact_afrr"),
        # Drop bookkeeping columns that are never published:
        delete_fields(fields=['_id', '_full_text', 'HourDK']),
        # Persist the raw download so reruns can skip the network call:
        checkpoint('afrr'),
        # Normalize/unpivot the activation columns:
        unpivot(activation_fields, product_key, amount_value),
        # Placeholder price fields, populated by add_price below:
        add_computed_field([
            dict(target=dict(name='PriceArea', type='string'),
                 operation='constant', with_='DK1'),
            dict(target=dict(name='PriceDKK', type='number'),
                 operation='constant', with_='dummy'),
            dict(target=dict(name='PriceEUR', type='number'),
                 operation='constant', with_='dummy'),
        ]),
        add_price,
        delete_fields(fields=[
            'aFRR_DownPriceDKK',
            'aFRR_DownPriceEUR',
            'aFRR_UpPriceDKK',
            'aFRR_UpPriceEUR',
        ]),
        add_metadata(name='marketdata', title='Marketdata prototype'),
        update_resource(resources=None, mediatype='text/csv'),
        update_resource(
            resources='fact_afrr',
            title='Automatic Frequency Restoration Reserves',
            source='https://www.energidataservice.dk/dataset/afrrreservesdk1/resource_extract/0694e216-6713-4f84-9b98-7bb5bc11d80c',
        ),
        printer(),
        dump_to_path('afrr_data'),
    ]
    Flow(*steps).process()
def test_unpivot():
    """Basic unpivot: melt the x/y columns of each row into
    (field, the-value) pairs."""
    from dataflows import unpivot

    melt = unpivot(
        [
            dict(name='x', keys=dict(field='x-value')),
            dict(name='y', keys=dict(field='y-value')),
        ],
        [dict(name='field', type='string')],
        dict(name='the-value', type='any'),
    )
    results, _, _ = Flow(data, melt).results()

    # Each input row yields its x pair followed by its y pair.
    expected = []
    for x_val, y_val in [(1, 'a'), (2, 'b'), (3, 'c')]:
        expected.append({'field': 'x-value', 'the-value': x_val})
        expected.append({'field': 'y-value', 'the-value': y_val})
    assert results[0] == expected
load(source_url,format='xlsx',sheet='ESTIMATES',headers=17), load(source_url,format='xlsx',sheet='LOW VARIANT',headers=17), load(source_url,format='xlsx',sheet='MEDIUM VARIANT',headers=17), load(source_url,format='xlsx',sheet='HIGH VARIANT',headers=17), load(source_url,format='xlsx',sheet='CONSTANT-FERTILITY',headers=17), load(source_url,format='xlsx',sheet='CONSTANT-MORTALITY',headers=17), load(source_url,format='xlsx',sheet='INSTANT-REPLACEMENT',headers=17), load(source_url,format='xlsx',sheet='MOMENTUM',headers=17), load(source_url,format='xlsx',sheet='ZERO-MIGRATION',headers=17), load(source_url,format='xlsx',sheet='NO CHANGE',headers=17), checkpoint('loaded'), delete_fields(fields=['Index', 'Variant', 'Notes']), rename_resources, unpivot( unpivot_fields=[{'name': '([0-9]{4})', 'keys': {'year': '\\1'}}], extra_keys=[{'name': 'year', 'type': 'year'}], extra_value={'name': 'population', 'type': 'number'}, resources='population-estimates' ), unpivot( unpivot_fields=[{'name': '([0-9]{4})', 'keys': {'year': '\\1'}}], extra_keys=[{'name': 'year', 'type': 'year'}], extra_value={'name': 'population', 'type': 'number'}, resources=resource_names[1:] ), add_computed_field([ { "operation": "format", "target": "Region", "with": "{Region, subregion, country or area *}" }, {
unpivoting_fields = [{ 'name': '([0-9]+\/[0-9]+\/[0-9]+)', 'keys': { 'Date': r'\1' } }] extra_keys = [{'name': 'Date', 'type': 'string'}] extra_value = {'name': 'Case', 'type': 'number'} Flow( load(f'{BASE_URL}{CONFIRMED}'), load(f'{BASE_URL}{RECOVERED}'), load(f'{BASE_URL}{DEATH}'), unpivot(unpivoting_fields, extra_keys, extra_value), find_replace([{ 'name': 'Date', 'patterns': [{ 'find': '/', 'replace': '-' }] }]), to_normal_date, set_type('Date', type='date', format='%d-%m-%y', resources=None), set_type('Case', type='number', resources=None), join(source_name='time_series_19-covid-Confirmed', source_key=['Province/State', 'Country/Region', 'Date'], source_delete=True, target_name='time_series_19-covid-Deaths', target_key=['Province/State', 'Country/Region', 'Date'], fields=dict(Confirmed={
def flow(self):
    """Assemble the processing Flow from the validated configuration.

    Returns a dataflows ``Flow`` when ``self.errors`` is empty; otherwise
    falls through and implicitly returns ``None``.
    """
    if len(self.errors) == 0:
        # Primary-key fields, converted from column-types to field names.
        primaryKey = [
            self.ct_to_fn(f)
            for f in self.config.get(CONFIG_PRIMARY_KEY)
        ]
        fieldOptions = {}
        # columnType name -> taxonomy options (including 'type'),
        # only for column-types that declare a dataType.
        dataTypes = dict(
            (ct['name'], dict(ct.get('options', {}), type=ct['dataType']))
            for ct in self.config.get(CONFIG_TAXONOMY_CT)
            if 'dataType' in ct)
        # Merge per-field options: taxonomy defaults first, then the
        # mapping's own options override them; columnType is always set
        # (possibly to None).
        for mf in self.config.get(CONFIG_MODEL_MAPPING):
            ct = mf.get('columnType')
            name = mf['name']
            fieldOptions[name] = {}
            if ct is not None:
                fieldOptions[name].update(dataTypes.get(ct, {}))
            fieldOptions[name].update(mf.get('options', {}))
            fieldOptions[name]['columnType'] = ct
        extraFieldDefs = self.join_mapping_taxonomy('extra', fieldOptions)
        normalizeFieldDef = self.join_mapping_taxonomy(
            'normalize', fieldOptions)
        # Source fields to melt when normalization is configured.
        unpivotFields = [
            dict(
                name=f['name'],
                keys=f['normalize'],
            )
            for f in self.config.get(CONFIG_MODEL_MAPPING)
            if 'normalize' in f
        ]
        # At most one normalized value field is used.
        if len(normalizeFieldDef) > 0:
            normalizeFieldDef = normalizeFieldDef[0]
        else:
            normalizeFieldDef = None
        steps = [
            self.create_fdp(),
            self.datetime_handler(),
            self.set_consts(fieldOptions),
            validate(on_error=ignore),
        ] + ([
            # Unpivot only when a normalize target field exists.
            unpivot(unpivotFields, extraFieldDefs, normalizeFieldDef,
                    regex=False,
                    resources=RESOURCE_NAME),
        ] if normalizeFieldDef else []) + [
            self.copy_names_to_titles(),
            # Rename column-type-derived field names back to the
            # mapping's declared names.
            self.rename([(self.ct_to_fn(f['columnType']), f['name'])
                         for f in self.config.get(CONFIG_MODEL_MAPPING)
                         if f.get('columnType') is not None]),
            update_resource(RESOURCE_NAME, path='out.csv'),
            # *[
            #     set_type(
            #         self.ct_to_fn(f['columnType']),
            #         columnType=f['columnType'],
            #         **fieldOptions.get(f['columnType'], {}),
            #         resources=RESOURCE_NAME,
            #         on_error=ignore
            #     )
            #     for f in self.config.get(CONFIG_MODEL_MAPPING)
            #     if f.get('columnType') is not None
            # ],
            # NOTE(review): when primaryKey is empty this appends None to
            # steps — presumably Flow tolerates None steps; confirm.
            set_primary_key(primaryKey, resources=RESOURCE_NAME)
            if len(primaryKey) else None
            # printer()
        ]
        f = Flow(*steps)
        return f
def process_stack_demand(stack):
    """Populate *stack* with demand-score cards (one card set per category),
    built from 'demand.xlsx' and cached under 'stack:demand'."""

    def collect_cats():
        # Header column whose values encode category/subcategory structure.
        F = 'כלל המדגם'

        def f(rows):
            cat = None
            for row in rows:
                if F in row:
                    v = row[F]
                    # 'סך הכל <cat>' rows set the current category;
                    # '--- <subcat>' rows are emitted tagged with it.
                    if v.startswith('סך הכל '):
                        cat = v[7:]
                    elif v.startswith('--- '):
                        # Skip the 'no breakdown' catch-all rows.
                        if not v.endswith('ללא פירוט'):
                            subcat = v[4:]
                            row['category'] = cat
                            row['subcategory'] = subcat
                            yield row
                else:
                    # Rows without the header column pass through untouched.
                    yield row
        return DF.Flow(
            DF.add_field('category', 'string', resources=-1),
            DF.add_field('subcategory', 'string', resources=-1),
            f,
            DF.delete_fields([F], resources=-1),
        )

    def fix_nones(row):
        # Missing demand percentages are treated as zero.
        row['demand_pct'] = row['demand_pct'] or 0

    key = 'stack:demand'
    try:
        demand_stacks = _cache.get(key)
    except KeyError:
        demand_stacks = DF.Flow(
            DF.load('demand.xlsx', infer_strategy=DF.load.INFER_STRINGS,
                    headers=2),
            collect_cats(),
            DF.update_schema(-1, missingValues=['--']),
            # Melt the per-neighborhood columns ('<name> (X)') into
            # (neighborhood, demand_pct) rows.
            DF.unpivot(
                unpivot_fields=[dict(
                    name='(.+) \\([A-Z]\\)',
                    keys=dict(
                        neighborhood='\\1'
                    ),
                )],
                extra_keys=[dict(
                    name='neighborhood',
                    type='string'
                )],
                extra_value=dict(
                    name='demand_pct',
                    type='number'
                ),
                resources=-1
            ),
            DF.validate(),
            DF.duplicate('demand', 'demand_stacks'),
            # Per (category, subcategory): the maximum demand, used to
            # scale each neighborhood's score below.
            DF.join_with_self('demand', ['category', 'subcategory'], dict(
                category=None,
                subcategory=None,
                max_demand=dict(name='demand_pct', aggregate='max')
            )),
            DF.join(
                'demand', ['category', 'subcategory'],
                'demand_stacks', ['category', 'subcategory'],
                dict(
                    max_demand=None
                )
            ),
            fix_nones,
            DF.add_field('display', 'string',
                         lambda r: '{:.0f}%'.format(r['demand_pct'] * 100)),
            DF.add_field('value', 'number', lambda r: r['demand_pct']),
            # Score scaled to 0..6 relative to the category maximum.
            DF.add_field('score', 'number',
                         lambda r: r['demand_pct'] / r['max_demand'] * 6),
            DF.delete_fields(['demand_pct', 'max_demand']),
            DF.sort_rows('{score}', reverse=True),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_display=r['display'],
                score_value=float(r['value']),
                geometry_score=float(r['score']),
            )),
            # Collapse to one row per (category, subcategory) with the
            # array of per-neighborhood scores.
            DF.join_with_self('demand_stacks', ['category', 'subcategory'], dict(
                category=None,
                subcategory=None,
                scores=dict(aggregate='array'),
            )),
            DF.add_field('card', 'object',
                         lambda r: dict(
                             title='ביקוש ל{}'.format(r['subcategory']),
                             content='',
                             scores=r['scores'],
                             test='demand__{category}__{subcategory}'.format(**r).replace(' ', '_')
                         )),
            # One row per category, carrying all its subcategory cards.
            DF.join_with_self('demand_stacks', ['category'], dict(
                category=None,
                cards=dict(name='card', aggregate='array'),
            )),
            DF.add_field('name', 'string',
                         lambda r: 'demand.{}'.format(r['category']).replace(' ', '_')),
        ).results()[0][0]
        _cache.set(key, demand_stacks)

    # Pick the card set matching this stack's name and attach it.
    cards = [s for s in demand_stacks if s['name'] == stack['name']][0]['cards']
    stack.update(dict(
        layout='scores',
        currentField='neighborhood',
        map=True
    ))
    stack.setdefault('cards', []).extend(cards)
def process_demographics(stack):
    """Populate *stack* with demographic cards (adults, kids, teenagers,
    elderly, immigrants), built from demographics/*.csv and cached under
    'stack:demographics'."""
    key = 'stack:demographics'
    try:
        demographics_cards = _cache.get(key)
    except KeyError:
        def add_source():
            # Tag every row with the name of the resource it came from,
            # so map_to_cards can tell the source files apart.
            def f(rows):
                for row in rows:
                    row['source'] = rows.res.name
                    yield row
            return DF.Flow(
                DF.add_field('source', 'string'),
                f
            )

        def map_to_cards():
            # (source resource, raw kind values) -> canonical kind name.
            MAP = {
                ("דו''ח אג''ס לפי עולים וותיקים",
                    ("סה''כ עולים",)
                 ): 'immigrants',
                ("דו''ח אג''ס לפי קבוצות גיל",
                    ('0-5', '6-12')
                 ): 'kids',
                ("דו''ח אג''ס לפי קבוצות גיל",
                    ('13-17',)
                 ): 'teenagers',
                ("דו''ח אג''ס לפי קבוצות גיל",
                    ('60-64', '65-69', '70-74', '75-120')
                 ): 'elderly',
                ("דו''ח אג''ס לפי קבוצות גיל",
                    ('18-21', '22-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59')
                 ): 'adults',
            }

            def f(rows):
                for row in rows:
                    for (source, kinds), kind in MAP.items():
                        if row['source'] == source and row['kind'] in kinds:
                            row['kind'] = kind
                    yield row
            return f

        # stat-area id -> neighborhood title lookup.
        s2n = dict(
            (int(stat_area), f['properties']['title'])
            for f in get_neighborhood_features()
            for stat_area in f['properties']['stat_areas']
        )
        # kind -> (card title, card content, display order).
        MAP2 = dict(
            adults=('אוכלוסיה בוגרת', 'גברים ונשים בין גיל 18 ל-60', 0),
            kids=('ילדים', 'תינוקות וילדים עד גיל 12', 1),
            teenagers=('בני נוער', 'נערים ונערות עד גיל 18', 2),
            elderly=('הגיל השלישי', 'גברים ונשים מעל גיל 60', 3),
            immigrants=('עולים לישראל', 'תושבים שאינם ילידי ישראל', 4),
        )
        demographics_cards = DF.Flow(
            *[
                DF.load(f, headers=4)
                for f in glob.glob('demographics/*.csv')
            ],
            DF.add_field('stat_id', 'string', lambda r: r["אג''ס"]),
            DF.add_field('total', 'number', lambda r: r.get("סה''כ")),
            DF.delete_fields(["אג''ס", "סה''כ "]),
            # Melt the per-kind columns into (kind, value) rows.
            DF.unpivot([dict(
                name="([-'א-ת0-9 ].+)",
                keys=dict(
                    kind=r'\1'
                )
            )], [dict(
                name='kind',
                type='string'
            )], dict(
                name='value',
                type='number'
            )),
            DF.validate(),
            add_source(),
            map_to_cards(),
            DF.concatenate(dict(
                total=[],
                value=[],
                kind=[],
                stat_id=[]
            )),
            DF.add_field('neighborhood', 'string',
                         lambda r: s2n.get(int(r['stat_id']))),
            # Drop stat areas with no known neighborhood.
            DF.filter_rows(lambda r: r['neighborhood']),
            # Sum values per (neighborhood, kind).
            DF.join_with_self('concat', ['neighborhood', 'kind'], dict(
                neighborhood=None,
                kind=None,
                total=dict(aggregate='sum'),
                value=dict(aggregate='sum'),
            )),
            DF.duplicate('concat', 'maxes'),
            DF.join_with_self('concat', ['neighborhood'],
                              dict(neighborhood=None, total=None)),
            DF.join('concat', ['neighborhood'], 'maxes', ['neighborhood'], dict(
                total=None,
            )),
            DF.add_field('score_value', 'number', lambda r: r['value']),  # /r['total']
            DF.sort_rows('{score_value}', reverse=True),
            DF.duplicate('maxes', 'demographics'),
            # Per-kind maximum, used to scale the geometry score to 0..6.
            DF.join_with_self('maxes', ['kind'],
                              dict(kind=None,
                                   max_value=dict(name='score_value', aggregate='max'))),
            DF.join('maxes', ['kind'], 'demographics', ['kind'],
                    dict(max_value=None)),
            DF.add_field('geometry_score', 'number',
                         lambda r: 6*r['score_value']/r['max_value']),
            DF.add_field('score_display', 'string',
                         lambda r: '{:,} ({:.0f}%)'.format(
                             r['value'], 100*r['score_value']/r['total'])),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_value=float(r['score_value']),
                score_display=r['score_display'],
                geometry_score=float(r['geometry_score']),
            )),
            # Collapse to one card per kind with its score array.
            DF.join_with_self('demographics', ['kind'], dict(
                kind=None,
                scores=dict(aggregate='array'),
            )),
            DF.add_field('title', 'string', lambda r: MAP2[r['kind']][0]),
            DF.add_field('content', 'string', lambda r: MAP2[r['kind']][1]),
            DF.add_field('order', 'integer', lambda r: MAP2[r['kind']][2]),
            DF.sort_rows('{order}'),
            DF.delete_fields(['kind']),
        ).results()[0][0]
        _cache.set(key, demographics_cards)

    # features = [
    #     dict(type='Feature', geometry=r['geometry'], properties=dict(title=r['neighborhoods'][0]))
    #     for r in DF.Flow(
    #         DF.load('geo/stat-areas/stat-areas/datapackage.json'),
    #     ).results()[0][0]
    # ]
    # geometry=dict(type='FeatureCollection', features=features)

    stack.update(dict(
        map=True,
        scheme='green',
        currentField='neighborhood',
        layout='scores',
        # geometry=geometry
    ))
    stack.setdefault('cards', []).extend(demographics_cards)
'Criminal Damage 2008-09', 'Criminal Damage 2009-10', 'Criminal Damage 2010-11', 'Criminal Damage 2011-12', 'Criminal Damage 2012-13', 'Criminal Damage 2013-14', 'Criminal Damage 2014-15', 'Criminal Damage 2015-16', 'Criminal Damage 2016-17', 'Drugs 1999-00', 'Drugs 2000-01', 'Drugs 2001-02', 'Drugs 2002-03', 'Drugs 2003-04', 'Drugs 2004-05', 'Drugs 2005-06', 'Drugs 2006-07', 'Drugs 2007-08', 'Drugs 2008-09', 'Drugs 2009-10', 'Drugs 2010-11', 'Drugs 2011-12', 'Drugs 2012-13', 'Drugs 2013-14', 'Drugs 2014-15', 'Drugs 2015-16', 'Drugs 2016-17', 'Other Notifiable Offences 1999-00', 'Other Notifiable Offences 2000-01', 'Other Notifiable Offences 2001-02', 'Other Notifiable Offences 2002-03', 'Other Notifiable Offences 2003-04', 'Other Notifiable Offences 2004-05', 'Other Notifiable Offences 2005-06', 'Other Notifiable Offences 2006-07', 'Other Notifiable Offences 2007-08', 'Other Notifiable Offences 2008-09', 'Other Notifiable Offences 2009-10', 'Other Notifiable Offences 2010-11', 'Other Notifiable Offences 2011-12', 'Other Notifiable Offences 2012-13', 'Other Notifiable Offences 2013-14', 'Other Notifiable Offences 2014-15', 'Other Notifiable Offences 2015-16', 'Other Notifiable Offences 2016-17' ]), set_format_and_name_crime, unpivot(unpivot_fields, extra_keys, extra_value), remove_duplicates, dump_to_path()).process()
def flow(parameters):
    """Build a Flow running a single unpivot step configured entirely by
    the processor's *parameters* dict."""
    fields_to_unpivot = parameters.get('unpivot')
    key_fields = parameters.get('extraKeyFields')
    value_field = parameters.get('extraValueField')
    step = unpivot(fields_to_unpivot, key_fields, value_field,
                   resources=parameters.get('resources'))
    return Flow(step)
def data_pull_csv():
    """Download the JHU COVID time-series CSVs, melt the date columns into
    long form, join confirmed/recovered counts onto the deaths resource and
    dump the combined resource to RAW_OUTPUT_CSV."""
    # Date columns look like 'm/d/yy'; each becomes a (Date, Case) row.
    date_columns = [{
        "name": r"([0-9]+\/[0-9]+\/[0-9]+)",
        "keys": {
            "Date": r"\1"
        }
    }]
    date_key = [{"name": "Date", "type": "string"}]
    case_value = {"name": "Case", "type": "number"}

    steps = [
        load(f"{BASE_URL}{CONFIRMED}"),
        load(f"{BASE_URL}{RECOVERED}"),
        load(f"{BASE_URL}{DEATH}"),
        unpivot(date_columns, date_key, case_value),
        # Normalize date separators so they can be parsed below.
        find_replace([{
            "name": "Date",
            "patterns": [{
                "find": "/",
                "replace": "-"
            }]
        }]),
        to_normal_date,
        set_type("Date", type="date", format="%d-%m-%y", resources=None),
        set_type("Case", type="number", resources=None),
        # Fold confirmed counts into the deaths resource...
        join(
            source_name="time_series_19-covid-Confirmed",
            source_key=["Province/State", "Country/Region", "Date"],
            source_delete=True,
            target_name="time_series_19-covid-Deaths",
            target_key=["Province/State", "Country/Region", "Date"],
            fields=dict(Confirmed={
                "name": "Case",
                "aggregate": "first"
            }),
        ),
        # ...then recovered counts.
        join(
            source_name="time_series_19-covid-Recovered",
            source_key=["Province/State", "Country/Region", "Date"],
            source_delete=True,
            target_name="time_series_19-covid-Deaths",
            target_key=["Province/State", "Country/Region", "Date"],
            fields=dict(Recovered={
                "name": "Case",
                "aggregate": "first"
            }),
        ),
        # Rename the remaining Case column to Deaths.
        add_computed_field(
            target={
                "name": "Deaths",
                "type": "number"
            },
            operation="format",
            with_="{Case}",
        ),
        delete_fields(["Case"]),
        update_resource(
            "time_series_19-covid-Deaths",
            name="time-series-19-covid-combined",
            path=RAW_OUTPUT_CSV,
        ),
        dump_to_path(),
    ]
    Flow(*steps).results()[0]
def london_gva(link):
    """Load the London GVA workbook at *link* (sheet 3), unpivot it using
    the module-level field definitions, dedupe, and dump to disk."""
    pipeline = Flow(
        load(link, sheet=3),
        filter_gva,
        unpivot(unpivoting_fields, extra_keys, extra_value),
        remove_duplicates,
        set_format_and_name,
        dump_to_path(),
        printer(num_rows=1),
    )
    pipeline.process()
row['year'] = 2020 return True return False datasets_flow = DF.Flow( *[transpose(sheet) for sheet in sheets], DF.unpivot( [{ 'name': '(' + '([-0-9 ]+' + '[א-ת ]+)' + '|' + '([א-ת ]*' + '[0-9/]{2,})' + '.+)', 'keys': { 'year': '\\1' } }], [{ 'name': 'year', 'type': 'string' }], { 'name': 'value', 'type': 'number' }, ), verify_unused_fields(), DF.concatenate(FIELD_MAPPING, target=dict(name='out')), fix_urls(['source_url']), ensure_chart_title(), fix_languages(), DF.add_field('order_index', 'integer'), lambda rows: ({
dict(name='field_name', patterns=[ dict(find='re-pattern-to-find', replace='re-pattern-to-replace-with'), ]) ]), {% endif %} {% if 'delete_fields' in processing %} delete_fields(['field_name']), # Pass a list of field names to delete from the data {% endif %} {% if 'set_type' in processing %} set_type('field_name', type='number', constraints=dict(minimum=3)), # There are quite a few options you can use here # Take a look at https://frictionlessdata.io/specs/table-schema/ # Or you can simply use validate() here instead {% endif %} {% if 'unpivot' in processing %} unpivot(unpivot_fields, extra_keys, extra_value), # See documentation on the meaning of each of these parameters {% endif %} {% if 'custom' in processing %} my_custom_processing, {% endif %} # Save the results add_metadata(name='{{slug}}', title='''{{title}}'''), {% if output in ('print', 'print_n_pkg') %} printer(), {% endif %} {% if output == 'list' %} {% endif %} {% if output in ('dp_csv', 'print_n_pkg') %} dump_to_path('{{slug}}'), {% endif %} {% if output == 'dp_csv_zip' %}