def test_validate():
    from dataflows import Flow, validate, set_type
    from dataflows.base.schema_validator import ignore

    data = [
        {'a': 1, 'b': 1},
        {'a': 2, 'b': 2},
        {'a': 3, 'b': 3},
        {'a': 4, 'b': 'a'},
    ]

    class on_error:
        def __init__(self):
            self.bad_row, self.bad_index = None, None

        def __call__(self, name, row, i, e):
            self.bad_row, self.bad_index = row, i
            return False

    # Schema validator
    handler = on_error()
    res, *_ = Flow(
        data,
        set_type('b', type='integer', on_error=ignore),
        validate(on_error=handler)
    ).results()
    assert len(res[0]) == 3
    assert handler.bad_row == {'a': 4, 'b': 'a'}
    assert handler.bad_index == 3

    # Field validator
    handler = on_error()
    res, *_ = Flow(
        data,
        set_type('b', type='integer', on_error=ignore),
        validate('a', lambda v: v < 4, on_error=handler)
    ).results()
    assert len(res[0]) == 3
    assert handler.bad_row == {'a': 4, 'b': 'a'}
    assert handler.bad_index == 3

    # Row validator
    handler = on_error()
    res, *_ = Flow(
        data,
        set_type('b', type='integer', on_error=ignore),
        validate(lambda v: v['a'] < 4, on_error=handler)
    ).results()
    assert len(res[0]) == 3
    assert handler.bad_row == {'a': 4, 'b': 'a'}
    assert handler.bad_index == 3
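# Note the handler contract implied by the assertions above: the on_error
# callback receives (resource_name, row, row_index, error); returning False
# drops the offending row (which is why only three rows survive), while
# returning True would keep it in the stream.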
def generate_package():
    package_flow = Flow(
        add_metadata(
            name="unicode-emojis",
            title="UTS #51 Unicode Emoji",
            description=(
                "List of emojis available from the Unicode Consortium. "
                "More information can be found in the Unicode® Technical Standard #51."
            ),
            sources=[
                {
                    "name": "unicode-emoji",
                    "path": "https://unicode.org/Public/emoji/latest/emoji-test.txt",
                    "title": "UTS #51 Unicode Emoji",
                },
            ],
            licenses=[
                {
                    "name": "ODC-PDDL-1.0",
                    "path": "http://opendatacommons.org/licenses/pddl/",
                    "title": "Open Data Commons Public Domain Dedication and License v1.0",
                }
            ],
            keywords=["unicode", "emojis", "emoji", "51", "standard", "uts"],
        ),
        load(load_source="data/emojis.csv", format="csv"),
        validate(),
        dump_to_path(),
    )
    package_flow.process()
def flow(*_):
    DF.Flow(
        DF.load(filename, name='welfare'),
        DF.add_field('activity_name', 'string', lambda r: r['שם השירות (ציבורי)']),
        DF.filter_rows(lambda r: r['activity_name']),
        DF.add_field(
            'activity_description', 'array',
            lambda r: [r['תיאור השירות (תיאור קצר)'] + '\n' + r['השירות (מטרת השירות)']]),
        DF.add_field(
            'history', 'array',
            lambda r: [
                dict(
                    year=2019,
                    unit=r['יחידה ארגונית נותנת השירות'].split('/')[0].strip(),
                    subunit=r['יחידה ארגונית נותנת השירות'].split('/')[1].strip(),
                    subsubunit=r['יחידה ארגונית נותנת השירות'].split('/')[1].strip(),
                )
            ]),
        DF.add_field('target_audience', 'array', splitter('אוכלוסייה')),
        DF.add_field('subject', 'array', splitter('תחום ההתערבות')),
        DF.add_field('intervention', 'array', splitter('אופן התערבות')),
        DF.select_fields(FIELDS),
        DF.add_field('publisher_name', 'string', 'משרד הרווחה'),
        DF.add_field('min_year', 'integer', 2019),
        DF.add_field('max_year', 'integer', 2019),
        DF.add_field('kind', 'string', 'gov_social_service'),
        DF.add_field('kind_he', 'string', 'שירות חברתי'),
        DF.printer(),
        DF.validate(),
        DF.dump_to_path('tmp/activities-welfare')
    ).process()
    return DF.Flow(
        DF.load('tmp/activities-welfare/datapackage.json'),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
def flow(*_):
    return DF.Flow(
        get_updated_sources(),
        DF.concatenate(fields=TENDER_MAPPING, target=dict(name='tenders')),
        DF.validate(),
        DF.filter_rows(lambda r: r['publication_id']),
        DF.add_field('tender_type', 'string',
                     lambda r: TENDER_KINDS[r['tender_type_he']],
                     **{'es:keyword': True}),
        DF.join_with_self(
            'tenders', KEY,
            dict((k, dict(aggregate='last'))
                 for k in list(TENDER_MAPPING.keys()) + ['tender_type'])),
        DF.set_type('publication_id', type='string', transform=str),
        DF.set_type('supplier_id', type='string', transform=str),
        DF.set_type('tender_id', type='string', transform=lambda v: v or 'none'),
        DF.set_type('.+_date', type='date', format='%d.%m.%Y',
                    on_error=DF.schema_validator.clear),
        DF.set_type('subjects', type='string',
                    transform=lambda v: ';'.join(x.strip() for x in v.split(',')) if v else ''),
        DF.set_type('claim_date', type='datetime',
                    transform=lambda v, field_name, row:
                        datetime.datetime.combine(v, row['claim_time'] or datetime.time(0))
                        if v else None),
        DF.set_type('tender_type_he', **{'es:keyword': True}),
        DF.delete_fields(['claim_time']),
        DF.add_field(
            'page_url', 'string',
            lambda r: f'https://mr.gov.il/ilgstorefront/he/p/{r["publication_id"]}'),
        DF.add_field('page_title', 'string', lambda r: r['description']),
        DF.add_field('reason', 'string', lambda r: r['regulation']),
        DF.add_field('documents', 'array', []),
        DF.add_field('contact', 'string'),
        DF.add_field('contact_email', 'string'),
        DF.validate(),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        DF.printer(),
    )
def test_unpivot_any_resources():
    from dataflows import Flow, unpivot, validate

    data1 = [
        dict(
            [('name', 'ike{}'.format(i))] +
            [(str(year), year + i) for year in range(1990, 2020, 10)]
        )
        for i in range(5)
    ]
    data2 = [
        dict(
            [('city', 'mike{}'.format(i))] +
            [(str(year), year + i) for year in range(2050, 2080, 10)]
        )
        for i in range(5)
    ]
    f = Flow(
        data1,
        data2,
        unpivot(
            [dict(name='([0-9]+)', keys=dict(year='\\1'))],
            [dict(name='year', type='integer')],
            dict(name='amount', type='integer')
        ),
        validate()
    )
    results, _, _ = f.results()
    assert results[0] == [
        dict(zip(['name', 'year', 'amount'], r))
        for r in [
            ['ike0', 1990, 1990], ['ike0', 2000, 2000], ['ike0', 2010, 2010],
            ['ike1', 1990, 1991], ['ike1', 2000, 2001], ['ike1', 2010, 2011],
            ['ike2', 1990, 1992], ['ike2', 2000, 2002], ['ike2', 2010, 2012],
            ['ike3', 1990, 1993], ['ike3', 2000, 2003], ['ike3', 2010, 2013],
            ['ike4', 1990, 1994], ['ike4', 2000, 2004], ['ike4', 2010, 2014],
        ]
    ]
    assert results[1] == [
        dict(zip(['city', 'year', 'amount'], r))
        for r in [
            ['mike0', 2050, 2050], ['mike0', 2060, 2060], ['mike0', 2070, 2070],
            ['mike1', 2050, 2051], ['mike1', 2060, 2061], ['mike1', 2070, 2071],
            ['mike2', 2050, 2052], ['mike2', 2060, 2062], ['mike2', 2070, 2072],
            ['mike3', 2050, 2053], ['mike3', 2060, 2063], ['mike3', 2070, 2073],
            ['mike4', 2050, 2054], ['mike4', 2060, 2064], ['mike4', 2070, 2074],
        ]
    ]
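# The unpivot pattern '([0-9]+)' above matches every year-named column, and
# keys=dict(year='\\1') routes the captured column name into the new 'year'
# field while the cell value lands in 'amount'; that is why both resources
# come out as (name/city, year, amount) triplets despite different headers.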
def conference_csv():
    flow = Flow(
        # Load inputs
        load(
            od19_base + od19_feedback,
            name='feedback',
            format='csv',
        ),
        load(
            od19_base + od19_analysis,
            name='analysis',
            format='csv',
        ),
        # Process them
        set_type("Anzahl.*", type='integer', resources='analysis'),
        delete_fields([
            "Anzahl Auflistung",
            ".*\\(Formel\\)",
            ".*Duplikate",
        ], resources='analysis'),
        not_empty_groupcol,
        # Save the results
        add_metadata(
            name='opendatach19',
            title='''Opendata.ch/2019 Forum''',
            licenses=[{
                "name": "ODC-PDDL-1.0",
                "path": "http://opendatacommons.org/licenses/pddl/",
                "title": "Open Data Commons Public Domain Dedication and License v1.0"
            }],
            maintainers=[{
                "name": "Oleg Lavrovsky",
                "web": "https://datalets.ch/"
            }],
            views=[{
                "name": "Groups",
                "resources": ["analysis"],
                "spec": {
                    "group": "Alle Bedürfnisse",
                    "series": ["Anzahl Auflistung (Zahl)"],
                    "type": "bar"
                },
                "specType": "simple",
                "title": "Topic counts"
            }]),
        printer(),
        validate(),
        dump_to_path('data/opendatach19'),
    )
    flow.process()
def test_update_schema():
    from dataflows import Flow, printer, update_schema, validate

    f = Flow(
        [['a', '-'], ['a', 0]],
        update_schema(-1, missingValues=['-']),
        validate(),
        printer()
    )
    results, dp, stats = f.results()
    print(dp.descriptor)
    assert results[0] == [
        dict(col0='a', col1=None),
        dict(col0='a', col1=0),
    ]
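# Note: update_schema(-1, missingValues=['-']) patches the inferred schema of
# the last resource before validation, so '-' cells are treated as missing
# values and surface as None; that is why the first row comes out as col1=None.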
def flow(parameters):
    resources = parameters.get('resources')
    regex = parameters.get('regex', True)
    if 'types' in parameters:
        return Flow(*[
            set_type(name, resources=resources, regex=regex, **options)
            if options is not None
            else delete_fields([name], resources=resources)
            for name, options in parameters['types'].items()
        ])
    else:
        return Flow(validate())
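# A usage sketch for the processor above. This parameters shape is inferred
# from its logic, not from a documented schema: a 'types' entry whose options
# are None deletes the field, anything else is forwarded to set_type; the
# field names here are hypothetical.
parameters = {
    'resources': None,            # apply to every resource
    'regex': False,               # match field names literally, not as regexes
    'types': {
        'age': {'type': 'integer'},   # hypothetical field: cast to integer
        'internal_notes': None,       # hypothetical field: delete entirely
    },
}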
def test_validate():
    import pytest
    from dataflows import Flow, validate, set_type, printer, ValidationError, exceptions

    def adder(row):
        row['a'] += 0.5
        row['a'] = str(row['a'])

    f = Flow(
        (dict(a=x) for x in range(10)),
        set_type('a', type='integer'),
        adder,
        validate(),
        printer()
    )
    with pytest.raises(exceptions.ProcessorError) as excinfo:
        f.process()
    assert isinstance(excinfo.value.cause, ValidationError)
def test_validate():
    from dataflows import Flow, validate, set_type, printer, ValidationError

    def adder(row):
        row['a'] += 0.5
        row['a'] = str(row['a'])

    f = Flow(
        (dict(a=x) for x in range(10)),
        set_type('a', type='integer'),
        adder,
        validate(),
        printer()
    )
    try:
        _ = f.process()
        assert False
    except ValidationError:
        pass
def test_set_type_resources():
    from dataflows import Flow, set_type, validate

    f = Flow(
        [dict(a=str(i)) for i in range(10)],
        [dict(b=str(i)) for i in range(10)],
        [dict(c='0_' + str(i)) for i in range(10)],
        set_type('a', resources='res_[1]', type='integer'),
        set_type('b', resources=['res_2'], type='integer'),
        set_type('[cd]', resources=-1, type='number', groupChar='_'),
        validate()
    )
    results, dp, stats = f.results()
    print(dp.descriptor)
    assert results[0][1]['a'] == 1
    assert results[1][3]['b'] == 3
    assert results[2][8]['c'] == 8.0
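# Three resource selectors are exercised above: 'res_[1]' is treated as a
# regex (matching the auto-named res_1), ['res_2'] is an explicit name list,
# and -1 addresses the last resource by position. groupChar='_' tells the
# number parser that '_' is a grouping character, so '0_8' parses as 8.0.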
def flow(*_):
    return DF.Flow(
        scraper(),
        DF.filter_rows(
            lambda row: row['page_title'] and row['page_title'].startswith('קול קורא'),
            resources=-1),
        DF.set_type('start_date', type='date', format='%d/%m/%Y', resources=-1),
        DF.set_type('claim_date', type='datetime', format='%d/%m/%Y', resources=-1),
        calculate_publication_id(9),
        DF.validate(),
        DF.update_resource(-1, name='negev_galil', **{PROP_STREAMING: True}),
    )
def clean_data(filename: str, location: str) -> None:
    """Clean and validate data with `dataflows`, creating data packages
    in the process, one for each file."""
    global FILE_NAME
    FILE_NAME = f"{location}-{filename}"
    clean_directory, _, processing_directory = set_location_dirs(location)
    exported_file = f"{clean_directory}/{filename}"
    _ = Flow(
        load(
            f"{processing_directory}/{filename}.csv",
            name=FILE_NAME,
        ),
        change_path,
        add_field("NameFIPS", "string"),
        concat_name_columns,
        delete_fields(["Name", "FIPS"]),
        set_type("Data", type="any"),
        validate(),
        dump_to_path(exported_file),
    ).process()[1]
def flow(*_):
    return DF.Flow(
        scraper(),
        DF.filter_rows(
            lambda row: row['page_title'] and row['page_title'].startswith('קול קורא'),
            resources=-1),
        page_parser(),
        DF.add_field('decision', 'string',
                     default=lambda row: row['parsed']['decision'], resources=-1),
        DF.add_field('start_date', 'date', format='%d/%m/%Y',
                     default=lambda row: row['parsed']['start_date'], resources=-1),
        DF.add_field('claim_date', 'datetime', format='%d/%m/%Y',
                     default=lambda row: row['parsed']['claim_date'], resources=-1),
        DF.add_field('documents', 'array',
                     default=lambda row: row['parsed']['documents'], resources=-1),
        DF.delete_fields(['parsed'], resources=-1),
        calculate_publication_id(9),
        DF.validate(),
        DF.update_resource(-1, name='negev_galil', **{PROP_STREAMING: True}),
    )
def flow(self):
    if len(self.errors) == 0:
        primaryKey = [
            self.ct_to_fn(f) for f in self.config.get(CONFIG_PRIMARY_KEY)
        ]
        fieldOptions = {}
        dataTypes = dict(
            (ct['name'], dict(ct.get('options', {}), type=ct['dataType']))
            for ct in self.config.get(CONFIG_TAXONOMY_CT)
            if 'dataType' in ct
        )
        for mf in self.config.get(CONFIG_MODEL_MAPPING):
            ct = mf.get('columnType')
            name = mf['name']
            fieldOptions[name] = {}
            if ct is not None:
                fieldOptions[name].update(dataTypes.get(ct, {}))
            fieldOptions[name].update(mf.get('options', {}))
            fieldOptions[name]['columnType'] = ct
        extraFieldDefs = self.join_mapping_taxonomy('extra', fieldOptions)
        normalizeFieldDef = self.join_mapping_taxonomy('normalize', fieldOptions)
        unpivotFields = [
            dict(
                name=f['name'],
                keys=f['normalize'],
            )
            for f in self.config.get(CONFIG_MODEL_MAPPING)
            if 'normalize' in f
        ]
        if len(normalizeFieldDef) > 0:
            normalizeFieldDef = normalizeFieldDef[0]
        else:
            normalizeFieldDef = None
        steps = [
            self.create_fdp(),
            self.datetime_handler(),
            self.set_consts(fieldOptions),
            validate(on_error=ignore),
        ] + ([
            unpivot(unpivotFields, extraFieldDefs, normalizeFieldDef,
                    regex=False, resources=RESOURCE_NAME),
        ] if normalizeFieldDef else []) + [
            self.copy_names_to_titles(),
            self.rename([
                (self.ct_to_fn(f['columnType']), f['name'])
                for f in self.config.get(CONFIG_MODEL_MAPPING)
                if f.get('columnType') is not None
            ]),
            update_resource(RESOURCE_NAME, path='out.csv'),
            # *[
            #     set_type(
            #         self.ct_to_fn(f['columnType']),
            #         columnType=f['columnType'],
            #         **fieldOptions.get(f['columnType'], {}),
            #         resources=RESOURCE_NAME,
            #         on_error=ignore
            #     )
            #     for f in self.config.get(CONFIG_MODEL_MAPPING)
            #     if f.get('columnType') is not None
            # ],
            set_primary_key(primaryKey, resources=RESOURCE_NAME) if len(primaryKey) else None,
            # printer()
        ]
        f = Flow(*steps)
        return f
def flow(parameters, *_):

    def take_first(field):
        def f(row):
            if field in row and isinstance(row[field], list):
                row[field] = row[field][0]
        return Flow(
            f,
            set_type(field, type='string'),
        )

    def datetime_to_date(field):
        def f(row):
            if row.get(field):
                row[field] = row[field].date()
        return Flow(
            f,
            set_type(field, type='date'),
        )

    def approve(parameters):
        def func(row):
            if parameters.get('filter-out') is None:
                return True
            bad_phrase = parameters['filter-out']
            for f in ('page_title', 'description'):
                if row.get(f) and bad_phrase in row[f]:
                    return False
            return True
        return func

    return Flow(
        fetcher(parameters),
        concatenate(dict(
            page_title=['Title'],
            publication_id=['ItemId'],
            tender_id=['ItemUniqueId'],
            publisher=['OfficeDesc'],
            start_date=['PublishDate'],
            claim_date=['LastDate'],
            decision=['StatusDesc'],
            description=['Description'],
            last_update_date=['UpdateDate'],
            base_url=['BaseUrl'],
            url_name=['UrlName'],
            tender_type_he=['PublicationTypeDesc'],
        ), resources=-1),
        add_field('tender_type', 'string',
                  default=parameters['tender_type'], resources=-1),
        take_first('publisher'),
        take_first('tender_type_he'),
        add_field('page_url', 'string',
                  default=lambda row: 'https://www.gov.il/he{base_url}{url_name}'.format(**row)),
        # delete_fields(['base_url', 'url_name']),
        filter_rows(approve(parameters)),
        set_type('publication_id', type='integer'),
        set_type('start_date', type='datetime', format=DATE_FMT),
        set_type('last_update_date', type='datetime', format=DATE_FMT),
        set_type('claim_date', type='datetime', format=DATE_FMT),
        datetime_to_date('last_update_date'),
        datetime_to_date('start_date'),
        set_primary_key(['publication_id', 'tender_type', 'tender_id']),
        dedup(),
        update_resource(-1, **parameters.pop('resource')),
        update_resource(-1, **{'dpp:streaming': True}),
        validate(),
    )
def process_demographics(stack):
    key = 'stack:demographics'
    try:
        demographics_cards = _cache.get(key)
    except KeyError:
        def add_source():
            def f(rows):
                for row in rows:
                    row['source'] = rows.res.name
                    yield row
            return DF.Flow(
                DF.add_field('source', 'string'),
                f
            )

        def map_to_cards():
            MAP = {
                ("דו''ח אג''ס לפי עולים וותיקים",
                 ("סה''כ עולים",)): 'immigrants',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('0-5', '6-12')): 'kids',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('13-17',)): 'teenagers',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('60-64', '65-69', '70-74', '75-120')): 'elderly',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('18-21', '22-24', '25-29', '30-34', '35-39',
                  '40-44', '45-49', '50-54', '55-59')): 'adults',
            }

            def f(rows):
                for row in rows:
                    for (source, kinds), kind in MAP.items():
                        if row['source'] == source and row['kind'] in kinds:
                            row['kind'] = kind
                    yield row
            return f

        s2n = dict(
            (int(stat_area), f['properties']['title'])
            for f in get_neighborhood_features()
            for stat_area in f['properties']['stat_areas']
        )
        MAP2 = dict(
            adults=('אוכלוסיה בוגרת', 'גברים ונשים בין גיל 18 ל-60', 0),
            kids=('ילדים', 'תינוקות וילדים עד גיל 12', 1),
            teenagers=('בני נוער', 'נערים ונערות עד גיל 18', 2),
            elderly=('הגיל השלישי', 'גברים ונשים מעל גיל 60', 3),
            immigrants=('עולים לישראל', 'תושבים שאינם ילידי ישראל', 4),
        )
        demographics_cards = DF.Flow(
            *[
                DF.load(f, headers=4)
                for f in glob.glob('demographics/*.csv')
            ],
            DF.add_field('stat_id', 'string', lambda r: r["אג''ס"]),
            DF.add_field('total', 'number', lambda r: r.get("סה''כ")),
            DF.delete_fields(["אג''ס", "סה''כ "]),
            DF.unpivot([dict(
                name="([-'א-ת0-9 ].+)",
                keys=dict(kind=r'\1')
            )], [dict(
                name='kind', type='string'
            )], dict(
                name='value', type='number'
            )),
            DF.validate(),
            add_source(),
            map_to_cards(),
            DF.concatenate(dict(
                total=[], value=[], kind=[], stat_id=[]
            )),
            DF.add_field('neighborhood', 'string', lambda r: s2n.get(int(r['stat_id']))),
            DF.filter_rows(lambda r: r['neighborhood']),
            DF.join_with_self('concat', ['neighborhood', 'kind'], dict(
                neighborhood=None,
                kind=None,
                total=dict(aggregate='sum'),
                value=dict(aggregate='sum'),
            )),
            DF.duplicate('concat', 'maxes'),
            DF.join_with_self('concat', ['neighborhood'], dict(neighborhood=None, total=None)),
            DF.join('concat', ['neighborhood'], 'maxes', ['neighborhood'], dict(
                total=None,
            )),
            DF.add_field('score_value', 'number', lambda r: r['value']),  # /r['total']
            DF.sort_rows('{score_value}', reverse=True),
            DF.duplicate('maxes', 'demographics'),
            DF.join_with_self('maxes', ['kind'], dict(
                kind=None, max_value=dict(name='score_value', aggregate='max'))),
            DF.join('maxes', ['kind'], 'demographics', ['kind'], dict(max_value=None)),
            DF.add_field('geometry_score', 'number',
                         lambda r: 6 * r['score_value'] / r['max_value']),
            DF.add_field('score_display', 'string',
                         lambda r: '{:,} ({:.0f}%)'.format(
                             r['value'], 100 * r['score_value'] / r['total'])),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_value=float(r['score_value']),
                score_display=r['score_display'],
                geometry_score=float(r['geometry_score']),
            )),
            DF.join_with_self('demographics', ['kind'], dict(
                kind=None,
                scores=dict(aggregate='array'),
            )),
            DF.add_field('title', 'string', lambda r: MAP2[r['kind']][0]),
            DF.add_field('content', 'string', lambda r: MAP2[r['kind']][1]),
            DF.add_field('order', 'integer', lambda r: MAP2[r['kind']][2]),
            DF.sort_rows('{order}'),
            DF.delete_fields(['kind']),
        ).results()[0][0]
        _cache.set(key, demographics_cards)

    # features = [
    #     dict(type='Feature', geometry=r['geometry'],
    #          properties=dict(title=r['neighborhoods'][0]))
    #     for r in DF.Flow(
    #         DF.load('geo/stat-areas/stat-areas/datapackage.json'),
    #     ).results()[0][0]
    # ]
    # geometry = dict(type='FeatureCollection', features=features)
    stack.update(dict(
        map=True,
        scheme='green',
        currentField='neighborhood',
        layout='scores',
        # geometry=geometry
    ))
    stack.setdefault('cards', []).extend(demographics_cards)
        ' ', '_', 'Country', '2017', '2018', '2019', '-',
        'Q417', '1Q18', '2Q18', '3Q18', '4Q18',
        '1Q19', '2Q19', '3Q19', '4Q19'
    ]),
    load(
        load_source='https://www.opec.org/opec_web/static_files_project/media/downloads/publications/MOMR%20Appendix%20Tables%20(April%202020).xlsx',
        format='xls',
        sheet=5,
        skip_rows=[1, 2, 3, 4, 5, 6],
        headers=[
            ' ', 'Country', '2016', '2017', '2018', '3Q19', '4Q19', '2019',
            'Change 19/18', '1Q20', '2Q20', '3Q20', '4Q20', 'Change 20/19'
        ]),
    load(
        load_source='https://www.opec.org/opec_web/static_files_project/media/downloads/publications/MOMR%20Appendix%20Tables%20(April%202020).xlsx',
        format='xls',
        sheet=6,
        skip_rows=[1, 2, 3, 4, 5, 6],
        headers=[
            ' ', 'Country', '2017', '2018', '2019', 'Change 19/18',
            '2Q19', '3Q19', '4Q19', '1Q20', 'Feb20', 'Mar20', 'Change Mar/Feb'
        ]),
    rename_resources,
    validate(),
    printer(),
    dump_to_path('opec'),
)
oil_prices.process()
def process_stack_demand(stack):

    def collect_cats():
        F = 'כלל המדגם'

        def f(rows):
            cat = None
            for row in rows:
                if F in row:
                    v = row[F]
                    if v.startswith('סך הכל '):
                        cat = v[7:]
                    elif v.startswith('--- '):
                        if not v.endswith('ללא פירוט'):
                            subcat = v[4:]
                            row['category'] = cat
                            row['subcategory'] = subcat
                            yield row
                else:
                    yield row
        return DF.Flow(
            DF.add_field('category', 'string', resources=-1),
            DF.add_field('subcategory', 'string', resources=-1),
            f,
            DF.delete_fields([F], resources=-1),
        )

    def fix_nones(row):
        row['demand_pct'] = row['demand_pct'] or 0

    key = 'stack:demand'
    try:
        demand_stacks = _cache.get(key)
    except KeyError:
        demand_stacks = DF.Flow(
            DF.load('demand.xlsx', infer_strategy=DF.load.INFER_STRINGS, headers=2),
            collect_cats(),
            DF.update_schema(-1, missingValues=['--']),
            DF.unpivot(
                unpivot_fields=[dict(
                    name='(.+) \\([A-Z]\\)',
                    keys=dict(neighborhood='\\1'),
                )],
                extra_keys=[dict(name='neighborhood', type='string')],
                extra_value=dict(name='demand_pct', type='number'),
                resources=-1
            ),
            DF.validate(),
            DF.duplicate('demand', 'demand_stacks'),
            DF.join_with_self('demand', ['category', 'subcategory'], dict(
                category=None,
                subcategory=None,
                max_demand=dict(name='demand_pct', aggregate='max')
            )),
            DF.join(
                'demand', ['category', 'subcategory'],
                'demand_stacks', ['category', 'subcategory'],
                dict(max_demand=None)
            ),
            fix_nones,
            DF.add_field('display', 'string', lambda r: '{:.0f}%'.format(r['demand_pct'] * 100)),
            DF.add_field('value', 'number', lambda r: r['demand_pct']),
            DF.add_field('score', 'number', lambda r: r['demand_pct'] / r['max_demand'] * 6),
            DF.delete_fields(['demand_pct', 'max_demand']),
            DF.sort_rows('{score}', reverse=True),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_display=r['display'],
                score_value=float(r['value']),
                geometry_score=float(r['score']),
            )),
            DF.join_with_self('demand_stacks', ['category', 'subcategory'], dict(
                category=None,
                subcategory=None,
                scores=dict(aggregate='array'),
            )),
            DF.add_field('card', 'object', lambda r: dict(
                title='ביקוש ל{}'.format(r['subcategory']),
                content='',
                scores=r['scores'],
                test='demand__{category}__{subcategory}'.format(**r).replace(' ', '_')
            )),
            DF.join_with_self('demand_stacks', ['category'], dict(
                category=None,
                cards=dict(name='card', aggregate='array'),
            )),
            DF.add_field('name', 'string',
                         lambda r: 'demand.{}'.format(r['category']).replace(' ', '_')),
        ).results()[0][0]
        _cache.set(key, demand_stacks)

    cards = [s for s in demand_stacks if s['name'] == stack['name']][0]['cards']
    stack.update(dict(
        layout='scores',
        currentField='neighborhood',
        map=True
    ))
    stack.setdefault('cards', []).extend(cards)
{ "name": "graph", "title": "10 year US Government Bond Yields (Monthly granuarlity)", "specType": "simple", "spec": {"type": "line","group": "Date","series": ["Rate"]} } ], readme=readme() ), load( load_source='http://www.federalreserve.gov/datadownload/Output.aspx?rel=H15&series=0809abf197c17f1ff0b2180fe7015cc3&lastObs=&from=&to=&filetype=csv&label=include&layout=seriescolumn', skip_rows=[i+1 for i in range(6)], headers=['Date', 'Rate'], format='csv', name='monthly' ), set_type('Date', type='date', format='any', descriptor='Date in ISO 8601'), set_type('Rate', type='number', description='Percent per year'), update_resource('monthly', **{'path':'data/monthly.csv', 'dpp:streaming': True}), validate(), dump_to_path() ) def flow(parameters, datapackage, resources, stats): return bond_us if __name__ == '__main__': bond_us.process()
        resources=resource_names[1:]),
    add_computed_field(fields=[{
        "operation": "format",
        "target": "Region",
        "with": "{Region, subregion, country or area *}"
    }, {
        "operation": "format",
        "target": "Country Code",
        "with": "{Country code}"
    }, {
        "operation": "format",
        "target": "Year",
        "with": "{year}"
    }, {
        "operation": "format",
        "target": "Population",
        "with": "{population}"
    }]),
    delete_fields(fields=[
        'Region, subregion, country or area *',
        'Country code',
        'year',
        'population'
    ]),
    validate())


def flow(parameters, datapackage, resources, stats):
    return population_estimates


if __name__ == '__main__':
    population_estimates.process()
def postflow(self):
    return Flow(validate(on_error=schema_validator.drop))
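# For context, a minimal sketch of how the built-in on_error handlers differ,
# assuming dataflows.base.schema_validator exports ignore/drop/clear (the same
# handlers used elsewhere in this file); the expected outputs below are a
# best-guess reading of their semantics, not verified against a specific
# dataflows version:
from dataflows import Flow, set_type, validate
from dataflows.base.schema_validator import ignore, drop, clear

data = [{'a': '1'}, {'a': 'oops'}, {'a': '3'}]

# drop: rows failing validation are removed from the stream
rows, *_ = Flow(data, set_type('a', type='integer', on_error=ignore),
                validate(on_error=drop)).results()
# expected: rows[0] == [{'a': 1}, {'a': 3}]

# clear: the offending value is nulled out and the row is kept
rows, *_ = Flow(data, set_type('a', type='integer', on_error=ignore),
                validate(on_error=clear)).results()
# expected: rows[0] == [{'a': 1}, {'a': None}, {'a': 3}]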
    set_type(
        'Rate', resources='quarterly', type='number',
        description='Quarterly average yield from British Government Securities, 10 year Nominal Par Yield'),
    set_type('Year', resources='annual', type='date', format='any'),
    set_type(
        'Rate', resources='annual', type='number',
        description='Annual average yield from British Government Securities, 10 year Nominal Par Yield'),
    update_resource('quarterly', **{
        'path': 'data/quarterly.csv',
        'dpp:streaming': True
    }),
    update_resource('annual', **{
        'path': 'data/annual.csv',
        'dpp:streaming': True
    }),
    validate(),
    dump_to_path())


def flow(parameters, datapackage, resources, stats):
    return bond_uk


if __name__ == '__main__':
    bond_uk.process()
    fix_doc_id,
    fix_links('objective'),
    fix_links('objective__en'),
    fix_links('objective__ar'),
    DF.add_field('year', 'integer', default=cur_year),
    DF.set_type('org_name', **{'es:title': True}),
    DF.set_type('org_name__ar', **{'es:title': True}),
    DF.set_type('alt_names', **{'es:itemType': 'string', 'es:title': True}),
    *[
        DF.set_type(f, **{'es:index': False})
        for f in [
            'org_website', 'org_facebook', 'org_phone_number',
            'org_email_address', 'logo_url'
        ]
    ],
    DF.validate(),
)


def flow(*_):
    return DF.Flow(
        org_flow,
        es_dumper('orgs', REVISION, 'orgs_in_es')
    )


if __name__ == '__main__':
    DF.Flow(org_flow, DF.printer()).process()
    update_resource('brent-annual', **{
        'path': 'data/brent-annual.csv',
        'dpp:streaming': True
    }),
    update_resource('wti-daily', **{
        'path': 'data/wti-daily.csv',
        'dpp:streaming': True
    }),
    update_resource('wti-weekly', **{
        'path': 'data/wti-weekly.csv',
        'dpp:streaming': True
    }),
    update_resource('wti-monthly', **{
        'path': 'data/wti-monthly.csv',
        'dpp:streaming': True
    }),
    update_resource('wti-annual', **{
        'path': 'data/wti-annual.csv',
        'dpp:streaming': True
    }),
    format_date,
    remove_empty_rows,
    set_type('Date', resources=None, type='date', format='any'),
    validate(),
    dump_to_path('data'))


def flow(parameters, datapackage, resources, stats):
    return oil_prices


if __name__ == '__main__':
    oil_prices.process()