def main(request_times_api_url):
    metadata = {}
    stats = collections.defaultdict(int)
    instance_stats = collections.defaultdict(int)
    Flow(
        get_builds(request_times_api_url, stats),
        aggregate_instance_stats(instance_stats, metadata),
        dump_to_path('data/aggregate_request_times')
    ).process()
    Flow(
        get_instance_stats_data(instance_stats, metadata),
        dump_to_path('data/aggregate_request_times_stats'),
        printer(num_rows=1)
    ).process()
def main(package_url):
    jenkins_user_token = ckan_manager.get_jenkins_token('ckan-cloud-operator-jenkins-creds')
    package_url = package_url.replace(
        'https://', 'https://{}:{}@'.format(*jenkins_user_token))
    stats_rows = []
    Flow(
        load(package_url),
        aggregate_stats(stats_rows),
        dump_to_path('data/aggregate_access_logs')
    ).process()
    Flow(
        (row for row in stats_rows),
        dump_to_path('data/aggregate_access_logs_stats'),
        printer()
    ).process()
def test_example_75():
    from dataflows import Flow, load, dump_to_path

    def add_is_guitarist_column_to_schema(package):
        # Add a new field to the schema of the first resource
        package.pkg.descriptor['resources'][0]['schema']['fields'].append(dict(
            name='is_guitarist',
            type='boolean'
        ))
        # Must yield the modified datapackage
        yield package.pkg
        # Then pass the resources through unchanged
        yield from package

    def add_is_guitarist_column(row):
        row['is_guitarist'] = row['instrument'] == 'guitar'
        return row

    f = Flow(
        # Same source as above
        load('data/beatles.csv'),
        add_is_guitarist_column_to_schema,
        add_is_guitarist_column,
        dump_to_path('out/beatles_guitarists2')
    )
    _ = f.process()
def flow(*args):
    is_dpp = len(args) > 3
    return Flow(
        load('data/unique_records_full/datapackage.json',
             resources=['unique_records']),
        load('data/app_records_full/datapackage.json',
             resources=['search_app_records']),
        add_field('__revision', 'integer', REVISION),
        *(add_field(f['name'], f['type']) for f in STATUS_FIELDS),
        manage_revisions,
        *(dump_to_sql({DB_TABLE: {'resource-name': resource_name,
                                  'mode': 'update',
                                  'update_keys': KEY_FIELDS}},
                      DATAFLOWS_DB_ENGINE)
          for resource_name in ['unique_records', 'search_app_records']),
        *(add_field(f'rev_{name}', 'date')
          for name in ['last_updated_at', 'last_modified_at', 'created_at']),
        set_revisions,
        filter_rows(equals=[{'__next_update_days': FILTER_NEXT_UPDATE_DAYS}])
        if FILTER_NEXT_UPDATE_DAYS else None,
        add_date_range(),
        dump_to_path('data/publications_for_es'),
        printer(tablefmt='plain' if is_dpp else 'html', num_rows=1,
                fields=['doc_id']),
        update_resource(None, **{'dpp:streaming': True}))
def es_dumper(resource_name, revision, path):
    now = time.time()
    return DF.Flow(
        update_pk('doc_id'),
        DF.add_field('revision', 'integer', default=revision),
        DF.add_field('score', 'number', default=1),
        DF.add_field('create_timestamp', 'number', now),
        my_dump_to_es(
            indexes={
                'migdar__' + resource_name: [{'resource-name': resource_name,
                                              'revision': revision}]
            },
            mapper_cls=BoostingMappingGenerator,
            index_settings={'index.mapping.coerce': True},
            elasticsearch_options=dict(timeout=60)),
        DF.dump_to_path('data/{}'.format(path)),
        collate(revision),
        my_dump_to_es(
            indexes={
                'migdar__docs': [{'resource-name': resource_name,
                                  'revision': revision}]
            },
            mapper_cls=BoostingMappingGenerator,
            index_settings={'index.mapping.coerce': True}),
        DF.update_resource(None, **{'dpp:streaming': True}),
        DF.printer(),
    )
def flow(*args):
    is_dpp = len(args) > 3
    return Flow(
        load('https://migdar-internal-search.odata.org.il/__data/search_import/index.csv',
             encoding='utf-8', http_session=get_migdar_session()),
        update_resource('index', name='search_import_index',
                        path='search_import_index.csv'),
        load_from_gdrive_files,
        update_resource('search_import_index', name='search_import',
                        path='search_import.csv',
                        schema={'fields': [{'name': n, 'type': 'string'}
                                           for n in SEARCH_IMPORT_FIELD_NAMES]},
                        **{'dpp:streaming': True}),
        printer(num_rows=20, tablefmt='plain' if is_dpp else 'html',
                fields=['migdar_id', 'pubyear', 'title']),
        dump_to_path('data/search_import_from_gdrive'))
def join_unique_records(*args):
    is_dpp = len(args) > 3
    return Flow(
        load('data/search_import_from_gdrive/datapackage.json',
             resources=['search_import']),
        load('data/search_results/unique_records.csv',
             resources=['unique_records']),
        set_type('migdar_id', type='string',
                 resources=['unique_records', 'search_import']),
        join(source_name='search_import', source_key=['migdar_id'],
             target_name='unique_records', target_key=['migdar_id'],
             fields={f'gd_{field}': {'name': field}
                     for field in SEARCH_IMPORT_FIELD_NAMES},
             full=False),
        printer(tablefmt='plain' if is_dpp else 'html', num_rows=1,
                fields=['migdar_id']),
        dump_to_path('data/unique_records_full'),
        update_resource(None, **{'dpp:streaming': True}))
def run_flow(datetime_format=None):
    Flow(
        [{'today': str(_today), 'now': str(_now)}],
        set_type('today', type='date'),
        set_type('now', type='datetime', format=datetime_format),
        dump_to_path('out/dump_dates')
    ).process()
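# A minimal invocation sketch for run_flow (hedged): `_today` and `_now` are
# module-level values defined elsewhere in the original source; the values and
# the format string below are assumptions for illustration only.
import datetime

_today = datetime.date.today()
_now = datetime.datetime.now()

# str(datetime.datetime.now()) looks like '2024-01-01 12:34:56.789012',
# so an explicit strptime-style format matching that shape is passed in.
run_flow('%Y-%m-%d %H:%M:%S.%f')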
def test_example_7():
    from dataflows import Flow, load, dump_to_path

    def add_is_guitarist_column(package):
        # Add a new field to the schema of the first resource
        package.pkg.descriptor['resources'][0]['schema']['fields'].append(
            dict(name='is_guitarist', type='boolean'))
        # Must yield the modified datapackage
        yield package.pkg
        # Now iterate on all resources
        resources = iter(package)
        beatles = next(resources)

        def f(row):
            row['is_guitarist'] = row['instrument'] == 'guitar'
            return row

        yield map(f, beatles)

    f = Flow(
        # Same source as above
        load('data/beatles.csv'),
        add_is_guitarist_column,
        dump_to_path('out/beatles_guitarists'))
    _ = f.process()
def test_example_8():
    from dataflows import Flow, load, dump_to_path

    def find_double_winners(package):
        # Remove the emmies resource - we're going to consume it now
        package.pkg.remove_resource('emmies')
        # Must yield the modified datapackage
        yield package.pkg
        # Now iterate on all resources
        resources = iter(package)
        # Emmies is the first - read all its data and create a set of winner names
        emmy = next(resources)
        emmy_winners = set(
            map(lambda x: x['nominee'], filter(lambda x: x['winner'], emmy)))
        # Oscars are next - filter rows based on the emmy winner set
        academy = next(resources)
        yield filter(lambda row: row['Winner'] and row['Name'] in emmy_winners,
                     academy)

    f = Flow(
        # Emmy award nominees and winners
        load('data/emmy.csv', name='emmies'),
        # Academy award nominees and winners
        load('data/academy.csv', encoding='utf8', name='oscars'),
        find_double_winners,
        dump_to_path('out/double_winners'))
    _ = f.process()
def test_example_5():
    from dataflows import Flow, set_type, dump_to_path

    f = Flow(
        country_population(),
        set_type('population', type='number', groupChar=','),
        dump_to_path('out/country_population'))
    _ = f.process()
def flow(*_):
    DF.Flow(
        DF.load(filename, name='welfare'),
        DF.add_field('activity_name', 'string',
                     lambda r: r['שם השירות (ציבורי)']),
        DF.filter_rows(lambda r: r['activity_name']),
        DF.add_field('activity_description', 'array',
                     lambda r: [r['תיאור השירות (תיאור קצר)'] + '\n' +
                                r['השירות (מטרת השירות)']]),
        DF.add_field('history', 'array', lambda r: [
            dict(
                year=2019,
                unit=r['יחידה ארגונית נותנת השירות'].split('/')[0].strip(),
                subunit=r['יחידה ארגונית נותנת השירות'].split('/')[1].strip(),
                subsubunit=r['יחידה ארגונית נותנת השירות'].split('/')[1].strip(),
            )
        ]),
        DF.add_field('target_audience', 'array', splitter('אוכלוסייה')),
        DF.add_field('subject', 'array', splitter('תחום ההתערבות')),
        DF.add_field('intervention', 'array', splitter('אופן התערבות')),
        DF.select_fields(FIELDS),
        DF.add_field('publisher_name', 'string', 'משרד הרווחה'),
        DF.add_field('min_year', 'integer', 2019),
        DF.add_field('max_year', 'integer', 2019),
        DF.add_field('kind', 'string', 'gov_social_service'),
        DF.add_field('kind_he', 'string', 'שירות חברתי'),
        DF.printer(),
        DF.validate(),
        DF.dump_to_path('tmp/activities-welfare')
    ).process()
    return DF.Flow(
        DF.load('tmp/activities-welfare/datapackage.json'),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
def generate_package():
    package_flow = Flow(
        add_metadata(
            name="unicode-emojis",
            title="UTS #51 Unicode Emoji",
            descriptor=(
                "List of emojis available from the Unicode Consortium. "
                "More information can be found in the Unicode® Technical Standard #51."
            ),
            sources=[
                {
                    "name": "unicode-emoji",
                    "path": "https://unicode.org/Public/emoji/latest/emoji-test.txt",
                    "title": "UTS #51 Unicode Emoji",
                },
            ],
            licenses=[
                {
                    "name": "ODC-PDDL-1.0",
                    "path": "http://opendatacommons.org/licenses/pddl/",
                    "title": "Open Data Commons Public Domain Dedication and License v1.0",
                }
            ],
            keywords=["unicode", "emojis", "emoji", "51", "standard", "uts"],
        ),
        load(load_source="data/emojis.csv", format="csv"),
        validate(),
        dump_to_path(),
    )
    package_flow.process()
def prepare_locations():
    prepare_addresses()
    return DF.Flow(
        DF.load('_cache_addresses/datapackage.json'),
        DF.add_field(
            'address', 'string',
            lambda r: '{} {}{}'.format(
                r['street_name'], r['house_number'], r['letter'] or '')),
        DF.add_field(
            'item', 'object',
            lambda r: dict(
                value=dict(lat=float(r['lat']),
                           lon=float(r['lon']),
                           arnona_zones=r['arnona_zones'],
                           שם=r['address']),
                display=r['address'])),
        DF.sort_rows('{house_number}'),
        DF.delete_fields([
            'house_number', 'letter', 'lat', 'lon', 'arnona_zones', 'address'
        ]),
        DF.join_with_self(
            'concat', ['street_name'],
            dict(display=dict(name='street_name'),
                 items=dict(name='item', aggregate='array'))),
        DF.add_field('sort_street_address', 'string',
                     lambda r: sort_street_address(r['display'])),
        DF.sort_rows('{sort_street_address}'),
        DF.delete_fields(['sort_street_address']),
        DF.printer(),
        DF.dump_to_path('_cache_locations'),
        DF.checkpoint('_cache_locations')
    ).results()[0][0]
def operator(name, params):
    connection_string = params['db_url']
    source_table = params['db_table']
    target_instance_name = params['target_instance_name']
    target_package_id = params['target_package_id']
    target_organization_id = params['target_organization_id']
    print('starting db_fetcher operator')
    print('source_table={} target_instance_name={} target_package_id={} target_organization_id={}'.format(
        source_table, target_instance_name, target_package_id, target_organization_id))
    with tempfile.TemporaryDirectory() as tempdir:
        csv_filename = target_package_id + '.csv'
        DF.Flow(
            DF.load(connection_string, table=source_table,
                    name=target_package_id,
                    infer_strategy=DF.load.INFER_PYTHON_TYPES),
            DF.update_resource(-1, path=csv_filename),
            DF.delete_fields(['_source']),
            DF.dump_to_path(tempdir)
        ).process()
        csv_filename = os.path.join(tempdir, csv_filename)
        print('{}, {:,} bytes'.format(csv_filename,
                                      os.stat(csv_filename).st_size))
        update_package(target_instance_name, target_organization_id,
                       target_package_id, target_package_id,
                       [('CSV', csv_filename)])
def get_secondary_chain(source_chain, secondary, num_secondaries, workdir):
    for step_idx, step in enumerate(source_chain):
        serverless_step_config = getattr(step, '__serverless_step', None)
        if serverless_step_config:
            wait_primary_step_complete(secondary, num_secondaries, step_idx, workdir)
            notify_complete = partial(notify_secondary_step_complete, step_idx,
                                      secondary, num_secondaries, workdir,
                                      serverless_step_config)
            print('secondary {}/{}: running step {} flow'.format(
                secondary, num_secondaries, step_idx))
            try:
                Flow(
                    load(PRIMARY_INPUT_DATAPACKAGE_FILE_TEMPLATE.format(
                        workdir=workdir, step_idx=step_idx)),
                    get_secondary_step(step, serverless_step_config, secondary,
                                       num_secondaries, step_idx, workdir),
                    dump_to_path(SECONDARY_OUTPUT_DATAPACKAGE_PATH_TEMPLATE.format(
                        workdir=workdir, secondary=secondary, step_idx=step_idx))
                ).process()
            except Exception as e:
                notify_complete(str(e))
                raise
            notify_complete(None)
    return [[]]
def judges_flow(out_path):
    return Flow(
        get_tribunals(),
        update_resource(['res_1'], name='tribunals', path='tribunals.csv'),
        checkpoint('judges_tribunals'),
        get_judges(),
        update_resource(['res_2'], name='judges_list', path='judges_list.csv'),
        set_type('Is_In_Dimus_List', resources=['judges_list'], type='boolean'),
        checkpoint('judges_judges_list'),
        join('tribunals', ['Tribunal_Code'], 'judges_list', ['Tribunal_Code'],
             fields={
                 'Tribunal_Type_Code': {},
                 'Tribunal_Arkaa_Code': {'name': 'Arkaa_Code'},
                 'Tribunal_District_Code': {'name': 'District_Code'},
                 'Tribunal_Name': {'name': 'Name'}
             }),
        fetch_judges_details,
        checkpoint('judges_details'),
        add_field('tribunal_type_name', 'string'),
        parse_judges_extra_details,
        checkpoint('judges_extra_details'),
        parse_judge_events,
        dump_to_path(out_path),
        printer(num_rows=1))
def prepare():
    for resource_name, load in loads:
        DF.Flow(
            load,
            # DF.printer(tablefmt='html'),
            DF.concatenate(FIELD_MAPPING,
                           dict(name=resource_name, path=resource_name + '.csv')),
            DF.set_type('activity_name', type='string',
                        constraints=dict(required=True),
                        on_error=DF.schema_validator.drop),
            DF.set_type('allocated_budget', type='number',
                        groupChar=',', bareNumber=False),
            DF.set_type('num_beneficiaries', type='number',
                        groupChar=',', bareNumber=False,
                        on_error=DF.schema_validator.ignore),
            fix_beneficiaries,
            DF.set_type('num_beneficiaries', type='string'),
            multiply_budget,
            fill_org_hierarchy,
            # DF.printer(tablefmt='html'),
            DF.dump_to_path('tmp/' + resource_name),
        ).process()
def list_instances():
    os.makedirs('data/list_instances', exist_ok=True)
    data = []
    Flow(
        (get_instance_row(instance)
         for instance in ckan_instance_manager.list_instances(full=True)),
        dump_to_json(data),
        dump_to_path('data/list_instances'),
        printer(num_rows=99999)
    ).process()
    with open('data/list_instances.json', 'w') as f:
        json.dump(data, f)
def test_load_from_package():
    from dataflows import dump_to_path, load

    Flow(
        [{'foo': 'bar'}],
        dump_to_path('data/load_from_package')
    ).process()

    ds = Flow(load('data/load_from_package/datapackage.json')).datastream()

    assert len(ds.dp.resources) == 1
    assert [list(res) for res in ds.res_iter] == [[{'foo': 'bar'}]]
def dump_print_flow(flow, dump_path, num_rows=1, fields=None, checkpoint_name=None):
    return Flow(
        flow,
        checkpoint(checkpoint_name) if checkpoint_name else None,
        dump_to_path(dump_path),
        printer(num_rows=num_rows, fields=fields)
    )
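# A minimal usage sketch for dump_print_flow (hedged): the wrapped step chain,
# the paths and the checkpoint name below are hypothetical.
from dataflows import Flow, load

dump_print_flow(
    Flow(load('data/example/datapackage.json')),  # any Flow or chain of steps
    'data/example_dump',
    num_rows=5,
    checkpoint_name='example_checkpoint',
).process()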
def flow(*_):
    return DF.Flow(
        DF.load('/var/datapackages/activities/social_services/datapackage.json'),
        DF.add_field('entity_id', 'string'),
        DF.add_field('soproc_supplier', 'boolean'),
        unwind(),
        DF.select_fields(['entity_id', 'soproc_supplier']),
        DF.dump_to_path('/var/datapackages/activities/social_services_suppliers'),
    )
def decp_processing():
    flow = Flow(
        # Load the CSV produced by the conversion from JSON
        load("decp.csv"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),

        # Sort by rootId and seq to prepare for building donneesActuelles
        sort_rows('{rootId}:{seq}', resources=0, reverse=True),
        donnees_actuelles,

        # rootId and seq can now be dropped
        delete_fields(["rootId", "seq"], resources=0, regex=False),
        sort_rows('{datePublicationDonnees}', resources=0, reverse=True),

        # New table dedicated to contracts, without supplier data
        print("Création de la table dédiée aux marchés..."),
        duplicate(source="decp", target_name="decp-sans-titulaires",
                  target_path="decp-sans-titulaires.csv", duplicate_to_end=True),
        delete_fields(["titulaire.id", "titulaire.denominationSociale",
                       "titulaire.typeIdentifiant"],
                      resources="decp-sans-titulaires", regex=False),
        set_primary_key(["uid"], resources="decp-sans-titulaires"),
        deduplicate(),

        # Load the previous DECP in CSV format, to extract the new data
        # print("Téléchargement des données tabulaires précédentes..."),
        # load("https://decp.info/db/decp.csv?_size=max&_dl=1", name="previous-decp"),
        # set_type("acheteur.id", type="string"),
        # set_type("titulaire.id", type="string"),
        # set_type("codeCPV", type="string"),
        # set_type("lieuExecution.code", type="string"),
        # delete_fields(["rowid"], resources="previous-decp", regex=False),
        # # print("Fusion des données tabulaires précédentes et des données d'aujourd'hui..."),
        # concatenate({}, target={"name": "decp-titulaires", "path": "decp-titulaires.csv"},
        #             resources=["decp", "previous-decp"]),

        # Load the previous supplier-specific data
        print("Chargement des données titulaires..."),
        load("decp-titulaires.csv", name="decp-titulaires"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),
        set_type("departement", type="string"),
        set_type("codeAPE", type="string"),
        print("Enregistrement des données sur le disque..."),
        dump_to_path("decp"))
    flow.process()
def conference_csv():
    flow = Flow(
        # Load inputs
        load(od19_base + od19_feedback, name='feedback', format='csv'),
        load(od19_base + od19_analysis, name='analysis', format='csv'),

        # Process them
        set_type("Anzahl.*", type='integer', resources='analysis'),
        delete_fields([
            "Anzahl Auflistung",
            ".*\\(Formel\\)",
            ".*Duplikate",
        ], resources='analysis'),
        not_empty_groupcol,

        # Save the results
        add_metadata(
            name='opendatach19',
            title='''Opendata.ch/2019 Forum''',
            licenses=[{
                "name": "ODC-PDDL-1.0",
                "path": "http://opendatacommons.org/licenses/pddl/",
                "title": "Open Data Commons Public Domain Dedication and License v1.0"
            }],
            maintainers=[{
                "name": "Oleg Lavrovsky",
                "web": "https://datalets.ch/"
            }],
            views=[{
                "name": "Groups",
                "resources": ["analysis"],
                "spec": {
                    "group": "Alle Bedürfnisse",
                    "series": ["Anzahl Auflistung (Zahl)"],
                    "type": "bar"
                },
                "specType": "simple",
                "title": "Topic counts"
            }]),
        printer(),
        validate(),
        dump_to_path('data/opendatach19'),
    )
    flow.process()
def main(instance_ids_or_names, approve_code):
    instance_ids_or_names = [
        i.strip() for i in instance_ids_or_names.split(',') if i.strip()
    ]
    approve_code = approve_code.strip()
    logs.info(instance_ids_or_names=instance_ids_or_names,
              approve_code=approve_code)
    Flow(
        delete_instances(instance_ids_or_names, approve_code),
        dump_to_path('data/delete_instances'),
        printer(num_rows=9999)
    ).process()
def prepare(self):
    self.ref_hash = md5(self.REF_DATAPACKAGE.encode('utf8')).hexdigest()
    self.key = self.__class__.__name__
    check = checkpoint(self.ref_hash)
    if not check.exists():
        Flow(
            load(self.REF_DATAPACKAGE),
            rename_last_resource(self.ref_hash),
            dump_to_path('.cache/{}'.format(self.ref_hash)),
            check
        ).process()
    logger.debug('DONE PREPARING %s', self.key)
def AFRR_Data():
    unpivoting_fields = [
        {'name': 'aFRR_DownActivated',
         'keys': {'product': 'aFRR_DownActivated'}},
        {'name': 'aFRR_UpActivated',
         'keys': {'product': 'aFRR_UpActivated'}}
    ]
    extra_keys = [{'name': 'product', 'type': 'string'}]
    extra_value = {'name': 'amount', 'type': 'number'}
    flow = Flow(
        # Load input - use the 'datastore_search_sql' API to fetch the last 1,000 rows:
        load('https://api.energidataservice.dk/datastore_search_sql?sql=select%20*%20from%20afrrreservesdk1%20order%20by%20"HourUTC"%20desc%20limit%201000',
             format="json",
             property="result.records",
             name="fact_afrr"),
        # Remove extra fields:
        delete_fields(fields=['_id', '_full_text', 'HourDK']),
        # Save the results
        checkpoint('afrr'),
        # Normalize/unpivot:
        unpivot(unpivoting_fields, extra_keys, extra_value),
        add_computed_field([
            dict(target=dict(name='PriceArea', type='string'),
                 operation='constant', with_='DK1'),
            dict(target=dict(name='PriceDKK', type='number'),
                 operation='constant', with_='dummy'),
            dict(target=dict(name='PriceEUR', type='number'),
                 operation='constant', with_='dummy')
        ]),
        add_price,
        delete_fields(fields=[
            'aFRR_DownPriceDKK', 'aFRR_DownPriceEUR',
            'aFRR_UpPriceDKK', 'aFRR_UpPriceEUR'
        ]),
        add_metadata(name='marketdata', title='Marketdata prototype'),
        update_resource(resources=None, mediatype='text/csv'),
        update_resource(
            resources='fact_afrr',
            title='Automatic Frequency Restoration Reserves',
            source='https://www.energidataservice.dk/dataset/afrrreservesdk1/resource_extract/0694e216-6713-4f84-9b98-7bb5bc11d80c'),
        printer(),
        dump_to_path('afrr_data'))
    flow.process()
def test_load_from_env_var():
    import os
    from dataflows import load, dump_to_path

    Flow(
        [{'foo': 'bar'}],
        dump_to_path('out/load_from_env_var')
    ).process()

    os.environ['MY_DATAPACKAGE'] = 'out/load_from_env_var/datapackage.json'
    results, dp, _ = Flow(load('env://MY_DATAPACKAGE')).results()

    assert len(dp.resources) == 1
    assert results == [[{'foo': 'bar'}]]
def flow(parameters, *_):
    year = parameters['year']
    return DF.Flow(
        DF.load(wrapper(year), format='csv',
                infer_strategy=DF.load.INFER_STRINGS,
                cast_strategy=DF.load.CAST_DO_NOTHING),
        DF.update_resource(None, **{
            'dpp:streaming': True,
            'name': 'supports',
            'path': 'data/supports.csv'
        }),
        DF.dump_to_path(f'/var/datapackages/supports/yearly-{year}'))
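# Hedged sketch: running this DPP-style entry point outside the pipeline
# runner; the parameter value is an assumption for illustration, and wrapper()
# plus write access to /var/datapackages are assumed to be available.
if __name__ == '__main__':
    flow({'year': 2020}).process()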
def update_dataset():
    flow = Flow(
        # Load inputs
        load(f'{BASE_URL}{CONFIRMED}'),
        load(f'{BASE_URL}{RECOVERED}'),
        load(f'{BASE_URL}{DEATH}'),
        checkpoint('load_data'),
        # Process them (if necessary)
        # Save the results
        add_metadata(name='csse_covid_19_time_series',
                     title='''csse_covid_19_time_series'''),
        printer(),
        dump_to_path(),
    )
    flow.process()
{ "name": "graph", "title": "10 year US Government Bond Yields (Monthly granuarlity)", "specType": "simple", "spec": {"type": "line","group": "Date","series": ["Rate"]} } ], readme=readme() ), load( load_source='http://www.federalreserve.gov/datadownload/Output.aspx?rel=H15&series=0809abf197c17f1ff0b2180fe7015cc3&lastObs=&from=&to=&filetype=csv&label=include&layout=seriescolumn', skip_rows=[i+1 for i in range(6)], headers=['Date', 'Rate'], format='csv', name='monthly' ), set_type('Date', type='date', format='any', descriptor='Date in ISO 8601'), set_type('Rate', type='number', description='Percent per year'), update_resource('monthly', **{'path':'data/monthly.csv', 'dpp:streaming': True}), validate(), dump_to_path() ) def flow(parameters, datapackage, resources, stats): return bond_us if __name__ == '__main__': bond_us.process()