def flow(*args):
    """Assemble the flow that revisions the loaded records and exports them.

    The steps, in order: load the ``unique_records`` and
    ``search_app_records`` resources, add the ``__revision`` and status
    fields, run ``manage_revisions``, dump each resource into the
    ``DB_TABLE`` SQL table in update mode, add the ``rev_*`` date fields,
    run ``set_revisions``, optionally filter on ``__next_update_days``,
    add a date range, dump to ``data/publications_for_es``, print and mark
    the output with ``dpp:streaming``.

    Args:
        *args: Only the number of positional arguments is inspected; more
            than three selects the plain-text printer format.
            NOTE(review): presumably that is how the dpp runner invokes
            this function -- confirm.

    Returns:
        The assembled ``Flow`` (not yet processed).
    """
    is_dpp = len(args) > 3
    table_format = 'plain' if is_dpp else 'html'

    steps = [
        load('data/unique_records_full/datapackage.json',
             resources=['unique_records']),
        load('data/app_records_full/datapackage.json',
             resources=['search_app_records']),
        add_field('__revision', 'integer', REVISION),
    ]
    steps.extend(
        add_field(status['name'], status['type']) for status in STATUS_FIELDS
    )
    steps.append(manage_revisions)

    # Both resources are written to the same table, one dump step each.
    for resource_name in ['unique_records', 'search_app_records']:
        table_spec = {
            'resource-name': resource_name,
            'mode': 'update',
            'update_keys': KEY_FIELDS,
        }
        steps.append(dump_to_sql({DB_TABLE: table_spec}, DATAFLOWS_DB_ENGINE))

    for name in ['last_updated_at', 'last_modified_at', 'created_at']:
        steps.append(add_field(f'rev_{name}', 'date'))
    steps.append(set_revisions)

    # When the setting is falsy, None is handed to Flow in the filter's place.
    if FILTER_NEXT_UPDATE_DAYS:
        next_update_filter = filter_rows(equals=[{
            '__next_update_days': FILTER_NEXT_UPDATE_DAYS
        }])
    else:
        next_update_filter = None
    steps.append(next_update_filter)

    steps.append(add_date_range())
    steps.append(dump_to_path('data/publications_for_es'))
    steps.append(printer(tablefmt=table_format, num_rows=1, fields=['doc_id']))
    steps.append(update_resource(None, **{'dpp:streaming': True}))

    return Flow(*steps)
def flow(*_):
    """Build the social-services activities flow.

    Takes the rows produced by ``services()``, drops working columns,
    applies the ``splitter``/``floater``/``fix_*``/``add_current_*`` steps,
    adds the constant and computed fields used for indexing, and writes
    the result to SQL (``all_activities``, then ``activities`` after rows
    flagged ``deleted`` are removed) and to disk.

    Positional arguments are accepted and ignored.

    Returns:
        The assembled ``DF.Flow`` (not yet processed).
    """
    keyword_only = {'es:keyword': True}
    hidden_keyword = {'es:keyword': True, 'es:exclude': True}

    dropped_columns = ['__tab', 'complete', 'non_suppliers', 'non_tenders',
                       'notes']
    # Columns handed to splitter() / floater(); what those helpers do is
    # defined outside this function.
    split_columns = ('target_audience', 'subject', 'intervention',
                     'target_age_group')
    float_columns = ('beneficiaries', 'budgetItems', 'manualBudget',
                     'tenders', 'suppliers', 'virtue_of_table')

    return DF.Flow(
        services(),
        DF.delete_fields(dropped_columns),
        # publisher_name mirrors each row's 'office' value.
        DF.add_field('publisher_name', 'string', lambda row: row['office'],
                     **keyword_only),
        *(splitter(column) for column in split_columns),
        *(floater(column) for column in float_columns),
        fix_suppliers(),
        fix_tenders(),
        add_current_budget(),
        add_current_beneficiaries(),
        DF.add_field('min_year', 'integer', 2020),
        DF.add_field('max_year', 'integer', CURRENT_YEAR),
        DF.add_field('kind', 'string', 'gov_social_service',
                     **hidden_keyword),
        DF.add_field('kind_he', 'string', 'שירות חברתי', **hidden_keyword),
        DF.set_type('name', **{'es:title': True}),
        DF.set_type('description',
                    **{'es:itemType': 'string', 'es:boost': True}),
        DF.add_field('score', 'number', get_score,
                     **{'es:score-column': True}),
        DF.set_primary_key(['kind', 'id']),
        DF.update_resource(-1, name='activities', **{'dpp:streaming': True}),
        # First SQL dump happens before deleted rows are filtered out.
        DF.dump_to_sql(dict(all_activities={'resource-name': 'activities'})),
        DF.filter_rows(lambda row: not row['deleted']),
        DF.delete_fields(['deleted']),
        DF.dump_to_path('/var/datapackages/activities/social_services'),
        DF.dump_to_sql(dict(activities={'resource-name': 'activities'})),
    )
def flow(self):
    """Build the flow that writes per-source rows to ``configurations``.

    A snippet format string is assembled from the taxonomy title and the
    titles of the taxonomy's key fields. The source resource is duplicated
    into ``configurations``, de-duplicated on ``_source`` plus the key
    fields, decorated with snippet/config/fields columns, collapsed to one
    row per ``_source`` and dumped to SQL in update mode.

    Returns:
        The assembled ``Flow`` (not yet processed).
    """
    taxonomy = self.context.taxonomy
    taxonomy_config = taxonomy.config

    # One '<title>: "{<field>}",' fragment per key field that has a
    # matching column type; key fields without a match are skipped.
    snippet_parts = [taxonomy.title + ' עבור:']
    key_fields = taxonomy_config['key-fields']
    for key in key_fields:
        column_type = next(
            (ct for ct in taxonomy.column_types if ct['name'] == key),
            None,
        )
        if column_type is not None:
            snippet_parts.append(
                '%s: "{%s}",' % (column_type['title'], key.replace(':', '-'))
            )
    snippet_format = ' '.join(snippet_parts)

    # Field names are used with ':' replaced by '-' from here on.
    key_columns = [key.replace(':', '-') for key in key_fields]
    dedup_columns = ['_source'] + key_columns

    TARGET = 'configurations'
    saved_config = self.config._unflatten()
    # The stored configuration always has publish.allowed set to False.
    saved_config.setdefault('publish', {})['allowed'] = False

    return Flow(
        duplicate(RESOURCE_NAME, TARGET),
        join_with_self(
            TARGET,
            dedup_columns,
            {column: {} for column in dedup_columns},
        ),
        add_computed_field(
            [
                dict(operation='format', target='snippets',
                     with_=snippet_format),
                dict(operation='constant', target='key_values', with_=None),
            ],
            resources=TARGET,
        ),
        add_field('config', 'object', saved_config, resources=TARGET),
        add_field('fields', type='object',
                  default=self.collate_values(key_columns),
                  resources=TARGET),
        # Collapse to one row per source, gathering key_values and
        # snippets into arrays.
        join_with_self(
            TARGET,
            ['_source'],
            dict(
                source=dict(name='_source'),
                config={},
                key_values=dict(aggregate='array'),
                snippets=dict(aggregate='array'),
            ),
        ),
        set_type('source', type='string'),
        set_type('config', type='object'),
        set_type('key_values', type='array'),
        set_type('snippets', type='array'),
        set_primary_key(['source']),
        dump_to_sql(
            {TARGET: {'resource-name': TARGET, 'mode': 'update'}},
            engine=self.lazy_engine(),
        ),
    )
def flow(*_):
    """Build the flow that merges activity datapackages into ``activities``.

    Loads the social-services datapackage, concatenates it into a single
    ``activities`` resource with a fixed set of columns, annotates the
    schema with the ``es:*`` options, adds a ``score`` column and writes
    the result to disk and to SQL.

    Positional arguments are accepted and ignored.

    Returns:
        The assembled ``DF.Flow`` (not yet processed).
    """
    default_budget = 1000

    def score(row):
        """Return the first history entry's allocated budget, in thousands.

        Rows with no usable first history entry (``history`` missing,
        ``None`` or empty) get the same default as rows whose entry has no
        ``allocated_budget``, instead of raising and aborting the flow.
        """
        history = row.get('history')
        first_entry = (history[0] if history else None) or {}
        return (first_entry.get('allocated_budget') or default_budget) / 1000

    hidden_keyword = {'es:keyword': True, 'es:exclude': True}
    history_schema = dict(fields=[
        dict(name='year', type='integer'),
        dict(name='unit', type='string'),
        dict(name='subunit', type='string'),
        dict(name='subsubunit', type='string'),
        dict(name='allocated_budget', type='integer'),
        dict(name='num_beneficiaries', type='string',
             **{'es:index': False}),
    ])

    return DF.Flow(
        DF.load(
            '/var/datapackages/activities/social_services/social_services/datapackage.json'
        ),
        DF.concatenate(
            dict(kind=[],
                 kind_he=[],
                 activity_name=[],
                 activity_description=[],
                 publisher_name=[],
                 history=[],
                 max_year=[],
                 min_year=[]),
            dict(name='activities', path='activities.csv')),
        DF.set_primary_key(['kind', 'publisher_name', 'activity_name']),
        DF.set_type('activity_name', **{'es:title': True}),
        DF.set_type('activity_description',
                    **{'es:itemType': 'string', 'es:boost': True}),
        DF.set_type('kind', **hidden_keyword),
        DF.set_type('kind_he', **hidden_keyword),
        DF.set_type('publisher_name', **{'es:keyword': True}),
        DF.set_type('history',
                    **{'es:itemType': 'object', 'es:schema': history_schema}),
        DF.add_field('score', 'number', score, **{'es:score-column': True}),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        DF.dump_to_path('/var/datapackages/activities/all'),
        DF.dump_to_sql(dict(activities={'resource-name': 'activities'})))
def test_dump_to_sql():
    """Dump ``data`` to a sqlite table and check it reads back unchanged.

    The read-back goes through an explicit connection and ``text()``
    rather than ``Engine.execute`` with a raw string, which SQLAlchemy
    deprecated in 1.4 and removed in 2.0. Rows are turned into dicts by
    zipping them with the result's column keys, which works on both the
    1.x and 2.0 row types.
    """
    import os

    from dataflows import Flow, printer, dump_to_sql
    from sqlalchemy import create_engine, text

    db_url = 'sqlite:///out/test.db'
    # sqlite cannot create the database file if its directory is missing.
    os.makedirs('out', exist_ok=True)

    f = Flow(
        data, printer(),
        dump_to_sql(dict(output_table={'resource-name': 'res_1'}),
                    engine=db_url))
    f.process()

    # Check validity
    engine = create_engine(db_url)
    try:
        with engine.connect() as connection:
            rows = connection.execute(text('select * from output_table'))
            columns = list(rows.keys())
            result = [dict(zip(columns, row)) for row in rows]
    finally:
        # Release the pooled connection so the sqlite file is not held open.
        engine.dispose()
    assert result == data
def flow(*_):
    """Build the flow that publishes the top-level social-service units.

    Each row from ``all_units()`` has its ``path`` expanded into
    ``office``/``unit``/``subunit``/``subsubunit`` columns plus a
    breadcrumbs string and an id, gets constant year/kind/score fields,
    and is kept only when it has no ``unit`` and is not the one excluded
    office. The result is written to disk and to SQL.

    Positional arguments are accepted and ignored.

    Returns:
        The assembled ``DF.Flow`` (not yet processed).
    """
    def path_level(depth):
        """Return a row function yielding path[depth], or None if too short."""
        return lambda row: (row['path'][depth]
                            if len(row['path']) > depth else None)

    keyword_only = {'es:keyword': True}
    excluded = {'es:exclude': True}
    hidden_keyword = {'es:keyword': True, 'es:exclude': True}
    level_names = ('office', 'unit', 'subunit', 'subsubunit')

    return DF.Flow(
        all_units(),
        # One column per path level, in path order.
        *(DF.add_field(name, 'string', path_level(depth), **keyword_only)
          for depth, name in enumerate(level_names)),
        # An empty path falls back to the literal defaults below.
        DF.add_field('breadcrumbs', 'string',
                     lambda row: '/'.join(row['path']) or 'משרדי הממשלה',
                     **excluded),
        DF.add_field('id', 'string',
                     lambda row: '__'.join(row['path']) or 'main',
                     **excluded),
        DF.delete_fields(['path']),
        DF.add_field('min_year', 'integer', 2020),
        DF.add_field('max_year', 'integer', 2020),
        DF.add_field('kind', 'string', 'gov_social_service_unit',
                     **hidden_keyword),
        DF.add_field('kind_he', 'string', 'שירותים חברתיים במיקור חוץ',
                     **hidden_keyword),
        DF.add_field('score', 'number', 1000, **{'es:score-column': True}),
        DF.set_primary_key(['kind', 'id']),
        DF.update_resource(-1, name='units', **{'dpp:streaming': True}),
        # Ensure we only have the main offices
        DF.filter_rows(lambda row: row['unit'] is None),
        DF.filter_rows(lambda row: row['office'] != 'משרד העליה והקליטה'),
        DF.dump_to_path('/var/datapackages/units/social_services'),
        DF.dump_to_sql(dict(units={'resource-name': 'units'})))
def flow(parameters):
    """Wrap ``dump_to_sql`` in a single-step ``Flow`` driven by parameters.

    Args:
        parameters: Mapping with a required ``'tables'`` entry and
            optional ``'engine'`` (defaults to ``'env://DPP_DB_ENGINE'``),
            ``'updated_column'`` and ``'updated_id_column'`` entries; the
            optional column entries default to ``None``.

    Returns:
        A ``Flow`` containing only the configured ``dump_to_sql`` step.

    Raises:
        KeyError: If ``parameters`` has no ``'tables'`` entry.
    """
    tables = parameters['tables']
    engine = parameters.get('engine', 'env://DPP_DB_ENGINE')
    updated_column = parameters.get("updated_column")
    updated_id_column = parameters.get("updated_id_column")

    sql_dump = dump_to_sql(
        tables,
        engine=engine,
        updated_column=updated_column,
        updated_id_column=updated_id_column,
    )
    return Flow(sql_dump)