def test_delete_field():
    from dataflows import delete_fields
    data2 = [
        dict(x=1, y='a', xx=True),
        dict(x=2, y='b', xx=True),
        dict(x=3, y='c', xx=True),
    ]
    f = Flow(
        data,
        delete_fields(['x'])
    )
    results, dp, _ = f.results()
    for i in results[0]:
        assert list(i.keys()) == ['y']
    assert dp.descriptor['resources'][0]['schema']['fields'] == \
        [dict(name='y', type='string', format='default')]

    f = Flow(
        data,
        data2,
        delete_fields(['x+'])
    )
    results, dp, _ = f.results()
    for res in results:
        for i in res:
            assert list(i.keys()) == ['y']
    assert dp.descriptor['resources'][0]['schema']['fields'] == \
        [dict(name='y', type='string', format='default')]

def prepare_locations():
    prepare_addresses()
    return DF.Flow(
        DF.load('_cache_addresses/datapackage.json'),
        DF.add_field(
            'address', 'string',
            lambda r: '{} {}{}'.format(
                r['street_name'], r['house_number'], r['letter'] or '')),
        DF.add_field(
            'item', 'object',
            lambda r: dict(
                value=dict(lat=float(r['lat']),
                           lon=float(r['lon']),
                           arnona_zones=r['arnona_zones'],
                           שם=r['address']),
                display=r['address'])),
        DF.sort_rows('{house_number}'),
        DF.delete_fields([
            'house_number', 'letter', 'lat', 'lon', 'arnona_zones', 'address'
        ]),
        DF.join_with_self(
            'concat', ['street_name'],
            dict(display=dict(name='street_name'),
                 items=dict(name='item', aggregate='array'))),
        DF.add_field('sort_street_address', 'string',
                     lambda r: sort_street_address(r['display'])),
        DF.sort_rows('{sort_street_address}'),
        DF.delete_fields(['sort_street_address']),
        DF.printer(),
        DF.dump_to_path('_cache_locations'),
        DF.checkpoint('_cache_locations')).results()[0][0]

def get_neighborhood_features():
    return DF.Flow(
        DF.load('neighborhoods.xlsx', name='stat-areas',
                deduplicate_headers=True),
        DF.add_field(
            'neighborhoods', 'array',
            lambda r: [v for k, v in r.items()
                       if v and k.startswith('neighborhood')]),
        DF.add_field('geometry', 'object',
                     lambda r: geometries[r['stat-area']]),
        DF.concatenate(
            dict(stat_area=['stat-area'], neighborhoods=[], geometry=[])),
        DF.update_resource(-1, name='stat-areas'),
        unwind_neighborhoods(),
        DF.join_with_self(
            'stat-areas', ['neighborhood'],
            dict(
                neighborhood=None,
                stat_areas=dict(name='stat_area', aggregate='array'),
                geometries=dict(name='geometry', aggregate='array'),
            )),
        DF.add_field('geometry', 'object',
                     lambda r: unite_geometries(r['geometries'])),
        DF.delete_fields(['geometries']),
        DF.update_resource(-1, name='neighborhoods'),
        DF.add_field(
            'properties', 'object',
            lambda r: dict(
                x=3, title=r['neighborhood'], stat_areas=r['stat_areas'])),
        DF.delete_fields(['neighborhood', 'stat_areas']),
        DF.checkpoint('_cache_neighborhoods')).results()[0][0]

def decp_processing():
    flow = Flow(
        # Load the CSV produced by the conversion from JSON
        load("decp.csv"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),

        # Sort by rootId and seq to prepare the creation of donneesActuelles
        sort_rows('{rootId}:{seq}', resources=0, reverse=True),
        donnees_actuelles,

        # rootId and seq can now be deleted
        delete_fields(["rootId", "seq"], resources=0, regex=False),
        sort_rows('{datePublicationDonnees}', resources=0, reverse=True),

        # New table dedicated to contracts, without supplier data
        print("Création de la table dédiée aux marchés..."),
        duplicate(source="decp",
                  target_name="decp-sans-titulaires",
                  target_path="decp-sans-titulaires.csv",
                  duplicate_to_end=True),
        delete_fields([
            "titulaire.id", "titulaire.denominationSociale",
            "titulaire.typeIdentifiant"
        ], resources="decp-sans-titulaires", regex=False),
        set_primary_key(["uid"], resources="decp-sans-titulaires"),
        deduplicate(),

        # Load the previous DECP in CSV format, to extract the new data
        # print("Téléchargement des données tabulaires précédentes..."),
        # load("https://decp.info/db/decp.csv?_size=max&_dl=1", name="previous-decp"),
        # set_type("acheteur.id", type="string"),
        # set_type("titulaire.id", type="string"),
        # set_type("codeCPV", type="string"),
        # set_type("lieuExecution.code", type="string"),
        # delete_fields(["rowid"], resources="previous-decp", regex=False),
        # #print("Fusion des données tabulaires précédentes et des données d'aujourd'hui..."),
        # concatenate({}, target={"name": "decp-titulaires", "path": "decp-titulaires.csv"}, resources=["decp", "previous-decp"]),

        # Load the previous data dedicated to the suppliers
        print("Chargement des données titulaires..."),
        load("decp-titulaires.csv", name="decp-titulaires"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),
        set_type("departement", type="string"),
        set_type("codeAPE", type="string"),
        print("Enregistrement des données sur le disque..."),
        dump_to_path("decp"))

    flow.process()

def AFRR_Data():
    unpivoting_fields = [{
        'name': 'aFRR_DownActivated',
        'keys': {
            'product': 'aFRR_DownActivated'
        }
    }, {
        'name': 'aFRR_UpActivated',
        'keys': {
            'product': 'aFRR_UpActivated'
        }
    }]
    extra_keys = [{'name': 'product', 'type': 'string'}]
    extra_value = {'name': 'amount', 'type': 'number'}

    flow = Flow(
        # Load inputs - using the 'datastore_search_sql' API, load the last 1,000 rows:
        load(
            'https://api.energidataservice.dk/datastore_search_sql?sql=select%20*%20from%20afrrreservesdk1%20order%20by%20"HourUTC"%20desc%20limit%201000',
            format="json",
            property="result.records",
            name="fact_afrr"),
        # Remove extra fields:
        delete_fields(fields=['_id', '_full_text', 'HourDK']),
        # Save the results
        checkpoint('afrr'),
        # Normalize/unpivot:
        unpivot(unpivoting_fields, extra_keys, extra_value),
        add_computed_field([
            dict(target=dict(name='PriceArea', type='string'),
                 operation='constant',
                 with_='DK1'),
            dict(target=dict(name='PriceDKK', type='number'),
                 operation='constant',
                 with_='dummy'),
            dict(target=dict(name='PriceEUR', type='number'),
                 operation='constant',
                 with_='dummy')
        ]),
        add_price,
        delete_fields(fields=[
            'aFRR_DownPriceDKK', 'aFRR_DownPriceEUR', 'aFRR_UpPriceDKK',
            'aFRR_UpPriceEUR'
        ]),
        add_metadata(name='marketdata', title='Marketdata prototype'),
        update_resource(resources=None, mediatype='text/csv'),
        update_resource(
            resources='fact_afrr',
            title='Automatic Frequency Restoration Reserves',
            source='https://www.energidataservice.dk/dataset/afrrreservesdk1/resource_extract/0694e216-6713-4f84-9b98-7bb5bc11d80c'
        ),
        printer(),
        dump_to_path('afrr_data'))

    flow.process()

def flow(*_):
    return Flow(
        update_resource(
            -1,
            name='maya_stakeholder_list',
            path="data/maya_stakeholder_list.csv",
        ),
        filter_by_type,
        rename_fields(RENAME_FIELDS),
        add_field('stakeholder_type', 'string'),
        add_fields(FIELDS + OPTIONAL_FIELDS, 'string'),
        add_fields(TABLE_FIELDS + OPTIONAL_TABLE_FIELDS, 'string'),
        add_fields(CORE_STAKE_HOLDER_FIELDS.values(), 'string'),
        validate,
        parse_document,
        delete_fields([
            'document', 'pdf', 'other', 'num_files', 'parser_version',
            'source', 's3_object_name'
        ]),
        fix_fields,
        set_type('CapitalPct', type='number'),
        set_type('VotePower', type='number'),
        set_type('CapitalPct_Dilul', type='number'),
        set_type('VotePower_Dilul', type='number'),
    )

def flow(*_):
    return Flow(
        update_resource(
            -1,
            name='maya_holdings_change',
            path="data/maya_holdings_change.csv",
        ),
        filter_by_type,
        add_fields(
            FIELDS + OPTIONAL_FIELDS + TABLE_FIELDS + OPTIONAL_TABLE_FIELDS,
            'string'),
        rename_fields(RENAME_FIELDS),
        fix_fields(FIELDS + OPTIONAL_FIELDS + TABLE_FIELDS +
                   OPTIONAL_TABLE_FIELDS),
        validate,
        parse_document,
        delete_fields([
            'document',
            'other',
            'num_files',
            'parser_version',
            'source',
            's3_object_name',
        ]),
    )

def split_keyword_list(new_fieldname, fieldname, delimiter=','):
    def splitter():
        def func(row):
            if row.get(fieldname):
                row[new_fieldname] = [
                    x.strip() for x in row[fieldname].split(delimiter)
                ]
            else:
                row[new_fieldname] = []
        return func

    steps = []
    if new_fieldname != fieldname:
        steps.append(add_field(new_fieldname, type='array'))
    steps.append(splitter())
    if new_fieldname != fieldname:
        steps.append(delete_fields([fieldname]))
    steps.append(
        set_type(new_fieldname, type='array', **{
            'es:itemType': 'string',
            'es:keyword': True
        }))
    return Flow(*steps)

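# A minimal usage sketch for split_keyword_list above (hypothetical rows and
# field names; assumes the same dataflows processors imported by that module):
from dataflows import Flow, printer

rows = [
    {'tags_raw': 'alpha, beta ,gamma'},
    {'tags_raw': None},
]

Flow(
    rows,
    split_keyword_list('tags', 'tags_raw'),  # adds 'tags' as an array field, drops 'tags_raw'
    printer(),
).process()
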
def broken_links_flow():
    return DF.Flow(
        *[
            DF.Flow(
                DF.load(URL_TEMPLATE.format(**c), name=c['name']),
                DF.add_field('__name', 'string', c['name'],
                             resources=c['name']),
                DF.add_field('__title', 'string', get_title(c['title']),
                             resources=c['name']),
            ) for c in configuration
        ],
        DF.add_field('urls', 'array', lambda r: RE.findall(str(r))),
        DF.add_field('link', 'string',
                     lambda r: 'https://yodaat.org/item/{doc_id}'.format(**r)),
        DF.concatenate(
            dict(
                name=['__name'],
                title=['__title'],
                link=[],
                urls=[],
            )),
        DF.add_field('url', 'string'),
        DF.add_field('error', 'string'),
        unwind(),
        DF.delete_fields(['urls']),
        DF.parallelize(check_broken(), 4),
        DF.filter_rows(lambda r: r['error'] is not None),
    )

def operator(name, params):
    connection_string = params['db_url']
    source_table = params['db_table']
    target_instance_name = params['target_instance_name']
    target_package_id = params['target_package_id']
    target_organization_id = params['target_organization_id']
    print('starting db_fetcher operator')
    print('source_table={} target_instance_name={} target_package_id={} target_organization_id={}'
          .format(source_table, target_instance_name, target_package_id,
                  target_organization_id))
    with tempfile.TemporaryDirectory() as tempdir:
        csv_filename = target_package_id + '.csv'
        DF.Flow(
            DF.load(connection_string,
                    table=source_table,
                    name=target_package_id,
                    infer_strategy=DF.load.INFER_PYTHON_TYPES),
            DF.update_resource(-1, path=csv_filename),
            DF.delete_fields(['_source']),
            DF.dump_to_path(tempdir)).process()
        csv_filename = os.path.join(tempdir, csv_filename)
        print('{}, {:,} bytes'.format(csv_filename,
                                      os.stat(csv_filename).st_size))
        update_package(target_instance_name, target_organization_id,
                       target_package_id, target_package_id,
                       [('CSV', csv_filename)])

def test_delete_field():
    from dataflows import delete_fields
    f = Flow(data, delete_fields(['x']))
    results, dp, _ = f.results()
    for i in results[0]:
        assert list(i.keys()) == ['y']
    assert dp.descriptor['resources'][0]['schema']['fields'] == \
        [dict(name='y', type='string', format='default')]

def flow(*_):
    return DF.Flow(
        services(),
        DF.delete_fields(
            ['__tab', 'complete', 'non_suppliers', 'non_tenders', 'notes']),
        DF.add_field('publisher_name', 'string', lambda r: r['office'],
                     **{'es:keyword': True}),
        splitter('target_audience'),
        splitter('subject'),
        splitter('intervention'),
        splitter('target_age_group'),
        floater('beneficiaries'),
        floater('budgetItems'),
        floater('manualBudget'),
        floater('tenders'),
        floater('suppliers'),
        floater('virtue_of_table'),
        fix_suppliers(),
        fix_tenders(),
        add_current_budget(),
        add_current_beneficiaries(),
        DF.add_field('min_year', 'integer', 2020),
        DF.add_field('max_year', 'integer', CURRENT_YEAR),
        DF.add_field('kind', 'string', 'gov_social_service', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.add_field('kind_he', 'string', 'שירות חברתי', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.set_type('name', **{'es:title': True}),
        DF.set_type('description', **{
            'es:itemType': 'string',
            'es:boost': True
        }),
        DF.add_field('score', 'number', get_score,
                     **{'es:score-column': True}),
        DF.set_primary_key(['kind', 'id']),
        DF.update_resource(-1, name='activities', **{'dpp:streaming': True}),
        DF.dump_to_sql(dict(all_activities={'resource-name': 'activities'})),
        DF.filter_rows(lambda r: not r['deleted']),
        DF.delete_fields(['deleted']),
        DF.dump_to_path('/var/datapackages/activities/social_services'),
        DF.dump_to_sql(dict(activities={'resource-name': 'activities'})),
    )

def proj():
    def func(row):
        row['lon'], row['lat'] = projector(row['X'], row['Y'], inverse=True)

    return DF.Flow(
        DF.add_field('lon', 'number'),
        DF.add_field('lat', 'number'),
        func,
        DF.delete_fields(['X', 'Y'])
    )

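# A sketch of how the 'projector' assumed by proj() above might be constructed
# (hypothetical CRS choice and coordinates; the module defining proj()
# presumably configures its own):
import pyproj

# Calling a Proj instance with inverse=True converts projected X/Y back to
# lon/lat, which matches how proj()'s inner func uses it.
projector = pyproj.Proj('epsg:2039')  # e.g. Israel TM Grid
lon, lat = projector(178000, 663000, inverse=True)
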
def unwind_neighborhoods():
    def f(rows):
        for row in rows:
            for n in row['neighborhoods']:
                row['neighborhood'] = n
                yield row

    return DF.Flow(DF.add_field('neighborhood', 'string'), f,
                   DF.delete_fields(['neighborhoods']))

def conference_csv():
    flow = Flow(
        # Load inputs
        load(
            od19_base + od19_feedback,
            name='feedback',
            format='csv',
        ),
        load(
            od19_base + od19_analysis,
            name='analysis',
            format='csv',
        ),
        # Process them
        set_type("Anzahl.*", type='integer', resources='analysis'),
        delete_fields([
            "Anzahl Auflistung",
            ".*\\(Formel\\)",
            ".*Duplikate",
        ], resources='analysis'),
        not_empty_groupcol,
        # Save the results
        add_metadata(
            name='opendatach19',
            title='''Opendata.ch/2019 Forum''',
            licenses=[{
                "name": "ODC-PDDL-1.0",
                "path": "http://opendatacommons.org/licenses/pddl/",
                "title": "Open Data Commons Public Domain Dedication and License v1.0"
            }],
            maintainers=[{
                "name": "Oleg Lavrovsky",
                "web": "https://datalets.ch/"
            }],
            views=[{
                "name": "Groups",
                "resources": ["analysis"],
                "spec": {
                    "group": "Alle Bedürfnisse",
                    "series": ["Anzahl Auflistung (Zahl)"],
                    "type": "bar"
                },
                "specType": "simple",
                "title": "Topic counts"
            }]),
        printer(),
        validate(),
        dump_to_path('data/opendatach19'),
    )
    flow.process()

def Elspot_Prices_Data():
    # field_metadata = get_metadata('c86859d2-942e-4029-aec1-32d56f1a2e5d')
    flow = Flow(
        # Load inputs - using the 'datastore_search_sql' API, load the last 100 rows:
        load(
            'https://api.energidataservice.dk/datastore_search_sql?sql=select%20*%20from%20elspotprices%20order%20by%20"HourUTC"%20desc%20limit%20100',
            format="json",
            property="result.records",
            name="fact_elspot_prices"),
        # Remove extra fields:
        delete_fields(fields=['_id', '_full_text', 'HourDK']),
        # Save the results
        checkpoint('load_data'),
        # Add product:
        add_computed_field([
            dict(target=dict(name='product', type='string'),
                 operation='constant',
                 with_='Elspot'),
            dict(target=dict(name='amount', type='number'),
                 operation='constant',
                 with_=1),
            dict(target=dict(name='PriceDKK', type='number'),
                 operation='constant',
                 with_=-1),
            dict(target=dict(name='PriceEUR', type='number'),
                 operation='constant',
                 with_=-1)
        ]),
        add_price,
        delete_fields(fields=['SpotPriceDKK', 'SpotPriceEUR']),
        add_metadata(name='marketdata', title='Marketdata prototype'),
        update_resource(resources=None, mediatype='text/csv'),
        update_resource(
            resources='fact_elspot_prices',
            title='Elspot Prices Data',
            source='https://www.energidataservice.dk/dataset/elspotprices/resource_extract/c86859d2-942e-4029-aec1-32d56f1a2e5d'
        ),
        printer(),
        dump_to_path('elspot_prices_data'),
        # dump_to_sql(tables={'elspot': {'resource-name': 'Elspot_Prices_Data', 'mode': 'append'}}, engine='postgresql://*****:*****@localhost/cubes')
    )
    flow.process()

def flow(parameters):
    resources = parameters.get('resources')
    regex = parameters.get('regex', True)
    return Flow(
        delete_fields(
            parameters.get('fields', []),
            resources=resources,
            regex=regex,
        )
    )

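# A minimal usage sketch for the parameters-driven processor above
# (hypothetical parameter values; the field and resource names are illustrative):
parameters = {
    'fields': ['internal_id', 'debug_.*'],  # plain names and/or regexes
    'resources': 'my-resource',             # limit deletion to a single resource
    'regex': True,                          # interpret the field names as regexes
}
step = flow(parameters)  # a Flow wrapping one delete_fields step
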
def flow(parameters, *args):
    return Flow(
        conditional(
            field_exists('quote'),
            Flow(delete_fields(["quote"], resources=parameters["resources"]))),
        add_computed_field(
            target=dict(name='quote', type='number'),
            operation=lambda row: (row["key"] == "sum") and round(
                tryconvert(row["value"], 0, int) / math.floor(
                    tryconvert(row["population"], 100, int) / 100), 2) or None,
            resources=parameters["resources"]))

def flow(parameters):
    resources = parameters.get('resources')
    regex = parameters.get('regex', True)
    if 'types' in parameters:
        return Flow(*[
            set_type(name, resources=resources, regex=regex, **options)
            if options is not None
            else delete_fields([name], resources=resources)
            for name, options in parameters['types'].items()
        ])
    else:
        return Flow(validate())

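# A minimal usage sketch for the 'types' mapping handled above (hypothetical
# field names and options; a value of None means "delete this field"):
parameters = {
    'resources': None,  # apply to all resources
    'types': {
        'price': {'type': 'number'},                     # becomes a set_type step
        'date': {'type': 'date', 'format': '%d/%m/%Y'},  # becomes a set_type step
        'notes': None,                                   # becomes a delete_fields step
    },
}
step = flow(parameters)
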
def flow(*_):
    return Flow(
        filter_by_type,
        rename_fields,
        add_fields(FIELDS, 'string'),
        add_fields(ADDITIONAL_FIELDS, 'string'),
        parse_document,
        delete_fields([
            'document', 'pdf', 'other', 'num_files', 'parser_version',
            'source', 's3_object_name'
        ]),
    )

def flow(self):
    if self.config.get('source.raw_html'):
        return Flow(
            *[
                self.replace_field(original, cleaned)
                for original, cleaned in self.field_map.items()
            ],
            delete_fields(list(self.field_map.keys()),
                          resources=RESOURCE_NAME),
            add_field(self.URL_FIELD, 'string', resources=RESOURCE_NAME),
            self.clean_html_values(),
        )

def flow(*_):
    return Flow(
        filter_by_type,
        rename_fields(RENAME_FIELDS),
        add_fields(FIELDS, 'string'),
        validate,
        parse_document,
        fix_fields(FIELDS),
        delete_fields([
            'document', 'pdf', 'other', 'num_files', 'parser_version',
            'source', 's3_object_name'
        ]),
    )

def flow(*_):
    return Flow(
        update_resource(
            -1,
            name='maya_company_officer_list',
            path="data/maya_company_officer_list.csv",
        ),
        filter_by_type,
        rename_fields(RENAME_FIELDS),
        add_fields(FIELDS, 'string'),
        add_fields(OPTIONAL_FIELDS, 'string'),
        add_fields(TABLE_FIELDS, 'string'),
        validate,
        parse_document,
        fix_fields(TABLE_FIELDS),
        delete_fields([
            'document', 'pdf', 'other', 'num_files', 'parser_version',
            'source', 's3_object_name'
        ]),
    )

def flow(*_):
    return DF.Flow(
        get_updated_sources(),
        DF.concatenate(fields=TENDER_MAPPING, target=dict(name='tenders')),
        DF.validate(),
        DF.filter_rows(lambda r: r['publication_id']),
        DF.add_field('tender_type', 'string',
                     lambda r: TENDER_KINDS[r['tender_type_he']],
                     **{'es:keyword': True}),
        DF.join_with_self(
            'tenders', KEY,
            dict((k, dict(aggregate='last'))
                 for k in list(TENDER_MAPPING.keys()) + ['tender_type'])),
        DF.set_type('publication_id', type='string', transform=str),
        DF.set_type('supplier_id', type='string', transform=str),
        DF.set_type('tender_id', type='string',
                    transform=lambda v: v or 'none'),
        DF.set_type('.+_date', type='date', format='%d.%m.%Y',
                    on_error=DF.schema_validator.clear),
        DF.set_type('subjects', type='string',
                    transform=lambda v: ';'.join(
                        x.strip() for x in v.split(',')) if v else ''),
        DF.set_type('claim_date', type='datetime',
                    transform=lambda v, field_name, row: datetime.datetime.combine(
                        v, row['claim_time'] or datetime.time(0)) if v else None),
        DF.set_type('tender_type_he', **{'es:keyword': True}),
        DF.delete_fields(['claim_time']),
        DF.add_field(
            'page_url', 'string',
            lambda r: f'https://mr.gov.il/ilgstorefront/he/p/{r["publication_id"]}'),
        DF.add_field('page_title', 'string', lambda r: r['description']),
        DF.add_field('reason', 'string', lambda r: r['regulation']),
        DF.add_field('documents', 'array', []),
        DF.add_field('contact', 'string'),
        DF.add_field('contact_email', 'string'),
        DF.validate(),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        DF.printer(),
    )

def test_delete_fields_regex():
    from dataflows import load, delete_fields
    flow = Flow(
        load('data/regex.csv'),
        delete_fields(['temperature (24h)'], regex=False),
    )
    data = flow.results()[0]
    assert data == [[
        {'city': 'london'},
        {'city': 'paris'},
        {'city': 'rome'},
    ]]

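# A short sketch of why regex=False matters in the test above (assuming
# data/regex.csv has the columns 'city' and 'temperature (24h)'): with the
# default regex=True the field names are treated as regular expressions, so
# the parentheses would have to be escaped to match the literal column name.
from dataflows import Flow, load, delete_fields

escaped = Flow(
    load('data/regex.csv'),
    delete_fields([r'temperature \(24h\)']),  # regex=True is the default
)
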
def conditional(self):
    new_fields = [
        x.replace(':', '-') for x in self.PROHIBITED_COLUMN_TYPES
    ]
    old_fields = [x.replace(':', '-') for x in self.REQUIRED_COLUMN_TYPES]
    return Flow(
        add_computed_field(
            [dict(
                target=f,
                operation='constant',
            ) for f in new_fields],
            resources=RESOURCE_NAME),
        self.work(),
        *[
            set_type(f, columnType=ct)
            for (f, ct) in zip(new_fields, self.PROHIBITED_COLUMN_TYPES)
        ],
        delete_fields(old_fields, resources=RESOURCE_NAME),
    )

def flow(*_):
    return Flow(
        update_resource(
            -1,
            name='reported_work_record',
            path="data/reported_work_record.csv",
        ),
        filter_by_type,
        rename_fields(RENAME_FIELDS),
        add_fields(FIELDS, 'string'),
        add_fields(TABLE_FIELDS, 'string'),
        validate,
        parse_document,
        fix_fields(TABLE_FIELDS),
        delete_fields([
            'document', 'pdf', 'other', 'num_files', 'parser_version',
            'source', 's3_object_name', 'id', 'company', 'type', 'fix_for',
            'fixed_by', 'next_doc', 'prev_doc'
        ]),
    )

def clean_data(filename: str, location: str) -> None:
    """Clean and validate data with `dataflows`, creating data packages in
    the process, one for each file."""
    global FILE_NAME
    FILE_NAME = f"{location}-{filename}"
    clean_directory, _, processing_directory = set_location_dirs(location)
    exported_file = f"{clean_directory}/{filename}"
    _ = Flow(
        load(
            f"{processing_directory}/{filename}.csv",
            name=FILE_NAME,
        ),
        change_path,
        add_field("NameFIPS", "string"),
        concat_name_columns,
        delete_fields(["Name", "FIPS"]),
        set_type("Data", type="any"),
        validate(),
        dump_to_path(exported_file),
    ).process()[1]

def flow(*_):
    return DF.Flow(
        DF.load(URL, format='json', property='jData', name='education'),
        # DF.checkpoint('education'),
        DF.concatenate(dict(
            page_title=['Title'],
            start_date=['PobKKPublishingDate'],
            claim_date=['PobLastDate'],
            target_audience_x=['PobBudgetEntitties'],
            description=['PobTaktzir'],
            email=['PobPedagogyContactHtml'],
            publishing_unit_x=['PobYechida'],
            budget_code_x=['PobTakanaTaktzivitString'],
            att_title=['PobCreteriaLink_description'],
            att_url=['PobCreteriaLink_url'],
        ), resources=-1, target=dict(name='education')),
        enumerate_titles,
        DF.add_field('page_url', 'string', PAGE_URL, resources=-1),
        DF.add_field('publisher', 'string', 'משרד החינוך', resources=-1),
        DF.add_field('tender_type', 'string', 'call_for_bids', resources=-1),
        DF.add_field('tender_type_he', 'string', 'קול קורא', resources=-1),
        DF.add_field('publication_id', 'integer', 0, resources=-1),
        DF.add_field('tender_id', 'string', '0', resources=-1),
        DF.add_field('contact', 'string',
                     lambda row: extract_hebrew(row, 'email'), resources=-1),
        DF.add_field('target_audience', 'string',
                     lambda row: extract_hebrew(row, 'target_audience_x'),
                     resources=-1),
        DF.add_field('contact_email', 'string',
                     lambda row: extract_email(row, 'email'), resources=-1),
        DF.add_field('publishing_unit', 'string',
                     lambda row: row['publishing_unit_x'][0]['PobYechida'],
                     resources=-1),
        DF.add_field('budget_code', 'string',
                     lambda row: extract_budget_code(row, 'budget_code_x'),
                     resources=-1),
        DF.set_type('start_date', type='date', format='%d/%m/%Y %H:%M:%S'),
        DF.set_type('claim_date', type='datetime', format='%d/%m/%Y %H:%M:%S'),
        DF.add_field('documents', 'array',
                     lambda row: [dict(
                         description=row['att_title'],
                         link=row['att_url'],
                         update_time=str(row['start_date'])
                     )], resources=-1),
        DF.delete_fields(['email', 'publishing_unit_x', 'budget_code_x',
                          'att_title', 'att_url', 'target_audience_x'],
                         resources=-1),
        calculate_publication_id(6),
        DF.update_resource(-1, **{'dpp:streaming': True})
    )

def flow(*_):
    return DF.Flow(
        all_units(),
        DF.add_field('office', 'string',
                     lambda r: r['path'][0] if len(r['path']) > 0 else None,
                     **{'es:keyword': True}),
        DF.add_field('unit', 'string',
                     lambda r: r['path'][1] if len(r['path']) > 1 else None,
                     **{'es:keyword': True}),
        DF.add_field('subunit', 'string',
                     lambda r: r['path'][2] if len(r['path']) > 2 else None,
                     **{'es:keyword': True}),
        DF.add_field('subsubunit', 'string',
                     lambda r: r['path'][3] if len(r['path']) > 3 else None,
                     **{'es:keyword': True}),
        DF.add_field('breadcrumbs', 'string',
                     lambda r: '/'.join(r['path']) or 'משרדי הממשלה',
                     **{'es:exclude': True}),
        DF.add_field('id', 'string',
                     lambda r: '__'.join(r['path']) or 'main',
                     **{'es:exclude': True}),
        DF.delete_fields([
            'path',
        ]),
        DF.add_field('min_year', 'integer', 2020),
        DF.add_field('max_year', 'integer', 2020),
        DF.add_field('kind', 'string', 'gov_social_service_unit', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.add_field('kind_he', 'string', 'שירותים חברתיים במיקור חוץ', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.add_field('score', 'number', 1000, **{'es:score-column': True}),
        DF.set_primary_key(['kind', 'id']),
        DF.update_resource(-1, name='units', **{'dpp:streaming': True}),
        # Ensure we only have the main offices
        DF.filter_rows(lambda r: r['unit'] is None),
        DF.filter_rows(lambda r: r['office'] != 'משרד העליה והקליטה'),
        DF.dump_to_path('/var/datapackages/units/social_services'),
        DF.dump_to_sql(dict(units={'resource-name': 'units'})))

        skip_rows=[1, 2, 3, 4, 5, -1],
        headers=['Date', 'Price', 'Empty column'],
        format='csv',
        name='annual'
    ),
    extract_december_rows,
    load(
        load_source='http://www.bundesbank.de/cae/servlet/StatisticDownload?tsId=BBEX3.M.XAU.USD.EA.AC.C06&its_csvFormat=en&its_fileFormat=csv&mode=its',
        skip_rows=[1, 2, 3, 4, 5, -1],
        headers=['Date', 'Price', 'Empty column'],
        format='csv',
        name='monthly'
    ),
    update_resource('monthly', **{'path': 'data/monthly.csv', 'dpp:streaming': True}),
    update_resource('annual', **{'path': 'data/annual.csv', 'dpp:streaming': True}),
    set_type('Date', resources='annual', type='yearmonth'),
    set_type('Price', resources='annual', type='number'),
    set_type('Date', resources='monthly', type='yearmonth'),
    set_type('Price', resources='monthly', type='number'),
    validate(),
    delete_fields(['Empty column'], resources=None)
)


def flow(parameters, datapackage, resources, stats):
    return gold_price_flow


if __name__ == '__main__':
    gold_price_flow.process()