def flow(*_):
    gcd = google_chrome_driver()
    download = gcd.download(
        'https://data.gov.il/dataset/246d949c-a253-4811-8a11-41a137d3d613/resource/f004176c-b85f-4542-8901-7b3176f9a054/download/f004176c-b85f-4542-8901-7b3176f9a054.csv'
    )
    return Flow(
        load(download, cast_strategy=load.CAST_TO_STRINGS),
        concatenate(_get_columns_mapping_dict(), target=dict(name='company-details')),
        set_type('id', type='string'),
        set_type('company_registration_date', type='date', format='%d/%m/%Y'),
        set_type('company_is_government', type='boolean', falseValues=['לא'], trueValues=['כן']),
        set_type('company_is_mafera', type='boolean', falseValues=['לא'], trueValues=['מפרה', 'התראה']),
        set_type('company_last_report_year', type='integer'),
        clear_bool_values,
        update_resource(**{'dpp:streaming': True}, resources='company-details'),
        set_primary_key(['id'], resources='company-details'),
        printer(),
    )
def test_concatenate():
    from dataflows import concatenate
    f = Flow(
        [
            {'a': 1, 'b': 2},
            {'a': 2, 'b': 3},
            {'a': 3, 'b': 4},
        ],
        [
            {'c': 4, 'd': 5},
            {'c': 5, 'd': 6},
            {'c': 6, 'd': 7},
        ],
        concatenate({
            'f1': ['a'],
            'f2': ['b', 'c'],
            'f3': ['d']
        })
    )
    results, _, _ = f.results()
    assert results[0] == [
        {'f1': 1, 'f2': 2, 'f3': None},
        {'f1': 2, 'f2': 3, 'f3': None},
        {'f1': 3, 'f2': 4, 'f3': None},
        {'f1': None, 'f2': 4, 'f3': 5},
        {'f1': None, 'f2': 5, 'f3': 6},
        {'f1': None, 'f2': 6, 'f3': 7},
    ]
def flow(parameters):
    return Flow(
        concatenate(
            parameters.get('fields', {}),
            parameters.get('target', {}),
            parameters.get('sources')),
        update_resource(
            parameters.get('target', {}).get('name', 'concat'),
            **{PROP_STREAMING: True}))
def get_neighborhood_features():
    return DF.Flow(
        DF.load('neighborhoods.xlsx', name='stat-areas', deduplicate_headers=True),
        DF.add_field(
            'neighborhoods', 'array',
            lambda r: [v for k, v in r.items() if v and k.startswith('neighborhood')]),
        DF.add_field('geometry', 'object', lambda r: geometries[r['stat-area']]),
        DF.concatenate(
            dict(stat_area=['stat-area'], neighborhoods=[], geometry=[])),
        DF.update_resource(-1, name='stat-areas'),
        unwind_neighborhoods(),
        DF.join_with_self(
            'stat-areas', ['neighborhood'],
            dict(
                neighborhood=None,
                stat_areas=dict(name='stat_area', aggregate='array'),
                geometries=dict(name='geometry', aggregate='array'),
            )),
        DF.add_field('geometry', 'object', lambda r: unite_geometries(r['geometries'])),
        DF.delete_fields(['geometries']),
        DF.update_resource(-1, name='neighborhoods'),
        DF.add_field(
            'properties', 'object',
            lambda r: dict(
                x=3, title=r['neighborhood'], stat_areas=r['stat_areas'])),
        DF.delete_fields(['neighborhood', 'stat_areas']),
        DF.checkpoint('_cache_neighborhoods')).results()[0][0]
def flow(*_):
    print('reading companies...')
    return Flow(
        data_gov_il_resource.flow(companies),
        fix_values(),
        concatenate(_get_columns_mapping_dict(), target=dict(name='company-details')),
        set_type('id', type='string'),
        set_type('company_street_number', type='string'),
        set_type('company_registration_date', type='date', format='%d/%m/%Y'),
        set_type('company_is_government', type='boolean', falseValues=['לא'], trueValues=['כן']),
        set_type('company_is_mafera', type='boolean', falseValues=['לא'], trueValues=['מפרה', 'התראה']),
        set_type('company_last_report_year', type='integer'),
        set_type('company_postal_code', type='string'),
        clear_bool_values,
        update_resource(**{'dpp:streaming': True}, resources='company-details'),
        set_primary_key(['id'], resources='company-details'),
        printer(),
    )
def broken_links_flow():
    return DF.Flow(
        *[
            DF.Flow(
                DF.load(URL_TEMPLATE.format(**c), name=c['name']),
                DF.add_field('__name', 'string', c['name'], resources=c['name']),
                DF.add_field('__title', 'string', get_title(c['title']), resources=c['name']),
            )
            for c in configuration
        ],
        DF.add_field('urls', 'array', lambda r: RE.findall(str(r))),
        DF.add_field('link', 'string', lambda r: 'https://yodaat.org/item/{doc_id}'.format(**r)),
        DF.concatenate(
            dict(
                name=['__name'],
                title=['__title'],
                link=[],
                urls=[],
            )),
        DF.add_field('url', 'string'),
        DF.add_field('error', 'string'),
        unwind(),
        DF.delete_fields(['urls']),
        DF.parallelize(check_broken(), 4),
        DF.filter_rows(lambda r: r['error'] is not None),
    )
def prepare():
    for resource_name, load in loads:
        DF.Flow(
            load,
            # DF.printer(tablefmt='html'),
            DF.concatenate(
                FIELD_MAPPING,
                dict(name=resource_name, path=resource_name + '.csv')),
            DF.set_type('activity_name', type='string',
                        constraints=dict(required=True),
                        on_error=DF.schema_validator.drop),
            DF.set_type('allocated_budget', type='number',
                        groupChar=',', bareNumber=False),
            DF.set_type('num_beneficiaries', type='number',
                        groupChar=',', bareNumber=False,
                        on_error=DF.schema_validator.ignore),
            fix_beneficiaries,
            DF.set_type('num_beneficiaries', type='string'),
            multiply_budget,
            fill_org_hierarchy,
            # DF.printer(tablefmt='html'),
            DF.dump_to_path('tmp/' + resource_name),
        ).process()
def preflow(self):
    f = Flow(
        load('.cache/{}/datapackage.json'.format(self.ref_hash)),
        concatenate(
            fields=dict(
                (f, []) for f in self.REF_KEY_FIELDS + self.REF_FETCH_FIELDS),
            target=dict(name=self.key, path=self.key + '.csv'),
            resources=self.ref_hash),
    )
    return f
def prepare_addresses():
    with tempfile.NamedTemporaryFile(suffix='.csv', mode='wb') as source:
        shutil.copyfileobj(fetch_ckan('addresses', 'CSV'), source)
        source.flush()
        DF.Flow(
            DF.load(source.name),
            DF.concatenate(
                dict(street_name=['streetName'],
                     house_number=['HouseNuber'],
                     letter=[],
                     lat=[],
                     lon=[])),
            match_arnona(),
            DF.dump_to_path('_cache_addresses'),
            DF.checkpoint('_cache_addresses')).process()
def flow(*_):
    return DF.Flow(
        DF.load(
            '/var/datapackages/activities/social_services/social_services/datapackage.json'
        ),
        DF.concatenate(
            dict(kind=[],
                 kind_he=[],
                 activity_name=[],
                 activity_description=[],
                 publisher_name=[],
                 history=[],
                 max_year=[],
                 min_year=[]),
            dict(name='activities', path='activities.csv')),
        DF.set_primary_key(['kind', 'publisher_name', 'activity_name']),
        DF.set_type('activity_name', **{'es:title': True}),
        DF.set_type('activity_description', **{
            'es:itemType': 'string',
            'es:boost': True
        }),
        DF.set_type('kind', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.set_type('kind_he', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.set_type('publisher_name', **{'es:keyword': True}),
        DF.set_type(
            'history', **{
                'es:itemType': 'object',
                'es:schema': dict(fields=[
                    dict(name='year', type='integer'),
                    dict(name='unit', type='string'),
                    dict(name='subunit', type='string'),
                    dict(name='subsubunit', type='string'),
                    dict(name='allocated_budget', type='integer'),
                    dict(name='num_beneficiaries', type='string', **{'es:index': False}),
                ])
            }),
        DF.add_field(
            'score', 'number',
            lambda x: (x['history'][0].get('allocated_budget') or 1000) / 1000,
            **{'es:score-column': True}),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        DF.dump_to_path('/var/datapackages/activities/all'),
        DF.dump_to_sql(dict(activities={'resource-name': 'activities'})))
def flow(*_):
    return DF.Flow(
        get_updated_sources(),
        DF.concatenate(fields=TENDER_MAPPING, target=dict(name='tenders')),
        DF.validate(),
        DF.filter_rows(lambda r: r['publication_id']),
        DF.add_field('tender_type', 'string',
                     lambda r: TENDER_KINDS[r['tender_type_he']],
                     **{'es:keyword': True}),
        DF.join_with_self(
            'tenders', KEY,
            dict((k, dict(aggregate='last'))
                 for k in list(TENDER_MAPPING.keys()) + ['tender_type'])),
        DF.set_type('publication_id', type='string', transform=str),
        DF.set_type('supplier_id', type='string', transform=str),
        DF.set_type('tender_id', type='string', transform=lambda v: v or 'none'),
        DF.set_type('.+_date', type='date', format='%d.%m.%Y',
                    on_error=DF.schema_validator.clear),
        DF.set_type('subjects', type='string',
                    transform=lambda v: ';'.join(x.strip() for x in v.split(','))
                    if v else ''),
        DF.set_type('claim_date', type='datetime',
                    transform=lambda v, field_name, row: datetime.datetime.combine(
                        v, row['claim_time'] or datetime.time(0)) if v else None),
        DF.set_type('tender_type_he', **{'es:keyword': True}),
        DF.delete_fields(['claim_time']),
        DF.add_field(
            'page_url', 'string',
            lambda r: f'https://mr.gov.il/ilgstorefront/he/p/{r["publication_id"]}'),
        DF.add_field('page_title', 'string', lambda r: r['description']),
        DF.add_field('reason', 'string', lambda r: r['regulation']),
        DF.add_field('documents', 'array', []),
        DF.add_field('contact', 'string'),
        DF.add_field('contact_email', 'string'),
        DF.validate(),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        DF.printer(),
    )
def test_concatenate_multifield():
    from dataflows import concatenate
    f = Flow(
        [
            {'a': 1, 'b': 2, 'c': None},
            {'a': 2, 'b': None, 'c': 3},
            {'a': 3, 'c': 4},
            {'a': 3, 'b': 6, 'c': 4},
        ],
        concatenate({
            'f1': ['a'],
            'f2': ['b', 'c'],
        })
    )
    results, _, _ = f.results()
    assert results[0] == [
        {'f1': 1, 'f2': 2},
        {'f1': 2, 'f2': 3},
        {'f1': 3, 'f2': 4},
        {'f1': 3, 'f2': 4},
    ]
def flow(*_):
    return DF.Flow(
        DF.load(URL, format='json', property='jData', name='education'),
        # DF.checkpoint('education'),
        DF.concatenate(dict(
            page_title=['Title'],
            start_date=['PobKKPublishingDate'],
            claim_date=['PobLastDate'],
            target_audience_x=['PobBudgetEntitties'],
            description=['PobTaktzir'],
            email=['PobPedagogyContactHtml'],
            publishing_unit_x=['PobYechida'],
            budget_code_x=['PobTakanaTaktzivitString'],
            att_title=['PobCreteriaLink_description'],
            att_url=['PobCreteriaLink_url'],
        ), resources=-1, target=dict(name='education')),
        enumerate_titles,
        DF.add_field('page_url', 'string', PAGE_URL, resources=-1),
        DF.add_field('publisher', 'string', 'משרד החינוך', resources=-1),
        DF.add_field('tender_type', 'string', 'call_for_bids', resources=-1),
        DF.add_field('tender_type_he', 'string', 'קול קורא', resources=-1),
        DF.add_field('publication_id', 'integer', 0, resources=-1),
        DF.add_field('tender_id', 'string', '0', resources=-1),
        DF.add_field('tender_type_he', 'string', 'קול קורא', resources=-1),
        DF.add_field('contact', 'string',
                     lambda row: extract_hebrew(row, 'email'), resources=-1),
        DF.add_field('target_audience', 'string',
                     lambda row: extract_hebrew(row, 'target_audience_x'), resources=-1),
        DF.add_field('contact_email', 'string',
                     lambda row: extract_email(row, 'email'), resources=-1),
        DF.add_field('publishing_unit', 'string',
                     lambda row: row['publishing_unit_x'][0]['PobYechida'], resources=-1),
        DF.add_field('budget_code', 'string',
                     lambda row: extract_budget_code(row, 'budget_code_x'), resources=-1),
        DF.set_type('start_date', type='date', format='%d/%m/%Y %H:%M:%S'),
        DF.set_type('claim_date', type='datetime', format='%d/%m/%Y %H:%M:%S'),
        DF.add_field('documents', 'array', lambda row: [dict(
            description=row['att_title'],
            link=row['att_url'],
            update_time=str(row['start_date'])
        )], resources=-1),
        DF.delete_fields(['email', 'publishing_unit_x', 'budget_code_x',
                          'att_title', 'att_url', 'target_audience_x'], resources=-1),
        calculate_publication_id(6),
        DF.update_resource(-1, **{'dpp:streaming': True})
    )
def test_example_9():
    from dataflows import Flow, load, dump_to_path, join, concatenate, filter_rows
    f = Flow(
        # Emmy award nominees and winners
        load('data/emmy.csv', name='emmies'),
        filter_rows(equals=[dict(winner=1)]),
        concatenate(dict(emmy_nominee=['nominee'], ),
                    dict(name='emmies_filtered'),
                    resources='emmies'),
        # Academy award nominees and winners
        load('data/academy.csv', encoding='utf8', name='oscars'),
        join('emmies_filtered', ['emmy_nominee'],  # Source resource
             'oscars', ['Name'],                   # Target resource
             full=False  # Don't add new fields, remove unmatched rows
             ),
        filter_rows(equals=[dict(Winner='1')]),
        dump_to_path('out/double_winners'))
    _ = f.process()
def flow(*args):
    return DF.Flow(
        get(),
        DF.filter_rows(
            lambda row: bool(row['key']) and bool(row.get('title'))),
        simplify_tags,
        extract_tags('life_areas', ['Domain']),
        extract_tags('source_kind', ['Source', 'Resource', 'Resouce']),
        DF.add_field(
            'authors', 'string',
            lambda r: None if not r.get('creators') else ', '.join(
                ('{name}'.format(**c) if 'name' in c
                 else '{firstName} {lastName}'.format(**c))
                for c in r.get('creators', [])
                if c.get('creatorType') == 'author')),
        DF.add_field('item_kind', 'string',
                     lambda r: r.get('reportType') or r.get('itemKind')),
        DF.concatenate(MAPPING, target={
            'name': 'zotero',
            'path': 'zotero.csv'
        }),
        DF.dump_to_path('data/zotero'),
        DF.update_resource(None, **{'dpp:streaming': True}),
        DF.printer())
def flow(*_):
    prepare()
    yearly_fields = [
        'year', 'unit', 'subunit', 'subsubunit', 'allocated_budget',
        'num_beneficiaries'
    ]
    return DF.Flow(
        *[
            DF.load('tmp/' + resource_name + '/datapackage.json')
            for resource_name, _ in loads
        ],
        DF.concatenate(
            FIELD_MAPPING,
            dict(name='social_services', path='social_services.csv')),
        DF.sort_rows('{year}', reverse=True),
        DF.add_field(
            'history', 'object',
            lambda r: dict(
                (k, r[k] if not isinstance(r[k], decimal.Decimal) else int(r[k]))
                for k in yearly_fields)),
        DF.printer(),
        DF.join_with_self(
            'social_services', ['publisher_name', 'activity_name'],
            dict(
                publisher_name=None,
                activity_name=None,
                activity_description=dict(aggregate='set'),
                min_year=dict(name='year', aggregate='min'),
                max_year=dict(name='year', aggregate='max'),
                history=dict(aggregate='array'),
            )),
        DF.add_field('kind', 'string', 'gov_social_service'),
        DF.add_field('kind_he', 'string', 'שירות חברתי'),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        DF.printer(),
    )
def loader(name, cat):
    return DF.Flow(
        DF.load('mosadot.xlsx'),
        DF.concatenate(
            dict(
                municipality=['מועצה אזורית'],
                town=['שם יישוב'],
                name=['שם המוסד'],
                kind=['סוג המוסד'],
                address=['כתובת'],
                status=['סטטוס'],
                target_audience=['קהל יעד'],
                area=['שטח'],
                lat=['Y'],
                lon=['X'],
            )),
        fixer,
        category(),
        DF.filter_rows(lambda r: r['category'] == cat),
        geo(),
        # DF.join_with_self('concat', ['kind'], dict(kind=None)),
        DF.update_resource(-1, name=name, path=name + '.csv'),
        DF.dump_to_path(name),
    ).results()[0][0]
            '.+)',
            'keys': {
                'year': '\\1'
            }
        }], [{
            'name': 'year',
            'type': 'string'
        }], {
            'name': 'value',
            'type': 'number'
        },
    ),
    verify_unused_fields(),
    DF.concatenate(FIELD_MAPPING, target=dict(name='out')),
    fix_urls(['source_url']),
    ensure_chart_title(),
    fix_languages(),
    DF.add_field('order_index', 'integer'),
    lambda rows: ({**row, **{'order_index': i}} for i, row in enumerate(rows)),
    set_defaults,
    extrapulate_years,
    fix_values,
    DF.set_type('value', groupChar=',', bareNumber=True),
    fix_units,
def main_flow(prefix=''):
    source_url = '{}data/publications_for_es/datapackage.json'.format(prefix)
    package = Package(source_url)
    all_fields = set(field.name
                     for resource in package.resources
                     for field in resource.schema.fields)
    all_fields = dict((field_name, []) for field_name in all_fields)
    return Flow(
        load(source_url),
        lambda row: dict(row, json='{}'),
        concatenate(all_fields, target=dict(name='publications', path='publications.csv')),
        delete_fields(['json']),
        prefer_gd('title'),
        prefer_gd('notes'),
        prefer_gd('publisher'),
        prefer_gd('tags'),
        prefer_gd('language_code'),
        prefer_gd('pubyear'),
        split_keyword_list('item_kind', 'gd_Item Type'),
        split_keyword_list('life_areas', 'gd_Life Domains'),
        split_keyword_list('source_kind', 'gd_Resource Type'),
        split_keyword_list('languages', 'language_code', ' '),
        split_keyword_list('tags', 'tags'),
        load('data/zotero/datapackage.json'),
        concatenate(dict(
            title=[],
            pubyear=[],
            publisher=[],
            authors=[],
            life_areas=[],
            notes=[],
            languages=[],
            tags=[],
            url=[],
            migdar_id=[],
            item_kind=[],
            source_kind=[],
            isbn=[],
            physical_description=[],
            publication_distribution_details=[],
            doc_id=[],
        ), target=dict(name='publications', path='publications.csv')),
        set_type('title', **{'es:title': True}),
        set_type('notes', **{'es:hebrew': True}),
        set_type('publisher', **{'es:keyword': True}),
        add_field('year', 'integer', default=extract_year),
        split_and_translate('tags', 'tags', keyword=True),
        split_and_translate('life_areas', 'life_areas', keyword=True),
        split_and_translate('languages', 'languages', keyword=True),
        split_and_translate('source_kind', 'source_kind', keyword=True),
        split_and_translate('item_kind', 'item_kind', keyword=True),
        printer(),
        add_computed_field([
            {'operation': 'format', 'target': 'doc_id', 'with': KEY_PATTERN},
            {'operation': 'format', 'target': 'page_title', 'with': PAGE_TITLE_PATTERN},
        ]),
        add_computed_field([]),
    )
def Olap_Datapackage():
    flow = Flow(
        # Load datapackages:
        load('elspot_prices_data/datapackage.json'),
        load('afrr_data/datapackage.json'),
        load('fcr_dk1_data/datapackage.json'),
        concatenate(fields={
            'Timestamp': ['HourUTC'],
            'Area': ['PriceArea'],
            'Product': ['product'],
            'Amount': ['amount'],
            'Price_DKK': ['PriceDKK'],
            'Price_EUR': ['PriceEUR']
        }, target={
            'name': 'fact',
            'path': 'data/fact.csv'
        }),
        add_computed_field(
            [dict(target='id', operation='constant', with_='dummy')]),
        add_id,
        set_type('id', type='integer'),
        set_primary_key(primary_key=['id']),
        # Reorder so that 'id' column is the first:
        select_fields([
            'id', 'Timestamp', 'Area', 'Product', 'Amount', 'Price_DKK',
            'Price_EUR'
        ], resources='fact'),
        # Add foreign keys:
        add_foreign_keys,
        # Fact table is ready. Now duplicate the resource to generate dim tables:
        # First is 'time' table:
        duplicate(source='fact', target_name='time', target_path='time.csv'),
        select_fields(['Timestamp'], resources=['time']),
        join_self(source_name='time',
                  source_key=['Timestamp'],
                  target_name='time',
                  fields={'Timestamp': {}}),
        # Parse datetime fields and add a separate field for year, month and day:
        add_computed_field([
            dict(target=dict(name='day', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%d')),
            dict(target=dict(name='month', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%m')),
            dict(target=dict(name='month_name', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%B')),
            dict(target=dict(name='year', type='year'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%Y')),
        ], resources=['time']),
        set_primary_key(primary_key=['Timestamp'], resources=['time']),
        # Now 'area' table:
        duplicate(source='fact', target_name='area', target_path='area.csv'),
        select_fields(['Area'], resources=['area']),
        join_self(source_name='area',
                  source_key=['Area'],
                  target_name='area',
                  fields={'Area': {}}),
        set_primary_key(primary_key=['Area'], resources=['area']),
        # Now 'product' table:
        duplicate(source='fact', target_name='product', target_path='product.csv'),
        select_fields(['Product'], resources=['product']),
        join_self(source_name='product',
                  source_key=['Product'],
                  target_name='product',
                  fields={'Product': {}}),
        set_primary_key(primary_key=['Product'], resources=['product']),
        dump_to_path('olap_datapackage'))
    flow.process()
    def func(row):
        row['scores'] = sorted(row.get('scores', []), key=lambda r: r['date'])[-30:]
    return func


if __name__ == '__main__':
    r, _, _ = DF.Flow(
        DF.load(all_data(), name='cities', headers=1,
                override_fields=dict(area_id=dict(type='string')),
                cast_strategy=DF.load.CAST_WITH_SCHEMA),
        DF.filter_rows(lambda r: r['is_city']),
        DF.add_field('score_date', 'object', lambda r: dict(
            date=r['date'].isoformat(),
            sr=float(r['symptoms_ratio_weighted'] or 0),
            nr=int(r['num_reports_weighted']))
        ),
        DF.concatenate(dict(
            id=[],
            city_name=[],
            score_date=[]
        ), target=dict(name='ranking')),
        DF.join_with_self('ranking', '{city_name}', dict(
            id=None,
            city_name=None,
            scores=dict(name='score_date', aggregate='array')
        )),
        sort_limit_scores(),
        DF.filter_rows(lambda r: r['scores'][-1]['nr'] >= 200),
        DF.add_field('sortkey', 'integer',
                     lambda r: int(r['scores'][-1]['sr'] * 1000000) + r['scores'][-1]['nr']),
        DF.sort_rows('{sortkey}', reverse=True),
        DF.delete_fields(['sortkey']),
        DF.add_field('rank', 'integer', 0),
        DF.add_field('translations', 'object',
                     lambda r: city_translations[r['city_name']]),
        DF.add_field('image', 'object',
                     lambda r: upload_static_image(r['id'], width=280*2, height=160*2)),
        ranker(),
    ).results()
    rankings = r[0]
    gj = json.load(open(filename))
    default = dict(latest_confidence=0, latest_ratio=0, latest_reports=0, population=0)
    for feature in gj['features']:
        properties = feature['properties']
        properties.update(latest.get((properties['id'], is_city), default))
    upload = json.dumps(gj, cls=json_encoder).encode('utf8')
    path = 'data/tilesets/static-images-{}.geojson'.format(key)
    logging.info('UPLOADING %d bytes to %s', len(upload), path)
    upload_file(upload, path)
    upload_tileset(upload, 'static-images-' + key)


if __name__ == '__main__':
    data, _, _ = DF.Flow(
        DF.load(latest_file(), name='out',
                override_fields=dict(area_id=dict(type='string')),
                cast_strategy=DF.load.CAST_WITH_SCHEMA),
        DF.concatenate(
            dict(id=[],
                 is_city=[],
                 latest_confidence=['symptoms_ratio_confidence_weighted'],
                 latest_ratio=['symptoms_ratio_weighted'],
                 latest_reports=['num_reports_weighted'],
                 population=[]))).results()
    data = dict(((r.pop('id'), r.get('is_city')), r) for r in data[0])
    for key, is_city in [('cities', 1), ('neighborhoods', 0)]:
        process_file(key, is_city, geo_file(key), data)
if __name__ == '__main__':
    r, _, _ = DF.Flow(
        DF.load(all_data(), name='cities', headers=1,
                override_fields=dict(area_id=dict(type='string')),
                cast_strategy=DF.load.CAST_WITH_SCHEMA),
        DF.filter_rows(lambda r: r['is_city']),
        DF.add_field(
            'score_date', 'object',
            lambda r: dict(weekday=r['date'].isoweekday() % 7,
                           date=r['date'].toordinal(),
                           sr=float(r['symptoms_ratio_weighted'] or 0),
                           nr=int(r['num_reports_weighted']))),
        DF.concatenate(dict(id=[], city_name=[], score_date=[]),
                       target=dict(name='popup_data')),
        DF.join_with_self(
            'popup_data', '{city_name}',
            dict(id=None,
                 city_name=None,
                 scores=dict(name='score_date', aggregate='array'))),
        sort_limit_scores(),
        DF.filter_rows(lambda r: r['scores'] is not None),
        DF.add_field('nr', 'integer', lambda r: r['scores'][-1]['nr']),
        DF.add_field('sr', 'number', lambda r: r['scores'][-1]['sr']),
        split_to_weeks(),
        DF.add_field('translations', 'object',
                     lambda r: city_translations[r['city_name']]),
    ).results()
    popup_data = r[0]
    popup_data = dict((x.pop('id'), x) for x in popup_data)
def base_flow():
    sources, *_ = Flow(
        list_gdrive(),
        filter_rows(lambda row: (
            row['kind'] == 'drive#file' and
            row['mimeType'] == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        )),
        add_field('filename', 'string',
                  default=lambda row: 'pubfiles/{modifiedTime}-{id}.xlsx'.format(**row)),
        download_files(),
        add_field('sheet', 'string'),
        add_field('headers', 'integer', 1),
        get_sheets(),
    ).results()
    return Flow(
        *[
            load(source['filename'],
                 sheet=source['sheet'],
                 headers=source['headers'],
                 infer_strategy=load.INFER_STRINGS,
                 cast_strategy=load.CAST_TO_STRINGS,
                 name=source['filename'])
            for source in sources[0]
        ],
        filter_rows(lambda row: row.get('migdar_id') not in ('', 'None', None)),
        load('data/zotero/zotero.csv'),
        concatenate(
            fields={
                'migdar_id': [],
                'title': ['Title', ],
                'bib_title': [],
                'bib_related_parts': [],
                'notes': [],
                'tags': ['Tags'],
                'publisher': [],
                'languages': ['language_code'],
                'item_kind': ['Item Type', 'Item type', 'item_type'],
                'pubyear': ['pubyear/pubdate'],
                'life_areas': ['Life Domains', 'Domain'],
                'source_kind': ['Resource Type', 'Resource type'],
                'authors': ['author'],
                'url': ['URL'],
            },
            target=dict(
                name='publications',
                path='data/publications.csv'
            )
        ),
        fix_nones(),
        fix_urls(['url']),
        set_type('title', **{'es:title': True}),
        set_type('authors', **{'es:boost': True}),
        set_type('notes', **{'es:hebrew': True}),
        set_type('publisher', **{'es:boost': True}),
        add_field('year', 'integer', default=extract_year),
        split_and_translate('tags', 'tags', keyword=True, delimiter=','),
        split_and_translate('life_areas', 'life_areas', keyword=True, delimiter=','),
        split_and_translate('languages', 'languages', keyword=True, delimiter=' '),
        split_and_translate('source_kind', 'source_kind', keyword=True),
        split_and_translate('item_kind', 'item_kind', keyword=True),
        fix_links('notes'),
        verify_migdar_id(),
        add_computed_field([
            {'operation': 'format', 'target': 'doc_id', 'with': KEY_PATTERN},
            {'operation': 'format', 'target': 'page_title', 'with': PAGE_TITLE_PATTERN},
        ]),
        add_field('title_kw', 'string',
                  default=lambda row: row.get('title'),
                  **{'es:keyword': True}),
    )
def aggregate_test_results(servers, total_duration_seconds, datacenter, only_test_method, load_generator): overview_report = {} load_steps = [] for num, _ in servers.items(): log_txt_filename = os.path.join(DISTRIBUTED_LOAD_TESTS_OUTPUT_DIR, str(num), '.output', 'log.txt') http_csv_zst_filename = os.path.join(DISTRIBUTED_LOAD_TESTS_OUTPUT_DIR, str(num), '.output', 'warp-bench-data.http.csv.zst') http_csv_gz_filename = os.path.join(DISTRIBUTED_LOAD_TESTS_OUTPUT_DIR, str(num), '.output', 'warp-bench-data.http.csv.gz') http_csv_filename = os.path.join( DISTRIBUTED_LOAD_TESTS_OUTPUT_DIR, str(num), '.output', 'warp-bench-data.http.{}'.format('tsv' if load_generator == 'warp' else 'csv')) https_csv_zst_filename = os.path.join( DISTRIBUTED_LOAD_TESTS_OUTPUT_DIR, str(num), '.output', 'warp-bench-data.https.csv.zst') https_csv_gz_filename = os.path.join(DISTRIBUTED_LOAD_TESTS_OUTPUT_DIR, str(num), '.output', 'warp-bench-data.https.csv.gz') https_csv_filename = os.path.join( DISTRIBUTED_LOAD_TESTS_OUTPUT_DIR, str(num), '.output', 'warp-bench-data.https.{}'.format('tsv' if load_generator == 'warp' else 'csv')) if os.path.exists(log_txt_filename): try: with open(log_txt_filename) as f: log_txt = f.read().strip() for method in ('http', 'https'): if not only_test_method or only_test_method == method: tmp = log_txt.split( "--- {} load test successful, bucket_name=". format(method))[1].split(" ---")[0] bucket_name, dt = tmp.split(' | ') assert len(bucket_name) > 5 overview_report.setdefault( 'server_{}_{}'.format(num, method), {}).update(bucket_name=bucket_name, is_successful=True, dt=dt) except Exception: traceback.print_exc() for csv_filename, csv_zst_filename, csv_gz_filename, method in ( (http_csv_filename, http_csv_zst_filename, http_csv_gz_filename, 'http'), (https_csv_filename, https_csv_zst_filename, https_csv_gz_filename, 'https')): csv_compressed_filename = csv_zst_filename if load_generator == 'warp' else csv_gz_filename csv_compression_method = 'zst' if load_generator == 'warp' else 'gz' if (not only_test_method or only_test_method == method) and os.path.exists(csv_compressed_filename): load_steps.append( DF.load(csv_filename, name='{}_{}'.format(method, num), infer_strategy=DF.load.INFER_STRINGS)) csv_decompress(csv_compression_method, csv_compressed_filename, csv_filename) overview_report.setdefault( 'server_{}_{}'.format(num, method), {}).update(csv_size=os.path.getsize(csv_filename) if os. 
path.exists(csv_filename) else 0) assert len(load_steps) > 0, "none of the servers data was loaded" stats = defaultdict(int) def aggregate_stats(row): endpoint = row['endpoint'] try: op = row['op'] error = row['error'] bytes = int(row['bytes']) duration_ns = int(row['duration_ns']) first_byte = row['first_byte'] end = row['end'] duration_after_first_byte_ns = get_duration_ns(first_byte, end) assert duration_after_first_byte_ns >= 0, row assert '~' not in endpoint assert '~' not in op key_prefix = '{}~{}'.format(endpoint, op) if error: stats['{}~errors'.format(key_prefix)] += 1 else: stats['{}~successful-requests'.format(key_prefix)] += 1 stats['{}~bytes'.format(key_prefix)] += bytes stats['{}~duration_ns'.format(key_prefix)] += duration_ns stats['{}~duration_after_first_byte_ns'.format( key_prefix)] += duration_after_first_byte_ns if duration_ns > stats['{}~max-duration_ns'.format( key_prefix)]: stats['{}~max-duration_ns'.format( key_prefix)] = duration_ns if stats['{}~min-duration_ns'.format( key_prefix)] == 0 or duration_ns < stats[ '{}~min-duration_ns'.format(key_prefix)]: stats['{}~min-duration_ns'.format( key_prefix)] = duration_ns duration_seconds = duration_ns / 1000000000 bucket = None for bucket_max_seconds in BUCKET_MAX_SECONDS: if duration_seconds <= bucket_max_seconds: bucket = bucket_max_seconds break if not bucket: bucket = BUCKET_INF stats['{}~request-duration-{}'.format(key_prefix, bucket)] += 1 except: print("Exception in res {} endpoint {}".format( row['source_resource_name'], endpoint)) raise def aggregate_errors(rows): for row in rows: if row['error'] and len(row['error']) > 0: yield row concatenate_step = DF.concatenate( { "source_resource_name": [], "op": [], "bytes": [], "endpoint": [], "file": [], "error": [], "start": [], "first_byte": [], "end": [], "duration_ns": [] }, { "name": "warp_bench_data", "path": "data/warp_bench_data.csv" }) def add_source_resource_name_step(rows): for row in rows: row['source_resource_name'] = rows.res.name yield row DF.Flow( *load_steps, add_source_resource_name_step, concatenate_step, aggregate_stats, DF.dump_to_path( os.path.join(DISTRIBUTED_LOAD_TESTS_OUTPUT_DIR, 'warp_bench_data'))).process() try: DF.Flow( *load_steps, concatenate_step, aggregate_errors, DF.dump_to_path( os.path.join(DISTRIBUTED_LOAD_TESTS_OUTPUT_DIR, 'warp_bench_data_errors'))).process() except Exception: print("No Errors!") def dump_stats(): all_field_names = set() normalized_stats = {} for k, v in stats.items(): endpoint, op, *metric = k.split('~') metric = '~'.join(metric) field_name = '{}-{}'.format(op, metric) all_field_names.add(field_name) normalized_stats.setdefault(endpoint, {})[field_name] = v totals_sum = defaultdict(int) for endpoint, row in normalized_stats.items(): if not endpoint or len(endpoint) < 4: continue for field_name in all_field_names: if field_name not in row: row[field_name] = 0.0 elif not row[field_name]: row[field_name] = 0.0 for op in ['STAT', 'DELETE', 'GET', 'PUT']: total_bytes = row.get('{}-bytes'.format(op)) or 0 total_megabytes = total_bytes / 1024 / 1024 successful_requests = row.get( '{}-successful-requests'.format(op)) or 0 error_requests = row.get('{}-errors'.format(op)) or 0 total_requests = successful_requests + error_requests row['{}-percent-errors'.format(op)] = ( error_requests / total_requests * 100) if total_requests > 0 else 0 row['{}-requests-per-second'.format( op)] = total_requests / total_duration_seconds row['{}-successful-requests-per-second'.format( op)] = successful_requests / total_duration_seconds if op in 
['GET', 'PUT']: row['{}-megabytes'.format(op)] = total_megabytes row['{}-megabytes-per-second'.format( op)] = total_megabytes / total_duration_seconds for bucket in [*BUCKET_MAX_SECONDS, BUCKET_INF]: row['{}-request-duration-percent-{}'.format( op, bucket)] = ( (row.get('{}-request-duration-{}'.format( op, bucket)) or 0) / successful_requests * 100) if successful_requests > 0 else 0 all_ops_total_errors = sum([ (row.get('{}-errors'.format(op)) or 0) for op in ['GET', 'PUT', 'STAT', 'DELETE'] ]) all_ops_total_successful_requests = sum([ (row.get('{}-successful-requests'.format(op)) or 0) for op in ['GET', 'PUT', 'STAT', 'DELETE'] ]) all_ops_total_requests = all_ops_total_errors + all_ops_total_successful_requests all_ops_total_bytes = sum([(row.get('{}-bytes'.format(op)) or 0) for op in ['GET', 'PUT']]) all_ops_total_megabytes = all_ops_total_bytes / 1024 / 1024 output_row = { 'endpoint': endpoint, 'datacenter': get_datacenter(endpoint, datacenter), 'total-percent-errors': (all_ops_total_errors / all_ops_total_requests * 100) if all_ops_total_requests > 0 else 0, 'total-requests-per-second': all_ops_total_requests / total_duration_seconds, 'total-successful-requests-per-second': all_ops_total_successful_requests / total_duration_seconds, 'total-megabytes-per-second': all_ops_total_megabytes / total_duration_seconds, 'total-errors': all_ops_total_errors, 'total-successful-requests': all_ops_total_successful_requests, 'total-megabytes': all_ops_total_megabytes, 'total-bytes': all_ops_total_bytes, **{ k: row.get(k) or 0 for k in [ *[ '{}-percent-errors'.format(op) for op in [ 'GET', 'PUT', 'STAT', 'DELETE' ] ], *[ '{}-requests-per-second'.format(op) for op in [ 'GET', 'PUT', 'STAT', 'DELETE' ] ], *[ '{}-successful-requests-per-second'.format(op) for op in [ 'GET', 'PUT', 'STAT', 'DELETE' ] ], *[ '{}-megabytes-per-second'.format(op) for op in [ 'GET', 'PUT' ] ], *[ '{}-errors'.format(op) for op in [ 'GET', 'PUT', 'STAT', 'DELETE' ] ], *[ '{}-successful-requests'.format(op) for op in [ 'GET', 'PUT', 'STAT', 'DELETE' ] ], *['{}-megabytes'.format(op) for op in ['GET', 'PUT']], *['{}-bytes'.format(op) for op in ['GET', 'PUT']], *[ 'GET-request-duration-percent-{}'.format(bucket) for bucket in [ *BUCKET_MAX_SECONDS, BUCKET_INF ] ], *[ 'PUT-request-duration-percent-{}'.format(bucket) for bucket in [ *BUCKET_MAX_SECONDS, BUCKET_INF ] ], ] } } yield output_row for metric in ['errors', 'successful-requests', 'bytes']: for op in [ 'total', 'GET', 'PUT', *(['STAT', 'DELETE'] if metric not in ['bytes'] else []) ]: op_metric = '{}-{}'.format(op, metric) totals_sum[op_metric] += (output_row.get(op_metric) or 0) for op in ['GET', 'PUT']: for bucket in [*BUCKET_MAX_SECONDS, BUCKET_INF]: op_bucket_metric_key = '{}-request-duration-{}'.format( op, bucket) totals_sum[op_bucket_metric_key] += ( row.get(op_bucket_metric_key) or 0) totals_sum_total_successful_requests = totals_sum[ 'total-successful-requests'] totals_sum_total_errors = totals_sum['total-errors'] totals_sum_total_requests = totals_sum_total_successful_requests + totals_sum_total_errors totals_sum_total_bytes = totals_sum['total-bytes'] totals_sum[ 'total-megabytes'] = totals_sum_total_megabytes = totals_sum_total_bytes / 1024 / 1024 totals_sum['total-percent-errors'] = ( totals_sum_total_errors / totals_sum_total_requests * 100) if totals_sum_total_requests > 0 else 0 totals_sum[ 'total-requests-per-second'] = totals_sum_total_requests / total_duration_seconds totals_sum[ 'total-successful-requests-per-second'] = totals_sum_total_successful_requests 
/ total_duration_seconds totals_sum[ 'total-megabytes-per-second'] = totals_sum_total_megabytes / total_duration_seconds for op in ['GET', 'PUT', 'STAT', 'DELETE']: op_errors = totals_sum['{}-errors'.format(op)] op_successful_requests = totals_sum[ '{}-successful-requests'.format(op)] op_total_requests = op_errors + op_successful_requests totals_sum['{}-percent-errors'.format(op)] = ( op_errors / op_total_requests * 100) if op_total_requests > 0 else 0 totals_sum['{}-requests-per-second'.format( op)] = op_total_requests / total_duration_seconds totals_sum['{}-successful-requests-per-second'.format( op)] = op_successful_requests / total_duration_seconds if op in ['GET', 'PUT']: totals_sum['{}-megabytes'.format( op)] = totals_sum['{}-bytes'.format(op)] / 1024 / 1024 totals_sum['{}-megabytes-per-second'.format(op)] = totals_sum[ '{}-megabytes'.format(op)] / total_duration_seconds for bucket in [*BUCKET_MAX_SECONDS, BUCKET_INF]: totals_sum['{}-request-duration-percent-{}'.format( op, bucket)] = (totals_sum['{}-request-duration-{}'.format( op, bucket)] / op_successful_requests * 100) if op_successful_requests else 0 yield {'endpoint': '*', 'datacenter': '*', **totals_sum} DF.Flow( dump_stats(), DF.dump_to_path( os.path.join(DISTRIBUTED_LOAD_TESTS_OUTPUT_DIR, 'warp_bench_data_stats'))).process() pprint(overview_report) for num, _ in servers.items(): for method in ['http', 'https']: if not only_test_method or only_test_method == method: report = overview_report.get('server_{}_{}'.format( num, method)) or {} csv_size = report.get('csv_size') or 0 is_successful = report.get('is_successful') if csv_size < 1: print( "csv size for server {}, method {} is missing".format( num, method)) return False if not is_successful: print("server {}, method {} was not successful".format( num, method)) return False return True
def flow(parameters, *_):
    def take_first(field):
        def f(row):
            if field in row and isinstance(row[field], list):
                row[field] = row[field][0]
        return Flow(
            f,
            set_type(field, type='string'),
        )

    def datetime_to_date(field):
        def f(row):
            if row.get(field):
                row[field] = row[field].date()
        return Flow(
            f,
            set_type(field, type='date'),
        )

    def approve(parameters):
        def func(row):
            if parameters.get('filter-out') is None:
                return True
            bad_phrase = parameters['filter-out']
            for f in ('page_title', 'description'):
                if row.get(f) and bad_phrase in row[f]:
                    return False
            return True
        return func

    return Flow(
        fetcher(parameters),
        concatenate(dict(
            page_title=['Title'],
            publication_id=['ItemId'],
            tender_id=['ItemUniqueId'],
            publisher=['OfficeDesc'],
            start_date=['PublishDate'],
            claim_date=['LastDate'],
            decision=['StatusDesc'],
            description=['Description'],
            last_update_date=['UpdateDate'],
            base_url=['BaseUrl'],
            url_name=['UrlName'],
            tender_type_he=['PublicationTypeDesc'],
        ), resources=-1),
        add_field('tender_type', 'string',
                  default=parameters['tender_type'], resources=-1),
        take_first('publisher'),
        take_first('tender_type_he'),
        add_field('page_url', 'string',
                  default=lambda row: 'https://www.gov.il/he{base_url}{url_name}'.format(**row)),
        # delete_fields(['base_url', 'url_name']),
        filter_rows(approve(parameters)),
        set_type('publication_id', type='integer'),
        set_type('start_date', type='datetime', format=DATE_FMT),
        set_type('last_update_date', type='datetime', format=DATE_FMT),
        set_type('claim_date', type='datetime', format=DATE_FMT),
        datetime_to_date('last_update_date'),
        datetime_to_date('start_date'),
        set_primary_key(['publication_id', 'tender_type', 'tender_id']),
        dedup(),
        update_resource(-1, **parameters.pop('resource')),
        update_resource(-1, **{'dpp:streaming': True}),
        validate(),
    )
def process_demographics(stack): key = 'stack:demographics' try: demographics_cards = _cache.get(key) except KeyError: def add_source(): def f(rows): for row in rows: row['source'] = rows.res.name yield row return DF.Flow( DF.add_field('source', 'string'), f ) def map_to_cards(): MAP = { ("דו''ח אג''ס לפי עולים וותיקים", ("סה''כ עולים",) ): 'immigrants', ("דו''ח אג''ס לפי קבוצות גיל", ('0-5', '6-12') ): 'kids', ("דו''ח אג''ס לפי קבוצות גיל", ('13-17',) ): 'teenagers', ("דו''ח אג''ס לפי קבוצות גיל", ('60-64', '65-69', '70-74', '75-120') ): 'elderly', ("דו''ח אג''ס לפי קבוצות גיל", ('18-21','22-24','25-29','30-34','35-39','40-44','45-49','50-54','55-59') ): 'adults', } def f(rows): for row in rows: for (source, kinds), kind in MAP.items(): if row['source'] == source and row['kind'] in kinds: row['kind'] = kind yield row return f s2n = dict( (int(stat_area), f['properties']['title']) for f in get_neighborhood_features() for stat_area in f['properties']['stat_areas'] ) MAP2 = dict( adults=('אוכלוסיה בוגרת', 'גברים ונשים בין גיל 18 ל-60', 0), kids=('ילדים', 'תינוקות וילדים עד גיל 12', 1), teenagers=('בני נוער', 'נערים ונערות עד גיל 18', 2), elderly=('הגיל השלישי', 'גברים ונשים מעל גיל 60', 3), immigrants=('עולים לישראל', 'תושבים שאינם ילידי ישראל', 4), ) demographics_cards = DF.Flow( *[ DF.load(f, headers=4) for f in glob.glob('demographics/*.csv') ], DF.add_field('stat_id', 'string', lambda r: r["אג''ס"]), DF.add_field('total', 'number', lambda r: r.get("סה''כ")), DF.delete_fields(["אג''ס", "סה''כ "]), DF.unpivot([dict( name="([-'א-ת0-9 ].+)", keys=dict( kind=r'\1' ) )], [dict( name='kind', type='string' )], dict( name='value', type='number' )), DF.validate(), add_source(), map_to_cards(), DF.concatenate(dict( total=[], value=[], kind=[], stat_id=[] )), DF.add_field('neighborhood', 'string', lambda r: s2n.get(int(r['stat_id']))), DF.filter_rows(lambda r: r['neighborhood']), DF.join_with_self('concat', ['neighborhood', 'kind'], dict( neighborhood=None, kind=None, total=dict(aggregate='sum'), value=dict(aggregate='sum'), )), DF.duplicate('concat', 'maxes'), DF.join_with_self('concat', ['neighborhood'], dict(neighborhood=None, total=None)), DF.join('concat', ['neighborhood'], 'maxes', ['neighborhood'], dict( total=None, )), DF.add_field('score_value', 'number', lambda r: r['value']), # /r['total'] DF.sort_rows('{score_value}', reverse=True), DF.duplicate('maxes', 'demographics'), DF.join_with_self('maxes', ['kind'], dict(kind=None, max_value=dict(name='score_value', aggregate='max'))), DF.join('maxes', ['kind'], 'demographics', ['kind'], dict(max_value=None)), DF.add_field('geometry_score', 'number', lambda r: 6*r['score_value']/r['max_value']), DF.add_field('score_display', 'string', lambda r: '{:,} ({:.0f}%)'.format(r['value'], 100*r['score_value']/r['total'])), DF.add_field('scores', 'object', lambda r: dict( title=r['neighborhood'], score_value=float(r['score_value']), score_display=r['score_display'], geometry_score=float(r['geometry_score']), )), DF.join_with_self('demographics', ['kind'], dict( kind=None, scores=dict(aggregate='array'), )), DF.add_field('title', 'string', lambda r: MAP2[r['kind']][0]), DF.add_field('content', 'string', lambda r: MAP2[r['kind']][1]), DF.add_field('order', 'integer', lambda r: MAP2[r['kind']][2]), DF.sort_rows('{order}'), DF.delete_fields(['kind']), ).results()[0][0] _cache.set(key, demographics_cards) # features = [ # dict(type='Feature', geometry=r['geometry'], properties=dict(title=r['neighborhoods'][0])) # for r in DF.Flow( # 
DF.load('geo/stat-areas/stat-areas/datapackage.json'), # ).results()[0][0] # ] # geometry=dict(type='FeatureCollection', features=features) stack.update(dict( map=True, scheme='green', currentField='neighborhood', layout='scores', # geometry=geometry )) stack.setdefault('cards', []).extend(demographics_cards)
def process_institutions(stack):
    key = 'stack:institutions'
    try:
        institutions_cards = _cache.get(key)
    except KeyError:
        CRS = '+ellps=GRS80 +k=1.00007 +lat_0=31.73439361111111 +lon_0=35.20451694444445 +no_defs +proj=tmerc +units=m +x_0=219529.584 +y_0=626907.39'
        projector = pyproj.Proj(CRS)

        def proj():
            def func(row):
                row['lon'], row['lat'] = projector(row['X'], row['Y'], inverse=True)
            return DF.Flow(
                DF.add_field('lon', 'number'),
                DF.add_field('lat', 'number'),
                func,
                DF.delete_fields(['X', 'Y'])
            )

        def translate_kind():
            translations = {
                'מרפאה': 'מרפאות',
                'איצטדיון': 'איצטדיון',
                'ספרייה': 'ספריות',
                'בית ספר': 'בתי ספר',
                'מועדון קהילתי כולל מרכז צעירים': 'מועדון קהילתי',
                'בית כנסת': 'בתי כנסת',
                'מועדון נוער': 'מועדון נוער',
                'אולם מופעים, היכל תרבות': 'מוסדות תרבות',
                'מועדון קשישים, מרכז לאזרחים ותיקים,מרכז יום לקשישים': 'מרכזי פעילות לקשישים',
            }

            def func(row):
                row['kind'] = translations[row['kind']]
            return func

        institutions_cards = DF.Flow(
            *[
                DF.load(f) for f in glob.glob('institutions/*xlsx')
            ],
            DF.concatenate(dict(
                kind=['סוג המוסד'],
                title=['שם המוסד'],
                address=['כתובת'],
                X=[],
                Y=[]
            )),
            translate_kind(),
            proj(),
            DF.add_field('feature', 'object', lambda r: geojson.Feature(
                properties=dict(title=r['title'], address=r['address']),
                geometry=geojson.Point(coordinates=[float(r['lon']), float(r['lat'])])
            )),
            DF.delete_fields(['title', 'lon', 'lat', 'address']),
            DF.join_with_self('concat', ['kind'], dict(
                title=dict(name='kind'),
                features=dict(name='feature', aggregate='array')
            )),
            DF.sort_rows('{title}', reverse=True),
            DF.add_field('pointGeometry', 'object',
                         lambda r: geojson.FeatureCollection(features=r['features'])),
            DF.add_field('content', 'string', ' '),
            DF.delete_fields(['features']),
            # DF.printer(tablefmt='html')
        ).results()[0][0]
        _cache.set(key, institutions_cards)

    stack.update(dict(
        map=True,
    ))
    stack.setdefault('cards', [])
    current_cards = dict(
        (c['title'], c) for c in stack['cards']
    )
    for card in institutions_cards:
        current_card = current_cards.pop(card['title'], None)
        if current_card is not None:
            card['content'] = current_card['content']
        else:
            print('SPURIOUS CARD for INSTITUTIONS', card['title'])
    stack['cards'] = [
        c for c in stack['cards'] if c['title'] in current_cards
    ] + institutions_cards
def fix_doc_id(rows):
    used = {}
    for row in rows:
        doc_id = row['doc_id']
        used.setdefault(doc_id, 0)
        if used[doc_id] != 0:
            row['doc_id'] += '.{}'.format(used[doc_id])
        yield row
        used[doc_id] += 1


cur_year = datetime.date.today().year

org_flow = DF.Flow(
    DF.load(ORGS_URL, name='orgs'),
    DF.concatenate(headers, resources='orgs', target=dict(name='orgs')),
    fix_urls(['org_website', 'org_facebook']),
    DF.add_field(
        'alt_names', 'array',
        default=lambda r: [
            r[x] for x in [
                'alt_name%d' % i for i in range(1, 6)
            ] + ['org_name']
            if x in r and r[x]
        ]
    ),
    DF.add_field('compact_services', 'string', lambda row: row.get('provided_services')),
    DF.delete_fields(['alt_name[1-5]']),
    *[
def prepare():
    fp = re.compile('\w+', re.UNICODE)

    def fingerprint(x):
        return ''.join(fp.findall(x.upper()))

    langs, _, _ = DF.Flow(DF.load(all_data()),
                          DF.concatenate(dict(city_name=[]))).results()
    langs = dict((k, dict()) for k in set(x['city_name'] for x in langs[0]))
    osm_he = {}
    s = tabulator.Stream(data_file('places.csv'))
    s.open()
    for item in s.iter():
        if len(item) == 0:
            continue
        item = json.loads(item[0])
        he = item.get('name:he', item.get('name'))
        if he is None:
            continue
        osm_he.setdefault(fingerprint(he), {}).update(item)
    s = tabulator.Stream(data_file('yeshuvim.csv'), headers=1)
    s.open()
    for item in s.iter(keyed=True):
        he = item['שם_ישוב'].strip().replace('(', 'XXX').replace(
            ')', '(').replace('XXX', ')').replace('  ', ' ')
        en = item['שם_ישוב_לועזי'].strip().replace('  ', ' ').replace(
            "'", 'xxx').title().replace('xxx', "'").replace('Xxx', "'")
        rec = {'name:en': en, 'name:he': he}
        he = fingerprint(he)
        osm_he.setdefault(he, {}).update(rec)
    osm = {}
    for item in osm_he.values():
        names = dict(
            (k[5:], v) for k, v in item.items() if k.startswith('name:'))
        if 'name' not in item and 'he' not in names:
            continue
        names.setdefault('he', item.pop('name', None))
        for v in names.values():
            osm.setdefault(fingerprint(v), {}).update(names)
        en = names.get('en')
        if en and '-' in en:
            for v in en.split('-'):
                osm.setdefault(fingerprint(v), {}).update(names)
        old_names = dict(('he' if k == 'old_name' else k[9:], v)
                         for k, v in item.items() if k.startswith('old_name'))
        for v in old_names.values():
            for vv in v.split(';'):
                osm.setdefault(fingerprint(vv), {}).update(names)
    for kk, place in langs.items():
        k = fingerprint(kk)
        if k in osm:
            place.update(osm[k])
        else:
            match = process.extractOne(k, osm.keys(), scorer=fuzz.ratio)
            if match[1] >= 50:
                k = match[0]
                place.update(osm[k])
            else:
                print('no match for', place, k, match)
    normalized = dict((k, dict(he=v.get('he'),
                               en=v.get('en'),
                               es=v.get('es', v.get('en')),
                               fr=v.get('fr', v.get('en')),
                               ru=v.get('ru', v.get('en')),
                               ar=v.get('ar', v.get('he'))))
                      for k, v in langs.items())
    json.dump(normalized, open(DATA_FILE, 'w'))