def prepare_locations():
    """Build the street-address location index from the cached addresses.

    Loads the '_cache_addresses' datapackage, groups house numbers by street,
    and dumps/checkpoints the result under '_cache_locations'.
    Returns the rows of the first (only) resource.
    """
    # Make sure the '_cache_addresses' datapackage exists before loading it.
    prepare_addresses()
    return DF.Flow(
        DF.load('_cache_addresses/datapackage.json'),
        # Human-readable address: "<street> <number><letter>" (letter optional).
        DF.add_field(
            'address', 'string',
            lambda r: '{} {}{}'.format(
                r['street_name'], r['house_number'], r['letter'] or '')),
        # Structured item per address: coordinates + arnona zones, keyed for display.
        DF.add_field(
            'item', 'object',
            lambda r: dict(
                value=dict(lat=float(r['lat']),
                           lon=float(r['lon']),
                           arnona_zones=r['arnona_zones'],
                           שם=r['address']),
                display=r['address'])),
        # Sort by house number so each street's items array comes out ordered.
        DF.sort_rows('{house_number}'),
        DF.delete_fields([
            'house_number', 'letter', 'lat', 'lon', 'arnona_zones', 'address'
        ]),
        # One row per street, with all of its address items aggregated.
        DF.join_with_self(
            'concat', ['street_name'],
            dict(display=dict(name='street_name'),
                 items=dict(name='item', aggregate='array'))),
        # Secondary sort key computed by the module-level sort_street_address().
        DF.add_field('sort_street_address', 'string',
                     lambda r: sort_street_address(r['display'])),
        DF.sort_rows('{sort_street_address}'),
        DF.delete_fields(['sort_street_address']),
        DF.printer(),
        DF.dump_to_path('_cache_locations'),
        DF.checkpoint('_cache_locations')).results()[0][0]
def decp_processing():
    """Process the DECP (French public procurement) dataset.

    Reads 'decp.csv', keeps only the current version of each contract,
    builds a contracts-only table (without contractor columns) plus a
    contractors table, and dumps everything to the 'decp' directory.
    """
    flow = Flow(
        # Load the CSV produced by the earlier JSON conversion
        load("decp.csv"),
        # Force identifier-like columns to string so leading zeros survive.
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),

        # Sort by rootId and seq to prepare for building donneesActuelles
        sort_rows('{rootId}:{seq}', resources=0, reverse=True),

        donnees_actuelles,

        # rootId and seq can now be dropped
        delete_fields(["rootId", "seq"], resources=0, regex=False),

        sort_rows('{datePublicationDonnees}', resources=0, reverse=True),

        # New table dedicated to contracts, without contractor data
        # NOTE(review): print(...) evaluates at Flow construction time and
        # inserts None as a step — presumably intentional progress logging;
        # confirm the Flow implementation tolerates None steps.
        print("Création de la table dédiée aux marchés..."),
        duplicate(source="decp",
                  target_name="decp-sans-titulaires",
                  target_path="decp-sans-titulaires.csv",
                  duplicate_to_end=True),
        delete_fields([
            "titulaire.id",
            "titulaire.denominationSociale",
            "titulaire.typeIdentifiant"
        ], resources="decp-sans-titulaires", regex=False),
        set_primary_key(["uid"], resources="decp-sans-titulaires"),
        deduplicate(),

        # Loading of the previous DECP as CSV, to extract the new data
        # print("Téléchargement des données tabulaires précédentes..."),
        # load("https://decp.info/db/decp.csv?_size=max&_dl=1", name="previous-decp"),
        # set_type("acheteur.id", type="string"),
        # set_type("titulaire.id", type="string"),
        # set_type("codeCPV", type="string"),
        # set_type("lieuExecution.code", type="string"),
        # delete_fields(["rowid"], resources="previous-decp", regex=False),
        # #print("Fusion des données tabulaires précédentes et des données d'aujourd'hui..."),
        # concatenate({},target={"name": "decp-titulaires","path": "decp-titulaires.csv"},resources=["decp","previous-decp"]),

        # Load the previous contractor-dedicated data
        print("Chargement des données titulaires..."),
        load("decp-titulaires.csv", name="decp-titulaires"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),
        set_type("departement", type="string"),
        set_type("codeAPE", type="string"),
        print("Enregistrement des données sur le disque..."),
        dump_to_path("decp"))
    flow.process()
def test_sort_rows_decimal():
    """Values cast to Decimal by the schema sort numerically, not textually."""
    from decimal import Decimal
    from dataflows import sort_rows, load

    flow = Flow(
        load('data/numbers.csv', cast_strategy=load.CAST_WITH_SCHEMA),
        sort_rows(key='{a}'),
    )
    results, dp, _ = flow.results()

    expected_order = ['-1000', '-0.5', '-0.4', '0', '1.1', '2', '10', '1000']
    assert list(results[0]) == [{'a': Decimal(v)} for v in expected_order]
def test_sort_rows_datetime():
    """Date values already in ascending order pass through sort_rows unchanged."""
    import datetime
    from dataflows import sort_rows

    # i=0 -> 2000-01-03, i=1 -> 2010-01-02, i=2 -> 2020-01-01 (ascending).
    rows = [{'a': datetime.date(2000 + 10 * i, 1, 3 - i)} for i in range(3)]
    expected = [dict(r) for r in rows]

    results, _, _ = Flow(rows, sort_rows(key='{a}')).results()
    assert list(results[0]) == expected
def test_sort_reverse_many_rows():
    """Reverse sort on a composite key works on a large generated stream."""
    from dataflows import sort_rows

    stream = ({'a': n, 'b': n % 5} for n in range(1000))
    output, _, _ = Flow(
        stream,
        sort_rows(key='{b}{a}', reverse=True, batch_size=0),
    ).results()

    rows = output[0]
    # Highest composite key first...
    assert rows[:2] == [{'a': 999, 'b': 4}, {'a': 994, 'b': 4}]
    # ...lowest composite key last.
    assert rows[-2:] == [{'a': 100, 'b': 0}, {'a': 0, 'b': 0}]
def test_sort_rows():
    """Rows are ordered by the composite '{b}{a}' key (b first, then a)."""
    from dataflows import sort_rows

    data = [{'a': a, 'b': b} for a, b in [(1, 3), (2, 3), (3, 1), (4, 1)]]
    results, _, _ = Flow(data, sort_rows(key='{b}{a}')).results()

    assert list(results[0]) == [
        {'a': 3, 'b': 1},
        {'a': 4, 'b': 1},
        {'a': 1, 'b': 3},
        {'a': 2, 'b': 3},
    ]
def postflow(self):
    """Return a Flow that deduplicates rows by the configured primary key.

    Rows are sorted by ORDER_BY_KEY and then collapsed with join_with_self
    so the last row per key wins.  Because join_with_self rebuilds the
    resource schema, the original primaryKey is saved before the join and
    restored afterwards.
    """
    # Config stores composite keys with ':' separators; schema field names use '-'.
    key_field_names = [
        ct.replace(':', '-')
        for ct in self.config.get(CONFIG_PRIMARY_KEY)
    ]

    def save_pks(saved_pk):
        # Package-level step: capture the resource's primaryKey into saved_pk
        # before join_with_self drops it.
        def func(package: PackageWrapper):
            for res in package.pkg.descriptor['resources']:
                if res['name'] == RESOURCE_NAME:
                    saved_pk['pk'] = res['schema'].get('primaryKey', [])
            yield package.pkg
            yield from package
        return func

    def restore_pks(saved_pk):
        # Package-level step: re-attach the saved primaryKey to the rebuilt schema.
        def func(package: PackageWrapper):
            for res in package.pkg.descriptor['resources']:
                if res['name'] == RESOURCE_NAME:
                    res['schema']['primaryKey'] = saved_pk['pk']
            yield package.pkg
            yield from package
        return func

    # Mutable holder shared between the save and restore steps.
    saved_pk = dict(pk=[])
    steps = [
        save_pks(saved_pk),
        sort_rows(self.ORDER_BY_KEY, resources=RESOURCE_NAME),
        # Keep key fields as-is; every other field takes its last seen value
        # (rows were just sorted, so 'last' is deterministic).
        join_with_self(
            RESOURCE_NAME, key_field_names, {
                **dict((f, {}) for f in key_field_names),
                '*': dict(aggregate='last')
            }),
        restore_pks(saved_pk)
    ]
    f = Flow(*steps)
    return f
def flow(*_):
    """Build the social-services datapackage.

    Concatenates the per-resource loads into one 'social_services' resource,
    then collapses it to one row per (publisher_name, activity_name) with a
    per-year 'history' array.
    """
    prepare()
    # Fields snapshotted into each row's yearly 'history' record.
    yearly_fields = [
        'year', 'unit', 'subunit', 'subsubunit', 'allocated_budget',
        'num_beneficiaries'
    ]
    return DF.Flow(
        *[
            DF.load('tmp/' + resource_name + '/datapackage.json')
            for resource_name, _ in loads
        ],
        DF.concatenate(
            FIELD_MAPPING,
            dict(name='social_services', path='social_services.csv')),
        # Newest year first, so the aggregated history arrays are newest-first.
        DF.sort_rows('{year}', reverse=True),
        # Snapshot the yearly values; Decimal is converted to int so the
        # object field stays JSON-serializable.
        DF.add_field(
            'history', 'object',
            lambda r: dict(
                (k, r[k] if not isinstance(r[k], decimal.Decimal) else int(r[k]))
                for k in yearly_fields)),
        DF.printer(),
        # One row per activity, with min/max year and the full year-by-year history.
        DF.join_with_self(
            'social_services', ['publisher_name', 'activity_name'],
            dict(
                publisher_name=None,
                activity_name=None,
                activity_description=dict(aggregate='set'),
                min_year=dict(name='year', aggregate='min'),
                max_year=dict(name='year', aggregate='max'),
                history=dict(aggregate='array'),
            )),
        DF.add_field('kind', 'string', 'gov_social_service'),
        DF.add_field('kind_he', 'string', 'שירות חברתי'),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        DF.printer(),
    )
def test_join():
    """End-to-end test of join() and join_with_self() aggregations."""
    from dataflows import Flow, join, join_with_self, set_type, sort_rows
    from decimal import Decimal

    characters = [
        {'first_name': 'Jaime', 'house': 'Lannister', 'last_name': 'Lannister', 'age': 34},
        {'first_name': 'Tyrion', 'house': 'Lannister', 'last_name': 'Lannister', 'age': 27},
        {'first_name': 'Cersei', 'house': 'Lannister', 'last_name': 'Lannister', 'age': 34},
        {'first_name': 'Jon', 'house': 'Stark', 'last_name': 'Snow', 'age': 17},
        {'first_name': 'Sansa', 'house': 'Stark', 'last_name': 'Stark', 'age': 14},
        {'first_name': 'Rickon', 'house': 'Stark', 'last_name': 'Stark', 'age': 5},
        {'first_name': 'Arya', 'house': 'Stark', 'last_name': 'Stark', 'age': 11},
        {'first_name': 'Bran', 'house': 'Stark', 'last_name': 'Stark', 'age': 10},
        {'first_name': 'Daenerys', 'house': 'Targaryen', 'last_name': 'Targaryen', 'age': 16},
    ]

    houses = [
        {'house': 'House of Lannister'},
        {'house': 'House of Greyjoy'},
        {'house': 'House of Stark'},
        {'house': 'House of Targaryen'},
        {'house': 'House of Martell'},
        {'house': 'House of Tyrell'},
    ]

    # Join characters (res_1) into houses (res_2) on house name, exercising
    # max/avg/last/count/counters aggregations.  full=False drops houses with
    # no characters; source_delete=True removes the source resource.
    res, _, _ = Flow(
        characters,
        set_type('age', type='number'),
        houses,
        join('res_1', 'House of {house}', 'res_2', '{house}',
             dict(max_age={
                 'name': 'age',
                 'aggregate': 'max'
             },
                 avg_age={
                 'name': 'age',
                 'aggregate': 'avg'
             },
                 representative={
                 'name': 'first_name',
                 'aggregate': 'last'
             },
                 representative_age={'name': 'age'},
                 number_of_characters={'aggregate': 'count'},
                 last_names={
                 'name': 'last_name',
                 'aggregate': 'counters'
             }),
             full=False,
             source_delete=True)).results()

    assert res[0] == [
        {
            'avg_age': Decimal('31.66666666666666666666666667'),
            'house': 'House of Lannister',
            'max_age': Decimal(34),
            'number_of_characters': 3,
            'representative': 'Cersei',
            'representative_age': Decimal(34),
            'last_names': [('Lannister', 3)]
        },
        {
            'avg_age': Decimal('11.4'),
            'house': 'House of Stark',
            'max_age': Decimal(17),
            'number_of_characters': 5,
            'representative': 'Bran',
            'representative_age': Decimal(10),
            'last_names': [('Stark', 4), ('Snow', 1)]
        },
        {
            'avg_age': Decimal(16),
            'house': 'House of Targaryen',
            'max_age': Decimal(16),
            'number_of_characters': 1,
            'representative': 'Daenerys',
            'representative_age': Decimal(16),
            'last_names': [('Targaryen', 1)]
        },
    ]

    # Find youngest of each house: sort ascending by zero-padded age, then
    # join_with_self keeping the 'first' (= youngest) row per house.
    res, _, _ = Flow(
        characters,
        set_type('age', type='number'),
        sort_rows('{age:02}'),
        join_with_self('res_1', '{house}', {
            'the_house': {
                'name': 'house'
            },
            '*': {
                'aggregate': 'first'
            },
        }),
        sort_rows('{the_house}')).results()

    assert res[0] == [{
        'the_house': 'Lannister',
        'first_name': 'Tyrion',
        'last_name': 'Lannister',
        'age': Decimal('27')
    }, {
        'the_house': 'Stark',
        'first_name': 'Rickon',
        'last_name': 'Stark',
        'age': Decimal('5')
    }, {
        'the_house': 'Targaryen',
        'first_name': 'Daenerys',
        'last_name': 'Targaryen',
        'age': Decimal('16')
    }]
}, { "format": "default", "groupChar": "", "name": "Recovered", "title": "Cumulative total recovered cases to date", "type": "integer" }, { "format": "default", "groupChar": "", "name": "Deaths", "title": "Cumulative total deaths to date", "type": "integer" }]), checkpoint('processed_data'), # Sort rows by date and country sort_rows('{Country/Region}{Province/State}{Date}', resources='time-series-19-covid-combined'), # Duplicate the stream to create aggregated data duplicate(source='time-series-19-covid-combined', target_name='worldwide-aggregated', target_path='data/worldwide-aggregated.csv'), join_with_self(resource_name='worldwide-aggregated', join_key=['Date'], fields=dict(Date={'name': 'Date'}, Confirmed={ 'name': 'Confirmed', 'aggregate': 'sum' }, Recovered={ 'name': 'Recovered', 'aggregate': 'sum' },
"name": "Province/State", "type": "string" }, operation="format", with_="{Province_State}", resources=["us_confirmed", "us_deaths"], ), delete_fields( ["Long_", "Country_Region", "Province_State"], resources=["us_confirmed", "us_deaths"], ), checkpoint("processed_data"), printer(), # Sort rows by date and country sort_rows( "{Country/Region}{Province/State}{Date}", resources="time-series-19-covid-combined", ), # Duplicate the stream to create aggregated data duplicate( source="time-series-19-covid-combined", target_name="worldwide-aggregated", target_path="data/worldwide-aggregated.csv", ), join_with_self( resource_name="worldwide-aggregated", join_key=["Date"], fields=dict( Date={"name": "Date"}, Confirmed={ "name": "Confirmed", "aggregate": "sum"
def process_stack_demand(stack):
    """Populate a 'demand' stack with per-neighborhood demand score cards.

    Builds (and caches under 'stack:demand') one stack entry per demand
    category, each carrying cards of per-neighborhood scores derived from
    'demand.xlsx', then merges the matching entry's cards into *stack*.
    """

    def collect_cats():
        # Column holding the category/subcategory marker rows in the sheet.
        F = 'כלל המדגם'

        def f(rows):
            cat = None
            for row in rows:
                if F in row:
                    v = row[F]
                    # 'סך הכל <name>' rows open a new category...
                    if v.startswith('סך הכל '):
                        cat = v[7:]
                    # ...'--- <name>' rows are its subcategories (skip the
                    # 'no breakdown' catch-all rows).
                    elif v.startswith('--- '):
                        if not v.endswith('ללא פירוט'):
                            subcat = v[4:]
                            row['category'] = cat
                            row['subcategory'] = subcat
                            yield row
                else:
                    yield row
        return DF.Flow(
            DF.add_field('category', 'string', resources=-1),
            DF.add_field('subcategory', 'string', resources=-1),
            f,
            DF.delete_fields([F], resources=-1),
        )

    def fix_nones(row):
        # Missing values ('--') arrive as None; treat them as zero demand.
        row['demand_pct'] = row['demand_pct'] or 0

    key = 'stack:demand'
    try:
        demand_stacks = _cache.get(key)
    except KeyError:
        demand_stacks = DF.Flow(
            DF.load('demand.xlsx', infer_strategy=DF.load.INFER_STRINGS, headers=2),
            collect_cats(),
            DF.update_schema(-1, missingValues=['--']),
            # Wide -> long: each '<neighborhood> (X)' column becomes a row
            # with 'neighborhood' and its 'demand_pct' value.
            DF.unpivot(
                unpivot_fields=[dict(
                    name='(.+) \\([A-Z]\\)',
                    keys=dict(
                        neighborhood='\\1'
                    ),
                )],
                extra_keys=[dict(
                    name='neighborhood', type='string'
                )],
                extra_value=dict(
                    name='demand_pct', type='number'
                ),
                resources=-1
            ),
            DF.validate(),
            DF.duplicate('demand', 'demand_stacks'),
            # Per (category, subcategory): the maximum demand, used to
            # normalize each neighborhood's score to a 0..6 scale below.
            DF.join_with_self('demand', ['category', 'subcategory'], dict(
                category=None, subcategory=None,
                max_demand=dict(name='demand_pct', aggregate='max')
            )),
            DF.join(
                'demand', ['category', 'subcategory'],
                'demand_stacks', ['category', 'subcategory'],
                dict(
                    max_demand=None
                )
            ),
            fix_nones,
            DF.add_field('display', 'string',
                         lambda r: '{:.0f}%'.format(r['demand_pct'] * 100)),
            DF.add_field('value', 'number', lambda r: r['demand_pct']),
            # Score normalized to the subcategory maximum, scaled to 0..6.
            DF.add_field('score', 'number',
                         lambda r: r['demand_pct'] / r['max_demand'] * 6),
            DF.delete_fields(['demand_pct', 'max_demand']),
            DF.sort_rows('{score}', reverse=True),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_display=r['display'],
                score_value=float(r['value']),
                geometry_score=float(r['score']),
            )),
            # One card per (category, subcategory) with all neighborhood scores.
            DF.join_with_self('demand_stacks', ['category', 'subcategory'], dict(
                category=None, subcategory=None,
                scores=dict(aggregate='array'),
            )),
            DF.add_field('card', 'object', lambda r: dict(
                title='ביקוש ל{}'.format(r['subcategory']),
                content='',
                scores=r['scores'],
                test='demand__{category}__{subcategory}'.format(**r).replace(' ', '_')
            )),
            # One stack entry per category, holding its subcategory cards.
            DF.join_with_self('demand_stacks', ['category'], dict(
                category=None,
                cards=dict(name='card', aggregate='array'),
            )),
            DF.add_field('name', 'string',
                         lambda r: 'demand.{}'.format(r['category']).replace(' ', '_')),
        ).results()[0][0]
        _cache.set(key, demand_stacks)
    # Pick the stack entry matching this stack's name and merge its cards in.
    cards = [s for s in demand_stacks if s['name'] == stack['name']][0]['cards']
    stack.update(dict(
        layout='scores',
        currentField='neighborhood',
        map=True
    ))
    stack.setdefault('cards', []).extend(cards)
def process_institutions(stack):
    """Populate an 'institutions' stack with per-kind map cards.

    Builds (and caches under 'stack:institutions') one card per institution
    kind, each a GeoJSON FeatureCollection of institution points, then merges
    the cards into *stack* preserving any existing card content.
    """
    key = 'stack:institutions'
    try:
        institutions_cards = _cache.get(key)
    except KeyError:
        # Israeli TM grid definition; projector converts grid X/Y to lon/lat.
        CRS = '+ellps=GRS80 +k=1.00007 +lat_0=31.73439361111111 +lon_0=35.20451694444445 +no_defs +proj=tmerc +units=m +x_0=219529.584 +y_0=626907.39'
        projector = pyproj.Proj(CRS)

        def proj():
            # Row step: inverse-project grid coordinates into lon/lat fields.
            def func(row):
                row['lon'], row['lat'] = projector(row['X'], row['Y'], inverse=True)
            return DF.Flow(
                DF.add_field('lon', 'number'),
                DF.add_field('lat', 'number'),
                func,
                DF.delete_fields(['X', 'Y'])
            )

        def translate_kind():
            # Map raw institution kinds to their display (plural) names.
            translations = {
                'מרפאה': 'מרפאות',
                'איצטדיון': 'איצטדיון',
                'ספרייה': 'ספריות',
                'בית ספר': 'בתי ספר',
                'מועדון קהילתי כולל מרכז צעירים': 'מועדון קהילתי',
                'בית כנסת': 'בתי כנסת',
                'מועדון נוער': 'מועדון נוער',
                'אולם מופעים, היכל תרבות': 'מוסדות תרבות',
                'מועדון קשישים, מרכז לאזרחים ותיקים,מרכז יום לקשישים': 'מרכזי פעילות לקשישים',
            }

            def func(row):
                # NOTE(review): raises KeyError on an unmapped kind — looks
                # like a deliberate fail-fast on new source values; confirm.
                row['kind'] = translations[row['kind']]
            return func

        institutions_cards = DF.Flow(
            *[
                DF.load(f)
                for f in glob.glob('institutions/*xlsx')
            ],
            DF.concatenate(dict(
                kind=['סוג המוסד'],
                title=['שם המוסד'],
                address=['כתובת'],
                X=[],
                Y=[]
            )),
            translate_kind(),
            proj(),
            # One GeoJSON point feature per institution.
            DF.add_field('feature', 'object', lambda r: geojson.Feature(
                properties=dict(title=r['title'], address=r['address']),
                geometry=geojson.Point(coordinates=[float(r['lon']), float(r['lat'])])
            )),
            DF.delete_fields(['title', 'lon', 'lat', 'address']),
            # One row per kind, aggregating all of its features.
            DF.join_with_self('concat', ['kind'], dict(
                title=dict(name='kind'),
                features=dict(name='feature', aggregate='array')
            )),
            DF.sort_rows('{title}', reverse=True),
            DF.add_field('pointGeometry', 'object',
                         lambda r: geojson.FeatureCollection(features=r['features'])),
            DF.add_field('content', 'string', ' '),
            DF.delete_fields(['features']),
            # DF.printer(tablefmt='html')
        ).results()[0][0]
        _cache.set(key, institutions_cards)
    stack.update(dict(
        map=True,
    ))
    stack.setdefault('cards', [])
    # Index existing cards by title so fresh cards can keep edited content.
    current_cards = dict(
        (c['title'], c)
        for c in stack['cards']
    )
    for card in institutions_cards:
        current_card = current_cards.pop(card['title'], None)
        if current_card is not None:
            card['content'] = current_card['content']
        else:
            print('SPURIOUS CARD for INSTITUTIONS', card['title'])
    # Keep non-institution cards (titles not claimed above) + the fresh cards.
    stack['cards'] = [
        c for c in stack['cards'] if c['title'] in current_cards
    ] + institutions_cards
def process_demographics(stack):
    """Populate a 'demographics' stack with per-population-group score cards.

    Builds (and caches under 'stack:demographics') one card per demographic
    group (kids, teenagers, adults, elderly, immigrants), each with
    per-neighborhood scores aggregated from the statistical-area CSVs.
    """
    key = 'stack:demographics'
    try:
        demographics_cards = _cache.get(key)
    except KeyError:
        def add_source():
            # Tag each row with the name of the resource (file) it came from.
            def f(rows):
                for row in rows:
                    row['source'] = rows.res.name
                    yield row
            return DF.Flow(
                DF.add_field('source', 'string'),
                f
            )

        def map_to_cards():
            # (source report name, raw column kinds) -> canonical group key.
            MAP = {
                ("דו''ח אג''ס לפי עולים וותיקים",
                 ("סה''כ עולים",)
                 ): 'immigrants',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('0-5', '6-12')
                 ): 'kids',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('13-17',)
                 ): 'teenagers',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('60-64', '65-69', '70-74', '75-120')
                 ): 'elderly',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('18-21', '22-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59')
                 ): 'adults',
            }

            def f(rows):
                # Keep only rows matching a mapping entry; rewrite their kind.
                for row in rows:
                    for (source, kinds), kind in MAP.items():
                        if row['source'] == source and row['kind'] in kinds:
                            row['kind'] = kind
                            yield row
            return f

        # Statistical-area id -> neighborhood title.
        s2n = dict(
            (int(stat_area), f['properties']['title'])
            for f in get_neighborhood_features()
            for stat_area in f['properties']['stat_areas']
        )
        # Group key -> (display title, description, card order).
        MAP2 = dict(
            adults=('אוכלוסיה בוגרת', 'גברים ונשים בין גיל 18 ל-60', 0),
            kids=('ילדים', 'תינוקות וילדים עד גיל 12', 1),
            teenagers=('בני נוער', 'נערים ונערות עד גיל 18', 2),
            elderly=('הגיל השלישי', 'גברים ונשים מעל גיל 60', 3),
            immigrants=('עולים לישראל', 'תושבים שאינם ילידי ישראל', 4),
        )
        demographics_cards = DF.Flow(
            *[
                DF.load(f, headers=4)
                for f in glob.glob('demographics/*.csv')
            ],
            DF.add_field('stat_id', 'string', lambda r: r["אג''ס"]),
            DF.add_field('total', 'number', lambda r: r.get("סה''כ")),
            # NOTE(review): the second name carries a trailing space — it must
            # match the source column header exactly; confirm against the CSVs.
            DF.delete_fields(["אג''ס", "סה''כ "]),
            # Wide -> long: every remaining per-group column becomes a
            # (kind, value) row.
            DF.unpivot([dict(
                name="([-'א-ת0-9 ].+)",
                keys=dict(
                    kind=r'\1'
                )
            )], [dict(
                name='kind', type='string'
            )], dict(
                name='value', type='number'
            )),
            DF.validate(),
            add_source(),
            map_to_cards(),
            DF.concatenate(dict(
                total=[], value=[], kind=[], stat_id=[]
            )),
            DF.add_field('neighborhood', 'string',
                         lambda r: s2n.get(int(r['stat_id']))),
            # Drop statistical areas not mapped to any neighborhood.
            DF.filter_rows(lambda r: r['neighborhood']),
            DF.join_with_self('concat', ['neighborhood', 'kind'], dict(
                neighborhood=None, kind=None,
                total=dict(aggregate='sum'),
                value=dict(aggregate='sum'),
            )),
            DF.duplicate('concat', 'maxes'),
            DF.join_with_self('concat', ['neighborhood'], dict(neighborhood=None, total=None)),
            DF.join('concat', ['neighborhood'], 'maxes', ['neighborhood'], dict(
                total=None,
            )),
            DF.add_field('score_value', 'number', lambda r: r['value']),  # /r['total']
            DF.sort_rows('{score_value}', reverse=True),
            DF.duplicate('maxes', 'demographics'),
            # Per-kind maximum, to normalize geometry_score to a 0..6 scale.
            DF.join_with_self('maxes', ['kind'],
                              dict(kind=None, max_value=dict(name='score_value', aggregate='max'))),
            DF.join('maxes', ['kind'], 'demographics', ['kind'], dict(max_value=None)),
            DF.add_field('geometry_score', 'number',
                         lambda r: 6*r['score_value']/r['max_value']),
            DF.add_field('score_display', 'string',
                         lambda r: '{:,} ({:.0f}%)'.format(r['value'], 100*r['score_value']/r['total'])),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_value=float(r['score_value']),
                score_display=r['score_display'],
                geometry_score=float(r['geometry_score']),
            )),
            # One card per demographic group with all neighborhood scores.
            DF.join_with_self('demographics', ['kind'], dict(
                kind=None,
                scores=dict(aggregate='array'),
            )),
            DF.add_field('title', 'string', lambda r: MAP2[r['kind']][0]),
            DF.add_field('content', 'string', lambda r: MAP2[r['kind']][1]),
            DF.add_field('order', 'integer', lambda r: MAP2[r['kind']][2]),
            DF.sort_rows('{order}'),
            DF.delete_fields(['kind']),
        ).results()[0][0]
        _cache.set(key, demographics_cards)
    # features = [
    #     dict(type='Feature', geometry=r['geometry'], properties=dict(title=r['neighborhoods'][0]))
    #     for r in DF.Flow(
    #         DF.load('geo/stat-areas/stat-areas/datapackage.json'),
    #     ).results()[0][0]
    # ]
    # geometry=dict(type='FeatureCollection', features=features)
    stack.update(dict(
        map=True,
        scheme='green',
        currentField='neighborhood',
        layout='scores',
        # geometry=geometry
    ))
    stack.setdefault('cards', []).extend(demographics_cards)
def flow(*_):
    """Run the COVID19-ISRAEL preprocessing modules when new data is available.

    Skips the run when the last run was < 120 seconds ago, or when the GitHub
    repo SHA is unchanged and the last run was < 24 hours ago.  Records the
    run outcome in 'last_run' and appends it to 'runs_history'.
    Returns a Flow that prints the 10 most recent history rows.
    """
    run_row = None
    # Last recorded run (empty dict row when no history exists yet).
    last_run_row = Flow(
        load_if_exists('%s/last_run/datapackage.json' % OUTPUT_DIR, 'last_run', [{}])).results()[0][0][0]
    last_run_sha1 = last_run_row.get('COVID19-ISRAEL_github_sha1')
    last_run_time = last_run_row.get('start_time')
    # Throttle: never run twice within 120 seconds.
    if last_run_time and (datetime.datetime.now() - last_run_time).total_seconds() < 120:
        logging.info('last run was less then 120 seconds ago, not running')
    else:
        # Pull the repo and get its current commit sha1.
        new_sha1 = github_pull_covid19_israel.flow({
            'dump_to_path': '%s/last_github_pull' % OUTPUT_DIR
        }).results()[0][0][0]['sha1']
        # Skip when nothing changed upstream and the last run is < 24h old.
        if last_run_time and (
            datetime.datetime.now() - last_run_time
        ).total_seconds() < 60 * 60 * 24 and last_run_sha1 == new_sha1:
            logging.info(
                "No change detected in COVID19-ISRAEL GitHub, not running")
        else:
            run_row = {'start_time': datetime.datetime.now(),
                       'COVID19-ISRAEL_github_sha1': new_sha1}
            # Run every module; record per-module success without aborting
            # the whole batch on a single failure.
            for module in RUN_MODULES:
                try:
                    os.makedirs('data/preprocess_raw_data/log_files/%s' % module['id'],
                                exist_ok=True)
                    run_covid19_israel.flow({
                        'module': module['module'],
                        'resource_name': '%s_last_updated_files' % module['id'],
                        'dump_to_path': 'data/preprocess_raw_data/last_updated_files/%s' % module['id'],
                        'log_file': 'data/preprocess_raw_data/log_files/%s/%s.log' % (
                            module['id'],
                            datetime.datetime.now().strftime('%Y%m%dT%H%M%S'))
                    }).process()
                    run_row['%s_success' % module['id']] = 'yes'
                except Exception:
                    logging.exception('failed to run %s' % module['id'])
                    run_row['%s_success' % module['id']] = 'no'
    if run_row is not None:
        # Persist this run as the new 'last_run' snapshot.
        Flow(
            iter([run_row]),
            update_resource(-1, name='last_run', path='last_run.csv',
                            **{'dpp:streaming': True}),
            dump_to_path('%s/last_run' % OUTPUT_DIR)).process()

    def _get_runs_history():
        # Existing history rows (if any), followed by the current run.
        if os.path.exists('%s/runs_history/datapackage.json' % OUTPUT_DIR):
            for resource in Flow(
                load('%s/runs_history/datapackage.json' % OUTPUT_DIR),
            ).datastream().res_iter:
                yield from resource
        if run_row is not None:
            yield run_row

    Flow(
        _get_runs_history(),
        update_resource(-1, name='runs_history', path='runs_history',
                        **{'dpp:streaming': True}),
        dump_to_path('%s/runs_history' % OUTPUT_DIR)).process()
    return Flow(load('%s/runs_history/datapackage.json' % OUTPUT_DIR),
                sort_rows('{start_time}', reverse=True),
                printer(num_rows=10))
def keep_last_runs_history(output_dir, run_callback, *callback_args, **callback_kwargs):
    """Run *run_callback* and maintain its run history under *output_dir*.

    The callback receives (last_run_row, run_row, *args, **kwargs) and returns
    (run_row, raise_exception_msg).  The returned run row is persisted as the
    'last_run' snapshot and appended to 'runs_history'.

    Returns a Flow that logs the 10 most recent runs; if the callback reported
    an error message, that Flow is processed immediately and Exception raised.
    """
    run_row = {'start_time': datetime.datetime.now()}
    # Last recorded run (empty dict row when no history exists yet).
    last_run_row = Flow(
        load_if_exists('%s/last_run/datapackage.json' % output_dir, 'last_run', [{}])).results()[0][0][0]
    run_row, raise_exception_msg = run_callback(last_run_row, run_row,
                                                *callback_args, **callback_kwargs)
    if run_row:
        # Persist everything except start_time as the new 'last_run' snapshot.
        Flow(
            iter([{k: v for k, v in run_row.items() if k != 'start_time'}]),
            update_resource(-1, name='last_run', path='last_run.csv',
                            **{'dpp:streaming': True}),
            dump_to_path('%s/last_run' % output_dir)).process()
    # Union of all field names ever seen, so history rows stay rectangular.
    run_fields = set()
    history_descriptor_path = '%s/runs_history/datapackage.json' % output_dir
    if os.path.exists(history_descriptor_path):
        # Fix: the original reused the name 'f' for both the open file handle
        # and the schema-field loop variable, shadowing the handle inside the
        # 'with' block.
        with open(history_descriptor_path) as descriptor_file:
            datapackage = json.load(descriptor_file)
        for field in datapackage['resources'][0]['schema']['fields']:
            run_fields.add(field['name'])
    if run_row:
        run_row["end_time"] = datetime.datetime.now().strftime(
            '%Y-%m-%dT%H:%M:%S')
        run_fields.update(run_row.keys())

    def _get_runs_history():
        # Existing rows normalized to the full field set, then the current run.
        if os.path.exists('%s/runs_history/datapackage.json' % output_dir):
            for resource in Flow(
                load('%s/runs_history/datapackage.json' % output_dir),
            ).datastream().res_iter:
                for row in resource:
                    yield {k: row.get(k, '') for k in run_fields}
        if run_row:
            yield {k: run_row.get(k, '') for k in run_fields}

    Flow(
        _get_runs_history(),
        update_resource(-1, name='runs_history', path='runs_history',
                        **{'dpp:streaming': True}),
        dump_to_path('%s/runs_history' % output_dir)).process()

    def _printer(rows):
        # Pass-through step that logs the first 10 rows (newest runs, since
        # the stream is sorted by start_time descending).
        logging.info('--- last runs ---')
        for i, row in enumerate(rows):
            if i < 10:
                logging.info('%s:' % row['start_time'])
                for k in sorted(row.keys()):
                    if k == 'start_time':
                        continue
                    if row[k] is None or row[k] == '':
                        continue
                    logging.info(' %s: %s' % (k, row[k]))
            yield row

    flow = Flow(load('%s/runs_history/datapackage.json' % output_dir),
                sort_rows('{start_time}', reverse=True),
                _printer)
    if raise_exception_msg:
        flow.process()
        raise Exception(raise_exception_msg)
    else:
        return flow
def test_sort_rows_number():
    """Numeric keys sort by value — negatives, fractions and big ints alike."""
    from dataflows import sort_rows

    values = [0.1, -3, -4, 10, 8, 0, -1000000, 1000000, -0.1, -0.2,
              0.2, -1000001, 1000001, 6, -10, -0.001, 0.001, 1, -1]
    results, _, _ = Flow(
        [{'a': v} for v in values],
        sort_rows(key='{a}'),
    ).results()

    assert list(results[0]) == [{'a': v} for v in sorted(values)]
def flow(parameters):
    """Build a Flow that lazily loads JSON fields and sorts the selected resources.

    `parameters` supplies 'sort-by' (required key expression) and optional
    'resources' / 'reverse' settings passed through to sort_rows.
    """
    resources = parameters.get('resources')
    return Flow(
        load_lazy_json(resources),
        sort_rows(
            parameters['sort-by'],
            resources=resources,
            reverse=parameters.get('reverse'),
        ),
    )
override_fields=dict(area_id=dict(type='string')), cast_strategy=DF.load.CAST_WITH_SCHEMA), DF.filter_rows(lambda r: r['is_city']), DF.add_field('score_date', 'object', lambda r: dict( date=r['date'].isoformat(), sr=float(r['symptoms_ratio_weighted'] or 0), nr=int(r['num_reports_weighted'])) ), DF.concatenate(dict( id=[], city_name=[], score_date=[] ), target=dict(name='ranking')), DF.join_with_self('ranking', '{city_name}', dict( id=None, city_name=None, scores=dict(name='score_date', aggregate='array') )), sort_limit_scores(), DF.filter_rows(lambda r: r['scores'][-1]['nr'] >= 200), DF.add_field('sortkey', 'integer', lambda r: int(r['scores'][-1]['sr'] * 1000000) + r['scores'][-1]['nr']), DF.sort_rows('{sortkey}', reverse=True), DF.delete_fields(['sortkey']), DF.add_field('rank', 'integer', 0), DF.add_field('translations', 'object', lambda r: city_translations[r['city_name']]), DF.add_field('image', 'object', lambda r: upload_static_image(r['id'], width=280*2, height=160*2)), ranker(), ).results() rankings = r[0] r, _, _ = DF.Flow( DF.load(all_data(), name='cities', headers=1, override_fields=dict(area_id=dict(type='string')), cast_strategy=DF.load.CAST_WITH_SCHEMA), DF.filter_rows(lambda r: r['is_city']), DF.filter_rows(lambda r: r['num_reports_weighted'] >= 200), DF.add_field('ws', 'number', lambda r: r['symptoms_ratio_weighted'] * r['num_reports_weighted']),
flow = Flow( # Load inputs {% if input == 'file' %} load('{{input_url}}', format='{{format}}', {% if sheet %}sheet={{sheet}}{% endif %}), {% endif %} {% if input == 'remote' %} load('{{input_url}}', format='{{format}}', {% if sheet %}sheet={{sheet}}{% endif %}), {% endif %} {% if input == 'sql' %} load('{{input_url}}', table='{{input_db_table}}'), {% endif %} {% if input == 'other' %} {% endif %} # Process them (if necessary) {% if 'sort' in processing %} sort_rows('{field_name}'), # Key is a Python format string or a list of field names {% endif %} {% if 'filter' in processing %} filter_rows(), {% endif %} {% if 'find_replace' in processing %} find_replace([ dict(name='field_name', patterns=[ dict(find='re-pattern-to-find', replace='re-pattern-to-replace-with'), ]) ]), {% endif %} {% if 'delete_fields' in processing %} delete_fields(['field_name']), # Pass a list of field names to delete from the data {% endif %}