def test_duplicate_many_rows():
    from dataflows import Flow, duplicate
    f = Flow(
        ({'a': i, 'b': i} for i in range(1000)),
        duplicate(),
    )
    results, _, _ = f.results()
    assert len(results[0]) == 1000
    assert len(results[1]) == 1000

    # exercise duplicate() with batching disabled (batch_size=0)
    f = Flow(
        ({'a': i, 'b': i} for i in range(10000)),
        duplicate(batch_size=0),
    )
    results, _, _ = f.results()
    assert len(results[0]) == 10000
    assert len(results[1]) == 10000
def test_duplicate():
    from dataflows import Flow, duplicate
    a = [
        {'a': 1, 'b': 3},
        {'a': 2, 'b': 3},
        {'a': 3, 'b': 1},
        {'a': 4, 'b': 1},
    ]
    f = Flow(
        a,
        duplicate(),
    )
    results, _, _ = f.results()
    # both the original resource and its duplicate carry the same rows
    assert list(results[0]) == a
    assert list(results[1]) == a
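# A minimal sketch of duplicate()'s keyword interface, inferred from the
# snippets in this listing; the data, resource names, and file names below
# are invented for illustration, not taken from any snippet here.
from dataflows import Flow, duplicate, printer

Flow(
    [{'a': 1}, {'a': 2}],              # a list resource, named 'res_1' by default
    duplicate(
        source='res_1',                # resource to copy
        target_name='res_1_copy',      # name of the duplicated resource
        target_path='res_1_copy.csv',  # path recorded in the output datapackage
        batch_size=1000,               # default batch size, per the processor signatures above
        duplicate_to_end=False,        # True appears to push the copy to the end of the resource list
    ),
    printer(),
).process()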
def flow(self):
    taxonomy = self.context.taxonomy
    txn_config = taxonomy.config
    fmt_str = [taxonomy.title + ' עבור:']
    fields = txn_config['key-fields']
    for f in fields:
        for ct in taxonomy.column_types:
            if ct['name'] == f:
                fmt_str.append('%s: "{%s}",' % (ct['title'], f.replace(':', '-')))
                break
    fmt_str = ' '.join(fmt_str)
    fields = [ct.replace(':', '-') for ct in fields]
    all_fields = ['_source'] + fields
    TARGET = 'configurations'
    saved_config = self.config._unflatten()
    saved_config.setdefault('publish', {})['allowed'] = False
    return Flow(
        duplicate(RESOURCE_NAME, TARGET),
        join_with_self(
            TARGET, all_fields,
            dict((f, {}) for f in all_fields),
        ),
        add_computed_field([
            dict(operation='format', target='snippets', with_=fmt_str),
            dict(operation='constant', target='key_values', with_=None),
        ], resources=TARGET),
        add_field('config', 'object', saved_config, resources=TARGET),
        add_field('fields', type='object', default=self.collate_values(fields), resources=TARGET),
        join_with_self(
            TARGET, ['_source'],
            dict(
                source=dict(name='_source'),
                config={},
                key_values=dict(aggregate='array'),
                snippets=dict(aggregate='array'),
            )),
        set_type('source', type='string'),
        set_type('config', type='object'),
        set_type('key_values', type='array'),
        set_type('snippets', type='array'),
        set_primary_key(['source']),
        dump_to_sql(
            dict([(TARGET, {
                'resource-name': TARGET,
                'mode': 'update'
            })]),
            engine=self.lazy_engine(),
        ),
    )
def decp_processing():
    flow = Flow(
        # Load the CSV produced by the JSON conversion
        load("decp.csv"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),

        # Sort by rootId and seq to prepare for building donneesActuelles
        sort_rows('{rootId}:{seq}', resources=0, reverse=True),
        donnees_actuelles,

        # rootId and seq can now be deleted
        delete_fields(["rootId", "seq"], resources=0, regex=False),
        sort_rows('{datePublicationDonnees}', resources=0, reverse=True),

        # New table dedicated to the contracts, without contractor data
        print("Création de la table dédiée aux marchés..."),
        duplicate(source="decp",
                  target_name="decp-sans-titulaires",
                  target_path="decp-sans-titulaires.csv",
                  duplicate_to_end=True),
        delete_fields([
            "titulaire.id",
            "titulaire.denominationSociale",
            "titulaire.typeIdentifiant"
        ], resources="decp-sans-titulaires", regex=False),
        set_primary_key(["uid"], resources="decp-sans-titulaires"),
        deduplicate(),

        # Load the previous DECP in CSV format, to extract the new data
        # print("Téléchargement des données tabulaires précédentes..."),
        # load("https://decp.info/db/decp.csv?_size=max&_dl=1", name="previous-decp"),
        # set_type("acheteur.id", type="string"),
        # set_type("titulaire.id", type="string"),
        # set_type("codeCPV", type="string"),
        # set_type("lieuExecution.code", type="string"),
        # delete_fields(["rowid"], resources="previous-decp", regex=False),
        # #print("Fusion des données tabulaires précédentes et des données d'aujourd'hui..."),
        # concatenate({}, target={"name": "decp-titulaires", "path": "decp-titulaires.csv"}, resources=["decp", "previous-decp"]),

        # Load the previous contractor-specific data
        print("Chargement des données titulaires..."),
        load("decp-titulaires.csv", name="decp-titulaires"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),
        set_type("departement", type="string"),
        set_type("codeAPE", type="string"),
        print("Enregistrement des données sur le disque..."),
        dump_to_path("decp"))

    flow.process()
def flow(parameters):
    return Flow(
        load_lazy_json(parameters.get('source')),
        duplicate(
            parameters.get('source'),
            parameters.get('target-name'),
            parameters.get('target-path'),
            parameters.get('batch_size', 1000),
            parameters.get('duplicate_to_end', False)
        )
    )
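# A hedged usage sketch for the processor above: the parameter keys mirror
# duplicate()'s positional arguments. All values are invented for illustration,
# and load_lazy_json is assumed to be a project-local helper that loads the
# resource named by 'source'.
spec = {
    'source': 'my-data',                # hypothetical resource to duplicate
    'target-name': 'my-data-copy',      # name of the duplicated resource
    'target-path': 'my-data-copy.csv',  # path of the duplicated resource
    # 'batch_size' and 'duplicate_to_end' fall back to 1000 / False
}
flow(spec).process()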
"title": "Cumulative total recovered cases to date", "type": "integer" }, { "format": "default", "groupChar": "", "name": "Deaths", "title": "Cumulative total deaths to date", "type": "integer" }]), checkpoint('processed_data'), # Sort rows by date and country sort_rows('{Country/Region}{Province/State}{Date}', resources='time-series-19-covid-combined'), # Duplicate the stream to create aggregated data duplicate(source='time-series-19-covid-combined', target_name='worldwide-aggregated', target_path='data/worldwide-aggregated.csv'), join_with_self(resource_name='worldwide-aggregated', join_key=['Date'], fields=dict(Date={'name': 'Date'}, Confirmed={ 'name': 'Confirmed', 'aggregate': 'sum' }, Recovered={ 'name': 'Recovered', 'aggregate': 'sum' }, Deaths={ 'name': 'Deaths', 'aggregate': 'sum'
    ),
    delete_fields(
        ["Long_", "Country_Region", "Province_State"],
        resources=["us_confirmed", "us_deaths"],
    ),
    checkpoint("processed_data"),
    printer(),
    # Sort rows by date and country
    sort_rows(
        "{Country/Region}{Province/State}{Date}",
        resources="time-series-19-covid-combined",
    ),
    # Duplicate the stream to create aggregated data
    duplicate(
        source="time-series-19-covid-combined",
        target_name="worldwide-aggregated",
        target_path="data/worldwide-aggregated.csv",
    ),
    join_with_self(
        resource_name="worldwide-aggregated",
        join_key=["Date"],
        fields=dict(
            Date={"name": "Date"},
            Confirmed={
                "name": "Confirmed",
                "aggregate": "sum"
            },
            Recovered={
                "name": "Recovered",
                "aggregate": "sum"
            },
def process_stack_demand(stack):

    def collect_cats():
        F = 'כלל המדגם'

        def f(rows):
            cat = None
            for row in rows:
                if F in row:
                    v = row[F]
                    if v.startswith('סך הכל '):
                        cat = v[7:]
                    elif v.startswith('--- '):
                        if not v.endswith('ללא פירוט'):
                            subcat = v[4:]
                            row['category'] = cat
                            row['subcategory'] = subcat
                            yield row
                else:
                    yield row

        return DF.Flow(
            DF.add_field('category', 'string', resources=-1),
            DF.add_field('subcategory', 'string', resources=-1),
            f,
            DF.delete_fields([F], resources=-1),
        )

    def fix_nones(row):
        row['demand_pct'] = row['demand_pct'] or 0

    key = 'stack:demand'
    try:
        demand_stacks = _cache.get(key)
    except KeyError:
        demand_stacks = DF.Flow(
            DF.load('demand.xlsx', infer_strategy=DF.load.INFER_STRINGS, headers=2),
            collect_cats(),
            DF.update_schema(-1, missingValues=['--']),
            DF.unpivot(
                unpivot_fields=[dict(
                    name='(.+) \\([A-Z]\\)',
                    keys=dict(
                        neighborhood='\\1'
                    ),
                )],
                extra_keys=[dict(
                    name='neighborhood', type='string'
                )],
                extra_value=dict(
                    name='demand_pct', type='number'
                ),
                resources=-1
            ),
            DF.validate(),
            DF.duplicate('demand', 'demand_stacks'),
            DF.join_with_self('demand', ['category', 'subcategory'], dict(
                category=None, subcategory=None,
                max_demand=dict(name='demand_pct', aggregate='max')
            )),
            DF.join(
                'demand', ['category', 'subcategory'],
                'demand_stacks', ['category', 'subcategory'],
                dict(
                    max_demand=None
                )
            ),
            fix_nones,
            DF.add_field('display', 'string', lambda r: '{:.0f}%'.format(r['demand_pct'] * 100)),
            DF.add_field('value', 'number', lambda r: r['demand_pct']),
            DF.add_field('score', 'number', lambda r: r['demand_pct'] / r['max_demand'] * 6),
            DF.delete_fields(['demand_pct', 'max_demand']),
            DF.sort_rows('{score}', reverse=True),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_display=r['display'],
                score_value=float(r['value']),
                geometry_score=float(r['score']),
            )),
            DF.join_with_self('demand_stacks', ['category', 'subcategory'], dict(
                category=None, subcategory=None,
                scores=dict(aggregate='array'),
            )),
            DF.add_field('card', 'object', lambda r: dict(
                title='ביקוש ל{}'.format(r['subcategory']),
                content='',
                scores=r['scores'],
                test='demand__{category}__{subcategory}'.format(**r).replace(' ', '_')
            )),
            DF.join_with_self('demand_stacks', ['category'], dict(
                category=None,
                cards=dict(name='card', aggregate='array'),
            )),
            DF.add_field('name', 'string', lambda r: 'demand.{}'.format(r['category']).replace(' ', '_')),
        ).results()[0][0]
        _cache.set(key, demand_stacks)

    cards = [s for s in demand_stacks if s['name'] == stack['name']][0]['cards']
    stack.update(dict(
        layout='scores',
        currentField='neighborhood',
        map=True
    ))
    stack.setdefault('cards', []).extend(cards)
def process_demographics(stack):
    key = 'stack:demographics'
    try:
        demographics_cards = _cache.get(key)
    except KeyError:

        def add_source():
            def f(rows):
                for row in rows:
                    row['source'] = rows.res.name
                    yield row
            return DF.Flow(
                DF.add_field('source', 'string'),
                f
            )

        def map_to_cards():
            MAP = {
                ("דו''ח אג''ס לפי עולים וותיקים",
                 ("סה''כ עולים",)): 'immigrants',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('0-5', '6-12')): 'kids',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('13-17',)): 'teenagers',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('60-64', '65-69', '70-74', '75-120')): 'elderly',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('18-21', '22-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59')): 'adults',
            }

            def f(rows):
                for row in rows:
                    for (source, kinds), kind in MAP.items():
                        if row['source'] == source and row['kind'] in kinds:
                            row['kind'] = kind
                            yield row
            return f

        s2n = dict(
            (int(stat_area), f['properties']['title'])
            for f in get_neighborhood_features()
            for stat_area in f['properties']['stat_areas']
        )

        MAP2 = dict(
            adults=('אוכלוסיה בוגרת', 'גברים ונשים בין גיל 18 ל-60', 0),
            kids=('ילדים', 'תינוקות וילדים עד גיל 12', 1),
            teenagers=('בני נוער', 'נערים ונערות עד גיל 18', 2),
            elderly=('הגיל השלישי', 'גברים ונשים מעל גיל 60', 3),
            immigrants=('עולים לישראל', 'תושבים שאינם ילידי ישראל', 4),
        )

        demographics_cards = DF.Flow(
            *[
                DF.load(f, headers=4)
                for f in glob.glob('demographics/*.csv')
            ],
            DF.add_field('stat_id', 'string', lambda r: r["אג''ס"]),
            DF.add_field('total', 'number', lambda r: r.get("סה''כ")),
            DF.delete_fields(["אג''ס", "סה''כ "]),
            DF.unpivot([dict(
                name="([-'א-ת0-9 ].+)",
                keys=dict(
                    kind=r'\1'
                )
            )], [dict(
                name='kind', type='string'
            )], dict(
                name='value', type='number'
            )),
            DF.validate(),
            add_source(),
            map_to_cards(),
            DF.concatenate(dict(
                total=[], value=[], kind=[], stat_id=[]
            )),
            DF.add_field('neighborhood', 'string', lambda r: s2n.get(int(r['stat_id']))),
            DF.filter_rows(lambda r: r['neighborhood']),
            DF.join_with_self('concat', ['neighborhood', 'kind'], dict(
                neighborhood=None, kind=None,
                total=dict(aggregate='sum'),
                value=dict(aggregate='sum'),
            )),
            DF.duplicate('concat', 'maxes'),
            DF.join_with_self('concat', ['neighborhood'], dict(neighborhood=None, total=None)),
            DF.join('concat', ['neighborhood'], 'maxes', ['neighborhood'], dict(
                total=None,
            )),
            DF.add_field('score_value', 'number', lambda r: r['value']),  # /r['total']
            DF.sort_rows('{score_value}', reverse=True),
            DF.duplicate('maxes', 'demographics'),
            DF.join_with_self('maxes', ['kind'], dict(kind=None, max_value=dict(name='score_value', aggregate='max'))),
            DF.join('maxes', ['kind'], 'demographics', ['kind'], dict(max_value=None)),
            DF.add_field('geometry_score', 'number', lambda r: 6 * r['score_value'] / r['max_value']),
            DF.add_field('score_display', 'string',
                         lambda r: '{:,} ({:.0f}%)'.format(r['value'], 100 * r['score_value'] / r['total'])),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_value=float(r['score_value']),
                score_display=r['score_display'],
                geometry_score=float(r['geometry_score']),
            )),
            DF.join_with_self('demographics', ['kind'], dict(
                kind=None,
                scores=dict(aggregate='array'),
            )),
            DF.add_field('title', 'string', lambda r: MAP2[r['kind']][0]),
            DF.add_field('content', 'string', lambda r: MAP2[r['kind']][1]),
            DF.add_field('order', 'integer', lambda r: MAP2[r['kind']][2]),
            DF.sort_rows('{order}'),
            DF.delete_fields(['kind']),
        ).results()[0][0]
        _cache.set(key, demographics_cards)

    # features = [
    #     dict(type='Feature', geometry=r['geometry'], properties=dict(title=r['neighborhoods'][0]))
    #     for r in DF.Flow(
    #         DF.load('geo/stat-areas/stat-areas/datapackage.json'),
    #     ).results()[0][0]
    # ]
    # geometry = dict(type='FeatureCollection', features=features)

    stack.update(dict(
        map=True,
        scheme='green',
        currentField='neighborhood',
        layout='scores',
        # geometry=geometry
    ))
    stack.setdefault('cards', []).extend(demographics_cards)
"format": "default", "groupChar": "", "name": "Recovered", "title": "Cumulative total recovered cases to date", "type": "integer" }, { "format": "default", "groupChar": "", "name": "Deaths", "title": "Cumulative total deaths to date", "type": "integer" }]), checkpoint('processed_data'), # Duplicate the stream to create aggregated data duplicate(source='time-series-19-covid-combined', target_name='worldwide-aggregated', target_path='worldwide-aggregated.csv'), join_with_self(resource_name='worldwide-aggregated', join_key=['Date'], fields=dict(Date={'name': 'Date'}, Confirmed={ 'name': 'Confirmed', 'aggregate': 'sum' }, Recovered={ 'name': 'Recovered', 'aggregate': 'sum' }, Deaths={ 'name': 'Deaths', 'aggregate': 'sum'
def Olap_Datapackage():
    flow = Flow(
        # Load datapackages:
        load('elspot_prices_data/datapackage.json'),
        load('afrr_data/datapackage.json'),
        load('fcr_dk1_data/datapackage.json'),
        concatenate(fields={
            'Timestamp': ['HourUTC'],
            'Area': ['PriceArea'],
            'Product': ['product'],
            'Amount': ['amount'],
            'Price_DKK': ['PriceDKK'],
            'Price_EUR': ['PriceEUR']
        }, target={
            'name': 'fact',
            'path': 'data/fact.csv'
        }),
        add_computed_field(
            [dict(target='id', operation='constant', with_='dummy')]),
        add_id,
        set_type('id', type='integer'),
        set_primary_key(primary_key=['id']),
        # Reorder so that the 'id' column comes first:
        select_fields([
            'id', 'Timestamp', 'Area', 'Product', 'Amount', 'Price_DKK', 'Price_EUR'
        ], resources='fact'),
        # Add foreign keys:
        add_foreign_keys,
        # The fact table is ready. Now duplicate the resource to generate the dim tables.
        # First, the 'time' table:
        duplicate(source='fact', target_name='time', target_path='time.csv'),
        select_fields(['Timestamp'], resources=['time']),
        join_self(source_name='time', source_key=['Timestamp'],
                  target_name='time', fields={'Timestamp': {}}),
        # Parse the datetime field and add separate fields for year, month and day:
        add_computed_field([
            dict(target=dict(name='day', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%d')),
            dict(target=dict(name='month', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%m')),
            dict(target=dict(name='month_name', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%B')),
            dict(target=dict(name='year', type='year'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%Y')),
        ], resources=['time']),
        set_primary_key(primary_key=['Timestamp'], resources=['time']),
        # Now the 'area' table:
        duplicate(source='fact', target_name='area', target_path='area.csv'),
        select_fields(['Area'], resources=['area']),
        join_self(source_name='area', source_key=['Area'],
                  target_name='area', fields={'Area': {}}),
        set_primary_key(primary_key=['Area'], resources=['area']),
        # Now the 'product' table:
        duplicate(source='fact', target_name='product', target_path='product.csv'),
        select_fields(['Product'], resources=['product']),
        join_self(source_name='product', source_key=['Product'],
                  target_name='product', fields={'Product': {}}),
        set_primary_key(primary_key=['Product'], resources=['product']),
        dump_to_path('olap_datapackage'))
    flow.process()
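# The snippet above relies on a join_self(...) step that may be a project-local
# helper. A minimal sketch of the same "duplicate, project, uniquify" dim-table
# pattern using dataflows' own join_with_self; the inline rows and resource
# names are invented for illustration:
from dataflows import Flow, duplicate, select_fields, join_with_self, set_primary_key

results, _, _ = Flow(
    [{'Area': 'DK1', 'Price_DKK': 100},
     {'Area': 'DK1', 'Price_DKK': 90},
     {'Area': 'DK2', 'Price_DKK': 80}],            # stand-in for the 'fact' resource ('res_1')
    duplicate(source='res_1', target_name='area', target_path='area.csv'),
    select_fields(['Area'], resources=['area']),
    # Self-join on the key column: one row per distinct Area remains
    join_with_self('area', ['Area'], {'Area': {}}),
    set_primary_key(primary_key=['Area'], resources=['area']),
).results()
# results[1] should now hold [{'Area': 'DK1'}, {'Area': 'DK2'}]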