Example #1
def test_duplicate_many_rows():
    from dataflows import Flow, duplicate

    f = Flow(
        ({
            'a': i,
            'b': i
        } for i in range(1000)),
        duplicate(),
    )

    results, _, _ = f.results()
    assert len(results[0]) == 1000
    assert len(results[1]) == 1000

    f = Flow(
        ({
            'a': i,
            'b': i
        } for i in range(10000)),
        duplicate(batch_size=0),
    )

    results, _, _ = f.results()
    assert len(results[0]) == 10000
    assert len(results[1]) == 10000
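
A minimal sketch of what this test exercises, assuming only that the dataflows package is installed: duplicate() with no arguments copies a resource, so the flow ends with two resources carrying identical rows (the copy's default name shown below is an assumption):

from dataflows import Flow, duplicate

# After duplicate(), the datapackage holds the original resource and its copy.
results, datapackage, _ = Flow(
    ({'a': i} for i in range(5)),
    duplicate(),
).results()

print([res.name for res in datapackage.resources])  # e.g. ['res_1', 'res_1_copy']
assert results[0] == results[1]
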
Example #2
def test_duplicate():
    from dataflows import Flow, duplicate

    a = [
        {
            'a': 1,
            'b': 3
        },
        {
            'a': 2,
            'b': 3
        },
        {
            'a': 3,
            'b': 1
        },
        {
            'a': 4,
            'b': 1
        },
    ]

    f = Flow(
        a,
        duplicate(),
    )
    results, _, _ = f.results()
    assert list(results[0]) == a
    assert list(results[1]) == a
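
The same flow with an explicit source and target, as a sketch: 'res_1' is the name dataflows gives the first anonymous resource, while 'copy' and 'copy.csv' are made-up names.

from dataflows import Flow, duplicate

results, _, _ = Flow(
    [{'a': 1}, {'a': 2}],
    duplicate(source='res_1', target_name='copy', target_path='copy.csv'),
).results()
assert results[1] == [{'a': 1}, {'a': 2}]  # the copy follows the original
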
Example #3
    def flow(self):
        taxonomy = self.context.taxonomy
        txn_config = taxonomy.config
        fmt_str = [taxonomy.title + ' עבור:']  # Hebrew: 'for:'
        fields = txn_config['key-fields']
        for f in fields:
            for ct in taxonomy.column_types:
                if ct['name'] == f:
                    fmt_str.append('%s: "{%s}",' %
                                   (ct['title'], f.replace(':', '-')))
                    break
        fmt_str = ' '.join(fmt_str)
        fields = [f.replace(':', '-') for f in fields]  # normalized field names
        all_fields = ['_source'] + fields

        TARGET = 'configurations'
        saved_config = self.config._unflatten()
        saved_config.setdefault('publish', {})['allowed'] = False

        return Flow(
            duplicate(RESOURCE_NAME, TARGET),
            join_with_self(
                TARGET,
                all_fields,
                dict((f, {}) for f in all_fields),
            ),
            add_computed_field([
                dict(operation='format', target='snippets', with_=fmt_str),
                dict(operation='constant', target='key_values', with_=None),
            ],
                               resources=TARGET),
            add_field('config', 'object', saved_config, resources=TARGET),
            add_field('fields',
                      type='object',
                      default=self.collate_values(fields),
                      resources=TARGET),
            join_with_self(
                TARGET, ['_source'],
                dict(
                    source=dict(name='_source'),
                    config={},
                    key_values=dict(aggregate='array'),
                    snippets=dict(aggregate='array'),
                )),
            set_type('source', type='string'),
            set_type('config', type='object'),
            set_type('key_values', type='array'),
            set_type('snippets', type='array'),
            set_primary_key(['source']),
            dump_to_sql(
                dict([(TARGET, {
                    'resource-name': TARGET,
                    'mode': 'update'
                })]),
                engine=self.lazy_engine(),
            ),
        )
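
The duplicate-then-join_with_self pattern above, isolated with toy data: copy a resource, then collapse the copy to one row per key while aggregating the other fields into arrays (a sketch; 'res_1' is dataflows' default name for an anonymous resource):

from dataflows import Flow, duplicate, join_with_self

results, _, _ = Flow(
    [{'k': 'a', 'v': 1}, {'k': 'a', 'v': 2}, {'k': 'b', 'v': 3}],
    duplicate('res_1', 'summary'),
    join_with_self('summary', ['k'], dict(
        k=None,                                    # keep the key as-is
        values=dict(name='v', aggregate='array'),  # gather v into an array
    )),
).results()
# results[0] is untouched; results[1] == [{'k': 'a', 'values': [1, 2]},
#                                         {'k': 'b', 'values': [3]}]
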
Example #4
def decp_processing():
    flow = Flow(

        # Load the CSV produced by the conversion from JSON
        load("decp.csv"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),

        # Sort by rootId and seq to prepare for building donneesActuelles
        sort_rows('{rootId}:{seq}', resources=0, reverse=True),
        donnees_actuelles,

        # rootId and seq can now be removed
        delete_fields(["rootId", "seq"], resources=0, regex=False),
        sort_rows('{datePublicationDonnees}', resources=0, reverse=True),

        # New table dedicated to the contracts (marchés), without titulaire data
        print("Creating the table dedicated to contracts..."),
        duplicate(source="decp",
                  target_name="decp-sans-titulaires",
                  target_path="decp-sans-titulaires.csv",
                  duplicate_to_end=True),
        delete_fields([
            "titulaire.id", "titulaire.denominationSociale",
            "titulaire.typeIdentifiant"
        ],
                      resources="decp-sans-titulaires",
                      regex=False),
        set_primary_key(["uid"], resources="decp-sans-titulaires"),
        deduplicate(),

        # Load the previous DECP in CSV format, to extract the new data
        # print("Downloading the previous tabular data..."),
        # load("https://decp.info/db/decp.csv?_size=max&_dl=1", name="previous-decp"),
        # set_type("acheteur.id", type="string"),
        # set_type("titulaire.id", type="string"),
        # set_type("codeCPV", type="string"),
        # set_type("lieuExecution.code", type="string"),
        # delete_fields(["rowid"], resources="previous-decp", regex=False),
        # #print("Merging the previous tabular data with today's data..."),
        # concatenate({},target={"name": "decp-titulaires","path": "decp-titulaires.csv"},resources=["decp","previous-decp"]),

        # Load the previous data dedicated to titulaires
        print("Loading titulaires data..."),
        load("decp-titulaires.csv", name="decp-titulaires"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),
        set_type("departement", type="string"),
        set_type("codeAPE", type="string"),
        print("Enregistrement des données sur le disque..."),
        dump_to_path("decp"))
    flow.process()
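
Stripped to its core, the decp/decp-sans-titulaires split above follows this pattern (file and field names here are hypothetical): duplicate a resource, trim fields off the copy, then deduplicate the copy on a primary key.

from dataflows import (Flow, load, duplicate, delete_fields,
                       set_primary_key, deduplicate, dump_to_path)

Flow(
    load('data.csv'),  # hypothetical input; the resource is named 'data'
    duplicate(source='data', target_name='data-trimmed',
              target_path='data-trimmed.csv'),
    delete_fields(['detail'], resources='data-trimmed', regex=False),
    set_primary_key(['uid'], resources='data-trimmed'),
    deduplicate(),  # keeps the first row seen for each primary-key value
    dump_to_path('out'),
).process()
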
Example #5
def flow(parameters):
    return Flow(
        load_lazy_json(parameters.get('source')),
        duplicate(
            parameters.get('source'),
            parameters.get('target-name'),
            parameters.get('target-path'),
            parameters.get('batch_size', 1000),
            parameters.get('duplicate_to_end', False)
        )
    )
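
This wrapper passes duplicate's parameters positionally; matched against the keyword calls elsewhere on this page, the equivalent keyword form would be (sketch):

duplicate(
    source=parameters.get('source'),
    target_name=parameters.get('target-name'),
    target_path=parameters.get('target-path'),
    batch_size=parameters.get('batch_size', 1000),
    duplicate_to_end=parameters.get('duplicate_to_end', False),
)
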
Example #6
                   "title": "Cumulative total recovered cases to date",
                   "type": "integer"
               }, {
                   "format": "default",
                   "groupChar": "",
                   "name": "Deaths",
                   "title": "Cumulative total deaths to date",
                   "type": "integer"
               }]),
 checkpoint('processed_data'),
 # Sort rows by date and country
 sort_rows('{Country/Region}{Province/State}{Date}',
           resources='time-series-19-covid-combined'),
 # Duplicate the stream to create aggregated data
 duplicate(source='time-series-19-covid-combined',
           target_name='worldwide-aggregated',
           target_path='data/worldwide-aggregated.csv'),
 join_with_self(resource_name='worldwide-aggregated',
                join_key=['Date'],
                fields=dict(Date={'name': 'Date'},
                            Confirmed={
                                'name': 'Confirmed',
                                'aggregate': 'sum'
                            },
                            Recovered={
                                'name': 'Recovered',
                                'aggregate': 'sum'
                            },
                            Deaths={
                                'name': 'Deaths',
                                'aggregate': 'sum'
Example #7
 ),
 delete_fields(
     ["Long_", "Country_Region", "Province_State"],
     resources=["us_confirmed", "us_deaths"],
 ),
 checkpoint("processed_data"),
 printer(),
 # Sort rows by date and country
 sort_rows(
     "{Country/Region}{Province/State}{Date}",
     resources="time-series-19-covid-combined",
 ),
 # Duplicate the stream to create aggregated data
 duplicate(
     source="time-series-19-covid-combined",
     target_name="worldwide-aggregated",
     target_path="data/worldwide-aggregated.csv",
 ),
 join_with_self(
     resource_name="worldwide-aggregated",
     join_key=["Date"],
     fields=dict(
         Date={"name": "Date"},
         Confirmed={
             "name": "Confirmed",
             "aggregate": "sum"
         },
         Recovered={
             "name": "Recovered",
             "aggregate": "sum"
         },
Example #8
def process_stack_demand(stack):

    def collect_cats():
        F = 'כלל המדגם'  # Hebrew: 'entire sample' (the column header)
        
        def f(rows):
            cat = None
            for row in rows:
                if F in row:
                    v = row[F]
                    if v.startswith('סך הכל '):  # Hebrew: 'total '
                        cat = v[7:]
                    elif v.startswith('--- '):
                        if not v.endswith('ללא פירוט'):  # Hebrew: 'no breakdown'
                            subcat = v[4:]
                            row['category'] = cat
                            row['subcategory'] = subcat
                            yield row
                else:
                    yield row
        return DF.Flow(
            DF.add_field('category', 'string', resources=-1),
            DF.add_field('subcategory', 'string', resources=-1),
            f,
            DF.delete_fields([F], resources=-1),
        )

    def fix_nones(row):
        row['demand_pct'] = row['demand_pct'] or 0

    key = 'stack:demand'
    try:
        demand_stacks = _cache.get(key)
    except KeyError:        
        demand_stacks = DF.Flow(
            DF.load('demand.xlsx', infer_strategy=DF.load.INFER_STRINGS, headers=2),
            collect_cats(),
            DF.update_schema(-1, missingValues=['--']),
            DF.unpivot(
                unpivot_fields=[dict(
                    name='(.+) \\([A-Z]\\)',
                    keys=dict(
                        neighborhood='\\1'
                    ),
                )],
                extra_keys=[dict(
                    name='neighborhood', type='string'
                )],
                extra_value=dict(
                    name='demand_pct', type='number'
                ),
                resources=-1
            ),
            DF.validate(),
            DF.duplicate('demand', 'demand_stacks'),
            DF.join_with_self('demand', ['category', 'subcategory'], dict(
                category=None, subcategory=None, max_demand=dict(name='demand_pct', aggregate='max')
            )),
            DF.join(
                'demand', ['category', 'subcategory'],
                'demand_stacks', ['category', 'subcategory'],
                dict(
                    max_demand=None
                )
            ),
            fix_nones,
            DF.add_field('display', 'string', lambda r: '{:.0f}%'.format(r['demand_pct'] * 100)),
            DF.add_field('value', 'number', lambda r: r['demand_pct']),
            DF.add_field('score', 'number', lambda r: r['demand_pct'] / r['max_demand'] * 6),
            DF.delete_fields(['demand_pct', 'max_demand']),
            DF.sort_rows('{score}', reverse=True),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_display=r['display'],
                score_value=float(r['value']),
                geometry_score=float(r['score']),
            )),
            DF.join_with_self('demand_stacks', ['category', 'subcategory'], dict(
                category=None, subcategory=None,
                scores=dict(aggregate='array'),
            )),
            DF.add_field('card', 'object', lambda r: dict(
                title='ביקוש ל{}'.format(r['subcategory']),  # Hebrew: 'demand for {}'
                content='',
                scores=r['scores'],
                test='demand__{category}__{subcategory}'.format(**r).replace(' ', '_')
            )),
            DF.join_with_self('demand_stacks', ['category'], dict(
                category=None,
                cards=dict(name='card', aggregate='array'),
            )),
            DF.add_field('name', 'string', lambda r: 'demand.{}'.format(r['category']).replace(' ', '_')),
        ).results()[0][0]
        _cache.set(key, demand_stacks)
                    
    cards = [s for s in demand_stacks if s['name'] == stack['name']][0]['cards']
    stack.update(dict(
        layout='scores',
        currentField='neighborhood',
        map=True
    ))
    stack.setdefault('cards', []).extend(cards)
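
The middle of this function uses a compact trick: duplicate the resource, collapse one copy to a per-group maximum with join_with_self, then join that maximum back onto every row of the other copy. A toy sketch (assuming, as the code above does, that join drops the aggregated source resource by default):

import dataflows as DF

results, _, _ = DF.Flow(
    [{'g': 'a', 'v': 1}, {'g': 'a', 'v': 4}, {'g': 'b', 'v': 2}],
    DF.duplicate('res_1', 'rows'),
    # Collapse the original to one row per group, keeping the max of v.
    DF.join_with_self('res_1', ['g'], dict(
        g=None,
        max_v=dict(name='v', aggregate='max'),
    )),
    # Attach max_v to every row of the copy.
    DF.join('res_1', ['g'], 'rows', ['g'], dict(max_v=None)),
).results()
# results[0] == [{'g': 'a', 'v': 1, 'max_v': 4},
#                {'g': 'a', 'v': 4, 'max_v': 4},
#                {'g': 'b', 'v': 2, 'max_v': 2}]
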
Example #9
def process_demographics(stack):
    key = 'stack:demographics'
    try:
        demographics_cards = _cache.get(key)
    except KeyError:        
        def add_source():
            def f(rows):
                for row in rows:
                    row['source'] = rows.res.name
                    yield row
            return DF.Flow(
                DF.add_field('source', 'string'),
                f
            )

        def map_to_cards():
            MAP = {  # (report name in Hebrew, value columns) -> audience kind
                ("דו''ח אג''ס לפי עולים וותיקים",
                        ("סה''כ עולים",)
                ): 'immigrants',
                ("דו''ח אג''ס לפי קבוצות גיל",
                        ('0-5', '6-12')
                ): 'kids',
                ("דו''ח אג''ס לפי קבוצות גיל",
                        ('13-17',)
                ): 'teenagers',
                ("דו''ח אג''ס לפי קבוצות גיל",
                        ('60-64', '65-69', '70-74', '75-120')
                ): 'elderly',
                ("דו''ח אג''ס לפי קבוצות גיל",
                        ('18-21','22-24','25-29','30-34','35-39','40-44','45-49','50-54','55-59')
                ): 'adults',
            }
            
            def f(rows):
                for row in rows:
                    for (source, kinds), kind in MAP.items():
                        if row['source'] == source and row['kind'] in kinds:
                            row['kind'] = kind
                            yield row
            return f

        s2n = dict(
            (int(stat_area), f['properties']['title'])
            for f in get_neighborhood_features()
            for stat_area in f['properties']['stat_areas']
        )

        MAP2 = dict(  # kind -> (Hebrew title, Hebrew description, sort order)
            adults=('אוכלוסיה בוגרת', 'גברים ונשים בין גיל 18 ל-60', 0),
            kids=('ילדים', 'תינוקות וילדים עד גיל 12', 1),
            teenagers=('בני נוער', 'נערים ונערות עד גיל 18', 2),
            elderly=('הגיל השלישי', 'גברים ונשים מעל גיל 60', 3),
            immigrants=('עולים לישראל', 'תושבים שאינם ילידי ישראל', 4),
        )

        demographics_cards = DF.Flow(
            *[
                DF.load(f, headers=4)
                for f in glob.glob('demographics/*.csv')
            ],
            DF.add_field('stat_id', 'string', lambda r: r["אג''ס"]),    # Hebrew: 'stat. area'
            DF.add_field('total', 'number', lambda r: r.get("סה''כ")),  # Hebrew: 'total'
            DF.delete_fields(["אג''ס", "סה''כ "]),  # note the trailing space in the source column name
            DF.unpivot([dict(
                name="([-'א-ת0-9 ].+)",
                keys=dict(
                    kind=r'\1'
                )
            )], [dict(
                name='kind', type='string'
            )], dict(
                name='value', type='number'
            )),
            DF.validate(),
            add_source(),
            map_to_cards(),
            DF.concatenate(dict(
                total=[], value=[], kind=[], stat_id=[]
            )),
            DF.add_field('neighborhood', 'string', lambda r: s2n.get(int(r['stat_id']))),
            DF.filter_rows(lambda r: r['neighborhood']),
            DF.join_with_self('concat', ['neighborhood', 'kind'], dict(
                neighborhood=None,
                kind=None,
                total=dict(aggregate='sum'),
                value=dict(aggregate='sum'),
            )),
            DF.duplicate('concat', 'maxes'),
            DF.join_with_self('concat', ['neighborhood'], dict(neighborhood=None, total=None)),
            DF.join('concat', ['neighborhood'], 'maxes', ['neighborhood'], dict(
                total=None,
            )),
            DF.add_field('score_value', 'number', lambda r: r['value']), # /r['total']  
            DF.sort_rows('{score_value}', reverse=True),
            DF.duplicate('maxes', 'demographics'),
            DF.join_with_self('maxes', ['kind'], dict(kind=None, max_value=dict(name='score_value', aggregate='max'))),
            DF.join('maxes', ['kind'], 'demographics', ['kind'], dict(max_value=None)),
            DF.add_field('geometry_score', 'number', lambda r: 6*r['score_value']/r['max_value']),
            DF.add_field('score_display', 'string', lambda r: '{:,} ({:.0f}%)'.format(r['value'], 100*r['score_value']/r['total'])),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_value=float(r['score_value']),
                score_display=r['score_display'],
                geometry_score=float(r['geometry_score']),
            )),
            DF.join_with_self('demographics', ['kind'], dict(
                kind=None, scores=dict(aggregate='array'),
            )),
            DF.add_field('title', 'string', lambda r: MAP2[r['kind']][0]),
            DF.add_field('content', 'string', lambda r: MAP2[r['kind']][1]),
            DF.add_field('order', 'integer', lambda r: MAP2[r['kind']][2]),
            DF.sort_rows('{order}'),
            DF.delete_fields(['kind']),
        ).results()[0][0]
        _cache.set(key, demographics_cards)

    # features = [
    #     dict(type='Feature', geometry=r['geometry'], properties=dict(title=r['neighborhoods'][0]))
    #     for r in DF.Flow(
    #         DF.load('geo/stat-areas/stat-areas/datapackage.json'),
    #     ).results()[0][0]
    # ]
    # geometry=dict(type='FeatureCollection', features=features)

    stack.update(dict(
        map=True,
        scheme='green',
        currentField='neighborhood',
        layout='scores',
        # geometry=geometry
    ))
    stack.setdefault('cards', []).extend(demographics_cards)
Example #10
                   "format": "default",
                   "groupChar": "",
                   "name": "Recovered",
                   "title": "Cumulative total recovered cases to date",
                   "type": "integer"
               }, {
                   "format": "default",
                   "groupChar": "",
                   "name": "Deaths",
                   "title": "Cumulative total deaths to date",
                   "type": "integer"
               }]),
 checkpoint('processed_data'),
 # Duplicate the stream to create aggregated data
 duplicate(source='time-series-19-covid-combined',
           target_name='worldwide-aggregated',
           target_path='worldwide-aggregated.csv'),
 join_with_self(resource_name='worldwide-aggregated',
                join_key=['Date'],
                fields=dict(Date={'name': 'Date'},
                            Confirmed={
                                'name': 'Confirmed',
                                'aggregate': 'sum'
                            },
                            Recovered={
                                'name': 'Recovered',
                                'aggregate': 'sum'
                            },
                            Deaths={
                                'name': 'Deaths',
                                'aggregate': 'sum'
Example #11
def Olap_Datapackage():
    flow = Flow(
        # Load datapackages:
        load('elspot_prices_data/datapackage.json'),
        load('afrr_data/datapackage.json'),
        load('fcr_dk1_data/datapackage.json'),
        concatenate(fields={
            'Timestamp': ['HourUTC'],
            'Area': ['PriceArea'],
            'Product': ['product'],
            'Amount': ['amount'],
            'Price_DKK': ['PriceDKK'],
            'Price_EUR': ['PriceEUR']
        },
                    target={
                        'name': 'fact',
                        'path': 'data/fact.csv'
                    }),
        add_computed_field(
            [dict(target='id', operation='constant', with_='dummy')]),
        add_id,
        set_type('id', type='integer'),
        set_primary_key(primary_key=['id']),
        # Reorder so that 'id' column is the first:
        select_fields([
            'id', 'Timestamp', 'Area', 'Product', 'Amount', 'Price_DKK',
            'Price_EUR'
        ],
                      resources='fact'),
        # Add foreign keys:
        add_foreign_keys,
        # Fact table is ready. Now duplicate the resource to generate dim tables:
        # First is 'time' table:
        duplicate(source='fact', target_name='time', target_path='time.csv'),
        select_fields(['Timestamp'], resources=['time']),
        join_self(source_name='time',
                  source_key=['Timestamp'],
                  target_name='time',
                  fields={'Timestamp': {}}),
        # Parse datetime fields and add a separate field for year, month and day:
        add_computed_field([
            dict(target=dict(name='day', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%d'
                                                                           )),
            dict(target=dict(name='month', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%m'
                                                                           )),
            dict(target=dict(name='month_name', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%B'
                                                                           )),
            dict(target=dict(name='year', type='year'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%Y'
                                                                           )),
        ],
                           resources=['time']),
        set_primary_key(primary_key=['Timestamp'], resources=['time']),
        # Now 'area' table:
        duplicate(source='fact', target_name='area', target_path='area.csv'),
        select_fields(['Area'], resources=['area']),
        join_self(source_name='area',
                  source_key=['Area'],
                  target_name='area',
                  fields={'Area': {}}),
        set_primary_key(primary_key=['Area'], resources=['area']),
        # Now 'product' table:
        duplicate(source='fact',
                  target_name='product',
                  target_path='product.csv'),
        select_fields(['Product'], resources=['product']),
        join_self(source_name='product',
                  source_key=['Product'],
                  target_name='product',
                  fields={'Product': {}}),
        set_primary_key(primary_key=['Product'], resources=['product']),
        dump_to_path('olap_datapackage'))
    flow.process()
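
The dimension-table pattern repeated above (duplicate the fact resource, keep one column, self-join down to distinct values, set a primary key) reduces to the sketch below; the data and names are made up, and join_with_self stands in for the join_self spelling used in the example.

from dataflows import (Flow, duplicate, select_fields,
                       join_with_self, set_primary_key)

results, _, _ = Flow(
    [{'Area': 'DK1', 'Price': 10},
     {'Area': 'DK1', 'Price': 12},
     {'Area': 'DK2', 'Price': 11}],                  # stand-in 'fact' rows
    duplicate(source='res_1', target_name='area', target_path='area.csv'),
    select_fields(['Area'], resources=['area']),
    join_with_self('area', ['Area'], {'Area': {}}),  # collapse to distinct Areas
    set_primary_key(primary_key=['Area'], resources=['area']),
).results()
# results[1] == [{'Area': 'DK1'}, {'Area': 'DK2'}]
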