def flow(*_):
    gcd = google_chrome_driver()
    download = gcd.download(
        'https://data.gov.il/dataset/246d949c-a253-4811-8a11-41a137d3d613/resource/f004176c-b85f-4542-8901-7b3176f9a054/download/f004176c-b85f-4542-8901-7b3176f9a054.csv'
    )
    return Flow(
        load(download, cast_strategy=load.CAST_TO_STRINGS),
        concatenate(_get_columns_mapping_dict(), target=dict(name='company-details')),
        set_type('id', type='string'),
        set_type('company_registration_date', type='date', format='%d/%m/%Y'),
        set_type('company_is_government', type='boolean', falseValues=['לא'], trueValues=['כן']),
        set_type('company_is_mafera', type='boolean', falseValues=['לא'], trueValues=['מפרה', 'התראה']),
        set_type('company_last_report_year', type='integer'),
        clear_bool_values,
        update_resource(**{'dpp:streaming': True}, resources='company-details'),
        set_primary_key(['id'], resources='company-details'),
        printer(),
    )
def test_concatenate():
    from dataflows import concatenate
    f = Flow(
        [
            {'a': 1, 'b': 2},
            {'a': 2, 'b': 3},
            {'a': 3, 'b': 4},
        ],
        [
            {'c': 4, 'd': 5},
            {'c': 5, 'd': 6},
            {'c': 6, 'd': 7},
        ],
        concatenate({
            'f1': ['a'],
            'f2': ['b', 'c'],
            'f3': ['d']
        })
    )
    results, _, _ = f.results()
    assert results[0] == [
        {'f1': 1, 'f2': 2, 'f3': None},
        {'f1': 2, 'f2': 3, 'f3': None},
        {'f1': 3, 'f2': 4, 'f3': None},
        {'f1': None, 'f2': 4, 'f3': 5},
        {'f1': None, 'f2': 5, 'f3': 6},
        {'f1': None, 'f2': 6, 'f3': 7},
    ]
def flow(parameters):
    return Flow(
        concatenate(
            parameters.get('fields', {}),
            parameters.get('target', {}),
            parameters.get('sources')),
        update_resource(
            parameters.get('target', {}).get('name', 'concat'),
            **{PROP_STREAMING: True}))
def get_neighborhood_features():
    return DF.Flow(
        DF.load('neighborhoods.xlsx', name='stat-areas', deduplicate_headers=True),
        DF.add_field(
            'neighborhoods', 'array',
            lambda r: [v for k, v in r.items() if v and k.startswith('neighborhood')]),
        DF.add_field('geometry', 'object', lambda r: geometries[r['stat-area']]),
        DF.concatenate(
            dict(stat_area=['stat-area'], neighborhoods=[], geometry=[])),
        DF.update_resource(-1, name='stat-areas'),
        unwind_neighborhoods(),
        DF.join_with_self(
            'stat-areas', ['neighborhood'],
            dict(
                neighborhood=None,
                stat_areas=dict(name='stat_area', aggregate='array'),
                geometries=dict(name='geometry', aggregate='array'),
            )),
        DF.add_field('geometry', 'object', lambda r: unite_geometries(r['geometries'])),
        DF.delete_fields(['geometries']),
        DF.update_resource(-1, name='neighborhoods'),
        DF.add_field(
            'properties', 'object',
            lambda r: dict(
                x=3, title=r['neighborhood'], stat_areas=r['stat_areas'])),
        DF.delete_fields(['neighborhood', 'stat_areas']),
        DF.checkpoint('_cache_neighborhoods')).results()[0][0]
def flow(*_):
    print('reading companies...')
    return Flow(
        data_gov_il_resource.flow(companies),
        fix_values(),
        concatenate(_get_columns_mapping_dict(), target=dict(name='company-details')),
        set_type('id', type='string'),
        set_type('company_street_number', type='string'),
        set_type('company_registration_date', type='date', format='%d/%m/%Y'),
        set_type('company_is_government', type='boolean', falseValues=['לא'], trueValues=['כן']),
        set_type('company_is_mafera', type='boolean', falseValues=['לא'], trueValues=['מפרה', 'התראה']),
        set_type('company_last_report_year', type='integer'),
        set_type('company_postal_code', type='string'),
        clear_bool_values,
        update_resource(**{'dpp:streaming': True}, resources='company-details'),
        set_primary_key(['id'], resources='company-details'),
        printer(),
    )
def broken_links_flow():
    return DF.Flow(
        *[
            DF.Flow(
                DF.load(URL_TEMPLATE.format(**c), name=c['name']),
                DF.add_field('__name', 'string', c['name'], resources=c['name']),
                DF.add_field('__title', 'string', get_title(c['title']), resources=c['name']),
            )
            for c in configuration
        ],
        DF.add_field('urls', 'array', lambda r: RE.findall(str(r))),
        DF.add_field('link', 'string', lambda r: 'https://yodaat.org/item/{doc_id}'.format(**r)),
        DF.concatenate(
            dict(
                name=['__name'],
                title=['__title'],
                link=[],
                urls=[],
            )),
        DF.add_field('url', 'string'),
        DF.add_field('error', 'string'),
        unwind(),
        DF.delete_fields(['urls']),
        DF.parallelize(check_broken(), 4),
        DF.filter_rows(lambda r: r['error'] is not None),
    )
def prepare():
    for resource_name, load in loads:
        DF.Flow(
            load,
            # DF.printer(tablefmt='html'),
            DF.concatenate(
                FIELD_MAPPING,
                dict(name=resource_name, path=resource_name + '.csv')),
            DF.set_type('activity_name', type='string',
                        constraints=dict(required=True),
                        on_error=DF.schema_validator.drop),
            DF.set_type('allocated_budget', type='number',
                        groupChar=',', bareNumber=False),
            DF.set_type('num_beneficiaries', type='number',
                        groupChar=',', bareNumber=False,
                        on_error=DF.schema_validator.ignore),
            fix_beneficiaries,
            DF.set_type('num_beneficiaries', type='string'),
            multiply_budget,
            fill_org_hierarchy,
            # DF.printer(tablefmt='html'),
            DF.dump_to_path('tmp/' + resource_name),
        ).process()
def preflow(self):
    f = Flow(
        load('.cache/{}/datapackage.json'.format(self.ref_hash)),
        concatenate(
            fields=dict(
                (f, []) for f in self.REF_KEY_FIELDS + self.REF_FETCH_FIELDS),
            target=dict(name=self.key, path=self.key + '.csv'),
            resources=self.ref_hash),
    )
    return f
def prepare_addresses():
    with tempfile.NamedTemporaryFile(suffix='.csv', mode='wb') as source:
        shutil.copyfileobj(fetch_ckan('addresses', 'CSV'), source)
        source.flush()
        DF.Flow(
            DF.load(source.name),
            DF.concatenate(
                dict(street_name=['streetName'],
                     house_number=['HouseNuber'],
                     letter=[],
                     lat=[],
                     lon=[])),
            match_arnona(),
            DF.dump_to_path('_cache_addresses'),
            DF.checkpoint('_cache_addresses')).process()
def flow(*_):
    return DF.Flow(
        DF.load(
            '/var/datapackages/activities/social_services/social_services/datapackage.json'
        ),
        DF.concatenate(
            dict(kind=[],
                 kind_he=[],
                 activity_name=[],
                 activity_description=[],
                 publisher_name=[],
                 history=[],
                 max_year=[],
                 min_year=[]),
            dict(name='activities', path='activities.csv')),
        DF.set_primary_key(['kind', 'publisher_name', 'activity_name']),
        DF.set_type('activity_name', **{'es:title': True}),
        DF.set_type('activity_description', **{
            'es:itemType': 'string',
            'es:boost': True
        }),
        DF.set_type('kind', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.set_type('kind_he', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.set_type('publisher_name', **{'es:keyword': True}),
        DF.set_type(
            'history', **{
                'es:itemType': 'object',
                'es:schema': dict(fields=[
                    dict(name='year', type='integer'),
                    dict(name='unit', type='string'),
                    dict(name='subunit', type='string'),
                    dict(name='subsubunit', type='string'),
                    dict(name='allocated_budget', type='integer'),
                    dict(name='num_beneficiaries', type='string', **{'es:index': False}),
                ])
            }),
        DF.add_field(
            'score', 'number',
            lambda x: (x['history'][0].get('allocated_budget') or 1000) / 1000,
            **{'es:score-column': True}),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        DF.dump_to_path('/var/datapackages/activities/all'),
        DF.dump_to_sql(dict(activities={'resource-name': 'activities'})))
def flow(*_):
    return DF.Flow(
        get_updated_sources(),
        DF.concatenate(fields=TENDER_MAPPING, target=dict(name='tenders')),
        DF.validate(),
        DF.filter_rows(lambda r: r['publication_id']),
        DF.add_field('tender_type', 'string',
                     lambda r: TENDER_KINDS[r['tender_type_he']],
                     **{'es:keyword': True}),
        DF.join_with_self(
            'tenders', KEY,
            dict((k, dict(aggregate='last'))
                 for k in list(TENDER_MAPPING.keys()) + ['tender_type'])),
        DF.set_type('publication_id', type='string', transform=str),
        DF.set_type('supplier_id', type='string', transform=str),
        DF.set_type('tender_id', type='string', transform=lambda v: v or 'none'),
        DF.set_type('.+_date', type='date', format='%d.%m.%Y',
                    on_error=DF.schema_validator.clear),
        DF.set_type('subjects', type='string',
                    transform=lambda v: ';'.join(x.strip() for x in v.split(','))
                    if v else ''),
        DF.set_type('claim_date', type='datetime',
                    transform=lambda v, field_name, row: datetime.datetime.combine(
                        v, row['claim_time'] or datetime.time(0)) if v else None),
        DF.set_type('tender_type_he', **{'es:keyword': True}),
        DF.delete_fields(['claim_time']),
        DF.add_field(
            'page_url', 'string',
            lambda r: f'https://mr.gov.il/ilgstorefront/he/p/{r["publication_id"]}'),
        DF.add_field('page_title', 'string', lambda r: r['description']),
        DF.add_field('reason', 'string', lambda r: r['regulation']),
        DF.add_field('documents', 'array', []),
        DF.add_field('contact', 'string'),
        DF.add_field('contact_email', 'string'),
        DF.validate(),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        DF.printer(),
    )
def test_concatenate_multifield():
    from dataflows import concatenate
    f = Flow(
        [
            {'a': 1, 'b': 2, 'c': None},
            {'a': 2, 'b': None, 'c': 3},
            {'a': 3, 'c': 4},
            {'a': 3, 'b': 6, 'c': 4},
        ],
        concatenate({
            'f1': ['a'],
            'f2': ['b', 'c'],
        })
    )
    results, _, _ = f.results()
    assert results[0] == [
        {'f1': 1, 'f2': 2},
        {'f1': 2, 'f2': 3},
        {'f1': 3, 'f2': 4},
        {'f1': 3, 'f2': 4},
    ]
def flow(*_):
    return DF.Flow(
        DF.load(URL, format='json', property='jData', name='education'),
        # DF.checkpoint('education'),
        DF.concatenate(dict(
            page_title=['Title'],
            start_date=['PobKKPublishingDate'],
            claim_date=['PobLastDate'],
            target_audience_x=['PobBudgetEntitties'],
            description=['PobTaktzir'],
            email=['PobPedagogyContactHtml'],
            publishing_unit_x=['PobYechida'],
            budget_code_x=['PobTakanaTaktzivitString'],
            att_title=['PobCreteriaLink_description'],
            att_url=['PobCreteriaLink_url'],
        ), resources=-1, target=dict(name='education')),
        enumerate_titles,
        DF.add_field('page_url', 'string', PAGE_URL, resources=-1),
        DF.add_field('publisher', 'string', 'משרד החינוך', resources=-1),
        DF.add_field('tender_type', 'string', 'call_for_bids', resources=-1),
        DF.add_field('tender_type_he', 'string', 'קול קורא', resources=-1),
        DF.add_field('publication_id', 'integer', 0, resources=-1),
        DF.add_field('tender_id', 'string', '0', resources=-1),
        DF.add_field('tender_type_he', 'string', 'קול קורא', resources=-1),
        DF.add_field('contact', 'string',
                     lambda row: extract_hebrew(row, 'email'), resources=-1),
        DF.add_field('target_audience', 'string',
                     lambda row: extract_hebrew(row, 'target_audience_x'), resources=-1),
        DF.add_field('contact_email', 'string',
                     lambda row: extract_email(row, 'email'), resources=-1),
        DF.add_field('publishing_unit', 'string',
                     lambda row: row['publishing_unit_x'][0]['PobYechida'], resources=-1),
        DF.add_field('budget_code', 'string',
                     lambda row: extract_budget_code(row, 'budget_code_x'), resources=-1),
        DF.set_type('start_date', type='date', format='%d/%m/%Y %H:%M:%S'),
        DF.set_type('claim_date', type='datetime', format='%d/%m/%Y %H:%M:%S'),
        DF.add_field('documents', 'array', lambda row: [dict(
            description=row['att_title'],
            link=row['att_url'],
            update_time=str(row['start_date'])
        )], resources=-1),
        DF.delete_fields(['email', 'publishing_unit_x', 'budget_code_x',
                          'att_title', 'att_url', 'target_audience_x'], resources=-1),
        calculate_publication_id(6),
        DF.update_resource(-1, **{'dpp:streaming': True})
    )
def test_example_9():
    from dataflows import Flow, load, dump_to_path, join, concatenate, filter_rows
    f = Flow(
        # Emmy award nominees and winners
        load('data/emmy.csv', name='emmies'),
        filter_rows(equals=[dict(winner=1)]),
        concatenate(dict(emmy_nominee=['nominee'], ),
                    dict(name='emmies_filtered'),
                    resources='emmies'),
        # Academy award nominees and winners
        load('data/academy.csv', encoding='utf8', name='oscars'),
        join('emmies_filtered', ['emmy_nominee'],  # Source resource
             'oscars', ['Name'],                   # Target resource
             full=False  # Don't add new fields, remove unmatched rows
             ),
        filter_rows(equals=[dict(Winner='1')]),
        dump_to_path('out/double_winners'))
    _ = f.process()
def flow(*args):
    return DF.Flow(
        get(),
        DF.filter_rows(
            lambda row: bool(row['key']) and bool(row.get('title'))),
        simplify_tags,
        extract_tags('life_areas', ['Domain']),
        extract_tags('source_kind', ['Source', 'Resource', 'Resouce']),
        DF.add_field(
            'authors', 'string',
            lambda r: None if not r.get('creators') else ', '.join(
                ('{name}'.format(**c) if 'name' in c
                 else '{firstName} {lastName}'.format(**c))
                for c in r.get('creators', [])
                if c.get('creatorType') == 'author')),
        DF.add_field('item_kind', 'string',
                     lambda r: r.get('reportType') or r.get('itemKind')),
        DF.concatenate(MAPPING, target={
            'name': 'zotero',
            'path': 'zotero.csv'
        }),
        DF.dump_to_path('data/zotero'),
        DF.update_resource(None, **{'dpp:streaming': True}),
        DF.printer())
def flow(*_):
    prepare()
    yearly_fields = [
        'year', 'unit', 'subunit', 'subsubunit', 'allocated_budget',
        'num_beneficiaries'
    ]
    return DF.Flow(
        *[
            DF.load('tmp/' + resource_name + '/datapackage.json')
            for resource_name, _ in loads
        ],
        DF.concatenate(
            FIELD_MAPPING,
            dict(name='social_services', path='social_services.csv')),
        DF.sort_rows('{year}', reverse=True),
        DF.add_field(
            'history', 'object',
            lambda r: dict(
                (k, r[k] if not isinstance(r[k], decimal.Decimal) else int(r[k]))
                for k in yearly_fields)),
        DF.printer(),
        DF.join_with_self(
            'social_services', ['publisher_name', 'activity_name'],
            dict(
                publisher_name=None,
                activity_name=None,
                activity_description=dict(aggregate='set'),
                min_year=dict(name='year', aggregate='min'),
                max_year=dict(name='year', aggregate='max'),
                history=dict(aggregate='array'),
            )),
        DF.add_field('kind', 'string', 'gov_social_service'),
        DF.add_field('kind_he', 'string', 'שירות חברתי'),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        DF.printer(),
    )
def loader(name, cat):
    return DF.Flow(
        DF.load('mosadot.xlsx'),
        DF.concatenate(
            dict(
                municipality=['מועצה אזורית'],
                town=['שם יישוב'],
                name=['שם המוסד'],
                kind=['סוג המוסד'],
                address=['כתובת'],
                status=['סטטוס'],
                target_audience=['קהל יעד'],
                area=['שטח'],
                lat=['Y'],
                lon=['X'],
            )),
        fixer,
        category(),
        DF.filter_rows(lambda r: r['category'] == cat),
        geo(),
        # DF.join_with_self('concat', ['kind'], dict(kind=None)),
        DF.update_resource(-1, name=name, path=name + '.csv'),
        DF.dump_to_path(name),
    ).results()[0][0]
            '.+)',
            'keys': {
                'year': '\\1'
            }
        }], [{
            'name': 'year',
            'type': 'string'
        }], {
            'name': 'value',
            'type': 'number'
        },
    ),
    verify_unused_fields(),
    DF.concatenate(FIELD_MAPPING, target=dict(name='out')),
    fix_urls(['source_url']),
    ensure_chart_title(),
    fix_languages(),
    DF.add_field('order_index', 'integer'),
    lambda rows: ({**row, **{'order_index': i}} for i, row in enumerate(rows)),
    set_defaults,
    extrapulate_years,
    fix_values,
    DF.set_type('value', groupChar=',', bareNumber=True),
    fix_units,
def main_flow(prefix=''):
    source_url = '{}data/publications_for_es/datapackage.json'.format(prefix)
    package = Package(source_url)
    all_fields = set(field.name
                     for resource in package.resources
                     for field in resource.schema.fields)
    all_fields = dict((field_name, []) for field_name in all_fields)
    return Flow(
        load(source_url),
        lambda row: dict(row, json='{}'),
        concatenate(all_fields, target=dict(name='publications', path='publications.csv')),
        delete_fields(['json']),
        prefer_gd('title'),
        prefer_gd('notes'),
        prefer_gd('publisher'),
        prefer_gd('tags'),
        prefer_gd('language_code'),
        prefer_gd('pubyear'),
        split_keyword_list('item_kind', 'gd_Item Type'),
        split_keyword_list('life_areas', 'gd_Life Domains'),
        split_keyword_list('source_kind', 'gd_Resource Type'),
        split_keyword_list('languages', 'language_code', ' '),
        split_keyword_list('tags', 'tags'),
        load('data/zotero/datapackage.json'),
        concatenate(dict(
            title=[],
            pubyear=[],
            publisher=[],
            authors=[],
            life_areas=[],
            notes=[],
            languages=[],
            tags=[],
            url=[],
            migdar_id=[],
            item_kind=[],
            source_kind=[],
            isbn=[],
            physical_description=[],
            publication_distribution_details=[],
            doc_id=[],
        ), target=dict(name='publications', path='publications.csv')),
        set_type('title', **{'es:title': True}),
        set_type('notes', **{'es:hebrew': True}),
        set_type('publisher', **{'es:keyword': True}),
        add_field('year', 'integer', default=extract_year),
        split_and_translate('tags', 'tags', keyword=True),
        split_and_translate('life_areas', 'life_areas', keyword=True),
        split_and_translate('languages', 'languages', keyword=True),
        split_and_translate('source_kind', 'source_kind', keyword=True),
        split_and_translate('item_kind', 'item_kind', keyword=True),
        printer(),
        add_computed_field([
            {'operation': 'format', 'target': 'doc_id', 'with': KEY_PATTERN},
            {'operation': 'format', 'target': 'page_title', 'with': PAGE_TITLE_PATTERN},
        ]),
        add_computed_field([]),
    )
def Olap_Datapackage():
    flow = Flow(
        # Load datapackages:
        load('elspot_prices_data/datapackage.json'),
        load('afrr_data/datapackage.json'),
        load('fcr_dk1_data/datapackage.json'),
        concatenate(fields={
            'Timestamp': ['HourUTC'],
            'Area': ['PriceArea'],
            'Product': ['product'],
            'Amount': ['amount'],
            'Price_DKK': ['PriceDKK'],
            'Price_EUR': ['PriceEUR']
        }, target={
            'name': 'fact',
            'path': 'data/fact.csv'
        }),
        add_computed_field(
            [dict(target='id', operation='constant', with_='dummy')]),
        add_id,
        set_type('id', type='integer'),
        set_primary_key(primary_key=['id']),
        # Reorder so that 'id' column is the first:
        select_fields([
            'id', 'Timestamp', 'Area', 'Product', 'Amount', 'Price_DKK',
            'Price_EUR'
        ], resources='fact'),
        # Add foreign keys:
        add_foreign_keys,
        # Fact table is ready. Now duplicate the resource to generate dim tables:
        # First is 'time' table:
        duplicate(source='fact', target_name='time', target_path='time.csv'),
        select_fields(['Timestamp'], resources=['time']),
        join_self(source_name='time',
                  source_key=['Timestamp'],
                  target_name='time',
                  fields={'Timestamp': {}}),
        # Parse datetime fields and add a separate field for year, month and day:
        add_computed_field([
            dict(target=dict(name='day', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%d')),
            dict(target=dict(name='month', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%m')),
            dict(target=dict(name='month_name', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%B')),
            dict(target=dict(name='year', type='year'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%Y')),
        ], resources=['time']),
        set_primary_key(primary_key=['Timestamp'], resources=['time']),
        # Now 'area' table:
        duplicate(source='fact', target_name='area', target_path='area.csv'),
        select_fields(['Area'], resources=['area']),
        join_self(source_name='area',
                  source_key=['Area'],
                  target_name='area',
                  fields={'Area': {}}),
        set_primary_key(primary_key=['Area'], resources=['area']),
        # Now 'product' table:
        duplicate(source='fact', target_name='product', target_path='product.csv'),
        select_fields(['Product'], resources=['product']),
        join_self(source_name='product',
                  source_key=['Product'],
                  target_name='product',
                  fields={'Product': {}}),
        set_primary_key(primary_key=['Product'], resources=['product']),
        dump_to_path('olap_datapackage'))
    flow.process()
    def func(row):
        row['scores'] = sorted(row.get('scores', []), key=lambda r: r['date'])[-30:]
    return func


if __name__ == '__main__':
    r, _, _ = DF.Flow(
        DF.load(all_data(), name='cities', headers=1,
                override_fields=dict(area_id=dict(type='string')),
                cast_strategy=DF.load.CAST_WITH_SCHEMA),
        DF.filter_rows(lambda r: r['is_city']),
        DF.add_field('score_date', 'object', lambda r: dict(
            date=r['date'].isoformat(),
            sr=float(r['symptoms_ratio_weighted'] or 0),
            nr=int(r['num_reports_weighted']))
        ),
        DF.concatenate(dict(
            id=[],
            city_name=[],
            score_date=[]
        ), target=dict(name='ranking')),
        DF.join_with_self('ranking', '{city_name}', dict(
            id=None,
            city_name=None,
            scores=dict(name='score_date', aggregate='array')
        )),
        sort_limit_scores(),
        DF.filter_rows(lambda r: r['scores'][-1]['nr'] >= 200),
        DF.add_field('sortkey', 'integer',
                     lambda r: int(r['scores'][-1]['sr'] * 1000000) + r['scores'][-1]['nr']),
        DF.sort_rows('{sortkey}', reverse=True),
        DF.delete_fields(['sortkey']),
        DF.add_field('rank', 'integer', 0),
        DF.add_field('translations', 'object',
                     lambda r: city_translations[r['city_name']]),
        DF.add_field('image', 'object',
                     lambda r: upload_static_image(r['id'], width=280*2, height=160*2)),
        ranker(),
    ).results()
    rankings = r[0]
    gj = json.load(open(filename))
    default = dict(latest_confidence=0, latest_ratio=0, latest_reports=0, population=0)
    for feature in gj['features']:
        properties = feature['properties']
        properties.update(latest.get((properties['id'], is_city), default))
    upload = json.dumps(gj, cls=json_encoder).encode('utf8')
    path = 'data/tilesets/static-images-{}.geojson'.format(key)
    logging.info('UPLOADING %d bytes to %s', len(upload), path)
    upload_file(upload, path)
    upload_tileset(upload, 'static-images-' + key)


if __name__ == '__main__':
    data, _, _ = DF.Flow(
        DF.load(latest_file(), name='out',
                override_fields=dict(area_id=dict(type='string')),
                cast_strategy=DF.load.CAST_WITH_SCHEMA),
        DF.concatenate(
            dict(id=[],
                 is_city=[],
                 latest_confidence=['symptoms_ratio_confidence_weighted'],
                 latest_ratio=['symptoms_ratio_weighted'],
                 latest_reports=['num_reports_weighted'],
                 population=[]))).results()
    data = dict(((r.pop('id'), r.get('is_city')), r) for r in data[0])
    for key, is_city in [('cities', 1), ('neighborhoods', 0)]:
        process_file(key, is_city, geo_file(key), data)
if __name__ == '__main__':
    r, _, _ = DF.Flow(
        DF.load(all_data(), name='cities', headers=1,
                override_fields=dict(area_id=dict(type='string')),
                cast_strategy=DF.load.CAST_WITH_SCHEMA),
        DF.filter_rows(lambda r: r['is_city']),
        DF.add_field(
            'score_date', 'object',
            lambda r: dict(weekday=r['date'].isoweekday() % 7,
                           date=r['date'].toordinal(),
                           sr=float(r['symptoms_ratio_weighted'] or 0),
                           nr=int(r['num_reports_weighted']))),
        DF.concatenate(dict(id=[], city_name=[], score_date=[]),
                       target=dict(name='popup_data')),
        DF.join_with_self(
            'popup_data', '{city_name}',
            dict(id=None,
                 city_name=None,
                 scores=dict(name='score_date', aggregate='array'))),
        sort_limit_scores(),
        DF.filter_rows(lambda r: r['scores'] is not None),
        DF.add_field('nr', 'integer', lambda r: r['scores'][-1]['nr']),
        DF.add_field('sr', 'number', lambda r: r['scores'][-1]['sr']),
        split_to_weeks(),
        DF.add_field('translations', 'object',
                     lambda r: city_translations[r['city_name']]),
    ).results()
    popup_data = r[0]
    popup_data = dict((x.pop('id'), x) for x in popup_data)
def base_flow():
    sources, *_ = Flow(
        list_gdrive(),
        filter_rows(lambda row: (
            row['kind'] == 'drive#file' and
            row['mimeType'] == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        )),
        add_field('filename', 'string',
                  default=lambda row: 'pubfiles/{modifiedTime}-{id}.xlsx'.format(**row)),
        download_files(),
        add_field('sheet', 'string'),
        add_field('headers', 'integer', 1),
        get_sheets(),
    ).results()
    return Flow(
        *[
            load(source['filename'],
                 sheet=source['sheet'],
                 headers=source['headers'],
                 infer_strategy=load.INFER_STRINGS,
                 cast_strategy=load.CAST_TO_STRINGS,
                 name=source['filename'])
            for source in sources[0]
        ],
        filter_rows(lambda row: row.get('migdar_id') not in ('', 'None', None)),
        load('data/zotero/zotero.csv'),
        concatenate(
            fields={
                'migdar_id': [],
                'title': ['Title', ],
                'bib_title': [],
                'bib_related_parts': [],
                'notes': [],
                'tags': ['Tags'],
                'publisher': [],
                'languages': ['language_code'],
                'item_kind': ['Item Type', 'Item type', 'item_type'],
                'pubyear': ['pubyear/pubdate'],
                'life_areas': ['Life Domains', 'Domain'],
                'source_kind': ['Resource Type', 'Resource type'],
                'authors': ['author'],
                'url': ['URL'],
            },
            target=dict(
                name='publications',
                path='data/publications.csv'
            )
        ),
        fix_nones(),
        fix_urls(['url']),
        set_type('title', **{'es:title': True}),
        set_type('authors', **{'es:boost': True}),
        set_type('notes', **{'es:hebrew': True}),
        set_type('publisher', **{'es:boost': True}),
        add_field('year', 'integer', default=extract_year),
        split_and_translate('tags', 'tags', keyword=True, delimiter=','),
        split_and_translate('life_areas', 'life_areas', keyword=True, delimiter=','),
        split_and_translate('languages', 'languages', keyword=True, delimiter=' '),
        split_and_translate('source_kind', 'source_kind', keyword=True),
        split_and_translate('item_kind', 'item_kind', keyword=True),
        fix_links('notes'),
        verify_migdar_id(),
        add_computed_field([
            {'operation': 'format', 'target': 'doc_id', 'with': KEY_PATTERN},
            {'operation': 'format', 'target': 'page_title', 'with': PAGE_TITLE_PATTERN},
        ]),
        add_field('title_kw', 'string',
                  default=lambda row: row.get('title'),
                  **{'es:keyword': True}),
    )
def aggregate_test_results(servers, total_duration_seconds, datacenter, only_test_method, load_generator): overview_report = {} load_steps = [] for num, _ in servers.items(): log_txt_filename = os.path.join(DISTRIBUTED_LOAD_TESTS_OUTPUT_DIR, str(num), '.output', 'log.txt') http_csv_zst_filename = os.path.join(DISTRIBUTED_LOAD_TESTS_OUTPUT_DIR, str(num), '.output', 'warp-bench-data.http.csv.zst') http_csv_gz_filename = os.path.join(DISTRIBUTED_LOAD_TESTS_OUTPUT_DIR, str(num), '.output', 'warp-bench-data.http.csv.gz') http_csv_filename = os.path.join( DISTRIBUTED_LOAD_TESTS_OUTPUT_DIR, str(num), '.output', 'warp-bench-data.http.{}'.format('tsv' if load_generator == 'warp' else 'csv')) https_csv_zst_filename = os.path.join( DISTRIBUTED_LOAD_TESTS_OUTPUT_DIR, str(num), '.output', 'warp-bench-data.https.csv.zst') https_csv_gz_filename = os.path.join(DISTRIBUTED_LOAD_TESTS_OUTPUT_DIR, str(num), '.output', 'warp-bench-data.https.csv.gz') https_csv_filename = os.path.join( DISTRIBUTED_LOAD_TESTS_OUTPUT_DIR, str(num), '.output', 'warp-bench-data.https.{}'.format('tsv' if load_generator == 'warp' else 'csv')) if os.path.exists(log_txt_filename): try: with open(log_txt_filename) as f: log_txt = f.read().strip() for method in ('http', 'https'): if not only_test_method or only_test_method == method: tmp = log_txt.split( "--- {} load test successful, bucket_name=". format(method))[1].split(" ---")[0] bucket_name, dt = tmp.split(' | ') assert len(bucket_name) > 5 overview_report.setdefault( 'server_{}_{}'.format(num, method), {}).update(bucket_name=bucket_name, is_successful=True, dt=dt) except Exception: traceback.print_exc() for csv_filename, csv_zst_filename, csv_gz_filename, method in ( (http_csv_filename, http_csv_zst_filename, http_csv_gz_filename, 'http'), (https_csv_filename, https_csv_zst_filename, https_csv_gz_filename, 'https')): csv_compressed_filename = csv_zst_filename if load_generator == 'warp' else csv_gz_filename csv_compression_method = 'zst' if load_generator == 'warp' else 'gz' if (not only_test_method or only_test_method == method) and os.path.exists(csv_compressed_filename): load_steps.append( DF.load(csv_filename, name='{}_{}'.format(method, num), infer_strategy=DF.load.INFER_STRINGS)) csv_decompress(csv_compression_method, csv_compressed_filename, csv_filename) overview_report.setdefault( 'server_{}_{}'.format(num, method), {}).update(csv_size=os.path.getsize(csv_filename) if os. 
path.exists(csv_filename) else 0) assert len(load_steps) > 0, "none of the servers data was loaded" stats = defaultdict(int) def aggregate_stats(row): endpoint = row['endpoint'] try: op = row['op'] error = row['error'] bytes = int(row['bytes']) duration_ns = int(row['duration_ns']) first_byte = row['first_byte'] end = row['end'] duration_after_first_byte_ns = get_duration_ns(first_byte, end) assert duration_after_first_byte_ns >= 0, row assert '~' not in endpoint assert '~' not in op key_prefix = '{}~{}'.format(endpoint, op) if error: stats['{}~errors'.format(key_prefix)] += 1 else: stats['{}~successful-requests'.format(key_prefix)] += 1 stats['{}~bytes'.format(key_prefix)] += bytes stats['{}~duration_ns'.format(key_prefix)] += duration_ns stats['{}~duration_after_first_byte_ns'.format( key_prefix)] += duration_after_first_byte_ns if duration_ns > stats['{}~max-duration_ns'.format( key_prefix)]: stats['{}~max-duration_ns'.format( key_prefix)] = duration_ns if stats['{}~min-duration_ns'.format( key_prefix)] == 0 or duration_ns < stats[ '{}~min-duration_ns'.format(key_prefix)]: stats['{}~min-duration_ns'.format( key_prefix)] = duration_ns duration_seconds = duration_ns / 1000000000 bucket = None for bucket_max_seconds in BUCKET_MAX_SECONDS: if duration_seconds <= bucket_max_seconds: bucket = bucket_max_seconds break if not bucket: bucket = BUCKET_INF stats['{}~request-duration-{}'.format(key_prefix, bucket)] += 1 except: print("Exception in res {} endpoint {}".format( row['source_resource_name'], endpoint)) raise def aggregate_errors(rows): for row in rows: if row['error'] and len(row['error']) > 0: yield row concatenate_step = DF.concatenate( { "source_resource_name": [], "op": [], "bytes": [], "endpoint": [], "file": [], "error": [], "start": [], "first_byte": [], "end": [], "duration_ns": [] }, { "name": "warp_bench_data", "path": "data/warp_bench_data.csv" }) def add_source_resource_name_step(rows): for row in rows: row['source_resource_name'] = rows.res.name yield row DF.Flow( *load_steps, add_source_resource_name_step, concatenate_step, aggregate_stats, DF.dump_to_path( os.path.join(DISTRIBUTED_LOAD_TESTS_OUTPUT_DIR, 'warp_bench_data'))).process() try: DF.Flow( *load_steps, concatenate_step, aggregate_errors, DF.dump_to_path( os.path.join(DISTRIBUTED_LOAD_TESTS_OUTPUT_DIR, 'warp_bench_data_errors'))).process() except Exception: print("No Errors!") def dump_stats(): all_field_names = set() normalized_stats = {} for k, v in stats.items(): endpoint, op, *metric = k.split('~') metric = '~'.join(metric) field_name = '{}-{}'.format(op, metric) all_field_names.add(field_name) normalized_stats.setdefault(endpoint, {})[field_name] = v totals_sum = defaultdict(int) for endpoint, row in normalized_stats.items(): if not endpoint or len(endpoint) < 4: continue for field_name in all_field_names: if field_name not in row: row[field_name] = 0.0 elif not row[field_name]: row[field_name] = 0.0 for op in ['STAT', 'DELETE', 'GET', 'PUT']: total_bytes = row.get('{}-bytes'.format(op)) or 0 total_megabytes = total_bytes / 1024 / 1024 successful_requests = row.get( '{}-successful-requests'.format(op)) or 0 error_requests = row.get('{}-errors'.format(op)) or 0 total_requests = successful_requests + error_requests row['{}-percent-errors'.format(op)] = ( error_requests / total_requests * 100) if total_requests > 0 else 0 row['{}-requests-per-second'.format( op)] = total_requests / total_duration_seconds row['{}-successful-requests-per-second'.format( op)] = successful_requests / total_duration_seconds if op in 
['GET', 'PUT']: row['{}-megabytes'.format(op)] = total_megabytes row['{}-megabytes-per-second'.format( op)] = total_megabytes / total_duration_seconds for bucket in [*BUCKET_MAX_SECONDS, BUCKET_INF]: row['{}-request-duration-percent-{}'.format( op, bucket)] = ( (row.get('{}-request-duration-{}'.format( op, bucket)) or 0) / successful_requests * 100) if successful_requests > 0 else 0 all_ops_total_errors = sum([ (row.get('{}-errors'.format(op)) or 0) for op in ['GET', 'PUT', 'STAT', 'DELETE'] ]) all_ops_total_successful_requests = sum([ (row.get('{}-successful-requests'.format(op)) or 0) for op in ['GET', 'PUT', 'STAT', 'DELETE'] ]) all_ops_total_requests = all_ops_total_errors + all_ops_total_successful_requests all_ops_total_bytes = sum([(row.get('{}-bytes'.format(op)) or 0) for op in ['GET', 'PUT']]) all_ops_total_megabytes = all_ops_total_bytes / 1024 / 1024 output_row = { 'endpoint': endpoint, 'datacenter': get_datacenter(endpoint, datacenter), 'total-percent-errors': (all_ops_total_errors / all_ops_total_requests * 100) if all_ops_total_requests > 0 else 0, 'total-requests-per-second': all_ops_total_requests / total_duration_seconds, 'total-successful-requests-per-second': all_ops_total_successful_requests / total_duration_seconds, 'total-megabytes-per-second': all_ops_total_megabytes / total_duration_seconds, 'total-errors': all_ops_total_errors, 'total-successful-requests': all_ops_total_successful_requests, 'total-megabytes': all_ops_total_megabytes, 'total-bytes': all_ops_total_bytes, **{ k: row.get(k) or 0 for k in [ *[ '{}-percent-errors'.format(op) for op in [ 'GET', 'PUT', 'STAT', 'DELETE' ] ], *[ '{}-requests-per-second'.format(op) for op in [ 'GET', 'PUT', 'STAT', 'DELETE' ] ], *[ '{}-successful-requests-per-second'.format(op) for op in [ 'GET', 'PUT', 'STAT', 'DELETE' ] ], *[ '{}-megabytes-per-second'.format(op) for op in [ 'GET', 'PUT' ] ], *[ '{}-errors'.format(op) for op in [ 'GET', 'PUT', 'STAT', 'DELETE' ] ], *[ '{}-successful-requests'.format(op) for op in [ 'GET', 'PUT', 'STAT', 'DELETE' ] ], *['{}-megabytes'.format(op) for op in ['GET', 'PUT']], *['{}-bytes'.format(op) for op in ['GET', 'PUT']], *[ 'GET-request-duration-percent-{}'.format(bucket) for bucket in [ *BUCKET_MAX_SECONDS, BUCKET_INF ] ], *[ 'PUT-request-duration-percent-{}'.format(bucket) for bucket in [ *BUCKET_MAX_SECONDS, BUCKET_INF ] ], ] } } yield output_row for metric in ['errors', 'successful-requests', 'bytes']: for op in [ 'total', 'GET', 'PUT', *(['STAT', 'DELETE'] if metric not in ['bytes'] else []) ]: op_metric = '{}-{}'.format(op, metric) totals_sum[op_metric] += (output_row.get(op_metric) or 0) for op in ['GET', 'PUT']: for bucket in [*BUCKET_MAX_SECONDS, BUCKET_INF]: op_bucket_metric_key = '{}-request-duration-{}'.format( op, bucket) totals_sum[op_bucket_metric_key] += ( row.get(op_bucket_metric_key) or 0) totals_sum_total_successful_requests = totals_sum[ 'total-successful-requests'] totals_sum_total_errors = totals_sum['total-errors'] totals_sum_total_requests = totals_sum_total_successful_requests + totals_sum_total_errors totals_sum_total_bytes = totals_sum['total-bytes'] totals_sum[ 'total-megabytes'] = totals_sum_total_megabytes = totals_sum_total_bytes / 1024 / 1024 totals_sum['total-percent-errors'] = ( totals_sum_total_errors / totals_sum_total_requests * 100) if totals_sum_total_requests > 0 else 0 totals_sum[ 'total-requests-per-second'] = totals_sum_total_requests / total_duration_seconds totals_sum[ 'total-successful-requests-per-second'] = totals_sum_total_successful_requests 
/ total_duration_seconds totals_sum[ 'total-megabytes-per-second'] = totals_sum_total_megabytes / total_duration_seconds for op in ['GET', 'PUT', 'STAT', 'DELETE']: op_errors = totals_sum['{}-errors'.format(op)] op_successful_requests = totals_sum[ '{}-successful-requests'.format(op)] op_total_requests = op_errors + op_successful_requests totals_sum['{}-percent-errors'.format(op)] = ( op_errors / op_total_requests * 100) if op_total_requests > 0 else 0 totals_sum['{}-requests-per-second'.format( op)] = op_total_requests / total_duration_seconds totals_sum['{}-successful-requests-per-second'.format( op)] = op_successful_requests / total_duration_seconds if op in ['GET', 'PUT']: totals_sum['{}-megabytes'.format( op)] = totals_sum['{}-bytes'.format(op)] / 1024 / 1024 totals_sum['{}-megabytes-per-second'.format(op)] = totals_sum[ '{}-megabytes'.format(op)] / total_duration_seconds for bucket in [*BUCKET_MAX_SECONDS, BUCKET_INF]: totals_sum['{}-request-duration-percent-{}'.format( op, bucket)] = (totals_sum['{}-request-duration-{}'.format( op, bucket)] / op_successful_requests * 100) if op_successful_requests else 0 yield {'endpoint': '*', 'datacenter': '*', **totals_sum} DF.Flow( dump_stats(), DF.dump_to_path( os.path.join(DISTRIBUTED_LOAD_TESTS_OUTPUT_DIR, 'warp_bench_data_stats'))).process() pprint(overview_report) for num, _ in servers.items(): for method in ['http', 'https']: if not only_test_method or only_test_method == method: report = overview_report.get('server_{}_{}'.format( num, method)) or {} csv_size = report.get('csv_size') or 0 is_successful = report.get('is_successful') if csv_size < 1: print( "csv size for server {}, method {} is missing".format( num, method)) return False if not is_successful: print("server {}, method {} was not successful".format( num, method)) return False return True
def flow(parameters, *_):
    def take_first(field):
        def f(row):
            if field in row and isinstance(row[field], list):
                row[field] = row[field][0]
        return Flow(
            f,
            set_type(field, type='string'),
        )

    def datetime_to_date(field):
        def f(row):
            if row.get(field):
                row[field] = row[field].date()
        return Flow(
            f,
            set_type(field, type='date'),
        )

    def approve(parameters):
        def func(row):
            if parameters.get('filter-out') is None:
                return True
            bad_phrase = parameters['filter-out']
            for f in ('page_title', 'description'):
                if row.get(f) and bad_phrase in row[f]:
                    return False
            return True
        return func

    return Flow(
        fetcher(parameters),
        concatenate(dict(
            page_title=['Title'],
            publication_id=['ItemId'],
            tender_id=['ItemUniqueId'],
            publisher=['OfficeDesc'],
            start_date=['PublishDate'],
            claim_date=['LastDate'],
            decision=['StatusDesc'],
            description=['Description'],
            last_update_date=['UpdateDate'],
            base_url=['BaseUrl'],
            url_name=['UrlName'],
            tender_type_he=['PublicationTypeDesc'],
        ), resources=-1),
        add_field('tender_type', 'string',
                  default=parameters['tender_type'], resources=-1),
        take_first('publisher'),
        take_first('tender_type_he'),
        add_field('page_url', 'string',
                  default=lambda row: 'https://www.gov.il/he{base_url}{url_name}'.format(**row)),
        # delete_fields(['base_url', 'url_name']),
        filter_rows(approve(parameters)),
        set_type('publication_id', type='integer'),
        set_type('start_date', type='datetime', format=DATE_FMT),
        set_type('last_update_date', type='datetime', format=DATE_FMT),
        set_type('claim_date', type='datetime', format=DATE_FMT),
        datetime_to_date('last_update_date'),
        datetime_to_date('start_date'),
        set_primary_key(['publication_id', 'tender_type', 'tender_id']),
        dedup(),
        update_resource(-1, **parameters.pop('resource')),
        update_resource(-1, **{'dpp:streaming': True}),
        validate(),
    )
def process_demographics(stack): key = 'stack:demographics' try: demographics_cards = _cache.get(key) except KeyError: def add_source(): def f(rows): for row in rows: row['source'] = rows.res.name yield row return DF.Flow( DF.add_field('source', 'string'), f ) def map_to_cards(): MAP = { ("דו''ח אג''ס לפי עולים וותיקים", ("סה''כ עולים",) ): 'immigrants', ("דו''ח אג''ס לפי קבוצות גיל", ('0-5', '6-12') ): 'kids', ("דו''ח אג''ס לפי קבוצות גיל", ('13-17',) ): 'teenagers', ("דו''ח אג''ס לפי קבוצות גיל", ('60-64', '65-69', '70-74', '75-120') ): 'elderly', ("דו''ח אג''ס לפי קבוצות גיל", ('18-21','22-24','25-29','30-34','35-39','40-44','45-49','50-54','55-59') ): 'adults', } def f(rows): for row in rows: for (source, kinds), kind in MAP.items(): if row['source'] == source and row['kind'] in kinds: row['kind'] = kind yield row return f s2n = dict( (int(stat_area), f['properties']['title']) for f in get_neighborhood_features() for stat_area in f['properties']['stat_areas'] ) MAP2 = dict( adults=('אוכלוסיה בוגרת', 'גברים ונשים בין גיל 18 ל-60', 0), kids=('ילדים', 'תינוקות וילדים עד גיל 12', 1), teenagers=('בני נוער', 'נערים ונערות עד גיל 18', 2), elderly=('הגיל השלישי', 'גברים ונשים מעל גיל 60', 3), immigrants=('עולים לישראל', 'תושבים שאינם ילידי ישראל', 4), ) demographics_cards = DF.Flow( *[ DF.load(f, headers=4) for f in glob.glob('demographics/*.csv') ], DF.add_field('stat_id', 'string', lambda r: r["אג''ס"]), DF.add_field('total', 'number', lambda r: r.get("סה''כ")), DF.delete_fields(["אג''ס", "סה''כ "]), DF.unpivot([dict( name="([-'א-ת0-9 ].+)", keys=dict( kind=r'\1' ) )], [dict( name='kind', type='string' )], dict( name='value', type='number' )), DF.validate(), add_source(), map_to_cards(), DF.concatenate(dict( total=[], value=[], kind=[], stat_id=[] )), DF.add_field('neighborhood', 'string', lambda r: s2n.get(int(r['stat_id']))), DF.filter_rows(lambda r: r['neighborhood']), DF.join_with_self('concat', ['neighborhood', 'kind'], dict( neighborhood=None, kind=None, total=dict(aggregate='sum'), value=dict(aggregate='sum'), )), DF.duplicate('concat', 'maxes'), DF.join_with_self('concat', ['neighborhood'], dict(neighborhood=None, total=None)), DF.join('concat', ['neighborhood'], 'maxes', ['neighborhood'], dict( total=None, )), DF.add_field('score_value', 'number', lambda r: r['value']), # /r['total'] DF.sort_rows('{score_value}', reverse=True), DF.duplicate('maxes', 'demographics'), DF.join_with_self('maxes', ['kind'], dict(kind=None, max_value=dict(name='score_value', aggregate='max'))), DF.join('maxes', ['kind'], 'demographics', ['kind'], dict(max_value=None)), DF.add_field('geometry_score', 'number', lambda r: 6*r['score_value']/r['max_value']), DF.add_field('score_display', 'string', lambda r: '{:,} ({:.0f}%)'.format(r['value'], 100*r['score_value']/r['total'])), DF.add_field('scores', 'object', lambda r: dict( title=r['neighborhood'], score_value=float(r['score_value']), score_display=r['score_display'], geometry_score=float(r['geometry_score']), )), DF.join_with_self('demographics', ['kind'], dict( kind=None, scores=dict(aggregate='array'), )), DF.add_field('title', 'string', lambda r: MAP2[r['kind']][0]), DF.add_field('content', 'string', lambda r: MAP2[r['kind']][1]), DF.add_field('order', 'integer', lambda r: MAP2[r['kind']][2]), DF.sort_rows('{order}'), DF.delete_fields(['kind']), ).results()[0][0] _cache.set(key, demographics_cards) # features = [ # dict(type='Feature', geometry=r['geometry'], properties=dict(title=r['neighborhoods'][0])) # for r in DF.Flow( # 
DF.load('geo/stat-areas/stat-areas/datapackage.json'), # ).results()[0][0] # ] # geometry=dict(type='FeatureCollection', features=features) stack.update(dict( map=True, scheme='green', currentField='neighborhood', layout='scores', # geometry=geometry )) stack.setdefault('cards', []).extend(demographics_cards)
def process_institutions(stack):
    key = 'stack:institutions'
    try:
        institutions_cards = _cache.get(key)
    except KeyError:
        CRS = '+ellps=GRS80 +k=1.00007 +lat_0=31.73439361111111 +lon_0=35.20451694444445 +no_defs +proj=tmerc +units=m +x_0=219529.584 +y_0=626907.39'
        projector = pyproj.Proj(CRS)

        def proj():
            def func(row):
                row['lon'], row['lat'] = projector(row['X'], row['Y'], inverse=True)
            return DF.Flow(
                DF.add_field('lon', 'number'),
                DF.add_field('lat', 'number'),
                func,
                DF.delete_fields(['X', 'Y'])
            )

        def translate_kind():
            translations = {
                'מרפאה': 'מרפאות',
                'איצטדיון': 'איצטדיון',
                'ספרייה': 'ספריות',
                'בית ספר': 'בתי ספר',
                'מועדון קהילתי כולל מרכז צעירים': 'מועדון קהילתי',
                'בית כנסת': 'בתי כנסת',
                'מועדון נוער': 'מועדון נוער',
                'אולם מופעים, היכל תרבות': 'מוסדות תרבות',
                'מועדון קשישים, מרכז לאזרחים ותיקים,מרכז יום לקשישים': 'מרכזי פעילות לקשישים',
            }

            def func(row):
                row['kind'] = translations[row['kind']]
            return func

        institutions_cards = DF.Flow(
            *[
                DF.load(f) for f in glob.glob('institutions/*xlsx')
            ],
            DF.concatenate(dict(
                kind=['סוג המוסד'],
                title=['שם המוסד'],
                address=['כתובת'],
                X=[],
                Y=[]
            )),
            translate_kind(),
            proj(),
            DF.add_field('feature', 'object', lambda r: geojson.Feature(
                properties=dict(title=r['title'], address=r['address']),
                geometry=geojson.Point(coordinates=[float(r['lon']), float(r['lat'])])
            )),
            DF.delete_fields(['title', 'lon', 'lat', 'address']),
            DF.join_with_self('concat', ['kind'], dict(
                title=dict(name='kind'),
                features=dict(name='feature', aggregate='array')
            )),
            DF.sort_rows('{title}', reverse=True),
            DF.add_field('pointGeometry', 'object',
                         lambda r: geojson.FeatureCollection(features=r['features'])),
            DF.add_field('content', 'string', ' '),
            DF.delete_fields(['features']),
            # DF.printer(tablefmt='html')
        ).results()[0][0]
        _cache.set(key, institutions_cards)

    stack.update(dict(
        map=True,
    ))
    stack.setdefault('cards', [])
    current_cards = dict(
        (c['title'], c) for c in stack['cards']
    )
    for card in institutions_cards:
        current_card = current_cards.pop(card['title'], None)
        if current_card is not None:
            card['content'] = current_card['content']
        else:
            print('SPURIOUS CARD for INSTITUTIONS', card['title'])
    stack['cards'] = [
        c for c in stack['cards'] if c['title'] in current_cards
    ] + institutions_cards
def fix_doc_id(rows):
    used = {}
    for row in rows:
        doc_id = row['doc_id']
        used.setdefault(doc_id, 0)
        if used[doc_id] != 0:
            row['doc_id'] += '.{}'.format(used[doc_id])
        yield row
        used[doc_id] += 1


cur_year = datetime.date.today().year

org_flow = DF.Flow(
    DF.load(ORGS_URL, name='orgs'),
    DF.concatenate(headers, resources='orgs', target=dict(name='orgs')),
    fix_urls(['org_website', 'org_facebook']),
    DF.add_field(
        'alt_names', 'array',
        default=lambda r: [
            r[x] for x in [
                'alt_name%d' % i for i in range(1, 6)
            ] + ['org_name']
            if x in r and r[x]
        ]
    ),
    DF.add_field('compact_services', 'string', lambda row: row.get('provided_services')),
    DF.delete_fields(['alt_name[1-5]']),
    *[
def prepare():
    fp = re.compile('\w+', re.UNICODE)

    def fingerprint(x):
        return ''.join(fp.findall(x.upper()))

    langs, _, _ = DF.Flow(DF.load(all_data()),
                          DF.concatenate(dict(city_name=[]))).results()
    langs = dict((k, dict()) for k in set(x['city_name'] for x in langs[0]))
    osm_he = {}
    s = tabulator.Stream(data_file('places.csv'))
    s.open()
    for item in s.iter():
        if len(item) == 0:
            continue
        item = json.loads(item[0])
        he = item.get('name:he', item.get('name'))
        if he is None:
            continue
        osm_he.setdefault(fingerprint(he), {}).update(item)
    s = tabulator.Stream(data_file('yeshuvim.csv'), headers=1)
    s.open()
    for item in s.iter(keyed=True):
        he = item['שם_ישוב'].strip().replace('(', 'XXX').replace(
            ')', '(').replace('XXX', ')').replace('  ', ' ')
        en = item['שם_ישוב_לועזי'].strip().replace('  ', ' ').replace(
            "'", 'xxx').title().replace('xxx', "'").replace('Xxx', "'")
        rec = {'name:en': en, 'name:he': he}
        he = fingerprint(he)
        osm_he.setdefault(he, {}).update(rec)
    osm = {}
    for item in osm_he.values():
        names = dict(
            (k[5:], v) for k, v in item.items() if k.startswith('name:'))
        if 'name' not in item and 'he' not in names:
            continue
        names.setdefault('he', item.pop('name', None))
        for v in names.values():
            osm.setdefault(fingerprint(v), {}).update(names)
        en = names.get('en')
        if en and '-' in en:
            for v in en.split('-'):
                osm.setdefault(fingerprint(v), {}).update(names)
        old_names = dict(('he' if k == 'old_name' else k[9:], v)
                         for k, v in item.items() if k.startswith('old_name'))
        for v in old_names.values():
            for vv in v.split(';'):
                osm.setdefault(fingerprint(vv), {}).update(names)
    for kk, place in langs.items():
        k = fingerprint(kk)
        if k in osm:
            place.update(osm[k])
        else:
            match = process.extractOne(k, osm.keys(), scorer=fuzz.ratio)
            if match[1] >= 50:
                k = match[0]
                place.update(osm[k])
            else:
                print('no match for', place, k, match)
    normalized = dict((k, dict(he=v.get('he'),
                               en=v.get('en'),
                               es=v.get('es', v.get('en')),
                               fr=v.get('fr', v.get('en')),
                               ru=v.get('ru', v.get('en')),
                               ar=v.get('ar', v.get('he'))))
                      for k, v in langs.items())
    json.dump(normalized, open(DATA_FILE, 'w'))