def clean_data(filename: str, location: str) -> None:
    """Clean and validate data with `dataflows`, creating data packages in the
    process, one for each file.

    NOTE(review): the '(unknown)' fragments below look like placeholders left
    by a scraping/anonymization pass — presumably they were derived from
    `filename`, which is otherwise unused here; confirm against the original
    source before relying on these paths.
    """
    # Shared via module global — presumably read by downstream pipeline steps
    # (e.g. concat_name_columns); confirm.
    global FILE_NAME
    FILE_NAME = f"{location}-(unknown)"
    # Per-location working directories; the middle value is unused here.
    clean_directory, _, processing_directory = set_location_dirs(location)
    exported_file = f"{clean_directory}/(unknown)"
    # process() is run for its side effects (writing the data package);
    # the return value is deliberately discarded.
    _ = Flow(
        load(
            f"{processing_directory}/(unknown).csv",
            name=FILE_NAME,
        ),
        change_path,
        add_field("NameFIPS", "string"),
        concat_name_columns,
        delete_fields(["Name", "FIPS"]),
        set_type("Data", type="any"),
        validate(),
        dump_to_path(exported_file),
    ).process()[1]
示例#2
0
def flow(*_):
    """Build the stakeholder-cleaning Flow.

    Pipeline: filter/rename the incoming rows, add the expected string
    fields, validate, parse the source document, drop the raw file columns,
    fix up field values, and finally cast the percentage columns to numbers.
    """
    # Field-creation steps, in the exact order the pipeline expects them.
    field_steps = [
        add_field('stakeholder_type', 'string'),
        add_fields(FIELDS + OPTIONAL_FIELDS, 'string'),
        add_fields(TABLE_FIELDS + OPTIONAL_TABLE_FIELDS, 'string'),
        add_fields(CORE_STAKE_HOLDER_FIELDS.values(), 'string'),
    ]
    # All four percentage/vote columns get the same numeric cast.
    numeric_casts = [
        set_type(column, type='number')
        for column in ('CapitalPct', 'VotePower',
                       'CapitalPct_Dilul', 'VotePower_Dilul')
    ]
    return Flow(
        filter_by_type,
        rename_fields,
        *field_steps,
        validate,
        parse_document,
        delete_fields([
            'document', 'pdf', 'other', 'num_files', 'parser_version',
            'source', 's3_object_name'
        ]),
        fix_fields,
        *numeric_casts,
    )
示例#3
0
def lang_flow(lang, prefix):
    """Build a Flow that loads the publications/orgs/datasets resources for
    `lang`, rewrites their item URLs to the public site, and appends a
    per-language tags resource with search URLs.
    """

    # One row per unique (prefix, hebrew tag, translated tag) triple; the
    # triple is stored as a 3-element doc_id list, unpacked again below.
    tags = [dict(doc_id=list(k)) for k in sorted(set(
            (prefix, x['hebrew'], x[lang])
            for x in translations['tags'].values()
        ))]

    def add_url(prefix_):
        # Rewrite each row's 'url' to the public item page. Rows that have a
        # 'url' field but no doc_id are dropped (and logged to stdout).
        def func(rows):
            for row in rows:
                if 'url' not in row:
                    yield row
                elif row.get('doc_id'):
                    row['url'] = 'https://yodaat.org/{}item/{}'.format(prefix_, row['doc_id'])
                    yield row
                else:
                    print('MMMMMMMM MISSING DOC ID', row)

        return DF.Flow(
            DF.add_field('url', 'string', resources=-1),
            func,
        )

    return DF.Flow(
        *[
            DF.Flow(
                DF.load('https://api.yodaat.org/data/{}_in_es/data/{}.csv'.format(x, y), name='{}-{}'.format(x, lang)),
                add_url(prefix)
            )
            for x, y in [
                ('publications', 'publications'),
                ('orgs', 'orgs'),
                ('datasets', 'out')
            ]
        ],
        # The bare list of tag rows becomes the last resource in the Flow.
        tags,
        # Search URL built from the (prefix, hebrew, translated) triple.
        DF.add_field('url', 'string',
                     lambda row: 'https://yodaat.org/{}search?tag={}&itag={}&kind=all&filters={{}}&sortOrder=-year'.format(*row.get('doc_id')),
                     resources=-1),
        DF.update_resource(-1, name='tags-{}'.format(lang)),
    )
示例#4
0
def flow(*_):
    """Load historic social-service activity data, concatenate it into a
    single 'activities' resource with Elasticsearch typing hints, and dump
    it both to a datapackage path and to SQL."""
    # Columns kept by the concatenation step (empty lists = no renames).
    concat_fields = {
        'kind': [], 'kind_he': [],
        'activity_name': [], 'activity_description': [],
        'publisher_name': [], 'history': [],
        'max_year': [], 'min_year': [],
    }
    # Item schema for the nested per-year history objects.
    history_schema = {
        'fields': [
            {'name': 'year', 'type': 'integer'},
            {'name': 'unit', 'type': 'string'},
            {'name': 'subunit', 'type': 'string'},
            {'name': 'subsubunit', 'type': 'string'},
            {'name': 'allocated_budget', 'type': 'integer'},
            {'name': 'num_beneficiaries', 'type': 'string', 'es:index': False},
        ]
    }
    return DF.Flow(
        DF.load('/var/datapackages/activities/social_services/historic_data/datapackage.json'),
        DF.concatenate(concat_fields,
                       {'name': 'activities', 'path': 'activities.csv'}),
        DF.set_primary_key(['kind', 'publisher_name', 'activity_name']),
        DF.set_type('activity_name', **{'es:title': True}),
        DF.set_type('activity_description',
                    **{'es:itemType': 'string', 'es:boost': True}),
        DF.set_type('kind', **{'es:keyword': True, 'es:exclude': True}),
        DF.set_type('kind_he', **{'es:keyword': True, 'es:exclude': True}),
        DF.set_type('publisher_name', **{'es:keyword': True}),
        DF.set_type('history',
                    **{'es:itemType': 'object', 'es:schema': history_schema}),
        # Score scales with the latest allocated budget (fallback 1000).
        DF.add_field('score', 'number',
                     lambda x: (x['history'][0]['allocated_budget'] or 1000) / 1000,
                     **{'es:score-column': True}),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        DF.dump_to_path('/var/datapackages/activities/all'),
        DF.dump_to_sql({'activities': {'resource-name': 'activities'}})
    )
def flow(*_):
    """Scrape the Maya notification list into a streaming CSV resource.

    Declares the schema fields up-front, runs the scraper, then keys the
    resource by URL.

    NOTE(review): set_primary_key appears twice — ['s3_object_name'] before
    scraping and ['url'] after; presumably the second call supersedes the
    first. Confirm this sequencing is intentional.
    """
    return Flow(
        add_field('date', 'date'),
        add_field('source', 'string'),
        add_field('s3_object_name', 'string'),
        add_field('url', 'string'),
        add_field('pdf', 'array'),
        add_field('other', 'array'),
        add_field('num_files', 'number'),
        set_primary_key(['s3_object_name']),
        scrape_maya_notification_list(),
        set_primary_key(['url']),
        update_resource(-1,
                        name='maya_notification_list',
                        path="data/maya_notification_list.csv",
                        **{
                            PROP_STREAMING: True,
                        }),
    )
def flow(*_):
    """Fetch budget codes from the database and return a Flow that attaches
    them to every row as an ES-typed 'resolved_budget_codes' array field."""
    engine = create_engine(os.environ['DPP_DB_ENGINE'])
    query_result = engine.execute(query)
    # Index the fetched records by their budget code.
    codes = {record['code']: record
             for record in (dict(r) for r in query_result)}
    logging.info('GOT %d CODES', len(codes))
    # Elasticsearch schema for the nested code objects.
    item_schema = {
        'fields': [
            {'name': 'code', 'type': 'string', 'es:keyword': True},
            {'name': 'year', 'type': 'integer'},
            {'name': 'title', 'type': 'string'},
            {'name': 'doc_id', 'type': 'string', 'es:index': False},
        ]
    }
    return DF.Flow(
        DF.add_field(
            'resolved_budget_codes',
            'array',
            default=process_row(codes),
            **{
                'es:itemType': 'object',
                'es:schema': item_schema,
            },
        ),
    )
示例#7
0
def flow(*_):
    """Build the social-services activities resource: clean and enrich the
    raw services table, add Elasticsearch typing/scoring hints, and dump
    both the full (including deleted rows) and the filtered datasets."""
    return DF.Flow(
        services(),
        DF.delete_fields(
            ['__tab', 'complete', 'non_suppliers', 'non_tenders', 'notes']),
        DF.add_field('publisher_name', 'string', lambda r: r['office'],
                     **{'es:keyword': True}),
        splitter('target_audience'),
        splitter('subject'),
        splitter('intervention'),
        splitter('target_age_group'),
        floater('beneficiaries'),
        floater('budgetItems'),
        floater('manualBudget'),
        floater('tenders'),
        floater('suppliers'),
        floater('virtue_of_table'),
        fix_suppliers(),
        fix_tenders(),
        add_current_budget(),
        add_current_beneficiaries(),
        DF.add_field('min_year', 'integer', 2020),
        DF.add_field('max_year', 'integer', CURRENT_YEAR),
        DF.add_field('kind', 'string', 'gov_social_service', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.add_field('kind_he', 'string', 'שירות חברתי', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.set_type('name', **{'es:title': True}),
        DF.set_type('description', **{
            'es:itemType': 'string',
            'es:boost': True
        }),
        DF.add_field('score', 'number', get_score, **{'es:score-column':
                                                      True}),
        DF.set_primary_key(['kind', 'id']),
        DF.update_resource(-1, name='activities', **{'dpp:streaming': True}),
        # Full dataset (deleted rows included) goes to its own table first.
        DF.dump_to_sql(dict(all_activities={'resource-name': 'activities'})),
        DF.filter_rows(lambda r: not r['deleted']),
        DF.delete_fields(['deleted']),
        DF.dump_to_path('/var/datapackages/activities/social_services'),
        DF.dump_to_sql(dict(activities={'resource-name': 'activities'})),
    )
示例#8
0
def fix_suppliers():
    """Flow fragment that enriches each row's supplier list in place and
    derives aggregate supplier statistics: entity-id counts per kind,
    supplier-kind classification, supplier-count bucket, and geo coverage."""
    geo = fetch_codelist('geo_region')

    def func(row):
        suppliers = row.get('suppliers') or []
        kinds = set()
        all_ids = set()
        ids_by_kind = {
            'company': set(),
            'association': set(),
            'municipality': set(),
        }
        geos = set()
        for supplier in suppliers:
            # Strip search-highlight markup leaking in from upstream.
            for key in ('entity_id', 'entity_name'):
                if supplier.get(key):
                    supplier[key] = supplier[key].replace('<em>', '').replace('</em>', '')
            # Resolve geo codes to their codelist values.
            supplier['geo'] = [geo[code] for code in supplier.get('geo', [])]
            geos.update(supplier['geo'])
            # Drop empty year bounds so the defaults below apply.
            for key in ('year_activity_start', 'year_activity_end'):
                if key in supplier and not supplier[key]:
                    del supplier[key]
            first_year = supplier.get('year_activity_start') or 2020
            last_year = supplier.get('year_activity_end') or CURRENT_YEAR
            supplier['activity_years'] = list(range(first_year, last_year + 1))
            entity_id = supplier['entity_id']
            all_ids.add(entity_id)
            entity_kind = supplier['entity_kind']
            if entity_kind == 'company':
                kinds.add('עסקי')
                ids_by_kind['company'].add(entity_id)
            elif entity_kind in ('association', 'ottoman-association',
                                 'cooperative'):
                kinds.add('מגזר שלישי')
                ids_by_kind['association'].add(entity_id)
            elif entity_kind == 'municipality':
                kinds.add('רשויות מקומיות')
                ids_by_kind['municipality'].add(entity_id)
            else:
                kinds.add('אחר')
        row['supplier_count'] = len(all_ids)
        row['supplier_count_company'] = len(ids_by_kind['company'])
        row['supplier_count_association'] = len(ids_by_kind['association'])
        row['supplier_count_municipality'] = len(ids_by_kind['municipality'])
        row['geo_coverage'] = 'ארצי' if 'ארצי' in geos else 'אזורי'

        # Single kind -> that kind; multiple -> "mixed"; none -> None.
        if not kinds:
            row['supplier_kinds'] = None
        elif len(kinds) == 1:
            row['supplier_kinds'] = kinds.pop()
        else:
            row['supplier_kinds'] = 'משולב'

        # Bucket the supplier count into display categories.
        n_suppliers = len(suppliers)
        if n_suppliers == 0:
            row['supplier_count_category'] = None
        elif n_suppliers == 1:
            row['supplier_count_category'] = '1'
        elif n_suppliers <= 5:
            row['supplier_count_category'] = '2-5'
        else:
            row['supplier_count_category'] = '6+'

    return DF.Flow(
        DF.add_field('supplier_count_category', 'string'),
        DF.add_field('supplier_kinds', 'string'),
        DF.add_field('supplier_count', 'integer'),
        DF.add_field('supplier_count_company', 'integer'),
        DF.add_field('supplier_count_association', 'integer'),
        DF.add_field('supplier_count_municipality', 'integer'),
        DF.add_field('geo_coverage', 'string'),
        func,
    )
示例#9
0
def flow():
    """Split the combined 'objeto del gasto' code into its four levels and
    attach human-readable descriptions from the bundled code lists.

    Fixes over the previous revision:
    - `CT` had `ID_CAPITULO` tagged as a *label* columnType, a duplicate
      `DESC_CAPITULO` key that silently shadowed the level-1 label, and no
      `DESC_CONCEPTO` entry even though `process` writes
      `row[CN['DESC_CONCEPTO']]` (which raised KeyError).
    - `new_columns` now covers every column `process` writes that is not
      already present in the source data.
    - `nb_generica_digits` is computed before use, so it can never be
      referenced while unbound.
    """
    CT = dict([
        ('ID_CAPITULO', 'economic-classification:generic:level1:code'),
        ('DESC_CAPITULO', 'economic-classification:generic:level1:label'),
        ('ID_CONCEPTO', 'economic-classification:generic:level2:code'),
        ('DESC_CONCEPTO', 'economic-classification:generic:level2:label'),
        ('ID_PARTIDA_GENERICA', 'economic-classification:generic:level3:code'),
        ('DESC_PARTIDA_GENERICA', 'economic-classification:generic:level3:label'),
        ('ID_PARTIDA_ESPECIFICA', 'economic-classification:generic:level4:code'),
        ('DESC_PARTIDA_ESPECIFICA', 'economic-classification:generic:level4:label'),
    ])
    # Internal field names cannot contain ':' — use '-' variants.
    CN = dict(
        (k, v.replace(':', '-'))
        for k, v in CT.items()
    )

    # Columns written by `process` that are not in the source data.
    # ID_CONCEPTO is read from the source, so it is not added here.
    new_columns = [
        'ID_CAPITULO', 'DESC_CAPITULO', 'DESC_CONCEPTO',
        'ID_PARTIDA_GENERICA', 'DESC_PARTIDA_GENERICA',
        'ID_PARTIDA_ESPECIFICA', 'DESC_PARTIDA_ESPECIFICA',
    ]

    steps = []
    steps.extend(
        add_field(CN[title], 'string', title=title, columnType=CT[title])
        for title in new_columns
    )

    # Build per-kind description lookups from the bundled datapackage.
    lookup = {}
    codes = datapackage.Package(
        os.path.join(os.path.dirname(__file__), 'objeto_del_gasto.datapackage.zip')
    )
    for resource in codes.resources:
        kind = resource.name
        lookup[kind] = {}
        for row in resource.iter(keyed=True):
            key = row[kind.upper().replace('Í', 'I')]
            value = row['DESCRIPCION']
            lookup[kind][key] = value

    def process(row):
        year = int(row['date-fiscal-year'])

        # Rows from 2019 onwards already ship with the columns split.
        if year < 2019:
            objeto = row[CN['ID_CONCEPTO']]
            # 2008-2010 use 4-digit 'partida generica' codes, later years 3.
            nb_generica_digits = 4 if year in (2008, 2009, 2010) else 3
            if objeto:
                row[CN['ID_CAPITULO']] = objeto[0] + '000'
                row[CN['ID_CONCEPTO']] = objeto[:2] + '00'
                row[CN['DESC_CAPITULO']] = lookup['capitulo'].get(row[CN['ID_CAPITULO']])
                row[CN['DESC_CONCEPTO']] = lookup['concepto'].get(row[CN['ID_CONCEPTO']])

            if objeto and len(objeto) >= 4:
                row[CN['ID_PARTIDA_GENERICA']] = objeto[:nb_generica_digits]

            row[CN['DESC_PARTIDA_GENERICA']] = lookup['partida_generica'].get(row.get(CN['ID_PARTIDA_GENERICA']))

            if year not in (2008, 2009, 2010):
                if objeto and len(objeto) >= 5:
                    row[CN['ID_PARTIDA_ESPECIFICA']] = objeto
                    row[CN['DESC_PARTIDA_ESPECIFICA']] = \
                        lookup['partida_específica'].get(row.get(CN['ID_PARTIDA_ESPECIFICA']))

    steps.append(process)
    return Flow(*steps)
def download_item_pages(rows):
    """Row processor: download each row's item page into
    data/musportal-item-pages/files/rownum_<N>.html (skipping files that
    already exist) and annotate the row with status code, HTML length, and
    the saved filename. Yields the annotated rows."""
    session = HTMLSession()
    os.makedirs('data/musportal-item-pages/files', exist_ok=True)
    for rownum, row in enumerate(rows):
        filename = 'data/musportal-item-pages/files/rownum_{}.html'.format(rownum)
        if os.path.exists(filename):
            # Already fetched on a previous run — keep the file, no metadata.
            print('file exists: {}'.format(filename))
            status_code, html_length = None, None
        else:
            status_code, html_content = download_item_page(session, row['item_url'])
            with open(filename, 'w') as f:
                f.write(html_content)
            print('saved file: {}'.format(filename))
            html_length = len(html_content)
        row['downloaded_status_code'] = status_code
        row['downloaded_html_length'] = html_length
        row['downloaded_file_name'] = filename
        yield row


# Run the download flow over the checkpointed item list and print the second
# element of Flow.process()'s return value (presumably the run statistics —
# confirm against the dataflows version in use).
print(Flow(
    load('musportal/.checkpoints/all_page_items/datapackage.json'),
    add_field('downloaded_status_code', 'integer'),
    add_field('downloaded_html_length', 'integer'),
    add_field('downloaded_file_name', 'string'),
    download_item_pages,
    printer(),
    dump_to_path('data/musportal-item-pages'),
).process()[1])
def process_institutions(stack):
    """Populate `stack` with institution map cards.

    Loads institution spreadsheets, converts grid coordinates to lon/lat,
    groups features into one GeoJSON FeatureCollection per institution kind,
    caches the resulting cards, and merges previously-edited card contents
    back in by title.
    """
    key = 'stack:institutions'
    try:
        institutions_cards = _cache.get(key)
    except KeyError:
        # Projection parameters for the local grid; the inverse projection
        # below yields lon/lat. NOTE(review): looks like the Israeli TM grid
        # (ITM) — confirm.
        CRS = '+ellps=GRS80 +k=1.00007 +lat_0=31.73439361111111 +lon_0=35.20451694444445 +no_defs +proj=tmerc +units=m +x_0=219529.584 +y_0=626907.39'
        projector = pyproj.Proj(CRS)

        def proj():
            # Replace the X/Y grid fields with lon/lat fields.
            def func(row):
                row['lon'], row['lat'] = projector(row['X'], row['Y'], inverse=True)
            return DF.Flow(
                DF.add_field('lon', 'number'),
                DF.add_field('lat', 'number'),
                func,
                DF.delete_fields(['X', 'Y'])
            )

        def translate_kind():
            # Map raw spreadsheet kind labels to canonical display names.
            # Rows with an unlisted kind raise KeyError (fail fast).
            translations = {
                'מרפאה': 'מרפאות',
                'איצטדיון': 'איצטדיון',
                'ספרייה': 'ספריות',
                'בית ספר': 'בתי ספר',
                'מועדון קהילתי כולל מרכז צעירים': 'מועדון קהילתי',
                'בית כנסת': 'בתי כנסת',
                'מועדון נוער': 'מועדון נוער',
                'אולם מופעים, היכל תרבות': 'מוסדות תרבות',
                'מועדון קשישים, מרכז לאזרחים ותיקים,מרכז יום לקשישים': 'מרכזי פעילות לקשישים',
            }
            def func(row):
                row['kind'] = translations[row['kind']]
            return func

        institutions_cards = DF.Flow(
            *[
                DF.load(f)
                for f in glob.glob('institutions/*xlsx')
            ],
            DF.concatenate(dict(
                kind=['סוג המוסד'],
                title=['שם המוסד'],
                address=['כתובת'],
                X=[], Y=[]
            )),
            translate_kind(),
            proj(),
            DF.add_field('feature', 'object', 
                        lambda r: geojson.Feature(
                            properties=dict(title=r['title'], address=r['address']),
                            geometry=geojson.Point(coordinates=[float(r['lon']), float(r['lat'])])
                        )),
            DF.delete_fields(['title', 'lon', 'lat', 'address']),
            # One card per kind, collecting its features into an array.
            DF.join_with_self('concat', ['kind'], dict(
                title=dict(name='kind'),
                features=dict(name='feature', aggregate='array')
            )),
            DF.sort_rows('{title}', reverse=True),
            DF.add_field('pointGeometry', 'object', lambda r: geojson.FeatureCollection(features=r['features'])),
            DF.add_field('content', 'string', '&nbsp;'),
            DF.delete_fields(['features']),
        #     DF.printer(tablefmt='html')
        ).results()[0][0]
        _cache.set(key, institutions_cards)

    stack.update(dict(
        map=True,
    ))
    stack.setdefault('cards', [])
    # Index existing cards by title so manual edits to card content survive.
    current_cards = dict(
        (c['title'], c) for c in stack['cards']
    )
    for card in institutions_cards:
        current_card = current_cards.pop(card['title'], None)
        if current_card is not None:
            card['content'] = current_card['content']
        else:
            print('SPURIOUS CARD for INSTITUTIONS', card['title'])
    # Keep non-institution cards (titles not claimed above) and append the
    # freshly built institution cards.
    stack['cards'] = [
        c for c in stack['cards']
        if c['title'] in current_cards
    ] + institutions_cards
 def flow(self):
     """Prepare the publisher Flow.

     Returns None when publishing is disallowed by configuration; otherwise
     assembles the configured combination of: a denormalized datapackage
     dump, a DB dump normalized into per-hierarchy side tables plus a
     babbage cube model, and an Elasticsearch update step.
     """
     steps = []
     if not self.config.get(CONFIG_PUBLISH_ALLOWED):
         return None
     logger.info('Publisher Flow Preparing')
     if self.output_datapackage:
         logger.info('Publisher Flow: Dump To Path Denorm...')
         steps.extend([
             dump_to_path(self.output_datapackage)
         ])
     if self.output_db:
         db_table = 'dgp__{}_{}'.format(
             self.config.get(CONFIG_TAXONOMY_ID),
             self.config.get(CONFIG_EXTRA_METADATA_DATASET_NAME),
         )
         logger.info('Publisher Flow: Dump To DB... (%s)', db_table)
         primary_key = self.config.get(CONFIG_PRIMARY_KEY)
         mapping = self.config.get(CONFIG_MODEL_MAPPING)
         # Enrich each mapped column in place with derived metadata (slug,
         # hierarchy, physical column, measure flag, label, data type) that
         # the normalization groups and babbage model below rely on.
         for m in mapping:
             if 'columnType' in m and m['columnType']:
                 m['slug'] = self.slugify(m['title'])
                 m['hierarchy'] = self.slugify(m['columnType'].split(':')[0])
                 m['column'] = self.column(m['columnType'])
                 m['primaryKey'] = m['columnType'] in primary_key
                 m['measure'] = m['hierarchy'] == 'value'
                 m['full_column'] = (
                     m['column'] if m['measure']
                     else '{}_{hierarchy}.{column}'.format(db_table, **m)
                 )
                 m['label'] = self.fetch_label(m['columnType'])
                 m['dataType'] = self.fetch_datatype(m['columnType'])
         # Group the non-measure columns by hierarchy prefix; each prefix
         # becomes a normalized side table referenced from the fact table.
         prefixes = set(
             m['hierarchy']
             for m in mapping
             if m.get('measure') is False
         )
         prefixed = dict(
             (p, list(filter(lambda m: m.get('hierarchy') == p, mapping)))
             for p in prefixes
         )
         groups = [
             NormGroup([
                     m['column']
                     for m in prefixed_items
                 ], self.ref_column(prefix), self.id_column(),
                 db_table='{}_{}'.format(db_table, prefix))
             for prefix, prefixed_items in prefixed.items()
         ]
         # Babbage OLAP model: dimensions come from primary-key dimension
         # columns, measures from 'value' columns, hierarchies per prefix.
         babbage_model = dict(
             dimensions=dict(
                 (m['slug'], dict(
                     label=m['title'],
                     key_attribute=m['slug'],
                     attributes=dict([
                         (m['slug'], dict(
                             column=m['full_column'],
                             label=m['title'],
                             type=m['dataType'],
                         ))
                     ] + ([
                         (m['label']['slug'], dict(
                             column=m['label']['full_column'],
                             label=m['label']['title'],
                             type=m['label']['dataType'],
                         ))
                     ] if m.get('label') else [])),
                     join_column=[
                         self.ref_column(m['hierarchy']),
                         self.id_column()
                     ],
                     **(dict(
                         label_attribute=m['label']['slug']
                     ) if m.get('label') else {})
                 ))
                 for m in self.config.get(CONFIG_MODEL_MAPPING)
                 if m.get('measure') is False and m.get('primaryKey') is True
             ),
             fact_table=db_table,
             measures=dict(
                 (
                     m['slug'],
                     dict(
                         column=m['column'],
                         label=m['title'],
                         type='number'
                     )
                 )
                 for m in self.config.get(CONFIG_MODEL_MAPPING)
                 if m.get('measure') is True
             ),
             hierarchies=dict(
                 (prefix, dict(
                     label=prefix,
                     levels=[
                         m['slug']
                         for m in prefixed_items
                         if m.get('primaryKey') is True
                     ]
                 ))
                 for prefix, prefixed_items in prefixed.items()
             ),
         )
         steps.append(
             update_package(babbage_model=babbage_model)
         )
         # Tag every row with its source URL and clear previously-published
         # rows from the same source before appending.
         source = self.config.get(CONFIG_URL)
         logger.info('Publisher Flow: _source Handling...')
         steps.extend([
             add_field('_source', 'string', source),
             append_to_primary_key(['_source']),
             clear_by_source(self.lazy_engine(), db_table, source),
         ])
         logger.info('Publisher Flow: Normalize...')
         steps.extend([
             normalize_to_db(
                 groups,
                 db_table,
                 RESOURCE_NAME,
                 self.output_db,
                 'append'
             ),
         ])
         if self.output_datapackage:
             logger.info('Publisher Flow: Dump To Path Norm...')
             steps.extend([
                 dump_to_path(self.output_datapackage + '-norm')
             ])
     if self.output_es:
         logger.info('Publisher Flow: ES...')
         steps.extend([
             self.update_es()
         ])
     logger.info('Publisher Flow Prepared')
     return Flow(*steps)
# Template for linking back to a source spreadsheet (document id + sheet gid).
URL_PATTERN = 'https://docs.google.com/spreadsheets/d/{id}/edit#gid={gid}'

# %%
# Build one named load-Flow per (source, sheet) pair, tagging each resource
# with its publication year and publishing office.
loads = []
i = 0
for source in SOURCES:
    for sheet in source['sheets']:
        i += 1
        resource_name = 'res_{}'.format(i)
        url = source['filename']
        loads.append((resource_name,
                      DF.Flow(
                          DF.load(url,
                                  name=resource_name,
                                  **sheet.get('options', {})),
                          DF.add_field('year', 'integer', source['year']),
                          DF.add_field('publisher_name', 'string',
                                       sheet['office']),
                      )))

FIELD_MAPPING = dict(
    year=[],
    publisher_name=[],
    unit=[
        'מינהל/ חטיבה', 'מינהל/ אגף', 'שם מינהל האגף', 'מנהל / אגף', 'מינהל'
    ],
    subunit=['אגף', 'אגף/ מחלקה', 'שם האגף / מחלקה', 'אגף / מחלקה'],
    subsubunit=['מחלקה'],
    activity_name=[
        'שם השירות', 'שם השירות החברתי', 'שם השירות  חברתי', 'שירות חברתי'
    ],
示例#14
0
    return func


if __name__ == '__main__':

    r, _, _ = DF.Flow(
        DF.load(all_data(),
                name='cities',
                headers=1,
                override_fields=dict(area_id=dict(type='string')),
                cast_strategy=DF.load.CAST_WITH_SCHEMA),
        DF.filter_rows(lambda r: r['is_city']),
        DF.add_field(
            'score_date', 'object',
            lambda r: dict(weekday=r['date'].isoweekday() % 7,
                           date=r['date'].toordinal(),
                           sr=float(r['symptoms_ratio_weighted'] or 0),
                           nr=int(r['num_reports_weighted']))),
        DF.concatenate(dict(id=[], city_name=[], score_date=[]),
                       target=dict(name='popup_data')),
        DF.join_with_self(
            'popup_data', '{city_name}',
            dict(id=None,
                 city_name=None,
                 scores=dict(name='score_date', aggregate='array'))),
        sort_limit_scores(),
        DF.filter_rows(lambda r: r['scores'] is not None),
        DF.add_field('nr', 'integer', lambda r: r['scores'][-1]['nr']),
        DF.add_field('sr', 'number', lambda r: r['scores'][-1]['sr']),
        split_to_weeks(),
        DF.add_field('translations', 'object',
示例#15
0
                    type=str,
                    help="Source ID for filter CKAN API")

args = parser.parse_args()

# Propagate the CLI arguments into the shared config module.
config.SOURCE_NAME = args.name  # Nice name of the source
config.CKAN_CATALOG_URL = args.catalog_url
config.SOURCE_ID = args.harvest_source_id

res = Flow(
    # add other resource to this process. The packages list from data.gov
    get_current_ckan_resources_from_api(harvest_source_id=config.SOURCE_ID),
    update_resource('res_1', name='ckan_results'),
    # new field at this copy for comparison results
    add_field(name='comparison_results',
              type='object',
              resources='ckan_results'),

    # Compare both resources
    # In data.json the datasets have the identifier field: "identifier": "USDA-ERS-00071"
    # In CKAN API results the datasets have the same identifier at "extras" list: {"key": "identifier", "value": "USDA-ERS-00071"}
    compare_resources,
).results()

# Save the comparison results as pretty-printed JSON. A context manager
# guarantees the file is closed even if the write fails (the previous
# open/write/close sequence leaked the handle on error).
dmp = json.dumps(res[0][0], indent=2)
with open(config.get_flow2_datasets_result_path(), 'w') as f:
    f.write(dmp)
def flow(*_):
    """Index government organisational units: derive the office/unit/
    subunit/subsubunit hierarchy from each row's path, keep only top-level
    offices, and dump the result to path and SQL."""
    return DF.Flow(
        all_units(),
        # path is a list like [office, unit, subunit, subsubunit]; each level
        # becomes its own keyword field (None where the path is shorter).
        DF.add_field('office', 'string', lambda r: r['path'][0]
                     if len(r['path']) > 0 else None, **{'es:keyword': True}),
        DF.add_field('unit', 'string', lambda r: r['path'][1]
                     if len(r['path']) > 1 else None, **{'es:keyword': True}),
        DF.add_field('subunit', 'string', lambda r: r['path'][2]
                     if len(r['path']) > 2 else None, **{'es:keyword': True}),
        DF.add_field('subsubunit', 'string', lambda r: r['path'][3]
                     if len(r['path']) > 3 else None, **{'es:keyword': True}),
        DF.add_field('breadcrumbs', 'string',
                     lambda r: '/'.join(r['path']) or 'משרדי הממשלה',
                     **{'es:exclude': True}),
        DF.add_field('id', 'string', lambda r: '__'.join(r['path']) or 'main',
                     **{'es:exclude': True}),
        DF.delete_fields([
            'path',
        ]),
        DF.add_field('min_year', 'integer', 2020),
        DF.add_field('max_year', 'integer', 2020),
        DF.add_field('kind', 'string', 'gov_social_service_unit', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.add_field('kind_he', 'string', 'שירותים חברתיים במיקור חוץ', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.add_field('score', 'number', 1000, **{'es:score-column': True}),
        DF.set_primary_key(['kind', 'id']),
        DF.update_resource(-1, name='units', **{'dpp:streaming': True}),

        # Ensure we only have the main offices
        DF.filter_rows(lambda r: r['unit'] is None),
        DF.filter_rows(lambda r: r['office'] != 'משרד העליה והקליטה'),
        DF.dump_to_path('/var/datapackages/units/social_services'),
        DF.dump_to_sql(dict(units={'resource-name': 'units'})))
示例#17
0
def objeto_del_gasto(config):
    """Return a Flow that splits the combined objeto-del-gasto code into its
    capítulo/concepto/partida levels and fills in their descriptions from
    the bundled code-list datapackage. Missing target columns are added
    conditionally (ID columns also appended to the primary key) before the
    row processor runs, and the schema is reordered by taxonomy order."""

    logging.info('PREPARING objeto_del_gasto processing')

    CT = COLUMN_MAPPING
    # Internal field names cannot contain ':' — use '-' variants.
    CN = dict((k, v.replace(':', '-')) for k, v in CT.items())

    # Build per-kind description lookups from the bundled datapackage.
    lookup = {}
    codes = datapackage.Package(
        os.path.join(os.path.dirname(__file__),
                     'objeto_del_gasto.datapackage.zip'))
    for resource in codes.resources:
        kind = resource.name
        lookup[kind] = {}
        for row in resource.iter(keyed=True):
            key = row[kind.upper().replace('Í', 'I')]
            value = row['DESCRIPCION']
            lookup[kind][key] = value

    def process(row):
        year = int(row['date-fiscal-year'])

        # Only rows before 2019 need splitting; later years already ship
        # with the columns split.
        if year < 2019:
            objeto = row[CN['ID_CONCEPTO']]
            if objeto and objeto != '-':
                row[CN['ID_CAPITULO']] = objeto[0] + '000'
                row[CN['ID_CONCEPTO']] = objeto[:2] + '00'
                row[CN['DESC_CAPITULO']] = lookup['capitulo'].get(
                    row[CN['ID_CAPITULO']])
                row[CN['DESC_CONCEPTO']] = lookup['concepto'].get(
                    row[CN['ID_CONCEPTO']])

                # 2008-2010 use 4-digit 'partida generica' codes, later 3.
                nb_generica_digits = 4 if year in (2008, 2009, 2010) else 3

            if objeto and len(objeto) >= 4:
                row[CN['ID_PARTIDA_GENERICA']] = objeto[:nb_generica_digits]

            row[CN['DESC_PARTIDA_GENERICA']] = lookup['partida_generica'].get(
                row.get(CN['ID_PARTIDA_GENERICA']))

            if year not in (2008, 2009, 2010):
                if objeto and len(objeto) >= 5:
                    row[CN['ID_PARTIDA_ESPECIFICA']] = objeto
                    row[CN['DESC_PARTIDA_ESPECIFICA']] = \
                        lookup['partida_específica'].get(row.get(CN['ID_PARTIDA_ESPECIFICA']))

    def missing_field(mf):
        # Predicate: True when the first resource's schema lacks field `mf`.
        def func(dp):
            return all(f.name != mf for f in dp.resources[0].schema.fields)

        return func

    def sort_by_ct():
        # Reorder schema fields to follow the taxonomy's columnType order;
        # unknown columnTypes sort last (index 1000).
        def func(package):
            ct_indexes = dict(
                (ct['name'], i)
                for i, ct in enumerate(config.get(CONFIG_TAXONOMY_CT)))
            fields = sorted(((ct_indexes.get(f.get('columnType'), 1000), f)
                             for f in package.pkg.descriptor['resources'][0]
                             ['schema']['fields']),
                            key=lambda x: x[0])
            package.pkg.descriptor['resources'][0]['schema']['fields'] = [
                f[1] for f in fields
            ]
            yield package.pkg
            yield from package

        return func

    return Flow(
        *[
            conditional(
                missing_field(CN[f]),
                Flow(add_field(CN[f], 'string', columnType=ct, title=f),
                     append_to_primary_key(CN[f]) if 'ID_' in f else None))
            for f, ct in CT.items()
        ], sort_by_ct(), process)
示例#18
0
 def postflow(self):
     """Return a Flow adding the configured 'extra.metadata' mapping
     (default {}) to every row as an 'object' field named 'metadata'."""
     extra = self.config._unflatten().get('extra', {})
     metadata = extra.get('metadata', {})
     return Flow(add_field('metadata', 'object', metadata))
def process_stack_demand(stack):
    """Build the 'demand' cards from demand.xlsx and attach to *stack*.

    Loads the spreadsheet, unpivots the per-neighborhood percentage columns,
    computes per-(category, subcategory) scores normalized to the subcategory
    maximum, groups the resulting score cards by category, and extends the
    given stack with the card list matching its name.  Results are cached
    under 'stack:demand'.
    """

    def collect_cats():
        # Column whose value marks category/subcategory header rows.
        F = 'כלל המדגם'

        def f(rows):
            cat = None
            for row in rows:
                if F in row:
                    v = row[F]
                    if v.startswith('סך הכל '):
                        # "total" header row: opens a new category
                        cat = v[7:]
                    elif v.startswith('--- '):
                        # indented row: a subcategory of the current category;
                        # rows marked as "no breakdown" are dropped
                        if not v.endswith('ללא פירוט'):
                            subcat = v[4:]
                            row['category'] = cat
                            row['subcategory'] = subcat
                            yield row
                else:
                    yield row
        return DF.Flow(
            DF.add_field('category', 'string', resources=-1),
            DF.add_field('subcategory', 'string', resources=-1),
            f,
            DF.delete_fields([F], resources=-1),
        )

    def fix_nones(row):
        # Treat missing demand percentages as zero.
        row['demand_pct'] = row['demand_pct'] or 0

    key = 'stack:demand'
    try:
        demand_stacks = _cache.get(key)
    except KeyError:
        # Cache miss (the cache raises KeyError): compute once and store.
        demand_stacks = DF.Flow(
            DF.load('demand.xlsx', infer_strategy=DF.load.INFER_STRINGS, headers=2),
            collect_cats(),
            DF.update_schema(-1, missingValues=['--']),
            # Unpivot per-neighborhood columns (headers like "<name> (X)")
            # into (neighborhood, demand_pct) rows.
            DF.unpivot(
                unpivot_fields=[dict(
                    name='(.+) \\([A-Z]\\)',
                    keys=dict(
                        neighborhood='\\1'
                    ),
                )],
                extra_keys=[dict(
                    name='neighborhood', type='string'
                )],
                extra_value=dict(
                    name='demand_pct', type='number'
                ),
                resources=-1
            ),
            DF.validate(),
            DF.duplicate('demand', 'demand_stacks'),
            # Max demand per (category, subcategory), joined back for scoring.
            DF.join_with_self('demand', ['category', 'subcategory'], dict(
                category=None, subcategory=None, max_demand=dict(name='demand_pct', aggregate='max')
            )),
            DF.join(
                'demand', ['category', 'subcategory'],
                'demand_stacks', ['category', 'subcategory'],
                dict(
                    max_demand=None
                )
            ),
            fix_nones,
            DF.add_field('display', 'string', lambda r: '{:.0f}%'.format(r['demand_pct'] * 100)),
            DF.add_field('value', 'number', lambda r: r['demand_pct']),
            # Normalize demand to a 0-6 score relative to the subcategory max.
            DF.add_field('score', 'number', lambda r: r['demand_pct'] / r['max_demand'] * 6),
            DF.delete_fields(['demand_pct', 'max_demand']),
            DF.sort_rows('{score}', reverse=True),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_display=r['display'],
                score_value=float(r['value']),
                geometry_score=float(r['score']),
            )),
            # One row per subcategory with its aggregated neighborhood scores...
            DF.join_with_self('demand_stacks', ['category', 'subcategory'], dict(
                category=None, subcategory=None,
                scores=dict(aggregate='array'),
            )),
            DF.add_field('card', 'object', lambda r: dict(
                title='ביקוש ל{}'.format(r['subcategory']),
                content='',
                scores=r['scores'],
                test='demand__{category}__{subcategory}'.format(**r).replace(' ', '_')
            )),
            # ...then one row per category holding its list of cards.
            DF.join_with_self('demand_stacks', ['category'], dict(
                category=None,
                cards=dict(name='card', aggregate='array'),
            )),
            DF.add_field('name', 'string', lambda r: 'demand.{}'.format(r['category']).replace(' ', '_')),
        ).results()[0][0]
        _cache.set(key, demand_stacks)

    # Select the card list for this stack's name and attach it.
    cards = [s for s in demand_stacks if s['name'] == stack['name']][0]['cards']
    stack.update(dict(
        layout='scores',
        currentField='neighborhood',
        map=True
    ))
    stack.setdefault('cards', []).extend(cards)
示例#20
0
    def _convert_type(cls, schema_type, field, prefix):
        """Extend the base ES type mapping: an explicit 'es:keyword' hint
        forces the 'keyword' type; numeric fields are marked as indexed.

        NOTE(review): takes ``cls`` — presumably decorated as a classmethod
        on the enclosing class; confirm against the class definition.
        """
        prop = super()._convert_type(schema_type, field, prefix)
        if field.get('es:keyword'):
            prop['type'] = 'keyword'
        elif schema_type in ('number', 'integer'):
            prop['index'] = True
        return prop


if __name__ == '__main__':
    DF.Flow(
        DF.load('new-york-city-current-job-postings.zip',
                filename='nyc-jobs.csv',
                name='jobs'),
        DF.add_field('doc_id',
                     'string',
                     default=lambda row: 'job/{Job ID}'.format(**row)),
        DF.add_field('score', 'integer', default=1),
        DF.set_type('Salary Frequency', **{'es:keyword': True}),
        DF.set_primary_key(['doc_id']),
        dump_to_es(indexes={'jobs-job': [{
            'resource-name': 'jobs',
        }]},
                   mapper_cls=SampleMappingGenerator), DF.dump_to_path('data'),
        DF.add_field('value',
                     'object',
                     default=lambda row: dict((k, v) for k, v in row.items()
                                              if k not in ('doc_id', 'score')),
                     **{'es:index': False}),
        DF.select_fields(['doc_id', 'value']),
        dump_to_es(indexes={'jobs-document': [{
示例#21
0
def flow(*_):
    # Build the welfare activities datapackage: read the source spreadsheet,
    # derive the common activity fields from the Hebrew source columns, dump
    # the package to disk, then return a flow that re-streams the dumped
    # package downstream.
    DF.Flow(
        DF.load(filename, name='welfare'),
        DF.add_field('activity_name', 'string',
                     lambda r: r['שם השירות (ציבורי)']),
        # Drop rows that have no activity name.
        DF.filter_rows(lambda r: r['activity_name']),
        DF.add_field(
            'activity_description', 'array', lambda r:
            [r['תיאור השירות (תיאור קצר)'] + '\n' + r['השירות (מטרת השירות)']
             ]),
        DF.add_field(
            'history', 'array', lambda r: [
                dict(
                    year=2019,
                    unit=r['יחידה ארגונית נותנת השירות'].split('/')[0].strip(),
                    subunit=r['יחידה ארגונית נותנת השירות'].split('/')[1].
                    strip(),
                    # NOTE(review): subsubunit reuses split index [1], the
                    # same as subunit — looks like it may have been meant to
                    # be [2]; confirm against the source data.
                    subsubunit=r['יחידה ארגונית נותנת השירות'].split('/')[
                        1].strip(),
                )
            ]), DF.add_field('target_audience', 'array',
                             splitter('אוכלוסייה')),
        DF.add_field('subject', 'array', splitter('תחום ההתערבות')),
        DF.add_field('intervention', 'array', splitter('אופן התערבות')),
        DF.select_fields(FIELDS),
        # Constant metadata columns for this publisher / year / kind.
        DF.add_field('publisher_name', 'string', 'משרד הרווחה'),
        DF.add_field('min_year', 'integer', 2019),
        DF.add_field('max_year', 'integer', 2019),
        DF.add_field('kind', 'string', 'gov_social_service'),
        DF.add_field('kind_he', 'string', 'שירות חברתי'), DF.printer(),
        DF.validate(), DF.dump_to_path('tmp/activities-welfare')).process()
    return DF.Flow(
        DF.load('tmp/activities-welfare/datapackage.json'),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
示例#22
0
def test_add_field():
    """add_field appends columns with defaults (or None) to every row and
    registers matching entries in the datapackage schema."""
    from dataflows import Flow, add_field
    rows, dp, _ = Flow(
        (dict(a=i) for i in range(3)),
        add_field('b', 'string', 'b'),
        add_field('c', 'number'),
        add_field('d', 'boolean', title='mybool'),
    ).results()
    expected_rows = [dict(a=i, b='b', c=None, d=None) for i in range(3)]
    assert rows == [expected_rows]
    expected_fields = [
        {'format': 'default', 'name': 'a', 'type': 'integer'},
        {'format': 'default', 'name': 'b', 'type': 'string'},
        {'format': 'default', 'name': 'c', 'type': 'number'},
        {'format': 'default', 'name': 'd', 'title': 'mybool', 'type': 'boolean'},
    ]
    expected_resource = {
        'name': 'res_1',
        'path': 'res_1.csv',
        'profile': 'tabular-data-resource',
        'schema': {
            'fields': expected_fields,
            'missingValues': [''],
        },
    }
    assert dp.descriptor == {
        'profile': 'data-package',
        'resources': [expected_resource],
    }
示例#23
0
def flow(parameters, *_):
    # Build a flow scraping gov.il tender publications: fetch, normalize
    # source column names, derive the page URL, filter out unwanted rows,
    # set types, deduplicate and validate.

    def take_first(field):
        # Some source fields arrive as lists; keep only the first element
        # and declare the field as a plain string.
        def f(row):
            if field in row and isinstance(row[field], list):
                row[field] = row[field][0]

        return Flow(
            f,
            set_type(field, type='string'),
        )

    def datetime_to_date(field):
        # Truncate a datetime field to its date component.
        def f(row):
            if row.get(field):
                row[field] = row[field].date()

        return Flow(
            f,
            set_type(field, type='date'),
        )

    def approve(parameters):
        # Row filter: drop rows whose title or description contains the
        # configured 'filter-out' phrase; keep everything when unset.
        def func(row):
            if parameters.get('filter-out') is None:
                return True
            bad_phrase = parameters['filter-out']
            for f in ('page_title', 'description'):
                if row.get(f) and bad_phrase in row[f]:
                    return False
            return True

        return func

    return Flow(
        fetcher(parameters),
        # Map the API's column names onto the common tender schema.
        concatenate(dict(
            page_title=['Title'],
            publication_id=['ItemId'],
            tender_id=['ItemUniqueId'],
            publisher=['OfficeDesc'],
            start_date=['PublishDate'],
            claim_date=['LastDate'],
            decision=['StatusDesc'],
            description=['Description'],
            last_update_date=['UpdateDate'],
            base_url=['BaseUrl'],
            url_name=['UrlName'],
            tender_type_he=['PublicationTypeDesc'],
        ),
                    resources=-1),
        add_field('tender_type',
                  'string',
                  default=parameters['tender_type'],
                  resources=-1),
        take_first('publisher'),
        take_first('tender_type_he'),
        add_field('page_url',
                  'string',
                  default=lambda row:
                  'https://www.gov.il/he{base_url}{url_name}'.format(**row)),
        # delete_fields(['base_url', 'url_name']),
        filter_rows(approve(parameters)),
        set_type('publication_id', type='integer'),
        set_type('start_date', type='datetime', format=DATE_FMT),
        set_type('last_update_date', type='datetime', format=DATE_FMT),
        set_type('claim_date', type='datetime', format=DATE_FMT),
        datetime_to_date('last_update_date'),
        datetime_to_date('start_date'),
        set_primary_key(['publication_id', 'tender_type', 'tender_id']),
        dedup(),
        update_resource(-1, **parameters.pop('resource')),
        update_resource(-1, **{'dpp:streaming': True}),
        validate(),
    )
示例#24
0
                    'video_url': '',
                    'main_image_url': '',
                    'preview_image_url': '',
                    'image_urls': [],
                    'item_url_he': '',
                    'item_url_en': row['URL']
                }
    else:
        yield from rows


if __name__ == '__main__':
    # Regenerate the cached-API dumps: load the parsed API data, add the
    # per-language item-URL fields, enrich Ethiopian family names, and write
    # one cached-api.php per place before printing a sample of the output.
    run_dump_print(
        Flow(
            load('data/parse_cached_apis/datapackage.json'),
            add_field('item_url_he', 'string'),
            add_field('item_url_en', 'string'),
            add_ethiopia_familynames,
            dump_cached_api('czeck-cached-api',
                            'bhjs-content/places/czech/cached-api.php'),
            dump_cached_api('ethiopia-cached-api',
                            'bhjs-content/places/ethiopia/cached-api.php'),
        ),
        'data/dump_cached_apis',
        fields=['UnitType', 'UnitTypeDesc', 'header_en', 'header_he'],
        num_rows=1,
        resources=['ethiopia-cached-api'])
    print(
        'Saved updated cached-api.php files in bhjs-content/places/*/cached-api.php'
    )
def flow(*_):
    """Scrape education-ministry calls for bids into the tenders schema.

    Loads the source JSON feed, maps the Hebrew source columns onto the
    common tender fields, extracts contact / budget details, builds the
    documents list and computes a publication id.

    Fix: the original pipeline added the 'tender_type_he' field twice
    (redundant, and adding an already-existing field can fail); the
    duplicate add_field was removed.
    """
    return DF.Flow(
        DF.load(URL, format='json', property='jData', name='education'),
        # DF.checkpoint('education'),
        # Map the feed's column names onto the common tender schema; the
        # *_x fields are intermediate and deleted at the end.
        DF.concatenate(dict(
            page_title=['Title'],
            start_date=['PobKKPublishingDate'],
            claim_date=['PobLastDate'],
            target_audience_x=['PobBudgetEntitties'],
            description=['PobTaktzir'],
            email=['PobPedagogyContactHtml'],
            publishing_unit_x=['PobYechida'],
            budget_code_x=['PobTakanaTaktzivitString'],
            att_title=['PobCreteriaLink_description'],
            att_url=['PobCreteriaLink_url'],
        ),
                       resources=-1,
                       target=dict(name='education')),
        # Constant publisher / tender-type metadata.
        DF.add_field('page_url', 'string', PAGE_URL, resources=-1),
        DF.add_field('publisher', 'string', 'משרד החינוך', resources=-1),
        DF.add_field('tender_type', 'string', 'call_for_bids', resources=-1),
        DF.add_field('tender_type_he', 'string', 'קול קורא', resources=-1),
        DF.add_field('publication_id', 'integer', 0, resources=-1),
        DF.add_field('tender_id', 'string', '0', resources=-1),
        # Derived fields extracted from the intermediate columns.
        DF.add_field('contact',
                     'string',
                     lambda row: extract_hebrew(row, 'email'),
                     resources=-1),
        DF.add_field('target_audience',
                     'string',
                     lambda row: extract_hebrew(row, 'target_audience_x'),
                     resources=-1),
        DF.add_field('contact_email',
                     'string',
                     lambda row: extract_email(row, 'email'),
                     resources=-1),
        DF.add_field('publishing_unit',
                     'string',
                     lambda row: row['publishing_unit_x'][0]['PobYechida'],
                     resources=-1),
        DF.add_field('budget_code',
                     'string',
                     lambda row: extract_budget_code(row, 'budget_code_x'),
                     resources=-1),
        DF.set_type('start_date', type='date', format='%d/%m/%Y %H:%M:%S'),
        DF.set_type('claim_date', type='datetime', format='%d/%m/%Y %H:%M:%S'),
        DF.add_field('documents',
                     'array',
                     lambda row: [
                         dict(description=row['att_title'],
                              link=row['att_url'],
                              update_time=str(row['start_date']))
                     ],
                     resources=-1),
        # Drop the intermediate columns now that everything is derived.
        DF.delete_fields([
            'email', 'publishing_unit_x', 'budget_code_x', 'att_title',
            'att_url', 'target_audience_x'
        ],
                         resources=-1),
        calculate_publication_id(6),
        DF.update_resource(-1, **{'dpp:streaming': True}))
def process_demographics(stack):
    """Build demographics cards from the stat-area CSV reports and attach
    them to *stack*.

    Aggregates stat-area population counts into neighborhoods, maps report
    columns to population kinds (kids, teenagers, adults, elderly,
    immigrants), scores each neighborhood relative to the per-kind maximum,
    and extends the stack with the resulting cards.  Results are cached
    under 'stack:demographics'.
    """
    key = 'stack:demographics'
    try:
        demographics_cards = _cache.get(key)
    except KeyError:
        # Cache miss (the cache raises KeyError): compute once and store.
        def add_source():
            # Record which report (resource) each row came from.
            def f(rows):
                for row in rows:
                    row['source'] = rows.res.name
                    yield row
            return DF.Flow(
                DF.add_field('source', 'string'),
                f
            )

        def map_to_cards():
            # Map (report name, column-group) pairs to a card kind; rows
            # not matching any entry are dropped.
            MAP = {
                ("דו''ח אג''ס לפי עולים וותיקים",
                        ("סה''כ עולים",)
                ): 'immigrants',
                ("דו''ח אג''ס לפי קבוצות גיל",
                        ('0-5', '6-12')
                ): 'kids',
                ("דו''ח אג''ס לפי קבוצות גיל",
                        ('13-17',)
                ): 'teenagers',
                ("דו''ח אג''ס לפי קבוצות גיל",
                        ('60-64', '65-69', '70-74', '75-120')
                ): 'elderly',
                ("דו''ח אג''ס לפי קבוצות גיל",
                        ('18-21','22-24','25-29','30-34','35-39','40-44','45-49','50-54','55-59')
                ): 'adults',
            }

            def f(rows):
                for row in rows:
                    for (source, kinds), kind in MAP.items():
                        if row['source'] == source and row['kind'] in kinds:
                            row['kind'] = kind
                            yield row
            return f

        # Map stat-area ids to neighborhood titles.
        s2n = dict(
            (int(stat_area), f['properties']['title'])
            for f in get_neighborhood_features()
            for stat_area in f['properties']['stat_areas']
        )

        # Per-kind card title, description and display order.
        MAP2 = dict(
            adults=('אוכלוסיה בוגרת', 'גברים ונשים בין גיל 18 ל-60', 0),
            kids=('ילדים', 'תינוקות וילדים עד גיל 12', 1),
            teenagers=('בני נוער', 'נערים ונערות עד גיל 18', 2),
            elderly=('הגיל השלישי', 'גברים ונשים מעל גיל 60', 3),
            immigrants=('עולים לישראל', 'תושבים שאינם ילידי ישראל', 4),
        )

        demographics_cards = DF.Flow(
            *[
                DF.load(f, headers=4)
                for f in glob.glob('demographics/*.csv')
            ],
            DF.add_field('stat_id', 'string', lambda r: r["אג''ס"]),
            DF.add_field('total', 'number', lambda r: r.get("סה''כ")),
            DF.delete_fields(["אג''ס", "סה''כ "]),
            # Unpivot the remaining per-group columns into (kind, value) rows.
            DF.unpivot([dict(
                name="([-'א-ת0-9 ].+)",
                keys=dict(
                    kind=r'\1'
                )
            )], [dict(
                name='kind', type='string'
            )], dict(
                name='value', type='number'
            )),
            DF.validate(),
            add_source(),
            map_to_cards(),
            DF.concatenate(dict(
                total=[], value=[], kind=[], stat_id=[]
            )),
            # Roll stat areas up to neighborhoods; drop unmapped areas.
            DF.add_field('neighborhood', 'string', lambda r: s2n.get(int(r['stat_id']))),
            DF.filter_rows(lambda r: r['neighborhood']),
            DF.join_with_self('concat', ['neighborhood', 'kind'], dict(
                neighborhood=None,
                kind=None,
                total=dict(aggregate='sum'),
                value=dict(aggregate='sum'),
            )),
            DF.duplicate('concat', 'maxes'),
            DF.join_with_self('concat', ['neighborhood'], dict(neighborhood=None, total=None)),
            DF.join('concat', ['neighborhood'], 'maxes', ['neighborhood'], dict(
                total=None,
            )),
            DF.add_field('score_value', 'number', lambda r: r['value']), # /r['total']  
            DF.sort_rows('{score_value}', reverse=True),
            DF.duplicate('maxes', 'demographics'),
            # Per-kind maximum, joined back for normalizing the scores.
            DF.join_with_self('maxes', ['kind'], dict(kind=None, max_value=dict(name='score_value', aggregate='max'))),
            DF.join('maxes', ['kind'], 'demographics', ['kind'], dict(max_value=None)),
            # 0-6 geometry score relative to the per-kind maximum.
            DF.add_field('geometry_score', 'number', lambda r: 6*r['score_value']/r['max_value']),
            DF.add_field('score_display', 'string', lambda r: '{:,} ({:.0f}%)'.format(r['value'], 100*r['score_value']/r['total'])),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_value=float(r['score_value']),
                score_display=r['score_display'],
                geometry_score=float(r['geometry_score']),
            )),
            # One card per kind, aggregating its neighborhood scores.
            DF.join_with_self('demographics', ['kind'], dict(
                kind=None, scores=dict(aggregate='array'),
            )),
            DF.add_field('title', 'string', lambda r: MAP2[r['kind']][0]),
            DF.add_field('content', 'string', lambda r: MAP2[r['kind']][1]),
            DF.add_field('order', 'integer', lambda r: MAP2[r['kind']][2]),
            DF.sort_rows('{order}'),
            DF.delete_fields(['kind']),
        ).results()[0][0]
        _cache.set(key, demographics_cards)

    # features = [
    #     dict(type='Feature', geometry=r['geometry'], properties=dict(title=r['neighborhoods'][0]))
    #     for r in DF.Flow(
    #         DF.load('geo/stat-areas/stat-areas/datapackage.json'),
    #     ).results()[0][0]
    # ]
    # geometry=dict(type='FeatureCollection', features=features)

    stack.update(dict(
        map=True,
        scheme='green',
        currentField='neighborhood',
        layout='scores',
        # geometry=geometry
    ))
    stack.setdefault('cards', []).extend(demographics_cards)
def flow(*_):
    return Flow(
        update_resource(
            -1,
            name='maya_tase_companies_current_management',
            path="data/maya_tase_companies_current_management.csv",
        ), add_field('CompanyLongName', 'string'),
        add_field('CorporateNo', 'string'), add_field('Site', 'string'),
        add_field('CapitalPercent', 'string'),
        add_field('EndBalance', 'string'), add_field('Id', 'string'),
        add_field('IsFinancialExpert', 'number'),
        add_field('IsInspectionComitee', 'number'),
        add_field('IsManager', 'boolean'), add_field('Name', 'string'),
        add_field('RoleType', 'string'), add_field('SecurityName', 'string'),
        add_field('VoteCapital', 'string'), process_companies)
示例#28
0
    return func

def sort_limit_scores():
    """Return a row processor that sorts a row's 'scores' list by date and
    keeps only the 30 most recent entries (missing list -> empty list)."""
    def limit(row):
        ordered = sorted(row.get('scores', []), key=lambda s: s['date'])
        row['scores'] = ordered[-30:]

    return limit

if __name__ == '__main__':

    r, _, _ = DF.Flow(
        DF.load(all_data(), name='cities', headers=1,
                override_fields=dict(area_id=dict(type='string')),
                cast_strategy=DF.load.CAST_WITH_SCHEMA),
        DF.filter_rows(lambda r: r['is_city']),
        DF.add_field('score_date', 'object', lambda r: dict(
            date=r['date'].isoformat(), sr=float(r['symptoms_ratio_weighted'] or 0), nr=int(r['num_reports_weighted']))
        ),
        DF.concatenate(dict(
            id=[], city_name=[], score_date=[]
        ), target=dict(name='ranking')),
        DF.join_with_self('ranking', '{city_name}', dict(
            id=None, city_name=None, scores=dict(name='score_date', aggregate='array')
        )),
        sort_limit_scores(),
        DF.filter_rows(lambda r: r['scores'][-1]['nr'] >= 200),
        DF.add_field('sortkey', 'integer', lambda r: int(r['scores'][-1]['sr'] * 1000000) + r['scores'][-1]['nr']),
        DF.sort_rows('{sortkey}', reverse=True),
        DF.delete_fields(['sortkey']),
        DF.add_field('rank', 'integer', 0),
        DF.add_field('translations', 'object', lambda r: city_translations[r['city_name']]),
        DF.add_field('image', 'object', lambda r: upload_static_image(r['id'], width=280*2, height=160*2)),
            row['doc_id'] += '.{}'.format(used[doc_id])
        yield row
        used[doc_id] += 1

cur_year = datetime.date.today().year

org_flow = DF.Flow(
    DF.load(ORGS_URL, name='orgs'), 
    DF.concatenate(headers, resources='orgs', target=dict(name='orgs')),
    fix_urls(['org_website', 'org_facebook']),
    DF.add_field(
        'alt_names', 'array',
        default=lambda r: [
            r[x]
            for x in [
                'alt_name%d' % i
                for i in range(1, 6)
            ] + ['org_name']
            if x in r and r[x]
        ]
    ),
    DF.add_field('compact_services', 'string', lambda row: row.get('provided_services')),
    DF.delete_fields(['alt_name[1-5]']),
    *[
        split_and_translate(
            f, f, 
            delimiter=',',
            keyword=f in ('org_kind', 'life_areas', 'languages', 'tags', 'compact_services')
        )
        for f in ('languages', 'life_areas', 'tags', 'regions', 'org_kind',
                  'specialties', 'provided_services', 'target_audiences', 'compact_services')
示例#30
0
def add_gps_coordinates(stats, kv, parameters):
    """Return a Flow that enriches rows with GPS coordinates.

    Optionally loads the db / gdrive datapackages, adds lat/lng fields (plus
    workplace coordinates and accuracy flags for the db resource), geocodes
    each row via get_coords, and optionally dumps the result to a path.

    Fixes: corrected the typo in the workplace_fields error message
    ("wokrplace" -> "workplace") and renamed the loop variable `input`,
    which shadowed the builtin.
    """
    logging.info('adding gps coordinates')

    def _add_gps_coordinates(rows):
        logging.info("resource name = " + rows.res.name)
        # Resource names other than "db_data" look like "<source>__...";
        # the prefix selects the field mapping to use.
        if rows.res.name == "db_data":
            source = "db"
        else:
            source = rows.res.name.split("__")[0]
        fields = parameters["source_fields"][source]
        workplace_fields = parameters.get("workplace_source_fields", {}).get(source)
        if workplace_fields and source != "db":
            raise Exception("sorry, workplace_fields is only supported for db source")
        for row in rows:
            inputs = {}
            workplace_inputs = {}
            # Collect geocoding inputs from the mapped columns; db values
            # are stored JSON-encoded and must be decoded first.
            for k, v in row.items():
                mapped = fields.get(k.strip())
                if mapped and v and v.strip():
                    if mapped in inputs:
                        logging.warning("duplicate input %s, %s: %s" % (source, mapped, row))
                    elif source == "db":
                        inputs[mapped] = json.loads(v)
                    else:
                        inputs[mapped] = v
                if workplace_fields:
                    mapped = workplace_fields.get(k.strip())
                    if mapped and v and v.strip():
                        if mapped in workplace_inputs:
                            logging.warning("duplicate workplace_input %s, %s: %s" % (source, mapped, row))
                        elif source == "db":
                            workplace_inputs[mapped] = json.loads(v)
                        else:
                            workplace_inputs[mapped] = v
            lat, lng, accurate = get_coords(stats, kv, inputs, get_coords_callback=parameters.get("get-coords-callback"))
            if workplace_fields:
                workplace_lat, workplace_lng, workplace_accurate = get_coords(stats, kv, workplace_inputs, get_coords_callback=parameters.get("get-coords-callback"))
            # Coordinates are emitted as strings; accuracy flags only exist
            # for the db source.
            yield {
                **row,
                "lat": str(lat),
                "lng": str(lng),
                **({"address_street_accurate": str(accurate)} if source == "db" else {}),
                **({
                    "workplace_lat": str(workplace_lat),
                    "workplace_lng": str(workplace_lng),
                    **({"workplace_street_accurate": str(workplace_accurate)} if source == "db" else {}),
                } if workplace_fields else {}),
            }
        logging.info(str(dict(stats)))

    flow_args = []
    if parameters.get('load_db_data'):
        flow_args += [
            load(os.path.join(parameters['load_db_data'], 'datapackage.json'))
        ]
    if parameters.get('load_gdrive_data'):
        flow_args += [
            load(os.path.join(parameters['load_gdrive_data'], 'datapackage.json'))
        ]
    flow_args += [
        add_field('lat', 'string', default="0"),
        add_field('lng', 'string', default="0"),
        add_field('address_street_accurate', 'string', default="0", resources="db_data"),
        add_field('workplace_lat', 'string', default="0", resources="db_data"),
        add_field('workplace_lng', 'string', default="0", resources="db_data"),
        add_field('workplace_street_accurate', 'string', default="0", resources="db_data"),
        _add_gps_coordinates,
    ]
    if parameters.get('dump_to_path'):
        flow_args += [
            dump_to_path(parameters['dump_to_path'])
        ]
    return Flow(*flow_args)