Example #1
def test_delete_field():
    from dataflows import Flow, delete_fields
    # `data` is a module-level fixture in the original test suite; assumed shape:
    data = [
        dict(x=1, y='a'),
        dict(x=2, y='b'),
        dict(x=3, y='c'),
    ]
    data2 = [
        dict(x=1, y='a', xx=True),
        dict(x=2, y='b', xx=True),
        dict(x=3, y='c', xx=True),
    ]
    f = Flow(
        data,
        delete_fields(['x'])
    )
    results, dp, _ = f.results()
    for i in results[0]:
        assert list(i.keys()) == ['y']
    assert dp.descriptor['resources'][0]['schema']['fields'] == \
        [dict(name='y', type='string', format='default')]

    f = Flow(
        data,
        data2,
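        # 'x+' is treated as a regex here: it matches both 'x' and 'xx'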
        delete_fields(['x+'])
    )
    results, dp, _ = f.results()
    for res in results:
        for i in res:
            assert list(i.keys()) == ['y']
    assert dp.descriptor['resources'][0]['schema']['fields'] == \
        [dict(name='y', type='string', format='default')]
Example #2
def prepare_locations():
    prepare_addresses()
    return DF.Flow(
        DF.load('_cache_addresses/datapackage.json'),
        DF.add_field(
            'address', 'string', lambda r: '{} {}{}'.format(
                r['street_name'], r['house_number'], r['letter'] or '')),
        DF.add_field(
            'item', 'object',
            lambda r: dict(value=dict(lat=float(r['lat']),
                                      lon=float(r['lon']),
                                      arnona_zones=r['arnona_zones'],
                                      שם=r['address']),  # 'שם' is Hebrew for 'name'
                           display=r['address'])),
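        # Sort by house number, then drop the raw location fields once 'item' is built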
        DF.sort_rows('{house_number}'),
        DF.delete_fields([
            'house_number', 'letter', 'lat', 'lon', 'arnona_zones', 'address'
        ]),
        DF.join_with_self(
            'concat', ['street_name'],
            dict(display=dict(name='street_name'),
                 items=dict(name='item', aggregate='array'))),
        DF.add_field('sort_street_address', 'string',
                     lambda r: sort_street_address(r['display'])),
        DF.sort_rows('{sort_street_address}'),
        DF.delete_fields(['sort_street_address']),
        DF.printer(),
        DF.dump_to_path('_cache_locations'),
        DF.checkpoint('_cache_locations')).results()[0][0]
Example #3
def get_neighborhood_features():
    return DF.Flow(
        DF.load('neighborhoods.xlsx',
                name='stat-areas',
                deduplicate_headers=True),
        DF.add_field(
            'neighborhoods', 'array', lambda r:
            [v for k, v in r.items() if v and k.startswith('neighborhood')]),
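        # geometries is a module-level dict mapping a stat-area code to its geometry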
        DF.add_field('geometry', 'object',
                     lambda r: geometries[r['stat-area']]),
        DF.concatenate(
            dict(stat_area=['stat-area'], neighborhoods=[], geometry=[])),
        DF.update_resource(-1, name='stat-areas'),
        unwind_neighborhoods(),
        DF.join_with_self(
            'stat-areas', ['neighborhood'],
            dict(
                neighborhood=None,
                stat_areas=dict(name='stat_area', aggregate='array'),
                geometries=dict(name='geometry', aggregate='array'),
            )),
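        # Merge each neighborhood's stat-area geometries into a single shape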
        DF.add_field('geometry', 'object',
                     lambda r: unite_geometries(r['geometries'])),
        DF.delete_fields(['geometries']),
        DF.update_resource(-1, name='neighborhoods'),
        DF.add_field(
            'properties', 'object', lambda r: dict(
                x=3, title=r['neighborhood'], stat_areas=r['stat_areas'])),
        DF.delete_fields(['neighborhood', 'stat_areas']),
        DF.checkpoint('_cache_neighborhoods')).results()[0][0]
Example #4
def decp_processing():
    flow = Flow(

        # Load the CSV produced by the earlier JSON conversion
        load("decp.csv"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),

        # Sort by rootId and seq to prepare for creating donneesActuelles
        sort_rows('{rootId}:{seq}', resources=0, reverse=True),
        donnees_actuelles,

        # rootId and seq can now be deleted
        delete_fields(["rootId", "seq"], resources=0, regex=False),
        sort_rows('{datePublicationDonnees}', resources=0, reverse=True),

        # New table dedicated to the contracts, without contractor data
        print("Création de la table dédiée aux marchés..."),
        duplicate(source="decp",
                  target_name="decp-sans-titulaires",
                  target_path="decp-sans-titulaires.csv",
                  duplicate_to_end=True),
        delete_fields([
            "titulaire.id", "titulaire.denominationSociale",
            "titulaire.typeIdentifiant"
        ],
                      resources="decp-sans-titulaires",
                      regex=False),
        set_primary_key(["uid"], resources="decp-sans-titulaires"),
        deduplicate(),

        # Load the previous DECP in CSV format, to extract the new data
        # print("Téléchargement des données tabulaires précédentes..."),
        # load("https://decp.info/db/decp.csv?_size=max&_dl=1", name="previous-decp"),
        # set_type("acheteur.id", type="string"),
        # set_type("titulaire.id", type="string"),
        # set_type("codeCPV", type="string"),
        # set_type("lieuExecution.code", type="string"),
        # delete_fields(["rowid"], resources="previous-decp", regex=False),
        # #print("Fusion des données tabulaires précédentes et des données d'aujourd'hui..."),
        # concatenate({},target={"name": "decp-titulaires","path": "decp-titulaires.csv"},resources=["decp","previous-decp"]),

        # Load the previous data dedicated to contractors
        print("Chargement des données titulaires..."),
        load("decp-titulaires.csv", name="decp-titulaires"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),
        set_type("departement", type="string"),
        set_type("codeAPE", type="string"),
        print("Enregistrement des données sur le disque..."),
        dump_to_path("decp"))
    flow.process()
Example #5
def AFRR_Data():
    unpivoting_fields = [{
        'name': 'aFRR_DownActivated',
        'keys': {
            'product': 'aFRR_DownActivated'
        }
    }, {
        'name': 'aFRR_UpActivated',
        'keys': {
            'product': 'aFRR_UpActivated'
        }
    }]
    extra_keys = [{'name': 'product', 'type': 'string'}]
    extra_value = {'name': 'amount', 'type': 'number'}
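    # The unpivot step below turns the two activation columns into (product, amount) rows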
    flow = Flow(
        # Load inputs - using the 'datastore_search_sql' API, load the last 1,000 rows:
        load(
            'https://api.energidataservice.dk/datastore_search_sql?sql=select%20*%20from%20afrrreservesdk1%20order%20by%20"HourUTC"%20desc%20limit%201000',
            format="json",
            property="result.records",
            name="fact_afrr"),
        # Remove extra fields:
        delete_fields(fields=['_id', '_full_text', 'HourDK']),
        # Save the results
        checkpoint('afrr'),
        # Normalize/unpivot:
        unpivot(unpivoting_fields, extra_keys, extra_value),
        add_computed_field([
            dict(target=dict(name='PriceArea', type='string'),
                 operation='constant',
                 with_='DK1'),
            dict(target=dict(name='PriceDKK', type='number'),
                 operation='constant',
                 with_='dummy'),
            dict(target=dict(name='PriceEUR', type='number'),
                 operation='constant',
                 with_='dummy')
        ]),
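        # add_price is a helper step defined elsewhere in the original module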
        add_price,
        delete_fields(fields=[
            'aFRR_DownPriceDKK', 'aFRR_DownPriceEUR', 'aFRR_UpPriceDKK',
            'aFRR_UpPriceEUR'
        ]),
        add_metadata(name='marketdata', title='Marketdata prototype'),
        update_resource(resources=None, mediatype='text/csv'),
        update_resource(
            resources='fact_afrr',
            title='Automatic Frequency Restoration Reserves',
            source=
            'https://www.energidataservice.dk/dataset/afrrreservesdk1/resource_extract/0694e216-6713-4f84-9b98-7bb5bc11d80c'
        ),
        printer(),
        dump_to_path('afrr_data'))
    flow.process()
Example #6
def flow(*_):
    return Flow(
        update_resource(
            -1,
            name='maya_stakeholder_list',
            path="data/maya_stakeholder_list.csv",
        ),
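        # filter_by_type, parse_document, validate and fix_fields are helper steps from the original module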
        filter_by_type,
        rename_fields(RENAME_FIELDS),
        add_field('stakeholder_type', 'string'),
        add_fields(FIELDS + OPTIONAL_FIELDS, 'string'),
        add_fields(TABLE_FIELDS + OPTIONAL_TABLE_FIELDS, 'string'),
        add_fields(CORE_STAKE_HOLDER_FIELDS.values(), 'string'),
        validate,
        parse_document,
        delete_fields([
            'document', 'pdf', 'other', 'num_files', 'parser_version',
            'source', 's3_object_name'
        ]),
        fix_fields,
        set_type('CapitalPct', type='number'),
        set_type('VotePower', type='number'),
        set_type('CapitalPct_Dilul', type='number'),
        set_type('VotePower_Dilul', type='number'),
    )
Example #7
def flow(*_):
    return Flow(
        update_resource(
            -1,
            name='maya_holdings_change',
            path="data/maya_holdings_change.csv",
        ),
        filter_by_type,
        add_fields(
            FIELDS + OPTIONAL_FIELDS + TABLE_FIELDS + OPTIONAL_TABLE_FIELDS,
            'string'),
        rename_fields(RENAME_FIELDS),
        fix_fields(FIELDS + OPTIONAL_FIELDS + TABLE_FIELDS +
                   OPTIONAL_TABLE_FIELDS),
        validate,
        parse_document,
        delete_fields([
            'document',
            'other',
            'num_files',
            'parser_version',
            'source',
            's3_object_name',
        ]),
    )
Example #8
def split_keyword_list(new_fieldname, fieldname, delimiter=','):
    def splitter():
        def func(row):
            if row.get(fieldname):
                row[new_fieldname] = [
                    x.strip() for x in row[fieldname].split(delimiter)
                ]
            else:
                row[new_fieldname] = []

        return func

    steps = []
    if new_fieldname != fieldname:
        steps.append(add_field(new_fieldname, type='array'))
    steps.append(splitter())
    if new_fieldname != fieldname:
        steps.append(delete_fields([fieldname]))
    steps.append(
        set_type(new_fieldname,
                 type='array',
                 **{
                     'es:itemType': 'string',
                     'es:keyword': True
                 }))
    return Flow(*steps)
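
A minimal usage sketch (the 'tags'/'tag_list' names are hypothetical, not from the original source):

def test_split_keyword_list():
    from dataflows import Flow
    # Split a comma-separated 'tags' string into a 'tag_list' array field
    results, _, _ = Flow(
        [{'tags': 'a, b, c'}],
        split_keyword_list('tag_list', 'tags'),
    ).results()
    assert results[0] == [{'tag_list': ['a', 'b', 'c']}]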
Example #9
def broken_links_flow():
    return DF.Flow(
        *[
            DF.Flow(
                DF.load(URL_TEMPLATE.format(**c), name=c['name']),
                DF.add_field('__name',
                             'string',
                             c['name'],
                             resources=c['name']),
                DF.add_field('__title',
                             'string',
                             get_title(c['title']),
                             resources=c['name']),
            ) for c in configuration
        ],
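        # RE is a module-level compiled URL regex; unwind() and check_broken() are
        # helpers from the same module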
        DF.add_field('urls', 'array', lambda r: RE.findall(str(r))),
        DF.add_field('link', 'string',
                     lambda r: 'https://yodaat.org/item/{doc_id}'.format(**r)),
        DF.concatenate(
            dict(
                name=['__name'],
                title=['__title'],
                link=[],
                urls=[],
            )),
        DF.add_field('url', 'string'),
        DF.add_field('error', 'string'),
        unwind(),
        DF.delete_fields(['urls']),
        DF.parallelize(check_broken(), 4),
        DF.filter_rows(lambda r: r['error'] is not None),
    )
Example #10
def operator(name, params):
    connection_string = params['db_url']
    source_table = params['db_table']
    target_instance_name = params['target_instance_name']
    target_package_id = params['target_package_id']
    target_organization_id = params['target_organization_id']

    print('starting db_fetcher operator')
    print(
        'source_table={} target_instance_name={} target_package_id={} target_organization_id={}'
        .format(source_table, target_instance_name, target_package_id,
                target_organization_id))
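    # Load the table straight from the database and dump it to CSV in a temp dir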
    with tempfile.TemporaryDirectory() as tempdir:
        csv_filename = target_package_id + '.csv'
        DF.Flow(
            DF.load(connection_string,
                    table=source_table,
                    name=target_package_id,
                    infer_strategy=DF.load.INFER_PYTHON_TYPES),
            DF.update_resource(-1, path=csv_filename),
            DF.delete_fields(['_source']),
            DF.dump_to_path(tempdir)).process()
        csv_filename = os.path.join(tempdir, csv_filename)
        print('{}, {:,} bytes'.format(csv_filename,
                                      os.stat(csv_filename).st_size))
        update_package(target_instance_name, target_organization_id,
                       target_package_id, target_package_id,
                       [('CSV', csv_filename)])
Example #11
def test_delete_field():
    from dataflows import Flow, delete_fields
    # `data` is the same module-level fixture as in Example #1
    f = Flow(data, delete_fields(['x']))
    results, dp, _ = f.results()
    for i in results[0]:
        assert list(i.keys()) == ['y']
    assert dp.descriptor['resources'][0]['schema']['fields'] == \
        [dict(name='y', type='string', format='default')]
Example #12
def flow(*_):
    return DF.Flow(
        services(),
        DF.delete_fields(
            ['__tab', 'complete', 'non_suppliers', 'non_tenders', 'notes']),
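        # splitter(), floater(), fix_suppliers() etc. are helper steps from the original module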
        DF.add_field('publisher_name', 'string', lambda r: r['office'],
                     **{'es:keyword': True}),
        splitter('target_audience'),
        splitter('subject'),
        splitter('intervention'),
        splitter('target_age_group'),
        floater('beneficiaries'),
        floater('budgetItems'),
        floater('manualBudget'),
        floater('tenders'),
        floater('suppliers'),
        floater('virtue_of_table'),
        fix_suppliers(),
        fix_tenders(),
        add_current_budget(),
        add_current_beneficiaries(),
        DF.add_field('min_year', 'integer', 2020),
        DF.add_field('max_year', 'integer', CURRENT_YEAR),
        DF.add_field('kind', 'string', 'gov_social_service', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.add_field('kind_he', 'string', 'שירות חברתי', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.set_type('name', **{'es:title': True}),
        DF.set_type('description', **{
            'es:itemType': 'string',
            'es:boost': True
        }),
        DF.add_field('score', 'number', get_score,
                     **{'es:score-column': True}),
        DF.set_primary_key(['kind', 'id']),
        DF.update_resource(-1, name='activities', **{'dpp:streaming': True}),
        DF.dump_to_sql(dict(all_activities={'resource-name': 'activities'})),
        DF.filter_rows(lambda r: not r['deleted']),
        DF.delete_fields(['deleted']),
        DF.dump_to_path('/var/datapackages/activities/social_services'),
        DF.dump_to_sql(dict(activities={'resource-name': 'activities'})),
    )
Example #13
def proj():
    def func(row):
        row['lon'], row['lat'] = projector(row['X'], row['Y'], inverse=True)
    return DF.Flow(
        DF.add_field('lon', 'number'),
        DF.add_field('lat', 'number'),
        func,
        DF.delete_fields(['X', 'Y'])
    )
Example #14
def unwind_neighborhoods():
    def f(rows):
        for row in rows:
            for n in row['neighborhoods']:
                row['neighborhood'] = n
                yield row

    return DF.Flow(DF.add_field('neighborhood', 'string'), f,
                   DF.delete_fields(['neighborhoods']))
Example #15
def conference_csv():
    flow = Flow(
        # Load inputs
        load(
            od19_base + od19_feedback,
            name='feedback',
            format='csv',
        ),
        load(
            od19_base + od19_analysis,
            name='analysis',
            format='csv',
        ),
        # Process them
        set_type("Anzahl.*", type='integer', resources='analysis'),
        delete_fields([
            "Anzahl Auflistung",
            ".*\\(Formel\\)",
            ".*Duplikate",
        ],
                      resources='analysis'),
        not_empty_groupcol,
        # Save the results
        add_metadata(
            name='opendatach19',
            title='''Opendata.ch/2019 Forum''',
            licenses=[{
                "name": "ODC-PDDL-1.0",
                "path": "http://opendatacommons.org/licenses/pddl/",
                "title":
                "Open Data Commons Public Domain Dedication and License v1.0"
            }],
            maintainers=[{
                "name": "Oleg Lavrovsky",
                "web": "https://datalets.ch/"
            }],
            views=[{
                "name": "Groups",
                "resources": ["analysis"],
                "spec": {
                    "group": "Alle "
                    "Bedürfnisse"
                    "",
                    "series": ["Anzahl Auflistung (Zahl)"],
                    "type": "bar"
                },
                "specType": "simple",
                "title": "Topic counts"
            }]),
        printer(),
        validate(),
        dump_to_path('data/opendatach19'),
    )
    flow.process()
Example #16
def Elspot_Prices_Data():
    # field_metadata = get_metadata('c86859d2-942e-4029-aec1-32d56f1a2e5d')
    flow = Flow(
        # Load inputs - using the 'datastore_search_sql' API, load the last 100 rows:
        load(
            'https://api.energidataservice.dk/datastore_search_sql?sql=select%20*%20from%20elspotprices%20order%20by%20"HourUTC"%20desc%20limit%20100',
            format="json",
            property="result.records",
            name="fact_elspot_prices"),
        # Remove extra fields:
        delete_fields(fields=['_id', '_full_text', 'HourDK']),
        # Save the results
        checkpoint('load_data'),
        # Add product:
        add_computed_field([
            dict(target=dict(name='product', type='string'),
                 operation='constant',
                 with_='Elspot'),
            dict(target=dict(name='amount', type='number'),
                 operation='constant',
                 with_=1),
            dict(target=dict(name='PriceDKK', type='number'),
                 operation='constant',
                 with_=-1),
            dict(target=dict(name='PriceEUR', type='number'),
                 operation='constant',
                 with_=-1)
        ]),
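        # add_price is a helper step defined elsewhere in the original module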
        add_price,
        delete_fields(fields=['SpotPriceDKK', 'SpotPriceEUR']),
        add_metadata(name='marketdata', title='Marketdata prototype'),
        update_resource(resources=None, mediatype='text/csv'),
        update_resource(
            resources='fact_elspot_prices',
            title='Elspot Prices Data',
            source=
            'https://www.energidataservice.dk/dataset/elspotprices/resource_extract/c86859d2-942e-4029-aec1-32d56f1a2e5d'
        ),
        printer(),
        dump_to_path('elspot_prices_data'),
        # dump_to_sql(tables={'elspot': {'resource-name': 'Elspot_Prices_Data', 'mode': 'append'}}, engine='postgresql://*****:*****@localhost/cubes')
    )
    flow.process()
Example #17
def flow(parameters):
    resources = parameters.get('resources')
    regex = parameters.get('regex', True)
    return Flow(
        delete_fields(
            parameters.get('fields', []),
            resources=resources,
            regex=regex,
        )
    )
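
A minimal invocation sketch, assuming the usual dataflows imports (parameter values are hypothetical):

# Delete every field whose name starts with 'tmp_', in all resources
Flow(
    [{'tmp_a': 1, 'tmp_b': 2, 'keep': 3}],
    flow({'fields': ['tmp_.*'], 'regex': True}),
).results()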
Example #18
def flow(parameters, *args):
    return Flow(
        conditional(
            field_exists('quote'),
            Flow(delete_fields(["quote"], resources=parameters["resources"]))),
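        # field_exists/tryconvert are helpers from the original module; the lambda
        # recomputes 'quote' as value per 100 inhabitants, but only for 'sum' rows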
        add_computed_field(
            target=dict(name='quote', type='number'),
            operation=lambda row: (row["key"] == "sum") and round(
                tryconvert(row["value"], 0, int) / math.floor(
                    tryconvert(row["population"], 100, int) / 100), 2) or None,
            resources=parameters["resources"]))
Example #19
def flow(parameters):
    resources = parameters.get('resources')
    regex = parameters.get('regex', True)
    if 'types' in parameters:
        return Flow(*[
            set_type(name, resources=resources, regex=regex, **options) if
            options is not None else delete_fields([name], resources=resources)
            for name, options in parameters['types'].items()
        ])
    else:
        return Flow(validate())
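
An invocation sketch (field names hypothetical): an options value of None deletes the field, anything else is forwarded to set_type.

flow({'types': {
    'price': {'type': 'number'},  # cast 'price' to a number
    'obsolete': None,             # delete 'obsolete' entirely
}})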
Example #20
def flow(*_):
    return Flow(
        filter_by_type,
        rename_fields,
        add_fields(FIELDS, 'string'),
        add_fields(ADDITIONAL_FIELDS, 'string'),
        parse_document,
        delete_fields([
            'document', 'pdf', 'other', 'num_files', 'parser_version',
            'source', 's3_object_name'
        ]),
    )
Example #21
def flow(self):
    if self.config.get('source.raw_html'):
        return Flow(
            *[
                self.replace_field(original, cleaned)
                for original, cleaned in self.field_map.items()
            ],
            delete_fields(list(self.field_map.keys()),
                          resources=RESOURCE_NAME),
            add_field(self.URL_FIELD, 'string', resources=RESOURCE_NAME),
            self.clean_html_values(),
        )
Example #22
def flow(*_):
    return Flow(
        filter_by_type,
        rename_fields(RENAME_FIELDS),
        add_fields(FIELDS, 'string'),
        validate,
        parse_document,
        fix_fields(FIELDS),
        delete_fields([
            'document', 'pdf', 'other', 'num_files', 'parser_version',
            'source', 's3_object_name'
        ]),
    )
Example #23
def flow(*_):
    return Flow(
        update_resource(
            -1,
            name='maya_company_officer_list',
            path="data/maya_company_officer_list.csv",
        ),
        filter_by_type,
        rename_fields(RENAME_FIELDS),
        add_fields(FIELDS, 'string'),
        add_fields(OPTIONAL_FIELDS, 'string'),
        add_fields(TABLE_FIELDS, 'string'),
        validate,
        parse_document,
        fix_fields(TABLE_FIELDS),
        delete_fields([
            'document', 'pdf', 'other', 'num_files', 'parser_version',
            'source', 's3_object_name'
        ]),
    )
Example #24
def flow(*_):
    return DF.Flow(
        get_updated_sources(),
        DF.concatenate(fields=TENDER_MAPPING, target=dict(name='tenders')),
        DF.validate(),
        DF.filter_rows(lambda r: r['publication_id']),
        DF.add_field('tender_type', 'string',
                     lambda r: TENDER_KINDS[r['tender_type_he']],
                     **{'es:keyword': True}),
        DF.join_with_self(
            'tenders', KEY,
            dict((k, dict(aggregate='last'))
                 for k in list(TENDER_MAPPING.keys()) + ['tender_type'])),
        DF.set_type('publication_id', type='string', transform=str),
        DF.set_type('supplier_id', type='string', transform=str),
        DF.set_type('tender_id',
                    type='string',
                    transform=lambda v: v or 'none'),
        DF.set_type('.+_date',
                    type='date',
                    format='%d.%m.%Y',
                    on_error=DF.schema_validator.clear),
        DF.set_type('subjects',
                    type='string',
                    transform=lambda v: ';'.join(x.strip()
                                                 for x in v.split(','))
                    if v else ''),
        DF.set_type('claim_date',
                    type='datetime',
                    transform=lambda v, field_name, row: datetime.datetime.
                    combine(v, row['claim_time'] or datetime.time(0))
                    if v else None),
        DF.set_type('tender_type_he', **{'es:keyword': True}),
        DF.delete_fields(['claim_time']),
        DF.add_field(
            'page_url', 'string', lambda r:
            f'https://mr.gov.il/ilgstorefront/he/p/{r["publication_id"]}'),
        DF.add_field('page_title', 'string', lambda r: r['description']),
        DF.add_field('reason', 'string', lambda r: r['regulation']),
        DF.add_field('documents', 'array', []),
        DF.add_field('contact', 'string'),
        DF.add_field('contact_email', 'string'),
        DF.validate(),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        DF.printer(),
    )
Example #25
def test_delete_fields_regex():
    from dataflows import load, delete_fields
    flow = Flow(
        load('data/regex.csv'),
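        # regex=False: 'temperature (24h)' is matched literally, not as a regex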
        delete_fields(['temperature (24h)'], regex=False),
    )
    data = flow.results()[0]
    assert data == [[
        {
            'city': 'london'
        },
        {
            'city': 'paris'
        },
        {
            'city': 'rome'
        },
    ]]
Example #26
def conditional(self):
    new_fields = [
        x.replace(':', '-') for x in self.PROHIBITED_COLUMN_TYPES
    ]
    old_fields = [x.replace(':', '-') for x in self.REQUIRED_COLUMN_TYPES]
    return Flow(
        add_computed_field(
            [dict(
                target=f,
                operation='constant',
            ) for f in new_fields],
            resources=RESOURCE_NAME),
        self.work(),
        *[
            set_type(f, columnType=ct)
            for (f, ct) in zip(new_fields, self.PROHIBITED_COLUMN_TYPES)
        ],
        delete_fields(old_fields, resources=RESOURCE_NAME),
    )
Example #27
def flow(*_):
    return Flow(
        update_resource(
            -1,
            name='reported_work_record',
            path="data/reported_work_record.csv",
        ),
        filter_by_type,
        rename_fields(RENAME_FIELDS),
        add_fields(FIELDS, 'string'),
        add_fields(TABLE_FIELDS, 'string'),
        validate,
        parse_document,
        fix_fields(TABLE_FIELDS),
        delete_fields([
            'document', 'pdf', 'other', 'num_files', 'parser_version',
            'source', 's3_object_name', 'id', 'company', 'type', 'fix_for',
            'fixed_by', 'next_doc', 'prev_doc'
        ]),
    )
Example #28
def clean_data(filename: str, location: str) -> None:
    """Clean and validate data with `dataflows`, creating data packages in the
    process, one for each file."""
    global FILE_NAME
    FILE_NAME = f"{location}-{filename}"
    clean_directory, _, processing_directory = set_location_dirs(location)
    exported_file = f"{clean_directory}/{filename}"
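    # Flow.process() returns (datapackage, stats); index [1] keeps only the stats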
    _ = Flow(
        load(
            f"{processing_directory}/{filename}.csv",
            name=FILE_NAME,
        ),
        change_path,
        add_field("NameFIPS", "string"),
        concat_name_columns,
        delete_fields(["Name", "FIPS"]),
        set_type("Data", type="any"),
        validate(),
        dump_to_path(exported_file),
    ).process()[1]
Example #29
def flow(*_):
    return DF.Flow(
        DF.load(URL, format='json', property='jData', name='education'),
        # DF.checkpoint('education'),
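        # Map the raw API columns (right) to normalized field names (left);
        # extract_* and calculate_publication_id are helpers from the original module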
        DF.concatenate(dict(
            page_title=['Title'],
            start_date=['PobKKPublishingDate'],
            claim_date=['PobLastDate'],
            target_audience_x=['PobBudgetEntitties'],
            description=['PobTaktzir'],
            email=['PobPedagogyContactHtml'],
            publishing_unit_x=['PobYechida'],
            budget_code_x=['PobTakanaTaktzivitString'],
            att_title=['PobCreteriaLink_description'],
            att_url=['PobCreteriaLink_url'],
        ), resources=-1, target=dict(name='education')),
        enumerate_titles,
        DF.add_field('page_url', 'string', PAGE_URL, resources=-1),
        DF.add_field('publisher', 'string', 'משרד החינוך', resources=-1),
        DF.add_field('tender_type', 'string', 'call_for_bids', resources=-1),
        DF.add_field('tender_type_he', 'string', 'קול קורא', resources=-1),
        DF.add_field('publication_id', 'integer', 0, resources=-1),
        DF.add_field('tender_id', 'string', '0', resources=-1),
        DF.add_field('contact', 'string', lambda row: extract_hebrew(row, 'email'), resources=-1),
        DF.add_field('target_audience', 'string', lambda row: extract_hebrew(row, 'target_audience_x'), resources=-1),
        DF.add_field('contact_email', 'string', lambda row: extract_email(row, 'email'), resources=-1),
        DF.add_field('publishing_unit', 'string', lambda row: row['publishing_unit_x'][0]['PobYechida'], resources=-1),
        DF.add_field('budget_code', 'string', lambda row: extract_budget_code(row, 'budget_code_x'), resources=-1),
        DF.set_type('start_date', type='date', format='%d/%m/%Y %H:%M:%S'),
        DF.set_type('claim_date', type='datetime', format='%d/%m/%Y %H:%M:%S'),
        DF.add_field('documents', 'array',
                     lambda row: [dict(
                         description=row['att_title'],
                         link=row['att_url'],
                         update_time=str(row['start_date'])
                     )], resources=-1),
        DF.delete_fields(['email', 'publishing_unit_x', 'budget_code_x', 'att_title', 'att_url', 'target_audience_x'], resources=-1),
        calculate_publication_id(6),
        DF.update_resource(-1, **{'dpp:streaming': True})
    )
Example #30
def flow(*_):
    return DF.Flow(
        all_units(),
        DF.add_field('office', 'string', lambda r: r['path'][0]
                     if len(r['path']) > 0 else None, **{'es:keyword': True}),
        DF.add_field('unit', 'string', lambda r: r['path'][1]
                     if len(r['path']) > 1 else None, **{'es:keyword': True}),
        DF.add_field('subunit', 'string', lambda r: r['path'][2]
                     if len(r['path']) > 2 else None, **{'es:keyword': True}),
        DF.add_field('subsubunit', 'string', lambda r: r['path'][3]
                     if len(r['path']) > 3 else None, **{'es:keyword': True}),
        DF.add_field('breadcrumbs', 'string',
                     lambda r: '/'.join(r['path']) or 'משרדי הממשלה',
                     **{'es:exclude': True}),
        DF.add_field('id', 'string', lambda r: '__'.join(r['path']) or 'main',
                     **{'es:exclude': True}),
        DF.delete_fields([
            'path',
        ]),
        DF.add_field('min_year', 'integer', 2020),
        DF.add_field('max_year', 'integer', 2020),
        DF.add_field('kind', 'string', 'gov_social_service_unit', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.add_field('kind_he', 'string', 'שירותים חברתיים במיקור חוץ', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.add_field('score', 'number', 1000, **{'es:score-column': True}),
        DF.set_primary_key(['kind', 'id']),
        DF.update_resource(-1, name='units', **{'dpp:streaming': True}),

        # Ensure we only have the main offices
        DF.filter_rows(lambda r: r['unit'] is None),
        DF.filter_rows(lambda r: r['office'] != 'משרד העליה והקליטה'),
        DF.dump_to_path('/var/datapackages/units/social_services'),
        DF.dump_to_sql(dict(units={'resource-name': 'units'})))
Example #31
gold_price_flow = Flow(
    load(
        load_source='...',  # annual-series URL, truncated in the original snippet
        skip_rows=[1, 2, 3, 4, 5, -1],
        headers=['Date', 'Price', 'Empty column'],
        format='csv',
        name='annual'
    ),
    extract_december_rows,
    load(
        load_source='http://www.bundesbank.de/cae/servlet/StatisticDownload?tsId=BBEX3.M.XAU.USD.EA.AC.C06&its_csvFormat=en&its_fileFormat=csv&mode=its',
        skip_rows=[1, 2, 3, 4, 5, -1],
        headers=['Date', 'Price', 'Empty column'],
        format='csv',
        name='monthly'
    ),
    update_resource('monthly', **{'path':'data/monthly.csv', 'dpp:streaming': True}),
    update_resource('annual', **{'path':'data/annual.csv', 'dpp:streaming': True}),
    set_type('Date', resources='annual', type='yearmonth'),
    set_type('Price', resources='annual', type='number'),
    set_type('Date', resources='monthly', type='yearmonth'),
    set_type('Price', resources='monthly', type='number'),
    validate(),
    delete_fields(['Empty column'], resources=None)
)


def flow(parameters, datapackage, resources, stats):
    return gold_price_flow


if __name__ == '__main__':
    gold_price_flow.process()