Example #1
def judges_flow(out_path):
    return Flow(
        get_tribunals(),
        update_resource(['res_1'], name='tribunals', path='tribunals.csv'),
        checkpoint('judges_tribunals'),
        get_judges(),
        update_resource(['res_2'], name='judges_list', path='judges_list.csv'),
        set_type('Is_In_Dimus_List', resources=['judges_list'], type='boolean'),
        checkpoint('judges_judges_list'),
        join('tribunals', ['Tribunal_Code'],
             'judges_list', ['Tribunal_Code'],
             fields={
                 'Tribunal_Type_Code': {},
                 'Tribunal_Arkaa_Code': {'name': 'Arkaa_Code'},
                 'Tribunal_District_Code': {'name': 'District_Code'},
                 'Tribunal_Name': {'name': 'Name'},
             }),
        fetch_judges_details,
        checkpoint('judges_details'),
        add_field('tribunal_type_name', 'string'),
        parse_judges_extra_details,
        checkpoint('judges_extra_details'),
        parse_judge_events,
        dump_to_path(out_path),
        printer(num_rows=1),
    )
Example #2
def test_load_dates_timezones():
    from dataflows import Flow, checkpoint
    from datetime import datetime, timezone
    import shutil

    dates = [
        datetime.now(),
        datetime.now(timezone.utc).astimezone()
    ]

    shutil.rmtree('.checkpoints/test_load_dates_timezones', ignore_errors=True)

    Flow(
        [{'date': d.date(), 'datetime': d} for d in dates],
        checkpoint('test_load_dates_timezones')
    ).process()

    results = Flow(
        checkpoint('test_load_dates_timezones')
    ).results()

    assert list(map(lambda x: x['date'], results[0][0])) == \
        list(map(lambda x: x.date(), dates))
    assert list(map(lambda x: x['datetime'], results[0][0])) == dates
Example #3
def test_load_from_checkpoint():
    from dataflows import Flow, checkpoint
    import shutil

    shutil.rmtree('.checkpoints/test_load_from_checkpoint', ignore_errors=True)

    assert Flow([{
        'foo': 'bar'
    }], checkpoint('test_load_from_checkpoint')).process()

    assert Flow(checkpoint('test_load_from_checkpoint')).results()[0] == \
        [[{'foo': 'bar'}]]
Example #4
def prepare_locations():
    prepare_addresses()
    return DF.Flow(
        DF.load('_cache_addresses/datapackage.json'),
        DF.add_field(
            'address', 'string', lambda r: '{} {}{}'.format(
                r['street_name'], r['house_number'], r['letter'] or '')),
        DF.add_field(
            'item', 'object',
            lambda r: dict(value=dict(lat=float(r['lat']),
                                      lon=float(r['lon']),
                                      arnona_zones=r['arnona_zones'],
                                      שם=r['address']),
                           display=r['address'])),
        DF.sort_rows('{house_number}'),
        DF.delete_fields([
            'house_number', 'letter', 'lat', 'lon', 'arnona_zones', 'address'
        ]),
        DF.join_with_self(
            'concat', ['street_name'],
            dict(display=dict(name='street_name'),
                 items=dict(name='item', aggregate='array'))),
        DF.add_field('sort_street_address', 'string',
                     lambda r: sort_street_address(r['display'])),
        DF.sort_rows('{sort_street_address}'),
        DF.delete_fields(['sort_street_address']),
        DF.printer(),
        DF.dump_to_path('_cache_locations'),
        DF.checkpoint('_cache_locations')).results()[0][0]
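For orientation, a hedged toy illustration of what the join_with_self grouping above produces; the street name and item values are made up, not taken from the source data.

# Toy illustration of the join_with_self('concat', ['street_name'], ...) step
# above (made-up rows). Input rows in resource 'concat':
#   {'street_name': 'Herzl', 'item': {'value': {...}, 'display': 'Herzl 1'}}
#   {'street_name': 'Herzl', 'item': {'value': {...}, 'display': 'Herzl 2'}}
# Output row, grouped by 'street_name', with 'item' aggregated into an array:
#   {'display': 'Herzl', 'items': [<item for 'Herzl 1'>, <item for 'Herzl 2'>]}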
Example #5
def post_flow(phase, poster, tasks, config: Config, cache=False):
    if cache:
        config = config._unflatten()

        config_json = [config.get('source'), config.get('structure')]
        config_json = json.dumps(config_json, sort_keys=True)
        print(config_json[:64], len(config_json))
        checkpoint_name = hashlib.md5(config_json.encode('utf8')).hexdigest()

        if config.get('source'):
            path = config.get('source').get('path')
            if path:
                checkpoint_name += '_' + os.path.basename(path)

        cache = [checkpoint(checkpoint_name)]
    else:
        cache = []
    steps = [
        row_validator(phase, poster, tasks)
    ] + cache + [
        row_sender(phase, poster, tasks)
    ]
    return Flow(
        *steps
    )
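A minimal sketch of the cache-key derivation used above; the config values and path are illustrative, not taken from a real Config object.

# Hedged sketch of how post_flow builds its checkpoint name (illustrative values).
import hashlib
import json
import os

config_json = json.dumps([{'path': 'data/source.csv'}, {'fields': ['a', 'b']}], sort_keys=True)
checkpoint_name = hashlib.md5(config_json.encode('utf8')).hexdigest()
checkpoint_name += '_' + os.path.basename('data/source.csv')  # e.g. '<md5>_source.csv'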
Example #6
def run_data_count_flow():
    assert Flow(
        get_data_count_views(),
        checkpoint('test_checkpoint'),
    ).results()[0] == [[{'foo': 'bar'}]]
Example #7
def get_neighborhood_features():
    return DF.Flow(
        DF.load('neighborhoods.xlsx',
                name='stat-areas',
                deduplicate_headers=True),
        DF.add_field(
            'neighborhoods', 'array', lambda r:
            [v for k, v in r.items() if v and k.startswith('neighborhood')]),
        DF.add_field('geometry', 'object',
                     lambda r: geometries[r['stat-area']]),
        DF.concatenate(
            dict(stat_area=['stat-area'], neighborhoods=[], geometry=[])),
        DF.update_resource(-1, name='stat-areas'),
        unwind_neighborhoods(),
        DF.join_with_self(
            'stat-areas', ['neighborhood'],
            dict(
                neighborhood=None,
                stat_areas=dict(name='stat_area', aggregate='array'),
                geometries=dict(name='geometry', aggregate='array'),
            )),
        DF.add_field('geometry', 'object',
                     lambda r: unite_geometries(r['geometries'])),
        DF.delete_fields(['geometries']),
        DF.update_resource(-1, name='neighborhoods'),
        DF.add_field(
            'properties', 'object', lambda r: dict(
                x=3, title=r['neighborhood'], stat_areas=r['stat_areas'])),
        DF.delete_fields(['neighborhood', 'stat_areas']),
        DF.checkpoint('_cache_neighborhoods')).results()[0][0]
Example #8
def main_flow(prefix, operator):
    return Flow(
        cluster_info(operator),
        update_resource(['res_1'], name='cluster-info', path='cluster-info.csv'),
        checkpoint(f'{prefix}-cluster-info'),
        ckan_cloud_instances(operator),
        update_resource(['res_2'], name='ckan-cloud-instances', path='ckan-cloud-instances.csv'),
    )
Example #9
def dump_print_flow(flow,
                    dump_path,
                    num_rows=1,
                    fields=None,
                    checkpoint_name=None):
    return Flow(flow,
                checkpoint(checkpoint_name) if checkpoint_name else None,
                dump_to_path(dump_path),
                printer(num_rows=num_rows, fields=fields))
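The helper above appears to rely on Flow tolerating a None step when no checkpoint name is given; a minimal sketch of the same optional-checkpoint idea with an explicit step list, assuming the same dataflows API, could look like this:

# Hedged sketch: build the step list explicitly instead of passing None.
from dataflows import Flow, checkpoint, dump_to_path, printer

def dump_print_flow_explicit(flow, dump_path, num_rows=1, fields=None, checkpoint_name=None):
    steps = [flow]
    if checkpoint_name:  # cache intermediate rows only when a name is supplied
        steps.append(checkpoint(checkpoint_name))
    steps += [dump_to_path(dump_path), printer(num_rows=num_rows, fields=fields)]
    return Flow(*steps)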
Example #10
    def prepare(self):
        self.ref_hash = md5(self.REF_DATAPACKAGE.encode('utf8')).hexdigest()
        self.key = self.__class__.__name__

        check = checkpoint(self.ref_hash)
        if not check.exists():
            Flow(load(self.REF_DATAPACKAGE),
                 rename_last_resource(self.ref_hash),
                 dump_to_path('.cache/{}'.format(self.ref_hash)),
                 check).process()
        logger.debug('DONE PREPARING %s', self.key)
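The prepare() method above shows a reusable pattern: run the flow only if its checkpoint does not exist yet, then read from the checkpoint on later calls. A minimal sketch under the same API assumptions, with an illustrative checkpoint name and datapackage path, could look like this:

# Hedged sketch of the exists-then-load pattern (name and paths are illustrative).
from dataflows import Flow, load, checkpoint, dump_to_path

cache = checkpoint('reference_data')
if not cache.exists():
    Flow(load('reference/datapackage.json'),
         dump_to_path('.cache/reference_data'),
         cache).process()
rows = Flow(cache).results()[0][0]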
Example #11
def AFRR_Data():
    unpivoting_fields = [{
        'name': 'aFRR_DownActivated',
        'keys': {
            'product': 'aFRR_DownActivated'
        }
    }, {
        'name': 'aFRR_UpActivated',
        'keys': {
            'product': 'aFRR_UpActivated'
        }
    }]
    extra_keys = [{'name': 'product', 'type': 'string'}]
    extra_value = {'name': 'amount', 'type': 'number'}
    flow = Flow(
        # Load inputs - using 'datastore_search_sql' API load last 10k rows:
        load(
            'https://api.energidataservice.dk/datastore_search_sql?sql=select%20*%20from%20afrrreservesdk1%20order%20by%20"HourUTC"%20desc%20limit%201000',
            format="json",
            property="result.records",
            name="fact_afrr"),
        # Remove extra fields:
        delete_fields(fields=['_id', '_full_text', 'HourDK']),
        # Save the results
        checkpoint('afrr'),
        # Normalize/unpivot:
        unpivot(unpivoting_fields, extra_keys, extra_value),
        add_computed_field([
            dict(target=dict(name='PriceArea', type='string'),
                 operation='constant',
                 with_='DK1'),
            dict(target=dict(name='PriceDKK', type='number'),
                 operation='constant',
                 with_='dummy'),
            dict(target=dict(name='PriceEUR', type='number'),
                 operation='constant',
                 with_='dummy')
        ]),
        add_price,
        delete_fields(fields=[
            'aFRR_DownPriceDKK', 'aFRR_DownPriceEUR', 'aFRR_UpPriceDKK',
            'aFRR_UpPriceEUR'
        ]),
        add_metadata(name='marketdata', title='Marketdata prototype'),
        update_resource(resources=None, mediatype='text/csv'),
        update_resource(
            resources='fact_afrr',
            title='Automatic Frequency Restoration Reserves',
            source=
            'https://www.energidataservice.dk/dataset/afrrreservesdk1/resource_extract/0694e216-6713-4f84-9b98-7bb5bc11d80c'
        ),
        printer(),
        dump_to_path('afrr_data'))
    flow.process()
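For orientation, a hedged toy illustration of what the unpivot step above does to a single wide row; the values are made up.

# Toy illustration of unpivot(unpivoting_fields, extra_keys, extra_value) above
# (made-up values). One wide input row:
#   {'HourUTC': '2020-01-01T00:00', 'aFRR_DownActivated': 1.5, 'aFRR_UpActivated': 2.0}
# becomes one output row per matched column, with 'product' taken from 'keys'
# and the column value moved into 'amount':
#   {'HourUTC': '2020-01-01T00:00', 'product': 'aFRR_DownActivated', 'amount': 1.5}
#   {'HourUTC': '2020-01-01T00:00', 'product': 'aFRR_UpActivated', 'amount': 2.0}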
Example #12
def prepare_addresses():
    with tempfile.NamedTemporaryFile(suffix='.csv', mode='wb') as source:
        shutil.copyfileobj(fetch_ckan('addresses', 'CSV'), source)
        source.flush()
        DF.Flow(
            DF.load(source.name),
            DF.concatenate(
                dict(street_name=['streetName'],
                     house_number=['HouseNuber'],
                     letter=[],
                     lat=[],
                     lon=[])),
            match_arnona(),
            DF.dump_to_path('_cache_addresses'),
            DF.checkpoint('_cache_addresses')).process()
Example #13
def update_dataset():
    flow = Flow(
        # Load inputs
        load(f'{BASE_URL}{CONFIRMED}'),
        load(f'{BASE_URL}{RECOVERED}'),
        load(f'{BASE_URL}{DEATH}'),
        checkpoint('load_data'),
        # Process them (if necessary)
        # Save the results
        add_metadata(name='csse_covid_19_time_series', title='''csse_covid_19_time_series'''),
        printer(),
        dump_to_path(),
    )
    flow.process()
Example #14
def Elspot_Prices_Data():
    # field_metadata = get_metadata('c86859d2-942e-4029-aec1-32d56f1a2e5d')
    flow = Flow(
        # Load inputs - using 'datastore_search_sql' API load last 10k rows:
        load(
            'https://api.energidataservice.dk/datastore_search_sql?sql=select%20*%20from%20elspotprices%20order%20by%20"HourUTC"%20desc%20limit%20100',
            format="json",
            property="result.records",
            name="fact_elspot_prices"),
        # Remove extra fields:
        delete_fields(fields=['_id', '_full_text', 'HourDK']),
        # Save the results
        checkpoint('load_data'),
        # Add product:
        add_computed_field([
            dict(target=dict(name='product', type='string'),
                 operation='constant',
                 with_='Elspot'),
            dict(target=dict(name='amount', type='number'),
                 operation='constant',
                 with_=1),
            dict(target=dict(name='PriceDKK', type='number'),
                 operation='constant',
                 with_=-1),
            dict(target=dict(name='PriceEUR', type='number'),
                 operation='constant',
                 with_=-1)
        ]),
        add_price,
        delete_fields(fields=['SpotPriceDKK', 'SpotPriceEUR']),
        add_metadata(name='marketdata', title='Marketdata prototype'),
        update_resource(resources=None, mediatype='text/csv'),
        update_resource(
            resources='fact_elspot_prices',
            title='Elspot Prices Data',
            source=
            'https://www.energidataservice.dk/dataset/elspotprices/resource_extract/c86859d2-942e-4029-aec1-32d56f1a2e5d'
        ),
        printer(),
        dump_to_path('elspot_prices_data'),
        # dump_to_sql(tables={'elspot': {'resource-name': 'Elspot_Prices_Data', 'mode': 'append'}}, engine='postgresql://*****:*****@localhost/cubes')
    )
    flow.process()
Example #15
            continue
        if row.get("Country/Region"
                   ) == "Canada" and not row.get("Province/State"):
            row["Province/State"] = "Recovery aggregated"
            row["Lat"] = row.get("Lat", "56.1304")
            row["Long"] = row.get("Long", "-106.3468")
        yield {**expected, **row}


Flow(
    load(f"{BASE_URL}{CONFIRMED}"),
    load(f"{BASE_URL}{RECOVERED}"),
    load(f"{BASE_URL}{DEATH}"),
    load(f"{BASE_URL}{CONFIRMED_US}"),
    load(f"{BASE_URL}{DEATH_US}"),
    checkpoint("load_data"),
    unpivot(unpivoting_fields, extra_keys, extra_value),
    find_replace([{
        "name": "Date",
        "patterns": [{
            "find": "/",
            "replace": "-"
        }]
    }]),
    to_normal_date,
    set_type("Date", type="date", format="%d-%m-%y", resources=None),
    set_type("Case", type="number", resources=None),
    join(
        source_name="time_series_covid19_confirmed_global",
        source_key=["Province/State", "Country/Region", "Date"],
        source_delete=True,
Example #16
def kubectl_get_volumes_flow(source_resource_name='kubectl_get_all',
                             resource_name='kubectl_get_volumes',
                             get_all_checkpoint_name=None):

    volume_object_fields = [
        'hostPath', 'secret', 'configMap', 'emptyDir', 'gcePersistentDisk',
        'nfs'
    ]

    def get_volumes(rows):
        for row in rows:
            volumes = row.get('volumes')
            for volume in (volumes if volumes else []):
                yield {
                    'name': volume.pop('name'),
                    'source_name': row['name'],
                    'source_kind': row['kind'],
                    'source_namespace': row['namespace'],
                    **{
                        field: volume.pop(field, None)
                        for field in volume_object_fields
                    },
                }
                assert len(volume) == 0, volume

    def add_volumes(package):
        package.pkg.remove_resource(source_resource_name)
        package.pkg.add_resource({
            'name': resource_name,
            'path': f'{resource_name}.csv',
            'schema': {
                'fields': [
                    {
                        'name': 'name',
                        'type': 'string'
                    },
                    {
                        'name': 'source_kind',
                        'type': 'string'
                    },
                    {
                        'name': 'source_name',
                        'type': 'string'
                    },
                    {
                        'name': 'source_namespace',
                        'type': 'string'
                    },
                    *[{
                        'name': field,
                        'type': 'object'
                    } for field in volume_object_fields],
                ]
            }
        })
        yield package.pkg
        for rows in package:
            if rows.res.name == source_resource_name:
                yield get_volumes(rows)

    def filter_volumes(rows):
        if rows.res.name == resource_name:
            for row in rows:
                if row['source_namespace'] == 'kube-system': continue
                if any((row.get(f) or row.get(f) == {})
                       for f in ['secret', 'configMap', 'emptyDir']):
                    continue
                assert row.get('nfs', None) or row.get('gcePersistentDisk',
                                                       None), row
                yield row
        else:
            yield from rows

    return Flow(
        kubectl_get_all_flow(),
        checkpoint(get_all_checkpoint_name) if get_all_checkpoint_name else None,
        add_volumes,
        filter_volumes,
    )
Example #17
unpivoting_fields = [{
    'name': r'([0-9]+/[0-9]+/[0-9]+)',
    'keys': {
        'Date': r'\1'
    }
}]

extra_keys = [{'name': 'Date', 'type': 'string'}]
extra_value = {'name': 'Case', 'type': 'number'}

Flow(
    load(f'{BASE_URL}{CONFIRMED}'),
    load(f'{BASE_URL}{RECOVERED}'),
    load(f'{BASE_URL}{DEATH}'),
    checkpoint('load_data'),
    unpivot(unpivoting_fields, extra_keys, extra_value),
    find_replace([{
        'name': 'Date',
        'patterns': [{
            'find': '/',
            'replace': '-'
        }]
    }]),
    to_normal_date,
    set_type('Date', type='date', format='%d-%m-%y', resources=None),
    set_type('Case', type='number', resources=None),
    join(source_name='time_series_19-covid-Confirmed',
         source_key=['Province/State', 'Country/Region', 'Date'],
         source_delete=True,
         target_name='time_series_19-covid-Deaths',
Example #18
             "title": "Population Growth - World Projections (High Fertility)"
         }
     ],
     readme=readme()
 ),
 load(source_url,format='xlsx',sheet='ESTIMATES',headers=17),
 load(source_url,format='xlsx',sheet='LOW VARIANT',headers=17),
 load(source_url,format='xlsx',sheet='MEDIUM VARIANT',headers=17),
 load(source_url,format='xlsx',sheet='HIGH VARIANT',headers=17),
 load(source_url,format='xlsx',sheet='CONSTANT-FERTILITY',headers=17),
 load(source_url,format='xlsx',sheet='CONSTANT-MORTALITY',headers=17),
 load(source_url,format='xlsx',sheet='INSTANT-REPLACEMENT',headers=17),
 load(source_url,format='xlsx',sheet='MOMENTUM',headers=17),
 load(source_url,format='xlsx',sheet='ZERO-MIGRATION',headers=17),
 load(source_url,format='xlsx',sheet='NO CHANGE',headers=17),
 checkpoint('loaded'),
 delete_fields(fields=['Index', 'Variant', 'Notes']),
 rename_resources,
 unpivot(
     unpivot_fields=[{'name': '([0-9]{4})', 'keys': {'year': '\\1'}}],
     extra_keys=[{'name': 'year', 'type': 'year'}],
     extra_value={'name': 'population', 'type': 'number'},
     resources='population-estimates'
 ),
 unpivot(
     unpivot_fields=[{'name': '([0-9]{4})', 'keys': {'year': '\\1'}}],
     extra_keys=[{'name': 'year', 'type': 'year'}],
     extra_value={'name': 'population', 'type': 'number'},
     resources=resource_names[1:]
 ),
 add_computed_field([
Example #19
        if dry_run:
            print('dry run - {} -- {}'.format(file_page_title, page_text))
        else:
            print('uploading {}'.format(file_page_title))
            site = pywikibot.Site()
            site.login()
            page = pywikibot.FilePage(site, file_page_title)
            assert page.site.family == 'commons', 'invalid page site: {}'.format(page.site)
            with throttle():
                if not page.exists():
                    page.text = page_text
                    with tempfile.NamedTemporaryFile() as f:
                        f.write(requests.get(row['image']).content)
                        if page.upload(f.name, comment="uploaded by wmilbot", ignore_warnings=True):
                            print("----- {} uploaded successfully".format(row['image']))
                        else:
                            raise Exception("Upload failed")
                else:
                    page.get()
                    page.text = page_text
                    page.save(summary='update by wmilbot')
                    print('----- {} updated successfully'.format(row['image']))


Flow(
    checkpoint('scraped-site-filtered-years-album-images', checkpoint_path='btm/.checkpoints'),
    add_field('year', 'year'),
    get_years,
    upload
).process()
Example #20
            row.get('Province/State') == 'Recovered' and not \
            row.get('Recovered'):
            continue
        if row.get('Country/Region'
                   ) == 'Canada' and not row.get('Province/State'):
            row['Province/State'] = 'Recovery aggregated'
            row['Lat'] = row.get('Lat', '56.1304')
            row['Long'] = row.get('Long', '-106.3468')
        yield {**expected, **row}


Flow(
    load(f'{BASE_URL}{CONFIRMED}'),
    load(f'{BASE_URL}{RECOVERED}'),
    load(f'{BASE_URL}{DEATH}'),
    checkpoint('load_data'),
    unpivot(unpivoting_fields, extra_keys, extra_value),
    find_replace([{
        'name': 'Date',
        'patterns': [{
            'find': '/',
            'replace': '-'
        }]
    }]),
    to_normal_date,
    set_type('Date', type='date', format='%d-%m-%y', resources=None),
    set_type('Case', type='number', resources=None),
    join(source_name='time_series_covid19_confirmed_global',
         source_key=['Province/State', 'Country/Region', 'Date'],
         source_delete=True,
         target_name='time_series_covid19_deaths_global',
Example #21
def dump_print_flow(flow, dump_path, checkpoint_name=None, **kwargs):
    return Flow(flow,
                checkpoint(checkpoint_name) if checkpoint_name else None,
                dump_to_path(dump_path), printer(**kwargs))
Example #22
def parse_dockerfiles():
    gitlab_repos = {}

    def _parse_gitlab_repos(rows):
        if rows.res.name == 'ckan-cloud-instances':
            for row in rows:
                gitlab_repo = row['gitlab_repo']
                if gitlab_repo in gitlab_repos:
                    gitlab_repos[gitlab_repo]['instances'].append(row)
                else:
                    gitlab_repos[gitlab_repo] = {'instances': [row]}
                yield row
        else:
            yield from rows

    def _get_dockerfile_from(dockerfile):
        if dockerfile:
            return [
                line.replace('FROM ', '') for line in dockerfile.split('\n')
                if line.startswith('FROM')
            ][0]
        else:
            return None

    def _parse_ckan_extensions(rows):
        if rows.res.name == 'dockerfiles':
            for row in rows:
                row['ckan_exts'] = []
                if row['dockerfile']:
                    for line in row['dockerfile'].split('\n'):
                        if 'https://github.com/' in line and '.git@' in line and '#egg=' in line:
                            ext = line.split('https://github.com/')[1].split(
                                '#egg=')[0].replace('.git@', '@')
                            row['ckan_exts'].append(ext)
                            if 'ckanext-s3filestore' in ext:
                                row['ckanext-s3filestore'] = ext
                yield row
        else:
            yield from rows

    def _get_dockerfile_row(gitlab_repo_name, gitlab_repo):
        try:
            dockerfile = CkanGitlab()._get_file(gitlab_repo_name, 'Dockerfile')
        except Exception:
            dockerfile = None
        return {
            'gitlab_repo': gitlab_repo_name,
            'instances': [i['name'] for i in gitlab_repo['instances']],
            'from': _get_dockerfile_from(dockerfile),
            'dockerfile': dockerfile
        }

    def _parse_dockerfiles(package):
        package.pkg.add_resource({
            'name': 'dockerfiles',
            'path': 'dockerfiles.csv',
            'schema': {
                'fields': [{
                    'name': 'gitlab_repo',
                    'type': 'string'
                }, {
                    'name': 'instances',
                    'type': 'array'
                }, {
                    'name': 'from',
                    'type': 'string'
                }, {
                    'name': 'dockerfile',
                    'type': 'string'
                }]
            }
        })
        yield package.pkg
        yield from package
        yield (_get_dockerfile_row(gitlab_repo_name, gitlab_repo)
               for gitlab_repo_name, gitlab_repo in gitlab_repos.items())

    return Flow(
        _parse_gitlab_repos,
        _parse_dockerfiles,
        checkpoint('ckan_images_dockerfiles'),
        add_field('ckan_exts', 'array'),
        add_field('ckanext-s3filestore', 'string'),
        _parse_ckan_extensions,
    )