Пример #1
0
    def func(rows):
        """Yield welfare-ministry rows whose catalog number had to be updated.

        The catalog spreadsheet is loaded once and indexed by service name.
        For every incoming row whose ``value['office']`` is the welfare
        ministry, the matching catalog entry is looked up by
        ``value['name']``; when the stored ``catalog_number`` differs from
        the catalog's, it is overwritten in place and the row is yielded.

        Catalog entries are consumed on first match, so a second row with
        the same name is reported as missing. Names without a catalog entry
        are printed (with a link built from ``value['id']``) once all rows
        have been consumed.
        """
        filename = 'קטלוג רווחה למערכת ההזנה 18.5.21.xlsx'
        catalog_rows = DF.Flow(
            DF.load(filename, name='welfare'),
            DF.rename_fields(
                {'id': 'catalog_number', 'שם השירות (ציבורי)': 'name'},
                regex=False),
            DF.select_fields(['catalog_number', 'name']),
        ).results()[0][0]

        # Index the remaining fields of each catalog record by service name.
        catalog = {}
        for record in catalog_rows:
            catalog[record.pop('name')] = record

        unmatched = []
        for row in rows:
            value = row['value']
            if value['office'] != 'משרד הרווחה':
                continue
            name = value['name']
            if name not in catalog:
                unmatched.append((name, value['id']))
                continue
            # pop() removes the entry, so each catalog record matches once.
            catalog_number = str(catalog.pop(name)['catalog_number'])
            if value.get('catalog_number') != catalog_number:
                value['catalog_number'] = catalog_number
                yield row

        link = '{} (https://data-input.obudget.org/he/datarecords/social_service/{})'
        for entry in unmatched:
            print(link.format(*entry))
Пример #2
0
def flow(*_):
    """Build the welfare activities datapackage and return a flow reading it.

    A first flow loads ``filename`` (defined elsewhere in this module),
    derives the activity fields, keeps only ``FIELDS``, adds constant
    publisher/year/kind columns, and dumps the result to
    ``tmp/activities-welfare``. That flow is processed eagerly. The returned
    flow re-loads the dumped datapackage and marks its last resource as
    streaming.
    """
    unit_column = 'יחידה ארגונית נותנת השירות'

    def describe(r):
        # Single-element list: short description and purpose joined by '\n'.
        return [
            r['תיאור השירות (תיאור קצר)'] + '\n' + r['השירות (מטרת השירות)']
        ]

    def history(r):
        # The unit column holds a '/'-separated path.
        parts = r[unit_column].split('/')
        unit = parts[0].strip()
        subunit = parts[1].strip()
        # NOTE(review): subsubunit is taken from path segment [1], the same
        # segment as subunit -- possibly intended to be [2]; confirm.
        subsubunit = parts[1].strip()
        return [
            dict(year=2019, unit=unit, subunit=subunit, subsubunit=subsubunit)
        ]

    steps = [
        DF.load(filename, name='welfare'),
        DF.add_field('activity_name', 'string',
                     lambda r: r['שם השירות (ציבורי)']),
        # Drop rows that have no activity name.
        DF.filter_rows(lambda r: r['activity_name']),
        DF.add_field('activity_description', 'array', describe),
        DF.add_field('history', 'array', history),
        DF.add_field('target_audience', 'array', splitter('אוכלוסייה')),
        DF.add_field('subject', 'array', splitter('תחום ההתערבות')),
        DF.add_field('intervention', 'array', splitter('אופן התערבות')),
        DF.select_fields(FIELDS),
        DF.add_field('publisher_name', 'string', 'משרד הרווחה'),
        DF.add_field('min_year', 'integer', 2019),
        DF.add_field('max_year', 'integer', 2019),
        DF.add_field('kind', 'string', 'gov_social_service'),
        DF.add_field('kind_he', 'string', 'שירות חברתי'),
        DF.printer(),
        DF.validate(),
        DF.dump_to_path('tmp/activities-welfare'),
    ]
    DF.Flow(*steps).process()

    return DF.Flow(
        DF.load('tmp/activities-welfare/datapackage.json'),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
Пример #3
0
def test_select_field():
    """Check that ``select_fields(['y'])`` leaves only ``y`` in rows and schema."""
    from dataflows import select_fields
    rows_by_resource, package, _ = Flow(data, select_fields(['y'])).results()

    # Every row of the first resource must expose exactly the 'y' key.
    for row in rows_by_resource[0]:
        assert list(row.keys()) == ['y']

    # The schema of the first resource must list the single 'y' field.
    schema = package.descriptor['resources'][0]['schema']
    expected_fields = [dict(name='y', type='string', format='default')]
    assert schema['fields'] == expected_fields
def flow(*_):
    """Return a flow that dumps supplier rows derived from social services.

    The flow loads the social-services datapackage, declares the
    ``entity_id`` and ``soproc_supplier`` fields, runs ``unwind()`` (defined
    elsewhere in this module), keeps only those two fields and dumps the
    result to the ``social_services_suppliers`` datapackage path.
    """
    source = '/var/datapackages/activities/social_services/datapackage.json'
    target = '/var/datapackages/activities/social_services_suppliers'
    # (name, type) of each field declared before unwind() runs.
    declared = (
        ('entity_id', 'string'),
        ('soproc_supplier', 'boolean'),
    )
    return DF.Flow(
        DF.load(source),
        *(DF.add_field(name, kind) for name, kind in declared),
        unwind(),
        DF.select_fields([name for name, _kind in declared]),
        DF.dump_to_path(target),
    )
Пример #5
0
 def build_school_cache(self):
     """Return a dict mapping each ``SEMEL_MOSA`` to its ``(X, Y)`` pair.

     The schools spreadsheet is loaded with every column read as a string;
     only ``SEMEL_MOSA``, ``X`` and ``Y`` are kept, and ``X``/``Y`` are then
     typed as numbers. If a ``SEMEL_MOSA`` value repeats, the last row wins.
     """
     source_url = 'https://datacity-source-files.fra1.digitaloceanspaces.com/69-AlQasum/Education/Schools.xlsx'
     pipeline = Flow(
         load(
             source_url,
             headers=1,
             infer_strategy=load.INFER_STRINGS,
             cast_strategy=load.CAST_TO_STRINGS),
         select_fields(['SEMEL_MOSA', 'X', 'Y']),
         set_type('SEMEL_MOSA', type='string'),
         set_type('X', type='number'),
         set_type('Y', type='number'),
     )
     resources, _, _ = pipeline.results()
     return {
         school['SEMEL_MOSA']: (school['X'], school['Y'])
         for school in resources[0]
     }
Пример #6
0
def flow(*_):
    """Return a flow that dumps tender rows derived from social services.

    The flow loads the social-services datapackage, declares the tender
    fields listed below, runs ``unwind()`` (defined elsewhere in this
    module), keeps only those fields and dumps the result to the
    ``social_services_tenders`` datapackage path.
    """
    string_fields = ['tender_id', 'publication_id', 'tender_type', 'tender_key']
    flag_field = 'soproc_tender'

    steps = [
        DF.load('/var/datapackages/activities/social_services/datapackage.json'),
    ]
    for field_name in string_fields:
        steps.append(DF.add_field(field_name, 'string'))
    steps.append(DF.add_field(flag_field, 'boolean'))
    steps.append(unwind())
    steps.append(DF.select_fields(string_fields + [flag_field]))
    steps.append(
        DF.dump_to_path('/var/datapackages/activities/social_services_tenders'))
    return DF.Flow(*steps)
Пример #7
0
def flow(*_):
    """Write the sitemap index file and return the sitemap-building flow.

    ``data/sitemap.xml`` receives one ``<sitemap>`` entry per (kind,
    language) combination. The returned flow chains the three per-language
    flows with ``registerSiteMaps`` (both defined elsewhere in this module),
    keeps only the ``url`` field, marks the resources as streaming and
    prints them.
    """
    kinds = ('publications', 'orgs', 'datasets', 'tags')
    langs = ('hebrew', 'english', 'arabic')
    entry = '<sitemap><loc>https://api.yodaat.org/data/sitemap.{}-{}.xml</loc></sitemap>\n'

    with open('data/sitemap.xml', 'w') as index:
        index.write(
            '<?xml version="1.0" encoding="UTF-8"?>\n'
            '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n')
        # Kinds vary slowest, languages fastest.
        index.writelines(
            entry.format(kind, lang) for kind in kinds for lang in langs)
        index.write('</sitemapindex>')

    language_flows = [
        lang_flow('hebrew', ''),
        lang_flow('english', 'en/'),
        lang_flow('arabic', 'ar/'),
    ]
    return DF.Flow(
        *language_flows,
        registerSiteMaps,
        DF.select_fields(['url']),
        DF.update_resource(None, **{'dpp:streaming': True}),
        DF.printer()
    )
Пример #8
0
         fields=dict(Recovered={
             'name': 'Case',
             'aggregate': 'first'
         })),
    add_computed_field(target={
        'name': 'Deaths',
        'type': 'number'
    },
                       operation='format',
                       with_='{Case}'),
    add_computed_field(target={
        'name': 'Country',
        'type': 'string'
    },
                       operation='format',
                       with_='{Country/Region}'),
    add_computed_field(target={
        'name': 'Province',
        'type': 'string'
    },
                       operation='format',
                       with_='{Province/State}'),
    delete_fields(['Case', 'Country/Region', 'Province/State']),
    update_resource('time_series_19-covid-Deaths',
                    name='time-series-19-covid-combined',
                    path='time-series-19-covid-combined.csv'),
    select_fields([
        'Province', 'Country', 'Lat', 'Long', 'Date', 'Confirmed', 'Recovered',
        'Deaths'
    ]), dump_to_path()).results()[0]
Пример #9
0
            prop['type'] = 'keyword'
        elif schema_type in ('number', 'integer'):
            prop['index'] = True
        return prop


if __name__ == '__main__':
    # Sample run: load the jobs CSV from the zip archive, give each row a
    # document id and a score, dump to the 'jobs-job' index and to disk,
    # then fold the remaining columns into a single 'value' object and dump
    # the (doc_id, value) pairs to the 'jobs-document' index.
    DF.Flow(
        DF.load(
            'new-york-city-current-job-postings.zip',
            filename='nyc-jobs.csv',
            name='jobs'),
        DF.add_field(
            'doc_id', 'string',
            default=lambda row: 'job/{Job ID}'.format(**row)),
        DF.add_field('score', 'integer', default=1),
        DF.set_type('Salary Frequency', **{'es:keyword': True}),
        DF.set_primary_key(['doc_id']),
        dump_to_es(
            indexes={'jobs-job': [{'resource-name': 'jobs'}]},
            mapper_cls=SampleMappingGenerator),
        DF.dump_to_path('data'),
        # 'value' holds every column except the id and score.
        DF.add_field(
            'value', 'object',
            default=lambda row: {
                key: val
                for key, val in row.items()
                if key not in ('doc_id', 'score')
            },
            **{'es:index': False}),
        DF.select_fields(['doc_id', 'value']),
        dump_to_es(indexes={'jobs-document': [{'resource-name': 'jobs'}]}),
        DF.printer(fields=['doc_id']),
    ).process()
def Olap_Datapackage():
    """Build the 'olap_datapackage' output: one fact table plus dimensions.

    Three source datapackages are loaded and concatenated into a ``fact``
    resource, which gets an integer ``id`` primary key (filled by ``add_id``)
    and foreign keys (``add_foreign_keys``); both helpers are defined
    elsewhere in this module. The fact resource is then duplicated three
    times to derive the ``time``, ``area`` and ``product`` resources, each
    reduced to its key column and de-duplicated with ``join_self``. The
    ``time`` resource additionally gets day/month/month_name/year fields
    parsed from ``Timestamp``. The flow is processed; nothing is returned.
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%S+00:00'

    def timestamp_part(directive):
        # Row operation: parse 'Timestamp' and re-format it with *directive*.
        def extract(row):
            parsed = datetime.strptime(row['Timestamp'], timestamp_format)
            return parsed.strftime(directive)
        return extract

    def dimension(name, column):
        # Steps that copy 'fact' into *name* and keep distinct *column* values.
        return (
            duplicate(source='fact',
                      target_name=name,
                      target_path=name + '.csv'),
            select_fields([column], resources=[name]),
            join_self(source_name=name,
                      source_key=[column],
                      target_name=name,
                      fields={column: {}}),
        )

    fact_columns = {
        'Timestamp': ['HourUTC'],
        'Area': ['PriceArea'],
        'Product': ['product'],
        'Amount': ['amount'],
        'Price_DKK': ['PriceDKK'],
        'Price_EUR': ['PriceEUR'],
    }

    pipeline = Flow(
        # Source datapackages.
        load('elspot_prices_data/datapackage.json'),
        load('afrr_data/datapackage.json'),
        load('fcr_dk1_data/datapackage.json'),
        concatenate(fields=fact_columns,
                    target={'name': 'fact', 'path': 'data/fact.csv'}),
        # Placeholder 'id' column; add_id and set_type follow.
        add_computed_field(
            [dict(target='id', operation='constant', with_='dummy')]),
        add_id,
        set_type('id', type='integer'),
        set_primary_key(primary_key=['id']),
        # Put the 'id' column first.
        select_fields(
            ['id', 'Timestamp', 'Area', 'Product', 'Amount', 'Price_DKK',
             'Price_EUR'],
            resources='fact'),
        add_foreign_keys,
        # Dimension 'time', with calendar fields parsed from the timestamp.
        *dimension('time', 'Timestamp'),
        add_computed_field(
            [
                dict(target=dict(name='day', type='string'),
                     operation=timestamp_part('%d')),
                dict(target=dict(name='month', type='string'),
                     operation=timestamp_part('%m')),
                dict(target=dict(name='month_name', type='string'),
                     operation=timestamp_part('%B')),
                dict(target=dict(name='year', type='year'),
                     operation=timestamp_part('%Y')),
            ],
            resources=['time']),
        set_primary_key(primary_key=['Timestamp'], resources=['time']),
        # Dimension 'area'.
        *dimension('area', 'Area'),
        set_primary_key(primary_key=['Area'], resources=['area']),
        # Dimension 'product'.
        *dimension('product', 'Product'),
        set_primary_key(primary_key=['Product'], resources=['product']),
        dump_to_path('olap_datapackage'),
    )
    pipeline.process()
Пример #11
0
        if row.get(k)
    )
    values = list(set(values))
    return values


# Per-source translation rows, keyed by the names found in `sources`.
translations = {}
for source, gid in sources.items():
    # `URL` is a format template defined elsewhere; `gid` fills its slot.
    url = URL.format(gid)
    # Load the source, apply clean_row, attach the list produced by
    # extract_values as 'values', drop rows with a falsy 'hebrew' field and
    # keep only the LANGS columns plus 'values'. `results()[0][0]` is the
    # list of rows of the first resource.
    translations[source] = DF.Flow(
        DF.load(url),
        clean_row,
        DF.add_field('values', 'array',
                     default=extract_values),
        DF.filter_rows(lambda row: row['hebrew']),
        DF.select_fields(list(LANGS) + ['values'])
    ).results()[0][0]
    # tx maps each cleaned value to the last row that listed it;
    # complained collects cleaned values seen with two different rows.
    tx = {}
    complained = set()
    for row in translations[source]:
        v = row.get('values')
        if not v:
            continue
        for vv in v:
            vv = clean(vv)
            # A conflict: the value is already mapped to a different row.
            if tx.get(vv) not in (None, row):
                if vv not in complained:
                    complained.add(vv)
            # Last row wins, whether or not there was a conflict.
            tx[vv] = row
    # Print the source name as a header when any conflict was recorded.
    # NOTE(review): the visible code prints only this header; any listing of
    # the conflicting values would be outside this excerpt -- confirm.
    if len(complained) > 0:
        print('{}:'.format(source))
Пример #12
0
import dataflows as DF
import glob

# Print the orgs (name and entity id) that have no matching file: each path
# matched by '*png', minus its last four characters, is joined against the
# orgs' entity_id, and only org rows left without a filename are kept.
x = DF.Flow(
    (dict(filename=path[:-4]) for path in glob.glob('*png')),
    DF.update_resource(-1, name='files'),
    DF.load(
        'http://api.yodaat.org/data/orgs_in_es/data/orgs.csv',
        name='orgs',
    ),
    DF.join(
        'files', '{filename}',
        'orgs', '{entity_id}',
        dict(filename={}),
        full=True,
        source_delete=True,
    ),
    # After the join, a missing filename means no file matched this org.
    DF.filter_rows(lambda org: org['filename'] is None),
    DF.select_fields(['org_name', 'entity_id']),
    DF.printer(),
).process()