# Example #1
def test_basic_flow_no_mapping_type():
    """End-to-end check: dump 1000 rows into Elasticsearch without an
    explicit mapping type, read them back, and compare with the input."""

    rows = [{'key': 'key%04d' % i, 'value': i} for i in range(1000)]

    es_host = 'localhost:9200'

    DF.Flow(
        rows,
        DF.update_resource(-1, name='data'),
        DF.set_primary_key(['key']),
        dump_to_es(
            engine=es_host,
            indexes={
                'test_basic_flow_no_mapping_type': [
                    {'resource_name': 'data'},
                ],
            },
        ),
    ).process()

    # Give Elasticsearch a moment to make the documents searchable.
    time.sleep(1)
    storage = Storage(Elasticsearch(hosts=[es_host]))
    fetched = list(storage.read('test_basic_flow_no_mapping_type'))
    assert rows == sorted(fetched, key=lambda r: r['key'])
def flow(*_):
    """Download the Israeli company registry CSV via a headless Chrome
    driver and build the 'company-details' resource: normalised columns,
    typed fields, and a primary key on 'id'.

    NOTE(review): the google_chrome_driver instance is never shut down
    here — confirm whether the caller (or gcd itself) releases the
    browser process.
    """
    gcd = google_chrome_driver()
    download = gcd.download(
        'https://data.gov.il/dataset/246d949c-a253-4811-8a11-41a137d3d613/resource/f004176c-b85f-4542-8901-7b3176f9a054/download/f004176c-b85f-4542-8901-7b3176f9a054.csv'
    )
    return Flow(
        # Load everything as strings; types are applied explicitly below.
        load(download, cast_strategy=load.CAST_TO_STRINGS),
        # Map source columns onto the canonical schema, under a single
        # 'company-details' resource.
        concatenate(_get_columns_mapping_dict(),
                    target=dict(name='company-details')),
        set_type('id', type='string'),
        set_type('company_registration_date', type='date', format='%d/%m/%Y'),
        # Hebrew yes/no values mapped to booleans ('לא' = no, 'כן' = yes).
        set_type('company_is_government',
                 type='boolean',
                 falseValues=['לא'],
                 trueValues=['כן']),
        # 'מפרה'/'התראה' (violating/warning) both count as True.
        set_type('company_is_mafera',
                 type='boolean',
                 falseValues=['לא'],
                 trueValues=['מפרה', 'התראה']),
        set_type('company_last_report_year', type='integer'),
        clear_bool_values,
        update_resource(**{'dpp:streaming': True},
                        resources='company-details'),
        set_primary_key(['id'], resources='company-details'),
        printer(),
    )
def flow(*_):
    """Company-details pipeline: pull the companies dataset, clean values,
    normalise columns and types, and stream the keyed result."""
    print('reading companies...')
    steps = [
        data_gov_il_resource.flow(companies),
        fix_values(),
        # Map source columns onto the canonical 'company-details' schema.
        concatenate(_get_columns_mapping_dict(),
                    target={'name': 'company-details'}),
        set_type('id', type='string'),
        set_type('company_street_number', type='string'),
        set_type('company_registration_date', type='date', format='%d/%m/%Y'),
        # Hebrew yes/no values mapped to booleans.
        set_type('company_is_government', type='boolean',
                 falseValues=['לא'], trueValues=['כן']),
        set_type('company_is_mafera', type='boolean',
                 falseValues=['לא'], trueValues=['מפרה', 'התראה']),
        set_type('company_last_report_year', type='integer'),
        set_type('company_postal_code', type='string'),
        clear_bool_values,
        update_resource(**{'dpp:streaming': True},
                        resources='company-details'),
        set_primary_key(['id'], resources='company-details'),
        printer(),
    ]
    return Flow(*steps)
def flow(*_):
    """Fetch all reports and emit them as the streaming 'criteria'
    resource, keyed by publication_id."""
    steps = (
        get_all_reports(),
        calculate_publication_id(1),
        set_type('start_date', type='date', format='%d-%m-%Y'),
        set_primary_key(['publication_id']),
        update_resource(-1, name='criteria', **{PROP_STREAMING: True}),
    )
    return Flow(*steps)
# Example #5
def flow(*_):
    """Fetch results and emit them as the streaming 'jobiz' resource,
    keyed by publication_id."""
    steps = (
        fetch_results(),
        set_type('start_date', type='date', format='%d.%m.%Y'),
        process_kind,
        calculate_publication_id(2),
        set_primary_key(['publication_id']),
        update_resource(-1, name='jobiz', **{PROP_STREAMING: True}),
    )
    return Flow(*steps)
# Example #6
    def flow(self):
        """Build the 'configurations' resource: one row per source holding
        the saved config, collated key fields and display snippets, and
        upsert it into the configurations table.
        """
        taxonomy = self.context.taxonomy
        txn_config = taxonomy.config
        # Display format: taxonomy title plus each key field rendered as
        # '<title>: "{<field>}",' ('עבור:' is Hebrew for 'for:').
        fmt_str = [taxonomy.title + ' עבור:']
        fields = txn_config['key-fields']
        for f in fields:
            for ct in taxonomy.column_types:
                if ct['name'] == f:
                    fmt_str.append('%s: "{%s}",' %
                                   (ct['title'], f.replace(':', '-')))
                    break
        fmt_str = ' '.join(fmt_str)
        # Column-type names use ':' separators; the materialised field
        # names use '-' instead.
        fields = [ct.replace(':', '-') for ct in fields]
        all_fields = ['_source'] + fields

        TARGET = 'configurations'
        saved_config = self.config._unflatten()
        # Saved configs are never publish-allowed by default.
        saved_config.setdefault('publish', {})['allowed'] = False

        return Flow(
            duplicate(RESOURCE_NAME, TARGET),
            # Self-join on source + all key fields, collapsing duplicates.
            join_with_self(
                TARGET,
                all_fields,
                dict((f, {}) for f in all_fields),
            ),
            add_computed_field([
                dict(operation='format', target='snippets', with_=fmt_str),
                dict(operation='constant', target='key_values', with_=None),
            ],
                               resources=TARGET),
            add_field('config', 'object', saved_config, resources=TARGET),
            add_field('fields',
                      type='object',
                      default=self.collate_values(fields),
                      resources=TARGET),
            # Collapse to one row per _source, aggregating snippets and
            # key_values into arrays.
            join_with_self(
                TARGET, ['_source'],
                dict(
                    source=dict(name='_source'),
                    config={},
                    key_values=dict(aggregate='array'),
                    snippets=dict(aggregate='array'),
                )),
            set_type('source', type='string'),
            set_type('config', type='object'),
            set_type('key_values', type='array'),
            set_type('snippets', type='array'),
            set_primary_key(['source']),
            # mode='update' upserts by the primary key ('source').
            dump_to_sql(
                dict([(TARGET, {
                    'resource-name': TARGET,
                    'mode': 'update'
                })]),
                engine=self.lazy_engine(),
            ),
        )
# Example #7
def decp_processing():
    """Process the DECP (French public procurement) CSV: type the id
    columns as strings, keep only current rows, and derive a
    contracts-only table plus a contractor table, dumped to 'decp'.

    NOTE(review): the bare print(...) calls inside Flow(...) execute when
    the flow is *built*, not when it is processed, and leave None entries
    in the step list — presumably dataflows tolerates None steps; confirm.
    """
    flow = Flow(

        # Load the CSV produced by the JSON conversion
        load("decp.csv"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),

        # Sort by rootId and seq to prepare for building donneesActuelles
        sort_rows('{rootId}:{seq}', resources=0, reverse=True),
        donnees_actuelles,

        # rootId and seq can now be dropped
        delete_fields(["rootId", "seq"], resources=0, regex=False),
        sort_rows('{datePublicationDonnees}', resources=0, reverse=True),

        # New table dedicated to contracts, without contractor data
        print("Création de la table dédiée aux marchés..."),
        duplicate(source="decp",
                  target_name="decp-sans-titulaires",
                  target_path="decp-sans-titulaires.csv",
                  duplicate_to_end=True),
        delete_fields([
            "titulaire.id", "titulaire.denominationSociale",
            "titulaire.typeIdentifiant"
        ],
                      resources="decp-sans-titulaires",
                      regex=False),
        set_primary_key(["uid"], resources="decp-sans-titulaires"),
        deduplicate(),

        # Load the previous DECP in CSV format, to extract the new data
        # print("Téléchargement des données tabulaires précédentes..."),
        # load("https://decp.info/db/decp.csv?_size=max&_dl=1", name="previous-decp"),
        # set_type("acheteur.id", type="string"),
        # set_type("titulaire.id", type="string"),
        # set_type("codeCPV", type="string"),
        # set_type("lieuExecution.code", type="string"),
        # delete_fields(["rowid"], resources="previous-decp", regex=False),
        # #print("Fusion des données tabulaires précédentes et des données d'aujourd'hui..."),
        # concatenate({},target={"name": "decp-titulaires","path": "decp-titulaires.csv"},resources=["decp","previous-decp"]),

        # Load the previous contractor-specific data
        print("Chargement des données titulaires..."),
        load("decp-titulaires.csv", name="decp-titulaires"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),
        set_type("departement", type="string"),
        set_type("codeAPE", type="string"),
        print("Enregistrement des données sur le disque..."),
        dump_to_path("decp"))
    flow.process()
def flow(*_):
    """Scrape the Maya notification list into a streaming resource.

    The field layout is declared up front, then the scraper fills rows in.
    NOTE: set_primary_key is applied twice; the later call (on 'url')
    appears to supersede the earlier one (on 's3_object_name').
    """
    field_defs = (
        ('date', 'date'),
        ('source', 'string'),
        ('s3_object_name', 'string'),
        ('url', 'string'),
        ('pdf', 'array'),
        ('other', 'array'),
        ('num_files', 'number'),
    )
    steps = [add_field(name, field_type) for name, field_type in field_defs]
    steps.append(set_primary_key(['s3_object_name']))
    steps.append(scrape_maya_notification_list())
    steps.append(set_primary_key(['url']))
    steps.append(update_resource(
        -1,
        name='maya_notification_list',
        path="data/maya_notification_list.csv",
        **{
            PROP_STREAMING: True,
        },
    ))
    return Flow(*steps)
# Example #9
def test_set_primary_key():
    """set_primary_key declares the given fields on the resource schema."""
    from dataflows import set_primary_key

    rows = [
        {'a': 1, 'b': True, 'c': 'c1'},
        {'a': 2, 'b': True, 'c': 'c2'},
    ]
    _, datapackage, _ = Flow(
        rows,
        set_primary_key(['a', 'b']),
    ).results()

    assert datapackage.resources[0].schema.primary_key == ['a', 'b']
def flow(*_):
    """Load the social-services activities datapackage, reshape it into
    the 'activities' resource, and publish it — with Elasticsearch field
    hints ('es:*' options) — to a path and to SQL."""
    return DF.Flow(
        DF.load(
            '/var/datapackages/activities/social_services/social_services/datapackage.json'
        ),
        # Keep only the listed columns, under the 'activities' resource.
        DF.concatenate(
            dict(kind=[],
                 kind_he=[],
                 activity_name=[],
                 activity_description=[],
                 publisher_name=[],
                 history=[],
                 max_year=[],
                 min_year=[]), dict(name='activities', path='activities.csv')),
        DF.set_primary_key(['kind', 'publisher_name', 'activity_name']),
        DF.set_type('activity_name', **{'es:title': True}),
        DF.set_type('activity_description', **{
            'es:itemType': 'string',
            'es:boost': True
        }), DF.set_type('kind', **{
            'es:keyword': True,
            'es:exclude': True
        }), DF.set_type('kind_he', **{
            'es:keyword': True,
            'es:exclude': True
        }), DF.set_type('publisher_name', **{'es:keyword': True}),
        # 'history' is an array of per-year budget records; its nested
        # schema is declared explicitly for the ES mapper.
        DF.set_type(
            'history', **{
                'es:itemType':
                'object',
                'es:schema':
                dict(fields=[
                    dict(name='year', type='integer'),
                    dict(name='unit', type='string'),
                    dict(name='subunit', type='string'),
                    dict(name='subsubunit', type='string'),
                    dict(name='allocated_budget', type='integer'),
                    dict(name='num_beneficiaries',
                         type='string',
                         **{'es:index': False}),
                ])
            }),
        # Relevance score: latest allocated budget in thousands, with a
        # fallback of 1000 (score 1.0) when no budget is recorded.
        DF.add_field(
            'score', 'number', lambda x:
            (x['history'][0].get('allocated_budget') or 1000) / 1000,
            **{'es:score-column': True}),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        DF.dump_to_path('/var/datapackages/activities/all'),
        DF.dump_to_sql(dict(activities={'resource-name': 'activities'})))
# Example #11
def flow(*_):
    """Normalise government social services into the 'activities'
    resource: split multi-valued columns, coerce numerics, attach search
    metadata, and dump to SQL (all rows) and to a path (non-deleted)."""
    return DF.Flow(
        services(),
        DF.delete_fields(
            ['__tab', 'complete', 'non_suppliers', 'non_tenders', 'notes']),
        DF.add_field('publisher_name', 'string', lambda r: r['office'],
                     **{'es:keyword': True}),
        # Split delimited multi-value columns into arrays.
        splitter('target_audience'),
        splitter('subject'),
        splitter('intervention'),
        splitter('target_age_group'),
        # Coerce numeric-ish columns to floats.
        floater('beneficiaries'),
        floater('budgetItems'),
        floater('manualBudget'),
        floater('tenders'),
        floater('suppliers'),
        floater('virtue_of_table'),
        fix_suppliers(),
        fix_tenders(),
        add_current_budget(),
        add_current_beneficiaries(),
        DF.add_field('min_year', 'integer', 2020),
        DF.add_field('max_year', 'integer', CURRENT_YEAR),
        DF.add_field('kind', 'string', 'gov_social_service', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.add_field('kind_he', 'string', 'שירות חברתי', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.set_type('name', **{'es:title': True}),
        DF.set_type('description', **{
            'es:itemType': 'string',
            'es:boost': True
        }),
        DF.add_field('score', 'number', get_score, **{'es:score-column':
                                                      True}),
        DF.set_primary_key(['kind', 'id']),
        DF.update_resource(-1, name='activities', **{'dpp:streaming': True}),
        # all_activities keeps every row, including deleted ones.
        DF.dump_to_sql(dict(all_activities={'resource-name': 'activities'})),
        DF.filter_rows(lambda r: not r['deleted']),
        DF.delete_fields(['deleted']),
        DF.dump_to_path('/var/datapackages/activities/social_services'),
        DF.dump_to_sql(dict(activities={'resource-name': 'activities'})),
    )
# Example #12
def test_deduplicate():
    """deduplicate() keeps only the first row seen per primary key."""
    from dataflows import deduplicate, set_primary_key

    rows = [
        {'a': 1, 'b': 3, 'c': 'First'},
        {'a': 2, 'b': 3, 'c': 'First'},
        {'a': 1, 'b': 3, 'c': '!First'},
        {'a': 1, 'b': 2, 'c': 'First'},
        {'a': 2, 'b': 3, 'c': '!First'},
    ]

    results, _, _ = Flow(
        rows,
        set_primary_key(['a', 'b']),
        deduplicate(),
    ).results()
    # Only the 'First' rows survive — later duplicates are dropped.
    assert {row['c'] for row in results[0]} == {'First'}
def flow(*_):
    """Build the 'units' resource from the unit hierarchy: derive
    office/unit/subunit/subsubunit columns from the 'path' list, attach
    search metadata, then keep only top-level offices and dump them."""
    return DF.Flow(
        all_units(),
        # path is a list like [office, unit, subunit, subsubunit]; each
        # level becomes its own keyword column (None when absent).
        DF.add_field('office', 'string', lambda r: r['path'][0]
                     if len(r['path']) > 0 else None, **{'es:keyword': True}),
        DF.add_field('unit', 'string', lambda r: r['path'][1]
                     if len(r['path']) > 1 else None, **{'es:keyword': True}),
        DF.add_field('subunit', 'string', lambda r: r['path'][2]
                     if len(r['path']) > 2 else None, **{'es:keyword': True}),
        DF.add_field('subsubunit', 'string', lambda r: r['path'][3]
                     if len(r['path']) > 3 else None, **{'es:keyword': True}),
        # Root rows (empty path) get the Hebrew label for 'government
        # ministries' / the id 'main'.
        DF.add_field('breadcrumbs', 'string',
                     lambda r: '/'.join(r['path']) or 'משרדי הממשלה',
                     **{'es:exclude': True}),
        DF.add_field('id', 'string', lambda r: '__'.join(r['path']) or 'main',
                     **{'es:exclude': True}),
        DF.delete_fields([
            'path',
        ]),
        DF.add_field('min_year', 'integer', 2020),
        DF.add_field('max_year', 'integer', 2020),
        DF.add_field('kind', 'string', 'gov_social_service_unit', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.add_field('kind_he', 'string', 'שירותים חברתיים במיקור חוץ', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.add_field('score', 'number', 1000, **{'es:score-column': True}),
        DF.set_primary_key(['kind', 'id']),
        DF.update_resource(-1, name='units', **{'dpp:streaming': True}),

        # Ensure we only have the main offices
        DF.filter_rows(lambda r: r['unit'] is None),
        DF.filter_rows(lambda r: r['office'] != 'משרד העליה והקליטה'),
        DF.dump_to_path('/var/datapackages/units/social_services'),
        DF.dump_to_sql(dict(units={'resource-name': 'units'})))
def Olap_Datapackage():
    """Build an OLAP-style datapackage: concatenate the three price
    datasets into a 'fact' table with a surrogate integer id, then derive
    'time', 'area' and 'product' dimension tables from it and dump the
    whole package to 'olap_datapackage'."""
    flow = Flow(
        # Load datapackages:
        load('elspot_prices_data/datapackage.json'),
        load('afrr_data/datapackage.json'),
        load('fcr_dk1_data/datapackage.json'),
        # Unify the source columns under common names in the fact table.
        concatenate(fields={
            'Timestamp': ['HourUTC'],
            'Area': ['PriceArea'],
            'Product': ['product'],
            'Amount': ['amount'],
            'Price_DKK': ['PriceDKK'],
            'Price_EUR': ['PriceEUR']
        },
                    target={
                        'name': 'fact',
                        'path': 'data/fact.csv'
                    }),
        # Placeholder 'id' column; add_id fills in the real values.
        add_computed_field(
            [dict(target='id', operation='constant', with_='dummy')]),
        add_id,
        set_type('id', type='integer'),
        set_primary_key(primary_key=['id']),
        # Reorder so that 'id' column is the first:
        select_fields([
            'id', 'Timestamp', 'Area', 'Product', 'Amount', 'Price_DKK',
            'Price_EUR'
        ],
                      resources='fact'),
        # Add foreign keys:
        add_foreign_keys,
        # Fact table is ready. Now duplicate the resource to generate dim tables:
        # First is 'time' table:
        duplicate(source='fact', target_name='time', target_path='time.csv'),
        select_fields(['Timestamp'], resources=['time']),
        # Self-join to deduplicate the timestamps.
        join_self(source_name='time',
                  source_key=['Timestamp'],
                  target_name='time',
                  fields={'Timestamp': {}}),
        # Parse datetime fields and add a separate field for year, month and day:
        add_computed_field([
            dict(target=dict(name='day', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%d'
                                                                           )),
            dict(target=dict(name='month', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%m'
                                                                           )),
            dict(target=dict(name='month_name', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%B'
                                                                           )),
            dict(target=dict(name='year', type='year'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%Y'
                                                                           )),
        ],
                           resources=['time']),
        set_primary_key(primary_key=['Timestamp'], resources=['time']),
        # Now 'area' table:
        duplicate(source='fact', target_name='area', target_path='area.csv'),
        select_fields(['Area'], resources=['area']),
        join_self(source_name='area',
                  source_key=['Area'],
                  target_name='area',
                  fields={'Area': {}}),
        set_primary_key(primary_key=['Area'], resources=['area']),
        # Now 'product' table:
        duplicate(source='fact',
                  target_name='product',
                  target_path='product.csv'),
        select_fields(['Product'], resources=['product']),
        join_self(source_name='product',
                  source_key=['Product'],
                  target_name='product',
                  fields={'Product': {}}),
        set_primary_key(primary_key=['Product'], resources=['product']),
        dump_to_path('olap_datapackage'))
    flow.process()
def flow(parameters, *args):
    """Declare the primary key named in `parameters` on the given
    resources (keys: 'primary-key', 'resources')."""
    primary_key = parameters["primary-key"]
    resources = parameters["resources"]
    return Flow(
        set_primary_key(primary_key=primary_key, resources=resources))
# Example #16
def flow(parameters, *_):
    """Fetch government tender publications, normalise the columns and
    types, filter unwanted rows, and emit a validated streaming resource
    keyed by (publication_id, tender_type, tender_id)."""
    def take_first(field):
        """Collapse a list-valued field to its first element and type it
        as a string."""
        def f(row):
            if field in row and isinstance(row[field], list):
                row[field] = row[field][0]

        return Flow(
            f,
            set_type(field, type='string'),
        )

    def datetime_to_date(field):
        """Truncate a datetime field to its date component and retype it."""
        def f(row):
            if row.get(field):
                row[field] = row[field].date()

        return Flow(
            f,
            set_type(field, type='date'),
        )

    def approve(parameters):
        """Row filter: drop rows whose title/description contains the
        configured 'filter-out' phrase (keep everything if unset)."""
        def func(row):
            if parameters.get('filter-out') is None:
                return True
            bad_phrase = parameters['filter-out']
            for f in ('page_title', 'description'):
                if row.get(f) and bad_phrase in row[f]:
                    return False
            return True

        return func

    return Flow(
        fetcher(parameters),
        # Map the API's PascalCase fields onto the canonical schema.
        concatenate(dict(
            page_title=['Title'],
            publication_id=['ItemId'],
            tender_id=['ItemUniqueId'],
            publisher=['OfficeDesc'],
            start_date=['PublishDate'],
            claim_date=['LastDate'],
            decision=['StatusDesc'],
            description=['Description'],
            last_update_date=['UpdateDate'],
            base_url=['BaseUrl'],
            url_name=['UrlName'],
            tender_type_he=['PublicationTypeDesc'],
        ),
                    resources=-1),
        add_field('tender_type',
                  'string',
                  default=parameters['tender_type'],
                  resources=-1),
        take_first('publisher'),
        take_first('tender_type_he'),
        add_field('page_url',
                  'string',
                  default=lambda row:
                  'https://www.gov.il/he{base_url}{url_name}'.format(**row)),
        # delete_fields(['base_url', 'url_name']),
        filter_rows(approve(parameters)),
        set_type('publication_id', type='integer'),
        set_type('start_date', type='datetime', format=DATE_FMT),
        set_type('last_update_date', type='datetime', format=DATE_FMT),
        set_type('claim_date', type='datetime', format=DATE_FMT),
        # Dates are parsed as datetimes first, then truncated to dates.
        datetime_to_date('last_update_date'),
        datetime_to_date('start_date'),
        set_primary_key(['publication_id', 'tender_type', 'tender_id']),
        dedup(),
        update_resource(-1, **parameters.pop('resource')),
        update_resource(-1, **{'dpp:streaming': True}),
        validate(),
    )
# Example #17
            prop['type'] = 'keyword'
        elif schema_type in ('number', 'integer'):
            prop['index'] = True
        return prop


if __name__ == '__main__':
    # Demo pipeline: index the NYC job postings CSV into two Elasticsearch
    # indexes — a searchable 'jobs-job' index (custom mapping generator)
    # and a 'jobs-document' store keyed by doc_id — and dump a copy to disk.
    DF.Flow(
        DF.load('new-york-city-current-job-postings.zip',
                filename='nyc-jobs.csv',
                name='jobs'),
        # Document id derived from the source 'Job ID' column.
        DF.add_field('doc_id',
                     'string',
                     default=lambda row: 'job/{Job ID}'.format(**row)),
        DF.add_field('score', 'integer', default=1),
        DF.set_type('Salary Frequency', **{'es:keyword': True}),
        DF.set_primary_key(['doc_id']),
        dump_to_es(indexes={'jobs-job': [{
            'resource-name': 'jobs',
        }]},
                   mapper_cls=SampleMappingGenerator), DF.dump_to_path('data'),
        # Second pass: pack every remaining column into an unindexed
        # 'value' object for the document store.
        DF.add_field('value',
                     'object',
                     default=lambda row: dict((k, v) for k, v in row.items()
                                              if k not in ('doc_id', 'score')),
                     **{'es:index': False}),
        DF.select_fields(['doc_id', 'value']),
        dump_to_es(indexes={'jobs-document': [{
            'resource-name': 'jobs',
        }]}), DF.printer(fields=['doc_id'])).process()
# Example #18
    def flow(self):
        """Assemble the normalisation Flow from the loaded config: apply
        datatype/constant handling, optionally unpivot 'normalize'
        columns, rename mapped columns, and declare the configured
        primary key.

        Returns the Flow only when the config validated with no errors;
        otherwise falls through and returns None implicitly.
        """
        if len(self.errors) == 0:
            # Primary-key column types translated to field names.
            primaryKey = [
                self.ct_to_fn(f) for f in self.config.get(CONFIG_PRIMARY_KEY)
            ]

            # Per-field options: taxonomy dataType options first, then the
            # mapping's own options, plus the columnType itself.
            fieldOptions = {}
            dataTypes = dict(
                (ct['name'], dict(ct.get('options', {}), type=ct['dataType']))
                for ct in self.config.get(CONFIG_TAXONOMY_CT)
                if 'dataType' in ct)
            for mf in self.config.get(CONFIG_MODEL_MAPPING):
                ct = mf.get('columnType')
                name = mf['name']
                fieldOptions[name] = {}
                if ct is not None:
                    fieldOptions[name].update(dataTypes.get(ct, {}))
                fieldOptions[name].update(mf.get('options', {}))
                fieldOptions[name]['columnType'] = ct

            extraFieldDefs = self.join_mapping_taxonomy('extra', fieldOptions)
            normalizeFieldDef = self.join_mapping_taxonomy(
                'normalize', fieldOptions)
            unpivotFields = [
                dict(
                    name=f['name'],
                    keys=f['normalize'],
                ) for f in self.config.get(CONFIG_MODEL_MAPPING)
                if 'normalize' in f
            ]
            # unpivot() expects a single target field definition.
            if len(normalizeFieldDef) > 0:
                normalizeFieldDef = normalizeFieldDef[0]
            else:
                normalizeFieldDef = None

            steps = [
                self.create_fdp(),
                self.datetime_handler(),
                self.set_consts(fieldOptions),
                validate(on_error=ignore),
            ] + ([
                # Only unpivot when a normalize target is configured.
                unpivot(unpivotFields,
                        extraFieldDefs,
                        normalizeFieldDef,
                        regex=False,
                        resources=RESOURCE_NAME),
            ] if normalizeFieldDef else []) + [
                self.copy_names_to_titles(),
                self.rename([(self.ct_to_fn(f['columnType']), f['name'])
                             for f in self.config.get(CONFIG_MODEL_MAPPING)
                             if f.get('columnType') is not None]),
                update_resource(RESOURCE_NAME, path='out.csv'),
                # *[
                #     set_type(
                #         self.ct_to_fn(f['columnType']),
                #         columnType=f['columnType'],
                #         **fieldOptions.get(f['columnType'], {}),
                #         resources=RESOURCE_NAME,
                #         on_error=ignore
                #     )
                #     for f in self.config.get(CONFIG_MODEL_MAPPING)
                #     if f.get('columnType') is not None
                # ],
                # A None step when no primary key is configured —
                # presumably tolerated by Flow; confirm.
                set_primary_key(primaryKey, resources=RESOURCE_NAME)
                if len(primaryKey) else None
                # printer()
            ]
            f = Flow(*steps)
            return f