Example #1
    def postflow(self):
        steps = []
        logger.info('Publisher Flow Preparing')

        # Build a unique dataset identifier from the taxonomy id plus the
        # slugified dataset name, e.g. 'fiscal_my_dataset'.
        full_name = '{}_{}'.format(
            self.config.get(CONFIG_TAXONOMY_ID),
            slugify(self.config.get(CONFIG_EXTRA_METADATA_DATASET_NAME),
                    separator='_',
                    lowercase=True),
        )
        db_table = 'dgp__{}'.format(full_name)
        source = get_source(self.config)
        steps.extend([
            # Tag each row with its source and add it to the primary key so
            # that republishing a source replaces only its own rows.
            add_field('_source', 'string', source),
            append_to_primary_key('_source'),
            # `engine` is assumed to be defined at module scope
            # (e.g. an SQLAlchemy engine).
            clear_by_source(engine, db_table, source, '_source'),
            conditional(lambda pkg: True,
                        lambda pkg: self.normalize(pkg, full_name, db_table)),
            update_stats(
                dict(view_url='https://api.openfiscal.org/api/3/cubes/{}/model'
                     .format(full_name))),
        ])

        logger.info('Publisher Flow Prepared')
        return Flow(*steps)
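
For context, the Flow returned by postflow is meant to be chained after the main processing steps and executed with dataflows. A minimal sketch, assuming the class above is instantiated as publisher and using a placeholder input (both hypothetical):

from dataflows import Flow, load

# Hypothetical wiring: run the publisher's postflow after loading data.
dp, stats = Flow(
    load('data/source.csv'),   # placeholder input resource
    publisher.postflow(),      # the Flow(*steps) built above
).process()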
Example #2
    def postflow(self):
        steps = []
        # For every column type in the taxonomy, ensure the matching field
        # exists on the resource; unique fields also join the primary key.
        for ct in self.config.get(CONFIG_TAXONOMY_CT):
            name = ct['name'].replace(':', '-')
            dataType = ct['dataType']
            unique = ct.get('unique')
            if unique:
                flow = Flow(
                    add_field(name, dataType, '-', resources=RESOURCE_NAME),
                    append_to_primary_key(name)
                )
            else:
                flow = Flow(
                    add_field(name, dataType, None, resources=RESOURCE_NAME),
                )
            # Only add the field if it is not already present.
            steps.append(
                conditional(
                    self.no_such_field(name),
                    flow
                )
            )
        return Flow(*steps)
    def flow(self):
        steps = []
        if not self.config.get(CONFIG_PUBLISH_ALLOWED):
            return None
        logger.info('Publisher Flow Preparing')
        if self.output_datapackage:
            logger.info('Publisher Flow: Dump To Path Denorm...')
            steps.extend([
                dump_to_path(self.output_datapackage)
            ])
        if self.output_db:
            db_table = 'dgp__{}_{}'.format(
                self.config.get(CONFIG_TAXONOMY_ID),
                self.config.get(CONFIG_EXTRA_METADATA_DATASET_NAME),
            )
            logger.info('Publisher Flow: Dump To DB... (%s)', db_table)
            primary_key = self.config.get(CONFIG_PRIMARY_KEY)
            mapping = self.config.get(CONFIG_MODEL_MAPPING)
            # Enrich every mapped column with the derived names used below:
            # slug, hierarchy prefix, physical column, label and data type.
            for m in mapping:
                if 'columnType' in m and m['columnType']:
                    m['slug'] = self.slugify(m['title'])
                    m['hierarchy'] = self.slugify(m['columnType'].split(':')[0])
                    m['column'] = self.column(m['columnType'])
                    m['primaryKey'] = m['columnType'] in primary_key
                    m['measure'] = m['hierarchy'] == 'value'
                    m['full_column'] = (
                        m['column'] if m['measure']
                        else '{}_{hierarchy}.{column}'.format(db_table, **m)
                    )
                    m['label'] = self.fetch_label(m['columnType'])
                    m['dataType'] = self.fetch_datatype(m['columnType'])
            # Group non-measure columns by hierarchy prefix; each group is
            # normalized into a side table named '<db_table>_<prefix>'.
            prefixes = set(
                m['hierarchy']
                for m in mapping
                if m.get('measure') is False
            )
            prefixed = dict(
                (p, list(filter(lambda m: m.get('hierarchy') == p, mapping)))
                for p in prefixes
            )
            groups = [
                NormGroup([
                        m['column']
                        for m in prefixed_items
                    ], self.ref_column(prefix), self.id_column(),
                    db_table='{}_{}'.format(db_table, prefix))
                for prefix, prefixed_items in prefixed.items()
            ]
            # Assemble the babbage model: one dimension per primary-key,
            # non-measure column, one measure per value column, and one
            # hierarchy per prefix.
            babbage_model = dict(
                dimensions=dict(
                    (m['slug'], dict(
                        label=m['title'],
                        key_attribute=m['slug'],
                        attributes=dict([
                            (m['slug'], dict(
                                column=m['full_column'],
                                label=m['title'],
                                type=m['dataType'],
                            ))
                        ] + ([
                            (m['label']['slug'], dict(
                                column=m['label']['full_column'],
                                label=m['label']['title'],
                                type=m['label']['dataType'],
                            ))
                        ] if m.get('label') else [])),
                        join_column=[
                            self.ref_column(m['hierarchy']),
                            self.id_column()
                        ],
                        **(dict(
                            label_attribute=m['label']['slug']
                        ) if m.get('label') else {})
                    ))
                    for m in self.config.get(CONFIG_MODEL_MAPPING)
                    if m.get('measure') is False and m.get('primaryKey') is True
                ),
                fact_table=db_table,
                measures=dict(
                    (
                        m['slug'],
                        dict(
                            column=m['column'],
                            label=m['title'],
                            type='number'
                        )
                    )
                    for m in self.config.get(CONFIG_MODEL_MAPPING)
                    if m.get('measure') is True
                ),
                hierarchies=dict(
                    (prefix, dict(
                        label=prefix,
                        levels=[
                            m['slug']
                            for m in prefixed_items
                            if m.get('primaryKey') is True
                        ]
                    ))
                    for prefix, prefixed_items in prefixed.items()
                ),
            )
            steps.append(
                update_package(babbage_model=babbage_model)
            )
            source = self.config.get(CONFIG_URL)
            logger.info('Publisher Flow: _source Handling...')
            # Tag rows with their source URL and clear previously published
            # rows from the same source before appending.
            steps.extend([
                add_field('_source', 'string', source),
                append_to_primary_key(['_source']),
                clear_by_source(self.lazy_engine(), db_table, source),
            ])
            logger.info('Publisher Flow: Normalize...')
            steps.extend([
                normalize_to_db(
                    groups,
                    db_table,
                    RESOURCE_NAME,
                    self.output_db,
                    'append'
                ),
            ])
            if self.output_datapackage:
                logger.info('Publisher Flow: Dump To Path Norm...')
                steps.extend([
                    dump_to_path(self.output_datapackage + '-norm')
                ])
        if self.output_es:
            logger.info('Publisher Flow: ES...')
            steps.extend([
                self.update_es()
            ])
        logger.info('Publisher Flow Prepared')
        return Flow(*steps)
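
To make the model assembly concrete, this is roughly the shape babbage_model takes for a single dimension and measure. All names below are illustrative, and the join_column entries stand in for whatever ref_column() and id_column() return:

# Illustrative shape only; real slugs, labels and table names come from
# CONFIG_MODEL_MAPPING and the taxonomy.
babbage_model = dict(
    fact_table='dgp__fiscal_my_dataset',
    dimensions=dict(
        activity=dict(
            label='Activity',
            key_attribute='activity',
            attributes=dict(
                activity=dict(
                    column='dgp__fiscal_my_dataset_activity.activity_id',
                    label='Activity',
                    type='string',
                ),
            ),
            join_column=['activity_ref', '_id'],  # assumed ref/id columns
        ),
    ),
    measures=dict(
        amount=dict(column='amount', label='Amount', type='number'),
    ),
    hierarchies=dict(
        activity=dict(label='activity', levels=['activity']),
    ),
)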
Example #4
def objeto_del_gasto(config):

    logging.info('PREPARING objeto_del_gasto processing')

    # CT maps logical field names to taxonomy columnTypes; CN holds the
    # matching datapackage-safe column names (':' is replaced with '-').
    CT = COLUMN_MAPPING
    CN = dict((k, v.replace(':', '-')) for k, v in CT.items())

    # Build a code -> description lookup for each classification level from
    # the bundled datapackage (one resource per level).
    lookup = {}
    codes = datapackage.Package(
        os.path.join(os.path.dirname(__file__),
                     'objeto_del_gasto.datapackage.zip'))
    for resource in codes.resources:
        kind = resource.name
        lookup[kind] = {}
        for row in resource.iter(keyed=True):
            key = row[kind.upper().replace('Í', 'I')]
            value = row['DESCRIPCION']
            lookup[kind][key] = value

    def process(row):
        year = int(row['date-fiscal-year'])

        # Rows from 2019 onward already carry the split columns, so only
        # earlier years need to be derived from the raw `objeto` code.
        if year < 2019:
            objeto = row[CN['ID_CONCEPTO']]
            if objeto and objeto != '-':
                # Derive capítulo (first digit) and concepto (first two
                # digits) from the full code, plus their descriptions.
                row[CN['ID_CAPITULO']] = objeto[0] + '000'
                row[CN['ID_CONCEPTO']] = objeto[:2] + '00'
                row[CN['DESC_CAPITULO']] = lookup['capitulo'].get(
                    row[CN['ID_CAPITULO']])
                row[CN['DESC_CONCEPTO']] = lookup['concepto'].get(
                    row[CN['ID_CONCEPTO']])

                # 2008-2010 use a 4-digit partida genérica; later years use 3.
                nb_generica_digits = 4 if year in (2008, 2009, 2010) else 3

            if objeto and len(objeto) >= 4:
                row[CN['ID_PARTIDA_GENERICA']] = objeto[:nb_generica_digits]

            row[CN['DESC_PARTIDA_GENERICA']] = lookup['partida_generica'].get(
                row.get(CN['ID_PARTIDA_GENERICA']))

            if year not in (2008, 2009, 2010):
                if objeto and len(objeto) >= 5:
                    row[CN['ID_PARTIDA_ESPECIFICA']] = objeto
                    row[CN['DESC_PARTIDA_ESPECIFICA']] = \
                        lookup['partida_específica'].get(row.get(CN['ID_PARTIDA_ESPECIFICA']))

    def missing_field(mf):
        # Predicate for `conditional`: True when the first resource's schema
        # has no field named `mf`.
        def func(dp):
            return all(f.name != mf for f in dp.resources[0].schema.fields)

        return func

    def sort_by_ct():
        # Reorder the schema fields to follow the column-type order defined
        # in the taxonomy; fields without a known columnType sort last.
        def func(package):
            ct_indexes = dict(
                (ct['name'], i)
                for i, ct in enumerate(config.get(CONFIG_TAXONOMY_CT)))
            fields = sorted(((ct_indexes.get(f.get('columnType'), 1000), f)
                             for f in package.pkg.descriptor['resources'][0]
                             ['schema']['fields']),
                            key=lambda x: x[0])
            package.pkg.descriptor['resources'][0]['schema']['fields'] = [
                f[1] for f in fields
            ]
            yield package.pkg
            yield from package

        return func

    # For every expected column: add it (and extend the primary key for
    # 'ID_*' columns) only when missing, then sort fields and split codes.
    return Flow(
        *[
            conditional(
                missing_field(CN[f]),
                Flow(add_field(CN[f], 'string', columnType=ct, title=f),
                     append_to_primary_key(CN[f]) if 'ID_' in f else None))
            for f, ct in CT.items()
        ], sort_by_ct(), process)
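
The splitting arithmetic in process() is easiest to see on a concrete value. A minimal sketch for a hypothetical post-2010 code:

objeto = '21101'                  # hypothetical 5-digit code
capitulo = objeto[0] + '000'      # '2000'
concepto = objeto[:2] + '00'      # '2100'
partida_generica = objeto[:3]     # '211' (objeto[:4] for 2008-2010)
partida_especifica = objeto       # '21101' (not set for 2008-2010)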