def spew_flow(flow, ctx: ProcessorContext):
    """Execute *flow* against the data held by *ctx* and write the resulting
    datapackage descriptor, resource iterator and merged stats back onto it.
    """
    wrapped = Flow(
        update_package(**ctx.datapackage),
        load((ctx.datapackage, ctx.resource_iterator)),
        flow,
    )
    stream = wrapped.datastream()
    ctx.datapackage = stream.dp.descriptor
    ctx.resource_iterator = stream.res_iter
    ctx.stats = MergeableStats(stream.stats, ctx.stats)
def flow(parameters, datapackage, resources, stats):
    """Return a Flow that adds a string field named ``parameters['attr']``
    to the first resource and fills it with 'foo', counting rows in
    ``stats['foo_values']``.
    """
    stats['foo_values'] = 0

    def add_foo_field(package: PackageWrapper):
        # Extend the first resource's schema in place with the new field.
        schema_fields = package.pkg.descriptor['resources'][0]['schema']['fields']
        schema_fields.append({'name': parameters['attr'], 'type': 'string'})
        yield package.pkg
        yield from package

    def add_foo_value(row):
        row[parameters['attr']] = 'foo'
        stats['foo_values'] += 1

    return Flow(update_package(name='_'), hello_dataflows, add_foo_field,
                add_foo_value)
# Example 3
def main():
    """CLI entry point: load a CSV, merge extra metadata into the package
    descriptor, and dump the result as a datapackage directory.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('-i', help='Path to CSV [CSV]')
    parser.add_argument('-o', help='Path to Output Directory [DIR]')
    parser.add_argument(
        '-m', help='Merge this metadata (Author, License, ...) [JSON]')

    args = parser.parse_args()

    # Load additional metadata if any
    metadata = parseExtraMetadata(args.m)

    # Build the pipeline: load, then merge each metadata key into the
    # package descriptor (one update_package step per key, in order),
    # then dump the result.
    steps = [load(args.i)]
    for key in ('title', 'name', 'license', 'licenses',
                'contributors', 'maintainers', 'sources'):
        steps.append(update_package(**{key: metadata[key]}))
    steps.append(dump_to_path(args.o))
    Flow(*steps).process()
 def flow(self):
     """Assemble the publisher Flow from the instance configuration.

     Returns None when CONFIG_PUBLISH_ALLOWED is falsy; otherwise a
     dataflows Flow combining (depending on configuration): a datapackage
     dump, a DB normalization step with a derived babbage model attached
     to the package, and an Elasticsearch update step.
     """
     steps = []
     # Bail out early unless publishing has been explicitly enabled.
     if not self.config.get(CONFIG_PUBLISH_ALLOWED):
         return None
     logger.info('Publisher Flow Preparing')
     if self.output_datapackage:
         # Dump the (still denormalized) package to the configured path.
         logger.info('Publisher Flow: Dump To Path Denorm...')
         steps.extend([
             dump_to_path(self.output_datapackage)
         ])
     if self.output_db:
         # Fact-table name derived from taxonomy id and dataset name.
         db_table = 'dgp__{}_{}'.format(
             self.config.get(CONFIG_TAXONOMY_ID),
             self.config.get(CONFIG_EXTRA_METADATA_DATASET_NAME),
         )
         logger.info('Publisher Flow: Dump To DB... (%s)', db_table)
         primary_key = self.config.get(CONFIG_PRIMARY_KEY)
         mapping = self.config.get(CONFIG_MODEL_MAPPING)
         # Enrich each mapped column (mutated in place) with the derived
         # attributes used below to build NormGroups and the babbage model.
         for m in mapping:
             if 'columnType' in m and m['columnType']:
                 m['slug'] = self.slugify(m['title'])
                 # Hierarchy is the first ':'-separated segment of columnType.
                 m['hierarchy'] = self.slugify(m['columnType'].split(':')[0])
                 m['column'] = self.column(m['columnType'])
                 m['primaryKey'] = m['columnType'] in primary_key
                 # The 'value' hierarchy marks measures; others are dimensions.
                 m['measure'] = m['hierarchy'] == 'value'
                 # Measures stay on the fact table; dimension columns are
                 # addressed through a '<db_table>_<hierarchy>' table.
                 m['full_column'] = (
                     m['column'] if m['measure']
                     else '{}_{hierarchy}.{column}'.format(db_table, **m)
                 )
                 m['label'] = self.fetch_label(m['columnType'])
                 m['dataType'] = self.fetch_datatype(m['columnType'])
         # All distinct dimension hierarchies (measures excluded).
         prefixes = set(
             m['hierarchy']
             for m in mapping
             if m.get('measure') is False
         )
         # hierarchy -> list of its mapped columns.
         prefixed = dict(
             (p, list(filter(lambda m: m.get('hierarchy') == p, mapping)))
             for p in prefixes
         )
         # One NormGroup per hierarchy, targeting a '<db_table>_<prefix>'
         # side table keyed by ref/id columns (semantics per NormGroup).
         groups = [
             NormGroup([
                     m['column']
                     for m in prefixed_items
                 ], self.ref_column(prefix), self.id_column(),
                 db_table='{}_{}'.format(db_table, prefix))
             for prefix, prefixed_items in prefixed.items()
         ]
         # Babbage model: primary-key dimension columns (optionally with a
         # label attribute), measures, and per-hierarchy level lists.
         babbage_model = dict(
             dimensions=dict(
                 (m['slug'], dict(
                     label=m['title'],
                     key_attribute=m['slug'],
                     attributes=dict([
                         (m['slug'], dict(
                             column=m['full_column'],
                             label=m['title'],
                             type=m['dataType'],
                         ))
                     ] + ([
                         (m['label']['slug'], dict(
                             column=m['label']['full_column'],
                             label=m['label']['title'],
                             type=m['label']['dataType'],
                         ))
                     ] if m.get('label') else [])),
                     join_column=[
                         self.ref_column(m['hierarchy']),
                         self.id_column()
                     ],
                     **(dict(
                         label_attribute=m['label']['slug']
                     ) if m.get('label') else {})
                 ))
                 for m in self.config.get(CONFIG_MODEL_MAPPING)
                 if m.get('measure') is False and m.get('primaryKey') is True
             ),
             fact_table=db_table,
             measures=dict(
                 (
                     m['slug'],
                     dict(
                         column=m['column'],
                         label=m['title'],
                         type='number'
                     )
                 )
                 for m in self.config.get(CONFIG_MODEL_MAPPING)
                 if m.get('measure') is True
             ),
             hierarchies=dict(
                 (prefix, dict(
                     label=prefix,
                     levels=[
                         m['slug']
                         for m in prefixed_items
                         if m.get('primaryKey') is True
                     ]
                 ))
                 for prefix, prefixed_items in prefixed.items()
             ),
         )
         steps.append(
             update_package(babbage_model=babbage_model)
         )
         source = self.config.get(CONFIG_URL)
         logger.info('Publisher Flow: _source Handling...')
         # NOTE(review): inferred from helper names — each row is tagged
         # with its source URL so a re-publish of the same source can
         # first clear its earlier rows (clear_by_source); confirm.
         steps.extend([
             add_field('_source', 'string', source),
             append_to_primary_key(['_source']),
             clear_by_source(self.lazy_engine(), db_table, source),
         ])
         logger.info('Publisher Flow: Normalize...')
         steps.extend([
             normalize_to_db(
                 groups,
                 db_table,
                 RESOURCE_NAME,
                 self.output_db,
                 'append'
             ),
         ])
         if self.output_datapackage:
             # Also dump the normalized package next to the denorm one.
             logger.info('Publisher Flow: Dump To Path Norm...')
             steps.extend([
                 dump_to_path(self.output_datapackage + '-norm')
             ])
     if self.output_es:
         logger.info('Publisher Flow: ES...')
         steps.extend([
             self.update_es()
         ])
     logger.info('Publisher Flow Prepared')
     return Flow(*steps)
# Example 5
 update_package(
     name='covid-19',
     title='Novel Coronavirus 2019',
     views=[{
         "title": "Total world to date",
         "resources": ["worldwide-aggregated"],
         "specType": "simple",
         "spec": {
             "group": "Date",
             "series": ["Confirmed", "Deaths"],
             "type": "line"
         }
     }, {
         "title": "Number of confirmed cases in key countries",
         "resources": ["key-countries-pivoted"],
         "specType": "simple",
         "spec": {
             "group":
             "Date",
             "series": [
                 "China", "US", "United_Kingdom", "Italy",
                 "France", "Germany", "Spain", "Iran"
             ],
             "type":
             "line"
         }
     }, {
         "title":
         "Mortality rate in percentage",
         "resources": [{
             "name":
             "worldwide-aggregated",
             "transform": [{
                 "type":
                 "formula",
                 "expressions":
                 ["data['Deaths'] / data['Confirmed'] * 100 + '%'"],
                 "asFields": ["Mortality rate"]
             }]
         }],
         "specType":
         "simple",
         "spec": {
             "group": "Date",
             "series": ["Mortality rate"],
             "type": "bar"
         }
     }, {
         "title":
         "Increase rate from previous day in confirmed cases worldwide",
         "resources": ["worldwide-aggregated"],
         "specType": "simple",
         "spec": {
             "group": "Date",
             "series": ["Increase rate"],
             "type": "bar"
         }
     }]),
# Example 6
                      "title": "Cumulative total confirmed cases to date",
                      "type": "integer"
                  }, {
                      "format": "default",
                      "groupChar": "",
                      "name": "Recovered",
                      "title": "Cumulative total recovered cases to date",
                      "type": "integer"
                  }, {
                      "format": "default",
                      "groupChar": "",
                      "name": "Deaths",
                      "title": "Cumulative total deaths to date",
                      "type": "integer"
                  }]),
    checkpoint('processed_country_data'),
    # Prepare data package (name, title) and add views
    update_package(name='covid-19',
                   title='Novel Coronavirus 2019',
                   views=[{
                       "title": "Total world to date",
                       "resources": ["worldwide-aggregated"],
                       "specType": "simple",
                       "spec": {
                           "group": "Date",
                           "series": ["Confirmed", "Recovered", "Deaths"],
                           "type": "line"
                       }
                   }]),
    dump_to_path()).results()[0]
# Example 7
    def normalize(self, package, full_name, db_table):
        """Derive a column mapping from the package's first resource schema
        and return a Flow that attaches the resulting babbage model,
        normalizes rows into DB tables, and finally persists the model.

        :param package: datapackage whose first resource's schema (with
            'fields' and 'primaryKey') drives the mapping.
        :param full_name: identifier passed to babbage_models.create_or_edit.
        :param db_table: base name for the fact table and side tables.
        """
        schema = package.descriptor['resources'][0]['schema']
        fields = schema['fields']
        primary_key = schema['primaryKey']
        mapping = []
        # Enrich a deep copy of each field carrying a columnType with the
        # derived attributes used below (originals are left untouched).
        for f in fields:
            m = copy.deepcopy(f)
            if m.get('columnType'):
                m['slug'] = self.slugify(m['title'])
                # Hierarchy is the first ':'-separated segment of columnType.
                m['hierarchy'] = self.slugify(m['columnType'].split(':')[0])
                m['column'] = self.column(m['columnType'])
                m['primaryKey'] = m['name'] in primary_key
                # The 'value' hierarchy marks measures; others are dimensions.
                m['measure'] = m['hierarchy'] == 'value'
                # Measures stay on the fact table; dimension columns are
                # addressed through a '<db_table>_<hierarchy>' table.
                m['full_column'] = (m['column'] if m['measure'] else
                                    '{}_{hierarchy}.{column}'.format(
                                        db_table, **m))
                m['label'] = self.fetch_label(m['columnType'], mapping)
                m['dataType'] = self.fetch_datatype(m['columnType'])
                mapping.append(m)
        # Distinct dimension hierarchies and their mapped columns.
        prefixes = set(m['hierarchy'] for m in mapping
                       if m.get('measure') is False)
        prefixed = dict(
            (p, list(filter(lambda m: m.get('hierarchy') == p, mapping)))
            for p in prefixes)
        # One NormGroup per hierarchy, targeting a '<db_table>_<prefix>'
        # side table keyed by ref/id columns (semantics per NormGroup).
        groups = [
            NormGroup([m['column'] for m in prefixed_items],
                      self.ref_column(prefix),
                      self.id_column(),
                      db_table='{}_{}'.format(db_table, prefix))
            for prefix, prefixed_items in prefixed.items()
        ]
        # Babbage model: primary-key dimension columns (optionally with a
        # label attribute), measures, and per-hierarchy level lists.
        babbage_model = dict(
            dimensions=dict(
                (m['slug'],
                 dict(label=m['title'],
                      key_attribute=m['slug'],
                      attributes=dict([(m['slug'],
                                        dict(
                                            column=m['full_column'],
                                            label=m['title'],
                                            type=m['dataType'],
                                        ))] +
                                      ([(m['label']['slug'],
                                         dict(
                                             column=m['label']['full_column'],
                                             label=m['label']['title'],
                                             type=m['label']['dataType'],
                                         ))] if m.get('label') else [])),
                      join_column=[
                          self.ref_column(m['hierarchy']),
                          self.id_column()
                      ],
                      **(dict(label_attribute=m['label']['slug']) if m.
                         get('label') else {}))) for m in mapping
                if m.get('measure') is False and m.get('primaryKey') is True),
            fact_table=db_table,
            measures=dict(
                (m['slug'],
                 dict(column=m['column'], label=m['title'], type='number'))
                for m in mapping if m.get('measure') is True),
            hierarchies=dict((prefix,
                              dict(label=prefix,
                                   levels=[
                                       m['slug'] for m in prefixed_items
                                       if m.get('primaryKey') is True
                                   ]))
                             for prefix, prefixed_items in prefixed.items()),
        )

        return Flow(
            update_package(babbage_model=babbage_model),
            normalize_to_db(groups, db_table, RESOURCE_NAME,
                            db_connection_string, 'append'),
            # Persist the model once the flow has finished processing.
            finalizer(lambda: babbage_models.create_or_edit(
                full_name, babbage_model)))
 def postflow(self):
     """Return a Flow that merges the user-provided extra metadata into the
     output package descriptor.

     Reads the nested 'extra'/'metadata' entry from the (unflattened)
     config; when it is absent or null, merges nothing instead of crashing.
     """
     metadata = self.config._unflatten().get('extra', {}).get('metadata')
     logger.info('UPDATING WITH METADATA %r', metadata)
     # Guard: .get('metadata') returns None when the key is missing (or
     # explicitly null), and update_package(**None) raises TypeError.
     return Flow(
         update_package(**(metadata or {}))
     )
# Example 9
      pivot_key_countries,
      delete_fields(['Country', 'Confirmed', 'Recovered', 'Deaths'], resources='key-countries-pivoted'),
      # Prepare data package (name, title) and add views
      update_package(
        name='covid-19',
        title='Novel Coronavirus 2019',
        views=[
            {
              "title": "Total world to date",
              "resources": ["worldwide-aggregated"],
              "specType": "simple",
              "spec": {
                "group": "Date",
                "series": ["Confirmed", "Recovered", "Deaths"],
                "type": "line"
              }
            },
            {
                "title": "Number of confirmed cases in key countries",
                "resources": ["key-countries-pivoted"],
                "specType": "simple",
                "spec": {
                    "group": "Date",
                    "series": ["China", "US", "United_Kingdom", "Italy", "France", "Germany", "Spain", "Iran"],
                    "type": "line"
                }
            }
        ]
      ),
      dump_to_path()
).results()[0]
# Example 10
      target_key=['Province/State', 'Country/Region', 'Date'],
      fields=dict(Recovered={
          'name': 'Case',
          'aggregate': 'first'
      })),
 add_computed_field(target={
     'name': 'Deaths',
     'type': 'number'
 },
                    operation='format',
                    with_='{Case}'),
 delete_fields(['Case']),
 update_resource('time_series_19-covid-Deaths',
                 name='time-series-19-covid-combined',
                 path='data/time-series-19-covid-combined.csv'),
 update_package(name='covid-19', title='Novel Coronavirus 2019'),
 dump_to_path(),
 checkpoint('processed_data'),
 # Duplicate the stream to create aggregated data
 duplicate(source='time-series-19-covid-combined',
           target_name='worldwide-aggregated',
           target_path='worldwide-aggregated.csv'),
 join_with_self(resource_name='worldwide-aggregated',
                join_key=['Date'],
                fields=dict(Date={'name': 'Date'},
                            Confirmed={
                                'name': 'Confirmed',
                                'aggregate': 'sum'
                            },
                            Recovered={
                                'name': 'Recovered',
def flow(parameters):
    """Build a Flow whose single step merges *parameters* into the
    package descriptor."""
    merge_step = update_package(**parameters)
    return Flow(merge_step)