Exemplo n.º 1
0
 def __init__(self, schema, name, obs_table, columns, task):
     '''
     Bind an OBSTable record to a hashed physical table name.

     The table id is "<schema>.<name>".  The physical table name is
     OBSERVATORY_PREFIX followed by the SHA-1 hex digest of the slugified
     id.  Both values are written back onto `obs_table`.

     columns: should be an ordereddict if you want to specify columns' order
     in the table
     '''
     self._id = '.'.join([schema, name])
     self._schema = schema
     self._name = name
     self._columns = columns
     self._task = task

     # Stamp the identifiers on the record first: the __dict__ snapshot
     # taken below is meant to include them.
     obs_table.id = self._id
     slug = underscore_slugify(self._id)
     digest = sha1(slug.encode('utf-8')).hexdigest()
     obs_table.tablename = '{prefix}{name}'.format(
         prefix=OBSERVATORY_PREFIX, name=digest)
     tablename = obs_table.tablename

     self.obs_table = obs_table
     self._tablename = tablename
     self._obs_dict = obs_table.__dict__.copy()

     # Unquoted and schema-quoted spellings of the same relation.
     self.table = '{schema}.{table}'.format(
         schema=OBSERVATORY_SCHEMA, table=tablename)
     self.qualified_tablename = '"{schema}".{table}'.format(
         schema=OBSERVATORY_SCHEMA, table=tablename)

     # Reuse the table object if it is already registered in the metadata.
     known_tables = metadata.tables
     self._table = known_tables[tablename] if tablename in known_tables else None
Exemplo n.º 2
0
 def output(self):
     '''
     Yield a PostgresTarget for every table recorded in OBSTable.

     Each table id is split on its last '.' into a module path (imported
     from under the `tasks` package) and a task id.  The table counts as
     existing when the module exposes an attribute that is an instance of
     Register and whose slugified name is a prefix of the task id.

     Tables with no matching attribute are only logged.  A debugger
     breakpoint used to sit in that branch; it was removed because it
     blocks any non-interactive run.
     '''
     session = current_session()
     for table in session.query(OBSTable):
         schema_path, _, task_id = table.id.rpartition('.')
         modname = 'tasks.' + schema_path
         module = __import__(modname, fromlist=['*'])
         # Matching on the slugified name prefix is a heuristic: the task id
         # also carries slugified parameters, so an exact match is not
         # possible here.
         exists = any(
             isinstance(getattr(module, name), Register) and
             task_id.startswith(underscore_slugify(name))
             for name in dir(module))
         if exists:
             LOGGER.info('{table} exists'.format(table=table))
         else:
             # TODO drop table
             LOGGER.info(table)
         yield PostgresTarget(schema='observatory',
                              tablename=table.tablename)
Exemplo n.º 3
0
 def _parse_columns(self, all_columns_result):
     '''
     Turn raw column rows into a dict of column descriptors keyed by id.

     Rows are read positionally: 0 id, 1 name, 2 description, 3 type,
     4 extra, 5 aggregate, 6 tags (mapping keyed by tag id), 7 suggested
     name, 8 timespan (joined with '_' for the slug), 9 denominators,
     10 geometry/timespan entries and 11 envelope.
     '''
     def tag_suffixes(tags, prefix):
         # Second '/'-separated segment of every tag id starting with prefix.
         return [
             tag_id.split('/')[1]
             for tag_id, _ in tags.items()
             if tag_id.startswith(prefix)
         ]

     parsed = {}
     for row in all_columns_result:
         # Group entries by geometry id (entry[0]), accumulating each
         # geometry's timespans (entry[2]).  The name (entry[1]) and the
         # JSON-encoded tags (entry[3]) come from the first entry seen.
         by_geom = {}
         for entry in row[10]:
             known = by_geom.get(entry[0])
             if known is not None:
                 known['timespans'].append(entry[2])
                 continue
             by_geom[entry[0]] = {
                 'geom_id': entry[0],
                 'geom_name': entry[1],
                 'timespans': [entry[2]],
                 'geom_tags': json.loads(entry[3])
             }

         parsed[row[0]] = {
             'id': row[0],
             'latlng': catalog_latlng(row[0]),
             'name': row[1],
             'description': row[2],
             'type': row[3],
             'extra': row[4],
             'aggregate': row[5],
             'tags': row[6],
             'suggested_name': row[7],
             'timespan': row[8],
             'timespan_sluggified': underscore_slugify('_'.join(row[8])),
             'licenses': tag_suffixes(row[6], 'license/'),
             'sources': tag_suffixes(row[6], 'source/'),
             'denoms': row[9],
             'geom_timespans': by_geom,
             'envelope': row[11]
         }
     return parsed
Exemplo n.º 4
0
 def columns(self):
     '''
     Assemble the OrderedDict of columns that go into this table.

     The first entry is 'area_fips', taken from the 'geoid_cols' input.
     Then every column target of every NAICS code in the 'naics' input is
     added under the slugified key "<key>_<naics_code>_<naics_name>".
     '''
     input_ = self.input()
     cols = OrderedDict([
         ('area_fips', input_['geoid_cols']['county_geoid']),
     ])
     # .items() iterates the same pairs on Python 2 and Python 3, whereas
     # .iteritems() no longer exists on Python 3.
     for naics_code, naics_cols in input_['naics'].items():
         for key, coltarget in naics_cols.items():
             naics_name = NAICS_CODES[naics_code]
             colname = underscore_slugify(u'{}_{}_{}'.format(
                 key, naics_code, naics_name))
             cols[colname] = coltarget
     return cols
Exemplo n.º 5
0
 def columns(self):
     '''
     Assemble the OrderedDict of columns that go into this table.

     The first entry is 'area_fips', taken from the 'geoid_cols' input.
     Then every column target of every NAICS code in the 'naics' input is
     added under the slugified key "<key>_<naics_code>_<naics_name>".
     '''
     input_ = self.input()
     cols = OrderedDict([
         ('area_fips', input_['geoid_cols']['county_geoid']),
     ])
     # .items() iterates the same pairs on Python 2 and Python 3, whereas
     # .iteritems() no longer exists on Python 3.
     for naics_code, naics_cols in input_['naics'].items():
         for key, coltarget in naics_cols.items():
             naics_name = NAICS_CODES[naics_code]
             colname = underscore_slugify(u'{}_{}_{}'.format(
                 key, naics_code, naics_name))
             cols[colname] = coltarget
     return cols
Exemplo n.º 6
0
 def columns(self):
     '''
     Assemble the OrderedDict of columns that go into this table.

     Two county geoid columns come first: 'area_fipssl' and 'area_fipssc'.
     They are looked up in the 'geoid_cols' input under keys built from
     TIGER_YEAR plus the sumlevel / shoreline-clipped column suffixes.
     Then every column target of every NAICS code in the 'naics' input is
     added under the slugified key "<key>_<naics_code>_<naics_name>".
     '''
     input_ = self.input()
     geoid_cols = input_['geoid_cols']

     cols = OrderedDict()
     sumlevel_key = 'county_{}{}'.format(TIGER_YEAR, GEOID_SUMLEVEL_COLUMN)
     cols['area_fipssl'] = geoid_cols[sumlevel_key]
     clipped_key = 'county_{}{}'.format(
         TIGER_YEAR, GEOID_SHORELINECLIPPED_COLUMN)
     cols['area_fipssc'] = geoid_cols[clipped_key]

     for naics_code, naics_cols in input_['naics'].items():
         for key, coltarget in naics_cols.items():
             label = '{}_{}_{}'.format(
                 key, naics_code, NAICS_CODES[naics_code])
             cols[underscore_slugify(label)] = coltarget
     return cols
Exemplo n.º 7
0
 def output(self):
     '''
     Yield a PostgresTarget for every table recorded in OBSTable.

     Each table id is split on its last '.' into a module path (imported
     from under the `tasks` package) and a task id.  The table counts as
     existing when the module exposes an attribute that is an instance of
     Register and whose slugified name is a prefix of the task id.

     Tables with no matching attribute are only printed.  A debugger
     breakpoint used to sit in that branch; it was removed because it
     blocks any non-interactive run.
     '''
     session = current_session()
     for table in session.query(OBSTable):
         schema_path, _, task_id = table.id.rpartition('.')
         modname = 'tasks.' + schema_path
         module = __import__(modname, fromlist=['*'])
         # Matching on the slugified name prefix is a heuristic: the task id
         # also carries slugified parameters, so an exact match is not
         # possible here.
         exists = any(
             isinstance(getattr(module, name), Register) and
             task_id.startswith(underscore_slugify(name))
             for name in dir(module))
         if exists:
             print('{table} exists'.format(table=table))
         else:
             # TODO drop table
             # Single-argument print(...) parses and prints identically as
             # a Python 2 statement and a Python 3 function call.
             print(table)
         yield PostgresTarget(schema='observatory', tablename=table.tablename)
def test_underscore_slugify():
    '''
    A quoted "schema"."ClassName(params)" id must slugify to lowercase
    words joined by single underscores.
    '''
    raw_id = '"path.to.schema"."ClassName(param1=100, param2=foobar)"'
    expected = 'path_to_schema_class_name_param1_100_param2_foobar'
    assert_equals(underscore_slugify(raw_id), expected)
Exemplo n.º 9
0
    def columns(self):
        '''
        Build the establishment, wage and employment columns for this
        task's NAICS code, plus a location-quotient variant of each.

        Notes on the OBSColumn arguments used below:
          id:          must be unique within this module; if left blank it
                       is taken from the column's key in the OrderedDict.
          type:        the PostgreSQL type, generally Numeric for numbers
                       and Text for categories.
          name:        human-readable, used as header in the catalog.
          description: human-readable, used as content in the catalog.
          weight:      ranking of importance, generally between 0 and 10;
                       0 hides the column from the user.
          aggregate:   how the measure was derived ("sum", "median",
                       "average", ...); "sum" lets downstream functions
                       construct estimates for arbitrary geographies.
          tags:        unit, country and catalog section(s) of the measure.

        Every column finally receives the 'qcew' source tag and the
        'no-restrictions' license tag.
        '''
        cols = OrderedDict()
        code = self.naics_code
        name = NAICS_CODES[self.naics_code]
        description = ''

        # Shortcuts to the tags declared as dependencies.
        input_ = self.input()
        units = input_['units']
        sections = input_['sections']
        subsections = input_['subsections']
        parent = input_.get('parent')

        def slug(template):
            # Column id: the template filled with the NAICS code, slugified.
            return underscore_slugify(template.format(code))

        def fill(template):
            # Description text with name / code / description substituted.
            return template.format(
                name=name, code=code, description=description)

        # NOTE(review): several templates below have no space between the
        # first sentence and '{name} is ...', and `description` is always
        # empty here -- confirm whether the rendered text is intended.
        cols['qtrly_estabs'] = OBSColumn(
            id=slug('qtrly_estabs_{}'),
            type='Numeric',
            name='Establishments in {}'.format(name),
            description=fill(
                'Count of establishments in a given quarter in the {name} industry (NAICS {code}).'
                '{name} is {description}.'),
            weight=5,
            aggregate='sum',
            tags=[
                units['businesses'],
                sections['united_states'],
                subsections['commerce_economy'],
            ],
            targets={parent['qtrly_estabs']: DENOMINATOR} if parent else {},
        )

        # The establishment count defined just above is this column's universe.
        cols['avg_wkly_wage'] = OBSColumn(
            id=slug('avg_wkly_wage_{}'),
            type='Numeric',
            name='Average weekly wage for {} establishments'.format(name),
            description=fill(
                'Average weekly wage for a given quarter in the {name} industry (NAICS {code}).'
                '{name} is {description}.'),
            weight=5,
            aggregate='average',
            tags=[
                units['money'],
                sections['united_states'],
                subsections['income'],
            ],
            targets={cols['qtrly_estabs']: UNIVERSE},
        )

        cols['month3_emplvl'] = OBSColumn(
            id=slug('month3_emplvl_{}'),
            type='Numeric',
            name='Employees in {} establishments'.format(name),
            description=fill(
                'Number of employees in the third month of a given quarter with the {name} '
                'industry (NAICS {code}). {name} is {description}.'),
            weight=5,
            aggregate='sum',
            tags=[
                units['people'],
                sections['united_states'],
                subsections['employment'],
            ],
            targets={parent['month3_emplvl']: DENOMINATOR} if parent else {},
        )

        # Location quotients: ratios, so no aggregate and a lower weight.
        cols['lq_avg_wkly_wage'] = OBSColumn(
            id=slug('lq_avg_wkly_wage_{}'),
            type='Numeric',
            name='Average weekly wage location quotient for {} establishments'.format(name),
            description=fill(
                'Location quotient of the average weekly wage for a given quarter relative to '
                'the U.S. (Rounded to the hundredths place) within the {name} industry (NAICS {code}).'
                '{name} is {description}.'),
            weight=3,
            aggregate=None,
            tags=[
                units['ratio'],
                sections['united_states'],
                subsections['income'],
            ],
        )

        cols['lq_qtrly_estabs'] = OBSColumn(
            id=slug('lq_qtrly_estabs_{}'),
            type='Numeric',
            name='Location quotient of establishments in {}'.format(name),
            description=fill(
                'Location quotient of the quarterly establishment count relative to '
                'the U.S. (Rounded to the hundredths place) within the {name} industry (NAICS {code}).'
                '{name} is {description}.'),
            weight=3,
            aggregate=None,
            tags=[
                units['ratio'],
                sections['united_states'],
                subsections['commerce_economy'],
            ],
        )

        cols['lq_month3_emplvl'] = OBSColumn(
            id=slug('lq_month3_emplvl_{}'),
            type='Numeric',
            name='Employment level location quotient in {} establishments'.format(name),
            description=fill(
                'Location quotient of the employment level for the third month of a given quarter '
                'relative to the U.S. (Rounded to the hundredths place) within the {name} '
                'industry (NAICS {code}). {name} is {description}.'),
            weight=3,
            aggregate=None,
            tags=[
                units['ratio'],
                sections['united_states'],
                subsections['employment'],
            ],
        )

        source_tag = input_['source']['qcew']
        license_tag = input_['license']['no-restrictions']
        for col in cols.values():
            col.tags.append(source_tag)
            col.tags.append(license_tag)
        return cols
Exemplo n.º 10
0
 def output(self):
     '''
     Return the LocalTarget for this task's CSV file:
     tmp/<classpath>/<task_id>_<slugified last_time>.csv
     '''
     base = os.path.join('tmp', classpath(self), self.task_id)
     stamp = underscore_slugify(self.last_time)
     return LocalTarget(base + '_' + stamp + '.csv')
Exemplo n.º 11
0
    def columns(self):
        '''
        Build the OBSColumn definitions for this Eurostat table.

        Steps:
          1. Read from the metabase table the dimensions of this table
             (other than 'geo' and 'time') and the cross product of their
             values; each combination becomes one column.
          2. Derive each column's id from the table name (plus the
             combination's values when there are several combinations) and
             its description from the dimension dictionaries in the cache.
          3. Pick the unit tag and aggregate from the unit definition.
          4. Filter the columns through the declarations in
             eurostat_columns.json, then add the source and license tags.
          5. Use columns holding a 'TOTAL' / 'T' dimension value as
             denominators of the columns that share all their other
             dimension values.
          6. Clear the aggregate of columns whose name indicates a
             non-summable measure.

        Returns an OrderedDict mapping column id to OBSColumn.
        '''
        columns = OrderedDict()

        input_ = self.input()

        subsectiontags = input_['subsection']
        unittags = input_['units']
        eu = input_['section']['eu']
        licensing = input_['license']['eurostat-license']
        source = input_['source']['eurostat-source']

        cache = CACHE
        dicttables_path = input_['DICTTables'].path
        metabase_table = input_['metabase'].table
        table_code = self.table_name.lower()

        session = current_session()
        resp = session.execute('''
            SELECT ARRAY_AGG(DISTINCT dimension) FROM {table}
            WHERE dimension NOT IN ('geo', 'time') AND table_code = '{table_code}';
        '''.format(table=metabase_table, table_code=table_code))
        dimensions = resp.fetchone()[0]

        # Self-join the dimension rows once per dimension, so that each
        # result object holds one value for every dimension.
        resp = session.execute('''
            WITH dimensions AS (SELECT value, dimension
            FROM {table}
            WHERE table_code = '{table_code}'
              AND dimension NOT IN ('time', 'geo'))
            SELECT ARRAY_AGG(JSON_BUILD_OBJECT({select}))
            FROM {from_}
            WHERE {where}
        '''.format(table=metabase_table,
                   table_code=table_code,
                   select=', '.join([
                       "'{}', {}.value".format(dim, dim) for dim in dimensions
                   ]),
                   from_=', '.join(
                       ['dimensions {}'.format(dim) for dim in dimensions]),
                   where=' AND '.join([
                       "{}.dimension = '{}'".format(dim, dim)
                       for dim in dimensions
                   ])))
        cross_prod = resp.fetchone()[0]

        tables = cache.get(dicttables_path, 'table_dic.dic')

        table_desc = tables[self.table_name]
        variable_name = table_desc.split('by')[0].strip()

        multiple_variables = len(cross_prod) > 1
        for combo in cross_prod:
            if multiple_variables:
                var_code = underscore_slugify(
                    self.table_name + "_".join(list(combo.values())))
            else:
                var_code = underscore_slugify(self.table_name)

            # "unit" is left out of the description only when there are
            # several variables AND several dimensions; a lone dimension
            # (usually "unit") is always described.
            skip_unit = multiple_variables and len(combo) != 1
            dimdefs = []
            for dimname, dimvalue in combo.items():
                if skip_unit and dimname == 'unit':
                    continue
                dim_dic = cache.get(
                    dicttables_path,
                    '{dimension}.dic'.format(dimension=dimname))
                dimdefs.append(dim_dic[dimvalue])
            description = "{} ".format(variable_name) + "- " + ", ".join(
                [str(x) for x in dimdefs])

            # Best effort: a combination without a 'unit' dimension, or with
            # a unit missing from the dictionary, falls back to the task's
            # own units and a summable aggregate.
            try:
                units = cache.get(dicttables_path, 'unit.dic')
                unitdef = units[combo['unit']].lower()
            except Exception:
                unitdef = None

            # NOTE(review): the substring "per" also matches unit names such
            # as "persons" -- confirm that this is intended.
            if unitdef is not None and ("percentage" in unitdef or
                                        "per" in unitdef or
                                        "rate" in unitdef):
                final_unit_tag = "ratio"
                aggregate = None
            elif unitdef is not None and 'nama_aux_cra' in var_code:
                # This branch used to leave final_unit_tag unset, so the tag
                # lookup below either failed or reused the previous column's
                # value.
                final_unit_tag = self.units
                aggregate = None
            else:
                final_unit_tag = self.units
                aggregate = 'sum'
            tags = [
                eu, subsectiontags[self.subsection], unittags[final_unit_tag]
            ]

            if ('ths' in var_code or 'th_t'
                    in var_code) and '(thousand persons)' not in description:
                description = description + ' (thousands)'

            columns[var_code] = OBSColumn(
                id=var_code,
                name=simplify_description(description),
                type='Numeric',
                description=description,
                weight=1,
                aggregate=aggregate,  # refined again at the end of this method
                targets={},  # filled in once all columns are known
                tags=tags,
                extra=combo,
            )

        columnsFilter = ColumnsDeclarations(
            os.path.join(os.path.dirname(__file__), 'eurostat_columns.json'))
        parameters = '{{"subsection":"{subsection}","units":"{units}","nuts_level":"{nuts_level}","year":"{year}"}}'.format(
            subsection=self.subsection,
            units=self.units,
            nuts_level=self.nuts_level,
            year=self.year)
        columns = columnsFilter.filter_columns(columns, parameters)

        for col in columns.values():
            col.tags.append(source)
            col.tags.append(licensing)

        # Map "every dimension value except the total one" to the id of the
        # column that holds that total.
        targets_dict = {}
        for colname, col in columns.items():
            for dim, value in col.extra.items():
                if value in ('TOTAL', 'T'):
                    others = tuple((key, val)
                                   for key, val in col.extra.items()
                                   if key != dim)
                    targets_dict[others] = colname

        # A total column is the denominator of every other column that
        # shares all of its non-total dimension values.
        for colname, col in columns.items():
            denoms = {}
            for nontotals, code in targets_dict.items():
                if code != colname and all(
                        item in col.extra.items() for item in nontotals):
                    denoms[columns.get(code)] = 'denominator'
            col.targets = denoms

        nonsum = ['proportion', 'average', 'percentage', 'rate', r'%', 'share']
        for col in columns.values():
            if any(word in col.name.lower() for word in nonsum):
                col.aggregate = None
        return columns
Exemplo n.º 12
0
 def output(self):
     '''
     Return the LocalTarget for this task's CSV file:
     tmp/<classpath>/<task_id>_<slugified last_time>.csv
     '''
     path_prefix = os.path.join('tmp', classpath(self), self.task_id)
     time_slug = underscore_slugify(self.last_time)
     filename = path_prefix + '_' + time_slug + '.csv'
     return LocalTarget(filename)
Exemplo n.º 13
0
def test_underscore_slugify():
    '''
    A quoted "schema"."ClassName(params)" id must slugify to lowercase
    words joined by single underscores.
    '''
    task_id = '"path.to.schema"."ClassName(param1=100, param2=foobar)"'
    slugified = underscore_slugify(task_id)
    assert_equals(slugified,
                  'path_to_schema_class_name_param1_100_param2_foobar')
Exemplo n.º 14
0
    def columns(self):
        '''
        Build the wage, establishment and employment columns for this
        task's NAICS code, plus a location-quotient variant of each.

        Notes on the OBSColumn arguments used below:
          id:          must be unique within this module; if left blank it
                       is taken from the column's key in the OrderedDict.
          type:        the PostgreSQL type, generally Numeric for numbers
                       and Text for categories.
          name:        human-readable, used as header in the catalog.
          description: human-readable, used as content in the catalog.
          weight:      ranking of importance, generally between 0 and 10;
                       0 hides the column from the user.
          aggregate:   how the measure was derived ("sum", "median",
                       "average", ...); "sum" lets downstream functions
                       construct estimates for arbitrary geographies.
          tags:        unit, country and catalog section(s) of the measure.
        '''
        cols = OrderedDict()
        code = self.naics_code
        name = NAICS_CODES[self.naics_code]
        description = ''

        # Shortcuts to the tags declared as dependencies.
        input_ = self.input()
        units = input_['units']
        sections = input_['sections']
        subsections = input_['subsections']
        parent = input_.get('parent')

        def slug(template):
            # Column id: the template filled with the NAICS code, slugified.
            return underscore_slugify(template.format(code))

        def fill(template):
            # Description text with name / code / description substituted.
            return template.format(
                name=name, code=code, description=description)

        # NOTE(review): several templates below have no space between the
        # first sentence and '{name} is ...', and `description` is always
        # empty here -- confirm whether the rendered text is intended.
        cols['avg_wkly_wage'] = OBSColumn(
            id=slug(u'avg_wkly_wage_{}'),
            type='Numeric',
            name=u'Average weekly wage for {} establishments'.format(name),
            description=fill(
                u'Average weekly wage for a given quarter in the {name} industry (NAICS {code}).'
                u'{name} is {description}.'),
            weight=5,
            aggregate='average',
            tags=[
                units['money'],
                sections['united_states'],
                subsections['income'],
            ],
        )

        cols['qtrly_estabs'] = OBSColumn(
            id=slug(u'qtrly_estabs_{}'),
            type='Numeric',
            name=u'Establishments in {}'.format(name),
            description=fill(
                u'Count of establishments in a given quarter in the {name} industry (NAICS {code}).'
                u'{name} is {description}.'),
            weight=5,
            aggregate='sum',
            tags=[
                units['businesses'],
                sections['united_states'],
                subsections['commerce_economy'],
            ],
            targets={parent['qtrly_estabs']: DENOMINATOR} if parent else {},
        )

        cols['month3_emplvl'] = OBSColumn(
            id=slug(u'month3_emplvl_{}'),
            type='Numeric',
            name=u'Employees in {} establishments'.format(name),
            description=fill(
                u'Number of employees in the third month of a given quarter with the {name} '
                u'industry (NAICS {code}). {name} is {description}.'),
            weight=5,
            aggregate='sum',
            tags=[
                units['people'],
                sections['united_states'],
                subsections['employment'],
            ],
            targets={parent['month3_emplvl']: DENOMINATOR} if parent else {},
        )

        # Location quotients: ratios, so no aggregate and a lower weight.
        cols['lq_avg_wkly_wage'] = OBSColumn(
            id=slug(u'lq_avg_wkly_wage_{}'),
            type='Numeric',
            name=u'Average weekly wage location quotient for {} establishments'.format(name),
            description=fill(
                u'Location quotient of the average weekly wage for a given quarter relative to '
                u'the U.S. (Rounded to the hundredths place) within the {name} industry (NAICS {code}).'
                u'{name} is {description}.'),
            weight=3,
            aggregate=None,
            tags=[
                units['ratio'],
                sections['united_states'],
                subsections['income'],
            ],
        )

        cols['lq_qtrly_estabs'] = OBSColumn(
            id=slug(u'lq_qtrly_estabs_{}'),
            type='Numeric',
            name=u'Location quotient of establishments in {}'.format(name),
            description=fill(
                u'Location quotient of the quarterly establishment count relative to '
                u'the U.S. (Rounded to the hundredths place) within the {name} industry (NAICS {code}).'
                u'{name} is {description}.'),
            weight=3,
            aggregate=None,
            tags=[
                units['ratio'],
                sections['united_states'],
                subsections['commerce_economy'],
            ],
        )

        cols['lq_month3_emplvl'] = OBSColumn(
            id=slug(u'lq_month3_emplvl_{}'),
            type='Numeric',
            name=u'Employment level location quotient in {} establishments'.format(name),
            description=fill(
                u'Location quotient of the employment level for the third month of a given quarter '
                u'relative to the U.S. (Rounded to the hundredths place) within the {name} '
                u'industry (NAICS {code}). {name} is {description}.'),
            weight=3,
            aggregate=None,
            tags=[
                units['ratio'],
                sections['united_states'],
                subsections['employment'],
            ],
        )
        return cols