def raw_data_for_geos(self, geos): data = {} # group by geo level geos = sorted(geos, key=lambda g: g.geo_level) for geo_level, geos in groupby(geos, lambda g: g.geo_level): geo_codes = [g.geo_code for g in geos] # initial values for geo_code in geo_codes: data['%s-%s' % (geo_level, geo_code)] = { 'estimate': {}, 'error': {} } session = get_session() try: geo_values = None rows = session\ .query(self.model)\ .filter(self.model.c.geo_level == geo_level)\ .filter(self.model.c.geo_code.in_(geo_codes))\ .all() for row in rows: geo_values = data['%s-%s' % (geo_level, row.geo_code)] for col in self.columns.keys(): geo_values['estimate'][col] = getattr(row, col) geo_values['error'][col] = 0 finally: session.close() return data
def _build_model_from_fields(self, fields, table_name, geo_level=None, value_type=Integer): ''' Generates an ORM model for arbitrary census fields by geography. :param list fields: the census fields in `api.models.tables.FIELD_TABLE_FIELDS`, e.g. ['highest educational level', 'type of sector'] :param str table_name: the name of the database table :param str geo_level: one of the geographics levels defined in `api.base.geo_levels`, e.g. 'province', or None if the table doesn't use them :return: ORM model class containing the given fields with type String(128), a 'total' field with type Integer and '%(geo_level)s_code' with type ForeignKey('%(geo_level)s.code') :rtype: Model ''' if table_name in _census_table_models: return _census_table_models[table_name] # We build this array in a particular order, with the geo-related fields first, # to ensure that SQLAlchemy creates the underlying table with the compound primary # key columns in the correct order: # # geo_level, geo_code, field, [field, field, ...] # # This means postgresql will use the first two elements of the compound primary # key -- geo_level and geo_code -- when looking up values for a particular # geograhy. This saves us from having to create a secondary index. table_args = [] if geo_level: # primary/foreign keys table_args.append(Column('%s_code' % geo_level, String(10), ForeignKey('%s.code' % geo_level), primary_key=True, index=True)) else: # will form a compound primary key on the fields, and the geo id table_args.append(Column('geo_level', String(15), nullable=False, primary_key=True)) table_args.append(Column('geo_code', String(10), nullable=False, primary_key=True)) # Now add the columns table_args.extend(Column(field, String(128), primary_key=True) for field in fields) # and the value column table_args.append(Column('total', value_type, nullable=True)) # create the table model class Model(Base): __table__ = Table(table_name, Base.metadata, *table_args) _census_table_models[table_name] = Model # ensure it exists in the DB session = get_session() try: Model.__table__.create(session.get_bind(), checkfirst=True) finally: session.close() return Model
def get_census_profile(geo_code, geo_level, get_params, profile_name=None): session = get_session() try: geo_summary_levels = geo_data.get_summary_geo_info(geo_code, geo_level) data = {} sections = [] selected_sections = [] if get_params.get('topic'): categories = get_params.get('topic').split(',') for cat in categories: selected_sections.extend(SECTIONS[cat]['profiles']) data['selected_topics'] = categories for cat in SECTIONS: sections.extend(SECTIONS[cat]['profiles']) for section in sections: section = section.lower().replace(' ', '_') function_name = 'get_%s_profile' % section if function_name in globals(): func = globals()[function_name] data[section] = func(geo_code, geo_level, session) # get profiles for province and/or country for level, code in geo_summary_levels: # merge summary profile into current geo profile merge_dicts(data[section], func(code, level, session), level) # tweaks to make the data nicer # show X largest groups on their own and group the rest as 'Other' if 'households' in sections: group_remainder( data['households']['roofing_material_distribution'], 5) group_remainder(data['households']['wall_material_distribution'], 5) data['all_sections'] = SECTIONS if (selected_sections == []): selected_sections = sections data['raw_selected_sections'] = selected_sections data['selected_sections'] = [ x.replace(' ', '_').lower() for x in selected_sections ] return data finally: session.close()
def raw_data_for_geos(self, geos): """ Pull raw data for a list of geo models. Returns a dict mapping the geo ids to table data. """ data = {} # group by geo level geos = sorted(geos, key=lambda g: g.geo_level) for geo_level, geos in groupby(geos, lambda g: g.geo_level): model = self.get_model(geo_level) geo_codes = [g.geo_code for g in geos] if self.table_per_level: code = '%s_code' % geo_level else: code = 'geo_code' code_attr = getattr(model, code) # initial values for geo_code in geo_codes: data['%s-%s' % (geo_level, geo_code)] = { 'estimate': {}, 'error': {} } session = get_session() try: geo_values = None fields = [getattr(model, f) for f in self.fields] rows = session\ .query(code_attr, func.sum(model.total).label('total'), *fields)\ .group_by(code_attr, *fields)\ .order_by(code_attr, *fields)\ .filter(code_attr.in_(geo_codes)) if not self.table_per_level: rows = rows.filter(model.geo_level == geo_level) rows = rows.all() def permute(level, field_keys, rows): field = self.fields[level] total = None denominator = 0 for key, rows in groupby(rows, lambda r: getattr(r, field)): new_keys = field_keys + [key] col_id = self.column_id(new_keys) if level + 1 < len(self.fields): value = permute(level + 1, new_keys, rows) else: # we've bottomed out rows = list(rows) if all(row.total is None for row in rows): value = None else: value = sum(row.total or 0 for row in rows) if self.denominator_key and self.denominator_key == key: # this row must be used as the denominator total, # rather than as an entry in the table denominator = value continue if value is not None: total = (total or 0) + value geo_values['estimate'][col_id] = value geo_values['error'][col_id] = 0 if self.denominator_key: total = denominator return total # rows for each geo for geo_code, geo_rows in groupby(rows, lambda r: getattr(r, code)): geo_values = data['%s-%s' % (geo_level, geo_code)] total = permute(0, [], geo_rows) # total if self.total_column: geo_values['estimate'][self.total_column] = total geo_values['error'][self.total_column] = 0 finally: session.close() return data
def setup_columns(self): """ Prepare our columns for use by +as_dict+ and the data API. Each 'column' is actually a unique value for each of this table's +fields+. """ self.build_models() # Each "column" is a unique permutation of the values # of this table's fields, including rollups. The ordering of the # columns is important since columns heirarchical, but are returned # "flat". # # Here's an example. Suppose our table has the following values: # # 5 years, male, 129 # 5 years, female, 131 # 10 years, male, 221 # 10 years, female, 334 # # This would produce the following columns (indented to show nesting) # # 5 years: # male # 5 years: # female # 10 years: # male # 10 years: # female # map from column id to column info. self.columns = OrderedDict() if self.has_total: self.total_column = self.column_id( [self.denominator_key or 'total']) self.columns[self.total_column] = {'name': 'Total', 'indent': 0} else: self.total_column = None session = get_session() try: model = self.get_model('country') fields = [getattr(model, f) for f in self.fields] # get distinct permutations for all fields rows = session\ .query(*fields)\ .order_by(*fields)\ .distinct()\ .all() def permute(indent, field_values, rows): field = self.fields[indent - 1] last = indent == len(self.fields) for val, rows in groupby(rows, lambda r: getattr(r, field)): # this is used to calculate the column id new_values = field_values + [val] col_id = self.column_id(new_values) self.columns[col_id] = { 'name': capitalize(val) + ('' if last else ':'), 'indent': 0 if col_id == self.total_column else indent, } if not last: permute(indent + 1, new_values, rows) permute(1, [], rows) finally: session.close()
def get_stat_data(self, geo_level, geo_code, fields=None, key_order=None, percent=True, total=None, recode=None): """ Get a data dictionary for a place from this table. This fetches the values for each column in this table and returns a data dictionary for those values, with appropriate names and metadata. :param str geo_level: the geographical level :param str geo_code: the geographical code :param str or list fields: the columns to fetch stats for. By default, all columns except geo-related and the total column (if any) are used. :param str key_order: explicit ordering of (recoded) keys, or None for the default order. Default order is the order in +fields+ if given, otherwise it's the natural column order from the DB. :param bool percent: should we calculate percentages, or just include raw values? :param int total: the total value to use for percentages, name of a field, or None to use the sum of all retrieved fields (default) :param dict recode: map from field names to strings to recode column names. Many fields can be recoded to the same thing, their values will be summed. :return: (data-dictionary, total) """ session = get_session() try: if fields is not None and not isinstance(fields, list): fields = [fields] if fields: for f in fields: if f not in self.columns: raise ValueError( "Invalid field/column '%s' for table '%s'. Valid columns are: %s" % (f, self.id, ', '.join(list(self.columns.keys())))) else: fields = list(self.columns.keys()) if self.total_column: fields.remove(self.total_column) recode = recode or {} if recode: # change lambda to dicts if not isinstance(recode, dict): recode = {f: recode(f) for f in fields} # is the total column valid? if isinstance(total, str) and total not in self.columns: raise ValueError( "Total column '%s' isn't one of the columns for table '%s'. Valid columns are: %s" % (total, self.id, ', '.join(list(self.columns.keys())))) # table columns to fetch cols = [self.model.columns[c] for c in fields] if total is not None and isinstance(total, str) and total not in cols: cols.append(total) # do the query. If this returns no data, row is None row = session\ .query(*cols)\ .filter(self.model.c.geo_level == geo_level, self.model.c.geo_code == geo_code)\ .first() if row is None: row = ZeroRow() # what's our denominator? if total is None: # sum of all columns total = sum(getattr(row, f) or 0 for f in fields) elif isinstance(total, str): total = getattr(row, total) # Now build a data dictionary based on the columns in +row+. # Multiple columns may be recoded into one, so we have to # accumulate values as we go. results = OrderedDict() key_order = key_order or fields # default key order is just the list of fields for field in key_order: val = getattr(row, field) or 0 # recode the key for this field, default is to keep it the same key = recode.get(field, field) # set the recoded field name, noting that the key may already # exist if another column recoded to it field_info = results.setdefault( key, {'name': recode.get(field, self.columns[field]['name'])}) if percent: # sum up existing values, if any val = val + field_info.get('numerators', {}).get('this', 0) field_info['values'] = {'this': p(val, total)} field_info['numerators'] = {'this': val} else: # sum up existing values, if any val = val + field_info.get('values', {}).get('this', 0) field_info['values'] = {'this': val} add_metadata(results, self) return results, total finally: session.close()
def setUp(self): self.s = get_session() DATA_TABLES.clear()