예제 #1
0
    def raw_data_for_geos(self, geos):
        data = {}

        # group by geo level
        geos = sorted(geos, key=lambda g: g.geo_level)
        for geo_level, geos in groupby(geos, lambda g: g.geo_level):
            geo_codes = [g.geo_code for g in geos]

            # initial values
            for geo_code in geo_codes:
                data['%s-%s' % (geo_level, geo_code)] = {
                    'estimate': {},
                    'error': {}
                }

            session = get_session()
            try:
                geo_values = None
                rows = session\
                    .query(self.model)\
                    .filter(self.model.c.geo_level == geo_level)\
                    .filter(self.model.c.geo_code.in_(geo_codes))\
                    .all()

                for row in rows:
                    geo_values = data['%s-%s' % (geo_level, row.geo_code)]

                    for col in self.columns.keys():
                        geo_values['estimate'][col] = getattr(row, col)
                        geo_values['error'][col] = 0

            finally:
                session.close()

        return data
예제 #2
0
    def _build_model_from_fields(self, fields, table_name, geo_level=None, value_type=Integer):
        '''
        Generates an ORM model for arbitrary census fields by geography.

        :param list fields: the census fields in `api.models.tables.FIELD_TABLE_FIELDS`, e.g. ['highest educational level', 'type of sector']
        :param str table_name: the name of the database table
        :param str geo_level: one of the geographics levels defined in `api.base.geo_levels`, e.g. 'province', or None if the table doesn't use them
        :return: ORM model class containing the given fields with type String(128), a 'total' field
        with type Integer and '%(geo_level)s_code' with type ForeignKey('%(geo_level)s.code')
        :rtype: Model
        '''
        if table_name in _census_table_models:
            return _census_table_models[table_name]

        # We build this array in a particular order, with the geo-related fields first,
        # to ensure that SQLAlchemy creates the underlying table with the compound primary
        # key columns in the correct order:
        #
        #  geo_level, geo_code, field, [field, field, ...]
        #
        # This means postgresql will use the first two elements of the compound primary
        # key -- geo_level and geo_code -- when looking up values for a particular
        # geograhy. This saves us from having to create a secondary index.
        table_args = []

        if geo_level:
            # primary/foreign keys
            table_args.append(Column('%s_code' % geo_level, String(10),
                                     ForeignKey('%s.code' % geo_level),
                                     primary_key=True, index=True))
        else:
            # will form a compound primary key on the fields, and the geo id
            table_args.append(Column('geo_level', String(15), nullable=False, primary_key=True))
            table_args.append(Column('geo_code', String(10), nullable=False, primary_key=True))

        # Now add the columns
        table_args.extend(Column(field, String(128), primary_key=True) for field in fields)
        # and the value column
        table_args.append(Column('total', value_type, nullable=True))

        # create the table model
        class Model(Base):
            __table__ = Table(table_name, Base.metadata, *table_args)
        _census_table_models[table_name] = Model

        # ensure it exists in the DB
        session = get_session()
        try:
            Model.__table__.create(session.get_bind(), checkfirst=True)
        finally:
            session.close()

        return Model
예제 #3
0
def get_census_profile(geo_code, geo_level, get_params, profile_name=None):
    session = get_session()
    try:
        geo_summary_levels = geo_data.get_summary_geo_info(geo_code, geo_level)
        data = {}
        sections = []
        selected_sections = []
        if get_params.get('topic'):
            categories = get_params.get('topic').split(',')
            for cat in categories:
                selected_sections.extend(SECTIONS[cat]['profiles'])
            data['selected_topics'] = categories

        for cat in SECTIONS:
            sections.extend(SECTIONS[cat]['profiles'])

        for section in sections:
            section = section.lower().replace(' ', '_')
            function_name = 'get_%s_profile' % section
            if function_name in globals():
                func = globals()[function_name]
                data[section] = func(geo_code, geo_level, session)

                # get profiles for province and/or country
                for level, code in geo_summary_levels:
                    # merge summary profile into current geo profile
                    merge_dicts(data[section], func(code, level, session),
                                level)

        # tweaks to make the data nicer
        # show X largest groups on their own and group the rest as 'Other'
        if 'households' in sections:
            group_remainder(
                data['households']['roofing_material_distribution'], 5)
            group_remainder(data['households']['wall_material_distribution'],
                            5)

        data['all_sections'] = SECTIONS
        if (selected_sections == []):
            selected_sections = sections
        data['raw_selected_sections'] = selected_sections
        data['selected_sections'] = [
            x.replace(' ', '_').lower() for x in selected_sections
        ]
        return data

    finally:
        session.close()
예제 #4
0
    def raw_data_for_geos(self, geos):
        """
        Pull raw data for a list of geo models.

        Returns a dict mapping the geo ids to table data.
        """
        data = {}

        # group by geo level
        geos = sorted(geos, key=lambda g: g.geo_level)
        for geo_level, geos in groupby(geos, lambda g: g.geo_level):
            model = self.get_model(geo_level)
            geo_codes = [g.geo_code for g in geos]

            if self.table_per_level:
                code = '%s_code' % geo_level
            else:
                code = 'geo_code'
            code_attr = getattr(model, code)

            # initial values
            for geo_code in geo_codes:
                data['%s-%s' % (geo_level, geo_code)] = {
                    'estimate': {},
                    'error': {}
                }

            session = get_session()
            try:
                geo_values = None
                fields = [getattr(model, f) for f in self.fields]
                rows = session\
                    .query(code_attr,
                           func.sum(model.total).label('total'),
                           *fields)\
                    .group_by(code_attr, *fields)\
                    .order_by(code_attr, *fields)\
                    .filter(code_attr.in_(geo_codes))

                if not self.table_per_level:
                    rows = rows.filter(model.geo_level == geo_level)

                rows = rows.all()

                def permute(level, field_keys, rows):
                    field = self.fields[level]
                    total = None
                    denominator = 0

                    for key, rows in groupby(rows,
                                             lambda r: getattr(r, field)):
                        new_keys = field_keys + [key]
                        col_id = self.column_id(new_keys)

                        if level + 1 < len(self.fields):
                            value = permute(level + 1, new_keys, rows)
                        else:
                            # we've bottomed out

                            rows = list(rows)
                            if all(row.total is None for row in rows):
                                value = None
                            else:
                                value = sum(row.total or 0 for row in rows)

                            if self.denominator_key and self.denominator_key == key:
                                # this row must be used as the denominator total,
                                # rather than as an entry in the table
                                denominator = value
                                continue

                        if value is not None:
                            total = (total or 0) + value
                        geo_values['estimate'][col_id] = value
                        geo_values['error'][col_id] = 0

                    if self.denominator_key:
                        total = denominator

                    return total

                # rows for each geo
                for geo_code, geo_rows in groupby(rows,
                                                  lambda r: getattr(r, code)):
                    geo_values = data['%s-%s' % (geo_level, geo_code)]
                    total = permute(0, [], geo_rows)

                    # total
                    if self.total_column:
                        geo_values['estimate'][self.total_column] = total
                        geo_values['error'][self.total_column] = 0

            finally:
                session.close()

        return data
예제 #5
0
    def setup_columns(self):
        """
        Prepare our columns for use by +as_dict+ and the data API.

        Each 'column' is actually a unique value for each of this table's +fields+.
        """
        self.build_models()

        # Each "column" is a unique permutation of the values
        # of this table's fields, including rollups. The ordering of the
        # columns is important since columns heirarchical, but are returned
        # "flat".
        #
        # Here's an example. Suppose our table has the following values:
        #
        #     5 years, male, 129
        #     5 years, female, 131
        #     10 years, male, 221
        #     10 years, female, 334
        #
        # This would produce the following columns (indented to show nesting)
        #
        # 5 years:
        #   male
        # 5 years:
        #   female
        # 10 years:
        #   male
        # 10 years:
        #   female

        # map from column id to column info.
        self.columns = OrderedDict()

        if self.has_total:
            self.total_column = self.column_id(
                [self.denominator_key or 'total'])
            self.columns[self.total_column] = {'name': 'Total', 'indent': 0}
        else:
            self.total_column = None

        session = get_session()
        try:
            model = self.get_model('country')
            fields = [getattr(model, f) for f in self.fields]

            # get distinct permutations for all fields
            rows = session\
                .query(*fields)\
                .order_by(*fields)\
                .distinct()\
                .all()

            def permute(indent, field_values, rows):
                field = self.fields[indent - 1]
                last = indent == len(self.fields)

                for val, rows in groupby(rows, lambda r: getattr(r, field)):
                    # this is used to calculate the column id
                    new_values = field_values + [val]
                    col_id = self.column_id(new_values)

                    self.columns[col_id] = {
                        'name': capitalize(val) + ('' if last else ':'),
                        'indent': 0 if col_id == self.total_column else indent,
                    }

                    if not last:
                        permute(indent + 1, new_values, rows)

            permute(1, [], rows)
        finally:
            session.close()
예제 #6
0
    def get_stat_data(self,
                      geo_level,
                      geo_code,
                      fields=None,
                      key_order=None,
                      percent=True,
                      total=None,
                      recode=None):
        """ Get a data dictionary for a place from this table.

        This fetches the values for each column in this table and returns a data
        dictionary for those values, with appropriate names and metadata.

        :param str geo_level: the geographical level
        :param str geo_code: the geographical code
        :param str or list fields: the columns to fetch stats for. By default, all columns except
                                   geo-related and the total column (if any) are used.
        :param str key_order: explicit ordering of (recoded) keys, or None for the default order.
                              Default order is the order in +fields+ if given, otherwise
                              it's the natural column order from the DB.
        :param bool percent: should we calculate percentages, or just include raw values?
        :param int total: the total value to use for percentages, name of a
                          field, or None to use the sum of all retrieved fields (default)
        :param dict recode: map from field names to strings to recode column names. Many fields
                            can be recoded to the same thing, their values will be summed.

        :return: (data-dictionary, total)
        """

        session = get_session()
        try:
            if fields is not None and not isinstance(fields, list):
                fields = [fields]
            if fields:
                for f in fields:
                    if f not in self.columns:
                        raise ValueError(
                            "Invalid field/column '%s' for table '%s'. Valid columns are: %s"
                            %
                            (f, self.id, ', '.join(list(self.columns.keys()))))
            else:
                fields = list(self.columns.keys())
                if self.total_column:
                    fields.remove(self.total_column)

            recode = recode or {}
            if recode:
                # change lambda to dicts
                if not isinstance(recode, dict):
                    recode = {f: recode(f) for f in fields}

            # is the total column valid?
            if isinstance(total, str) and total not in self.columns:
                raise ValueError(
                    "Total column '%s' isn't one of the columns for table '%s'. Valid columns are: %s"
                    % (total, self.id, ', '.join(list(self.columns.keys()))))

            # table columns to fetch
            cols = [self.model.columns[c] for c in fields]

            if total is not None and isinstance(total,
                                                str) and total not in cols:
                cols.append(total)

            # do the query. If this returns no data, row is None
            row = session\
                .query(*cols)\
                .filter(self.model.c.geo_level == geo_level,
                        self.model.c.geo_code == geo_code)\
                .first()

            if row is None:
                row = ZeroRow()

            # what's our denominator?
            if total is None:
                # sum of all columns
                total = sum(getattr(row, f) or 0 for f in fields)
            elif isinstance(total, str):
                total = getattr(row, total)

            # Now build a data dictionary based on the columns in +row+.
            # Multiple columns may be recoded into one, so we have to
            # accumulate values as we go.
            results = OrderedDict()

            key_order = key_order or fields  # default key order is just the list of fields

            for field in key_order:
                val = getattr(row, field) or 0

                # recode the key for this field, default is to keep it the same
                key = recode.get(field, field)

                # set the recoded field name, noting that the key may already
                # exist if another column recoded to it
                field_info = results.setdefault(
                    key,
                    {'name': recode.get(field, self.columns[field]['name'])})

                if percent:
                    # sum up existing values, if any
                    val = val + field_info.get('numerators', {}).get('this', 0)
                    field_info['values'] = {'this': p(val, total)}
                    field_info['numerators'] = {'this': val}
                else:
                    # sum up existing values, if any
                    val = val + field_info.get('values', {}).get('this', 0)
                    field_info['values'] = {'this': val}

            add_metadata(results, self)
            return results, total
        finally:
            session.close()
예제 #7
0
 def setUp(self):
     self.s = get_session()
     DATA_TABLES.clear()