def __init__(self, **kwargs):
    allowed = ["vars_needed", "vars_and_vals", "values", "shows_and_levels",
               "force", "where", "order", "sort", "limit", "exclude",
               "auto_crosswalk", "display_names", "offset"]
    self._year = None
    self.auto_crosswalk = False
    self.display_names = False
    self.offset = None
    self.vars_and_vals = {}
    for keyword, value in kwargs.items():
        if keyword in allowed:
            setattr(self, keyword, value)
        else:
            raise DataUSAException("Invalid ApiObject attribute")
    # coerce numeric parameters that arrive as strings
    if self.limit:
        self.limit = int(self.limit)
    if self.offset:
        self.offset = int(self.offset)
    self.subs = {}
    self.table_list = []
    self.warnings = []
    if self.exclude:
        self.exclude = self.exclude.split(",")
    if hasattr(self, "year") and self.year != ALL:
        self._year = self.year
    self.force_schema = None
    # normalize boolean flags passed as query-string values
    self.auto_crosswalk = self.auto_crosswalk in [True, 'true', '1']
    self.display_names = self.display_names in ['true', '1']
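# A minimal construction sketch with hypothetical parameter values (assumes the
# enclosing class is ApiObject and that limit/exclude are always supplied, even
# if only as None, since the normalization above reads them unconditionally):
#
#   api_obj = ApiObject(vars_needed=["adult_obesity"], limit="10",
#                       exclude="geo,year", auto_crosswalk="true",
#                       display_names="1")
#   api_obj.limit           # -> 10 (coerced to int)
#   api_obj.exclude         # -> ["geo", "year"]
#   api_obj.auto_crosswalk  # -> True
#   api_obj.display_names   # -> True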
def required_tables(cls, api_obj):
    '''Return the smallest set of joinable tables that together cover every
    variable requested (or referenced in filters/ordering) by the ApiObject.'''
    vars_needed = api_obj.vars_needed + api_obj.where_vars()
    if api_obj.order and api_obj.order in cls.possible_variables:
        vars_needed = vars_needed + [api_obj.order]
    # the set of variables that still need to be answered by some table
    universe = set(vars_needed)
    tables_to_use = []
    table_cols = []

    while universe:
        # first find the tables with the biggest overlap
        candidates = cls.list_partial_tables(universe, api_obj)
        top_choices = sorted(candidates.items(), key=operator.itemgetter(1),
                             reverse=True)
        # take the table with the biggest overlap
        tbl, overlap = top_choices.pop(0)
        # ensure the tables are joinable; for now that means having
        # at least one column with the same name
        if tables_to_use:
            while not set(table_cols).intersection([str(c.key) for c in get_columns(tbl)]):
                if top_choices:
                    tbl, overlap = top_choices.pop(0)
                else:
                    raise DataUSAException("can't join tables!")
        tables_to_use.append(tbl)
        tmp_cols = [str(c.key) for c in get_columns(tbl)]
        table_cols += tmp_cols
        # remove the acquired columns from the universe
        universe = universe - set(tmp_cols)
    return tables_to_use
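# Self-contained sketch of the greedy cover strategy used above, with
# hypothetical table/column data rather than real registered models (the
# joinability check is omitted here): repeatedly pick the table that overlaps
# the most still-unresolved variables until none remain.
def greedy_cover(universe, table_columns):
    """table_columns maps a table name to the set of columns it provides."""
    universe = set(universe)
    chosen = []
    while universe:
        best = max(table_columns, key=lambda t: len(universe & table_columns[t]))
        if not universe & table_columns[best]:
            raise ValueError("no table covers: {}".format(universe))
        chosen.append(best)
        universe -= table_columns[best]
    return chosen

# greedy_cover({"geo", "income", "pop"},
#              {"acs_ygi": {"geo", "income"}, "acs_ygp": {"geo", "pop"}})
# -> ["acs_ygi", "acs_ygp"]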
def handle_ordering(tables, api_obj):
    '''Process sort and order parameters from the API'''
    sort = "desc" if api_obj.sort == "desc" else "asc"
    if api_obj.order not in TableManager.possible_variables:
        raise DataUSAException("Bad order parameter", api_obj.order)
    my_col = get_column_from_tables(tables, api_obj.order)
    sort_expr = getattr(my_col, sort)()
    return sort_expr.nullslast()
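# Hypothetical, self-contained illustration of the same ordering construct,
# built on a bare SQLAlchemy column instead of a column resolved from the
# joined tables (the column name here is made up):
from sqlalchemy import column

example_sort = getattr(column("adult_obesity"), "desc")().nullslast()
# compiles to roughly: adult_obesity DESC NULLS LAST
# and would be applied with query.order_by(example_sort)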
def query(table, api_obj, stream=False):
    vars_and_vals = api_obj.vars_and_vals
    shows_and_levels = api_obj.shows_and_levels
    values = api_obj.values
    exclude = api_obj.exclude

    filters = process_value_filters(table, vars_and_vals, api_obj)
    filters += where_filters(table, api_obj.where)
    filters += sumlevel_filtering(table, api_obj)

    if values:
        # keep the primary key columns plus the requested value columns
        pk = [col for col in table.__table__.columns
              if col.primary_key and col.key not in values]
        cols = pk + values
    else:
        cols = get_columns(table)

    if exclude:
        # columns may be plain names or column objects; compare accordingly
        cols = [col for col in cols
                if (col not in exclude if isinstance(col, basestring)
                    else col.key not in exclude)]

    qry = table.query
    if hasattr(table, "crosswalk_join"):
        qry = table.crosswalk_join(qry)

    if stream:
        qry, cols = use_attr_names(table, qry, cols)
    qry = qry.with_entities(*cols)

    if hasattr(table, "JOINED_FILTER"):
        qry, filters = handle_join(qry, filters, table, api_obj)

    qry = qry.filter(*filters)

    if api_obj.order:
        sort = "desc" if api_obj.sort == "desc" else "asc"
        if api_obj.order not in TableManager.possible_variables:
            # 'abs(pct_change)' is explicitly allowed; anything else is rejected
            if api_obj.order != 'abs(pct_change)':
                raise DataUSAException("Bad order parameter", api_obj.order)
        sort_stmt = text("{} {} NULLS LAST".format(api_obj.order, sort))
        qry = qry.order_by(sort_stmt)

    if api_obj.limit:
        qry = qry.limit(api_obj.limit)

    if stream:
        return stream_format(table, cols, qry, api_obj)
    return simple_format(table, cols, qry, api_obj)
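# Hedged sketch of the raw ORDER BY clause built above once the order parameter
# has passed validation (shown for the special-cased 'abs(pct_change)'):
from sqlalchemy import text

order, sort = "abs(pct_change)", "desc"
sort_stmt = text("{} {} NULLS LAST".format(order, sort))
# qry.order_by(sort_stmt) renders: ORDER BY abs(pct_change) desc NULLS LAST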
def api_join_view(csv=None):
    api_obj = build_api_obj(default_limit=500)
    if api_obj.limit and api_obj.limit > 80000:
        raise DataUSAException("Limit parameter must be less than 80,000")
    tables = manager.required_tables(api_obj)
    data = join_api.joinable_query(tables, api_obj, manager.table_years,
                                   csv_format=csv)
    return data
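# Hypothetical request flow for this view (the endpoint path and parameter
# names below are assumptions, not confirmed by this excerpt):
#
#   GET /api/join/?required=adult_obesity,income&limit=100
#   -> build_api_obj(default_limit=500) parses query args into an ApiObject
#   -> manager.required_tables(api_obj) resolves the minimal joinable table set
#   -> join_api.joinable_query(...) builds, runs, and formats the joined query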
def all_tables(cls, api_obj):
    vars_needed = api_obj.vars_needed
    # include the order column so the chosen table can also sort the results
    if api_obj.order and api_obj.order in cls.possible_variables:
        vars_needed = vars_needed + [api_obj.order]
    candidates = []
    for table in registered_models:
        if TableManager.table_has_cols(table, vars_needed):
            if TableManager.table_can_show(table, api_obj):
                candidates.append(table)
    # prefer tables with the smallest median margin of error
    candidates = sorted(candidates, key=attrgetter('median_moe'))
    if not candidates:
        raise DataUSAException("No tables can match the specified query.")
    return candidates
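# Self-contained sketch of the ranking step above, using hypothetical stand-in
# tables rather than real registered models: candidates are sorted so the table
# with the smallest median margin of error is preferred.
from collections import namedtuple
from operator import attrgetter

FakeTable = namedtuple("FakeTable", ["name", "median_moe"])
matches = [FakeTable("acs_5yr", 2.4), FakeTable("acs_1yr", 1.1)]
ranked = sorted(matches, key=attrgetter("median_moe"))
# -> acs_1yr (moe 1.1) is tried before acs_5yr (moe 2.4)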
def list_partial_tables(cls, vars_needed, api_obj):
    candidates = {}
    for table in registered_models:
        overlap_size = TableManager.table_has_some_cols(table, vars_needed)
        if overlap_size > 0:
            if TableManager.table_can_show(table, api_obj):
                # to break ties, use median moe as a penalty and subtract it,
                # since larger scores will be chosen first
                penalty = (1 - (1.0 / table.median_moe)) if table.median_moe > 0 else 0
                candidates[table] = overlap_size - penalty
    if not candidates:
        raise DataUSAException("No tables can match the specified query.")
    return candidates
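# Worked example of the tie-break penalty with hypothetical median_moe values:
# the penalty always falls in [0, 1), so it separates tables with equal overlap
# but can never outweigh a full extra column of coverage.
overlap_size = 3
for median_moe in (1.0, 2.0, 10.0):
    penalty = (1 - (1.0 / median_moe)) if median_moe > 0 else 0
    print(median_moe, overlap_size - penalty)
# 1.0  -> 3.0
# 2.0  -> 2.5
# 10.0 -> 2.1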
def __init__(self, **kwargs):
    allowed = ["vars_needed", "vars_and_vals", "values", "shows_and_levels",
               "force", "where", "order", "sort", "limit", "exclude"]
    self._year = None
    for keyword, value in kwargs.items():
        if keyword in allowed:
            setattr(self, keyword, value)
        else:
            raise DataUSAException("Invalid ApiObject attribute")
    if self.limit:
        self.limit = int(self.limit)
    self.subs = {}
    self.table_list = []
    if self.exclude:
        self.exclude = self.exclude.split(",")
    if hasattr(self, "year") and self.year != ALL:
        self._year = self.year
    self.force_schema = None