import logging


def fetch_query(state, query):
    # TODO: make a better mapping here
    res = None
    try:
        if query.type in ['arcgis', 'json', 'ckan', 'soda']:
            res = request_and_parse(query.url, query.params)
        elif query.type in ['csv']:
            res = request_csv(query.url, query.params,
                              header=query.header, encoding=query.encoding)
        elif query.type in ['html']:
            res = request(query.url, query.params, query.encoding)
        elif query.type in ['html:soup']:
            res = request_soup(query.url, query.params, query.encoding)
        elif query.type in ['pandas', 'xls', 'xlsx']:
            res = request_pandas(query)
        else:
            # the default is to pass the URL through as-is
            # TODO: it's used for something, but it's not great
            res = query.url
    except Exception:
        logging.error("{}: Failed to fetch {}".format(state, query.url),
                      exc_info=True)
        raise
    return res
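# Usage sketch (hypothetical): fetch_query only needs an object exposing
# `type`, `url`, `params`, `header`, and `encoding`. The Query namedtuple
# below is an illustration, not the repo's real query class, and the URL
# is a placeholder; the request_* helpers are the repo's own fetchers.
from collections import namedtuple

Query = namedtuple('Query', ['type', 'url', 'params', 'header', 'encoding'])

csv_query = Query(type='csv',
                  url='https://example.com/daily.csv',  # placeholder
                  params={}, header=True, encoding='utf-8')

# The 'csv' type tag dispatches this to request_csv.
result = fetch_query('RI', csv_query)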
import sys
from datetime import datetime, timedelta, timezone

import pandas as pd

# repo-local helpers (build_sources, request_pandas, internal_client,
# Fields, RI) are imported from the project's own modules


def main(cfg):
    print(cfg.pretty(resolve=True))

    sources = build_sources(cfg.dataset.sources_file, cfg.dataset.mapping_file)
    ri_source = sources[RI]
    queries = ri_source.queries
    mapping = ri_source.mapping

    # we need exactly one pandas query to know which dataset to backfill
    qs = [q for q in queries if q.type == 'pandas']
    if len(qs) != 1:
        print("Don't know which query to choose",
              [q.get('desc') for q in qs])
        sys.exit(1)

    df = request_pandas(qs[0])
    df = df.rename(columns=mapping)
    df['DATE_INDEX'] = pd.to_datetime(df['DATE'])
    df = df.set_index('DATE_INDEX').sort_index()
    df = df[[v for k, v in mapping.items() if k != '__strptime']]

    # We need the last C days, and then we need to match what would fit
    # Sat-Sun, shifted by 1 day
    df = df.tail(cfg.backfill.skip + cfg.backfill.fill)

    # verify that the dates make sense: we're looking at the most recent day
    yesterday = datetime.now().date() - timedelta(days=1)
    assert df.index[-1].date() == yesterday, \
        "Expecting last date to be yesterday, got %r" % df.index[-1].date()
    assert df.index[-1].day_name() == cfg.backfill.DOW, \
        "Expecting backfill day to be {}, got {}".format(
            cfg.backfill.DOW, df.index[-1].day_name())

    # Prepare the request
    if 'POSITIVE' not in df.columns:
        df['POSITIVE'] = df['CONFIRMED']
    if 'STATE' not in df.columns:
        df['STATE'] = RI

    # shift the dates forward, then format them for the API
    shifted = df.index.shift(periods=cfg.backfill.shift, freq='d')
    df['DATE'] = shifted.strftime(cfg.output_date_format)
    df['lastUpdateTime'] = datetime.now(tz=timezone.utc).isoformat()
    print(df)

    # rename columns
    columns_renames = {k: v.value for k, v in Fields.__members__.items()}
    # one last update
    columns_renames['TOTAL'] = 'totalTestsPeopleViral'
    data = (df.rename(columns=columns_renames)
              .head(cfg.backfill.fill)
              .dropna(axis=1)
              .to_dict(orient='records'))

    request_content = internal_client.build_edit_request(
        data, username=cfg.api.username)
    if cfg.api.url:
        internal_client.api_call(request_content, url=cfg.api.url,
                                 token=cfg.creds.token,
                                 staging=cfg.api.staging)
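# A self-contained sketch of the subtle step above: the date index is
# shifted forward by cfg.backfill.shift days *before* being serialized.
# The values below (shift/skip/fill, the date format) are stand-ins for
# the real config, and the frame is toy data.
import pandas as pd

toy = pd.DataFrame({'POSITIVE': [100, 110, 120]},
                   index=pd.date_range('2021-03-05', periods=3, freq='D'))
shift, skip, fill = 1, 1, 2

toy = toy.tail(skip + fill)                       # window we may report
shifted = toy.index.shift(periods=shift, freq='d')
toy['DATE'] = shifted.strftime('%Y-%m-%d')        # stand-in date format
print(toy.head(fill).to_dict(orient='records'))
# [{'POSITIVE': 100, 'DATE': '2021-03-06'},
#  {'POSITIVE': 110, 'DATE': '2021-03-07'}]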
def fetch_state(self, state):
    '''Fetch data for a single state, returning a tuple of
    (fetched_result, parsed_data).
    If there's no query for the state: return (None, {})
    '''
    logging.debug("Fetching: %s", state)
    res = None
    queries = self.sources.queries_for(state)
    if not queries:
        return res, {}

    results = []
    mapping = self.sources.mapping_for(state)
    for query in queries:
        # TODO: make a better mapping here
        try:
            if query['type'] in ['arcgis', 'json', 'ckan', 'soda']:
                res = request_and_parse(query['url'], query['params'])
            elif query['type'] in ['csv']:
                res = request_csv(query['url'], query['params'],
                                  header=query.get('header', True),
                                  encoding=query.get('encoding'))
            elif query['type'] in ['html']:
                res = request(query['url'], query['params'])
            elif query['type'] in ['html:soup']:
                res = request_soup(query['url'], query['params'])
            elif query['type'] in ['pandas', 'xls', 'xlsx']:
                res = request_pandas(query)
            results.append(res)
        except Exception:
            logging.error("{}: Failed to fetch {}".format(state, query['url']),
                          exc_info=True)
            raise

    processed_results = []
    if state in self.extras:
        processed_results = self.extras[state](results, mapping)
    else:
        for i, result in enumerate(results):
            if queries[i].get('type') == 'arcgis':
                partial = extract_arcgis_attributes(result, mapping, state)
            else:
                # This is a guess; we're getting an unknown top-level object
                partial = extract_attributes(
                    result, queries[i].get('data_path', []), mapping, state)
            processed_results.append(partial)
    data = self._aggregate_state_results(state, processed_results, mapping)
    return results, data
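# Illustrative only: one plausible shape for the data_path walk performed
# by extract_attributes. The repo's real implementation lives elsewhere
# and may differ; this sketch is suffixed _sketch to make that explicit.
def extract_attributes_sketch(result, data_path, mapping, state):
    # descend into the parsed payload, e.g. data_path=['data']
    for step in data_path:
        result = result[step]
    # keep only the mapped fields, renamed to canonical column names
    extracted = {'STATE': state}
    for source_key, target_key in mapping.items():
        if source_key in result:
            extracted[target_key] = result[source_key]
    return extracted

payload = {'data': {'positives': 42, 'total': 500}}
print(extract_attributes_sketch(
    payload, ['data'], {'positives': 'POSITIVE', 'total': 'TOTAL'}, 'RI'))
# {'STATE': 'RI', 'POSITIVE': 42, 'TOTAL': 500}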