def _fetch_data(self, dataset, query=None):
    yield Result(127, {
        "municipality": "Robertsfors kommun",
    })
    yield Result(17, {
        "municipality": "Region Gotland",
    })
def _fetch_data(self, dataset, query=None):
    if dataset.id == "Dataset_1":
        yield Result(127, {
            "date": "2017-08-10",
            "municipality": "Robertsfors kommun",
        })
    elif dataset.id == "Dataset_2":
        yield Result(12, {
            "date": "2017-02-06",
            "municipality": "Umeå kommun",
        })
        yield Result(130, {
            "date": "2017-02-07",
            "municipality": "Robertsfors kommun",
        })
def _fetch_data(self, dataset, query): """Make the actual query. :param query: """ # get latest month by default if query is None: query = "latest_month" months = [] if query == "latest_month": months.append(dataset.latest_month) elif isinstance(query, dict): since = query.get("from", "2000-01") to = query.get("to", datetime.now().strftime("%Y-%m")) months = [ x for x in dataset.dimensions["month"].allowed_values if since <= x.label and x.label <= to ] for month in months: self.log.info("Fecthing data from {}".format(month.label)) url = BASE_URL + "/BankruptcyStatisticsCategoryPage/GetStatistics" html = self._post_request(url, {"selectedvalue": month.value}) for res in parse_result_page(html): value = res["value"] res.pop("value") res["month"] = month.label yield Result(value, res)
def _parse_result_page(self, url, payload, only_region=False, region=None):
    """ Get data from a result page.

    :param url: url to query
    :param payload: payload to pass
    :return: a dictlist with data
    """
    data = []
    if only_region:
        html = self.scraper._get_html(url)
    else:
        try:
            html = self.scraper._post_html(url, payload=payload)
        except HTTPError as e:
            if e.response.status_code == 500:
                self.scraper.log.warning(
                    u"Unable to get {} with {}".format(url, payload))
                return []
            raise

    current_selection = self._get_current_selection(html)

    table = Datatable(html)
    data = []
    _region = None
    for row in table.data:
        region_or_unit_id, region_or_unit_label = row["region_or_unit"]
        region = self.regions.get_by_label(region_or_unit_label)
        if region:
            row["region"] = region.label
            row["unit"] = None
            row["unit_id"] = None
            _region = region.label
        else:
            assert region_or_unit_label is not None
            assert region_or_unit_id is not None

            row["region"] = _region
            row["unit"] = region_or_unit_label
            row["unit_id"] = region_or_unit_id

        value = row["value"]
        row.pop("value", None)
        row.pop("region_or_unit", None)

        for dim in self.dimensions:
            if dim.id not in row:
                row[dim.id] = current_selection[dim.id][1]  # gets label

        data.append(Result(value, row))

    return data
def _fetch_data(self, dataset, query): """Make the actual query. The only queryable dimensions are period. >>> dataset.fetch({"period": "2016"}) >>> dataset.fetch({"period": ["2015", "2016"]}) >>> dataset.fetch({"period": "*"}) # Get all periods """ default_query = { "period": dataset.latest_period[1], } if query is None: query = {} default_query.update(query) query = default_query allowed_query_dims = ["period"] for dim in query.keys(): if dim not in allowed_query_dims: msg = "Querying on {} is not implemented yet".format(dim) raise NotImplementedError(msg) if query["period"] == "*": periods = [x[1] for x in dataset.periods] else: if not isinstance(query["period"], list): periods = [query["period"]] else: periods = query["period"] # Get the period id's needed to build url periods = [dataset._get_period_id(x) for x in periods] for period in periods: # Hack: For datasets with multiple uttag we get the latest # This should rather be a part of query if dataset.has_uttag: uttag = dataset.get_latest_uttag(period)[0] else: uttag = None url = dataset.get_xml_url(period, uttag) xml_data = self._get_html(url) for datapoint in get_data_from_xml(xml_data): value = datapoint["value"] del datapoint["value"] yield Result(value, datapoint)
def _fetch_data(self, dataset, query=None):
    files = [(y, m) for y in query['years'] for m in query['months']]
    frames = []

    # Download and clean every monthly Excel file
    for file in files:
        year, month = file
        url = self.BASE_URL.format(year=year, month=MONTHS[month])
        frame = self._clean_data(pd.read_excel(url), year, month)
        frames.append(frame)

    # Yield individual rows of type Result from the dataframe
    raw_data = pd.concat(frames)
    for i, row in raw_data.iterrows():
        val = row.pop('value')
        yield Result(val, json.loads(row.to_json()))
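# A minimal, self-contained sketch of the dataframe-to-Result pattern above:
# pop the value column from each row and serialise the remaining columns to a
# plain dict via JSON. The tiny dataframe and the print stand in for the real
# Excel download and the Result class.
import json
import pandas as pd

raw_data = pd.DataFrame([
    {"year": 2017, "month": "jan", "value": 12},
    {"year": 2017, "month": "feb", "value": 34},
])
for _, row in raw_data.iterrows():
    val = row.pop("value")
    dimensions = json.loads(row.to_json())
    print(val, dimensions)  # 12 {'year': 2017, 'month': 'jan'} ...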
def _fetch_data(self, dataset, query):
    if query is None:
        query = {}

    body = {
        'query': [{
            'code': key,
            'selection': {
                'filter': filtertype,
                # value can be a list or a single value
                'values': value if isinstance(value, list) else [value]
            }
        } for key, (filtertype, value) in query.items()],
        'response': {
            'format': "json"
        }
    }

    try:
        raw = requests.post(self._api_path(dataset), json=body)
        if raw.headers["content-type"] == "text/html":
            # This is an error message
            raise InvalidData(f"""Error message from PX Web:

{raw.content}

Check your query for spelling errors, or try reducing the size.
""")
        data = raw.json()
    except JSONDecodeError:
        raise InvalidData("""No valid response from PX Web.
Check your query for spelling errors, or try reducing the size.
This error is frequently due to a too large result being requested.""")

    # All available dimensions are not always returned;
    # what is returned depends on the query.
    raw_return_dimension = data["columns"]
    # Filter out dimensions only
    raw_return_dimension = [x for x in raw_return_dimension if x["type"] != "c"]

    for row in data[u"data"]:
        for value in row[u"values"]:
            dimensions = {}
            # 'key' contains one value for each dimension,
            # always preserving order.
            for d, v in zip(raw_return_dimension, row[u"key"]):
                dimensions[d["code"]] = v
            yield Result(value, dimensions=dimensions)
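# A sketch of how a query dict is turned into a PX Web API request body,
# mirroring the structure built in _fetch_data above. The dimension codes and
# values are illustrative; "item" is a standard PX Web filter type. No request
# is sent here.
query = {
    "Region": ("item", ["0180", "1480"]),
    "Tid": ("item", "2016"),
}
body = {
    "query": [{
        "code": code,
        "selection": {
            "filter": filtertype,
            "values": values if isinstance(values, list) else [values],
        },
    } for code, (filtertype, values) in query.items()],
    "response": {"format": "json"},
}
print(body)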
def _fetch_data(self, dataset, query=None):
    (c, r, p) = dataset.blob

    self.browser\
        .find_element_by_xpath("//div[@title='Skicka till Excel']")\
        .click()
    # Press enter thrice in case of any prompts
    actions = ActionChains(self.browser)
    actions.send_keys(Keys.RETURN)
    actions.send_keys(Keys.RETURN)
    actions.send_keys(Keys.RETURN)
    actions.perform()

    # Wait for download
    i = 0
    while not os.listdir(self.tempdir):
        sleep(1)
        i += 1
        if i > PAGELOAD_TIMEOUT:
            # TODO: Use a suitable basescraper exception
            raise Exception("Download timed out")
    sleep(20)
    # TODO: We need to check that the file is complete.
    # Something like this:
    # https://stackoverflow.com/questions/35891393/how-to-get-file-download-complete-status-using-selenium-web-driver-c-sharp#35892347

    # WARNING: Assuming the latest downloaded xls to be our file.
    # This is obviously not 100 % waterproof.
    latest_download = max(iglob(os.path.join(self.tempdir, "*.xls")),
                          key=os.path.getctime)
    workbook = open_workbook(latest_download)
    sheet = workbook.sheet_by_index(0)

    periods = sheet.row_values(0)[2:-1]
    periods = [int(x) for x in periods]
    for n in range(1, sheet.nrows):
        row = sheet.row_values(n)
        region = row.pop(0)
        row.pop(0)  # empty due to merged cells

        if region == "Total":
            break

        for i, col in enumerate(row[:-1]):
            yield Result(int(col), {
                "region": region,
                "period": periods[i],
            })
def _fetch_data(self, dataset, query): """Make the actual query. """ if query is None: query = {} # default query _query = { "year": [dataset.latest_year], "region": "Hela landet" } _query.update(query) allowed_query_dims = ["year", "region"] # Validate query for dim in query.keys(): if dim not in allowed_query_dims: msg = "Querying on {} is not implemented yet".format(dim) raise NotImplementedError(msg) for dim, value in _query.iteritems(): if value == "*": _query[dim] = [x.value for x in dataset.dimensions[dim].allowed_values] elif not isinstance(value, list): _query[dim] = [value] # get all input elem values payload = {} for input_elem in dataset.soup.select("input"): payload[input_elem["name"]] = input_elem.get("value") for region in _query["region"]: region_id = dataset._get_region_id(region) for year in _query["year"]: payload.update({ "ctl01$ctl11$lstCounties": region_id, "ctl01$ctl11$lstYearInterval": year, }) result_page = self._post_html(URL, payload) for datapoint in parse_result_page(result_page): value = datapoint["value"] del datapoint["value"] yield Result(value, datapoint)
def _fetch_data(self, dataset, query=None):
    html = requests.get(
        "http://web05.lansstyrelsen.se/transtat_O/transtat.asp").text
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find("table", "line").find_all("table")[2].findNext("table")
    rows = table.find_all("tr")
    column_headers = rows.pop(0).find_all("td", recursive=False)
    years = [x.text for x in column_headers[2:]]
    for row in rows:
        cells = row.find_all("td")
        date = cells.pop(0).text
        month = cells.pop(0).text
        i = 0
        for value in cells:
            # Each column from here is a year.
            if value.text:
                yield Result(value.text.encode("utf-8"), {
                    "date": date,
                    "month": month,
                    "year": years[i],
                })
            i += 1
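# A self-contained sketch of the table walk above: the header row carries the
# years, and every following row holds a date, a month name and one cell per
# year. The HTML below is a made-up miniature of the page structure.
from bs4 import BeautifulSoup

html = """
<table>
  <tr><td>Datum</td><td>Månad</td><td>2015</td><td>2016</td></tr>
  <tr><td>01</td><td>Januari</td><td>7</td><td>9</td></tr>
</table>
"""
rows = BeautifulSoup(html, "html.parser").find_all("tr")
years = [td.text for td in rows.pop(0).find_all("td")[2:]]
for row in rows:
    cells = row.find_all("td")
    date, month = cells[0].text, cells[1].text
    for i, cell in enumerate(cells[2:]):
        if cell.text:
            print({"date": date, "month": month, "year": years[i], "value": cell.text})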
def _fetch_data(self, dataset, query): url = "http://statistik.uka.se/4.5d85793915901d205f935d0f.12.5d85793915901d205f965eab.portlet?action=resultat&view=resultTable&frageTyp=3&frageNr=240&tid=%s&grupp1=%s&grupp2=%s" thenmap_url = "http://api.thenmap.net/v1/se-7/data/%s?data_props=name|kommunkod" # 6 is 1993, the first year in the db if query is None: query = {} if "from" not in query: query['from'] = 1993 if "semesters" not in query: query['semesters'] = (2016 - query["from"]) * 2 start = (query["from"] - 1993) * 2 + 5 terms = range(start, start + query["semesters"] + 2) for t in terms: # Get all municipalities, and their codes, from this year year = ((t - 5) / 2) + 1993 semester = ["HT", "VT"][t % 2] municipalities = requests.get(thenmap_url % year).json() for id_, municipality_ in municipalities["data"].items(): municipality = municipality_.pop() code = municipality["kommunkod"].zfill(4) c, m = code[:2], code[2:] html = requests.get(url % (t, c, m)).text soup = BeautifulSoup(html, 'html.parser') table = soup.find("table") # The first rows are headers, the last are empty rows = table.find_all("tr")[5:-2] for row in rows: cells = row.find_all("td") yield Result(cells[2].text.strip(), { "municipality": municipality["name"], "school": cells[0].text.strip(), "semester": semester, "year": year, })
def _fetch_data(self, dataset, query): """ """ url = dataset.url.replace("val", "resultat") # Start building the POST payload from values of hidden inputs payload = dataset.hidden_inputs query_size = 1 for dim_key, values in query.items(): # Get dimensions from query by id # Will error on bad query try: dim = dataset.dimensions[dim_key] except NoSuchItem: dim_ids = [x.id for x in dataset.dimensions] msg = "{} is not a valid dimension id. Try: ".format( dim_key, dim_ids) raise InvalidQuery(msg) if values == "*": values = [x.value for x in dim.allowed_values] elif not isinstance(values, list): values = [values] query_values = [] for val in values: # validate the value passed used in query # will error if invalid value try: dim_value = dim.allowed_values[val] except StopIteration: # Odd exception from statscraper dim_value = dim.allowed_values.get_by_label(val) if dim_value is None: msg = "{} is not an allowed value or label for {}. Try: {}"\ .format(val, dim_key, dim.allowed_values) raise InvalidQuery(msg) query_values.append(dim_value.in_query) query_size *= len(query_values) query_value = "".join(query_values) # ";01;;02;;03;" payload[dim.query_key] = query_value # Socialstyrelsen has a limit of 50 000 datapoints per query # TODO: Split into multiple queries automatically MAX_QUERY_SIZE = 50000 if query_size > MAX_QUERY_SIZE: msg = ("Your query was too large: {}, threshold is {}. " "Try splitting it into multiple queries.")\ .format(query_size, MAX_QUERY_SIZE) raise TooLargeQuery(msg) html = self._post_html(url, payload) # Check if the result is an error page error_msg = re.search("Fel med nummer (-?\d+)", html) if error_msg: # TODO: Find out what these error codes mean. # Known codes are -1 and 10. error_code = error_msg.group(1) msg = "Result page didn't render. Socialstyrelsen error code: {}".format( error_code) raise InvalidQuery(msg) for value, index in parse_result_table(html): yield Result(value, index)
def _fetch_data(self, dataset, query=None, include_inactive_stations=False):
    """ Should yield dataset rows """
    if query is None:
        query = {}
    data = []
    parameter = dataset
    station_dim = dataset.dimensions["station"]
    all_stations = station_dim.allowed_values

    # Step 1: Prepare query
    if "station" not in query:
        if include_inactive_stations:
            # Get all stations
            query["station"] = list(all_stations)
        else:
            # Get only active stations
            query["station"] = list(station_dim.active_stations())
    else:
        if not isinstance(query["station"], list):
            query["station"] = [query["station"]]
        # Make sure that the queried stations actually exist
        query["station"] = [
            all_stations.get_by_label(x) for x in query["station"]
        ]

    if "period" not in query:
        # TODO: I'd prefer to do dataset.get("period").allowed_values here
        query["period"] = PERIODS
    elif not isinstance(query["period"], list):
        query["period"] = [query["period"]]

    for period in query["period"]:
        if period not in PERIODS:
            msg = u"{} is not an allowed period".format(period)
            raise Exception(msg)

    # Step 3: Get data
    n_queries = len(query["station"]) * len(query["period"])
    counter = 0
    print("Fetching data with {} queries.".format(n_queries))
    for station in query["station"]:
        for period in query["period"]:
            url = dataset.url\
                .replace(".json", "/station/{}/period/{}/data.csv"
                         .format(station.key, period))

            print("/GET {} ".format(url))
            r = requests.get(url)
            if r.status_code == 200:
                raw_data = DataCsv().from_string(r.content).to_dictlist()

                # TODO: This is a very hard-coded parse function.
                # Expects a fixed start row and number of cols.
                for row in raw_data:
                    # timepoint = datetime.strptime(timepoint_str, "%Y-%m-%d %H:%M:%S")
                    value_col = parameter.id.split(",")[0]
                    value = float(row[value_col])

                    row["parameter"] = parameter.id
                    row["station"] = station.label
                    row["station_key"] = station.key
                    row["period"] = period
                    row.pop(value_col, None)

                    datapoint = Result(value, row)

                    yield datapoint

            elif r.status_code == 404:
                print("Warning: no data at {}".format(url))
            else:
                raise Exception("Connection error for {}".format(url))
def test_pandas_export(self):
    """Get results as pandas dataframe."""
    result = ResultSet()
    result.append(Result(45483, {'city': "Voi"}))
    df = result.pandas
    self.assertTrue(ptypes.is_numeric_dtype(df.value))
def _fetch_data(self, dataset, query): """Make query for actual data. Get all regions and years by default. `period` (year), `municipality` and `municipality_groups` are the only implemented queryable dimensions. :param query: a dict with dimensions and values to query by. Examples: {"municipality": ["0180"]} {"period": 2016 } """ # Make query a dict if it already isn't if isinstance(query, dict) == False: query = {} # If nothing is set, default to all allowed municipalities queryable_dims = ['municipality', 'period', 'municipality_groups'] if all([x not in query for x in queryable_dims]): query['municipality'] = [] for x in dataset.dimensions['municipality'].allowed_values: query['municipality'].append(x.value) # Listify queried values (to allow single values in query, like {"year": 2016}) for key, values in query.items(): if not isinstance(values, list): query[key] = [values] # Format all values as strings for url creation query[key] = [str(x) for x in query[key]] # Validate query for dim in query.keys(): if dim not in queryable_dims: raise Exception( "You cannot query on dimension '{}'".format(dim)) # Check if the values are allowed if dim in ('municipality', 'municipality_groups'): allowed = [ x.value for x in dataset.dimensions[dim].allowed_values ] for dimVal in query[dim]: if dimVal not in allowed: raise Exception( "You cannot query on dimension '{}' with '{}'". format(dim, dimVal)) # base url for query next_url = '{}data/kpi/{}'.format(self.base_url, dataset.id) # Merge `municipality` and `municipality_groups` municipalities = [] if 'municipality' in query: municipalities = municipalities + query['municipality'] if 'municipality_groups' in query: municipalities = municipalities + query['municipality_groups'] if len(municipalities) > 0: next_url += '/municipality/{}'.format(','.join(municipalities)) if 'period' in query: next_url += '/year/{}'.format(','.join(query['period'])) while next_url: print('/GET {}'.format(next_url)) r = requests.get(next_url) r.raise_for_status() json_data = r.json() for row in json_data['values']: for d in row['values']: yield Result( d['value'], { 'kpi': dataset.id, 'kpi_label': dataset.label, 'municipality': row['municipality'], 'period': row['period'], 'gender': d['gender'], 'status': d['status'], }) # if 'next_page' in json_data: next_url = json_data['next_page'] else: next_url = False
def _fetch_data(self, dataset, query={}): """Make query for actual data. Get all regions and years by default. `period` (year) and `municipality` are the only implemented queryable dimensions. :param query: a dict with dimensions and values to query by. Examples: {"municipality": ["0180"]} {"period": 2016 } """ # if "municipality" not in query and "period" not in query: query = { "municipality": [x.id for x in self.dimension["municipality"].allowed_values] } # Listify queried values (to allow single values in query, like {"year": 2016}) for key, values in query.items(): if not isinstance(values, list): query[key] = [values] # Format all values as strings for url creation query[key] = [str(x) for x in query[key]] # Validate query queryable_dims = ["municipality", "period"] for dim in query.keys(): if dim not in queryable_dims: raise Exception( "You cannot query on dimension '{}'".format(dim)) #TODO: Make sure tha values passed in query are allowed. # base url for query next_url = '{}data/kpi/{}'.format(self.base_url, dataset.id) if "municipality" in query: next_url += "/municipality/{}".format(",".join( query["municipality"])) elif "period" in query: next_url += "/year/{}".format(",".join(query["period"])) while next_url: print("/GET {}".format(next_url)) r = requests.get(next_url) r.raise_for_status() json_data = r.json() for row in json_data["values"]: for d in row["values"]: yield Result( d['value'], { 'kpi': dataset.id, 'kpi_label': dataset.label, 'municipality': row['municipality'], 'period': row['period'], 'gender': d['gender'], 'status': d['status'], }) # if "next_page" in json_data: next_url = json_data["next_page"] else: next_url = False
class VantetiderDataset(Dataset):

    def get_url(self, region="Sverige"):
        if region == "Sverige":
            # Hack: _get_region_slug expects the page to be loaded, but to be
            # able to load the page we first have to show it on national level
            region_slug = "Sveriges"
        else:
            region_slug = self._get_region_slug(region)
        return BASE_URL + region_slug + "/" + self.id + "/"

    @property
    def html(self):
        if not hasattr(self, "_html"):
            url = self.get_url()
            self._html = self.scraper._get_html(url)
        return self._html

    @property
    def soup(self):
        return BeautifulSoup(self.html, 'html.parser')

    @property
    def regions(self):
        """ Get a list of all regions """
        return self.dimensions["region"].allowed_values

    @property
    def years(self):
        """ Get a list of all available years """
        return self.dimensions["year"].allowed_values

    @property
    def latest_timepoint(self):
        """Get the latest available year and period.

        This method will have to be re-written to support pages with ajax load.
        """
        return {
            "year": self.dimensions["year"].default_value,
            "period": self.dimensions["period"].default_value,
        }

    def _get_region_slug(self, id_or_label):
        """ Get the regional slug to be used in the url.

        "Norrbotten" => "Norrbottens"

        :param id_or_label: Id or label of region
        """
        region = self.regions.get_by_label(id_or_label)
        if region is None:
            try:
                region = self.regions[id_or_label]
            except StopIteration:
                # this is a strange error thrown by statscraper
                raise KeyError(u"{} is not a valid region id or label".format(id_or_label))

        slug = region.label\
            .replace(u" ", "-")\
            .replace(u"ö", "o")\
            .replace(u"Ö", "O")\
            .replace(u"ä", "a")\
            .replace(u"å", "a")

        if "region" not in slug:
            slug = slug + "s"

        EXCEPTIONS = {
            "Jamtland-Harjedalens": "Jamtlands",
            "Rikets": "Sveriges",
            "Alla-landstings": "Sveriges",
        }
        if slug in EXCEPTIONS:
            slug = EXCEPTIONS[slug]

        return slug

    def _parse_result_page(self, url, payload, only_region=False, region=None):
        """ Get data from a result page.

        :param url: url to query
        :param payload: payload to pass
        :return: a dictlist with data
        """
        data = []
        if only_region:
            html = self.scraper._get_html(url)
        else:
            try:
                html = self.scraper._post_html(url, payload=payload)
            except HTTPError as e:
                if e.response.status_code == 500:
                    self.scraper.log.warning(
                        u"Unable to get {} with {}".format(url, payload))
                    return []
                raise

        current_selection = self._get_current_selection(html)

        table = Datatable(html)
        data = []
        _region = None
        for row in table.data:
            region_or_unit_id, region_or_unit_label = row["region_or_unit"]
            region = self.regions.get_by_label(region_or_unit_label)
            if region:
                row["region"] = region.label
                row["unit"] = None
                row["unit_id"] = None
                _region = region.label
            else:
                assert region_or_unit_label is not None
                assert region_or_unit_id is not None

                row["region"] = _region
                row["unit"] = region_or_unit_label
                row["unit_id"] = region_or_unit_id

            value = row["value"]
            row.pop("value", None)
            row.pop("region_or_unit", None)

            for dim in self.dimensions:
                if dim.id not in row:
                    row[dim.id] = current_selection[dim.id][1]  # gets label

            data.append(Result(value, row))

        return data