Example #1
 def _fetch_data(self, dataset, query=None):
     yield Result(127, {
         "municipality": "Robertsfors kommun",
     })
     yield Result(17, {
         "municipality": "Region Gotland",
     })
Example #2
 def _fetch_data(self, dataset, query=None):
     if dataset.id == "Dataset_1":
         yield Result(127, {
             "date": "2017-08-10",
             "municipality": "Robertsfors kommun",
         })
     elif dataset.id == "Dataset_2":
         yield Result(12, {
             "date": "2017-02-06",
             "municipality": "Umeå kommun",
         })
         yield Result(130, {
             "date": "2017-02-07",
             "municipality": "Robertsfors kommun",
         })
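Both variants yield Result objects; statscraper's public fetch() collects them into a ResultSet, which is also what the pandas test in Example #14 exercises. A minimal consumption sketch, reusing the values yielded above (assumes the statscraper package is installed):

from statscraper import Result, ResultSet

# Build a ResultSet by hand, the way fetch() would from _fetch_data's yields
rs = ResultSet()
rs.append(Result(127, {"municipality": "Robertsfors kommun"}))
rs.append(Result(17, {"municipality": "Region Gotland"}))
df = rs.pandas  # export to a pandas DataFrame (see Example #14)
print(df)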
Example #3
    def _fetch_data(self, dataset, query):
        """Make the actual query.
        :param query:
        """
        # get latest month by default
        if query is None:
            query = "latest_month"

        months = []
        if query == "latest_month":
            months.append(dataset.latest_month)
        elif isinstance(query, dict):
            since = query.get("from", "2000-01")
            to = query.get("to", datetime.now().strftime("%Y-%m"))
            months = [
                x for x in dataset.dimensions["month"].allowed_values
                if since <= x.label <= to
            ]

        for month in months:
            self.log.info("Fecthing data from {}".format(month.label))
            url = BASE_URL + "/BankruptcyStatisticsCategoryPage/GetStatistics"
            html = self._post_request(url, {"selectedvalue": month.value})
            for res in parse_result_page(html):
                value = res["value"]
                res.pop("value")
                res["month"] = month.label
                yield Result(value, res)
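For reference, this method accepts either the default string "latest_month" or a dict with "from" and "to" month labels; a usage sketch, where scraper and dataset are hypothetical stand-ins for a concrete statscraper pair:

# Hypothetical usage; `scraper` and `dataset` are placeholders.
results = list(scraper._fetch_data(dataset, query=None))  # latest month only
results = list(scraper._fetch_data(
    dataset, query={"from": "2017-01", "to": "2017-06"}))  # closed month range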
Example #4
    def _parse_result_page(self, url, payload, only_region=False, region=None):
        """ Get data from a result page
            :param url: url to query
            :param payload: payload to pass
            :return: a dictlist with data
        """
        if only_region:
            html = self.scraper._get_html(url)
        else:
            try:
                html = self.scraper._post_html(url, payload=payload)
            except HTTPError as e:
                if e.response.status_code == 500:
                    self.scraper.log.warning(
                        u"Unable to get {} with {}".format(url, payload))
                    return []
                raise  # re-raise unexpected HTTP errors; `html` would be unbound

        current_selection = self._get_current_selection(html)

        table = Datatable(html)
        data = []
        _region = None
        for row in table.data:
            region_or_unit_id, region_or_unit_label = row["region_or_unit"]
            region = self.regions.get_by_label(region_or_unit_label)
            if region:
                row["region"] = region.label
                row["unit"] = None
                row["unit_id"] = None
                _region = region.label
            else:
                assert region_or_unit_label is not None
                assert region_or_unit_id is not None

                row["region"] = _region
                row["unit"] = region_or_unit_label
                row["unit_id"] = region_or_unit_id

            value = row["value"]

            row.pop("value", None)
            row.pop("region_or_unit", None)
            for dim in self.dimensions:
                if dim.id not in row:
                    row[dim.id] = current_selection[dim.id][1]  # gets label
            data.append(Result(value, row))

        return data
Example #5
    def _fetch_data(self, dataset, query):
        """Make the actual query.

        The only queryable dimension is period.

        >>> dataset.fetch({"period": "2016"})
        >>> dataset.fetch({"period": ["2015", "2016"]})
        >>> dataset.fetch({"period": "*"}) # Get all periods
        """
        default_query = {
            "period": dataset.latest_period[1],
        }
        if query is None:
            query = {}

        default_query.update(query)
        query = default_query
        allowed_query_dims = ["period"]

        for dim in query.keys():
            if dim not in allowed_query_dims:
                msg = "Querying on {} is not implemented yet".format(dim)
                raise NotImplementedError(msg)

        if query["period"] == "*":
            periods = [x[1] for x in dataset.periods]
        else:
            if not isinstance(query["period"], list):
                periods = [query["period"]]
            else:
                periods = query["period"]

        # Get the period ids needed to build the url
        periods = [dataset._get_period_id(x) for x in periods]

        for period in periods:
            # Hack: For datasets with multiple uttag we get the latest
            # This should rather be a part of query
            if dataset.has_uttag:
                uttag = dataset.get_latest_uttag(period)[0]
            else:
                uttag = None
            url = dataset.get_xml_url(period, uttag)
            xml_data = self._get_html(url)
            for datapoint in get_data_from_xml(xml_data):
                value = datapoint["value"]
                del datapoint["value"]
                yield Result(value, datapoint)
Example #6
    def _fetch_data(self, dataset, query=None):
        # The query must specify which files to fetch
        if query is None or "years" not in query or "months" not in query:
            raise ValueError("query must contain 'years' and 'months'")
        files = [(y, m) for y in query['years'] for m in query['months']]
        frames = []

        # Download and clean every monthly Excel file
        for file in files:
            year, month = file
            url = self.BASE_URL.format(year=year, month=MONTHS[month])
            frame = self._clean_data(pd.read_excel(url), year, month)
            frames.append(frame)

        # Yield individual rows of type Result from the dataframe
        raw_data = pd.concat(frames)
        for i, row in raw_data.iterrows():
            val = row.pop('value')
            yield Result(val, json.loads(row.to_json()))
Example #7
    def _fetch_data(self, dataset, query):
        if query is None:
            query = {}
        body = {
            'query': [{
                'code': key,
                'selection': {
                    'filter': filtertype,
                    # value can be a list or a value
                    'values': value if isinstance(value, list) else [value]
                }
            } for key, (filtertype, value) in query.items()],
            'response': {
                'format': "json"
            }
        }
        try:
            raw = requests.post(self._api_path(dataset), json=body)
            if raw.headers["content-type"] == "text/html":
                # This is an error message
                raise InvalidData(f"""Error message from PX Web:

{raw.content}

Check your query for spelling errors, or try reducing the size.
""")
            data = raw.json()
        except JSONDecodeError:
            raise InvalidData("""No valid response from PX Web.
Check your query for spelling errors, or try reducing the size.
This error is frequently due to a too large result being requested.""")

        # All available dimensions are not always returned.
        # What is returned depends on the query
        raw_return_dimension = data["columns"]
        # Filter out dimensions only
        raw_return_dimension = [x for x in raw_return_dimension if x["type"] != "c"]

        for row in data["data"]:
            for value in row["values"]:
                dimensions = {}
                # 'key' contains one value for each dimension,
                # always preserving order.
                for d, v in zip(raw_return_dimension, row["key"]):
                    dimensions[d["code"]] = v

                yield Result(value, dimensions=dimensions)
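For reference, the query argument here maps PX Web dimension codes to (filtertype, value) tuples. A sketch of the translation the list comprehension above performs; "Tid" is an illustrative time-dimension code, not one taken from this scraper:

# Hypothetical query; "Tid" is a common PX Web time dimension code.
query = {"Tid": ("item", ["2016", "2017"])}
# The comprehension expands it to this JSON body:
body = {
    "query": [{
        "code": "Tid",
        "selection": {"filter": "item", "values": ["2016", "2017"]},
    }],
    "response": {"format": "json"},
}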
Example #8
    def _fetch_data(self, dataset, query=None):
        (c, r, p) = dataset.blob

        self.browser\
            .find_element_by_xpath("//div[@title='Skicka till Excel']")\
            .click()
        # Press enter three times in case of any prompts
        actions = ActionChains(self.browser)
        actions.send_keys(Keys.RETURN)
        actions.send_keys(Keys.RETURN)
        actions.send_keys(Keys.RETURN)
        actions.perform()
        # Wait for download
        i = 0
        while not os.listdir(self.tempdir):
            sleep(1)
            i += 1
            if i > PAGELOAD_TIMEOUT:
                # TODO: Use a suitable basescraper exception
                raise Exception("Download timed out")
        sleep(20)  # TODO: We need to check that the file is complete.
        # Something like this:
        # https://stackoverflow.com/questions/35891393/how-to-get-file-download-complete-status-using-selenium-web-driver-c-sharp#35892347

        # WARNING: Assuming the latest downloaded xls to be our file.
        # This is obviously not 100 % waterproof.
        latest_download = max(iglob(os.path.join(self.tempdir, "*.xls")),
                              key=os.path.getctime)
        workbook = open_workbook(latest_download)
        sheet = workbook.sheet_by_index(0)
        periods = sheet.row_values(0)[2:-1]
        periods = [int(x) for x in periods]
        for n in range(1, sheet.nrows):
            row = sheet.row_values(n)
            region = row.pop(0)
            row.pop(0)  # empty due to merged cells
            if region == "Total":
                break
            for i, col in enumerate(row[:-1]):
                yield Result(int(col), {
                    "region": region,
                    "period": periods[i],
                })
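The TODO above asks for a real completeness check. One dependency-free heuristic is to poll until the newest file's size stops changing; a minimal sketch (the polling constants are arbitrary):

import os
from time import sleep

def wait_for_stable_file(path, polls=3, interval=1.0):
    """Block until `path` keeps the same size for `polls` consecutive polls."""
    stable, last = 0, -1
    while stable < polls:
        size = os.path.getsize(path)
        stable = stable + 1 if size == last else 0
        last = size
        sleep(interval)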
Example #9
    def _fetch_data(self, dataset, query):
        """Make the actual query.
        """
        if query is None:
            query = {}
        # default query
        _query = {
            "year": [dataset.latest_year],
            "region": "Hela landet"
        }
        _query.update(query)
        allowed_query_dims = ["year", "region"]

        # Validate query
        for dim in query.keys():
            if dim not in allowed_query_dims:
                msg = "Querying on {} is not implemented yet".format(dim)
                raise NotImplementedError(msg)

        for dim, value in _query.items():
            if value == "*":
                _query[dim] = [x.value for x in dataset.dimensions[dim].allowed_values]
            elif not isinstance(value, list):
                _query[dim] = [value]
        # get all input elem values
        payload = {}
        for input_elem in dataset.soup.select("input"):
            payload[input_elem["name"]] = input_elem.get("value")

        for region in _query["region"]:
            region_id = dataset._get_region_id(region)
            for year in _query["year"]:
                payload.update({
                    "ctl01$ctl11$lstCounties": region_id,
                    "ctl01$ctl11$lstYearInterval": year,
                })
                result_page = self._post_html(URL, payload)

                for datapoint in parse_result_page(result_page):
                    value = datapoint["value"]
                    del datapoint["value"]
                    yield Result(value, datapoint)
Example #10
 def _fetch_data(self, dataset, query=None):
     html = requests.get(
         "http://web05.lansstyrelsen.se/transtat_O/transtat.asp").text
     soup = BeautifulSoup(html, 'html.parser')
     table = soup.find("table",
                       "line").find_all("table")[2].findNext("table")
     rows = table.find_all("tr")
     column_headers = rows.pop(0).find_all("td", recursive=False)
     years = [x.text for x in column_headers[2:]]
     for row in rows:
         cells = row.find_all("td")
         date = cells.pop(0).text
         month = cells.pop(0).text
         for i, value in enumerate(cells):
             # Each column from here on is a year.
             if value.text:
                 yield Result(value.text, {
                     "date": date,
                     "month": month,
                     "year": years[i],
                 })
Example #11
    def _fetch_data(self, dataset, query):
        url = "http://statistik.uka.se/4.5d85793915901d205f935d0f.12.5d85793915901d205f965eab.portlet?action=resultat&view=resultTable&frageTyp=3&frageNr=240&tid=%s&grupp1=%s&grupp2=%s"
        thenmap_url = "http://api.thenmap.net/v1/se-7/data/%s?data_props=name|kommunkod"
        # 6 is 1993, the first year in the db
        if query is None:
            query = {}
        if "from" not in query:
            query['from'] = 1993
        if "semesters" not in query:
            query['semesters'] = (2016 - query["from"]) * 2
        start = (query["from"] - 1993) * 2 + 5
        terms = range(start,
                      start + query["semesters"] + 2)
        for t in terms:
            # Get all municipalities, and their codes, from this year
            year = ((t - 5) // 2) + 1993  # integer division: two semesters per year
            semester = ["HT", "VT"][t % 2]
            municipalities = requests.get(thenmap_url % year).json()
            for id_, municipality_ in municipalities["data"].items():
                municipality = municipality_.pop()
                code = municipality["kommunkod"].zfill(4)
                c, m = code[:2], code[2:]
                html = requests.get(url % (t, c, m)).text
                soup = BeautifulSoup(html, 'html.parser')
                table = soup.find("table")
                # The first rows are headers, the last are empty
                rows = table.find_all("tr")[5:-2]
                for row in rows:
                    cells = row.find_all("td")

                    yield Result(cells[2].text.strip(), {
                        "municipality": municipality["name"],
                        "school": cells[0].text.strip(),
                        "semester": semester,
                        "year": year,
                    })
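The term encoding above is easy to get wrong (note the integer division); a few worked values:

# Worked examples of the term-to-year/semester mapping used above.
for t in (5, 6, 7, 52):
    year = ((t - 5) // 2) + 1993
    semester = ["HT", "VT"][t % 2]
    print(t, year, semester)
# 5 -> 1993 VT, 6 -> 1993 HT, 7 -> 1994 VT, 52 -> 2016 HT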
Example #12
    def _fetch_data(self, dataset, query):
        """

        """
        url = dataset.url.replace("val", "resultat")

        # Start building the POST payload from values of hidden inputs
        payload = dataset.hidden_inputs
        query_size = 1
        for dim_key, values in query.items():
            # Get dimensions from query by id
            # Will error on bad query
            try:
                dim = dataset.dimensions[dim_key]
            except NoSuchItem:
                dim_ids = [x.id for x in dataset.dimensions]
                msg = "{} is not a valid dimension id. Try: ".format(
                    dim_key, dim_ids)
                raise InvalidQuery(msg)

            if values == "*":
                values = [x.value for x in dim.allowed_values]
            elif not isinstance(values, list):
                values = [values]
            query_values = []
            for val in values:
                # validate the value passed used in query
                # will error if invalid value
                try:
                    dim_value = dim.allowed_values[val]
                except StopIteration:  # Odd exception from statscraper
                    dim_value = dim.allowed_values.get_by_label(val)
                    if dim_value is None:
                        msg = "{} is not an allowed value or label for {}. Try: {}"\
                            .format(val, dim_key, dim.allowed_values)
                        raise InvalidQuery(msg)

                query_values.append(dim_value.in_query)

            query_size *= len(query_values)
            query_value = "".join(query_values)  # ";01;;02;;03;"

            payload[dim.query_key] = query_value

        # Socialstyrelsen has a limit of 50 000 datapoints per query
        # TODO: Split into multiple queries automatically
        MAX_QUERY_SIZE = 50000
        if query_size > MAX_QUERY_SIZE:
            msg = ("Your query was too large: {}, threshold is {}. "
                   "Try splitting it into multiple queries.")\
                   .format(query_size, MAX_QUERY_SIZE)
            raise TooLargeQuery(msg)

        html = self._post_html(url, payload)

        # Check if the result is an error page
        error_msg = re.search("Fel med nummer (-?\d+)", html)
        if error_msg:
            # TODO: Find out what these error codes mean.
            # Known codes are -1  and 10.
            error_code = error_msg.group(1)
            msg = "Result page didn't render. Socialstyrelsen error code: {}".format(
                error_code)
            raise InvalidQuery(msg)

        for value, index in parse_result_table(html):
            yield Result(value, index)
Example #13
    def _fetch_data(self, dataset, query=None, include_inactive_stations=False):
        """ Should yield dataset rows
        """
        # Avoid a mutable default: the query dict is modified below
        if query is None:
            query = {}
        parameter = dataset
        station_dim = dataset.dimensions["station"]
        all_stations = station_dim.allowed_values
        # Step 1: Prepare query
        if "station" not in query:
            if include_inactive_stations:
                # Get all stations
                query["station"] = list(all_stations)
            else:
                # Get only active stations
                query["station"] = list(station_dim.active_stations())
        else:
            if not isinstance(query["station"], list):
                query["station"] = [query["station"]]
            # Make sure that the queried stations actually exist
            query["station"] = [
                all_stations.get_by_label(x) for x in query["station"]
            ]

        if "period" not in query:
            # TODO: I'd prefer to do dataset.get("period").allowed_values here
            query["period"] = PERIODS

        elif not isinstance(query["period"], list):
            query["period"] = [query["period"]]

        for period in query["period"]:
            if period not in PERIODS:
                msg = u"{} is not an allowed period".format(period)
                raise Exception(msg)

        # Step 3: Get data
        n_queries = len(query["station"]) * len(query["period"])
        counter = 0
        print("Fetching data with {} queries.".format(n_queries))
        for station in query["station"]:
            for period in query["period"]:
                url = dataset.url\
                    .replace(".json", "/station/{}/period/{}/data.csv"\
                        .format(station.key, period))
                print("/GET {} ".format(url))
                r = requests.get(url)

                if r.status_code == 200:
                    raw_data = DataCsv().from_string(r.content).to_dictlist()

                    # TODO: This is a very hard coded parse function
                    # Expects fixed start row and number of cols
                    for row in raw_data:
                        value_col = parameter.id.split(",")[0]
                        value = float(row[value_col])

                        row["parameter"] = parameter.id
                        row["station"] = station.label
                        row["station_key"] = station.key
                        row["period"] = period

                        row.pop(value_col, None)

                        datapoint = Result(value, row)

                        yield datapoint

                elif r.status_code == 404:
                    print("Warning no data at {}".format(url))
                else:
                    raise Exception("Connection error for {}".format(url))
Example #14
 def test_pandas_export(self):
     """Get results as pandas dataframe."""
     result = ResultSet()
     result.append(Result(45483, {'city': "Voi"}))
     df = result.pandas
     self.assertTrue(ptypes.is_numeric_dtype(df.value))
Example #15
    def _fetch_data(self, dataset, query):
        """Make query for actual data.
        Get all regions and years by default.
        `period` (year), `municipality` and `municipality_groups` are the only 
        implemented queryable dimensions.

        :param query: a dict with dimensions and values to query by.
            Examples:
            {"municipality": ["0180"]}
            {"period": 2016 }
        """

        # Make query a dict if it isn't one already
        if not isinstance(query, dict):
            query = {}

        # If nothing is set, default to all allowed municipalities
        queryable_dims = ['municipality', 'period', 'municipality_groups']
        if all([x not in query for x in queryable_dims]):
            query['municipality'] = []
            for x in dataset.dimensions['municipality'].allowed_values:
                query['municipality'].append(x.value)

        # Listify queried values (to allow single values in query, like {"year": 2016})
        for key, values in query.items():
            if not isinstance(values, list):
                query[key] = [values]
            # Format all values as strings for url creation
            query[key] = [str(x) for x in query[key]]

        # Validate query
        for dim in query.keys():
            if dim not in queryable_dims:
                raise Exception(
                    "You cannot query on dimension '{}'".format(dim))
            # Check if the values are allowed
            if dim in ('municipality', 'municipality_groups'):
                allowed = [
                    x.value for x in dataset.dimensions[dim].allowed_values
                ]
                for dimVal in query[dim]:
                    if dimVal not in allowed:
                        raise Exception(
                            "You cannot query on dimension '{}' with '{}'".
                            format(dim, dimVal))

        # base url for query
        next_url = '{}data/kpi/{}'.format(self.base_url, dataset.id)

        # Merge `municipality` and `municipality_groups`
        municipalities = []
        if 'municipality' in query:
            municipalities = municipalities + query['municipality']
        if 'municipality_groups' in query:
            municipalities = municipalities + query['municipality_groups']

        if len(municipalities) > 0:
            next_url += '/municipality/{}'.format(','.join(municipalities))
        if 'period' in query:
            next_url += '/year/{}'.format(','.join(query['period']))

        while next_url:
            print('/GET {}'.format(next_url))
            r = requests.get(next_url)
            r.raise_for_status()
            json_data = r.json()
            for row in json_data['values']:
                for d in row['values']:
                    yield Result(
                        d['value'], {
                            'kpi': dataset.id,
                            'kpi_label': dataset.label,
                            'municipality': row['municipality'],
                            'period': row['period'],
                            'gender': d['gender'],
                            'status': d['status'],
                        })

            # Follow pagination until exhausted
            if 'next_page' in json_data:
                next_url = json_data['next_page']
            else:
                next_url = False
Example #16
    def _fetch_data(self, dataset, query=None):
        """Make query for actual data.
        Get all regions and years by default.
        `period` (year) and `municipality` are the only implemented queryable
        dimensions.

        :param query: a dict with dimensions and values to query by.
            Examples:
            {"municipality": ["0180"]}
            {"period": 2016 }
        """
        # Avoid a mutable default: the query dict is modified below
        if query is None:
            query = {}
        if "municipality" not in query and "period" not in query:
            query = {
                "municipality":
                [x.id for x in dataset.dimensions["municipality"].allowed_values]
            }

        # Listify queried values (to allow single values in query, like {"year": 2016})
        for key, values in query.items():
            if not isinstance(values, list):
                query[key] = [values]
            # Format all values as strings for url creation
            query[key] = [str(x) for x in query[key]]

        # Validate query
        queryable_dims = ["municipality", "period"]
        for dim in query.keys():
            if dim not in queryable_dims:
                raise Exception(
                    "You cannot query on dimension '{}'".format(dim))
            # TODO: Make sure that the values passed in query are allowed.

        # base url for query
        next_url = '{}data/kpi/{}'.format(self.base_url, dataset.id)

        if "municipality" in query:
            next_url += "/municipality/{}".format(",".join(
                query["municipality"]))
        elif "period" in query:
            next_url += "/year/{}".format(",".join(query["period"]))

        while next_url:
            print("/GET {}".format(next_url))
            r = requests.get(next_url)
            r.raise_for_status()
            json_data = r.json()
            for row in json_data["values"]:
                for d in row["values"]:
                    yield Result(
                        d['value'], {
                            'kpi': dataset.id,
                            'kpi_label': dataset.label,
                            'municipality': row['municipality'],
                            'period': row['period'],
                            'gender': d['gender'],
                            'status': d['status'],
                        })

            # Follow pagination until exhausted
            if "next_page" in json_data:
                next_url = json_data["next_page"]
            else:
                next_url = False
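Both Kolada variants end with the same pagination idiom. It can be factored into a helper, assuming the same values/next_page JSON shape (the function name is ours):

import requests

def paged_values(url):
    """Yield every row from a paginated endpoint with Kolada-style JSON."""
    while url:
        r = requests.get(url)
        r.raise_for_status()
        payload = r.json()
        for row in payload["values"]:
            yield row
        url = payload.get("next_page")  # a missing key ends the loop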
Example #17
class VantetiderDataset(Dataset):

    def get_url(self, region="Sverige"):
        if region=="Sverige":
            # Hack: _get_region_slug expects the page to be loaded, but to be
            # able to load the page we first have to show it on national levle
            region_slug = "Sveriges"
        else:
            region_slug = self._get_region_slug(region)
        return BASE_URL + region_slug + "/" + self.id + "/"

    @property
    def html(self):
        if not hasattr(self, "_html"):
            url = self.get_url()
            self._html = self.scraper._get_html(url)
        return self._html

    @property
    def soup(self):
        return BeautifulSoup(self.html, 'html.parser')

    @property
    def regions(self):
        """ Get a list of all regions
        """
        return self.dimensions["region"].allowed_values

    @property
    def years(self):
        """ Get a list of all available years
        """
        return self.dimensions["year"].allowed_values

    @property
    def latest_timepoint(self):
        """Get the latest available year and period.

        This method will have to be re-written to support pages with ajax load
        """
        return {
            "year": self.dimensions["year"].default_value,
            "period": self.dimensions["period"].default_value,
        }


    def _get_region_slug(self, id_or_label):
        """ Get the regional slug to be used in url
            "Norrbotten" => "Norrbottens"

            :param id_or_label: Id or label of region
        """
        region = self.regions.get_by_label(id_or_label)
        if region is None:
            try:
                region = self.regions[id_or_label]
            except StopIteration:
                # this is a strange error thrown by statscraper
                raise KeyError(u"{} is not a valid region id or label".format(id_or_label))

        slug = region.label\
            .replace(u" ", u"-")\
            .replace(u"ö", u"o")\
            .replace(u"Ö", u"O")\
            .replace(u"ä", u"a")\
            .replace(u"å", u"a")

        if not "region" in slug:
            slug = slug + "s"

        EXCEPTIONS = {
            "Jamtland-Harjedalens": "Jamtlands",
            "Rikets": "Sveriges",
            "Alla-landstings": "Sveriges",
        }
        if slug in EXCEPTIONS:
            slug = EXCEPTIONS[slug]

        return slug

    def _parse_result_page(self, url, payload, only_region=False, region=None):
        """ Get data from a result page
            :param url: url to query
            :param payload: payload to pass
            :return: a dictlist with data
        """
        if only_region:
            html = self.scraper._get_html(url)
        else:
            try:
                html = self.scraper._post_html(url, payload=payload)
            except HTTPError as e:
                if e.response.status_code == 500:
                    self.scraper.log.warning(
                        u"Unable to get {} with {}".format(url, payload))
                    return []
                raise  # re-raise unexpected HTTP errors; `html` would be unbound

        current_selection = self._get_current_selection(html)

        table = Datatable(html)
        data = []
        _region = None
        for row in table.data:
            region_or_unit_id, region_or_unit_label = row["region_or_unit"]
            region = self.regions.get_by_label(region_or_unit_label)
            if region:
                row["region"] = region.label
                row["unit"] = None
                row["unit_id"] = None
                _region = region.label
            else:
                assert region_or_unit_label is not None
                assert region_or_unit_id is not None

                row["region"] = _region
                row["unit"] = region_or_unit_label
                row["unit_id"] = region_or_unit_id

            value = row["value"]

            row.pop("value", None)
            row.pop("region_or_unit", None)
            for dim in self.dimensions:
                if dim.id not in row:
                    row[dim.id] = current_selection[dim.id][1] # gets label
            data.append(Result(value, row))

        return data
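The slug rules in _get_region_slug reduce to a pure string transform; a self-contained sketch that is easy to unit test:

# Standalone sketch of the slug rules above (pure function, no scraper state).
EXCEPTIONS = {
    "Jamtland-Harjedalens": "Jamtlands",
    "Rikets": "Sveriges",
    "Alla-landstings": "Sveriges",
}

def region_slug(label):
    slug = (label.replace(" ", "-")
                 .replace("ö", "o").replace("Ö", "O")
                 .replace("ä", "a").replace("å", "a"))
    if "region" not in slug:
        slug += "s"
    return EXCEPTIONS.get(slug, slug)

assert region_slug("Norrbotten") == "Norrbottens"
assert region_slug("Jämtland-Härjedalen") == "Jamtlands"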