Пример #1
0
    def parse_item(self, response):
        """Scrape quality-of-life data from a country's page.

        Populates a ``CityItem`` through an ``ItemLoader`` with the country
        name, the quality-of-life index and the per-category indices found
        in the page's table, then adds the Freedom House aggregate score
        fetched from freedomhouse.org.

        Args:
            response: The response object for the country page; it must
                provide ``xpath`` and ``css`` selectors.

        Yields:
            The loaded item. ``freedomhouse_score`` is ``None`` when the
            score cannot be fetched or parsed.

        Raises:
            IndexError: If the page has no ``itemprop='name'`` span or the
                ``.table_indices`` block has fewer than three text nodes.
        """
        loader = ItemLoader(item=CityItem(), response=response)

        # The last itemprop="name" span on the page is used as the country.
        country = response.xpath("//span[@itemprop='name']/text()").extract()[-1]
        loader.add_value("country", country)

        # The third text node from the end of the indices table is taken as
        # the quality-of-life value.
        quality_texts = response.css(".table_indices ::text").extract()
        loader.add_value("quality_of_life_index", quality_texts[-3].strip())

        # Each index sits in the <td> following the <td> whose link text
        # contains the label, so one XPath template covers all of them.
        index_xpath = (
            "//a[contains(text(), "
            "'{}')]/parent::td/following-sibling::td/text()"
        )
        index_fields = (
            ("purchasing_power_index", "Purchasing Power Index"),
            ("safety_index", "Safety Index"),
            ("health_care_index", "Health Care Index"),
            ("climate_index", "Climate Index"),
            ("cost_of_living_index", "Cost of Living Index"),
            ("property_price_to_income_ratio", "Property Price to Income Ratio"),
            ("traffic_commute_time_index", "Traffic Commute Time Index"),
            ("pollution_index", "Pollution Index"),
        )
        for field_name, label in index_fields:
            value = response.xpath(index_xpath.format(label)).extract_first()
            loader.add_value(field_name, value)

        # Best-effort lookup of the Freedom House score: any network or
        # parsing failure leaves the score as None instead of dropping the
        # whole item.
        # NOTE(review): requests.get is synchronous; if this callback runs
        # inside an asynchronous crawler it blocks other work while waiting.
        score = None
        base_url = "https://freedomhouse.org/report/freedom-world/2018/{}"
        try:
            # A timeout keeps a stalled connection from hanging the callback.
            res = requests.get(base_url.format(country), timeout=10)
        except requests.RequestException:
            res = None

        if res is not None:
            soup_text = BeautifulSoup(res.text, "lxml").text
            match = re.search(r"Aggregate Score:(.{0,9})", soup_text)
            # The capture may be empty or whitespace-only (e.g. the label is
            # followed directly by a newline), so guard before indexing.
            tokens = match.group(1).split() if match is not None else []
            if tokens:
                digits = "".join(ch for ch in tokens[0] if ch.isdigit())
                # float("") would raise; no digits means no usable score.
                if digits:
                    score = float(digits)

        loader.add_value("freedomhouse_score", score)
        yield loader.load_item()