def parse_item(self, response): """Scrape data from the country' page.""" i = ItemLoader(item=CityItem(), response=response) country = response.xpath("//span[@itemprop='name']/text()").extract()[-1] i.add_value("country", country) quality_list = response.css(".table_indices ::text").extract() quality_of_life_index = [i.strip() for i in quality_list][-3] i.add_value("quality_of_life_index", quality_of_life_index) purchasing_power_index = response.xpath( "//a[contains(text(), " "'Purchasing Power Index')]/parent::td/following-sibling::td/text()" ).extract_first() i.add_value("purchasing_power_index", purchasing_power_index) safety_index = response.xpath( "//a[contains(text(), " "'Safety Index')]/parent::td/following-sibling::td/text()" ).extract_first() i.add_value("safety_index", safety_index) health_care_index = response.xpath( "//a[contains(text(), " "'Health Care Index')]/parent::td/following-sibling::td/text()" ).extract_first() i.add_value("health_care_index", health_care_index) climate_index = response.xpath( "//a[contains(text(), " "'Climate Index')]/parent::td/following-sibling::td/text()" ).extract_first() i.add_value("climate_index", climate_index) cost_of_living_index = response.xpath( "//a[contains(text(), " "'Cost of Living Index')]/parent::td/following-sibling::td/text()" ).extract_first() i.add_value("cost_of_living_index", cost_of_living_index) property_price_to_income_ratio = response.xpath( "//a[contains(text(), " "'Property Price to Income Ratio')]/parent::td/following-sibling::td/text()" ).extract_first() i.add_value("property_price_to_income_ratio", property_price_to_income_ratio) traffic_commute_time_index = response.xpath( "//a[contains(text(), " "'Traffic Commute Time Index')]/parent::td/following-sibling::td/text()" ).extract_first() i.add_value("traffic_commute_time_index", traffic_commute_time_index) pollution_index = response.xpath( "//a[contains(text(), " "'Pollution Index')]/parent::td/following-sibling::td/text()" ).extract_first() i.add_value("pollution_index", pollution_index) try: base_url = "https://freedomhouse.org/report/freedom-world/2018/{}" res = requests.get(base_url.format(country)) soup_text = BeautifulSoup(res.text, "lxml").text regex = r"Aggregate Score:(.{0,9})" reg = re.compile(regex) sc_dirty = reg.search(soup_text).group(1) score = "".join([s for s in sc_dirty.split()[0] if s.isdigit()]) score = float(score) except AttributeError: score = None i.add_value("freedomhouse_score", score) yield i.load_item()