def parse_property(self, response):
    """Build the item for a single property detail page.

    Fields are populated from CSS selectors evaluated against ``response``,
    from today's date and the response URL, and from the ``commercial``,
    ``property_type`` and ``city_category`` entries of ``response.meta``
    (looked up with ``.get``).

    Returns the item produced by ``PropertyLoader.load_item()``.
    """
    # (item field, CSS query) pairs, applied in this order.
    css_fields = (
        ("header", "div.headline > h2::text"),
        ("description", "div.text::text"),
        ("price", "div.price strong span::text"),
        ("postal_code", "div.location span.address span.postal-code::text"),
        ("city", "div.location span.address span.locality::text"),
        # NOTE(review): unlike its neighbours this query has no ``::text``;
        # presumably PropertyLoader cleans up the extracted markup - confirm.
        ("obid", "div.date-and-clicks > strong:nth-child(1)"),
        ("ad_created", "div.date-and-clicks::text"),
        ("phone", "ul.contacts > li > span:nth-child(2)::text"),
    )
    # Values handed down by the request that led to this page.
    # NOTE(review): the original tags city_category with "stats" - presumably
    # it is only used for statistics; confirm.
    meta_fields = ("commercial", "property_type", "city_category")

    loader = PropertyLoader(item=PropertyItem(), response=response)
    for field_name, css_query in css_fields:
        loader.add_css(field_name, css_query)

    loader.add_value("created", date.today())
    loader.add_value("url", response.url)

    for meta_key in meta_fields:
        loader.add_value(meta_key, response.meta.get(meta_key))

    return loader.load_item()
def parse_page(self, response):
    """Crawl the properties listed on one result page.

    Yields, in order:
      * one loaded item for every partner ad ("Partner-Anzeige") entry,
        filled only with a fixed advertiser id and the ``commercial``,
        ``property_type`` and ``city_category`` values from ``response.meta``;
      * one ``scrapy.Request`` per regular listing link, with
        ``parse_property`` as callback and ``response.meta`` passed along.
    """
    meta_fields = ("commercial", "property_type", "city_category")

    # --- Partner ads ("Partner-Anzeigen") ---
    # TODO: the correct css/xpath for these entries still has to be confirmed.
    # Candidates probed so far: //div[text() = 'Partner-Anzeige'] and
    # "div #ResultListData > ul.alist > li[data-ssp]" (scrapy.shell's
    # inspect_response is handy for checking). Extracting the header from the
    # entry's h3 is not wired up yet, so the matched node itself is unused.
    partner_ads = response.css("div #ResultListData > ul > li[data-ssp]")
    for _partner_ad in partner_ads:
        loader = PropertyLoader(item=PropertyItem(), response=response)
        loader.add_value("advertiser_id", "Immobilienscout24")
        # NOTE(review): the original tags city_category with "stats" -
        # presumably it is only used for statistics; confirm.
        for meta_key in meta_fields:
            loader.add_value(meta_key, response.meta.get(meta_key))
        yield loader.load_item()

    # --- Regular (non-partner) listings ---
    detail_links = response.css(
        "div #ResultListData > ul > li.hlisting > div.n2 > a::attr(\"href\")"
    )
    for link in detail_links:
        yield scrapy.Request(
            add_scheme_host(link.extract()),
            self.parse_property,
            meta=response.meta,
        )