def parse_item(self, response):
    """Build a ``HeelsItem`` from a thread page response.

    The item is filled with:

    * ``comment`` -- text nodes of the element whose id is
      ``thread_subject``.
    * ``image_urls`` -- the ``zoomfile`` attribute of every ``<img>``
      found under an ``<ignore_js_op>`` element.
    * ``source_url`` -- the URL of the response itself.

    Both extracted fields are lists of strings (empty when nothing
    matches).
    """
    subject_xpath = '//*[@id="thread_subject"]/text()'
    zoomfile_xpath = '//ignore_js_op//img/@zoomfile'

    subject_text = response.xpath(subject_xpath).extract()
    zoom_images = response.xpath(zoomfile_xpath).extract()

    item = HeelsItem()
    item['comment'] = subject_text
    item['image_urls'] = zoom_images
    item['source_url'] = response.url
    return item
def parse_post_detail(self, response):
    """Build a ``HeelsItem`` from a blog post page response.

    The item is filled with:

    * ``comment`` -- text of the page ``<title>``.
    * ``image_urls`` -- ``src`` of every ``<img>`` located under the
      fixed positional path inside the element with id ``Blog1``.
    * ``source_url`` -- the URL of the response itself.

    Both extracted fields are lists of strings (empty when nothing
    matches).
    """
    # Query through the response's own selector shortcut rather than the
    # legacy HtmlXPathSelector/.select() API, which current Scrapy
    # releases no longer provide.
    title_text = response.xpath('//title/text()').extract()
    image_sources = response.xpath(
        '//*[@id="Blog1"]/div[1]/div/div/div/div[1]//img/@src'
    ).extract()

    item = HeelsItem()
    item['comment'] = title_text
    item['image_urls'] = image_sources
    item['source_url'] = response.url
    return item
def parse_post_detail(self, response):
    """Build a ``HeelsItem`` from a blog post page response.

    The item is filled with:

    * ``comment`` -- text of the page ``<title>``.
    * ``image_urls`` -- ``src`` of every ``<img>`` inside a ``div``
      whose class contains ``post-body``, itself inside a ``div`` whose
      class contains ``post``, under the element with id ``Blog1``.
    * ``source_url`` -- the URL of the response itself.

    Both extracted fields are lists of strings (empty when nothing
    matches).
    """
    # Query through the response's own selector shortcut rather than the
    # legacy HtmlXPathSelector/.select() API, which current Scrapy
    # releases no longer provide.
    title_text = response.xpath('//title/text()').extract()
    image_sources = response.xpath(
        '//*[@id="Blog1"]//div[contains(@class, "post")]//div[contains(@class, "post-body")]//img/@src'
    ).extract()

    item = HeelsItem()
    item['comment'] = title_text
    item['image_urls'] = image_sources
    item['source_url'] = response.url
    return item
def parse_post_detail(self, response):
    """Build a ``HeelsItem`` from a blog post page response.

    The response is parsed with an explicitly HTML-typed ``Selector``.
    The item is filled with:

    * ``comment`` -- text of the page ``<title>``.
    * ``image_urls`` -- ``src`` of every ``<img>`` located under the
      fixed positional path inside the element with id ``Blog1``.
    * ``source_url`` -- the URL of the response itself.

    Both extracted fields are lists of strings (empty when nothing
    matches).
    """
    title_xpath = '//title/text()'
    images_xpath = '//*[@id="Blog1"]/div[1]/div/div/div/div[1]//img/@src'

    page = Selector(response, type='html')
    title_text = page.xpath(title_xpath).extract()
    image_sources = page.xpath(images_xpath).extract()

    item = HeelsItem()
    item['comment'] = title_text
    item['image_urls'] = image_sources
    item['source_url'] = response.url
    return item
def parse_item_detail(self, response):
    """Build a ``HeelsItem`` from an item detail page response.

    The item is filled with:

    * ``comment`` -- text nodes of every ``<figcaption>`` inside a
      ``<figure>`` under the element with id ``content``.
    * ``image_urls`` -- ``src`` of every ``<img>`` inside a ``<span>``
      whose class contains ``wrapper-fig-image``, under the element
      with id ``content``.
    * ``source_url`` -- the URL of the response itself.

    Both extracted fields are lists of strings (empty when nothing
    matches).
    """
    # Query through the response's own selector shortcut rather than the
    # legacy HtmlXPathSelector/.select() API, which current Scrapy
    # releases no longer provide.
    captions = response.xpath(
        '//*[@id="content"]//figure//figcaption/text()'
    ).extract()
    image_sources = response.xpath(
        '//*[@id="content"]//span[contains(@class, "wrapper-fig-image")]//img/@src'
    ).extract()

    item = HeelsItem()
    item['comment'] = captions
    item['image_urls'] = image_sources
    item['source_url'] = response.url
    return item
def parse_pin_detail(self, response):
    """Build a ``HeelsItem`` from a pin detail page response.

    The item is filled with:

    * ``comment`` -- text of the page ``<title>``.
    * ``image_urls`` -- every ``<img>`` ``src`` followed by every
      ``<a>`` ``href`` found inside a ``div`` whose class contains
      ``pinImageSourceWrapper``, itself inside a ``div`` whose class
      contains ``pinWrapper``.
    * ``source_url`` -- the URL of the response itself.

    Both extracted fields are lists of strings (empty when nothing
    matches).
    """
    # Both queries share this container prefix; each full query string
    # is identical to the original literal.
    wrapper = (
        '//div[contains(@class, "pinWrapper")]'
        '//div[contains(@class, "pinImageSourceWrapper")]'
    )

    # Query through the response's own selector shortcut rather than the
    # legacy HtmlXPathSelector/.select() API, which current Scrapy
    # releases no longer provide.
    title_text = response.xpath('//title/text()').extract()
    img_sources = response.xpath(wrapper + '//img/@src').extract()
    # NOTE(review): these are anchor hrefs, not <img> sources; presumably
    # they link to the full-size image -- confirm they belong in image_urls.
    link_targets = response.xpath(wrapper + '//a/@href').extract()

    item = HeelsItem()
    item['comment'] = title_text
    item['image_urls'] = img_sources + link_targets
    item['source_url'] = response.url
    return item