def parse(self, response):
    """Yield one loaded item for every entry in the ``directory-url`` list.

    Each ``<li>`` under ``<ul class="directory-url">`` is handed to a
    ``WebsiteLoader``. The link text is fed to the ``name`` field, the link
    target to ``url``, and the text that follows a "- " separator (up to the
    end of that line) to ``description``.

    Args:
        response: Response object whose content is queried with XPath.

    Yields:
        Whatever ``WebsiteLoader.load_item()`` returns for each list entry.
    """
    # response.xpath() queries the response's own selector, so there is no
    # need to wrap the response in a separate Selector first.
    for site in response.xpath('//ul[@class="directory-url"]/li'):
        il = WebsiteLoader(response=response, selector=site)
        il.add_xpath('name', 'a/text()')
        il.add_xpath('url', 'a/@href')
        # Raw string: a plain literal containing "\s" is an invalid escape
        # sequence that newer Python versions warn about. The pattern still
        # captures the text between "- " and the following newline.
        il.add_xpath('description', 'text()', re=r'-\s([^\n]*?)\n')
        yield il.load_item()
def parse(self, response):
    """Yield one loaded item per ``site-item`` block found in *response*.

    Every ``<div class="site-item ">`` is wrapped in a ``WebsiteLoader`` and
    three relative XPath queries are registered on it (``name``, ``url`` and
    ``description``) before the loaded item is yielded.

    Args:
        response: Response object whose content is queried with XPath.

    Yields:
        Whatever ``WebsiteLoader.load_item()`` returns for each block.
    """
    # (field, XPath relative to one site-item block). Note the trailing
    # space inside the class values: the XPath equality test needs the
    # attribute text to match exactly.
    field_queries = (
        ('name', "div[@class='title-and-desc']/a/div/text()"),
        ('url', "div[@class='title-and-desc']/a/@href"),
        ('description', "div/div[@class='site-descr ']/text()"),
    )
    for entry in response.xpath("//div[@class='site-item ']"):
        loader = WebsiteLoader(response=response, selector=entry)
        for field_name, query in field_queries:
            loader.add_xpath(field_name, query)
        yield loader.load_item()
def parse(self, response):
    """Yield one loaded item for every entry in the ``directory-url`` list.

    Each ``<li>`` under ``<ul class="directory-url">`` is handed to a
    ``WebsiteLoader``. The link text is fed to the ``name`` field, the link
    target to ``url``, and the text that follows a "- " separator (up to the
    end of that line) to ``description``.

    Args:
        response: Response object whose content is queried with XPath.

    Yields:
        Whatever ``WebsiteLoader.load_item()`` returns for each list entry.

    The lines below are a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    sites = response.xpath('//ul[@class="directory-url"]/li')
    for site in sites:
        il = WebsiteLoader(response=response, selector=site)
        il.add_xpath('name', 'a/text()')
        il.add_xpath('url', 'a/@href')
        # Raw string: a plain literal containing "\s" is an invalid escape
        # sequence that newer Python versions warn about. The pattern still
        # captures the text between "- " and the following newline.
        il.add_xpath('description', 'text()', re=r'-\s([^\n]*?)\n')
        yield il.load_item()
def parse(self, response):
    """Yield one loaded item for every entry in the ``directory-url`` list.

    Each ``<li>`` under ``<ul class="directory-url">`` is handed to a
    ``WebsiteLoader``. The link text is fed to the ``name`` field, the link
    target to ``url``, and the text that follows a "- " separator (up to the
    end of that line) to ``description``.

    Args:
        response: Response object whose content is queried with XPath.

    Yields:
        Whatever ``WebsiteLoader.load_item()`` returns for each list entry.

    The lines below are a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    # HtmlXPathSelector and its select() method are deprecated Scrapy APIs;
    # response.xpath() runs the same query on the response's own selector.
    sites = response.xpath('//ul[@class="directory-url"]/li')
    for site in sites:
        il = WebsiteLoader(response=response, selector=site)
        il.add_xpath('name', 'a/text()')
        il.add_xpath('url', 'a/@href')
        # Raw string: a plain literal containing "\s" is an invalid escape
        # sequence that newer Python versions warn about. The pattern still
        # captures the text between "- " and the following newline.
        il.add_xpath('description', 'text()', re=r'-\s([^\n]*?)\n')
        yield il.load_item()