def extract_results(self, response):
    """Yield ``{'title', 'text', 'url'}`` dicts parsed from a Bing results page.

    Entries whose caption paragraph is missing produce no result.
    """
    doc = Selector(response.text)
    for entry in doc.css('li.b_algo'):
        headline = entry.css('h2>a')[0].text.strip()
        captions = entry.css('div.b_caption>p')
        summary = captions[0].text.strip() if len(captions) > 0 else None
        target = entry.css('h2>a')[0].attr('href').strip()
        if summary is not None:
            yield {'title': headline, 'text': summary, 'url': target}
def extract_results(self, response):
    """Yield ``{'title', 'text', 'url'}`` dicts parsed from a Google results page.

    Entries without a ``span.st`` snippet produce no result.
    """
    page = Selector(response.text)
    for block in page.css('div.g'):
        heading = block.css('h3')[0].text.strip()
        snippets = block.css('span.st')
        snippet = snippets[0].text.strip() if len(snippets) > 0 else None
        link = block.css('div.r>a')[0].attr('href').strip()
        if snippet is not None:
            yield {'title': heading, 'text': snippet, 'url': link}
def extract_results(self, response):
    """Yield ``{'title', 'text', 'url'}`` dicts parsed from a Baidu results page.

    Entries without a ``div.c-abstract`` summary produce no result.
    """
    page = Selector(response.text)
    for block in page.css('div.result'):
        heading = block.css('h3>a')[0].text.strip()
        abstracts = block.css('div.c-abstract')
        summary = abstracts[0].text.strip() if len(abstracts) > 0 else None
        link = block.css('h3>a')[0].attr('href').strip()
        if summary is not None:
            yield {'title': heading, 'text': summary, 'url': link}
def extract_results(self, response):
    """Yield ``{'title', 'text', 'url'}`` dicts parsed from an Ask results page.

    Entries without a ``span.algo-summary`` produce no result.
    """
    doc = Selector(response.text)
    for hit in doc.css('li.algo-result'):
        headline = hit.css('a.algo-title')[0].text.strip()
        summaries = hit.css('span.algo-summary')
        summary = summaries[0].text.strip() if len(summaries) > 0 else None
        target = hit.css('a.algo-title')[0].attr('href').strip()
        if summary is not None:
            yield {'title': headline, 'text': summary, 'url': target}
def extract_results(self, response):
    """Yield ``{'title', 'text', 'url'}`` dicts parsed from a Yahoo results page.

    Yahoo wraps the landing URL in a redirect link; the real target is pulled
    out with ``self.yahoo_url_reg`` and percent-decoded before being yielded.
    """
    doc = Selector(response.text)
    for hit in doc.css('div.algo-sr'):
        headline = hit.css('h3>a')[0].text.strip()
        paragraphs = hit.css('p.lh-l')
        summary = paragraphs[0].text.strip() if len(paragraphs) > 0 else None
        wrapped = hit.css('h3>a')[0].attr('href').strip()
        target = unquote(self.yahoo_url_reg.search(wrapped).group(1))
        if summary is not None:
            yield {'title': headline, 'text': summary, 'url': target}
def extract_results(self, response):
    """Yield ``{'title', 'text', 'url'}`` dicts parsed from a Chinaso results page.

    Items without a headline link are skipped; relative links are resolved
    against the Chinaso search base URL.  Entries without a news wrapper
    produce no result.
    """
    selector = Selector(response.text)
    for item in selector.css('li.reItem'):
        a = item.css('h2>a')
        if len(a) <= 0:
            continue
        title = a[0].text.strip()
        text = None
        div = item.css('div.reNewsWrapper')
        if len(div) > 0:
            # keep only the first line of the wrapper's text
            text = div[0].text.strip().split('\n')[0]
        # reuse the already-matched anchor instead of re-querying the item
        url = urljoin('http://www.chinaso.com/search/',
                      a[0].attr('href').strip())
        if text is not None:
            yield {'title': title, 'text': text, 'url': url}
def parse(self, response):
    """Log every quote on the page, then the next-page URL if one exists.

    Fix: the next-page lookup is now guarded, so the last page (which has
    no ``li.next`` element) no longer raises ``IndexError`` — matching the
    guarded pagination used by the CSS-based sibling parser.
    """
    selector = Selector(response.text)
    for quote in selector.xpath('//div[@class="quote"]'):
        text = quote.xpath('.//span[@itemprop="text"]')[0].text
        author = quote.xpath('.//small[@itemprop="author"]')[0].text
        author_url = quote.xpath('.//span/a/@href')[0].text
        author_url = urljoin(str(response.url), author_url)
        tags = quote.xpath('.//div[@class="tags"]/a').text
        self.log('quote: %s',
                 dict(text=text, tags=tags, author=author,
                      author_url=author_url))
    next_link = selector.xpath('//li[@class="next"]/a/@href')
    if len(next_link) > 0:
        self.log('next page url: %s', next_link[0].text)
def parse(self, response):
    """Collect every quote on the page into ``self.quotes`` and follow pagination.

    Yields an ``HttpRequest`` for the next page (with this method as its
    callback) when a ``li.next`` link is present.
    """
    doc = Selector(response.text)
    for block in doc.css('div.quote'):
        quote_text = block.css('span.text')[0].text
        who = block.css('small.author')[0].text
        about = urljoin(str(response.url), block.css('small+a')[0].attr('href'))
        tag_names = block.css('div.tags a').text
        self.quotes.append(dict(text=quote_text, tags=tag_names,
                                author=who, author_url=about))
    pager = doc.css('li.next a')
    if len(pager) > 0:
        follow_url = urljoin(str(response.url), pager[0].attr('href'))
        yield HttpRequest(follow_url, callback=self.parse)
def extract_results(self, response):
    """Yield ``{'title', 'text', 'url'}`` dicts parsed from a 360 (so.com) results page.

    The summary is taken from ``p.res-desc`` when present, falling back to
    ``div.res-rich``; the target URL may live in ``data-url`` rather than
    ``href``.  Fix: the headline anchor is matched once and reused instead
    of being re-queried for the title and again for the URL.
    """
    selector = Selector(response.text)
    for item in selector.css('li.res-list'):
        # match the anchor once; it supplies title, data-url and href
        h3_a = item.css('h3>a')[0]
        title = h3_a.text.strip()
        text = None
        res_desc = item.css('p.res-desc')
        if len(res_desc) > 0:
            text = res_desc[0].text.strip()
        else:
            res_rich = item.css('div.res-rich')
            if len(res_rich) > 0:
                text = res_rich[0].text.strip()
        url = h3_a.attr('data-url')
        if not url:
            url = h3_a.attr('href').strip()
        if text is not None:
            yield {'title': title, 'text': text, 'url': url}
def extract_results(self, response):
    """Yield ``{'title', 'text', 'url'}`` dicts parsed from a Sogou results page.

    Items without a headline link are skipped; relative links are resolved
    against the Sogou base URL.  The summary comes from ``div.ft`` when
    present, falling back to ``p.str_info``.  Fix: the headline anchor held
    in ``h`` is reused for the URL instead of re-querying the item.
    """
    selector = Selector(response.text)
    for item in selector.css('div.vrwrap,div.rb'):
        h = item.css('h3>a')
        if len(h) <= 0:
            continue
        title = h[0].text.strip()
        text = None
        div_ft = item.css('div.ft')
        if len(div_ft) > 0:
            text = div_ft[0].text.strip()
        else:
            p_str = item.css('p.str_info')
            if len(p_str) > 0:
                text = p_str[0].text.strip()
        # reuse the already-matched anchor instead of re-querying the item
        url = urljoin('https://www.sogou.com/', h[0].attr('href').strip())
        if text is not None:
            yield {'title': title, 'text': text, 'url': url}
def parse(self, response):
    """Log the hot-news link texts on the page, numbered from 1.

    Fix: replaces the ``for i in range(len(hot))`` index loop with
    ``enumerate(hot, start=1)`` — same output, idiomatic iteration.
    """
    selector = Selector(response.text)
    hot = selector.css("div.hotnews a").text
    self.log("Hot News:")
    for rank, headline in enumerate(hot, start=1):
        self.log("%s: %s", rank, headline)
async def parse(self, response):
    """Log the tag texts from the tags box, then re-request the front page.

    Yields an ``HttpRequest`` for the site root with this coroutine as its
    own callback.
    """
    doc = Selector(response.text)
    tag_names = doc.xpath("//div[contains(@class, 'tags-box')]//a").text
    self.log("Top ten tags: %s", tag_names)
    yield HttpRequest("http://quotes.toscrape.com/", callback=self.parse)
<a href="/author/J-K-Rowling">(about)</a> </span> <div class="tags"> Tags: <a class="tag" href="/tag/abilities/page/1/">abilities</a> <a class="tag" href="/tag/choices/page/1/">choices</a> </div> </div> </body> </html> ''' if __name__ == '__main__': from xpaw import Selector selector = Selector(text) print('# CSS Selector, content of quotes:') for quote in selector.css('div.quote'): print(quote.css('span.text')[0].text) print('# XPath, content of quotes:') for quote in selector.xpath('//div[@class="quote"]'): print(quote.xpath('.//span[@class="text"]')[0].text) print('# CSS Selector, content of quotes, with HTML tags:') for quote in selector.css('div.quote'): print(quote.css('span.text')[0].string) print('# CSS Selector, quote tags') for quote in selector.css('div.quote'):
def parse_results(engine, resp):
    """Parse search results for *engine* out of an HTTP response.

    Args:
        engine: one of ``'Google'``, ``'Yahoo'``, ``'Ask'``, ``'Bing'``;
            any other value yields an empty list.
        resp: response object exposing ``.text`` and ``.url``.

    Returns:
        A list of ``{'title', 'text', 'url'}`` dicts (possibly empty).

    Raises:
        ServiceBanError: when Google has redirected to its "sorry" page.

    Fix: the four copy-pasted per-engine extraction loops are collapsed
    into one ``_collect`` helper; behavior is unchanged.
    """
    selector = Selector(resp.text)
    if engine == 'Google':
        if 'google.com.hk/sorry/' in resp.url:
            raise ServiceBanError
        topstuff = selector.css('#topstuff').text
        # "no results found" banner: nothing to collect
        if len(topstuff) > 0 and '未找到符合' in topstuff[0]:
            return []
        return _collect(selector, 'div.g', 'h3', 'span.st', 'div.r>a')
    if engine == 'Yahoo':
        # Yahoo wraps the target in a redirect URL; unwrap and percent-decode it
        return _collect(selector, 'div.algo-sr', 'h3>a', 'p.lh-l', 'h3>a',
                        lambda u: unquote(yahoo_url_reg.search(u).group(1)))
    if engine == 'Ask':
        return _collect(selector, 'li.algo-result', 'a.algo-title',
                        'span.algo-summary', 'a.algo-title')
    if engine == 'Bing':
        return _collect(selector, 'li.b_algo', 'h2>a', 'div.b_caption>p', 'h2>a')
    return []


def _collect(selector, item_css, title_css, text_css, url_css, url_transform=None):
    """Best-effort extraction of result dicts from a parsed results page.

    For each element matching *item_css*, builds a dict with the stripped
    title text, the stripped summary text (item skipped when absent) and the
    anchor's href (optionally mapped through *url_transform*).  Malformed
    items are silently skipped, matching the original per-item behavior.
    """
    res = []
    for item in selector.css(item_css):
        try:
            title = item.css(title_css)[0].text.strip()
            text = None
            spans = item.css(text_css)
            if len(spans) > 0:
                text = spans[0].text.strip()
            url = item.css(url_css)[0].attr('href').strip()
            if url_transform is not None:
                url = url_transform(url)
            if text is not None:
                res.append({'title': title, 'text': text, 'url': url})
        except Exception:
            # deliberate best-effort: one bad item must not kill the page
            pass
    return res
def parse(self, response):
    """Log every quote on the page as ``author: text``."""
    doc = Selector(response.text)
    for block in doc.css('div.quote'):
        body = block.css('span.text')[0].text
        who = block.css('small.author')[0].text
        self.log(who + ": " + body)