def test_get_node_text(self):
    """Check plain and smart text extraction on the sample DOM nodes."""
    bee_node = self.lxml_tree.xpath('//div[@id="bee"]')[0]
    self.assertEqual(
        get_node_text(bee_node),
        u'пчела mozilla = 777; body { color: green; }')
    self.assertEqual(get_node_text(bee_node, smart=True), u'пче ла')
    fly_node = self.lxml_tree.xpath('//div[@id="fly"]')[0]
    self.assertEqual(get_node_text(fly_node), u'му ха')
def strip_tags(self, content, smart=False):
    """
    Strip tags from the HTML content.
    """
    from lxml.html import fromstring

    doc = fromstring(content)
    return get_node_text(doc, smart=smart)
def text(self, smart=False, normalize_space=True):
    """Return the text content of the wrapped node.

    Text nodes are returned as-is (optionally whitespace-normalized);
    element nodes are rendered through get_node_text().
    """
    if not self.is_text_node():
        return get_node_text(self.node(), smart=smart,
                             normalize_space=normalize_space)
    raw = self.node()
    return normalize_space_func(raw) if normalize_space else raw
def css_text(self, path, default=NULL, smart=False, normalize_space=True):
    """
    Get normalized text of node which matches the css path.

    When no node matches, return *default* if given, otherwise
    re-raise the IndexError.
    """
    try:
        return get_node_text(self.css_one(path), smart=smart,
                             normalize_space=normalize_space)
    except IndexError:
        if default is NULL:
            raise
        return default
def parse_search_results(grab, parse_index_size=False, strict_query=False):
    """
    Parse yandex search results page content.

    Returns a list of dicts with keys: page, url, position, title,
    snippet.  Raises CaptchaError when a captcha page is detected and
    ParsingError when the page layout is not recognized.
    """
    if is_banned(grab):
        raise CaptchaError('Captcha found')
    elif grab.xpath_exists('//div[contains(@class, "b-error")]'):
        err_msg = grab.xpath_text('//div[contains(@class, "b-error")]')
        # Lazy %-style args instead of eager string formatting
        logging.debug('Found error message: %s', err_msg)
        return []
    elif grab.xpath_exists('//ol[contains(@class, "b-serp-list")]'):
        # TODO: handle strict_query ("no results for ..." message)
        # TODO: parse_index_size
        # FIX: original assigned `results = []` twice; kept a single one.
        results = []
        page_num = int(
            grab.xpath_text('//b[contains(@class, "b-pager__current")]'))
        for elem in grab.xpath_list('//li[contains(@class, "b-serp-item")]'):
            try:
                try:
                    title_elem = elem.xpath('.//h2/a')[0]
                    snippet = get_node_text(
                        elem.xpath(
                            './/div[contains(@class, "b-serp-item__text")]')
                        [0])
                except IndexError:
                    # This is a video item or similar result type
                    # lacking the title/snippet markup - skip it.
                    pass
                else:
                    item = {
                        'page': page_num,
                    }
                    # url
                    item['url'] = title_elem.get('href')
                    item['position'] = int(
                        elem.xpath(
                            './/h2/b[contains(@class, "b-serp-item__number")]'
                            '/text()')[0])
                    # title
                    item['title'] = get_node_text(title_elem)
                    item['snippet'] = snippet
                    results.append(item)
            except Exception as ex:
                # Keep parsing remaining items even if one is malformed;
                # FIX: original logged an empty message.
                logging.error('Failed to parse serp item', exc_info=ex)
        return results
    else:
        # FIX: dropped the stray print(); the exception carries the message.
        raise ParsingError('Could not identify yandex page format')
def parse_search_results(grab, parse_index_size=False, strict_query=False):
    """
    Parse google search results page content.

    Returns a list of result dicts (url, title, snippet, filetype,
    index_size, extended), or an empty list when nothing was found.
    Raises AccessDenied on HTTP 403, CaptchaFound when a captcha is
    detected and ParsingError when the page layout is unknown.
    """
    if grab.response.code == 403:
        raise AccessDenied('Access denied (HTTP 403)')
    elif grab.search(u'src="/sorry/image'):
        # Captcha!!!
        raise CaptchaFound('Captcha found')
    elif grab.css_exists('#ires'):
        # FIX: original condition was "strict_query and A or B", which due
        # to operator precedence entered this branch whenever B matched,
        # even with strict_query=False.  Parenthesize the intended grouping.
        if strict_query and (grab.search(u'Нет результатов для') or
                             grab.search(u'No results found for')):
            logging.debug('Query modified')
            # FIX: original fell through returning None here.
            return []
        if len(grab.css_list('#ires h3')):
            # Something was found
            # NOTE(review): the boolean parameter shadows the module-level
            # parse_index_size() helper, so the original call
            # "parse_index_size(grab)" raised TypeError (calling a bool).
            # Index size parsing is disabled until the name clash is
            # resolved - TODO: rename the parameter or the helper.
            index_size = None
            results = []
            for elem in grab.xpath_list('//*[h3[@class="r"]/a]'):
                title_elem = elem.xpath('h3/a')[0]
                # url
                url = title_elem.get('href')
                if url.startswith('/url?'):
                    url = url.split('?q=')[1].split('&')[0]
                    url = unquote_plus(url)
                # title
                title = get_node_text(title_elem)
                # snippet
                # Google could offer two types of snippet format: simple
                # and extended.  It depends on the user agent; for <IE8,
                # Opera, <FF3 you probably get the simple format.
                try:
                    snippet_node = elem.xpath('div[@class="s"]')[0]
                except IndexError:
                    # Probably a video or some other result type
                    # that is not supported yet
                    continue
                try:
                    subnode = snippet_node.xpath('span[@class="st"]')[0]
                    snippet = get_node_text(subnode, smart=False)
                    extended_result = True
                except IndexError:
                    drop_node(snippet_node, 'div')
                    drop_node(snippet_node, 'span[@class="f"]')
                    snippet = get_node_text(snippet_node, smart=False)
                    extended_result = False
                # filetype, e.g. "[PDF]"
                try:
                    filetype = elem.xpath(
                        './/span[contains(@class, "xsm")]/text()'
                    )[0].lower().strip('[]')
                except IndexError:
                    filetype = None
                if url:
                    results.append({
                        'url': url,
                        'title': title,
                        'snippet': snippet,
                        'filetype': filetype,
                        'index_size': index_size,
                        'extended': extended_result,
                    })
            return results
        # FIX: original had "pass  #return []" here, returning None.
        return []
    elif grab.css_exists('#res'):
        # Could be search results here,
        # or just the message "nothing was found"
        return []
    else:
        raise ParsingError('Could not identify google page format')
def task_html(self, grab, task):
    """
    Process index page for each blog to extract valuable data
    """
    logger.info("Get blog: {}".format(grab.response.url))
    # Find non-empty, stripped text chunks of body nodes.
    # FIX: filter()/map() return lazy iterators on Python 3, so the
    # original len(nodes_with_text) raised TypeError; build a list.
    nodes_with_text = [
        text for text in
        (chunk.strip() for chunk in grab.doc('//body//*').text_list())
        if text
    ]
    # Extract text from the document
    # excluding script and style tags
    doc_text = get_node_text(
        grab.doc('//body').node(),
        smart=True
    ).encode('utf-8', 'ignore')
    page_params = {
        'doc_size_bytes': sys.getsizeof(grab.response.body),
        'doc_size_chars': len(grab.response.body),
        'nodes_with_text': len(nodes_with_text),
        'content_text_chars': len(doc_text),
        'content_text_bytes': sys.getsizeof(doc_text),
    }
    # Extract links to scripts and css used on the page.
    scripts = grab.doc('//script[@src]').attr_list('src')
    css = grab.doc('//link[@rel="stylesheet"]').attr_list('href')
    # FIX: store concrete lists instead of lazy filter objects, which
    # would be empty/one-shot iterators by the time the data is saved.
    page_imports = {
        'scripts': [url for url in scripts if url],
        'css': [url for url in css if url],
    }
    # Extract meta attributes
    page_meta = {
        'title': grab.doc('//title').text().lower(),
        'description': grab.doc(
            '//meta[contains(@name, "description")]'
        ).attr('content', default=u"").lower(),
        'keywords': grab.doc(
            '//meta[contains(@name, "keywords")]'
        ).attr('content', default=u"").lower(),
        'charset': grab.doc('//meta[@charset]').attr(
            'charset', default=u"").lower(),
    }
    data = {
        'blog': task.blog,
        'source_url': grab.response.url,
        'params': page_params,
        'imports': page_imports,
        'meta': page_meta,
    }
    self.save_blog(data)
    # Get the rss feed for this blog.  We clone the grab object so it
    # behaves as if the user went to the blog page and clicked the
    # rss link.
    g = grab.clone()
    g.setup(url=task.blog['rss'])
    yield Task('rss', grab=g, data=data)
def parse_search_results(grab, parse_index_size=False, strict_query=False):
    """
    Parse yandex search results page content.

    Returns a list of dicts with keys: page, url, position, title,
    snippet.  Raises CaptchaError when a captcha page is detected and
    ParsingError when the page layout is not recognized.
    """
    if is_banned(grab):
        raise CaptchaError('Captcha found')
    elif grab.xpath_exists('//div[contains(@class, "b-error")]'):
        err_msg = grab.xpath_text('//div[contains(@class, "b-error")]')
        # Lazy %-style args instead of eager string formatting
        logging.debug('Found error message: %s', err_msg)
        return []
    elif grab.xpath_exists('//ol[contains(@class, "b-serp-list")]'):
        # TODO: handle strict_query ("no results for ..." message)
        # TODO: parse_index_size
        # FIX: original assigned `results = []` twice; kept a single one.
        results = []
        page_num = int(
            grab.xpath_text('//b[contains(@class, "b-pager__current")]'))
        for elem in grab.xpath_list('//li[contains(@class, "b-serp-item")]'):
            try:
                try:
                    title_elem = elem.xpath('.//h2/a')[0]
                    snippet = get_node_text(
                        elem.xpath(
                            './/div[contains(@class, "b-serp-item__text")]'
                        )[0])
                except IndexError:
                    # This is a video item or something like that,
                    # lacking title/snippet markup - skip it.
                    pass
                else:
                    item = {
                        'page': page_num,
                    }
                    # url
                    item['url'] = title_elem.get('href')
                    item['position'] = int(elem.xpath(
                        './/h2/b[contains(@class, "b-serp-item__number")]'
                        '/text()')[0])
                    # title
                    item['title'] = get_node_text(title_elem)
                    item['snippet'] = snippet
                    results.append(item)
            except Exception as ex:
                # Keep parsing remaining items even if one is malformed;
                # FIX: original logged an empty message.
                logging.error('Failed to parse serp item', exc_info=ex)
        return results
    else:
        # FIX: dropped the stray print(); the exception carries the message.
        raise ParsingError('Could not identify yandex page format')