def test_get_node_text(self):
    elem = self.lxml_tree.xpath('//div[@id="bee"]')[0]
    self.assertEqual(get_node_text(elem),
                     u'пчела mozilla = 777; body { color: green; }')
    self.assertEqual(get_node_text(elem, smart=True), u'пче ла')
    elem = self.lxml_tree.xpath('//div[@id="fly"]')[0]
    self.assertEqual(get_node_text(elem), u'му ха')
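# A minimal usage sketch, not part of the test suite above: it assumes
# get_node_text is already imported from the project's helpers and parses a
# small HTML fragment inline instead of using self.lxml_tree.  Per the
# assertions above, the plain call keeps <script>/<style> text while
# smart=True drops it and normalizes spacing between inline nodes.
from lxml.html import fromstring

fragment = u'<div>пче<span>ла</span><script>mozilla = 777;</script></div>'
node = fromstring(fragment)
print(get_node_text(node))              # expected to include the script text
print(get_node_text(node, smart=True))  # expected to drop it, cf. u'пче ла'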
def task_yarpage(self, grab, task):
    css_path = "div.b-serp-item__inner"
    for elem in grab.css_list(css_path):
        row = {}
        for childelems in elem.iterchildren():
            # use .get() so children without a class attribute are skipped
            cls = childelems.attrib.get('class', '')
            if cls == 'b-serp-item__price':
                row['price'] = find_node_number(childelems, ignore_spaces=True)
            if cls == 'b-serp-item__header':
                row['header'] = get_node_text(childelems)
                ahref = childelems.iterchildren()
                row['link'] = list(ahref)[0].get('href')
            if cls == 'b-serp-item__about':
                row['about'] = get_node_text(childelems)
            if cls == 'b-serp-item__address':
                adresselems = childelems.iterchildren()
                adress_and_subway = list(adresselems)[1]
                adress = adress_and_subway.text
                adress_and_subway_iter = adress_and_subway.iterchildren()
                subway = list(adress_and_subway_iter)[0].text
                row['adress'] = adress
                row['subway'] = subway
            if cls == 'b-serp-item__owner':
                row['owner'] = get_node_text(childelems)
        row['time'] = int(time.time())
        self.csvfilesaver.save(listrow(row))
    grab.url.split('=page')  # NOTE: the split result is not used
def parse_search_results(grab, parse_index_size=False, strict_query=False):
    """
    Parse yandex search results page content.
    """
    if is_banned(grab):
        raise CaptchaError('Captcha found')
    elif grab.xpath_exists('//div[contains(@class, "b-error")]'):
        err_msg = grab.xpath_text('//div[contains(@class, "b-error")]')
        logging.debug('Found error message: %s' % err_msg)
        return []
    elif grab.xpath_exists('//ol[contains(@class, "b-serp-list")]'):
        # TODO:
        #if (strict_query and (
            #grab.search(u'Нет результатов для') or grab.search(u'No results found for'))):
            #pass
            #logging.debug('Query modified')

        # TODO: parse_index_size
        # Yield found results
        results = []
        page_num = int(grab.xpath_text('//b[contains(@class, "b-pager__current")]'))
        for elem in grab.xpath_list('//li[contains(@class, "b-serp-item")]'):
            try:
                try:
                    title_elem = elem.xpath('.//h2/a')[0]
                    snippet = get_node_text(
                        elem.xpath('.//div[contains(@class, "b-serp-item__text")]')[0])
                except IndexError:
                    # this is a video item or something like that
                    pass
                else:
                    item = {
                        'page': page_num,
                    }
                    # url
                    item['url'] = title_elem.get('href')
                    #if url.startswith('/url?'):
                        #url = url.split('?q=')[1].split('&')[0]
                        #url = urllib.unquote_plus(url)
                    item['position'] = int(elem.xpath(
                        './/h2/b[contains(@class, "b-serp-item__number")]/text()')[0])
                    # title
                    item['title'] = get_node_text(title_elem)
                    item['snippet'] = snippet
                    results.append(item)
            except Exception as ex:
                logging.error('', exc_info=ex)
        return results
def strip_tags(self, content, smart=False):
    """
    Strip tags from the HTML content.
    """
    from lxml.html import fromstring

    return get_node_text(fromstring(content), smart=smart)
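# A standalone sketch of what strip_tags() wraps; the `html` snippet is
# illustrative and get_node_text is assumed to be imported from the same
# helpers module used above.
from lxml.html import fromstring

html = '<p>Hello <b>world</b>!</p>'
print(get_node_text(fromstring(html)))              # tag-free text
print(get_node_text(fromstring(html), smart=True))  # smart whitespace handling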
def text(self, smart=False, normalize_space=True):
    """
    Return the text of the wrapped node; plain strings are returned
    as-is, optionally with normalized whitespace.
    """
    elem = self.node
    if isinstance(elem, basestring):
        if normalize_space:
            return normalize_space_func(elem)
        else:
            return elem
    else:
        return get_node_text(elem, smart=smart,
                             normalize_space=normalize_space)
def css_text(self, path, default=NULL, smart=False, normalize_space=True):
    """
    Get normalized text of node which matches the css path.
    """
    try:
        return get_node_text(self.css_one(path), smart=smart,
                             normalize_space=normalize_space)
    except IndexError:
        if default is NULL:
            raise
        else:
            return default
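# Hypothetical usage of the css_text() method above: `page` stands in for an
# object exposing css_text()/css_one(), and the selector and default value
# are illustrative only.
title = page.css_text('h1.article-title', default='')
if not title:
    print('no title node matched, fell back to the default')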
def parse_search_results(grab, parse_index_size=False, strict_query=False):
    """
    Parse yandex search results page content.
    """
    if is_banned(grab):
        raise CaptchaError('Captcha found')
    elif grab.xpath_exists('//div[contains(@class, "b-error")]'):
        err_msg = grab.xpath_text('//div[contains(@class, "b-error")]')
        logging.debug('Found error message: %s' % err_msg)
        return []
    elif grab.xpath_exists('//ol[contains(@class, "b-serp-list")]'):
        # TODO:
        #if (strict_query and (
            #grab.search(u'Нет результатов для') or grab.search(u'No results found for'))):
            #pass
            #logging.debug('Query modified')

        # TODO: parse_index_size
        # Yield found results
        results = []
        page_num = int(
            grab.xpath_text('//b[contains(@class, "b-pager__current")]'))
        for elem in grab.xpath_list('//li[contains(@class, "b-serp-item")]'):
            try:
                try:
                    title_elem = elem.xpath('.//h2/a')[0]
                    snippet = get_node_text(
                        elem.xpath(
                            './/div[contains(@class, "b-serp-item__text")]')[0])
                except IndexError:
                    # this is a video item or something like that
                    pass
                else:
                    item = {
                        'page': page_num,
                    }
                    # url
                    item['url'] = title_elem.get('href')
                    #if url.startswith('/url?'):
                        #url = url.split('?q=')[1].split('&')[0]
                        #url = unquote_plus(url)
                    item['position'] = int(
                        elem.xpath(
                            './/h2/b[contains(@class, "b-serp-item__number")]/text()'
                        )[0])
                    # title
                    item['title'] = get_node_text(title_elem)
                    item['snippet'] = snippet
                    results.append(item)
            except Exception as ex:
                logging.error('', exc_info=ex)
        return results
    else:
        print('parsing error')
        raise ParsingError('Could not identify yandex page format')
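# A hypothetical driver for the yandex parser above: Grab is assumed to be
# importable from the grab package, the search URL is illustrative, the query
# is assumed to be URL-encoded already, and CaptchaError comes from the same
# module as parse_search_results.
import logging
from grab import Grab

def fetch_yandex_results(query):
    g = Grab()
    g.go('http://yandex.ru/yandsearch?text=%s' % query)
    try:
        return parse_search_results(g)
    except CaptchaError:
        logging.error('Captcha shown for query: %s' % query)
        return []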
def parse_search_results(grab, parse_index_size=False, strict_query=False):
    """
    Parse google search results page content.
    """
    #elif grab.search(u'please type the characters below'):
    if grab.response.code == 403:
        raise AccessDenied('Access denied (HTTP 403)')
    elif grab.search(u'src="/sorry/image'):
        # Captcha!!!
        raise CaptchaFound('Captcha found')
    elif grab.css_exists('#ires'):
        if strict_query and (grab.search(u'Нет результатов для') or
                             grab.search(u'No results found for')):
            pass
            logging.debug('Query modified')
        else:
            if len(grab.css_list('#ires h3')):
                # Something was found
                if parse_index_size:
                    # NOTE: the boolean argument shadows the parse_index_size()
                    # helper, so this call only works when a callable is passed in
                    index_size = parse_index_size(grab)
                else:
                    index_size = None

                # Yield found results
                results = []
                for elem in grab.xpath_list('//*[h3[@class="r"]/a]'):
                    title_elem = elem.xpath('h3/a')[0]

                    # url
                    url = title_elem.get('href')
                    if url.startswith('/url?'):
                        url = url.split('?q=')[1].split('&')[0]
                        url = unquote_plus(url)

                    # title
                    title = get_node_text(title_elem)

                    # snippet
                    # Google could offer two types of snippet format: simple and extended
                    # It depends on the user agent
                    # For <IE8, Opera, <FF3 you probably get the simple format
                    try:
                        snippet_node = elem.xpath('div[@class="s"]')[0]
                    except IndexError as ex:
                        # Probably it is a video or some other result
                        # Such result types are not supported yet
                        continue

                    try:
                        subnode = snippet_node.xpath('span[@class="st"]')[0]
                        snippet = get_node_text(subnode, smart=False)
                        extended_result = True
                    except IndexError:
                        drop_node(snippet_node, 'div')
                        drop_node(snippet_node, 'span[@class="f"]')
                        snippet = get_node_text(snippet_node, smart=False)
                        extended_result = False

                    # filetype
                    try:
                        filetype = elem.xpath('.//span[contains(@class, "xsm")]'
                                              '/text()')[0].lower().strip('[]')
                    except IndexError:
                        filetype = None

                    #if 'File Format':
                    if url:
                        results.append({
                            'url': url,
                            'title': title,
                            'snippet': snippet,
                            'filetype': filetype,
                            'index_size': index_size,
                            'extended': extended_result,
                        })
                return results
            else:
                pass
                #return []
    elif grab.css_exists('#res'):
        # Could be search results here?
        # or just message "nothing was found"?
        pass
    else:
        raise ParsingError('Could not identify google page format')
query = 'habrahabr'
#file_out = 'out1.html'
go_url = golib._url(query, page=1)
g = Grab()
#g.go(go_url, log_file=file_out)
g.go(go_url)
print g.css_text('#resultStats')
print '---'
f = open('data_file.txt', 'a')
for elem in g.xpath_list('//div [@id="ires"]'):
    for i in range(len(elem.xpath('//span [@class="f nsa"]'))):
        title_elem = elem.xpath('//span [@class="f nsa"]')[i]
        data = get_node_text(title_elem, smart=False)
        #if (data[:1] != '1'):
        print 'out->', data
        f.write(data + '\n')
f.close()
print '---'
#print g.xpath_list('//span [@class="f nsa"]')
#print g.css_text('#search')
#print g.xpath_text('//h2[@class="hd"]')
#print g.xpath_text('//*[h3[@class="r"]/a]')
def parse_search_results(grab, parse_index_size=False, strict_query=False):
    """
    Parse google search results page content.
    """
    # elif grab.search(u'please type the characters below'):
    if grab.search(u'src="/sorry/image'):
        # Captcha!!!
        raise CaptchaError("Captcha found")
    elif grab.css_exists("#ires"):
        if strict_query and (grab.search(u"Нет результатов для")
                             or grab.search(u"No results found for")):
            pass
            logging.debug("Query modified")
        else:
            if len(grab.css_list("#ires h3")):
                # Something was found
                if parse_index_size:
                    index_size = parse_index_size(grab)
                else:
                    index_size = None

                # Yield found results
                results = []
                for elem in grab.xpath_list('//*[h3[@class="r"]/a]'):
                    title_elem = elem.xpath("h3/a")[0]

                    # url
                    url = title_elem.get("href")
                    if url.startswith("/url?"):
                        url = url.split("?q=")[1].split("&")[0]
                        url = unquote_plus(url)

                    # title
                    title = get_node_text(title_elem)

                    # snippet
                    # Google could offer two types of snippet format: simple and extended
                    # It depends on the user agent
                    # For <IE8, Opera, <FF3 you probably get the simple format
                    try:
                        snippet_node = elem.xpath('div[@class="s"]')[0]
                    except IndexError as ex:
                        # Probably it is a video or some other result
                        # Such result types are not supported yet
                        continue

                    try:
                        subnode = snippet_node.xpath('span[@class="st"]')[0]
                        snippet = get_node_text(subnode, smart=False)
                        extended_result = True
                    except IndexError:
                        drop_node(snippet_node, "div")
                        drop_node(snippet_node, 'span[@class="f"]')
                        snippet = get_node_text(snippet_node, smart=False)
                        extended_result = False

                    # filetype
                    try:
                        filetype = elem.xpath('.//span[contains(@class, "xsm")]'
                                              "/text()")[0].lower().strip("[]")
                    except IndexError:
                        filetype = None

                    # if 'File Format':
                    if url:
                        results.append({
                            "url": url,
                            "title": title,
                            "snippet": snippet,
                            "filetype": filetype,
                            "index_size": index_size,
                            "extended": extended_result,
                        })
                return results
            else:
                pass
                # return []
    elif grab.css_exists("#res"):
        # Could be search results here?
        # or just message "nothing was found"?
        pass
    else:
        raise ParsingError("Could not identify google page format")