def __init__(self, config=None, html='', query=''):
    """Create new Parser instance and parse all information."""
    self.config = config or {}
    self.searchtype = self.config.get('search_type', 'normal')
    assert self.searchtype in self.search_types, 'search type "{}" is not supported in {}'.format(
        self.searchtype, self.__class__.__name__)

    self.query = query
    self.html = html
    self.dom = None
    self.search_results = {}
    self.num_results_for_query = ''
    self.num_results = 0
    self.effective_query = ''
    self.page_number = -1
    self.no_results = False
    self.related_keywords = {}

    # to be set by the implementing sub classes
    self.search_engine = ''

    # short alias because we use it so extensively
    self.css_to_xpath = HTMLTranslator().css_to_xpath

    if self.html:
        self.parse()
def __init__(self, html=None, query=''):
    """Create new Parser instance and parse all information.

    Args:
        html: The raw html from the search engine search. If not provided, you can
            parse the data later by calling parse(html) directly.
        query: The keyword that was searched for. The search type itself is read
            from Config['SCRAPING'] and defaults to "normal".

    Raises:
        Assertion error if the subclassed specific parser cannot handle the settings.
    """
    self.searchtype = Config['SCRAPING'].get('search_type', 'normal')
    assert self.searchtype in self.search_types, 'search type "{}" is not supported in {}'.format(
        self.searchtype, self.__class__.__name__)

    self.query = query
    self.html = html
    self.dom = None
    self.search_results = {}
    self.num_results_for_query = ''
    self.num_results = 0
    self.effective_query = ''
    self.page_number = -1
    self.no_results = False

    # to be set by the implementing sub classes
    self.search_engine = ''

    # short alias because we use it so extensively
    self.css_to_xpath = HTMLTranslator().css_to_xpath

    if self.html:
        self.parse()
def get_game_data(username):
    # This is the "all games" tab of the profile, not the wishlist.
    games_url = 'http://steamcommunity.com/id/%s/games/?tab=all' % (username,)
    response = urllib.request.urlopen(games_url)
    html_data = response.read().decode('utf-8')
    doc = html.document_fromstring(html_data)
    translator = HTMLTranslator()
    row_selector = translator.css_to_xpath('script[language=javascript]')
    games = None
    for el in doc.xpath(row_selector):
        variables = parse_script(el.text_content())
        for variable in variables:
            if variable.identifier.value == 'rgGames':
                games = variable
    return [to_map(item) for item in games.initializer.items]
def CSSSelect(expr):
    try:
        return css_select_cache[expr]
    except KeyError:
        from cssselect import HTMLTranslator
        from lxml.etree import XPath
        ans = css_select_cache[expr] = XPath(HTMLTranslator().css_to_xpath(expr))
        return ans
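
# A possible usage sketch for the CSSSelect() helper above. It assumes a
# module-level css_select_cache dict, which the snippet implies but does not
# show; the markup is invented for illustration.
from lxml import html

css_select_cache = {}

doc = html.fromstring('<div class="entry"><a href="/x">link</a></div>')
sel = CSSSelect('div.entry > a')          # compiled once, cached for later calls
print([a.get('href') for a in sel(doc)])  # ['/x']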
def shorten_title(doc):
    title = doc.find('.//title').text
    if not title:
        return ''

    title = orig = norm_title(title)

    candidates = set()

    for item in ['.//h1', './/h2', './/h3']:
        for e in list(doc.iterfind(item)):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    from cssselect import HTMLTranslator
    css_to_xpath = HTMLTranslator().css_to_xpath
    for item in ('#title', '#head', '#heading', '.pageTitle', '.news_title',
                 '.title', '.head', '.heading', '.contentheading',
                 '.small_header_red'):
        for e in doc.xpath(css_to_xpath(item)):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    if candidates:
        title = sorted(candidates, key=len)[-1]
    else:
        for delimiter in [' | ', ' - ', ' :: ', ' / ']:
            if delimiter in title:
                parts = orig.split(delimiter)
                if len(parts[0].split()) >= 4:
                    title = parts[0]
                    break
                elif len(parts[-1].split()) >= 4:
                    title = parts[-1]
                    break
        else:
            if ': ' in title:
                parts = orig.split(': ')
                if len(parts[-1].split()) >= 4:
                    title = parts[-1]
                else:
                    title = orig.split(': ', 1)[1]

    if not 15 < len(title) < 150:
        return orig

    return title
def generate_examples_from_file(file_path):
    """Extracts a list of strings representing header and example elements
    from the file specified by `file_path`.
    """
    expression = HTMLTranslator().css_to_xpath(EXTRACTED_SELECTORS)
    document = lxml.html.parse(file_path)
    elements = document.xpath(expression)
    elements = filter_nonexample_headers(elements)
    elements = filter_duplicated_descendants(elements)
    for el in elements:
        html = lxml.etree.tostring(el, pretty_print=True, method='html')
        yield rewrite_asset_urls(html)
def mutate_selector_del(selector, method, expression):
    """Under the covers, Selectors contain an lxml.etree.Element document root,
    which is not exposed by the Selector interface. It is mutable via the
    .remove method on parts of the selector.root document tree. Unfortunately,
    there is no native content removal interface in scrapy.

    As this is not using a published interface for Selector, it must be
    considered risky. In particular, it is feasible (though not likely) that
    scrapy could change its selector implementation to use a different HTML/XML
    parsing library, at which point this would fail.
    """
    try:
        if method == 'xpath':
            s = expression
        elif method == 'css':
            s = HTMLTranslator().css_to_xpath(expression)
        else:
            raise NotImplementedError
        for node in selector.root.xpath(s):
            node.getparent().remove(node)
    except Exception as e:
        logger.error('mutate_selector_del({}, {}, {}) failed: {}'.format(
            selector, method, expression, e))
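
# A hedged usage sketch for mutate_selector_del(). It assumes the function
# above and its imports (HTMLTranslator, logger) are in scope and that scrapy
# is installed; the markup is invented.
from scrapy.selector import Selector

sel = Selector(text='<div><script>x()</script><p>keep me</p></div>')
mutate_selector_del(sel, 'css', 'script')          # drop all <script> nodes in place
print(sel.xpath('//p/text()').extract())           # ['keep me']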
def find_page_breaks(self, item):
    if self.page_break_selectors is None:
        from calibre.ebooks.oeb.stylizer import fix_namespace
        css_to_xpath = HTMLTranslator().css_to_xpath
        self.page_break_selectors = set([])
        stylesheets = [x.data for x in self.oeb.manifest if x.media_type in OEB_STYLES]
        for rule in rules(stylesheets):
            before = getattr(rule.style.getPropertyCSSValue('page-break-before'),
                             'cssText', '').strip().lower()
            after = getattr(rule.style.getPropertyCSSValue('page-break-after'),
                            'cssText', '').strip().lower()
            try:
                if before and before not in {'avoid', 'auto', 'inherit'}:
                    self.page_break_selectors.add(
                        (XPath(fix_namespace(css_to_xpath(rule.selectorText))), True))
                    if self.remove_css_pagebreaks:
                        rule.style.removeProperty('page-break-before')
            except:
                pass
            try:
                if after and after not in {'avoid', 'auto', 'inherit'}:
                    self.page_break_selectors.add(
                        (XPath(fix_namespace(css_to_xpath(rule.selectorText))), False))
                    if self.remove_css_pagebreaks:
                        rule.style.removeProperty('page-break-after')
            except:
                pass

    page_breaks = set([])
    for selector, before in self.page_break_selectors:
        body = item.data.xpath('//h:body', namespaces=NAMESPACES)
        if not body:
            continue
        for elem in selector(body[0]):
            if elem not in body:
                elem.set('pb_before', '1' if before else '0')
                page_breaks.add(elem)

    for i, elem in enumerate(item.data.iter()):
        try:
            elem.set('pb_order', str(i))
        except TypeError:
            # Can't set attributes on comment nodes etc.
            continue

    page_breaks = list(page_breaks)
    page_breaks.sort(key=lambda x: int(x.get('pb_order')))
    page_break_ids, page_breaks_ = [], []
    for i, x in enumerate(page_breaks):
        x.set('id', x.get('id', 'calibre_pb_%d' % i))
        id = x.get('id')
        try:
            xp = XPath('//*[@id="%s"]' % id)
        except:
            try:
                xp = XPath("//*[@id='%s']" % id)
            except:
                # The id has both a quote and an apostrophe or some other
                # problem character. Just replace it, since I doubt it's going
                # to work anywhere else either.
                id = 'calibre_pb_%d' % i
                x.set('id', id)
                xp = XPath('//*[@id=%r]' % id)
        page_breaks_.append((xp, x.get('pb_before', '0') == '1'))
        page_break_ids.append(id)

    for elem in item.data.iter():
        elem.attrib.pop('pb_order', False)
        elem.attrib.pop('pb_before', False)

    return page_breaks_, page_break_ids
def css_select(self, response, css_selector):
    document = self.parse_response(response)
    expression = HTMLTranslator().css_to_xpath(css_selector)
    return document.xpath(expression)
class GoogleParser():
    """Parses data from Google SERP pages."""

    # Named tuple type for the search results
    Result = namedtuple('LinkResult', 'link_title link_snippet link_url link_position')

    # short alias because we use it so extensively
    _xp = HTMLTranslator().css_to_xpath

    # Valid URL (taken from django)
    _REGEX_VALID_URL = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    _REGEX_VALID_URL_SIMPLE = re.compile(
        'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

    def __init__(self, html, searchtype='normal'):
        self.html = html
        self.searchtype = searchtype
        self.dom = None
        self.search_results = {'num_results_for_kw': []}

        # Try to parse the google HTML result using lxml
        try:
            doc = UnicodeDammit(self.html, is_html=True)
            parser = lxml.html.HTMLParser(encoding=doc.declared_html_encoding)
            self.dom = lxml.html.document_fromstring(self.html, parser=parser)
            self.dom.resolve_base_href()
        except Exception as e:
            print('Some error occurred while lxml tried to parse: {}'.format(e))

        # Very redundant by now, but might change in the near future
        if self.searchtype == 'normal':
            self.search_results.update({
                'results': [],    # List of Result, list of named tuples
                'ads_main': [],   # The google ads in the main result set.
                'ads_aside': [],  # The google ads on the right aside.
            })
        elif self.searchtype == 'video':
            self.search_results.update({
                'results': [],    # Video search results
                'ads_main': [],   # The google ads in the main result set.
                'ads_aside': [],  # The google ads on the right aside.
            })
        elif self.searchtype == 'image':
            self.search_results.update({
                'results': [],    # Images links
            })
        elif self.searchtype == 'news':
            self.search_results.update({
                'results': [],    # Links from news search
                'ads_main': [],   # The google ads in the main result set.
                'ads_aside': [],  # The google ads on the right aside.
            })

        ### the actual PARSING happens here
        parsing_actions = {
            'normal': self._parse_normal_search,
            'image': self._parse_image_search,
            'video': self._parse_video_search,
            'news': self._parse_news_search,
        }
        # Call the correct parsing method
        parsing_actions.get(self.searchtype)(self.dom)

        # Clean the results
        self._clean_results()

    def __iter__(self):
        """Simple magic method to iterate quickly over found non ad results"""
        for result in self.search_results['results']:
            yield (result.link_title, result.link_snippet, result.link_url)

    def num_results(self):
        """Returns the number of pages found by keyword as shown in top of SERP page."""
        return self.search_results['num_results_for_kw']

    @property
    def results(self):
        """Returns all results including sidebar and main result advertisements"""
        return {k: v for k, v in self.search_results.items()
                if k not in ('num_results_for_kw',)}

    @property
    def all_results(self):
        return self.search_results

    @property
    def links(self):
        """Only returns non ad results"""
        return self.search_results['results']

    def _clean_results(self):
        """Cleans/extracts the found href or data-href attributes."""
        # Now try to create ParseResult objects from the URL
        for key in ('results', 'ads_aside', 'ads_main'):
            for i, e in enumerate(self.search_results[key]):
                # First try to extract the url from the strange relative /url?sa= format
                matcher = re.search(r'/url\?q=(?P<url>.*?)&sa=U&ei=', e.link_url)
                if matcher:
                    url = matcher.group(1)
                else:
                    url = e.link_url
                self.search_results[key][i] = self.Result(
                    link_title=e.link_title,
                    link_url=urllib.parse.urlparse(url),
                    link_snippet=e.link_snippet,
                    link_position=e.link_position)

    def _parse_num_results(self):
        # try to get the number of results for our search query
        try:
            self.search_results['num_results_for_kw'] = \
                self.dom.xpath(self._xp('div#resultStats'))[0].text_content()
        except Exception as e:
            logger.debug('Cannot parse number of results for keyword from SERP page: {}'.format(e))

    def _parse_normal_search(self, dom):
        """Specifies the CSS selectors to extract links/snippets for a normal search.

        @param dom The page source to parse.
        """
        # There might be several lists of different css selectors to handle different SERP formats
        css_selectors = {
            # to extract all links of non-ad results, including their snippets (descriptions) and titles.
            'results': (['li.g', 'h3.r > a', 'div.s span.st'],),
            # to parse the centered ads
            'ads_main': (['div#center_col li.ads-ad', 'h3.r > a', 'div.ads-creative'],
                         ['div#tads li', 'h3 > a:first-child', 'span:last-child']),
            # the ads on the right
            'ads_aside': (['#rhs_block li.ads-ad', 'h3.r > a', 'div.ads-creative'],),
        }
        self._parse(dom, css_selectors)

    def _parse_image_search(self, dom):
        """Specifies the CSS selectors to extract links/snippets for an image search."""
        css_selectors = {
            'results': (['div.rg_di', 'a:first-child', 'span.rg_ilmn'],),
        }
        self._parse(dom, css_selectors)

    def _parse_video_search(self, dom):
        """Specifies the CSS selectors to extract links/snippets for a video search.

        Very similar to a normal search. Basically the same. But this is a unique
        method because the parsing logic may change over time.
        """
        css_selectors = {
            # to extract all links of non-ad results, including their snippets (descriptions) and titles.
            'results': (['li.g', 'h3.r > a:first-child', 'div.s > span.st'],),
            # to parse the centered ads
            'ads_main': (['div#center_col li.ads-ad', 'h3.r > a', 'div.ads-creative'],
                         ['div#tads li', 'h3 > a:first-child', 'span:last-child']),
            # the ads on the right
            'ads_aside': (['#rhs_block li.ads-ad', 'h3.r > a', 'div.ads-creative'],),
        }
        self._parse(dom, css_selectors)

    def _parse_news_search(self, dom):
        """Specifies the CSS selectors to extract links/snippets for a news search.

        Is also similar to a normal search, but must be a separate function since
        https://news.google.com/nwshp? needs its own parsing code...
        """
        css_selectors = {
            # to extract all links of non-ad results, including their snippets (descriptions) and titles.
            # The first CSS selector is the wrapper element where the search results are situated.
            # The second CSS selector selects the link and the title. If there are 4 elements in the
            # list, then the second and the third element are for the link and the title.
            # The 4th selector is for the snippet.
            'results': (['li.g', 'h3.r > a:first-child', 'div.s span.st'],),
            # to parse the centered ads
            'ads_main': (['div#center_col li.ads-ad', 'h3.r > a', 'div.ads-creative'],
                         ['div#tads li', 'h3 > a:first-child', 'span:last-child']),
            # the ads on the right
            'ads_aside': (['#rhs_block li.ads-ad', 'h3.r > a', 'div.ads-creative'],),
        }
        self._parse(dom, css_selectors)

    def _parse(self, dom, css_selectors):
        """Generic parse method"""
        for key, slist in css_selectors.items():
            for selectors in slist:
                self.search_results[key].extend(self._parse_links(dom, *selectors))
        self._parse_num_results()

    def _parse_links(self, dom, container_selector, link_selector, snippet_selector):
        links = []
        # Try to extract all links of non-ad results, including their snippets (descriptions)
        # and titles. The parsing should be as robust as possible. Sometimes we can't extract
        # all data, but as much as humanly possible.
        rank = 0
        try:
            li_g_results = dom.xpath(self._xp(container_selector))
            for i, e in enumerate(li_g_results):
                snippet = link = title = ''
                try:
                    link_element = e.xpath(self._xp(link_selector))
                    link = link_element[0].get('href')
                    title = link_element[0].text_content()
                    # For every result where we can parse the link and title, increase the rank
                    rank += 1
                except IndexError as err:
                    logger.debug('Error while parsing link/title element with selector={}: {}'
                                 .format(link_selector, err))
                try:
                    for r in e.xpath(self._xp(snippet_selector)):
                        snippet += r.text_content()
                except Exception as err:
                    logger.debug('Error in parsing snippet with selector={}. Error: {}'
                                 .format(snippet_selector, err))

                links.append(self.Result(link_title=title,
                                         link_url=link,
                                         link_snippet=snippet,
                                         link_position=rank))
        # Catch further errors besides parsing errors that take shape as IndexErrors
        except Exception as err:
            logger.error('Error in parsing result links with selector={}: {}'.format(
                container_selector, err))

        return links or []
g = requests.get('http://www.cetip.com.br')

# #ctl00_Banner_lblTaxDI
# //*[@id="ctl00_Banner_lblTaxDI"]
tree = etree.HTML(g.text)
res = etree.tostring(tree, pretty_print=True, method="html")
res = tree.xpath('//*[@id="ctl00_Banner_lblTaxDI"]')
print(res[0].text)

# body > div:nth-child(1) > div:nth-child(17) > table > tbody > tr > td > div > table
# /html/body/div[1]/div[6]/table/tbody/tr/td/div/table
g = requests.get('http://www.portalbrasil.net/ipca.htm')
tree = etree.HTML(g.text)
res = etree.tostring(tree, pretty_print=True, method="html")

xpath = HTMLTranslator().css_to_xpath('table:nth-last-child(1)')
print(xpath)
res = tree.xpath(xpath)
print(etree.tostring(res[0], pretty_print=True, method="html"))
# print(res)

g = requests.get('http://www2.bmf.com.br/pages/portal/bmfbovespa/boletim1/TxRef1.asp')
tree = etree.HTML(g.text)
res = etree.tostring(tree, pretty_print=True, method="html")

# tit <- xpathSApply(doc, "//td[contains(@class, 'tabelaTitulo')]", xmlValue)
# tit <- str_replace_all(tit, '\\s+', ' ')
# bases <- xpathSApply(doc, "//td[contains(@class, 'tabelaItem')]", xmlValue)
# bases <- str_replace_all(bases, '\\s+', '')
# bases <- str_replace_all(bases, '^(\\d+)[^\\d].*', '\\1')
# bases <- as.numeric(bases)
import vim
from urllib import parse as uparse
from lxml import html, etree
from cssselect import HTMLTranslator
import sys

htmltrans = HTMLTranslator()
old_search_pattern = vim.eval('@/')
base_url = "http://etymonline.com/index.php?term={}"
etymologynr = int(vim.eval("bufwinnr('^etymology$')"))
word_to_look_up = sys.argv[0]
term_start = "{} {{{{{{"
term_end = "}}}"

if etymologynr > -1:
    vim.command('{}wincmd w'.format(etymologynr))
else:
    vim.command('silent keepalt belowright split etymology')

vim.command('setlocal noswapfile nobuflisted nospell nowrap modifiable')
vim.command('setlocal buftype=nofile bufhidden=hide')
vim.command('setlocal foldmethod=marker textwidth=80 wrapmargin=0')

term_xpath = etree.XPath(htmltrans.css_to_xpath('dt'))
linkfixes = etree.XPath(htmltrans.css_to_xpath("a.crossreference"))
foreignfixes = etree.XPath(htmltrans.css_to_xpath("span.foreign"))

definitions = html.parse(base_url.format(uparse.quote_plus(word_to_look_up)))

lines = []
for foreignfix in foreignfixes(definitions):
def cssToXpath(css_selector, translator=None):
    if not translator:
        translator = HTMLTranslator()
    return translator.css_to_xpath(css_selector)
def css_to_xpath(css):
    return HTMLTranslator().css_to_xpath(css) if len(css) > 0 else ""
def CSSSelect(expr):
    from cssselect import HTMLTranslator
    from lxml.etree import XPath
    return XPath(HTMLTranslator().css_to_xpath(expr))
def extract(self, rules, strict=False):
    parselet = Parselet(rules, strict=strict)
    return parselet.extract(self)


def cssselect(self, expr):
    return self._css_translator.css_to_xpath(expr)


def css(self, expr):
    return self.xpath(self.cssselect(expr))


html.HtmlElement.extract_text = extract_text
html.HtmlElement._css_translator = HTMLTranslator()
html.HtmlElement.cssselect = cssselect
html.HtmlElement.css = css
html.HtmlElement.extract = extract
html.HtmlElement.extract_urls = extract_urls


def HtmlParser(response):
    """
    :param response:
    :type response: :class:`dragline.http.Response`

    This function takes a response object as its argument and returns an lxml
    object of type HtmlElement, extended with the convenience methods attached
    above.
    """
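
# Illustrative use of the monkey-patched css() helper above (it assumes the
# patching assignments have already run); the markup is invented.
from lxml import html

doc = html.fromstring('<ul><li class="item">one</li><li class="item">two</li></ul>')
print([li.text for li in doc.css('li.item')])  # ['one', 'two']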
# Searching on class names with a dash ('-')
from cssselect import HTMLTranslator

result = lxml_document.xpath(HTMLTranslator().css_to_xpath('div.reddit-entry'))
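
# A self-contained version of the dash-in-class-name lookup above; the markup
# is invented for illustration.
import lxml.html
from cssselect import HTMLTranslator

lxml_document = lxml.html.fromstring('<div class="reddit-entry">post</div>')
result = lxml_document.xpath(HTMLTranslator().css_to_xpath('div.reddit-entry'))
print(result[0].text)  # post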
def _parse(self):
    """Parse the dom according to the provided css selectors.

    Raises:
        InvalidSearchTypeException if no css selectors for the searchtype could be found.
    """
    # try to parse the number of results.
    attr_name = self.searchtype + '_search_selectors'
    selector_dict = getattr(self, attr_name, None)

    # short alias because we use it so extensively
    css_to_xpath = HTMLTranslator().css_to_xpath

    # get the appropriate css selectors for the num_results for the keyword
    num_results_selector = getattr(self, 'num_results_search_selectors', None)
    if num_results_selector:
        self.search_results['num_results'] = self.dom.xpath(
            css_to_xpath(num_results_selector))[0].text_content()

    if not selector_dict:
        raise InvalidSearchTypeException(
            'There is no such attribute: {}. No selectors found'.format(attr_name))

    for result_type, selectors in selector_dict.items():
        self.search_results[result_type] = []

        results = self.dom.xpath(
            css_to_xpath('{container} {result_container}'.format(**selectors)))

        to_extract = set(selectors.keys()) - {'container', 'result_container'}
        selectors_to_use = {key: selectors[key] for key in to_extract if key in selectors.keys()}

        for index, result in enumerate(results):
            # Let's add primitive support for CSS3 pseudo selectors.
            # We just need two of them:
            # ::text
            # ::attr(someattribute)
            # You say we should use xpath expressions instead?
            # Maybe you're right, but they are complicated when it comes to classes,
            # have a look here: http://doc.scrapy.org/en/latest/topics/selectors.html
            serp_result = {}
            for key, selector in selectors_to_use.items():
                value = None
                if selector.endswith('::text'):
                    try:
                        value = result.xpath(
                            css_to_xpath(selector.split('::')[0]))[0].text_content()
                    except IndexError:
                        pass
                else:
                    attr = re.search(r'::attr\((?P<attr>.*)\)$', selector)
                    if attr:
                        try:
                            value = result.xpath(
                                css_to_xpath(selector.split('::')[0]))[0].get(attr.group('attr'))
                        except IndexError:
                            pass
                    else:
                        try:
                            value = result.xpath(css_to_xpath(selector))[0].text_content()
                        except IndexError:
                            pass

                serp_result[key] = value

            if serp_result:
                self.search_results[result_type].append(serp_result)
def _parse(self):
    """Internal parse of the dom according to the provided css selectors.

    Raises:
        InvalidSearchTypeException if no css selectors for the searchtype could be found.
    """
    # Try to parse the provided HTML string using lxml.
    # Strip all unnecessary information to save space.
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.javascript = True
    cleaner.style = True

    try:
        parser = lxml.html.HTMLParser(encoding='utf-8')
        self.dom = lxml.html.document_fromstring(self.html, parser=parser)
        self.dom = cleaner.clean_html(self.dom)
        self.dom.resolve_base_href()
    except Exception as e:
        # maybe wrong encoding
        logger.error(e)

    # try to parse the number of results.
    attr_name = self.searchtype + '_search_selectors'
    selector_dict = getattr(self, attr_name, None)

    # short alias because we use it so extensively
    css_to_xpath = HTMLTranslator().css_to_xpath

    # get the appropriate css selectors for the num_results for the keyword
    num_results_selector = getattr(self, 'num_results_search_selectors', None)
    self.search_results['num_results'] = ''

    if isinstance(num_results_selector, list) and num_results_selector:
        for selector in num_results_selector:
            try:
                self.search_results['num_results'] = self.dom.xpath(
                    css_to_xpath(selector))[0].text_content()
            except IndexError:
                logger.warning('Cannot parse num_results from serp page with selector {}'
                               .format(selector))
            else:
                # leave when the first selector grabbed something
                break

    if not selector_dict and not isinstance(selector_dict, dict):
        raise InvalidSearchTypeException(
            'There is no such attribute: {}. No selectors found'.format(attr_name))

    for result_type, selector_class in selector_dict.items():
        self.search_results[result_type] = []

        for selector_specific, selectors in selector_class.items():
            results = self.dom.xpath(
                css_to_xpath('{container} {result_container}'.format(**selectors)))

            to_extract = set(selectors.keys()) - {'container', 'result_container'}
            selectors_to_use = {key: selectors[key] for key in to_extract if key in selectors.keys()}

            for index, result in enumerate(results):
                # Let's add primitive support for CSS3 pseudo selectors.
                # We just need two of them:
                # ::text
                # ::attr(attribute)
                # You say we should use xpath expressions instead?
                # Maybe you're right, but they are complicated when it comes to classes,
                # have a look here: http://doc.scrapy.org/en/latest/topics/selectors.html
                serp_result = {}
                # keys are for example 'link', 'snippet', ...
                # selector is the selector used to grab these items
                for key, selector in selectors_to_use.items():
                    value = None
                    if selector.endswith('::text'):
                        try:
                            value = result.xpath(
                                css_to_xpath(selector.split('::')[0]))[0].text_content()
                        except IndexError:
                            pass
                    else:
                        attr = re.search(r'::attr\((?P<attr>.*)\)$', selector)
                        if attr:
                            try:
                                value = result.xpath(
                                    css_to_xpath(selector.split('::')[0]))[0].get(attr.group('attr'))
                            except IndexError:
                                pass
                        else:
                            try:
                                value = result.xpath(css_to_xpath(selector))[0].text_content()
                            except IndexError:
                                pass

                    serp_result[key] = value

                # Only add items that have non-None links.
                # Avoid duplicates. Detect them by the link.
                # If statement below: lazy evaluation, the more probable case first.
                if 'link' in serp_result and serp_result['link'] and \
                        not [e for e in self.search_results[result_type]
                             if e['link'] == serp_result['link']]:
                    self.search_results[result_type].append(serp_result)
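
# A standalone sketch of the ::text / ::attr() pseudo-selector convention used
# by the _parse() methods above. select_value() and the markup are illustrative
# names for this sketch, not part of the original parsers.
import re
from cssselect import HTMLTranslator
from lxml import html

css_to_xpath = HTMLTranslator().css_to_xpath


def select_value(element, selector):
    """Resolve a CSS selector that may end in ::text or ::attr(name)."""
    base, _, pseudo = selector.partition('::')
    nodes = element.xpath(css_to_xpath(base)) if base else [element]
    if not nodes:
        return None
    if pseudo == 'text' or not pseudo:
        return nodes[0].text_content()
    attr = re.match(r'attr\((?P<attr>.+)\)', pseudo)
    return nodes[0].get(attr.group('attr')) if attr else None


doc = html.fromstring('<div class="r"><a href="/hit">Title</a></div>')
print(select_value(doc, 'div.r a::attr(href)'))  # /hit
print(select_value(doc, 'div.r a::text'))        # Title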
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

__author__ = 'ipetrash'

# pip install cssselect
from cssselect import HTMLTranslator

css_to_xpath = HTMLTranslator(xhtml=True).css_to_xpath

if __name__ == '__main__':
    xpath_expr = css_to_xpath('div#main > a[href]')
    print(xpath_expr)  # descendant-or-self::div[@id = 'main']/a[@href]

    xpath_expr = css_to_xpath('div')
    print(xpath_expr)  # descendant-or-self::div

    xpath_expr = css_to_xpath('table:nth-last-child(1)')
    print(xpath_expr)  # descendant-or-self::table[count(following-sibling::*) = 0]

    print()

    for item in ('#title', '#head', '#heading', '.pageTitle', '.news_title',
                 '.title', '.head', '.heading', '.contentheading',
                 '.small_header_red'):
        xpath_expr = css_to_xpath(item)
        print(xpath_expr)
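
# A small follow-on sketch: evaluating one of the translated expressions above
# against an lxml tree. The markup is invented for illustration.
from lxml import html
from cssselect import HTMLTranslator

tree = html.fromstring('<div id="main"><a href="/a">a</a><a>no href</a></div>')
expr = HTMLTranslator().css_to_xpath('div#main > a[href]')
print([a.get('href') for a in tree.xpath(expr)])  # ['/a']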
def CSSSelect(expr):
    from cssselect import HTMLTranslator
    return HTMLTranslator().css_to_xpath(expr)
import logging
from functools import lru_cache

from cssselect import HTMLTranslator, SelectorError


class SelectorException(RuntimeError):
    def __init__(self, selector):
        self.selector = selector


translator = HTMLTranslator()


@lru_cache()
def xpath(pattern):
    return translator.css_to_xpath(pattern)


logger = logging.getLogger("selector")


class Selector(object):
    def __init__(self, document):
        self.document = document
        self.translator = HTMLTranslator()

    def find(self, pattern):
        expression = xpath(pattern)
        results = [Selector(d) for d in self.document.xpath(expression)]
        if len(results) == 0:
            logger.warning("Selector {0} found 0 results".format(pattern))
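
# Hedged usage sketch for the Selector wrapper above. The snippet's find() is
# truncated; this assumes it goes on to return the results list. The markup is
# invented.
import lxml.html

doc = lxml.html.fromstring('<ul><li class="hit">a</li><li>b</li></ul>')
for match in Selector(doc).find('li.hit'):
    print(match.document.text)  # a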
def scrap_value(doc, selector):
    xpath_selector = HTMLTranslator().css_to_xpath(selector)
    elems = doc.xpath(xpath_selector)
    return elems
def __init__(self, document):
    self.document = document
    self.translator = HTMLTranslator()
import re
from dataclasses import asdict, dataclass
from functools import lru_cache
from typing import Callable, Iterable, Iterator, List, Optional, Tuple, TypeVar
from urllib.parse import urljoin

import arrow
import attr
import requests
from cssselect import HTMLTranslator
from lxml.html import fromstring
from requests import Session
from robobrowser import RoboBrowser
from tqdm import tqdm

_ctx = lru_cache()(HTMLTranslator().css_to_xpath)
parser = arrow.parser.DateTimeParser('en_au', 100)

T = TypeVar('T')


def ctx(el, selector):
    return el.xpath(_ctx(selector))


def parse(string, fmt=None):
    if fmt is None:
        t = parser.parse_iso(string)
    else:
        t = parser.parse(string, fmt)
    return arrow.Arrow.fromdatetime(t)
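
# Possible usage of the cached ctx() helper and parse() above; the markup and
# the ISO date are invented for illustration.
from lxml.html import fromstring

row = fromstring('<div class="row"><span class="date">2015-03-01</span></div>')
(date_el,) = ctx(row, 'span.date')
print(parse(date_el.text))  # an Arrow instance for 2015-03-01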
def process_query(self, query):
    xpath_query = HTMLTranslator().css_to_xpath(query)
    return super(CssSelector, self).process_query(xpath_query)
def _search(self):
    """The actual search and parsing of the results.

    Private, internal method.
    Parsing is done with lxml and cssselect. The html structure of the Google
    Search results may change over time. Effective: February 2014
    """
    self._build_query()

    if DO_CACHING:
        html = get_cached(self._SEARCH_PARAMS)
        self.SEARCH_RESULTS['cache_file'] = os.path.join(
            CACHEDIR, cached_file_name(self._SEARCH_PARAMS))
    else:
        html = False

    if not html:
        try:
            r = requests.get(self._SEARCH_URL, headers=self._HEADERS,
                             params=self._SEARCH_PARAMS, timeout=3.0)
            logger.debug("Scraped with url: {}".format(r.url))
        except requests.ConnectionError as cerr:
            print('Network problem occurred {}'.format(cerr))
            return False
        except requests.Timeout as terr:
            print('Connection timeout {}'.format(terr))
            return False

        if not r.ok:
            print('HTTP Error:', r.status_code)
            if str(r.status_code)[0] == '5':
                print('Maybe google recognizes you as sneaky spammer after'
                      ' you requested their services too inexhaustibly :D')
            return False

        html = r.text

        # cache fresh results
        if DO_CACHING:
            cache_results(self._SEARCH_PARAMS, html)
            self.SEARCH_RESULTS['cache_file'] = os.path.join(
                CACHEDIR, cached_file_name(self._SEARCH_PARAMS))

    # Try to parse the google HTML result using lxml
    try:
        doc = UnicodeDammit(html, is_html=True)
        parser = lxml.html.HTMLParser(encoding=doc.declared_html_encoding)
        dom = lxml.html.document_fromstring(html, parser=parser)
        dom.resolve_base_href()
    except Exception as e:
        print('Some error occurred while lxml tried to parse: {}'.format(e))
        return False

    # Try to extract all links of non-ad results, including their snippets (descriptions) and titles.
    links = []
    try:
        li_g_results = dom.xpath(HTMLTranslator().css_to_xpath('li.g'))
        for e in li_g_results:
            try:
                link_element = e.xpath(HTMLTranslator().css_to_xpath('h3.r > a:first-child'))
                link = link_element[0].get('href')
                title = link_element[0].text_content()
            except IndexError as err:
                logger.error('Error while parsing link/title element: {}'.format(err))
                continue
            try:
                snippet_element = e.xpath(HTMLTranslator().css_to_xpath('div.s > span.st'))
                snippet = snippet_element[0].text_content()
            except IndexError as err:
                logger.error('Error while parsing snippet element: {}'.format(err))
                continue

            links.append(self.Result(link_title=title, link_url=link, link_snippet=snippet))
    # Catch further errors besides parsing errors that take shape as IndexErrors
    except Exception as err:
        logger.error('Error in parsing result links: {}'.format(err))

    self.SEARCH_RESULTS['results'].extend(links)

    # try to get the number of results for our search query
    try:
        self.SEARCH_RESULTS['num_results_for_kw'] = \
            dom.xpath(HTMLTranslator().css_to_xpath('div#resultStats'))[0].text_content()
    except Exception as e:
        logger.critical(e)
import lxml.html
from cssselect import HTMLTranslator
import re

from scraper.models import FilmDict
from scraper.utils import (decode_html, unicode_normalize, clean_string,
                           string_to_list, correct_countries_list)
from cinema.utils import (titlecase, country_title)

html_translator = HTMLTranslator()
META_XPATH = html_translator.css_to_xpath('header.carousel-caption > h6')
ANCHOR_XPATH = html_translator.css_to_xpath('ul.thumbnails > li .thumbnail > a:nth-of-type(1)')
SYNOPSIS_GRAPHS_XPATH = "//div[@class='lead']/p"
DESCRIPTION_GRAPHS_XPATH = '//article/h4[2]/following-sibling::p'
DIRECTOR_REG = r'dir\.\s+([^\d]+)'
COUNTRIES_REG = r'(?:\,\s+(\w[\'\w\s]+)+)'


class HTMLScraper:
    """docstring for HTMLScraper"""

    def __init__(self, raw_html, source_url=None):
        super(HTMLScraper, self).__init__()
        self.source_url = source_url
        self.raw_html = raw_html
        self._tree = None

    @property
    def tree(self):
        if self._tree is None:
            self._tree = self.make_tree()
        return self._tree
class LinkExtractor(ABC):
    """
    The abstract class LinkExtractor defines the behavior that extracts links from
    a specific response while following specific rules. Each subclass must implement
    the function _process(), which represents the link-extraction logic. Each element
    in the extracted links must be a Link object from common_crawler.link.
    """

    _css_translator = HTMLTranslator()

    def __init__(self,
                 allow=(),
                 deny=(),
                 allow_domains=(),
                 deny_domains=(),
                 tags=('a', 'area'),
                 attrs=('href',),
                 canonicalize=False,
                 unique=True,
                 process_attr=None,
                 deny_extensions=None,
                 strip=True,
                 restrict_xpaths=(),
                 restrict_css=()):
        """
        :param allow: a regular expression tuple (or single value) that the URLs must
            match in order to be extracted.
        :param deny: a regular expression tuple (or single value); URLs that match
            will not be extracted.
        :param allow_domains: a tuple (or single value) of strings containing domains
            which will be considered for extracting the links.
        :param deny_domains: a tuple (or single value) of strings containing domains
            which won't be considered for extracting the links.
        :param tags: a tag list (or single value) to consider when extracting links.
        :param attrs: an attribute list (or single value) which should be considered
            when looking for links to extract, only for those tags specified in the
            tags param.
        :param canonicalize: canonicalize each extracted url
            (using w3lib.url.canonicalize_url).
        :param unique: whether duplicate filtering should be applied to extracted links.
        :param process_attr: a function which receives each value extracted from the
            tag and attributes scanned and can modify the value and return a new one.
        :param deny_extensions: an extension list (or single value) that should be
            ignored when extracting links.
        :param strip: whether to strip whitespace from extracted attributes, according
            to the HTML5 standard.
        :param restrict_xpaths: an XPath or list of XPaths which defines regions inside
            the response where links should be extracted from.
        :param restrict_css: a CSS selector or list of CSS selectors which defines
            regions inside the response where links should be extracted from.
        """
        self.unique = unique
        self.strip = strip
        self.allowed_rule = compile_regexes(arg_to_iter(allow))
        self.denied_rule = compile_regexes(arg_to_iter(deny))
        self.allow_domains = set(arg_to_iter(allow_domains))
        self.deny_domains = set(arg_to_iter(deny_domains))
        self.deny_extensions = deny_extensions or IGNORED_EXTENSIONS
        self.deny_extensions = {'.' + x for x in arg_to_iter(self.deny_extensions)}

        tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
        self.scan_tag_func, self.scan_attr_func = lambda x: x in tags, lambda x: x in attrs
        self.process_attr = process_attr if callable(process_attr) else lambda v: v

        self.canonicalize = canonicalize
        if canonicalize:
            self.link_key = lambda link: link.url
        else:
            self.link_key = lambda link: canonicalize_url(link.url, keep_fragments=True)

        self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
        self.restrict_xpaths += tuple(
            map(self._css_translator.css_to_xpath, arg_to_iter(restrict_css)))

    def _link_allowed(self, link):
        """Return true if the link meets the requirements of the rules."""
        if not is_valid_url(link.url):
            return False
        if self.allowed_rule and not matches(link.url, self.allowed_rule):
            return False
        if self.denied_rule and matches(link.url, self.denied_rule):
            return False
        parsed_url = parse_url(link.url)
        if self.allow_domains and not url_in_domains(parsed_url, self.allow_domains):
            return False
        if self.deny_domains and url_in_domains(parsed_url, self.deny_domains):
            return False
        if self.deny_extensions and url_has_extension(parsed_url, self.deny_extensions):
            return False
        return True

    def _get_response_text(self, response, func_name='text', encoding='utf-8'):
        """
        Return the text of the response by invoking the specified function;
        return the response itself if it is already a string.
        """
        if isinstance(response, str):
            return response
        if hasattr(response, func_name):
            text = getattr(response, func_name)
            text = text() if callable(text) else text
            return text.decode(encoding) if isinstance(text, bytes) else text
        raise ValueError(
            'The response must be str or have a function or attribute for getting the text')

    def _deduplicate(self, links):
        """Remove duplicate links."""
        if self.unique:
            return unique_list(list_=links, key=self.link_key)
        return links

    def extract_links(self, response, encoding='utf-8'):
        """
        Return extracted links from the specified response according to the rules;
        invokes the function _link_allowed() to filter out invalid links.
        """
        links = self._process(response, encoding)
        links = [x for x in links if self._link_allowed(x)]
        if self.canonicalize:
            for link in links:
                link.url = canonicalize_url(link.url)
        links = self._deduplicate(links)
        return links

    @abstractmethod
    def _process(self, response, encoding='utf-8'):
        """
        Specific link-extraction logic implemented by subclasses, which should extract
        links based on the params tags, attrs, process_attr and strip.
        """
        raise NotImplementedError
except ImportError:
    raise RuntimeError('You need cssutils >= 0.9.9 for calibre')

from cssutils import (profile as cssprofiles, parseString, parseStyle,
                      log as cssutils_log, CSSParser, profiles, replaceUrls)
from lxml import etree
from cssselect import HTMLTranslator
from calibre import force_unicode
from calibre.ebooks import unit_convert
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES
from calibre.ebooks.oeb.base import XPNSMAP, xpath, urlnormalize

cssutils_log.setLevel(logging.WARN)

_html_css_stylesheet = None
css_to_xpath = HTMLTranslator().css_to_xpath


def html_css_stylesheet():
    global _html_css_stylesheet
    if _html_css_stylesheet is None:
        html_css = open(os.path.join(os.path.dirname(__file__), 'html.css'), 'rb').read()
        _html_css_stylesheet = parseString(html_css, validate=False)
        _html_css_stylesheet.namespaces['h'] = XHTML_NS
    return _html_css_stylesheet


XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS

INHERITED = set([
def test_select(self):
    document = etree.fromstring(HTML_IDS)
    sort_key = dict(
        (el, count) for count, el in enumerate(document.getiterator())).__getitem__
    css_to_xpath = GenericTranslator().css_to_xpath
    html_css_to_xpath = HTMLTranslator().css_to_xpath

    def select_ids(selector, html_only):
        xpath = css_to_xpath(selector)
        items = document.xpath(xpath)
        if html_only:
            assert items == []
            xpath = html_css_to_xpath(selector)
            items = document.xpath(xpath)
        items.sort(key=sort_key)
        return [element.get('id', 'nil') for element in items]

    def pcss(main, *selectors, **kwargs):
        html_only = kwargs.pop('html_only', False)
        result = select_ids(main, html_only)
        for selector in selectors:
            assert select_ids(selector, html_only) == result
        return result

    all_ids = pcss('*')
    assert all_ids[:6] == ['html', 'nil', 'link-href', 'link-nohref', 'nil', 'outer-div']
    assert all_ids[-1:] == ['foobar-span']
    assert pcss('div') == ['outer-div', 'li-div', 'foobar-div']
    assert pcss('DIV', html_only=True) == [
        'outer-div', 'li-div', 'foobar-div']  # case-insensitive in HTML
    assert pcss('div div') == ['li-div']
    assert pcss('div, div div') == ['outer-div', 'li-div', 'foobar-div']
    assert pcss('a[name]') == ['name-anchor']
    assert pcss('a[NAme]', html_only=True) == ['name-anchor']  # case-insensitive in HTML:
    assert pcss('a[rel]') == ['tag-anchor', 'nofollow-anchor']
    assert pcss('a[rel="tag"]') == ['tag-anchor']
    assert pcss('a[href*="localhost"]') == ['tag-anchor']
    assert pcss('a[href*=""]') == []
    assert pcss('a[href^="http"]') == ['tag-anchor', 'nofollow-anchor']
    assert pcss('a[href^="http:"]') == ['tag-anchor']
    assert pcss('a[href^=""]') == []
    assert pcss('a[href$="org"]') == ['nofollow-anchor']
    assert pcss('a[href$=""]') == []
    assert pcss('div[foobar~="bc"]', 'div[foobar~="cde"]') == ['foobar-div']
    assert pcss('[foobar~="ab bc"]', '[foobar~=""]', '[foobar~=" \t"]') == []
    assert pcss('div[foobar~="cd"]') == []
    assert pcss('*[lang|="En"]', '[lang|="En-us"]') == ['second-li']
    # Attribute values are case sensitive
    assert pcss('*[lang|="en"]', '[lang|="en-US"]') == []
    assert pcss('*[lang|="e"]') == []
    # ... :lang() is not.
    assert pcss(':lang("EN")', '*:lang(en-US)', html_only=True) == ['second-li', 'li-div']
    assert pcss(':lang("e")', html_only=True) == []
    assert pcss('li:nth-child(3)') == ['third-li']
    assert pcss('li:nth-child(10)') == []
    assert pcss('li:nth-child(2n)', 'li:nth-child(even)',
                'li:nth-child(2n+0)') == ['second-li', 'fourth-li', 'sixth-li']
    assert pcss('li:nth-child(+2n+1)', 'li:nth-child(odd)') == [
        'first-li', 'third-li', 'fifth-li', 'seventh-li']
    assert pcss('li:nth-child(2n+4)') == ['fourth-li', 'sixth-li']
    # FIXME: I'm not 100% sure this is right:
    assert pcss('li:nth-child(3n+1)') == ['first-li', 'fourth-li', 'seventh-li']
    assert pcss('li:nth-last-child(0)') == ['seventh-li']
    assert pcss('li:nth-last-child(2n)', 'li:nth-last-child(even)') == [
        'second-li', 'fourth-li', 'sixth-li']
    assert pcss('li:nth-last-child(2n+2)') == ['second-li', 'fourth-li']
    assert pcss('ol:first-of-type') == ['first-ol']
    assert pcss('ol:nth-child(1)') == []
    assert pcss('ol:nth-of-type(2)') == ['second-ol']
    # FIXME: like above, (1) or (2)?
    assert pcss('ol:nth-last-of-type(1)') == ['first-ol']
    assert pcss('span:only-child') == ['foobar-span']
    assert pcss('li div:only-child') == ['li-div']
    assert pcss('div *:only-child') == ['li-div', 'foobar-span']
    self.assertRaises(ExpressionError, pcss, 'p *:only-of-type')
    assert pcss('p:only-of-type') == ['paragraph']
    assert pcss('a:empty', 'a:EMpty') == ['name-anchor']
    assert pcss('li:empty') == [
        'third-li', 'fourth-li', 'fifth-li', 'sixth-li', 'seventh-li']
    assert pcss(':root', 'html:root') == ['html']
    assert pcss('li:root', '* :root') == []
    assert pcss('*:contains("link")', ':CONtains("link")') == [
        'html', 'nil', 'outer-div', 'tag-anchor', 'nofollow-anchor']
    assert pcss('*:contains("LInk")') == []  # case sensitive
    assert pcss('*:contains("e")') == [
        'html', 'nil', 'outer-div', 'first-ol', 'first-li', 'paragraph', 'p-em']
    assert pcss('*:contains("E")') == []  # case-sensitive
    assert pcss('.a', '.b', '*.a', 'ol.a') == ['first-ol']
    assert pcss('.c', '*.c') == ['first-ol', 'third-li', 'fourth-li']
    assert pcss('ol *.c', 'ol li.c', 'li ~ li.c', 'ol > li.c') == ['third-li', 'fourth-li']
    assert pcss('#first-li', 'li#first-li', '*#first-li') == ['first-li']
    assert pcss('li div', 'li > div', 'div div') == ['li-div']
    assert pcss('div > div') == []
    assert pcss('div>.c', 'div > .c') == ['first-ol']
    assert pcss('div + div') == ['foobar-div']
    assert pcss('a ~ a') == ['tag-anchor', 'nofollow-anchor']
    assert pcss('a[rel="tag"] ~ a') == ['nofollow-anchor']
    assert pcss('ol#first-ol li:last-child') == ['seventh-li']
    assert pcss('ol#first-ol *:last-child') == ['li-div', 'seventh-li']
    assert pcss('#outer-div:first-child') == ['outer-div']
    assert pcss('#outer-div :first-child') == [
        'name-anchor', 'first-li', 'li-div', 'p-b',
        'checkbox-fieldset-disabled', 'area-href']
    assert pcss('a[href]') == ['tag-anchor', 'nofollow-anchor']
    assert pcss(':not(*)') == []
    assert pcss('a:not([href])') == ['name-anchor']
    assert pcss('ol :Not(li[class])') == [
        'first-li', 'second-li', 'li-div', 'fifth-li', 'sixth-li', 'seventh-li']
    # Invalid characters in XPath element names, should not crash
    assert pcss(r'di\a0 v', r'div\[') == []
    assert pcss(r'[h\a0 ref]', r'[h\]ref]') == []

    # HTML-specific
    assert pcss(':link', html_only=True) == [
        'link-href', 'tag-anchor', 'nofollow-anchor', 'area-href']
    assert pcss(':visited', html_only=True) == []
    assert pcss(':enabled', html_only=True) == [
        'link-href', 'tag-anchor', 'nofollow-anchor', 'checkbox-unchecked',
        'text-checked', 'checkbox-checked', 'area-href']
    assert pcss(':disabled', html_only=True) == [
        'checkbox-disabled', 'checkbox-disabled-checked', 'fieldset',
        'checkbox-fieldset-disabled']
    assert pcss(':checked', html_only=True) == [
        'checkbox-checked', 'checkbox-disabled-checked']
return "{expression}{element_name}".format( expression=expression, element_name=element_names[0]) elif len(element_names) > 1: element_names_xpath = " | ".join([ "self::{element_name}".format(element_name=element_name) for element_name in element_names ]) return "{expression}*[{element_names}]".format( expression=expression, element_names=element_names_xpath) else: return "{expression}*".format(expression=expression) def to_xpath(node, exact=False): """ Converts a given XPath :class:`Expression` into a corresponding string query. Args: node (Expression): An XPath :class:`Expression` to convert. exact (bool, optional): Whether the generated query should perform exact or approximate locator matches. Defaults to False. Returns: str: A valid XPath query corresponding to the given :class:`Expression`. """ return Renderer(exact=exact).render(node) _selector_to_xpath = partial(HTMLTranslator().selector_to_xpath, prefix=None)