def _parseSummary((data, headers)): """ Extract the plot summary. """ tree = parseHTML(data) for p in tree.findall('//p'): if p.get('class') == 'plotpar': return p.text.strip()
def test_parseHTML(self):
    """
    L{eridanusstd.util.parseHTML} uses the newer html5lib API, when it is
    available, to parse HTML content into an LXML element tree.
    """
    if getattr(html5lib, 'parse', None) is None:
        raise SkipTest('html5lib is too old')
    source = self.path.sibling('index.html').open()
    result = util.parseHTML(source)
    expectedType = type(etree.ElementTree())
    self.assertIdentical(type(result), expectedType)
def _parsePoster((data, headers)): """ Extract the URL for the poster image. """ tree = parseHTML(data) for table in tree.findall('//table'): if table.get('id') == 'principal': img = table.find('.//img') if img is not None: return IMDB_URL.click(img.get('src')) return None
def qdbUS(quoteID): url = QDB_US_URL.child(quoteID) def extractQuote(tree): quote = tree.find('//form/table/tbody') header = unicode(''.join(quote.find('tr/td').itertext())).strip() text = unicode(''.join(quote.find('tr/td/p').itertext())).strip() yield u'%s -- %s' % (header, url) for line in text.splitlines(): yield line return util.PerseverantDownloader(url).go().addCallback( lambda (data, headers): parseHTML(data)).addErrback( handleBadQuoteID, quoteID).addCallback(extractQuote)
def qdbUS(quoteID):
    """
    Fetch a quote from qdb.us by its identifier.

    @return: A Deferred firing with an iterable of unicode lines: a header
        line naming the quote and its URL, followed by the quote text
        split into individual lines.
    """
    url = QDB_US_URL.child(quoteID)
    def extractQuote(tree):
        # The quote markup lives inside the page's first form table body.
        quote = tree.find('//form/table/tbody')
        header = unicode(''.join(quote.find('tr/td').itertext())).strip()
        text = unicode(''.join(quote.find('tr/td/p').itertext())).strip()
        yield u'%s -- %s' % (header, url)
        for line in text.splitlines():
            yield line
    # Download the page, parse it, translate bad-ID failures, then
    # extract the quote lines from the parsed tree.
    return util.PerseverantDownloader(url).go(
        ).addCallback(lambda (data, headers): parseHTML(data)
        ).addErrback(handleBadQuoteID, quoteID
        ).addCallback(extractQuote)
def bash(quoteID): url = BASH_URL.add(quoteID) def extractQuote(tree): header = (t for t in tree.find('//p[@class="quote"]').itertext() if t not in ('+', '-', '[X]')) header = unicode(''.join(header), 'ascii').strip() text = unicode(''.join( tree.find('//p[@class="qt"]').itertext())).strip() yield u'%s -- %s' % (header, url) for line in text.splitlines(): yield line return util.PerseverantDownloader(url).go().addCallback( lambda (data, headers): parseHTML(data)).addErrback( handleBadQuoteID, quoteID).addCallback(extractQuote)
def bash(quoteID):
    """
    Fetch a quote from bash.org by its identifier.

    @return: A Deferred firing with an iterable of unicode lines: a header
        line naming the quote and its URL, followed by the quote text
        split into individual lines.
    """
    url = BASH_URL.add(quoteID)
    def extractQuote(tree):
        # Drop the voting widgets ('+', '-', '[X]') surrounding the header.
        header = (t for t in tree.find('//p[@class="quote"]').itertext()
                  if t not in ('+', '-', '[X]'))
        header = unicode(''.join(header), 'ascii').strip()
        text = unicode(
            ''.join(tree.find('//p[@class="qt"]').itertext())).strip()
        yield u'%s -- %s' % (header, url)
        for line in text.splitlines():
            yield line
    # Download the page, parse it, translate bad-ID failures, then
    # extract the quote lines from the parsed tree.
    return util.PerseverantDownloader(url).go(
        ).addCallback(lambda (data, headers): parseHTML(data)
        ).addErrback(handleBadQuoteID, quoteID
        ).addCallback(extractQuote)
def _parseSearchResults((data, headers)): """ Parse search result HTML into an iterable of C{(name, url, id)}. """ tree = parseHTML(data) # XXX: Maybe do something a little more less shot-in-the-darkish, like # finding the first `ol` after an `h1`. for li in tree.find('//ol').findall('li'): a = li.find('a') url = IMDB_URL.click(a.get('href')) name = unicode(a.text) # Skip video games, this should be part of the "I want movies, # I want TV series" criteria stuff. if not name.endswith(u'(VG)'): pathList = url.pathList() id = pathList[-1] or pathList[-2] yield name, url, id
def _extractTitle(data):
    """
    Extract the document title from HTML content.

    @param data: HTML document content; a false value skips parsing.

    @return: The whitespace-normalized title text (multiple C{<title>}
        elements are joined with C{u';'}), or C{None} if no title could
        be extracted or parsing failed.
    """
    def sanitizeTitle(title):
        # Collapse internal runs of whitespace to a single space.
        return _whitespace.sub(u' ', title.strip())

    if data:
        try:
            tree = parseHTML(data)
            results = tree.xpath(
                '//xhtml:title',
                namespaces={'xhtml': 'http://www.w3.org/1999/xhtml'})
            # NOTE(review): unicode(None) yields u'None', which survives
            # this filter for an empty <title> element — confirm whether
            # that is intended.
            results = filter(
                None, (sanitizeTitle(unicode(e.text)) for e in results))
            if results:
                return u';'.join(results)
        except Exception:
            # Was a bare "except:", which would also swallow SystemExit
            # and KeyboardInterrupt; catch only real errors and log them.
            log.msg('Extracting title failed:')
            log.err()
    return None
class Calculator(object):
    """
    Primitive screen-scraping interface to Google's calculator.
    """
    # Maps HTML tag names (namespace stripped) appearing inside a result
    # to the textual marker emitted before the tag's content, e.g.
    # exponents marked up with <sup> become "^".
    _resultFormatting = {'sup': u'^'}

    def _formatResult(self, elem):
        """
        Gracefully downgrade HTML markup in calculator results.

        @param elem: An LXML element containing the calculator result.

        @return: A list of non-empty text fragments that, joined together,
            form the plain-text rendering of the result.
        """
        def _format():
            yield elem.text
            for child in elem.iterchildren():
                # Strip any "{namespace}" prefix from the tag name.
                tag = child.tag.split('}')[-1]
                extra = self._resultFormatting.get(tag)
                if extra is not None:
                    yield extra
                yield child.text
                yield child.tail
        # Discard None / empty fragments.
        return filter(None, _format())

    def _extractResult(self, (data, headers), expn):
        """
        Extract the calculator result from a Google search.

        @param expn: The expression that was searched for, used when
            reporting a failed extraction.

        @rtype: C{unicode}
        @return: The plain-text calculator result.

        @raise errors.InvalidExpression: If the page contains no
            calculator result for C{expn}.
        """
        tree = parseHTML(data)
        results = tree.xpath(
            '//xhtml:h2[@class="r"]/xhtml:b',
            namespaces={'xhtml': 'http://www.w3.org/1999/xhtml'})
        if results:
            return u''.join(self._formatResult(results[0]))
        raise errors.InvalidExpression(expn)
_infoParsers = { u'director': _hyperlinkedText(u'director'), u'genre': _genre, u'release date': _releaseDate, } def _parseTitleInfo((data, headers), url): """ Parse an IMDB HTML document into structured information. The resulting dictionary contains keys that map roughly to the relevant IMDB fields of the same name. @rtype: Deferred firing with a C{dict} """ tree = parseHTML(data) info = {} info['title'] = tree.find('//h1').text.strip() # Scan all the `<div class="info">` tags for information that we know how # to parse. infoElems = (e for e in tree.findall('//div') if e.get('class') == 'info') for elem in infoElems: h5 = elem.find('h5') if h5 is None: continue infoName = h5.text if infoName is None: continue
u'director': _hyperlinkedText(u'director'), u'genre': _genre, u'release date': _releaseDate, } def _parseTitleInfo((data, headers), url): """ Parse an IMDB HTML document into structured information. The resulting dictionary contains keys that map roughly to the relevant IMDB fields of the same name. @rtype: Deferred firing with a C{dict} """ tree = parseHTML(data) info = {} info['title'] = tree.find('//h1').text.strip() # Scan all the `<div class="info">` tags for information that we know how # to parse. infoElems = (e for e in tree.findall('//div') if e.get('class') == 'info') for elem in infoElems: h5 = elem.find('h5') if h5 is None: continue infoName = h5.text if infoName is None: continue