Python parseHTML示例，eridanusstd.util.parseHTML Python示例

示例#1

0

显示文件

文件： imdb.py 项目： mithrandi/eridanus

def _parseSummary((data, headers)):
    """
    Extract the plot summary.
    """
    tree = parseHTML(data)

    for p in tree.findall('//p'):
        if p.get('class') == 'plotpar':
            return p.text.strip()

示例#2

0

显示文件

文件： imdb.py 项目： mithrandi/eridanus

def _parseSummary((data, headers)):
    """
    Extract the plot summary.
    """
    tree = parseHTML(data)

    for p in tree.findall('//p'):
        if p.get('class') == 'plotpar':
            return p.text.strip()

示例#3

0

显示文件

文件： test_util.py 项目： mithrandi/eridanus

    def test_parseHTML(self):
        """
        L{eridanusstd.util.parseHTML} will use the newer html5lib API if
        available and parse HTML content into an LXML element tree.

        The test is skipped when the installed html5lib has no C{parse}
        attribute.
        """
        if not hasattr(html5lib, 'parse'):
            raise SkipTest('html5lib is too old')

        # Close the fixture file deterministically instead of leaking the
        # handle until garbage collection.
        fd = self.path.sibling('index.html').open()
        try:
            tree = util.parseHTML(fd)
        finally:
            fd.close()
        self.assertIdentical(type(tree), type(etree.ElementTree()))

示例#4

0

显示文件

文件： imdb.py 项目： mithrandi/eridanus

def _parsePoster((data, headers)):
    """
    Extract the URL for the poster image.
    """
    tree = parseHTML(data)

    for table in tree.findall('//table'):
        if table.get('id') == 'principal':
            img = table.find('.//img')
            if img is not None:
                return IMDB_URL.click(img.get('src'))
            return None

示例#5

0

显示文件

文件： imdb.py 项目： mithrandi/eridanus

def _parsePoster((data, headers)):
    """
    Extract the URL for the poster image.
    """
    tree = parseHTML(data)

    for table in tree.findall('//table'):
        if table.get('id') == 'principal':
            img = table.find('.//img')
            if img is not None:
                return IMDB_URL.click(img.get('src'))
            return None

示例#6

0

显示文件

def qdbUS(quoteID):
    url = QDB_US_URL.child(quoteID)

    def extractQuote(tree):
        quote = tree.find('//form/table/tbody')
        header = unicode(''.join(quote.find('tr/td').itertext())).strip()
        text = unicode(''.join(quote.find('tr/td/p').itertext())).strip()

        yield u'%s -- %s' % (header, url)
        for line in text.splitlines():
            yield line

    return util.PerseverantDownloader(url).go().addCallback(
        lambda (data, headers): parseHTML(data)).addErrback(
            handleBadQuoteID, quoteID).addCallback(extractQuote)

示例#7

0

显示文件

文件： qdb.py 项目： mithrandi/eridanus

def qdbUS(quoteID):
    url = QDB_US_URL.child(quoteID)

    def extractQuote(tree):
        quote = tree.find('//form/table/tbody')
        header = unicode(''.join(quote.find('tr/td').itertext())).strip()
        text = unicode(''.join(quote.find('tr/td/p').itertext())).strip()

        yield u'%s -- %s' % (header, url)
        for line in text.splitlines():
            yield line

    return util.PerseverantDownloader(url).go(
        ).addCallback(lambda (data, headers): parseHTML(data)
        ).addErrback(handleBadQuoteID, quoteID
        ).addCallback(extractQuote)

示例#8

0

显示文件

def bash(quoteID):
    url = BASH_URL.add(quoteID)

    def extractQuote(tree):
        header = (t for t in tree.find('//p[@class="quote"]').itertext()
                  if t not in ('+', '-', '[X]'))
        header = unicode(''.join(header), 'ascii').strip()
        text = unicode(''.join(
            tree.find('//p[@class="qt"]').itertext())).strip()

        yield u'%s -- %s' % (header, url)
        for line in text.splitlines():
            yield line

    return util.PerseverantDownloader(url).go().addCallback(
        lambda (data, headers): parseHTML(data)).addErrback(
            handleBadQuoteID, quoteID).addCallback(extractQuote)

示例#9

0

显示文件

文件： qdb.py 项目： mithrandi/eridanus

def bash(quoteID):
    url = BASH_URL.add(quoteID)

    def extractQuote(tree):
        header = (t for t in tree.find('//p[@class="quote"]').itertext()
                  if t not in ('+', '-', '[X]'))
        header = unicode(''.join(header), 'ascii').strip()
        text = unicode(''.join(tree.find('//p[@class="qt"]').itertext())).strip()

        yield u'%s -- %s' % (header, url)
        for line in text.splitlines():
            yield line

    return util.PerseverantDownloader(url).go(
        ).addCallback(lambda (data, headers): parseHTML(data)
        ).addErrback(handleBadQuoteID, quoteID
        ).addCallback(extractQuote)

示例#10

0

显示文件

文件： imdb.py 项目： mithrandi/eridanus

def _parseSearchResults((data, headers)):
    """
    Parse search result HTML into an iterable of C{(name, url, id)}.
    """
    tree = parseHTML(data)

    # XXX: Maybe do something a little more less shot-in-the-darkish, like
    # finding the first `ol` after an `h1`.
    for li in tree.find('//ol').findall('li'):
        a = li.find('a')
        url = IMDB_URL.click(a.get('href'))
        name = unicode(a.text)
        # Skip video games, this should be part of the "I want movies,
        # I want TV series" criteria stuff.
        if not name.endswith(u'(VG)'):
            pathList = url.pathList()
            id = pathList[-1] or pathList[-2]
            yield name, url, id

示例#11

0

显示文件

文件： imdb.py 项目： mithrandi/eridanus

def _parseSearchResults((data, headers)):
    """
    Parse search result HTML into an iterable of C{(name, url, id)}.
    """
    tree = parseHTML(data)

    # XXX: Maybe do something a little more less shot-in-the-darkish, like
    # finding the first `ol` after an `h1`.
    for li in tree.find('//ol').findall('li'):
        a = li.find('a')
        url = IMDB_URL.click(a.get('href'))
        name = unicode(a.text)
        # Skip video games, this should be part of the "I want movies,
        # I want TV series" criteria stuff.
        if not name.endswith(u'(VG)'):
            pathList = url.pathList()
            id = pathList[-1] or pathList[-2]
            yield name, url, id

示例#12

0

显示文件

def _extractTitle(data):
    def sanitizeTitle(title):
        return _whitespace.sub(u' ', title.strip())

    if data:
        try:
            tree = parseHTML(data)
            results = tree.xpath(
                '//xhtml:title',
                namespaces={'xhtml': 'http://www.w3.org/1999/xhtml'})

            results = filter(
                None, (sanitizeTitle(unicode(e.text)) for e in results))
            if results:
                return u';'.join(results)
        except:
            log.msg('Extracting title failed:')
            log.err()

    return None

示例#13

0

显示文件

文件： google.py 项目： mithrandi/eridanus

class Calculator(object):
    """
    Primitive screen-scraping interface to Google's calculator.
    """
    _resultFormatting = {'sup': u'^'}

    def _formatResult(self, elem):
        """
        Gracefully downgrade HTML markup in calculator results.
        """
        def _format():
            yield elem.text
            for child in elem.iterchildren():
                tag = child.tag.split('}')[-1]
                extra = self._resultFormatting.get(tag)
                if extra is not None:
                    yield extra
                yield child.text
                yield child.tail

        return filter(None, _format())

    def _extractResult(self, (data, headers), expn):
        """
        Extract the calculator result from a Google search.

        @rtype:  C{(unicode, unicode)}
        @return: A pair of C{(expn, result)}.
        """
        tree = parseHTML(data)
        results = tree.xpath(
            '//xhtml:h2[@class="r"]/xhtml:b',
            namespaces={'xhtml': 'http://www.w3.org/1999/xhtml'})
        if results:
            return u''.join(self._formatResult(results[0]))
        raise errors.InvalidExpression(expn)

示例#14

0

显示文件

文件： imdb.py 项目： mithrandi/eridanus

# Parsers keyed by info-field name. Each value is a callable or the result
# of calling C{_hyperlinkedText}; none of them are defined in this excerpt.
# NOTE(review): presumably looked up by the heading text of each
# `<div class="info">` in _parseTitleInfo -- the lookup itself is not visible
# here, confirm against the full module.
_infoParsers = {
    u'director': _hyperlinkedText(u'director'),
    u'genre': _genre,
    u'release date': _releaseDate,
    }

def _parseTitleInfo((data, headers), url):
    """
    Parse an IMDB HTML document into structured information.

    The resulting dictionary contains keys that map roughly to the relevant
    IMDB fields of the same name.

    @param data: Page content, handed to C{parseHTML}.
    @param headers: Second element of the fetched pair; unused in the
        visible part of the function.
    @param url: Not referenced in the visible part of the function.

    @rtype: Deferred firing with a C{dict}
    """
    tree = parseHTML(data)

    info = {}
    # The title is the stripped text of the first `<h1>` in the document.
    info['title'] = tree.find('//h1').text.strip()

    # Scan all the `<div class="info">` tags for information that we know how
    # to parse.
    infoElems = (e for e in tree.findall('//div') if e.get('class') == 'info')
    for elem in infoElems:
        # Info blocks without an `<h5>` child cannot be identified.
        h5 = elem.find('h5')
        if h5 is None:
            continue

        # Likewise skip blocks whose `<h5>` carries no text.
        infoName = h5.text
        if infoName is None:
            continue
        # NOTE(review): this excerpt stops here; the code that uses infoName
        # and returns a result is not visible.

示例#15

0

显示文件

文件： imdb.py 项目： mithrandi/eridanus

    u'director': _hyperlinkedText(u'director'),
    u'genre': _genre,
    u'release date': _releaseDate,
}


def _parseTitleInfo((data, headers), url):
    """
    Parse an IMDB HTML document into structured information.

    The resulting dictionary contains keys that map roughly to the relevant
    IMDB fields of the same name.

    @rtype: Deferred firing with a C{dict}
    """
    tree = parseHTML(data)

    info = {}
    info['title'] = tree.find('//h1').text.strip()

    # Scan all the `<div class="info">` tags for information that we know how
    # to parse.
    infoElems = (e for e in tree.findall('//div') if e.get('class') == 'info')
    for elem in infoElems:
        h5 = elem.find('h5')
        if h5 is None:
            continue

        infoName = h5.text
        if infoName is None:
            continue