Example #1
import sys
import urllib2
import libxml2
from urlparse import urljoin, urldefrag


def webgrep(xpath, urls, is_recursive):
    # Grep-like tool: fetch each URL, evaluate the XPath expression
    # against the parsed HTML, and print the matches. With
    # is_recursive, also crawl links under the first URL.
    root_url = urls[0]
    # Prefix each match with its source URL whenever output can come
    # from more than one page.
    is_multiple_source = is_recursive or len(urls) > 1

    # Breadth-first queue of (url, referrer) pairs.
    queue = [(url, None) for url in urls]
    visited = set(urls)

    while queue:
        (url, referrer) = queue.pop(0)

        req = urllib2.Request(url)
        if referrer:
            req.add_header('Referer', referrer)
        try:
            f = urllib2.urlopen(req)
            content_type = f.info().gettype()
            # Only parse (X)HTML responses.
            if content_type not in ('text/html', 'application/xhtml+xml'):
                f.close()
                continue
            content = f.read()
            f.close()
        except urllib2.URLError as e:
            print >> sys.stderr, "%s: %s" % (url, e)
            continue

        try:
            # Parse leniently: recover from malformed markup, silence
            # errors and warnings, and never touch the network for
            # external resources such as DTDs.
            doc = libxml2.htmlReadMemory(
                content, len(content), url, None,
                libxml2.HTML_PARSE_RECOVER | libxml2.HTML_PARSE_NOERROR
                | libxml2.HTML_PARSE_NOWARNING | libxml2.HTML_PARSE_NONET)
            ctx = doc.xpathNewContext()
            for node in ctx.xpathEvalExpression(xpath):
                text = node.content.strip()
                # Resolve relative link attributes against the page URL.
                if node.type == 'attribute' and node.name in ('href', 'src'):
                    text = urljoin(url, text)
                if is_multiple_source:
                    print "%s:%s" % (url, text)
                else:
                    print text
            if is_recursive:
                # Enqueue unvisited links that stay under the root URL,
                # stripping fragments so each page is fetched only once.
                for node in ctx.xpathEvalExpression('//a/@href'):
                    (next_url, fragment) = urldefrag(
                        urljoin(url, node.content))
                    if next_url.startswith(root_url) \
                            and next_url not in visited:
                        queue.append((next_url, url))
                        visited.add(next_url)
            # libxml2 objects are not garbage-collected from Python;
            # free the context and document explicitly.
            ctx.xpathFreeContext()
            doc.freeDoc()
        except libxml2.treeError as e:
            print >> sys.stderr, "%s: %s" % (url, e)
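
A hypothetical invocation (not part of the original snippet), assuming the imports above and a reachable start URL; with is_recursive set, every match is prefixed with the page it came from:

if __name__ == '__main__':
    # Crawl pages under http://example.com/ and print each <title>.
    webgrep('//title/text()', ['http://example.com/'], True)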
Example #2
File: qq.py Project: zhuliting/code
import re
import libxml2


def get_html_doc(html):
  # Parse an HTML byte string into a libxml2 document, or return None
  # for empty input. get_charset() is a project helper defined
  # elsewhere in qq.py.
  if html is None:
    return None
  if len(html) == 0:
    return None
  # Replace control characters that libxml2 rejects (the C0 controls
  # other than tab, newline, and carriage return) with spaces.
  pattern = '[\\x00-\\x08\\x0b-\\x0c\\x0e-\\x1f]'
  html = re.sub(pattern, ' ', html)
  # Detect the document encoding, defaulting to UTF-8.
  encoding = get_charset(html)
  if encoding is None:
    encoding = 'utf-8'

  # Recover from broken markup, keep pedantic parsing on but silence
  # error and warning output, and forbid any network access.
  options = libxml2.HTML_PARSE_RECOVER | libxml2.HTML_PARSE_NOERROR \
          | libxml2.HTML_PARSE_PEDANTIC | libxml2.HTML_PARSE_NONET \
          | libxml2.HTML_PARSE_NOWARNING
  doc = libxml2.htmlReadMemory(html, len(html), None, encoding, options)
  return doc
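
get_charset() is defined elsewhere in the qq.py project and is not shown here. A minimal sketch of what such a helper might look like, assuming it only sniffs a charset= token from the markup (hypothetical, not the project's actual implementation):

import re

def get_charset(html):
  # Hypothetical stand-in: return the first charset=... value found in
  # the markup (e.g. from a <meta> tag), lowercased, or None if absent.
  m = re.search(r"charset=[\"']?([-\w]+)", html, re.IGNORECASE)
  if m:
    return m.group(1).lower()
  return None

Whatever get_html_doc() returns must eventually be freed by the caller, since libxml2 documents are not garbage-collected from Python. A short usage sketch:

doc = get_html_doc(raw_html)
if doc is not None:
  ctx = doc.xpathNewContext()
  for node in ctx.xpathEvalExpression('//title/text()'):
    print node.content
  ctx.xpathFreeContext()
  doc.freeDoc()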