Python getBeautifulSoup示例

编程语言: Python

命名空间/包名称: tl.imports

方法/功能: getBeautifulSoup

hotexamples.com的示例: 2

Python getBeautifulSoup - 共找到 2 个示例。以下是从开源项目中提取的、评分最高的 tl.imports.getBeautifulSoup 真实 Python 代码示例。您可以为这些示例评分，以帮助我们提高示例质量。

示例#1

显示文件

文件： url.py 项目： buzzworkers/tl

 def geturls(self):
     """Return the unique ``.html`` links found in ``self.html``.

     If ``self.html`` is empty, ``self.fetch()`` is called first. Every
     ``<a>`` tag is inspected; its ``href`` has any ``#fragment`` removed
     and is skipped when it is empty, does not end in ``.html``, contains
     ``..`` or starts with ``mailto``. Hrefs without ``"http"`` in them are
     prefixed with ``self.root`` (leading ``/``) or ``self.base + "/"``.
     Links that do not contain ``self.root`` are logged and dropped.

     Returns:
         A list of URLs in first-seen order, without duplicates.
     """
     if not self.html:
         self.fetch()
     # Imported locally, as in the original, so the parser is only looked
     # up when links are actually extracted.
     from tl.imports import getBeautifulSoup
     soup = getBeautifulSoup()
     parsed = soup.BeautifulSoup(self.html)
     urls = []
     seen = set()  # O(1) duplicate check; ``urls`` keeps the order
     for tag in parsed('a'):
         href = tag.get("href")
         if not href:
             continue
         href = href.split("#")[0]  # drop the fragment part
         if not href or not href.endswith(".html"):
             continue
         if ".." in href or href.startswith("mailto"):
             continue
         # NOTE(review): this is a substring test, so a relative href that
         # merely contains "http" is treated as absolute -- kept as-is to
         # preserve existing behaviour.
         if "http" not in href:
             if href.startswith("/"):
                 href = self.root + href
             else:
                 href = self.base + "/" + href
         if self.root not in href:
             # logging.warn was a deprecated alias and is gone in
             # Python 3.13; logging.warning emits the same message.
             logging.warning("%s not in %s", self.root, href)
             continue
         if href not in seen:
             seen.add(href)
             urls.append(href)
     logging.warning("found %s urls", len(urls))
     return urls

示例#2

显示文件

文件： spider.py 项目： buzzworkers/tl

## tl imports

from tl.utils.name import stripname
from tl.utils.exception import handle_exception
from tl.utils.urldata import UrlData
from tl.utils.generic import waitforqueue
from tl.utils.url import geturl2, striphtml, Url
from tl.lib.datadir import getdatadir
from tl.lib.persist import PersistCollection
from tl.lib.commands import cmnds
from tl.lib.examples import examples
from tl.lib.threadloop import ThreadLoop
from tl.lib.callbacks import callbacks
from tl.imports import getBeautifulSoup
soup = getBeautifulSoup()

## basic imports

from collections import deque 
import os
import logging
import re
import sys
import time
import math
import urllib.request, urllib.error, urllib.parse
import urllib.parse
import optparse
from cgi import escape
from traceback import format_exc