示例#1
0
文件: url.py 项目: buzzworkers/tl
 def geturls(self):
     """Return the unique ``.html`` links of this page that contain ``self.root``.

     Calls ``self.fetch()`` first when ``self.html`` is empty. Every ``<a>``
     tag's ``href`` is stripped of its ``#fragment`` and kept only when it
     ends in ``.html``, contains no ``..`` and is not a ``mailto`` link.
     Links without an ``http://`` / ``https://`` prefix are made absolute:
     ``self.root`` is prepended to links starting with ``/``, and
     ``self.base + "/"`` to all others. Links that do not contain
     ``self.root`` after that are logged and skipped.

     Returns:
         list of str: the accepted URLs, in first-seen order, no duplicates.
     """
     if not self.html:
         self.fetch()
     from tl.imports import getBeautifulSoup
     soup = getBeautifulSoup()
     parsed = soup.BeautifulSoup(self.html)
     urls = []
     seen = set()  # O(1) duplicate check; `urls` keeps the discovery order
     for tag in parsed('a'):
         href = tag.get("href")
         if not href:
             continue
         href = href.split("#")[0]
         # An href that was only a fragment is now "" and fails this check too.
         if not href.endswith(".html"):
             continue
         if ".." in href or href.startswith("mailto"):
             continue
         # Test the scheme prefix rather than a substring: a relative link
         # such as "http-intro.html" must still be resolved against the page.
         if not href.startswith(("http://", "https://")):
             if href.startswith("/"):
                 href = self.root + href
             else:
                 href = self.base + "/" + href
         if self.root not in href:
             logging.warning("%s not in %s", self.root, href)
             continue
         if href not in seen:
             seen.add(href)
             urls.append(href)
     logging.warning("found %s urls", len(urls))
     return urls
示例#2
0
文件: spider.py 项目: buzzworkers/tl
## tl imports

from tl.utils.name import stripname
from tl.utils.exception import handle_exception
from tl.utils.urldata import UrlData
from tl.utils.generic import waitforqueue
from tl.utils.url import geturl2, striphtml, Url
from tl.lib.datadir import getdatadir
from tl.lib.persist import PersistCollection
from tl.lib.commands import cmnds
from tl.lib.examples import examples
from tl.lib.threadloop import ThreadLoop
from tl.lib.callbacks import callbacks
from tl.imports import getBeautifulSoup
soup = getBeautifulSoup()

## basic imports

from collections import deque 
import os
import logging
import re
import sys
import time
import math
import urllib.request, urllib.error, urllib.parse
import urllib.parse
import optparse
from cgi import escape
from traceback import format_exc