def __init__(self, max_working=20, common_gap=20,
             urlindex_file="", proxies_file=None,
             span_of_host=3, max_in_mem=100000,
             worker_conf_file='xworkers.conf',
             load_bad_url=None, logfile=''):
    self.proxypool = ProxyPool(common_gap, proxies_file)
    self.urlpool = UrlPool(urlindex_file,
                           load_bad_url=load_bad_url,
                           span_of_host=span_of_host,
                           max_in_mem=max_in_mem,
                           is_good_link=self.is_good_link)
    self.max_working = max_working
    self.worker_conf_file = worker_conf_file
    self._workers = 0
    # you can customize your HTTP headers in init_urlpool()
    self.headers = None
    self._http_exception_code = 900
    if logfile:
        self.logger = init_file_logger(logfile)
    else:
        self.logger = logging.getLogger('xcrawler')
    self.failed_urls = {}
    # queue for high-priority URLs to download;
    # start() takes URLs from this queue first
    self.urlqueue = gevent.queue.Queue()
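For context, a minimal sketch of how a constructor like this is typically used: subclass the crawler, override is_good_link() (which the constructor hands to UrlPool), and start crawling. The base-class name XCrawler and the NewsCrawler subclass are assumptions; only start() is referenced in the source comment above.

# usage sketch, assuming the surrounding class is named XCrawler
class NewsCrawler(XCrawler):
    def is_good_link(self, url):
        # skip static assets; keep everything else
        return not url.endswith(('.jpg', '.png', '.css', '.js'))

crawler = NewsCrawler(max_working=50, common_gap=10,
                      proxies_file='proxies.txt',
                      logfile='news.log')
crawler.start()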
def __init__(self, name):
    self.db = Connection(config.db_host, config.db_db,
                         config.db_user, config.db_password)
    self.logger = fn.init_file_logger(name + '.log')
    self.urlpool = UrlPool(name)
    self.hub_hosts = None
    self.load_hubs()
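The constructor above defers hub loading to load_hubs(). A plausible sketch of that method, assuming the Connection object exposes a torndb-style query() and that hub URLs live in a table named crawler_hub (both assumptions, not from the source); set_hubs() matches the UrlPool call seen in bee_server.py below.

import urllib.parse

def load_hubs(self):
    # hypothetical table/column names; adjust to your schema
    rows = self.db.query('SELECT url FROM crawler_hub')
    self.hub_hosts = set()
    hubs = []
    for row in rows:
        self.hub_hosts.add(urllib.parse.urlparse(row['url']).netloc)
        hubs.append(row['url'])
    # refresh hub pages every 300 seconds
    self.urlpool.set_hubs(hubs, 300)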
def __init__(self, name):
    self._workers = 0
    self._workers_max = 5
    self.logger = fn.init_file_logger(name + '.log')
    self.urlpool = UrlPool(name)
    self.loop = asyncio.get_event_loop()
    self.session = aiohttp.ClientSession(loop=self.loop)
    self.db = motor.motor_asyncio.AsyncIOMotorClient(
        config.MONGO_URI)["haodaifu"]
def __init__(self):
    self._workers = 0
    self._workers_max = 5
    self.urlpool = UrlPool()
    self.loop = asyncio.get_event_loop()
    self.session = aiohttp.ClientSession(loop=self.loop)
    self.db = motor.motor_asyncio.AsyncIOMotorClient(
        config.MONGO_URI)["cninfo"]
def __init__(self, name):
    self._workers = 0
    self._workers_max = 30
    self.logger = fn.init_file_logger(name + '.log')
    self.urlpool = UrlPool(name)
    self.loop = asyncio.get_event_loop()
    self.session = aiohttp.ClientSession(loop=self.loop)
    self.db = sanicdb.SanicDB(config.db_host, config.db_db,
                              config.db_user, config.db_password,
                              loop=self.loop)
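The three asyncio-based constructors above share the same shape: one shared aiohttp ClientSession, a _workers counter, and a _workers_max cap. A sketch of how those counters typically gate concurrency follows; the method names and the UrlPool.pop() signature are assumptions, not from the source. Note also that recent aiohttp releases deprecate the loop= argument and warn when a ClientSession is created outside a running coroutine, so newer code creates the session inside an async function.

# methods on the crawler class; a sketch, not the project's code
async def process(self, url):
    # one in-flight download; the counter caps concurrency
    self._workers += 1
    try:
        async with self.session.get(url) as resp:
            html = await resp.text()
            # ... parse html and store results via self.db ...
    finally:
        self._workers -= 1

async def loop_crawl(self):
    while True:
        if self._workers >= self._workers_max:
            await asyncio.sleep(0.1)
            continue
        # assumes UrlPool.pop(n) yields up to n URLs
        for url in self.urlpool.pop(self._workers_max - self._workers):
            asyncio.ensure_future(self.process(url))
        await asyncio.sleep(0.1)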
def __init__(self, max_working=20, common_gap=20,
             urlindex_file="", proxies_file=None,
             span_of_host=3,
             worker_conf_file='xworkers.conf',
             load_bad_url=None, logfile=''):
    self.proxypool = ProxyPool(common_gap, proxies_file)
    self.urlpool = UrlPool(urlindex_file,
                           load_bad_url=load_bad_url,
                           span_of_host=span_of_host,
                           is_good_link=self.is_good_link)
    self.max_working = max_working
    self.worker_conf_file = worker_conf_file
    self._workers = 0
    # you can customize your HTTP headers in init_urlpool()
    self.headers = None
    self._http_exception_code = 900
    if logfile:
        self.logger = init_file_logger(logfile)
    else:
        self.logger = logging.getLogger('xcrawler')
#!/usr/bin/env python3
# encoding: utf8
# author: veelion
# file: bee_server.py

from sanic import Sanic
from sanic import response

from urlpool import UrlPool

# initialize the UrlPool; adjust the hub URLs to your needs
urlpool = UrlPool(__file__)
hub_urls = []
urlpool.set_hubs(hub_urls, 300)
urlpool.add('https://news.sina.com.cn/')

# init
app = Sanic(__name__)


@app.listener('after_server_stop')
async def cache_urlpool(app, loop):
    global urlpool
    print('caching urlpool after_server_stop')
    del urlpool
    print('bye!')


@app.route('/task')
async def task_get(request):
    # handler body truncated in the source; a minimal sketch:
    # hand out up to `count` URLs, assuming UrlPool.pop(count)
    count = request.args.get('count', 10)
    try:
        count = int(count)
    except ValueError:
        count = 10
    urls = urlpool.pop(count)
    return response.json(urls)
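With the server started in the usual Sanic way (e.g. app.run(host='0.0.0.0', port=8080), the port being an assumption here), crawler clients poll /task for batches of URLs. A hypothetical client-side check, using the count parameter from the handler sketch above:

import requests

# fetch a batch of up to 5 URLs to crawl from bee_server
resp = requests.get('http://127.0.0.1:8080/task', params={'count': 5})
print(resp.json())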