Exemplo n.º 1
0
    def __init__(
            self,
            max_working=20,
            common_gap=20,
            urlindex_file="",
            proxies_file=None,
            span_of_host=3,
            max_in_mem=100000,
            worker_conf_file='xworkers.conf',
            load_bad_url=None,
            logfile='',
    ):
        """Build the proxy pool, URL pool, logger and bookkeeping state.

        Args:
            max_working: stored as ``self.max_working``.
            common_gap: forwarded to ``ProxyPool`` as its first argument.
            urlindex_file: forwarded to ``UrlPool`` as its first argument.
            proxies_file: forwarded to ``ProxyPool`` as its second argument.
            span_of_host: forwarded to ``UrlPool``.
            max_in_mem: forwarded to ``UrlPool``.
            worker_conf_file: stored as ``self.worker_conf_file``.
            load_bad_url: forwarded to ``UrlPool``.
            logfile: when truthy, a logger is created with
                ``init_file_logger(logfile)``; otherwise the ``'xcrawler'``
                logger from ``logging`` is used.
        """
        self.proxypool = ProxyPool(common_gap, proxies_file)

        # The URL pool receives this object's is_good_link method as a
        # callback alongside the plain configuration values.
        urlpool_options = dict(
            load_bad_url=load_bad_url,
            span_of_host=span_of_host,
            max_in_mem=max_in_mem,
            is_good_link=self.is_good_link,
        )
        self.urlpool = UrlPool(urlindex_file, **urlpool_options)

        self._workers = 0
        self.max_working = max_working
        self.worker_conf_file = worker_conf_file
        self._http_exception_code = 900
        # Left unset here; custom HTTP headers can be assigned in
        # init_urlpool().
        self.headers = None

        self.logger = (
            init_file_logger(logfile)
            if logfile
            else logging.getLogger('xcrawler')
        )
        self.failed_urls = {}

        # Queue for high-priority URLs; start() takes URLs from this queue
        # first.
        self.urlqueue = gevent.queue.Queue()
Exemplo n.º 2
0
 def __init__(self, name):
     """Create the database connection, file logger and URL pool, then load hubs.

     Args:
         name: passed to ``UrlPool`` and used, with a ``.log`` suffix, as
             the argument to ``fn.init_file_logger``.
     """
     # Connection settings are read from the project-level ``config`` module.
     db_settings = (
         config.db_host,
         config.db_db,
         config.db_user,
         config.db_password,
     )
     self.db = Connection(*db_settings)

     log_path = name + '.log'
     self.logger = fn.init_file_logger(log_path)

     self.urlpool = UrlPool(name)

     # Reset before load_hubs() is invoked.
     # NOTE(review): presumably load_hubs() fills this in -- confirm.
     self.hub_hosts = None
     self.load_hubs()
Exemplo n.º 3
0
 def __init__(self, name):
     """Set up worker counters, logger, URL pool, HTTP session and database.

     Args:
         name: passed to ``UrlPool`` and used, with a ``.log`` suffix, as
             the argument to ``fn.init_file_logger``.
     """
     # Current worker count and its upper bound.
     self._workers = 0
     self._workers_max = 5

     log_path = name + '.log'
     self.logger = fn.init_file_logger(log_path)
     self.urlpool = UrlPool(name)

     # The same event loop object is stored on the instance and handed to
     # the aiohttp session.
     event_loop = asyncio.get_event_loop()
     self.loop = event_loop
     self.session = aiohttp.ClientSession(loop=event_loop)

     # Select the "haodaifu" database on a Motor client built from
     # config.MONGO_URI.
     mongo_client = motor.motor_asyncio.AsyncIOMotorClient(config.MONGO_URI)
     self.db = mongo_client["haodaifu"]
Exemplo n.º 4
0
 def __init__(self):
     """Set up worker counters, URL pool, HTTP session and database handle."""
     # Current worker count and its upper bound.
     self._workers = 0
     self._workers_max = 5

     self.urlpool = UrlPool()

     # The same event loop object is stored on the instance and handed to
     # the aiohttp session.
     event_loop = asyncio.get_event_loop()
     self.loop = event_loop
     self.session = aiohttp.ClientSession(loop=event_loop)

     # Select the "cninfo" database on a Motor client built from
     # config.MONGO_URI.
     mongo_client = motor.motor_asyncio.AsyncIOMotorClient(config.MONGO_URI)
     self.db = mongo_client["cninfo"]
Exemplo n.º 5
0
    def __init__(self, name):
        """Set up worker counters, logger, URL pool, HTTP session and database.

        Args:
            name: passed to ``UrlPool`` and used, with a ``.log`` suffix, as
                the argument to ``fn.init_file_logger``.
        """
        # Current worker count and its upper bound.
        self._workers = 0
        self._workers_max = 30

        log_path = name + '.log'
        self.logger = fn.init_file_logger(log_path)
        self.urlpool = UrlPool(name)

        # One event loop object is shared by the instance attribute, the
        # aiohttp session and the database wrapper.
        event_loop = asyncio.get_event_loop()
        self.loop = event_loop
        self.session = aiohttp.ClientSession(loop=event_loop)

        # Connection settings are read from the project-level ``config``
        # module.
        self.db = sanicdb.SanicDB(
            config.db_host,
            config.db_db,
            config.db_user,
            config.db_password,
            loop=event_loop,
        )
Exemplo n.º 6
0
 def __init__(
         self,
         max_working=20,
         common_gap=20,
         urlindex_file="",
         proxies_file=None,
         span_of_host=3,
         worker_conf_file='xworkers.conf',
         load_bad_url=None,
         logfile='',
 ):
     """Build the proxy pool, URL pool, logger and bookkeeping state.

     Args:
         max_working: stored as ``self.max_working``.
         common_gap: forwarded to ``ProxyPool`` as its first argument.
         urlindex_file: forwarded to ``UrlPool`` as its first argument.
         proxies_file: forwarded to ``ProxyPool`` as its second argument.
         span_of_host: forwarded to ``UrlPool``.
         worker_conf_file: stored as ``self.worker_conf_file``.
         load_bad_url: forwarded to ``UrlPool``.
         logfile: when truthy, a logger is created with
             ``init_file_logger(logfile)``; otherwise the ``'xcrawler'``
             logger from ``logging`` is used.
     """
     self.proxypool = ProxyPool(common_gap, proxies_file)

     # The URL pool receives this object's is_good_link method as a
     # callback alongside the plain configuration values.
     urlpool_options = dict(
         load_bad_url=load_bad_url,
         span_of_host=span_of_host,
         is_good_link=self.is_good_link,
     )
     self.urlpool = UrlPool(urlindex_file, **urlpool_options)

     self._workers = 0
     self.max_working = max_working
     self.worker_conf_file = worker_conf_file
     self._http_exception_code = 900
     # Left unset here; custom HTTP headers can be assigned in
     # init_urlpool().
     self.headers = None

     self.logger = (
         init_file_logger(logfile)
         if logfile
         else logging.getLogger('xcrawler')
     )
Exemplo n.º 7
0
#!/usr/bin/env python3
# encoding: utf8
# author: veelion
# file: bee_server.py

from sanic import Sanic
from sanic import response

from urlpool import UrlPool

# Module-level URL pool, constructed with this script's path as its argument.
# It is referenced (and finally deleted) by cache_urlpool() below.
urlpool = UrlPool(__file__)

# Initialize urlpool; modify this section according to your needs.
# hub_urls starts out empty -- fill it with your own hub URLs.
# NOTE(review): the meaning of the second set_hubs() argument (300) is not
# visible from this file -- confirm against UrlPool.set_hubs.
hub_urls = []
urlpool.set_hubs(hub_urls, 300)
# Add one starting URL to the pool.
urlpool.add('https://news.sina.com.cn/')

# Create the Sanic application that the listener and routes below attach to.
app = Sanic(__name__)


@app.listener('after_server_stop')
async def cache_urlpool(app, loop):
    """Delete the module-level ``urlpool`` after the server has stopped.

    Registered on ``app`` for the ``after_server_stop`` event. The ``app``
    and ``loop`` arguments are accepted but not used by the body.

    NOTE(review): the printed message suggests that ``UrlPool`` saves
    ("caches") its state when the object is destroyed -- confirm against
    the UrlPool implementation.
    """
    global urlpool
    print('caching urlpool after_server_stop')
    # Remove the global name binding; any cleanup happens as a consequence
    # of the object being released, not through an explicit call here.
    del urlpool
    print('bye!')


@app.route('/task')
async def task_get(request):