def __init__(
    self,
    url: str,
    method: str = 'GET',
    *,
    callback=None,
    metadata: dict = None,
    request_config: dict = None,
    request_session=None,
    res_type: str = 'text',
    **kwargs
):
    """
    Store the parameters that describe a single request.

    :param url: target URL, stored unchanged.
    :param method: HTTP verb; upper-cased and checked against ``self.METHOD``.
    :param callback: stored unchanged on ``self.callback``.
    :param metadata: caller-supplied dict; a new empty dict is used when
        ``None`` is given.
    :param request_config: configuration dict; ``self.REQUEST_CONFIG`` is
        used when ``None`` is given.
    :param request_session: stored unchanged on ``self.request_session``.
    :param res_type: stored unchanged on ``self.res_type``.
    :param kwargs: any extra keyword arguments, kept on ``self.kwargs``.
    :raises ValueError: if the upper-cased method is not in ``self.METHOD``.
    """
    # TODO: cookie
    verb = method.upper()
    self.url = url
    self.method = verb
    if verb not in self.METHOD:
        raise ValueError('%s method is not supported' % verb)

    self.callback = callback
    self.metadata = {} if metadata is None else metadata
    self.request_session = request_session

    # Only an explicit None selects the class-level configuration.
    if request_config is not None:
        self.request_config = request_config
    else:
        self.request_config = self.REQUEST_CONFIG

    self.res_type = res_type
    self.kwargs = kwargs
    self.close_request_session = False
    self.logger = get_logger(name=self.name)

    # Retry budget comes from the effective config, defaulting to 3.
    self.retry_times = self.request_config.get('RETRIES', 3)
def __init__(
    self,
    url: str,
    method: str = 'GET',
    *,
    callback=None,
    headers: dict = None,
    load_js: bool = False,
    metadata: dict = None,
    request_config: dict = None,
    request_session=None,
    res_type: str = 'text',
    **kwargs
):
    """
    Store the parameters that describe a single request.

    :param url: target URL, stored unchanged.
    :param method: HTTP verb; must be a member of ``self.METHOD``.
    :param callback: stored unchanged on ``self.callback``.
    :param headers: header dict; a new empty dict is created when omitted.
    :param load_js: flag stored unchanged on ``self.load_js``.
    :param metadata: metadata dict; a new empty dict is created when omitted.
    :param request_config: configuration dict; ``self.REQUEST_CONFIG`` is
        used when it is omitted or empty.
    :param request_session: stored unchanged on ``self.request_session``.
    :param res_type: stored unchanged on ``self.res_type``.
    :param kwargs: any extra keyword arguments, kept on ``self.kwargs``.
    :raises ValueError: if ``method`` is not in ``self.METHOD``.
    """
    self.url = url
    self.method = method
    if self.method not in self.METHOD:
        raise ValueError('{} method is not supported ~~~'.format(self.method))

    self.callback = callback
    # Default values are evaluated once at definition time, so a ``{}``
    # default would be one dict shared by every instance. Build a fresh
    # dict per instance instead; a dict passed by the caller is kept as-is.
    self.headers = headers if headers is not None else {}
    self.load_js = load_js
    self.metadata = metadata if metadata is not None else {}
    self.request_session = request_session

    # A missing or empty config falls back to the class-level defaults.
    if not request_config:
        self.request_config = self.REQUEST_CONFIG
    else:
        self.request_config = request_config

    self.res_type = res_type
    self.kwargs = kwargs
    self.close_request_session = False
    self.logger = get_logger(name=self.name)
    # Retry budget comes from the effective config, defaulting to 15.
    self.retry_times = self.request_config.get('RETRIES', 15)
def __init__(self, loop=None):
    """
    Validate the spider's ``start_urls`` and attach a logger and event loop.

    :param loop: event loop to use; when falsy, the loop returned by
        ``asyncio.get_event_loop()`` is used instead.
    :raises ValueError: if ``start_urls`` is empty or is not a list.
    """
    has_seed_list = self.start_urls and isinstance(self.start_urls, list)
    if not has_seed_list:
        raise ValueError(
            "Spider must have a param named start_urls, eg: start_urls = ['https://www.github.com']"
        )

    self.logger = get_logger(name=self.name)

    if loop:
        self.loop = loop
    else:
        self.loop = asyncio.get_event_loop()
def __init__(self, loop=None):
    """
    Validate ``start_urls`` and set up logger, event loop, queue and semaphore.

    :param loop: event loop to use; when falsy, a new loop is created with
        ``asyncio.new_event_loop()``. Either way the chosen loop is
        installed as the current loop via ``asyncio.set_event_loop``.
    :raises ValueError: if ``start_urls`` is empty or is not a list.
    """
    has_seed_list = self.start_urls and isinstance(self.start_urls, list)
    if not has_seed_list:
        raise ValueError(
            "Spider must have a param named start_urls, eg: start_urls = ['https://www.github.com']"
        )

    self.logger = get_logger(name=self.name)

    if loop:
        self.loop = loop
    else:
        self.loop = asyncio.new_event_loop()
    asyncio.set_event_loop(self.loop)

    # Created only after the loop has been installed as the current one.
    self.request_queue = asyncio.Queue()
    # Sized by the optional ``concurrency`` attribute, defaulting to 3.
    max_slots = getattr(self, 'concurrency', 3)
    self.sem = asyncio.Semaphore(max_slots)
def __init__(self, middleware, loop=None):
    """
    Validate ``start_urls`` and set up logger, loop, middleware and queue.

    :param middleware: either a single middleware object or a list of
        them. A non-empty list is folded into one object with ``+``; any
        falsy value (``None`` or an empty list) yields a new
        ``Middleware()``.
    :param loop: event loop to use; when falsy, a new loop is created with
        ``asyncio.new_event_loop()``. Either way the chosen loop is
        installed as the current loop via ``asyncio.set_event_loop``.
    :raises ValueError: if ``start_urls`` is empty or is not a list.
    """
    if not self.start_urls or not isinstance(self.start_urls, list):
        raise ValueError("Spider must have a param start_urls")

    self.logger = get_logger(name=self.name)
    self.loop = loop or asyncio.new_event_loop()
    asyncio.set_event_loop(self.loop)

    # customize middleware
    # ``reduce`` without an initial value raises TypeError on an empty
    # sequence, so an empty list takes the fallback branch like any other
    # falsy value.
    if isinstance(middleware, list) and middleware:
        self.middleware = reduce(lambda x, y: x + y, middleware)
    else:
        self.middleware = middleware or Middleware()

    # async queue
    self.request_queue = asyncio.Queue()
    # semaphore, sized by the optional ``concurrency`` attribute (default 3)
    self.sem = asyncio.Semaphore(getattr(self, 'concurrency', 3))
#!/usr/bin/env python import os from importlib import util from aspider.utils import get_logger log = get_logger('settings') class SettingsWrapper(object): """ SettingsWrapper returns a spider config """ def __init__(self, settings_name='settings.py'): self.my_settings = {} self.settings_name = settings_name self._load_settings() def __call__(self): return self.my_settings def settings(self): return self.my_settings def load_with_file(self, file_path): file_name = os.path.basename(file_path) if file_name[-3:] != '.py': log.error("module name must be python file, such as : example.py") module_spec = util.spec_from_file_location(file_name, file_path)