def _load_object(self, obj_class_name, silent=False):
    """
    Resolve ``obj_class_name`` with ``load_object`` and build the component from it.

    The resolved object is handed to ``self._load_frontier_object``, whose return value is
    passed straight back to the caller.

    :param obj_class_name: value given to ``load_object`` to obtain the component class.
    :param bool silent: when true, a ``NotConfigured`` raised by ``_load_frontier_object``
        is swallowed and ``None`` is returned instead.
    :returns: the result of ``_load_frontier_object``, or ``None`` when the component is
        not configured and ``silent`` is true.
    :raises NotConfigured: when ``_load_frontier_object`` raises it and ``silent`` is false.
    """
    # Resolution happens outside the ``try`` on purpose: only the construction step may be silenced.
    obj_class = load_object(obj_class_name)
    try:
        return self._load_frontier_object(obj_class)
    except NotConfigured:
        if not silent:
            # Re-raise the exception that was caught (not a fresh, empty one) so its
            # message and traceback reach the caller.
            raise
    return None
def __init__(self, manager):
    """
    Prepare the SQLAlchemy engine, models, tables and session for this backend.

    All options are read from ``manager.settings`` (``SQLALCHEMYBACKEND_*`` keys), each
    falling back to the matching ``DEFAULT_*`` value.

    :param manager: object exposing a ``settings`` attribute with a ``get(key, default)`` method;
        it is also kept on ``self.manager``.
    """
    self.manager = manager

    # Read every option up front, before anything touches the database
    options = manager.settings
    engine_spec = options.get('SQLALCHEMYBACKEND_ENGINE', DEFAULT_ENGINE)
    echo_flag = options.get('SQLALCHEMYBACKEND_ENGINE_ECHO', DEFAULT_ENGINE_ECHO)
    must_drop_tables = options.get('SQLALCHEMYBACKEND_DROP_ALL_TABLES', DEFAULT_DROP_ALL_TABLES)
    must_clear_content = options.get('SQLALCHEMYBACKEND_CLEAR_CONTENT', DEFAULT_CLEAR_CONTENT)
    model_paths = options.get('SQLALCHEMYBACKEND_MODELS', DEFAULT_MODELS)

    # Engine
    self.engine = create_engine(engine_spec, echo=echo_flag)

    # Models: map each configured name to the object ``load_object`` resolves for it
    self.models = {model_name: load_object(model_path) for model_name, model_path in model_paths.items()}

    # Tables: optionally drop everything first, then always (re)create
    if must_drop_tables:
        Base.metadata.drop_all(self.engine)
    Base.metadata.create_all(self.engine)

    # Session factory bound to the engine, plus one session opened from it
    session_factory = sessionmaker()
    session_factory.configure(bind=self.engine)
    self.Session = session_factory
    self.session = session_factory()

    # Optionally issue a DELETE against every table registered on the metadata
    if must_clear_content:
        for table in Base.metadata.tables.values():
            self.session.execute(table.delete())
def _start_logger(self, klass, name, level, enabled, handlers):
    """
    Instantiate a logger of type ``klass`` and attach every handler to it.

    :param klass: callable invoked as ``klass(name=..., level=..., enabled=...)``.
    :param name: forwarded to ``klass`` as ``name``.
    :param level: forwarded to ``klass`` as ``level``.
    :param enabled: forwarded to ``klass`` as ``enabled``.
    :param handlers: iterable of handlers; string entries are resolved through
        ``load_object`` first, anything else is attached as given.
    :returns: the newly created logger.
    """
    new_logger = klass(name=name, level=level, enabled=enabled)
    for entry in handlers:
        # Strings are treated as paths to load; other values are used directly
        resolved = load_object(entry) if isinstance(entry, six.string_types) else entry
        new_logger.add_handler(resolved)
    return new_logger
def __init__(self, page_model, link_model, backend, logger, event_log_manager, frontier_middlewares=None,
             test_mode=False, max_pages=0, max_next_pages=0, auto_start=True, settings=None):
    """
    Build the frontier manager and, when ``auto_start`` is set, start it.

    Components are loaded in a fixed order: settings, logger, page model, link model,
    middlewares, backend, event log manager. The logger comes first because every later
    step reports through ``self.logger.manager``.

    :param page_model: passed to ``load_object``; the result must subclass ``Page``.
    :param link_model: passed to ``load_object``; the result must subclass ``Link``.
    :param backend: passed to ``self._load_object``; the result must be a ``Backend`` instance.
    :param logger: passed to ``load_object``; the result is called with the settings and must
        produce a ``FrontierLogger`` instance.
    :param event_log_manager: passed to ``self._load_object``.
    :param frontier_middlewares: passed to ``self._load_middlewares``.
    :param bool test_mode: stored as ``_test_mode`` and reported in the debug log.
    :param int max_pages: stored as ``_max_pages``.
    :param int max_next_pages: stored as ``_max_next_pages``.
    :param bool auto_start: stored as ``_auto_start``; when true ``self.start()`` is called
        at the end of construction.
    :param settings: settings object; a new ``Settings()`` is created when this is falsy.
    :raises AssertionError: when the logger, page model, link model or backend has the wrong type.
    """
    # NOTE(review): ``self.logger``, ``self.test_mode``, ``self.backend``,
    # ``self.frontier_middlewares`` and ``self.auto_start`` are read below but not defined in
    # this method -- presumably properties over the matching ``_``-prefixed attributes; confirm.

    # Settings
    self._settings = settings or Settings()

    # Logger
    self._logger = load_object(logger)(self._settings)
    assert isinstance(self._logger, FrontierLogger), (
        "logger '%s' must subclass FrontierLogger" % self._logger.__class__.__name__)

    # Log frontier manager starting
    self.logger.manager.debug('-' * 80)
    self.logger.manager.debug('Starting Frontier Manager...')

    # Test mode
    self._test_mode = test_mode
    self.logger.manager.debug('Test mode %s' % ("ENABLED" if self.test_mode else "DISABLED"))

    # Load page model
    self._page_model = load_object(page_model)
    assert issubclass(self._page_model, Page), (
        "Page model '%s' must subclass Page" % self._page_model.__name__)

    # Load link model (message fixed: it used to say "Page model" for a link model failure)
    self._link_model = load_object(link_model)
    assert issubclass(self._link_model, Link), (
        "Link model '%s' must subclass Link" % self._link_model.__name__)

    # Load middlewares
    self._frontier_middlewares = self._load_middlewares(frontier_middlewares)

    # Load backend
    self.logger.manager.debug("Loading backend '%s'" % backend)
    self._backend = self._load_object(backend)
    assert isinstance(self.backend, Backend), (
        "backend '%s' must subclass Backend" % self.backend.__class__.__name__)

    # Init frontier components pipeline: (label, component) pairs, middlewares before backend
    self._components_pipeline = [
        ('Middleware', self.frontier_middlewares),
        ('Backend', self.backend),
    ]

    # Page counters
    self._max_pages = max_pages
    self._max_next_pages = max_next_pages
    self._n_pages = 0

    # Iteration counter
    self._iteration = 0

    # Manager finished flag
    self._finished = False

    # Load Event log manager
    self.logger.manager.debug("Loading event log manager '%s'" % event_log_manager)
    self._event_log_manager = self._load_object(event_log_manager)

    # Log frontier manager start
    self.logger.manager.debug('Frontier Manager Started!')
    self.logger.manager.debug('-' * 80)

    # start/stop
    self._started = False
    self._stopped = False
    self._auto_start = auto_start
    if self.auto_start:
        self.start()
def __init__(self, request_model, response_model, backend, logger, event_log_manager, middlewares=None,
             test_mode=False, max_requests=0, max_next_requests=0, auto_start=True, settings=None):
    """
    Set up the frontier manager and, unless ``auto_start`` is disabled, start it.

    :param object/string request_model: The :class:`Request <crawlfrontier.core.models.Request>`
        object the frontier will use.
    :param object/string response_model: The :class:`Response <crawlfrontier.core.models.Response>`
        object the frontier will use.
    :param object/string backend: The :class:`Backend <crawlfrontier.core.components.Backend>`
        object the frontier will use.
    :param object/string logger: The :class:`Logger` object the frontier will use.
    :param object/string event_log_manager: The :class:`EventLogger` object the frontier will use.
    :param list middlewares: The :class:`Middleware <crawlfrontier.core.components.Middleware>`
        objects the frontier will use.
    :param bool test_mode: Switches :ref:`frontier test mode <frontier-test-mode>` on or off.
    :param int max_requests: Number of pages after which the frontier would stop
        (see :ref:`Finish conditions <frontier-finish>`).
    :param int max_next_requests: Upper bound on the number of requests returned by the
        :attr:`get_next_requests <crawlfrontier.core.manager.FrontierManager.get_next_requests>` method.
    :param bool auto_start: Switches automatic frontier start on or off
        (see :ref:`starting/stopping the frontier <frontier-start-stop>`).
    :param object/string settings: The :class:`Settings <crawlfrontier.settings.Settings>` object
        the frontier will use.
    """
    separator = '-' * 80

    # Settings: fall back to a fresh Settings() when nothing usable was supplied
    self._settings = settings if settings else Settings()

    # Logger: resolve the factory, then instantiate it with the settings
    logger_factory = load_object(logger)
    self._logger = logger_factory(self._settings)
    assert isinstance(self._logger, FrontierLogger), (
        "logger '%s' must subclass FrontierLogger" % self._logger.__class__.__name__)

    # Announce start-up
    self.logger.manager.debug(separator)
    self.logger.manager.debug('Starting Frontier Manager...')

    # Test mode
    self._test_mode = test_mode
    test_mode_label = "ENABLED" if self.test_mode else "DISABLED"
    self.logger.manager.debug('Test mode %s' % test_mode_label)

    # Request model
    self._request_model = request_cls = load_object(request_model)
    assert issubclass(request_cls, models.Request), (
        "Request model '%s' must subclass 'Request'" % request_cls.__name__)

    # Response model
    self._response_model = response_cls = load_object(response_model)
    assert issubclass(response_cls, models.Response), (
        "Response model '%s' must subclass 'Response'" % response_cls.__name__)

    # Middlewares
    self._middlewares = self._load_middlewares(middlewares)

    # Backend
    self.logger.manager.debug("Loading backend '%s'" % backend)
    self._backend = self._load_object(backend)
    assert isinstance(self.backend, Backend), (
        "backend '%s' must subclass Backend" % self.backend.__class__.__name__)

    # Components pipeline: one (label, component, flag) triple per stage, middlewares first
    middleware_stage = ('Middleware', self.middlewares, True)
    backend_stage = ('Backend', self.backend, False)
    self._components_pipeline = [middleware_stage, backend_stage]

    # Request counters
    self._max_requests = max_requests
    self._max_next_requests = max_next_requests
    self._n_requests = 0

    # Iteration counter
    self._iteration = 0

    # Finished flag
    self._finished = False

    # Event log manager
    self.logger.manager.debug("Loading event log manager '%s'" % event_log_manager)
    self._event_log_manager = self._load_object(event_log_manager)

    # Announce that start-up is complete
    self.logger.manager.debug('Frontier Manager Started!')
    self.logger.manager.debug(separator)

    # Start/stop state; kick off immediately when auto start is on
    self._started = False
    self._stopped = False
    self._auto_start = auto_start
    if self.auto_start:
        self.start()
def __init__(self, manager):
    """
    Load the fingerprint function named in the manager's settings.

    The settings key to look up is ``self.fingerprint_function_name``.

    :param manager: object exposing ``settings.get(key, default)``.
    :raises NotConfigured: when the setting is missing or falsy.
    """
    function_path = manager.settings.get(self.fingerprint_function_name, None)
    if function_path:
        self.fingerprint_function = load_object(function_path)
    else:
        raise NotConfigured
def __init__(self, manager):
    """
    Load the fingerprint function named in the manager's settings.

    The settings key to look up is ``self.fingerprint_function_name``. If no value is
    configured, a warning is written to ``manager.logger.frontier`` before giving up.

    :param manager: object exposing ``settings.get(key, default)`` and ``logger.frontier``.
    :raises NotConfigured: when the setting is missing or falsy.
    """
    settings_key = self.fingerprint_function_name
    function_path = manager.settings.get(settings_key, None)

    # Guard clause: report the missing setting, then signal that this component is unusable
    if not function_path:
        warning_text = 'Missing function "%s" in settings' % self.fingerprint_function_name
        manager.logger.frontier.warning(warning_text)
        raise NotConfigured

    self.fingerprint_function = load_object(function_path)