예제 #1
0
 def _load_object(self, obj_class_name, silent=False):
     """Resolve a class by name and build a frontier object from it.

     :param obj_class_name: Value handed to ``load_object`` to obtain the class.
     :param bool silent: When true, a ``NotConfigured`` raised while building
         the object is swallowed and ``None`` is returned instead.
     :returns: Whatever ``self._load_frontier_object`` returns, or ``None``
         when the object is not configured and ``silent`` is true.
     :raises NotConfigured: If building the object raises it and ``silent``
         is false.
     """
     obj_class = load_object(obj_class_name)
     try:
         return self._load_frontier_object(obj_class)
     except NotConfigured:
         if not silent:
             # Bare ``raise`` re-raises the caught exception itself, so any
             # message and the original traceback are preserved instead of
             # being replaced by a fresh, empty NotConfigured.
             raise
         return None
예제 #2
0
    def __init__(self, manager):
        """Set up the SQLAlchemy engine, models, tables and session.

        Reads the ``SQLALCHEMYBACKEND_*`` values from ``manager.settings``,
        falling back to the corresponding ``DEFAULT_*`` names when a setting
        is missing.

        :param manager: Object exposing a ``settings`` attribute that supports
            ``get(name, default)``.
        """
        self.manager = manager

        # Get settings
        settings = manager.settings
        engine = settings.get('SQLALCHEMYBACKEND_ENGINE', DEFAULT_ENGINE)
        engine_echo = settings.get('SQLALCHEMYBACKEND_ENGINE_ECHO',
                                   DEFAULT_ENGINE_ECHO)
        drop_all_tables = settings.get('SQLALCHEMYBACKEND_DROP_ALL_TABLES',
                                       DEFAULT_DROP_ALL_TABLES)
        clear_content = settings.get('SQLALCHEMYBACKEND_CLEAR_CONTENT',
                                     DEFAULT_CLEAR_CONTENT)
        models = settings.get('SQLALCHEMYBACKEND_MODELS', DEFAULT_MODELS)

        # Create engine
        self.engine = create_engine(engine, echo=engine_echo)

        # Load models (each value of the setting is resolved with load_object)
        self.models = {name: load_object(klass)
                       for name, klass in models.items()}

        # Drop tables if we have to; create_all always runs afterwards
        if drop_all_tables:
            Base.metadata.drop_all(self.engine)
        Base.metadata.create_all(self.engine)

        # Create session
        self.Session = sessionmaker()
        self.Session.configure(bind=self.engine)
        self.session = self.Session()

        # Clear content if we have to
        if clear_content:
            # sorted_tables is in foreign-key dependency order; walking it in
            # reverse empties dependent tables before the tables they
            # reference, so the deletes cannot trip FK constraints.
            for table in reversed(Base.metadata.sorted_tables):
                self.session.execute(table.delete())
예제 #3
0
 def _start_logger(self, klass, name, level, enabled, handlers):
     """Instantiate a logger and attach every handler to it.

     :param klass: Callable invoked with ``name``, ``level`` and ``enabled``
         keyword arguments to create the logger.
     :param handlers: Iterable of handlers; entries that are strings are
         resolved through ``load_object`` before being attached.
     :returns: The newly created logger.
     """
     new_logger = klass(name=name, level=level, enabled=enabled)
     for entry in handlers:
         # String entries are import paths; anything else is used as given.
         resolved = load_object(entry) if isinstance(entry, six.string_types) else entry
         new_logger.add_handler(resolved)
     return new_logger
예제 #4
0
 def _load_object(self, obj_class_name, silent=False):
     """Load the class named by ``obj_class_name`` and build an object from it.

     :param obj_class_name: Passed to ``load_object`` to resolve the class.
     :param bool silent: If true, ``NotConfigured`` raised during construction
         is suppressed and ``None`` is returned.
     :returns: The result of ``self._load_frontier_object``, or ``None`` when
         construction raised ``NotConfigured`` and ``silent`` is true.
     :raises NotConfigured: Propagated from construction when ``silent`` is
         false.
     """
     obj_class = load_object(obj_class_name)
     try:
         return self._load_frontier_object(obj_class)
     except NotConfigured:
         if not silent:
             # Re-raise the exception that was actually caught so its message
             # and traceback survive; ``raise NotConfigured`` would discard
             # them by creating a new, empty instance.
             raise
         return None
예제 #5
0
 def _start_logger(self, klass, name, level, enabled, handlers):
     """Build a logger from ``klass`` and register the given handlers on it.

     :param klass: Logger factory; called as
         ``klass(name=name, level=level, enabled=enabled)``.
     :param handlers: Iterable of handlers. A handler given as a string is
         first turned into an object with ``load_object``.
     :returns: The logger with all handlers added.
     """
     created = klass(name=name, level=level, enabled=enabled)
     for item in handlers:
         is_path = isinstance(item, six.string_types)
         if is_path:
             # Replace the string with the object it refers to.
             item = load_object(item)
         created.add_handler(item)
     return created
예제 #6
0
    def __init__(self, manager):
        """Initialise engine, models, schema and session for the backend.

        All options come from ``manager.settings`` under the
        ``SQLALCHEMYBACKEND_*`` keys; a missing key falls back to the matching
        ``DEFAULT_*`` name.

        :param manager: Object whose ``settings`` attribute supports
            ``get(name, default)``.
        """
        self.manager = manager

        # Get settings
        settings = manager.settings
        engine = settings.get('SQLALCHEMYBACKEND_ENGINE', DEFAULT_ENGINE)
        engine_echo = settings.get('SQLALCHEMYBACKEND_ENGINE_ECHO', DEFAULT_ENGINE_ECHO)
        drop_all_tables = settings.get('SQLALCHEMYBACKEND_DROP_ALL_TABLES', DEFAULT_DROP_ALL_TABLES)
        clear_content = settings.get('SQLALCHEMYBACKEND_CLEAR_CONTENT', DEFAULT_CLEAR_CONTENT)
        models = settings.get('SQLALCHEMYBACKEND_MODELS', DEFAULT_MODELS)

        # Create engine
        self.engine = create_engine(engine, echo=engine_echo)

        # Load models: map each configured name to the object load_object returns
        self.models = {name: load_object(klass) for name, klass in models.items()}

        # Drop tables if we have to, then (re)create whatever is missing
        if drop_all_tables:
            Base.metadata.drop_all(self.engine)
        Base.metadata.create_all(self.engine)

        # Create session
        self.Session = sessionmaker()
        self.Session.configure(bind=self.engine)
        self.session = self.Session()

        # Clear content if we have to
        if clear_content:
            # Delete in reverse dependency order (children first) so that
            # foreign-key constraints are not violated while emptying tables.
            for table in reversed(Base.metadata.sorted_tables):
                self.session.execute(table.delete())
예제 #7
0
    def __init__(self, page_model, link_model, backend, logger, event_log_manager,
                 frontier_middlewares=None, test_mode=False,
                 max_pages=0, max_next_pages=0, auto_start=True, settings=None):
        """Load and validate the frontier components, then optionally start.

        :param page_model: Resolved with ``load_object``; must be a subclass
            of ``Page``.
        :param link_model: Resolved with ``load_object``; must be a subclass
            of ``Link``.
        :param backend: Loaded with ``self._load_object``; the result must be
            a ``Backend`` instance.
        :param logger: Resolved with ``load_object`` and called with the
            settings; the result must be a ``FrontierLogger`` instance.
        :param event_log_manager: Loaded with ``self._load_object``.
        :param frontier_middlewares: Passed to ``self._load_middlewares``.
        :param test_mode: Stored as ``_test_mode`` and reported in the log.
        :param max_pages: Stored as ``_max_pages``.
        :param max_next_pages: Stored as ``_max_next_pages``.
        :param auto_start: Stored as ``_auto_start``; ``self.start()`` is
            called at the end when ``self.auto_start`` is truthy.
        :param settings: Settings object; a new ``Settings()`` is created
            when this is falsy.
        :raises AssertionError: If the logger, page model, link model or
            backend fails its type check.
        """

        # Settings
        self._settings = settings or Settings()

        # Logger
        self._logger = load_object(logger)(self._settings)
        assert isinstance(self._logger, FrontierLogger), "logger '%s' must subclass FrontierLogger" % \
                                                         self._logger.__class__.__name__

        # Log frontier manager starting
        self.logger.manager.debug('-'*80)
        self.logger.manager.debug('Starting Frontier Manager...')

        # Test mode
        self._test_mode = test_mode
        self.logger.manager.debug('Test mode %s' % ("ENABLED" if self.test_mode else "DISABLED"))

        # Load page model
        self._page_model = load_object(page_model)
        assert issubclass(self._page_model, Page), "Page model '%s' must subclass Page" % \
                                                   self._page_model.__name__

        # Load link model
        self._link_model = load_object(link_model)
        # The message names the link model (it previously said "Page model",
        # copied from the check above).
        assert issubclass(self._link_model, Link), "Link model '%s' must subclass Link" % \
                                                   self._link_model.__name__

        # Load middlewares
        self._frontier_middlewares = self._load_middlewares(frontier_middlewares)

        # Load backend
        self.logger.manager.debug("Loading backend '%s'" % backend)
        self._backend = self._load_object(backend)
        assert isinstance(self.backend, Backend), "backend '%s' must subclass Backend" % \
                                                  self.backend.__class__.__name__

        # Init frontier components pipeline
        self._components_pipeline = [
            ('Middleware', self.frontier_middlewares),
            ('Backend', self.backend),
        ]

        # Page counters
        self._max_pages = max_pages
        self._max_next_pages = max_next_pages
        self._n_pages = 0

        # Iteration counter
        self._iteration = 0

        # Manager finished flag
        self._finished = False

        # Load Event log manager
        self.logger.manager.debug("Loading event log manager '%s'" % event_log_manager)
        self._event_log_manager = self._load_object(event_log_manager)

        # Log frontier manager start
        self.logger.manager.debug('Frontier Manager Started!')
        self.logger.manager.debug('-'*80)

        # start/stop
        self._started = False
        self._stopped = False
        self._auto_start = auto_start
        if self.auto_start:
            self.start()
예제 #8
0
    def __init__(self, request_model, response_model, backend, logger, event_log_manager,
                 middlewares=None, test_mode=False, max_requests=0, max_next_requests=0,
                 auto_start=True, settings=None):
        """
        Load, validate and wire together the frontier components.

        :param object/string request_model: The
            :class:`Request <crawlfrontier.core.models.Request>` object to be
            used by the frontier.

        :param object/string response_model: The
            :class:`Response <crawlfrontier.core.models.Response>` object to
            be used by the frontier.

        :param object/string backend: The
            :class:`Backend <crawlfrontier.core.components.Backend>` object to
            be used by the frontier.

        :param object/string logger: The :class:`Logger` object to be used by
            the frontier.

        :param object/string event_log_manager: The :class:`EventLogger`
            object to be used by the frontier.

        :param list middlewares: A list of
            :class:`Middleware <crawlfrontier.core.components.Middleware>`
            objects to be used by the frontier.

        :param bool test_mode: Activate/deactivate
            :ref:`frontier test mode <frontier-test-mode>`.

        :param int max_requests: Number of pages after which the frontier
            would stop (See :ref:`Finish conditions <frontier-finish>`).

        :param int max_next_requests: Maximum number of requests returned by
            the :attr:`get_next_requests
            <crawlfrontier.core.manager.FrontierManager.get_next_requests>`
            method.

        :param bool auto_start: Activate/deactivate automatic frontier start
            (See :ref:`starting/stopping the frontier <frontier-start-stop>`).

        :param object/string settings: The
            :class:`Settings <crawlfrontier.settings.Settings>` object used by
            the frontier.
        """
        separator = '-' * 80

        # Settings: use the given object, or a fresh Settings() when it is falsy
        self._settings = settings if settings else Settings()

        # Logger: resolve the factory, build it from the settings, type-check it
        logger_factory = load_object(logger)
        self._logger = logger_factory(self._settings)
        assert isinstance(self._logger, FrontierLogger), \
            "logger '%s' must subclass FrontierLogger" % self._logger.__class__.__name__

        # Announce start-up
        self.logger.manager.debug(separator)
        self.logger.manager.debug('Starting Frontier Manager...')

        # Test mode
        self._test_mode = test_mode
        if self.test_mode:
            mode_label = "ENABLED"
        else:
            mode_label = "DISABLED"
        self.logger.manager.debug('Test mode %s' % mode_label)

        # Request model must be a Request subclass
        request_cls = load_object(request_model)
        self._request_model = request_cls
        assert issubclass(request_cls, models.Request), \
            "Request model '%s' must subclass 'Request'" % request_cls.__name__

        # Response model must be a Response subclass
        response_cls = load_object(response_model)
        self._response_model = response_cls
        assert issubclass(response_cls, models.Response), \
            "Response model '%s' must subclass 'Response'" % response_cls.__name__

        # Middlewares
        self._middlewares = self._load_middlewares(middlewares)

        # Backend must be a Backend instance
        self.logger.manager.debug("Loading backend '%s'" % backend)
        self._backend = self._load_object(backend)
        assert isinstance(self.backend, Backend), \
            "backend '%s' must subclass Backend" % self.backend.__class__.__name__

        # Components pipeline: middlewares first, then the backend
        middleware_stage = ('Middleware', self.middlewares, True)
        backend_stage = ('Backend', self.backend, False)
        self._components_pipeline = [middleware_stage, backend_stage]

        # Request counters
        self._max_requests = max_requests
        self._max_next_requests = max_next_requests
        self._n_requests = 0

        # Iteration counter and finished flag
        self._iteration = 0
        self._finished = False

        # Event log manager
        self.logger.manager.debug("Loading event log manager '%s'" % event_log_manager)
        self._event_log_manager = self._load_object(event_log_manager)

        # Announce that start-up is complete
        self.logger.manager.debug('Frontier Manager Started!')
        self.logger.manager.debug(separator)

        # start/stop state; start right away when auto start is on
        self._started = self._stopped = False
        self._auto_start = auto_start
        if self.auto_start:
            self.start()
예제 #9
0
 def __init__(self, manager):
     """Load the fingerprint function named in the manager settings.

     The settings key to read is ``self.fingerprint_function_name``.

     :param manager: Object whose ``settings`` attribute supports
         ``get(name, default)``.
     :raises NotConfigured: If the setting is missing or falsy.
     """
     function_path = manager.settings.get(self.fingerprint_function_name, None)
     if function_path:
         self.fingerprint_function = load_object(function_path)
     else:
         raise NotConfigured
예제 #10
0
 def __init__(self, manager):
     """Load the fingerprint function named in the manager settings.

     The settings key to read is ``self.fingerprint_function_name``. When the
     setting is missing or falsy a warning is logged through
     ``manager.logger.frontier`` before ``NotConfigured`` is raised.

     :param manager: Object exposing ``settings`` (with
         ``get(name, default)``) and ``logger.frontier``.
     :raises NotConfigured: If the setting is missing or falsy.
     """
     function_path = manager.settings.get(self.fingerprint_function_name, None)
     if function_path:
         self.fingerprint_function = load_object(function_path)
         return
     # Nothing configured: log which setting was expected, then bail out.
     manager.logger.frontier.warning(
         'Missing function "%s" in settings' % self.fingerprint_function_name)
     raise NotConfigured