Exemplo n.º 1
0
    def __init__(self, path='.'):
        '''Set up project state from the config file found for `path`.

        The config path comes from `get_closest_cfg`. When one is found, its
        directory is appended to `sys.path` (if not already there) and the
        file is parsed. If the `crawlmi` section has a `settings` option and
        the named module can be imported, `module_settings` is built from it
        and `inside_project` is set to True. An ImportError only produces a
        warning.
        '''
        self.inside_project = False
        self.cfg_path = self.get_closest_cfg(path=path)
        self.cfg = None
        self.project_dir = None
        self.module_settings = Settings()  # replaced if a settings module loads
        self._data_dir = None

        if not self.cfg_path:
            return

        # Make the directory holding the config file importable.
        project_dir = os.path.dirname(self.cfg_path)
        self.project_dir = project_dir
        if project_dir not in sys.path:
            sys.path.append(project_dir)

        # Parse the config file.
        parser = SafeConfigParser()
        self.cfg = parser
        parser.read([self.cfg_path])

        # Load the user-defined settings module, if one is configured.
        if not parser.has_option('crawlmi', 'settings'):
            return
        module_name = parser.get('crawlmi', 'settings')
        try:
            module = __import__(module_name, {}, {}, [''])
        except ImportError as err:
            warnings.warn(
                'Cannot import crawlmi settings module %s: %s' %
                (module_name, err))
            return
        self.module_settings = Settings.from_module(module)
        self.inside_project = True
Exemplo n.º 2
0
 def setUp(self):
     '''Build a Downloader wired to fresh queues, a mock handler and a Clock.
     '''
     self.clock = Clock()
     self.request_queue = MemoryQueue()
     self.response_queue = ResponseQueue()
     downloader_settings = Settings(self.default_settings)
     mock_handler = MockDownloaderHandler(Settings())
     self.dwn = Downloader(
         downloader_settings,
         self.request_queue,
         self.response_queue,
         download_handler=mock_handler,
         clock=self.clock)
     # Keep a shortcut to the handler the downloader actually exposes.
     self.handler = self.dwn.download_handler
Exemplo n.º 3
0
 def _update_dwn(self, **kwargs):
     '''Replace the downloader with one built from overridden settings.

     `kwargs` are applied on top of a copy of `default_settings`. The
     `processing` object of the current downloader is cancelled before the
     new downloader is created.
     '''
     overridden = self.default_settings.copy()
     overridden.update(**kwargs)
     self.dwn.processing.cancel()
     downloader_settings = Settings(overridden)
     mock_handler = MockDownloaderHandler(Settings())
     self.dwn = Downloader(
         downloader_settings,
         self.request_queue,
         self.response_queue,
         download_handler=mock_handler,
         clock=self.clock)
     self.handler = self.dwn.download_handler
Exemplo n.º 4
0
    def __init__(self, path='.'):
        '''Initialise project information starting from `path`.

        `get_closest_cfg` supplies the config file path. If there is one, the
        directory containing it is added to `sys.path` when missing and the
        file is read. A `settings` option in the `crawlmi` section names a
        module to import; on success `module_settings` is created from that
        module and `inside_project` becomes True, while an ImportError is
        reported through `warnings.warn`.
        '''
        self.inside_project = False
        self.cfg_path = self.get_closest_cfg(path=path)
        self.cfg = None
        self.project_dir = None
        self.module_settings = Settings()  # replaced if a settings module loads
        self._data_dir = None

        if not self.cfg_path:
            return

        # Project directory: the folder that contains the config file.
        base_dir = os.path.dirname(self.cfg_path)
        self.project_dir = base_dir
        if base_dir not in sys.path:
            sys.path.append(base_dir)

        # Config file contents.
        config = SafeConfigParser()
        self.cfg = config
        config.read([self.cfg_path])

        # Optional user-defined settings module.
        if not config.has_option('crawlmi', 'settings'):
            return
        dotted_name = config.get('crawlmi', 'settings')
        try:
            loaded = __import__(dotted_name, {}, {}, [''])
        except ImportError as err:
            warnings.warn(
                'Cannot import crawlmi settings module %s: %s' %
                (dotted_name, err))
            return
        self.module_settings = Settings.from_module(loaded)
        self.inside_project = True
 def setUp(self):
     '''Create a small file with known content and a file download function.

     The file is written at the `mktemp()` path with a '^' appended and
     contains the ten bytes '0123456789'.
     '''
     self.tmpname = self.mktemp()
     # NOTE(review): the '^' suffix presumably exercises escaping of special
     # characters in file URIs - confirm against the tests using tmpname.
     # The context manager closes the handle even if the write fails, and a
     # bytes literal matches the binary mode the file is opened in.
     with open(self.tmpname + '^', 'wb') as fd:
         fd.write(b'0123456789')
     self.download_request = FileDownloadHandler(
         Settings()).download_request
Exemplo n.º 6
0
    def get_settings(self, args, options):
        '''Build the settings specific to this command.

        Starts from a copy of `command_settings`, applies the NAME=VALUE
        pairs from `options.set` (when that attribute exists) and then the
        logging options `logfile`, `loglevel` and `nolog`.

        Raises UsageError when `options.set` cannot be converted.

        `self.engine` is still not initialized, so don't use it.
        '''
        merged = self.command_settings.copy()

        if hasattr(options, 'set'):
            try:
                merged.update(arglist_to_dict(options.set))
            except ValueError:
                raise UsageError('Invalid -s value, use -s NAME=VALUE',
                                 print_help=False)

        # Giving a log file or a log level switches logging on ...
        for option_name, setting_name in (('logfile', 'LOG_FILE'),
                                          ('loglevel', 'LOG_LEVEL')):
            option_value = getattr(options, option_name, None)
            if option_value:
                merged['LOG_ENABLED'] = True
                merged[setting_name] = option_value
        # ... unless logging is explicitly disabled, which wins over both.
        if getattr(options, 'nolog', None):
            merged['LOG_ENABLED'] = False

        return Settings(merged)
Exemplo n.º 7
0
 def test_get_commands(self):
     # Commands loaded from the sample module must include 'command2'
     # but not 'command1'.
     modules = ['crawlmi.tests.test_cmdline.sample_commands']
     settings = Settings({'COMMAND_MODULES': modules})
     found = get_commands(settings, False)
     self.assertNotIn('command1', found)
     self.assertIn('command2', found)
Exemplo n.º 8
0
 def setUp(self):
     '''Create a Slot using a mock handler, the default limits and a Clock.
     '''
     self.handler = MockDownloaderHandler(Settings())
     self.clock = Clock()
     slot_args = (
         self.handler,
         self.default_concurrency,
         self.default_delay,
         self.default_randomize_delay,
     )
     self.slot = Slot(*slot_args, clock=self.clock)
 def setUp(self):
     '''Serve a UriResource on 127.0.0.1 and create an HTTP download handler.
     '''
     factory = WrappingFactory(server.Site(UriResource(), timeout=None))
     # Port 0 is requested; the port actually bound is stored in `portno`.
     self.port = reactor.listenTCP(0, factory, interface='127.0.0.1')
     self.portno = self.port.getHost().port
     handler_settings = Settings({'CONCURRENT_REQUESTS_PER_DOMAIN': 8})
     self.download_handler = HttpDownloadHandler(handler_settings)
     self.download_request = self.download_handler.download_request
Exemplo n.º 10
0
    def setUp(self):
        '''Copy the sample spiders to a temp dir and build a SpiderManager.

        The temp dir is appended to `sys.path` so that the copied package
        `test_spiders_xxx` can be referenced by name in SPIDER_MODULES. A
        LogWrapper is created and set up as well.
        '''
        source_dir = os.path.join(module_dir, 'test_spiders')
        self.tmpdir = self.mktemp()
        os.mkdir(self.tmpdir)
        self.spiders_dir = os.path.join(self.tmpdir, 'test_spiders_xxx')
        shutil.copytree(source_dir, self.spiders_dir)
        sys.path.append(self.tmpdir)

        manager_settings = Settings({'SPIDER_MODULES': ['test_spiders_xxx']})
        self.manager = SpiderManager(manager_settings)
        self.lw = LogWrapper()
        self.lw.setUp()
 def setUp(self):
     '''Create a GeneralHandler with one handler path per URL scheme.'''
     # Scheme -> dotted path of the handler class.
     handlers_by_scheme = {
         'file': 'crawlmi.core.handlers.FileDownloadHandler',
         'http': 'crawlmi.core.handlers.HttpDownloadHandler',
         'https':
         'crawlmi.tests.test_downloader_handlers.NonConfiguredHandler',
     }
     self.settings = Settings({'DOWNLOAD_HANDLERS': handlers_by_scheme})
     self.handler = GeneralHandler(self.settings)
Exemplo n.º 12
0
    def test_failure(self):
        # With a failing handler, each enqueued request must end up with its
        # own Failure, and the first one must wrap a ValueError.
        self.slot.download_handler = FailureDownloaderHandler(Settings())
        outcomes = []

        for index in xrange(2):
            request, dfd = get_request(str(index))
            # Record whatever the deferred fires with, success or failure.
            dfd.addBoth(outcomes.append)
            self.slot.enqueue(request, dfd)

        self.assertEqual(len(outcomes), 2)
        first, second = outcomes
        self.assertIsInstance(first, Failure)
        self.assertIsInstance(second, Failure)
        self.assertIsNot(first, second)
        self.assertIsInstance(first.value, ValueError)
 def setUp(self):
     '''Start a local test web server and create the download handler.

     A temp directory holding one file named 'file' is served statically,
     with extra child resources attached to the root. The handler class is
     taken from `download_handler_cls`.
     '''
     docroot = self.mktemp()
     os.mkdir(docroot)
     FilePath(docroot).child('file').setContent('0123456789')

     root = static.File(docroot)
     root.putChild('redirect', util.Redirect('/file'))
     root.putChild('wait', ForeverTakingResource())
     root.putChild('hang-after-headers', ForeverTakingResource(write=True))
     root.putChild('nolength', NoLengthResource())
     root.putChild('host', HostHeaderResource())
     root.putChild('payload', PayloadResource())
     root.putChild('broken', BrokenDownloadResource())

     self.site = server.Site(root, timeout=None)
     self.wrapper = WrappingFactory(self.site)
     # Port 0 is requested; the port actually bound is stored in `portno`.
     self.port = reactor.listenTCP(0, self.wrapper, interface='127.0.0.1')
     self.portno = self.port.getHost().port

     handler_settings = Settings({'CONCURRENT_REQUESTS_PER_DOMAIN': 8})
     self.download_handler = self.download_handler_cls(handler_settings)
     self.download_request = self.download_handler.download_request
Exemplo n.º 14
0
 def test_from_module(self):
     # Settings built from the default settings module must contain
     # CONCURRENT_REQUESTS but neither an unknown name nor '__name__'.
     loaded = Settings.from_module('crawlmi.settings.default_settings')
     self.assertIn('CONCURRENT_REQUESTS', loaded)
     for absent in ('blabla', '__name__'):
         self.assertNotIn(absent, loaded)
Exemplo n.º 15
0
 def test_copy(self):
     # copy() must return a distinct Settings object with equal values.
     original = Settings({'a': 'b'})
     duplicate = original.copy()
     self.assertIsInstance(duplicate, Settings)
     self.assertIsNot(original, duplicate)
     self.assertDictEqual(original.values, duplicate.values)
Exemplo n.º 16
0
 def test_exception(self):
     # With ExceptionDownloaderHandler installed, the deferred of an
     # enqueued request must fail with an Exception.
     self.slot.download_handler = ExceptionDownloaderHandler(Settings())
     request, dfd = get_request('1')
     self.slot.enqueue(request, dfd)
     return self.assertFailure(dfd, Exception)
Exemplo n.º 17
0
 def setUp(self):
     '''Create `self.settings` from the first item of each `tests` entry.'''
     raw_values = {name: pair[0]
                   for (name, pair) in self.tests.iteritems()}
     self.settings = Settings(raw_values)
Exemplo n.º 18
0
 def setUp(self):
     '''Store Settings holding the first item of every `tests` entry.'''
     stored = dict(
         (key, entry[0]) for (key, entry) in self.tests.iteritems())
     self.settings = Settings(stored)
Exemplo n.º 19
0
class SettingsTest(unittest.TestCase):
    # Each entry maps a setting name to (stored value, expected value).
    # The name prefix (BOOL/INT/FLOAT/LIST) selects the getter that is
    # checked against the expected value.
    tests = {
        # get_bool
        'BOOL_TRUE_1': (1, True),
        'BOOL_TRUE_2': ('1', True),
        'BOOL_TRUE_3': (True, True),
        'BOOL_FALSE_1': (0, False),
        'BOOL_FALSE_2': ('0', False),
        'BOOL_FALSE_3': (False, False),
        'BOOL_FALSE_4': (None, False),

        # get_int
        'INT_1': (0, 0),
        'INT_2': ('1', 1),
        'INT_3': ('-1', -1),
        'INT_4': (4.5, 4),

        # get_float
        'FLOAT_1': (0.0, 0.0),
        'FLOAT_2': ('5.4', 5.4),
        'FLOAT_3': (47, 47.0),
        'FLOAT_4': ('47.3', 47.3),

        # get_list
        'LIST_1': (['one', 'two'], ['one', 'two']),
        'LIST_2': ('one,two', ['one', 'two']),
        'LIST_3': ('one', ['one']),
        'LIST_4': ('', ['']),
    }

    def setUp(self):
        '''Create `self.settings` from the stored value of every entry.'''
        stored = {name: pair[0]
                  for (name, pair) in self.tests.iteritems()}
        self.settings = Settings(stored)

    def test_from_module(self):
        # Settings built from the default settings module must contain
        # CONCURRENT_REQUESTS but neither an unknown name nor '__name__'.
        loaded = Settings.from_module('crawlmi.settings.default_settings')
        self.assertIn('CONCURRENT_REQUESTS', loaded)
        for absent in ('blabla', '__name__'):
            self.assertNotIn(absent, loaded)

    def test_copy(self):
        # copy() must return a distinct Settings object with equal values.
        original = Settings({'a': 'b'})
        duplicate = original.copy()
        self.assertIsInstance(duplicate, Settings)
        self.assertIsNot(original, duplicate)
        self.assertDictEqual(original.values, duplicate.values)

    def _get_answers(self, prefix):
        '''Return (name, expected value) pairs for names with `prefix`.'''
        return [(name, pair[1])
                for (name, pair) in self.tests.iteritems()
                if name.startswith(prefix)]

    def test_req_or_resp(self):
        # Values in the request meta are returned by get() when the request,
        # or a response carrying that request, is passed as `req_or_resp`.
        request = Request('http://github.com/',
                          meta={'INT_1': 10, 'a': 'b'})
        self.assertEqual(self.settings.get('INT_1', req_or_resp=request), 10)
        self.assertEqual(self.settings.get('a', req_or_resp=request), 'b')
        response = Response('', request=request)
        self.assertEqual(
            self.settings.get('INT_1', req_or_resp=response), 10)
        self.assertEqual(self.settings.get('a', req_or_resp=response), 'b')

    def test_bool(self):
        for (name, expected) in self._get_answers('BOOL'):
            self.assertIs(self.settings.get_bool(name), expected, name)
        # Missing keys: False by default, otherwise the converted default.
        self.assertIs(self.settings.get_bool('invalid'), False)
        self.assertIs(self.settings.get_bool('invalid', True), True)
        self.assertRaises(ValueError, self.settings.get_bool, 'invalid',
                          'hello')

    def test_int(self):
        for (name, expected) in self._get_answers('INT'):
            self.assertEqual(self.settings.get_int(name), expected, name)
        # Missing keys: 0 by default, otherwise the converted default.
        self.assertEqual(self.settings.get_int('invalid'), 0)
        self.assertEqual(self.settings.get_int('invalid', 12), 12)
        self.assertRaises(ValueError, self.settings.get_int, 'invalid',
                          'hello')

    def test_float(self):
        for (name, expected) in self._get_answers('FLOAT'):
            self.assertEqual(self.settings.get_float(name), expected, name)
        # Missing keys: 0.0 by default, otherwise the converted default.
        self.assertEqual(self.settings.get_float('invalid'), 0.0)
        self.assertEqual(self.settings.get_float('invalid', 12.3), 12.3)
        self.assertRaises(ValueError, self.settings.get_float, 'invalid',
                          'hello')

    def test_list(self):
        for (name, expected) in self._get_answers('LIST'):
            self.assertEqual(self.settings.get_list(name), expected, name)
        self.assertEqual(self.settings.get_list('invalid'), [])

        # A supplied default must be handed back as the very same object,
        # even when it is an empty list.
        filled_default = [1, 2, 3]
        self.assertIs(self.settings.get_list('invalid', filled_default),
                      filled_default)
        empty_default = []
        self.assertIs(self.settings.get_list('invalid', empty_default),
                      empty_default)

    def test_keys(self):
        # keys() must be a duplicate-free list of exactly the stored names.
        names = self.settings.keys()
        self.assertIsInstance(names, list)
        self.assertEqual(len(names), len(set(names)))
        self.assertSetEqual(set(names), set(self.tests.keys()))
Exemplo n.º 20
0
 def test_copy(self):
     # The copy must be another Settings instance, not the same object,
     # with a `values` dict equal to the source's.
     source = Settings({'a': 'b'})
     clone = source.copy()
     self.assertIsInstance(clone, Settings)
     self.assertIsNot(source, clone)
     self.assertDictEqual(source.values, clone.values)
Exemplo n.º 21
0
 def test_from_module(self):
     # from_module() must expose CONCURRENT_REQUESTS from the default
     # settings module, but neither an unknown name nor '__name__'.
     module_settings = Settings.from_module(
         'crawlmi.settings.default_settings')
     self.assertIn('CONCURRENT_REQUESTS', module_settings)
     for missing in ('blabla', '__name__'):
         self.assertNotIn(missing, module_settings)
Exemplo n.º 22
0
class SettingsTest(unittest.TestCase):
    # name -> (value stored in Settings, value expected from the getter
    # matching the name's BOOL/INT/FLOAT/LIST prefix)
    tests = {
        'BOOL_TRUE_1': (1, True),
        'BOOL_TRUE_2': ('1', True),
        'BOOL_TRUE_3': (True, True),
        'BOOL_FALSE_1': (0, False),
        'BOOL_FALSE_2': ('0', False),
        'BOOL_FALSE_3': (False, False),
        'BOOL_FALSE_4': (None, False),
        'INT_1': (0, 0),
        'INT_2': ('1', 1),
        'INT_3': ('-1', -1),
        'INT_4': (4.5, 4),
        'FLOAT_1': (0.0, 0.0),
        'FLOAT_2': ('5.4', 5.4),
        'FLOAT_3': (47, 47.0),
        'FLOAT_4': ('47.3', 47.3),
        'LIST_1': (['one', 'two'], ['one', 'two']),
        'LIST_2': ('one,two', ['one', 'two']),
        'LIST_3': ('one', ['one']),
        'LIST_4': ('', ['']),
    }

    def setUp(self):
        '''Store Settings holding the first item of every `tests` entry.'''
        initial = dict(
            (key, entry[0]) for (key, entry) in self.tests.iteritems())
        self.settings = Settings(initial)

    def test_from_module(self):
        # from_module() must expose CONCURRENT_REQUESTS from the default
        # settings module, but neither an unknown name nor '__name__'.
        module_settings = Settings.from_module(
            'crawlmi.settings.default_settings')
        self.assertIn('CONCURRENT_REQUESTS', module_settings)
        for missing in ('blabla', '__name__'):
            self.assertNotIn(missing, module_settings)

    def test_copy(self):
        # The copy must be another Settings instance, not the same object,
        # with a `values` dict equal to the source's.
        source = Settings({'a': 'b'})
        clone = source.copy()
        self.assertIsInstance(clone, Settings)
        self.assertIsNot(source, clone)
        self.assertDictEqual(source.values, clone.values)

    def _get_answers(self, prefix):
        '''Collect (key, expected value) pairs whose key has `prefix`.'''
        answers = []
        for (key, entry) in self.tests.iteritems():
            if not key.startswith(prefix):
                continue
            answers.append((key, entry[1]))
        return answers

    def test_req_or_resp(self):
        # get() must return the request's meta values when given either the
        # request itself or a response created with that request.
        req = Request('http://github.com/', meta={'INT_1': 10, 'a': 'b'})
        get = self.settings.get
        self.assertEqual(get('INT_1', req_or_resp=req), 10)
        self.assertEqual(get('a', req_or_resp=req), 'b')
        resp = Response('', request=req)
        self.assertEqual(get('INT_1', req_or_resp=resp), 10)
        self.assertEqual(get('a', req_or_resp=resp), 'b')

    def test_bool(self):
        get_bool = self.settings.get_bool
        for (key, expected) in self._get_answers('BOOL'):
            self.assertIs(get_bool(key), expected, key)
        # Unknown key: False unless a default is given; a default that
        # cannot be converted raises ValueError.
        self.assertIs(get_bool('invalid'), False)
        self.assertIs(get_bool('invalid', True), True)
        self.assertRaises(ValueError, get_bool, 'invalid', 'hello')

    def test_int(self):
        get_int = self.settings.get_int
        for (key, expected) in self._get_answers('INT'):
            self.assertEqual(get_int(key), expected, key)
        # Unknown key: 0 unless a default is given; a default that cannot
        # be converted raises ValueError.
        self.assertEqual(get_int('invalid'), 0)
        self.assertEqual(get_int('invalid', 12), 12)
        self.assertRaises(ValueError, get_int, 'invalid', 'hello')

    def test_float(self):
        get_float = self.settings.get_float
        for (key, expected) in self._get_answers('FLOAT'):
            self.assertEqual(get_float(key), expected, key)
        # Unknown key: 0.0 unless a default is given; a default that cannot
        # be converted raises ValueError.
        self.assertEqual(get_float('invalid'), 0.0)
        self.assertEqual(get_float('invalid', 12.3), 12.3)
        self.assertRaises(ValueError, get_float, 'invalid', 'hello')

    def test_list(self):
        get_list = self.settings.get_list
        for (key, expected) in self._get_answers('LIST'):
            self.assertEqual(get_list(key), expected, key)
        self.assertEqual(get_list('invalid'), [])

        # Defaults are returned as the identical object, empty or not.
        non_empty = [1, 2, 3]
        self.assertIs(get_list('invalid', non_empty), non_empty)
        empty = []
        self.assertIs(get_list('invalid', empty), empty)

    def test_keys(self):
        # keys() must return a list without duplicates that matches the
        # names in `tests`.
        key_list = self.settings.keys()
        self.assertIsInstance(key_list, list)
        unique_keys = set(key_list)
        self.assertEqual(len(key_list), len(unique_keys))
        self.assertSetEqual(unique_keys, set(self.tests.keys()))
Exemplo n.º 23
0
 def test_load_base_spider(self):
     # Loading the spider0 module must leave the manager with no spiders.
     modules = ['crawlmi.tests.test_spider_manager.test_spiders.spider0']
     self.manager = SpiderManager(Settings({'SPIDER_MODULES': modules}))
     self.assertEqual(len(self.manager._spiders), 0)