Пример #1
0
 def test_generate_start_urls_from_params(self):
     # Expect one URL per (query, location) pairing, with the query varying
     # slowest, in the order the generator is asserted to emit them.
     generator = UrlGenerator()
     params_spec = self.specs['params']
     template = "https://encrypted.google.com/search?hl=en&q=%s&location=%s"
     queries = ("nosetests", "tox")
     locations = ("dublin", "cork")
     expected = [
         template % (query, location)
         for query in queries
         for location in locations
     ]
     self.assertEqual(expected, list(generator(params_spec[0])))
Пример #2
0
    def __init__(self, name, spec, item_schemas, all_extractors,
                 settings=None, **kw):
        """Initialise the spider from ``spec``.

        ``spec`` is deep-copied before any helper receives it, so the helper
        calls below operate on a private copy rather than the caller's object.

        :param name: forwarded to the parent initialiser.
        :param spec: spider specification; only a deep copy of it is used.
        :param item_schemas: forwarded to ``_configure_plugins``.
        :param all_extractors: forwarded to ``_configure_plugins``.
        :param settings: forwarded to ``UrlGenerator``, ``_configure_js`` and
            ``_configure_plugins``.
        :param kw: extra keyword arguments; forwarded to ``UrlGenerator``,
            ``GenericForm``, the parent initialiser and
            ``_add_spider_args_to_spec``.
        """
        # Generators keyed by start-URL type. These are created before the
        # parent initialiser runs, matching the original statement order.
        self.start_url_generators = {
            'start_urls': IdentityGenerator(),
            'generated_urls': UrlGenerator(settings, kw),
            'url': IdentityGenerator(),
            'feed': FeedGenerator(self.parse),
            'generated': FragmentGenerator(),
        }
        self.generic_form = GenericForm(**kw)
        super(IblSpider, self).__init__(name, **kw)

        # Work on a private copy of the spec from here on.
        spider_spec = deepcopy(spec)
        self._add_spider_args_to_spec(spider_spec, kw)
        self._configure_js(spider_spec, settings)
        self.plugins = self._configure_plugins(
            settings, spider_spec, item_schemas, all_extractors)

        # Two independent lists; they start empty before the request helpers
        # below are invoked.
        self.login_requests = []
        self.form_requests = []
        self._start_urls = self._create_start_urls(spider_spec)
        self._start_requests = self._create_start_requests(spider_spec)
        self._create_init_requests(spider_spec)
        self._add_allowed_domains(spider_spec)
        self.page_actions = spider_spec.get('page_actions', [])
Пример #3
0
 def setUp(self):
     # Map each start-URL type to the generator the tests use for it.
     # 'start_urls' and 'url' each get their own IdentityGenerator instance.
     generators = {}
     generators['start_urls'] = IdentityGenerator()
     generators['generated_urls'] = UrlGenerator()
     generators['url'] = IdentityGenerator()
     generators['generated'] = FragmentGenerator()
     self.generators = generators
Пример #4
0
 def test_generate_start_urls_from_date(self):
     # The expected URL embeds the current year and the zero-padded month,
     # both taken from a single datetime.now() reading.
     today = datetime.now()
     generator = UrlGenerator()
     date_spec = self.specs['dates']
     year, month = today.year, today.month
     expected = "http://www.commitstrip.com/en/{}/{:02}".format(year, month)
     self.assertEqual([expected], list(generator(date_spec[0])))
Пример #5
0
 def test_generate_start_urls_from_spider_arg(self):
     # The values are supplied to the generator via spider arguments only.
     spider_args = {
         'categories': ['cars-for-sale', 'houses-for-sale'],
         'sections': ['pets-for-sale', 'kitchens-for-sale'],
     }
     generator = UrlGenerator(spider_args=spider_args)
     arg_spec = self.specs['spider_args']
     expected = self.donedeal_start_urls
     self.assertEqual(expected, list(generator(arg_spec[0])))
Пример #6
0
 def test_generate_start_urls_from_params_range(self):
     # Expected pages are the values of range(20, 30, 5), in that order.
     generator = UrlGenerator()
     range_spec = self.specs['params_range']
     template = "http://www.smbc-comics.com/index.php?p=%s&q=comic"
     pages = range(20, 30, 5)
     expected = [template % page for page in pages]
     self.assertEqual(expected, list(generator(range_spec[0])))
Пример #7
0
 def test_generate_start_urls_from_range(self):
     # Expected ids count down from 100000010 to 100000001 inclusive.
     generator = UrlGenerator()
     range_spec = self.specs['range']
     template = "https://www.donedeal.ie/cars-for-sale/i/%s"
     descending_ids = range(100000010, 100000000, -1)
     expected = [template % item_id for item_id in descending_ids]
     self.assertEqual(expected, list(generator(range_spec[0])))
Пример #8
0
 def test_generate_start_urls_from_setting(self):
     # The values come from settings: 'categories' is given as a single
     # comma-separated string, 'sections' as a list.
     setting_values = {
         'categories': 'cars-for-sale,houses-for-sale',
         'sections': ['pets-for-sale', 'kitchens-for-sale'],
     }
     generator = UrlGenerator(Settings(values=setting_values))
     settings_spec = self.specs['settings']
     expected = self.donedeal_start_urls
     self.assertEqual(expected, list(generator(settings_spec[0])))
Пример #9
0
 def test_misconfigured_start_urls_spec_type(self):
     # The path fragment declares the type "defaults"; the generator is
     # expected to produce no URLs for this spec.
     generator = UrlGenerator()
     path_fragment = {"type": "defaults", "values": ["index.php"]}
     spec_entry = {
         "template": "http://www.smbc-comics.com/{}",
         "paths": [path_fragment],
         "params": [],
         "params_template": {},
     }
     self.assertEqual([], list(generator(spec_entry)))
Пример #10
0
 def test_missing_arg_for_start_urls_spec(self):
     # 'home' is defined only in the settings and 'index' only in the second
     # constructor argument (presumably the spider arguments). Each spec
     # below asks for a key under the other type and must yield no URLs.
     settings = Settings(values={'home': 'home.php'})
     spider_args = {'index': 'index.php'}
     generator = UrlGenerator(settings, spider_args)
     lookups = (("spider_args", "home"), ("settings", "index"))
     for path_type, key in lookups:
         spec_entry = {
             "template": "http://www.smbc-comics.com/{}",
             "paths": [{"type": path_type, "values": [key]}],
             "params": [],
             "params_template": {},
         }
         self.assertEqual([], list(generator(spec_entry)))
Пример #11
0
    def __init__(self,
                 name,
                 spec,
                 item_schemas,
                 all_extractors,
                 settings=None,
                 **kw):
        """Initialise the spider from ``spec``.

        ``spec`` is deep-copied before any helper receives it, so the helper
        calls below operate on a private copy rather than the caller's object.

        :param name: forwarded to the parent initialiser.
        :param spec: spider specification; only a deep copy of it is used.
        :param item_schemas: forwarded to ``_configure_plugins``.
        :param all_extractors: forwarded to ``_configure_plugins``.
        :param settings: forwarded to ``UrlGenerator``, ``_configure_js`` and
            ``_configure_plugins``.
        :param kw: extra keyword arguments; forwarded to ``UrlGenerator``,
            ``GenericForm``, the parent initialiser and
            ``_add_spider_args_to_spec``.
        """
        # Generators keyed by start-URL type. These are created before the
        # parent initialiser runs, matching the original statement order.
        self.start_url_generators = {
            'start_urls': IdentityGenerator(),
            'generated_urls': UrlGenerator(settings, kw),
            'url': IdentityGenerator(),
            'feed': FeedGenerator(self.parse),
            'generated': FragmentGenerator(),
        }
        self.generic_form = GenericForm(**kw)
        super(IblSpider, self).__init__(name, **kw)

        # Work on a private copy of the spec from here on.
        spider_spec = deepcopy(spec)
        self._add_spider_args_to_spec(spider_spec, kw)

        # Actions declared in the spec. NOTE: the logic that would force
        # ``js_enabled`` on and append each action's URL to
        # ``js_enable_patterns`` when actions are configured is disabled, so
        # the presence of actions does not alter the JS configuration here.
        self.actions = spider_spec.get('actions', [])

        self._configure_js(spider_spec, settings)
        self.plugins = self._configure_plugins(
            settings, spider_spec, item_schemas, all_extractors)

        # Two independent lists; they start empty before the request helpers
        # below are invoked.
        self.login_requests = []
        self.form_requests = []
        self._start_urls = self._create_start_urls(spider_spec)
        self._start_requests = self._create_start_requests(spider_spec)
        self._create_init_requests(spider_spec)
        self._add_allowed_domains(spider_spec)
        self.page_actions = spider_spec.get('page_actions', [])
Пример #12
0
 def test_generate_start_urls_from_options(self):
     # The 'options' spec must expand to the fixture's GitHub start URLs.
     generator = UrlGenerator()
     options_spec = self.specs['options']
     expected = self.github_start_urls
     self.assertEqual(expected, list(generator(options_spec[0])))
Пример #13
0
 def test_generate_start_urls_from_defaults(self):
     # The 'defaults' spec is expected to yield exactly one URL.
     generator = UrlGenerator()
     defaults_spec = self.specs['defaults']
     expected = ["https://github.com/scrapinghub"]
     self.assertEqual(expected, list(generator(defaults_spec[0])))