예제 #1
0
    def test_normalized_generated_params_range(self):
        # Legacy template with a single default path, a range param "p" and
        # an options param "q". The expected normalized form below renders
        # the [20, 31] range as the '20-30' fragment.
        path_specs = [{"type": "default", "values": ["index.php"]}]
        param_specs = [
            {"name": "p", "type": "range", "values": [20, 31]},
            {"name": "q", "type": "options", "values": ['comic']},
        ]
        legacy_urls = [{
            "template": "http://www.smbc-comics.com/{}",
            "paths": path_specs,
            "params": param_specs,
            "params_template": {},
        }]
        fragment_pairs = (
            ('fixed', 'http://www.smbc-comics.com/'),
            ('fixed', 'index.php'),
            ('fixed', '?p='),
            ('range', '20-30'),
            ('fixed', '&q='),
            ('list', 'comic'),
        )
        expected = [{
            'url': 'http://www.smbc-comics.com/index.php?p=20-30&q=[...]',
            'type': 'generated',
            'fragments': [
                {'type': kind, 'value': text} for kind, text in fragment_pairs
            ],
        }]
        start_urls = StartUrlCollection(legacy_urls, self.generators)

        # generator_set() of the legacy spec and of the normalized entry
        # must compare equal.
        legacy_generated = generator_set(UrlGenerator, legacy_urls[0])
        fragment_generated = generator_set(FragmentGenerator, expected[0])
        self.assertEqual(legacy_generated, fragment_generated)
        self.assertEqual(list(start_urls.normalize()), expected)
예제 #2
0
    def test_normalize_generated_dates(self):
        # Legacy template with three path slots: one default value followed
        # by two date formats. The expected URL shows each date slot as
        # '[...]'.
        path_specs = [
            {"type": "default", "values": ["en"]},
            {"type": "date", "values": ["%Y"]},
            {"type": "date", "values": ["%m"]},
        ]
        legacy_urls = [{
            "template": "http://www.commitstrip.com/{}/{}/{}",
            "paths": path_specs,
            "params": [],
            "params_template": {},
        }]
        fragment_pairs = (
            ('fixed', 'http://www.commitstrip.com/'),
            ('fixed', 'en'),
            ('fixed', '/'),
            ('date', '%Y'),
            ('fixed', '/'),
            ('date', '%m'),
        )
        expected = [{
            'url': 'http://www.commitstrip.com/en/[...]/[...]',
            'type': 'generated',
            'fragments': [
                {'type': kind, 'value': text} for kind, text in fragment_pairs
            ],
        }]
        start_urls = StartUrlCollection(legacy_urls, self.generators)

        # generator_set() of the legacy spec and of the normalized entry
        # must compare equal.
        legacy_generated = generator_set(UrlGenerator, legacy_urls[0])
        fragment_generated = generator_set(FragmentGenerator, expected[0])
        self.assertEqual(legacy_generated, fragment_generated)
        self.assertEqual(list(start_urls.normalize()), expected)
예제 #3
0
    def test_normalized_generated_range(self):
        # Legacy template with a default path, an options path and a range
        # path. The expected normalized form below renders the [10, 20]
        # range as the '10-19' fragment.
        path_specs = [
            {"type": "default", "values": ["cars-for-sale"]},
            {"type": "options", "values": ["i"]},
            {"type": "range", "values": [10, 20]},
        ]
        legacy_urls = [{
            "template": "https://www.donedeal.ie/{}/{}/{}",
            "paths": path_specs,
            "params": [],
            "params_template": {},
        }]
        fragment_pairs = (
            ('fixed', 'https://www.donedeal.ie/'),
            ('fixed', 'cars-for-sale'),
            ('fixed', '/'),
            ('list', 'i'),
            ('fixed', '/'),
            ('range', '10-19'),
        )
        expected = [{
            'url': 'https://www.donedeal.ie/cars-for-sale/[...]/10-19',
            'type': 'generated',
            'fragments': [
                {'type': kind, 'value': text} for kind, text in fragment_pairs
            ],
        }]
        start_urls = StartUrlCollection(legacy_urls, self.generators)

        # generator_set() of the legacy spec and of the normalized entry
        # must compare equal.
        legacy_generated = generator_set(UrlGenerator, legacy_urls[0])
        fragment_generated = generator_set(FragmentGenerator, expected[0])
        self.assertEqual(legacy_generated, fragment_generated)
        self.assertEqual(list(start_urls.normalize()), expected)
예제 #4
0
    def test_normalized_generated_range(self):
        # Legacy template with a default path, an options path and a range
        # path. The expected normalized form below renders the [10, 20]
        # range as the '10-19' fragment.
        path_specs = [
            {"type": "default", "values": ["cars-for-sale"]},
            {"type": "options", "values": ["i"]},
            {"type": "range", "values": [10, 20]},
        ]
        legacy_urls = [{
            "template": "https://www.donedeal.ie/{}/{}/{}",
            "paths": path_specs,
            "params": [],
            "params_template": {},
        }]
        fragment_pairs = (
            ('fixed', 'https://www.donedeal.ie/'),
            ('fixed', 'cars-for-sale'),
            ('fixed', '/'),
            ('list', 'i'),
            ('fixed', '/'),
            ('range', '10-19'),
        )
        expected = [{
            'url': 'https://www.donedeal.ie/cars-for-sale/[...]/10-19',
            'type': 'generated',
            'fragments': [
                {'type': kind, 'value': text} for kind, text in fragment_pairs
            ],
        }]
        start_urls = StartUrlCollection(legacy_urls, self.generators)

        # generator_set() of the legacy spec and of the normalized entry
        # must compare equal.
        legacy_generated = generator_set(UrlGenerator, legacy_urls[0])
        fragment_generated = generator_set(FragmentGenerator, expected[0])
        self.assertEqual(legacy_generated, fragment_generated)
        self.assertEqual(list(start_urls.normalize()), expected)
예제 #5
0
    def test_normalize_generated_default(self):
        # Legacy template whose only path slot is a "default" with three
        # candidate values. The expected normalized entry contains just the
        # first of them ('scrapinghub') as a fixed fragment.
        default_path = {
            "type": "default",
            "values": ["scrapinghub", "scrapy", "scrapy-plugins"],
        }
        legacy_urls = [{
            "template": "https://github.com/{}/fixed",
            "paths": [default_path],
            "params": [],
            "params_template": {},
        }]
        fixed_values = (
            'https://github.com/',
            'scrapinghub',
            '/fixed',
        )
        expected = [{
            'url': 'https://github.com/scrapinghub/fixed',
            'type': 'generated',
            'fragments': [
                {'type': 'fixed', 'value': text} for text in fixed_values
            ],
        }]
        start_urls = StartUrlCollection(legacy_urls, self.generators)

        # generator_set() of the legacy spec and of the normalized entry
        # must compare equal.
        legacy_generated = generator_set(UrlGenerator, legacy_urls[0])
        fragment_generated = generator_set(FragmentGenerator, expected[0])
        self.assertEqual(legacy_generated, fragment_generated)
        self.assertEqual(list(start_urls.normalize()), expected)
예제 #6
0
    def test_normalized_generated_params_range(self):
        # Legacy template with a single default path, a range param "p" and
        # an options param "q". The expected normalized form below renders
        # the [20, 31] range as the '20-30' fragment.
        path_specs = [{"type": "default", "values": ["index.php"]}]
        param_specs = [
            {"name": "p", "type": "range", "values": [20, 31]},
            {"name": "q", "type": "options", "values": ['comic']},
        ]
        legacy_urls = [{
            "template": "http://www.smbc-comics.com/{}",
            "paths": path_specs,
            "params": param_specs,
            "params_template": {},
        }]
        fragment_pairs = (
            ('fixed', 'http://www.smbc-comics.com/'),
            ('fixed', 'index.php'),
            ('fixed', '?p='),
            ('range', '20-30'),
            ('fixed', '&q='),
            ('list', 'comic'),
        )
        expected = [{
            'url': 'http://www.smbc-comics.com/index.php?p=20-30&q=[...]',
            'type': 'generated',
            'fragments': [
                {'type': kind, 'value': text} for kind, text in fragment_pairs
            ],
        }]
        start_urls = StartUrlCollection(legacy_urls, self.generators)

        # generator_set() of the legacy spec and of the normalized entry
        # must compare equal.
        legacy_generated = generator_set(UrlGenerator, legacy_urls[0])
        fragment_generated = generator_set(FragmentGenerator, expected[0])
        self.assertEqual(legacy_generated, fragment_generated)
        self.assertEqual(list(start_urls.normalize()), expected)
예제 #7
0
    def test_normalize_generated_dates(self):
        # Legacy template with three path slots: one default value followed
        # by two date formats. The expected URL shows each date slot as
        # '[...]'.
        path_specs = [
            {"type": "default", "values": ["en"]},
            {"type": "date", "values": ["%Y"]},
            {"type": "date", "values": ["%m"]},
        ]
        legacy_urls = [{
            "template": "http://www.commitstrip.com/{}/{}/{}",
            "paths": path_specs,
            "params": [],
            "params_template": {},
        }]
        fragment_pairs = (
            ('fixed', 'http://www.commitstrip.com/'),
            ('fixed', 'en'),
            ('fixed', '/'),
            ('date', '%Y'),
            ('fixed', '/'),
            ('date', '%m'),
        )
        expected = [{
            'url': 'http://www.commitstrip.com/en/[...]/[...]',
            'type': 'generated',
            'fragments': [
                {'type': kind, 'value': text} for kind, text in fragment_pairs
            ],
        }]
        start_urls = StartUrlCollection(legacy_urls, self.generators)

        # generator_set() of the legacy spec and of the normalized entry
        # must compare equal.
        legacy_generated = generator_set(UrlGenerator, legacy_urls[0])
        fragment_generated = generator_set(FragmentGenerator, expected[0])
        self.assertEqual(legacy_generated, fragment_generated)
        self.assertEqual(list(start_urls.normalize()), expected)
예제 #8
0
    def test_normalize_generated_default(self):
        # Legacy template whose only path slot is a "default" with three
        # candidate values. The expected normalized entry contains just the
        # first of them ('scrapinghub') as a fixed fragment.
        default_path = {
            "type": "default",
            "values": ["scrapinghub", "scrapy", "scrapy-plugins"],
        }
        legacy_urls = [{
            "template": "https://github.com/{}/fixed",
            "paths": [default_path],
            "params": [],
            "params_template": {},
        }]
        fixed_values = (
            'https://github.com/',
            'scrapinghub',
            '/fixed',
        )
        expected = [{
            'url': 'https://github.com/scrapinghub/fixed',
            'type': 'generated',
            'fragments': [
                {'type': 'fixed', 'value': text} for text in fixed_values
            ],
        }]
        start_urls = StartUrlCollection(legacy_urls, self.generators)

        # generator_set() of the legacy spec and of the normalized entry
        # must compare equal.
        legacy_generated = generator_set(UrlGenerator, legacy_urls[0])
        fragment_generated = generator_set(FragmentGenerator, expected[0])
        self.assertEqual(legacy_generated, fragment_generated)
        self.assertEqual(list(start_urls.normalize()), expected)
예제 #9
0
    def test_normalize_start_url(self):
        # An entry that is already a {'url', 'type'} dict is expected to
        # come out of normalize() equal to the input.
        fixed_entry = {
            'url': 'https://github.com/scrapinghub',
            'type': 'fixed',
        }
        entries = [fixed_entry]
        url_collection = StartUrlCollection(entries, self.generators)

        normalized_entries = list(url_collection.normalize())
        self.assertEqual(normalized_entries, entries)
예제 #10
0
    def test_normalize_start_url(self):
        # An entry that is already a {'url', 'type'} dict is expected to
        # come out of normalize() equal to the input.
        fixed_entry = {
            'url': 'https://github.com/scrapinghub',
            'type': 'fixed',
        }
        entries = [fixed_entry]
        url_collection = StartUrlCollection(entries, self.generators)

        normalized_entries = list(url_collection.normalize())
        self.assertEqual(normalized_entries, entries)
예제 #11
0
    def test_normalized_generated_template_params(self):
        # Legacy spec with no path slots, two options params and a
        # params_template given as (name, value) pairs. In the expected
        # output the template's "hl" stays fixed, while "q" appears as a
        # list fragment built from the params values rather than the
        # template's "python unittest".
        option_params = [
            {"name": name, "type": "options", "values": choices}
            for name, choices in (
                ("q", ["nosetests", "tox"]),
                ("location", ["dublin", "cork"]),
            )
        ]
        legacy_urls = [{
            "template": "https://encrypted.google.com/search",
            "paths": [],
            "params": option_params,
            "params_template": [("hl", "en"), ("q", "python unittest")],
        }]
        fragment_pairs = (
            ('fixed', 'https://encrypted.google.com/search'),
            ('fixed', '?hl='),
            ('fixed', 'en'),
            ('fixed', '&q='),
            ('list', 'nosetests tox'),
            ('fixed', '&location='),
            ('list', 'dublin cork'),
        )
        expected_url = (
            'https://encrypted.google.com/search?hl=en&q=[...]&location=[...]'
        )
        expected = [{
            'url': expected_url,
            'type': 'generated',
            'fragments': [
                {'type': kind, 'value': text} for kind, text in fragment_pairs
            ],
        }]
        start_urls = StartUrlCollection(legacy_urls, self.generators)

        # generator_set() of the legacy spec and of the normalized entry
        # must compare equal.
        legacy_generated = generator_set(UrlGenerator, legacy_urls[0])
        fragment_generated = generator_set(FragmentGenerator, expected[0])
        self.assertEqual(legacy_generated, fragment_generated)
        self.assertEqual(list(start_urls.normalize()), expected)
예제 #12
0
    def test_normalize_string_url(self):
        # A bare URL string is expected to normalize to a dict entry of
        # type 'url' carrying the same URL.
        raw_urls = ['https://github.com/scrapinghub']
        expected = [
            {'url': 'https://github.com/scrapinghub', 'type': 'url'},
        ]
        url_collection = StartUrlCollection(raw_urls, self.generators)

        # Sanity check on the fixture itself before exercising normalize().
        self.assertEqual(raw_urls[0], expected[0]['url'])
        normalized_entries = list(url_collection.normalize())
        self.assertEqual(normalized_entries, expected)
예제 #13
0
    def test_normalize_string_url(self):
        # A bare URL string is expected to normalize to a dict entry of
        # type 'url' carrying the same URL.
        raw_urls = ['https://github.com/scrapinghub']
        expected = [
            {'url': 'https://github.com/scrapinghub', 'type': 'url'},
        ]
        url_collection = StartUrlCollection(raw_urls, self.generators)

        # Sanity check on the fixture itself before exercising normalize().
        self.assertEqual(raw_urls[0], expected[0]['url'])
        normalized_entries = list(url_collection.normalize())
        self.assertEqual(normalized_entries, expected)
예제 #14
0
    def test_normalized_generated_template_params(self):
        # Legacy spec with no path slots, two options params and a
        # params_template given as (name, value) pairs. In the expected
        # output the template's "hl" stays fixed, while "q" appears as a
        # list fragment built from the params values rather than the
        # template's "python unittest".
        option_params = [
            {"name": name, "type": "options", "values": choices}
            for name, choices in (
                ("q", ["nosetests", "tox"]),
                ("location", ["dublin", "cork"]),
            )
        ]
        legacy_urls = [{
            "template": "https://encrypted.google.com/search",
            "paths": [],
            "params": option_params,
            "params_template": [("hl", "en"), ("q", "python unittest")],
        }]
        fragment_pairs = (
            ('fixed', 'https://encrypted.google.com/search'),
            ('fixed', '?hl='),
            ('fixed', 'en'),
            ('fixed', '&q='),
            ('list', 'nosetests tox'),
            ('fixed', '&location='),
            ('list', 'dublin cork'),
        )
        expected_url = (
            'https://encrypted.google.com/search?hl=en&q=[...]&location=[...]'
        )
        expected = [{
            'url': expected_url,
            'type': 'generated',
            'fragments': [
                {'type': kind, 'value': text} for kind, text in fragment_pairs
            ],
        }]
        start_urls = StartUrlCollection(legacy_urls, self.generators)

        # generator_set() of the legacy spec and of the normalized entry
        # must compare equal.
        legacy_generated = generator_set(UrlGenerator, legacy_urls[0])
        fragment_generated = generator_set(FragmentGenerator, expected[0])
        self.assertEqual(legacy_generated, fragment_generated)
        self.assertEqual(list(start_urls.normalize()), expected)
예제 #15
0
    def test_normalized_mixed(self):
        # Feeds one collection three input shapes at once: a legacy
        # template dict, a bare URL string, and an entry that is already in
        # generated/fragments form.

        def github_generated_entry():
            # Builds a new dict on every call so the input entry and the
            # expected entry stay separate objects.
            return {
                'url': 'https://github.com/[...]',
                'type': 'generated',
                'fragments': [
                    {'type': 'fixed', 'value': 'https://github.com/'},
                    {
                        'type': 'list',
                        'value': 'scrapinghub scrapy scrapy-plugins',
                    },
                ],
            }

        legacy_template = {
            "template": "http://www.smbc-comics.com/{}",
            "paths": [{"type": "default", "values": ["index.php"]}],
            "params": [
                {"name": "p", "type": "range", "values": [20, 31]},
                {"name": "q", "type": "options", "values": ['comic']},
            ],
            "params_template": {},
        }
        mixed_inputs = [
            legacy_template,
            'http://github.com/scrapinghub.com',
            github_generated_entry(),
        ]

        smbc_fragment_pairs = (
            ('fixed', 'http://www.smbc-comics.com/'),
            ('fixed', 'index.php'),
            ('fixed', '?p='),
            ('range', '20-30'),
            ('fixed', '&q='),
            ('list', 'comic'),
        )
        smbc_expected = {
            'url': 'http://www.smbc-comics.com/index.php?p=20-30&q=[...]',
            'type': 'generated',
            'fragments': [
                {'type': kind, 'value': text}
                for kind, text in smbc_fragment_pairs
            ],
        }
        expected = [
            smbc_expected,
            {'url': 'http://github.com/scrapinghub.com', 'type': 'url'},
            github_generated_entry(),
        ]
        start_urls = StartUrlCollection(mixed_inputs, self.generators)

        self.assertEqual(list(start_urls.normalize()), expected)
예제 #16
0
    def test_normalized_mixed(self):
        # Feeds one collection three input shapes at once: a legacy
        # template dict, a bare URL string, and an entry that is already in
        # generated/fragments form.

        def github_generated_entry():
            # Builds a new dict on every call so the input entry and the
            # expected entry stay separate objects.
            return {
                'url': 'https://github.com/[...]',
                'type': 'generated',
                'fragments': [
                    {'type': 'fixed', 'value': 'https://github.com/'},
                    {
                        'type': 'list',
                        'value': 'scrapinghub scrapy scrapy-plugins',
                    },
                ],
            }

        legacy_template = {
            "template": "http://www.smbc-comics.com/{}",
            "paths": [{"type": "default", "values": ["index.php"]}],
            "params": [
                {"name": "p", "type": "range", "values": [20, 31]},
                {"name": "q", "type": "options", "values": ['comic']},
            ],
            "params_template": {},
        }
        mixed_inputs = [
            legacy_template,
            'http://github.com/scrapinghub.com',
            github_generated_entry(),
        ]

        smbc_fragment_pairs = (
            ('fixed', 'http://www.smbc-comics.com/'),
            ('fixed', 'index.php'),
            ('fixed', '?p='),
            ('range', '20-30'),
            ('fixed', '&q='),
            ('list', 'comic'),
        )
        smbc_expected = {
            'url': 'http://www.smbc-comics.com/index.php?p=20-30&q=[...]',
            'type': 'generated',
            'fragments': [
                {'type': kind, 'value': text}
                for kind, text in smbc_fragment_pairs
            ],
        }
        expected = [
            smbc_expected,
            {'url': 'http://github.com/scrapinghub.com', 'type': 'url'},
            github_generated_entry(),
        ]
        start_urls = StartUrlCollection(mixed_inputs, self.generators)

        self.assertEqual(list(start_urls.normalize()), expected)