Example #1
    def __init__(self, crawler):
        settings = crawler.settings
        spider = crawler.spider

        if not any(self.__class__.__name__ in s
                   for s in settings.getwithbase('SPIDER_MIDDLEWARES').keys()):
            raise ValueError('%s must be in SPIDER_MIDDLEWARES' %
                             (self.__class__.__name__, ))
        if not settings.getbool('AUTOUNIT_ENABLED'):
            raise NotConfigured('scrapy-autounit is not enabled')
        if settings.getint('CONCURRENT_REQUESTS') > 1:
            logger.warning(
                'Recording with concurrency > 1! '
                'Data races in shared object modification may create broken '
                'tests.')

        self.max_fixtures = settings.getint(
            'AUTOUNIT_MAX_FIXTURES_PER_CALLBACK', default=10)
        self.max_fixtures = max(self.max_fixtures, 10)

        self.base_path = settings.get('AUTOUNIT_BASE_PATH',
                                      default=os.path.join(
                                          get_project_dir(), 'autounit'))
        create_dir(self.base_path, exist_ok=True)
        clear_fixtures(self.base_path, sanitize_module_name(spider.name))

        self.fixture_counters = {}
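
The middleware in Example #1 refuses to load unless it is both enabled and registered, so a recording project needs settings along these lines (a minimal sketch based on the checks above; the middleware path assumes the scrapy-autounit package layout):

# settings.py -- a minimal sketch matching the checks in Example #1
AUTOUNIT_ENABLED = True

SPIDER_MIDDLEWARES = {
    'scrapy_autounit.AutounitMiddleware': 950,
}

# Optional: cap the recorded fixtures per callback; the middleware above
# raises any value below 10 back up to 10.
AUTOUNIT_MAX_FIXTURES_PER_CALLBACK = 10
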
Example #2
    def process_spider_input(self, response, spider):
        if self.init == 0:
            if '_parse' in response.meta:
                spider_dir = os.path.join(self.base_path, 'tests',
                                          sanitize_module_name(spider.name))
                if os.path.exists(spider_dir):
                    self.fixture_counters = get_fixture_counts(
                        spider_dir, spider,
                        spider.settings.get('TESTMASTER_EXTRA_PATH'))
            self.init += 1
        the_request = response.request
        # The parse command interferes with middleware order because it
        # effectively uses two callbacks: a preliminary internal one and the
        # real one. Here we grab the real callback from the meta.
        if '_parse' in response.meta and '_update' not in response.meta:
            the_request = response.request.copy()
            the_request.callback = response.meta['_callback']
            temp_meta = response.meta.copy()
            del temp_meta['_callback']
            the_request = the_request.replace(meta=temp_meta)

        _request = request_to_dict(the_request, spider=spider)
        cb_name = _request['callback'] or 'parse'
        test_dir = os.path.join(self.base_path, 'tests',
                                sanitize_module_name(spider.name), cb_name)
        cb_settings = get_cb_settings(test_dir)
        filter_args = {'crawler', 'settings', 'start_urls'}
        if isinstance(spider, CrawlSpider):
            filter_args |= {'rules', '_rules'}
        response.meta['_testmaster'] = pickle.dumps({
            'request': parse_request(the_request, spider, cb_settings),
            'response': response_to_dict(response),
            'spider_args': {k: v for k, v in spider.__dict__.items()
                            if k not in filter_args},
            'middlewares': get_middlewares(spider),
        })

        return None
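
Every example funnels the spider name through sanitize_module_name before using it as a directory or module name. The helper itself never appears in these snippets; a minimal sketch of what it typically does (mirroring the helper shipped with Scrapy's genspider command) is:

import string

def sanitize_module_name(module_name):
    # Replace dashes and dots with underscores and prefix a letter if the
    # name doesn't start with one, yielding a valid Python module name.
    module_name = module_name.replace('-', '_').replace('.', '_')
    if module_name[0] not in string.ascii_letters:
        module_name = 'a' + module_name
    return module_name
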
Example #3
    def process_spider_output(self, response, result, spider):
        settings = spider.settings
        processed_result = []
        out = []
        for elem in result:
            out.append(elem)
            is_request = isinstance(elem, Request)
            if is_request:
                _data = parse_request(elem, spider)
            else:
                _data = parse_object(copy.deepcopy(elem), spider)
            processed_result.append({
                'type': 'request' if is_request else 'item',
                'data': _data
            })

        input_data = pickle.loads(response.meta.pop('_autounit'))

        request = input_data['request']
        callback_name = request['callback']
        spider_attr_out = {
            k: v
            for k, v in spider.__dict__.items()
            if k not in ('crawler', 'settings', 'start_urls')
        }

        data = {
            'spider_name': spider.name,
            'request': request,
            'response': input_data['response'],
            'spider_args_out': spider_attr_out,
            'result': processed_result,
            'spider_args_in': input_data['spider_args'],
            'settings': _copy_settings(settings),
            'middlewares': input_data['middlewares'],
            'python_version': 2 if six.PY2 else 3,
        }

        callback_counter = self.fixture_counters.setdefault(callback_name, 0)
        self.fixture_counters[callback_name] += 1

        test_dir, test_name = get_or_create_test_dir(
            self.base_path,
            sanitize_module_name(spider.name),
            callback_name,
            settings.get('AUTOUNIT_EXTRA_PATH'),
        )

        if callback_counter < self.max_fixtures:
            add_sample(callback_counter + 1, test_dir, test_name, data)
        else:
            r = random.randint(0, callback_counter)
            if r < self.max_fixtures:
                add_sample(r + 1, test_dir, test_name, data)

        return out
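
The overwrite branch at the end of Example #3 is reservoir sampling (Algorithm R): once max_fixtures samples exist, each later result replaces a random earlier fixture with probability max_fixtures / (callback_counter + 1), which keeps the stored fixtures a uniform sample of everything the callback produced. A standalone sketch of the same technique:

import random

def reservoir_update(reservoir, item, seen, k):
    # Maintain a uniform random sample of size k from a stream;
    # `seen` is the number of items observed before `item`.
    if seen < k:
        reservoir.append(item)   # still filling the reservoir
        return seen
    r = random.randint(0, seen)  # inclusive on both ends, as in Example #3
    if r < k:
        reservoir[r] = item      # overwrite a random earlier sample
        return r
    return None                  # item discarded
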
Example #4
    def __init__(self, spider):
        self.spider = spider
        self.settings = spider.settings
        self.spider_name = sanitize_module_name(spider.name)
        self.spider_init_attrs = self.spider_attrs()

        self.fixture_counters = {}
        self._set_max_fixtures()

        self.base_path = get_base_path(self.settings)
        self._create_dir(self.base_path, exist_ok=True)
        self._clear_fixtures()
Example #5
    def process_spider_output(self, response, result, spider):
        settings = spider.settings

        processed_result, out = parse_callback_result(result, spider)

        input_data = pickle.loads(response.meta.pop('_autounit'))

        request = input_data['request']
        callback_name = request['callback']
        d = (spider.__getstate__() if hasattr(spider, '__getstate__')
             else spider.__dict__)
        spider_attr_out = {
            k: v
            for k, v in d.items() if k not in get_filter_attrs(spider)
        }

        data = {
            'spider_name': spider.name,
            'request': request,
            'response': input_data['response'],
            'spider_args_out': spider_attr_out,
            'result': processed_result,
            'spider_args_in': input_data['spider_args'],
            'settings': _copy_settings(settings),
            'middlewares': input_data['middlewares'],
            'python_version': 2 if six.PY2 else 3,
        }

        callback_counter = self.fixture_counters.setdefault(callback_name, 0)
        self.fixture_counters[callback_name] += 1

        test_dir, test_name = get_or_create_test_dir(
            self.base_path,
            sanitize_module_name(spider.name),
            callback_name,
            settings.get('AUTOUNIT_EXTRA_PATH'),
        )

        index = 0
        if callback_counter < self.max_fixtures:
            index = callback_counter + 1
            add_sample(index, test_dir, test_name, data)
        else:
            r = random.randint(0, callback_counter)
            if r < self.max_fixtures:
                index = r + 1
                add_sample(index, test_dir, test_name, data)

        if index == 1:
            write_test(test_dir, test_name, request['url'])

        return out
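
Example #5 prefers spider.__getstate__() over spider.__dict__ when snapshotting attributes, which lets a spider decide what ends up in its fixtures. A hypothetical spider using that hook (the sqlite connection is illustrative of unpicklable state):

import sqlite3
import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'

    def __init__(self, *args, **kwargs):
        super(ExampleSpider, self).__init__(*args, **kwargs)
        # An open connection can't be pickled and shouldn't be recorded.
        self.db = sqlite3.connect(':memory:')

    def __getstate__(self):
        # Drop state that can't (or shouldn't) go into fixtures.
        state = self.__dict__.copy()
        state.pop('db', None)
        return state
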
Example #6
def process_result_for_middleware(spider, callback, items, requests):
    processed_result = []
    for item in items:
        processed_result.append({'type': 'item', 'data': item})
    # These paths don't depend on the individual request, so compute them once.
    base_path = os.path.join(get_project_dirs()[0], 'testmaster')
    test_dir = os.path.join(base_path, 'tests',
                            sanitize_module_name(spider.name),
                            callback.__name__)
    cb_settings = None
    if os.path.exists(test_dir):
        cb_settings = get_cb_settings(test_dir)
    for req in requests:
        processed_result.append({
            'type': 'request',
            'data': parse_request(req, spider, cb_settings)
        })
    return processed_result
Example #7
    def __init__(self, parser):
        self.parser = parser
        self.args = parser.parse_args()

        if not inside_project():
            self.error("No active Scrapy project")

        self.command = self.args.command

        self.spider = sanitize_module_name(self.args.spider)
        self.callback = self.args.callback
        self.fixture = self.args.fixture

        self.project_dir = get_project_dir()
        sys.path.append(self.project_dir)

        self.settings = get_project_settings()

        base_path = self.settings.get(
            'AUTOUNIT_BASE_PATH',
            default=os.path.join(self.project_dir, 'autounit'))
        self.tests_dir = os.path.join(base_path, 'tests')

        self.spider_dir = os.path.join(self.tests_dir, self.spider)

        if not os.path.isdir(self.spider_dir):
            self.error(
                "No recorded data found "
                "for spider '{}'".format(self.spider))

        extra_path = self.settings.get('AUTOUNIT_EXTRA_PATH') or ''
        self.callback_dir = os.path.join(
            self.spider_dir, extra_path, self.callback)

        if not os.path.isdir(self.callback_dir):
            self.error(
                "No recorded data found for callback "
                "'{}' from '{}' spider".format(self.callback, self.spider))

        if self.fixture:
            self.fixture_path = os.path.join(
                self.callback_dir, self.parse_fixture_arg())
            if not os.path.isfile(self.fixture_path):
                self.error("Fixture '{}' not found".format(self.fixture_path))
Example #8
    def __init__(self, parser):
        self.parser = parser
        self.args = parser.parse_args()

        if not inside_project():
            self._error("No active Scrapy project")

        self.command = self.args.command

        self.spider = self.args.spider
        self.callback = self.args.callback
        self.fixture = self.args.fixture

        self.project_dir = get_project_dir()
        sys.path.append(self.project_dir)

        self.settings = get_project_settings()

        base_path = get_base_path(self.settings)
        self.tests_dir = os.path.join(base_path, 'tests')

        if self.spider:
            self.spider = sanitize_module_name(self.spider)
            self.callbacks_dir = self._get_callbacks_dir(self.spider)
            if not os.path.isdir(self.callbacks_dir):
                self._error("No recorded data found for spider '{}'".format(
                    self.spider))

            if self.callback:
                self.callback_dir = os.path.join(self.callbacks_dir,
                                                 self.callback)
                if not os.path.isdir(self.callback_dir):
                    self._error("No recorded data found for callback "
                                "'{}' from '{}' spider".format(
                                    self.callback, self.spider))

                if self.fixture:
                    self.fixture_path = os.path.join(self.callback_dir,
                                                     self.parse_fixture_arg())
                    if not os.path.isfile(self.fixture_path):
                        self._error("Fixture '{}' not found".format(
                            self.fixture_path))
Example #9
    def __init__(self, settings):
        super(BotSpiderManager, self).__init__(settings)
        # Backward compatibility
        if isinstance(settings, list):
            from scrapy.conf import settings
        descriptor = Descriptor.from_settings(settings)

        for spider_data in descriptor.list():
            domain = get_domain(spider_data['url'])
            base_cls = self._spiders[spider_data['engine']]
            cls_name = sanitize_module_name(domain).encode('utf-8')
            spider_name = self.spider_name(spider_data['engine'],
                                           domain)
            kwargs = {
                'name': spider_name,
                'start_urls': [spider_data['url']],
                'allowed_domains': [domain],
            }
            kwargs.update(spider_data.get('args', {}))
            spider_cls = self._create_spider_cls(base_cls, cls_name, kwargs)
            self._spiders[spider_name] = spider_cls
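
The _create_spider_cls helper used in Example #9 isn't shown; a plausible sketch (assuming the collected kwargs become class attributes) uses the three-argument form of type():

def _create_spider_cls(self, base_cls, cls_name, kwargs):
    # type(name, bases, namespace) builds a class object at runtime; the
    # per-site values (name, start_urls, allowed_domains, extra args)
    # become class attributes of the generated spider.
    return type(cls_name, (base_cls,), dict(kwargs))
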
Example #10
    def _update_legacy_test(self, path, cassette):
        path_dir = os.path.dirname(path)
        older_version_test = os.path.join(path_dir, 'test_fixture1.py')
        if os.path.isfile(older_version_test):
            to_remove = os.path.join(path_dir, 'test_fixture*.py')
            for test in glob(to_remove):
                if test == older_version_test:
                    os.rename(test, path)
                    continue
                os.remove(test)
        test_name = (sanitize_module_name(cassette.spider_name) + '__' +
                     cassette.request['callback'])
        with open(path, 'r+') as f:
            old = f.read()
            command = 'Scrapy Autounit'
            command_re = re.search('# Generated by: (.*)  # noqa', old)
            if command_re:
                command = command_re.group(1)
            test_code = TEST_TEMPLATE.format(test_name=test_name,
                                             command=command)
            f.seek(0)
            f.write(test_code)
            f.truncate()
Example #11
    def process_spider_output(self, response, result, spider):
        input_data = pickle.loads(response.meta.pop('_testmaster'))
        request = input_data['request']
        callback_name = request['callback']

        settings = spider.settings
        test_dir, test_name = get_or_create_test_dir(
            self.base_path,
            sanitize_module_name(spider.name),
            callback_name,
            settings.get('TESTMASTER_EXTRA_PATH'),
        )
        cb_settings = get_cb_settings(test_dir)
        # The parse command returns requests at the end of callbacks but not
        # items, so the result is processed as it arrives, before it reaches
        # this point, and stored in meta.
        if '_parse' in response.meta and '_update' not in response.meta:
            processed_result = response.meta.pop('_processed_result')
            out = result
        else:
            processed_result, out = parse_callback_result(
                result, spider, cb_settings)

        spider_attr_out = {
            k: v
            for k, v in spider.__dict__.items()
            if k not in ('crawler', 'settings', 'start_urls')
        }
        temp_rules = spider_attr_out.get('_rules', [])
        if temp_rules:
            spider_attr_out['_rules'] = [repr(rule) for rule in temp_rules]

        data = {
            'spider_name': spider.name,
            'request': request,
            'response': input_data['response'],
            'spider_args_out': spider_attr_out,
            'result': processed_result,
            'spider_args_in': input_data['spider_args'],
            'settings': _copy_settings(settings, cb_settings),
            'middlewares': input_data['middlewares'],
            'python_version': 2 if six.PY2 else 3,
        }

        callback_counter = self.fixture_counters.setdefault(callback_name, 0)
        # The counter is incremented at the end of this method, after the
        # fixture has been written.

        index = 0

        max_fixtures = update_max_fixtures(cb_settings, self.max_fixtures)
        _request = copy.deepcopy(data['request'])
        _request = clean_request(_request, spider.settings, cb_settings)

        items_out, requests_out = process_result(data['result'],
                                                 spider.settings, cb_settings)
        validate_results(test_dir, spider.settings, items_out, requests_out,
                         request['url'])

        if callback_counter < max_fixtures or '_update' in response.meta:
            index = callback_counter + 1
            if '_fixture' in response.meta:
                index = response.meta['_fixture']
            add_sample(index, test_dir, test_name, data)
            write_json(test_dir, _request, data['result'], index)

        else:
            # This random overwriting logic should only apply when generating
            # test cases via `scrapy crawl`.
            if not ('_update' in response.meta or '_parse' in response.meta):
                r = random.randint(0, callback_counter)
                if r < max_fixtures:
                    index = r + 1
                    add_sample(index, test_dir, test_name, data)
                    write_json(test_dir, _request, data['result'], index)

        if index == 1:
            write_test(test_dir, test_name, request['url'])

        self.fixture_counters[callback_name] += 1

        # if we don't return an empty list here, 'update' keeps on making
        # requests indefinitely!
        if '_update' in response.meta:
            return []
        return out
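
Example #11 branches on private meta flags ('_parse', '_update', '_fixture') that are attached to requests elsewhere in the tool. For orientation, a hypothetical request as the update command might issue it (the meta keys mirror the checks above):

import scrapy

def parse(response):
    pass

request = scrapy.Request(
    'https://example.com/page',
    callback=parse,
    meta={'_update': True, '_fixture': 3},  # re-record fixture number 3
)
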
Example #12
    def __init__(self, parser):
        self.parser = parser
        self.args = parser.parse_args()

        if not inside_project():
            self.error("No active Scrapy project")

        self.command = self.args.command

        self.spider = (sanitize_module_name(self.args.spider)
                       if self.args.spider else None)
        self.callback = getattr(self.args, 'callback', None)
        self.fixture = getattr(self.args, 'fixture', None)

        if self.command == 'update':
            self.new = getattr(self.args, 'new', None)
            self.dynamic = getattr(self.args, 'dynamic', None)

        if self.command == 'clear':
            self.fixtures = self.args.fixtures.split(',')

        if self.fixture and not self.callback:
            self.error("Can't specify a fixture without a callback")

        self.project_dir, self.project_name = get_project_dirs()
        sys.path.append(self.project_dir)

        self.settings = get_project_settings()

        if self.command == "parse":
            url_list = [url.strip() for url in self.args.urls.split('|')]
            for url in url_list:
                if not is_url(url):
                    self.error("Something went wrong with your urls arg! "
                               "Note that as of version 1.0, the character for separating "
                               "multiple urls is '|', as opposed to ','")

            self.args = process_options(self.args)
            crawler_process = CrawlerProcess(self.settings)
            run_command(crawler_process, url_list, self.args)

        else:
            self.base_path = self.settings.get(
                'TESTMASTER_BASE_PATH',
                default=os.path.join(self.project_dir, 'testmaster'))
            self.tests_dir = os.path.join(self.base_path, 'tests')

            self.spider_dir = os.path.join(self.tests_dir, self.spider)

            if not os.path.isdir(self.spider_dir) and self.command != "establish":
                self.error(
                    "No recorded data found "
                    "for spider '{}'".format(self.spider))

            self.extra_path = self.settings.get('TESTMASTER_EXTRA_PATH') or ''
            if self.callback:
                self.callback_dir = os.path.join(
                    self.spider_dir, self.extra_path, self.callback)

                if self.command == 'establish':
                    if os.path.isdir(self.callback_dir):
                        self.error(
                            "Can't use 'establish' with callback arg "
                            "if callback dir for spider '{}' "
                            "exists already".format(self.spider))
            else:
                if self.command == 'inspect':
                    self.error(
                        "Can't use 'inspect' without a callback arg")

            if self.fixture:
                self.fixture_path = os.path.join(self.callback_dir,
                                                 self.parse_fixture_arg())
                if not os.path.isfile(self.fixture_path):
                    self.error("Fixture '{}' not found".format(self.fixture_path))