def __init__(self, crawler):
    settings = crawler.settings
    spider = crawler.spider
    if not any(
            self.__class__.__name__ in s
            for s in settings.getwithbase('SPIDER_MIDDLEWARES').keys()):
        raise ValueError(
            '%s must be in SPIDER_MIDDLEWARES' % (self.__class__.__name__,))
    if not settings.getbool('AUTOUNIT_ENABLED'):
        raise NotConfigured('scrapy-autounit is not enabled')
    if settings.getint('CONCURRENT_REQUESTS') > 1:
        logger.warn(
            'Recording with concurrency > 1! '
            'Data races in shared object modification may create broken '
            'tests.')
    self.max_fixtures = settings.getint(
        'AUTOUNIT_MAX_FIXTURES_PER_CALLBACK', default=10)
    self.max_fixtures = \
        self.max_fixtures if self.max_fixtures >= 10 else 10
    self.base_path = settings.get(
        'AUTOUNIT_BASE_PATH',
        default=os.path.join(get_project_dir(), 'autounit'))
    create_dir(self.base_path, exist_ok=True)
    clear_fixtures(self.base_path, sanitize_module_name(spider.name))
    self.fixture_counters = {}
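# A minimal settings.py sketch of what the checks in the __init__ above expect:
# the middleware registered under SPIDER_MIDDLEWARES and recording explicitly
# switched on. The import path and priority shown here follow the
# scrapy-autounit README, but treat the exact values as an assumption for your
# own project.
AUTOUNIT_ENABLED = True
SPIDER_MIDDLEWARES = {
    'scrapy_autounit.AutounitMiddleware': 950,
}
# Optional knobs read by the __init__ above; the per-callback cap is clamped to
# at least 10 and the base path defaults to <project_dir>/autounit.
AUTOUNIT_MAX_FIXTURES_PER_CALLBACK = 10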
def process_spider_input(self, response, spider):
    if self.init == 0:
        if '_parse' in response.meta:
            spider_dir = os.path.join(
                self.base_path, 'tests', sanitize_module_name(spider.name))
            if os.path.exists(spider_dir):
                self.fixture_counters = get_fixture_counts(
                    spider_dir, spider,
                    spider.settings.get('TESTMASTER_EXTRA_PATH'))
        self.init += 1
    the_request = response.request
    # the parse command screws with middleware order because it uses essentially
    # two callbacks: a preliminary internal one and the real one. This is
    # grabbing the real callback from the meta.
    if '_parse' in response.meta and '_update' not in response.meta:
        the_request = response.request.copy()
        the_request.callback = response.meta['_callback']
        temp_meta = response.meta.copy()
        del temp_meta['_callback']
        the_request = the_request.replace(meta=temp_meta)
    _request = request_to_dict(the_request, spider=spider)
    if not _request['callback']:
        cb_name = 'parse'
    else:
        cb_name = _request['callback']
    test_dir = os.path.join(
        self.base_path, 'tests', sanitize_module_name(spider.name), cb_name)
    cb_settings = get_cb_settings(test_dir)
    filter_args = {'crawler', 'settings', 'start_urls'}
    if isinstance(spider, CrawlSpider):
        filter_args |= {'rules', '_rules'}
    response.meta['_testmaster'] = pickle.dumps({
        'request': parse_request(the_request, spider, cb_settings),
        'response': response_to_dict(response),
        'spider_args': {
            k: v for k, v in spider.__dict__.items()
            if k not in filter_args
        },
        'middlewares': get_middlewares(spider),
    })
    return None
def process_spider_output(self, response, result, spider):
    settings = spider.settings
    processed_result = []
    out = []
    for elem in result:
        out.append(elem)
        is_request = isinstance(elem, Request)
        if is_request:
            _data = parse_request(elem, spider)
        else:
            _data = parse_object(copy.deepcopy(elem), spider)
        processed_result.append({
            'type': 'request' if is_request else 'item',
            'data': _data
        })
    input_data = pickle.loads(response.meta.pop('_autounit'))
    request = input_data['request']
    callback_name = request['callback']
    spider_attr_out = {
        k: v for k, v in spider.__dict__.items()
        if k not in ('crawler', 'settings', 'start_urls')
    }
    data = {
        'spider_name': spider.name,
        'request': request,
        'response': input_data['response'],
        'spider_args_out': spider_attr_out,
        'result': processed_result,
        'spider_args_in': input_data['spider_args'],
        'settings': _copy_settings(settings),
        'middlewares': input_data['middlewares'],
        'python_version': 2 if six.PY2 else 3,
    }
    callback_counter = self.fixture_counters.setdefault(callback_name, 0)
    self.fixture_counters[callback_name] += 1
    test_dir, test_name = get_or_create_test_dir(
        self.base_path,
        sanitize_module_name(spider.name),
        callback_name,
        settings.get('AUTOUNIT_EXTRA_PATH'),
    )
    if callback_counter < self.max_fixtures:
        add_sample(callback_counter + 1, test_dir, test_name, data)
    else:
        r = random.randint(0, callback_counter)
        if r < self.max_fixtures:
            add_sample(r + 1, test_dir, test_name, data)
    return out
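# A standalone sketch (not part of the original module) of the sampling rule in
# the final branch above: the first max_fixtures results always get a slot, and
# each later result overwrites a random earlier slot with probability
# max_fixtures / (callback_counter + 1), so long crawls still leave a bounded,
# roughly uniform sample of fixtures. The function name is illustrative only.
import random


def choose_fixture_index(callback_counter, max_fixtures):
    """Return a 1-based fixture index to write, or None to skip this result."""
    if callback_counter < max_fixtures:
        return callback_counter + 1          # still filling the initial slots
    r = random.randint(0, callback_counter)  # inclusive on both ends
    if r < max_fixtures:
        return r + 1                         # overwrite an existing fixture
    return None                              # drop this result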
def __init__(self, spider):
    self.spider = spider
    self.settings = spider.settings
    self.spider_name = sanitize_module_name(spider.name)
    self.spider_init_attrs = self.spider_attrs()
    self.fixture_counters = {}
    self._set_max_fixtures()
    self.base_path = get_base_path(self.settings)
    self._create_dir(self.base_path, exist_ok=True)
    self._clear_fixtures()
def process_spider_output(self, response, result, spider):
    settings = spider.settings
    processed_result, out = parse_callback_result(result, spider)
    input_data = pickle.loads(response.meta.pop('_autounit'))
    request = input_data['request']
    callback_name = request['callback']
    d = spider.__getstate__() if hasattr(
        spider, '__getstate__') else spider.__dict__
    spider_attr_out = {
        k: v for k, v in d.items()
        if k not in get_filter_attrs(spider)
    }
    data = {
        'spider_name': spider.name,
        'request': request,
        'response': input_data['response'],
        'spider_args_out': spider_attr_out,
        'result': processed_result,
        'spider_args_in': input_data['spider_args'],
        'settings': _copy_settings(settings),
        'middlewares': input_data['middlewares'],
        'python_version': 2 if six.PY2 else 3,
    }
    callback_counter = self.fixture_counters.setdefault(callback_name, 0)
    self.fixture_counters[callback_name] += 1
    test_dir, test_name = get_or_create_test_dir(
        self.base_path,
        sanitize_module_name(spider.name),
        callback_name,
        settings.get('AUTOUNIT_EXTRA_PATH'),
    )
    index = 0
    if callback_counter < self.max_fixtures:
        index = callback_counter + 1
        add_sample(index, test_dir, test_name, data)
    else:
        r = random.randint(0, callback_counter)
        if r < self.max_fixtures:
            index = r + 1
            add_sample(index, test_dir, test_name, data)
    if index == 1:
        write_test(test_dir, test_name, request['url'])
    return out
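# For orientation, the fixtures and the generated unittest written above land
# in a per-spider, per-callback directory under the base path. The layout below
# is illustrative only (default base path, no AUTOUNIT_EXTRA_PATH, approximate
# file names):
#
#   autounit/
#       tests/
#           my_spider/
#               parse/
#                   fixture1.bin
#                   fixture2.bin
#                   test_fixtures.py    # written once, when index == 1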
def process_result_for_middleware(spider, callback, items, requests):
    processed_result = []
    for item in items:
        processed_result.append({'type': 'item', 'data': item})
    for req in requests:
        base_path = os.path.join(get_project_dirs()[0], 'testmaster')
        test_dir = os.path.join(
            base_path, 'tests', sanitize_module_name(spider.name),
            callback.__name__)
        cb_settings = None
        if os.path.exists(test_dir):
            cb_settings = get_cb_settings(test_dir)
        processed_result.append({
            'type': 'request',
            'data': parse_request(req, spider, cb_settings)
        })
    return processed_result
def __init__(self, parser):
    self.parser = parser
    self.args = parser.parse_args()
    if not inside_project():
        self.error("No active Scrapy project")
    self.command = self.args.command
    self.spider = sanitize_module_name(self.args.spider)
    self.callback = self.args.callback
    self.fixture = self.args.fixture
    self.project_dir = get_project_dir()
    sys.path.append(self.project_dir)
    self.settings = get_project_settings()
    base_path = self.settings.get(
        'AUTOUNIT_BASE_PATH',
        default=os.path.join(self.project_dir, 'autounit'))
    self.tests_dir = os.path.join(base_path, 'tests')
    self.spider_dir = os.path.join(self.tests_dir, self.spider)
    if not os.path.isdir(self.spider_dir):
        self.error(
            "No recorded data found "
            "for spider '{}'".format(self.spider))
    extra_path = self.settings.get('AUTOUNIT_EXTRA_PATH') or ''
    self.callback_dir = os.path.join(
        self.spider_dir, extra_path, self.callback)
    if not os.path.isdir(self.callback_dir):
        self.error(
            "No recorded data found for callback "
            "'{}' from '{}' spider".format(self.callback, self.spider))
    if self.fixture:
        self.fixture_path = os.path.join(
            self.callback_dir, self.parse_fixture_arg())
        if not os.path.isfile(self.fixture_path):
            self.error("Fixture '{}' not found".format(self.fixture_path))
def __init__(self, parser):
    self.parser = parser
    self.args = parser.parse_args()
    if not inside_project():
        self._error("No active Scrapy project")
    self.command = self.args.command
    self.spider = self.args.spider
    self.callback = self.args.callback
    self.fixture = self.args.fixture
    self.project_dir = get_project_dir()
    sys.path.append(self.project_dir)
    self.settings = get_project_settings()
    base_path = get_base_path(self.settings)
    self.tests_dir = os.path.join(base_path, 'tests')
    if self.spider:
        self.spider = sanitize_module_name(self.spider)
        self.callbacks_dir = self._get_callbacks_dir(self.spider)
        if not os.path.isdir(self.callbacks_dir):
            self._error("No recorded data found for spider '{}'".format(
                self.spider))
        if self.callback:
            self.callback_dir = os.path.join(
                self.callbacks_dir, self.callback)
            if not os.path.isdir(self.callback_dir):
                self._error("No recorded data found for callback "
                            "'{}' from '{}' spider".format(
                                self.callback, self.spider))
            if self.fixture:
                self.fixture_path = os.path.join(
                    self.callback_dir, self.parse_fixture_arg())
                if not os.path.isfile(self.fixture_path):
                    self._error("Fixture '{}' not found".format(
                        self.fixture_path))
def __init__(self, settings):
    super(BotSpiderManager, self).__init__(settings)
    # Backward compatibility
    if isinstance(settings, list):
        from scrapy.conf import settings
    descriptor = Descriptor.from_settings(settings)
    for spider_data in descriptor.list():
        domain = get_domain(spider_data['url'])
        base_cls = self._spiders[spider_data['engine']]
        cls_name = sanitize_module_name(domain).encode('utf-8')
        spider_name = self.spider_name(spider_data['engine'], domain)
        kwargs = {
            'name': spider_name,
            'start_urls': [spider_data['url']],
            'allowed_domains': [domain],
        }
        kwargs.update(spider_data.get('args', {}))
        spider_cls = self._create_spider_cls(base_cls, cls_name, kwargs)
        self._spiders[spider_name] = spider_cls
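# Every snippet in this listing leans on sanitize_module_name() to turn a
# spider or domain name into something usable as a Python module/class name.
# Below is a minimal sketch of what such a helper typically does; it mirrors
# the helper bundled with Scrapy, but treat it as an assumption rather than the
# exact implementation used by the code above.
import string


def sanitize_module_name(module_name):
    """Replace characters that are illegal in module names and make sure the
    result starts with a letter."""
    module_name = module_name.replace('-', '_').replace('.', '_')
    if module_name[0] not in string.ascii_letters:
        module_name = 'a' + module_name
    return module_name


# sanitize_module_name('books.toscrape.com')  ->  'books_toscrape_com'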
def _update_legacy_test(self, path, cassette):
    path_dir = os.path.dirname(path)
    older_version_test = os.path.join(path_dir, 'test_fixture1.py')
    if os.path.isfile(older_version_test):
        to_remove = os.path.join(path_dir, 'test_fixture*.py')
        for test in glob(to_remove):
            if test == older_version_test:
                os.rename(test, path)
                continue
            os.remove(test)
    test_name = (sanitize_module_name(cassette.spider_name) +
                 '__' + cassette.request['callback'])
    with open(path, 'r+') as f:
        old = f.read()
        command = 'Scrapy Autounit'
        command_re = re.search('# Generated by: (.*) # noqa', old)
        if command_re:
            command = command_re.group(1)
        test_code = TEST_TEMPLATE.format(test_name=test_name, command=command)
        f.seek(0)
        f.write(test_code)
        f.truncate()
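# A hedged sketch of the shape TEST_TEMPLATE might take. The actual template in
# scrapy-autounit may differ; the only property the method above relies on is a
# "# Generated by: ... # noqa" line from which the original recording command
# can be recovered. The class and method bodies below are placeholders, not the
# library's real playback code.
TEST_TEMPLATE = '''# Generated by: {command} # noqa
import unittest


class AutoUnit(unittest.TestCase):
    def test_{test_name}(self):
        # The real template replays the recorded fixture here.
        self.skipTest('placeholder for recorded-fixture playback')


if __name__ == '__main__':
    unittest.main()
'''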
def process_spider_output(self, response, result, spider):
    input_data = pickle.loads(response.meta.pop('_testmaster'))
    request = input_data['request']
    callback_name = request['callback']
    settings = spider.settings
    test_dir, test_name = get_or_create_test_dir(
        self.base_path,
        sanitize_module_name(spider.name),
        callback_name,
        settings.get('TESTMASTER_EXTRA_PATH'),
    )
    cb_settings = get_cb_settings(test_dir)
    # parse command will return requests at the end of callbacks but not
    # items... As such I am processing the result as it comes, before it
    # reaches this point (and storing the result in meta).
    if '_parse' in response.meta and '_update' not in response.meta:
        processed_result = response.meta.pop('_processed_result')
        out = result
    else:
        processed_result, out = parse_callback_result(
            result, spider, cb_settings)
    spider_attr_out = {
        k: v for k, v in spider.__dict__.items()
        if k not in ('crawler', 'settings', 'start_urls')
    }
    temp_rules = spider_attr_out.get('_rules', [])
    if temp_rules:
        spider_attr_out['_rules'] = [repr(rule) for rule in temp_rules]
    data = {
        'spider_name': spider.name,
        'request': request,
        'response': input_data['response'],
        'spider_args_out': spider_attr_out,
        'result': processed_result,
        'spider_args_in': input_data['spider_args'],
        'settings': _copy_settings(settings, cb_settings),
        'middlewares': input_data['middlewares'],
        'python_version': 2 if six.PY2 else 3,
    }
    callback_counter = self.fixture_counters.setdefault(callback_name, 0)
    # self.fixture_counters[callback_name] += 1
    index = 0
    max_fixtures = update_max_fixtures(cb_settings, self.max_fixtures)
    _request = copy.deepcopy(data['request'])
    _request = clean_request(_request, spider.settings, cb_settings)
    items_out, requests_out = process_result(
        data['result'], spider.settings, cb_settings)
    validate_results(test_dir, spider.settings, items_out, requests_out,
                     request['url'])
    if callback_counter < max_fixtures or '_update' in response.meta:
        index = callback_counter + 1
        if '_fixture' in response.meta:
            index = response.meta['_fixture']
        add_sample(index, test_dir, test_name, data)
        write_json(test_dir, _request, data['result'], index)
    else:
        # this random overwriting logic should only apply to generating
        # testcases via scrapy crawl
        if not ('_update' in response.meta or '_parse' in response.meta):
            r = random.randint(0, callback_counter)
            if r < max_fixtures:
                index = r + 1
                add_sample(index, test_dir, test_name, data)
                write_json(test_dir, _request, data['result'], index)
    if index == 1:
        write_test(test_dir, test_name, request['url'])
    self.fixture_counters[callback_name] += 1
    # if we don't return an empty list here, 'update' keeps on making
    # requests indefinitely!
    if '_update' in response.meta:
        return []
    return out
def __init__(self, parser):
    self.parser = parser
    self.args = parser.parse_args()
    if not inside_project():
        self.error("No active Scrapy project")
    self.command = self.args.command
    self.spider = sanitize_module_name(self.args.spider) if \
        self.args.spider else None
    try:
        self.callback = self.args.callback
    except AttributeError:
        self.callback = None
    try:
        self.fixture = self.args.fixture
    except AttributeError:
        self.fixture = None
    if self.command == 'update':
        try:
            self.new = self.args.new
        except AttributeError:
            self.new = None
        try:
            self.dynamic = self.args.dynamic
        except AttributeError:
            self.dynamic = None
    if self.command == 'clear':
        self.fixtures = self.args.fixtures.split(',')
    if self.fixture and not self.callback:
        self.error("Can't specify a fixture without a callback")
    self.project_dir, self.project_name = get_project_dirs()
    sys.path.append(self.project_dir)
    self.settings = get_project_settings()
    if self.command == "parse":
        url_list = [url.strip() for url in self.args.urls.split('|')]
        for url in url_list:
            if not is_url(url):
                self.error(
                    "Something went wrong with your urls arg! "
                    "Note that as of version 1.0, the character for "
                    "separating multiple urls is '|', as opposed to ','")
        self.args = process_options(self.args)
        crawler_process = CrawlerProcess(self.settings)
        run_command(crawler_process, url_list, self.args)
    else:
        self.base_path = self.settings.get(
            'TESTMASTER_BASE_PATH',
            default=os.path.join(self.project_dir, 'testmaster'))
        self.tests_dir = os.path.join(self.base_path, 'tests')
        self.spider_dir = os.path.join(self.tests_dir, self.spider)
        if not os.path.isdir(self.spider_dir) and self.command != "establish":
            self.error(
                "No recorded data found "
                "for spider '{}'".format(self.spider))
        self.extra_path = self.settings.get('TESTMASTER_EXTRA_PATH') or ''
        if self.callback:
            self.callback_dir = os.path.join(
                self.spider_dir, self.extra_path, self.callback)
            if self.command == 'establish':
                if os.path.isdir(self.callback_dir):
                    self.error(
                        "Can't use 'establish' with callback arg "
                        "if callback dir for spider '{}' "
                        "exists already".format(self.spider))
            else:
                if self.command == 'inspect':
                    self.error(
                        "No recorded data found for callback "
                        "'{}' from '{}' spider".format(
                            self.callback, self.spider))
            if self.fixture:
                self.fixture_path = os.path.join(
                    self.callback_dir, self.parse_fixture_arg())
                if not os.path.isfile(self.fixture_path):
                    self.error(
                        "Fixture '{}' not found".format(self.fixture_path))