def run_spider(self, spider, spider_parameters=None, f_output=None, project=None):
    """Launch a ``scrapy crawl`` subprocess for *spider* in the project workspace.

    :param spider: name of the spider to run.
    :param spider_parameters: optional mapping of scrapy settings passed via
        ``-s key=value``; the caller's dict is not modified.
    :param f_output: file object receiving the child's stdout/stderr.
    :param project: optional project name; when given it is injected as the
        ``BOT_NAME`` setting.
    :returns: a Future that resolves to the items file path on success, or
        fails with :class:`ProcessFailed` when the child exits non-zero.
    """
    ret_future = Future()
    items_file = os.path.join(self.project_workspace_dir, 'items.jl')
    runner = 'scrapydd.utils.runner'
    pargs = [self.python, '-m', runner, 'crawl', spider]
    # Copy so the caller's dict is never mutated, and default to {} so a
    # truthy `project` with spider_parameters=None no longer raises
    # TypeError when assigning BOT_NAME (bug in the original).
    spider_parameters = dict(spider_parameters) if spider_parameters else {}
    if project:
        spider_parameters['BOT_NAME'] = project
    for key, value in spider_parameters.items():
        pargs += ['-s', '%s=%s' % (key, value)]
    pargs += ['-o', str(path_to_file_uri(items_file))]
    env = os.environ.copy()
    env['SCRAPY_PROJECT'] = str(self.project_name)
    env['SCRAPY_FEED_URI'] = str(path_to_file_uri(items_file))
    env['SCRAPY_EGG'] = 'spider.egg'
    p = Popen(pargs, env=env, stdout=f_output, cwd=self.project_workspace_dir,
              stderr=f_output, encoding='utf8')
    self.processes.append(p)

    def done(process):
        # Drop the finished process from the tracked list, then settle the
        # future: non-zero return code -> failure, otherwise the items file.
        self.processes.remove(process)
        if process.returncode:
            return ret_future.set_exception(ProcessFailed())
        return ret_future.set_result(items_file)

    wait_process(p, done)
    return ret_future
def test_path_to_file_uri(self):
    """path_to_file_uri produces a ``file:///`` URI on both platforms and
    round-trips through file_uri_to_path for a relative path."""
    if os.name == 'nt':
        # "\\c" doubled: the original "\c" is an invalid escape sequence
        # (DeprecationWarning since 3.6); the runtime value is unchanged.
        self.assertEqual(path_to_file_uri("C:\\windows\\clock.avi"),
                         "file:///C:/windows/clock.avi")
    else:
        self.assertEqual(path_to_file_uri("/some/path.txt"),
                         "file:///some/path.txt")
    fn = "test.txt"
    x = path_to_file_uri(fn)
    # assert_ is a deprecated alias removed in Python 3.12; use assertTrue.
    self.assertTrue(x.startswith('file:///'))
    # Compare case-insensitively: Windows paths are case-insensitive.
    self.assertEqual(file_uri_to_path(x).lower(), os.path.abspath(fn).lower())
def test_file_uri_to_path(self):
    """file_uri_to_path decodes ``file://`` URIs, passes plain paths through,
    and round-trips with path_to_file_uri on the current platform."""
    if os.name == 'nt':
        avi_uri = "file:///C:/windows/clock.avi"
        self.assertEqual(file_uri_to_path(avi_uri), "C:\\windows\clock.avi")
        # Round trip: path -> URI reproduces the original URI.
        self.assertEqual(path_to_file_uri(file_uri_to_path(avi_uri)), avi_uri)
    else:
        txt_uri = "file:///path/to/test.txt"
        self.assertEqual(file_uri_to_path(txt_uri), "/path/to/test.txt")
        # A bare path is returned unchanged.
        self.assertEqual(file_uri_to_path("/path/to/test.txt"),
                         "/path/to/test.txt")
        self.assertEqual(path_to_file_uri(file_uri_to_path(txt_uri)), txt_uri)
    # Relative paths without a scheme pass through untouched.
    self.assertEqual(file_uri_to_path("test.txt"), "test.txt")
def _get_feed_uri(self, message, ext):
    """Build the feed URI for a job described by *message* with extension *ext*.

    An empty or ``file`` scheme in ``self.items_dir`` resolves to a local
    ``file://`` URI; any other scheme keeps its netloc/params/query/fragment
    and appends project/spider/job path segments.
    """
    parsed = urlparse(self.items_dir)
    if parsed.scheme.lower() not in ('', 'file'):
        remote_path = '/'.join([
            parsed.path,
            message['_project'],
            message['_spider'],
            '%s.%s' % (message['_job'], ext),
        ])
        return urlunparse((parsed.scheme, parsed.netloc, remote_path,
                           parsed.params, parsed.query, parsed.fragment))
    return path_to_file_uri(self._get_file(message, parsed.path, ext))
def test_download(self):
    """A ``file://`` URI containing ``^`` is percent-encoded and downloads
    with the expected status and body."""
    request = Request(path_to_file_uri(self.tmpname + '^'))
    # The caret must survive in the URL as %5E.
    assert request.url.upper().endswith('%5E')

    def _check(response):
        self.assertEqual(response.url, request.url)
        self.assertEqual(response.status, 200)
        self.assertEqual(response.body, b'0123456789')

    return self.download_request(request, Spider('foo')).addCallback(_check)
def test_download(self):
    """A ``file://`` URI containing ``^`` is percent-encoded and downloads
    with the expected status and body."""
    def _test(response):
        # assertEquals is a deprecated alias (removed in Python 3.12);
        # assertEqual is the supported spelling.
        self.assertEqual(response.url, request.url)
        self.assertEqual(response.status, 200)
        # NOTE(review): body compared as str, not bytes — presumably this is
        # the Python 2 era variant of this test; confirm before porting.
        self.assertEqual(response.body, "0123456789")
    request = Request(path_to_file_uri(self.tmpname + "^"))
    # The caret must survive in the URL as %5E.
    assert request.url.upper().endswith("%5E")
    return self.download_request(request, Spider("foo")).addCallback(_test)
def _get_feed_uri(self, message, ext):
    """Return the feed URI for the job in *message*, using extension *ext*.

    Local (empty/``file``-scheme) items dirs become ``file://`` URIs; other
    schemes get ``<project>/<spider>/<job>.<ext>`` appended to their path.
    """
    url = urlparse(self.items_dir)
    if url.scheme.lower() in ('', 'file'):
        # Local storage: resolve to a file:// URI under items_dir.
        return path_to_file_uri(self._get_file(message, url.path, ext))
    filename = '%s.%s' % (message['_job'], ext)
    joined_path = '/'.join([url.path, message['_project'],
                            message['_spider'], filename])
    return urlunparse((url.scheme, url.netloc, joined_path,
                       url.params, url.query, url.fragment))
def execute_subprocess(self): future = Future() # init items file workspace = ProjectWorkspace(self.task.project_name) self.items_file = os.path.join(self.workspace_dir, '%s.%s' % (self.task.id, 'jl')) python = workspace.python runner = 'scrapyd.runner' pargs = [python, '-m', runner, 'crawl', self.task.spider_name] for spider_parameter_key, spider_parameter_value in self.task.spider_parameters.items(): pargs += [ '-s', '%s=%s' % (spider_parameter_key, spider_parameter_value) ] env = os.environ.copy() env['SCRAPY_PROJECT'] = str(self.task.project_name) env['SCRAPY_JOB'] = str(self.task.id) env['SCRAPY_FEED_URI'] = str(path_to_file_uri(self.items_file)) try: self.p = subprocess.Popen(pargs, env=env, stdout=self._f_output, cwd=self.workspace_dir, stderr=self._f_output) if self.on_subprocess_start: self.on_subprocess_start(self.task, self.p.pid) except Exception as e: return self.complete_with_error('Error when starting crawl subprocess : %s' % e) logger.info('job %s started on pid: %d' % (self.task.id, self.p.pid)) def check_process(): execute_result = self.p.poll() logger.debug('check process') if execute_result is not None: logger.info('task complete') future.set_result(self.complete(execute_result)) self.check_process_callback = PeriodicCallback(check_process, 1*1000) self.check_process_callback.start() return future
def test_store_file_uri_makedirs(self):
    """Storing to a file URI whose parent directories do not exist yet must
    create the intermediate directories."""
    base = os.path.abspath(self.mktemp())
    target = os.path.join(base, 'more', 'paths', 'file.txt')
    return self._assert_stores(FileFeedStorage(path_to_file_uri(target)), target)
def test_store_file_uri(self):
    """A ``file://`` URI built from a temp path stores items at that path."""
    target = os.path.abspath(self.mktemp())
    return self._assert_stores(FileFeedStorage(path_to_file_uri(target)), target)