Exemplo n.º 1
0
    def run_spider(self, spider, spider_parameters=None, f_output=None, project=None):
        """Launch a crawl of ``spider`` in a child process.

        The child is started as ``<self.python> -m scrapydd.utils.runner crawl
        <spider>`` with ``self.project_workspace_dir`` as its working
        directory, and is told to write its items to ``items.jl`` in that
        directory.

        :param spider: spider name passed to the ``crawl`` command.
        :param spider_parameters: optional mapping; every entry is forwarded
            as a ``-s key=value`` argument.  The mapping itself is never
            modified.
        :param f_output: passed to ``Popen`` as both stdout and stderr.
        :param project: when truthy, forwarded as the ``BOT_NAME`` setting
            (overriding any ``BOT_NAME`` entry in ``spider_parameters``).
        :return: a future that is resolved with the items file path when the
            process ends with a zero return code, and failed with
            ``ProcessFailed`` otherwise.
        """
        ret_future = Future()
        items_file = os.path.join(self.project_workspace_dir, 'items.jl')
        items_uri = str(path_to_file_uri(items_file))
        runner = 'scrapydd.utils.runner'
        pargs = [self.python, '-m', runner, 'crawl', spider]

        # Work on a private copy: the default ``None`` must not break the
        # BOT_NAME assignment below, and the caller's dict must stay untouched.
        settings = dict(spider_parameters) if spider_parameters else {}
        if project:
            settings['BOT_NAME'] = project
        for setting_key, setting_value in settings.items():
            pargs += ['-s', '%s=%s' % (setting_key, setting_value)]
        pargs += ['-o', items_uri]

        env = os.environ.copy()
        env['SCRAPY_PROJECT'] = str(self.project_name)
        # env['SCRAPY_JOB'] = str(self.task.id)
        env['SCRAPY_FEED_URI'] = items_uri
        env['SCRAPY_EGG'] = 'spider.egg'

        p = Popen(pargs, env=env, stdout=f_output,
                  cwd=self.project_workspace_dir,
                  stderr=f_output, encoding='utf8')
        self.processes.append(p)

        def done(process):
            # The process has finished: stop tracking it and settle the future.
            self.processes.remove(process)
            if process.returncode:
                return ret_future.set_exception(ProcessFailed())

            return ret_future.set_result(items_file)

        wait_process(p, done)
        return ret_future
Exemplo n.º 2
0
    def test_path_to_file_uri(self):
        """Check ``path_to_file_uri`` on absolute and relative paths.

        An absolute path (Windows or POSIX flavour, chosen by ``os.name``)
        must map to the expected ``file:///`` URI.  A relative file name must
        yield a ``file:///`` URI that converts back, case-insensitively, to
        the absolute path of that name.
        """
        if os.name == 'nt':
            # Both backslashes are escaped; "\c" is not a valid escape sequence.
            self.assertEqual(path_to_file_uri("C:\\windows\\clock.avi"),
                             "file:///C:/windows/clock.avi")
        else:
            self.assertEqual(path_to_file_uri("/some/path.txt"),
                             "file:///some/path.txt")

        fn = "test.txt"
        x = path_to_file_uri(fn)
        # assertTrue replaces the deprecated ``assert_`` alias.
        self.assertTrue(x.startswith('file:///'))
        self.assertEqual(file_uri_to_path(x).lower(), os.path.abspath(fn).lower())
Exemplo n.º 3
0
    def test_path_to_file_uri(self):
        """Check ``path_to_file_uri`` on absolute and relative paths.

        An absolute path (Windows or POSIX flavour, chosen by ``os.name``)
        must map to the expected ``file:///`` URI.  A relative file name must
        yield a ``file:///`` URI that converts back, case-insensitively, to
        the absolute path of that name.
        """
        if os.name == 'nt':
            # Both backslashes are escaped; "\c" is not a valid escape sequence.
            self.assertEqual(path_to_file_uri("C:\\windows\\clock.avi"),
                             "file:///C:/windows/clock.avi")
        else:
            self.assertEqual(path_to_file_uri("/some/path.txt"),
                             "file:///some/path.txt")

        fn = "test.txt"
        x = path_to_file_uri(fn)
        # assertTrue replaces the deprecated ``assert_`` alias.
        self.assertTrue(x.startswith('file:///'))
        self.assertEqual(file_uri_to_path(x).lower(), os.path.abspath(fn).lower())
Exemplo n.º 4
0
    def test_file_uri_to_path(self):
        """Check ``file_uri_to_path`` and its round trip with ``path_to_file_uri``.

        The platform branch (chosen by ``os.name``) verifies that a
        ``file:///`` URI maps to the native path and that converting that path
        back gives the original URI.  On non-Windows platforms a plain
        absolute path is also expected to pass through unchanged.  A bare
        relative name must be returned as-is on every platform.
        """
        if os.name == 'nt':
            # Both backslashes are escaped; "\c" is not a valid escape sequence.
            self.assertEqual(file_uri_to_path("file:///C:/windows/clock.avi"),
                             "C:\\windows\\clock.avi")
            uri = "file:///C:/windows/clock.avi"
            uri2 = path_to_file_uri(file_uri_to_path(uri))
            self.assertEqual(uri, uri2)
        else:
            self.assertEqual(file_uri_to_path("file:///path/to/test.txt"),
                             "/path/to/test.txt")
            self.assertEqual(file_uri_to_path("/path/to/test.txt"),
                             "/path/to/test.txt")
            uri = "file:///path/to/test.txt"
            uri2 = path_to_file_uri(file_uri_to_path(uri))
            self.assertEqual(uri, uri2)

        self.assertEqual(file_uri_to_path("test.txt"), "test.txt")
Exemplo n.º 5
0
 def _get_feed_uri(self, message, ext):
     url = urlparse(self.items_dir)
     if url.scheme.lower() in ['', 'file']:
         return path_to_file_uri(self._get_file(message, url.path, ext))
     return urlunparse((url.scheme, url.netloc, '/'.join([
         url.path, message['_project'], message['_spider'],
         '%s.%s' % (message['_job'], ext)
     ]), url.params, url.query, url.fragment))
    def test_download(self):
        """Download ``self.tmpname + '^'`` through a file URI and check the response.

        The request URL must end with the percent-encoded caret (``%5E``).
        The response must carry the request's URL, status 200 and the body
        ``b'0123456789'``.
        """
        request = Request(path_to_file_uri(self.tmpname + '^'))
        assert request.url.upper().endswith('%5E')

        def _check_response(response):
            self.assertEqual(response.url, request.url)
            self.assertEqual(response.status, 200)
            self.assertEqual(response.body, b'0123456789')

        pending = self.download_request(request, Spider('foo'))
        return pending.addCallback(_check_response)
Exemplo n.º 7
0
    def test_download(self):
        """Download ``self.tmpname + "^"`` through a file URI and check the response.

        The request URL must end with the percent-encoded caret (``%5E``).
        The response must carry the request's URL, status 200 and the ten
        digit bytes ``0123456789`` as its body.
        """
        def _test(response):
            # assertEqual replaces the deprecated ``assertEquals`` alias.
            self.assertEqual(response.url, request.url)
            self.assertEqual(response.status, 200)
            # Compare against bytes, as the other versions of this test do;
            # on Python 2 a bytes literal is an ordinary str, so the check is
            # unchanged there.
            self.assertEqual(response.body, b"0123456789")

        request = Request(path_to_file_uri(self.tmpname + "^"))
        assert request.url.upper().endswith("%5E")
        return self.download_request(request, Spider("foo")).addCallback(_test)
Exemplo n.º 8
0
    def test_download(self):
        """Download ``self.tmpname + '^'`` through a file URI and check the response.

        The request URL must end with the percent-encoded caret (``%5E``).
        The response must carry the request's URL, status 200 and the body
        ``b'0123456789'``.
        """
        request = Request(path_to_file_uri(self.tmpname + '^'))
        assert request.url.upper().endswith('%5E')

        def _check_response(response):
            self.assertEqual(response.url, request.url)
            self.assertEqual(response.status, 200)
            self.assertEqual(response.body, b'0123456789')

        pending = self.download_request(request, Spider('foo'))
        return pending.addCallback(_check_response)
Exemplo n.º 9
0
    def test_file_uri_to_path(self):
        """Check ``file_uri_to_path`` and its round trip with ``path_to_file_uri``.

        The platform branch (chosen by ``os.name``) verifies that a
        ``file:///`` URI maps to the native path and that converting that path
        back gives the original URI.  On non-Windows platforms a plain
        absolute path is also expected to pass through unchanged.  A bare
        relative name must be returned as-is on every platform.
        """
        if os.name == 'nt':
            # Both backslashes are escaped; "\c" is not a valid escape sequence.
            self.assertEqual(file_uri_to_path("file:///C:/windows/clock.avi"),
                             "C:\\windows\\clock.avi")
            uri = "file:///C:/windows/clock.avi"
            uri2 = path_to_file_uri(file_uri_to_path(uri))
            self.assertEqual(uri, uri2)
        else:
            self.assertEqual(file_uri_to_path("file:///path/to/test.txt"),
                             "/path/to/test.txt")
            self.assertEqual(file_uri_to_path("/path/to/test.txt"),
                             "/path/to/test.txt")
            uri = "file:///path/to/test.txt"
            uri2 = path_to_file_uri(file_uri_to_path(uri))
            self.assertEqual(uri, uri2)

        self.assertEqual(file_uri_to_path("test.txt"),
                         "test.txt")
Exemplo n.º 10
0
 def _get_feed_uri(self, message, ext):
     url = urlparse(self.items_dir)
     if url.scheme.lower() in ['', 'file']:
         return path_to_file_uri(self._get_file(message, url.path, ext))
     return urlunparse((url.scheme,
                        url.netloc,
                        '/'.join([url.path,
                                  message['_project'],
                                  message['_spider'],
                                  '%s.%s' % (message['_job'], ext)]),
                        url.params,
                        url.query,
                        url.fragment))
Exemplo n.º 11
0
    def execute_subprocess(self):
        """Start the crawl for ``self.task`` in a child process.

        The command is ``<workspace python> -m scrapyd.runner crawl <spider>``
        followed by one ``-s key=value`` pair per entry of the task's spider
        parameters.  The child gets a copy of the current environment plus
        the ``SCRAPY_PROJECT``, ``SCRAPY_JOB`` and ``SCRAPY_FEED_URI``
        variables, and runs in ``self.workspace_dir`` with stdout and stderr
        both sent to ``self._f_output``.

        :return: a future that receives the value of
            ``self.complete(<exit code>)`` once the child has exited.  If
            starting the child (or the ``on_subprocess_start`` hook) raises,
            the result of ``self.complete_with_error(...)`` is returned
            instead of the future.
        """
        result_future = Future()
        # init items file
        project_workspace = ProjectWorkspace(self.task.project_name)
        self.items_file = os.path.join(self.workspace_dir,
                                       '%s.%s' % (self.task.id, 'jl'))
        command = [project_workspace.python, '-m', 'scrapyd.runner',
                   'crawl', self.task.spider_name]
        for setting_name, setting_value in self.task.spider_parameters.items():
            command.extend(['-s', '%s=%s' % (setting_name, setting_value)])

        child_env = os.environ.copy()
        child_env['SCRAPY_PROJECT'] = str(self.task.project_name)
        child_env['SCRAPY_JOB'] = str(self.task.id)
        child_env['SCRAPY_FEED_URI'] = str(path_to_file_uri(self.items_file))
        try:
            self.p = subprocess.Popen(
                command,
                env=child_env,
                stdout=self._f_output,
                cwd=self.workspace_dir,
                stderr=self._f_output,
            )
            # Optional hook: report the task and the child's pid.
            if self.on_subprocess_start:
                self.on_subprocess_start(self.task, self.p.pid)

        except Exception as e:
            return self.complete_with_error('Error when starting crawl subprocess : %s' % e)
        logger.info('job %s started on pid: %d' % (self.task.id, self.p.pid))

        def poll_child():
            # poll() returns None for as long as the child is still running.
            exit_code = self.p.poll()
            logger.debug('check process')
            if exit_code is None:
                return
            logger.info('task complete')
            result_future.set_result(self.complete(exit_code))

        # Interval of 1000 (presumably milliseconds, i.e. one poll per second).
        self.check_process_callback = PeriodicCallback(poll_child, 1000)
        self.check_process_callback.start()
        return result_future
Exemplo n.º 12
0
 def test_store_file_uri_makedirs(self):
     """Store through a file URI whose target is ``more/paths/file.txt`` below a temp path."""
     base_dir = os.path.abspath(self.mktemp())
     target = os.path.join(base_dir, 'more', 'paths', 'file.txt')
     storage = FileFeedStorage(path_to_file_uri(target))
     return self._assert_stores(storage, target)
Exemplo n.º 13
0
 def test_store_file_uri(self):
     """Store through a ``FileFeedStorage`` built from the file URI of a temp path."""
     target = os.path.abspath(self.mktemp())
     storage = FileFeedStorage(path_to_file_uri(target))
     return self._assert_stores(storage, target)
Exemplo n.º 14
0
 def test_store_file_uri_makedirs(self):
     """Store through a file URI whose target is ``more/paths/file.txt`` below a temp path."""
     base_dir = os.path.abspath(self.mktemp())
     target = os.path.join(base_dir, 'more', 'paths', 'file.txt')
     storage = FileFeedStorage(path_to_file_uri(target))
     return self._assert_stores(storage, target)
Exemplo n.º 15
0
 def test_store_file_uri(self):
     """Store through a ``FileFeedStorage`` built from the file URI of a temp path."""
     target = os.path.abspath(self.mktemp())
     storage = FileFeedStorage(path_to_file_uri(target))
     return self._assert_stores(storage, target)