def _check_select(self): """ interactive mode of select tasks """ if not self.interactive: return super(OneScheduler, self)._check_select() # waiting for running tasks if self.running_task > 0: return is_crawled = [] def run(project=None): return crawl('on_start', project=project) def crawl(url, project=None, **kwargs): """ Crawl given url, same parameters as BaseHandler.crawl url - url or taskid, parameters will be used if in taskdb project - can be ignored if only one project exists. """ # looking up the project instance if project is None: if len(self.projects) == 1: project = list(self.projects.keys())[0] else: raise LookupError('You need specify the project: %r' % list(self.projects.keys())) project_data = self.processor.project_manager.get(project) if not project_data: raise LookupError('no such project: %s' % project) # get task package instance = project_data['instance'] instance._reset() task = instance.crawl(url, **kwargs) if isinstance(task, list): raise Exception('url list is not allowed in interactive mode') # check task in taskdb if not kwargs: dbtask = self.taskdb.get_task(task['project'], task['taskid'], fields=self.request_task_fields) if not dbtask: dbtask = self.taskdb.get_task(task['project'], task['url'], fields=self.request_task_fields) if dbtask: task = dbtask # select the task self.on_select_task(task) is_crawled.append(True) shell.ask_exit() def quit_interactive(): '''Quit interactive mode''' is_crawled.append(True) self.interactive = False shell.ask_exit() def quit_pyspider(): '''Close pyspider''' is_crawled[:] = [] shell.ask_exit() shell = utils.get_python_console() banner = ( 'pyspider shell - Select task\n' 'crawl(url, project=None, **kwargs) - same parameters as BaseHandler.crawl\n' 'quit_interactive() - Quit interactive mode\n' 'quit_pyspider() - Close pyspider' ) if hasattr(shell, 'show_banner'): shell.show_banner(banner) shell.interact() else: shell.interact(banner) if not is_crawled: self.ioloop.add_callback(self.ioloop.stop)
def _check_select(self): """ interactive mode of select tasks """ if not self.interactive: return super(OneScheduler, self)._check_select() # waiting for running tasks if self.running_task > 0: return is_crawled = [] def run(project=None): return crawl('on_start', project=project) def crawl(url, project=None, **kwargs): """ Crawl given url, same parameters as BaseHandler.crawl url - url or taskid, parameters will be used if in taskdb project - can be ignored if only one project exists. """ # looking up the project instance if project is None: if len(self.projects) == 1: project = list(self.projects.keys())[0] else: raise LookupError('You need specify the project: %r' % list(self.projects.keys())) project_data = self.processor.project_manager.get(project) if not project_data: raise LookupError('no such project: %s' % project) # get task package instance = project_data['instance'] instance._reset() task = instance.crawl(url, **kwargs) if isinstance(task, list): raise Exception('url list is not allowed in interactive mode') # check task in taskdb if not kwargs: dbtask = self.taskdb.get_task(task['project'], task['taskid'], fields=self.request_task_fields) if not dbtask: dbtask = self.taskdb.get_task(task['project'], task['url'], fields=self.request_task_fields) if dbtask: task = dbtask # select the task self.on_select_task(task) is_crawled.append(True) shell.ask_exit() def quit_interactive(): '''Quit interactive mode''' is_crawled.append(True) self.interactive = False shell.ask_exit() def quit_pyspider(): '''Close pyspider''' is_crawled[:] = [] shell.ask_exit() shell = utils.get_python_console() shell.interact( 'pyspider shell - Select task\n' 'crawl(url, project=None, **kwargs) - same parameters as BaseHandler.crawl\n' 'quit_interactive() - Quit interactive mode\n' 'quit_pyspider() - Close pyspider' ) if not is_crawled: self.ioloop.stop()