Пример #1
0
    def execute(self, args, opts):
        """Start a crawl for a stored task or for the domain in ``args[0]``.

        A task is looked up from ``opts.task_id`` (via ``Task().load``) and
        from ``opts.task_name`` (via ``Task().next``); when both options are
        set, the lookup by name overrides the lookup by id. If a task is
        found, its ``domain`` is crawled, otherwise the first positional
        argument is used as the domain. When neither a task nor a positional
        argument is available, an error is logged and nothing is started.

        With ``opts.child`` set, ``scrapymanager.stop`` is replaced by a no-op
        before the crawl is scheduled.
        """
        task = None
        if opts.task_id:
            task = Task().load(id=opts.task_id)
        if opts.task_name:
            task = Task().next(name=opts.task_name)

        # Guard clause: nothing to crawl without a task or a domain argument.
        if not task and not args:
            log.msg('You must specify atleast 1 domain', level=log.ERROR)
            return

        if task:
            domain = task.domain
        else:
            domain = args[0]

        # NOTE(review): the result of fromdomain() is passed to crawl()
        # unchecked -- confirm crawl() copes with an unknown domain.
        spider = spiders.fromdomain(domain)
        scrapymanager.configure()
        if opts.child:
            def _stop():
                pass
            # Monkeypatch the stop command to prevent stopping prematurely
            # in child mode.
            scrapymanager.stop = _stop

        # ``task`` is still None when only a domain was given on the command
        # line; only an actual task can be locked.
        if task and not task.locked:
            task.lock()
        self.crawl(spider, task)
        scrapyengine.start()
Пример #2
0
    def run(self, args, opts):
        """List templates, dump a template, or generate a new spider.

        Behaviour depends on the options, checked in this order:

        * ``opts.list``: call ``self._list_templates()`` and return.
        * ``opts.dump``: print the contents of the template named by
          ``opts.template`` (if ``self._find_template`` locates it) and
          return.
        * otherwise ``args`` must hold at least ``[module, domain]``; with
          fewer arguments ``False`` is returned. If a spider already exists
          for the domain and ``opts.force`` is not set, a message is printed
          and the process exits with status 1. Otherwise the spider is
          generated from the template, when the template can be found.
        """
        if opts.list:
            self._list_templates()
            return

        if opts.dump:
            template_file = self._find_template(opts.template)
            if template_file:
                template = open(template_file, 'r')
                # Always release the file handle, even if reading or
                # printing fails.
                try:
                    print(template.read())
                finally:
                    template.close()
            return

        if len(args) < 2:
            return False

        module = sanitize_module_name(args[0])
        domain = args[1]
        spider = spiders.fromdomain(domain)
        if spider and not opts.force:
            print("Spider '%s' already exists in module:" % domain)
            print("  %s" % spider.__module__)
            sys.exit(1)

        template_file = self._find_template(opts.template)
        if template_file:
            self._genspider(module, domain, opts.template, template_file)
Пример #3
0
def _get_spider_requests(*args):
    """Collect requests and spiders from the given arguments. Returns a dict of
    spider -> list of requests
    """
    spider_requests = defaultdict(list)
    for arg in args:
        if isinstance(arg, tuple):
            request, spider = arg
            spider_requests[spider] = request
        elif isinstance(arg, Request):
            spider = spiders.fromurl(arg.url) or BaseSpider('default')
            if spider:
                spider_requests[spider] += [arg]
            else:
                log.msg('Could not find spider for request: %s' % arg, log.ERROR)
        elif isinstance(arg, BaseSpider):
            spider_requests[arg] += arg.start_requests()
        elif is_url(arg):
            spider = spiders.fromurl(arg) or BaseSpider('default')
            if spider:
                for req in arg_to_iter(spider.make_requests_from_url(arg)):
                    spider_requests[spider] += [req]
            else:
                log.msg('Could not find spider for url: %s' % arg, log.ERROR)
        elif isinstance(arg, basestring):
            spider = spiders.fromdomain(arg)
            if spider:
                spider_requests[spider] += spider.start_requests()
            else:
                log.msg('Could not find spider for domain: %s' % arg, log.ERROR)
        else:
            raise TypeError("Unsupported argument: %r" % arg)
    return spider_requests