Example 1
 def run(self, args, opts):
     """Crawl one domain and export passed items to an output file.

     args[0] is the spider's domain name, args[1] the output path; the
     item exporter is chosen from the output file's extension.  Returns
     False when the arguments are malformed or the extension is unknown.
     """
     if len(args) != 2:
         return False
     output = args[1]
     root, ext = os.path.splitext(output)
     # map extension -> exporter class; unknown extensions previously
     # raised a bare KeyError, now treated like malformed arguments
     exporter_cls = {
         '.json': JsonLinesItemExporter,
         '.xml': XmlItemExporter,
         '.csv': CsvItemExporter,
         '.pickle': PickleItemExporter,
     }.get(ext)
     if exporter_cls is None:
         return False
     file = open(output, 'w+b')
     try:
         exporter = exporter_cls(file)
         # export every item the engine passes while the spider runs
         dispatcher.connect(exporter.export_item, signal=signals.item_passed)
         exporter.start_exporting()
         SPIDER.domain_name = args[0]
         SPIDER.aliases = [] if opts.alias is None else opts.alias
         if opts.start_url is None:
             # default start URL derived from the domain name
             SPIDER.start_urls = ['http://%s' % SPIDER.domain_name]
         else:
             SPIDER.start_urls = opts.start_url
         scrapymanager.runonce(SPIDER)
         exporter.finish_exporting()
     finally:
         file.close()  # handle was previously leaked
     
Example 2
    def run(self):
        """Start the test HTTP site (reusing a previously chosen port when
        one is already recorded) and crawl it once from its root URL."""
        if self.portno:
            self.port = start_test_site(self.portno)
        else:
            # first run: let the site pick a free port, then remember it
            self.port = start_test_site()
            self.portno = self.port.getHost().port

        self.spider.start_urls = [self.geturl("/")]
        scrapymanager.configure()
        scrapymanager.runonce(self.spider)
Example 3
 def run(self, args, opts):
     """Import the spider module named in args[0] and run its SPIDER once,
     optionally exporting passed items as XML to opts.output.

     Returns False when the spider module argument is missing.
     """
     if len(args) != 1:
         return False
     file = None
     try:
         if opts.output:
             file = open(opts.output, 'w+b')
             exporter = XmlItemExporter(file)
             # export every item the engine passes during the crawl
             dispatcher.connect(exporter.export_item, signal=signals.item_passed)
             exporter.start_exporting()
         module = _import_file(args[0])
         scrapymanager.runonce(module.SPIDER)
         if opts.output:
             exporter.finish_exporting()
     finally:
         if file is not None:
             file.close()  # handle was previously leaked
Example 4
def fetch(urls):
    """Fetch each URL in *urls* and return the downloaded Scrapy Responses.

    Blocking call, not suitable for use inside spiders — intended to be
    invoked from outside the framework, e.g. Scrapy commands or
    standalone scripts.
    """
    responses = []
    collect = responses.append  # each downloaded Response is appended here
    requests = []
    for url in urls:
        requests.append(Request(url, callback=collect, dont_filter=True))
    scrapymanager.runonce(*requests)
    return responses
Example 5
    def run(self, args, opts):
        """Crawl the domain given in args[0] and write the four report
        types ('errors', 'dirtyurls', 'offsite', 'clean') to files under
        opts.output/domain/opts.name, in the format named by opts.format.

        Returns False when no domain argument was given.
        """
        # if no domain argument was given, then exit
        if len(args) == 0:
            return False
        domain = args[0]

        # create each level of the report output directory, logging as we go
        path = [opts.output, domain, opts.name]
        parent = ''
        for d in path:
            parent = os.path.join(parent, d)
            if not os.path.exists(parent):
                os.mkdir(parent)
                log.msg('Created directory: %s' % parent, level=log.INFO)

        output = os.path.join(*path)
        reports = ['errors', 'dirtyurls', 'offsite', 'clean']

        exporter_classes = {
            'json': JsonLinesItemExporter,
            'xml': XmlItemExporter,
            'csv': CsvItemExporter,
            'pickle': PickleItemExporter,
        }
        # create one exporter (and backing file) per report type
        exporters = {}
        files = []
        try:
            for report in reports:
                file = open(os.path.join(output, '%s.%s' % (report, opts.format)), 'w+b')
                files.append(file)
                exporters[report] = exporter_classes[opts.format](file)
            # junction routes each passed item to the matching exporter
            junction = ReportJunction(exporters)
            dispatcher.connect(junction, signal=signals.item_passed)
            junction.start()
            SPIDER.domain_name = domain
            SPIDER.aliases = [] if opts.alias is None else opts.alias
            if opts.start_url is None:
                # default start URL derived from the domain name
                SPIDER.start_urls = ['http://%s' % SPIDER.domain_name]
            else:
                SPIDER.start_urls = opts.start_url
            scrapymanager.runonce(SPIDER)
            junction.finish()
        finally:
            for f in files:
                f.close()  # report files were previously leaked
        
Example 6
    def run(self):
        """Start the test site, wire recording handlers to the engine's
        signals, run the test spider once against it, then stop listening."""
        self.port = start_test_site()
        self.portno = self.port.getHost().port

        self.spider = TestSpider()
        if not self.spider:
            return

        self.spider.start_urls = [
            self.geturl("/"),
            self.geturl("/redirect"),
        ]

        # connect every (handler, signal) pair we want to observe
        wiring = [
            (self.record_signal, signals.engine_started),
            (self.record_signal, signals.engine_stopped),
            (self.record_signal, signals.spider_opened),
            (self.record_signal, signals.spider_idle),
            (self.record_signal, signals.spider_closed),
            (self.item_scraped, signals.item_scraped),
            (self.request_received, signals.request_received),
            (self.response_downloaded, signals.response_downloaded),
        ]
        for handler, signal in wiring:
            dispatcher.connect(handler, signal)

        scrapymanager.configure()
        scrapymanager.runonce(self.spider)
        self.port.stopListening()
        self.wasrun = True
Example 7
 def run(self, args, opts):
     # Thin command entry point: forwards all positional arguments straight
     # to scrapymanager.runonce; opts is accepted but unused.
     scrapymanager.runonce(*args)
Example 8
 def crawl(self, *args):
     # Delegates directly to scrapymanager.runonce with whatever arguments
     # were given (presumably spiders/requests — confirm against callers).
     scrapymanager.runonce(*args)