Example #1
def main():
    """Setups item signal and run the spider"""
    from twisted.internet import reactor
    from scrapy import signals
    from scrapy.settings import Settings
    from scrapy.crawler import Crawler

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    settings = Settings()

    # set up crawler
    crawler = Crawler(settings)
    # shut off log
    crawler.settings.set('LOG_ENABLED', False, priority='cmdline')
    # set up signal to catch items scraped
    crawler.signals.connect(catch_item,   signal=signals.item_passed)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)

    crawler.install()
    crawler.configure()

    # schedule spider
    spider = MySpider()
    crawler.crawl(spider)

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    reactor.run()
    print "ENGINE STOPPED"
Example #2
def run_spider(spider):
    """Setups item signal and run the spider"""
    # set up signal to catch items scraped
    from scrapy import log
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_exception(sender, failure, response, spider):
        print "Response: %s [%s]" % (response.body, response.meta)
        sys.stdout.flush()

    dispatcher.connect(catch_exception, signal=signals.spider_error)

    def catch_resp_dld(sender, response, request, spider):
        print "Downloaded (%s) Response %s" % (response.status, response.url)
        sys.stdout.flush()

    dispatcher.connect(catch_resp_dld, signal=signals.response_downloaded)

    # settings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        from scrapy.conf import settings as default_settings

    default_settings.overrides.update({
        'LOG_ENABLED': False,
        'LOG_LEVEL': 'CRITICAL',
        'BOT_NAME': 'project',
    })
    # Update general settings with spider-specific ones
    for k,v in spider.settings.iteritems():
        if isinstance(v, dict) and k in default_settings.overrides:
            default_settings.overrides[k].update(v)
        else:
            default_settings.overrides[k] = v

    # set up crawler
    from twisted.internet import reactor
    from scrapy.crawler import Crawler

    crawler = Crawler(default_settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(spider)

    log.start_from_crawler(crawler)

    # start engine scrapy/twisted
    crawler.start()

    if not reactor.running:
        reactor.run()

    crawler.uninstall()
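A hedged sketch of the same idea on current Scrapy: the spider_error and response_downloaded signals still exist, but they are connected through crawler.signals instead of the removed scrapy.xlib.pydispatch, and per-spider settings go through Settings.setdict() rather than the removed settings.overrides (the nested-dict merge from the original is omitted here for brevity):

import sys

from twisted.internet import reactor
from scrapy import signals
from scrapy.crawler import CrawlerRunner
from scrapy.settings import Settings

def catch_exception(failure, response, spider):
    print("Response: %s [%s]" % (response.body, response.meta))
    sys.stdout.flush()

def catch_resp_dld(response, request, spider):
    print("Downloaded (%s) Response %s" % (response.status, response.url))
    sys.stdout.flush()

def run_spider(spidercls, spider_settings):
    settings = Settings({'LOG_ENABLED': False, 'BOT_NAME': 'project'})
    settings.setdict(spider_settings, priority='spider')  # per-spider overrides
    runner = CrawlerRunner(settings)
    crawler = runner.create_crawler(spidercls)
    crawler.signals.connect(catch_exception, signal=signals.spider_error)
    crawler.signals.connect(catch_resp_dld, signal=signals.response_downloaded)
    d = runner.crawl(crawler)
    d.addBoth(lambda _: reactor.stop())  # stop the reactor when the crawl ends
    reactor.run()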
Example #3
class Ctrl(object):
    def __init__(self, settings, puller, worker=None):
        self.settings = settings
        self._puller = puller
        self._crawler = Crawler(settings)
        self._worker = worker or Worker(redis_conf)

        self._crawler.install()
        self._crawler.configure()

        # cannot use the ec2 event module for deferred sends yet; fix it!
        dispatcher.connect(self._on_recv_pull,
                           signal=signals.RECV,
                           sender=self._puller)
        dispatcher.connect(self._on_err,
                           signal=signals.ERROR,
                           sender=self._puller)

        event.connect(self._worker.on_recv,
                      signal=signals.RESPONSE,
                      sender=event.Any)

    def start(self):
        self._puller.start()
        self._crawler.start()

    def stop(self):
        self._puller.stop()
        self._crawler.stop()

    @decorator.safe_method()
    def _on_recv_pull(self, message):
        #log.msg('on_recv:%s'%(message,), log.DEBUG)
        requests = self._make_requests(message)
        if not requests: return
        self._requests_queue().append((Spider(self.settings), requests))

    def _requests_queue(self):
        return self._crawler.queue.spider_requests

    def _on_err(self):
        self.stop()

    def _make_requests(self, message):
        if not message: return
        chnl, message = message

        #logging.info('1.>>> %s'%message )
        kwds = json.loads(message, object_hook=misc.json_decode_dict)
        if not kwds: return

        #logging.info('3.>>> %s'%kwds )
        return (Request(**e) for e in kwds)
Example #4
def test_crawler():
    crawler = Crawler(scrapy_conf)
    crawler.install()
    crawler.configure()

    myspider = Spider(scrapy_conf)
    event.connect(_resp, signal=signals.RESPONSE, sender=event.Any)

    crawler.queue.spider_requests.append((myspider, _requests(10)))

    #crawler.queue.append_spider(myspider)

    crawler.start()
    reactor.run()
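Examples #3 and #4 (and their duplicates later in the list) push (spider, requests) pairs onto crawler.queue.spider_requests, which disappeared together with the old execution queue. One way to feed a prepared batch of requests to a modern crawl is a spider whose start_requests() replays them; the BatchSpider below and its URLs are hypothetical placeholders, not part of the original project:

import scrapy
from scrapy.crawler import CrawlerProcess

class BatchSpider(scrapy.Spider):
    # hypothetical spider that just replays a prepared list of requests
    name = 'batch'

    def __init__(self, requests=None, **kwargs):
        super(BatchSpider, self).__init__(**kwargs)
        self._requests = requests or []

    def start_requests(self):
        for request in self._requests:
            yield request

    def parse(self, response):
        pass  # the _resp-style response handling would go here

def test_crawler():
    requests = [scrapy.Request('http://example.com/%d' % i) for i in range(10)]
    process = CrawlerProcess({'LOG_ENABLED': False})
    process.crawl(BatchSpider, requests=requests)
    process.start()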
Example #5
def call_spider(start_urls):
  dispatcher.connect(stop_reactor, signal=signals.spider_closed)
  spider = DmozSpider3(start_url=start_urls)
  #crawler = Crawler(Settings())
  crawler = Crawler(get_project_settings())
  crawler.install()
  crawler.configure()
  crawler.crawl(spider)
  crawler.start()
  log.start(logfile="debug.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
  log.msg("------------>Running reactor")
  result = reactor.run()
  #print result
  log.msg("------------>Running stoped")
Example #6
class Ctrl(object):

    def __init__(self, settings, puller, worker=None):
        self.settings = settings 
        self._puller = puller
        self._crawler = Crawler(settings)
        self._worker = worker or Worker(redis_conf)

        self._crawler.install()
        self._crawler.configure()    

        # cannot use the ec2 event module for deferred sends yet; fix it!
        dispatcher.connect(self._on_recv_pull, signal=signals.RECV, sender=self._puller)
        dispatcher.connect(self._on_err, signal=signals.ERROR, sender=self._puller)

        event.connect(self._worker.on_recv, signal=signals.RESPONSE, sender=event.Any)

    def start(self):
        self._puller.start()
        self._crawler.start()

    def stop(self):
        self._puller.stop()
        self._crawler.stop()


    @decorator.safe_method()
    def _on_recv_pull(self, message):
        #log.msg('on_recv:%s'%(message,), log.DEBUG)
        requests = self._make_requests(message)
        if not requests: return
        self._requests_queue().append((Spider(self.settings), requests))

    def _requests_queue(self):
        return self._crawler.queue.spider_requests

    def _on_err(self):
        self.stop()

    def _make_requests(self, message):
        if not message: return
        chnl, message = message

        #logging.info('1.>>> %s'%message )
        kwds = json.loads(message, object_hook=misc.json_decode_dict)
        if not kwds: return

        #logging.info('3.>>> %s'%kwds )
        return (Request(**e) for e in kwds)
Example #7
class UrlCrawlerScript(Process):
    def __init__(self, spider):
        Process.__init__(self)
        settings = get_project_settings()
        self.crawler = Crawler(settings)

        # install/configure only if no crawler is registered yet;
        # scrapy.project held the globally installed crawler in old Scrapy
        if not hasattr(project, 'crawler'):
            self.crawler.install()
            self.crawler.configure()
        self.spider = spider

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
Example #8
class UrlCrawlerScript(Process):
    def __init__(self, spider):
        Process.__init__(self)
        settings = get_project_settings()
        self.crawler = Crawler(settings)

        # install/configure only if no crawler is registered yet;
        # scrapy.project held the globally installed crawler in old Scrapy
        if not hasattr(project, 'crawler'):
            self.crawler.install()
            self.crawler.configure()
        self.spider = spider

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
Example #9
def test_crawler():
    crawler = Crawler(scrapy_conf)
    crawler.install()
    crawler.configure()

    myspider = Spider(scrapy_conf)
    event.connect(_resp, signal=signals.RESPONSE, sender=event.Any)

    crawler.queue.spider_requests.append((myspider, _requests(10)))

    #crawler.queue.append_spider(myspider)

    crawler.start()
    reactor.run()
Example #10
class UrlCrawlerScript(Process):
    def __init__(self, spider):
        Process.__init__(self)
        settings = get_project_settings()
        self.crawler = Crawler(settings)

        if not hasattr(project, 'crawler'):
            self.crawler.install()
            self.crawler.configure()
            self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        self.spider = spider

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        reactor.run()
Example #11
class UrlCrawlerScript(Process):
    def __init__(self, spider):
        Process.__init__(self)
        settings = get_project_settings()
        self.crawler = Crawler(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
            self.crawler.configure()

        self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        self.spider = spider

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        reactor.run()
Example #12
class CrawlerWorker(multiprocessing.Process):
    def __init__(self, spider, result_queue):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue

        self.crawler = Crawler(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()

        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)
Example #13
class CrawlerWorker(multiprocessing.Process):

    def __init__(self, spider, result_queue):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue

        self.crawler = Crawler(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()

        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)
 
    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)
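Examples #12 and #13 collect items by running the crawl inside a multiprocessing.Process and pushing the results through a queue. The pattern still works with the modern API; a hedged sketch, using item_scraped instead of the old item_passed signal:

import multiprocessing

from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

class CrawlerWorker(multiprocessing.Process):
    def __init__(self, spidercls, result_queue):
        multiprocessing.Process.__init__(self)
        self.spidercls = spidercls
        self.result_queue = result_queue
        self.items = []

    def _item_scraped(self, item, response, spider):
        self.items.append(item)

    def run(self):
        # a fresh process gets a fresh Twisted reactor, so CrawlerProcess is safe here
        process = CrawlerProcess(get_project_settings())
        crawler = process.create_crawler(self.spidercls)
        crawler.signals.connect(self._item_scraped, signal=signals.item_scraped)
        process.crawl(crawler)
        process.start()
        self.result_queue.put(self.items)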
Example #14
                    education=education,
                    skills=skills,
                    work_experience=all_work_experience
                )
                base.save()

    def clear(self, text):
        result = list(filter(lambda x: bool(x), list(map(lambda x: x.strip(), text))))
        total_result = list(map(lambda x: x.replace(u'\u2022\t', ''), result))
        return total_result

if __name__ == '__main__':
    options = {
        'CONCURRENT_ITEMS': 300,
        'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 20,
        'DOWNLOAD_DELAY': 0.5
    }

    spider = LinkedIn()
    settings = get_project_settings()
    settings.overrides.update(options)
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
    reactor.run()
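settings.overrides was removed together with the old Crawler API. A sketch of the same run, reusing the options dict and LinkedIn spider from Example #14 and assuming a recent Scrapy release:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.setdict(options, priority='cmdline')  # the options dict defined above
settings.set('LOG_FILE', 'results.log')
settings.set('LOG_LEVEL', 'DEBUG')
process = CrawlerProcess(settings)
process.crawl(LinkedIn)
process.start()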
Example #15
from project.spiders.log_test import TestSpider as EstiloMASpider

from scrapy.xlib.pydispatch import dispatcher
from scrapy.crawler import Crawler
from twisted.internet import reactor
from scrapy.utils.project import get_project_settings
from scrapy import log, signals

def stop_reactor():
    reactor.stop()  # Stops reactor to prevent script from hanging

if __name__ == '__main__':
    dispatcher.connect(stop_reactor, signal=signals.engine_stopped)
    spider = EstiloMASpider()
    crawler = Crawler(get_project_settings())
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()

# log_test.py:

from scrapy import log
from scrapy.spider import BaseSpider

class TestSpider(BaseSpider):
    name = "logtest"
    start_urls = ["http://example.com/"]
Example #16
#!/usr/bin/env python
from scrapy.crawler import Crawler
from scrapy.conf import settings
import tutorial.spiders.myspider

runner = Crawler(settings)
runner.install()
runner.configure()
runner.crawl(tutorial.spiders.myspider.Myspider())
#runner.start_crawling()
runner.start()
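For comparison, a minimal modern version of the same script, assuming the spider still lives at tutorial.spiders.myspider.Myspider, could be as short as:

#!/usr/bin/env python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from tutorial.spiders.myspider import Myspider

process = CrawlerProcess(get_project_settings())
process.crawl(Myspider)
process.start()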