from twisted.internet import defer

from scrapy import log, signals
from scrapy.crawler import Crawler
from scrapy.utils.project import get_project_settings


def setup_crawler(spider, stop=False):
    '''Takes a spider class object; returns a Deferred that fires when
    the spider closes.'''
    # A Deferred means other functions can wait on this finishing.
    # Its callback is triggered by spider close -- see the Twisted docs.
    d = defer.Deferred()

    def foo(*a, **kw):
        # The result to be passed to any callbacks on the Deferred
        # (we don't use it, so True could've been False, None, whatever).
        d.callback(True)

    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    # Keep a ref to foo, otherwise it gets GC'd (garbage collected) --
    # Scrapy's signal dispatcher only holds weak references to handlers.
    crawler._tempref = foo
    # foo is the handler for the spider_closed signal from this spider.
    # N.B. dispatch passes spider and reason (e.g. 'finished') to foo.
    crawler.signals.connect(foo, signal=signals.spider_closed)
    crawler.crawl(spider)
    # N.B. log is the Scrapy log; log2 is a Python color logger.
    # The crawler arg is necessary for the log_count/{ERROR, DEBUG, INFO, ...}
    # stats, which you will want for the stats mailer extension.
    # Starting this on every call would cause a big torrent of ESMTP errors:
    # log.start(crawler=crawler)
    crawler.start()
    return d
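
# Usage sketch (a minimal example, assuming the pre-1.0 Scrapy API used
# above): MySpider is a hypothetical spider class defined elsewhere in the
# project. Each setup_crawler() call returns a Deferred that fires when its
# spider closes, so gatherResults lets us stop the reactor once all crawls
# have finished.
if __name__ == '__main__':
    from twisted.internet import reactor

    # One Deferred per spider; add more setup_crawler() calls as needed.
    deferreds = [setup_crawler(MySpider)]  # MySpider is hypothetical
    defer.gatherResults(deferreds).addBoth(lambda _: reactor.stop())
    reactor.run()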