import logging
import random
import time

from scrapy import signals
from scrapy.exceptions import IgnoreRequest

# Commander is the client for the Scrapoxy commander REST API. It ships with
# the Scrapoxy Python connector; the import path below is the common one but
# may differ between connector versions.
from scrapoxy.commander import Commander
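# The middleware below raises and catches a BlacklistError. The Scrapoxy
# connector ships its own definition; the minimal sketch here is only inferred
# from the usage in process_response (it carries the offending response and a
# human-readable message) and is not taken from the original code.
class BlacklistError(Exception):

    def __init__(self, response, message, *args, **kwargs):
        super(BlacklistError, self).__init__(*args, **kwargs)
        self.response = response
        self.message = message

    def __str__(self):
        return self.message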
class BlacklistDownloaderMiddleware(object):

    def __init__(self, crawler):
        """Access the settings of the crawler to connect to Scrapoxy."""
        self._http_status_codes = crawler.settings.get(
            'BLACKLIST_HTTP_STATUS_CODES', [503])
        self._sleep_min = crawler.settings.get('SCRAPOXY_SLEEP_MIN', 60)
        self._sleep_max = crawler.settings.get('SCRAPOXY_SLEEP_MAX', 180)
        self._commander = Commander(
            crawler.settings.get('API_SCRAPOXY'),
            crawler.settings.get('API_SCRAPOXY_PASSWORD'))

    @classmethod
    def from_crawler(cls, crawler):
        """Call the constructor with the crawler as parameter."""
        return cls(crawler)

    def process_response(self, request, response, spider):
        """Detect a blacklisted response and stop the proxy instance that served it."""
        try:
            if response.status in self._http_status_codes:
                raise BlacklistError(
                    response, 'HTTP status {}'.format(response.status))
            return response
        except BlacklistError as ex:
            spider.log('Ignoring blacklisted response {0}: {1}'.format(
                response.url, ex.message), level=logging.DEBUG)
            # Scrapoxy adds this header so the middleware can tell which
            # proxy instance served the blacklisted response.
            name = response.headers['x-cache-proxyname'].decode('utf-8')
            self._stop_and_sleep(spider, name)
            raise IgnoreRequest()

    def _stop_and_sleep(self, spider, name):
        """Stop the offending instance, then pause the whole spider for a while."""
        if name:
            alive = self._commander.stop_instance(name)
            if alive < 0:
                spider.log('Remove: cannot find instance {}'.format(name),
                           level=logging.ERROR)
            elif alive == 0:
                spider.log('Remove: instance removed (no instance remaining)',
                           level=logging.WARNING)
            else:
                spider.log('Remove: instance removed ({} instances remaining)'
                           .format(alive), level=logging.DEBUG)
        else:
            spider.log('Cannot find instance name in headers',
                       level=logging.ERROR)

        # Sleep a random amount of time so the replacement instance has time
        # to come up with a fresh IP address.
        delay = random.randrange(self._sleep_min, self._sleep_max)
        spider.log('Sleeping {} seconds'.format(delay), level=logging.INFO)
        time.sleep(delay)
class ScaleMiddleware(object):

    def __init__(self, crawler):
        self._commander = Commander(
            crawler.settings.get('API_SCRAPOXY'),
            crawler.settings.get('API_SCRAPOXY_PASSWORD'))
        self._WAIT_FOR_SCALE = crawler.settings.get('WAIT_FOR_SCALE') or 120

        crawler.signals.connect(self.spider_opened, signals.spider_opened)
        crawler.signals.connect(self.spider_closed, signals.spider_closed)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def spider_opened(self, spider):
        spider.logger.debug('[ScaleMiddleware] Upscale Scrapoxy')
        min_sc, required_sc, max_sc = self._commander.get_scaling()
        required_sc = max_sc
        self._commander.update_scaling(min_sc, required_sc, max_sc)

        spider.log(
            '[ScaleMiddleware] Sleeping {0} seconds to finish upscale'.format(
                self._WAIT_FOR_SCALE),
            level=logging.WARNING)
        time.sleep(self._WAIT_FOR_SCALE)

    def spider_closed(self, spider):
        spider.logger.debug('[ScaleMiddleware] Downscale Scrapoxy')
        min_sc, required_sc, max_sc = self._commander.get_scaling()
        required_sc = min_sc
        self._commander.update_scaling(min_sc, required_sc, max_sc)
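# A minimal settings.py sketch showing how the two middlewares above could be
# wired up. The setting names match those read in the constructors; the
# middleware module path, the Scrapoxy endpoints, the password and the
# priorities are placeholders and must be adapted to the actual project.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.BlacklistDownloaderMiddleware': 950,
    'myproject.middlewares.ScaleMiddleware': 100,
}

API_SCRAPOXY = 'http://localhost:8889/api'  # Scrapoxy commander API endpoint
API_SCRAPOXY_PASSWORD = 'CHANGEME'          # commander password

BLACKLIST_HTTP_STATUS_CODES = [503]  # responses treated as "blacklisted"
SCRAPOXY_SLEEP_MIN = 60              # min pause after stopping an instance (s)
SCRAPOXY_SLEEP_MAX = 180             # max pause after stopping an instance (s)
WAIT_FOR_SCALE = 120                 # seconds to wait for upscaling to finish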
import csv
import datetime
import logging
import logging.config
import time

from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# Project-specific names (Commander from the Scrapoxy connector, the spider
# classes Homegate/Newhome/Immoscout24, the Crawlers thread wrapper and the
# Advertisement model) are assumed to be imported from elsewhere in the project.
logger = logging.getLogger(__name__)


class App(object):

    def __init__(self):
        self.settings = get_project_settings()
        self.commander = Commander(self.settings.get('API_SCRAPOXY'),
                                   self.settings.get('API_SCRAPOXY_PASSWORD'))
        configure_logging(settings=None, install_root_handler=False)
        logging.config.dictConfig(self.settings['LOGGING_SETTINGS'])

    def prepare_instances(self):
        """Scale Scrapoxy up to its maximum before the crawl starts."""
        if len(self.settings.get('DOWNLOADER_MIDDLEWARES', {})) <= 1:
            logger.info("Do not run crawler over proxy")
            return
        min_sc, required_sc, max_sc = self.commander.get_scaling()
        required_sc = max_sc
        self.commander.update_scaling(min_sc, required_sc, max_sc)
        wait_for_scale = self.settings.get('WAIT_FOR_SCALE')
        time.sleep(wait_for_scale)

    def runCrawlers(self):
        """Run all spiders in a background thread and enforce a 12 hour timeout."""
        process = CrawlerProcess(self.settings)
        crawl_thread = Crawlers(process=process,
                                spiders=[Homegate, Newhome, Immoscout24])
        crawl_thread.start()

        rounds = 0
        while crawl_thread.is_alive():
            if rounds == 4320:  # 4320 * 10 s sleep = 12 h
                logger.info("Run into time out")
                break
            rounds += 1
            time.sleep(10)

        logger.debug("Stopping all crawlers...")
        process.stop()
        while crawl_thread.is_alive():
            logger.debug("Wait for crawlers to clean up...")
            time.sleep(100)

    def shutdown_instances(self):
        """Scale Scrapoxy down to zero required instances after the crawl."""
        if len(self.settings.get('DOWNLOADER_MIDDLEWARES', {})) <= 1:
            logger.info("Nothing to stop, because no instances were started")
            return
        min_sc, required_sc, max_sc = self.commander.get_scaling()
        self.commander.update_scaling(min_sc, 0, max_sc)

    def getCrawledData(self):
        """Export all advertisements seen in the last 24 hours to a CSV file."""
        engine = create_engine(self.settings.get('DATABASE_URL'))
        Session = sessionmaker(bind=engine, expire_on_commit=True)
        session = Session()
        from_time = datetime.datetime.now() - datetime.timedelta(days=1)
        ads = session.query(Advertisement).filter(
            Advertisement.last_seen >= from_time).all()

        with open("crawled_ads.csv", "w", newline="") as csvfile:
            csvwriter = csv.writer(csvfile, delimiter=';', quotechar='"',
                                   quoting=csv.QUOTE_MINIMAL)
            csvwriter.writerow(
                [column.key for column in Advertisement.__table__.columns])
            for ad in ads:
                # Assumes the Advertisement model is iterable over its column values.
                csvwriter.writerow(list(ad))
        print(len(ads))
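# App relies on a Crawlers helper that runs the CrawlerProcess in a background
# thread so the main thread can enforce the timeout, and on some entry point
# that drives the whole workflow. Neither appears in the snippets above; both
# are sketched here purely from the way App uses them, so the real
# implementations may differ (e.g. how the Twisted reactor and its signal
# handling are dealt with off the main thread, depending on the Scrapy version).
import threading


class Crawlers(threading.Thread):

    def __init__(self, process, spiders):
        super(Crawlers, self).__init__()
        self.process = process
        self.spiders = spiders

    def run(self):
        # Schedule every spider, then block until the crawl finishes.
        for spider in self.spiders:
            self.process.crawl(spider)
        self.process.start()


if __name__ == '__main__':
    app = App()
    app.prepare_instances()       # upscale Scrapoxy before crawling
    try:
        app.runCrawlers()         # run all spiders with a 12 h timeout
    finally:
        app.shutdown_instances()  # always downscale, even on failure
    app.getCrawledData()          # dump the last 24 h of ads to CSV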