def check_job_has_status(c, spider, crawl, status):
    server = None
    if crawl['worker_server_id']:
        c.execute("select * from worker_server where id = %s" % crawl['worker_server_id'])
        server = c.fetchone()
        if not server:
            # The crawl points at a worker server that no longer exists.
            return False
    return check_job_has_status_on_remote(c, spider, crawl, status)

def check_todays_crawls_error():
    # Alert if fewer crawls than expected were created for today.
    c.execute("select count(*) as number from crawl where crawl_date='%s';" % str(date.today()))
    res = c.fetchone()
    return res['number'] < minimum_number_of_crawls

def check_failed_spiders_with_status(status):
    spider_stats = {}
    fname = os.path.join(here, 'spider_stats_%s_new.p' % status)
    if os.path.exists(fname) and not os.stat(fname).st_size == 0:
        try:
            with open(fname, 'rb') as f:
                spider_stats = pickle.load(f)
        except (pickle.UnpicklingError, AttributeError, TypeError, ValueError,
                KeyError):
            return True, ['Unpickling error']
    c.execute(
        "select s.* from spider s join account a on a.id=s.account_id "
        "where a.enabled and s.enabled;"
    )
    spiders = {x['id']: x for x in c}
    spider_ids = spiders.keys()
    c.execute("select * from crawl where spider_id in %s and status='%s'"
              % (tuple(spider_ids), status))
    errors = False
    errors_spiders = []
    spiders_with_status = []
    crawls = c.fetchall()
    for crawl in crawls:
        # Skip crawl rows without a jobid field.
        if 'jobid' not in crawl:
            continue
        spider = spiders[crawl['spider_id']]
        s_stats = spider_stats.get(spider['name'])
        if not check_job_has_status(c, spider, crawl, status):
            # The job no longer has this status on scrapyd, so re-read the
            # crawl row to see whether the database still disagrees.
            c.execute('select * from crawl where id = %s' % crawl['id'])
            crawl = c.fetchone()
            if crawl['status'] == status:
                spiders_with_status.append(spider['name'])
                if s_stats and s_stats + timedelta(minutes=error_timeout) < datetime.now():
                    # The mismatch has persisted longer than error_timeout.
                    errors = True
                    errors_spiders.append(spider['name'])
                elif not s_stats:
                    # First time the mismatch is seen: start the timer.
                    spider_stats[spider['name']] = datetime.now()
            elif s_stats:
                # The status changed in the meantime: clear the timer.
                del spider_stats[spider['name']]
    # Drop timers for spiders that are no longer stuck in this status.
    for k in list(spider_stats.keys()):
        if k not in spiders_with_status:
            del spider_stats[k]
    with open(fname, 'wb') as f:
        pickle.dump(spider_stats, f)
    return errors, errors_spiders

def get_jobs_list_url(c, spider, crawl):
    server = None
    if crawl['worker_server_id']:
        c.execute("select * from worker_server where id = %s" % crawl['worker_server_id'])
        server = c.fetchone()
    if not server:
        # No worker server: the crawl runs on the local scrapyd instance.
        if spider['enable_multicrawling']:
            scrapy_url = 'http://localhost:6801/'
        else:
            scrapy_url = 'http://localhost:6800/'
    else:
        scrapy_url = server['scrapy_url']
    jobs_url = scrapy_url + 'listjobs.json?project=default'
    return jobs_url

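# Hypothetical sketch (not part of the original module): check_job_has_status
# above relies on check_job_has_status_on_remote, which is defined elsewhere.
# The assumption here is that it asks the scrapyd listjobs.json endpoint built
# by get_jobs_list_url whether the crawl's jobid still sits in the bucket that
# corresponds to the status stored in the crawl table. The status-to-bucket
# mapping below is an assumption as well.
import requests

STATUS_TO_SCRAPYD_BUCKET = {
    'scheduled_on_worker': 'pending',
    'running': 'running',
    'finished': 'finished',
}


def check_job_has_status_on_remote_sketch(c, spider, crawl, status):
    jobs_url = get_jobs_list_url(c, spider, crawl)
    try:
        jobs = requests.get(jobs_url, timeout=30).json()
    except (requests.RequestException, ValueError):
        # If scrapyd cannot be reached, treat the status as still valid
        # rather than raising a false alarm.
        return True
    bucket = STATUS_TO_SCRAPYD_BUCKET.get(status)
    if not bucket:
        return True
    return any(job.get('id') == crawl['jobid'] for job in jobs.get(bucket, []))
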
def get_log_url(c, spider, crawl):
    server = None
    if crawl['worker_server_id']:
        c.execute("select * from worker_server where id = %s" % crawl['worker_server_id'])
        server = c.fetchone()
    if not server:
        # No worker server: the crawl runs on the local scrapyd instance.
        if spider['enable_multicrawling']:
            scrapy_url = 'http://localhost:6801/'
        else:
            scrapy_url = 'http://localhost:6800/'
    else:
        scrapy_url = server['scrapy_url']
    log_url = scrapy_url + 'logs/default/%s/%s.log' % (spider['name'], crawl['jobid'])
    return log_url

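# Hypothetical sketch (not in the original module): check_failed_scheduled_spiders
# below calls check_log_exists, which is defined elsewhere. Under the assumption
# that scrapyd serves crawl logs over HTTP at the URL built by get_log_url, a
# HEAD request is enough to tell whether the log file has been created yet.
import requests


def check_log_exists_sketch(log_url):
    try:
        return requests.head(log_url, timeout=30).status_code == 200
    except requests.RequestException:
        return False
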
def check_failed_scheduled_spiders():
    spider_stats = {}
    fname = os.path.join(here, 'spider_stats.p')
    if os.path.exists(fname) and not os.stat(fname).st_size == 0:
        try:
            with open(fname, 'rb') as f:
                spider_stats = pickle.load(f)
        except (pickle.UnpicklingError, AttributeError, TypeError, ValueError,
                KeyError):
            return True, ['Unpickling error']
    c.execute(
        "SELECT s.* FROM spider s JOIN account a ON a.id=s.account_id "
        "WHERE a.enabled AND s.enabled;"
    )
    spiders = {x['id']: x for x in c}
    spider_ids = spiders.keys()
    c.execute(
        "select * from crawl where spider_id in %s "
        "and status in ('scheduled_on_worker', 'scheduled')" % (tuple(spider_ids), ))
    errors = False
    errors_spiders = []
    # all spiders with "scheduled" or "scheduled_on_worker" status
    scheduled = []
    crawls = c.fetchall()
    for crawl in crawls:
        spider = spiders[crawl['spider_id']]
        jobid = crawl['jobid']
        log_url = get_log_url(c, spider, crawl)
        s_stats = spider_stats.get(spider['name'])
        # if status is 'scheduled' or 'scheduled_on_worker' and there is a log file
        # then it's an error
        log_exists = check_log_exists(log_url)
        # if status is 'scheduled_on_worker' and jobid is not set then it's an error
        jobid_not_set = (crawl['status'] == 'scheduled_on_worker' and not bool(jobid))
        if log_exists or jobid_not_set:
            # Re-read the crawl row to see whether it is still stuck.
            c.execute('select * from crawl where id = %s' % crawl['id'])
            crawl = c.fetchone()
            if crawl['status'] in ('scheduled_on_worker', 'scheduled'):
                scheduled.append(spider['name'])
                if s_stats and s_stats + timedelta(minutes=error_timeout) < datetime.now():
                    # The problem has persisted longer than error_timeout.
                    errors = True
                    errors_spiders.append(spider['name'])
                elif not s_stats:
                    # First time the problem is seen: start the timer.
                    spider_stats[spider['name']] = datetime.now()
            elif s_stats:
                # Status changed in the meantime: clear the timer.
                del spider_stats[spider['name']]
    # Drop timers for spiders that are no longer in a scheduled state.
    for k in list(spider_stats.keys()):
        if k not in scheduled:
            del spider_stats[k]
    with open(fname, 'wb') as f:
        pickle.dump(spider_stats, f)
    return errors, errors_spiders

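# A minimal sketch of how these checks might be wired into a monitoring entry
# point. The set of statuses to verify and the plain-text alert messages are
# assumptions, not part of the original module.
def run_checks_sketch():
    alerts = []

    if check_todays_crawls_error():
        alerts.append('fewer than %s crawls created for today' % minimum_number_of_crawls)

    errors, names = check_failed_scheduled_spiders()
    if errors:
        alerts.append('spiders stuck in a scheduled state: %s' % ', '.join(names))

    for status in ('running', 'finished'):  # assumed statuses worth verifying
        errors, names = check_failed_spiders_with_status(status)
        if errors:
            alerts.append('spiders stuck in %s: %s' % (status, ', '.join(names)))

    return alerts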