def delete_q_backups():
    """Delete the queue backups generated by `fix_queue`."""
    # Backups may exist for any user region as well as any cloudmaster.
    domains = redis_shell.smembers('user-regions') | redis_shell.smembers('cloudmasters')
    for domain in domains:
        backup_key = domain + ":srvq.bak"
        if redis_shell.exists(backup_key):
            redis_shell.delete(backup_key)
def delete_q_backups():
    """Drop every `<domain>:srvq.bak` key that `fix_queue` left behind."""
    for name in redis_shell.smembers('user-regions').union(
            redis_shell.smembers('cloudmasters')):
        key = name + ":srvq.bak"
        if redis_shell.exists(key):
            redis_shell.delete(key)
def fallbacks_and_honeypots_in_srv_table(region, cfgbysrv):
    """Return error strings for any fallback or honeypot server of `region`
    that has no entry in the srv->cfg table (`cfgbysrv`)."""
    problems = []
    # Fallbacks are checked first, then honeypots, preserving report order.
    checks = [
        (':fallbacks', "Fallback server %s for region %s is not in srv->cfg"),
        (':honeypots', "Honeypot server %s for region %s is not in srv->cfg"),
    ]
    for suffix, template in checks:
        for srv in redis_shell.smembers(region + suffix):
            if srv not in cfgbysrv:
                problems.append(template % (srv, region))
    return problems
def fallbacks_and_honeypots_in_srv_table(region, srv2cfg):
    """Report every fallback/honeypot server of `region` missing from the
    srv->cfg mapping (`srv2cfg`)."""
    ret = [
        "Fallback server %s for region %s is not in srv->cfg" % (srv, region)
        for srv in redis_shell.smembers(region + ':fallbacks')
        if srv not in srv2cfg
    ]
    ret.extend(
        "Honeypot server %s for region %s is not in srv->cfg" % (srv, region)
        for srv in redis_shell.smembers(region + ':honeypots')
        if srv not in srv2cfg
    )
    return ret
def fix_queues(): regions = redis_shell.smembers('user-regions') cloudmasters = redis_shell.smembers('cloudmasters') for domain in regions | cloudmasters: qname = domain + ':srvq' if redis_shell.exists(qname): print "fixing queue for %s..." % domain fix_queue(qname) else: print "no queue for %s." % domain
def fix_all_configs(fix_fn): """ Generic utility to fix broken configurations. fix_fn should take an unparsed config string and return a (needs_fixing, good_config) tuple. """ for domain in redis_shell.smembers('user-regions') | redis_shell.smembers('cloudmasters'): qname = domain + ':srvq' if redis_shell.exists(qname): fix_queue(qname, fix_fn) else: print "Skipping nonexistant queue %s..." % qname fix_live_servers(fix_fn)
def fix_all_configs(fix_fn): """ Generic utility to fix broken configurations. fix_fn should take an unparsed config string and return a (needs_fixing, good_config) tuple. """ for domain in redis_shell.smembers('user-regions') | redis_shell.smembers( 'cloudmasters'): qname = domain + ':srvq' if redis_shell.exists(qname): fix_queue(qname, fix_fn) else: print "Skipping nonexistant queue %s..." % qname fix_live_servers(fix_fn)
def run_all_checks():
    """Gather every consistency error and hand the list to report()."""
    srv_configs = redis_shell.hgetall('srv->cfg')
    problems = configs_start_with_newline(srv_configs)
    all_regions = redis_shell.smembers('user-regions')
    # srv->cfg membership errors for all regions come before any queue-size
    # errors, matching the original reporting order.
    problems.extend(
        err for r in all_regions for err in srvs_in_cfgbysrv(r, srv_configs))
    problems.extend(
        err for r in all_regions for err in check_srvq_size(r))
    report(problems)
def run_all_checks():
    """Collect all consistency-check errors and report them."""
    cfg_by_srv = redis_shell.hgetall("srv->cfg")
    found = configs_start_with_newline(cfg_by_srv)
    region_set = redis_shell.smembers("user-regions")
    for r in region_set:
        found += srvs_in_cfgbysrv(r, cfg_by_srv)
    for r in region_set:
        found += check_srvq_size(r)
    report(found)
def run_all_checks(): print "Fetching config data..." srv2cfg = redis_shell.hgetall('srv->cfg') print "Performing checks..." cache = model.make_cache() # This is new code, so let's test it in a cushion to start with. try: print "Checking that srv->cfg table is consistent with the VPS listing..." errors = srv2cfg_consistent_with_vps_list(srv2cfg, cache) except: alert_exception("trying to check consistency between srv->cfg and all_vpss") errors = [] try: print "Check that we don't have duplicate names" errors.extend(no_duplicate_names(cache)) except: alert_exception("trying to check for duplicate VPS names") print "Checking that configs start with a newline..." errors.extend(configs_start_with_newline(srv2cfg)) regions = redis_shell.smembers('user-regions') print "Checking that slice server entries are in srv->cfg..." for region in regions: print " (region %s)..." % region errors.extend(slice_srvs_in_srv2cfg(region, srv2cfg)) print "Checking server queue size..." for region in regions: print " (region %s)..." % region errors.extend(srvq_size(region)) print "Checking server queue integrity..." for region in regions: print " (region %s)..." % region try: errors.extend(srvq_integrity(region, cache=cache)) except: alert_exception("trying to check server queue integrity") print "Check that regional fallbacks and honeypots are in srv->cfg..." for region in regions: print " (region %s)..." % region errors.extend(fallbacks_and_honeypots_in_srv_table(region, srv2cfg)) report(errors)
def run():
    """Main daemon loop: serve VPS launch requests from the redis request
    queue, fanning each launch out to a worker process, and quarantine or
    register the resulting VPSs depending on whether they came up blocked.

    Runs forever; polls every 10 seconds.
    """
    qname = QPREFIX + ":srvreqq"
    print "Serving queue", qname, ", MAXPROCS:", repr(MAXPROCS)
    quarantine = CM + ":quarantined_vpss"
    reqq = redisq.Queue(qname, redis_shell, LAUNCH_TIMEOUT)
    # Worker processes push their launch results into this local queue.
    procq = multiprocessing.Queue()
    # reqid -> {'name', 'proc', 'starttime', 'remove_req'} for in-flight launches.
    pending = {}
    def kill_task(reqid):
        # Terminate the launcher process and destroy its VPS in a detached
        # daemon process so the main loop is not blocked on the teardown.
        print "Killing timed out process and vps..."
        task = pending.pop(reqid)
        task['proc'].terminate()
        proc = multiprocessing.Process(target=vps_shell.destroy_vps,
                                       args=(task['name'],))
        proc.daemon = True
        proc.start()
    while True:
        # If the request queue is totally empty (no tasks enqueued or even in
        # progress), flush the quarantine queue into the destroy queue.
        if redis_shell.llen(qname) == 1:  # 1 for the redisq sentinel entry
            names = redis_shell.smembers(quarantine)
            if names:
                print "Flushing %s VPSs from quarantine." % len(names)
                p = redis_shell.pipeline()
                p.srem(quarantine, *names)
                p.lpush(CM + ":destroyq", *names)
                p.execute()
        # Drain all results the workers have produced since the last pass.
        while not procq.empty():
            try:
                result = procq.get(False)
                print "Got result:", result
                task = pending.get(result['reqid'])
                # Only act if this result matches the task we think is
                # in flight for that request id (guards against stale results).
                if task and task['name'] == result['name']:
                    p = redis_shell.pipeline()
                    if result['blocked']:
                        # VPS came up blocked: quarantine it rather than
                        # serving it to users.
                        print "Quarantining %(name)s (%(ip)s)." % result
                        p.sadd(quarantine, result['name'])
                        p.incr(CM + ":blocked_vps_count")  # stats
                        # We'll remove the original request anyway because we
                        # don't want it to stay around until timeout. Insert a
                        # new one to replace it instead.
                        reqid = redis_shell.incr('srvcount')
                        p.lpush(qname, reqid)
                    else:
                        p.incr(CM + ":unblocked_vps_count")  # stats
                        del pending[result['reqid']]
                        # Hand the new server's config to the destination
                        # queue and register it as live.
                        vps_util.enqueue_cfg(result['name'],
                                             result['access_data'],
                                             result['srvq'])
                        register_vps(task['name'])
                    # Remove the original request in both branches (see the
                    # "anyway" note above), then flush the pipeline atomically.
                    task['remove_req'](p)
                    p.execute()
            except Empty:
                # procq.empty() said there was an item but get(False) found
                # none; presumably a benign race with the worker processes.
                print "Wat?"
                break
        if len(pending) < MAXPROCS:
            req_string, remover = reqq.next_job()
            if req_string:
                print "Got request", req_string
                req = json.loads(req_string)
                if isinstance(req, int):
                    # Transition: support the old format while we are updating
                    # the config server etc.
                    req = {'id': req, 'srvq': QPREFIX + ':srvq'}
                    req_string = json.dumps(req)
                reqid = req['id']
                if reqid in pending:
                    # redisq re-delivered a request we're already working on;
                    # assume the old attempt timed out and start over.
                    print "Killing task %s because of queue timeout" % reqid
                    kill_task(reqid)
                name = new_proxy_name(req)
                proc = multiprocessing.Process(target=launch_one_server,
                                               args=(procq, reqid, name,
                                                     req_string))
                proc.daemon = True
                pending[reqid] = {
                    'name': name,
                    'proc': proc,
                    'starttime': time.time(),
                    'remove_req': remover}
                print "Starting process to launch", name
                proc.start()
        else:
            # Since we're not checking the queue when we've maxed out our
            # processes, we need to manually check for expired tasks.
            for reqid, d in pending.items():
                if time.time() - d['starttime'] > LAUNCH_TIMEOUT:
                    print "Killing task %s because of local timeout" % reqid
                    kill_task(reqid)
        time.sleep(10)
def regions():
    """Return the set of user regions currently registered in redis."""
    region_set = redis_shell.smembers('user-regions')
    return region_set
import json import subprocess import yaml from alert import alert from redis_util import redis_shell prefix = 'fallbacks-to-check' try: local_version = file(prefix + '-version').read() except IOError: local_version = None remote_version = redis_shell.get('srvcount') if local_version != remote_version: suppress = redis_shell.smembers('checkfallbacks-suppress') json.dump([yaml.load(cfg).values()[0] for srv, cfg in redis_shell.hgetall('srv->cfg').iteritems() if srv not in suppress], file(prefix + '.json', 'w')) file(prefix + '-version', 'w').write(remote_version) cmd = subprocess.Popen("checkfallbacks -fallbacks %s.json -connections 20 | grep '\[failed fallback check\]'" % prefix, shell=True, stdout=subprocess.PIPE) errors = list(cmd.stdout) if errors: for error in errors: print error alert(type='checkfallbacks-failures', details={'errors': errors},