def add_instances(args):
    """Add worker instances to the cluster selected by ``args``.

    Reads ``args.name``, ``args.ec2_region``, ``args.num_instances`` and
    ``args.spot_price``. Workers are only added when the cluster lists
    exactly one manager instance and ``args.num_instances`` is at least 1.

    Returns 1 (after printing an explanation) when one of those
    preconditions is not met; otherwise returns None once
    ``cluster.add_workers`` has been called and the status printer closed.
    """
    cluster = Cluster(args.name, args.ec2_region)
    instances = cluster.list_instances()
    manager_count = len(instances.manager)

    if manager_count == 0:
        print("No manager instance is running. Can't add workers.")
        return 1

    if manager_count > 1:
        print("There is more than one Manager instance. Can't add workers. Managers:")
        for manager in instances.manager:
            print_instance(manager)
        return 1

    if args.num_instances < 1:
        print("--num-instances must be greater or equal to 1.")
        return 1

    # Progress events raised while adding workers are routed to the printer.
    progress = StatusPrinter()
    cluster.add_workers(instances.manager[0],
                        args.num_instances,
                        args.spot_price,
                        progress.on_event)
    progress.done()
def stop_instances(args):
    """Stop (or terminate) every instance of the cluster selected by ``args``.

    Prints the manager and worker instances that are about to be affected,
    then any unfulfilled spot instance requests that will be cancelled, and
    finally calls ``cluster.stop`` with the listed instances and
    ``args.terminate``.
    """
    cluster = Cluster(args.name, args.ec2_region)
    instances = cluster.list_instances()
    count = len(instances.manager) + len(instances.workers)

    if count != 0:
        if args.terminate:
            verb = 'Terminating'
        else:
            verb = 'Stopping'
        print('%s %d instances:' % (verb, count))
        # Workers are listed before managers.
        for instance in itertools.chain(instances.workers, instances.manager):
            print_instance(instance)
    else:
        print("No running instances to stop")

    pending = instances.unfulfilled
    if pending:
        print("Cancelling %d unfulfilled spot instance requests:" % len(pending))
        for request in pending:
            print_spot_request(request)

    cluster.stop(instances, args.terminate)
def worker_load(args):
    """Open a tmux session with one pane per cluster instance.

    Every pane is sent an ``ssh ubuntu@<ip>`` command (using the identity
    file resolved from ``args.identity_file``) that runs either
    ``tail -f`` on the worker log when ``args.logs`` is truthy, or installs
    and starts ``htop`` otherwise. The tmux session is named after the
    ``USER`` environment variable and is attached to at the end.
    """
    if args.logs:
        cmd_to_run = 'tail -f /mnt/ufora/logs/ufora-worker.log'
    else:
        cmd_to_run = 'sudo apt-get install htop\\; htop'

    cluster = Cluster(args.name, args.ec2_region)
    listing = cluster.list_instances()
    instances = listing.manager + listing.workers
    identity_file = get_identity_file(args.identity_file)
    session = os.getenv("USER")

    def sh(cmd, **kwargs):
        # Expand {SESSION} and any extra placeholders, echo the command, then
        # run it through the shell. A non-zero exit status is reported with a
        # traceback but does not abort the remaining steps.
        try:
            expanded = cmd.format(SESSION=session, **kwargs)
            print("CMD = %s" % expanded)
            subprocess.check_output(expanded, shell=True)
        except subprocess.CalledProcessError:
            import traceback
            traceback.print_exc()

    # Start from a fresh detached session.
    sh("tmux -2 kill-session -t {SESSION}")
    sh("tmux -2 new-session -d -s {SESSION}")

    # Setup a window for tailing log files
    sh("tmux new-window -t {SESSION}:1 -n 'pyfora_htop'")

    pane_total = len(instances)

    # Vertical splits first, then a horizontal split of the leading panes.
    for _ in xrange((pane_total - 1) // 2):
        sh("tmux split-window -v -t 0 -l 20")

    for ix in xrange(pane_total // 2):
        sh("tmux split-window -h -t {ix}", ix=ix)

    # Type the ssh command into each pane and press Enter (C-m).
    for ix, instance in enumerate(instances):
        ssh_command = "ssh ubuntu@%s -t -i %s %s" % (
            instance.ip_address, identity_file, cmd_to_run)
        sh('tmux send-keys -t {ix} "' + ssh_command + '" C-m', ix=ix)

    # Attach to session
    sh('tmux -2 attach-session -t {SESSION}')
def worker_logs(args):
    """Grep the worker log on every cluster instance and print the matches.

    Builds a ``tac | grep`` command from ``args.N``, ``args.A``, ``args.B``
    and ``args.expression``, runs it for each manager and worker instance
    via ``ssh_output`` (fanned out with ``parallel_for``), and prints every
    output line prefixed with the padded IP address of its instance.
    """
    cluster = Cluster(args.name, args.ec2_region)
    listing = cluster.list_instances()
    instances = listing.manager + listing.workers
    identity_file = get_identity_file(args.identity_file)

    def grep(instance):
        # The log is fed to grep reversed (tac), so the values for grep's
        # -B and -A options are deliberately swapped: args.A goes to -B and
        # args.B goes to -A.
        template = ('"source ufora_setup.sh; tac \\$LOG_DIR/logs/ufora-worker.log '
                    '| grep -m %s -B %s -A %s -e %s" | tac')
        command = template % (args.N, args.A, args.B, args.expression)
        prefix = pad(instance.ip_address + "> ", 25)
        output = ssh_output(identity_file, instance.ip_address, command)
        return (prefix, output)

    for prefix, output in parallel_for(instances, grep):
        for line in output.split("\n"):
            print("%s %s" % (prefix, line))
def list_instances(args):
    """Print the instances and pending spot requests of a cluster.

    The cluster is looked up from ``args.name`` and
    ``get_region(args.ec2_region)``. Output is a count line, followed by the
    manager instance(s), then the workers, and finally any unfulfilled spot
    instance requests. A warning is printed when more than one manager is
    listed.
    """
    cluster = Cluster(args.name, get_region(args.ec2_region))
    instances = cluster.list_instances()

    count = len(instances.workers)
    if instances.manager:
        count += len(instances.manager)

    if len(instances.manager) > 1:
        print("Something is wrong! This cluster has more than one manager!")

    # "1 instance:", "3 instances:", or "0 instances" (no colon when empty).
    plural = '' if count == 1 else 's'
    colon = ':' if count > 0 else ''
    print("%d instance%s%s" % (count, plural, colon))

    for manager in instances.manager:
        print_instance(manager)
    for worker in instances.workers:
        print_instance(worker)

    pending = instances.unfulfilled
    if pending:
        print("")
        pending_count = len(pending)
        pending_plural = '' if pending_count == 1 else 's'
        print("%d unfulfilled spot instance request%s:" % (pending_count, pending_plural))
        for request in pending:
            print_spot_request(request)
def deploy_package(args):
    """Upload ``args.package`` to every instance and update the service.

    Lists the running manager and worker instances, uploads the package to
    all of them with ``upload_package``, and then calls
    ``update_ufora_service``. Each step returns one result per instance; a
    result that is a string is treated as a failure message. When a step has
    any failure, the failures are printed and the function returns without
    running the next step.
    """
    cluster = Cluster(args.name, args.ec2_region)
    listing = cluster.list_instances()
    instances = listing.manager + listing.workers

    if not instances:
        print("No running instances")
        return

    print("Running instances:")
    for instance in instances:
        print_instance(instance)
    print('')

    def report_failures(results, headline):
        # A string result is a failure message for the instance at the same
        # index. Returns True (after printing) when at least one is present.
        failed = [isinstance(result, basestring) for result in results]
        if not any(failed):
            return False
        print(headline)
        for ix, has_failed in enumerate(failed):
            if has_failed:
                print("%s | %s : %s" % (instances[ix].id,
                                        instances[ix].ip_address,
                                        results[ix]))
        return True

    print("Uploading package...")
    upload_results = upload_package(args.package,
                                    instances,
                                    get_identity_file(args.identity_file))
    if report_failures(upload_results, "Failed to upload package:"):
        return
    print("Package uploaded successfully")
    print('')

    print("Updating service...")
    update_results = update_ufora_service(instances,
                                          get_identity_file(args.identity_file))
    if report_failures(update_results, "Failed to update service:"):
        return
    print("Service updated successfully")
def restart_instances(args):
    """Restart the ufora container on every instance of the cluster.

    For each manager and worker instance (fanned out with ``parallel_for``),
    runs a command over ``ssh_output`` that stops the container, removes
    everything under ``$LOG_DIR``, and starts the container again. The
    container is ``ufora_manager`` when the instance's ``Name`` tag contains
    ``'manager'`` and ``ufora_worker`` otherwise. Command output is printed
    line by line, prefixed with the padded IP address of the instance.
    """
    cluster = Cluster(args.name, args.ec2_region)
    listing = cluster.list_instances()
    instances = listing.manager + listing.workers
    identity_file = get_identity_file(args.identity_file)

    # The command is identical for both roles except for the container name.
    command_template = ('"source ufora_setup.sh; \\$DOCKER stop %s; '
                        'sudo rm -rf \\$LOG_DIR/*; \\$DOCKER start %s"')

    def restart_instance(instance):
        if 'manager' in instance.tags.get('Name', ''):
            container = 'ufora_manager'
        else:
            container = 'ufora_worker'
        command = command_template % (container, container)
        prefix = pad(instance.ip_address + "> ", 25)
        output = ssh_output(identity_file, instance.ip_address, command)
        return (prefix, output)

    for prefix, output in parallel_for(instances, restart_instance):
        for line in output.split("\n"):
            print("%s %s" % (prefix, line))