def run():
    """Serve node-churn commands and restart-count queries forever.

    Binds a REP socket on port 7000 (restart-count queries) and a PULL
    socket on port 7001 (churn commands), waits for the kubecfg file to
    appear, then polls both sockets in an endless loop.

    Churn messages are colon-separated: ``add:<num>:<ntype>`` or
    ``remove:<ip>:<ntype>``. Restart queries carry the pod IP in their
    second colon-separated field; the reply is the restart count of the
    pod's first container, sent as a string.
    """
    import time  # local import: only needed for the kubecfg wait below

    context = zmq.Context(1)

    restart_pull_socket = context.socket(zmq.REP)
    restart_pull_socket.bind('tcp://*:7000')

    churn_pull_socket = context.socket(zmq.PULL)
    churn_pull_socket.bind('tcp://*:7001')

    poller = zmq.Poller()
    poller.register(restart_pull_socket, zmq.POLLIN)
    poller.register(churn_pull_socket, zmq.POLLIN)

    # Waits until the kubecfg file gets copied into the pod -- this might be
    # brittle if we try to move to a non-Ubuntu setting, but I'm not worried
    # about that for now. Sleep between checks so the wait does not pin a
    # CPU core at 100%.
    while not os.path.isfile('/root/.kube/config'):
        time.sleep(0.5)

    client = util.init_k8s()

    while True:
        socks = dict(poller.poll())

        if socks.get(churn_pull_socket) == zmq.POLLIN:
            msg = churn_pull_socket.recv_string()
            args = msg.split(':')

            if args[0] == 'add':
                num = int(args[1])
                ntype = args[2]
                logging.info('Adding %d new %s node(s)...' % (num, ntype))

                mon_ips = util.get_pod_ips(client, 'role=monitoring')
                route_ips = util.get_pod_ips(client, 'role=routing')

                add_nodes(client, [ntype], [num], mon_ips, route_ips)
                logging.info('Successfully added %d %s node(s).' %
                             (num, ntype))
            elif args[0] == 'remove':
                ip = args[1]
                ntype = args[2]

                remove_node(ip, ntype)
                logging.info('Successfully removed node %s.' % (ip))

        if socks.get(restart_pull_socket) == zmq.POLLIN:
            msg = restart_pull_socket.recv_string()
            args = msg.split(':')

            ip = args[1]
            pod = util.get_pod_from_ip(client, ip)

            count = str(pod.status.container_statuses[0].restart_count)

            logging.info('Returning restart count ' + count + ' for IP ' +
                         ip + '.')
            restart_pull_socket.send_string(count)
def remove_node(ip, ntype):
    """Delete the pod and node at ``ip`` and shrink the ``ntype`` group by one.

    Args:
        ip: dotted IP address string of the pod to remove.
        ntype: node type; passed to ``util.get_previous_count`` and to
            ``./modify_ig.sh`` together with the decremented count.
    """
    client, _ = util.init_k8s()

    target_pod = util.get_pod_from_ip(client, ip)
    # Node name is built from the IP with dots replaced by dashes.
    node_name = 'ip-%s.ec2.internal' % (ip.replace('.', '-'))

    client.delete_namespaced_pod(
        name=target_pod.metadata.name,
        namespace=util.NAMESPACE,
        body=k8s.client.V1DeleteOptions())
    client.delete_node(name=node_name, body=k8s.client.V1DeleteOptions())

    # Resize the group to one fewer node than it previously had.
    remaining = util.get_previous_count(client, ntype) - 1
    util.run_process(['./modify_ig.sh', ntype, str(remaining)])
def run():
    """Apply node add/remove requests received over local ipc sockets.

    Two PULL sockets are bound: ``ipc:///tmp/node_add`` carries
    ``<ntype>:<num>`` messages and ``ipc:///tmp/node_remove`` carries
    ``<ntype>:<ip>`` messages. The loop polls both with a one-second
    timeout and never returns.
    """
    context = zmq.Context(1)
    client = util.init_k8s()

    node_add_socket = context.socket(zmq.PULL)
    node_add_socket.bind('ipc:///tmp/node_add')

    node_remove_socket = context.socket(zmq.PULL)
    node_remove_socket.bind('ipc:///tmp/node_remove')

    poller = zmq.Poller()
    for sock in (node_add_socket, node_remove_socket):
        poller.register(sock, zmq.POLLIN)

    cfile = '/fluent/conf/kvs-base.yml'

    while True:
        ready = dict(poller.poll(timeout=1000))

        if ready.get(node_add_socket) == zmq.POLLIN:
            fields = node_add_socket.recv_string().split(':')
            ntype = fields[0]
            num = int(fields[1])
            logging.info('Adding %d new %s node(s)...' % (num, ntype))

            # Look up the current cluster layout before launching nodes.
            mon_ips = util.get_pod_ips(client, 'role=monitoring')
            route_ips = util.get_pod_ips(client, 'role=routing')
            scheduler_ips = util.get_pod_ips(client, 'role=scheduler')
            route_addr = util.get_service_address(client, 'routing-service')

            add_nodes(client, cfile, [ntype], [num], mon_ips,
                      route_ips=route_ips, route_addr=route_addr,
                      scheduler_ips=scheduler_ips)
            logging.info('Successfully added %d %s node(s).' % (num, ntype))

        if ready.get(node_remove_socket) == zmq.POLLIN:
            fields = node_remove_socket.recv_string().split(':')
            ntype = fields[0]
            ip = fields[1]

            remove_node(ip, ntype)
            logging.info('Successfully removed node %s.' % (ip))
def main():
    """Relay hash-ring join and depart notifications to ``router_broadcast``.

    Binds one PULL socket on ``ru.MNG_JOIN_PORT`` and one on
    ``ru.MNG_DEPART_PORT``, then loops forever: each raw message received is
    passed to ``router_broadcast`` tagged ``'join'`` or ``'depart'``.
    """
    client, _ = util.init_k8s()

    context = zmq.Context()

    # Sockets for hash ring membership changes
    rtr_join_sock = context.socket(zmq.PULL)
    rtr_join_sock.bind('tcp://*:%s' % (str(ru.MNG_JOIN_PORT)))

    rtr_depart_sock = context.socket(zmq.PULL)
    rtr_depart_sock.bind('tcp://*:%s' % (str(ru.MNG_DEPART_PORT)))

    poller = zmq.Poller()
    poller.register(rtr_join_sock, zmq.POLLIN)
    # The depart socket must be registered too; otherwise poll() never
    # reports it and departure messages are silently never processed.
    poller.register(rtr_depart_sock, zmq.POLLIN)

    while True:
        socks = dict(poller.poll())

        if socks.get(rtr_join_sock) == zmq.POLLIN:
            logging.info('Received join')
            msg = rtr_join_sock.recv()
            router_broadcast(client, 'join', msg)

        if socks.get(rtr_depart_sock) == zmq.POLLIN:
            logging.info('Received depart')
            msg = rtr_depart_sock.recv()
            router_broadcast(client, 'depart', msg)
def create_cluster(mem_count, ebs_count, func_count, sched_count, route_count,
                   bench_count, cfile, ssh_key, cluster_name, kops_bucket,
                   aws_key_id, aws_key):
    """Create the cluster object, then launch every pod and service in order.

    Args:
        mem_count, ebs_count, func_count, sched_count, route_count,
        bench_count: number of memory, ebs, function, scheduler, routing and
            benchmark nodes to add.
        cfile: path of the config file handed to ``add_nodes`` and copied
            into the kops and monitoring pods.
        ssh_key: path of the private SSH key; ``ssh_key + '.pub'`` is copied
            to the kops pod as well.
        cluster_name: cluster name; also used to find the ``nodes.<name>``
            security group.
        kops_bucket: value for ``KOPS_STATE_STORE``.
        aws_key_id, aws_key: AWS credentials injected into the kops pod
            environment.
    """
    import shutil  # local import: replaces shell `cp` below

    # create the cluster object with kops
    util.run_process(
        ['./create_cluster_object.sh', cluster_name, kops_bucket, ssh_key])

    client, apps_client = util.init_k8s()

    # create the kops pod
    print('Creating management pods...')
    kops_spec = util.load_yaml('yaml/pods/kops-pod.yml')
    env = kops_spec['spec']['containers'][0]['env']

    util.replace_yaml_val(env, 'AWS_ACCESS_KEY_ID', aws_key_id)
    util.replace_yaml_val(env, 'AWS_SECRET_ACCESS_KEY', aws_key)
    util.replace_yaml_val(env, 'KOPS_STATE_STORE', kops_bucket)
    util.replace_yaml_val(env, 'FLUENT_CLUSTER_NAME', cluster_name)

    client.create_namespaced_pod(namespace=util.NAMESPACE, body=kops_spec)

    # wait for the kops pod to start
    kops_ip = util.get_pod_ips(client, 'role=kops', is_running=True)[0]

    # copy kube config file to kops pod, so it can execute kubectl commands
    kops_podname = kops_spec['metadata']['name']
    kcname = kops_spec['spec']['containers'][0]['name']

    # Copy with shutil rather than a shell string so paths containing spaces
    # or shell metacharacters work and a failed copy raises immediately.
    shutil.copy(cfile, 'kvs-config.yml')
    util.copy_file_to_pod(client, '/home/ubuntu/.kube/config', kops_podname,
                          '/root/.kube/', kcname)
    util.copy_file_to_pod(client, ssh_key, kops_podname, '/root/.ssh/',
                          kcname)
    util.copy_file_to_pod(client, ssh_key + '.pub', kops_podname,
                          '/root/.ssh/', kcname)
    util.copy_file_to_pod(client, 'kvs-config.yml', kops_podname,
                          '/fluent/conf/', kcname)

    # start the monitoring pod
    mon_spec = util.load_yaml('yaml/pods/monitoring-pod.yml')
    util.replace_yaml_val(mon_spec['spec']['containers'][0]['env'], 'MGMT_IP',
                          kops_ip)
    client.create_namespaced_pod(namespace=util.NAMESPACE, body=mon_spec)

    # Return value unused; presumably this call blocks until the pod has an
    # IP -- TODO confirm against util.get_pod_ips.
    util.get_pod_ips(client, 'role=monitoring')

    # copy config file into monitoring pod -- wait till we create routing
    # pods, so we're sure that the monitoring nodes are up and running
    util.copy_file_to_pod(client, 'kvs-config.yml',
                          mon_spec['metadata']['name'], '/fluent/conf/',
                          mon_spec['spec']['containers'][0]['name'])
    os.remove('kvs-config.yml')

    print('Creating %d routing nodes...' % (route_count))
    add_nodes(client, apps_client, cfile, ['routing'], [route_count], True)
    util.get_pod_ips(client, 'role=routing')

    print('Creating %d memory, %d ebs node(s)...' % (mem_count, ebs_count))
    add_nodes(client, apps_client, cfile, ['memory', 'ebs'],
              [mem_count, ebs_count], True)

    print('Creating routing service...')
    service_spec = util.load_yaml('yaml/services/routing.yml')
    client.create_namespaced_service(namespace=util.NAMESPACE,
                                     body=service_spec)

    print('Adding %d scheduler nodes...' % (sched_count))
    add_nodes(client, apps_client, cfile, ['scheduler'], [sched_count], True)
    util.get_pod_ips(client, 'role=scheduler')

    print('Adding %d function serving nodes...' % (func_count))
    add_nodes(client, apps_client, cfile, ['function'], [func_count], True)

    print('Creating function service...')
    service_spec = util.load_yaml('yaml/services/function.yml')
    client.create_namespaced_service(namespace=util.NAMESPACE,
                                     body=service_spec)

    print('Adding %d benchmark nodes...' % (bench_count))
    add_nodes(client, apps_client, cfile, ['benchmark'], [bench_count], True)

    print('Finished creating all pods...')

    # Drop an empty marker file into the kops pod (equivalent of `touch`).
    with open('setup_complete', 'a'):
        pass
    util.copy_file_to_pod(client, 'setup_complete', kops_podname, '/fluent',
                          kcname)
    os.remove('setup_complete')

    sg_name = 'nodes.' + cluster_name
    sg = ec2_client.describe_security_groups(Filters=[{
        'Name': 'group-name',
        'Values': [sg_name]
    }])['SecurityGroups'][0]

    print('Authorizing ports for routing service...')

    permission = [{
        'FromPort': 6200,
        'IpProtocol': 'tcp',
        'ToPort': 6203,
        'IpRanges': [{
            'CidrIp': '0.0.0.0/0'
        }]
    }]

    ec2_client.authorize_security_group_ingress(GroupId=sg['GroupId'],
                                                IpPermissions=permission)

    routing_svc_addr = util.get_service_address(client, 'routing-service')
    function_svc_addr = util.get_service_address(client, 'function-service')
    print('The routing service can be accessed here: \n\t%s' %
          (routing_svc_addr))
    print('The function service can be accessed here: \n\t%s' %
          (function_svc_addr))
def run():
    """Run the management server loop for churn, status and statistics.

    Bound sockets (all ``tcp://*``):
        7000 REP  -- restart-count queries (pod IP in the second ``:`` field)
        7001 PULL -- churn commands ``add:<num>:<ntype>`` /
                     ``remove:<ip>:<ntype>``
        7002 REP  -- list of ``role=function`` pod IPs, as a KeySet
        7003 PULL -- serialized ThreadStatus updates
        7004 REP  -- list of ``role=scheduler`` pod IPs, as a KeySet
        7005 PULL -- executor departure notices (the executor's IP)
        7006 PULL -- serialized ExecutorStatistics

    Churn commands are re-encoded as ``<ntype>:<num>`` / ``<ntype>:<ip>`` and
    pushed to ``ipc:///tmp/node_add`` / ``ipc:///tmp/node_remove``. Whenever
    more than ``REPORT_PERIOD`` seconds have passed since the last report,
    the hash-ring, unused-node, utilization and function-load checks run.
    The function never returns.
    """
    context = zmq.Context(1)

    restart_pull_socket = context.socket(zmq.REP)
    restart_pull_socket.bind('tcp://*:7000')

    churn_pull_socket = context.socket(zmq.PULL)
    churn_pull_socket.bind('tcp://*:7001')

    list_executors_socket = context.socket(zmq.REP)
    list_executors_socket.bind('tcp://*:7002')

    function_status_socket = context.socket(zmq.PULL)
    function_status_socket.bind('tcp://*:7003')

    list_schedulers_socket = context.socket(zmq.REP)
    list_schedulers_socket.bind('tcp://*:7004')

    executor_depart_socket = context.socket(zmq.PULL)
    executor_depart_socket.bind('tcp://*:7005')

    executor_statistics_socket = context.socket(zmq.PULL)
    executor_statistics_socket.bind('tcp://*:7006')

    poller = zmq.Poller()
    poller.register(restart_pull_socket, zmq.POLLIN)
    poller.register(churn_pull_socket, zmq.POLLIN)
    poller.register(function_status_socket, zmq.POLLIN)
    poller.register(list_executors_socket, zmq.POLLIN)
    poller.register(list_schedulers_socket, zmq.POLLIN)
    poller.register(executor_depart_socket, zmq.POLLIN)
    poller.register(executor_statistics_socket, zmq.POLLIN)

    add_push_socket = context.socket(zmq.PUSH)
    add_push_socket.connect('ipc:///tmp/node_add')

    remove_push_socket = context.socket(zmq.PUSH)
    remove_push_socket.connect('ipc:///tmp/node_remove')

    # Waits until the kubecfg file gets copied into the pod -- this might be
    # brittle if we try to move to a non-Ubuntu setting, but I'm not worried
    # about that for now. Sleep between checks so the wait does not pin a
    # CPU core at 100%.
    while not os.path.isfile('/root/.kube/config'):
        time.sleep(0.5)

    client = util.init_k8s()

    # track the self-reported status of each function execution thread
    executor_statuses = {}
    departing_executors = {}
    function_frequencies = {}
    function_runtimes = {}
    latency_history = {}

    start = time.time()
    while True:
        socks = dict(poller.poll(timeout=1000))

        if socks.get(churn_pull_socket) == zmq.POLLIN:
            msg = churn_pull_socket.recv_string()
            args = msg.split(':')

            # Forward as '<ntype>:<value>'. The separator is required: the
            # same format is used below for 'function:<ip>' removals.
            if args[0] == 'add':
                add_push_socket.send_string(args[2] + ':' + args[1])
            elif args[0] == 'remove':
                remove_push_socket.send_string(args[2] + ':' + args[1])

        if socks.get(restart_pull_socket) == zmq.POLLIN:
            msg = restart_pull_socket.recv_string()
            args = msg.split(':')

            ip = args[1]
            pod = util.get_pod_from_ip(client, ip)

            count = str(pod.status.container_statuses[0].restart_count)

            logging.info('Returning restart count %s for IP %s.' %
                         (count, ip))
            restart_pull_socket.send_string(count)

        if socks.get(list_executors_socket) == zmq.POLLIN:
            # it doesn't matter what is in this message
            list_executors_socket.recv()

            ks = KeySet()
            for ip in util.get_pod_ips(client, 'role=function'):
                ks.keys.append(ip)

            list_executors_socket.send(ks.SerializeToString())

        if socks.get(function_status_socket) == zmq.POLLIN:
            status = ThreadStatus()
            status.ParseFromString(function_status_socket.recv())

            key = (status.ip, status.tid)

            # If this executor is one of the ones that's currently departing,
            # we can just ignore its status updates since we don't want
            # utilization to be skewed downwards. This loop indexes
            # departing_executors by IP, so test the IP; the tuple key is
            # still checked in case another writer uses it.
            # NOTE(review): confirm how check_executor_utilization keys
            # departing_executors.
            departing = (status.ip in departing_executors
                         or key in departing_executors)

            if not departing:
                executor_statuses[key] = status
                logging.info(('Received thread status update from %s:%d: ' +
                              '%.4f occupancy, %d functions pinned') %
                             (status.ip, status.tid, status.utilization,
                              len(status.functions)))

        if socks.get(list_schedulers_socket) == zmq.POLLIN:
            # It doesn't matter what is in this message
            list_schedulers_socket.recv()

            ks = KeySet()
            for ip in util.get_pod_ips(client, 'role=scheduler'):
                ks.keys.append(ip)

            list_schedulers_socket.send(ks.SerializeToString())

        if socks.get(executor_depart_socket) == zmq.POLLIN:
            ip = executor_depart_socket.recv_string()

            if ip not in departing_executors:
                # An unexpected notice must not crash the server.
                logging.warning('Ignoring departure notice from untracked '
                                'IP %s.' % (ip))
            else:
                departing_executors[ip] -= 1

                # wait until all the executors on this IP have cleared their
                # queues and left; then we remove the node
                if departing_executors[ip] == 0:
                    remove_push_socket.send_string('function:' + ip)
                    del departing_executors[ip]

        if socks.get(executor_statistics_socket) == zmq.POLLIN:
            stats = ExecutorStatistics()
            stats.ParseFromString(executor_statistics_socket.recv())

            for fstats in stats.statistics:
                fname = fstats.fname

                if fname not in function_frequencies:
                    function_frequencies[fname] = 0
                    function_runtimes[fname] = 0.0

                function_frequencies[fname] += fstats.call_count
                function_runtimes[fname] += fstats.runtime

        end = time.time()
        if end - start > REPORT_PERIOD:
            logging.info('Checking hash ring...')
            check_hash_ring(client, context)

            logging.info('Checking for extra nodes...')
            check_unused_nodes(client, add_push_socket)

            check_executor_utilization(client, context, executor_statuses,
                                       departing_executors, add_push_socket)

            check_function_load(context, function_frequencies,
                                function_runtimes, executor_statuses,
                                latency_history)
            start = time.time()
import subprocess
import sys
import time

import util
from routing_util import register, deregister

# AWS Info
aws_key_id = util.check_or_get_env_arg('AWS_ACCESS_KEY_ID')
aws_key = util.check_or_get_env_arg('AWS_SECRET_ACCESS_KEY')

# Config File Info
BASE_CONFIG_FILE = '../config/tasc-base.yml'
CONFIG_FILE = './tasc-config.yml'
POD_CONFIG_DIR = '/go/src/github.com/saurav-c/tasc/config'

NODE_TYPES = ['tasc', 'keynode', 'routing', 'lb', 'worker', 'benchmark']

client, apps_client = util.init_k8s()


def main():
    """Dispatch the command-line sub-command given in ``sys.argv[1]``.

    Sub-commands handled here:
        send-conf <ip> [<conf>] -- calls ``sendConfig(ip, conf)``; ``conf``
            is ``None`` when the third argument is omitted.
        add <ntype> <count>     -- parses ``count`` as an int and reports an
            unknown ``ntype`` (one not in ``NODE_TYPES``).
    """
    # `sys` was used here without ever being imported; see the import block.
    args = sys.argv[1:]
    cmd = args[0]

    if cmd == 'send-conf':
        ip = args[1]
        conf = args[2] if len(args) > 2 else None
        sendConfig(ip, conf)
    elif cmd == 'add':
        ntype = args[1]
        # NOTE(review): count is parsed but not used in the visible code.
        count = int(args[2])
        if ntype not in NODE_TYPES:
            print('Unknown node type: ' + ntype)
def create_cluster(replica_count, gc_count, lb_count, bench_count, cfile,
                   ssh_key, cluster_name, kops_bucket, aws_key_id, aws_key):
    """Create the cluster object, then launch every Aft pod and service.

    Args:
        replica_count, gc_count, lb_count, bench_count: number of aft, gc,
            lb and benchmark nodes to add.
        cfile: path of the config file handed to ``add_nodes``.
        ssh_key: SSH key path passed to ``create_cluster_object.sh``.
        cluster_name: used to find the ``nodes.<name>`` security group.
        kops_bucket: passed to ``create_cluster_object.sh``.
        aws_key_id, aws_key: AWS credentials forwarded to ``add_nodes``.

    Side effects visible here: writes ``gcs.txt`` and ``replicas.txt`` in the
    working directory, copies them into pods, and removes them again.
    """
    import shutil  # local import: replaces shell `cp` below

    prefix = './'
    util.run_process(['./create_cluster_object.sh', kops_bucket, ssh_key])

    client, apps_client = util.init_k8s()

    print('Creating management pod')
    # Management pod creation is currently disabled. The original steps are
    # kept for reference:
    # management_spec = util.load_yaml('yaml/pods/management-pod.yml')
    # env = management_spec['spec']['containers'][0]['env']
    # util.replace_yaml_val(env, 'AWS_ACCESS_KEY_ID', aws_key_id)
    # util.replace_yaml_val(env, 'AWS_SECRET_ACCESS_KEY', aws_key)
    #
    # client.create_namespaced_pod(namespace=util.NAMESPACE,
    #                              body=management_spec)
    # management_ip = util.get_pod_ips(client, 'role=management',
    #                                  is_running=True)[0]
    # With creation disabled, management_spec must still be defined: it was
    # dereferenced further down and raised NameError.
    management_spec = None
    management_ip = ""

    print('Creating standby replicas...')
    util.run_process(['./modify_ig.sh', 'standby', '1'])
    util.run_process(['./validate_cluster.sh'])

    print('Creating %d load balancer, %d GC replicas...' %
          (lb_count, gc_count))
    add_nodes(client, apps_client, cfile, ['lb', 'gc'], [lb_count, gc_count],
              management_ip, aws_key_id, aws_key, True, prefix)

    lb_pods = client.list_namespaced_pod(namespace=util.NAMESPACE,
                                         label_selector="role=lb").items
    kubecfg = os.path.join(os.environ['HOME'], '.kube/config')
    for pod in lb_pods:
        util.copy_file_to_pod(client, kubecfg, pod.metadata.name,
                              '/root/.kube', 'lb-container')

    replica_ips = util.get_node_ips(client, 'role=gc', 'ExternalIP')
    with open('gcs.txt', 'w') as f:
        for ip in replica_ips:
            f.write(ip + '\n')

    print('Creating %d Aft replicas...' % (replica_count))
    add_nodes(client, apps_client, cfile, ['aft'], [replica_count],
              management_ip, aws_key_id, aws_key, True, prefix)
    # Return value unused; presumably blocks until the pods have IPs --
    # TODO confirm against util.get_pod_ips.
    util.get_pod_ips(client, 'role=aft')

    replica_ips = util.get_node_ips(client, 'role=aft', 'ExternalIP')
    with open('replicas.txt', 'w') as f:
        for ip in replica_ips:
            f.write(ip + '\n')

    # Copy config and address lists into the management pod, but only if
    # one was actually created above.
    if management_spec is not None:
        shutil.copy(cfile, 'aft-config.yml')
        management_pname = management_spec['metadata']['name']
        management_cname = management_spec['spec']['containers'][0]['name']
        util.copy_file_to_pod(client, 'aft-config.yml', management_pname,
                              '/go/src/github.com/tajshaik24/aft/config',
                              management_cname)
        util.copy_file_to_pod(client, 'replicas.txt', management_pname,
                              '/go/src/github.com/tajshaik24/aft',
                              management_cname)
        util.copy_file_to_pod(client, 'gcs.txt', management_pname,
                              '/go/src/github.com/tajshaik24/aft',
                              management_cname)
        util.copy_file_to_pod(client, kubecfg, management_pname,
                              '/root/.kube/', management_cname)
        os.remove('aft-config.yml')
    os.remove('gcs.txt')

    # Copy replicas.txt to all Aft pods.
    aft_pod_list = client.list_namespaced_pod(namespace=util.NAMESPACE,
                                              label_selector="role=aft").items
    for pname in [pod.metadata.name for pod in aft_pod_list]:
        util.copy_file_to_pod(client, 'replicas.txt', pname,
                              '/go/src/github.com/tajshaik24/aft',
                              'aft-container')

    # ... and to all GC pods.
    gc_pod_list = client.list_namespaced_pod(namespace=util.NAMESPACE,
                                             label_selector="role=gc").items
    for pname in [pod.metadata.name for pod in gc_pod_list]:
        util.copy_file_to_pod(client, 'replicas.txt', pname,
                              '/go/src/github.com/tajshaik24/aft',
                              'gc-container')

    os.remove('replicas.txt')

    print('Adding %d benchmark nodes...' % (bench_count))
    add_nodes(client, apps_client, cfile, ['benchmark'], [bench_count],
              management_ip, aws_key_id, aws_key, True, prefix)

    print('Finished creating all pods...')

    print('Creating Aft service...')
    service_spec = util.load_yaml('yaml/services/aft.yml', prefix)
    client.create_namespaced_service(namespace=util.NAMESPACE,
                                     body=service_spec)

    sg_name = 'nodes.' + cluster_name
    sg = ec2_client.describe_security_groups(Filters=[{
        'Name': 'group-name',
        'Values': [sg_name]
    }])['SecurityGroups'][0]

    print('Authorizing ports for Aft replicas...')
    permission = [{
        'FromPort': 7654,
        'IpProtocol': 'tcp',
        'ToPort': 7656,
        'IpRanges': [{
            'CidrIp': '0.0.0.0/0'
        }]
    }, {
        'FromPort': 7777,
        'IpProtocol': 'tcp',
        'ToPort': 7782,
        'IpRanges': [{
            'CidrIp': '0.0.0.0/0'
        }]
    }, {
        'FromPort': 8000,
        'IpProtocol': 'tcp',
        'ToPort': 8003,
        'IpRanges': [{
            'CidrIp': '0.0.0.0/0'
        }]
    }]

    ec2_client.authorize_security_group_ingress(GroupId=sg['GroupId'],
                                                IpPermissions=permission)
    print('Finished!')
def run():
    """Run the management server loop with function-occupancy autoscaling.

    Bound sockets (all ``tcp://*``):
        7000 REP  -- restart-count queries (pod IP in the second ``:`` field)
        7001 PULL -- churn commands ``add:<num>:<ntype>[:<threads>]`` /
                     ``remove:<ip>:<ntype>``
        7002 REP  -- list of ``role=function`` pod IPs, as a KeySet
        7003 PULL -- occupancy reports ``<ip>|<utilization>``

    Whenever more than ``THRESHOLD`` seconds have passed since the last
    check, the hash-ring and unused-node checks run, and one function node
    is added if the average reported occupancy exceeds ``FOCC_THRESHOLD``.
    The function never returns.
    """
    import subprocess  # local import: runs sed without going through a shell

    context = zmq.Context(1)

    restart_pull_socket = context.socket(zmq.REP)
    restart_pull_socket.bind('tcp://*:7000')

    churn_pull_socket = context.socket(zmq.PULL)
    churn_pull_socket.bind('tcp://*:7001')

    extant_caches_socket = context.socket(zmq.REP)
    extant_caches_socket.bind('tcp://*:7002')

    func_pull_socket = context.socket(zmq.PULL)
    func_pull_socket.bind('tcp://*:7003')

    poller = zmq.Poller()
    poller.register(restart_pull_socket, zmq.POLLIN)
    poller.register(churn_pull_socket, zmq.POLLIN)
    poller.register(func_pull_socket, zmq.POLLIN)
    poller.register(extant_caches_socket, zmq.POLLIN)

    cfile = '/fluent/conf/kvs-base.yml'

    # Waits until the kubecfg file gets copied into the pod -- this might be
    # brittle if we try to move to a non-Ubuntu setting, but I'm not worried
    # about that for now. Sleep between checks so the wait does not pin a
    # CPU core at 100%.
    while not os.path.isfile('/root/.kube/config'):
        time.sleep(0.5)

    client = util.init_k8s()

    func_occ_map = {}

    start = time.time()
    while True:
        socks = dict(poller.poll(timeout=1000))

        if socks.get(churn_pull_socket) == zmq.POLLIN:
            msg = churn_pull_socket.recv_string()
            args = msg.split(':')

            if args[0] == 'add':
                num = int(args[1])
                ntype = args[2]
                logging.info('Adding %d new %s node(s)...' % (num, ntype))

                # The optional fourth field arrives as text; it must be an
                # int for both the '%d' formatting and the '* 15' below.
                num_threads = int(args[3]) if len(args) > 3 else 3

                mon_ips = util.get_pod_ips(client, 'role=monitoring')
                route_ips = util.get_pod_ips(client, 'role=routing')

                # Rewrite '<ntype>: N' and '<ntype>-cap: N' in the config.
                # The replacement keeps the matched key; previously the
                # '-cap' entry was rewritten to a second '<ntype>: N' line.
                # NOTE(review): confirm '<ntype>-cap' is the intended key.
                # sed runs with an argument list (no shell) because ntype
                # comes straight from a network message.
                for key, value in ((ntype, num_threads),
                                   (ntype + '-cap', num_threads * 15)):
                    subprocess.call([
                        'sed', '-i',
                        's|%s: [0-9][0-9]*|%s: %d|g' % (key, key, value),
                        cfile
                    ])

                add_nodes(client, cfile, [ntype], [num], mon_ips, route_ips)
                logging.info('Successfully added %d %s node(s).' %
                             (num, ntype))
            elif args[0] == 'remove':
                ip = args[1]
                ntype = args[2]

                remove_node(ip, ntype)
                logging.info('Successfully removed node %s.' % (ip))

        if socks.get(restart_pull_socket) == zmq.POLLIN:
            msg = restart_pull_socket.recv_string()
            args = msg.split(':')

            ip = args[1]
            pod = util.get_pod_from_ip(client, ip)

            count = str(pod.status.container_statuses[0].restart_count)

            logging.info('Returning restart count ' + count + ' for IP ' +
                         ip + '.')
            restart_pull_socket.send_string(count)

        if socks.get(extant_caches_socket) == zmq.POLLIN:
            # It doesn't matter what is in this message
            extant_caches_socket.recv()

            ks = KeySet()
            for ip in util.get_pod_ips(client, 'role=function'):
                # Python protobuf repeated fields are appended to directly.
                ks.keys.append(ip)

            # SerializeToString() returns bytes, so use send() rather than
            # send_string().
            extant_caches_socket.send(ks.SerializeToString())

        if socks.get(func_pull_socket) == zmq.POLLIN:
            msg = func_pull_socket.recv_string()
            args = msg.split('|')
            ip, mutil = args[0], float(args[1])

            logging.info('Received node occupancy of %.2f%% from IP %s.' %
                         (mutil * 100, ip))

            func_occ_map[ip] = mutil

        end = time.time()
        if end - start > THRESHOLD:
            logging.info('Checking hash ring...')
            check_hash_ring(client, context)

            logging.info('Checking for extra nodes...')
            check_unused_nodes(client)

            if func_occ_map:
                avg_focc = sum(func_occ_map.values()) / len(func_occ_map)
            else:
                avg_focc = 0

            logging.info('Average node occupancy is %f%%...' %
                         (avg_focc * 100))

            if avg_focc > FOCC_THRESHOLD:
                mon_ips = util.get_pod_ips(client, 'role=monitoring')
                # NOTE(review): called through util, as the rest of this
                # function does for its helpers -- confirm no bare import
                # of get_service_address was intended.
                route_addr = util.get_service_address(client,
                                                      'routing-service')
                # cfile is passed positionally here to match the add_nodes
                # call in the churn handler above.
                add_nodes(client, cfile, ['function'], [1], mon_ips,
                          route_addr=route_addr)

            start = time.time()
def create_cluster(txn_count, keynode_count, rtr_count, worker_count,
                   lb_count, benchmark_count, config_file, branch_name,
                   ssh_key, cluster_name, kops_bucket, aws_key_id, aws_key,
                   anna_config_file):
    """Create the cluster object, then bring up every TASC tier in order.

    Node groups are launched as: monitor, routing, keynode, worker, tasc, lb,
    benchmark. The routing, worker and tasc services are created along the
    way. Afterwards the benchmark node IPs are written to
    ``../cmd/benchmark/benchmarks.txt``, ingress is authorized on the
    ``nodes.<cluster_name>`` security group, and the keynode pod IPs are
    passed to ``register``.

    Only routing nodes use ``anna_config_file``; every other tier uses
    ``config_file``.
    """
    prefix = './'
    util.run_process(['./create_cluster_object.sh', kops_bucket, ssh_key],
                     'kops')

    client, apps_client = util.init_k8s()

    def launch(cfg, kind, count):
        # Every tier is started with the same trailing arguments.
        add_nodes(client, apps_client, cfg, kind, count, aws_key_id, aws_key,
                  True, prefix, branch_name)

    def expose(spec_path):
        # Load a service spec and create it in the cluster namespace.
        spec = util.load_yaml(spec_path, prefix)
        client.create_namespaced_service(namespace=util.NAMESPACE, body=spec)

    print('Creating Monitor Node...')
    launch(config_file, "monitor", 1)

    print('Creating %d Anna Routing Nodes...' % rtr_count)
    launch(anna_config_file, "routing", rtr_count)

    print('Creating routing service...')
    expose('yaml/services/routing.yml')
    util.get_service_address(client, 'routing-service')

    print('Creating %d Key Nodes...' % keynode_count)
    launch(config_file, "keynode", keynode_count)

    print('Creating %d Worker Nodes...' % worker_count)
    launch(config_file, "worker", worker_count)

    print('Creating Worker Service...')
    expose('yaml/services/worker.yml')
    util.get_service_address(client, 'worker-service')

    print('Creating %d TASC nodes...' % txn_count)
    launch(config_file, 'tasc', txn_count)

    print('Creating %d Load Balancers...' % lb_count)
    launch(config_file, 'lb', lb_count)

    print('Creating TASC Load Balancing service...')
    expose('yaml/services/tasc.yml')

    print('Creating %d Benchmark nodes...' % benchmark_count)
    launch(config_file, 'benchmark', benchmark_count)

    # Record one benchmark node IP per line.
    benchmark_ips = util.get_node_ips(client, 'role=benchmark', 'ExternalIP')
    with open('../cmd/benchmark/benchmarks.txt', 'w+') as out:
        out.writelines(ip + '\n' for ip in benchmark_ips)

    print('Finished creating all pods...')

    group_filter = {'Name': 'group-name', 'Values': ['nodes.' + cluster_name]}
    sg = ec2_client.describe_security_groups(
        Filters=[group_filter])['SecurityGroups'][0]

    print("Authorizing Ports for TASC...")
    # NOTE(review): this opens every TCP port to 0.0.0.0/0 -- confirm that
    # is intended outside of test clusters.
    open_range = {
        'FromPort': 0,
        'IpProtocol': 'tcp',
        'ToPort': 65535,
        'IpRanges': [{'CidrIp': '0.0.0.0/0'}],
    }
    ec2_client.authorize_security_group_ingress(GroupId=sg['GroupId'],
                                                IpPermissions=[open_range])

    print('Registering Key Nodes...')
    keynode_pod_ips = util.get_pod_ips(client, 'role=keynode',
                                       is_running=True)
    register(client, keynode_pod_ips)

    endpoint = util.get_service_address(client, "tasc-service")
    print("\nThe TASC ELB Endpoint: " + endpoint + "\n")
    print('Finished!')
def run():
    """Run the management server loop with periodic cluster checks.

    Bound sockets (all ``tcp://*``):
        7000 REP  -- restart-count queries (pod IP in the second ``:`` field)
        7001 PULL -- churn commands ``add:<num>:<ntype>[:<threads>]`` /
                     ``remove:<ip>:<ntype>``

    Whenever more than ``THRESHOLD`` seconds have passed since the last
    check, the hash-ring and unused-node checks run. The function never
    returns.
    """
    import subprocess  # local import: runs sed without going through a shell

    context = zmq.Context(1)

    restart_pull_socket = context.socket(zmq.REP)
    restart_pull_socket.bind('tcp://*:7000')

    churn_pull_socket = context.socket(zmq.PULL)
    churn_pull_socket.bind('tcp://*:7001')

    poller = zmq.Poller()
    poller.register(restart_pull_socket, zmq.POLLIN)
    poller.register(churn_pull_socket, zmq.POLLIN)

    cfile = '/fluent/conf/kvs-base.yml'

    # Waits until the kubecfg file gets copied into the pod -- this might be
    # brittle if we try to move to a non-Ubuntu setting, but I'm not worried
    # about that for now. Sleep between checks so the wait does not pin a
    # CPU core at 100%.
    while not os.path.isfile('/root/.kube/config'):
        time.sleep(0.5)

    client = util.init_k8s()

    start = time.time()
    while True:
        socks = dict(poller.poll(timeout=1000))

        if socks.get(churn_pull_socket) == zmq.POLLIN:
            msg = churn_pull_socket.recv_string()
            args = msg.split(':')

            if args[0] == 'add':
                num = int(args[1])
                ntype = args[2]
                logging.info('Adding %d new %s node(s)...' % (num, ntype))

                # The optional fourth field arrives as text; it must be an
                # int for both the '%d' formatting and the '* 15' below.
                num_threads = int(args[3]) if len(args) > 3 else 3

                mon_ips = util.get_pod_ips(client, 'role=monitoring')
                route_ips = util.get_pod_ips(client, 'role=routing')

                # Rewrite '<ntype>: N' and '<ntype>-cap: N' in the config.
                # The replacement keeps the matched key; previously the
                # '-cap' entry was rewritten to a second '<ntype>: N' line.
                # NOTE(review): confirm '<ntype>-cap' is the intended key.
                # sed runs with an argument list (no shell) because ntype
                # comes straight from a network message.
                for key, value in ((ntype, num_threads),
                                   (ntype + '-cap', num_threads * 15)):
                    subprocess.call([
                        'sed', '-i',
                        's|%s: [0-9][0-9]*|%s: %d|g' % (key, key, value),
                        cfile
                    ])

                add_nodes(client, cfile, [ntype], [num], mon_ips, route_ips)
                logging.info('Successfully added %d %s node(s).' %
                             (num, ntype))
            elif args[0] == 'remove':
                ip = args[1]
                ntype = args[2]

                remove_node(ip, ntype)
                logging.info('Successfully removed node %s.' % (ip))

        if socks.get(restart_pull_socket) == zmq.POLLIN:
            msg = restart_pull_socket.recv_string()
            args = msg.split(':')

            ip = args[1]
            pod = util.get_pod_from_ip(client, ip)

            count = str(pod.status.container_statuses[0].restart_count)

            logging.info('Returning restart count ' + count + ' for IP ' +
                         ip + '.')
            restart_pull_socket.send_string(count)

        end = time.time()
        if end - start > THRESHOLD:
            logging.info('Checking hash ring...')
            check_hash_ring(client, context)

            logging.info('Checking for extra nodes...')
            check_unused_nodes(client)

            start = time.time()