def recover_containers_from_kv_store():
    """Try to bring back every crashed service recorded in the kv-store.

    Each crashed service is first marked 'recovering'. Recovery is then
    attempted in rounds, at most RECOVERY_RETRY_LIMIT of them, sleeping
    DELAY_BETWEEN_RECOVER_RETRY_SECONDS after every round. A service that is
    recovered has its kv-store entry removed; whatever is still failing after
    the last round is marked 'not-recovered'.

    Returns:
        The list of kv-store keys of the services that could not be recovered.
    """
    pending = _get_crashed_services()
    for service_key in pending:
        kv.update_container_status('recovering', key=service_key)

    attempts = 0
    while pending and attempts < RECOVERY_RETRY_LIMIT:
        get_logger().info("Recovering containers: %s", json.dumps(pending))

        still_failing = []
        for service_key in pending:
            params = kv.kv_get(service_key)['params']
            if _recover_container(params):
                # The saved entry is no longer needed once recovery succeeded.
                kv.kv_remove(service_key)
            else:
                still_failing.append(service_key)

        sleep(DELAY_BETWEEN_RECOVER_RETRY_SECONDS)
        pending = still_failing
        attempts += 1

    for service_key in pending:
        kv.update_container_status('not-recovered', key=service_key)
    return pending
def register_service_in_consul(microservice_data):
    """Register a microservice with the consul agent unless it already exists.

    After a successful registration the container's creation timestamp is
    stored under 'start_timestamp/<container_id>' and the
    'single_active_instance/<microservice_id>' flag is set or removed to match
    the 'single_active_instance' entry of *microservice_data*.

    Args:
        microservice_data: dict with 'microservice_id', 'microservice_name',
            'microservice_port' and 'container_created_timestamp'; optional
            'microservice_tags' and 'single_active_instance'.
    """
    service_id = microservice_data['microservice_id']
    if exists_service(service_id):
        return

    registration = {}
    registration['ID'] = service_id
    registration['Name'] = microservice_data['microservice_name']
    registration['Port'] = microservice_data['microservice_port']
    registration['Check'] = {'TTL': '15s'}

    tags = microservice_data.get('microservice_tags')
    if tags:
        registration['Tags'] = tags

    consul_post('agent/service/register', registration).raise_for_status()

    # The microservice id has the form '<container_id>:...'; keep the first part.
    container_id = service_id.split(':')[0]
    created = str(microservice_data['container_created_timestamp'])
    kv_set('start_timestamp/{}'.format(container_id), created)

    flag_key = 'single_active_instance/{}'.format(service_id)
    if not microservice_data.get('single_active_instance'):
        kv_remove(flag_key)
    else:
        kv_set(flag_key, True)
def _stop_service(self, container_id):
    """Stop a container and drop its entry from this ship's saved services.

    If the saved entry reports status 'crashed' or 'not-recovered', only the
    kv-store entry is removed. Otherwise the in-container agents are stopped,
    the services are deregistered (best effort) and docker is asked up to
    three times to stop the container.

    Args:
        container_id: id of the container to stop.

    Raises:
        Exception: the last error raised by the docker stop call when the
            container is still running after all attempts.
        RuntimeError: when the container is still running although no stop
            attempt raised.
    """
    ship = get_ship_name()
    service_dict = None
    # Start from "no match" so nothing below can touch an unbound name when
    # the ship has no saved services at all.
    key = None
    service_list = kv_list('ships/{}/service/'.format(ship))
    if service_list:
        matching_keys = fnmatch.filter(service_list, '*/{}'.format(container_id))
        if matching_keys:
            key = matching_keys[0]
            service_dict = kv_get(key)

    if service_dict and service_dict['Status'] in ['crashed', 'not-recovered']:
        kv_remove(key)
        return

    run_command_in_container('supervisorctl stop armada_agent', container_id)
    # TODO: Compatibility with old microservice images. Should be removed in future armada version.
    run_command_in_container('supervisorctl stop register_in_service_discovery', container_id)

    docker_api = docker_client.api()
    last_exception = None
    try:
        deregister_services(container_id)
    except Exception:
        # Deregistration is best effort; the container must be stopped anyway.
        traceback.print_exc()

    for _ in range(3):
        try:
            docker_api.stop(container_id)
        except Exception as e:
            last_exception = e
            traceback.print_exc()
        if not is_container_running(container_id):
            # Forget the saved entry only once the container is really down.
            if key:
                try:
                    kv_remove(key)
                except Exception:
                    traceback.print_exc()
            break

    if is_container_running(container_id):
        get_logger().error('Could not stop container: {}'.format(container_id))
        if last_exception is None:
            # Every stop call returned normally, so there is nothing to re-raise.
            raise RuntimeError('Could not stop container: {}'.format(container_id))
        raise last_exception
def register_service_in_consul(microservice_data):
    """Register a microservice with the consul agent unless it already exists.

    After a successful registration the container's creation timestamp is
    stored under 'start_timestamp/<container_id>' and the
    'single_active_instance/<microservice_id>' flag is set or removed to match
    the 'single_active_instance' entry of *microservice_data*.

    Args:
        microservice_data: dict with 'microservice_id', 'microservice_name',
            'microservice_port' and 'container_created_timestamp'; optional
            'microservice_tags' and 'single_active_instance'.
    """
    if _exists_service(microservice_data['microservice_id']):
        return
    consul_service_data = {
        'ID': microservice_data['microservice_id'],
        'Name': microservice_data['microservice_name'],
        'Port': microservice_data['microservice_port'],
        'Check': {
            'TTL': '15s',
        },
    }
    # Tags are optional: a missing key must not abort the registration.
    microservice_tags = microservice_data.get('microservice_tags')
    if microservice_tags:
        consul_service_data['Tags'] = microservice_tags
    response = consul_post('agent/service/register', consul_service_data)
    response.raise_for_status()

    # The microservice id has the form '<container_id>:...'; keep the first part.
    container_id = microservice_data['microservice_id'].split(':')[0]
    kv_set('start_timestamp/{}'.format(container_id),
           str(microservice_data['container_created_timestamp']))

    key = 'single_active_instance/{}'.format(microservice_data['microservice_id'])
    if microservice_data.get('single_active_instance'):
        kv_set(key, True)
    else:
        kv_remove(key)
def _clean_up_kv_store():
    """Remove kv-store keys that belong to containers no longer listed.

    Does nothing until the current time reaches next_kv_clean_up_timestamp.
    When it runs, the next clean-up time is rescheduled and every
    'start_timestamp/' and 'single_active_instance/' key whose container id is
    not present in the armada 'list' response is removed.
    """
    global next_kv_clean_up_timestamp
    if time.time() < next_kv_clean_up_timestamp:
        return

    get_logger().info('Cleaning up kv-store:')
    next_kv_clean_up_timestamp = get_next_kv_clean_up_timestamp()

    live_ids = {entry.get('container_id') for entry in armada_api.get_json('list')}

    def _drop_stale(prefix, container_id_of):
        # kv_list may return a falsy value for an empty prefix; treat as empty.
        for stored_key in kv.kv_list(prefix) or []:
            if container_id_of(stored_key) in live_ids:
                continue
            get_logger().info('Removing key: {}'.format(stored_key))
            kv.kv_remove(stored_key)

    _drop_stale('start_timestamp/', lambda k: k.split('/')[-1])
    # These keys end with '<container_id>:...'; compare on the container id only.
    _drop_stale('single_active_instance/', lambda k: k.split('/')[-1].split(':')[0])

    get_logger().info('Finished cleaning up kv-store.')
def deregister_services(container_id):
    """Deregister from consul every service whose id starts with *container_id*.

    For each deregistered service the container's 'start_timestamp/' key is
    removed from the kv-store; a failure of that removal is printed and
    otherwise ignored.
    """
    for service_id, _details in consul_query('agent/services').items():
        if not service_id.startswith(container_id):
            continue
        consul_get('agent/service/deregister/{service_id}'.format(service_id=service_id))
        try:
            kv.kv_remove("start_timestamp/" + container_id)
        except Exception:
            traceback.print_exc()
def deregister_services(container_id):
    """Deregister from consul every service whose id starts with *container_id*.

    For each deregistered service the container's 'start_timestamp/' key is
    removed from the kv-store; a failure of that removal is logged and
    otherwise ignored.
    """
    registered = consul_query('agent/services')
    owned_service_ids = [
        candidate for candidate, _details in registered.items()
        if candidate.startswith(container_id)
    ]
    for service_id in owned_service_ids:
        endpoint = 'agent/service/deregister/{service_id}'.format(service_id=service_id)
        consul_get(endpoint)
        try:
            kv.kv_remove("start_timestamp/" + container_id)
        except Exception as error:
            get_logger().exception(error)
def command_stop(args):
    """Stop every container matching the requested microservice handle.

    The handle comes from ``args.microservice_handle`` or, failing that, from
    the MICROSERVICE_NAME environment variable. Instances that carry a
    'kv_index' are only removed from the kv-store; the others are stopped
    through the armada API of the ship they run on. Failures are printed and
    the remaining instances are still processed.

    Args:
        args: parsed command line arguments; ``microservice_handle`` and
            ``all`` are read.

    Raises:
        ValueError: no handle was given and MICROSERVICE_NAME is unset or empty.
        armada_utils.ArmadaCommandException: several containers match and
            ``args.all`` is not set.
        SystemExit: exit status 1 when stopping any instance failed.
    """
    # .get() so that a missing variable reaches the ValueError below instead
    # of surfacing as a KeyError.
    microservice_handle = args.microservice_handle or os.environ.get('MICROSERVICE_NAME')
    if not microservice_handle:
        raise ValueError('No microservice name or container id supplied.')

    instances = armada_utils.get_matched_containers(microservice_handle)
    instances_count = len(instances)

    if instances_count > 1:
        if not args.all:
            raise armada_utils.ArmadaCommandException(
                'There are too many ({instances_count}) matching containers. '
                'Provide more specific container_id or microservice name or use -a/--all flag.'
                .format(instances_count=instances_count))
        print('Stopping {instances_count} services {microservice_handle}...'.format(
            instances_count=instances_count, microservice_handle=microservice_handle))
    else:
        microservice_name = instances[0]['ServiceName']
        container_id = instances[0]["ServiceID"].split(':')[0]
        print('Stopping service {microservice_name} ({container_id})...'.format(
            microservice_name=microservice_name, container_id=container_id))

    were_errors = False
    for i, instance in enumerate(instances):
        try:
            if instances_count > 1:
                print('[{0}/{1}]'.format(i + 1, instances_count))
            if 'kv_index' in instance:
                kv.kv_remove('service/{}/{}'.format(instance['ServiceName'],
                                                    instance['kv_index']))
                print('Service {} has been removed.'.format(instance['ServiceName']))
            else:
                container_id = instance['ServiceID'].split(':')[0]
                payload = {'container_id': container_id}
                ship_name = instance['Address']
                result = armada_api.post('stop', payload, ship_name=ship_name)
                if result['status'] == 'ok':
                    print('Service {container_id} has been stopped.'.format(
                        container_id=container_id))
                    if instances_count > 1:
                        print()
                else:
                    raise armada_utils.ArmadaCommandException(
                        'Stopping error: {0}'.format(result['error']))
        except Exception:
            # Report and carry on with the next instance; KeyboardInterrupt
            # and SystemExit are deliberately allowed to propagate.
            traceback.print_exc()
            were_errors = True

    if were_errors:
        sys.exit(1)
def deregister_services(container_id):
    """Deregister from consul every service whose id starts with *container_id*.

    For each deregistered service the 'start_timestamp/<container_id>' and
    'single_active_instance/<service_id>' keys are removed from the kv-store;
    a failure of either removal is logged and otherwise ignored.
    """
    def _remove_quietly(prefix, suffix):
        # Best effort: a kv-store failure must not stop the deregistration.
        try:
            kv.kv_remove(prefix + suffix)
        except Exception as error:
            get_logger().exception(error)

    for service_id, _details in consul_query('agent/services').items():
        if not service_id.startswith(container_id):
            continue
        consul_get('agent/service/deregister/{service_id}'.format(service_id=service_id))
        _remove_quietly("start_timestamp/", container_id)
        _remove_quietly("single_active_instance/", service_id)
def command_stop(args):
    """Stop every container matching the requested microservice handle.

    The handle comes from ``args.microservice_handle`` or, failing that, from
    the MICROSERVICE_NAME environment variable. Instances that carry a
    'kv_index' are only removed from the kv-store; the others are stopped
    through the armada API of the ship they run on. Failures are printed and
    the remaining instances are still processed.

    Args:
        args: parsed command line arguments; ``microservice_handle`` and
            ``all`` are read.

    Raises:
        ValueError: no handle was given and MICROSERVICE_NAME is unset or empty.
        armada_utils.ArmadaCommandException: several containers match and
            ``args.all`` is not set.
        SystemExit: exit status 1 when stopping any instance failed.
    """
    # A missing environment variable must end in the ValueError below, not in
    # a KeyError, hence .get().
    microservice_handle = args.microservice_handle or os.environ.get('MICROSERVICE_NAME')
    if not microservice_handle:
        raise ValueError('No microservice name or container id supplied.')

    instances = armada_utils.get_matched_containers(microservice_handle)
    instances_count = len(instances)
    several = instances_count > 1

    if several:
        if not args.all:
            raise armada_utils.ArmadaCommandException(
                'There are too many ({instances_count}) matching containers. '
                'Provide more specific container_id or microservice name or use -a/--all flag.'
                .format(instances_count=instances_count))
        print('Stopping {instances_count} services {microservice_handle}...'.format(
            instances_count=instances_count, microservice_handle=microservice_handle))
    else:
        microservice_name = instances[0]['ServiceName']
        container_id = instances[0]["ServiceID"].split(':')[0]
        print('Stopping service {microservice_name} ({container_id})...'.format(
            microservice_name=microservice_name, container_id=container_id))

    were_errors = False
    for i, instance in enumerate(instances):
        try:
            if several:
                print('[{0}/{1}]'.format(i + 1, instances_count))

            if 'kv_index' in instance:
                kv.kv_remove('service/{}/{}'.format(instance['ServiceName'],
                                                    instance['kv_index']))
                print('Service {} has been removed.'.format(instance['ServiceName']))
                continue

            container_id = instance['ServiceID'].split(':')[0]
            payload = {'container_id': container_id}
            ship_name = instance['Address']
            result = armada_api.post('stop', payload, ship_name=ship_name)
            if result['status'] != 'ok':
                raise armada_utils.ArmadaCommandException(
                    'Stopping error: {0}'.format(result['error']))
            print('Service {container_id} has been stopped.'.format(container_id=container_id))
            if several:
                print()
        except Exception:
            # Report and move on to the next instance; KeyboardInterrupt and
            # SystemExit are deliberately allowed to propagate.
            traceback.print_exc()
            were_errors = True

    if were_errors:
        sys.exit(1)
def set_ship_name(new_name):
    """Rename the local ship and move its saved service entries to the new name.

    Every key under 'ships/<old_name>/service/' is copied to the matching key
    under the new name and then removed, the name/ip mappings are rewritten,
    the 'ships/<old_name>/' prefix in /etc/consul.config is replaced with sed,
    consul is asked to reload, and the old
    'containers_parameters_list/<old_name>' key is removed.

    Args:
        new_name: the name the ship should carry from now on.
    """
    # Local import: the module's import block is not visible from this block.
    import subprocess

    ship_ip = get_ship_ip()
    old_name = get_ship_name(ship_ip)
    saved_containers = kv.kv_list('ships/{}/service/'.format(old_name))
    for container in saved_containers or []:
        key_parts = container.split('/')
        new_key = 'ships/{}/service/{}/{}'.format(new_name, key_parts[-2], key_parts[-1])
        if new_key == container:
            # Renaming to the current name: "moving" the entry onto itself
            # would end with deleting it.
            continue
        container_dict = kv.kv_get(container)
        kv.kv_set(new_key, container_dict)
        kv.kv_remove(container)

    kv.kv_set('ships/{}/name'.format(ship_ip), new_name)
    kv.kv_set('ships/{}/ip'.format(new_name), ship_ip)

    # Argument lists avoid passing the ship names through a shell.
    sed_script = 's|ships/{}/|ships/{}/|'.format(old_name, new_name)
    commands = (
        ['sed', '-i', sed_script, '/etc/consul.config'],
        ['/usr/local/bin/consul', 'reload'],
    )
    for command in commands:
        try:
            exit_code = subprocess.call(command)
        except OSError as e:
            # The executable could not be started at all.
            get_logger().exception(e)
        else:
            if exit_code != 0:
                get_logger().error('Command %s exited with status %s', command, exit_code)

    kv.kv_remove('containers_parameters_list/{}'.format(old_name))
def set_ship_name(new_name):
    """Rename the local ship and move its saved service entries to the new name.

    Every key under 'ships/<old_name>/service/' is copied to the matching key
    under the new name and then removed, the name/ip mappings are rewritten,
    the 'ships/<old_name>/' prefix in /etc/consul.config is replaced with sed,
    consul is asked to reload, and the old
    'containers_parameters_list/<old_name>' key is removed.

    Args:
        new_name: the name the ship should carry from now on.
    """
    # Local import: the module's import block is not visible from this block.
    import subprocess

    ship_ip = get_ship_ip()
    old_name = get_ship_name(ship_ip)
    saved_containers = kv.kv_list('ships/{}/service/'.format(old_name))
    if saved_containers:
        for container in saved_containers:
            new_key = 'ships/{}/service/{}/{}'.format(
                new_name, container.split('/')[-2], container.split('/')[-1])
            if new_key != container:
                container_dict = kv.kv_get(container)
                kv.kv_set(new_key, container_dict)
                kv.kv_remove(container)
            # else: same name as before; removing the key would lose the entry.

    kv.kv_set('ships/{}/name'.format(ship_ip), new_name)
    kv.kv_set('ships/{}/ip'.format(new_name), ship_ip)

    # Argument lists avoid passing the ship names through a shell, and make
    # the except clause reachable (os.system never raises on failure).
    sed_script = 's|ships/{}/|ships/{}/|'.format(old_name, new_name)
    for command in (['sed', '-i', sed_script, '/etc/consul.config'],
                    ['/usr/local/bin/consul', 'reload']):
        try:
            subprocess.call(command)
        except OSError:
            traceback.print_exc()

    kv.kv_remove('containers_parameters_list/{}'.format(old_name))
def _stop_service(self, container_id):
    """Stop a container and drop its entry from this ship's saved services.

    When the container is not running, only the bookkeeping is done: the
    saved entry is removed and the services are deregistered (best effort).
    Otherwise the in-container agents are stopped, the services are
    deregistered and docker is asked up to three times to stop the container.

    Args:
        container_id: id of the container to stop.

    Raises:
        Exception: the last error raised by the docker stop call when the
            container is still running after all attempts.
        RuntimeError: when the container is still running although no stop
            attempt raised.
    """
    ship = get_ship_name()
    service_list = kv_list('ships/{}/service/'.format(ship))
    try:
        key = fnmatch.filter(service_list, '*/{}'.format(container_id))[0]
    except (IndexError, TypeError):
        # No saved entry matches, or the lookup above returned a non-iterable.
        key = None

    if not is_container_running(container_id):
        if key:
            kv_remove(key)
        try:
            deregister_services(container_id)
        except Exception as e:
            get_logger().exception(e)
    else:
        run_command_in_container('supervisorctl stop armada_agent', container_id)
        # TODO: Compatibility with old microservice images. Should be removed in future armada version.
        run_command_in_container('supervisorctl stop register_in_service_discovery', container_id)

        docker_api = docker_client.api()
        last_exception = None
        try:
            deregister_services(container_id)
        except Exception as e:
            get_logger().exception(e)

        for _ in range(3):
            try:
                docker_api.stop(container_id)
            except Exception as e:
                get_logger().debug(e, exc_info=True)
                last_exception = e
            if not is_container_running(container_id):
                if key:
                    kv_remove(key)
                break

        if is_container_running(container_id):
            get_logger().error('Could not stop container: %s', container_id)
            if last_exception is None:
                # Every stop call returned normally; `raise None` would only
                # produce an unrelated TypeError.
                raise RuntimeError('Could not stop container: {}'.format(container_id))
            raise last_exception
def _stop_service(self, container_id, force=False):
    """Stop a container and drop its saved service entries from the kv-store.

    When the container is not running, only the bookkeeping is done: the
    matching saved entries are removed and the services are deregistered
    (best effort). Otherwise the in-container agent is stopped, the
    'pre-stop' hook is triggered, the services are deregistered and docker is
    asked up to three times to stop the container.

    Args:
        container_id: id of the container to stop.
        force: when true the saved entries are looked up with
            get_services_by_ship(), otherwise with
            get_local_services_from_kv_store().

    Raises:
        Exception: the last error raised by the docker stop call when the
            container is still running after all attempts.
        RuntimeError: when the container is still running although no stop
            attempt raised.
    """
    if force:
        service_list = get_services_by_ship()
    else:
        service_list = get_local_services_from_kv_store()

    try:
        keys = fnmatch.filter(service_list, '*/{}'.format(container_id))
    except (IndexError, TypeError) as e:
        get_logger().exception(e)
        keys = []

    if not is_container_running(container_id):
        for key in keys:
            kv_remove(key)
        try:
            deregister_services(container_id)
        except Exception as e:
            get_logger().exception(e)
    else:
        run_command_in_container('supervisorctl stop armada_agent', container_id)
        trigger_hook('pre-stop', container_id)

        docker_api = docker_client.api()
        last_exception = None
        try:
            deregister_services(container_id)
        except Exception as e:
            get_logger().exception(e)

        for _ in range(3):
            try:
                docker_api.stop(container_id)
            except Exception as e:
                get_logger().debug(e, exc_info=True)
                last_exception = e
            if not is_container_running(container_id):
                for key in keys:
                    kv_remove(key)
                break

        if is_container_running(container_id):
            get_logger().error('Could not stop container: %s', container_id)
            if last_exception is None:
                # Every stop call returned normally; `raise None` would only
                # produce an unrelated TypeError.
                raise RuntimeError('Could not stop container: {}'.format(container_id))
            raise last_exception
def recover_saved_containers(saved_containers):
    """Restart the saved containers that are not currently running locally.

    Recovery is attempted in rounds, at most RECOVERY_RETRY_LIMIT of them,
    sleeping DELAY_BETWEEN_RECOVER_RETRY_SECONDS after every round. In each
    round every container to recover is first saved in the kv-store with
    status 'recovering'; identical parameter sets are told apart by a running
    index. A recovered container has its 'service/<name>/<index>' key removed;
    one that still fails in the last round is saved as 'not-recovered'.

    Args:
        saved_containers: list of JSON-serialisable parameter dicts, each with
            a 'microservice_name' entry.

    Returns:
        The parameter dicts of the containers that could not be recovered.
    """
    wait_for_consul_ready()
    running_containers = _get_local_running_containers()
    containers_to_be_recovered = _multiset_difference(saved_containers, running_containers)
    recovery_retry_count = 0

    while containers_to_be_recovered and recovery_retry_count < RECOVERY_RETRY_LIMIT:
        get_logger().info("Recovering containers: {}".format(
            json.dumps(containers_to_be_recovered)))
        containers_not_recovered = []

        # Serialise with sorted keys so equal parameter sets compare equal.
        counter_to_be_recovered = Counter(
            json.dumps(x, sort_keys=True) for x in containers_to_be_recovered)

        to_be_recovered = []
        for serialized, copies in counter_to_be_recovered.items():
            # Copies of the same parameter set get indexes 0, 1, 2, ...
            for index in range(copies):
                to_be_recovered.append((serialized, index))
                parameters = json.loads(serialized)
                kv.save_service(parameters['microservice_name'], index, 'recovering',
                                parameters)

        is_last_attempt = recovery_retry_count == (RECOVERY_RETRY_LIMIT - 1)
        for serialized, index in to_be_recovered:
            container_parameters = json.loads(serialized)
            name = container_parameters['microservice_name']
            if not _recover_container(container_parameters):
                containers_not_recovered.append(container_parameters)
                if is_last_attempt:
                    # Decode the serialized string again: calling json.loads
                    # on the already-decoded dict raised TypeError.
                    kv.save_service(name, index, 'not-recovered', json.loads(serialized))
            else:
                kv.kv_remove('service/{}/{}'.format(name, index))

        sleep(DELAY_BETWEEN_RECOVER_RETRY_SECONDS)
        running_containers = _get_local_running_containers()
        containers_to_be_recovered = _multiset_difference(
            containers_not_recovered, running_containers)
        recovery_retry_count += 1

    return containers_to_be_recovered
def _stop_service(self, container_id):
    """Stop a container and drop its entry from this ship's saved services.

    When the container is not running, only the bookkeeping is done: the
    saved entry is removed and the services are deregistered (best effort).
    Otherwise the in-container agents are stopped, the services are
    deregistered and docker is asked up to three times to stop the container.

    Args:
        container_id: id of the container to stop.

    Raises:
        Exception: the last error raised by the docker stop call when the
            container is still running after all attempts.
        RuntimeError: when the container is still running although no stop
            attempt raised.
    """
    ship = get_ship_name()
    service_list = kv_list("ships/{}/service/".format(ship))
    try:
        key = fnmatch.filter(service_list, "*/{}".format(container_id))[0]
    except (IndexError, TypeError):
        # No saved entry matches, or the lookup above returned a non-iterable.
        key = None

    if not is_container_running(container_id):
        if key:
            kv_remove(key)
        try:
            deregister_services(container_id)
        except Exception as e:
            get_logger().exception(e)
        return

    run_command_in_container("supervisorctl stop armada_agent", container_id)
    # TODO: Compatibility with old microservice images. Should be removed in future armada version.
    run_command_in_container("supervisorctl stop register_in_service_discovery", container_id)

    docker_api = docker_client.api()
    last_exception = None
    try:
        deregister_services(container_id)
    except Exception as e:
        get_logger().exception(e)

    for _ in range(3):
        try:
            docker_api.stop(container_id)
        except Exception as e:
            get_logger().debug(e, exc_info=True)
            last_exception = e
        if not is_container_running(container_id):
            if key:
                kv_remove(key)
            break

    if is_container_running(container_id):
        get_logger().error("Could not stop container: %s", container_id)
        if last_exception is None:
            # No stop call raised, so there is nothing to re-raise; raising
            # None would only produce an unrelated TypeError.
            raise RuntimeError("Could not stop container: {}".format(container_id))
        raise last_exception
def set_ship_name(new_name):
    """Rename the local ship and move its saved service entries to the new name.

    Every key returned by get_services_by_ship(old_name) is copied to the key
    built by create_consul_services_key() for the new name and then removed,
    the name/ip mappings are rewritten, the 'ships/<old_name>/' prefix in
    /etc/consul.config is replaced with sed, and consul is asked to reload.

    Args:
        new_name: the name the ship should carry from now on.
    """
    from armada_backend.models.services import get_services_by_ship, create_consul_services_key
    # Local import: the module's import block is not visible from this block.
    import subprocess

    ship_ip = get_ship_ip()
    old_name = get_ship_name(ship_ip)
    saved_containers = get_services_by_ship(old_name)
    for container in saved_containers or []:
        key_parts = container.split('/')
        new_key = create_consul_services_key(
            ship=new_name, service_name=key_parts[-2], container_id=key_parts[-1])
        if new_key == container:
            # Renaming to the current name: "moving" the entry onto itself
            # would end with deleting it.
            continue
        container_dict = kv.kv_get(container)
        kv.kv_set(new_key, container_dict)
        kv.kv_remove(container)

    kv.kv_set('ships/{}/name'.format(ship_ip), new_name)
    kv.kv_set('ships/{}/ip'.format(new_name), ship_ip)

    # Argument lists avoid passing the ship names through a shell.
    sed_script = 's|ships/{}/|ships/{}/|'.format(old_name, new_name)
    commands = (
        ['sed', '-i', sed_script, '/etc/consul.config'],
        ['/usr/local/bin/consul', 'reload'],
    )
    for command in commands:
        try:
            exit_code = subprocess.call(command)
        except OSError as e:
            # The executable could not be started at all.
            get_logger().exception(e)
        else:
            if exit_code != 0:
                get_logger().error('Command %s exited with status %s', command, exit_code)
def _stop_service(self, container_id):
    """Stop a container and drop its entry from this ship's saved services.

    If the saved entry reports status 'crashed' or 'not-recovered', only the
    kv-store entry is removed. Otherwise the in-container agents are stopped,
    the services are deregistered (best effort) and docker is asked up to
    three times to stop the container.

    Args:
        container_id: id of the container to stop.

    Raises:
        Exception: the last error raised by the docker stop call when the
            container is still running after all attempts.
        RuntimeError: when the container is still running although no stop
            attempt raised.
    """
    ship = get_ship_name()
    service_dict = None
    # Bound up front: with an empty service list the lookup below is skipped
    # and the name would otherwise be undefined further down.
    key = None
    service_list = kv_list('ships/{}/service/'.format(ship))
    if service_list:
        matches = fnmatch.filter(service_list, '*/{}'.format(container_id))
        key = matches[0] if matches else None
        service_dict = kv_get(key) if key else None

    if service_dict and service_dict['Status'] in ['crashed', 'not-recovered']:
        kv_remove(key)
    else:
        run_command_in_container('supervisorctl stop armada_agent', container_id)
        # TODO: Compatibility with old microservice images. Should be removed in future armada version.
        run_command_in_container(
            'supervisorctl stop register_in_service_discovery', container_id)

        docker_api = docker_client.api()
        last_exception = None
        try:
            deregister_services(container_id)
        except Exception:
            # Deregistration is best effort; the container must be stopped anyway.
            traceback.print_exc()

        for _ in range(3):
            try:
                docker_api.stop(container_id)
            except Exception as e:
                last_exception = e
                traceback.print_exc()
            if not is_container_running(container_id):
                # Forget the saved entry only once the container is really down.
                if key:
                    try:
                        kv_remove(key)
                    except Exception:
                        traceback.print_exc()
                break

        if is_container_running(container_id):
            get_logger().error(
                'Could not stop container: {}'.format(container_id))
            if last_exception is None:
                # Every stop call returned normally, so there is nothing to re-raise.
                raise RuntimeError('Could not stop container: {}'.format(container_id))
            raise last_exception
def recover_saved_containers(saved_containers):
    """Restart the saved containers that are not currently running locally.

    Recovery is attempted in rounds, at most RECOVERY_RETRY_LIMIT of them,
    sleeping DELAY_BETWEEN_RECOVER_RETRY_SECONDS after every round. In each
    round every container to recover is first saved in the kv-store with
    status 'recovering'; identical parameter sets are told apart by a running
    index. A recovered container has its 'service/<name>/<index>' key removed;
    one that still fails in the last round is saved as 'not-recovered'.

    Args:
        saved_containers: list of JSON-serialisable parameter dicts, each with
            a 'microservice_name' entry.

    Returns:
        The parameter dicts of the containers that could not be recovered.
    """
    wait_for_consul_ready()
    running_containers = _get_local_running_containers()
    containers_to_be_recovered = _multiset_difference(saved_containers, running_containers)
    recovery_retry_count = 0
    while containers_to_be_recovered and recovery_retry_count < RECOVERY_RETRY_LIMIT:
        get_logger().info("Recovering containers: {}".format(json.dumps(containers_to_be_recovered)))
        containers_not_recovered = []
        # Serialise with sorted keys so equal parameter sets compare equal.
        counter_to_be_recovered = Counter(json.dumps(x, sort_keys=True) for x in containers_to_be_recovered)
        to_be_recovered = []
        for serialized_parameters in counter_to_be_recovered.elements():
            # elements() yields equal items back to back, so a repeat of the
            # previous item continues its numbering and anything else restarts at 0.
            if to_be_recovered and to_be_recovered[-1][0] == serialized_parameters:
                index = to_be_recovered[-1][1] + 1
            else:
                index = 0
            to_be_recovered.append((serialized_parameters, index))
            name = json.loads(serialized_parameters)['microservice_name']
            kv.save_service(name, index, 'recovering', json.loads(serialized_parameters))
        for serialized_parameters, index in to_be_recovered:
            container_parameters = json.loads(serialized_parameters)
            name = container_parameters['microservice_name']
            if not _recover_container(container_parameters):
                containers_not_recovered.append(container_parameters)
                if recovery_retry_count == (RECOVERY_RETRY_LIMIT - 1):
                    # Decode the serialized string, not the dict decoded
                    # above: json.loads on a dict raised TypeError.
                    kv.save_service(name, index, 'not-recovered', json.loads(serialized_parameters))
            else:
                kv.kv_remove('service/{}/{}'.format(name, index))
        sleep(DELAY_BETWEEN_RECOVER_RETRY_SECONDS)
        running_containers = _get_local_running_containers()
        containers_to_be_recovered = _multiset_difference(containers_not_recovered, running_containers)
        recovery_retry_count += 1
    return containers_to_be_recovered
def _clean_up_kv_store():
    """Remove kv-store keys that belong to containers no longer listed.

    Does nothing until the current time reaches next_kv_clean_up_timestamp.
    When it runs, the next clean-up time is rescheduled and every
    'start_timestamp/' and 'single_active_instance/' key whose container id is
    not present in the armada 'list' response is removed.
    """
    global next_kv_clean_up_timestamp
    now = time.time()
    if now < next_kv_clean_up_timestamp:
        return

    get_logger().info('Cleaning up kv-store:')
    next_kv_clean_up_timestamp = get_next_kv_clean_up_timestamp()

    listed_services = armada_api.get_json('list')
    known_ids = set()
    for listed in listed_services:
        known_ids.add(listed.get('container_id'))

    timestamp_keys = kv.kv_list('start_timestamp/') or []
    for timestamp_key in timestamp_keys:
        owner_id = timestamp_key.rsplit('/', 1)[-1]
        if owner_id in known_ids:
            continue
        get_logger().info('Removing key: {}'.format(timestamp_key))
        kv.kv_remove(timestamp_key)

    instance_keys = kv.kv_list('single_active_instance/') or []
    for instance_key in instance_keys:
        # The last path segment is '<container_id>:...'; keep the part before ':'.
        owner_id = instance_key.rsplit('/', 1)[-1].partition(':')[0]
        if owner_id in known_ids:
            continue
        get_logger().info('Removing key: {}'.format(instance_key))
        kv.kv_remove(instance_key)

    get_logger().info('Finished cleaning up kv-store.')
def remove_alias(name):
    """Remove the dockyard alias *name*, and the default if it pointed at it."""
    kv.kv_remove('dockyard/aliases/{name}'.format(name=name))
    if get_default() != name:
        return
    remove_default()
def remove_default():
    """Remove the 'dockyard/default' key from the kv-store."""
    default_key = 'dockyard/default'
    kv.kv_remove(default_key)
def _run_local_command_or_fail(command, retries):
    """Run *command* locally and raise ArmadaCommandException unless its first result element is 0."""
    # presumably the first element of the result is the exit code - confirm
    # against armada_utils.execute_local_command
    exit_code = armada_utils.execute_local_command(command, stream_output=True, retries=retries)[0]
    if exit_code != 0:
        raise armada_utils.ArmadaCommandException(
            'Local command failed with status {0}.'.format(exit_code))


def command_restart(args):
    """Restart every container matching the requested microservice handle.

    The handle comes from ``args.microservice_handle`` or, failing that, from
    the MICROSERVICE_NAME environment variable. Three cases are handled per
    instance:

    * instances carrying a 'kv_index' are started again with their saved run
      command and their 'service/<name>/<index>' key is removed;
    * containers that are local (and no target ship was requested) are
      stopped and started with the command read from ARMADA_RUN_COMMAND;
    * all others are restarted through the armada API of their ship.

    Failures are printed and the remaining instances are still processed.

    Args:
        args: parsed command line arguments; ``microservice_handle``, ``all``,
            ``ship`` and ``force`` are read.

    Raises:
        ValueError: no handle was given and MICROSERVICE_NAME is unset or empty.
        armada_utils.ArmadaCommandException: several containers match and
            ``args.all`` is not set.
        SystemExit: exit status 1 when restarting any instance failed.
    """
    # .get() so that a missing variable reaches the ValueError below instead
    # of surfacing as a KeyError.
    microservice_handle = args.microservice_handle or os.environ.get('MICROSERVICE_NAME')
    if not microservice_handle:
        raise ValueError('No microservice name or container id supplied.')

    instances = armada_utils.get_matched_containers(microservice_handle)
    instances_count = len(instances)

    if instances_count > 1:
        if not args.all:
            raise armada_utils.ArmadaCommandException(
                'There are too many ({instances_count}) matching containers. '
                'Provide more specific container_id or microservice name or use -a/--all flag.'
                .format(instances_count=instances_count))
        print('Restarting {instances_count} services {microservice_handle}...'.format(
            instances_count=instances_count, microservice_handle=microservice_handle))
    else:
        microservice_name = instances[0]['ServiceName']
        container_id = instances[0]["ServiceID"].split(':')[0]
        print('Restarting service {microservice_name} ({container_id})...'.format(
            microservice_name=microservice_name, container_id=container_id))

    were_errors = False
    for i, instance in enumerate(instances):
        try:
            if instances_count > 1:
                print('[{0}/{1}]'.format(i + 1, instances_count))

            if 'kv_index' in instance:
                if not instance['params']:
                    raise armada_utils.ArmadaCommandException(
                        'There is no run command available for service {}.'.format(
                            instance['ServiceName']))
                run_command = instance['params']['run_command']
                with suppress_version_check():
                    # Explicit check instead of `assert`: under `python -O`
                    # the assert, and with it the command, would be dropped.
                    _run_local_command_or_fail(run_command, retries=5)
                    kv.kv_remove('service/{}/{}'.format(instance['ServiceName'],
                                                        instance['kv_index']))
                if instances_count > 1:
                    print()
                continue

            container_id = instance['ServiceID'].split(':')[0]
            is_run_locally = armada_utils.is_local_container(container_id) and not args.ship

            if is_run_locally:
                result = json.loads(armada_api.get(
                    'env/{container_id}/ARMADA_RUN_COMMAND'.format(container_id=container_id)))
                if result['status'] == 'ok':
                    stop_command = 'armada stop {container_id}'.format(container_id=container_id)
                    run_command = base64.b64decode(result['value'])
                    with suppress_version_check():
                        _run_local_command_or_fail(stop_command, retries=3)
                        _run_local_command_or_fail(run_command, retries=5)
                    if instances_count > 1:
                        print()
                else:
                    raise armada_utils.ArmadaCommandException(result['error'])
            else:
                payload = {'container_id': container_id}
                if args.ship:
                    payload['target_ship'] = args.ship
                    payload['force'] = args.force
                print('Checking if there is new image version. May take few minutes if download is needed...')
                ship_name = instance['Address']
                result = armada_api.post('restart', payload, ship_name=ship_name)
                if result['status'] == 'ok':
                    new_container_id = result['container_id']
                    print('Service has been restarted and is running in container {new_container_id} '
                          'available at addresses:'.format(new_container_id=new_container_id))
                    # items() works on both Python 2 and 3 dicts.
                    for service_address, docker_port in result['endpoints'].items():
                        print(' {0} ({1})'.format(service_address, docker_port))
                    if instances_count > 1:
                        print()
                else:
                    raise armada_utils.ArmadaCommandException(result['error'])
        except armada_utils.ArmadaCommandException as e:
            print("ArmadaCommandException: {0}".format(str(e)))
            were_errors = True
        except Exception:
            # Report and carry on with the next instance; KeyboardInterrupt
            # and SystemExit are deliberately allowed to propagate.
            traceback.print_exc()
            were_errors = True

    if were_errors:
        sys.exit(1)