def new_asg(cluster, ami_id):
    """
    Create a new ASG in the given asgard cluster using the given AMI.

    Ensures that the new ASG has a min or desired instance count greater than 0.

    Arguments:
        cluster(str): Name of the cluster.
        ami_id(str): AWS AMI ID

    Returns:
        str: The name of the new ASG.

    Raises:
        TimeoutException: When the task to bring up the new ASG times out.
        BackendError: When the task to bring up the new ASG fails.
        ASGCountZeroException: When the new ASG brought online has 0 for its min and desired counts.
        RateLimitedException: When we are being rate limited by AWS.
    """
    payload = {
        "name": cluster,
        "imageId": ami_id,
    }
    response = requests.post(NEW_ASG_URL,
                             data=payload,
                             params=ASGARD_API_TOKEN,
                             timeout=REQUESTS_TIMEOUT)
    LOG.debug("Sent request to create new ASG in Cluster({}).".format(cluster))

    if response.status_code == 404:
        msg = "Can't create more ASGs for cluster {}. Please either wait " \
              "until older ASGs have been removed automatically or remove " \
              "old ASGs manually via Asgard."
        raise BackendError(msg.format(cluster))
    if response.status_code != 200:
        # The requests library follows redirects. The 200 comes from the job status page.
        msg = "Error occurred attempting to create new ASG for cluster {}.\nResponse: {}"
        raise BackendError(msg.format(cluster, response.text))

    response = wait_for_task_completion(response.url, ASGARD_NEW_ASG_CREATION_TIMEOUT)
    if response['status'] == 'failed':
        msg = "Failure during new ASG creation. Task Log: \n{}".format(response['log'])
        raise BackendError(msg)

    # Potential race condition if multiple people are making ASGs for the same cluster.
    # Return the name of the newest ASG.
    newest_asg = asgs_for_cluster(cluster)[-1]
    LOG.debug("New ASG({}) created in cluster({}).".format(
        newest_asg['autoScalingGroupName'], cluster))

    if newest_asg['desiredCapacity'] <= 0 or newest_asg['minSize'] <= 0:
        raise ASGCountZeroException(
            "New ASG {asg_name} created with 0 instances, aborting. "
            "Please check Asgard for more information."
            .format(asg_name=newest_asg['autoScalingGroupName'])
        )

    return newest_asg['autoScalingGroupName']
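# A hypothetical usage sketch for new_asg(); the cluster name and AMI ID are
# illustrative placeholders, not values from this codebase.
try:
    asg_name = new_asg("myapp-cluster", "ami-0123456789abcdef0")
    LOG.info("Created ASG {}.".format(asg_name))
except ASGCountZeroException:
    # The new ASG came up with min/desired counts of 0, so it can't take traffic.
    raise
except BackendError as err:
    LOG.error("ASG creation failed: {}".format(err))
    raise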
def _get_asgard_resource_info(url):
    """
    A generic function for querying Asgard for information about a specific
    resource, such as an Autoscaling Group or a cluster.

    Raises:
        ResourceDoesNotExistException: When the resource at the given URL does not exist.
        BackendError: When Asgard returns an error or an unexpected status code.
        TimeoutException: When the request to Asgard times out.
        RateLimitedException: When we are being rate limited by AWS.
    """
    LOG.debug("URL: {}".format(url))
    response = requests.get(url, params=ASGARD_API_TOKEN, timeout=REQUESTS_TIMEOUT)

    if response.status_code == 404:
        raise ResourceDoesNotExistException(
            'Resource for url {} does not exist'.format(url))
    if response.status_code >= 500:
        raise BackendError('Asgard experienced an error: {}'.format(response.text))
    if response.status_code != 200:
        raise BackendError('Call to asgard failed with status code: {0}: {1}'.format(
            response.status_code, response.text))

    LOG.debug("ASG info: {}".format(response.text))
    resource_info_json = _parse_asgard_json_response(url, response)
    return resource_info_json
def check_state(task_id, username, password):
    """
    Checks the state of the response to verify it is "done".

    Args:
        task_id (int): The task id to check the state of.
        username (str): The Acquia username necessary to run the command.
        password (str): The Acquia password necessary to run the command.

    Returns:
        True if the state of the response is "done".

    Raises:
        BackendError: Raised so the method will retry, since immediately after
            receiving the response the state will still be "waiting". Can't rely
            on parse_response, since the response should return a 200, just not
            the state wanted.
    """
    api_client = get_api_client(username, password)
    response = api_client.get(CHECK_TASKS_URL.format(id=task_id))
    response_json = parse_response(response, "Failed to check state of response.")
    if response_json["state"] == "done":
        return True
    raise BackendError(
        "Check state failed. The state of the response was {state}, not done as expected.\n"
        "JSON Data: {response}".format(state=response_json["state"], response=response_json)
    )
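# check_state() raises BackendError specifically so that a retry wrapper can
# keep polling until the Acquia task reaches "done". A minimal sketch of such
# a wrapper using the backoff library; the 5-second interval and 60-try cap
# are illustrative assumptions, not values from this codebase.
import backoff

@backoff.on_exception(backoff.constant,
                      BackendError,
                      interval=5,    # seconds between polls (assumed)
                      max_tries=60)  # give up after ~5 minutes (assumed)
def wait_until_done(task_id, username, password):
    return check_state(task_id, username, password)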
def clear_varnish_cache(env, username, password):
    """
    Clears the Varnish cache from all domains in a Drupal environment.

    Args:
        env (str): The environment to clear varnish caches in (e.g. test or prod)
        username (str): The Acquia username necessary to run the command.
        password (str): The Acquia password necessary to run the command.

    Returns:
        True if all of the Varnish caches are successfully cleared.

    Raises:
        KeyError: Raised if env value is invalid.
        BackendError: Raised if the varnish cache fails to clear in any of the domains.
    """
    api_client = get_api_client(username, password)
    domains = VALID_ENVIRONMENTS[env]
    failure = ""
    for domain in domains:
        response = api_client.delete(CLEAR_CACHE_URL.format(env=env, domain=domain))
        error_message = "Failed to clear cache in {domain}.".format(domain=domain)
        try:
            response_json = parse_response(response, error_message)
        except BackendError:
            failure = failure + error_message + "\n"
            continue
        check_state(response_json["id"], username, password)
    if failure:
        raise BackendError(failure)
    return True
def enable_asg(asg):
    """
    Enable an ASG in asgard. This means it will have ELBs routing to it
    if any are associated, and autoscaling will be turned on.

    Arguments:
        asg(str): The name of the asg to enable.

    Returns:
        None: When the asg has been enabled.

    Raises:
        BackendError: If the task to enable the ASG fails.
        TimeoutException: If the request to enable the ASG times out.
        RateLimitedException: When we are being rate limited by AWS.
    """
    payload = {"name": asg}
    response = requests.post(ASG_ACTIVATE_URL,
                             data=payload,
                             params=ASGARD_API_TOKEN,
                             timeout=REQUESTS_TIMEOUT)
    task_url = response.url
    task_status = wait_for_task_completion(task_url, 301)
    if task_status['status'] == 'failed':
        msg = "Failure while enabling ASG. Task Log: \n{}".format(task_status['log'])
        raise BackendError(msg)
def check_state(notification_url, token):
    """
    Checks the status of the response to verify it is "completed".

    Args:
        notification_url (str): The notification url to poll to check the state of the task.
        token (str): Token used to authenticate the client.

    Returns:
        True if the status of the response is "completed".

    Raises:
        BackendError: Raised so the method will retry, since immediately after
            receiving the response the status will still be "in-progress". Can't rely
            on parse_response, since the response should return a 200, just not
            the status wanted.
    """
    response = get_acquia_v2(notification_url, token)
    response_json = parse_response(response, "Failed to check state of response.")
    if response_json["status"] == "completed":
        return True
    raise BackendError(
        "Check status failed. The status of the response was {status}, not completed as expected.\n"
        "JSON Data: {response}".format(status=response_json["status"], response=response_json)
    )
def delete_asg(asg, fail_if_active=True, fail_if_last=True, wait_for_deletion=True):
    """
    Delete an ASG using asgard.

    curl -d "name=helloworld-example-v004" http://asgardprod/us-east-1/cluster/delete

    Arguments:
        asg(str): The name of the asg to delete.

    Returns:
        None: When the asg has been deleted.

    Raises:
        TimeoutException: If the task to delete the ASG times out.
        BackendError: If asgard was unable to delete the ASG.
        ASGDoesNotExistException: When the ASG does not exist.
        RateLimitedException: When we are being rate limited by AWS.
    """
    if is_asg_pending_delete(asg):
        LOG.info("Not deleting ASG {} due to its already pending deletion.".format(asg))
        return
    if fail_if_active and is_asg_enabled(asg):
        msg = "Not deleting ASG {} as it is currently active.".format(asg)
        LOG.warning(msg)
        try:
            ec2.remove_asg_deletion_tag(asg)
        except EC2ResponseError as tagging_error:
            LOG.warning("Failed to remove deletion tag from asg {}. Ignoring: {}".format(
                asg, tagging_error))
        raise CannotDeleteActiveASG(msg)

    if fail_if_last and is_last_asg(asg):
        msg = "Not deleting ASG {} since it is the last ASG in this cluster.".format(asg)
        LOG.warning(msg)
        try:
            ec2.remove_asg_deletion_tag(asg)
        except EC2ResponseError as tagging_error:
            LOG.warning("Failed to remove deletion tag from asg {}. Ignoring: {}".format(
                asg, tagging_error))
        raise CannotDeleteLastASG(msg)

    payload = {"name": asg}
    response = requests.post(ASG_DELETE_URL,
                             data=payload,
                             params=ASGARD_API_TOKEN,
                             timeout=REQUESTS_TIMEOUT)
    task_url = response.url
    if wait_for_deletion:
        task_status = wait_for_task_completion(task_url, 300)
        if task_status['status'] == 'failed':
            msg = "Failure while deleting ASG. Task Log: \n{}".format(task_status['log'])
            raise BackendError(msg)
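# A hypothetical usage sketch for delete_asg(); the ASG name is an
# illustrative placeholder. The keyword flags control the safety guards
# documented above.
try:
    delete_asg("helloworld-example-v004",
               fail_if_active=True,      # refuse to delete an ASG taking traffic
               fail_if_last=True,        # refuse to delete the cluster's only ASG
               wait_for_deletion=False)  # don't block on the Asgard task
except (CannotDeleteActiveASG, CannotDeleteLastASG) as err:
    LOG.warning("Skipped deletion: {}".format(err))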
def _poll_giveup(data):
    u""" Raise an error when the polling tries are exceeded."""
    orig_args = data.get(u'args')
    # The Build object was the only parameter to the original method call,
    # and so it's the first and only item in the args.
    build = orig_args[0]
    msg = u'Timed out waiting for build {} to finish.'.format(build.name)
    raise BackendError(msg)
def _poll_giveup(results):
    """ Raise an error when the polling tries are exceeded. """
    orig_args = results['args']
    msg = 'Timed out after {tries} attempts to send email with subject "{subject}".'.format(
        tries=results['tries'], subject=orig_args[3])
    raise BackendError(msg)
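# Both _poll_giveup variants are backoff "giveup" handlers: backoff calls them
# with a details dict whose keys include 'args', 'kwargs', and 'tries' for the
# decorated call. A minimal wiring sketch; the polled function, its argument
# order, and the interval/try counts are illustrative assumptions.
import backoff

@backoff.on_predicate(backoff.constant,
                      interval=10,
                      max_tries=30,
                      on_giveup=_poll_giveup,
                      jitter=None)
def _check_email_sent(ses_client, source, recipients, subject):
    # orig_args[3] in _poll_giveup above corresponds to `subject`, the fourth
    # positional argument of this decorated function. Returning a falsey value
    # makes backoff retry; the real sent-status check is elided here.
    return False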
def _parse_json(url, response):
    """
    Protect against non-JSON responses that are sometimes returned from Asgard.
    """
    try:
        response_json = response.json()
    except ValueError:
        msg = "Expected json response from url: '{}' - but got the following:\n{}"
        raise BackendError(msg.format(url, response.text))
    return response_json
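# Why _parse_json exists: Asgard sometimes returns an HTML error page with a
# 200 status, and response.json() then raises ValueError. A self-contained
# sketch of that failure mode using a hand-built Response; the URL is a
# placeholder.
import requests

fake = requests.models.Response()
fake.status_code = 200
fake._content = b"<html><body>Oops, not JSON</body></html>"

try:
    _parse_json("http://asgard.example.com/task/1", fake)
except BackendError as err:
    print(err)  # Expected json response from url: ... - but got the following: ...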
def _get_asgard_resource_info(url):
    """
    A generic function for querying Asgard for information about a specific
    resource, such as an Autoscaling Group or a cluster.
    """
    LOG.debug("URL: {}".format(url))
    response = requests.get(url, params=ASGARD_API_TOKEN, timeout=REQUESTS_TIMEOUT)

    if response.status_code == 404:
        raise ResourceDoesNotExistException('Resource for url {} does not exist'.format(url))
    elif response.status_code >= 500:
        raise BackendError('Asgard experienced an error: {}'.format(response.text))
    elif response.status_code != 200:
        raise BackendError('Call to asgard failed with status code: {0}: {1}'
                           .format(response.status_code, response.text))

    LOG.debug("ASG info: {}".format(response.text))
    return _parse_json(url, response)
def deploy(ami_id):
    """
    Deploys an AMI as an auto-scaling group (ASG) to AWS.

    Arguments:
        ami_id(str): AWS AMI ID

    Returns:
        dict(str, str, dict): Returns a dictionary with the keys:
            'ami_id' - AMI id used to deploy the AMI
            'current_asgs' - Lists of current active ASGs, keyed by cluster.
            'disabled_asgs' - Lists of current inactive ASGs, keyed by cluster.

    Raises:
        TimeoutException: When the task to bring up the new instance times out.
        BackendError: When the task to bring up the new instance fails.
        ASGDoesNotExistException: If the ASG being queried does not exist.
    """
    LOG.info("Processing request to deploy {}.".format(ami_id))

    # Pull the EDP from the AMI ID.
    edp = ec2.edp_for_ami(ami_id)

    # These are all autoscaling groups that match the tags we care about.
    existing_edp_asgs = ec2.asgs_for_edp(edp, filter_asgs_pending_delete=False)

    # Find the clusters for all the existing ASGs.
    existing_clustered_asgs = clusters_for_asgs(existing_edp_asgs)
    LOG.info("Deploying to cluster(s) {}".format(existing_clustered_asgs.keys()))

    # Create a new ASG in each cluster.
    new_clustered_asgs = defaultdict(list)
    for cluster in existing_clustered_asgs:
        try:
            newest_asg = new_asg(cluster, ami_id)
            new_clustered_asgs[cluster].append(newest_asg)
        except Exception:
            msg = "ASG creation failed for cluster '{}' but succeeded for cluster(s) {}."
            msg = msg.format(cluster, new_clustered_asgs.keys())
            LOG.exception(msg)
            raise

    new_asgs = [asgs[0] for asgs in new_clustered_asgs.values()]
    LOG.info("New ASGs created: {}".format(new_asgs))
    ec2.wait_for_in_service(new_asgs, 300)
    LOG.info("New ASGs healthy: {}".format(new_asgs))

    LOG.info("Enabling traffic to new ASGs for the {} cluster(s).".format(
        existing_clustered_asgs.keys()))
    success, enabled_asgs, disabled_asgs = _red_black_deploy(dict(new_clustered_asgs),
                                                             existing_clustered_asgs)
    if not success:
        raise BackendError("Error performing red/black deploy - deploy was unsuccessful. "
                           "enabled_asgs: {} - disabled_asgs: {}".format(enabled_asgs, disabled_asgs))

    LOG.info("Woot! Deploy Done!")
    return {'ami_id': ami_id, 'current_asgs': enabled_asgs, 'disabled_asgs': disabled_asgs}
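# A hypothetical usage sketch for deploy(); the AMI ID is an illustrative
# placeholder. Per the docstring, the returned dict maps each cluster to the
# ASGs now taking traffic ('current_asgs') and those disabled by the
# red/black swap ('disabled_asgs').
result = deploy("ami-0123456789abcdef0")
for cluster, asgs in result['current_asgs'].items():
    LOG.info("Cluster {} now serving from: {}".format(cluster, asgs))
for cluster, asgs in result['disabled_asgs'].items():
    LOG.info("Cluster {} disabled: {}".format(cluster, asgs))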
def disable_asg(asg):
    """
    Disable an ASG using asgard.

    curl -d "name=helloworld-example-v004" http://asgardprod/us-east-1/cluster/deactivate

    Arguments:
        asg(str): The name of the asg to disable.

    Returns:
        None: When the asg has been disabled.

    Raises:
        TimeoutException: If the task to disable the ASG times out.
        BackendError: If asgard was unable to disable the ASG.
        ASGDoesNotExistException: If the ASG does not exist.
        CannotDisableActiveASG: If the ASG is the last ASG in its cluster.
        RateLimitedException: When we are being rate limited by AWS.
    """
    try:
        if is_asg_pending_delete(asg):
            LOG.info("Not disabling old ASG {} due to its pending deletion.".format(asg))
            return
    except ASGDoesNotExistException:
        LOG.info("Not disabling ASG {}, it no longer exists.".format(asg))
        return

    if is_last_asg(asg):
        msg = "Not disabling ASG {}, it is the last ASG in this cluster.".format(asg)
        raise CannotDisableActiveASG(msg)

    payload = {"name": asg}
    response = requests.post(ASG_DEACTIVATE_URL,
                             data=payload,
                             params=ASGARD_API_TOKEN,
                             timeout=REQUESTS_TIMEOUT)
    task_url = response.url
    task_status = wait_for_task_completion(task_url, 300)
    if task_status['status'] == 'failed':
        msg = "Failure while disabling ASG. Task Log: \n{}".format(task_status['log'])
        raise BackendError(msg)
def parse_response(response, error_message):
    """
    Parses the response.

    Args:
        response (requests.Response): The response to parse.
        error_message (str): The message to log and include in the raised
            error if the status code is not 200.

    Returns:
        The JSON representation of the response if no errors.

    Raises:
        BackendError: Raised if the response's status code is not 200.
    """
    if response.status_code != 200:
        msg = "{specific}\nStatus Code: {status}\nBody: {body}".format(
            specific=error_message, status=response.status_code, body=response.text)
        LOG.error(msg)
        raise BackendError(msg)
    return response.json()
def clear_varnish_cache(app_id, env, client_id, secret):
    """
    Clears the Varnish cache from all domains in a Drupal environment.

    Args:
        app_id (str): Application id assigned to Drupal instance.
        env (str): The environment to clear varnish caches in (e.g. test or prod)
        client_id (str): The Acquia api client id necessary to run the command.
        secret (str): The Acquia api secret key to run the command.

    Returns:
        True if all of the Varnish caches are successfully cleared.

    Raises:
        KeyError: Raised if env value is invalid.
        BackendError: Raised if the varnish cache fails to clear in any of the domains.
    """
    domains = VALID_ENVIRONMENTS[env]
    failure = ""
    token = get_api_token(client_id, secret)
    environment_id = fetch_environment_uid(app_id, env, token)
    if environment_id:
        for domain in domains:
            response = post_acquia_v2(
                CLEAR_CACHE_URL.format(environmentId=environment_id, domain=domain), token)
            error_message = "Failed to clear cache in {domain}.".format(domain=domain)
            try:
                response_json = parse_response(response, error_message)
            except BackendError:
                failure = failure + error_message + "\n"
                continue
            check_state(response_json['_links']['notification']['href'], token)
    if failure:
        raise BackendError(failure)
    return True
def trigger_build(base_url, user_name, user_token, job_name, job_token,
                  job_cause=None, job_params=None, timeout=60 * 30):
    u"""
    Trigger a jenkins job/project (note that jenkins uses these terms interchangeably)

    Args:
        base_url (str): The base URL for the jenkins server, e.g. https://test-jenkins.testeng.edx.org
        user_name (str): The jenkins username
        user_token (str): API token for the user. Available at {base_url}/user/{user_name}/configure
        job_name (str): The Jenkins job name, e.g. test-project
        job_token (str): Jobs must be configured with the option "Trigger builds remotely" selected.
            Under this option, you must provide an authorization token (configured in the job)
            in the form of a string so that only those who know it would be able to remotely
            trigger this project's builds.
        job_cause (str): Text that will be included in the recorded build cause
        job_params (set of tuples): Parameter names and their values to pass to the job
        timeout (int): The maximum number of seconds to wait for the jenkins build to complete
            (measured from when the job is triggered.)

    Returns:
        The status of the build that was triggered

    Raises:
        BackendError: if the Jenkins job could not be triggered successfully
    """
    @backoff.on_predicate(
        backoff.constant,
        interval=60,
        max_tries=timeout // 60 + 1,
        on_giveup=_poll_giveup,
        # We aren't worried about concurrent access, so turn off jitter
        jitter=None,
    )
    def poll_build_for_result(build):
        u"""
        Poll for the build running, at a constant interval, capped to ``timeout`` seconds.
        The on_predicate decorator is used to retry when the return value of the
        target function is True.
        """
        return not build.is_running()

    # Create a dict with key/value pairs from the job_params
    # that were passed in like this: --param FOO bar --param BAZ biz
    # These will get passed to the job as string parameters like this:
    # {u'FOO': u'bar', u'BAZ': u'biz'}
    request_params = {}
    if job_params:
        for param in job_params:
            request_params[param[0]] = param[1]

    # Contact jenkins, log in, and get the base data on the system.
    try:
        crumb_requester = CrumbRequester(baseurl=base_url,
                                         username=user_name,
                                         password=user_token,
                                         ssl_verify=True)
        jenkins = Jenkins(base_url,
                          username=user_name,
                          password=user_token,
                          requester=crumb_requester)
    except (JenkinsAPIException, HTTPError) as err:
        raise BackendError(str(err))

    if not jenkins.has_job(job_name):
        msg = u'Job not found: {}.'.format(job_name)
        msg += u' Verify that you have permissions for the job and double check the spelling of its name.'
        raise BackendError(msg)

    # This will start the job and will return a QueueItem object
    # which can be used to get build results.
    job = jenkins[job_name]
    queue_item = job.invoke(securitytoken=job_token,
                            build_params=request_params,
                            cause=job_cause)
    LOG.info(u'Added item to jenkins. Server: {} Job: {} '.format(
        jenkins.base_server_url(), queue_item))

    # Block this script until we are through the queue and the job has begun to build.
    queue_item.block_until_building()
    build = queue_item.get_build()
    LOG.info(u'Created build {}'.format(build))
    LOG.info(u'See {}'.format(build.baseurl))

    # Now block until you get a result back from the build.
    poll_build_for_result(build)

    # Update the build's internal state, so that the final status is available.
    build.poll()
    status = build.get_status()
    LOG.info(u'Build status: {status}'.format(status=status))
    return status
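# A hypothetical invocation sketch for trigger_build(); the server URL,
# credentials, job name, and parameters are illustrative placeholders.
status = trigger_build(
    base_url="https://test-jenkins.testeng.edx.org",
    user_name="deploy-bot",
    user_token="<api-token>",
    job_name="test-project",
    job_token="<job-trigger-token>",
    job_cause="Triggered by release pipeline",
    job_params=[("FOO", "bar"), ("BAZ", "biz")],
)
if status != "SUCCESS":  # jenkinsapi reports e.g. SUCCESS, FAILURE, ABORTED
    raise BackendError("Jenkins build finished with status {}".format(status))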