def main():
    """
    Automation script for running scaling tests for Toil Recompute.

    Launches a cluster, runs the pipeline (blocking until workers go idle),
    collects CloudWatch metrics, applies "insta-kill" alarms to the workers
    and the leader, then writes a cost report to
    <uuid>_<date>/run_report.txt.
    """
    parser = argparse.ArgumentParser(description=main.__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--config', required=True,
                        help='Configuration file for run. Must be in shared_dir')
    parser.add_argument('-c', '--cluster_size', required=True,
                        help='Number of workers desired in the cluster.')
    parser.add_argument('-s', '--sample_size', required=True, type=float,
                        help='Size of the sample desired in TB.')
    parser.add_argument('-t', '--instance_type', default='c3.8xlarge',
                        help='e.g. m4.large or c3.8xlarge.')
    parser.add_argument('-n', '--cluster_name', required=True, help='Name of cluster.')
    parser.add_argument('--namespace', default='jtvivian', help='CGCloud NameSpace')
    parser.add_argument('--spot_price', default=0.60, help='Change spot price of instances')
    parser.add_argument('-b', '--bucket', default='tcga-data-cgl-recompute',
                        help='Bucket where data is.')
    parser.add_argument('-d', '--shared_dir', required=True,
                        help='Full path to directory with: pipeline script, launch script, '
                             'config, and master key.')
    params = parser.parse_args()

    # Run sequence
    start = time.time()
    # Get number of samples from config (one sample per line)
    with open(params.config, 'r') as f:
        num_samples = len(f.readlines())
    # Launch cluster and pipeline
    uuid = fix_launch(params)
    launch_cluster(params)
    ids = get_instance_ids(filter_cluster=params.cluster_name,
                           filter_name=params.namespace + '_toil-worker')
    launch_pipeline(params)  # Blocks until all workers are idle
    stop = time.time()
    # Collect metrics from cluster.
    # NOTE(review): list_of_metrics is not defined in this function; it must be a
    # module-level name defined elsewhere in the file -- confirm before running.
    collect_metrics(ids, list_of_metrics, start, stop, uuid=uuid)
    # Apply "Insta-kill" alarm to every worker.  An explicit loop is used instead
    # of map() so the side effect also runs under Python 3, where map() is lazy.
    for instance_id in ids:
        apply_alarm_to_instance(instance_id)
    # Kill leader
    logging.info('Killing Leader')
    leader_id = get_instance_ids(filter_cluster=params.cluster_name,
                                 filter_name=params.namespace + '_toil-leader')[0]
    apply_alarm_to_instance(leader_id, threshold=5)
    # Generate Run Report
    avail_zone = get_avail_zone(filter_cluster=params.cluster_name,
                                filter_name=params.namespace + '_toil-worker')[0]
    total_cost, avg_hourly_cost = calculate_cost(params.instance_type, ids[0], avail_zone)
    # Report values
    output = ['UUID: {}'.format(uuid),
              'Number of Samples: {}'.format(num_samples),
              'Number of Nodes: {}'.format(params.cluster_size),
              'Cluster Name: {}'.format(params.cluster_name),
              'Source Bucket: {}'.format(params.bucket),
              'Average Hourly Cost: ${}'.format(avg_hourly_cost),
              'Cost per Instance: ${}'.format(total_cost),
              'Availability Zone: {}'.format(avail_zone),
              'Start Time: {}'.format(datetime.utcfromtimestamp(start).isoformat()),
              'Stop Time: {}'.format(datetime.utcfromtimestamp(stop).isoformat()),
              'Total Cost of Cluster: ${}'.format(float(total_cost) * int(params.cluster_size)),
              'Cost Per Sample: ${}'.format(
                  float(total_cost) * int(params.cluster_size) / int(num_samples))]
    # collect_metrics() creates the '<uuid>_<date>' directory this report lands in;
    # if it did not run, this open() would fail with a missing directory.
    with open(os.path.join(str(uuid) + '_{}'.format(str(datetime.utcnow()).split()[0]),
                           'run_report.txt'), 'w') as f:
        f.write('\n'.join(output))
    # You're done!
    logging.info('\n\nScaling Test Complete.')
def collect_metrics(params, start, stop=None, uuid=None):
    """
    Collect CloudWatch metrics for the cluster's worker instances and save
    one CSV file per metric under a '<uuid>_<date>' directory.

    AWS limits a single metric query to 1,440 data points (5 days at
    5-minute resolution), so collection is "paged" in windows of 4 days
    (to be safe) until the whole [start, stop] interval is covered.

    params: argparse.Namespace
        Must provide cluster_name and namespace (used to find worker IDs)
    start: float
        time.time() of start point
    stop: float
        time.time() of stop point; defaults to now
    uuid: str
        UUID tag for this metric collection; defaults to a fresh uuid4
    """
    # Resolve defaults at call time.  The original signature used
    # uuid=str(uuid4()), which is evaluated once at import and therefore
    # silently shared by every call.
    if stop is None:
        stop = time.time()
    if uuid is None:
        uuid = str(uuid4())
    list_of_metrics = ['AWS/EC2/CPUUtilization',
                       'CGCloud/MemUsage',
                       'CGCloud/DiskUsage_mnt_ephemeral',
                       'CGCloud/DiskUsage_root',
                       'AWS/EC2/NetworkIn',
                       'AWS/EC2/NetworkOut',
                       'AWS/EC2/DiskWriteOps',
                       'AWS/EC2/DiskReadOps']
    ids = get_instance_ids(filter_cluster=params.cluster_name,
                           filter_name=params.namespace + '_toil-worker')
    # This initialization was commented out in the original, so every
    # metrics[metric] access raised NameError; restored here.
    metrics = {metric: [] for metric in list_of_metrics}
    # Single pass over the workers.  The original wrapped this in
    # 'while ids:' without ever emptying ids, which never terminated.
    for instance_id in list(ids):
        for metric in list_of_metrics:
            averages = []
            try:
                s = start
                while s < stop:
                    # Page the query in 4-day windows to stay under the
                    # 1,440-datapoint limit.
                    e = s + (4 * 24 * 3600)
                    aws_start = datetime.utcfromtimestamp(s)
                    aws_stop = datetime.utcfromtimestamp(e)
                    met_object = get_metric(metric, instance_id, aws_start, aws_stop)
                    averages.extend([x['Average'] for x in get_datapoints(met_object)])
                    s = e
                if averages:
                    metrics[metric].append(averages)
                    logging.info('# of Datapoints for metric {} is {}'.format(
                        metric, len(metrics[metric][0])))
            except RuntimeError:
                # Instance is gone (e.g. terminated mid-run); the original
                # removed from the undefined name 'instance_ids' -- fixed to
                # operate on the actual list.
                if instance_id in ids:
                    ids.remove(instance_id)
    # Remove metrics if no datapoints were collected
    metrics = dict((k, v) for k, v in metrics.items() if v)
    # Save CSV of data
    date = str(datetime.utcnow()).split()[0]
    mkdir_p('{}_{}'.format(uuid, date))
    for metric in metrics:
        # 'wb' matches the Python 2 csv convention used elsewhere in this file.
        with open('{}_{}/{}.csv'.format(uuid, date, metric.rsplit('/', 1)[1]), 'wb') as f:
            writer = csv.writer(f)
            writer.writerows(metrics[metric])
def main():
    """
    Script to collect aggregate metrics from a collection of instances.
    """
    # The original had this argparse block commented out in favor of
    # hard-coded values; restored with those values as defaults, so running
    # with no arguments behaves exactly as before while allowing other
    # clusters to be targeted from the command line.
    parser = argparse.ArgumentParser(description=main.__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-c', '--cluster_name', default='scaling-gtex-400',
                        help='Name of cluster to filter by.')
    parser.add_argument('-n', '--instance_name', default='jtvivian_toil-worker',
                        help='Name of instance to filter by.')
    params = parser.parse_args()

    ids = get_instance_ids(filter_cluster=params.cluster_name,
                           filter_name=params.instance_name)
    logging.info("IDs being collected: {}".format(ids))
    list_of_metrics = ['AWS/EC2/CPUUtilization',
                       'CGCloud/MemUsage',
                       'CGCloud/DiskUsage_mnt_ephemeral',
                       'CGCloud/DiskUsage_root',
                       'AWS/EC2/NetworkIn',
                       'AWS/EC2/NetworkOut',
                       'AWS/EC2/DiskWriteOps',
                       'AWS/EC2/DiskReadOps']
    # NOTE(review): start/stop are hard-coded epoch timestamps from a specific
    # past run; parameterize if this script is reused.
    collect_metrics(ids, list_of_metrics, start=1454355507.550286, stop=1454405909.397642)
def collect_realtime_metrics(params, threshold=0.5):
    """
    Collect metrics from AWS instances in 1 hour intervals.
    Instances that have gone idle (below threshold CPU value) are terminated,
    but only while the actual cluster size exceeds the desired size recorded
    in SimpleDB.

    :type params: argparse.Namespace
    :param float threshold: max average CPU utilisation (%) over the last
        30 minutes below which a worker counts as idle
    """
    list_of_metrics = ['AWS/EC2/CPUUtilization',
                       'CGCloud/MemUsage',
                       'CGCloud/DiskUsage_mnt_ephemeral',
                       'CGCloud/DiskUsage_root',
                       'AWS/EC2/NetworkIn',
                       'AWS/EC2/NetworkOut',
                       'AWS/EC2/DiskWriteOps',
                       'AWS/EC2/DiskReadOps']
    # Create output directory
    uuid = str(uuid4())
    date = str(datetime.utcnow().date())
    dir_path = '{}_{}_{}'.format(params.cluster_name, uuid, date)
    mkdir_p(dir_path)
    start = time.time() - metric_start_time_margin
    # Create connections to ec2 and cloudwatch
    region = region_of_zone(params.zone)
    conn = boto.ec2.connect_to_region(region)
    cw = boto.ec2.cloudwatch.connect_to_region(region)
    sdbconn = boto.sdb.connect_to_region(region)
    domain = sdbconn.get_domain('{0}--files'.format(params.jobstore))
    # Create initial variables
    start = datetime.utcfromtimestamp(start)
    DataPoint = namedtuple('datapoint', ['instance_id', 'value', 'timestamp'])
    timestamps = {}
    # Begin loop
    log.info('Metric collection has started. '
             'Waiting {} seconds before initial collection.'.format(
                 metric_initial_wait_period_in_seconds))
    time.sleep(metric_initial_wait_period_in_seconds)
    while True:
        # FIXME: why doesn't filter_cluster=params.cluster_name work?
        # strip('/') removes slashes from both ends already; the original's
        # trailing .rstrip('/') was a no-op and has been dropped.
        ids = get_instance_ids(filter_name=params.namespace.strip('/') + '_toil-worker')
        if not ids:
            break
        metric_collection_time = time.time()
        try:
            for instance_id in tqdm(ids):
                idle = False
                for metric in list_of_metrics:
                    datapoints = []
                    # Resume from the last timestamp we saw for this instance.
                    aws_start = timestamps.get(instance_id, start)
                    aws_stop = datetime.utcnow() + metric_endtime_margin
                    metric_object = get_metric(cw, metric, instance_id, aws_start, aws_stop)
                    for datum in metric_object:
                        d = DataPoint(instance_id=instance_id, value=datum['Average'],
                                      timestamp=datum['Timestamp'])
                        datapoints.append(d)
                    # Save data in local directory
                    if datapoints:
                        datapoints = sorted(datapoints, key=lambda x: x.timestamp)
                        with open(os.path.join(dir_path,
                                               '{}.csv'.format(os.path.basename(metric))), 'a') as f:
                            writer = csv.writer(f, delimiter='\t')
                            writer.writerows(datapoints)
                    # Check if instance's CPU has been idle the last 30 minutes
                    # (6 datapoints at 5-minute resolution).
                    if metric == 'AWS/EC2/CPUUtilization':
                        averages = [x.value for x in sorted(datapoints,
                                                            key=lambda x: x.timestamp)][-6:]
                        # If there is at least 30 minutes of data points and max is
                        # below threshold, flag to be killed.
                        if len(averages) == 6:
                            if max(averages) < threshold:
                                idle = True
                                log.info('Flagging {} to be killed. '
                                         'Max CPU {} for last 30 minutes.'.format(
                                             instance_id, max(averages)))
                            else:
                                log.info('Max CPU for {} was {} for last 30 minutes.'.format(
                                    instance_id, max(averages)))
                # Kill instance if idle and cluster is too large
                if idle:
                    try:
                        with cluster_size_lock:
                            cluster_size = get_cluster_size(params.cluster_name)
                            desired_cluster_size = get_desired_cluster_size(domain)
                            if cluster_size > desired_cluster_size:
                                # Space added between the concatenated literals; the
                                # original message read "...(%d).Terminating...".
                                log.info('Cluster size (%d) is larger than requested (%d). '
                                         'Terminating idle instance %s.',
                                         cluster_size, desired_cluster_size, instance_id)
                                cmd = ['cgcloud', 'terminate', '--instance-id', instance_id,
                                       '--cluster-name', params.cluster_name, 'toil']
                                try:
                                    check_call(cmd)
                                    log.info("Successfully terminated instance via %s.",
                                             " ".join(cmd))
                                except Exception:
                                    # Narrowed from a bare except: so SystemExit /
                                    # KeyboardInterrupt propagate untouched.
                                    log.error("Terminating instance with %s failed.",
                                              " ".join(cmd))
                                    raise
                                update_cluster_size(domain, cluster_size - 1)
                    except (EC2ResponseError, BotoServerError) as e:
                        log.info('Error terminating instance: {}\n{}'.format(instance_id, e))
                # Set start point to be last collected timestamp
                timestamps[instance_id] = (max(x.timestamp for x in datapoints)
                                           if datapoints else start)
        except BotoServerError:
            log.error('Giving up trying to fetch metric for this interval')
        # Sleep until the next collection interval.  The original message
        # hard-coded "Waiting one hour." regardless of the configured interval.
        collection_time = time.time() - metric_collection_time
        log.info('Metric collection took: {} seconds. '
                 'Waiting {} seconds.'.format(collection_time,
                                              metric_collection_interval_in_seconds))
        wait_time = metric_collection_interval_in_seconds - collection_time
        if wait_time < 0:
            log.warning('Collection time exceeded metric collection interval by: %i', -wait_time)
        else:
            time.sleep(wait_time)
    log.info('Metric collection has finished.')
def collect_realtime_metrics(params, threshold=0.5, region='us-west-2'):
    """
    Collect metrics from AWS instances in 1 hour intervals.
    Instances that have gone idle (below threshold CPU value) are flagged,
    but automatic termination is currently disabled -- see the NOTE in the
    idle-detection branch below.

    params: argparse.Namespace
        Input arguments
    threshold: float
        Max average CPU utilisation (%) over the last 20 minutes below
        which a worker counts as idle
    region: str
        AWS region metrics are being collected from
    """
    list_of_metrics = ['AWS/EC2/CPUUtilization',
                       'CGCloud/MemUsage',
                       'CGCloud/DiskUsage_mnt_ephemeral',
                       'CGCloud/DiskUsage_root',
                       'AWS/EC2/NetworkIn',
                       'AWS/EC2/NetworkOut',
                       'AWS/EC2/DiskWriteOps',
                       'AWS/EC2/DiskReadOps']
    # Create output directory
    uuid = str(uuid4())
    date = str(datetime.utcnow().date())
    dir_path = '{}_{}_{}'.format(params.cluster_name, uuid, date)
    mkdir_p(dir_path)
    start = time.time() - metric_start_time_margin
    # Create connections to ec2 and cloudwatch
    conn = boto.ec2.connect_to_region(region)
    cw = boto.ec2.cloudwatch.connect_to_region(region)
    # Create initial variables
    start = datetime.utcfromtimestamp(start)
    DataPoint = namedtuple('datapoint', ['instance_id', 'value', 'timestamp'])
    timestamps = {}
    # Begin loop
    log.info('Metric collection has started. '
             'Waiting {} seconds before initial collection.'.format(
                 metric_initial_wait_period_in_seconds))
    time.sleep(metric_initial_wait_period_in_seconds)
    while True:
        ids = get_instance_ids(filter_cluster=params.cluster_name,
                               filter_name=params.namespace + '_toil-worker')
        if not ids:
            break
        metric_collection_time = time.time()
        try:
            for instance_id in tqdm(ids):
                kill_instance = False
                for metric in list_of_metrics:
                    datapoints = []
                    # Resume from the last timestamp we saw for this instance.
                    aws_start = timestamps.get(instance_id, start)
                    aws_stop = datetime.utcnow() + metric_endtime_margin
                    metric_object = get_metric(cw, metric, instance_id, aws_start, aws_stop)
                    for datum in metric_object:
                        d = DataPoint(instance_id=instance_id, value=datum['Average'],
                                      timestamp=datum['Timestamp'])
                        datapoints.append(d)
                    # Save data in local directory
                    if datapoints:
                        datapoints = sorted(datapoints, key=lambda x: x.timestamp)
                        with open(os.path.join(dir_path,
                                               '{}.tsv'.format(os.path.basename(metric))), 'a') as f:
                            writer = csv.writer(f, delimiter='\t')
                            writer.writerows(datapoints)
                    # Check if instance's CPU has been idle the last 20 minutes
                    # (4 datapoints at 5-minute resolution; the original log
                    # message wrongly said 30 minutes).
                    if metric == 'AWS/EC2/CPUUtilization':
                        averages = [x.value for x in sorted(datapoints,
                                                            key=lambda x: x.timestamp)][-4:]
                        # If there are at least 20 minutes of data points and max is
                        # below threshold, the instance is idle.
                        if len(averages) == 4:
                            if max(averages) < threshold:
                                # NOTE(review): termination is deliberately disabled --
                                # the original set kill_instance = False here ("Don't
                                # kill an instance") while logging that the instance was
                                # flagged to be killed, which made the termination block
                                # below unreachable and the log misleading.  Flip this
                                # to True to re-enable automatic termination.
                                kill_instance = False
                                log.info('Instance {} is idle: max CPU {} for last 20 '
                                         'minutes (termination disabled).'.format(
                                             instance_id, max(averages)))
                # Kill instance if idle (unreachable while termination is disabled above)
                if kill_instance:
                    try:
                        log.info('Terminating Instance: {}'.format(instance_id))
                        conn.terminate_instances(instance_ids=[instance_id])
                    except (EC2ResponseError, BotoServerError) as e:
                        log.info('Error terminating instance: {}\n{}'.format(instance_id, e))
                # Set start point to be last collected timestamp
                timestamps[instance_id] = (max(x.timestamp for x in datapoints)
                                           if datapoints else start)
        except BotoServerError:
            log.error('Giving up trying to fetch metric for this interval')
        # Sleep
        collection_time = time.time() - metric_collection_time
        log.info('Metric collection took: {} seconds. '
                 'Waiting {} seconds.'.format(collection_time,
                                              metric_collection_interval_in_seconds))
        wait_time = metric_collection_interval_in_seconds - collection_time
        if wait_time < 0:
            log.warning('Collection time exceeded metric collection interval by: %i', -wait_time)
        else:
            time.sleep(wait_time)
    log.info('Metric collection has finished.')