def init_resource_config(redis_db, default_mr_config, machine_type):
    """Enact the default provisioning of every MR and mirror it into Redis.

    Each MR key of ``default_mr_config`` is checked with
    ``check_improve_mr_viability``. If that check returns ``False`` a message
    is printed and the process exits. Otherwise the provision is applied
    through ``resource_modifier``, stored through ``resource_datastore`` and
    accounted for with ``update_machine_consumption`` using 0 as the previous
    allocation.

    Args:
        redis_db: Redis handle forwarded to the datastore helpers.
        default_mr_config: Mapping of MR -> initial resource provision.
        machine_type: Forwarded to ``get_instance_specs``.
    """
    print('Initializing the Resource Configurations in the containers')
    # The specs are fetched but not consulted below; the call is kept as-is.
    instance_specs = get_instance_specs(machine_type)

    for mr in default_mr_config:
        initial_provision = default_mr_config[mr]

        viable = check_improve_mr_viability(redis_db, mr, initial_provision)
        if viable is False:
            print('Initial Resource provisioning for {} is too much. Exiting...'.format(
                mr.to_string()))
            exit()

        # Enact the change in resource provisioning
        resource_modifier.set_mr_provision(mr, initial_provision)

        # Reflect the change in Redis
        resource_datastore.write_mr_alloc(redis_db, mr, initial_provision)
        update_machine_consumption(redis_db, mr, initial_provision, 0)
def apply_pipeline_filter(redis_db, mr_working_set, experiment_iteration,
                          system_config, workload_config, filter_config):
    """Stress groups ("pipelines") of MRs together and return the top-ranked group.

    The working set is partitioned into pipelines, either one per service
    (``'BY_SERVICE'``) or via ``gen_mr_random_split`` (``'RANDOM'``). Every
    pipeline is stressed as a whole, the workload is measured, the mean of
    the ``tbot_metric`` samples is stored under the pipeline's index, and the
    stress is reverted. The MRs of the first entry returned by
    ``tbot_datastore.get_top_n_filtered_results`` are appended to the
    ``FILTER_LOGS`` file and returned.

    Args:
        redis_db: Redis handle forwarded to the datastore/schedule helpers.
        mr_working_set: Iterable of MR objects (``service_name``,
            ``to_string()`` are used here).
        experiment_iteration: Iteration identifier used as the results key.
        system_config: Forwarded to the schedule and ranking helpers.
        workload_config: Reads ``'tbot_metric'`` and ``'optimize_for_lowest'``.
        filter_config: Reads ``'pipeline_partitions'``, ``'stress_amount'``,
            ``'filter_exp_trials'`` and ``'pipeline_services'``.

    Returns:
        list: The MRs of the selected (most impacted) pipeline.

    Raises:
        IndexError: If the ranking helper returns no scores (for example when
            no pipeline group was built).
    """
    print('*' * 20)
    print('INFO: Applying Filtering Pipeline')
    print('Filter config is {}'.format(filter_config))
    print('MR working set is {}'.format(mr_working_set))

    pipeline_partitions = filter_config['pipeline_partitions']
    stress_weight = filter_config['stress_amount']
    experiment_trials = filter_config['filter_exp_trials']
    pipelined_services = filter_config['pipeline_services']
    pipeline_groups = []

    print('Pipelined services are {}'.format(pipelined_services))

    # No specified pipelined services indicates that each pipeline is a service
    pipeline_mode = pipelined_services[0][0]
    if pipeline_mode == 'BY_SERVICE':
        # Sorted so that a pipeline index maps to the same service on every
        # run instead of depending on set iteration order.
        service_names = sorted(set(mr.service_name for mr in mr_working_set))
        for service_name in service_names:
            # The original passed an undefined name (``services``) to
            # search_mr_working_set here, which raised NameError. The
            # per-service grouping is done inline instead.
            pipeline_groups.append(
                [mr for mr in mr_working_set
                 if mr.service_name == service_name])
    elif pipeline_mode == 'RANDOM':
        pipeline_groups = gen_mr_random_split(mr_working_set,
                                              pipeline_partitions)

    print("The pipeline groups are being printed below: ")
    for pipeline_group in pipeline_groups:
        group_names = [mr.to_string() for mr in pipeline_group]
        print('A pipeline is {}'.format(group_names))

    tbot_metric = workload_config['tbot_metric']
    optimize_for_lowest = workload_config['optimize_for_lowest']

    for pipeline_index, pipeline in enumerate(pipeline_groups):
        # Simultaneously stress the MRs in a pipeline
        stress_schedule = calculate_mr_gradient_schedule(
            redis_db, pipeline, system_config, stress_weight)
        for mr in stress_schedule:
            resource_modifier.set_mr_provision(mr, stress_schedule[mr],
                                               workload_config)

        experiment_results = measure_runtime(workload_config,
                                             experiment_trials)
        exp_mean = mean_list(experiment_results[tbot_metric])
        # Results are keyed by the pipeline's position in pipeline_groups.
        tbot_datastore.write_filtered_results(redis_db, 'pipeline',
                                              experiment_iteration,
                                              str(pipeline_index), exp_mean)

        # Revert the stressing
        revert_schedule = revert_mr_gradient_schedule(
            redis_db, pipeline, system_config, stress_weight)
        for mr in revert_schedule:
            resource_modifier.set_mr_provision(mr, revert_schedule[mr],
                                               workload_config)

    all_pipeline_score_list = tbot_datastore.get_top_n_filtered_results(
        redis_db, 'pipeline', experiment_iteration, system_config,
        optimize_for_lowest=optimize_for_lowest)
    print('INFO: The current pipeline score list is here {}'.format(
        all_pipeline_score_list))

    # Temporarily just choose the most impacted pipeline
    selected_pipeline_score_list = [all_pipeline_score_list[0]]

    # MIP = Most Impacted Pipeline
    mip = []
    for pipeline_repr, _score in selected_pipeline_score_list:
        mip.extend(pipeline_groups[int(pipeline_repr)])
    print(mip)

    # Log results of the filtering
    print('About to log to {}'.format(FILTER_LOGS))
    # One record: "<iteration>,<mr>,<mr>,...," followed by a blank line.
    filter_str = '{},'.format(experiment_iteration)
    filter_str += ''.join('{},'.format(mr.to_string()) for mr in mip)
    filter_str += '\n\n'
    with open(FILTER_LOGS, "a") as myfile:
        myfile.write(filter_str)

    return mip
def run(sys_config, workload_config, filter_config, default_mr_config,
        last_completed_iter=0):
    """Run the iterative stress / measure / re-provision experiment loop.

    Steps, as implemented below:

    1. Connect to Redis (flushing it when ``last_completed_iter`` is 0) and
       initialize cluster capacities, service placement and the default
       resource configuration.
    2. Measure a baseline with ``measure_baseline``.
    3. While the experiment counter is below 10: apply the analytic baseline
       provisions, pick MRs with ``apply_filtering_policy``, stress each one
       at every stress weight and record the results, then walk the IMR list
       and enact the first viable allocation increase (filling the machine
       or taking resources from NIMRs when the direct increase is not
       viable). The loop ends early if there is no IMR or no viable
       improvement.
    4. Print the final MR configuration.

    Args:
        sys_config: Dict; the keys read are listed at the top of the body.
        workload_config: Dict; reads ``'tbot_metric'`` and
            ``'optimize_for_lowest'`` and is forwarded to the measurement
            helpers.
        filter_config: Forwarded to ``apply_filtering_policy``.
        default_mr_config: Mapping of MR -> default provision.
        last_completed_iter: Iteration to resume after; 0 starts fresh and
            flushes Redis.
    """
    redis_host = sys_config['redis_host']
    baseline_trials = sys_config['baseline_trials']
    experiment_trials = sys_config['trials']
    stress_weights = sys_config['stress_weights']
    # The next four settings are read but not used in this function; the
    # lookups are kept so a config that lacks them still fails here.
    stress_policy = sys_config['stress_policy']
    resource_to_stress = sys_config['stress_these_resources']
    service_to_stress = sys_config['stress_these_services']
    vm_to_stress = sys_config['stress_these_machines']
    machine_type = sys_config['machine_type']
    quilt_overhead = sys_config['quilt_overhead']
    gradient_mode = sys_config['gradient_mode']

    preferred_performance_metric = workload_config['tbot_metric']
    optimize_for_lowest = workload_config['optimize_for_lowest']

    redis_db = redis.StrictRedis(host=redis_host, port=6379, db=0)
    if last_completed_iter == 0:
        # Fresh run: previous results are discarded without asking for
        # confirmation.
        redis_db.flushall()

    print('\n' * 2)
    print('*' * 20)
    print('INFO: INITIALIZING RESOURCE CONFIG')
    # Initialize Redis and Cluster based on the default resource configuration
    init_cluster_capacities_r(redis_db, machine_type, quilt_overhead)
    init_service_placement_r(redis_db, default_mr_config)
    init_resource_config(redis_db, default_mr_config, machine_type)

    print('*' * 20)
    print('INFO: INSTALLING DEPENDENCIES')
    #install_dependencies(workload_config)

    # Initialize time for data charts
    time_start = datetime.datetime.now()

    print('*' * 20)
    print('INFO: RUNNING BASELINE')

    # Get the Current Performance -- not used for any analysis, just to benchmark progress!!
    current_performance = measure_baseline(workload_config, baseline_trials)
    current_performance[preferred_performance_metric] = remove_outlier(
        current_performance[preferred_performance_metric])
    current_time_stop = datetime.datetime.now()
    time_delta = current_time_stop - time_start

    print('Current (non-analytic) performance measured: {}'.format(
        current_performance))

    # NOTE(review): the iteration-0 summary is only written when resuming
    # (last_completed_iter != 0), i.e. not after the flush above. This looks
    # inverted -- confirm the intended condition. Behaviour left unchanged.
    if last_completed_iter != 0:
        initial_mean = mean_list(
            current_performance[preferred_performance_metric])
        tbot_datastore.write_summary_redis(
            redis_db, 0, MR('initial', 'initial', []), 0, {},
            initial_mean, initial_mean,
            # total_seconds() rather than .seconds: .seconds silently drops
            # whole days from the elapsed time.
            int(time_delta.total_seconds()), 0)

    print('============================================')
    print('\n' * 2)

    # Initialize the current configurations
    # Initialize the working set of MRs to all the MRs
    mr_working_set = resource_datastore.get_all_mrs(redis_db)
    resource_datastore.write_mr_working_set(redis_db, mr_working_set, 0)
    cumulative_mr_count = 0
    experiment_count = last_completed_iter + 1

    # The iteration cap of 10 is hard-coded.
    while experiment_count < 10:
        # Calculate the analytic baseline that is used to determine MRs
        analytic_provisions = prepare_analytic_baseline(
            redis_db, sys_config, min(stress_weights))
        print('The Analytic provisions are as follows {}'.format(
            analytic_provisions))
        for mr in analytic_provisions:
            resource_modifier.set_mr_provision(mr, analytic_provisions[mr])
        analytic_baseline = measure_runtime(workload_config,
                                            experiment_trials)
        # analytic_mean is taken before outlier removal, as in the original.
        analytic_mean = mean_list(
            analytic_baseline[preferred_performance_metric])
        print('The analytic baseline is {}'.format(analytic_baseline))
        print('This current performance is {}'.format(current_performance))
        analytic_baseline[preferred_performance_metric] = remove_outlier(
            analytic_baseline[preferred_performance_metric])

        # Get a list of MRs to stress in the form of a list of MRs
        mr_to_consider = apply_filtering_policy(redis_db, mr_working_set,
                                                experiment_count, sys_config,
                                                workload_config,
                                                filter_config)

        for mr in mr_to_consider:
            print('\n' * 2)
            print('*' * 20)
            print('Current MR is {}'.format(mr.to_string()))
            increment_to_performance = {}
            current_mr_allocation = resource_datastore.read_mr_alloc(
                redis_db, mr)
            print('Current MR allocation is {}'.format(current_mr_allocation))

            for stress_weight in stress_weights:
                # Calculate Gradient Schedule and provision resources accordingly
                mr_gradient_schedule = calculate_mr_gradient_schedule(
                    redis_db, [mr], sys_config, stress_weight)
                for change_mr in mr_gradient_schedule:
                    resource_modifier.set_mr_provision(
                        change_mr, mr_gradient_schedule[change_mr])

                experiment_results = measure_runtime(workload_config,
                                                     experiment_trials)

                # Write results of experiment to Redis
                # preferred_results = remove_outlier(experiment_results[preferred_performance_metric])
                preferred_results = experiment_results[
                    preferred_performance_metric]
                mean_result = mean_list(preferred_results)
                tbot_datastore.write_redis_ranking(
                    redis_db, experiment_count,
                    preferred_performance_metric, mean_result, mr,
                    stress_weight)

                # Revert the Gradient schedule and provision resources accordingly
                mr_revert_gradient_schedule = revert_mr_gradient_schedule(
                    redis_db, [mr], sys_config, stress_weight)
                for change_mr in mr_revert_gradient_schedule:
                    resource_modifier.set_mr_provision(
                        change_mr, mr_revert_gradient_schedule[change_mr])

                increment_to_performance[stress_weight] = experiment_results

            # Write the results of the iteration to Redis
            tbot_datastore.write_redis_results(redis_db, mr,
                                               increment_to_performance,
                                               experiment_count,
                                               preferred_performance_metric)
            print('*' * 20)
            print('\n' * 2)

        # Timing Information for the purpose of experiments
        current_time_stop = datetime.datetime.now()
        time_delta = current_time_stop - time_start
        cumulative_mr_count += len(mr_to_consider)
        chart_generator.get_summary_mimr_charts(
            redis_db, workload_config, current_performance, mr_working_set,
            experiment_count, stress_weights, preferred_performance_metric,
            time_start)

        # Move back into the normal operating basis by removing the baseline prep stresses
        reverted_analytic_provisions = revert_analytic_baseline(
            redis_db, sys_config)
        for mr in reverted_analytic_provisions:
            resource_modifier.set_mr_provision(
                mr, reverted_analytic_provisions[mr])

        # Recover the results of the experiment from Redis
        max_stress_weight = min(stress_weights)
        mimr_list = tbot_datastore.get_top_n_mimr(
            redis_db, experiment_count, preferred_performance_metric,
            max_stress_weight, gradient_mode,
            optimize_for_lowest=optimize_for_lowest,
            num_results_returned=-1)

        imr_list, nimr_list = seperate_mr(
            mimr_list,
            mean_list(analytic_baseline[preferred_performance_metric]),
            optimize_for_lowest)
        if len(imr_list) == 0:
            print('INFO: IMR list length is 0. Please choose a metric with more signal. Exiting...')
            break
        print('INFO: IMR list is {}'.format(
            [mr.to_string() for mr in imr_list]))
        print('INFO: NIMR list is {}'.format(
            [mr.to_string() for mr in nimr_list]))

        # Try all the MIMRs in the list until a viable improvement is determined
        # Improvement Amount
        mimr = None
        action_taken = {}

        for imr in imr_list:
            imr_improvement_percent = improve_mr_by(redis_db, imr,
                                                    max_stress_weight)
            current_imr_alloc = resource_datastore.read_mr_alloc(redis_db,
                                                                 imr)
            new_imr_alloc = convert_percent_to_raw(imr, current_imr_alloc,
                                                   imr_improvement_percent)
            imr_improvement_proposal = new_imr_alloc - current_imr_alloc

            # If the the Proposed MR cannot be improved by the proposed amount, there are two options
            # - Max out the resources to fill up the remaining resources on the machine
            # - Resource Stealing from NIMRs
            # Both functions will return VIABLE improvements to the IMR deployment
            nimr_diff_proposal = {}
            if check_improve_mr_viability(redis_db, imr,
                                          imr_improvement_proposal) is False:
                print('INFO: MR {} to increase {} by {} is not viable'.format(
                    imr.to_string(), current_imr_alloc,
                    imr_improvement_proposal))
                print('INFO: Attempting to max out the machines resources...')
                imr_improvement_proposal = fill_out_resource(redis_db, imr)

                if imr_improvement_proposal <= 0:
                    print('INFO: No more space to fill out resources. Stealing from NIMRs')
                    # Calculate a plan to reduce the resource provisioning of NIMRs
                    nimr_diff_proposal, imr_improvement_proposal = create_decrease_nimr_schedule(
                        redis_db, imr, nimr_list, max_stress_weight)
                    print('INFO: Proposed NIMR {}'.format(nimr_diff_proposal))
                    print('INFO: New IMR improvement {}'.format(
                        imr_improvement_proposal))

                    if len(nimr_diff_proposal) == 0 or imr_improvement_proposal == 0:
                        # Nothing can be taken from NIMRs; try the next IMR.
                        action_taken[imr] = 0
                        continue

            # Decrease the amount of resources provisioned to the NIMR
            for nimr in nimr_diff_proposal:
                action_taken[nimr] = nimr_diff_proposal[nimr]
                new_nimr_alloc = resource_datastore.read_mr_alloc(
                    redis_db, nimr) + nimr_diff_proposal[nimr]
                print('NIMR stealing: imposing a change of {} on {}'.format(
                    action_taken[nimr], nimr.to_string()))
                finalize_mr_provision(redis_db, nimr, new_nimr_alloc)

            # Improving the resource should always be viable at this step
            if check_improve_mr_viability(redis_db, imr,
                                          imr_improvement_proposal):
                new_imr_alloc = imr_improvement_proposal + current_imr_alloc
                action_taken[imr] = imr_improvement_proposal
                finalize_mr_provision(redis_db, imr, new_imr_alloc)
                # Report the IMR being improved; the original formatted the
                # stale loop variable ``mr`` from an earlier loop here.
                print('Improvement Calculated: MR {} increase from {} to {}'.format(
                    imr.to_string(), current_imr_alloc, new_imr_alloc))
                mimr = imr
                break
            else:
                action_taken[imr] = 0
                # Same fix as above: use this IMR and its own allocation
                # rather than the leftover ``mr`` / ``current_mr_allocation``.
                print('Improvement Calculated: MR {} failed to improve from {}'.format(
                    imr.to_string(), current_imr_alloc))
                print('This IMR cannot be improved. Printing some debugging before exiting...')

                print('Current MR allocation is {}'.format(current_imr_alloc))
                print('Proposed (failed) allocation is {}, improved by {}'.format(
                    new_imr_alloc, imr_improvement_proposal))

                for deployment in imr.instances:
                    vm_ip, container = deployment
                    capacity = resource_datastore.read_machine_capacity(
                        redis_db, vm_ip)
                    consumption = resource_datastore.read_machine_consumption(
                        redis_db, vm_ip)
                    print('Machine {} Capacity is {}, and consumption is currently {}'.format(
                        vm_ip, capacity, consumption))

        if mimr is None:
            print('No viable improvement found')
            break

        #Compare against the baseline at the beginning of the program
        improved_performance = measure_runtime(workload_config,
                                               baseline_trials)
        # improved_performance[preferred_performance_metric] = remove_outlier(improved_performance[preferred_performance_metric])
        improved_mean = mean_list(
            improved_performance[preferred_performance_metric])
        previous_mean = mean_list(
            current_performance[preferred_performance_metric])
        performance_improvement = improved_mean - previous_mean

        # Write a summary of the experiment's iterations to Redis
        tbot_datastore.write_summary_redis(
            redis_db, experiment_count, mimr, performance_improvement,
            action_taken, analytic_mean, improved_mean,
            # See the note on total_seconds() at the iteration-0 summary.
            int(time_delta.total_seconds()), cumulative_mr_count)
        current_performance = improved_performance

        # Generating overall performance improvement
        chart_generator.get_summary_performance_charts(redis_db,
                                                       workload_config,
                                                       experiment_count,
                                                       time_start)

        results = tbot_datastore.read_summary_redis(redis_db,
                                                    experiment_count)
        print('Results from iteration {} are {}'.format(
            experiment_count, results))

        # Checkpoint MR configurations and print
        current_mr_config = resource_datastore.read_all_mr_alloc(redis_db)
        print_csv_configuration(current_mr_config)

        experiment_count += 1

    print('{} experiments completed'.format(experiment_count))
    print_all_steps(redis_db, experiment_count)

    current_mr_config = resource_datastore.read_all_mr_alloc(redis_db)
    for mr in current_mr_config:
        print('{} = {}'.format(mr.to_string(), current_mr_config[mr]))

    print_csv_configuration(current_mr_config)
def finalize_mr_provision(redis_db, mr, new_alloc):
    """Apply ``new_alloc`` to ``mr`` and record the change in Redis.

    The provision is enacted through ``resource_modifier``, the stored
    allocation is replaced through ``resource_datastore``, and
    ``update_machine_consumption`` is called with both the new and the
    previously stored allocation.

    Args:
        redis_db: Redis handle forwarded to the datastore helpers.
        mr: The MR whose provision is being changed.
        new_alloc: The allocation to enact and store.
    """
    resource_modifier.set_mr_provision(mr, new_alloc)

    # The stored value must be read before it is overwritten so the
    # consumption update receives the previous allocation.
    previous_alloc = resource_datastore.read_mr_alloc(redis_db, mr)
    resource_datastore.write_mr_alloc(redis_db, mr, new_alloc)

    update_machine_consumption(redis_db, mr, new_alloc, previous_alloc)
def find_colocated_nimrs(redis_db, imr, mr_working_set, baseline_mean,
                         sys_config, workload_config):
    """Return MRs colocated with ``imr`` whose stressing does not hurt the metric.

    Candidates are the MRs of ``mr_working_set`` whose service runs on a
    machine hosting an instance of ``imr`` and whose ``resource`` equals
    ``imr.resource``. Each candidate is stressed at the smallest configured
    stress weight, the workload is measured, and the stress is reverted. A
    candidate is kept unless the mean metric moved by more than 3% of
    ``baseline_mean`` in the unfavourable direction.

    Args:
        redis_db: Redis handle forwarded to the schedule helpers.
        imr: The MR whose colocated neighbours are examined.
        mr_working_set: Iterable of MR objects to choose candidates from.
        baseline_mean: Reference mean of the performance metric.
        sys_config: Reads ``'trials'`` and ``'stress_weights'``.
        workload_config: Reads ``'tbot_metric'`` and ``'optimize_for_lowest'``.

    Returns:
        list: The candidate MRs that were kept.
    """
    print('Finding colocated NIMRs')

    trials = sys_config['trials']
    stress_weight = min(sys_config['stress_weights'])
    metric_name = workload_config['tbot_metric']
    optimize_for_lowest = workload_config['optimize_for_lowest']

    vm_to_service = get_vm_to_service(get_actual_vms())

    # Identify an unique list of relevant MRs colocated with IMR instances
    colocated_services = []
    for vm_ip, container in imr.instances:
        colocated_services = colocated_services + vm_to_service[vm_ip]
    print('Colocated services are {}'.format(colocated_services))

    candidate_mrs = [
        candidate for candidate in mr_working_set
        if candidate.service_name in colocated_services
        and candidate.resource == imr.resource
    ]
    print('Candidate MRs are {}'.format(
        [candidate.to_string() for candidate in candidate_mrs]))

    nimr_list = []
    for candidate in candidate_mrs:
        print('MR being considered is {}'.format(candidate.to_string()))

        # Stress the candidate on its own.
        stress_schedule = calculate_mr_gradient_schedule(
            redis_db, [candidate], sys_config, stress_weight)
        for target in stress_schedule:
            resource_modifier.set_mr_provision(target,
                                               stress_schedule[target],
                                               workload_config)

        measurements = measure_runtime(workload_config, trials)
        observed_mean = mean_list(measurements[metric_name])
        perf_diff = observed_mean - baseline_mean
        tolerance = 0.03 * baseline_mean

        if optimize_for_lowest and perf_diff > tolerance:
            print('Do nothing for optimize lowest')
        elif optimize_for_lowest is False and perf_diff < -tolerance:
            print('Do nothing for optimize lowest')
        else:
            nimr_list.append(candidate)

        # Revert the Gradient schedule and provision resources accordingly
        restore_schedule = revert_mr_gradient_schedule(
            redis_db, [candidate], sys_config, stress_weight)
        for target in restore_schedule:
            resource_modifier.set_mr_provision(target,
                                               restore_schedule[target],
                                               workload_config)

    return nimr_list
def apply_pipeline_filter(redis_db, mr_working_set, experiment_iteration,
                          system_config, workload_config, filter_config):
    """Stress groups ("pipelines") of MRs together and rank them by impact.

    The working set is partitioned into pipelines, either one per service
    (``'BY_SERVICE'``) or via ``gen_mr_random_split`` (``'RANDOM'``). Every
    pipeline is stressed as a whole, the workload is measured, the mean of
    the ``tbot_metric`` samples is stored under the pipeline's index, and the
    stress is reverted.

    Args:
        redis_db: Redis handle forwarded to the datastore/schedule helpers.
        mr_working_set: Iterable of MR objects (``service_name``,
            ``to_string()`` are used here).
        experiment_iteration: Iteration identifier used as the results key.
        system_config: Forwarded to the schedule and ranking helpers.
        workload_config: Reads ``'tbot_metric'`` and ``'optimize_for_lowest'``.
        filter_config: Reads ``'pipeline_partitions'``, ``'stress_amount'``,
            ``'filter_exp_trials'`` and ``'pipeline_services'``.

    Returns:
        tuple: ``(all_pipeline_score_list, pipeline_groups)`` -- the result of
        ``tbot_datastore.get_top_n_filtered_results`` and the list of
        pipelines; scores are keyed by the pipeline's index in
        ``pipeline_groups``.
    """
    logging.info('*' * 20)
    logging.info('Applying Filtering Pipeline')
    logging.info('Filter config is {}'.format(filter_config))
    logging.info('MR working set is {}'.format(mr_working_set))

    pipeline_partitions = filter_config['pipeline_partitions']
    stress_weight = filter_config['stress_amount']
    experiment_trials = filter_config['filter_exp_trials']
    pipelined_services = filter_config['pipeline_services']
    pipeline_groups = []

    logging.info('Pipelined services are {}'.format(pipelined_services))

    # No specified pipelined services indicates that each pipeline is a service
    pipeline_mode = pipelined_services[0][0]
    if pipeline_mode == 'BY_SERVICE':
        # Sorted so that a pipeline index maps to the same service on every
        # run instead of depending on set iteration order.
        service_names = sorted(set(mr.service_name for mr in mr_working_set))
        for service_name in service_names:
            # The original passed an undefined name (``services``) to
            # search_mr_working_set here, which raised NameError. The
            # per-service grouping is done inline instead.
            pipeline_groups.append(
                [mr for mr in mr_working_set
                 if mr.service_name == service_name])
    elif pipeline_mode == 'RANDOM':
        pipeline_groups = gen_mr_random_split(mr_working_set,
                                              pipeline_partitions)

    logging.info("The pipeline groups are being printed below: ")
    for pipeline_group in pipeline_groups:
        group_names = [mr.to_string() for mr in pipeline_group]
        logging.info('A pipeline is {}'.format(group_names))

    tbot_metric = workload_config['tbot_metric']
    optimize_for_lowest = workload_config['optimize_for_lowest']

    for pipeline_index, pipeline in enumerate(pipeline_groups):
        # Simultaneously stress the MRs in a pipeline
        stress_schedule = calculate_mr_gradient_schedule(
            redis_db, pipeline, system_config, stress_weight)
        for mr in stress_schedule:
            resource_modifier.set_mr_provision(mr, stress_schedule[mr],
                                               workload_config)

        experiment_results = measure_runtime(workload_config,
                                             experiment_trials)
        exp_mean = mean_list(experiment_results[tbot_metric])
        # Results are keyed by the pipeline's position in pipeline_groups.
        tbot_datastore.write_filtered_results(redis_db, 'pipeline',
                                              experiment_iteration,
                                              str(pipeline_index), exp_mean)

        # Revert the stressing
        revert_schedule = revert_mr_gradient_schedule(
            redis_db, pipeline, system_config, stress_weight)
        for mr in revert_schedule:
            resource_modifier.set_mr_provision(mr, revert_schedule[mr],
                                               workload_config)

    all_pipeline_score_list = tbot_datastore.get_top_n_filtered_results(
        redis_db, 'pipeline', experiment_iteration, system_config,
        optimize_for_lowest=optimize_for_lowest)

    return all_pipeline_score_list, pipeline_groups