def execute_query(aws_key_id, aws_key, args, query, branch, is_first_branch):
    """
    Executes the specified query using the branch that is currently checked out
    (whose name is `branch`). Copies the event log and continuous monitors file
    to `args.output_dir`. `is_first_branch` should be set to True if this is the
    first branch to be tested.
    """
    print_heading("Executing {} {} of query {} using branch '{}'".format(
        args.num_trials, "trial" if (args.num_trials == 1) else "trials",
        query, branch))

    # Restart Spark and the Thrift server in order to make sure that we are
    # using the correct version, and to start using new log files.
    stop_thriftserver()
    stop_spark()
    start_spark()

    benchmark_runner_dir = path.join(args.benchmark_dir, "runner")
    driver_addr = subprocess.check_output(
        "curl -s http://169.254.169.254/latest/meta-data/public-hostname", shell=True)

    print "Creating benchmark tables and starting the Thrift server"
    prepare_benchmark_script = path.join(benchmark_runner_dir, "prepare-benchmark.sh")
    prepare_benchmark_command = "{} \
        --spark \
        --aws-key-id={} \
        --aws-key={} \
        --spark-host={} \
        --spark-identity-file={} \
        --scale-factor={} \
        --file-format={} \
        --skip-s3-import".format(prepare_benchmark_script,
                                 aws_key_id,
                                 aws_key,
                                 driver_addr,
                                 args.identity_file,
                                 args.scale_factor,
                                 args.file_format)
    if args.parquet:
        prepare_benchmark_command += " --parquet"
    if not is_first_branch or args.skip_parquet_conversion:
        prepare_benchmark_command += " --skip-parquet-conversion"
    execute_shell_command(prepare_benchmark_command)

    if args.memory:
        cache_table_for_query(query)

    print "Executing query"
    run_query_script = path.join(benchmark_runner_dir, "run-query.sh")
    run_query_command = "{} \
        --spark \
        --spark-host={} \
        --spark-identity-file={} \
        --query-num={} \
        --num-trials={} \
        --clear-buffer-cache".format(run_query_script,
                                     driver_addr,
                                     args.identity_file,
                                     query,
                                     args.num_trials)
    if args.compress_output:
        run_query_command += " --compress"
    if args.memory:
        run_query_command += " --spark-cache-output-tables"
    execute_shell_command(run_query_command)

    # Stop the Thrift server and Spark in order to stop using the Spark log files.
    stop_thriftserver()
    stop_spark()

    print "Retrieving logs"
    parameters = [query, branch]
    log_dir = utils.copy_all_logs(parameters, utils.get_workers())
    utils.copy_all_traces(log_dir, driver_addr, utils.get_workers())
    log_files = path.join(log_dir, "*")

    # Move the logs into a new directory: output_dir/query/branch/
    output_dir = path.join(args.output_dir, query, branch)
    execute_shell_command("mkdir -pv {}".format(output_dir))
    execute_shell_command("mv -v {} {}".format(log_files, output_dir))
    execute_shell_command("rm -rf {}".format(log_dir))
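
# Usage sketch (not part of the original script): one way a driver loop could
# call execute_query once per (branch, query) pair. The branch list, query
# list, checkout_branch helper, and args.spark_dir attribute are assumptions
# for illustration only.
import subprocess


def checkout_branch(spark_dir, branch):
    # Assumes the Spark checkout lives in spark_dir and that `branch` exists.
    subprocess.check_call(
        "cd {} && git checkout {}".format(spark_dir, branch), shell=True)


def run_all_queries(aws_key_id, aws_key, args, queries, branches):
    for i, branch in enumerate(branches):
        checkout_branch(args.spark_dir, branch)
        for query in queries:
            execute_query(aws_key_id, aws_key, args, query, branch,
                          is_first_branch=(i == 0))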
def solver(data):
    # Initialise the solver
    solver = pywrapcp.Solver("allocations")

    tasks = utils.get_tasks(data['scheduledTasks'])
    workers = utils.get_workers(data['workers'])
    cost_matrix = data['costMatrix']
    solver_option = data['solverOption']
    time_limit = data['timeLimit']
    extra_constraints = data.get('constraints', {})
    print('solver_option', solver_option)

    num_tasks = len(tasks)
    num_workers = len(workers)

    # Declare decision variables and a reference matrix
    assignment_costs = []
    assignments = []
    assignments_ref = []
    for worker in workers:
        worker_assignments = []
        worker_assignments_ref = []
        worker_assignment_costs = []
        for task in tasks:
            worker_assignments.append(
                solver.IntVar(0, 1, f'worker: {worker.id}, task: {task.id}'))
            worker_assignments_ref.append(Worker_task(worker, task))
            worker_assignment_costs.append(cost_matrix[str(worker.id)][task.id])
        assignments.append(worker_assignments)
        assignments_ref.append(worker_assignments_ref)
        assignment_costs.append(worker_assignment_costs)

    constraints = Constraints(
        tasks,
        workers,
        assignment_costs,
        assignments,
        assignments_ref,
    )

    # Objective: only added if optimisation is requested
    if solver_option != 'noOptimisation':
        total_cost = solver.IntVar(0, 3000, "total_cost")
        solver.Add(total_cost == solver.Sum([
            assignment_costs[i][j] * assignments[i][j]
            for i in range(num_workers) for j in range(num_tasks)
        ]))
        objective = solver.Minimize(total_cost, 5)

    # Constraints
    # Each task is assigned its given qty
    constraints.add_task_qty_constraint(solver)

    # A worker cannot work on two tasks that are on at the same time
    constraints.add_same_worker_same_task_time(solver)

    # A worker can be assigned to the same orderTask date at most once
    # (i.e. cannot take up multiple qty)

    # Add any must-work / cannot-work constraints
    must_map = extra_constraints.get('mustWork')
    cannot_map = extra_constraints.get('cannotWork')
    constraints.must_cannot_work(solver, must_map, cannot_map)

    # Add combined must-work constraints
    if 'combinedMustWork' in extra_constraints:
        constraints.combined_must_work_all(
            solver, extra_constraints['combinedMustWork'])

    # Add "at least has to work" constraints
    if 'atLeastWork' in extra_constraints:
        constraints.add_at_least_work_task(solver,
                                           extra_constraints['atLeastWork'])

    # Add total time fatigue constraints
    if 'timeFatigueTotal' in extra_constraints:
        constraints.add_time_fatigue_total(
            solver, extra_constraints['timeFatigueTotal'])

    # Add total overall time fatigue constraints
    if 'overallTimeFatigueTotal' in extra_constraints:
        constraints.add_overall_total_fatigue_time(
            solver, extra_constraints['overallTimeFatigueTotal'])

    # Add consecutive fatigue constraints
    if 'overallTimeFatigueConsecutive' in extra_constraints:
        constraints.add_overall_consecutive_total_fatigue_time(
            solver, extra_constraints['overallTimeFatigueConsecutive'])

    # Add unavailable time constraints
    if 'unavailable' in extra_constraints:
        constraints.add_unavailability(solver,
                                       extra_constraints['unavailable'])

    # Add buddy constraints
    if 'buddy' in extra_constraints:
        constraints.add_buddy(solver, extra_constraints['buddy'])

    # Add nemesis constraints
    if 'nemesis' in extra_constraints:
        constraints.add_nemesis(solver, extra_constraints['nemesis'])

    # Workers must be assigned to at least n tasks (this could change later per worker)
    # [solver.Add(solver.Sum(assignments[i][j] for j in range(num_tasks)) >= 3)
    #  for i in range(num_workers)]

    # Create the decision builder.
    # Sort the decision variables by least cost to the solution.
    if solver_option != 'noOptimisation':
        assignment_ref_copy = copy.deepcopy(assignments_ref)
        assignment_ref_copy_flat = [
            assignment_ref_copy[i][j]
            for i in range(num_workers) for j in range(num_tasks)
        ]
        # Sort by least cost
        assignment_ref_copy_flat.sort(key=lambda wrk_tsk: cost_matrix[str(
            wrk_tsk.worker.id)][wrk_tsk.task.id])
        # Map back to the assignment decision variables
        assignments_flat = [
            assignments[ref.worker.index][ref.task.index]
            for ref in assignment_ref_copy_flat
        ]
    else:
        assignments_flat = [
            assignments[i][j]
            for i in range(num_workers) for j in range(num_tasks)
        ]

    db = solver.Phase(assignments_flat, solver.CHOOSE_FIRST_UNBOUND,
                      solver.ASSIGN_MAX_VALUE)

    # Create a solution collector depending on the solver option requested
    if (solver_option == 'optimise' and time_limit is not None) or solver_option == 'optimal':
        # False finds the minimum as the best solution
        collector = solver.BestValueSolutionCollector(False)
    else:
        collector = solver.FirstSolutionCollector()

    # Add the decision vars to the collector
    collector.Add(assignments_flat)

    monitor = pywrapcp.SearchMonitor(solver)
    monitor.RestartSearch()

    # Set a time limit if one was given
    if solver_option == 'optimise' and time_limit is not None:
        print('time_limit', time_limit)
        solver_time_limit = solver.TimeLimit(time_limit * 60 * 1000)

    # Solve appropriately
    if solver_option == 'optimal':
        collector.AddObjective(total_cost)
        status = solver.Solve(db, [objective, collector, monitor])
    elif solver_option == 'optimise' and time_limit is not None:
        collector.AddObjective(total_cost)
        status = solver.Solve(
            db, [objective, collector, solver_time_limit, monitor])
    else:
        status = solver.Solve(db, [collector])

    print("Time:", solver.WallTime(), "ms")
    print('status', status)

    # If a solution was found, collect all assignments
    if status:
        solution_by_task = {}
        solution_by_worker = {}
        for i in range(num_workers):
            for j in range(num_tasks):
                if collector.Value(0, assignments[i][j]) == 1:
                    worker_task = assignments_ref[i][j]
                    # Group the solution by worker and by task
                    if worker_task.task.id in solution_by_task:
                        solution_by_task[worker_task.task.id] = [
                            *solution_by_task[worker_task.task.id],
                            worker_task.worker.id
                        ]
                    else:
                        solution_by_task[worker_task.task.id] = [
                            worker_task.worker.id
                        ]
                    if worker_task.worker.id in solution_by_worker:
                        solution_by_worker[worker_task.worker.id] = [
                            *solution_by_worker[worker_task.worker.id],
                            worker_task.task.id
                        ]
                    else:
                        solution_by_worker[worker_task.worker.id] = [
                            worker_task.task.id
                        ]

        if solver_option == 'optimal' or (solver_option == 'optimise'
                                          and time_limit is not None):
            objective_value = collector.ObjectiveValue(0)
        else:
            objective_value = get_non_optimised_cost(cost_matrix,
                                                     solution_by_task)

        return {
            "status": status,
            "solutionByTask": solution_by_task,
            "solutionByWorker": solution_by_worker,
            "objectiveValue": objective_value
        }

    return {
        "status": status,
        "solutionByTask": None,
        "solutionByWorker": None,
        "objectiveValue": None
    }
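
# Illustrative request payload for solver(data), inferred from the keys the
# function reads above; the ids, costs, and constraint bodies below are
# placeholders, not real data.
example_data = {
    "scheduledTasks": [],     # parsed by utils.get_tasks(...)
    "workers": [],            # parsed by utils.get_workers(...)
    # costMatrix is keyed by str(worker.id), then by task.id
    "costMatrix": {"1": {"t1": 10, "t2": 25}},
    # 'noOptimisation', 'optimise' (honours timeLimit, in minutes), or 'optimal'
    "solverOption": "optimise",
    "timeLimit": 5,
    "constraints": {
        "mustWork": {},       # passed to constraints.must_cannot_work(...)
        "cannotWork": {},
        # other optional keys: combinedMustWork, atLeastWork, timeFatigueTotal,
        # overallTimeFatigueTotal, overallTimeFatigueConsecutive, unavailable,
        # buddy, nemesis
    },
}
# result = solver(example_data)
# result["solutionByTask"] maps task.id -> [worker.id, ...];
# result["solutionByWorker"] maps worker.id -> [task.id, ...]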
""" This script runs a matrix workload that solves a least squares problem using a series of matrix multiplications. """ import subprocess import utils CORES_PER_WORKER = 8 # Figure out the public hostname of the machine. get_hostname_command = "curl -s http://169.254.169.254/latest/meta-data/public-hostname" MASTER_HOSTNAME = subprocess.check_output(get_hostname_command, shell=True) print "Running job with master", MASTER_HOSTNAME workers = utils.get_workers() total_cores = len(workers) * CORES_PER_WORKER # Compute the parameters for the experiment. # Increasing the number of rows increases the CPU time. total_rows_values = [1024 * 1024, 2 * 1024 * 1024] # This is basically the number of tasks; increase this for more tasks. num_row_blocks_values = [total_cores, total_cores * 2] # Reducing this will reduce computation by reduction^2 cols_per_block = 4096 # This is the number of times the shuffle stage will happen. # To use less memory, reduce this number, and instead increase # num_repeats (there's one RDD stored in memory for each shuffle # block). num_col_blocks = 5
"""
This script runs jobs that process the same amount of data, but use different
numbers of tasks to do so. Each job reads input data from memory, sorts the
data (saving intermediate shuffle data in memory), and stores the output in
memory.
"""
import os
import subprocess
import time

import utils

MEGABYTES_PER_GIGABYTE = 1024

slaves = utils.get_workers()
print "Running experiment assuming slaves %s" % slaves
num_machines = len(slaves)

values_per_key = 8
num_shuffles = 5
base_num_tasks = num_machines * 8
num_tasks_multipliers = [8, 4]
target_total_data_gb = num_machines * 0.5

for num_tasks_multiplier in num_tasks_multipliers:
    num_tasks = base_num_tasks * num_tasks_multiplier
    total_num_items = target_total_data_gb / (4.9 + values_per_key * 1.92) * (
        64 * 4000000)
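
# Worked example (assumption: a 4-machine cluster), evaluating the sizing
# arithmetic above; the constants come straight from the formula.
#   target_total_data_gb = 4 * 0.5 = 2.0 GB
#   total_num_items      = 2.0 / (4.9 + 8 * 1.92) * (64 * 4000000) ~= 25.3 million
#   base_num_tasks       = 4 * 8 = 32
#   num_tasks            = 32 * 8 = 256, then 32 * 4 = 128
# so the same ~25.3 million items are split across 256 tasks (~98,700 items
# per task) in the first run and 128 tasks (~197,400 per task) in the second.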