def parse_digests(job_output):
    """ Vote on the digest of every output file across clouds.

    :return: (True, [(True, digest), ...]) when every output file has a
        majority digest, or (False, None) otherwise.
    """
    maj = majority(medusa_settings.faults)

    # merge the digest dicts of each job into one dict per cloud
    nset_digests = []
    for job in job_output:
        ndigests = {}
        for digest in job.digests:
            ndigests.update(digest)
        nset_digests.append(ndigests)

    # the output paths (keys) come from the first job's digests
    keys = []
    for _d in job_output[0].digests:
        keys += _d.keys()

    digests_matrix = []
    for key in keys:
        temp_val = [dset[key] for dset in nset_digests]
        value, count = Counter(temp_val).most_common(1)[0]
        if count >= maj:
            digests_matrix.append((True, value))
        else:
            return False, None
    return True, digests_matrix
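# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: it reproduces the
# per-key majority vote performed by parse_digests() with hand-made digest
# dicts instead of real job output, and with a stand-in majority() that is
# assumed here to be faults + 1. Every name below is hypothetical.
# ---------------------------------------------------------------------------
def _sketch_per_key_majority():
    from collections import Counter

    def majority(faults):
        # assumption: f + 1 equal digests are needed to mask f faults
        return faults + 1

    # one digest dict per cloud that ran the same job
    replicas = [
        {'/out/part-r-00000': 'e3b0c442', '/out/part-r-00001': '9afbf4c8'},
        {'/out/part-r-00000': 'e3b0c442', '/out/part-r-00001': '9afbf4c8'},
        {'/out/part-r-00000': 'deadbeef', '/out/part-r-00001': '9afbf4c8'},
    ]
    maj = majority(1)  # tolerate one faulty cloud
    results = {}
    for key in replicas[0]:
        value, count = Counter(r[key] for r in replicas).most_common(1)[0]
        results[key] = (count >= maj, value)
    return results  # {'/out/part-r-00000': (True, 'e3b0c442'), ...}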
def needs_more_copies(subset, pinput, faults):
    """ Check whether fewer than a majority of clusters contain all the inputs. """
    # number of input paths that each cluster already holds
    entries = collections.Counter(
        {my_obj.cluster: len(my_obj.paths) for my_obj in subset})
    # a cluster is complete when it holds every input path
    count = sum(1 for _, npaths in entries.items() if npaths >= len(pinput))
    return count < majority(faults)
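# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: a self-contained
# version of the completeness check in needs_more_copies(), with a
# hypothetical namedtuple standing in for the real replica objects and
# majority(faults) assumed to be faults + 1.
# ---------------------------------------------------------------------------
def _sketch_needs_more_copies():
    import collections

    Replica = collections.namedtuple('Replica', ['cluster', 'paths'])
    pinput = ['/in/a', '/in/b']                 # the job needs both inputs
    subset = [Replica('cluster-1', ['/in/a', '/in/b']),
              Replica('cluster-2', ['/in/a'])]

    entries = collections.Counter(
        {r.cluster: len(r.paths) for r in subset})
    complete = sum(1 for _, npaths in entries.items() if npaths >= len(pinput))
    # with faults=1 and an assumed majority of 2, a single complete cluster
    # is not enough, so more copies are needed
    return complete < 2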
def pick_up_clusters(step, force=False):
    """ Get the set of running clusters that will be used to run the job.

    :param step: (int) step of execution
    :param force: (boolean) behave as in the first step even when step != 0
    :return: list of clusters
    """
    if step == 0 or force:
        # first time picking a set of clusters: take only a majority of them
        n = majority(medusa_settings.faults)
        clusters = simplecache.SimpleCache.get_pick_up_clusters()[0:n]
    else:
        clusters = simplecache.SimpleCache.get_pick_up_clusters()
    return clusters
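# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: the selection rule of
# pick_up_clusters() with a hard-coded cluster list standing in for
# simplecache.SimpleCache and majority(faults) assumed to be faults + 1.
# ---------------------------------------------------------------------------
def _sketch_pick_up_clusters(step, faults=1, force=False):
    clusters = ['cluster-1', 'cluster-2', 'cluster-3']  # stand-in cache
    if step == 0 or force:
        return clusters[0:faults + 1]   # first step: only a majority of them
    return clusters                     # later steps: every running cluster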
def vote(digests, aggregation, faults=0):
    """ From a matrix of digests, pick the value produced by a majority of clouds.

    E.g. {gid: [[list digests 1], [list digests 2]]}
    """
    if faults == 0:
        # keep the return arity consistent with the other branches
        return None, False

    value = Counter(tuple(item) for item in digests).most_common(1)

    if medusa_settings.digests_fault and not aggregation and medusa_settings.faults_left > 0:
        decrease_faults()
        return None, False

    nr_produced_equal_digests = value[0][1]
    if nr_produced_equal_digests < majority(faults):
        return None, False

    digests_selected = value[0][0]
    return digests_selected, True
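# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: the row-level vote in
# vote(), i.e. counting identical digest lists (as tuples) and accepting the
# winner only when it reaches a majority, assumed here to be faults + 1.
# E.g. _sketch_vote([['a', 'b'], ['a', 'b'], ['a', 'c']], 1) -> (('a', 'b'), True)
# ---------------------------------------------------------------------------
def _sketch_vote(rows, faults):
    from collections import Counter
    winner, count = Counter(tuple(row) for row in rows).most_common(1)[0]
    if count < faults + 1:              # no majority: reject the result
        return None, False
    return winner, True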
def run_execution_threads(faults, jobs, aggregation, reference_digests):
    """ Execute jobs in parallel, one thread per job.

    :param faults: (int) Number of faults to tolerate
    :param jobs: (list) list of Job structures
    :param aggregation: (boolean) is it the aggregation phase or not
    :param reference_digests: (RefDigests) digests of reference
    :return: matrix with the digests selected by the verification step
    """
    group_jobs = []
    if not jobs:
        return group_jobs

    logging.info(" Running scheduling: %s" % medusa_settings.ranking_scheduler)

    job_args = []

    # Set up the threads that pick the clusters and copy the data
    output = mp.Queue()
    processes = [
        Thread(target=_copy_and_aggregate,
               args=(job, reference_digests, aggregation, output))
        for job in jobs
    ]

    # Run and wait for completion
    [p.start() for p in processes]
    [p.join() for p in processes]

    # Get process results from the output queue
    list_clusters = [output.get() for _ in processes]

    for clusters_to_launch_job, job in zip(list_clusters, jobs):
        logging.debug("Clusters included %s" % clusters_to_launch_job)
        job_args.append(
            ExecutionJob(job.id, clusters_to_launch_job, job.command,
                         job.output_path + '/part*', majority(faults)))

    # if medusa_settings.relaunch_job_other_cluster and not aggregation:
    #     logging.warn("Please shut one cluster down... Execution will resume in 10 secs.")
    #     time.sleep(10)

    logging.info("Running %s jobs..." % (len(job_args)))
    seffective_job_runtime = time.time()

    processes = []
    for execution_parameters in job_args:
        # Each thread executes a job in the respective clusters
        processes.append(
            Thread(target=run_job, args=(execution_parameters, output,)))

    # Run processes
    [p.start() for p in processes]
    [p.join() for p in processes]

    logging.info("Run_job took %s" % str(time.time() - seffective_job_runtime))

    job_output_list = []
    _output_list = [output.get() for _ in processes]
    _job_output = [_output for _output in _output_list[0]]
    for _output in _job_output:
        job_output_list.append(parse_data(_output))

    digests_matrix = []
    while True:
        successful, digests = run_verification(job_output_list, aggregation)
        if not successful:
            if medusa_settings.relaunch_job_same_cluster:
                # relaunch job in the same cloud
                path_to_remove = os.path.dirname(execution_parameters.output_path)
                _relaunch_job_same_cluster(execution_parameters, path_to_remove)
            else:
                logging.debug("Re-launching job %s" % execution_parameters.command)
                save_reexecute_another_cloud(True)
                execution_parameters = _relaunch_job_other_cluster(
                    execution_parameters, jobs, reference_digests, aggregation)
                _job_output = run_job(execution_parameters)
                for _output in _job_output:
                    job_output_list.append(parse_data(_output))
        else:
            digests_matrix.append(digests)
            break

    # save progress of the job
    filename = settings.get_temp_dir() + "/job_progress_log.json"
    step = 2 if not aggregation else 5
    update_json_file(filename, step)

    eeffective_job_runtime = time.time()
    # The total time that it took to execute all jobs
    span = str(eeffective_job_runtime - seffective_job_runtime)
    logging.info("Effective job run-time: %s" % span)

    return digests_matrix
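# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: the fan-out/fan-in
# pattern used above, reduced to a worker function and a queue. The original
# collects thread results through multiprocessing.Queue; a plain Queue.Queue
# (queue.Queue on Python 3) behaves the same way for threads and is used here.
# ---------------------------------------------------------------------------
def _sketch_thread_fan_out(items):
    from threading import Thread
    try:
        from Queue import Queue        # Python 2
    except ImportError:
        from queue import Queue        # Python 3

    output = Queue()

    def worker(item):
        output.put(item * 2)           # stand-in for run_job(...)

    threads = [Thread(target=worker, args=(i,)) for i in items]
    [t.start() for t in threads]
    [t.join() for t in threads]
    return [output.get() for _ in threads]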
def run_execution_serial(faults, jobs, aggregation, reference_digests):
    """ Execute jobs in serial.

    :param faults: (int) Number of faults to tolerate
    :param jobs: (list) list of Job structures
    :param aggregation: (boolean) is it the aggregation phase or not
    :param reference_digests: (RefDigests) digests of reference
    :return: list with the result of the selected digest. Ex:
        (True, {u'/aggregate-output/part-r-00000': u'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855'})
    """
    group_jobs = []
    if not jobs:
        return group_jobs

    logging.info(" Running scheduling: %s" % medusa_settings.ranking_scheduler)

    job_args = []
    for job in jobs:
        clusters_to_launch_job = _copy_and_aggregate(job, reference_digests, aggregation)
        logging.debug("Clusters included %s" % clusters_to_launch_job)
        job_args.append(
            ExecutionJob(job.id, clusters_to_launch_job, job.command,
                         job.output_path + '/part*', majority(faults)))

    # if medusa_settings.relaunch_job_other_cluster and not aggregation:
    #     logging.warn("Please shut one cluster down... Execution will resume in 10 secs.")
    #     time.sleep(10)

    logging.info("Running %s jobs..." % (len(job_args)))
    seffective_job_runtime = time.time()

    digests_matrix = []
    for execution_parameters in job_args:
        _job_output_list = []
        while True:
            # run job in the set of clusters
            _job_output = run_job(execution_parameters)
            for _output in _job_output:
                _job_output_list.append(parse_data(_output))

            successful, digests = run_verification(_job_output_list, aggregation)
            if not successful:
                if medusa_settings.relaunch_job_same_cluster:
                    # relaunch job in the same cloud
                    path_to_remove = os.path.dirname(execution_parameters.output_path)
                    _relaunch_job_same_cluster(execution_parameters, path_to_remove)
                else:
                    # if len(_failed_exec) > 0:
                    logging.debug("Re-launching job %s" % execution_parameters.command)
                    save_reexecute_another_cloud(True)
                    execution_parameters = _relaunch_job_other_cluster(
                        execution_parameters, jobs, reference_digests, aggregation)
            else:
                digests_matrix.append(digests)
                break

    # save progress of the job
    filename = settings.get_temp_dir() + "/job_progress_log.json"
    step = 2 if not aggregation else 5
    update_json_file(filename, step)

    eeffective_job_runtime = time.time()
    # The total time that it took to execute all jobs
    span = str(eeffective_job_runtime - seffective_job_runtime)
    logging.info("Effective job run-time: %s" % span)

    return digests_matrix
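# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: the retry loop shared
# by the serial and threaded variants, with run, verify and relaunch reduced
# to plain callables supplied by the caller.
# ---------------------------------------------------------------------------
def _sketch_run_until_verified(run, verify, relaunch):
    outputs = []
    while True:
        outputs.extend(run())           # run_job(...) followed by parse_data(...)
        ok, digests = verify(outputs)   # run_verification(...)
        if ok:
            return digests
        relaunch()                      # same cluster or another cloud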
def run_execution_threads(faults, jobs):
    """ Execute jobs in parallel, one thread per job.

    :param faults: (int) Number of faults to tolerate
    :param jobs: (list) list of Job structures
    :return: matrix with the digests selected by the verification step
    """
    group_jobs = []
    if not jobs:
        return group_jobs

    logging.info(" Running scheduling: %s" % medusa_settings.ranking_scheduler)

    # Queue used to collect the results of every thread
    output = mp.Queue()

    clusters_to_launch_job = pick_up_clusters(0)
    job_args = []
    for job in jobs:
        job_args.append(
            ExecutionJob(job.id, clusters_to_launch_job, job.command,
                         job.output_path + '/part*', majority(faults)))

    logging.info("Running %s jobs..." % (len(job_args)))
    seffective_job_runtime = time.time()

    processes = []
    for execution_parameters in job_args:
        # Each thread executes a job in the respective clusters
        processes.append(Thread(target=run_job, args=(execution_parameters, output,)))

    # Run processes
    [p.start() for p in processes]
    [p.join() for p in processes]

    _output_list = output.get()
    logging.info("Run_job took %s" % str(time.time() - seffective_job_runtime))

    spart = time.time()
    _job_output = []
    for _output in _output_list:
        _job_output += _output
    job_output_list = [parse_data(_joutput) for _joutput in _job_output]
    logging.info("Parse_data took %s" % str(time.time() - spart))

    srverification = time.time()
    digests_matrix = []
    while True:
        successful, digests = run_verification(job_output_list)
        if not successful:
            if medusa_settings.relaunch_job_same_cluster:
                # relaunch job in the same cloud
                path_to_remove = os.path.dirname(execution_parameters.output_path)
                _relaunch_job_same_cluster(execution_parameters, path_to_remove)
            else:
                logging.debug("Re-launching job %s" % execution_parameters.command)
                execution_parameters = _relaunch_job_other_cluster(execution_parameters, jobs)
                _job_output = run_job(execution_parameters)
                for _output in _job_output:
                    job_output_list.append(parse_data(_output[0]))
        else:
            digests_matrix.append(digests)
            break
    logging.info("Run_verification took %s" % str(time.time() - srverification))

    # save progress of the job
    filename = settings.get_temp_dir() + "/job_progress_log.json"
    step = 2
    update_json_file(filename, step)

    eeffective_job_runtime = time.time()
    # The total time that it took to execute all jobs
    span = str(eeffective_job_runtime - seffective_job_runtime)
    logging.info("Effective job run-time: %s" % span)

    return digests_matrix
def run_execution(faults, jobs, aggregation):
    """ Execute jobs through a process pool.

    :param faults: (int) number of faults to tolerate
    :param jobs: (list) list of Job structures
    :param aggregation: (boolean) is it the aggregation phase or not
    :return: list of {gid: digests} dicts with the digests reported by each cluster
    """
    group_jobs = []
    if not jobs:
        return group_jobs

    print " Running scheduling: %s" % medusa_settings.ranking_scheduler

    seffective_job_runtime = time.time()  # start of the effective run-time measurement
    pool = Pool(processes=4)
    history_rank = defaultdict(lambda: 1)

    args = []
    for job in jobs:
        gid = str(int(time.time()))
        if not aggregation:
            args.append((gid, job, faults, history_rank))
        else:
            args.append((gid, job, faults))

    if not aggregation:
        outputs = pool.map(mergeDirs, args)
    else:
        outputs = pool.map(aggregationMergeDirs, args)

    for output in outputs:
        new_included, new_command, new_poutput = output
        print "Clusters included %s" % new_included
        params = (new_command, new_poutput + '/part*')
        pparams = (new_included, params, majority(faults))
        output = run_job_test(pparams)
        # job_args.append((new_included, params, majority(faults)))
        # outputs = pool2.map(run_job, job_args)
        # seffective_job_runtime = time.time()
        # print "Running jobs (%s)" % (new_command)
        # gjobs = run_job(new_included, params, majority(faults))
        # group_jobs.append({gid: gjobs})

    group_data = []
    for waiter in group_jobs:
        dlist = []
        key, value = waiter.items()[0]
        for v in value:
            cluster, xmlfile = v.get()
            print "Job finished at %s" % cluster
            data = parseJobOutputMetrics(xmlfile)
            logline = "%s:%s:%s:%s:%s:%s:%s:%s" % (
                cluster, data['currentqueuecapacity'], data['hdfsbytesread'],
                data['hdfsbyteswritten'], data['jobsrunning'], data['maps'],
                data['reduces'], data['time'])
            command = writeJobRunning(logline)
            s1 = executeCommand.apply_async(queue=cluster, args=(command,))
            s1.get()
            dlist.append([data['digests']])
        group_data.append({key: dlist})

    eeffective_job_runtime = time.time()
    # The total time that it took to execute all jobs
    span = str(eeffective_job_runtime - seffective_job_runtime)
    print "Effective job run-time: %s" % span
    return group_data
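# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: the Pool.map fan-out
# used by run_execution(), with a module-level stand-in for mergeDirs so that
# multiprocessing can pickle the mapped function.
# ---------------------------------------------------------------------------
def _sketch_merge(job_args):
    gid, job, faults = job_args
    return gid, job, faults             # stand-in for mergeDirs(...)


def _sketch_pool_map(jobs, faults):
    from multiprocessing import Pool
    import time

    pool = Pool(processes=4)
    args = [(str(int(time.time())), job, faults) for job in jobs]
    outputs = pool.map(_sketch_merge, args)
    pool.close()
    pool.join()
    return outputs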