def __init__(self, comm=None, debug=False, loadbalance=False, comms=None):
    self.comm = MPI.COMM_WORLD if comm is None else comm
    self.rank = self.comm.Get_rank()
    # The master (rank 0) is excluded from the worker count.
    self.size = self.comm.Get_size() - 1
    self.debug = debug
    self.function = _error_function
    self.loadbalance = loadbalance
    self.comms = comms
    '''
    if not self.is_master() and (comms is None or len(self.comms) != 1):
        raise RuntimeError("Invalid number of groups: " + str(len(comms)))
    '''
    # The master assigns each node (group) a unique tag, then broadcasts the
    # mapping so every rank shares the same numbering.
    self.node_name_to_tag = {}
    if self.is_master():
        for i, s in enumerate(comms.keys()):
            self.node_name_to_tag[s] = i
    self.node_name_to_tag = self.comm.bcast(self.node_name_to_tag, root=0)
    self.tag_to_node_name = {v: k for k, v in self.node_name_to_tag.items()}
    self.node_name_to_rank = mpi_utility.mpi_gather_hostnames(
        comm=self.comm, include_root=False)
    if (not self.is_master() and
            helper_functions.get_hostname() not in self.node_name_to_tag):
        raise RuntimeError("Node " + helper_functions.get_hostname() +
                           " not in node_name_to_tag")
    if self.size == 0:
        raise ValueError("Tried to create an MPI pool, but there "
                         "was only one MPI process available. "
                         "Need at least two.")
    if self.num_groups == 0:
        raise ValueError("Tried to create an MPI pool, but there "
                         "were no groups available.")
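# Hedged usage sketch for the constructor above. The class name
# "MPIGroupPool" and the per-host comms construction are assumptions for
# illustration; only the mpi4py calls (COMM_WORLD, allgather, Split) are
# standard API.
import socket
from mpi4py import MPI

world = MPI.COMM_WORLD
hostname = socket.gethostname()
hosts = sorted(set(world.allgather(hostname)))
# Color ranks by host so Split yields one sub-communicator per node.
local_comm = world.Split(color=hosts.index(hostname), key=world.Get_rank())
if world.Get_rank() == 0:
    # The master enumerates comms.keys() to assign tags, so give it an
    # entry for every host.
    comms = {h: None for h in hosts}
else:
    comms = {hostname: local_comm}
pool = MPIGroupPool(comm=world, comms=comms)  # hypothetical class name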
def mpi_run_main_args(args):
    #return None
    args = list(args)
    # When running with multiple node groups (or parallelized CV), pass this
    # node's group communicator through to main.
    if len(mpi_comms) > 1 or parallelize_cv:
        my_comm = mpi_comms[helper_functions.get_hostname()]
        args.append(my_comm)
    main.run_main_args(args)
def mpi_rollcall():
    comm = MPI.COMM_WORLD
    s = comm.Get_size()
    rank = comm.Get_rank()
    # Print one line per rank, in rank order, using a barrier to serialize
    # the output.
    for i in range(s):
        if rank == i:
            hostname = helper_functions.get_hostname()
            print '(' + hostname + '): ' + str(rank) + ' of ' + str(s)
        comm.Barrier()
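# Intended behavior (hedged illustration, hostnames hypothetical): launched
# via "mpiexec -n 4 python script.py" across two nodes, mpi_rollcall() prints
# one line per rank, serialized in rank order by the barrier:
#   (node1): 0 of 4
#   (node1): 1 of 4
#   (node2): 2 of 4
#   (node2): 3 of 4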
def test_mpi_func(i):
    local_comm = mpi_comms[helper_functions.get_hostname()]
    local_comm.Barrier()
    # Only the group root prints, so each group reports exactly once.
    if local_comm.Get_rank() == 0:
        mpi_utility.mpi_print('My Group: ' + str(i), local_comm)
    local_comm.Barrier()
    #mpi_utility.mpi_print(str(i))
    return 'Hello World'
def run_experiments(self):
    data_file = self.configs.data_file
    data_and_splits = helper_functions.load_object(data_file)
    data_and_splits.data.repair_data()
    assert self.configs.num_splits <= len(data_and_splits.splits)
    data_and_splits.labels_to_keep = self.configs.labels_to_keep
    data_and_splits.labels_to_not_sample = self.configs.labels_to_not_sample
    data_and_splits.target_labels = self.configs.target_labels
    data_and_splits.data.repair_data()
    results_file = self.configs.results_file
    comm = mpi_utility.get_comm()
    # Skip the whole run if results already exist on disk.
    if os.path.isfile(results_file):
        if mpi_utility.is_group_master():
            print results_file + ' already exists - skipping'
        return
    if mpi_utility.is_group_master():
        hostname = helper_functions.get_hostname()
        print '(' + hostname + ') Running experiments: ' + results_file
    learner = self.configs.learner
    learner.run_pre_experiment_setup(data_and_splits)
    num_labels = len(self.configs.num_labels)
    num_splits = self.configs.num_splits
    #method_results = results.MethodResults(n_exp=num_labels, n_splits=num_splits)
    method_results = self.configs.method_results_class(n_exp=num_labels,
                                                       n_splits=num_splits)
    for i, nl in enumerate(self.configs.num_labels):
        method_results.results_list[i].num_labels = nl
    # Enumerate every (num_labels index, split index) pair to run; if a
    # single split index is configured, restrict to that split.
    split_idx = self.configs.split_idx
    if split_idx is not None:
        num_labels_list = list(itertools.product(range(num_labels), [split_idx]))
    else:
        num_labels_list = list(itertools.product(range(num_labels),
                                                 range(num_splits)))
    shared_args = (self, results_file, data_and_splits, method_results)
    args = [shared_args + (i_labels, split) for i_labels, split in num_labels_list]
    if self.configs.use_pool:
        pool = multiprocessing_utility.LoggingPool(processes=self.configs.pool_size)
        all_results = pool.map(_run_experiment, args)
    else:
        all_results = [_run_experiment(a) for a in args]
    for curr_results, s in zip(all_results, num_labels_list):
        if curr_results is None:
            continue
        i_labels, split = s
        method_results.set(curr_results, i_labels, split)
    method_results.configs = self.configs
    if self.configs.should_load_temp_data:
        helper_functions.save_object(results_file, method_results)
        # Clean up per-split temp files once the aggregated results are saved.
        for i_labels, split in num_labels_list:
            num_labels = self.configs.num_labels[i_labels]
            _delete_temp_split_files(results_file, num_labels, split)
        _delete_temp_folder(results_file)
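# Note (hedged): each element of args built above is the tuple
#   (self, results_file, data_and_splits, method_results, i_labels, split)
# where i_labels indexes self.configs.num_labels and split selects the
# cross-validation split; _run_experiment (defined elsewhere) is assumed to
# unpack it in that order.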
def mpi_gather_hostnames(comm=MPI.COMM_WORLD, include_root=False):
    hostname = helper_functions.get_hostname()
    all_hostnames = comm.gather(hostname, root=0)
    all_hostnames = comm.bcast(all_hostnames, root=0)
    # Map each hostname to the set of ranks running on it, optionally
    # excluding the root rank (rank 0).
    host_to_rank = {}
    for i, s in enumerate(all_hostnames):
        if not include_root and i == 0:
            continue
        if s not in host_to_rank:
            host_to_rank[s] = set()
        host_to_rank[s].add(i)
    host_to_rank = comm.bcast(host_to_rank, root=0)
    return host_to_rank
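# Example (hedged, hostnames hypothetical): with ranks 0-2 on "node1" and
# ranks 3-4 on "node2", mpi_gather_hostnames(include_root=False) returns
#   {'node1': set([1, 2]), 'node2': set([3, 4])}
# -- rank 0 is skipped, so the root's host lists only its worker ranks.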
def run_experiments(self):
    data_file = self.configs.data_file
    data_and_splits = self.load_data_and_splits(data_file)
    results_file = self.configs.results_file
    comm = mpi_utility.get_comm()
    # Skip the whole run if results already exist on disk.
    if os.path.isfile(results_file):
        if mpi_utility.is_group_master():
            print results_file + ' already exists - skipping'
        return
    if mpi_utility.is_group_master():
        hostname = helper_functions.get_hostname()
        print '(' + hostname + ') Running experiments: ' + results_file
    learner = self.configs.learner
    learner.run_pre_experiment_setup(data_and_splits)
    num_labels = len(self.configs.num_labels)
    num_splits = self.configs.num_splits
    #method_results = results.MethodResults(n_exp=num_labels, n_splits=num_splits)
    method_results = self.configs.method_results_class(n_exp=num_labels,
                                                       n_splits=num_splits)
    for i, nl in enumerate(self.configs.num_labels):
        method_results.results_list[i].num_labels = nl
    # Enumerate every (num_labels index, split index) pair to run; if a
    # single split index is configured, restrict to that split.
    split_idx = self.configs.split_idx
    if split_idx is not None:
        num_labels_list = list(itertools.product(range(num_labels), [split_idx]))
    else:
        num_labels_list = list(itertools.product(range(num_labels),
                                                 range(num_splits)))
    shared_args = (self, results_file, data_and_splits, method_results)
    args = [shared_args + (i_labels, split) for i_labels, split in num_labels_list]
    if self.configs.use_pool:
        pool = multiprocessing_utility.LoggingPool(processes=self.configs.pool_size)
        all_results = pool.map(_run_experiment, args)
    else:
        all_results = [_run_experiment(a) for a in args]
    for curr_results, s in zip(all_results, num_labels_list):
        if curr_results is None:
            continue
        i_labels, split = s
        method_results.set(curr_results, i_labels, split)
    method_results.configs = self.configs
    if self.configs.should_load_temp_data:
        helper_functions.save_object(results_file, method_results)
        # Clean up per-split temp files once the aggregated results are saved.
        for i_labels, split in num_labels_list:
            num_labels = self.configs.num_labels[i_labels]
            _delete_temp_split_files(results_file, num_labels, split)
        _delete_temp_folder(results_file)
def get_tag(self):
    if self.is_master():
        raise RuntimeError("Master node doesn't have a tag.")
    return self.node_name_to_tag[helper_functions.get_hostname()]
def is_group_root(self):
    # The group root is the lowest non-master rank on this host. Work on a
    # copy so the shared node_name_to_rank sets are not mutated.
    group_members = set(self.node_name_to_rank[helper_functions.get_hostname()])
    group_members.discard(0)
    return self.rank == min(group_members)
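# Example (hedged, continuing the mapping above): on "node1" the members are
# {1, 2} once the master rank 0 is dropped, so rank 1 is the group root; on
# "node2", rank 3 is the group root.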