import collections
import itertools


class RayMapReduce(object):
    def __init__(self, map_func, reduce_func, num_workers=None, host_address=None):
        """
        map_func
          Map function.
        reduce_func
          Reducer function.
        num_workers
          The number of workers to create in the pool. If None, defaults to the
          number of CPUs available on the current host.
        host_address
          The IP address of the master node. If None, defaults to localhost.
        """
        from ray.util.multiprocessing.pool import Pool  # import within __init__()
        # Wire the documented options through to Ray's Pool (they were previously ignored).
        self.pool = Pool(processes=num_workers, ray_address=host_address)
        self.map_func = map_func
        self.reduce_func = reduce_func

    def partition(self, mapped_values):
        """
        Organize the mapped values by their key.
        Returns an unsorted sequence of tuples with a key and a sequence of values.
        """
        partitioned_data = collections.defaultdict(list)
        for key, value in mapped_values:
            partitioned_data[key].append(value)
        return partitioned_data.items()

    def __call__(self, inputs, chunksize=1):
        """
        Process the inputs through the map and reduce functions given.

        inputs
          An iterable containing the input data to be processed.
        chunksize=1
          The portion of the input data to hand to each worker. This can be
          used to tune performance during the mapping phase.
        """
        map_responses = self.pool.map(self.map_func, inputs, chunksize=chunksize)
        partitioned_data = self.partition(itertools.chain(*map_responses))
        reduced_values = self.pool.map(self.reduce_func, partitioned_data)
        return reduced_values
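# A minimal word-count sketch of how RayMapReduce might be driven; file_to_words,
# count_words and the file names below are illustrative assumptions, not part of
# the original snippet.
def file_to_words(filename):
    # Map step: read one file and emit (word, 1) pairs.
    with open(filename) as fh:
        return [(word.lower(), 1) for word in fh.read().split()]


def count_words(item):
    # Reduce step: item is (word, [1, 1, ...]) as produced by partition().
    word, occurrences = item
    return (word, sum(occurrences))


mapper = RayMapReduce(file_to_words, count_words)
word_counts = mapper(["a.txt", "b.txt"])  # list of (word, total_count) tuples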
def __parallel_run_ray(self, run_async=False):
    """ Initializes a Ray pool. Asynchronous pools are not implemented yet. """
    import os  # needed for os.nice below
    from ray.util.multiprocessing.pool import Pool

    def set_niceness(niceness):  # pool initializer
        os.nice(niceness)

    def worker_wrapper(x):
        os.nice(self.parameters.get('niceness', 20))
        # Write this task's swept values back into the parameters before processing.
        for k, v in zip(self.parameters['parallel'], x):
            self.parameters[k] = v
        out = self.process()
        return out

    # One tuple of values per task, taken column-wise from the swept parameters.
    iterable_vars = list(
        zip(*[self.parameters[k] for k in self.parameters['parallel']]))
    n_cores = self.parameters.get('n_cores', 4)
    pool = Pool(processes=n_cores,
                initializer=set_niceness,
                initargs=(self.parameters.get('niceness', 20), ),
                ray_address='auto')  # run on the same host it was called from
    outs = pool.map(worker_wrapper, iterable_vars)
    return self.__process_outputs(outs)
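# Illustrative layout of self.parameters assumed by __parallel_run_ray above; the
# key names and values here are examples only, not taken from the original code.
parameters = {
    'parallel': ['alpha', 'beta'],   # keys whose values are swept in parallel
    'alpha':    [0.1, 0.2, 0.3],     # one value per task
    'beta':     [10, 20, 30],
    'n_cores':  4,
    'niceness': 20,
}
# zip(*...) pairs the swept values column-wise: (0.1, 10), (0.2, 20), (0.3, 30);
# each tuple is handed to worker_wrapper, which writes the values back into the
# parameters dict and calls self.process() once per combination.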
import time


def approximate_pi_distributed(num_samples):
    from ray.util.multiprocessing.pool import Pool  # NOTE: Only the import statement is changed.
    pool = Pool()

    start = time.time()
    num_inside = 0
    sample_batch_size = 100000
    # Split the work into batches and sum the per-batch hit counts.
    for result in pool.map(sample,
                           [sample_batch_size for _ in range(num_samples // sample_batch_size)]):
        num_inside += result

    print("pi ~= {}".format((4 * num_inside) / num_samples))
    print("Finished in: {:.2f}s".format(time.time() - start))
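# approximate_pi_distributed relies on a sample() helper that is not shown above.
# A plausible Monte Carlo version (an assumption, not the original definition):
import math
import random


def sample(num_samples):
    # Count uniformly drawn points in [-1, 1]^2 that land inside the unit circle.
    num_inside = 0
    for _ in range(num_samples):
        x, y = random.uniform(-1, 1), random.uniform(-1, 1)
        if math.hypot(x, y) <= 1:
            num_inside += 1
    return num_inside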
# Fragment: assumes book_tets, tet_dict, func_get_movie, hope, f and spec_book are
# defined earlier, along with the usual imports (multiprocessing as mp,
# functools.partial, pandas as pd, numpy as np) and the Pool import used above.
for i in book_tets:
    yes = 'b' + str(func_get_movie(i))
    tet_dict[yes] = i

list_movies = hope.index.tolist()

# #################### COMPUTE SIMILARITY BETWEEN MOVIE TETS ####################
num_movies = len(tet_dict)
print(num_movies)

pool = Pool(mp.cpu_count() - 2)
results = pool.map(partial(f, tet_dict=tet_dict, list_movies=list_movies, spec=spec_book),
                   list_movies, chunksize=2000)

data = [x[1] for x in results]
movies = [x[0] for x in results]

df = pd.DataFrame(data=data, index=movies, columns=movies).fillna(0)
cols = df.columns.values.tolist()
rows = list(df.index)

X = df.to_numpy()
X = X + X.T - np.diag(np.diag(X))  # mirror the upper triangle to make X symmetric
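# Tiny standalone demo of the symmetrisation trick used on X above (only numpy
# is assumed; the values are made up for illustration).
import numpy as np

U = np.array([[1.0, 0.7, 0.2],
              [0.0, 1.0, 0.5],
              [0.0, 0.0, 1.0]])      # upper-triangular similarity matrix
S = U + U.T - np.diag(np.diag(U))    # mirror it without doubling the diagonal
print(S)                             # symmetric, with the original 1.0 diagonal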