def test_connect_to_ray(monkeypatch, ray_start_cluster):
    """Check how ``Pool`` picks the Ray cluster it runs on.

    Three cases are exercised in order: no address at all (a local Ray
    instance is started), an explicit ``ray_address`` argument, and the
    ``RAY_ADDRESS`` environment variable.  The last step checks that asking
    for more processes than the connected cluster has CPUs raises.
    """

    def getpid(args):
        return os.getpid()

    def check_pool_size(pool, size):
        # `size` tasks must report `size` distinct worker PIDs.
        args = [tuple() for _ in range(size)]
        assert len(set(pool.map(getpid, args))) == size

    address = ray_start_cluster.address
    # Use different numbers of CPUs to distinguish between starting a local
    # ray cluster and connecting to an existing one.
    start_cpus = 1  # Set in fixture.
    init_cpus = 2

    # The first scenario requires RAY_ADDRESS to be unset; drop any value
    # inherited from the environment (restored by monkeypatch at teardown).
    monkeypatch.delenv("RAY_ADDRESS", raising=False)

    # Set up the cluster id so that gcs is talking with a different
    # storage prefix
    monkeypatch.setenv("RAY_external_storage_namespace", "new_cluster")
    ray._raylet.Config.initialize("")

    # Check that starting a pool still starts ray if RAY_ADDRESS not set.
    pool = Pool(processes=init_cpus)
    assert ray.is_initialized()
    assert int(ray.cluster_resources()["CPU"]) == init_cpus
    check_pool_size(pool, init_cpus)
    pool.terminate()
    pool.join()
    ray.shutdown()

    # Check that starting a pool connects to a running ray cluster if
    # ray_address is passed in.
    pool = Pool(ray_address=address)
    assert ray.is_initialized()
    assert int(ray.cluster_resources()["CPU"]) == start_cpus
    check_pool_size(pool, start_cpus)
    pool.terminate()
    pool.join()
    ray.shutdown()

    monkeypatch.setenv("RAY_external_storage_namespace", "new_cluster2")
    ray._raylet.Config.initialize("")

    # Set RAY_ADDRESS, so pools should connect to the running ray cluster.
    # Going through monkeypatch (instead of writing os.environ directly)
    # guarantees the variable is removed again when the test finishes, so it
    # cannot leak into tests that run afterwards.
    monkeypatch.setenv("RAY_ADDRESS", address)

    # Check that starting a pool connects to a running ray cluster if
    # RAY_ADDRESS is set.
    pool = Pool()
    assert ray.is_initialized()
    assert int(ray.cluster_resources()["CPU"]) == start_cpus
    check_pool_size(pool, start_cpus)
    pool.terminate()
    pool.join()
    ray.shutdown()

    # Check that trying to start a pool on an existing ray cluster throws an
    # error if there aren't enough CPUs for the number of processes.
    with pytest.raises(Exception):
        Pool(processes=start_cpus + 1)
    assert int(ray.cluster_resources()["CPU"]) == start_cpus
    ray.shutdown()
def test_ray_init(shutdown_only):
    """Exercise how creating a ``Pool`` interacts with Ray initialization.

    Scenarios, in order: Ray not yet initialized, a local three-CPU Ray
    instance already running, and a one-CPU instance that is too small for a
    two-process pool.
    """

    def worker_pid(_unused):
        return os.getpid()

    def assert_distinct_workers(worker_pool, expected):
        # `expected` tasks must report `expected` different PIDs.
        pids = worker_pool.map(worker_pid, [()] * expected)
        assert len(set(pids)) == expected

    def cpu_count():
        return int(ray.state.cluster_resources()["CPU"])

    # Scenario 1: constructing a pool starts ray when it is not initialized.
    auto_pool = Pool(processes=2)
    assert ray.is_initialized()
    assert cpu_count() == 2
    assert_distinct_workers(auto_pool, 2)
    ray.shutdown()

    # Scenario 2: a pool created on top of a running local ray cluster leaves
    # that cluster's resources untouched.
    ray.init(num_cpus=3)
    assert ray.is_initialized()
    shared_pool = Pool(processes=2)
    assert cpu_count() == 3
    assert_distinct_workers(shared_pool, 2)
    ray.shutdown()

    # Scenario 3: asking for more processes than the running cluster has
    # CPUs must raise.
    ray.init(num_cpus=1)
    assert ray.is_initialized()
    with pytest.raises(ValueError):
        Pool(processes=2)
    assert cpu_count() == 1
    ray.shutdown()
def splitDataset(dataset: str,
                 outputPath: str,
                 time: int,
                 inputPath: str = 'D://Datasets//Tsinghua'):
    """Fan the segmentation of *dataset* out over four worker processes.

    Args:
        dataset: Dataset name. ``'benchmark'`` uses a length of 1500; any
            other name uses 750.
        outputPath: Base output directory. Results go to
            ``<outputPath>//<dataset>//<time * 250>``, which is created if
            it does not exist.
        time: Segment duration; one segment spans ``time * 250`` units
            (presumably samples at 250 Hz -- TODO confirm). Must divide the
            dataset length evenly.
        inputPath: Base input directory; data is read from
            ``<inputPath>//<dataset>//separated``.

    Raises:
        AssertionError: If the dataset length is not a multiple of
            ``time * 250``.
    """
    segment_length = time * 250
    total_length = 1500 if dataset == 'benchmark' else 750
    assert total_length % segment_length == 0
    n_segments = int(total_length / segment_length)

    datasetPath = Path(inputPath + '//{}//separated'.format(dataset))
    outputPath = Path(outputPath + '//{}//{}'.format(dataset, str(segment_length)))
    outputPath.mkdir(parents=True, exist_ok=True)

    # Four work items of ten consecutive numbers each: 1-10, 11-20, 21-30,
    # 31-40 (presumably the class/trial folder numbers -- TODO confirm).
    sublists = [list(range(first, first + 10)) for first in range(1, 41, 10)]

    f = partial(_splitDataset,
                inputPath=datasetPath,
                outputPath=outputPath,
                n_segments=n_segments)
    with Pool(processes=4) as pool:
        pool.map(f, sublists)
def extractSeparatedFiles(dataset: str,
                          raw_data_directory: str = 'D:\\Datasets\\Tsinghua'):
    '''
    Extract data from each trial (40, one for each class) and blocks (one for
    each trial) and save it separately, divided by class/trial and identified
    by individual and block: S{individual}_{block}.csv

    dataset: name of the dataset sub-directory inside raw_data_directory;
        'benchmark' is processed with 6 blocks, anything else with 4.
    raw_data_directory: directory that contains the dataset sub-directory.
    '''
    # Join with pathlib instead of concatenating a literal backslash so the
    # dataset directory is also resolved correctly on non-Windows hosts.
    raw_data_directory = Path(raw_data_directory) / dataset

    files = [file for file in os.listdir(raw_data_directory) if "mat" in file]
    # Split the file list for the four workers below (segmentList is defined
    # elsewhere; presumably it yields four chunks -- TODO confirm).
    files = segmentList(files, 4)

    # One output folder per trial, numbered 1..40.
    for trial in range(40):
        (raw_data_directory / 'separated' / str(trial + 1)).mkdir(
            parents=True, exist_ok=True)

    B = 6 if dataset == 'benchmark' else 4

    f = partial(_extractSeparatedFiles,
                dataset=dataset,
                raw_data_directory=raw_data_directory,
                num_blocks=B)
    with Pool(processes=4) as pool:
        pool.map(f, files)
def dist_mat(objs: t.Union[t.Sequence[nx.Graph], t.Sequence[t.Collection[SeqGene]]],
             n_jobs: int = 10) -> np.ndarray:
    """Compute distance matrix using `Dist(obj1, obj2)` <- sum of the number
    of different types at each position.

    The distance function is chosen from the type of the first element:
    `graph_dist` for `nx.Graph` objects, `seq_dist` otherwise. Only pairs with
    `i <= j` are computed (in a pool of `n_jobs` workers); each result is
    written to both `[i][j]` and `[j][i]`.

    :param objs: graphs or gene sequences to compare pairwise.
    :param n_jobs: number of pool workers.
    :return: square array of shape `(len(objs), len(objs))`; a `(0, 0)` array
        when `objs` is empty.
    """
    size = len(objs)
    base = np.zeros(shape=(size, size))
    if size == 0:
        # Nothing to compare; also avoids indexing objs[0] on empty input.
        return base

    dist = graph_dist if isinstance(objs[0], nx.Graph) else seq_dist

    # Upper triangle including the diagonal. The indices travel with the
    # objects because the worker's result is unpacked as (i, j, distance).
    staged_data = [(i, objs[i], j, objs[j])
                   for i in range(size)
                   for j in range(i, size)]
    staged_data = tqdm(staged_data, desc='Distance matrix')

    with Pool(n_jobs) as workers:
        distances = workers.starmap(dist, staged_data)

    for i, j, d in distances:
        base[i][j] = d
        base[j][i] = d
    return base
def run(self) -> Any:
    """
    Runs the pipeline once. If ``self.parallel`` is set, the streams are mapped in a worker pool:
    a ``multiprocessing.Pool`` when ``self.backend == 'mp'``, otherwise a ``Pool`` created after
    connecting to an already running Ray cluster (``address='auto'``). Otherwise the mappers are
    initialized in this process and the streams are mapped sequentially.

    :return: the list with the result of ``self._map_f`` for every element of ``self.streams``
    :raises AssertionError: if the pipeline has already been run (``self.done`` is set)
    """
    assert not self.done

    if self.parallel:
        if self.par_logger:
            self.par_logger.logger.info(f'{self.__class__.__name__}: Initializing mappers')
        if self.backend == 'mp':
            with multiprocessing.Pool(initializer=self._initialize_mappers,
                                      initargs=(self.mappers_factory,)) as pool:
                res = pool.map(self._map_f, self.streams)
        else:
            # The non-'mp' backend additionally hands the current working directory
            # to the mapper initializer.
            work_dir = os.getcwd()
            ray.init(address='auto', redis_password='******')
            with Pool(initializer=self._initialize_mappers,
                      initargs=(self.mappers_factory, work_dir)) as pool:
                res = pool.map(self._map_f, self.streams)
    else:
        self._initialize_mappers(self.mappers_factory)
        res = [self._map_f(e) for e in self.streams]

    if self.par_logger:
        self.par_logger.logger.info(f'{self.__class__.__name__}: Mapping pipeline executed')
    self.done = True
    return res
def run(seed, checkpoint_path, samples, workers, generated_path, termination_mode,
        frame_take_prob=0.2, disable_adv_comm=False, t_fac=1.5):
    """Generate samples in ``workers`` parallel processes and pickle them.

    Worker ``i`` calls ``generate`` with seed ``seed + i`` and is asked for
    ``int(samples / workers)`` samples; the remaining arguments are passed
    through unchanged. The lists returned by the workers are concatenated,
    their total length is printed, and the combined list is pickled to
    ``generated_path``.

    Note that because of the truncating division, fewer than ``samples``
    items are requested in total when ``samples`` is not a multiple of
    ``workers``.
    """
    results = []
    with Pool(workers) as p:
        job_args = [
            (seed + i, checkpoint_path, int(samples / workers), termination_mode,
             frame_take_prob, disable_adv_comm, t_fac)
            for i in range(workers)
        ]
        for res in p.starmap(generate, job_args):
            results += res
    print("DONE", len(results))
    # Context manager so the output file is flushed and closed deterministically
    # instead of being left to the garbage collector.
    with open(generated_path, "wb") as out_file:
        pickle.dump(results, out_file)
def test_maxtasksperchild(shutdown_only):
    """With ``maxtasksperchild=1``, 20 tasks on a 5-process pool must report
    20 distinct worker PIDs."""

    def current_pid(_unused):
        return os.getpid()

    task_count = 20
    single_use_pool = Pool(5, maxtasksperchild=1)
    observed_pids = set(single_use_pool.map(current_pid, range(task_count)))
    assert len(observed_pids) == task_count
    single_use_pool.terminate()
    single_use_pool.join()
def launch_long_running_tasks(num_pool=5):
    """Run ``task`` over ``range(1, 500, 10)`` in a pool and return the results.

    Args:
        num_pool: Number of workers in the pool.

    Returns:
        A list with one result per input, in input order.
    """
    # doing the work, collecting data, updating the database
    # create an Actor pool of num_pool workers nodes
    pool = Pool(num_pool)
    try:
        return list(pool.map(task, range(1, 500, 10)))
    finally:
        # Release the workers even when a task raises.
        pool.terminate()
def test_ray_init(monkeypatch, shutdown_only):
    """Exercise how creating a ``Pool`` interacts with Ray initialization.

    Scenarios, in order: Ray not yet initialized, a local three-CPU Ray
    instance already running, and a one-CPU instance that is too small for a
    two-process pool.
    """

    def worker_pid(_unused):
        return os.getpid()

    def assert_distinct_workers(worker_pool, expected):
        # `expected` tasks must report `expected` different PIDs.
        pids = worker_pool.map(worker_pid, [()] * expected)
        assert len(set(pids)) == expected

    def cpu_count():
        return int(ray.cluster_resources()["CPU"])

    # Scenario 1: constructing a pool starts ray when it is not initialized.
    auto_pool = Pool(processes=2)
    assert ray.is_initialized()
    assert cpu_count() == 2
    assert_distinct_workers(auto_pool, 2)
    auto_pool.terminate()
    auto_pool.join()
    ray.shutdown()

    # Give the next cluster its own id so that gcs talks with a different
    # storage prefix.
    monkeypatch.setenv("RAY_external_storage_namespace", "new_cluster")
    ray._raylet.Config.initialize("")

    # Scenario 2: a pool created on top of a running local ray cluster leaves
    # that cluster's resources untouched.
    ray.init(num_cpus=3)
    assert ray.is_initialized()
    shared_pool = Pool(processes=2)
    assert cpu_count() == 3
    assert_distinct_workers(shared_pool, 2)
    shared_pool.terminate()
    shared_pool.join()
    ray.shutdown()

    # Scenario 3: asking for more processes than the running cluster has
    # CPUs must raise.
    ray.init(num_cpus=1)
    assert ray.is_initialized()
    with pytest.raises(ValueError):
        Pool(processes=2)
    assert cpu_count() == 1
    ray.shutdown()
def test_initializer(shutdown_only):
    """Check that the pool initializer has run for every worker.

    The initializer writes a file named after the worker's PID into a
    temporary directory; right after the pool is constructed the directory
    must hold one file per process.
    """

    def write_pid_marker(target_dir):
        marker_path = os.path.join(target_dir, str(os.getpid()))
        with open(marker_path, "w") as marker:
            print("hello", file=marker)

    num_processes = 4
    with tempfile.TemporaryDirectory() as dirname:
        pool = Pool(
            processes=num_processes,
            initializer=write_pid_marker,
            initargs=(dirname,),
        )
        marker_files = os.listdir(dirname)
        assert len(marker_files) == num_processes
        pool.terminate()
        pool.join()
def fitness_function_3(x, np_ext, np_int, m_l, m_i, m_ext, m_int):
    """Evaluate ``iterateArrays`` for every element of ``x`` in a process pool.

    Each work item is ``[element, m_int, m_i, m_ext_tp, m_ext_a_sh]`` where the
    last two lists are the first and second components of every entry of
    ``m_ext``. ``np_ext``, ``np_int`` and ``m_l`` are accepted but not used
    here.

    Returns:
        ``np.ndarray`` built from the per-element results, in the order of
        ``x``.
    """
    m_ext_tp = [i[0] for i in m_ext]  # assign the external-mesh coordinate value
    m_ext_a_sh = [i[1] for i in m_ext]
    args = [[i, m_int, m_i, m_ext_tp, m_ext_a_sh] for i in x]

    p = Pool()
    try:
        total = p.map(iterateArrays, args)
    finally:
        # Always shut the workers down, even if a task raised.
        p.close()
        p.join()
    return np.array(total)
import time

from ray.util.multiprocessing import Pool


def f(index):
    """Sleep for five seconds, then return *index* unchanged."""
    time.sleep(5)
    return index


if __name__ == "__main__":
    # Ten tasks on a five-worker pool; each result is printed on its own line.
    worker_pool = Pool(5)
    outputs = worker_pool.map(f, range(10))
    print(*outputs, sep="\n")
def pool_4_processes():
    """Yield a four-process ``Pool``, then tear it down.

    After the consumer resumes the generator the pool is terminated and
    joined and Ray is shut down. (Presumably registered as a pytest fixture
    by a decorator outside this view.)
    """
    four_worker_pool = Pool(processes=4)
    yield four_worker_pool
    # Teardown: stop the workers, wait for them, then stop ray itself.
    four_worker_pool.terminate()
    four_worker_pool.join()
    ray.shutdown()
# Let's try multiprocess for each core # Since this is CPU I/O bound task, we should get better performance # the serial and threading # start = time.time() mp_pool = mp.Pool(get_cpu_count()) with mp_pool as p: prime_numbers = p.map(is_prime, list(range(num))) end = time.time() mp_pool.terminate() print( f"Multi Process access: Time elapsed: {end - start:4.2f} sec to compute all primes in {num} are {sum(list(prime_numbers))}" ) # Let's try that with Ray multiprocessing pool ray.init() ray_pool = Pool(get_cpu_count()) lst = list(range(num)) results = [] start = time.time() for result in ray_pool.map(is_prime, lst): results.append(result) end = time.time() ray_pool.terminate() print( f"Ray Distributed Multi Process access: Time elapsed: {end - start:4.2f} sec to compute all primes in {num} are {sum(results)}" ) ray.shutdown()
def pool():
    """Yield a single-process ``Pool``; once the consumer resumes the
    generator, terminate the pool and shut Ray down."""
    single_worker_pool = Pool(processes=1)
    yield single_worker_pool
    # Teardown after the consumer is done with the pool.
    single_worker_pool.terminate()
    ray.shutdown()
def poolit_b():
    """Map ``poolit_a`` over the inputs 2 and 3 in a pool created with
    ``ray_address="auto"`` and return the results as a list."""
    with Pool(ray_address="auto") as outer_pool:
        nested_results = outer_pool.map(poolit_a, range(2, 4))
        return list(nested_results)
def poolit_a(_):
    """Ignore the argument; map ``math.sqrt`` over 0 and 1 in a pool created
    with ``ray_address="auto"`` and return the results as a list."""
    with Pool(ray_address="auto") as inner_pool:
        roots = inner_pool.map(math.sqrt, range(2))
        return list(roots)
def _parallel_lambda_ray(function, inputs, jobs):
    """Map *function* over *inputs* in a Ray pool of *jobs* processes.

    The pool is given an initializer that forwards its argument to
    ``pt.init`` as keyword arguments, with ``pt.init_args`` as ``initargs``.
    """
    from ray.util.multiprocessing import Pool

    def _init_worker(args):
        return pt.init(**args)

    with Pool(processes=jobs, initializer=_init_worker, initargs=pt.init_args) as pool:
        mapped = pool.map(function, inputs)
        return mapped
def _transform_ray(self, splits):
    """Apply ``self.parent`` to every element of *splits* in a Ray pool of
    ``self.n_jobs`` processes and concatenate the results with ``pd.concat``.

    The pool is created with ``_pt_init`` as initializer and ``pt.init_args``
    as ``initargs``.
    """
    from ray.util.multiprocessing import Pool

    def _apply_parent(topics):
        return self.parent(topics)

    with Pool(processes=self.n_jobs, initializer=_pt_init, initargs=pt.init_args) as pool:
        results = pool.map(_apply_parent, splits)
    return pd.concat(results)
# NOTE(review): the next two statements are the tail of a function whose
# header lies outside this view; they are reproduced unchanged.
page += processes
return curr_buf

# Script entry point: run one remote crawler task per index, post-process
# the collected reviews in a pool, and write everything to output.csv.
if __name__ == '__main__':
    start = time.time()
    ray.init()
    # NOTE(review): os.cpu_count() can return None, which would make the
    # range() below fail -- confirm this cannot happen on target hosts.
    processes = os.cpu_count()
    # First CSV row is the header (title, rating, price, release date,
    # tags, reviews).
    csv_data = [['標題', '評分', '價格', '上市日期', '標籤', '評論']]
    result_ids = []
    reviews_buffer = []
    # Launch one crawler task per index 1..processes; each also receives the
    # total count.
    for i in range(1, processes + 1):
        result_ids.append(crawler.remote(i, processes))
    # Wait for every crawler task and flatten their buffers into one list.
    results = ray.get(result_ids)
    for res in results:
        reviews_buffer += res
    pool = Pool(processes=processes)
    # process_review receives the first two fields of each entry in swapped
    # order: [entry[1], entry[0]].
    data = pool.map(process_review, [[reviews_buf[1], reviews_buf[0]] for reviews_buf in reviews_buffer])
    # NOTE(review): the pool is closed but never joined or terminated.
    pool.close()
    # Each mapped result is a list of rows appended after the header.
    for d in data:
        csv_data += d
    with open('output.csv', 'w', newline='', encoding='utf-8-sig') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(csv_data)
    end = time.time()
    # Prints the elapsed wall-clock time in seconds (message is in Chinese).
    print(f'執行時間 {end - start} 秒')