def test_no_failure_with_torch_mp(out_dir):
    shutil.rmtree(out_dir, ignore_errors=True)
    path = build_json(out_dir, save_all=True, save_interval="1")
    path = str(path)
    os.environ["SMDEBUG_CONFIG_FILE_PATH"] = path
    device = "cpu"
    dataloader_kwargs = {}
    cpu_count = 2 if mp.cpu_count() > 2 else mp.cpu_count()

    torch.manual_seed(1)
    model = Net().to(device)
    model.share_memory()  # gradients are allocated lazily, so they are not shared here

    processes = []
    for rank in range(cpu_count):
        p = mp.Process(target=train, args=(rank, model, device, dataloader_kwargs))
        # We first train the model across `cpu_count` processes
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    trial = create_trial(out_dir)
    assert trial.num_workers == 1  # Ensure only one worker saved data
    assert len(trial.tensor_names()) > 20  # Ensure that data was saved
    assert trial.steps() == [0, 1, 2, 3]  # Ensure that steps were saved

    shutil.rmtree(out_dir, ignore_errors=True)
    shutil.rmtree(data_dir, ignore_errors=True)
def __init__(self):
    mp.set_start_method('spawn')
    self._top_N = cf().path["inference"]["top_N"]
    self._using_gpu = cf().path["inference"]["using_gpu"]
    self._device = torch.device(cf().path["system"]["device"])
    self._client_len = 0
    self._sku = 0
    self._user_data = OrderedDict()
    self._product_data = OrderedDict()
    self.user_col_name = OrderedDict()
    self.whole_user_col_name = OrderedDict()
    self.product_col_name = OrderedDict()
    self.whole_product_col_name = OrderedDict()
    # set process count
    self._num_processes = max(1, int(mp.cpu_count() * 0.6))
    self._num_sampler_processes = max(1, int(mp.cpu_count() * 0.2))
    self._using_gpu = False  # NOTE: overrides the value read from the config above
    self._sampler_flag = mp.Manager().list()
    self.load_user_raw_data()
    self.load_product_raw_data()
def main(args):
    if args.load is False and os.path.isfile('./model/walker.pt'):
        while True:
            load = input('Are you sure you want to erase the previous training? (y/n) ')
            if load.lower() in ('y', 'yes', '1'):
                break
            elif load.lower() in ('n', 'no', '0'):
                import sys
                sys.exit()

    # create shared variables between all the processes
    manager = mp.Manager()
    # used to send the results of the net
    common_dict = manager.dict()
    # a queue of batches to be fed to the training net
    mem_queue = manager.Queue(1500 * mp.cpu_count())
    # a queue of operations pending
    process_queue = manager.Queue(mp.cpu_count() - 1)

    workers = mp.cpu_count() if args.train else 2
    with mp.Pool(workers) as pool:
        try:
            print(f"Running pool with {workers} workers")
            pool.apply_async(gpu_thread, (args.load, mem_queue, process_queue, common_dict, 0))
            if args.render:
                pool.apply_async(cpu_thread, (2 if not args.train else 1, mem_queue, process_queue, common_dict, 1))
            for i in range(1 + args.render, workers):
                pool.apply_async(cpu_thread, (0, mem_queue, process_queue, common_dict, i))
            # Wait for children to finish
            pool.close()
            pool.join()
        except KeyboardInterrupt:
            # terminate() before join(): joining a pool that is still running raises ValueError
            pool.terminate()
            pool.join()
def collect_policy_losses(self):
    policy_losses = []
    if self.multi_process:
        game_queue = mp.Queue()
        done_queue = mp.Queue()
        # submit games
        for n in range(self.num_games):
            game_queue.put(self.setup_games(n))
        # start processes
        for i in range(mp.cpu_count()):
            mp.Process(target=self_play_multi, args=(game_queue, done_queue)).start()
        for _ in range(self.num_games):
            color, reward, policy_loss = done_queue.get()
            self.self_play_log(color, reward, policy_loss)
            policy_losses.append(policy_loss)
        # stop the processes
        for _ in range(mp.cpu_count()):
            game_queue.put('STOP')
        return policy_losses
    else:
        for n in range(self.num_games):
            args = self.setup_games(n)
            color, reward, policy_loss = self_play(*args)
            self.self_play_log(color, reward, policy_loss)
            policy_losses.append(policy_loss)
        return policy_losses
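# Hypothetical worker sketch (an assumption, not taken from the snippet above): a
# self_play_multi worker of the kind started above would keep pulling game setups
# from game_queue until it sees the 'STOP' sentinel, pushing each result to done_queue.
def self_play_multi(game_queue, done_queue):
    # iter(queue.get, 'STOP') calls get() repeatedly and stops at the sentinel value
    for game_args in iter(game_queue.get, 'STOP'):
        color, reward, policy_loss = self_play(*game_args)
        done_queue.put((color, reward, policy_loss))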
def calc_chunksize(num_dicts, min_chunksize=4, max_chunksize=2000, max_processes=128):
    if mp.cpu_count() > 3:
        num_cpus = min(mp.cpu_count() - 1 or 1, max_processes)  # -1 to keep a CPU core free for xxx
    else:
        num_cpus = min(mp.cpu_count(), max_processes)  # when there are few cores, we use all of them
    dicts_per_cpu = np.ceil(num_dicts / num_cpus)
    # automatic adjustment of multiprocessing chunksize
    # for small files (containing few dicts) we want small chunksize to utilize all available cores but never less
    # than 2, because we need it to sample another random sentence in LM finetuning
    # for large files we want to minimize processor spawning without giving too much data to one process, so we
    # clip it at 5k
    multiprocessing_chunk_size = int(
        np.clip(np.ceil(dicts_per_cpu / 5), a_min=min_chunksize, a_max=max_chunksize))
    # This lets us avoid cases in lm_finetuning where a chunk only has a single doc and hence cannot pick
    # a valid next sentence substitute from another document
    if num_dicts != 1:
        while num_dicts % multiprocessing_chunk_size == 1:
            multiprocessing_chunk_size += 1
    dict_batches_to_process = int(num_dicts / multiprocessing_chunk_size)
    num_processes = min(num_cpus, dict_batches_to_process) or 1

    return multiprocessing_chunk_size, num_processes
def run_MCTS(args, start_idx=0, iteration=0):
    net_to_play = "%s_iter%d.pth.tar" % (args.neural_net_name, iteration)
    net = ConnectNet()
    cuda = torch.cuda.is_available()
    if cuda:
        net.cuda()

    if args.MCTS_num_processes > 1:
        logger.info("Preparing model for multi-process MCTS...")
        mp.set_start_method("spawn", force=True)
        net.share_memory()
        net.eval()

        current_net_filename = os.path.join("./model_data/", net_to_play)
        if os.path.isfile(current_net_filename):
            checkpoint = torch.load(current_net_filename)
            net.load_state_dict(checkpoint['state_dict'])
            logger.info("Loaded %s model." % current_net_filename)
        else:
            torch.save({'state_dict': net.state_dict()}, os.path.join("./model_data/", net_to_play))
            logger.info("Initialized model.")

        processes = []
        if args.MCTS_num_processes > mp.cpu_count():
            num_processes = mp.cpu_count()
            logger.info("Required number of processes exceeds number of CPUs! Setting MCTS_num_processes to %d" % num_processes)
        else:
            num_processes = args.MCTS_num_processes
        logger.info("Spawning %d processes..." % num_processes)
        with torch.no_grad():
            for i in range(num_processes):
                p = mp.Process(target=MCTS_self_play, args=(net, args.num_games_per_MCTS_process, start_idx, i, args, iteration))
                p.start()
                processes.append(p)
            for p in processes:
                p.join()
        logger.info("Finished multi-process MCTS!")
    elif args.MCTS_num_processes == 1:
        logger.info("Preparing model for MCTS...")
        net.eval()
        current_net_filename = os.path.join("./model_data/", net_to_play)
        if os.path.isfile(current_net_filename):
            checkpoint = torch.load(current_net_filename)
            net.load_state_dict(checkpoint['state_dict'])
            logger.info("Loaded %s model." % current_net_filename)
        else:
            torch.save({'state_dict': net.state_dict()}, os.path.join("./model_data/", net_to_play))
            logger.info("Initialized model.")
        with torch.no_grad():
            MCTS_self_play(net, args.num_games_per_MCTS_process, start_idx, 0, args, iteration)
        logger.info("Finished MCTS!")
def calc_chunksize(num_dicts):
    MIN_CHUNKSIZE = 4
    MAX_CHUNKSIZE = 2000
    num_cpus = mp.cpu_count() or 1
    dicts_per_cpu = np.ceil(num_dicts / num_cpus)
    # automatic adjustment of multiprocessing chunksize
    # for small files (containing few dicts) we want small chunksize to utilize all available cores but never less
    # than 2, because we need it to sample another random sentence in LM finetuning
    # for large files we want to minimize processor spawning without giving too much data to one process, so we
    # clip it at 5k
    multiprocessing_chunk_size = int(
        np.clip(np.ceil(dicts_per_cpu / 5), a_min=MIN_CHUNKSIZE, a_max=MAX_CHUNKSIZE))
    dict_batches_to_process = int(num_dicts / multiprocessing_chunk_size)
    num_cpus_used = min(mp.cpu_count(), dict_batches_to_process) or 1

    return multiprocessing_chunk_size, num_cpus_used
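# Hypothetical usage sketch (an assumption, not part of the original snippets): the
# (chunk size, worker count) pair returned by calc_chunksize can be fed straight into
# a multiprocessing Pool. The worker function below is a toy stand-in for the real
# per-chunk conversion work.
def _toy_worker(x):
    return x * x

if __name__ == "__main__":
    chunk_size, num_workers = calc_chunksize(num_dicts=10000)
    with mp.Pool(processes=num_workers) as pool:
        results = pool.map(_toy_worker, range(10000), chunksize=chunk_size)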
def calculate_best_energy(self):
    if self.n_spins <= 10:
        # Generally, for small systems the time taken to start multiple processes is not worth it.
        res = self.calculate_best_brute()
    else:
        # Start up processing pool
        n_cpu = int(mp.cpu_count() / 2)  # must be an int: it is used as the point count for np.linspace
        pool = mp.Pool(mp.cpu_count())

        # Split up state trials across the number of cpus
        iMax = 2 ** (self.n_spins)
        args = np.round(np.linspace(0, np.ceil(iMax / n_cpu) * n_cpu, n_cpu + 1))
        arg_pairs = [list(args) for args in zip(args, args[1:])]

        # Try all the states.
        # res = pool.starmap(self._calc_over_range, arg_pairs)
        try:
            res = pool.starmap(self._calc_over_range, arg_pairs)
            # Return the best solution.
            idx_best = np.argmin([e for e, s in res])
            res = res[idx_best]
        except Exception as e:
            # Falling back to single-thread implementation.
            # res = self.calculate_best_brute()
            res = self._calc_over_range(0, 2 ** (self.n_spins))
        finally:
            # No matter what happens, make sure we tidy up after ourselves.
            pool.close()

    if self.spin_basis == SpinBasis.BINARY:
        # convert {1,-1} --> {0,1}
        best_score, best_spins = res
        best_spins = (1 - best_spins) / 2
        res = best_score, best_spins

    if self.optimisation_target == OptimisationTarget.CUT:
        best_energy, best_spins = res
        best_cut = self.calculate_cut(best_spins)
        res = best_cut, best_spins
    elif self.optimisation_target == OptimisationTarget.ENERGY:
        pass
    else:
        raise NotImplementedError()

    return res
def transform_batch_model(model, data, batchsize, logj=None, start=0, end=None, param=None, pool=None, nocuda=False):
    if logj is None:
        logj = torch.zeros(len(data), device=data.device)
    if pool is None:
        _transform_batch_model(model, data, logj, 0, batchsize, start=start, end=end, param=param, nocuda=nocuda)
    else:
        if torch.cuda.is_available() and not nocuda:
            nprocess = torch.cuda.device_count()
        else:
            nprocess = mp.cpu_count()
        param0 = [(model, data, logj, i, batchsize, len(data) * i // nprocess, len(data) * (i + 1) // nprocess,
                   start, end, param, nocuda) for i in range(nprocess)]
        pool.starmap(_transform_batch_model, param0)
    return data, logj
def transform_batch_layer(layer, data, batchsize, logj=None, direction='forward', param=None, pool=None, nocuda=False):
    assert direction in ['forward', 'inverse']
    if logj is None:
        logj = torch.zeros(len(data), device=data.device)
    if pool is None:
        _transform_batch_layer(layer, data, logj, 0, batchsize, direction=direction, param=param, nocuda=nocuda)
    else:
        if torch.cuda.is_available() and not nocuda:
            nprocess = torch.cuda.device_count()
        else:
            nprocess = mp.cpu_count()
        param0 = [(layer, data, logj, i, batchsize, len(data) * i // nprocess, len(data) * (i + 1) // nprocess,
                   direction, param, nocuda) for i in range(nprocess)]
        pool.starmap(_transform_batch_layer, param0)
    return data, logj
def calc_write_learning_curve(exp: Experiment, max_num_workers=40):
    num_workers = min(min(max_num_workers, multiprocessing.cpu_count() - 1), exp.num_folds)
    name = exp.name
    print("got %d evaluations to calculate" % len(exp.jobs))
    results_path = results_folder + "/" + name
    os.makedirs(results_path, exist_ok=True)

    start = time()
    scores = calc_scores(exp.score_task, [split for train_size, split in exp.jobs], n_jobs=num_workers)
    duration = time() - start

    meta_data = {
        "duration": duration,
        "num-workers": num_workers,
        "experiment": str(exp),
    }
    data_io.write_json(results_path + "/meta_datas.json", meta_data)
    print("calculating learning-curve for %s took %0.2f seconds" % (name, duration))
    pprint(scores)

    results = groupandsort_by_first(zip([train_size for train_size, _ in exp.jobs], scores))
    data_io.write_json(results_path + "/learning_curve.json", results)
    trainsize_to_mean_std_scores = {
        train_size: tuple_2_dict(calc_mean_and_std(m)) for train_size, m in results.items()
    }
    data_io.write_json(
        results_path + "/learning_curve_meanstd.json",
        trainsize_to_mean_std_scores,
    )
def recognize_smoke_worker(rank, world_size, learner, transform, rgb_p, file_name_list, ct_sub_list, parallel, smoke_thr, activation_thr, queue):
    # Set the dataloader
    num_workers = max(mp.cpu_count() - 2, 0)
    dataloader = set_dataloader(rank, world_size, file_name_list, ct_sub_list, rgb_p, transform, num_workers, parallel)

    # Set model
    p_model = "../data/saved_i3d/paper_result/full-augm-rgb/55563e4-i3d-rgb-s3/model/573.pt"
    model = learner.set_model(rank, world_size, learner.mode, p_model, parallel, phase="test")
    model.train(False)  # set model to evaluate mode (IMPORTANT)
    grad_cam = GradCam(model, use_cuda=learner.use_cuda, normalize=False)

    # Iterate over batch data
    smoke_pb_list = []
    activation_ratio_list = []
    epochtime_list = []
    for d in tqdm.tqdm(dataloader):
        epochtime_list.append(int(d["epochtime"][0]))
        # Compute probability of having smoke
        v = d["frames"][0]
        if learner.use_cuda and torch.cuda.is_available():  # is_available() must be called, not just referenced
            v = v.cuda()
        pred, pred_upsample = learner.make_pred(model, v, upsample=None)
        pred = F.softmax(pred.squeeze().transpose(0, 1)).cpu().detach().numpy()[:, 1]
        pred_upsample = F.softmax(pred_upsample.squeeze().transpose(0, 1)).cpu().detach().numpy()[:, 1]
        smoke_pb = np.median(pred)  # use the median as the probability
        smoke_pb_list.append(round(float(smoke_pb), 3))
        # GradCAM (class activation mapping)
        # Compute the ratio of the activated region that will affect the probability
        # This can potentially be used to estimate the number of smoke pixels
        # Need to check more papers about weakly supervised learning
        C = grad_cam.generate_cam(v, 1)  # 1 is the target class, which means having smoke emissions
        C = C.reshape((C.shape[0], -1))
        #print(pd.DataFrame(data={"GradCAM": C.flatten()}).describe().applymap(lambda x: "%.3f" % x))
        if smoke_pb > smoke_thr:  # only compute the activation ratio when smoke is predicted
            C = np.multiply(C > activation_thr, 1)  # make the binary mask
            activation_ratio = np.sum(C, axis=1, dtype=np.uint32) / (learner.image_size ** 2)
            activation_ratio[pred_upsample < smoke_thr] = 0
            activation_ratio = np.mean(activation_ratio)  # use the mean as the activation ratio
            activation_ratio_list.append(round(float(activation_ratio), 3))
        else:
            activation_ratio_list.append(0.0)

    if queue is None:
        return (smoke_pb_list, activation_ratio_list, epochtime_list)
    else:
        queue.put((smoke_pb_list, activation_ratio_list, epochtime_list))
def run_multiple_times(args, run_fct):
    cpu_count = mp.cpu_count()
    gpu_count = torch.cuda.device_count()

    # Clone arguments into list & Distribute workload across GPUs
    args_across_workers = [copy.deepcopy(args) for r in range(args.RUN_TIMES)]
    if gpu_count > 0:
        gpu_counter = 0
        for r in range(args.RUN_TIMES):
            args_across_workers[r].seed = r
            args_across_workers[r].device_id = gpu_counter
            gpu_counter += 1
            if gpu_counter > gpu_count - 1:
                gpu_counter = 0

    # Execute different runs/random seeds in parallel
    pool = mp.Pool(cpu_count - 1)
    df_across_runs = pool.map(run_fct, args_across_workers)
    pool.close()

    # Post process results
    df_concat = pd.concat(df_across_runs)
    by_row_index = df_concat.groupby(df_concat.index)
    df_means, df_stds = by_row_index.mean(), by_row_index.std()
    if args.ENV_ID == "dense-v0":
        sfname = "results/GRIDWORLD/" + str(args.RUN_TIMES) + "_RUNS_" + str(args.AGENT) + "_" + args.SAVE_FNAME
    else:
        sfname = "results/ATARI/" + str(args.RUN_TIMES) + "_RUNS_" + str(args.AGENT) + "_" + args.SAVE_FNAME
    print("Saved agents to {}".format(sfname))
    df_means.to_csv(sfname)
    return df_means, df_stds
def __init__(self, video_vis, n_workers=None):
    """
    Args:
        video_vis: the video visualizer used by each worker process.
        n_workers (Optional[int]): number of CPUs for running video visualizer.
            If not given, use all CPUs.
    """
    num_workers = mp.cpu_count() if n_workers is None else n_workers

    self.task_queue = mp.Queue()
    self.result_queue = mp.Queue()
    self.get_indices_ls = []
    self.procs = []
    self.result_data = {}
    self.put_id = -1
    for _ in range(max(num_workers, 1)):
        self.procs.append(
            AsyncVis._VisWorker(video_vis, self.task_queue, self.result_queue))

    for p in self.procs:
        p.start()

    atexit.register(self.shutdown)
def get_pred_large(pan_2ch_all, vid_num, nframes_per_video=6):
    vid_num = len(pan_2ch_all) // nframes_per_video  # 10
    cpu_num = multiprocessing.cpu_count() // 2  # 32 --> 16
    nprocs = min(vid_num, cpu_num)  # 10
    max_nframes = cpu_num * nframes_per_video
    nsplits = (len(pan_2ch_all) - 1) // max_nframes + 1
    annotations, pan_all = [], []
    for i in range(0, len(pan_2ch_all), max_nframes):
        print('==> Read and convert VPS output - split %d/%d' % ((i // max_nframes) + 1, nsplits))
        pan_2ch_part = pan_2ch_all[i:min(i + max_nframes, len(pan_2ch_all))]
        pan_2ch_split = np.array_split(pan_2ch_part, nprocs)
        workers = multiprocessing.Pool(processes=nprocs)
        processes = []
        for proc_id, pan_2ch_set in enumerate(pan_2ch_split):
            p = workers.apply_async(self.converter_2ch_track_core, (proc_id, pan_2ch_set, color_generator))
            processes.append(p)
        workers.close()
        workers.join()
        for p in processes:
            p = p.get()
            annotations.extend(p[0])
            pan_all.extend(p[1])

    pan_json = {'annotations': annotations}
    return pan_all, pan_json
def begin_background(self):
    self.queue = mp.Queue()

    def t(queue):
        while True:
            if queue.empty():
                continue
            img, name = queue.get()
            if name:
                try:
                    basename, ext = os.path.splitext(name)
                    if ext != '.png':
                        name = '{}.png'.format(basename)
                    imageio.imwrite(name, img)
                except Exception as e:
                    print(e)
            else:
                return

    worker = lambda: mp.Process(target=t, args=(self.queue,), daemon=False)
    cpu_count = min(8, mp.cpu_count() - 1)
    self.process = [worker() for _ in range(cpu_count)]
    for p in self.process:
        p.start()
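# Hypothetical companion sketch (an assumption, not part of the original snippet): the
# background worker above returns when it receives a falsy name, so a matching shutdown
# method could push one (None, '') sentinel per process, wait for the queue to drain,
# and then join the workers. Assumes `time` is imported at module level.
def end_background(self):
    for _ in self.process:
        self.queue.put((None, ''))
    while not self.queue.empty():
        time.sleep(0.5)
    for p in self.process:
        p.join()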
def whole_images(training_set, validation_set):
    training_batch_size = 16
    validation_batch_size = 2 * training_batch_size
    training_loader = torch.utils.data.DataLoader(
        training_set,
        batch_size=training_batch_size,
        shuffle=True,
        num_workers=multiprocessing.cpu_count(),
    )
    validation_loader = torch.utils.data.DataLoader(
        validation_set,
        batch_size=validation_batch_size,
        num_workers=multiprocessing.cpu_count(),
    )
def update_holdout_chromosomes(self, holdout_set):
    images = []
    time_zero = time.time()
    block_size = HDF_MULTI_BLOCK_SIZE
    loc_chunks = list(range(0, self.__len__(), block_size))
    num_processes = multiprocessing.cpu_count()
    print('Process with %d processors' % num_processes)
    with multiprocessing.Pool(num_processes) as pool:
        f = functools.partial(process_location, block_size=block_size, path=self.h5_path, holdout_set=holdout_set)
        for map_image in tqdm.tqdm(pool.imap_unordered(f, loc_chunks), total=len(loc_chunks)):
            images.extend(map_image)
    print('Saving num output locations:')
    print(len(images))
    # Not necessary -- just for debug consistency -- remove to save time if big
    images.sort()
    print(images[:10])
    print('Took %.2fs to process %d loc with %d processes' % (time.time() - time_zero, self.__len__(), num_processes))
    for idx in tqdm.tqdm(images, total=len(images)):
        self.chromosome_holdout[idx] = True
def add_batched_coordinates(self, coords, lr=1, avg=1):
    start_time = time()
    num_procs = cpu_count()
    self.share_memory()
    processes = []

    # sort and bin into layers
    params_coords = {}
    sorted_coords = sorted(coords, key=lambda x: x[0][0])
    for coord_val_pair in sorted_coords:
        layer = coord_val_pair[0][0]
        if layer in params_coords:
            params_coords[layer].append(coord_val_pair)
        else:
            params_coords[layer] = [coord_val_pair]

    # update parameters in parallel
    for layer_index in params_coords.keys():
        p = Process(target=self.add_coordinates, args=(layer_index, params_coords[layer_index], lr, avg,))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    self.log.info('time: {} s'.format(time() - start_time))
def __init__(self, params, model_path):
    self.params = params
    self.model_path = model_path
    self.num_of_processes = mp.cpu_count()
    self.global_model = ActorCritic(self.params.stack_size, get_action_space())
    self.global_model.share_memory()
def evaluate(self, batch_size=100, cpu=-1, filtering=True) -> Dict[str, float]:
    """Evaluates a model by retrieving scores from the (implemented) score_batch function.

    :param batch_size: Size of a test batch
    :param cpu: Number of processors to use, -1 means all processors are used.
    :param filtering: Whether to apply filtering during evaluation.
    :return: Dictionary containing the evaluation results (keys: 'hits@1', 'hits@3', 'hits@10', 'mrr')
    """
    self.filtering = filtering
    start = time.time()
    n_batches, batches = self.dl.get_test_batches(batch_size)
    if cpu == 1 or cpu == 0:
        result = []
        for batch in tqdm(batches, total=n_batches):
            result.append(self.evaluate_batch(batch))
    elif cpu == -1:
        pool = mp.Pool(mp.cpu_count())
        result = pool.map(self.evaluate_batch, batches)
    else:
        pool = mp.Pool(cpu)
        result = pool.map(self.evaluate_batch, batches)
    print('Evaluation took {:.3f} seconds'.format(time.time() - start))
    return self.get_result(result)
def run_multiple_times(args, run_fct):
    cpu_count = mp.cpu_count()
    gpu_count = torch.cuda.device_count()

    # Clone arguments into list & Distribute workload across GPUs
    args_across_workers = [copy.deepcopy(args) for r in range(args.RUN_TIMES)]
    if gpu_count > 0:
        gpu_counter = 0
        for r in range(args.RUN_TIMES):
            args_across_workers[r].device_id = gpu_counter
            gpu_counter += 1
            if gpu_counter > gpu_count - 1:
                gpu_counter = 0

    # Execute different runs/random seeds in parallel
    pool = mp.Pool(cpu_count - 1)
    df_across_runs = pool.map(run_fct, args_across_workers)
    pool.close()

    # Post process results
    df_concat = pd.concat(df_across_runs)
    by_row_index = df_concat.groupby(df_concat.index)
    df_means, df_stds = by_row_index.mean(), by_row_index.std()
    if args.SAVE:
        df_means.to_csv("logs/" + args.SAVE_FNAME + ".csv")
    return df_means, df_stds
def get_gt(pan_gt_json_file=None, pan_gt_folder=None):
    if pan_gt_json_file is None:
        pan_gt_json_file = self.panoptic_json_file
    if pan_gt_folder is None:
        pan_gt_folder = self.panoptic_gt_folder
    with open(pan_gt_json_file, 'r') as f:
        pan_gt_json = json.load(f)
    files = [item['file_name'] for item in pan_gt_json['images']]
    if 'viper' in pan_gt_folder:
        files = [_.split('/')[-1].replace('.jpg', '.png') for _ in files]
    cpu_num = multiprocessing.cpu_count()
    files_split = np.array_split(files, cpu_num)
    workers = multiprocessing.Pool(processes=cpu_num)
    processes = []
    for proc_id, files_set in enumerate(files_split):
        p = workers.apply_async(BaseDataset._load_image_single_core, (proc_id, files_set, pan_gt_folder))
        processes.append(p)
    workers.close()
    workers.join()
    pan_gt_all = []
    for p in processes:
        pan_gt_all.extend(p.get())

    categories = pan_gt_json['categories']
    categories = {el['id']: el for el in categories}
    color_generator = IdGenerator(categories)
    return pan_gt_all, pan_gt_json, categories, color_generator
def _get_dataset(self, filename):
    dicts = self.processor._file_to_dicts(filename)
    # shuffle list of dicts here if we later want to have a random dev set split from the train set
    if filename == self.processor.train_filename:
        if not self.processor.dev_filename:
            if self.processor.dev_split > 0.0:
                random.shuffle(dicts)  # random.shuffle shuffles in place and returns None

    dict_batches_to_process = int(len(dicts) / self.multiprocessing_chunk_size)
    num_cpus = min(mp.cpu_count(), self.max_processes, dict_batches_to_process) or 1

    with ExitStack() as stack:
        p = stack.enter_context(mp.Pool(processes=num_cpus))

        logger.info(
            f"Got ya {num_cpus} parallel workers to convert dict chunks to datasets (chunksize = {self.multiprocessing_chunk_size})..."
        )
        log_ascii_workers(num_cpus, logger)

        results = p.imap(
            partial(self._multiproc, processor=self.processor),
            grouper(dicts, self.multiprocessing_chunk_size),
            chunksize=1,
        )

        datasets = []
        for dataset, tensor_names in tqdm(results, total=len(dicts) / self.multiprocessing_chunk_size):
            datasets.append(dataset)

        concat_datasets = ConcatDataset(datasets)
        return concat_datasets, tensor_names
def __init__(self, loader):
    self.loader = loader
    self.data_source = loader.data_source
    self.batch_size = loader.batch_size
    self.token_field = loader.token_field
    self.keyphrases_field = loader.keyphrases_field
    self.lazy_loading = loader.lazy_loading
    self.num_workers = multiprocessing.cpu_count() // 2 or 1

    if self.loader.mode == TRAIN_MODE:
        self.chunk_size = self.batch_size * 5
    else:
        self.chunk_size = self.batch_size
    self._data = self.load_data(self.chunk_size)
    self._batch_count_in_output_queue = 0
    self._redundant_batch = []
    self.workers = []
    self.worker_shutdown = False
    if self.loader.mode in {TRAIN_MODE, EVAL_MODE}:
        self.input_queue = multiprocessing.Queue(-1)
        self.output_queue = multiprocessing.Queue(-1)
        self.__prefetch()
        for _ in range(self.num_workers):
            worker = multiprocessing.Process(target=self._data_worker_loop)
            self.workers.append(worker)
        for worker in self.workers:
            worker.daemon = True
            worker.start()
def test(n=10000):
    n_cpus = mp.cpu_count()
    print("num cpus: ", n_cpus)
    p = Pool(n_cpus)

    import time
    # s1 = time.time()
    data = []
    for i in range(n):
        inp_val = np.random.random(size=10)
        vec_val = np.random.random(size=10)
        data.append((inp_val, vec_val))
    #
    # res = p.map(compute_hvp, data)
    # e1 = time.time()
    # print ("Time 1: ", (e1-s1))

    s2 = time.time()
    for i in range(n):
        inp_val, vec_val = data[i]
        inp = Variable(torch.FloatTensor([inp_val]), requires_grad=True)
        v = Variable(torch.FloatTensor([vec_val]), requires_grad=False)
        z = three_sin(inp)
        l = F.mse_loss(z, torch.zeros_like(z))
        # hvp_rop_lop = Hvp_RopLop(f, inp, v)
        # print ("hvp: ", hvp_rop_lop.data)
        # hvp_dbl_bp = Hvp_dbl_bp(l, inp, v)
        # print ("hvp: ", hvp_dbl_bp.data)
        # print ("hvp: ", hvp_rop_lop.data, hvp_dbl_bp.data)
        gnvp_roplop = GNvp_RopLop(l, z, inp, v)
    e2 = time.time()
    print("Time 2: ", (e2 - s2))
def train(rank, args, model, barrier, rankstart, rankstop):
    if args.tp:
        os.system("taskset -apc %d %d" % (rank % mp.cpu_count(), os.getpid()))
    torch.manual_seed(args.seed + rank)
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True, num_workers=1)
    optimizer = BATCH_PARTITIONED_SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
    gamma = 0.9 + torch.rand(1).item() / 10
    scheduler = MyLR(optimizer, gamma)  # lrs.ReduceLROnPlateau(optimizer, 'min', gamma)

    for epoch in range(1, args.epochs + 1):
        # scheduler.step()
        print("Training: Epoch = " + str(epoch))
        loss = train_epoch(epoch, args, model, train_loader, optimizer, rankstart, rankstop)
        barrier[rank] += 1
        print("TrainError = " + str('%.6f' % loss.item()) + "\n")
def train(env_params, model_path, episodes=200, episode_length=50):
    print('Actor-Critic training')
    # Global network
    env = PoolEnv(**env_params)
    gnet = Net(env.state_space.n, env.action_space.n, HIDDEN_DIM, action_ranges=env.action_space.ranges)
    gnet.share_memory()  # share the global parameters in multiprocessing
    opt = SharedAdam(gnet.parameters(), lr=LR)  # global optimizer
    global_ep, global_ep_r = mp.Value('i', 0), mp.Value('d', 0.)  # 'i': int, 'd': double

    # Parallel training
    workers = [
        Worker(gnet, opt, global_ep, global_ep_r, i, env_params, HIDDEN_DIM, episodes, episode_length, model_path)
        for i in range(mp.cpu_count() // 2)
    ]
    for w in workers:
        w.start()
    for w in workers:
        w.join()

    save_model(model_path, gnet)
def init_worker_pool(args):
    """ Creates the worker pool for drmsd batch computation. Does nothing if sequential. """
    torch.multiprocessing.set_start_method("spawn")
    return torch.multiprocessing.Pool(mp.cpu_count()) if not args.sequential_drmsd_loss else None
def test_dataloader(self):
    batch_size = self.hparams.batch_size_test
    num_workers = cpu_count()
    return DataLoader(self.test_data, batch_size=batch_size, shuffle=False, num_workers=num_workers)