def cast_data(header, tablename, data):
    """Cast every cell of ``data`` according to the SQL type of its column.

    One caster is chosen per entry of ``header`` from the type dictionary
    returned by ``get_typedict(tablename)``:

    * ``text_type`` -> ``value.encode('UTF-8')``
    * ``int_type``  -> ``int``
    * ``date_type`` -> ``timestamp_parser.parse``

    :param header: column names, in the same order as the cells of a row
    :param tablename: name of the table whose type dictionary is used
    :param data: sequence of rows; each row is an indexable sequence of cells
    :return: list with one list of cast cells per input row
    :raises ValueError: if a column has an SQL type without a known caster
    """
    typedict = get_typedict(tablename)

    type_casters = []
    for column in header:
        sql_type = typedict[column]
        if sql_type == text_type:
            type_casters.append(lambda value: value.encode('UTF-8'))
        elif sql_type == int_type:
            type_casters.append(int)
        elif sql_type == date_type:
            type_casters.append(timestamp_parser.parse)
        else:
            # Without a caster for this column every following column would
            # silently be cast with its neighbour's caster, so fail loudly.
            raise ValueError(
                'no caster for column %r with SQL type %r' % (column, sql_type))

    log('casting data for ' + str(len(data)) + " rows")

    def cast_row(row):
        # Index-based lookup on purpose: a row longer than the header raises
        # IndexError instead of being silently truncated.
        return [type_casters[col_id](cell) for col_id, cell in enumerate(row)]

    tpool = Pool(processes=6)
    try:
        return tpool.map(cast_row, data)
    finally:
        # Release the workers even when a cast fails part-way through.
        tpool.close()
        tpool.join()
def __init__(self, observables: List[AgentHandler], actionables: List[AgentHandler],
             mission_handlers: List[AgentHandler], nsteps, gamma, data_directory,
             num_workers, worker_batch_size, min_size_to_dequeue):
    """
    Store the loader configuration and create its worker pool and queues.

    :param observables: handlers stored as ``self.observables``
    :param actionables: handlers stored as ``self.actionables``
    :param mission_handlers: handlers stored as ``self.mission_handlers``
    :param nsteps: stored as ``self.nsteps``
    :param gamma: stored as ``self.gamma``
    :param data_directory: the directory of the data to be loaded, eg:
        'minerl.herobraine_parse/output/rendered/'
    :param num_workers: size of the processing pool
    :param worker_batch_size: stored as ``self.worker_batch_size``; also used
        to size the shared data queue
    :param min_size_to_dequeue: stored as ``self.size_to_dequeue``; also used
        to size both queues
    """
    # Handlers and data location.
    self.data_dir = data_directory
    self.observables, self.actionables = observables, actionables
    self.mission_handlers = mission_handlers
    # self.vectorizer = vectorizer

    # Worker / batching configuration.
    self.number_of_workers = num_workers
    self.worker_batch_size = worker_batch_size
    self.size_to_dequeue = min_size_to_dequeue
    self.nsteps, self.gamma = nsteps, gamma

    # The pool is created before the manager, as in the original ordering.
    self.processing_pool = Pool(self.number_of_workers)
    self.m = multiprocessing.Manager()

    # Shared queue: four times the number of worker batches per dequeue.
    batches_per_dequeue = self.size_to_dequeue // self.worker_batch_size
    self.data_queue = self.m.Queue(maxsize=batches_per_dequeue * 4)

    # Local priority queue: four times the dequeue threshold.
    pool_size = self.size_to_dequeue * 4
    self.random_queue = PriorityQueue(maxsize=pool_size)
def process_experiment(_experiment, _overwrite=False):
    """Run ``process_series`` in parallel for every image series of an experiment.

    The series id is parsed as the integer after the first ``'_'`` of each
    name returned by ``paths.image_files(paths.serieses(_experiment))``.

    :param _experiment: experiment identifier passed through to ``process_series``
    :param _overwrite: passed through to ``process_series`` unchanged
    """
    _arguments = [
        (_experiment, int(_series.split('_')[1]), _overwrite)
        for _series in paths.image_files(paths.serieses(_experiment))
    ]

    # The context manager tears the pool down even if a worker raises;
    # close/join lets the workers exit cleanly on the success path.
    with Pool(CPUS_TO_USE) as _p:
        _p.starmap(process_series, _arguments)
        _p.close()
        _p.join()
def compute_simulations_fiber_densities(_simulations):
    """Compute fiber densities over time for four windows around each simulation's cell.

    For every simulation one task is built per direction (left, right, up,
    down). For 'up' and 'down' the window lengths and offsets are swapped
    between the x and y axes. All tasks are evaluated in a process pool with
    ``compute.window_fiber_density_by_time``.

    :param _simulations: iterable of simulation identifiers
    :return: dict mapping ``(simulation, direction)`` to the computed value
    """
    _vertical_directions = ['up', 'down']

    _arguments = []
    for _simulation in _simulations:
        for _direction in ['left', 'right', 'up', 'down']:
            if _direction in _vertical_directions:
                _length_x = config.QUANTIFICATION_WINDOW_HEIGHT_IN_CELL_DIAMETER
                _length_y = config.QUANTIFICATION_WINDOW_WIDTH_IN_CELL_DIAMETER
                _offset_x, _offset_y = OFFSET_Y, OFFSET_X
            else:
                _length_x = config.QUANTIFICATION_WINDOW_WIDTH_IN_CELL_DIAMETER
                _length_y = config.QUANTIFICATION_WINDOW_HEIGHT_IN_CELL_DIAMETER
                _offset_x, _offset_y = OFFSET_X, OFFSET_Y

            _arguments.append({
                'simulation': _simulation,
                'length_x': _length_x,
                'length_y': _length_y,
                'offset_x': _offset_x,
                'offset_y': _offset_y,
                'cell_id': 'cell',
                'direction': _direction,
                'time_points': TIME_POINTS
            })

    _fiber_densities = {}
    with Pool(CPUS_TO_USE) as _p:
        # Results arrive in completion order; each carries its own task dict
        # back as the key, so ordering does not matter.
        _results = _p.imap_unordered(compute.window_fiber_density_by_time, _arguments)
        _progress = tqdm(_results, total=len(_arguments),
                         desc='Computing windows & fiber densities')
        for _keys, _value in _progress:
            _fiber_densities[(_keys['simulation'], _keys['direction'])] = _value
        _p.close()
        _p.join()

    return _fiber_densities
def synthesize_spectrograms(self, texts: List[str],
                            embeddings: Union[np.ndarray, List[np.ndarray]],
                            return_alignments=False):
    """
    Synthesizes mel spectrograms from texts and speaker embeddings.

    :param texts: a list of N text prompts to be synthesized
    :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
    :param return_alignments: if True, a matrix representing the alignments between the
    characters and each decoder output step will be returned for each spectrogram
    :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the
    sequence length of spectrogram i, and possibly the alignments.
    """
    if not self._low_mem:
        # Usual inference mode: load the model on the first request and keep it loaded.
        if not self.is_loaded():
            self.load()
        specs, alignments = self._model.my_synthesize(embeddings, texts)
    else:
        # Low memory inference mode: load the model upon every request. The model has to be
        # loaded in a separate process to be able to release GPU memory (a simple workaround
        # to tensorflow's intricacies).
        # The pool is used as a context manager so that the worker process is
        # actually torn down once the result is back; an unclosed pool would
        # keep the worker alive after the call.
        with Pool(1) as pool:
            specs, alignments = pool.starmap(
                Synthesizer._one_shot_synthesize_spectrograms,
                [(self.checkpoint_fpath, embeddings, texts)])[0]

    return (specs, alignments) if return_alignments else specs
def main():
    """Command-line entry point: parse options, load the config and run the tracker.

    With ``--nprocesses`` greater than 1 the movie's time range is split into
    intervals of equal length (in whole seconds) and ``_run_tracker`` is run
    for each interval in a process pool; otherwise ``_run_tracker`` is called
    once in this process with the requested start/end times.

    Side effects: configures logging from the log config file and rebinds the
    module-level ``_logger``.
    """
    global _logger

    parser = ArgumentParser(usage='prog [options]')
    parser.add_argument('-c', '--config', dest='config_file', required=True, metavar='CONFIG_FILE',
                        help='The full path to the config file to open')
    parser.add_argument('-l', '--log-config', default='logger.conf', dest='log_config_file',
                        metavar='LOG_CONFIG_FILE', help='The full path to the log config file to open')
    parser.add_argument('--start-frame-time', default=-1, type=int, dest='start_frame_pos',
                        help='Start frame time in seconds')
    parser.add_argument('--end-frame-time', default=-1, type=int, dest='end_frame_pos',
                        help='End frame time in seconds')
    parser.add_argument('--smooth-filter-size', default=3, type=int, dest='gaussian_filter_size',
                        help='Gaussian filter kernel size')
    parser.add_argument('--smooth-filter-sigma', default=0, type=int, dest='gaussian_filter_sigma',
                        help='Gaussian filter sigma')
    parser.add_argument('--nthreads', default=1, type=int, dest='nthreads')
    parser.add_argument('--nprocesses', default=1, type=int, dest='nprocesses',
                        help='Number of processes to run in parallel')
    args = parser.parse_args()

    # setup logger
    logging.config.fileConfig(args.log_config_file)
    _logger = logging.getLogger('tracker')

    # NOTE(review): '--config' is declared with required=True, so parse_args()
    # already exits when it is missing; this check looks unreachable.
    if args.config_file is None:
        _logger.warning('Missing config file')
        parser.exit(1, 'Missing config file\n')

    # load config file
    config, errors = load_config(args.config_file)
    # Merge validation problems into the load errors (set union).
    errors |= set(config.validate())
    if len(errors) == 0:
        if args.nprocesses > 1:
            # The movie is opened here only to resolve its effective
            # start/end times; times are passed in milliseconds.
            source = MovieFile(config.get_source(), start_msecs=args.start_frame_pos * 1000,
                               end_msecs=args.end_frame_pos * 1000, resolution=config.get_image_size())
            if not source.is_opened():
                _logger.error('Error opening %s' % config.get_source())
                return
            start_frame_pos = int(source.get_start_time_in_seconds())
            end_frame_pos = int(source.get_end_time_in_seconds())
            # NOTE(review): truncating division. If the duration in seconds is
            # smaller than nprocesses this becomes 0 and range() below raises
            # ValueError; when the duration is not a multiple of nprocesses,
            # range() yields one extra interval whose end lies past
            # end_frame_pos. Confirm whether _run_tracker tolerates that.
            frame_interval = int((end_frame_pos - start_frame_pos) / args.nprocesses)
            # One argument tuple per interval. Note that it carries a seventh
            # element (from _get_run_interval) that the single-process call
            # below does not pass.
            tracker_args = [(config, s * 1000, (s + frame_interval) * 1000, args.gaussian_filter_size,
                             args.gaussian_filter_sigma, args.nthreads, _get_run_interval(s, s + frame_interval)[1])
                            for s in range(start_frame_pos, end_frame_pos, frame_interval)
                            ]
            with Pool(args.nprocesses) as p:
                p.starmap(_run_tracker, tracker_args)
        else:
            _run_tracker(config, args.start_frame_pos * 1000, args.end_frame_pos * 1000,
                         args.gaussian_filter_size, args.gaussian_filter_sigma, args.nthreads)
    else:
        _logger.error('Config load error: %r' % errors)
def process_experiment(_experiment, _overwrite=False):
    """Run ``process_group`` in parallel for every group of an experiment.

    The tasks are built from the ``(experiment, series_id, group)`` tuples
    returned by ``load.experiment_groups_as_tuples(_experiment)``.

    :param _experiment: experiment whose groups are processed
    :param _overwrite: passed through to ``process_group`` unchanged
    """
    # The experiment stored in each tuple is forwarded (not the parameter),
    # under its own name so the parameter is no longer rebound mid-loop.
    _arguments = [
        (_tuple_experiment, _series_id, _group, _overwrite)
        for _tuple_experiment, _series_id, _group
        in load.experiment_groups_as_tuples(_experiment)
    ]

    # The context manager tears the pool down even if a worker raises;
    # close/join lets the workers exit cleanly on the success path.
    with Pool(CPUS_TO_USE) as _p:
        _p.starmap(process_group, _arguments)
        _p.close()
        _p.join()
def runMultiProcessTrajectories(self, repeat):
    """Run ``self.nTraj`` for every initial position in parallel and average the results.

    One worker process is started per entry of ``self.posIni``. Each call
    ``self.nTraj((x, y), repeat=repeat)`` must return a ``(cost, trajectory)``
    pair.

    :param repeat: forwarded to ``self.nTraj`` as the ``repeat`` keyword
    :return: tuple ``(mean cost, mean trajectory value)`` over all positions
    """
    workers = Pool(processes=len(self.posIni))
    run_from = partial(self.nTraj, repeat=repeat)
    start_positions = [(x, y) for x, y in self.posIni]
    result = workers.map(run_from, start_positions)
    workers.close()
    workers.join()

    # Both sums start from 0, exactly like the running totals they replace.
    total_cost = sum(cost for cost, _ in result)
    total_traj = sum(traj for _, traj in result)
    count = len(result)
    return total_cost / count, total_traj / count
def run(self):
    """Apply ``self.map_function`` to the reader's data and store it in ``self.output``.

    If the option named by ``PipeEnum.PARALLEL_N_PROCS`` is present in
    ``self.kwargs``, the function is mapped over the items of
    ``self.reader.data`` in a process pool of that size, using the chunk size
    from ``PipeEnum.PARALLEL_N_CHUNKS`` (default 1), and ``self.output`` is
    the list of per-item results. Otherwise ``self.map_function`` is called
    once with the whole ``self.reader.data``.
    """
    nproc = PipeEnum.PARALLEL_N_PROCS.value
    nchunks = PipeEnum.PARALLEL_N_CHUNKS.value

    if nproc in self.kwargs:
        n_processes = self.kwargs[nproc]
        chunks = self.kwargs.get(nchunks, 1)
        pool = Pool(n_processes)
        try:
            self.output = list(
                pool.map(self.map_function, self.reader.data, chunksize=chunks))
        finally:
            # Always release the worker processes; previously the pool was
            # never closed, leaving workers behind after every parallel run.
            pool.close()
            pool.join()
    else:
        # NOTE(review): here map_function receives the whole data set, while
        # the parallel branch calls it once per item — confirm this is intended.
        self.output = self.map_function(self.reader.data)
def __init__(self,
             chromosomes: Iterable[Any],
             eval_function: Callable,
             checkpoint_target: Optional[str] = None,
             concurrent_workers: Optional[int] = 1,
             maximize: bool = True,
             generation: int = 0,
             intended_size: Optional[int] = None,
             serializer=None):
    """Build a population with one ``Individual`` per chromosome.

    :param chromosomes: chromosomes to wrap in ``Individual`` objects
    :param eval_function: stored as ``self.eval_function``
    :param checkpoint_target: target handed to the default ``SimpleSerializer``
        when no ``serializer`` is given
    :param concurrent_workers: number of pool workers; with exactly ``1`` no
        pool is created and ``self.pool`` is ``None``
    :param maximize: stored as ``self.maximize``
    :param generation: stored as ``self.generation``
    :param intended_size: stored size; when falsy the number of individuals is
        used instead
    :param serializer: serializer to use instead of the default one
    """
    # Plain configuration.
    self.eval_function = eval_function
    self.maximize = maximize
    self.generation = generation
    self.concurrent_workers = concurrent_workers
    self.documented_best = None

    # Short identifier: the first six characters of a fresh UUID4.
    self.id = str(uuid4())[:6]

    self.individuals = [Individual(chromosome=chromosome) for chromosome in chromosomes]
    self.intended_size = intended_size or len(self.individuals)

    self.serializer = serializer or SimpleSerializer(target=checkpoint_target)

    if concurrent_workers == 1:
        self.pool = None
    else:
        self.pool = Pool(concurrent_workers)
def _phase_coherence(
    signal_pair: Tuple[TimeSeries, TimeSeries], params: PCParams
) -> Tuple[Tuple[TimeSeries, TimeSeries], ndarray, ndarray, ndarray, ndarray]:
    """
    Function which uses `wpc` to calculate phase coherence for a single pair of signals.
    The signals must have their wavelet transforms attached in their `output_data`
    member variable.

    :param signal_pair: tuple containing 2 signals
    :param params: the params object with parameters for the function
    :return:
    [tuple] the pair of signals;
    [2D array] the time-localised phase coherence;
    [1D array] phase coherence;
    [1D array] phase difference;
    [1D array] time-localised phase coherence of surrogates
    (left as an empty list when no surrogates are requested)
    """
    s1, s2 = signal_pair

    wt1 = s1.output_data.values
    wt2 = s2.output_data.values

    freq = s1.output_data.freq
    fs = s1.frequency

    # Generate the surrogates from the first signal.
    surr_count = params.surr_count
    surr_method = params.surr_method
    surr_preproc = params.surr_preproc
    surrogates, _ = surrogate_calc(s1, surr_count, surr_method, surr_preproc, fs)

    # Evaluate `_wt_surrogate_calc` for each surrogate in parallel.
    # The pool is scoped to this call so its workers are released on return;
    # the previous unclosed Pool() leaked a full set of workers per signal pair.
    args = [(wt1, surrogates[i], params) for i in range(surr_count)]
    with Pool() as pool:
        tpc_surr = pool.starmap(_wt_surrogate_calc, args)

    if len(tpc_surr) > 0:
        tpc_surr = np.mean(tpc_surr, axis=0)

    # Calculate phase coherence.
    tpc, pc, pdiff = wpc(wt1, wt2, freq, fs)

    return signal_pair, tpc, pc, pdiff, tpc_surr
def __init__(self, observables: List[AgentHandler], actionables: List[AgentHandler],
             mission_handlers: List[AgentHandler], data_directory, num_workers,
             worker_batch_size, min_size_to_dequeue):
    """
    Store the loader configuration and create its worker pool.

    :param observables: handlers stored as ``self.observables``
    :param actionables: handlers stored as ``self.actionables``
    :param mission_handlers: handlers stored as ``self.mission_handlers``
    :param data_directory: the directory of the data to be loaded, eg:
        'minerl.herobraine_parse/output/rendered/'
    :param num_workers: size of the processing pool
    :param worker_batch_size: stored as ``self.worker_batch_size``
    :param min_size_to_dequeue: stored as ``self.size_to_dequeue``
    """
    # Handlers and data location.
    self.data_dir = data_directory
    self.observables, self.actionables = observables, actionables
    self.mission_handlers = mission_handlers
    # self.vectorizer = vectorizer

    # Worker / batching configuration.
    self.number_of_workers = num_workers
    self.worker_batch_size = worker_batch_size
    self.size_to_dequeue = min_size_to_dequeue

    self.processing_pool = Pool(self.number_of_workers)
def clean_data():
    """Copy the ``bodies`` column from the stage-2 DB to the stage-3 DB, cleaning each row.

    The destination ``bodies`` table and its ``sqlite_sequence`` entry are
    emptied first. Rows are then read in batches of ``rows_per_loop``, every
    row is passed through ``clean_line`` in a worker pool, and the cleaned
    batch is inserted and committed before the next batch is fetched.

    The worker pool and both database connections are released when the
    function returns, including when an error interrupts the copy.
    """
    rows_per_loop = 100000
    log("")
    log("starting")
    dirty_db_path = ROOTDIR + dir_sep + "stage_2_clean.db"
    clean_db_path = ROOTDIR + dir_sep + "stage_3_cleaner.db"

    # Keep a handle on both connections so they can be closed afterwards.
    dirty_db = create_connection(dirty_db_path)
    clean_db = None
    try:
        dirty_db_cursor = dirty_db.cursor()
        clean_db = create_connection(clean_db_path)
        clean_db_cursor = clean_db.cursor()

        clean_db_cursor.execute("DELETE FROM bodies")
        clean_db_cursor.execute("delete from sqlite_sequence where name='bodies'")

        dirty_db_cursor.execute("select bodies from bodies")
        data = dirty_db_cursor.fetchmany(rows_per_loop)

        tpool = Pool(processes=4)
        try:
            locp_n = 1
            # NOTE(review): this logs the global `cpus`, but the pool above is
            # fixed at 4 processes — confirm which one is intended.
            log("detected " + str(cpus) + " as cpu count")
            inserted = 0
            query = "insert into bodies (bodies) values (?)"

            more_data = True
            while more_data:
                log("cleaning data")
                cleaned = tpool.map(clean_line, data)
                # executemany expects one parameter tuple per row.
                rows = [(line, ) for line in cleaned]

                log("inserting 100k rows")
                clean_db_cursor.executemany(query, rows)
                clean_db.commit()

                log("done loop, getting more data.")
                inserted += len(rows)
                data = dirty_db_cursor.fetchmany(rows_per_loop)

                if len(data) < 1:
                    more_data = False
                    log("end of data")
                log("done " + str(locp_n) + " loops")
                locp_n += 1
        finally:
            tpool.close()
            tpool.join()
    finally:
        for connection in (dirty_db, clean_db):
            if connection is not None:
                connection.close()

    log("done")
    log("inserted " + str(inserted) + " rows")
def test_concurrent_download_and_prepare(self):
    """Two processes preparing the same builder in one directory must both succeed.

    Each process runs ``_run_concurrent_download_and_prepare`` against the
    same temporary directory; every returned builder must report the expected
    features and 100 training examples, and the arrow and dataset-info files
    must exist on disk.
    """
    processes = 2
    with tempfile.TemporaryDirectory() as tmp_dir:
        builder_dir = os.path.join(tmp_dir, "dummy_builder", "dummy", "0.0.0")
        arrow_path = os.path.join(builder_dir, "dummy_builder-train.arrow")
        info_path = os.path.join(builder_dir, "dataset_info.json")

        with Pool(processes=processes) as pool:
            jobs = []
            for _ in range(processes):
                job = pool.apply_async(_run_concurrent_download_and_prepare,
                                       kwds={"tmp_dir": tmp_dir})
                jobs.append(job)
            # Block until every process has finished preparing.
            dummy_builders = [job.get() for job in jobs]

            for dummy_builder in dummy_builders:
                self.assertTrue(os.path.exists(arrow_path))
                self.assertDictEqual(dummy_builder.info.features,
                                     Features({"text": Value("string")}))
                self.assertEqual(dummy_builder.info.splits["train"].num_examples, 100)
                self.assertTrue(os.path.exists(info_path))
def filter(dirty_data):
    """Drop unwanted comments, then strip links and punctuation from the rest.

    Stage 1 maps ``Filter.__is_not_deleted_or_not_non_english`` over the
    input; each result is an indexable pair whose first item is the comment
    and whose second item says whether to keep it. Stage 2 splits every kept
    comment on single spaces, passes each word through
    ``Filter.__filter_links`` and joins the words back with single spaces.

    :param dirty_data: iterable of raw comments
    :return: list of cleaned comments
    """
    log("starting filter")
    tpool = Pool(processes=cpus)
    try:
        log("filtering deleted and not english")
        kept = [
            line[0]
            for line in tpool.map(Filter.__is_not_deleted_or_not_non_english, dirty_data)
            if line[1]
        ]

        def clean_links_and_punctuation(comment):
            # assumes Filter.__filter_links returns a str for every word — TODO confirm
            words = map(Filter.__filter_links, comment.split(" "))
            # str.join is linear, unlike pairwise concatenation through reduce().
            return " ".join(words)

        log("filtering links and punctuation")
        ret = tpool.map(clean_links_and_punctuation, kept)
    finally:
        # Release the workers even if one of the filter stages raises.
        tpool.close()
        tpool.join()
    log("filter done")
    return ret
def classify_multiproc(model, stack_data, result, array_outfile=None, mask=None):
    """Classify a raster stack in parallel and write the result as a raster.

    The masked stack produced by ``Classifier.get_stack`` is split into one
    section per CPU core, each section is classified in its own worker via
    ``get_classifier``, and the classified sections are re-assembled, reshaped
    to ``final_shape`` and written with ``write_raster``.

    :param model: model handed to every per-section ``Classifier``
    :param stack_data: input passed to ``Classifier.get_stack``
    :param result: output path passed to ``write_raster`` as ``out_file``
    :param array_outfile: passed to ``get_stack`` as ``outfile``
    :param mask: passed to ``get_stack`` as ``mask_path``
    :return: None
    """
    d = Classifier()
    d.get_stack(stack_data, outfile=array_outfile, mask_path=mask)
    stack_data = d.masked_data_stack

    cores = cpu_count()
    disassembler = ArrayDisAssembly(stack_data)
    arrays = disassembler.disassemble(n_sections=cores)
    # Distinct loop names: the comprehensions previously reused `a`, the name
    # of the disassembler that is needed again after classification.
    classifiers = [Classifier(idx=i, arr=section, model=model)
                   for i, section in enumerate(arrays)]

    with Pool(processes=cores) as pool:
        pending = [pool.apply_async(get_classifier, (classifier, section))
                   for section, classifier in zip(arrays, classifiers)]
        # Results must be collected before the pool context exits, because
        # leaving the `with` block terminates the workers.
        classified_arrays = [res.get() for res in pending]

    disassembler.assemble(classified_arrays)
    final = disassembler.assembled.reshape(d.final_shape)
    d.write_raster(out_file=result, new_array=final)

    return None
def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension,
                             skip_existing, logger):
    """Convert every utterance of every speaker directory into a saved mel spectrogram.

    For each speaker an output directory is created under ``out_dir`` and a
    ``_sources.txt`` file records one ``<npy name>,<source path>`` line per
    saved utterance. Utterances with fewer than ``partials_n_frames`` frames
    are discarded.

    :param speaker_dirs: speaker directories (``Path`` objects) to process
    :param dataset_name: name used in progress output
    :param datasets_root: root that speaker directories are made relative to
        when building the speaker name
    :param out_dir: directory (``Path``) receiving one sub-directory per speaker
    :param extension: audio file extension to look for, without the dot
    :param skip_existing: when true ``_sources.txt`` is opened for appending
        instead of being overwritten
    :param logger: object with ``add_sample(duration=...)`` and ``finalize()``
    """
    print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))

    # Preprocess all utterances of a single speaker.
    def preprocess_speaker(speaker_dir: Path):
        speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)

        speaker_out_dir = out_dir.joinpath(speaker_name)
        speaker_out_dir.mkdir(exist_ok=True)
        sources_fpath = speaker_out_dir.joinpath("_sources.txt")

        # `with` guarantees the sources file is flushed and closed even when
        # preprocessing one of the utterances raises.
        with sources_fpath.open("a" if skip_existing else "w") as sources_file:
            for in_fpath in speaker_dir.glob("**/*.%s" % extension):
                out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
                out_fname = out_fname.replace(".%s" % extension, ".npy")

                # Preprocess a single utterance.
                wav = audio.preprocess_wav(in_fpath)

                # Discard utterances that are too short.
                frames = audio.wav_to_mel_spectrogram(wav)
                if len(frames) < partials_n_frames:
                    continue

                out_fpath = speaker_out_dir.joinpath(out_fname)
                np.save(out_fpath, frames)
                logger.add_sample(duration=len(wav) / sampling_rate)
                sources_file.write("%s,%s\n" % (out_fname, in_fpath))

    # Process all utterances of every speaker.
    with Pool(32) as pool:
        list(
            tqdm(pool.imap(preprocess_speaker, speaker_dirs),
                 dataset_name,
                 len(speaker_dirs),
                 unit="speakers"))
    logger.finalize()
    print("Done preprocessing %s.\n" % dataset_name)
def connect_structures_find_saddles(sequence, structure_list):
    """Find, for every structure, its lowest saddle towards a lower-energy structure.

    ``find_saddle`` is evaluated in a worker pool for every unordered pair of
    structures; the resulting saddle energy is stored symmetrically for
    ``(i, j)`` and ``(j, i)``.

    :param sequence: not used by this function
    :param structure_list: sequence of objects with ``Structure`` and
        ``Energy`` attributes
    :return: list with one ``(index, neighbor_index, saddle_energy)`` tuple
        per structure. ``neighbor_index`` is ``-1`` and ``saddle_energy`` is
        ``sys.maxsize`` when no structure with lower energy exists (the
        structure could be the root).
    """
    pairs = {}

    fp_pool = Pool(Max_Threads)
    try:
        res_list = []
        for i, se_1 in enumerate(structure_list):
            for j in range(i + 1, len(structure_list)):
                se_2 = structure_list[j]
                res_list.append(
                    fp_pool.apply_async(find_saddle,
                                        args=(se_1.Structure, se_2.Structure, i, j)))
        fp_pool.close()

        for pending in res_list:
            i, j, saddle_energy_kcal = pending.get()
            pairs[(i, j)] = saddle_energy_kcal
            pairs[(j, i)] = saddle_energy_kcal
    finally:
        # close() is harmless when repeated; join() reaps the workers so they
        # do not outlive this call, also when a task fails.
        fp_pool.close()
        fp_pool.join()

    # get lowest saddle for each structure that ends in a structure with lower
    # energy than the first structure.
    minimal_saddle_list = []
    for i, se_1 in enumerate(structure_list):
        min_saddle_energy = sys.maxsize
        tree_neighbor = None
        for j, se_2 in enumerate(structure_list):
            if i == j:
                continue
            saddle_energy = pairs[(i, j)]
            # `<=` means that, among equal saddles, the last candidate wins.
            if saddle_energy <= min_saddle_energy and se_2.Energy < se_1.Energy:
                min_saddle_energy = saddle_energy
                tree_neighbor = j
        if tree_neighbor is None:  # it could be the root.
            tree_neighbor = -1
        minimal_saddle_list.append((i, tree_neighbor, min_saddle_energy))
    return minimal_saddle_list
def save_word_error_parallel(results_file, output_file, X_test, y_test):
    """Evaluate every line of ``results_file`` in parallel and write one value per line.

    Each input line is handed to ``process_line`` as the tuple
    ``(line, index, X_test, y_test, True)``; the last element is the verbose
    flag. The returned values are written to ``output_file`` in input order,
    one per line.

    :param results_file: path of the text file to read
    :param output_file: path of the text file to write (overwritten)
    :param X_test: forwarded to ``process_line``
    :param y_test: forwarded to ``process_line``
    """
    with open(results_file, 'r') as results:
        lines = results.readlines()

    # packs a tuple - line, index, X_test, y_test, verbose (should print or not)
    data = [(line, index, X_test, y_test, True) for index, line in enumerate(lines)]

    # uses up 4 cores per call (for 8 core machine), so change this to match
    # half your cpu count
    pool = Pool(4)
    try:
        # imap maintains input order while processing in parallel
        f_vals = [str(acc) + "\n" for acc in pool.imap(process_line, data)]
    finally:
        # Reap the workers even when a line fails to process.
        pool.close()
        pool.join()

    with open(output_file, 'w') as out:
        out.writelines(f_vals)
def main():
    """Precompute face landmarks / AU boxes for a dataset split and cache them.

    Parses the command line, writes a pid file, builds an ``AUDataset`` for
    the requested database/fold/split, selects the image paths whose key
    (``"<database>|<img_path>"``) is not yet in the cache manager, runs
    ``parallel_landmark_and_conn_component`` on them in process pools, and
    stores the landmark dict, AU box dict and crop rectangle per image in the
    cache manager.

    Many of the parsed options are not referenced anywhere in this function.
    """
    parser = argparse.ArgumentParser(
        description='Space Time Action Unit R-CNN training example:')
    parser.add_argument('--pid', '-pp', default='/tmp/SpaceTime_AU_R_CNN/')
    parser.add_argument('--gpu', '-g', nargs='+', type=int,
                        help='GPU ID, multiple GPU split by space')
    parser.add_argument('--lr', '-l', type=float, default=0.001)
    parser.add_argument('--out', '-o', default='end_to_end_result',
                        help='Output directory')
    parser.add_argument('--trainval', default='train', help='train/test')
    parser.add_argument('--database', default='BP4D',
                        help='Output directory: BP4D/DISFA/BP4D_DISFA')
    parser.add_argument('--iteration', '-i', type=int, default=70000)
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--batch_size', '-bs', type=int, default=1)
    parser.add_argument('--snapshot', '-snap', type=int, default=1000)
    parser.add_argument('--need_validate', action='store_true',
                        help='do or not validate during training')
    parser.add_argument('--mean', default=config.ROOT_PATH + "BP4D/idx/mean_no_enhance.npy",
                        help='image mean .npy file')
    parser.add_argument('--backbone', default="mobilenet_v1",
                        help="vgg/resnet101/mobilenet_v1 for train")
    parser.add_argument('--optimizer', default='RMSprop',
                        help='optimizer: RMSprop/AdaGrad/Adam/SGD/AdaDelta')
    parser.add_argument('--pretrained_model', default='mobilenet_v1',
                        help='imagenet/mobilenet_v1/resnet101/*.npz')
    parser.add_argument('--pretrained_model_args', nargs='+', type=float,
                        help='you can pass in "1.0 224" or "0.75 224"')
    parser.add_argument('--spatial_edge_mode', type=SpatialEdgeMode,
                        choices=list(SpatialEdgeMode),
                        help='1:all_edge, 2:configure_edge, 3:no_edge')
    parser.add_argument(
        '--temporal_edge_mode', type=TemporalEdgeMode,
        choices=list(TemporalEdgeMode),
        help='1:rnn, 2:attention_block, 3.point-wise feed forward(no temporal)'
    )
    parser.add_argument("--bi_lstm", action="store_true",
                        help="whether to use bi-lstm as Edge/Node RNN")
    parser.add_argument(
        '--use_memcached', action='store_true',
        help='whether use memcached to boost speed of fetch crop&mask')
    # NOTE(review): the option below is commented out, yet args.memcached_host
    # is read further down whenever --use_memcached is given; that access
    # raises AttributeError as the code stands.
    # parser.add_argument('--memcached_host', default='127.0.0.1')
    parser.add_argument("--fold", '-fd', type=int, default=3)
    parser.add_argument("--layers", type=int, default=1)
    parser.add_argument("--split_idx", '-sp', type=int, default=1)
    parser.add_argument("--use_paper_num_label", action="store_true",
                        help="only to use paper reported number of labels"
                        " to train")
    parser.add_argument("--previous_frame", type=int, default=50)
    parser.add_argument("--sample_frame", '-sample', type=int, default=25)
    parser.add_argument(
        "--snap_individual", action="store_true",
        help="whether to snapshot each individual epoch/iteration")
    parser.add_argument("--proc_num", "-proc", type=int, default=1)
    parser.add_argument('--eval_mode', action='store_true',
                        help='Use test datasets for evaluation metric')
    args = parser.parse_args()

    # Make sure the pid and output directories exist, then record this
    # process's pid in a file named after database/fold/split.
    os.makedirs(args.pid, exist_ok=True)
    os.makedirs(args.out, exist_ok=True)
    pid = str(os.getpid())
    pid_file_path = args.pid + os.sep + "{0}_{1}_fold_{2}.pid".format(
        args.database, args.fold, args.split_idx)
    with open(pid_file_path, "w") as file_obj:
        file_obj.write(pid)
        file_obj.flush()

    # NOTE(review): --gpu has no default, so args.gpu is None when the option
    # is omitted and map(str, None) raises TypeError here.
    print('GPU: {}'.format(",".join(list(map(str, args.gpu)))))

    adaptive_AU_database(args.database)
    mc_manager = None
    if args.use_memcached:
        from collections_toolkit.memcached_manager import PyLibmcManager
        mc_manager = PyLibmcManager(args.memcached_host)
        if mc_manager is None:
            raise IOError("no memcached found listen in {}".format(
                args.memcached_host))

    train_data = AUDataset(
        database=args.database,
        fold=args.fold,
        split_name=args.trainval,
        split_index=args.split_idx,
        mc_manager=mc_manager,
        train_all_data=False,
    )
    # Keep only the images whose cache key is not stored yet.
    # NOTE(review): without --use_memcached mc_manager is still None here and
    # the `not in mc_manager` test raises TypeError.
    result_data = [
        img_path
        for img_path, AU_set, current_database_name in train_data.result_data
        if args.database + "|" + img_path not in mc_manager
    ]
    # presumably splits the paths into len(result_data) // 100 sub-lists —
    # verify against split_list's definition
    sub_list = split_list(result_data, len(result_data) // 100)

    for img_path_lst in sub_list:
        # A fresh pool of 50 processes is created for every sub-list.
        with Pool(processes=50) as pool:
            input_list = [(img_path, None, None) for img_path in img_path_lst]
            result =\
                pool.starmap(parallel_landmark_and_conn_component, input_list)
            pool.close()
            pool.join()

        for img_path, AU_box_dict, landmark_dict, box_is_whole_image in result:
            key_prefix = args.database + "|"
            key = key_prefix + img_path
            orig_img = cv2.imread(img_path, cv2.IMREAD_COLOR)
            new_face, rect = FaceMaskCropper.dlib_face_crop(
                orig_img, landmark_dict)

            print("write {}".format(key))
            # Store the entry only if no other writer has added it meanwhile.
            if mc_manager is not None and key not in mc_manager:
                save_dict = {
                    "landmark_dict": landmark_dict,
                    "AU_box_dict": AU_box_dict,
                    "crop_rect": rect
                }
                mc_manager.set(key, save_dict)
def Pool(processes=None, initializer=None, initargs=()):
    '''
    Create a ``multiprocess.pool.Pool`` from the given arguments and return it.

    The three arguments are forwarded positionally, in this order.
    '''
    # The class is imported at call time and aliased so that it cannot be
    # confused with this factory function of the same name.
    from multiprocess.pool import Pool as _PoolClass
    pool = _PoolClass(processes, initializer, initargs)
    return pool
def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension,
                             skip_existing, logger):
    """Convert every utterance of every speaker directory into a saved mel spectrogram.

    For each speaker an output directory is created under ``out_dir`` and a
    ``_sources.txt`` file records one ``<npy name>,<source path>`` line per
    saved utterance. Empty waveforms and utterances with fewer than
    ``partials_n_frames`` frames are discarded.

    :param speaker_dirs: speaker directories (``Path`` objects) to process
    :param dataset_name: name used in progress output
    :param datasets_root: root that speaker directories are made relative to
        when building the speaker name
    :param out_dir: directory (``Path``) receiving one sub-directory per speaker
    :param extension: audio file extension to look for, without the dot
    :param skip_existing: when true, utterances already listed in an existing
        ``_sources.txt`` are skipped and the file is appended to; otherwise it
        is overwritten
    :param logger: object with ``add_sample(duration=...)`` and ``finalize()``
    """
    print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))

    # Function to preprocess utterances for one speaker
    def preprocess_speaker(speaker_dir: Path):
        # Give a name to the speaker that includes its dataset
        speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)

        # Create an output directory with that name, as well as a txt file containing a
        # reference to each source file.
        speaker_out_dir = out_dir.joinpath(speaker_name)
        speaker_out_dir.mkdir(exist_ok=True)
        sources_fpath = speaker_out_dir.joinpath("_sources.txt")

        # There's a possibility that the preprocessing was interrupted earlier, check if
        # there already is a sources file.
        existing_fnames = set()
        if sources_fpath.exists():
            try:
                with sources_fpath.open("r") as sources_file:
                    existing_fnames = {line.split(",")[0] for line in sources_file}
            except Exception:
                # Deliberate best effort: an unreadable sources file only
                # means that nothing is considered already processed.
                existing_fnames = set()

        # Gather all audio files for that speaker recursively. `with` makes
        # sure the sources file is closed even if an utterance fails.
        with sources_fpath.open("a" if skip_existing else "w") as sources_file:
            for in_fpath in speaker_dir.glob("**/*.%s" % extension):
                # Check if the target output file already exists
                out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
                out_fname = out_fname.replace(".%s" % extension, ".npy")
                if skip_existing and out_fname in existing_fnames:
                    continue

                # Load and preprocess the waveform
                wav = audio.preprocess_wav(in_fpath)
                if len(wav) == 0:
                    continue

                # Create the mel spectrogram, discard those that are too short
                frames = audio.wav_to_mel_spectrogram(wav)
                if len(frames) < partials_n_frames:
                    continue

                out_fpath = speaker_out_dir.joinpath(out_fname)
                np.save(out_fpath, frames)
                logger.add_sample(duration=len(wav) / sampling_rate)
                sources_file.write("%s,%s\n" % (out_fname, in_fpath))

    # Process the utterances for each speaker
    with Pool(56) as pool:
        list(
            tqdm(pool.imap(preprocess_speaker, speaker_dirs),
                 dataset_name,
                 len(speaker_dirs),
                 unit="speakers"))
    logger.finalize()
    print("Done preprocessing %s.\n" % dataset_name)
# NOTE(review): the docstring below is left word-for-word as written
# (including "my be provided" and "If the sketch as parameters") because it
# may be surfaced to users as the command's help text; fix the wording there
# deliberately rather than as a comment-only change.
def save(
    target: Optional[str] = typer.Argument(default=None, help="sketch directory or file"),
    name: Optional[str] = typer.Option(None, "-n", "--name", help="output name (without extension)"),
    config: Optional[str] = typer.Option(
        None,
        "-c",
        "--config",
        help=
        ("path to the config file to use (may be a path to JSON file or the name of the "
         "configuration)"),
    ),
    seed: Optional[str] = typer.Option(None, "-s", "--seed", help="seed or seed range to use"),
    destination: Optional[str] = typer.Option(None, "-d", "--destination", help="destination path"),
    multiprocessing: bool = typer.Option(True, envvar="VSK_MULTIPROCESSING", help="enable multiprocessing"),
) -> None:
    """Save the sketch to a SVG file.

    By default, the output is named after the sketch and the provided options. An alternative
    name my be provided with the --name option.

    If the sketch as parameters, their default values are used. Alternatively, a pre-existing
    configuration can be used instead with the --config option.

    By default, a random seed is used for vsketch's random number generator. If --config is
    used, the seed saved in the configuration is used instead. A seed may also be provided with
    the --seed option, in which case it will override the configuration's seed.

    The --seed option also accepts seed range in the form of FIRST..LAST, e.g. 0..100. In this
    case, one output file per seed is generated.

    If the number of files to generate is greater than 4, all available cores are used for the
    process. This behaviour can be disabled with --no-multiprocessing or the
    VSK_MULTIPROCESSING variable.

    By default, all SVG are saved in the sketch's "output" sub-directory. This can be
    overridden using the --destination option.
    """

    # Resolve the sketch script; abort the command if it cannot be located.
    try:
        path = _find_sketch_script(target)
    except ValueError as err:
        print_error("Sketch could not be found: ", str(err))
        raise typer.Exit(code=1)

    # load configuration
    param_set: Dict[str, vsketch.ParamType] = {}
    config_postfix = ""
    if config is not None:
        # --config is first tried as a path; if that path does not exist it is
        # treated as the name of a "<name>.json" file in the sketch's config
        # directory.
        config_path = pathlib.Path(config)
        if not config_path.exists():
            config_path = get_config_path(path) / (config + ".json")

        if config_path.exists():
            param_set = load_config(config_path)
            config_postfix = "_" + config_path.stem
        else:
            # NOTE(review): unlike the other error paths this one does not
            # exit; the command continues with an empty parameter set.
            print_error("Config file not found: ", str(config_path))

    # compute name
    if name is None:
        name = canonical_name(path) + config_postfix
    # The seed is appended to file names only when it was given explicitly.
    seed_in_name = seed is not None

    if seed is None:
        # Seed priority without --seed: the configuration's "__seed__" entry,
        # otherwise a random value.
        # NOTE(review): param_set is always a dict at this point, so the
        # `is not None` test is always true.
        if param_set is not None and "__seed__" in param_set:
            seed_start = seed_end = int(param_set["__seed__"])
        else:
            seed_start = seed_end = random.randint(0, 2**31 - 1)
    else:
        try:
            seed_start, seed_end = _parse_seed(seed)
        except ValueError as err:
            print_error(f"Could not parse seed {seed}: ", str(err))
            raise typer.Exit(code=1)

    # prepare output path
    if destination is not None:
        # An explicit destination must already exist and be a directory.
        output_path = pathlib.Path(destination)
        if not output_path.exists():
            print_error("Provided output path does not exist: ", str(output_path.absolute()))
            raise typer.Exit(code=1)
        if not output_path.is_dir():
            print_error("Provided output path is not a directory: ", str(output_path.absolute()))
            raise typer.Exit(code=1)
    else:
        # The default "output" directory next to the sketch is created on demand.
        output_path = path.parent / "output"
        if not output_path.exists():
            output_path.mkdir()
        elif not output_path.is_dir():
            print_error("Could not create output directory: ", str(output_path))
            raise typer.Exit(code=1)

    # Render the sketch for one seed and write it to "<name>[_s<seed>].svg".
    # noinspection PyShadowingNames
    def _write_output(seed: int) -> None:
        # this needs to be there because the sketch class cannot be pickled apparently
        sketch_class = load_sketch_class(path)
        if sketch_class is None:
            print_error("Could not load script: ", str(path))
            raise typer.Exit(code=1)

        sketch_class.set_param_set(param_set)

        output_name = name
        if seed_in_name:
            output_name += "_s" + str(seed)  # type: ignore
        output_name += ".svg"  # type: ignore

        output_file = output_path / output_name

        sketch = sketch_class.execute(finalize=True, seed=seed)

        if sketch is None:
            print_error("Could not execute script: ", str(path))
            raise typer.Exit(code=1)

        doc = sketch.vsk.document
        with open(output_file, "w") as fp:
            print_info("Exporting SVG: ", str(output_file))
            vp.write_svg(fp, doc, source_string=f"vsketch save -s {seed} {path}", color_mode="layer")

    # The seed range is inclusive of seed_end.
    seed_range = range(seed_start, seed_end + 1)

    # NOTE(review): the pool is used from 4 files upwards, whereas the help
    # text says "greater than 4" — confirm which is intended.
    if len(seed_range) < 4 or not multiprocessing:
        for s in seed_range:
            _write_output(s)
    else:
        with Pool() as p:
            # list() drains the lazy imap iterator so that every seed is
            # processed before the pool context exits.
            list(p.imap(_write_output, seed_range))
def Pool(processes=None, initializer=None, initargs=(), maxtasksperchild=None):
    '''
    Create a ``multiprocess.pool.Pool`` from the given arguments and return it.

    The four arguments are forwarded positionally, in this order.
    '''
    # The class is imported at call time and aliased so that it cannot be
    # confused with this factory function of the same name.
    from multiprocess.pool import Pool as _PoolClass
    pool = _PoolClass(processes, initializer, initargs, maxtasksperchild)
    return pool
c4 = Dataset.load_from_disk("/home/ahemf/processed/c4_extended") cpu_count = os.cpu_count() overall_counts = Counter() # dsets_tokenized = dset.map(lambda x: dict(tokens=tokenizer.batch_encode_plus(x["text"], add_special_tokens=False, max_length=4096, padding=False)["input_ids"]), batched=True, batch_size=1024) def term_frequency_builder(tokens): raw_counts = Counter(tokens) mc = raw_counts.most_common()[0] most_common_count = mc[1] tf = [[str(k), str(0.5 + 0.5 * (v / most_common_count))] for k, v in raw_counts.items()] return tf with Pool(cpu_count) as p: def mapping(x): tokens = [[ re.sub(r'[^\s0-9a-zA-Z]', ' ', w).strip() for w in t.split() ] for t in x["text"]] tokens = [[w for w in t if len(w) >= 2] for t in tokens] tf = [ term_frequency_builder(tk) if len(tk) > 0 else [["", str(1.0)]] for tk in tokens ] return tf def batch_term_frequency_builder(x): texts = x["text"] csz = int(np.ceil(len(texts) / cpu_count))
# Load the wav from disk if needed if isinstance(fpath_or_wav, str): wav = librosa.core.load(fpath_or_wav, sr=sampling_rate)[0] else: wav = fpath_or_wav return wav def local_mel2samp(filepath): print("start", filepath) filepath = filepath.split("|")[0] audio = preprocess_wav(filepath, sampling_rate=args.sampling_rate) filename = os.path.basename(filepath) new_filepath = args.output_dir + '/' + filename + '.npy' print("finish", new_filepath) np.save(new_filepath, audio) filepaths = files_to_list(args.filelist_path) with Pool(args.num_processes) as pool: # ThreadPool(8) as pool: # list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs), list(pool.map(local_mel2samp, filepaths)) # for filepath in filepaths: # filepath = filepath.split("|")[0] # audio = preprocess_wav(filepath, sampling_rate=args.sampling_rate) # filename = os.path.basename(filepath) # new_filepath = args.output_dir + '/' + filename + '.npy' # print(new_filepath) # np.save(new_filepath, audio)
def main():
    """Export per-time-point fiber densities of simulated cell pairs to CSV.

    Loads the structured simulations, filters them by category and by pair
    distance, computes the 'inside' window fiber density over time for the
    left and right cell of every simulation in a process pool, and writes one
    row per (simulation, time point) to 'simulations_density_cell_pairs.csv'
    under ``paths.OUTPUTS``, including z-scores based on each simulation's
    normalization values.
    """
    _simulations = filtering.by_pair_distances(
        filtering.by_categories(
            load.structured(),
            _is_single_cell=False,
            _is_heterogeneity=None,
            _is_low_connectivity=False,
            _is_causality=CAUSALITY,
            _is_dominant_passive=DOMINANT_PASSIVE,
            _is_fibrin=False
        ),
        _distances=PAIR_DISTANCE
    )
    print('Total simulations:', len(_simulations))

    # one computation request per (simulation, cell)
    _arguments = [
        {
            'simulation': _simulation,
            'length_x': config.QUANTIFICATION_WINDOW_WIDTH_IN_CELL_DIAMETER,
            'length_y': config.QUANTIFICATION_WINDOW_HEIGHT_IN_CELL_DIAMETER,
            'offset_x': 0,
            'offset_y': 0,
            'cell_id': _cell_id,
            'direction': 'inside',
            'time_points': TIME_POINTS
        }
        for _simulation in _simulations
        for _cell_id in ['left_cell', 'right_cell']
    ]

    # results arrive in arbitrary order, so they are keyed by their own request
    _fiber_densities = {}
    with Pool(CPUS_TO_USE) as _pool:
        _results = _pool.imap_unordered(compute.window_fiber_density_by_time, _arguments)
        _progress = tqdm(_results, total=len(_arguments), desc='Computing windows & fiber densities')
        for _request, _densities in _progress:
            _key = (_request['simulation'], _request['cell_id'])
            _fiber_densities[_key] = _densities
        _pool.close()
        _pool.join()

    _simulations_by_heterogeneity = organize.by_heterogeneity(_simulations)

    _header_row = [
        'time_point',
        'simulation',
        'pair_distance_in_cell_diameter',
        'heterogeneity_std',
        'left_cell_fiber_density',
        'left_cell_fiber_density_z_score',
        'right_cell_fiber_density',
        'right_cell_fiber_density_z_score',
    ]

    with open(os.path.join(paths.OUTPUTS, 'simulations_density_cell_pairs.csv'), 'w', newline='') as _csv_file:
        _writer = csv.writer(_csv_file)
        _writer.writerow(_header_row)

        for _simulation_std in _simulations_by_heterogeneity:
            print('STD:', _simulation_std)
            for _simulation in tqdm(_simulations_by_heterogeneity[_simulation_std], desc='Simulations loop'):
                _pair_distance = compute.pair_distance(load.properties(_simulation))
                _average, _std = load.normalization(_simulation).values()
                _densities_pairs = zip(
                    _fiber_densities[(_simulation, 'left_cell')],
                    _fiber_densities[(_simulation, 'right_cell')]
                )
                for _time_point, (_left_density, _right_density) in enumerate(_densities_pairs):
                    _writer.writerow([
                        _time_point,
                        _simulation,
                        _pair_distance,
                        _simulation_std,
                        _left_density,
                        compute_lib.z_score(_left_density, _average, _std),
                        _right_density,
                        compute_lib.z_score(_right_density, _average, _std)
                    ])
def __call__(self, inet, return_last_net=False):
    """Build the network(s) for network index ``inet`` and run all iterations.

    The true (infection) network is taken from the predefined entry in
    ``args.nettype`` when available, otherwise generated at random.  When
    ``args.dual`` is set, one or two tracing networks are built the same way.
    ``args.niters`` iterations are then run, either through a process pool
    (``args.multip`` equal to 2 or 3) or sequentially.

    Note that ``args`` is mutated: ``netsize``, ``k``, ``k_i`` and
    ``first_inf`` may be (re)assigned.

    :param inet: index of the network being simulated; it is added to
        ``args.netseed`` (when one is set) to derive this network's seed
    :param return_last_net: if True, return the network objects as the second
        element of the result
    :return: ``(net_events, extra)`` where ``net_events`` maps the iteration
        index to that iteration's ``stats_events`` and ``extra`` is
        ``(true_net, know_net)`` if ``return_last_net``, else
        ``(args.netsize, args.k)`` if a predefined network was used, else
        ``None``
    """
    # local vars for efficiency
    args = self.args
    nettype = args.nettype
    first_inf_nodes = self.first_inf_nodes
    no_exposed = self.no_exposed
    is_covid = self.is_covid
    tr_rate = self.tr_rate
    trans_true_items = self.trans_true_items
    trans_know_items = self.trans_know_items

    # will hold all network events
    net_events = defaultdict()
    # whether return of netstate is needed from this step (only in the case of predefined networks)
    netstate_return = False
    # initialize the true network seed either randomly, or based on what has been supplied already + net index
    # (the upper bound must be an int: float bounds are rejected by random.randint on Python 3.12+)
    net_seed = random.randint(0, int(2e9)) if args.netseed is None else args.netseed + inet

    args_dict = vars(args)

    # the infection net is either predefined in the first element of args.nettype or created at random
    # based on the model name specified by the same param
    # if the net is predefined, we also seed here the first_inf_nodes
    # (this is because we did not have access to the list of nodes prior to this point)
    try:
        true_net = network.get_from_predef(nettype['0'][0], inet=inet, W_factor=nettype.get('Wi', 0), **args_dict)
        # if we didn't set the netsize, it means that 'nid' was not supplied in the dict
        # so we assume only the nodes in the edges supplied are nodes in this network
        if args.netsize == -1:
            args.netsize = len(true_net.nodes)
            args.k = true_net.avg_degree()
        # NOTE(review): nesting reconstructed; flag assumed to be set for every predefined net - confirm
        netstate_return = True
    # TypeError: nettype is a str, KeyError: key '0' is not in the dynamic dict,
    # IndexError: element 0 (inf net) not in list
    except (TypeError, KeyError, IndexError):
        # args_dict should also contain the netsize and the average degree
        true_net = network.get_random(typ=nettype, nseed=net_seed, inet=inet, weighted=args.use_weights,
                                      **args_dict)

    # if NO compute distribution was enabled, we can use args (since it is not deepcopied)
    # to track all average weights
    # NOTE(review): nesting reconstructed; assumed to apply to predefined and random nets alike - confirm
    if not args.multip:
        args.k_i = {'0': (true_net.avg_degree(), true_net.avg_degree(use_weights=True))}

    # first_inf_nodes could have been calculated by this point if an infseed was supplied, and
    # we deal with random network OR 'nid' key was supplied in the predefined network of args.nettype
    if first_inf_nodes is None:
        # turn first_inf into an absolute number if it's a percentage by this point
        # (happens only if predefined net with no nid)
        args.first_inf = int(args.first_inf if args.first_inf >= 1 else args.first_inf * args.netsize)
        # Random first infected across simulations - seed random locally
        # NOTE(review): random.sample requires a sequence on Python 3.11+ - confirm true_net.nodes is one
        first_inf_nodes = random.Random(net_seed).sample(true_net.nodes, args.first_inf)

    # Change the state of the first_inf_nodes to 'I' to root the simulation
    true_net.change_state(first_inf_nodes, state='I', update=True)
    # Placeholder for the dual network (will be initialized only if args.dual)
    know_net = None

    ### Good place to debug whether edge weights are present and how they are normalized
    # print(true_net.node_counts[list(true_net.neighbors(first_inf_nodes[0]))[0]])
    # print(1, true_net.node_counts[1])
    # print(2, true_net.node_counts[2])

    if args.dual:
        # the dual network is either predefined in the second element of args.nettype or initialized at random
        try:
            know_net = network.get_dual_from_predef(true_net, nettype['0'][1], count_importance=tr_rate,
                                                    W_factor=nettype.get('Wt', 0), **args_dict)
        except (TypeError, KeyError, IndexError):
            # First dual net depends on both overlap and uptake (this is usually the digital contact tracing net)
            # Note this will also copy over the states, so no need to call change_state
            know_net = network.get_dual(true_net, args.overlap, args.zadd, args.zrem, args.uptake,
                                        args.maintain_overlap, nseed=net_seed + 1, inet=inet,
                                        count_importance=tr_rate, **args_dict)

        # if 2 dual networks selected, create the second network and add both to a ListDelegator
        if args.dual == 2:
            try:
                know_net_two = network.get_dual_from_predef(true_net, nettype['0'][2],
                                                            count_importance=args.taut_two,
                                                            W_factor=nettype.get('Wt2', 0), **args_dict)
            except (TypeError, KeyError, IndexError):
                # Second tracing net attempts to maintain overlap_two (this is usually the manual tracing net);
                # uptake may not make sense for manual tracing
                # Note this will also copy over the states, so no need to call change_state
                know_net_two = network.get_dual(true_net, args.overlap_two, args.zadd_two, args.zrem_two,
                                                args.uptake_two, args.maintain_overlap_two,
                                                nseed=net_seed + 2, inet=inet,
                                                count_importance=args.taut_two, **args_dict)

            # know_net becomes a ListDelegator of the 2 networks
            know_net = ListDelegator(know_net, know_net_two)

        # Object used during Multiprocessing of Network simulation events
        engine = EngineDual(args=args, no_exposed=no_exposed, is_covid=is_covid,
                            true_net=true_net, know_net=know_net,
                            trans_true=trans_true_items, trans_know=trans_know_items)
    else:
        # Object used during Multiprocessing of Network simulation events
        engine = EngineOne(
            args=args, no_exposed=no_exposed, is_covid=is_covid,
            true_net=true_net,
            trans_true=trans_true_items,
        )

    niters = args.niters
    iters_range = range(niters)

    if args.multip == 2 or args.multip == 3:
        # allocate EITHER half or all cpus to pathos.multiprocess for parallelizing simulations
        # for different iterations of 1 init
        # multip == 2 parallelize only iterations; multip == 3 parallelize both net and iters
        jobs = int(cpu_count() / (args.multip - 1))
        with Pool(jobs) as pool:
            for itr, stats_events in enumerate(tqdm_redirect(pool.imap(engine, iters_range), total=niters,
                                                             desc='Iterations simulation progress')):
                # Record sim results
                net_events[itr] = stats_events
    else:
        with no_std_context(enabled=args.animate):
            for itr in tqdm_redirect(iters_range, desc='Iterations simulation progress'):
                print('Running iteration ' + str(itr) + ':')

                # Reinitialize network + Random first infected at the beginning of each run BUT the first one
                # This is needed only in sequential processing since in multiprocessing
                # the nets are deepcopied anyway
                if itr:
                    engine.reinit_net(first_inf_nodes)

                # Run simulation
                stats_events = engine(itr)
                # Record sim results
                net_events[itr] = stats_events
                # A situation in which there is NO event can arise when all first infected nodes are orphans,
                # and rem_orphans=True
                total_inf = stats_events[-1]['totalInfected'] if stats_events else args.first_inf
                print('---> Result:' + str(total_inf) + ' total infected persons over time.')

    if return_last_net:
        return net_events, (true_net, know_net)

    if netstate_return:
        # we may not know about netsize and k by this point, and if Nets are distributed, args will be deepcopied
        # therefore we need to return the inferred netsize and k here from the local args
        return net_events, (args.netsize, args.k)

    return net_events, None
def run(self):
    """Apply ``self._run`` to every entry of ``self.runners`` in a pool of 40.

    Each call to ``self._run`` must return a ``(key, value)`` pair; the pairs
    are collected, in the order of ``self.runners``, into the returned dict.
    """
    # The context manager releases the pool's workers once the results are
    # collected.  ``dict`` drains the lazy ``imap`` iterator inside the
    # ``with`` block, so no result is lost when the pool shuts down.
    with Pool(40) as pool:
        return dict(pool.imap(self._run, self.runners))
def __call__(
    self,
    dp: DataPanel,
    columns: List[str],
    num_proc: int = None,
    *args,
    **kwargs,
):
    """Run every subpopulation builder on ``dp`` and merge their outputs.

    With ``num_proc`` unset, zero or one, the builders in
    ``self.subpopulations`` are applied one after another; otherwise they are
    mapped over a pool of ``num_proc`` workers.  When ``dp`` is a
    ``DataPanel``, the updates of each subpopulation are stored into it via
    ``dp.map`` and its tape is updated once per subpopulation.

    Returns:
        ``(slices, slice_membership)``: the flat list of slices produced by
        all builders, and their membership matrices concatenated along
        axis 1.
    """
    if num_proc and num_proc != 1:
        # TODO(karan): cleanup, make mp.Pool support simpler across the library
        def _apply_builder(builder):
            return builder(*args, dp=dp, columns=columns, **kwargs)

        with Pool(num_proc) as pool:
            slices, slice_membership = zip(
                *pool.map(_apply_builder, list(self.subpopulations))
            )

        # Each builder returned its own collection of slices: flatten them
        slices = list(tz.concat(slices))
    else:
        slices, slice_membership = [], []

        # Sequential path: run the builders one by one
        for _, builder in tqdm(enumerate(self.subpopulations)):
            built_slices, built_membership = builder(
                *args, dp=dp, columns=columns, **kwargs
            )
            slices.extend(built_slices)
            slice_membership.append(built_membership)

    def _store_updates(batch, indices):
        # Let every subpopulation write the updates for this batch's rows
        for position, subpopulation in enumerate(self.subpopulations):
            batch_updates = subpopulation.construct_updates(
                slice_membership=slice_membership[position][indices],
                columns=columns,
            )
            batch = subpopulation.store(batch=batch, updates=batch_updates)
        return batch

    if isinstance(dp, DataPanel):
        dp = dp.map(_store_updates, with_indices=True, batched=True)

        # Record every subpopulation in the DataPanel's history
        for subpopulation in self.subpopulations:
            dp.update_tape(
                path=[SLICEBUILDERS, subpopulation.category],
                identifiers=subpopulation.identifiers,
                columns=columns,
            )

    # Join the per-builder membership matrices column-wise
    slice_membership = np.concatenate(slice_membership, axis=1)

    return slices, slice_membership