class MapParallel(PipelineBlock):
    def __init__(self, function, n_processes=None):
        self.function = _MapFunctionClosure(function)
        self.pool = Pool(processes=n_processes)

    def run(self, input_data):
        return self.pool.imap(self.function, input_data, chunksize=1)
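A minimal usage sketch of MapParallel; it assumes PipelineBlock and _MapFunctionClosure from the snippet above are importable, and square is a hypothetical worker function:

# Hypothetical usage sketch; assumes the definitions above exist and
# that _MapFunctionClosure wraps a picklable, module-level function.
def square(x):
    return x * x

if __name__ == '__main__':
    block = MapParallel(square, n_processes=4)
    for result in block.run(range(10)):
        print(result)  # 0, 1, 4, 9, ... in input order (imap preserves order)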
def _itergroundings(self, simplify=False, unsatfailure=False):
    # expose this grounding factory to worker processes through a
    # module-level global (inherited by forked workers)
    global global_bpll_grounding
    global_bpll_grounding = self
    if self.multicore:
        pool = Pool(maxtasksperchild=1)
        try:
            for gndresult in pool.imap(with_tracing(create_formula_groundings), self.formulas):
                for fidx, stat in gndresult:
                    for (varidx, validx, val) in stat:
                        self._varidx2fidx[varidx].add(fidx)
                        self._addstat(fidx, varidx, validx, val)
                    checkmem()
                yield None
        except CtrlCException as e:
            pool.terminate()
            raise e
        pool.close()
        pool.join()
    else:
        for gndresult in imap(create_formula_groundings, self.formulas):
            for fidx, stat in gndresult:
                for (varidx, validx, val) in stat:
                    self._varidx2fidx[varidx].add(fidx)
                    self._addstat(fidx, varidx, validx, val)
            yield None
def _itergroundings(self, simplify=True, unsatfailure=True):
    # generate all groundings
    if not self.formulas:
        return
    global global_fastConjGrounding
    global_fastConjGrounding = self
    batches = list(rndbatches(self.formulas, 20))
    batchsizes = [len(b) for b in batches]
    if self.verbose:
        bar = ProgressBar(width=100, steps=sum(batchsizes), color='green')
    i = 0
    if self.multicore:
        pool = Pool()
        try:
            for gfs in pool.imap(with_tracing(create_formula_groundings), batches):
                if self.verbose:
                    bar.inc(batchsizes[i])
                    bar.label(str(cumsum(batchsizes, i + 1)))
                    i += 1
                for gf in gfs:
                    yield gf
        except Exception as e:
            logger.error('Error in child process. Terminating pool...')
            pool.close()
            raise e
        finally:
            pool.terminate()
            pool.join()
    else:
        for gfs in imap(create_formula_groundings, batches):
            if self.verbose:
                bar.inc(batchsizes[i])
                bar.label(str(cumsum(batchsizes, i + 1)))
                i += 1
            for gf in gfs:
                yield gf
class FilterParallel(PipelineBlock):
    def __init__(self, function, n_process=None):
        self.function = self._construct_filter_function(function)
        self.pool = Pool(processes=n_process)

    def _construct_filter_function(self, function):
        return _FilterFunctionClosure(function)

    def run(self, input_data):
        return self.pool.imap(self.function, input_data, chunksize=1)
def crawl():
    # assumes at least three cores; Pool requires processes >= 1
    pool = Pool(cpu_count() - 2)
    image_list, num_images = load_image_list(args.list_file)
    print('Loaded {} images'.format(num_images))
    cleaned_image_list, cleaned_num_images = clean_image_list(image_list)
    print('{} images to crawl'.format(cleaned_num_images))
    pbar = get_progress_bar(cleaned_num_images)
    for i, _ in enumerate(pool.imap(crawl_job, cleaned_image_list), 1):
        pbar.update(i)
    pbar.finish()
    Image.save_image_list(image_list, args.image_cache)
    Landmark.save_all(args.landmark_cache)
    logging.info('All done')
def main():
    print('Starting.')
    args = parse_args()
    pool = Pool()
    runs = find_runs(args.source_folder, args.target_folder)
    runs = report_source_versions(runs)
    samples = read_samples(runs)
    # noinspection PyTypeChecker
    results = pool.imap(partial(compare_sample,
                                scenarios_reported=Scenarios.OTHER_CONSENSUS_CHANGED),
                        samples,
                        chunksize=50)
    scenario_summaries = defaultdict(list)
    i = None
    all_consensus_distances = []
    report_count = 0
    for i, (report, scenarios, consensus_distances) in enumerate(results):
        if report:
            report_count += 1
            if report_count > 100:
                break
            print(report, end='')
        all_consensus_distances.extend(consensus_distances)
        for key, messages in scenarios.items():
            scenario_summaries[key] += messages
    for key, messages in sorted(scenario_summaries.items()):
        if messages:
            sample_names = {message.split()[0] for message in messages}
            summary = [key, len(messages), 'changes']
            body = ''.join(messages).rstrip('.')
            if body:
                summary.extend(['in', len(sample_names), 'samples'])
            print(*summary, end='.\n')
            print(body, end='')
    distance_data = pd.DataFrame(all_consensus_distances)
    non_zero_distances = distance_data[distance_data['distance'] != 0]
    region_names = sorted(non_zero_distances['region'].unique())
    names_iter = iter(region_names)
    # plot three regions per page
    for page_num, region_group in enumerate(
            zip_longest(names_iter, names_iter, names_iter), 1):
        group_distances = distance_data[distance_data['region'].isin(region_group)]
        plot_distances(group_distances,
                       'consensus_distances_{}.svg'.format(page_num),
                       'Consensus Distances Between Previous and v' + MICALL_VERSION)
        plot_distances(group_distances,
                       'consensus_diffs_{}.svg'.format(page_num),
                       'Consensus Differences Between Previous and v' + MICALL_VERSION,
                       'pct_diff')
    print('Finished {} samples.'.format(i))
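Pool.imap expects a single-argument callable, which is why compare_sample's fixed keyword argument is bound with functools.partial before dispatch. A minimal standalone sketch of that pattern; compare and the sample values here are hypothetical stand-ins:

# Hypothetical sketch: binding a fixed kwarg before handing the callable
# to imap. partial objects pickle as long as the wrapped function is
# defined at module level.
from functools import partial
from multiprocessing import Pool

def compare(sample, scenarios_reported=None):
    return sample, scenarios_reported

if __name__ == '__main__':
    with Pool() as pool:
        bound = partial(compare, scenarios_reported='CONSENSUS_CHANGED')
        for result in pool.imap(bound, ['s1', 's2'], chunksize=50):
            print(result)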
class FoldParallel(PipelineBlock):
    def __init__(self, function, n_process=None):
        self.function = function
        self.pool = Pool(processes=n_process)

    def _construct_fold_function(self, function):
        return _FoldFunctionClosure(function)

    def run(self, input_data):
        batch_function = self._construct_fold_function(self.function)
        return self._fold_stream(self.pool.imap(batch_function, input_data, chunksize=1))

    def _fold_stream(self, input_data):
        input_iter = iter(input_data)
        x = next(input_iter)
        for element in input_iter:
            x = self.function(x, element)
        return x
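_FoldFunctionClosure is not shown here; assuming it reduces each incoming batch to a single partial value, run() performs a two-stage fold: parallel pre-folds per batch via imap, then a sequential combine in _fold_stream. A minimal sketch of that idea under those assumptions, using addition as the fold function:

# Hypothetical two-stage fold: fold_batch stands in for what
# _FoldFunctionClosure is assumed to do per batch.
from multiprocessing import Pool

def fold_batch(batch):
    total = batch[0]
    for element in batch[1:]:
        total += element
    return total

if __name__ == '__main__':
    batches = [[1, 2, 3], [4, 5], [6]]
    with Pool(2) as pool:
        partials = pool.imap(fold_batch, batches)  # parallel pre-fold
        result = None
        for partial_sum in partials:               # sequential combine
            result = partial_sum if result is None else result + partial_sum
    print(result)  # 21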
def _train_base(self, compute_vector, entity_word_seqs):
    pool = Pool()
    entities = {}
    vectors = []

    def idx_seqs():
        for idx, (entity, seq) in enumerate(entity_word_seqs):
            entities[entity] = idx
            yield seq

    # imap preserves input order, so vectors[i] corresponds to the
    # entity assigned index i in idx_seqs()
    for vec in pool.imap(compute_vector, idx_seqs()):
        vectors.append(vec)
        if len(vectors) % 1000 == 0:
            logging.info("Computed %d vectors", len(vectors))

    self.entities = entities
    self.vectors = np.asarray(vectors)
def raw_line_map(filename, line_length, func, start=0, stop=-1,
                 threads=1, pass_teletext=True, pass_rejects=False,
                 show_speed=True):
    if show_speed:
        s = SpeedMonitor()
    if threads > 0:
        p = Pool(threads)
        map_func = lambda x, y: p.imap(x, y, chunksize=1000)
    else:
        # itertools.imap exists only on Python 2; on Python 3 the
        # built-in map is already lazy
        map_func = itertools.imap
    for l in map_func(func, raw_line_reader(filename, line_length, start, stop)):
        if show_speed:
            s.tally(l.is_teletext)
        if l.is_teletext:
            if pass_teletext:
                yield l
        else:
            if pass_rejects:
                yield l
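Since itertools.imap is Python 2 only, a sketch of how the dispatch above might be ported to Python 3 (assuming the rest of raw_line_map stays unchanged) replaces it with the built-in map, and functools.partial can stand in for the lambda:

# Hypothetical Python 3 port of the map-function dispatch above.
import functools

if threads > 0:
    p = Pool(threads)
    map_func = functools.partial(p.imap, chunksize=1000)  # map_func(func, iterable)
else:
    map_func = map  # built-in map is lazy on Python 3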
try:
    first = int(sys.argv[2], 10)
    count = int(sys.argv[3], 10)
    skip = int(sys.argv[4], 10)
except (IndexError, ValueError):
    first = 0
    count = 10000000
    skip = 1

if not os.path.isdir(path + '/t42/'):
    os.makedirs(path + '/t42/')

if 1:  # crude toggle: change to 0 for single-threaded profiling below
    p = Pool(multiprocessing.cpu_count())
    it = p.imap(process_file,
                list_files(path + '/vbi/', path + '/t42/', first, count, skip),
                chunksize=1)
    for i in it:
        pass
else:
    # single thread mode for debugging
    def doit():
        # list() forces evaluation; map is lazy on Python 3
        list(map(process_file, list_files(path + '/vbi/', path + '/t42/', first, count, skip)))
    cProfile.run('doit()', 'myprofile')
    p = pstats.Stats('myprofile')
    p.sort_stats('cumulative').print_stats(50)
def featurize_dataset(self, dataset: Dataset):
    logger.info(f"Loading dataset {dataset.key} and {self.split.key} split")
    data = dataset.load_x()
    for required_field in ['product', 'substrates']:
        if required_field not in data:
            raise NotImplementedError(
                f"Need to have field '{required_field}' in the dataset")
    split = self.split.load(dataset.dir)
    feat_dir = self.dir(dataset.feat_dir)
    metadata = dataset.load_metadata()

    reaction_type_given = False
    if 'reaction_type_id' in metadata:
        rtypes = metadata['reaction_type_id'].values
        ntypes = len(np.unique(rtypes))
        logger.info(f'Found {ntypes} unique reaction types in the dataset')
        reaction_type_given = True
        data['reaction_type'] = rtypes

    if not os.path.exists(feat_dir):
        os.makedirs(feat_dir)

    if 'max_n_nodes' in dataset.meta_info:
        max_n_nodes = dataset.meta_info['max_n_nodes']
    else:
        max_n_nodes = 1024
    logger.info("Max. number of nodes: {}".format(max_n_nodes))

    # we do not featurize test set for training
    all_inds = np.argwhere(split['test'] == 0).flatten()

    # shuffle indices for featurization in multiple threads
    np.random.shuffle(all_inds)

    data_len = len(data)
    samples_len = data_len * self.max_n_steps

    chunk_size = int(len(all_inds) / self.n_jobs)
    chunk_ends = [chunk_size * i for i in range(self.n_jobs + 1)]
    chunk_ends[-1] = len(all_inds)
    chunk_inds = [
        all_inds[chunk_ends[i]:chunk_ends[i + 1]]
        for i in range(len(chunk_ends) - 1)
    ]

    logger.info(f'Finding all possible values of atom and bond properties '
                f'on {len(all_inds)} reactions using {self.n_jobs} chunks')
    parallel_args = []
    for i, ch_inds in enumerate(chunk_inds):
        new_x = dict((k, x.values[ch_inds]) for k, x in data.items())
        parallel_args.append((i, new_x, tqdm))

    prop_dict = {'atom': {}, 'bond': {}}
    if self.n_jobs == 1:
        chunk_results = [find_properties_parallel(parallel_args[0])]
    else:
        pool = Pool(self.n_jobs)
        chunk_results = pool.imap(find_properties_parallel, parallel_args)

    for chunk_prop_dict in chunk_results:
        for type_key in prop_dict.keys():
            for key, values in chunk_prop_dict[type_key].items():
                if key not in prop_dict[type_key]:
                    prop_dict[type_key][key] = set()
                prop_dict[type_key][key].update(values)

    # add some 'special' atom/bond feature values
    prop_dict['atom']['is_supernode'].update([0, 1])
    prop_dict['atom']['is_edited'].update([0, 1])
    prop_dict['atom']['is_reactant'].update([0, 1])
    prop_dict['bond']['bond_type'].update(['supernode', 'self'])
    prop_dict['bond']['is_edited'].update([0, 1])

    atom_feat_counts = ', '.join(['{:s}: {:d}'.format(key, len(values))
                                  for key, values in prop_dict['atom'].items()])
    logger.info(f'Found atom features: {atom_feat_counts}')

    bond_feat_counts = ', '.join(['{:s}: {:d}'.format(key, len(values))
                                  for key, values in prop_dict['bond'].items()])
    logger.info(f'Found bond features: {bond_feat_counts}')

    # make a dictionary for conversion of atom/bond features to OH numbers
    prop2oh = {'atom': {}, 'bond': {}}
    props = {'atom': {}, 'bond': {}}
    for type_key, prop_values in prop_dict.items():
        for prop_key, values in prop_values.items():
            sorted_vals = list(sorted(values, key=lambda x: x if isinstance(x, int) else 0))
            props[type_key][prop_key] = sorted_vals
            oh = dict((k, i + 1) for i, k in enumerate(sorted_vals))
            prop2oh[type_key][prop_key] = oh

    # save 'prop2oh' dictionary
    with open(get_prop2oh_vocab_path(feat_dir), 'w') as fp:
        json.dump({'atom': props['atom'], 'bond': props['bond'],
                   'atom_2oh': prop2oh['atom'], 'bond_2oh': prop2oh['bond']},
                  fp, indent=2)

    atom_feature_keys = [k for k in ORDERED_ATOM_OH_KEYS if k in prop2oh['atom']]
    bond_feature_keys = [k for k in ORDERED_BOND_OH_KEYS if k in prop2oh['bond']]
    action_vocab = {
        'prop2oh': prop2oh,
        'atom_feature_keys': atom_feature_keys,
        'bond_feature_keys': bond_feature_keys,
        'atom_feat_ind': dict((k, i) for i, k in enumerate(atom_feature_keys)),
        'bond_feat_ind': dict((k, i) for i, k in enumerate(bond_feature_keys))
    }

    parallel_args = []
    chunk_save_paths = []
    for i, ch_inds in enumerate(chunk_inds):
        new_x = dict((k, x.values[ch_inds]) for k, x in data.items())
        is_train = split['train'][ch_inds].values
        chunk_save_path = os.path.join(feat_dir, f'chunk_result_{i}')
        chunk_save_paths.append(chunk_save_path)
        parallel_args.append((i, samples_len, ch_inds, new_x, max_n_nodes, tqdm,
                              self.max_n_steps, is_train, reaction_type_given,
                              self.forward, self.action_order,
                              action_vocab, chunk_save_path))

    logger.info(f'Featurizing {len(all_inds)} reactions with {self.n_jobs} threads')
    logger.info(f"Number of generated paths (train+valid): {data_len}")
    logger.info(f"Upper bound for number of generated samples: "
                f"{samples_len} ({data_len} * {self.max_n_steps})")

    if self.n_jobs == 1:
        chunk_results = [featurize_parallel(parallel_args[0])]
    else:
        # leave one job for merging results
        pool = Pool(max(self.n_jobs - 1, 1))
        chunk_results = pool.imap(featurize_parallel, parallel_args)

    logger.info(f"Merging featurized data from {self.n_jobs} chunks")

    nodes_mat = sparse.csr_matrix(([], ([], [])), shape=(samples_len, max_n_nodes))
    adj_mat = sparse.csr_matrix(([], ([], [])), shape=(samples_len, max_n_nodes ** 2))
    n_sample_data = 6 if reaction_type_given else 5
    sample_data_mat = sparse.csr_matrix(([], ([], [])), shape=(samples_len, n_sample_data))
    meta = []

    # vocabulary of actions
    actions_vocab = []
    action2ind = {}
    action_inds = []
    action_tuples = []
    sample_inds = []

    for ch_inds, result_code, chunk_save_path in tqdm(
            zip(chunk_inds, chunk_results, chunk_save_paths),
            desc='merging reactions from chunks', total=self.n_jobs):
        sample_data_path = os.path.join(chunk_save_path, 'sample_data.npz')
        sample_data_mat += sparse.load_npz(sample_data_path)

        nodes_mat_path = os.path.join(chunk_save_path, 'nodes_mat.npz')
        nodes_mat += sparse.load_npz(nodes_mat_path)

        adj_mat_path = os.path.join(chunk_save_path, 'adj_mat.npz')
        adj_mat += sparse.load_npz(adj_mat_path)

        meta_save_path = os.path.join(chunk_save_path, 'metadata.csv')
        chunk_meta = pd.read_csv(meta_save_path)
        meta.append(chunk_meta)

        actions_save_path = os.path.join(chunk_save_path, 'actions.txt')
        chunk_action_tuples = []
        for line in open(actions_save_path, 'r'):
            action = eval(line.strip())
            chunk_action_tuples.append(action)

        for sample_ind, action in chunk_action_tuples:
            if action in action2ind:
                action_inds.append(action2ind[action])
            else:
                action_ind = len(actions_vocab)
                action2ind[action] = action_ind
                actions_vocab.append(action)
                action_tuples.append(action)
                action_inds.append(action_ind)
            sample_inds.append(sample_ind)

        # remove temporary chunk files
        shutil.rmtree(chunk_save_path)

        logger.info(f"Merged chunk {len(meta)} "
                    f"(unparsed samples: {result_code}/{len(ch_inds)})")

    logger.info("Concatenating metadata")
    meta = pd.concat(meta)

    logger.info("Saving found actions")
    sample_data_mat[sample_inds, 0] = action_inds
    with open(get_actions_vocab_path(feat_dir), 'w') as fp:
        json.dump(action_tuples, fp)
    logger.info(f"Found {len(action_tuples)} reaction actions")

    n_samples = meta['n_samples']
    logger.info(f"Number of steps: max: {np.max(n_samples)}, avg: {np.mean(n_samples)}")

    logger.info("Saving featurized data")
    meta.to_csv(get_metadata_path(feat_dir))
    sparse.save_npz(get_sample_data_path(feat_dir), sample_data_mat)
    sparse.save_npz(get_nodes_path(feat_dir), nodes_mat)
    sparse.save_npz(get_adj_path(feat_dir), adj_mat)

    n_saved_reacs = len(np.unique(meta['reaction_ind']))
    logger.info(f"Saved {n_saved_reacs}/{len(all_inds)} reactions "
                f"({n_saved_reacs / len(all_inds) * 100}%)")
    logger.info(f"Saved {len(meta)} paths "
                f"(avg. {len(meta) / n_saved_reacs} paths per reaction)")

    logger.info("Saving featurization metadata")
    meta_info = {
        'description': 'Graph representation of molecules with discrete '
                       'node and edge features for MEGAN',
        'features': ['atom', 'bond'],
        'features_type': ['atom', 'bond'],
        'max_n_nodes': max_n_nodes,
        'format': 'sparse'
    }
    meta_path = self.meta_info_path(dataset.feat_dir)
    with open(meta_path, 'w') as fp:
        json.dump(meta_info, fp, indent=2)
def match_plans(self):
    # leave one core free for the main process
    pool = Pool(multiprocessing.cpu_count() - 1)
    plans = list(reversed(list(tqdm(pool.imap(match_plan, list(reversed(self.data))),
                                    total=len(self.data)))))
    # inner loop variable renamed from 'plans' to 'ps' to avoid
    # shadowing the outer list
    self.data = [d.set_plan(p)
                 for d, ps in zip(self.data, plans)
                 for p in ps]
    return self
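The original comprehension reused the name plans as its inner loop variable. That happens to work because the iterable of the first 'for' clause is evaluated in the enclosing scope before the comprehension binds its own names, but it reads like a bug, hence the rename above. A minimal illustration of the scoping rule:

# The iterable of the first 'for' clause is evaluated in the enclosing
# scope, so 'in xs' there refers to the outer list.
xs = [[1, 2], [3]]
flat = [x for xs in xs for x in xs]
print(flat)  # [1, 2, 3]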