def run_mcmc(self):
    """
    Run the MCMC chains if they are incomplete, then build the likelihood.

    When ``self.chains.check_completness()`` returns a falsy value,
    ``self.nchains`` new ``Chain`` objects are appended to
    ``self.chains.chains`` and ``self.mcmc`` is mapped over every chain in a
    pool of ``self.nchains`` worker processes.

    Afterwards the likelihood is built from the chains, its statistics are
    computed, ``self.p`` is set to ``self.chains.best_row_params``, the
    simulation is evaluated at those parameters and the diagnostic
    plots/convergence tests are run.

    A ``KeyboardInterrupt`` during sampling stops the workers and the method
    carries on with whatever the chains object currently holds. Any other
    exception stops the workers and is re-raised.

    Returns
    -------
    The object returned by ``self.chains.chains_to_likelihood()``
    (also stored in ``self.likelihood``).
    """
    complete = self.chains.check_completness()
    if not complete:
        for i in range(self.nchains):
            self.chains.chains.append(
                Chain(self.chains.chains_filename, self.covfile, nchain=i, nsteps=self.nsteps))
        pool = Pool(processes=self.nchains)
        try:
            # Without the timeout passed to .get(), you can't interrupt this with Ctrl+C.
            pool.map_async(self.mcmc, self.chains.chains).get(999999)
            pool.close()
            pool.join()
            # to skip lines after the progress bars
            print('\n' * self.nchains)
        except KeyboardInterrupt:
            # Deliberate best effort: stop sampling and continue with the
            # post-processing below. Join so no worker outlives this call.
            pool.terminate()
            pool.join()
        except Exception:
            # Do not leave worker processes behind when sampling fails.
            pool.terminate()
            pool.join()
            raise
    self.likelihood = self.chains.chains_to_likelihood()
    self.likelihood.stats(self.covfile)
    # The best-fit row of the chains is used as the parameter estimate
    # (the per-parameter pdf means were used by an earlier, disabled variant).
    self.p = self.chains.best_row_params
    self.simulation(self.spectrum.lambdas, *self.p)
    # Plotting is unconditional (an `if self.plot` guard was disabled).
    self.likelihood.triangle_plots()
    self.plot_fit()
    # Convergence tests are unconditional (an `if convergence_test` guard was disabled).
    self.chains.convergence_tests()
    return self.likelihood
def multiproc_map(self, func):
    """
    Apply ``func`` to every element of ``self`` in a process pool.

    A pool with the default number of workers is created, ``func`` is mapped
    over the elements obtained by iterating ``self``, and the mapped values
    are wrapped in a new ``List``.

    The pool is always closed and joined, even when ``func`` raises in a
    worker, so no worker processes are left behind.

    :param func: callable applied to each element; it must be transferable
        to the worker processes.
    :return: ``List`` built from the mapped values.
    """
    from pathos.multiprocessing import Pool

    pool = Pool()
    try:
        mapped = pool.map(func, self)
    finally:
        pool.close()
        pool.join()
    return List(mapped)
def main(args):
    """
    Project a density map for every particle stack listed in a star file.

    The star file ``args.input`` is parsed, ``rlnImageName`` is split at "@"
    into ``ImageNumber`` / ``ImageName`` columns, and the rows are grouped by
    ``ImageName``.  ``project_stack`` is called once per group with the map
    loaded from ``args.map`` and the destination ``args.dest``; with
    ``args.nproc > 1`` the calls are distributed over a process pool.
    A running total is written to stdout after each finished stack.

    :param args: parsed command-line arguments; the attributes ``map``,
        ``input``, ``dest`` and ``nproc`` are read.
    :return: 0 on completion.
    """
    dens = EMData(args.map)
    star = parse_star(args.input, keep_index=False)
    star[["ImageNumber", "ImageName"]] = star['rlnImageName'].str.split("@", expand=True)
    grouped = star.groupby("ImageName")
    pool = None
    if args.nproc > 1:
        pool = Pool(processes=args.nproc)
        # NOTE(review): a lambda can only be shipped to workers if Pool comes
        # from a dill-based implementation (e.g. pathos) - confirm the import.
        results = pool.imap(lambda x: project_stack(x, dens, args.dest),
                            (group for name, group in grouped))
    else:
        results = (project_stack(group, dens, args.dest) for name, group in grouped)
    t = 0
    try:
        # Each result is added to the running particle total `t`.
        for i, r in enumerate(results, 1):
            t += r
            sys.stdout.write("\rProjected %d particles in %d stacks" % (t, i))
            sys.stdout.flush()
    except BaseException:
        # Stop outstanding work immediately instead of leaking the workers.
        if pool is not None:
            pool.terminate()
            pool.join()
        raise
    if pool is not None:
        pool.close()
        pool.join()
    sys.stdout.write('\n')
    sys.stdout.flush()
    return 0
def vw_train_and_test(self, options_list, data_file_paths):
    """
    Train and test one VW model per option set, in parallel when possible.

    :param options_list: list of option dicts; each dict is merged into the
        train and the test options of one learner run.
    :param data_file_paths: dict with the keys ``'train'`` and ``'test'``
        holding the data file paths.  ``<path>.cache`` is used as cache file.
    :return: list of ``(options, predictions)`` tuples, one per entry of
        ``options_list`` (empty list for an empty ``options_list``).

    With more than one option set, a pool with one worker per option set is
    used.  If either cache file is missing, the first option set is run once
    beforehand (the result is discarded; presumably to create the caches
    before the workers start - confirm).  On Ctrl+C the workers are
    terminated and the process exits with status 0.
    """

    def init_worker():
        # Workers ignore SIGINT so that only the parent handles Ctrl+C.
        signal.signal(signal.SIGINT, signal.SIG_IGN)

    def run_learner(options):
        # Train on the train file, predict on the test file, then remove the
        # temporary model and prediction files.
        train_options = {
            'final_regressor': os.path.join(self.work_dir, id_generator()),
            'data': data_file_paths['train'],
            'cache_file': data_file_paths['train'] + '.cache'
        }
        train_options.update(options)
        test_options = {
            'data': data_file_paths['test'],
            'predictions': os.path.join(self.work_dir, id_generator()),
            'cache_file': data_file_paths['test'] + '.cache'
        }
        test_options.update(options)
        # TO DO: remove below if.
        if 'kill_cache' in options:
            del train_options['kill_cache']
            del test_options['kill_cache']
        vw_wrapper = VW_Wrapper(verbose=False)
        vw_wrapper.train(train_options)
        predictions = vw_wrapper.test(test_options)
        os.remove(train_options['final_regressor'])
        os.remove(test_options['predictions'])
        return options, predictions

    if len(options_list) > 1:
        # Bound before the try block: Ctrl+C may arrive during the warm-up
        # run, i.e. before the pool exists.
        pool = None
        try:
            if not os.path.isfile(data_file_paths['test'] + '.cache') or not os.path.isfile(
                    data_file_paths['train'] + '.cache'):
                run_learner(options_list[0])
            pool = Pool(len(options_list), init_worker)
            # The (huge) timeout keeps the call interruptible with Ctrl+C.
            result_list = pool.map_async(run_learner, options_list).get(99999999)
            pool.close()
            pool.join()
            return result_list
        except KeyboardInterrupt:
            print(' Keyboard Interrupt, exiting...) ')
            if pool is not None:
                pool.terminate()
                pool.join()
            sys.exit(0)
    elif len(options_list) == 1:
        return [run_learner(options_list[0])]
    else:
        return []
def create_histograms(self):
    """
    Create, collect and produce the ROOT objects of all systematics.

    Steps, in order:

    1. Create the ROOT objects of every entry of ``self._systematics``,
       serially for ``self._num_threads == 1`` and otherwise through a
       process pool running ``systematic_create_root_objects``.
    2. Register each systematic's ``root_objects`` in a fresh ``RootObjects``
       holder (``add_unique`` when ``self._find_unique_objects`` is set,
       ``add`` otherwise).
    3. Produce the objects with the backend named by ``self._backend``
       (``"classic"`` or ``"tdf"``).
    4. When ``self._find_unique_objects`` is set, call ``set_duplicates()``
       on the holder.

    Raises
    ------
    Exception
        If ``self._backend`` is neither ``"classic"`` nor ``"tdf"``.
    """
    logger.debug('--->Systematics::create_histograms:')
    # collect ROOT objects
    self._root_objects_holder = RootObjects(self._output_file)
    if self._num_threads == 1:
        for systematic in self._systematics:
            logger.debug("---->Create ROOT objects for systematic %s.", systematic.name)
            # 10 is the numeric value of logging.DEBUG.
            if logger.getEffectiveLevel() == 10:
                print("%s %s %s" % (
                    '---->Systematics::create_histograms: systematic',
                    systematic.process,
                    systematic._process.estimation_method._friend_directories))
            systematic.create_root_objects()
    else:
        logger.debug("Create ROOT objects for all systematics.")
        from pathos.multiprocessing import Pool
        pool = Pool(processes=self._num_threads)
        try:
            systematics_new = pool.map(systematic_create_root_objects,
                                       list(self._systematics))
        finally:
            # Release the workers even when one of the tasks failed.
            pool.close()
            pool.join()
        del pool
        # Because the new objects have different addresses in memory,
        # the result objects have to be copied.
        for i_sys, systematic_new in enumerate(systematics_new):
            self._systematics[i_sys] = systematic_new
    logger.debug('-->Create root holders')
    for systematic in self._systematics:
        if self._find_unique_objects:
            self._root_objects_holder.add_unique(systematic.root_objects)
        else:
            self._root_objects_holder.add(systematic.root_objects)
    # self._root_objects_holder.check_duplicates() # TODO: Implement this if needed

    # produce ROOT objects (in parallel)
    logger.debug("Produce ROOT objects using the %s backend.", self._backend)
    logger.debug('-->Produce root with' + self._backend + 'backend')
    if self._backend == "classic":
        self._root_objects_holder.produce_classic(self._num_threads)
    elif self._backend == "tdf":
        self._root_objects_holder.produce_tdf(self._num_threads)
    else:
        logger.fatal("Backend %s is not implemented.", self._backend)
        raise Exception("Backend %s is not implemented." % self._backend)

    # set duplicates to the produced ROOT objects
    logger.debug('--># set duplicates to the produced ROOT objects')
    if self._find_unique_objects:
        self._root_objects_holder.set_duplicates()
def calc_distances_between_nodes(self):
    """
    Compute DTW-based distances between nodes and merge them into
    ``self.distance``.

    ``self.distance_calc_func`` is set to ``self.distance_opt1_func`` when
    ``self.opt1 == True`` and to ``self.distance_func`` otherwise.

    With ``self.opt2`` enabled, nodes are first indexed by their depth-0
    degree (``self.degree2nodes`` / ``self.node2degree``), the sorted degree
    list and ``self.selected_nbh_nums = 2 * log2(num_nodes - 1)`` are stored,
    and ``self.calc_node_with_neighbor_dtw_opt2`` is mapped over
    ``self.nodes``.  Otherwise ``self.calc_node_with_neighbor_dtw`` is mapped
    over the indices ``0 .. num_nodes - 3``.

    Both variants run in a pool of 10 workers; every returned mapping is
    merged into ``self.distance`` with ``update``.
    """
    from fastdtw import fastdtw
    from pathos.multiprocessing import Pool

    # decide which distance function to use
    self.distance_calc_func = (
        self.distance_opt1_func if self.opt1 == True else self.distance_func)

    if self.opt2:
        depth = 0
        for node in self.nodes:
            if node not in self.degree_list:
                continue
            if depth not in self.degree_list[node]:
                continue
            degree = self.degree_list[node][depth]
            # NOTE(review): this reads the module-level `args.opt1` while the
            # selection above reads `self.opt1` - confirm they always agree.
            degree = degree[0][0] if args.opt1 else degree[0]
            if degree not in self.degree2nodes:
                self.degree2nodes[degree] = []
            if node not in self.node2degree:
                self.node2degree[node] = degree
            self.degree2nodes[degree].append(node)

        # select the log(n) node to select data
        degree_keys = np.array(list(self.degree2nodes.keys()), dtype='int')
        self.degrees_sorted = list(np.sort(degree_keys))
        self.selected_nbh_nums = 2 * math.log(self.graph.num_nodes - 1, 2)

        worker = self.calc_node_with_neighbor_dtw_opt2
        work_items = self.nodes
    else:
        worker = self.calc_node_with_neighbor_dtw
        work_items = range(0, self.graph.num_nodes - 2)

    pool = Pool(10)
    dtws = pool.map(worker, work_items)
    pool.close()
    pool.join()

    print('calc the dtw done.')
    for partial_distances in dtws:
        self.distance.update(partial_distances)
def apply_parallel(data: List[Any], func: Callable) -> List[Any]:
    """
    Apply ``func`` to chunks of ``data`` in a process pool.

    ``data`` is split with ``chunked`` into pieces of
    ``ceil(len(data) / cpu_count())`` elements and ``func`` is called once
    per piece, so ``func`` receives a chunk of elements rather than a single
    element (assumes ``chunked`` yields consecutive slices of ``data`` -
    TODO confirm against its definition).

    :param data: elements to process.
    :param func: callable invoked with one chunk; it must be transferable to
        the worker processes.
    :return: list with one entry per chunk, in chunk order; an empty list
        when ``data`` is empty.
    """
    # Nothing to distribute; also avoids a chunk size of 0.
    if len(data) == 0:
        return []

    cpu_cores = cpu_count()
    chunk_size = ceil(len(data) / cpu_cores)
    # Create the pool before entering `try` so the cleanup below never
    # touches an unbound name if pool creation itself fails.
    pool = Pool(cpu_cores)
    try:
        return pool.map(func, chunked(data, chunk_size), chunksize=1)
    finally:
        pool.close()
        pool.join()
def random_walk_structual_sim(self):
    """
    Run the struc2vec random walks in parallel and write them to disk.

    ``self.executor_random_walk`` is called once for every id in
    ``0 .. self.num_walks - 1`` using a pool of 10 workers.  Each call's
    result is iterated as a collection of walks; every walk is written as one
    line of space-separated nodes to the file ``<args.tag>_walk_path``
    (``args`` is the module-level argument namespace).

    The pool and the output file are released even if a walk or a write
    fails.
    """
    from pathos.multiprocessing import Pool

    print('start process struc2vec random walk.')
    walks_process_ids = list(range(self.num_walks))
    pool = Pool(10)
    try:
        walks = pool.map(self.executor_random_walk, walks_process_ids)
    finally:
        pool.close()
        pool.join()

    # save the final walk result
    with open(args.tag + "_walk_path", "w") as file_result:
        for walk in walks:
            for walk_node in walk:
                walk_node_str = " ".join([str(node) for node in walk_node])
                file_result.write(walk_node_str + "\n")
    print('process struc2vec random walk done.')
return i, j, phase_hilb_rc pool = Pool(NUM_WORKERS) net.wavelet(1, 'y', pool = pool, cut = 1) # Hilbert on RC SSA fluctuations # args = [ (i, j, net.data[:, i, j]) for i in range(net.lats.shape[0]) for j in range(net.lons.shape[0]) ] # results = pool.map(_hilbert_ssa, args) # for i, j, res in results: # net.phase[:, i, j] = res net.get_continuous_phase(pool = pool) net.get_phase_fluctuations(rewrite = True, pool = pool) pool.close() pool.join() # index_correlations = {} # index_datas = {} # # SURROGATES # for index, ndx_type, start_date, end_year in zip(INDICES, DATE_TYPE, START_DATES, END_YEARS): # load index # print index # if index != 'NINO3.4': index_data = DataField() raw = np.loadtxt("%sNAO.station.monthly.1865-2016.txt" % (path_to_data)) raw = raw[:, 1:] index_data.data = raw.reshape(-1) index_data.create_time_array(date_from = date(1865, 1, 1), sampling = 'm')
def partial_dependence(self, feature_ids, modelinstance, filter_classes=None, grid=None,
                       grid_resolution=30, n_jobs=-1, grid_range=None, sample=True,
                       sampling_strategy='random-choice', n_samples=1000,
                       bin_count=50, samples_per_bin=10, return_metadata=False):
    """
    Approximates the partial dependence of the predict_fn with respect to the
    variables passed.

    Parameters:
    -----------
    feature_ids: list
        the names/ids of the features for which partial dependence is to be computed.
        Note that the algorithm's complexity scales exponentially with additional
        features, so generally one should only look at one or two features at a
        time. These feature ids must be available in the class's associated DataSet.
        As of now, we only support looking at 1 or 2 features at a time.
    modelinstance: skater.model.model.Model subtype
        an estimator function of a fitted model used to derive prediction.
        Supports classification (binary, multi-class) and regression.
        predictions = predict_fn(data)

        Can either be a skater.model.remote.DeployedModel or a
        skater.model.local.InMemoryModel
    filter_classes: array type
        The classes to run partial dependence on. Default None invokes all classes.
        Only used in classification models.
    grid: numpy.ndarray
        2 dimensional array on which we fix values of features. Note this is
        determined automatically if not given based on the percentiles of the
        dataset.
    grid_resolution: int
        how many unique values to include in the grid. If the percentile range
        is 5% to 95%, then that range will be cut into <grid_resolution>
        equally size bins. Defaults to 30.
    n_jobs: int
        The number of CPUs to use to compute the PDs. -1 means 'all CPUs'.
        Defaults to using all cores(-1).
    grid_range: tuple
        the percentile extrema to consider. 2 element tuple, increasing, bounded
        between 0 and 1. Defaults to (0.05, 0.95) when None.
    sample: boolean
        Whether to sample from the original dataset.
    sampling_strategy: string
        If sampling, which approach to take. See DataSet.generate_sample for
        details.
    n_samples: int
        The number of samples to use from the original dataset. Note this is
        only active if sample = True and sampling strategy = 'uniform'. If
        using 'uniform-over-similarity-ranks', use samples per bin
    bin_count: int
        The number of bins to use when using the similarity based sampler. Note
        this is only active if sample = True and
        sampling_strategy = 'uniform-over-similarity-ranks'.
        total samples = bin_count * samples per bin.
    samples_per_bin: int
        The number of samples to collect for each bin within the sampler. Note
        this is only active if sample = True and
        sampling_strategy = 'uniform-over-similarity-ranks'. If using
        sampling_strategy = 'uniform', use n_samples.
        total samples = bin_count * samples per bin.
    return_metadata: boolean
        If True, the metadata dict built for the computation is returned
        together with the result as a ``(DataFrame, metadata)`` tuple.

    Returns:
    --------
    pandas.DataFrame built from one computed entry per row of the expanded
    grid, or a ``(DataFrame, metadata)`` tuple when ``return_metadata`` is True.

    Raises:
    -------
    exceptions.DataSetNotLoadedError
        if no data set has been loaded.
    exceptions.ModelError
        if ``modelinstance`` is not a ``ModelType``, or is a classifier without
        probability scores and without ``unique_values``.
    KeyError
        if a requested feature id is not in the data set.
    exceptions.MalformedGridRangeError
        if ``grid_range`` is given but is not iterable.
    exceptions.MalformedGridError
        if the expanded grid is empty.
    AssertionError
        if ``filter_classes`` contains a class not in
        ``modelinstance.target_names``.

    :Example:
    >>> from skater.model import InMemoryModel
    >>> from skater.core.explanations import Interpretation
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.datasets import load_boston
    >>> boston = load_boston()
    >>> X = boston.data
    >>> y = boston.target
    >>> features = boston.feature_names
    >>> rf = RandomForestClassifier()
    >>> rf.fit(X,y)
    >>> model = InMemoryModel(rf, examples = X)
    >>> interpreter = Interpretation()
    >>> interpreter.load_data(X)
    >>> feature_ids = ['ZN','CRIM']
    >>> interpreter.partial_dependence.partial_dependence(features,model)
    """
    if self.data_set is None:
        load_data_not_called_err_msg = "self.interpreter.data_set not found. " \
                                       "Please call Interpretation.load_data " \
                                       "before running this method."
        raise(exceptions.DataSetNotLoadedError(load_data_not_called_err_msg))

    feature_ids = self._check_features(feature_ids)

    if filter_classes:
        err_msg = "members of filter classes must be " \
                  "members of modelinstance.classes. " \
                  "Expected members of: " \
                  "{0}\n" \
                  "got: " \
                  "{1}".format(modelinstance.target_names, filter_classes)
        assert all([i in modelinstance.target_names for i in filter_classes]), err_msg

    # TODO: There might be a better place to do this check
    if not isinstance(modelinstance, ModelType):
        raise(exceptions.ModelError("Incorrect estimator function used for computing partial dependence, try one "
                                    "creating one with skater.model.local.InMemoryModel or "
                                    "skater.model.remote.DeployedModel"))

    if modelinstance.model_type == 'classifier' and modelinstance.probability is False:
        if modelinstance.unique_values is None:
            raise(exceptions.ModelError('If using classifier without probability scores, unique_values cannot '
                                        'be None'))
        self.interpreter.logger.warn("Classifiers with probability scores can be explained "
                                     "more granularly than those without scores. If a prediction method with "
                                     "scores is available, use that instead.")

    missing_feature_ids = [feature_id for feature_id in feature_ids
                           if feature_id not in self.data_set.feature_ids]

    if missing_feature_ids:
        missing_feature_id_err_msg = "Features {0} not found in " \
                                     "Interpretation.data_set.feature_ids " \
                                     "{1}".format(missing_feature_ids, self.data_set.feature_ids)
        raise(KeyError(missing_feature_id_err_msg))

    if grid_range is None:
        grid_range = (.05, 0.95)
    else:
        if not hasattr(grid_range, "__iter__"):
            err_msg = "Grid range {} needs to be an iterable".format(grid_range)
            raise(exceptions.MalformedGridRangeError(err_msg))

    self._check_grid_range(grid_range)

    if not modelinstance.has_metadata:
        # Build the model metadata from a small random sample of the data set.
        examples = self.data_set.generate_sample(strategy='random-choice',
                                                 sample=True,
                                                 n_samples_from_dataset=10)
        examples = DataManager(examples, feature_names=self.data_set.feature_ids)
        modelinstance._build_model_metadata(examples)

    # if you dont pass a grid, build one.
    grid = np.array(grid)
    # NOTE(review): `.any()` is also falsy for an explicitly passed all-zero
    # grid, which is then replaced by a generated one - confirm intended.
    if not grid.any():
        # Currently, if a given feature has fewer unique values than the value
        # of grid resolution, then the grid will be set to those unique values.
        # Otherwise it will take the percentile
        # range according with grid_resolution bins.
        grid = self.data_set.generate_grid(feature_ids,
                                           grid_resolution=grid_resolution,
                                           grid_range=grid_range)
    else:
        # want to ensure all grids have 2 axes
        if len(grid.shape) == 1 and \
                (StaticTypes.data_types.is_string(grid[0]) or StaticTypes.data_types.is_numeric(grid[0])):
            grid = grid[:, np.newaxis].T
            grid_resolution = grid.shape[1]

    self.interpreter.logger.debug("Grid shape used for pdp: {}".format(grid.shape))
    self.interpreter.logger.debug("Grid resolution for pdp: {}".format(grid_resolution))

    # make sure data_set module is giving us correct data structure
    self._check_grid(grid, feature_ids)

    # generate data
    data_sample = self.data_set.generate_sample(strategy=sampling_strategy,
                                                sample=sample,
                                                n_samples_from_dataset=n_samples,
                                                samples_per_bin=samples_per_bin,
                                                bin_count=bin_count)

    _pdp_metadata = self._build_metadata_dict(modelinstance,
                                              feature_ids,
                                              self.data_set.feature_ids,
                                              filter_classes)

    self.interpreter.logger.debug("Shape of sampled data: {}".format(data_sample.shape))
    self.interpreter.logger.debug("Feature Ids: {}".format(feature_ids))
    self.interpreter.logger.debug("PD metadata: {}".format(_pdp_metadata))

    # cartesian product of grid
    grid_expanded = pd.DataFrame(list(product(*grid))).values

    if grid_expanded.shape[0] <= 0:
        empty_grid_expanded_err_msg = "Must have at least 1 pdp value " \
                                      "grid shape: {}".format(grid_expanded.shape)
        raise(exceptions.MalformedGridError(empty_grid_expanded_err_msg))

    predict_fn = modelinstance._get_static_predictor()

    # A negative n_jobs is passed to Pool as None (documented above as 'all CPUs').
    n_jobs = None if n_jobs < 0 else n_jobs
    pd_func = functools.partial(_compute_pd,
                                estimator_fn=predict_fn,
                                grid_expanded=grid_expanded,
                                pd_metadata=_pdp_metadata,
                                input_data=data_sample,
                                filter_classes=filter_classes)
    # One task per row of the expanded grid.
    arg_list = list(range(grid_expanded.shape[0]))
    executor_instance = Pool(n_jobs)
    try:
        pd_list = executor_instance.map(pd_func, arg_list)
    except Exception:
        # Fall back to in-process evaluation. `Exception` (not a bare except)
        # so that KeyboardInterrupt / SystemExit still propagate.
        self.interpreter.logger.debug("Multiprocessing failed, going single process")
        pd_list = map(pd_func, arg_list)
    finally:
        executor_instance.close()
        executor_instance.join()
        executor_instance.terminate()
    if return_metadata:
        return pd.DataFrame(list(pd_list)), _pdp_metadata
    else:
        return pd.DataFrame(list(pd_list))
# learnMulti(self, gn, top_n=5)
#
# Purpose: search for low-MDL grammars starting from the grammar node `gn`,
# evaluating candidate SUBSTITUTE and MERGE expansions asynchronously in a
# process pool sized by cpu_count(), and return `gnBest`, a list kept sorted
# by ascending `.mdl`.
#
# What the visible code does:
#   * `gList` is a work list consumed from the front (`pop(0)`); it starts
#     with `gn`.  `gnBest` starts with a deep copy of `gn`.
#   * For every candidate returned by getFirstDLThack(gn.dlt) and
#     mergeSet(gn.g), a job is submitted with pool.apply_async to
#     substituteMulti / mergeMulti.  Each job gets the node, the candidate, a
#     fresh id from `gidCounter`, and the shared Manager lock `P_LOCK`.
#   * Pending results are polled with `.ready()`, sleeping 1 ms per sweep.
#     A finished grammar is inserted into `gnBest` if fewer than `top_n` are
#     stored, or if it beats the current worst one, which it then replaces.
#   * A finished grammar whose mdl is not below `bestmdl_last` gets its
#     `worse` counter incremented; it is re-queued on `gList` only while
#     `worse < BEAMSIZE`.
#   * Indices of finished results are collected in `delList`, sorted, and
#     deleted from `results` from the highest index down, so earlier deletions
#     do not shift the indices still to be removed.
#   * The pool is closed and joined before `gnBest` is returned.
#
# NOTE(review): a new grammar is deep-copied only in the "replace the worst"
# branch, not when appended while len(gnBest) < top_n - confirm intended.
# NOTE(review): the pool is not released if a worker result raises in .get().
# NOTE(review): the original indentation of this block is lost (the body is
# collapsed onto two physical lines).  In particular it cannot be determined
# from this text whether the result-draining `while len(results) >= 1` loop
# and `bestmdl_last = bestmdl` sit inside the outer `while len(gList) > 0`
# loop.  The code is therefore left untouched; restore the formatting from
# version control before restructuring.
def learnMulti(self, gn, top_n=5): """ Learn using multiprocessing gn: initial grammar top_n: save the best top_n grammars """ # settings max_process = cpu_count( ) # processes to run in parallel; or use cpu_count() pool = Pool(max_process) P_LOCK = Manager().Lock() # mutex lock for printing gidCounter = 0 results = [] gList = [gn] gnBest = [deepcopy(gn)] bestmdl_last = gn.mdl # best MDL on the upper level nodes # uncomment this for debugging #history_pri_lik= set() # history of prior & likelihood values while len(gList) > 0: gn = gList.pop(0) # grammar node to be expanded self.printMsg( 1, '>> gList size: %d, bestMDL: %.3f (#%d)' % (len(gList), gnBest[0].mdl, gnBest[0].gid)) # substitute if possible ntlist = self.getFirstDLThack(gn.dlt) if len(ntlist) > 0: self.printMsg(2, '>> Possible substitutions on #%d:' % (gn.gid), ntlist) while len(ntlist) > 0: gidCounter += 1 argList = [gn, ntlist.pop(), gidCounter, P_LOCK] results.append( pool.apply_async(self.substituteMulti, [argList])) else: self.printMsg( 2, '>> No more SUBSTITUTE possible on #%d\n' % gn.gid) # merge if possible ntlist = self.mergeSet(gn.g) if len(ntlist) > 0: self.printMsg(2, '>> Possible merges on #%d:' % (gn.gid), ntlist) while len(ntlist) > 0: gidCounter += 1 argList = [gn, ntlist.pop(), gidCounter, P_LOCK] results.append(pool.apply_async(self.mergeMulti, [argList])) else: self.printMsg(2, '>> No more MERGE possible on #%d.\n' % gn.gid) del gn delList = [] bestmdl = bestmdl_last # search next level in the search tree while len(results) >= 1: time.sleep(0.001) # avoid wasting resource for r in range(len(results)): if results[r].ready(): delList.append(r) gn_new = results[r].get() # uncomment for debugging #if gn_new.lik < self.max_mdl: # history_pri_lik.add((gn_new.pri,gn_new.lik)) # save the best-N grammars if len(gnBest) < top_n: gnBest.append(gn_new) gnBest = sorted(gnBest, key=lambda gn: gn.mdl) elif gn_new.mdl < gnBest[-1].mdl: del gnBest[-1] gnBest.append(deepcopy(gn_new)) gnBest = 
sorted(gnBest, key=lambda gn: gn.mdl) # save this level's best mdl if gn_new.mdl < bestmdl: bestmdl = gn_new.mdl # beam search: compare with the best mdl on the upper level if gn_new.mdl >= bestmdl_last: gn_new.worse += 1 if gn_new.worse < BEAMSIZE: gList.append(gn_new) else: del gn_new delList.sort() for d in range(len(delList)): del results[delList.pop()] bestmdl_last = bestmdl pool.close() pool.join() return gnBest
'air', date(1950, 1, 1), date(2016, 1, 1), None, None, None, 'monthly', anom=False) pool = Pool(20) net.wavelet(1, 'y', pool=pool, cut=1) net.get_continuous_phase(pool=pool) print "wavelet done" net.get_phase_fluctuations(rewrite=True, pool=pool) print "fluctuations done" pool.close() pool.join() net.phase_fluctuations -= np.nanmean(net.phase_fluctuations, axis=0) net.get_adjacency_matrix(net.phase_fluctuations, method="L2", pool=None, use_queue=True, num_workers=20) net.save_net('networks/NCEP-SATannual-phase-fluctuations-adjmatL2.bin', only_matrix=True) print "L2 done" ## PHASE FLUCTUATIONS NETWORK correlation print "Computing MI knn..." net = ScaleSpecificNetwork(fname, 'air', date(1950, 1, 1),
def main(options):
    """
    Projection subtraction program entry point.

    Every particle of the input star file is passed to ``subtract`` (in a
    process pool when ``options.nproc > 1``).  The subtracted particles are
    appended to ``.mrcs`` stacks of at most ``options.maxpart`` images each,
    and one line per particle is written to ``<options.output>.star`` with
    ``rlnImageName`` pointing at the new stack.

    :param options: Command-line arguments parsed by ArgumentParser.parse_args()
    :return: Exit status (0 on success, 1 when no reference map is given)
    """

    def rchop(text, ending):
        # Strip `ending` from the end of `text` if present.
        if not text.endswith(ending) or len(ending) == 0:
            return text
        return text[:-len(ending)]

    options.output = rchop(options.output, ".star")
    options.suffix = rchop(options.suffix, ".mrc")
    options.suffix = rchop(options.suffix, ".mrcs")

    star = StarFile(options.input)
    npart = len(star['rlnImageName'])

    sub_dens = EMData(options.submap)

    # The whole (reference) map is mandatory, so it is always available for
    # the optional recentering below.
    if options.wholemap is None:
        print("Reference map is required.")
        return 1
    dens = EMData(options.wholemap)

    # Write star header for output.star.
    top_header = "\ndata_\n\nloop_\n"
    headings = star.keys()

    pool = None
    with open("{0}.star".format(options.output), 'w') as output_star:
        output_star.write(top_header)
        for i, heading in enumerate(headings):
            output_star.write("_{0} #{1}\n".format(heading, i + 1))

        if options.recenter:
            # Compute difference vector between new and old mass centers.
            new_dens = dens - sub_dens
            # Note the sign of the shift in coordinate frame is opposite the shift in the CoM.
            recenter = Vec3f(*dens.phase_cog()[:3]) - Vec3f(
                *new_dens.phase_cog()[:3])
        else:
            recenter = None

        if options.nproc > 1:
            # Compute subtraction in parallel.
            pool = Pool(processes=options.nproc)
            # imap requires an integer chunksize >= 1; npart // nproc is 0
            # when there are fewer particles than processes.
            chunksize = max(1, min(npart // options.nproc, options.maxchunk))
            results = pool.imap(
                lambda x: subtract(x,
                                   dens,
                                   sub_dens,
                                   recenter=recenter,
                                   no_frc=options.no_frc,
                                   low_cutoff=options.low_cutoff,
                                   high_cutoff=options.high_cutoff),
                particles(star),
                chunksize=chunksize)
        else:
            # Use serial generator.
            results = (subtract(x,
                                dens,
                                sub_dens,
                                recenter=recenter,
                                no_frc=options.no_frc,
                                low_cutoff=options.low_cutoff,
                                high_cutoff=options.high_cutoff)
                       for x in particles(star))

        try:
            # Write subtraction results to .mrcs and .star files.
            i = 0
            nfile = 1
            starpath = None
            mrcs = None
            mrcs_orig = None
            for r in results:
                if i % options.maxpart == 0:
                    # Start a new output stack, removing stale files first
                    # because images are appended below.
                    mrcsuffix = options.suffix + "_%d" % nfile
                    nfile += 1
                    starpath = "{0}.mrcs".format(
                        os.path.sep.join(
                            os.path.relpath(mrcsuffix,
                                            options.output).split(os.path.sep)[1:]))
                    mrcs = "{0}.mrcs".format(mrcsuffix)
                    mrcs_orig = "{0}_original.mrcs".format(mrcsuffix)
                    if os.path.exists(mrcs):
                        os.remove(mrcs)
                    if os.path.exists(mrcs_orig):
                        os.remove(mrcs_orig)

                r.ptcl_norm_sub.append_image(mrcs)

                if options.original:
                    r.ptcl.append_image(mrcs_orig)

                if logger.getEffectiveLevel() == logging.DEBUG:
                    # Write additional debug output.
                    ptcl_sub_img = r.ptcl.process("math.sub.optimal", {
                        "ref": r.ctfproj,
                        "actual": r.ctfproj_sub,
                        "return_subim": True
                    })
                    ptcl_lowpass = r.ptcl.process("filter.lowpass.gauss", {
                        "apix": 1.22,
                        "cutoff_freq": 0.05
                    })
                    ptcl_sub_lowpass = r.ptcl_norm_sub.process(
                        "filter.lowpass.gauss", {
                            "apix": 1.22,
                            "cutoff_freq": 0.05
                        })
                    ptcl_sub_img.write_image("poreclass_subimg.mrcs", -1)
                    ptcl_lowpass.write_image("poreclass_lowpass.mrcs", -1)
                    ptcl_sub_lowpass.write_image("poreclass_sublowpass.mrcs", -1)
                    r.ctfproj.write_image("poreclass_ctfproj.mrcs", -1)
                    r.ctfproj_sub.write_image("poreclass_ctfprojsub.mrcs", -1)

                assert r.meta.i == i  # Assert particle order is preserved.
                star['rlnImageName'][i] = "{0:06d}@{1}".format(
                    i % options.maxpart + 1, starpath)  # Set new image name.
                r.meta.update(star)  # Update StarFile with altered fields.
                line = ' '.join(str(star[key][i]) for key in headings)
                output_star.write("{0}\n".format(line))
                i += 1
        except BaseException:
            # Stop outstanding work instead of leaking the workers; the
            # `with` block closes the star file.
            if pool is not None:
                pool.terminate()
                pool.join()
            raise

    if pool is not None:
        pool.close()
        pool.join()

    return 0