def featurize(self, mols, parallel=False, client_kwargs=None,
              view_flags=None, verbosity=None, log_every_n=1000):
    """
    Calculate features for molecules.

    Parameters
    ----------
    mols : iterable
        RDKit Mol objects.
    parallel : bool, optional
        Whether to featurize molecules in parallel using IPython.parallel
        (default False).
    client_kwargs : dict, optional
        Keyword arguments for IPython.parallel Client.
    view_flags : dict, optional
        Flags for IPython.parallel LoadBalancedView.
    verbosity : str, optional
        Verbosity level; one of None, 'low', or 'high'.
    log_every_n : int, optional
        Log progress every log_every_n molecules (default 1000).
    """
    if self.conformers and isinstance(mols, types.GeneratorType):
        mols = list(mols)
    assert verbosity in [None, "low", "high"]
    if parallel:
        from IPython.parallel import Client
        if client_kwargs is None:
            client_kwargs = {}
        if view_flags is None:
            view_flags = {}
        client = Client(**client_kwargs)
        client.direct_view().use_dill()  # use dill
        view = client.load_balanced_view()
        view.set_flags(**view_flags)
        call = view.map(self._featurize, mols, block=False)
        features = call.get()  # get output from engines
        call.display_outputs()
    else:
        features = []
        for i, mol in enumerate(mols):
            if verbosity is not None and i % log_every_n == 0:
                log("Featurizing %d / %d" % (i, len(mols)))
            if mol is not None:
                features.append(self._featurize(mol))
            else:
                features.append(np.array([]))
    if self.conformers:
        features = self.conformer_container(mols, features)
    else:
        features = np.asarray(features)
    return features

def featurize(self, mols, parallel=False, client_kwargs=None, view_flags=None):
    """
    Calculate features for molecules.

    Parameters
    ----------
    mols : iterable
        RDKit Mol objects.
    parallel : bool, optional
        Whether to featurize molecules in parallel using IPython.parallel
        (default False).
    client_kwargs : dict, optional
        Keyword arguments for IPython.parallel Client.
    view_flags : dict, optional
        Flags for IPython.parallel LoadBalancedView.
    """
    if self.conformers and isinstance(mols, types.GeneratorType):
        mols = list(mols)
    if parallel:
        from IPython.parallel import Client
        if client_kwargs is None:
            client_kwargs = {}
        if view_flags is None:
            view_flags = {}
        client = Client(**client_kwargs)
        client.direct_view().use_dill()  # use dill
        view = client.load_balanced_view()
        view.set_flags(**view_flags)
        call = view.map(self._featurize, mols, block=False)
        features = call.get()  # get output from engines
        call.display_outputs()
    else:
        features = [self._featurize(mol) for mol in mols]
    if self.conformers:
        features = self.conformer_container(mols, features)
    else:
        features = np.asarray(features)
    return features

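# --- Usage sketch (added for illustration; not part of the original source) ---
# Assumes a hypothetical featurizer subclass named CircularFingerprint that
# implements _featurize(), and an already-running ipcluster when parallel=True.
from rdkit import Chem

smiles = ['CCO', 'c1ccccc1', 'CC(=O)O']
mols = [Chem.MolFromSmiles(s) for s in smiles]

featurizer = CircularFingerprint()  # hypothetical subclass of the class above
features = featurizer.featurize(mols, parallel=True,
                                client_kwargs={'profile': 'default'})
print(features.shape)
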
def get_map(cluster_id=None):
    """
    Get the proper mapping function.

    Parameters
    ----------
    cluster_id : str, optional
        IPython.parallel cluster ID.
    """
    if cluster_id is not None:
        client = Client(cluster_id=cluster_id)
        client.direct_view().use_dill()
        view = client.load_balanced_view()
        return view.map_sync
    else:
        return map

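# --- Usage sketch (added for illustration; not part of the original source) ---
# get_map() returns either the builtin map or a cluster-backed map_sync, so
# callers can stay agnostic about whether an ipcluster is available.
def _square(x):
    return x * x

mapper = get_map()                     # serial fallback: builtin map
results = list(mapper(_square, range(8)))

# With a running cluster, the same call pattern dispatches to the engines:
# mapper = get_map(cluster_id='my-cluster')   # hypothetical cluster ID
# results = mapper(_square, range(8))         # map_sync already returns a list
print(results)
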
def __init__(self, config_filename=None, profile=None, seed=None,
             sshkey=None, packer='json'):
    """Initialize an IPClusterEngine

    Do IPython.parallel operations to set up cluster and generate mapper.
    """
    super(IPClusterEngine, self).__init__(seed=seed)
    rc = Client(config_filename, profile=profile, sshkey=sshkey, packer=packer)
    # FIXME: add a warning if environment in direct view is not 'empty'?
    #        else, might become dependent on an object created in the
    #        environment in a prior run
    dview = rc.direct_view()
    lview = rc.load_balanced_view()
    with dview.sync_imports(local=True):
        import crosscat
    mapper = lambda f, tuples: self.lview.map(f, *tuples)
    # if you're trying to debug issues, consider clearing to start fresh
    # rc.clear(block=True)
    #
    self.rc = rc
    self.dview = dview
    self.lview = lview
    self.mapper = mapper
    self.do_initialize = None
    self.do_analyze = None
    return

def remove_duplicates(df):
    logging.info('Removing duplicates.')
    image_names = df.image_name.unique()

    def process_image_name(image_name):
        data = df[df.image_name == image_name]
        data = remove_duplicates_from_image_name_data(data)
        data.to_hdf(get_temp_fname(image_name), 'df')

    # parallel approach: you need to launch an ipcluster/controller for this to work!
    c = Client()
    dview = c.direct_view()
    dview.push({'remove_duplicates_from_image_name_data':
                remove_duplicates_from_image_name_data,
                'data_root': data_root})
    lbview = c.load_balanced_view()
    lbview.map_sync(process_image_name, image_names)
    df = []
    for image_name in image_names:
        try:
            df.append(pd.read_hdf(get_temp_fname(image_name), 'df'))
        except OSError:
            continue
        else:
            os.remove(get_temp_fname(image_name))
    df = pd.concat(df, ignore_index=True)
    logging.info('Duplicates removal complete.')
    return df

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('db_fname',
                        help="Provide the filename of the HDF database "
                             "file here.")
    args = parser.parse_args()
    image_names = get_image_names_from_db(args.db_fname)
    logging.info('Found {} image_names'.format(len(image_names)))

    c = Client()
    dview = c.direct_view()
    lbview = c.load_balanced_view()
    dview.push({'do_clustering': do_clustering,
                'dbfile': args.db_fname})
    results = lbview.map_async(process_image_name, image_names)

    import time
    import sys
    import os
    dirname = os.path.join(os.environ['HOME'], 'data/planet4/catalog_2_and_3')
    while not results.ready():
        print("{:.1f} %".format(100 * results.progress / len(image_names)))
        sys.stdout.flush()
        time.sleep(10)
    for res in results.result:
        print(res)
    logging.info('Catalog production done. Results in {}.'.format(dirname))

def featurize(self, mols, parallel=False, client_kwargs=None, view_flags=None):
    """
    Calculate features for molecules.

    Parameters
    ----------
    mols : iterable
        RDKit Mol objects.
    parallel : bool, optional (default False)
        Whether to featurize molecules in parallel using IPython.parallel.
    client_kwargs : dict, optional
        Keyword arguments for IPython.parallel Client.
    view_flags : dict, optional
        Flags for IPython.parallel LoadBalancedView.
    """
    if parallel:
        from IPython.parallel import Client
        if client_kwargs is None:
            client_kwargs = {}
        if view_flags is None:
            view_flags = {}
        client = Client(**client_kwargs)
        client.direct_view().use_dill()  # use dill
        view = client.load_balanced_view()
        view.set_flags(**view_flags)
        call = view.map(self._featurize,
                        np.array_split(mols, len(client.direct_view())),
                        block=False)
        features = call.get()  # get output from engines
        features = np.concatenate(features)
        call.display_outputs()
    else:
        features = self._featurize(mols)
    return np.asarray(features)

class IPythonParallelMap(object):
    """
    Class to handle the creation and management of cluster resources,
    typically on IRP, through IPython's parallel implementation.
    """

    def __init__(self, nodes, irp=True, debug=False):
        """ If SSH, open a connection to IRP and start the IPCluster daemon """
        self.nodes = nodes
        self.irp = irp
        if self.irp:
            self.child = pexpect.spawn('ssh [email protected]')
            if debug:
                self.child.logfile = sys.stdout
            time.sleep(0.2)
            self.child.sendline('cd /home/psj/Documents/IPClusterLogs')
            self.child.sendline('ipcluster start --profile=pbs -n ' +
                                str(nodes) + ' --daemonize')
        else:
            self.child = pexpect.spawn('ipcluster start -n ' + str(nodes) +
                                       ' --daemonize')

    def close(self):
        """ Close the IPCluster, delete jobs, and log out of SSH """
        if self.irp:
            self.child.sendline('ipcluster stop --profile=pbs')
        else:
            self.child.sendline('ipcluster stop')
        time.sleep(0.5)
        if self.irp:
            self.child.sendline('qdel all')
            time.sleep(0.1)
            self.child.sendline('logout')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.close()

    def connect_client(self):
        """ Connect the current client to the running engines """
        from IPython.parallel import Client
        if self.irp:
            self.client = Client(profile='pbs')
        else:
            self.client = Client()
        assert len(self.client.ids) == self.nodes
        self.lview = self.client.load_balanced_view()
        self.dview = self.client.direct_view()

    def __call__(self, *args, **kwargs):
        """ Map function call to parallel view """
        results = self.lview.map(*args, balanced=True, **kwargs)
        return results.get()

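# --- Usage sketch (added for illustration; not part of the original source) ---
# The class supports the context-manager protocol, so the cluster is stopped
# even if the mapped function raises. The node count, the wait time, and the
# squaring function are illustrative.
import time

def square(x):
    return x ** 2

with IPythonParallelMap(nodes=4, irp=False) as pmap:
    time.sleep(30)                       # give the local ipcluster time to start engines
    pmap.connect_client()                # asserts that all 4 engines have registered
    squares = pmap(square, range(16))    # dispatched via the load-balanced view
print(squares)
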
def get_client_and_view(verbose=True):
    c = None
    v = None
    while c is None:
        try:
            c = Client(profile_dir=profile_minspan_dir)
        except:
            sleep(5)
    while len(c.ids) == 0:
        if verbose:
            print "waiting for connections"
        sleep(5)
    while v is None:
        try:
            v = c.direct_view()
        except Exception, e:
            print e
            sleep(5)

def remove_duplicates_from_file(dbname, do_odo=False):
    logging.info('Removing duplicates.')
    image_names = get_image_names(dbname)

    def process_image_name(image_name):
        import pandas as pd
        data = pd.read_hdf(dbname, 'df', where='image_name==' + image_name)
        data = remove_duplicates_from_image_name_data(data)
        data.to_hdf(get_temp_fname(image_name), 'df')

    # parallel approach: you need to launch an ipcluster/controller for this to work!
    c = Client()
    dview = c.direct_view()
    dview.push({'remove_duplicates_from_image_name_data':
                remove_duplicates_from_image_name_data,
                'data_root': data_root,
                'get_temp_fname': get_temp_fname})
    lbview = c.load_balanced_view()
    logging.info('Starting parallel processing.')
    lbview.map_sync(process_image_name, image_names)
    logging.info('Done cleaning up. Now concatenating results.')
    merge_temp_files(dbname, image_names, do_odo)

#####################
### This Analysis ###
#####################

hconst = 50.  # assume a surface layer of const depth 50 m
anal_name = 'hires'

##############################
## Set Up Parallel Engines ###
##############################

# give engines time to load
time.sleep(20)
c = Client()
dview = c.direct_view()
lview = c.load_balanced_view()
with dview.sync_imports():
    import numpy
    from watermasstools import pop_model, transformation

#####################################
## Define Regions for Calculation ###
#####################################

# where to find the data
ddir = '/glade/scratch/enewsom/HR_ANALYSIS/DATA/HRfiles/'
fprefix = 'HRC06.br.pop.h1'

fnames = []
for year in xrange(147,168):

from __future__ import division
from IPython.parallel import Client
from IPython.parallel.util import interactive

# TODO may be able to use IPython.parallel.Reference as in
# http://minrk.github.com/scipy-tutorial-2011/basic_remote.html

# the ipcluster should be set up before this file is imported
c = Client()
dv = c.direct_view()
dv.execute('import pyhsmm')
lbv = c.load_balanced_view()

# these dicts need to be populated by hand before calling build_states*,
# both locally (in this module) and in the ipython top-level module on every
# engine
# the second dict only needs to be used when calling build_states_changepoints
alldata = {}
allchangepoints = {}

# these functions are run on the engines, and expect the alldata (and
# allchangepoints) global(s) as well as the current model hsmm_subhmms_model
# to be present in the ipython global frame

@lbv.parallel(block=True)
@interactive
def build_states(data_id):
    global global_model
    global alldata

    # adding the data to the pushed global model will build a substates object
    # and resample the states given the parameters in the model

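# --- Usage sketch (added for illustration; not part of the original source) ---
# Assuming an already-constructed pyhsmm model bound to the name global_model,
# the driver side might populate alldata, push the same dict plus the model to
# every engine, and then map the parallel-decorated builder over the data ids.
# The variable names (global_model, my_sequences) are illustrative.
import numpy as np

my_sequences = [np.random.randn(100, 2) for _ in range(4)]
for data_id, seq in enumerate(my_sequences):
    alldata[data_id] = seq

dv.push(dict(alldata=alldata, global_model=global_model), block=True)
built = build_states.map(list(alldata.keys()))  # one task per data id, run on the engines
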
from IPython.parallel import Client
from numpy import array, savez, percentile, nan
from arch.compat.python import range, lmap

# Time in seconds to sleep before checking if ready
SLEEP = 10
# Number of repetitions
EX_NUM = 500
# Number of simulations per exercise
EX_SIZE = 200000
# Approximately controls memory use, in MiB
MAX_MEMORY_SIZE = 100

rc = Client()
dview = rc.direct_view()
with dview.sync_imports():
    from numpy import ones, vstack, arange, cumsum, sum, dot, zeros
    from numpy.random import RandomState, seed, random_integers
    from numpy.linalg import pinv


def clear_cache(client, view):
    """Cache-clearing function from mailing list"""
    assert not rc.outstanding, "don't clear history when tasks are outstanding"
    client.purge_results('all')  # clears controller
    client.results.clear()
    client.metadata.clear()
    view.results.clear()
    client.history = []
    view.history = []

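# --- Usage sketch (added for illustration; not part of the original source) ---
# Periodically draining the result caches keeps controller memory bounded on
# long simulation runs; the batching and the dummy statistic are illustrative.
lview = rc.load_balanced_view()

def one_rep(i, n=EX_SIZE):
    rs = RandomState(i)
    return rs.standard_normal(n).max()

for start in range(0, EX_NUM, 50):
    chunk = range(start, min(start + 50, EX_NUM))
    results = lview.map_sync(one_rep, chunk)
    savez('chunk_%d.npz' % start, results=array(results))
    clear_cache(rc, lview)   # drop cached results before the next batch
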
class ParallelCache(object):
    def __init__(self, cluster_profile=None, cachedir=None, purge=False,
                 idle_timeout=None, shutdown=False, retries=3,
                 poll_interval=10, verbose=5, **kwargs):
        self._purge = purge
        self._idle_timeout = idle_timeout
        self._shutdown = shutdown
        self._retries = retries
        self._poll_interval = poll_interval
        self._verbose = verbose
        self._execution_times = None
        if cluster_profile is not None:
            self._ip_client = Client(profile=cluster_profile, **kwargs)
        else:
            self._ip_client = None
        if cachedir is not None:
            self._memory = Memory(cachedir=cachedir, verbose=verbose)
        else:
            self._memory = None

    def map(self, f, *sequences, **kwargs):
        # make sure all sequences have the same length
        n_jobs = None
        my_seqs = []
        for ii, seq in enumerate(sequences):
            try:
                this_n_elems = len(seq)
                if n_jobs is None:
                    n_jobs = this_n_elems
                if this_n_elems != n_jobs:
                    raise ValueError('All sequences must have the same length, '
                                     'sequence at position %d has length %d'
                                     % (ii + 1, this_n_elems))
                my_seqs.append(seq)
            except TypeError:
                # we allow passing ints etc, convert them to a sequence
                my_seqs.append(repeat(seq))

        t_start = time.time()
        if self._ip_client is None:
            if self._verbose >= 1:
                tmp = 'without' if self._memory is None else 'with'
                print_('Running %d jobs locally %s caching..' % (n_jobs, tmp))
            out = list()
            my_fun = f if self._memory is None else self._memory.cache(f)
            for this_args in zip(*my_seqs):
                out.append(my_fun(*this_args, **kwargs))
        elif self._ip_client is not None and self._memory is None:
            if self._verbose >= 1:
                print('Running %d jobs on cluster without caching..' % n_jobs)
            out = [None] * n_jobs
            lbview = self._ip_client.load_balanced_view()
            tasks = list()
            for this_args in zip(*my_seqs):
                tasks.append(lbview.apply(f, *this_args, **kwargs))
            # wait for tasks to complete
            result_retrieved = [False] * len(tasks)
            execution_times = [None] * len(tasks)
            retry_no = np.zeros(len(tasks), dtype=np.int)
            last_print = 0
            last_idle_check = time.time()
            idle_times = {}
            while True:
                for ii, task in enumerate(tasks):
                    if not result_retrieved[ii] and task.ready():
                        if task.successful():
                            out[ii] = task.get()
                            execution_times[ii] = task.serial_time
                            result_retrieved[ii] = True
                        else:
                            # task failed for some reason, re-run it
                            if retry_no[ii] < self._retries:
                                if self._verbose > 3:
                                    print ('\nTask %d failed, re-running (%d / %d)'
                                           % (ii, retry_no[ii] + 1, self._retries))
                                this_args = zip(*my_seqs)[ii]
                                new_task = lbview.apply(f, *this_args, **kwargs)
                                tasks[ii] = new_task
                                retry_no[ii] += 1
                            else:
                                msg = ('\nTask %d failed %d times. Stopping'
                                       % (ii, self._retries + 1))
                                print msg
                                # this will throw an exception
                                task.get()
                                raise RuntimeError(msg)
                        if self._purge:
                            _purge_results(self._ip_client, task)
                n_completed = int(np.sum(result_retrieved))
                progress = n_completed / float(n_jobs)
                # print progress in 10% increments
                this_print = int(np.floor(progress * 10))
                if self._verbose >= 1 and this_print != last_print:
                    print_(' %d%%' % (100 * progress), end='')
                    last_print = this_print
                if n_completed == n_jobs:
                    # we are done!
                    print_('')  # newline
                    break
                if self._idle_timeout is not None and time.time() > last_idle_check + 30:
                    now = time.time()
                    queue = self._ip_client.queue_status()
                    shutdown_eids = []
                    for eid in self._ip_client.ids:
                        if eid not in queue:
                            continue
                        if queue[eid]['queue'] + queue[eid]['tasks'] == 0:
                            # engine is idle
                            idle_time = idle_times.get(eid, None)
                            if idle_time is None:
                                # mark engine as idle
                                idle_times[eid] = now
                                continue
                            if now - idle_time > self._idle_timeout:
                                # shut down engine
                                shutdown_eids.append(eid)
                        elif eid in idle_times:
                            # engine has started running again
                            del idle_times[eid]
                    if len(shutdown_eids) > 0:
                        if self._verbose > 0:
                            print 'Shutting down engines: ', shutdown_eids
                        dv = self._ip_client.direct_view(shutdown_eids)
                        dv.shutdown()
                        for eid in shutdown_eids:
                            del idle_times[eid]
                    last_idle_check = now
                time.sleep(self._poll_interval)
            self._execution_times = execution_times
            if self._shutdown:
                self._shutdown_cluster()
        elif self._ip_client is not None and self._memory is not None:
            # now this is the interesting case..
            if self._verbose >= 1:
                print('Running %d jobs on cluster with caching..' % n_jobs)
            f_cache = self._memory.cache(f)
            lbview = None
            out = [None] * n_jobs
            execution_times = [None] * n_jobs
            task_info = list()
            n_cache = 0
            for ii, this_args in enumerate(zip(*my_seqs)):
                # get the cache directory
                out_dir, _ = f_cache.get_output_dir(*this_args, **kwargs)
                if op.exists(op.join(out_dir, 'output.pkl')):
                    out[ii] = f_cache.load_output(out_dir)
                    n_cache += 1
                    continue
                if lbview is None:
                    lbview = self._ip_client.load_balanced_view()
                task = lbview.apply(f, *this_args, **kwargs)
                task_info.append(dict(task=task, idx=ii, args=this_args))
            if self._verbose >= 1:
                print_('Loaded %d results from cache' % n_cache)
            # wait for tasks to complete
            last_print = 0
            last_idle_check = time.time()
            idle_times = {}
            result_retrieved = [False] * len(task_info)
            retry_no = np.zeros(len(task_info), dtype=np.int)
            failed_tasks = []
            while len(task_info) > 0:
                for ii, ti in enumerate(task_info):
                    if not result_retrieved[ii] and ti['task'].ready():
                        task = ti['task']
                        if task.successful():
                            this_out = task.get()
                            # cache the input and output
                            out_dir, _ = f_cache.get_output_dir(*ti['args'],
                                                                **kwargs)
                            f_cache._persist_output(this_out, out_dir)
                            f_cache._persist_input(out_dir, *ti['args'], **kwargs)
                            # insert result into output
                            out[ti['idx']] = this_out
                            execution_times[ti['idx']] = task.serial_time
                            result_retrieved[ii] = True
                        else:
                            if retry_no[ii] < self._retries:
                                if self._verbose > 3:
                                    print ('\nTask %d failed, re-running (%d / %d)'
                                           % (ii, retry_no[ii] + 1, self._retries))
                                new_task = lbview.apply(f, *ti['args'], **kwargs)
                                ti['task'] = new_task
                                retry_no[ii] += 1
                            else:
                                # task failed too many times, mark it as done
                                # but keep running
                                if self._verbose >= 1:
                                    print ('\nTask %d failed %d times.'
                                           % (ii, self._retries + 1))
                                failed_tasks.append(task)
                                result_retrieved[ii] = True
                        if self._purge:
                            _purge_results(self._ip_client, task)
                if self._idle_timeout is not None and time.time() > last_idle_check + 30:
                    now = time.time()
                    queue = self._ip_client.queue_status()
                    shutdown_eids = []
                    for eid in self._ip_client.ids:
                        if eid not in queue:
                            continue
                        if queue[eid]['queue'] + queue[eid]['tasks'] == 0:
                            # engine is idle
                            idle_time = idle_times.get(eid, None)
                            if idle_time is None:
                                # mark engine as idle
                                idle_times[eid] = now
                                continue
                            if now - idle_time > self._idle_timeout:
                                # shut down engine
                                shutdown_eids.append(eid)
                        elif eid in idle_times:
                            # engine has started running again
                            del idle_times[eid]
                    if len(shutdown_eids) > 0:
                        if self._verbose > 0:
                            print 'Shutting down engines: ', shutdown_eids
                        dv = self._ip_client.direct_view(shutdown_eids)
                        dv.shutdown()
                        for eid in shutdown_eids:
                            del idle_times[eid]
                    last_idle_check = now
                n_completed = int(np.sum(result_retrieved))
                progress = n_completed / float(n_jobs - n_cache)
                # print progress in 10% increments
                this_print = int(np.floor(progress * 10))
                if self._verbose >= 1 and this_print != last_print:
                    print_(' %d%% ' % (100 * progress), end='')
                    last_print = this_print
                if n_completed == n_jobs - n_cache:
                    # we are done!
                    print_('')  # newline
                    break
                time.sleep(self._poll_interval)
            if self._shutdown:
                self._shutdown_cluster()
            if len(failed_tasks) > 0:
                msg = ''
                for task in failed_tasks[:5]:
                    try:
                        task.get()
                    except Exception as e:
                        msg += str(e)
                raise RuntimeError('%d tasks failed:\n %s'
                                   % (len(failed_tasks), msg))
            self._execution_times = execution_times
        else:
            raise RuntimeError('Unsupported combination of cluster and cache '
                               'settings')
        if self._verbose >= 1:
            print_('Done (%0.1f seconds)' % (time.time() - t_start))
        return out

    def get_last_execution_times(self):
        return self._execution_times

    def purge_results(self, f, *sequences, **kwargs):
        # make sure all sequences have the same length
        n_jobs = None
        my_seqs = []
        for ii, seq in enumerate(sequences):
            try:
                this_n_elems = len(seq)
                if n_jobs is None:
                    n_jobs = this_n_elems
                if this_n_elems != n_jobs:
                    raise ValueError('All sequences must have the same length, '
                                     'sequence at position %d has length %d'
                                     % (ii + 1, this_n_elems))
                my_seqs.append(seq)
            except TypeError:
                # we allow passing ints etc, convert them to a sequence
                my_seqs.append(repeat(seq))
        f_cache = self._memory.cache(f)
        n_deleted = 0
        for this_args in zip(*my_seqs):
            out_dir, _ = f_cache.get_output_dir(*this_args, **kwargs)
            if op.exists(out_dir):
                shutil.rmtree(out_dir)
                n_deleted += 1
        print 'Purging cache: %d out of %d deleted' % (n_deleted, n_jobs)

    def _shutdown_cluster(self):
        # shut down all idle engines
        queue = self._ip_client.queue_status()
        shutdown_eids = []
        for eid in self._ip_client.ids:
            if eid not in queue:
                continue
            if queue[eid]['queue'] + queue[eid]['tasks'] == 0:
                shutdown_eids.append(eid)
        if len(shutdown_eids) > 0:
            if self._verbose > 0:
                print 'Shutting down engines: ', shutdown_eids
            dv = self._ip_client.direct_view(shutdown_eids)
            dv.shutdown()

""" Start with `ipcluster` """ #------------------------------------------------------------ # Initialize the connection to the client #------------------------------------------------------------ # Also look at StarCluster to use EC2 instances #------------------------------------------------------------ from IPython.parallel import Client client = Client() queue = client.direct_view() print "available workers: ", len(queue) #------------------------------------------------------------ # Do some work in parallel #------------------------------------------------------------ squared = queue.map_sync(lambda x: x**2, [1,2,3,4]) print squared
def wmt_rho(aname, ddir, fprefix, years, pref=0, hconst=50.,
            task='calc_transformation_rates', monthly_mean=False, fsuffix=''):
    """Perform water mass analysis on a specific POP model run.

    aname - (string) the nickname of this specific analysis, used when saving data
    ddir - the directory where the run lives
    fprefix - the string that begins the file names
              (e.g. hybrid_v5_rel04_BC5_ne120_t12_pop62.pop.h.nday1)
    years - a list of years to analyze
    fsuffix - a trailing suffix (before .nc)
    pref - reference pressure for analysis
    hconst - depth of assumed surface layer
    monthly_mean - whether to calculate the transformation rate on the monthly
                   mean instead of the daily snapshots
    """

    ##############################
    ## Set Up Parallel Engines ###
    ##############################

    # give engines time to load
    time.sleep(20)
    c = Client()
    dview = c.direct_view()
    lview = c.load_balanced_view()
    with dview.sync_imports():
        import numpy
        from watermasstools import pop_model, transformation

    #####################################
    ## Define Regions for Calculation ###
    #####################################

    fnames = []
    for year in years:
        for month in xrange(1, 13):
            fname = '%s/%s.%04d-%02d%s.nc' % (ddir, fprefix, year, month, fsuffix)
            fnames.append(fname)
    for f in fnames:
        print f

    # load a test file
    p = pop_model.POPFile(fnames[0], pref=pref)

    # define basins
    if pref == 0:
        natl = transformation.WaterMassRegion(
            basin_names=['Atlantic Ocean', 'GIN Seas', 'Labrador Sea'],
            latmin=15)
        natl.initialize_mask(p)
        natl.calculate_rholevs(rhomin=1022, rhomax=1028.5, nlevs=120,
                               linear=True)
        npac = transformation.WaterMassRegion(basin_names=['Pacific Ocean'],
                                              latmin=15)
        npac.initialize_mask(p)
        npac.calculate_rholevs(rhomin=1020, rhomax=1027, nlevs=120,
                               linear=True)
        so = transformation.WaterMassRegion(basin_names=['Southern Ocean'],
                                            latmax=30)
        so.initialize_mask(p)
        so.calculate_rholevs(rhomin=1022, rhomax=1030, nlevs=120, linear=True)
        globe = transformation.WaterMassRegion()
        globe.initialize_mask(p)
        globe.calculate_rholevs(rhomin=1018, rhomax=1030, nlevs=120,
                                linear=True)
    elif pref == 2000:
        natl = transformation.WaterMassRegion(
            basin_names=['Atlantic Ocean', 'GIN Seas', 'Labrador Sea'],
            latmin=15)
        natl.initialize_mask(p)
        natl.calculate_rholevs(rhomin=1028, rhomax=1037.8, nlevs=120,
                               linear=True)
        npac = transformation.WaterMassRegion(basin_names=['Pacific Ocean'],
                                              latmin=15)
        npac.initialize_mask(p)
        npac.calculate_rholevs(rhomin=1027, rhomax=1036.5, nlevs=120,
                               linear=True)
        so = transformation.WaterMassRegion(basin_names=['Southern Ocean'],
                                            latmax=30)
        so.initialize_mask(p)
        so.calculate_rholevs(rhomin=1030, rhomax=1037.8, nlevs=120,
                             linear=True)
        globe = transformation.WaterMassRegion()
        globe.initialize_mask(p)
        globe.calculate_rholevs(rhomin=1026, rhomax=1037.8, nlevs=120,
                                linear=True)
    else:
        raise ValueError('Invalid pressure level %g specified' % pref)

    region_dict = {'natl': natl, 'npac': npac, 'so': so, 'globe': globe}

    # push to engines
    dview.push(dict(hconst=hconst, pref=pref, monthly_mean=monthly_mean))
    dview.push(region_dict)
    dview.execute(
        "region_dict = {'natl': natl, 'npac': npac, 'so': so, 'globe': globe}")

    # check
    dview.execute('a = region_dict.keys()[0]')
    a = dview.gather('a')
    for r in a.get():
        assert r == 'npac'

    #######################
    ## Apply on Engines ###
    #######################

    if task == 'calc_transformation_rates':
        mapfunc = calc_transformation_rates
        prefix = 'WMT'
    elif task == 'calc_Fd':
        mapfunc = calc_Fd
        prefix = 'FD'

    res = lview.map(mapfunc, fnames)
    while not res.ready():
        print 'progress %3.2f%%' % (100 * res.progress / float(len(res)))
        time.sleep(60)
    assert res.successful()

    ###################
    ## Save Results ###
    ###################

    all_res = dict()
    for k in region_dict:
        all_res[k] = []
    for r in res:
        for k in all_res:
            all_res[k].append(r[k])
    for k in all_res:
        all_res[k] = numpy.array(all_res[k])
        numpy.savez('../data/%s_%s_sigma%1d_hconst%03d_%s.npz' %
                    (prefix, aname, pref / 1000, hconst, k),
                    A=all_res[k], rholevs=region_dict[k].rholevs)

# (because we do not "install" the llc module)
sys.path.append('..')
from llc import llc_model

base_dir = os.path.join(os.environ['LLC'], 'llc_1080')
LLC = llc_model.LLCModel1080(
#base_dir = os.path.join(os.environ['LLC'], 'llc_4320')
#LLC = llc_worker.LLCModel4320(
        data_dir=os.path.join(base_dir, 'run_day732_896'),
        grid_dir=os.path.join(base_dir, 'grid'))

# set this to True to use parallel execution
if False:
    # connect to ipcluster server
    c = Client(profile='default')
    dview = c.direct_view()
    lbv = c.load_balanced_view()
    mapfunc = lbv.map_async
else:
    # just use serial execution
    mapfunc = map

# this is where the work gets done
def work_on_tile(tile):
    # need to reimport the modules
    # there must be a cleaner way to do this, but I don't know how
    try:
        from llc import llc_model
    except ImportError:
        sys.path.append('..')
        from llc import llc_model

from __future__ import division
from IPython.parallel import Client
from IPython.parallel.util import interactive

# NOTE: the ipcluster should be set up before this file is imported
c = Client()
dv = c.direct_view()
dv.execute('import pyhsmm')
lbv = c.load_balanced_view()

# this dict needs to be populated by hand before calling build_states*, both
# locally (in this module) and in the ipython top-level module on every engine
# NOTE: the data should probably be arrays with dtype=np.float64
alldata = {}

# this function is run on the engines, and expects the alldata global as well
# as the current model global_model to be present in the ipython global frame

@lbv.parallel(block=True)
@interactive
def build_states(data_id):
    global global_model
    global alldata

    # adding the data to the pushed global model will build a substates object
    # and resample the states given the parameters in the model
    global_model.add_data(alldata[data_id], initialize_from_prior=False)
    stateseq = global_model.states_list[-1].stateseq
    global_model.states_list = []

raise ValueError("Cannot gather results from {0}".format(outfile)) results = NumpyCache(outfile) brd = np.broadcast(pointing_indices, ndays, gmags, template_indices) results = np.array([results.get_row(key) for key in brd]) return results.reshape(brd.shape + results.shape[-1:]) if __name__ == '__main__': parallel = True if parallel: # Need some imports on the engine from IPython.parallel import Client client = Client() dview = client.direct_view() with dview.sync_imports(): from gatspy.periodic import (LombScargleMultiband, LombScargleMultibandFast, SuperSmootherMultiband) else: client = None template_indices = np.arange(2 * 23).reshape(2, 23).T pointing_indices = np.arange(1, 24)[:, None] ndays = np.array([90, 180, 365, 2 * 365, 5 * 365])[:, None, None] gmags = np.array([20, 21, 22, 23, 24.5])[:, None, None, None] kwargs = dict(pointing_indices=pointing_indices, ndays=ndays, gmags=gmags,
DIR = os.getcwd()
OUTPUT_DIR = DIR + '/' + 'output/'
TMP_DIR = '/da/dmp/cb/clayie1/tmp/seed_graph/'
if not os.path.exists(OUTPUT_DIR):
    print('Making ' + OUTPUT_DIR)
    os.makedirs(OUTPUT_DIR)
print('Output in ' + OUTPUT_DIR)

# PROXY = os.environ.get('http_proxy', 'http://eu-chbs-PROXY.eu.novartis.net:2011/')
PROXY = os.environ.get('http_proxy', 'http://nibr-proxy.global.nibr.novartis.net:2011/')

# <codecell>

# parallel support
from IPython.parallel import Client, error
cluster = Client()                       # default profile
dview = cluster.direct_view()            # direct access/control (including push/pull)
lbview = cluster.load_balanced_view()    # load balanced view for running jobs
cluster.ids
# nb: MPI clusters must be initiated - see:
# http://nbviewer.ipython.org/github/ipython/ipython/blob/master/examples/Parallel%20Computing/Using%20MPI%20with%20IPython%20Parallel.ipynb

# test
dview.execute('import os')
%px print("testing... pid: " + str(os.getpid()))

# <markdowncell>

# ## Collect UTR sequences from Biomart

# <markdowncell>

# ### Set up query
