def random_draw():
    '''draws a set of regressors at random

    The model space is read from the active model context; the function
    takes no arguments.

    Returns
    -------
    draw : list
        sorted columns of the drawn model: the always-kept columns plus a
        random subset (without replacement) of the optional ones
    '''
    space = modelcontext()
    fixed = space.keep

    # first pick a model size, then fill it with optional columns
    size = np.random.choice(space.k)
    n_optional = size - len(fixed) + 2
    picked = np.random.choice(space.choices, size=n_optional, replace=False)

    return sorted(fixed + tuple(picked))
def pSampleAll(filename, tablename, groupname='', threads=2):
    '''samples from all models in parallel

    Every combination of optional columns, for every allowed model size, is
    fitted in a pool of worker processes; the results are then written to
    the table by the calling process.

    Parameters
    ----------
    filename : str
        filepath to save results
    tablename : str
        name of table to save
    groupname : str
        the group node for result storage
    threads : int
        number of processes to spawn

    Returns
    -------
    hfile : pytables file
        reference to on-disk storage
    '''
    model_space = modelcontext()
    allowed = model_space.k
    choices = model_space.choices
    keep = model_space.keep

    # all combinations of optional columns for each allowed model size
    cols = itertools.chain.from_iterable(
        itertools.combinations(choices, k - len(keep) + 2) for k in allowed)

    # Pooling Results
    pool = mp.Pool(threads)
    try:
        mapped = pool.map(_get_result, cols)
    except Exception:
        # do not leave worker processes behind when a fit fails
        pool.terminate()
        pool.join()
        raise
    pool.close()
    pool.join()

    # Saving Results
    num = len(keep) - 1 + len(choices)
    hfile = _create_table(filename, tablename, groupname, num)
    try:
        resultTable = getattr(hfile.get_node('/{}'.format(groupname)),
                              tablename)
        for c, fit in mapped:
            _append_result(resultTable, num, fit, sorted(keep + c))
        _process_results(resultTable)
        hfile.flush()
    except Exception:
        # release the file handle so the file can be reopened after a failure
        hfile.close()
        raise
    return hfile
def _get_result(cols):
    '''fits the model made of the kept columns plus `cols`

    Parameters
    ----------
    cols : tuple
        optional columns to add to the always-kept ones

    Returns
    -------
    rslt : tuple
        `(cols, fit)` pair, where `fit` is the value returned by the model
        space's `fit` for the sorted, combined columns
    '''
    space = modelcontext()
    columns = sorted(space.keep + cols)
    return cols, space.fit(columns)
def SampleAll(filename, tablename, groupname=''):
    '''samples from all of the models

    Every combination of optional columns, for every allowed model size, is
    fitted one after the other and appended to the result table.

    Parameters
    ----------
    filename : str
        filepath to save results
    tablename : str
        name of table to save
    groupname : str
        the group node for result storage

    Returns
    -------
    hfile : pytables file
        reference to on-disk storage
    '''
    model_space = modelcontext()
    allowed = model_space.k
    choices = model_space.choices
    keep = model_space.keep

    # all combinations of optional columns for each allowed model size
    cols = itertools.chain.from_iterable(
        itertools.combinations(choices, k - len(keep) + 2) for k in allowed)

    num = len(keep) - 1 + len(choices)
    hfile = _create_table(filename, tablename, groupname, num)
    try:
        resultTable = getattr(hfile.get_node('/{}'.format(groupname)),
                              tablename)
        for c in cols:
            fitcols = sorted(keep + c)
            fit = model_space.fit(fitcols)
            _append_result(resultTable, num, fit, fitcols)
        _process_results(resultTable)
        hfile.flush()
    except Exception:
        # release the file handle so the file can be reopened after a failure
        hfile.close()
        raise
    return hfile
def mcmc_draw(last_draw):
    '''moves to next model in markov chain sampler for model space

    The proposal is drawn uniformly from the neighbors of `last_draw`:
    models obtained by adding or dropping exactly one optional column and
    whose size passes the allowed-size filter. The draw uses `np.random`,
    so it follows the seed set through `np.random.seed`.

    Parameters
    ----------
    last_draw : list
        set of regressors from previous draw

    Returns
    -------
    draw : list
        sorted set of regressors of the proposed model

    Raises
    ------
    IndexError
        if no neighboring model has an allowed size
    '''
    model_space = modelcontext()
    allowed = model_space.k
    choices = model_space.choices
    keep = model_space.keep

    width = len(keep) + len(choices)

    # indicator column vector of the current model
    prev = np.zeros(width)
    prev[last_draw] = 1
    prev = prev.reshape((-1, 1))

    # column j of |I - prev| is the current model with regressor j toggled;
    # selecting `choices` means only optional columns can be toggled
    neighbors = abs(np.diag(np.ones(width)) - prev)[:, choices]

    # keep only the neighbors whose size is permitted
    # NOTE(review): random_draw draws k - len(keep) + 2 optional columns while
    # this filter tests a total size of i + 1; confirm both are intended.
    sizes = neighbors.sum(axis=0)
    valid = np.any([sizes == i + 1 for i in allowed], axis=0)
    neighbors = neighbors[:, valid]

    if neighbors.shape[1] == 0:
        raise IndexError('no neighboring model of an allowed size')

    draw = np.random.randint(neighbors.shape[1])
    proposal = sorted(np.arange(neighbors.shape[0])[neighbors[:, draw] == 1])
    return proposal
def _pmcmc_worker(args):
    '''runs a single chain inside a worker process

    The file returned by `MCMC` is closed here so that its data is on disk
    before the parent reopens it, and so that no file handle has to be sent
    back through the pool.

    Parameters
    ----------
    args : tuple
        positional arguments for `MCMC`
    '''
    MCMC(*args).close()


def pMCMC(visits, filename, tablename, groupname='', threads=2, **mcargs):
    '''parallel MCMC sampler for model space

    Parameters
    ----------
    visits : int
        number of total visits
    filename : str
        filepath to save results
    tablename : str
        name of table to save
    groupname : str
        the group node for result storage
    threads : int
        number of processes to spawn for sampling
    burn : int, keyword only (default 0)
        number of total visits to burn
    thin : int, keyword only (default 1)
        related to fraction of visits kept in chain
    kick : float, keyword only (default 0.)
        minimum value for transition probability
    seed : int, keyword only (default 1234)
        seed for random number; worker `i` uses `seed + i`

    Returns
    -------
    hfile : pytables file
        reference to on-disk storage holding the pooled chains

    Notes
    -----
    `visits` and `burn` are split as evenly as possible across the workers,
    each of which runs its own chain into a temporary file named
    `t<i>_<basename>` next to `filename`. The temporary tables are then
    appended into one table and the first file is renamed to `filename`.

    If `visits` is at least the size of the model space (`maxm`), every
    model is sampled with `pSampleAll` instead and the chain options are
    not used.

    An exception raised inside a worker is re-raised in the caller.
    '''
    burn = mcargs.get('burn', 0)
    thin = mcargs.get('thin', 1)
    kick = mcargs.get('kick', 0.)
    seed = mcargs.get('seed', 1234)

    model_space = modelcontext()
    if visits >= model_space.maxm:
        return pSampleAll(filename, tablename, groupname, threads)

    # prefix the basename only, so a filename with a directory still works
    directory, basename = os.path.split(filename)
    d_filename = [os.path.join(directory, 't{}_{}'.format(i, basename))
                  for i in xrange(threads)]

    # the first `x % threads` workers take one extra visit / burned visit
    argset = []
    for i in xrange(threads):
        n_visits = int(visits/threads) + (i < visits % threads)
        n_burn = int(burn/threads) + (i < burn % threads)
        argset.append((n_visits, d_filename[i], tablename, groupname,
                       n_burn, thin, kick, seed + i))

    pool = mp.Pool(threads)
    try:
        jobs = [pool.apply_async(_pmcmc_worker, (args,)) for args in argset]
        for job in jobs:
            # get() re-raises a worker failure; wait() would hide it
            job.get()
    except Exception:
        pool.terminate()
        pool.join()
        raise
    pool.close()
    pool.join()

    # Pooling Results
    hfile = tables.open_file(d_filename[0], 'a')
    try:
        resultTable = getattr(hfile.get_node('/{}'.format(groupname)),
                              tablename)
        for name in d_filename[1:]:
            h = tables.open_file(name, 'a')
            try:
                t = getattr(h.get_node('/{}'.format(groupname)), tablename)
                resultTable.append(t[:])
            finally:
                h.close()
            os.remove(name)
        _process_results(resultTable)
    finally:
        hfile.close()

    os.rename(d_filename[0], filename)
    return tables.open_file(filename, 'a')
def MCMC(visits, filename, tablename, groupname='', burn=0, thin=1, kick=0., seed=1234):
    '''markov chain monte carlo sampler for model space

    The chain starts from a random model. At every step proposals are drawn
    with `mcmc_draw` until one is accepted, and only accepted models are
    recorded, so the table holds `visits` rows before burning and thinning.

    Parameters
    ----------
    visits : int
        number of visits in chain
    filename : str
        filepath to save results
    tablename : str
        name of table to save
    groupname : str
        the group node for result storage
    burn : int
        number of visits to burn from beginning of chain
    thin : int
        related to fraction of visits kept in chain; every `thin`-th visit
        is kept
    kick : float
        minimum value for transition probability, between 0 and 1
    seed : int
        seed for random number

    Returns
    -------
    hfile : pytables file
        reference to on-disk storage

    Raises
    ------
    ValueError
        if `kick` is outside [0, 1], `burn` is not fewer than `visits`, or
        `thin` is less than 1

    Notes
    -----
    If `visits` is at least the size of the model space (`maxm`), every
    model is sampled with `SampleAll` instead; `burn`, `thin` and `seed`
    are then not used.
    '''
    # function-scope import keeps this block self-contained
    import random

    if not 0 <= kick <= 1:
        raise ValueError('kick must be between 0 and 1')

    model_space = modelcontext()
    choices = model_space.choices
    keep = model_space.keep
    maxm = model_space.maxm

    if visits >= maxm:
        return SampleAll(filename, tablename, groupname)

    # seed both generators so every random draw in the chain is reproducible
    np.random.seed(seed)
    random.seed(seed)

    if burn >= visits:
        raise ValueError('burn must be fewer than total visits')
    if thin < 1:
        raise ValueError('thin must be an integer 1 or greater')

    # Saving Results
    num = len(keep) - 1 + len(choices)
    hfile = _create_table(filename, tablename, groupname, num)
    try:
        resultTable = getattr(hfile.get_node('/{}'.format(groupname)),
                              tablename)

        # Obtaining first draw at random
        last_draw = random_draw()
        fit = model_space.fit(last_draw)
        _append_result(resultTable, num, fit, last_draw)
        last_prob = resultTable.cols.posterior[-1]

        for _ in xrange(1, visits):
            accepted = False
            while not accepted:
                proposal = mcmc_draw(last_draw)
                fit = model_space.fit(proposal)
                if last_prob == 0:
                    # always leave a model with zero posterior
                    prob = 1
                else:
                    prob = min(1, max(kick, fit[0]/last_prob))
                if np.random.choice([True, False], p=[prob, 1 - prob]):
                    last_draw = proposal
                    _append_result(resultTable, num, fit, last_draw)
                    last_prob = resultTable.cols.posterior[-1]
                    accepted = True

        # Burning and thinning out visits in the chain
        if (burn > 0) or (thin > 1):
            # the selection takes over the table name; the full chain is
            # renamed first and removed once the copy exists
            resultTable.rename('{}Full'.format(tablename))
            selection = resultTable.copy(newname=tablename, start=burn,
                                         stop=resultTable.shape[0], step=thin)
            resultTable.remove()
            resultTable = selection

        _process_results(resultTable)
        hfile.flush()
    except Exception:
        # release the file handle so the file can be reopened after a failure
        hfile.close()
        raise
    return hfile