def __init__(self, df, params, distributed=True):
    '''Run the kSZ estimator with resampling on df and store the results.

    df: dataframe with the objects for the calculation. Only len(df) is
        read here; the dataframe itself is handed to runJK.
    params: parameter object. NAME, CAT_QUERY, DO_VARIANCE_WEIGHTED,
        JK_NGROUPS and JK_RESAMPLING_METHOD are read here.
    distributed: forwarded to runJK, which picks the cluster or the
        local implementation.

    After the run, self.cov and self.corr hold the matrices returned by
    JK_tools.getCovMatrix and JK_tools.getCorrMatrix for the resampled
    realizations.
    '''
    self.params = params
    self.name = params.NAME
    self.query = params.CAT_QUERY
    self.do_variance_weighted = params.DO_VARIANCE_WEIGHTED
    self.N_objects_in_this_run = len(df)
    self.JK_Ngroups = params.JK_NGROUPS

    self.runJK(df, self.params, distributed)

    # Compare in lower case, as runJK does when it picks the resampling
    # method, so that e.g. 'Tiled_JK' is recognised here as well.
    # For tiled methods the number of groups is taken from the number of
    # realizations actually produced instead of from params.JK_NGROUPS.
    if 'tiled' in self.params.JK_RESAMPLING_METHOD.lower():
        self.JK_Ngroups = self.kSZ_curveJK_realizations.shape[0]

    self.cov = JK_tools.getCovMatrix(self.bin_names,
                                     self.kSZ_curveJK_realizations,
                                     params)
    self.corr = JK_tools.getCorrMatrix(self.bin_names,
                                       self.kSZ_curveJK_realizations)
def runJK(self, df, params, distributed):
    '''Run the estimator with resampling and keep the outputs on self.

    df: dataframe handed to the selected runner.
    params: parameter object; JK_RESAMPLING_METHOD and BIN_EDGES are read
        here and the object is forwarded to the runner.
    distributed: only the value True selects a cluster run; anything
        else runs singleMachine_JK_kSZ.run_JK_local.

    Sets rsep, bin_edges, bin_names, kSZ_curveFullDataset,
    kSZ_curveJK_realizations, errorbars and runtime on self.
    '''
    start = time.time()
    if distributed is not True:
        outcome = singleMachine_JK_kSZ.run_JK_local(df, params,
                                                    randomize=True)
    else:
        method = params.JK_RESAMPLING_METHOD.lower()
        massboosted_methods = ('bs_dt_mass_boosted_est',
                               'bs_dt_mass_boosted_est_debiased')
        if method in massboosted_methods:
            outcome = run_JK_distributed_massboosted(df, params)  # noqa
        else:
            outcome = distributed_JK_kSZ.run_JK_distributed(
                df, params, randomize=True)
    stop = time.time()

    full_results, realizations = outcome
    # Element [1] of every realization is kept, one row per realization.
    curves = np.array([realization[1] for realization in realizations])

    self.rsep = full_results[0]
    self.bin_edges = params.BIN_EDGES
    self.bin_names = JK_tools.getBinNamesFromBinEdges(params.BIN_EDGES)
    self.kSZ_curveFullDataset = full_results[1]
    self.kSZ_curveJK_realizations = curves
    self.errorbars = getErrorbars(curves, params)
    # Only the runner call is timed, not the bookkeeping above.
    self.runtime = stop - start
def test_getCorrMatrix():
    '''JK_tools.getCorrMatrix must agree with numpy.corrcoef.'''
    labels = ['0 - 5', '5 - 10', '10 - 15', '15 - 20']
    samples = np.random.random(size=[50, 4])

    measured = JK_tools.getCorrMatrix(labels, samples).values
    # corrcoef expects variables in rows, hence the transpose.
    reference = np.corrcoef(samples.T)

    residual = measured - reference
    assert (residual**2).flatten().sum() < 1e-10
def test_indicesToDrop():
    '''indicesToDrop must split 20000 rows into 4 groups of 5000.'''
    n_rows = 20000
    n_groups = 4
    frame = pd.DataFrame({'a': np.random.random(n_rows)})

    groups = JK_tools.indicesToDrop(frame, n_groups)

    assert len(groups) == n_groups
    assert len(groups[0]) == n_rows // n_groups
def run_JK_local(df, params, randomize=True, multithreading=False):
    '''Run the pairwise kSZ estimator and its jackknifes on this machine.

    Everything runs locally, make sure you have requested the resources
    you are using.

    df: dataframe object with the variables for the calculation.
    params: parameter object for this calculation; JK_NGROUPS sets how
        many jackknife subgroups are made.
    randomize: forwarded to JK_tools.indicesToDrop.
    multithreading: forwarded to pairwiser.get_pairwise_ksz.

    Returns (fullDataset_results, jk_results): the estimator output for
    the whole dataframe and a list with the estimator output of each of
    the JK_NGROUPS jackknife realizations.
    '''
    print("Running a JK run on the local machine, this will take a while.")
    Ngroups = params.JK_NGROUPS
    fullDataset_results = pairwiser.get_pairwise_ksz(
        df, params, multithreading=multithreading)

    indices_toDrop = JK_tools.indicesToDrop(df, Ngroups,
                                            randomize=randomize)
    jk_results = []
    for j in range(Ngroups):
        # Call form of print: valid on Python 3 and, with a single
        # argument, prints the same text on Python 2.
        print("%i/%i" % (j, Ngroups))
        # Each realization leaves out one group of rows.
        data_JK = df.drop(indices_toDrop[j], inplace=False)
        jk_results.append(pairwiser.get_pairwise_ksz(
            data_JK, params, multithreading=multithreading))
    return fullDataset_results, jk_results
def run_resample(self, df1, df2, params):
    '''Run the distributed cross error estimation and store its outputs.

    df1, df2: the two dataframes handed to
        cross_distributed.run_error_estimation_distributed.
    params: parameter object; JK_NGROUPS and BIN_EDGES are read here.

    For each of the combinations 11, 12 and 22 this sets
    kSZ_curveFullDataset<xx>, kSZ_curveJK_realizations<xx> and
    errorbars<xx> on self, plus rsep, bin_edges, bin_names and runtime.
    '''
    start = time.time()
    res = cross_distributed.run_error_estimation_distributed(
        df1, df2, params)
    stop = time.time()

    full11 = res['full11']
    full12 = res['full12']
    full22 = res['full22']

    def realizations(key):
        # Element [1] of the first JK_NGROUPS entries stored under key.
        return np.array([res[key][j][1]
                         for j in range(params.JK_NGROUPS)])

    jk11 = realizations('resampled11')
    jk12 = realizations('resampled12')
    jk22 = realizations('resampled22')

    # The separations are read from the 11 result only.
    self.rsep = full11[0]
    self.bin_edges = params.BIN_EDGES
    self.bin_names = JK_tools.getBinNamesFromBinEdges(params.BIN_EDGES)

    self.kSZ_curveFullDataset11 = full11[1]
    self.kSZ_curveFullDataset12 = full12[1]
    self.kSZ_curveFullDataset22 = full22[1]

    self.kSZ_curveJK_realizations11 = jk11
    self.kSZ_curveJK_realizations12 = jk12
    self.kSZ_curveJK_realizations22 = jk22

    self.errorbars11 = JK_tools.getErrorbars(jk11, params)
    self.errorbars12 = JK_tools.getErrorbars(jk12, params)
    self.errorbars22 = JK_tools.getErrorbars(jk22, params)

    # Only the distributed call is timed.
    self.runtime = stop - start
def test_getCovMatrix():
    '''getCovMatrix must equal numpy's covariance scaled by (N-1)**2/N.'''
    labels = ['0 - 5', '5 - 10', '10 - 15', '15 - 20']
    samples = np.random.random(size=[50, 4])
    N = 50

    # NOTE(review): other callers in this file pass a third `params`
    # argument to getCovMatrix; confirm it is optional.
    measured = JK_tools.getCovMatrix(labels, samples).values

    # np.cov expects variables in rows, hence the transpose.
    reference = np.cov(samples.T) * (N - 1) / N * (N - 1)

    residual = reference - measured
    assert (residual**2).flatten().sum() < 1e-10
def __init__(self, ds, params):
    '''Run the cross resampling on ds.df1 / ds.df2 and store the results.

    ds: object exposing the two dataframes as df1 and df2.
    params: parameter object; NAME, CAT_QUERY, DO_VARIANCE_WEIGHTED and
        JK_NGROUPS are read here and the object is passed to
        run_resample and JK_tools.getCovMatrix.

    Sets cov11/cov12/cov22 and corr11/corr12/corr22 from the resampled
    realizations that run_resample leaves on self.
    '''
    first, second = ds.df1, ds.df2

    self.params = params
    self.name = params.NAME
    self.query = params.CAT_QUERY
    self.do_variance_weighted = params.DO_VARIANCE_WEIGHTED
    # The object count is taken from the first dataframe only.
    self.N_objects_in_this_run = len(first)
    self.JK_Ngroups = params.JK_NGROUPS

    self.run_resample(first, second, self.params)

    names = self.bin_names
    jk11 = self.kSZ_curveJK_realizations11
    jk12 = self.kSZ_curveJK_realizations12
    jk22 = self.kSZ_curveJK_realizations22

    self.cov11 = JK_tools.getCovMatrix(names, jk11, params)
    self.cov12 = JK_tools.getCovMatrix(names, jk12, params)
    self.cov22 = JK_tools.getCovMatrix(names, jk22, params)

    self.corr11 = JK_tools.getCorrMatrix(names, jk11)
    self.corr12 = JK_tools.getCorrMatrix(names, jk12)
    self.corr22 = JK_tools.getCorrMatrix(names, jk22)
def test_getErrorbars():
    '''getErrorbars must equal numpy's std times sqrt(Ngroups - 1).'''

    class FakeParams:
        # Minimal stand-in carrying only JK_NGROUPS.
        def __init__(self, ngroups):
            self.JK_NGROUPS = ngroups

    n_iterations = 50000
    pars = FakeParams(n_iterations)
    draws = np.random.normal(size=[n_iterations, 20])

    errorbars = JK_tools.getErrorbars(draws, pars)

    reference = np.std(draws, axis=0) * np.sqrt(n_iterations - 1)
    squared_diff = (reference - errorbars)**2
    assert squared_diff.sum() < 1e-10
def test_getBinNames():
    '''getBinNames must label each bin as "<lower> - <upper>" from 0.'''
    separations = np.array([5, 10, 15, 20])
    expected = ['0 - 5', '5 - 10', '10 - 15', '15 - 20']

    assert JK_tools.getBinNames(separations) == expected
def run_JK_distributed(df, param, randomize=True):
    '''Run the kSZ statistic and its resampled realizations on the cluster.

    Receives the pandas dataframe with the objects containing the
    temperature decrements and the parameter object. Everything runs in
    the cluster, so the current terminal does not need to request many
    cpus.

    df: dataframe object containing the variables for the calculation.
    param: parameter object for this calculation; JK_NGROUPS and
        JK_RESAMPLING_METHOD are read here and the object is scattered
        to the workers.
    randomize: shuffle data before running the JK (forwarded to
        JK_tools.indicesToDrop, used by the jackknife method only).

    The lower-cased resampling method is compared with the module-level
    names JK, BS, BS_PW, BS_DT and TL_JK. If it matches none of them no
    realization is submitted.

    Returns (fullDataset_results, jk_results): the estimator output for
    the full dataframe and the gathered outputs of the realizations.
    '''
    Ncores = envVars.Ncores
    NWorkers = envVars.NWorkers
    Ngroups = param.JK_NGROUPS
    resampling_method = param.JK_RESAMPLING_METHOD.lower()

    # setup cluster
    cluster = SGECluster(walltime='172800', processes=1, cores=1,
                         env_extra=['#$-pe sge_pe %i' % Ncores,
                                    '-l m_core=%i' % Ncores,
                                    'mkdir -p /tmp/pag227/dask/dask-scratch',
                                    'export NUMBA_NUM_THREADS=%i' % Ncores,
                                    'export OMP_NUM_THREADS=%i' % Ncores,
                                    ])
    cluster.scale(NWorkers)
    client = Client(cluster)
    # end setting up cluster

    # The try/finally closes the client even when scattering, submitting
    # or gathering raises, so a failed run does not leave it connected.
    try:
        # Fixed wait after requesting the workers; presumably gives them
        # time to start -- TODO confirm.
        time.sleep(30)

        # send full dataset to the cluster
        future_fullDataset = client.scatter(df)
        future_params = client.scatter(param)
        res_fullDataset = client.submit(pairwiser.get_pairwise_ksz,
                                        future_fullDataset,
                                        future_params,
                                        multithreading=True)

        if resampling_method == BS_PW:
            # submit the same dataset; the resampling is done by
            # bs_pw.get_bootstrap_pairwise. pure=False keeps the
            # identical submissions from being treated as one task.
            futureData = client.scatter(df, broadcast=True)
            jk_results = [client.submit(bs_pw.get_bootstrap_pairwise,
                                        futureData,
                                        future_params,
                                        multithreading=True,
                                        pure=False)
                          for j in range(Ngroups)]
        else:
            # Every other method builds one dataframe per realization.
            # Each one is scattered as soon as it is built, so this
            # process never holds all of them at once.
            futureData = []
            if resampling_method == JK:
                # jackknife: drop one group of rows per realization
                indices_toDrop = JK_tools.indicesToDrop(
                    df, Ngroups, randomize=randomize)
                for j in range(Ngroups):
                    dataJK = df.drop(indices_toDrop[j], inplace=False)
                    futureData.append(client.scatter(dataJK))
            elif resampling_method == BS:
                # bootstrap: resample the rows with replacement
                for j in range(Ngroups):
                    dataBS = df.sample(len(df), replace=True)
                    futureData.append(client.scatter(dataBS))
            elif resampling_method == BS_DT:
                # bootstrap of the dT column only: the other columns
                # keep their original rows
                for j in range(Ngroups):
                    df_bs = df.copy()
                    choose = np.random.choice(len(df), len(df))
                    df_bs['dT'] = df.dT.values[choose]
                    futureData.append(client.scatter(df_bs))
            elif resampling_method == TL_JK:
                # tiled jackknife: one realization per tile, so the
                # number of realizations comes from the data and not
                # from JK_NGROUPS
                tiled_JK.classify_grid(df)
                df = tiled_JK.remove_edge_galaxies(df, tol_sigma=1.5)
                Ntiles = tiled_JK.how_many_tiles(df)
                for j in range(Ntiles):
                    df_tosubmit = tiled_JK.remove_tile(df, j)
                    futureData.append(client.scatter(df_tosubmit))

            # Now do the calculation, one job per scattered dataset.
            jk_results = [client.submit(pairwiser.get_pairwise_ksz,
                                        data,
                                        future_params,
                                        multithreading=True)
                          for data in futureData]

        # extract results
        fullDataset_results = res_fullDataset.result()
        jk_results = client.gather(jk_results)
    finally:
        client.close()
    # NOTE(review): the cluster itself is not closed here; the original
    # had cluster.close() disabled. Confirm that this is intended.
    return fullDataset_results, jk_results