def init_cluster(args):
    env_extra = [
        "#$ -e {}".format(args.log_dir or "/dev/null"),
        "#$ -o {}".format(args.log_dir or "/dev/null"),
        "#$ -pe serial {}".format(args.ngpus if args.ngpus > 0 else args.ncpus),
        "export LANG=en_US.UTF-8",
        "export LC_ALL=en_US.UTF-8",
        "export MKL_NUM_THREADS=1",
        "export NUMEXPR_NUM_THREADS=1",
        "export OMP_NUM_THREADS=1",
        "export DISABLE_MP_CACHE=1",
    ]
    cluster = SGECluster(
        queue=args.queue,
        resource_spec="h_vmem={}G,mem_req={}G".format(args.h_vmem, args.mem_req),
        walltime="720:00:00",
        name="test_Dask_PytorchDataloader",
        cores=args.ncpus,
        memory="{}G".format(args.mem_req),
        processes=1,
        interface="ib0",
        local_directory=".",
        env_extra=env_extra,
        spill_dir=".",
        extra=["--no-nanny"],
    )
    cluster.scale(args.jobs)
    return cluster
def setup_client_and_cluster(number_processes=1, number_jobs=1,
                             walltime="00:01:00", memory=1):
    """
    Set up Dask client and cluster.
    Ensure that the number of workers is the right amount
    for your job and will be fully utilised.
    """
    print("Setting up Dask client and cluster ...")
    # number of workers used for number of partitions
    number_workers = number_processes * number_jobs
    # these are the requirements for a single worker
    cluster = SGECluster(
        interface="ib0",
        walltime=walltime,
        memory=f"{memory} G",
        resource_spec=f"h_vmem={memory}G",
        scheduler_options={"dashboard_address": ":2727"},
        job_extra=[
            "-V",  # export all environment variables
            f"-pe smp {number_processes}",
            f"-l disk={memory}G",
        ],
        local_directory=os.sep.join([os.environ.get("PWD"), "dask-worker-space"]),
    )
    client = Client(cluster)
    cluster.scale(jobs=number_jobs)
    print("The resources of each worker are: ")
    print(cluster.job_script())
    return client, cluster
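# A minimal usage sketch for setup_client_and_cluster (the job sizes here are
# hypothetical, and dask.bag is assumed to be imported as db): build the
# client/cluster pair, run a bag computation, and close both when done.
def example_setup_client_and_cluster():
    client, cluster = setup_client_and_cluster(
        number_processes=1, number_jobs=4, walltime="00:30:00", memory=4
    )
    squares = db.from_sequence(range(100), npartitions=4).map(lambda x: x ** 2).compute()
    client.close()
    cluster.close()
    return squares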
def main():
    # dask cluster and client
    n_jobs = 20
    n_processes = 1
    n_workers = n_processes * n_jobs
    cluster = SGECluster(
        interface="ib0",
        walltime="02:00:00",
        memory="48 G",
        resource_spec="h_vmem=48G",
        scheduler_options={
            "dashboard_address": ":7777",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {n_processes}",
            "-l disk=48G",
        ],
        local_directory=os.sep.join([os.environ.get("PWD"), "dask-hia-space"]),
    )
    client = Client(cluster)
    cluster.scale(jobs=n_jobs)

    time_start = time.time()

    # dask bag and process
    simulations = [f'emulator_Base_CLE_2020_{output}']
    #simulations = []
    #simulations.append(f'wrfchem_Base_CLE_2020_{output}')
    #simulations.append(f'wrfchem_Base_CLE_2050_{output}')
    #simulations.append(f'wrfchem_Base_MFR_2050_{output}')
    #simulations.append(f'wrfchem_SDS_MFR_2050_{output}')
    #for year in ['2020', '2030', '2040', '2050']:
    #    for scenario in ['Base_CLE', 'Base_MFR', 'SDS_MFR']:
    #        for sim in ['', '_RES', '_IND', '_TRA', '_AGR', '_ENE', '_NO_RES', '_NO_IND', '_NO_TRA', '_NO_AGR', '_NO_ENE']:
    #            simulations.append(f'emulator_{scenario}_{year}{sim}_{output}')
    print(f"predicting for {len(simulations)} custom outputs ...")
    bag_simulations = db.from_sequence(simulations, npartitions=n_workers)
    if output == "PM2_5_DRY":
        bag_simulations.map(health_impact_assessment_pm25).compute()
    elif output == "o3_6mDM8h":
        bag_simulations.map(health_impact_assessment_o3).compute()

    time_end = time.time() - time_start
    print(
        f"completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours"
    )

    client.close()
    cluster.close()
def main():
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs
    cluster = SGECluster(
        interface='ib0',
        walltime='01:00:00',
        memory='64 G',
        resource_spec='h_vmem=64G',
        scheduler_options={
            'dashboard_address': ':5757',
        },
        job_extra=['-cwd', '-V', f'-pe smp {n_processes}'],
        local_directory=os.sep.join([os.environ.get('PWD'), 'dask-worker-space']),
    )
    client = Client(cluster)
    cluster.scale(jobs=n_jobs)

    time_start = time.time()

    # regrid custom outputs to pop grid
    custom_outputs = glob.glob(path + 'ds*' + output + '.nc')
    custom_outputs_completed = glob.glob(path + 'ds*' + output + '_popgrid_0.05deg.nc')
    custom_outputs_completed = [
        f'{item[0:-19]}.nc' for item in custom_outputs_completed
    ]
    custom_outputs_remaining_set = set(custom_outputs) - set(custom_outputs_completed)
    custom_outputs_remaining = [item for item in custom_outputs_remaining_set]
    print(f'custom outputs remaining for {output}: {len(custom_outputs_remaining)}')

    # dask bag and process
    custom_outputs_remaining = custom_outputs_remaining[0:2500]  # run in chunks of 2,500 over 30 cores, each chunk taking 5 minutes
    print(f'predicting for {len(custom_outputs_remaining)} custom outputs ...')
    bag_custom_outputs = db.from_sequence(custom_outputs_remaining, npartitions=n_workers)
    bag_custom_outputs.map(regrid_to_pop).compute()

    time_end = time.time() - time_start
    print(
        f'completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours'
    )
    print(
        f'average time per custom output is {time_end / len(custom_outputs_remaining):0.2f} seconds'
    )

    client.close()
    cluster.close()
def main():
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs
    cluster = SGECluster(
        interface="ib0",
        walltime="48:00:00",
        memory="12 G",
        resource_spec="h_vmem=12G",
        scheduler_options={
            "dashboard_address": ":5757",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {n_processes}",
            "-l disk=1G",
        ],
        local_directory=os.sep.join(
            [os.environ.get("PWD"), "dask-worker-space_popweighted_region"]
        ),
    )
    client = Client(cluster)
    cluster.scale(jobs=n_jobs)

    # main processing
    matrix_stacked = np.array(
        np.meshgrid(
            np.linspace(0, 1.4, 8),
            np.linspace(0, 1.4, 8),
            np.linspace(0, 1.4, 8),
            np.linspace(0, 1.4, 8),
            np.linspace(0, 1.4, 8),
        )
    ).T.reshape(-1, 5)
    custom_inputs = [np.array(item).reshape(1, -1) for item in matrix_stacked]

    print(f"processing for {output} over {region} ...")
    outputs_popweighted = []
    bag_custom_inputs = db.from_sequence(custom_inputs, npartitions=n_workers)
    outputs_popweighted = bag_custom_inputs.map(popweight_outputs_for_input).compute()

    print("saving ...")
    joblib.dump(
        outputs_popweighted,
        f"/nobackup/earlacoa/machinelearning/data_annual/popweighted/popweighted_{region}_{output}_0.25deg_adjusted_scaled.joblib",
    )

    client.close()
    cluster.close()
def main():
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs
    cluster = SGECluster(
        interface='ib0',
        walltime='48:00:00',
        memory='12 G',
        resource_spec='h_vmem=12G',
        scheduler_options={
            'dashboard_address': ':5757',
        },
        job_extra=[
            '-cwd',
            '-V',
            f'-pe smp {n_processes}',
            '-l disk=1G',
        ],
        local_directory=os.sep.join([os.environ.get('PWD'), 'dask-worker-space']),
    )
    client = Client(cluster)
    cluster.scale(jobs=n_jobs)

    # main processing
    matrix_stacked = np.array(
        np.meshgrid(
            np.linspace(0, 1.5, 16),
            np.linspace(0, 1.5, 16),
            np.linspace(0, 1.5, 16),
            np.linspace(0, 1.5, 16),
            np.linspace(0, 1.5, 16),
        )
    ).T.reshape(-1, 5)
    custom_inputs = [np.array(item).reshape(1, -1) for item in matrix_stacked]

    print(f'processing for {output} over {region} ...')
    outputs_popweighted = []
    bag_custom_inputs = db.from_sequence(custom_inputs, npartitions=n_workers)
    outputs_popweighted = bag_custom_inputs.map(popweight_outputs_for_input).compute()

    print('saving ...')
    joblib.dump(
        outputs_popweighted,
        '/nobackup/earlacoa/machinelearning/data/popweighted/popweighted_'
        + region + '_' + output + '.joblib'
    )

    client.close()
    cluster.close()
def get_cluster(which="ccin2p3", scale=None, set_client=True, **kwargs):
    """Build an SGECluster for the given site (only 'ccin2p3' is implemented),
    optionally scaling it to `scale` workers; extra kwargs override the defaults."""
    if which == "ccin2p3":
        from dask_jobqueue import SGECluster
        prop = dict(
            name="dask-worker",
            walltime="06:00:00",
            memory='8GB',
            death_timeout=120,
            project="P_ztf",
            resource_spec='sps=1',
            cores=1,
            processes=1,
        )
        cluster = SGECluster(**{**prop, **kwargs})
    else:
        raise NotImplementedError(f"only 'ccin2p3' cluster is implemented; '{which}' given")
    if scale is not None:
        cluster.scale(int(scale))
    return cluster
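# A minimal usage sketch for get_cluster, assuming access to the CC-IN2P3 SGE
# queue; the worker count and walltime override are hypothetical.
def example_get_cluster():
    cluster = get_cluster(which="ccin2p3", scale=10, walltime="02:00:00")
    client = Client(cluster)
    # ... submit work through `client` here ...
    client.close()
    cluster.close()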
def process_dask(
    funcs,
    jobs=10,
    cores=3,
    processes=3,
    h_vmem=20,
    m_mem_free=5,
    h_rt=3000,
):
    cluster = SGECluster(
        n_workers=0,
        job_cls=None,
        loop=None,
        security=None,
        silence_logs='error',
        name=None,
        asynchronous=False,
        interface=None,
        host=None,
        protocol='tcp://',
        dashboard_address=':8787',
        config_name=None,
        processes=processes,
        queue='low.q',
        project="labxchem",
        cores=cores,
        memory="{}GB".format(h_vmem),
        walltime=h_rt,
        resource_spec="m_mem_free={}G,h_vmem={}G,h_rt={}".format(m_mem_free, h_vmem, h_rt),
    )
    cluster.scale(jobs=jobs)
    client = Client(cluster)
    results_futures = client.map(call, funcs)
    results = client.gather(results_futures)
    return results
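# A minimal usage sketch for process_dask, assuming `call` simply invokes each
# element of `funcs`; the zero-argument worker functions here are hypothetical.
def example_process_dask():
    funcs = [lambda i=i: i * i for i in range(20)]
    return process_dask(funcs, jobs=2, cores=1, processes=1)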
def init_cluster(name, args):
    resource_spec = "h_vmem={}G,mem_req={}G".format(args.h_vmem, args.mem_req)
    exclude_nodes = "&".join(["!" + x for x in args.exclude_nodes])
    if len(exclude_nodes) > 0:
        exclude_nodes = "#$ -l h=" + exclude_nodes
    env_extra = [
        "#$ -e {}".format(args.log_dir or "/dev/null"),
        "#$ -o {}".format(args.log_dir or "/dev/null"),
        "#$ -pe serial {}".format(args.ngpus if args.ngpus > 0 else args.ncpus),
        exclude_nodes,
        "source " + args.to_source if args.to_source is not None else "",
        "export LANG=en_US.UTF-8",
        "export LC_ALL=en_US.UTF-8",
        "export MKL_NUM_THREADS=1",
        "export NUMEXPR_NUM_THREADS=1",
        "export OMP_NUM_THREADS=1",
        "export DISABLE_MP_CACHE=1",
        "export TORCH_HOME=/sequoia/data1/rriochet/.torch",
    ]
    for var in args.export_var:
        env_extra.append(f'export {var}="{os.environ[var]}"')
    cluster = SGECluster(
        queue=args.queue,
        resource_spec=resource_spec,
        walltime="720:00:00",
        name=name,
        cores=args.ncpus,
        memory="{}G".format(args.mem_req),
        processes=1,
        interface="ib0",
        local_directory=args.log_dir,
        env_extra=env_extra,
        spill_dir=args.spill_dir,
        extra=["--no-nanny"],
    )
    # cluster.adapt(maximum_jobs=args.jobs)
    cluster.scale(args.jobs)
    return cluster
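# A minimal usage sketch for init_cluster, using a hypothetical
# argparse.Namespace carrying the attributes the function reads.
def example_init_cluster():
    from argparse import Namespace
    args = Namespace(
        queue="all.q", h_vmem=16, mem_req=8, ngpus=0, ncpus=4,
        exclude_nodes=[], log_dir="logs", to_source=None, export_var=[],
        spill_dir=".", jobs=2,
    )
    return init_cluster("my-dask-workers", args)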
def get_client():
    dask.config.set({"distributed.admin.tick.limit": "300s"})
    cluster = SGECluster(
        queue="medium.q",
        project="labxchem",
        cores=10,
        processes=5,
        memory="64GB",
        resource_spec="m_mem_free=64G,redhat_release=rhel7",
        python="/dls/science/groups/i04-1/conor_dev/ccp4/build/bin/cctbx.python",
        walltime="03:00:00",
    )
    cluster.scale(60)
    time.sleep(15)
    client = Client(cluster)
    return client
def main():
    # dask cluster and client
    number_processes = 1
    number_jobs = 35
    number_workers = number_processes * number_jobs
    cluster = SGECluster(
        interface="ib0",
        walltime="04:00:00",
        memory="2 G",
        resource_spec="h_vmem=2G",
        scheduler_options={
            "dashboard_address": ":2727",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {number_processes}",
            "-l disk=1G",
        ],
        local_directory=os.sep.join([os.environ.get("PWD"), "dask-worker-space"]),
    )
    client = Client(cluster)
    cluster.scale(jobs=number_jobs)

    # main processing
    print("processing ...")
    results = []
    bag = db.from_sequence(nums, npartitions=number_workers)
    results = bag.map(weird_function).compute()

    print("saving ...")
    # ${USER} does not expand inside a Python f-string; read it from the environment
    joblib.dump(results, f"/nobackup/{os.environ.get('USER')}/results.joblib")

    client.close()
    cluster.close()
def main():
    # dask cluster and client
    number_processes = 1
    number_jobs = 35
    number_workers = number_processes * number_jobs
    cluster = SGECluster(
        interface="ib0",
        walltime="04:00:00",
        memory="12 G",
        resource_spec="h_vmem=12G",
        scheduler_options={
            "dashboard_address": ":2727",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {number_processes}",
            "-l disk=1G",
        ],
        local_directory=os.sep.join([os.environ.get("PWD"), "dask-worker-space"]),
    )
    client = Client(cluster)
    cluster.scale(jobs=number_jobs)

    # main processing
    print("processing ...")
    results = []
    bag = db.from_sequence(sims, npartitions=number_workers)
    results = bag.map(create_ozone_metric).compute()
    print("complete")

    client.close()
    cluster.close()
def main():
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs
    cluster = SGECluster(
        interface='ib0',
        walltime='01:00:00',
        memory='2 G',
        resource_spec='h_vmem=2G',
        scheduler_options={
            'dashboard_address': ':5757',
        },
        project='admiralty',
        job_extra=[
            '-cwd',
            '-V',
            f'-pe smp {n_processes}',
            '-l disk=1G',
        ],
        local_directory=os.sep.join([os.environ.get('PWD'), 'dask-worker-space']),
    )
    client = Client(cluster)
    cluster.scale(jobs=n_jobs)

    time_start = time.time()

    # custom inputs
    matrix_stacked = np.array(
        np.meshgrid(
            np.linspace(0, 1.5, 16),  # 1.5 and 16 for 0.1, 1.5 and 6 for 0.3, 1.4 and 8 for 0.2
            np.linspace(0, 1.5, 16),
            np.linspace(0, 1.5, 16),
            np.linspace(0, 1.5, 16),
            np.linspace(0, 1.5, 16),
        )
    ).T.reshape(-1, 5)
    custom_inputs_set = set(
        tuple(map(float, map("{:.1f}".format, item))) for item in matrix_stacked
    )

    custom_inputs_completed_filenames = glob.glob(
        '/nobackup/earlacoa/machinelearning/data/summary/ds*' + output + '*'
    )
    custom_inputs_completed_list = []
    for custom_inputs_completed_filename in custom_inputs_completed_filenames:
        custom_inputs_completed_list.append([
            float(item)
            for item in re.findall(r'\d+\.\d+', custom_inputs_completed_filename)
        ])
    custom_inputs_completed_set = set(
        tuple(item) for item in custom_inputs_completed_list
    )

    custom_inputs_remaining_set = custom_inputs_set - custom_inputs_completed_set
    custom_inputs = [
        np.array(item).reshape(1, -1) for item in custom_inputs_remaining_set
    ]
    print(f'custom inputs remaining for {output}: {len(custom_inputs)}')

    # dask bag and process
    custom_inputs = custom_inputs[0:5000]  # run in chunks (here the first 5,000) over 30 cores
    print(f'predicting for {len(custom_inputs)} custom inputs ...')
    bag_custom_inputs = db.from_sequence(custom_inputs, npartitions=n_workers)
    bag_custom_inputs.map(custom_predicts).compute()

    time_end = time.time() - time_start
    print(
        f'completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours'
    )
    print(
        f'average time per custom input is {time_end / len(custom_inputs):0.2f} seconds'
    )

    client.close()
    cluster.close()
def main():
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs
    cluster = SGECluster(
        interface="ib0",
        walltime="01:00:00",
        memory="64 G",
        resource_spec="h_vmem=64G",
        scheduler_options={
            "dashboard_address": ":5757",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {n_processes}",
            "-l disk=32G",
        ],
        local_directory=os.sep.join([os.environ.get("PWD"), "dask-worker-space"]),
    )
    client = Client(cluster)
    cluster.scale(jobs=n_jobs)

    time_start = time.time()

    # regrid custom outputs to pop grid
    custom_outputs = glob.glob(
        f"/nobackup/earlacoa/machinelearning/data_annual/predictions/{output}/ds*{output}.nc"
    )
    custom_outputs_completed = glob.glob(
        f"/nobackup/earlacoa/machinelearning/data_annual/predictions/{output}/ds*{output}_popgrid_0.25deg.nc"
    )
    custom_outputs_completed = [
        f"{item[0:-19]}.nc" for item in custom_outputs_completed
    ]
    custom_outputs_remaining_set = set(custom_outputs) - set(custom_outputs_completed)
    custom_outputs_remaining = [item for item in custom_outputs_remaining_set]
    print(f"custom outputs remaining for {output}: {len(custom_outputs_remaining)}")

    # dask bag and process
    custom_outputs_remaining = custom_outputs_remaining[0:5000]  # run in chunks of 5,000 over 30 cores, each chunk taking 2 minutes
    print(f"predicting for {len(custom_outputs_remaining)} custom outputs ...")
    bag_custom_outputs = db.from_sequence(custom_outputs_remaining, npartitions=n_workers)
    bag_custom_outputs.map(regrid_to_pop).compute()

    time_end = time.time() - time_start
    print(
        f"completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours"
    )
    print(
        f"average time per custom output is {time_end / len(custom_outputs_remaining):0.2f} seconds"
    )

    client.close()
    cluster.close()
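# A small standalone sketch of the resume pattern used in the mains above,
# with hypothetical names: diff the full input list against the outputs
# already on disk so an interrupted run only reprocesses what is missing.
def remaining_inputs(all_inputs, completed_outputs, suffix="_popgrid_0.25deg"):
    # map each completed output filename back to its input filename
    completed_as_inputs = {name.replace(suffix, "") for name in completed_outputs}
    return sorted(set(all_inputs) - completed_as_inputs)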
def run_JK_distributed_massboosted(df, param):
    '''Receives the pandas dataframe with the objects containing the
    temperature decrements and the parameter object, runs the kSZ statistic,
    and generates jackknifes. Everything runs on the cluster, so the current
    terminal does not need to request many cpus.

    df: dataframe object containing the variables for the calculation
    param: param file for this calculation; the number of resampling groups
        is taken from param.JK_NGROUPS
    '''
    Ncores = envVars.Ncores
    NWorkers = envVars.NWorkers
    Ngroups = param.JK_NGROUPS

    # set up cluster
    cluster = SGECluster(
        walltime='172800',
        processes=1,
        cores=1,
        env_extra=[
            '#$-pe sge_pe %i' % Ncores,
            '-l m_core=%i' % Ncores,
            'mkdir -p /tmp/pag227/dask/dask-scratch',
            'export NUMBA_NUM_THREADS=%i' % Ncores,
            'export OMP_NUM_THREADS=%i' % Ncores,
            # 'export OMP_NUM_THREADS=1',  # noqa
        ])
    cluster.scale(NWorkers)
    client = Client(cluster)
    time.sleep(30)
    # end setting up cluster

    # send full dataset to the cluster
    future_fullDataset = client.scatter(df)
    future_params = client.scatter(param)
    res_fullDataset = client.submit(get_pairwise_ksz_massboosted,
                                    future_fullDataset, future_params,
                                    multithreading=True)
    # done with the full dataset

    jk_results = []
    futureData = []  # data to be sent in jk or bootstrap in galaxy space
    for j in range(Ngroups):
        df_bs = df.copy()
        choose = np.random.choice(len(df), len(df))
        df_bs['dT'] = df.dT.values[choose]
        futureData.append(client.scatter(df_bs))

    if param.JK_RESAMPLING_METHOD.lower() == "bs_dt_mass_boosted_est":
        get_pw_func = get_pairwise_ksz_massboosted
    elif param.JK_RESAMPLING_METHOD.lower() == 'bs_dt_mass_boosted_est_debiased':  # noqa
        get_pw_func = get_pairwise_ksz_massboosted_debiased

    for j in range(Ngroups):
        jk_results.append(
            client.submit(get_pw_func, futureData[j], future_params,
                          multithreading=True))
    # extract results
    fullDataset_results = res_fullDataset.result()
    jk_results = client.gather(jk_results)
    client.close()
    # cluster.close()
    return fullDataset_results, jk_results
def main():
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs
    cluster = SGECluster(
        interface="ib0",
        walltime="01:00:00",
        memory="64 G",
        resource_spec="h_vmem=64G",
        scheduler_options={
            "dashboard_address": ":5757",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {n_processes}",
            "-l disk=32G",
        ],
        local_directory=os.sep.join([os.environ.get("PWD"), "dask-worker-scale-space"]),
    )
    client = Client(cluster)
    cluster.scale(jobs=n_jobs)

    time_start = time.time()

    def sector_variants(config):
        """Per-sector variants of a 1x5 emission config (RES, IND, TRA, AGR, ENE):
        each sector kept at its value with the others at control (1.0), then
        each sector zeroed in turn. Reproduces the original per-sector copies."""
        variants = []
        for sector in range(5):
            variant = np.copy(config)
            variant[0, :sector] = 1.0
            variant[0, sector + 1:] = 1.0
            variants.append(variant)
        for sector in range(5):
            variant = np.copy(config)
            variant[0, sector] = 0.0
            variants.append(variant)
        return variants

    def sector_only_variants(config):
        """Each sector kept at its value with the other four sectors zeroed."""
        variants = []
        for sector in range(5):
            variant = np.copy(config)
            variant[0, :sector] = 0.0
            variant[0, sector + 1:] = 0.0
            variants.append(variant)
        return variants

    # scale custom outputs
    if normal:
        emission_configs = np.array(
            np.meshgrid(
                np.linspace(0.0, 1.4, 8),
                np.linspace(0.0, 1.4, 8),
                np.linspace(0.0, 1.4, 8),
                np.linspace(0.0, 1.4, 8),
                np.linspace(0.0, 1.4, 8),
            )
        ).T.reshape(-1, 5)
        emission_configs_20percentintervals = []
        for emission_config in emission_configs:
            emission_configs_20percentintervals.append(
                f'RES{round(emission_config[0], 1)}_IND{round(emission_config[1], 1)}_TRA{round(emission_config[2], 1)}_AGR{round(emission_config[3], 1)}_ENE{round(emission_config[4], 1)}'
            )

    if extra:
        custom_inputs_main = [
            np.array([[1.15, 1.27, 0.98, 0.98, 1.36]]),  # bottom-up 2010
            np.array([[1.19, 1.30, 1.01, 1.01, 1.46]]),  # bottom-up 2011
            np.array([[1.20, 1.30, 1.01, 1.02, 1.39]]),  # bottom-up 2012
            np.array([[1.13, 1.29, 1.02, 1.01, 1.29]]),  # bottom-up 2013
            np.array([[1.06, 1.12, 0.99, 1.01, 1.12]]),  # bottom-up 2014
            np.array([[0.92, 0.84, 0.97, 0.99, 0.94]]),  # bottom-up 2016
            np.array([[0.84, 0.81, 0.99, 0.99, 0.89]]),  # bottom-up 2017
            np.array([[0.76, 0.934, 0.735, 0.683, 0.708]]),
            np.array([[0.704, 0.786, 0.73, 0.659, 0.6]]),
            np.array([[0.712, 0.703, 0.725, 0.676, 0.649]]),
            np.array([[0.739, 0.668, 0.701, 0.686, 0.682]]),
            np.array([[0.67, 0.609, 0.709, 0.621, 0.661]]),
            np.array([[0.744, 0.904, 0.778, 0.678, 0.716]]),
            np.array([[0.771, 0.835, 0.711, 0.685, 0.544]]),
            np.array([[0.647, 0.945, 0.746, 0.588, 0.473]]),
            np.array([[0.657, 0.745, 0.714, 0.613, 0.591]]),
            np.array([[0.582, 0.7, 0.672, 0.5, 0.492]]),
            np.array([[0.803, 0.835, 0.742, 0.71, 0.717]]),
            np.array([[0.721, 0.863, 0.712, 0.74, 0.709]]),
            np.array([[0.661, 0.674, 0.694, 0.742, 0.715]]),
            np.array([[0.701, 0.642, 0.669, 0.681, 0.679]]),
            np.array([[0.604, 0.399, 0.659, 0.613, 0.724]]),
            np.array([[0.769, 1.009, 0.697, 0.69, 0.72]]),
            np.array([[0.824, 0.759, 0.767, 0.641, 0.429]]),
            np.array([[0.858, 1.092, 0.794, 0.604, 0.475]]),
            np.array([[0.8, 0.987, 0.648, 0.57, 0.493]]),
            np.array([[0.867, 0.957, 0.677, 0.558, 0.477]]),
        ]
        custom_inputs = []
        for custom_input in custom_inputs_main:
            custom_inputs.append(custom_input)
            custom_inputs.extend(sector_variants(custom_input))
            custom_inputs.extend(sector_only_variants(custom_input))
        emission_configs_20percentintervals = []
        for custom_input in custom_inputs:
            emission_config = f'RES{custom_input[0][0]:0.3f}_IND{custom_input[0][1]:0.3f}_TRA{custom_input[0][2]:0.3f}_AGR{custom_input[0][3]:0.3f}_ENE{custom_input[0][4]:0.3f}'
            emission_configs_20percentintervals.append(emission_config)

    if climate_cobenefits:
        custom_inputs_main = [
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # Base_CLE_2020
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # Base_MFR_2020
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # SDS_MFR_2020
            np.array([[0.68, 0.84, 0.71, 1.16, 0.93]]),  # Base_CLE_2030
            np.array([[0.33, 0.47, 0.48, 0.81, 0.69]]),  # Base_MFR_2030
            np.array([[0.27, 0.45, 0.41, 0.81, 0.55]]),  # SDS_MFR_2030
            np.array([[0.57, 0.75, 0.69, 1.2, 0.94]]),  # Base_CLE_2040
            np.array([[0.24, 0.41, 0.31, 0.83, 0.73]]),  # Base_MFR_2040
            np.array([[0.19, 0.38, 0.22, 0.83, 0.5]]),  # SDS_MFR_2040
            np.array([[0.52, 0.72, 0.65, 1.24, 0.91]]),  # Base_CLE_2050
            np.array([[0.2, 0.38, 0.29, 0.86, 0.72]]),  # Base_MFR_2050
            np.array([[0.18, 0.35, 0.2, 0.86, 0.46]]),  # SDS_MFR_2050
        ]
        custom_inputs = []
        for custom_input in custom_inputs_main:
            custom_inputs.append(custom_input)
            custom_inputs.extend(sector_variants(custom_input))
            custom_inputs.extend(sector_only_variants(custom_input))
        emission_configs_20percentintervals = []
        for custom_input in custom_inputs:
            emission_config = f'RES{custom_input[0][0]:0.3f}_IND{custom_input[0][1]:0.3f}_TRA{custom_input[0][2]:0.3f}_AGR{custom_input[0][3]:0.3f}_ENE{custom_input[0][4]:0.3f}'
            emission_configs_20percentintervals.append(emission_config)

    if top_down_2020_baseline:
        emission_config_2020_baseline = np.array(
            [0.604, 0.399, 0.659, 0.613, 0.724]
        )  # matching to PM2.5 only, top 1,000
        emission_configs = np.array(
            np.meshgrid(
                np.linspace(emission_config_2020_baseline[0] * 0.50,
                            emission_config_2020_baseline[0],
                            6),  # 10% reduction increments from 2020 baseline up to 50%
                np.linspace(emission_config_2020_baseline[1] * 0.50,
                            emission_config_2020_baseline[1], 6),
                np.linspace(emission_config_2020_baseline[2] * 0.50,
                            emission_config_2020_baseline[2], 6),
                np.linspace(emission_config_2020_baseline[3] * 0.50,
                            emission_config_2020_baseline[3], 6),
                np.linspace(emission_config_2020_baseline[4] * 0.50,
                            emission_config_2020_baseline[4], 6),
            )
        ).T.reshape(-1, 5)
        # add a couple more for larger reductions in RES and IND to reach WHO-IT2
        emission_configs = list(emission_configs)
        emission_configs.append(np.array([0.242, 0.160, 0.659, 0.613, 0.724]))
        emission_configs.append(np.array([0.181, 0.120, 0.659, 0.613, 0.724]))
        emission_configs.append(np.array([0.121, 0.080, 0.659, 0.613, 0.724]))
        emission_configs.append(np.array([0.060, 0.040, 0.659, 0.613, 0.724]))
        emission_configs_20percentintervals = []
        for emission_config in emission_configs:
            emission_configs_20percentintervals.append(
                f'RES{round(emission_config[0], 3):.3f}_IND{round(emission_config[1], 3):.3f}_TRA{round(emission_config[2], 3):.3f}_AGR{round(emission_config[3], 3):.3f}_ENE{round(emission_config[4], 3):.3f}'
            )

    emission_configs_completed = glob.glob(
        f"/nobackup/earlacoa/machinelearning/data_annual/predictions/{output}_adjusted_scaled/ds*{output}_popgrid_0.25deg_adjusted_scaled.nc"
    )
    emission_configs_completed = [
        f"{item[88:-45]}" for item in emission_configs_completed
    ]

    emission_configs_20percentintervals_remaining_set = set(
        emission_configs_20percentintervals) - set(emission_configs_completed)
    emission_configs_remaining = [
        item for item in emission_configs_20percentintervals_remaining_set
    ]
    print(
        f"custom outputs remaining for {output}: {len(emission_configs_remaining)} - 20% intervals with {int(100 * len(emission_configs_20percentintervals_remaining_set) / len(emission_configs_20percentintervals))}% remaining"
    )

    # dask bag and process
    emission_configs_remaining = emission_configs_remaining[:35000]
    print(f"predicting for {len(emission_configs_remaining)} custom outputs ...")
    bag_emission_configs = db.from_sequence(emission_configs_remaining, npartitions=n_workers)
    bag_emission_configs.map(scale).compute()

    time_end = time.time() - time_start
    print(
        f"completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours"
    )
    print(
        f"average time per custom output is {time_end / len(emission_configs_remaining):0.2f} seconds"
    )

    client.close()
    cluster.close()
def run_JK_distributed(df, param, randomize=True):
    '''Receives the pandas dataframe with the objects containing the
    temperature decrements and the parameter object, runs the kSZ statistic,
    and generates jackknifes. Everything runs on the cluster, so the current
    terminal does not need to request many cpus.

    df: dataframe object containing the variables for the calculation
    param: param file for this calculation; the number of resampling groups
        is taken from param.JK_NGROUPS
    randomize: shuffle data before running the JK
    '''
    Ncores = envVars.Ncores
    NWorkers = envVars.NWorkers
    Ngroups = param.JK_NGROUPS
    resampling_method = param.JK_RESAMPLING_METHOD.lower()

    # set up cluster
    cluster = SGECluster(
        walltime='172800',
        processes=1,
        cores=1,
        env_extra=[
            '#$-pe sge_pe %i' % Ncores,
            '-l m_core=%i' % Ncores,
            'mkdir -p /tmp/pag227/dask/dask-scratch',
            'export NUMBA_NUM_THREADS=%i' % Ncores,
            'export OMP_NUM_THREADS=%i' % Ncores,
            # 'export OMP_NUM_THREADS=1',  # noqa
        ])
    cluster.scale(NWorkers)
    client = Client(cluster)
    time.sleep(30)
    # end setting up cluster

    # send full dataset to the cluster
    future_fullDataset = client.scatter(df)
    future_params = client.scatter(param)
    res_fullDataset = client.submit(pairwiser.get_pairwise_ksz,
                                    future_fullDataset, future_params,
                                    multithreading=True)
    # done with the full dataset

    # iterate over partial datasets for the JK
    if JK == resampling_method:
        indices_toDrop = JK_tools.indicesToDrop(df, Ngroups, randomize=randomize)
    jk_results = []
    futureData = []  # data to be sent in jk or bootstrap in galaxy space
    if (JK == resampling_method) or (BS == resampling_method):
        for j in range(Ngroups):  # submit data to the cluster
            if JK in resampling_method:  # if method jk
                dataJK = df.drop(indices_toDrop[j], inplace=False)
                futureData.append(client.scatter(dataJK))
            elif BS in resampling_method:
                dataBS = df.sample(len(df), replace=True)
                futureData.append(client.scatter(dataBS))
        # now do the JK calculation
        for j in range(Ngroups):
            jk_results.append(client.submit(pairwiser.get_pairwise_ksz,
                                            futureData[j], future_params,
                                            multithreading=True))
    if BS_PW == resampling_method:
        # submit the same dataset
        futureData = client.scatter(df, broadcast=True)
        for j in range(Ngroups):
            jk_results.append(client.submit(bs_pw.get_bootstrap_pairwise,
                                            futureData, future_params,
                                            multithreading=True, pure=False))
    if resampling_method == BS_DT:
        for j in range(Ngroups):
            df_bs = df.copy()
            choose = np.random.choice(len(df), len(df))
            df_bs['dT'] = df.dT.values[choose]
            futureData.append(client.scatter(df_bs))
        for j in range(Ngroups):
            jk_results.append(client.submit(pairwiser.get_pairwise_ksz,
                                            futureData[j], future_params,
                                            multithreading=True))
    if resampling_method == TL_JK:
        tiled_JK.classify_grid(df)
        df = tiled_JK.remove_edge_galaxies(df, tol_sigma=1.5)
        Ntiles = tiled_JK.how_many_tiles(df)
        for j in range(Ntiles):
            df_tosubmit = tiled_JK.remove_tile(df, j)
            futureData.append(client.scatter(df_tosubmit))
        for j in range(Ntiles):
            jk_results.append(client.submit(pairwiser.get_pairwise_ksz,
                                            futureData[j], future_params,
                                            multithreading=True))
    # extract results
    fullDataset_results = res_fullDataset.result()
    jk_results = client.gather(jk_results)
    client.close()
    # cluster.close()
    return fullDataset_results, jk_results
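# A compact sketch of the two resampling schemes dispatched above, with a
# hypothetical group count: the jackknife drops one disjoint index block per
# realization, while the galaxy-space bootstrap resamples rows with replacement.
def example_resampling(df, n_groups=10):
    blocks = np.array_split(np.random.permutation(len(df)), n_groups)
    jackknife = [df.drop(df.index[block]) for block in blocks]
    bootstrap = [df.sample(len(df), replace=True) for _ in range(n_groups)]
    return jackknife, bootstrap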
def main():
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs
    cluster = SGECluster(
        interface="ib0",
        walltime="01:00:00",
        memory="2 G",
        resource_spec="h_vmem=2G",
        scheduler_options={
            "dashboard_address": ":5757",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {n_processes}",
            "-l disk=1G",
        ],
        local_directory=os.sep.join([os.environ.get("PWD"), "dask-worker-space"]),
    )
    client = Client(cluster)
    cluster.scale(jobs=n_jobs)

    time_start = time.time()

    def sector_variants(config):
        """Per-sector variants of a 1x5 emission config (RES, IND, TRA, AGR, ENE):
        each sector kept at its value with the others at control (1.0), then
        each sector zeroed in turn. Reproduces the original per-sector copies."""
        variants = []
        for sector in range(5):
            variant = np.copy(config)
            variant[0, :sector] = 1.0
            variant[0, sector + 1:] = 1.0
            variants.append(variant)
        for sector in range(5):
            variant = np.copy(config)
            variant[0, sector] = 0.0
            variants.append(variant)
        return variants

    def sector_only_variants(config):
        """Each sector kept at its value with the other four sectors zeroed."""
        variants = []
        for sector in range(5):
            variant = np.copy(config)
            variant[0, :sector] = 0.0
            variant[0, sector + 1:] = 0.0
            variants.append(variant)
        return variants

    # custom inputs
    if normal:
        matrix_stacked = np.array(
            np.meshgrid(
                np.linspace(0, 1.5, 16),  # 1.5 and 16 for 0.1, 1.5 and 6 for 0.3, 1.4 and 8 for 0.2
                np.linspace(0, 1.5, 16),
                np.linspace(0, 1.5, 16),
                np.linspace(0, 1.5, 16),
                np.linspace(0, 1.5, 16),
            )
        ).T.reshape(-1, 5)
        custom_inputs_set = set(
            tuple(map(float, map("{:.1f}".format, item))) for item in matrix_stacked
        )

        custom_inputs_completed_filenames = glob.glob(
            f"/nobackup/earlacoa/machinelearning/data_annual/predictions/{output}/ds*{output}*"
        )
        custom_inputs_completed_list = []
        for custom_inputs_completed_filename in custom_inputs_completed_filenames:
            custom_inputs_completed_list.append([
                float(item)
                for item in re.findall(r"\d+\.\d+", custom_inputs_completed_filename)
            ])
        custom_inputs_completed_set = set(
            tuple(item) for item in custom_inputs_completed_list
        )

        custom_inputs_remaining_set = custom_inputs_set - custom_inputs_completed_set
        custom_inputs = [
            np.array(item).reshape(1, -1) for item in custom_inputs_remaining_set
        ]
        print(f"custom inputs remaining for {output}: {len(custom_inputs)}")

    if extra:
        custom_inputs_main = [
            np.array([[1.15, 1.27, 0.98, 0.98, 1.36]]),  # bottom-up 2010
            np.array([[1.19, 1.30, 1.01, 1.01, 1.46]]),  # bottom-up 2011
            np.array([[1.20, 1.30, 1.01, 1.02, 1.39]]),  # bottom-up 2012
            np.array([[1.13, 1.29, 1.02, 1.01, 1.29]]),  # bottom-up 2013
            np.array([[1.06, 1.12, 0.99, 1.01, 1.12]]),  # bottom-up 2014
            np.array([[0.92, 0.84, 0.97, 0.99, 0.94]]),  # bottom-up 2016
            np.array([[0.84, 0.81, 0.99, 0.99, 0.89]]),  # bottom-up 2017
            np.array([[0.76, 0.934, 0.735, 0.683, 0.708]]),
            np.array([[0.704, 0.786, 0.73, 0.659, 0.6]]),
            np.array([[0.712, 0.703, 0.725, 0.676, 0.649]]),
            np.array([[0.739, 0.668, 0.701, 0.686, 0.682]]),
            np.array([[0.67, 0.609, 0.709, 0.621, 0.661]]),
            np.array([[0.744, 0.904, 0.778, 0.678, 0.716]]),
            np.array([[0.771, 0.835, 0.711, 0.685, 0.544]]),
            np.array([[0.647, 0.945, 0.746, 0.588, 0.473]]),
            np.array([[0.657, 0.745, 0.714, 0.613, 0.591]]),
            np.array([[0.582, 0.7, 0.672, 0.5, 0.492]]),
            np.array([[0.803, 0.835, 0.742, 0.71, 0.717]]),
            np.array([[0.721, 0.863, 0.712, 0.74, 0.709]]),
            np.array([[0.661, 0.674, 0.694, 0.742, 0.715]]),
            np.array([[0.701, 0.642, 0.669, 0.681, 0.679]]),
            np.array([[0.604, 0.399, 0.659, 0.613, 0.724]]),
            np.array([[0.769, 1.009, 0.697, 0.69, 0.72]]),
            np.array([[0.824, 0.759, 0.767, 0.641, 0.429]]),
            np.array([[0.858, 1.092, 0.794, 0.604, 0.475]]),
            np.array([[0.8, 0.987, 0.648, 0.57, 0.493]]),
            np.array([[0.867, 0.957, 0.677, 0.558, 0.477]]),
        ]
        custom_inputs = []
        for custom_input in custom_inputs_main:
            custom_inputs.append(custom_input)
            custom_inputs.extend(sector_variants(custom_input))
            custom_inputs.extend(sector_only_variants(custom_input))

        # just for emulator_predictions.py, as this is required in order to adjust for double emissions
        custom_inputs_temp = custom_inputs.copy()
        for custom_input in custom_inputs_temp:
            custom_inputs.extend(sector_only_variants(custom_input))

        emission_configs_20percentintervals = []
        for custom_input in custom_inputs:
            emission_config = f'RES{custom_input[0][0]:0.3f}_IND{custom_input[0][1]:0.3f}_TRA{custom_input[0][2]:0.3f}_AGR{custom_input[0][3]:0.3f}_ENE{custom_input[0][4]:0.3f}'
            emission_configs_20percentintervals.append(emission_config)
        emission_configs_20percentintervals = list(set(emission_configs_20percentintervals))
        custom_inputs = []
        for emission_config in emission_configs_20percentintervals:
            custom_input = np.array([
                float(num) for num in re.findall(r'\d.\d+', emission_config)
            ]).reshape(1, -1)
            custom_inputs.append(custom_input)

    if climate_cobenefits:
        custom_inputs_main = [
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # Base_CLE_2020
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # Base_MFR_2020
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # SDS_MFR_2020
            np.array([[0.68, 0.84, 0.71, 1.16, 0.93]]),  # Base_CLE_2030
            np.array([[0.33, 0.47, 0.48, 0.81, 0.69]]),  # Base_MFR_2030
            np.array([[0.27, 0.45, 0.41, 0.81, 0.55]]),  # SDS_MFR_2030
            np.array([[0.57, 0.75, 0.69, 1.2, 0.94]]),  # Base_CLE_2040
            np.array([[0.24, 0.41, 0.31, 0.83, 0.73]]),  # Base_MFR_2040
            np.array([[0.19, 0.38, 0.22, 0.83, 0.5]]),  # SDS_MFR_2040
            np.array([[0.52, 0.72, 0.65, 1.24, 0.91]]),  # Base_CLE_2050
            np.array([[0.2, 0.38, 0.29, 0.86, 0.72]]),  # Base_MFR_2050
            np.array([[0.18, 0.35, 0.2, 0.86, 0.46]]),  # SDS_MFR_2050
        ]
        custom_inputs = []
        for custom_input in custom_inputs_main:
            custom_inputs.append(custom_input)
            custom_inputs.extend(sector_variants(custom_input))
            custom_inputs.extend(sector_only_variants(custom_input))

        # just for emulator_predictions.py, as this is required in order to adjust for double emissions
        custom_inputs_temp = custom_inputs.copy()
        for custom_input in custom_inputs_temp:
            custom_inputs.extend(sector_only_variants(custom_input))

        emission_configs_20percentintervals = []
        for custom_input in custom_inputs:
            emission_config = f'RES{custom_input[0][0]:0.3f}_IND{custom_input[0][1]:0.3f}_TRA{custom_input[0][2]:0.3f}_AGR{custom_input[0][3]:0.3f}_ENE{custom_input[0][4]:0.3f}'
            emission_configs_20percentintervals.append(emission_config)
        emission_configs_20percentintervals = list(set(emission_configs_20percentintervals))
        custom_inputs = []
        for emission_config in emission_configs_20percentintervals:
            custom_input = np.array([
                float(num) for num in re.findall(r'\d.\d+', emission_config)
            ]).reshape(1, -1)
            custom_inputs.append(custom_input)

    if top_down_2020_baseline:
        emission_config_2020_baseline = np.array(
            [0.604, 0.399, 0.659, 0.613, 0.724]
        )  # matching to PM2.5 only, top 1,000
        emission_configs = np.array(
            np.meshgrid(
                np.linspace(emission_config_2020_baseline[0] * 0.50,
                            emission_config_2020_baseline[0],
                            6),  # 10% reduction increments from 2020 baseline up to 50%
                np.linspace(emission_config_2020_baseline[1] * 0.50,
                            emission_config_2020_baseline[1], 6),
                np.linspace(emission_config_2020_baseline[2] * 0.50,
                            emission_config_2020_baseline[2], 6),
                np.linspace(emission_config_2020_baseline[3] * 0.50,
                            emission_config_2020_baseline[3], 6),
                np.linspace(emission_config_2020_baseline[4] * 0.50,
                            emission_config_2020_baseline[4], 6),
            )
        ).T.reshape(-1, 5)
        custom_inputs = [np.array(item).reshape(1, -1) for item in emission_configs]
        # add a couple more for larger reductions in RES and IND to reach WHO-IT2
        custom_inputs.append(np.array([[0.242, 0.160, 0.659, 0.613, 0.724]]))
        custom_inputs.append(np.array([[0.181, 0.120, 0.659, 0.613, 0.724]]))
        custom_inputs.append(np.array([[0.121, 0.080, 0.659, 0.613, 0.724]]))
        custom_inputs.append(np.array([[0.060, 0.040, 0.659, 0.613, 0.724]]))

        # just for emulator_predictions.py, as this is required in order to adjust for double emissions
        custom_inputs_temp = custom_inputs.copy()
        for custom_input in custom_inputs_temp:
            custom_inputs.extend(sector_only_variants(custom_input))

        emission_configs_20percentintervals = []
        for custom_input in custom_inputs:
            emission_config = f'RES{custom_input[0][0]:0.3f}_IND{custom_input[0][1]:0.3f}_TRA{custom_input[0][2]:0.3f}_AGR{custom_input[0][3]:0.3f}_ENE{custom_input[0][4]:0.3f}'
            emission_configs_20percentintervals.append(emission_config)
        emission_configs_20percentintervals = set(emission_configs_20percentintervals)

        custom_inputs_completed_filenames = glob.glob(
            f"/nobackup/earlacoa/machinelearning/data_annual/predictions/{output}/ds*{output}.nc"
        )
        custom_inputs_completed_list = []
        for custom_inputs_completed_filename in custom_inputs_completed_filenames:
            emission_config = re.findall(
                r"RES\d+\.\d+_IND\d+\.\d+_TRA\d+\.\d+_AGR\d+\.\d+_ENE\d+\.\d+",
                custom_inputs_completed_filename)
            if len(emission_config) > 0:
                custom_inputs_completed_list.append(emission_config)
        custom_inputs_completed_set = set(
            item[0] for item in custom_inputs_completed_list
        )
        custom_inputs_remaining_set = emission_configs_20percentintervals - custom_inputs_completed_set
        custom_inputs = [
            np.array([float(n) for n in re.findall(r'\d+.\d+', item)]).reshape(1, -1)
            for item in custom_inputs_remaining_set
        ]

    # dask bag and process
    custom_inputs = custom_inputs[:5000]
    #custom_inputs = custom_inputs[5000:]
    print(f"predicting for {len(custom_inputs)} custom inputs ...")
    bag_custom_inputs = db.from_sequence(custom_inputs, npartitions=n_workers)
    bag_custom_inputs.map(custom_predicts).compute()

    time_end = time.time() - time_start
    print(
        f"completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours"
    )
    print(
        f"average time per custom input is {time_end / len(custom_inputs):0.2f} seconds"
    )

    client.close()
    cluster.close()
def main():
    # dask cluster and client
    n_processes = 1
    n_workers = n_processes * n_jobs
    cluster = SGECluster(
        interface="ib0",
        walltime=walltime,
        memory="32 G",
        resource_spec="h_vmem=32G",
        scheduler_options={
            "dashboard_address": ":5761",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {n_processes}",
            "-l disk=32G",
        ],
        local_directory=os.sep.join([os.environ.get("PWD"), "dask-find-emis-pm-space"]),
    )
    client = Client(cluster)
    cluster.scale(jobs=n_jobs)

    time_start = time.time()

    # dask bag over emission_configs
    print(f"predicting over {len(emission_configs)} emission configs for {station_id} ...")
    bag_emission_configs = db.from_sequence(emission_configs, npartitions=n_workers)
    results = bag_emission_configs.map(filter_emission_configs).compute()

    station_diffs_abs = [result[0] for result in results]
    station_diffs_per = [result[1] for result in results]
    key = next(iter(baselines))  # baselines has a single key
    station_diffs_abs = [
        station_diff_abs for station_diff_abs in station_diffs_abs
        if len(station_diff_abs[key]) > 0
    ]
    station_diffs_per = [
        station_diff_per for station_diff_per in station_diffs_per
        if len(station_diff_per[key]) > 0
    ]

    merged_per = {}
    for station_diff_per in station_diffs_per:
        merged_per = {**merged_per, **station_diff_per[key]}
    merged_abs = {}
    for station_diff_abs in station_diffs_abs:
        merged_abs = {**merged_abs, **station_diff_abs[key]}
    station_diffs_per = {key: merged_per}
    station_diffs_abs = {key: merged_abs}

    joblib.dump(
        obs_change_abs,
        f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/obs_change_abs_{output}_{station_id}.joblib"
    )
    joblib.dump(
        obs_change_per,
        f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/obs_change_per_{output}_{station_id}.joblib"
    )
    joblib.dump(
        baselines,
        f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/baselines_{output}_{station_id}.joblib"
    )
    joblib.dump(
        targets,
        f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/targets_{output}_{station_id}.joblib"
    )
    joblib.dump(
        target_diffs,
        f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/target_diffs_{output}_{station_id}.joblib"
    )
    joblib.dump(
        station_diffs_abs,
        f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/station_diffs_abs_{output}_{station_id}.joblib"
    )
    joblib.dump(
        station_diffs_per,
        f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/station_diffs_per_{output}_{station_id}.joblib"
    )

    time_end = time.time() - time_start
    print(
        f"completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours"
    )

    client.close()
    cluster.close()
def run_error_estimation_distributed(df1, df2, param):
    Ncores = envVars.Ncores
    NWorkers = envVars.NWorkers
    Ngroups = param.JK_NGROUPS

    # set up cluster
    cluster = SGECluster(
        walltime='172800',
        processes=1,
        cores=1,
        env_extra=[
            '#$-pe sge_pe %i' % Ncores,
            '-l m_core=%i' % Ncores,
            'mkdir -p /tmp/pag227/dask/dask-scratch',
            'export NUMBA_NUM_THREADS=%i' % Ncores,
            'export OMP_NUM_THREADS=%i' % Ncores,
            # 'export OMP_NUM_THREADS=1',  # noqa
        ])
    cluster.scale(NWorkers)
    client = Client(cluster)
    time.sleep(10)
    # end setting up cluster

    # send full dataset to the cluster
    future_df1 = client.scatter(df1)
    future_df2 = client.scatter(df2)
    future_params = client.scatter(param)
    res_fullDataset_11 = client.submit(cpw.get_cross_pairwise_ksz,
                                       future_df1, future_df1, future_params)
    res_fullDataset_12 = client.submit(cpw.get_cross_pairwise_ksz,
                                       future_df1, future_df2, future_params)
    res_fullDataset_22 = client.submit(cpw.get_cross_pairwise_ksz,
                                       future_df2, future_df2, future_params)
    # done with the full dataset

    # iterate over partial datasets for the JK
    replicants1 = []  # data to be sent
    replicants2 = []
    if 'jk' in param.JK_RESAMPLING_METHOD.lower():
        all_indx = np.arange(len(df1))
        np.random.shuffle(all_indx)
        indx_to_drop = np.array_split(all_indx, param.JK_NGROUPS)
    for j in range(Ngroups):  # submit data to the cluster
        if 'jk' in param.JK_RESAMPLING_METHOD.lower():  # if method jk
            todrop = indx_to_drop[j]
            replicant1 = df1.drop(df1.index[todrop], inplace=False)
            replicant2 = df2.drop(df2.index[todrop], inplace=False)
            replicants1.append(client.scatter(replicant1))
            replicants2.append(client.scatter(replicant2))
        elif 'bootstrap' in param.JK_RESAMPLING_METHOD.lower():
            indxs = np.random.randint(low=0, high=len(df1), size=len(df1))
            replicant1 = df1.iloc[indxs]
            replicant2 = df2.iloc[indxs]
            replicants1.append(client.scatter(replicant1))
            replicants2.append(client.scatter(replicant2))
    # now do the JK calculation
    realizations11 = []
    realizations12 = []
    realizations22 = []
    for j in range(Ngroups):
        realizations11.append(
            client.submit(cpw.get_cross_pairwise_ksz,
                          replicants1[j], replicants1[j], future_params))
        realizations12.append(
            client.submit(cpw.get_cross_pairwise_ksz,
                          replicants1[j], replicants2[j], future_params))
        realizations22.append(
            client.submit(cpw.get_cross_pairwise_ksz,
                          replicants2[j], replicants2[j], future_params))
    # extract results
    fullDataset_result11 = res_fullDataset_11.result()
    fullDataset_result12 = res_fullDataset_12.result()
    fullDataset_result22 = res_fullDataset_22.result()
    resampling_result11 = client.gather(realizations11)
    resampling_result12 = client.gather(realizations12)
    resampling_result22 = client.gather(realizations22)
    client.close()
    # cluster.close()
    results = {
        'full11': fullDataset_result11,
        'full12': fullDataset_result12,
        'full22': fullDataset_result22,
        'resampled11': resampling_result11,
        'resampled12': resampling_result12,
        'resampled22': resampling_result22,
    }
    return results
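# A short sketch of how the gathered realizations are typically summarised
# into an error bar (this step is not part of the function above): the spread
# of the resampled statistics estimates the uncertainty of the full-dataset one.
def example_resampling_error(resampled_results):
    realizations = np.asarray(resampled_results)
    return realizations.mean(axis=0), realizations.std(axis=0)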
def main(): # dask cluster and client if output == 'PM2_5_DRY': n_jobs = 20 n_outputs = 1000 elif output == 'o3_6mDM8h': n_jobs = 20 n_outputs = 2000 n_processes = 1 n_workers = n_processes * n_jobs cluster = SGECluster( interface="ib0", walltime="02:00:00", memory=f"48 G", resource_spec=f"h_vmem=48G", scheduler_options={ "dashboard_address": ":7777", }, job_extra=[ "-cwd", "-V", f"-pe smp {n_processes}", f"-l disk=48G", ], local_directory=os.sep.join( [os.environ.get("PWD"), "dask-hia-ozone-space"]), ) client = Client(cluster) cluster.scale(jobs=n_jobs) time_start = time.time() # find remaining inputs if normal: custom_outputs = glob.glob( f"/nobackup/earlacoa/machinelearning/data_annual/predictions/{output}_adjusted_scaled/ds*{output}_popgrid_0.25deg_adjusted_scaled.nc" ) custom_outputs_completed = glob.glob( f"/nobackup/earlacoa/machinelearning/data_annual/health_impact_assessments/{output}_adjusted_scaled/df_country_hia_*.csv" ) custom_outputs_remaining_set = set([ item.split("/")[-1][3:-1 - len(output) - 19 - 7] for item in custom_outputs ]) - set([ item.split("/")[-1][15 + len(output) + 1:-4 - 7] for item in custom_outputs_completed ]) custom_outputs_remaining = [ item for item in custom_outputs_remaining_set ] print( f"custom outputs remaining for {output}: {len(custom_outputs_remaining)} - 10% intervals with {int(100 * len(custom_outputs_remaining_set) / 16**5)}% remaining" ) reduce_to_20percent_intervals = True if reduce_to_20percent_intervals: emission_configs = np.array( np.meshgrid( np.linspace(0.0, 1.4, 8), np.linspace(0.0, 1.4, 8), np.linspace(0.0, 1.4, 8), np.linspace(0.0, 1.4, 8), np.linspace(0.0, 1.4, 8), )).T.reshape(-1, 5) emission_configs_20percentintervals = [] for emission_config in emission_configs: emission_configs_20percentintervals.append( f'RES{round(emission_config[0], 1)}_IND{round(emission_config[1], 1)}_TRA{round(emission_config[2], 1)}_AGR{round(emission_config[3], 1)}_ENE{round(emission_config[4], 1)}' ) emission_configs_completed = [] for custom_output_completed in custom_outputs_completed: emission_configs_completed.append( re.findall( r'RES\d+.\d+_IND\d+.\d+_TRA\d+.\d+_AGR\d+.\d+_ENE\d+.\d+', custom_output_completed)[0]) emission_configs_20percentintervals_remaining_set = set( emission_configs_20percentintervals) - set( emission_configs_completed) custom_outputs_remaining = [ item for item in emission_configs_20percentintervals_remaining_set ] print( f"custom outputs remaining for {output}: {len(custom_outputs_remaining)} - 20% intervals with {int(100 * len(emission_configs_20percentintervals_remaining_set) / len(emission_configs_20percentintervals))}% remaining" ) if extra: if year == '2010': custom_inputs_main = [ np.array([[1.15, 1.27, 0.98, 0.98, 1.36]]), # bottom-up 2010 ] elif year == '2011': custom_inputs_main = [ np.array([[1.19, 1.30, 1.01, 1.01, 1.46]]), # bottom-up 2011 ] elif year == '2012': custom_inputs_main = [ np.array([[1.20, 1.30, 1.01, 1.02, 1.39]]), # bottom-up 2012 ] elif year == '2013': custom_inputs_main = [ np.array([[1.13, 1.29, 1.02, 1.01, 1.29]]), # bottom-up 2013 ] elif year == '2014': custom_inputs_main = [ np.array([[1.06, 1.12, 0.99, 1.01, 1.12]]), # bottom-up 2014 ] elif year == '2015': custom_inputs_main = [ np.array([[1.0, 1.0, 1.0, 1.0, 1.0]]), # control ] elif year == '2016': custom_inputs_main = [ np.array([[0.92, 0.84, 0.97, 0.99, 0.94]]), # bottom-up 2016 np.array([[0.76, 0.934, 0.735, 0.683, 0.708]]), # top-down 2016 - both np.array([[0.744, 0.904, 0.778, 0.678, 0.716]]), # top-down 2016 - either np.array([[0.803, 
0.835, 0.742, 0.71, 0.717]]), # top-down 2016 - pm25 only np.array([[0.769, 1.009, 0.697, 0.69, 0.72]]), # top-down 2016 - o3 only ] elif year == '2017': custom_inputs_main = [ np.array([[0.84, 0.81, 0.99, 0.99, 0.89]]), # bottom-up 2017 np.array([[0.704, 0.786, 0.73, 0.659, 0.6]]), # top-down 2017 - both np.array([[0.771, 0.835, 0.711, 0.685, 0.544]]), # top-down 2017 - either np.array([[0.721, 0.863, 0.712, 0.74, 0.709]]), # top-down 2017 - pm25 only np.array([[0.824, 0.759, 0.767, 0.641, 0.429]]), # top-down 2017 - o3 only ] elif year == '2018': custom_inputs_main = [ np.array([[0.712, 0.703, 0.725, 0.676, 0.649]]), # top-down 2018 - both np.array([[0.647, 0.945, 0.746, 0.588, 0.473]]), # top-down 2018 - either np.array([[0.661, 0.674, 0.694, 0.742, 0.715]]), # top-down 2018 - pm25 only np.array([[0.858, 1.092, 0.794, 0.604, 0.475]]), # top-down 2018 - o3 only ] elif year == '2019': custom_inputs_main = [ np.array([[0.739, 0.668, 0.701, 0.686, 0.682]]), # top-down 2019 - both np.array([[0.657, 0.745, 0.714, 0.613, 0.591]]), # top-down 2019 - either np.array([[0.701, 0.642, 0.669, 0.681, 0.679]]), # top-down 2019 - pm25 only np.array([[0.8, 0.987, 0.648, 0.57, 0.493]]), # top-down 2019 - o3 only ] elif year == '2020': custom_inputs_main = [ np.array([[0.67, 0.609, 0.709, 0.621, 0.661]]), # top-down 2020 - both np.array([[0.582, 0.7, 0.672, 0.5, 0.492]]), # top-down 2020 - either np.array([[0.604, 0.399, 0.659, 0.613, 0.724]]), # top-down 2020 - pm25 only np.array([[0.867, 0.957, 0.677, 0.558, 0.477]]), # top-down 2020 - o3 only ] custom_inputs = [] for custom_input in custom_inputs_main: custom_input_res = np.copy(custom_input) custom_input_ind = np.copy(custom_input) custom_input_tra = np.copy(custom_input) custom_input_agr = np.copy(custom_input) custom_input_ene = np.copy(custom_input) custom_input_nores = np.copy(custom_input) custom_input_noind = np.copy(custom_input) custom_input_notra = np.copy(custom_input) custom_input_noagr = np.copy(custom_input) custom_input_noene = np.copy(custom_input) custom_input_res[0][1:] = 1.0 custom_input_ind[0][0] = 1.0 custom_input_ind[0][2:] = 1.0 custom_input_tra[0][:2] = 1.0 custom_input_tra[0][3:] = 1.0 custom_input_agr[0][:3] = 1.0 custom_input_agr[0][4:] = 1.0 custom_input_ene[0][:4] = 1.0 custom_input_nores[0][0] = 0.0 custom_input_noind[0][1] = 0.0 custom_input_notra[0][2] = 0.0 custom_input_noagr[0][3] = 0.0 custom_input_noene[0][4] = 0.0 custom_inputs.append(custom_input) custom_inputs.append(custom_input_res) custom_inputs.append(custom_input_ind) custom_inputs.append(custom_input_tra) custom_inputs.append(custom_input_agr) custom_inputs.append(custom_input_ene) custom_inputs.append(custom_input_nores) custom_inputs.append(custom_input_noind) custom_inputs.append(custom_input_notra) custom_inputs.append(custom_input_noagr) custom_inputs.append(custom_input_noene) custom_outputs_remaining = [] for custom_input in custom_inputs: emission_config = f'RES{custom_input[0][0]:0.3f}_IND{custom_input[0][1]:0.3f}_TRA{custom_input[0][2]:0.3f}_AGR{custom_input[0][3]:0.3f}_ENE{custom_input[0][4]:0.3f}' custom_outputs_remaining.append(emission_config) if climate_cobenefits: custom_inputs_main = [ np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]), # Base_CLE_2020 np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]), # Base_MFR_2020 np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]), # SDS_MFR_2020 np.array([[0.68, 0.84, 0.71, 1.16, 0.93]]), # Base_CLE_2030 np.array([[0.33, 0.47, 0.48, 0.81, 0.69]]), # Base_MFR_2030 np.array([[0.27, 0.45, 0.41, 0.81, 0.55]]), # SDS_MFR_2030 
    if climate_cobenefits:
        custom_inputs_main = [
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # Base_CLE_2020
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # Base_MFR_2020
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # SDS_MFR_2020
            np.array([[0.68, 0.84, 0.71, 1.16, 0.93]]),  # Base_CLE_2030
            np.array([[0.33, 0.47, 0.48, 0.81, 0.69]]),  # Base_MFR_2030
            np.array([[0.27, 0.45, 0.41, 0.81, 0.55]]),  # SDS_MFR_2030
            np.array([[0.57, 0.75, 0.69, 1.2, 0.94]]),  # Base_CLE_2040
            np.array([[0.24, 0.41, 0.31, 0.83, 0.73]]),  # Base_MFR_2040
            np.array([[0.19, 0.38, 0.22, 0.83, 0.5]]),  # SDS_MFR_2040
            np.array([[0.52, 0.72, 0.65, 1.24, 0.91]]),  # Base_CLE_2050
            np.array([[0.2, 0.38, 0.29, 0.86, 0.72]]),  # Base_MFR_2050
            np.array([[0.18, 0.35, 0.2, 0.86, 0.46]]),  # SDS_MFR_2050
        ]

        custom_inputs = []
        for custom_input in custom_inputs_main:
            custom_input_res = np.copy(custom_input)
            custom_input_ind = np.copy(custom_input)
            custom_input_tra = np.copy(custom_input)
            custom_input_agr = np.copy(custom_input)
            custom_input_ene = np.copy(custom_input)
            custom_input_nores = np.copy(custom_input)
            custom_input_noind = np.copy(custom_input)
            custom_input_notra = np.copy(custom_input)
            custom_input_noagr = np.copy(custom_input)
            custom_input_noene = np.copy(custom_input)
            custom_input_resonly = np.copy(custom_input)
            custom_input_indonly = np.copy(custom_input)
            custom_input_traonly = np.copy(custom_input)
            custom_input_agronly = np.copy(custom_input)
            custom_input_eneonly = np.copy(custom_input)

            custom_input_res[0][1:] = 1.0
            custom_input_ind[0][0] = 1.0
            custom_input_ind[0][2:] = 1.0
            custom_input_tra[0][:2] = 1.0
            custom_input_tra[0][3:] = 1.0
            custom_input_agr[0][:3] = 1.0
            custom_input_agr[0][4:] = 1.0
            custom_input_ene[0][:4] = 1.0

            custom_input_nores[0][0] = 0.0
            custom_input_noind[0][1] = 0.0
            custom_input_notra[0][2] = 0.0
            custom_input_noagr[0][3] = 0.0
            custom_input_noene[0][4] = 0.0

            custom_input_resonly[0][1:] = 0.0
            custom_input_indonly[0][0] = 0.0
            custom_input_indonly[0][2:] = 0.0
            custom_input_traonly[0][:2] = 0.0
            custom_input_traonly[0][3:] = 0.0
            custom_input_agronly[0][:3] = 0.0
            custom_input_agronly[0][4:] = 0.0
            custom_input_eneonly[0][:4] = 0.0

            custom_inputs.append(custom_input)
            custom_inputs.append(custom_input_res)
            custom_inputs.append(custom_input_ind)
            custom_inputs.append(custom_input_tra)
            custom_inputs.append(custom_input_agr)
            custom_inputs.append(custom_input_ene)
            custom_inputs.append(custom_input_nores)
            custom_inputs.append(custom_input_noind)
            custom_inputs.append(custom_input_notra)
            custom_inputs.append(custom_input_noagr)
            custom_inputs.append(custom_input_noene)
            custom_inputs.append(custom_input_resonly)
            custom_inputs.append(custom_input_indonly)
            custom_inputs.append(custom_input_traonly)
            custom_inputs.append(custom_input_agronly)
            custom_inputs.append(custom_input_eneonly)

        custom_outputs_remaining = []
        for custom_input in custom_inputs:
            emission_config = f"RES{custom_input[0][0]:0.3f}_IND{custom_input[0][1]:0.3f}_TRA{custom_input[0][2]:0.3f}_AGR{custom_input[0][3]:0.3f}_ENE{custom_input[0][4]:0.3f}"
            custom_outputs_remaining.append(emission_config)
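        # note (added for clarity): this branch adds "only" variants (all
        # other sectors zeroed) on top of the resets and zeroes used in the
        # extra branch, so each of the 12 scenario baselines expands to 16
        # configurations (1 base + 5 resets + 5 zeroes + 5 "only"),
        # i.e. 192 configurations in total.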
    if top_down_2020_baseline:
        emission_config_2020_baseline = np.array(
            [0.604, 0.399, 0.659, 0.613, 0.724]
        )  # matching to PM2.5 only, top 1,000
        emission_configs = np.array(
            np.meshgrid(
                # 10% reduction increments from the 2020 baseline, down to 50%
                np.linspace(emission_config_2020_baseline[0] * 0.50, emission_config_2020_baseline[0], 6),
                np.linspace(emission_config_2020_baseline[1] * 0.50, emission_config_2020_baseline[1], 6),
                np.linspace(emission_config_2020_baseline[2] * 0.50, emission_config_2020_baseline[2], 6),
                np.linspace(emission_config_2020_baseline[3] * 0.50, emission_config_2020_baseline[3], 6),
                np.linspace(emission_config_2020_baseline[4] * 0.50, emission_config_2020_baseline[4], 6),
            )
        ).T.reshape(-1, 5)
        # add a few more for larger reductions in RES and IND to reach WHO-IT2
        # (1-D arrays, matching the shape of the meshgrid rows above, so that
        # the scalar indexing and rounding below work for every entry)
        emission_configs = list(emission_configs)
        emission_configs.append(np.array([0.242, 0.160, 0.659, 0.613, 0.724]))
        emission_configs.append(np.array([0.181, 0.120, 0.659, 0.613, 0.724]))
        emission_configs.append(np.array([0.121, 0.080, 0.659, 0.613, 0.724]))
        emission_configs.append(np.array([0.060, 0.040, 0.659, 0.613, 0.724]))

        emission_configs_total = []
        for emission_config in emission_configs:
            emission_configs_total.append(
                f"RES{round(emission_config[0], 3):.3f}_IND{round(emission_config[1], 3):.3f}_TRA{round(emission_config[2], 3):.3f}_AGR{round(emission_config[3], 3):.3f}_ENE{round(emission_config[4], 3):.3f}"
            )

        custom_outputs_completed = glob.glob(
            f"/nobackup/earlacoa/machinelearning/data_annual/health_impact_assessments/{output}_adjusted_scaled/df_country_hia_*.csv"
        )
        emission_configs_completed = []
        for custom_output_completed in custom_outputs_completed:
            emission_configs_completed.append(
                re.findall(
                    r"RES\d+\.\d+_IND\d+\.\d+_TRA\d+\.\d+_AGR\d+\.\d+_ENE\d+\.\d+",
                    custom_output_completed,
                )[0]
            )

        emission_configs_remaining_set = set(emission_configs_total) - set(
            emission_configs_completed
        )
        custom_outputs_remaining = [item for item in emission_configs_remaining_set]
        print(
            f"custom outputs remaining: {len(custom_outputs_remaining)}, {int(100 * len(emission_configs_remaining_set) / len(emission_configs_total))}%"
        )

    # --------------------------------------------------
    # dask bag and process
    # process the next n_outputs configurations per submission; rerun the
    # script until no custom outputs remain
    custom_outputs_remaining = custom_outputs_remaining[0:n_outputs]
    print(f"predicting for {len(custom_outputs_remaining)} custom outputs ...")

    bag_custom_outputs = db.from_sequence(custom_outputs_remaining, npartitions=n_workers)
    if output == "PM2_5_DRY":
        bag_custom_outputs.map(health_impact_assessment_pm25).compute()
    elif output == "o3_6mDM8h":
        bag_custom_outputs.map(health_impact_assessment_o3).compute()

    time_end = time.time() - time_start
    print(
        f"completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours"
    )

    client.close()
    cluster.close()
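# a minimal entry point, assuming this script is run directly (sketch, not in
# the original); main() relies on module-level globals such as `output`,
# `normal`, `extra`, `climate_cobenefits`, and `top_down_2020_baseline`,
# which must be defined before it runs
if __name__ == "__main__":
    main()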