Example #1
def init_cluster(args):
    env_extra = [
        "#$ -e {}".format(args.log_dir or "/dev/null"),
        "#$ -o {}".format(args.log_dir or "/dev/null"),
        "#$ -pe serial {}".format(args.ngpus if args.ngpus > 0 else args.ncpus),
        "export LANG=en_US.UTF-8",
        "export LC_ALL=en_US.UTF-8",
        "export MKL_NUM_THREADS=1",
        "export NUMEXPR_NUM_THREADS=1",
        "export OMP_NUM_THREADS=1",
        "export DISABLE_MP_CACHE=1",
    ]
    cluster = SGECluster(
        queue=args.queue,
        resource_spec="h_vmem={}G,mem_req={}G".format(args.h_vmem, args.mem_req),
        walltime="720:00:00",
        name="test_Dask_PytorchDataloader",
        cores=args.ncpus,
        memory="{}G".format(args.mem_req),
        processes=1,
        interface="ib0",
        local_directory=".",
        env_extra=env_extra,
        spill_dir=".",
        extra=["--no-nanny"],
    )
    cluster.scale(args.jobs)
    return cluster
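These excerpts omit their import lines. A minimal sketch of the imports the examples below assume (the exact set varies per project):

import os
import re
import glob
import time

import joblib
import numpy as np
import dask
import dask.bag as db
from dask.distributed import Client
from dask_jobqueue import SGECluster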
Example #2
def setup_client_and_cluster(number_processes=1,
                             number_jobs=1,
                             walltime="00:01:00",
                             memory=1):
    """
    Setup Dask client and cluster.
    Ensure that the number of workers is the right amount
    for your job and will be fully utilised.
    """
    print("Setting up Dask client and cluster ...")
    # number of workers used for number of partitions
    number_workers = number_processes * number_jobs
    # these are the requirements for a single worker
    cluster = SGECluster(
        interface="ib0",
        walltime=walltime,
        memory=f"{memory} G",
        resource_spec=f"h_vmem={memory}G",
        scheduler_options={"dashboard_address": ":2727"},
        job_extra=[
            "-V",  # export all environment variables
            f"-pe smp {number_processes}",
            f"-l disk={memory}G",
        ],
        local_directory=os.sep.join(
            [os.environ.get("PWD"), "dask-worker-space"]),
    )
    client = Client(cluster)
    cluster.scale(jobs=number_jobs)
    print("The resources of each worker are: ")
    print(cluster.job_script())
    return client, cluster
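A usage sketch for the helper above, assuming it is importable and an SGE scheduler is reachable; the workload itself is hypothetical:

client, cluster = setup_client_and_cluster(number_processes=1,
                                           number_jobs=4,
                                           walltime="00:30:00",
                                           memory=2)
try:
    pass  # submit work through `client` here
finally:
    client.close()
    cluster.close()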
Example #3
def main():
    # dask cluster and client
    n_jobs = 20
    n_processes = 1
    n_workers = n_processes * n_jobs

    cluster = SGECluster(
        interface="ib0",
        walltime="02:00:00",
        memory=f"48 G",
        resource_spec=f"h_vmem=48G",
        scheduler_options={
            "dashboard_address": ":7777",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {n_processes}",
            f"-l disk=48G",
        ],
        local_directory=os.sep.join([os.environ.get("PWD"), "dask-hia-space"]),
    )

    client = Client(cluster)
    cluster.scale(jobs=n_jobs)
    time_start = time.time()

    # dask bag and process
    simulations = [f'emulator_Base_CLE_2020_{output}']

    #simulations = []
    #simulations.append(f'wrfchem_Base_CLE_2020_{output}')
    #simulations.append(f'wrfchem_Base_CLE_2050_{output}')
    #simulations.append(f'wrfchem_Base_MFR_2050_{output}')
    #simulations.append(f'wrfchem_SDS_MFR_2050_{output}')

    #for year in ['2020', '2030', '2040', '2050']:
    #    for scenario in ['Base_CLE', 'Base_MFR', 'SDS_MFR']:
    #        for sim in ['', '_RES', '_IND', '_TRA', '_AGR', '_ENE', '_NO_RES', '_NO_IND', '_NO_TRA', '_NO_AGR', '_NO_ENE']:
    #            simulations.append(f'emulator_{scenario}_{year}{sim}_{output}')

    print(f"predicting for {len(simulations)} custom outputs ...")
    bag_simulations = db.from_sequence(simulations, npartitions=n_workers)

    if output == "PM2_5_DRY":
        bag_simulations.map(health_impact_assessment_pm25).compute()
    elif output == "o3_6mDM8h":
        bag_simulations.map(health_impact_assessment_o3).compute()

    time_end = time.time() - time_start
    print(
        f"completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours"
    )

    client.close()
    cluster.close()
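The bag pattern used in main() above (from_sequence, then map, then compute) can be tried locally without SGE; a minimal sketch with a hypothetical squaring step:

import dask.bag as db

bag = db.from_sequence([1, 2, 3, 4], npartitions=2)
squares = bag.map(lambda x: x * x).compute()  # [1, 4, 9, 16]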
Example #4
def main():
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs

    cluster = SGECluster(interface='ib0',
                         walltime='01:00:00',
                         memory='64 G',
                         resource_spec='h_vmem=64G',
                         scheduler_options={
                             'dashboard_address': ':5757',
                         },
                         job_extra=['-cwd', '-V', f'-pe smp {n_processes}'],
                         local_directory=os.sep.join(
                             [os.environ.get('PWD'), 'dask-worker-space']))

    client = Client(cluster)

    cluster.scale(jobs=n_jobs)

    time_start = time.time()

    # regrid custom outputs to pop grid
    custom_outputs = glob.glob(path + 'ds*' + output + '.nc')
    custom_outputs_completed = glob.glob(path + 'ds*' + output +
                                         '_popgrid_0.05deg.nc')
    custom_outputs_completed = [
        f'{item[0:-19]}.nc' for item in custom_outputs_completed
    ]
    custom_outputs_remaining_set = set(custom_outputs) - set(custom_outputs_completed)
    custom_outputs_remaining = list(custom_outputs_remaining_set)
    print(
        f'custom outputs remaining for {output}: {len(custom_outputs_remaining)}'
    )

    # dask bag and process
    # process in chunks of 2,500 over 30 cores, each chunk taking 5 minutes
    custom_outputs_remaining = custom_outputs_remaining[0:2500]
    print(f'predicting for {len(custom_outputs_remaining)} custom outputs ...')
    bag_custom_outputs = db.from_sequence(custom_outputs_remaining,
                                          npartitions=n_workers)
    bag_custom_outputs.map(regrid_to_pop).compute()

    time_end = time.time() - time_start
    print(
        f'completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours'
    )
    print(
        f'average time per custom output is {time_end / len(custom_outputs_remaining):0.2f} seconds'
    )

    client.close()
    cluster.close()
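The resume logic above (glob the finished outputs, strip their suffix, and subtract them from the full set) reduces to a small sketch; the file names here are hypothetical:

outputs = ["ds_a.nc", "ds_b.nc", "ds_c.nc"]
completed = ["ds_a_popgrid.nc"]  # already regridded
completed_as_inputs = {f"{name[:-len('_popgrid.nc')]}.nc" for name in completed}
remaining = sorted(set(outputs) - completed_as_inputs)  # ['ds_b.nc', 'ds_c.nc']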
Example #5
def main():
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs

    cluster = SGECluster(
        interface="ib0",
        walltime="48:00:00",
        memory=f"12 G",
        resource_spec=f"h_vmem=12G",
        scheduler_options={
            "dashboard_address": ":5757",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {n_processes}",
            f"-l disk=1G",
        ],
        local_directory=os.sep.join(
            [os.environ.get("PWD"), "dask-worker-space_popweighted_region"]),
    )

    client = Client(cluster)

    cluster.scale(jobs=n_jobs)

    # main processing
    matrix_stacked = np.array(
        np.meshgrid(
            np.linspace(0, 1.4, 8),
            np.linspace(0, 1.4, 8),
            np.linspace(0, 1.4, 8),
            np.linspace(0, 1.4, 8),
            np.linspace(0, 1.4, 8),
        )).T.reshape(-1, 5)

    custom_inputs = [np.array(item).reshape(1, -1) for item in matrix_stacked]

    print(f"processing for {output} over {region} ...")
    outputs_popweighted = []
    bag_custom_inputs = db.from_sequence(custom_inputs, npartitions=n_workers)
    outputs_popweighted = bag_custom_inputs.map(
        popweight_outputs_for_input).compute()

    print("saving ...")
    joblib.dump(
        outputs_popweighted,
        f"/nobackup/earlacoa/machinelearning/data_annual/popweighted/popweighted_{region}_{output}_0.25deg_adjusted_scaled.joblib",
    )

    client.close()
    cluster.close()
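Results persisted with joblib.dump can be read back with joblib.load; a round-trip sketch with a hypothetical path:

import joblib

joblib.dump([1.0, 2.0, 3.0], "outputs.joblib")  # hypothetical path
assert joblib.load("outputs.joblib") == [1.0, 2.0, 3.0]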
Example #6
def main():
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs

    cluster = SGECluster(interface='ib0',
                         walltime='48:00:00',
                         memory='12 G',
                         resource_spec='h_vmem=12G',
                         scheduler_options={
                             'dashboard_address': ':5757',
                         },
                         job_extra=[
                             '-cwd',
                             '-V',
                             f'-pe smp {n_processes}',
                             '-l disk=1G',
                         ],
                         local_directory=os.sep.join(
                             [os.environ.get('PWD'), 'dask-worker-space']))

    client = Client(cluster)

    cluster.scale(jobs=n_jobs)

    # main processing
    matrix_stacked = np.array(
        np.meshgrid(np.linspace(0, 1.5, 16), np.linspace(0, 1.5, 16),
                    np.linspace(0, 1.5, 16), np.linspace(0, 1.5, 16),
                    np.linspace(0, 1.5, 16))).T.reshape(-1, 5)

    custom_inputs = [np.array(item).reshape(1, -1) for item in matrix_stacked]

    print(f'processing for {output} over {region} ...')
    outputs_popweighted = []
    bag_custom_inputs = db.from_sequence(custom_inputs, npartitions=n_workers)
    outputs_popweighted = bag_custom_inputs.map(
        popweight_outputs_for_input).compute()

    print('saving ...')
    joblib.dump(
        outputs_popweighted,
        '/nobackup/earlacoa/machinelearning/data/popweighted/popweighted_' +
        region + '_' + output + '.joblib')

    client.close()
    cluster.close()
Example #7
    def get_cluster(which="ccin2p3", scale=None, set_client=True, **kwargs):
        """ """
        
        if which == "ccin2p3":
            from dask_jobqueue import SGECluster
            prop = dict(name="dask-worker",  walltime="06:00:00",
                        memory='8GB', death_timeout=120, project="P_ztf",
                        resource_spec='sps=1', cores=1, processes=1)
            
            cluster = SGECluster(**{**prop,**kwargs})
        else:
            raise NotImplementedError(f"only 'ccin2p3' cluster implemented; {which} given")

        if scale is not None:
            cluster.scale(int(scale))

        return cluster
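A usage sketch, assuming access to the CC-IN2P3 SGE scheduler; any keyword argument overrides the defaults in prop via the {**prop, **kwargs} merge:

from dask.distributed import Client

cluster = get_cluster("ccin2p3", scale=10, walltime="02:00:00")
client = Client(cluster)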
Example #8
def process_dask(
    funcs,
    jobs=10,
    cores=3,
    processes=3,
    h_vmem=20,
    m_mem_free=5,
    h_rt=3000,
):
    cluster = SGECluster(
        n_workers=0,
        job_cls=None,
        loop=None,
        security=None,
        silence_logs='error',
        name=None,
        asynchronous=False,
        interface=None,
        host=None,
        protocol='tcp://',
        dashboard_address=':8787',
        config_name=None,
        processes=processes,
        queue='low.q',
        project="labxchem",
        cores=cores,
        memory="{}GB".format(h_vmem),
        walltime=h_rt,
        resource_spec="m_mem_free={}G,h_vmem={}G,h_rt={}".format(
            m_mem_free, h_vmem, h_rt),
    )

    cluster.scale(jobs=jobs)

    client = Client(cluster)

    results_futures = client.map(
        call,
        funcs,
    )

    results = client.gather(results_futures)

    return results
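The call helper mapped over funcs is not part of the excerpt; a plausible sketch is that it simply invokes each zero-argument callable, which makes process_dask usable as below (the work functions are hypothetical):

def call(func):
    return func()

results = process_dask([lambda: 1 + 1, lambda: 2 * 2], jobs=2)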
Example #9
File: dask.py Project: rronan/pyronan
def init_cluster(name, args):
    resource_spec = "h_vmem={}G,mem_req={}G".format(args.h_vmem, args.mem_req)
    exclude_nodes = "&".join(["!" + x for x in args.exclude_nodes])
    if len(exclude_nodes) > 0:
        exclude_nodes = "#$ -l h=" + exclude_nodes
    env_extra = [
        "#$ -e {}".format(args.log_dir or "/dev/null"),
        "#$ -o {}".format(args.log_dir or "/dev/null"),
        "#$ -pe serial {}".format(
            args.ngpus if args.ngpus > 0 else args.ncpus),
        exclude_nodes,
        "source " + args.to_source if args.to_source is not None else "",
        "export LANG=en_US.UTF-8",
        "export LC_ALL=en_US.UTF-8",
        "export MKL_NUM_THREADS=1",
        "export NUMEXPR_NUM_THREADS=1",
        "export OMP_NUM_THREADS=1",
        "export DISABLE_MP_CACHE=1",
        "export TORCH_HOME=/sequoia/data1/rriochet/.torch",
    ]
    for var in args.export_var:
        env_extra.append(f'export {var}="{os.environ[var]}"')
    cluster = SGECluster(
        queue=args.queue,
        resource_spec=resource_spec,
        walltime="720:00:00",
        name=name,
        cores=args.ncpus,
        memory="{}G".format(args.mem_req),
        processes=1,
        interface="ib0",
        local_directory=args.log_dir,
        env_extra=env_extra,
        spill_dir=args.spill_dir,
        extra=["--no-nanny"],
    )
    # cluster.adapt(maximum_jobs=args.jobs)
    cluster.scale(args.jobs)
    return cluster
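The commented-out cluster.adapt line above hints at an adaptive alternative to the fixed scale call; a sketch using dask_jobqueue's adapt keywords (an assumption about the installed version):

cluster.adapt(minimum_jobs=0, maximum_jobs=args.jobs)  # grow and shrink with load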
Example #10
def get_client():

    dask.config.set({"distributed.admin.tick.limit": "300s"})

    cluster = SGECluster(
        queue="medium.q",
        project="labxchem",
        cores=10,
        processes=5,
        memory="64GB",
        resource_spec="m_mem_free=64G,redhat_release=rhel7",
        python="/dls/science/groups/i04-1/conor_dev/ccp4/build/bin/cctbx.python",
        walltime="03:00:00",
    )
    cluster.scale(60)

    time.sleep(15)

    client = Client(cluster)

    return client
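The fixed time.sleep(15) above just gives workers time to start; as a sketch, distributed's Client.wait_for_workers can instead block until a worker count is reached:

client = Client(cluster)
client.wait_for_workers(n_workers=10, timeout=300)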
Example #11
def main():
    # dask cluster and client
    number_processes = 1
    number_jobs = 35
    number_workers = number_processes * number_jobs

    cluster = SGECluster(
        interface="ib0",
        walltime="04:00:00",
        memory=f"2 G",
        resource_spec=f"h_vmem=2G",
        scheduler_options={
            "dashboard_address": ":2727",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {number_processes}",
            f"-l disk=1G",
        ],
        local_directory=os.sep.join(
            [os.environ.get("PWD"), "dask-worker-space"]),
    )

    client = Client(cluster)
    cluster.scale(jobs=number_jobs)

    # main processing
    print("processing ...")
    results = []
    bag = db.from_sequence(nums, npartitions=number_workers)
    results = bag.map(weird_function).compute()

    print("saving ...")
    joblib.dump(results, f"/nobackup/{os.environ['USER']}/results.joblib")

    client.close()
    cluster.close()
Example #12
def main():
    # dask cluster and client
    number_processes = 1
    number_jobs = 35
    number_workers = number_processes * number_jobs

    cluster = SGECluster(
        interface="ib0",
        walltime="04:00:00",
        memory=f"12 G",
        resource_spec=f"h_vmem=12G",
        scheduler_options={
            "dashboard_address": ":2727",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {number_processes}",
            f"-l disk=1G",
        ],
        local_directory=os.sep.join(
            [os.environ.get("PWD"), "dask-worker-space"]),
    )

    client = Client(cluster)
    cluster.scale(jobs=number_jobs)

    # main processing
    print("processing ...")
    results = []
    bag = db.from_sequence(sims, npartitions=number_workers)
    results = bag.map(create_ozone_metric).compute()
    print("complete")

    client.close()
    cluster.close()
Example #13
def main():
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs

    cluster = SGECluster(interface='ib0',
                         walltime='01:00:00',
                         memory='2 G',
                         resource_spec='h_vmem=2G',
                         scheduler_options={
                             'dashboard_address': ':5757',
                         },
                         project='admiralty',
                         job_extra=[
                             '-cwd',
                             '-V',
                             f'-pe smp {n_processes}',
                             '-l disk=1G',
                         ],
                         local_directory=os.sep.join(
                             [os.environ.get('PWD'), 'dask-worker-space']))

    client = Client(cluster)

    cluster.scale(jobs=n_jobs)

    time_start = time.time()

    # custom inputs
    # spacings: 1.5 and 16 for 0.1, 1.5 and 6 for 0.3, 1.4 and 8 for 0.2
    matrix_stacked = np.array(
        np.meshgrid(
            np.linspace(0, 1.5, 16),
            np.linspace(0, 1.5, 16),
            np.linspace(0, 1.5, 16),
            np.linspace(0, 1.5, 16),
            np.linspace(0, 1.5, 16),
        )).T.reshape(-1, 5)
    custom_inputs_set = set(
        tuple(map(float, map("{:.1f}".format, item)))
        for item in matrix_stacked)

    custom_inputs_completed_filenames = glob.glob(
        '/nobackup/earlacoa/machinelearning/data/summary/ds*' + output + '*')
    custom_inputs_completed_list = []
    for custom_inputs_completed_filename in custom_inputs_completed_filenames:
        custom_inputs_completed_list.append([
            float(item) for item in re.findall(
                r'\d+\.\d+', custom_inputs_completed_filename)
        ])

    custom_inputs_completed_set = set(
        tuple(item) for item in custom_inputs_completed_list)
    custom_inputs_remaining_set = custom_inputs_set - custom_inputs_completed_set
    custom_inputs = [
        np.array(item).reshape(1, -1) for item in custom_inputs_remaining_set
    ]
    print(f'custom inputs remaining for {output}: {len(custom_inputs)}')

    # dask bag and process
    # process the first 5,000 inputs in this run, over 30 cores
    custom_inputs = custom_inputs[0:5000]
    print(f'predicting for {len(custom_inputs)} custom inputs ...')
    bag_custom_inputs = db.from_sequence(custom_inputs, npartitions=n_workers)
    bag_custom_inputs.map(custom_predicts).compute()

    time_end = time.time() - time_start
    print(
        f'completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours'
    )
    print(
        f'average time per custom input is {time_end / len(custom_inputs):0.2f} seconds'
    )

    client.close()
    cluster.close()
Example #14
def main():
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs

    cluster = SGECluster(
        interface="ib0",
        walltime="01:00:00",
        memory=f"64 G",
        resource_spec=f"h_vmem=64G",
        scheduler_options={
            "dashboard_address": ":5757",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {n_processes}",
            f"-l disk=32G",
        ],
        local_directory=os.sep.join([os.environ.get("PWD"), "dask-worker-space"]),
    )

    client = Client(cluster)

    cluster.scale(jobs=n_jobs)

    time_start = time.time()

    # regrid custom outputs to pop grid
    custom_outputs = glob.glob(
        f"/nobackup/earlacoa/machinelearning/data_annual/predictions/{output}/ds*{output}.nc"
    )
    custom_outputs_completed = glob.glob(
        f"/nobackup/earlacoa/machinelearning/data_annual/predictions/{output}/ds*{output}_popgrid_0.25deg.nc"
    )
    custom_outputs_completed = [
        f"{item[0:-19]}.nc" for item in custom_outputs_completed
    ]
    custom_outputs_remaining_set = set(custom_outputs) - set(custom_outputs_completed)
    custom_outputs_remaining = list(custom_outputs_remaining_set)
    print(f"custom outputs remaining for {output}: {len(custom_outputs_remaining)}")

    # dask bag and process
    # process in chunks of 5,000 over 30 cores, each chunk taking 2 minutes
    custom_outputs_remaining = custom_outputs_remaining[0:5000]
    print(f"predicting for {len(custom_outputs_remaining)} custom outputs ...")
    bag_custom_outputs = db.from_sequence(
        custom_outputs_remaining, npartitions=n_workers
    )
    bag_custom_outputs.map(regrid_to_pop).compute()

    time_end = time.time() - time_start
    print(
        f"completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours"
    )
    print(
        f"average time per custom output is {time_end / len(custom_outputs_remaining):0.2f} seconds"
    )

    client.close()
    cluster.close()
Example #15
def run_JK_distributed_massboosted(df, param):
    '''Receive the pandas dataframe with the objects containing the
    temperature decrements, plus the parameter object; run the kSZ
    statistic and generate jackknifes.
    Everything runs on the cluster, so the current terminal does not
    need to request many CPUs.

    df: dataframe containing the variables for the calculation
    param: parameter file for this calculation
    param.JK_NGROUPS: how many subgroups to use for the calculation'''

    Ncores = envVars.Ncores
    NWorkers = envVars.NWorkers
    Ngroups = param.JK_NGROUPS

    #setup cluster
    cluster = SGECluster(
        walltime='172800',
        processes=1,
        cores=1,
        env_extra=[
            '#$-pe sge_pe %i' % Ncores,
            '-l m_core=%i' % Ncores, 'mkdir -p /tmp/pag227/dask/dask-scratch',
            'export NUMBA_NUM_THREADS=%i' % Ncores,
            'export OMP_NUM_THREADS=%i' % Ncores
            #                                    'export OMP_NUM_THREADS=1',  # noqa
        ])
    cluster.scale(NWorkers)
    client = Client(cluster)
    time.sleep(30)
    #end setting up cluster

    #send full dataset to the cluster
    future_fullDataset = client.scatter(df)
    future_params = client.scatter(param)
    res_fullDataset = client.submit(get_pairwise_ksz_massboosted,
                                    future_fullDataset,
                                    future_params,
                                    multithreading=True)
    #done with the full dataset
    jk_results = []
    futureData = []  #data to be sent in jk or bootstrap in galaxy space

    for j in range(Ngroups):
        df_bs = df.copy()
        choose = np.random.choice(len(df), len(df))
        df_bs['dT'] = df.dT.values[choose]
        futureData.append(client.scatter(df_bs))

    if param.JK_RESAMPLING_METHOD.lower() == "bs_dt_mass_boosted_est":
        get_pw_func = get_pairwise_ksz_massboosted
    elif param.JK_RESAMPLING_METHOD.lower() == "bs_dt_mass_boosted_est_debiased":  # noqa
        get_pw_func = get_pairwise_ksz_massboosted_debiased
    else:
        raise ValueError(
            f"unsupported resampling method: {param.JK_RESAMPLING_METHOD}")

    for j in range(Ngroups):
        jk_results.append(
            client.submit(get_pw_func,
                          futureData[j],
                          future_params,
                          multithreading=True))


    # extract results
    fullDataset_results = res_fullDataset.result()
    jk_results = client.gather(jk_results)
    client.close()
    #  cluster.close()

    return fullDataset_results, jk_results
Example #16
def main():
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs

    cluster = SGECluster(
        interface="ib0",
        walltime="01:00:00",
        memory=f"64 G",
        resource_spec=f"h_vmem=64G",
        scheduler_options={
            "dashboard_address": ":5757",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {n_processes}",
            f"-l disk=32G",
        ],
        local_directory=os.sep.join(
            [os.environ.get("PWD"), "dask-worker-scale-space"]),
    )

    client = Client(cluster)

    cluster.scale(jobs=n_jobs)

    time_start = time.time()

    # scale custom outputs
    if normal:
        emission_configs = np.array(
            np.meshgrid(
                np.linspace(0.0, 1.4, 8),
                np.linspace(0.0, 1.4, 8),
                np.linspace(0.0, 1.4, 8),
                np.linspace(0.0, 1.4, 8),
                np.linspace(0.0, 1.4, 8),
            )).T.reshape(-1, 5)
        emission_configs_20percentintervals = []
        for emission_config in emission_configs:
            emission_configs_20percentintervals.append(
                f'RES{round(emission_config[0], 1)}_IND{round(emission_config[1], 1)}_TRA{round(emission_config[2], 1)}_AGR{round(emission_config[3], 1)}_ENE{round(emission_config[4], 1)}'
            )

    if extra:
        custom_inputs_main = [
            np.array([[1.15, 1.27, 0.98, 0.98, 1.36]]),  # bottom-up 2010
            np.array([[1.19, 1.30, 1.01, 1.01, 1.46]]),  # bottom-up 2011
            np.array([[1.20, 1.30, 1.01, 1.02, 1.39]]),  # bottom-up 2012
            np.array([[1.13, 1.29, 1.02, 1.01, 1.29]]),  # bottom-up 2013
            np.array([[1.06, 1.12, 0.99, 1.01, 1.12]]),  # bottom-up 2014
            np.array([[0.92, 0.84, 0.97, 0.99, 0.94]]),  # bottom-up 2016
            np.array([[0.84, 0.81, 0.99, 0.99, 0.89]]),  # bottom-up 2017
            np.array([[0.76, 0.934, 0.735, 0.683, 0.708]]),
            np.array([[0.704, 0.786, 0.73, 0.659, 0.6]]),
            np.array([[0.712, 0.703, 0.725, 0.676, 0.649]]),
            np.array([[0.739, 0.668, 0.701, 0.686, 0.682]]),
            np.array([[0.67, 0.609, 0.709, 0.621, 0.661]]),
            np.array([[0.744, 0.904, 0.778, 0.678, 0.716]]),
            np.array([[0.771, 0.835, 0.711, 0.685, 0.544]]),
            np.array([[0.647, 0.945, 0.746, 0.588, 0.473]]),
            np.array([[0.657, 0.745, 0.714, 0.613, 0.591]]),
            np.array([[0.582, 0.7, 0.672, 0.5, 0.492]]),
            np.array([[0.803, 0.835, 0.742, 0.71, 0.717]]),
            np.array([[0.721, 0.863, 0.712, 0.74, 0.709]]),
            np.array([[0.661, 0.674, 0.694, 0.742, 0.715]]),
            np.array([[0.701, 0.642, 0.669, 0.681, 0.679]]),
            np.array([[0.604, 0.399, 0.659, 0.613, 0.724]]),
            np.array([[0.769, 1.009, 0.697, 0.69, 0.72]]),
            np.array([[0.824, 0.759, 0.767, 0.641, 0.429]]),
            np.array([[0.858, 1.092, 0.794, 0.604, 0.475]]),
            np.array([[0.8, 0.987, 0.648, 0.57, 0.493]]),
            np.array([[0.867, 0.957, 0.677, 0.558, 0.477]])
        ]
        custom_inputs = []
        for custom_input in custom_inputs_main:
            custom_input_res = np.copy(custom_input)
            custom_input_ind = np.copy(custom_input)
            custom_input_tra = np.copy(custom_input)
            custom_input_agr = np.copy(custom_input)
            custom_input_ene = np.copy(custom_input)
            custom_input_nores = np.copy(custom_input)
            custom_input_noind = np.copy(custom_input)
            custom_input_notra = np.copy(custom_input)
            custom_input_noagr = np.copy(custom_input)
            custom_input_noene = np.copy(custom_input)
            custom_input_resonly = np.copy(custom_input)
            custom_input_indonly = np.copy(custom_input)
            custom_input_traonly = np.copy(custom_input)
            custom_input_agronly = np.copy(custom_input)
            custom_input_eneonly = np.copy(custom_input)

            custom_input_res[0][1:] = 1.0
            custom_input_ind[0][0] = 1.0
            custom_input_ind[0][2:] = 1.0
            custom_input_tra[0][:2] = 1.0
            custom_input_tra[0][3:] = 1.0
            custom_input_agr[0][:3] = 1.0
            custom_input_agr[0][4:] = 1.0
            custom_input_ene[0][:4] = 1.0

            custom_input_nores[0][0] = 0.0
            custom_input_noind[0][1] = 0.0
            custom_input_notra[0][2] = 0.0
            custom_input_noagr[0][3] = 0.0
            custom_input_noene[0][4] = 0.0

            custom_input_resonly[0][1:] = 0.0
            custom_input_indonly[0][0] = 0.0
            custom_input_indonly[0][2:] = 0.0
            custom_input_traonly[0][:2] = 0.0
            custom_input_traonly[0][3:] = 0.0
            custom_input_agronly[0][:3] = 0.0
            custom_input_agronly[0][4:] = 0.0
            custom_input_eneonly[0][:4] = 0.0

            custom_inputs.append(custom_input)
            custom_inputs.append(custom_input_res)
            custom_inputs.append(custom_input_ind)
            custom_inputs.append(custom_input_tra)
            custom_inputs.append(custom_input_agr)
            custom_inputs.append(custom_input_ene)
            custom_inputs.append(custom_input_nores)
            custom_inputs.append(custom_input_noind)
            custom_inputs.append(custom_input_notra)
            custom_inputs.append(custom_input_noagr)
            custom_inputs.append(custom_input_noene)
            custom_inputs.append(custom_input_resonly)
            custom_inputs.append(custom_input_indonly)
            custom_inputs.append(custom_input_traonly)
            custom_inputs.append(custom_input_agronly)
            custom_inputs.append(custom_input_eneonly)

        emission_configs_20percentintervals = []
        for custom_input in custom_inputs:
            emission_config = f'RES{custom_input[0][0]:0.3f}_IND{custom_input[0][1]:0.3f}_TRA{custom_input[0][2]:0.3f}_AGR{custom_input[0][3]:0.3f}_ENE{custom_input[0][4]:0.3f}'
            emission_configs_20percentintervals.append(emission_config)

    if climate_cobenefits:
        custom_inputs_main = [
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # Base_CLE_2020
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # Base_MFR_2020
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # SDS_MFR_2020
            np.array([[0.68, 0.84, 0.71, 1.16, 0.93]]),  # Base_CLE_2030
            np.array([[0.33, 0.47, 0.48, 0.81, 0.69]]),  # Base_MFR_2030
            np.array([[0.27, 0.45, 0.41, 0.81, 0.55]]),  # SDS_MFR_2030
            np.array([[0.57, 0.75, 0.69, 1.2, 0.94]]),  # Base_CLE_2040
            np.array([[0.24, 0.41, 0.31, 0.83, 0.73]]),  # Base_MFR_2040
            np.array([[0.19, 0.38, 0.22, 0.83, 0.5]]),  # SDS_MFR_2040
            np.array([[0.52, 0.72, 0.65, 1.24, 0.91]]),  # Base_CLE_2050
            np.array([[0.2, 0.38, 0.29, 0.86, 0.72]]),  # Base_MFR_2050
            np.array([[0.18, 0.35, 0.2, 0.86, 0.46]]),  # SDS_MFR_2050
        ]
        custom_inputs = []
        for custom_input in custom_inputs_main:
            custom_input_res = np.copy(custom_input)
            custom_input_ind = np.copy(custom_input)
            custom_input_tra = np.copy(custom_input)
            custom_input_agr = np.copy(custom_input)
            custom_input_ene = np.copy(custom_input)
            custom_input_nores = np.copy(custom_input)
            custom_input_noind = np.copy(custom_input)
            custom_input_notra = np.copy(custom_input)
            custom_input_noagr = np.copy(custom_input)
            custom_input_noene = np.copy(custom_input)
            custom_input_resonly = np.copy(custom_input)
            custom_input_indonly = np.copy(custom_input)
            custom_input_traonly = np.copy(custom_input)
            custom_input_agronly = np.copy(custom_input)
            custom_input_eneonly = np.copy(custom_input)

            custom_input_res[0][1:] = 1.0
            custom_input_ind[0][0] = 1.0
            custom_input_ind[0][2:] = 1.0
            custom_input_tra[0][:2] = 1.0
            custom_input_tra[0][3:] = 1.0
            custom_input_agr[0][:3] = 1.0
            custom_input_agr[0][4:] = 1.0
            custom_input_ene[0][:4] = 1.0

            custom_input_nores[0][0] = 0.0
            custom_input_noind[0][1] = 0.0
            custom_input_notra[0][2] = 0.0
            custom_input_noagr[0][3] = 0.0
            custom_input_noene[0][4] = 0.0

            custom_input_resonly[0][1:] = 0.0
            custom_input_indonly[0][0] = 0.0
            custom_input_indonly[0][2:] = 0.0
            custom_input_traonly[0][:2] = 0.0
            custom_input_traonly[0][3:] = 0.0
            custom_input_agronly[0][:3] = 0.0
            custom_input_agronly[0][4:] = 0.0
            custom_input_eneonly[0][:4] = 0.0

            custom_inputs.append(custom_input)
            custom_inputs.append(custom_input_res)
            custom_inputs.append(custom_input_ind)
            custom_inputs.append(custom_input_tra)
            custom_inputs.append(custom_input_agr)
            custom_inputs.append(custom_input_ene)
            custom_inputs.append(custom_input_nores)
            custom_inputs.append(custom_input_noind)
            custom_inputs.append(custom_input_notra)
            custom_inputs.append(custom_input_noagr)
            custom_inputs.append(custom_input_noene)
            custom_inputs.append(custom_input_resonly)
            custom_inputs.append(custom_input_indonly)
            custom_inputs.append(custom_input_traonly)
            custom_inputs.append(custom_input_agronly)
            custom_inputs.append(custom_input_eneonly)

        emission_configs_20percentintervals = []
        for custom_input in custom_inputs:
            emission_config = f'RES{custom_input[0][0]:0.3f}_IND{custom_input[0][1]:0.3f}_TRA{custom_input[0][2]:0.3f}_AGR{custom_input[0][3]:0.3f}_ENE{custom_input[0][4]:0.3f}'
            emission_configs_20percentintervals.append(emission_config)

    if top_down_2020_baseline:
        # matching to PM2.5 only, top 1,000
        emission_config_2020_baseline = np.array(
            [0.604, 0.399, 0.659, 0.613, 0.724])
        emission_configs = np.array(
            np.meshgrid(
                np.linspace(
                    emission_config_2020_baseline[0] * 0.50,
                    emission_config_2020_baseline[0], 6
                ),  # 10% reduction increments from 2020 baseline up to 50%
                np.linspace(emission_config_2020_baseline[1] * 0.50,
                            emission_config_2020_baseline[1], 6),
                np.linspace(emission_config_2020_baseline[2] * 0.50,
                            emission_config_2020_baseline[2], 6),
                np.linspace(emission_config_2020_baseline[3] * 0.50,
                            emission_config_2020_baseline[3], 6),
                np.linspace(emission_config_2020_baseline[4] * 0.50,
                            emission_config_2020_baseline[4], 6),
            )).T.reshape(-1, 5)
        # add a couple more for larger reductions in RES and IND to reach WHO-IT2
        emission_configs = list(emission_configs)
        emission_configs.append(np.array([0.242, 0.160, 0.659, 0.613, 0.724]))
        emission_configs.append(np.array([0.181, 0.120, 0.659, 0.613, 0.724]))
        emission_configs.append(np.array([0.121, 0.080, 0.659, 0.613, 0.724]))
        emission_configs.append(np.array([0.060, 0.040, 0.659, 0.613, 0.724]))

        emission_configs_20percentintervals = []
        for emission_config in emission_configs:
            emission_configs_20percentintervals.append(
                f'RES{round(emission_config[0], 3):.3f}_IND{round(emission_config[1], 3):.3f}_TRA{round(emission_config[2], 3):.3f}_AGR{round(emission_config[3], 3):.3f}_ENE{round(emission_config[4], 3):.3f}'
            )

    emission_configs_completed = glob.glob(
        f"/nobackup/earlacoa/machinelearning/data_annual/predictions/{output}_adjusted_scaled/ds*{output}_popgrid_0.25deg_adjusted_scaled.nc"
    )
    emission_configs_completed = [
        item[88:-45] for item in emission_configs_completed
    ]

    emission_configs_20percentintervals_remaining_set = set(
        emission_configs_20percentintervals) - set(emission_configs_completed)
    emission_configs_remaining = list(
        emission_configs_20percentintervals_remaining_set)
    print(
        f"custom outputs remaining for {output}: {len(emission_configs_remaining)} - 20% intervals with {int(100 * len(emission_configs_20percentintervals_remaining_set) / len(emission_configs_20percentintervals))}% remaining"
    )

    # dask bag and process
    emission_configs_remaining = emission_configs_remaining[:35000]
    print(
        f"predicting for {len(emission_configs_remaining)} custom outputs ...")
    bag_emission_configs = db.from_sequence(emission_configs_remaining,
                                            npartitions=n_workers)
    bag_emission_configs.map(scale).compute()

    time_end = time.time() - time_start
    print(
        f"completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours"
    )
    print(
        f"average time per custom output is {time_end / len(emission_configs_remaining):0.2f} seconds"
    )

    client.close()
    cluster.close()
Example #17
def run_JK_distributed(df, param, randomize=True):
    '''Receive the pandas dataframe with the objects containing the
    temperature decrements, plus the parameter object; run the kSZ
    statistic and generate jackknifes.
    Everything runs on the cluster, so the current terminal does not
    need to request many CPUs.

    df: dataframe containing the variables for the calculation
    param: parameter file for this calculation
    param.JK_NGROUPS: how many subgroups to use for the calculation
    randomize: shuffle data before running the JK'''

    Ncores = envVars.Ncores
    NWorkers = envVars.NWorkers
    Ngroups = param.JK_NGROUPS
    resampling_method = param.JK_RESAMPLING_METHOD.lower()

    #setup cluster
    cluster = SGECluster(walltime='172800', processes=1, cores=1,
                         env_extra=['#$-pe sge_pe %i' % Ncores,
                                    '-l m_core=%i' % Ncores,
                                    'mkdir -p /tmp/pag227/dask/dask-scratch',
                                    'export NUMBA_NUM_THREADS=%i' % Ncores,
                                    'export OMP_NUM_THREADS=%i' % Ncores
#                                    'export OMP_NUM_THREADS=1',  # noqa
                                    ])
    cluster.scale(NWorkers)
    client = Client(cluster)
    time.sleep(30)
    #end setting up cluster

    #send full dataset to the cluster
    future_fullDataset = client.scatter(df)
    future_params = client.scatter(param)
    res_fullDataset = client.submit(pairwiser.get_pairwise_ksz,
                                    future_fullDataset,
                                    future_params, multithreading=True)
    #done with the full dataset

    #iterate over partial dataset for the JK
    if JK == resampling_method:
        indices_toDrop = JK_tools.indicesToDrop(df, Ngroups,
                                                randomize=randomize)
    jk_results = []
    futureData = []  #data to be sent in jk or bootstrap in galaxy space

    if (JK == resampling_method) or (BS == resampling_method):
        for j in range(Ngroups):  # submit data to the cluster
            if JK in resampling_method:  # if method jk
                dataJK = df.drop(indices_toDrop[j], inplace=False)
                futureData.append(client.scatter(dataJK))
            elif BS in resampling_method:
                dataBS = df.sample(len(df), replace=True)
                futureData.append(client.scatter(dataBS))
        #Now do the JK calculation
        for j in range(Ngroups):
            jk_results.append(client.submit(pairwiser.get_pairwise_ksz,
                              futureData[j],
                              future_params, multithreading=True))

    if BS_PW == resampling_method:  # submit the same dataset
        futureData = client.scatter(df, broadcast=True)

        for j in range(Ngroups):
            jk_results.append(client.submit(bs_pw.get_bootstrap_pairwise,
                                            futureData,
                                            future_params,
                                            multithreading=True,
                                            pure=False))
    if resampling_method == BS_DT:
        for j in range(Ngroups):
            df_bs = df.copy()
            choose = np.random.choice(len(df), len(df))
            df_bs['dT'] = df.dT.values[choose]
            futureData.append(client.scatter(df_bs))
        for j in range(Ngroups):
            jk_results.append(client.submit(pairwiser.get_pairwise_ksz,
                                            futureData[j],
                                            future_params,
                                            multithreading=True))

    if resampling_method == TL_JK:
        tiled_JK.classify_grid(df)
        df = tiled_JK.remove_edge_galaxies(df, tol_sigma=1.5)
        Ntiles = tiled_JK.how_many_tiles(df)
        for j in range(Ntiles):
            df_tosubmit = tiled_JK.remove_tile(df, j)
            futureData.append(client.scatter(df_tosubmit))
        for j in range(Ntiles):
            jk_results.append(client.submit(pairwiser.get_pairwise_ksz,
                                            futureData[j],
                                            future_params,
                                            multithreading=True))
    #extract results
    fullDataset_results = res_fullDataset.result()
    jk_results = client.gather(jk_results)
    client.close()
#    cluster.close()

    return fullDataset_results, jk_results
def main():
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs

    cluster = SGECluster(
        interface="ib0",
        walltime="01:00:00",
        memory=f"2 G",
        resource_spec=f"h_vmem=2G",
        scheduler_options={
            "dashboard_address": ":5757",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {n_processes}",
            f"-l disk=1G",
        ],
        local_directory=os.sep.join(
            [os.environ.get("PWD"), "dask-worker-space"]),
    )

    client = Client(cluster)

    cluster.scale(jobs=n_jobs)

    time_start = time.time()

    # custom inputs
    if normal:
        # spacings: 1.5 and 16 for 0.1, 1.5 and 6 for 0.3, 1.4 and 8 for 0.2
        matrix_stacked = np.array(
            np.meshgrid(
                np.linspace(0, 1.5, 16),
                np.linspace(0, 1.5, 16),
                np.linspace(0, 1.5, 16),
                np.linspace(0, 1.5, 16),
                np.linspace(0, 1.5, 16),
            )).T.reshape(-1, 5)
        custom_inputs_set = set(
            tuple(map(float, map("{:.1f}".format, item)))
            for item in matrix_stacked)

        custom_inputs_completed_filenames = glob.glob(
            f"/nobackup/earlacoa/machinelearning/data_annual/predictions/{output}/ds*{output}*"
        )
        custom_inputs_completed_list = []
        for custom_inputs_completed_filename in custom_inputs_completed_filenames:
            custom_inputs_completed_list.append([
                float(item) for item in re.findall(
                    r"\d+\.\d+", custom_inputs_completed_filename)
            ])

        custom_inputs_completed_set = set(
            tuple(item) for item in custom_inputs_completed_list)
        custom_inputs_remaining_set = custom_inputs_set - custom_inputs_completed_set
        custom_inputs = [
            np.array(item).reshape(1, -1)
            for item in custom_inputs_remaining_set
        ]
        print(f"custom inputs remaining for {output}: {len(custom_inputs)}")

    if extra:
        custom_inputs_main = [
            np.array([[1.15, 1.27, 0.98, 0.98, 1.36]]),  # bottom-up 2010
            np.array([[1.19, 1.30, 1.01, 1.01, 1.46]]),  # bottom-up 2011
            np.array([[1.20, 1.30, 1.01, 1.02, 1.39]]),  # bottom-up 2012
            np.array([[1.13, 1.29, 1.02, 1.01, 1.29]]),  # bottom-up 2013
            np.array([[1.06, 1.12, 0.99, 1.01, 1.12]]),  # bottom-up 2014
            np.array([[0.92, 0.84, 0.97, 0.99, 0.94]]),  # bottom-up 2016
            np.array([[0.84, 0.81, 0.99, 0.99, 0.89]]),  # bottom-up 2017
            np.array([[0.76, 0.934, 0.735, 0.683, 0.708]]),
            np.array([[0.704, 0.786, 0.73, 0.659, 0.6]]),
            np.array([[0.712, 0.703, 0.725, 0.676, 0.649]]),
            np.array([[0.739, 0.668, 0.701, 0.686, 0.682]]),
            np.array([[0.67, 0.609, 0.709, 0.621, 0.661]]),
            np.array([[0.744, 0.904, 0.778, 0.678, 0.716]]),
            np.array([[0.771, 0.835, 0.711, 0.685, 0.544]]),
            np.array([[0.647, 0.945, 0.746, 0.588, 0.473]]),
            np.array([[0.657, 0.745, 0.714, 0.613, 0.591]]),
            np.array([[0.582, 0.7, 0.672, 0.5, 0.492]]),
            np.array([[0.803, 0.835, 0.742, 0.71, 0.717]]),
            np.array([[0.721, 0.863, 0.712, 0.74, 0.709]]),
            np.array([[0.661, 0.674, 0.694, 0.742, 0.715]]),
            np.array([[0.701, 0.642, 0.669, 0.681, 0.679]]),
            np.array([[0.604, 0.399, 0.659, 0.613, 0.724]]),
            np.array([[0.769, 1.009, 0.697, 0.69, 0.72]]),
            np.array([[0.824, 0.759, 0.767, 0.641, 0.429]]),
            np.array([[0.858, 1.092, 0.794, 0.604, 0.475]]),
            np.array([[0.8, 0.987, 0.648, 0.57, 0.493]]),
            np.array([[0.867, 0.957, 0.677, 0.558, 0.477]])
        ]
        custom_inputs = []
        for custom_input in custom_inputs_main:
            custom_input_res = np.copy(custom_input)
            custom_input_ind = np.copy(custom_input)
            custom_input_tra = np.copy(custom_input)
            custom_input_agr = np.copy(custom_input)
            custom_input_ene = np.copy(custom_input)
            custom_input_nores = np.copy(custom_input)
            custom_input_noind = np.copy(custom_input)
            custom_input_notra = np.copy(custom_input)
            custom_input_noagr = np.copy(custom_input)
            custom_input_noene = np.copy(custom_input)
            custom_input_resonly = np.copy(custom_input)
            custom_input_indonly = np.copy(custom_input)
            custom_input_traonly = np.copy(custom_input)
            custom_input_agronly = np.copy(custom_input)
            custom_input_eneonly = np.copy(custom_input)

            custom_input_res[0][1:] = 1.0
            custom_input_ind[0][0] = 1.0
            custom_input_ind[0][2:] = 1.0
            custom_input_tra[0][:2] = 1.0
            custom_input_tra[0][3:] = 1.0
            custom_input_agr[0][:3] = 1.0
            custom_input_agr[0][4:] = 1.0
            custom_input_ene[0][:4] = 1.0

            custom_input_nores[0][0] = 0.0
            custom_input_noind[0][1] = 0.0
            custom_input_notra[0][2] = 0.0
            custom_input_noagr[0][3] = 0.0
            custom_input_noene[0][4] = 0.0

            custom_input_resonly[0][1:] = 0.0
            custom_input_indonly[0][0] = 0.0
            custom_input_indonly[0][2:] = 0.0
            custom_input_traonly[0][:2] = 0.0
            custom_input_traonly[0][3:] = 0.0
            custom_input_agronly[0][:3] = 0.0
            custom_input_agronly[0][4:] = 0.0
            custom_input_eneonly[0][:4] = 0.0

            custom_inputs.append(custom_input)
            custom_inputs.append(custom_input_res)
            custom_inputs.append(custom_input_ind)
            custom_inputs.append(custom_input_tra)
            custom_inputs.append(custom_input_agr)
            custom_inputs.append(custom_input_ene)
            custom_inputs.append(custom_input_nores)
            custom_inputs.append(custom_input_noind)
            custom_inputs.append(custom_input_notra)
            custom_inputs.append(custom_input_noagr)
            custom_inputs.append(custom_input_noene)
            custom_inputs.append(custom_input_resonly)
            custom_inputs.append(custom_input_indonly)
            custom_inputs.append(custom_input_traonly)
            custom_inputs.append(custom_input_agronly)
            custom_inputs.append(custom_input_eneonly)

        # needed only for emulator_predictions.py, to adjust for double emissions
        custom_inputs_temp = custom_inputs.copy()
        for custom_input in custom_inputs_temp:
            custom_input_resonly = np.copy(custom_input)
            custom_input_indonly = np.copy(custom_input)
            custom_input_traonly = np.copy(custom_input)
            custom_input_agronly = np.copy(custom_input)
            custom_input_eneonly = np.copy(custom_input)

            custom_input_resonly[0][1:] = 0.0
            custom_input_indonly[0][0] = 0.0
            custom_input_indonly[0][2:] = 0.0
            custom_input_traonly[0][:2] = 0.0
            custom_input_traonly[0][3:] = 0.0
            custom_input_agronly[0][:3] = 0.0
            custom_input_agronly[0][4:] = 0.0
            custom_input_eneonly[0][:4] = 0.0

            custom_inputs.append(custom_input_resonly)
            custom_inputs.append(custom_input_indonly)
            custom_inputs.append(custom_input_traonly)
            custom_inputs.append(custom_input_agronly)
            custom_inputs.append(custom_input_eneonly)

        emission_configs_20percentintervals = []
        for custom_input in custom_inputs:
            emission_config = f'RES{custom_input[0][0]:0.3f}_IND{custom_input[0][1]:0.3f}_TRA{custom_input[0][2]:0.3f}_AGR{custom_input[0][3]:0.3f}_ENE{custom_input[0][4]:0.3f}'
            emission_configs_20percentintervals.append(emission_config)

        emission_configs_20percentintervals = list(
            set(emission_configs_20percentintervals))

        custom_inputs = []
        for emission_config in emission_configs_20percentintervals:
            custom_input = np.array([
                float(num) for num in re.findall(r'\d\.\d+', emission_config)
            ]).reshape(1, -1)
            custom_inputs.append(custom_input)

    if climate_cobenefits:
        custom_inputs_main = [
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # Base_CLE_2020
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # Base_MFR_2020
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # SDS_MFR_2020
            np.array([[0.68, 0.84, 0.71, 1.16, 0.93]]),  # Base_CLE_2030
            np.array([[0.33, 0.47, 0.48, 0.81, 0.69]]),  # Base_MFR_2030
            np.array([[0.27, 0.45, 0.41, 0.81, 0.55]]),  # SDS_MFR_2030
            np.array([[0.57, 0.75, 0.69, 1.2, 0.94]]),  # Base_CLE_2040
            np.array([[0.24, 0.41, 0.31, 0.83, 0.73]]),  # Base_MFR_2040
            np.array([[0.19, 0.38, 0.22, 0.83, 0.5]]),  # SDS_MFR_2040
            np.array([[0.52, 0.72, 0.65, 1.24, 0.91]]),  # Base_CLE_2050
            np.array([[0.2, 0.38, 0.29, 0.86, 0.72]]),  # Base_MFR_2050
            np.array([[0.18, 0.35, 0.2, 0.86, 0.46]]),  # SDS_MFR_2050
        ]
        custom_inputs = []
        for custom_input in custom_inputs_main:
            custom_input_res = np.copy(custom_input)
            custom_input_ind = np.copy(custom_input)
            custom_input_tra = np.copy(custom_input)
            custom_input_agr = np.copy(custom_input)
            custom_input_ene = np.copy(custom_input)
            custom_input_nores = np.copy(custom_input)
            custom_input_noind = np.copy(custom_input)
            custom_input_notra = np.copy(custom_input)
            custom_input_noagr = np.copy(custom_input)
            custom_input_noene = np.copy(custom_input)
            custom_input_resonly = np.copy(custom_input)
            custom_input_indonly = np.copy(custom_input)
            custom_input_traonly = np.copy(custom_input)
            custom_input_agronly = np.copy(custom_input)
            custom_input_eneonly = np.copy(custom_input)

            custom_input_res[0][1:] = 1.0
            custom_input_ind[0][0] = 1.0
            custom_input_ind[0][2:] = 1.0
            custom_input_tra[0][:2] = 1.0
            custom_input_tra[0][3:] = 1.0
            custom_input_agr[0][:3] = 1.0
            custom_input_agr[0][4:] = 1.0
            custom_input_ene[0][:4] = 1.0

            custom_input_nores[0][0] = 0.0
            custom_input_noind[0][1] = 0.0
            custom_input_notra[0][2] = 0.0
            custom_input_noagr[0][3] = 0.0
            custom_input_noene[0][4] = 0.0

            custom_input_resonly[0][1:] = 0.0
            custom_input_indonly[0][0] = 0.0
            custom_input_indonly[0][2:] = 0.0
            custom_input_traonly[0][:2] = 0.0
            custom_input_traonly[0][3:] = 0.0
            custom_input_agronly[0][:3] = 0.0
            custom_input_agronly[0][4:] = 0.0
            custom_input_eneonly[0][:4] = 0.0

            custom_inputs.append(custom_input)
            custom_inputs.append(custom_input_res)
            custom_inputs.append(custom_input_ind)
            custom_inputs.append(custom_input_tra)
            custom_inputs.append(custom_input_agr)
            custom_inputs.append(custom_input_ene)
            custom_inputs.append(custom_input_nores)
            custom_inputs.append(custom_input_noind)
            custom_inputs.append(custom_input_notra)
            custom_inputs.append(custom_input_noagr)
            custom_inputs.append(custom_input_noene)
            custom_inputs.append(custom_input_resonly)
            custom_inputs.append(custom_input_indonly)
            custom_inputs.append(custom_input_traonly)
            custom_inputs.append(custom_input_agronly)
            custom_inputs.append(custom_input_eneonly)

        # needed only for emulator_predictions.py, to adjust for double emissions
        custom_inputs_temp = custom_inputs.copy()
        for custom_input in custom_inputs_temp:
            custom_input_resonly = np.copy(custom_input)
            custom_input_indonly = np.copy(custom_input)
            custom_input_traonly = np.copy(custom_input)
            custom_input_agronly = np.copy(custom_input)
            custom_input_eneonly = np.copy(custom_input)

            custom_input_resonly[0][1:] = 0.0
            custom_input_indonly[0][0] = 0.0
            custom_input_indonly[0][2:] = 0.0
            custom_input_traonly[0][:2] = 0.0
            custom_input_traonly[0][3:] = 0.0
            custom_input_agronly[0][:3] = 0.0
            custom_input_agronly[0][4:] = 0.0
            custom_input_eneonly[0][:4] = 0.0

            custom_inputs.append(custom_input_resonly)
            custom_inputs.append(custom_input_indonly)
            custom_inputs.append(custom_input_traonly)
            custom_inputs.append(custom_input_agronly)
            custom_inputs.append(custom_input_eneonly)

        emission_configs_20percentintervals = []
        for custom_input in custom_inputs:
            emission_config = f'RES{custom_input[0][0]:0.3f}_IND{custom_input[0][1]:0.3f}_TRA{custom_input[0][2]:0.3f}_AGR{custom_input[0][3]:0.3f}_ENE{custom_input[0][4]:0.3f}'
            emission_configs_20percentintervals.append(emission_config)

        emission_configs_20percentintervals = list(
            set(emission_configs_20percentintervals))

        custom_inputs = []
        for emission_config in emission_configs_20percentintervals:
            custom_input = np.array([
                float(num) for num in re.findall(r'\d\.\d+', emission_config)
            ]).reshape(1, -1)
            custom_inputs.append(custom_input)

    if top_down_2020_baseline:
        # matching to PM2.5 only, top 1,000
        emission_config_2020_baseline = np.array(
            [0.604, 0.399, 0.659, 0.613, 0.724])
        emission_configs = np.array(
            np.meshgrid(
                np.linspace(
                    emission_config_2020_baseline[0] * 0.50,
                    emission_config_2020_baseline[0], 6
                ),  # 10% reduction increments from 2020 baseline up to 50%
                np.linspace(emission_config_2020_baseline[1] * 0.50,
                            emission_config_2020_baseline[1], 6),
                np.linspace(emission_config_2020_baseline[2] * 0.50,
                            emission_config_2020_baseline[2], 6),
                np.linspace(emission_config_2020_baseline[3] * 0.50,
                            emission_config_2020_baseline[3], 6),
                np.linspace(emission_config_2020_baseline[4] * 0.50,
                            emission_config_2020_baseline[4], 6),
            )).T.reshape(-1, 5)
        custom_inputs = [
            np.array(item).reshape(1, -1) for item in emission_configs
        ]
        # add a couple more for larger reductions in RES and IND to reach WHO-IT2
        custom_inputs.append(np.array([[0.242, 0.160, 0.659, 0.613, 0.724]]))
        custom_inputs.append(np.array([[0.181, 0.120, 0.659, 0.613, 0.724]]))
        custom_inputs.append(np.array([[0.121, 0.080, 0.659, 0.613, 0.724]]))
        custom_inputs.append(np.array([[0.060, 0.040, 0.659, 0.613, 0.724]]))

        # needed only for emulator_predictions.py, to adjust for double emissions
        custom_inputs_temp = custom_inputs.copy()
        for custom_input in custom_inputs_temp:
            custom_input_resonly = np.copy(custom_input)
            custom_input_indonly = np.copy(custom_input)
            custom_input_traonly = np.copy(custom_input)
            custom_input_agronly = np.copy(custom_input)
            custom_input_eneonly = np.copy(custom_input)
            custom_input_resonly[0][1:] = 0.0
            custom_input_indonly[0][0] = 0.0
            custom_input_indonly[0][2:] = 0.0
            custom_input_traonly[0][:2] = 0.0
            custom_input_traonly[0][3:] = 0.0
            custom_input_agronly[0][:3] = 0.0
            custom_input_agronly[0][4:] = 0.0
            custom_input_eneonly[0][:4] = 0.0
            custom_inputs.append(custom_input_resonly)
            custom_inputs.append(custom_input_indonly)
            custom_inputs.append(custom_input_traonly)
            custom_inputs.append(custom_input_agronly)
            custom_inputs.append(custom_input_eneonly)

        emission_configs_20percentintervals = []
        for custom_input in custom_inputs:
            emission_config = f'RES{custom_input[0][0]:0.3f}_IND{custom_input[0][1]:0.3f}_TRA{custom_input[0][2]:0.3f}_AGR{custom_input[0][3]:0.3f}_ENE{custom_input[0][4]:0.3f}'
            emission_configs_20percentintervals.append(emission_config)

        emission_configs_20percentintervals = set(
            emission_configs_20percentintervals)

        custom_inputs_completed_filenames = glob.glob(
            f"/nobackup/earlacoa/machinelearning/data_annual/predictions/{output}/ds*{output}.nc"
        )
        custom_inputs_completed_list = []
        for custom_inputs_completed_filename in custom_inputs_completed_filenames:
            emission_config = re.findall(
                r"RES\d+\.\d+_IND\d+\.\d+_TRA\d+\.\d+_AGR\d+\.\d+_ENE\d+\.\d+",
                custom_inputs_completed_filename)
            if len(emission_config) > 0:
                custom_inputs_completed_list.append(emission_config)

        custom_inputs_completed_set = set(
            item[0] for item in custom_inputs_completed_list)
        custom_inputs_remaining_set = emission_configs_20percentintervals - custom_inputs_completed_set
        custom_inputs = [
            np.array([float(n)
                      for n in re.findall(r'\d+\.\d+', item)]).reshape(1, -1)
            for item in custom_inputs_remaining_set
        ]

    # dask bag and process
    custom_inputs = custom_inputs[:5000]
    #custom_inputs = custom_inputs[5000:]

    print(f"predicting for {len(custom_inputs)} custom inputs ...")
    bag_custom_inputs = db.from_sequence(custom_inputs, npartitions=n_workers)
    bag_custom_inputs.map(custom_predicts).compute()

    time_end = time.time() - time_start
    print(
        f"completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours"
    )
    print(
        f"average time per custom input is {time_end / len(custom_inputs):0.2f} seconds"
    )

    client.close()
    cluster.close()
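
The pattern above (format each candidate input as a config string, deduplicate
via a set, then parse the strings back into arrays) is what makes the
remaining-work bookkeeping possible. A minimal, self-contained sketch of that
round-trip, assuming the sector order RES, IND, TRA, AGR, ENE; the function
names here are illustrative, not from the source:

import re
import numpy as np

SECTORS = ["RES", "IND", "TRA", "AGR", "ENE"]

def encode(config):
    # (1, 5) array -> 'RES0.604_IND0.399_...' with three decimal places
    return "_".join(f"{s}{v:0.3f}" for s, v in zip(SECTORS, config.flatten()))

def decode(name):
    # inverse: pull the five floats back out and restore the (1, 5) shape
    return np.array([float(v)
                     for v in re.findall(r"\d+\.\d+", name)]).reshape(1, -1)

config = np.array([[0.604, 0.399, 0.659, 0.613, 0.724]])
assert encode(config) == "RES0.604_IND0.399_TRA0.659_AGR0.613_ENE0.724"
assert np.allclose(decode(encode(config)), config)
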
Example #19
def main():
    # dask cluster and client
    # n_jobs, walltime, and the data used below (emission_configs, station_id,
    # baselines, targets, ...) are module-level globals in the source script
    n_processes = 1
    n_workers = n_processes * n_jobs

    cluster = SGECluster(
        interface="ib0",
        walltime=walltime,
        memory=f"32 G",
        resource_spec=f"h_vmem=32G",
        scheduler_options={
            "dashboard_address": ":5761",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {n_processes}",
            f"-l disk=32G",
        ],
        local_directory=os.sep.join(
            [os.environ.get("PWD"), "dask-find-emis-pm-space"]),
    )

    client = Client(cluster)

    cluster.scale(jobs=n_jobs)

    time_start = time.time()

    # dask bag over emission_configs
    print(
        f"predicting over {len(emission_configs)} emission configs for {station_id} ..."
    )
    bag_emission_configs = db.from_sequence(emission_configs,
                                            npartitions=n_workers)
    results = bag_emission_configs.map(filter_emission_configs).compute()

    station_diffs_abs = [result[0] for result in results]
    station_diffs_per = [result[1] for result in results]
    key = next(iter(baselines))  # first key of the baselines dict
    station_diffs_abs = [
        station_diff_abs for station_diff_abs in station_diffs_abs
        if len(station_diff_abs[key]) > 0
    ]
    station_diffs_per = [
        station_diff_per for station_diff_per in station_diffs_per
        if len(station_diff_per[key]) > 0
    ]

    merged_per = {}
    for station_diff_per in station_diffs_per:
        merged_per = {**merged_per, **station_diff_per[key]}

    merged_abs = {}
    for station_diff_abs in station_diffs_abs:
        merged_abs = {**merged_abs, **station_diff_abs[key]}

    station_diffs_per = {key: merged_per}
    station_diffs_abs = {key: merged_abs}

    joblib.dump(
        obs_change_abs,
        f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/obs_change_abs_{output}_{station_id}.joblib"
    )
    joblib.dump(
        obs_change_per,
        f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/obs_change_per_{output}_{station_id}.joblib"
    )
    joblib.dump(
        baselines,
        f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/baselines_{output}_{station_id}.joblib"
    )
    joblib.dump(
        targets,
        f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/targets_{output}_{station_id}.joblib"
    )
    joblib.dump(
        target_diffs,
        f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/target_diffs_{output}_{station_id}.joblib"
    )
    joblib.dump(
        station_diffs_abs,
        f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/station_diffs_abs_{output}_{station_id}.joblib"
    )
    joblib.dump(
        station_diffs_per,
        f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/station_diffs_per_{output}_{station_id}.joblib"
    )

    time_end = time.time() - time_start
    print(
        f"completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours"
    )

    client.close()
    cluster.close()
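
Each filter_emission_configs result above is a pair of single-key dicts, and
the merge loops fold every per-partition match into one dict per station. A
small stand-alone illustration of that fold (the station name and values are
made up):

# per-worker partial results, each keyed by the same station
partials = [
    {"station_A": {"RES0.604_IND0.399": -2.1}},
    {"station_A": {"RES0.544_IND0.399": -3.4}},
]

merged = {}
for partial in partials:
    merged = {**merged, **partial["station_A"]}

station_diffs = {"station_A": merged}
# {'station_A': {'RES0.604_IND0.399': -2.1, 'RES0.544_IND0.399': -3.4}}
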
Example #20
def run_error_estimation_distributed(df1, df2, param):
    Ncores = envVars.Ncores
    NWorkers = envVars.NWorkers
    Ngroups = param.JK_NGROUPS

    # set up cluster
    cluster = SGECluster(
        walltime='172800',  # h_rt in seconds: 48 hours
        processes=1,
        cores=1,
        env_extra=[
            '#$ -pe sge_pe %i' % Ncores,
            '#$ -l m_core=%i' % Ncores,
            'mkdir -p /tmp/pag227/dask/dask-scratch',
            'export NUMBA_NUM_THREADS=%i' % Ncores,
            'export OMP_NUM_THREADS=%i' % Ncores,
            # 'export OMP_NUM_THREADS=1',  # noqa
        ])
    cluster.scale(NWorkers)
    client = Client(cluster)
    time.sleep(10)
    # end of cluster setup

    # send the full dataset to the cluster
    future_df1 = client.scatter(df1)
    future_df2 = client.scatter(df2)

    future_params = client.scatter(param)
    res_fullDataset_11 = client.submit(cpw.get_cross_pairwise_ksz, future_df1,
                                       future_df1, future_params)
    res_fullDataset_12 = client.submit(cpw.get_cross_pairwise_ksz, future_df1,
                                       future_df2, future_params)
    res_fullDataset_22 = client.submit(cpw.get_cross_pairwise_ksz, future_df2,
                                       future_df2, future_params)
    # done with the full dataset

    # iterate over partial datasets for the JK
    replicants1 = []  # data to be sent
    replicants2 = []

    if 'jk' in param.JK_RESAMPLING_METHOD.lower():
        all_indx = np.arange(len(df1))
        np.random.shuffle(all_indx)
        indx_to_drop = np.array_split(all_indx, param.JK_NGROUPS)
    for j in range(Ngroups):  # submit data to the cluster
        if 'jk' in param.JK_RESAMPLING_METHOD.lower():  # if method jk
            todrop = indx_to_drop[j]
            replicant1 = df1.drop(df1.index[todrop], inplace=False)
            replicant2 = df2.drop(df2.index[todrop], inplace=False)

            replicants1.append(client.scatter(replicant1))
            replicants2.append(client.scatter(replicant2))
        elif 'bootstrap' in param.JK_RESAMPLING_METHOD.lower():
            indxs = np.random.randint(low=0, high=len(df1), size=len(df1))
            replicant1 = df1.iloc[indxs]
            replicant2 = df2.iloc[indxs]
            replicants1.append(client.scatter(replicant1))
            replicants2.append(client.scatter(replicant2))

    # now do the JK calculation
    realizations11 = []
    realizations12 = []
    realizations22 = []

    for j in range(Ngroups):
        realizations11.append(
            client.submit(cpw.get_cross_pairwise_ksz, replicants1[j],
                          replicants1[j], future_params))
        realizations12.append(
            client.submit(cpw.get_cross_pairwise_ksz, replicants1[j],
                          replicants2[j], future_params))
        realizations22.append(
            client.submit(cpw.get_cross_pairwise_ksz, replicants2[j],
                          replicants2[j], future_params))
    # extract results
    fullDataset_result11 = res_fullDataset_11.result()
    fullDataset_result12 = res_fullDataset_12.result()
    fullDataset_result22 = res_fullDataset_22.result()

    resampling_result11 = client.gather(realizations11)
    resampling_result12 = client.gather(realizations12)
    resampling_result22 = client.gather(realizations22)
    client.close()
    #    cluster.close()

    results = {
        'full11': fullDataset_result11,
        'full12': fullDataset_result12,
        'full22': fullDataset_result22,
        'resampled11': resampling_result11,
        'resampled12': resampling_result12,
        'resampled22': resampling_result22
    }

    return results
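
The two resampling branches above differ only in how replicate indices are
built: the jackknife partitions a single shuffle into disjoint drop-blocks,
while the bootstrap draws a full-size sample with replacement for every
replicate. In isolation (NumPy only; sizes chosen for illustration):

import numpy as np

rng = np.random.default_rng(42)
n_rows, n_groups = 100, 10

# jackknife: one shuffle, split into n_groups disjoint blocks;
# replicate j drops block j and keeps the rest
all_indx = rng.permutation(n_rows)
indx_to_drop = np.array_split(all_indx, n_groups)

# bootstrap: every replicate resamples all n_rows indices with replacement
bootstrap_indxs = [rng.integers(0, n_rows, size=n_rows)
                   for _ in range(n_groups)]
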
Example #21
def main():
    # dask cluster and client
    # output, normal, extra, climate_cobenefits, top_down_2020_baseline, and
    # year are module-level globals in the source script
    if output == 'PM2_5_DRY':
        n_jobs = 20
        n_outputs = 1000
    elif output == 'o3_6mDM8h':
        n_jobs = 20
        n_outputs = 2000
    else:
        raise ValueError(f"unsupported output: {output}")

    n_processes = 1
    n_workers = n_processes * n_jobs

    cluster = SGECluster(
        interface="ib0",
        walltime="02:00:00",
        memory=f"48 G",
        resource_spec=f"h_vmem=48G",
        scheduler_options={
            "dashboard_address": ":7777",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {n_processes}",
            f"-l disk=48G",
        ],
        local_directory=os.sep.join(
            [os.environ.get("PWD"), "dask-hia-ozone-space"]),
    )

    client = Client(cluster)

    cluster.scale(jobs=n_jobs)

    time_start = time.time()

    # find remaining inputs
    if normal:
        custom_outputs = glob.glob(
            f"/nobackup/earlacoa/machinelearning/data_annual/predictions/{output}_adjusted_scaled/ds*{output}_popgrid_0.25deg_adjusted_scaled.nc"
        )
        custom_outputs_completed = glob.glob(
            f"/nobackup/earlacoa/machinelearning/data_annual/health_impact_assessments/{output}_adjusted_scaled/df_country_hia_*.csv"
        )
        custom_outputs_remaining_set = set([
            # slice the emission-config substring out of the prediction
            # filename ("ds" prefix and fixed-length suffix)
            item.split("/")[-1][3:-1 - len(output) - 19 - 7]
            for item in custom_outputs
        ]) - set([
            # slice the config out of the completed HIA filename
            # ("df_country_hia_{output}_" prefix and fixed-length suffix)
            item.split("/")[-1][15 + len(output) + 1:-4 - 7]
            for item in custom_outputs_completed
        ])
        custom_outputs_remaining = list(custom_outputs_remaining_set)
        print(
            f"custom outputs remaining for {output}: {len(custom_outputs_remaining)} - 10% intervals with {int(100 * len(custom_outputs_remaining_set) / 16**5)}% remaining"
        )

        reduce_to_20percent_intervals = True
        if reduce_to_20percent_intervals:
            emission_configs = np.array(
                np.meshgrid(
                    np.linspace(0.0, 1.4, 8),
                    np.linspace(0.0, 1.4, 8),
                    np.linspace(0.0, 1.4, 8),
                    np.linspace(0.0, 1.4, 8),
                    np.linspace(0.0, 1.4, 8),
                )).T.reshape(-1, 5)
            emission_configs_20percentintervals = []
            for emission_config in emission_configs:
                emission_configs_20percentintervals.append(
                    f'RES{round(emission_config[0], 1)}_IND{round(emission_config[1], 1)}_TRA{round(emission_config[2], 1)}_AGR{round(emission_config[3], 1)}_ENE{round(emission_config[4], 1)}'
                )

            emission_configs_completed = []
            for custom_output_completed in custom_outputs_completed:
                emission_configs_completed.append(
                    re.findall(
                        r'RES\d+\.\d+_IND\d+\.\d+_TRA\d+\.\d+_AGR\d+\.\d+_ENE\d+\.\d+',
                        custom_output_completed)[0])

            emission_configs_20percentintervals_remaining_set = set(
                emission_configs_20percentintervals) - set(
                    emission_configs_completed)
            custom_outputs_remaining = list(
                emission_configs_20percentintervals_remaining_set)
            print(
                f"custom outputs remaining for {output}: {len(custom_outputs_remaining)} - 20% intervals with {int(100 * len(emission_configs_20percentintervals_remaining_set) / len(emission_configs_20percentintervals))}% remaining"
            )

    if extra:
        if year == '2010':
            custom_inputs_main = [
                np.array([[1.15, 1.27, 0.98, 0.98, 1.36]]),  # bottom-up 2010
            ]
        elif year == '2011':
            custom_inputs_main = [
                np.array([[1.19, 1.30, 1.01, 1.01, 1.46]]),  # bottom-up 2011
            ]
        elif year == '2012':
            custom_inputs_main = [
                np.array([[1.20, 1.30, 1.01, 1.02, 1.39]]),  # bottom-up 2012
            ]
        elif year == '2013':
            custom_inputs_main = [
                np.array([[1.13, 1.29, 1.02, 1.01, 1.29]]),  # bottom-up 2013
            ]
        elif year == '2014':
            custom_inputs_main = [
                np.array([[1.06, 1.12, 0.99, 1.01, 1.12]]),  # bottom-up 2014
            ]
        elif year == '2015':
            custom_inputs_main = [
                np.array([[1.0, 1.0, 1.0, 1.0, 1.0]]),  # control
            ]
        elif year == '2016':
            custom_inputs_main = [
                np.array([[0.92, 0.84, 0.97, 0.99, 0.94]]),  # bottom-up 2016
                np.array([[0.76, 0.934, 0.735, 0.683,
                           0.708]]),  # top-down 2016 - both
                np.array([[0.744, 0.904, 0.778, 0.678,
                           0.716]]),  # top-down 2016 - either
                np.array([[0.803, 0.835, 0.742, 0.71,
                           0.717]]),  # top-down 2016 - pm25 only
                np.array([[0.769, 1.009, 0.697, 0.69,
                           0.72]]),  # top-down 2016 - o3 only
            ]
        elif year == '2017':
            custom_inputs_main = [
                np.array([[0.84, 0.81, 0.99, 0.99, 0.89]]),  # bottom-up 2017
                np.array([[0.704, 0.786, 0.73, 0.659,
                           0.6]]),  # top-down 2017 - both
                np.array([[0.771, 0.835, 0.711, 0.685,
                           0.544]]),  # top-down 2017 - either
                np.array([[0.721, 0.863, 0.712, 0.74,
                           0.709]]),  # top-down 2017 - pm25 only
                np.array([[0.824, 0.759, 0.767, 0.641,
                           0.429]]),  # top-down 2017 - o3 only
            ]
        elif year == '2018':
            custom_inputs_main = [
                np.array([[0.712, 0.703, 0.725, 0.676,
                           0.649]]),  # top-down 2018 - both
                np.array([[0.647, 0.945, 0.746, 0.588,
                           0.473]]),  # top-down 2018 - either
                np.array([[0.661, 0.674, 0.694, 0.742,
                           0.715]]),  # top-down 2018 - pm25 only
                np.array([[0.858, 1.092, 0.794, 0.604,
                           0.475]]),  # top-down 2018 - o3 only
            ]
        elif year == '2019':
            custom_inputs_main = [
                np.array([[0.739, 0.668, 0.701, 0.686,
                           0.682]]),  # top-down 2019 - both
                np.array([[0.657, 0.745, 0.714, 0.613,
                           0.591]]),  # top-down 2019 - either
                np.array([[0.701, 0.642, 0.669, 0.681,
                           0.679]]),  # top-down 2019 - pm25 only
                np.array([[0.8, 0.987, 0.648, 0.57,
                           0.493]]),  # top-down 2019 - o3 only
            ]
        elif year == '2020':
            custom_inputs_main = [
                np.array([[0.67, 0.609, 0.709, 0.621,
                           0.661]]),  # top-down 2020 - both
                np.array([[0.582, 0.7, 0.672, 0.5,
                           0.492]]),  # top-down 2020 - either
                np.array([[0.604, 0.399, 0.659, 0.613,
                           0.724]]),  # top-down 2020 - pm25 only
                np.array([[0.867, 0.957, 0.677, 0.558,
                           0.477]]),  # top-down 2020 - o3 only
            ]
        custom_inputs = []
        for custom_input in custom_inputs_main:
            custom_input_res = np.copy(custom_input)
            custom_input_ind = np.copy(custom_input)
            custom_input_tra = np.copy(custom_input)
            custom_input_agr = np.copy(custom_input)
            custom_input_ene = np.copy(custom_input)
            custom_input_nores = np.copy(custom_input)
            custom_input_noind = np.copy(custom_input)
            custom_input_notra = np.copy(custom_input)
            custom_input_noagr = np.copy(custom_input)
            custom_input_noene = np.copy(custom_input)

            custom_input_res[0][1:] = 1.0
            custom_input_ind[0][0] = 1.0
            custom_input_ind[0][2:] = 1.0
            custom_input_tra[0][:2] = 1.0
            custom_input_tra[0][3:] = 1.0
            custom_input_agr[0][:3] = 1.0
            custom_input_agr[0][4:] = 1.0
            custom_input_ene[0][:4] = 1.0

            custom_input_nores[0][0] = 0.0
            custom_input_noind[0][1] = 0.0
            custom_input_notra[0][2] = 0.0
            custom_input_noagr[0][3] = 0.0
            custom_input_noene[0][4] = 0.0

            custom_inputs.append(custom_input)
            custom_inputs.append(custom_input_res)
            custom_inputs.append(custom_input_ind)
            custom_inputs.append(custom_input_tra)
            custom_inputs.append(custom_input_agr)
            custom_inputs.append(custom_input_ene)
            custom_inputs.append(custom_input_nores)
            custom_inputs.append(custom_input_noind)
            custom_inputs.append(custom_input_notra)
            custom_inputs.append(custom_input_noagr)
            custom_inputs.append(custom_input_noene)
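            # 11 variants per input: the original, five with only one sector
            # scaled (the others reset to the 1.0 control), and five with one
            # sector zeroed out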

        custom_outputs_remaining = []
        for custom_input in custom_inputs:
            emission_config = f'RES{custom_input[0][0]:0.3f}_IND{custom_input[0][1]:0.3f}_TRA{custom_input[0][2]:0.3f}_AGR{custom_input[0][3]:0.3f}_ENE{custom_input[0][4]:0.3f}'
            custom_outputs_remaining.append(emission_config)

    if climate_cobenefits:
        custom_inputs_main = [
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # Base_CLE_2020
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # Base_MFR_2020
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # SDS_MFR_2020
            np.array([[0.68, 0.84, 0.71, 1.16, 0.93]]),  # Base_CLE_2030
            np.array([[0.33, 0.47, 0.48, 0.81, 0.69]]),  # Base_MFR_2030
            np.array([[0.27, 0.45, 0.41, 0.81, 0.55]]),  # SDS_MFR_2030
            np.array([[0.57, 0.75, 0.69, 1.2, 0.94]]),  # Base_CLE_2040
            np.array([[0.24, 0.41, 0.31, 0.83, 0.73]]),  # Base_MFR_2040
            np.array([[0.19, 0.38, 0.22, 0.83, 0.5]]),  # SDS_MFR_2040
            np.array([[0.52, 0.72, 0.65, 1.24, 0.91]]),  # Base_CLE_2050
            np.array([[0.2, 0.38, 0.29, 0.86, 0.72]]),  # Base_MFR_2050
            np.array([[0.18, 0.35, 0.2, 0.86, 0.46]]),  # SDS_MFR_2050
        ]
        custom_inputs = []
        for custom_input in custom_inputs_main:
            custom_input_res = np.copy(custom_input)
            custom_input_ind = np.copy(custom_input)
            custom_input_tra = np.copy(custom_input)
            custom_input_agr = np.copy(custom_input)
            custom_input_ene = np.copy(custom_input)
            custom_input_nores = np.copy(custom_input)
            custom_input_noind = np.copy(custom_input)
            custom_input_notra = np.copy(custom_input)
            custom_input_noagr = np.copy(custom_input)
            custom_input_noene = np.copy(custom_input)
            custom_input_resonly = np.copy(custom_input)
            custom_input_indonly = np.copy(custom_input)
            custom_input_traonly = np.copy(custom_input)
            custom_input_agronly = np.copy(custom_input)
            custom_input_eneonly = np.copy(custom_input)

            custom_input_res[0][1:] = 1.0
            custom_input_ind[0][0] = 1.0
            custom_input_ind[0][2:] = 1.0
            custom_input_tra[0][:2] = 1.0
            custom_input_tra[0][3:] = 1.0
            custom_input_agr[0][:3] = 1.0
            custom_input_agr[0][4:] = 1.0
            custom_input_ene[0][:4] = 1.0

            custom_input_nores[0][0] = 0.0
            custom_input_noind[0][1] = 0.0
            custom_input_notra[0][2] = 0.0
            custom_input_noagr[0][3] = 0.0
            custom_input_noene[0][4] = 0.0

            custom_input_resonly[0][1:] = 0.0
            custom_input_indonly[0][0] = 0.0
            custom_input_indonly[0][2:] = 0.0
            custom_input_traonly[0][:2] = 0.0
            custom_input_traonly[0][3:] = 0.0
            custom_input_agronly[0][:3] = 0.0
            custom_input_agronly[0][4:] = 0.0
            custom_input_eneonly[0][:4] = 0.0

            custom_inputs.append(custom_input)
            custom_inputs.append(custom_input_res)
            custom_inputs.append(custom_input_ind)
            custom_inputs.append(custom_input_tra)
            custom_inputs.append(custom_input_agr)
            custom_inputs.append(custom_input_ene)
            custom_inputs.append(custom_input_nores)
            custom_inputs.append(custom_input_noind)
            custom_inputs.append(custom_input_notra)
            custom_inputs.append(custom_input_noagr)
            custom_inputs.append(custom_input_noene)
            custom_inputs.append(custom_input_resonly)
            custom_inputs.append(custom_input_indonly)
            custom_inputs.append(custom_input_traonly)
            custom_inputs.append(custom_input_agronly)
            custom_inputs.append(custom_input_eneonly)
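            # 16 variants per input: the original, five with only one sector
            # scaled (others at 1.0), five with one sector zeroed, and five
            # single-sector inputs (all other sectors zero)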

        custom_outputs_remaining = []
        for custom_input in custom_inputs:
            emission_config = f'RES{custom_input[0][0]:0.3f}_IND{custom_input[0][1]:0.3f}_TRA{custom_input[0][2]:0.3f}_AGR{custom_input[0][3]:0.3f}_ENE{custom_input[0][4]:0.3f}'
            custom_outputs_remaining.append(emission_config)

    if top_down_2020_baseline:
        emission_config_2020_baseline = np.array([
            0.604, 0.399, 0.659, 0.613, 0.724
        ])  # top-down 2020 baseline, matched to PM2.5 only (top 1,000)
        emission_configs = np.array(
            np.meshgrid(
                np.linspace(
                    emission_config_2020_baseline[0] * 0.50,
                    emission_config_2020_baseline[0], 6
                ),  # 10% reduction increments from 2020 baseline up to 50%
                np.linspace(emission_config_2020_baseline[1] * 0.50,
                            emission_config_2020_baseline[1], 6),
                np.linspace(emission_config_2020_baseline[2] * 0.50,
                            emission_config_2020_baseline[2], 6),
                np.linspace(emission_config_2020_baseline[3] * 0.50,
                            emission_config_2020_baseline[3], 6),
                np.linspace(emission_config_2020_baseline[4] * 0.50,
                            emission_config_2020_baseline[4], 6),
            )).T.reshape(-1, 5)
        # add a few more with larger reductions in RES and IND to reach WHO-IT2
        # (kept 1-D to match the meshgrid rows, so emission_config[0] below
        # indexes a scalar rather than a row)
        emission_configs = list(emission_configs)
        emission_configs.append(np.array([0.242, 0.160, 0.659, 0.613, 0.724]))
        emission_configs.append(np.array([0.181, 0.120, 0.659, 0.613, 0.724]))
        emission_configs.append(np.array([0.121, 0.080, 0.659, 0.613, 0.724]))
        emission_configs.append(np.array([0.060, 0.040, 0.659, 0.613, 0.724]))

        emission_configs_total = []
        for emission_config in emission_configs:
            # :.3f already rounds, so no separate round() call is needed
            emission_configs_total.append(
                f'RES{emission_config[0]:.3f}_IND{emission_config[1]:.3f}_TRA{emission_config[2]:.3f}_AGR{emission_config[3]:.3f}_ENE{emission_config[4]:.3f}'
            )

        custom_outputs_completed = glob.glob(
            f"/nobackup/earlacoa/machinelearning/data_annual/health_impact_assessments/{output}_adjusted_scaled/df_country_hia_*.csv"
        )
        emission_configs_completed = []
        for custom_output_completed in custom_outputs_completed:
            emission_configs_completed.append(
                re.findall(
                    r'RES\d+\.\d+_IND\d+\.\d+_TRA\d+\.\d+_AGR\d+\.\d+_ENE\d+\.\d+',
                    custom_output_completed)[0])

        emission_configs_remaining_set = set(emission_configs_total) - set(
            emission_configs_completed)
        custom_outputs_remaining = list(emission_configs_remaining_set)
        print(
            f"custom outputs remaining: {len(custom_outputs_remaining)}, {int(100 * len(emission_configs_remaining_set) / len(emission_configs_total))}%"
        )

    # --------------------------------------------------

    # dask bag and process
    # run in 10 chunks over 10 cores, each chunk taking 2 minutes
    custom_outputs_remaining = custom_outputs_remaining[0:n_outputs]
    print(f"predicting for {len(custom_outputs_remaining)} custom outputs ...")
    bag_custom_outputs = db.from_sequence(custom_outputs_remaining,
                                          npartitions=n_workers)
    if output == "PM2_5_DRY":
        bag_custom_outputs.map(health_impact_assessment_pm25).compute()
    elif output == "o3_6mDM8h":
        bag_custom_outputs.map(health_impact_assessment_o3).compute()

    time_end = time.time() - time_start
    print(
        f"completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours"
    )

    client.close()
    cluster.close()
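
All three scenario branches above share the same restart-friendly bookkeeping:
encode every planned run as a config string, recover the finished ones from
the output filenames, and submit only the set difference. Reduced to its core
(the filenames and configs below are illustrative):

import re

planned = {
    "RES0.604_IND0.399_TRA0.659_AGR0.613_ENE0.724",
    "RES0.544_IND0.359_TRA0.593_AGR0.552_ENE0.652",
}
completed_files = [
    "df_country_hia_PM2_5_DRY_RES0.604_IND0.399_TRA0.659_AGR0.613_ENE0.724.csv",
]
pattern = r"RES\d+\.\d+_IND\d+\.\d+_TRA\d+\.\d+_AGR\d+\.\d+_ENE\d+\.\d+"
completed = {re.findall(pattern, name)[0] for name in completed_files}
remaining = sorted(planned - completed)
# ['RES0.544_IND0.359_TRA0.593_AGR0.552_ENE0.652']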