Example No. 1
def parslConfigFromCompute(compute):
    """Given a Compute instance, return a setup parsl configuration"""
    if isinstance(compute, EC2Compute):
        # NOTE: Assumes paropt is being run on an EC2 instance with access to the metadata service
        try:
            public_ip = getAWSPublicIP()

            # get the required environment variables
            required_env_vars = [
                "PAROPT_AWS_REGION", "PAROPT_AWS_KEY_NAME",
                "PAROPT_AWS_STATE_FILE", "PAROPT_AWS_IAM_INSTANCE_PROFILE_ARN"
            ]
            env_vars = {
                varname.replace('PAROPT_AWS_', '').lower(): os.getenv(varname)
                for varname in required_env_vars
            }
            missing_vars = [
                varname for varname, value in env_vars.items() if value is None
            ]
            if missing_vars:
                raise Exception(
                    "Missing required environment variables for running parsl with AWS:\n{}"
                    .format(missing_vars))

            parsl_config = Config(
                executors=[
                    HighThroughputExecutor(
                        label='htex_local',
                        address=public_ip,
                        worker_port_range=(54000, 54050),
                        interchange_port_range=(54051, 54100),
                        cores_per_worker=1,
                        max_workers=1,
                        provider=AWSProvider(
                            image_id=compute.ami,
                            instance_type=compute.instance_model,
                            worker_init=
                            'pip3 install git+https://[email protected]/globus-labs/ParaOpt@Chaofeng_modification',
                            # alternatives: git+https://[email protected]/chaofengwu/paropt
                            #               git+https://[email protected]/macintoshpie/paropt
                            nodes_per_block=1,
                            init_blocks=1,
                            max_blocks=1,
                            min_blocks=0,
                            walltime='24:00:00',
                            spot_max_bid=2.0,
                            **env_vars),
                    )
                ],
                strategy=None,
            )

            return parsl_config
        except KeyError as e:
            logger.error('Failed initializing aws config: {}'.format(e))
            raise e
        except (HTTPError, URLError, OSError) as e:
            logger.error('Request to metadata service failed: {}'.format(e))
            raise e

    elif isinstance(compute, LocalCompute):
        return Config(executors=[
            ThreadPoolExecutor(max_threads=8, label='local_threads')
        ])

    else:
        raise Exception('Unknown Compute type')
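A minimal usage sketch for the factory above, assuming the surrounding paropt module provides a LocalCompute instance (the instance construction below is illustrative and not shown in this snippet):

import parsl

compute = LocalCompute()  # assumed: defined alongside EC2Compute in the paropt code
config = parslConfigFromCompute(compute)
parsl.load(config)  # hand the returned Config to Parsl as usual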
Example No. 2
        HighThroughputExecutor(
            label='comet_htex',
            worker_debug=True,
            address='js-17-185.jetstream-cloud.org',
            max_workers=1,
            cores_per_worker=24,
            worker_logdir_root='/home/aymen/parsl_scripts',
            #address=address_by_query(),
            interchange_address='comet-ln2.sdsc.edu',
            interchange_port_range=(50100, 50400),
            #client_address = "129.114.17.185",
            worker_port_range=(50500, 51000),
            provider=SlurmProvider(
                'debug',
                channel=SSHChannel(
                    hostname='comet-ln2.sdsc.edu',
                    username=
                    '******',  # Please replace USERNAME with your username
                    password='******',
                    script_dir=
                    '/home/aymen/parsl_scripts',  # Please replace with your own script directory
                ),
                # launcher=SrunLauncher(),
                scheduler_options='',  # Input your scheduler_options if needed
                #worker_init='conda activate /oasis/projects/nsf/unc100/aymen/anaconda3/envs/parsl-env',     # Input your worker_init if needed
                worker_init='source /home/aymen/ve/parsl-env/bin/activate',
                walltime="00:10:00",
                init_blocks=1,
                max_blocks=1,
                nodes_per_block=1,
                parallelism=24,
            ),
            working_dir="/home/aymen/parsl_scripts",
            #client_address = "129.114.17.185",
            #worker_port_range=(54000, 55000),

            #interchange_address = "js-17-185.jetstream-cloud.org"
            #storage_access=[GlobusScheme(
            #    endpoint_uuid='de463f97-6d04-11e5-ba46-22000b92c6ec',
            #    endpoint_path='/',
            #    local_path='/')],
        )
Example No. 3
from parsl.config import Config
from parsl.providers import CobaltProvider
from parsl.launchers import AprunLauncher
from parsl.executors import HighThroughputExecutor


config = Config(
    executors=[
        HighThroughputExecutor(
            label='theta_local_htex_multinode',
            max_workers=4,
            provider=CobaltProvider(
                queue='YOUR_QUEUE',
                account='YOUR_ACCOUNT',
                launcher=AprunLauncher(overrides="-d 64"),
                walltime='00:30:00',
                nodes_per_block=2,
                init_blocks=1,
                min_blocks=1,
                max_blocks=1,
                # string to prepend to #COBALT blocks in the submit
                # script to the scheduler eg: '#COBALT -t 50'
                scheduler_options='',
                # Command to be run before starting a worker, such as:
                # 'module load Anaconda; source activate parsl_env'.
                worker_init='',
                cmd_timeout=120,
            ),
        )
    ],
)
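Once a configuration like this is loaded, a small hedged smoke test can confirm that workers come up on the allocated nodes (the app body below is illustrative, not part of the original example):

import parsl
from parsl.app.app import python_app

parsl.load(config)

@python_app
def node_name():
    import platform
    return platform.node()

print(node_name().result())  # should report a compute-node hostname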
Example No. 4
from datetime import datetime
import csv
import os
import sys

import parsl
from parsl.app.app import python_app, bash_app
from parsl.config import Config
from parsl.providers import GridEngineProvider
from parsl.executors import HighThroughputExecutor
from parsl.executors import IPyParallelExecutor
from parsl.executors import ThreadPoolExecutor
from parsl.addresses import address_by_route, address_by_query, address_by_hostname

config = Config(executors=[
    HighThroughputExecutor(worker_debug=True,
                           address=address_by_route(),
                           provider=GridEngineProvider(walltime='100:00:00',
                                                       init_blocks=1,
                                                       max_blocks=20),
                           label="workers"),
    ThreadPoolExecutor(label="login", max_threads=20)
], )

parsl.set_stream_logger()
parsl.load(config)

from data_generation import generate_data

proteomefile = sys.argv[1]
directory = f'/home/users/ellenrichards/{sys.argv[2]}/'
threshold = 1000

if not os.path.isdir(directory):
Example No. 5
 executors=[
     HighThroughputExecutor(
         label="theta_funcx",
         #worker_debug=True,
         max_workers=tasks_per_node,  # Experimental 
         #suppress_failure=True, # Experimental
         heartbeat_period=60,
         heartbeat_threshold=600,
         cores_per_worker=args.cores_per_worker,
         address=address_by_hostname(),
         #container_image=os.path.expanduser("~/sing-run.simg"),
         container_image='/tmp/sing-run.simg',
         worker_mode="singularity_reuse",
         #worker_mode="no_container",
         provider=CobaltProvider(
             queue=args.queue,
             #account='ExM',
             account='CSC249ADCD01',
             launcher=AprunLauncher(overrides="-d 64"),
             scheduler_options='',
             # worker_init='source ~/move_image.sh\nsource activate funcx-test'.format(os.getenv('PWD')),
             worker_init=
             'source activate funcx-test\naprun -n {} -N 1 /bin/bash ~/move_image.sh'
             .format(nodes_per_block),
             init_blocks=1,
             max_blocks=1,
             min_blocks=1,
             nodes_per_block=nodes_per_block,
             walltime=walltime,
             cmd_timeout=60),
     )
 ],
Example No. 6
docker_image = "opensciencegrid/osgvo-el6"

transfer_output_files = coffea_parsl_condor
''' #% (nproc, ) # twoGB*nproc,

#RequestMemory = %d
#RequestCpus = %d
#RequestDisk = 1048576

xfer_files = ['%s/.local' % (os.environ['HOME'], ), '/tmp/%s' % (x509_proxy, )]

config = Config(
    executors=[
        HighThroughputExecutor(
            label="coffea_parsl_condor",
            address=address_by_hostname(),
            prefetch_capacity=0,
            cores_per_worker=1,
            max_workers=nproc,
            worker_logdir_root='./',
            provider=CondorProvider(init_blocks=8,
                                    max_blocks=200,
                                    nodes_per_block=1,
                                    worker_init=wrk_init,
                                    transfer_input_files=xfer_files,
                                    scheduler_options=condor_cfg),
        )
    ],
    strategy=None,
)
Example No. 7
from parsl.config import Config
from parsl.providers import CondorProvider
from parsl.executors import HighThroughputExecutor
from parsl.addresses import address_by_query

config = Config(executors=[
    HighThroughputExecutor(
        label='OSG_HTEX',
        address=address_by_query(),
        max_workers=1,
        provider=CondorProvider(
            nodes_per_block=1,
            init_blocks=4,
            max_blocks=4,
            # This scheduler option string ensures that the compute nodes provisioned
            # will have modules
            scheduler_options=
            'Requirements = OSGVO_OS_STRING == "RHEL 6" && Arch == "X86_64" &&  HAS_MODULES == True',
            # Command to be run before starting a worker, such as:
            # 'module load Anaconda; source activate parsl_env'.
            worker_init='',
            walltime="00:20:00",
        ),
    )
])
Example No. 8
from typing import Any, Dict

from parsl.config import Config
from parsl.providers import AdHocProvider
from parsl.channels import SSHChannel
from parsl.executors import HighThroughputExecutor
from parsl.addresses import address_by_query

user_opts = {'adhoc':
             {'username': '******',
              'script_dir': 'YOUR_SCRIPT_DIR',
              'remote_hostnames': ['REMOTE_HOST_URL_1', 'REMOTE_HOST_URL_2']
             }
}  # type: Dict[str, Dict[str, Any]]

config = Config(
    executors=[
        HighThroughputExecutor(
            label='remote_htex',
            max_workers=2,
            address=address_by_query(),
            worker_logdir_root=user_opts['adhoc']['script_dir'],
            provider=AdHocProvider(
                # Command to be run before starting a worker, such as:
                # 'module load Anaconda; source activate parsl_env'.
                worker_init='',
                channels=[SSHChannel(hostname=m,
                                     username=user_opts['adhoc']['username'],
                                     script_dir=user_opts['adhoc']['script_dir'],
                ) for m in user_opts['adhoc']['remote_hostnames']]
            )
        )
    ],
    #  AdHoc Clusters should not be setup with scaling strategy.
    strategy=None,
)
Example No. 9
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.providers import LocalProvider
from parsl.channels import LocalChannel
from parsl.launchers import SingleNodeLauncher
from parsl.addresses import address_by_interface

n_managers = 2
config = Config(
    executors=[
        HighThroughputExecutor(
            address=address_by_interface('bond0.144'),
            poll_period=1,
            heartbeat_period=1,
            heartbeat_threshold=2,
            label="htex_local",
            worker_mode="no_container",
            worker_ports=(53531, 53532),
            # worker_debug=True,
            cores_per_worker=1,
            max_workers=4,
            provider=LocalProvider(
                channel=LocalChannel(),
                init_blocks=0,
                max_blocks=1,
                # tasks_per_node=1,  # For HighThroughputExecutor, this option should in most cases be 1
                launcher=SingleNodeLauncher(),
            ),
        )
    ],
    retries=2,
    strategy=None,
)

Example No. 10
import getpass

from parsl.config import Config
from parsl.providers import AzureProvider
from parsl.executors import HighThroughputExecutor
from parsl.addresses import address_by_query
from parsl.data_provider.http import HTTPInTaskStaging
from parsl.data_provider.ftp import FTPInTaskStaging
from parsl.data_provider.rsync import RSyncStaging

vm_reference = {
    # All fields below are required
    "admin_username": '******',
    "password": '******',
    "vm_size": 'YOUR_VM_SIZE',
    "disk_size_gb": 'YOUR_VM_DISK_SIZE',
    "publisher": 'YOUR_IMAGE_PUBLISHER',
    "offer": 'YOUR_VM_OS_OFFER',
    "sku": 'YOUR_VM_OS_SKU',
    "version": 'YOUR_VM_OS_VERSION',
}

config = Config(executors=[
    HighThroughputExecutor(
        label='azure_single_node',
        address=address_by_query(),
        provider=AzureProvider(
            vm_reference=vm_reference,
            key_file='azure_key_file.json',
        ),
        storage_access=[
            HTTPInTaskStaging(),
            FTPInTaskStaging(),
            RSyncStaging(getpass.getuser() + "@" + address_by_query())
        ],
    )
])
Example No. 11
from parsl.providers import LocalProvider
from parsl.channels import LocalChannel

from parsl.config import Config
from parsl.executors import HighThroughputExecutor

config = Config(executors=[
    HighThroughputExecutor(
        label="htex_local",
        cores_per_worker=1,
        provider=LocalProvider(
            channel=LocalChannel(),
            init_blocks=1,
            max_blocks=1,
        ),
    )
], )
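With a simple local configuration like the one above, a hedged bash_app sketch (the command and file names are illustrative) shows how shell commands are dispatched through the same executor:

import parsl
from parsl.app.app import bash_app

parsl.load(config)

@bash_app
def list_dir(stdout='ls.out', stderr='ls.err'):
    return 'ls -l'  # a bash_app returns the shell command to run

list_dir().result()           # block until the command finishes
print(open('ls.out').read())  # stdout was redirected to the named file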
Example No. 12
     channel = SSHChannel(
         hostname=hostnames[step],
         username=args.unix_username,
         gssapi_auth=args.gssapi,
     )
 if args.scheduler_name == 'slurm':
     executors.append(
         HighThroughputExecutor(
             label=step,
             worker_debug=True,
             address=address_by_hostname(),
             cores_per_worker=vCPUs_per_core * int(cores_per_task[step]),
             provider=SlurmProvider(
                 args.scheduler_partition,
                 channel=channel,
                 launcher=SrunLauncher(),
                 nodes_per_block=node_count,
                 worker_init=worker_init,
                 init_blocks=1,
                 max_blocks=1,
                 walltime=walltimes[step],
                 scheduler_options=options,
                 move_files=False,
             ),
         ))
 elif args.scheduler_name == 'grid_engine':
     executors.append(
         HighThroughputExecutor(
             label=step,
             worker_debug=True,
             address=address_by_hostname(),
             provider=GridEngineProvider(
Example No. 13
from parsl.config import Config
from parsl.providers import AWSProvider
from parsl.executors import HighThroughputExecutor
from parsl.addresses import address_by_query

config = Config(executors=[
    HighThroughputExecutor(
        label='ec2_htex_single_node',
        address=address_by_query(),
        provider=AWSProvider(
            image_id='YOUR_AMI_ID',
            region='us-east-1',
            key_name='YOUR_KEY_NAME',
            profile='default',
            state_file='awsproviderstate.json',
            nodes_per_block=1,
            init_blocks=1,
            walltime='01:00:00',
        ),
    )
], )
Example No. 14
def theta_nwchem_config(log_dir: str,
                        nodes_per_nwchem: int = 2,
                        total_nodes: int = int(
                            os.environ.get("COBALT_JOBSIZE", 1)),
                        ml_prefetch: int = 0) -> Config:
    """Theta configuration where QC workers sit on the launch node (to be able to aprun)
    and ML workers are placed on compute nodes

    Args:
        nodes_per_nwchem: Number of nodes per NWChem computation
        log_dir: Path to store monitoring DB and parsl logs
        total_nodes: Total number of nodes available. Default: COBALT_JOBSIZE
        ml_prefetch: Number of tasks for ML workers to prefetch for inference
    Returns:
        (Config) Parsl configuration
    """
    assert total_nodes % nodes_per_nwchem == 0, "Total node count must be a multiple of nodes per NWChem task"
    nwc_workers = total_nodes // nodes_per_nwchem

    return Config(
        executors=[
            ThreadPoolExecutor(label='qc', max_threads=nwc_workers),
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="ml-inference",
                max_workers=1,
                prefetch_capacity=ml_prefetch,
                provider=LocalProvider(
                    nodes_per_block=nodes_per_nwchem,
                    init_blocks=0,
                    max_blocks=total_nodes //
                    nodes_per_nwchem,  # Limits the number of manager processes,
                    launcher=AprunLauncher(
                        overrides='-d 256 --cc depth -j 4'
                    ),  # Places worker on the compute node
                    worker_init='''
module load miniconda-3
conda activate /lus/theta-fs0/projects/CSC249ADCD08/edw/env
    ''',
                ),
            ),
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="ml-train",
                max_workers=1,
                prefetch_capacity=0,
                provider=LocalProvider(
                    nodes_per_block=nodes_per_nwchem,
                    init_blocks=0,
                    max_blocks=
                    nwc_workers,  # Limits the number of manager processes,
                    launcher=AprunLauncher(
                        overrides='-d 256 --cc depth -j 4'
                    ),  # Places worker on the compute node
                    worker_init='''
module load miniconda-3
conda activate /lus/theta-fs0/projects/CSC249ADCD08/edw/env
    ''',
                ),
            )
        ],
        monitoring=MonitoringHub(
            hub_address=address_by_hostname(),
            monitoring_debug=False,
            resource_monitoring_interval=10,
            logdir=log_dir,
            logging_endpoint=
            f'sqlite:///{os.path.join(log_dir, "monitoring.db")}'),
        run_dir=log_dir,
        strategy='simple',
        max_idletime=15.)
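A hedged illustration of the node arithmetic above: passing total_nodes=8 with the default nodes_per_nwchem=2 gives nwc_workers = 8 // 2 = 4, so the 'qc' ThreadPoolExecutor runs up to 4 NWChem tasks concurrently and each ML executor may scale to at most 4 blocks of 2 nodes. The log directory in the call below is illustrative:

import parsl

config = theta_nwchem_config('runinfo/logs', nodes_per_nwchem=2, total_nodes=8, ml_prefetch=1)
parsl.load(config)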
Example No. 15
def cli_run():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--redishost",
        default="127.0.0.1",
        help="Address at which the redis server can be reached")
    parser.add_argument("--redisport",
                        default="6379",
                        help="Port on which redis is available")
    parser.add_argument("-d",
                        "--debug",
                        action='store_true',
                        help="Count of apps to launch")
    parser.add_argument("-m",
                        "--mac",
                        action='store_true',
                        help="Configure for Mac")
    args = parser.parse_args()

    if args.debug:
        parsl.set_stream_logger()

    if args.mac:
        config = Config(
            executors=[
                ThreadPoolExecutor(label="htex"),
                ThreadPoolExecutor(label="local_threads")
            ],
            strategy=None,
        )
    else:
        config = Config(
            executors=[
                HighThroughputExecutor(
                    label="htex",
                    # Max workers limits the concurrency exposed via mom node
                    max_workers=2,
                    provider=LocalProvider(
                        init_blocks=1,
                        max_blocks=1,
                    ),
                ),
                ThreadPoolExecutor(label="local_threads")
            ],
            strategy=None,
        )
    parsl.load(config)

    print(
        '''This program creates an "MPI Method Server" that listens on an inputs queue and writes to an output queue:

        input_queue --> mpi_method_server --> queues

To send it a request, add an entry to the inputs queue:
     run "pipeline-pump -p N" where N is an integer request
To access a value, remove it from the output queue:
     run "pipeline-pull" (blocking) or "pipeline-pull -t T" (T an integer) to time out after T seconds
     TODO: Timeout does not work yet!
''')

    # Get the queues for the method server
    method_queues = MethodServerQueues(args.redishost, port=args.redisport)

    # Start the method server
    mms = ParslMethodServer([target_fun],
                            method_queues,
                            default_executors=['htex'])
    mms.run()
Example No. 16
from parsl.config import Config
from parsl.providers import SlurmProvider
from parsl.executors import HighThroughputExecutor
from parsl.addresses import address_by_hostname
from parsl.data_provider.globus import GlobusScheme

config = Config(
    executors=[
        HighThroughputExecutor(
            label='Stampede2_HTEX',
            address=address_by_hostname(),
            provider=SlurmProvider(
                nodes_per_block=2,
                init_blocks=1,
                min_blocks=1,
                partition='YOUR_PARTITION',
                # string to prepend to #SBATCH blocks in the submit
                # script to the scheduler eg: '#SBATCH --constraint=knl,quad,cache'
                scheduler_options='',
                # Command to be run before starting a worker, such as:
                # 'module load Anaconda; source activate parsl_env'.
                worker_init='',
                walltime='00:30:00'),
            storage_access=[
                GlobusScheme(
                    endpoint_uuid='ceea5ca0-89a9-11e7-a97f-22000a92523b',
                    endpoint_path='/',
                    local_path='/')
            ])
    ], )
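Given the GlobusScheme attached to this executor, a hedged sketch of how a Globus-staged input file would typically be passed into an app (the remote path and app body are illustrative; the endpoint UUID is the one configured above):

import parsl
from parsl.app.app import python_app
from parsl.data_provider.files import File

parsl.load(config)

@python_app
def count_lines(inputs=[]):
    with open(inputs[0].filepath) as fh:
        return sum(1 for _ in fh)

remote = File('globus://ceea5ca0-89a9-11e7-a97f-22000a92523b/data.txt')
print(count_lines(inputs=[remote]).result())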
Example No. 17
def workflow_config(name,
                    nodes,
                    cores_per_node=24,
                    interval=30,
                    monitor=False):
    import logging
    import parsl
    from parsl.config import Config
    from parsl.channels import LocalChannel
    from parsl.launchers import SrunLauncher
    from parsl.providers import LocalProvider
    from parsl.addresses import address_by_interface
    from parsl.executors import HighThroughputExecutor
    from parsl.monitoring.monitoring import MonitoringHub

    parsl.set_stream_logger()
    parsl.set_file_logger('script.output', level=logging.DEBUG)

    logging.info('Configuring Parsl Workflow Infrastructure')

    #Read where datasets are...
    env_str = str()
    with open('parsl.env', 'r') as reader:
        env_str = reader.read()

    logging.info(f'Task Environment {env_str}')

    mon_hub = MonitoringHub(
        workflow_name=name,
        hub_address=address_by_interface('ib0'),
        hub_port=60001,
        resource_monitoring_enabled=True,
        monitoring_debug=False,
        resource_monitoring_interval=interval,
    ) if monitor else None

    config = Config(
        executors=[
            HighThroughputExecutor(
                label=name,
                # Optional: The network interface on node 0 which compute nodes can communicate with.
                # address=address_by_interface('enp4s0f0' or 'ib0')
                address=address_by_interface('ib0'),
                # one worker per manager / node
                max_workers=cores_per_node,
                provider=LocalProvider(
                    channel=LocalChannel(script_dir='.'),
                    # make sure the nodes_per_block matches the nodes requested in the submit script in the next step
                    nodes_per_block=nodes,
                    # make sure the -c value passed to srun matches cores_per_node
                    launcher=SrunLauncher(overrides=f'-c {cores_per_node}'),
                    cmd_timeout=120,
                    init_blocks=1,
                    max_blocks=1,
                    worker_init=env_str,
                ),
            )
        ],
        monitoring=mon_hub,
        strategy=None,
    )

    logging.info('Loading Parsl Config')

    parsl.load(config)
    return
Example No. 18
WORKERS_PER_NODE = 64

parsl_config = Config(
    executors=[
        HighThroughputExecutor(
            label='theta-htex',
            max_workers=WORKERS_PER_NODE * MY_COMPUTE_NODES *
            MY_COMPUTE_BLOCKS,
            worker_debug=True,
            address=address_by_hostname(),
            provider=CobaltProvider(
                queue=MY_QUEUE,
                account=MY_ALLOCATION,
                launcher=AprunLauncher(overrides="-d 64"),
                walltime=MY_TIME,
                nodes_per_block=MY_COMPUTE_NODES,
                init_blocks=1,
                min_blocks=1,
                max_blocks=MY_COMPUTE_BLOCKS,
                # string to prepend to #COBALT blocks in the submit
                # script to the scheduler eg: '#COBALT -t 50'
                scheduler_options='',
                # Command to be run before starting a worker, such as:
                worker_init='module load miniconda-3; export PATH=$PATH:{}'.
                format(MY_USER_PATH),
                cmd_timeout=120,
            ),
        ),
        ThreadPoolExecutor(label='login-node', max_threads=8),
    ], )
parsl.load(parsl_config)
Example No. 19
from parsl.config import Config
from parsl.providers import GridEngineProvider
from parsl.executors import HighThroughputExecutor
from parsl.addresses import address_by_query

config = Config(
    executors=[
        HighThroughputExecutor(
            label='CC.IN2P3_HTEX',
            address=address_by_query(),
            provider=GridEngineProvider(
                nodes_per_block=1,
                init_blocks=1,
                max_blocks=1,
                # string to prepend to #$ blocks in the submit
                # script to the scheduler, e.g.: '#$ -M [email protected]'
                scheduler_options='',
                # Command to be run before starting a worker, such as:
                # 'module load Anaconda; source activate parsl_env'.
                worker_init='',
                walltime='00:20:00',
            ),
        )
    ], )
Example No. 20
            from parsl.executors.high_throughput.interchange import ManagerLost
            if isinstance(exception, ManagerLost):
                return 0.1
            else:
                return 1

        slurm_htex = Config(
            executors=[
                HighThroughputExecutor(
                    label="jobs",
                    address=address_by_hostname(),
                    prefetch_capacity=0,
                    worker_debug=True,
                    provider=SlurmProvider(
                        channel=LocalChannel(script_dir='logs_parsl'),
                        launcher=SrunLauncher(),
                        max_blocks=args.workers,
                        init_blocks=args.workers,
                        partition='all',
                        # scheduler_options=sched_opts,   # Enter scheduler_options if needed
                        worker_init=wrk_init,
                        walltime='03:00:00'),
                ),
                HighThroughputExecutor(
                    label="merges",
                    address=address_by_hostname(),
                    prefetch_capacity=0,
                    worker_debug=True,
                    provider=SlurmProvider(
                        channel=LocalChannel(script_dir='logs_parsl'),
                        launcher=SrunLauncher(),
Example No. 21
"""Tests related to Parsl workers being able to access their worker ID"""

from parsl.providers import LocalProvider
from parsl.channels import LocalChannel
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl import python_app
import pytest

local_config = Config(
    executors=[
        HighThroughputExecutor(
            label="htex_Local",
            worker_debug=True,
            max_workers=4,
            provider=LocalProvider(
                channel=LocalChannel(),
                init_blocks=1,
                max_blocks=1,
            ),
        )
    ],
    strategy=None,
)


@python_app
def get_worker_info():
    import os
    rank = int(os.environ['PARSL_WORKER_RANK'])
    size = int(os.environ['PARSL_WORKER_COUNT'])
    pool_id = os.environ['PARSL_WORKER_POOL_ID']
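The snippet is cut off above; a hedged sketch of how such a worker-info app is typically fanned out in a test follows (the app body and task count are assumptions, not the original test code):

import parsl
from parsl import python_app

parsl.load(local_config)

@python_app
def worker_rank():
    import os
    return int(os.environ['PARSL_WORKER_RANK'])

# Launch more tasks than workers so every worker is likely to report in.
ranks = {worker_rank().result() for _ in range(16)}
print(sorted(ranks))  # with max_workers=4 above, expect ranks 0-3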
Example No. 22
from parsl.config import Config
from parsl.executors import HighThroughputExecutor

from parsl.launchers import JsrunLauncher
from parsl.providers import LSFProvider

from parsl.addresses import address_by_interface

config = Config(
    executors=[
        HighThroughputExecutor(
            label='Summit_HTEX',
            working_dir='/gpfs/alpine/scratch/yadunan/gen011/',
            address=address_by_interface(
                'ib0'),  # This assumes Parsl is running on login node
            worker_port_range=(50000, 55000),
            provider=LSFProvider(launcher=JsrunLauncher(),
                                 walltime="00:10:00",
                                 nodes_per_block=2,
                                 init_blocks=1,
                                 max_blocks=1,
                                 worker_init="source ~/setup.sh",
                                 project='GEN011WORKFLOW',
                                 cmd_timeout=60),
        )
    ], )
Example No. 23
def main(args=None):

    # Grab CLI args if not present
    if args is None:
        args = parse_args()
    exit_callbacks = []

    try:
        if args["debug"]["schema"]:
            print(ManagerSettings.schema_json(indent=2))
            return  # We're done, exit normally
    except KeyError:
        pass  # Don't worry if schema isn't in the list
    finally:
        debug_args = args.pop("debug", {})  # Ensure the debug key is not present

    # Construct object
    settings = ManagerSettings(**args)

    # Handle Skeleton Generation
    if debug_args.get("skeleton", None):

        class IndentListDumper(yaml.Dumper):
            """
            Internal yaml Dumper to make lists indent in the output YAML

            Buried inside this function since it's only used for "skeleton" generation, once, before exiting. It does
            not need to be imported or accessed anywhere else.

            Based on response:
            https://stackoverflow.com/questions/25108581/python-yaml-dump-bad-indentation/39681672#39681672
            """

            def increase_indent(self, flow=False, indentless=False):
                return super(IndentListDumper, self).increase_indent(flow, False)

        skel_path = os.path.expanduser(debug_args["skeleton"])
        with open(skel_path, "w") as skel:
            # cast to a plain dict (via a JSON round-trip) so yaml.dump emits clean YAML
            data = yaml.dump(json.loads(settings.json()), Dumper=IndentListDumper, default_flow_style=False)
            skel.write(data)
            print(
                f"Skeleton Queue Manager YAML file written to {skel_path}\n"
                f"Run: `qcfractal-manager --config-file={skel_path}` to start a manager with this configuration."
            )
            return

    logger_map = {AdapterEnum.pool: "", AdapterEnum.dask: "dask_jobqueue.core", AdapterEnum.parsl: "parsl"}
    if settings.common.verbose:
        adapter_logger = logging.getLogger(logger_map[settings.common.adapter])
        adapter_logger.setLevel("DEBUG")
        logger.setLevel("DEBUG")

    if settings.manager.log_file_prefix is not None:
        tornado.options.options["log_file_prefix"] = settings.manager.log_file_prefix
        # Clones the log to the output
        tornado.options.options["log_to_stderr"] = True
    tornado.log.enable_pretty_logging()

    if settings.manager.test:
        # Test this manager, no client needed
        client = None
    else:
        # Connect to a specified fractal server
        client = qcfractal.interface.FractalClient(
            address=settings.server.fractal_uri, **settings.server.dict(skip_defaults=True, exclude={"fractal_uri"})
        )

    # Figure out per-task data
    node_parallel_tasks = settings.common.nodes_per_task > 1  # Whether tasks are node-parallel
    if node_parallel_tasks:
        supported_adapters = ["parsl"]
        if settings.common.adapter not in supported_adapters:
            raise ValueError("Node-parallel jobs are only supported with {} adapters".format(supported_adapters))
        # Node-parallel tasks use all cores on a worker
        cores_per_task = settings.common.cores_per_worker
        memory_per_task = settings.common.memory_per_worker
        if settings.common.tasks_per_worker > 1:
            raise ValueError(">1 task per node and >1 node per tasks are mutually-exclusive")
    else:
        cores_per_task = settings.common.cores_per_worker // settings.common.tasks_per_worker
        memory_per_task = settings.common.memory_per_worker / settings.common.tasks_per_worker
    if cores_per_task < 1:
        raise ValueError("Cores per task must be larger than one!")

    if settings.common.adapter == "pool":
        from multiprocessing import Pool, set_start_method

        set_start_method("spawn")

        # Error if the number of nodes per jobs is more than 1
        if settings.common.nodes_per_job > 1:
            raise ValueError("Pool adapters only run on a single local node")
        queue_client = Pool(processes=settings.common.tasks_per_worker, initializer=_initialize_signals_process_pool)

    elif settings.common.adapter == "dask":

        dask_settings = settings.dask.dict(skip_defaults=True)
        # Checks
        if "extra" not in dask_settings:
            dask_settings["extra"] = []
        if QCA_RESOURCE_STRING not in dask_settings["extra"]:
            dask_settings["extra"].append(QCA_RESOURCE_STRING)
        # Scheduler opts
        scheduler_opts = settings.cluster.scheduler_options.copy()

        # Error if the number of nodes per jobs is more than 1
        if settings.common.nodes_per_job > 1:
            raise NotImplementedError("Support for >1 node per job is not yet supported by QCFractal + Dask")
            # TODO (wardlt): Implement multinode jobs in Dask

        _cluster_loaders = {
            "slurm": "SLURMCluster",
            "pbs": "PBSCluster",
            "moab": "MoabCluster",
            "sge": "SGECluster",
            "lsf": "LSFCluster",
        }
        dask_exclusivity_map = {
            "slurm": "--exclusive",
            "pbs": "-n",
            "moab": "-n",  # Less sure about this one
            "sge": "-l exclusive=true",
            "lsf": "-x",
        }
        if settings.cluster.node_exclusivity and dask_exclusivity_map[settings.cluster.scheduler] not in scheduler_opts:
            scheduler_opts.append(dask_exclusivity_map[settings.cluster.scheduler])

        # Create one construct to quickly merge dicts with a final check
        dask_construct = {
            "name": "QCFractal_Dask_Compute_Executor",
            "cores": settings.common.cores_per_worker,
            "memory": str(settings.common.memory_per_worker) + "GB",
            "processes": settings.common.tasks_per_worker,  # Number of workers to generate == tasks in this construct
            "walltime": settings.cluster.walltime,
            "job_extra": scheduler_opts,
            "env_extra": settings.cluster.task_startup_commands,
            **dask_settings,
        }

        try:
            # Import the dask things we need
            import dask_jobqueue
            from dask.distributed import Client

            cluster_module = cli_utils.import_module(
                "dask_jobqueue", package=_cluster_loaders[settings.cluster.scheduler]
            )
            cluster_class = getattr(cluster_module, _cluster_loaders[settings.cluster.scheduler])
            if dask_jobqueue.__version__ < "0.5.0":
                raise ImportError
        except ImportError:
            raise ImportError("You need`dask-jobqueue >= 0.5.0` to use the `dask` adapter")

        cluster = cluster_class(**dask_construct)

        # Setup up adaption
        # Workers are distributed down to the cores through the sub-divided processes
        # Optimization may be needed
        workers = settings.common.tasks_per_worker * settings.common.max_workers
        if settings.cluster.adaptive == AdaptiveCluster.adaptive:
            cluster.adapt(minimum=0, maximum=workers, interval="10s")
        else:
            cluster.scale(workers)

        queue_client = Client(cluster)

    elif settings.common.adapter == "parsl":

        scheduler_opts = settings.cluster.scheduler_options

        if not settings.cluster.node_exclusivity:
            raise ValueError(
                "For now, QCFractal can only be run with Parsl in node exclusivity. This will be relaxed "
                "in a future release of Parsl and QCFractal"
            )

        # Import helpers
        _provider_loaders = {
            "slurm": "SlurmProvider",
            "pbs": "TorqueProvider",
            "moab": "TorqueProvider",
            "sge": "GridEngineProvider",
            "cobalt": "CobaltProvider",
            "lsf": None,
        }

        if _provider_loaders[settings.cluster.scheduler] is None:
            raise ValueError(f"Parsl does not know how to handle cluster of type {settings.cluster.scheduler}.")

        # Headers
        _provider_headers = {
            "slurm": "#SBATCH",
            "pbs": "#PBS",
            "moab": "#PBS",
            "sge": "#$$",
            "lsf": None,
            "cobalt": "#COBALT",
        }

        # Import the parsl things we need
        try:
            import parsl
            from parsl.config import Config
            from parsl.executors import HighThroughputExecutor
            from parsl.addresses import address_by_hostname

            provider_module = cli_utils.import_module(
                "parsl.providers", package=_provider_loaders[settings.cluster.scheduler]
            )
            provider_class = getattr(provider_module, _provider_loaders[settings.cluster.scheduler])
            provider_header = _provider_headers[settings.cluster.scheduler]
            if parsl.__version__ < "0.9.0":
                raise ImportError
        except ImportError:
            raise ImportError("You need `parsl >=0.9.0` to use the `parsl` adapter")

        if _provider_loaders[settings.cluster.scheduler] == "moab":
            logger.warning(
                "Parsl uses its TorqueProvider for Moab clusters due to the scheduler similarities. "
                "However, if you find a bug with it, please report to the Parsl and QCFractal developers so "
                "it can be fixed on each respective end."
            )

        # Setup the providers

        # Determine the maximum number of blocks
        # TODO (wardlt): Math assumes that user does not set aside a compute node for the adapter
        max_nodes = settings.common.max_workers * settings.common.nodes_per_task
        if settings.common.nodes_per_job > max_nodes:
            raise ValueError("Number of nodes per job is more than the maximum number of nodes used by manager")
        if max_nodes % settings.common.nodes_per_job != 0:
            raise ValueError(
                "Maximum number of nodes (maximum number of workers times nodes per task) "
                "needs to be a multiple of the number of nodes per job"
            )
        if settings.common.nodes_per_job % settings.common.nodes_per_task != 0:
            raise ValueError("Number of nodes per job needs to be a multiple of the number of nodes per task")
        max_blocks = max_nodes // settings.common.nodes_per_job

        # Create one construct to quickly merge dicts with a final check
        common_parsl_provider_construct = {
            "init_blocks": 0,  # Update this at a later time of Parsl
            "max_blocks": max_blocks,
            "walltime": settings.cluster.walltime,
            "scheduler_options": f"{provider_header} " + f"\n{provider_header} ".join(scheduler_opts) + "\n",
            "nodes_per_block": settings.common.nodes_per_job,
            "worker_init": "\n".join(settings.cluster.task_startup_commands),
            **settings.parsl.provider.dict(skip_defaults=True, exclude={"partition", "launcher"}),
        }
        if settings.cluster.scheduler.lower() == "slurm" and "cores_per_node" not in common_parsl_provider_construct:
            common_parsl_provider_construct["cores_per_node"] = settings.common.cores_per_worker
        # TODO: uncomment after Parsl#1416 is resolved
        # if settings.cluster.scheduler.lower() == "slurm" and "mem_per_node" not in common_parsl_provider_construct:
        #    common_parsl_provider_construct["mem_per_node"] = settings.common.memory_per_worker

        if settings.parsl.provider.launcher:
            common_parsl_provider_construct["launcher"] = settings.parsl.provider.launcher.build_launcher()
        if settings.cluster.scheduler == "slurm":
            # The Parsl SLURM constructor has a strange set of arguments
            provider = provider_class(
                settings.parsl.provider.partition,
                exclusive=settings.cluster.node_exclusivity,
                **common_parsl_provider_construct,
            )
        else:
            provider = provider_class(**common_parsl_provider_construct)

        # The executor for Parsl is different for node parallel tasks and shared-memory tasks
        if node_parallel_tasks:
            # Tasks are launched from a single worker on the login node
            # TODO (wardlt): Remove assumption that there is only one Parsl worker running all tasks
            tasks_per_job = settings.common.nodes_per_job // settings.common.nodes_per_task
            logger.info(f"Preparing a HTEx to use node-parallel tasks with {tasks_per_job} workers")
            parsl_executor_construct = {
                "label": "QCFractal_Parsl_{}_Executor".format(settings.cluster.scheduler.title()),
                # Parsl will create one worker process per MPI task. Normally, Parsl prevents having
                #  more processes than cores. However, as each worker will spend most of its time
                #  waiting for the MPI task to complete, we can safely oversubscribe (e.g., more worker
                #  processes than cores), which requires setting "cores_per_worker" to <1
                "cores_per_worker": 1e-6,
                "max_workers": tasks_per_job,
                "provider": provider,
                "address": address_by_hostname(),
                **settings.parsl.executor.dict(skip_defaults=True),
            }
        else:

            parsl_executor_construct = {
                "label": "QCFractal_Parsl_{}_Executor".format(settings.cluster.scheduler.title()),
                "cores_per_worker": cores_per_task,
                "max_workers": settings.common.tasks_per_worker,
                "provider": provider,
                "address": address_by_hostname(),
                **settings.parsl.executor.dict(skip_defaults=True),
            }

        queue_client = Config(
            retries=settings.common.retries, executors=[HighThroughputExecutor(**parsl_executor_construct)]
        )

    else:
        raise KeyError(
            "Unknown adapter type '{}', available options: {}.\n"
            "This code should also be unreachable with pydantic Validation, so if "
            "you see this message, please report it to the QCFractal GitHub".format(
                settings.common.adapter, [getattr(AdapterEnum, v).value for v in AdapterEnum]
            )
        )

    # Build out the manager itself
    # Compute max tasks
    max_concurrent_tasks = settings.common.tasks_per_worker * settings.common.max_workers
    if settings.manager.max_queued_tasks is None:
        # Tasks * jobs * buffer + 1
        max_queued_tasks = ceil(max_concurrent_tasks * 2.00) + 1
    else:
        max_queued_tasks = settings.manager.max_queued_tasks

    # The queue manager is configured differently for node-parallel and single-node tasks
    manager = qcfractal.queue.QueueManager(
        client,
        queue_client,
        max_tasks=max_queued_tasks,
        queue_tag=settings.manager.queue_tag,
        manager_name=settings.manager.manager_name,
        update_frequency=settings.manager.update_frequency,
        cores_per_task=cores_per_task,
        memory_per_task=memory_per_task,
        nodes_per_task=settings.common.nodes_per_task,
        scratch_directory=settings.common.scratch_directory,
        retries=settings.common.retries,
        verbose=settings.common.verbose,
        cores_per_rank=settings.common.cores_per_rank,
        configuration=settings,
    )

    # Set stats correctly since we buffer the max tasks a bit
    manager.statistics.max_concurrent_tasks = max_concurrent_tasks

    # Add exit callbacks
    for cb in exit_callbacks:
        manager.add_exit_callback(cb[0], *cb[1], **cb[2])

    # Either startup the manager or run until complete
    if settings.manager.test:
        success = manager.test(settings.manager.ntests)
        if success is False:
            raise ValueError("Testing was not successful, failing.")
    else:

        for signame in {"SIGHUP", "SIGINT", "SIGTERM"}:

            def stop(*args, **kwargs):
                manager.stop(signame)
                raise KeyboardInterrupt()

            signal.signal(getattr(signal, signame), stop)

        # Blocks until signal
        try:
            manager.start()
        except KeyboardInterrupt:
            pass
Example No. 24
# Snippet intended to run from the UI (login) node; shows partition selection and address_by_interface
"""  Each job submitted to the scheduler will request 2 nodes for 10 minutes.
"""
config = Config(
    executors=[
        HighThroughputExecutor(
            label="sd_htex",
            address=address_by_interface('ib0'),
            max_workers=1,          # Set number of workers per node
            provider=SlurmProvider(
                cmd_timeout=60,     # Add extra time for slow scheduler responses
                channel=LocalChannel(),
                nodes_per_block=2,
                init_blocks=1,
                min_blocks=1,
                max_blocks=1,
                partition='normal',                                 # Replace with partition name
                scheduler_options='#SBATCH -A <YOUR_ALLOCATION>',   # Enter scheduler_options if needed

                # Command to be run before starting a worker, such as:
                # 'module load Anaconda; source activate parsl_env'.
                worker_init='',

                # Ideally we set the walltime to the longest supported walltime.
                walltime='00:10:00',
                launcher=SrunLauncher(),
            ),
        )
    ],
)
Example No. 25
from parsl.config import Config
from parsl.providers import SlurmProvider
from parsl.channels import SSHChannel
from parsl.launchers import SrunLauncher
from parsl.executors import HighThroughputExecutor
from parsl.tests.utils import get_rundir

# If you are a developer running tests, make sure to update parsl/tests/configs/user_opts.py
# If you are a user copying-and-pasting this as an example, make sure to either
#       1) create a local `user_opts.py`, or
#       2) delete the user_opts import below and replace all appearances of `user_opts` with the literal value
#          (i.e., user_opts['midway']['username'] -> 'your_username')
from .user_opts import user_opts

config = Config(executors=[
    HighThroughputExecutor(
        label='midway_htex_multinode',
        provider=SlurmProvider(
            'westmere',
            channel=SSHChannel(hostname='swift.rcc.uchicago.edu',
                               username=user_opts['midway']['username'],
                               script_dir=user_opts['midway']['script_dir']),
            launcher=SrunLauncher(),
            scheduler_options=user_opts['midway']['scheduler_options'],
            worker_init=user_opts['midway']['worker_init'],
            walltime="00:05:00",
            init_blocks=1,
            max_blocks=1,
            nodes_per_block=2,
        ),
    )
],
                run_dir=get_rundir())
Example No. 26
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.launchers import MpiRunLauncher
from parsl.providers import CobaltProvider

config = Config(
    executors=[
        HighThroughputExecutor(
            label="cooley_htex",
            worker_debug=False,
            cores_per_worker=1,
            provider=CobaltProvider(
                queue='debug',
                account='YOUR_ACCOUNT',  # project name to submit the job
                launcher=MpiRunLauncher(),
                scheduler_options=
                '',  # string to prepend to #COBALT blocks in the submit script to the scheduler
                worker_init=
                '',  # command to run before starting a worker, such as 'source activate env'
                init_blocks=1,
                max_blocks=1,
                min_blocks=1,
                nodes_per_block=4,
                cmd_timeout=60,
                walltime='00:10:00',
            ),
        )
    ], )
Example No. 27
 HighThroughputExecutor(
     label='comet_htex',
     worker_debug=True,
     address='js-17-185.jetstream-cloud.org',
     max_workers=24,
     #workers_per_node = 24, # Getting error for unexpected argument
     cores_per_worker=1,
     worker_logdir_root='/home/aymen/parsl_scripts',
     interchange_address=
     'comet-ln2.sdsc.edu',  #'js-17-185.jetstream-cloud.org','comet-ln2.sdsc.edu',
     interchange_port_range=(50000, 50400),
     worker_port_range=(50500, 51000),
     provider=SlurmProvider(
         'debug',
         channel=SSHChannel(
             hostname='comet-ln2.sdsc.edu',
             username=
             '******',  # Please replace USERNAME with your username
             password='******',
             script_dir=
             '/home/aymen/parsl_scripts',  # Please replace with your own script directory
         ),
         #launcher=SrunLauncher(),
         scheduler_options='',  # Input your scheduler_options if needed
         worker_init=
         'source ve/parsl-env/bin/activate',  # Input your worker_init if needed
         #worker_init='source activate /oasis/projects/nsf/unc100/aymen/anaconda3/envs/conda-parsl',
         #partition = "debug",
         walltime="00:10:00",
         init_blocks=1,
         max_blocks=1,
         #tasks_per_node = 24, # Getting error for unexpected argument
         nodes_per_block=1,
         #cores_per_node=24, # Getting error for unexpected argument
         parallelism=24,
     ),
     #working_dir="/home/aymen/parsl_scripts",
 )
Example No. 28
        '''
        nproc = 36
        sched_opts = '''
        #SBATCH --cpus-per-task=%d
        ''' % (nproc)

        slurm_htex = Config(
            executors=[
                HighThroughputExecutor(
                    label="coffea_parsl_slurm",
                    address=address_by_hostname(),
                    prefetch_capacity=0,
                    max_workers=nproc,
                    provider=SlurmProvider(
                        channel=LocalChannel(script_dir='parsl_slurm'),
                        launcher=SrunLauncher(),
                        max_blocks=(args.ncpu)+5,
                        init_blocks=args.ncpu, 
                        partition='all',
                        scheduler_options=sched_opts,   # Enter scheduler_options if needed
                        worker_init=wrk_init,         # Enter worker_init if needed
                        walltime='00:120:00'
                    ),
                )
            ],
            retries=20,
        )
        dfk = parsl.load(slurm_htex)
    else:
        config = Config(executors=[ThreadPoolExecutor(max_threads=args.ncpu)])
        parsl.load(config)
Example No. 29
from parsl.config import Config
from parsl.providers import LocalProvider
from parsl.channels import SSHChannel
from parsl.addresses import address_by_hostname
from parsl.executors import HighThroughputExecutor

hostnames = ['host-1', 'host-2']

config = Config(
    executors=[
        HighThroughputExecutor(
            label='htex_{}'.format(h),
            worker_debug=False,
            address=address_by_hostname(),
            provider=LocalProvider(
                # The username on the machines depends on the distribution
                # used; e.g. on Ubuntu the username is 'ubuntu'
                channel=SSHChannel(hostname=h, username='******'),
                move_files=False,  # set to True if there is no shared filesystem
                nodes_per_block=1,
                init_blocks=1,
                min_blocks=1,
                max_blocks=1,
                # Command to be run before starting a worker, such as:
                # 'module load Anaconda; source activate parsl_env'.
                worker_init='',
            ),
        ) for h in hostnames
    ],
    strategy=None)
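Because each host gets its own labelled executor ('htex_host-1', 'htex_host-2'), individual apps can be pinned to one machine; a hedged sketch (the app body is illustrative):

import parsl
from parsl.app.app import python_app

parsl.load(config)

@python_app(executors=['htex_host-1'])
def where_am_i():
    import socket
    return socket.gethostname()

print(where_am_i().result())  # runs only on workers provided by host-1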