Example 1
def configure_node(cluster_changed, cluster_joined):
    status_set('maintenance', 'Configuring slurm-node')

    controller_data = cluster_changed.active_data
    create_spool_dir(context=controller_data)

    render_munge_key(context=controller_data)
    # If the munge.key has been changed on the controller and munge is
    # running, the service must be restarted to use the new key
    if (flags.is_flag_set('endpoint.slurm-cluster.changed.munge_key')
            and service_running(MUNGE_SERVICE)):
        log('Restarting munge due to key change on slurm-controller')
        service_restart(MUNGE_SERVICE)

    render_slurm_config(context=controller_data)

    # Make sure munge is running
    if not service_running(MUNGE_SERVICE):
        service_start(MUNGE_SERVICE)
    # Make sure slurmd is running
    if not service_running(SLURMD_SERVICE):
        service_start(SLURMD_SERVICE)

    flags.set_flag('slurm-node.configured')
    log('Set {} flag'.format('slurm-node.configured'))

    flags.clear_flag('endpoint.slurm-cluster.active.changed')
    log('Cleared {} flag'.format('endpoint.slurm-cluster.active.changed'))

    # Clear this flag so that a subsequent munge_key change on the
    # controller can be signalled again.
    flags.clear_flag('endpoint.slurm-cluster.changed.munge_key')
    log('Cleared {} flag'.format('endpoint.slurm-cluster.changed.munge_key'))
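
For context, handlers like this are registered with charms.reactive flag decorators, which pass the matching endpoint objects in as arguments. A minimal sketch of plausible wiring, with the flag names inferred from the flags referenced inside the function body (the real layer may differ):

# Hypothetical registration sketch; the trigger flags are assumptions
# based on the flags cleared inside configure_node.
from charms.reactive import when

@when('endpoint.slurm-cluster.active.changed',
      'endpoint.slurm-cluster.joined')
def configure_node(cluster_changed, cluster_joined):
    ...
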
Example 2
def configure_node(cluster_changed, cluster_joined):
    status_set('maintenance', 'Configuring slurm-node')

    controller_data = cluster_changed.active_data
    create_spool_dir(context=controller_data)
    render_munge_key(context=controller_data)
    render_slurm_config(context=controller_data)
    # Make sure slurmd is running
    if not service_running(SLURMD_SERVICE):
        service_start(SLURMD_SERVICE)

    flags.set_flag('slurm-node.configured')
    log('Set {} flag'.format('slurm-node.configured'))

    flags.clear_flag('endpoint.slurm-cluster.active.changed')
    log('Cleared {} flag'.format('endpoint.slurm-cluster.active.changed'))
Example 3
def configure_node(cluster_changed, cluster_joined):
    status_set('maintenance', 'Configuring slurm-node')

    controller_data = cluster_changed.active_data

    gres_context = get_inventory()
    gres_context.update({key: controller_data[key] for key in ['slurm_user']})

    create_spool_dir(context=controller_data)
    render_munge_key(context=controller_data)
    render_slurm_config(context=controller_data)
    render_gres_config(context=gres_context)
    # Make sure slurmd is running
    if not service_running(SLURMD_SERVICE):
        service_start(SLURMD_SERVICE)

    flags.set_flag('slurm-node.configured')
    flags.clear_flag('endpoint.slurm-cluster.active.changed')
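
The render_* helpers are not shown in these examples. A helper such as render_gres_config could be a thin wrapper around charmhelpers templating; the sketch below assumes a gres.conf Jinja2 template shipped with the layer, and the target path is illustrative rather than taken from the charm:

# Hypothetical sketch of a render helper; the template name and target
# path are assumptions.
from charmhelpers.core.templating import render

def render_gres_config(context):
    # Render templates/gres.conf into place with the given context.
    render(source='gres.conf',
           target='/etc/slurm-llnl/gres.conf',
           context=context,
           perms=0o644)
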
Example 4
def configure_controller(cluster):
    status_set('maintenance', 'Configuring slurm-controller')
    # Get node configs
    nodes = cluster.get_nodes()
    partitions = cluster.get_partitions()
    config().update({
        'nodes': nodes,
        'partitions': partitions,
        'control_machine': gethostname(),
        'control_addr': unit_private_ip(),
    })
    # Setup slurm dirs and config
    create_state_save_location(config=config())
    render_slurm_config(config=config())
    # Make sure slurmctld is running
    if not service_running(SLURMCTLD_SERVICE):
        service_start(SLURMCTLD_SERVICE)
    # Send config to nodes
    cluster.send_controller_config(config=config())
    # Update states
    remove_state('slurm-controller.changed')
    set_state('slurm-controller.configured')
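
Unlike the other examples, this one uses the older charms.reactive state API. In current charms.reactive, set_state and remove_state are deprecated aliases of set_flag and clear_flag, so the last two lines are equivalent to:

# Flag-based equivalent of the state calls above (current
# charms.reactive naming; the behaviour is the same).
from charms.reactive import flags

flags.clear_flag('slurm-controller.changed')
flags.set_flag('slurm-controller.configured')
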
Example 5
def configure_controller(*args):
    '''A controller is only configured after leader election has been
    performed. The cluster endpoint must be present for a controller to
    proceed with initial configuration.'''
    hookenv.status_set('maintenance', 'Configuring slurm-controller')

    # a role needs to be determined here so that a controller context can
    # be uniformly prepared for consumption on the worker side, as the
    # controller and node layers share a common layer with a slurm.conf
    # template that is mostly identical on all nodes
    is_active = controller.is_active_controller()

    role = controller.ROLES[is_active]
    peer_role = controller.ROLES[not is_active]

    # the endpoint is guaranteed to be present, since the joined flag is
    # required for this handler to run
    cluster_endpoint = relations.endpoint_from_flag(
        'endpoint.slurm-cluster.joined')
    # Get node configs
    nodes = cluster_endpoint.get_node_data()
    partitions = controller.get_partitions(nodes)

    # relation-changed does not necessarily mean that data will be provided
    if not partitions:
        flags.clear_flag('endpoint.slurm-cluster.changed')
        return

    # the whole charm config will be sent to related nodes
    # with some additional options added via dict update
    controller_conf = copy.deepcopy(hookenv.config())
    controller_conf.update({
        'nodes': nodes,
        'partitions': partitions,
        # for worker nodes
        'munge_key': hookenv.leader_get('munge_key'),
    })

    net_details = controller.add_key_prefix(
        cluster_endpoint.network_details(), role)
    # update the config dict used as a context in rendering to have prefixed
    # keys for network details based on a current unit role (active or backup)
    controller_conf.update(net_details)

    ha_endpoint = relations.endpoint_from_flag(
        'endpoint.slurm-controller-ha.joined')
    if ha_endpoint:
        # add prefixed peer data
        peer_data = controller.add_key_prefix(
            ha_endpoint.peer_data, peer_role)
        controller_conf.update(peer_data)
    else:
        peer_data = None

    # a controller service is configurable if it is an active controller
    # or a backup controller that knows about an active controller
    is_configurable = is_active or (not is_active and peer_data)
    if is_configurable:
        hookenv.log('The controller is configurable ({})'.format(role))
        # Setup slurm dirs and config
        helpers.create_state_save_location(context=controller_conf)
        helpers.render_slurm_config(context=controller_conf)
        flags.set_flag('slurm-controller.configured')
        # restart controller process on any changes
        # TODO: this could be optimized via goal-state hook by
        # observing "joining" node units
        host.service_restart(helpers.SLURMCTLD_SERVICE)
    else:
        hookenv.log('The controller is NOT configurable ({})'.format(role))
        if not is_active:
            hookenv.status_set('maintenance',
                               'Backup controller is waiting for peer data')

    # Send config to nodes
    if is_active:
        # TODO: wait until a peer acknowledges that it has cleared
        # its side of a node-facing relation - this needs to be done
        # in case an active controller is changed to a different one
        # to avoid split-brain conditions on node units
        cluster_endpoint.send_controller_config(controller_conf)
    else:
        # otherwise make sure that all keys are cleared
        # this is relevant for a former active controller
        cluster_endpoint.send_controller_config({
            k: None for k in controller_conf.keys()
        })

    # clear the changed flag as it is not cleared automatically
    flags.clear_flag('endpoint.slurm-cluster.changed')
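
The controller.add_key_prefix helper is not shown here. Based on how its result is merged into controller_conf, it presumably prefixes every key in a dict with the unit's role; a hypothetical sketch, with ROLES assumed to map the is_active boolean to role names:

# Hypothetical sketch of the role constants and the prefixing helper;
# the names and exact behaviour are assumptions.
ROLES = {True: 'active', False: 'backup'}

def add_key_prefix(data, prefix):
    # e.g. {'hostname': 'nc1'} with prefix 'active' becomes
    # {'active_hostname': 'nc1'}
    return {'{}_{}'.format(prefix, key): value
            for key, value in (data or {}).items()}
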
Example 6
def configure_controller(*args):
    '''A controller is only configured after leader election has been
    performed. The cluster endpoint must be present for a controller to
    proceed with initial configuration.'''
    hookenv.status_set('maintenance', 'Configuring slurm-controller')
    flags.clear_flag('slurm-controller.configured')

    # a role needs to be determined here so that a controller context can
    # be uniformly prepared for consumption on the worker side, as the
    # controller and node layers share a common layer with a slurm.conf
    # template that is mostly identical on all nodes
    is_active = controller.is_active_controller()

    role = controller.ROLES[is_active]
    peer_role = controller.ROLES[not is_active]

    # the endpoint is guaranteed to be present, since the joined flag is
    # required for this handler to run
    cluster_endpoint = relations.endpoint_from_flag(
        'endpoint.slurm-cluster.joined')
    # Get node configs
    nodes = cluster_endpoint.get_node_data()
    partitions = controller.get_partitions(nodes)

    # Implementation of automatic node weights
    node_weight_criteria = hookenv.config().get('node_weight_criteria')
    if node_weight_criteria != 'none':
        weightres = controller.set_node_weight_criteria(
            node_weight_criteria, nodes)
        # If the weight configuration is incorrect, abort reconfiguration. Status
        # will be set to blocked with an informative message. The controller charm
        # will keep running.
        if not weightres:
            return

    # relation-changed does not necessarily mean that data will be provided
    if not partitions:
        flags.clear_flag('endpoint.slurm-cluster.changed')
        return

    # the whole charm config will be sent to related nodes
    # with some additional options added via dict update
    controller_conf = copy.deepcopy(hookenv.config())
    # if a cluster-specific config include file exists, add its contents
    # to the controller_conf dict
    slurmconf_include = '{}/slurm-{}.conf'.format(
        helpers.SLURM_CONFIG_DIR, hookenv.config().get('clustername'))
    if os.path.exists(slurmconf_include):
        with open(slurmconf_include) as f:
            controller_conf.update({'include': f.read()})
    controller_conf.update({
        'nodes': nodes,
        'partitions': partitions,
        # for worker nodes
        'munge_key': hookenv.leader_get('munge_key'),
    })

    net_details = controller.add_key_prefix(
        cluster_endpoint.network_details(), role)
    # update the config dict used as a context in rendering to have prefixed
    # keys for network details based on a current unit role (active or backup)
    controller_conf.update(net_details)

    ha_endpoint = relations.endpoint_from_flag(
        'endpoint.slurm-controller-ha.joined')
    if ha_endpoint:
        # add prefixed peer data
        peer_data = controller.add_key_prefix(
            ha_endpoint.peer_data, peer_role)
        controller_conf.update(peer_data)
    else:
        peer_data = None

    # If we have a DBD relation, extract endpoint data and configure the
    # DBD setup directly, regardless of whether the clustername is
    # accepted by the DBD or not
    if (flags.is_flag_set('endpoint.slurm-dbd-consumer.joined')
            and leadership.leader_get('dbd_host')):
        controller_conf.update({
            'dbd_host': leadership.leader_get('dbd_host'),
            'dbd_port': leadership.leader_get('dbd_port'),
            'dbd_ipaddr': leadership.leader_get('dbd_ipaddr'),
        })

    es_endpoint = relations.endpoint_from_flag('elasticsearch.available')
    if es_endpoint:
        # each related unit advertises its host and port; the values of
        # the last unit listed are used
        for unit in es_endpoint.list_unit_data():
            elastic_host = unit['host']
            elastic_port = unit['port']
        controller_conf.update({
            'elastic_host': elastic_host,
            'elastic_port': elastic_port,
        })
        hookenv.log('elasticsearch available, using {}:{} from endpoint '
                    'relation'.format(elastic_host, elastic_port))
    else:
        hookenv.log('No endpoint for elasticsearch available')

    # In case we are here due to a DBD join or a charm config change,
    # announce this to the nodes by changing the value of
    # slurm_config_updated
    if (flags.is_flag_set('slurm.dbd_host_updated')
            or flags.is_flag_set('config.changed')):
        ts = time.time()
        hookenv.log('Slurm configuration on controller was updated on %s, '
                    'announcing to nodes' % ts)
        controller_conf.update({'slurm_config_updated': ts})
        flags.clear_flag('slurm.dbd_host_updated')

    # a controller service is configurable if it is an active controller
    # or a backup controller that knows about an active controller
    is_configurable = is_active or (not is_active and peer_data)
    if is_configurable:
        hookenv.log('The controller is configurable ({})'.format(role))
        # Setup slurm dirs and config
        helpers.create_state_save_location(context=controller_conf)
        helpers.render_slurm_config(context=controller_conf, active_controller=is_active)
        flags.set_flag('slurm-controller.configured')
        flags.clear_flag('slurm-controller.reconfigure')
        flags.clear_flag('slurm-controller.munge_updated')
        # restart controller process on any changes
        # TODO: this could be optimized via goal-state hook by
        # observing "joining" node units
        host.service_restart(helpers.SLURMCTLD_SERVICE)
    else:
        hookenv.log('The controller is NOT configurable ({})'.format(role))
        if not is_active:
            hookenv.status_set('maintenance',
                               'Backup controller is waiting for peer data')

    # Send config to nodes
    if is_active:
        # TODO: wait until a peer acknowledges that it has cleared
        # its side of a node-facing relation - this needs to be done
        # in case an active controller is changed to a different one
        # to avoid split-brain conditions on node units
        cluster_endpoint.send_controller_config(controller_conf)
    else:
        # otherwise make sure that all keys are cleared
        # this is relevant for a former active controller
        cluster_endpoint.send_controller_config({
            k: None for k in controller_conf.keys()
        })

    # clear the changed flag as it is not cleared automatically
    flags.clear_flag('endpoint.slurm-cluster.changed')
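
The slurm_config_updated timestamp works because charms.reactive Endpoint implementations raise a per-field changed flag when published relation data changes, as already seen with endpoint.slurm-cluster.changed.munge_key in Example 1. A hypothetical node-side counterpart could react to it like this:

# Illustrative node-side handler; the flag name follows the
# endpoint.<name>.changed.<field> convention and the node flag names
# are taken from Example 1, but this handler itself is an assumption.
from charms.reactive import when, flags

@when('endpoint.slurm-cluster.changed.slurm_config_updated')
def handle_controller_config_update():
    # Force configure_node to run again on the next dispatch.
    flags.clear_flag('slurm-node.configured')
    flags.clear_flag('endpoint.slurm-cluster.changed.slurm_config_updated')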