Пример #1
0
def reregister_server(url, address):
    """Point the agent at a manager URL and re-announce its address.

    The chroma-agent service is stopped first if it is running. The new
    server URL is stored, the agent's address and FQDN are POSTed to the
    manager's ``reregister/`` endpoint, and the service is started again.

    :param url: base manager URL; ``reregister/`` is appended to it.
    :param address: address of this agent, sent to the manager.
    :return: whatever the POST to the manager returns.
    :raises HttpError: re-raised after logging when the POST fails; the
        service is not started in that case.
    """
    service_running = _service_is_running() is True
    if service_running:
        console_log.warning(
            "chroma-agent service was running before registration, stopping.")
        agent_service.stop()

    conf.set_server_url(url)

    agent_crypto = Crypto(conf.ENV_PATH)
    endpoint = url + "reregister/"
    client = AgentClient(
        endpoint,
        ActionPluginManager(),
        DevicePluginManager(),
        ServerProperties(),
        agent_crypto,
    )
    payload = {"address": address, "fqdn": client._fqdn}

    try:
        response = client.post(payload)
    except HttpError:
        # Log what we tried to send, then let the caller see the failure.
        console_log.error("Reregistration failed to %s with request %s" %
                          (client.url, payload))
        raise

    console_log.info("Starting chroma-agent service")
    agent_service.start()
    return response
Пример #2
0
def register_server(url, ca, secret, address=None):
    """Register this agent with the manager and start the agent service.

    Stops a running chroma-agent service, resets the local crypto state,
    installs the manager's authority certificate, registers with the
    manager, installs the certificate returned by the registration, stores
    the server URL in the config and finally enables and starts the service.

    :param url: base manager URL; ``register/<secret>/`` is appended to it.
    :param ca: authority certificate handed to ``Crypto.install_authority``.
    :param secret: registration secret embedded in the registration URL.
    :param address: optional address passed to ``AgentClient.register``.
    :return: the registration result returned by the manager.
    """
    service_running = _service_is_running() is True
    if service_running:
        console_log.warning(
            "chroma-agent service was running before registration, stopping.")
        agent_service.stop()

    agent_crypto = Crypto(config.path)
    # A previous configuration may not have been cleaned up properly, so
    # wipe it before installing the new authority.
    agent_crypto.delete()
    agent_crypto.install_authority(ca)

    registration_url = url + "register/%s/" % secret
    client = AgentClient(
        registration_url,
        ActionPluginManager(),
        DevicePluginManager(),
        ServerProperties(),
        agent_crypto,
    )

    outcome = client.register(address)
    agent_crypto.install_certificate(outcome['certificate'])

    config.set('settings', 'server', {'url': url})

    console_log.info("Enabling chroma-agent service")
    agent_service.enable()

    console_log.info("Starting chroma-agent service")
    agent_service.start()

    return outcome
Пример #3
0
def stop_target(ha_label):
    '''
    Stop the high availability target

    Return: Value using simple return protocol
    '''
    # HYD-7230: brute force, keep retrying the stop (4 attempts in total)
    for attempt in range(1, 5):
        # When a zfs resource exists for this target, disable the whole
        # group: group disable will disable all members of the group
        # regardless of their current status.
        if _resource_exists(_zfs_name(ha_label)):
            disable_target = _group_name(ha_label)
        else:
            disable_target = ha_label

        # Issue the command to Pacemaker to stop the target
        failure = AgentShell.run_canned_error_message(
            ['pcs', 'resource', 'disable', disable_target])
        if failure:
            return agent_error(failure)

        if _wait_target(ha_label, False):
            return agent_result_ok

        if attempt < 4:
            console_log.info("failed to stop target %s", ha_label)

    return agent_error("Failed to stop target {}".format(ha_label))
Пример #4
0
    def _shutdown():
        """Run ``shutdown`` for the enclosing request, then exit this process.

        ``halt`` and ``at_time`` come from the enclosing scope: ``-H`` is
        passed when ``halt`` is truthy, ``-h`` otherwise.
        """
        console_log.info("Initiating server shutdown per manager request")

        # This will initiate a "nice" shutdown with a wall from root, etc.
        mode_flag = "-H" if halt else "-h"
        shutdown_cmd = ["shutdown", mode_flag, at_time]
        AgentShell.try_run(shutdown_cmd)

        console_log.info("Terminating")
        os._exit(0)
Пример #5
0
def reregister_server(url, address):
    """Update the manager URL and re-register the agent address with it.

    A running chroma-agent service is stopped first. The URL is written to
    the ``settings``/``server`` config entry, the address and FQDN are POSTed
    to the manager's ``reregister/`` endpoint, and the service is started.

    :param url: base manager URL; ``reregister/`` is appended to it.
    :param address: address of this agent, sent to the manager.
    :return: whatever the POST to the manager returns.
    :raises HttpError: re-raised after logging when the POST fails; the
        service is not started in that case.
    """
    service_running = _service_is_running() is True
    if service_running:
        console_log.warning(
            "chroma-agent service was running before registration, stopping.")
        agent_service.stop()

    config.set('settings', 'server', {'url': url})

    agent_crypto = Crypto(config.path)
    endpoint = url + 'reregister/'
    client = AgentClient(
        endpoint,
        ActionPluginManager(),
        DevicePluginManager(),
        ServerProperties(),
        agent_crypto,
    )
    payload = {'address': address, 'fqdn': client._fqdn}

    try:
        response = client.post(payload)
    except HttpError:
        # Log what we tried to send, then let the caller see the failure.
        console_log.error("Reregistration failed to %s with request %s" %
                          (client.url, payload))
        raise

    console_log.info("Starting chroma-agent service")
    agent_service.start()
    return response
Пример #6
0
def stop_target(ha_label):
    '''
    Stop the high availability target

    Return: Value using simple return protocol
    '''
    # Ask Pacemaker to set the resource's target-role to Stopped.
    stop_cmd = [
        'crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v',
        'Stopped'
    ]

    # HYD-7230: brute force, keep retrying the stop (4 attempts in total)
    for attempt in range(1, 5):
        failure = AgentShell.run_canned_error_message(stop_cmd)
        if failure:
            return agent_error(failure)

        if _wait_target(ha_label, False):
            return agent_result_ok

        if attempt < 4:
            console_log.info("failed to stop target %s" % ha_label)

    return agent_error("failed to stop target %s" % ha_label)
Пример #7
0
    def _reboot():
        """Run ``shutdown -r`` at ``at_time`` (from the enclosing scope), then exit this process."""
        console_log.info("Initiating server reboot per manager request")

        # reboot(8) just calls shutdown anyhow.
        reboot_cmd = ["shutdown", "-r", at_time]
        AgentShell.try_run(reboot_cmd)

        console_log.info("Terminating")
        os._exit(0)
Пример #8
0
    def set_address(self, ipv4_address, prefix):
        """Bring the interface up and assign it ``ipv4_address``/``prefix``.

        Nothing is changed when the interface already reports
        ``ipv4_address``. Otherwise the device is passed to
        ``node_admin.unmanage_network``, the link is set up, the address is
        added with ``ip addr add`` and the interface is polled (about once a
        second, at most 30 times) until it reports the new address. The
        resulting address and netmask are then passed to
        ``node_admin.write_ifcfg``.

        :param ipv4_address: IPv4 address to assign.
        :param prefix: prefix length used to build the ``address/prefix``
            argument for ``ip addr add``.
        :raises RuntimeError: if the interface still does not report
            ``ipv4_address`` once polling gives up.
        """
        ifaddr = "%s/%s" % (ipv4_address, prefix)

        console_log.info("Set %s (%s) up" % (self.name, ifaddr))

        if self.ipv4_address != ipv4_address:
            node_admin.unmanage_network(self.device, self.mac_address)

            AgentShell.try_run(
                ['/sbin/ip', 'link', 'set', 'dev', self.name, 'up'])
            AgentShell.try_run(
                ['/sbin/ip', 'addr', 'add', ifaddr, 'dev', self.name])

            # The link address change is asynchronous, so we need to wait for the
            # address to stick or we have a race condition.
            timeout = 30
            while self.ipv4_address != ipv4_address and timeout != 0:
                self.refresh()
                time.sleep(1)
                timeout -= 1

            if self.ipv4_address != ipv4_address:
                # Report the address we were asked to set, not whatever the
                # interface currently holds.
                raise RuntimeError(
                    'Unable to set the address %s for interface %s' %
                    (ipv4_address, self.name))

            node_admin.write_ifcfg(self.device, self.mac_address,
                                   self.ipv4_address, self.ipv4_netmask)
        else:
            console_log.info("Nothing to do as %s already has address %s" %
                             (self.name, ifaddr))
Пример #9
0
def _remove_module(name, modules):
    try:
        m = modules[name]
    except KeyError:
        # It's not loaded, do nothing.
        return None

    console_log.info("Removing %d dependents of %s : %s" %
                     (len(m.dependents), name, m.dependents))
    while m.dependents:
        error = _remove_module(m.dependents.pop(), modules)

        if error:
            return error

    console_log.info("Removing %s" % name)

    error = AgentShell.run_canned_error_message(['rmmod', name])

    if error:
        return error

    modules.pop(name)
    for m in modules.values():
        if name in m.dependents:
            m.dependents.remove(name)

    return None
Пример #10
0
    def private_key_file(self):
        """Return the path to the private key PEM file, generating the key if the file is missing."""
        if os.path.exists(self.PRIVATE_KEY_FILE):
            return self.PRIVATE_KEY_FILE

        console_log.info("Generating private key")
        genrsa_cmd = [
            'openssl', 'genrsa', '-out', self.PRIVATE_KEY_FILE, '2048',
            '-sha256'
        ]
        AgentShell.try_run(genrsa_cmd)

        return self.PRIVATE_KEY_FILE
Пример #11
0
    def disable_and_kill():
        """Disable and then stop the ``iml-storage-server.target`` service."""
        console_log.info("Terminating")

        target = ServiceControl.create("iml-storage-server.target")
        target.disable()
        target.stop()
Пример #12
0
def generate_ring1_network(ring0):
    """Derive a ring1 address from ring0's host bits and a subnet from ``find_subnet``.

    :param ring0: ring0 interface object; its ``ipv4_network``,
        ``ipv4_prefixlen``, ``ipv4_hostmask`` and ``ipv4_address`` are read.
    :return: tuple ``(address, prefixlen)``, both as strings.
    """
    # find a good place for the ring1 network
    subnet = find_subnet(ring0.ipv4_network, ring0.ipv4_prefixlen)

    # Keep ring0's host part and combine it with the chosen subnet's address.
    host_mask = int(IPAddress(ring0.ipv4_hostmask))
    ring0_addr = int(IPAddress(ring0.ipv4_address))
    host_bits = host_mask & ring0_addr
    address = str(IPAddress(host_bits | int(subnet.ip)))

    console_log.info("Chose %s/%d for ring1 address" % (address, subnet.prefixlen))
    return address, str(subnet.prefixlen)
Пример #13
0
def stonith(node):
    """Look ``node`` up in the Pacemaker configuration and call its ``fence_reboot()``."""
    pacemaker = PacemakerConfig()

    # TODO: signal the manager that a STONITH has been done so that it
    #       doesn't treat it as an AWOL
    console_log.info("Rebooting %s per a STONITH request" % node)

    target_node = pacemaker.get_node(node)
    target_node.fence_reboot()
Пример #14
0
def start_lnet():
    """
    Place lnet into the 'up' state by running ``lnetctl lnet configure --all``.

    Returns whatever ``AgentShell.run_canned_error_message`` returns for
    that command.
    """
    console_log.info("Starting LNet")

    configure_cmd = ["lnetctl", "lnet", "configure", "--all"]
    return AgentShell.run_canned_error_message(configure_cmd)
Пример #15
0
def terminate_block_device_drivers():
    """Call ``terminate_driver()`` on every ``BlockDevice`` subclass, stopping at the first error.

    Returns ``agent_error(error)`` for the first subclass that reports an
    error, otherwise ``agent_result_ok``.
    """
    console_log.info("Terminating drivers for block device types")

    # Lazy generator: subclasses after the first failing one are not called.
    outcomes = (driver_cls.terminate_driver()
                for driver_cls in util.all_subclasses(BlockDevice))
    first_failure = next((outcome for outcome in outcomes if outcome), None)

    if first_failure:
        return agent_error(first_failure)
    return agent_result_ok
def stop_lnet():
    '''
    Place lnet into the 'down' state.

    Modules that depend on lnet (other than ksocklnd and ko2iblnd, which
    are passed as exceptions) are removed first; ``lctl net down`` is only
    run when that step reports no error.
    '''
    console_log.info("Stopping LNet")

    failure = _rmmod_deps("lnet", excpt=["ksocklnd", "ko2iblnd"])
    if not failure:
        failure = AgentShell.run_canned_error_message(["lctl", "net", "down"])

    return agent_ok_or_error(failure)
Пример #17
0
def initialise_block_device_drivers():
    """Call ``initialise_driver(config.profile_managed)`` on every ``BlockDevice`` subclass.

    Stops at the first subclass that reports an error and returns
    ``agent_error(error)``; returns ``agent_result_ok`` otherwise.
    """
    console_log.info("Initialising drivers for block device types")

    # Lazy generator: subclasses after the first failing one are not called.
    outcomes = (driver_cls.initialise_driver(config.profile_managed)
                for driver_cls in util.all_subclasses(BlockDevice))
    first_failure = next((outcome for outcome in outcomes if outcome), None)

    if first_failure:
        return agent_error(first_failure)
    return agent_result_ok
Пример #18
0
    def private_key_file(self):
        """Return the path to the private key PEM file, generating the key if the file is missing."""
        key_missing = not os.path.exists(self.PRIVATE_KEY_FILE)
        if key_missing:
            console_log.info("Generating private key")
            genrsa_cmd = ["openssl", "genrsa", "-out", self.PRIVATE_KEY_FILE]
            genrsa_cmd += ["2048", "-sha256"]
            AgentShell.try_run(genrsa_cmd)

        return self.PRIVATE_KEY_FILE
def start_lnet():
    '''
    Place lnet into the 'up' state.

    Runs ``lctl net up`` and, only when that reports no error,
    ``modprobe lustre``.
    '''
    console_log.info("Starting LNet")

    failure = AgentShell.run_canned_error_message(["lctl", "net", "up"])
    if not failure:
        # modprobe lustre is a hack for HYD-1263 - Fix or work around LU-1279 - failure trying to mount
        # should be removed when LU-1279 is fixed
        failure = AgentShell.run_canned_error_message(["modprobe", "lustre"])

    return agent_ok_or_error(failure)
Пример #20
0
def get_cluster_node_name():
    """Return the name reported by ``crm_node -n``, or ``socket.getfqdn()`` if that fails.

    Any exception from running ``crm_node`` is logged and treated as a
    reason to fall back; it is never propagated.
    """
    try:
        node_name = AgentShell.try_run(["crm_node", "-n"]).strip()
    except Exception as err:
        fallback_notice = "Could not get cluster node name {}. Falling back to socket.getfqdn()".format(
            err
        )
        console_log.info(fallback_notice)
        return socket.getfqdn()

    return node_name
Пример #21
0
def stop_lnet():
    """
    Place lnet into the 'down' state.

    Runs ``lustre_rmmod ptlrpc`` first and, only when that reports no
    error, ``lnetctl lnet unconfigure``.
    """
    console_log.info("Stopping LNet")

    failure = AgentShell.run_canned_error_message(["lustre_rmmod", "ptlrpc"])
    if not failure:
        failure = AgentShell.run_canned_error_message(
            ["lnetctl", "lnet", "unconfigure"])

    return agent_ok_or_error(failure)
Пример #22
0
def find_unused_port(ring0, timeout=10, batch_count=10000):
    """Pick a random candidate port that was not seen in use on ring0's multicast address.

    Captures UDP traffic addressed to ``ring0.mcastaddr`` for ``timeout``
    seconds, drops every destination port observed from the candidate list
    (every second port from 32767 up to, but excluding, 65535) and returns
    a random choice from what is left. A temporary firewall rule is added
    for the duration of the capture and always removed afterwards.

    :param ring0: ring interface object; ``mcastaddr`` and ``name`` are read
        and the object is handed to the ``networking`` helpers.
    :param timeout: capture duration in seconds.
    :param batch_count: first argument passed to each ``cap.dispatch`` call.
    :return: one port from the remaining candidates.
    :raises RuntimeError: if reading from the capture fails.
    :raises IndexError: from ``random.choice`` if no candidate is left.
    """
    from random import choice

    dest_addr = ring0.mcastaddr
    port_min = 32767
    port_max = 65535
    # Materialise the candidates: a Python 3 ``range`` object has no
    # ``remove()``, so the removals below need a real list.
    ports = list(range(port_min, port_max, 2))
    portrange_str = "%s-%s" % (port_min, port_max)

    firewall_control.add_rule(
        0, "tcp", "find unused port", persist=False, address=ring0.mcastaddr
    )

    try:
        networking.subscribe_multicast(ring0)
        console_log.info(
            "Sniffing for packets to %s on %s within port range %s"
            % (dest_addr, ring0.name, portrange_str)
        )
        cap = networking.start_cap(
            ring0,
            timeout,
            "host %s and udp and portrange %s" % (dest_addr, portrange_str),
        )

        def recv_packets(header, data):
            # A port seen on the wire is in use, so it stops being a candidate.
            tgt_port = networking.get_dport_from_packet(data)

            try:
                ports.remove(tgt_port)
            except ValueError:
                # already removed
                pass

        packet_count = 0
        start = time.time()
        while time.time() - start < timeout:
            try:
                packet_count += cap.dispatch(batch_count, recv_packets)
            except Exception as e:
                raise RuntimeError("Error reading from the network: %s" % str(e))

        console_log.info(
            "Finished after %d seconds, sniffed: %d"
            % (time.time() - start, packet_count)
        )
    finally:
        firewall_control.remove_rule(
            0, "tcp", "find unused port", persist=False, address=ring0.mcastaddr
        )

    return choice(ports)
Пример #23
0
def start_target(ha_label):
    '''
    Start the high availability target

    Return: Value using simple return protocol
    '''
    # HYD-1989: brute force, keep retrying the start (4 attempts in total)
    for attempt in range(1, 5):
        failure = AgentShell.run_canned_error_message([
            'crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v',
            'Started'
        ])
        if failure:
            return agent_error(failure)

        # now wait for it to start
        _wait_target(ha_label, True)

        # and make sure it didn't start but (the RA) fail(ed): the target
        # only counts as started when crm_mon lists it on a line that does
        # not contain "FAILED".
        _rc, mon_output, _stderr = AgentShell.run_old(['crm_mon', '-1'])
        started_cleanly = any(
            line.lstrip().startswith(ha_label) and line.find("FAILED") < 0
            for line in mon_output.split("\n"))

        if started_cleanly:
            location = get_resource_location(ha_label)
            if not location:
                return agent_error("Started %s but now can't locate it!" %
                                   ha_label)
            return agent_result(location)

        # try to leave things in a sane state for a failed mount
        failure = AgentShell.run_canned_error_message([
            'crm_resource', '-r', ha_label, '-p', 'target-role', '-m',
            '-v', 'Stopped'
        ])
        if failure:
            return agent_error(failure)

        if attempt < 4:
            console_log.info("failed to start target %s" % ha_label)

    return agent_error("Failed to start target %s" % ha_label)
Пример #24
0
def terminate_block_device_drivers():
    """
    Give every block device type the chance to do any termination it needs
    when the agent is stopped; this function may also be called by the
    manager.

    Returns agent_error for the first driver that reports an error,
    agent_result_ok when none does.
    """
    console_log.info("Terminating drivers for block device types")

    for driver_cls in util.all_subclasses(BlockDevice):
        failure = driver_cls.terminate_driver()
        if failure:
            break
    else:
        # Loop finished without any driver reporting an error.
        return agent_result_ok

    return agent_error(failure)
Пример #25
0
def initialise_block_device_drivers():
    """
    Give every block device type the chance to do any initialization it
    needs when the agent is run; this function may also be called by the
    manager.

    Returns agent_error for the first driver that reports an error,
    agent_result_ok when none does.
    """
    console_log.info("Initialising drivers for block device types")

    for driver_cls in util.all_subclasses(BlockDevice):
        failure = driver_cls.initialise_driver(config.profile_managed)
        if failure:
            break
    else:
        # Loop finished without any driver reporting an error.
        return agent_result_ok

    return agent_error(failure)
Пример #26
0
def start_target(ha_label):
    """
    Start the high availability target

    If the resource is already running it is only moved to its primary node
    when it is running somewhere else. Otherwise the resource (and, when a
    zfs resource exists for it, the zfs resource and the group) is enabled
    and the function waits for the target to come up.

    Return: Value using simple return protocol
    """
    if not _resource_exists(ha_label):
        return agent_error("Target {} does not exist".format(ha_label))

    current_node = get_resource_location(ha_label)
    primary_node = _find_resource_constraint(ha_label, True)

    if current_node:
        if current_node == primary_node:
            return agent_result(current_node)

        # resource already started but not on primary, move it
        console_log.info(
            "Resource %s already started, moving to primary node %s",
            ha_label,
            primary_node,
        )
        move_failure = _move_target(ha_label, primary_node)
        if move_failure:
            return agent_error(move_failure)
        return agent_result(primary_node)

    try:
        _res_set_started(ha_label, True)
        if _resource_exists(_zfs_name(ha_label)):
            _res_set_started(_zfs_name(ha_label), True)
            # enable group also, in case group was disabled
            _res_set_started(_group_name(ha_label), True)

        # now wait for it to start
        came_up = _wait_target(ha_label, True)
        if not came_up:
            # try to leave things in a sane state for a failed mount
            _res_set_started(ha_label, False)
            return agent_error("Failed to start target {}".format(ha_label))

        started_on = get_resource_location(ha_label)
        if started_on:
            return agent_result(started_on)

        return agent_error(
            "Started {} but now can't locate it!".format(ha_label))

    except AgentShell.CommandExecutionError as err:
        outcome = err.result
        return agent_error(
            "Error (%s) running '%s': '%s' '%s'" %
            (outcome.rc, err.command, outcome.stdout, outcome.stderr))
Пример #27
0
def find_unused_port(ring0, timeout=10, batch_count=10000):
    """Pick a random candidate port that was not seen in use on ring0's multicast address.

    Sniffs ``ring0.name`` for ``timeout`` seconds, keeps UDP packets sent to
    ``ring0.mcastaddr`` with a destination port inside the candidate range,
    drops every such destination port from the candidate list (every second
    port from 32767 up to, but excluding, 65535) and returns a random choice
    from what is left. A temporary firewall rule is added for the duration
    of the capture and always removed afterwards.

    :param ring0: ring interface object; ``mcastaddr`` and ``name`` are read.
    :param timeout: capture duration in seconds.
    :param batch_count: unused here; kept for interface compatibility.
    :return: one port from the remaining candidates.
    :raises IndexError: from ``random.choice`` if no candidate is left.
    """
    from random import choice

    dest_addr = ring0.mcastaddr
    port_min = 32767
    port_max = 65535
    # A real list, so that remove() below works regardless of what range()
    # returns on the running interpreter.
    ports = list(range(port_min, port_max, 2))
    portrange_str = "%s-%s" % (port_min, port_max)

    firewall_control.add_rule(
        0, "tcp", "find unused port", persist=False, address=ring0.mcastaddr
    )

    try:
        console_log.info(
            "Sniffing packets on {}({}) within range: {}".format(
                ring0.name, dest_addr, portrange_str
            )
        )

        # NOTE(review): assumes sniff/UDP/IP are scapy's, in which case
        # sniff() returns the captured packets themselves, not port numbers.
        packets = sniff(
            iface=ring0.name,
            # haslayer(IP) guards the x[IP] lookup against UDP packets that
            # carry no IPv4 layer.
            lfilter=lambda x: x.haslayer(UDP)
            and x.haslayer(IP)
            and isinstance(x[UDP].dport, (int, long))
            and x[UDP].dport >= port_min
            and x[UDP].dport <= port_max
            and x[IP].dst == dest_addr,
            timeout=timeout,
        )

        console_log.info(
            "Finished after %d seconds, sniffed: %d" % (timeout, len(packets))
        )

        for packet in packets:
            try:
                # Remove the observed destination port. Removing the packet
                # object itself never matches an int, so nothing would be
                # excluded.
                ports.remove(packet[UDP].dport)
            except ValueError:
                # already removed, or not one of the candidate ports
                pass

    finally:
        firewall_control.remove_rule(
            0, "tcp", "find unused port", persist=False, address=ring0.mcastaddr
        )

    return choice(ports)
Пример #28
0
def get_resource_locations():
    """Run `crm_mon -1 -r -X` to identify where (if anywhere) resources
    (i.e. targets) are running.

    Returns the result of parsing crm_mon's stdout with
    `_get_resource_locations` (presumably { resource_id: location|None, ... }),
    or an empty dict when crm_mon is not installed or exits non-zero.
    """
    try:
        outcome = AgentShell.run(["crm_mon", "-1", "-r", "-X"])
    except OSError as err:
        # ENOENT is fine here.  Pacemaker might not be installed yet.
        if err.errno == errno.ENOENT:
            return {}
        raise err

    if outcome.rc == 0:
        return _get_resource_locations(outcome.stdout)

    console_log.info("crm_mon failed (%d): '%s' '%s'", outcome.rc,
                     outcome.stdout, outcome.stderr)
    return {}
Пример #29
0
def get_ring0():
    """Return the ``CorosyncRingInterface`` for the device that routes to the manager.

    The manager host is taken from the ``IML_MANAGER_URL`` environment
    variable, resolved to an address, and the outgoing device is read from
    the output of ``ip route get``.

    :raises RuntimeError: if no ``dev`` entry is found in the route output,
        or if the interface's IPv4 prefix length is below 9.
    """
    # ring0 will always be on the interface used for agent->manager comms
    from urlparse import urlparse

    server_url = urljoin(os.environ["IML_MANAGER_URL"], "agent")
    manager_host = urlparse(server_url).hostname
    manager_address = socket.gethostbyname(manager_host)

    route_output = AgentShell.try_run(
        ['/sbin/ip', 'route', 'get', manager_address])
    dev_match = re.search(r'dev\s+([^\s]+)', route_output)
    if not dev_match:
        raise RuntimeError("Unable to find ring0 dev in %s" % route_output)
    manager_dev = dev_match.group(1)

    console_log.info("Chose %s for corosync ring0" % manager_dev)
    ring0 = CorosyncRingInterface(manager_dev)

    prefix_too_short = ring0.ipv4_prefixlen < 9
    if prefix_too_short:
        raise RuntimeError("%s subnet is too large (/%s)" %
                           (ring0.name, ring0.ipv4_prefixlen))

    return ring0
Пример #30
0
def clear_targets(force=False):
    """Stop and unconfigure every HA target returned by ``_query_ha_targets()``.

    Without ``force`` nothing is touched: a warning describing the
    consequences is logged and the process exits immediately with status 1
    via ``os._exit``.

    :param force: must be truthy for the targets to actually be cleared.
    """
    if not force:
        from os import _exit
        import textwrap
        warning = """
        clear-targets will forcibly unmount and unconfigure all Lustre targets
        on EVERY node in this HA domain.  This is an irreversible and
        potentially very destructive operation.  Data loss may occur.  Please
        do not use it unless you fully understand the consequences!  If you
        are sure that this command does what you intend to do, then you must
        supply the --force flag to avoid seeing this message.
        """
        # Use warning(): warn() is the deprecated alias, and warning() is
        # what the rest of the agent code calls on console_log.
        console_log.warning(textwrap.fill(textwrap.dedent(warning)))
        _exit(1)

    for resource, attrs in _query_ha_targets().items():
        console_log.info("Stopping %s" % resource)
        # The result of the stop is not checked; unconfiguring goes ahead
        # regardless.
        stop_target(attrs['ha_label'])
        console_log.info("Unconfiguring %s" % resource)
        unconfigure_target_ha(True, attrs['ha_label'], attrs['uuid'])