Example #1
File: agent.py Project: smanjara/ceph
 def _check_agent(self, host: str) -> bool:
     down = False
     try:
         assert self.mgr.cherrypy_thread
         assert self.mgr.cherrypy_thread.ssl_certs.get_root_cert()
     except Exception:
         self.mgr.log.debug(
             f'Delaying checking agent on {host} until cephadm endpoint finished creating root cert')
         return down
     if self.mgr.agent_helpers._agent_down(host):
         down = True
     try:
         agent = self.mgr.cache.get_daemons_by_type('agent', host=host)[0]
         assert agent.daemon_id is not None
         assert agent.hostname is not None
     except Exception as e:
         self.mgr.log.debug(
             f'Could not retrieve agent on host {host} from daemon cache: {e}')
         return down
     try:
         spec = self.mgr.spec_store.active_specs.get('agent', None)
         deps = self.mgr._calc_daemon_deps(spec, 'agent', agent.daemon_id)
         last_deps, last_config = self.mgr.agent_cache.get_agent_last_config_deps(host)
         if not last_config or last_deps != deps:
             # if root cert is the dep that changed, we must use ssh to reconfig
             # so it's necessary to check this one specifically
             root_cert_match = False
             try:
                 root_cert = self.mgr.cherrypy_thread.ssl_certs.get_root_cert()
                 if last_deps and root_cert in last_deps:
                     root_cert_match = True
             except Exception:
                 pass
             daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(agent)
             # we need to know the agent port to try to reconfig w/ http
             # otherwise there is no choice but a full ssh reconfig
             if host in self.mgr.agent_cache.agent_ports and root_cert_match and not down:
                 daemon_spec = self.mgr.cephadm_services[daemon_type_to_service(
                     daemon_spec.daemon_type)].prepare_create(daemon_spec)
                 self.mgr.agent_helpers._request_agent_acks(
                     hosts={daemon_spec.host},
                     increment=True,
                     daemon_spec=daemon_spec,
                 )
             else:
                 self.mgr._daemon_action(daemon_spec, action='reconfig')
             return down
     except Exception as e:
         self.mgr.log.debug(
             f'Agent on host {host} not ready to have config and deps checked: {e}')
     action = self.mgr.cache.get_scheduled_daemon_action(agent.hostname, agent.name())
     if action:
         try:
             daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(agent)
             self.mgr._daemon_action(daemon_spec, action=action)
             self.mgr.cache.rm_scheduled_daemon_action(agent.hostname, agent.name())
         except Exception as e:
             self.mgr.log.debug(
                 f'Agent on host {host} not ready to {action}: {e}')
     return down
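The boolean returned above means "the agent on this host is down". A hypothetical caller sketch (not from the Ceph source; it assumes the host cache exposes the known hostnames via get_hosts()):

def _down_agent_hosts(self) -> List[str]:
    # hypothetical helper: collect hosts whose agent _check_agent reports down
    # (get_hosts() yielding hostnames is an assumption, not shown above)
    return [h for h in self.mgr.cache.get_hosts() if self._check_agent(h)]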
Example #2
File: agent.py Project: potatogim/ceph
 def _check_agent(self, host: str) -> bool:
     try:
         assert self.mgr.cherrypy_thread
         assert self.mgr.cherrypy_thread.ssl_certs.get_root_cert()
     except Exception:
         self.mgr.log.debug(
             f'Delaying checking agent on {host} until cephadm endpoint finished creating root cert'
         )
         return False
     if self.mgr.agent_helpers._agent_down(host):
         return True
     else:
         try:
             agent = self.mgr.cache.get_daemons_by_type('agent',
                                                        host=host)[0]
             assert agent.daemon_id is not None
             assert agent.hostname is not None
         except Exception as e:
             self.mgr.log.debug(
                 f'Could not retrieve agent on host {host} from daemon cache: {e}'
             )
             return False
         try:
             spec = self.mgr.spec_store.active_specs.get('agent', None)
             deps = self.mgr._calc_daemon_deps(spec, 'agent',
                                               agent.daemon_id)
             last_deps, last_config = self.mgr.cache.get_daemon_last_config_deps(
                 host, agent.name())
             if not last_config or last_deps != deps:
                 daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(
                     agent)
                 self.mgr._daemon_action(daemon_spec, action='reconfig')
                 return False
         except Exception as e:
             self.mgr.log.debug(
                 f'Agent on host {host} not ready to have config and deps checked: {e}'
             )
         action = self.mgr.cache.get_scheduled_daemon_action(
             agent.hostname, agent.name())
         if action:
             try:
                 daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(
                     agent)
                 self.mgr._daemon_action(daemon_spec, action=action)
                 self.mgr.cache.rm_scheduled_daemon_action(
                     agent.hostname, agent.name())
             except Exception as e:
                 self.mgr.log.debug(
                     f'Agent on host {host} not ready to {action}: {e}')
         return False
Example #3
File: osd.py Project: huashengand/ceph
    def deploy_osd_daemons_for_existing_osds(
            self,
            host: str,
            service_name: str,
            replace_osd_ids: Optional[List[str]] = None) -> str:

        if replace_osd_ids is None:
            replace_osd_ids = OsdIdClaims(self.mgr).filtered_by_host(host)
            assert replace_osd_ids is not None
        # check result
        osds_elems: dict = CephadmServe(self.mgr)._run_cephadm_json(
            host, 'osd', 'ceph-volume', [
                '--',
                'lvm',
                'list',
                '--format',
                'json',
            ])
        before_osd_uuid_map = self.mgr.get_osd_uuid_map(only_up=True)
        fsid = self.mgr._cluster_fsid
        osd_uuid_map = self.mgr.get_osd_uuid_map()
        created = []
        for osd_id, osds in osds_elems.items():
            for osd in osds:
                if osd['type'] == 'db':
                    continue
                if osd['tags']['ceph.cluster_fsid'] != fsid:
                    logger.debug('mismatched fsid, skipping %s' % osd)
                    continue
                if osd_id in before_osd_uuid_map and osd_id not in replace_osd_ids:
                    # already exists and is not being replaced: skip it
                    continue
                if osd_id not in osd_uuid_map:
                    logger.debug(
                        'osd id {} does not exist in cluster'.format(osd_id))
                    continue
                if osd_uuid_map.get(osd_id) != osd['tags']['ceph.osd_fsid']:
                    logger.debug('mismatched osd uuid (cluster has %s, osd '
                                 'has %s)' % (osd_uuid_map.get(osd_id),
                                              osd['tags']['ceph.osd_fsid']))
                    continue

                created.append(osd_id)
                daemon_spec: CephadmDaemonDeploySpec = CephadmDaemonDeploySpec(
                    service_name=service_name,
                    daemon_id=osd_id,
                    host=host,
                    daemon_type='osd',
                )
                daemon_spec.final_config, daemon_spec.deps = self.generate_config(
                    daemon_spec)
                CephadmServe(self.mgr)._create_daemon(
                    daemon_spec, osd_uuid_map=osd_uuid_map)

        if created:
            self.mgr.cache.invalidate_host_devices(host)
            self.mgr.cache.invalidate_autotune(host)
            return "Created osd(s) %s on host '%s'" % (','.join(created), host)
        else:
            return "Created no osd(s) on host %s; already created?" % host
Example #4
    def test_iscsi_client_caps(self):

        iscsi_daemon_spec = CephadmDaemonDeploySpec(
            host='host',
            daemon_id='a',
            service_name=self.iscsi_spec.service_name())

        self.iscsi_service.prepare_create(iscsi_daemon_spec)

        expected_caps = [
            'mon',
            'profile rbd, allow command "osd blocklist", allow command "config-key get" with "key" prefix "iscsi/"',
            'mgr', 'allow command "service status"', 'osd', 'allow rwx'
        ]

        expected_call = call({
            'prefix': 'auth get-or-create',
            'entity': 'client.iscsi.a',
            'caps': expected_caps
        })
        expected_call2 = call({
            'prefix': 'auth caps',
            'entity': 'client.iscsi.a',
            'caps': expected_caps
        })

        assert expected_call in self.mgr.mon_command.mock_calls
        assert expected_call2 in self.mgr.mon_command.mock_calls
Example #5
    def test_iscsi_client_caps(self):
        mgr = FakeMgr()
        iscsi_service = self._get_services(mgr)['iscsi']

        iscsi_spec = IscsiServiceSpec(service_type='iscsi', service_id="a")
        iscsi_spec.daemon_type = "iscsi"
        iscsi_spec.daemon_id = "a"
        iscsi_spec.spec = MagicMock()
        iscsi_spec.spec.daemon_type = "iscsi"
        iscsi_spec.spec.ssl_cert = ''

        mgr.spec_store = MagicMock()
        mgr.spec_store.__getitem__.return_value = iscsi_spec

        iscsi_daemon_spec = CephadmDaemonDeploySpec(
            host='host', daemon_id='a', service_name=iscsi_spec.service_name())

        iscsi_service.prepare_create(iscsi_daemon_spec)

        expected_caps = ['mon',
                         'profile rbd, allow command "osd blocklist", allow command "config-key get" with "key" prefix "iscsi/"',
                         'mgr', 'allow command "service status"',
                         'osd', 'allow rwx']

        expected_call = call({'prefix': 'auth get-or-create',
                              'entity': 'client.iscsi.a',
                              'caps': expected_caps})
        expected_call2 = call({'prefix': 'auth caps',
                               'entity': 'client.iscsi.a',
                               'caps': expected_caps})

        assert expected_call in mgr.mon_command.mock_calls
        assert expected_call2 in mgr.mon_command.mock_calls
Example #6
 def prepare_create(
         self,
         daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
     assert self.TYPE == daemon_spec.daemon_type
     daemon_spec.final_config, daemon_spec.deps = self.generate_config(
         daemon_spec)
     return daemon_spec
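The tests in Examples #4 and #5 show the typical call pattern for this method: build a CephadmDaemonDeploySpec by hand, then run it through prepare_create. A condensed sketch of that pattern ('host' and 'a' are the placeholder values used in those tests):

daemon_spec = CephadmDaemonDeploySpec(
    host='host',
    daemon_id='a',
    service_name=iscsi_spec.service_name())
daemon_spec = iscsi_service.prepare_create(daemon_spec)
# after prepare_create, daemon_spec.final_config and daemon_spec.deps
# are populated for the deploy step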
Example #7
File: jaeger.py Project: drunkard/ceph
 def prepare_create(
         self,
         daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
     assert self.TYPE == daemon_spec.daemon_type
     elasticsearch_nodes = get_elasticsearch_nodes(self, daemon_spec)
     daemon_spec.final_config = {
         'elasticsearch_nodes': ",".join(elasticsearch_nodes)
     }
     return daemon_spec
Example #8
 def agent_config_successfully_delivered(
         self, daemon_spec: CephadmDaemonDeploySpec) -> None:
     # agent successfully received new config. Update config/deps
     assert daemon_spec.service_name == 'agent'
     self.update_daemon_config_deps(daemon_spec.host, daemon_spec.name(),
                                    daemon_spec.deps, datetime_now())
     self.agent_timestamp[daemon_spec.host] = datetime_now()
     self.agent_counter[daemon_spec.host] = 1
     self.save_host(daemon_spec.host)
Example #9
 def test_grafana_initial_admin_pw(self,
                                   cephadm_module: CephadmOrchestrator):
     with with_host(cephadm_module, 'test'):
         with with_service(cephadm_module, ServiceSpec('mgr')) as _, \
                  with_service(cephadm_module, GrafanaSpec(initial_admin_password='secure')):
             out = cephadm_module.cephadm_services[
                 'grafana'].generate_config(
                     CephadmDaemonDeploySpec('test', 'daemon', 'grafana'))
             assert out == ({
                 'files': {
                     'grafana.ini':
                     '# This file is generated by cephadm.\n'
                     '[users]\n'
                     '  default_theme = light\n'
                     '[auth.anonymous]\n'
                     '  enabled = true\n'
                     "  org_name = 'Main Org.'\n"
                     "  org_role = 'Viewer'\n"
                     '[server]\n'
                     "  domain = 'bootstrap.storage.lab'\n"
                     '  protocol = https\n'
                     '  cert_file = /etc/grafana/certs/cert_file\n'
                     '  cert_key = /etc/grafana/certs/cert_key\n'
                     '  http_port = 3000\n'
                     '  http_addr = \n'
                     '[security]\n'
                     '  admin_user = admin\n'
                     '  admin_password = secure\n'
                     '  cookie_secure = true\n'
                     '  cookie_samesite = none\n'
                     '  allow_embedding = true',
                     'provisioning/datasources/ceph-dashboard.yml':
                     "# This file is generated by cephadm.\n"
                     'deleteDatasources:\n\n'
                     "  - name: 'Loki'\n"
                     '    orgId: 2\n\n'
                     'datasources:\n\n'
                     "  - name: 'Loki'\n"
                     "    type: 'loki'\n"
                     "    access: 'proxy'\n"
                     '    orgId: 2\n'
                     "    url: 'http://[1::4]:3100'\n"
                     '    basicAuth: false\n'
                     '    isDefault: true\n'
                     '    editable: false',
                     'certs/cert_file':
                     ANY,
                     'certs/cert_key':
                     ANY
                 }
             }, [])
Example #10
File: jaeger.py Project: drunkard/ceph
 def prepare_create(
         self,
         daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
     assert self.TYPE == daemon_spec.daemon_type
     collectors = []
     for dd in self.mgr.cache.get_daemons_by_type(
             JaegerCollectorService.TYPE):
         # scrape jaeger-collector nodes
         assert dd.hostname is not None
          port = (dd.ports[0] if dd.ports
                  else JaegerCollectorService.DEFAULT_SERVICE_PORT)
         url = build_url(host=dd.hostname, port=port).lstrip('/')
         collectors.append(url)
     daemon_spec.final_config = {'collector_nodes': ",".join(collectors)}
     return daemon_spec
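The port fallback above (first advertised port, else the service default) is easy to verify in isolation. A standalone sketch; the default port value here is illustrative, not necessarily the real JaegerCollectorService.DEFAULT_SERVICE_PORT:

from typing import List, Optional, Tuple

DEFAULT_PORT = 14250  # illustrative stand-in for DEFAULT_SERVICE_PORT

def collector_nodes(daemons: List[Tuple[str, Optional[List[int]]]]) -> str:
    # mirrors the loop above: first advertised port, else the default
    urls = []
    for hostname, ports in daemons:
        port = ports[0] if ports else DEFAULT_PORT
        urls.append(f'{hostname}:{port}')
    return ','.join(urls)

assert collector_nodes([('node1', [4317]), ('node2', None)]) == 'node1:4317,node2:14250'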
Example #11
File: ingress.py Project: zhangsw/ceph
    def haproxy_prepare_create(
        self,
        daemon_spec: CephadmDaemonDeploySpec,
    ) -> CephadmDaemonDeploySpec:
        assert daemon_spec.daemon_type == 'haproxy'

        daemon_id = daemon_spec.daemon_id
        host = daemon_spec.host
        spec = cast(IngressSpec,
                    self.mgr.spec_store[daemon_spec.service_name].spec)

        logger.debug('prepare_create haproxy.%s on host %s with spec %s' %
                     (daemon_id, host, spec))

        daemon_spec.final_config, daemon_spec.deps = self.haproxy_generate_config(
            daemon_spec)

        return daemon_spec
Example #12
    def test_ingress_config(self, _run_cephadm,
                            cephadm_module: CephadmOrchestrator):
        _run_cephadm.side_effect = async_side_effect(('{}', '', 0))

        with with_host(cephadm_module, 'test'):
            cephadm_module.cache.update_host_networks(
                'test', {'1.2.3.0/24': {
                    'if0': ['1.2.3.4/32']
                }})

            # the ingress backend
            s = RGWSpec(service_id="foo",
                        placement=PlacementSpec(count=1),
                        rgw_frontend_type='beast')

            ispec = IngressSpec(service_type='ingress',
                                service_id='test',
                                backend_service='rgw.foo',
                                frontend_port=8089,
                                monitor_port=8999,
                                monitor_user='admin',
                                monitor_password='12345',
                                keepalived_password='12345',
                                virtual_interface_networks=['1.2.3.0/24'],
                                virtual_ip="1.2.3.4/32")
            with with_service(cephadm_module, s) as _, \
                    with_service(cephadm_module, ispec) as _:
                # generate the keepalived conf based on the specified spec
                keepalived_generated_conf = cephadm_module.cephadm_services[
                    'ingress'].keepalived_generate_config(
                        CephadmDaemonDeploySpec(
                            host='test',
                            daemon_id='ingress',
                            service_name=ispec.service_name()))

                keepalived_expected_conf = {
                    'files': {
                        'keepalived.conf':
                        '# This file is generated by cephadm.\n'
                        'vrrp_script check_backend {\n    '
                        'script "/usr/bin/curl http://localhost:8999/health"\n    '
                        'weight -20\n    '
                        'interval 2\n    '
                        'rise 2\n    '
                        'fall 2\n}\n\n'
                        'vrrp_instance VI_0 {\n  '
                        'state MASTER\n  '
                        'priority 100\n  '
                        'interface if0\n  '
                        'virtual_router_id 51\n  '
                        'advert_int 1\n  '
                        'authentication {\n      '
                        'auth_type PASS\n      '
                        'auth_pass 12345\n  '
                        '}\n  '
                        'unicast_src_ip 1::4\n  '
                        'unicast_peer {\n  '
                        '}\n  '
                        'virtual_ipaddress {\n    '
                        '1.2.3.4/32 dev if0\n  '
                        '}\n  '
                        'track_script {\n      '
                        'check_backend\n  }\n'
                        '}'
                    }
                }

                # check keepalived config
                assert keepalived_generated_conf[0] == keepalived_expected_conf

                # generate the haproxy conf based on the specified spec
                haproxy_generated_conf = cephadm_module.cephadm_services[
                    'ingress'].haproxy_generate_config(
                        CephadmDaemonDeploySpec(
                            host='test',
                            daemon_id='ingress',
                            service_name=ispec.service_name()))

                haproxy_expected_conf = {
                    'files': {
                        'haproxy.cfg':
                        '# This file is generated by cephadm.'
                        '\nglobal\n    log         '
                        '127.0.0.1 local2\n    '
                        'chroot      /var/lib/haproxy\n    '
                        'pidfile     /var/lib/haproxy/haproxy.pid\n    '
                        'maxconn     8000\n    '
                        'daemon\n    '
                        'stats socket /var/lib/haproxy/stats\n'
                        '\ndefaults\n    '
                        'mode                    http\n    '
                        'log                     global\n    '
                        'option                  httplog\n    '
                        'option                  dontlognull\n    '
                        'option http-server-close\n    '
                        'option forwardfor       except 127.0.0.0/8\n    '
                        'option                  redispatch\n    '
                        'retries                 3\n    '
                        'timeout queue           20s\n    '
                        'timeout connect         5s\n    '
                        'timeout http-request    1s\n    '
                        'timeout http-keep-alive 5s\n    '
                        'timeout client          1s\n    '
                        'timeout server          1s\n    '
                        'timeout check           5s\n    '
                        'maxconn                 8000\n'
                        '\nfrontend stats\n    '
                        'mode http\n    '
                        'bind 1.2.3.4:8999\n    '
                        'bind localhost:8999\n    '
                        'stats enable\n    '
                        'stats uri /stats\n    '
                        'stats refresh 10s\n    '
                        'stats auth admin:12345\n    '
                        'http-request use-service prometheus-exporter if { path /metrics }\n    '
                        'monitor-uri /health\n'
                        '\nfrontend frontend\n    '
                        'bind 1.2.3.4:8089\n    '
                        'default_backend backend\n\n'
                        'backend backend\n    '
                        'option forwardfor\n    '
                        'balance static-rr\n    '
                        'option httpchk HEAD / HTTP/1.0\n    '
                        'server ' + haproxy_generated_conf[1][0] +
                        ' 1::4:80 check weight 100\n'
                    }
                }

                assert haproxy_generated_conf[0] == haproxy_expected_conf
Example #13
    def _check_agent(self, host: str) -> bool:
        try:
            assert self.mgr.cherrypy_thread
            assert self.mgr.cherrypy_thread.ssl_certs.get_root_cert()
        except Exception:
            self.mgr.log.debug(
                f'Delaying checking agent on {host} until cephadm endpoint finished creating root cert')
            return False
        if self.mgr.agent_helpers._agent_down(host):
            if host not in self.mgr.offline_hosts:
                self.mgr.cache.metadata_up_to_date[host] = False
                # In case host is actually offline, it's best to reset the connection to avoid
                # a long timeout trying to use an existing connection to an offline host
                self.mgr.ssh._reset_con(host)

                try:
                    # try to schedule redeploy of agent in case it is individually down
                    agent = self.mgr.cache.get_daemons_by_type('agent', host=host)[0]
                    with self.mgr.agent_helpers.agent_lock(host):
                        daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(agent)
                        self.mgr._daemon_action(daemon_spec, action='redeploy')
                except AgentLockException:
                    self.mgr.log.debug(
                        f'Could not redeploy agent on host {host}. Someone else holds agent\'s lock')
                except Exception as e:
                    self.mgr.log.debug(
                        f'Failed to redeploy agent on host {host}. Agent possibly never deployed: {e}')
            return True
        else:
            try:
                agent = self.mgr.cache.get_daemons_by_type('agent', host=host)[0]
                assert agent.daemon_id is not None
                assert agent.hostname is not None
            except Exception as e:
                self.mgr.log.debug(
                    f'Could not retrieve agent on host {host} from daemon cache: {e}')
                return False
            try:
                spec = self.mgr.spec_store.active_specs.get('agent', None)
                deps = self.mgr._calc_daemon_deps(spec, 'agent', agent.daemon_id)
                last_deps, last_config = self.mgr.cache.get_daemon_last_config_deps(
                    host, agent.name())
                if not last_config or last_deps != deps:
                    daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(agent)
                    with self.mgr.agent_helpers.agent_lock(host):
                        self.mgr._daemon_action(daemon_spec, action='reconfig')
                    return False
            except AgentLockException:
                self.mgr.log.debug(
                    f'Could not reconfig agent on host {host}. Someone else holds agent\'s lock')
            except Exception as e:
                self.mgr.log.debug(
                    f'Agent on host {host} not ready to have config and deps checked: {e}')
            action = self.mgr.cache.get_scheduled_daemon_action(agent.hostname, agent.name())
            if action:
                try:
                    with self.mgr.agent_helpers.agent_lock(host):
                        daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(agent)
                        self.mgr._daemon_action(daemon_spec, action=action)
                        self.mgr.cache.rm_scheduled_daemon_action(agent.hostname, agent.name())
                except AgentLockException:
                    self.mgr.log.debug(
                        f'Could not {action} agent on host {host}. Someone else holds agent\'s lock')
                except Exception as e:
                    self.mgr.log.debug(
                        f'Agent on host {host} not ready to {action}: {e}')
            return False
Example #14
    def _create_daemon(
        self,
        daemon_spec: CephadmDaemonDeploySpec,
        reconfig: bool = False,
        osd_uuid_map: Optional[Dict[str, Any]] = None,
    ) -> str:

        with set_exception_subject('service',
                                   orchestrator.DaemonDescription(
                                       daemon_type=daemon_spec.daemon_type,
                                       daemon_id=daemon_spec.daemon_id,
                                       hostname=daemon_spec.host,
                                   ).service_id(),
                                   overwrite=True):

            try:
                image = ''
                start_time = datetime_now()
                ports: List[int] = daemon_spec.ports if daemon_spec.ports else []

                if daemon_spec.daemon_type == 'container':
                    spec = cast(
                        CustomContainerSpec,
                        self.mgr.spec_store[daemon_spec.service_name].spec)
                    image = spec.image
                    if spec.ports:
                        ports.extend(spec.ports)

                if daemon_spec.daemon_type == 'cephadm-exporter':
                    if not reconfig:
                        assert daemon_spec.host
                        deploy_ok = self._deploy_cephadm_binary(
                            daemon_spec.host)
                        if not deploy_ok:
                            msg = f"Unable to deploy the cephadm binary to {daemon_spec.host}"
                            self.log.warning(msg)
                            return msg

                if daemon_spec.daemon_type == 'haproxy':
                    haspec = cast(
                        HA_RGWSpec,
                        self.mgr.spec_store[daemon_spec.service_name].spec)
                    if haspec.haproxy_container_image:
                        image = haspec.haproxy_container_image

                if daemon_spec.daemon_type == 'keepalived':
                    haspec = cast(
                        HA_RGWSpec,
                        self.mgr.spec_store[daemon_spec.service_name].spec)
                    if haspec.keepalived_container_image:
                        image = haspec.keepalived_container_image

                # TCP port to open in the host firewall
                if len(ports) > 0:
                    daemon_spec.extra_args.extend(
                        ['--tcp-ports', ' '.join(map(str, ports))])

                # OSD deployments need an --osd-fsid arg
                if daemon_spec.daemon_type == 'osd':
                    if not osd_uuid_map:
                        osd_uuid_map = self.mgr.get_osd_uuid_map()
                    osd_uuid = osd_uuid_map.get(daemon_spec.daemon_id)
                    if not osd_uuid:
                        raise OrchestratorError('osd.%s not in osdmap' %
                                                daemon_spec.daemon_id)
                    daemon_spec.extra_args.extend(['--osd-fsid', osd_uuid])

                if reconfig:
                    daemon_spec.extra_args.append('--reconfig')
                if self.mgr.allow_ptrace:
                    daemon_spec.extra_args.append('--allow-ptrace')

                if self.mgr.cache.host_needs_registry_login(
                        daemon_spec.host) and self.mgr.registry_url:
                    self._registry_login(daemon_spec.host,
                                         self.mgr.registry_url,
                                         self.mgr.registry_username,
                                         self.mgr.registry_password)

                daemon_spec.extra_args.extend(['--config-json', '-'])

                self.log.info('%s daemon %s on %s' %
                              ('Reconfiguring' if reconfig else 'Deploying',
                               daemon_spec.name(), daemon_spec.host))

                out, err, code = self._run_cephadm(
                    daemon_spec.host,
                    daemon_spec.name(),
                    'deploy', [
                        '--name',
                        daemon_spec.name(),
                    ] + daemon_spec.extra_args,
                    stdin=json.dumps(daemon_spec.final_config),
                    image=image)
                if not code and daemon_spec.host in self.mgr.cache.daemons:
                    # prime cached service state with what we (should have)
                    # just created
                    sd = daemon_spec.to_daemon_description(1, 'starting')
                    self.mgr.cache.add_daemon(daemon_spec.host, sd)
                    if daemon_spec.daemon_type in [
                            'grafana', 'iscsi', 'prometheus', 'alertmanager'
                    ]:
                        self.mgr.requires_post_actions.add(
                            daemon_spec.daemon_type)
                self.mgr.cache.invalidate_host_daemons(daemon_spec.host)
                self.mgr.cache.update_daemon_config_deps(
                    daemon_spec.host, daemon_spec.name(), daemon_spec.deps,
                    start_time)
                self.mgr.cache.save_host(daemon_spec.host)
                msg = "{} {} on host '{}'".format(
                    'Reconfigured' if reconfig else 'Deployed',
                    daemon_spec.name(), daemon_spec.host)
                if not code:
                    self.mgr.events.for_daemon(daemon_spec.name(),
                                               OrchestratorEvent.INFO, msg)
                else:
                    what = 'reconfigure' if reconfig else 'deploy'
                    self.mgr.events.for_daemon(daemon_spec.name(),
                                               OrchestratorEvent.ERROR,
                                               f'Failed to {what}: {err}')
                return msg
            except OrchestratorError:
                if not reconfig:
                    # we have to clean up the daemon. E.g. keyrings.
                    service_type = daemon_type_to_service(
                        daemon_spec.daemon_type)
                    dd = daemon_spec.to_daemon_description(-1, 'failed')
                    self.mgr.cephadm_services[service_type].post_remove(dd)
                raise
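A small detail in Example #14 worth calling out: all ports are packed into a single space-separated --tcp-ports value rather than repeating the flag. A standalone check of that construction:

ports = [9100, 9095]
extra_args = ['--tcp-ports', ' '.join(map(str, ports))]
assert extra_args == ['--tcp-ports', '9100 9095']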
Example #15
    def create_single_host(self,
                           drive_group: DriveGroupSpec,
                           host: str,
                           cmd: str,
                           replace_osd_ids: List[str],
                           env_vars: Optional[List[str]] = None) -> str:
        out, err, code = self._run_ceph_volume_command(host,
                                                       cmd,
                                                       env_vars=env_vars)

        if code == 1 and ', it is already prepared' in '\n'.join(err):
            # HACK: when we create against an existing LV, ceph-volume
            # returns an error and the above message.  To make this
            # command idempotent, tolerate this "error" and continue.
            logger.debug('the device was already prepared; continuing')
            code = 0
        if code:
            raise RuntimeError(
                'cephadm exited with an error code: %d, stderr:%s' %
                (code, '\n'.join(err)))

        # check result
        out, err, code = CephadmServe(self.mgr)._run_cephadm(
            host, 'osd', 'ceph-volume', [
                '--',
                'lvm',
                'list',
                '--format',
                'json',
            ])
        before_osd_uuid_map = self.mgr.get_osd_uuid_map(only_up=True)
        try:
            osds_elems = json.loads('\n'.join(out))
        except ValueError:
            logger.exception('Cannot decode JSON: \'%s\'' % '\n'.join(out))
            osds_elems = {}
        fsid = self.mgr._cluster_fsid
        osd_uuid_map = self.mgr.get_osd_uuid_map()
        created = []
        for osd_id, osds in osds_elems.items():
            for osd in osds:
                if osd['tags']['ceph.cluster_fsid'] != fsid:
                    logger.debug('mismatched fsid, skipping %s' % osd)
                    continue
                if osd_id in before_osd_uuid_map and osd_id not in replace_osd_ids:
                    # already exists and is not being replaced: skip it
                    continue
                if osd_id not in osd_uuid_map:
                    logger.debug(
                        'osd id {} does not exist in cluster'.format(osd_id))
                    continue
                if osd_uuid_map.get(osd_id) != osd['tags']['ceph.osd_fsid']:
                    logger.debug('mismatched osd uuid (cluster has %s, osd '
                                 'has %s)' % (osd_uuid_map.get(osd_id),
                                              osd['tags']['ceph.osd_fsid']))
                    continue

                created.append(osd_id)
                daemon_spec: CephadmDaemonDeploySpec = CephadmDaemonDeploySpec(
                    service_name=drive_group.service_name(),
                    daemon_id=osd_id,
                    host=host,
                    daemon_type='osd',
                )
                daemon_spec.final_config, daemon_spec.deps = self.generate_config(
                    daemon_spec)
                CephadmServe(self.mgr)._create_daemon(
                    daemon_spec, osd_uuid_map=osd_uuid_map)

        if created:
            self.mgr.cache.invalidate_host_devices(host)
            return "Created osd(s) %s on host '%s'" % (','.join(created), host)
        else:
            return "Created no osd(s) on host %s; already created?" % host
Example #16
    def _do_upgrade(self):
        # type: () -> None
        if not self.upgrade_state:
            logger.debug('_do_upgrade no state, exiting')
            return

        target_image = self.target_image
        target_id = self.upgrade_state.target_id
        target_digests = self.upgrade_state.target_digests
        target_version = self.upgrade_state.target_version

        first = False
        if not target_id or not target_version or not target_digests:
            # need to learn the container hash
            logger.info('Upgrade: First pull of %s' % target_image)
            self.upgrade_info_str = 'Doing first pull of %s image' % (
                target_image)
            try:
                target_id, target_version, target_digests = CephadmServe(
                    self.mgr)._get_container_image_info(target_image)
            except OrchestratorError as e:
                self._fail_upgrade(
                    'UPGRADE_FAILED_PULL', {
                        'severity': 'warning',
                        'summary': 'Upgrade: failed to pull target image',
                        'count': 1,
                        'detail': [str(e)],
                    })
                return
            if not target_version:
                self._fail_upgrade(
                    'UPGRADE_FAILED_PULL', {
                        'severity': 'warning',
                        'summary': 'Upgrade: failed to pull target image',
                        'count': 1,
                        'detail': ['unable to extract ceph version from container'],
                    })
                return
            self.upgrade_state.target_id = target_id
            # extract the version portion of 'ceph version {version} ({sha1})'
            self.upgrade_state.target_version = target_version.split(' ')[2]
            self.upgrade_state.target_digests = target_digests
            self._save_upgrade_state()
            target_image = self.target_image
            first = True

        if target_digests is None:
            target_digests = []
        if target_version.startswith('ceph version '):
            # tolerate/fix upgrade state from older version
            self.upgrade_state.target_version = target_version.split(' ')[2]
            target_version = self.upgrade_state.target_version
        target_major, target_minor, target_patch = target_version.split('.')
        target_major_name = self.mgr.lookup_release_name(int(target_major))

        if first:
            logger.info('Upgrade: Target is version %s (%s)' %
                        (target_version, target_major_name))
            logger.info('Upgrade: Target container is %s, digests %s' %
                        (target_image, target_digests))

        version_error = self._check_target_version(target_version)
        if version_error:
            self._fail_upgrade(
                'UPGRADE_BAD_TARGET_VERSION', {
                    'severity': 'error',
                    'summary': f'Upgrade: cannot upgrade/downgrade to {target_version}',
                    'count': 1,
                    'detail': [version_error],
                })
            return

        image_settings = self.get_distinct_container_image_settings()

        daemons = [
            d for d in self.mgr.cache.get_daemons()
            if d.daemon_type in CEPH_UPGRADE_ORDER
        ]
        done = 0
        for daemon_type in CEPH_UPGRADE_ORDER:
            logger.debug('Upgrade: Checking %s daemons' % daemon_type)

            need_upgrade_self = False
            need_upgrade: List[Tuple[DaemonDescription, bool]] = []
            need_upgrade_deployer: List[Tuple[DaemonDescription, bool]] = []
            for d in daemons:
                if d.daemon_type != daemon_type:
                    continue
                assert d.daemon_type is not None
                assert d.daemon_id is not None
                correct_digest = False
                if (any(digest in target_digests
                        for digest in (d.container_image_digests or []))
                        or d.daemon_type in MONITORING_STACK_TYPES):
                    logger.debug('daemon %s.%s container digest correct' %
                                 (daemon_type, d.daemon_id))
                    correct_digest = True
                    if any(digest in target_digests for digest in (d.deployed_by or [])):
                        logger.debug(
                            'daemon %s.%s deployed by correct version' %
                            (d.daemon_type, d.daemon_id))
                        done += 1
                        continue

                if self.mgr.daemon_is_self(d.daemon_type, d.daemon_id):
                    logger.info('Upgrade: Need to upgrade myself (mgr.%s)' %
                                self.mgr.get_mgr_id())
                    need_upgrade_self = True
                    continue

                if correct_digest:
                    logger.debug(
                        'daemon %s.%s not deployed by correct version' %
                        (d.daemon_type, d.daemon_id))
                    need_upgrade_deployer.append((d, True))
                else:
                    logger.debug(
                        'daemon %s.%s not correct (%s, %s, %s)' %
                        (daemon_type, d.daemon_id, d.container_image_name,
                         d.container_image_digests, d.version))
                    need_upgrade.append((d, False))

            if not need_upgrade_self:
                # only after the mgr itself is upgraded can we expect daemons to have
                # deployed_by == target_digests
                need_upgrade += need_upgrade_deployer

            # prepare filesystems for daemon upgrades?
            if (daemon_type == 'mds' and need_upgrade
                    and not self._prepare_for_mds_upgrade(
                        target_major,
                        [d_entry[0] for d_entry in need_upgrade])):
                return

            if need_upgrade:
                self.upgrade_info_str = 'Currently upgrading %s daemons' % (
                    daemon_type)

            to_upgrade: List[Tuple[DaemonDescription, bool]] = []
            known_ok_to_stop: List[str] = []
            for d_entry in need_upgrade:
                d = d_entry[0]
                assert d.daemon_type is not None
                assert d.daemon_id is not None
                assert d.hostname is not None

                if not d.container_image_id:
                    if d.container_image_name == target_image:
                        logger.debug(
                            'daemon %s has unknown container_image_id but has correct image name'
                            % (d.name()))
                        continue

                if known_ok_to_stop:
                    if d.name() in known_ok_to_stop:
                        logger.info(
                            f'Upgrade: {d.name()} is also safe to restart')
                        to_upgrade.append(d_entry)
                    continue

                if d.daemon_type in ['mon', 'osd', 'mds']:
                    # NOTE: known_ok_to_stop is an output argument for
                    # _wait_for_ok_to_stop
                    if not self._wait_for_ok_to_stop(d, known_ok_to_stop):
                        return

                to_upgrade.append(d_entry)

                # if we don't have a list of others to consider, stop now
                if not known_ok_to_stop:
                    break

            num = 1
            for d_entry in to_upgrade:
                d = d_entry[0]
                assert d.daemon_type is not None
                assert d.daemon_id is not None
                assert d.hostname is not None

                self._update_upgrade_progress(done / len(daemons))

                # make sure host has latest container image
                out, errs, code = CephadmServe(self.mgr)._run_cephadm(
                    d.hostname,
                    '',
                    'inspect-image', [],
                    image=target_image,
                    no_fsid=True,
                    error_ok=True)
                if code or not any(digest in target_digests
                                   for digest in json.loads(''.join(out)).get(
                                       'repo_digests', [])):
                    logger.info('Upgrade: Pulling %s on %s' %
                                (target_image, d.hostname))
                    self.upgrade_info_str = 'Pulling %s image on host %s' % (
                        target_image, d.hostname)
                    out, errs, code = CephadmServe(self.mgr)._run_cephadm(
                        d.hostname,
                        '',
                        'pull', [],
                        image=target_image,
                        no_fsid=True,
                        error_ok=True)
                    if code:
                        self._fail_upgrade(
                            'UPGRADE_FAILED_PULL', {
                                'severity': 'warning',
                                'summary': 'Upgrade: failed to pull target image',
                                'count': 1,
                                'detail': ['failed to pull %s on host %s' %
                                           (target_image, d.hostname)],
                            })
                        return
                    r = json.loads(''.join(out))
                    if not any(digest in target_digests
                               for digest in r.get('repo_digests', [])):
                        logger.info(
                            'Upgrade: image %s pull on %s got new digests %s (not %s), restarting'
                            % (target_image, d.hostname, r['repo_digests'],
                               target_digests))
                        self.upgrade_info_str = 'Image %s pull on %s got new digests %s (not %s), restarting' % (
                            target_image, d.hostname, r['repo_digests'],
                            target_digests)
                        self.upgrade_state.target_digests = r['repo_digests']
                        self._save_upgrade_state()
                        return

                    self.upgrade_info_str = 'Currently upgrading %s daemons' % (
                        daemon_type)

                if len(to_upgrade) > 1:
                    logger.info(
                        'Upgrade: Updating %s.%s (%d/%d)' %
                        (d.daemon_type, d.daemon_id, num, len(to_upgrade)))
                else:
                    logger.info('Upgrade: Updating %s.%s' %
                                (d.daemon_type, d.daemon_id))
                action = 'Upgrading' if not d_entry[1] else 'Redeploying'
                try:
                    daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(
                        d)
                    self.mgr._daemon_action(
                        daemon_spec,
                        'redeploy',
                        image=target_image if not d_entry[1] else None)
                except Exception as e:
                    self._fail_upgrade(
                        'UPGRADE_REDEPLOY_DAEMON', {
                            'severity': 'warning',
                            'summary': f'{action} daemon {d.name()} on host {d.hostname} failed.',
                            'count': 1,
                            'detail': [f'Upgrade daemon: {d.name()}: {e}'],
                        })
                    return
                num += 1
            if to_upgrade:
                return

            # complete mon upgrade?
            if daemon_type == 'mon':
                if not self.mgr.get("have_local_config_map"):
                    logger.info(
                        'Upgrade: Restarting mgr now that mons are running pacific'
                    )
                    need_upgrade_self = True

            if need_upgrade_self:
                try:
                    self.mgr.mgr_service.fail_over()
                except OrchestratorError as e:
                    self._fail_upgrade(
                        'UPGRADE_NO_STANDBY_MGR', {
                            'severity': 'warning',
                            'summary': f'Upgrade: {e}',
                            'count': 1,
                            'detail': [
                                'The upgrade process needs to upgrade the mgr, '
                                'but it needs at least one standby to proceed.',
                            ],
                        })
                    return

                return  # unreachable code, as fail_over never returns
            elif daemon_type == 'mgr':
                if 'UPGRADE_NO_STANDBY_MGR' in self.mgr.health_checks:
                    del self.mgr.health_checks['UPGRADE_NO_STANDBY_MGR']
                    self.mgr.set_health_checks(self.mgr.health_checks)

            # make sure 'ceph versions' agrees
            ret, out_ver, err = self.mgr.check_mon_command({
                'prefix': 'versions',
            })
            j = json.loads(out_ver)
            for version, count in j.get(daemon_type, {}).items():
                short_version = version.split(' ')[2]
                if short_version != target_version:
                    logger.warning(
                        'Upgrade: %d %s daemon(s) are %s != target %s' %
                        (count, daemon_type, short_version, target_version))

            # push down configs
            daemon_type_section = name_to_config_section(daemon_type)
            if image_settings.get(daemon_type_section) != target_image:
                logger.info('Upgrade: Setting container_image for all %s' %
                            daemon_type)
                self.mgr.set_container_image(daemon_type_section, target_image)
            to_clean = []
            for section in image_settings.keys():
                if section.startswith(
                        name_to_config_section(daemon_type) + '.'):
                    to_clean.append(section)
            if to_clean:
                logger.debug('Upgrade: Cleaning up container_image for %s' %
                             to_clean)
                for section in to_clean:
                    ret, image, err = self.mgr.check_mon_command({
                        'prefix': 'config rm',
                        'name': 'container_image',
                        'who': section,
                    })

            logger.debug('Upgrade: All %s daemons are up to date.' %
                         daemon_type)

            # complete osd upgrade?
            if daemon_type == 'osd':
                osdmap = self.mgr.get("osd_map")
                osd_min_name = osdmap.get("require_osd_release", "argonaut")
                osd_min = ceph_release_to_major(osd_min_name)
                if osd_min < int(target_major):
                    logger.info(
                        f'Upgrade: Setting require_osd_release to {target_major} {target_major_name}'
                    )
                    ret, _, err = self.mgr.check_mon_command({
                        'prefix': 'osd require-osd-release',
                        'release': target_major_name,
                    })

            # complete mds upgrade?
            if daemon_type == 'mds' and self.upgrade_state.fs_original_max_mds:
                for i in self.mgr.get("fs_map")['filesystems']:
                    fs_id = i["id"]
                    fs_name = i['mdsmap']['fs_name']
                    new_max = self.upgrade_state.fs_original_max_mds.get(fs_id)
                    if new_max:
                        self.mgr.log.info(
                            'Upgrade: Scaling up filesystem %s max_mds to %d' %
                            (fs_name, new_max))
                        ret, _, err = self.mgr.check_mon_command({
                            'prefix': 'fs set',
                            'fs_name': fs_name,
                            'var': 'max_mds',
                            'val': str(new_max),
                        })

                self.upgrade_state.fs_original_max_mds = {}
                self._save_upgrade_state()

        # clean up
        logger.info('Upgrade: Finalizing container_image settings')
        self.mgr.set_container_image('global', target_image)

        for daemon_type in CEPH_UPGRADE_ORDER:
            ret, image, err = self.mgr.check_mon_command({
                'prefix': 'config rm',
                'name': 'container_image',
                'who': name_to_config_section(daemon_type),
            })

        logger.info('Upgrade: Complete!')
        if self.upgrade_state.progress_id:
            self.mgr.remote('progress', 'complete',
                            self.upgrade_state.progress_id)
        self.upgrade_state = None
        self._save_upgrade_state()
        return
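Example #16 leans on the mon's version banner, 'ceph version {version} ({sha1}) ...': split(' ')[2] extracts the bare version, and splitting that on '.' yields the major release. A standalone sketch with an illustrative banner:

banner = 'ceph version 16.2.7 (0123abc) pacific (stable)'  # illustrative values
version = banner.split(' ')[2]
assert version == '16.2.7'
major, minor, patch = version.split('.')
assert (major, minor, patch) == ('16', '2', '7')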
Example #17
    def _check_daemons(self) -> None:

        daemons = self.mgr.cache.get_daemons()
        daemons_post: Dict[str, List[orchestrator.DaemonDescription]] = defaultdict(list)
        for dd in daemons:
            # orphan?
            spec = self.mgr.spec_store.active_specs.get(dd.service_name(), None)
            assert dd.hostname is not None
            assert dd.daemon_type is not None
            assert dd.daemon_id is not None
            if not spec and dd.daemon_type not in ['mon', 'mgr', 'osd']:
                # (mon and mgr specs should always exist; osds aren't matched
                # to a service spec)
                self.log.info('Removing orphan daemon %s...' % dd.name())
                self._remove_daemon(dd.name(), dd.hostname)

            # ignore unmanaged services
            if spec and spec.unmanaged:
                continue

            # These daemon types require additional configs after creation
            if dd.daemon_type in ['grafana', 'iscsi', 'prometheus', 'alertmanager', 'nfs']:
                daemons_post[dd.daemon_type].append(dd)

            active = self.mgr.cephadm_services[daemon_type_to_service(
                dd.daemon_type)].get_active_daemon(
                    self.mgr.cache.get_daemons_by_service(dd.service_name()))
            dd.is_active = (active.daemon_id == dd.daemon_id)

            deps = self.mgr._calc_daemon_deps(dd.daemon_type, dd.daemon_id)
            last_deps, last_config = self.mgr.cache.get_daemon_last_config_deps(
                dd.hostname, dd.name())
            if last_deps is None:
                last_deps = []
            action = self.mgr.cache.get_scheduled_daemon_action(dd.hostname, dd.name())
            if not last_config:
                self.log.info('Reconfiguring %s (unknown last config time)...' % (
                    dd.name()))
                action = 'reconfig'
            elif last_deps != deps:
                self.log.debug('%s deps %s -> %s' % (dd.name(), last_deps,
                                                     deps))
                self.log.info('Reconfiguring %s (dependencies changed)...' % (
                    dd.name()))
                action = 'reconfig'
            elif self.mgr.last_monmap and \
                    self.mgr.last_monmap > last_config and \
                    dd.daemon_type in CEPH_TYPES:
                self.log.info('Reconfiguring %s (monmap changed)...' % dd.name())
                action = 'reconfig'
            elif self.mgr.extra_ceph_conf_is_newer(last_config) and \
                    dd.daemon_type in CEPH_TYPES:
                self.log.info('Reconfiguring %s (extra config changed)...' % dd.name())
                action = 'reconfig'
            if action:
                if self.mgr.cache.get_scheduled_daemon_action(dd.hostname, dd.name()) == 'redeploy' \
                        and action == 'reconfig':
                    action = 'redeploy'
                try:
                    daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(dd)
                    self.mgr._daemon_action(daemon_spec, action=action)
                    self.mgr.cache.rm_scheduled_daemon_action(dd.hostname, dd.name())
                except OrchestratorError as e:
                    self.mgr.events.from_orch_error(e)
                    if dd.daemon_type in daemons_post:
                        del daemons_post[dd.daemon_type]
                    # continue...
                except Exception as e:
                    self.mgr.events.for_daemon_from_exception(dd.name(), e)
                    if dd.daemon_type in daemons_post:
                        del daemons_post[dd.daemon_type]
                    # continue...

        # do daemon post actions
        for daemon_type, daemon_descs in daemons_post.items():
            if daemon_type in self.mgr.requires_post_actions:
                self.mgr.requires_post_actions.remove(daemon_type)
                self.mgr._get_cephadm_service(daemon_type_to_service(
                    daemon_type)).daemon_check_post(daemon_descs)
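One subtle rule in the action resolution above: a scheduled 'redeploy' outranks a computed 'reconfig'. A standalone sketch of that precedence (the helper is made up for illustration):

from typing import Optional

def resolve_action(computed: Optional[str], scheduled: Optional[str]) -> Optional[str]:
    # mirrors _check_daemons: dependency/config/monmap changes compute
    # 'reconfig', but a scheduled 'redeploy' takes precedence over it
    action = computed or scheduled
    if scheduled == 'redeploy' and action == 'reconfig':
        return 'redeploy'
    return action

assert resolve_action('reconfig', 'redeploy') == 'redeploy'
assert resolve_action('reconfig', 'restart') == 'reconfig'
assert resolve_action(None, 'restart') == 'restart'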