예제 #1
0
class CommandRunner:
    """Base class for objects that run commands on ``hostname`` as ``user``.

    ``run`` and ``_create_connection`` only raise ``NotImplementedError`` here
    and are meant to be overridden by subclasses.
    """

    def __init__(self, hostname, user='******', password=''):
        """Keep the credentials, create a prefixed logger and open the connection."""
        self.hostname, self.user, self.password = hostname, user, password
        # The logger prefix is str(self), which needs user and hostname to be set.
        self.log = SDCMAdapter(LOGGER, extra={'prefix': str(self)})
        self.connection = self._create_connection()

    def __str__(self):
        return f'{self.__class__.__name__} [{self.user}@{self.hostname}]'

    def run(self, cmd, timeout=None, ignore_status=False,  # pylint: disable=too-many-arguments
            connect_timeout=300, verbose=True, log_file=None, retry=0):
        """Execute ``cmd``; not implemented in the base class."""
        raise NotImplementedError("Should be implemented in subclasses")

    def _create_connection(self):
        """Build the connection object; not implemented in the base class."""
        raise NotImplementedError("_create_connection should be implemented")

    def _print_command_results(self, result, verbose, ignore_status):
        """Log the outcome of a finished command.

        Nothing is logged when ``verbose`` is false. A successful command logs
        its stderr (if any) and its exit status at info level. A failed command
        logs an error plus the tail of stdout and the stderr at debug level,
        unless ``ignore_status`` is set, in which case nothing is logged.
        """
        if not verbose:
            return

        if not result.failed:
            if result.stderr:
                self.log.info(f'STDERR: {result.stderr}')
            self.log.info(f'Command "{result.command}" finished with status {result.exited}')
        elif not ignore_status:
            self.log.error(f'Error executing command: "{result.command}"; Exit status: {result.exited}')
            if result.stdout:
                # Only the last 240 characters of stdout are reported.
                self.log.debug(f'STDOUT: {result.stdout[-240:]}')
            if result.stderr:
                self.log.debug(f'STDERR: {result.stderr}')
예제 #2
0
class CommandRunner(metaclass=ABCMeta):
    """Abstract base for objects that run commands on ``hostname`` as ``user``.

    Concrete subclasses implement ``get_init_arguments``, ``is_up``, ``run``
    and ``_create_connection``.
    """
    _params = None

    def __init__(self, hostname='', user='******', password=''):
        """Keep the credentials, create a prefixed logger and open the connection."""
        self.hostname = hostname
        self.user = user
        self.password = password
        # The logger prefix is str(self), which needs user and hostname to be set.
        self.log = SDCMAdapter(LOGGER, extra={'prefix': str(self)})
        self.connection = self._create_connection()

    @abstractmethod
    def get_init_arguments(self) -> dict:
        """
        Return instance parameters required to rebuild instance
        """

    @abstractmethod
    def is_up(self, timeout=None) -> bool:
        """
        Return True when the runner's target is up, False otherwise.

        NOTE(review): how ``timeout`` is applied is defined by the subclasses - confirm there.
        """

    def __str__(self):
        return '{} [{}@{}]'.format(self.__class__.__name__, self.user, self.hostname)

    def _setup_watchers(self, verbose, log_file, additional_watchers):
        """Build the list of stream watchers for one command run.

        :param verbose: when true, an ``OutputWatcher`` bound to ``self.log`` is added
        :param log_file: when set, a ``LogWriteWatcher`` for that file is added
        :param additional_watchers: optional caller-supplied watchers, placed first
        :return: a new list; ``additional_watchers`` itself is never modified
        """
        # Copy instead of aliasing: appending to the caller's list would make
        # watchers pile up whenever the same list is passed to several calls.
        watchers = list(additional_watchers) if additional_watchers else []
        if verbose:
            watchers.append(OutputWatcher(self.log))
        if log_file:
            watchers.append(LogWriteWatcher(log_file))
        return watchers

    @abstractmethod
    def run(self, cmd, timeout=None, ignore_status=False,  # pylint: disable=too-many-arguments
            verbose=True, new_session=False, log_file=None, retry=0, watchers=None):
        """Execute ``cmd``; implemented by subclasses."""

    @abstractmethod
    def _create_connection(self):
        """Build the connection object stored in ``self.connection``; implemented by subclasses."""

    def _print_command_results(self, result, verbose, ignore_status):
        """Log the outcome of a finished command.

        Nothing is logged when ``verbose`` is false. A successful command logs
        its stderr (if any) and its exit status at info level. A failed command
        logs an error plus the tail of stdout and the stderr at debug level,
        unless ``ignore_status`` is set, in which case nothing is logged.
        """
        if not verbose:
            return

        if not result.failed:
            if result.stderr:
                self.log.info('STDERR: {}'.format(result.stderr))
            self.log.info('Command "{}" finished with status {}'.format(result.command, result.exited))
            return

        if not ignore_status:
            self.log.error('Error executing command: "{}"; Exit status: {}'.format(result.command, result.exited))
            if result.stdout:
                # Only the last 240 characters of stdout are reported.
                self.log.debug('STDOUT: {}'.format(result.stdout[-240:]))
            if result.stderr:
                self.log.debug('STDERR: {}'.format(result.stderr))
예제 #3
0
 def __init__(self, node: 'BaseNode', max_core_upload_limit: int):
     """Initialise the bookkeeping state, then the parent class with ``daemon=True``.

     :param node: node object; its ``log`` attribute backs this object's logger
     :param max_core_upload_limit: stored as-is on the instance
     """
     log_prefix = self.__class__.__name__
     self.node = node
     self.log = SDCMAdapter(node.log, extra={"prefix": log_prefix})
     self.max_core_upload_limit = max_core_upload_limit
     # Four independent, initially empty lists of CoreDumpInfo entries.
     self.found: List[CoreDumpInfo] = []
     self.in_progress: List[CoreDumpInfo] = []
     self.completed: List[CoreDumpInfo] = []
     self.uploaded: List[CoreDumpInfo] = []
     self.termination_event = Event()
     self.exception = None
     # The parent initialiser runs last, after all attributes above exist.
     super().__init__(daemon=True)
예제 #4
0
    def __init__(self, name, node_prefix=None, parent_cluster=None,  # pylint: disable=too-many-arguments,super-init-not-called
                 base_logdir=None):
        """Set up a node that runs commands through ``LOCALRUNNER``.

        The parent initialiser is intentionally not called. The log directory
        ``<base_logdir>/<name>`` is created and a logger prefixed with
        ``str(self)`` is attached.

        NOTE(review): ``base_logdir`` defaults to None, but ``os.path.join``
        rejects None, so callers effectively have to pass it - confirm.
        """
        self.name, self.node_prefix = name, node_prefix
        self.remoter = LOCALRUNNER
        # Both file-transfer entry points of the remoter are rebound to
        # send_receive_files, with this node as the bound instance.
        for transfer_method in ('receive_files', 'send_files'):
            setattr(self.remoter, transfer_method, types.MethodType(send_receive_files, self))
        self.parent_cluster = parent_cluster
        self.is_seed = False
        self._distro = None

        self.logdir = os.path.join(base_logdir, self.name)
        makedirs(self.logdir)
        log_extra = {'prefix': str(self)}
        self.log = SDCMAdapter(LOGGER, extra=log_extra)
 def __init__(self, name, parent_cluster,  # pylint: disable=too-many-arguments
              base_logdir=None, ssh_login_info=None, node_prefix=None, dc_idx=None):
     """Forward every argument to the parent initialiser, then set node-local state.

     After the parent is initialised the logger is replaced with one prefixed
     by ``str(self)`` and the Grafana address starts out unset.
     """
     parent_kwargs = {
         'name': name,
         'parent_cluster': parent_cluster,
         'base_logdir': base_logdir,
         'ssh_login_info': ssh_login_info,
         'node_prefix': node_prefix,
         'dc_idx': dc_idx,
     }
     super(DockerMonitoringNode, self).__init__(**parent_kwargs)
     log_extra = {'prefix': str(self)}
     self.log = SDCMAdapter(LOGGER, extra=log_extra)
     self._grafana_address = None
예제 #6
0
class CoredumpThreadBase(Thread):  # pylint: disable=too-many-instance-attributes
    """Background thread that looks for coredumps on a node, uploads them and publishes events.

    Subclasses provide ``get_list_of_cores`` and
    ``update_coredump_info_with_more_information``.
    """
    lookup_period = 30  # seconds the main loop waits between two cycles
    upload_retry_limit = 3  # a core is given up on once its process_retry counter exceeds this
    max_coredump_thread_exceptions = 10  # consecutive failed cycles after which run() re-raises

    def __init__(self, node: 'BaseNode', max_core_upload_limit: int):
        """
        :param node: node whose remoter, logger and logdir are used by this thread
        :param max_core_upload_limit: once this many cores were uploaded, no further core is queued or processed
        """
        self.node = node
        self.log = SDCMAdapter(node.log,
                               extra={"prefix": self.__class__.__name__})
        self.max_core_upload_limit = max_core_upload_limit
        self.found: List[CoreDumpInfo] = []
        self.in_progress: List[CoreDumpInfo] = []
        self.completed: List[CoreDumpInfo] = []
        self.uploaded: List[CoreDumpInfo] = []
        self.termination_event = Event()
        self.exception = None
        super().__init__(daemon=True)

    def stop(self):
        """Ask the main loop to finish; ``run`` keeps cycling while ``in_progress`` is not empty."""
        self.termination_event.set()

    @raise_event_on_failure
    def run(self):
        """
        Run ``main_cycle_body`` every ``lookup_period`` seconds.

        The loop continues until ``stop`` was called and ``in_progress`` is empty.
        A failing cycle is logged and counted, a successful one resets the counter.
        When ``max_coredump_thread_exceptions`` cycles fail in a row, the exception
        is stored in ``self.exception`` and re-raised.
        """
        exceptions_count = 0
        while not self.termination_event.wait(
                self.lookup_period) or self.in_progress:
            try:
                self.main_cycle_body()
                exceptions_count = 0
            except Exception as exc:  # pylint: disable=broad-except
                self.log.error("Following error occurred: %s", exc)
                exceptions_count += 1
                if exceptions_count == self.max_coredump_thread_exceptions:
                    self.exception = exc
                    raise

    def main_cycle_body(self):
        """Do one cycle: process queued cores, then look for new ones and queue them.

        The cycle is skipped entirely when the node's remoter reports it is not up.
        """
        if not self.node.remoter.is_up(timeout=60):
            return
        self._process_coredumps(self.in_progress, self.completed,
                                self.uploaded)
        new_cores = self.extract_info_from_core_pids(self.get_list_of_cores(),
                                                     exclude_cores=self.found)
        self.push_new_cores_to_process(new_cores)

    def push_new_cores_to_process(self, new_cores: List[CoreDumpInfo]):
        """Record ``new_cores`` as found, log them and queue them for processing.

        Cores whose executable contains ``bash`` are recorded but neither logged
        nor queued. Nothing is queued once the upload limit is reached.
        """
        self.found.extend(new_cores)
        for core_dump in new_cores:
            if 'bash' in core_dump.executable:
                continue
            self.log_coredump(core_dump)
            if not self.is_limit_reached():
                self.in_progress.append(core_dump)

    def is_limit_reached(self):
        """Return True when the number of uploaded cores reached ``max_core_upload_limit``."""
        return len(self.uploaded) >= self.max_core_upload_limit

    def process_coredumps(self):
        """Process the cores currently queued in ``in_progress``."""
        self._process_coredumps(self.in_progress, self.completed,
                                self.uploaded)

    def _process_coredumps(self, in_progress: List[CoreDumpInfo],
                           completed: List[CoreDumpInfo],
                           uploaded: List[CoreDumpInfo]):
        """
        Get core files from node and report them

        Every core in ``in_progress`` is enriched and uploaded. A core moves to
        ``completed`` when handled (and to ``uploaded`` when the upload happened),
        or when its retry counter exceeds ``upload_retry_limit``. A core whose
        processing raised stays in ``in_progress`` and is retried on a later call.
        """
        if not in_progress:
            return
        # Iterate over a snapshot because entries are removed from in_progress inside the loop.
        for core_info in in_progress.copy():
            if self.is_limit_reached():
                in_progress.remove(core_info)
                continue
            try:
                core_info.process_retry += 1
                if self.upload_retry_limit < core_info.process_retry:
                    self.log.error(
                        f"Maximum retry uploading is reached for core {str(core_info)}"
                    )
                    in_progress.remove(core_info)
                    completed.append(core_info)
                    continue
                self.update_coredump_info_with_more_information(core_info)
                result = self.upload_coredump(core_info)
                completed.append(core_info)
                in_progress.remove(core_info)
                if result:
                    uploaded.append(core_info)
                    self.publish_event(core_info)
            except Exception as exc:  # pylint: disable=broad-except
                # Best effort: the core stays queued and is retried on the next call,
                # but the failure is no longer swallowed silently.
                self.log.warning(
                    "Failed to process coredump %s, it stays queued for a retry: %s",
                    core_info, exc)

    @abstractmethod
    def get_list_of_cores(self) -> Optional[List[CoreDumpInfo]]:
        """Return the cores currently present on the node; implemented by subclasses."""
        ...

    def publish_event(self, core_info: CoreDumpInfo):
        """Publish the event of ``core_info``; a failure is logged and not propagated."""
        try:
            core_info.publish_event()
        except Exception as exc:  # pylint: disable=broad-except
            self.log.error(
                f"Failed to publish coredump event due to the: {str(exc)}")

    def extract_info_from_core_pids(
            self, new_cores: Optional[List[CoreDumpInfo]],
            exclude_cores: List[CoreDumpInfo]) -> List[CoreDumpInfo]:
        """Return the entries of ``new_cores`` whose pid is not present in ``exclude_cores``.

        An event is published for every returned core. ``None`` or an empty
        ``new_cores`` yields an empty list.
        """
        if not new_cores:
            # new_cores is declared Optional; iterating None would raise TypeError.
            return []
        output = []
        for new_core_info in new_cores:
            if any(e_core_info.pid == new_core_info.pid
                   for e_core_info in exclude_cores):
                continue
            self.publish_event(new_core_info)
            output.append(new_core_info)
        return output

    # @retrying(n=10, sleep_time=20, allowed_exceptions=NETWORK_EXCEPTIONS, message="Retrying on uploading coredump")
    def _upload_coredump(self, core_info: CoreDumpInfo):
        """Pack the core file if needed, upload it with curl on the node and store the download details on ``core_info``."""
        coredump = core_info.corefile
        coredump = self._pack_coredump(coredump)
        base_upload_url = 'upload.scylladb.com/%s/%s'
        # NOTE(review): dropping exactly 3 characters matches a '.gz' suffix only;
        # other suffixes accepted by _pack_coredump leave a partial extension in the id.
        coredump_id = os.path.basename(coredump)[:-3]
        upload_url = base_upload_url % (coredump_id,
                                        os.path.basename(coredump))
        self.log.info('Uploading coredump %s to %s', coredump, upload_url)
        self.node.remoter.run("sudo curl --request PUT --upload-file "
                              "'%s' '%s'" % (coredump, upload_url))
        download_url = 'https://storage.cloud.google.com/%s' % upload_url
        self.log.info(
            "You can download it by %s (available for ScyllaDB employee)",
            download_url)
        download_instructions = 'gsutil cp gs://%s .\ngunzip %s' % (upload_url,
                                                                    coredump)
        core_info.download_url, core_info.download_instructions = download_url, download_instructions

    def upload_coredump(self, core_info: CoreDumpInfo):
        """Upload the core file of ``core_info``.

        :return: True when the upload was done; False when the core already has
            a download url or has no accessible core file
        :raises Exception: whatever the upload raised, after it was logged
        """
        if core_info.download_url:
            return False
        if not core_info.corefile:
            self.log.error(
                f"{str(core_info)} has inaccessible corefile, can't upload it")
            return False
        try:
            self.log.debug(f'Start uploading file: {core_info.corefile}')
            core_info.download_instructions = 'Coredump upload in progress'
            self._upload_coredump(core_info)
            return True
        except Exception as exc:  # pylint: disable=broad-except
            core_info.download_instructions = 'failed to upload core'
            self.log.error(
                f"Following error occurred during uploading coredump {core_info.corefile}: {str(exc)}"
            )
            raise

    @cached_property
    def _is_pigz_installed(self):
        """Check once, via the distro's package listing, whether pigz is installed on the node."""
        if self.node.is_rhel_like():
            return self.node.remoter.run('yum list installed | grep pigz',
                                         ignore_status=True).ok
        if self.node.is_ubuntu() or self.node.is_debian():
            return self.node.remoter.run('apt list --installed | grep pigz',
                                         ignore_status=True).ok
        raise RuntimeError("Distro is not supported")

    def _install_pigz(self):
        """Install pigz with the distro's package manager and mark it as installed.

        :raises RuntimeError: when the node's distro is not one of the supported ones
        """
        if self.node.is_rhel_like():
            self.node.remoter.sudo('yum install -y pigz')
        elif self.node.is_ubuntu() or self.node.is_debian():
            self.node.remoter.sudo('apt install -y pigz')
        else:
            raise RuntimeError("Distro is not supported")
        # The key must match the cached property's name; otherwise the cached
        # False is never replaced and pigz gets reinstalled for every core.
        self.__dict__['_is_pigz_installed'] = True

    def _pack_coredump(self, coredump: str) -> str:
        """Compress ``coredump`` with pigz on the node unless it already has a compressed suffix.

        :return: path of the packed file, or the original path when it was
            already packed or the compression failed
        """
        if coredump.endswith(('.lz4', '.zip', '.gz', '.gzip')):
            return coredump
        if not self._is_pigz_installed:
            self._install_pigz()
        try:  # pylint: disable=unreachable
            # Compression is skipped when a '.gz' file for this core already exists.
            if not self.node.remoter.run(f'sudo ls {coredump}.gz',
                                         verbose=False,
                                         ignore_status=True).ok:
                self.node.remoter.run(f'sudo pigz --fast --keep {coredump}')
            coredump += '.gz'
        except NETWORK_EXCEPTIONS:  # pylint: disable=try-except-raise
            raise
        except Exception as ex:  # pylint: disable=broad-except
            self.log.warning("Failed to compress coredump '%s': %s", coredump,
                             ex)
        return coredump

    def log_coredump(self, core_info: CoreDumpInfo):
        """Append the core's ``coredump_info`` text to ``coredump.log`` in the node's logdir and log each line as an error."""
        if not core_info.coredump_info:
            return
        log_file = os.path.join(self.node.logdir, 'coredump.log')
        with open(log_file, 'a') as log_file_obj:
            log_file_obj.write(core_info.coredump_info)
        for line in core_info.coredump_info.splitlines():
            self.log.error(line)

    @property
    def n_coredumps(self) -> int:
        """Number of cores recorded in ``found`` so far."""
        return len(self.found)

    @abstractmethod
    def update_coredump_info_with_more_information(self,
                                                   core_info: CoreDumpInfo):
        """Enrich ``core_info`` before it is uploaded; implemented by subclasses."""
예제 #7
0
 def __init__(self, hostname='', user='******', password=''):
     """Keep the credentials, create a logger prefixed with ``str(self)`` and open the connection."""
     self.hostname, self.user, self.password = hostname, user, password
     # The prefix is computed only after the credentials are stored on the instance.
     log_extra = {'prefix': str(self)}
     self.log = SDCMAdapter(LOGGER, extra=log_extra)
     self.connection = self._create_connection()