Example #1
    def get_all_jobs(self) -> None:
        """incrementally saves all stored jobsessions, even before first execution of sppmon"""

        job_list = MethodUtils.query_something(
            name="job list", source_func=self.__api_queries.get_job_list)

        for job in job_list:
            job_id = job.get("id", None)
            job_name = job.get("name", None)

            # this way to make sure we also catch empty strings
            if (not job_id or not job_name):
                ExceptionUtils.error_message(
                    f"skipping, missing name or id for job {job}")
                continue
            LOGGER.info(
                ">> capturing Job information for Job \"{}\"".format(job_name))

            try:
                self.__job_by_id(job_id=job_id)
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    f"error when getting jobs for {job_name}, skipping it")
                continue
示例#2
0
    def login(self) -> None:
        """Logs in into the REST-API. Call this before using any methods.

        Sets up the sessionId and the server URL.

        Raises:
            ValueError: Login was not successful.
        """
        http_auth: HTTPBasicAuth = HTTPBasicAuth(self.__username, self.__password) # type: ignore
        self.__srv_url = "https://{srv_address}:{port}".format(srv_address=self.__srv_address, port=self.__srv_port)
        endpoint = "/api/endeavour/session"

        LOGGER.debug(f"login to SPP REST API server: {self.__srv_url}")
        if(self.__verbose):
            LOGGER.info(f"login to SPP REST API server: {self.__srv_url}")
        try:
            response_json = self.post_data(endpoint=endpoint, auth=http_auth) # type: ignore
        except ValueError as error:
            ExceptionUtils.exception_info(error=error)
            ExceptionUtils.error_message(
                "Please make sure your Hostadress, port, username and password for REST-API (not SSH) login is correct."
                + "\nYou may test this by logging in into the SPP-Website with the used credentials.")
            raise ValueError(f"REST API login request not successfull.")

        self.__sessionid: str = response_json.get("sessionid", "")
        (version, build) = self.get_spp_version_build()

        LOGGER.debug(f"SPP-Version: {version}, build {build}")
        LOGGER.debug(f"REST API Session ID: {self.__sessionid}")
        if(self.__verbose):
            LOGGER.info(f"REST API Session ID: {self.__sessionid}")
            LOGGER.info(f"SPP-Version: {version}, build {build}")

        self.__headers['X-Endeavour-Sessionid'] = self.__sessionid
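A minimal usage sketch of the login flow; the constructor parameters below are assumptions, while `login`, `get_spp_version_build` and `logout` all appear in the surrounding examples:

    # hedged sketch: the RestClient constructor parameters are assumptions,
    # not the project's actual signature
    client = RestClient(srv_address="spp.example.com", srv_port=443,
                        username="sppadmin", password="secret", verbose=False)
    client.login()                                    # sets session id and server URL
    (version, build) = client.get_spp_version_build()
    client.logout()                                   # invalidate the session when done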
Example #3
    def set_critial_configs(self, config_file: Dict[str, Any]) -> None:
        """Sets up any critical infrastructure, to be called within the init.

        Be aware that not everything may be initialized at call time.
        Add config here if the system should abort if it is missing.

        Arguments:
            config_file {Dict[str, Any]} -- Opened Config file
        """
        if (not config_file):
            ExceptionUtils.error_message(
                "missing or empty config file, aborting")
            self.exit(error_code=ERROR_CODE_START_ERROR)
        try:
            # critical components only
            self.influx_client = InfluxClient(config_file)

            if (not self.ignore_setup):
                # delay the connect into the testing phase
                self.influx_client.connect()

        except ValueError as err:
            ExceptionUtils.exception_info(
                error=err,
                extra_message="error while setting up critical config. Aborting"
            )
            self.influx_client = None  # set to None, otherwise the attribute stays undeclared
            self.exit(error_code=ERROR_CODE)
Example #4
    def __job_logs_to_stats(self, list_with_logs: List[Dict[str, Any]]) -> None:
        """Parses joblogs into their own statisic table, using declared supported ID's

        To parse more jobLogs define additional entrys in the attribute `supported_ids`.

        Arguments:
            list_with_logs {List[Dict[str, Any]]} -- List with all saved joblogs
        """

        # only continue with joblogs we want to save
        supported_log_iterator = filter(lambda log: log['messageId'] in self.__supported_ids.keys(), list_with_logs)
        sorted_log_iterator = sorted(supported_log_iterator, key=lambda entry: entry['logTime'])
        max_sec_timestamp = 0 # required for preventing duplicates

        for job_log in sorted_log_iterator:
            message_id = job_log['messageId']

            table_func_tuple = self.__supported_ids[message_id]

            (table_name, row_dict_func) = table_func_tuple

            if(not table_name):
                table_name = message_id

            try:
                row_dict = row_dict_func(job_log['messageParams'])
            except KeyError as error:
                ExceptionUtils.exception_info(
                    error, extra_message="MessageID params wrong defined. Skipping one MessageId")
                continue

            row_dict['messageId'] = message_id
            # Issue 9: In cases where all tag values duplicate another record, including the timestamp, Influx will throw the insert
            # out as a duplicate. In some cases, the reduction of epoch timestamps from millisecond to second precision can
            # cause duplicate timestamps. To avoid this for certain tables, add seconds to the timestamp as needed to
            # ensure uniqueness. Only use this when some inaccuracy of the timestamps is acceptable.
            cur_timestamp = job_log['logTime']
            if(table_name == 'vmBackupSummary'):

                if(cur_timestamp is None): # prevent None
                    ExceptionUtils.error_message(f"Warning: logTime is None, duplicate may be purged. Log: {job_log}")

                if(isinstance(cur_timestamp, str)): # make sure it's an int
                    cur_timestamp = int(cur_timestamp)

                cur_sec_timestamp = SppUtils.to_epoch_secs(cur_timestamp)
                if(cur_sec_timestamp <= max_sec_timestamp):
                    digits = int(cur_timestamp / cur_sec_timestamp)
                    max_sec_timestamp += 1 # increase by 1 second
                    cur_timestamp = max_sec_timestamp * digits
                else:
                    max_sec_timestamp = cur_sec_timestamp

            row_dict['time'] = cur_timestamp

            for(key, item) in row_dict.items():
                if(item in ('null', 'null(null)')):
                    row_dict[key] = None

            self.__influx_client.insert_dicts_to_buffer(table_name, [row_dict])
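The uniqueness trick for `vmBackupSummary` is easiest to follow with concrete numbers. A standalone sketch (values hypothetical; `// 1000` stands in for `SppUtils.to_epoch_secs`):

    max_sec_timestamp = 1607693253           # second-precision timestamp already used
    cur_timestamp = 1607693253999            # new log collides at second precision
    cur_sec_timestamp = cur_timestamp // 1000
    if cur_sec_timestamp <= max_sec_timestamp:
        digits = int(cur_timestamp / cur_sec_timestamp)  # ~1000: millisecond magnitude
        max_sec_timestamp += 1                           # bump by one second
        cur_timestamp = max_sec_timestamp * digits       # 1607693254000, now unique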
Example #5
    def set_critial_configs(self, config_file: Dict[str, Any]) -> None:
        """Sets up any critical infrastructure, to be called within the init.

        Be aware that not everything may be initialized at call time.
        Add config here if the system should abort if it is missing.

        Arguments:
            config_file {Dict[str, Any]} -- Opened Config file
        """
        if (not config_file):
            ExceptionUtils.error_message(
                "missing or empty config file, aborting")
            self.exit(error_code=ERROR_CODE_CMD_LINE)
        try:
            # critical components only

            auth_influx = SppUtils.get_cfg_params(param_dict=config_file,
                                                  param_name="influxDB")
            if (not isinstance(auth_influx, dict)):
                raise ValueError("influx config need to be dict")
            self.influx_client = InfluxClient(auth_influx=auth_influx)
            self.influx_client.connect()

        except ValueError as err:
            ExceptionUtils.exception_info(
                error=err,
                extra_message="error while setting up critical config. Aborting"
            )
            self.influx_client = None  # set to None, otherwise the attribute stays undeclared
            self.exit(error_code=ERROR_CODE)
Example #6
    def filter_values_dict(
            cls,
            result_list: List[Dict[str, Any]],
            white_list: Optional[List[str]] = None,
            ignore_list: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        """Removes unwanted values from a list of dicts.

        Use white_list to only pick the values specified.
        Use ignore_list to pick everything but the values specified.
        If both are given: white_list items overwrite ignore_list items, while all unfiltered items are still kept.

        Args:
            result_list (List[Dict[str, Any]]): items to be filtered
            white_list (List[str], optional): items to be kept. Defaults to None.
            ignore_list (List[str], optional): items to be removed. Defaults to None.

        Raises:
            ValueError: no result list specified

        Returns:
            List[Dict[str, Any]]: list of filtered dicts
        """

        if (result_list is None):
            raise ValueError("need valuelist to filter values")

        new_result_list: List[Dict[str, Any]] = []

        # if single object this is a 1 elem list
        for result in result_list:

            new_result: Dict[str, Any] = {}

            # Only acquire wanted items
            if (white_list):

                for white_key in white_list:
                    (key, value) = SppUtils.get_nested_kv(key_name=white_key,
                                                          nested_dict=result)
                    if (key in new_result):
                        key = white_key
                    new_result[key] = value

                # warn if something is missing
                if (len(new_result) != len(white_list)):
                    ExceptionUtils.error_message(
                        f"Result has not same lenght as whitelist, probably typing error: {result_list}"
                    )

            # acquire all but a few unwanted
            if (ignore_list is not None):
                # add sub-dicts to the dictionary itself, filtering inclusively
                full_result = cls.get_with_sub_values(mydict=result,
                                                      ignore_list=ignore_list)
                new_result.update(full_result)

            new_result_list.append(new_result)

        return new_result_list
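A minimal usage sketch of both filter modes; the sample rows are made up and attaching the classmethod to `MethodUtils` is an assumption:

    rows = [
        {"id": 1, "name": "vsnap-1", "hostAddress": "10.0.0.1"},
        {"id": 2, "name": "vsnap-2", "hostAddress": "10.0.0.2"},
    ]
    only_names = MethodUtils.filter_values_dict(rows, white_list=["name"])
    # -> [{"name": "vsnap-1"}, {"name": "vsnap-2"}]
    no_ids = MethodUtils.filter_values_dict(rows, ignore_list=["id"])
    # -> [{"name": "vsnap-1", "hostAddress": "10.0.0.1"}, {"name": "vsnap-2", ...}]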
Example #7
    def exit(self, error_code: int = SUCCESS_CODE) -> NoReturn:
        """Executes finishing tasks and exits sppmon. To be called every time.

        Executes finishing tasks and displays error messages.
        Only specify an error code if something went wrong.
        Use Error codes specified at top of module.
        Does NOT return.

        Keyword Arguments:
            error_code {int} -- Error code if an error occurred. (default: {0})
        """

        # error with the command line arguments
        # dont store runtime here
        if (error_code == ERROR_CODE_CMD_ARGS):
            parser.print_help()
            sys.exit(ERROR_CODE_CMD_ARGS)  # unreachable?
        if (error_code == ERROR_CODE_START_ERROR):
            ExceptionUtils.error_message(
                "Error when starting SPPMon. Please review the errors above")
            sys.exit(ERROR_CODE_START_ERROR)

        script_end_time = SppUtils.get_actual_time_sec()
        LOGGER.debug("Script end time: %d", script_end_time)

        try:
            if (not self.ignore_setup):
                self.store_script_metrics()

                if (self.influx_client):
                    self.influx_client.disconnect()
                if (self.rest_client):
                    self.rest_client.logout()

        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message="Error occured while exiting sppmon")
            error_code = ERROR_CODE

        self.remove_pid_file()

        # Both error clauses are currently identical, kept separate for a possible future split between error cases.
        # Always check this last, since it is true for any number != 0.
        if (error_code == ERROR_CODE or error_code):
            ExceptionUtils.error_message(
                "Error occured while executing sppmon")
        elif (not self.ignore_setup):
            LOGGER.info("\n\n!!! script completed !!!\n")

        print(
            f"check log for details: grep \"PID {os.getpid()}\" {self.log_path} > sppmon.log.{os.getpid()}"
        )
        sys.exit(error_code)
Example #8
    def exit(self, error_code: int = 0) -> NoReturn:
        """Executes finishing tasks and exits sppmon. To be called every time.

        Executes finishing tasks and displays error messages.
        Only specify an error code if something went wrong.
        Use Error codes specified at top of module.
        Does NOT return.

        Keyword Arguments:
            error_code {int} -- Error code if an error occurred. (default: {0})
        """

        # error with the command line arguments
        # dont store runtime here
        if (error_code == ERROR_CODE_CMD_LINE):
            prog_args = []
            prog_args.append(sys.argv[0])
            prog_args.append("--help")
            os.execv(sys.executable, ['python'] + prog_args)
            sys.exit(ERROR_CODE_CMD_LINE)  # unreachable?

        script_end_time = SppUtils.get_actual_time_sec()
        LOGGER.debug("Script end time: %d", script_end_time)

        try:
            if (not self.ignore_setup):
                self.store_script_metrics()

                if (self.influx_client):
                    self.influx_client.disconnect()
                if (self.rest_client):
                    self.rest_client.logout()

        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message="Error occured while exiting sppmon")
            error_code = ERROR_CODE

        if (not error_code):
            LOGGER.info("\n\n!!! script completed !!!\n")

        self.remove_pid_file()

        # Both clauses are currently identical; always check this last, since it is true for any number != 0.
        if (error_code == ERROR_CODE or error_code):
            ExceptionUtils.error_message(
                "Error occured while executing sppmon")

        print(
            f"check log for details: grep \"PID {os.getpid()}\" {self.log_path} > sppmon.log.{os.getpid()}"
        )
        sys.exit(error_code)
Example #9
    def query_something(
            cls, name: str, source_func: Callable[[], List[Dict[str, Any]]],
            rename_tuples: Optional[List[Tuple[str, str]]] = None,
            deactivate_verbose: bool = False) -> List[Dict[str, Any]]:
        """
        Generic function to query from the REST-API and rename elements within it.
        Use deactivate_verbose to suppress any result-printing, e.g. if you want to process the result yourself.

        Arguments:
            name {str} -- Name of item you want to query for the logger.
            source_func {Function} -- Function which returns a list of dicts with the wanted elements.

        Keyword Arguments:
            rename_tuples {list} -- List of Tuples if you want to rename Keys. (old_name, new_name) (default: {None})
            deactivate_verbose {bool} -- deactivates result-prints within the function. (default: {False})

        Raises:
            ValueError: No name is provided
            ValueError: No Function is provided or not a function

        Returns:
            list -- List of dicts with the results.
        """

        # None checks
        if(rename_tuples is None):
            rename_tuples = []
        if(not name):
            raise ValueError("need name to query something")
        if(not source_func):
            raise ValueError("need a source function to query data")

        LOGGER.info("> getting %s", name)

        # request the elements via the given source function
        elem_list = source_func()
        if(not elem_list):
            ExceptionUtils.error_message(f">> No {name} are found")

        if(rename_tuples):
            for elem in elem_list:
                # rename fields to make it more informative.
                for(old_name, new_name) in rename_tuples:
                    elem[new_name] = elem.pop(old_name)

        if(cls.verbose and not deactivate_verbose):
            MethodUtils.my_print(elem_list)

        return elem_list
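A hedged usage sketch; `get_site_list` is a hypothetical source function, but bound API-query methods are passed the same way in Example #1:

    sites = MethodUtils.query_something(
        name="site list",
        source_func=api_queries.get_site_list,   # hypothetical bound method
        rename_tuples=[("id", "siteId"), ("name", "siteName")])
    # each returned dict now carries "siteId"/"siteName" instead of "id"/"name"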
Example #10
    def site_name_by_id(self, site_id: Union[int, str]) -> Optional[str]:
        """Returns a site_name by a associated site_id.

        Uses a already buffered result if possible, otherwise queries the influxdb for the name.

        Arguments:
            site_id {Union[int, str]} -- id of the site

        Returns:
            Optional[str] -- name of the site, None if not found.
        """
        if (site_id is None):
            ExceptionUtils.error_message("siteId is none, returning None")
            return None
        # if string, parse to int
        if (isinstance(site_id, str)):
            site_id = site_id.strip(" ")
            if (re.match(r"\d+", site_id)):
                site_id = int(site_id)
            else:
                ExceptionUtils.error_message(
                    "siteId is of unsupported string format")
                return None
        # if still not int, error
        if (not isinstance(site_id, int)):
            ExceptionUtils.error_message("site id is of unsupported type")
            return None

        # return if already saved -> previous call or `sites`-call
        result = self.__site_name_dict.get(site_id, None)
        if (result is not None):  # empty str allowed
            return result

        table_name = 'sites'
        table = self.__influx_client.database[table_name]
        query = SelectionQuery(
            keyword=Keyword.SELECT,
            tables=[table],
            # description, throttleRates because we need a field to query
            fields=["siteId", "siteName", "description", "throttleRates"],
            where_str=f"siteId = \'{site_id}\'",
            order_direction="DESC",
            limit=1)
        result_set = self.__influx_client.send_selection_query(
            query)  # type: ignore
        result_dict: Dict[str, Any] = next(result_set.get_points(),
                                           None)  # type: ignore
        if (not result_dict):
            ExceptionUtils.error_message(
                f"no site with the id {site_id} exists")
            return None

        # save result and return it
        result = result_dict['siteName']
        self.__site_name_dict[site_id] = result
        return result
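A hedged usage sketch of the buffering behavior; `system_methods` stands for whichever instance owns this method, and the ids/names are made up:

    name = system_methods.site_name_by_id(2)       # first call: queries influx, buffers the result
    name = system_methods.site_name_by_id(" 2 ")   # stripped, parsed to int, served from the buffer
    name = system_methods.site_name_by_id("abc")   # unsupported format: error message, returns None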
Example #11
    def setup_args(self) -> None:
        """This method sets up all required parameters and transforms arg groups into individual args.
        """
        # ## call functions based on cmdline parameters

        # Temporary features / Deprecated

        if (OPTIONS.minimumLogs):
            ExceptionUtils.error_message(
                "DEPRICATED: using depricated argument '--minumumLogs'. Switch to '--loadedSystem'."
            )

        # incremental setup, higher executes all below
        all_args: bool = OPTIONS.all
        daily: bool = OPTIONS.daily or all_args
        hourly: bool = OPTIONS.hourly or daily
        constant: bool = OPTIONS.constant or hourly

        # ######## All Methods #################

        self.sites: bool = OPTIONS.sites or all_args

        # ######## Daily Methods ###############

        self.vms: bool = OPTIONS.vms or daily
        self.job_logs: bool = OPTIONS.jobLogs or daily
        self.sla_stats: bool = OPTIONS.slaStats or daily
        self.vm_stats: bool = OPTIONS.vmStats or daily

        # ######## Hourly Methods ##############

        self.jobs: bool = OPTIONS.jobs or hourly
        self.vadps: bool = OPTIONS.vadps or hourly
        self.storages: bool = OPTIONS.storages or hourly
        # ssh vsnap pools ?

        # ######## Constant Methods ############

        self.ssh: bool = OPTIONS.ssh or constant
        self.process_stats: bool = OPTIONS.processStats or constant
        self.cpu: bool = OPTIONS.cpu or constant
        self.spp_catalog: bool = OPTIONS.sppcatalog or constant
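The incremental cascade above means each requested tier also enables every tier below it. A standalone sketch of the boolean chain (flag values hypothetical):

    all_args, daily_opt, hourly_opt, constant_opt = False, True, False, False
    daily = daily_opt or all_args          # True
    hourly = hourly_opt or daily           # True: --daily implies the hourly methods
    constant = constant_opt or hourly      # True: hourly implies the constant methods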
Example #12
    def get_auto_datatype(value: Any) -> Datatype:
        """get Datatype enum by value typ analysis. Usage should be avoided.

        Only use if no datatype is declared. It skips time-type and fails if ints are mixed with floats.
        If no type is detected emits a warning and returns `NONE`.

        Arguments:
            value {Union[str, float, int, bool, None]} -- Value to be analyzed

        Returns:
            Datatype -- type of value or `NONE`.
        """
        for enum in Datatype:
            if (enum is Datatype.TIMESTAMP):
                continue
            if (isinstance(value, enum.value)):
                return enum

        ExceptionUtils.error_message(f"No auto type found for {value}")
        return Datatype.NONE
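An illustration of the matching loop, assuming each `Datatype` member's `.value` is the Python type it represents; the member names and the attachment point of the function are assumptions:

    get_auto_datatype("vsnap-1")    # -> Datatype.STRING (assumed member name)
    get_auto_datatype(42)           # -> Datatype.INT (assumed member name)
    get_auto_datatype(object())     # -> error message, Datatype.NONE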
Example #13
    def __init__(self):
        self.log_path: str = ""
        """path to logger, set in set_logger."""
        self.pid_file_path: str = ""
        """path to pid_file, set in check_pid_file."""

        self.set_logger()

        LOGGER.info("Starting SPPMon")
        if (not self.check_pid_file()):
            ExceptionUtils.error_message(
                "Another instance of sppmon with the same args is running")
            self.exit(ERROR_CODE_START_ERROR)

        time_stamp_name, time_stamp = SppUtils.get_capture_timestamp_sec()
        self.start_counter = time.perf_counter()
        LOGGER.debug("\n\n")
        LOGGER.debug(f"running script version: {VERSION}")
        LOGGER.debug(f"cmdline options: {ARGS}")
        LOGGER.debug(f"{time_stamp_name}: {time_stamp}")
        LOGGER.debug("")

        if (not ARGS.configFile):
            ExceptionUtils.error_message("missing config file, aborting")
            self.exit(error_code=ERROR_CODE_CMD_ARGS)
        try:
            self.config_file = SppUtils.read_conf_file(
                config_file_path=ARGS.configFile)
        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message=
                "Error when trying to read Config file, unable to read")
            self.exit(error_code=ERROR_CODE_START_ERROR)

        LOGGER.info("Setting up configurations")
        self.setup_args()
        self.set_critial_configs(self.config_file)
        self.set_optional_configs(self.config_file)
Example #14
    def login(self) -> None:
        """Logs in into the REST-API. Call this before using any methods.

        Sets up the sessionId and the server URL.

        Raises:
            ValueError: Login was not successful.
        """
        http_auth: HTTPBasicAuth = HTTPBasicAuth(self.__username, self.__password)
        self.__srv_url = f"https://{self.__srv_address}:{self.__srv_port}"
        login_url = self.get_url("/api/endeavour/session")

        LOGGER.debug(f"login to SPP REST API server: {self.__srv_url}")
        if(self.__verbose):
            LOGGER.info(f"login to SPP REST API server: {self.__srv_url}")
        try:
            (response_json, _) = self.query_url(url=login_url, auth=http_auth, request_type=RequestType.POST)
        except ValueError as error:
            ExceptionUtils.exception_info(error=error)
            ExceptionUtils.error_message(
                "Please make sure your Hostadress, port, username and password for REST-API (not SSH) login is correct."
                + "\nYou may test this by logging in into the SPP-Website with the used credentials.")
            raise ValueError(f"REST API login request not successfull.")
        try:
            self.__sessionid: str = response_json["sessionid"]
        except KeyError as error:
            ExceptionUtils.exception_info(error)
            raise ValueError("Login into SPP failed: No Session-ID received")

        (version, build) = self.get_spp_version_build()

        LOGGER.debug(f"SPP-Version: {version}, build {build}")
        LOGGER.debug(f"REST API Session ID: {self.__sessionid}")
        if(self.__verbose):
            LOGGER.info(f"REST API Session ID: {self.__sessionid}")
            LOGGER.info(f"SPP-Version: {version}, build {build}")

        self.__headers['X-Endeavour-Sessionid'] = self.__sessionid
Example #15
    def setup_db(self, database_name: str) -> None:
        if (not self.__client):
            raise ValueError(
                "Tried to setup DB while client wasn't connected.")
        try:
            # Check if database already exits -> nothing to do
            db_list: List[Dict[str, str]] = self.__client.get_list_database()
            if (database_name in map(lambda entry: entry["name"], db_list)):
                LOGGER.debug(f"SetupDB: DB {database_name} already exits")
                # nothing to do since db exits
                return

            # create db, nothing happens if it already exists
            self.__client.create_database(database_name)
            LOGGER.info(f"> Created database {database_name}")

            # Check if GrafanaReader exists and give him permissions
            user_list: List[Dict[str, str]] = self.__client.get_list_users()
            if (self.grafanaReader_name not in map(lambda entry: entry["user"],
                                                   user_list)):
                LOGGER.debug("SetupDB: Grafana User does not exits")
                ExceptionUtils.error_message(
                    f"WARNING: User '{self.grafanaReader_name}' does not exist"
                )
                # nothing to do since the GrafanaReader does not exist
                return
            self.__client.grant_privilege("read", database_name,
                                          self.grafanaReader_name)
            LOGGER.info(
                f"> Granted read privileges for user {self.grafanaReader_name} on db {database_name}"
            )

        except (ValueError, InfluxDBClientError, InfluxDBServerError,
                requests.exceptions.ConnectionError) as error:  # type: ignore
            ExceptionUtils.exception_info(error=error)  # type: ignore
            raise ValueError(
                f"Setup of the new database failed. Maybe the connection failed or the user '{self.__user}' has no admin privileges."
            )
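A short usage sketch of the idempotent behavior; the database name is made up and the client is assumed to be connected:

    influx_client.setup_db("sppmon_db")   # creates the db, grants the GrafanaReader read access
    influx_client.setup_db("sppmon_db")   # second call returns early: db already exists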
Example #16
    def __init__(self):
        self.log_path: str = ""
        """path to logger, set in set_logger."""
        self.pid_file_path: str = ""
        """path to pid_file, set in check_pid_file."""

        # string, because of time units like days ("60d")
        self.job_log_retention_time = "60d"
        """Configured SPP log retention time; logs get deleted after this time."""

        self.set_logger()

        if (not self.check_pid_file()):
            ExceptionUtils.error_message(
                "Another instance of sppmon with the same args is running")
            self.exit(ERROR_CODE_CMD_LINE)

        # everything should be an option; a positional arg is probably a typo.
        if (len(ARGS) > 0):
            ExceptionUtils.error_message(
                f"CAREFUL: ARG DETECTED, probably typing in programm call: {ARGS}"
            )

        time_stamp_name, time_stamp = SppUtils.get_capture_timestamp_sec()
        self.start_counter = time.perf_counter()
        LOGGER.debug("\n\n")
        LOGGER.debug(f"running script version: {VERSION}")
        LOGGER.debug(f"cmdline options: {OPTIONS}")
        LOGGER.debug(f"{time_stamp_name}: {time_stamp}")
        LOGGER.debug("")

        if (not OPTIONS.confFileJSON):
            ExceptionUtils.error_message("missing config file, aborting")
            self.exit(error_code=ERROR_CODE_CMD_LINE)
        try:
            config_file = SppUtils.read_conf_file(
                config_file_path=OPTIONS.confFileJSON)
        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message="Syntax Error in Config file, unable to read")
            self.exit(error_code=ERROR_CODE_CMD_LINE)

        self.setup_args()
        self.set_critial_configs(config_file)
        self.set_optional_configs(config_file)
Example #17
    def job_logs(self) -> None:
        """saves all jobLogs for the jobsessions in influx catalog.

        Make sure to call `get_all_jobs` before to aquire all jobsessions.
        In order to save them it deletes and rewrites all affected jobsession entrys.
        It automatically parses certain jobLogs into additional stats, defined by `supported_ids`.
        """

        # total count of requested logs
        logs_requested_total = 0
        # total count of inserted logs
        logs_to_stats_total = 0
        # should be equal, but on failure isn't (skipped logs)

        # list to be inserted after everything is updated
        job_update_list: List[Dict[str, Any]] = []

        LOGGER.info("> Requesting jobs with missing logs from influx database")

        table = self.__influx_client.database['jobs']
        # only store if there is something to store -> limited by the job log retention time.
        where_str = f'jobsLogsStored <> \'True\' and time > now() - {self.__job_log_retention_time}'
        where_str += f' AND time > now() - {table.retention_policy.duration}'

        # Select all jobs without joblogs
        keyword = Keyword.SELECT
        query = SelectionQuery(keyword=keyword,
                               tables=[table],
                               fields=['*'],
                               where_str=where_str)

        # send query and compute
        missing_logs_jobs_rs = self.__influx_client.send_selection_query(  # type: ignore
            query)

        # this list contains all jobs which are missing its Logs
        # Cast from resultset into list
        missing_logs_jobs: List[Dict[str, Any]] = list(
            missing_logs_jobs_rs.get_points())  # type: ignore

        LOGGER.info(
            f">>> Number of jobs with no joblogs stored in Influx database: {len(missing_logs_jobs)}"
        )

        LOGGER.info("> Requesting missing jobLogs from REST-API.")
        # request all jobLogs from REST-API
        # counter only for displaying purposes
        for counter, row in enumerate(missing_logs_jobs, 0):

            # Only print every 5 rows if not verbose
            # starts at 0, therefore already updated
            if (self.__verbose or counter % 5 == 0):
                LOGGER.info(
                    f">>> computed joblogs for {counter} / {len(missing_logs_jobs)} job sessions."
                )

            job_session_id: Optional[int] = row.get('id', None)

            # if somehow jobLogid is missing: skip
            # Should usually not happen
            if (job_session_id is None):
                ExceptionUtils.error_message(
                    f"Error: jobSessionId missing for row {row}")
                continue

            if (self.__verbose):
                LOGGER.info(
                    f">>> Requesting jobLogs {self.__job_log_types} for session {job_session_id}."
                )
            LOGGER.debug(
                f">>> Requesting jobLogs {self.__job_log_types} for session {job_session_id}."
            )

            try:
                # can't use `query_something` like in other places due to the extra params:
                # api_queries - query_something only works with no params

                # This list contains all joblogs for a single job-execution
                current_job_logs = self.__api_queries.get_job_log_details(
                    jobsession_id=job_session_id,
                    job_logs_types=self.__job_log_types,
                    request_ids=list(self.__supported_ids.keys()))
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    f"Error when api-requesting joblogs for job_session_id {job_session_id}, skipping it"
                )
                continue

            job_log_count = len(current_job_logs)
            logs_requested_total += job_log_count

            if (self.__verbose):
                LOGGER.info(
                    f">>> Found {job_log_count} logs for jobsessionId {job_session_id}"
                )
            LOGGER.debug(
                f"Found {job_log_count} logs for jobsessionId {job_session_id}"
            )

            # ####################################################################################
            # Compute results and save logs
            # #####################################################################################
            # The request of REST-API Logs is finished here
            # To not crash by saving 100.000+ Logs, directly compute results and insert them
            # ######################################################################################

            for job_log in current_job_logs:
                # add additional information from job-session itself
                job_log["jobId"] = row.get("jobId", None)
                job_log["jobName"] = row.get("jobName", None)
                job_log["jobExecutionTime"] = row.get("start", None)

                # rename for clarity
                job_log["jobLogId"] = job_log.pop("id", None)
                job_log["jobSessionId"] = job_log.pop("jobsessionId", None)

            # ##########################################################
            # compute jobLog-Stats into each associated table
            # ##########################################################
            try:
                self.__job_logs_to_stats(current_job_logs)
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error,
                    extra_message=
                    f"Failed parse jobLogs into its own table, skipping for jobsessionId {job_session_id}"
                )

            logs_to_stats_total += job_log_count

            # ##########################################################
            # save logs within the joblog-dump
            # ##########################################################

            # Only dump them after computing stats since they are read within the computing stats part
            for job_log in current_job_logs:
                # dump message params to allow saving as string
                job_log["messageParams"] = json.dumps(job_log["messageParams"])

            # if the list is empty (e.g. everything was erased) the insert simply returns and does nothing
            self.__influx_client.insert_dicts_to_buffer(
                list_with_dicts=current_job_logs, table_name="jobLogs")

            # shallow copy the dict to allow an update without errors
            copied_jobsession = dict(row.items())

            # update job table and set jobsLogsStored = True, jobLogsCount = len(jobLogDetails)
            update_fields = {
                "jobLogsCount": job_log_count,
                "jobsLogsStored": True
            }
            # update the fields
            for (key, value) in update_fields.items():
                copied_jobsession[key] = value
            job_update_list.append(copied_jobsession)

            # ##########################################################
            # End of For-Each
            # ##########################################################

        # ##########################################################
        # Delete each job, then re-insert
        # ##########################################################

        # Delete all jobs which got requested, no matter if failed
        delete_query = SelectionQuery(keyword=Keyword.DELETE,
                                      tables=[table],
                                      where_str=where_str)

        # now send remove query to prevent data loss
        self.__influx_client.send_selection_query(delete_query)  # type: ignore

        # Insert data after everything is completed
        self.__influx_client.insert_dicts_to_buffer(table.name,
                                                    job_update_list)

        if (logs_requested_total != logs_to_stats_total):
            LOGGER.info(
                f"> Requested a total of {logs_requested_total} but only computed {logs_to_stats_total} into sppmon statistics"
            )
        else:
            LOGGER.info(
                f">>> requested and computed a total of {logs_requested_total} logs"
            )

        LOGGER.info(f">> Updated a total of {len(job_update_list)} jobs")
Example #18
    def __add_predef_table(
        cls,
        name: str,
        fields: Dict[str, Datatype],
        tags: List[str],
        time_key: Optional[str] = None,
        retention_policy: Optional[RetentionPolicy] = None,
        continuous_queries: Optional[List[Union[ContinuousQuery,
                                                Callable[[Table, str],
                                                         ContinuousQuery]]]] = None
    ) -> None:
        """Declares a new predefined table. Recommended to to with every table you may want to insert into the influxdb.


        It is recommended to declare each param by name.
        If you do not declare the time_key, it will use sppmon capture time.
        Declare Retention Policy by ClassMethods declared above. Blank for `autogen`-RP (not recommended).
        Declare Continuous queries by using either the cq_template or creating your own.
        Be aware it is impossible to use `database["tablename"]` to gain an instance of a table here, as the table is not defined yet.

        Arguments:
            name {str} -- Name of the table/measurement
            fields {Dict[str, Datatype]} -- fields of the table. At least one entry; name as key, datatype as value.
            tags {List[str]} -- tags of the table. Always of datatype string

        Keyword Arguments:
            time_key {Optional[str]} -- Name of key used as timestamp. Blank if capturetime (default: {None})
            retention_policy {RetentionPolicy} -- Retention policy to be associated (default: {None})
            continuous_queries {List[Union[ContinuousQuery, Callable[[Table, str], ContinuousQuery]]]}
                -- List of either a CQ or a template which is transformed within this method (default: {None})
        """

        # create a retention instance out of the constructor methods
        if (not retention_policy):
            retention_policy = cls._RP_AUTOGEN()

        # add to save used policies
        cls.__database.retention_policies.add(retention_policy)

        # switch needed to allow the table's default value to be used
        # avoids redundant default declaration
        if (time_key):
            table = Table(database=cls.__database,
                          name=name,
                          fields=fields,
                          tags=tags,
                          time_key=time_key,
                          retention_policy=retention_policy)
        else:
            table = Table(database=cls.__database,
                          name=name,
                          fields=fields,
                          tags=tags,
                          retention_policy=retention_policy)
        cls.__database.tables[name] = table

        # save CQ
        if (continuous_queries):
            i = 0
            for continuous_query in continuous_queries:
                if (not isinstance(continuous_query, ContinuousQuery)):
                    continuous_query = continuous_query(
                        table, f"cq_{table.name}_{i}")
                    i += 1
                cls.__database.continuous_queries.add(continuous_query)

                # make sure the args exist
                if (continuous_query.select_query
                        and continuous_query.select_query.into_table):
                    cls.__database.retention_policies.add(
                        continuous_query.select_query.into_table.
                        retention_policy)
                else:
                    # regex parsing?
                    ExceptionUtils.error_message(
                        "Probably a programming error, report to DEV's. " +
                        f"Missing retention policy for CQ {continuous_query.name}."
                    )
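A hedged sketch of declaring a table with this method from inside the schema-declaring class (name mangling restricts the call to the class body); the field names, `Datatype` members and the `_RP_DAYS_90` constructor are assumptions, not the project's actual schema:

    # hedged sketch: field names, Datatype members and _RP_DAYS_90 are assumptions
    cls.__add_predef_table(
        name="cpuram",
        fields={"cpuUtil": Datatype.FLOAT, "memUtil": Datatype.FLOAT},
        tags=["host", "system"],
        time_key="captureTime",
        retention_policy=cls._RP_DAYS_90())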
Example #19
    def test_connection(influx_client: InfluxClient, rest_client: RestClient,
                        config_file: Dict[str, Any]):
        if (not config_file):
            raise ValueError("SPPmon does not work without a config file")

        LOGGER.info("Testing all connections required for SPPMon to work")
        working: bool = True  # SPPMon itself will finish successfully (no critical errors)
        no_warnings: bool = True  # SPPMon will finish without any warnings (no errors at all)

        # ## InfluxDB ##

        LOGGER.info("> Testing and configuring InfluxDB")
        try:
            influx_client.connect()
            influx_client.disconnect()
            if (not influx_client.use_ssl):
                ExceptionUtils.error_message(
                    "> WARNING: Mandatory SSL is disabled. We hightly recommend to enable it!"
                )
                no_warnings = False

            LOGGER.info("InfluxDB is ready for use")
        except ValueError as error:
            ExceptionUtils.exception_info(
                error,
                extra_message=
                "> Testing of the InfluxDB failed. This is a crictial component of SPPMon."
            )
            working = False

        # ## REST-API ##

        LOGGER.info("> Testing REST-API of SPP.")
        try:
            rest_client.login()
            (version_nr, build_nr) = rest_client.get_spp_version_build()
            LOGGER.info(
                f">> Sucessfully connected to SPP V{version_nr}, build {build_nr}."
            )
            rest_client.logout()
            LOGGER.info("> REST-API is ready for use")
        except ValueError as error:
            ExceptionUtils.exception_info(
                error,
                extra_message=
                "> Testing of the REST-API failed. This is a crictial component of SPPMon."
            )
            working = False

        # ## SSH-CLIENTS ##

        LOGGER.info(
            "> Testing all types of SSH-Clients: Server, VAPDs, vSnaps, Cloudproxy and others"
        )
        ssh_working = True  # The arg --ssh will finish without any error at all

        # Count of clients checks
        ssh_clients: List[SshClient] = SshMethods.setup_ssh_clients(
            config_file)
        if (not ssh_clients):
            ExceptionUtils.error_message(
                ">> No SSH-clients detected at all. At least the server itself should be added for process-statistics."
            )
            ssh_working = False
        else:
            for ssh_type in SshTypes:
                if (not list(
                        filter(lambda client: client.client_type == ssh_type,
                               ssh_clients))):
                    LOGGER.info(f">> No {ssh_type.name} client detected.")

                    if (ssh_type == SshTypes.SERVER):
                        ExceptionUtils.error_message(
                            ">> Critical: Without the server as an ssh client you won't have any process statistics available. These are a key part of SPPMon."
                        )
                        ssh_working = False  # No error, but still critical

                    if (ssh_type == SshTypes.VSNAP):
                        LOGGER.info(
                            ">> WARNING: Without vSnap as ssh client you have no access to storage information. You may add vSnap's for additional monitoring and alerts."
                        )
                        no_warnings = False  # ssh will still work, but that's definitely a warning

            ssh_methods: SshMethods = SshMethods(influx_client, config_file,
                                                 False)
            # Connection check
            LOGGER.info(
                f">> Testing now connection and commands of {len(ssh_clients)} registered ssh-clients."
            )
            for client in ssh_clients:
                try:
                    client.connect()
                    client.disconnect()

                    error_count: int = len(ExceptionUtils.stored_errors)
                    MethodUtils.ssh_execute_commands(
                        ssh_clients=[client],
                        ssh_type=client.client_type,
                        command_list=ssh_methods.client_commands[
                            client.client_type] + ssh_methods.all_command_list)
                    if (len(ExceptionUtils.stored_errors) != error_count):
                        ssh_working = False
                        ExceptionUtils.error_message(
                            f"Not all commands available for client {client.host_name} with type: {client.client_type}.\n"
                            +
                            "Please check manually if the commands are installed and their output."
                        )

                except ValueError as error:
                    ExceptionUtils.exception_info(
                        error,
                        extra_message=
                        f"Connection failed for client {client.host_name} with type: {client.client_type}."
                    )
                    ssh_working = False

        if (ssh_working):
            LOGGER.info("> Testing of SSH-clients sucessfull.")
        else:
            LOGGER.info(
                "> Testing of SSH-clients failed! SPPMon will still work, not all informations are available."
            )
            no_warnings = False

        # #### Conclusion ####

        if (working and no_warnings):
            LOGGER.info(
                "> All components tested sucessfully. SPPMon is ready to be used!"
            )
        elif (working):
            LOGGER.info(
                "> Testing partially sucessful. SPPMon will run, but please check the warnings."
            )
        else:
            LOGGER.info(
                "> Testing failed. SPPMon is not ready to be used. Please fix the connection issues."
            )
Example #20
    def main(self):

        LOGGER.info("Starting argument execution")

        if (not self.influx_client):
            ExceptionUtils.error_message(
                "somehow no influx client is present even after init")
            self.exit(ERROR_CODE)

        # ##################### SYSTEM METHODS #######################
        if (self.sites and self.system_methods):
            try:
                self.system_methods.sites()
                self.influx_client.flush_insert_buffer()
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting sites, skipping them all")

        if (self.cpu and self.system_methods):
            try:
                self.system_methods.cpuram()
                self.influx_client.flush_insert_buffer()
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when collecting cpu stats, skipping them all"
                )

        if (self.spp_catalog and self.system_methods):
            try:
                self.system_methods.sppcatalog()
                self.influx_client.flush_insert_buffer()
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when collecting file system stats, skipping them all"
                )

        # ####################### JOB METHODS ########################
        if (self.jobs and self.job_methods):
            # store all jobs grouped by jobID
            try:
                self.job_methods.get_all_jobs()
                self.influx_client.flush_insert_buffer()
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting jobs, skipping them all")

        if (self.job_logs and self.job_methods):
            # store all job logs per job session instance
            try:
                self.job_methods.job_logs()
                self.influx_client.flush_insert_buffer()
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting job logs, skipping them all"
                )

        # ####################### SSH METHODS ########################
        if (self.ssh and self.ssh_methods):
            # execute ssh statements for, VSNAP, VADP, other ssh hosts
            # store all job logs per job session instance
            try:
                self.ssh_methods.ssh()
                self.influx_client.flush_insert_buffer()
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when excecuting ssh commands, skipping them all"
                )

        # ################### HYPERVISOR METHODS #####################
        if (self.vms and self.protection_methods):
            try:
                self.protection_methods.store_vms()
                self.influx_client.flush_insert_buffer()
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting all VMs, skipping them all"
                )

        if (self.sla_stats and self.protection_methods):
            # number of VMs per SLA and sla dumps
            try:
                self.protection_methods.vms_per_sla()
                self.protection_methods.sla_dumps()
                self.influx_client.flush_insert_buffer()
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting and computing VMs per sla, skipping them all"
                )

        if (self.vm_stats and self.protection_methods):
            # retrieve and calculate VM inventory summary
            try:
                self.protection_methods.create_inventory_summary()
                self.influx_client.flush_insert_buffer()
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when creating inventory summary, skipping them all"
                )

        if (self.vadps and self.protection_methods):
            try:
                self.protection_methods.vadps()
                self.influx_client.flush_insert_buffer()
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting vadps, skipping them all")

        if (self.storages and self.protection_methods):
            try:
                self.protection_methods.storages()
                self.influx_client.flush_insert_buffer()
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when collecting storages, skipping them all"
                )

        # ###################### OTHER METHODS #######################

        if (ARGS.copy_database):
            try:
                self.influx_client.copy_database(ARGS.copy_database)
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message="Top-level-error when coping database.")

        # ################### NON-SETUP-METHODS #######################

        if (ARGS.test):
            try:
                TestingMethods.test_connection(config_file=self.config_file,
                                               influx_client=self.influx_client,
                                               rest_client=self.rest_client)
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message="Top-level-error when testing connection.")

        # DEPRECATED TODO REMOVE NEXT VERSION
        if (ARGS.create_dashboard):
            try:
                ExceptionUtils.error_message(
                    "This method is deprecated. You do not need to manually create a dashboard anymore.\n"
                    +
                    "Please just select the datasource when importing the regular 14-day dashboard in grafana.\n"
                    +
                    "Devs may adjust their dashboard to be generic with the scripts/generifyDashboard.py script."
                )
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message="Top-level-error when creating dashboard")

        self.exit()
Example #21
    def split_by_table_def(
        self, mydict: Dict[str, Any]
    ) -> Tuple[Dict[str, Any], Dict[str, Any], Union[str, int, None]]:
        """Split the given dict into a pre-defined set of tags, fields and a timestamp.

        None-values and empty strings are ignored.
        If there are no fields declared, it will split by a default pattern.
        Undeclared columns are reported as an error and added as regular fields.
        This function uses the tag/field and timestamp definition declared within this table.

        Arguments:
            self {Table} -- Table with predefined set of tags and fields
            mydict {Dict[str, Any]} -- dict with columns as keys. None-values are ignored

        Raises:
            ValueError: If no dict is given or not of type dict.

        Returns:
            Tuple[Dict[str, Any], Dict[str, Any], Union[str, int, None]] -- tuple of: tags, fields, timestamp
        """

        if (not mydict):
            raise ValueError("need at least one value in dict to split")

        # if table is not defined use default split
        if (not self.fields):
            return InfluxUtils.default_split(mydict=mydict)

        # fill dicts
        # table.fields is a dict, we only need the keys
        fields: Dict[str, Any] = dict.fromkeys(self.fields.keys(), None)
        tags: Dict[str, Any] = dict.fromkeys(self.tags, None)

        # what field should be recorded as time
        time_stamp_field = self.time_key
        # helper variable to only overwrite if it is not the time_stamp_field
        time_overwrite_allowed = True
        # the actually saved timestamp
        time_stamp: Union[str, int, None] = None

        for (key, value) in mydict.items():

            # Ignore empty entries
            if (value is None or (isinstance(value, str) and not value)):
                continue

            # Check timestamp value if it matches any of predefined time names
            if (key in time_stamp_field or key in InfluxUtils.time_key_names):

                # sppmonCTS has lowest priority, only set if otherwise None
                if (time_stamp is None and key == SppUtils.capture_time_key):
                    time_stamp = value

                # time_stamp_field is highest priority. Do not overwrite it.
                elif (key == time_stamp_field):
                    time_overwrite_allowed = False
                    time_stamp = value

                # if time_stamp_field is not used yet, overwrite sppmonCaptureTime or others
                elif (time_overwrite_allowed):
                    time_stamp = value

                # if no overwrite allowed, continue and drop field
                else:
                    continue

            # Otherwise check for Keys or Fields
            if (key in fields):
                fields[key] = value
            elif (key in tags):
                tags[key] = value
            elif (key in InfluxUtils.time_key_names
                  or key in time_stamp_field):
                continue
            else:
                ExceptionUtils.error_message(
                    f"Not all columns for table {self.name} are declared: {key}"
                )
                # before key+"MISSING" : Removed to avoid death-circle on repeated queries.
                fields[key] = value
        return (tags, fields, time_stamp)
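A hedged usage sketch, assuming `jobs_table` was declared with `fields={"duration": ...}`, `tags=["jobName"]` and `time_key="start"` (all assumptions):

    (tags, fields, time_stamp) = jobs_table.split_by_table_def({
        "jobName": "backup_vms",
        "duration": 1200,
        "start": 1607693253000,
        "surprise": "x",          # undeclared -> error message, still kept as a field
    })
    # tags       -> {"jobName": "backup_vms"}
    # fields     -> {"duration": 1200, "surprise": "x"}
    # time_stamp -> 1607693253000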
Example #22
0
    def store_script_metrics(self) -> None:
        """Stores script metrics into influxb. To be called before exit.

        Does not raise any exceptions, skips if influxdb is missing.
        """
        LOGGER.info("Storing script metrics")
        try:
            if (not self.influx_client):
                raise ValueError("no influxClient set up")
            insert_dict: Dict[str, Union[str, int, float, bool]] = {}

            # add version numbers, API calls are needed
            insert_dict["sppmon_version"] = VERSION
            insert_dict["influxdb_version"] = self.influx_client.version
            if (self.rest_client):
                try:
                    (version_nr,
                     build) = self.rest_client.get_spp_version_build()
                    insert_dict["spp_version"] = version_nr
                    insert_dict["spp_build"] = build
                except ValueError as error:
                    ExceptionUtils.exception_info(
                        error=error,
                        extra_message="could not query SPP version and build.")

            # end total sppmon runtime
            end_counter = time.perf_counter()
            insert_dict['duration'] = int(
                (end_counter - self.start_counter) * 1000)

            # add arguments of sppmon
            for (key, value) in vars(ARGS).items():
                # value is either a string, True, or False/None; only truthy values are stored
                if (value):
                    insert_dict[key] = value

            # save the errors that occurred
            error_count = len(ExceptionUtils.stored_errors)
            if (error_count > 0):
                ExceptionUtils.error_message(
                    f"total of {error_count} exception/s occured")
            insert_dict['errorCount'] = error_count
            # save list as str if not empty
            if (ExceptionUtils.stored_errors):
                insert_dict['errorMessages'] = str(
                    ExceptionUtils.stored_errors)

            # get end timestamp
            (time_key, time_val) = SppUtils.get_capture_timestamp_sec()
            insert_dict[time_key] = time_val

            # save the metrics
            self.influx_client.insert_dicts_to_buffer(
                table_name="sppmon_metrics", list_with_dicts=[insert_dict])
            self.influx_client.flush_insert_buffer()
            LOGGER.info("Stored script metrics sucessfull")
            # + 1 due the "total of x exception/s occured"
            if (error_count + 1 < len(ExceptionUtils.stored_errors)):
                ExceptionUtils.error_message(
                    "A non-critical error occured while storing script metrics. \n\
                    This error can't be saved into the DB, it's only displayed within the logs."
                )
        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message=
                "Error when storing sppmon-metrics, skipping this step. Possible insert-buffer data loss"
            )
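
# Editor's note -- the `duration` metric above is the total wall-clock runtime
# in milliseconds, taken from two time.perf_counter() readings. A minimal,
# self-contained demonstration of that pattern (the sleep stands in for the
# actual sppmon run):
import time

start_counter = time.perf_counter()
time.sleep(0.05)  # placeholder workload
duration_ms = int((time.perf_counter() - start_counter) * 1000)
print(f"duration: {duration_ms} ms")  # roughly 50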
Example #23
0
    def main(self):

        LOGGER.info("Starting argument execution")

        if (not self.influx_client):
            ExceptionUtils.error_message(
                "somehow no influx client is present even after init")
            self.exit(ERROR_CODE)

        # ##################### SYSTEM METHODS #######################
        if (self.sites and self.system_methods):
            try:
                self.system_methods.sites()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting sites, skipping them all")

        if (self.cpu and self.system_methods):
            try:
                self.system_methods.cpuram()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when collecting cpu stats, skipping them all"
                )

        if (self.spp_catalog and self.system_methods):
            try:
                self.system_methods.sppcatalog()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when collecting file system stats, skipping them all"
                )

        # ####################### JOB METHODS ########################
        if (self.jobs and self.job_methods):
            # store all jobs grouped by jobID
            try:
                self.job_methods.get_all_jobs()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting jobs, skipping them all")

        if (self.job_logs and self.job_methods):
            # store all job logs per job session instance
            try:
                self.job_methods.job_logs()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting job logs, skipping them all"
                )

        # ####################### SSH METHODS ########################
        if (self.ssh and self.ssh_methods):
            # execute ssh statements for VSNAP, VADP and other ssh hosts
            try:
                self.ssh_methods.ssh()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when excecuting ssh commands, skipping them all"
                )

        # ################### HYPERVISOR METHODS #####################
        if (self.vms and self.protection_methods):
            try:
                self.protection_methods.store_vms()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting all VMs, skipping them all"
                )

        if (self.sla_stats and self.protection_methods):
            # number of VMs per SLA and sla dumps
            try:
                self.protection_methods.vms_per_sla()
                self.protection_methods.sla_dumps()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting and computing VMs per sla, skipping them all"
                )

        if (self.vm_stats and self.protection_methods):
            # retrieve and calculate VM inventory summary
            try:
                self.protection_methods.create_inventory_summary()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when creating inventory summary, skipping them all"
                )

        if (self.vadps and self.protection_methods):
            try:
                self.protection_methods.vadps()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting vadps, skipping them all")

        if (self.storages and self.protection_methods):
            try:
                self.protection_methods.storages()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when collecting storages, skipping them all"
                )

        # ###################### OTHER METHODS #######################

        if (OPTIONS.copy_database):
            try:
                self.influx_client.copy_database(OPTIONS.copy_database)
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message="Top-level-error when coping database.")

        # ################### NON-SETUP-METHODS #######################

        if (OPTIONS.test):
            try:
                OtherMethods.test_connection(self.influx_client,
                                             self.rest_client,
                                             self.config_file)
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message="Top-level-error when testing connection.")

        if (OPTIONS.create_dashboard):
            try:
                if (not OPTIONS.dashboard_folder_path):
                    ExceptionUtils.error_message(
                        "Only use --create_dashboard in combination with --dashboard_folder_path=\"PATH/TO/GRAFANA/FOLDER/\""
                    )
                else:
                    OtherMethods.create_dashboard(
                        dashboard_folder_path=OPTIONS.dashboard_folder_path,
                        database_name=self.influx_client.database.name)
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message="Top-level-error when creating dashboard")

        self.exit()
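
# Editor's note -- every branch of main() above repeats the same
# "run collector, flush buffer, log top-level ValueError" pattern. A hedged
# refactoring sketch of that pattern; `ExceptionUtils` is the helper used
# throughout this module, everything else here is hypothetical:
from typing import Callable

def run_guarded(collect: Callable[[], None], influx_client, what: str) -> None:
    """Runs one collector and flushes the buffer, logging instead of raising."""
    try:
        collect()
        influx_client.flush_insert_buffer()
    except ValueError as error:
        ExceptionUtils.exception_info(
            error=error,
            extra_message=f"Top-level-error when {what}, skipping them all")

# usage sketch:
# run_guarded(self.system_methods.sites, self.influx_client, "requesting sites")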
Example #24
0
    def set_optional_configs(self, config_file: Dict[str, Any]) -> None:
        """Sets up any optional infrastructure, to be called within the init.

        Be aware not everything may be initialized at call time.
        Add config here if the system should not abort if it is missing.

        Arguments:
            config_file {Dict[str, Any]} -- Opened Config file
        """

        if (not config_file):
            ExceptionUtils.error_message(
                "missing or empty config file, aborting.")
            self.exit(error_code=ERROR_CODE_START_ERROR)
        if (not self.influx_client):
            ExceptionUtils.error_message(
                "Influx client is somehow missing. aborting")
            self.exit(error_code=ERROR_CODE)

        # ############################ REST-API #####################################
        try:
            ConnectionUtils.verbose = ARGS.verbose
            # ### Loaded Systems part 1/2 ### #
            if (ARGS.minimumLogs or ARGS.loadedSystem):
                # Setting pagesize scaling settings
                ConnectionUtils.timeout_reduction = self.loaded_timeout_reduction
                ConnectionUtils.allowed_send_delta = self.loaded_allowed_send_delta
                ConnectionUtils.max_scaling_factor = self.loaded_max_scaling_factor

                # Setting RestClient request settings.
                self.rest_client = RestClient(
                    config_file=config_file,
                    initial_connection_timeout=self.initial_connection_timeout,
                    pref_send_time=self.loaded_pref_send_time,
                    request_timeout=self.loaded_request_timeout,
                    max_send_retries=self.loaded_max_send_retries,
                    starting_page_size=self.loaded_starting_page_size,
                    min_page_size=self.loaded_min_page_size,
                    verbose=ARGS.verbose)
            else:
                ConnectionUtils.timeout_reduction = self.timeout_reduction
                ConnectionUtils.allowed_send_delta = self.allowed_send_delta
                ConnectionUtils.max_scaling_factor = self.max_scaling_factor

                # Setting RestClient request settings.
                self.rest_client = RestClient(
                    config_file=config_file,
                    initial_connection_timeout=self.initial_connection_timeout,
                    pref_send_time=self.pref_send_time,
                    request_timeout=self.request_timeout,
                    max_send_retries=self.max_send_retries,
                    starting_page_size=self.starting_page_size,
                    min_page_size=self.min_page_size,
                    verbose=ARGS.verbose)

            self.api_queries = ApiQueries(self.rest_client)
            if (not self.ignore_setup):
                # delay the connect into the testing phase
                self.rest_client.login()

        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message="REST-API is not available due Config error")
            # Required to declare variable
            self.rest_client = None
            self.api_queries = None

        # ######################## System, Job and Hypervisor Methods ##################
        try:
            # set up explicitly ahead due to dependencies
            self.system_methods = SystemMethods(self.influx_client,
                                                self.api_queries, ARGS.verbose)
        except ValueError as error:
            ExceptionUtils.exception_info(error=error)

        # ### Full Logs ### #
        if (ARGS.fullLogs):
            given_log_types = self.full_joblog_types
        else:
            given_log_types = self.joblog_types

        try:
            auth_rest: Dict[str, Any] = SppUtils.get_cfg_params(
                param_dict=config_file, param_name="sppServer")  # type: ignore
            # TODO DEPRECATED TO BE REMOVED IN 1.1
            self.job_log_retention_time = auth_rest.get(
                "jobLog_rentation",
                auth_rest.get("jobLog_retention", self.job_log_retention_time))
            # TODO New once 1.1 is live
            #self.job_log_retention_time = auth_rest.get("jobLog_retention", self.job_log_retention_time)

            self.job_methods = JobMethods(self.influx_client, self.api_queries,
                                          self.job_log_retention_time,
                                          given_log_types, ARGS.verbose)
        except ValueError as error:
            ExceptionUtils.exception_info(error=error)

        try:
            # dependent on system methods
            self.protection_methods = ProtectionMethods(
                self.system_methods, self.influx_client, self.api_queries,
                ARGS.verbose)
        except ValueError as error:
            ExceptionUtils.exception_info(error=error)

        # ############################### SSH #####################################
        if (self.ssh and not self.ignore_setup):
            try:
                # set from None to methods once finished
                self.ssh_methods = SshMethods(influx_client=self.influx_client,
                                              config_file=config_file,
                                              verbose=ARGS.verbose)

            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "SSH-Commands are not available due Config error")
                # Variable needs to be declared
                self.ssh_methods = None
        else:
            # Variable needs to be declared
            self.ssh_methods = None
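
# Editor's note -- an illustrative assumption about the config file shape read
# by set_optional_configs() above. Only the jobLog keys are confirmed by this
# excerpt; the connection fields are hypothetical placeholders:
example_config = {
    "sppServer": {
        "srv_address": "192.0.2.10",  # hypothetical
        "srv_port": 443,              # hypothetical
        "jobLog_retention": "60d",    # current key name
    }
}

# The deprecated, misspelled "jobLog_rentation" key wins if present,
# mirroring the nested .get() fallback above ("48h" is a made-up default):
retention = example_config["sppServer"].get(
    "jobLog_rentation",
    example_config["sppServer"].get("jobLog_retention", "48h"))
print(retention)  # -> 60d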
Example #25
0
    def setup_args(self) -> None:
        """This method set up all required parameters and transforms arg groups into individual args.
        """
        # ## call functions based on cmdline parameters

        # Temporary features / Deprecated

        if (ARGS.minimumLogs):
            ExceptionUtils.error_message(
                "DEPRECATED: using deprecated argument '--minimumLogs'. Use '--loadedSystem' instead."
            )
        if (ARGS.processStats):
            ExceptionUtils.error_message(
                "DEPRECATED: using deprecated argument '--processStats'. Use '--ssh' instead."
            )

        # ignore setup args
        self.ignore_setup: bool = (ARGS.create_dashboard
                                   or bool(ARGS.dashboard_folder_path)
                                   or ARGS.test)
        if (self.ignore_setup):
            ExceptionUtils.error_message(
                "> WARNING: An option for a utility operation has been specified.  Bypassing normal SPPMON operation."
            )

        if ((ARGS.create_dashboard or bool(ARGS.dashboard_folder_path))
                and not (ARGS.create_dashboard
                         and bool(ARGS.dashboard_folder_path))):
            ExceptionUtils.error_message(
                "> Using --create_dashboard without associated folder path. Aborting."
            )
            self.exit(ERROR_CODE_CMD_ARGS)

        # incremental setup, higher executes all below
        all_args: bool = ARGS.all
        daily: bool = ARGS.daily or all_args
        hourly: bool = ARGS.hourly or daily
        constant: bool = ARGS.constant or hourly

        # ######## All Methods #################

        self.sites: bool = ARGS.sites or all_args

        # ######## Daily Methods ###############

        self.vms: bool = ARGS.vms or daily
        self.job_logs: bool = ARGS.jobLogs or daily
        self.sla_stats: bool = ARGS.slaStats or daily
        self.vm_stats: bool = ARGS.vmStats or daily

        # ######## Hourly Methods ##############

        self.jobs: bool = ARGS.jobs or hourly
        self.vadps: bool = ARGS.vadps or hourly
        self.storages: bool = ARGS.storages or hourly
        # ssh vsnap pools ?

        # ######## Constant Methods ############

        self.ssh: bool = ARGS.ssh or constant
        self.cpu: bool = ARGS.cpu or constant
        self.spp_catalog: bool = ARGS.sppcatalog or constant
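
# Editor's note -- a tiny standalone illustration of the incremental flag
# cascade above: each tier implies every tier below it. The dict stands in
# for the parsed ARGS namespace (values are made up):
args = {"all": False, "daily": True, "hourly": False, "constant": False}

all_args = args["all"]
daily = args["daily"] or all_args
hourly = args["hourly"] or daily
constant = args["constant"] or hourly
# --daily alone also enables all hourly and constant methods:
print(daily, hourly, constant)  # -> True True True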
Example #26
0
    def transfer_data(self, old_database_name: Optional[str] = None) -> None:
        # ######################   DISCLAIMER   ######################
        # ###################  TEMPORARY FEATURE  ####################
        # this part will be deleted once all old versions of SPPMon have been migrated
        # use with caution
        # ############################################################
        if (not old_database_name):
            old_database_name = self.database.name
        LOGGER.info(
            f"transferring the data from database {old_database_name} into {self.database.name}."
        )
        LOGGER.info("Computing queries to be sent to the server.")

        queries: List[str] = []
        # copy each table into its new counterpart; data older than the RP duration will be dropped
        for table in self.database.tables.values():
            query_str = f"SELECT * INTO {table} FROM {old_database_name}.autogen.{table.name} WHERE time > now() - {table.retention_policy.duration} GROUP BY *"
            queries.append(query_str)
        # Compute the dropped data CQ-like into the new tables.
        for con_query in self.database.continuous_queries:
            if (con_query.select_query):
                query_str: str = con_query.select_query.to_query()

                # replacing the rp within the string is easier than anything else

                match = re.search(r"(FROM ((.+)\.(.+)\..+) GROUP BY)",
                                  query_str)
                if (not match):
                    raise ValueError("error when matching")

                from_clause = match.group(1)
                full_qualified_table = match.group(2)
                database_str = match.group(3)
                rp_str = match.group(4)

                new_f_q_t = full_qualified_table.replace(
                    database_str, old_database_name)
                new_f_q_t = new_f_q_t.replace(rp_str, "autogen")

                if (con_query.select_query.into_table is None):
                    ExceptionUtils.error_message(
                        f"unable to process the query due an internal error: {query_str}"
                    )
                    continue
                if (con_query.select_query.into_table.retention_policy.duration
                        != '0s'):
                    # add a where clause to prevent data loss due to the overflowing retention drop.
                    if (re.search("WHERE", new_f_q_t)):
                        new_f_q_t += " AND "
                    else:
                        new_f_q_t += " WHERE "
                    new_f_q_t += f"time > now() - {con_query.select_query.into_table.retention_policy.duration}"

                # insert new where clause into the match
                new_from_clause = from_clause.replace(full_qualified_table,
                                                      new_f_q_t)
                new_query_str = query_str.replace(from_clause, new_from_clause)

                queries.append(new_query_str)

        LOGGER.info("Finished Computing, starting to send.")

        # how many lines were transferred
        line_count: int = 0
        # how often was a query partially written, not line count!
        dropped_count: int = 0
        # how often was data dropped above the 10,000 limit?
        critical_drop: int = 0
        LOGGER.info("starting transfer of data")

        # disable timeout
        old_timeout = self.__client._timeout
        self.__client = InfluxDBClient(  # type: ignore
            host=self.__address,
            port=self.__port,
            username=self.__user,
            password=self.__password,
            ssl=self.__use_ssl,
            verify_ssl=self.__verify_ssl,
            timeout=7200)
        # ping to make sure connection works
        version: str = self.__client.ping()
        LOGGER.info(
            f"Connected again to influxdb with new timeout of {self.__client._timeout}, version: {version}"
        )
        i = 0

        for query in queries:
            try:
                start_time = time.perf_counter()
                # it seems you may only send one SELECT INTO at a time via Python
                result = self.__client.query(  # type: ignore
                    query=query,
                    epoch='s',
                    database=self.database.name)
                end_time = time.perf_counter()

                # count lines written; at most one result point per query
                for result in result.get_points():
                    i += 1
                    line_count += result["written"]
                    LOGGER.info(
                        f'query {i}/{len(queries)}: {result["written"]} lines in {end_time-start_time}'
                    )

            except InfluxDBClientError as error:
                # only raise if the error is unexpected
                if (re.search(
                        "partial write: points beyond retention policy dropped=10000",
                        error.content)):
                    critical_drop += 1
                    raise ValueError(
                        "transfer of data failed, retry manually with a shorter WHERE-clause",
                        query)
                if (re.search(
                        "partial write: points beyond retention policy dropped=",
                        error.content)):
                    dropped_count += 1
                else:
                    ExceptionUtils.exception_info(
                        error=error,
                        extra_message=
                        f"transfer of data failed for query {query}")
                    critical_drop += 1

            except (InfluxDBServerError,
                    requests.exceptions.ConnectionError) as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=f"transfer of data failed for query {query}")
                critical_drop += 1

        # reset timeout
        self.__client = InfluxDBClient(  # type: ignore
            host=self.__address,
            port=self.__port,
            username=self.__user,
            password=self.__password,
            ssl=self.__use_ssl,
            verify_ssl=self.__verify_ssl,
            timeout=old_timeout)
        # ping to make sure connection works
        version: str = self.__client.ping()
        LOGGER.info(
            f"Connected again to influxdb with old timeout of {self.__client._timeout}, version: {version}"
        )

        LOGGER.info("transfer of data sucessfully")
        LOGGER.info(f"Total transfered {line_count} lines of results.")
        if (dropped_count):
            LOGGER.info(
                f"Could not count lines of {dropped_count} queries due to an expected error. No need for manual action."
            )
        if (critical_drop):
            LOGGER.info(
                f"Could not transfer data of {critical_drop} tables, check messages above to retry manually! "
                +
                "Please send the query manually with an adjusted 'from table': '$database.autogen.tablename'\n "
                +
                "Adjust other values as required. Drops due to the retention policy are 'OK' up to 10,000.\n"
                +
                "If it reaches 10,000 you need to cut the query into smaller bits."
            )
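
# Editor's note -- a self-contained demonstration of the FROM-clause rewrite
# used in transfer_data() above, with a made-up continuous-query string
# ("newdb", "olddb", "rp_90d" and the measurement names are hypothetical):
import re

query_str = ("SELECT mean(x) INTO newdb.rp_90d.cpu_avg "
             "FROM newdb.rp_90d.cpu GROUP BY time(1h)")
match = re.search(r"(FROM ((.+)\.(.+)\..+) GROUP BY)", query_str)
assert match
from_clause, full_table, database_str, rp_str = match.group(1, 2, 3, 4)

# point the source at the old database's autogen retention policy
new_table = full_table.replace(database_str, "olddb").replace(rp_str, "autogen")
print(query_str.replace(from_clause, from_clause.replace(full_table, new_table)))
# -> SELECT mean(x) INTO newdb.rp_90d.cpu_avg FROM olddb.autogen.cpu GROUP BY time(1h)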
Example #27
0
    def job_logs(self) -> None:
        """saves all jobLogs for the jobsessions stored in influx.

        Make sure to call `get_all_jobs` beforehand to acquire all jobsessions.
        In order to save them it deletes and rewrites all affected jobsession entries.
        It automatically parses certain jobLogs into additional stats, defined by `supported_ids`.
        """

        table = self.__influx_client.database['jobs']
        # only store if there is something to store -> limited by job log retention time.
        where_str = f'jobsLogsStored <> \'True\' and time > now() - {self.__job_log_retention_time}'
        where_str += f' AND time > now() - {table.retention_policy.duration}'

        jobs_updated = 0
        logs_total_count = 0
        LOGGER.info("> getting joblogs for jobsessions without saved logs")
        LOGGER.info(">> requesting jobList from database")

        # Select all jobs without joblogs
        keyword = Keyword.SELECT
        query = SelectionQuery(keyword=keyword,
                               tables=[table],
                               fields=['*'],
                               where_str=where_str)
        # send query and compute
        result = self.__influx_client.send_selection_query(  # type: ignore
            query)
        result_list: List[Dict[str, Any]] = list(
            result.get_points())  # type: ignore

        rows_affected = len(result_list)

        LOGGER.info(
            ">>> number of jobs with no joblogs stored in Influx database: {}".
            format(rows_affected))

        job_log_dict: Dict[int, List[Dict[str, Any]]] = {}

        # request all jobLogs from REST-API
        # if errors occur, skip single row and debug
        for row in result_list:
            job_session_id: Optional[int] = row.get('id', None)

            # if somehow id is missing: skip
            if (job_session_id is None):
                ExceptionUtils.error_message(
                    f"Error: joblogId missing for row {row}")
                continue

            if (job_session_id in job_log_dict):
                ExceptionUtils.error_message(
                    f"Error: joblogId duplicate, skipping: {job_session_id}")
                continue

            if (self.__verbose):
                LOGGER.info(
                    f">>> requested joblogs for {len(job_log_dict)} / {rows_affected} job sessions."
                )
            elif (len(job_log_dict) % 5 == 0):
                LOGGER.info(
                    f">>> requested joblogs for {len(job_log_dict)} / {rows_affected} job sessions."
                )

            # request job_session_id
            try:
                if (self.__verbose):
                    LOGGER.info(
                        f"requesting jobLogs {self.__job_log_type} for session {job_session_id}."
                    )
                LOGGER.debug(
                    f"requesting jobLogs {self.__job_log_type} for session {job_session_id}."
                )

                # can't use query_something like everywhere else due to the extra params needed
                job_log_list = self.__api_queries.get_job_log_details(
                    jobsession_id=job_session_id,
                    job_logs_type=self.__job_log_type)
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    f"error when api-requesting joblogs for job_session_id {job_session_id}, skipping it"
                )
                continue

            if (self.__verbose):
                LOGGER.info(
                    f">>> Found {len(job_log_list)} logs for jobsessionId {job_session_id}"
                )

            LOGGER.debug(
                f"Found {len(job_log_list)} logs for jobsessionId {job_session_id}"
            )
            # default to an empty list if no details are available -> should not happen, kept in for safety reasons
            # if this is None, go down to the rest client and fix it. It should be an empty list.
            if (job_log_list is None):
                job_log_list = []
                ExceptionUtils.error_message(
                    "A joblog_list was none, even if the type does not allow it. Please report to developers."
                )
            job_log_dict[job_session_id] = job_log_list

        # list to be inserted after everything is updated
        insert_list: List[Dict[str, Any]] = []

        # Query data in ranges to avoid too many requests
        # Results from first select query above
        for row in result_list:
            job_id: int = row['id']
            job_log_list: Optional[List[Dict[str, Any]]] = job_log_dict.get(
                job_id, None)

            if (job_log_list is None):
                ExceptionUtils.error_message(
                    f"missing job_log_list even though it is in influxdb for jobId {job_id}. Skipping it"
                )
                continue

            # jobLogsCount will be zero if jobLogs are deleted after X days by maintenance jobs, GUI default is 60 days
            job_logs_count = len(job_log_list)
            if (self.__verbose):
                LOGGER.info(
                    ">>> storing {} joblogs for jobsessionId: {} in Influx database"
                    .format(len(job_log_list), job_id))
            LOGGER.debug(
                ">>> storing {} joblogs for jobsessionId: {} in Influx database"
                .format(len(job_log_list), job_id))

            for job_log in job_log_list:
                # rename log keys and add additional information
                job_log["jobId"] = row.get("jobId", None)
                job_log["jobName"] = row.get("jobName", None)
                job_log["jobExecutionTime"] = row.get("start", None)
                job_log["jobLogId"] = job_log.pop("id")
                job_log["jobSessionId"] = job_log.pop("jobsessionId")

            # compute other stats out of jobList
            try:
                self.__job_logs_to_stats(job_log_list)
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error,
                    extra_message=
                    f"Failed to compute stats out of job logs, skipping for jobsessionId {job_id}"
                )

            for job_log in job_log_list:
                # dump message params to allow saving as string
                job_log["messageParams"] = json.dumps(job_log["messageParams"])

            # if the list is empty due to being erased etc. it will simply return and do nothing
            self.__influx_client.insert_dicts_to_buffer(
                list_with_dicts=job_log_list, table_name="jobLogs")

            jobs_updated += 1
            logs_total_count += job_logs_count
            # update job table and set jobsLogsStored = True, jobLogsCount = len(jobLogDetails)
            update_fields = {
                "jobLogsCount": job_logs_count,
                "jobsLogsStored": True
            }
            # copy dict to allow update without errors
            mydict = dict(row.items())
            # update fields
            for (key, value) in update_fields.items():
                mydict[key] = value
            insert_list.append(mydict)

        # Delete data to allow reinsert with different tags
        delete_query = SelectionQuery(keyword=Keyword.DELETE,
                                      tables=[table],
                                      where_str=where_str)

        # now send remove query to prevent data loss
        self.__influx_client.send_selection_query(delete_query)  # type: ignore

        # Insert data after everything is completed
        self.__influx_client.insert_dicts_to_buffer(table.name, insert_list)

        LOGGER.info(
            ">>> inserting a total of {} logs".format(logs_total_count))
Example #28
0
    def main(self):

        if (not self.influx_client):
            ExceptionUtils.error_message(
                "somehow no influx client is present even after init")
            self.exit(ERROR_CODE)

        # ##################### SYSTEM METHODS #######################
        if (self.sites and self.system_methods):
            try:
                self.system_methods.sites()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting sites, skipping them all")

        if (self.cpu and self.system_methods):
            try:
                self.system_methods.cpuram()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when collecting cpu stats, skipping them all"
                )

        if (self.spp_catalog and self.system_methods):
            try:
                self.system_methods.sppcatalog()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when collecting file system stats, skipping them all"
                )

        # ####################### JOB METHODS ########################
        if (self.jobs and self.job_methods):
            # store all jobs grouped by jobID
            try:
                self.job_methods.get_all_jobs()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting jobs, skipping them all")

        if (self.job_logs and self.job_methods):
            # store all job logs per job session instance
            try:
                self.job_methods.job_logs()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting job logs, skipping them all"
                )

        # ####################### SSH METHODS ########################
        if (self.ssh and self.ssh_methods):
            # execute ssh statements for VSNAP, VADP and other ssh hosts
            try:
                self.ssh_methods.ssh()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when excecuting ssh commands, skipping them all"
                )

        if (self.process_stats and self.ssh_methods):
            # execute process stats for server
            try:
                self.ssh_methods.process_stats()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when excecuting ssh process statistic commands, skipping them all"
                )

        # ################### HYPERVISOR METHODS #####################
        if (self.vms and self.hypervisor_methods):
            try:
                self.hypervisor_methods.store_vms()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting all VMs, skipping them all"
                )

        if (self.sla_stats and self.hypervisor_methods):
            # number of VMs per SLA and sla dumps
            try:
                self.hypervisor_methods.vms_per_sla()
                self.hypervisor_methods.sla_dumps()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting and computing VMs per sla, skipping them all"
                )

        if (self.vm_stats and self.hypervisor_methods):
            # retrieve and calculate VM inventory summary
            try:
                self.hypervisor_methods.create_inventory_summary()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when creating inventory summary, skipping them all"
                )

        if (self.vadps and self.hypervisor_methods):
            try:
                self.hypervisor_methods.vadps()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting vadps, skipping them all")

        if (self.storages and self.hypervisor_methods):
            try:
                self.hypervisor_methods.storages()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when collecting storages, skipping them all"
                )

        # ###################### OTHER METHODS #######################

        if (OPTIONS.create_dashboard):
            try:
                if (not self.influx_client):
                    raise ValueError(
                        "need the influxclient to create the dashboard")
                OtherMethods.create_dashboard(
                    dashboard_folder_path=OPTIONS.dashboard_folder_path,
                    database_name=self.influx_client.database.name)
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message="Top-level-error when creating dashboards")

        # ######################   DISCLAIMER   ######################
        # ###################  TEMPORARY FEATURE  ####################
        # this part will be deleted once all old versions of SPPMon have been migrated
        # use with caution
        # ############################################################
        if (OPTIONS.transfer_data):
            try:
                self.influx_client.transfer_data(OPTIONS.old_database)
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when transfering data storages.")

        self.exit()
Example #29
0
    def set_optional_configs(self, config_file: Dict[str, Any]) -> None:
        """Sets up any optional infrastructure, to be called within the init.

        Be aware not everything may be initialized at call time.
        Add config here if the system should not abort if it is missing.

        Arguments:
            config_file {Dict[str, Any]} -- Opened Config file
        """

        if (not config_file):
            ExceptionUtils.error_message(
                "missing or empty config file, aborting.")
            self.exit(error_code=ERROR_CODE_CMD_LINE)

        # ############################ REST-API #####################################
        try:
            auth_rest = SppUtils.get_cfg_params(param_dict=config_file,
                                                param_name="sppServer")

            if (not isinstance(auth_rest, dict)):
                raise ValueError("sppServer config need to be dict")

            self.job_log_retention_time = auth_rest.get(
                "jobLog_rentation", "60d")

            ConnectionUtils.verbose = OPTIONS.verbose
            ConnectionUtils.timeout_reduction = self.timeout_reduction
            ConnectionUtils.allowed_time_diff_quota = self.allowed_time_diff_quota
            ConnectionUtils.maximum_increase_pagesize = self.maximum_increase_pagesize

            if (OPTIONS.minimumLogs):
                rest_time_out = self.minimum_timeout
                rest_preferred_time = self.loaded_preferred_time
            else:
                rest_time_out = self.default_timeout
                rest_preferred_time = self.preferred_time

            self.rest_client = RestClient(auth_rest, rest_time_out,
                                          rest_preferred_time, self.page_size,
                                          self.min_page_size,
                                          self.send_retries, OPTIONS.verbose)

            self.api_queries = ApiQueries(self.rest_client)
            self.rest_client.login()

        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message="REST-API is not available due Config error")
            self.rest_client = None
            self.api_queries = None

        # ######################## System, Job and Hypervisor Methods ##################
        try:
            # set up explicitly ahead due to dependencies
            self.system_methods = SystemMethods(self.influx_client,
                                                self.api_queries,
                                                OPTIONS.verbose)
        except ValueError as error:
            ExceptionUtils.exception_info(error=error)

        try:
            self.job_methods = JobMethods(self.influx_client, self.api_queries,
                                          self.job_log_retention_time,
                                          self.minLogs_joblog_type,
                                          self.default_joblog_type,
                                          OPTIONS.verbose, OPTIONS.minimumLogs)
        except ValueError as error:
            ExceptionUtils.exception_info(error=error)

        try:
            # dependent on system methods
            self.hypervisor_methods = ProtectionMethods(
                self.system_methods, self.influx_client, self.api_queries,
                OPTIONS.verbose)
        except ValueError as error:
            ExceptionUtils.exception_info(error=error)

        # ############################### SSH #####################################
        if (self.ssh or self.process_stats):
            try:

                auth_ssh = SppUtils.get_cfg_params(param_dict=config_file,
                                                   param_name="sshclients")

                ssh_clients: List[SshClient] = []
                if (not isinstance(auth_ssh, list)):
                    raise ValueError("not a list of sshconfig given", auth_ssh)

                for client_ssh in auth_ssh:
                    try:
                        ssh_clients.append(SshClient(client_ssh))
                    except ValueError as error:
                        ExceptionUtils.exception_info(
                            error=error,
                            extra_message=
                            f"Setting up one client failed, skipping it. Client: \
                            {client_ssh.get('name', 'ERROR WHEN GETTING NAME')}"
                        )

                # set from None to methods once finished
                self.ssh_methods = SshMethods(influx_client=self.influx_client,
                                              ssh_clients=ssh_clients,
                                              verbose=OPTIONS.verbose)

            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "SSH-Commands are not available due Config error")
Example #30
0
    def flush_insert_buffer(self, fallback: bool = False) -> None:
        """Flushes the insert buffer, send querys to influxdb server.

        Sends in batches defined by `__batch_size` to reduce http overhead.
        Only send-statistics remain in buffer, flush again to send those too.
        Retries once into fallback mode if first request fails with modified settings.

        Keyword Arguments:
            fallback {bool} -- Whether to use fallback-options. Does not repeat on fallback (default: {False})

        Raises:
            ValueError: Critical: The query Buffer is None.
        """

        if (self.__insert_buffer is None):
            raise ValueError(
                "query buffer is somehow None, this should never happen!")
        # Only send if there is something to send
        if (not self.__insert_buffer):
            return

        # pre-save the keys to avoid a RuntimeError due to "dictionary keys changed during iteration"
        # happens due to the re-run changing the insert_buffer
        insert_keys = list(self.__insert_buffer.keys())
        for table in insert_keys:
            # get an empty list in case the key isn't valid anymore (due to the fallback option)
            queries = list(
                map(lambda query: query.to_query(),
                    self.__insert_buffer.get(table, [])))
            item_count = len(queries)
            if (item_count == 0):
                continue

            # measure time for the send progress
            if (not fallback):
                batch_size = self.__query_max_batch_size
            else:
                batch_size = self.__fallback_max_batch_size

            re_send: bool = False
            error_msg: Optional[str] = None
            start_time = time.perf_counter()
            try:
                self.__client.write_points(
                    points=queries,
                    database=self.database.name,
                    retention_policy=table.retention_policy.name,
                    batch_size=batch_size,
                    time_precision='s',
                    protocol='line')
                end_time = time.perf_counter()
            except InfluxDBClientError as error:  # type: ignore
                match = re.match(r".*partial write:[\s\w]+=(\d+).*",
                                 error.content)

                if (match and int(match.group(1)) < batch_size):
                    # beyond 10,000 everything will be lost, below still written
                    # ignore this case, it's unavoidable and doesn't change anything
                    pass
                elif (re.match(r".*partial write: unable to parse .*",
                               error.content)):
                    # some messages are lost, other written
                    ExceptionUtils.exception_info(
                        error=error,
                        extra_message=
                        f"Some messages were lost when sending buffer for table {table.name}, but everything else should be OK"
                    )
                    error_msg = getattr(error, 'message', repr(error))
                else:
                    ExceptionUtils.exception_info(
                        error=error,
                        extra_message=
                        f"Client error when sending insert buffer for table {table.name}."
                    )
                    error_msg = getattr(error, 'message', repr(error))
                    # re-try with a smaller batch size, unsure if this helps
                    re_send = True

            except (InfluxDBServerError, ConnectionError,
                    requests.exceptions.ConnectionError
                    ) as error:  # type: ignore
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    f"Connection error when sending insert buffer for table {table.name}."
                )
                error_msg = getattr(error, 'message', repr(error))
                re_send = True

            # measure timing
            end_time = time.perf_counter()

            # re-send once with fallback options if the first attempt failed
            if (re_send and not fallback):
                ExceptionUtils.error_message(
                    "Trying to send influx buffer again with fallback options")
                self.flush_insert_buffer(fallback=True)

            # default to None to avoid a KeyError if the table was popped during fallback
            self.__insert_buffer.pop(table, None)

            # add metrics for the next sending process.
            # compute duration, metrics computed per batch
            self.__insert_metrics_to_buffer(Keyword.INSERT,
                                            table,
                                            end_time - start_time,
                                            item_count,
                                            error=error_msg)
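
# Editor's note -- a self-contained demonstration of the partial-write
# handling above: the regex extracts how many points the server dropped, and
# drops below the batch size are tolerated. The error string and the batch
# size of 5000 are made up for illustration:
import re

error_content = "partial write: points beyond retention policy dropped=250"
match = re.match(r".*partial write:[\s\w]+=(\d+).*", error_content)
if (match and int(match.group(1)) < 5000):
    print(f"{match.group(1)} points dropped by the retention policy; safe to ignore")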