def get_all_jobs(self) -> None:
    """Capture the information of every job returned by the REST-API job list.

    Queries the job list and passes each job id to `__job_by_id`. According to the
    original description this incrementally saves all stored jobsessions, even
    before the first execution of sppmon.

    Jobs lacking an id or a name are reported and skipped; a `ValueError` raised
    while capturing a single job is reported and the loop carries on.
    """
    all_jobs = MethodUtils.query_something(
        name="job list",
        source_func=self.__api_queries.get_job_list)

    for job in all_jobs:
        job_id = job.get("id", None)
        job_name = job.get("name", None)

        # truthiness test on purpose: empty strings count as missing as well
        if not (job_id and job_name):
            ExceptionUtils.error_message(
                f"skipping, missing name or id for job {job}")
            continue

        LOGGER.info(
            ">> capturing Job information for Job \"{}\"".format(job_name))

        try:
            self.__job_by_id(job_id=job_id)
        except ValueError as error:
            # a single failing job must not stop the remaining ones
            ExceptionUtils.exception_info(
                error=error,
                extra_message=f"error when getting jobs for {job_name}, skipping it")
def login(self) -> None:
    """Logs in into the REST-API. Call this before using any methods.

    Builds the server URL from the configured address and port, posts the
    basic-auth credentials to the session endpoint and stores the returned
    session id, which is finally registered as `X-Endeavour-Sessionid` header.

    Raises:
        ValueError: The login request failed.
        ValueError: The response did not contain a session id.
    """
    http_auth: HTTPBasicAuth = HTTPBasicAuth(self.__username, self.__password)  # type: ignore
    self.__srv_url = "https://{srv_address}:{port}".format(
        srv_address=self.__srv_address, port=self.__srv_port)
    endpoint = "/api/endeavour/session"

    LOGGER.debug(f"login to SPP REST API server: {self.__srv_url}")
    if self.__verbose:
        LOGGER.info(f"login to SPP REST API server: {self.__srv_url}")

    try:
        response_json = self.post_data(endpoint=endpoint, auth=http_auth)  # type: ignore
    except ValueError as error:
        ExceptionUtils.exception_info(error=error)
        ExceptionUtils.error_message(
            "Please make sure your Hostadress, port, username and password for REST-API (not SSH) login is correct."
            + "\nYou may test this by logging in into the SPP-Website with the used credentials.")
        # keep the original failure attached as the cause
        raise ValueError("REST API login request not successfull.") from error

    # An empty or missing session id means the login did not work; fail here
    # instead of sending an empty session header with every later request.
    session_id = response_json.get("sessionid", "")
    if not session_id:
        raise ValueError("Login into SPP failed: No Session-ID received")
    self.__sessionid: str = session_id

    (version, build) = self.get_spp_version_build()

    LOGGER.debug(f"SPP-Version: {version}, build {build}")
    LOGGER.debug(f"REST API Session ID: {self.__sessionid}")
    if self.__verbose:
        LOGGER.info(f"REST API Session ID: {self.__sessionid}")
        LOGGER.info(f"SPP-Version: {version}, build {build}")

    self.__headers['X-Endeavour-Sessionid'] = self.__sessionid
def set_critial_configs(self, config_file: Dict[str, Any]) -> None:
    """Creates the components SPPMon cannot run without; to be called within the init.

    Be aware that not everything may be initialized at call time.
    Configuration whose absence should abort the system belongs here.

    Arguments:
        config_file {Dict[str, Any]} -- Opened Config file
    """
    if not config_file:
        ExceptionUtils.error_message("missing or empty config file, aborting")
        self.exit(error_code=ERROR_CODE_START_ERROR)

    try:
        # critical components only
        self.influx_client = InfluxClient(config_file)

        # when the setup is ignored the connect is left to the testing phase
        if not self.ignore_setup:
            self.influx_client.connect()
    except ValueError as setup_error:
        ExceptionUtils.exception_info(
            error=setup_error,
            extra_message="error while setting up critical config. Aborting")
        # assign None, otherwise the attribute would stay undeclared
        self.influx_client = None
        self.exit(error_code=ERROR_CODE)
def __job_logs_to_stats(self, list_with_logs: List[Dict[str, Any]]) -> None:
    """Parses supported joblogs into their own statistic tables.

    Only logs whose `messageId` is registered in `supported_ids` are processed;
    add further entries there to parse more jobLogs. Every entry supplies the
    target table name (the messageId is used if that name is empty) and a
    function which builds the row out of the `messageParams` of the log.

    Arguments:
        list_with_logs {List[Dict[str, Any]]} -- List with all saved joblogs
    """
    # keep only the joblogs we want to save, ordered by their log time
    relevant_logs = [
        log for log in list_with_logs
        if log['messageId'] in self.__supported_ids.keys()
    ]
    relevant_logs.sort(key=lambda entry: entry['logTime'])

    # highest second-precision timestamp handed out so far, needed to prevent duplicates
    max_sec_timestamp = 0

    for job_log in relevant_logs:
        message_id = job_log['messageId']

        table_name, row_dict_func = self.__supported_ids[message_id]
        # no explicit table declared: the message id doubles as table name
        table_name = table_name or message_id

        try:
            row_dict = row_dict_func(job_log['messageParams'])
        except KeyError as error:
            ExceptionUtils.exception_info(
                error, extra_message="MessageID params wrong defined. Skipping one MessageId")
            continue

        row_dict['messageId'] = message_id

        # Issue 9: if all tag values including the timestamp duplicate another record,
        # Influx throws the insert out as a duplicate. Reducing epoch timestamps from
        # millisecond to second precision can cause such duplicates. For certain tables
        # seconds are therefore added to the timestamp until it is unique. Only use this
        # where some inaccuracy of the timestamp is acceptable.
        cur_timestamp = job_log['logTime']
        if table_name == 'vmBackupSummary':

            if cur_timestamp is None:
                ExceptionUtils.error_message(
                    f"Warning: logTime is None, duplicate may be purged. Log: {job_log}")

            # make sure it is an int
            if isinstance(cur_timestamp, str):
                cur_timestamp = int(cur_timestamp)

            cur_sec_timestamp = SppUtils.to_epoch_secs(cur_timestamp)
            if cur_sec_timestamp <= max_sec_timestamp:
                # factor between the original precision and seconds
                digits = int(cur_timestamp / cur_sec_timestamp)
                max_sec_timestamp += 1  # move on by one second
                cur_timestamp = max_sec_timestamp * digits
            else:
                max_sec_timestamp = cur_sec_timestamp

        row_dict['time'] = cur_timestamp

        # textual null markers are stored as real None values
        null_keys = [
            key for (key, item) in row_dict.items()
            if item in ('null', 'null(null)')
        ]
        for key in null_keys:
            row_dict[key] = None

        self.__influx_client.insert_dicts_to_buffer(table_name, [row_dict])
def set_critial_configs(self, config_file: Dict[str, Any]) -> None:
    """Creates the components SPPMon cannot run without; to be called within the init.

    Be aware that not everything may be initialized at call time.
    Configuration whose absence should abort the system belongs here.

    Arguments:
        config_file {Dict[str, Any]} -- Opened Config file
    """
    if not config_file:
        ExceptionUtils.error_message("missing or empty config file, aborting")
        self.exit(error_code=ERROR_CODE_CMD_LINE)

    try:
        # critical components only
        influx_settings = SppUtils.get_cfg_params(
            param_dict=config_file, param_name="influxDB")
        if not isinstance(influx_settings, dict):
            raise ValueError("influx config need to be dict")

        self.influx_client = InfluxClient(auth_influx=influx_settings)
        self.influx_client.connect()
    except ValueError as setup_error:
        ExceptionUtils.exception_info(
            error=setup_error,
            extra_message="error while setting up critical config. Aborting")
        # no usable client available, mark it as such
        self.influx_client = None
        self.exit(error_code=ERROR_CODE)
def filter_values_dict(
        cls,
        result_list: List[Dict[str, Any]],
        white_list: List[str] = None,
        ignore_list: List[str] = None) -> List[Dict[str, Any]]:
    """Builds a filtered copy of every dict within a list of dicts.

    With a white_list only the listed values are picked (looked up via
    `SppUtils.get_nested_kv`). With an ignore_list the result of
    `get_with_sub_values` for that ignore_list is merged in.
    Both may be combined. If neither is given, every dict ends up empty.

    Args:
        result_list (List[Dict[str, Any]]): items to be filtered
        white_list (List[str], optional): items to be kept. Defaults to None.
        ignore_list (List[str], optional): items to be removed. Defaults to None.

    Raises:
        ValueError: no result list specified

    Returns:
        List[Dict[str, Any]]: list of filtered dicts
    """
    if result_list is None:
        raise ValueError("need valuelist to filter values")

    filtered_list: List[Dict[str, Any]] = []

    for result in result_list:
        filtered: Dict[str, Any] = {}

        # pick the explicitly wanted items
        if white_list:
            for wanted_key in white_list:
                found_key, found_value = SppUtils.get_nested_kv(
                    key_name=wanted_key, nested_dict=result)

                # the returned key is already taken: store under the white-list key instead
                target_key = wanted_key if found_key in filtered else found_key
                filtered[target_key] = found_value

            # a differing size hints at a missing or misspelled key
            if len(filtered) != len(white_list):
                ExceptionUtils.error_message(
                    f"Result has not same lenght as whitelist, probably typing error: {result_list}"
                )

        # take over everything except the unwanted items
        if ignore_list is not None:
            filtered.update(
                cls.get_with_sub_values(mydict=result, ignore_list=ignore_list))

        filtered_list.append(filtered)

    return filtered_list
def exit(self, error_code: int = SUCCESS_CODE) -> NoReturn:
    """Runs the finishing tasks and terminates sppmon. To be called every time.

    Pass an error code only if something went wrong; use the codes declared
    at the top of the module. Does NOT return.

    Keyword Arguments:
        error_code {int} -- Errorcode if an error occurred. (default: {SUCCESS_CODE})
    """
    # Command line errors: print the usage and leave without storing any runtime.
    if error_code == ERROR_CODE_CMD_ARGS:
        parser.print_help()
        sys.exit(ERROR_CODE_CMD_ARGS)

    # Start-up errors: leave right away as well.
    if error_code == ERROR_CODE_START_ERROR:
        ExceptionUtils.error_message(
            "Error when starting SPPMon. Please review the errors above")
        sys.exit(ERROR_CODE_START_ERROR)

    end_time = SppUtils.get_actual_time_sec()
    LOGGER.debug("Script end time: %d", end_time)

    try:
        if not self.ignore_setup:
            self.store_script_metrics()

            if self.influx_client:
                self.influx_client.disconnect()
            if self.rest_client:
                self.rest_client.logout()
    except ValueError as error:
        ExceptionUtils.exception_info(
            error=error,
            extra_message="Error occured while exiting sppmon")
        error_code = ERROR_CODE

    self.remove_pid_file()

    # Both halves of the condition mean the same; they are kept apart to allow
    # a later distinction between error cases.
    if error_code == ERROR_CODE or error_code:
        ExceptionUtils.error_message("Error occured while executing sppmon")
    elif not self.ignore_setup:
        LOGGER.info("\n\n!!! script completed !!!\n")

    pid = os.getpid()
    print(
        f"check log for details: grep \"PID {pid}\" {self.log_path} > sppmon.log.{pid}"
    )
    sys.exit(error_code)
def exit(self, error_code: int = False) -> NoReturn:
    """Runs the finishing tasks and terminates sppmon. To be called every time.

    Pass an error code only if something went wrong; use the codes declared
    at the top of the module. Does NOT return.

    Keyword Arguments:
        error_code {int} -- Errorcode if an error occurred. (default: {False})
    """
    # Command line errors: restart the script with `--help`, no runtime is stored.
    if error_code == ERROR_CODE_CMD_LINE:
        help_call = [sys.argv[0], "--help"]
        # execv replaces the running process, the exit below is only a safeguard
        os.execv(sys.executable, ['python'] + help_call)
        sys.exit(ERROR_CODE_CMD_LINE)

    end_time = SppUtils.get_actual_time_sec()
    LOGGER.debug("Script end time: %d", end_time)

    try:
        if not self.ignore_setup:
            self.store_script_metrics()

            if self.influx_client:
                self.influx_client.disconnect()
            if self.rest_client:
                self.rest_client.logout()
    except ValueError as error:
        ExceptionUtils.exception_info(
            error=error,
            extra_message="Error occured while exiting sppmon")
        error_code = ERROR_CODE

    if not error_code:
        LOGGER.info("\n\n!!! script completed !!!\n")

    self.remove_pid_file()

    # Both halves of the condition mean the same, kept for clarification.
    if error_code == ERROR_CODE or error_code:
        ExceptionUtils.error_message("Error occured while executing sppmon")

    pid = os.getpid()
    print(
        f"check log for details: grep \"PID {pid}\" {self.log_path} > sppmon.log.{pid}"
    )
    sys.exit(error_code)
def query_something(
        cls,
        name: str,
        source_func: Callable[[], List[Dict[str, Any]]],
        rename_tuples: List[Tuple[str, str]] = None,
        deactivate_verbose: bool = False) -> List[Dict[str, Any]]:
    """Generic function to query data via `source_func` and rename keys of the result.

    The elements are renamed in place. Use deactivate_verbose to suppress the
    result-printing of this function if you want to post-process and print
    the result yourself.

    Arguments:
        name {str} -- Name of the item you want to query, used for logging.
        source_func {Function} -- Function which returns a list of dicts with the elems wanted.

    Keyword Arguments:
        rename_tuples {list} -- List of Tuples if you want to rename Keys. (old_name, new_name) (default: {None})
        deactivate_verbose {bool} -- deactivates result-prints within the function. (default: {False})

    Raises:
        ValueError: No name is provided
        ValueError: No Function is provided or not a function
        KeyError: An element lacks an `old_name` that should be renamed

    Returns:
        list -- List of dicts with the results, an empty list if `source_func` returned None.
    """
    # None checks
    if rename_tuples is None:
        rename_tuples = []
    if not name:
        raise ValueError("need name to query something")
    # reject non-callables here as well, as promised by the documented contract
    if not source_func or not callable(source_func):
        raise ValueError("need a source function to query data")

    LOGGER.info("> getting %s", name)

    elem_list = source_func()
    if elem_list is None:
        # normalize, so the rename loop and all callers can always iterate the result
        elem_list = []
    if not elem_list:
        ExceptionUtils.error_message(f">> No {name} are found")

    if rename_tuples:
        for elem in elem_list:
            # rename fields to make them more informative
            for (old_name, new_name) in rename_tuples:
                elem[new_name] = elem.pop(old_name)

    if cls.verbose and not deactivate_verbose:
        MethodUtils.my_print(elem_list)

    return elem_list
def site_name_by_id(self, site_id: Union[int, str]) -> Optional[str]:
    """Returns a site_name by an associated site_id.

    Uses an already buffered result if possible, otherwise queries the influxdb
    for the name and buffers it for later calls.

    Arguments:
        site_id {Union[int, str]} -- id of the site, strings need to consist of digits only

    Returns:
        Optional[str] -- name of the site, None if not found or if the id is unusable.
    """
    if site_id is None:
        ExceptionUtils.error_message("siteId is none, returning None")
        return None

    # if string, parse to int
    if isinstance(site_id, str):
        site_id = site_id.strip(" ")
        # fullmatch: the whole string has to be numeric. A prefix match would accept
        # values like "12abc" and crash within the int conversion below.
        if re.fullmatch(r"\d+", site_id):
            site_id = int(site_id)
        else:
            ExceptionUtils.error_message(
                "siteId is of unsupported string format")
            return None

    # if still not int, error
    if not isinstance(site_id, int):
        ExceptionUtils.error_message("site id is of unsupported type")
        return None

    # return if already saved -> previous call or `sites`-call
    result = self.__site_name_dict.get(site_id, None)
    if result is not None:  # empty str allowed
        return result

    table_name = 'sites'
    table = self.__influx_client.database[table_name]
    query = SelectionQuery(
        keyword=Keyword.SELECT,
        tables=[table],
        # description, throttleRates cause we need a field to query
        fields=["siteId", "siteName", "description", "throttleRates"],
        where_str=f"siteId = \'{site_id}\'",
        order_direction="DESC",
        limit=1)
    result_set = self.__influx_client.send_selection_query(query)  # type: ignore
    result_dict: Dict[str, Any] = next(result_set.get_points(), None)  # type: ignore
    if not result_dict:
        ExceptionUtils.error_message(
            f"no site with the id {site_id} exists")
        return None

    # save result and return it
    result = result_dict['siteName']
    self.__site_name_dict[site_id] = result
    return result
def setup_args(self) -> None:
    """Transforms the command line options into the individual feature switches.

    Group options are incremental: `all` implies `daily`, `daily` implies
    `hourly` and `hourly` implies `constant`. Every feature switch is set if
    either its own option or the group it belongs to is set.
    """
    # Temporary features / deprecated options
    if OPTIONS.minimumLogs:
        # the message names the flag as spelled by the option (`minimumLogs`)
        ExceptionUtils.error_message(
            "DEPRICATED: using depricated argument '--minimumLogs'. Switch to '--loadedSystem'."
        )

    # incremental setup, higher executes all below
    all_args: bool = OPTIONS.all
    daily: bool = OPTIONS.daily or all_args
    hourly: bool = OPTIONS.hourly or daily
    constant: bool = OPTIONS.constant or hourly

    # ######## All Methods #################
    self.sites: bool = OPTIONS.sites or all_args

    # ######## Daily Methods ###############
    self.vms: bool = OPTIONS.vms or daily
    self.job_logs: bool = OPTIONS.jobLogs or daily
    self.sla_stats: bool = OPTIONS.slaStats or daily
    self.vm_stats: bool = OPTIONS.vmStats or daily

    # ######## Hourly Methods ##############
    self.jobs: bool = OPTIONS.jobs or hourly
    self.vadps: bool = OPTIONS.vadps or hourly
    self.storages: bool = OPTIONS.storages or hourly
    # ssh vsnap pools ?

    # ######## Constant Methods ############
    self.ssh: bool = OPTIONS.ssh or constant
    self.process_stats: bool = OPTIONS.processStats or constant
    self.cpu: bool = OPTIONS.cpu or constant
    self.spp_catalog: bool = OPTIONS.sppcatalog or constant
def get_auto_datatype(value: Any) -> Datatype:
    """Determines the Datatype enum of a value by analysing its type.

    Usage should be avoided, only use it if no datatype is declared.
    The enum members are tried in declaration order and the first one whose
    value matches via `isinstance` wins; `TIMESTAMP` is never chosen.
    The original note warns that this fails if ints are mixed with floats.
    If no type is detected a warning is emitted and `NONE` is returned.

    Arguments:
        value {Union[str, float, int, bool, None]} -- Value to be analyzed

    Returns:
        Datatype -- type of value or `NONE`.
    """
    detected = next(
        (datatype for datatype in Datatype
         if datatype is not Datatype.TIMESTAMP
         and isinstance(value, datatype.value)),
        None)
    if detected is not None:
        return detected

    ExceptionUtils.error_message(f"No auto type found for {value}")
    return Datatype.NONE
def __init__(self):
    """Sets up logging and the pid file, reads the config file and applies all configurations.

    Exits via `self.exit` if another instance is running, if no config file
    is given or if the config file cannot be read.
    """
    self.log_path: str = ""
    """path to logger, set in set_logger."""
    self.pid_file_path: str = ""
    """path to pid_file, set in check_pid_file."""

    self.set_logger()

    LOGGER.info("Starting SPPMon")
    if not self.check_pid_file():
        ExceptionUtils.error_message(
            "Another instance of sppmon with the same args is running")
        self.exit(ERROR_CODE_START_ERROR)

    time_stamp_name, time_stamp = SppUtils.get_capture_timestamp_sec()
    self.start_counter = time.perf_counter()

    # header of the debug log for this run
    LOGGER.debug("\n\n")
    LOGGER.debug(f"running script version: {VERSION}")
    LOGGER.debug(f"cmdline options: {ARGS}")
    LOGGER.debug(f"{time_stamp_name}: {time_stamp}")
    LOGGER.debug("")

    if not ARGS.configFile:
        ExceptionUtils.error_message("missing config file, aborting")
        self.exit(error_code=ERROR_CODE_CMD_ARGS)

    try:
        self.config_file = SppUtils.read_conf_file(
            config_file_path=ARGS.configFile)
    except ValueError as read_error:
        ExceptionUtils.exception_info(
            error=read_error,
            extra_message="Error when trying to read Config file, unable to read")
        self.exit(error_code=ERROR_CODE_START_ERROR)

    LOGGER.info("Setting up configurations")
    self.setup_args()
    self.set_critial_configs(self.config_file)
    self.set_optional_configs(self.config_file)
def login(self) -> None:
    """Logs in into the REST-API. Call this before using any methods.

    Sets up the server URL, posts the basic-auth credentials to the session
    endpoint and stores the received session id, which is finally registered
    as `X-Endeavour-Sessionid` header.

    Raises:
        ValueError: The login request failed.
        ValueError: The response did not contain a session id.
    """
    credentials: HTTPBasicAuth = HTTPBasicAuth(self.__username, self.__password)

    self.__srv_url = f"https://{self.__srv_address}:{self.__srv_port}"
    session_url = self.get_url("/api/endeavour/session")

    LOGGER.debug(f"login to SPP REST API server: {self.__srv_url}")
    if self.__verbose:
        LOGGER.info(f"login to SPP REST API server: {self.__srv_url}")

    try:
        response_json, _ = self.query_url(
            url=session_url,
            auth=credentials,
            request_type=RequestType.POST)
    except ValueError as error:
        ExceptionUtils.exception_info(error=error)
        ExceptionUtils.error_message(
            "Please make sure your Hostadress, port, username and password for REST-API (not SSH) login is correct."
            + "\nYou may test this by logging in into the SPP-Website with the used credentials.")
        raise ValueError(f"REST API login request not successfull.")

    # a response without session id counts as failed login
    try:
        self.__sessionid: str = response_json["sessionid"]
    except KeyError as error:
        ExceptionUtils.exception_info(error)
        raise ValueError("Login into SPP failed: No Session-ID received")

    version, build = self.get_spp_version_build()

    LOGGER.debug(f"SPP-Version: {version}, build {build}")
    LOGGER.debug(f"REST API Session ID: {self.__sessionid}")
    if self.__verbose:
        LOGGER.info(f"REST API Session ID: {self.__sessionid}")
        LOGGER.info(f"SPP-Version: {version}, build {build}")

    self.__headers['X-Endeavour-Sessionid'] = self.__sessionid
def setup_db(self, database_name: str) -> None:
    """Creates the database if it is missing and grants the Grafana reader read access.

    Nothing happens if a database of that name is already listed. After
    creating the database the read privilege is only granted if the user
    `grafanaReader_name` exists, otherwise a warning is emitted.

    Arguments:
        database_name {str} -- name of the database to be set up

    Raises:
        ValueError: The client is not connected.
        ValueError: Any step of the setup failed.
    """
    if not self.__client:
        raise ValueError(
            "Tried to setup DB while client wasn't connected.")

    try:
        # an already existing database needs no further setup
        db_list: List[Dict[str, str]] = self.__client.get_list_database()
        known_db_names = (entry["name"] for entry in db_list)
        if database_name in known_db_names:
            LOGGER.debug(f"SetupDB: DB {database_name} already exits")
            return

        self.__client.create_database(database_name)
        LOGGER.info(f"> Created database {database_name}")

        # privileges can only be granted to an existing user
        user_list: List[Dict[str, str]] = self.__client.get_list_users()
        known_user_names = (entry["user"] for entry in user_list)
        if self.grafanaReader_name not in known_user_names:
            LOGGER.debug("SetupDB: Grafana User does not exits")
            ExceptionUtils.error_message(
                f"WARNING: User '{self.grafanaReader_name}' does not exist"
            )
            return

        self.__client.grant_privilege(
            "read", database_name, self.grafanaReader_name)
        LOGGER.info(
            f"> Granted read privileges for user {self.grafanaReader_name} on db {database_name}"
        )

    except (ValueError, InfluxDBClientError, InfluxDBServerError,
            requests.exceptions.ConnectionError) as error:  # type: ignore
        ExceptionUtils.exception_info(error=error)  # type: ignore
        raise ValueError(
            f"Setup of the new database failed. Maybe the connection failed or the user '{self.__user}' has no admin privileges."
        )
def __init__(self):
    """Sets up logging and the pid file, reads the config file and applies all configurations.

    Exits via `self.exit` if another instance is running, if no config file
    is given or if the config file cannot be read.
    """
    self.log_path: str = ""
    """path to logger, set in set_logger."""
    self.pid_file_path: str = ""
    """path to pid_file, set in check_pid_file."""
    # kept as string because of units like days
    self.job_log_retention_time = "60d"
    """Configured spp log rentation time, logs get deleted after this time."""

    self.set_logger()

    if not self.check_pid_file():
        ExceptionUtils.error_message(
            "Another instance of sppmon with the same args is running")
        self.exit(ERROR_CODE_CMD_LINE)

    # everything is passed as option, a positional arg is most likely a typo
    if len(ARGS) > 0:
        ExceptionUtils.error_message(
            f"CAREFUL: ARG DETECTED, probably typing in programm call: {ARGS}"
        )

    time_stamp_name, time_stamp = SppUtils.get_capture_timestamp_sec()
    self.start_counter = time.perf_counter()

    # header of the debug log for this run
    LOGGER.debug("\n\n")
    LOGGER.debug(f"running script version: {VERSION}")
    LOGGER.debug(f"cmdline options: {OPTIONS}")
    LOGGER.debug(f"{time_stamp_name}: {time_stamp}")
    LOGGER.debug("")

    if not OPTIONS.confFileJSON:
        ExceptionUtils.error_message("missing config file, aborting")
        self.exit(error_code=ERROR_CODE_CMD_LINE)

    try:
        config_file = SppUtils.read_conf_file(
            config_file_path=OPTIONS.confFileJSON)
    except ValueError as read_error:
        ExceptionUtils.exception_info(
            error=read_error,
            extra_message="Syntax Error in Config file, unable to read")
        self.exit(error_code=ERROR_CODE_CMD_LINE)

    self.setup_args()
    self.set_critial_configs(config_file)
    self.set_optional_configs(config_file)
def job_logs(self) -> None:
    """Saves all jobLogs for the jobsessions stored in the influx `jobs` table.

    Make sure to call `get_all_jobs` before to acquire all jobsessions.
    Jobsessions without stored logs are selected from influx, their logs are
    requested from the REST-API and inserted into the `jobLogs` table.
    Afterwards the selected jobsessions are deleted and the processed ones are
    re-inserted, marked as `jobsLogsStored`.
    Certain jobLogs are additionally parsed into their own statistic tables,
    defined by `supported_ids`.
    """
    # total count of requested logs
    logs_requested_total = 0
    # total count of logs successfully handed to the statistic parsing;
    # lower than the requested total if the parsing of a jobsession failed
    logs_to_stats_total = 0

    # list to be inserted after everything is updated
    job_update_list: List[Dict[str, Any]] = []

    LOGGER.info("> Requesting jobs with missing logs from influx database")

    table = self.__influx_client.database['jobs']
    # only store if there is something to store -> limited by job log retention time.
    where_str = 'jobsLogsStored <> \'True\' and time > now() - %s' % self.__job_log_retention_time
    where_str += f' AND time > now() - {table.retention_policy.duration}'

    # Select all jobs without joblogs
    keyword = Keyword.SELECT
    query = SelectionQuery(
        keyword=keyword,
        tables=[table],
        fields=['*'],
        where_str=where_str)

    # send query and compute
    missing_logs_jobs_rs = self.__influx_client.send_selection_query(  # type: ignore
        query)

    # this list contains all jobs which are missing their logs
    # Cast from resultset into list
    missing_logs_jobs: List[Dict[str, Any]] = list(
        missing_logs_jobs_rs.get_points())  # type: ignore

    LOGGER.info(
        f">>> Number of jobs with no joblogs stored in Influx database: {len(missing_logs_jobs)}"
    )

    LOGGER.info("> Requesting missing jobLogs from REST-API.")
    # request all jobLogs from REST-API
    # counter only for displaying purposes
    for counter, row in enumerate(missing_logs_jobs, 0):

        # Only print every 5 rows if not verbose
        # starts at 0, therefore already updated
        if (self.__verbose or counter % 5 == 0):
            LOGGER.info(
                f">>> computed joblogs for {counter} / {len(missing_logs_jobs)} job sessions."
            )

        job_session_id: Optional[int] = row.get('id', None)

        # if somehow the jobSessionId is missing: skip
        # Should usually not happen
        if (job_session_id is None):
            ExceptionUtils.error_message(
                f"Error: jobSessionId missing for row {row}")
            continue

        if (self.__verbose):
            LOGGER.info(
                f">>> Requesting jobLogs {self.__job_log_types} for session {job_session_id}."
            )
        LOGGER.debug(
            f">>> Requesting jobLogs {self.__job_log_types} for session {job_session_id}."
        )

        try:
            # cant use `query_something` like in other places due the extra params:
            # api_queries - query_something only works with no params

            # This list contains all joblogs for a single job-execution
            current_job_logs = self.__api_queries.get_job_log_details(
                jobsession_id=job_session_id,
                job_logs_types=self.__job_log_types,
                request_ids=list(self.__supported_ids.keys()))
        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message=f"Error when api-requesting joblogs for job_session_id {job_session_id}, skipping it"
            )
            continue

        job_log_count = len(current_job_logs)
        logs_requested_total += job_log_count

        if (self.__verbose):
            LOGGER.info(
                f">>> Found {job_log_count} logs for jobsessionId {job_session_id}"
            )
        LOGGER.debug(
            f"Found {job_log_count} logs for jobsessionId {job_session_id}"
        )

        # ####################################################################################
        # Compute results and save logs
        # ####################################################################################
        # The request of REST-API Logs is finished here
        # To not crash by saving 100.000+ Logs, directly compute results and insert them
        # ####################################################################################

        for job_log in current_job_logs:
            # add additional information from job-session itself
            job_log["jobId"] = row.get("jobId", None)
            job_log["jobName"] = row.get("jobName", None)
            job_log["jobExecutionTime"] = row.get("start", None)

            # rename for clarity
            job_log["jobLogId"] = job_log.pop("id", None)
            job_log["jobSessionId"] = job_log.pop("jobsessionId", None)

        # ##########################################################
        # compute jobLog-Stats into each associated table
        # ##########################################################
        try:
            self.__job_logs_to_stats(current_job_logs)
        except ValueError as error:
            ExceptionUtils.exception_info(
                error,
                extra_message=f"Failed parse jobLogs into its own table, skipping for jobsessionId {job_session_id}"
            )
        else:
            # count only logs whose parsing went through, otherwise the
            # comparison of both totals at the end could never differ
            logs_to_stats_total += job_log_count

        # ##########################################################
        # save logs within the joblog-dump
        # ##########################################################

        # Only dump them after computing stats since they are read within the computing stats part
        for job_log in current_job_logs:
            # dump message params to allow saving as string
            job_log["messageParams"] = json.dumps(job_log["messageParams"])

        # if list is empty due being erased etc it will simply return and do nothing
        self.__influx_client.insert_dicts_to_buffer(
            list_with_dicts=current_job_logs, table_name="jobLogs")

        # shallow copy dict to allow an update without errors
        copied_jobsession = dict(row.items())

        # update job table and set jobsLogsStored = True, jobLogsCount = len(jobLogDetails)
        copied_jobsession.update({
            "jobLogsCount": job_log_count,
            "jobsLogsStored": True
        })
        job_update_list.append(copied_jobsession)

        # ##########################################################
        # End of For-Each
        # ##########################################################

    # ##########################################################
    # Delete each job, then re-insert
    # ##########################################################

    # Delete all jobs which got requested, no matter if failed
    # NOTE(review): jobsessions skipped above (missing id / failed REST request) match
    # this where-clause too but are not part of job_update_list -- confirm that
    # dropping them here is intended.
    delete_query = SelectionQuery(
        keyword=Keyword.DELETE,
        tables=[table],
        where_str=where_str)

    # now send remove query to prevent data loss
    self.__influx_client.send_selection_query(delete_query)  # type: ignore

    # Insert data after everything is completed
    self.__influx_client.insert_dicts_to_buffer(table.name, job_update_list)

    if (logs_requested_total != logs_to_stats_total):
        LOGGER.info(
            f"> Requested a total of {logs_requested_total} but only computed {logs_to_stats_total} into sppmon statistics"
        )
    else:
        LOGGER.info(
            f">>> requested and computed a total of {logs_requested_total} logs"
        )

    LOGGER.info(f">> Updated a total of {len(job_update_list)} jobs")
def __add_predef_table( cls, name: str, fields: Dict[str, Datatype], tags: List[str], time_key: Optional[str] = None, retention_policy: RetentionPolicy = None, continuous_queries: List[Union[ContinuousQuery, Callable[[Table, str], ContinuousQuery]]] = None ) -> None: """Declares a new predefined table. Recommended to to with every table you may want to insert into the influxdb. It is recommended to declare each param by name. If you do not declare the time_key, it will use sppmon capture time. Declare Retention Policy by ClassMethods declared above. Blank for `autogen`-RP (not recommended). Declare Continuous queries by using either the cq_template or creating your own. Be aware it is impossible to use `database["tablename"] to gain a instance of a table, this table is not defined yet. Arguments: name {str} -- Name of the table/measurement fields {Dict[str, Datatype]} -- fields of the table. At least one entry, name as key, dataype as value. tags {List[str]} -- tags of the table. Always of datatype string Keyword Arguments: time_key {Optional[str]} -- Name of key used as timestamp. Blank if capturetime (default: {None}) retention_policy {RetentionPolicy} -- Retention policy to be associated (default: {None}) continuous_queries {List[Union[ContinuousQuery, Callable[[Table, str], ContinuousQuery]]]} -- List of either a CQ or a template which is transformed within this method (default: {None}) """ # create a retention instance out of the constructor methods if (not retention_policy): retention_policy = cls._RP_AUTOGEN() # add to save used policies cls.__database.retention_policies.add(retention_policy) # switch needed to allow table default value to be used. 
# avoids redudant default declaration if (time_key): table = Table(database=cls.__database, name=name, fields=fields, tags=tags, time_key=time_key, retention_policy=retention_policy) else: table = Table(database=cls.__database, name=name, fields=fields, tags=tags, retention_policy=retention_policy) cls.__database.tables[name] = table # save CQ if (continuous_queries): i = 0 for continuous_query in continuous_queries: if (not isinstance(continuous_query, ContinuousQuery)): continuous_query = continuous_query( table, f"cq_{table.name}_{i}") i += 1 cls.__database.continuous_queries.add(continuous_query) # make sure the args exist if (continuous_query.select_query and continuous_query.select_query.into_table): cls.__database.retention_policies.add( continuous_query.select_query.into_table. retention_policy) else: # regex parsing? ExceptionUtils.error_message( "Probably a programming error, report to DEV's. " + f"Missing retention policy for CQ {continuous_query.name}." )
def test_connection(influx_client: InfluxClient, rest_client: RestClient,
                    config_file: Dict[str, Any]):
    """Tests the InfluxDB, the REST-API and all SSH-clients and logs a final verdict.

    Failures of the InfluxDB or REST-API test mark the whole run as not working.
    SSH issues and disabled SSL only downgrade the result to "partially sucessful".
    Nothing is returned, all results are reported via LOGGER / ExceptionUtils.

    Arguments:
        influx_client {InfluxClient} -- client to be connected and disconnected for the test
        rest_client {RestClient} -- client used for a test login, version query and logout
        config_file {Dict[str, Any]} -- opened config file, used to set up the SSH-clients

    Raises:
        ValueError: No or an empty config file is given.
    """
    if not config_file:
        raise ValueError("SPPmon does not work without a config file")

    LOGGER.info("Testing all connections required for SPPMon to work")
    # False once a critical component failed
    working: bool = True
    # False once anything at all was reported
    no_warnings: bool = True

    # ## InfluxDB ##

    LOGGER.info("> Testing and configuring InfluxDB")
    try:
        influx_client.connect()
        influx_client.disconnect()
        if not influx_client.use_ssl:
            ExceptionUtils.error_message(
                "> WARNING: Mandatory SSL is disabled. We hightly recommend to enable it!"
            )
            no_warnings = False

        LOGGER.info("InfluxDB is ready for use")
    except ValueError as error:
        ExceptionUtils.exception_info(
            error,
            extra_message=
            "> Testing of the InfluxDB failed. This is a crictial component of SPPMon."
        )
        working = False

    # ## REST-API ##

    LOGGER.info("> Testing REST-API of SPP.")
    try:
        rest_client.login()
        (version_nr, build_nr) = rest_client.get_spp_version_build()
        LOGGER.info(
            f">> Sucessfully connected to SPP V{version_nr}, build {build_nr}."
        )
        rest_client.logout()
        LOGGER.info("> REST-API is ready for use")
    except ValueError as error:
        ExceptionUtils.exception_info(
            error,
            extra_message=
            "> Testing of the REST-API failed. This is a crictial component of SPPMon."
        )
        working = False

    # ## SSH-CLIENTS ##

    LOGGER.info(
        "> Testing all types of SSH-Clients: Server, VAPDs, vSnaps, Cloudproxy and others"
    )
    # False once the ssh part can not finish without any error
    ssh_working = True

    ssh_clients: List[SshClient] = SshMethods.setup_ssh_clients(
        config_file)
    if ssh_clients:
        # Check per type whether at least one client is registered
        for ssh_type in SshTypes:
            if any(client.client_type == ssh_type
                   for client in ssh_clients):
                continue

            LOGGER.info(f">> No {ssh_type.name} client detected.")

            if ssh_type == SshTypes.SERVER:
                ExceptionUtils.error_message(
                    ">> Critical: Without Server as ssh client you wont have any process statistics available. These are a key part of SPPMon."
                )
                # No error, but still critical
                ssh_working = False

            if ssh_type == SshTypes.VSNAP:
                LOGGER.info(
                    ">> WARNING: Without vSnap as ssh client you have no access to storage information. You may add vSnap's for additional monitoring and alerts."
                )
                # ssh will still work, but this is worth a warning
                no_warnings = False

        ssh_methods: SshMethods = SshMethods(influx_client, config_file,
                                             False)
        # Connection and command check per client
        LOGGER.info(
            f">> Testing now connection and commands of {len(ssh_clients)} registered ssh-clients."
        )
        for client in ssh_clients:
            try:
                client.connect()
                client.disconnect()

                # any error stored while executing means a command is not usable
                errors_before: int = len(ExceptionUtils.stored_errors)
                MethodUtils.ssh_execute_commands(
                    ssh_clients=[client],
                    ssh_type=client.client_type,
                    command_list=ssh_methods.client_commands[
                        client.client_type] + ssh_methods.all_command_list)
                if len(ExceptionUtils.stored_errors) != errors_before:
                    ssh_working = False
                    ExceptionUtils.error_message(
                        f"Not all commands available for client {client.host_name} with type: {client.client_type}.\n"
                        +
                        "Please check manually if the commands are installed and their output."
                    )
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error,
                    extra_message=
                    f"Connection failed for client {client.host_name} with type: {client.client_type}."
                )
                ssh_working = False
    else:
        ExceptionUtils.error_message(
            ">> No SSH-clients detected at all. At least the server itself should be added for process-statistics."
        )
        ssh_working = False

    if ssh_working:
        LOGGER.info("> Testing of SSH-clients sucessfull.")
    else:
        LOGGER.info(
            "> Testing of SSH-clients failed! SPPMon will still work, not all informations are available."
        )
        no_warnings = False

    # #### Conclusion ####

    if not working:
        verdict = "> Testing failed. SPPMon is not ready to be used. Please fix the connection issues."
    elif no_warnings:
        verdict = "> All components tested sucessfully. SPPMon is ready to be used!"
    else:
        verdict = "> Testing partially sucessful. SPPMon will run, but please check the warnings."
    LOGGER.info(verdict)
def main(self):
    """Executes every step selected by the arguments and finally calls `self.exit()`.

    Each collection step runs isolated: it is executed, the insert buffer is flushed and
    any exception is reported via `ExceptionUtils.exception_info`, so a failing step does
    not prevent the following ones. A step is only run if both its flag and the
    associated methods-instance are set.
    """
    LOGGER.info("Starting argument execution")

    if not self.influx_client:
        ExceptionUtils.error_message(
            "somehow no influx client is present even after init")
        self.exit(ERROR_CODE)

    def attempt(failure_text, *steps, flush=True):
        """Calls all steps in order, optionally flushes the buffer, reports any exception."""
        try:
            for step in steps:
                step()
            if flush:
                self.influx_client.flush_insert_buffer()
        except Exception as error:
            ExceptionUtils.exception_info(error=error,
                                          extra_message=failure_text)

    # The steps are wrapped into lambdas on purpose: the attribute lookup happens
    # inside of `attempt`, therefore it is covered by the error handling as well.

    # ##################### SYSTEM METHODS #######################
    if self.sites and self.system_methods:
        attempt("Top-level-error when requesting sites, skipping them all",
                lambda: self.system_methods.sites())

    if self.cpu and self.system_methods:
        attempt(
            "Top-level-error when collecting cpu stats, skipping them all",
            lambda: self.system_methods.cpuram())

    if self.spp_catalog and self.system_methods:
        attempt(
            "Top-level-error when collecting file system stats, skipping them all",
            lambda: self.system_methods.sppcatalog())

    # ####################### JOB METHODS ########################
    if self.jobs and self.job_methods:
        # store all jobs grouped by jobID
        attempt("Top-level-error when requesting jobs, skipping them all",
                lambda: self.job_methods.get_all_jobs())

    if self.job_logs and self.job_methods:
        # store all job logs per job session instance
        attempt(
            "Top-level-error when requesting job logs, skipping them all",
            lambda: self.job_methods.job_logs())

    # ####################### SSH METHODS ########################
    if self.ssh and self.ssh_methods:
        # execute ssh statements for VSNAP, VADP and other ssh hosts
        attempt(
            "Top-level-error when excecuting ssh commands, skipping them all",
            lambda: self.ssh_methods.ssh())

    # ################### HYPERVISOR METHODS #####################
    if self.vms and self.protection_methods:
        attempt(
            "Top-level-error when requesting all VMs, skipping them all",
            lambda: self.protection_methods.store_vms())

    if self.sla_stats and self.protection_methods:
        # number of VMs per SLA and sla dumps
        attempt(
            "Top-level-error when requesting and computing VMs per sla, skipping them all",
            lambda: self.protection_methods.vms_per_sla(),
            lambda: self.protection_methods.sla_dumps())

    if self.vm_stats and self.protection_methods:
        # retrieve and calculate VM inventory summary
        attempt(
            "Top-level-error when creating inventory summary, skipping them all",
            lambda: self.protection_methods.create_inventory_summary())

    if self.vadps and self.protection_methods:
        attempt("Top-level-error when requesting vadps, skipping them all",
                lambda: self.protection_methods.vadps())

    if self.storages and self.protection_methods:
        attempt(
            "Top-level-error when collecting storages, skipping them all",
            lambda: self.protection_methods.storages())

    # ###################### OTHER METHODS #######################
    if ARGS.copy_database:
        attempt("Top-level-error when coping database.",
                lambda: self.influx_client.copy_database(ARGS.copy_database),
                flush=False)

    # ################### NON-SETUP-METHODS #######################
    if ARGS.test:
        attempt("Top-level-error when testing connection.",
                lambda: TestingMethods.test_connection(
                    self.config_file, self.influx_client, self.rest_client),
                flush=False)

    # DEPRECATED TODO REMOVE NEXT VERSION
    if ARGS.create_dashboard:
        try:
            ExceptionUtils.error_message(
                "This method is deprecated. You do not need to manually create a dashboard anymore.\n"
                +
                "Please just select the datasource when importing the regular 14-day dashboard in grafana.\n"
                +
                "Devs may adjust their dashboard to be generic with the scripts/generifyDashboard.py script."
            )
        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message="Top-level-error when creating dashboard")

    self.exit()
def split_by_table_def(
        self, mydict: Dict[str, Any]
) -> Tuple[Dict[str, Any], Dict[str, Any], Union[str, int, None]]:
    """Split the given dict into a pre-defined set of tags, fields and a timestamp.

    None-Values and empty strings are ignored.
    If the table has no fields declared, `InfluxUtils.default_split` is used instead.
    Undeclared columns are reported via `ExceptionUtils.error_message` and saved as field.
    This function uses the tag/field and timestamp definition declared within this table.

    Timestamp priority: the value of the tables `time_key` always wins and is never
    overwritten. Any other key listed in `InfluxUtils.time_key_names` is only used
    as long as the `time_key` itself was not found yet.

    Arguments:
        self {Table} -- Table with predefined set of tags and fields
        mydict {Dict[str, Any]} -- dict with columns as keys. None-Values are ignored

    Raises:
        ValueError: If no dict or an empty dict is given.

    Returns:
        (Dict[str, Any], Dict[str, Any], Union[str, int, None]) -- Tuple of: tags, fields, timestamp
    """
    if not mydict:
        raise ValueError("need at least one value in dict to split")

    # if table is not defined use default split
    if not self.fields:
        return InfluxUtils.default_split(mydict=mydict)

    # fill dicts, every declared column starts as None
    # table.fields is a dict, we only need the keys
    fields: Dict[str, Any] = dict.fromkeys(self.fields.keys(), None)
    tags: Dict[str, Any] = dict.fromkeys(self.tags, None)

    # name of the column which should be recorded as time
    time_key = self.time_key
    # turns False once the value of the declared time_key is saved
    time_overwrite_allowed = True
    # actual timestamp saved
    time_stamp: Union[str, int, None] = None

    for (key, value) in mydict.items():

        # Ignore empty entries
        if value is None or (isinstance(value, str) and not value):
            continue

        # The declared time key has the highest priority. Do not overwrite it.
        # Compare by value: `is` only matches if both strings happen to be the same
        # object and `in` would be a substring test against the name of the time key.
        if key == time_key:
            time_stamp = value
            time_overwrite_allowed = False

        # Any other well-known time column
        elif key in InfluxUtils.time_key_names:

            # sppmonCTS has lowest priority, only set if otherwise None
            if time_stamp is None and key == SppUtils.capture_time_key:
                time_stamp = value

            # if the declared time key is not used yet, overwrite sppmonCaptureTime or others
            elif time_overwrite_allowed:
                time_stamp = value

            # if no overwrite allowed, continue and drop the column
            else:
                continue

        # Sort the column into fields or tags
        if key in fields:
            fields[key] = value
        elif key in tags:
            tags[key] = value
        elif key == time_key or key in InfluxUtils.time_key_names:
            # pure time column, already handled above
            continue
        else:
            ExceptionUtils.error_message(
                f"Not all columns for table {self.name} are declared: {key}"
            )
            # before key+"MISSING" : Removed to avoid death-circle on repeated queries.
            fields[key] = value

    return (tags, fields, time_stamp)
def store_script_metrics(self) -> None:
    """Stores script metrics into influxdb. To be called before exit.

    Saved are: the versions of SPPMon, InfluxDB and (if a rest client is available) SPP,
    the runtime since `self.start_counter` in milliseconds, every truthy argument of
    `ARGS`, the count of stored errors and the stored errors themselves as string.

    ValueErrors are caught and reported, in this case the metrics are skipped.
    """
    LOGGER.info("Storing script metrics")
    try:
        if not self.influx_client:
            raise ValueError("no influxClient set up")
        insert_dict: Dict[str, Union[str, int, float, bool]] = {}

        # add version nr, api calls are needed
        insert_dict["sppmon_version"] = VERSION
        insert_dict["influxdb_version"] = self.influx_client.version
        if self.rest_client:
            try:
                (version_nr,
                 build) = self.rest_client.get_spp_version_build()
                insert_dict["spp_version"] = version_nr
                insert_dict["spp_build"] = build
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message="could not query SPP version and build.")

        # end total sppmon runtime, saved in milliseconds
        end_counter = time.perf_counter()
        insert_dict['duration'] = int(
            (end_counter - self.start_counter) * 1000)

        # add arguments of sppmon
        for (key, value) in vars(ARGS).items():
            # Value is either string, true or false/None
            if value:
                insert_dict[key] = value

        # save occured errors
        error_count = len(ExceptionUtils.stored_errors)
        if error_count > 0:
            ExceptionUtils.error_message(
                f"total of {error_count} exception/s occured")
        insert_dict['errorCount'] = error_count

        # Snapshot taken after the optional summary message above.
        # Everything stored later on can not be part of the saved metrics anymore.
        # A fixed `error_count + 1` would miss a first error if no summary was written.
        errors_before_store = len(ExceptionUtils.stored_errors)

        # save list as str if not empty
        if ExceptionUtils.stored_errors:
            insert_dict['errorMessages'] = str(
                ExceptionUtils.stored_errors)

        # get end timestamp
        (time_key, time_val) = SppUtils.get_capture_timestamp_sec()
        insert_dict[time_key] = time_val

        # save the metrics
        self.influx_client.insert_dicts_to_buffer(
            table_name="sppmon_metrics", list_with_dicts=[insert_dict])
        self.influx_client.flush_insert_buffer()
        LOGGER.info("Stored script metrics sucessfull")

        # any error added since the snapshot occured while storing the metrics
        if len(ExceptionUtils.stored_errors) > errors_before_store:
            ExceptionUtils.error_message(
                "A non-critical error occured while storing script metrics. \n This error can't be saved into the DB, it's only displayed within the logs."
            )
    except ValueError as error:
        ExceptionUtils.exception_info(
            error=error,
            extra_message=
            "Error when storing sppmon-metrics, skipping this step. Possible insert-buffer data loss"
        )
def main(self):
    """Executes every step selected by the options and finally calls `self.exit()`.

    Each step runs isolated: a ValueError raised by it is reported via
    `ExceptionUtils.exception_info` and the next step is started. Collection steps
    flush the insert buffer after they finished. A collection step is only run if
    both its flag and the associated methods-instance are set.
    """
    LOGGER.info("Starting argument execution")

    if not self.influx_client:
        ExceptionUtils.error_message(
            "somehow no influx client is present even after init")
        self.exit(ERROR_CODE)

    def attempt(failure_text, *steps, flush=True):
        """Calls all steps in order, optionally flushes the buffer, reports ValueErrors."""
        try:
            for step in steps:
                step()
            if flush:
                self.influx_client.flush_insert_buffer()
        except ValueError as error:
            ExceptionUtils.exception_info(error=error,
                                          extra_message=failure_text)

    def build_dashboard():
        """Creates the dashboard, requires the dashboard folder path to be set."""
        if not OPTIONS.dashboard_folder_path:
            ExceptionUtils.error_message(
                "Only use --create_dashboard in combination with --dashboard_folder_path=\"PATH/TO/GRAFANA/FOLDER/\""
            )
        else:
            OtherMethods.create_dashboard(
                dashboard_folder_path=OPTIONS.dashboard_folder_path,
                database_name=self.influx_client.database.name)

    # The steps are wrapped into lambdas to keep the attribute lookup inside of `attempt`.

    # ##################### SYSTEM METHODS #######################
    if self.sites and self.system_methods:
        attempt("Top-level-error when requesting sites, skipping them all",
                lambda: self.system_methods.sites())

    if self.cpu and self.system_methods:
        attempt(
            "Top-level-error when collecting cpu stats, skipping them all",
            lambda: self.system_methods.cpuram())

    if self.spp_catalog and self.system_methods:
        attempt(
            "Top-level-error when collecting file system stats, skipping them all",
            lambda: self.system_methods.sppcatalog())

    # ####################### JOB METHODS ########################
    if self.jobs and self.job_methods:
        # store all jobs grouped by jobID
        attempt("Top-level-error when requesting jobs, skipping them all",
                lambda: self.job_methods.get_all_jobs())

    if self.job_logs and self.job_methods:
        # store all job logs per job session instance
        attempt(
            "Top-level-error when requesting job logs, skipping them all",
            lambda: self.job_methods.job_logs())

    # ####################### SSH METHODS ########################
    if self.ssh and self.ssh_methods:
        # execute ssh statements for VSNAP, VADP and other ssh hosts
        attempt(
            "Top-level-error when excecuting ssh commands, skipping them all",
            lambda: self.ssh_methods.ssh())

    # ################### HYPERVISOR METHODS #####################
    if self.vms and self.protection_methods:
        attempt(
            "Top-level-error when requesting all VMs, skipping them all",
            lambda: self.protection_methods.store_vms())

    if self.sla_stats and self.protection_methods:
        # number of VMs per SLA and sla dumps
        attempt(
            "Top-level-error when requesting and computing VMs per sla, skipping them all",
            lambda: self.protection_methods.vms_per_sla(),
            lambda: self.protection_methods.sla_dumps())

    if self.vm_stats and self.protection_methods:
        # retrieve and calculate VM inventory summary
        attempt(
            "Top-level-error when creating inventory summary, skipping them all",
            lambda: self.protection_methods.create_inventory_summary())

    if self.vadps and self.protection_methods:
        attempt("Top-level-error when requesting vadps, skipping them all",
                lambda: self.protection_methods.vadps())

    if self.storages and self.protection_methods:
        attempt(
            "Top-level-error when collecting storages, skipping them all",
            lambda: self.protection_methods.storages())

    # ###################### OTHER METHODS #######################
    if OPTIONS.copy_database:
        attempt(
            "Top-level-error when coping database.",
            lambda: self.influx_client.copy_database(OPTIONS.copy_database),
            flush=False)

    # ################### NON-SETUP-METHODS #######################
    if OPTIONS.test:
        attempt("Top-level-error when testing connection.",
                lambda: OtherMethods.test_connection(
                    self.influx_client, self.rest_client, self.config_file),
                flush=False)

    if OPTIONS.create_dashboard:
        attempt("Top-level-error when creating dashboard",
                build_dashboard,
                flush=False)

    self.exit()
def set_optional_configs(self, config_file: Dict[str, Any]) -> None:
    """Sets up any optional infrastructure, to be called within the init.

    Be aware not everything may be initalized on call time.
    Add config here if the system should not abort if it is missing.

    Sets `rest_client`, `api_queries`, `system_methods`, `job_methods`,
    `protection_methods` and `ssh_methods`. If the setup of one of them fails with a
    ValueError, the error is reported and the attribute is set to None, so every
    attribute is declared after this method and can be checked for truthiness.

    Arguments:
        config_file {Dict[str, Any]} -- Opened Config file
    """

    if not config_file:
        ExceptionUtils.error_message(
            "missing or empty config file, aborting.")
        self.exit(error_code=ERROR_CODE_START_ERROR)
    if not self.influx_client:
        ExceptionUtils.error_message(
            "Influx client is somehow missing. aborting")
        self.exit(error_code=ERROR_CODE)

    # ############################ REST-API #####################################
    try:
        ConnectionUtils.verbose = ARGS.verbose

        # ### Loaded Systems part 1/2 ### #
        if ARGS.minimumLogs or ARGS.loadedSystem:
            # Setting pagesize scaling settings
            ConnectionUtils.timeout_reduction = self.loaded_timeout_reduction
            ConnectionUtils.allowed_send_delta = self.loaded_allowed_send_delta
            ConnectionUtils.max_scaling_factor = self.loaded_max_scaling_factor

            # Setting RestClient request settings.
            self.rest_client = RestClient(
                config_file=config_file,
                initial_connection_timeout=self.initial_connection_timeout,
                pref_send_time=self.loaded_pref_send_time,
                request_timeout=self.loaded_request_timeout,
                max_send_retries=self.loaded_max_send_retries,
                starting_page_size=self.loaded_starting_page_size,
                min_page_size=self.loaded_min_page_size,
                verbose=ARGS.verbose)
        else:
            ConnectionUtils.timeout_reduction = self.timeout_reduction
            ConnectionUtils.allowed_send_delta = self.allowed_send_delta
            ConnectionUtils.max_scaling_factor = self.max_scaling_factor

            # Setting RestClient request settings.
            self.rest_client = RestClient(
                config_file=config_file,
                initial_connection_timeout=self.initial_connection_timeout,
                pref_send_time=self.pref_send_time,
                request_timeout=self.request_timeout,
                max_send_retries=self.max_send_retries,
                starting_page_size=self.starting_page_size,
                min_page_size=self.min_page_size,
                verbose=ARGS.verbose)

        self.api_queries = ApiQueries(self.rest_client)
        if not self.ignore_setup:
            # delay the connect into the testing phase
            self.rest_client.login()

    except ValueError as error:
        ExceptionUtils.exception_info(
            error=error,
            extra_message="REST-API is not available due Config error")
        # Required to declare variable
        self.rest_client = None
        self.api_queries = None

    # ######################## System, Job and Hypervisor Methods ##################
    try:
        # explicit ahead due dependency
        self.system_methods = SystemMethods(self.influx_client,
                                            self.api_queries, ARGS.verbose)
    except ValueError as error:
        ExceptionUtils.exception_info(error=error)
        # Required to declare variable, it is read below and checked by the callers
        self.system_methods = None

    # ### Full Logs ### #
    if ARGS.fullLogs:
        given_log_types = self.full_joblog_types
    else:
        given_log_types = self.joblog_types

    try:
        auth_rest: Dict[str, Any] = SppUtils.get_cfg_params(
            param_dict=config_file, param_name="sppServer")  # type: ignore
        # TODO DEPRECATED TO BE REMOVED IN 1.1
        # the misspelled key has priority as long as it is still supported
        self.job_log_retention_time = auth_rest.get(
            "jobLog_rentation",
            auth_rest.get("jobLog_retention", self.job_log_retention_time))
        # TODO New once 1.1 is live
        #self.job_log_retention_time = auth_rest.get("jobLog_retention", self.job_log_retention_time)

        self.job_methods = JobMethods(self.influx_client, self.api_queries,
                                      self.job_log_retention_time,
                                      given_log_types, ARGS.verbose)
    except ValueError as error:
        ExceptionUtils.exception_info(error=error)
        # Required to declare variable
        self.job_methods = None

    try:
        # dependent on system methods
        self.protection_methods = ProtectionMethods(
            self.system_methods, self.influx_client, self.api_queries,
            ARGS.verbose)
    except ValueError as error:
        ExceptionUtils.exception_info(error=error)
        # Required to declare variable
        self.protection_methods = None

    # ############################### SSH #####################################
    if self.ssh and not self.ignore_setup:
        try:
            # set from None to methods once finished
            self.ssh_methods = SshMethods(influx_client=self.influx_client,
                                          config_file=config_file,
                                          verbose=ARGS.verbose)
        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message=
                "SSH-Commands are not available due Config error")
            # Variable needs to be declared
            self.ssh_methods = None
    else:
        # Variable needs to be declared
        self.ssh_methods = None
def setup_args(self) -> None:
    """This method set up all required parameters and transforms arg groups into individual args.

    Reports the usage of deprecated arguments, sets `self.ignore_setup` if a utility
    operation is requested and derives one boolean attribute per collection method.
    The groups are incremental: `all` includes `daily`, `daily` includes `hourly`
    and `hourly` includes `constant`.

    Calls `self.exit(ERROR_CODE_CMD_ARGS)` if only one of `--create_dashboard` and the
    dashboard folder path is given.
    """

    # ## call functions based on cmdline parameters

    # Temporary features / Deprecated

    if ARGS.minimumLogs:
        ExceptionUtils.error_message(
            "DEPRECATED: using deprecated argument '--minimumLogs'. Use to '--loadedSystem' instead."
        )
    if ARGS.processStats:
        # the message has to name the flag which was actually used
        ExceptionUtils.error_message(
            "DEPRECATED: using deprecated argument '--processStats'. Use to '--ssh' instead."
        )

    # ignore setup args
    dashboard_requested = ARGS.create_dashboard
    dashboard_path_given = bool(ARGS.dashboard_folder_path)
    self.ignore_setup: bool = (dashboard_requested or dashboard_path_given
                               or ARGS.test)
    if self.ignore_setup:
        ExceptionUtils.error_message(
            "> WARNING: An option for a utility operation has been specified. Bypassing normal SPPMON operation."
        )

    # both dashboard arguments are only valid in combination
    if ((dashboard_requested or dashboard_path_given)
            and not (dashboard_requested and dashboard_path_given)):
        ExceptionUtils.error_message(
            "> Using --create_dashboard without associated folder path. Aborting."
        )
        self.exit(ERROR_CODE_CMD_ARGS)

    # incremental setup, higher executes all below
    all_args: bool = ARGS.all
    daily: bool = ARGS.daily or all_args
    hourly: bool = ARGS.hourly or daily
    constant: bool = ARGS.constant or hourly

    # ######## All Methods #################

    self.sites: bool = ARGS.sites or all_args

    # ######## Daily Methods ###############

    self.vms: bool = ARGS.vms or daily
    self.job_logs: bool = ARGS.jobLogs or daily
    self.sla_stats: bool = ARGS.slaStats or daily
    self.vm_stats: bool = ARGS.vmStats or daily

    # ######## Hourly Methods ##############

    self.jobs: bool = ARGS.jobs or hourly
    self.vadps: bool = ARGS.vadps or hourly
    self.storages: bool = ARGS.storages or hourly
    # ssh vsnap pools ?

    # ######## Constant Methods ############

    self.ssh: bool = ARGS.ssh or constant
    self.cpu: bool = ARGS.cpu or constant
    self.spp_catalog: bool = ARGS.sppcatalog or constant
def transfer_data(self, old_database_name: str = None) -> None:
    """Copies the data of an old database into the tables of the current database definition.

    Temporary migration feature. Two kinds of `SELECT ... INTO` queries are computed:
    one per defined table, reading from the `autogen` retention policy of the old database,
    and one per continuous query, whose FROM-clause is rewritten to read from the old
    database and `autogen` as well. The queries are sent one by one over a client
    created with a timeout of 7200, afterwards a client with the old timeout is created.

    Keyword Arguments:
        old_database_name {str} -- Name of the database to read from.
            If blank, the name of the current database is used (default: {None})

    Raises:
        ValueError: The query of a continuous query does not match the expected FROM-pattern.
        ValueError: A query reports `dropped=10000` points beyond the retention policy.
            NOTE(review): in this case the client with the old timeout is not restored.
    """
    # ###################### DISCLAMER #######################
    # ################### TEMPORARY FEATURE ####################
    # this part is deleted once all old versions of SPPMon have been migrated
    # use at own caution
    # ############################################################

    # without a given name the current database is also used as source
    if (not old_database_name):
        old_database_name = self.database.name

    LOGGER.info(
        f"transfering the data from database {old_database_name} into {self.database.name}."
    )

    LOGGER.info("Computing queries to be send to the server.")
    queries: List[str] = []
    # all tables into their respective, data will be dropped if over RP-Time
    # the WHERE-clause limits the selection to the retention duration of the target table
    for table in self.database.tables.values():
        query_str = f"SELECT * INTO {table} FROM {old_database_name}.autogen.{table.name} WHERE time > now() - {table.retention_policy.duration} GROUP BY *"
        queries.append(query_str)

    # Compute the dropped data CQ-Like into the new tables.
    for con_query in self.database.continuous_queries:
        if (con_query.select_query):
            query_str: str = con_query.select_query.to_query()

            # replacing the rp of the string is easier then everything else
            # group 1: whole FROM-clause up to GROUP BY, group 2: qualified table,
            # group 3: first dot-separated part, group 4: second dot-separated part
            # presumably the parts are database and retention policy, see the names below
            match = re.search(r"(FROM ((.+)\.(.+)\..+) GROUP BY)",
                              query_str)
            if (not match):
                raise ValueError("error when matching")

            from_clause = match.group(1)
            full_qualified_table = match.group(2)
            database_str = match.group(3)
            rp_str = match.group(4)

            # plain string replacement: swap in the old database and the autogen policy
            new_f_q_t = full_qualified_table.replace(
                database_str, old_database_name)
            new_f_q_t = new_f_q_t.replace(rp_str, "autogen")

            if (con_query.select_query.into_table is None):
                ExceptionUtils.error_message(
                    f"unable to process the query due an internal error: {query_str}"
                )
                continue

            # a duration of '0s' gets no time restriction
            if (con_query.select_query.into_table.retention_policy.duration
                    != '0s'):
                # add where clause to prevent dataloss due overflowing retention drop.
                if (re.search("WHERE", new_f_q_t)):
                    new_f_q_t += " AND "
                else:
                    new_f_q_t += " WHERE "
                new_f_q_t += f"time > now() - {con_query.select_query.into_table.retention_policy.duration}"

            # insert new where clause into the match
            new_from_clause = from_clause.replace(full_qualified_table,
                                                  new_f_q_t)
            new_query_str = query_str.replace(from_clause, new_from_clause)

            queries.append(new_query_str)

    LOGGER.info("Finished Computing, starting to send.")

    # how many lines were transfered
    line_count: int = 0
    # how often was a query partially written, not line count!
    dropped_count: int = 0
    # how often was data dropped above the 10.000 limit?
    # also counts queries which failed with any other error
    critical_drop: int = 0

    LOGGER.info("starting transfer of data")

    # disable timeout
    # the old timeout is remembered and a new client with a timeout of 7200 is created
    old_timeout = self.__client._timeout
    self.__client = InfluxDBClient(  # type: ignore
        host=self.__address,
        port=self.__port,
        username=self.__user,
        password=self.__password,
        ssl=self.__use_ssl,
        verify_ssl=self.__verify_ssl,
        timeout=7200)
    # ping to make sure connection works
    version: str = self.__client.ping()
    LOGGER.info(
        f"Connected again to influxdb with new timeout of {self.__client._timeout}, version: {version}"
    )

    # counts the processed result points, used for the progress message
    i = 0

    for query in queries:
        try:
            start_time = time.perf_counter()
            # seems like you may only send one SELECT INTO at once via python
            result = self.__client.query(  # type: ignore
                query=query,
                epoch='s',
                database=self.database.name)
            end_time = time.perf_counter()

            # count lines written, max 1
            # note: the loop variable reuses the name `result` for each point
            for result in result.get_points():
                i += 1
                line_count += result["written"]
                LOGGER.info(
                    f'query {i}/{len(queries)}: {result["written"]} lines in {end_time-start_time}'
                )

        except InfluxDBClientError as error:
            # only raise if the error is unexpected
            # dropped=10000: abort the whole transfer
            if (re.search(
                    f"partial write: points beyond retention policy dropped=10000",
                    error.content)):
                critical_drop += 1
                raise ValueError(
                    "transfer of data failed, retry manually with a shorter WHERE-clause",
                    query)
            # any other dropped count is only counted, everything else is reported
            if (re.search(
                    f"partial write: points beyond retention policy dropped=",
                    error.content)):
                dropped_count += 1
            else:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    f"transfer of data failed for query {query}")
                critical_drop += 1

        except (InfluxDBServerError,
                requests.exceptions.ConnectionError) as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message=f"transfer of data failed for query {query}")
            critical_drop += 1

    # reset timeout
    # a new client with the remembered timeout replaces the long-running one
    self.__client = InfluxDBClient(  # type: ignore
        host=self.__address,
        port=self.__port,
        username=self.__user,
        password=self.__password,
        ssl=self.__use_ssl,
        verify_ssl=self.__verify_ssl,
        timeout=old_timeout)
    # ping to make sure connection works
    version: str = self.__client.ping()
    LOGGER.info(
        f"Connected again to influxdb with old timeout of {self.__client._timeout}, version: {version}"
    )

    LOGGER.info("transfer of data sucessfully")
    LOGGER.info(f"Total transfered {line_count} lines of results.")
    if (dropped_count):
        LOGGER.info(
            f"Could not count lines of {dropped_count} queries due an expected error. No need for manual action."
        )
    if (critical_drop):
        LOGGER.info(
            f"Could not transfer data of {critical_drop} tables, check messages above to retry manually!"
            +
            "Please send the query manually with a adjusted 'from table': '$database.autogen.tablename'\n "
            +
            f"Adjust other values as required. Drop due Retention Policy is 'OK' until 10.000.\n"
            +
            "if it reaches 10.000 you need to cut the query into smaller bits."
        )
def job_logs(self) -> None:
    """Saves all jobLogs for the jobsessions in the influx catalog.

    Make sure to call `get_all_jobs` before to acquire all jobsessions.
    In order to save them it deletes and rewrites all affected jobsession entries.
    It automatically parses certain jobLogs into additional stats, defined by `supported_ids`.

    Workflow:
        1. Select every row of the `jobs` table which is not yet flagged with
           `jobsLogsStored = 'True'` and lies within both the job log retention time
           and the retention policy of the table.
        2. Request the jobLogs of each selected jobsession from the REST-API.
           Sessions whose request fails are skipped.
        3. Rename the log keys, compute additional stats and buffer the logs
           for the `jobLogs` table.
        4. Delete the selected rows and buffer them again with updated
           `jobLogsCount` / `jobsLogsStored` values.
    """
    table = self.__influx_client.database['jobs']

    # only store if there is something to store -> limited by job log retention time.
    where_str = f"jobsLogsStored <> 'True' and time > now() - {self.__job_log_retention_time}"
    where_str += f' AND time > now() - {table.retention_policy.duration}'

    logs_total_count = 0

    LOGGER.info("> getting joblogs for jobsessions without saved logs")
    LOGGER.info(">> requesting jobList from database")

    # Select all jobs without joblogs
    query = SelectionQuery(
        keyword=Keyword.SELECT,
        tables=[table],
        fields=['*'],
        where_str=where_str)

    # send query and compute
    result = self.__influx_client.send_selection_query(query)  # type: ignore
    result_list: List[Dict[str, Any]] = list(result.get_points())  # type: ignore

    rows_affected = len(result_list)
    LOGGER.info(
        ">>> number of jobs with no joblogs stored in Influx database: {}".format(rows_affected))

    job_log_dict: Dict[int, List[Dict[str, Any]]] = {}

    # request all jobLogs from REST-API
    # if errors occur, skip single row and debug
    for row in result_list:
        job_session_id: Optional[int] = row.get('id', None)

        # if somehow id is missing: skip
        if job_session_id is None:
            ExceptionUtils.error_message(f"Error: joblogId missing for row {row}")
            continue

        if job_session_id in job_log_dict:
            ExceptionUtils.error_message(f"Error: joblogId duplicate, skipping.{job_session_id}")
            continue

        # progress message: always when verbose, otherwise only every 5th session
        if self.__verbose or len(job_log_dict) % 5 == 0:
            LOGGER.info(
                f">>> requested joblogs for {len(job_log_dict)} / {rows_affected} job sessions.")

        # request job_session_id
        try:
            if self.__verbose:
                LOGGER.info(
                    f"requesting jobLogs {self.__job_log_type} for session {job_session_id}.")
            LOGGER.debug(
                f"requesting jobLogs {self.__job_log_type} for session {job_session_id}.")

            # cant use query something like everywhere due the extra params needed
            job_log_list = self.__api_queries.get_job_log_details(
                jobsession_id=job_session_id,
                job_logs_type=self.__job_log_type)
        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message=f"error when api-requesting joblogs for job_session_id {job_session_id}, skipping it")
            continue

        # default empty list if no details available -> should not happen, in for safety reasons.
        # This guard has to run BEFORE the first `len()` call below, otherwise a `None`
        # result raises a TypeError instead of being reported.
        # if this is none, go down to rest client and fix it. Should be empty list.
        if job_log_list is None:
            job_log_list = []
            ExceptionUtils.error_message(
                "A joblog_list was none, even if the type does not allow it. Please report to developers.")

        if self.__verbose:
            LOGGER.info(f">>> Found {len(job_log_list)} logs for jobsessionId {job_session_id}")
        LOGGER.debug(f"Found {len(job_log_list)} logs for jobsessionId {job_session_id}")

        job_log_dict[job_session_id] = job_log_list

    # list to be inserted after everything is updated
    insert_list: List[Dict[str, Any]] = []

    # jobsession id -> number of logs already buffered for it.
    # Rows sharing one id share the same log list; it must only be renamed and
    # buffered once, a second pass would fail on the already popped "id" key.
    buffered_log_counts: Dict[Any, int] = {}

    # Query data in ranges to avoid too many requests
    # Results from first select query above
    for row in result_list:
        # `.get` like in the request loop above: a row without id is reported, not a KeyError
        job_id: Optional[int] = row.get('id', None)
        job_log_list: Optional[List[Dict[str, Any]]] = job_log_dict.get(job_id, None)

        if job_log_list is None:
            ExceptionUtils.error_message(
                f"missing job_log_list even though it is in influxdb for jobId {job_id}. Skipping it")
            continue

        if job_id in buffered_log_counts:
            # duplicate row of an already handled jobsession: only re-flag the row
            job_logs_count = buffered_log_counts[job_id]
            LOGGER.debug(
                f"joblogs for jobsessionId {job_id} already buffered, only updating the duplicate row")
        else:
            # jobLogsCount will be zero if jobLogs are deleted after X days by maintenance jobs,
            # GUI default is 60 days
            job_logs_count = len(job_log_list)
            buffered_log_counts[job_id] = job_logs_count

            if self.__verbose:
                LOGGER.info(
                    ">>> storing {} joblogs for jobsessionId: {} in Influx database"
                    .format(len(job_log_list), job_id))
            LOGGER.debug(
                ">>> storing {} joblogs for jobsessionId: {} in Influx database"
                .format(len(job_log_list), job_id))

            for job_log in job_log_list:
                # rename log keys and add additional information
                job_log["jobId"] = row.get("jobId", None)
                job_log["jobName"] = row.get("jobName", None)
                job_log["jobExecutionTime"] = row.get("start", None)
                job_log["jobLogId"] = job_log.pop("id")
                job_log["jobSessionId"] = job_log.pop("jobsessionId")

            # compute other stats out of jobList
            try:
                self.__job_logs_to_stats(job_log_list)
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error,
                    extra_message=f"Failed to compute stats out of job logs, skipping for jobsessionId {job_id}")

            for job_log in job_log_list:
                # dump message params to allow saving as string
                job_log["messageParams"] = json.dumps(job_log["messageParams"])

            # if list is empty due being erased etc it will simply return and do nothing
            self.__influx_client.insert_dicts_to_buffer(
                list_with_dicts=job_log_list, table_name="jobLogs")

            logs_total_count += job_logs_count

        # update job table and set jobsLogsStored = True, jobLogsCount = len(jobLogDetails)
        # copy dict to allow update without changing the selected row
        updated_row = dict(row.items())
        updated_row["jobLogsCount"] = job_logs_count
        updated_row["jobsLogsStored"] = True
        insert_list.append(updated_row)

    # Delete data to allow reinsert with different tags
    # NOTE(review): the delete uses the same where-clause as the selection, so rows which
    # were skipped above (missing id / failed REST request) are removed as well but are
    # not part of `insert_list` - confirm whether this loss is intended.
    delete_query = SelectionQuery(
        keyword=Keyword.DELETE,
        tables=[table],
        where_str=where_str)

    # now send remove query to prevent data loss
    self.__influx_client.send_selection_query(delete_query)  # type: ignore

    # Insert data after everything is completed
    self.__influx_client.insert_dicts_to_buffer(table.name, insert_list)

    LOGGER.info(">>> inserting a total of {} logs".format(logs_total_count))
def main(self):
    """Runs every collection group enabled for this execution and exits afterwards.

    Each group is only executed if its flag is set and the matching methods-object exists.
    After a group finished, the influx insert buffer is flushed.
    A `ValueError` raised within a group is reported and the group is skipped,
    the remaining groups are still executed.
    """
    if not self.influx_client:
        ExceptionUtils.error_message(
            "somehow no influx client is present even after init")
        self.exit(ERROR_CODE)

    def run_group(failure_message, *steps):
        """Executes all steps in order, flushes the buffer and reports a ValueError of any of them."""
        try:
            for step in steps:
                step()
            self.influx_client.flush_insert_buffer()
        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error, extra_message=failure_message)

    # ##################### SYSTEM METHODS #######################
    if self.sites and self.system_methods:
        run_group(
            "Top-level-error when requesting sites, skipping them all",
            self.system_methods.sites)

    if self.cpu and self.system_methods:
        run_group(
            "Top-level-error when collecting cpu stats, skipping them all",
            self.system_methods.cpuram)

    if self.spp_catalog and self.system_methods:
        run_group(
            "Top-level-error when collecting file system stats, skipping them all",
            self.system_methods.sppcatalog)

    # ####################### JOB METHODS ########################
    if self.jobs and self.job_methods:
        # store all jobs grouped by jobID
        run_group(
            "Top-level-error when requesting jobs, skipping them all",
            self.job_methods.get_all_jobs)

    if self.job_logs and self.job_methods:
        # store all job logs per job session instance
        run_group(
            "Top-level-error when requesting job logs, skipping them all",
            self.job_methods.job_logs)

    # ####################### SSH METHODS ########################
    if self.ssh and self.ssh_methods:
        # execute ssh statements for, VSNAP, VADP, other ssh hosts
        run_group(
            "Top-level-error when excecuting ssh commands, skipping them all",
            self.ssh_methods.ssh)

    if self.process_stats and self.ssh_methods:
        # execute process stats for server
        run_group(
            "Top-level-error when excecuting ssh process statistic commands, skipping them all",
            self.ssh_methods.process_stats)

    # ################### HYPERVISOR METHODS #####################
    if self.vms and self.hypervisor_methods:
        run_group(
            "Top-level-error when requesting all VMs, skipping them all",
            self.hypervisor_methods.store_vms)

    if self.sla_stats and self.hypervisor_methods:
        # number of VMs per SLA and sla dumps, both within the same guarded group
        run_group(
            "Top-level-error when requesting and computing VMs per sla, skipping them all",
            self.hypervisor_methods.vms_per_sla,
            self.hypervisor_methods.sla_dumps)

    if self.vm_stats and self.hypervisor_methods:
        # retrieve and calculate VM inventory summary
        run_group(
            "Top-level-error when creating inventory summary, skipping them all",
            self.hypervisor_methods.create_inventory_summary)

    if self.vadps and self.hypervisor_methods:
        run_group(
            "Top-level-error when requesting vadps, skipping them all",
            self.hypervisor_methods.vadps)

    if self.storages and self.hypervisor_methods:
        run_group(
            "Top-level-error when collecting storages, skipping them all",
            self.hypervisor_methods.storages)

    # ###################### OTHER METHODS #######################
    if OPTIONS.create_dashboard:
        try:
            if not self.influx_client:
                raise ValueError(
                    "need the influxclient to create the dashboard")
            OtherMethods.create_dashboard(
                dashboard_folder_path=OPTIONS.dashboard_folder_path,
                database_name=self.influx_client.database.name)
        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message="Top-level-error when creating dashboards")

    # ###################### DISCLAMER #######################
    # ################### TEMPORARY FEATURE ####################
    # this part is deleted once all old versions of SPPMon have been migrated
    # use at own caution
    # ############################################################
    if OPTIONS.transfer_data:
        try:
            self.influx_client.transfer_data(OPTIONS.old_database)
        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message="Top-level-error when transfering data storages.")

    self.exit()
def set_optional_configs(self, config_file: Dict[str, Any]) -> None:
    """Sets up any optional infrastructure, to be called within the init.

    Be aware not everything may be initialized on call time.
    Add config here if the system should not abort if it is missing.

    Sets up, in this order: the REST-client including login, the system-, job- and
    hypervisor-methods and - only if ssh or process stats are requested - the ssh clients.
    A `ValueError` within one of these parts is reported and that part is left out,
    it does not abort the execution.

    Arguments:
        config_file {Dict[str, Any]} -- Opened Config file
    """
    if not config_file:
        ExceptionUtils.error_message(
            "missing or empty config file, aborting.")
        self.exit(error_code=ERROR_CODE_CMD_LINE)

    # ############################ REST-API #####################################
    try:
        auth_rest = SppUtils.get_cfg_params(
            param_dict=config_file, param_name="sppServer")

        if not isinstance(auth_rest, dict):
            raise ValueError("sppServer config need to be dict")

        # the key spelling is part of the config file format, do not "fix" it here
        self.job_log_retention_time = auth_rest.get("jobLog_rentation", "60d")

        ConnectionUtils.verbose = OPTIONS.verbose
        ConnectionUtils.timeout_reduction = self.timeout_reduction
        ConnectionUtils.allowed_time_diff_quota = self.allowed_time_diff_quota
        ConnectionUtils.maximum_increase_pagesize = self.maximum_increase_pagesize

        if OPTIONS.minimumLogs:
            rest_time_out = self.minimum_timeout
            rest_preferred_time = self.loaded_preferred_time
        else:
            rest_time_out = self.default_timeout
            rest_preferred_time = self.preferred_time

        self.rest_client = RestClient(
            auth_rest, rest_time_out, rest_preferred_time,
            self.page_size, self.min_page_size, self.send_retries,
            OPTIONS.verbose)

        self.api_queries = ApiQueries(self.rest_client)
        self.rest_client.login()

    except ValueError as error:
        ExceptionUtils.exception_info(
            error=error,
            extra_message="REST-API is not available due Config error")
        # reset both, a client without a successful login must not be used
        self.rest_client = None
        self.api_queries = None

    # ######################## System, Job and Hypervisor Methods ##################
    try:
        # explicit ahead due dependency
        self.system_methods = SystemMethods(
            self.influx_client, self.api_queries, OPTIONS.verbose)
    except ValueError as error:
        ExceptionUtils.exception_info(error=error)

    try:
        self.job_methods = JobMethods(
            self.influx_client, self.api_queries,
            self.job_log_retention_time, self.minLogs_joblog_type,
            self.default_joblog_type, OPTIONS.verbose, OPTIONS.minimumLogs)
    except ValueError as error:
        ExceptionUtils.exception_info(error=error)

    try:
        # dependent on system methods
        self.hypervisor_methods = ProtectionMethods(
            self.system_methods, self.influx_client,
            self.api_queries, OPTIONS.verbose)
    except ValueError as error:
        ExceptionUtils.exception_info(error=error)

    # ############################### SSH #####################################
    if self.ssh or self.process_stats:
        try:
            auth_ssh = SppUtils.get_cfg_params(
                param_dict=config_file, param_name="sshclients")

            ssh_clients: List[SshClient] = []
            if not isinstance(auth_ssh, list):
                raise ValueError("not a list of sshconfig given", auth_ssh)

            for client_ssh in auth_ssh:
                try:
                    ssh_clients.append(SshClient(client_ssh))
                except ValueError as error:
                    # A malformed entry is not necessarily a dict, reading the name
                    # must not raise a second error out of this handler.
                    if isinstance(client_ssh, dict):
                        client_name = client_ssh.get('name', 'ERROR WHEN GETTING NAME')
                    else:
                        client_name = 'ERROR WHEN GETTING NAME'
                    ExceptionUtils.exception_info(
                        error=error,
                        extra_message=f"Setting up one client failed, skipping it. Client: {client_name}")

            # set from None to methods once finished
            self.ssh_methods = SshMethods(
                influx_client=self.influx_client,
                ssh_clients=ssh_clients,
                verbose=OPTIONS.verbose)

        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message="SSH-Commands are not available due Config error")
def flush_insert_buffer(self, fallback: bool = False) -> None:
    """Sends the content of the insert buffer to the influxdb server, table by table.

    The queries of a table are written with a maximum batch size, which is the
    fallback batch size if `fallback` is set. After a table is sent it is removed
    from the buffer and the send-metrics of this transfer are added to the buffer,
    flush again to send those too.
    If sending fails with a client- or connection error, the whole buffer is flushed
    once more in fallback mode. This is not repeated if already in fallback mode.

    Keyword Arguments:
        fallback {bool} -- Whether to use fallback-options. Does not repeat on fallback (default: {False})

    Raises:
        ValueError: Critical: The query Buffer is None.
    """
    if self.__insert_buffer is None:
        raise ValueError(
            "query buffer is somehow None, this should never happen!")

    # nothing buffered, nothing to send
    if not self.__insert_buffer:
        return

    # Snapshot of the keys: the buffer is changed while iterating,
    # both by the fallback re-run and by the metrics added at the end of each round.
    buffered_tables = list(self.__insert_buffer.keys())

    for table in buffered_tables:
        # empty default in case the table was already popped by a fallback run
        line_queries = [
            buffered_query.to_query()
            for buffered_query in self.__insert_buffer.get(table, [])
        ]
        item_count = len(line_queries)
        if item_count == 0:
            continue

        batch_size = (
            self.__fallback_max_batch_size if fallback
            else self.__query_max_batch_size)

        re_send: bool = False
        error_msg: Optional[str] = None

        # stop time for send progress
        start_time = time.perf_counter()
        try:
            self.__client.write_points(
                points=line_queries,
                database=self.database.name,
                retention_policy=table.retention_policy.name,
                batch_size=batch_size,
                time_precision='s',
                protocol='line')
            end_time = time.perf_counter()
        except InfluxDBClientError as error:  # type: ignore
            dropped = re.match(r".*partial write:[\s\w]+=(\d+).*", error.content)
            # Fewer points reported than the batch size: deliberately ignored.
            # (original remark: beyond 10.000 everything will be lost, below still written;
            # its unavoidable and doesnt change anything)
            tolerable = dropped and int(dropped.group(1)) < batch_size
            if not tolerable:
                if re.match(r".*partial write: unable to parse .*", error.content):
                    # some messages are lost, other written -> report only, no re-send
                    ExceptionUtils.exception_info(
                        error=error,
                        extra_message=f"Some messages were lost when sending buffer for table {table.name}, but everything else should be OK")
                    error_msg = getattr(error, 'message', repr(error))
                else:
                    ExceptionUtils.exception_info(
                        error=error,
                        extra_message=f"Client error when sending insert buffer for table {table.name}.")
                    error_msg = getattr(error, 'message', repr(error))
                    # re-try with a smaller batch size, unsure if this helps
                    re_send = True
        except (InfluxDBServerError, ConnectionError,
                requests.exceptions.ConnectionError) as error:  # type: ignore
            ExceptionUtils.exception_info(
                error=error,
                extra_message=f"Connection error when sending insert buffer for table {table.name}.")
            error_msg = getattr(error, 'message', repr(error))
            re_send = True

        # measure timing, also covers the failed attempts
        end_time = time.perf_counter()

        if re_send and not fallback:
            ExceptionUtils.error_message(
                "Trying to send influx buffer again with fallback options")
            self.flush_insert_buffer(fallback=True)

        # clear the table which just got sent
        # None-default to avoid a key error if the table was popped on fallback
        self.__insert_buffer.pop(table, None)

        # add metrics for the next sending process.
        # compute duration, metrics computed per batch
        duration = end_time - start_time
        self.__insert_metrics_to_buffer(
            Keyword.INSERT, table, duration, item_count, error=error_msg)