def job_logs(self) -> None:
    """Saves all jobLogs for the jobsessions in the influx catalog.

    Make sure to call `get_all_jobs` before to acquire all jobsessions.
    In order to save them it deletes and rewrites all affected jobsession entries.
    It automatically parses certain jobLogs into additional stats, defined by `supported_ids`.

    Flow:
        1. Select all job rows from influx that do not have their logs stored yet.
        2. Fetch the jobLogs for each of those sessions via the REST-API.
        3. Compute extra stats out of the logs and buffer the logs for insert.
        4. Delete the affected job rows and reinsert them with updated
           `jobsLogsStored`/`jobLogsCount` fields (influx tags cannot be
           updated in place, hence delete + reinsert).
    """
    table = self.__influx_client.database['jobs']
    # Only select rows whose logs are not stored yet, and only within the
    # job log retention time — older logs are gone on the server anyway.
    where_str = 'jobsLogsStored <> \'True\' and time > now() - %s' % self.__job_log_retention_time
    # Also restrict to the table's own retention window so the later
    # DELETE (same where_str) cannot touch rows we never fetched.
    where_str += f' AND time > now() - {table.retention_policy.duration}'

    jobs_updated = 0
    logs_total_count = 0
    LOGGER.info("> getting joblogs for jobsessions without saved logs")
    LOGGER.info(">> requesting jobList from database")

    # Select all jobs without joblogs
    keyword = Keyword.SELECT
    query = SelectionQuery(
        keyword=keyword,
        tables=[table],
        fields=['*'],
        where_str=where_str
    )
    # send query and compute
    result = self.__influx_client.send_selection_query(query)  # type: ignore
    result_list: List[Dict[str, Any]] = list(result.get_points())  # type: ignore

    rows_affected = len(result_list)

    LOGGER.info(">>> number of jobs with no joblogs stored in Influx database: {}"
                .format(rows_affected))

    # jobsession id -> list of its joblog dicts, filled by the REST requests below
    job_log_dict: Dict[int, List[Dict[str, Any]]] = {}

    # request all jobLogs from REST-API
    # if errors occur, skip single row and debug
    for row in result_list:
        job_session_id: Optional[int] = row.get('id', None)

        # if somehow id is missing: skip
        if(job_session_id is None):
            ExceptionUtils.error_message(f"Error: joblogId missing for row {row}")
            continue

        if(job_session_id in job_log_dict):
            ExceptionUtils.error_message(f"Error: joblogId duplicate, skipping.{job_session_id}")
            continue

        # progress report: always when verbose, otherwise every 5th session
        if(self.__verbose):
            LOGGER.info(
                f">>> requested joblogs for {len(job_log_dict)} / {rows_affected} job sessions.")
        elif(len(job_log_dict) % 5 == 0):
            LOGGER.info(
                f">>> requested joblogs for {len(job_log_dict)} / {rows_affected} job sessions.")

        # request job_session_id
        try:
            if(self.__verbose):
                LOGGER.info(f"requesting jobLogs {self.__job_log_type} for session {job_session_id}.")
            LOGGER.debug(f"requesting jobLogs {self.__job_log_type} for session {job_session_id}.")

            # cant use query something like everywhere due the extra params needed
            job_log_list = self.__api_queries.get_job_log_details(
                jobsession_id=job_session_id,
                job_logs_type=self.__job_log_type)
        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message=f"error when api-requesting joblogs for job_session_id {job_session_id}, skipping it")
            continue

        if(self.__verbose):
            LOGGER.info(f">>> Found {len(job_log_list)} logs for jobsessionId {job_session_id}")
        LOGGER.debug(f"Found {len(job_log_list)} logs for jobsessionId {job_session_id}")
        # default empty list if no details available -> should not happen, in for safety reasons
        # if this is none, go down to rest client and fix it. Should be empty list.
        if(job_log_list is None):
            job_log_list = []
            ExceptionUtils.error_message(
                "A joblog_list was none, even if the type does not allow it. Please report to developers.")
        job_log_dict[job_session_id] = job_log_list

    # rows to be reinserted into the jobs table after the DELETE below
    insert_list: List[Dict[str, Any]] = []

    # Query data in ranges to avoid too many requests
    # Results from first select query above
    for row in result_list:
        job_id: int = row['id']
        job_log_list: Optional[List[Dict[str, Any]]] = job_log_dict.get(job_id, None)

        if(job_log_list is None):
            ExceptionUtils.error_message(
                f"missing job_log_list even though it is in influxdb for jobId {job_id}. Skipping it")
            continue

        # jobLogsCount will be zero if jobLogs are deleted after X days by maintenance jobs, GUI default is 60 days
        job_logs_count = len(job_log_list)

        if(self.__verbose):
            LOGGER.info(">>> storing {} joblogs for jobsessionId: {} in Influx database".format(
                len(job_log_list), job_id))
        LOGGER.debug(">>> storing {} joblogs for jobsessionId: {} in Influx database".format(
            len(job_log_list), job_id))

        # compute other stats out of jobList
        try:
            self.__job_logs_to_stats(job_log_list)
        except ValueError as error:
            ExceptionUtils.exception_info(
                error, extra_message=f"Failed to compute stats out of job logs, skipping for jobsessionId {job_id}")

        for job_log in job_log_list:
            # rename key 'id' to jobLogId and reformat messageParams
            # NOTE(review): jobSessionId is taken from the row's "jobId"
            # field (the parent job), not the session's own 'id' — looks
            # intentional for tagging, but confirm against the jobLogs schema.
            job_log["jobSessionId"] = row.get("jobId", None)
            job_log["jobSessionName"] = row.get("jobName", None)
            job_log["jobLogId"] = job_log.pop("id")
            # messageParams is a nested structure; influx fields must be scalars
            job_log["messageParams"] = json.dumps(
                job_log["messageParams"])

        # if list is empty due being erased etc it will simply return and do nothing
        self.__influx_client.insert_dicts_to_buffer(
            list_with_dicts=job_log_list, table_name="jobLogs")

        jobs_updated += 1
        logs_total_count += job_logs_count
        # update job table and set jobsLogsStored = True, jobLogsCount = len(jobLogDetails)
        update_fields = {
            "jobLogsCount": job_logs_count,
            "jobsLogsStored": True
        }
        # copy dict to allow update without errors
        mydict = dict(row.items())
        # update fields
        for(key, value) in update_fields.items():
            mydict[key] = value
        insert_list.append(mydict)

    # Delete data to allow reinsert with different tags
    delete_query = SelectionQuery(
        keyword=Keyword.DELETE,
        tables=[table],
        where_str=where_str
    )
    # now send remove query to prevent data loss
    self.__influx_client.send_selection_query(delete_query)
    # Insert data after everything is completed
    self.__influx_client.insert_dicts_to_buffer(table.name, insert_list)

    LOGGER.info(">>> inserted a total of {} logs".format(logs_total_count))
def __query_url(self, url: str) -> Tuple[Dict[str, Any], float]:
    """Sends a GET request to this endpoint, retrying with a reduced pagesize on timeout.

    The `pageSize` URL parameter is always normalized to `self.__page_size`
    before sending so every request uses a consistent page size. On a
    ReadTimeout the pagesize is shrunk and the request repeated until it
    succeeds, the retry budget (`self.__send_retries`) is exhausted, or the
    minimum pagesize was already in use.

    Arguments:
        url {str} -- URL to be queried.

    Raises:
        ValueError: No URL specified
        ValueError: Error when requesting endpoint
        ValueError: Wrong status code
        ValueError: failed to parse result
        ValueError: Timeout when sending request

    Returns:
        Tuple[Dict[str, Any], float] -- Result of the request with the required send time
    """
    if (not url):
        raise ValueError("no url specified")
    LOGGER.debug(f"endpoint request {url}")

    failed_trys: int = 0
    response_query: Optional[Response] = None

    while (response_query is None):

        # read currently encoded pagesize from the url, if any
        actual_page_size = ConnectionUtils.url_get_param_value(
            url=url, param_name="pageSize")

        # Always set pagesize to avoid differing pagesizes chosen by the system
        if (not actual_page_size):
            url = ConnectionUtils.url_set_param(
                url=url, param_name="pageSize", param_value=self.__page_size)
        else:
            # parse the recorded pagesize; -1 forces a rewrite below on garbage
            try:
                actual_page_size = int(actual_page_size[0])
            except (ValueError, KeyError) as error:
                ExceptionUtils.exception_info(
                    error, extra_message="invalid page size recorded")
                actual_page_size = -1

            # adjust pagesize of url to the instance-wide value
            if (actual_page_size != self.__page_size):
                LOGGER.debug(
                    f"setting new pageSize from {actual_page_size} to {self.__page_size}")
                url = ConnectionUtils.url_set_param(
                    url=url, param_name="pageSize", param_value=self.__page_size)

        # send the query
        try:
            start_time = time.perf_counter()
            response_query = requests.get(  # type: ignore
                url=url, headers=self.__headers, verify=False,
                timeout=self.__timeout)
            end_time = time.perf_counter()
            send_time = (end_time - start_time)

        except requests.exceptions.ReadTimeout as timeout_error:
            # timeout occured, increasing failed tries
            failed_trys += 1

            # #### Aborting cases ######
            if (self.__send_retries < failed_trys):
                ExceptionUtils.exception_info(error=timeout_error)
                # read start index for debugging
                start_index = ConnectionUtils.url_get_param_value(
                    url=url, param_name="pageStartIndex")
                # report timeout with full information
                raise ValueError(
                    "timeout after repeating a maximum ammount of times.",
                    timeout_error, failed_trys, self.__page_size,
                    start_index) from timeout_error

            if (self.__page_size == self.__min_page_size):
                ExceptionUtils.exception_info(error=timeout_error)
                # read start index for debugging
                start_index = ConnectionUtils.url_get_param_value(
                    url=url, param_name="pageStartIndex")
                # report timeout with full information
                raise ValueError(
                    "timeout after using minumum pagesize. repeating the request is of no use.",
                    timeout_error, failed_trys, self.__page_size,
                    start_index) from timeout_error

            # #### continuing cases ######
            if (self.__send_retries == failed_trys):  # last try
                LOGGER.debug(
                    f"Timeout error when requesting, now last try of total {self.__send_retries}. Reducing pagesize to minimum for url: {url}"
                )
                if (self.__verbose):
                    LOGGER.info(
                        f"Timeout error when requesting, now last try of total {self.__send_retries}. Reducing pagesize to minimum for url: {url}"
                    )
                # repeat with minimal possible size
                self.__page_size = self.__min_page_size

            elif (self.__send_retries > failed_trys):  # more than 1 try left
                LOGGER.debug(
                    f"Timeout error when requesting, now on try {failed_trys} of {self.__send_retries}. Reducing pagesize for url: {url}"
                )
                if (self.__verbose):
                    LOGGER.info(
                        f"Timeout error when requesting, now on try {failed_trys} of {self.__send_retries}. Reducing pagesize for url: {url}"
                    )
                # repeat with reduced page size
                self.__page_size = ConnectionUtils.adjust_page_size(
                    page_size=self.__page_size,
                    min_page_size=self.__min_page_size,
                    time_out=True)

        except requests.exceptions.RequestException as error:
            ExceptionUtils.exception_info(error=error)
            raise ValueError("error when requesting endpoint", error) from error

    if response_query.status_code != 200:
        raise ValueError("Wrong Status code when requesting endpoint data",
                         response_query.status_code, url, response_query)

    try:
        response_json: Dict[str, Any] = response_query.json()
    except (json.decoder.JSONDecodeError, ValueError) as error:
        # fixed message: this is the GET helper, the old text said "post request"
        raise ValueError("failed to parse query in restAPI get request",
                         response_query) from error

    return (response_json, send_time)
def post_data(
        self,
        endpoint: Optional[str] = None,
        url: Optional[str] = None,
        post_data: Optional[str] = None,
        auth: Optional[HTTPBasicAuth] = None) -> Dict[str, Any]:
    """Queries endpoint by a POST-Request.

    Only specify `auth` if you want to log in. Either specify endpoint or url.

    Keyword Arguments:
        endpoint {Optional[str]} -- Endpoint to be queried (default: {None})
        url {Optional[str]} -- URL to be queried (default: {None})
        post_data {Optional[str]} -- data with filters/parameters (default: {None})
        auth {Optional[HTTPBasicAuth]} -- auth if you want to log in (default: {None})

    Raises:
        ValueError: no endpoint or url specified
        ValueError: both url and endpoint specified
        ValueError: no post_data or auth specified
        ValueError: error when sending post data
        ValueError: wrong status code in response
        ValueError: failed to parse query

    Returns:
        Dict[str, Any] -- parsed JSON body of the response
    """
    if (not endpoint and not url):
        raise ValueError("neither url nor endpoint specified")
    if (endpoint and url):
        raise ValueError("both url and endpoint specified")
    if (not post_data and not auth):
        raise ValueError("either provide auth or post_data")

    if (not url):
        url = self.__srv_url + endpoint
    LOGGER.debug(f"post_data request {url} {post_data} {auth}")

    try:
        # two branches: either sending payload data, or logging in via auth
        if (post_data):
            response_query: Response = requests.post(  # type: ignore
                url, headers=self.__headers, data=post_data,
                verify=False, timeout=self.__timeout)
        else:
            response_query = requests.post(  # type: ignore
                url, headers=self.__headers, auth=auth,
                verify=False, timeout=self.__timeout)
    except requests.exceptions.RequestException as error:  # type: ignore
        ExceptionUtils.exception_info(error=error)  # type: ignore
        raise ValueError("Error when sending REST-API post data",
                         endpoint, post_data) from error

    if response_query.status_code != 200:
        raise ValueError(
            "Status Code Error in REST-API post data response",
            response_query.status_code, response_query,
            endpoint, post_data)  # type: ignore

    try:
        response_json: Dict[str, Any] = response_query.json()
    except (json.decoder.JSONDecodeError, ValueError) as error:  # type: ignore
        raise ValueError("failed to parse query in restAPI post request",
                         response_query, endpoint, post_data) from error

    return response_json
def main(self):
    """Runs every requested metric-collection group, then exits.

    Each group is guarded by its CLI flag and the presence of its
    method-collection object. A ValueError inside one group is logged
    and only that group is skipped, so the remaining collections still
    run. Ends by calling `self.exit()`.
    """
    if (not self.influx_client):
        ExceptionUtils.error_message(
            "somehow no influx client is present even after init")
        self.exit(ERROR_CODE)

    def run_collection(error_message: str, *tasks) -> None:
        # Run the given zero-arg collection steps in order, then flush the
        # influx insert buffer. Any ValueError aborts only this group.
        try:
            for task in tasks:
                task()
            self.influx_client.flush_insert_buffer()
        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error, extra_message=error_message)

    # ##################### SYSTEM METHODS #######################
    if (self.sites and self.system_methods):
        run_collection(
            "Top-level-error when requesting sites, skipping them all",
            self.system_methods.sites)

    if (self.cpu and self.system_methods):
        run_collection(
            "Top-level-error when collecting cpu stats, skipping them all",
            self.system_methods.cpuram)

    if (self.spp_catalog and self.system_methods):
        run_collection(
            "Top-level-error when collecting file system stats, skipping them all",
            self.system_methods.sppcatalog)

    # ####################### JOB METHODS ########################
    if (self.jobs and self.job_methods):
        # store all jobs grouped by jobID
        run_collection(
            "Top-level-error when requesting jobs, skipping them all",
            self.job_methods.get_all_jobs)

    if (self.job_logs and self.job_methods):
        # store all job logs per job session instance
        run_collection(
            "Top-level-error when requesting job logs, skipping them all",
            self.job_methods.job_logs)

    # ####################### SSH METHODS ########################
    if (self.ssh and self.ssh_methods):
        # execute ssh statements for VSNAP, VADP, other ssh hosts
        run_collection(
            "Top-level-error when excecuting ssh commands, skipping them all",
            self.ssh_methods.ssh)

    if (self.process_stats and self.ssh_methods):
        # execute process stats for server
        run_collection(
            "Top-level-error when excecuting ssh process statistic commands, skipping them all",
            self.ssh_methods.process_stats)

    # ################### HYPERVISOR METHODS #####################
    if (self.vms and self.hypervisor_methods):
        run_collection(
            "Top-level-error when requesting all VMs, skipping them all",
            self.hypervisor_methods.store_vms)

    if (self.sla_stats and self.hypervisor_methods):
        # number of VMs per SLA and sla dumps
        run_collection(
            "Top-level-error when requesting and computing VMs per sla, skipping them all",
            self.hypervisor_methods.vms_per_sla,
            self.hypervisor_methods.sla_dumps)

    if (self.vm_stats and self.hypervisor_methods):
        # retrieve and calculate VM inventory summary
        run_collection(
            "Top-level-error when creating inventory summary, skipping them all",
            self.hypervisor_methods.create_inventory_summary)

    if (self.vadps and self.hypervisor_methods):
        run_collection(
            "Top-level-error when requesting vadps, skipping them all",
            self.hypervisor_methods.vadps)

    if (self.storages and self.hypervisor_methods):
        run_collection(
            "Top-level-error when collecting storages, skipping them all",
            self.hypervisor_methods.storages)

    # ###################### OTHER METHODS #######################
    if (OPTIONS.create_dashboard):
        # dashboards are written to disk, no influx flush required
        try:
            OtherMethods.create_dashboard(
                dashboard_folder_path=OPTIONS.dashboard_folder_path,
                database_name=self.influx_client.database.name)
        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message="Top-level-error when creating dashboards")

    # ###################### DISCLAMER #######################
    # ################### TEMPORARY FEATURE ####################
    # this part is deleted once all old versions of SPPMon have been migrated
    # use at own caution
    # ############################################################
    if (OPTIONS.transfer_data):
        try:
            self.influx_client.transfer_data(OPTIONS.old_database)
        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message="Top-level-error when transfering data storages.")

    self.exit()
def query_url(
        self,
        url: str,
        params: Optional[Dict[str, Any]] = None,
        request_type: RequestType = RequestType.GET,
        post_data: Optional[Dict[str, str]] = None,
        auth: Optional[HTTPBasicAuth] = None) -> Tuple[Dict[str, Any], float]:
    """Sends a request to this endpoint. Repeats with a reduced pagesize on timeout.

    Arguments:
        url {str} -- URL to be queried. Must contain the server-uri and
            Endpoint. Does not allow encoded parameters

    Keyword Arguments:
        params {Optional[Dict[str, Any]]} -- query parameters; `pageSize` is
            filled with the instance default if unset, an explicit `None`
            removes it entirely (default: {None})
        request_type {RequestType} -- what kind of request should be made
            (default: {RequestType.GET})
        post_data {Optional[Dict[str, str]]} -- additional data with
            filters/parameters. Only to be sent with a POST-Request
            (default: {None})
        auth {Optional[HTTPBasicAuth]} -- Basic auth to be used to login
            into SPP via POST-Request (default: {None})

    Raises:
        ValueError: No URL specified
        ValueError: Error when requesting endpoint
        ValueError: Wrong status code
        ValueError: failed to parse result
        ValueError: Timeout when sending request
        ValueError: No post-data/auth is allowed in a GET-Request

    Returns:
        Tuple[Dict[str, Any], float] -- Result of the request with the required send time
    """
    if(not url):
        raise ValueError("no url specified")
    if((post_data or auth) and request_type == RequestType.GET):
        raise ValueError("No post-data/auth is allowed in a GET-Request")
    # fixed: previously logged the builtin `type` instead of request_type
    LOGGER.debug(f"query url: {url}, type: {request_type}, post_data: {post_data} auth: {True if auth else False}")

    if(not params):
        params = {}

    failed_tries: int = 0
    response_query: Optional[Response] = None
    send_time: float = -1  # prevent unbound var

    # avoid unset pageSize to not get into SPP defaults
    if("pageSize" not in params):
        LOGGER.debug(f"setting pageSize to {self.__page_size} from unset value")
        params["pageSize"] = self.__page_size
    elif(params["pageSize"] is None):
        # explicit None means: send the request without any pageSize param
        params.pop("pageSize")

    while(response_query is None):
        # send the query
        try:
            if(request_type == RequestType.GET):
                response_query = get(
                    url=url, headers=self.__headers, verify=False,
                    params=params,
                    timeout=(self.__initial_connection_timeout, self.__timeout))
            elif(request_type == RequestType.POST):
                response_query = post(
                    url=url, headers=self.__headers, verify=False,
                    params=params,
                    json=post_data,
                    auth=auth,
                    timeout=(self.__initial_connection_timeout, self.__timeout))
            send_time = response_query.elapsed.total_seconds()

        except ReadTimeout as timeout_error:
            # timeout occured, increasing failed tries
            failed_tries += 1
            url_params = ConnectionUtils.get_url_params(url)

            # #### Aborting cases ######
            if(failed_tries > self.__max_send_retries):
                ExceptionUtils.exception_info(error=timeout_error)
                # read start index for debugging
                start_index = url_params.get("pageStartIndex", None)
                page_size = url_params.get("pageSize", None)
                # report timeout with full information
                raise ValueError("timeout after repeating a maximum ammount of times.",
                                 timeout_error, failed_tries, page_size,
                                 start_index) from timeout_error

            if(self.__page_size == self.__min_page_size):
                ExceptionUtils.exception_info(error=timeout_error)
                # read start index for debugging
                start_index = url_params.get("pageStartIndex", None)
                page_size = url_params.get("pageSize", None)
                # report timeout with full information
                raise ValueError("timeout after using minumum pagesize. repeating the request is of no use.",
                                 timeout_error, failed_tries, page_size,
                                 start_index) from timeout_error

            # #### continuing cases ######
            if(failed_tries == self.__max_send_retries):  # last try
                LOGGER.debug(f"Timeout error when requesting, now last try of total {self.__max_send_retries}. Reducing pagesize to minimum for url: {url}")
                if(self.__verbose):
                    LOGGER.info(f"Timeout error when requesting, now last try of total {self.__max_send_retries}. Reducing pagesize to minimum for url: {url}")
                # persist reduced size for further requests
                self.__page_size = self.__min_page_size

            else:  # (failed_tries < self.__max_send_retries): more than 1 try left
                LOGGER.debug(f"Timeout error when requesting, now on try {failed_tries} of {self.__max_send_retries}. Reducing pagesize for url: {url}")
                if(self.__verbose):
                    LOGGER.info(f"Timeout error when requesting, now on try {failed_tries} of {self.__max_send_retries}. Reducing pagesize for url: {url}")
                # persist reduced size for further requests
                # fixed: params["pageSize"] may have been popped above
                # (explicit None) — fall back to the instance value to
                # avoid a KeyError here.
                self.__page_size = ConnectionUtils.adjust_page_size(
                    page_size=params.get("pageSize", self.__page_size),
                    min_page_size=self.__min_page_size,
                    timeout=True)

            # repeat with the adjusted page size (identical in both branches)
            LOGGER.debug(f"setting pageSize from {params.get('pageSize', None)} to {self.__page_size}")
            params["pageSize"] = self.__page_size

        except RequestException as error:
            ExceptionUtils.exception_info(error=error)
            raise ValueError("error when requesting endpoint", error) from error

    if(not response_query.ok):
        raise ConnectionUtils.rest_response_error(
            response_query,
            "Wrong Status code when requesting endpoint data",
            url)

    try:
        response_json: Dict[str, Any] = response_query.json()
    except (json.decoder.JSONDecodeError, ValueError) as error:
        raise ValueError("failed to parse query in restAPI request",
                         response_query) from error

    return (response_json, send_time)
def _parse_pool_show_cmd(
        ssh_command: SshCommand, ssh_type: SshTypes) -> Tuple[str, List[Dict[str, Any]]]:
    """Parses the result of the `vsnap --json pool show` command, splitting it into its parts.

    Arguments:
        ssh_command {SshCommand} -- command with saved result
        ssh_type {SshTypes} -- type of the client

    Raises:
        ValueError: no command given or no result saved
        ValueError: no ssh type given

    Returns:
        Tuple[str, List[Dict[str, Any]]] -- Tuple of the tablename and a insert list
    """
    if (not ssh_command or not ssh_command.result):
        raise ValueError("no command given or empty result")
    if (not ssh_type):
        raise ValueError("no sshtype given")

    try:
        parsed_result: Dict[str, List[Dict[str, Any]]] = json.loads(ssh_command.result)
    except json.decoder.JSONDecodeError:  # type: ignore
        raise ValueError("cant decode json for pool command",
                         ssh_command.result, ssh_command, ssh_type)

    # whitelist of (possibly nested) fields to carry over per pool
    field_whitelist = [
        'compression', 'compression_ratio',
        'deduplication', 'deduplication_ratio',
        'diskgroup_size', 'encryption.enabled',
        'health', 'id', 'name', 'pool_type',
        'size_before_compression', 'size_before_deduplication',
        'size_free', 'size_total', 'size_used', 'status'
    ]
    # byte-valued fields to be converted into megabytes
    size_fields = [
        'size_before_compression', 'size_before_deduplication',
        'size_free', 'size_total', 'size_used'
    ]

    pool_result_list: List[Dict[str, Any]] = []
    for pool in parsed_result['pools']:
        record: Dict[str, Any] = {}
        for path in field_whitelist:
            (key, value) = SppUtils.get_nested_kv(path, pool)
            record[key] = value

        # rename the flattened 'encryption.enabled' key
        record['encryption_enabled'] = record.pop('enabled')

        # change unit from bytes to megabytes; parse everything first so a
        # failure leaves the record's size fields untouched
        try:
            parsed_sizes = {
                field: SppUtils.parse_unit(record[field])
                for field in size_fields
            }
            for field, byte_count in parsed_sizes.items():
                record[field] = int(byte_count / pow(2, 20)) if byte_count else None
        except KeyError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message=
                f"failed to reduce size of vsnap pool size for {record}"
            )

        # set default needed fields
        record['hostName'] = ssh_command.host_name
        record['ssh_type'] = ssh_type.name
        pool_result_list.append(record)

    return (ssh_command.table_name, pool_result_list)
def ssh_execute_commands(
        cls, ssh_clients: List[SshClient], ssh_type: SshTypes,
        command_list: List[SshCommand]
) -> List[Tuple[str, List[Dict[str, Any]]]]:
    """Executes commands via ssh on several hosts of one type.

    The hosts (other, vsnap, vadp) can be defined in the JSON configuration
    file. Commands which shall be executed on vsnap and/or vadp proxies go
    into the dedicated list of strings; 'otherCommands' are executed on
    hosts which are not of type vsnap | vadp. Unreachable hosts are skipped.

    Returns a list of (table name, rows) tuples to be inserted into influx,
    including one 'sshCmdResponse' entry with per-command execution stats.
    """
    if (not command_list):
        LOGGER.debug("No commands specified, aborting command.")
        if (cls.verbose):
            LOGGER.info("No commands specified, aborting command.")
        return []

    # only clients matching the requested type are used
    matching_clients = [
        client for client in ssh_clients
        if client.client_type is ssh_type
    ]
    if (not matching_clients):
        LOGGER.debug(
            f"No {ssh_type.name} ssh client present. Aborting command")
        if (cls.verbose):
            LOGGER.info(
                f"No {ssh_type.name} ssh client present. Aborting command")
        return []

    # per-command execution stats, collected over all clients
    command_stats: List[Dict[str, Union[str, int, None]]] = []
    # (table name, rows) tuples to insert into influx
    parsed_results: List[Tuple[str, List[Dict[str, Any]]]] = []

    for client in matching_clients:
        if (cls.verbose):
            LOGGER.info(
                f">> executing {ssh_type.name} command(s) on host {client.host_name}"
            )
        try:
            executed_commands = client.execute_commands(
                commands=command_list,
                verbose=cls.verbose)
        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message=
                "Error when executing commands, skipping this client")
            continue

        for ssh_command in executed_commands:
            # record stats about the executed command itself
            stats_row: Dict[str, Union[str, int, None]] = {
                "host": ssh_command.host_name,
                "command": ssh_command.cmd,
                "output": json.dumps(ssh_command.result),
                'ssh_type': ssh_type.name,
            }
            time_key, time_value = SppUtils.get_capture_timestamp_sec()
            stats_row[time_key] = time_value
            command_stats.append(stats_row)

            # parse the saved result into a (table, rows) tuple, if any
            try:
                table_result_tuple = ssh_command.parse_result(
                    ssh_type=ssh_type)
                if (table_result_tuple):
                    parsed_results.append(table_result_tuple)
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Error when parsing result, skipping parsing of this result"
                )

    # append the ssh stats once; every client already added into command_stats
    parsed_results.append(("sshCmdResponse", command_stats))

    return parsed_results
def insert_dicts_to_buffer(self, table_name: str,
                           list_with_dicts: List[Dict[str, Any]]) -> None:
    """Buffers a list of dicts for insertion into influxdb.

    Splits each dict into tags/fields/timestamp according to the table
    definition (define tables in database_table.py; otherwise splitting
    falls back to type analysis). Queries are only buffered, not sent —
    call `flush_insert_buffer` to send them. The buffer is auto-flushed
    once it grows beyond twice the maximum batch size.

    Arguments:
        table_name {str} -- Name of the table to be inserted
        list_with_dicts {List[Dict[str, Any]]} -- List of dicts with column name as key.

    Raises:
        ValueError: No list with dictionaries given (None) — an empty list is fine.
        ValueError: No table name given.
    """
    LOGGER.debug(f"Enter insert_dicts for table: {table_name}")
    if (list_with_dicts is None):  # None is an error, an empty list is not
        raise ValueError("missing list with dictonarys in insert")
    if (not table_name):
        raise ValueError("table name needs to be set in insert")

    # nothing to do for an empty list
    if (not list_with_dicts):
        LOGGER.debug("nothing to insert for table %s due empty list",
                     table_name)
        return

    # resolve the table instance for this name
    table = self.database[table_name]

    # build one InsertQuery per dict; skip single broken dicts
    query_buffer = []
    for record in list_with_dicts:
        try:
            # split record according to the table definition
            (tags, fields, timestamp) = table.split_by_table_def(mydict=record)
            if (isinstance(timestamp, str)):
                timestamp = int(timestamp)
            query_buffer.append(InsertQuery(table, fields, tags, timestamp))
        except ValueError as err:
            ExceptionUtils.exception_info(
                error=err, extra_message="skipping single dict to insert")
            continue

    # append the new queries to this table's buffer entry
    self.__insert_buffer.setdefault(table, []).extend(query_buffer)

    LOGGER.debug("Appended %d items to the insert buffer",
                 len(query_buffer))

    # safeguard to avoid a MemoryError on an ever-growing buffer
    if (len(self.__insert_buffer[table]) > 2 * self.__query_max_batch_size):
        self.flush_insert_buffer()

    LOGGER.debug(f"Exit insert_dicts for table: {table_name}")
def flush_insert_buffer(self, fallback: bool = False) -> None:
    """Flushes the insert buffer, sending queries to the influxdb server.

    Sends in batches defined by `__query_max_batch_size` (or
    `__fallback_max_batch_size` when in fallback mode) to reduce http
    overhead. Only send-statistics remain in the buffer afterwards; flush
    again to send those too. On a resendable error the whole flush is
    retried exactly once with fallback options before the buffer entry is
    dropped.

    Keyword Arguments:
        fallback {bool} -- Whether to use fallback-options. Does not repeat on fallback (default: {False})

    Raises:
        ValueError: Critical: The query buffer is None.
    """
    if (self.__insert_buffer is None):
        raise ValueError(
            "query buffer is somehow None, this should never happen!")
    # Only send if there is something to send
    if (not self.__insert_buffer):
        return

    # pre-save the keys to avoid "dictionary keys changed during iteration":
    # the recursive fallback call below mutates __insert_buffer
    insert_keys = list(self.__insert_buffer.keys())
    for table in insert_keys:
        # default to empty in case the key isn't valid anymore (due fallback option)
        queries = list(
            map(lambda query: query.to_query(),
                self.__insert_buffer.get(table, [])))
        item_count = len(queries)
        if (item_count == 0):
            continue

        # pick the batch size for this send attempt
        if (not fallback):
            batch_size = self.__query_max_batch_size
        else:
            batch_size = self.__fallback_max_batch_size

        re_send: bool = False
        error_msg: Optional[str] = None
        start_time = time.perf_counter()
        try:
            self.__client.write_points(
                points=queries,
                database=self.database.name,
                retention_policy=table.retention_policy.name,
                batch_size=batch_size,
                time_precision='s',
                protocol='line')
            end_time = time.perf_counter()
        except InfluxDBClientError as error:  # type: ignore
            # NOTE(review): assumes error.content carries influx's
            # "partial write" text — confirm against influxdb-python.
            match = re.match(r".*partial write:[\s\w]+=(\d+).*",
                             error.content)
            if (match and int(match.group(1)) < batch_size):
                # beyond 10.000 everything will be lost, below still written
                # ignore this case, it's unavoidable and doesn't change anything
                pass
            elif (re.match(r".*partial write: unable to parse .*",
                           error.content)):
                # some messages are lost, the others were written
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    f"Some messages were lost when sending buffer for table {table.name}, but everything else should be OK"
                )
                error_msg = getattr(error, 'message', repr(error))
            else:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    f"Client error when sending insert buffer for table {table.name}."
                )
                error_msg = getattr(error, 'message', repr(error))
                # re-try with a smaller batch size, unsure if this helps
                re_send = True
        except (InfluxDBServerError, ConnectionError,
                requests.exceptions.ConnectionError
                ) as error:  # type: ignore
            ExceptionUtils.exception_info(
                error=error,
                extra_message=
                f"Connection error when sending insert buffer for table {table.name}."
            )
            error_msg = getattr(error, 'message', repr(error))
            re_send = True

        # measure timing — reassigned unconditionally, so on the error path
        # the recorded duration includes the exception handling above
        end_time = time.perf_counter()

        # retry the whole flush once with fallback options before dropping
        if (re_send and not fallback):
            ExceptionUtils.error_message(
                "Trying to send influx buffer again with fallback options")
            self.flush_insert_buffer(fallback=True)

        # clear the table which just got sent;
        # default None to avoid a KeyError if the table was popped on fallback
        self.__insert_buffer.pop(table, None)

        # add metrics for the next sending process.
        # compute duration, metrics computed per batch
        self.__insert_metrics_to_buffer(Keyword.INSERT, table,
                                        end_time - start_time, item_count,
                                        error=error_msg)
def check_grant_user(self, username: str, permission: str):
    """Checks and Grants the permissions for a user to match at least the required permission or a higher one.

    Warns if user does not exists. Grants permission if current permissions do not fulfill the requirement.
    This method does not abort if the check or grant was unsuccessfull!

    Args:
        username (str): name of the user to be checked
        permission (str): permissions to be granted: READ, WRITE, ALL

    Raises:
        ValueError: No username provided
        ValueError: no permissions provided
    """
    try:
        LOGGER.debug(
            f"Checking/Granting user {username} for {permission} permissions on db {self.database.name}."
        )
        if (not username):
            raise ValueError(
                "checking/granting a user permissions require an username")
        if (not permission):
            raise ValueError(
                "checking/granting a user permissions require a defined set of permissions"
            )

        # Get all users to check for the required user
        user_list: List[Dict[str, Union[
            str, bool]]] = self.__client.get_list_users()
        LOGGER.debug(f"Returned list of users: {user_list}")

        # get the wanted user if it exists. Default value to not throw an error.
        user_dict = next(
            filter(lambda user_dict: user_dict['user'] == username,
                   user_list), None)
        LOGGER.debug(f"Found user: {user_dict}")

        # SPPMon should not create a user since then a default password will be used
        # It is very unlikely that this one is getting changed and therefore a risk of leaking data.
        if (not user_dict):
            ExceptionUtils.error_message(
                f"The user '{username}' does not exist. Please create it according to the documentation."
            )
            return  # not abort SPPMon, only minor error

        # admins implicitly hold every permission
        if (user_dict['admin']):
            LOGGER.debug(f"{username} is already admin. Finished check")
            return

        # get privileges of user to check if he already has matching permissions
        db_privileges: List[Dict[
            str, str]] = self.__client.get_list_privileges(username)
        LOGGER.debug(db_privileges)

        # check for existing privileges on this database
        db_entry = next(
            filter(
                lambda entry_dict: entry_dict['database'] == self.database.
                name, db_privileges), None)

        # there must be permissions of either wanted permission or higher (all)
        if (db_entry and (db_entry['privilege'] == permission
                          or db_entry['privilege'] == "ALL")):
            LOGGER.debug(
                f"{username} has already correct permissions. Finished check"
            )
            return

        # else give permissions
        LOGGER.info(
            f"Permissions missing for user {username}, granting {permission} permissions."
        )
        self.__client.grant_privilege(permission, self.database.name,
                                      username)
        LOGGER.debug(f"Granted permissions to {username}")

    except (ValueError, InfluxDBClientError, InfluxDBServerError,
            requests.exceptions.ConnectionError) as error:  # type: ignore
        ExceptionUtils.exception_info(
            error=error,
            # BUGFIX: the 'f' prefix was missing, so '{username}' etc. were
            # logged verbatim instead of being interpolated.
            extra_message=
            f"User check failed for user {username} with permissions {permission} on db {self.database.name}"
        )  # type: ignore
def copy_database(self, new_database_name: str) -> None:
    """Copies all data of the current database into `new_database_name`.

    Also sorts data from the `autogen` retention policy into the correct
    retention policies of the new database. Data older than a table's
    retention-policy duration is dropped. Continuous-query results are
    re-created by rewriting each CQ's SELECT INTO statement against the
    new database name.

    Arguments:
        new_database_name {str} -- name of the database to copy into

    Raises:
        ValueError: no new database name given, or a continuous query could
            not be parsed, or a transfer dropped the 10.000-point limit.
    """
    if (not new_database_name):
        raise ValueError(
            "copy_database requires a new database name to copy to.")

    # Programm information
    LOGGER.info(
        f"Copy Database: transfering the data from database {self.database.name} into {new_database_name}."
    )
    LOGGER.info(
        "> Info: This also includes all data from `autogen` retention policy, sorted into the correct retention policies."
    )

    # create db, nothing happens if it already exists
    LOGGER.info("> Creating the new database if it didn't already exist")
    self.setup_db(new_database_name)

    # check for exisiting retention policies and continuous queries in the influxdb
    LOGGER.info(
        ">> Checking and creating retention policies for the new database. Ignoring continuous queries."
    )
    self.check_create_rp(new_database_name)
    # self.check_create_cq() # Note: Not possible due full qualified statements. this would also not truly conserve the data

    LOGGER.info("> Computing queries to be send to the server.")
    queries: List[str] = []
    # copies all tables into their respective duplicate, data over RP-time will be dropped.
    for table in self.database.tables.values():
        # copy data currently sitting in `autogen` into the correct RP
        autogen_query_str = f"SELECT * INTO {new_database_name}.{table.retention_policy.name}.{table.name} FROM {table.database.name}.autogen.{table.name} WHERE time > now() - {table.retention_policy.duration} GROUP BY *"
        queries.append(autogen_query_str)

        # copy data already stored under its own RP
        # NOTE(review): relies on str(table) rendering a fully-qualified
        # "db.rp.name" path — confirm against the table's __str__.
        rp_query_str = f"SELECT * INTO {new_database_name}.{table.retention_policy.name}.{table.name} FROM {table} WHERE time > now() - {table.retention_policy.duration} GROUP BY *"
        queries.append(rp_query_str)

    # Compute data with a timestamp over the initial RP-duration into other RP's.
    for con_query in self.database.continuous_queries:
        cq_query_str: str = con_query.to_query()
        # replacing the rp inside of the toString representation
        # this is easier than individual matching/code replacement
        # Not every database name should be replaced
        match = re.search(
            r"BEGIN(.*(INTO\s+(.+)\..+\..+)\s+(FROM\s+\w+\.(\w+)\.\w+)(?:\s+WHERE\s+(.+))?\s+GROUP BY.*)END",
            cq_query_str)
        if (not match):
            raise ValueError(
                f">> error when matching continous query {cq_query_str}. Aborting."
            )
        full_match = match.group(1)          # inner statement between BEGIN/END
        into_clause = match.group(2)         # "INTO db.rp.table"
        old_database_str = match.group(3)    # database name inside INTO
        from_clause = match.group(4)         # "FROM db.rp.table"
        from_rp = match.group(5)             # RP name inside FROM
        where_clause = match.group(6)        # optional WHERE condition

        # Add timelimit in where clause to prevent massive truncation due the rentention-policy time limit
        new_full_match = full_match
        if (not con_query.select_query
                or con_query.select_query.into_table is None):
            ExceptionUtils.error_message(
                f">> Into table of continous query is none. Adjust query manually! {full_match}"
            )
        elif (con_query.select_query.into_table.retention_policy.duration
              != '0s'):
            # Caution: if truncation of a query is above 10.000 it won't be saved!
            clause = f"time > now() - {con_query.select_query.into_table.retention_policy.duration}"
            if (where_clause):
                new_full_match = new_full_match.replace(
                    where_clause, where_clause + " AND " + clause)
            else:
                new_full_match = new_full_match.replace(
                    from_clause, from_clause + " WHERE " + clause)

        # replace old dbname with new one
        new_into_clause = into_clause.replace(old_database_str,
                                              new_database_name)
        new_full_match = new_full_match.replace(into_clause, new_into_clause)

        # case 1: keep retention policy
        queries.append(new_full_match)

        # case 2: autogen as from RP
        new_from_clause = from_clause.replace(from_rp, "autogen")
        auto_gen_match = new_full_match.replace(from_clause, new_from_clause)
        queries.append(auto_gen_match)

    LOGGER.info("> Finished Computing, starting to send.")

    # how many lines were transfered
    line_count: int = 0
    # how often was a query partially written, not line count!
    dropped_count: int = 0
    # how often was data dropped above the 10.000 limit?
    critical_drop: int = 0

    # print statistics
    # send time since last print
    send_time_collection: float = 0
    # line count since last print
    line_collection: int = 0

    # disable timeout: SELECT INTO over large data can easily exceed the
    # default request timeout, so swap in a client with a 2h timeout
    old_timeout = self.__client._timeout
    self.__client = InfluxDBClient(  # type: ignore
        host=self.__address,
        port=self.__port,
        username=self.__user,
        password=self.__password,
        ssl=self.__use_ssl,
        verify_ssl=self.__verify_ssl,
        timeout=7200)
    # ping to make sure connection works
    version: str = self.__client.ping()
    LOGGER.info(
        f">> Connected to influxdb with new timeout of {self.__client._timeout}, version: {version}"
    )
    LOGGER.info(">> Starting transfer of data")
    i = 0
    for query in queries:
        try:
            start_time = time.perf_counter()
            # seems like you may only send one SELECT INTO at once via python
            result = self.__client.query(  # type: ignore
                query=query, epoch='s', database=self.database.name)
            end_time = time.perf_counter()

            # count lines written, max 1
            # NOTE(review): the loop variable shadows the outer `result`
            # (ResultSet) — harmless here since it is not reused afterwards.
            for result in result.get_points():
                i += 1
                line_count += result["written"]

                # print statistics
                send_time_collection += end_time - start_time
                line_collection += result["written"]

                # Print only all 10 queries or if the collected send time is too high
                if (i % 10 == 0 or send_time_collection >= 2):
                    LOGGER.info(
                        f'query {i}/{len(queries)}: {line_collection} new lines in {send_time_collection}s.'
                    )
                    line_collection = 0
                    send_time_collection = 0

        except InfluxDBClientError as error:
            # only raise if the error is unexpected
            # dropped=10000 means data beyond the report limit was lost:
            # abort so the user can retry with a shorter WHERE-clause
            if (re.search(
                    f"partial write: points beyond retention policy dropped=10000",
                    error.content)):
                critical_drop += 1
                raise ValueError(
                    ">> transfer of data failed, retry manually with a shorter WHERE-clause",
                    query)
            if (re.search(
                    f"partial write: points beyond retention policy dropped=",
                    error.content)):
                # expected: points older than the RP duration were dropped
                dropped_count += 1
            else:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    f">> transfer of data failed for query {query}")
                critical_drop += 1

        except (InfluxDBServerError,
                requests.exceptions.ConnectionError) as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message=f">> transfer of data failed for query {query}")
            critical_drop += 1

    # reset timeout
    self.__client = InfluxDBClient(  # type: ignore
        host=self.__address,
        port=self.__port,
        username=self.__user,
        password=self.__password,
        ssl=self.__use_ssl,
        verify_ssl=self.__verify_ssl,
        timeout=old_timeout)
    # ping to make sure connection works
    version: str = self.__client.ping()
    LOGGER.info(
        f">> Changed timeout of influxDB to old timeout of {self.__client._timeout}, version: {version}"
    )

    LOGGER.info(f"> Total transfered {line_count} lines of results.")
    if (dropped_count):
        LOGGER.info(
            f"> WARNING: Could not count lines of {dropped_count} queries due an expected error. No need for manual action."
        )
    if (critical_drop):
        msg: str = (
            f"ERROR: Could not transfer data of {critical_drop} tables, check messages above to retry manually!\n"
            +
            "Please send the query manually with a adjusted 'from table': '$database.autogen.tablename'\n "
            +
            "Adjust other values as required. Drop due Retention Policy is 'OK' until 10.000.\n"
            +
            "If the drop count reaches 10.000 you need to cut the query into smaller bits."
        )
        ExceptionUtils.error_message(msg)
    elif (line_count == 0):
        ExceptionUtils.error_message(
            "ERROR: No data was transferred, make sure your database name is correct and the db is not empty."
        )
    else:
        LOGGER.info("Database copied sucessfully")
def check_create_cq(self) -> None:
    """Checks if any continuous query needs to be altered or added

    Raises:
        ValueError: Check failed due Database error
    """
    try:
        # returns a list of dictonarys with db name as key
        # inside the dicts there is a list of each cq
        # the cqs are displayed as a 2 elem dict: 'name' and 'query'
        results: List[Dict[str, List[Dict[
            str, str]]]] = self.__client.get_list_continuous_queries()

        # get the cq's of the correct db
        # list of 2-elem cqs: 'name' and 'query'
        cq_result_list: List[Dict[str, str]] = next(
            (
                cq.get(self.database.name, []) for cq in results
                # only if matches the db name
                if cq.get(self.database.name, False)),
            [])

        # save all results into a dict for quicker accessing afterwards
        cq_result_dict: Dict[str, str] = {}
        for cq_result in cq_result_list:
            cq_result_dict[cq_result['name']] = cq_result['query']

        # queries which need to be added
        add_cq_list: List[ContinuousQuery] = []
        # queries to be deleted (no alter possible): save name only
        drop_cq_list: List[str] = []

        # check for each cq if it needs to be 1. dropped and 2. added
        for continuous_query in self.database.continuous_queries:
            result_cq = cq_result_dict.get(continuous_query.name, None)
            if (result_cq is None):
                # not present on the server yet -> just add
                add_cq_list.append(continuous_query)
            elif (result_cq != continuous_query.to_query()):
                # present but outdated -> drop and re-add
                LOGGER.debug(f"result_cq: {result_cq}")
                LOGGER.debug(f"desired_cq: {continuous_query.to_query()}")
                # delete result cq and then add it new
                # save name only
                drop_cq_list.append(continuous_query.name)
                add_cq_list.append(continuous_query)
            # else: all good

        LOGGER.debug(f"deleting {len(drop_cq_list)} CQ's: {drop_cq_list}")
        # alter not possible -> drop and readd
        for query_name in drop_cq_list:
            self.__client.drop_continuous_query(  # type: ignore
                name=query_name, database=self.database.name)

        # adding new / altered CQ's
        LOGGER.debug(
            f"adding {len(add_cq_list)} CQ's. adding {add_cq_list}")
        for continuous_query in add_cq_list:
            self.__client.create_continuous_query(  # type: ignore
                name=continuous_query.name,
                select=continuous_query.select,
                database=continuous_query.database.name,
                resample_opts=continuous_query.resample_opts)

    except (ValueError, InfluxDBClientError, InfluxDBServerError,
            requests.exceptions.ConnectionError) as error:  # type: ignore
        ExceptionUtils.exception_info(error=error)  # type: ignore
        raise ValueError("Continuous Query check failed")
def __init__(self, influx_client: InfluxClient, config_file: Dict[str, Any], verbose: bool = False):
    """Sets up the ssh clients from the config file and declares all ssh commands per client type.

    Arguments:
        influx_client {InfluxClient} -- client used to store the parsed results
        config_file {Dict[str, Any]} -- opened config file with a `sshclients` section

    Keyword Arguments:
        verbose {bool} -- whether to print extra information (default: {False})

    Raises:
        ValueError: missing config file or influx client, or no ssh clients
            could be created from the config.
    """
    if(not config_file):
        raise ValueError("Require config file to setup ssh clients")
    if(not influx_client):
        raise ValueError("need InfluxClient to send data to DB")

    self.__influx_client = influx_client
    self.__verbose = verbose
    try:
        self.__ssh_clients = self.setup_ssh_clients(config_file)
    except ValueError as error:
        ExceptionUtils.exception_info(error)
        raise ValueError("No ssh-clients are present or error when reading config file. Skipping SSH-Methods creation")

    # ################################################################################################
    # ################################### SSH COMMAND LIST GROUPS ####################################
    # ################################################################################################
    # Add all required commands ONLY here. Format:
    # Always a list of `SshCommand`. Create a instance for each command needed.
    # group by type, while all will be executed for any type.
    # if you add new types also add them in the `SshTypes`-enum.
    # you can use the table name multiple times, just make sure you also define a according table in
    # `database_tables.py`.
    # After declaring here you may execute the command like the others below.

    # those commands are going to be executed on ANY client.
    self.__all_command_list = [
        SshCommand(
            command="mpstat",
            parse_function=SshMethods._parse_mpstat_cmd,
            table_name="ssh_mpstat_cmd"
        ),
        SshCommand(
            command="free",
            parse_function=SshMethods._parse_free_cmd,
            table_name="ssh_free_cmd"
        )
    ]

    # Those commands are only executed on the associated (key) client type
    self.__client_commands: Dict[SshTypes, List[SshCommand]] = {
        # SERVER
        SshTypes.SERVER: [
            # added later due function, check below
            SshCommand(
                command='df -h / --block-size=G',
                parse_function=SshMethods._parse_df_cmd,
                table_name="df_ssh"
            ),
            SshCommand(
                command='df -h /opt/IBM/SPP --block-size=G',
                parse_function=SshMethods._parse_df_cmd,
                table_name="df_ssh"
            ),
            ## df -h /
            ## df -h /opt/IBM/SPP
        ],
        # VSnap
        SshTypes.VSNAP:[
            # guard: only query pools/stats when the vsnap service is running
            SshCommand(
                command='systemctl status vsnap-api.service > /dev/null && sudo vsnap --json pool show',
                parse_function=SshMethods._parse_pool_show_cmd,
                table_name="vsnap_pools"
            ),
            SshCommand(
                command='systemctl status vsnap-api.service > /dev/null && sudo vsnap --json system stats',
                parse_function=SshMethods._parse_system_stats_cmd,
                table_name="vsnap_system_stats"
            ),
            SshCommand(
                command='df -h / --block-size=G',
                parse_function=SshMethods._parse_df_cmd,
                table_name="df_ssh"
            ),
            ## zpool list
            ## df -h /
        ],
        # VADP
        SshTypes.VADP: [
            # nothing yet
        ],
        # CLOUDPROXY
        SshTypes.CLOUDPROXY: [
            # nothing yet
        ],
        # OTHER
        SshTypes.OTHER: [
            SshCommand(
                command="df -h --block-size=G",
                parse_function=SshMethods._parse_df_cmd,
                table_name="df_ssh"
            )
        ]
    }

    # ################ MULTI COMMAND ADD ##########################

    # SERVER
    # add server later due multiple processes
    self.__process_grep_list = ["mongod", "beam.smp", "java"] # be aware this is double declared below

    # per-process resource statistics via `ps` for each monitored process
    for grep_name in self.__process_grep_list:
        self.__client_commands[SshTypes.SERVER].append(
            SshCommand(
                # NOTE(review): the trailing "S -ww" looks misplaced (ps flags
                # usually precede the selection) — confirm against a server shell.
                command=f"ps -o \"%cpu,%mem,comm,rss,vsz,user,pid,etimes\" -p $(pgrep -d',' -f {grep_name}) S -ww",
                parse_function=self._parse_ps_cmd,
                table_name="processStats"
            )
        )

    # Top commands for CPU Only
    for grep_name in self.__process_grep_list:
        self.__client_commands[SshTypes.SERVER].append(
            SshCommand(
                command=f"top -bs -w 512 -n1 -p $(pgrep -d',' -f {grep_name})",
                parse_function=self._parse_top_cmd,
                table_name="processStats"
            )
        )
def execute_commands(self, commands: List[SshCommand], verbose: bool = False) -> List[SshCommand]:
    """Runs every given command on this ssh client and collects the results.

    Connects before the first command and disconnects after the last one.
    Skipped commands produce no entry; failed commands are recorded with an
    empty result. Returns a new list of command instances.

    Arguments:
        commands {List[SshCommand]} -- List of commands to be executed

    Keyword Arguments:
        verbose {bool} -- whether to print the result (default: {False})

    Raises:
        ValueError: No list of commands given.
    """
    if (not commands or not isinstance(commands, list)):
        raise ValueError("Need list of commands to execute")

    # always log on debug level, additionally on info level when verbose
    connect_msg = f"> connecting to {self.client_type.name} client on host {self.host_name}"
    LOGGER.debug(connect_msg)
    if (verbose):
        LOGGER.info(connect_msg)

    self.connect()

    connected_msg = "> connection successfull"
    LOGGER.debug(connected_msg)
    if (verbose):
        LOGGER.info(connected_msg)

    executed_commands: List[SshCommand] = []
    for ssh_command in commands:
        # some commands are not applicable for this host -> skip silently
        if (self.__skip_cmd(ssh_command)):
            LOGGER.info(
                f"Skipped command {ssh_command.cmd} on host {self.host_name}"
            )
            continue

        try:
            LOGGER.debug(
                f"Executing command {ssh_command.cmd} on host {self.host_name}"
            )
            result = self.__send_command(ssh_command.cmd)
            # attach the captured output to a fresh command instance
            finished_command = ssh_command.save_result(result, self.host_name)
            LOGGER.debug(f"Command result: {result}")
        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message=
                f"failed to execute command on host: {self.host_name}, skipping it: {ssh_command.cmd}"
            )
            # record the failure explicitly with an empty result
            finished_command = ssh_command.save_result(result=None,
                                                       host_name=self.host_name)

        executed_commands.append(finished_command)

    self.disconnect()
    return executed_commands
def test_connection(influx_client: InfluxClient, rest_client: Optional[RestClient], config_file: Dict[str, Any]):
    """Tests all connections required for SPPMon: InfluxDB, REST-API and all SSH clients.

    Logs a conclusion at the end: fully working, working with warnings,
    or not ready for use.

    Arguments:
        influx_client {InfluxClient} -- influx client to be tested
        rest_client {Optional[RestClient]} -- rest client to be tested, may be None
        config_file {Dict[str, Any]} -- opened config file, used for ssh setup

    Raises:
        ValueError: no config file given
    """
    if (not config_file):
        raise ValueError("SPPmon does not work without a config file")

    LOGGER.info("Testing all connections required for SPPMon to work")
    working: bool = True  # SPPMon itself will finish sucessfull (no critical errors)
    no_warnings: bool = True  # SPPMon will finish without any warnings (no errors at all)

    # ## InfluxDB ##

    LOGGER.info("> Testing and configuring InfluxDB")
    try:
        influx_client.connect()
        influx_client.disconnect()
        if (not influx_client.use_ssl):
            ExceptionUtils.error_message(
                "> WARNING: Mandatory SSL is disabled. We hightly recommend to enable it!"
            )
            no_warnings = False

        LOGGER.info("InfluxDB is ready for use")
    except ValueError as error:
        ExceptionUtils.exception_info(
            error,
            extra_message=
            "> Testing of the InfluxDB failed. This is a crictial component of SPPMon."
        )
        working = False

    # ## REST-API ##

    LOGGER.info("> Testing REST-API of SPP.")
    try:
        if (not rest_client):
            # BUGFIX: message previously read "Rest-client is setup." although
            # this branch is taken when the rest client is missing.
            raise ValueError(
                "Rest-client is not setup. Unavailable to test it.")

        rest_client.login()
        (version_nr, build_nr) = rest_client.get_spp_version_build()
        LOGGER.info(
            f">> Sucessfully connected to SPP V{version_nr}, build {build_nr}."
        )
        rest_client.logout()
        LOGGER.info("> REST-API is ready for use")
    except ValueError as error:
        ExceptionUtils.exception_info(
            error,
            extra_message=
            "> Testing of the REST-API failed. This is a crictial component of SPPMon."
        )
        working = False

    # ## SSH-CLIENTS ##

    LOGGER.info(
        "> Testing all types of SSH-Clients: Server, VAPDs, vSnaps, Cloudproxy and others"
    )
    ssh_working = True  # The arg --ssh will finish without any error at all

    # Count of clients checks
    ssh_clients: List[SshClient] = SshMethods.setup_ssh_clients(config_file)
    if (not ssh_clients):
        ExceptionUtils.error_message(
            ">> No SSH-clients detected at all. At least the server itself should be added for process-statistics."
        )
        ssh_working = False
    else:
        # make sure every client type is represented at least once
        for type in SshTypes:
            if (not list(
                    filter(lambda client: client.client_type == type,
                           ssh_clients))):
                LOGGER.info(f">> No {type.name} client detected.")

                if (type == SshTypes.SERVER):
                    ExceptionUtils.error_message(
                        ">> Critical: Without Server as ssh client you wont have any process statistics available. These are a key part of SPPMon."
                    )
                    ssh_working = False  # No error, but still critical

                if (type == SshTypes.VSNAP):
                    LOGGER.info(
                        ">> WARNING: Without vSnap as ssh client you have no access to storage information. You may add vSnap's for additional monitoring and alerts."
                    )
                    no_warnings = False  # ssh will still work, but thats definitly a warning

        ssh_methods: SshMethods = SshMethods(influx_client, config_file, False)
        # Connection check
        LOGGER.info(
            f">> Testing now connection and commands of {len(ssh_clients)} registered ssh-clients."
        )
        for client in ssh_clients:
            try:
                client.connect()
                client.disconnect()

                # compare the error count before/after to detect failed commands
                error_count: int = len(ExceptionUtils.stored_errors)
                MethodUtils.ssh_execute_commands(
                    ssh_clients=[client],
                    ssh_type=client.client_type,
                    command_list=ssh_methods.client_commands[
                        client.client_type] + ssh_methods.all_command_list)
                if (len(ExceptionUtils.stored_errors) != error_count):
                    ssh_working = False
                    ExceptionUtils.error_message(
                        f"Not all commands available for client {client.host_name} with type: {client.client_type}.\n"
                        +
                        "Please check manually if the commands are installed and their output."
                    )

            except ValueError as error:
                ExceptionUtils.exception_info(
                    error,
                    extra_message=
                    f"Connection failed for client {client.host_name} with type: {client.client_type}."
                )
                ssh_working = False

    if (ssh_working):
        LOGGER.info("> Testing of SSH-clients sucessfull.")
    else:
        LOGGER.info(
            "> Testing of SSH-clients failed! SPPMon will still work, not all informations are available."
        )
        no_warnings = False

    # #### Conclusion ####

    if (working and no_warnings):
        LOGGER.info(
            "> All components tested sucessfully. SPPMon is ready to be used!"
        )
    elif (working):
        LOGGER.info(
            "> Testing partially sucessful. SPPMon will run, but please check the warnings."
        )
    else:
        LOGGER.info(
            "> Testing failed. SPPMon is not ready to be used. Please fix the connection issues."
        )
def set_optional_configs(self, config_file: Dict[str, Any]) -> None:
    """Sets up any optional infrastructure, to be called within the init.

    Be aware not everything may be initalized on call time.
    Add config here if the system should not abort if it is missing.
    Failed components are left as None so later code can detect them.

    Arguments:
        config_file {Dict[str, Any]} -- Opened Config file
    """

    if (not config_file):
        ExceptionUtils.error_message(
            "missing or empty config file, aborting.")
        self.exit(error_code=ERROR_CODE_CMD_LINE)

    # ############################ REST-API #####################################
    try:
        auth_rest = SppUtils.get_cfg_params(param_dict=config_file,
                                            param_name="sppServer")
        if (not isinstance(auth_rest, dict)):
            raise ValueError("sppServer config need to be dict")
        # NOTE(review): "jobLog_rentation" is likely a typo of "retention",
        # but it is the key actually read from the config — do not rename
        # without migrating existing config files.
        self.job_log_retention_time = auth_rest.get(
            "jobLog_rentation", "60d")

        ConnectionUtils.verbose = OPTIONS.verbose
        # ### Loaded Systems part 1/2 ###
        if (OPTIONS.minimumLogs or OPTIONS.loadedSystem):
            # Setting pagesize scaling settings
            ConnectionUtils.timeout_reduction = self.loaded_timeout_reduction
            ConnectionUtils.allowed_send_delta = self.loaded_allowed_send_delta
            ConnectionUtils.max_scaling_factor = self.loaded_max_scaling_factor

            # Setting RestClient request settings.
            self.rest_client = RestClient(
                auth_rest=auth_rest,
                pref_send_time=self.loaded_pref_send_time,
                request_timeout=self.loaded_request_timeout,
                send_retries=self.loaded_send_retries,
                starting_page_size=self.loaded_starting_page_size,
                min_page_size=self.loaded_min_page_size,
                verbose=OPTIONS.verbose)
        else:
            ConnectionUtils.timeout_reduction = self.timeout_reduction
            ConnectionUtils.allowed_send_delta = self.allowed_send_delta
            ConnectionUtils.max_scaling_factor = self.max_scaling_factor

            # Setting RestClient request settings.
            self.rest_client = RestClient(
                auth_rest=auth_rest,
                pref_send_time=self.pref_send_time,
                request_timeout=self.request_timeout,
                send_retries=self.send_retries,
                starting_page_size=self.starting_page_size,
                min_page_size=self.min_page_size,
                verbose=OPTIONS.verbose)

        self.api_queries = ApiQueries(self.rest_client)
        self.rest_client.login()
    except ValueError as error:
        ExceptionUtils.exception_info(
            error=error,
            extra_message="REST-API is not available due Config error")
        # mark both as unavailable so dependent methods can detect it
        self.rest_client = None
        self.api_queries = None

    # ######################## System, Job and Hypervisor Methods ##################
    try:
        # explicit ahead due dependency
        self.system_methods = SystemMethods(self.influx_client,
                                            self.api_queries,
                                            OPTIONS.verbose)
    except ValueError as error:
        ExceptionUtils.exception_info(error=error)

    # ### Loaded Systems part 2/2 ###
    if (OPTIONS.minimumLogs or OPTIONS.loadedSystem):
        given_log_types = self.loaded_joblog_types
    else:
        given_log_types = self.joblog_types

    try:
        self.job_methods = JobMethods(self.influx_client, self.api_queries,
                                      self.job_log_retention_time,
                                      given_log_types, OPTIONS.verbose)
    except ValueError as error:
        ExceptionUtils.exception_info(error=error)

    try:
        # dependen on system methods
        self.hypervisor_methods = ProtectionMethods(
            self.system_methods, self.influx_client, self.api_queries,
            OPTIONS.verbose)
    except ValueError as error:
        ExceptionUtils.exception_info(error=error)

    # ############################### SSH #####################################
    if (self.ssh or self.process_stats):
        try:
            auth_ssh = SppUtils.get_cfg_params(param_dict=config_file,
                                               param_name="sshclients")

            ssh_clients: List[SshClient] = []
            if (not isinstance(auth_ssh, list)):
                raise ValueError("not a list of sshconfig given", auth_ssh)

            # a single broken client config is skipped, not fatal
            for client_ssh in auth_ssh:
                try:
                    ssh_clients.append(SshClient(client_ssh))
                except ValueError as error:
                    ExceptionUtils.exception_info(
                        error=error,
                        extra_message=
                        f"Setting up one client failed, skipping it. Client: \
                        {client_ssh.get('name', 'ERROR WHEN GETTING NAME')}"
                    )

            # set from None to methods once finished
            self.ssh_methods = SshMethods(influx_client=self.influx_client,
                                          ssh_clients=ssh_clients,
                                          verbose=OPTIONS.verbose)

        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message=
                "SSH-Commands are not available due Config error")
def create_inventory_summary(self) -> None:
    """Retrieves and calculate VM inventory summary by influx catalog data.

    Skips the computation if statistics for the newest VM timestamp already
    exist in the `vmStats` table.

    Raises:
        ValueError: no VMs are stored yet, or the statistics computation failed.
    """

    LOGGER.info(
        "> computing inventory information (not from catalog, means not only backup data is calculated)")
    # ########## Part 1: Check if something need to be computed #############
    # query the timestamp of the last vm, commited as a field is always needed by influx rules.
    # NOTE: 'commited'/'uncommited' are the field names as stored in influx
    # (misspelled on purpose to match the schema) — do not "fix" them here.
    vms_table = self.__influx_client.database["vms"]

    time_query = SelectionQuery(
        keyword=Keyword.SELECT,
        tables=[vms_table],
        fields=['time', 'commited'],
        limit=1,
        order_direction="DESC"
    )
    result = self.__influx_client.send_selection_query(time_query)  # type: ignore
    last_vm: Dict[str, Any] = next(result.get_points(), None)  # type: ignore

    if(not last_vm):
        raise ValueError("no VM's stored, either none are available or you have to store vm's first")

    # query the last vm stats to compare timestamps with last vm
    last_time_ms: int = last_vm["time"]
    last_time = SppUtils.to_epoch_secs(last_time_ms)
    where_str = "time = {}s".format(last_time)

    vm_stats_table = self.__influx_client.database["vmStats"]

    vm_stats_query = SelectionQuery(
        keyword=Keyword.SELECT,
        tables=[vm_stats_table],
        fields=['*'],
        where_str=where_str,
        limit=1
    )
    result = self.__influx_client.send_selection_query(vm_stats_query)  # type: ignore

    # stats for this timestamp already exist -> nothing to do
    if(len(list(result.get_points())) > 0):  # type: ignore
        LOGGER.info(">> vm statistics already computed, skipping")
        return

    # ####################### Part 2: Compute new Data ####################
    fields = [
        'uptime',
        'powerState',
        'commited',
        'uncommited',
        'memory',
        'host',
        'vmVersion',
        'isProtected',
        'inHLO',
        'isEncrypted',
        'datacenterName',
        'hypervisorType',
    ]
    query = SelectionQuery(
        keyword=Keyword.SELECT,
        tables=[vms_table],
        fields=fields,
        where_str=where_str
    )
    result = self.__influx_client.send_selection_query(query)  # type: ignore

    all_vms_list: List[Dict[str, Union[str, int, float, bool]]] = list(result.get_points())  # type: ignore

    # skip if no new data can be computed
    if(not all_vms_list):
        raise ValueError("no VM's stored, either none are available or store vms first")

    vm_stats: Dict[str, Any] = {}
    try:
        vm_stats['vmCount'] = len(all_vms_list)

        # returns largest/smallest
        vm_stats['vmMaxSize'] = max(all_vms_list, key=(lambda mydict: mydict['commited']))['commited']
        # on purpose zero size vm's are ignored
        vms_no_null_size = list(filter(lambda mydict: mydict['commited'] > 0, all_vms_list))
        if(vms_no_null_size):
            vm_stats['vmMinSize'] = min(vms_no_null_size, key=(lambda mydict: mydict['commited']))['commited']
        vm_stats['vmSizeTotal'] = sum(mydict['commited'] for mydict in all_vms_list)
        vm_stats['vmAvgSize'] = vm_stats['vmSizeTotal'] / vm_stats['vmCount']

        # returns largest/smallest
        vm_stats['vmMaxUptime'] = max(all_vms_list, key=(lambda mydict: mydict['uptime']))['uptime']
        # on purpose zero size vm's are ignored
        vms_no_null_time = list(filter(lambda mydict: mydict['uptime'] > 0, all_vms_list))
        if(vms_no_null_time):
            vm_stats['vmMinUptime'] = min(vms_no_null_time, key=(lambda mydict: mydict['uptime']))['uptime']
        vm_stats['vmUptimeTotal'] = sum(mydict['uptime'] for mydict in all_vms_list)
        vm_stats['vmAvgUptime'] = vm_stats['vmUptimeTotal'] / vm_stats['vmCount']

        # booleans are stored as the strings "True"/"False" in influx,
        # therefore the string comparison below
        vm_stats['vmCountProtected'] = len(list(filter(lambda mydict: mydict['isProtected'] == "True", all_vms_list)))
        vm_stats['vmCountUnprotected'] = vm_stats['vmCount'] - vm_stats['vmCountProtected']
        vm_stats['vmCountEncrypted'] = len(list(filter(lambda mydict: mydict['isEncrypted'] == "True", all_vms_list)))
        vm_stats['vmCountPlain'] = vm_stats['vmCount'] - vm_stats['vmCountEncrypted']
        vm_stats['vmCountHLO'] = len(list(filter(lambda mydict: mydict['inHLO'] == "True", all_vms_list)))
        vm_stats['vmCountNotHLO'] = vm_stats['vmCount'] - vm_stats['vmCountHLO']

        vm_stats['vmCountVMware'] = len(list(filter(lambda mydict: mydict['hypervisorType'] == "vmware", all_vms_list)))
        vm_stats['vmCountHyperV'] = len(list(filter(lambda mydict: mydict['hypervisorType'] == "hyperv", all_vms_list)))

        vm_stats['nrDataCenters'] = len(set(map(lambda vm: vm['datacenterName'], all_vms_list)))
        vm_stats['nrHosts'] = len(set(map(lambda vm: vm['host'], all_vms_list)))

        vm_stats['time'] = all_vms_list[0]['time']

        if self.__verbose:
            MethodUtils.my_print([vm_stats])

    except (ZeroDivisionError, AttributeError, KeyError, ValueError) as error:
        ExceptionUtils.exception_info(error=error)
        raise ValueError("error when computing extra vm stats", vm_stats)

    LOGGER.info(">> store vmInventory information in Influx DB")
    self.__influx_client.insert_dicts_to_buffer("vmStats", [vm_stats])
def store_script_metrics(self) -> None:
    """Store sppmon runtime metrics into InfluxDB. To be called right before exit.

    Collects the sppmon version, the SPP server version/build (when a REST
    client is available), the total script runtime, all command-line arguments,
    and the count/text of all recorded errors, then writes them as a single
    record into the `sppmon_metrics` table and flushes the insert buffer.

    Does not raise any exceptions; skips the step (with a logged error) if the
    influx client is missing or the insert fails.
    """
    LOGGER.info("Storing script metrics")
    try:
        if not self.influx_client:
            raise ValueError("no influxClient set up")

        insert_dict: Dict[str, Union[str, int, float, bool]] = {}

        # add version nr; api calls are needed for the SPP server version
        insert_dict["sppmon_version"] = VERSION
        if self.rest_client:
            try:
                (version_nr, build) = self.rest_client.get_spp_version_build()
                insert_dict["spp_version"] = version_nr
                insert_dict["spp_build"] = build
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message="could not query SPP version and build.")

        # end total sppmon runtime, stored in milliseconds
        end_counter = time.perf_counter()
        insert_dict['duration'] = int(
            (end_counter - self.start_counter) * 1000)

        # add arguments of sppmon
        for (key, value) in vars(OPTIONS).items():
            insert_dict[key] = value

        # save occured errors
        error_count = len(ExceptionUtils.stored_errors)
        if error_count > 0:
            ExceptionUtils.error_message(
                f"total of {error_count} exception/s occured")
        insert_dict['errorCount'] = error_count
        # save list as str
        insert_dict['errorMessages'] = str(ExceptionUtils.stored_errors)

        # get end timestamp
        (time_key, time_val) = SppUtils.get_capture_timestamp_sec()
        insert_dict[time_key] = time_val

        # Snapshot the error count right before storing so any error added
        # while inserting/flushing is detected below.  The previous check
        # (`error_count + 1 < len(...)`) assumed the "total of x" summary
        # message was always appended, which only happens when
        # error_count > 0 — so a storage error after a clean run was missed.
        pre_store_error_count = len(ExceptionUtils.stored_errors)

        # save the metrics
        self.influx_client.insert_dicts_to_buffer(
            table_name="sppmon_metrics", list_with_dicts=[insert_dict])
        self.influx_client.flush_insert_buffer()
        LOGGER.info("Stored script metrics sucessfull")

        if pre_store_error_count < len(ExceptionUtils.stored_errors):
            ExceptionUtils.error_message(
                "A non-critical error occured while storing script metrics. \n\
This error can't be saved into the DB, it's only displayed within the logs.")
    except ValueError as error:
        ExceptionUtils.exception_info(
            error=error,
            extra_message="Error when storing sppmon-metrics, skipping this step. Possible insert-buffer data loss")
def create_dashboard(dashboard_folder_path: str, database_name: str) -> None:
    """Creates from the 14 day dashboard a new dashboard for the individual database.

    Alerts are transferred.  Reads the template JSON from the given folder,
    rewrites its title, uid and datasource fields for `database_name`, and
    writes the result next to the template as
    "SPPMON for IBM Spectrum Protect Plus <database_name>.json".

    Args:
        dashboard_folder_path (str): Path to the folder where the template is located
        database_name (str): name of the database

    Raises:
        ValueError: no path given
        ValueError: no db name given
        ValueError: error when reading or writing files
    """
    if not dashboard_folder_path:
        raise ValueError(
            "a path to the dashboard template is required to create a new dashboard"
        )
    if not database_name:
        raise ValueError(
            "need the name of the database to create the new dashboard")

    real_path = os.path.realpath(dashboard_folder_path)
    tmpl_path = os.path.join(real_path,
                             "SPPMON for IBM Spectrum Protect Plus.json")
    LOGGER.info(f"> trying to open template dashboard on path {tmpl_path}")
    try:
        # `with` guarantees the handle is closed even if read() raises
        # (the old open/read/close sequence leaked the handle on error)
        with open(tmpl_path, "rt") as tmpl_file:
            file_str = tmpl_file.read()
    except Exception as error:
        ExceptionUtils.exception_info(error)
        raise ValueError(
            "Error opening dashboard template. Make sure you've the path to the correct folder (Grafana)."
        ) from error
    LOGGER.info("> Sucessfully opened. Creating new Dashboard")

    # replace name by new one
    name_str = file_str.replace(
        "\"title\": \"SPPMON for IBM Spectrum Protect Plus\"",
        f"\"title\": \"SPPMON for IBM Spectrum Protect Plus {database_name}\""
    )
    # replace uid by new one
    uid_str = re.sub("\"uid\": \".*\"",
                     f"\"uid\": \"14_day_auto_gen_{database_name}\"",
                     name_str)
    # replace all datasource = null by actual datasource
    datasource_str = uid_str.replace(
        "\"datasource\": null",
        f"\"datasource\": \"{database_name}\"",
    )
    LOGGER.info("> finished creating content of dashboard")

    write_path = os.path.join(
        real_path,
        f"SPPMON for IBM Spectrum Protect Plus {database_name}.json")
    LOGGER.info(f"> trying to create dashboard file on path {write_path}")
    try:
        with open(write_path, "wt") as dashboard_file:
            dashboard_file.write(datasource_str)
    except Exception as error:
        ExceptionUtils.exception_info(error)
        raise ValueError("Error creating new dashboard file.") from error
    LOGGER.info("> Sucessfully created new dashboard file.")
def get_vms_per_sla(self) -> List[Dict[str, Any]]:
    """Retrieve the VMware VM count per SLA policy.

    Queries all SLA policies, then asks the hypervisor search endpoint for
    the total number of VMs bound to each policy's storage profile.

    Returns:
        List[Dict[str, Any]]: one record per SLA with name, id, VM count
        and a capture timestamp.
    """
    policies = self.__rest_client.get_objects(
        endpoint="/ngp/slapolicy",
        allow_list=["name", "id"],
        array_name="slapolicies",
        add_time_stamp=False
    )

    result_list: List[Dict[str, Any]] = []
    for policy in policies:
        try:
            sla_name: str = policy["name"]
        except KeyError as error:
            # a policy without a name cannot be queried -> skip it
            ExceptionUtils.exception_info(
                error, extra_message="skipping one sla entry due missing name.")
            continue
        sla_id: Optional[str] = policy.get("id", None)

        ## hotadd: URL-encode the name before using it in the search filter
        sla_name = urllib.parse.quote_plus(sla_name)

        # other filter options: volume, vm, tag, tagcategory
        search_params = {
            "resourceType": "vm",
            "from": "hlo",
            "pageSize": 1,
            "filter": json.dumps([
                {
                    "property": "storageProfileName",
                    "value": sla_name,
                    "op": "="
                }
            ])
        }
        search_body = {
            "name": "*",
            "hypervisorType": "vmware",
        }
        (response_json, _) = self.__rest_client.query_url(
            self.__rest_client.get_url("/api/hypervisor/search"),
            search_params, RequestType.POST, search_body)

        time_key, timestamp = SppUtils.get_capture_timestamp_sec()
        result_list.append({
            "slaName": sla_name,
            "slaId": sla_id,
            "vmCountBySLA": response_json.get("total", None),
            time_key: timestamp,
        })

    return result_list