def __init__(self, table: Structures.Table, fields: Dict[str, Any],
             tags: Dict[str, Any] = None, time_stamp: Union[int, str] = None):
    if(not table):
        raise ValueError("need table to create query")
    if(not fields):
        raise ValueError("need at least one value to create query")
    if(tags is None):
        tags = {}
    if(time_stamp is None):
        time_stamp = SppUtils.get_actual_time_sec()

    # Keyword is always INSERT since this is an insert statement
    self.__keyword = Keyword.INSERT
    self.__table = table
    self.__time_stamp = SppUtils.epoch_time_to_seconds(time_stamp)

    fields = self.format_fields(fields)

    # make sure there are some fields if none were provided
    if(not list(filter(lambda field_tup: field_tup[1] is not None, fields.items()))):

        # need a default field definition to be able to do anything
        if(not table.fields):
            raise ValueError("fields after formatting empty, need at least one value!")

        # only works for strings, any other addition would corrupt the data
        for (key, datatype) in table.fields.items():
            if(datatype is Structures.Datatype.STRING):
                fields[key] = '\"autofilled\"'
                break

        # test again, improvement possible here
        if(not list(filter(lambda field_tup: field_tup[1] is not None, fields.items()))):
            raise ValueError("fields after formatting empty, need at least one value!")

    self.__fields: Dict[str, Union[int, float, str, bool]] = fields
    self.__tags: Dict[str, str] = self.format_tags(tags)
def _parse_df_cmd(ssh_command: SshCommand, ssh_type: SshTypes) -> Tuple[str, List[Dict[str, Any]]]:
    """Parses the result of the `df` command, splitting it into its parts.

    Arguments:
        ssh_command {SshCommand} -- command with saved result
        ssh_type {SshTypes} -- type of the client

    Raises:
        ValueError: no command given or no result saved
        ValueError: no ssh type given

    Returns:
        Tuple[str, List[Dict[str, Any]]] -- Tuple of the table name and an insert list
    """
    if (not ssh_command or not ssh_command.result):
        raise ValueError("no command given or empty result")
    if (not ssh_type):
        raise ValueError("no sshtype given")
    if (not ssh_command.table_name):
        raise ValueError("need table name to insert parsed value")

    result_lines = ssh_command.result.splitlines()
    header = result_lines[0].split()
    # remove "on"
    header.pop()

    values: List[Dict[str, Any]] = list(
        map(lambda row: dict(zip(header, row.split())), result_lines[1:]))  # type: ignore
    for row in values:
        if ("1G-blocks" in row):
            row["Size"] = row.pop("1G-blocks")
            row["Size"] = SppUtils.parse_unit(row['Size'])
        if ("Avail" in row):
            row["Available"] = row.pop("Avail")
            row["Available"] = SppUtils.parse_unit(row['Available'])
        row["Used"] = SppUtils.parse_unit(row['Used'])
        row["Use%"] = row["Use%"][:-1]

        # set default needed fields
        row['hostName'] = ssh_command.host_name
        row['ssh_type'] = ssh_type
        (time_key, time_value) = SppUtils.get_capture_timestamp_sec()
        row[time_key] = time_value

    return (ssh_command.table_name, values)
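# Editor's illustrative sketch (not part of the source): how the header/row zip in
# _parse_df_cmd behaves on hypothetical `df` output. The sample text and values below
# are assumptions for demonstration only.
_sample_df_output = (
    "Filesystem     1G-blocks  Used Avail Use% Mounted on\n"
    "/dev/sda2            200    50   150  25% /"
)
_lines = _sample_df_output.splitlines()
_header = _lines[0].split()
_header.pop()  # drop the trailing "on" so "Mounted" lines up with the mount point column
_rows = [dict(zip(_header, line.split())) for line in _lines[1:]]
# _rows[0] -> {'Filesystem': '/dev/sda2', '1G-blocks': '200', 'Used': '50',
#              'Avail': '150', 'Use%': '25%', 'Mounted': '/'}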
def __job_logs_to_stats(self, list_with_logs: List[Dict[str, Any]]) -> None: """Parses joblogs into their own statisic table, using declared supported ID's To parse more jobLogs define additional entrys in the attribute `supported_ids`. Arguments: list_with_logs {List[Dict[str, Any]]} -- List with all saved joblogs """ # only continue with joblogs we want to save supported_log_iterator = filter(lambda log: log['messageId'] in self.__supported_ids.keys(), list_with_logs) sorted_log_iterator = sorted(supported_log_iterator, key=lambda entry: entry['logTime']) max_sec_timestamp = 0 # required for preventing duplicates for job_log in sorted_log_iterator: message_id = job_log['messageId'] table_func_tuple = self.__supported_ids[message_id] (table_name, row_dict_func) = table_func_tuple if(not table_name): table_name = message_id try: row_dict = row_dict_func(job_log['messageParams']) except KeyError as error: ExceptionUtils.exception_info( error, extra_message="MessageID params wrong defined. Skipping one MessageId") continue row_dict['messageId'] = message_id # Issue 9, In case where all tag values duplicate another record, including the timestamp, Influx will throw the insert # out as a duplicate. In some cases, the changing of epoch timestamps from millisecond to second precision is # cause duplicate timestamps. To avoid this for certain tables, add seconds to the timestamp as needed to # ensure uniqueness. Only use this when some innacuracy of the timestamps is acceptable cur_timestamp = job_log['logTime'] if(table_name == 'vmBackupSummary'): if(cur_timestamp is None): # prevent None ExceptionUtils.error_message(f"Warning: logTime is None, duplicate may be purged. Log: {job_log}") if(isinstance(cur_timestamp, str)): # make sure its int cur_timestamp = int(cur_timestamp) cur_sec_timestamp = SppUtils.to_epoch_secs(cur_timestamp) if(cur_sec_timestamp <= max_sec_timestamp): digits = (int)(cur_timestamp / cur_sec_timestamp) max_sec_timestamp += 1 # increase by 1 second cur_timestamp = max_sec_timestamp * digits else: max_sec_timestamp = cur_sec_timestamp row_dict['time'] = cur_timestamp for(key, item) in row_dict.items(): if(item in ('null', 'null(null)')): row_dict[key] = None self.__influx_client.insert_dicts_to_buffer(table_name, [row_dict])
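# Editor's standalone sketch (an approximation, not the project's code) of the
# duplicate-avoidance arithmetic used above for `vmBackupSummary`: when two logs collapse
# to the same second, the later one is shifted forward by one second while keeping the
# millisecond scale factor. `// 1000` stands in for SppUtils.to_epoch_secs on ms input.
def _dedup_timestamp(cur_timestamp_ms, max_sec_timestamp):
    cur_sec = cur_timestamp_ms // 1000
    if cur_sec <= max_sec_timestamp:
        digits = int(cur_timestamp_ms / cur_sec)   # ~1000 for millisecond input
        max_sec_timestamp += 1                     # move one second forward
        cur_timestamp_ms = max_sec_timestamp * digits
    else:
        max_sec_timestamp = cur_sec
    return cur_timestamp_ms, max_sec_timestamp

# Two logs at 1_600_000_000_500 ms and 1_600_000_000_800 ms both round to the same second;
# the second call yields 1_600_000_001_000 ms instead of a duplicate timestamp.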
def ssh_execute_commands(cls, ssh_clients: List[SshClient], ssh_type: SshTypes,
                         command_list: List[SshCommand]) -> List[Tuple[str, List[Dict[str, Any]]]]:
    """Executes commands via ssh on several hosts.

    The hosts (other, vsnap, vadp) can be defined in the JSON configuration file.
    Commands which shall be executed on vsnap and / or vadp proxies go into the dedicated list of strings.
    'otherCommands' is a list of commands which are executed on hosts which are not of type: vsnap | vadp.
    If any hosts are not reachable, they are skipped.
    """
    if(not command_list):
        LOGGER.debug("No commands specified, aborting command.")
        if(cls.verbose):
            LOGGER.info("No commands specified, aborting command.")
        return []

    client_list = list(filter(lambda client: client.client_type is ssh_type, ssh_clients))
    if(not client_list):
        LOGGER.debug(f"No {ssh_type.name} ssh client present. Aborting command")
        if(cls.verbose):
            LOGGER.info(f"No {ssh_type.name} ssh client present. Aborting command")
        return []

    ssh_cmd_response_list = []
    result_list = []
    for client in client_list:

        if(cls.verbose):
            LOGGER.info(f">> executing {ssh_type.name} command(s) on host {client.host_name}")

        try:
            result_commands = client.execute_commands(
                commands=command_list,
                verbose=cls.verbose
            )

        except ValueError as error:
            ExceptionUtils.exception_info(error=error, extra_message="Error when executing commands, skipping this client")
            continue

        for ssh_command in result_commands:
            insert_dict = {}
            insert_dict["host"] = ssh_command.host_name
            insert_dict["command"] = ssh_command.cmd
            insert_dict["output"] = json.dumps(ssh_command.result)
            insert_dict['ssh_type'] = ssh_type.name
            time_key, time_value = SppUtils.get_capture_timestamp_sec()
            insert_dict[time_key] = time_value

            ssh_cmd_response_list.append(insert_dict)

            try:
                table_result_tuple = ssh_command.parse_result(ssh_type=ssh_type)
                result_list.append(table_result_tuple)
            except ValueError as error:
                ExceptionUtils.exception_info(error=error, extra_message="Error when parsing result, skipping parsing of this result")

    result_list.append(("sshCmdResponse", ssh_cmd_response_list))
    return result_list
def _parse_mpstat_cmd( ssh_command: SshCommand, ssh_type: SshTypes) -> Tuple[str, List[Dict[str, Any]]]: """Parses the result of the `mpstat` command, splitting it into its parts. Arguments: ssh_command {SshCommand} -- command with saved result ssh_type {SshTypes} -- type of the client Raises: ValueError: no command given or no result saved ValueError: no ssh type given Returns: Tuple[str, List[Dict[str, Any]]] -- Tuple of the tablename and a insert list """ if (not ssh_command or not ssh_command.result): raise ValueError("no command given or empty result") if (not ssh_type): raise ValueError("no sshtype given") pattern = re.compile( r"(.*)\s+\((.*)\)\s+(\d{2}\/\d{2}\/\d{4})\s+(\S*)\s+\((\d+)\sCPU\)" ) result_lines = ssh_command.result.splitlines() header = result_lines[2].split() # rename to make possible to identify header[0] = "time" header[1] = "am/pm" values: Dict[str, Any] = dict(zip(header, result_lines[3].split())) # drop, it is easier to use our own time values.pop('time') values.pop('am/pm') # set default needed fields values['hostName'] = ssh_command.host_name values['ssh_type'] = ssh_type.name (time_key, time_value) = SppUtils.get_capture_timestamp_sec() values[time_key] = time_value # zip between the exec information and the names for the matching group match = re.match(pattern, result_lines[0]) if (not match): raise ValueError( "the mpstat values are not in the expected pattern", result_lines, ssh_command, ssh_type) for (key, value) in zip( ["name", "host", "date", "system_type", "cpu_count"], match.groups()): values[key] = value # replace it with capture time values.pop('date') return (ssh_command.table_name, [values])
def set_critial_configs(self, config_file: Dict[str, Any]) -> None:
    """Sets up any critical infrastructure, to be called within the init.

    Be aware not everything may be initialized at call time.
    Add config here if the system should abort when it is missing.

    Arguments:
        config_file {Dict[str, Any]} -- Opened config file
    """
    if (not config_file):
        ExceptionUtils.error_message(
            "missing or empty config file, aborting")
        self.exit(error_code=ERROR_CODE_CMD_LINE)
    try:
        # critical components only
        auth_influx = SppUtils.get_cfg_params(param_dict=config_file,
                                              param_name="influxDB")
        if (not isinstance(auth_influx, dict)):
            raise ValueError("influx config needs to be a dict")
        self.influx_client = InfluxClient(auth_influx=auth_influx)
        self.influx_client.connect()

    except ValueError as err:
        ExceptionUtils.exception_info(
            error=err,
            extra_message="error while setting up critical config. Aborting"
        )
        self.influx_client = None  # set to None since it does not work
        self.exit(error_code=ERROR_CODE)
def set_logger(self) -> None:
    """Sets up the global logger for stdout and file logging.

    Changes the logger acquired by LOGGER_NAME.

    Raises:
        ValueError: Unable to open logger
    """
    self.log_path = SppUtils.mk_logger_file(ARGS.configFile, ".log")
    try:
        file_handler = logging.FileHandler(self.log_path)
    except Exception as error:
        # TODO here: Right exception, how to print this error?
        print("unable to open logger", file=sys.stderr)
        raise ValueError("Unable to open Logger") from error

    file_handler_fmt = logging.Formatter(
        '%(asctime)s:[PID %(process)d]:%(levelname)s:%(module)s.%(funcName)s> %(message)s'
    )
    file_handler.setFormatter(file_handler_fmt)
    if (ARGS.debug):
        file_handler.setLevel(logging.DEBUG)
    else:
        file_handler.setLevel(logging.ERROR)

    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)

    logger = logging.getLogger(LOGGER_NAME)
    logger.setLevel(logging.DEBUG)
    logger.addHandler(file_handler)
    logger.addHandler(stream_handler)
def filter_values_dict(
        cls,
        result_list: List[Dict[str, Any]],
        white_list: List[str] = None,
        ignore_list: List[str] = None) -> List[Dict[str, Any]]:
    """Removes unwanted values from a list of dicts.

    Use white_list to only pick the values specified.
    Use ignore_list to pick everything but the values specified.
    Both: white_list items overwrite ignore_list items, still getting all items not filtered.

    Args:
        result_list (List[Dict[str, Any]]): items to be filtered
        white_list (List[str], optional): items to be kept. Defaults to None.
        ignore_list (List[str], optional): items to be removed. Defaults to None.

    Raises:
        ValueError: no result list specified

    Returns:
        List[Dict[str, Any]]: list of filtered dicts
    """
    if (result_list is None):
        raise ValueError("need valuelist to filter values")

    new_result_list: List[Dict[str, Any]] = []

    # if this is a single object it is a 1-element list
    for result in result_list:
        new_result: Dict[str, Any] = {}

        # only acquire wanted items
        if (white_list):
            for white_key in white_list:
                (key, value) = SppUtils.get_nested_kv(key_name=white_key,
                                                      nested_dict=result)
                if (key in new_result):
                    key = white_key
                new_result[key] = value

            # warn if something is missing
            if (len(new_result) != len(white_list)):
                ExceptionUtils.error_message(
                    f"Result does not have the same length as the whitelist, probably a typing error: {result_list}"
                )

        # acquire all but a few unwanted items
        if (ignore_list is not None):
            # add sub-dicts to the dictionary itself, filtering inclusive
            full_result = cls.get_with_sub_values(mydict=result,
                                                  ignore_list=ignore_list)
            new_result.update(full_result)

        new_result_list.append(new_result)

    return new_result_list
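# Editor's rough, standalone approximation (assumes no nested "a.b" keys are involved) of
# the filtering semantics documented above: a white_list picks keys, an ignore_list drops them.
_records = [{"name": "vm1", "commited": 10, "secret": "x"}]
_picked = [{key: rec.get(key) for key in ["name", "commited"]} for rec in _records]               # white_list
_dropped = [{key: val for key, val in rec.items() if key not in ["secret"]} for rec in _records]  # ignore_list
# _picked and _dropped both evaluate to [{'name': 'vm1', 'commited': 10}]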
def __insert_metrics_to_buffer(self, keyword: Keyword, tables_count: Dict[Table, int], duration_s: float, batch_size: int = 1) -> None: """Generates statistics per send Batch, total duration is split by item per table. Arguments: keyword {Keyword} -- Kind of query. tables_count {dict} -- Tables send in this batch, key is table, value is count of items. duration_s {float} -- Time needed to send the batch in seconds. Keyword Arguments: batch_size {int} -- Ammount of queries sent in one batch sent at once. (default: {1}) Raises: ValueError: Any arg does not match the defined parameters or value is unsupported """ # Arg checks if (list( filter(lambda arg: arg is None, [keyword, tables_count, duration_s, batch_size]))): raise ValueError("any metric arg is None. This is not supported") if (not isinstance(keyword, Keyword)): raise ValueError("need the keyword to be a instance of keyword.") if (not tables_count or not isinstance(tables_count, dict)): raise ValueError( "need at least one entry of a table in tables_count.") if (duration_s <= 0): raise ValueError( "only positive values are supported for duration. Must be not 0" ) if (batch_size < 1): raise ValueError( "only positive values are supported for batch_size. Must be not 0" ) # get shared record time to be saved on querys = [] # save metrics for each involved table individually for (table, item_count) in tables_count.items(): querys.append( InsertQuery( table=self.__metrics_table, fields={ # Calculating relative duration for this part of whole query 'duration_ms': duration_s * 1000 * (max(item_count, 1) / batch_size), 'item_count': item_count, }, tags={ 'keyword': keyword, 'tableName': table.name, }, time_stamp=SppUtils.get_actual_time_sec())) self.__insert_buffer[self.__metrics_table] = self.__insert_buffer.get( self.__metrics_table, []) + querys
def _parse_free_cmd(
        ssh_command: SshCommand, ssh_type: SshTypes) -> Tuple[str, List[Dict[str, Any]]]:
    """Parses the result of the `free` command, splitting it into its parts.

    Arguments:
        ssh_command {SshCommand} -- command with saved result
        ssh_type {SshTypes} -- type of the client

    Raises:
        ValueError: no command given or no result saved
        ValueError: no ssh type given

    Returns:
        Tuple[str, List[Dict[str, Any]]] -- Tuple of the table name and an insert list
    """
    if (not ssh_command or not ssh_command.result):
        raise ValueError("no command given or empty result")
    if (not ssh_type):
        raise ValueError("no sshtype given")
    if (not ssh_command.table_name):
        raise ValueError("need table name to insert parsed value")

    result_lines = ssh_command.result.splitlines()
    header = result_lines[0].split()
    header.insert(0, 'name')

    values: List[Dict[str, Any]] = list(
        map(lambda row: dict(zip(header, row.split())), result_lines[1:]))  # type: ignore
    (time_key, _) = SppUtils.get_capture_timestamp_sec()
    for row in values:
        # remove ':' from name
        row['name'] = row['name'][:-1]

        # set default needed fields
        row['hostName'] = ssh_command.host_name
        row['ssh_type'] = ssh_type.name
        row[time_key] = SppUtils.get_actual_time_sec()

        # recalculate values to be more useful
        if ('available' in row):
            row['free'] = int(row.pop('available'))
            row['used'] = int(row['total']) - int(row['free'])

    return (ssh_command.table_name, values)
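# Editor's illustrative, assumption-based sample of the header handling above: `free`
# prints no label for the first column, so 'name' is prepended before zipping rows into dicts.
_sample_free_output = (
    "              total        used        free      shared  buff/cache   available\n"
    "Mem:       16309584     5662024     1062012      303384     9585548    10013632\n"
    "Swap:       2097148           0     2097148"
)
_lines = _sample_free_output.splitlines()
_header = _lines[0].split()
_header.insert(0, "name")
_rows = [dict(zip(_header, line.split())) for line in _lines[1:]]
# _rows[0]['name'] == 'Mem:' (the trailing ':' is stripped later); the 'Swap:' row has no
# 'available' key, which is why the recalculation above guards with `if 'available' in row`.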
def __init__(self): self.log_path: str = "" """path to logger, set in set_logger.""" self.pid_file_path: str = "" """path to pid_file, set in check_pid_file.""" # String, cause of days etc self.job_log_retention_time = "60d" """Configured spp log rentation time, logs get deleted after this time.""" self.set_logger() if (not self.check_pid_file()): ExceptionUtils.error_message( "Another instance of sppmon with the same args is running") self.exit(ERROR_CODE_CMD_LINE) # everything is option, otherwise its a typo. if (len(ARGS) > 0): ExceptionUtils.error_message( f"CAREFUL: ARG DETECTED, probably typing in programm call: {ARGS}" ) time_stamp_name, time_stamp = SppUtils.get_capture_timestamp_sec() self.start_counter = time.perf_counter() LOGGER.debug("\n\n") LOGGER.debug(f"running script version: {VERSION}") LOGGER.debug(f"cmdline options: {OPTIONS}") LOGGER.debug(f"{time_stamp_name}: {time_stamp}") LOGGER.debug("") if (not OPTIONS.confFileJSON): ExceptionUtils.error_message("missing config file, aborting") self.exit(error_code=ERROR_CODE_CMD_LINE) try: config_file = SppUtils.read_conf_file( config_file_path=OPTIONS.confFileJSON) except ValueError as error: ExceptionUtils.exception_info( error=error, extra_message="Syntax Error in Config file, unable to read") self.exit(error_code=ERROR_CODE_CMD_LINE) self.setup_args() self.set_critial_configs(config_file) self.set_optional_configs(config_file)
def _parse_ps_cmd(self, ssh_command: SshCommand, ssh_type: SshTypes) -> Tuple[str, List[Dict[str, Any]]]:
    """Parses the result of the `ps` command, splitting it into its parts.

    Arguments:
        ssh_command {SshCommand} -- command with saved result
        ssh_type {SshTypes} -- type of the client

    Raises:
        ValueError: no command given or no result saved
        ValueError: no ssh type given

    Returns:
        Tuple[str, List[Dict[str, Any]]] -- Tuple of the table name and an insert list
    """
    if(not ssh_command or not ssh_command.result):
        raise ValueError("no command given or empty result")
    if(not ssh_type):
        raise ValueError("no sshtype given")
    if(not ssh_command.table_name):
        raise ValueError("need table name to insert parsed value")

    result_lines = ssh_command.result.splitlines()
    header = result_lines[0].split()

    values: List[Dict[str, Any]] = list(
        map(lambda row: dict(zip(header, row.split())), result_lines[1:]))  # type: ignore

    # keep only the watched processes; this also removes the `ps` call itself, which would otherwise be tracked
    values = list(filter(lambda row: row["COMMAND"] in self.__process_grep_list, values))
    for row in values:
        # Remove CPU, it is tracked by the TOP command (see Issue #71)
        row.pop("%CPU", None)

        # Add information
        row["collectionType"] = "PS"

        # set default needed fields
        row['hostName'] = ssh_command.host_name
        row['ssh_type'] = ssh_type.name
        (time_key, time_value) = SppUtils.get_capture_timestamp_sec()
        row[time_key] = time_value

        row['TIME+'] = row.pop('ELAPSED')
        row['MEM_ABS'] = SppUtils.parse_unit(row.pop("RSS"), "kib")
        row['VIRT'] = SppUtils.parse_unit(row.pop('VSZ'), "kib")

    return (ssh_command.table_name, values)
def get_vms_per_sla(self) -> List[Dict[str, Any]]:
    """Retrieves and calculates the number of VMware VMs per SLA."""

    endpoint = "/ngp/slapolicy"
    white_list = ["name", "id"]
    array_name = "slapolicies"

    sla_policy_list = self.__rest_client.get_objects(
        endpoint=endpoint,
        white_list=white_list,
        array_name=array_name,
        add_time_stamp=False)

    result_list: List[Dict[str, Any]] = []
    for sla_policy in sla_policy_list:
        try:
            sla_name: str = sla_policy["name"]
        except KeyError as error:
            ExceptionUtils.exception_info(
                error, extra_message="skipping one sla entry due to missing name.")
            continue
        sla_id: Optional[str] = sla_policy.get("id", None)

        result_dict: Dict[str, Any] = {}

        ## hotadd:
        sla_name = urllib.parse.quote_plus(sla_name)

        endpoint = "/api/hypervisor/search"
        endpoint = ConnectionUtils.url_set_param(url=endpoint,
                                                 param_name="resourceType",
                                                 param_value="vm")
        endpoint = ConnectionUtils.url_set_param(url=endpoint,
                                                 param_name="from",
                                                 param_value="hlo")
        filter_str: str = '[{"property":"storageProfileName","value": "' + sla_name + '", "op":"="}]'
        endpoint = ConnectionUtils.url_set_param(url=endpoint,
                                                 param_name="filter",
                                                 param_value=filter_str)

        # note: currently only vmware is queried per sla, not hyperV
        # need to check if hypervisortype must be specified
        post_data = json.dumps({"name": "*", "hypervisorType": "vmware"})
        response_json = self.__rest_client.post_data(endpoint=endpoint, post_data=post_data)

        result_dict["slaName"] = sla_name
        result_dict["slaId"] = sla_id
        result_dict["vmCountBySLA"] = response_json.get("total")

        time_key, time = SppUtils.get_capture_timestamp_sec()
        result_dict[time_key] = time

        result_list.append(result_dict)

    return result_list
def exit(self, error_code: int = SUCCESS_CODE) -> NoReturn: """Executes finishing tasks and exits sppmon. To be called every time. Executes finishing tasks and displays error messages. Specify only error message if something did went wrong. Use Error codes specified at top of module. Does NOT return. Keyword Arguments: error {int} -- Errorcode if a error occured. (default: {0}) """ # error with the command line arguments # dont store runtime here if (error_code == ERROR_CODE_CMD_ARGS): parser.print_help() sys.exit(ERROR_CODE_CMD_ARGS) # unreachable? if (error_code == ERROR_CODE_START_ERROR): ExceptionUtils.error_message( "Error when starting SPPMon. Please review the errors above") sys.exit(ERROR_CODE_START_ERROR) script_end_time = SppUtils.get_actual_time_sec() LOGGER.debug("Script end time: %d", script_end_time) try: if (not self.ignore_setup): self.store_script_metrics() if (self.influx_client): self.influx_client.disconnect() if (self.rest_client): self.rest_client.logout() except ValueError as error: ExceptionUtils.exception_info( error=error, extra_message="Error occured while exiting sppmon") error_code = ERROR_CODE self.remove_pid_file() # Both error-clauses are actually the same, but for possiblility of an split between error cases # always last due beeing true for any number != 0 if (error_code == ERROR_CODE or error_code): ExceptionUtils.error_message( "Error occured while executing sppmon") elif (not self.ignore_setup): LOGGER.info("\n\n!!! script completed !!!\n") print( f"check log for details: grep \"PID {os.getpid()}\" {self.log_path} > sppmon.log.{os.getpid()}" ) sys.exit(error_code)
def check_pid_file(self) -> bool: if (ARGS.verbose): LOGGER.info("Checking for other SPPMon instances") self.pid_file_path = SppUtils.mk_logger_file(ARGS.configFile, ".pid_file") try: try: file = open(self.pid_file_path, "rt") match_list = re.findall(r"(\d+) " + str(ARGS), file.read()) file.close() deleted_processes: List[str] = [] for match in match_list: # add spaces to make clear the whole number is matched match = f' {match} ' try: if (os.name == 'nt'): args = ['ps', '-W'] else: args = ['ps', '-p', match] result = subprocess.run(args, check=True, capture_output=True) if (re.search(match, str(result.stdout))): return False # not in there -> delete entry deleted_processes.append(match) except CalledProcessError as error: deleted_processes.append(match) # delete processes which did get killed, not often called if (deleted_processes): file = open(self.pid_file_path, "rt") file_str = file.read() file.close() options = str(ARGS) for pid in deleted_processes: file_str = file_str.replace(f"{pid} {options}", "") # do not delete if empty since we will use it below file = open(self.pid_file_path, "wt") file.write(file_str.strip()) file.close() except FileNotFoundError: pass # no file created yet # always write your own pid into it file = open(self.pid_file_path, "at") file.write(f"{os.getpid()} {str(ARGS)}") file.close() return True except Exception as error: ExceptionUtils.exception_info(error) raise ValueError("Error when checking pid file")
def exit(self, error_code: int = False) -> NoReturn: """Executes finishing tasks and exits sppmon. To be called every time. Executes finishing tasks and displays error messages. Specify only error message if something did went wrong. Use Error codes specified at top of module. Does NOT return. Keyword Arguments: error {int} -- Errorcode if a error occured. (default: {False}) """ # error with the command line arguments # dont store runtime here if (error_code == ERROR_CODE_CMD_LINE): prog_args = [] prog_args.append(sys.argv[0]) prog_args.append("--help") os.execv(sys.executable, ['python'] + prog_args) sys.exit(ERROR_CODE_CMD_LINE) # unreachable? script_end_time = SppUtils.get_actual_time_sec() LOGGER.debug("Script end time: %d", script_end_time) try: if (not self.ignore_setup): self.store_script_metrics() if (self.influx_client): self.influx_client.disconnect() if (self.rest_client): self.rest_client.logout() except ValueError as error: ExceptionUtils.exception_info( error=error, extra_message="Error occured while exiting sppmon") error_code = ERROR_CODE if (not error_code): LOGGER.info("\n\n!!! script completed !!!\n") self.remove_pid_file() # Both clauses are actually the same, but for clarification, always last due always beeing true for any number if (error_code == ERROR_CODE or error_code): ExceptionUtils.error_message( "Error occured while executing sppmon") print( f"check log for details: grep \"PID {os.getpid()}\" {self.log_path} > sppmon.log.{os.getpid()}" ) sys.exit(error_code)
def __init__(self): self.log_path: str = "" """path to logger, set in set_logger.""" self.pid_file_path: str = "" """path to pid_file, set in check_pid_file.""" self.set_logger() LOGGER.info("Starting SPPMon") if (not self.check_pid_file()): ExceptionUtils.error_message( "Another instance of sppmon with the same args is running") self.exit(ERROR_CODE_START_ERROR) time_stamp_name, time_stamp = SppUtils.get_capture_timestamp_sec() self.start_counter = time.perf_counter() LOGGER.debug("\n\n") LOGGER.debug(f"running script version: {VERSION}") LOGGER.debug(f"cmdline options: {ARGS}") LOGGER.debug(f"{time_stamp_name}: {time_stamp}") LOGGER.debug("") if (not ARGS.configFile): ExceptionUtils.error_message("missing config file, aborting") self.exit(error_code=ERROR_CODE_CMD_ARGS) try: self.config_file = SppUtils.read_conf_file( config_file_path=ARGS.configFile) except ValueError as error: ExceptionUtils.exception_info( error=error, extra_message= "Error when trying to read Config file, unable to read") self.exit(error_code=ERROR_CODE_START_ERROR) LOGGER.info("Setting up configurations") self.setup_args() self.set_critial_configs(self.config_file) self.set_optional_configs(self.config_file)
def format_fields(
        self, fields: Dict[str, Any]) -> Dict[str, Union[int, float, str]]:
    """Formats fields according to the requirements of the InfluxDB.

    Casts and transforms all values to the required datatype, declared in the given table.
    Escapes all characters which are not allowed, applies to both key and value.

    Arguments:
        fields {Dict[str, Any]} -- Dict of all fields to be formatted, key is name, value is data

    Returns:
        Dict[str, Union[int, float, str]] -- Dict with field name as key and data as value
    """
    ret_dict: Dict[str, Union[int, float, str]] = {}
    for (key, value) in fields.items():
        if (value is None or (isinstance(value, str) and not value)):
            continue

        # Get column datatype
        datatype = self.table.fields.get(key, None)
        # If nothing is defined select it automatically
        if (datatype is None):
            datatype = Structures.Datatype.get_auto_datatype(value)

        # Escape characters not allowed in the key
        key = InfluxUtils.escape_chars(
            value=key, replace_list=self.__bad_name_characters)

        # Format strings
        if (datatype == Structures.Datatype.STRING):
            value = InfluxUtils.escape_chars(value=value,
                                             replace_list=[(r'"', r'\"')])
            value = "\"{}\"".format(value)

        # Make sure time is always saved in seconds, save as int
        if (datatype == Structures.Datatype.TIMESTAMP):
            value = SppUtils.to_epoch_secs(value)
            value = '{}i'.format(value)

        # Turn an integer into an IntLiteral
        if (datatype == Structures.Datatype.INT):
            value = '{}i'.format(value)

        ret_dict[key] = value

    return ret_dict
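# Editor's standalone sketch (an approximation of the behaviour, not the class method):
# the value formatting above targets the InfluxDB line protocol, where strings are escaped
# and quoted and integers get a trailing 'i' integer-literal marker.
def _format_field_value(value):
    if isinstance(value, bool):
        return value                       # booleans pass through untouched here
    if isinstance(value, int):
        return f"{value}i"                 # integer literal marker
    if isinstance(value, float):
        return value
    escaped = str(value).replace('"', r'\"')
    return f'"{escaped}"'

# _format_field_value(42)      -> '42i'
# _format_field_value('a"b')   -> '"a\\"b"'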
def setup_ssh_clients(config_file: Dict[str, Any]) -> List[SshClient]:
    auth_ssh = SppUtils.get_cfg_params(param_dict=config_file,
                                       param_name="sshclients")

    if (not isinstance(auth_ssh, list)):
        raise ValueError("not a list of sshconfig given", auth_ssh)

    ssh_clients: List[SshClient] = []
    for client_ssh in auth_ssh:
        try:
            ssh_clients.append(SshClient(client_ssh))
        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message="Setting up one ssh-client failed, skipping it. "
                f"Client: {client_ssh.get('name', 'ERROR WHEN GETTING NAME')}")
    return ssh_clients
def __insert_metrics_to_buffer(self, keyword: Keyword, table: Table,
                               duration_s: float, item_count: int,
                               error: Optional[str] = None) -> None:
    """Generates statistics of influx-requests and appends them to be sent.

    Arguments:
        keyword {Keyword} -- Kind of query.
        table {Table} -- Table sent in this batch.
        duration_s {float} -- Time needed to send the batch in seconds.
        item_count {int} -- Amount of queries sent to the server.

    Keyword Arguments:
        error {Optional[str]} -- Error message if an error occurred. (default: {None})

    Raises:
        ValueError: Any arg does not match the defined parameters or value is unsupported
    """
    # Arg checks
    if (list(
            filter(lambda arg: arg is None,
                   [keyword, table, duration_s, item_count]))):
        raise ValueError(
            "One of the insert metrics to influx args is None. This is not supported"
        )

    query = InsertQuery(
        table=self.__metrics_table,
        fields={
            'error': error,
            # duration of this request in ms
            'duration_ms': duration_s * 1000,
            'item_count': item_count,
        },
        tags={
            'keyword': keyword,
            'tableName': table.name,
        },
        time_stamp=SppUtils.get_actual_time_sec())
    old_queries = self.__insert_buffer.get(self.__metrics_table, [])
    old_queries.append(query)
    self.__insert_buffer[self.__metrics_table] = old_queries
def __init__(self, config_file: Dict[str, Any]):
    """Initializes the influx client from a config dict. Call `connect` before using the client.

    Arguments:
        config_file {Dict[str, Any]} -- Opened config file, requires an "influxDB" section with all parameters.

    Raises:
        ValueError: Raises a ValueError if any important parameters are missing within the file
    """
    if (not config_file):
        raise ValueError(
            "A config file is required to setup the InfluxDB client.")

    auth_influx = SppUtils.get_cfg_params(param_dict=config_file,
                                          param_name="influxDB")
    if (not isinstance(auth_influx, dict)):
        raise ValueError(
            "The InfluxDB config is corrupted within the file: Needs to be a dictionary."
        )

    try:
        self.__user: str = auth_influx["username"]
        self.__password: str = auth_influx["password"]
        self.__use_ssl: bool = auth_influx["ssl"]
        if (self.__use_ssl):
            self.__verify_ssl: bool = auth_influx["verify_ssl"]
        else:
            self.__verify_ssl = False
        self.__port: int = auth_influx["srv_port"]
        self.__address: str = auth_influx["srv_address"]
        self.__database: Database = Database(auth_influx["dbName"])

        # Create table definitions in code
        Definitions.add_table_definitions(self.database)

        self.__metrics_table: Table = self.database['influx_metrics']
    except KeyError as key_error:
        ExceptionUtils.exception_info(error=key_error)
        raise ValueError("Missing Influx-Config arg", str(key_error))

    # declare for later
    self.__client: InfluxDBClient
    self.__version: str
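# For reference, a minimal "influxDB" config section containing exactly the keys read
# above (the values shown are placeholders, not taken from the source):
# {
#     "influxDB": {
#         "username": "influxadmin",
#         "password": "<secret>",
#         "ssl": true,
#         "verify_ssl": false,
#         "srv_port": 8086,
#         "srv_address": "influx.example.com",
#         "dbName": "sppmon_db"
#     }
# }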
def transform_time_literal(
        value: str, single_vals: bool = False) -> Union[str, Tuple[int, int, int]]:
    """Transforms a time literal into an hour/minute/second literal.

    Checks beforehand whether the literal is valid.

    Args:
        value (str): time literal to be transformed
        single_vals (bool, optional): whether the result should be a tuple. Defaults to False.

    Raises:
        ValueError: no value given
        ValueError: not a str given
        ValueError: value is no time literal

    Returns:
        Union[str, Tuple[int, int, int]]: influxdb time literal in 0h0m0s format or values as tuple
    """
    if (not value):
        raise ValueError("need a value to verify the time literal")
    if (not isinstance(value, str)):
        raise ValueError(
            "type of the value for time literal transform is not str")
    if (not re.match(r"^(\d+(?:[smhdw]))+$", value)):
        if (value.lower() == "inf"):
            return "0s"
        raise ValueError("value does not pass the time literal check", value)

    match_list = re.findall(r"((\d+)([a-z]+))", value)
    time_s = 0
    for (_, numbers, unit) in match_list:  # full match is first, but unused
        time_s += SppUtils.parse_unit(numbers, unit)

    hours = int(time_s / pow(60, 2))
    time_s = time_s % pow(60, 2)
    mins = int(time_s / pow(60, 1))
    seconds = int(time_s % pow(60, 1))

    if (single_vals):
        return (hours, mins, seconds)
    return f"{hours}h{mins}m{seconds}s"
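# Editor's standalone sketch of the same arithmetic (assumes the usual unit factors; the
# real implementation delegates the unit parsing to SppUtils.parse_unit):
import re as _re

def _time_literal_to_hms(value: str) -> str:
    factors = {"s": 1, "m": 60, "h": 3600, "d": 86400, "w": 604800}
    total_s = sum(int(num) * factors[unit]
                  for num, unit in _re.findall(r"(\d+)([smhdw])", value))
    hours, rest = divmod(total_s, 3600)
    mins, secs = divmod(rest, 60)
    return f"{hours}h{mins}m{secs}s"

# _time_literal_to_hms("90m")  -> '1h30m0s'
# _time_literal_to_hms("1d2h") -> '26h0m0s'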
def _parse_system_stats_cmd(
        ssh_command: SshCommand, ssh_type: SshTypes) -> Tuple[str, List[Dict[str, Any]]]:
    """Parses the result of the `vsnap --json system stats` command, splitting it into its parts.

    Arguments:
        ssh_command {SshCommand} -- command with saved result
        ssh_type {SshTypes} -- type of the client

    Raises:
        ValueError: no command given or no result saved
        ValueError: no ssh type given

    Returns:
        Tuple[str, List[Dict[str, Any]]] -- Tuple of the table name and an insert list
    """
    if (not ssh_command or not ssh_command.result):
        raise ValueError("no command given or empty result")
    if (not ssh_type):
        raise ValueError("no sshtype given")
    if (not ssh_command.table_name):
        raise ValueError("need table name to insert parsed value")

    try:
        insert_dict: Dict[str, Any] = json.loads(ssh_command.result)
    except json.decoder.JSONDecodeError:  # type: ignore
        raise ValueError("can't decode json for system stats command",
                         ssh_command.result, ssh_command, ssh_type)

    if (not list(filter(lambda val: val is not None, insert_dict.values()))):
        raise ValueError(
            "Command and result given, but all values are None")

    # set default needed fields
    insert_dict['hostName'] = ssh_command.host_name
    insert_dict['ssh_type'] = ssh_type.name
    (time_key, time_value) = SppUtils.get_capture_timestamp_sec()
    insert_dict[time_key] = time_value

    return (ssh_command.table_name, [insert_dict])
def __init__(self, config_file: Dict[str, Any],
             initial_connection_timeout: float,
             pref_send_time: int,
             request_timeout: int | None,
             send_retries: int,
             starting_page_size: int,
             min_page_size: int,
             verbose: bool):

    if(not config_file):
        raise ValueError("A config file is required to setup the REST client.")

    auth_rest = SppUtils.get_cfg_params(
        param_dict=config_file,
        param_name="sppServer")
    if(not isinstance(auth_rest, dict)):
        raise ValueError("The REST-API config is corrupted within the file: Needs to be a dictionary.")

    self.__timeout = request_timeout
    self.__initial_connection_timeout = initial_connection_timeout

    self.__preferred_time = pref_send_time
    self.__page_size = starting_page_size
    self.__min_page_size = min_page_size
    self.__send_retries = send_retries

    self.__verbose = verbose
    try:
        self.__username: str = auth_rest["username"]
        self.__password: str = auth_rest["password"]
        self.__srv_address: str = auth_rest["srv_address"]
        self.__srv_port: int = auth_rest["srv_port"]
    except KeyError as error:
        raise ValueError("Not all REST-API Parameters are given", auth_rest) from error

    self.__sessionid: str = ""
    self.__srv_url: str = ""
def create_inventory_summary(self) -> None:
    """Retrieves and calculates the VM inventory summary by influx catalog data."""

    LOGGER.info(
        "> computing inventory information (not from catalog, means not only backup data is calculated)")
    # ########## Part 1: Check if something needs to be computed #############
    # query the timestamp of the last vm, `commited` as a field is always needed by influx rules.
    vms_table = self.__influx_client.database["vms"]

    time_query = SelectionQuery(
        keyword=Keyword.SELECT,
        tables=[vms_table],
        fields=['time', 'commited'],
        limit=1,
        order_direction="DESC"
    )
    result = self.__influx_client.send_selection_query(time_query)  # type: ignore
    last_vm: Dict[str, Any] = next(result.get_points(), None)  # type: ignore

    if(not last_vm):
        raise ValueError("no VM's stored, either none are available or you have to store vm's first")

    # query the last vm stats to compare timestamps with the last vm
    last_time_ms: int = last_vm["time"]
    last_time = SppUtils.epoch_time_to_seconds(last_time_ms)
    where_str = "time = {}s".format(last_time)

    vm_stats_table = self.__influx_client.database["vmStats"]

    vm_stats_query = SelectionQuery(
        keyword=Keyword.SELECT,
        tables=[vm_stats_table],
        fields=['*'],
        where_str=where_str,
        limit=1
    )
    result = self.__influx_client.send_selection_query(vm_stats_query)  # type: ignore
    if(len(list(result.get_points())) > 0):  # type: ignore
        LOGGER.info(">> vm statistics already computed, skipping")
        return

    # ####################### Part 2: Compute new Data ####################
    fields = [
        'uptime',
        'powerState',
        'commited',
        'uncommited',
        'memory',
        'host',
        'vmVersion',
        'isProtected',
        'inHLO',
        'isEncrypted',
        'datacenterName',
        'hypervisorType',
    ]
    query = SelectionQuery(
        keyword=Keyword.SELECT,
        tables=[vms_table],
        fields=fields,
        where_str=where_str
    )
    result = self.__influx_client.send_selection_query(query)  # type: ignore

    all_vms_list: List[Dict[str, Union[str, int, float, bool]]] = list(result.get_points())  # type: ignore

    # skip if no new data can be computed
    if(not all_vms_list):
        raise ValueError("no VM's stored, either none are available or store vms first")

    vm_stats: Dict[str, Any] = {}
    try:
        vm_stats['vmCount'] = len(all_vms_list)

        # returns largest/smallest
        vm_stats['vmMaxSize'] = max(all_vms_list, key=(lambda mydict: mydict['commited']))['commited']
        # zero-size vm's are ignored on purpose
        vms_no_null_size = list(filter(lambda mydict: mydict['commited'] > 0, all_vms_list))
        if(vms_no_null_size):
            vm_stats['vmMinSize'] = min(vms_no_null_size, key=(lambda mydict: mydict['commited']))['commited']
        vm_stats['vmSizeTotal'] = sum(mydict['commited'] for mydict in all_vms_list)
        vm_stats['vmAvgSize'] = vm_stats['vmSizeTotal'] / vm_stats['vmCount']

        # returns largest/smallest
        vm_stats['vmMaxUptime'] = max(all_vms_list, key=(lambda mydict: mydict['uptime']))['uptime']
        # zero-uptime vm's are ignored on purpose
        vms_no_null_time = list(filter(lambda mydict: mydict['uptime'] > 0, all_vms_list))
        if(vms_no_null_time):
            vm_stats['vmMinUptime'] = min(vms_no_null_time, key=(lambda mydict: mydict['uptime']))['uptime']
        vm_stats['vmUptimeTotal'] = sum(mydict['uptime'] for mydict in all_vms_list)
        vm_stats['vmAvgUptime'] = vm_stats['vmUptimeTotal'] / vm_stats['vmCount']

        vm_stats['vmCountProtected'] = len(list(filter(lambda mydict: mydict['isProtected'] == "True", all_vms_list)))
        vm_stats['vmCountUnprotected'] = vm_stats['vmCount'] - vm_stats['vmCountProtected']
        vm_stats['vmCountEncrypted'] = len(list(filter(lambda mydict: mydict['isEncrypted'] == "True", all_vms_list)))
        vm_stats['vmCountPlain'] = vm_stats['vmCount'] - vm_stats['vmCountEncrypted']
        vm_stats['vmCountHLO'] = len(list(filter(lambda mydict: mydict['inHLO'] == "True", all_vms_list)))
        vm_stats['vmCountNotHLO'] = vm_stats['vmCount'] - vm_stats['vmCountHLO']

        vm_stats['vmCountVMware'] = len(list(filter(lambda mydict: mydict['hypervisorType'] == "vmware", all_vms_list)))
        vm_stats['vmCountHyperV'] = len(list(filter(lambda mydict: mydict['hypervisorType'] == "hyperv", all_vms_list)))

        vm_stats['nrDataCenters'] = len(set(map(lambda vm: vm['datacenterName'], all_vms_list)))
        vm_stats['nrHosts'] = len(set(map(lambda vm: vm['host'], all_vms_list)))

        vm_stats['time'] = all_vms_list[0]['time']

        if self.__verbose:
            MethodUtils.my_print([vm_stats])

    except (ZeroDivisionError, AttributeError, KeyError, ValueError) as error:
        ExceptionUtils.exception_info(error=error)
        raise ValueError("error when computing extra vm stats", vm_stats)

    LOGGER.info(">> store vmInventory information in Influx DB")
    self.__influx_client.insert_dicts_to_buffer("vmStats", [vm_stats])
class JobMethods: """Wrapper for all job related functionality. You may implement new methods in here. Methods: get_all_jobs - incrementally saves all stored jobsessions, even before first execution of sppmon. job_logs -> saves all jobLogs for the jobsessions in influx catalog. """ # only here to maintain for later, unused yet __job_log_allow_list = [ "CTGGA2340", "CTGGA0071", "CTGGA2260", "CTGGA2315", "CTGGA0550", "CTGGA2384" ] # to be moved somewhere else # ######### Add new logs to be parsed here ####################################### # Structure: # Dict with messageID of log as name # value is a tuple of # #1 the tablename # #2 a lambda which maps each elem to a name. Must contain at least one argument! # #3 list of tuples: keys of additional informations to be saved: (#1: key, #2: rename). Part 2 optional, only if rename # the values are delived by the param_list of the joblog # if the value is something like 10sec or 10gb use `parse_unit` to parse it. __supported_ids: Dict[str, Tuple[str, Callable[[List[Any]], Dict[ str, Any]], List[Union[Tuple[str, str], str]]]] = { 'CTGGA2384': ( 'vmBackupSummary', lambda params: { "name": params[0], "proxy": params[1], "vsnaps": params[2], "type": params[3], "transportType": params[4], "transferredBytes": SppUtils.parse_unit(params[5]), "throughputBytes/s": SppUtils.parse_unit(params[6]), "queueTimeSec": SppUtils.parse_unit(params[7]), "protectedVMDKs": params[8], "TotalVMDKs": params[9], "status": params[10] }, ["messageId"] # Additional Information from job-message itself ), 'CTGGA0071': ('vmBackupSummary', lambda params: { 'protectedVMDKs': params[0], 'TotalVMDKs': int(params[1]) + int(params[0]), 'transferredBytes': SppUtils.parse_unit(params[2]), 'throughputBytes/s': SppUtils.parse_unit(params[3]), 'queueTimeSec': SppUtils.parse_unit(params[4]) }, ["messageId"]), 'CTGGA0072': ('vmReplicateSummary', lambda params: { 'total': params[0], 'failed': params[1], 'duration': SppUtils.parse_unit(params[2]) }, []), 'CTGGA0398': ('vmReplicateStats', lambda params: { 'replicatedBytes': SppUtils.parse_unit(params[0]), 'throughputBytes/sec': SppUtils.parse_unit(params[1]), 'duration': SppUtils.parse_unit(params[2], delimiter=':') }, []), 'CTGGR0003': ( 'office365Stats', lambda params: { 'imported365Users': int(params[0]), }, [ # Additional Information from job-message itself, including rename "jobId", "jobSessionId", "jobName", "jobExecutionTime" # used to instantly integrate with other stats ]), 'CTGGA2444': ( 'office365Stats', lambda params: { 'protectedItems': int(params[0]), 'selectedItems': int(params[0]), }, [ "jobId", "jobSessionId", "jobName", "jobExecutionTime" # used to instantly integrate with other stats ]), 'CTGGA2402': ( 'office365TransfBytes', lambda params: # If not matching, this will return a empty dict which is going to be ignored MethodUtils.joblogs_parse_params( r"(\w+)\s*\(Server:\s*([^\s,]+), Transfer Size: (\d+(?:.\d*)?\s*\w*)\)", params[1], lambda match_list: { "itemName": params[0], "itemType": match_list[1], "serverName": match_list[2], "transferredBytes": SppUtils.parse_unit(match_list[3]), }), ["jobId", "jobSessionId", "jobName"]), } """LogLog messageID's which can be parsed by sppmon. 
Check detailed summary above the declaration.""" def __init__(self, influx_client: Optional[InfluxClient], api_queries: Optional[ApiQueries], job_log_retention_time: str, job_log_types: List[str], verbose: bool): if (not influx_client): raise ValueError( "Job Methods are not available, missing influx_client") if (not api_queries): raise ValueError( "Job Methods are not available, missing api_queries") self.__influx_client = influx_client self.__api_queries = api_queries self.__verbose = verbose self.__job_log_retention_time = job_log_retention_time """used to limit the time jobLogs are queried, only interestig for init call""" self.__job_log_types = job_log_types def get_all_jobs(self) -> None: """incrementally saves all stored jobsessions, even before first execution of sppmon""" job_list = MethodUtils.query_something( name="job list", source_func=self.__api_queries.get_job_list) for job in job_list: job_id = job.get("id", None) job_name = job.get("name", None) # this way to make sure we also catch empty strings if (not job_id or not job_name): ExceptionUtils.error_message( f"skipping, missing name or id for job {job}") continue LOGGER.info( ">> capturing Job information for Job \"{}\"".format(job_name)) try: self.__job_by_id(job_id=job_id) except ValueError as error: ExceptionUtils.exception_info( error=error, extra_message= f"error when getting jobs for {job_name}, skipping it") continue def __job_by_id(self, job_id: str) -> None: """Requests and saves all jobsessions for a jobID""" if (not job_id): raise ValueError("need job_id to request jobs for that ID") keyword = Keyword.SELECT table = self.__influx_client.database['jobs'] query = SelectionQuery( keyword=keyword, fields=['id', 'jobName'], tables=[table], where_str= f'jobId = \'{job_id}\' AND time > now() - {table.retention_policy.duration}' # unnecessary filter? 
) LOGGER.debug(query) result = self.__influx_client.send_selection_query( # type: ignore query) id_list: List[int] = [] row: Dict[str, Any] = {} # make sure the var exists for row in result.get_points(): # type: ignore id_list.append(row['id']) # type: ignore if (not row): LOGGER.info( f">>> no entries in Influx database found for job with id {job_id}" ) # calculate time to be requested (rp_hours, rp_mins, rp_secs) = InfluxUtils.transform_time_literal( table.retention_policy.duration, single_vals=True) max_request_timestamp = datetime.datetime.now() - datetime.timedelta( hours=float(rp_hours), minutes=float(rp_mins), seconds=float(rp_secs)) unixtime = int(time.mktime(max_request_timestamp.timetuple())) # make it ms instead of s unixtime *= 1000 # retrieve all jobs in this category from REST API, filter to avoid drops due RP LOGGER.debug(f">>> requesting job sessions for id {job_id}") all_jobs = self.__api_queries.get_jobs_by_id(job_id=job_id) # filter all jobs where start time is not bigger then the retention time limit latest_jobs = list( filter(lambda job: job['start'] > unixtime, all_jobs)) missing_jobs = list( filter(lambda job_api: int(job_api['id']) not in id_list, latest_jobs)) if (len(missing_jobs) > 0): LOGGER.info( f">>> {len(missing_jobs)} datasets missing in DB for jobId: {job_id}" ) # Removes `statistics` from jobs self.__compute_extra_job_stats(missing_jobs, job_id) LOGGER.info( f">>> inserting job information of {len(missing_jobs)} jobs into jobs table" ) self.__influx_client.insert_dicts_to_buffer( list_with_dicts=missing_jobs, table_name="jobs") else: LOGGER.info( f">>> no new jobs to insert into DB for job with ID {job_id}") # TODO: artifact from older versions, not replaced yet if self.__verbose: display_number_of_jobs = 5 keyword = Keyword.SELECT table = self.__influx_client.database['jobs'] where_str = 'jobId = \'{}\''.format(job_id) query = SelectionQuery(keyword=keyword, fields=['*'], tables=[table], where_str=where_str, order_direction='DESC', limit=display_number_of_jobs) result = self.__influx_client.send_selection_query( # type: ignore query) # type: ignore result_list: List[str] = list(result.get_points()) # type: ignore job_list_to_print: List[str] = [] for row_str in result_list: job_list_to_print.append(row_str) print() print( "displaying last {} jobs for job with ID {} from database (as available)" .format(display_number_of_jobs, job_id)) MethodUtils.my_print(data=job_list_to_print) def __compute_extra_job_stats(self, list_with_jobs: List[Dict[str, Any]], job_id: str) -> None: """Extracts additional `statistic` list from jobs and removes it from the original list. Computes an additional table out of the data. Args: list_with_jobs (List[Dict[str, Any]]): list with all jobs """ LOGGER.info( f">>> computing additional job statistics for jobId: {job_id}") insert_list: List[Dict[str, Any]] = [] # check for none instead of bool-check: Remove empty statistic lists []. 
for job in filter(lambda x: x.get("statistics", None) is not None, list_with_jobs): job_statistics_list = job.pop('statistics') for job_stats in job_statistics_list: try: insert_dict: Dict[str, Any] = {} # fields insert_dict['resourceType'] = job_stats.get( 'resourceType', None) insert_dict['total'] = job_stats.get('total', 0) insert_dict['success'] = job_stats.get('success', 0) insert_dict['failed'] = job_stats.get('failed', 0) skipped = job_stats.get('skipped', None) if (skipped is None): skipped = insert_dict["total"] - insert_dict[ "success"] - insert_dict["failed"] insert_dict["skipped"] = skipped # time key insert_dict['start'] = job['start'] # regular tag values for grouping: insert_dict['id'] = job.get('id', None) insert_dict['jobId'] = job.get('jobId', None) insert_dict['status'] = job.get('status', None) insert_dict['indexStatus'] = job.get('indexStatus', None) insert_dict['jobName'] = job.get('jobName', None) insert_dict['type'] = job.get('type', None) insert_dict['subPolicyType'] = job.get( 'subPolicyType', None) insert_list.append(insert_dict) except KeyError as error: ExceptionUtils.exception_info( error=error, extra_message= f"failed to compute job-individual statistics due key error. report to developer. Job: {job} ; job_stats: {job_stats}" ) if (len(insert_list) > 0): self.__influx_client.insert_dicts_to_buffer( list_with_dicts=insert_list, table_name="jobs_statistics") else: LOGGER.info( f">>> no additional job statistics to insert into DB for jobId: {job_id}" ) def __job_logs_to_stats(self, list_with_logs: List[Dict[str, Any]]) -> None: """Parses joblogs into their own statisic table, using declared supported ID's To parse more jobLogs define additional entrys in the attribute `supported_ids`. Arguments: list_with_logs {List[Dict[str, Any]]} -- List with all saved joblogs """ # only continue with joblogs we want to save supported_log_iterator = filter( lambda log: log['messageId'] in self.__supported_ids.keys(), list_with_logs) sorted_log_iterator = sorted(supported_log_iterator, key=lambda entry: entry['logTime']) max_sec_timestamp = 0 # required for preventing duplicates for job_log in sorted_log_iterator: message_id = job_log['messageId'] table_func_triple = self.__supported_ids[message_id] (table_name, row_dict_func, additional_fields) = table_func_triple if (not table_name): table_name = message_id ExceptionUtils.error_message( f"Warning: No tablename specified for message_id {message_id}. Please report to developer." ) try: # Saving information from the message-params list within the job_log row_dict = row_dict_func(job_log['messageParams']) if (not row_dict): # this was matched incorrectly, therefore skipped. # No warning cause this will happen often. continue # Saving additional fields from the job_log struct itself. if (additional_fields): for value in additional_fields: # with rename if (isinstance(value, Tuple)): row_dict[value[0]] = job_log[value[1]] else: # without rename row_dict[value] = job_log[value] except (KeyError, IndexError) as error: ExceptionUtils.exception_info( error, extra_message= f"MessageID params wrong defined. Skipping message_id {message_id} with content: {job_log}" ) continue # Issue 9, In case where all tag values duplicate another record, including the timestamp, Influx will throw the insert # out as a duplicate. In some cases, the changing of epoch timestamps from millisecond to second precision is # cause duplicate timestamps. To avoid this for certain tables, add seconds to the timestamp as needed to # ensure uniqueness. 
        # Only use this when some inaccuracy of the timestamps is acceptable.
        cur_timestamp = job_log['logTime']
        if(table_name == 'vmBackupSummary'):

            if(cur_timestamp is None):  # prevent None
                ExceptionUtils.error_message(f"Warning: logTime is None, duplicate may be purged. Log: {job_log}")

            if(isinstance(cur_timestamp, str)):  # make sure it is an int
                cur_timestamp = int(cur_timestamp)

            cur_sec_timestamp = SppUtils.to_epoch_secs(cur_timestamp)
            if(cur_sec_timestamp <= max_sec_timestamp):
                digits = int(cur_timestamp / cur_sec_timestamp)
                max_sec_timestamp += 1  # increase by 1 second
                cur_timestamp = max_sec_timestamp * digits
            else:
                max_sec_timestamp = cur_sec_timestamp

        row_dict['time'] = cur_timestamp

        for (key, item) in row_dict.items():
            if(item in ('null', 'null(null)')):
                row_dict[key] = None

        self.__influx_client.insert_dicts_to_buffer(table_name, [row_dict])


def job_logs(self) -> None:
    """Saves all jobLogs for the jobsessions into the influx catalog.

    Make sure to call `get_all_jobs` beforehand to acquire all jobsessions.
    In order to save them it deletes and rewrites all affected jobsession entries.
    It automatically parses certain jobLogs into additional stats, defined by `supported_ids`.
    """

    # total count of requested logs
    logs_requested_total = 0
    # total count of inserted logs
    logs_to_stats_total = 0  # should be equal, but on failure it isn't (skipped logs)

    # list to be inserted after everything is updated
    job_update_list: List[Dict[str, Any]] = []

    LOGGER.info("> Requesting jobs with missing logs from influx database")

    table = self.__influx_client.database['jobs']
    # only store if there is something to store -> limited by job log retention time.
    where_str = 'jobsLogsStored <> \'True\' and time > now() - %s' % self.__job_log_retention_time
    where_str += f' AND time > now() - {table.retention_policy.duration}'

    # Select all jobs without joblogs
    keyword = Keyword.SELECT
    query = SelectionQuery(
        keyword=keyword,
        tables=[table],
        fields=['*'],
        where_str=where_str)

    # send query and compute
    missing_logs_jobs_rs = self.__influx_client.send_selection_query(query)  # type: ignore

    # this list contains all jobs which are missing their logs
    # Cast from resultset into list
    missing_logs_jobs: List[Dict[str, Any]] = list(missing_logs_jobs_rs.get_points())  # type: ignore

    LOGGER.info(f">>> Number of jobs with no joblogs stored in Influx database: {len(missing_logs_jobs)}")

    LOGGER.info("> Requesting missing jobLogs from REST-API.")

    # request all jobLogs from REST-API
    # counter only for display purposes
    for counter, row in enumerate(missing_logs_jobs, 0):

        # Only print every 5 rows if not verbose
        # starts at 0, therefore already updated
        if(self.__verbose or counter % 5 == 0):
            LOGGER.info(f">>> computed joblogs for {counter} / {len(missing_logs_jobs)} job sessions.")

        job_session_id: Optional[int] = row.get('id', None)

        # if somehow the jobSessionId is missing: skip
        # Should usually not happen
        if(job_session_id is None):
            ExceptionUtils.error_message(f"Error: jobSessionId missing for row {row}")
            continue

        if(self.__verbose):
            LOGGER.info(f">>> Requesting jobLogs {self.__job_log_types} for session {job_session_id}.")
        LOGGER.debug(f">>> Requesting jobLogs {self.__job_log_types} for session {job_session_id}.")

        try:
            # can't use `query_something` like in other places due to the extra params:
            # api_queries - query_something only works with no params

            # This list contains all joblogs for a single job-execution
            current_job_logs = self.__api_queries.get_job_log_details(
                jobsession_id=job_session_id,
                job_logs_types=self.__job_log_types,
                request_ids=list(self.__supported_ids.keys()))
        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message=f"Error when api-requesting joblogs for job_session_id {job_session_id}, skipping it")
            continue

        job_log_count = len(current_job_logs)
        logs_requested_total += job_log_count

        if(self.__verbose):
            LOGGER.info(f">>> Found {job_log_count} logs for jobsessionId {job_session_id}")
        LOGGER.debug(f"Found {job_log_count} logs for jobsessionId {job_session_id}")

        # ####################################################################################
        # Compute results and save logs
        # ####################################################################################
        # The request of REST-API logs is finished here.
        # To not crash by saving 100,000+ logs, directly compute results and insert them.
        # ####################################################################################

        for job_log in current_job_logs:
            # add additional information from the job-session itself
            job_log["jobId"] = row.get("jobId", None)
            job_log["jobName"] = row.get("jobName", None)
            job_log["jobExecutionTime"] = row.get("start", None)

            # rename for clarity
            job_log["jobLogId"] = job_log.pop("id", None)
            job_log["jobSessionId"] = job_log.pop("jobsessionId", None)

        # ##########################################################
        # compute jobLog stats into each associated table
        # ##########################################################
        try:
            self.__job_logs_to_stats(current_job_logs)
        except ValueError as error:
            ExceptionUtils.exception_info(
                error,
                extra_message=f"Failed to parse jobLogs into their own table, skipping for jobsessionId {job_session_id}")

        logs_to_stats_total += job_log_count

        # ##########################################################
        # save logs within the joblog dump
        # ##########################################################

        # Only dump them after computing stats, since they are read while computing the stats
        for job_log in current_job_logs:
            # dump message params to allow saving as string
            job_log["messageParams"] = json.dumps(job_log["messageParams"])

        # if the list is empty (due to being erased etc.) it will simply return and do nothing
        self.__influx_client.insert_dicts_to_buffer(
            list_with_dicts=current_job_logs, table_name="jobLogs")

        # shallow copy dict to allow an update without errors
        copied_jobsession = dict(row.items())

        # update job table and set jobsLogsStored = True, jobLogsCount = len(jobLogDetails)
        update_fields = {
            "jobLogsCount": job_log_count,
            "jobsLogsStored": True
        }
        # update the fields
        for (key, value) in update_fields.items():
            copied_jobsession[key] = value
        job_update_list.append(copied_jobsession)

        # ##########################################################
        # End of For-Each
        # ##########################################################

    # ##########################################################
    # Delete each job, then re-insert
    # ##########################################################

    # Delete all jobs which got requested, no matter if they failed
    delete_query = SelectionQuery(
        keyword=Keyword.DELETE,
        tables=[table],
        where_str=where_str)

    # now send the remove query to prevent data loss
    self.__influx_client.send_selection_query(delete_query)  # type: ignore

    # Insert data after everything is completed
    self.__influx_client.insert_dicts_to_buffer(table.name, job_update_list)

    if(logs_requested_total != logs_to_stats_total):
        LOGGER.info(f"> Requested a total of {logs_requested_total} but only computed {logs_to_stats_total} into sppmon statistics")
    else:
        LOGGER.info(f">>> requested and computed a total of {logs_requested_total} logs")

    LOGGER.info(f">> Updated a total of {len(job_update_list)} jobs")
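
# A minimal, standalone sketch of the timestamp de-duplication trick used for
# 'vmBackupSummary' rows in `__job_logs_to_stats` above: when reducing a timestamp to
# second precision collides with the previously inserted row, the seconds value is bumped
# by one and scaled back to the original precision, so InfluxDB does not discard the point
# as a duplicate. `make_unique_timestamp` is a hypothetical helper for illustration only;
# the real code uses SppUtils.to_epoch_secs for the precision conversion.
from typing import Tuple


def make_unique_timestamp(cur_timestamp: int, max_sec_timestamp: int) -> Tuple[int, int]:
    """Returns (possibly shifted timestamp, updated max second-precision timestamp)."""
    # crude ms-vs-s detection, standing in for SppUtils.to_epoch_secs (assumption)
    cur_sec_timestamp = cur_timestamp // 1000 if cur_timestamp > 10**10 else cur_timestamp
    if(cur_sec_timestamp <= max_sec_timestamp):
        digits = cur_timestamp // cur_sec_timestamp  # 1 for seconds, 1000 for milliseconds
        max_sec_timestamp += 1                       # shift by one full second
        cur_timestamp = max_sec_timestamp * digits
    else:
        max_sec_timestamp = cur_sec_timestamp
    return (cur_timestamp, max_sec_timestamp)

# two logs captured within the same second (millisecond precision):
# make_unique_timestamp(1612345678100, 1612345678) -> (1612345679000, 1612345679)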
def get_objects(self,
                endpoint: str = None, uri: str = None,
                array_name: str = None,
                white_list: List[str] = None, ignore_list: List[str] = None,
                add_time_stamp: bool = False) -> List[Dict[str, Any]]:
    """Queries a response(-list) from a REST-API endpoint or URI.

    Specify `array_name` if there are multiple results / a list.
    Use `white_list` to pick only the values specified.
    Use `ignore_list` to pick everything but the values specified.
    Both: white_list items overwrite ignore_list items; anything not filtered is still returned.

    Note:
    Do not specify both endpoint and uri, only uri will be used.

    Keyword Arguments:
        endpoint {str} -- endpoint to be queried. Either use this or uri (default: {None})
        uri {str} -- uri to be queried. Either use this or endpoint (default: {None})
        array_name {str} -- name of the array if multiple results are wanted (default: {None})
        white_list {list} -- list of items to query (default: {None})
        ignore_list {list} -- query all but these items(-groups). (default: {None})
        add_time_stamp {bool} -- whether to add the capture timestamp (default: {False})

    Raises:
        ValueError: Neither an endpoint nor a uri is specified
        ValueError: Negative or zero page size
        ValueError: array_name is specified but the result is only a single object

    Returns:
        {List[Dict[str, Any]]} -- List of dictionaries with the results
    """
    if(not endpoint and not uri):
        raise ValueError("neither endpoint nor uri specified")
    if(endpoint and uri):
        LOGGER.debug("added both endpoint and uri. This is unnecessary, endpoint is ignored")
    # if neither is specified, get everything
    if(not white_list and not ignore_list):
        ignore_list = []

    # create uri out of endpoint
    if(not uri):
        next_page = self.__srv_url + endpoint
    else:
        next_page = uri

    result_list: List[Dict[str, Any]] = []

    # Aborts if no nextPage is found
    while(next_page):
        LOGGER.debug(f"Collected {len(result_list)} items until now. Next page: {next_page}")
        if(self.__verbose):
            LOGGER.info(f"Collected {len(result_list)} items until now. Next page: {next_page}")
        # Request response
        (response, send_time) = self.__query_url(url=next_page)

        # find the follow-up page if available and set it
        (_, next_page_link) = SppUtils.get_nested_kv(key_name="links.nextPage.href", nested_dict=response)
        next_page = next_page_link

        # Check if single object or not
        if(array_name):
            # get results for this page, if empty nothing happens
            page_result_list: Optional[List[Dict[str, Any]]] = response.get(array_name, None)
            if(page_result_list is None):
                raise ValueError("array_name does not exist, this is probably a single object")
        else:
            page_result_list = [response]

        filtered_results = ConnectionUtils.filter_values_dict(
            result_list=page_result_list,
            white_list=white_list,
            ignore_list=ignore_list)

        if(add_time_stamp):  # direct time add to make the timestamps represent the real capture time
            for mydict in filtered_results:
                time_key, time_val = SppUtils.get_capture_timestamp_sec()
                mydict[time_key] = time_val
        result_list.extend(filtered_results)

        # adjust page size
        if(send_time > self.__preferred_time or len(page_result_list) == self.__page_size):
            self.__page_size = ConnectionUtils.adjust_page_size(
                page_size=len(page_result_list),
                min_page_size=self.__min_page_size,
                preferred_time=self.__preferred_time,
                send_time=send_time)

    LOGGER.debug("objectList size %d", len(result_list))
    return result_list
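
# Hedged usage sketch for `get_objects` above. The endpoint path, the array name and the
# whitelisted field names are illustrative assumptions and are not taken from the SPP
# REST-API documentation; only the parameter names of `get_objects` itself come from the
# signature above.
from typing import Any, Dict, List


def example_fetch_jobsessions(rest_client) -> List[Dict[str, Any]]:
    """Illustrative only: page through an endpoint and keep a handful of fields."""
    return rest_client.get_objects(
        endpoint="/api/endeavour/jobsession",                     # assumed endpoint path
        array_name="sessions",                                    # assumed result-array name
        white_list=["id", "jobName", "start", "end", "status"],   # assumed field names
        add_time_stamp=True)                                      # stamp rows with capture time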
def store_script_metrics(self) -> None:
    """Stores script metrics into influxdb. To be called before exit.

    Does not raise any exceptions, skips if influxdb is missing.
    """
    LOGGER.info("Storing script metrics")
    try:
        if(not self.influx_client):
            raise ValueError("no influxClient set up")
        insert_dict: Dict[str, Union[str, int, float, bool]] = {}

        # add version nr, api calls are needed
        insert_dict["sppmon_version"] = VERSION
        insert_dict["influxdb_version"] = self.influx_client.version
        if(self.rest_client):
            try:
                (version_nr, build) = self.rest_client.get_spp_version_build()
                insert_dict["spp_version"] = version_nr
                insert_dict["spp_build"] = build
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error, extra_message="could not query SPP version and build.")

        # end of total sppmon runtime
        end_counter = time.perf_counter()
        insert_dict['duration'] = int((end_counter - self.start_counter) * 1000)

        # add arguments of sppmon
        for (key, value) in vars(ARGS).items():
            # Value is either a string, True, or False/None
            if(value):
                insert_dict[key] = value

        # save occurred errors
        error_count = len(ExceptionUtils.stored_errors)
        if(error_count > 0):
            ExceptionUtils.error_message(f"total of {error_count} exception/s occurred")
        insert_dict['errorCount'] = error_count
        # save list as str if not empty
        if(ExceptionUtils.stored_errors):
            insert_dict['errorMessages'] = str(ExceptionUtils.stored_errors)

        # get end timestamp
        (time_key, time_val) = SppUtils.get_capture_timestamp_sec()
        insert_dict[time_key] = time_val

        # save the metrics
        self.influx_client.insert_dicts_to_buffer(
            table_name="sppmon_metrics", list_with_dicts=[insert_dict])
        self.influx_client.flush_insert_buffer()
        LOGGER.info("Stored script metrics successfully")

        # + 1 due to the "total of x exception/s occurred" message
        if(error_count + 1 < len(ExceptionUtils.stored_errors)):
            ExceptionUtils.error_message(
                "A non-critical error occurred while storing script metrics. \n\
                This error can't be saved into the DB, it's only displayed within the logs.")
    except ValueError as error:
        ExceptionUtils.exception_info(
            error=error,
            extra_message="Error when storing sppmon-metrics, skipping this step. Possible insert-buffer data loss")
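
# Minimal sketch of the runtime measurement pattern used by `store_script_metrics` above:
# time.perf_counter() is sampled once at startup (stored as self.start_counter in sppmon)
# and once at the end; the difference is converted to whole milliseconds before being
# written into the 'duration' field of the sppmon_metrics table.
import time

start_counter = time.perf_counter()   # taken once at script start
# ... monitoring work would happen here ...
duration_ms = int((time.perf_counter() - start_counter) * 1000)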
def set_optional_configs(self, config_file: Dict[str, Any]) -> None:
    """Sets up any optional infrastructure, to be called within the init.

    Be aware that not everything may be initialized at call time.
    Add config here if the system should not abort when it is missing.

    Arguments:
        config_file {Dict[str, Any]} -- Opened config file
    """

    if(not config_file):
        ExceptionUtils.error_message("missing or empty config file, aborting.")
        self.exit(error_code=ERROR_CODE_START_ERROR)
    if(not self.influx_client):
        ExceptionUtils.error_message("Influx client is somehow missing. aborting")
        self.exit(error_code=ERROR_CODE)

    # ############################ REST-API #####################################
    try:
        ConnectionUtils.verbose = ARGS.verbose
        # ### Loaded Systems part 1/2 ### #
        if(ARGS.minimumLogs or ARGS.loadedSystem):
            # Setting pagesize scaling settings
            ConnectionUtils.timeout_reduction = self.loaded_timeout_reduction
            ConnectionUtils.allowed_send_delta = self.loaded_allowed_send_delta
            ConnectionUtils.max_scaling_factor = self.loaded_max_scaling_factor

            # Setting RestClient request settings.
            self.rest_client = RestClient(
                config_file=config_file,
                initial_connection_timeout=self.initial_connection_timeout,
                pref_send_time=self.loaded_pref_send_time,
                request_timeout=self.loaded_request_timeout,
                max_send_retries=self.loaded_max_send_retries,
                starting_page_size=self.loaded_starting_page_size,
                min_page_size=self.loaded_min_page_size,
                verbose=ARGS.verbose)
        else:
            ConnectionUtils.timeout_reduction = self.timeout_reduction
            ConnectionUtils.allowed_send_delta = self.allowed_send_delta
            ConnectionUtils.max_scaling_factor = self.max_scaling_factor

            # Setting RestClient request settings.
            self.rest_client = RestClient(
                config_file=config_file,
                initial_connection_timeout=self.initial_connection_timeout,
                pref_send_time=self.pref_send_time,
                request_timeout=self.request_timeout,
                max_send_retries=self.max_send_retries,
                starting_page_size=self.starting_page_size,
                min_page_size=self.min_page_size,
                verbose=ARGS.verbose)

        self.api_queries = ApiQueries(self.rest_client)
        if(not self.ignore_setup):
            # delay the connect into the testing phase
            self.rest_client.login()

    except ValueError as error:
        ExceptionUtils.exception_info(
            error=error, extra_message="REST-API is not available due to a config error")
        # Required to declare the variables
        self.rest_client = None
        self.api_queries = None

    # ######################## System, Job and Hypervisor Methods ##################
    try:
        # explicitly ahead due to dependency
        self.system_methods = SystemMethods(self.influx_client, self.api_queries, ARGS.verbose)
    except ValueError as error:
        ExceptionUtils.exception_info(error=error)

    # ### Full Logs ### #
    if(ARGS.fullLogs):
        given_log_types = self.full_joblog_types
    else:
        given_log_types = self.joblog_types

    try:
        auth_rest: Dict[str, Any] = SppUtils.get_cfg_params(
            param_dict=config_file, param_name="sppServer")  # type: ignore
        # TODO DEPRECATED, TO BE REMOVED IN 1.1
        self.job_log_retention_time = auth_rest.get(
            "jobLog_rentation", auth_rest.get("jobLog_retention", self.job_log_retention_time))
        # TODO New once 1.1 is live
        # self.job_log_retention_time = auth_rest.get("jobLog_retention", self.job_log_retention_time)

        self.job_methods = JobMethods(
            self.influx_client, self.api_queries, self.job_log_retention_time,
            given_log_types, ARGS.verbose)
    except ValueError as error:
        ExceptionUtils.exception_info(error=error)

    try:
        # dependent on system methods
        self.protection_methods = ProtectionMethods(self.system_methods, self.influx_client,
                                                    self.api_queries, ARGS.verbose)
    except ValueError as error:
        ExceptionUtils.exception_info(error=error)

    # ############################### SSH #####################################
    if(self.ssh and not self.ignore_setup):
        try:
            # set from None to methods once finished
            self.ssh_methods = SshMethods(
                influx_client=self.influx_client,
                config_file=config_file,
                verbose=ARGS.verbose)

        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message="SSH commands are not available due to a config error")
            # Variable needs to be declared
            self.ssh_methods = None
    else:
        # Variable needs to be declared
        self.ssh_methods = None
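
# Hedged sketch of the "sppServer" config section read above via SppUtils.get_cfg_params.
# Only the "jobLog_retention" key and its deprecated misspelling "jobLog_rentation" are
# confirmed by the code; the address and credential key names are assumptions for
# illustration and may differ from the real config layout.
from typing import Any, Dict

example_spp_server_config: Dict[str, Any] = {
    "sppServer": {
        "username": "sppmon-user",         # assumed key name
        "password": "********",            # assumed key name
        "srv_address": "spp.example.com",  # assumed key name
        "srv_port": 443,                   # assumed key name
        "jobLog_retention": "60d",         # preferred key once 1.1 is live
        "jobLog_rentation": "60d",         # deprecated key, still honored until 1.1
    },
}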
def set_optional_configs(self, config_file: Dict[str, Any]) -> None:
    """Sets up any optional infrastructure, to be called within the init.

    Be aware that not everything may be initialized at call time.
    Add config here if the system should not abort when it is missing.

    Arguments:
        config_file {Dict[str, Any]} -- Opened config file
    """

    if(not config_file):
        ExceptionUtils.error_message("missing or empty config file, aborting.")
        self.exit(error_code=ERROR_CODE_CMD_LINE)

    # ############################ REST-API #####################################
    try:
        auth_rest = SppUtils.get_cfg_params(param_dict=config_file, param_name="sppServer")
        if(not isinstance(auth_rest, dict)):
            raise ValueError("sppServer config needs to be a dict")

        self.job_log_retention_time = auth_rest.get("jobLog_rentation", "60d")

        ConnectionUtils.verbose = OPTIONS.verbose
        ConnectionUtils.timeout_reduction = self.timeout_reduction
        ConnectionUtils.allowed_time_diff_quota = self.allowed_time_diff_quota
        ConnectionUtils.maximum_increase_pagesize = self.maximum_increase_pagesize

        if(OPTIONS.minimumLogs):
            rest_time_out = self.minimum_timeout
            rest_preferred_time = self.loaded_preferred_time
        else:
            rest_time_out = self.default_timeout
            rest_preferred_time = self.preferred_time

        self.rest_client = RestClient(auth_rest, rest_time_out, rest_preferred_time,
                                      self.page_size, self.min_page_size,
                                      self.send_retries, OPTIONS.verbose)

        self.api_queries = ApiQueries(self.rest_client)
        self.rest_client.login()

    except ValueError as error:
        ExceptionUtils.exception_info(
            error=error, extra_message="REST-API is not available due to a config error")
        self.rest_client = None
        self.api_queries = None

    # ######################## System, Job and Hypervisor Methods ##################
    try:
        # explicitly ahead due to dependency
        self.system_methods = SystemMethods(self.influx_client, self.api_queries, OPTIONS.verbose)
    except ValueError as error:
        ExceptionUtils.exception_info(error=error)

    try:
        self.job_methods = JobMethods(
            self.influx_client, self.api_queries, self.job_log_retention_time,
            self.minLogs_joblog_type, self.default_joblog_type,
            OPTIONS.verbose, OPTIONS.minimumLogs)
    except ValueError as error:
        ExceptionUtils.exception_info(error=error)

    try:
        # dependent on system methods
        self.hypervisor_methods = ProtectionMethods(self.system_methods, self.influx_client,
                                                    self.api_queries, OPTIONS.verbose)
    except ValueError as error:
        ExceptionUtils.exception_info(error=error)

    # ############################### SSH #####################################
    if(self.ssh or self.process_stats):
        try:
            auth_ssh = SppUtils.get_cfg_params(param_dict=config_file, param_name="sshclients")

            ssh_clients: List[SshClient] = []
            if(not isinstance(auth_ssh, list)):
                raise ValueError("not a list of ssh configs given", auth_ssh)

            for client_ssh in auth_ssh:
                try:
                    ssh_clients.append(SshClient(client_ssh))
                except ValueError as error:
                    ExceptionUtils.exception_info(
                        error=error,
                        extra_message=f"Setting up one client failed, skipping it. Client: \
                        {client_ssh.get('name', 'ERROR WHEN GETTING NAME')}")

            # set from None to methods once finished
            self.ssh_methods = SshMethods(
                influx_client=self.influx_client,
                ssh_clients=ssh_clients,
                verbose=OPTIONS.verbose)

        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message="SSH commands are not available due to a config error")
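
# Hedged sketch of the "sshclients" config section consumed above: the code only guarantees
# that it is a list of dicts and that an entry may carry a "name" key (used in the error
# message); every other key shown here is an assumption for illustration and may not match
# what SshClient actually expects.
from typing import Any, Dict, List

example_ssh_clients_config: List[Dict[str, Any]] = [
    {
        "name": "vsnap-1",           # referenced by the error message above
        "srv_address": "10.0.0.5",   # assumed key name
        "username": "sppmon",        # assumed key name
        "password": "********",      # assumed key name
        "type": "vsnap",             # assumed key name, matching an SshTypes value
    },
]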