def storages(self) -> None:
    """Saves all storages, such as vSnaps."""
    table_name = 'storages'

    # deactivate verbose to avoid double print
    result = MethodUtils.query_something(
        name=table_name,
        source_func=self.__api_queries.get_storages,
        deactivate_verbose=True
    )

    # add calculated extra info
    for row in result:
        row['siteName'] = self.__system_methods.site_name_by_id(row['site'])
        if('free' in row and 'total' in row
                and row['free'] > 0 and row['total'] > 0):
            row['used'] = row['total'] - row['free']
            row['pct_free'] = row['free'] / row['total'] * 100
            row['pct_used'] = row['used'] / row['total'] * 100

    if(self.__verbose):
        MethodUtils.my_print(data=result)
    LOGGER.info(">> inserting storage info into database")
    self.__influx_client.insert_dicts_to_buffer(table_name=table_name, list_with_dicts=result)
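# A worked example of the derived storage fields above (hedged sketch; the
# numbers are illustrative, not taken from a real vSnap):
#
#   row = {'site': '1000', 'free': 250, 'total': 1000}
#   row['used']     = 1000 - 250          # -> 750
#   row['pct_free'] = 250 / 1000 * 100    # -> 25.0
#   row['pct_used'] = 750 / 1000 * 100    # -> 75.0
#
# Rows with a missing or non-positive 'free'/'total' only get 'siteName' added.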
def sla_dumps(self) -> None:
    """Captures and saves SLA subpolicies."""
    # capture and display / store SLA dumps
    sla_dump_list = MethodUtils.query_something(
        name="slaDumps",
        source_func=self.__api_queries.get_sla_dump,
        rename_tuples=[
            ("id", "slaId"),
            ("subpolicy", "slaDump"),
            ("name", "slaName")
        ]
    )
    LOGGER.info(">> updating slaStats table with dump of SLA subpolicy")
    table_name = "slaStats"
    for row in sla_dump_list:
        sla_dump = row['slaDump']
        time_stamp = row[SppUtils.capture_time_key]
        sla_id = row['slaId']
        tag_dic = {}
        field_dic = {'slaDump': sla_dump}
        self.__influx_client.update_row(
            table_name=table_name,
            tag_dic=tag_dic,
            field_dic=field_dic,
            where_str="time = {}ms AND slaId = \'{}\'".format(time_stamp, sla_id)
        )
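# For illustration (hedged; the values are made up): with
# time_stamp = 1618243200000 and sla_id = '2101', the rendered where clause
# of the update above would be
#
#   time = 1618243200000ms AND slaId = '2101'
#
# i.e. the row is matched by its exact capture timestamp (ms precision) plus
# its SLA id before the 'slaDump' field is rewritten.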
def vadps(self) -> None:
    """Requests and stores all VADP proxies from the SPP-server."""
    table_name = 'vadps'
    result = MethodUtils.query_something(
        name=table_name,
        source_func=self.__api_queries.get_vadps,
        rename_tuples=[
            ('id', 'vadpId'),
            ('displayName', 'vadpName')
        ],
        deactivate_verbose=True
    )
    for row in result:
        row['siteName'] = self.__system_methods.site_name_by_id(row['siteId'])
    if(self.__verbose):
        MethodUtils.my_print(result)
    self.__influx_client.insert_dicts_to_buffer(table_name=table_name, list_with_dicts=result)
def sppcatalog(self) -> None:
    """Saves the SPP file system catalog information."""
    result = MethodUtils.query_something(
        name="sppcatalog stats",
        source_func=self.__api_queries.get_file_system,
        deactivate_verbose=True)
    value_renames = {
        'Configuration': "Configuration",
        'Search': "File",
        'System': "System",
        'Catalog': "Recovery"
    }
    for row in result:
        row['name'] = value_renames[row['name']]
    if (self.__verbose):
        MethodUtils.my_print(result)
    self.__influx_client.insert_dicts_to_buffer("sppcatalog", result)
def vms_per_sla(self) -> None:
    """Calculates the number of VMs per SLA. Hypervisors are not supported yet."""
    LOGGER.info("> calculating number of VMs per SLA")
    result = MethodUtils.query_something(
        name="VMs per SLA",
        source_func=self.__api_queries.get_vms_per_sla
    )
    LOGGER.info(">> inserting number of VMs per SLA into DB")
    self.__influx_client.insert_dicts_to_buffer(
        table_name="slaStats", list_with_dicts=result)
def __exec_save_commands(self, ssh_type: SshTypes, command_list: List[SshCommand]) -> None:
    """Helper method; executes and saves all commands via ssh for all clients of the given type.

    Needed because each result is saved individually, plus the verbose print.
    This functionality is not integrated into MethodUtils because the
    InfluxClient is missing in a static context.

    Arguments:
        ssh_type {SshTypes} -- all clients of this type are going to be queried
        command_list {List[SshCommand]} -- list of commands to be executed on all clients.
    """
    result_tuples = MethodUtils.ssh_execute_commands(
        ssh_clients=self.__ssh_clients,
        ssh_type=ssh_type,
        command_list=command_list)
    for (table_name, insert_list) in result_tuples:
        if (self.__verbose):
            MethodUtils.my_print(insert_list)
        self.__influx_client.insert_dicts_to_buffer(
            table_name=table_name, list_with_dicts=insert_list)
def cpuram(self) -> None:
    """Saves the CPU and RAM usage of the SPP system."""
    table_name = 'cpuram'
    result = MethodUtils.query_something(
        name=table_name,
        rename_tuples=[
            ('data.size', 'dataSize'),
            ('data.util', 'dataUtil'),
            ('data2.size', 'data2Size'),
            ('data2.util', 'data2Util'),
            ('data3.size', 'data3Size'),
            ('data3.util', 'data3Util'),
            ('memory.size', 'memorySize'),
            ('memory.util', 'memoryUtil'),
        ],
        source_func=self.__api_queries.get_server_metrics)
    self.__influx_client.insert_dicts_to_buffer(table_name=table_name, list_with_dicts=result)
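# For context (hedged sketch, inferred from the rename tuples above): the
# server metrics arrive with flattened dotted keys, which are renamed into
# camelCase field names before insertion, e.g.
#
#   {'data.size': 858993459200, 'data.util': 42949672960, ...}
#       -> {'dataSize': 858993459200, 'dataUtil': 42949672960, ...}
#
# The example values are illustrative only.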
def store_vms(self) -> None:
    """Stores all VM stats individually.

    Those are reused later to compute vm_stats.
    """
    all_vms_list = MethodUtils.query_something(
        name="all VMs",
        source_func=self.__api_queries.get_all_vms,
        rename_tuples=[
            ("properties.datacenter.name", "datacenterName")
        ],
        deactivate_verbose=True)
    if(self.__verbose):
        LOGGER.info(f"found {len(all_vms_list)} VMs.")
    self.__influx_client.insert_dicts_to_buffer(
        table_name="vms",
        list_with_dicts=all_vms_list
    )
def sites(self) -> None:
    """Collects all site information, including the throttle rate.

    This data does not contain much statistical information.
    It should only be called if new sites were added or changed.
    """
    table_name = 'sites'
    result = MethodUtils.query_something(
        name=table_name,
        source_func=self.__api_queries.get_sites,
        rename_tuples=[('id', 'siteId'),
                       ('name', 'siteName'),
                       ('throttles', 'throttleRates')])
    LOGGER.debug(f"sites: {result}")

    # save results into internal storage to avoid an additional request for IDs
    # used instead of `site_name_by_id`
    for row in result:
        self.__site_name_dict[row['siteId']] = row['siteName']
        # explicit None check since [] also needs to be converted into str
        if (row['throttleRates'] is not None):
            row['throttleRates'] = str(row['throttleRates'])

    self.__influx_client.insert_dicts_to_buffer(table_name=table_name, list_with_dicts=result)
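# Why the explicit None check above matters (illustrative sketch):
#
#   bool([])            # -> False, so a truthiness check would skip it
#   [] is not None      # -> True,  so '[]' is still stored as a string
#
# An empty throttle list is real data and must survive the str() conversion;
# only a genuinely absent value (None) is left untouched.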
def create_inventory_summary(self) -> None:
    """Retrieves and calculates the VM inventory summary from influx catalog data."""

    LOGGER.info(
        "> computing inventory information (not from catalog, meaning not only backup data is calculated)")

    # ########## Part 1: Check if something needs to be computed #############
    # query the timestamp of the last vm; a field is always needed by influx rules, hence 'commited'.
    # Note: 'commited' (sic) is the field name as stored in the database.
    vms_table = self.__influx_client.database["vms"]
    time_query = SelectionQuery(
        keyword=Keyword.SELECT,
        tables=[vms_table],
        fields=['time', 'commited'],
        limit=1,
        order_direction="DESC"
    )
    result = self.__influx_client.send_selection_query(time_query)  # type: ignore
    last_vm: Dict[str, Any] = next(result.get_points(), None)  # type: ignore

    if(not last_vm):
        raise ValueError("no VMs stored; either none are available or you have to store VMs first")

    # query the last vm stats to compare timestamps with the last vm
    last_time_ms: int = last_vm["time"]
    last_time = SppUtils.epoch_time_to_seconds(last_time_ms)
    where_str = "time = {}s".format(last_time)

    vm_stats_table = self.__influx_client.database["vmStats"]
    vm_stats_query = SelectionQuery(
        keyword=Keyword.SELECT,
        tables=[vm_stats_table],
        fields=['*'],
        where_str=where_str,
        limit=1
    )
    result = self.__influx_client.send_selection_query(vm_stats_query)  # type: ignore
    if(len(list(result.get_points())) > 0):  # type: ignore
        LOGGER.info(">> vm statistics already computed, skipping")
        return

    # ####################### Part 2: Compute new data ####################
    fields = [
        'uptime',
        'powerState',
        'commited',
        'uncommited',
        'memory',
        'host',
        'vmVersion',
        'isProtected',
        'inHLO',
        'isEncrypted',
        'datacenterName',
        'hypervisorType',
    ]
    query = SelectionQuery(
        keyword=Keyword.SELECT,
        tables=[vms_table],
        fields=fields,
        where_str=where_str
    )
    result = self.__influx_client.send_selection_query(query)  # type: ignore

    all_vms_list: List[Dict[str, Union[str, int, float, bool]]] = list(result.get_points())  # type: ignore

    # skip if no new data can be computed
    if(not all_vms_list):
        raise ValueError("no VMs stored; either none are available or store VMs first")

    vm_stats: Dict[str, Any] = {}
    try:
        vm_stats['vmCount'] = len(all_vms_list)

        # returns largest/smallest
        vm_stats['vmMaxSize'] = max(all_vms_list, key=(lambda mydict: mydict['commited']))['commited']
        # on purpose: zero-size VMs are ignored
        vms_no_null_size = list(filter(lambda mydict: mydict['commited'] > 0, all_vms_list))
        if(vms_no_null_size):
            vm_stats['vmMinSize'] = min(vms_no_null_size, key=(lambda mydict: mydict['commited']))['commited']
        vm_stats['vmSizeTotal'] = sum(mydict['commited'] for mydict in all_vms_list)
        vm_stats['vmAvgSize'] = vm_stats['vmSizeTotal'] / vm_stats['vmCount']

        # returns largest/smallest
        vm_stats['vmMaxUptime'] = max(all_vms_list, key=(lambda mydict: mydict['uptime']))['uptime']
        # on purpose: zero-uptime VMs are ignored
        vms_no_null_time = list(filter(lambda mydict: mydict['uptime'] > 0, all_vms_list))
        if(vms_no_null_time):
            vm_stats['vmMinUptime'] = min(vms_no_null_time, key=(lambda mydict: mydict['uptime']))['uptime']
        vm_stats['vmUptimeTotal'] = sum(mydict['uptime'] for mydict in all_vms_list)
        vm_stats['vmAvgUptime'] = vm_stats['vmUptimeTotal'] / vm_stats['vmCount']

        vm_stats['vmCountProtected'] = len(list(filter(lambda mydict: mydict['isProtected'] == "True", all_vms_list)))
        vm_stats['vmCountUnprotected'] = vm_stats['vmCount'] - vm_stats['vmCountProtected']
        vm_stats['vmCountEncrypted'] = len(list(filter(lambda mydict: mydict['isEncrypted'] == "True", all_vms_list)))
        vm_stats['vmCountPlain'] = vm_stats['vmCount'] - vm_stats['vmCountEncrypted']
        vm_stats['vmCountHLO'] = len(list(filter(lambda mydict: mydict['inHLO'] == "True", all_vms_list)))
        vm_stats['vmCountNotHLO'] = vm_stats['vmCount'] - vm_stats['vmCountHLO']

        vm_stats['vmCountVMware'] = len(list(filter(lambda mydict: mydict['hypervisorType'] == "vmware", all_vms_list)))
        vm_stats['vmCountHyperV'] = len(list(filter(lambda mydict: mydict['hypervisorType'] == "hyperv", all_vms_list)))

        vm_stats['nrDataCenters'] = len(set(map(lambda vm: vm['datacenterName'], all_vms_list)))
        vm_stats['nrHosts'] = len(set(map(lambda vm: vm['host'], all_vms_list)))

        vm_stats['time'] = all_vms_list[0]['time']

        if self.__verbose:
            MethodUtils.my_print([vm_stats])

    except (ZeroDivisionError, AttributeError, KeyError, ValueError) as error:
        ExceptionUtils.exception_info(error=error)
        raise ValueError("error when computing extra vm stats", vm_stats)

    LOGGER.info(">> store vmInventory information in Influx DB")
    self.__influx_client.insert_dicts_to_buffer("vmStats", [vm_stats])
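# A hedged sketch of the aggregation pattern used above, on toy data.
# The name `sample_vms` and its values are illustrative only:
#
#   sample_vms = [{'commited': 0}, {'commited': 40}, {'commited': 100}]
#   max(sample_vms, key=lambda d: d['commited'])['commited']   # -> 100
#   nonzero = [d for d in sample_vms if d['commited'] > 0]
#   min(nonzero, key=lambda d: d['commited'])['commited']      # -> 40, zero-size VMs ignored
#   sum(d['commited'] for d in sample_vms) / len(sample_vms)   # -> ~46.7 (average includes zeros)
#
# Note the asymmetry: the minimum skips zero values, while total and average include them.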
class JobMethods:
    """Wrapper for all job-related functionality. You may implement new methods in here.

    Methods:
        get_all_jobs - incrementally saves all stored jobsessions, even before the first execution of sppmon.
        job_logs - saves all jobLogs for the jobsessions in the influx catalog.
    """

    # kept for later use, currently unused
    __job_log_allow_list = [
        "CTGGA2340",
        "CTGGA0071",
        "CTGGA2260",
        "CTGGA2315",
        "CTGGA0550",
        "CTGGA2384"
    ]

    # to be moved somewhere else
    # ######### Add new logs to be parsed here #######################################
    # Structure:
    # Dict with the messageID of the log as key.
    # The value is a tuple of
    #   #1 the table name
    #   #2 a lambda which maps each param to a name. Must accept at least one argument!
    #   #3 a list of keys for additional information to be saved: (#1: key, #2: rename).
    #      Part #2 is optional, only needed for a rename.
    # The values are delivered by the param_list of the joblog.
    # If a value is something like "10sec" or "10gb", use `parse_unit` to parse it.
    __supported_ids: Dict[str, Tuple[str, Callable[[List[Any]], Dict[str, Any]], List[Union[Tuple[str, str], str]]]] = {
        'CTGGA2384': (
            'vmBackupSummary',
            lambda params: {
                "name": params[0],
                "proxy": params[1],
                "vsnaps": params[2],
                "type": params[3],
                "transportType": params[4],
                "transferredBytes": SppUtils.parse_unit(params[5]),
                "throughputBytes/s": SppUtils.parse_unit(params[6]),
                "queueTimeSec": SppUtils.parse_unit(params[7]),
                "protectedVMDKs": params[8],
                "TotalVMDKs": params[9],
                "status": params[10]
            },
            ["messageId"]  # additional information from the job-message itself
        ),
        'CTGGA0071': (
            'vmBackupSummary',
            lambda params: {
                'protectedVMDKs': params[0],
                'TotalVMDKs': int(params[1]) + int(params[0]),
                'transferredBytes': SppUtils.parse_unit(params[2]),
                'throughputBytes/s': SppUtils.parse_unit(params[3]),
                'queueTimeSec': SppUtils.parse_unit(params[4])
            },
            ["messageId"]),
        'CTGGA0072': (
            'vmReplicateSummary',
            lambda params: {
                'total': params[0],
                'failed': params[1],
                'duration': SppUtils.parse_unit(params[2])
            },
            []),
        'CTGGA0398': (
            'vmReplicateStats',
            lambda params: {
                'replicatedBytes': SppUtils.parse_unit(params[0]),
                'throughputBytes/sec': SppUtils.parse_unit(params[1]),
                'duration': SppUtils.parse_unit(params[2], delimiter=':')
            },
            []),
        'CTGGR0003': (
            'office365Stats',
            lambda params: {
                'imported365Users': int(params[0]),
            },
            [  # additional information from the job-message itself, including renames
                "jobId",
                "jobSessionId",
                "jobName",
                "jobExecutionTime"  # used to instantly integrate with other stats
            ]),
        'CTGGA2444': (
            'office365Stats',
            lambda params: {
                'protectedItems': int(params[0]),
                'selectedItems': int(params[0]),
            },
            [
                "jobId",
                "jobSessionId",
                "jobName",
                "jobExecutionTime"  # used to instantly integrate with other stats
            ]),
        'CTGGA2402': (
            'office365TransfBytes',
            lambda params:
                # if not matching, this returns an empty dict which is going to be ignored
                MethodUtils.joblogs_parse_params(
                    r"(\w+)\s*\(Server:\s*([^\s,]+), Transfer Size: (\d+(?:.\d*)?\s*\w*)\)",
                    params[1],
                    lambda match_list: {
                        "itemName": params[0],
                        "itemType": match_list[1],
                        "serverName": match_list[2],
                        "transferredBytes": SppUtils.parse_unit(match_list[3]),
                    }),
            ["jobId", "jobSessionId", "jobName"]),
    }
    """JobLog messageIDs which can be parsed by sppmon. Check the detailed summary above the declaration."""
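    # A hedged sketch of how a new entry could look. 'CTGGA9999', 'myNewTable'
    # and the params are hypothetical, purely to illustrate the tuple structure:
    #
    #   'CTGGA9999': (
    #       'myNewTable',                               # #1 table name
    #       lambda params: {                            # #2 message params -> field dict
    #           'someCount': int(params[0]),
    #           'someBytes': SppUtils.parse_unit(params[1]),
    #       },
    #       ["jobId", ("storedName", "jobLogKey")]      # #3 extra keys from the joblog itself;
    #   ),                                              #    a tuple stores job_log["jobLogKey"]
    #                                                   #    under the name "storedName"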
    def __init__(self, influx_client: Optional[InfluxClient], api_queries: Optional[ApiQueries],
                 job_log_retention_time: str, job_log_types: List[str], verbose: bool):
        if (not influx_client):
            raise ValueError("Job Methods are not available, missing influx_client")
        if (not api_queries):
            raise ValueError("Job Methods are not available, missing api_queries")

        self.__influx_client = influx_client
        self.__api_queries = api_queries
        self.__verbose = verbose

        self.__job_log_retention_time = job_log_retention_time
        """used to limit the time jobLogs are queried; only interesting for the init call"""

        self.__job_log_types = job_log_types

    def get_all_jobs(self) -> None:
        """Incrementally saves all stored jobsessions, even before the first execution of sppmon."""

        job_list = MethodUtils.query_something(
            name="job list", source_func=self.__api_queries.get_job_list)

        for job in job_list:
            job_id = job.get("id", None)
            job_name = job.get("name", None)

            # this way to make sure we also catch empty strings
            if (not job_id or not job_name):
                ExceptionUtils.error_message(f"skipping, missing name or id for job {job}")
                continue
            LOGGER.info(">> capturing Job information for Job \"{}\"".format(job_name))

            try:
                self.__job_by_id(job_id=job_id)
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=f"error when getting jobs for {job_name}, skipping it")
                continue

    def __job_by_id(self, job_id: str) -> None:
        """Requests and saves all jobsessions for a jobID."""
        if (not job_id):
            raise ValueError("need job_id to request jobs for that ID")

        keyword = Keyword.SELECT
        table = self.__influx_client.database['jobs']
        query = SelectionQuery(
            keyword=keyword,
            fields=['id', 'jobName'],
            tables=[table],
            where_str=f'jobId = \'{job_id}\' AND time > now() - {table.retention_policy.duration}'
            # unnecessary filter?
        )
        LOGGER.debug(query)

        result = self.__influx_client.send_selection_query(query)  # type: ignore
        id_list: List[int] = []
        row: Dict[str, Any] = {}  # make sure the var exists
        for row in result.get_points():  # type: ignore
            id_list.append(row['id'])  # type: ignore

        if (not row):
            LOGGER.info(f">>> no entries in Influx database found for job with id {job_id}")

        # calculate the time range to be requested
        (rp_hours, rp_mins, rp_secs) = InfluxUtils.transform_time_literal(
            table.retention_policy.duration, single_vals=True)
        max_request_timestamp = datetime.datetime.now() - datetime.timedelta(
            hours=float(rp_hours), minutes=float(rp_mins), seconds=float(rp_secs))
        unixtime = int(time.mktime(max_request_timestamp.timetuple()))
        # make it ms instead of s
        unixtime *= 1000

        # retrieve all jobs in this category from the REST API, filtered to avoid drops due to the RP
        LOGGER.debug(f">>> requesting job sessions for id {job_id}")
        all_jobs = self.__api_queries.get_jobs_by_id(job_id=job_id)

        # keep only jobs whose start time lies within the retention time limit
        latest_jobs = list(filter(lambda job: job['start'] > unixtime, all_jobs))

        missing_jobs = list(filter(lambda job_api: int(job_api['id']) not in id_list, latest_jobs))

        if (len(missing_jobs) > 0):
            LOGGER.info(f">>> {len(missing_jobs)} datasets missing in DB for jobId: {job_id}")

            # removes `statistics` from the jobs
            self.__compute_extra_job_stats(missing_jobs, job_id)

            LOGGER.info(f">>> inserting job information of {len(missing_jobs)} jobs into jobs table")
            self.__influx_client.insert_dicts_to_buffer(
                list_with_dicts=missing_jobs, table_name="jobs")
        else:
            LOGGER.info(f">>> no new jobs to insert into DB for job with ID {job_id}")

        # TODO: artifact from older versions, not replaced yet
        if self.__verbose:
            display_number_of_jobs = 5
            keyword = Keyword.SELECT
            table = self.__influx_client.database['jobs']
            where_str = 'jobId = \'{}\''.format(job_id)
            query = SelectionQuery(
                keyword=keyword,
                fields=['*'],
                tables=[table],
                where_str=where_str,
                order_direction='DESC',
                limit=display_number_of_jobs)
            result = self.__influx_client.send_selection_query(query)  # type: ignore
            result_list: List[str] = list(result.get_points())  # type: ignore

            job_list_to_print: List[str] = []
            for row_str in result_list:
                job_list_to_print.append(row_str)
            print()
            print("displaying last {} jobs for job with ID {} from database (as available)"
                  .format(display_number_of_jobs, job_id))
            MethodUtils.my_print(data=job_list_to_print)
    def __compute_extra_job_stats(self, list_with_jobs: List[Dict[str, Any]], job_id: str) -> None:
        """Extracts the additional `statistics` list from the jobs and removes it from the original list.

        Computes an additional table out of the data.

        Args:
            list_with_jobs (List[Dict[str, Any]]): list with all jobs
        """
        LOGGER.info(f">>> computing additional job statistics for jobId: {job_id}")

        insert_list: List[Dict[str, Any]] = []
        # check for None instead of a bool-check: remove empty statistic lists [].
        for job in filter(lambda x: x.get("statistics", None) is not None, list_with_jobs):
            job_statistics_list = job.pop('statistics')

            for job_stats in job_statistics_list:
                try:
                    insert_dict: Dict[str, Any] = {}

                    # fields
                    insert_dict['resourceType'] = job_stats.get('resourceType', None)
                    insert_dict['total'] = job_stats.get('total', 0)
                    insert_dict['success'] = job_stats.get('success', 0)
                    insert_dict['failed'] = job_stats.get('failed', 0)

                    skipped = job_stats.get('skipped', None)
                    if (skipped is None):
                        skipped = insert_dict["total"] - insert_dict["success"] - insert_dict["failed"]
                    insert_dict["skipped"] = skipped

                    # time key
                    insert_dict['start'] = job['start']

                    # regular tag values for grouping:
                    insert_dict['id'] = job.get('id', None)
                    insert_dict['jobId'] = job.get('jobId', None)
                    insert_dict['status'] = job.get('status', None)
                    insert_dict['indexStatus'] = job.get('indexStatus', None)
                    insert_dict['jobName'] = job.get('jobName', None)
                    insert_dict['type'] = job.get('type', None)
                    insert_dict['subPolicyType'] = job.get('subPolicyType', None)

                    insert_list.append(insert_dict)
                except KeyError as error:
                    ExceptionUtils.exception_info(
                        error=error,
                        extra_message=f"failed to compute job-individual statistics due to a key error. Report to developer. Job: {job} ; job_stats: {job_stats}")

        if (len(insert_list) > 0):
            self.__influx_client.insert_dicts_to_buffer(
                list_with_dicts=insert_list, table_name="jobs_statistics")
        else:
            LOGGER.info(f">>> no additional job statistics to insert into DB for jobId: {job_id}")
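    # Illustrative example for the `skipped` fallback above (hedged, toy values):
    #
    #   job_stats = {'total': 10, 'success': 7, 'failed': 1}   # no 'skipped' key
    #   skipped   = 10 - 7 - 1                                 # -> 2
    #
    # If the REST API already delivers 'skipped', that value is kept as-is.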
    def __job_logs_to_stats(self, list_with_logs: List[Dict[str, Any]]) -> None:
        """Parses joblogs into their own statistics table, using the declared supported IDs.

        To parse more jobLogs, define additional entries in the attribute `supported_ids`.

        Arguments:
            list_with_logs {List[Dict[str, Any]]} -- list with all saved joblogs
        """

        # only continue with joblogs we want to save
        supported_log_iterator = filter(
            lambda log: log['messageId'] in self.__supported_ids.keys(), list_with_logs)
        sorted_log_iterator = sorted(supported_log_iterator, key=lambda entry: entry['logTime'])
        max_sec_timestamp = 0  # required to prevent duplicates

        for job_log in sorted_log_iterator:
            message_id = job_log['messageId']

            table_func_triple = self.__supported_ids[message_id]
            (table_name, row_dict_func, additional_fields) = table_func_triple

            if (not table_name):
                table_name = message_id
                ExceptionUtils.error_message(
                    f"Warning: no tablename specified for message_id {message_id}. Please report to developer.")

            try:
                # saving information from the message-params list within the job_log
                row_dict = row_dict_func(job_log['messageParams'])
                if (not row_dict):
                    # this was matched incorrectly, therefore skipped.
                    # no warning, because this will happen often.
                    continue
                # saving additional fields from the job_log struct itself.
                if (additional_fields):
                    for value in additional_fields:
                        if (isinstance(value, Tuple)):
                            # with rename
                            row_dict[value[0]] = job_log[value[1]]
                        else:
                            # without rename
                            row_dict[value] = job_log[value]
            except (KeyError, IndexError) as error:
                ExceptionUtils.exception_info(
                    error,
                    extra_message=f"MessageID params wrongly defined. Skipping message_id {message_id} with content: {job_log}")
                continue

            # Issue 9: if all tag values duplicate another record, including the timestamp, Influx will throw the
            # insert out as a duplicate. In some cases, converting epoch timestamps from millisecond to second
            # precision causes duplicate timestamps. To avoid this for certain tables, add seconds to the
            # timestamp as needed to ensure uniqueness. Only use this when some inaccuracy of the timestamps
            # is acceptable.
            cur_timestamp = job_log['logTime']
            if (table_name == 'vmBackupSummary'):

                if (cur_timestamp is None):  # prevent None
                    ExceptionUtils.error_message(
                        f"Warning: logTime is None, duplicate may be purged. Log: {job_log}")

                if (isinstance(cur_timestamp, str)):  # make sure it is an int
                    cur_timestamp = int(cur_timestamp)

                cur_sec_timestamp = SppUtils.to_epoch_secs(cur_timestamp)
                if (cur_sec_timestamp <= max_sec_timestamp):
                    digits = int(cur_timestamp / cur_sec_timestamp)
                    max_sec_timestamp += 1  # increase by 1 second
                    cur_timestamp = max_sec_timestamp * digits
                else:
                    max_sec_timestamp = cur_sec_timestamp

            row_dict['time'] = cur_timestamp

            for (key, item) in row_dict.items():
                if (item in ('null', 'null(null)')):
                    row_dict[key] = None

            self.__influx_client.insert_dicts_to_buffer(table_name, [row_dict])
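    # A hedged walkthrough of the de-duplication arithmetic above, with
    # illustrative millisecond timestamps (not real log data):
    #
    #   log A: logTime = 1618243200123 -> secs = 1618243200 > max (0), so max = 1618243200
    #   log B: logTime = 1618243200456 -> secs = 1618243200 <= max, so:
    #            digits = int(1618243200456 / 1618243200) = 1000 (ms scale factor)
    #            max    = 1618243201
    #            logTime becomes 1618243201000, one second later and unique
    #
    # Two logs within the same second therefore never collide on the Influx timestamp.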
    def job_logs(self) -> None:
        """Saves all jobLogs for the jobsessions in the influx catalog.

        Make sure to call `get_all_jobs` beforehand to acquire all jobsessions.
        In order to save them, it deletes and rewrites all affected jobsession entries.
        It automatically parses certain jobLogs into additional stats, defined by `supported_ids`.
        """

        # total count of requested logs
        logs_requested_total = 0
        # total count of inserted logs; should be equal, but is not on failure (skipped logs)
        logs_to_stats_total = 0
        # list to be inserted after everything is updated
        job_update_list: List[Dict[str, Any]] = []

        LOGGER.info("> Requesting jobs with missing logs from influx database")

        table = self.__influx_client.database['jobs']
        # only store if there is something to store -> limited by the job log retention time.
        where_str = 'jobsLogsStored <> \'True\' and time > now() - %s' % self.__job_log_retention_time
        where_str += f' AND time > now() - {table.retention_policy.duration}'

        # select all jobs without joblogs
        keyword = Keyword.SELECT
        query = SelectionQuery(
            keyword=keyword,
            tables=[table],
            fields=['*'],
            where_str=where_str)

        # send query and compute
        missing_logs_jobs_rs = self.__influx_client.send_selection_query(query)  # type: ignore

        # this list contains all jobs which are missing their logs
        # cast from resultset into list
        missing_logs_jobs: List[Dict[str, Any]] = list(missing_logs_jobs_rs.get_points())  # type: ignore

        LOGGER.info(f">>> Number of jobs with no joblogs stored in Influx database: {len(missing_logs_jobs)}")

        LOGGER.info("> Requesting missing jobLogs from REST-API.")
        # request all jobLogs from the REST-API
        # counter only for display purposes
        for counter, row in enumerate(missing_logs_jobs, 0):

            # only print every 5 rows if not verbose
            # starts at 0, therefore already updated
            if (self.__verbose or counter % 5 == 0):
                LOGGER.info(f">>> computed joblogs for {counter} / {len(missing_logs_jobs)} job sessions.")

            job_session_id: Optional[int] = row.get('id', None)

            # if the jobSessionId is somehow missing: skip
            # should usually not happen
            if (job_session_id is None):
                ExceptionUtils.error_message(f"Error: jobSessionId missing for row {row}")
                continue

            if (self.__verbose):
                LOGGER.info(f">>> Requesting jobLogs {self.__job_log_types} for session {job_session_id}.")
            LOGGER.debug(f">>> Requesting jobLogs {self.__job_log_types} for session {job_session_id}.")
            try:
                # can't use `query_something` like in other places due to the extra params:
                # query_something only works with no params

                # this list contains all joblogs for a single job-execution
                current_job_logs = self.__api_queries.get_job_log_details(
                    jobsession_id=job_session_id,
                    job_logs_types=self.__job_log_types,
                    request_ids=list(self.__supported_ids.keys()))
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=f"Error when api-requesting joblogs for job_session_id {job_session_id}, skipping it")
                continue

            job_log_count = len(current_job_logs)
            logs_requested_total += job_log_count

            if (self.__verbose):
                LOGGER.info(f">>> Found {job_log_count} logs for jobsessionId {job_session_id}")
            LOGGER.debug(f"Found {job_log_count} logs for jobsessionId {job_session_id}")

            # ####################################################################################
            # Compute results and save logs
            # ####################################################################################
            # The request of REST-API logs is finished here.
            # To not crash by saving 100,000+ logs, directly compute results and insert them.
            # ####################################################################################

            for job_log in current_job_logs:
                # add additional information from the job-session itself
                job_log["jobId"] = row.get("jobId", None)
                job_log["jobName"] = row.get("jobName", None)
                job_log["jobExecutionTime"] = row.get("start", None)

                # rename for clarity
                job_log["jobLogId"] = job_log.pop("id", None)
                job_log["jobSessionId"] = job_log.pop("jobsessionId", None)

            # ##########################################################
            # compute jobLog stats into each associated table
            # ##########################################################
            try:
                self.__job_logs_to_stats(current_job_logs)
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error,
                    extra_message=f"Failed to parse jobLogs into their own table, skipping for jobsessionId {job_session_id}")

            logs_to_stats_total += job_log_count

            # ##########################################################
            # save logs within the joblog-dump
            # ##########################################################
            # only dump them after computing stats, since they are read within the stats-computing part
            for job_log in current_job_logs:
                # dump message params to allow saving as string
                job_log["messageParams"] = json.dumps(job_log["messageParams"])

            # if the list is empty, e.g. because it was erased, the insert simply returns and does nothing
            self.__influx_client.insert_dicts_to_buffer(
                list_with_dicts=current_job_logs, table_name="jobLogs")

            # shallow copy the dict to allow an update without errors
            copied_jobsession = dict(row.items())

            # update job table and set jobsLogsStored = True, jobLogsCount = len(jobLogDetails)
            update_fields = {
                "jobLogsCount": job_log_count,
                "jobsLogsStored": True
            }
            # update the fields
            for (key, value) in update_fields.items():
                copied_jobsession[key] = value
            job_update_list.append(copied_jobsession)

            # ##########################################################
            # end of for-each
            # ##########################################################

        # ##########################################################
        # delete each job, then re-insert
        # ##########################################################

        # delete all jobs which got requested, no matter whether they failed
        delete_query = SelectionQuery(
            keyword=Keyword.DELETE, tables=[table], where_str=where_str)

        # now send the remove query to prevent data loss
        self.__influx_client.send_selection_query(delete_query)  # type: ignore
        # insert the data after everything is completed
        self.__influx_client.insert_dicts_to_buffer(table.name, job_update_list)

        if (logs_requested_total != logs_to_stats_total):
            LOGGER.info(
                f"> Requested a total of {logs_requested_total} logs but only computed {logs_to_stats_total} into sppmon statistics")
        else:
            LOGGER.info(f">>> requested and computed a total of {logs_requested_total} logs")

        LOGGER.info(f">> Updated a total of {len(job_update_list)} jobs")
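    # For illustration (hedged; '60d' and '90d' are made-up values): with
    # job_log_retention_time = '60d' and a retention policy duration of '90d',
    # the jobs selected (and later deleted and re-inserted) above match:
    #
    #   jobsLogsStored <> 'True' and time > now() - 60d AND time > now() - 90d
    #
    # The delete-then-reinsert cycle is what flips jobsLogsStored to True,
    # since the flag lives on the jobsession rows themselves.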
def test_connection(influx_client: InfluxClient, rest_client: RestClient, config_file: Dict[str, Any]):
    if (not config_file):
        raise ValueError("SPPmon does not work without a config file")

    LOGGER.info("Testing all connections required for SPPMon to work")
    working: bool = True  # SPPMon itself will finish successfully (no critical errors)
    no_warnings: bool = True  # SPPMon will finish without any warnings (no errors at all)

    # ## InfluxDB ##

    LOGGER.info("> Testing and configuring InfluxDB")
    try:
        influx_client.connect()
        influx_client.disconnect()
        if (not influx_client.use_ssl):
            ExceptionUtils.error_message(
                "> WARNING: Mandatory SSL is disabled. We highly recommend to enable it!")
            no_warnings = False

        LOGGER.info("InfluxDB is ready for use")
    except ValueError as error:
        ExceptionUtils.exception_info(
            error,
            extra_message="> Testing of the InfluxDB failed. This is a critical component of SPPMon.")
        working = False

    # ## REST-API ##

    LOGGER.info("> Testing REST-API of SPP.")
    try:
        rest_client.login()
        (version_nr, build_nr) = rest_client.get_spp_version_build()
        LOGGER.info(f">> Successfully connected to SPP V{version_nr}, build {build_nr}.")
        rest_client.logout()
        LOGGER.info("> REST-API is ready for use")
    except ValueError as error:
        ExceptionUtils.exception_info(
            error,
            extra_message="> Testing of the REST-API failed. This is a critical component of SPPMon.")
        working = False

    # ## SSH-CLIENTS ##

    LOGGER.info("> Testing all types of SSH-Clients: Server, VADPs, vSnaps, Cloudproxy and others")
    ssh_working = True  # the arg --ssh will finish without any error at all

    # count of client checks
    ssh_clients: List[SshClient] = SshMethods.setup_ssh_clients(config_file)
    if (not ssh_clients):
        ExceptionUtils.error_message(
            ">> No SSH-clients detected at all. At least the server itself should be added for process-statistics.")
        ssh_working = False
    else:
        for type in SshTypes:
            if (not list(filter(lambda client: client.client_type == type, ssh_clients))):
                LOGGER.info(f">> No {type.name} client detected.")

                if (type == SshTypes.SERVER):
                    ExceptionUtils.error_message(
                        ">> Critical: Without the server as ssh client you won't have any process statistics available. These are a key part of SPPMon.")
                    ssh_working = False  # no error, but still critical

                if (type == SshTypes.VSNAP):
                    LOGGER.info(
                        ">> WARNING: Without a vSnap as ssh client you have no access to storage information. You may add vSnaps for additional monitoring and alerts.")
                    no_warnings = False  # ssh will still work, but that's definitely a warning

        ssh_methods: SshMethods = SshMethods(influx_client, config_file, False)

        # connection check
        LOGGER.info(f">> Testing now connection and commands of {len(ssh_clients)} registered ssh-clients.")
        for client in ssh_clients:
            try:
                client.connect()
                client.disconnect()

                error_count: int = len(ExceptionUtils.stored_errors)
                MethodUtils.ssh_execute_commands(
                    ssh_clients=[client],
                    ssh_type=client.client_type,
                    command_list=ssh_methods.client_commands[client.client_type] + ssh_methods.all_command_list)
                if (len(ExceptionUtils.stored_errors) != error_count):
                    ssh_working = False
                    ExceptionUtils.error_message(
                        f"Not all commands available for client {client.host_name} with type: {client.client_type}.\n"
                        + "Please check manually whether the commands are installed and check their output.")

            except ValueError as error:
                ExceptionUtils.exception_info(
                    error,
                    extra_message=f"Connection failed for client {client.host_name} with type: {client.client_type}.")
                ssh_working = False

    if (ssh_working):
        LOGGER.info("> Testing of SSH-clients successful.")
    else:
        LOGGER.info("> Testing of SSH-clients failed! SPPMon will still work, but not all information is available.")
        no_warnings = False

    # #### Conclusion ####
    if (working and no_warnings):
        LOGGER.info("> All components tested successfully. SPPMon is ready to be used!")
    elif (working):
        LOGGER.info("> Testing partially successful. SPPMon will run, but please check the warnings.")
    else:
        LOGGER.info("> Testing failed. SPPMon is not ready to be used. Please fix the connection issues.")