def spin_process(self, i, run_timestamp, metric_check_file):
    """
    Assign a metric for a process to analyze.

    :param i: python process id
    :param run_timestamp: the epoch timestamp at which this process was called
    :param metric_check_file: full path to the metric check file
    :return: returns True
    """
    child_process_pid = os.getpid()
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('child_process_pid - %s' % str(child_process_pid))

    self.process_list.append(child_process_pid)

    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('processing metric check - %s' % metric_check_file)

    if not os.path.isfile(str(metric_check_file)):
        logger.error('error :: file not found - metric_check_file - %s' % (str(metric_check_file)))
        return

    check_file_name = os.path.basename(str(metric_check_file))
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('check_file_name - %s' % check_file_name)
    check_file_timestamp = check_file_name.split('.', 1)[0]
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('check_file_timestamp - %s' % str(check_file_timestamp))
    check_file_metricname_txt = check_file_name.split('.', 1)[1]
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('check_file_metricname_txt - %s' % check_file_metricname_txt)
    check_file_metricname = check_file_metricname_txt.replace('.txt', '')
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('check_file_metricname - %s' % check_file_metricname)
    check_file_metricname_dir = check_file_metricname.replace('.', '/')
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('check_file_metricname_dir - %s' % check_file_metricname_dir)

    metric_failed_check_dir = failed_checks_dir + '/' + check_file_metricname_dir + '/' + check_file_timestamp
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('metric_failed_check_dir - %s' % metric_failed_check_dir)

    # failed_check_file = failed_checks_dir + '/' + check_file_name
    failed_check_file = metric_failed_check_dir + '/' + check_file_name

    # Load and validate metric variables
    try:
        metric_vars = load_metric_vars(skyline_app, str(metric_check_file))
    except:
        logger.error('error :: failed to import metric variables from check file - %s' % (metric_check_file))
        fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
        # TBD - a failed check Panorama update will go here, perhaps txt
        #       files are not the only "queue" that will be used, both, but
        #       Panorama, may be just a part of Skyline Flux, the flux DB
        #       would allow for a very nice, distributed "queue" and a
        #       distributed Skyline workforce...
        #       Any Skyline node could just have one role, e.g. lots of
        #       Skyline nodes running crucible only and instead of reading
        #       the local filesystem for input, they could read the Flux DB
        #       queue or both...
        return

    # Test metric variables
    # We use a pythonic methodology to test if the variables are defined,
    # this ensures that if any of the variables are not set for some reason
    # we can handle unexpected data or situations gracefully and try and
    # ensure that the process does not hang.

    # if len(str(metric_vars.metric)) == 0:
    # if not metric_vars.metric:
    try:
        metric_vars.metric
    except:
        logger.error('error :: failed to read metric variable from check file - %s' % (metric_check_file))
        fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
        return
    else:
        metric = str(metric_vars.metric)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - metric - %s' % metric)

    # if len(metric_vars.value) == 0:
    # if not metric_vars.value:
    try:
        metric_vars.value
    except:
        logger.error('error :: failed to read value variable from check file - %s' % (metric_check_file))
        fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
        return
    else:
        value = str(metric_vars.value)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - value - %s' % (value))

    # if len(metric_vars.from_timestamp) == 0:
    # if not metric_vars.from_timestamp:
    try:
        metric_vars.from_timestamp
    except:
        logger.error('error :: failed to read from_timestamp variable from check file - %s' % (metric_check_file))
        fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
        return
    else:
        from_timestamp = str(metric_vars.from_timestamp)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - from_timestamp - %s' % from_timestamp)

    # if len(metric_vars.metric_timestamp) == 0:
    # if not metric_vars.metric_timestamp:
    try:
        metric_vars.metric_timestamp
    except:
        logger.error('error :: failed to read metric_timestamp variable from check file - %s' % (metric_check_file))
        fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
        return
    else:
        metric_timestamp = str(metric_vars.metric_timestamp)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - metric_timestamp - %s' % metric_timestamp)

    # if len(metric_vars.algorithms) == 0:
    # if not metric_vars.algorithms:
    try:
        metric_vars.algorithms
    except:
        logger.error('error :: failed to read algorithms variable from check file - %s - setting to all' % (metric_check_file))
        algorithms = ['all']
    else:
        algorithms = []
        for i_algorithm in metric_vars.algorithms:
            algorithms.append(i_algorithm)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - algorithms - %s' % algorithms)

    # if len(metric_vars.anomaly_dir) == 0:
    # if not metric_vars.anomaly_dir:
    try:
        metric_vars.anomaly_dir
    except:
        logger.error('error :: failed to read anomaly_dir variable from check file - %s' % (metric_check_file))
        fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
        return
    else:
        anomaly_dir = str(metric_vars.anomaly_dir)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - anomaly_dir - %s' % anomaly_dir)

    # if len(str(metric_vars.graphite_metric)) == 0:
    try:
        metric_vars.graphite_metric
    except:
        logger.info('failed to read graphite_metric variable from check file setting to False')
        # yes this is a string
        graphite_metric = 'False'
    else:
        graphite_metric = str(metric_vars.graphite_metric)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - graphite_metric - %s' % graphite_metric)

    # if len(str(metric_vars.run_crucible_tests)) == 0:
    try:
        metric_vars.run_crucible_tests
    except:
        logger.info('failed to read run_crucible_tests variable from check file setting to False')
        # yes this is a string
        run_crucible_tests = 'False'
    else:
        run_crucible_tests = str(metric_vars.run_crucible_tests)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - run_crucible_tests - %s' % run_crucible_tests)

    try:
        metric_vars.added_by
    except:
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('failed to read added_by variable from check file - setting to crucible')
        added_by = 'crucible'
    else:
        added_by = str(metric_vars.added_by)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - added_by - %s' % added_by)

    try:
        metric_vars.run_script
    except:
        run_script = False
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - run_script - not present set to False')
    else:
        run_script = str(metric_vars.run_script)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - run_script - %s' % run_script)

    # Only check if the metric does not have an EXPIRATION_TIME key set,
    # crucible uses the alert EXPIRATION_TIME for the relevant alert setting
    # contexts whether that be analyzer, mirage, boundary, etc and sets its
    # own cache_keys in redis.  This prevents large amounts of data being
    # added in terms of timeseries json and image files, crucible samples at
    # the same EXPIRATION_TIME as alerts.
    source_app = 'crucible'
    expiration_timeout = 1800
    remove_all_anomaly_files = False
    check_expired = False
    check_time = time()

    if added_by == 'analyzer' or added_by == 'mirage':
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('Will check %s ALERTS' % added_by)
        if settings.ENABLE_ALERTS:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('Checking %s ALERTS' % added_by)
            for alert in settings.ALERTS:
                ALERT_MATCH_PATTERN = alert[0]
                METRIC_PATTERN = metric
                alert_match_pattern = re.compile(ALERT_MATCH_PATTERN)
                pattern_match = alert_match_pattern.match(METRIC_PATTERN)
                if pattern_match:
                    source_app = added_by
                    expiration_timeout = alert[2]
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('matched - %s - %s - EXPIRATION_TIME is %s' % (source_app, metric, str(expiration_timeout)))
                    check_age = int(check_time) - int(metric_timestamp)
                    if int(check_age) > int(expiration_timeout):
                        check_expired = True
                        if settings.ENABLE_CRUCIBLE_DEBUG:
                            logger.info('the check is older than EXPIRATION_TIME for the metric - not checking - check_expired')

    if added_by == 'boundary':
        if settings.BOUNDARY_ENABLE_ALERTS:
            for alert in settings.BOUNDARY_METRICS:
                ALERT_MATCH_PATTERN = alert[0]
                METRIC_PATTERN = metric
                alert_match_pattern = re.compile(ALERT_MATCH_PATTERN)
                pattern_match = alert_match_pattern.match(METRIC_PATTERN)
                if pattern_match:
                    source_app = 'boundary'
                    expiration_timeout = alert[2]
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('matched - %s - %s - EXPIRATION_TIME is %s' % (source_app, metric, str(expiration_timeout)))
                    check_age = int(check_time) - int(metric_timestamp)
                    if int(check_age) > int(expiration_timeout):
                        check_expired = True
                        if settings.ENABLE_CRUCIBLE_DEBUG:
                            logger.info('the check is older than EXPIRATION_TIME for the metric - not checking - check_expired')

    cache_key = 'crucible.last_check.%s.%s' % (source_app, metric)
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('cache_key - crucible.last_check.%s.%s' % (source_app, metric))

    # Only use the cache_key EXPIRATION_TIME if this is not a request to
    # run_crucible_tests on a timeseries
    if run_crucible_tests == 'False':
        if check_expired:
            logger.info('check_expired - not checking Redis key')
            last_check = True
        else:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('Checking if cache_key exists')
            try:
                last_check = self.redis_conn.get(cache_key)
            except Exception as e:
                logger.error('error :: could not query cache_key for %s - %s - %s' % (source_app, metric, e))
                logger.info('all anomaly files will be removed')
                remove_all_anomaly_files = True

        if not last_check:
            try:
                self.redis_conn.setex(cache_key, expiration_timeout, packb(value))
                logger.info('set cache_key for %s - %s with timeout of %s' % (source_app, metric, str(expiration_timeout)))
            except Exception as e:
                logger.error('error :: could not set cache_key for %s - %s - %s' % (source_app, metric, e))
                logger.info('all anomaly files will be removed')
                remove_all_anomaly_files = True
        else:
            if check_expired:
                logger.info('check_expired - all anomaly files will be removed')
                remove_all_anomaly_files = True
            else:
                logger.info('cache_key is set and not expired for %s - %s - all anomaly files will be removed' % (source_app, metric))
                remove_all_anomaly_files = True

    # anomaly dir
    if not os.path.exists(str(anomaly_dir)):
        try:
            # mkdir_p(skyline_app, str(anomaly_dir))
            if python_version == 2:
                mode_arg = int('0755', 8)
            if python_version == 3:
                mode_arg = 0o755
            os.makedirs(anomaly_dir, mode_arg)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('created anomaly dir - %s' % str(anomaly_dir))
        except:
            logger.error('error :: failed to create anomaly_dir - %s' % str(anomaly_dir))

    if not os.path.exists(str(anomaly_dir)):
        logger.error('error :: anomaly_dir does not exist')
        fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
        return
    else:
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('anomaly dir exists - %s' % str(anomaly_dir))

    failed_check_file = anomaly_dir + '/' + metric_timestamp + '.failed.check.' + metric + '.txt'
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('failed_check_file - %s' % str(failed_check_file))

    # Retrieve data from graphite if necessary
    anomaly_graph = anomaly_dir + '/' + metric + '.png'
    anomaly_json = anomaly_dir + '/' + metric + '.json'
    anomaly_json_gz = anomaly_json + '.gz'
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('anomaly_graph - %s' % str(anomaly_graph))
        logger.info('anomaly_json - %s' % str(anomaly_json))
        logger.info('anomaly_json_gz - %s' % str(anomaly_json_gz))

    # Some things added to crucible may not be added by a skyline app per se
    # and if run_crucible_tests is string True then no anomaly files should
    # be removed.
    if run_crucible_tests == 'True':
        remove_all_anomaly_files = False

    # Remove check and anomaly files if the metric has a EXPIRATION_TIME
    # cache_key set
    if remove_all_anomaly_files:
        if os.path.isfile(anomaly_graph):
            try:
                os.remove(anomaly_graph)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('anomaly_graph removed - %s' % str(anomaly_graph))
            except OSError:
                pass
        if os.path.isfile(anomaly_json):
            try:
                os.remove(anomaly_json)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('anomaly_json removed - %s' % str(anomaly_json))
            except OSError:
                pass
        if os.path.isfile(anomaly_json_gz):
            try:
                os.remove(anomaly_json_gz)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('anomaly_json_gz removed - %s' % str(anomaly_json_gz))
            except OSError:
                pass

        anomaly_txt_file = anomaly_dir + '/' + metric + '.txt'
        if os.path.isfile(anomaly_txt_file):
            try:
                os.remove(anomaly_txt_file)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('anomaly_txt_file removed - %s' % str(anomaly_txt_file))
            except OSError:
                pass

        # TBD - this data would have to be added to the panorama DB before
        #       it is removed
        if os.path.isfile(str(metric_check_file)):
            try:
                os.remove(str(metric_check_file))
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('metric_check_file removed - %s' % str(metric_check_file))
            except OSError:
                pass

        if os.path.exists(str(anomaly_dir)):
            try:
                os.rmdir(str(anomaly_dir))
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('anomaly_dir removed - %s' % str(anomaly_dir))
            except OSError:
                pass

        logger.info('check and anomaly files removed')
        return

    # Check if the image exists
    if graphite_metric == 'True':
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('graphite_metric - %s' % (graphite_metric))

        # Graphite timeouts
        connect_timeout = int(settings.GRAPHITE_CONNECT_TIMEOUT)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('connect_timeout - %s' % str(connect_timeout))
        read_timeout = int(settings.GRAPHITE_READ_TIMEOUT)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('read_timeout - %s' % str(read_timeout))

        graphite_until = datetime.datetime.fromtimestamp(int(metric_timestamp)).strftime('%H:%M_%Y%m%d')
        graphite_from = datetime.datetime.fromtimestamp(int(from_timestamp)).strftime('%H:%M_%Y%m%d')

        # graphite URL
        if settings.GRAPHITE_PORT != '':
            url = settings.GRAPHITE_PROTOCOL + '://' + settings.GRAPHITE_HOST + ':' + settings.GRAPHITE_PORT + '/render/?from=' + graphite_from + '&until=' + graphite_until + '&target=' + metric + '&format=json'
        else:
            url = settings.GRAPHITE_PROTOCOL + '://' + settings.GRAPHITE_HOST + '/render/?from=' + graphite_from + '&until=' + graphite_until + '&target=' + metric + '&format=json'
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('graphite url - %s' % (url))

        if not os.path.isfile(anomaly_graph):
            logger.info('retrieving png - surfacing %s graph from graphite from %s to %s' % (metric, graphite_from, graphite_until))
            image_url = url.replace('&format=json', '')
            graphite_image_file = anomaly_dir + '/' + metric + '.png'
            if 'width' not in image_url:
                image_url += '&width=586'
            if 'height' not in image_url:
                image_url += '&height=308'
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('graphite image url - %s' % (image_url))
            image_url_timeout = int(connect_timeout)

            image_data = None
            try:
                image_data = urllib2.urlopen(image_url, timeout=image_url_timeout).read()
                logger.info('url OK - %s' % (image_url))
            except urllib2.URLError:
                image_data = None
                logger.error('error :: url bad - %s' % (image_url))

            if image_data is not None:
                with open(graphite_image_file, 'w') as f:
                    f.write(image_data)
                logger.info('retrieved - %s' % (anomaly_graph))
                if python_version == 2:
                    mode_arg = int('0644', 8)
                if python_version == 3:
                    mode_arg = 0o644
                os.chmod(graphite_image_file, mode_arg)
            else:
                logger.error('error :: failed to retrieve - %s' % (anomaly_graph))
        else:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('anomaly_graph file exists - %s' % str(anomaly_graph))

        if not os.path.isfile(anomaly_graph):
            logger.error('error :: retrieve failed to surface %s graph from graphite' % (metric))
        else:
            logger.info('graph image exists - %s' % (anomaly_graph))

        # Check if the json exists
        if not os.path.isfile(anomaly_json_gz):
            if not os.path.isfile(anomaly_json):
                logger.info('surfacing timeseries data for %s from graphite from %s to %s' % (metric, graphite_from, graphite_until))
                if requests.__version__ >= '2.4.0':
                    use_timeout = (int(connect_timeout), int(read_timeout))
                else:
                    use_timeout = int(connect_timeout)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('use_timeout - %s' % (str(use_timeout)))

                try:
                    r = requests.get(url, timeout=use_timeout)
                    js = r.json()
                    datapoints = js[0]['datapoints']
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('data retrieved OK')
                except:
                    datapoints = [[None, int(graphite_until)]]
                    logger.error('error :: data retrieval failed')

                converted = []
                for datapoint in datapoints:
                    try:
                        new_datapoint = [float(datapoint[1]), float(datapoint[0])]
                        converted.append(new_datapoint)
                    except:
                        continue

                with open(anomaly_json, 'w') as f:
                    f.write(json.dumps(converted))
                if python_version == 2:
                    mode_arg = int('0644', 8)
                if python_version == 3:
                    mode_arg = 0o644
                os.chmod(anomaly_json, mode_arg)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('json file - %s' % anomaly_json)

        if not os.path.isfile(anomaly_json):
            logger.error('error :: failed to surface %s json from graphite' % (metric))
            # Move metric check file
            try:
                shutil.move(metric_check_file, failed_check_file)
                logger.info('moved check file to - %s' % failed_check_file)
            except OSError:
                logger.error('error :: failed to move check file to - %s' % failed_check_file)
                pass
            return

    # Check timeseries json exists - raw or gz
    if not os.path.isfile(anomaly_json):
        if not os.path.isfile(anomaly_json_gz):
            logger.error('error :: no json data found for %s' % (metric))
            # Move metric check file
            try:
                shutil.move(metric_check_file, failed_check_file)
                logger.info('moved check file to - %s' % failed_check_file)
            except OSError:
                logger.error('error :: failed to move check file to - %s' % failed_check_file)
                pass
            return
        else:
            logger.info('timeseries json gzip exists - %s' % (anomaly_json_gz))
    else:
        logger.info('timeseries json exists - %s' % (anomaly_json))

    # If timeseries json and run_crucible_tests is str(False) gzip and
    # return here as there is nothing further to do
    if run_crucible_tests == 'False':
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('run_crucible_tests - %s' % run_crucible_tests)
        # gzip the json timeseries data
        if os.path.isfile(anomaly_json):
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('gzipping - %s' % anomaly_json)
            try:
                f_in = open(anomaly_json)
                f_out = gzip.open(anomaly_json_gz, 'wb')
                f_out.writelines(f_in)
                f_out.close()
                f_in.close()
                os.remove(anomaly_json)
                if python_version == 2:
                    mode_arg = int('0644', 8)
                if python_version == 3:
                    mode_arg = 0o644
                os.chmod(anomaly_json_gz, mode_arg)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('gzipped - %s' % anomaly_json_gz)
                try:
                    os.remove(metric_check_file)
                    logger.info('removed check file - %s' % metric_check_file)
                except OSError:
                    pass
                return
            except:
                logger.error('error :: Failed to gzip data file - %s' % str(traceback.print_exc()))
                try:
                    os.remove(metric_check_file)
                    logger.info('removed check file - %s' % metric_check_file)
                except OSError:
                    pass
                return

        if os.path.isfile(anomaly_json_gz):
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('gzip exists - %s' % anomaly_json)
            try:
                os.remove(metric_check_file)
                logger.info('removed check file - %s' % metric_check_file)
            except OSError:
                pass
            return

        nothing_to_do = 'true - for debug only'

    # self.check_if_parent_is_alive()
    # Run crucible algorithms
    logger.info('running crucible tests - %s' % (metric))

    if os.path.isfile(anomaly_json_gz):
        if not os.path.isfile(anomaly_json):
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('ungzipping - %s' % anomaly_json_gz)
            try:
                # with gzip.open(anomaly_json_gz, 'rb') as fr:
                fr = gzip.open(anomaly_json_gz, 'rb')
                raw_timeseries = fr.read()
                fr.close()
            except Exception as e:
                logger.error('error :: could not ungzip %s - %s' % (anomaly_json_gz, e))
                traceback.print_exc()
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('ungzipped')
                logger.info('writing to - %s' % anomaly_json)
            with open(anomaly_json, 'w') as fw:
                fw.write(raw_timeseries)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('anomaly_json done')
            if python_version == 2:
                mode_arg = int('0644', 8)
            if python_version == 3:
                mode_arg = 0o644
            os.chmod(anomaly_json, mode_arg)
    else:
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('No gzip - %s' % anomaly_json_gz)
        nothing_to_do = 'true - for debug only'

    if os.path.isfile(anomaly_json):
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('anomaly_json exists - %s' % anomaly_json)

    if os.path.isfile(anomaly_json):
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('loading timeseries from - %s' % anomaly_json)
        with open(anomaly_json, 'r') as f:
            timeseries = json.loads(f.read())
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('loaded timeseries from - %s' % anomaly_json)
    else:
        try:
            logger.error('error :: file not found - %s' % anomaly_json)
            shutil.move(metric_check_file, failed_check_file)
            if python_version == 2:
                mode_arg = int('0644', 8)
            if python_version == 3:
                mode_arg = 0o644
            os.chmod(failed_check_file, mode_arg)
            logger.info('moved check file to - %s' % failed_check_file)
        except OSError:
            logger.error('error :: failed to move check file to - %s' % failed_check_file)
            pass
        return

    start_timestamp = int(timeseries[0][0])
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('start_timestamp - %s' % str(start_timestamp))
    end_timestamp = int(timeseries[-1][0])
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('end_timestamp - %s' % str(end_timestamp))

    full_duration = end_timestamp - start_timestamp
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('full_duration - %s' % str(full_duration))

    self.check_if_parent_is_alive()

    run_algorithms_start_timestamp = int(time())
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('run_algorithms_start_timestamp - %s' % str(run_algorithms_start_timestamp))

    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('run_algorithms - %s,%s,%s,%s,%s,%s' % (metric, str(end_timestamp), str(full_duration), anomaly_json, skyline_app, str(algorithms)))
    try:
        anomalous, ensemble = run_algorithms(timeseries, str(metric), end_timestamp, full_duration, str(anomaly_json), skyline_app, algorithms)
    except:
        logger.error('error :: run_algorithms failed - %s' % str(traceback.print_exc()))

    run_algorithms_end_timestamp = int(time())
    run_algorithms_seconds = run_algorithms_end_timestamp - run_algorithms_start_timestamp

    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('anomalous, ensemble - %s, %s' % (anomalous, str(ensemble)))

    if anomalous:
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('anomalous - %s' % (anomalous))
        nothing_to_do = 'true - for debug only'

    logger.info('run_algorithms took %s seconds' % str(run_algorithms_seconds))

    # Update anomaly file
    crucible_data = 'crucible_tests_run = "%s"\n' \
                    'crucible_triggered_algorithms = %s\n' \
                    'tested_by = "%s"\n' \
        % (str(run_timestamp), str(ensemble), str(this_host))
    crucible_anomaly_file = '%s/%s.txt' % (anomaly_dir, metric)
    with open(crucible_anomaly_file, 'a') as fh:
        fh.write(crucible_data)
    if python_version == 2:
        mode_arg = int('0644', 8)
    if python_version == 3:
        mode_arg = 0o644
    os.chmod(crucible_anomaly_file, mode_arg)
    logger.info('updated crucible anomaly file - %s/%s.txt' % (anomaly_dir, metric))

    # gzip the json timeseries data after analysis
    if os.path.isfile(anomaly_json):
        if not os.path.isfile(anomaly_json_gz):
            try:
                f_in = open(anomaly_json)
                f_out = gzip.open(anomaly_json_gz, 'wb')
                f_out.writelines(f_in)
                f_out.close()
                f_in.close()
                os.remove(anomaly_json)
                if python_version == 2:
                    mode_arg = int('0644', 8)
                if python_version == 3:
                    mode_arg = 0o644
                os.chmod(anomaly_json_gz, mode_arg)
                logger.info('gzipped - %s' % (anomaly_json_gz))
            except:
                logger.error('error :: Failed to gzip data file - %s' % str(traceback.print_exc()))
        else:
            os.remove(anomaly_json)

    if run_script:
        if os.path.isfile(run_script):
            logger.info('running - %s' % (run_script))
            os.system('%s %s' % (str(run_script), str(crucible_anomaly_file)))

    # Remove metric check file
    nothing_to_do = ''
    try:
        os.remove(metric_check_file)
        logger.info('complete removed check file - %s %s' % (metric_check_file, nothing_to_do))
    except OSError:
        pass
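# For orientation, a minimal sketch of the variables a crucible metric check
# file is expected to define, based on the metric_vars attributes that
# spin_process reads above.  The metric name, timestamps, path and values
# below are hypothetical examples, not taken from a real check file:
#
#     metric = 'stats.statsd.graphiteStats.calculationtime'
#     value = '5.0'
#     from_timestamp = '1474168200'
#     metric_timestamp = '1474254600'
#     algorithms = ['all']
#     anomaly_dir = '/opt/skyline/crucible/data/stats/statsd/graphiteStats/calculationtime'
#     graphite_metric = True
#     run_crucible_tests = False
#     added_by = 'analyzer'
#     run_script = False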
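# A note on the settings.ALERTS tuples consulted in spin_process above: the
# code only relies on alert[0] being a regex pattern matched against the
# metric name and alert[2] being the EXPIRATION_TIME.  A hypothetical entry
# might look like the following, where the middle element (the alerter, e.g.
# 'smtp') is an assumption and is not used by crucible:
#
#     ALERTS = (
#         ('stats\.statsd', 'smtp', 1800),
#     )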
def run(self): """ Called when the process intializes. Determine if what is known in the Skyline DB blah """ # Log management to prevent overwriting # Allow the bin/<skyline_app>.d to manage the log if os.path.isfile(skyline_app_logwait): try: logger.info('removing %s' % skyline_app_logwait) os.remove(skyline_app_logwait) except OSError: logger.error('error :: failed to remove %s, continuing' % skyline_app_logwait) pass now = time() log_wait_for = now + 5 while now < log_wait_for: if os.path.isfile(skyline_app_loglock): sleep(.1) now = time() else: now = log_wait_for + 1 logger.info('starting %s run' % skyline_app) if os.path.isfile(skyline_app_loglock): logger.error('error :: bin/%s.d log management seems to have failed, continuing' % skyline_app) try: os.remove(skyline_app_loglock) logger.info('log lock file removed') except OSError: logger.error('error :: failed to remove %s, continuing' % skyline_app_loglock) pass else: logger.info('bin/%s.d log management done' % skyline_app) # See if I am known in the DB, if so, what are my variables # self.populate mysql # What is my host id in the Skyline panorama DB? # - if not known - INSERT hostname INTO hosts # What are the known apps? # - if returned make a dictionary # What are the known algorithms? # - if returned make a dictionary while 1: now = time() # Make sure Redis is up try: self.redis_conn.ping() if ENABLE_PANORAMA_DEBUG: logger.info('debug :: connected to Redis') except: logger.error('error :: cannot connect to redis at socket path %s' % ( settings.REDIS_SOCKET_PATH)) sleep(30) self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH) continue # Report app up try: self.redis_conn.setex(skyline_app, 120, now) logger.info('updated Redis key for %s up' % skyline_app) except: logger.error('error :: failed to update Redis key for %s up' % skyline_app) if ENABLE_PANORAMA_DEBUG: # Make sure mysql is available mysql_down = True while mysql_down: query = 'SHOW TABLES' results = self.mysql_select(query) if results: mysql_down = False logger.info('debug :: tested database query - OK') else: logger.error('error :: failed to query database') sleep(30) if ENABLE_PANORAMA_DEBUG: try: query = 'SELECT id, test FROM test' result = self.mysql_select(query) logger.info('debug :: tested mysql SELECT query - OK') logger.info('debug :: result: %s' % str(result)) logger.info('debug :: result[0]: %s' % str(result[0])) logger.info('debug :: result[1]: %s' % str(result[1])) # Works # 2016-06-10 19:07:23 :: 4707 :: result: [(1, u'test1')] except: logger.error( 'error :: mysql error - %s' % traceback.print_exc()) logger.error('error :: failed to SELECT') # self.populate the database metatdata tables # What is my host id in the Skyline panorama DB? 
host_id = False # @modified 20170913 - Task #2160: Test skyline with bandit # Added nosec to exclude from bandit tests query = 'select id FROM hosts WHERE host=\'%s\'' % this_host # nosec results = self.mysql_select(query) if results: host_id = results[0][0] logger.info('host_id: %s' % str(host_id)) else: logger.info('failed to determine host id of %s' % this_host) # - if not known - INSERT hostname INTO host if not host_id: logger.info('inserting %s into hosts table' % this_host) # @modified 20170913 - Task #2160: Test skyline with bandit # Added nosec to exclude from bandit tests query = 'insert into hosts (host) VALUES (\'%s\')' % this_host # nosec host_id = self.mysql_insert(query) if host_id: logger.info('new host_id: %s' % str(host_id)) if not host_id: logger.error( 'error :: failed to determine populate %s into the hosts table' % this_host) sleep(30) continue # Like loop through the panorama dir and see if anyone has left you # any work, etc # Make sure check_dir exists and has not been removed try: if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: checking check dir exists - %s' % settings.PANORAMA_CHECK_PATH) os.path.exists(settings.PANORAMA_CHECK_PATH) except: logger.error('error :: check dir did not exist - %s' % settings.PANORAMA_CHECK_PATH) mkdir_p(settings.PANORAMA_CHECK_PATH) logger.info('check dir created - %s' % settings.PANORAMA_CHECK_PATH) os.path.exists(settings.PANORAMA_CHECK_PATH) # continue """ Determine if any metric has been added to add """ while True: metric_var_files = False try: metric_var_files = [f for f in listdir(settings.PANORAMA_CHECK_PATH) if isfile(join(settings.PANORAMA_CHECK_PATH, f))] except: logger.error('error :: failed to list files in check dir') logger.info(traceback.format_exc()) if not metric_var_files: logger.info('sleeping 20 no metric check files') sleep(20) # Discover metric anomalies to insert metric_var_files = False try: metric_var_files = [f for f in listdir(settings.PANORAMA_CHECK_PATH) if isfile(join(settings.PANORAMA_CHECK_PATH, f))] except: logger.error('error :: failed to list files in check dir') logger.info(traceback.format_exc()) if metric_var_files: break metric_var_files_sorted = sorted(metric_var_files) metric_check_file = '%s/%s' % (settings.PANORAMA_CHECK_PATH, str(metric_var_files_sorted[0])) logger.info('assigning anomaly for insertion - %s' % str(metric_var_files_sorted[0])) # Spawn processes pids = [] spawned_pids = [] pid_count = 0 now = time() for i in range(1, settings.PANORAMA_PROCESSES + 1): try: p = Process(target=self.spin_process, args=(i, metric_check_file)) pids.append(p) pid_count += 1 logger.info('starting %s of %s spin_process/es' % (str(pid_count), str(settings.PANORAMA_PROCESSES))) p.start() spawned_pids.append(p.pid) except: logger.error('error :: to start spin_process') logger.info(traceback.format_exc()) continue # Send wait signal to zombie processes # for p in pids: # p.join() # Self monitor processes and terminate if any spin_process has run # for longer than CRUCIBLE_TESTS_TIMEOUT p_starts = time() while time() - p_starts <= 20: if any(p.is_alive() for p in pids): # Just to avoid hogging the CPU sleep(.1) else: # All the processes are done, break now. time_to_run = time() - p_starts logger.info( '%s :: %s spin_process/es completed in %.2f seconds' % ( skyline_app, str(settings.PANORAMA_PROCESSES), time_to_run)) break else: # We only enter this if we didn't 'break' above. 
logger.info('%s :: timed out, killing all spin_process processes' % (skyline_app)) for p in pids: p.terminate() # p.join() check_file_name = os.path.basename(str(metric_check_file)) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: check_file_name - %s' % check_file_name) check_file_timestamp = check_file_name.split('.', 1)[0] if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: check_file_timestamp - %s' % str(check_file_timestamp)) check_file_metricname_txt = check_file_name.split('.', 1)[1] if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: check_file_metricname_txt - %s' % check_file_metricname_txt) check_file_metricname = check_file_metricname_txt.replace('.txt', '') if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: check_file_metricname - %s' % check_file_metricname) check_file_metricname_dir = check_file_metricname.replace('.', '/') if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: check_file_metricname_dir - %s' % check_file_metricname_dir) metric_failed_check_dir = '%s/%s/%s' % (failed_checks_dir, check_file_metricname_dir, check_file_timestamp) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) for p in pids: if p.is_alive(): logger.info('%s :: stopping spin_process - %s' % (skyline_app, str(p.is_alive()))) p.join()
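# Note on the check file naming convention relied on by the parsing in run()
# above and in spin_process below: check files are named
# <metric_timestamp>.<metric.name>.txt.  For a hypothetical file named
# 1474254600.stats.statsd.bad_lines_seen.txt, split('.', 1) yields the
# timestamp '1474254600' and 'stats.statsd.bad_lines_seen.txt', the '.txt'
# suffix is then stripped to recover the metric name, and its dots are
# replaced with '/' to build the failed check directory, e.g.
# <failed_checks_dir>/stats/statsd/bad_lines_seen/1474254600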
def spin_process(self, i, metric_check_file): """ Assign a metric anomaly to process. :param i: python process id :param metric_check_file: full path to the metric check file :return: returns True """ child_process_pid = os.getpid() if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: child_process_pid - %s' % str(child_process_pid)) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: processing metric check - %s' % metric_check_file) if not os.path.isfile(str(metric_check_file)): logger.error('error :: file not found - metric_check_file - %s' % (str(metric_check_file))) return check_file_name = os.path.basename(str(metric_check_file)) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: check_file_name - %s' % check_file_name) check_file_timestamp = check_file_name.split('.', 1)[0] if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: check_file_timestamp - %s' % str(check_file_timestamp)) check_file_metricname_txt = check_file_name.split('.', 1)[1] if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: check_file_metricname_txt - %s' % check_file_metricname_txt) check_file_metricname = check_file_metricname_txt.replace('.txt', '') if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: check_file_metricname - %s' % check_file_metricname) check_file_metricname_dir = check_file_metricname.replace('.', '/') if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: check_file_metricname_dir - %s' % check_file_metricname_dir) metric_failed_check_dir = '%s/%s/%s' % (failed_checks_dir, check_file_metricname_dir, check_file_timestamp) failed_check_file = '%s/%s' % (metric_failed_check_dir, check_file_name) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: failed_check_file - %s' % failed_check_file) # Load and validate metric variables try: # @modified 20170101 - Feature #1830: Ionosphere alerts # Bug #1460: panorama check file fails # Panorama check file fails #24 # Get rid of the skyline_functions imp as imp is deprecated in py3 anyway # Use def new_load_metric_vars(self, metric_vars_file): # metric_vars = load_metric_vars(skyline_app, str(metric_check_file)) metric_vars_array = self.new_load_metric_vars(str(metric_check_file)) except: logger.info(traceback.format_exc()) logger.error('error :: failed to load metric variables from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return # Test metric variables # We use a pythonic methodology to test if the variables are defined, # this ensures that if any of the variables are not set for some reason # we can handle unexpected data or situations gracefully and try and # ensure that the process does not hang. 
metric = None try: # metric_vars.metric # metric = str(metric_vars.metric) key = 'metric' value_list = [var_array[1] for var_array in metric_vars_array if var_array[0] == key] metric = str(value_list[0]) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: metric variable - metric - %s' % metric) except: logger.error('error :: failed to read metric variable from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return if not metric: logger.error('error :: failed to load metric variable from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return value = None # @added 20171214 - Bug #2234: panorama metric_vars value check value_valid = None try: # metric_vars.value # value = str(metric_vars.value) key = 'value' value_list = [var_array[1] for var_array in metric_vars_array if var_array[0] == key] value = float(value_list[0]) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: metric variable - value - %s' % (value)) # @added 20171214 - Bug #2234: panorama metric_vars value check value_valid = True except: logger.error('error :: failed to read value variable from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return # @added 20171214 - Bug #2234: panorama metric_vars value check # If value was float of 0.0 then this was interpolated as not set # if not value: if not value_valid: # @added 20171214 - Bug #2234: panorama metric_vars value check # Added exception handling here logger.info(traceback.format_exc()) logger.error('error :: failed to read value variable from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return from_timestamp = None try: # metric_vars.from_timestamp # from_timestamp = str(metric_vars.from_timestamp) key = 'from_timestamp' value_list = [var_array[1] for var_array in metric_vars_array if var_array[0] == key] from_timestamp = int(value_list[0]) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: metric variable - from_timestamp - %s' % from_timestamp) except: # @added 20160822 - Bug #1460: panorama check file fails # Added exception handling here logger.info(traceback.format_exc()) logger.error('error :: failed to read from_timestamp variable from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return if not from_timestamp: logger.error('error :: failed to load from_timestamp variable from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return metric_timestamp = None try: # metric_vars.metric_timestamp # metric_timestamp = str(metric_vars.metric_timestamp) key = 'metric_timestamp' value_list = [var_array[1] for var_array in metric_vars_array if var_array[0] == key] metric_timestamp = int(value_list[0]) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: metric variable - metric_timestamp - %s' % metric_timestamp) except: logger.error('error :: failed to read metric_timestamp variable from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return if not metric_timestamp: logger.error('error :: failed to load metric_timestamp variable from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return algorithms = None try: # metric_vars.algorithms # 
algorithms = metric_vars.algorithms key = 'algorithms' value_list = [var_array[1] for var_array in metric_vars_array if var_array[0] == key] algorithms = value_list[0] if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: metric variable - algorithms - %s' % str(algorithms)) except: logger.error('error :: failed to read algorithms variable from check file setting to all - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return if not algorithms: logger.error('error :: failed to load algorithms variable from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return triggered_algorithms = None try: # metric_vars.triggered_algorithms # triggered_algorithms = metric_vars.triggered_algorithms key = 'triggered_algorithms' value_list = [var_array[1] for var_array in metric_vars_array if var_array[0] == key] triggered_algorithms = value_list[0] if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: metric variable - triggered_algorithms - %s' % str(triggered_algorithms)) except: logger.error('error :: failed to read triggered_algorithms variable from check file setting to all - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return if not triggered_algorithms: logger.error('error :: failed to load triggered_algorithms variable from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return app = None try: # metric_vars.app # app = str(metric_vars.app) key = 'app' value_list = [var_array[1] for var_array in metric_vars_array if var_array[0] == key] app = str(value_list[0]) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: metric variable - app - %s' % app) except: logger.error('error :: failed to read app variable from check file setting to all - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return if not app: logger.error('error :: failed to load app variable from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return source = None try: # metric_vars.source # source = str(metric_vars.source) key = 'source' value_list = [var_array[1] for var_array in metric_vars_array if var_array[0] == key] source = str(value_list[0]) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: metric variable - source - %s' % source) except: logger.error('error :: failed to read source variable from check file setting to all - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return if not source: logger.error('error :: failed to load source variable from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return added_by = None try: # metric_vars.added_by # added_by = str(metric_vars.added_by) key = 'added_by' value_list = [var_array[1] for var_array in metric_vars_array if var_array[0] == key] added_by = str(value_list[0]) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: metric variable - added_by - %s' % added_by) except: logger.error('error :: failed to read added_by variable from check file setting to all - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return if not added_by: logger.error('error :: failed to load added_by variable from check file - %s' % (metric_check_file)) fail_check(skyline_app,
metric_failed_check_dir, str(metric_check_file)) return added_at = None try: # metric_vars.added_at # added_at = str(metric_vars.added_at) key = 'added_at' value_list = [var_array[1] for var_array in metric_vars_array if var_array[0] == key] added_at = str(value_list[0]) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: metric variable - added_at - %s' % added_at) except: logger.error('error :: failed to read added_at variable from check file setting to all - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return if not added_at: logger.error('error :: failed to load added_at variable from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return record_anomaly = True cache_key = '%s.last_check.%s.%s' % (skyline_app, app, metric) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: cache_key - %s.last_check.%s.%s' % ( skyline_app, app, metric)) try: last_check = self.redis_conn.get(cache_key) except Exception as e: logger.error( 'error :: could not query cache_key - %s.last_check.%s.%s - %s' % ( skyline_app, app, metric, e)) last_check = None if last_check: record_anomaly = False logger.info( 'Panorama metric key not expired - %s.last_check.%s.%s' % ( skyline_app, app, metric)) # @added 20160907 - Handle Panorama stampede on restart after not running #26 # Allow to expire check if greater than PANORAMA_CHECK_MAX_AGE if max_age: now = time() anomaly_age = int(now) - int(metric_timestamp) if anomaly_age > max_age_seconds: record_anomaly = False logger.info( 'Panorama check max age exceeded - %s - %s seconds old, older than %s seconds discarding' % ( metric, str(anomaly_age), str(max_age_seconds))) if not record_anomaly: logger.info('not recording anomaly for - %s' % (metric)) if os.path.isfile(str(metric_check_file)): try: os.remove(str(metric_check_file)) logger.info('metric_check_file removed - %s' % str(metric_check_file)) except OSError: pass return # Determine id of something thing def determine_id(table, key, value): """ Get the id of something from Redis or the database and insert a new record if one does not exist for the value. 
:param table: table name :param key: key name :param value: value name :type table: str :type key: str :type value: str :return: int or boolean """ query_cache_key = '%s.mysql_ids.%s.%s.%s' % (skyline_app, table, key, value) determined_id = None redis_determined_id = None if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: query_cache_key - %s' % (query_cache_key)) try: redis_known_id = self.redis_conn.get(query_cache_key) except: redis_known_id = None if redis_known_id: unpacker = Unpacker(use_list=False) unpacker.feed(redis_known_id) redis_determined_id = list(unpacker) if redis_determined_id: determined_id = int(redis_determined_id[0]) if determined_id: if determined_id > 0: return determined_id # Query MySQL # @modified 20170913 - Task #2160: Test skyline with bandit # Added nosec to exclude from bandit tests query = 'select id FROM %s WHERE %s=\'%s\'' % (table, key, value) # nosec # @modified 20170916 - Bug #2166: panorama incorrect mysql_id cache keys # Wrap in except # results = self.mysql_select(query) results = None try: results = self.mysql_select(query) except: logger.error('error :: failed to determine results from - %s' % (query)) determined_id = 0 if results: try: determined_id = int(results[0][0]) except Exception as e: logger.error(traceback.format_exc()) logger.error('error :: determined_id is not an int') determined_id = 0 if determined_id > 0: # Set the key for a week if not redis_determined_id: try: self.redis_conn.setex(query_cache_key, 604800, packb(determined_id)) logger.info('set redis query_cache_key - %s - id: %s' % ( query_cache_key, str(determined_id))) except Exception as e: logger.error(traceback.format_exc()) logger.error('error :: failed to set query_cache_key - %s - id: %s' % ( query_cache_key, str(determined_id))) return int(determined_id) # @added 20170115 - Feature #1854: Ionosphere learn - generations # Added determination of the learn related variables # learn_full_duration_days, learn_valid_ts_older_than, # max_generations and max_percent_diff_from_origin value to the # insert statement if the table is the metrics table. if table == 'metrics' and key == 'metric': # Set defaults learn_full_duration_days = int(settings.IONOSPHERE_LEARN_DEFAULT_FULL_DURATION_DAYS) valid_learning_duration = int(settings.IONOSPHERE_LEARN_DEFAULT_VALID_TIMESERIES_OLDER_THAN_SECONDS) max_generations = int(settings.IONOSPHERE_LEARN_DEFAULT_MAX_GENERATIONS) max_percent_diff_from_origin = float(settings.IONOSPHERE_LEARN_DEFAULT_MAX_PERCENT_DIFF_FROM_ORIGIN) try: use_full_duration, valid_learning_duration, use_full_duration_days, max_generations, max_percent_diff_from_origin = get_ionosphere_learn_details(skyline_app, value) learn_full_duration_days = use_full_duration_days except: logger.error(traceback.format_exc()) logger.error('error :: failed to get_ionosphere_learn_details for %s' % value) logger.info('metric learn details determined for %s' % value) logger.info('learn_full_duration_days :: %s days' % (str(learn_full_duration_days))) logger.info('valid_learning_duration :: %s seconds' % (str(valid_learning_duration))) logger.info('max_generations :: %s' % (str(max_generations))) logger.info('max_percent_diff_from_origin :: %s' % (str(max_percent_diff_from_origin))) # INSERT because no known id # @modified 20170115 - Feature #1854: Ionosphere learn - generations # Added the learn_full_duration_days, learn_valid_ts_older_than, # max_generations and max_percent_diff_from_origin value to the # insert statement if the table is the metrics table. 
# insert_query = 'insert into %s (%s) VALUES (\'%s\')' % (table, key, value) if table == 'metrics' and key == 'metric': # @modified 20170913 - Task #2160: Test skyline with bandit # Added nosec to exclude from bandit tests insert_query_string = '%s (%s, learn_full_duration_days, learn_valid_ts_older_than, max_generations, max_percent_diff_from_origin) VALUES (\'%s\', %s, %s, %s, %s)' % ( table, key, value, str(learn_full_duration_days), str(valid_learning_duration), str(max_generations), str(max_percent_diff_from_origin)) insert_query = 'insert into %s' % insert_query_string # nosec else: insert_query = 'insert into %s (%s) VALUES (\'%s\')' % (table, key, value) # nosec logger.info('inserting %s into %s table' % (value, table)) try: results = self.mysql_insert(insert_query) except: logger.error(traceback.format_exc()) logger.error('error :: failed to determine the id of %s from the insert' % (value)) raise determined_id = 0 if results: determined_id = int(results) else: logger.error('error :: results not set') raise if determined_id > 0: # Set the key for a week if not redis_determined_id: try: self.redis_conn.setex(query_cache_key, 604800, packb(determined_id)) logger.info('set redis query_cache_key - %s - id: %s' % ( query_cache_key, str(determined_id))) except Exception as e: logger.error(traceback.format_exc()) logger.error('%s' % str(e)) logger.error('error :: failed to set query_cache_key - %s - id: %s' % ( query_cache_key, str(determined_id))) return determined_id logger.error('error :: failed to determine the inserted id for %s' % value) return False try: added_by_host_id = determine_id('hosts', 'host', added_by) except: logger.error('error :: failed to determine id of %s' % (added_by)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return False try: app_id = determine_id('apps', 'app', app) except: logger.error('error :: failed to determine id of %s' % (app)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return False try: source_id = determine_id('sources', 'source', source) except: logger.error('error :: failed to determine id of %s' % (source)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return False try: metric_id = determine_id('metrics', 'metric', metric) except: logger.error('error :: failed to determine id of %s' % (metric)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return False algorithms_ids_csv = '' for algorithm in algorithms: try: algorithm_id = determine_id('algorithms', 'algorithm', algorithm) except: logger.error('error :: failed to determine id of %s' % (algorithm)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return False if algorithms_ids_csv == '': algorithms_ids_csv = str(algorithm_id) else: new_algorithms_ids_csv = '%s,%s' % (algorithms_ids_csv, str(algorithm_id)) algorithms_ids_csv = new_algorithms_ids_csv triggered_algorithms_ids_csv = '' for triggered_algorithm in triggered_algorithms: try: triggered_algorithm_id = determine_id('algorithms', 'algorithm', triggered_algorithm) except: logger.error('error :: failed to determine id of %s' % (triggered_algorithm)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return False if triggered_algorithms_ids_csv == '': triggered_algorithms_ids_csv = str(triggered_algorithm_id) else: new_triggered_algorithms_ids_csv = '%s,%s' % ( triggered_algorithms_ids_csv, str(triggered_algorithm_id)) triggered_algorithms_ids_csv = new_triggered_algorithms_ids_csv 
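        # Note (added commentary): algorithms_ids_csv and triggered_algorithms_ids_csv
        # are built up as comma separated strings of the ids returned by
        # determine_id(), e.g. '1,2,3' and '2', and these strings are what get
        # written to the algorithms_run and triggered_algorithms columns in the
        # anomalies insert below.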
        logger.info('inserting anomaly')

        try:
            full_duration = int(metric_timestamp) - int(from_timestamp)
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: full_duration - %s' % str(full_duration))
        except:
            logger.error('error :: failed to determine full_duration')
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return False

        try:
            anomalous_datapoint = round(float(value), 6)
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: anomalous_datapoint - %s' % str(anomalous_datapoint))
        except:
            logger.error('error :: failed to determine anomalous_datapoint')
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return False

        try:
            columns = '%s, %s, %s, %s, %s, %s, %s, %s, %s' % (
                'metric_id', 'host_id', 'app_id', 'source_id',
                'anomaly_timestamp', 'anomalous_datapoint', 'full_duration',
                'algorithms_run', 'triggered_algorithms')
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: columns - %s' % str(columns))
        except:
            logger.error('error :: failed to construct columns string')
            logger.info(traceback.format_exc())
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return False

        try:
            # @modified 20170913 - Task #2160: Test skyline with bandit
            # Added nosec to exclude from bandit tests
            query_string = '(%s) VALUES (%d, %d, %d, %d, %s, %.6f, %d, \'%s\', \'%s\')' % (
                columns, metric_id, added_by_host_id, app_id, source_id,
                metric_timestamp, anomalous_datapoint, full_duration,
                algorithms_ids_csv, triggered_algorithms_ids_csv)
            query = 'insert into anomalies %s' % query_string  # nosec
        except:
            logger.error('error :: failed to construct insert query')
            logger.info(traceback.format_exc())
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return False

        if settings.ENABLE_PANORAMA_DEBUG:
            logger.info('debug :: anomaly insert - %s' % str(query))

        try:
            anomaly_id = self.mysql_insert(query)
            logger.info('anomaly id - %d - created for %s at %s' % (
                anomaly_id, metric, metric_timestamp))
        except:
            logger.error('error :: failed to insert anomaly for %s at %s' % (
                metric, metric_timestamp))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return False

        # Set anomaly record cache key
        try:
            self.redis_conn.setex(
                cache_key, settings.PANORAMA_EXPIRY_TIME, packb(value))
            logger.info('set cache_key - %s.last_check.%s.%s - %s' % (
                skyline_app, app, metric, str(settings.PANORAMA_EXPIRY_TIME)))
        except Exception as e:
            logger.error(
                'error :: could not set cache_key - %s.last_check.%s.%s - %s' % (
                    skyline_app, app, metric, e))

        if os.path.isfile(str(metric_check_file)):
            try:
                os.remove(str(metric_check_file))
                logger.info('metric_check_file removed - %s' % str(metric_check_file))
            except OSError:
                pass

        return anomaly_id
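    # Minimal sketch (added, illustrative only - the key name and id are hypothetical):
    # determine_id() above caches each resolved database id in Redis for a week as a
    # msgpack packed value, so a cached lookup round trips roughly like this:
    #
    #     from msgpack import Unpacker, packb
    #     redis_conn.setex('panorama.mysql_ids.apps.app.analyzer', 604800, packb(2))
    #     packed = redis_conn.get('panorama.mysql_ids.apps.app.analyzer')
    #     unpacker = Unpacker(use_list=False)
    #     unpacker.feed(packed)
    #     app_id = int(list(unpacker)[0])  # -> 2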
def spin_process(self, i, metric_check_file): """ Assign a metric anomaly to process. :param i: python process id :param metric_check_file: full path to the metric check file :return: returns True """ child_process_pid = os.getpid() if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: child_process_pid - %s' % str(child_process_pid)) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: processing metric check - %s' % metric_check_file) if not os.path.isfile(str(metric_check_file)): logger.error('error :: file not found - metric_check_file - %s' % (str(metric_check_file))) return check_file_name = os.path.basename(str(metric_check_file)) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: check_file_name - %s' % check_file_name) check_file_timestamp = check_file_name.split('.', 1)[0] if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: check_file_timestamp - %s' % str(check_file_timestamp)) check_file_metricname_txt = check_file_name.split('.', 1)[1] if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: check_file_metricname_txt - %s' % check_file_metricname_txt) check_file_metricname = check_file_metricname_txt.replace('.txt', '') if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: check_file_metricname - %s' % check_file_metricname) check_file_metricname_dir = check_file_metricname.replace('.', '/') if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: check_file_metricname_dir - %s' % check_file_metricname_dir) metric_failed_check_dir = '%s/%s/%s' % (failed_checks_dir, check_file_metricname_dir, check_file_timestamp) failed_check_file = '%s/%s' % (metric_failed_check_dir, check_file_name) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: failed_check_file - %s' % failed_check_file) # Load and validate metric variables try: metric_vars = load_metric_vars(skyline_app, str(metric_check_file)) except: logger.info(traceback.format_exc()) logger.error('error :: failed to load metric variables from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return # Test metric variables # We use a pythonic methodology to test if the variables are defined, # this ensures that if any of the variables are not set for some reason # we can handle unexpected data or situations gracefully and try and # ensure that the process does not hang. 
try: metric_vars.metric metric = str(metric_vars.metric) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: metric variable - metric - %s' % metric) except: logger.error('error :: failed to read metric variable from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return try: metric_vars.value value = str(metric_vars.value) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: metric variable - value - %s' % (value)) except: logger.error('error :: failed to read value variable from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return try: metric_vars.from_timestamp from_timestamp = str(metric_vars.from_timestamp) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: metric variable - from_timestamp - %s' % from_timestamp) except: # @added 20160822 - Bug #1460: panorama check file fails # Added exception handling here logger.info(traceback.format_exc()) logger.error('error :: failed to read from_timestamp variable from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return try: metric_vars.metric_timestamp metric_timestamp = str(metric_vars.metric_timestamp) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: metric variable - metric_timestamp - %s' % metric_timestamp) except: logger.error('error :: failed to read metric_timestamp variable from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return try: metric_vars.algorithms algorithms = metric_vars.algorithms if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: metric variable - algorithms - %s' % str(algorithms)) except: logger.error('error :: failed to read algorithms variable from check file setting to all - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return try: metric_vars.triggered_algorithms triggered_algorithms = metric_vars.triggered_algorithms if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: metric variable - triggered_algorithms - %s' % str(triggered_algorithms)) except: logger.error('error :: failed to read triggered_algorithms variable from check file setting to all - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return try: metric_vars.app app = str(metric_vars.app) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: metric variable - app - %s' % app) except: logger.error('error :: failed to read app variable from check file setting to all - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return try: metric_vars.source source = str(metric_vars.source) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: metric variable - source - %s' % source) except: logger.error('error :: failed to read source variable from check file setting to all - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return try: metric_vars.added_by added_by = str(metric_vars.added_by) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: metric variable - added_by - %s' % added_by) except: logger.error('error :: failed to read added_by variable from check file setting to all - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return try: metric_vars.added_at added_at = str(metric_vars.added_at) if
settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: metric variable - added_at - %s' % added_at) except: logger.error('error :: failed to read added_at variable from check file setting to all' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return record_anomaly = True cache_key = '%s.last_check.%s.%s' % (skyline_app, app, metric) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: cache_key - %s.last_check.%s.%s' % ( skyline_app, app, metric)) try: last_check = self.redis_conn.get(cache_key) except Exception as e: logger.error( 'error :: could not query cache_key - %s.last_check.%s.%s - %s' % ( skyline_app, app, metric, e)) last_check = None if last_check: record_anomaly = False logger.info( 'Panorama metric key not expired - %s.last_check.%s.%s' % ( skyline_app, app, metric)) # @added 20160907 - Handle Panorama stampede on restart after not running #26 # Allow to expire check if greater than PANORAMA_CHECK_MAX_AGE if max_age: now = time() anomaly_age = int(now) - int(metric_timestamp) if anomaly_age > max_age_seconds: record_anomaly = False logger.info( 'Panorama check max age exceeded - %s - %s seconds old, older than %s seconds discarding' % ( metric, str(anomaly_age), str(max_age_seconds))) if not record_anomaly: logger.info('not recording anomaly for - %s' % (metric)) if os.path.isfile(str(metric_check_file)): try: os.remove(str(metric_check_file)) logger.info('metric_check_file removed - %s' % str(metric_check_file)) except OSError: pass return # Determine id of something thing def determine_id(table, key, value): """ Get the id of something from Redis or the database and insert a new record if one does not exist for the value. :param table: table name :param key: key name :param value: value name :type table: str :type key: str :type value: str :return: int or boolean """ query_cache_key = '%s.mysql_ids.%s.%s.%s' % (skyline_app, table, key, value) determined_id = None redis_determined_id = None if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: query_cache_key - %s' % (query_cache_key)) try: redis_known_id = self.redis_conn.get(query_cache_key) except: redis_known_id = None if redis_known_id: unpacker = Unpacker(use_list=False) unpacker.feed(redis_known_id) redis_determined_id = list(unpacker) if redis_determined_id: determined_id = int(redis_determined_id[0]) if determined_id: if determined_id > 0: return determined_id # Query MySQL query = 'select id FROM %s WHERE %s=\'%s\'' % (table, key, value) results = self.mysql_select(query) determined_id = 0 if results: determined_id = int(results[0][0]) if determined_id > 0: # Set the key for a week if not redis_determined_id: try: self.redis_conn.setex(query_cache_key, 604800, packb(determined_id)) logger.info('set redis query_cache_key - %s - id: %s' % ( query_cache_key, str(determined_id))) except Exception as e: logger.error(traceback.format_exc()) logger.error('error :: failed to set query_cache_key - %s - id: %s' % ( query_cache_key, str(determined_id))) return int(determined_id) # INSERT because no known id insert_query = 'insert into %s (%s) VALUES (\'%s\')' % (table, key, value) logger.info('inserting %s into %s table' % (value, table)) try: results = self.mysql_insert(insert_query) except: logger.error(traceback.format_exc()) logger.error('error :: failed to determine the id of %s from the insert' % (value)) raise determined_id = 0 if results: determined_id = int(results) else: logger.error('error :: results not set') raise if determined_id > 0: # Set the key for a 
week if not redis_determined_id: try: self.redis_conn.setex(query_cache_key, 604800, packb(determined_id)) logger.info('set redis query_cache_key - %s - id: %s' % ( query_cache_key, str(determined_id))) except Exception as e: logger.error(traceback.format_exc()) logger.error('%s' % str(e)) logger.error('error :: failed to set query_cache_key - %s - id: %s' % ( query_cache_key, str(determined_id))) return determined_id logger.error('error :: failed to determine the inserted id for %s' % value) return False try: added_by_host_id = determine_id('hosts', 'host', added_by) except: logger.error('error :: failed to determine id of %s' % (added_by)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return False try: app_id = determine_id('apps', 'app', app) except: logger.error('error :: failed to determine id of %s' % (app)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return False try: source_id = determine_id('sources', 'source', source) except: logger.error('error :: failed to determine id of %s' % (source)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return False try: metric_id = determine_id('metrics', 'metric', metric) except: logger.error('error :: failed to determine id of %s' % (metric)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return False algorithms_ids_csv = '' for algorithm in algorithms: try: algorithm_id = determine_id('algorithms', 'algorithm', algorithm) except: logger.error('error :: failed to determine id of %s' % (algorithm)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return False if algorithms_ids_csv == '': algorithms_ids_csv = str(algorithm_id) else: new_algorithms_ids_csv = '%s,%s' % (algorithms_ids_csv, str(algorithm_id)) algorithms_ids_csv = new_algorithms_ids_csv triggered_algorithms_ids_csv = '' for triggered_algorithm in triggered_algorithms: try: triggered_algorithm_id = determine_id('algorithms', 'algorithm', triggered_algorithm) except: logger.error('error :: failed to determine id of %s' % (triggered_algorithm)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return False if triggered_algorithms_ids_csv == '': triggered_algorithms_ids_csv = str(triggered_algorithm_id) else: new_triggered_algorithms_ids_csv = '%s,%s' % ( triggered_algorithms_ids_csv, str(triggered_algorithm_id)) triggered_algorithms_ids_csv = new_triggered_algorithms_ids_csv logger.info('inserting anomaly') try: full_duration = int(metric_timestamp) - int(from_timestamp) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: full_duration - %s' % str(full_duration)) except: logger.error('error :: failed to determine full_duration') fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return False try: anomalous_datapoint = round(float(value), 6) if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: anomalous_datapoint - %s' % str(anomalous_datapoint)) except: logger.error('error :: failed to determine anomalous_datapoint') fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return False try: columns = '%s, %s, %s, %s, %s, %s, %s, %s, %s' % ( 'metric_id', 'host_id', 'app_id', 'source_id', 'anomaly_timestamp', 'anomalous_datapoint', 'full_duration', 'algorithms_run', 'triggered_algorithms') if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: columns - %s' % str(columns)) except: logger.error('error :: failed to construct columns string') logger.info(traceback.format_exc()) 
fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return False try: query = 'insert into anomalies (%s) VALUES (%d, %d, %d, %d, %s, %.6f, %d, \'%s\', \'%s\')' % ( columns, metric_id, added_by_host_id, app_id, source_id, metric_timestamp, anomalous_datapoint, full_duration, algorithms_ids_csv, triggered_algorithms_ids_csv) except: logger.error('error :: failed to construct insert query') logger.info(traceback.format_exc()) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return False if settings.ENABLE_PANORAMA_DEBUG: logger.info('debug :: anomaly insert - %s' % str(query)) try: anomaly_id = self.mysql_insert(query) logger.info('anomaly id - %d - created for %s at %s' % ( anomaly_id, metric, metric_timestamp)) except: logger.error('error :: failed to insert anomaly for %s at %s' % ( metric, metric_timestamp)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return False # Set anomaly record cache key try: self.redis_conn.setex( cache_key, settings.PANORAMA_EXPIRY_TIME, packb(value)) logger.info('set cache_key - %s.last_check.%s.%s - %s' % ( skyline_app, app, metric, str(settings.PANORAMA_EXPIRY_TIME))) except Exception as e: logger.error( 'error :: could not set cache_key - %s.last_check.%s.%s - %s' % ( skyline_app, app, metric, e)) if os.path.isfile(str(metric_check_file)): try: os.remove(str(metric_check_file)) logger.info('metric_check_file removed - %s' % str(metric_check_file)) except OSError: pass return anomaly_id
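    # Illustrative note (added; the metric name in the example is hypothetical):
    # the '<skyline_app>.last_check.<app>.<metric>' Redis key set at the end of
    # spin_process is the same key read at the top of the function, so another
    # check file for the same app and metric arriving within
    # settings.PANORAMA_EXPIRY_TIME seconds is discarded rather than recorded
    # twice, e.g.
    #     self.redis_conn.setex('panorama.last_check.analyzer.stats.example.requests',
    #                           settings.PANORAMA_EXPIRY_TIME, packb(value))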
def spin_process(self, i, run_timestamp): """ Assign a metric for a process to analyze. """ # Discover metric to analyze metric_var_files = [ f for f in listdir(settings.MIRAGE_CHECK_PATH) if isfile(join(settings.MIRAGE_CHECK_PATH, f)) ] # Check if this process is unnecessary if len(metric_var_files) == 0: return metric_var_files_sorted = sorted(metric_var_files) metric_check_file = '%s/%s' % (settings.MIRAGE_CHECK_PATH, str(metric_var_files_sorted[0])) # Load metric variables # @modified 20160822 - Bug #1460: panorama check file fails # Changed to panorama style skyline_functions load_metric_vars # self.load_metric_vars(metric_check_file) # Load and validate metric variables try: metric_vars = load_metric_vars(skyline_app, str(metric_check_file)) except: logger.info(traceback.format_exc()) logger.error( 'error :: failed to load metric variables from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return # Test metric variables if len(metric_vars.metric) == 0: return else: metric = metric_vars.metric metric_name = ['metric_name', metric_vars.metric] self.metric_variables.append(metric_name) if len(metric_vars.value) == 0: return else: metric_value = ['metric_value', metric_vars.value] self.metric_variables.append(metric_value) if len(metric_vars.hours_to_resolve) == 0: return else: hours_to_resolve = [ 'hours_to_resolve', metric_vars.hours_to_resolve ] self.metric_variables.append(hours_to_resolve) if len(metric_vars.metric_timestamp) == 0: return else: metric_timestamp = [ 'metric_timestamp', metric_vars.metric_timestamp ] self.metric_variables.append(metric_timestamp) # Ignore any metric check with a timestamp greater than 10 minutes ago int_metric_timestamp = int(metric_vars.metric_timestamp) int_run_timestamp = int(run_timestamp) metric_timestamp_age = int_run_timestamp - int_metric_timestamp if metric_timestamp_age > settings.MIRAGE_STALE_SECONDS: logger.info( 'stale check :: %s check request is %s seconds old - discarding' % (metric_vars.metric, metric_timestamp_age)) # Remove metric check file # try: # os.remove(metric_check_file) # except OSError: # pass # return if os.path.exists(metric_check_file): os.remove(metric_check_file) logger.info('removed %s' % (metric_check_file)) else: logger.info('could not remove %s' % (metric_check_file)) # Calculate hours second order resolution to seconds second_order_resolution_seconds = int( metric_vars.hours_to_resolve) * 3600 # Calculate graphite from and until parameters from the metric timestamp graphite_until = datetime.datetime.fromtimestamp( int(metric_vars.metric_timestamp)).strftime('%H:%M_%Y%m%d') int_second_order_resolution_seconds = int( second_order_resolution_seconds) second_resolution_timestamp = int_metric_timestamp - int_second_order_resolution_seconds graphite_from = datetime.datetime.fromtimestamp( int(second_resolution_timestamp)).strftime('%H:%M_%Y%m%d') # Remove any old json file related to the metric metric_json_file = '%s/%s/%s.json' % (settings.MIRAGE_DATA_FOLDER, str(metric_vars.metric), str(metric_vars.metric)) try: os.remove(metric_json_file) except OSError: pass # Get data from graphite logger.info( 'retrieve data :: surfacing %s timeseries from graphite for %s seconds' % (metric_vars.metric, second_order_resolution_seconds)) self.surface_graphite_metric_data(metric_vars.metric, graphite_from, graphite_until) # Check there is a json timeseries file to test if not os.path.isfile(metric_json_file): logger.error( 'error :: retrieve failed - failed to 
surface %s timeseries from graphite' % (metric_vars.metric)) # Remove metric check file try: os.remove(metric_check_file) except OSError: pass return else: logger.info('retrieved data :: for %s at %s seconds' % (metric_vars.metric, second_order_resolution_seconds)) # Make process-specific dicts exceptions = defaultdict(int) anomaly_breakdown = defaultdict(int) self.check_if_parent_is_alive() with open((metric_json_file), 'r') as f: timeseries = json.loads(f.read()) logger.info('data points surfaced :: %s' % (len(timeseries))) try: logger.info('analyzing :: %s at %s seconds' % (metric_vars.metric, second_order_resolution_seconds)) anomalous, ensemble, datapoint = run_selected_algorithm( timeseries, metric_vars.metric, second_order_resolution_seconds) # If it's anomalous, add it to list if anomalous: base_name = metric.replace(settings.FULL_NAMESPACE, '', 1) anomalous_metric = [datapoint, base_name] self.anomalous_metrics.append(anomalous_metric) logger.info('anomaly detected :: %s with %s' % (metric_vars.metric, metric_vars.value)) # It runs so fast, this allows us to process 30 anomalies/min sleep(2) # Get the anomaly breakdown - who returned True? triggered_algorithms = [] for index, value in enumerate(ensemble): if value: algorithm = settings.MIRAGE_ALGORITHMS[index] anomaly_breakdown[algorithm] += 1 triggered_algorithms.append(algorithm) # If Crucible or Panorama are enabled determine details determine_anomaly_details = False if settings.ENABLE_CRUCIBLE and settings.MIRAGE_CRUCIBLE_ENABLED: determine_anomaly_details = True if settings.PANORAMA_ENABLED: determine_anomaly_details = True if determine_anomaly_details: metric_timestamp = str(int(timeseries[-1][0])) from_timestamp = str(int(timeseries[1][0])) timeseries_dir = base_name.replace('.', '/') # If Panorama is enabled - create a Panorama check if settings.PANORAMA_ENABLED: if not os.path.exists(settings.PANORAMA_CHECK_PATH): if python_version == 2: mode_arg = int('0755') if python_version == 3: mode_arg = mode = 0o755 os.makedirs(settings.PANORAMA_CHECK_PATH, mode_arg) # Note: # The values are enclosed is single quoted intentionally # as the imp.load_source used results in a shift in the # decimal position when double quoted, e.g. 
# value = "5622.0" gets imported as # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2 # single quoting results in the desired, # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0 added_at = str(int(time())) source = 'graphite' panaroma_anomaly_data = 'metric = \'%s\'\n' \ 'value = \'%s\'\n' \ 'from_timestamp = \'%s\'\n' \ 'metric_timestamp = \'%s\'\n' \ 'algorithms = %s\n' \ 'triggered_algorithms = %s\n' \ 'app = \'%s\'\n' \ 'source = \'%s\'\n' \ 'added_by = \'%s\'\n' \ 'added_at = \'%s\'\n' \ % (base_name, str(datapoint), from_timestamp, metric_timestamp, str(settings.MIRAGE_ALGORITHMS), triggered_algorithms, skyline_app, source, this_host, added_at) # Create an anomaly file with details about the anomaly panaroma_anomaly_file = '%s/%s.%s.txt' % ( settings.PANORAMA_CHECK_PATH, added_at, base_name) try: write_data_to_file(skyline_app, panaroma_anomaly_file, 'w', panaroma_anomaly_data) logger.info('added panorama anomaly file :: %s' % (panaroma_anomaly_file)) except: logger.error( 'error :: failed to add panorama anomaly file :: %s' % (panaroma_anomaly_file)) logger.info(traceback.format_exc()) # If crucible is enabled - save timeseries and create a # crucible check if settings.ENABLE_CRUCIBLE and settings.MIRAGE_CRUCIBLE_ENABLED: metric_timestamp = str(int(timeseries[-1][0])) from_timestamp = str(int(timeseries[1][0])) timeseries_dir = base_name.replace('.', '/') crucible_anomaly_dir = settings.CRUCIBLE_DATA_FOLDER + '/' + timeseries_dir + '/' + metric_timestamp if not os.path.exists(crucible_anomaly_dir): if python_version == 2: mode_arg = int('0755') if python_version == 3: mode_arg = mode = 0o755 os.makedirs(crucible_anomaly_dir, mode_arg) # Note: # The value is enclosed is single quoted intentionally # as the imp.load_source used in crucible results in a # shift in the decimal position when double quoted, e.g. 
# value = "5622.0" gets imported as # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2 # single quoting results in the desired, # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0 crucible_anomaly_data = 'metric = \'%s\'\n' \ 'value = \'%s\'\n' \ 'from_timestamp = \'%s\'\n' \ 'metric_timestamp = \'%s\'\n' \ 'algorithms = %s\n' \ 'triggered_algorithms = %s\n' \ 'anomaly_dir = \'%s\'\n' \ 'graphite_metric = True\n' \ 'run_crucible_tests = False\n' \ 'added_by = \'%s\'\n' \ 'added_at = \'%s\'\n' \ % (base_name, str(datapoint), from_timestamp, metric_timestamp, str(settings.MIRAGE_ALGORITHMS), triggered_algorithms, crucible_anomaly_dir, skyline_app, metric_timestamp) # Create an anomaly file with details about the anomaly crucible_anomaly_file = '%s/%s.txt' % ( crucible_anomaly_dir, base_name) try: write_data_to_file(skyline_app, crucible_anomaly_file, 'w', crucible_anomaly_data) logger.info('added crucible anomaly file :: %s' % (crucible_anomaly_file)) except: logger.error( 'error :: failed to add crucible anomaly file :: %s' % (crucible_anomaly_file)) logger.info(traceback.format_exc()) # Create timeseries json file with the timeseries json_file = '%s/%s.json' % (crucible_anomaly_dir, base_name) timeseries_json = str(timeseries).replace('[', '(').replace( ']', ')') try: write_data_to_file(skyline_app, json_file, 'w', timeseries_json) logger.info('added crucible timeseries file :: %s' % (json_file)) except: logger.error( 'error :: failed to add crucible timeseries file :: %s' % (json_file)) logger.info(traceback.format_exc()) # Create a crucible check file crucible_check_file = '%s/%s.%s.txt' % ( settings.CRUCIBLE_CHECK_PATH, metric_timestamp, base_name) try: write_data_to_file(skyline_app, crucible_check_file, 'w', crucible_anomaly_data) logger.info('added crucible check :: %s,%s' % (base_name, metric_timestamp)) except: logger.error( 'error :: failed to add crucible check file :: %s' % (crucible_check_file)) logger.info(traceback.format_exc()) else: base_name = metric.replace(settings.FULL_NAMESPACE, '', 1) not_anomalous_metric = [datapoint, base_name] self.not_anomalous_metrics.append(not_anomalous_metric) logger.info('not anomalous :: %s with %s' % (metric_vars.metric, metric_vars.value)) # It could have been deleted by the Roomba except TypeError: exceptions['DeletedByRoomba'] += 1 logger.info('exceptions :: DeletedByRoomba') except TooShort: exceptions['TooShort'] += 1 logger.info('exceptions :: TooShort') except Stale: exceptions['Stale'] += 1 logger.info('exceptions :: Stale') except Boring: exceptions['Boring'] += 1 logger.info('exceptions :: Boring') except: exceptions['Other'] += 1 logger.info('exceptions :: Other') logger.info(traceback.format_exc()) # Add values to the queue so the parent process can collate for key, value in anomaly_breakdown.items(): self.mirage_anomaly_breakdown_q.put((key, value)) for key, value in exceptions.items(): self.mirage_exceptions_q.put((key, value)) metric_var_files = [] timeseries = [] # Remove metric check file try: os.remove(metric_check_file) except OSError: pass