def check_dask_settings(cnf=None):
    '''
    Parse command line arguments and the EDE config file for Dask scheduler settings.

    :param cnf: -> command line argument list (as passed to getopt); may be None
    :return: -> tuple (SchedulerEndpoint, Scale, SchedulerPort, EnforceCheck),
        each element 0 when the settings could not be read
    '''
    cnf_file = 'ede_config.yaml'  # default; previously unbound when cnf was given without -f
    try:
        # original passed cnf straight through, raising an uncaught TypeError when cnf is None
        opts, args = getopt.getopt(
            cnf or [], "he:tf:m:vx:d:lq:",
            ["endpoint=", "file=", "method=", "export=", "detect=", "query="])
    except getopt.GetoptError:
        logger.warning(
            '[%s] : [WARN] Invalid argument received exiting',
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
        print("ede.py -f <filelocation>, -t -m <method> -v -x <modelname>")
        sys.exit(0)
    for opt, arg in opts:
        if opt in ("-f", "--file"):
            cnf_file = arg
    try:
        with open(cnf_file) as cf:
            # NOTE(review): unsafe_load executes arbitrary YAML tags;
            # only acceptable because the config file is operator supplied/trusted
            readCnf = yaml.unsafe_load(cf)
        SchedulerEndpoint = readCnf['Connector']['Dask']['SchedulerEndpoint']
        Scale = readCnf['Connector']['Dask']['Scale']
        SchedulerPort = readCnf['Connector']['Dask']['SchedulerPort']
        EnforceCheck = readCnf['Connector']['Dask']['EnforceCheck']
    except Exception:
        # missing file or missing keys -> signal "not configured" with zeros
        SchedulerEndpoint = 0
        Scale = 0
        SchedulerPort = 0
        EnforceCheck = 0
    return SchedulerEndpoint, Scale, SchedulerPort, EnforceCheck
def dask_clusterMethod(self, cluster_method, mname, data):
    '''
    Fit a user defined clustering method, preferring the Dask joblib backend.

    :param cluster_method: -> sklearn compatible estimator instance to fit
    :param mname: -> name under which the fitted model is serialized
    :param data: -> training dataframe
    :return: -> fitted estimator
    '''
    try:
        logger.info('[{}] : [INFO] Loading Clustering method {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(cluster_method)))
        for k, v in cluster_method.get_params().items():
            logger.info('[{}] : [INFO] Method parameter {} set to {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
        try:
            # prefer the Dask distributed backend; fall back to the default backend on failure
            with joblib.parallel_backend('dask'):
                logger.info('[{}] : [INFO] Using Dask backend for user defined method'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                clf = cluster_method.fit(data)
        except Exception as inst:
            # typo fix: "backedn" -> "backend" in both messages
            logger.error('[{}] : [ERROR] Failed to fit user defined method with dask backend with {} and {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
            logger.warning('[{}] : [WARN] using default process based backend for user defined method'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
            clf = cluster_method.fit(data)
    except Exception as inst:
        logger.error('[{}] : [ERROR] Failed to fit {} with {} and {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            type(cluster_method), type(inst), inst.args))
        sys.exit(1)
    predictions = clf.predict(data)
    logger.debug('[{}] : [DEBUG] Predicted Anomaly Array {}'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), predictions))
    fname = str(clf).split('(')[0]  # estimator class name, e.g. "KMeans"
    self.__serializemodel(clf, fname, mname)
    return clf
def scale(self, data, scaler_type=None, rindex='time'):  # todo, integrate
    '''
    Scale data with a user defined sklearn scaler and persist the scaler instance.

    :param data: -> dataframe to scale
    :param scaler_type: -> single entry dict {ScalerName: {param: value, ...}};
        when falsy (None/empty) no scaling is done and data is returned unchanged
    :param rindex: -> column name restored as index on the scaled frame
    :return: -> scaled dataframe (or the original data when no scaler is set)
    '''
    if not scaler_type:
        # NOTE(review): the original also contained an "is None -> default StandardScaler"
        # branch after this guard; it was unreachable (None is falsy) and has been removed.
        logger.warning('[{}] : [WARN] No data scaling used!'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        return data
    scaler_name = list(scaler_type.keys())[-1]
    scaler_attr = list(scaler_type.values())[-1]
    logger.info('[{}] : [INFO] Scaler set to {} with parameters {}.'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), scaler_name, scaler_attr))
    try:
        # scaler class is resolved by name from the configured scaler module
        sc_mod = importlib.import_module(self.scaler_mod)
        scaler_instance = getattr(sc_mod, scaler_name)
        scaler = scaler_instance(**scaler_attr)
    except Exception as inst:
        # include exception details, previously lost
        logger.error('[{}] : [ERROR] Error while initializing scaler {} with {} and {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            scaler_name, type(inst), inst.args))
        sys.exit(2)
    # Fit and transform data
    logger.info('[{}] : [INFO] Scaling data ...'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
    scaled_data = scaler.fit_transform(data)
    # Transform numpy array back into a dataframe, re-adding columns and index
    df_scaled = pd.DataFrame(scaled_data, columns=data.columns)
    df_scaled[rindex] = list(data.index)
    df_scaled.set_index(rindex, inplace=True)
    scaler_file = '{}.scaler'.format(scaler_name)
    logger.info('[{}] : [INFO] Saving scaler instance {} ...'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), scaler_file))
    scale_file_location = os.path.join(self.dataDir, scaler_file)
    joblib.dump(scaler, filename=scale_file_location)
    return df_scaled
def pushAnomalyKafka(self, body):
    '''
    Publish an anomaly report to the configured Kafka topic.

    :param body: -> anomaly payload to send
    :return: -> 0 always (send failures are only logged)
    '''
    if self.producer is None:
        # reporting is optional; warn and bail out when no producer is configured
        logger.warning(
            '[{}] : [WARN] Kafka reporter not defined, skipping reporting'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        return 0
    try:
        self.producer.send(self.prKafkaTopic, body)
        # self.producer.flush()
        logger.info(
            '[{}] : [INFO] Anomalies reported to kafka topic {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                self.prKafkaTopic))
    except Exception as inst:
        logger.error(
            '[{}] : [ERROR] Failed to report anomalies to kafka topic {} with {} and {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                self.prKafkaTopic, type(inst), inst.args))
    return 0
def dask_sdbscanTrain(self, settings, mname, data, scaler=None):
    '''
    Train a DBSCAN clusterer, optionally scaling the data first, and serialize it.

    :param data: -> dataframe with data
    :param settings: -> settings dictionary
    :param mname: -> name of serialized clusterer
    :param scaler: -> scaler to use on data
    :return: -> clusterer
    :example settings: -> {eps:0.9, min_samples:10, metric:'euclidean',
        algorithm:'auto', leaf_size:30, p:0.2, n_jobs:1}
    '''
    if scaler is None:
        logger.warning('[{}] : [WARN] Scaler not defined'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
    else:
        logger.info('[{}] : [INFO] Scaling data ...'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        data = scaler.fit_transform(data)
    if not settings:  # covers both None and {} (original redundantly re-tested "is None")
        logger.warning('[{}] : [WARN] No DBScan parameters defined using default'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        settings = {}
    else:
        for k, v in settings.items():
            logger.info('[{}] : [INFO] DBScan parameter {} set to {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
    try:
        db = DBSCAN(**settings).fit(data)
    except Exception as inst:
        # log tag fixed from [INFO] to [ERROR]; typo "instanciate" -> "instantiate"
        logger.error('[{}] : [ERROR] Failed to instantiate DBScan with {} and {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            type(inst), inst.args))
        sys.exit(1)
    labels = db.labels_
    logger.info('[{}] : [INFO] DBScan labels: {} '.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), labels))
    # noise points are labeled -1 and do not count as a cluster
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    logger.info('[{}] : [INFO] DBScan estimated number of clusters {} '.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), n_clusters_))
    self.__serializemodel(db, 'sdbscan', mname)
    return db
def prtoDF(self, data, checkpoint=False, verbose=False, index=None, detect=False):
    """
    From PR backend to dataframe

    :param data: PR response JSON
    :param checkpoint: persist the resulting dataframe to csv when True
    :param verbose: wrap result iteration in a progress bar when True
    :param index: optional column to promote to dataframe index
    :param detect: selects the detection checkpoint file name when True
    :return: dataframe
    """
    if not data:
        logger.error('[{}] : [ERROR] PR query response is empty, exiting.'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        sys.exit(2)
    df = pd.DataFrame()
    df_time = pd.DataFrame()
    results = data['data']['result']
    dr = tqdm.tqdm(results) if verbose else results
    for el in dr:
        # one column per metric/instance pair, plus a shadow column for its timestamps
        new_metric = "{}_{}".format(el['metric']['__name__'], el['metric']['instance'])
        values = el['values']
        df[new_metric] = [val[1] for val in values]
        df_time["time_{}".format(new_metric)] = [val[0] for val in values]
    # average the per-metric timestamps and round up to whole values
    df_time['mean'] = df_time.mean(axis=1)
    df_time['mean'] = df_time['mean'].apply(np.ceil)
    # attach the consolidated time column to the metric dataframe
    df['time'] = df_time['mean']
    logger.info('[{}] : [INFO] PR query resulted in dataframe of size: {}'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), df.shape))
    if index is not None:
        df.set_index(index, inplace=True)
        logger.warning('[{}] : [WARN] PR query dataframe index set to {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), index))
    if checkpoint:
        pr = "pr_data_detect.csv" if detect else "pr_data.csv"
        pr_csv_loc = os.path.join(self.dataDir, pr)
        df.to_csv(pr_csv_loc, index=True)
        logger.info('[{}] : [INFO] PR query dataframe persisted to {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), self.dataDir))
    return df
def getGT(self, data, gt='target'):
    '''
    Split a dataframe into a feature matrix X and ground truth vector y.

    :param data: -> dataframe containing features and ground truth
    :param gt: -> ground truth column name; when None the last column is used
    :return: -> (X, y)
    '''
    if gt is None:
        # no target column given -> assume the last column holds the labels
        logger.warning('[{}] : [WARN] Ground truth column not defined, fetching last column as target'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        X = data[data.columns[:-1]]
        y = data.iloc[:, -1].values
        return X, y
    logger.info('[{}] : [INFO] Ground truth column set to {} '.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), gt))
    return data.drop([gt], axis=1), data[gt].values
def dask_isolationForest(self, settings, mname, data):
    '''
    Train an IsolationForest using the Dask backend, serialize it and
    append its training-set predictions.

    :param settings: -> settings dictionary
    :param mname: -> name of serialized clusterer
    :param data: -> training dataframe
    :return: -> isolation forest instance
    :example settings: -> {n_estimators:100, max_samples:100, contamination:0.1,
        bootstrap:False, max_features:1.0, n_jobs:1, random_state:None, verbose:0}
    '''
    if not settings:  # covers both None and {} (original redundantly re-tested "is None")
        logger.warning('[{}] : [WARN] No IsolationForest parameters defined using defaults'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        settings = {}
    else:
        for k, v in settings.items():
            logger.info('[{}] : [INFO] IsolationForest parameter {} set to {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
    try:
        clf = IsolationForest(**settings)
    except Exception as inst:
        # log tag fixed from [INFO] to [ERROR]; typo "instanciate" -> "instantiate"
        logger.error('[{}] : [ERROR] Failed to instantiate IsolationForest with {} and {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            type(inst), inst.args))
        sys.exit(1)
    try:
        with joblib.parallel_backend('dask'):
            logger.info('[{}] : [INFO] Using Dask backend for IsolationForest'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
            clf.fit(data)
    except Exception as inst:
        logger.error('[{}] : [ERROR] Failed to fit IsolationForest with {} and {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            type(inst), inst.args))
        sys.exit(1)
    predict = clf.predict(data)
    anoOnly = np.argwhere(predict == -1)  # IsolationForest marks anomalies with -1
    logger.info('[{}] : [INFO] Found {} anomalies in training dataset of shape {}.'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), len(anoOnly), data.shape))
    # level fixed to debug to match the [DEBUG] tag (was logged at info level)
    logger.debug('[{}] : [DEBUG] Predicted Anomaly Array {}'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), predict))
    self.__serializemodel(clf, 'isoforest', mname)
    self.__appendPredictions(method='isoforest', mname=mname, data=data, pred=predict)
    return clf  # returned for consistency with the other training methods
def reinitialize(self):
    '''
    Delete the watcher related ES indices so they can be recreated.

    :return: -> 0 on success, 1 on failure
    '''
    watcher_indices = ['anomalies', '.watches', 'watch_history*']
    try:
        # ignore=404 -> missing indices are not treated as an error
        self.esInstance.indices.delete(index=watcher_indices, ignore=404)
    except Exception as inst:
        logger.warning(
            '[%s] : [WARN] Watcher index reinitialization failed with %s and %s',
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            type(inst), inst.args)
        return 1
    logger.info(
        '[%s] : [INFO] Watcher index reinitialization succesfull!',
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    return 0
def __loadClusterModel(self, method, model):
    '''
    :param method: -> method name
    :param model: -> model name
    :return: -> instance of serialized object, or 0 when no model file is found
    '''
    lmodel = glob.glob(os.path.join(self.modelDir, ("%s_%s.pkl" % (method, model))))
    if not lmodel:
        logger.warning('[%s] : [WARN] No %s model with the name %s found',
                       datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                       method, model)
        return 0
    # with-block fixes the file handle leak of the original pickle.load(open(...))
    # NOTE(review): pickle is only safe here because model files are locally produced
    with open(lmodel[0], "rb") as model_file:
        smodel = pickle.load(model_file)
    logger.info('[%s] : [INFO] Succesfully loaded %s model with the name %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                method, model)
    return smodel
def listMerge(self, lFiles):
    '''
    :param lFiles: -> list of csv file paths or list of dataframes
    :return: merged dataframe (merged on the 'key' column)
    :note: Only use if dataframes have divergent headers
    '''
    dfList = []
    if all(isinstance(x, str) for x in lFiles):
        for f in lFiles:
            if not f:
                logger.warning('[%s] : [WARN] Found empty string instead of abs path ...',
                               datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            try:
                df = pd.read_csv(f)
            except Exception as inst:
                logger.error('[%s] : [ERROR] Cannot load file at %s exiting',
                             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), f)
                sys.exit(1)
            dfList.append(df)
    elif all(isinstance(x, pd.DataFrame) for x in lFiles):
        dfList = lFiles
    else:
        incomp = [type(el) for el in lFiles if not isinstance(el, pd.DataFrame)]
        logger.error('[%s] : [ERROR] Incompatible type detected for merging, cannot merge type %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(incomp))
        # previously fell through with an empty dfList and died inside reduce()
        # with an unrelated TypeError; exit explicitly instead
        sys.exit(1)
    try:
        current = reduce(lambda x, y: pd.merge(x, y, on='key'), dfList)
    except Exception as inst:
        logger.error('[%s] : [ERROR] Merge dataframes exception %s with args %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                     type(inst), inst.args)
        logger.error('[%s] : [ERROR] Merge dataframes exception df list %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), dfList)
        sys.exit(1)
    # current.set_index('key', inplace=True)
    return current
def dask_detect(self, method, model, data):
    '''
    Run anomaly detection on data with a previously serialized model.

    :param method: -> method name used when the model was serialized
    :param model: -> model name
    :param data: -> dataframe to run prediction on
    :return: -> dict with key 'anomalies' holding a list of {'utc', 'hutc'} entries
    '''
    smodel = self.__loadClusterModel(method, model)
    anomaliesList = []
    dpredict = 0  # sentinel: no prediction available
    if smodel:
        if data.shape[0]:
            try:
                logger.info('[{}] : [INFO] Loading predictive model {} '.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                    str(smodel).split('(')[0]))
                for k, v in smodel.get_params().items():
                    logger.info('[{}] : [INFO] Predict model parameter {} set to {}'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
                dpredict = smodel.predict(data)
            except Exception as inst:
                logger.error('[{}] : [ERROR] Failed to load predictive model with {} and {}'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                    type(inst), inst.args))
                dpredict = 0
        else:
            logger.warning('[{}] : [WARN] DataFrame is empty with shape {} '.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(data.shape)))
    if not isinstance(dpredict, int):
        # rows predicted as -1 are anomalies; report them by (epoch, human) timestamp
        for an in np.argwhere(dpredict == -1):
            utc = int(data.iloc[an[0]].name)
            anomaliesList.append({'utc': utc, 'hutc': ut2hum(utc)})
    anomaliesDict = {'anomalies': anomaliesList}
    logger.info('[{}] : [INFO] Detected {} anomalies with model {} using method {} '.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
        len(anomaliesList), model, str(smodel).split('(')[0]))
    return anomaliesDict
qlte = 1477914720000 qsize = 0 qinterval = "10s" dmonEndpoint = '85.120.206.27' dmonConnector = Connector(dmonEndpoint) qConstructor = QueryConstructor() dformat = DataFormatter(dataDir) nodeList = dmonConnector.getNodeList() interval = dmonConnector.getInterval() if int(qinterval[:-1]) < interval['System']: logger.warning( '[%s] : [WARN] System Interval smaller than set interval!', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) # per slave unique process name list nodeProcessReduce = {} nodeProcessMap = {} # Get host based metrics for node in nodeList: # Query and file string load, load_file = qConstructor.loadString(node) memory, memory_file = qConstructor.memoryString(node) interface, interface_file = qConstructor.interfaceString(node) packet, packet_file = qConstructor.packetString(node) nodeManager, nodeManager_file = qConstructor.nodeManagerString(node) jvmNodeManager, jvmNodeManager_file = qConstructor.jvmnodeManagerString(
def computeOnColumns(self, df, operations, remove_filtered=True):
    '''
    Augment a dataframe with per-row STD/Mean/Median aggregate columns and
    user defined operation methods.

    :param df: -> dataframe to augment
    :param operations: -> dict possibly containing 'STD', 'Mean', 'Median'
        (each a list of {new_column_name: [source columns]} mappings)
        and 'Method' (user defined operations)
    :param remove_filtered: -> if True drop the source columns used for aggregation
    :return: -> augmented dataframe
    '''
    if operations:
        std = operations.get('STD')
        mean = operations.get('Mean')
        median = operations.get('Median')
        all_processed_columns = []
        # each aggregation: filter source columns, compute row-wise, store as a new column
        if std is not None:
            for cl_std in std:
                for ncol_n, fcol_n in cl_std.items():
                    df_std = self.filterColumns(df, lColumns=fcol_n)
                    logger.info('[{}] : [INFO] Computing standard deviation {} on columns {}'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), ncol_n, fcol_n))
                    df[ncol_n] = df_std.std(axis=1, skipna=True)
                    all_processed_columns.extend(fcol_n)
        if mean is not None:
            for cl_mean in mean:
                for ncol_n, fcol_n in cl_mean.items():
                    df_mean = self.filterColumns(df, lColumns=fcol_n)
                    logger.info('[{}] : [INFO] Computing mean {} on columns {}'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), ncol_n, fcol_n))
                    df[ncol_n] = df_mean.mean(axis=1, skipna=True)
                    all_processed_columns.extend(fcol_n)
        if median is not None:
            for cl_median in median:
                for ncol_n, fcol_n in cl_median.items():
                    df_median = self.filterColumns(df, lColumns=fcol_n)
                    logger.info('[{}] : [INFO] Computing median {} on columns {}'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), ncol_n, fcol_n))
                    df[ncol_n] = df_median.median(axis=1, skipna=True)
                    all_processed_columns.extend(fcol_n)
        if "Method" in operations:
            df = self.__operationMethod(operations['Method'], data=df)
        if remove_filtered:
            unique_all_processed_columns = list(set(all_processed_columns))
            # format string previously had a single placeholder so the column
            # list argument was silently dropped; also fixed "Droping" typo
            logger.warning('[{}] : [WARN] Dropping columns {} used for computation ...'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                unique_all_processed_columns))
            self.dropColumns(df, unique_all_processed_columns, cp=False)
    else:
        logger.info('[{}] : [INFO] No data operations/augmentations defined'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
    logger.info('[{}] : [INFO] Augmented data shape {}'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), df.shape))
    return df
def __init__(self, prEndpoint=None, esEndpoint=None, dmonPort=5001, MInstancePort=9200,
             index="logstash-*", prKafkaEndpoint=None, prKafkaPort=9092, prKafkaTopic='edetopic'):
    '''
    Initialize the data connector backends (Elasticsearch, PR and Kafka reporter).

    :param prEndpoint: -> PR backend endpoint (optional)
    :param esEndpoint: -> Elasticsearch endpoint (optional)
    :param dmonPort: -> DMon service port
    :param MInstancePort: -> monitoring instance port
    :param index: -> ES index pattern
    :param prKafkaEndpoint: -> Kafka endpoint for anomaly reporting (optional)
    :param prKafkaPort: -> Kafka port
    :param prKafkaTopic: -> Kafka topic for anomaly reports
    '''
    # single assignment (was previously duplicated further down in the body)
    self.dataDir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    if esEndpoint is None:
        self.esInstance = None
    else:
        self.esInstance = Elasticsearch(esEndpoint)
        self.esEndpoint = esEndpoint
        self.dmonPort = dmonPort
        self.esInstanceEndpoint = MInstancePort
        self.myIndex = index
        logger.info(
            '[{}] : [INFO] EDE ES backend Defined at: {} with port {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                esEndpoint, MInstancePort))
    if prEndpoint is not None:
        self.prEndpoint = prEndpoint
        self.MInstancePort = MInstancePort
        logger.info(
            '[{}] : [INFO] EDE PR backend Defined at: {} with port {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                prEndpoint, MInstancePort))
    if prKafkaEndpoint is None:
        self.producer = None
        logger.warning('[{}] : [WARN] EDE Kafka reporter not set'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
    else:
        self.prKafkaTopic = prKafkaTopic
        try:
            self.producer = KafkaProducer(
                value_serializer=lambda v: json.dumps(v).encode('utf-8'),
                bootstrap_servers=["{}:{}".format(prKafkaEndpoint, prKafkaPort)],
                retries=5)
            logger.info(
                '[{}] : [INFO] EDE Kafka reporter initialized to server {}:{}'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                    prKafkaEndpoint, prKafkaPort))
        except Exception as inst:
            logger.error(
                '[{}] : [ERROR] EDE Kafka reporter failed with {} and {}'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                    type(inst), inst.args))
            self.producer = None
def main(argv, cluster, client): dataDir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') modelsDir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models') queryDir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'queries') settings = Dict() settings.esendpoint = None settings.prendpoint = None settings.Dask.SchedulerEndpoint = None # "local" settings.Dask.SchedulerPort = 8787 settings.Dask.EnforceCheck = False settings.prkafkaendpoint = None settings.prkafkaport = 9092 settings.prkafkatopic = "edetopic" settings.augmentation = None # augmentation including scaler and user defined methods settings.detectionscaler = None settings.MPort = 9090 settings.dmonPort = 5001 settings.index = "logstash-*" settings["from"] = None settings.to = None settings.query = None settings.nodes = None settings.qsize = None settings.qinterval = None settings.fillna = None settings.dropna = None settings.local = None settings.train = None settings.hpomethod = None settings.tpot = None settings.ParamDistribution = None settings.detecttype = None # TODO settings.traintype = None settings.validationtype = None # Todo settings.target = None settings.load = None settings.file = None settings.method = None settings.detectMethod = None settings.trainMethod = None settings.cv = None settings.trainscore = None settings.scorer = None settings.returnestimators = None settings.analysis = None settings.validate = None settings.export = None settings.trainexport = None settings.detect = None # Bool default None settings.cfilter = None settings.rfilter = None settings.dfilter = None settings.sload = None settings.smemory = None settings.snetwork = None settings.heap = None settings.checkpoint = None settings.delay = None settings.interval = None settings.resetindex = None settings.training = None settings.validation = None settings.validratio = 0.2 settings.compare = False settings.anomalyOnly = False settings.categorical = None settings.point = False # Only 
for testing settings['validate'] = False dask_backend = False try: opts, args = getopt.getopt(argv, "he:tf:m:vx:d:lq:", ["endpoint=", "file=", "method=", "export=", "detect=", "query="]) # todo:expand comand line options except getopt.GetoptError: logger.warning('[%s] : [WARN] Invalid argument received exiting', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) print("ede.py -f <filelocation>, -t -m <method> -v -x <modelname>") sys.exit(0) for opt, arg in opts: if opt == '-h': print("#" * 100) print("H2020 ASPIDE") print('Event Detection Engine') print("-" * 100) print('Utilisation:') print('-f -> configuration file location') print('-t -> activate training mode') print('-m -> methods') print(' -> allowed methods: skm, em, dbscan, sdbscan, isoforest') print('-x -> export model name') print('-v -> validation') print('-q -> query string for anomaly/event detection') print("#" * 100) sys.exit(0) elif opt in ("-e", "--endpoint"): settings['esendpoint'] = arg elif opt in ("-t"): settings["train"] = True elif opt in ("-f", "--file"): settings["file"] = arg elif opt in ("-m", "--method"): settings["method"] = arg elif opt in ("-v"): settings["validate"] = True elif opt in ("-x", "--export"): settings["export"] = arg elif opt in ("-d", "--detect"): settings["detect"] = arg elif opt in ("-l", "--list-models"): print ("Current saved models are:\n") print((getModelList())) sys.exit(0) elif opt in ("-q", "--query"): settings["query"] = arg # print("#" * 100) # print(queryDir) logger.info('[{}] : [INFO] Starting EDE framework ...'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))) logger.info('[{}] : [INFO] Trying to read configuration file ...'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))) if settings["file"] is None: file_conf = 'ede_config.yaml' logger.info('[%s] : [INFO] Settings file set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), file_conf) else: if 
os.path.isfile(settings["file"]): file_conf = settings["file"] logger.info('[%s] : [INFO] Settings file set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), file_conf) else: logger.error('[%s] : [ERROR] Settings file not found at locations %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["file"]) sys.exit(1) readCnf = readConf(file_conf) logger.info('[{}] : [INFO] Reading configuration file ...'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))) # TODO: create def dls(file_conf) # Connector try: logger.info('[{}] : [INFO] Index Name set to : {}'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), readCnf['Connector']['indexname'])) except: logger.warning('[%s] : [WARN] Index not set in conf setting to default value %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['index']) if settings['esendpoint'] is None: try: logger.info('[{}] : [INFO] Monitoring ES Backend endpoint in config {}'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), readCnf['Connector']['ESEndpoint'])) settings['esendpoint'] = readCnf['Connector']['ESEndpoint'] except: if readCnf['Connector']['PREndpoint'] is None: # todo; now only available in config file not in commandline logger.error('[%s] : [ERROR] ES and PR backend Enpoints not set in conf or commandline!', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) sys.exit(1) else: settings['prendpoint'] = readCnf['Connector']['PREndpoint'] logger.info('[{}] : [INFO] Monitoring PR Endpoint set to {}'.format(datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["prendpoint"])) else: logger.info('[%s] : [INFO] ES Backend Enpoint set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['esendpoint']) if settings["from"] is None: try: settings["from"] = readCnf['Connector']['From'] logger.info('[%s] : [INFO] From timestamp set 
to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["from"]) except: logger.info('[{}] : [INFO] PR Backend endpoint set to {}'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['prendpoint'])) if settings['prendpoint'] is not None: logger.info('[{}] : [INFO] PR Backedn endpoint set to {}'.format(datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['prendpoint'])) else: logger.error('[%s] : [ERROR] From timestamp not set in conf or commandline!', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) sys.exit(1) else: logger.info('[%s] : [INFO] From timestamp set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['from']) if settings["to"] is None: try: settings["to"] = readCnf['Connector']['to'] logger.info('[%s] : [INFO] To timestamp set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["to"]) except: if settings['prendpoint'] is not None: pass else: logger.error('[%s] : [ERROR] To timestamp not set in conf or commandline!', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) sys.exit(1) else: logger.info('[%s] : [INFO] To timestamp set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['to']) if settings['query'] is None: try: settings['query'] = readCnf['Connector']['Query'] logger.info('[%s] : [INFO] Query set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['query']) except: if settings['prendpoint'] is not None: pass logger.error('[%s] : [ERROR] Query not set in conf or commandline!', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) sys.exit(1) else: logger.info('[%s] : [INFO] Query set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['query']) if settings.prkafkaendpoint is None: try: settings.prkafkaendpoint = readCnf['Connector']['KafkaEndpoint'] if 
settings.prkafkaendpoint == 'None': settings.prkafkaendpoint = None else: settings.prkafkatopic = readCnf['Connector']['KafkaTopic'] settings.prkafkaport = readCnf['Connector']['KafkaPort'] logger.info('[{}] : [INFO] Kafka Endpoint set to {}'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings.prkafkaendpoint)) except: logger.warning('[{}] : [WARN] Kafka Endpoint not set.'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings.prkafkaendpoint)) if settings["nodes"] is None: try: if not readCnf['Connector']['nodes']: readCnf['Connector']['nodes'] = 0 settings["nodes"] = readCnf['Connector']['nodes'] logger.info('[%s] : [INFO] Desired nodes set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['nodes']) except: logger.warning('[%s] : [WARN] No nodes selected from config file or comandline querying all', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) settings["nodes"] = 0 else: logger.info('[%s] : [INFO] Desired nodes set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["nodes"]) if settings["qsize"] is None: try: settings["qsize"] = readCnf['Connector']['QSize'] logger.info('[%s] : [INFO] Query size set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['qsize']) except: logger.warning('[%s] : [WARN] Query size not set in conf or commandline setting to default', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) settings["qsize"] = 'default' else: logger.info('[%s] : [INFO] Query size set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["qsize"]) if settings["qinterval"] is None: try: settings["qinterval"] = readCnf['Connector']['MetricsInterval'] logger.info('[%s] : [INFO] Metric Interval set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['qinterval']) except: logger.warning('[%s] : [WARN] Metric 
Interval not set in conf or commandline setting to default', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) settings["qsize"] = "default" else: logger.info('[%s] : [INFO] Metric interval set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["qinterval"]) if readCnf['Connector']['Dask']: try: settings['Dask']['SchedulerEndpoint'] = readCnf['Connector']['Dask']['SchedulerEndpoint'] settings['Dask']['SchedulerPort'] = readCnf['Connector']['Dask']['SchedulerPort'] settings['Dask']['EnforceCheck'] = readCnf['Connector']['Dask']['EnforceCheck'] logger.info('[{}] : [INFO] Dask scheduler set to: endpoint {}, port {}, check {}'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['Dask']['SchedulerEndpoint'], settings['Dask']['SchedulerPort'], settings['Dask']['EnforceCheck'])) dask_backend = True except: logger.warning('[{}] : [WARN] Dask scheduler set to default values'.format(datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))) dask_backend = False if settings['local'] is None: try: settings['local'] = readCnf['Connector']['Local'] logger.info('[{}] : [INFO] Local datasource set to {}'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['local'])) except: logger.info('[{}] : [INFO] Local datasource set to default'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))) settings['local'] = None else: logger.info('[{}] : [INFO] Local datasource set to {}'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['local'])) # Mode if settings["train"] is None: try: settings["train"] = readCnf['Mode']['Training'] logger.info('[%s] : [INFO] Train is set to %s from conf', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['train']) except: logger.error('[%s] : [ERROR] Train is not set in conf or comandline!', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) 
sys.exit(1) else: logger.info('[%s] : [INFO] Train is set to %s from comandline', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['train']) # Analysis if settings.analysis is None: try: logger.info('[{}] : [INFO] Loading user defined analysis'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))) settings.analysis = readCnf['Analysis'] except: logger.info('[{}] : [INFO] No user defined analysis detected'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))) # Validate if settings["validate"] is None: try: settings["validate"] = readCnf['Mode']['Validate'] logger.info('[%s] : [INFO] Validate is set to %s from conf', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['validate']) except: logger.error('[%s] : [ERROR] Validate is not set in conf or comandline!', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) sys.exit(1) else: logger.info('[%s] : [INFO] Validate is set to %s from comandline', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['validate']) # Detect if settings["detect"] is None: try: settings["detect"] = readCnf['Mode']['Detect'] logger.info('[%s] : [INFO] Detect is set to %s from conf', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['detect']) except: logger.error('[%s] : [ERROR] Detect is not set in conf or comandline!', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) sys.exit(1) else: logger.info('[%s] : [INFO] Detect is set to %s from comandline', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['detect']) if settings["detectMethod"] is None: try: settings["detectMethod"] = readCnf['Detect']['Method'] logger.info('[%s] : [INFO] Detect Method is set to %s from conf', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["detectMethod"]) except: logger.error('[%s] : [ERROR] Detect Method is not set in conf or 
comandline!', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) sys.exit(1) else: logger.info('[%s] : [INFO] Detect Method is set to %s from comandline', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["detectMethod"]) if settings["detecttype"] is None: try: settings["detecttype"] = readCnf['Detect']['Type'] logger.info('[{}] : [INFO] Detect Type is set to {} from conf'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["detecttype"])) except: logger.error('[%s] : [ERROR] Detect Type is not set in conf or command line!', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) sys.exit(1) else: logger.info('[%s] : [INFO] Detect Type is set to %s from command line', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["detecttype"]) if settings["trainMethod"] is None: try: settings["trainMethod"] = readCnf['Training']['Method'] logger.info('[%s] : [INFO] Train Method is set to %s from conf', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["trainMethod"]) except: try: settings['Training']['TPOTParam'] except: logger.error('[%s] : [ERROR] Train Method is not set in conf or comandline!', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) sys.exit(1) else: logger.info('[%s] : [INFO] Train Method is set to %s from comandline', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["trainMethod"]) if settings["traintype"] is None: try: settings["traintype"] = readCnf['Training']['Type'] logger.info('[%s] : [INFO] Train Type is set to %s from conf', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["traintype"]) except: logger.error('[%s] : [ERROR] Train Type is not set in conf or command line!', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) sys.exit(1) else: logger.info('[%s] : [INFO] Train Type is set to %s from command line', 
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["traintype"]) if settings.target is None: try: settings.target = readCnf['Training']['Target'] logger.info('[{}] : [INFO] Classification Target set to {}'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings.target)) except: if settings['traintype'] == 'classification': logger.warning('[{}] : [WARN] Classification Target not set in config'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings.target)) else: pass if settings.hpomethod is None: try: settings.hpomethod = readCnf['Training']['HPOMethod'] logger.info('[{}] : [INFO] HPO method set to {}'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings.hpomethod)) try: settings.hpoparam = readCnf['Training']['HPOParam'] for k, v in readCnf['Training']['HPOParam'].items(): logger.info('[{}] : [INFO] HPO Method {} Param {} set to {}'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings.hpomethod, k, v)) except: logger.warn('[{}] : [WARN] HPO Method Params set to default!'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))) settings.hpoparam = {} except: if readCnf['Training']['Type'] == 'hpo': logger.error('[{}] : [ERROR] HPO invoked without method! Exiting'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings.hpomethod)) sys.exit(1) else: pass if settings.ParamDistribution is None: try: settings.ParamDistribution = readCnf['Training']['ParamDistribution'] logger.info('[{}] : [INFO] HPO Parameter Distribution found.'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))) except: if readCnf['Training']['Type'] == 'hpo': logger.error('[{}] : [ERROR] HPO invoked without Parameter distribution! 
Exiting'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings.hpomethod)) sys.exit(1) else: pass if settings.tpot is None: try: settings.tpot = readCnf['Training']['TPOTParam'] logger.info('[{}] : [INFO] TPO Parameters found.'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))) except: try: if readCnf['Training']['Type'] == 'tpot': settings.tpot = {} logger.warning('[{}] : [WARN] TPO Parameters not found. Using defaults'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))) else: pass except: pass if settings["export"] is None: try: settings["export"] = readCnf['Training']['Export'] logger.info('[%s] : [INFO] Export is set to %s from conf', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["export"]) except: logger.error('[%s] : [ERROR] Export is not set in conf or comandline!', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) sys.exit(1) else: logger.info('[%s] : [INFO] Model is set to %s from comandline', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["export"]) if settings.cv is None: try: settings.cv = readCnf['Training']['CV'] try: logger.info('[{}] : [INFO] Cross Validation set to {}'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['cv']['Type'])) except: logger.info('[{}] : [INFO] Cross Validation set to {}'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['cv'])) try: settings['cv'] = int(settings['cv']) except: logger.error('[{}] : [ERROR] Issues with CV definition in Training!'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))) sys.exit(1) except: logger.info('[{}] : [INFO] Cross Validation not defined'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))) if settings.trainscore is None: try: settings.trainscore = readCnf['Training']['TrainScore'] logger.info('[{}] : [INFO] Cross Validation 
set to include training scores'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))) except: settings.trainscore = False if settings.scorer is None: try: settings.scorer = readCnf['Training']['Scorers'] logger.info('[{}] : [INFO] Training scorers defined'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))) except: logger.info('[{}] : [INFO] No Training scorers defined'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))) if settings.returnestimators is None: try: settings.returnestimators = readCnf['Training']['ReturnEstimators'] logger.info('[{}] : [INFO] CV Estimators will be saved'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))) except: settings.returnestimators = False if settings["load"] is None: try: settings["load"] = readCnf['Detect']['Load'] logger.info('[%s] : [INFO] Load is set to %s from conf', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["load"]) except: logger.error('[%s] : [ERROR] Load is not set in conf or comandline!', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) sys.exit(1) else: logger.info('[%s] : [INFO] Load is set to %s from comandline', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["load"]) if settings.detectionscaler is None: try: settings.detectionscaler = readCnf['Detect']['Scaler'] logger.info('[{}] : [INFO] Detection Scaler set to {}'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings.detectionscaler)) except: settings.detectionscaler = None logger.warning('[{}] : [WARN] Detection scaler not specified'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))) try: settings['MethodSettings'] = {} #todo read settings from commandline ? 
for name, value in readCnf['Training']['MethodSettings'].items(): # print("%s -> %s" % (name, value)) settings['MethodSettings'][name] = value except: settings['MethodSettings'] = None logger.warning('[%s] : [WARN] No Method settings detected, using defaults for %s!', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["method"]) # Augmentation try: settings['augmentation'] = readCnf['Augmentation'] logger.info('[%s] : [INFO] Augmentations loaded', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) except: settings['augmentation'] = None logger.info('[%s] : [INFO] Augmentations not defined', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) # Point anomaly settings try: settings["smemory"] = readCnf['Point']['memory'] logger.info('[%s] : [INFO] System memory is set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["smemory"]) except: settings["smemory"] = "default" logger.warning('[%s] : [WARN] System memory is not set, using default!', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) try: settings["sload"] = readCnf['Point']['load'] logger.info('[%s] : [INFO] System load is set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["sload"]) except: settings["sload"] = "default" logger.warning('[%s] : [WARN] System load is not set, using default!', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) try: settings["snetwork"] = readCnf['Point']['network'] logger.info('[%s] : [INFO] System netowrk is set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["snetwork"]) except: settings["snetwork"] = "default" logger.warning('[%s] : [WARN] System network is not set, using default!', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) try: settings['heap'] = readCnf['Misc']['heap'] logger.info('[%s] : [INFO] Heap size set to %s', 
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['heap']) except: settings['heap'] = '512m' logger.info('[%s] : [INFO] Heap size set to default %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['heap']) # Filter try: if readCnf['Filter']['Columns']: logger.info('[{}] : [INFO] Filter columns set in config as {}.'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), readCnf['Filter']['Columns'])) settings["cfilter"] = readCnf['Filter']['columns'] else: logger.info('[{}] : [INFO] Filter columns set in config as {}.'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["cfilter"])) except: pass finally: logger.info('[%s] : [INFO] Filter column set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['cfilter']) try: # logger.info('[%s] : [INFO] Filter rows set to %s', # datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), readCnf['Filter']['Rows']) settings["rfilter"] = readCnf['Filter']['Rows'] except: pass # logger.info('[%s] : [INFO] Filter rows %s', # datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["rfilter"]) finally: logger.info('[%s] : [INFO] Filter rows set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['rfilter']) try: if readCnf['Filter']['DColumns']: # print("Filter drop columns -> %s" % readCnf['Filter']['DColumns']) settings["dfilter"] = readCnf['Filter']['DColumns'] else: # print("Filter drop columns -> %s" % settings["dfilter"]) pass except: # print("Filter drop columns -> %s" % settings["dfilter"]) pass finally: logger.info('[%s] : [INFO] Filter drop column set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['dfilter']) try: if readCnf['Filter']['Fillna']: settings['fillna'] = readCnf['Filter']['Fillna'] else: settings['fillna'] = False logger.info('[{}] : [INFO] Fill None values set to {}'.format( 
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), readCnf['Filter']['Fillna'])) except: logger.info('[{}] : [INFO] Fill None not set, skipping ...'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))) settings['fillna'] = False try: if readCnf['Filter']['Dropna']: settings['dropna'] = readCnf['Filter']['Dropna'] else: settings['dropna'] = False logger.info('[{}] : [INFO] Drop None values set to {}'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), readCnf['Filter']['Dropna'])) except: logger.info('[{}] : [INFO] Drop None not set, skipping ...'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))) settings['dropna'] = False if settings["checkpoint"] is None: try: settings["checkpoint"] = readCnf['Misc']['checkpoint'] logger.info('[%s] : [INFO] Checkpointing is set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['checkpoint']) except: settings["checkpoint"] = "True" logger.info('[%s] : [INFO] Checkpointing is set to True', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) else: logger.info('[%s] : [INFO] Checkpointing is set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['checkpoint']) if settings["delay"] is None: try: settings["delay"] = readCnf['Misc']['delay'] # logger.info('[%s] : [INFO] Delay is set to %s', # datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['delay']) except: settings["delay"] = "2m" logger.info('[%s] : [INFO] Delay is set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['delay']) else: logger.info('[%s] : [INFO] Delay is set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['delay']) if settings["interval"] is None: try: settings["interval"] = readCnf['Misc']['interval'] logger.info('[%s] : [INFO] Interval is set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d 
%H:%M:%S'), settings['interval']) except: settings["interval"] = "15m" logger.info('[%s] : [INFO] Interval is set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['interval']) else: logger.info('[%s] : [INFO] Interval is set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['interval']) if settings["resetindex"] is None: try: settings["resetindex"] = readCnf['Misc']['resetindex'] except: settings["resetindex"] = False else: logger.info('[%s] : [INFO] Reset index set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['resetindex']) try: settings['dmonPort'] = readCnf['Connector']['dmonport'] logger.info('[{}] : [INFO] DMon Port is set to {}'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['dmonPort'])) except: logger.info('[%s] : [INFO] DMon Port is set to %s"', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(settings['dmonPort'])) try: settings['training'] = readCnf['Detect']['training'] logger.info('[{}] : [INFO] Classification Training set is {}'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), readCnf['Detect']['training'])) except: logger.info('[%s] : [INFO] Classification Training set is %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(settings['training'])) # try: # print("Classification Validation set is %s" % readCnf['Detect']['validation']) # settings['validation'] = readCnf['Detect']['validation'] # except: # print("Classification Validation set is default") # logger.info('[%s] : [INFO] Classification Validation set is %s', # datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(settings['validation'])) try: # print("Classification validation ratio is set to %d" % int(readCnf['Training']['ValidRatio'])) logger.info('[{}] : [INFO] Classification validation ratio is set to {}'.format( 
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), readCnf['Training']['ValidRatio'])) if float(readCnf['Training']['ValidRatio']) > 1.0: # print("Validation ratio is out of range, must be between 1.0 and 0.1") settings['validratio'] = 0.0 logger.warning('[{}] : [WARN] Validation ratio is out of range, must be between 1.0 and 0.1, overwritting'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), readCnf['Training']['ValidRatio'])) settings['validratio'] = float(readCnf['Detect']['validratio']) except: logger.warning('[{}] : [WARN] Validation ratio is set to default'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))) logger.info('[%s] : [INFO] Classification Validation ratio is %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(settings['validratio'])) # try: # print("Classification comparison is set to %s" % readCnf['Detect']['compare']) # settings['compare'] = readCnf['Detect']['compare'] # except: # print("Classification comparison is default") # logger.info('[%s] : [INFO] Classification comparison is %s', # datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['compare']) try: # print("Classification data generation using only anomalies set to %s" % readCnf['Detect']['anomalyOnly']) settings['anomalyOnly'] = readCnf['Detect']['anomalyOnly'] except: # print("Classification data generation using only anomalies set to False") pass logger.info('[%s] : [INFO] Classification data generation using only anomalies set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(settings['anomalyOnly'])) if settings["categorical"] is None: try: if not readCnf['Augmentation']['Categorical']: readCnf['Augmentation']['Categorical'] = None logger.info('[{}] : [INFO] Categorical columns defined as: {}'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), readCnf['Augmentation']['Categorical'])) if 
readCnf['Augmentation']['Categorical'] == '0': settings["categorical"] = None else: settings["categorical"] = readCnf['Augmentation']['Categorical'] logger.info('[%s] : [INFO] Categorical Features -> %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['categorical']) except: logger.warning('[%s] : [WARN] No Categorical Features selected from config file or comandline! Skipping encoding', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) settings["categorical"] = None else: logger.info('[%s] : [INFO] Categorical Features -> %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["categorical"]) if not settings["point"]: try: settings['point'] = readCnf['Misc']['point'] logger.info('[%s] : [INFO] Point set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['point']) except: settings['point'] = 'False' logger.info('[%s] : [INFO] Point detection set to default %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['point']) #print dmonC # sys.exit() # print("Conf file -> %s" %readCnf) # print("Settings -> %s" %settings) engine = aspideedengine.EDEngine(settings, dataDir=dataDir, modelsDir=modelsDir, queryDir=queryDir) #engine.printTest() engine.initConnector() if dask_backend: engine.runDask(engine) else: try: engine.runProcess(engine) except Exception as inst: logger.error('[{}] : [ERROR] Failed Process backend initialization with {} and {}'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)) logger.warning('[{}] : [WARN] Initializing default threaded engine, limited performance to be expected!'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)) engine.run(engine) logger.info('[{}] : [INFO] Exiting EDE framework'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
def detect(self, method, model, data):
    '''
    Detect anomalies in *data* using a previously trained clusterer.

    :param method: -> method name (used to locate the serialized model)
    :param model: -> trained clusterer model name
    :param data: -> dataframe with data; the index is used as the UTC
                    timestamp of each datapoint (assumption carried over
                    from the original code -- confirm against callers)
    :return: -> dictionary that contains the list of anomalous timestamps
    '''
    smodel = self.__loadClusterModel(method, model)
    anomalieslist = []
    if not smodel:
        # Model failed to load; sentinel 0 skips the anomaly extraction below.
        dpredict = 0
    else:
        if data.shape[0]:
            if isinstance(smodel, IsolationForest):
                # BUGFIX: the original called .format() on logger.info()'s
                # return value (None), raising AttributeError; the format
                # call now wraps the message string itself.
                logger.info('[{}] : [INFO] Loading predictive model IsolationForest '.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                for k, v in smodel.get_params().items():
                    logger.info('[{}] : [INFO] Predict model parameter {} set to {}'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
                try:
                    dpredict = smodel.predict(data)
                    # BUGFIX: same misplaced .format() as above, on logger.debug().
                    logger.debug('[{}] : [DEBUG] IsolationForest prediction array: {}'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(dpredict)))
                except Exception as inst:
                    logger.error('[%s] : [ERROR] Error while fitting isolationforest model to event with %s and %s',
                                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                                 type(inst), inst.args)
                    dpredict = 0
            elif isinstance(smodel, DBSCAN):
                # BUGFIX: misplaced .format() on logger.info(), as above.
                logger.info('[{}] : [INFO] Loading predictive model DBSCAN '.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                for k, v in smodel.get_params().items():
                    logger.info('[{}] : [INFO] Predict model parameter {} set to {}'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
                try:
                    # DBSCAN has no predict(); labels come from fit_predict.
                    dpredict = smodel.fit_predict(data)
                except Exception as inst:
                    logger.error('[%s] : [ERROR] Error while fitting sDBSCAN model to event with %s and %s',
                                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                                 type(inst), inst.args)
                    dpredict = 0
            else:
                # BUGFIX: unknown model types previously left dpredict
                # unassigned, causing a NameError below.
                dpredict = 0
                logger.warning('[%s] : [WARN] Unsupported model type %s for method %s',
                               datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                               type(smodel), method)
        else:
            dpredict = 0
            logger.warning('[%s] : [WARN] Dataframe empty with shape (%s,%s)',
                           datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                           str(data.shape[0]), str(data.shape[1]))
            print("Empty dataframe received with shape (%s,%s)" % (str(data.shape[0]), str(data.shape[1])))
    print("dpredict type is %s" % (type(dpredict)))
    if type(dpredict) is not int:
        # sklearn outlier detectors label anomalies as -1.
        anomalyarray = np.argwhere(dpredict == -1)
        for an in anomalyarray:
            anomalies = {}
            anomalies['utc'] = int(data.iloc[an[0]].name)
            anomalies['hutc'] = ut2hum(int(data.iloc[an[0]].name))
            anomalieslist.append(anomalies)
    anomaliesDict = {}
    anomaliesDict['anomalies'] = anomalieslist
    logger.info('[%s] : [INFO] Detected anomalies with model %s using method %s are -> %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                model, method, str(anomaliesDict))
    return anomaliesDict
def dask_clusterMethod(self, cluster_method, mname, data):
    """Fit a user-defined clustering method, preferring the Dask backend.

    Fitting is first attempted under joblib's Dask parallel backend; on
    failure it falls back to the default (process based) backend. The
    fitted estimator is serialized and diagnostic plots are produced
    before returning.

    :param cluster_method: scikit-learn style estimator instance
    :param mname: model name used when serializing the fitted estimator
    :param data: dataframe holding the training data
    :return: the fitted estimator
    """
    def ts():
        # Timestamp prefix shared by all log messages in this method.
        return datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')

    try:
        logger.info('[{}] : [INFO] Loading Clustering method {}'.format(
            ts(), type(cluster_method)))
        for param_name, param_value in cluster_method.get_params().items():
            logger.info('[{}] : [INFO] Method parameter {} set to {}'.format(
                ts(), param_name, param_value))
        try:
            with joblib.parallel_backend('dask'):
                logger.info(
                    '[{}] : [INFO] Using Dask backend for user defined method'.format(ts()))
                fitted = cluster_method.fit(data)
        except Exception as inst:
            logger.error(
                '[{}] : [ERROR] Failed to fit user defined method with dask backend with {} and {}'.format(
                    ts(), type(inst), inst.args))
            logger.warning(
                '[{}] : [WARN] using default process based backend for user defined method'.format(ts()))
            fitted = cluster_method.fit(data)
    except Exception as inst:
        logger.error('[{}] : [ERROR] Failed to fit {} with {} and {}'.format(
            ts(), type(cluster_method), type(inst), inst.args))
        sys.exit(1)

    pred = fitted.predict(data)
    # Estimators emitting {0, 1} labels mark anomalies as 1; estimators
    # emitting the sklearn-style {-1, 1} labels mark anomalies as -1.
    if list(np.unique(pred)) == [0, 1]:
        anomaly_marker, normal_marker = 1, 0
    else:
        anomaly_marker, normal_marker = -1, 1
    logger.info(
        '[{}] : [INFO] Number of Predicted Anomalies {} from a total of {} datapoints.'.format(
            ts(), list(pred).count(anomaly_marker), len(list(pred))))
    logger.debug('[{}] : [DEBUG] Predicted Anomaly Array {}'.format(ts(), pred))

    fname = str(fitted).split('(')[0]
    self.__serializemodel(fitted, fname, mname)
    self.__plot_feature_sep(data, pred, method=fname, mname=mname,
                            anomaly_label=anomaly_marker, normal_label=normal_marker)
    self.__decision_boundary(fitted, data, method=fname, mname=mname,
                             anomaly_label=anomaly_marker)
    return fitted
def dask_detect(
        self,
        method,
        model,
        data,
        anomaly_label=-1  # Todo make anomaly label user definable
):
    """Detect anomalies in *data* with a previously trained model.

    Loads the serialized model named by *method*/*model*, runs prediction
    on *data*, and collects the timestamps of anomalous rows. When
    ``self.pred_analysis`` is set and anomalies exist, a SHAP analysis is
    attached both globally and per anomaly.

    :param method: method name used to locate the serialized model
    :param model: trained model name
    :param data: dataframe with data; row index is used as the UTC
                 timestamp of each datapoint (assumption from usage of
                 ut2hum -- confirm against callers)
    :param anomaly_label: intended anomaly marker. NOTE(review): this
                          argument is currently overwritten below based on
                          the predicted label set / model type, so the
                          passed value has no effect.
    :return: dict with key 'anomalies' (list of {'utc', 'hutc', optional
             'analysis'}) and, when SHAP ran, 'complete_shap_analysis'
    """
    smodel = self.__loadClusterModel(method, model)
    anomaliesList = []
    anomaliesDict = {}
    # Flag: set to 1 once SHAP values were computed for the anomalies.
    shap_values_p = 0
    if not smodel:
        # Model failed to load; sentinel 0 skips anomaly extraction below.
        dpredict = 0
    else:
        if data.shape[0]:
            try:
                logger.info(
                    '[{}] : [INFO] Loading predictive model {} '.format(
                        datetime.fromtimestamp(
                            time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                        str(smodel).split('(')[0]))
                for k, v in smodel.get_params().items():
                    logger.info(
                        '[{}] : [INFO] Predict model parameter {} set to {}'
                        .format(
                            datetime.fromtimestamp(
                                time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                            k, v))
                dpredict = smodel.predict(data)
            except Exception as inst:
                logger.error(
                    '[{}] : [ERROR] Failed to load predictive model with {} and {}'
                    .format(
                        datetime.fromtimestamp(
                            time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                        type(inst), inst.args))
                dpredict = 0
        else:
            dpredict = 0
            logger.warning(
                '[{}] : [WARN] DataFrame is empty with shape {} '.format(
                    datetime.fromtimestamp(
                        time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                    str(data.shape)))
    # Label-convention detection: models emitting {0, 1} labels (and pyod
    # IForest models) mark anomalies as 1; otherwise the sklearn-style -1
    # marker is assumed. This clobbers the anomaly_label parameter.
    if list(np.unique(dpredict)) == [0, 1] or isinstance(
            smodel, pyod.models.iforest.IForest):
        anomaly_label = 1
    else:
        anomaly_label = -1
    # dpredict stays an int (0) only on failure/empty-data paths above.
    if type(dpredict) is not int:
        anomalyArray = np.argwhere(dpredict == anomaly_label)
        if self.pred_analysis and anomalyArray.shape[0]:
            try:
                plot = self.pred_analysis['Plot']
                # print(self.pred_analysis['Plot'])
            except Exception:
                # Plot key absent from the analysis config; skip plotting.
                plot = False
            feature_importance, shap_values = self.__shap_analysis(
                model=smodel, data=data, plot=plot)
            anomaliesDict['complete_shap_analysis'] = feature_importance
            shap_values_p = 1
        count = 0
        for an in anomalyArray:
            anomalies = {}
            anomalies['utc'] = int(data.iloc[an[0]].name)
            anomalies['hutc'] = ut2hum(int(data.iloc[an[0]].name))
            if shap_values_p:
                # count tracks the position of this anomaly in shap_values
                # (per-anomaly force layout), not the row index in data.
                anomalies['analysis'] = self.__shap_force_layout(
                    shap_values=shap_values, instance=count)
            anomaliesList.append(anomalies)
            count += 1
    anomaliesDict['anomalies'] = anomaliesList
    logger.info(
        '[{}] : [INFO] Detected {} anomalies with model {} using method {} '
        .format(
            datetime.fromtimestamp(
                time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            len(anomaliesList), model,
            str(smodel).split('(')[0]))
    return anomaliesDict