def getDmonStatus(self):
    nUrl = "http://%s:%s/dmon/v1/overlord/core/status" % (self.esEndpoint, self.dmonPort)
    logger.info('[%s] : [INFO] dmon get core status url -> %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), nUrl)
    try:
        rdmonStatus = requests.get(nUrl)
    except Exception as inst:
        logger.error('[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
        sys.exit(2)
    return rdmonStatus.json()
def localData(self, data):
    data_loc = os.path.join(self.dataDir, data)
    try:
        df = pd.read_csv(data_loc)
    except Exception as inst:
        logger.error('[{}] : [ERROR] Cannot load local data with {} and {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
        sys.exit(2)
    logger.info('[{}] : [INFO] Loading local data from {} with shape {}'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), data_loc, df.shape))
    return df
def roles(self):
    # self.__check_valid_es()
    nUrl = "http://%s:%s/dmon/v1/overlord/nodes/roles" % (self.esEndpoint, self.dmonPort)
    logger.info('[%s] : [INFO] dmon get roles url -> %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), nUrl)
    try:
        rRoles = requests.get(nUrl)
    except Exception as inst:
        logger.error('[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
        sys.exit(2)
    rData = rRoles.json()
    return rData
def getStormTopology(self):
    nUrl = "http://%s:%s/dmon/v1/overlord/detect/storm" % (self.esEndpoint, self.dmonPort)
    logger.info('[%s] : [INFO] dmon get storm topology url -> %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), nUrl)
    try:
        rStormTopology = requests.get(nUrl)
    except Exception as inst:
        logger.error('[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
        print("Can't connect to dmon at %s port %s" % (self.esEndpoint, self.dmonPort))
        sys.exit(2)
    rData = rStormTopology.json()
    return rData
def __loadClusterModel(self, method, model):
    '''
    :param method: -> method name
    :param model: -> model name
    :return: -> instance of serialized object
    '''
    lmodel = glob.glob(os.path.join(self.modelDir, ("%s_%s.pkl" % (method, model))))
    if not lmodel:
        logger.warning('[%s] : [WARN] No %s model with the name %s found',
                       datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), method, model)
        return 0
    else:
        smodel = pickle.load(open(lmodel[0], "rb"))
        logger.info('[%s] : [INFO] Successfully loaded %s model with the name %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), method, model)
        return smodel
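# Illustrative sketch (names are hypothetical): serialized models are resolved under modelDir
# as <method>_<model>.pkl, so a model trained with method 'isoforest' and exported as
# 'cluster1' is expected at isoforest_cluster1.pkl.
#
#   smodel = self.__loadClusterModel('isoforest', 'cluster1')
#   if not smodel:
#       # no serialized model found; training has to be run first
#       ...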
def dask_clusterMethod(self, cluster_method, mname, data):
    try:
        logger.info('[{}] : [INFO] Loading Clustering method {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(cluster_method)))
        # delattr(cluster_method, 'behaviour')
        # del cluster_method.__dict__['behaviour']
        for k, v in cluster_method.get_params().items():
            logger.info('[{}] : [INFO] Method parameter {} set to {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
        try:
            with joblib.parallel_backend('dask'):
                logger.info('[{}] : [INFO] Using Dask backend for user defined method'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                clf = cluster_method.fit(data)
        except Exception as inst:
            logger.error('[{}] : [ERROR] Failed to fit user defined method with Dask backend with {} and {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
            logger.warning('[{}] : [WARN] Using default process based backend for user defined method'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
            clf = cluster_method.fit(data)
    except Exception as inst:
        logger.error('[{}] : [ERROR] Failed to fit {} with {} and {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(cluster_method), type(inst), inst.args))
        sys.exit(1)
    predictions = clf.predict(data)
    logger.debug('[{}] : [DEBUG] Predicted Anomaly Array {}'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), predictions))
    fname = str(clf).split('(')[0]
    self.__serializemodel(clf, fname, mname)
    return clf
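# Usage sketch (caller and export name are hypothetical): any scikit-learn style estimator
# exposing fit()/predict() can be passed in as the user defined method.
#
#   from sklearn.ensemble import IsolationForest
#   clf = engine.dask_clusterMethod(cluster_method=IsolationForest(n_estimators=100),
#                                   mname='example_model',
#                                   data=df)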
def scale(self, data, scaler_type=None, rindex='time'):  # todo, integrate
    if not scaler_type:
        logger.warning('[{}] : [WARN] No data scaling used!'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        return data
    if scaler_type is None:  # note: unreachable, None is already handled by the check above
        scaler_type = {"StandardScaler": {"copy": True, "with_mean": True, "with_std": True}}
        logger.warning('[{}] : [WARN] No user defined scaler, using default'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), scaler_type))
    scaler_name = list(scaler_type.keys())[-1]
    scaler_attr = list(scaler_type.values())[-1]
    logger.info('[{}] : [INFO] Scaler set to {} with parameters {}.'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), scaler_name, scaler_attr))
    try:
        sc_mod = importlib.import_module(self.scaler_mod)
        scaler_instance = getattr(sc_mod, scaler_name)
        scaler = scaler_instance(**scaler_attr)
    except Exception as inst:
        logger.error('[{}] : [ERROR] Error while initializing scaler {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), scaler_name))
        sys.exit(2)
    # Fit and transform data
    logger.info('[{}] : [INFO] Scaling data ...'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
    scaled_data = scaler.fit_transform(data)
    # Transform numpy array into dataframe, re-add columns to scaled numpy array
    df_scaled = pd.DataFrame(scaled_data, columns=data.columns)
    df_scaled[rindex] = list(data.index)
    df_scaled.set_index(rindex, inplace=True)
    scaler_file = '{}.scaler'.format(scaler_name)
    logger.info('[{}] : [INFO] Saving scaler instance {} ...'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), scaler_file))
    scale_file_location = os.path.join(self.dataDir, scaler_file)
    joblib.dump(scaler, filename=scale_file_location)
    return df_scaled
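# Example scaler configuration (sketch; assumes the class name is resolvable through the
# configured scaler module self.scaler_mod, e.g. sklearn.preprocessing):
#
#   scaler_type = {"MinMaxScaler": {"feature_range": (0, 1)}}
#   df_scaled = engine.scale(df, scaler_type=scaler_type, rindex='time')
#
# The scaler class is looked up dynamically, and the fitted instance is persisted as
# <name>.scaler in the data directory so it can be reused at detection time.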
def sdbscanTrain(self, settings, mname, data):
    '''
    :param data: -> dataframe with data
    :param settings: -> settings dictionary
    :param mname: -> name of serialized clusterer
    :return: -> clusterer
    :example settings: -> {eps:0.9, min_samples:10, metric:'euclidean',
        algorithm:'auto', leaf_size:30, p:0.2, n_jobs:1}
    '''
    for k, v in settings.items():
        logger.info('[%s] : [INFO] SDBSCAN %s set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v)
    sdata = StandardScaler().fit_transform(data)
    try:
        db = DBSCAN(eps=float(settings['eps']), min_samples=int(settings['min_samples']), metric=settings['metric'],
                    algorithm=settings['algorithm'], leaf_size=int(settings['leaf_size']), p=float(settings['p']),
                    n_jobs=int(settings['n_jobs'])).fit(sdata)
    except Exception as inst:
        logger.error('[%s] : [ERROR] Cannot instantiate sDBSCAN with %s and %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
        print("Error while instantiating sDBSCAN with %s and %s" % (type(inst), inst.args))
        sys.exit(1)
    labels = db.labels_
    print(labels)
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print('Estimated number of clusters: %d' % n_clusters_)
    self.__serializemodel(db, 'sdbscan', mname)
    return db
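# Example settings (sketch mirroring the docstring; values are illustrative and passed to
# sklearn.cluster.DBSCAN after type conversion):
#
#   settings = {'eps': 0.9, 'min_samples': 10, 'metric': 'euclidean',
#               'algorithm': 'auto', 'leaf_size': 30, 'p': 0.2, 'n_jobs': 1}
#   db = engine.sdbscanTrain(settings=settings, mname='example_dbscan', data=df)
#
# Points labelled -1 by DBSCAN are treated as noise/anomalies.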
def dask_isolationForest(self, settings, mname, data):
    '''
    :param settings: -> settings dictionary
    :param mname: -> name of serialized clusterer
    :param scaler: -> scaler to use on data
    :return: -> isolation forest instance
    :example settings: -> {n_estimators:100, max_samples:100, contamination:0.1, bootstrap:False,
        max_features:1.0, n_jobs:1, random_state:None, verbose:0}
    '''
    if not settings or settings is None:
        logger.warning('[{}] : [WARN] No IsolationForest parameters defined, using defaults'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        # print(settings)
        settings = {}
    else:
        for k, v in settings.items():
            logger.info('[{}] : [INFO] IsolationForest parameter {} set to {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
    try:
        clf = IsolationForest(**settings)
        # print(clf)
    except Exception as inst:
        logger.error('[{}] : [ERROR] Failed to instantiate IsolationForest with {} and {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
        sys.exit(1)
    try:
        with joblib.parallel_backend('dask'):
            logger.info('[{}] : [INFO] Using Dask backend for IsolationForest'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
            clf.fit(data)
    except Exception as inst:
        logger.error('[{}] : [ERROR] Failed to fit IsolationForest with {} and {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
        sys.exit(1)
    predict = clf.predict(data)
    anoOnly = np.argwhere(predict == -1)
    logger.info('[{}] : [INFO] Found {} anomalies in training dataset of shape {}.'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), len(anoOnly), data.shape))
    logger.debug('[{}] : [DEBUG] Predicted Anomaly Array {}'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), predict))
    self.__serializemodel(clf, 'isoforest', mname)
    self.__appendPredictions(method='isoforest', mname=mname, data=data, pred=predict)
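# Example settings (sketch based on the docstring; values are illustrative and are passed
# straight to sklearn.ensemble.IsolationForest):
#
#   settings = {'n_estimators': 100, 'max_samples': 'auto', 'contamination': 0.1,
#               'bootstrap': False, 'max_features': 1.0, 'n_jobs': 1, 'random_state': 42}
#   engine.dask_isolationForest(settings=settings, mname='example_isoforest', data=df)
#
# Training predictions of -1 mark anomalous rows; the fitted model is serialized under the
# 'isoforest' method name, matching the lookup pattern used by __loadClusterModel.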
def filterWildcard(self, df, wild_card, keep=False):
    """
    :param df: dataframe to filter
    :param wild_card: str wildcard of columns to be filtered
    :param keep: if keep True, only cols with wildcard are kept, if False they will be deleted
    :return: filtered dataframe
    """
    filtr_list = []
    mask = df.columns.str.contains(wild_card)
    filtr_list.extend(list(df.loc[:, mask].columns.values))
    logger.info('[%s] : [INFO] Columns to be filtered based on wildcard: %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), filtr_list)
    if keep:
        df_wild = df[filtr_list]
    else:
        df_wild = df.drop(filtr_list, axis=1)
    logger.info('[%s] : [INFO] Filtered shape: %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), df_wild.shape)
    # print("Columns of filtered data:")
    # print(df_concat_filtered.columns)
    return df_wild
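# Usage sketch (column names are hypothetical): keep only columns matching a pattern, or drop
# every column matching it. The wildcard is interpreted through pandas str.contains(), so
# regular expressions are accepted.
#
#   cpu_only = engine.filterWildcard(df, wild_card='cpu_', keep=True)
#   no_swap = engine.filterWildcard(df, wild_card='swap', keep=False)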
def ohEncoding(self, data, cols=None, replace=True):
    if cols is None:
        cols = []
        for el, v in data.dtypes.items():
            if v == 'object':
                if el == 'time':
                    pass
                else:
                    cols.append(el)
        logger.info('[%s] : [INFO] Categorical features not set, detected as categorical: %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(cols))
    logger.info('[{}] : [INFO] Categorical features now set to {}'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(cols)))
    vec = DictVectorizer()
    mkdict = lambda row: dict((col, row[col]) for col in cols)
    vecData = pd.DataFrame(vec.fit_transform(data[cols].apply(mkdict, axis=1)).toarray())
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    if replace is True:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return data, vecData, vec
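# Usage sketch (column names are hypothetical): one-hot encode string/object columns before
# training, keeping the fitted DictVectorizer so the same mapping can be reused later.
#
#   df_enc, vec_df, vectorizer = engine.ohEncoding(df, cols=['node_role', 'service_state'],
#                                                  replace=True)
#
# With cols=None the object-typed columns (except 'time') are detected automatically.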
def dask_sdbscanTrain(self, settings, mname, data, scaler=None):
    '''
    :param data: -> dataframe with data
    :param settings: -> settings dictionary
    :param mname: -> name of serialized clusterer
    :param scaler: -> scaler to use on data
    :return: -> clusterer
    :example settings: -> {eps:0.9, min_samples:10, metric:'euclidean',
        algorithm:'auto', leaf_size:30, p:0.2, n_jobs:1}
    '''
    if scaler is None:
        logger.warning('[{}] : [WARN] Scaler not defined'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
    else:
        logger.info('[{}] : [INFO] Scaling data ...'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        data = scaler.fit_transform(data)
    if not settings or settings is None:
        logger.warning('[{}] : [WARN] No DBScan parameters defined, using defaults'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        settings = {}
    else:
        for k, v in settings.items():
            logger.info('[{}] : [INFO] DBScan parameter {} set to {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
    try:
        db = DBSCAN(**settings).fit(data)
    except Exception as inst:
        logger.error('[{}] : [ERROR] Failed to instantiate DBScan with {} and {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
        sys.exit(1)
    labels = db.labels_
    logger.info('[{}] : [INFO] DBScan labels: {} '.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), labels))
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    logger.info('[{}] : [INFO] DBScan estimated number of clusters {} '.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), n_clusters_))
    self.__serializemodel(db, 'sdbscan', mname)
    return db
def dask_detect(self, method, model, data):
    smodel = self.__loadClusterModel(method, model)
    anomaliesList = []
    if not smodel:
        dpredict = 0
    else:
        if data.shape[0]:
            try:
                logger.info('[{}] : [INFO] Loading predictive model {} '.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(smodel).split('(')[0]))
                for k, v in smodel.get_params().items():
                    logger.info('[{}] : [INFO] Predict model parameter {} set to {}'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
                dpredict = smodel.predict(data)
            except Exception as inst:
                logger.error('[{}] : [ERROR] Failed to load predictive model with {} and {}'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
                dpredict = 0
        else:
            dpredict = 0
            logger.warning('[{}] : [WARN] DataFrame is empty with shape {} '.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(data.shape)))
    if type(dpredict) is not int:
        anomalyArray = np.argwhere(dpredict == -1)
        for an in anomalyArray:
            anomalies = {}
            anomalies['utc'] = int(data.iloc[an[0]].name)
            anomalies['hutc'] = ut2hum(int(data.iloc[an[0]].name))
            anomaliesList.append(anomalies)
    anomaliesDict = {}
    anomaliesDict['anomalies'] = anomaliesList
    logger.info('[{}] : [INFO] Detected {} anomalies with model {} using method {} '.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), len(anomaliesList), model,
        str(smodel).split('(')[0]))
    return anomaliesDict
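# Shape of the returned payload (sketch; values are placeholders): each detected anomaly
# carries the raw index of the offending row plus a human readable form produced by ut2hum().
#
#   {
#       'anomalies': [
#           {'utc': 1579771200, 'hutc': '<human readable timestamp>'},
#           ...
#       ]
#   }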
def main(argv, cluster, client):
    dataDir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    modelsDir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models')
    queryDir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'queries')

    settings = Dict()
    settings.esendpoint = None
    settings.prendpoint = None
    settings.Dask.SchedulerEndpoint = None  # "local"
    settings.Dask.SchedulerPort = 8787
    settings.Dask.EnforceCheck = False
    settings.prkafkaendpoint = None
    settings.prkafkaport = 9092
    settings.prkafkatopic = "edetopic"
    settings.augmentation = None  # augmentation including scaler and user defined methods
    settings.detectionscaler = None
    settings.MPort = 9090
    settings.dmonPort = 5001
    settings.index = "logstash-*"
    settings["from"] = None
    settings.to = None
    settings.query = None
    settings.nodes = None
    settings.qsize = None
    settings.qinterval = None
    settings.fillna = None
    settings.dropna = None
    settings.local = None
    settings.train = None
    settings.hpomethod = None
    settings.tpot = None
    settings.ParamDistribution = None
    settings.detecttype = None  # TODO
    settings.traintype = None
    settings.validationtype = None  # Todo
    settings.target = None
    settings.load = None
    settings.file = None
    settings.method = None
    settings.detectMethod = None
    settings.trainMethod = None
    settings.cv = None
    settings.trainscore = None
    settings.scorer = None
    settings.returnestimators = None
    settings.analysis = None
    settings.validate = None
    settings.export = None
    settings.trainexport = None
    settings.detect = None  # Bool default None
    settings.cfilter = None
    settings.rfilter = None
    settings.dfilter = None
    settings.sload = None
    settings.smemory = None
    settings.snetwork = None
    settings.heap = None
    settings.checkpoint = None
    settings.delay = None
    settings.interval = None
    settings.resetindex = None
    settings.training = None
    settings.validation = None
    settings.validratio = 0.2
    settings.compare = False
    settings.anomalyOnly = False
    settings.categorical = None
    settings.point = False

    # Only for testing
    settings['validate'] = False

    dask_backend = False

    try:
        opts, args = getopt.getopt(argv, "he:tf:m:vx:d:lq:",
                                   ["endpoint=", "file=", "method=", "export=", "detect=",
                                    "query="])  # todo: expand command line options
    except getopt.GetoptError:
        logger.warning('[%s] : [WARN] Invalid argument received exiting',
                       datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
        print("ede.py -f <filelocation>, -t -m <method> -v -x <modelname>")
        sys.exit(0)
    for opt, arg in opts:
        if opt == '-h':
            print("#" * 100)
            print("H2020 ASPIDE")
            print('Event Detection Engine')
            print("-" * 100)
            print('Utilisation:')
            print('-f -> configuration file location')
            print('-t -> activate training mode')
            print('-m -> methods')
            print(' -> allowed methods: skm, em, dbscan, sdbscan, isoforest')
            print('-x -> export model name')
            print('-v -> validation')
            print('-q -> query string for anomaly/event detection')
            print("#" * 100)
            sys.exit(0)
        elif opt in ("-e", "--endpoint"):
            settings['esendpoint'] = arg
        elif opt in ("-t"):
            settings["train"] = True
        elif opt in ("-f", "--file"):
            settings["file"] = arg
        elif opt in ("-m", "--method"):
            settings["method"] = arg
        elif opt in ("-v"):
            settings["validate"] = True
        elif opt in ("-x", "--export"):
            settings["export"] = arg
        elif opt in ("-d", "--detect"):
            settings["detect"] = arg
        elif opt in ("-l", "--list-models"):
            print("Current saved models are:\n")
            print((getModelList()))
            sys.exit(0)
        elif opt in ("-q", "--query"):
            settings["query"] = arg

    # print("#" * 100)
    # print(queryDir)
    logger.info('[{}] : [INFO] Starting EDE framework ...'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
    logger.info('[{}] : [INFO] Trying to read configuration file ...'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
    if settings["file"] is None:
        file_conf = 'ede_config.yaml'
        logger.info('[%s] : [INFO] Settings file set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), file_conf)
    else:
        if os.path.isfile(settings["file"]):
            file_conf = settings["file"]
            logger.info('[%s] : [INFO] Settings file set to %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), file_conf)
        else:
            logger.error('[%s] : [ERROR] Settings file not found at location %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["file"])
            sys.exit(1)
    readCnf = readConf(file_conf)
    logger.info('[{}] : [INFO] Reading configuration file ...'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
    # TODO: create def dls(file_conf)
    # Connector
    try:
        logger.info('[{}] : [INFO] Index Name set to : {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), readCnf['Connector']['indexname']))
    except:
        logger.warning('[%s] : [WARN] Index not set in conf setting to default value %s',
                       datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['index'])
    if settings['esendpoint'] is None:
        try:
            logger.info('[{}] : [INFO] Monitoring ES Backend endpoint in config {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), readCnf['Connector']['ESEndpoint']))
            settings['esendpoint'] = readCnf['Connector']['ESEndpoint']
        except:
            if readCnf['Connector']['PREndpoint'] is None:  # todo; now only available in config file not in commandline
                logger.error('[%s] : [ERROR] ES and PR backend Endpoints not set in conf or commandline!',
                             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
                sys.exit(1)
            else:
                settings['prendpoint'] = readCnf['Connector']['PREndpoint']
                logger.info('[{}] : [INFO] Monitoring PR Endpoint set to {}'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["prendpoint"]))
    else:
        logger.info('[%s] : [INFO] ES Backend Endpoint set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['esendpoint'])
    if settings["from"] is None:
        try:
            settings["from"] = readCnf['Connector']['From']
            logger.info('[%s] : [INFO] From timestamp set to %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["from"])
        except:
            logger.info('[{}] : [INFO] PR Backend endpoint set to {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['prendpoint']))
            if settings['prendpoint'] is not None:
                logger.info('[{}] : [INFO] PR Backend endpoint set to {}'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['prendpoint']))
            else:
                logger.error('[%s] : [ERROR] From timestamp not set in conf or commandline!',
                             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
                sys.exit(1)
    else:
        logger.info('[%s] : [INFO] From timestamp set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['from'])
    if settings["to"] is None:
        try:
            settings["to"] = readCnf['Connector']['to']
            logger.info('[%s] : [INFO] To timestamp set to %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["to"])
        except:
            if settings['prendpoint'] is not None:
                pass
            else:
                logger.error('[%s] : [ERROR] To timestamp not set in conf or commandline!',
                             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
                sys.exit(1)
    else:
        logger.info('[%s] : [INFO] To timestamp set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['to'])
    if settings['query'] is None:
        try:
            settings['query'] = readCnf['Connector']['Query']
            logger.info('[%s] : [INFO] Query set to %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['query'])
        except:
            if settings['prendpoint'] is not None:
                pass
            logger.error('[%s] : [ERROR] Query not set in conf or commandline!',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            sys.exit(1)
    else:
        logger.info('[%s] : [INFO] Query set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['query'])
    if settings.prkafkaendpoint is None:
        try:
            settings.prkafkaendpoint = readCnf['Connector']['KafkaEndpoint']
            if settings.prkafkaendpoint == 'None':
                settings.prkafkaendpoint = None
            else:
                settings.prkafkatopic = readCnf['Connector']['KafkaTopic']
                settings.prkafkaport = readCnf['Connector']['KafkaPort']
            logger.info('[{}] : [INFO] Kafka Endpoint set to {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings.prkafkaendpoint))
        except:
            logger.warning('[{}] : [WARN] Kafka Endpoint not set.'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings.prkafkaendpoint))
    if settings["nodes"] is None:
        try:
            if not readCnf['Connector']['nodes']:
                readCnf['Connector']['nodes'] = 0
            settings["nodes"] = readCnf['Connector']['nodes']
            logger.info('[%s] : [INFO] Desired nodes set to %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['nodes'])
        except:
            logger.warning('[%s] : [WARN] No nodes selected from config file or commandline querying all',
                           datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            settings["nodes"] = 0
    else:
        logger.info('[%s] : [INFO] Desired nodes set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["nodes"])
    if settings["qsize"] is None:
        try:
            settings["qsize"] = readCnf['Connector']['QSize']
            logger.info('[%s] : [INFO] Query size set to %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['qsize'])
        except:
            logger.warning('[%s] : [WARN] Query size not set in conf or commandline setting to default',
                           datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            settings["qsize"] = 'default'
    else:
        logger.info('[%s] : [INFO] Query size set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["qsize"])
    if settings["qinterval"] is None:
        try:
            settings["qinterval"] = readCnf['Connector']['MetricsInterval']
            logger.info('[%s] : [INFO] Metric Interval set to %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['qinterval'])
        except:
            logger.warning('[%s] : [WARN] Metric Interval not set in conf or commandline setting to default',
                           datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            settings["qinterval"] = "default"
    else:
        logger.info('[%s] : [INFO] Metric interval set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["qinterval"])
    if readCnf['Connector']['Dask']:
        try:
            settings['Dask']['SchedulerEndpoint'] = readCnf['Connector']['Dask']['SchedulerEndpoint']
            settings['Dask']['SchedulerPort'] = readCnf['Connector']['Dask']['SchedulerPort']
            settings['Dask']['EnforceCheck'] = readCnf['Connector']['Dask']['EnforceCheck']
            logger.info('[{}] : [INFO] Dask scheduler set to: endpoint {}, port {}, check {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                settings['Dask']['SchedulerEndpoint'], settings['Dask']['SchedulerPort'],
                settings['Dask']['EnforceCheck']))
            dask_backend = True
        except:
            logger.warning('[{}] : [WARN] Dask scheduler set to default values'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
            dask_backend = False
    if settings['local'] is None:
        try:
            settings['local'] = readCnf['Connector']['Local']
            logger.info('[{}] : [INFO] Local datasource set to {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['local']))
        except:
            logger.info('[{}] : [INFO] Local datasource set to default'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
            settings['local'] = None
    else:
        logger.info('[{}] : [INFO] Local datasource set to {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['local']))

    # Mode
    if settings["train"] is None:
        try:
            settings["train"] = readCnf['Mode']['Training']
            logger.info('[%s] : [INFO] Train is set to %s from conf',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['train'])
        except:
            logger.error('[%s] : [ERROR] Train is not set in conf or commandline!',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            sys.exit(1)
    else:
        logger.info('[%s] : [INFO] Train is set to %s from commandline',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['train'])

    # Analysis
    if settings.analysis is None:
        try:
            logger.info('[{}] : [INFO] Loading user defined analysis'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
            settings.analysis = readCnf['Analysis']
        except:
            logger.info('[{}] : [INFO] No user defined analysis detected'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))

    # Validate
    if settings["validate"] is None:
        try:
            settings["validate"] = readCnf['Mode']['Validate']
            logger.info('[%s] : [INFO] Validate is set to %s from conf',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['validate'])
        except:
            logger.error('[%s] : [ERROR] Validate is not set in conf or commandline!',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            sys.exit(1)
    else:
        logger.info('[%s] : [INFO] Validate is set to %s from commandline',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['validate'])

    # Detect
    if settings["detect"] is None:
        try:
            settings["detect"] = readCnf['Mode']['Detect']
            logger.info('[%s] : [INFO] Detect is set to %s from conf',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['detect'])
        except:
            logger.error('[%s] : [ERROR] Detect is not set in conf or commandline!',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            sys.exit(1)
    else:
        logger.info('[%s] : [INFO] Detect is set to %s from commandline',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['detect'])
    if settings["detectMethod"] is None:
        try:
            settings["detectMethod"] = readCnf['Detect']['Method']
            logger.info('[%s] : [INFO] Detect Method is set to %s from conf',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["detectMethod"])
        except:
            logger.error('[%s] : [ERROR] Detect Method is not set in conf or commandline!',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            sys.exit(1)
    else:
        logger.info('[%s] : [INFO] Detect Method is set to %s from commandline',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["detectMethod"])
settings["detectMethod"]) if settings["detecttype"] is None: try: settings["detecttype"] = readCnf['Detect']['Type'] logger.info('[{}] : [INFO] Detect Type is set to {} from conf'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["detecttype"])) except: logger.error('[%s] : [ERROR] Detect Type is not set in conf or command line!', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) sys.exit(1) else: logger.info('[%s] : [INFO] Detect Type is set to %s from command line', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["detecttype"]) if settings["trainMethod"] is None: try: settings["trainMethod"] = readCnf['Training']['Method'] logger.info('[%s] : [INFO] Train Method is set to %s from conf', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["trainMethod"]) except: try: settings['Training']['TPOTParam'] except: logger.error('[%s] : [ERROR] Train Method is not set in conf or comandline!', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) sys.exit(1) else: logger.info('[%s] : [INFO] Train Method is set to %s from comandline', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["trainMethod"]) if settings["traintype"] is None: try: settings["traintype"] = readCnf['Training']['Type'] logger.info('[%s] : [INFO] Train Type is set to %s from conf', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["traintype"]) except: logger.error('[%s] : [ERROR] Train Type is not set in conf or command line!', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) sys.exit(1) else: logger.info('[%s] : [INFO] Train Type is set to %s from command line', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["traintype"]) if settings.target is None: try: settings.target = readCnf['Training']['Target'] logger.info('[{}] : [INFO] Classification Target set to {}'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings.target)) except: if settings['traintype'] == 'classification': logger.warning('[{}] : [WARN] Classification Target not set in config'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings.target)) else: pass if settings.hpomethod is None: try: settings.hpomethod = readCnf['Training']['HPOMethod'] logger.info('[{}] : [INFO] HPO method set to {}'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings.hpomethod)) try: settings.hpoparam = readCnf['Training']['HPOParam'] for k, v in readCnf['Training']['HPOParam'].items(): logger.info('[{}] : [INFO] HPO Method {} Param {} set to {}'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings.hpomethod, k, v)) except: logger.warn('[{}] : [WARN] HPO Method Params set to default!'.format( datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))) settings.hpoparam = {} except: if readCnf['Training']['Type'] == 'hpo': logger.error('[{}] : [ERROR] HPO invoked without method! 
    if settings.ParamDistribution is None:
        try:
            settings.ParamDistribution = readCnf['Training']['ParamDistribution']
            logger.info('[{}] : [INFO] HPO Parameter Distribution found.'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        except:
            if readCnf['Training']['Type'] == 'hpo':
                logger.error('[{}] : [ERROR] HPO invoked without Parameter distribution! Exiting'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings.hpomethod))
                sys.exit(1)
            else:
                pass
    if settings.tpot is None:
        try:
            settings.tpot = readCnf['Training']['TPOTParam']
            logger.info('[{}] : [INFO] TPOT Parameters found.'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        except:
            try:
                if readCnf['Training']['Type'] == 'tpot':
                    settings.tpot = {}
                    logger.warning('[{}] : [WARN] TPOT Parameters not found. Using defaults'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                else:
                    pass
            except:
                pass
    if settings["export"] is None:
        try:
            settings["export"] = readCnf['Training']['Export']
            logger.info('[%s] : [INFO] Export is set to %s from conf',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["export"])
        except:
            logger.error('[%s] : [ERROR] Export is not set in conf or commandline!',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            sys.exit(1)
    else:
        logger.info('[%s] : [INFO] Model is set to %s from commandline',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["export"])
    if settings.cv is None:
        try:
            settings.cv = readCnf['Training']['CV']
            try:
                logger.info('[{}] : [INFO] Cross Validation set to {}'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['cv']['Type']))
            except:
                logger.info('[{}] : [INFO] Cross Validation set to {}'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['cv']))
                try:
                    settings['cv'] = int(settings['cv'])
                except:
                    logger.error('[{}] : [ERROR] Issues with CV definition in Training!'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                    sys.exit(1)
        except:
            logger.info('[{}] : [INFO] Cross Validation not defined'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
    if settings.trainscore is None:
        try:
            settings.trainscore = readCnf['Training']['TrainScore']
            logger.info('[{}] : [INFO] Cross Validation set to include training scores'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        except:
            settings.trainscore = False
    if settings.scorer is None:
        try:
            settings.scorer = readCnf['Training']['Scorers']
            logger.info('[{}] : [INFO] Training scorers defined'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        except:
            logger.info('[{}] : [INFO] No Training scorers defined'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
    if settings.returnestimators is None:
        try:
            settings.returnestimators = readCnf['Training']['ReturnEstimators']
            logger.info('[{}] : [INFO] CV Estimators will be saved'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        except:
            settings.returnestimators = False
    if settings["load"] is None:
        try:
            settings["load"] = readCnf['Detect']['Load']
            logger.info('[%s] : [INFO] Load is set to %s from conf',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["load"])
        except:
            logger.error('[%s] : [ERROR] Load is not set in conf or commandline!',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            sys.exit(1)
    else:
        logger.info('[%s] : [INFO] Load is set to %s from commandline',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["load"])
    if settings.detectionscaler is None:
        try:
            settings.detectionscaler = readCnf['Detect']['Scaler']
            logger.info('[{}] : [INFO] Detection Scaler set to {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings.detectionscaler))
        except:
            settings.detectionscaler = None
            logger.warning('[{}] : [WARN] Detection scaler not specified'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
    try:
        settings['MethodSettings'] = {}  # todo read settings from commandline ?
        for name, value in readCnf['Training']['MethodSettings'].items():
            # print("%s -> %s" % (name, value))
            settings['MethodSettings'][name] = value
    except:
        settings['MethodSettings'] = None
        logger.warning('[%s] : [WARN] No Method settings detected, using defaults for %s!',
                       datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["method"])

    # Augmentation
    try:
        settings['augmentation'] = readCnf['Augmentation']
        logger.info('[%s] : [INFO] Augmentations loaded',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    except:
        settings['augmentation'] = None
        logger.info('[%s] : [INFO] Augmentations not defined',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))

    # Point anomaly settings
    try:
        settings["smemory"] = readCnf['Point']['memory']
        logger.info('[%s] : [INFO] System memory is set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["smemory"])
    except:
        settings["smemory"] = "default"
        logger.warning('[%s] : [WARN] System memory is not set, using default!',
                       datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    try:
        settings["sload"] = readCnf['Point']['load']
        logger.info('[%s] : [INFO] System load is set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["sload"])
    except:
        settings["sload"] = "default"
        logger.warning('[%s] : [WARN] System load is not set, using default!',
                       datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    try:
        settings["snetwork"] = readCnf['Point']['network']
        logger.info('[%s] : [INFO] System network is set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["snetwork"])
    except:
        settings["snetwork"] = "default"
        logger.warning('[%s] : [WARN] System network is not set, using default!',
                       datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    try:
        settings['heap'] = readCnf['Misc']['heap']
        logger.info('[%s] : [INFO] Heap size set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['heap'])
    except:
        settings['heap'] = '512m'
        logger.info('[%s] : [INFO] Heap size set to default %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['heap'])

    # Filter
    try:
        if readCnf['Filter']['Columns']:
            logger.info('[{}] : [INFO] Filter columns set in config as {}.'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), readCnf['Filter']['Columns']))
            settings["cfilter"] = readCnf['Filter']['Columns']
        else:
            logger.info('[{}] : [INFO] Filter columns set in config as {}.'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["cfilter"]))
    except:
        pass
    finally:
        logger.info('[%s] : [INFO] Filter column set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['cfilter'])
    try:
        # logger.info('[%s] : [INFO] Filter rows set to %s',
        #             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), readCnf['Filter']['Rows'])
        settings["rfilter"] = readCnf['Filter']['Rows']
    except:
        pass
        # logger.info('[%s] : [INFO] Filter rows %s',
        #             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["rfilter"])
    finally:
        logger.info('[%s] : [INFO] Filter rows set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['rfilter'])
    try:
        if readCnf['Filter']['DColumns']:
            # print("Filter drop columns -> %s" % readCnf['Filter']['DColumns'])
            settings["dfilter"] = readCnf['Filter']['DColumns']
        else:
            # print("Filter drop columns -> %s" % settings["dfilter"])
            pass
    except:
        # print("Filter drop columns -> %s" % settings["dfilter"])
        pass
    finally:
        logger.info('[%s] : [INFO] Filter drop column set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['dfilter'])
    try:
        if readCnf['Filter']['Fillna']:
            settings['fillna'] = readCnf['Filter']['Fillna']
        else:
            settings['fillna'] = False
        logger.info('[{}] : [INFO] Fill None values set to {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), readCnf['Filter']['Fillna']))
    except:
        logger.info('[{}] : [INFO] Fill None not set, skipping ...'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        settings['fillna'] = False
    try:
        if readCnf['Filter']['Dropna']:
            settings['dropna'] = readCnf['Filter']['Dropna']
        else:
            settings['dropna'] = False
        logger.info('[{}] : [INFO] Drop None values set to {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), readCnf['Filter']['Dropna']))
    except:
        logger.info('[{}] : [INFO] Drop None not set, skipping ...'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        settings['dropna'] = False
    if settings["checkpoint"] is None:
        try:
            settings["checkpoint"] = readCnf['Misc']['checkpoint']
            logger.info('[%s] : [INFO] Checkpointing is set to %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['checkpoint'])
        except:
            settings["checkpoint"] = "True"
            logger.info('[%s] : [INFO] Checkpointing is set to True',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    else:
        logger.info('[%s] : [INFO] Checkpointing is set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['checkpoint'])
    if settings["delay"] is None:
        try:
            settings["delay"] = readCnf['Misc']['delay']
            # logger.info('[%s] : [INFO] Delay is set to %s',
            #             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['delay'])
        except:
            settings["delay"] = "2m"
            logger.info('[%s] : [INFO] Delay is set to %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['delay'])
    else:
        logger.info('[%s] : [INFO] Delay is set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['delay'])
    if settings["interval"] is None:
        try:
            settings["interval"] = readCnf['Misc']['interval']
            logger.info('[%s] : [INFO] Interval is set to %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['interval'])
        except:
            settings["interval"] = "15m"
            logger.info('[%s] : [INFO] Interval is set to %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['interval'])
    else:
        logger.info('[%s] : [INFO] Interval is set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['interval'])
    if settings["resetindex"] is None:
        try:
            settings["resetindex"] = readCnf['Misc']['resetindex']
        except:
            settings["resetindex"] = False
    else:
        logger.info('[%s] : [INFO] Reset index set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['resetindex'])
    try:
        settings['dmonPort'] = readCnf['Connector']['dmonport']
        logger.info('[{}] : [INFO] DMon Port is set to {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['dmonPort']))
    except:
        logger.info('[%s] : [INFO] DMon Port is set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(settings['dmonPort']))
    try:
        settings['training'] = readCnf['Detect']['training']
        logger.info('[{}] : [INFO] Classification Training set is {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), readCnf['Detect']['training']))
    except:
        logger.info('[%s] : [INFO] Classification Training set is %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(settings['training']))
    # try:
    #     print("Classification Validation set is %s" % readCnf['Detect']['validation'])
    #     settings['validation'] = readCnf['Detect']['validation']
    # except:
    #     print("Classification Validation set is default")
    #     logger.info('[%s] : [INFO] Classification Validation set is %s',
    #                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(settings['validation']))
    try:
        # print("Classification validation ratio is set to %d" % int(readCnf['Training']['ValidRatio']))
        logger.info('[{}] : [INFO] Classification validation ratio is set to {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), readCnf['Training']['ValidRatio']))
        if float(readCnf['Training']['ValidRatio']) > 1.0:
            # print("Validation ratio is out of range, must be between 1.0 and 0.1")
            settings['validratio'] = 0.0
            logger.warning('[{}] : [WARN] Validation ratio is out of range, must be between 1.0 and 0.1, overwriting'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), readCnf['Training']['ValidRatio']))
        settings['validratio'] = float(readCnf['Detect']['validratio'])
    except:
        logger.warning('[{}] : [WARN] Validation ratio is set to default'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        logger.info('[%s] : [INFO] Classification Validation ratio is %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(settings['validratio']))
    # try:
    #     print("Classification comparison is set to %s" % readCnf['Detect']['compare'])
    #     settings['compare'] = readCnf['Detect']['compare']
    # except:
    #     print("Classification comparison is default")
    #     logger.info('[%s] : [INFO] Classification comparison is %s',
    #                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['compare'])
    try:
        # print("Classification data generation using only anomalies set to %s" % readCnf['Detect']['anomalyOnly'])
        settings['anomalyOnly'] = readCnf['Detect']['anomalyOnly']
    except:
        # print("Classification data generation using only anomalies set to False")
        pass
    logger.info('[%s] : [INFO] Classification data generation using only anomalies set to %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(settings['anomalyOnly']))
    if settings["categorical"] is None:
        try:
            if not readCnf['Augmentation']['Categorical']:
                readCnf['Augmentation']['Categorical'] = None
            logger.info('[{}] : [INFO] Categorical columns defined as: {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), readCnf['Augmentation']['Categorical']))
            if readCnf['Augmentation']['Categorical'] == '0':
                settings["categorical"] = None
            else:
                settings["categorical"] = readCnf['Augmentation']['Categorical']
            logger.info('[%s] : [INFO] Categorical Features -> %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['categorical'])
        except:
            logger.warning('[%s] : [WARN] No Categorical Features selected from config file or commandline! Skipping encoding',
                           datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            settings["categorical"] = None
    else:
        logger.info('[%s] : [INFO] Categorical Features -> %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["categorical"])
    if not settings["point"]:
        try:
            settings['point'] = readCnf['Misc']['point']
            logger.info('[%s] : [INFO] Point set to %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['point'])
        except:
            settings['point'] = 'False'
            logger.info('[%s] : [INFO] Point detection set to default %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['point'])

    # print dmonC
    # sys.exit()
    # print("Conf file -> %s" % readCnf)
    # print("Settings -> %s" % settings)

    engine = aspideedengine.EDEngine(settings, dataDir=dataDir, modelsDir=modelsDir, queryDir=queryDir)
    # engine.printTest()
    engine.initConnector()
    if dask_backend:
        engine.runDask(engine)
    else:
        try:
            engine.runProcess(engine)
        except Exception as inst:
            logger.error('[{}] : [ERROR] Failed Process backend initialization with {} and {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
            logger.warning('[{}] : [WARN] Initializing default threaded engine, limited performance to be expected!'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
            engine.run(engine)
    logger.info('[{}] : [INFO] Exiting EDE framework'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
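# Illustrative configuration sketch for the settings parsed above (YAML shown as a comment;
# the key names come from the readCnf lookups in main(), all values are made up):
#
#   Connector:
#     PREndpoint: 85.120.206.45
#     MetricsInterval: 10s
#     QSize: 0
#     KafkaEndpoint: None
#     Dask:
#       SchedulerEndpoint: local
#       SchedulerPort: 8787
#       EnforceCheck: False
#   Mode:
#     Training: True
#     Validate: False
#     Detect: True
#   Training:
#     Method: isoforest
#     Export: example_model
#   Detect:
#     Method: isoforest
#     Load: example_model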
if __name__ == "__main__":
    def handler(signal_received, frame):
        logger.info('[{}] : [INFO] User break detected. Exiting EDE framework'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        sys.exit(0)

    signal(SIGINT, handler)
    SchedulerEndpoint, Scale, SchedulerPort, EnforceCheck = check_dask_settings()  # Todo Better solution
    if SchedulerEndpoint:
        if SchedulerEndpoint == "local":
            cluster = LocalCluster(n_workers=int(Scale))
            logger.info('[{}] : [INFO] Starting Dask local Cluster Backend with: {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), cluster))
            client = Client(cluster)
            logger.info('[{}] : [INFO] Dask Client started with: {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), client))
        else:
            scheduler_address = "{}:{}".format(SchedulerEndpoint, SchedulerPort)
            client = Client(address=scheduler_address)
            client.get_versions(check=EnforceCheck)
    else:
        cluster = 0
        client = 0
    main(sys.argv[1:], cluster, client)
def dict2csv(self, response, query, filename, df=False):
    '''
    :param response: elasticsearch response
    :param query: elasticsearch query
    :param filename: name of file
    :param df: if set to true method returns dataframe and doesn't save to file.
    :return: 0 if saved to file and dataframe if not
    '''
    requiredMetrics = []
    logger.info('[%s] : [INFO] Started response to csv conversion',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    # print "This is the query _------------_-> %s" %query
    # print "This is the response _------------_-> %s" %response
    # In Python 3 dict views are not subscriptable; extract the aggregated field name
    # from the query definition once and reuse it below.
    qAgg = list(query['aggs'].values())[0]
    qSubAgg = list(qAgg.values())[1]
    qField = list(list(list(qSubAgg.values())[0].values())[0].values())[0]
    for key, value in response['aggregations'].items():
        for k, v in value.items():
            for r in v:
                dictMetrics = {}
                # print "This is the dictionary ---------> %s " % str(r)
                for rKey, rValue in r.items():
                    if rKey == 'doc_count' or rKey == 'key_as_string':
                        pass
                    elif rKey == 'key':
                        logger.debug('[%s] : [DEBUG] Request has keys %s and values %s',
                                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), rKey, rValue)
                        # print "%s -> %s"% (rKey, rValue)
                        dictMetrics['key'] = rValue
                    elif qField == 'type_instance.raw' or qField == 'type_instance':
                        logger.debug('[%s] : [DEBUG] Detected Memory type aggregation',
                                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
                        # print "This is rValue ________________> %s" % str(rValue)
                        # print "Keys of rValue ________________> %s" % str(rValue.keys())
                        try:
                            for val in rValue['buckets']:
                                dictMetrics[val['key']] = val['1']['value']
                        except Exception as inst:
                            logger.error('[%s] : [ERROR] Failed to find key with %s and %s',
                                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), rKey,
                                         rValue['value'])
                            sys.exit(1)
                    else:
                        # print "Values -> %s" % rValue
                        # print "rKey -> %s" % rKey
                        # print "This is the rValue ___________> %s " % str(rValue)
                        logger.debug('[%s] : [DEBUG] Request has keys %s and flattened values %s',
                                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), rKey,
                                     rValue['value'])
                        dictMetrics[rKey] = rValue['value']
                requiredMetrics.append(dictMetrics)
    # print "Required Metrics -> %s" % requiredMetrics
    csvOut = os.path.join(self.dataDir, filename)
    cheaders = []
    if qField == "type_instance.raw" or qField == 'type_instance':
        logger.debug('[%s] : [DEBUG] Detected Memory type query',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
        try:
            cheaders = list(requiredMetrics[0].keys())
        except IndexError:
            logger.error('[%s] : [ERROR] Empty response detected from DMon, stopping detection, check DMon.',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            print("Empty response detected from DMon, stopping detection, check DMon")
            sys.exit(1)
    else:
        kvImp = {}
        for qKey, qValue in query['aggs'].items():
            logger.info('[%s] : [INFO] Value aggs from query %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), qValue['aggs'])
            for v, t in qValue['aggs'].items():
                kvImp[v] = t['avg']['field']
                cheaders.append(v)
        cheaders.append('key')
        for key, value in kvImp.items():
            cheaders[cheaders.index(key)] = value
        for e in requiredMetrics:
            for krep, vrep in kvImp.items():
                e[vrep] = e.pop(krep)
        logger.info('[%s] : [INFO] Dict translator %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(kvImp))
    logger.info('[%s] : [INFO] Headers detected %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(cheaders))
    if not df:
        try:
            # csv.DictWriter expects a text-mode handle in Python 3
            with open(csvOut, 'w', newline='') as csvfile:
                w = csv.DictWriter(csvfile, cheaders)
                w.writeheader()
                for metrics in requiredMetrics:
                    if set(cheaders) != set(metrics.keys()):
                        logger.error('[%s] : [ERROR] Headers different from required metrics: headers -> %s, metrics -> %s',
                                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(cheaders),
                                     str(list(metrics.keys())))
                        diff = list(set(metrics.keys()) - set(cheaders))
                        print("Headers different from required metrics with %s " % diff)
                        print("Check qInterval setting for all metrics. Try increasing it!")
                        sys.exit(1)
                    w.writerow(metrics)
        except EnvironmentError:
            logger.error('[%s] : [ERROR] File %s could not be created',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), csvOut)
            sys.exit(1)
        logger.info('[%s] : [INFO] Finished csv %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), filename)
        return 0
    else:
        df = pd.DataFrame(requiredMetrics)
        # df.set_index('key', inplace=True)
        logger.info('[%s] : [INFO] Created dataframe',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
        return df
def computeOnColumns(self, df, operations, remove_filtered=True):
    if operations:
        if 'STD' in list(operations.keys()):
            std = operations['STD']
        else:
            std = None
        if 'Mean' in list(operations.keys()):
            mean = operations['Mean']
        else:
            mean = None
        if 'Median' in list(operations.keys()):
            median = operations['Median']
        else:
            median = None
        all_processed_columns = []
        if std or std is not None:
            for cl_std in std:
                for ncol_n, fcol_n in cl_std.items():
                    df_std = self.filterColumns(df, lColumns=fcol_n)
                    logger.info('[{}] : [INFO] Computing standard deviation {} on columns {}'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), ncol_n, fcol_n))
                    std_df = df_std.std(axis=1, skipna=True)
                    df[ncol_n] = std_df
                    for c in fcol_n:
                        all_processed_columns.append(c)
        if mean or mean is not None:
            for cl_mean in mean:
                for ncol_n, fcol_n in cl_mean.items():
                    df_mean = self.filterColumns(df, lColumns=fcol_n)
                    logger.info('[{}] : [INFO] Computing mean {} on columns {}'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), ncol_n, fcol_n))
                    mean_df = df_mean.mean(axis=1, skipna=True)
                    df[ncol_n] = mean_df
                    for c in fcol_n:
                        all_processed_columns.append(c)
        if median or median is not None:
            for cl_median in median:
                for ncol_n, fcol_n in cl_median.items():
                    df_median = self.filterColumns(df, lColumns=fcol_n)
                    logger.info('[{}] : [INFO] Computing median {} on columns {}'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), ncol_n, fcol_n))
                    median_df = df_median.median(axis=1, skipna=True)
                    df[ncol_n] = median_df
                    for c in fcol_n:
                        all_processed_columns.append(c)
        if "Method" in list(operations.keys()):
            df = self.__operationMethod(operations['Method'], data=df)
        if remove_filtered:
            unique_all_processed_columns = list(set(all_processed_columns))
            logger.warning('[{}] : [WARN] Dropping columns used for computation: {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), unique_all_processed_columns))
            self.dropColumns(df, unique_all_processed_columns, cp=False)
    else:
        logger.info('[{}] : [INFO] No data operations/augmentations defined'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
    logger.info('[{}] : [INFO] Augmented data shape {}'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), df.shape))
    return df
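# Example operations mapping (sketch; column names are hypothetical). Each entry maps a new
# aggregate column to the list of source columns it is computed from; the sources are dropped
# afterwards when remove_filtered is True.
#
#   operations = {
#       'Mean': [{'mean_node_load': ['node1_load', 'node2_load']}],
#       'STD': [{'std_node_load': ['node1_load', 'node2_load']}]
#   }
#   df_aug = engine.computeOnColumns(df, operations, remove_filtered=True)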
def dropMissing(self, df):
    logger.info('[{}] : [INFO] Dropping columns with all missing values'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
    df.dropna(axis=1, how='all', inplace=True)
def chainMergeSystem(self, linterface=None, lload=None, lmemory=None, lpack=None):
    logger.info('[%s] : [INFO] Starting system metrics merge .......',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    # Read files
    if linterface is None and lload is None and lmemory is None and lpack is None:
        allIterface = glob.glob(os.path.join(self.dataDir, "Interface_*.csv"))
        allLoad = glob.glob(os.path.join(self.dataDir, "Load_*.csv"))
        allMemory = glob.glob(os.path.join(self.dataDir, "Memory_*.csv"))
        allPackets = glob.glob(os.path.join(self.dataDir, "Packets_*.csv"))
        # Name of merged files
        mergedInterface = os.path.join(self.dataDir, "Interface.csv")
        mergedLoad = os.path.join(self.dataDir, "Load.csv")
        mergedMemory = os.path.join(self.dataDir, "Memory.csv")
        mergedPacket = os.path.join(self.dataDir, "Packets.csv")
        ftd = 1
    else:
        allIterface = linterface
        allLoad = lload
        allMemory = lmemory
        allPackets = lpack
        ftd = 0
    colNamesInterface = {'rx': 'rx_master', 'tx': 'tx_master'}
    df_interface = self.chainMerge(allIterface, colNamesInterface)
    logger.info('[%s] : [INFO] Interface metrics merge complete',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    colNamesPacket = {'rx': 'rx_master', 'tx': 'tx_master'}
    df_packet = self.chainMerge(allPackets, colNamesPacket)
    logger.info('[%s] : [INFO] Packet metrics merge complete',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    colNamesLoad = {'shortterm': 'shortterm_master', 'midterm': 'midterm_master', 'longterm': 'longterm_master'}
    df_load = self.chainMerge(allLoad, colNamesLoad)
    logger.info('[%s] : [INFO] Load metrics merge complete',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    colNamesMemory = {'cached': 'cached_master', 'buffered': 'buffered_master', 'used': 'used_master',
                      'free': 'free_master'}
    df_memory = self.chainMerge(allMemory, colNamesMemory)
    logger.info('[%s] : [INFO] Memory metrics merge complete',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    logger.info('[%s] : [INFO] System metrics merge complete',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    if ftd:
        self.df2csv(df_interface, mergedInterface)
        self.df2csv(df_packet, mergedPacket)
        self.df2csv(df_load, mergedLoad)
        self.df2csv(df_memory, mergedMemory)
        return 0
    else:
        return df_interface, df_load, df_memory, df_packet
def handler(signal_received, frame):
    logger.info('[{}] : [INFO] User break detected. Exiting EDE framework'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
    sys.exit(0)
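# Illustrative sketch (not part of the original source): wiring the handler above to
# SIGINT so that Ctrl+C exits the EDE framework cleanly. Assumes the standard library
# `signal` module is available; the handler's (signal_received, frame) signature matches
# what signal.signal expects.
import signal
signal.signal(signal.SIGINT, handler)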
def detect(self, method, model, data):
    '''
    :param method: -> method name
    :param model: -> trained clusterer
    :param data: -> dataframe with data
    :return: -> dictionary that contains the list of anomalous timestamps
    '''
    smodel = self.__loadClusterModel(method, model)
    anomalieslist = []
    if not smodel:
        dpredict = 0
    else:
        if data.shape[0]:
            if isinstance(smodel, IsolationForest):
                logger.info('[{}] : [INFO] Loading predictive model IsolationForest'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                for k, v in smodel.get_params().items():
                    logger.info('[{}] : [INFO] Predict model parameter {} set to {}'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
                # print("Contamination -> %s" % smodel.contamination)
                # print("Max_Features -> %s" % smodel.max_features)
                # print("Max_Samples -> %s" % smodel.max_samples_)
                # print("Threashold -> %s " % smodel.threshold_)
                try:
                    dpredict = smodel.predict(data)
                    logger.debug('[{}] : [DEBUG] IsolationForest prediction array: {}'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(dpredict)))
                except Exception as inst:
                    logger.error('[%s] : [ERROR] Error while predicting with IsolationForest model with %s and %s',
                                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                                 type(inst), inst.args)
                    dpredict = 0
            elif isinstance(smodel, DBSCAN):
                logger.info('[{}] : [INFO] Loading predictive model DBSCAN'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                for k, v in smodel.get_params().items():
                    logger.info('[{}] : [INFO] Predict model parameter {} set to {}'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
                # print("Leaf_size -> %s" % smodel.leaf_size)
                # print("Algorithm -> %s" % smodel.algorithm)
                # print("EPS -> %s" % smodel.eps)
                # print("Min_Samples -> %s" % smodel.min_samples)
                # print("N_jobs -> %s" % smodel.n_jobs)
                try:
                    dpredict = smodel.fit_predict(data)
                except Exception as inst:
                    logger.error('[%s] : [ERROR] Error while fitting DBSCAN model to event with %s and %s',
                                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                                 type(inst), inst.args)
                    dpredict = 0
        else:
            dpredict = 0
            logger.warning('[%s] : [WARN] Dataframe empty with shape (%s,%s)',
                           datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                           str(data.shape[0]), str(data.shape[1]))
            print("Empty dataframe received with shape (%s,%s)" % (str(data.shape[0]), str(data.shape[1])))
    print("dpredict type is %s" % (type(dpredict)))
    if type(dpredict) is not int:
        anomalyarray = np.argwhere(dpredict == -1)
        for an in anomalyarray:
            anomalies = {}
            anomalies['utc'] = int(data.iloc[an[0]].name)
            anomalies['hutc'] = ut2hum(int(data.iloc[an[0]].name))
            anomalieslist.append(anomalies)
    anomaliesDict = {}
    anomaliesDict['anomalies'] = anomalieslist
    logger.info('[%s] : [INFO] Detected anomalies with model %s using method %s are -> %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), model, method, str(anomaliesDict))
    return anomaliesDict
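# Illustrative sketch (not part of the original source): invoking detect() with a
# previously trained model. The method and model names are hypothetical; `engine` is an
# assumed instance of this class and `query_df` a DataFrame indexed by UTC timestamps
# (detect() reads the index via data.iloc[...].name).
# result = engine.detect(method='isoforest', model='cluster_v1', data=query_df)
# for anomaly in result['anomalies']:
#     print(anomaly['utc'], anomaly['hutc'])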
def run(self):
    logger.info('[{}] : [INFO] Starting engine Detect process {}'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), self.processID))
    p = multiprocessing.Process(target=self.engine.detectAnomalies)
    return p
def __init__(self, prEndpoint=None, esEndpoint=None, dmonPort=5001, MInstancePort=9200,
             index="logstash-*", prKafkaEndpoint=None, prKafkaPort=9092, prKafkaTopic='edetopic'):
    self.dataDir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    if esEndpoint is None:
        self.esInstance = None
    else:
        self.esInstance = Elasticsearch(esEndpoint)
        self.esEndpoint = esEndpoint
        self.dmonPort = dmonPort
        self.esInstanceEndpoint = MInstancePort
        self.myIndex = index
        logger.info('[{}] : [INFO] EDE ES backend Defined at: {} with port {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), esEndpoint, MInstancePort))
    if prEndpoint is None:
        pass
    else:
        self.prEndpoint = prEndpoint
        self.MInstancePort = MInstancePort
        logger.info('[{}] : [INFO] EDE PR backend Defined at: {} with port {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), prEndpoint, MInstancePort))
    self.dataDir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    if prKafkaEndpoint is None:
        self.producer = None
        logger.warning('[{}] : [WARN] EDE Kafka reporter not set'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
    else:
        self.prKafkaTopic = prKafkaTopic
        try:
            self.producer = KafkaProducer(value_serializer=lambda v: json.dumps(v).encode('utf-8'),
                                          bootstrap_servers=["{}:{}".format(prKafkaEndpoint, prKafkaPort)],
                                          retries=5)
            logger.info('[{}] : [INFO] EDE Kafka reporter initialized to server {}:{}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), prKafkaEndpoint, prKafkaPort))
        except Exception as inst:
            logger.error('[{}] : [ERROR] EDE Kafka reporter failed with {} and {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
            self.producer = None
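# Illustrative sketch (not part of the original source): typical construction of the
# connector class that owns __init__ above (the class name `Connector` and the endpoint
# values are assumptions; only keyword arguments shown in the signature are used).
# connector = Connector(esEndpoint='127.0.0.1', dmonPort=5001, MInstancePort=9200,
#                       prKafkaEndpoint='127.0.0.1', prKafkaPort=9092, prKafkaTopic='edetopic')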
def dask_detect(self,
                method,
                model,
                data,
                anomaly_label=-1  # Todo make anomaly label user definable
                ):
    smodel = self.__loadClusterModel(method, model)
    anomaliesList = []
    anomaliesDict = {}
    shap_values_p = 0
    if not smodel:
        dpredict = 0
    else:
        if data.shape[0]:
            try:
                logger.info('[{}] : [INFO] Loading predictive model {} '.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(smodel).split('(')[0]))
                for k, v in smodel.get_params().items():
                    logger.info('[{}] : [INFO] Predict model parameter {} set to {}'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
                dpredict = smodel.predict(data)
            except Exception as inst:
                logger.error('[{}] : [ERROR] Failed to load predictive model with {} and {}'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
                dpredict = 0
        else:
            dpredict = 0
            logger.warning('[{}] : [WARN] DataFrame is empty with shape {} '.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(data.shape)))
    if list(np.unique(dpredict)) == [0, 1] or isinstance(smodel, pyod.models.iforest.IForest):
        anomaly_label = 1
    else:
        anomaly_label = -1
    if type(dpredict) is not int:
        anomalyArray = np.argwhere(dpredict == anomaly_label)
        if self.pred_analysis and anomalyArray.shape[0]:
            try:
                plot = self.pred_analysis['Plot']
                # print(self.pred_analysis['Plot'])
            except Exception:
                plot = False
            feature_importance, shap_values = self.__shap_analysis(model=smodel, data=data, plot=plot)
            anomaliesDict['complete_shap_analysis'] = feature_importance
            shap_values_p = 1
        count = 0
        for an in anomalyArray:
            anomalies = {}
            anomalies['utc'] = int(data.iloc[an[0]].name)
            anomalies['hutc'] = ut2hum(int(data.iloc[an[0]].name))
            if shap_values_p:
                anomalies['analysis'] = self.__shap_force_layout(shap_values=shap_values, instance=count)
            anomaliesList.append(anomalies)
            count += 1
    anomaliesDict['anomalies'] = anomaliesList
    logger.info('[{}] : [INFO] Detected {} anomalies with model {} using method {} '.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
        len(anomaliesList), model, str(smodel).split('(')[0]))
    return anomaliesDict
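# Illustrative sketch (not part of the original source): the shape of the dictionary
# returned by dask_detect, reconstructed from the code above. The values are invented
# placeholders; 'complete_shap_analysis' and the per-anomaly 'analysis' entry are only
# present when SHAP-based prediction analysis is enabled.
example_detection = {
    'complete_shap_analysis': {},  # aggregated feature importance (optional)
    'anomalies': [
        {'utc': 1579701600, 'hutc': '2020-01-22 14:00:00'},  # may also carry an 'analysis' entry
    ],
}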
def dask_clusterMethod(self, cluster_method, mname, data):
    try:
        logger.info('[{}] : [INFO] Loading Clustering method {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(cluster_method)))
        # delattr(cluster_method, 'behaviour')
        # del cluster_method.__dict__['behaviour']
        for k, v in cluster_method.get_params().items():
            logger.info('[{}] : [INFO] Method parameter {} set to {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
        try:
            with joblib.parallel_backend('dask'):
                logger.info('[{}] : [INFO] Using Dask backend for user defined method'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                clf = cluster_method.fit(data)
        except Exception as inst:
            logger.error('[{}] : [ERROR] Failed to fit user defined method with dask backend with {} and {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
            logger.warning('[{}] : [WARN] using default process based backend for user defined method'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
            clf = cluster_method.fit(data)
    except Exception as inst:
        logger.error('[{}] : [ERROR] Failed to fit {} with {} and {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            type(cluster_method), type(inst), inst.args))
        sys.exit(1)
    predictions = clf.predict(data)
    if list(np.unique(predictions)) == [0, 1]:
        anomaly_marker = 1
        normal_marker = 0
    else:
        anomaly_marker = -1
        normal_marker = 1
    logger.info('[{}] : [INFO] Number of Predicted Anomalies {} from a total of {} datapoints.'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
        list(predictions).count(anomaly_marker), len(list(predictions))))
    logger.debug('[{}] : [DEBUG] Predicted Anomaly Array {}'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), predictions))
    fname = str(clf).split('(')[0]
    self.__serializemodel(clf, fname, mname)
    self.__plot_feature_sep(data, predictions, method=fname, mname=mname,
                            anomaly_label=anomaly_marker, normal_label=normal_marker)
    self.__decision_boundary(clf, data, method=fname, mname=mname, anomaly_label=anomaly_marker)
    return clf
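# Illustrative sketch (not part of the original source): training a user defined
# clustering / anomaly detection method through dask_clusterMethod. IsolationForest is
# only one example of an estimator exposing fit/predict/get_params; the model name
# 'iso_demo', the `engine` instance and `train_df` are assumptions.
# from sklearn.ensemble import IsolationForest
# clf = engine.dask_clusterMethod(cluster_method=IsolationForest(n_estimators=100),
#                                 mname='iso_demo', data=train_df)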
def test(times, processID):
    logger.info('[{}] : [INFO] Starting Engine Point process {}'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), processID))
    time.sleep(times)
    logger.info('[{}] : [INFO] Exit Engine Point process {}'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), processID))
def run(self):
    logger.info('[{}] : [INFO] Starting Engine Train thread {}'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), self.threadID))
    self.engine.trainMethod()
def fillMissing(self, df):
    logger.info('[{}] : [WARN] Filling in missing values with 0'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
    df.fillna(0, inplace=True)
def run(self):
    logger.info('[{}] : [INFO] Starting Engine Detect thread {}'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), self.threadID))
    self.engine.detectAnomalies()
def __decision_boundary(self, model, data, method, mname, anomaly_label=-1):
    """
    :param model: model to be refitted with 2 features (PCA)
    :param data: dataset after PCA
    :param method: method used for plotting decision boundary
    :param mname: name of the model to be displayed
    :param anomaly_label: label for anomaly instances (differs from method to method)
    """
    logger.info('[{}] : [INFO] Computing PCA with 2 components for decision boundary ...'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
    transformer = PCA(n_components=2)
    transformer.fit(data)
    data = transformer.transform(data)
    # print("PCA data shape: {}".format(data.shape))
    # fit model
    try:
        model.set_params(max_features=data.shape[-1])  # because we now have only two features we must override the previous setting
    except ValueError:
        logger.debug('[{}] : [DEBUG] Model not affected by max feature parameter, setting encoding and decoding size'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        model.set_params(encoder_neurons=[2, 64, 32], decoder_neurons=[32, 64, 2])
    model.fit(data)
    y_pred_outliers = model.predict(data)
    # get anomaly index
    anomaly_index_rf = np.where(y_pred_outliers == anomaly_label)
    # Get anomalies based on index
    ano_rf = data[anomaly_index_rf]
    # plot the line, the samples, and the nearest vectors to the plane
    xx, yy = np.meshgrid(np.linspace(-15, 25, 80), np.linspace(-5, 20, 80))
    Z = model.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.title(f"Decision Boundary for {method} with name {mname}")
    plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
    plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='black')
    b1 = plt.scatter(data[:, 0], data[:, 1], c='white', s=20, edgecolor='k')
    c = plt.scatter(ano_rf[:, 0], ano_rf[:, 1], c='red', s=20, edgecolor='k')
    plt.axis('tight')
    plt.xlim((-15, 25))
    plt.ylim((-5, 20))
    plt.legend([b1, c], ["normal", "anomaly"], loc="upper left")
    plot_name = f"Decision_Boundary_{method}_{mname}.png"
    plt.savefig(os.path.join(self.modelDir, plot_name))
    plt.close()