Example #1
def check_dask_settings(cnf=None):
    cnf_file = 'ede_config.yaml'
    if cnf is None:
        cnf = []
    try:
        opts, args = getopt.getopt(
            cnf, "he:tf:m:vx:d:lq:",
            ["endpoint=", "file=", "method=", "export=", "detect=", "query="])
    except getopt.GetoptError:
        logger.warning(
            '[%s] : [WARN] Invalid argument received exiting',
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
        print("ede.py -f <filelocation>, -t -m <method> -v -x <modelname>")
        sys.exit(0)
    for opt, arg in opts:
        if opt in ("-f", "--file"):
            cnf_file = arg
    try:
        with open(cnf_file) as cf:
            readCnf = yaml.unsafe_load(cf)
        SchedulerEndpoint = readCnf['Connector']['Dask']['SchedulerEndpoint']
        Scale = readCnf['Connector']['Dask']['Scale']
        SchedulerPort = readCnf['Connector']['Dask']['SchedulerPort']
        EnforceCheck = readCnf['Connector']['Dask']['EnforceCheck']
    except Exception:
        SchedulerEndpoint = 0
        Scale = 0
        SchedulerPort = 0
        EnforceCheck = 0
    return SchedulerEndpoint, Scale, SchedulerPort, EnforceCheck
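For reference, a minimal sketch of the Dask connector block that check_dask_settings() expects in ede_config.yaml; the key names come from the lookups above, while the values are hypothetical:

import yaml

# Hypothetical config excerpt; only the keys read by check_dask_settings()
example_cnf = """
Connector:
  Dask:
    SchedulerEndpoint: local
    Scale: 3
    SchedulerPort: 8787
    EnforceCheck: false
"""

cfg = yaml.safe_load(example_cnf)
dask_cfg = cfg['Connector']['Dask']
print(dask_cfg['SchedulerEndpoint'], dask_cfg['Scale'],
      dask_cfg['SchedulerPort'], dask_cfg['EnforceCheck'])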
Example #2
 def dask_clusterMethod(self, cluster_method,
                        mname,
                        data
                        ):
     try:
         logger.info('[{}] : [INFO] Loading Clustering method {}'.format(
             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(cluster_method)))
         # delattr(cluster_method, 'behaviour')
         # del cluster_method.__dict__['behaviour']
         for k, v in cluster_method.get_params().items():
             logger.info('[{}] : [INFO] Method parameter {} set to {}'.format(
                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
         try:
             with joblib.parallel_backend('dask'):
                 logger.info('[{}] : [INFO] Using Dask backend for user defined method'.format(
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                 clf = cluster_method.fit(data)
         except Exception as inst:
             logger.error('[{}] : [ERROR] Failed to fit user defined method with Dask backend with {} and {}'.format(
                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
             logger.warning('[{}] : [WARN] Using default process based backend for user defined method'.format(
                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
             clf = cluster_method.fit(data)
     except Exception as inst:
         logger.error('[{}] : [ERROR] Failed to fit {} with {} and {}'.format(
             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(cluster_method),
             type(inst), inst.args))
         sys.exit(1)
     predictions = clf.predict(data)
     logger.debug('[{}] : [DEBUG] Predicted Anomaly Array {}'.format(
         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), predictions))
     fname = str(clf).split('(')[0]
     self.__serializemodel(clf, fname, mname)
     return clf
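The Dask-backed fit above can be reproduced outside the class in a few lines: any scikit-learn estimator is fitted under joblib's Dask backend once a dask.distributed Client is connected. A minimal sketch, assuming dask.distributed, joblib and scikit-learn are installed; the local Client and the KMeans estimator are illustrative stand-ins, not what EDE ships:

import joblib
import numpy as np
from dask.distributed import Client
from sklearn.cluster import KMeans

client = Client(processes=False)       # hypothetical local scheduler; EDE targets a remote endpoint
X = np.random.rand(500, 10)            # stand-in for the metrics dataframe

with joblib.parallel_backend('dask'):  # route joblib-parallel work to the Dask workers
    model = KMeans(n_clusters=3, n_init=10).fit(X)

print(model.cluster_centers_.shape)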
Example #3
 def scale(self, data,
           scaler_type=None,
           rindex='time'):  # todo, integrate
     if scaler_type is None:
         scaler_type = {"StandardScaler": {"copy": True, "with_mean": True, "with_std": True}}
         logger.warning('[{}] : [WARN] No user defined scaler, using default {}'.format(
             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), scaler_type))
     if not scaler_type:
         logger.warning('[{}] : [WARN] No data scaling used!'.format(
             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
         return data
     scaler_name = list(scaler_type.keys())[-1]
     scaler_attr = list(scaler_type.values())[-1]
     logger.info('[{}] : [INFO] Scaler set to {} with parameters {}.'.format(datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), scaler_name, scaler_attr))
     try:
         sc_mod = importlib.import_module(self.scaler_mod)
         scaler_instance = getattr(sc_mod, scaler_name)
         scaler = scaler_instance(**scaler_attr)
     except Exception as inst:
         logger.error('[{}] : [ERROR] Error while initializing scaler {}'.format(
         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), scaler_name))
         sys.exit(2)
     # Fit and transform data
     logger.info('[{}] : [INFO] Scaling data ...'.format(
         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
     scaled_data = scaler.fit_transform(data)
     # Transform numpy array into dataframe, re-add columns to scaled numpy array
     df_scaled = pd.DataFrame(scaled_data, columns=data.columns)
     df_scaled[rindex] = list(data.index)
     df_scaled.set_index(rindex, inplace=True)
     scaler_file = '{}.scaler'.format(scaler_name)
     logger.info('[{}] : [INFO] Saving scaler instance {} ...'.format(
         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), scaler_file))
     scale_file_location = os.path.join(self.dataDir, scaler_file)
     joblib.dump(scaler, filename=scale_file_location)
     return df_scaled
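The scaler lookup is plain dynamic attribute access on a module (self.scaler_mod presumably names 'sklearn.preprocessing'); a standalone sketch of the same idea, re-attaching the original index after scaling, with hypothetical column names:

import importlib
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(5, 3), columns=['cpu', 'mem', 'net'],
                  index=pd.RangeIndex(5, name='time'))     # hypothetical metrics frame

scaler_type = {"StandardScaler": {"copy": True, "with_mean": True, "with_std": True}}
scaler_name, scaler_attr = list(scaler_type.items())[-1]

sc_mod = importlib.import_module('sklearn.preprocessing')  # assumed value of self.scaler_mod
scaler = getattr(sc_mod, scaler_name)(**scaler_attr)

df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)
print(df_scaled.head())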
 def pushAnomalyKafka(self, body):
     if self.producer is None:
         logger.warning(
             '[{}] : [WARN] Kafka reporter not defined, skipping reporting'.
             format(
                 datetime.fromtimestamp(
                     time.time()).strftime('%Y-%m-%d %H:%M:%S')))
     else:
         try:
             self.producer.send(self.prKafkaTopic, body)
             # self.producer.flush()
             logger.info(
                 '[{}] : [INFO] Anomalies reported to kafka topic {}'.
                 format(
                     datetime.fromtimestamp(
                         time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                     self.prKafkaTopic))
         except Exception as inst:
             logger.error(
                 '[{}] : [ERROR] Failed to report anomalies to kafka topic {} with {} and {}'
                 .format(
                     datetime.fromtimestamp(
                         time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                     self.prKafkaTopic, type(inst), inst.args))
     return 0
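A standalone sketch of the same reporting path with kafka-python: the producer serializes the anomaly dict to JSON, matching how the constructor further down builds self.producer (broker address, topic and payload here are hypothetical):

import json
from kafka import KafkaProducer

producer = KafkaProducer(
    bootstrap_servers=['localhost:9092'],                        # hypothetical broker
    value_serializer=lambda v: json.dumps(v).encode('utf-8'))

body = {'anomalies': [{'utc': 1477914720, 'hutc': '2016-10-31 12:32:00'}]}  # example payload
producer.send('edetopic', body)    # default prKafkaTopic name
producer.flush()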
Example #5
    def dask_sdbscanTrain(self, settings, mname, data, scaler=None):
        '''
        :param data: -> dataframe with data
        :param settings: -> settings dictionary
        :param mname: -> name of serialized clusterer
        :param scaler: -> scaler to use on data
        :return: -> clusterer
        :example settings: -> {eps: 0.9, min_samples: 10, metric: 'euclidean',
        algorithm: 'auto', leaf_size: 30, p: 0.2, n_jobs: 1}
        '''

        if scaler is None:
            logger.warning('[{}] : [WARN] Scaler not defined'.format(
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        else:
            logger.info('[{}] : [INFO] Scaling data ...'.format(
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S')))
            data = scaler.fit_transform(data)

        if not settings or settings is None:
            logger.warning(
                '[{}] : [WARN] No DBScan parameters defined using default'.
                format(
                    datetime.fromtimestamp(
                        time.time()).strftime('%Y-%m-%d %H:%M:%S')))
            settings = {}
        else:
            for k, v in settings.items():
                logger.info(
                    '[{}] : [INFO] DBScan parameter {} set to {}'.format(
                        datetime.fromtimestamp(
                            time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))

        try:
            db = DBSCAN(**settings).fit(data)
        except Exception as inst:
            logger.error(
                '[{}] : [ERROR] Failed to instantiate DBScan with {} and {}'.
                format(
                    datetime.fromtimestamp(
                        time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
                    inst.args))
            sys.exit(1)
        labels = db.labels_
        logger.info('[{}] : [INFO] DBScan labels: {} '.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            labels))
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        logger.info(
            '[{}] : [INFO] DBScan estimated number of clusters {} '.format(
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S'), n_clusters_))
        self.__serializemodel(db, 'sdbscan', mname)
        return db
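Stripped of logging and serialization, the training step reduces to unpacking the settings dictionary into scikit-learn's DBSCAN; a minimal sketch using values close to the docstring example (the p parameter is omitted since it only applies to the minkowski metric):

import numpy as np
from sklearn.cluster import DBSCAN

X = np.random.rand(200, 4)    # stand-in for the (scaled) training data

settings = {'eps': 0.9, 'min_samples': 10, 'metric': 'euclidean',
            'algorithm': 'auto', 'leaf_size': 30, 'n_jobs': 1}
db = DBSCAN(**settings).fit(X)

labels = db.labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)    # -1 marks noise points
print(n_clusters)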
Example #6
 def prtoDF(self, data,
            checkpoint=False,
            verbose=False,
            index=None,
            detect=False):
     """
     From PR backend to dataframe
     :param data: PR response JSON
     :return: dataframe
     """
     if not data:
         logger.error('[{}] : [ERROR] PR query response is empty, exiting.'.format(
                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
         sys.exit(2)
     df = pd.DataFrame()
     df_time = pd.DataFrame()
     if verbose:
         dr = tqdm.tqdm(data['data']['result'])
     else:
         dr = data['data']['result']
     for el in dr:
         metric_name = el['metric']['__name__']
         instance_name = el['metric']['instance']
         new_metric = "{}_{}".format(metric_name, instance_name)
         values = el['values']
         proc_val = []
         proc_time = []
         for val in values:
             proc_val.append(val[1])
             proc_time.append(val[0])
         df[new_metric] = proc_val
         time_new_metric = "time_{}".format(new_metric)
         df_time[time_new_metric] = proc_time
     # Calculate the mean time for all metrics
     df_time['mean'] = df_time.mean(axis=1)
     # Round the mean time up with np.ceil
     df_time['mean'] = df_time['mean'].apply(np.ceil)
     # Add the mean time to the rest of the metrics
     df['time'] = df_time['mean']
     logger.info('[{}] : [INFO] PR query resulted in dataframe of size: {}'.format(
                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), df.shape))
     if index is not None:
         df.set_index(index, inplace=True)
         logger.warning('[{}] : [WARN] PR query dataframe index set to  {}'.format(
             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), index))
     if checkpoint:
         if detect:
             pr = "pr_data_detect.csv"
         else:
             pr = "pr_data.csv"
         pr_csv_loc = os.path.join(self.dataDir, pr)
         df.to_csv(pr_csv_loc, index=True)
         logger.info('[{}] : [INFO] PR query dataframe persisted to {}'.format(
                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), self.dataDir))
     return df
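The loop assumes the standard Prometheus query_range response layout (data.result[].metric plus values); a small sketch of that shape and of flattening it into one column per metric/instance pair, with made-up response content:

import numpy as np
import pandas as pd

# Hypothetical excerpt of a Prometheus query_range JSON response
data = {'data': {'result': [
    {'metric': {'__name__': 'node_load1', 'instance': 'n1:9100'},
     'values': [[1477914720, '0.5'], [1477914730, '0.7']]},
    {'metric': {'__name__': 'node_load1', 'instance': 'n2:9100'},
     'values': [[1477914721, '0.2'], [1477914731, '0.3']]},
]}}

df, df_time = pd.DataFrame(), pd.DataFrame()
for el in data['data']['result']:
    col = "{}_{}".format(el['metric']['__name__'], el['metric']['instance'])
    df[col] = [v[1] for v in el['values']]
    df_time['time_' + col] = [v[0] for v in el['values']]

df['time'] = df_time.mean(axis=1).apply(np.ceil)   # one rounded mean timestamp per row
print(df)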
Example #7
 def getGT(self, data, gt='target'):
     if gt is None:
         logger.warning('[{}] : [WARN] Ground truth column not defined, fetching last column as target'.format(
             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
         features = data.columns[:-1]
         X = data[features]
         y = data.iloc[:, -1].values
     else:
         logger.info('[{}] : [INFO] Ground truth column set to {} '.format(
                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), gt))
         y = data[gt].values
         X = data.drop([gt], axis=1)
     return X, y
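The ground-truth split is an ordinary pandas column/drop operation; a short sketch on a hypothetical frame:

import pandas as pd

data = pd.DataFrame({'cpu': [0.1, 0.9], 'mem': [0.2, 0.8], 'target': [0, 1]})  # hypothetical
y = data['target'].values           # ground truth column
X = data.drop(['target'], axis=1)   # remaining feature columns
print(X.columns.tolist(), y)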
Example #8
    def dask_isolationForest(self, settings,
                             mname,
                             data
                             ):
        '''
        :param settings: -> settings dictionary
        :param mname: -> name of serialized clusterer
        :param scaler: -> scaler to use on data
        :return: -> isolation forest instance
        :example settings: -> {n_estimators:100, max_samples:100, contamination:0.1, bootstrap:False,
                        max_features:1.0, n_jobs:1, random_state:None, verbose:0}
        '''
        if not settings or settings is None:
            logger.warning('[{}] : [WARN] No IsolationForest parameters defined using defaults'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
            # print(settings)
            settings = {}
        else:
            for k, v in settings.items():
                logger.info('[{}] : [INFO] IsolationForest parameter {} set to {}'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
        try:

            clf = IsolationForest(**settings)
            # print(clf)
        except Exception as inst:
            logger.error('[{}] : [ERROR] Failed to instantiate IsolationForest with {} and {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
            sys.exit(1)

        try:
            with joblib.parallel_backend('dask'):
                logger.info('[{}] : [INFO] Using Dask backend for IsolationForest'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                clf.fit(data)
        except Exception as inst:
            logger.error('[{}] : [ERROR] Failed to fit IsolationForest with {} and {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
            sys.exit(1)

        predict = clf.predict(data)
        anoOnly = np.argwhere(predict == -1)
        logger.info('[{}] : [INFO] Found {} anomalies in training dataset of shape {}.'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), len(anoOnly), data.shape))
        logger.debug('[{}] : [DEBUG] Predicted Anomaly Array {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), predict))
        self.__serializemodel(clf, 'isoforest', mname)
        self.__appendPredictions(method='isoforest', mname=mname, data=data, pred=predict)
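As with DBSCAN, the core of the method is unpacking the settings dictionary into the estimator and counting the -1 predictions; a plain scikit-learn sketch without the Dask backend, using the docstring's example settings:

import numpy as np
from sklearn.ensemble import IsolationForest

X = np.random.rand(300, 6)    # stand-in training data

settings = {'n_estimators': 100, 'max_samples': 100, 'contamination': 0.1,
            'bootstrap': False, 'max_features': 1.0, 'n_jobs': 1,
            'random_state': None, 'verbose': 0}
clf = IsolationForest(**settings).fit(X)

predict = clf.predict(X)                # +1 normal, -1 anomalous
ano_only = np.argwhere(predict == -1)   # row indices flagged as anomalies
print(len(ano_only), X.shape)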
Example #9
    def reinitialize(self):
        try:
            self.esInstance.indices.delete(
                index=['anomalies', '.watches', 'watch_history*'], ignore=404)
        except Exception as inst:
            logger.warning(
                '[%s] : [WARN] Watcher index reinitialization failed with %s and %s',
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
                inst.args)
            return 1

        logger.info(
            '[%s] : [INFO] Watcher index reinitialization successful!',
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
        return 0
Example #10
 def __loadClusterModel(self, method,
                        model):
     '''
     :param method: -> method name
     :param model: -> model name
     :return: -> instance of serialized object
     '''
     lmodel = glob.glob(os.path.join(self.modelDir, ("%s_%s.pkl" % (method, model))))
     if not lmodel:
         logger.warning('[%s] : [WARN] No %s model with the name %s found',
                      datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), method, model)
         return 0
     else:
         smodel = pickle.load(open(lmodel[0], "rb"))
          logger.info('[%s] : [INFO] Successfully loaded %s model with the name %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), method, model)
         return smodel
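The serialization counterpart is symmetric: models are pickled under a "<method>_<model>.pkl" naming scheme and located again with glob; a minimal round-trip sketch with a hypothetical model directory:

import glob
import os
import pickle

model_dir = '/tmp/ede_models'       # hypothetical stand-in for self.modelDir
os.makedirs(model_dir, exist_ok=True)

model = {'method': 'isoforest'}     # any picklable estimator would do here
with open(os.path.join(model_dir, 'isoforest_m1.pkl'), 'wb') as f:
    pickle.dump(model, f)

hits = glob.glob(os.path.join(model_dir, "%s_%s.pkl" % ('isoforest', 'm1')))
loaded = pickle.load(open(hits[0], "rb")) if hits else 0
print(loaded)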
Example #11
    def listMerge(self, lFiles):
        '''
        :param lFiles: -> list of files
        :return: merged dataframe
        :note: Only use if dataframes have divergent headers
        '''
        dfList = []
        if all(isinstance(x, str) for x in lFiles):
            for f in lFiles:
                if not f:
                    logger.warning('[%s] : [WARN] Found empty string instead of abs path ...',
                                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
                try:
                    df = pd.read_csv(f)
                except Exception as inst:
                    logger.error('[%s] : [ERROR] Cannot load file at %s, exiting',
                                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), f)
                    sys.exit(1)
                dfList.append(df)
        elif all(isinstance(x, pd.DataFrame) for x in lFiles):
            dfList = lFiles
        else:
            incomp = []
            for el in lFiles:
                if not isinstance(el, pd.DataFrame):
                    incomp.append(type(el))
            logger.error('[%s] : [ERROR] Incompatible type detected for merging, cannot merge type %s',
                                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(incomp))
        # for d in dfList:
        #     if d.empty:
        #         logger.warning('[%s] : [INFO] Detected empty dataframe in final merge, removing ...',
        #                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
        #
        #         dfList.pop(dfList.index(d))
        try:
            current = reduce(lambda x, y: pd.merge(x, y, on='key'), dfList)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Merge dataframes exception %s with args %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            logger.error('[%s] : [ERROR] Merge dataframes exception df list %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), dfList)
            sys.exit(1)

        # current.set_index('key', inplace=True)
        return current
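The merge itself is functools.reduce over pd.merge on the shared 'key' column; a minimal sketch with two hypothetical frames:

from functools import reduce
import pandas as pd

df1 = pd.DataFrame({'key': [1, 2], 'cpu': [0.1, 0.2]})   # hypothetical inputs
df2 = pd.DataFrame({'key': [1, 2], 'mem': [0.5, 0.6]})

merged = reduce(lambda x, y: pd.merge(x, y, on='key'), [df1, df2])
print(merged)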
Example #12
 def dask_detect(self,
                 method,
                 model,
                 data
                 ):
     smodel = self.__loadClusterModel(method, model)
     anomaliesList = []
     if not smodel:
         dpredict = 0
     else:
         if data.shape[0]:
             try:
                 logger.info('[{}] : [INFO] Loading predictive model {} '.format(
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(smodel).split('(')[0]))
                 for k, v in smodel.get_params().items():
                     logger.info('[{}] : [INFO] Predict model parameter {} set to {}'.format(
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
                 dpredict = smodel.predict(data)
             except Exception as inst:
                 logger.error('[{}] : [ERROR] Failed to load predictive model with {} and {}'.format(
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
                 dpredict = 0
         else:
             dpredict = 0
             logger.warning('[{}] : [WARN] DataFrame is empty with shape {} '.format(
             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(data.shape)))
     if type(dpredict) is not int:
         anomalyArray = np.argwhere(dpredict == -1)
         for an in anomalyArray:
             anomalies = {}
             anomalies['utc'] = int(data.iloc[an[0]].name)
             anomalies['hutc'] = ut2hum(int(data.iloc[an[0]].name))
             anomaliesList.append(anomalies)
     anomaliesDict = {}
     anomaliesDict['anomalies'] = anomaliesList
     logger.info('[{}] : [INFO] Detected {} anomalies with model {} using method {} '.format(
         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), len(anomaliesList), model,
         str(smodel).split('(')[0]))
     return anomaliesDict
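The anomaly extraction at the end only needs the predicted labels and the time-indexed frame; a sketch mapping -1 predictions back to their index timestamps, with datetime.utcfromtimestamp standing in for the project's ut2hum helper:

import numpy as np
import pandas as pd
from datetime import datetime

df = pd.DataFrame({'cpu': [0.1, 0.9, 0.2]},
                  index=pd.Index([1477914720, 1477914730, 1477914740], name='time'))
dpredict = np.array([1, -1, 1])     # hypothetical model output

anomalies_list = []
for an in np.argwhere(dpredict == -1):
    utc = int(df.iloc[an[0]].name)
    anomalies_list.append({'utc': utc,
                           'hutc': datetime.utcfromtimestamp(utc).strftime('%Y-%m-%d %H:%M:%S')})
print({'anomalies': anomalies_list})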
Example #13
    qlte = 1477914720000

    qsize = 0
    qinterval = "10s"
    dmonEndpoint = '85.120.206.27'

    dmonConnector = Connector(dmonEndpoint)
    qConstructor = QueryConstructor()
    dformat = DataFormatter(dataDir)

    nodeList = dmonConnector.getNodeList()
    interval = dmonConnector.getInterval()

    if int(qinterval[:-1]) < interval['System']:
        logger.warning(
            '[%s] : [WARN] System Interval smaller than set interval!',
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))

    # per slave unique process name list
    nodeProcessReduce = {}
    nodeProcessMap = {}

    # Get host based metrics
    for node in nodeList:
        # Query and file string
        load, load_file = qConstructor.loadString(node)
        memory, memory_file = qConstructor.memoryString(node)
        interface, interface_file = qConstructor.interfaceString(node)
        packet, packet_file = qConstructor.packetString(node)
        nodeManager, nodeManager_file = qConstructor.nodeManagerString(node)
        jvmNodeManager, jvmNodeManager_file = qConstructor.jvmnodeManagerString(
Example #14
    def computeOnColumns(self, df,
                         operations,
                         remove_filtered=True):
        if operations:
            if 'STD' in list(operations.keys()):
                std = operations['STD']
            else:
                std = None

            if 'Mean' in list(operations.keys()):
                mean = operations['Mean']
            else:
                mean = None

            if 'Median' in list(operations.keys()):
                median = operations['Median']
            else:
                median = None
            all_processed_columns = []
            if std is not None:
                for cl_std in std:
                    for ncol_n, fcol_n in cl_std.items():
                        df_std = self.filterColumns(df, lColumns=fcol_n)
                        logger.info('[{}] : [INFO] Computing standard deviation {} on columns {}'.format(
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), ncol_n, fcol_n))
                        std_df = df_std.std(axis=1, skipna=True)
                        df[ncol_n] = std_df
                        for c in fcol_n:
                            all_processed_columns.append(c)
            if mean is not None:
                for cl_mean in mean:
                    for ncol_n, fcol_n in cl_mean.items():
                        df_mean = self.filterColumns(df, lColumns=fcol_n)
                        logger.info('[{}] : [INFO] Computing mean {} on columns {}'.format(
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), ncol_n, fcol_n))
                        mean_df = df_mean.mean(axis=1, skipna=True)
                        df[ncol_n] = mean_df
                        for c in fcol_n:
                            all_processed_columns.append(c)
            if median is not None:
                for cl_median in median:
                    for ncol_n, fcol_n in cl_median.items():
                        df_median = self.filterColumns(df, lColumns=fcol_n)
                        logger.info('[{}] : [INFO] Computing median {} on columns {}'.format(
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), ncol_n, fcol_n))
                        median_df = df_median.median(axis=1, skipna=True)
                        df[ncol_n] = median_df
                        for c in fcol_n:
                            all_processed_columns.append(c)
            if "Method" in list(operations.keys()):
                df = self.__operationMethod(operations['Method'], data=df)
            if remove_filtered:
                unique_all_processed_columns = list(set(all_processed_columns))
                logger.warning('[{}] : [WARN] Dropping columns used for computation: {}'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), unique_all_processed_columns))
                self.dropColumns(df, unique_all_processed_columns, cp=False)
        else:
            logger.info('[{}] : [INFO] No data operations/augmentations defined'.format(
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        logger.info('[{}] : [INFO] Augmented data shape {}'.format(
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), df.shape))
        return df
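The operations argument is a nested mapping from aggregate name ('STD', 'Mean', 'Median') to a list of {new_column: [source_columns]} entries; a condensed pandas sketch of the STD case with hypothetical column names:

import pandas as pd

df = pd.DataFrame({'cpu_n1': [1.0, 2.0], 'cpu_n2': [3.0, 4.0]})   # hypothetical metrics
operations = {'STD': [{'cpu_std': ['cpu_n1', 'cpu_n2']}]}         # assumed config shape

for entry in operations['STD']:
    for new_col, src_cols in entry.items():
        df[new_col] = df[src_cols].std(axis=1, skipna=True)

df = df.drop(columns=['cpu_n1', 'cpu_n2'])   # what remove_filtered=True does to the source columns
print(df)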
 def __init__(self,
              prEndpoint=None,
              esEndpoint=None,
              dmonPort=5001,
              MInstancePort=9200,
              index="logstash-*",
              prKafkaEndpoint=None,
              prKafkaPort=9092,
              prKafkaTopic='edetopic'):
     self.dataDir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                 'data')
     if esEndpoint is None:
         self.esInstance = None
     else:
         self.esInstance = Elasticsearch(esEndpoint)
         self.esEndpoint = esEndpoint
         self.dmonPort = dmonPort
         self.esInstanceEndpoint = MInstancePort
         self.myIndex = index
         logger.info(
             '[{}] : [INFO] EDE ES backend Defined at: {} with port {}'.
             format(
                 datetime.fromtimestamp(
                     time.time()).strftime('%Y-%m-%d %H:%M:%S'), esEndpoint,
                 MInstancePort))
     if prEndpoint is None:
         pass
     else:
         self.prEndpoint = prEndpoint
         self.MInstancePort = MInstancePort
         logger.info(
             '[{}] : [INFO] EDE PR backend Defined at: {} with port {}'.
             format(
                 datetime.fromtimestamp(
                     time.time()).strftime('%Y-%m-%d %H:%M:%S'), prEndpoint,
                 MInstancePort))
         self.dataDir = os.path.join(
             os.path.dirname(os.path.abspath(__file__)), 'data')
     if prKafkaEndpoint is None:
         self.producer = None
         logger.warning('[{}] : [WARN] EDE Kafka reporter not set'.format(
             datetime.fromtimestamp(
                 time.time()).strftime('%Y-%m-%d %H:%M:%S')))
     else:
         self.prKafkaTopic = prKafkaTopic
         try:
             self.producer = KafkaProducer(
                 value_serializer=lambda v: json.dumps(v).encode('utf-8'),
                 bootstrap_servers=[
                     "{}:{}".format(prKafkaEndpoint, prKafkaPort)
                 ],
                 retries=5)
             logger.info(
                 '[{}] : [INFO] EDE Kafka reporter initialized to server {}:{}'
                 .format(
                     datetime.fromtimestamp(
                         time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                     prKafkaEndpoint, prKafkaPort))
         except Exception as inst:
             logger.error(
                 '[{}] : [ERROR] EDE Kafka reporter failed with {} and {}'.
                 format(
                     datetime.fromtimestamp(
                         time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                     type(inst), inst.args))
             self.producer = None
Example #16
def main(argv,
         cluster,
         client):
    dataDir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    modelsDir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models')
    queryDir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'queries')

    settings = Dict()
    settings.esendpoint = None
    settings.prendpoint = None
    settings.Dask.SchedulerEndpoint = None  # "local"
    settings.Dask.SchedulerPort = 8787
    settings.Dask.EnforceCheck = False
    settings.prkafkaendpoint = None
    settings.prkafkaport = 9092
    settings.prkafkatopic = "edetopic"
    settings.augmentation = None  # augmentation including scaler and user defined methods
    settings.detectionscaler = None
    settings.MPort = 9090
    settings.dmonPort = 5001
    settings.index = "logstash-*"
    settings["from"] = None
    settings.to = None
    settings.query = None
    settings.nodes = None
    settings.qsize = None
    settings.qinterval = None
    settings.fillna = None
    settings.dropna = None
    settings.local = None
    settings.train = None
    settings.hpomethod = None
    settings.tpot = None
    settings.ParamDistribution = None
    settings.detecttype = None # TODO
    settings.traintype = None
    settings.validationtype = None # Todo
    settings.target = None
    settings.load = None
    settings.file = None
    settings.method = None
    settings.detectMethod = None
    settings.trainMethod = None
    settings.cv = None
    settings.trainscore = None
    settings.scorer = None
    settings.returnestimators = None
    settings.analysis = None
    settings.validate = None
    settings.export = None
    settings.trainexport = None
    settings.detect = None  # Bool default None
    settings.cfilter = None
    settings.rfilter = None
    settings.dfilter = None
    settings.sload = None
    settings.smemory = None
    settings.snetwork = None
    settings.heap = None
    settings.checkpoint = None
    settings.delay = None
    settings.interval = None
    settings.resetindex = None
    settings.training = None
    settings.validation = None
    settings.validratio = 0.2
    settings.compare = False
    settings.anomalyOnly = False
    settings.categorical = None
    settings.point = False

    # Only for testing
    settings['validate'] = False
    dask_backend = False

    try:
        opts, args = getopt.getopt(argv, "he:tf:m:vx:d:lq:", ["endpoint=", "file=", "method=", "export=", "detect=", "query="])  # todo: expand command line options
    except getopt.GetoptError:
        logger.warning('[%s] : [WARN] Invalid argument received exiting', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
        print("ede.py -f <filelocation>, -t -m <method> -v -x <modelname>")
        sys.exit(0)
    for opt, arg in opts:
        if opt == '-h':
            print("#" * 100)
            print("H2020 ASPIDE")
            print('Event Detection Engine')
            print("-" * 100)
            print('Utilisation:')
            print('-f -> configuration file location')
            print('-t -> activate training mode')
            print('-m -> methods')
            print('   -> allowed methods: skm, em, dbscan, sdbscan, isoforest')
            print('-x -> export model name')
            print('-v -> validation')
            print('-q -> query string for anomaly/event detection')
            print("#" * 100)
            sys.exit(0)
        elif opt in ("-e", "--endpoint"):
            settings['esendpoint'] = arg
        elif opt in ("-t"):
            settings["train"] = True
        elif opt in ("-f", "--file"):
            settings["file"] = arg
        elif opt in ("-m", "--method"):
            settings["method"] = arg
        elif opt in ("-v"):
            settings["validate"] = True
        elif opt in ("-x", "--export"):
            settings["export"] = arg
        elif opt in ("-d", "--detect"):
            settings["detect"] = arg
        elif opt in ("-l", "--list-models"):
            print ("Current saved models are:\n")
            print((getModelList()))
            sys.exit(0)
        elif opt in ("-q", "--query"):
            settings["query"] = arg

    # print("#" * 100)
    # print(queryDir)
    logger.info('[{}] : [INFO] Starting EDE framework ...'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
    logger.info('[{}] : [INFO] Trying to read configuration file ...'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))

    if settings["file"] is None:
        file_conf = 'ede_config.yaml'
        logger.info('[%s] : [INFO] Settings file set to %s',
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), file_conf)
    else:
        if os.path.isfile(settings["file"]):
            file_conf = settings["file"]
            logger.info('[%s] : [INFO] Settings file set to %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), file_conf)
        else:
            logger.error('[%s] : [ERROR] Settings file not found at locations %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["file"])
            sys.exit(1)

    readCnf = readConf(file_conf)
    logger.info('[{}] : [INFO] Reading configuration file ...'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))

    # TODO: create def dls(file_conf)
    # Connector
    try:
        logger.info('[{}] : [INFO] Index Name set to : {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            readCnf['Connector']['indexname']))
    except:
        logger.warning('[%s] : [WARN] Index not set in conf, setting to default value %s',
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['index'])

    if settings['esendpoint'] is None:
        try:
            logger.info('[{}] : [INFO] Monitoring ES Backend endpoint in config {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                readCnf['Connector']['ESEndpoint']))
            settings['esendpoint'] = readCnf['Connector']['ESEndpoint']
        except:
            if readCnf['Connector']['PREndpoint'] is None:  # todo; now only available in config file not in commandline
                logger.error('[%s] : [ERROR] ES and PR backend Endpoints not set in conf or commandline!',
                                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
                sys.exit(1)
            else:
                settings['prendpoint'] = readCnf['Connector']['PREndpoint']
                logger.info('[{}] : [INFO] Monitoring PR Endpoint set to {}'.format(datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                            settings["prendpoint"]))
    else:
        logger.info('[%s] : [INFO] ES Backend Endpoint set to %s',
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['esendpoint'])
    if settings["from"] is None:
        try:
            settings["from"] = readCnf['Connector']['From']
            logger.info('[%s] : [INFO] From timestamp set to %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                        settings["from"])
        except:
            if settings['prendpoint'] is not None:
                logger.info('[{}] : [INFO] PR Backend endpoint set to {}'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['prendpoint']))
            else:
                logger.error('[%s] : [ERROR] From timestamp not set in conf or commandline!',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
                sys.exit(1)
    else:
        logger.info('[%s] : [INFO] From timestamp set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['from'])

    if settings["to"] is None:
        try:
            settings["to"] = readCnf['Connector']['to']
            logger.info('[%s] : [INFO] To timestamp set to %s',
                                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                                settings["to"])
        except:
            if settings['prendpoint'] is not None:
                pass
            else:
                logger.error('[%s] : [ERROR] To timestamp not set in conf or commandline!',
                                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
                sys.exit(1)
    else:
        logger.info('[%s] : [INFO] To timestamp set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['to'])

    if settings['query'] is None:
        try:
            settings['query'] = readCnf['Connector']['Query']
            logger.info('[%s] : [INFO] Query set to %s',
                                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                                settings['query'])
        except:
            if settings['prendpoint'] is not None:
                pass
            logger.error('[%s] : [ERROR] Query not set in conf or commandline!',
                                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            sys.exit(1)
    else:
        logger.info('[%s] : [INFO] Query set to %s',
                           datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['query'])

    if settings.prkafkaendpoint is None:
        try:
            settings.prkafkaendpoint = readCnf['Connector']['KafkaEndpoint']
            if settings.prkafkaendpoint == 'None':
                settings.prkafkaendpoint = None
            else:
                settings.prkafkatopic = readCnf['Connector']['KafkaTopic']
                settings.prkafkaport = readCnf['Connector']['KafkaPort']
            logger.info('[{}] : [INFO] Kafka Endpoint set to  {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings.prkafkaendpoint))
        except:
            logger.warning('[{}] : [WARN] Kafka Endpoint not set.'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))

    if settings["nodes"] is None:
        try:
            if not readCnf['Connector']['nodes']:
                readCnf['Connector']['nodes'] = 0
            settings["nodes"] = readCnf['Connector']['nodes']
            logger.info('[%s] : [INFO] Desired nodes set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                    settings['nodes'])
        except:
            logger.warning('[%s] : [WARN] No nodes selected from config file or commandline, querying all',
                           datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            settings["nodes"] = 0
    else:
        logger.info('[%s] : [INFO] Desired nodes set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["nodes"])

    if settings["qsize"] is None:
        try:
            settings["qsize"] = readCnf['Connector']['QSize']
            logger.info('[%s] : [INFO] Query size set to %s',
                                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                                settings['qsize'])
        except:
            logger.warning('[%s] : [WARN] Query size not set in conf or commandline, setting to default',
                                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            settings["qsize"] = 'default'
    else:
        logger.info('[%s] : [INFO] Query size set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["qsize"])

    if settings["qinterval"] is None:
        try:
            settings["qinterval"] = readCnf['Connector']['MetricsInterval']
            logger.info('[%s] : [INFO] Metric Interval set to %s',
                                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                                settings['qinterval'])
        except:
            logger.warning('[%s] : [WARN] Metric Interval not set in conf or commandline, setting to default',
                                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            settings["qinterval"] = "default"
    else:
        logger.info('[%s] : [INFO] Metric interval set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["qinterval"])
    if readCnf['Connector']['Dask']:
        try:
            settings['Dask']['SchedulerEndpoint'] = readCnf['Connector']['Dask']['SchedulerEndpoint']
            settings['Dask']['SchedulerPort'] = readCnf['Connector']['Dask']['SchedulerPort']
            settings['Dask']['EnforceCheck'] = readCnf['Connector']['Dask']['EnforceCheck']
            logger.info('[{}] : [INFO] Dask scheduler  set to: endpoint {}, port {}, check {}'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['Dask']['SchedulerEndpoint'],
                settings['Dask']['SchedulerPort'], settings['Dask']['EnforceCheck']))
            dask_backend = True
        except:
            logger.warning('[{}] : [WARN] Dask scheduler  set to default values'.format(datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
            dask_backend = False
    if settings['local'] is None:
        try:
            settings['local'] = readCnf['Connector']['Local']
            logger.info('[{}] : [INFO] Local datasource set to {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['local']))
        except:
            logger.info('[{}] : [INFO] Local datasource set to default'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
            settings['local'] = None
    else:
        logger.info('[{}] : [INFO] Local datasource set to {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['local']))
    # Mode
    if settings["train"] is None:
        try:
            settings["train"] = readCnf['Mode']['Training']
            logger.info('[%s] : [INFO] Train is set to %s from conf',
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['train'])
        except:
            logger.error('[%s] : [ERROR] Train is not set in conf or commandline!',
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            sys.exit(1)
    else:
        logger.info('[%s] : [INFO] Train is set to %s from commandline',
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['train'])

    # Analysis
    if settings.analysis is None:
        try:
            logger.info('[{}] : [INFO] Loading user defined analysis'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
            settings.analysis = readCnf['Analysis']
        except:
            logger.info('[{}] : [INFO] No user defined analysis detected'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))

    # Validate
    if settings["validate"] is None:
        try:
            settings["validate"] = readCnf['Mode']['Validate']
            logger.info('[%s] : [INFO] Validate is set to %s from conf',
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['validate'])
        except:
            logger.error('[%s] : [ERROR] Validate is not set in conf or commandline!',
                                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            sys.exit(1)
    else:
        logger.info('[%s] : [INFO] Validate is set to %s from commandline',
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['validate'])

    # Detect
    if settings["detect"] is None:
        try:
            settings["detect"] = readCnf['Mode']['Detect']
            logger.info('[%s] : [INFO] Detect is set to %s from conf',
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['detect'])
        except:
            logger.error('[%s] : [ERROR] Detect is not set in conf or commandline!',
                                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            sys.exit(1)
    else:
        logger.info('[%s] : [INFO] Detect is set to %s from commandline',
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['detect'])

    if settings["detectMethod"] is None:
        try:
            settings["detectMethod"] = readCnf['Detect']['Method']
            logger.info('[%s] : [INFO] Detect Method is set to %s from conf',
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["detectMethod"])
        except:
            logger.error('[%s] : [ERROR] Detect Method is not set in conf or commandline!',
                                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            sys.exit(1)
    else:
        logger.info('[%s] : [INFO] Detect Method is set to %s from commandline',
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["detectMethod"])

    if settings["detecttype"] is None:
        try:
            settings["detecttype"] = readCnf['Detect']['Type']
            logger.info('[{}] : [INFO] Detect Type is set to {} from conf'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["detecttype"]))
        except:
            logger.error('[%s] : [ERROR] Detect Type is not set in conf or command line!',
                                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            sys.exit(1)
    else:
        logger.info('[%s] : [INFO] Detect Type is set to %s from command line',
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["detecttype"])

    if settings["trainMethod"] is None:
        try:
            settings["trainMethod"] = readCnf['Training']['Method']
            logger.info('[%s] : [INFO] Train Method is set to %s from conf',
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["trainMethod"])
        except:
            try:
                settings['Training']['TPOTParam']
            except:
                logger.error('[%s] : [ERROR] Train Method is not set in conf or commandline!',
                                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
                sys.exit(1)
    else:
        logger.info('[%s] : [INFO] Train Method is set to %s from commandline',
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["trainMethod"])

    if settings["traintype"] is None:
        try:
            settings["traintype"] = readCnf['Training']['Type']
            logger.info('[%s] : [INFO] Train Type is set to %s from conf',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["traintype"])
        except:
            logger.error('[%s] : [ERROR] Train Type is not set in conf or command line!',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            sys.exit(1)
    else:
        logger.info('[%s] : [INFO] Train Type is set to %s from command line',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["traintype"])
    if settings.target is None:
        try:
            settings.target = readCnf['Training']['Target']
            logger.info('[{}] : [INFO] Classification Target set to {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings.target))
        except:
            if settings['traintype'] == 'classification':
                logger.warning('[{}] : [WARN] Classification Target not set in config'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
            else:
                pass

    if settings.hpomethod is None:
        try:
            settings.hpomethod = readCnf['Training']['HPOMethod']
            logger.info('[{}] : [INFO] HPO method set to {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings.hpomethod))
            try:
                settings.hpoparam = readCnf['Training']['HPOParam']
                for k, v in readCnf['Training']['HPOParam'].items():
                    logger.info('[{}] : [INFO] HPO Method {}  Param {} set to {}'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings.hpomethod, k, v))
            except:
                logger.warning('[{}] : [WARN] HPO Method Params set to default!'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                settings.hpoparam = {}
        except:
            if readCnf['Training']['Type'] == 'hpo':
                logger.error('[{}] : [ERROR] HPO invoked without method! Exiting'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                sys.exit(1)
            else:
                pass

    if settings.ParamDistribution is None:
        try:
            settings.ParamDistribution = readCnf['Training']['ParamDistribution']
            logger.info('[{}] : [INFO] HPO Parameter Distribution found.'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        except:
            if readCnf['Training']['Type'] == 'hpo':
                logger.error('[{}] : [ERROR] HPO invoked without Parameter distribution! Exiting'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                sys.exit(1)
            else:
                pass
    if settings.tpot is None:
        try:
            settings.tpot = readCnf['Training']['TPOTParam']
            logger.info('[{}] : [INFO] TPOT Parameters found.'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        except:
            try:
                if readCnf['Training']['Type'] == 'tpot':
                    settings.tpot = {}
                    logger.warning('[{}] : [WARN] TPOT Parameters not found. Using defaults'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                else:
                    pass
            except:
                pass

    if settings["export"] is None:
        try:
            settings["export"] = readCnf['Training']['Export']
            logger.info('[%s] : [INFO] Export is set to %s from conf',
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["export"])
        except:
            logger.error('[%s] : [ERROR] Export is not set in conf or commandline!',
                                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            sys.exit(1)
    else:
        logger.info('[%s] : [INFO] Model is set to %s from commandline',
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["export"])

    if settings.cv is None:
        try:
            settings.cv = readCnf['Training']['CV']
            try:
                logger.info('[{}] : [INFO] Cross Validation set to {}'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['cv']['Type']))
            except:
                logger.info('[{}] : [INFO] Cross Validation set to {}'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['cv']))
                try:
                    settings['cv'] = int(settings['cv'])
                except:
                    logger.error('[{}] : [ERROR] Issues with CV definition in Training!'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                    sys.exit(1)
        except:
            logger.info('[{}] : [INFO] Cross Validation not defined'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))

    if settings.trainscore is None:
        try:
            settings.trainscore = readCnf['Training']['TrainScore']
            logger.info('[{}] : [INFO] Cross Validation set to include training scores'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        except:
            settings.trainscore = False

    if settings.scorer is None:
        try:
            settings.scorer = readCnf['Training']['Scorers']
            logger.info('[{}] : [INFO] Training scorers defined'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        except:
            logger.info('[{}] : [INFO] No Training scorers defined'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))

    if settings.returnestimators is None:
        try:
            settings.returnestimators = readCnf['Training']['ReturnEstimators']
            logger.info('[{}] : [INFO] CV Estimators will be saved'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        except:
            settings.returnestimators = False

    if settings["load"] is None:
        try:
            settings["load"] = readCnf['Detect']['Load']
            logger.info('[%s] : [INFO] Load is set to %s from conf',
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["load"])
        except:
            logger.error('[%s] : [ERROR] Load is not set in conf or commandline!',
                                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            sys.exit(1)
    else:
        logger.info('[%s] : [INFO] Load is set to %s from commandline',
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["load"])

    if settings.detectionscaler is None:
        try:
            settings.detectionscaler = readCnf['Detect']['Scaler']
            logger.info('[{}] : [INFO] Detection Scaler set to {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings.detectionscaler))
        except:
            settings.detectionscaler = None
            logger.warning('[{}] : [WARN] Detection scaler not specified'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))

    try:
        settings['MethodSettings'] = {}   #todo read settings from commandline ?
        for name, value in readCnf['Training']['MethodSettings'].items():
            # print("%s -> %s" % (name, value))
            settings['MethodSettings'][name] = value
    except:
        settings['MethodSettings'] = None
        logger.warning('[%s] : [WARN] No Method settings detected, using defaults for %s!',
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["method"])

    # Augmentation
    try:
        settings['augmentation'] = readCnf['Augmentation']
        logger.info('[%s] : [INFO] Augmentations loaded',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    except:
        settings['augmentation'] = None
        logger.info('[%s] : [INFO] Augmentations not defined',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))

    # Point anomaly settings
    try:
        settings["smemory"] = readCnf['Point']['memory']
        logger.info('[%s] : [INFO] System memory is set to %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["smemory"])
    except:
        settings["smemory"] = "default"
        logger.warning('[%s] : [WARN] System memory is not set, using default!',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))

    try:
        settings["sload"] = readCnf['Point']['load']
        logger.info('[%s] : [INFO] System load is set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["sload"])
    except:
        settings["sload"] = "default"
        logger.warning('[%s] : [WARN] System load is not set, using default!',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))

    try:
        settings["snetwork"] = readCnf['Point']['network']
        logger.info('[%s] : [INFO] System network is set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["snetwork"])
    except:
        settings["snetwork"] = "default"
        logger.warning('[%s] : [WARN] System network is not set, using default!',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))

    try:
        settings['heap'] = readCnf['Misc']['heap']
        logger.info('[%s] : [INFO] Heap size set to %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['heap'])
    except:
        settings['heap'] = '512m'
        logger.info('[%s] : [INFO] Heap size set to default %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['heap'])

    # Filter
    try:
        if readCnf['Filter']['Columns']:
            logger.info('[{}] : [INFO] Filter columns set in config as {}.'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), readCnf['Filter']['Columns']))
            settings["cfilter"] = readCnf['Filter']['Columns']
        else:
            logger.info('[{}] : [INFO] Filter columns set in config as {}.'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["cfilter"]))
    except:
        pass
    finally:
        logger.info('[%s] : [INFO] Filter column set to %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['cfilter'])

    try:
        # logger.info('[%s] : [INFO] Filter rows set to %s',
        #             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), readCnf['Filter']['Rows'])
        settings["rfilter"] = readCnf['Filter']['Rows']
    except:
        pass
        # logger.info('[%s] : [INFO] Filter rows  %s',
        #             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["rfilter"])
    finally:
        logger.info('[%s] : [INFO] Filter rows set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['rfilter'])

    try:
        if readCnf['Filter']['DColumns']:
            # print("Filter drop columns -> %s" % readCnf['Filter']['DColumns'])
            settings["dfilter"] = readCnf['Filter']['DColumns']
        else:
            # print("Filter drop columns -> %s" % settings["dfilter"])
            pass
    except:
        # print("Filter drop columns -> %s" % settings["dfilter"])
        pass
    finally:
        logger.info('[%s] : [INFO] Filter drop column set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['dfilter'])

    try:
        if readCnf['Filter']['Fillna']:
            settings['fillna'] = readCnf['Filter']['Fillna']
        else:
            settings['fillna'] = False
        logger.info('[{}] : [INFO] Fill None values set to {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), readCnf['Filter']['Fillna']))
    except:
        logger.info('[{}] : [INFO] Fill None not set, skipping ...'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        settings['fillna'] = False

    try:
        if readCnf['Filter']['Dropna']:
            settings['dropna'] = readCnf['Filter']['Dropna']
        else:
            settings['dropna'] = False
        logger.info('[{}] : [INFO] Drop None values set to {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), readCnf['Filter']['Dropna']))
    except:
        logger.info('[{}] : [INFO] Drop None not set, skipping ...'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        settings['dropna'] = False

    if settings["checkpoint"] is None:
        try:
            settings["checkpoint"] = readCnf['Misc']['checkpoint']
            logger.info('[%s] : [INFO] Checkpointing is set to %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['checkpoint'])
        except:
            settings["checkpoint"] = "True"
            logger.info('[%s] : [INFO] Checkpointing is set to True',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    else:
        logger.info('[%s] : [INFO] Checkpointing is set to %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['checkpoint'])

    if settings["delay"] is None:
        try:
            settings["delay"] = readCnf['Misc']['delay']
            # logger.info('[%s] : [INFO] Delay is set to %s',
            #         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['delay'])
        except:
            settings["delay"] = "2m"
        logger.info('[%s] : [INFO] Delay is set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['delay'])
    else:
        logger.info('[%s] : [INFO] Delay is set to %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['delay'])

    if settings["interval"] is None:
        try:
            settings["interval"] = readCnf['Misc']['interval']
            logger.info('[%s] : [INFO] Interval is set to %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['interval'])
        except:
            settings["interval"] = "15m"
            logger.info('[%s] : [INFO] Interval is set to %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['interval'])
    else:
        logger.info('[%s] : [INFO] Interval is set to %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['interval'])

    if settings["resetindex"] is None:
        try:
            settings["resetindex"] = readCnf['Misc']['resetindex']
        except:
            settings["resetindex"] = False
    else:
        logger.info('[%s] : [INFO] Reset index set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['resetindex'])

    try:
        settings['dmonPort'] = readCnf['Connector']['dmonport']
        logger.info('[{}] : [INFO] DMon Port is set to {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            settings['dmonPort']))
    except:
        logger.info('[%s] : [INFO] DMon Port is set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(settings['dmonPort']))

    try:
        settings['training'] = readCnf['Detect']['training']
        logger.info('[{}] : [INFO] Classification Training set is {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            readCnf['Detect']['training']))
    except:
        logger.info('[%s] : [INFO] Classification Training set is %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(settings['training']))

    # try:
    #     print("Classification Validation set is %s" % readCnf['Detect']['validation'])
    #     settings['validation'] = readCnf['Detect']['validation']
    # except:
    #     print("Classification Validation set is default")
    # logger.info('[%s] : [INFO] Classification Validation set is %s',
    #             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(settings['validation']))


    try:
        # print("Classification validation ratio is set to %d" % int(readCnf['Training']['ValidRatio']))
        logger.info('[{}] : [INFO] Classification validation ratio is set to {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), readCnf['Training']['ValidRatio']))
        if float(readCnf['Training']['ValidRatio']) > 1.0:
            # print("Validation ratio is out of range, must be between 1.0 and 0.1")
            settings['validratio'] = 0.0
            logger.warning('[{}] : [WARN] Validation ratio is out of range, must be between 0.1 and 1.0, overwriting'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        else:
            settings['validratio'] = float(readCnf['Training']['ValidRatio'])
    except:
        logger.warning('[{}] : [WARN] Validation ratio is set to default'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
    logger.info('[%s] : [INFO] Classification Validation ratio is %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(settings['validratio']))

    # try:
    #     print("Classification comparison is set to %s" % readCnf['Detect']['compare'])
    #     settings['compare'] = readCnf['Detect']['compare']
    # except:
    #     print("Classification comparison is default")
    # logger.info('[%s] : [INFO] Classification comparison is %s',
    #             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['compare'])

    try:
        # print("Classification data generation using only anomalies set to %s" % readCnf['Detect']['anomalyOnly'])
        settings['anomalyOnly'] = readCnf['Detect']['anomalyOnly']
    except:
        # print("Classification data generation using only anomalies set to False")
        pass
    logger.info('[%s] : [INFO] Classification data generation using only anomalies set to %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(settings['anomalyOnly']))

    if settings["categorical"] is None:
        try:
            if not readCnf['Augmentation']['Categorical']:
                readCnf['Augmentation']['Categorical'] = None
                logger.info('[{}] : [INFO] Categorical columns defined as: {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                    readCnf['Augmentation']['Categorical']))
            if readCnf['Augmentation']['Categorical'] == '0':
                settings["categorical"] = None
            else:
                settings["categorical"] = readCnf['Augmentation']['Categorical']
            logger.info('[%s] : [INFO] Categorical Features ->  %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                    settings['categorical'])
        except:
            logger.warning('[%s] : [WARN] No Categorical Features selected from config file or commandline! Skipping encoding',
                           datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            settings["categorical"] = None
    else:
        logger.info('[%s] : [INFO] Categorical Features ->  %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings["categorical"])

    if not settings["point"]:
        try:
            settings['point'] = readCnf['Misc']['point']
            logger.info('[%s] : [INFO] Point set to %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['point'])
        except:
            settings['point'] = 'False'
            logger.info('[%s] : [INFO] Point detection set to default %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), settings['point'])

    #print dmonC
    # sys.exit()
    # print("Conf file -> %s" %readCnf)
    # print("Settings  -> %s" %settings)

    engine = aspideedengine.EDEngine(settings,
                                     dataDir=dataDir,
                                     modelsDir=modelsDir,
                                     queryDir=queryDir)
    #engine.printTest()
    engine.initConnector()
    if dask_backend:
        engine.runDask(engine)
    else:
        try:
            engine.runProcess(engine)
        except Exception as inst:
            logger.error('[{}] : [ERROR] Failed Process backend initialization with {} and {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
            logger.warning('[{}] : [WARN] Initializing default threaded engine, limited performance to be expected!'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
            engine.run(engine)

    logger.info('[{}] : [INFO] Exiting EDE framework'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
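The Detect/Training/Filter/Point/Misc lookups above assume a nested dictionary produced by yaml.unsafe_load on the EDE config file. A minimal sketch of that structure, using only section and key names that appear in the code (the values are illustrative placeholders, not real project defaults):

# Hypothetical parsed configuration (readCnf) as assumed by the branches above;
# all values are illustrative only and not taken from a real ede_config.yaml.
readCnf = {
    'Connector': {'dmonport': 5001},
    'Detect': {'Load': True, 'Scaler': 'StandardScaler', 'training': 'train_set',
               'anomalyOnly': False},
    'Training': {'MethodSettings': {'n_estimators': 100}, 'ValidRatio': 0.2},
    'Augmentation': {'Categorical': None},
    'Point': {'memory': 'default', 'load': 'default', 'network': 'default'},
    'Filter': {'Columns': [], 'Rows': {}, 'DColumns': [], 'Fillna': True, 'Dropna': False},
    'Misc': {'heap': '512m', 'checkpoint': True, 'delay': '2m', 'interval': '15m',
             'resetindex': False, 'point': False},
}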
Example #17
0
    def detect(self, method,
               model,
               data):
        '''
        :param method: -> method name
        :param model: -> trained clusterer
        :param data: -> dataframe with data
        :return: -> dictionary that contains the list of anomalous timestamps
        '''
        smodel = self.__loadClusterModel(method, model)
        anomalieslist = []
        if not smodel:
            dpredict = 0
        else:
            if data.shape[0]:
                if isinstance(smodel, IsolationForest):
                    logger.info('[{}] : [INFO] Loading predictive model IsolationForest'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                    for k, v in smodel.get_params().items():
                        logger.info('[{}] : [INFO] Predict model parameter {} set to {}'.format(
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
                    # print("Contamination -> %s" % smodel.contamination)
                    # print("Max_Features -> %s" % smodel.max_features)
                    # print("Max_Samples -> %s" % smodel.max_samples_)
                    # print("Threashold -> %s " % smodel.threshold_)
                    try:
                        dpredict = smodel.predict(data)
                        logger.debug('[{}] : [DEBUG] IsolationForest prediction array: {}'.format(
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(dpredict)))
                    except Exception as inst:
                        logger.error('[%s] : [ERROR] Error while fitting isolationforest model to event with %s and %s',
                             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
                        dpredict = 0

                elif isinstance(smodel, DBSCAN):
                    logger.info('[{}] : [INFO] Loading predictive model DBSCAN'.format(
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                    for k, v in smodel.get_params().items():
                        logger.info('[{}] : [INFO] Predict model parameter {} set to {}'.format(
                            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
                    # print("Leaf_zise -> %s" % smodel.leaf_size)
                    # print("Algorithm -> %s" % smodel.algorithm)
                    # print("EPS -> %s" % smodel.eps)
                    # print("Min_Samples -> %s" % smodel.min_samples)
                    # print("N_jobs -> %s" % smodel.n_jobs)
                    try:
                        dpredict = smodel.fit_predict(data)
                    except Exception as inst:
                        logger.error('[%s] : [ERROR] Error while fitting DBSCAN model to event with %s and %s',
                                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
                                     inst.args)
                        dpredict = 0
            else:
                dpredict = 0
                logger.warning('[%s] : [WARN] Dataframe empty with shape (%s,%s)',
                             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(data.shape[0]),
                             str(data.shape[1]))
                print("Empty dataframe received with shape (%s,%s)" % (str(data.shape[0]),
                             str(data.shape[1])))
            print("dpredict type is %s" % (type(dpredict)))
        if type(dpredict) is not int:
            anomalyarray = np.argwhere(dpredict == -1)
            for an in anomalyarray:
                anomalies = {}
                anomalies['utc'] = int(data.iloc[an[0]].name)
                anomalies['hutc'] = ut2hum(int(data.iloc[an[0]].name))
                anomalieslist.append(anomalies)
        anomaliesDict = {}
        anomaliesDict['anomalies'] = anomalieslist
        logger.info('[%s] : [INFO] Detected anomalies with model %s using method %s are -> %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), model, method, str(anomaliesDict))
        return anomaliesDict
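For reference, the -1 labels returned by the scikit-learn detectors are turned into timestamped anomaly records via np.argwhere on the prediction array. A minimal, self-contained sketch of that mapping (the DataFrame and prediction values are made up; ut2hum is the project's epoch-to-human-time helper and is omitted here):

import numpy as np
import pandas as pd

# Illustrative epoch-indexed metrics and a fake prediction array (-1 = anomaly).
data = pd.DataFrame({'cpu': [0.1, 0.9, 0.2]}, index=[1600000000, 1600000060, 1600000120])
dpredict = np.array([1, -1, 1])
anomalieslist = []
for an in np.argwhere(dpredict == -1):
    ts = int(data.iloc[an[0]].name)           # the row's index label is the UTC timestamp
    anomalieslist.append({'utc': ts})          # detect() additionally stores 'hutc': ut2hum(ts)
print({'anomalies': anomalieslist})            # -> {'anomalies': [{'utc': 1600000060}]}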
Example #18
0
    def dask_clusterMethod(self, cluster_method, mname, data):
        try:
            logger.info('[{}] : [INFO] Loading Clustering method {}'.format(
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                type(cluster_method)))
            # delattr(cluster_method, 'behaviour')
            # del cluster_method.__dict__['behaviour']
            for k, v in cluster_method.get_params().items():
                logger.info(
                    '[{}] : [INFO] Method parameter {} set to {}'.format(
                        datetime.fromtimestamp(
                            time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
            try:
                with joblib.parallel_backend('dask'):
                    logger.info(
                        '[{}] : [INFO] Using Dask backend for user defined method'
                        .format(
                            datetime.fromtimestamp(
                                time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                    clf = cluster_method.fit(data)
            except Exception as inst:
                logger.error(
                    '[{}] : [ERROR] Failed to fit user defined method with dask backend with {} and {}'
                    .format(
                        datetime.fromtimestamp(
                            time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                        type(inst), inst.args))
                logger.warning(
                    '[{}] : [WARN] using default process based backend for user defined method'
                    .format(
                        datetime.fromtimestamp(
                            time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                clf = cluster_method.fit(data)
        except Exception as inst:
            logger.error(
                '[{}] : [ERROR] Failed to fit {} with {} and {}'.format(
                    datetime.fromtimestamp(
                        time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                    type(cluster_method), type(inst), inst.args))
            sys.exit(1)
        predictions = clf.predict(data)
        if list(np.unique(predictions)) == [0, 1]:
            anomaly_marker = 1
            normal_marker = 0
        else:
            anomaly_marker = -1
            normal_marker = 1
        logger.info(
            '[{}] : [INFO] Number of Predicted Anomalies {} from a total of {} datapoints.'
            .format(
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                list(predictions).count(anomaly_marker),
                len(list(predictions))))
        logger.debug('[{}] : [DEBUG] Predicted Anomaly Array {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            predictions))
        fname = str(clf).split('(')[0]
        self.__serializemodel(clf, fname, mname)
        self.__plot_feature_sep(data,
                                predictions,
                                method=fname,
                                mname=mname,
                                anomaly_label=anomaly_marker,
                                normal_label=normal_marker)
        self.__decision_boundary(clf,
                                 data,
                                 method=fname,
                                 mname=mname,
                                 anomaly_label=anomaly_marker)

        return clf
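joblib's 'dask' backend only distributes work when a dask.distributed client is active, which is why the method above falls back to the default backend when the Dask fit fails. A minimal sketch of driving a scikit-learn estimator through that backend (the local Client and the IsolationForest choice are illustrative, not how the engine wires it up):

import joblib
import numpy as np
from dask.distributed import Client          # importing distributed registers the 'dask' joblib backend
from sklearn.ensemble import IsolationForest

client = Client(processes=False)             # throwaway local cluster, for the example only
X = np.random.rand(200, 4)
clf = IsolationForest(n_estimators=50)
with joblib.parallel_backend('dask'):        # scikit-learn's joblib parallelism now runs on Dask workers
    clf.fit(X)
print(clf.predict(X[:5]))                    # -1 marks anomalies, 1 marks inliers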
Example #19
0
    def dask_detect(
            self,
            method,
            model,
            data,
            anomaly_label=-1  # Todo make anomaly label user definable
    ):
        smodel = self.__loadClusterModel(method, model)
        anomaliesList = []
        anomaliesDict = {}
        shap_values_p = 0
        if not smodel:
            dpredict = 0
        else:
            if data.shape[0]:
                try:
                    logger.info(
                        '[{}] : [INFO] Loading predictive model {} '.format(
                            datetime.fromtimestamp(
                                time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                            str(smodel).split('(')[0]))
                    for k, v in smodel.get_params().items():
                        logger.info(
                            '[{}] : [INFO] Predict model parameter {} set to {}'
                            .format(
                                datetime.fromtimestamp(
                                    time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                                k, v))
                    dpredict = smodel.predict(data)
                except Exception as inst:
                    logger.error(
                        '[{}] : [ERROR] Failed to load predictive model with {} and {}'
                        .format(
                            datetime.fromtimestamp(
                                time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                            type(inst), inst.args))
                    dpredict = 0
            else:
                dpredict = 0
                logger.warning(
                    '[{}] : [WARN] DataFrame is empty with shape {} '.format(
                        datetime.fromtimestamp(
                            time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                        str(data.shape)))
        if list(np.unique(dpredict)) == [0, 1] or isinstance(
                smodel, pyod.models.iforest.IForest):
            anomaly_label = 1
        else:
            anomaly_label = -1

        if type(dpredict) is not int:
            anomalyArray = np.argwhere(dpredict == anomaly_label)
            if self.pred_analysis and anomalyArray.shape[0]:
                try:
                    plot = self.pred_analysis['Plot']
                    # print(self.pred_analysis['Plot'])
                except Exception:
                    plot = False
                feature_importance, shap_values = self.__shap_analysis(
                    model=smodel, data=data, plot=plot)
                anomaliesDict['complete_shap_analysis'] = feature_importance
                shap_values_p = 1
            count = 0
            for an in anomalyArray:
                anomalies = {}
                anomalies['utc'] = int(data.iloc[an[0]].name)
                anomalies['hutc'] = ut2hum(int(data.iloc[an[0]].name))
                if shap_values_p:
                    anomalies['analysis'] = self.__shap_force_layout(
                        shap_values=shap_values, instance=count)
                anomaliesList.append(anomalies)
                count += 1

        anomaliesDict['anomalies'] = anomaliesList
        logger.info(
            '[{}] : [INFO] Detected {} anomalies with model {} using method {} '
            .format(
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                len(anomaliesList), model,
                str(smodel).split('(')[0]))
        return anomaliesDict
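The label normalisation near the top of the prediction handling reflects the two conventions in play: scikit-learn outlier detectors flag anomalies with -1, while PyOD detectors (and any model emitting {0, 1} output) use 1. A small sketch of that check in isolation (the helper name and input arrays are illustrative):

import numpy as np

def anomaly_label_for(dpredict, is_pyod_iforest=False):
    # Mirrors the check in dask_detect: a {0, 1} prediction array or a PyOD
    # IForest means 1 flags anomalies; otherwise the scikit-learn -1 convention applies.
    if list(np.unique(dpredict)) == [0, 1] or is_pyod_iforest:
        return 1
    return -1

print(anomaly_label_for(np.array([1, -1, 1])))   # -> -1 (scikit-learn style)
print(anomaly_label_for(np.array([0, 1, 0])))    # -> 1  (PyOD style)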