Example #1
def sort_dictionary(dictionary,order_by='key',order='desc'):
    #TODO: check keyword parameters

    method_name = "sort_dictionary()"

    logging.info("Sorting dict ...")

    # Check the data type as dict
    if not isinstance(dictionary, dict):
        raise MSNMError(None,"Invalid dict as param", method_name)

    try:

        # Which order?
        if order == 'desc':
            reverse_order = True
        else:
            reverse_order = False

        if order_by == 'key':
            d = OrderedDict(sorted(dictionary.items(),key=lambda t: t[0],reverse=reverse_order))
        else:
            d = OrderedDict(sorted(dictionary.items(),key=lambda t: t[1],reverse=reverse_order))

    except Exception:
        raise MSNMError(None,sys.exc_info()[0],method_name)

    logging.info("Ending sorting dict ...")

    return d
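
A minimal standalone sketch of the sorting idiom used above (the sample dictionary is hypothetical):

from collections import OrderedDict

scores = {'a': 0.3, 'b': 0.9, 'c': 0.1}
# sort by value in descending order, as sort_dictionary() does when order_by != 'key'
by_value_desc = OrderedDict(sorted(scores.items(), key=lambda t: t[1], reverse=True))
print(by_value_desc)  # 'b' first, then 'a', then 'c'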
Example #2
def sort_vector(vector, order, axis, abs_value):

    method_name = "sort_vector()"

    # Check the data type as ndarray
    if not isinstance(vector, np.ndarray):
        raise MSNMError(None,"Data is not an ndarray",method_name)

    try:

        # Absolute value?
        if abs_value:
            aux = np.abs(vector)
        else:
            aux = vector

        # Sorting asc
        aux = np.sort(aux,axis=axis)

        # Do sorting desc?
        if order == 'desc':
            aux = aux[::-1]

    except Exception:
        raise MSNMError(None,sys.exc_info()[0],method_name)

    return aux
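
A standalone sketch of the same sort-then-reverse idiom on a NumPy vector (the sample data is hypothetical):

import numpy as np

v = np.array([-3.0, 1.5, -0.2])
aux = np.sort(np.abs(v), axis=0)  # ascending sort of the absolute values
desc = aux[::-1]                  # reverse to get descending order
print(desc)                       # largest absolute value first: 3.0, 1.5, 0.2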
Example #3
def averageDataImputation(**kwargs):
    """
    All missing data in X (NxM) will be replaced by their
    average value
    """

    method_name = "averageDataImputation()"

    # Check optional parameters
    # Check the observation for imputation
    if 'obs' in kwargs:
        obs = kwargs['obs']
    else:
        logging.error("There is no observation to recover")
        msnmerror = MSNMError(None,"There is no observation to recover",method_name)
        raise msnmerror
    # Check the calibration model
    if 'model' in kwargs:
        model = kwargs['model']
    else:
        logging.error("There is no calibration model")
        msnmerror = MSNMError(None,"There is no calibration model",method_name)
        raise msnmerror

    #Doing average imputation
    logging.debug("Doing average based data imputation ...")
    rec_obs = pd.DataFrame(obs).fillna(pd.DataFrame(model.get_av())).as_matrix()

    return rec_obs
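
A standalone sketch of the same average-based imputation with pandas. Here the averages are hard-coded, whereas the function above takes them from the calibration model via model.get_av(); to_numpy() is used instead of the older as_matrix() call, which was removed in pandas 1.0 (the sample data is hypothetical):

import numpy as np
import pandas as pd

obs = np.array([[1.0, np.nan, 3.0]])    # observation with a missing value
averages = np.array([[0.5, 2.0, 2.5]])  # per-variable averages
# fillna() with a DataFrame aligns on row/column labels, so each NaN is
# replaced by the average of its own variable
rec_obs = pd.DataFrame(obs).fillna(pd.DataFrame(averages)).to_numpy()
print(rec_obs)  # [[1. 2. 3.]]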
Example #4
    def run(self):

        method_name = "run()"

        logging.info("Running client thread. Thread: %s",
                     threading.current_thread().getName())

        try:

            # Send packet to the server
            client_sock = self._client_instance.send_msg_to_server(
                self._client_instance._packet)

            logging.debug("Sending packet to %s",
                          self._client_instance._server_address)

            # Get response from server
            response = self._client_instance.recv_msg_from_server(client_sock)

            logging.debug("Server %s sent the response %s",
                          self._client_instance._server_address,
                          response._body['resp'])
        except CommError as ce:
            logging.error(ce.get_msg())
            #TODO: add a mechanism to propagate exceptions raised in child threads to the main thread
            raise MSNMError(self, ce.get_msg(), method_name)
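
The TODO above asks for a way to surface exceptions raised in child threads. One common pattern (a sketch, not the project's implementation) is to store the exception on the thread object and re-raise it in the main thread after join():

import threading

class WorkerThread(threading.Thread):
    """Runs fn(*args) and keeps any exception so the caller can re-raise it."""
    def __init__(self, fn, *args):
        super(WorkerThread, self).__init__()
        self._fn = fn
        self._args = args
        self.exc = None

    def run(self):
        try:
            self._fn(*self._args)
        except Exception as e:
            self.exc = e  # remember the failure instead of losing it in the child thread

    def join_and_raise(self, timeout=None):
        self.join(timeout)
        if self.exc is not None:
            raise self.exc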
Example #5
def preprocess2Dapp(test, average, scale):
    """
    Apply autoscaled preprocessing to ``test`` data

    Parameters
    ----------
    test: numpy.ndarray
        [NxM] bilinear data set
    average: numpy.ndarray
        [1xM] sample average to subtract
    scale: numpy.ndarray
        [1xM] sample scale to divide the test

    Return
    ------
    testAutoScaled: numpy.ndarray
        [NxM] preprocessed data.

    Raises
    ------
    MSNMError
        General error raised if something goes wrong

    Example
    -------
    >>> from msnm.utils import datautils as tools
    >>> import numpy as np
    >>> import scipy.io as sio

    >>> # Original data set X
    >>> originalData = './datatest/data_adicov_mspc.mat'

    >>> # Returns a dictionary like {'variable_name':'variable_data'}
    >>> x = sio.loadmat(originalData)
    >>> data = x['X']
    >>> weights = np.ones((data.shape[0],1))
    >>> xcs, average, scale = tools.preprocess2D(data,2,weights)
    >>> # anomalous data test
    >>> test = x['test']

    >>> # data test autoscaled
    >>> testcs = tools.preprocess2Dapp(test,average,scale)
    """

    method_name = "preprocess2Dapp()"

    try:

        # mean centering
        testMeanCentered = test - np.dot(np.ones((test.shape[0],1)),average)

        # auto-scaling
        testAutoScaled = testMeanCentered / (np.dot(np.ones((test.shape[0],1)),scale))

    except Exception:
        raise MSNMError(None,sys.exc_info()[0],method_name)

    return testAutoScaled
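
A minimal numeric sketch of the two steps above, mean-centering followed by scaling (the data, average and scale values are hypothetical):

import numpy as np

test = np.array([[2.0, 4.0], [4.0, 8.0]])
average = np.array([[3.0, 6.0]])   # 1xM calibration averages
scale = np.array([[1.0, 2.0]])     # 1xM calibration scales
ones = np.ones((test.shape[0], 1))
testcs = (test - np.dot(ones, average)) / np.dot(ones, scale)
print(testcs)  # [[-1. -1.]
               #  [ 1.  1.]]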
Example #6
def save2json(json_contents, path_to_save):

    logging.info("Saving json file in %s", path_to_save)

    try:
        with open(path_to_save,'w') as f:
            # Save raw data
            f.writelines(json_contents)
    except IOError as ioe:
        logging.error("Error saving json file: %s",sys.exc_info()[1])
        raise MSNMError(ioe, sys.exc_info()[1], 'save2json()')
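
A hypothetical usage sketch, assuming the save2json() defined above is in scope; the path and payload are illustrative only:

import json

payload = json.dumps({'sensor': 'S1', 'Q': 0.12, 'D': 3.4})
save2json(payload, '/tmp/output_example.json')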
Example #7
def sort_dict(keys,values,order_by,order,abs_value):
    #TODO: check keyword parameters

    method_name = "sort_dict()"

    # Check the data type as dict
    if not isinstance(keys, list):
        raise MSNMError(None,"Invalid list of keys", method_name)

    # Check the data type as ndarray
    if not isinstance(values, np.ndarray):
        raise MSNMError(None,"Invalid values array", method_name)

    try:
        # Absolute value?
        if abs_value:
            aux = np.abs(values)
        else:
            aux = values

        # Which order?
        if order == 'desc':
            reverse_order = True
        else:
            reverse_order = False

        # Make a dict from {keys:values}
        d = dict(zip(keys,aux))

        if order_by == 'key':
            d = OrderedDict(sorted(d.items(),key=lambda t: t[0],reverse=reverse_order))
        else:
            d = OrderedDict(sorted(d.items(),key=lambda t: t[1],reverse=reverse_order))

    except Exception:
        raise MSNMError(None,sys.exc_info()[0],method_name)

    return d
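
A standalone sketch of the key/value pairing and value-ordered sort performed above (the sample data is hypothetical):

from collections import OrderedDict

import numpy as np

keys = ['var_a', 'var_b', 'var_c']
values = np.array([0.2, -0.9, 0.5])
d = dict(zip(keys, np.abs(values)))  # {'var_a': 0.2, 'var_b': 0.9, 'var_c': 0.5}
ordered = OrderedDict(sorted(d.items(), key=lambda t: t[1], reverse=True))
print(list(ordered))  # ['var_b', 'var_c', 'var_a']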
Example #8
def zeroDataImputation(**kwargs):

    method_name = "zeroDataImputation()"

    # Check optional parameters
    if 'obs' in kwargs:
        obs = kwargs['obs']
    else:
        logging.error("There is no observation to recover")
        msnmerror = MSNMError(None,"There is no observation to recover",method_name)
        raise msnmerror

    #Zero value imputation
    logging.debug("Doing zero based data imputation ...")
    rec_obs = pd.DataFrame(obs).fillna(0).as_matrix()

    return rec_obs
Example #9
def preprocess2D(x, prep, weights):
    """
    Data preprocessing depending on ``prep`` parameter

    Parameters
    ----------
    x: numpy.ndarray
        [NxM] bilinear data set
    prep: int
        Choose the preprocessing method:
           0: no preprocessing
           1: mean-centering
           2: auto-scaling (default)
    weights: numpy.ndarray
        [1xM] weight applied after preprocessing. Set to a vector of ones by default.

    Return
    ------
    xcs: numpy.ndarray
        [NxM] preprocessed data.
    average: numpy.ndarray
        [1xM] sample average according to the preprocessing method.
    scale: numpy.ndarray
        [1xM] sample scale according to the preprocessing method.

    .. todo::
        weights vector is not implemented

    Raises
    ------
    MSNMError
        General error raised when something goes wrong

    Example
    -------
    >>> from msnm.utils import datautils as tools
    >>> import numpy as np
    >>> import scipy.io as sio

    >>> # Original data set X
    >>> originalData = './datatest/data_adicov_mspc.mat'

    >>> # Returns a dictionary like {'variable_name':'variable_data'}
    >>> x = sio.loadmat(originalData)
    >>> data = x['X']
    >>> weights = np.ones((data.shape[0],1))
    >>> xcs, average, scale = tools.preprocess2D(data,2,weights)
    """

    method_name = "preprocess_2D()"

    try:

        if prep == 1:
            # mean avoiding NaN for each variable
            average = np.nanmean(x,axis=0)# array of M elements
            average = average.reshape((1,average.shape[0]))# Matrix of 1xM elements
            # array 1xM, being M the number of variables
            scale = np.ones((1,x.shape[1]))
            # subtract the average from the data set x
            xcs = x - np.dot(np.ones((x.shape[0],1)),average)
            # TODO: do test with NaN in the data set

        elif prep == 2:

            # not a numbers and a number in X
            nanM = np.isnan(x)
            anM = 1 - nanM

            average = np.nanmean(x,axis=0)# array of M elements
            average = average.reshape((1,average.shape[0]))# Matrix of 1xM elements
            scale = np.nanstd(x,axis=0,ddof=1)

            #TODO: ask Pepe what this is :(
            ind = np.nonzero(scale == 0)# indices where the scale is zero
            dem = 2.0*np.sum(anM[:,ind],axis=0) - 1
            scale[ind] = np.sqrt(np.ones((1,np.array(ind).size)) / dem)

            scale = scale.reshape((1,scale.shape[0]))# Matrix of 1xM elements
            xcs = x - np.dot(np.ones((x.shape[0],1)),average)
            xcs = xcs / np.dot(np.ones((x.shape[0],1)),scale)
            # TODO: do test with NaN in the data set

        else:
            # prep == 0 or an unknown value: no preprocessing is applied
            logging.warning("Preprocessing method %s is not available. Returning the data unchanged ...", prep)
            average = np.zeros((1,x.shape[1]))
            scale = np.ones((1,x.shape[1]))
            xcs = x

    except Exception:
        raise MSNMError(None,sys.exc_info()[0],method_name)

    return xcs, average, scale
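
A standalone sketch of the prep == 2 branch: auto-scaling is column-wise z-scoring with the sample standard deviation (ddof=1). NumPy broadcasting is used here instead of the np.dot(ones, ...) expansion above; the data is hypothetical:

import numpy as np

x = np.array([[1.0, 10.0], [3.0, 30.0], [5.0, 50.0]])
average = np.nanmean(x, axis=0).reshape(1, -1)
scale = np.nanstd(x, axis=0, ddof=1).reshape(1, -1)
xcs = (x - average) / scale
print(xcs.mean(axis=0))         # ~[0. 0.]
print(xcs.std(axis=0, ddof=1))  # [1. 1.]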
Example #10
def preprocess2Di(x, prep, lamda, average, scale, N, weights):
    """
    Data preprocessing applying EWMA methodology.

    J. Camacho, “Visualizing Big data with Compressed Score Plots: Approach and research challenges,”
    Chemometrics and Intelligent Laboratory Systems, vol. 135, pp. 110–125, Jul. 2014.

    References
    ----------
    Visualizing Big data with Compressed Score Plots: Approach and research challenges
    http://www.sciencedirect.com/science/article/pii/S016974391400080X

    Parameters
    ----------
    x: numpy.ndarray
        [NxM] bilinear data set
    prep: int
        Choose the preprocessing method:
           0: no preprocessing
           1: mean-centering
           2: auto-scaling (default)
    lamda: float
        forgetting factor [0,1]
    average: numpy.ndarray or scalar
        [1xM] (t-1) previous computed average array
    scale: numpy.ndarray or scalar
        [1xM] (t-1) previous computed scale array
    N: int
        number of observations used to compute mean and scale vectors
    weights: numpy.ndarray
        [1xM] weight applied after preprocessing. Set to a vector of ones by default.

    Return
    ------
    xcs: numpy.ndarray
        [NxM] preprocessed data.
    average: numpy.ndarray
        [1xM] sample average according to the preprocessing method.
    scale: numpy.ndarray
        [1xM] sample scale according to the preprocessing method.
    N: int
        Current N after applying the forgetting factor

    .. todo::
        weights vector is not implemented yet

    Raises
    ------
    MSNMError
        General error raised when something goes wrong

    Example
    -------
    >>> from msnm.utils import datautils as tools
    >>> import numpy as np
    >>> import scipy.io as sio

    >>> # Original data set X
    >>> originalData = './datatest/data_adicov_mspc.mat'

    >>> # Returns a dictionary like {'variable_name':'variable_data'}
    >>> x = sio.loadmat(originalData)
    >>> data = x['X']
    >>> weights = np.ones((data.shape[0],1))
    >>> xcs, average, scale = tools.preprocess2D(data,2,weights)
    """

    method_name = "preprocess_2D()"

    logging.info("Preprocessing data dynamically for N=%s obs and lambda=%s",N,lamda)

    # EWMA mean update model
    # M_t^x = lambda * M_(t-1)^x + X_t
    # m_t^x = (1/N_t) * M_t^x
    # N_t = lambda * N_(t-1) + B_t

    # acc <=> M_t^x --> Current model accumulated
    # average <=> m_t^x --> Current model mean
    acc = average*N

    # acc2 <=> (sigma_t^x)^2 --> Current model variability accumulated
    # scale <=> sigma_t^x --> Current model standard deviation
    acc2 = (scale**2)*np.max([N-1,0])

    # Current number of real observations to compute the mean and standard
    # deviation
    N = lamda*N + x.shape[0]

    try:

        if prep == 1:# mean centering

            logging.debug("EWMA mean centering")

            # Computes the current model mean
            acc = lamda*acc + np.sum(x, axis=0)
            average = acc/N
            average = average.reshape(1,x.shape[1])
            # array 1xM, being M the number of variables.
            scale = np.ones((1,x.shape[1]))
            # subtract the average to the data set x
            xcs = x - np.dot(np.ones((x.shape[0],1)),average)

            # TODO: do test with NaN in the data set

        elif prep == 2: # auto-scaling
            logging.debug("EWMA auto-scaling")

            # Computes the current model mean
            acc = lamda*acc + np.sum(x, axis=0)
            average = acc/N
            average = average.reshape(1,x.shape[1])

            # subtract the average to the data set x
            xc = x - np.dot(np.ones((x.shape[0],1)),average)
            # Computes the current model standard deviation
            acc2 = lamda*acc2 + np.sum(xc**2,axis=0)
            scale = np.sqrt(acc2/(N-1))

            # scale is all of zeros?
            if np.nonzero(scale)[0].shape[0] == 0:
                mS = 2
            else:
                mS = np.min(scale[np.nonzero(scale)])

            scale[np.nonzero(scale == 0)] = mS/2# using 1 by default could reduce detection of anomalous events
            # apply the scale
            scale = scale.reshape(1,x.shape[1])
            xcs = xc / np.dot(np.ones((x.shape[0],1)),scale)

            # TODO: do test with NaN in the data set

        elif prep == 3:
            logging.debug("EWMA scaling")

            # Computes the current model mean
            average = np.zeros((1,x.shape[1]))

            # Computes the current model standard deviation
            acc2 = lamda*acc2 + np.sum(x**2,axis=0)
            scale = np.sqrt(acc2/(N-1))

            # scale is all of zeros?
            if np.nonzero(scale)[0].shape[0] == 0:
                mS = 2
            else:
                mS = np.min(scale[np.nonzero(scale)])

            scale[np.nonzero(scale == 0)] = mS/2# using 1 by default could reduce detection of anomalous events
            # apply the scale
            scale = scale.reshape(1,x.shape[1])
            xcs = x / np.dot(np.ones((x.shape[0],1)),scale)

            # TODO: do test with NaN in the data set

        else:
            logging.warn("The selected preprocessing method is not valid")
            average = np.zeros((1,x.shape[1]))
            scale = np.ones((1,x.shape[1]))
            xcs = x

    except Exception:
        logging.error("Error preprocessing the data: %s",sys.exc_info()[1])
        raise MSNMError(None,sys.exc_info()[1],method_name)

    return xcs, average, scale, N
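
A minimal sketch of the EWMA mean update described in the comments above (M_t = lambda*M_(t-1) + sum(X_t), N_t = lambda*N_(t-1) + B_t, m_t = M_t / N_t); the previous model values and the new batch are hypothetical:

import numpy as np

lamda = 0.9
average = np.array([[2.0, 4.0]])   # m_(t-1): previous model mean
N = 10.0                           # N_(t-1): previous effective number of observations

x = np.array([[4.0, 6.0], [2.0, 4.0]])   # new batch X_t with B_t = 2 observations

acc = average * N                        # M_(t-1)
N = lamda * N + x.shape[0]               # N_t = 0.9*10 + 2 = 11
acc = lamda * acc + np.sum(x, axis=0)    # M_t
average = (acc / N).reshape(1, x.shape[1])
print(average, N)                        # updated model mean and effective N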
Example #11
    def launch_monitoring(self, ts):
        """
        Once the parsing (flow parser) procedure is done, this method is in charge of starting the monitoring
        process.

        Raises
        ------
        MSNMError

        """

        method_name = "launch_monitoring()"

        # Configuration
        config = Configure()
        # Get root path for creating data files
        rootDataPath = config.get_config()['GeneralParams']['rootPath']

        obs_generated_path = rootDataPath + config.get_config()['Sensor'][
            'observation']  # path to save the complete observation joining all data sources
        batch_obs = config.get_config()['Sensor']['dynamiCalibration'][
            'B']  # number of observations in a batch for EWMA calibration
        lambda_param = config.get_config()['Sensor']['dynamiCalibration'][
            'lambda']  # forgetting factor for EWMA calibration
        dyn_cal_enabled = config.get_config()['Sensor']['dynamiCalibration'][
            'enabled']  # is the dynamic calibration activated?
        output_generated_path = rootDataPath + config.get_config()['Sensor'][
            'output']  # path to save the Q and T statistics obtained from the previous observation
        missingDataMethods = config.get_config()['Sensor']['missingData'][
            'missingDataMethods']  # Available missing data methods
        missingDataSelectedMethod = config.get_config()['Sensor'][
            'missingData']['selected']  # Get the selected missing data method
        missingDataModule = config.get_config()['Sensor']['missingData'][
            'missingDataModule']  # Module containing the missing data methods
        valuesFormat = config.get_config()['GeneralParams'][
            'valuesFormat']  # how the variables of the complete observation are saved

        logging.debug("Launch monitoring for %s ", ts)

        try:
            logging.debug("Building the observation at %s for %s sources.", ts,
                          self._sources.keys())
            # Build the observation for monitoring
            test = []
            for i in self._sources.keys():
                # Get the number of variables of source i
                i_variables = self.get_number_source_variables(
                    self._sources[i], i)
                logging.debug("Source %s has %s variables.", i, i_variables)
                # Get the source output parsed file for the current timestamp
                i_parsed_file = self._sources[i]._files_generated[ts]
                logging.debug("File generated of source %s at %s: %s", i, ts,
                              i_parsed_file)

                if i_parsed_file:
                    # Load the file
                    if self._sources[i]._type == Source.TYPE_L:

                        # static mode?
                        # TODO: next version
                        #staticMode = config.get_config()['DataSources'][self._sources[i]._type][i]['staticMode'];
                        staticMode = False

                        if not staticMode:  # online or dynamic mode
                            i_test = np.loadtxt(i_parsed_file,
                                                comments="#",
                                                delimiter=",")
                        else:  # offline or static mode
                            # TODO: this is just a patch to remove in_npackets_verylow and in_nbytes_verylow as in the Matlab experiment, and just for Netflow!!!
                            # look for a smarter way to do this e.g., by configuration params
                            i_test = np.loadtxt(i_parsed_file,
                                                comments="#",
                                                delimiter=",",
                                                usecols=range(
                                                    1, i_variables + 1 + 2))

                            logging.debug(
                                "Offline mode for source %s. Observation size of %s",
                                i, i_test.shape)

                            mask = np.ones(i_test.shape, dtype=bool)
                            # in_npackets_verylow index in matlab is 119 --> 118 in numpy (0-based)
                            # in_nbytes_verylow index in matlab is 129 --> 128 in numpy (0-based)
                            mask[118] = False
                            mask[128] = False
                            i_test = i_test[mask]

                            logging.debug(
                                "Offline mode for source %s. Observation size of %s after removing unuseless variables.",
                                i, i_test.shape)

                    elif self._sources[i]._type == Source.TYPE_R:
                        i_test = np.loadtxt(i_parsed_file,
                                            comments="#",
                                            delimiter=",")
                    else:
                        logging.warning(
                            "Source %s does not have a valid type. Type: %s", i,
                            self._sources[i]._type)
                else:
                    # Missing values are replaced with NaN values
                    i_test = np.empty(i_variables)
                    i_test[:] = np.nan

                # Test observation
                test = np.concatenate((test, i_test), axis=0)

            # 1xM array
            test = test.reshape((1, test.size))

            # Dynamic invocation of the selected data imputation method if needed
            if np.isnan(test).any():
                missingDataMethod = getattr(
                    importlib.import_module(missingDataModule),
                    missingDataMethods[missingDataSelectedMethod])
                logging.debug(
                    "Invoking %s method for data imputation for observation at %s",
                    missingDataMethod.func_name, ts)
                # Calling the corresponding method
                test = missingDataMethod(obs=test, model=self._sensor._model)

            obs_generate_file = obs_generated_path + "obs_" + ts + ".dat"
            np.savetxt(obs_generate_file,
                       test,
                       fmt=valuesFormat,
                       delimiter=",",
                       header=str(datautils.getAllVarNames()),
                       comments="#")

            logging.debug("Observation generated of %s variables at %s.",
                          test.size, ts)

            # Is the dynamic calibration enabled?
            if dyn_cal_enabled:

                # Increments the number of observation
                self._current_batch_obs = self._current_batch_obs + 1

                logging.debug("obs %s added to the batch as number %s.", ts,
                              self._current_batch_obs)

                # Add the observation
                self._batch[ts] = {}
                self._batch[ts]['file'] = obs_generate_file
                self._batch[ts]['data'] = test

                # Once we have reached the number of batch observations, we can do the dynamic calibration
                if self._current_batch_obs == batch_obs:
                    # data for calibration
                    x = np.array([])
                    x = x.reshape((0, test.size))

                    # Build the [NxM] data for the calibration
                    #print(self._batch.keys())
                    for i in self._batch.keys():
                        logging.debug("batch at %s -> %s", i,
                                      self._batch[i]['data'].shape)
                        x = np.vstack((x, self._batch[i]['data']))

                    #print(x)
                    #print(type(x))

                    # Build the model
                    self._sensor.set_data(x)
                    self._sensor.do_dynamic_calibration(phase=2,
                                                        lv=3,
                                                        lamda=lambda_param)

                    # Reset the counter
                    self._current_batch_obs = 0

                    # Removing all batch observations
                    self._batch.clear()

            # Do monitoring
            Qst, Dst = self._sensor.do_monitoring(test)

        except SensorError as ese:
            raise MSNMError(self, ese.get_msg(), method_name)
        except MSNMError as emsnme:
            raise emsnme

        logging.debug("MONITORING --> UCLd: %s | Dst: %s",
                      self._sensor.get_model().get_mspc().getUCLD(),
                      self._sensor.get_mspc().getDst())
        logging.debug("MONITORING --> UCLq: %s | Qst: %s",
                      self._sensor.get_model().get_mspc().getUCLQ(),
                      self._sensor.get_mspc().getQst())

        # Save the generated statistics
        output_generated_file = output_generated_path + "output_" + ts + ".dat"
        header = "UCLq:" + str(
            self._sensor.get_model().get_mspc().getUCLQ()) + ", UCLd:" + str(
                self._sensor.get_model().get_mspc().getUCLD())
        list_array = [
            self._sensor.get_mspc().getQst(),
            self._sensor.get_mspc().getDst()
        ]
        statistics = np.array(list_array)
        statistics = statistics.reshape((1, statistics.size))
        np.savetxt(output_generated_file,
                   statistics,
                   fmt=valuesFormat,
                   delimiter=",",
                   header=header,
                   comments="#")

        # Get the remote sensor addresses to send the packet to
        remote_addresses = config.get_config()['Sensor']['remote_addresses']

        # Send packets only if there is someone to send them to!
        if remote_addresses:

            # Send the data packet to the corresponding sensor.
            dataPacket = DataPacket()
            # Packet sent counter increments
            self._packet_sent = self._packet_sent + 1
            dataPacket.fill_header({
                'id': self._packet_sent,
                'sid': config.get_config()['Sensor']['sid'],
                'ts': dateutils.get_timestamp(),
                'type': Packet.TYPE_D
            })
            dataPacket.fill_body({
                'Q': self._sensor.get_mspc().getQst(),
                'D': self._sensor.get_mspc().getDst()
            })

            logging.debug("Remote sources to send the packet #%s: %s",
                          self._packet_sent, remote_addresses)

            for i in remote_addresses.keys():
                ip = remote_addresses[i]['ip']
                port = remote_addresses[i]['port']
                tcpClient = TCPClient()
                tcpClient.set_server_address((ip, port))
                tcpClient.set_packet_to_send(dataPacket)
                TCPClientThread(tcpClient).start()

        return test, Qst, Dst
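
The data-imputation call above resolves the method to run at runtime with importlib and getattr. A standalone sketch of that dispatch idiom (the module and function names used here are placeholders, not the project's configuration values):

import importlib

import numpy as np

module_name = 'numpy'        # stands in for missingDataModule from the configuration
function_name = 'nanmean'    # stands in for the selected missing-data method name
method = getattr(importlib.import_module(module_name), function_name)
print(method(np.array([1.0, np.nan, 3.0])))  # 2.0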