Example #1
    def apply(self, df, options):
        # Make a copy of data, to not alter original dataframe
        logger = get_logger('IsolationForest Logger')
        X = df.copy()

        X, nans, _ = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            final_columns=self.columns,
            mlspl_limits=options.get('mlspl_limits'),
        )

        # Multiply the result by -1 so that outliers are labelled 1 and
        # inliers/normal points are labelled -1 (sklearn's predict returns
        # -1 for anomalies and 1 for inliers).
        y_hat = self.estimator.predict(X.values) * -1

        # Log the share of points predicted as inliers (label -1 after the
        # flip); note this is a class proportion, not a true accuracy.
        inlier_pct = round(list(y_hat).count(-1) * 100.0 / y_hat.shape[0], 2)
        logger.debug("Inlier percentage: {}".format(inlier_pct))
        
        y_hat = y_hat.astype('str')

        # Assign output_name
        default_name = 'isOutlier'
        new_name = options.get('output_name', None)
        output_name = self.rename_output(default_names=default_name, new_names=new_name)

        # Create output dataframe
        output = df_util.create_output_dataframe(
            y_hat=y_hat, nans=nans, output_names=output_name
        )
        # Merge with original dataframe
        output = df_util.merge_predictions(df, output)
        return output
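A note on the sign flip above: scikit-learn's IsolationForest.predict returns -1 for anomalies and 1 for normal points, so multiplying by -1 makes outliers come out as 1. A minimal standalone sketch of the same convention, outside the ML-SPL plumbing (toy data, not Splunk's):

import numpy as np
from sklearn.ensemble import IsolationForest

# Toy data: a tight cluster plus two obvious outliers.
rng = np.random.RandomState(42)
X = np.vstack([rng.normal(0, 0.5, size=(100, 2)), [[8.0, 8.0], [-9.0, 7.0]]])

est = IsolationForest(random_state=42).fit(X)

# predict() returns -1 for anomalies and 1 for inliers; flip the sign
# so that outliers are labelled 1, as in the apply() method above.
y_hat = est.predict(X) * -1
print("outliers flagged:", int((y_hat == 1).sum()))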
Example #2
def make_rest_call(session_key, method, url, postargs):
    import os
    import subprocess
    import json

    import cexc

    logger = cexc.get_logger(__name__)

    payload = {
        'session_key': session_key,
        'url': url,
        'method': method,
        'postargs': postargs
    }

    try:
        python_path = os.path.join(os.environ['SPLUNK_HOME'], 'bin', 'python')
        p = subprocess.Popen(
            [python_path, os.path.abspath(__file__)],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        (stdoutdata, stderrdata) = p.communicate(json.dumps(payload))
        p.wait()

        for errline in stderrdata.splitlines():
            logger.debug('> %s', errline)

        if p.returncode != 0:
            raise RuntimeError(
                "rest_bouncer subprocess exited with non-zero error code '%d'"
                % p.returncode)

        reply = json.loads(stdoutdata)
    except Exception as e:
        logger.warn('rest_bouncer failure: %s: %s', type(e).__name__, str(e))
        return False

    return reply
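The parent half above implies a worker half: it re-launches this same file as a subprocess, writes the JSON payload to the child's stdin, and expects a JSON reply on stdout with a zero exit code. A minimal sketch of that contract (rest_call here is a hypothetical stand-in; the real rest_bouncer presumably performs the actual HTTP request):

import json
import sys

def rest_call(payload):
    # Hypothetical stand-in for the real REST logic, which would use
    # payload['url'], payload['method'], payload['session_key'] and
    # payload['postargs'] to call the Splunk REST API.
    return {'success': True, 'status': 200}

if __name__ == '__main__':
    # Read the JSON payload from stdin, reply as JSON on stdout.
    payload = json.loads(sys.stdin.read())
    reply = rest_call(payload)
    sys.stdout.write(json.dumps(reply))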
Example #3
#!/usr/bin/env python
# Copyright (C) 2015-2017 Splunk Inc. All Rights Reserved.
import errno
import gc
import os

import pandas as pd

import cexc
import conf
import models.base

from BaseProcessor import BaseProcessor
from util import search_util

logger = cexc.get_logger(__name__)
messages = cexc.get_messages_logger()


class ApplyProcessor(BaseProcessor):
    """The apply processor receives and returns pandas DataFrames."""

    def __init__(self, process_options, searchinfo):
        """Initialize options for the processor.

        Args:
            process_options (dict): process options
            searchinfo (dict): information required for search
        """
        self.searchinfo = searchinfo
        self.algo_name, self.algo, self.process_options, self.namespace = \
            self.setup_model(process_options, self.searchinfo)
Example #4
#!/usr/bin/env python
# Copyright (C) 2015-2018 Splunk Inc. All Rights Reserved.
from exec_anaconda import exec_anaconda_or_die
exec_anaconda_or_die()

import cexc
from cexc import BaseChunkHandler

from util.param_util import parse_args
from util.command_util import GeneratingCommand, is_getinfo_chunk

logger = cexc.get_logger('kvstorelookup')
messages = cexc.get_messages_logger()


class KVStoreLookupCommand(GeneratingCommand):
    """KVStoreLookupCommand uses the ChunkedController & KVStoreLookupProcessor to read a KVStore collection"""
    @staticmethod
    def handle_arguments(getinfo):
        """Check for invalid arguments and get controller_options.

        Args:
            getinfo (dict): getinfo metadata

        Returns:
            controller_options (dict): controller options
        """

        options = parse_args(getinfo['searchinfo']['args'])
        params = options.get('params', {})
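parse_args itself is not shown in these snippets; judging from its use (an options dict queried for a 'params' key), a sketch of such a parser might look like the following. This is a guess at the shape of util.param_util.parse_args, not its actual implementation:

def parse_args(args):
    # Hypothetical sketch: collect key=value tokens under 'params'
    # and keep bare tokens under 'args'.
    options = {'params': {}, 'args': []}
    for token in args:
        if '=' in token:
            key, value = token.split('=', 1)
            options['params'][key] = value
        else:
            options['args'].append(token)
    return options

# e.g. parse_args(['collection=my_collection'])
# -> {'params': {'collection': 'my_collection'}, 'args': []}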
Example #5
#!/usr/bin/env python
# Copyright (C) 2015-2018 Splunk Inc. All Rights Reserved.
from exec_anaconda import exec_anaconda_or_die
exec_anaconda_or_die()

import os

import cexc
from cexc import BaseChunkHandler
from util import command_util
from util.param_util import parse_args
from chunked_controller import ChunkedController

logger = cexc.get_logger('score')
messages = cexc.get_messages_logger()


class ScoreCommand(cexc.BaseChunkHandler):
    """ScoreCommand uses ChunkedController & processor(s) to score field(s). """
    @staticmethod
    def handle_arguments(getinfo):
        """Take the getinfo metadata and return controller_options.

        Args:
            getinfo (dict): getinfo metadata from first chunk

        Returns:
            controller_options (dict): options to be passed to controller
        """
        if len(getinfo['searchinfo']['raw_args']) == 0:
            raise RuntimeError('First argument must be a scoring method')
Example #6
#!/usr/bin/env python
# Copyright (C) 2015-2018 Splunk Inc. All Rights Reserved.
from exec_anaconda import exec_anaconda_or_die
exec_anaconda_or_die()

import os
import conf
from cStringIO import StringIO
from util.param_util import is_truthy, parse_args, convert_params
from util import command_util

import cexc
from chunked_controller import ChunkedController
from cexc import BaseChunkHandler

logger = cexc.get_logger('fit')
messages = cexc.get_messages_logger()


class FitCommand(cexc.BaseChunkHandler):
    """FitCommand uses ChunkedController & one of two processors to fit models.

    The FitCommand can use either the FitBatchProcessor or the FitPartialProcessor,
    which is chosen based on the presence of the partial_fit parameter.
    """
    @staticmethod
    def handle_arguments(getinfo):
        """Take the getinfo metadata and return controller_options.

        Args:
            getinfo (dict): getinfo metadata from first chunk
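The dispatch described in the docstring could look like the sketch below. Both function bodies are hypothetical stand-ins (is_truthy mimics util.param_util.is_truthy; the actual wiring inside FitCommand is not shown in the snippet):

def is_truthy(value):
    # Minimal stand-in for util.param_util.is_truthy.
    return str(value).lower() in ('1', 't', 'true', 'y', 'yes')

def choose_processor(params):
    # Hypothetical sketch of the dispatch described in the docstring:
    # the presence of the partial_fit parameter selects the
    # FitPartialProcessor, otherwise the FitBatchProcessor is used.
    if is_truthy(params.get('partial_fit', 'false')):
        return 'FitPartialProcessor'
    return 'FitBatchProcessor'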
Example #7
#!/usr/bin/env python
# Copyright (C) 2015-2017 Splunk Inc. All Rights Reserved.
import cexc
from cexc.cexc_anaconda import exec_anaconda_or_die
exec_anaconda_or_die()

from util import command_util, param_util
from util.command_util import GeneratingCommand

logger = cexc.get_logger('delete')
messages = cexc.get_messages_logger()


class DeleteModelCommand(GeneratingCommand):
    """DeleteModelCommand uses the ChunkedController & DeleteModelProcessor to
    delete models."""
    @staticmethod
    def handle_arguments(getinfo):
        """Check for invalid argument usage and return controller options.

        Args:
            getinfo(dict): getinfo metadata

        Returns:
            controller_options (dict): controller options
        """
        if len(getinfo['searchinfo']['args']) != 1:
            raise RuntimeError('Usage: deletemodel <modelname>')

        controller_options = {}
        controller_options['namespace'], controller_options[
Example #8
#!/usr/bin/env python
# Copyright (C) 2015-2017 Splunk Inc. All Rights Reserved.
import cexc
from cexc import BaseChunkHandler
from cexc.cexc_anaconda import exec_anaconda_or_die
exec_anaconda_or_die()


from util import param_util, command_util
from util.command_util import GeneratingCommand

logger = cexc.get_logger('summary')
messages = cexc.get_messages_logger()


class SummaryCommand(GeneratingCommand):
    """Summary command gets model summaries from ML-SPL models."""

    @staticmethod
    def handle_arguments(getinfo):
        """Catch invalid argument and return controller options.

        Args:
            getinfo (dict): getinfo metadata

        Return:
            controller_options (dict): controller options
        """
        if len(getinfo['searchinfo']['args']) == 0:
            raise RuntimeError('First argument must be a saved model')
Example #9
#!/usr/bin/env python
# Copyright (C) 2015-2018 Splunk Inc. All Rights Reserved.
from exec_anaconda import exec_anaconda_or_die

exec_anaconda_or_die()

import cexc
from cexc import BaseChunkHandler

from util import command_util
from util.command_util import GeneratingCommand

logger = cexc.get_logger('list')
messages = cexc.get_messages_logger()


class ListModelsCommand(GeneratingCommand):
    """ListModelsCommand uses the ChunkedController & ListModelsProcessor to
    list saved models."""
    @staticmethod
    def handle_arguments(getinfo):
        """Check for invalid arguments and get controller_options.

        Args:
            getinfo (dict): getinfo metadata

        Returns:
            controller_options (dict): controller options
        """
        if len(getinfo['searchinfo']['args']) > 0:
            raise RuntimeError(
Example #10
#!/usr/bin/env python
# Copyright (C) 2015-2017 Splunk Inc. All Rights Reserved.
import cexc
from cexc import BaseChunkHandler
from cexc.cexc_anaconda import exec_anaconda_or_die
exec_anaconda_or_die()

import conf
from util.param_util import parse_args, is_truthy, parse_namespace_model_name
from util import command_util

from chunked_controller import ChunkedController

logger = cexc.get_logger('apply')
messages = cexc.get_messages_logger()


class ApplyCommand(BaseChunkHandler):
    """ApplyCommand uses the ChunkedController & ApplyProcessor to make
    predictions."""
    @staticmethod
    def handle_arguments(getinfo):
        """Take the getinfo metadata and return controller_options.

        Args:
            getinfo (dict): getinfo metadata

        Returns:
            controller_options (dict): options to be sent to controller
        """
        if len(getinfo['searchinfo']['args']) == 0:
Example #11
    def fit(self, df, options):
        # df contains all the search results, including hidden fields,
        # but the requested fields are saved as self.feature_variables
        logger = get_logger('MyCustomLogging')

        X = df.copy()

        # It is always best practice to prepare your data: Splunk exposes a
        # number of hidden fields as part of the search protocol, and we only
        # want the features that are valid field names.

        # Make sure to turn off get_dummies
        X, _, self.columns = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            get_dummies=False,
            mlspl_limits=options.get('mlspl_limits'),
        )

        # Check that the user field exists in the prepared features
        logger.debug("The user field is %s", self.user_field)
        try:
            my_list_index = X[self.user_field].values
        except KeyError:
            raise RuntimeError(
                'You must specify a user field that exists. You sent %s'
                % self.user_field)

        X = X.drop([self.user_field], axis=1)
        my_list_header = X.columns.values

        # Ratings as a matrix: clean the data up by replacing infinities and
        # missing values with 0 so the similarity math stays finite.
        X = X.replace([np.inf, -np.inf], np.nan).fillna(0)
        matrix = X.values
        # Force the dtype for NumPy math
        matrix = matrix.astype(np.float64)

        # Should consider erroring out on extremely sparse user data
        # TODO: add other distance metrics via a parameter
        user_sim = pairwise_distances(matrix, metric='cosine')
        item_sim = pairwise_distances(matrix.T, metric='cosine')

        # Item-based prediction: ratings weighted by item-item similarity
        item_sim = matrix.dot(item_sim) / np.array(
            [np.abs(item_sim).sum(axis=1)])

        # User-based prediction: per-user mean plus similarity-weighted deviations
        mean_user_rating = matrix.mean(axis=1)
        matrix_diff = (matrix - mean_user_rating[:, np.newaxis])
        user_sim = mean_user_rating[:, np.newaxis] + user_sim.dot(
            matrix_diff) / np.array([np.abs(user_sim).sum(axis=1)]).T

        # Rebuild a labelled DataFrame: original item names as the columns and
        # the user field values as the index
        if self.rating_type == "item":
            output_df = pd.DataFrame(item_sim,
                                     columns=my_list_header,
                                     index=my_list_index)
        elif self.rating_type == "user":
            output_df = pd.DataFrame(user_sim,
                                     columns=my_list_header,
                                     index=my_list_index)
        else:
            raise RuntimeError('rating_type must be "item" or "user". '
                               'You sent %s' % self.rating_type)
        output_df[self.user_field] = pd.Series(my_list_index).values

        return output_df
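The two formulas in fit() are the standard memory-based collaborative-filtering predictions: item-based scores weight the ratings by item-item similarity, and user-based scores add similarity-weighted rating deviations to each user's mean. A tiny standalone sketch of the same math on a toy ratings matrix (note the snippet feeds cosine distances into the formulas; the textbook version uses similarities, i.e. 1 - distance):

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

# Toy ratings matrix: 3 users x 4 items.
matrix = np.array([[5.0, 3.0, 0.0, 1.0],
                   [4.0, 0.0, 0.0, 1.0],
                   [1.0, 1.0, 5.0, 4.0]])

user_sim = pairwise_distances(matrix, metric='cosine')      # 3 x 3
item_sim = pairwise_distances(matrix.T, metric='cosine')    # 4 x 4

# Item-based prediction: ratings weighted by item-item distance, normalized.
item_pred = matrix.dot(item_sim) / np.array([np.abs(item_sim).sum(axis=1)])

# User-based prediction: per-user mean plus weighted rating deviations.
mean_user_rating = matrix.mean(axis=1)
diff = matrix - mean_user_rating[:, np.newaxis]
user_pred = mean_user_rating[:, np.newaxis] + \
    user_sim.dot(diff) / np.array([np.abs(user_sim).sum(axis=1)]).T

print(np.round(item_pred, 2))
print(np.round(user_pred, 2))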