Example #1
    def fit(self, df, options):
        """Do the clustering & merge labels with original data."""
        # Make a copy of the input data
        X = df.copy()

        # Use the df_util prepare_features method to
        # - drop null columns & rows
        # - convert categorical columns into dummy indicator columns
        # X is our cleaned data, nans is a mask of the null value locations
        X, nans, columns = df_util.prepare_features(X, self.feature_variables)

        # Do the actual clustering
        y_hat = self.estimator.fit_predict(X.values)

        # attach silhouette coefficient score for each row
        silhouettes = silhouette_samples(X, y_hat)

        # Combine the two arrays, and transpose them.
        y_hat = np.vstack([y_hat, silhouettes]).T

        # Assign default output names
        default_name = 'cluster'

        # Get the value from the as-clause if present
        output_name = options.get('output_name', default_name)

        # There are two columns - one for the labels, one for the silhouette scores
        output_names = [output_name, 'silhouette_score']

        # Use the predictions & nans-mask to create a new dataframe
        output_df = df_util.create_output_dataframe(y_hat, nans, output_names)

        # Merge the dataframe with the original input data
        df = df_util.merge_predictions(df, output_df)
        return df
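The example above follows the prepare -> predict -> merge pattern shared by most snippets on this page: df_util.prepare_features returns the cleaned matrix plus a null mask, and create_output_dataframe / merge_predictions realign predictions with the original rows. Below is a minimal standalone sketch of that flow with plain pandas and scikit-learn; df_util itself is Splunk MLTK internal code, so this approximation and its sample data are illustrative only.

import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_samples

df = pd.DataFrame({'x': [1.0, 2.0, np.nan, 8.0, 9.0],
                   'y': [1.0, 2.2, 3.0, 8.1, 9.2]})

# "prepare_features": drop null rows, remembering which rows survive
X = df[['x', 'y']].dropna()

labels = DBSCAN(eps=3.0, min_samples=2).fit_predict(X.values)
scores = silhouette_samples(X.values, labels)

# "create_output_dataframe" + "merge_predictions": realign via the index
out = pd.DataFrame(np.vstack([labels, scores]).T,
                   columns=['cluster', 'silhouette_score'],
                   index=X.index)
result = df.join(out)  # dropped rows get NaN in the new columns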
Example #2
    def apply(self, df, options=None):
        # Guard the default: options is read via .get() below
        options = options or {}
        # Handle backwards compatibility.
        add_missing_attr(self.estimator,
                         attr='max_iter',
                         value=5,
                         param_key='n_iter')
        add_missing_attr(self.estimator, attr='tol', value=None)

        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        X, nans, columns = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            final_columns=self.columns,
            mlspl_limits=options.get('mlspl_limits'),
        )

        scaled_X = self.scaler.transform(X.values)
        y_hat = self.estimator.predict(scaled_X)

        default_name = 'predicted({})'.format(self.target_variable)
        output_name = options.get('output_name', default_name)

        output = df_util.create_output_dataframe(
            y_hat=y_hat,
            nans=nans,
            output_names=output_name,
        )

        output = df_util.merge_predictions(df, output)
        return output
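add_missing_attr is a Splunk MLTK compatibility helper whose source is not shown here. A hypothetical sketch of the behavior its call sites imply (setting a default for an attribute that older pickled estimators lack, optionally carrying over a renamed parameter such as n_iter -> max_iter); the real helper may differ.

def add_missing_attr(estimator, attr, value, param_key=None):
    """Hypothetical: give old pickled estimators the attributes newer
    scikit-learn versions expect."""
    if not hasattr(estimator, attr):
        # If the value lived under a legacy parameter name, carry it over
        if param_key is not None and hasattr(estimator, param_key):
            value = getattr(estimator, param_key)
        setattr(estimator, attr, value)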
Example #3
    def fit(self, df, options):
        """Compute the polynomial features and return a DataFrame."""
        (X, nans, columns) = prepare_features(df.copy(),
                                              self.feature_variables)
        X_hat = DataFrame(self.preprocessor.fit_transform(X),
                          columns=self.get_feature_names(columns))
        return merge_predictions(df, X_hat)
Example #4
    def apply(self, df, options):
        # Make a copy of data, to not alter original dataframe
        X = df.copy()
        X, nans, columns = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            final_columns=self.columns,
        )
        y_hat = self.estimator.transform(X.values)
        mask = self.estimator.get_support()
        columns_select = np.array(self.columns)[mask]
        width = len(columns_select)

        if width == 0:
            cexc.messages.warn(
                'No fields pass the current configuration. Consider changing your parameters.'
            )

        default_name = 'fs'
        output_name = options.get('output_name', default_name)
        output_names = [output_name + '_%s' % x for x in columns_select]

        output = df_util.create_output_dataframe(
            y_hat=y_hat,
            nans=nans,
            output_names=output_names,
        )

        df = df_util.merge_predictions(df, output)
        return df
Example #5
    def fit(self, df, options):
        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        X, nans, _ = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
        )

        if len(X) > 0 and len(X) <= self.estimator.n_clusters:
            raise RuntimeError(
                "k must be smaller than the number of events used as input")

        scaled_X = self.scaler.fit_transform(X.values)
        y_hat = self.estimator.fit_predict(scaled_X)
        y_hat = ['' if np.isnan(v) else str('%.0f' % v) for v in y_hat]

        default_name = 'cluster'
        output_name = options.get('output_name', default_name)

        output = df_util.create_output_dataframe(
            y_hat=y_hat,
            nans=nans,
            output_names=output_name,
        )
        df = df_util.merge_predictions(df, output)
        return df
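Stripped of the df_util plumbing, the fit above is a standard scale-then-cluster flow. A self-contained sketch with scikit-learn; the sample data and n_clusters value are illustrative.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

X = np.array([[1.0, 200.0], [1.1, 210.0], [9.0, 900.0], [9.2, 950.0]])

scaled_X = StandardScaler().fit_transform(X)
y_hat = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(scaled_X)

# Same label formatting as above: empty string for NaN, else integer text
labels = ['' if np.isnan(v) else '%.0f' % v for v in y_hat]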
Example #6
    def apply(self, df, options):
        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        # Make sure to turn off get_dummies
        X, nans, _ = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            final_columns=self.columns,
            get_dummies=False,
            mlspl_limits=options.get('mlspl_limits'),
        )

        X = X.values.ravel().astype('str')
        y_hat = self.estimator.transform(X)

        # Convert the returned sparse matrix into array
        y_hat = y_hat.toarray()

        output_names = self.make_output_names(options)

        output = df_util.create_output_dataframe(
            y_hat=y_hat,
            output_names=output_names,
            nans=nans,
        )

        df = df_util.merge_predictions(df, output)
        return df
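The estimator here behaves like a scikit-learn text vectorizer: transform on raw strings returns a SciPy sparse matrix, hence the .toarray() call. A minimal illustration with TfidfVectorizer, assuming that is the kind of estimator wrapped; the documents are made up.

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['error in module a', 'module a ok', 'error error b']
vectorizer = TfidfVectorizer().fit(docs)

sparse = vectorizer.transform(['error in module b'])
dense = sparse.toarray()  # shape (1, vocabulary size)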
Example #7
    def apply(self, df, options):
        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        # Prepare the dataset
        X, nans, _ = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            final_columns=self.columns,
            mlspl_limits=options.get('mlspl_limits'),
        )
        # Make predictions
        y_hat = self.estimator.predict(X.values)

        # Assign output_name
        default_name = 'predicted({})'.format(self.target_variable)
        new_name = options.get('output_name', None)
        output_name = self.rename_output(default_names=default_name,
                                         new_names=new_name)

        # Create output dataframe
        output = df_util.create_output_dataframe(
            y_hat=y_hat,
            nans=nans,
            output_names=output_name,
        )

        # Merge with original dataframe
        output = df_util.merge_predictions(df, output)
        return output
Example #8
    def apply(self, df, options):
        # Make a copy of data, to not alter original dataframe
        logger = get_logger('IsolationForest Logger')
        X = df.copy()

        X, nans, _ = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            final_columns=self.columns,
            mlspl_limits=options.get('mlspl_limits'),
        )

        # Flip the sign: scikit-learn's IsolationForest returns -1 for
        # outliers and 1 for inliers, so after multiplying by -1 outliers
        # are labeled 1 and inliers/normal points -1.
        y_hat = self.estimator.predict(X.values)*-1
        # Log the percentage of events labeled -1 (i.e. inliers) after the flip
        accuracy = "Accuracy: {}".format(str(round((list(y_hat).count(-1)*100)/y_hat.shape[0], 2)))
        logger.debug(accuracy)
        
        y_hat = y_hat.astype('str')

        # Assign output_name
        default_name = 'isOutlier'
        new_name = options.get('output_name', None)
        output_name = self.rename_output(default_names=default_name, new_names=new_name)

        # Create output dataframe
        output = df_util.create_output_dataframe(
            y_hat=y_hat, nans=nans, output_names=output_name
        )
        # Merge with original dataframe
        output = df_util.merge_predictions(df, output)
        return output
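For reference, scikit-learn's IsolationForest.predict returns 1 for inliers and -1 for outliers, which is why the sign flip above makes 1 mean "outlier". A minimal check; the data and contamination value are illustrative.

import numpy as np
from sklearn.ensemble import IsolationForest

X = np.array([[0.0], [0.1], [-0.1], [0.05], [10.0]])  # 10.0 is the anomaly
clf = IsolationForest(contamination=0.2, random_state=0).fit(X)

y = clf.predict(X) * -1                      # 1 now marks outliers
outlier_pct = 100.0 * list(y).count(1) / len(y)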
Example #9
    def apply(self, df, options):
        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        # Prepare the features
        X, nans, _ = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            final_columns=self.columns,
        )

        # Call the transform method
        y_hat = self.estimator.fit_transform(X.values)

        # Assign output_name
        output_name = options.get('output_name', None)
        default_names = self.make_output_names(
            output_name=output_name,
            n_names=y_hat.shape[1],
        )
        output_names = self.rename_output(default_names, output_name)

        # Create output dataframe
        output = df_util.create_output_dataframe(
            y_hat=y_hat,
            nans=nans,
            output_names=output_names,
        )

        # Merge with original dataframe
        output = df_util.merge_predictions(df, output)
        return output
Example #10
    def apply(self, df, options):
        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        X, nans, _ = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            final_columns=self.columns,
            mlspl_limits=options.get('mlspl_limits'),
        )
        y_hat = self.estimator.predict(X.values)

        # Ensure the output has no floating points
        y_hat = y_hat.astype('str')

        # Assign output_name
        default_name = 'cluster'
        new_name = options.get('output_name', None)
        output_name = self.rename_output(default_names=default_name,
                                         new_names=new_name)

        # Create output dataframe
        output = df_util.create_output_dataframe(
            y_hat=y_hat,
            nans=nans,
            output_names=output_name,
        )

        # Merge with original dataframe
        output = df_util.merge_predictions(df, output)
        return output
Example #11
File: TFIDF.py Project: TPLink32/spnk1
    def fit(self, df, options):
        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        # Make sure to turn off get_dummies
        X, _, self.columns = df_util.prepare_features(
            X=X, variables=self.feature_variables, get_dummies=False)

        self.estimator.fit(X.values.ravel())
Example #12
    def fit(self, df, options):
        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        X, _, self.columns = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            mlspl_limits=options.get('mlspl_limits'),
        )
        self.estimator.fit(X.values)
Example #13
    def fit(self, df, options):
        X = df.copy()

        X, nans, _ = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            mlspl_limits=options.get('mlspl_limits'),
            get_dummies=False)

        number_of_nulls = nans.sum()
        if number_of_nulls > 0:
            messages.warn('{} events with nulls were dropped.'.format(number_of_nulls))

        if self.nlags >= len(X):
            raise RuntimeError('k must be less than number of events.')

        # Only fields allowed (in case fields expanded through glob matching).
        if len(self.feature_variables) > 1:
            temp = 'You must specify only one field. Multiple fields found: {}'
            err = temp.format(', '.join(self.feature_variables))
            raise RuntimeError(err)

        # Only numeric inputs allowed.
        if X[self.feature_variables].dtypes.tolist()[0] == object:
            temp = '{} contains non-numeric data. {} only accepts numeric data.'
            err = temp.format(self.feature_variables[0], self.__class__.__name__)
            raise RuntimeError(err)

        # Get calculation
        autocors, conf_int = self._calculate(X)
        conf_int = conf_int - conf_int.mean(1)[:, None]

        # autocors[:, None] converts 1D-array to 2D for concatenation match
        autocors_2d = autocors[:, None]
        stacked = np.concatenate([autocors_2d, conf_int], axis=1)

        # Get the default name
        output_name = options.get('output_name', self.feature_variables[0])
        name = self.default_name.format(output_name)

        # Lower and upper names
        confidence_interval = alpha_to_confidence_interval(self.alpha)
        lower_name = 'lower{}({})'.format(confidence_interval, name)
        upper_name = 'upper{}({})'.format(confidence_interval, name)

        # Splunk arranges columns via ascii ordering
        # So the capital L on Lag ensures it will be in the leftmost column
        output_names = ['Lag', name, lower_name, upper_name]

        output = pd.DataFrame(stacked)
        output = output.reset_index()
        output.columns = output_names

        return output
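The _calculate helper is not shown. For an autocorrelation algorithm it plausibly wraps statsmodels' acf, which returns the coefficients plus a confidence-interval array when alpha is given; this sketch is an assumption, not the verified Splunk MLTK source.

import numpy as np
from statsmodels.tsa.stattools import acf

def _calculate(self, X):
    series = X[self.feature_variables[0]].values.astype(float)
    # With alpha set, acf returns (coefficients, (nlags+1, 2) conf bounds)
    autocors, conf_int = acf(series, nlags=self.nlags, alpha=self.alpha)
    return autocors, conf_int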
Example #14
    def partial_fit(self, df, options):
        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        X, _, columns = df_util.prepare_features(X, self.feature_variables)
        if self.columns is not None:
            df_util.handle_new_categorical_values(X, None, options,
                                                  self.columns)
            if X.empty:
                return
        else:
            self.columns = columns
        self.estimator.partial_fit(X)
Example #15
    def fit(self, df, options):
        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        # Make sure to turn off get_dummies
        X, _, self.columns = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            get_dummies=False,
            mlspl_limits=options.get('mlspl_limits'),
        )

        X = X.values.ravel().astype('str')
        self.estimator.fit(X)
Example #16
    def fit(self, df, options):
        X = df.copy()
        X, nans, columns = df_util.prepare_features(X, self.feature_variables)

        def f(x):
            return savgol_filter(x, self.window_length, self.polyorder,
                                 self.deriv)

        y_hat = np.apply_along_axis(f, 0, X)

        names = ['SG_%s' % col for col in columns]
        output_df = df_util.create_output_dataframe(y_hat, nans, names)
        df = df_util.merge_predictions(df, output_df)

        return df
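scipy.signal.savgol_filter fits a low-order polynomial over a sliding window; window_length must be odd and greater than polyorder. A quick standalone use, with illustrative sample values:

import numpy as np
from scipy.signal import savgol_filter

x = np.array([1.0, 2.0, 1.5, 3.0, 2.5, 4.0, 3.5])
smoothed = savgol_filter(x, window_length=5, polyorder=2, deriv=0)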
Example #17
    def apply(self, df, options):
        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        # Prepare the dataset
        X, nans, columns = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            final_columns=self.columns,
            mlspl_limits=options.get('mlspl_limits'),
        )
        # Make predictions
        y_hat = self.estimator.predict(X.values)

        # Assign output_name
        default_name = 'predicted({})'.format(self.target_variable)
        output_name = options.get('output_name', default_name)

        # Create output
        output = df_util.create_output_dataframe(
            y_hat=y_hat,
            nans=nans,
            output_names=output_name,
        )
        if self.check_probabilities(options):
            # predict probs
            y_hat_proba = self.estimator.predict_proba(X.values)

            # get names
            class_names = [
                'probability({}={})'.format(self.target_variable, cls_name)
                for cls_name in self.estimator.classes_
            ]

            # create output data frame
            output_proba = df_util.create_output_dataframe(
                y_hat=y_hat_proba,
                nans=nans,
                output_names=class_names,
            )
            # combine
            output = pd.concat([output, output_proba], axis=1)

        df = df_util.merge_predictions(df, output)
        return df
Example #18
    def fit(self, df, options):
        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        X, nans, _ = df_util.prepare_features(
            X=X, variables=self.feature_variables, mlspl_limits=options.get('mlspl_limits')
        )

        y_hat = self.estimator.fit_predict(X.values)

        default_name = 'cluster'
        output_name = options.get('output_name', default_name)

        output = df_util.create_output_dataframe(
            y_hat=y_hat, nans=nans, output_names=output_name
        )
        df = df_util.merge_predictions(df, output)
        return df
Example #19
    def partial_fit(self, df, options):
        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        algo_util.assert_estimator_supports_partial_fit(self.estimator)
        X, _, columns = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            mlspl_limits=options.get('mlspl_limits'),
        )

        if getattr(self, 'columns', None):
            df_util.handle_new_categorical_values(X, None, options,
                                                  self.columns)
            if X.empty:
                return
        else:
            self.columns = columns

        self.estimator.partial_fit(X)
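assert_estimator_supports_partial_fit is another helper whose source is not included here. A hypothetical sketch consistent with how it is used above; the real algo_util implementation may differ.

def assert_estimator_supports_partial_fit(estimator):
    # Hypothetical: incremental fitting requires a partial_fit method
    if not hasattr(estimator, 'partial_fit'):
        raise RuntimeError('%s does not support partial fit'
                           % type(estimator).__name__)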
Example #20
    def apply(self, df, options):
        """Apply is overridden to add additional 'cluster_distance' column."""
        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        X, nans, _ = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            final_columns=self.columns,
            mlspl_limits=options.get('mlspl_limits'),
        )
        y_hat = self.estimator.predict(X.values)

        default_name = 'cluster'
        output_name = options.get('output_name', default_name)

        output = df_util.create_output_dataframe(
            y_hat=y_hat,
            nans=nans,
            output_names=output_name,
        )
        df_values = X[self.columns].values
        cluster_ctrs = self.estimator.cluster_centers_

        # Note: izip is Python 2's itertools.izip (use zip on Python 3)
        dist = [
            np.nan if np.isnan(cluster) else np.sum(
                np.square(cluster_ctrs[cluster] - row))
            for (cluster, row) in izip(y_hat, df_values)
        ]

        dist_df = df_util.create_output_dataframe(
            y_hat=dist,
            nans=nans,
            output_names='cluster_distance',
        )

        output = df_util.merge_predictions(output, dist_df)
        df = df_util.merge_predictions(df, output)
        df[output_name] = df[output_name].apply(lambda c: ''
                                                if np.isnan(c) else int(c))
        return df
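The list comprehension above computes the squared Euclidean distance from each event to its assigned centroid. An equivalent vectorized sketch with the same semantics; the helper name is illustrative.

import numpy as np

def cluster_distances(y_hat, points, centers):
    y = np.asarray(y_hat, dtype=float)
    dist = np.full(len(y), np.nan)
    valid = ~np.isnan(y)
    idx = y[valid].astype(int)
    dist[valid] = np.sum((centers[idx] - points[valid]) ** 2, axis=1)
    return dist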
Example #21
    def apply(self, df, options=None):
        # Guard the default: options is read via .get() below
        options = options or {}
        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        X, nans, columns = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            final_columns=self.columns,
        )

        scaled_X = self.scaler.transform(X.values)
        y_hat = self.estimator.predict(scaled_X)

        default_name = 'predicted({})'.format(self.target_variable)
        output_name = options.get('output_name', default_name)

        output = df_util.create_output_dataframe(
            y_hat=y_hat,
            nans=nans,
            output_names=output_name,
        )

        df = df_util.merge_predictions(df, output)
        return df
    def fit(self, df, options):
        # df contains all the search results, including hidden fields,
        # but the requested fields are saved as self.feature_variables
        logger = get_logger('MyCustomLogging')

        X = df.copy()

        # It is always best practice to prepare your data.
        # Splunk has a number of hidden fields that are exposed as part of
        # the search protocol, and we really only want the features that
        # are valid field names.

        # Make sure to turn off get_dummies
        X, _, self.columns = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            get_dummies=False,
            mlspl_limits=options.get('mlspl_limits'),
        )

        # Test that the user field exists in the prepared data
        logger.debug("The user field is %s", self.user_field)
        try:
            my_list_index = X[self.user_field].values
        except KeyError:
            raise RuntimeError(
                'You must specify a user field that exists. You sent: %s'
                % self.user_field)

        X = X.drop([self.user_field], axis=1)
        my_list_header = X.columns.values

        # Ratings as a matrix; replace infinities and missing values with 0
        X = X.replace([np.inf, -np.inf], np.nan).fillna(0)
        matrix = X.values
        # Force the dtype for NumPy math
        matrix = matrix.astype(np.float64)

        # should consider erroring out when you have super sparse user data
        # TODO add other methods via parameter
        user_sim = pairwise_distances(matrix, metric='cosine')
        item_sim = pairwise_distances(matrix.T, metric='cosine')

        # Item-based prediction: weight each rating by item-item similarity
        item_sim = matrix.dot(item_sim) / np.array(
            [np.abs(item_sim).sum(axis=1)])

        # User-based prediction: each user's mean rating plus
        # similarity-weighted deviations from other users' means
        mean_user_rating = matrix.mean(axis=1)
        matrix_diff = matrix - mean_user_rating[:, np.newaxis]
        user_sim = mean_user_rating[:, np.newaxis] + user_sim.dot(
            matrix_diff) / np.array([np.abs(user_sim).sum(axis=1)]).T

        # Reattach the header row and user index to the prediction matrix
        if self.rating_type == "item":
            output_df = pd.DataFrame(item_sim,
                                     columns=my_list_header,
                                     index=my_list_index)
        elif self.rating_type == "user":
            output_df = pd.DataFrame(user_sim,
                                     columns=my_list_header,
                                     index=my_list_index)
        else:
            raise RuntimeError('rating_type must be "item" or "user".')
        output_df[self.user_field] = pd.Series(my_list_index).values

        return output_df
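The user-based step implements the classic memory-based collaborative filtering formula pred(u, i) = mean(u) + sum_v sim(u, v) * (R[v, i] - mean(v)) / sum_v |sim(u, v)|. One caveat worth noting: pairwise_distances(metric='cosine') returns cosine distance (1 - similarity), whereas textbook CF weights by similarity, so sklearn.metrics.pairwise.cosine_similarity may be what is intended here. A tiny worked version mirroring the code, with made-up ratings:

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

R = np.array([[5.0, 3.0, 0.0],
              [4.0, 0.0, 0.0],
              [1.0, 1.0, 5.0]])

user_sim = pairwise_distances(R, metric='cosine')  # NB: distance, not similarity
mean_user = R.mean(axis=1)
diff = R - mean_user[:, np.newaxis]
pred = (mean_user[:, np.newaxis]
        + user_sim.dot(diff) / np.abs(user_sim).sum(axis=1)[:, np.newaxis])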