def fit(self, df, options): """Do the clustering & merge labels with original data.""" # Make a copy of the input data X = df.copy() # Use the df_util prepare_features method to # - drop null columns & rows # - convert categorical columns into dummy indicator columns # X is our cleaned data, nans is a mask of the null value locations X, nans, columns = df_util.prepare_features(X, self.feature_variables) # Do the actual clustering y_hat = self.estimator.fit_predict(X.values) # attach silhouette coefficient score for each row silhouettes = silhouette_samples(X, y_hat) # Combine the two arrays, and transpose them. y_hat = np.vstack([y_hat, silhouettes]).T # Assign default output names default_name = 'cluster' # Get the value from the as-clause if present output_name = options.get('output_name', default_name) # There are two columns - one for the labels, for the silhouette scores output_names = [output_name, 'silhouette_score'] # Use the predictions & nans-mask to create a new dataframe output_df = df_util.create_output_dataframe(y_hat, nans, output_names) # Merge the dataframe with the original input data df = df_util.merge_predictions(df, output_df) return df
def apply(self, df, options=None):
    # Guard against a missing options dict (options is accessed below)
    options = options or {}

    # Handle backwards compatibility
    add_missing_attr(self.estimator, attr='max_iter', value=5, param_key='n_iter')
    add_missing_attr(self.estimator, attr='tol', value=None)

    # Make a copy of data, to not alter original dataframe
    X = df.copy()

    X, nans, columns = df_util.prepare_features(
        X=X,
        variables=self.feature_variables,
        final_columns=self.columns,
        mlspl_limits=options.get('mlspl_limits'),
    )

    scaled_X = self.scaler.transform(X.values)
    y_hat = self.estimator.predict(scaled_X)

    default_name = 'predicted({})'.format(self.target_variable)
    output_name = options.get('output_name', default_name)
    output = df_util.create_output_dataframe(
        y_hat=y_hat,
        nans=nans,
        output_names=output_name,
    )
    output = df_util.merge_predictions(df, output)
    return output
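
# add_missing_attr is an internal compatibility helper not shown here. A
# plausible sketch of what such a helper might do, purely an assumption about
# its behavior (the real signature and semantics may differ):
def add_missing_attr(estimator, attr, value, param_key=None):
    """Backfill `attr` on estimators pickled by older library versions."""
    if not hasattr(estimator, attr):
        # Prefer the value stored under the old parameter name, if present
        old_value = getattr(estimator, param_key, None) if param_key else None
        setattr(estimator, attr, old_value if old_value is not None else value)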
def fit(self, df, options): """Compute the polynomial features and return a DataFrame""" (X, nans, columns) = prepare_features(df.copy(), self.feature_variables) X_hat = DataFrame(self.preprocessor.fit_transform(X), columns=self.get_feature_names(columns)) return merge_predictions(df, X_hat)
def apply(self, df, options):
    # Make a copy of data, to not alter original dataframe
    X = df.copy()
    X, nans, columns = df_util.prepare_features(
        X=X,
        variables=self.feature_variables,
        final_columns=self.columns,
    )
    y_hat = self.estimator.transform(X.values)
    mask = self.estimator.get_support()
    columns_select = np.array(self.columns)[mask]
    width = len(columns_select)

    if width == 0:
        cexc.messages.warn(
            'No fields pass the current configuration. Consider changing your parameters.'
        )

    default_name = 'fs'
    output_name = options.get('output_name', default_name)
    output_names = [output_name + '_%s' % x for x in columns_select]
    output = df_util.create_output_dataframe(
        y_hat=y_hat,
        nans=nans,
        output_names=output_names,
    )
    df = df_util.merge_predictions(df, output)
    return df
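
# A minimal sketch of the transform/get_support pair used above, assuming a
# scikit-learn feature selector; the toy data, k, and column names are
# illustrative only.
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif

X_toy = np.random.default_rng(0).normal(size=(30, 4))
y_toy = (X_toy[:, 0] > 0).astype(int)
selector = SelectKBest(f_classif, k=2).fit(X_toy, y_toy)
mask = selector.get_support()  # boolean mask over the input columns
selected = np.array(['a', 'b', 'c', 'd'])[mask]
print(selected, selector.transform(X_toy).shape)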
def fit(self, df, options):
    # Make a copy of data, to not alter original dataframe
    X = df.copy()
    X, nans, _ = df_util.prepare_features(
        X=X,
        variables=self.feature_variables,
    )

    if 0 < len(X) <= self.estimator.n_clusters:
        raise RuntimeError(
            'k must be smaller than the number of events used as input')

    scaled_X = self.scaler.fit_transform(X.values)
    y_hat = self.estimator.fit_predict(scaled_X)
    y_hat = ['' if np.isnan(v) else '%.0f' % v for v in y_hat]

    default_name = 'cluster'
    output_name = options.get('output_name', default_name)
    output = df_util.create_output_dataframe(
        y_hat=y_hat,
        nans=nans,
        output_names=output_name,
    )
    df = df_util.merge_predictions(df, output)
    return df
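
# A standalone sketch of the scale-then-cluster pattern above, assuming
# sklearn's StandardScaler and KMeans; the toy data is illustrative only.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

X_toy = np.random.default_rng(0).normal(size=(20, 2))
scaled = StandardScaler().fit_transform(X_toy)  # mirrors self.scaler.fit_transform
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(scaled)
print(labels)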
def apply(self, df, options):
    # Make a copy of data, to not alter original dataframe
    X = df.copy()
    # Make sure to turn off get_dummies
    X, nans, _ = df_util.prepare_features(
        X=X,
        variables=self.feature_variables,
        final_columns=self.columns,
        get_dummies=False,
        mlspl_limits=options.get('mlspl_limits'),
    )
    X = X.values.ravel().astype('str')
    y_hat = self.estimator.transform(X)

    # Convert the returned sparse matrix into a dense array
    y_hat = y_hat.toarray()

    output_names = self.make_output_names(options)
    output = df_util.create_output_dataframe(
        y_hat=y_hat,
        output_names=output_names,
        nans=nans,
    )
    df = df_util.merge_predictions(df, output)
    return df
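
# A minimal sketch of a text transform that returns a sparse matrix, assuming a
# scikit-learn vectorizer (which estimator this class actually wraps is not
# shown here); the documents are illustrative only.
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['error in module a', 'module a restarted']
vec = TfidfVectorizer().fit(docs)
sparse = vec.transform(docs)
dense = sparse.toarray()  # same conversion as y_hat.toarray() above
print(dense.shape)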
def apply(self, df, options):
    # Make a copy of data, to not alter original dataframe
    X = df.copy()

    # Prepare the dataset
    X, nans, _ = df_util.prepare_features(
        X=X,
        variables=self.feature_variables,
        final_columns=self.columns,
        mlspl_limits=options.get('mlspl_limits'),
    )

    # Make predictions
    y_hat = self.estimator.predict(X.values)

    # Assign output_name
    default_name = 'predicted({})'.format(self.target_variable)
    new_name = options.get('output_name', None)
    output_name = self.rename_output(default_names=default_name, new_names=new_name)

    # Create output dataframe
    output = df_util.create_output_dataframe(
        y_hat=y_hat,
        nans=nans,
        output_names=output_name,
    )

    # Merge with original dataframe
    output = df_util.merge_predictions(df, output)
    return output
def apply(self, df, options):
    logger = get_logger('IsolationForest Logger')

    # Make a copy of data, to not alter original dataframe
    X = df.copy()
    X, nans, _ = df_util.prepare_features(
        X=X,
        variables=self.feature_variables,
        final_columns=self.columns,
        mlspl_limits=options.get('mlspl_limits'),
    )

    # Multiply the result by -1 so that outliers are represented by 1 and
    # inliers/normal points by -1 (sklearn predicts -1 for outliers).
    y_hat = self.estimator.predict(X.values) * -1

    # Log the share of events labeled -1 (inliers after the sign flip).
    # Despite the label, this is a proportion of the predictions, not an
    # accuracy against ground truth.
    accuracy = "Accuracy: {}".format(str(round((list(y_hat).count(-1) * 100) / y_hat.shape[0], 2)))
    logger.debug(accuracy)

    y_hat = y_hat.astype('str')

    # Assign output_name
    default_name = 'isOutlier'
    new_name = options.get('output_name', None)
    output_name = self.rename_output(default_names=default_name, new_names=new_name)

    # Create output dataframe
    output = df_util.create_output_dataframe(
        y_hat=y_hat,
        nans=nans,
        output_names=output_name,
    )

    # Merge with original dataframe
    output = df_util.merge_predictions(df, output)
    return output
def apply(self, df, options):
    # Make a copy of data, to not alter original dataframe
    X = df.copy()

    # Prepare the features
    X, nans, _ = df_util.prepare_features(
        X=X,
        variables=self.feature_variables,
        final_columns=self.columns,
    )

    # Fit and transform the data
    y_hat = self.estimator.fit_transform(X.values)

    # Assign output names, one per output column
    output_name = options.get('output_name', None)
    default_names = self.make_output_names(
        output_name=output_name,
        n_names=y_hat.shape[1],
    )
    output_names = self.rename_output(default_names, output_name)

    # Create output dataframe
    output = df_util.create_output_dataframe(
        y_hat=y_hat,
        nans=nans,
        output_names=output_names,
    )

    # Merge with original dataframe
    output = df_util.merge_predictions(df, output)
    return output
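
# A minimal sketch of a fit_transform whose output width drives the number of
# column names above, assuming a scikit-learn decomposition; the estimator and
# naming scheme here are hypothetical, not necessarily the one this class wraps.
import numpy as np
from sklearn.decomposition import PCA

X_toy = np.random.default_rng(0).normal(size=(10, 4))
y_hat = PCA(n_components=2).fit_transform(X_toy)
names = ['PC_{}'.format(i + 1) for i in range(y_hat.shape[1])]
print(names, y_hat.shape)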
def apply(self, df, options):
    # Make a copy of data, to not alter original dataframe
    X = df.copy()
    X, nans, _ = df_util.prepare_features(
        X=X,
        variables=self.feature_variables,
        final_columns=self.columns,
        mlspl_limits=options.get('mlspl_limits'),
    )
    y_hat = self.estimator.predict(X.values)

    # Cast to string so cluster labels are not rendered as floating points
    y_hat = y_hat.astype('str')

    # Assign output_name
    default_name = 'cluster'
    new_name = options.get('output_name', None)
    output_name = self.rename_output(default_names=default_name, new_names=new_name)

    # Create output dataframe
    output = df_util.create_output_dataframe(
        y_hat=y_hat,
        nans=nans,
        output_names=output_name,
    )

    # Merge with original dataframe
    output = df_util.merge_predictions(df, output)
    return output
def fit(self, df, options):
    # Make a copy of data, to not alter original dataframe
    X = df.copy()
    # Make sure to turn off get_dummies
    X, _, self.columns = df_util.prepare_features(
        X=X,
        variables=self.feature_variables,
        get_dummies=False,
    )
    self.estimator.fit(X.values.ravel())
def fit(self, df, options):
    # Make a copy of data, to not alter original dataframe
    X = df.copy()
    X, _, self.columns = df_util.prepare_features(
        X=X,
        variables=self.feature_variables,
        mlspl_limits=options.get('mlspl_limits'),
    )
    self.estimator.fit(X.values)
def fit(self, df, options):
    X = df.copy()
    X, nans, _ = df_util.prepare_features(
        X=X,
        variables=self.feature_variables,
        mlspl_limits=options.get('mlspl_limits'),
        get_dummies=False,
    )

    number_of_nulls = nans.sum()
    if number_of_nulls > 0:
        messages.warn('{} events with nulls were dropped.'.format(number_of_nulls))

    if self.nlags >= len(X):
        raise RuntimeError('k must be less than the number of events.')

    # Only one field allowed (in case fields expanded through glob matching).
    if len(self.feature_variables) > 1:
        temp = 'You must specify only one field. Multiple fields found: {}'
        err = temp.format(', '.join(self.feature_variables))
        raise RuntimeError(err)

    # Only numeric inputs allowed.
    if X[self.feature_variables].dtypes.tolist()[0] == object:
        temp = '{} contains non-numeric data. {} only accepts numeric data.'
        err = temp.format(self.feature_variables[0], self.__class__.__name__)
        raise RuntimeError(err)

    # Get the autocorrelations and their confidence intervals
    autocors, conf_int = self._calculate(X)
    conf_int = conf_int - conf_int.mean(1)[:, None]

    # autocors[:, None] converts the 1D array to 2D so shapes match for concatenation
    autocors_2d = autocors[:, None]
    stacked = np.concatenate([autocors_2d, conf_int], axis=1)

    # Get the default name
    output_name = options.get('output_name', self.feature_variables[0])
    name = self.default_name.format(output_name)

    # Lower and upper names
    confidence_interval = alpha_to_confidence_interval(self.alpha)
    lower_name = 'lower{}({})'.format(confidence_interval, name)
    upper_name = 'upper{}({})'.format(confidence_interval, name)

    # Splunk arranges columns via ASCII ordering,
    # so the capital L on Lag ensures it will be in the leftmost column.
    output_names = ['Lag', name, lower_name, upper_name]
    output = pd.DataFrame(stacked)
    output = output.reset_index()
    output.columns = output_names
    return output
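
# _calculate is not shown here. A minimal sketch, assuming it wraps statsmodels'
# acf with the configured nlags and alpha (this is an assumption about the
# implementation, not the original source):
import numpy as np
from statsmodels.tsa.stattools import acf

series = np.random.default_rng(0).normal(size=200)
autocors, conf_int = acf(series, nlags=10, alpha=0.05)
print(autocors.shape, conf_int.shape)  # (11,) and (11, 2): lags 0..10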
def partial_fit(self, df, options):
    # Make a copy of data, to not alter original dataframe
    X = df.copy()
    X, _, columns = df_util.prepare_features(X, self.feature_variables)
    if self.columns is not None:
        df_util.handle_new_categorical_values(X, None, options, self.columns)
        if X.empty:
            return
    else:
        self.columns = columns
    self.estimator.partial_fit(X)
def fit(self, df, options):
    # Make a copy of data, to not alter original dataframe
    X = df.copy()
    # Make sure to turn off get_dummies
    X, _, self.columns = df_util.prepare_features(
        X=X,
        variables=self.feature_variables,
        get_dummies=False,
        mlspl_limits=options.get('mlspl_limits'),
    )
    X = X.values.ravel().astype('str')
    self.estimator.fit(X)
def fit(self, df, options):
    X = df.copy()
    X, nans, columns = df_util.prepare_features(X, self.feature_variables)

    def f(x):
        return savgol_filter(x, self.window_length, self.polyorder, self.deriv)

    # Apply the Savitzky-Golay filter down each column
    y_hat = np.apply_along_axis(f, 0, X)
    names = ['SG_%s' % col for col in columns]
    output_df = df_util.create_output_dataframe(y_hat, nans, names)
    df = df_util.merge_predictions(df, output_df)
    return df
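
# A standalone sketch of the Savitzky-Golay smoothing applied above, using
# scipy.signal.savgol_filter directly; the noisy sine and filter settings are
# illustrative only.
import numpy as np
from scipy.signal import savgol_filter

t = np.linspace(0, 2 * np.pi, 100)
noisy = np.sin(t) + np.random.default_rng(0).normal(scale=0.1, size=t.size)
smoothed = savgol_filter(noisy, window_length=11, polyorder=3, deriv=0)
print(np.abs(smoothed - np.sin(t)).mean() < np.abs(noisy - np.sin(t)).mean())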
def apply(self, df, options):
    # Make a copy of data, to not alter original dataframe
    X = df.copy()

    # Prepare the dataset
    X, nans, columns = df_util.prepare_features(
        X=X,
        variables=self.feature_variables,
        final_columns=self.columns,
        mlspl_limits=options.get('mlspl_limits'),
    )

    # Make predictions
    y_hat = self.estimator.predict(X.values)

    # Assign output_name
    default_name = 'predicted({})'.format(self.target_variable)
    output_name = options.get('output_name', default_name)

    # Create output
    output = df_util.create_output_dataframe(
        y_hat=y_hat,
        nans=nans,
        output_names=output_name,
    )

    if self.check_probabilities(options):
        # Predict class probabilities
        y_hat_proba = self.estimator.predict_proba(X.values)

        # Name one column per class
        class_names = [
            'probability({}={})'.format(self.target_variable, cls_name)
            for cls_name in self.estimator.classes_
        ]

        # Create the probability output dataframe
        output_proba = df_util.create_output_dataframe(
            y_hat=y_hat_proba,
            nans=nans,
            output_names=class_names,
        )

        # Combine predictions and probabilities
        output = pd.concat([output, output_proba], axis=1)

    df = df_util.merge_predictions(df, output)
    return df
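
# A minimal sketch of the predict/predict_proba pairing used above, assuming a
# scikit-learn classifier; the toy data and the 'target' name are illustrative.
import numpy as np
from sklearn.linear_model import LogisticRegression

X_toy = np.array([[0.0], [0.2], [0.9], [1.1]])
y_toy = np.array(['no', 'no', 'yes', 'yes'])
clf = LogisticRegression().fit(X_toy, y_toy)
proba = clf.predict_proba(X_toy)  # columns follow clf.classes_ ordering
names = ['probability(target={})'.format(c) for c in clf.classes_]
print(names, proba.round(2))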
def fit(self, df, options):
    # Make a copy of data, to not alter original dataframe
    X = df.copy()
    X, nans, _ = df_util.prepare_features(
        X=X,
        variables=self.feature_variables,
        mlspl_limits=options.get('mlspl_limits'),
    )
    y_hat = self.estimator.fit_predict(X.values)
    default_name = 'cluster'
    output_name = options.get('output_name', default_name)
    output = df_util.create_output_dataframe(
        y_hat=y_hat,
        nans=nans,
        output_names=output_name,
    )
    df = df_util.merge_predictions(df, output)
    return df
def partial_fit(self, df, options):
    # Make a copy of data, to not alter original dataframe
    X = df.copy()
    algo_util.assert_estimator_supports_partial_fit(self.estimator)
    X, _, columns = df_util.prepare_features(
        X=X,
        variables=self.feature_variables,
        mlspl_limits=options.get('mlspl_limits'),
    )
    if getattr(self, 'columns', None):
        df_util.handle_new_categorical_values(X, None, options, self.columns)
        if X.empty:
            return
    else:
        self.columns = columns
    self.estimator.partial_fit(X)
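
# assert_estimator_supports_partial_fit is an internal helper not shown here.
# A plausible sketch of the check it likely performs (an assumption, not the
# original source):
def assert_estimator_supports_partial_fit(estimator):
    if not hasattr(estimator, 'partial_fit'):
        raise RuntimeError(
            '{} does not support incremental (partial) fit'.format(
                type(estimator).__name__))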
def apply(self, df, options): """Apply is overridden to add additional 'cluster_distance' column.""" # Make a copy of data, to not alter original dataframe X = df.copy() X, nans, _ = df_util.prepare_features( X=X, variables=self.feature_variables, final_columns=self.columns, mlspl_limits=options.get('mlspl_limits'), ) y_hat = self.estimator.predict(X.values) default_name = 'cluster' output_name = options.get('output_name', default_name) output = df_util.create_output_dataframe( y_hat=y_hat, nans=nans, output_names=output_name, ) df_values = X[self.columns].values cluster_ctrs = self.estimator.cluster_centers_ dist = [ np.nan if np.isnan(cluster) else np.sum( np.square(cluster_ctrs[cluster] - row)) for (cluster, row) in izip(y_hat, df_values) ] dist_df = df_util.create_output_dataframe( y_hat=dist, nans=nans, output_names='cluster_distance', ) output = df_util.merge_predictions(output, dist_df) df = df_util.merge_predictions(df, output) df[output_name] = df[output_name].apply(lambda c: '' if np.isnan(c) else int(c)) return df
def apply(self, df, options=None):
    # Guard against a missing options dict (options is accessed below)
    options = options or {}

    # Make a copy of data, to not alter original dataframe
    X = df.copy()
    X, nans, columns = df_util.prepare_features(
        X=X,
        variables=self.feature_variables,
        final_columns=self.columns,
    )
    scaled_X = self.scaler.transform(X.values)
    y_hat = self.estimator.predict(scaled_X)
    default_name = 'predicted({})'.format(self.target_variable)
    output_name = options.get('output_name', default_name)
    output = df_util.create_output_dataframe(
        y_hat=y_hat,
        nans=nans,
        output_names=output_name,
    )
    df = df_util.merge_predictions(df, output)
    return df
def fit(self, df, options):
    # df contains all the search results, including hidden fields,
    # but the requested fields are saved as self.feature_variables
    logger = get_logger('MyCustomLogging')
    X = df.copy()

    # It is always best practice to prepare your data. Splunk has a number of
    # hidden fields that are exposed as part of the search protocol, and we
    # really only want the features that are valid field names.
    # Make sure to turn off get_dummies
    X, _, self.columns = df_util.prepare_features(
        X=X,
        variables=self.feature_variables,
        get_dummies=False,
        mlspl_limits=options.get('mlspl_limits'),
    )

    # Check that the user field exists in the prepared data
    logger.debug("The user field is %s", self.user_field)
    try:
        my_list_index = X[self.user_field].values
    except KeyError:
        raise RuntimeError(
            'You must specify a user field that exists. You sent %s' % self.user_field)
    X = X.drop([self.user_field], axis=1)
    my_list_header = X.columns.values

    # Ratings as a matrix; clean the data up
    X = X.replace([np.inf, -np.inf], "nan").replace("nan", "0")
    matrix = X.values
    # Force the type for NumPy math
    matrix = matrix.astype(np.float64)

    # Should consider erroring out when you have super sparse user data
    # TODO: add other methods via a parameter
    user_sim = pairwise_distances(matrix, metric='cosine')
    item_sim = pairwise_distances(matrix.T, metric='cosine')

    # Item-based prediction
    item_sim = matrix.dot(item_sim) / np.array(
        [np.abs(item_sim).sum(axis=1)])

    # User-based prediction, normalized by each user's mean rating
    mean_user_rating = matrix.mean(axis=1)
    matrix_diff = matrix - mean_user_rating[:, np.newaxis]
    user_sim = mean_user_rating[:, np.newaxis] + user_sim.dot(
        matrix_diff) / np.array([np.abs(user_sim).sum(axis=1)]).T

    # Add the header row back into the matrix
    if self.rating_type == "item":
        output_df = pd.DataFrame(item_sim, columns=my_list_header, index=my_list_index)
    if self.rating_type == "user":
        output_df = pd.DataFrame(user_sim, columns=my_list_header, index=my_list_index)
    output_df[self.user_field] = pd.Series(my_list_index).values
    return output_df
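
# One caveat worth knowing: sklearn's pairwise_distances(..., metric='cosine')
# returns cosine *distance* (1 - cosine similarity), not similarity, so the
# user_sim/item_sim names above actually hold distances. A quick demonstration:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances

ratings = np.array([[5.0, 3.0, 0.0], [4.0, 0.0, 1.0]])
dist = pairwise_distances(ratings, metric='cosine')
sim = cosine_similarity(ratings)
print(np.allclose(dist, 1 - sim))  # True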