def train(self, search_string, X_fields, Y_field):
    '''
    Fits a linear regression by solving the normal equations:
        (X^T X) theta = X^T y

    search_string -- Splunk search fragment that selects the training events
    X_fields      -- list of field names to use as (continuous) features
    Y_field       -- field name of the (continuous) target

    Side effects: sets self.feature_mapping and self.theta.
    '''
    # Computing these statistics inside Splunk itself doesn't seem efficient;
    # instead we pull out X and y and do all the linear algebra in numpy.

    # map each feature name to its column index in X
    feature_mapping = {X_fields[i]: i for i in range(len(X_fields))}
    self.feature_mapping = feature_mapping

    # run a blocking search so results are complete before we read them
    search_kwargs = {'timeout': 1000, 'exec_mode': 'blocking'}
    job = self.jobs.create(
        'search ' + search_string
        + '| table %s' % ' '.join(X_fields) + ' ' + Y_field,
        **search_kwargs)
    X, y = sm.job_to_numpy_reps(
        job, feature_mapping, Y_field, ('continuous', 'continuous'),
        bias=True)

    # Solve (X^T X) theta = X^T y directly.  np.linalg.solve is
    # numerically better conditioned than explicitly forming the inverse
    # with np.linalg.inv and multiplying it out.
    self.theta = np.linalg.solve(np.dot(X.T, X), np.dot(X.T, y))
    print(self.theta.shape)
def train(self, search_string, X_fields, n):
    '''
    Fits PCA: finds the n principal directions of the data.

    Definition: Y = XV where the columns of V are the first n eigenvectors
    of the covariance matrix 1/m (X^T X).
    X is mxd, V is dxn -> Y is mxn, each row an example now with dimension n.

    search_string -- Splunk search fragment that selects the training events
    X_fields      -- list of field names to use as (continuous) features
    n             -- number of principal components to keep

    Side effects: sets self.feature_mapping, self.means and self.components.
    '''
    # Computing these values inside Splunk itself doesn't seem efficient
    # (it's done in Splunk for gda); instead we pull out X and do the
    # linear algebra in numpy.

    # map each feature name to its column index in X
    feature_mapping = {X_fields[i]: i for i in range(len(X_fields))}
    self.feature_mapping = feature_mapping

    # run a blocking search so results are complete before we read them
    search_kwargs = {'timeout': 1000, 'exec_mode': 'blocking'}
    job = self.jobs.create(
        'search ' + search_string + '| table %s' % ' '.join(X_fields),
        **search_kwargs)
    # note that we are just disregarding the y here because of how
    # job_to_numpy_reps was coded. will change later.
    X, y = sm.job_to_numpy_reps(
        job, feature_mapping, X_fields[0], ('continuous', 'continuous'))

    # center the data; save the means so projection can subtract them later
    self.means = np.average(X, axis=0)
    X = X - self.means

    # covariance matrix (unnormalized; the 1/m factor scales eigenvalues
    # but leaves the eigenvectors, and their ordering, unchanged)
    cov = np.dot(X.T, X)

    # eigh is the right routine for a symmetric matrix: it guarantees real
    # eigenvalues/eigenvectors (eig can return spurious complex parts from
    # rounding) and returns eigenvalues in ascending order.
    eigval, eigvec = np.linalg.eigh(cov)

    # the principal n eigenvectors are therefore the LAST n columns,
    # reversed into descending-eigenvalue order
    principal_n_eigvecs = eigvec[:, ::-1][:, :n]

    # store them as the columns of self.components
    self.components = principal_n_eigvecs
    print(self.components.shape)
def train(self, search_string, X_fields, Y_field):
    '''
    Trains the model using the normal equations:
        (X^T X) theta = X^T y

    search_string -- Splunk search fragment that selects the training events
    X_fields      -- list of field names to use as (continuous) features
    Y_field       -- field name of the (continuous) target

    Side effects: sets self.feature_mapping and self.theta.
    '''
    # doesn't seem like finding these values in splunk itself is the most
    # efficient; if we just pull out X and y, we can find everything using
    # numpy. So let's do that.

    # make a feature mapping: feature name -> column index in X
    feature_mapping = {X_fields[i]: i for i in range(len(X_fields))}
    self.feature_mapping = feature_mapping

    # make the search job (blocking, so results are ready when we read)
    search_kwargs = {'timeout': 1000, 'exec_mode': 'blocking'}
    job = self.jobs.create(
        'search ' + search_string
        + '| table %s' % ' '.join(X_fields) + ' ' + Y_field,
        **search_kwargs)
    X, y = sm.job_to_numpy_reps(
        job, feature_mapping, Y_field, ('continuous', 'continuous'),
        bias=True)

    # now we solve the normal equations to find theta; solve() is more
    # numerically stable than forming inv(X^T X) explicitly
    self.theta = np.linalg.solve(np.dot(X.T, X), np.dot(X.T, y))
    print(self.theta.shape)