Example #1
	def train(self, search_string, X_fields, Y_field):
		'''
		trains the model using normal equations

		theta = (X^T X)^-1 X^T y
		'''
		# computing these values in Splunk itself doesn't seem efficient; if we just pull out X and y,
		# we can find everything with numpy, so let's do that
		# make a feature mapping
		feature_mapping = {X_fields[i]:i for i in range(len(X_fields))}
		self.feature_mapping = feature_mapping
		# make the search job
		search_kwargs = {'timeout':1000, 'exec_mode':'blocking'}
		job = self.jobs.create('search ' + search_string + '| table %s' % ' '.join(X_fields) + ' ' + Y_field, **search_kwargs)
		X, y = sm.job_to_numpy_reps(job, feature_mapping, Y_field, ('continuous','continuous'), bias=True)

		# now we solve the normal equations to find theta
		self.theta = np.dot(np.dot(np.linalg.inv(np.dot(X.T, X)), X.T), y)
		print(self.theta.shape)
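
As a standalone sketch (my own illustration, not code from the repository above), the same normal-equation fit can be reproduced on synthetic numpy data; solving the system with np.linalg.solve is numerically preferable to forming the explicit inverse and yields the same theta:

import numpy as np

# hypothetical toy data, assuming a leading bias column like bias=True produces above
np.random.seed(0)
m = 100
X = np.hstack([np.ones((m, 1)), np.random.randn(m, 3)])
true_theta = np.array([2.0, -1.0, 0.5, 3.0])
y = np.dot(X, true_theta) + 0.01 * np.random.randn(m)

# theta = (X^T X)^-1 X^T y, computed without forming the explicit inverse
theta = np.linalg.solve(np.dot(X.T, X), np.dot(X.T, y))
print(theta)        # should be close to true_theta
print(theta.shape)  # (4,)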
Example #2
	def train(self, search_string, X_fields, n):
		'''
		trains the model using PCA definition:

		Y = XV, where the columns of V are the first n eigenvectors of the covariance matrix (1/m) X^T X

		X is mxd, V is dxn -> Y is mxn, each row an example now with dimension n
		'''
		# computing these values in Splunk itself doesn't seem efficient (it is done in Splunk for gda); if we just pull out X,
		# we can find everything with numpy, so let's do that
		# make a feature mapping
		feature_mapping = {X_fields[i]:i for i in range(len(X_fields))}
		self.feature_mapping = feature_mapping
		# make the search job
		search_kwargs = {'timeout':1000, 'exec_mode':'blocking'}
		job = self.jobs.create('search ' + search_string + '| table %s' % ' '.join(X_fields), **search_kwargs)
		# note that we just discard y here because of how job_to_numpy_reps was coded; will change later
		X, y = sm.job_to_numpy_reps(job, feature_mapping, X_fields[0], ('continuous','continuous')) 

		# now we find the principal n eigenvectors
		# first subtract the means; save them to use later
		self.means = np.average(X, axis=0)
		X = X - self.means

		# now compute the (unscaled) covariance matrix; omitting the 1/m factor doesn't change the eigenvectors
		cov = np.dot(X.T, X)

		# now compute eigenvectors
		eigval, eigvec = np.linalg.eig(cov)

		# indices of the n largest eigenvalues, in descending order
		sorted_indices = np.argsort(eigval)[::-1][:n]

		# find the right eigvecs; now the principal n eigvecs are the columns of principal_n_eigvecs
		principal_n_eigvecs = eigvec[:,sorted_indices]

		# store it in self.components
		self.components = principal_n_eigvecs
		print(self.components.shape)
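
As a standalone sketch (my own illustration, not code from the repository above), the same PCA steps and the projection Y = XV can be checked on synthetic numpy data:

import numpy as np

# hypothetical toy data
np.random.seed(0)
m, d, n = 200, 5, 2
X = np.random.randn(m, d)

means = np.average(X, axis=0)       # center the data, as above
Xc = X - means

cov = np.dot(Xc.T, Xc)              # unscaled covariance; the 1/m factor doesn't change the eigenvectors
eigval, eigvec = np.linalg.eig(cov)
top = np.argsort(eigval)[::-1][:n]  # indices of the n largest eigenvalues
V = eigvec[:, top]                  # d x n, principal eigenvectors as columns

Y = np.dot(Xc, V)                   # m x n, each row an example projected to n dimensions
print(Y.shape)                      # (200, 2)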