Example #1
    def fit(self,
            chips,
            specs=None,
            n_samples=0,
            counts=None,
            a=0,
            bounds=None):
        """Primary execution point. Run either standard KDE or class-membership
        based KDE. If any of the class-membership based KDE arguments are set,
        it will be run instead of standard KDE.
                
        Parameters
        ----------
        chips : list
            A list of chip model objects.
        
        specs : qikify.models.Specs, optional
            If using partitioned sampling, boundaries defining the pass /
            critical / fail subspaces must be provided.

        n_samples : int
            The number of samples to generate when running standard KDE.

        counts : dict, optional
            If using partitioned sampling, a counts dictionary must be
            provided, with three keys: nGood, nCritical, nFail.

        a : int, optional
            Passed to the per-class bandwidth factor computation
            (``_set_bandwith_factors``).

        bounds : array-like, optional
            Explicit data bounds; if not provided, the per-column min / max
            of the data are used.

        """
        X = pandas.DataFrame([chip.LCT for chip in chips])

        self.n, self.d = X.shape
        self.specs = specs
        self.columns = getattr(X, 'columns', None)

        # Normalize data/bounds
        self.scale_factors, self.Xn = standardize(X)
        self.bounds = standardize(np.array([X.min(0), X.max(0)]), \
                                  self.scale_factors)
        if bounds is not None:
            self.bounds = standardize(bounds, self.scale_factors)

        # Select bandwidth for Epanechnikov kernel (Rule of Thumb, see
        # Silverman, p.86)
        self.bandwidth = 0.8  # Magic number, default bandwidth scaling factor
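        # c_d below is the volume of the unit sphere in d dimensions.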
        self.c_d     = 2.0 * pow( np.pi, (self.d/2.0) ) / \
                       ( self.d * gamma(self.d/2) )
        self.h = self._compute_h(self.n, self.d, self.c_d, self.bandwidth)
        self._set_bandwith_factors(a)

        # Generate samples
        if counts is None:
            return self._gen_samples(n_samples)
        else:
            print 'KDE: Running on dataset of size n: %d d: %d ' \
                  'and generating %d samples.' \
                  % (self.n, self.d, sum(counts.values()))
            self._gen_spec_limits(X)
            return self._gen_partitioned_samples(counts)
Example #2
File: KDE.py Project: strategist922/qikify
 def fit(self, 
         chips, 
         specs     = None,
         n_samples = 0, 
         counts    = None, 
         a         = 0, 
         bounds    = None):
     """Primary execution point. Run either standard KDE or class-membership
     based KDE. If any of the class-membership based KDE arguments are set,
     it will be run instead of standard KDE.
             
     Parameters
     ----------
     chips : list
         A list of chip model objects.
     
     specs : qikify.models.Specs, optional
         If using partitioned sampling, boundaries defining the pass /
         critical / fail subspaces must be provided.

     n_samples : int
         The number of samples to generate when running standard KDE.

     counts : dict, optional
         If using partitioned sampling, a counts dictionary must be
         provided, with three keys: nGood, nCritical, nFail.

     a : int, optional
         Passed to the per-class bandwidth factor computation
         (``_set_bandwith_factors``).

     bounds : array-like, optional
         Explicit data bounds; if not provided, the per-column min / max
         of the data are used.

     """
     X = pandas.DataFrame([chip.LCT for chip in chips])
     
     self.n, self.d = X.shape
     self.specs     = specs
     self.columns   = getattr(X, 'columns', None)
     
     # Normalize data/bounds        
     self.scale_factors, self.Xn = standardize(X)
     self.bounds = standardize(np.array([X.min(0), X.max(0)]), \
                               self.scale_factors)
     if bounds is not None:
         self.bounds = standardize(bounds, self.scale_factors)
         
     # Select bandwidth for Epanechnikov kernel (Rule of Thumb, see 
     # Silverman, p.86)
     self.bandwidth = 0.8  # Magic number, default bandwidth scaling factor
     self.c_d     = 2.0 * pow( np.pi, (self.d/2.0) ) / \
                    ( self.d * gamma(self.d/2) )
     self.h       = self._compute_h(self.n, self.d, self.c_d, self.bandwidth)
     self._set_bandwith_factors(a)
     
     # Generate samples
     if counts is None:
         return self._gen_samples(n_samples)
     else:
         print 'KDE: Running on dataset of size n: %d d: %d ' \
               'and generating %d samples.' \
               % (self.n, self.d, sum(counts.values()))
         self._gen_spec_limits(X)
         return self._gen_partitioned_samples(counts)
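To make the calling convention above concrete, here is a minimal usage sketch. The chip objects only need an LCT mapping of per-chip measurements, since that is all fit() reads from them, so a namedtuple stands in for the project's chip model; the KDE import path, its constructor, and the Specs setup are illustrative assumptions, not the project's documented API.

    import numpy as np
    from collections import namedtuple
    from qikify.controllers import KDE   # assumed import path

    # Stand-in chip model: fit() only reads chip.LCT (a dict of measurements).
    Chip = namedtuple('Chip', ['LCT'])
    rng = np.random.RandomState(0)
    chips = [Chip(LCT={'meas_a': rng.randn(), 'meas_b': rng.randn()})
             for _ in range(200)]

    kde = KDE()   # assumed constructor

    # Standard KDE: draw 1000 synthetic chips.
    synthetic = kde.fit(chips, n_samples=1000)

    # Partitioned KDE (sketch): also pass specs (a qikify.models.Specs with
    # inner/outer limits) and per-class counts keyed as in the docstring.
    # counts = {'nGood': 800, 'nCritical': 150, 'nFail': 50}
    # synthetic = kde.fit(chips, specs=specs, counts=counts)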
Example #3
    def fit(self, chips):
        """Run Laplacian Score Feature Selection. 
         
        .. note:: Eventually, it'd be nice to maintain column names with the
           input data so that we can add a plot method to plot scores vs.
           column names.

        Notes
        -----
        This code is based on the Laplacian Score definition from [1]_:

        .. math:: L_r = \\frac{\sum_{ij} (f_r^i - f_r^j)^2 S_{ij}}{\sigma_r^2}

        .. [1] He, X., Cai, D., and Niyogi, P., "Laplacian Score for Feature
           Selection", NIPS 2005.

        Parameters
        ----------
        chips : list
            A list of chip objects            
        """
        
        X   = np.array([chip.LCT.values() for chip in chips])
        gnd = np.array([chip.gnd for chip in chips])
        
        assert X.shape[0] == len(gnd), \
            "Data and gnd do not have matching sizes"
        
        _, X = standardize(X)
        
        # Per LSFS paper, S_ij = exp(-||x_i - x_j||^2 / t). I've found that
        # t = ncol(X) to be a suitable choice; anything on that order should 
        # work just fine.
        S          = self._construct_w(X, gnd, t=X.shape[1])
        D          = np.sum(S, axis=1)           # degree of each sample
        dot_d_x    = np.dot(D, X)
        z          = (dot_d_x * dot_d_x) / np.sum(D)

        # Per-feature quantities f_r^T D f_r - z and f_r^T S f_r - z;
        # use np.sum over axis 0 so the builtin sum isn't picked up.
        dprime = np.sum(np.dot(X.T, np.diag(D)).T * X, axis=0) - z
        lprime = np.sum(np.dot(X.T, S).T * X, axis=0) - z
        
        
        # Remove trivial solutions
        dprime[dprime < 1e-12] = np.inf
        
        # Compute and retain Laplacian scores and rankings
        self.scores  = (lprime/dprime).T
        self.ranking = np.argsort(-self.scores)
        
        del S  # Clean up to save memory
        return self
Example #4
File: LSFS.py Project: trela/qikify
    def fit(self, chips):
        """Run Laplacian Score Feature Selection. 
         
        .. note:: Eventually, it'd be nice to maintain column names with the
           input data so that we can add a plot method to plot scores vs.
           column names.

        Notes
        -----
        This code is based on the Laplacian Score definition from [1]_:

        .. math:: L_r = \\frac{\sum_{ij} (f_r^i - f_r^j)^2 S_{ij}}{\sigma_r^2}

        .. [1] He, X., Cai, D., and Niyogi, P., "Laplacian Score for Feature
           Selection", NIPS 2005.

        Parameters
        ----------
        chips : list
            A list of chip objects            
        """

        X = np.array([chip.LCT.values() for chip in chips])
        gnd = np.array([chip.gnd for chip in chips])

        assert X.shape[0] == len(gnd), \
            "Data and gnd do not have matching sizes"

        _, X = standardize(X)

        # Per LSFS paper, S_ij = exp(-||x_i - x_j||^2 / t). I've found that
        # t = ncol(X) to be a suitable choice; anything on that order should
        # work just fine.
        S = self._construct_w(X, gnd, t=X.shape[1])
        D = np.sum(S, axis=1)                     # degree of each sample
        dot_d_x = np.dot(D, X)
        z = (dot_d_x * dot_d_x) / np.sum(D)

        # Per-feature quantities f_r^T D f_r - z and f_r^T S f_r - z;
        # use np.sum over axis 0 so the builtin sum isn't picked up.
        dprime = np.sum(np.dot(X.T, np.diag(D)).T * X, axis=0) - z
        lprime = np.sum(np.dot(X.T, S).T * X, axis=0) - z

        # Remove trivial solutions
        dprime[dprime < 1e-12] = np.inf

        # Compute and retain Laplacian scores and rankings
        self.scores = (lprime / dprime).T
        self.ranking = np.argsort(-self.scores)

        del S  # Clean up to save memory
        return self
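The _construct_w helper is not shown in these examples. Based on the comment above (S_ij = exp(-||x_i - x_j||^2 / t)), a minimal dense construction might look like the sketch below; the supervised masking step that uses gnd (keeping affinity only between chips with the same label) is an assumption about how the labels are used, not taken from the project.

    import numpy as np

    def construct_w_sketch(X, gnd, t):
        """Illustrative RBF affinity matrix; not the project's _construct_w."""
        # Pairwise squared Euclidean distances between rows of X.
        sq_norms = np.sum(X ** 2, axis=1)
        d2 = sq_norms[:, None] + sq_norms[None, :] - 2.0 * np.dot(X, X.T)
        S = np.exp(-np.clip(d2, 0.0, None) / float(t))
        # Assumed supervised step: keep affinity only within the same class.
        same_class = (np.asarray(gnd)[:, None] == np.asarray(gnd)[None, :])
        return S * same_class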
Example #5
    def _gen_partitioned_samples(self, counts):
        """Generates nCritical critical devices, nGood good devices, nFail
        failing devices, with each region defined by specs.inner /
        specs.outer.
        """

        # Initialize arrays for speed
        Sg, Sc, Sf = np.zeros((counts['nGood'], self.d)), \
                     np.zeros((counts['nCritical'], self.d)), \
                     np.zeros((counts['nFail'], self.d))
        ng, nc, nf = 0, 0, 0

        thresh = 0.02
        while (ng + nc + nf < sum(counts.values())):
            sample = standardize(self._gen_sample(), \
                                 self.scale_factors, reverse = True)
            if self._is_good(sample) and ng < counts['nGood']:
                Sg[ng, :] = sample
                ng += 1
            if self._is_failing(sample) and nf < counts['nFail']:
                Sf[nf, :] = sample
                nf += 1
            if self._is_critical(sample) and nc < counts['nCritical']:
                Sc[nc, :] = sample
                nc += 1

            # Prints # generated in each category so we can monitor progress,
            # since this can take a while :)
            if float(ng + nc + nf) / sum(counts.values()) > thresh:
                print 'Ng:%i/%i Nc:%i/%i Nf:%i/%i' % \
                      (ng, counts['nGood'], \
                       nc, counts['nCritical'], \
                       nf, counts['nFail'])
                thresh += 0.02
        print 'Non-parametric density estimation sampling complete.'
        return pandas.DataFrame(np.vstack((Sc, Sg, Sf)), columns=self.columns)
Example #6
File: KDE.py Project: strategist922/qikify
    def _gen_partitioned_samples(self, counts):
        """Generates nCritical critical devices, nGood good devices, nFail
        failing devices, with each region defined by specs.inner /
        specs.outer.
        """

        # Initialize arrays for speed
        Sg, Sc, Sf = np.zeros((counts['nGood'], self.d)), \
                     np.zeros((counts['nCritical'], self.d)), \
                     np.zeros((counts['nFail'], self.d))
        ng, nc, nf = 0, 0, 0
        
        thresh = 0.02
        while ( ng+nc+nf < sum(counts.values()) ):
            sample = standardize(self._gen_sample(), \
                                 self.scale_factors, reverse = True)
            if self._is_good(sample) and ng < counts['nGood']:
                Sg[ng, :] = sample
                ng += 1
            if self._is_failing(sample) and nf < counts['nFail']:
                Sf[nf, :] = sample
                nf += 1
            if self._is_critical(sample) and nc < counts['nCritical']:
                Sc[nc, :] = sample
                nc += 1      
            
            # Prints # generated in each category so we can monitor progress,
            # since this can take a while :)
            if float(ng+nc+nf) / sum(counts.values()) > thresh:
                print 'Ng:%i/%i Nc:%i/%i Nf:%i/%i' % \
                      (ng, counts['nGood'], \
                       nc, counts['nCritical'], \
                       nf, counts['nFail'])
                thresh += 0.02
        print 'Non-parametric density estimation sampling complete.'
        return pandas.DataFrame(np.vstack((Sc, Sg, Sf)), columns=self.columns)
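The _is_good / _is_critical / _is_failing predicates are not shown. Given the docstring's description of regions defined by specs.inner / specs.outer, one plausible reading, sketched below with hypothetical limit arrays, is: good means inside the inner (tighter) limits, failing means outside the outer limits, and critical means in between. The real predicates may differ.

    import numpy as np

    def classify_sample_sketch(sample, inner_lo, inner_hi, outer_lo, outer_hi):
        """Illustrative pass/critical/fail split over spec limits."""
        inside_inner = np.all((sample >= inner_lo) & (sample <= inner_hi))
        inside_outer = np.all((sample >= outer_lo) & (sample <= outer_hi))
        if inside_inner:
            return 'good'        # within the tighter, inner spec limits
        if not inside_outer:
            return 'fail'        # violates the outer spec limits
        return 'critical'        # between inner and outer limits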
Example #7
 def _gen_samples(self, n_samples):
     """Generate KDE samples.
     """
     Sn = np.vstack([self._gen_sample() for _ in xrange(n_samples)])
     sample = standardize(Sn, self.scale_factors, reverse=True)
     return pandas.DataFrame(sample, columns=self.columns)
Example #8
File: KDE.py Project: strategist922/qikify
 def _gen_samples(self, n_samples):
     """Generate KDE samples.
     """
     Sn = np.vstack([ self._gen_sample() for _ in xrange(n_samples) ])
     sample = standardize(Sn, self.scale_factors, reverse = True)
     return pandas.DataFrame(sample, columns=self.columns)
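All of these snippets rely on a standardize helper that returns scale factors plus normalized data on the first call, applies stored factors when they are passed in, and inverts the transform with reverse=True. Its implementation is not shown; the sketch below is a stand-in consistent with how it is called above (simple z-scoring), and the real helper's behavior and return types may differ.

    import numpy as np

    def standardize_sketch(X, scale_factors=None, reverse=False):
        """Illustrative stand-in for the standardize() helper used above."""
        X = np.asarray(X, dtype=float)
        if scale_factors is None:
            mean, std = X.mean(axis=0), X.std(axis=0)
            std = np.where(std == 0, 1.0, std)   # avoid divide-by-zero
            return (mean, std), (X - mean) / std
        mean, std = scale_factors
        if reverse:
            return X * std + mean                # back to original units
        return (X - mean) / std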