Example #1
def kde_labeler(picks):
    if isinstance(picks, torch.Tensor):
        picks = picks.clone().cpu().data.numpy().astype(int)
    nums = np.arange(0, 101).reshape(-1, 1)
    picks = picks.reshape(-1, 1)
    lower = np.percentile(picks, 25)
    upper = np.percentile(picks, 75)
    IQR = upper - lower
    std = picks.std()
    if std < 0.5:
        std = 1.0
        IQR = 1.0

    if IQR < 0.1:
        IQR = 0.1
    m = min(np.sqrt(std * std), IQR / 1.349)
    bandwidth = (0.9 * float(m)) / (float(pow(float(len(picks)), 0.2)))

    if bandwidth > 5:
        # TODO: Handle this in a manner not using print statements. Maybe set a warning flag
        print(
            f"Bandwidth too high! m: {m} std: {std} IQR: {IQR} bandwidth: {bandwidth}"
        )

    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    kde.fit(picks)

    log_dens = kde.score_samples(nums)
    label = np.exp(log_dens)
    label = label / label.sum()
    return label
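A minimal usage sketch for the labeler above (the pick values are made up for illustration; numpy and sklearn.neighbors.KernelDensity are assumed to be imported as in the snippet):

picks = np.array([44, 46, 47, 50, 52])   # hypothetical integer picks in 0..100
label = kde_labeler(picks)               # soft label over the integers 0..100
print(label.shape, label.sum())          # (101,), ~1.0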
Example #2
def kernel_fit_single(data, bw=None, min_size=20, kern='gaussian'):
    """ guassian fit to 1D data
    """
    res = np.histogram(data.ravel(), bins='sqrt', density=True)
    std_data = data.std()
    if bw is None:
        bw = (data.ravel().shape[0] * (std_data + 2) / 4.)**(-1. / (std_data + 4))

    N_bins = res[1].shape[0]
    if (N_bins < min_size):
        extra = 0.2
        #N_bins *=2
    else:
        extra = 0.0
    # get plus or minus 20%

    x_grid = np.linspace(res[1][0] - extra * abs(res[1][0]),
                         res[1][-1] + extra * abs(res[1][0]), N_bins)

    kde = KernelDensity(bandwidth=bw, kernel=kern)
    kde.fit(data.ravel()[:, None])

    pdf = np.exp(kde.score_samples(x_grid[:, None]))

    return pdf, x_grid
Example #3
def calculateKernelDensity(args):
    try:
        frame, XY, kernel, bandwidth, positions, Xgrid, Ygrid, extend = args  # the input parameters

        # Compute the kernel density
        kdf = KernelDensity(kernel=kernel,
                            bandwidth=float(bandwidth),
                            algorithm='kd_tree')
        kdf.fit(XY)

        # Evaluate the kernel on the grid
        Z = kdf.score_samples(positions)
        Z = Z.reshape(Xgrid.shape)  # put the result back into the grid shape


#        Z = remap0to1(Z) # map array to [0,1]
    except Exception:
        # For debugging purposes it helps to first create a NoneType error outside
        # the multiprocessing part. If an error occurs in the multiprocessing,
        # the worker does not finish and no traceback is printed (it appears
        # as if the process is still running).
        #raise
        frame, kdf, Z, Xgrid, Ygrid, extend = None, None, None, None, None, None

    return [frame, (kdf, Z, Xgrid, Ygrid, extend)]
Example #4
def kde_sklearn(x, x_grid, bandwidth):
    # Kernel Density Estimation with scikit-learn
    kde_skl = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    kde_skl.fit(x)
    # score_samples() returns the log-likelihood of the samples
    pdf = np.exp(kde_skl.score_samples(x_grid))
    return pdf
Example #5
def plot_kde(obj, lo, hi, true, test):
    obj_plot = np.linspace(lo, hi, 10000)[:, np.newaxis]
    avg_std = np.mean(np.std(obj))
    bandwidth = 1.06 * avg_std * len(obj)**-0.2
    plt.figure()
    #    ax = plt.gca()
    for i in range(obj.shape[1]):
        a = obj[:, i][:, np.newaxis]
        #1.06*np.std(a)*len(a)**-0.2 # Bandwidth estimated by Silverman's Rule of Thumb
        kde = KernelDensity(bandwidth=bandwidth,
                            kernel='gaussian',
                            algorithm='ball_tree')
        kde.fit(a)
        log_dens = kde.score_samples(obj_plot)
        plt.plot(obj_plot, np.exp(log_dens))
#        vline_color = next(ax._get_lines.prop_cycler)['color']
#        plt.axvline(np.mean(a), linestyle=':', color = vline_color, label='Update %i' %(i+1))
    plt.axvline(np.mean(np.average(obj)),
                color='red',
                label='Mean of all predictions')
    plt.axvline(true,
                label='True value',
                linestyle='dashdot',
                color='black',
                linewidth=2)
    plt.ylabel('PDF')
    plt.xlabel('Cycle')
    plt.tight_layout()
    plt.legend()
Example #6
class LeveOneOutEntropyEstimator(ItEstimator):
    """
    Leave One Out cross-validation entropy estimation from datapoints by
    using kernel estimation of the probability density
    See also:
    Ivanov A. V. and Rozhkova, "Properties of the statistical estimate of the
    entropy of a random vector with a probability density"
    """

    def __init__(self, kernel,  min_log_proba, bandwith=1.0):
        self.kde = KernelDensity(kernel=kernel, bandwidth=bandwith)
        self.min_log_proba = min_log_proba

    def estimateFromData(self, datapoints):
        entropy = 0.0
        if len(datapoints.shape) == 1:
            datapoints = np.expand_dims(datapoints, 1)
        for i in range(datapoints.shape[0]):
            curr = np.delete(datapoints, i, axis=0)
            self.kde.fit(curr)
            score = self.kde.score(datapoints[None, i, :])
            if score < self.min_log_proba:
                print(score)
                continue

            entropy -= score

        return entropy / datapoints.shape[0]

    def entropy(self, X):
        return self.estimateFromData(X)

    def flags(self):
        return False, False, False
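An illustrative call (my example, not part of the snippet; the bandwidth and sample size are arbitrary choices): for 500 draws from a standard normal, the leave-one-out estimate should land near the analytic differential entropy of about 1.42 nats.

est = LeveOneOutEntropyEstimator(kernel='gaussian', min_log_proba=-500, bandwith=0.3)
x = np.random.normal(size=(500, 1))
print(est.entropy(x))   # roughly 0.5*np.log(2*np.pi*np.e) ~ 1.42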
Example #7
class OneClassKDE(BaseClassifier):
    _fit_params = ["bandwidth"]
    _predict_params = []
    def __init__(self, *args, **kwargs):
        self.bandwidth = kwargs["bandwidth"]
        self.perc_keep = kwargs["perc_keep"]
    
    def fit(self, data, **kwargs):
        #self.train_data = data
        self.kde = KernelDensity(kernel='gaussian', bandwidth=self.bandwidth)
        
        idx = numpy.random.randint(2, size=len(data)).astype(bool)
        print(idx)

        self.kde.fit(data[idx, :])
        self.training_score = self.kde.score_samples(data[~idx, :])
        self.direct_thresh = numpy.percentile(self.training_score, 100 - self.perc_keep)

        print('training', self.training_score.min(), self.training_score.mean(),
              self.training_score.max(), self.direct_thresh)

        print(self.direct_thresh)
    
    def predict(self, data):
        score = self.kde.score_samples(data)
        self.score = score
        res = (score < self.direct_thresh)
        print('test', self.score.min(), self.score.mean(), self.score.max())
        print(res.sum(), "of", len(self.score), 'outliers')
        
        return res.astype(numpy.uint8)*-2+1
    
    def decision_function(self, data=None):
        return self.score
Example #8
def projected_density_gauss(pos, centre, fov, ncells):
    """
    Input:
        pos: particle positions
        centre: centre of the sub-/halo
        fov: field-of-view
        ncells: number of grid cells
    """
    pos = pos - centre
    
    _indx = np.logical_and(np.abs(pos[:, 0]) < 0.5*fov,
                           np.abs(pos[:, 1]) < 0.5*fov)
    pos = pos[_indx, :]
    n = 1024*1024
    h = (4*np.std(pos[:, :2])**5/(3*n))**(1/5)
    #TODO: plot this faulty situation
    kde_skl = KernelDensity(bandwidth=h,
                            kernel='gaussian',
                            algorithm='ball_tree')
    
    xx, yy = np.mgrid[min(pos[:, 0]):max(pos[:, 0]):complex(ncells), 
                      min(pos[:, 1]):max(pos[:, 1]):complex(ncells)]

    xy_sample = np.vstack([xx.ravel(), yy.ravel()]).T

    kde_skl.fit(pos[:, :2])
    sigma = np.exp(kde_skl.score_samples(xy_sample))
    sigma = sigma.reshape(xx.shape)
    return sigma, h
Example #9
File: TPnote.py  Project: camachoxx/TP1_AA
def Kde_model(bw, data):
    #Returns the classifier list for the given bandwidth and data
    #The data must be [[feats],y]
    kde_list = [[], []]

    data_0 = data[data[:, -1] == 0]

    data_1 = data[data[:, -1] == 1]

    #Class 0
    X_feats = data_0[:, :-1]

    Y = data_0[:, -1]
    for feat in range(X_feats.shape[1]):
        X_y = np.column_stack((X_feats[:, feat], Y))
        kde = KernelDensity(kernel='gaussian', bandwidth=bw)
        kde.fit(X_y)
        kde_list[0].append(kde)

    #Class 1
    X_feats = data_1[:, :-1]
    Y = data_1[:, -1]
    for feat in range(X_feats.shape[1]):
        X_y = np.column_stack((X_feats[:, feat], Y))
        kde = KernelDensity(kernel='gaussian', bandwidth=bw)
        kde.fit(X_y)
        kde_list[1].append(kde)

    return kde_list
Example #10
class KDECluster:
    '''
    points is a vector of vectors [[],[]]
    '''
    def __init__(self, points, bw):
        if len(points) < 5:
            self.kde_ = KernelDensity(kernel='gaussian', bandwidth=bw)
        else:
            self.kde_ = KernelDensity(kernel='epanechnikov',
                                      algorithm='ball_tree',
                                      bandwidth=bw,
                                      leaf_size=50)

        self.points_ = points

        self.kde_.fit(points)

    #..........................................................................
    def compare(self, cluster):
        scores_self = np.exp(self.kde_.score_samples(cluster.points_))
        scores_clus = np.exp(cluster.kde_.score_samples(self.points_))

        m_self = max(scores_self)
        m_clus = max(scores_clus)

        return max(m_clus, m_self)
Example #11
def kde_naive_bayes(X_train, Y_train, bw):
    # divide the training set into two matrices, one for each class
    matrix_0 = []
    matrix_1 = []
    for i in range(len(Y_train)):
        if Y_train[i] == 0:
            matrix_0.append(X_train[i])
        else:
            matrix_1.append(X_train[i])
    # convert the matrices into numpy arrays
    matrix_0 = np.array(matrix_0)
    matrix_1 = np.array(matrix_1)
    #prior probabilities for each class
    prior_prob_0 = len(matrix_0) / len(X_train)
    prior_prob_1 = len(matrix_1) / len(X_train)
    #vectors to store the conditional distributions on each class
    kde_0 = []
    kde_1 = []
    #kernel estimator distribution for each feature-class combination
    for i in range(0, 4):  # the last column is the label, not a feature
        #create KernelDensity object, fit with training data and store the distributions
        kde_0_k = KernelDensity(kernel='gaussian', bandwidth=bw)
        kde_0_k.fit(matrix_0[:, i].reshape(-1, 1))
        kde_0.append(kde_0_k)
        #create KernelDensity object, fit with training data and store the distributions
        kde_1_k = KernelDensity(kernel='gaussian', bandwidth=bw)
        kde_1_k.fit(matrix_1[:, i].reshape(-1, 1))
        kde_1.append(kde_1_k)
    #convert into numpy arrays
    kde_0 = np.array(kde_0)
    kde_1 = np.array(kde_1)

    return (prior_prob_0, prior_prob_1, kde_0, kde_1)
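A hypothetical companion predictor for the trainer above (not part of the original snippet): each row is scored with the class log-prior plus the summed per-feature log-densities, and the larger score wins.

def kde_naive_bayes_predict(X, prior_prob_0, prior_prob_1, kde_0, kde_1):
    # start from the log-priors, broadcast over all rows
    score_0 = np.full(len(X), np.log(prior_prob_0))
    score_1 = np.full(len(X), np.log(prior_prob_1))
    for i, (k0, k1) in enumerate(zip(kde_0, kde_1)):
        # add the per-feature conditional log-densities for each class
        score_0 += k0.score_samples(X[:, i].reshape(-1, 1))
        score_1 += k1.score_samples(X[:, i].reshape(-1, 1))
    return (score_1 > score_0).astype(int)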
Example #12
def kde_sklearn(x, x_grid, bandwidth=0.2, **kwargs):
    """Kernel Density Estimation with Scikit-learn. Fit KDE"""
    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(x[:, np.newaxis])
    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde_skl.score_samples(x_grid[:, np.newaxis])
    return kde_skl, np.exp(log_pdf)
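A short usage sketch (synthetic data; any extra keyword arguments such as kernel are passed straight through to KernelDensity):

x = np.random.normal(size=300)
x_grid = np.linspace(-4, 4, 200)
kde, pdf = kde_sklearn(x, x_grid, bandwidth=0.3, kernel='gaussian')
print(pdf.sum() * (x_grid[1] - x_grid[0]))   # integrates to roughly 1.0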
Example #13
def _importance_preprocess_uni(states, rewards, gradients, p_tar, p_gen):
    res = _create_episode_info()

    flat_states = [s for traj in states for s in traj]
    # TODO Pass in as args?
    kde = KernelDensity(kernel='gaussian', bandwidth=0.25)
    kde.fit(flat_states)

    for ss, rs, gs, ps, qs in zip(states, rewards, gradients, p_tar, p_gen):

        state_probs = kde.score_samples(ss)
        traj_p = np.cumsum(ps)  # + np.mean(state_probs)
        traj_q = np.cumsum(qs) + state_probs
        traj_grads = np.cumsum(gs, axis=0)
        r_acc = np.cumsum(rs[::-1])[::-1]
        r_grad = (r_acc * traj_grads.T).T

        res.r_grads.extend(r_grad)
        res.traj_p_tar.extend(traj_p)
        res.traj_p_gen.extend(traj_q)
        res.traj_grads.extend(traj_grads)
        res.traj_r.extend(r_acc)

        # Used for estimating fisher
        res.act_grads.extend(gs)
        res.state_act_p_tar.extend(traj_p)
        res.state_act_p_gen.extend(traj_q)

    return res
Example #14
def estimate_distribution(samples, h=0.1, n_points=100):
	kde = KernelDensity(bandwidth=h)
	samples = samples[:, np.newaxis]
	kde.fit(samples)
	xs = np.linspace(-1.0, 1.0, n_points)
	ys = np.exp(kde.score_samples(xs[:, np.newaxis]))
	return xs, ys
Example #15
class RegularizedKernelDensityEstimator(BaseEstimator):
    def __init__(self, bandwidth=1.0, regularization=1.0e-5):
        self.bandwidth = bandwidth
        self.regularization = regularization

    def setup(self):
        self.kde = KernelDensity(kernel='gaussian', bandwidth=self.bandwidth)

        height, width = self.shape
        self.uniform_density = -np.log(width * height)

        self.kde_constant = np.log(1 - self.regularization)
        self.uniform_constant = np.log(self.regularization)

    def fit(self, X):
        self.shape = X[0, 2:4]
        self.setup()
        self.kde.fit(X[:, 0:2])
        return self

    def score_samples(self, X):
        kde_logliks = self.kde.score_samples(X[:, :2])

        logliks = np.logaddexp(self.kde_constant + kde_logliks,
                               self.uniform_constant + self.uniform_density)
        return logliks

    def score(self, X):
        return np.sum(self.score_samples(X))
Example #16
class AUCKernelDensityEstimator(BaseEstimator):
    def __init__(self, nonfixations, bandwidth=1.0):
        self.bandwidth = bandwidth
        self.nonfixations = nonfixations

    def setup(self):
        self.kde = KernelDensity(kernel='gaussian', bandwidth=self.bandwidth)

    def fit(self, X):
        self.setup()
        self.kde.fit(X)
        self.nonfixation_values = self.kde.score_samples(self.nonfixations)
        return self

    def score_samples(self, X):
        pos_logliks = self.kde.score_samples(X)
        neg_logliks = self.nonfixation_values

        aucs = [
            general_roc(np.array([p]), neg_logliks)[0] for p in pos_logliks
        ]

        return aucs

    def score(self, X):
        return np.sum(self.score_samples(X))
Example #17
def kde_sklearn(x, bandwidth=0.2, **kwargs):
    """Kernel Density Estimation with Scikit-learn"""
    x_grid = np.linspace(x.min() - 1, x.max() + 1, 500)
    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(x[:, np.newaxis])
    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde_skl.score_samples(x_grid[:, np.newaxis])
    return np.exp(log_pdf), x_grid
Example #18
def kde_single_arr(x_series, bandwidth=1.0):
    """
        x_series: a pd.Series, i.e. a single column of data
        Fits a kernel density estimate to the data in x_series
    """
    kde = KernelDensity(bandwidth=bandwidth, kernel='gaussian')
    kde.fit(x_series.values.reshape(-1, 1))
    return kde
Example #19
 def _fit(self, df, kernel='gaussian'):
     """ Estimate density for errors as a function of perceptual distance """
     df = df.copy()
     errors = df[~df['correct']]
     kde = KernelDensity(kernel=kernel)
     # this may take a bit if it is a large sample
     kde.fit(errors['distance'].values.reshape(-1, 1))
     self.kde = kde
Example #20
def kde_sklearn(ndim,
                kernel,
                bd,
                Nt,
                No,
                coordo,
                coordt,
                dtype='float64',
                rtime=False):
    '''
    Calculating the PDF by KDE (e.g. Epanechnikov) based on the scikit-learn KernelDensity method.
    Inputs:
        ndim   -- the number of dimensions/variables [int]
        kernel -- the type of the kernel [str]
        bd     -- a list of bandwidths for each dimension/variable [list]
        Nt     -- the number of locations whose PDF will be estimated [int]
        No     -- the number of sampled locations [int]
        coordo -- the sampled locations [ndarray with shape(No, ndim)]
        coordt -- the locations to be estimated [ndarray with shape(Nt, ndim)]
    Outputs:
        pdf    -- the estimated pdf [ndarray with shape(Nt,)]
    '''
    # Check kernel types
    if kernel.lower() not in allowed_kernels:
        raise Exception('Unknown kernel type %s' % kernel)

    # Convert a scalar bd into a one-element numpy array
    if ndim == 1 and isinstance(bd, float):
        bd = np.array([bd], dtype='float64')
    # Check dimensions
    if (No, ndim) != coordo.shape and (No, ) != coordo.shape:
        raise Exception('Wrong dimension and size of coordo!')
    if (Nt, ndim) != coordt.shape and (Nt, ) != coordt.shape:
        print(Nt, ndim, coordt.shape)
        raise Exception('Wrong dimension and size of coordt!')
    if len(bd) != ndim:
        raise Exception(
            'The length of the bandwidth does not equal the number of dimensions!'
        )

    # Reshape coordt when ndim is 1 and the shape is (Nt,) to the shape (Nt, 1)
    if ndim == 1 and coordt.shape == (Nt, ):
        coordt = coordt[:, np.newaxis]

    # Calculate the pdf and compute the time
    start = time()
    kde_skl = KernelDensity(bandwidth=bd[0], kernel=kernel.lower())
    kde_skl.fit(coordo)
    log_pdf = kde_skl.score_samples(coordt)
    pdf = np.exp(log_pdf, dtype=dtype)

    end = time()

    # Return results
    if rtime:
        return pdf, end - start
    else:
        return pdf
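A hypothetical 1-D call of the function above (it assumes the snippet's own globals, e.g. allowed_kernels containing 'epanechnikov' and time imported from the time module; the data are synthetic):

coordo = np.random.rand(500, 1)       # sampled locations, shape (No, ndim)
coordt = np.linspace(0., 1., 100)     # locations to estimate, shape (Nt,)
pdf = kde_sklearn(ndim=1, kernel='epanechnikov', bd=0.05, Nt=100, No=500,
                  coordo=coordo, coordt=coordt)
print(pdf.shape)                      # (100,)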
Example #21
def createfeatmat(N):
    grid = getgridcoords(N).T
    featmat = np.zeros((len(vals), N ** 2))
    for i in range(len(vals)):
        m = np.array([vals[i][0], vals[i][1]]).T
        k = KernelDensity(bandwidth=0.5 / (N - 1), kernel="gaussian")
        k.fit(m)
        featmat[i, :] = k.score_samples(grid)
    return featmat
Example #22
def kernel_fit_hist(data, hist, bw=None, min_size=20, kern='gaussian'):
    """ guassian fit to 1D data
    """
    x_grid = 0.5*(hist[1][1:]+hist[1][:-1]) # sample one less than histogram
    kde = KernelDensity(bandwidth=bw, kernel=kern)
    kde.fit(data.ravel()[:, None])

    pdf = np.exp(kde.score_samples(x_grid[:, None]))
    
    return pdf, hist[1]
Example #23
def estimate_distribution(samples, h=0.1, n_points=100):
	kde = KernelDensity(bandwidth=h)
	min_xs = min(samples)
	max_xs = max(samples)
	samples = samples[:, np.newaxis]
	kde.fit(samples)
	xs = np.linspace(min_xs, max_xs, n_points)
	ys = np.exp(kde.score_samples(xs[:, np.newaxis]))
	print(xs.shape, ys.shape, sum(ys))
	return xs, ys
Example #24
def kde_sklearn(x, x_grid, bandwidth=0.2, **kwargs):
    """Kernel Density Estimation with Scikit-learn"""
    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(x[:, np.newaxis])
    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde_skl.score_samples(x_grid[:, np.newaxis])
    
    N = np.trapz(np.exp(log_pdf), x_grid)

    return np.exp(log_pdf)/N
Example #25
def plot_scatter(X, scale, out_prefix, title, kde=True):
    """Draws a 2D scatter plot (png) of the core and accessory distances

    Also draws contours of the kernel density estimate

    Args:
        X (numpy.array)
            n x 2 array of core and accessory distances for n samples.
        scale (numpy.array)
            Scaling factor from :class:`~PopPUNK.models.BGMMFit`
        out_prefix (str)
            Prefix for output plot file (.png will be appended)
        title (str)
            The title to display above the plot
        kde (bool)
            Whether to draw kernel density estimate contours

            (default = True)
    """
    plt.figure(figsize=(11, 8), dpi=160, facecolor='w', edgecolor='k')
    if kde:
        xx, yy, xy = get_grid(0, 1, 100)

        # KDE estimate
        kde = KernelDensity(bandwidth=0.03,
                            metric='euclidean',
                            kernel='epanechnikov',
                            algorithm='ball_tree')
        kde.fit(X)
        z = np.exp(kde.score_samples(xy))
        z = z.reshape(xx.shape).T

        levels = np.linspace(z.min(), z.max(), 10)
        plt.contour(xx * scale[0],
                    yy * scale[1],
                    z,
                    levels=levels[1:],
                    cmap='plasma')
        scatter_alpha = 1
    else:
        scatter_alpha = 0.1

    plt.scatter(X[:, 0] * scale[0].flat,
                X[:, 1] * scale[1].flat,
                s=1,
                alpha=scatter_alpha)

    plt.title(title)
    plt.xlabel('Core distance (' + r'$\pi$' + ')')
    plt.ylabel('Accessory distance (' + r'$a$' + ')')
    plt.savefig(out_prefix + ".png")
    plt.close()
Example #26
    def _evaluate_vec(self,
                      opts,
                      step,
                      real_points,
                      fake_points,
                      validation_fake_points,
                      prefix=''):
        """Compute the average log-likelihood and the Coverage metric.

        Coverage metric is defined in arXiv paper. It counts a mass of true
        data covered by the 95% quantile of the model density.
        """

        # Estimating density with KDE
        dist = fake_points[:-1] - fake_points[1:]
        dist = dist * dist
        dist = np.sqrt(np.sum(dist, axis=(1, 2, 3)))
        bandwidth = np.median(dist)
        num_real = len(real_points)
        num_fake = len(fake_points)
        if validation_fake_points is not None:
            max_score = -1000000.
            num_val = len(validation_fake_points)
            b_grid = bandwidth * (2.**(np.arange(14) - 7.))
            for _bandwidth in b_grid:
                kde = KernelDensity(kernel='gaussian', bandwidth=_bandwidth)
                kde.fit(np.reshape(fake_points, [num_fake, -1]))
                score = np.mean(
                    kde.score_samples(
                        np.reshape(validation_fake_points, [num_val, -1])))
                if score > max_score:
                    # logging.debug("Updating bandwidth to %.4f"
                    #               " with likelihood %.2f" % (_bandwidth, score))
                    bandwidth = _bandwidth
                    max_score = score
        kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
        kde.fit(np.reshape(fake_points, [num_fake, -1]))

        # Computing Coverage, refer to Section 4.3 of arxiv paper
        model_log_density = kde.score_samples(
            np.reshape(fake_points, [num_fake, -1]))
        # np.percentile(a, 10) returns t s.t. np.mean(a <= t) = 0.1
        threshold = np.percentile(model_log_density, 5)
        real_points_log_density = kde.score_samples(
            np.reshape(real_points, [num_real, -1]))
        ratio_not_covered = np.mean(real_points_log_density <= threshold)

        log_p = np.mean(real_points_log_density)
        C = 1. - ratio_not_covered

        logging.info('Evaluating: log_p=%.3f, C=%.3f' % (log_p, C))
        return log_p, C
Example #27
File: tp1.py  Project: Dooping/Naive-Bayes
    def fit(self, X, Y):
        x0 = X[Y==0,:]
        x1 = X[Y==1,:]
        self.pc0 = np.log(float(x0.shape[0]) / float(X.shape[0]))
        self.pc1 = np.log(float(x1.shape[0]) / float(X.shape[0]))

        self.kdes = []
        for ix in range(X.shape[1]):
            kde0 = KernelDensity(kernel = 'gaussian', bandwidth = self.bw)
            kde0.fit(x0[:,[ix]])
            kde1 = KernelDensity(kernel = 'gaussian', bandwidth = self.bw)
            kde1.fit(x1[:,[ix]])
            self.kdes.append((kde0,kde1))
Example #28
def train_KDE_model(train_df, bandwith=KDE_BANDWITH):
    """
    Train KDE model based on coordinates of incidents.
    """

    kde = KernelDensity(bandwidth=bandwith,
                        metric='haversine',
                        kernel='gaussian',
                        algorithm='ball_tree')

    kde.fit(train_df[['latitude', 'longitude']] * np.pi / 180)

    return kde
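Since the model is fitted on coordinates converted to radians with the haversine metric, query points have to be converted the same way before scoring. A hypothetical density lookup (train_df, its latitude/longitude columns and the coordinates are placeholders):

kde = train_KDE_model(train_df)
query = np.array([[52.37, 4.90]])                      # (latitude, longitude) in degrees
log_density = kde.score_samples(query * np.pi / 180)   # convert to radians, as in training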
Example #29
def calc_kdes(X_train, Y_train, X_valid, bw):

    prob_matrix = np.zeros((2, X_valid.shape[0]))
    
    for i in range(0, 2):
        X_train_class_i = X_train[Y_train == i, :]
        for j in range(0, FEATS):
            kde = KernelDensity(kernel = 'gaussian', bandwidth = bw)
            kde.fit(X_train_class_i[:,[j]])
            log_prob = kde.score_samples(X_valid[:,[j]])
            prob_matrix[i] = np.add(prob_matrix[i], log_prob)

    return prob_matrix
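The returned matrix holds, for each of the two classes, the summed per-feature log-densities of the validation rows. A minimal follow-up (my sketch, ignoring the class priors) turns it into predictions:

prob_matrix = calc_kdes(X_train, Y_train, X_valid, bw=0.3)
preds = np.argmax(prob_matrix, axis=0)   # predicted class (0 or 1) per validation row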
Example #30
class KDEntropyEstimator(ItEstimator):
    discrete = False

    def __init__(self,
                 kernel="gaussian",
                 min_log_proba=-500,
                 bandwith=1.0,
                 kfold=10):
        self.kde = KernelDensity(kernel=kernel, bandwidth=bandwith)
        self.min_log_proba = min_log_proba
        self.kfold = kfold

    def estimateFromData(self, datapoints):
        if len(datapoints.shape) == 1:
            datapoints = np.expand_dims(datapoints, 1)

        entropy = 0.0

        n, d = datapoints.shape
        ma = np.ones(n, dtype=bool)
        unit = n // self.kfold
        rem = n % self.kfold

        start = 0
        end = unit + rem
        for i in range(self.kfold):
            sel = np.arange(start, end)
            ma[start:end] = False
            curr = datapoints[ma, :]

            self.kde.fit(curr)
            score = self.kde.score(datapoints[sel, :])

            ma[:] = True
            start = end
            end = min(unit + end, n)

            if score < self.min_log_proba:
                continue

            entropy -= score

        return entropy / n

    def entropy(self, X):
        np.random.seed(0)
        return self.estimateFromData(X)

    def flags(self):
        return False, False, False
Example #31
def five_lambdas(lambdas):
    misclassifications = []
    for each_lambda in lambdas:
        ##kde = KernelDensity(kernel='epanechnikov', bandwidth=each_lambda)
        kdg = KernelDensity(kernel='gaussian', bandwidth=each_lambda)
        kdg.fit(combined.loc[:, combined.columns != 'y'])
        smooth = kdg.score_samples(combined.loc[:, combined.columns != 'y'])
        data = pd.DataFrame({'Dat': smooth, 'y': pd.concat([d0['y'],d1['y'],d2['y']])})
        data_x_train, data_x_test, data_y_train, data_y_test = train_test_split(data, pd.DataFrame(data['y']), test_size=0.3, stratify=data['y'])
        model = LDA()
        model.fit(data_x_train.loc[:, data_x_train.columns != 'y'], data_x_train['y'])
        misclassification = model.score(data_x_test.loc[:, data_x_test.columns != 'y'], data_y_test)
        misclassifications.append(misclassification)
    return misclassifications
Example #32
def kde2D(x, y, bandwidth, xbins=100j, ybins=100j, **kwargs): 
    """Build 2D kernel density estimate (KDE)."""

    # create grid of sample locations (default: 100x100)
    xx, yy = np.mgrid[x.min():x.max():xbins, 
                      y.min():y.max():ybins]

    xy_sample = np.vstack([yy.ravel(), xx.ravel()]).T
    xy_train  = np.vstack([y, x]).T

    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(xy_train)

    # score_samples() returns the log-likelihood of the samples
    z = np.exp(kde_skl.score_samples(xy_sample))
    return xx, yy, np.reshape(z, xx.shape)
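A typical way to consume the returned grid is a filled density plot; a minimal sketch (assuming matplotlib.pyplot is imported as plt and x, y are 1-D coordinate arrays):

xx, yy, zz = kde2D(x, y, bandwidth=0.1)
plt.pcolormesh(xx, yy, zz)
plt.scatter(x, y, s=2, color='white')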
Example #33
File: stats.py  Project: prashjet/dynclust
def get_kde_jsd(x, y, kw_kde={}):

    kde = KernelDensity(**kw_kde)
    kde.fit(x)
    log_p_x = kde.score_samples(x)
    log_p_y = kde.score_samples(y)
    kde.fit(y)
    log_q_x = kde.score_samples(x)
    log_q_y = kde.score_samples(y)
    log_mix_x = np.logaddexp(log_p_x, log_q_x)
    log_mix_y = np.logaddexp(log_p_y, log_q_y)
    kl_p_m = log_p_x.mean() - (log_mix_x.mean() - np.log(2))
    kl_q_m = log_q_y.mean() - (log_mix_y.mean() - np.log(2))
    js_divergence = (kl_p_m + kl_q_m) / 2.
    js_distance = np.sqrt(js_divergence)

    return js_distance
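Both inputs must be 2-D sample arrays, and the KDE settings travel through kw_kde. A small illustrative call (the bandwidth is an arbitrary choice):

x = np.random.normal(0.0, 1.0, size=(500, 2))
y = np.random.normal(0.5, 1.0, size=(500, 2))
print(get_kde_jsd(x, y, kw_kde={'bandwidth': 0.3}))   # JS distance, at most sqrt(log 2) ~ 0.83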
Example #34
def construct_kde(array, bandwidth=None):
    if bandwidth is None:
        bw = 1.2*array.std()*np.power(array.size,-1/5)
    else:
        bw = bandwidth
    kde = KernelDensity(kernel='gaussian', bandwidth=bw)
    kde.fit(array.reshape(-1,1))
    x = np.linspace(array.min(),array.max(),200)
    log_dens=kde.score_samples(x.reshape(-1,1))
    kdens=np.exp(log_dens)

    total_dens=np.sum(kdens)
    cdf_array=np.zeros(shape=len(x))
    delta=x[1]-x[0]
    for i in range(len(x)):
        cdf_array[i] = np.sum(kdens[:i])*delta

    return x,kdens, cdf_array
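A short usage sketch showing how the returned grid and empirical CDF might be read back, e.g. to estimate a quantile (the data array is a placeholder):

array = np.random.exponential(size=1000)
x, kdens, cdf_array = construct_kde(array)
median_est = x[np.searchsorted(cdf_array, 0.5)]   # approximate median from the KDE-based CDF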
Example #35
class KDEModel(object):
    """
    Wrapper class for Scikit Learn's Kernel Density Estimation model.

    Attributes
    ----------
    model : KernelDensity
        Wrapped class model.
    """
    def __init__(self, kernel='gaussian', bandwidth=.001):
        self.model = KernelDensity(kernel=kernel, bandwidth=bandwidth)

    def fit(self, train_X):
        """
        Wrapper method for fit() method of Kernel Density model.

        Parameters
        ----------
        train_X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        """
        self.model.fit(train_X)

    def generate_samples(self, n_samples):
        """
        Generates the random samples according to the fitted distribution.

        Returns
        -------
        list
            List of numpy arrays of randomly generated observations.

        """
        points = self.model.sample(n_samples)
        return points

    def score_samples(self, X):
        """
        Predicts the log likelihood score of the samples in X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        """
        return self.model.score_samples(X)
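A minimal usage sketch of the wrapper (synthetic 2-D data; the default bandwidth of 0.001 is usually far too narrow for unscaled data, so one is passed explicitly):

model = KDEModel(bandwidth=0.1)
train_X = np.random.normal(size=(1000, 2))
model.fit(train_X)
samples = model.generate_samples(10)        # 10 new draws from the fitted density
loglik = model.score_samples(train_X[:5])   # log-likelihood of the first five rows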
Example #36
class OneClassKDE(BaseClassifier):
    _fit_params = ["bandwidth"]
    def __init__(self, *args, **kwargs):
        self.bandwidth = kwargs["bandwidth"]
    
    def fit(self, data, **kwargs):
        #self.train_data = data
        self.kde = KernelDensity(kernel='gaussian', bandwidth=self.bandwidth)
        self.kde.fit(data)
        self.training_score = self.kde.score_samples(data)
        self.direct_thresh = numpy.percentile(self.training_score, 10)
    
    def predict(self, data):
        score = self.kde.score_samples(data)
        self.score = score
        return (score < self.direct_thresh).astype(numpy.int32)*-2+1
    
    def decision_function(self, data):
        return self.score
Example #37
def resample_state(D, w):
    w_norm = np.sum(w)  # normalization factor for the weights
    w_ecdf = np.cumsum(w) / w_norm  # empirical CDF of the normalized weights
    # Resample the points
    D_new, ind = np.empty_like(D), np.empty_like(D)
    for i, q in enumerate(D):
        ind[i] = bisect.bisect_left(w_ecdf, np.random.uniform(0, 1))  # indexes for new samples
        D_new[i] = D[int(ind[i])]  # new weighted particles (samples) from the previous step, given the new measurement
    # Regularize it!


#    std = np.std(D_new)
    bandwidth = 0.05  #1.06*std*len(D_new)**-0.2  ## used to be 0.08
    kde = KernelDensity(
        bandwidth=bandwidth, kernel='gaussian', algorithm='ball_tree'
    )  # Bandwidth = 0.006 is calculated based on Silverman's Rule of Thumb
    kde.fit(D_new[:, np.newaxis])
    return kde.sample(num_particles).flatten(), ind
Example #38
 def nmultitype_conf_matrix(self,tipos,nfolds):
     cadena = ""
     for t in tipos:
         cadena += t
     if not os.path.exists("models/nmultitype_conf_matrix" + self.bd +"ts"+cadena+"Promedio"+str(nfolds)+".p") or True:
         # Create the matrix of matrices where the partial results will be stored
         matrices = [None] * nfolds * nfolds
         # Create/retrieve the Node2Vec model
         n2v = node2vec(self.bd,self.port,self.user,self.pss,self.label,1000,20,6,self.mode,[],1)
         n2v.learn("normal",0,False,0)
         # Create the X and Y arrays
         X = []
         Y = []
         # Create an array of common nodes, i.e. nodes that belong to both types at once
         comunes = list()
         for tipo in tipos:
             for n in n2v.n_types[tipo]:
                 if n in n2v.w2v:
                     X.append(n2v.w2v[n])
                     if n in n2v.n_types[tipos[0]] and  n in n2v.n_types[tipos[1]]:
                         comunes.append(n2v.w2v[n])
                     Y.append(tipo)
         # Create the stratified k-folds
         X = np.array(X)
         Y = np.array(Y)
         skf = StratifiedKFold(n_splits=nfolds)
         it = 0
         kdes = []
         for train_index, test_index in skf.split(X, Y):
             print "k-fold para kde"
             X_train, X_test = X[train_index], X[test_index]
             Y_train, Y_test = Y[train_index], Y[test_index]
             Y_test = Y_test.astype('|S64')
             # Create the probability density function for each type
             for t in tipos:
                 print "Creando KDE para el tipo "+t
                 tempX = []
                 for idx,n in enumerate(Y_train):
                     if n == t:
                         tempX.append(X_train[idx])
                 #Calculating KDE with the train set
                 #use grid search cross-validation to optimize the bandwidth
                 #params = {'bandwidth': np.logspace(-1, 1, 10)}
                 #grid = GridSearchCV(neighbors.KernelDensity(), params)
                 #grid.fit(tempX)
                 #print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))
                 # use the best estimator to compute the kernel density estimate
                 #kde = grid.best_estimator_
                 kde = KernelDensity(kernel='gaussian', bandwidth=0.1)
                 kde.fit(tempX)
                 kdes.append(kde)
                 print "Terminado KDE para el tipo "+t
             #Dividimos el conjunto de test en tipo1, tipo2 y tipo1+2
             cont = 0
             for idx,x in enumerate(X_test):
                 total = 0
                 x = np.array(x)
                 if any((x == a).all() for a in comunes):
                     Y_test[idx] = str(tipos[0]+"+"+tipos[1])
                     cont += 1
             print "Numero de elementos con doble tipo:"+str(cont)
             #Creamos k-folds estratificados para el arbol de decision
             skf = StratifiedKFold(n_splits=nfolds)
             for train_index, test_index in skf.split(X_test, Y_test):
                 print "k-fold para decission tree"
                 X_train1, X_test1 = X_test[train_index], X_test[test_index]
                 Y_train1, Y_test1 = Y_test[train_index], Y_test[test_index]
                 clf = DecisionTreeClassifier(random_state=0)
                 print(X_train1[0])
                 clf.fit(X_train1,Y_train1)
                 export_graphviz(clf);
                 Y_pred1 = clf.predict(X_test1)
                 matriz = metrics.confusion_matrix(Y_test1, Y_pred1, labels=[tipos[0], tipos[1], tipos[0] + "+" + tipos[1]])
                 matrices[it] = np.array(matriz)
                 print(matrices[it])
                 it += 1
         f = open( "models/nmultitype_conf_matrix" + self.bd +"ts"+cadena+"Promedio"+str(nfolds)+".p", "w" )
         pickle.dump(matrices,f)
     else:
         f = open( "models/nmultitype_conf_matrix" + self.bd +"ts"+cadena+"Promedio"+str(nfolds)+".p", "r" )
         matrices = pickle.load(f)
     total = matrices[0]
     for m in matrices[1:]:
         total += m
     print(total)
     matriz_promedio = total 
     matriz_promedio = matriz_promedio.astype('float')
     #print(matrices)
     #print(matriz_promedio)
     matriz_promedio = matriz_promedio / len(matrices)
     #print(matriz_promedio)
     # compute percentages from the averaged frequency matrix
     for i in range(0,len(matriz_promedio)):
         suma = 0
         for j in range(0,len(matriz_promedio)): 
             suma += matriz_promedio[i][j]
             matriz_promedio[i][j] = float(matriz_promedio[i][j])
         for j in range(0,len(matriz_promedio)):                
             if suma > 0:
                 matriz_promedio[i][j] = round(float(matriz_promedio[i][j] * 100) / float(suma),2)
             else:
                 matriz_promedio[i][j] = 0
     matriz_promedio = matriz_promedio.astype(str)
     for i in range(0,len(matriz_promedio)):
         for j in range(0,len(matriz_promedio)):
             matriz_promedio[i][j] = str(matriz_promedio[i][j])+"%"
     return matriz_promedio