Example #1
def cv1(x, bws, model='gaussian', plot=False, n_folds=10):
    """
    This calculates the leave-one-out cross validation. If you set 
    plot to True, then it will show a big grid of the test and training
    samples with the KDE chosen at each step. You might need to modify the 
    code if you want a nicer layout :)
    """

    # Get the number of bandwidths to check and the number of objects
    N_bw = len(bws)
    N = len(x)
    cv_1 = np.zeros(N_bw)
    
    # If plotting is requested, set up the plot region
    if plot:
        fig, axes = plt.subplots(N_bw, n_folds, figsize=(15, 8))
        xplot = np.linspace(-3, 8, 1000)

    # Loop over each band-width and calculate the probability of the 
    # test set for this band-width
    for i, bw in enumerate(bws):
    
        # Do n_folds-fold CV here. This divides x into n_folds folds.
        kf = KFold(n_splits=n_folds)

        # Initiate - lnP will contain the log likelihood of the test sets
        # and i_k is a counter for the folds that is used for plotting and
        # nothing else..
        lnP = 0.0
        i_k = 0
                                 
        # Loop over each fold
        for train, test in kf.split(x):
            x_train = x[train, :]
            x_test = x[test, :]
            
            # Create the kernel density model for this bandwidth and fit
            # to the training set.
            kde = KD(kernel=model, bandwidth=bw).fit(x_train)
                                 
            # score evaluates the log likelihood of a dataset given the fitted KDE.
            log_prob = kde.score(x_test)
            
            if plot:
                # Show the tries
                ax = axes[i][i_k]

                # Note that the test sample is hard to see here.
                hist(x_train, bins=10, ax=ax, color='red')
                hist(x_test, bins=10, ax=ax, color='blue')
                ax.plot(xplot, np.exp(kde.score_samples(xplot[:, np.newaxis])))
                i_k += 1
            

            lnP += log_prob
            
        # Calculate the average log-likelihood per object
        cv_1[i] = lnP/N
        
    return cv_1
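A minimal usage sketch for the helper above (hypothetical data; it assumes NumPy, matplotlib and scikit-learn are imported, with KD as an alias for sklearn.neighbors.KernelDensity, as in the snippet):

import numpy as np

rng = np.random.RandomState(42)
x = rng.normal(loc=2.0, scale=1.0, size=(200, 1))   # one-dimensional sample, shape (n, 1)
bws = np.linspace(0.1, 2.0, 20)                     # candidate bandwidths

scores = cv1(x, bws, model='gaussian', plot=False)  # average held-out log-likelihood per bandwidth
print(bws[np.argmax(scores)])                       # bandwidth with the best cross-validation score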
Example #2
def question1b(t, key, bwrange):
    import matplotlib.colors as colors
    import matplotlib.cm as cmx
    import seaborn as sns
    sns.set()

    t = Table().read('joint-bh-mass-table.csv')

    X_plot = np.linspace(np.min(t['MBH']) - 4, np.max(t['MBH']) + 4,
                         num=1000)[:, np.newaxis]
    X = t['MBH'][:, np.newaxis]

    #plt.scatter(X[:,0], np.zeros(len(X[:,0])), marker = 'x', color = 'black')

    #different values for the bandwidth (this overrides the bwrange argument)
    bwrange = np.arange(1, 10, 0.1)

    #plot many lines using colors from a color map, with MAX the amount of lines
    #jet = plt.get_cmap('jet')
    #cNorm  = colors.Normalize(vmin=0, vmax=len(bwrange))
    #scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=jet)

    #set the number of folds
    kf = KFold(n_splits=5)

    likelihood = np.zeros(len(bwrange))

    for bw, i in zip(bwrange, np.arange(len(bwrange))):
        lh = []
        for train_i, test_i in kf.split(X):
            Xtrain, Xtest = X[train_i], X[test_i]
            kde = KernelDensity(bandwidth=bw, kernel='gaussian').fit(Xtrain)

            log_dens = kde.score_samples(Xtrain)
            lhscore = kde.score(Xtest)

            #print('Bandwidth: {0}, Likelihood: {1}'.format(bw, lhscore))

            lh = np.append(lh, lhscore)

        likelihood[i] = np.mean(lh)

    print('Highest likelihood ({0}) at bandwidth = {1}'.format(
        round(np.max(likelihood), 2), bwrange[np.argmax(likelihood)]))

    plt.plot(bwrange, likelihood, color='black', alpha=0.8, label='Likelihood')
    plt.scatter(bwrange[np.argmax(likelihood)],
                np.max(likelihood),
                marker='x',
                s=100,
                color='orange',
                label='Maximum likelihood')

    plt.xlabel(r'Bandwidth [$10^6$ M$_{\odot}$]')
    plt.ylabel('Likelihood')
    plt.legend(loc='best')

    plt.title('Black hole mass density distribution')
    #plt.savefig('Blackhole-kde-bandwidth-likelyhood.svg')
    plt.show()
Example #3
def plot_density(model, path):
    N_samples = 10000

    samples = model.predict_y_samples(Xs, N_samples, session=sess)[:, :, 0]
    # objective = np.average([model.compute_log_likelihood() for _ in range(1000)])

    fig, ax = plt.subplots(1, 1, figsize=(6, 6))
    ax.scatter(X, Y, marker='.', color='C1')
    levels = np.linspace(-1, 2, 200)
    ax.set_ylim(min(levels), max(levels))
    ax.set_xlim(min(Xs), max(Xs))


    cs = np.zeros((len(Xs), len(levels)))
    for i, Ss in enumerate(samples.T):
        bandwidth = 1.06 * np.std(Ss) * len(Ss) ** (-1. / 5)  # Silverman's (1986) rule of thumb.
        kde = KernelDensity(bandwidth=float(bandwidth))

        kde.fit(Ss.reshape(-1, 1))
        for j, level in enumerate(levels):
            cs[i, j] = kde.score(np.array(level).reshape(1, 1))
    ax.pcolormesh(Xs.flatten(), levels, np.exp(cs.T), cmap='Blues_r')  # , alpha=0.1)
    ax.scatter(X, Y, marker='.', color='C1')

    plt.savefig(os.path.join(path, 'density_{:03d}.png'.format(k)))
    plt.close()
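The bandwidth above follows Silverman's (1986) rule of thumb for a one-dimensional Gaussian kernel, h = 1.06 * std(x) * n**(-1/5). A small standalone sketch of the same rule, assuming only NumPy and scikit-learn:

import numpy as np
from sklearn.neighbors import KernelDensity

samples = np.random.randn(500)                                    # 1D sample to smooth
bandwidth = 1.06 * np.std(samples) * len(samples) ** (-1. / 5)    # Silverman's rule of thumb
kde = KernelDensity(bandwidth=float(bandwidth)).fit(samples.reshape(-1, 1))
print(kde.score(samples.reshape(-1, 1)))                          # total log-likelihood of the sample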
Example #4
def plot_posterior(gp, **options):
    bounds = gp.bounds
    posterior = gp.get_posterior
    S = gp.num_posterior_samples

    ax = get_axes(**options)
    Xs = np.linspace(*bounds[0], num=1000)
    samples = posterior(Xs, S)[:, :, 0]
    # print(samples)
    ydif = (max(gp.Y) - min(gp.Y)) * 0.15
    levels = np.linspace(min(gp.Y) - ydif, max(gp.Y) + ydif, 1000)

    ax.set_ylim(min(levels), max(levels))
    ax.set_xlim(min(Xs), max(Xs))

    cs = np.zeros((len(Xs), len(levels)))
    for i, Ss in enumerate(samples.T):
        bandwidth = 1.06 * np.std(Ss) * len(Ss)**(
            -1. / 5)  # Silverman's (1986) rule of thumb.
        kde = KernelDensity(bandwidth=float(bandwidth))

        kde.fit(Ss.reshape(-1, 1))
        for j, level in enumerate(levels):
            cs[i, j] = kde.score(np.array(level).reshape(1, 1))
    ax.pcolormesh(Xs.flatten(), levels, np.exp(cs.T),
                  cmap='Blues_r')  # , alpha=0.1)
    ax.scatter(gp.X, gp.Y, s=15, color="red", zorder=10)
    '''for j in range(0, 5):
Example #5
    def pdf(self, token, years, bandwidth=5):

        """
        Estimate a density function from a token's rank series.

        Args:
            token (str)
            years (range)
            bandwidth (float)

        Returns: OrderedDict {year: density}
        """

        series = self.series(token)

        data = []
        for year, wpm in series.items():
            data += [year] * round(wpm)

        data = np.array(data)[:, np.newaxis]

        pdf = KernelDensity(bandwidth=bandwidth).fit(data)

        samples = OrderedDict()

        for year in years:
            samples[year] = np.exp(pdf.score(np.array([[year]])))

        return samples
Example #6
def check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true):
    kde = KernelDensity(kernel=kernel, bandwidth=bandwidth,
                        atol=atol, rtol=rtol)
    log_dens = kde.fit(X).score_samples(Y)
    assert_allclose(np.exp(log_dens), dens_true,
                    atol=atol, rtol=max(1E-7, rtol))
    # kde.score(Y) returns the summed log-likelihood of Y, so its exponential
    # should match the product of the pointwise densities.
    assert_allclose(np.exp(kde.score(Y)),
                    np.prod(dens_true),
                    atol=atol, rtol=max(1E-7, rtol))
Example #7
def check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true):
    kde = KernelDensity(kernel=kernel, bandwidth=bandwidth,
                        atol=atol, rtol=rtol)
    log_dens = kde.fit(X).score_samples(Y)
    assert_allclose(np.exp(log_dens), dens_true,
                    atol=atol, rtol=max(1E-7, rtol))
    assert_allclose(np.exp(kde.score(Y)),
                    np.prod(dens_true),
                    atol=atol, rtol=max(1E-7, rtol))
Example #8
def part_d_test(x, p1, p2):
    kde = KernelDensity(kernel='gaussian', bandwidth=0.1).fit(x)
    sc1 = kde.score(p1)
    sc2 = kde.score(p2)
    sc_b = cross_val_score(kde, x)
    sc1_b = sc1 / (sum(sc_b) / len(sc_b))
    sc2_b = sc2 / (sum(sc_b) / len(sc_b))
    print(sc1_b)
    print(sc2_b)
    print('c')
Example #9
def kde3d(x, y, z, data_point):
    values = np.vstack([x, y, z]).T
    # Use grid search cross-validation to optimize the bandwidth
    # params = {'bandwidth': np.logspace(-1, 1, 20)}
    kde = KernelDensity(bandwidth=0.3)
    kde.fit(values)
    kde_coords = kde.sample(10000)
    log_pdf = kde.score_samples(kde_coords)
    percentile = np.sum(log_pdf < kde.score(data_point))/10000.
    return (percentile)
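A minimal usage sketch for the function above (hypothetical data; it assumes NumPy and sklearn.neighbors.KernelDensity are imported as in the snippet):

import numpy as np

rng = np.random.RandomState(0)
x, y, z = rng.randn(3, 500)              # three coordinate arrays of equal length
point = np.array([[0.0, 0.0, 0.0]])      # query point, shape (1, 3)
print(kde3d(x, y, z, point))             # fraction of KDE samples with lower density than the point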
Example #10
def test():
    size=100
    a=0.5
    p=2
    rand=0

    # generate sequence
    states, observations, filtered_state_estimates = KalmanSequence(size, a, rand)

    # # plot sequence
    # plt.plot(states, marker='.', label="true")
    # # plt.plot(observations, label="obs")
    # plt.plot(filtered_state_estimates, marker='.', label="est")
    # plt.legend()
    # plt.show()
    # plt.clf()

    # produce blocks (X:label, Y:features)
    X,Y = produce_blocks(states, p)
    XY = np.concatenate((X,Y), axis=1)
    # print("data")
    # print(states)
    # print(X)
    # print(Y)

    # estimate pdf
    # Compute the total log probability density under the model.
    # aka score = log-likelihood
    kde_x = KernelDensity(kernel='gaussian', bandwidth=2).fit(X)
    kde_y = KernelDensity(kernel='gaussian', bandwidth=2).fit(Y)
    kde_xy = KernelDensity(kernel='gaussian', bandwidth=2).fit(XY)


    print("estimation")
    print(e ** kde_x.score(X))
    print(e ** kde_y.score(Y))
    print(e ** kde_xy.score(XY))


    entropy_est = -np.mean(kde_xy.score(XY) - kde_y.score(Y))
    print("Estimated Lower Bound: ", func(entropy_est))
    print("Kalman Filter MSE    : ", mse(states, filtered_state_estimates))
Example #11
    def calculateDensityKernel(self, pt_3d, num_neigh=10, noprogress=True):
        dists, nn_idxs = self.kdt.query(pt_3d, num_neigh)
        densities = []
        for i in tqdm(range(0, pt_3d.shape[0]), disable=noprogress):
            nn_coords = self.all_coords3d[nn_idxs[i], :3]
            density = KernelDensity().fit(nn_coords)
            log_density = density.score(pt_3d[i].reshape(1, -1))
            density = np.exp(log_density)
            densities.append(density)
        densities = np.array(densities).reshape(-1)
        return densities
Example #12
def calculate_KL_KDE((posterior_distances, prior_distances)):

    from sklearn.neighbors import KernelDensity
    from scipy.integrate import quad

    h_silverman = lambda d: d.std() * (4. / 3 / len(d))**(1. / 5)
    h = h_silverman

    prior = KernelDensity(kernel='gaussian', bandwidth=h(prior_distances)).fit(
        prior_distances.reshape(-1, 1))
    posterior = KernelDensity(kernel='gaussian',
                              bandwidth=h(posterior_distances)).fit(
                                  posterior_distances.reshape(-1, 1))

    ce = lambda x: -prior.score(x) * np.exp(posterior.score(x))
    hh = lambda x: -posterior.score(x) * np.exp(posterior.score(x))

    x_max = np.max((posterior_distances.max(), prior_distances.max()))
    vals = (quad(ce, 0., x_max)[0], quad(hh, 0., x_max)[0])

    return vals[0] - vals[1]
Example #13
class KernelDensityLmC(ContinuousLmC):
    def Init(self):
        ContinuousLmC.Init(self)
        self.kde = KernelDensity()
        self.lBandWidth = np.logspace(-2, 0, 10)
        self.BandWidth = 0.001
        self.KernelType = 'additivekde'

    def SetPara(self, conf):
        ContinuousLmC.SetPara(self, conf)
        self.BandWidth = conf.GetConf('bandwidth', self.BandWidth)
        self.KernelType = conf.GetConf('kernel', self.KernelType)
        return True

    def Construct(self, lTerm, Word2VecModel):
        if [] == lTerm:
            return
        lX = np.array(
            [Word2VecModel[term] for term in lTerm if term in Word2VecModel])
        #         self.kde = self.CVForBestKde()
        self.FitKernel(lX)

        logging.debug('doc kde lm estimated')

    def FitKernel(self, lX):
        if self.KernelType == 'additivekde':
            self.kde = AdditiveKdeC()
            self.kde.Bandwidth = self.BandWidth
            self.kde.fit(lX)
            return
        if self.KernelType == 'kde':
            self.kde = KernelDensity(kernel='gaussian',
                                     bandwidth=self.BandWidth).fit(lX)
            return

    def CVForBestKde(self):
        '''
        this is CV for each doc's best bandwidth
        It is better/more intuitive to CV for training query's ranking performance
        '''
        params = {'bandwidth': self.lBandWidth}
        #         logging.debug('cv bandwidth from [%s]',json.dumps(self.lBandWidth))
        grid = GridSearchCV(KernelDensity(), params)
        logging.debug('fitting on [%d] vector', len(self.lX))
        grid.fit(self.lX)
        logging.info('best bandwidth = [%f]', grid.best_estimator_.bandwidth)
        return grid.best_estimator_

    def pdf(self, x):
        return np.exp(self.LogPdf(x))

    def LogPdf(self, x):
        return self.kde.score(x)
Example #14
def get_params_ll(X_train, X_validate, bandwidth_kernel):
    """
    Fit a KDE with this bandwidth and kernel and report the log-likelihood of the
    validation data (30% of training). This works better than 3-fold cross-validation,
    which was found to overfit the data.
    :param X_train: training data for the positive class only
    :param X_validate: validation data for the positive class only
    :param bandwidth_kernel: (bandwidth, kernel) pair to evaluate
    """
    bandwidth = bandwidth_kernel[0]
    kernel = bandwidth_kernel[1]
    kde = KernelDensity(bandwidth=bandwidth,
                        metric='euclidean',
                        kernel=kernel
                        )
    kde.fit(X_train)
    ll = kde.score(X_validate)
    return ll
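A minimal usage sketch for evaluating a small grid of (bandwidth, kernel) pairs with the helper above (hypothetical data and grid; assumes NumPy and scikit-learn):

import numpy as np
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
X = rng.randn(300, 2)                                   # positive-class data only
X_train, X_validate = train_test_split(X, test_size=0.3)

grid = [(bw, k) for bw in (0.1, 0.5, 1.0) for k in ('gaussian', 'epanechnikov')]
scores = [get_params_ll(X_train, X_validate, bk) for bk in grid]
print(grid[int(np.argmax(scores))])                     # best (bandwidth, kernel) pair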
Example #15
def bestbandwidth(a):
    kf = KFold(n_splits=10)
    kf.get_n_splits(a)
    Max = -1e99
    Bandwidth = None
    # scan candidate bandwidths 0.001, 0.002, ..., 1.000
    for i in range(1000):
        array = np.array([])
        for train_index, test_index in kf.split(a):
            a_train, a_test = a[train_index], a[test_index]
            kde = KernelDensity(kernel='gaussian',
                                bandwidth=0.001 + i / 1000.).fit(a_train)
            log_dens = kde.score_samples(a_train)
            loglikelihood = kde.score(a_test)
            array = np.append(array, loglikelihood)
        Loglikelihood = np.nanmean(array)
        if Loglikelihood > Max:
            Max = Loglikelihood
            Bandwidth = 0.001 + i / 1000.
            print('new best value for the bandwidth: ', Bandwidth)
    return Bandwidth
Example #16
def plot_posterior_samples(target_model,
                           x_counts=1000,
                           samples=100,
                           points=True,
                           kde=True):
    m = target_model
    bounds = m.bounds
    S = samples

    if type(m).__name__ == 'DGPRegression':
        posterior = m.get_posterior
    elif type(m).__name__ == 'GPyRegression':
        posterior = m.get_posterior
    else:
        raise ValueError("The target_model should be either 'DGPRegression' "
                         "or 'GPyRegression'")

    Xs = np.linspace(*bounds[0], x_counts)
    samples = posterior(Xs, size=S)
    samples = samples[:, :, 0]
    ydif = (max(m.Y) - min(m.Y)) * 0.15
    levels = np.linspace(min(m.Y) - ydif, max(m.Y) + ydif, 1000)

    ax = plt.gca()
    # ax.set_ylim(min(levels), max(levels))
    # ax.set_ylim(min(levels), 1.0)
    # ax.set_xlim(min(Xs), max(Xs))
    plt.xticks(np.arange(0, 100, step=10))
    plt.xlabel(r"$\theta$")
    plt.ylabel(r"$d(x_\theta, x_{obs})$")

    if kde == True:
        cs = np.zeros((len(Xs), len(levels)))
        for i, Ss in enumerate(samples.T):
            bandwidth = 1.06 * np.std(Ss) * len(Ss)**(
                -1. / 5)  # Silverman's (1986) rule of thumb.
            kde = KernelDensity(bandwidth=float(bandwidth))

            kde.fit(Ss.reshape(-1, 1))
            for j, level in enumerate(levels):
                cs[i, j] = kde.score(np.array(level).reshape(1, 1))
        ax.pcolormesh(Xs.flatten(), levels, np.exp(cs.T),
                      cmap='Blues_r')  # , alpha=0.1)

    if points == True:
        ax.scatter(m.X, m.Y, s=15, color="red", zorder=10)
    return
Example #17
    def calculateFeatures(self, distancesArray, nearestNeighborsArray, iterVector) -> dict:
        
        resultsDict = {}


        if "avgDistance" in self.features:
            # Calculate Average Distance for k-Nearest Neighbors
            resultsDict["avgDistance"] = np.mean(distancesArray)

        if "maxDistance" in self.features:
            # Calculate Max Distance for k-th Neighbor
            resultsDict["maxDistance"] = np.max(distancesArray)

        if "localDensity" in self.features:
            kde = KernelDensity(kernel='gaussian', bandwidth=1.0).fit(nearestNeighborsArray)
            
            resultsDict["localDensity"] = -1 * kde.score(iterVector)

        
        return resultsDict
Example #18
def jackknife_bandwidths(data, bandwidths, kernel="gaussian"):
    """Perform jack-knife sampling over different bandwidths for KDEs for each
    time-series in the dataset.

    Parameters
    ----------
    data: list of arrays
        A list of (variable length) arrays of values. The values should represent
        "times" of "events".

    bandwidths: array
        The possible bandwidths to try

    kernel: string (optional, default="gaussian")
        The kernel to use for the KDE. Should be accepted by sklearn's KernelDensity
        class.

    Returns
    -------
    result: array of shape (n_bandwidths,)
        The total likelihood of unobserved data over all jackknife samplings and all
        time series in the dataset for each bandwidth.
    """
    result = np.zeros(bandwidths.shape[0])
    for j in range(bandwidths.shape[0]):
        kde = KernelDensity(bandwidth=bandwidths[j], kernel=kernel)
        for i in range(len(data)):
            likelihood = 0.0
            for k in range(len(data[i])):
                if k < len(data[i]) - 1:
                    jackknife_sample = np.hstack([data[i][:k], data[i][k + 1 :]])
                else:
                    jackknife_sample = data[i][:k]
                kde.fit(jackknife_sample[:, None])
                likelihood += np.exp(kde.score(np.array([[data[i][k]]])))

            result[j] += likelihood

    return result
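A minimal usage sketch for the jack-knife helper above (hypothetical event-time data; assumes NumPy and scikit-learn):

import numpy as np

rng = np.random.RandomState(0)
data = [np.sort(rng.uniform(0, 10, size=n)) for n in (15, 20, 12)]   # variable-length "event time" series
bandwidths = np.linspace(0.1, 2.0, 10)
likelihoods = jackknife_bandwidths(data, bandwidths)
print(bandwidths[np.argmax(likelihoods)])                            # bandwidth with the highest held-out likelihood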
Example #19
class EmpiricalDistribution1DKDE(object):
    def __init__(self,
                 param_name,
                 samples,
                 minval=None,
                 maxval=None,
                 bandwidth=0.1,
                 nbins=40):
        """
        Minvals and maxvals should specify priors for these. Should make these required.
        """
        self.ndim = 1
        self.param_name = param_name
        self.bandwidth = bandwidth
        # code below relies on samples axes being swapped, but we
        # want to keep inputs the same
        # create a 1D KDE from which to evaluate
        self.kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(
            samples.reshape((samples.size, 1)))
        if minval is None:
            # msg = "minvals for KDE empirical distribution were not supplied. Resulting distribution may not have support over full prior"
            # logger.warning(msg)
            # widen these to add support
            minval = min(samples)
            maxval = max(samples)
        # significantly faster probability estimation using interpolation
        # instead of evaluating KDE every time
        self.minval = minval
        self.maxval = maxval
        xvals = np.linspace(minval, maxval, num=nbins)
        self._Nbins = nbins
        scores = np.array(
            [self.kde.score(np.atleast_2d(xval)) for xval in xvals])
        # interpolate within prior
        self._logpdf = interp1d(xvals, scores, kind='linear',
                                bounds_error=False, fill_value=-1000)

    def draw(self):
        params = self.kde.sample(1).T
        return params.squeeze()
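A minimal usage sketch for the 1D empirical distribution above (hypothetical samples; assumes NumPy, scikit-learn and scipy.interpolate.interp1d are imported as the class requires):

import numpy as np

rng = np.random.RandomState(0)
samples = rng.normal(loc=2.0, scale=0.5, size=2000)        # posterior-like samples for one parameter

dist = EmpiricalDistribution1DKDE('mass', samples, bandwidth=0.1, nbins=40)
print(dist.draw())                                         # one draw from the fitted KDE
print(dist._logpdf(2.0))                                   # interpolated log-density at a point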
Example #20
from data import importdata
import numpy as np
from sklearn.neighbors import NearestNeighbors, KernelDensity

dataset = ['abalone16_29', 'balance_scale', 'breast_cancer', 'car', 'cmc',
           'ecoli', 'glass', 'haberman', 'heart_cleveland', 'hepatitis',
           'new_thyroid', 'postoperative', 'solar_flare', 'transfusion', 'vehicle',
           'yeastME3', 'bupa', 'german', 'horse_colic', 'ionosphere', 'seeds', 'vertebal']

for data in dataset:
    db = getattr(importdata, 'load_' + data)()
    print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
    print('Dataset: %s' % data)

    metrics = ['minkowski']
    for metric in metrics:
        nearestN = KernelDensity(kernel='epanechnikov')

        nearestN.fit(db.data, db.target)

        miniority_ind = np.where(db.target == 1)
        miniority_data = db.data[miniority_ind]
        miniority_target = db.target[miniority_ind]
        for d in miniority_data:
            print(nearestN.score(d.reshape(1, -1)))
Example #21
print(result)

# Non parametric regression
#Nearest Neighbors with 5 neighbor
neigh = NearestNeighbors(n_neighbors=5)
neigh.fit(df3, y)
#K nearest neighbors with 5 neighbor
kneigh = KNeighborsRegressor(n_neighbors=5)
kneigh.fit(df3, y)
distances, indices = kneigh.kneighbors(df3)
print("Estimated value for selected features:",
      kneigh.predict(input_validation[Columnlist]))

#Kernel density
kde = KernelDensity(kernel='gaussian', bandwidth=0.75).fit(df3)
kdescore = kde.score(df3, yv)
print("Kernel score:", kdescore)

##Write prediction to file
test = test.fillna(0)
test = pd.DataFrame(test)
test.columns = [
    'Tweet Id', 'User Name', 'Favs', 'RTs', 'Followers', 'Following', 'Listed',
    'likes', 'tweets', 'reply', 'URLs', 'Tweet content'
]
#add wordcounts of tweet
test['Tweet content'] = test['Tweet content'].astype(str)
test['Word Count'] = test['Tweet content'].str.split().str.len()
test = test[Columnlist]

#add the result column here
Example #22
        n_jobs         : int, the number of jobs to run in parallel. Default = maximum number of jobs.
        
        Returns
        ----------
        F1_CAPe        : dict containing for each key multiple of k (k, 2*k, 3*k,...,ntimes*k) the array [tn,fp,fn,tp] obtained
                         with the estimate of the prior in such case.
        class_priors   : array of shape (ntimes,) containing every k new labels the estimate of the class prior.
        
    """

    n = np.shape(X_train)[0]
    tmp_cont = 0.1
    query_list = []
    labeled_ex = np.zeros(n, dtype=int)
    ker = KernelDensity().fit(X_train)
    dmu = [np.exp(ker.score(X_train[i:i + 1])) for i in range(n)]
    mean_prob_term = math.log(np.mean(dmu), 10)  #Take the log density
    F1_CAPe = {}
    class_priors = np.zeros(ntimes, dtype=float)

    for j in range(ntimes):

        prior, labeled_ex, query_list = CAPe(X_train, labeled_ex, query_list,
                                             k, real_anomalies, tmp_cont,
                                             mean_prob_term, case, n_jobs)

        class_priors[j] = prior

        tmp_cont = 1 - min(prior, 0.9999)  #update the contamination factor

        F1_CAPe[int(k * (j + 1))] = get_tnfpfntp(
Example #23
driverID=2
tripInd=2
driverDir = '/home/user1/Desktop/SharedFolder/Kaggle/DriversCleaned/'+str(driverID)
df = pd.read_csv(driverDir+'_' + str(tripInd)+'.csv')
trip = Trip(driverID,tripInd,df)
trip.getSpeed()
trip.getAcc()
    #trip.getRadius()
    #trip.getCacc()
trip.getFeatures()
X=trip.features[['v','acc']]
    
probas = np.zeros(X.shape[0])
    
for i in range(X.shape[0]):
    probas[i]=clf.score(X.loc[i])

# <codecell>

probas.mean()

# <codecell>

sns.jointplot(X.v,X.acc,kind = "scatter",size=6,ratio=5,marginal_kws={'bins':30})
#sns.kdeplot(X[['cacc','acc']])

# <codecell>

xN = np.asanyarray(X[['cacc','acc']])

# <codecell>
Example #24
    def evaluate_generator(self, test_data, iteration=None):
        is_load_weights = iteration is not None
        if is_load_weights:
            self.load_weights(iteration)

        test_data = test_data[:]
        if type(test_data['poses']) == torch.Tensor:
            test_data['poses'] = test_data['poses'].numpy()
            test_data['konf_obsts'] = test_data['konf_obsts'].numpy()
            test_data['actions'] = test_data['actions'].numpy()

        poses = torch.from_numpy(test_data['poses']).float().to(self.device)
        konf_obsts = torch.from_numpy(test_data['konf_obsts']).float().to(
            self.device)

        n_data = len(poses)
        n_smpls_per_state = 100
        smpls = []
        print "Making samples..."
        stime = time.time()
        for i in range(n_smpls_per_state):
            if self.architecture == 'gnn':
                noise = torch.randn(n_data, self.n_dim_actions).to(self.device)
                new_smpls1 = self.generator(konf_obsts[:500], poses[:500],
                                            noise[:500])
                new_smpls2 = self.generator(konf_obsts[500:], poses[500:],
                                            noise[500:])
                new_smpls = torch.cat([new_smpls1, new_smpls2], dim=0)
            else:
                noise = torch.randn(n_data, self.n_dim_actions).to(self.device)
                new_smpls = self.generator(konf_obsts, poses, noise)
            smpls.append(new_smpls.cpu().detach().numpy())
        print "Sample making time", time.time() - stime
        smpls = np.stack(smpls)

        real_actions = test_data['actions']
        real_actions, real_mean, real_std = self.normalize_data(real_actions)

        real_data_scores = []
        entropies = []
        min_mses = []
        for idx in range(n_data):
            smpls_from_state = smpls[:, idx, :]
            smpls_from_state, _, _ = self.normalize_data(
                smpls_from_state, real_mean, real_std)
            real_action = real_actions[idx].reshape(-1, self.n_dim_actions)

            unnormalized_real_action = real_action * real_std + real_mean
            unnormalized_smpls_from_state = smpls_from_state * real_std + real_mean
            min_mse = self.measure_min_mse_between_samples_and_point(
                unnormalized_real_action, unnormalized_smpls_from_state)
            min_mses.append(min_mse)

            # fit the KDE - how likely is it that the real action comes from the learned distribution of smpls_from_state
            generated_model = KernelDensity(
                kernel='gaussian', bandwidth=0.1).fit(smpls_from_state)
            real_data_scores.append(generated_model.score(real_action))

            # measure the entropy
            if 'pick' in self.action_type:
                base_angles = unnormalized_smpls_from_state[:, 4:6]
                H, _, _ = np.histogram2d(base_angles[:, 0],
                                         base_angles[:, 1],
                                         bins=10,
                                         range=self.domain[:, 4:6].transpose())
            else:
                place_x, place_y = unnormalized_smpls_from_state[:,
                                                                 0], unnormalized_smpls_from_state[:,
                                                                                                   1]
                encoded_theta = unnormalized_smpls_from_state[:, 1:]
                # H_theta, _, _ = np.histogram2d(encoded_theta[:, 0], encoded_theta[:, 1], bins=10, range=self.domain[:, 2:].transpose())
                H, _, _ = np.histogram2d(place_x,
                                         place_y,
                                         bins=10,
                                         range=self.domain[:, 0:2].transpose())

                # I think the angle entropy is more important
                # For a given x,y, what is the entropy on the angles? I think entropy of angles
                # has more to say, because this is what we should get accurately.

            all_smpls_out_of_range = np.sum(H) == 0
            if all_smpls_out_of_range:
                entropy = np.inf
            else:
                prob = H / np.sum(H)
                entropy = sp.stats.entropy(prob.flatten())
            entropies.append(entropy)

        return np.mean(min_mses), np.mean(real_data_scores), np.mean(entropies)
Example #25
def make2D_KDE(X, n_samp = 1e5, bandwidth = None, n_folds = 3, bw_train_size = 1000, bw_range_size = 20, doplot = True):
	"""
	Make a 2D Kernel Density Estimation and draw a n_samp number of samples from it
	best bandwidth obtained from previous runs
	bandwidth = 0.0546938775510204
	bandwidth = 0.05894736842105264

	Input:
		X (2D numpy array): the training data, consisting of the Y - J and J - H 
		colours.\n
		n_samp (int): the number of samples to draw from the KDE. Default = 100000.\n
		bandwidth (float): the bandwidth to use for the KDE from which the samples
		will be drawn. Set to None to let the script find the best bandwidth. 
		Default = None.\n
		n_folds (int): the number of folds to use when determining the bandwidth.\n
		bw_train_size (int): size of the training set that will be used to 
		determine the best bandwidth. Default = 1000.\n
		bw_range_size (int); the amount of bandwidths to try out in the interval
		0.04 to 0.1. Default = 20.\n
		doplot (boolean): whether to make a hex-bin plot of the drawn samples or
		not. Default = True.

	Output:
		samples (2D numpy array): the samples drawn from the KDE.
	"""
	import matplotlib.pyplot as plt
	import seaborn as sns
	from sklearn.neighbors import KernelDensity
	from sklearn.model_selection import KFold
	from matplotlib import rcParams
	rcParams['font.family'] = 'Latin Modern Roman'
	from matplotlib.colors import LogNorm

	#shuffle the data
	np.random.shuffle(X)

	#determine the best bandwidth if it is not provided
	if bandwidth == None:
		#first we find the optimum bandwidth
		kf = KFold(n_splits = n_folds)

		#range of bandwidths to try
		bwrange = np.linspace(0.02, 0.08, bw_range_size)
		#the array which will store the likelyhood
		likelyhood = np.zeros(len(bwrange))
		
		print('Finding the best bandwidth...')
		for bw, i in zip(bwrange, np.arange(len(bwrange))):
			print('Iteration {0}, bandwidth {1}'.format(i, bw))
			lh = []
			#split the data into a train and test set using only the first 1000 samples
			for train_i, test_i in kf.split(X[:bw_train_size]):
				Xtrain, Xtest = X[train_i], X[test_i]
				kde = KernelDensity(bandwidth = bw, kernel = 'gaussian').fit(Xtrain)

				lhscore = kde.score(Xtest)
				
				lh = np.append(lh, lhscore)

				print('Bandwidth: {0}, score: {1}'.format(bw, lhscore))
				
			likelyhood[i] = np.mean(lh)

		plt.plot(bwrange, likelyhood)
		plt.xlabel('Bandwidth')
		plt.ylabel('Likelyhood')
		plt.title('KDE likelyhood for different bandwidths')
		plt.savefig('2D_KDE_likelyhood_run4.png', dpi = 300)
		plt.close()


		#find the bandwidth which gave the highest likelyhood
		bandwidth = bwrange[np.argmax(likelyhood)]

		print('Best bandwidth: {0}'.format(bandwidth))

	kde = KernelDensity(bandwidth = bandwidth, kernel = 'gaussian').fit(X)

	#pull samples from the kde
	samples = kde.sample(int(n_samp))
	
	#plot the samples in a hexbin plot
	if doplot:
		plt.hexbin(samples[:, 0], samples[:, 1], bins = 'log', cmap = 'Reds')
		plt.colorbar(label = 'Density of samples [logarithmic]')

		plt.xlabel('Y - J')
		plt.ylabel('J - H')
		plt.title('Distribution of samples in (Y-J, J-H) colour space')
		plt.savefig('Samples_distribution_hex.pdf', dpi = 300)
		plt.show()

	return samples
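A minimal usage sketch for the function above, passing a pre-chosen bandwidth so the cross-validation and plotting branches are skipped (hypothetical colour data; NumPy must be available as np, and matplotlib/seaborn/scikit-learn are imported inside the function itself):

import numpy as np

rng = np.random.RandomState(1)
X = rng.multivariate_normal([0.5, 0.3], [[0.02, 0.005], [0.005, 0.01]], size=2000)  # fake (Y-J, J-H) colours

samples = make2D_KDE(X, n_samp=1e4, bandwidth=0.055, doplot=False)  # skip bandwidth search and plotting
print(samples.shape)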
Example #26
class KDE():
    """Kernel density estimation (KDE) for accurate local density estimation.
    This is achieved by using maximum-likelihood estimation of the generative kernel density model
    which is regularized using cross-validation.


    Parameters
    ----------
    bandwidth: float, optional
        bandwidth for the kernel density estimation. If not specified, will be determined automatically using
        maximum likelihood on a test-set.

    nh_size: int, optional
        number of points in a typical neighborhood... only relevant for evaluating
        a crude estimate of the bandwidth. If run in combination with t-SNE, should be on
        the order of the perplexity.

    xtol,atol,rtol: float, optional
        precision parameters for kernel density estimates and bandwidth optimization determination.

    test_ratio_size: float, optional
        ratio of the test size for determining the bandwidth.
    """
    def __init__(self,
                 bandwidth=None,
                 test_ratio_size=0.1,
                 xtol=0.01,
                 atol=0.000005,
                 rtol=0.00005,
                 extreme_dist=False,
                 nn_dist=None):

        self.bandwidth = bandwidth
        self.test_ratio_size = test_ratio_size
        self.xtol = xtol
        self.atol = atol
        self.rtol = rtol
        self.extreme_dist = extreme_dist
        self.nn_dist = nn_dist

    def fit(self, X):
        """Fit kernel model to X"""
        if self.bandwidth is None:
            self.bandwidth = self.find_optimal_bandwidth(X)
        # (re)build the estimator with the chosen bandwidth before the final fit,
        # so the optimal bandwidth (not the last one tried) is actually used
        self.kde = KernelDensity(bandwidth=self.bandwidth,
                                 algorithm='kd_tree',
                                 kernel='gaussian',
                                 metric='euclidean',
                                 atol=self.atol,
                                 rtol=self.rtol,
                                 breadth_first=True,
                                 leaf_size=40)

        self.kde.fit(X)
        return self

    def evaluate_density(self, X):
        """Given an array of data, computes the local density of every point using kernel density estimation

        Input
        ------
        Data X : array, shape(n_sample,n_feature)

        Return
        ------
        Log of densities for every point: array, shape(n_sample)
        Return:
            kde.score_samples(X)
        """
        return self.kde.score_samples(X)

    def bandwidth_estimate(self, X):
        """Gives a rough estimate of the optimal bandwidth (based on the notion of some effective neigborhood)
        
        Return
        ---------
        bandwidth estimate, minimum possible value : tuple, shape(2)
        """
        if self.nn_dist is None:
            nn = NearestNeighbors(n_neighbors=2, algorithm='kd_tree')
            nn.fit(X)
            nn_dist, _ = nn.kneighbors(X, n_neighbors=2, return_distance=True)
        else:
            nn_dist = self.nn_dist

        h_min = np.mean(nn_dist[:, 1])
        h_max = 5 * h_min  # heuristic bound !! careful !!

        return h_max, h_min

    def find_optimal_bandwidth(self, X):
        """Performs maximum likelihood estimation on a test set of the density model fitted on a training set
        """
        from scipy.optimize import fminbound

        hest, hmin = self.bandwidth_estimate(X)
        print("[kde] Minimum bound = %.4f \t Rough estimate of h = %.4f" %
              (hmin, hest))

        X_train, X_test = train_test_split(X, test_size=self.test_ratio_size)
        args = (X_train, X_test)

        # We are trying to find reasonable tight bounds (hmin,1.5*hest) to bracket the error function minima
        if self.xtol > hmin:
            tmp = round_float(hmin)
            print(
                '[kde] Bandwidth tolerance (xtol) greater than minimum bound, adjusting xtol: %.5f -> %.5f'
                % (self.xtol, tmp))
            self.xtol = tmp

        h_optimal, score_opt, _, niter = fminbound(
            self.log_likelihood_test_set,
            hmin,
            1.5 * hest,
            args,
            maxfun=100,
            xtol=self.xtol,
            full_output=True)

        print("[kde] Found log-likelihood minima in %i evaluations, h = %.5f" %
              (niter, h_optimal))

        if self.extreme_dist is False:  # in the case of distribution with extreme variances in density, these bounds will fail ...
            assert abs(h_optimal - 1.5 *
                       hest) > 1e-4, "Upper boundary reached for bandwidth"
            assert abs(h_optimal -
                       hmin) > 1e-4, "Lower boundary reached for bandwidth"

        return h_optimal

    '''  def find_nh_size(self, X, h_optimal = None, n_estimate = 100):
        """ Given the optimal bandwidth from the CV score, finds the nh_size (using a binary search) which yield h_opt according 
        to the formula np.median(dist_to_nth_neighor) = h_opt
        """
        if h_optimal is None:
            h_optimal = self.bandwidth # should trigger a bug if this is not defined !

        nn = NearestNeighbors(n_neighbors = n_estimate, algorithm='kd_tree').fit(X)
        nn_dist, _ = self.nbrs.kneighbors(X, n_neighbors = 3*n_estimate)
        max_n = 3*n_estimate
        min_n = 0

        n_var = n_estimate
        while True: # performs binary search until convergence !
            h_est = np.median(nn_dist[:,n_var])
            print(n_var,'\t', h_est)
            if h_est > h_optimal:
                max_n = n_var
                change = round(0.5*(max_n - min_n))+min_n
                if change != n_var:
                    n_var = change
                else:
                    break 
            else:
                min_n = n_var
                change = round(0.5*(max_n - min_n))+min_n
                if change != n_var:
                    n_var = change
                else:
                    break
        return n_var 
    '''

    def log_likelihood_test_set(self, bandwidth, X_train, X_test):
        """Fit the kde model on the training set given some bandwidth and evaluates the log-likelihood of the test set
        """
        self.kde = KernelDensity(bandwidth=bandwidth,
                                 algorithm='kd_tree',
                                 atol=self.atol,
                                 rtol=self.rtol,
                                 leaf_size=40)
        self.kde.fit(X_train)
        return -self.kde.score(X_test)
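A minimal usage sketch for the KDE wrapper above (hypothetical data; it assumes the helpers the class relies on, namely NumPy, sklearn's KernelDensity, NearestNeighbors, train_test_split and the round_float utility, are importable in the same module):

import numpy as np

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, size=(500, 2)),
               rng.normal(4, 0.5, size=(500, 2))])    # two 2D Gaussian blobs

model = KDE().fit(X)                                  # bandwidth picked by maximum likelihood on a held-out split
log_density = model.evaluate_density(X)               # log-density at every point
print(model.bandwidth, log_density[:5])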
Example #27
    def apply(self):
        # [WUMBO] - Weighted UID-filtered Multi-Metric Based Outlier detection 
        ##############################################################################
        # Wumbo is an anomaly detector designed for large datasets with pockets of
        # common groups.
        #
        # Part of the output of Wumbo is the Outlier Score [0,1], the other part
        # is the weight, or Risk Score, a number that represents "Outlierness"
        # of the node.
        #
        # The algorithm requires no hyperparameters to be chosen except
        # alpha. 
        #
        # There are some optional arguments to select starting with filtering out similar
        # identities/uid's so that one identity/uid can't cluster with itself to poison 
        # the density values. 
        #
        # Additionally, the default measurements are kernel density, average distance, 
        # and max distance from some k-Nearest Neighbors where k is already
        # calculated by 5 <= sqrt(N) <= 50. This helps scale the dataset to a large
        # number of data points with common behavioral characteristics relative to the
        # size of the total, larger dataset.
        # 
        # These metrics are then combined and evaluated against the entire dataset
        # to find outliers and Score the weighted "Risk."
        ##############################################################################


        
        # Initialize a temporary and return dataframe 
        temp_df = self.dataframe.copy(deep=True)
        results_df = pd.DataFrame()
        
        # Calculate number of k-Neighbors 
        ##############################################################################
        # This is a number that is equal to the square root of distinct count of UID's
        # with a minimum of 5 and a maximum of 50. (Concept comes from t-SNE paper)
        # This way the number of k-Neighbors scales with the size of the data.
        ##############################################################################


        kNeighbors = min(max(int(len(self.dataframe[self.uidColumnName].unique()) ** 0.5),5),50)
        numberOfRows = len(self.dataframe.index)
    
    
        # Initialize feature columns
        if "avgDistance" in self.features:
            results_df["avgDistance"] = 0
        if "maxDistance" in self.features:
            results_df["maxDistance"] = 0
        if "localDensity" in self.features:
            results_df["localDensity"] = 0

        
        # Iterate through dataframe
        for x in range(numberOfRows):
            
            # Identify current UID Value
            iterVector: np.array
            
            # Filter (or not)
            if self.filter==True: 
                temp_df = self.dataframe.copy(deep=True)
                currentUID = temp_df.loc[x,self.uidColumnName]

                # Filter out UID's and convert to numpy
                iterVector = temp_df.loc[x].drop(self.uidColumnName).reset_index(drop=True).to_numpy().reshape(1,-1)
                neighbors = NearestNeighbors(n_neighbors=kNeighbors)
                neighbors.fit(temp_df.loc[(temp_df.index == x) | (temp_df[self.uidColumnName] != currentUID)].drop([self.uidColumnName], axis=1).to_numpy())
                distancesArray, ind = neighbors.kneighbors(iterVector, return_distance=True)
                
                nearestNeighborsArray = temp_df[temp_df.index.isin(ind[0])].drop(labels=self.uidColumnName,axis=1).to_numpy()

            else:
                currentUID = temp_df.loc[x,self.uidColumnName]
                #print(len(temp_df))
                # Find nearest neighbors
                iterVector = temp_df.loc[x].drop(self.uidColumnName).reset_index(drop=True).to_numpy().reshape(1,-1)
                neighbors = NearestNeighbors(n_neighbors=kNeighbors)
                neighbors.fit(temp_df.drop([self.uidColumnName], axis=1).to_numpy())
                distancesArray, ind = neighbors.kneighbors(iterVector, return_distance=True)
                
                nearestNeighborsArray = temp_df[temp_df.index.isin(ind[0])].drop(labels=self.uidColumnName,axis=1).to_numpy()
            
            
            
            # Calculate Features (for layer 1) based on Feature Values for Model
            resultsDict = self.calculateFeatures(distancesArray=distancesArray, nearestNeighborsArray=nearestNeighborsArray, iterVector=iterVector)

            resultsDict[self.uidColumnName] = currentUID
            results_df = results_df.append(resultsDict, ignore_index=True)
            


        output_df = self.dataframe.copy(deep=True)
        kde = KernelDensity(kernel='gaussian', bandwidth=1.0).fit(results_df[self.features].to_numpy())

        for x in range(numberOfRows):

            iterRow = results_df.loc[x].drop("uid").reset_index(drop=True).to_numpy().reshape(1,-1)    
                
            output_df.loc[x, "Risk Score"] = 1/(numberOfRows * 10 ** (kde.score(iterRow)))
            if 1/((numberOfRows * 10 ** (kde.score(iterRow)))) > (1/self.alpha):
                output_df.loc[x, "Outlier"] = 1
            else:
                output_df.loc[x, "Outlier"] = 0
        
        return output_df
Example #28
zero_test = test_imgs[test_labels == 1, :]
one_test = test_imgs[test_labels == 0, :]

#counting white pixels
train_count_zero = np.sum(zero_train > 25, axis=1)
train_count_one = np.sum(one_train > 25, axis=1)
test_count_zero = np.sum(zero_test > 25, axis=1)
test_count_one = np.sum(one_test > 25, axis=1)

kde1 = KernelDensity(kernel='gaussian',
                     bandwidth=0.2).fit(train_count_one.reshape(-1, 1))
kde0 = KernelDensity(kernel='gaussian',
                     bandwidth=0.2).fit(train_count_zero.reshape(-1, 1))

onescores_one = np.array(
    [kde1.score(i.reshape(-1, 1)) for i in test_count_one])
onescores_zero = np.array(
    [kde0.score(i.reshape(-1, 1)) for i in test_count_one])
zeroscores_one = np.array(
    [kde1.score(i.reshape(-1, 1)) for i in test_count_zero])
zeroscores_zero = np.array(
    [kde0.score(i.reshape(-1, 1)) for i in test_count_zero])

plt.subplot(3, 1, 1)
plt.hist(train_count_zero, 100)
plt.title('Histogram of Pixel Count for Digit 0')
plt.subplot(3, 1, 2)
plt.hist(train_count_one, 100)
plt.title('Histogram of Pixel Count for Digit 1')
plt.subplot(3, 1, 3)
sns.distplot(train_count_zero)
Example #29
def optimize_bd(dfGenome, dfPos, dfGene, outpath):

    "Bandwidth optimization by fitting the density to positive set"

    dfPos['mid'] = ((dfPos['end'] - dfPos['start']) / 2) + dfPos['start']

    chrs = list(dfGenome.chrom.unique())

    bdlist = list(np.linspace(1000, 1000000, 1000))

    sc = np.array([0.0] * (len(bdlist) + 1))

    for chrname in chrs:

        chrlen = int(dfGenome[dfGenome.chrom == chrname].length)

        N = dfPos[dfPos.chrom == chrname].shape[0]

        dfchr = dfGene[dfGene.chrom == chrname]

        dfPosChr = dfPos[dfPos.chrom == chrname]

        Xp = np.array(list(dfPosChr['mid']))[:, np.newaxis]

        X = np.array(list(dfchr['mid']))[:, np.newaxis]

        ## estimate the density at each 1000 bp

        X_plot = np.linspace(0, chrlen, int(chrlen / 1000))[:, np.newaxis]

        b = np.array([[0, 0]])

        print("optimization for", chrname)

        for bd in bdlist:

            kde = KernelDensity(kernel='gaussian', bandwidth=bd).fit(X)

            a = np.c_[bd, kde.score(Xp)]

            b = np.r_[b, a]

        sc[:] = sc[:] + b[:, 1]

    end = np.c_[bdlist, list(sc[1:, ])]

    idxrow = np.argwhere(end == max(end[:, 1]))[0, 0]
    newbd = int(end[idxrow, 0])

    print("the bandwith is", newbd)

    #plt.plot(bdlist, list(sc[1:,]))
    #plt.title("genome")
    #plt.xlabel("bandwidth (bp)")
    #plt.ylabel("log score of positive set")
    #plt.savefig(path + 'gene_density_optimization.png')
    #plt.close()

    dfout = pd.DataFrame({'A': bdlist, 'B': sc[1:, ]})

    dfout.to_csv(path_or_buf=outpath + "bandwidth_trials.txt",
                 sep='\t',
                 header=False,
                 index=False)

    return newbd
Example #30
class KDE:
    """Kernel density estimation (KDE) for accurate local density estimation.
    This is achieved by using maximum-likelihood estimation of the generative kernel density model
    which is regularized using cross-validation.


    Parameters
    ----------
    bandwidth: float, optional
        bandwidth for the kernel density estimation. If not specified, will be determined automatically using
        maximum likelihood on a test-set.

    nh_size : int, optional (default = 'auto')
        number of points in a typical neighborhood... only relevant for evaluating
        a crude estimate of the bandwidth.'auto' means that the nh_size is scaled with number of samples. We 
        use nh_size = 100 for 10000 samples. The minimum neighborhood size is set to 4.

    test_ratio_size: float, optional (default = 0.1)
        Ratio size of the test set used when performing maximum likelihood estimation.
        In order to have smooth density estimations (prevent overfitting), it is recommended to
        use a large test_ratio_size (closer to 1.0) rather than a small one.

    atol: float, optional (default = 0.000005)
        kernel density estimate precision parameter. determines the precision used for kde.
        smaller values leads to slower execution but better precision
    
    rtol: float, optional (default = 0.00005)
        kernel density estimate precision parameter. determines the precision used for kde.
        smaller values leads to slower execution but better precision
    
    xtol: float, optional (default = 0.01)
        precision parameter for optimizing the bandwidth using maximum likelihood on a test set

    test_ratio_size: float, optional
        ratio of the test size for determining the bandwidth.

    kernel: str, optional (default='gaussian')
        Type of kernel to use for density estimates. Other options are {'epanechnikov', 'linear', 'tophat'}.
    """
    def __init__(self,
                 nh_size='auto',
                 bandwidth=None,
                 test_ratio_size=0.1,
                 xtol=0.01,
                 atol=0.000005,
                 rtol=0.00005,
                 extreme_dist=False,
                 nn_dist=None,
                 kernel='gaussian'):

        self.bandwidth = bandwidth
        self.nh_size = nh_size
        self.test_ratio_size = test_ratio_size
        self.xtol = xtol
        self.atol = atol
        self.rtol = rtol
        self.extreme_dist = extreme_dist
        self.nn_dist = nn_dist
        self.kernel = kernel  # epanechnikov other option

    def fit(self, X):
        """Fit kernel model to X"""

        if self.nh_size == 'auto':
            self.nh_size = max([int(25 * np.log10(X.shape[0])), 4])

        if X.shape[1] > 8:
            print(
                'Careful, you are trying to do density estimation for data in a D > 8 dimensional space\n ... you are warned !'
            )

        if self.bandwidth is None:
            self.bandwidth = self.find_optimal_bandwidth(X)
        # (re)build the estimator with the chosen bandwidth before the final fit,
        # so the optimal bandwidth (not the last one tried) is actually used
        self.kde = KernelDensity(bandwidth=self.bandwidth,
                                 algorithm='kd_tree',
                                 kernel=self.kernel,
                                 metric='euclidean',
                                 atol=self.atol,
                                 rtol=self.rtol,
                                 breadth_first=True,
                                 leaf_size=40)
        self.kde.fit(X)
        return self

    def evaluate_density(self, X):
        """Given an array of data, computes the local density of every point using kernel density estimation

        Input
        ------
        Data X : array, shape(n_sample,n_feature)

        Return
        ------
        Log of densities for every point: array, shape(n_sample)
        Return:
            kde.score_samples(X)
        """
        return self.kde.score_samples(X)

    def bandwidth_estimate(self, X_train, X_test):
        """Gives a rough estimate of the optimal bandwidth (based on the notion of some effective neigborhood)
        
        Return
        ---------
        bandwidth estimate, minimum possible value : tuple, shape(2)
        """

        if self.nn_dist is None:
            nn = NearestNeighbors(n_neighbors=self.nh_size,
                                  algorithm='kd_tree')
            nn.fit(X_train)
            nn_dist, _ = nn.kneighbors(X_test,
                                       n_neighbors=self.nh_size,
                                       return_distance=True)
        else:
            nn_dist = self.nn_dist

        dim = X_train.shape[1]

        # Computation of minimum bound
        # This can be computed by taking the limit h -> 0 and making a saddle-point approx.
        mean_nn2_dist = np.mean(nn_dist[:, 1] * nn_dist[:, 1])
        h_min = np.sqrt(mean_nn2_dist / dim)

        idx_1 = np.random.choice(np.arange(len(X_train)),
                                 size=min([1000, len(X_train)]),
                                 replace=False)
        idx_2 = np.random.choice(np.arange(len(X_test)),
                                 size=min([1000, len(X_test)]),
                                 replace=False)

        max_size = min([len(idx_1), len(idx_2)])

        tmp = np.linalg.norm(X_train[idx_1[:max_size]] -
                             X_test[idx_2[:max_size]],
                             axis=1)

        h_max = np.sqrt(np.mean(tmp * tmp) / dim)
        h_est = 10 * h_min
        return h_est, h_min, h_max

    def find_optimal_bandwidth(self, X):
        """Performs maximum likelihood estimation on a test set of the density model fitted on a training set
        """
        from scipy.optimize import fminbound
        X_train, X_test = train_test_split(X, test_size=self.test_ratio_size)
        args = (X_test, )

        hest, hmin, hmax = self.bandwidth_estimate(X_train, X_test)

        print(
            "[kde] Minimum bound = %.4f \t Rough estimate of h = %.4f \t Maximum bound = %.4f"
            % (hmin, hest, hmax))

        # We are trying to find reasonable tight bounds (hmin, 4.0*hest) to bracket the error function minima
        # Would be nice to have some hard accurate bounds
        self.xtol = round_float(hmin)

        print(
            '[kde] Bandwidth tolerance (xtol) set to precision of minimum bound : %.5f '
            % self.xtol)

        self.kde = KernelDensity(algorithm='kd_tree',
                                 atol=self.atol,
                                 rtol=self.rtol,
                                 leaf_size=40,
                                 kernel=self.kernel)

        self.kde.fit(X_train)

        # hmax is the upper bound, however, heuristically it appears to always be way above the actual bandwidth. hmax*0.2 seems much better but still conservative
        h_optimal, score_opt, _, niter = fminbound(
            self.log_likelihood_test_set,
            hmin,
            hmax * 0.2,
            args,
            maxfun=100,
            xtol=self.xtol,
            full_output=True)

        print(
            "[kde] Found log-likelihood maximum in %i evaluations, h = %.5f" %
            (niter, h_optimal))

        if self.extreme_dist is False:  # These bounds should always be satisfied ...
            assert abs(h_optimal -
                       hmax) > 1e-4, "Upper boundary reached for bandwidth"
            assert abs(h_optimal -
                       hmin) > 1e-4, "Lower boundary reached for bandwidth"

        return h_optimal

    # @profile
    def log_likelihood_test_set(self, bandwidth, X_test):
        """Fit the kde model on the training set given some bandwidth and evaluates the negative log-likelihood of the test set
        """
        self.kde.bandwidth = bandwidth
        # l_test = len(X_test)
        return -self.kde.score(
            X_test[:2000]
        )  # X_test[np.random.choice(np.arange(0, l_test), size=min([int(0.5*l_test), 1000]), replace=False)]) # this should be accurate enough !
Example #31
dist = np.sqrt(
    np.sum(np.square(y_reconstructed - test_latents).reshape(
        len(test_latents), -1),
           axis=1))
sns.distplot(dist)
pred_save(dist, PRED_FOLDER + 'prediction_unet_vae_pca_reconstruced.csv')

# %%
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
print('TSNE fitting...')
tsne = TSNE(n_components=2, random_state=SEED, verbose=True)
y_TSNE = tsne.fit_transform(test_latents)
plt.scatter(y_TSNE[:, 0], y_TSNE[:, 1], s=1)

rmse_tsne_test = np.sqrt(
    np.square(y_TSNE[:, 0] - np.mean(y_TSNE[:, 0])) +
    np.square(y_TSNE[:, 1] - np.mean(y_TSNE[:, 1])))
sns.distplot(rmse_tsne_test)
pred_save(rmse_tsne_test, PRED_FOLDER + 'prediction_unet_vae_tsne_rmse.csv')
# %%
from sklearn.neighbors import KernelDensity
kd = KernelDensity()
kd.fit(test_latents)
score = [kd.score(i.reshape(1, -1)) for i in test_latents]
score = score - np.min(score)
sns.distplot(score)
pred_save(score, PRED_FOLDER + 'prediction_unet_vae_latentkd.csv')

# %%
Example #32
def getKDE(userJson):
    # arrayFilePath = "../data/test2.txt"
    # vector = np.loadtxt(arrayFilePath, dtype=np.float32)
    #
    # X_row = np.size(vector, 0)  # number of rows of X
    # X_col = np.size(vector, 1)  # number of columns of X
    #
    # # compute the distance between every pair of high-dimensional vectors
    # dis = []
    # for i in range(X_row):
    #     vec1 = vector[i]
    #     for j in range(i + 1, X_row):
    #         vec2 = vector[j]
    #         dis_c = np.sqrt(np.sum(np.square(vec1 - vec2)))
    #         dis.append([dis_c])
    words = userJson["words"][0:100]  # only take the user's first 100 queries
    similarValueList = []
    for index in range(len(words)):
        pair = words[index]
        for key, value in pair.items():
            sentence = key  # the full sentence
            wordList1 = value  # list with one word per entry

        # pairwise-compare all words
        for index in range(index+1, len(words)):
            pair2 = words[index]

            for key, value in pair2.items():

                sentence2 = key
                wordList2 = value

            # compare the similarity of the two short sentences by splitting them and comparing pairwise
            # similarValue = model.similarity(sentence, sentence2)

            # compare every word pair between the two sentences, then take the weighted average of the k most similar values

            similar_K = []
            minlen = min(len(wordList1), len(wordList2))  # use the smaller length

            # the complexity here may be too high
            for word1 in wordList1:
                for word2 in wordList2:
                    curSimilar = model.wv.similarity(word1,word2)
                    similar_K.append(curSimilar)

            # take the minlen largest similarities, sum and average them
            similar_K.sort()
            similar_K.reverse()
            similarValue = sum(similar_K[0:minlen])/minlen #

            similarValueList.append(similarValue)


    global maxValue
    global minValue
    maxValue = np.max(similarValueList)
    minValue = np.min(similarValueList)

    # normalize to 0-1
    # dis3 = MaxMinNormalization(similarValueList, maxValue, minValue)
    dis3 = similarValueList
    # print(dis)
    print(dis3)
    print(len(similarValueList))
    # standard deviation
    stdValue = np.std(dis3)

   # -----------------------------------------------------------

    X = []  # convert 1D to 2D
    for item in dis3:
        X.append([item])
    N = len(dis3)
    maxValue3 = np.max(dis3)
    minValue3 = np.min(dis3)

    # create an evenly spaced sequence of N points to use as the x axis
    X_plot = np.linspace(minValue3 - 1, maxValue3 + 1, N)[:, np.newaxis]

    # true density

    fig, ax = plt.subplots()

    # a reasonable bandwidth needs to be computed here
    # bandwidth is approximately 1/N^(0.2) * stdValue
    bandwidth = 1 / pow(N, 0.2) * stdValue

    print("bandwidth,N",bandwidth, N)
    # for kernel in ['gaussian', 'tophat', 'epanechnikov']:
    for kernel in ['gaussian']:
        kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(X)  # bandwidth=0.008
        log_dens = kde.score_samples(X_plot)
        exp_dens = np.exp(log_dens)
        ax.plot(X_plot[:, 0], np.exp(log_dens), '-',
                label="kernel = '{0}'".format(kernel))

    ax.text(6, 0.38, "N={0} points".format(N))

    ax.legend(loc='upper left')

    # ax.plot(X[:, 0], -0.005 - 0.01 * np.random.random(X.shape[0]), '+k')

    ax.set_xlim(minValue3, maxValue3)
    ax.set_ylim(-0.02, 10)

    plt.show()

    density = np.exp(kde.score([[0.5]]))
    # personalized access probability
    # p = 1/N * sum(kde.score)
    # density * bandwidth gives an approximate probability; strictly this should be an integral
    probability = density * bandwidth
    print(probability)

    return kde
Example #33
    print("Kernel bandwidth:")
    bw = np.random.uniform(1, 5)
    print(bw)

    print("Our KDE:")
    my_kde = TruncatedNormalKernelDensity(bandwidth=bw)
    my_kde.fit(x)
    print(my_kde.score_samples(y))
    print(my_kde.score(y))

    print("SciKitLearn KDE:")
    skl_kde = KernelDensity(kernel='gaussian', bandwidth=bw)
    skl_kde.fit(x)
    print(skl_kde.score_samples(y))
    print(skl_kde.score(y))

    print("Test that truncation works:")
    y_vals = sorted(y)
    up = y_vals[5]
    low = y_vals[2]

    print(f"With upperbound {up}:")
    up_kde = TruncatedNormalKernelDensity(bandwidth=bw, upperbound=up)
    up_kde.fit(x)
    print(up_kde.score_samples(y))

    print(f"With lowerbound {low}:")
    low_kde = TruncatedNormalKernelDensity(bandwidth=bw, lowerbound=low)
    low_kde.fit(x)
    print(low_kde.score_samples(y))