#y = 20 + x ** 2 * np.cos(0.5 * x) * int(x < 0) + 10 * np.sin(x) * np.cos(x) * x * int(0 <= x) y = -x return y y = [datagenproc(i) for i in x] ynoise = [datagenproc(i) + 2 * np.random.randn(1)[0] for i in x] #ynoise = np.random.randn(len(x)) # correlation coeff print('r_xy=\n', np.corrcoef(np.array([x, ynoise]).T, rowvar=0)) # get Silvermans opt bandwidth h_x = kde.kde_pdf(x=None, sampledata=np.array(x), kerneltype='epanechnikov', biascorrected=False, getsilverman=True) h_y = kde.kde_pdf(x=None, sampledata=np.array(y), kerneltype='epanechnikov', biascorrected=False, getsilverman=True) # check independence tstat = nptests.nptests_ahmadli(xdata=x, ydata=y, bandwidthx=h_x, bandwidthy=h_y, kerneltype='epanechnikov') print('tstat=', tstat)
# Bivariate sample: element-wise squares of 50 multivariate-normal draws
# (mu, sigma and sampleUni are defined earlier in the script, outside this excerpt).
sampleMulti = np.random.multivariate_normal(mu, sigma, size=50) ** 2

# Evaluation grids: one for the univariate estimate, two axes for the bivariate one.
support = np.arange(0, 40, 0.8)
supportx1 = np.arange(-4, 4, 0.2)
supportx2 = np.arange(-3, 3, 0.2)

# Accumulators for the univariate results
fhatUni = []
ci_low = []
ci_high = []
fUni = []

# Univariate: at every grid point store the estimate, its confidence bounds
# and the chi-square(20) density that serves as the reference curve.
for x in support:
    # confidint=True makes kde_pdf hand back (estimate, lower bound, upper bound)
    hat, low, high = kde.kde_pdf(
        x,
        sampleUni,
        'epanechnikov',
        kernelorder=2,
        biascorrected=True,
        correctboundarybias=False,
        flowbound=0,
        bandwidth=None,
        confidint=True,
    )
    fhatUni.append(hat)
    ci_low.append(low)
    ci_high.append(high)
    # Reference density at the same point
    fUni.append(stats.chi2.pdf(x=x, df=20))

print('\nUnivariate estimation is done.\n')

# Multivariate: outer list runs over supportx2, inner list over supportx1.
fhatMulti = [
    [
        kde.kde_pdf(
            np.array([u, v]),
            sampleMulti,
            'epanechnikov',
            correctboundarybias=False,
            flowbound=np.array([0, 0]),
            bandwidth=np.array([0.5, 0.5]),
            confidint=True,
        )
        for u in supportx1
    ]
    for v in supportx2
]
# Bivariate normal density on the same grid, same nesting as fhatMulti
fMulti = [
    [stats.multivariate_normal.pdf([u, v], mean=mu, cov=sigma) for u in supportx1]
    for v in supportx2
]

print('Multivariate estimation is done.\n')
with open(r'C:\Users\Máté\Dropbox\CEU\2017 Spring\Nonparametric\Nonparametric_ps2\bweight_age_white_20K.txt', 'r')\ as myfile: filein = myfile.readlines() # break the lines and convert into np array rawdata = np.array([float(line.split()[0]) for line in filein]) # weight weight_w = rawdata[0:int(len(rawdata) / 2)] # age age_w = rawdata[int(len(rawdata) / 2):] # Size n = len(age_b) # Get Silverman's bandwidth for Gaussian and Epa kernels # gaussian h_gau_weight_b = kde.kde_pdf(x=None, sampledata=weight_b, kerneltype='gaussian', kernelorder=2, getsilverman=True) h_gau_weight_w = kde.kde_pdf(x=None, sampledata=weight_w, kerneltype='gaussian', kernelorder=2, getsilverman=True) h_gau_age_b = kde.kde_pdf(x=None, sampledata=age_b, kerneltype='gaussian', kernelorder=2, getsilverman=True) h_gau_age_w = kde.kde_pdf(x=None, sampledata=age_w, kerneltype='gaussian', kernelorder=2, getsilverman=True) # epa h_epa_weight_b = kde.kde_pdf(x=None, sampledata=weight_b, kerneltype='epanechnikov', kernelorder=2, getsilverman=True) h_epa_weight_w = kde.kde_pdf(x=None, sampledata=weight_w, kerneltype='epanechnikov', kernelorder=2, getsilverman=True) h_epa_age_b = kde.kde_pdf(x=None, sampledata=age_b, kerneltype='epanechnikov', kernelorder=2, getsilverman=True) h_epa_age_w = kde.kde_pdf(x=None, sampledata=age_w, kerneltype='epanechnikov', kernelorder=2, getsilverman=True) # Compute the test statistics for different kernels and scaled bandwidths # summary to print as table at the end matrixtoprint = np.empty((3, 4)) matrixindex = 0 subsamplesize = 100 for scale in [1 / 3, 1, 3]:
weight_w_train.mean()) / weight_w_train.std() # test age_b_test_st = (age_b_test - age_b_test.mean()) / age_b_test.std() age_w_test_st = (age_w_test - age_w_test.mean()) / age_w_test.std() weight_b_test_st = (weight_b_test - weight_b_test.mean()) / weight_b_test.std() weight_w_test_st = (weight_w_test - weight_w_test.mean()) / weight_w_test.std() # Time of preparing data start_time_lscv_b = time.time() print('Preparing data: %s seconds\n' % (start_time_lscv_b - start_time)) # ###################### TRAINING ############################## # Bandwidth kerneltype = 'epanechnikov' h_age_b = kde.kde_pdf(x=None, sampledata=age_b_train_st, kerneltype=kerneltype, getsilverman=True) h_age_w = kde.kde_pdf(x=None, sampledata=age_w_train_st, kerneltype=kerneltype, getsilverman=True) h_weight_b = kde.kde_pdf(x=None, sampledata=weight_b_train_st, kerneltype=kerneltype, getsilverman=True) h_weight_w = kde.kde_pdf(x=None, sampledata=weight_w_train_st, kerneltype=kerneltype, getsilverman=True) print('Silvermans bandwidht, blacks, age and weight:', [h_age_b, h_weight_b]) print('Silvermans bandwidht, whites, age and weight:', [h_age_w, h_weight_w])
# For each x # index i_x = 0 # loop for x in [1, 1.5, 2]: fx = stats.chi2.pdf(x=x, df=1) # for each sample size # index i_n = 0 # loop for n in [50, 100, 250, 500]: print('Currently working on x=', x, ' and n=', n, '\n') # get Silverman's bandwidth, using the average of 40 samples of size n h_s = kde.kde_pdf( x=None, sampledata=np.mean( [np.random.chisquare(df=1, size=n) for i in range(40)], 0), kerneltype='epanechnikov', getsilverman=True) # iteration for m in range(M): # draw sample sample = np.random.chisquare(df=1, size=n) # estimate fhat and ci for Silverman and rescaled Silverman bandwidth # (for the coverage probability fhat is not need only the CI; fhat is needed only for # the rescaled bias computation) fhatt, ci_low, ci_high = kde.kde_pdf(x=x, sampledata=sample, kerneltype='epanechnikov', bandwidth=None, correctboundarybias=False, biascorrected=False,
# Monte Carlo bias of the kernel density estimator for f ~ uniform[0, 1].
# M, n, fhat_m, Efhat and bias are set up earlier in the script (outside this
# excerpt); fhat_m is indexed with columns 0..4 and Efhat/bias with one row per x.

# True f(x) for f~uniform[0,1]
f = 1

# Seed random
np.random.seed([0])

# The four (bandwidth, kerneltype) combinations, in column order 0..3 of fhat_m.
# The last entry used to repeat (0.25, 'epanechnikov'), which duplicated column 1
# and left the Gaussian kernel with bandwidth 0.25 unevaluated.
kde_specs = [
    (0.1, 'epanechnikov'),
    (0.25, 'epanechnikov'),
    (0.1, 'gaussian'),
    (0.25, 'gaussian'),
]

# For each x
# index
i = 0
# loop
for x in [0, 0.05, 0.1, 0.5, 0.6]:
    # iteration
    for m in range(M):
        # draw sample of size n
        sample = np.random.rand(n)
        # compute the kernel density estimation, given the sample, for (bandwidth, kerneltype) tuples
        fhat_m[m, 0:4] = [kde.kde_pdf(x=x, sampledata=sample, bandwidth=bandwidth, kerneltype=kerneltype,
                                      biascorrected=False)
                          for bandwidth, kerneltype in kde_specs]
        # mean over (bandwidth,kerneltype) tuples
        fhat_m[m, 4] = np.mean(fhat_m[m, 0:4])

    # average over iterations
    Efhat[i, :] = np.mean(fhat_m, 0)

    # Compute bias
    bias[i, :] = Efhat[i, :] - f

    # increase row index for the next x
    i = i + 1
filein = myfile.readlines() # break lines and convert into numpy array bw_white = np.array([float(line.split()[0]) for line in filein]) # black with open(r'C:\Users\Máté\Dropbox\CEU\2017 Spring\Nonparametric\Nonparametric_ps1\bweight_black_20K.txt', 'r') \ as myfile: filein = myfile.readlines() # break lines and convert into numpy array bw_black = np.array([float(line.split()[0]) for line in filein]) # Part (a) # # kernel density estimation for whites, Epa kernel, Silverman's bandwidth, plot # domain support = np.arange(0, bw_white.max() + 0.3 * bw_white.std(), 100) # estimation fhat_white_s = [kde.kde_pdf(x=x, sampledata=bw_white, kerneltype='epanechnikov', bandwidth=None, bandwidthscale=None, biascorrected=False, correctboundarybias=False) for x in support] # plot kde.kde_plot(fhat=fhat_white_s, ismultiple=False, fsupport=support, plottitle='KDE of white birthweight, Epa. with ' 'Silverman\'s bandwidth', xlabel='gram', ylabel='$\hat{f}(x)$', savemode=True, filepath=r'C:\Users\Máté\Dropbox\CEU\2017 Spring\Nonparametric\Nonparametric_ps1\Problem1_a', viewmode=True) # Part (b) # # kernel density estimation for whites, Epa kernel, Silverman's bandwidth times 1/5 and 5, plot # estimation fhat_white_s_1over5 = [kde.kde_pdf(x=x, sampledata=bw_white, kerneltype='epanechnikov', bandwidth=None, bandwidthscale=1 / 5, biascorrected=False, correctboundarybias=False) for x in support] fhat_white_s_5 = [kde.kde_pdf(x=x, sampledata=bw_white, kerneltype='epanechnikov', bandwidth=None, bandwidthscale=5, biascorrected=False, correctboundarybias=False) for x in support]
def _expand_bandwidth(bandwidth, dim):
    """Return ``bandwidth`` as a plain list with exactly one value per variable."""
    import numpy as np

    values = np.atleast_1d(bandwidth).ravel().tolist()
    # A single value (scalar, 0-d or 1-element array/list) applies to every variable
    if len(values) == 1:
        values = values * dim
    if len(values) != dim:
        raise ValueError('expected 1 or %d bandwidths, got %d' % (dim, len(values)))
    return values


def nptests_ahmadli(xdata, ydata, bandwidthx, bandwidthy, kerneltype, kernelorder=2, subsamplesize=None,
                    getpvalue=False):
    """
    Ahmad & Li nonparametric test of statistical independence of two random variables, which can be multivariate

    The statistic is n**2 * prod(bandwidths) * Itilde / sqrt(2 * k), where Itilde = I1 + I2 + I3 is built from
    leave-one-out kernel density estimates (joint, product of marginals, cross term) and k is the sum over all
    ordered pairs i != j of the squared product kernels of x and y.

    :param xdata: sample data on x, n*d_x sized (or length n if univariate), n: number of observations,
        d_x number of variables in x
    :param ydata: sample data on y, n*d_y sized (or length n if univariate), n: number of observations,
        d_y number of variables in y
    :param bandwidthx: scalar or list of length d_x, bandwidths to use in the kernel. If scalar, the same bandwidth is
        used for all variables in x
    :param bandwidthy: scalar or list of length d_y, bandwidths to use in the kernel. If scalar, the same bandwidth is
        used for all variables in y
    :param kerneltype: string, a name of kernel from kernels.py
    :param kernelorder: order of the kernel
    :param subsamplesize: if given the test uses only subsamplesize randomly chosen elements of the original sample
    :param getpvalue: if True, p-value is meant to be returned as well
        (NOTE(review): only the getpvalue=False return is implemented in this section - confirm the rest)
    :return: the test statistic when getpvalue is False
    :raises ValueError: if x and y differ in length, fewer than two observations are used, or a bandwidth list does
        not match the number of variables
    """

    # Import dependencies
    import numpy as np
    from nonparaecon.kde import kde_pdf
    from nonparaecon import kernels

    # Work on arrays so that plain lists can be subsampled and indexed uniformly
    xdata = np.asarray(xdata)
    ydata = np.asarray(ydata)
    if len(xdata) != len(ydata):
        raise ValueError('xdata and ydata must have the same number of observations')

    # Subsample to use
    if subsamplesize is not None:
        # seed random
        # NOTE: this reseeds NumPy's *global* generator; kept because callers may rely on the fixed subsample
        np.random.seed([0])
        # permute observations and keep required size
        permindex = np.random.permutation(len(ydata))
        keep = permindex[0:subsamplesize]
        xdata = xdata[keep]
        ydata = ydata[keep]

    # Get sizes
    n = len(xdata)
    if n < 2:
        raise ValueError('at least two observations are needed for leave-one-out estimation')
    d_x = xdata.shape[1] if xdata.ndim > 1 else 1
    d_y = ydata.shape[1] if ydata.ndim > 1 else 1

    # Joint array: column_stack accepts any mix of 1-D and 2-D inputs
    xyarray = np.column_stack([xdata, ydata])

    # Expand bandwidths to one value per variable and join them for the joint density.
    # Plain lists are used so that "+" concatenates even if arrays were passed in.
    bandwidthx_exp = _expand_bandwidth(bandwidthx, d_x)
    bandwidthy_exp = _expand_bandwidth(bandwidthy, d_y)
    bandwidthxy = bandwidthx_exp + bandwidthy_exp

    # Components of Itilde
    loo_scale = (n - 1) / n

    def loo_density(data, bandwidth):
        # Scaled leave-one-out density estimate at every observation of ``data``
        return np.array([loo_scale * kde_pdf(x=data[i], sampledata=data, kerneltype=kerneltype,
                                             bandwidth=bandwidth, kernelorder=kernelorder, biascorrected=False,
                                             leaveoneout=True, leftoutindex=i)
                         for i in range(n)])

    fxy = loo_density(xyarray, bandwidthxy)
    fx = loo_density(xdata, bandwidthx)
    fy = loo_density(ydata, bandwidthy)

    itilde1 = fxy.mean()
    # mean over all (i, j) of fx[i] * fy[j] factorises into the product of the two means,
    # so the marginals are estimated n times each instead of n**2 times
    itilde2 = fx.mean() * fy.mean()
    itilde3 = -2 * (fx * fy).mean()

    # Numerator
    itilde = itilde1 + itilde2 + itilde3
    bandwidthproduct = np.prod(np.array(bandwidthxy))
    numerator = bandwidthproduct * itilde

    # Denominator
    # k
    # get kernel
    kernel = getattr(kernels, kerneltype)

    # View both samples as n * d so univariate and multivariate data share one code path
    x2d = xdata.reshape(n, -1)
    y2d = ydata.reshape(n, -1)

    def squared_product_kernel(data, bandwidths, i, j):
        # Squared product kernel between observations i and j across all variables
        return np.prod([kernel((data[i, d] - data[j, d]) / bandwidths[d], kernelorder)
                        for d in range(len(bandwidths))]) ** 2

    k = np.array([squared_product_kernel(x2d, bandwidthx_exp, i, j) *
                  squared_product_kernel(y2d, bandwidthy_exp, i, j)
                  for j in range(n) for i in range(n) if j != i]).sum()

    denominator = np.sqrt(2 * k)

    # Test statistics
    teststat = n ** 2 * numerator / denominator

    if not getpvalue:
        return teststat