Example #1
def simple_CV_test(bins, N, i):
    # Split the sample in half, build a histogram estimate on each half,
    # and return the sum of squared differences between the two estimates.
    random.seed(i)
    sampled_data = get_data(N)
    half = int(N / 2)
    sampled_data_exchange_1 = sorted(sampled_data[0:half])
    sampled_data_exchange_2 = sorted(sampled_data[half:N])
    pred_distribution_1 = Hist_new(half, sampled_data_exchange_1, bins)
    pred_distribution_2 = Hist_new(half, sampled_data_exchange_2, bins)
    return sum_of_squire(pred_distribution_1, pred_distribution_2,
                         len(pred_distribution_1))
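Examples #1 and #24 call a sum_of_squire helper that is defined elsewhere. A minimal sketch, assuming it returns the sum of squared differences between two equal-length sequences (the original spelling of the name is kept):

# Hypothetical reconstruction of the elided helper; the actual definition may differ.
def sum_of_squire(a, b, n):
    return sum((a[i] - b[i]) ** 2 for i in range(n))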
Example #2
def requirement1():

    global min_range
    global max_range

    ds = [100, 500, 1000, 10000]
    b = 100
    h = 0.1
    k = 10

    xs = np.linspace(min_range, max_range, 200)

    # Histograms for growing sample sizes
    legends = []
    plot_true_distribution(1000)
    legends.append('True distribution')
    for d in ds:
        data = get_data(d)
        plt.hist(data, density=True, bins=b, alpha=0.4)
        legends.append('#bin = ' + str(b) + ', #data = ' + str(d))
    plt.legend(legends)
    plt.title('Requirement 1-1')
    plt.savefig('req1-1', dpi=300)
    plt.show()

    # KDE for growing sample sizes
    plt.figure()
    legends = []
    plot_true_distribution(1000)
    legends.append('True distribution')
    for d in ds:
        data = get_data(d)
        density = kde(data)
        density.set_bandwidth(h)
        plt.plot(xs, density(xs))
        legends.append('h = ' + str(h) + ', #data = ' + str(d))
    plt.legend(legends)
    plt.title('Requirement 1-2')
    plt.savefig('req1-2', dpi=300)
    plt.show()

    # kNN for growing sample sizes
    plt.figure()
    legends = []
    plot_true_distribution(1000)
    legends.append('True distribution')
    for d in ds:
        data = get_data(d)
        density = knn(data, k)
        plt.plot(xs, density(xs))
        legends.append('k = ' + str(k) + ', #data = ' + str(d))
    plt.legend(legends)
    plt.ylim([0, 0.4])
    plt.title('Requirement 1-3')
    plt.savefig('req1-3', dpi=300)
    plt.show()
Example #3
def KDE(NUM, h, c, l):
    # Gaussian kernel density estimate, evaluated on a fixed 500-point grid.
    sampled_data = get_data(NUM)
    minvalue = min(sampled_data) - 3
    maxvalue = max(sampled_data) + 3
    bins = 500  # alternatively: int((maxvalue - minvalue) / h)
    x = np.linspace(minvalue, maxvalue, bins)
    y = np.zeros(x.shape, dtype=float)  # np.float was removed in NumPy 1.24
    for i in sampled_data:
        y += (1 / NUM) * (1 / (2 * math.pi * h * h) ** 0.5) \
             * math.e ** (-(x - i) ** 2 / (2 * h ** 2))
    plt.plot(x, y, color=c, label=l)
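For reference, the loop above accumulates the standard Gaussian kernel density estimate over the grid, one sample x_n at a time:

\hat{p}(x) = \frac{1}{N} \sum_{n=1}^{N} \frac{1}{\sqrt{2\pi h^2}} \exp\!\left(-\frac{(x - x_n)^2}{2h^2}\right)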
Example #4
def show_k_influence():
  N = 200
  sampled_data = get_data(N)
  # Ks = [2, 5, 10, 20, 30, 40, 50, 60, 80, 100]
  Ks = [2, 5, 20, 30]
  fig = plt.figure(figsize=(12, 6))
  for i, K in enumerate(Ks):
    plt.subplot(4, 1, i+1)
    plt.ylim(0, 0.35)
    plt.ylabel('K = {}'.format(K))
    draw_nearest(N, K)
    gm_plot(gm1d, N)
  plt.show()
Example #5
def KDE(num_sample, h):
    sample_data = get_data(num_sample)
    xlist = np.linspace(min(sample_data), max(sample_data), 2 * num_sample)
    ylist = np.zeros_like(xlist)
    for i, x in enumerate(xlist):
        total = 0  # avoid shadowing the builtin sum()
        for x_compare in sample_data:
            total += Gaussian(x, x_compare, h)
        ylist[i] = total / num_sample
    plt.plot(xlist, ylist)
    plt.xlabel("x")
    plt.ylabel("y")
Example #6
def kde(num_data, h):
    sampled_data = get_data(num_data)
    xs = np.linspace(
        min(sampled_data) - 3 * np.std(sampled_data),
        max(sampled_data) + 3 * np.std(sampled_data), 2000)
    ys = np.zeros_like(xs)
    for i, x in enumerate(xs):
        for xi in sampled_data:
            ys[i] += exp(-pow(x - xi, 2) /
                         (2 * h * h)) / (sqrt(2 * pi * h * h) * num_data)
    plt.plot(xs, ys)
    plt.xlabel("x")
    plt.ylabel("p(x)")
    plt.show()
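The double loop above performs len(xs) * num_data Python-level iterations. The same estimate can be computed with NumPy broadcasting; a sketch under the same assumptions (get_data as the shared data source, identical grid and kernel):

import numpy as np

def kde_vectorized(num_data, h):
    data = np.asarray(get_data(num_data))
    xs = np.linspace(data.min() - 3 * data.std(),
                     data.max() + 3 * data.std(), 2000)
    # (2000, 1) grid minus (num_data,) samples broadcasts to (2000, num_data)
    ys = np.exp(-(xs[:, None] - data) ** 2 / (2 * h * h)).sum(axis=1) \
         / (np.sqrt(2 * np.pi) * h * num_data)
    return xs, ys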
Example #7
def kNN_matrix(N=200, K=10):
    assert K > 0
    assert K <= N

    data = get_data(N)

    x = np.linspace(min(data), max(data), 100)
    distance = np.abs(x - np.reshape(data, (N, 1)))
    px = K / N * 0.5 / np.sort(distance, axis=0)[K - 1, :]

    plt.plot(x, px, label="kNN_matrix")
    plt.legend()
    plt.title("N = %d, K = %d" % (N, K))
    gm1d.plot()
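This is the vectorized kNN density estimator: with d_K(x) the distance from x to its K-th nearest sample, the smallest interval around x containing K samples has length 2 d_K(x), giving

\hat{p}(x) = \frac{K}{N \cdot 2\, d_K(x)}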
Example #8
def KFold_cross_validation_KDE(num_sample):
    # Grid-search the bandwidth h over 0.01, 0.02, ..., 1.00 with 3-fold CV:
    # for each fold, build a KDE from the training part and from the test
    # part, and score h by the mean squared difference between the two curves.
    sample_data = get_data(num_sample)
    kf = KFold(n_splits=3)
    h_test = 0
    minCV = float('inf')
    h_ideal = 0
    xlist = np.linspace(min(sample_data), max(sample_data), 200)
    for _ in range(100):
        h_test += 0.01
        CV = 0
        for train_index, test_index in kf.split(sample_data):
            train = [sample_data[idx] for idx in train_index]
            test = [sample_data[idx] for idx in test_index]
            y_train = np.zeros_like(xlist)
            y_test = np.zeros_like(xlist)
            for j, x in enumerate(xlist):
                y_train[j] = sum(Gaussian(x, xc, h_test) for xc in train) / len(train)
                y_test[j] = sum(Gaussian(x, xc, h_test) for xc in test) / len(test)
            MSE = 0
            for j in range(len(xlist)):
                MSE += math.pow(y_train[j] - y_test[j], 2)
            CV += MSE / len(xlist)
        if CV < minCV:
            minCV = CV
            h_ideal = h_test
    print(h_ideal)
    KDE(num_sample, h_ideal)
Example #9
def kernel_density_estimate(N=100, h=0.35, show=True):
    assert h > 0

    data = get_data(N)

    x = np.linspace(min(data), max(data), 1000)
    px = np.sum(np.exp(np.square(x - np.reshape(data, (N, 1))) / (-2 * h**2)),
                axis=0) / (np.sqrt(2 * np.pi) * h) / N

    plt.plot(x, px, label="kernel density estimate")
    plt.legend()
    plt.title("N = %d, h = %f" % (N, h))
    plot_gm1d()
    if show:
        plt.show()
Example #10
def task1(bins: int = 50, para_h: float = 0.2, k: int = 20):
    sample_data1 = get_data(100)
    sample_data2 = get_data(500)
    sample_data3 = get_data(1000)
    sample_data4 = get_data(10000)

    plt.subplot(3, 2, 1)
    plt.title("num_data=100")
    show_all(sample_data1, bins, para_h, k)

    plt.subplot(3, 2, 2)
    plt.title("num_data=500")
    show_all(sample_data2, bins, para_h, k)

    plt.subplot(3, 2, 5)
    plt.title("num_data=1000")
    show_all(sample_data3, bins, para_h, k)

    plt.subplot(3, 2, 6)
    plt.title("num_data=10000")
    show_all(sample_data4, bins, para_h, k)

    plt.show()
    return
Example #11
def nnde(num_data, k):
    sampled_data = get_data(num_data)
    xs = np.linspace(
        min(sampled_data) - 3 * np.std(sampled_data),
        max(sampled_data) + 3 * np.std(sampled_data), 2000)
    ys = np.zeros_like(xs)
    for i, x in enumerate(xs):
        dist = sorted(abs(x - xi) for xi in sampled_data)
        # dist[k - 1] is the distance to the k-th nearest sample
        ys[i] = k / (num_data * 2 * (dist[k - 1] + 1e-9))
    plt.plot(xs, ys)
    plt.xlabel("x")
    plt.ylabel("p(x)")
    plt.show()
Example #12
def Kernel(num,N,h):
    np.random.seed(0)
    output_data=[]
    h_2=h**2
    para=1/(float(N)*mt.sqrt(2*mt.pi*h_2))
    sampled_data = get_data(N)
    for x in np.linspace(0,50,num):
        output_data.append(KernelGaussian(x,sampled_data,h_2,para))
    plt.plot(np.linspace(0,50,num),output_data)
    gm1d = GaussianMixture1D(mode_range=(0, 50))
    gm1d.plot(200)
    plt.show()
    return output_data
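Kernel (and M_KDE in Example #29) depends on a KernelGaussian(x, data, h_2, para) helper that is not shown. A minimal sketch, assuming para carries the 1/(N sqrt(2 pi h^2)) normalisation computed by the callers:

import math as mt

# Hypothetical reconstruction of the elided helper; the actual definition may differ.
def KernelGaussian(x, data, h_2, para):
    return para * sum(mt.exp(-(x - xi) ** 2 / (2 * h_2)) for xi in data)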
Example #13
def kNN_kdtree(N=200, K=10, show=True):
    assert K > 0
    assert K <= N

    data = get_data(N)
    tree = KDTree(np.reshape(data, (N, 1)))

    x = np.linspace(min(data), max(data), 100).reshape((100, 1))
    dist, _ = tree.query(x, k=K, p=1)
    px = K / N * 0.5 / dist[:, K - 1]  # distance to the K-th nearest neighbour

    plt.plot(x, px, label="kNN_kdtree")
    plt.legend()
    plt.title("N = %d, K = %d" % (N, K))
    plot_gm1d()
    if show:
        plt.show()
Example #14
def kNN(num_data=200, K=10):
    data = sorted(get_data(num_data))

    x = np.linspace(min(data), max(data), 100)
    px = []
    left = 0
    right = K - 1
    for xi in x:
        # slide the K-sample window right while that moves it closer to xi
        while right < num_data - 1 and data[right + 1] + data[left] < 2 * xi:
            right = right + 1
            left = left + 1
        px.append(0.5 / max(data[right] - xi, xi - data[left]))
    px = np.array(px) * K / num_data

    plt.plot(x, px, label="K = %d" % (K))
    plt.legend()
    plt.title("N = %d, K = %d" % (num_data, K))
    plt.savefig("img/knn_sample_"+str(num_data)+"_k_"+str(K)+".png") 
Example #15
def show_bin_method():
  N = 200
  sampled_data = get_data(N)
  stdev = np.std(sampled_data)
  # Sturge’s Rule   k = 1+log2(N)
  # Scott’s Rule    h = 3.49σN^(−1/3)
  # Rice’s Rule     k = pow(N, 1/3)*2
  names = ['Sturge’s Rule', 'Scott’s Rule', 'Rice’s Rule', '', '', '', '']
  bin_num = [int( 1 + np.ceil(np.log2(N)) ),
            int(np.ceil( (max(sampled_data) - min(sampled_data)) / (3.49*stdev/np.power(N, 1.0/3.0)) )),
            int(np.ceil( np.power(N, 1.0/3.0)*2 )), 20, 25, 30, 50]
  print(bin_num)
  fig = plt.figure(figsize=(6, 10))
  for i, bins in enumerate(bin_num):
    plt.subplot(3, 3, i+1)
    plt.title(names[i])
    plt.ylabel(bins)
    draw_hist(N, bins)
  plt.show()
Example #16
def show_h_influence():
  N = 100
  sampled_data = get_data(N)
  sampled_data = sorted(sampled_data)
  distance = 0
  for i, sample in enumerate(sampled_data[1:]):
    distance += sample - sampled_data[i]  # sample is sampled_data[i+1]
  distance /= (N - 1)
  print(distance)
  # choose sqrt(avg(distance)) * 2
  hs = [0.05, 0.1, 0.2, 0.3, 0.5, 1.0, 1.5, np.power(distance, 0.5)*2]
  fig = plt.figure(figsize=(12, 6))
  for i, h in enumerate(hs):
    plt.subplot(4, 2, i+1)
    plt.ylabel(h)
    if i == 7:
      plt.xlabel('sqrt(average_interval) * 2')
    draw_kernel(N, h)
  plt.show()
Example #17
def gauss_kernel(num_data=100, h=None, ptype="varh", num_inter=2000):
    sampled_data = get_data(num_data)
    mini, maxi = min(sampled_data), max(sampled_data)
    interval = maxi - mini
    x_list = np.linspace(mini - interval * 0.05, maxi + interval * 0.05,
                         num_inter)
    if h is None:
        h = find_maxli(sampled_data)

    p_list = []
    for x in x_list:
        p = calc_density(x, sampled_data, h)
        p_list.append(p)

    if ptype == "varh":
        plt.title("gauss h={:.2f}".format(h))
    else:
        plt.title("gauss n={}".format(num_data))
    plt.plot(x_list, p_list)
Example #18
def compare(min_range=10000, max_range=50001):
    xT = range(min_range, max_range, 2000)
    yT_kernel = []
    yT_IFGT = []
    xs = np.linspace(min_range, max_range, 10000)
    for x in xT:
        sampled_data = get_data(x)
        T = time.time()
        kernel(sampled_data, xs, h=0.1389)
        yT_kernel.append(time.time() - T)
        T = time.time()
        IFGT(sampled_data, xs, h=0.1389, K=100)
        yT_IFGT.append(time.time() - T)

    plt.title('Time Comparison')
    plt.plot(xT, yT_kernel, color='blue', label='direct kernel sum')
    plt.plot(xT, yT_IFGT, color='red', label='IFGT')
    plt.legend()
    plt.xlabel("number of samples")
    plt.ylabel("Time")
    plt.show()
Example #19
def Kernel_Density_Estimation(num_data, h):
    sampled_data = get_data(num_data)
    min_range = min(sampled_data) - 3
    max_range = max(sampled_data) + 3
    xs = np.linspace(min_range, max_range, 2000)
    ys = np.zeros_like(xs)

    index = 0
    for x in xs:
        tmp = 0
        for xn in sampled_data:
            tmp += m.exp(-(m.pow(x-xn, 2))/(2*m.pow(h, 2)))/(m.sqrt(2*m.pi*m.pow(h, 2)))
        ys[index] = tmp / num_data
        index += 1

    plt.title("num_data = %d & h = %f" % (num_data, h))
    plt.plot(xs, ys)
    plt.xlabel("x")
    plt.ylabel("p(x)")
    plt.show()
Example #20
def task4():
    sample_data = get_data(200)
    # A 2-fold cross-validation search over k = 1..30 (scoring each k with
    # task4_corss_validation on the folds) was tried first; the square-root
    # heuristic below is used instead.
    k = int(math.sqrt(len(sample_data)))
    # k = 15
    plt.title("k={}".format(k))
    knn_method(sample_data, k)
Example #21
def histogram_bins_selection():
    num_data = 200
    sample_data = get_data(num_data)

    rules = [square_root_choice, sturges_formula, rice_rule,
             scotts_normal_reference_rule, shimazaki_and_shinomoto]
    for rule in rules:
        num_bins = int(rule(sample_data))
        title = rule.__name__ + ": bins=" + str(num_bins) + ", num_sd=200"
        if rule is shimazaki_and_shinomoto:
            plt.cla()  # clear the axes before the final rule
        histogram_estimation(num_bins=num_bins,
                             sample_data=sample_data,
                             status=True,
                             title=title)
Example #22
def requirement2():
    global min_range
    global max_range

    data = get_data(200)
    bs = [2, 10, 30]

    legends = []

    plot_true_distribution()
    legends.append('True Distribution')

    # Plotting histogram with different bins
    for b in bs :
        plt.hist(data, density=True, bins=b, alpha=0.4)
        legends.append('bins = ' + str(b))

    plt.title('Requirement 2')
    plt.legend(legends)
    plt.show()
Example #23
def knn(num_sample, K):
    sample_data = get_data(num_sample)
    xlist = np.linspace(min(sample_data), max(sample_data), 2 * num_sample)
    ylist = np.zeros_like(xlist)
    integration = 0
    for i, x in enumerate(xlist):
        dis_list = np.abs(np.asarray(sample_data) - x)
        dis_list.sort()
        # dis_list[K - 1] is the distance to the K-th nearest sample
        ylist[i] = K / (num_sample * 2 * max(dis_list[K - 1], 0.001))
        # Riemann-sum check of how close the estimate comes to integrating to 1
        integration += (max(sample_data) - min(sample_data)) * ylist[i] / (2 * num_sample)
    print(integration)
    plt.plot(xlist, ylist)

    plt.xlabel("x")
    plt.ylabel("y")
Example #24
def simple_CV(bins, N):
    # CV score: build a histogram on the second half of the data, sample
    # test_num points from it, and compare them with the first half.
    test_num = int(N / 2)
    sampled_data = get_data(N)
    sampled_data_exchange = sampled_data[test_num:N]
    sampled_data_exchange.sort()
    size = 20 / float(bins)  # bin width, assuming the data lie in [20, 40]

    pred_distribution = Hist_new(N - test_num, sampled_data_exchange, bins)
    pred = []
    for i in range(test_num):
        # inverse-CDF sampling from the estimated histogram
        s = random.random()
        j = 0
        while s > 0 and j < len(pred_distribution):
            s = s - pred_distribution[j]
            j = j + 1
        pred.append(j * size + 20)
    return sum_of_squire(pred, sampled_data[0:len(pred)], test_num)
Example #25
def KDE_CV(NUM, h):
    # Hold-out validation: fit a Gaussian KDE on the first 80% of the data
    # and score the bandwidth h by the negative log-likelihood of the rest.
    sampled_data = get_data(NUM)
    train_data_size = NUM * 8 // 10
    train_data = sampled_data[0:train_data_size]
    valid_data = np.asarray(sampled_data[train_data_size:NUM])
    valid_data_size = NUM - train_data_size
    y = np.zeros(valid_data_size, dtype=float)  # np.float was removed in NumPy 1.24

    for i in train_data:
        y = y + (1 / train_data_size) * (
            1 / (2 * math.pi * h * h) ** 0.5) * math.e ** (-(valid_data - i) ** 2 /
                                                           (2 * h ** 2))
    loss = 0
    for i in range(valid_data_size):
        loss = loss - math.log(y[i])
    return loss
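KDE_CV scores a bandwidth by the held-out negative log-likelihood, loss(h) = -\sum_{x \in \text{valid}} \log \hat{p}_{\text{train},h}(x), so smaller is better. Selecting h then amounts to minimising over a grid; a sketch (the grid is an illustrative choice, and note that each call draws a fresh sample via get_data):

hs = [0.05 * i for i in range(1, 21)]
best_h = min(hs, key=lambda h: KDE_CV(200, h))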
Example #26
def histogram_exploration():
    num_data = 200
    sample_data = get_data(num_data)

    for num_bins in [10, 25, 100]:
        title = "bins=" + str(num_bins) + ", num_sd=200"
        histogram_estimation(num_bins=num_bins,
                             sample_data=sample_data,
                             status=False,
                             title=title)
Example #27
def Nearest_Neighbor_Estimation(num_data, k):
    sampled_data = get_data(num_data)
    min_range = min(sampled_data) - 3
    max_range = max(sampled_data) + 3
    xs = np.linspace(min_range, max_range, 2000)
    ys = np.zeros_like(xs)

    for index, x in enumerate(xs):
        data_list = sorted(abs(x - xn) for xn in sampled_data)
        # data_list[k - 1] is the distance to the k-th nearest sample
        ys[index] = k / (num_data * 2 * (data_list[k - 1] + 1e-10))

    plt.title("num_data = %d & k = %d" % (num_data, k))
    plt.plot(xs, ys)
    plt.xlabel("x")
    plt.ylabel("p(x)")
    plt.show()
Example #28
def KernelDensity(num_data, h):
    sampled_data = get_data(num_data)
    min_range = min(sampled_data) - 3
    max_range = max(sampled_data) + 3
    xs = np.linspace(min_range, max_range, 2000)
    ys = np.zeros_like(xs)
    index = 0
    for x in xs:
        tmp = 0
        for xn in sampled_data:
            tmp += np.exp(-(np.power(x-xn, 2))/(2*np.power(h, 2)))/(np.sqrt(2*np.pi*np.power(h, 2)))
        ys[index] = tmp / num_data
        index += 1

    plt.title("num_data = %d & h = %f" % (num_data, h))
    plt.plot(xs, ys)
    plt.xlabel("x")
    plt.ylabel("p(x)")
    plt.savefig("img/kernel_sample_"+str(num_data)+"_h_"+str(h)+".png") 
    plt.close()
Example #29
def M_KDE(h):
    # Least-squares CV score for bandwidth h: the integral of the squared
    # KDE minus twice the average leave-one-out density at the samples.
    N = 100
    sample_data = get_data(N)
    h_2 = h ** 2
    para = 1 / (float(N) * mt.sqrt(2 * mt.pi * h_2))
    para_i = 1 / (float(N - 1) * mt.sqrt(2 * mt.pi * h_2))

    def KDE_new(x):
        return (KernelGaussian(x, sample_data, h_2, para)) ** 2

    the_first = scipy.integrate.quad(KDE_new, 20, 40)
    the_second = 0
    for flag, x in enumerate(sample_data):
        tp_sample_data = sample_data[:]
        del tp_sample_data[flag]  # leave-one-out copy
        the_second = the_second + KernelGaussian(x, tp_sample_data, h_2, para_i)
    the_second = the_second * 2 / N

    M0 = the_first[0] - the_second
    return M0
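M_KDE implements the least-squares cross-validation criterion: with \hat{p}_h the KDE built from all N samples and \hat{p}_{h,-i} the leave-one-out estimate,

M(h) = \int_{20}^{40} \hat{p}_h(x)^2 \, dx - \frac{2}{N} \sum_{i=1}^{N} \hat{p}_{h,-i}(x_i)

Up to a constant that does not depend on h, this estimates the integrated squared error, so the h minimising M(h) is chosen. Example #30 applies the same criterion to the kNN estimator.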
Example #30
def M_k_NN(K):
    # Least-squares CV score for K in the kNN estimator.
    N = 500
    sample_data = get_data(N)
    sample_data.sort()

    def NNM_new(x):
        return (KNN_Pro(x, sample_data, N - 1, K)) ** 2

    the_first = scipy.integrate.quad(NNM_new, 20, 40)
    the_second = 0
    for flag, x in enumerate(sample_data):
        tp_sample_data = sample_data[:]
        del tp_sample_data[flag]  # leave-one-out copy
        the_second = the_second + KNN_Pro(x, tp_sample_data, N - 1, K)
    the_second = the_second * 2 / N

    M0 = the_first[0] - the_second
    return M0
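M_k_NN assumes a KNN_Pro(x, data, n, K) helper that is not shown. A minimal sketch, assuming it returns the kNN density K / (n * 2 d_K(x)) with n the normalising count passed by the caller:

# Hypothetical reconstruction of the elided helper; the actual definition may differ.
def KNN_Pro(x, data, n, K):
    d_k = sorted(abs(x - xi) for xi in data)[K - 1]  # K-th nearest distance
    return K / (n * 2 * max(d_k, 1e-9))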