Пример #1
0
def Manhattan_classify(train, test2, K=1):
    '''Classify the test set with a Manhattan-distance nearest-neighbour vote.

    For every row of ``test2`` the training samples are ranked with
    ``Manhattan_distance`` and the ``K`` closest ones vote on the class.
    The predictions are written, together with a header row, to
    ``./task1_test_Manhattan.csv``.

    Args:
        train: training samples, passed unchanged to ``Manhattan_distance``.
        test2: 2-D array-like of test rows; columns 0-3 are used as features
            and position 4 of every row receives the prediction in the output.
        K: number of nearest neighbours that vote (default 1, the value that
            used to be hard-coded).

    Returns:
        None. The result is only written to disk.

    Note:
        The rows of ``test2`` are copied before the prediction is stored, so
        the caller's data is left untouched.
    '''
    features = np.array(test2)[:, 0:4]

    predict = []  # predicted class of every test sample, in input order
    for sample in features:
        neighbours = Manhattan_distance(train, sample)[:K]
        zeros = sum(1 for _, lab in neighbours if lab == 0)
        ones = len(neighbours) - zeros
        # Strict '>' means a tie (possible for even K) is resolved as class 1.
        predict.append(0 if zeros > ones else 1)

    # Build fresh rows instead of writing into the caller's lists.
    rows = []
    for original_row, predicted in zip(test2, predict):
        row = list(original_row)
        row[4] = predicted
        rows.append(row)

    title = [
        'Recency (months)', 'Frequency (times)', 'Monetary (c.c. blood)',
        'Time (months)', 'My prediction'
    ]
    submit2 = pd.DataFrame(data=[title] + rows)
    # The title is already the first data row, so pandas must not add its own
    # header or index column.
    submit2.to_csv('./task1_test_Manhattan.csv',
                   encoding='gbk',
                   header=False,
                   index=False)
Пример #2
0
def Chebyshev_distance(train_data, data):
    '''Rank every training sample by its Chebyshev distance to one sample.

    Args:
        train_data: 2-D array-like; columns 0-3 are read as features and
            column 4 as the class label.
        data: 1-D array-like; its first four entries are the query features.

    Returns:
        A list of ``[distance, label]`` pairs sorted by ascending distance.
        Pairs with equal distance keep their training-set order because the
        sort is stable.
    '''
    a1 = np.array(train_data)
    td = a1[:, 0:4]
    label = a1[:, 4]
    d = np.array(data)[0:4]

    # Chebyshev distance = largest absolute per-feature difference; the
    # subtraction broadcasts the query over all training rows at once.
    distances = np.abs(td - d).max(axis=1)

    # Pair each distance with the label of the training row it came from.
    dis_lab = [[dist, lab] for dist, lab in zip(distances, label)]
    return sorted(dis_lab, key=lambda pair: pair[0])
Пример #3
0
def Euclidean_distance(train_data, data):
    '''Rank every training sample by its Euclidean distance to one sample.

    Args:
        train_data: 2-D array-like; columns 0-3 are read as features and
            column 4 as the class label.
        data: 1-D array-like; its first four entries are the query features.

    Returns:
        A list of ``[distance, label]`` pairs sorted by ascending distance.
        Pairs with equal distance keep their training-set order because the
        sort is stable.
    '''
    a1 = np.array(train_data)
    td = a1[:, 0:4]
    label = a1[:, 4]
    d = np.array(data)[0:4]

    # One L2 norm per training row; the subtraction broadcasts the query
    # over all rows so no Python-level loop is needed.
    distances = np.linalg.norm(td - d, axis=1)

    # Pair each distance with the label of the training row it came from.
    dis_lab = [[dist, lab] for dist, lab in zip(distances, label)]
    return sorted(dis_lab, key=lambda pair: pair[0])
Пример #4
0
def Manhattan_distance(train_data, data):
    '''Rank every training sample by its Manhattan distance to one sample.

    Args:
        train_data: 2-D array-like; columns 0-3 are read as features and
            column 4 as the class label.
        data: 1-D array-like; its first four entries are the query features.

    Returns:
        A list of ``[distance, label]`` pairs sorted by ascending distance.
        Pairs with equal distance keep their training-set order because the
        sort is stable.
    '''
    a1 = np.array(train_data)
    td = a1[:, 0:4]
    label = a1[:, 4]
    d = np.array(data)[0:4]

    # Manhattan distance = sum of absolute per-feature differences; the
    # subtraction broadcasts the query over all training rows at once.
    distances = np.abs(td - d).sum(axis=1)

    # Pair each distance with the label of the training row it came from.
    dis_lab = [[dist, lab] for dist, lab in zip(distances, label)]
    return sorted(dis_lab, key=lambda pair: pair[0])
Пример #5
0
def Mahalanobis_distance(train_data, data):
    '''Rank every training sample by its Mahalanobis-style distance to one sample.

    The distance of a training row ``s`` to the query ``d`` is
    ``sqrt(p . p)`` with ``p = (d - s) . A``, where ``A`` is a module-level
    matrix defined outside this function.

    Args:
        train_data: 2-D array-like; columns 0-3 are read as features and
            column 4 as the class label.
        data: 1-D array-like; its first four entries are the query features.

    Returns:
        A list of ``[distance, label]`` pairs sorted by ascending distance.
        Pairs with equal distance keep their training-set order because the
        sort is stable.
    '''
    A1 = np.array(A)
    a1 = np.array(train_data)
    td = a1[:, 0:4]
    label = a1[:, 4]
    d = np.array(data)[0:4]

    dis_lab = []  # [distance, label] for every training row
    for sample, lab in zip(td, label):
        projected = np.dot(d - sample, A1)
        # projected is a vector, so its dot product with itself is the
        # squared length; no transpose is needed for a 1-D array.
        dis_lab.append([math.sqrt(np.dot(projected, projected)), lab])
    return sorted(dis_lab, key=lambda pair: pair[0])
Пример #6
0
def decide_label(train_data, val_data):
    '''Pick the K with the best validation accuracy for the Mahalanobis KNN.

    Every odd K from 1 to 29 is evaluated: each validation sample is
    classified by a vote among its K nearest training samples (as ranked by
    ``Mahalanobis_distance``), and the share of correct predictions is
    recorded. The accuracy-vs-K curve is plotted and shown.

    Args:
        train_data: training samples, passed unchanged to
            ``Mahalanobis_distance``.
        val_data: 2-D array-like; columns 0-3 are used as features and
            column 4 as the true label.

    Returns:
        The K with the highest accuracy; on ties the smallest such K.

    Note:
        If fewer than K training samples exist, the vote uses all of them
        instead of raising an IndexError.
    '''
    total = data_process.count_total(val_data)
    a = np.array(val_data)
    data = a[:, 0:4]
    label = a[:, 4]
    candidate_ks = list(range(1, 30, 2))

    # The neighbour ranking of a sample does not depend on K, so compute it
    # once per sample instead of once per (K, sample) pair.
    ranked_labels = [
        [pair[1] for pair in Mahalanobis_distance(train_data, sample)]
        for sample in data
    ]

    corr = []  # accuracy for every candidate K, in the same order
    for K in candidate_ks:
        correct = 0
        for neighbour_labels, truth in zip(ranked_labels, label):
            nearest = neighbour_labels[:K]
            zeros = sum(1 for lab in nearest if lab == 0)
            ones = len(nearest) - zeros
            # Strict '>' means anything but a clear majority of 0s yields 1.
            predicted = 0 if zeros > ones else 1
            if predicted == truth:
                correct += 1
        corr.append(correct / total)

    # Plot how K influences the accuracy.
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    fig.suptitle('K of Mahalanobis', fontsize=14, fontweight='bold')
    ax.set_xlabel("K")
    ax.set_ylabel("correction rate")
    plt.plot(candidate_ks, corr)
    plt.show()

    # list.index returns the first maximum, i.e. the smallest best K.
    return candidate_ks[corr.index(max(corr))]