import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from densratio import densratio

from Conversion import CSV
from pu import PU  # repo-local uPU/PUSB model; import path assumed from this repo's layout


def experiment():
    percent_p = 3
    path = r'C:\Users\yyveggie\Desktop\UCI\Conversion\mushroom.csv'
    seed = 2019
    est_error_upu = []
    est_binary_upu = []
    est_binary_pusb = []
    est_error_pusb = []
    est_error_drsb = []
    est_binary_drsb = []
    for k in range(10):
        np.random.seed(seed)
        # Class prior: the fraction of P inside U. Open question: should this be
        # computed from the proportions constructed below instead of being fixed
        # arbitrarily in advance?
        pi = 0.6
        classifier = LogisticRegression(C=0.01, penalty='l2', solver='liblinear')
        texts_1, texts_0 = CSV(path)
        texts_1 = np.array_split(texts_1, 10)  # split the positive samples into ten folds
        texts_0 = np.array_split(texts_0, 10)  # split the negative samples into ten folds
        x_test = np.array(list(texts_1[k]) + list(texts_0[k]))  # test x: a different fold each iteration
        t_test = np.array(list(len(texts_1[k]) * [1]) + list(len(texts_0[k]) * [0]))  # test y: positives 1, negatives 0
        index_rest = sorted(set(range(10)) - set([k]))  # fold indices left after removing the test fold
        texts_1 = np.array(texts_1)
        texts_0 = np.array(texts_0)
        texts_1 = np.array([j for i in texts_1[index_rest] for j in i])  # p
        texts_0 = np.array([j for i in texts_0[index_rest] for j in i])  # n
        x = np.vstack((texts_1, texts_0))  # p and n together form the training set
        one = np.ones((len(x), 1))
        x_pn = np.concatenate([x, one], axis=1)  # append a bias column
        t = pd.Series([1] * len(texts_1) + [0] * len(texts_0))
        classifier.fit(x_pn, t)
        x_train = x
        t_train = t

        # Sample a biased labeled-positive set: positives the PN classifier scores
        # higher are more likely to be labeled (selection bias).
        xp = x_train[t_train == 1]
        one = np.ones((len(xp), 1))
        xp_temp = np.concatenate([xp, one], axis=1)
        xp_prob = classifier.predict_proba(xp_temp)[:, 1]
        # xp_prob /= np.mean(xp_prob)
        xp_prob = xp_prob ** 20
        xp_prob /= np.max(xp_prob)
        rand = np.random.uniform(size=len(xp))
        temp = xp[xp_prob > rand]
        pdata = int(percent_p / 10 * len(x))  # number of P samples: 3/10 of the total
        while len(temp) < pdata:
            rand = np.random.uniform(size=len(xp))
            temp = np.concatenate([temp, xp[xp_prob > rand]], axis=0)
        xp = temp
        perm = np.random.permutation(len(xp))
        xp = xp[perm[:pdata]]

        # Build the unlabeled set U according to the class prior.
        u = int(6 / 10 * len(x))  # number of U samples: 6/10 of the total
        updata = int(u * pi)      # number of P in U = |U| * class prior
        undata = u - updata       # number of N in U = |U| - number of P in U
        xp_temp = x_train[t_train == 1]
        xn_temp = x_train[t_train == 0]
        perm = np.random.permutation(len(xp_temp))
        xp_temp = xp_temp[perm[:updata]]
        perm = np.random.permutation(len(xn_temp))
        xn_temp = xn_temp[perm[:undata]]
        xu = np.concatenate([xp_temp, xn_temp], axis=0)
        x = np.concatenate([xp, xu], axis=0)
        tp = np.ones(len(xp))
        tu = np.zeros(len(xu))
        t = np.concatenate([tp, tu], axis=0)

        # Resample the test set so that its class proportions match the prior.
        updata = int(1000 * pi)
        undata = 1000 - updata
        xp_test = x_test[t_test == 1]
        perm = np.random.permutation(len(xp_test))
        xp_test = xp_test[perm[:updata]]
        xn_test = x_test[t_test == 0]
        perm = np.random.permutation(len(xn_test))
        xn_test = xn_test[perm[:undata]]
        x_test = np.concatenate([xp_test, xn_test], axis=0)
        tp = np.ones(len(xp_test))
        tu = np.zeros(len(xn_test))
        t_test = np.concatenate([tp, tu], axis=0)

        # uPU (quant=False) and PUSB (quant=True) on the same kernel model.
        pu = PU(pi=pi)
        x_train = x
        res, x_test_kernel = pu.optimize(x, t, x_test)
        acc1, f1_binary1 = pu.test(x_test_kernel, res, t_test, quant=False)
        acc2, f1_binary2 = pu.test(x_test_kernel, res, t_test, quant=True, pi=pi)

        # DRSB: direct density-ratio estimation, thresholded at the (1 - pi) quantile.
        result = densratio(x_train[t == 1], x_train[t == 0])
        r = result.compute_density_ratio(x_test)
        temp = np.sort(r)  # np.sort returns a sorted copy
        theta = temp[int(np.floor(len(x_test) * (1 - pi)))]
        pred = np.zeros(len(x_test))
        pred[r > theta] = 1
        acc3 = np.mean(pred == t_test)
        f1_binary3 = f1_score(t_test, pred, average='binary')

        est_error_upu.append(acc1)
        est_binary_upu.append(f1_binary1)
        est_error_pusb.append(acc2)
        est_binary_pusb.append(f1_binary2)
        est_error_drsb.append(acc3)
        est_binary_drsb.append(f1_binary3)
        seed += 1
        print("Iter:", k)
        print("upu_accuracy ", acc1)
        print("upu_f1_binary ", f1_binary1)
        print("pusb_accuracy ", acc2)
        print("pusb_f1_binary ", f1_binary2)
        print("drsb_accuracy ", acc3)
        print("drsb_f1_binary ", f1_binary3)
    print("Accuracy for uPU:", np.mean(est_error_upu))
    print("F1-Score for uPU:", np.mean(est_binary_upu))
    print("Accuracy for PUSB:", np.mean(est_error_pusb))
    print("F1-Score for PUSB:", np.mean(est_binary_pusb))
    print("Accuracy for DRSB:", np.mean(est_error_drsb))
    print("F1-Score for DRSB:", np.mean(est_binary_drsb))
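The DRSB branch above turns estimated density ratios into hard labels by thresholding at the (1 - pi) quantile, so that exactly a pi fraction of the test points is predicted positive. A minimal standalone sketch of that step, extracted for clarity (the helper name quantile_threshold_predict is ours, not from this codebase):

import numpy as np

def quantile_threshold_predict(r, pi):
    """Label the top pi fraction of points, ranked by density ratio, as positive.

    r  : array of estimated density ratios, one per test point
    pi : class prior, i.e. the fraction of points to label positive
    """
    theta = np.sort(r)[int(np.floor(len(r) * (1 - pi)))]  # (1 - pi) quantile
    pred = np.zeros(len(r))
    pred[r > theta] = 1
    return pred

Thresholding at a quantile instead of at a fixed ratio value means the predicted positive rate always matches the assumed prior, regardless of the scale of the estimated ratios.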
'''CSV file format'''
from Conversion import CSV
from sklearn.svm import SVC
import numpy as np
import random

f1 = []
accuracy = []
path = r'C:\Users\yyveggie\Desktop\UCI\Conversion\mushroom.csv'
k = 5    # k nearest neighbors
T = 4.8  # threshold

texts_1, texts_0 = CSV(path)

def SplitData(k, texts_1, texts_0):  # 10-fold cross validation
    percent_p = 3  # number of folds that go into P
    texts_1 = np.array_split(texts_1, 10)  # split the positive samples into ten folds
    texts_0 = np.array_split(texts_0, 10)  # split the negative samples into ten folds
    test_x = list(texts_1[k]) + list(texts_0[k])  # test x: a different fold each iteration
    test_y = list(len(texts_1[k]) * [1]) + list(len(texts_0[k]) * [0])  # test y: positives 1, negatives 0
    index_rest = sorted(set(range(10)) - set([k]))  # fold indices left after removing the test fold
    index_p = random.sample(index_rest, percent_p)  # randomly pick percent_p folds for P
    texts_1 = np.array(texts_1)  # as an array, so it supports multi-index selection
    p = texts_1[index_p]  # the P set
    index_except_p = sorted(set(range(10)) - set([k]) - set(index_p))  # indices left after removing the test fold and P
    texts_0 = np.array(texts_0)  # as an array, so it supports multi-index selection
    u = list(texts_1[index_except_p]) + list(texts_0[index_rest])  # the U set
    return p, u, test_x, test_y  # added: the original snippet built these sets but never returned them
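A hypothetical driver for the splitter above, assuming the return statement added at the end of SplitData. Note that p and u come back as collections of fold-blocks rather than flat arrays, so sample counts are sums over blocks:

# Usage sketch (names and loop are ours): cycle the test fold through all ten positions.
for fold in range(10):
    p, u, test_x, test_y = SplitData(fold, texts_1, texts_0)
    print("fold", fold,
          "|P| =", sum(len(block) for block in p),
          "|U| =", sum(len(block) for block in u),
          "|test| =", len(test_x))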
import numpy as np
import pandas as pd
import six
import chainer
from chainer import Variable, cuda, optimizers

from Conversion import CSV
# Repo-local modules; the exact import paths are assumed from this repo's layout.
from model import MultiLayerPerceptron
from train import train, train_pu


def experiment():
    percent_p = 3
    path = r'C:\Users\yyveggie\Desktop\UCI\Conversion\mushroom.csv'
    ite = 10
    epoch = 100
    batchsize = 1000
    seed = 2020
    gpu = True
    acc_nnpu = []
    f1_nnpu = []
    acc_nnpusb = []
    f1_nnpusb = []
    for k in range(ite):
        np.random.seed(seed)

        # PN classification
        texts_1, texts_0 = CSV(path)
        texts_1 = np.array_split(texts_1, 10)  # split the positive samples into ten folds
        texts_0 = np.array_split(texts_0, 10)  # split the negative samples into ten folds
        x_test = np.array(list(texts_1[k]) + list(texts_0[k]))  # test x: a different fold each iteration
        t_test = np.array(list(len(texts_1[k]) * [1]) + list(len(texts_0[k]) * [0]))  # test y: positives 1, negatives 0
        index_rest = sorted(set(range(10)) - set([k]))
        texts_1 = np.array(texts_1)
        texts_0 = np.array(texts_0)
        texts_1 = np.array([j for i in texts_1[index_rest] for j in i])
        texts_0 = np.array([j for i in texts_0[index_rest] for j in i])
        x = np.vstack((texts_1, texts_0))
        t = pd.Series([1] * len(texts_1) + [0] * len(texts_0))
        dim = x.shape[1]
        print(x.shape)
        x_train = x
        t_train = pd.Series([1] * len(texts_1) + [0] * len(texts_0))
        pi = np.mean(t_train)

        model = MultiLayerPerceptron(dim)
        optimizer = optimizers.Adam(1e-5)
        optimizer.setup(model)
        if gpu:
            gpu_device = 0
            cuda.get_device(gpu_device).use()
            model.to_gpu(gpu_device)
            xp = cuda.cupy
        else:
            xp = np
        model, optimizer = train(x, t, epoch, model, optimizer, batchsize, xp)

        # Score the positives with the PN model and sample a biased labeled set:
        # higher-scoring positives are more likely to be labeled.
        x_p = x_train[t_train == 1]
        xp_prob = np.array([])
        for j in six.moves.range(0, len(x_p), batchsize):
            X = Variable(xp.array(x_p[j:j + batchsize], xp.float32))
            g = chainer.cuda.to_cpu(model(X).data).T[0]
            xp_prob = np.append(xp_prob, 1 / (1 + np.exp(-g)), axis=0)
        xp_prob /= np.mean(xp_prob)
        xp_prob /= np.max(xp_prob)
        print(xp_prob)
        rand = np.random.uniform(size=len(x_p))
        x_p = x_p[xp_prob > rand]
        perm = np.random.permutation(len(x_p))
        pdata = int(percent_p / 10 * len(x))  # number of P samples: 3/10 of the total
        x_p = x_p[perm[:pdata]]
        tp = np.ones(len(x_p))
        tu = np.zeros(len(x_train))
        t_train = np.concatenate([tp, tu], axis=0)
        x_train = np.concatenate([x_p, x_train], axis=0)
        print(x_train.shape)
        print(t_train.shape)
        print(x_test.shape)
        print(t_test.shape)

        model = MultiLayerPerceptron(dim)
        optimizer = optimizers.Adam(alpha=1e-5)
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.WeightDecay(0.005))
        if gpu:
            gpu_device = 0
            cuda.get_device(gpu_device).use()
            model.to_gpu(gpu_device)
            xp = cuda.cupy
        else:
            xp = np
        model, optimizer, acc1, acc2, f1_binary1, f1_binary2 = train_pu(
            x_train, t_train, x_test, t_test, pi, epoch, model, optimizer, batchsize, xp)

        acc_nnpu.append(acc1)
        f1_nnpu.append(f1_binary1)
        acc_nnpusb.append(acc2)
        f1_nnpusb.append(f1_binary2)
        seed += 1
        print("Iter:", k)
        print("acc_nnpu:", acc1)
        print("acc_nnpusb:", acc2)
        print("f1_nnpu:", f1_binary1)
        print("f1_nnpusb:", f1_binary2)
    print("acc_nnpu_mean:", np.mean(acc_nnpu))
    print("f1_nnpu_mean:", np.mean(f1_nnpu))
    print("acc_nnpusb_mean:", np.mean(acc_nnpusb))
    print("f1_nnpusb_mean:", np.mean(f1_nnpusb))
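The script above assumes a MultiLayerPerceptron(dim) Chainer model whose forward pass returns one real-valued score g(x) per row; the sampling step applies the sigmoid to it by hand. A minimal sketch of such a model under that assumption (the hidden width and depth are ours; the actual architecture in this codebase may differ):

import chainer
import chainer.functions as F
import chainer.links as L

class MultiLayerPerceptron(chainer.Chain):
    # Minimal sketch: dim -> 100 -> 1 with a ReLU hidden layer.
    def __init__(self, dim):
        super(MultiLayerPerceptron, self).__init__()
        with self.init_scope():
            self.l1 = L.Linear(dim, 100)
            self.l2 = L.Linear(100, 1)

    def __call__(self, x):
        h = F.relu(self.l1(x))
        return self.l2(h)  # raw score g(x); callers apply the sigmoid themselves

Returning the raw score rather than a probability matches how the experiment consumes the model: the PU losses in train_pu operate on g(x) directly, and the labeling-bias step computes 1 / (1 + exp(-g)) itself.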