示例#1
0
        distanceArray[i][j] = np.sqrt(
            np.sum(np.square(originDataX[:, i] - originDataX[:, j])))
k = 10
minSize = 0
# Count the minority (defective, label == 1) samples.
# NOTE(review): minSize is never read after this loop in the visible code.
for i in range(size):
    if originDataY[i] == 1:
        minSize = minSize + 1
kLink = [0 for i in range(size)]  # border weight accumulated per sample index
# For each defective module: its distances to every majority point, ascending.
minknear = []
count = 0  # number of minority-class samples
for i in range(size):
    if originDataY[i] == 1:  # minority (defective) sample
        minknear.append([])
        for j in range(size):
            if originDataY[j] == 0:
                minknear[count].append(Relation(i, j, distanceArray[i][j]))
        minknear[count] = sorted(minknear[count],
                                 key=lambda distance: distance.get_distance())
        count = count + 1
for knear in minknear:
    # Take the k nearest majority-class neighbours of this minority sample.
    for index in range(k):
        maxknear = []
        # Distances from every minority sample to that majority neighbour.
        for i in range(size):
            if originDataY[i] == 1:
                maxknear.append(
                    Relation(i, knear[index].get_majorNumber(),
                             distanceArray[i][knear[index].get_majorNumber()]))
        maxknear = sorted(maxknear,
                          key=lambda distance: distance.get_distance())
示例#2
0
def main():
    """Train AdaBoost on the CM1 defect dataset after oversampling borderline
    minority (defective) samples, then report test-set metrics.

    Pipeline: split train/test -> compute pairwise distances -> weight each
    minority sample by mutual k-NN "border" hits against the majority class ->
    synthesize new minority samples in proportion to that weight -> train and
    evaluate.

    NOTE(review): relies on module-level names imported elsewhere in this file
    (np, math, random, train_test_split, Relation, AdaBoost, and the sklearn
    metric functions).
    """
    # Load data: last column is the label, the rest are features.
    dataset = np.loadtxt('I:\\tools\\SoftwarePrediction\\dataset\\cm1.txt', delimiter=",")
    length = len(dataset[0])
    x = dataset[:, 0:length - 1]
    y = dataset[:, length - 1]

    # Prepare train data.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4)

    # Samples become columns; remap the majority label 0 -> -1 for AdaBoost.
    x_train = x_train.transpose()
    y_train[y_train == 0] = -1

    x_test = x_test.transpose()
    y_test[y_test == 0] = -1

    originDataX = x_train
    originDataY = y_train
    size = originDataX.shape[1]

    # Pairwise Euclidean distances between all training samples.
    distanceArray = np.zeros((size, size), dtype=float)
    for i in range(size):
        for j in range(size):
            distanceArray[i][j] = np.sqrt(np.sum(np.square(originDataX[:, i] - originDataX[:, j])))

    k = 10
    kLink = [0 for i in range(size)]  # border weight accumulated per sample index
    # For each minority sample: distances to all majority samples, ascending.
    minknear = []
    count = 0  # number of minority-class samples
    for i in range(size):
        if originDataY[i] == 1:  # minority (defective) sample
            minknear.append([])
            for j in range(size):
                if originDataY[j] == -1:
                    minknear[count].append(Relation(i, j, distanceArray[i][j]))
            minknear[count] = sorted(minknear[count], key=lambda distance: distance.get_distance())
            count = count + 1
    for knear in minknear:
        # Examine the k nearest majority neighbours of this minority sample
        # (clamped so we never index past the available neighbours).
        for index in range(min(k, len(knear))):
            maxknear = []
            # Distances from every minority sample to that majority neighbour.
            for i in range(size):
                if originDataY[i] == 1:
                    maxknear.append(Relation(i, knear[index].get_majorNumber(),
                                             distanceArray[i][knear[index].get_majorNumber()]))
            maxknear = sorted(maxknear, key=lambda distance: distance.get_distance())
            # Bump this sample's border weight for each mutual-neighbour hit.
            for i in range(min(k, len(maxknear))):
                if maxknear[i].get_minNumber() == knear[index].get_minNumber():
                    kLink[knear[index].get_minNumber()] = kLink[knear[index].get_minNumber()] + 1

    # Adjust each border weight by the fraction of minority samples among the
    # k2 nearest neighbours of that minority sample.
    k2 = 10
    for knear in minknear:
        minNumber = knear[0].get_minNumber()
        minAllNear = []
        for i in range(size):
            minAllNear.append(Relation(minNumber, i, distanceArray[minNumber][i]))
        minAllNear = sorted(minAllNear, key=lambda distance: distance.get_distance())
        minCount = 0
        for i in range(k2):
            if originDataY[minAllNear[i].get_majorNumber()] == 1:
                minCount = minCount + 1
        kLink[minNumber] = kLink[minNumber] * (minCount / k2)

    # Normalise the weights so they sum to 1; if every weight is zero, skip
    # (previously this raised ZeroDivisionError) and no samples are generated.
    total = 0
    for i in range(size):
        total += kLink[i]
    if total != 0:
        for i in range(size):
            kLink[i] = kLink[i] / total

    # N: total number of synthetic samples to generate.
    N = 2 * count
    T = []
    for i in range(size):
        T.append(math.ceil(N * kLink[i]))

    sampleDataCount = 0
    # SMOTE-style synthesis: combine each weighted minority sample with its
    # nearest minority neighbours.
    for i in range(size):
        if T[i] != 0:
            orinMin = originDataX[:, i]
            minNearArray = []
            for j in range(size):
                if j != i and originDataY[j] == 1:
                    minNearArray.append(Relation(i, j, distanceArray[i][j]))
            minNearArray = sorted(minNearArray, key=lambda distance: distance.get_distance())
            # Clamp so we never index past the available neighbours.
            for j in range(min(T[i], len(minNearArray))):
                nearData = originDataX[:, minNearArray[j].get_majorNumber()]
                # NOTE(review): this extrapolates away from the neighbour;
                # canonical SMOTE interpolates towards it — confirm intent.
                sampleData = (orinMin - nearData) * random.random() + orinMin
                sampleData = sampleData.reshape(1, originDataX.shape[0])
                originDataX = np.concatenate((originDataX, sampleData.T), axis=1)
                originDataY = np.concatenate((originDataY, np.array([1])), axis=0)
                sampleDataCount = sampleDataCount + 1

    # Train the ensemble on the augmented data.
    ada = AdaBoost(originDataX, originDataY)
    ada.train(50)

    # Predict and report metrics on the held-out test set.
    y_pred = ada.pred(x_test)
    print("total test", len(y_pred))
    print("true pred", len(y_pred[y_pred == y_test]))
    print("acc", accuracy_score(y_test, y_pred))
    print("precision", precision_score(y_test, y_pred))
    print("recall", recall_score(y_test, y_pred))
    print("f1", f1_score(y_test, y_pred))
    # Indices of correctly detected defective modules.
    for i in range(len(y_pred)):
        if y_test[i] == 1 and y_pred[i] == 1:
            print(i)
示例#3
0
def generateSMOTE():
    """Generate synthetic minority samples with plain SMOTE-style weights
    (uniform random, not border-based) and write the augmented dataset to
    ``test.txt`` as ``x1,x2,label`` rows.

    NOTE(review): the writer emits exactly three integer fields per row, so
    this assumes two feature columns plus a label column — confirm against
    the input file.
    """
    # Load data: last column is the label, the rest are features.
    dataset = np.loadtxt(
        'I:\\tools\\SoftwarePrediction\\dataset\\oringData.txt', delimiter=",")
    length = len(dataset[0])
    x = dataset[:, 0:length - 1]
    y = dataset[:, length - 1]

    # Samples become columns.
    originDataX = x.transpose()
    originDataY = y
    size = originDataX.shape[1]

    # Pairwise Euclidean distances between all samples.
    distanceArray = np.zeros((size, size), dtype=float)
    for i in range(size):
        for j in range(size):
            distanceArray[i][j] = np.sqrt(
                np.sum(np.square(originDataX[:, i] - originDataX[:, j])))

    count = 0  # number of minority-class samples
    kLink = [0 for i in range(size)]
    for i in range(size):
        if originDataY[i] == 1:  # minority sample: give it a random weight
            count = count + 1
            kLink[i] = random.random()

    # Normalise the weights so they sum to 1; skip when there is no minority
    # class at all (previously this raised ZeroDivisionError).
    total = 0
    for i in range(size):
        total += kLink[i]
    if total != 0:
        for i in range(size):
            kLink[i] = kLink[i] / total

    # N: total number of synthetic samples to generate.
    N = 2 * count
    T = []
    for i in range(size):
        T.append(math.ceil(N * kLink[i]))

    sampleDataCount = 0
    # SMOTE-style synthesis: combine each weighted minority sample with its
    # nearest minority neighbours.
    for i in range(size):
        if T[i] != 0:
            orinMin = originDataX[:, i]
            minNearArray = []
            for j in range(size):
                if j != i and originDataY[j] == 1:
                    minNearArray.append(Relation(i, j, distanceArray[i][j]))
            minNearArray = sorted(minNearArray,
                                  key=lambda distance: distance.get_distance())
            # Clamp so we never index past the available neighbours.
            for j in range(min(T[i], len(minNearArray))):
                nearData = originDataX[:, minNearArray[j].get_majorNumber()]
                # NOTE(review): extrapolates away from the neighbour;
                # canonical SMOTE interpolates towards it — confirm intent.
                sampleData = (orinMin - nearData) * random.random() + orinMin
                sampleData = sampleData.reshape(1, originDataX.shape[0])
                originDataX = np.concatenate((originDataX, sampleData.T),
                                             axis=1)
                originDataY = np.concatenate((originDataY, np.array([1])),
                                             axis=0)
                sampleDataCount = sampleDataCount + 1

    # Append the labels as a final row, then transpose to sample-per-row.
    originDataX = np.concatenate(
        (originDataX, originDataY.reshape(1, originDataY.shape[0])), axis=0)
    originDataX = originDataX.T

    # 'with' guarantees the file is closed even on error (the original
    # leaked the handle).
    with open("test.txt", 'w') as doc:
        for row in originDataX:
            doc.write(
                str(int(row[0])) + "," + str(int(row[1])) + "," +
                str(int(row[2])) + "\n")
示例#4
0
    def train(self, M=5):
        """Train M weak classifiers AdaBoost-style, regenerating synthetic
        minority samples (SMOTE-style, weighted by border k-NN hits) for
        every round and re-weighting samples between rounds.

        Stops early when a round's ensemble misclassifies nothing.
        NOTE(review): mutates self.G, self.alpha, self.W, self.sums, self.Q
        and caches self.distanceArray.
        """
        self.G = {}  # weak classifiers, keyed by round index
        self.alpha = {}  # vote coefficient of each weak classifier
        for i in range(M):
            self.G.setdefault(i)
            self.alpha.setdefault(i)

        originDataX = self.X
        originDataY = self.y
        size = originDataX.shape[1]
        if self.distanceArray is None:
            # Pairwise Euclidean distances, computed once and cached; only
            # the upper triangle is computed, then mirrored (symmetric).
            self.distanceArray = np.zeros((size, size), dtype=float)
            for i in range(size):
                for j in range(size):
                    if j > i:
                        self.distanceArray[i][j] = np.sqrt(
                            np.sum(
                                np.square(originDataX[:, i] -
                                          originDataX[:, j])))
                        self.distanceArray[j][i] = self.distanceArray[i][j]
        k = 10
        minSize = 0
        # NOTE(review): minSize is computed but never used afterwards.
        for i in range(size):
            if originDataY[i] == 1:
                minSize = minSize + 1
        kLink = [0 for i in range(size)]  # border weight per sample index
        # For each defective module: distances to all majority points, ascending.
        minknear = []
        count = 0  # number of minority-class samples
        for i in range(size):
            if originDataY[i] == 1:  # minority (defective) sample
                minknear.append([])
                for j in range(size):
                    if originDataY[j] == -1:
                        minknear[count].append(
                            Relation(i, j, self.distanceArray[i][j]))
                minknear[count] = sorted(
                    minknear[count],
                    key=lambda distance: distance.get_distance())
                count = count + 1
        for knear in minknear:
            # Take the k nearest majority-class neighbours of this sample.
            for index in range(k):
                maxknear = []
                # Distances from every minority sample to that neighbour.
                for i in range(size):
                    if originDataY[i] == 1:
                        maxknear.append(
                            Relation(
                                i, knear[index].get_majorNumber(),
                                self.distanceArray[i][
                                    knear[index].get_majorNumber()]))
                maxknear = sorted(maxknear,
                                  key=lambda distance: distance.get_distance())
                # Bump this sample's border weight per mutual-neighbour hit.
                for i in range(k):
                    if maxknear[i].get_minNumber(
                    ) == knear[index].get_minNumber():
                        kLink[knear[index].get_minNumber(
                        )] = kLink[knear[index].get_minNumber()] + 1

        # Normalise the border weights so they sum to 1.
        total = 0
        for i in range(size):
            total += kLink[i]
        for i in range(size):
            kLink[i] = kLink[i] / total

        # for i in range(size):
        #     print(kLink[i])

        # N: total number of synthetic samples to generate per round.
        N = count
        # N=0
        T = []
        for i in range(size):
            T.append(math.ceil(N * kLink[i]))

        for iClassfiler in range(M):  # self.G[i] is the i-th weak classifier
            # for i in range(size):
            #     print(T[i])
            originDataX = self.X
            originDataY = self.y
            sampleDataCount = 0
            # SMOTE-style synthesis of new minority samples for this round.
            for i in range(size):
                if T[i] != 0:
                    orinMin = originDataX[:, i]
                    minNearArray = []
                    for j in range(size):
                        if j != i and originDataY[j] == 1:
                            minNearArray.append(
                                Relation(i, j, self.distanceArray[i][j]))
                    minNearArray = sorted(
                        minNearArray,
                        key=lambda distance: distance.get_distance())
                    for j in range(T[i]):
                        nearData = originDataX[:,
                                               minNearArray[j].get_majorNumber(
                                               )]
                        sampleData = (orinMin -
                                      nearData) * random.random() + orinMin
                        sampleData = sampleData.reshape(
                            1, originDataX.shape[0])
                        originDataX = np.concatenate(
                            (originDataX, sampleData.T), axis=1)
                        originDataY = np.concatenate(
                            (originDataY, np.array([1])), axis=0)
                        sampleDataCount = sampleDataCount + 1
                        # print(sampleData)

            originTotalWeight = 0
            if self.W is None:
                # First round: uniform weights over all (augmented) samples.
                # NOTE(review): flatten(1) relies on old NumPy accepting an
                # integer order argument — confirm against the NumPy version.
                self.W = np.ones((originDataX.shape[1],
                                  1)).flatten(1) / originDataX.shape[1]
                self.sums = np.zeros(originDataY.shape)
            else:
                # Later rounds: keep weights of the original samples and
                # split the remaining mass over this round's new samples.
                for i in range(size):
                    originTotalWeight = originTotalWeight + self.W[i]
                # self.W=np.delete(self.W,[size:],axis=0)
                self.W = self.W[:size]
                for i in range(sampleDataCount):
                    self.W = np.concatenate(
                        (self.W,
                         np.array([(1 - originTotalWeight) /
                                   (0.94 * sampleDataCount)])),
                        axis=0)
                # print("")
                # for i in range(sampleDataCount):
                #     self.W[i+size]=sampleDataCount/sampleDataCount

            self.G[iClassfiler] = self.Weaker(originDataX, originDataY)
            e = self.G[iClassfiler].train(self.W)  # train under current weights

            self.alpha[iClassfiler] = 1.0 / 2 * np.log(
                (1 - e) / e)  # coefficient of this classifier
            res = self.G[iClassfiler].pred(originDataX)  # its predictions

            # Report this round's training accuracy.
            print("weak classfier acc", accuracy_score(originDataY, res),
                  "\n======================================================")

            # Z is the normalisation factor for the weight update.
            Z = self.W * np.exp(
                -self.alpha[iClassfiler] * originDataY * res.transpose())
            self.W = (Z / Z.sum()).flatten(1)  # update sample weights
            self.Q = iClassfiler
            # errorcnt returns the number of misclassified points; 0 = perfect.
            if (self.errorcnt(iClassfiler, originDataX, originDataY) == 0):
                print("%d个弱分类器可以将错误率降到0" % (iClassfiler + 1))
                break
示例#5
0
def generateBorder():
    """Weight each minority sample by how "borderline" it is (mutual
    k-nearest-neighbour hits against the majority class), oversample
    SMOTE-style in proportion to that weight, and write the augmented
    dataset to ``test.txt`` as ``x1,x2,label`` rows.

    NOTE(review): the writer emits exactly three integer fields per row, so
    this assumes two feature columns plus a label column. Labels are kept as
    0/1 here (unlike main(), which remaps 0 -> -1).
    """
    # Load data: last column is the label, the rest are features.
    dataset = np.loadtxt(
        'I:\\tools\\SoftwarePrediction\\dataset\\oringData.txt', delimiter=",")
    length = len(dataset[0])
    x = dataset[:, 0:length - 1]
    y = dataset[:, length - 1]

    # The whole dataset is used for generation (the original called
    # train_test_split and immediately discarded the result); samples
    # become columns.
    originDataX = x.transpose()
    originDataY = y
    size = originDataX.shape[1]

    # Pairwise Euclidean distances between all samples.
    distanceArray = np.zeros((size, size), dtype=float)
    for i in range(size):
        for j in range(size):
            distanceArray[i][j] = np.sqrt(
                np.sum(np.square(originDataX[:, i] - originDataX[:, j])))

    k = 20
    kLink = [0 for i in range(size)]  # border weight per sample index
    # For each defective module: distances to all majority points, ascending.
    minknear = []
    count = 0  # number of minority-class samples
    for i in range(size):
        if originDataY[i] == 1:  # minority (defective) sample
            minknear.append([])
            for j in range(size):
                if originDataY[j] == 0:
                    minknear[count].append(Relation(i, j, distanceArray[i][j]))
            minknear[count] = sorted(
                minknear[count], key=lambda distance: distance.get_distance())
            count = count + 1
    for knear in minknear:
        # Examine the k nearest majority neighbours of this minority sample
        # (clamped so we never index past the available neighbours).
        for index in range(min(k, len(knear))):
            maxknear = []
            # Distances from every minority sample to that majority neighbour.
            for i in range(size):
                if originDataY[i] == 1:
                    maxknear.append(
                        Relation(
                            i, knear[index].get_majorNumber(),
                            distanceArray[i][knear[index].get_majorNumber()]))
            maxknear = sorted(maxknear,
                              key=lambda distance: distance.get_distance())
            # Bump this sample's border weight per mutual-neighbour hit.
            for i in range(min(k, len(maxknear))):
                if maxknear[i].get_minNumber() == knear[index].get_minNumber():
                    kLink[knear[index].get_minNumber(
                    )] = kLink[knear[index].get_minNumber()] + 1

    # Normalise the weights so they sum to 1; skip when every weight is zero
    # (previously this raised ZeroDivisionError).
    total = 0
    for i in range(size):
        total += kLink[i]
    if total != 0:
        for i in range(size):
            kLink[i] = kLink[i] / total

    # N: total number of synthetic samples to generate.
    N = 2 * count
    T = []
    for i in range(size):
        T.append(math.ceil(N * kLink[i]))

    sampleDataCount = 0
    # SMOTE-style synthesis: combine each weighted minority sample with its
    # nearest minority neighbours.
    for i in range(size):
        if T[i] != 0:
            orinMin = originDataX[:, i]
            minNearArray = []
            for j in range(size):
                if j != i and originDataY[j] == 1:
                    minNearArray.append(Relation(i, j, distanceArray[i][j]))
            minNearArray = sorted(minNearArray,
                                  key=lambda distance: distance.get_distance())
            # Clamp so we never index past the available neighbours.
            for j in range(min(T[i], len(minNearArray))):
                nearData = originDataX[:, minNearArray[j].get_majorNumber()]
                # NOTE(review): extrapolates away from the neighbour;
                # canonical SMOTE interpolates towards it — confirm intent.
                sampleData = (orinMin - nearData) * random.random() + orinMin
                sampleData = sampleData.reshape(1, originDataX.shape[0])
                originDataX = np.concatenate((originDataX, sampleData.T),
                                             axis=1)
                originDataY = np.concatenate((originDataY, np.array([1])),
                                             axis=0)
                sampleDataCount = sampleDataCount + 1

    # Append the labels as a final row, then transpose to sample-per-row.
    originDataX = np.concatenate(
        (originDataX, originDataY.reshape(1, originDataY.shape[0])), axis=0)
    originDataX = originDataX.T

    # 'with' guarantees the file is closed even on error (the original
    # leaked the handle).
    with open("test.txt", 'w') as doc:
        for row in originDataX:
            doc.write(
                str(int(row[0])) + "," + str(int(row[1])) + "," +
                str(int(row[2])) + "\n")