distanceArray[i][j] = np.sqrt( np.sum(np.square(originDataX[:, i] - originDataX[:, j]))) k = 10 minSize = 0 for i in range(size): if originDataY[i] == 1: minSize = minSize + 1 kLink = [0 for i in range(size)] minknear = [] # 存储着所有有缺陷的模块到其他异类点之间的距离从小到大进行排序 count = 0 # 少数类个数 for i in range(size): if originDataY[i] == 1: # 找出少数类 minknear.append([]) for j in range(size): if originDataY[j] == 0: minknear[count].append(Relation(i, j, distanceArray[i][j])) minknear[count] = sorted(minknear[count], key=lambda distance: distance.get_distance()) count = count + 1 for knear in minknear: # 找出其最接近的k个异类元素 for index in range(k): maxknear = [] # 计算异类元素相邻的少数类元素 for i in range(size): if originDataY[i] == 1: maxknear.append( Relation(i, knear[index].get_majorNumber(), distanceArray[i][knear[index].get_majorNumber()])) maxknear = sorted(maxknear, key=lambda distance: distance.get_distance())
def main(data_path='I:\\tools\\SoftwarePrediction\\dataset\\cm1.txt', k=10, k2=10):
    """Train a border-weighted-SMOTE + AdaBoost defect predictor and print metrics.

    Loads a comma-separated dataset whose last column is the class label
    (1 = defective / minority, 0 = clean / majority), oversamples the
    minority class with a border-weighted SMOTE variant, trains AdaBoost on
    the augmented training set and reports accuracy / precision / recall /
    F1 on a 40% held-out split.

    Parameters
    ----------
    data_path : str
        Dataset file path (was hard-coded; the default keeps the old path).
    k : int
        Number of nearest opposite-class neighbours used to score border points.
    k2 : int
        Neighbourhood size used to damp each border weight by its local
        minority ratio.
    """
    # --- load data ---
    dataset = np.loadtxt(data_path, delimiter=",")
    length = len(dataset[0])
    x = dataset[:, 0:length - 1]
    y = dataset[:, length - 1]
    # --- split, reshape to (features, samples), map labels 0 -> -1 ---
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4)
    x_train = x_train.transpose()
    y_train[y_train == 0] = -1
    x_test = x_test.transpose()
    y_test[y_test == 0] = -1
    originDataX = x_train
    originDataY = y_train
    size = originDataX.shape[1]
    # Pairwise Euclidean distances between all training samples.
    distanceArray = np.zeros((size, size), dtype=float)
    for i in range(size):
        for j in range(size):
            distanceArray[i][j] = np.sqrt(
                np.sum(np.square(originDataX[:, i] - originDataX[:, j])))
    # kLink[i]: border weight of sample i (non-zero only for minority samples).
    kLink = [0] * size
    # minknear[c]: distances from the c-th minority sample to every majority
    # sample, sorted ascending.
    minknear = []
    count = 0  # number of minority samples
    for i in range(size):
        if originDataY[i] == 1:  # minority sample
            minknear.append([])
            for j in range(size):
                if originDataY[j] == -1:
                    minknear[count].append(Relation(i, j, distanceArray[i][j]))
            minknear[count] = sorted(
                minknear[count], key=lambda relation: relation.get_distance())
            count = count + 1
    # Border scoring: for each minority sample, look at its k nearest majority
    # neighbours; whenever the minority sample is itself among the k nearest
    # minority neighbours of such a majority point, bump its border weight.
    # NOTE(review): assumes at least k majority and k minority samples exist,
    # otherwise the indexing below raises IndexError — confirm for small sets.
    for knear in minknear:
        for index in range(k):
            major = knear[index].get_majorNumber()
            # Minority neighbours of this majority element.
            maxknear = []
            for i in range(size):
                if originDataY[i] == 1:
                    maxknear.append(Relation(i, major, distanceArray[i][major]))
            maxknear = sorted(maxknear,
                              key=lambda relation: relation.get_distance())
            for i in range(k):
                if maxknear[i].get_minNumber() == knear[index].get_minNumber():
                    kLink[knear[index].get_minNumber()] += 1
    # Damp each border weight by the minority ratio among the sample's k2
    # nearest neighbours (of any class).
    for knear in minknear:
        minNumber = knear[0].get_minNumber()
        minAllNear = []
        for i in range(size):
            minAllNear.append(Relation(minNumber, i, distanceArray[minNumber][i]))
        minAllNear = sorted(minAllNear,
                            key=lambda relation: relation.get_distance())
        minCount = 0
        for i in range(k2):
            if originDataY[minAllNear[i].get_majorNumber()] == 1:
                minCount += 1
        kLink[minNumber] = kLink[minNumber] * (minCount / k2)
    # Normalize the weights so they sum to 1.
    total = sum(kLink)
    for i in range(size):
        kLink[i] = kLink[i] / total
    # N: total number of synthetic samples to generate; T[i]: per-sample quota.
    N = 2 * count
    T = [math.ceil(N * kLink[i]) for i in range(size)]
    sampleDataCount = 0
    # SMOTE: synthesize minority samples around each weighted minority point.
    for i in range(size):
        if T[i] != 0:
            orinMin = originDataX[:, i]
            minNearArray = []
            for j in range(size):
                if j != i and originDataY[j] == 1:
                    minNearArray.append(Relation(i, j, distanceArray[i][j]))
            minNearArray = sorted(minNearArray,
                                  key=lambda relation: relation.get_distance())
            # NOTE(review): assumes T[i] <= number of other minority samples.
            for j in range(T[i]):
                nearData = originDataX[:, minNearArray[j].get_majorNumber()]
                # NOTE(review): classic SMOTE interpolates toward the neighbour
                # (nearData - orinMin); this extrapolates away from it — confirm
                # the sign is intended.
                sampleData = (orinMin - nearData) * random.random() + orinMin
                sampleData = sampleData.reshape(1, originDataX.shape[0])
                originDataX = np.concatenate((originDataX, sampleData.T), axis=1)
                originDataY = np.concatenate((originDataY, np.array([1])), axis=0)
                sampleDataCount += 1
    # --- train ---
    ada = AdaBoost(originDataX, originDataY)
    ada.train(50)
    # --- predict and report ---
    y_pred = ada.pred(x_test)
    print("total test", len(y_pred))
    print("true pred", len(y_pred[y_pred == y_test]))
    print("acc", accuracy_score(y_test, y_pred))
    print("precision", precision_score(y_test, y_pred))
    print("recall", recall_score(y_test, y_pred))
    print("f1", f1_score(y_test, y_pred))
    # Indices of true positives (defective modules correctly flagged).
    for i in range(len(y_pred)):
        if y_test[i] == 1 and y_pred[i] == 1:
            print(i)
def generateSMOTE(data_path='I:\\tools\\SoftwarePrediction\\dataset\\oringData.txt',
                  out_path="test.txt"):
    """Oversample the minority class with randomly-weighted SMOTE and dump to disk.

    Unlike the border-weighted variants, each minority sample's generation
    quota is driven by a uniform random weight.  The augmented dataset
    (feature rows plus a label row, transposed back to one sample per line)
    is written to *out_path* as comma-separated integers.

    Parameters
    ----------
    data_path : str
        Input dataset path (default keeps the previously hard-coded path).
    out_path : str
        Output file path (default keeps the previously hard-coded name).
    """
    # --- load data; features become (features, samples) ---
    dataset = np.loadtxt(data_path, delimiter=",")
    length = len(dataset[0])
    x = dataset[:, 0:length - 1]
    y = dataset[:, length - 1]
    originDataX = x.transpose()
    originDataY = y
    size = originDataX.shape[1]
    # Pairwise Euclidean distances between all samples.
    distanceArray = np.zeros((size, size), dtype=float)
    for i in range(size):
        for j in range(size):
            distanceArray[i][j] = np.sqrt(
                np.sum(np.square(originDataX[:, i] - originDataX[:, j])))
    count = 0  # number of minority samples
    # kLink[i]: random generation weight for minority sample i, 0 otherwise.
    kLink = [0] * size
    for i in range(size):
        if originDataY[i] == 1:  # minority sample
            count = count + 1
            kLink[i] = random.random()
    # Normalize the weights so they sum to 1.
    total = sum(kLink)
    for i in range(size):
        kLink[i] = kLink[i] / total
    # N: total number of synthetic samples to generate; T[i]: per-sample quota.
    N = 2 * count
    T = [math.ceil(N * kLink[i]) for i in range(size)]
    sampleDataCount = 0
    # SMOTE: synthesize minority samples around each weighted minority point.
    for i in range(size):
        if T[i] != 0:
            orinMin = originDataX[:, i]
            minNearArray = []
            for j in range(size):
                if j != i and originDataY[j] == 1:
                    minNearArray.append(Relation(i, j, distanceArray[i][j]))
            minNearArray = sorted(minNearArray,
                                  key=lambda relation: relation.get_distance())
            # NOTE(review): assumes T[i] <= number of other minority samples.
            for j in range(T[i]):
                nearData = originDataX[:, minNearArray[j].get_majorNumber()]
                sampleData = (orinMin - nearData) * random.random() + orinMin
                sampleData = sampleData.reshape(1, originDataX.shape[0])
                originDataX = np.concatenate((originDataX, sampleData.T), axis=1)
                originDataY = np.concatenate((originDataY, np.array([1])), axis=0)
                sampleDataCount += 1
    # Append the label row, then transpose back to (samples, features + 1).
    originDataX = np.concatenate(
        (originDataX, originDataY.reshape(1, originDataY.shape[0])), axis=0)
    originDataX = originDataX.T
    # BUGFIX: the file handle was never closed (leak / possibly unflushed
    # buffer); a context manager guarantees close even on error.
    # NOTE(review): only the first three columns are written — assumes the
    # dataset has exactly two features plus the label; confirm.
    with open(out_path, 'w') as doc:
        for row in originDataX:
            doc.write(
                str(int(row[0])) + "," + str(int(row[1])) + "," +
                str(int(row[2])) + "\n")
def train(self, M=5):
    """Train up to M AdaBoost rounds, re-oversampling the minority class each round.

    Expects self.X of shape (features, samples) and self.y with labels
    +1 (minority / defective) and -1 (majority).  Fills self.G (weak
    classifiers), self.alpha (their coefficients), self.W (sample weights),
    self.distanceArray (cached pairwise distances) and self.Q (index of the
    last trained round).  Stops early when a round classifies every sample
    correctly.

    Parameters
    ----------
    M : int
        Maximum number of weak classifiers to train (default 5).
    """
    self.G = {}      # round index -> weak classifier
    self.alpha = {}  # round index -> classifier coefficient
    for i in range(M):
        self.G.setdefault(i)
        self.alpha.setdefault(i)
    originDataX = self.X
    originDataY = self.y
    size = originDataX.shape[1]
    # Lazily build the symmetric pairwise Euclidean distance matrix
    # (computed once, reused across calls).
    if self.distanceArray is None:
        self.distanceArray = np.zeros((size, size), dtype=float)
        for i in range(size):
            for j in range(size):
                if j > i:
                    self.distanceArray[i][j] = np.sqrt(
                        np.sum(
                            np.square(originDataX[:, i] - originDataX[:, j])))
                    self.distanceArray[j][i] = self.distanceArray[i][j]
    k = 10
    # kLink[i]: border weight of sample i (non-zero only for minority samples).
    kLink = [0] * size
    # minknear[c]: distances from the c-th minority sample to every majority
    # sample, sorted ascending.
    minknear = []
    count = 0  # number of minority samples
    for i in range(size):
        if originDataY[i] == 1:  # minority sample
            minknear.append([])
            for j in range(size):
                if originDataY[j] == -1:
                    minknear[count].append(
                        Relation(i, j, self.distanceArray[i][j]))
            minknear[count] = sorted(
                minknear[count], key=lambda relation: relation.get_distance())
            count = count + 1
    # Border scoring: for each minority sample, look at its k nearest majority
    # neighbours; whenever the minority sample is itself among the k nearest
    # minority neighbours of such a majority point, bump its border weight.
    # NOTE(review): assumes at least k majority and k minority samples exist,
    # otherwise the indexing below raises IndexError.
    for knear in minknear:
        for index in range(k):
            major = knear[index].get_majorNumber()
            # Minority neighbours of this majority element.
            maxknear = []
            for i in range(size):
                if originDataY[i] == 1:
                    maxknear.append(
                        Relation(i, major, self.distanceArray[i][major]))
            maxknear = sorted(maxknear,
                              key=lambda relation: relation.get_distance())
            for i in range(k):
                if maxknear[i].get_minNumber() == knear[index].get_minNumber():
                    kLink[knear[index].get_minNumber()] += 1
    # Normalize the border weights so they sum to 1.
    total = sum(kLink)
    for i in range(size):
        kLink[i] = kLink[i] / total
    # Here N equals the minority count (main() uses 2 * count instead).
    N = count
    T = [math.ceil(N * kLink[i]) for i in range(size)]
    for iClassfiler in range(M):
        # Each round restarts from the original data and draws fresh random
        # synthetic minority samples, so every weak learner sees a slightly
        # different oversampled training set.
        originDataX = self.X
        originDataY = self.y
        sampleDataCount = 0
        for i in range(size):
            if T[i] != 0:
                orinMin = originDataX[:, i]
                minNearArray = []
                for j in range(size):
                    if j != i and originDataY[j] == 1:
                        minNearArray.append(
                            Relation(i, j, self.distanceArray[i][j]))
                minNearArray = sorted(
                    minNearArray,
                    key=lambda relation: relation.get_distance())
                # NOTE(review): assumes T[i] <= number of other minority samples.
                for j in range(T[i]):
                    nearData = originDataX[:, minNearArray[j].get_majorNumber()]
                    sampleData = (orinMin - nearData) * random.random() + orinMin
                    sampleData = sampleData.reshape(1, originDataX.shape[0])
                    originDataX = np.concatenate(
                        (originDataX, sampleData.T), axis=1)
                    originDataY = np.concatenate(
                        (originDataY, np.array([1])), axis=0)
                    sampleDataCount += 1
        originTotalWeight = 0
        if self.W is None:
            # First round: uniform weights over the oversampled set.
            # BUGFIX: was .flatten(1) — integer `order` arguments were removed
            # from NumPy's flatten; the array is a single column, so plain
            # flatten() yields the identical 1-D result.
            self.W = np.ones(
                (originDataX.shape[1], 1)).flatten() / originDataX.shape[1]
            self.sums = np.zeros(originDataY.shape)
        else:
            # Later rounds: keep the weights of the original `size` samples
            # and spread the remaining mass over this round's synthetic ones.
            for i in range(size):
                originTotalWeight = originTotalWeight + self.W[i]
            self.W = self.W[:size]
            for i in range(sampleDataCount):
                # NOTE(review): 0.94 is an unexplained damping constant; with
                # it the weights no longer sum exactly to 1 — confirm intended.
                self.W = np.concatenate(
                    (self.W,
                     np.array([(1 - originTotalWeight) /
                               (0.94 * sampleDataCount)])),
                    axis=0)
        self.G[iClassfiler] = self.Weaker(originDataX, originDataY)
        # Train the weak classifier under the current weights; e is its
        # weighted error rate.
        e = self.G[iClassfiler].train(self.W)
        # AdaBoost coefficient for this classifier.
        self.alpha[iClassfiler] = 1.0 / 2 * np.log((1 - e) / e)
        res = self.G[iClassfiler].pred(originDataX)
        # Report this round's training accuracy.
        print("weak classfier acc", accuracy_score(originDataY, res),
              "\n======================================================")
        # Z is the normalization factor for the weight update.
        Z = self.W * np.exp(
            -self.alpha[iClassfiler] * originDataY * res.transpose())
        # BUGFIX: was .flatten(1) (see above); the product is already 1-D.
        self.W = (Z / Z.sum()).flatten()
        self.Q = iClassfiler
        # errorcnt returns the number of misclassified points; 0 is perfect.
        if (self.errorcnt(iClassfiler, originDataX, originDataY) == 0):
            print("%d个弱分类器可以将错误率降到0" % (iClassfiler + 1))
            break
def generateBorder(data_path='I:\\tools\\SoftwarePrediction\\dataset\\oringData.txt',
                   out_path="test.txt", k=20):
    """Oversample the minority class with border-weighted SMOTE and dump to disk.

    Works on the raw 0/1 labels (0 = majority, 1 = minority), weights each
    minority sample by how often it is a mutual k-nearest neighbour of its
    nearest majority points, synthesizes new minority samples accordingly,
    and writes the augmented dataset (feature rows plus a label row) to
    *out_path* as comma-separated integers.

    Parameters
    ----------
    data_path : str
        Input dataset path (default keeps the previously hard-coded path).
    out_path : str
        Output file path (default keeps the previously hard-coded name).
    k : int
        Neighbourhood size for border scoring (default 20 as before).
    """
    # --- load data; features become (features, samples) ---
    # (The original called train_test_split and immediately discarded the
    # result; that dead work has been removed.)
    dataset = np.loadtxt(data_path, delimiter=",")
    length = len(dataset[0])
    x = dataset[:, 0:length - 1]
    y = dataset[:, length - 1]
    originDataX = x.transpose()
    originDataY = y
    size = originDataX.shape[1]
    # Pairwise Euclidean distances between all samples.
    distanceArray = np.zeros((size, size), dtype=float)
    for i in range(size):
        for j in range(size):
            distanceArray[i][j] = np.sqrt(
                np.sum(np.square(originDataX[:, i] - originDataX[:, j])))
    # kLink[i]: border weight of sample i (non-zero only for minority samples).
    kLink = [0] * size
    # minknear[c]: distances from the c-th minority sample to every majority
    # sample, sorted ascending.
    minknear = []
    count = 0  # number of minority samples
    for i in range(size):
        if originDataY[i] == 1:  # minority sample
            minknear.append([])
            for j in range(size):
                if originDataY[j] == 0:
                    minknear[count].append(Relation(i, j, distanceArray[i][j]))
            minknear[count] = sorted(
                minknear[count], key=lambda relation: relation.get_distance())
            count = count + 1
    # Border scoring: for each minority sample, look at its k nearest majority
    # neighbours; whenever the minority sample is itself among the k nearest
    # minority neighbours of such a majority point, bump its border weight.
    # NOTE(review): assumes at least k majority and k minority samples exist,
    # otherwise the indexing below raises IndexError.
    for knear in minknear:
        for index in range(k):
            major = knear[index].get_majorNumber()
            # Minority neighbours of this majority element.
            maxknear = []
            for i in range(size):
                if originDataY[i] == 1:
                    maxknear.append(Relation(i, major, distanceArray[i][major]))
            maxknear = sorted(maxknear,
                              key=lambda relation: relation.get_distance())
            for i in range(k):
                if maxknear[i].get_minNumber() == knear[index].get_minNumber():
                    kLink[knear[index].get_minNumber()] += 1
    # Normalize the weights so they sum to 1.
    total = sum(kLink)
    for i in range(size):
        kLink[i] = kLink[i] / total
    # N: total number of synthetic samples to generate; T[i]: per-sample quota.
    N = 2 * count
    T = [math.ceil(N * kLink[i]) for i in range(size)]
    sampleDataCount = 0
    # SMOTE: synthesize minority samples around each weighted minority point.
    for i in range(size):
        if T[i] != 0:
            orinMin = originDataX[:, i]
            minNearArray = []
            for j in range(size):
                if j != i and originDataY[j] == 1:
                    minNearArray.append(Relation(i, j, distanceArray[i][j]))
            minNearArray = sorted(minNearArray,
                                  key=lambda relation: relation.get_distance())
            # NOTE(review): assumes T[i] <= number of other minority samples.
            for j in range(T[i]):
                nearData = originDataX[:, minNearArray[j].get_majorNumber()]
                sampleData = (orinMin - nearData) * random.random() + orinMin
                sampleData = sampleData.reshape(1, originDataX.shape[0])
                originDataX = np.concatenate((originDataX, sampleData.T), axis=1)
                originDataY = np.concatenate((originDataY, np.array([1])), axis=0)
                sampleDataCount += 1
    # Append the label row, then transpose back to (samples, features + 1).
    originDataX = np.concatenate(
        (originDataX, originDataY.reshape(1, originDataY.shape[0])), axis=0)
    originDataX = originDataX.T
    # BUGFIX: the file handle was never closed (leak / possibly unflushed
    # buffer); a context manager guarantees close even on error.
    # NOTE(review): only the first three columns are written — assumes the
    # dataset has exactly two features plus the label; confirm.
    with open(out_path, 'w') as doc:
        for row in originDataX:
            doc.write(
                str(int(row[0])) + "," + str(int(row[1])) + "," +
                str(int(row[2])) + "\n")