def _test_rnn_rand_vec():
    # Randomly generate a tensor of shape 1000 x 10 x 200: 1000 sentences,
    # each made of 10 word vectors of dimension 200, with values drawn from
    # a normal distribution.
    _xs = torch.randn(1000, 10, 200)

    # labels in the closed interval [0, 5]
    _ys = []
    for i in range(1000):
        _ys.append(np.random.randint(0, 6))

    # hidden size 200, output size 6: the hidden size matches the word-vector
    # width, the output size matches the number of label values (one-hot)
    encoder_test = EncoderRNNWithVector(200, 6)

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(encoder_test.parameters(), lr=0.001, momentum=0.9)

    num_data = len(_xs)  # 1000
    batchsize = 20
    num_epoches = 10
    for epoch in range(num_epoches):
        for start, end in zip(range(0, num_data, batchsize),
                              range(batchsize, num_data, batchsize)):
            encoder_hidden = encoder_test.init_hidden()
            input_data = torch.autograd.Variable(_xs[start:end])
            # the targets must be a LongTensor of shape (batchsize,)
            output_labels = torch.autograd.Variable(torch.LongTensor(_ys[start:end]))

            # forward pass
            encoder_outputs, encoder_hidden = encoder_test(input_data, encoder_hidden)

            optimizer.zero_grad()
            predict = encoder_outputs.view(batchsize, -1)
            # predict:       torch.Size([20, 6])
            # output_labels: torch.Size([20])
            loss = criterion(predict, output_labels)
            loss.backward()
            optimizer.step()

            eva(predict, output_labels, batchsize)
    return
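# `EncoderRNNWithVector` and `eva` live elsewhere in this project and are not
# shown in this file. Below is a minimal sketch consistent with how they are
# called above; the GRU choice, layer sizes, and the accuracy metric are
# assumptions, not the original implementation.
class EncoderRNNWithVector(torch.nn.Module):
    def __init__(self, hidden_size, out_size, input_size=200, n_layers=1):
        super(EncoderRNNWithVector, self).__init__()
        self.hidden_size = hidden_size
        # batch_first=True keeps the input as (batch, seq_len, embed_dim)
        self.gru = torch.nn.GRU(input_size, hidden_size, n_layers, batch_first=True)
        self.out = torch.nn.Linear(hidden_size, out_size)

    def forward(self, word_inputs, hidden):
        output, hidden = self.gru(word_inputs, hidden)
        # classify from the last time step: (batch, out_size)
        return self.out(output[:, -1, :]), hidden

    def init_hidden(self):
        # nn.GRU treats hx=None as a zero initial hidden state, which keeps
        # this helper independent of the batch size
        return None


def eva(predict, labels, batchsize):
    # batch accuracy: fraction of argmax predictions that match the labels
    correct = (predict.argmax(dim=1) == labels).sum().item()
    print("batch accuracy: %.3f" % (correct / float(batchsize)))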
def SpectralClustering(data, class_num, data_nm, label):
    # NOTE: this wrapper shadows sklearn.cluster.SpectralClustering. sklearn's
    # spectral_clustering() expects a precomputed affinity matrix, so each
    # candidate kernel is turned into one first (assumes pairwise_kernels and
    # euclidean_distances are imported from sklearn.metrics.pairwise).
    X = data
    af = ['rbf', 'laplacian', 'distance']
    Compare = []
    for a in af:
        if a == 'distance':
            # 'distance' is not a pairwise kernel; use a Gaussian of the
            # euclidean distances as a rough affinity instead
            aff = np.exp(-euclidean_distances(X))
        else:
            aff = pairwise_kernels(X, metric=a)
        la = spectral_clustering(aff, n_clusters=class_num)
        Compare.append(la)

    # keep the candidate labeling with the best adjusted Rand index
    A = []
    for com in Compare:
        NMI, ARI = evaluate.eva_com(com, label)
        A.append(ARI)
    k = Fmax(A)
    labels = Compare[k]
    print("NMI  accuracy  purity  silhouette  ARI")
    nmi, acc, purity, Sc, ARI = evaluate.eva(labels, label, X)
    print(nmi, acc, purity, Sc, ARI)

    # plot the spectral clustering result over the original data
    plt.style.use('ggplot')
    plt.scatter(X[:, 0], X[:, 1], c=labels, edgecolors='k')
    plt.title("SC2+" + data_nm)
    plt.savefig(r'.\picture\improved_spectral_clustering\sc1_{0}.png'.format(data_nm))
    plt.close()
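# `Fmax` and the `evaluate` module are project helpers not shown in this
# file. Minimal sketches consistent with how they are called above, roughly
# as they might appear in evaluate.py; the purity computation and the
# accuracy stand-in are assumptions:
import numpy as np
from sklearn import metrics

def Fmax(scores):
    # index of the best-scoring candidate labeling
    return int(np.argmax(scores))

def eva_com(pred, label):
    # NMI and adjusted Rand index, used for model selection
    label = np.asarray(label).ravel()
    return (metrics.normalized_mutual_info_score(label, pred),
            metrics.adjusted_rand_score(label, pred))

def eva(pred, label, X):
    # the five values unpacked by the callers above
    label = np.asarray(label).ravel()
    pred = np.asarray(pred).ravel()
    nmi = metrics.normalized_mutual_info_score(label, pred)
    # purity: each cluster votes for its majority ground-truth class
    contingency = metrics.cluster.contingency_matrix(label, pred)
    purity = contingency.max(axis=0).sum() / contingency.sum()
    acc = purity  # common stand-in when cluster labels are not aligned
    sc = metrics.silhouette_score(X, pred)
    ari = metrics.adjusted_rand_score(label, pred)
    return nmi, acc, purity, sc, ari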
def HC(data, class_num, data_nm, label):
    k = class_num
    # Z = sch.linkage(data, method='average', metric='euclidean')
    # sch.dendrogram(Z)
    # plt.savefig(r'.\picture\hierarchical_clustering\c_{0}.png'.format(data_nm))
    # plt.close()
    # plt.figure()
    # sns.clustermap(data, method='average', metric='euclidean', cmap='RdYlBu_r')
    # plt.savefig(r'.\picture\hierarchical_clustering\c1_{0}.png'.format(data_nm))
    # plt.close()
    hc = AgglomerativeClustering(k, affinity='euclidean', linkage='ward')
    y_hc = hc.fit_predict(data)
    # print(len(y_hc))
    # print(len(label.reshape(-1)))
    print("NMI  accuracy  purity  silhouette  ARI")
    nmi, acc, purity, Sc, ARI = evaluate.eva(y_hc, label, data)
    print(nmi, acc, purity, Sc, ARI)

    colors = [
        '#B03060', '#AEEEEE', '#68228B', 'y', 'c', 'm', '#2E2E2E', '#00008B',
        '#2E8B57', '#8B0000', '#8B5A00', '#EEEE00', '#CDCDB4', '#ABABAB',
        '#8B8B00'
    ]
    plt.figure()
    for i in range(k):
        color = colors[i % len(colors)]
        plt.scatter(data[y_hc == i, 0], data[y_hc == i, 1], s=6, c=color)
    plt.title("AgglomerativeClustering+" + data_nm)
    # plt.legend(loc='best')
    plt.savefig(r'.\picture\hierarchical_clustering\hc_{0}.png'.format(data_nm))
    plt.close()
def FCM1(data, class_num, data_nm, label):
    # Hyperparameters
    C = class_num   # number of clusters
    m = 1.1         # fuzzifier
    iteration = 10

    X = np.array(data)
    n, dimension = X.shape

    # random initial membership matrix, one row per sample
    U = np.array(np.random.rand(n, C), dtype='double')
    U_crisp = np.zeros((n, 1))
    mu = np.zeros((C, dimension))  # cluster centers
    fig, ax = plt.subplots()

    for k in range(iteration):
        # normalize the memberships so every row sums to 1
        for i in range(n):
            U[i, :] = U[i, :] / sum(U[i, :])
        # update each center as the membership-weighted mean of the data
        for j in range(C):
            temp = U[:, j] ** m
            mu[j, :] = sum(np.multiply(temp, X.transpose()).transpose()) / sum(temp)
        # update the memberships from the distance ratios; the exponent
        # applies to each ratio inside the sum (standard FCM update)
        for i in range(n):
            for j in range(C):
                U[i, j] = 1 / sum((d(X[i, :], mu[j, :], 'vector') /
                                   d(X[i, :], mu[:, :], 'matrix')) ** (1 / (m - 1)))

    # harden the fuzzy memberships with an argmax per sample
    UV = []
    for i in range(n):
        U_crisp[i] = np.argmax(U[i, :])
        UV.extend(U_crisp[i])

    print("NMI  accuracy  purity  silhouette  ARI")
    nmi, acc, purity, Sc, ARI = evaluate.eva(UV, label, data)
    print(nmi, acc, purity, Sc, ARI)

    colors = [
        '#B03060', '#AEEEEE', '#68228B', 'y', 'c', 'm', '#2E2E2E', '#00008B',
        '#2E8B57', '#8B0000', '#8B5A00', '#EEEE00', '#CDCDB4', '#ABABAB',
        '#8B8B00'
    ]
    for i in range(C):
        points = np.array([X[j, :] for j in range(n) if U_crisp[j] == i])
        if len(points) == 0:  # a cluster may end up empty
            continue
        ax.scatter(points[:, 0], points[:, 1], s=7, c=colors[i])
    plt.title("FCM+" + data_nm)
    # plt.legend(loc='best')
    plt.savefig(r'.\picture\FCM\F_{0}.png'.format(data_nm))
    plt.close()
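# The distance helper `d` is defined elsewhere in this project. A minimal
# sketch consistent with the two call modes above; the exact original
# definition is an assumption:
def d(x, y, mode):
    if mode == 'vector':
        # squared euclidean distance between two vectors; with squared
        # distances, the membership exponent 1/(m-1) above matches the
        # classic FCM update (d_ij/d_ik)^(2/(m-1)) on plain distances
        return np.sum((x - y) ** 2)
    # mode == 'matrix': squared euclidean distance from x to each row of y
    return np.sum((y - x) ** 2, axis=1)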
def dbscan(data, data_nm, class_num, label):
    # fit DBSCAN with neighborhood radius eps and minimum sample count min_samples
    db = DBSCAN(eps=0.82, min_samples=2).fit(data)
    labels = db.labels_
    # print(len(labels))
    # print(len(label.reshape(-1)))
    print("NMI  accuracy  purity  silhouette  ARI")
    nmi, acc, purity, SC, ARI = evaluate.eva(labels, label, data)
    print(nmi, acc, purity, SC, ARI)
    # fraction of points labeled as noise (DBSCAN marks noise as -1)
    # ratio = len(labels[labels[:] == -1]) / len(labels)
    # print('noise ratio:', format(ratio, '.2%'))
    plotRes(data, labels, class_num, data_nm)
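# `plotRes` here takes four arguments, unlike the three-argument version in
# the from-scratch scripts further below. A minimal sketch of what it likely
# does; the palette and save path are assumptions. Noise points (label -1)
# are left unplotted:
def plotRes(data, clusterRes, clusterNum, data_nm):
    scatterColors = ['black', 'blue', 'green', 'yellow', 'red', 'purple',
                     'orange', 'brown']
    for i in range(clusterNum):
        color = scatterColors[i % len(scatterColors)]
        mask = (clusterRes == i)
        plt.scatter(data[mask, 0], data[mask, 1], c=color, alpha=1, marker='+')
    plt.title("DBSCAN+" + data_nm)
    plt.savefig(r'.\picture\DBSCAN\db_{0}.png'.format(data_nm))
    plt.close()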
def sp(data, class_num, data_nm, label):
    n_clusters = class_num
    matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False

    # squared pairwise euclidean distances
    m = euclidean_distances(data, squared=True)
    # print(m)
    sigma = np.median(m)  # reference scale for the sigma sweep below

    plt.figure(figsize=(12, 8), facecolor='w')
    plt.suptitle(u'Spectral Clustering', fontsize=20)
    clrs = [
        '#B03060', '#AEEEEE', '#68228B', 'y', 'c', 'm', '#2E2E2E', '#00008B',
        '#2E8B57', '#FAEBD7', '#8B5A00', '#EEEE00', '#0000FF', '#ABABAB',
        '#8B8B00'
    ]
    # print(len(clrs))
    assess = []
    for i, s in enumerate(np.logspace(-2, 0, 6)):
        # Gaussian affinity; m already holds squared distances, so it is not
        # squared again here
        af = np.exp(-m / (s ** 2)) + 1e-6
        y_hat = spectral_clustering(af, n_clusters=n_clusters,
                                    assign_labels='kmeans', random_state=1)
        # assess.append(y_hat)
        plt.subplot(2, 3, i + 1)
        for k, clr in enumerate(clrs):
            cur = (y_hat == k)
            plt.scatter(data[cur, 0], data[cur, 1], s=40, color=clr, edgecolors='k')
        x1_min, x2_min = np.min(data, axis=0)
        x1_max, x2_max = np.max(data, axis=0)
        x1_min, x1_max = expand(x1_min, x1_max)
        x2_min, x2_max = expand(x2_min, x2_max)
        plt.xlim((x1_min, x1_max))
        plt.ylim((x2_min, x2_max))
        plt.grid(True)
        plt.title(u'sigma = %.2f' % s, fontsize=16)
        # print(y_hat)
        print("NMI  accuracy  purity  silhouette  ARI")
        nmi, acc, purity, Sc, ARI = evaluate.eva(y_hat, label, data)
        print(nmi, acc, purity, Sc, ARI)

    plt.tight_layout()
    plt.title("SC1+" + data_nm)
    plt.subplots_adjust(top=0.9)
    plt.savefig(r'.\picture\improved_spectral_clustering\sc1_{0}.png'.format(data_nm))
    plt.close()
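# `expand` pads the axis limits so points do not sit on the plot border. A
# minimal sketch of the helper assumed above; the 10% margin is an assumption:
def expand(a, b):
    margin = (b - a) * 0.1
    return a - margin, b + margin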
def main():
    word_id = create_vocab(args.training_data_path, args.vocab_path, True)
    # label_id = create_vocab(args.training_data_path, args.vocab_tag_path)
    args.class_num = 3
    # train, test = load_data(args.training_data_path, word_id, label_id)
    train1, test1 = load_data1(args.training_data_path, word_id)
    # train1, test1 = load_data_bert(args.training_data_path, word_id)
    TrainX, TrainY = zip(*train1)
    testX, testY = zip(*test1)

    cnn = model.CNN_Text(args).cuda()
    criterion = torch.nn.CrossEntropyLoss()
    # optimizer = torch.optim.SGD(cnn.parameters(), lr=0.001, momentum=0.9)
    opt_Adam = torch.optim.Adam(cnn.parameters(), lr=args.lr, betas=(0.9, 0.99))

    # run exactly args.epoches epochs
    for epoch in range(1, args.epoches + 1):
        print("epoch", epoch)
        batch_iter = batch_helper(TrainX, TrainY, args.batch_size)
        for trainx, trainy in batch_iter:
            # print("trainy length", len(trainy))  # batch_size
            input_data = torch.autograd.Variable(torch.LongTensor(trainx)).cuda()
            output_labels = torch.autograd.Variable(torch.LongTensor(trainy)).cuda()
            # CrossEntropyLoss expects targets of shape (batch,)
            output_labels = output_labels.squeeze()
            # print("vocab_size", args.vocab_size)

            cnn_outputs = cnn(input_data)

            opt_Adam.zero_grad()
            loss = criterion(cnn_outputs, output_labels)
            loss.backward()
            opt_Adam.step()

            # for param_tensor in cnn.state_dict():
            #     print(param_tensor, "\t", cnn.state_dict()[param_tensor].size())
            # for var_name in opt_Adam.state_dict():
            #     print(var_name, "\t", opt_Adam.state_dict()[var_name])
            eva(cnn_outputs, output_labels, args.batch_size)

    # save the trained parameters once, after training finishes
    torch.save(cnn.state_dict(), args.parameters_path)
    run_val(testX, testY, cnn)
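# `batch_helper` is a project helper that is not shown here. A minimal
# generator consistent with how it is consumed above; dropping the final
# short batch is an assumption:
def batch_helper(X, Y, batch_size):
    for start in range(0, len(X) - batch_size + 1, batch_size):
        yield X[start:start + batch_size], Y[start:start + batch_size]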
def kmeans(data, class_num, data_nm, label):
    k = class_num
    clu = random.sample(data.tolist(), k)  # pick k random points as initial centroids
    clu = np.asarray(clu)
    err, clunew, k, clusterRes = Kmeans.classfy(data, clu, k)
    # iterate until the centroid update error vanishes
    while np.any(abs(err) > 0):
        # print(clunew)
        err, clunew, k, clusterRes = Kmeans.classfy(data, clunew, k)

    clulist = Kmeans.cal_dis(data, clunew, k)
    clusterResult = Kmeans.divide(data, clulist)
    # print(clusterResult)
    # print(label.reshape(-1))
    print("NMI  accuracy  purity  silhouette  ARI")
    nmi, acc, purity, Sc, ARI = evaluate.eva(clusterResult, label, data)
    print(nmi, acc, purity, Sc, ARI)
    Kmeans.plotRes(data, clusterResult, k, data_nm)
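# The Kmeans module (classfy, cal_dis, divide, plotRes) is this project's
# from-scratch implementation; its __main__ block appears further below. For
# reference, a minimal sketch of the Lloyd-style step `classfy` performs,
# matching its (err, clunew, k, clusterRes) return signature; the internals
# are assumptions, and data rows are assumed to match the centroid width:
def classfy(data, clu, k):
    clulist = cal_dis(data, clu, k)                 # distances to each centroid
    clusterRes = np.asarray(divide(data, clulist))  # nearest-centroid assignment
    # recompute each centroid as the mean of its assigned points
    clunew = np.asarray([data[clusterRes == i].mean(axis=0) for i in range(k)])
    err = clunew - clu  # centroid movement; the caller loops until this is 0
    return err, clunew, k, clusterRes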
def sp(data, class_num, data_nm, label):
    n_clusters = class_num
    matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.figure(facecolor='w')
    clrs = [
        '#B03060', '#AEEEEE', '#68228B', 'y', 'c', 'm', '#2E2E2E', '#00008B',
        '#2E8B57', '#FAEBD7', '#8B5A00', '#EEEE00', '#0000FF', '#ABABAB',
        '#8B8B00'
    ]
    gamma_list = [0.1, 0.2, 0.4, 0.6, 0.8, 1]
    af = ['laplacian', 'nearest_neighbors']  # alternative affinities (unused)
    Compare = []
    for gamma_value in gamma_list:
        # for a in af:
        # note: sklearn ignores gamma when affinity='nearest_neighbors'
        spectral = SpectralClustering(n_clusters, gamma=gamma_value,
                                      affinity='nearest_neighbors', random_state=1)
        y_hat = spectral.fit_predict(data)
        Compare.append(y_hat)

    # keep the candidate labeling with the best NMI
    N = []
    A = []
    for com in Compare:
        NMI, ARI = evaluate.eva_com(com, label)
        N.append(NMI)
        A.append(ARI)
    k = Fmax(N)
    y_hat = Compare[k]
    print("NMI  accuracy  purity  silhouette  ARI")
    nmi, acc, purity, Sc, ARI = evaluate.eva(y_hat, label, data)
    print(nmi, acc, purity, Sc, ARI)

    for k, clr in enumerate(clrs):
        cur = (y_hat == k)
        plt.scatter(data[cur, 0], data[cur, 1], s=40, color=clr, edgecolors='k')
    x1_min, x2_min = np.min(data, axis=0)
    x1_max, x2_max = np.max(data, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    # plt.grid(True)
    # plt.legend(loc='best')
    plt.title("SC2+" + data_nm)
    plt.savefig(r'.\picture\improved_spectral_clustering\sc1_{0}.png'.format(data_nm))
    plt.close()
            end = time.time()
            print("Dev: " + str(numOfSamples) + ' / ' + str(len(dev)) +
                  " , Current loss : " + str(loss / numOfSamples) +
                  ", run time = " + str(end - start))
            start = time.time()
            # print('%s (%d %d%%) %.4f' % (timeSince(start, numOfSamples / (len(train) * 1.0)),
            #                              numOfSamples, numOfSamples / len(train) * 100,
            #                              loss / numOfSamples))

    loss /= numOfSamples
    writeResult.close()
    with codecs.open('result.txt', 'w', encoding='utf-8') as outfile:
        json.dump(dict, outfile)
    print('Dev Loss: ' + str(loss))
    evaluate.eva('result.txt', '../data/dev-v1.1.json')

    # loss = 0
    # numOfSamples = 0
    # numOfBatch = 0
    # start = time.time()
    # print("Start Dev2:")
    # dict = {}
    # s = ""
    # writeResult = codecs.open('prediction2.txt', 'w', encoding='utf-8')
    # for sid in range(0, len(dev2), config.DevBatchSize):
    #     instances = dev2[sid:sid + config.DevBatchSize]
    #     # print(instances[0][10])
    ]
    for i in range(clusterNum):
        color = scatterColors[i % len(scatterColors)]
        x1 = []
        y1 = []
        for j in range(nPoints):
            if clusterRes[j] == i:
                x1.append(data[j, 0])
                y1.append(data[j, 1])
        plt.scatter(x1, y1, c=color, alpha=1, marker='+')
    plt.show()


if __name__ == '__main__':
    k = 7  # number of clusters
    data = load_data()
    print(data)
    clu = random.sample(data[:, 0:2].tolist(), k)  # pick k random points as initial centroids
    print(clu)
    clu = np.asarray(clu)
    err, clunew, k, clusterRes = classfy(data, clu, k)
    # iterate until the centroid update error vanishes
    while np.any(abs(err) > 0):
        print(clunew)
        err, clunew, k, clusterRes = classfy(data, clunew, k)

    clulist = cal_dis(data, clunew, k)
    clusterResult = divide(data, clulist)
    nmi, acc, purity = eva.eva(clusterResult, np.asarray(data[:, 2]))
    print(nmi, acc, purity)
    plotRes(data, clusterResult, k)
        if clusterRes[pointId] == UNASSIGNED:
            if to_cluster(data, clusterRes, pointId, clusterId, radius, minPts):
                clusterId = clusterId + 1
    return np.asarray(clusterRes), clusterId


def plotRes(data, clusterRes, clusterNum):
    nPoints = len(data)
    scatterColors = [
        'black', 'blue', 'green', 'yellow', 'red', 'purple', 'orange', 'brown'
    ]
    for i in range(clusterNum):
        color = scatterColors[i % len(scatterColors)]
        x1 = []
        y1 = []
        for j in range(nPoints):
            if clusterRes[j] == i:
                x1.append(data[j, 0])
                y1.append(data[j, 1])
        plt.scatter(x1, y1, c=color, alpha=1, marker='+')


if __name__ == '__main__':
    data = load_data()
    cluster = np.asarray(data[:, 2])
    clusterRes, clusterNum = dbscan(data, 0.8, 3)
    plotRes(data, clusterRes, clusterNum)
    nmi, acc, purity = eva.eva(clusterRes, cluster)
    print(nmi, acc, purity)
    plt.show()
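# `to_cluster` (the region-growing step), its neighborhood helper, UNASSIGNED,
# and load_data are defined earlier in this file and are not shown. For
# reference, a minimal sketch of the standard DBSCAN expansion logic the loop
# above relies on; the constants and the `neighbours` helper are assumptions:
UNASSIGNED = -2  # not yet visited
NOISE = -1       # visited, assigned to no cluster

def neighbours(data, pointId, radius):
    # indices of all points within `radius` of pointId (first two columns)
    diff = data[:, 0:2] - data[pointId, 0:2]
    return np.where(np.linalg.norm(diff, axis=1) <= radius)[0].tolist()

def to_cluster(data, clusterRes, pointId, clusterId, radius, minPts):
    points = neighbours(data, pointId, radius)
    if len(points) < minPts:
        clusterRes[pointId] = NOISE  # may still become a border point later
        return False
    clusterRes[pointId] = clusterId
    while points:
        p = points.pop(0)
        if clusterRes[p] == NOISE:
            clusterRes[p] = clusterId  # border point
        elif clusterRes[p] == UNASSIGNED:
            clusterRes[p] = clusterId
            extra = neighbours(data, p, radius)
            if len(extra) >= minPts:   # p is a core point too
                points.extend(extra)
    return True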
    nPoints = len(data)
    scatterColors = [
        'black', 'blue', 'green', 'yellow', 'red', 'purple', 'orange'
    ]
    for i in range(clusterNum):
        color = scatterColors[i % len(scatterColors)]
        x1 = []
        y1 = []
        for j in range(nPoints):
            if clusterResult[j] == i:
                x1.append(data[j, 0])
                y1.append(data[j, 1])
        plt.scatter(x1, y1, c=color, alpha=1, marker='+')
    plt.show()


if __name__ == '__main__':
    cluster_num = 2
    KNN_k = 5
    data = load_data()
    data = np.asarray(data)
    W = getW(data, KNN_k)    # KNN similarity graph
    D = getD(W)              # degree matrix
    L = getL(D, W)           # graph Laplacian
    eigvec = getEigen(L)     # eigenvectors spanning the spectral embedding
    # k-means on the spectral embedding gives the final assignment
    clf = KMeans(n_clusters=cluster_num)
    s = clf.fit(eigvec)
    C = s.labels_
    nmi, acc, purity = eval.eva(C + 1, data[:, 2])
    print(nmi, acc, purity)
    plotRes(data, np.asarray(C), cluster_num)
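# getW, getD, getL, and getEigen are defined earlier in this script and are
# not shown here. Minimal sketches of the classic unnormalized spectral
# clustering pipeline they implement; the symmetric-KNN construction and the
# choice of the smallest eigenvectors are assumptions:
def getW(data, k):
    # symmetric KNN adjacency on the first two feature columns:
    # w_ij = 1 if i is among j's k nearest neighbors or vice versa
    X = data[:, 0:2]
    n = len(X)
    dist = np.linalg.norm(X[:, None, :] - X[None, :, :], axis=2)
    W = np.zeros((n, n))
    for i in range(n):
        idx = np.argsort(dist[i])[1:k + 1]  # skip the point itself
        W[i, idx] = 1
    return np.maximum(W, W.T)

def getD(W):
    # diagonal degree matrix
    return np.diag(W.sum(axis=1))

def getL(D, W):
    # unnormalized graph Laplacian L = D - W
    return D - W

def getEigen(L, cluster_num=2):
    # the eigenvectors for the smallest eigenvalues span the embedding
    eigval, eigvec = np.linalg.eigh(L)
    return eigvec[:, :cluster_num]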