Example #1
def _test_rnn_rand_vec():

    # Randomly generate a tensor of shape 1000 x 10 x 200: 1000 sentences,
    # each containing 10 word vectors of 200 dimensions, with values drawn
    # from a normal distribution.

    _xs = torch.randn(1000, 10, 200)
    _ys = []

    # Label values in the closed interval [0, 5]
    for i in range(1000):
        _ys.append(np.random.randint(0, 6))

    # Hidden size 200, output size 6: the hidden size matches the word-vector
    # width, the output size the number of label values (one-hot)
    encoder_test = EncoderRNNWithVector(200, 6)

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(encoder_test.parameters(),
                                lr=0.001,
                                momentum=0.9)

    num_data = len(_xs)  # 1000
    batchsize = 20
    num_epoches = 10
    for epoch in range(num_epoches):
        # the + 1 keeps the final 980:1000 batch from being dropped
        for start, end in zip(range(0, num_data, batchsize),
                              range(batchsize, num_data + 1, batchsize)):
            encoder_hidden = encoder_test.init_hidden()
            input_data = torch.autograd.Variable(_xs[start:end])
            # output_labels must be a LongTensor for CrossEntropyLoss
            output_labels = torch.autograd.Variable(
                torch.LongTensor(_ys[start:end]))
            # forward pass
            encoder_outputs, encoder_hidden = encoder_test(
                input_data, encoder_hidden)

            optimizer.zero_grad()
            predict = encoder_outputs.view(batchsize, -1)

            # print("predict_shape",predict.size()) #predict_shape torch.Size([20, 6])
            # print("output_labels", output_labels.size()) #output_labels torch.Size([20])

            loss = criterion(predict, output_labels)

            loss.backward()
            optimizer.step()

            eva(predict, output_labels, batchsize)

    return
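Both `EncoderRNNWithVector` and `eva` come from elsewhere in the project and are not shown. A minimal sketch of what they might look like, assuming a single-layer GRU encoder with a linear classification head and a plain argmax accuracy check (names, shapes and the fixed batch size of 20 are inferred from the call sites, not taken from the project):

import torch


class EncoderRNNWithVector(torch.nn.Module):
    """Sketch: GRU over the word vectors, linear layer to class logits."""

    def __init__(self, hidden_size, out_size, n_layers=1, batch_size=20):
        super(EncoderRNNWithVector, self).__init__()
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.gru = torch.nn.GRU(hidden_size, hidden_size, n_layers,
                                batch_first=True)
        self.out = torch.nn.Linear(hidden_size, out_size)

    def forward(self, word_inputs, hidden):
        # word_inputs: (batch, seq_len, hidden_size)
        output, hidden = self.gru(word_inputs, hidden)
        output = self.out(output[:, -1, :])  # classify from the last step
        return output, hidden

    def init_hidden(self):
        return torch.zeros(self.n_layers, self.batch_size, self.hidden_size)


def eva(predict, labels, batch_size):
    # batch accuracy from the argmax over the class logits
    correct = (predict.argmax(dim=1) == labels).sum().item()
    print("batch accuracy:", correct / batch_size)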
Example #2
def SpectralClustering(data, class_num, data_nm, label):
    X = data
    # sklearn's spectral_clustering expects a precomputed affinity matrix,
    # so build one per candidate kernel with sklearn.metrics.pairwise_kernels
    # ('distance' is not a kernel pairwise_kernels accepts; 'sigmoid' stands
    # in for it here)
    af = ['rbf', 'laplacian', 'sigmoid']
    Compare = []
    for a in af:
        aff = pairwise_kernels(X, metric=a)
        la = spectral_clustering(aff, n_clusters=class_num)
        Compare.append(la)
    A = []
    for com in Compare:
        NMI, ARI = evaluate.eva_com(com, label)
        A.append(ARI)
    k = Fmax(A)
    labels = Compare[k]
    print("标准化互信息      精度      纯度     轮廓系数    兰德系数")
    nmi, acc, purity, Sc, ARI = evaluate.eva(labels, label, X)
    print(nmi, acc, purity, Sc, ARI)
    # plot the spectral clustering result over the raw data
    plt.style.use('ggplot')
    plt.scatter(X[:, 0], X[:, 1], c=labels, edgecolors='k')
    plt.title("SC2+" + data_nm)
    plt.savefig(
        r'.\picture\improved_spectral_clustering\sc1_{0}.png'.format(data_nm))
    plt.close()
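`Fmax` is not shown; from the way it is used (picking the index of the best-scoring run) it is presumably an argmax over a Python list. A one-line reconstruction under that assumption:

def Fmax(scores):
    # index of the largest value (hypothetical reconstruction)
    return scores.index(max(scores))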
Example #3
File: hc.py  Project: bang77/SCRNA
def HC(data, class_num, data_nm, label):
    k = class_num
    # Z = sch.linkage(data, method='average',metric='euclidean')
    # sch.dendrogram(Z)
    # plt.savefig('.\picture\hierarchical_clustering\c_{0}.png'.format(data_nm))
    # plt.close()
    # plt.figure()
    # sns.clustermap(data,method='average',metric='euclidean',cmap='RdYlBu_r')
    # plt.savefig('.\picture\hierarchical_clustering\c1_{0}.png'.format(data_nm))
    # plt.close()

    hc = AgglomerativeClustering(k, affinity='euclidean', linkage='ward')
    y_hc = hc.fit_predict(data)
    # print(len(y_hc))
    # print(len(label.reshape(-1)))
    print("标准化互信息      精度      纯度     轮廓系数    兰德系数")
    nmi, acc, purity, Sc, ARI = evaluate.eva(y_hc, label, data)
    print(nmi, acc, purity, Sc, ARI)
    colors = [
        '#B03060', '#AEEEEE', '#68228B', 'y', 'c', 'm', '#2E2E2E', '#00008B',
        '#2E8B57', '#8B0000', '#8B5A00', '#EEEE00', '#CDCDB4', '#ABABAB',
        '#8B8B00'
    ]
    plt.figure()
    for i in range(k):
        color = colors[i % len(colors)]
        plt.scatter(data[y_hc == i, 0], data[y_hc == i, 1], s=6, c=color)
    plt.title("AgglomerativeClustering+" + data_nm)
    # plt.legend(loc='best')
    plt.savefig(r'.\picture\hierarchical_clustering\hc_{0}.png'.format(data_nm))
    plt.close()
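`evaluate.eva` is shared by most of these examples and returns the five metrics named in the header line (NMI, accuracy, purity, silhouette, ARI). The project's own implementation is not shown; a rough sketch built from standard sklearn/scipy pieces could look like this:

import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn import metrics
from sklearn.metrics.cluster import contingency_matrix


def eva(pred, truth, X):
    """NMI, best-map accuracy, purity, silhouette and ARI (assumed metrics)."""
    pred = np.asarray(pred).reshape(-1).astype(int)
    truth = np.asarray(truth).reshape(-1).astype(int)
    nmi = metrics.normalized_mutual_info_score(truth, pred)
    ari = metrics.adjusted_rand_score(truth, pred)
    cont = contingency_matrix(truth, pred)
    # accuracy: match clusters to classes with the Hungarian algorithm
    row, col = linear_sum_assignment(-cont)
    acc = cont[row, col].sum() / len(pred)
    # purity: every cluster votes for its majority class
    purity = cont.max(axis=0).sum() / len(pred)
    # silhouette is defined only when there are at least two clusters
    sc = metrics.silhouette_score(X, pred) if len(set(pred)) > 1 else 0.0
    return nmi, acc, purity, sc, ari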
Example #4
def FCM1(data, class_num, data_nm, label):
    # Hyper Parameters
    # C = int(class_num)
    C = class_num
    m = 1.1
    iteration = 10
    X = data
    n, dimension = X.shape
    # print(n)
    # print(dimension)
    U = np.array(np.random.rand(n, C), dtype='double')
    # print(U)
    U_crisp = np.zeros((n, 1))
    mu = np.zeros((C, dimension))
    # print(mu)
    X = np.array(X)
    fig, ax = plt.subplots()

    for k in range(iteration):

        # normalize memberships so each row sums to 1
        for i in range(n):
            U[i, :] = U[i, :] / sum(U[i, :])

        # update centroids as membership-weighted means
        for j in range(C):
            temp = U[:, j]**m
            mu[j, :] = np.sum(temp[:, None] * X, axis=0) / np.sum(temp)

        # standard FCM membership update, assuming d() returns squared
        # distances: u_ij = 1 / sum_k (d_ij / d_ik) ** (1 / (m - 1))
        for i in range(n):
            for j in range(C):
                ratio = d(X[i, :], mu[j, :], 'vector') / d(X[i, :], mu, 'matrix')
                U[i, j] = 1 / np.sum(ratio**(1 / (m - 1)))
    # defuzzify: assign each point to its highest-membership cluster
    UV = []
    for i in range(n):
        U_crisp[i] = np.argmax(U[i, :])
        UV.extend(U_crisp[i])
    # print(UV)
    print("NMI      Accuracy      Purity     Silhouette    ARI")
    nmi, acc, purity, Sc, ARI = evaluate.eva(UV, label, data)
    print(nmi, acc, purity, Sc, ARI)
    colors = [
        '#B03060', '#AEEEEE', '#68228B', 'y', 'c', 'm', '#2E2E2E', '#00008B',
        '#2E8B57', '#8B0000', '#8B5A00', '#EEEE00', '#CDCDB4', '#ABABAB',
        '#8B8B00'
    ]

    for i in range(C):
        points = np.array([X[j, :] for j in range(n) if U_crisp[j] == i])
        # print(points)
        if len(points):  # a cluster can end up empty
            ax.scatter(points[:, 0], points[:, 1], s=7,
                       c=colors[i % len(colors)])
    plt.title("FCM+" + data_nm)
    # plt.legend(loc='best')
    plt.savefig(r'.\picture\FCM\F_{0}.png'.format(data_nm))
    plt.close()
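The update above leans on a distance helper `d` that is not shown. A plausible reading, assuming it returns squared Euclidean distances either to a single centroid ('vector') or to the whole centroid matrix at once ('matrix'):

import numpy as np


def d(x, mu, mode):
    # hypothetical reconstruction of the FCM distance helper
    if mode == 'vector':    # mu is one centroid
        return np.sum((x - mu) ** 2)
    if mode == 'matrix':    # mu is the (C, dimension) centroid matrix
        return np.sum((x - mu) ** 2, axis=1)
    raise ValueError("mode must be 'vector' or 'matrix'")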
Example #5
def dbscan(data, data_nm, class_num, label):
    # set the neighbourhood radius (eps) and minimum sample count
    # (min_samples), then fit
    db = DBSCAN(eps=0.82, min_samples=2).fit(data)
    labels = db.labels_
    # print(len(labels))
    # print(len(label.reshape(-1)))
    print("标准化互信息      精度      纯度     轮廓系数    兰德系数")
    nmi, acc, purity, SC, ARI = evaluate.eva(labels, label, data)
    print(nmi, acc, purity, SC, ARI)
    # fraction of points labelled as noise (-1)
    # ratio = len(labels[labels[:] == -1]) / len(labels)
    # print('noise ratio:', format(ratio, '.2%'))
    plotRes(data, labels, class_num, data_nm)
Example #6
def sp(data, class_num, data_nm, label):
    n_clusters = class_num
    matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    m = euclidean_distances(data, squared=True)
    # print(m)
    sigma = np.median(m)
    plt.figure(figsize=(12, 8), facecolor='w')
    plt.suptitle('Spectral clustering: SC1+' + data_nm, fontsize=20)
    clrs = [
        '#B03060', '#AEEEEE', '#68228B', 'y', 'c', 'm', '#2E2E2E', '#00008B',
        '#2E8B57', '#FAEBD7', '#8B5A00', '#EEEE00', '#0000FF', '#ABABAB',
        '#8B8B00'
    ]
    # print(len(clrs))

    assess = []
    for i, s in enumerate(np.logspace(-2, 0, 6)):

        # Gaussian affinity exp(-d^2 / sigma^2); m already holds squared
        # distances, so it is not squared again
        af = np.exp(-m / (s**2)) + 1e-6
        y_hat = spectral_clustering(af,
                                    n_clusters=n_clusters,
                                    assign_labels='kmeans',
                                    random_state=1)
        # assess.append(y_hat)
        plt.subplot(2, 3, i + 1)
        for k, clr in enumerate(clrs):
            cur = (y_hat == k)
            plt.scatter(data[cur, 0],
                        data[cur, 1],
                        s=40,
                        color=clr,
                        edgecolors='k')
        x1_min, x2_min = np.min(data, axis=0)
        x1_max, x2_max = np.max(data, axis=0)
        x1_min, x1_max = expand(x1_min, x1_max)
        x2_min, x2_max = expand(x2_min, x2_max)
        plt.xlim((x1_min, x1_max))
        plt.ylim((x2_min, x2_max))
        plt.grid(True)
        plt.title(u'sigma = %.2f' % s, fontsize=16)
    # print(y_hat)
    print("标准化互信息      精度      纯度     轮廓系数    兰德系数")
    nmi, acc, purity, Sc, ARI = evaluate.eva(y_hat, label, data)
    print(nmi, acc, purity, Sc, ARI)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.savefig(
        r'.\picture\improved_spectral_clustering\sc1_{0}.png'.format(data_nm))
    plt.close()
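`expand` only pads the axis limits so the points do not sit on the frame; a sketch under that assumption:

def expand(a, b, rate=0.05):
    # widen the interval [a, b] by `rate` on each side (assumed helper)
    pad = (b - a) * rate
    return a - pad, b + pad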
Example #7
def main():
    word_id = create_vocab(args.training_data_path, args.vocab_path, True)
    # label_id = create_vocab(args.training_data_path, args.vocab_tag_path)
    args.class_num = 3
    # train, test = load_data(args.training_data_path, word_id, label_id)
    train1, test1 = load_data1(args.training_data_path, word_id)
    # train1, test1 = load_data_bert(args.training_data_path, word_id)
    TrainX, TrainY = zip(*train1)
    testX, testY = zip(*test1)
    cnn = model.CNN_Text(args).cuda()
    criterion = torch.nn.CrossEntropyLoss()
    # optimizer = torch.optim.SGD(cnn.parameters(), lr=0.001, momentum=0.9)
    opt_Adam = torch.optim.Adam(cnn.parameters(), lr=args.lr, betas=(0.9, 0.99))

    for epoch in range(1, args.epoches):
        print("epoch", epoch)
        batch_iter = batch_helper(TrainX, TrainY, args.batch_size)
        for trainx, trainy in batch_iter:
            # print("trainy length", len(trainy))  # batch_size
            input_data = torch.autograd.Variable(torch.LongTensor(trainx)).cuda()
            output_labels = torch.autograd.Variable(torch.LongTensor(trainy)).cuda()
            output_labels = output_labels.squeeze()
            # print("vocab_size", args.vocab_size)
            cnn_outputs = cnn(input_data)
            opt_Adam.zero_grad()
            loss = criterion(cnn_outputs, output_labels)
            loss.backward()
            opt_Adam.step()
            # for param_tensor in cnn.state_dict():
            #     print(param_tensor, "\t", cnn.state_dict()[param_tensor].size())
            # for var_name in opt_Adam.state_dict():
            #     print(var_name, "\t", opt_Adam.state_dict()[var_name])
            eva(cnn_outputs, output_labels, args.batch_size)
        # checkpoint once per epoch rather than once per batch
        torch.save(cnn.state_dict(), args.parameters_path)
        run_val(testX, testY, cnn)
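`batch_helper` is presumably a generator that walks the training set in mini-batches; a minimal sketch under that assumption:

def batch_helper(xs, ys, batch_size):
    # yield successive (batch_x, batch_y) slices of the training data
    for start in range(0, len(xs), batch_size):
        yield xs[start:start + batch_size], ys[start:start + batch_size]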
Example #8
def kmeans(data, class_num, data_nm, label):
    k = class_num
    clu = random.sample(data.tolist(), k)  # pick random initial centroids
    clu = np.asarray(clu)
    err, clunew, k, clusterRes = Kmeans.classfy(data, clu, k)
    while np.any(abs(err) > 0):
        # print(clunew)
        err, clunew, k, clusterRes = Kmeans.classfy(data, clunew, k)

    clulist = Kmeans.cal_dis(data, clunew, k)
    clusterResult = Kmeans.divide(data, clulist)
    # print(clusterResult)
    # print(label.reshape(-1))
    print("标准化互信息      精度      纯度     轮廓系数    兰德系数")
    nmi, acc, purity, Sc, ARI = evaluate.eva(clusterResult, label, data)
    print(nmi, acc, purity, Sc, ARI)
    Kmeans.plotRes(data, clusterResult, k, data_nm)
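`Kmeans` (classfy, cal_dis, divide, plotRes) is a module of this project. Its core step, `classfy`, evidently runs one Lloyd iteration and reports the centroid shift; a sketch consistent with the call sites, not the project's actual code:

import numpy as np


def classfy(data, clu, k):
    # assign each point to its nearest centroid, recompute the centroids,
    # and return the shift so the caller can test for convergence
    dist = np.linalg.norm(data[:, None, :] - clu[None, :, :], axis=2)
    clusterRes = np.argmin(dist, axis=1)
    clunew = np.array([
        data[clusterRes == j].mean(axis=0) if np.any(clusterRes == j) else clu[j]
        for j in range(k)
    ])
    return clunew - clu, clunew, k, clusterRes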
Example #9
def sp(data, class_num, data_nm, label):

    n_clusters = class_num
    matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.figure(facecolor='w')
    clrs = [
        '#B03060', '#AEEEEE', '#68228B', 'y', 'c', 'm', '#2E2E2E', '#00008B',
        '#2E8B57', '#FAEBD7', '#8B5A00', '#EEEE00', '#0000FF', '#ABABAB',
        '#8B8B00'
    ]
    gamma_list = [0.1, 0.2, 0.4, 0.6, 0.8, 1]
    af = ['laplacian', 'nearest_neighbors']
    Compare = []
    for gamma_value in gamma_list:
        # for a in af:
        # gamma only matters for kernel affinities, so use 'rbf' for the scan
        spectral = SpectralClustering(n_clusters,
                                      gamma=gamma_value,
                                      affinity='rbf',
                                      random_state=1)
        y_hat = spectral.fit_predict(data)
        Compare.append(y_hat)
    # keep the run with the best NMI
    N = []
    A = []
    for com in Compare:
        NMI, ARI = evaluate.eva_com(com, label)
        N.append(NMI)
        A.append(ARI)
    k = Fmax(N)
    y_hat = Compare[k]
    print("NMI      Accuracy      Purity     Silhouette    ARI")
    nmi, acc, purity, Sc, ARI = evaluate.eva(y_hat, label, data)
    print(nmi, acc, purity, Sc, ARI)
    for k, clr in enumerate(clrs):
        cur = (y_hat == k)
        plt.scatter(data[cur, 0], data[cur, 1], s=40, color=clr, edgecolors='k')
    x1_min, x2_min = np.min(data, axis=0)
    x1_max, x2_max = np.max(data, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    # plt.grid(True)
    # plt.legend(loc='best')
    plt.title("SC2+" + data_nm)
    plt.savefig(r'.\picture\improved_spectral_clustering\sc1_{0}.png'.format(data_nm))
    plt.close()
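`evaluate.eva_com` returns only (NMI, ARI) for the model-selection loop; it is presumably a thin wrapper over the corresponding sklearn metrics:

import numpy as np
from sklearn import metrics


def eva_com(pred, label):
    # assumed wrapper: NMI and ARI for one candidate clustering
    label = np.asarray(label).reshape(-1)
    nmi = metrics.normalized_mutual_info_score(label, pred)
    ari = metrics.adjusted_rand_score(label, pred)
    return nmi, ari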
Example #10
        end = time.time()
        print("Dev: " + str(numOfSamples) + ' / ' + str(len(dev)) + " , Current loss : " + str(
            loss / numOfSamples) + ", run time = " + str(end - start))
        start = time.time()
        # print('%s (%d %d%%) %.4f' % (timeSince(start, numOfSamples / (len(train) * 1.0)),
        #                              numOfSamples, numOfSamples / len(train) * 100, loss / numOfSamples))

loss /= numOfSamples
writeResult.close()
with codecs.open('result.txt', 'w', encoding='utf-8') as outfile:
    json.dump(dict, outfile)



print('Dev Loss: ' + str(loss))
evaluate.eva('result.txt', '../data/dev-v1.1.json')



# loss = 0
# numOfSamples = 0
# numOfBatch = 0
# start = time.time()
# print("Start Dev2:")
# dict = {}
# s = ""
# writeResult = codecs.open('prediction2.txt','w',encoding='utf-8')
# for sid in range(0, len(dev2), config.DevBatchSize):
#
#     instances = dev2[sid:sid + config.DevBatchSize]
#     # print(instances[0][10])
Example #11
def plotRes(data, clusterRes, clusterNum):
    nPoints = len(data)
    scatterColors = [
        'black', 'blue', 'green', 'yellow', 'red', 'purple', 'orange', 'brown'
    ]
    for i in range(clusterNum):
        color = scatterColors[i % len(scatterColors)]
        x1 = []
        y1 = []
        for j in range(nPoints):
            if clusterRes[j] == i:
                x1.append(data[j, 0])
                y1.append(data[j, 1])
        plt.scatter(x1, y1, c=color, alpha=1, marker='+')
    plt.show()


if __name__ == '__main__':
    k = 7  # number of clusters
    data = load_data()
    print(data)
    clu = random.sample(data[:, 0:2].tolist(), k)  # pick random initial centroids
    print(clu)
    clu = np.asarray(clu)
    err, clunew, k, clusterRes = classfy(data, clu, k)
    while np.any(abs(err) > 0):
        print(clunew)
        err, clunew, k, clusterRes = classfy(data, clunew, k)

    clulist = cal_dis(data, clunew, k)
    clusterResult = divide(data, clulist)

    nmi, acc, purity = eva.eva(clusterResult, np.asarray(data[:, 2]))
    print(nmi, acc, purity)
    plotRes(data, clusterResult, k)
Example #12
        if clusterRes[pointId] == UNASSIGNED:
            if to_cluster(data, clusterRes, pointId, clusterId, radius,
                          minPts):
                clusterId = clusterId + 1
    return np.asarray(clusterRes), clusterId


def plotRes(data, clusterRes, clusterNum):
    nPoints = len(data)
    scatterColors = [
        'black', 'blue', 'green', 'yellow', 'red', 'purple', 'orange', 'brown'
    ]
    for i in range(clusterNum):
        color = scatterColors[i % len(scatterColors)]
        x1 = []
        y1 = []
        for j in range(nPoints):
            if clusterRes[j] == i:
                x1.append(data[j, 0])
                y1.append(data[j, 1])
        plt.scatter(x1, y1, c=color, alpha=1, marker='+')


if __name__ == '__main__':
    data = load_data()
    cluster = np.asarray(data[:, 2])
    clusterRes, clusterNum = dbscan(data, 0.8, 3)
    plotRes(data, clusterRes, clusterNum)
    nmi, acc, purity = eva.eva(clusterRes, cluster)
    print(nmi, acc, purity)
    plt.show()
Example #13
def plotRes(data, clusterResult, clusterNum):
    nPoints = len(data)
    scatterColors = [
        'black', 'blue', 'green', 'yellow', 'red', 'purple', 'orange'
    ]
    for i in range(clusterNum):
        color = scatterColors[i % len(scatterColors)]
        x1 = []
        y1 = []
        for j in range(nPoints):
            if clusterResult[j] == i:
                x1.append(data[j, 0])
                y1.append(data[j, 1])
        plt.scatter(x1, y1, c=color, alpha=1, marker='+')
    plt.show()


if __name__ == '__main__':
    cluster_num = 2
    KNN_k = 5
    data = load_data()
    data = np.asarray(data)
    W = getW(data, KNN_k)
    D = getD(W)
    L = getL(D, W)
    eigvec = getEigen(L)
    clf = KMeans(n_clusters=cluster_num)
    s = clf.fit(eigvec)
    C = s.labels_
    nmi, acc, purity = eval.eva(C + 1, data[:, 2])
    print(nmi, acc, purity)
    plotRes(data, np.asarray(C), 7)
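This last example assembles spectral clustering by hand: a kNN similarity graph W, its degree matrix D, the graph Laplacian L, and the bottom eigenvectors of L handed to k-means. Sketches of the four helpers under common choices (0/1 kNN weights, unnormalized Laplacian, features in the first two columns of `data`; the default `n_components=2` in getEigen is an assumption):

import numpy as np


def getW(data, K):
    # symmetric 0/1 kNN adjacency over the feature columns
    pts = data[:, 0:2]
    dist = np.linalg.norm(pts[:, None, :] - pts[None, :, :], axis=2)
    n = len(pts)
    W = np.zeros((n, n))
    nearest = np.argsort(dist, axis=1)[:, 1:K + 1]  # skip self at index 0
    for i in range(n):
        W[i, nearest[i]] = 1
    return np.maximum(W, W.T)  # make the graph undirected


def getD(W):
    # diagonal degree matrix
    return np.diag(W.sum(axis=1))


def getL(D, W):
    # unnormalized graph Laplacian L = D - W
    return D - W


def getEigen(L, n_components=2):
    # eigenvectors for the smallest eigenvalues of L (spectral embedding)
    vals, vecs = np.linalg.eigh(L)
    return vecs[:, :n_components]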