def jacards_Spectral_Clustering(jid, confDict, arff):
    data = la(arff)
    # pos_seq = confDict['input']['pos_seq']
    # neg_seq = confDict['input']['neg_seq']
    # pos_name = findName(pos_seq) # fasta file
    # neg_name = findName(neg_seq)
    ### get all features
    features = list(data[1])[:-1]
    # top = int(confDict['RF_gini_filter']['top'])
    # total_ranking_file = jid + "_gini_total_ranking.tsv"
    # total_ranking = open(total_ranking_file,"wb")
    # Y=np.array(data[0]["Class"])
    # print Y
    # sum_pos = np.sum(Y)
    sum_pos = 334
    X = np.array(map(lambda x: list(x), data[0][features].tolist()))
    X = X[1:sum_pos, :]
    n, m = X.shape
    print n, m
    Sim = np.ones((n, n))
    print Sim
    for i in range(n):
        # print i
        for j in range(i + 1, n):
            Sim[i, j] = jacards_score(X[i], X[j], m)
            Sim[j, i] = Sim[i, j]
    print Sim
    Cluster = []
    a = SC(n_clusters=2, affinity="precomputed", assign_labels="discretize")
    Clusters2 = a.fit_predict(Sim)
    a = SC(n_clusters=3, affinity="precomputed", assign_labels="discretize")
    Clusters3 = a.fit_predict(Sim)
    a = SC(n_clusters=4, affinity="precomputed", assign_labels="discretize")
    Clusters4 = a.fit_predict(Sim)
    a = SC(n_clusters=5, affinity="precomputed", assign_labels="discretize")
    Clusters5 = a.fit_predict(Sim)
    Cluster.append(Clusters2)
    Cluster.append(Clusters3)
    Cluster.append(Clusters4)
    Cluster.append(Clusters5)
    sil_score = []
    silhouette_avg2 = silhouette_score(X, Clusters2)
    silhouette_avg3 = silhouette_score(X, Clusters3)
    silhouette_avg4 = silhouette_score(X, Clusters4)
    silhouette_avg5 = silhouette_score(X, Clusters5)
    sil_score.append(silhouette_avg2)
    sil_score.append(silhouette_avg3)
    sil_score.append(silhouette_avg4)
    sil_score.append(silhouette_avg5)
    ind = np.argmax(sil_score)
    print sil_score
    return Cluster[ind]
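jacards_score is not defined in the snippet above. A minimal sketch of what it might look like, assuming it returns the Jaccard similarity of two length-m feature vectors treated as binary indicators (the name and argument order follow the call site; everything else is an assumption):

import numpy as np

def jacards_score(a, b, m):
    # Hypothetical helper matching Sim[i, j] = jacards_score(X[i], X[j], m):
    # treat each of the m features as present (non-zero) or absent and
    # return |intersection| / |union| of the two presence sets.
    a = np.asarray(a[:m]) != 0
    b = np.asarray(b[:m]) != 0
    union = np.logical_or(a, b).sum()
    if union == 0:
        return 1.0  # both vectors all-zero: treat them as identical
    return np.logical_and(a, b).sum() / float(union)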
Example #2
def Run(data, iter_count=1, limit=3):
    Run.counter += 1
    graph = defaultdict(list)
    build_graph(data, graph, iter_count)
    data_matrix = build_data_matrix(graph,
                                    number_of_walks_per_node,
                                    rand=rand,
                                    restart=0)
    model = build_word2vec_model(data_matrix, embedding_size=embedding_size)
    train = model.syn0.astype(np.float)
    index = np.array(model.index2word)
    clf = SC(n_clusters=2)
    output = clf.fit_predict(train)
    index_of_data_0 = index[output == 0]
    index_of_data_1 = index[output == 1]
    data_0 = dict((key, data[key]) for key in index_of_data_0)
    data_1 = dict((key, data[key]) for key in index_of_data_1)
    print len(data_0), len(data_1), iter_count, limit
    if iter_count == limit:
        f = open("file_" + str(Run.counter) + '_0', "w")
        pickle.dump(data_0, f)
        f = open("file_" + str(Run.counter) + '_1', "w")
        pickle.dump(data_1, f)
        return
    elif iter_count > limit:
        return
    else:
        Run(data_0, iter_count + 1, limit)
        Run(data_1, iter_count + 1, limit)
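Run.counter is a function attribute used to number the output files, so it must exist before the first call; build_graph, build_data_matrix, build_word2vec_model, number_of_walks_per_node, rand and embedding_size are assumed to come from the surrounding script. A minimal driver sketch (the initial value and the call are assumptions):

Run.counter = 0  # required: Run.counter += 1 raises AttributeError otherwise
Run(data, iter_count=1, limit=3)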
Example #3
    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        self.sc = SC(
            n_clusters=self.hyperparams["n_clusters"],
            n_init=self.hyperparams["n_init"],
            n_neighbors=self.hyperparams["n_neighbors"],
            affinity=self.hyperparams["affinity"],
            random_state=self.random_seed,
        )
Example #4
def build_model(file_name):
	label,data=get_embeddings(file_name)
	clf=SC(n_clusters=2)
	#temp=np.transpose(data)
	output=clf.fit_predict(data)
#	s=open("color",'w')
#	pickle.dump(output,s)
	plt.scatter(data[:,0],data[:,1],c=output)
	plt.show()
#	print "here"
	return output,np.array(label),data
Example #5
def build_model(file_name):
	label,data=get_embeddings(file_name)
	print data.shape
	clf=SC(n_clusters=2)
	#temp=np.transpose(data)
	output=clf.fit_predict(data)
	s=open("color",'w')
	pickle.dump(output,s)
	s=output
	plt.scatter(data[:,0][s==0],data[:,1][s==0],marker='+',s=45,label='Class 1',c='r')
	plt.scatter(data[:,0][s==1],data[:,1][s==1],marker='o',s=45,label='Class 2',c='b')
	plt.legend(loc='upper left')
	plt.show()
	return output,np.array(label),data
Example #6
def clusterResult(k, file_name, former):

    # load the information from the previously built model
    print("Loading the document-topic matrix")
    result = btm.loadModel(file_name)
    data = np.array(result)

    print("Starting clustering")
    result = np.zeros(6)
    for i in range(30):
        #estimator = kmn.kMeansByFeature(k, data)
        #labels = estimator.labels_
        labels = SC(assign_labels="discretize", gamma=1e-7, n_clusters=k).fit_predict(data)
        result += cr.printResult(k, labels, former)
    print("Clustering finished")
    return result / 30
Example #7
def cluster(X, k):
    # average the results over 30 runs

    result = np.zeros(6)
    for i in range(30):
        # k-means algorithm
        # res = km.kMeansByFeature(k,X)
        # labels = res.labels_
        # spectral clustering
        # assign_labels="discretize": discretize the label assignment (works better here)
        # labels = SC(gamma=1e-7, n_clusters=k).fit_predict(X)
        labels = SC(assign_labels="discretize", gamma=1e-7,
                    n_clusters=k).fit_predict(X)
        #labels = SC( affinity="nearest_neighbors",n_neighbors=10, n_clusters=k).fit_predict(X)
        result += np.array(printResult(k, labels, label))
    result = result / 30
    print("Purity:{}, RI:{}, F1_measure:{}, Entropy:{}, Accuracy:{}, Recall:{}".format(
        result[0], result[1], result[2], result[3], result[4], result[5]))
Example #8
def sc_cluster_and_plot(row_number, row, csv_out = None):
    filename = str(row_number) + "-" + str(row[0]) + "-color" + ".png"
    #matrix = np.matrix(np.array(row[1:])).reshape(28,28)
    #plot(filename, matrix)
    data_entry = row_to_data(row[1:])
    #sc = SC(assign_labels='discretize', n_clusters=3).fit(data_entry)
    sc = SC(assign_labels='discretize', affinity='rbf', n_clusters=3).fit(data_entry)
    #sc = SC(n_clusters=1).fit(data_entry)
    al = sc.labels_
    #
    metrics = get_clusters_metrcs(data_entry, al)
    print filename," ",metrics
    dump(csv_out, row_number, row[0], metrics)
    #plot_color(filename, data_entry, al)
    major_points = get_major_points(data_entry, al)
    mapping = order_clusters(split_to_clusters(data_entry, al))
    plot_color(filename, data_entry, al, major_points, mapping)
    print "Row:",row_number," Digit:",row[0],"Mapping: ",mapping, "Maj:", major_points
Example #9
 def __init__(self,
              n_clusters=8,
              eigen_solver='None',  # string sentinel as written; sklearn itself expects None, 'arpack', 'lobpcg' or 'amg'
              n_components='n_clusters',  # string sentinel as written; sklearn expects an int or None
              random_state=None,
              n_init=10,
              gamma=1.0,
              affinity='rbf',
              n_neighbors=10,
              eigen_tol=0.0,
              assign_labels='kmeans',
              degree=3,
              coef0=1,
              kernel_params=None,
              n_jobs=None):
     self.assign_labels = assign_labels
     self.random_state = random_state
     self.n_init = n_init
     self.n_clusters = n_clusters
     self.coef0 = coef0
     self.n_jobs = n_jobs
     self.eigen_solver = eigen_solver
     self.affinity = affinity
     self.degree = degree
     self.n_neighbors = n_neighbors
     self.eigen_tol = eigen_tol
     self.gamma = gamma
     self.kernel_params = kernel_params
     self.n_components = n_components
     self.model = SC(coef0=self.coef0,
                     eigen_solver=self.eigen_solver,
                     n_components=self.n_components,
                     gamma=self.gamma,
                     eigen_tol=self.eigen_tol,
                     affinity=self.affinity,
                     assign_labels=self.assign_labels,
                     n_init=self.n_init,
                     n_jobs=self.n_jobs,
                     degree=self.degree,
                     kernel_params=self.kernel_params,
                     n_clusters=self.n_clusters,
                     n_neighbors=self.n_neighbors,
                     random_state=self.random_state)
Example #10
def build_model(file_name):
    label, data = get_embeddings(file_name)
    print data.shape
    n_c = 2
    clf = SC(n_clusters=n_c)
    #temp=np.transpose(data)
    output = clf.fit_predict(data)
    s = open("color", 'w')
    m = {0: '+', 1: 'o', 2: '^', 3: 'x'}
    pickle.dump(output, s)
    for x in xrange(n_c):
        plt.scatter(data[:, 0][output == x],
                    data[:, 1][output == x],
                    marker=m[x],
                    s=45,
                    label='Class %s' % x)
    plt.legend(loc='upper left')
    plt.show()
    print "here"
    return output, np.array(label), data
Example #11
def clustering(idTfidf, num_clu, term_num):
    docFeature = idTfidf
    vecTfidf = {}
    for file in idTfidf:
        row = np.zeros(len(idTfidf[file]))
        col = idTfidf[file].keys()
        val = idTfidf[file].values()
        vec = csc_matrix((np.array(val), (np.array(row), np.array(col))), shape=(1, term_num))
        vecTfidf[file] = vec.todense().tolist()[0]
    # print vecTfidf
    features = vecTfidf.values()
    # print features

    selection = 'GM'  # selecting model here!!! Options: AgglomerativeClustering as AC, SpectralClustering as SC, GMM

    if selection == 'AC':
        model = AC(n_clusters=num_clu, affinity='cosine', linkage='average')
    if selection == 'SC':
        model = SC(n_clusters=num_clu, affinity='cosine')
    if selection == 'GMM':
        model = GMM(n_components=num_clu, covariance_type='full')
    if selection == 'GM':
        model = GM(n_components=num_clu)
        model.fit(features)
        res = model.predict(features)
    else:
        res = model.fit_predict(features)

    resDic = {}
    for i in range(len(res)):
        if not resDic.has_key(res[i]):
            resDic[res[i]] = []
            resDic[res[i]].append(int(docFeature.keys()[i]))
        else:
            resDic[res[i]].append(int(docFeature.keys()[i]))
    result = resDic.values()
    # print result
    with open('gt_GMRes.json', 'w') as f:
        f.write(json.dumps(result))

    return result
Example #12
def lda_kmn_result(k, topic, doc, former, iterator=1000):

    # returns the corresponding clustering result
    # build the LDA model and the bag of words
    print("Building the topic model")
    word_list, r_model = ldaa.lda_model(doc, topic, iterator)

    # get the document-topic distribution
    doc_topic = r_model.doc_topic_
    # convert to a plain list for clustering
    doc_topic_list = np.array(doc_topic).tolist()

    result = np.zeros(6)
    for i in range(30):
        #estimator = kmn.kMeansByFeature(topic, doc_topic_list)
        # labels = estimator.labels_
        # assign_labels="discretize",
        labels = SC(assign_labels="discretize", gamma=1e-7,
                    n_clusters=k).fit_predict(doc_topic)
        result += cr.printResult(k, labels, former)
    return result / 30
Example #13
def build_model(file_name):
    label, data = get_embeddings(file_name)
    print data.shape
    n_c = 8
    clf = SC(n_clusters=n_c)
    #temp=np.transpose(data)
    output = clf.fit_predict(data)
    s = open("color", 'w')
    m = {0: '+', 1: 'o', 2: '^', 3: 'x', 4: 'D', 5: '*', 6: '>', 7: 'v'}
    c = ['r', 'b', 'g', 'c', 'm', 'y', 'k', '#eeefff']
    pickle.dump(output, s)
    for x in xrange(n_c):
        plt.scatter(data[:, 0][output == x],
                    data[:, 1][output == x],
                    marker=m[x],
                    s=45,
                    label='Class %s' % x,
                    c=c[x])
    plt.legend(loc='upper left')
    plt.show()
    print "here"
    return output, np.array(label), data
Example #14
def sc_cluster(data):
    sc = SC(assign_labels='discretize', affinity='rbf', n_clusters=3).fit(data)
    al = sc.labels_
    metrics = get_clusters_metrcs(data, al)
    return metrics
Example #15
def supervised(numClu, affinity):
    print 'Building supervised model...'
    model = SC(n_clusters=numClu, affinity='precomputed')
    res = model.fit_predict(affinity)
    return res
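With affinity='precomputed', the argument passed to fit_predict must already be a square, symmetric, non-negative similarity matrix. A minimal usage sketch for the function above (the RBF construction is an assumption, not part of the original):

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

X = np.random.rand(20, 5)             # toy feature matrix
affinity = rbf_kernel(X, gamma=1.0)   # symmetric, non-negative similarity matrix
labels = supervised(3, affinity)      # cluster the 20 samples into 3 groups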
Example #16
def unsupervised_clu(feature, part, model_selection):
    if part:
        if feature == 'graph':
            docFeature = json.loads(
                open('rmMultiPart1WOZeroGraph.json').read())
        if feature == 'doc2vec':
            docFeature = json.loads(open('rmMultiPart1Doc2vec.json').read())
        if feature == 'comb':
            walk = json.loads(open('rmMultiPart1WOZeroGraph.json').read())
            dv = json.loads(open('rmMultiPart1Doc2vec.json').read())
            docFeature = {}
            for doc in walk:
                val = walk[doc] + dv[doc]
                docFeature[doc] = val
        groundTruth = json.loads(open('rmMultiPart1CluInd.json').read())
        num_clu = len(groundTruth)  # number of clusters in each part
    else:
        rmMulti = True  # False #
        if rmMulti:
            if feature == 'graph':
                docFeature = json.loads(
                    open('rmMultiCluDatabaseWOZeroGraph.json').read())
            if feature == 'doc2vec':
                docFeature = json.loads(
                    open('rmMultiCluDatabaseDoc2vec.json').read())
            if feature == 'comb':
                walk = json.loads(
                    open('rmMultiCluDatabaseWOZeroGraph.json').read())
                dv = json.loads(open('rmMultiCluDatabaseDoc2vec.json').read())
                docFeature = {}
                for doc in walk:
                    val = walk[doc] + dv[doc]
                    docFeature[doc] = val
            groundTruth = json.loads(open('rmMultiGroundTruth.json').read())
            num_clu = len(
                groundTruth
            )  # number of clusters after removing documents appearing multi-cluster, #doc = 1274 (3 all 0s for walk)
        else:
            if feature == 'graph':
                docFeature = json.loads(
                    open('cluDatabaseWOZeroGraph.json').read())
            if feature == 'doc2vec':
                docFeature = json.loads(open('cluDatabaseDoc2vec.json').read())
            if feature == 'comb':
                walk = json.loads(open('cluDatabaseWOZeroGraph.json').read())
                dv = json.loads(open('cluDatabaseDoc2vec.json').read())
                docFeature = {}
                for doc in walk:
                    val = walk[doc] + dv[doc]
                    docFeature[doc] = val
            groundTruth = json.loads(open('groundTruth.json').read())
            num_clu = len(
                groundTruth
            )  # number of clusters before removing documents appearing multi-cluster, #doc = 1393 (3 all 0s for walk)

    features = docFeature.values()
    if model_selection == 'AC':
        model = AC(n_clusters=num_clu, affinity='cosine', linkage='average')
    if model_selection == 'SC':
        model = SC(n_clusters=num_clu, affinity='cosine')
    if model_selection == 'GMM':
        model = GMM(n_components=num_clu, covariance_type='full')
    if model_selection == 'KMeans':
        model = KMeans(n_clusters=num_clu)
    if model_selection == 'GM':
        model = GM(n_components=num_clu)
        model.fit(features)
        res = model.predict(features)
    else:
        res = model.fit_predict(features)
    resDic = {}
    for i in range(len(res)):
        if not resDic.has_key(res[i]):
            resDic[res[i]] = []
            resDic[res[i]].append(int(docFeature.keys()[i]))
        else:
            resDic[res[i]].append(int(docFeature.keys()[i]))
    result = resDic.values()

    return (result, groundTruth)
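A typical call, based only on the branches visible above (the JSON files are whatever the surrounding project provides):

result, groundTruth = unsupervised_clu('doc2vec', part=False, model_selection='SC')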
Example #17
for i in xrange(100):
    out = sess.run([opt, loss_, h_1], feed_dict=feed_dict)
    acc = sess.run([acc_, h_2], feed_dict=feed_dict)
    print out[1], acc[0]
    #raw_input()

q = np.argmax(sess.run(predict, feed_dict=feed_dict), 1)
a = sess.run(acc_, feed_dict=feed_dict)

print "\n", q, a, np.argmax(y, 1), "\n"
viz = sess.run(h_1, feed_dict=feed_dict)
#plt.scatter(viz[:,0],viz[:,1],s=100)

from sklearn.cluster import SpectralClustering as SC
clf = SC(n_clusters=2)
output = clf.fit_predict(viz)
plt.scatter(viz[:, 0], viz[:, 1], c=output, s=75)
plt.show()
d1 = data[output == 0]
d2 = data[output == 1]

import matplotlib.image as mpimg
im = mpimg.imread('scene.jpg')
plt.imshow(im)

for row in d1:
    row = np.reshape(row, (15, 2))
    plt.scatter(row[:, 0], row[:, 1], c='r')
plt.show()
Example #18
        quats_arr = np.array(quats_arr)

        # compute similarity matrix
        X = np.zeros((n_samples, n_samples))
        for x in range(n_samples):
            for y in range(n_samples):
                if x == y:
                    X[x, y] = 1
                else:
                    a = quats[x]
                    b = quats[y]
                    X[x, y] = (sqrt_2 -
                               pyq.Quaternion.absolute_distance(a, b)) / sqrt_2

        clustering = SC(n_clusters=n_clust, affinity='precomputed')
        clustering.fit(X)
        samples_labels = clustering.labels_
        print(np.bincount(samples_labels))

        neigh = Nearest(n_neighbors=K, metric=utils.quatmetric)
        neigh.fit(quats_arr)

        pos = []
        labels = []
        labels_cvt = []
        for _ in range(n_test):
            quat = pyq.Quaternion.random()
            test = np.array(quat.elements)
            pos.append(quat)
Example #19
def get_trending_topics(FILE_LOAD,
                        dt,
                        load_file,
                        load_from_file=0,
                        no_of_topics=10,
                        t=50000):
    DataIndex = IndexBox()
    DataIndex.load(FILE_LOAD)

    if load_from_file == 0:
        trendTime = DataIndex.getIndexTime(dt)
        df_idft_scores = {}
        for i in DataIndex.data:
            df_idft_scores[i] = get_df_idft(DataIndex.data[i], t, trendTime)
        sorted_by_score = sorted(df_idft_scores.items(),
                                 key=operator.itemgetter(1),
                                 reverse=True)
        trending_topics = []
        count = 0
        for i in sorted_by_score:
            trending_topics += [i[0].encode('utf-8')]
            count += 1
            if count == 1000:  # 1000 top df-idf ngrams for clustering
                break
        save_to_file = open(load_file, 'w')
        save_to_file.write('$$'.join(trending_topics))
        save_to_file.close()
        return 'saved to file'

    else:
        load_from_file = open(load_file, 'r')
        trending_topics = load_from_file.read().split('$$')
        GraphMatrix = []
        for ng1 in trending_topics:
            row = []
            for ng2 in trending_topics:
                score = 0
                for tw1 in DataIndex.data[ng1.decode('utf-8')]:
                    for tw2 in DataIndex.data[ng2.decode('utf-8')]:
                        if tw1 == tw2:
                            score += 1
                row += [math.log(score + 1, 2)]
            GraphMatrix += [np.array(row)]
        GraphMatrix = np.array(GraphMatrix)
        No_of_clusters = 5
        # SC is called here with the affinity matrix as the first positional
        # argument and iterated as a label array below, which matches the
        # sklearn.cluster.spectral_clustering function rather than the
        # SpectralClustering estimator used in the other examples.
        clusters = SC(GraphMatrix,
                      n_clusters=No_of_clusters,
                      eigen_solver='arpack')

        f_stop = open('stopwords.txt', 'r')
        stopwords = f_stop.read().split('\n')

        Mark = [0] * No_of_clusters
        count = 0
        topics_trending = []
        for i in clusters:
            current_gram = trending_topics[count].decode('utf-8')
            if Mark[i] == 0:
                if '~~' not in current_gram:
                    if current_gram not in stopwords and (
                            not current_gram.isdigit()):
                        topics_trending += [current_gram]
                        Mark[i] = 1
                else:
                    topics_trending += [current_gram]
                    Mark[i] = 1
            count += 1
        return topics_trending
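The function runs in two phases selected by load_from_file: the first pass ranks n-grams by df-idf and saves the top 1000 to load_file, the second pass loads them and clusters. A usage sketch (the file name and dt value are placeholders, not from the original):

get_trending_topics(FILE_LOAD, dt, 'top_ngrams.txt', load_from_file=0)            # phase 1: rank and save
topics = get_trending_topics(FILE_LOAD, dt, 'top_ngrams.txt', load_from_file=1)   # phase 2: cluster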
Example #20
def supervised_clu(feature, rmMulti, trial):
    (part1Pos, part1Neg, part2Pos, part2Neg, part3Pos, part3Neg, part4Pos,
     part4Neg, part5Pos, part5Neg, globalPos,
     globalNeg) = data_selection(feature, rmMulti)
    sumpurity = 0
    sumfone = 0
    for i in range(0, trial):
        print '#', i + 1, 'trial!!!'
        pos_dataset = dic2List(
            globalPos
        )  # dic2List(part1Pos) + dic2List(part2Pos) + dic2List(part3Pos) + dic2List(part4Pos) + dic2List(part5Pos)  #
        neg_dataset = dic2List(
            globalNeg
        )  # dic2List(part1Neg) + dic2List(part2Neg) + dic2List(part3Neg) + dic2List(part4Neg) + dic2List(part5Neg)  #
        # print len(pos_dataset)

        num_pos_sample = int(0.3 * len(pos_dataset))
        num_neg_sample = num_pos_sample

        (posPicked, posNotPicked) = takingSamples(pos_dataset,
                                                  num=num_pos_sample)
        (negPicked, negNotPicked) = takingSamples(neg_dataset,
                                                  num=num_neg_sample)
        # print len(posPicked),len(negPicked)
        # print posPicked, posNotPicked

        # train_X = pd.DataFrame(mat2arr(list2Dic(posPicked).values() + list2Dic(negPicked).values()))
        train_X = pd.DataFrame(
            list2Dic(posPicked).values() + list2Dic(negPicked).values())
        train_y = np.array(
            [1 for i in range(len(list2Dic(posPicked).values()))] +
            [0 for i in range(len(list2Dic(negPicked).values()))])
        print len(train_X), len(train_y)

        reg = RFC(n_estimators=200, max_features='log2')
        model = reg.fit(train_X, train_y)
        # print 'model ready!'

        # print 'get affinity matrix...'
        matrixVal = {}
        for item in posPicked:
            matrixVal[str(item.keys()[0])] = 1
        for item in negPicked:
            matrixVal[str(item.keys()[0])] = 0

        test_X = posNotPicked + negNotPicked
        modelIn = list2Dic(test_X)
        test_Y = model.predict_proba(modelIn.values())[:, 1]
        for i in range(0, len(modelIn)):
            matrixVal[modelIn.keys()[i]] = test_Y[i]

        # print matrixVal.keys()
        # print map(eval,matrixVal.keys())
        # print matrixVal.values()
        # print size
        row = []
        col = []
        docMap = {}
        mapDoc = {}
        size = 0
        for pair in map(eval, matrixVal.keys()):
            for doc in pair:
                if not docMap.has_key(doc):
                    docMap[doc] = size
                    mapDoc[size] = doc
                    size += 1
        # print mapDoc
        # print docMap
        for pair in map(eval, matrixVal.keys()):
            row.append(docMap[pair[0]])
            col.append(docMap[pair[1]])
        for pair in map(eval, matrixVal.keys()):
            row.append(docMap[pair[1]])
            col.append(docMap[pair[0]])
        data = matrixVal.values() + matrixVal.values()
        # print size
        affinity = csc_matrix((data, (row, col)), shape=(size, size)).toarray()
        # print 'affinity matrix get!'

        # print 'run clustering...'
        # groundTruth = json.loads(open('groundTruth.json').read())
        # groundTruth = json.loads(open('rmMultiGroundTruth.json').read()) # some documents appears in one part only once, but multiple time in global
        groundTruth = json.loads(open('rmMultiGroundTruthNew.json').read(
        ))  # rmMultiGroundTruthNew.json is for simply combining all parts only
        # groundTruth = json.loads(open('part1CluInd.json').read())
        # groundTruth = json.loads(open('rmMultiPart5CluInd.json').read())
        num_clu = len(groundTruth)
        # print num_clu
        model = SC(n_clusters=num_clu, affinity='precomputed')
        res = model.fit_predict(affinity)
        # print res
        # print len(res), len(set(res))

        resDic = {}
        for i in range(len(res)):
            if not resDic.has_key(res[i]):
                resDic[res[i]] = []
                resDic[res[i]].append(mapDoc[i])
            else:
                resDic[res[i]].append(mapDoc[i])
        result = resDic.values()

        purVal = purity(result, groundTruth)
        (pre, rec, fone) = fmeasure(result, groundTruth)
        sumpurity += purVal
        sumfone += fone
        print 'purity %.4f' % purVal, 'precision: %.4f' % pre, 'recall: %.4f' % rec, 'f1: %.4f' % fone

    # totals accumulated over all trials
    return (sumpurity, sumfone)
Example #21
sigma2 = 0
for i in range(shape_img0):
    for j in range(i + 1, shape_img0):
        sigma1 += (W[i, j] - mean1) ** 2
for i in range(shape_img0, len(img_list)):
    for j in range(i + 1, len(img_list)):
        sigma2 += (W[i, j] - mean2) ** 2

sigma1 = np.sqrt(sigma1 / (shape_img0 * (shape_img0 - 1) / 2))
sigma2 = np.sqrt(sigma2 / (shape_img1 * (shape_img1 - 1) / 2))
sigma = (sigma1 + sigma2) / 2

W = np.exp(-1 * W / sigma)

# NCut cluster
img_list = np.array(img_list)
cluster = SC(n_clusters=2, affinity='precomputed')
cluster.fit(W)
result = cluster.fit_predict(W)
print(result)

# accuracy compute
accuracy = 0
for i in range(len(result)):
    if i < shape_img0 and result[i] == 0:
        accuracy += 1
    if i >= shape_img0 and result[i] == 1:
        accuracy += 1
accuracy = float(accuracy) / len(result)
print(accuracy)
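Spectral clustering assigns the cluster IDs 0 and 1 arbitrarily, so the accuracy loop above only scores one of the two possible mappings. A small sketch that scores both mappings and keeps the better one (variable names follow the snippet above):

hits = sum(1 for i, lab in enumerate(result)
           if (i < shape_img0) == (lab == 0))
accuracy = max(hits, len(result) - hits) / float(len(result))
print(accuracy)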