Python get_jaccard 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: validation

메소드/함수: get_jaccard

hotexamples.com에서의 예제들: 1

Python get_jaccard - 예제 1개를 찾았습니다. 오픈소스 프로젝트에서 추출한 Python validation.get_jaccard의 실제 사용 예제 가운데 평가가 가장 높은 예제입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

파일: kmeans.py 프로젝트: prakashn27/Data-Mining-Projects

def _assign_genes_to_centroids(expr_value, centroids):
    """Group every gene under the index of its closest centroid.

    Args:
        expr_value: mapping from gene id (1..len(expr_value)) to that gene's
            expression vector.
        centroids: current centroids, addressable by index
            0..len(centroids) - 1.

    Returns:
        A ``(clusters, cluster_and_gene_id)`` pair of dicts keyed by centroid
        index: the first holds the member expression vectors, the second the
        member gene ids. A centroid that attracts no gene gets no key.
    """
    clusters = {}
    cluster_and_gene_id = {}
    for cur_gene in range(1, len(expr_value) + 1):  # gene ids start at 1
        point = expr_value[cur_gene]
        min_dist = sys.float_info.max
        nearest_centroid = 0
        # Strict "<" keeps the lowest-indexed centroid when distances tie.
        for cur_centroid in range(len(centroids)):
            dist = get_euclidean_distance(centroids[cur_centroid], point)
            if dist < min_dist:
                min_dist = dist
                nearest_centroid = cur_centroid
        cluster_and_gene_id.setdefault(nearest_centroid, []).append(cur_gene)
        clusters.setdefault(nearest_centroid, []).append(point)
    return clusters, cluster_and_gene_id


def _co_membership_matrix(groups, size):
    """Return a size x size 0/1 matrix with 1 where two ids share a group.

    Members are read by position because the container type of the
    ground-truth groups is produced by code outside this function.
    """
    matrix = [[0] * size for _ in range(size)]
    for members in groups:
        member_count = len(members)
        for i in range(member_count):
            for j in range(member_count):
                matrix[members[i]][members[j]] = 1
    return matrix


def run():
    """Cluster the input genes with k-means, then validate and plot the result.

    The function:
      1. reads the input file name and cluster count from ``get_options()``
         and loads the data with ``get_points()``;
      2. seeds the centroids with genes 1..k and repeats assign/recompute
         until no centroid changes;
      3. prints the iteration count and the size of every cluster;
      4. writes one line per gene to ``result.txt``;
      5. prints the Jaccard coefficient, correlation and Rand index computed
         by the ``v`` validation module;
      6. shows a scatter plot of the first two PCA components of the input,
         coloured by the values written to ``result.txt``.

    Returns:
        None. All results are printed, written to ``result.txt`` or plotted.
    """
    options = get_options()
    fname = options.input
    no_of_clusters = options.no_of_clusters
    expr_value, cluster_no, map_with_true_values, map_of_gene_id = get_points(fname)

    # Seed the centroids with the expression vectors of genes 1..k.
    old_centroids = [
        expr_value.get(gene_id) for gene_id in range(1, no_of_clusters + 1)
    ]

    # Each pass through the loop is exactly one k-means iteration.
    count = 0
    while True:
        count += 1
        clusters, cluster_and_gene_id = _assign_genes_to_centroids(
            expr_value, old_centroids)
        new_centroids = get_new_centroids(clusters)
        converged = True
        for i in range(len(new_centroids)):
            if new_centroids[i] != old_centroids[i]:
                converged = False
                break
        if converged:
            break
        old_centroids = new_centroids

    print("How many Iterations is it taking to compute the cluster: %s" % count)
    for cluster_index in cluster_and_gene_id:
        print("%s th cluster with  %s  Points"
              % (cluster_index + 1, len(cluster_and_gene_id[cluster_index])))
    print("clusters are done")

    output = [cluster_and_gene_id[key] for key in cluster_and_gene_id]

    # Map every gene id to its 1-based cluster number.
    result_of_genes = {}
    for entry in cluster_and_gene_id:
        for gene_id in cluster_and_gene_id[entry]:
            result_of_genes[gene_id] = entry + 1

    # One line per gene: the assigned cluster when it equals the value stored
    # in map_of_gene_id (presumably the ground-truth label -- verify against
    # get_points), otherwise -1.
    lines = []
    for gene_id in map_of_gene_id:
        assigned = result_of_genes[gene_id]
        if map_of_gene_id[gene_id] == assigned:
            lines.append(str(assigned) + "\n")
        else:
            lines.append("-1\n")
    with open('result.txt', 'w') as f:
        f.write("".join(lines))  # python will convert \n to os.linesep

    # Validation
    # ====================================
    # Row/column 0 is unused padding so gene ids can index the matrices
    # directly.
    size = len(expr_value) + 1
    our_truth = _co_membership_matrix(output, size)
    ground_truth = _co_membership_matrix(
        [map_with_true_values[entry] for entry in map_with_true_values], size)

    # Pairwise distance matrix; the diagonal stays at its initial 0.0.
    distance_matrix = [[0.0] * size for _ in range(size)]
    for i in range(1, size):
        for j in range(1, size):
            if i != j:
                distance_matrix[i][j] = v.find_distance(
                    expr_value[i], expr_value[j])

    jac = v.get_jaccard(ground_truth, our_truth)
    cor = v.get_corrlation(distance_matrix, our_truth)
    ran = v.get_rand(ground_truth, our_truth)
    print("Jaccard Coefficient is  %s" % (jac,))
    print("Correlation is  %s" % (cor,))
    print("Rand Index is  %s" % (ran,))

    # Plot the same file that was clustered so the points line up with the
    # labels in result.txt. The first two columns are dropped (presumably the
    # gene id and the ground-truth label -- confirm against get_points).
    X = np.loadtxt(fname)[:, 2:]
    res = np.loadtxt('result.txt')
    # NOTE(review): mlab.PCA was removed in Matplotlib 3.1, so this step
    # needs an older Matplotlib.
    mlab_pca = mlab.PCA(X)
    plt.scatter(mlab_pca.Y[:, 0], mlab_pca.Y[:, 1], c=res)
    plt.show()