示例#1
0
def test_kmeans_when_k_is_2(dataset, expected1, expected2):
    """Check that k-means with k=2 reproduces the two expected clusters.

    k-means is run once and then re-run 10 more times; the clustering with
    the lowest ``kmeans.cost_function`` value is the one that gets checked.
    (Presumably ``kmeans.k_means`` is randomly initialised, which is why a
    single run is not trusted -- confirm against its implementation.)

    Args:
        dataset: dataset file handed to ``kmeans.k_means`` and
            ``kmeans.get_list_from_dataset_file``.
        expected1: file holding the expected contents of the first cluster.
        expected2: file holding the expected contents of the second cluster.
    """
    expected_clustering1 = kmeans.get_list_from_dataset_file(expected1)
    expected_clustering2 = kmeans.get_list_from_dataset_file(expected2)
    clustering = kmeans.k_means(dataset_file=dataset, k=2)
    cost = kmeans.cost_function(clustering)

    for _ in range(10):
        new_clustering = kmeans.k_means(dataset_file=dataset, k=2)
        # Score the candidate, not the current best: scoring `clustering`
        # again would make new_cost == cost and no candidate could ever win.
        new_cost = kmeans.cost_function(new_clustering)
        if new_cost < cost:
            clustering = new_clustering
            cost = new_cost

    assert len(clustering.keys()) == 2
    assert clustered_all_points(clustering, kmeans.get_list_from_dataset_file(dataset)) is True
    # Cluster contents in the clustering's own iteration order.
    clustered = [clustering[assignment] for assignment in clustering]
    assert clustered == [expected_clustering1, expected_clustering2]
示例#2
0
def test_kmeans_when_k_is_1(dataset):
    """With k=1 the single cluster must contain the whole dataset.

    Args:
        dataset: dataset file handed to ``kmeans.k_means`` and
            ``kmeans.get_list_from_dataset_file``.
    """
    whole_dataset = kmeans.get_list_from_dataset_file(dataset)
    outcome = kmeans.k_means(dataset_file=dataset, k=1)

    assert len(outcome.keys()) == 1
    # The dataset is re-read here so the helper gets its own fresh list.
    assert clustered_all_points(outcome, kmeans.get_list_from_dataset_file(dataset)) is True

    # Cluster contents in the clustering's own iteration order.
    groups = [outcome[label] for label in outcome]
    assert groups == [whole_dataset]
示例#3
0
    def kmeans(self, args):
        """Run the kmeans command.

        Clusters ``args.dataset`` into ``args.k`` groups, keeps the cheapest
        clustering out of the initial run plus 100 re-runs (as measured by
        ``kmeans.cost_function``), then writes every cluster to its own CSV
        file named ``<dataset without extension>_k_is_<k>_<assignment>.csv``
        and prints it.

        Args:
            args: parsed command arguments; the ``dataset`` and ``k``
                attributes are read.
        """
        import csv
        import os

        k = int(args.k)
        clustering = kmeans.k_means(dataset_file=args.dataset, k=k)
        cost = kmeans.cost_function(clustering)

        for _ in range(100):
            new_clustering = kmeans.k_means(dataset_file=args.dataset, k=k)
            # Score the candidate, not the current best: scoring `clustering`
            # again would make new_cost == cost and the restarts pointless.
            new_cost = kmeans.cost_function(new_clustering)
            if new_cost < cost:
                clustering = new_clustering
                cost = new_cost

        # Strip only the final extension so that dots elsewhere in the path
        # (e.g. "./data/set.csv") do not truncate the output file name.
        stem = os.path.splitext(str(args.dataset))[0]
        for assignment in clustering.keys():
            file_name = stem + "_k_is_" + str(args.k) + "_" + str(assignment) + ".csv"
            # newline="" lets the csv module control line endings itself.
            with open(file_name, "w", newline="") as f:
                writer = csv.writer(f)
                print("assignement ", assignment, " is: ", clustering[assignment])
                writer.writerows(clustering[assignment])
示例#4
0
def test_kmeans_when_k_is_3(dataset, expected1, expected2, expected3):
    """Check that k-means with k=3 reproduces the three expected clusters.

    k-means is run once and then re-run 3000 more times; the clustering with
    the lowest ``kmeans.cost_function`` value is the one that gets checked.
    (Presumably ``kmeans.k_means`` is randomly initialised, which is why a
    single run is not trusted -- confirm against its implementation.)

    Args:
        dataset: dataset file handed to ``kmeans.k_means`` and
            ``kmeans.get_list_from_dataset_file``.
        expected1: file holding the expected contents of the first cluster.
        expected2: file holding the expected contents of the second cluster.
        expected3: file holding the expected contents of the third cluster.
    """
    expected_clustering1 = kmeans.get_list_from_dataset_file(expected1)
    expected_clustering2 = kmeans.get_list_from_dataset_file(expected2)
    expected_clustering3 = kmeans.get_list_from_dataset_file(expected3)
    clustering = kmeans.k_means(dataset_file=dataset, k=3)
    cost = kmeans.cost_function(clustering)

    for _ in range(3000):
        new_clustering = kmeans.k_means(dataset_file=dataset, k=3)
        # Score the candidate, not the current best: scoring `clustering`
        # again would make new_cost == cost and no candidate could ever win.
        new_cost = kmeans.cost_function(new_clustering)
        if new_cost < cost:
            clustering = new_clustering
            cost = new_cost

    assert len(clustering.keys()) == 3
    assert clustered_all_points(
        clustering, kmeans.get_list_from_dataset_file(dataset)) is True

    # Cluster contents in the clustering's own iteration order.
    clustered = [clustering[assignment] for assignment in clustering]
    assert clustered == [
        expected_clustering1, expected_clustering2, expected_clustering3
    ]


# a = "/Users/AaronLee/clustering/tests/test_files/dataset_1.csv"
# b = "/Users/AaronLee/clustering/tests/test_files/dataset_1_k_is_3_0.csv"
# c = "/Users/AaronLee/clustering/tests/test_files/dataset_1_k_is_3_1.csv"
# d = "/Users/AaronLee/clustering/tests/test_files/dataset_1_k_is_3_2.csv"
# x = test_kmeans_when_k_is_3(a,b,c,d)
#
# a1 = "/Users/AaronLee/clustering/tests/test_files/dataset_1.csv"
# b1 = "/Users/AaronLee/clustering/tests/test_files/dataset_1_k_is_2_0.csv"
# c1 = "/Users/AaronLee/clustering/tests/test_files/dataset_1_k_is_2_1.csv"
# x1 = test_kmeans_when_k_is_2(a1,b1,c1)
#
# a2 = "/Users/AaronLee/clustering/tests/test_files/dataset_1.csv"
# x2 = test_kmeans_when_k_is_1(a2)
示例#5
0
def generateSeparator(length):
    """Return a rule made of ``length`` dashes, terminated by a newline.

    A ``length`` of zero or less yields just the newline.
    """
    # String repetition replaces the character-by-character += loop.
    return "-" * length + "\n"


# Timestamp taken before the report is built (current_milli_time is defined
# outside this fragment).
startTime = current_milli_time()
# The report text starts from `head` (defined elsewhere) and grows by appending.
result = head
# One report section per centroid count 1..5 for the graph dataset.
for i in range(1, 6, 1):
    result += generateSeparator(120)
    dataset = "inputs/graph.txt"
    result += "Dataset: " + dataset + "\t\t count centroids: " + str(i) + "\n"
    centroids = k_means(dataset, i)
    # Sorted in place so the listing does not depend on the order k_means
    # returned the centroids in.
    centroids.sort()
    # One line per centroid, every coordinate followed by a tab.
    for centroid in centroids:
        for coordinate in centroid:
            result += str(coordinate) + "\t"
        result += "\n"
    # Blank line closes the section.
    result += "\n"

for i in range(1, 6, 1):
    result += generateSeparator(120)
    dataset = "inputs/iris.data.txt"
    result += "Dataset: " + dataset + "\t\t count centroids: " + str(i) + "\n"
    centroids = k_means(dataset, i)
    centroids.sort()
    for centroid in centroids:
        for coordinate in centroid:
示例#6
0
# Cluster counts to evaluate. Only 3 is active; the wider range up to
# numOfModels+1 is currently disabled.
NC = list(range(3, 4))
# For every cluster count: the list of per-cluster index arrays found below.
clusteringResult = {numOfClusters: [] for numOfClusters in NC}
accuracies = np.zeros((numOfEPS, len(NC)))

with open("clustering_result.txt", "w") as fp:
    for numOfClusters in NC:
        print("Clustering: {} clusters".format(numOfClusters))

        # Disabled alternative based on scikit-learn, kept for reference:
        #    kmeans = KMeans(n_clusters=numOfClusters, random_state=0).fit(predVec)
        #    for c in range(numOfClusters):
        #        clusteringResult[numOfClusters].append(np.where(kmeans.labels_ == c)[0])
        #        print(np.where(kmeans.labels_ == c)[0])

        # Split predVec into numOfClusters groups with the project's own k_means.
        assignments = k_means(predVec, numOfClusters, "L2", "ZerosFarAway")

        fp.write("## number of clusters: " + str(numOfClusters) + "\n")
        for c in range(numOfClusters):
            # Positions whose assignment equals cluster id c.
            cluster = np.where(assignments == c)[0]
            print(cluster)
            clusteringResult[numOfClusters].append(cluster)
            fp.write("\t" + str(cluster) + "\n")
        # Blank line separates the blocks of different cluster counts.
        fp.write("\n")


def vote1(participants):
    '''
        Input:
            participants: a list of opinions. Each element in the list is a numpy array, N X 2.
                            N is the number of events. The second dimension contains (opinion/label, confidence)
        Output:
示例#7
0
def current_milli_time():
    """Return the current time as whole milliseconds since the epoch (int)."""
    return int(round(time.time() * 1000))

def generateSeparator(length):
    """Return a rule made of ``length`` dashes, terminated by a newline.

    A ``length`` of zero or less yields just the newline.
    """
    # String repetition replaces the character-by-character += loop.
    return "-" * length + "\n"

# Timestamp in milliseconds taken before the report is built.
startTime = current_milli_time()
# The report text starts from `head` (defined elsewhere) and grows by appending.
result = head
# One report section per centroid count 1..5 for the graph dataset.
for i in range(1, 6, 1):
    result += generateSeparator(120)
    dataset = "inputs/graph.txt"
    result += "Dataset: " + dataset + "\t\t count centroids: " + str(i) + "\n"
    centroids = k_means(dataset, i)
    # Sorted in place so the listing does not depend on the order k_means
    # returned the centroids in.
    centroids.sort()
    # One line per centroid, every coordinate followed by a tab.
    for centroid in centroids:
        for coordinate in centroid:
            result += str(coordinate) + "\t"
        result+="\n"
    # Blank line closes the section.
    result += "\n"

for i in range(1, 6, 1):
    result += generateSeparator(120)
    dataset = "inputs/iris.data.txt"
    result += "Dataset: " + dataset + "\t\t count centroids: " + str(i) + "\n"
    centroids = k_means(dataset, i)
    centroids.sort()
    for centroid in centroids:
        for coordinate in centroid: