Exemplo n.º 1
0
def kMeans(data, nrOFCluster, nrOfGivenIterations):

    centroids = findCentroids(nrOFCluster, data)
    listD = [[0 for i in range(0, nrOFCluster)], [0 for i in range(0, nrOFCluster)]]
    loopCounter = 0
    givenError = 0.5

    while loopCounter < nrOfGivenIterations:
        assignment = assignToCluster(nrOFCluster, data, centroids)

        # seeking for empty centroids
        empty = []
        for e in range(0, nrOFCluster):
            if assignment.count(e) == 0:
                empty.append(e)
        # print "empty ", empty, "\n\n"

        #collecting empty centroids
        emptyCentroid = Dataset([])
        emptyListD = [[], []]
        for q in range(0, empty.__len__()):
            emptyCentroid.getListOfVectors().append(centroids.getListOfVectors()[empty[q]])
            emptyListD[0].append(listD[0][empty[q]])
            emptyListD[1].append(listD[1][empty[q]])

        #calculation
        D = meanQuantizationError(nrOFCluster, data, assignment, centroids)
        centroids = calculateNewCentroids(nrOFCluster, data, assignment, centroids)
        numberOfExistingCluster = []

        if (loopCounter == 0):
            listD[0] = D
        else:
            if loopCounter == 1:
                listD[1] = D
            else:
                listD[0] = listD[1]
                listD[1] = D

            # check if all cluster have at least one Vector assigned
            # otherwise del from cluster, decrement nrOFCluster
            centroidsTmp = Dataset([])
            listDTmp= [[],[]]

            for z in range(0, centroids.getListOfVectors().__len__()):
                ctr = 0
                for q in range(0, centroids.getListOfVectors()[0].__len__()):
                    if centroids.getListOfVectors()[z][q] != 0:
                        ctr += 1
                if ctr == 0:
                    centroidsTmp.getListOfVectors().append(centroids.getListOfVectors()[z])
                    listDTmp[0].append(listD[0][z])
                    listDTmp[1].append(listD[1][z])
                else:
                    numberOfExistingCluster.append(z)

            #remove empty data from processing
            for q in range(0,centroidsTmp.getListOfVectors().__len__()):
                centroids.getListOfVectors().remove(centroidsTmp.getListOfVectors()[q])
                listD[0].remove(listDTmp[0][q])
                listD[1].remove(listDTmp[1][q])
                nrOFCluster -= 1

            #check if given error tha calculated
            QoS = 0
            for q in range(0, numberOfExistingCluster.__len__()):
                if countDiff(listD[0][q], listD[1][q]) < givenError:
                    QoS += 1


            if QoS == centroids.getListOfVectors().__len__():
                print "\n!!!!!!!!!!KONIEC!!!!!!!!!!"
                break

            #restore empty centroids
            centroids.getListOfVectors().extend(emptyCentroid.getListOfVectors())
            nrOFCluster += emptyCentroid.getListOfVectors().__len__()
            listD[1].extend(emptyListD[0])
            listD[0].extend(emptyListD[1])

            print "@@ centr= " , centroids.getListOfVectors()

        print "\n*******END OF LOOP " ,loopCounter, "\n"
        loopCounter += 1

    for i in range(len(assignment)):
        print assignment[i], getAllFiles('../input')[i]
Exemplo n.º 2
0
            if QoS == centroids.getListOfVectors().__len__():
                print "\n!!!!!!!!!!KONIEC!!!!!!!!!!"
                break

            #restore empty centroids
            centroids.getListOfVectors().extend(emptyCentroid.getListOfVectors())
            nrOFCluster += emptyCentroid.getListOfVectors().__len__()
            listD[1].extend(emptyListD[0])
            listD[0].extend(emptyListD[1])

            print "@@ centr= " , centroids.getListOfVectors()

        print "\n*******END OF LOOP " ,loopCounter, "\n"
        loopCounter += 1

    for i in range(len(assignment)):
        print assignment[i], getAllFiles('../input')[i]



nrOFCluster = 3
nrOfGivenIterations = 3

data = Dataset(prepareInput('../input', 10, '/home/kchrusci/Workspace/repo/k-means/projektpython/ForbiddenWords.txt'))
print "data = ", data.getListOfVectors()

kMeans(data,nrOFCluster,nrOfGivenIterations)
print(getAllFiles('/home/koper/PycharmProjects/First/Samples'))