예제 #1
0
파일: clust.py 프로젝트: inachen/CS181
def kmeans(data, numExamples, numClusters):
    """Cluster the first ``numExamples`` rows of ``data`` with k-means.

    Prototypes start as ``numClusters`` distinct, randomly chosen examples.
    Each iteration assigns every example to its nearest prototype (by
    ``utils.squareDistance``), measures the mean squared error, and then
    recomputes each prototype as the responsibility-weighted mean of the
    examples.

    Args:
        data: indexable collection of equal-length numeric vectors.
        numExamples: number of rows of ``data`` to use.
        numClusters: number of prototypes to fit.

    Returns:
        The mean squared error at the iteration where the stopping rule fired.

    Raises:
        ValueError: if ``numClusters`` exceeds ``numExamples`` (raised by
            ``random.sample``).
    """
    # Dimension of one example; index 0 so a single-example dataset works.
    dimension = len(data[0])

    # Distinct random examples become the initial prototypes.  random.sample
    # guarantees no repeats and fails fast instead of looping forever when
    # there are fewer examples than clusters.
    chosen = random.sample(range(numExamples), numClusters)
    prototypes = [data[idx] for idx in chosen]

    errors = []
    counter = 0
    responsibilities = [None] * numExamples

    # Repeat until the error stops decreasing.
    while True:

        # Assign responsibilities and accumulate the error in one pass so
        # each example/prototype distance is computed only once.
        error = 0.0
        for i in range(numExamples):
            distances = [utils.squareDistance(data[i], p) for p in prototypes]
            # makeR presumably builds a one-hot vector of length numClusters
            # for the winning index -- confirm against its definition.
            responsibilities[i] = makeR(numClusters,
                                        distances.index(min(distances)))
            for k in range(numClusters):
                error += responsibilities[i][k] * distances[k]
        # Float divisor: avoids Python 2 integer truncation on integer data.
        error /= float(numExamples)

        # Stop once more than 10 errors have been recorded and the current
        # error is no better than the one recorded nine entries from the end.
        if counter > 10 and error >= errors[counter - 9]:
            return error

        # Otherwise record the error and update the prototypes.
        errors.append(error)
        counter += 1

        for l in range(numClusters):
            topsum = [0] * dimension
            bottomsum = 0
            for m in range(numExamples):
                myrespon = responsibilities[m][l]
                topsum = listadd(topsum, listmult(data[m], myrespon))
                bottomsum += myrespon
            if bottomsum == 0:
                # No example belongs to this cluster: report it and keep the
                # previous prototype rather than dividing by zero.
                print("Cluster number %s is obsolete" % (l,))
                continue
            prototypes[l] = listmult(topsum, 1.0 / bottomsum)
예제 #2
0
파일: clust.py 프로젝트: rlucioni/cs181
def kmeans(data, k):
    """Run k-means on ``data`` with ``k`` clusters and print the results.

    Prototypes start as ``k`` random elements of ``data``.  Iteration stops
    when no prototype coordinate moves by ``threshold`` or more in one
    update.  The mean squared error and the cluster means are printed;
    nothing is returned.

    Args:
        data: sequence of equal-length numeric vectors.
        k: number of clusters.
    """
    # Threshold used to determine when to stop.
    threshold = 0.01

    # For each cluster, set the prototype to a random vector in data.
    prototypes = random.sample(data, k)
    responsibilities = [[0] * k for _ in data]

    while True:
        # Update responsibilities: one-hot vector marking the closest
        # prototype of every example.
        for i, point in enumerate(data):
            distances = [utils.squareDistance(point, p) for p in prototypes]
            closest = distances.index(min(distances))
            responsibilities[i] = [1 if j == closest else 0 for j in range(k)]

        # Update prototypes, tracking the largest coordinate shift seen
        # across ALL prototypes (not just the last one processed).
        largest_shift = 0.0
        for p in range(k):
            members = [data[j] for j in range(len(data))
                       if responsibilities[j][p] == 1]
            if not members:
                # Empty cluster: keep the old prototype instead of dividing
                # by a zero point count.
                continue
            point_count = float(len(members))
            new_mean = [sum(coords) / point_count for coords in zip(*members)]
            shift = max(math.fabs(a - b)
                        for a, b in zip(prototypes[p], new_mean))
            largest_shift = max(largest_shift, shift)
            prototypes[p] = new_mean

        if largest_shift < threshold:
            break

    # Mean squared distance of every example to its assigned prototype.
    sq_err = 0.0
    for i, point in enumerate(data):
        assigned = responsibilities[i].index(1)
        sq_err += utils.squareDistance(point, prototypes[assigned])
    mse = sq_err / len(data)

    print("\nMSE: {}".format(mse))

    print("\n***CLUSTER MEANS***\n")
    for p in range(len(prototypes)):
        print("CLUSTER {}: {}\n".format(p + 1, prototypes[p]))
예제 #3
0
파일: clust.py 프로젝트: acutkosky/cs181
def mean_squared_by_cluster(clusters):
    """Return the summed squared distance to cluster means, per cluster.

    For every cluster, each member's ``utils.squareDistance`` to the cluster
    mean (from ``mean_of_cluster``) is accumulated; the total is divided by
    the number of clusters.

    Args:
        clusters: sequence of clusters, each an iterable of points.

    Returns:
        The accumulated squared distance divided by ``len(clusters)``.

    Raises:
        ZeroDivisionError: if ``clusters`` is empty.
    """
    tot = 0
    for clust in clusters:
        clust_mean = mean_of_cluster(clust)
        for x in clust:
            tot += utils.squareDistance(x, clust_mean)
    # The original divided by len(cluster), a name not defined in this
    # function; the parameter `clusters` is the closest match.
    # NOTE(review): if a per-example mean is wanted, divide by the total
    # number of points instead -- confirm with callers.
    return 1.0 / len(clusters) * tot
def MSEforDirection(directions, pCentroids, pMeasure, distrib, dataset):
    """
        Compute the MSE of several candidate directions in one pass.

        Centroids are computed once per direction with core.centroids, then
        each of the pMeasure random points drawn with utils.f is scored
        against every direction.  A point contributes to direction i only
        when the region returned by utils.findRegion has a registered
        centroid.  Returns an array with one value per direction: the
        accumulated squared distance divided by len(x) and by the number of
        points that contributed.
    """
    nDirections = len(directions)
    directionsMSE = np.zeros(nDirections)
    numberOfPointsUsed = np.zeros(nDirections)
    # one centroid table per direction
    regionsWithCentroids = [core.centroids(direction, pCentroids, distrib)
                            for direction in directions]
    # accumulate the error
    for _ in range(pMeasure):
        # pick a random point x
        x = utils.f(len(directions[0][0]) - 1, distrib, dataset)
        for i, (direction, table) in enumerate(zip(directions,
                                                   regionsWithCentroids)):
            r = utils.findRegion(x, direction)
            # look for the centroid registered for region r, if any
            for j in range(len(table)):
                if np.all(table[j, 0] == r):
                    directionsMSE[i] += utils.squareDistance(x, table[j, 1])
                    numberOfPointsUsed[i] += 1.
                    break
    directionsMSE /= float(len(x))
    directionsMSE /= numberOfPointsUsed
    return directionsMSE
def MSE(hyperplanes, pCentroids, pMeasure, distrib, dataset):
    """
        Return the MSE for the regions delimited by the given hyperplanes.

        pCentroids is passed to core.centroids to build the table of regions
        and their centroids.  pMeasure is the number of random points drawn
        with utils.f.  Points whose region has no registered centroid are
        skipped.  The accumulated squared distance is divided by len(x) and
        by the number of points actually used.
    """
    error = 0.
    numberOfPointsUsed = 0
    regionsWithCentroids = core.centroids(hyperplanes, pCentroids, distrib)
    nRegions = len(regionsWithCentroids)
    for _ in range(pMeasure):
        x = utils.f(len(hyperplanes[0]) - 1, distrib, dataset)
        r = utils.findRegion(x, hyperplanes)
        # index of the first table row whose region equals r, else None
        match = next((j for j in range(nRegions)
                      if np.all(regionsWithCentroids[j, 0] == r)), None)
        if match is None:
            continue
        error += utils.squareDistance(x, regionsWithCentroids[match, 1])
        numberOfPointsUsed += 1
    error /= float(len(x))
    error /= float(numberOfPointsUsed)
    return error
예제 #6
0
파일: clust.py 프로젝트: acutkosky/cs181
def k_means(xs, numExamples, numClusters):
    """Run k-means on ``xs`` and print the resulting mean squared error.

    Means start as ``numClusters`` random members of ``xs``.  Membership is
    kept in ``rlist``, where ``rlist[n][k]`` is 1 if example ``n`` belongs
    to cluster ``k`` and 0 otherwise.  The loop alternates between
    re-estimating the means (``avg``) and re-assigning examples, and stops
    when a full pass re-assigns no example.

    Args:
        xs: sequence of examples.
        numExamples: number of examples of ``xs`` to cluster.
        numClusters: number of clusters.
    """
    # Initialise the means as randomly chosen members of the input.
    means = random.sample(xs, numClusters)

    def closest_mean(n):
        # Index of the mean with the smallest squared distance to xs[n];
        # reads `means` at call time, so it always sees the latest update.
        return utils.argmin_index(
            means, lambda y: utils.squareDistance(xs[n], y))

    # Initialise the indicator matrix from the initial means.
    rlist = []
    for n in range(numExamples):
        row = zero_list(numClusters)
        row[closest_mean(n)] = 1
        rlist.append(row)

    converged = False
    while not converged:
        # Re-estimate the means BEFORE re-assigning.  The original
        # re-assigned first, against the same means used for the initial
        # assignment, so nothing ever changed on the first pass and the
        # loop always stopped after a single mean update.
        for k in range(numClusters):
            means[k] = avg(rlist, xs, numExamples, k)

        converged = True
        for n in range(numExamples):
            k_old = numpy.argmax(rlist[n])
            k = closest_mean(n)
            rlist[n] = zero_list(numClusters)
            rlist[n][k] = 1
            # As long as one datapoint changes clusters, we have not
            # converged.
            if k_old != k:
                converged = False
        # Sanity check on the membership matrix.
        assert checkMembership(rlist)

    print("---------")
    # Two spaces reproduce the original `print "mean_squared is ", value`.
    print("mean_squared is  %s" % (mean_squared(means, rlist, xs),))
예제 #7
0
def kMeans(data, numClusters):
    """Cluster ``data`` into ``numClusters`` groups with k-means.

    Prototypes start as randomly drawn examples (repeats are possible).
    Iteration stops when a full pass re-assigns no example.  A cluster that
    ends up with no members has its prototype replaced by the string
    ``'Empty'`` and is ignored in later assignments.

    Args:
        data: sequence of equal-length numeric vectors.
        numClusters: number of prototypes.

    Returns:
        A tuple ``(muList, ave_msd)``: the prototype list (entries are
        vectors or ``'Empty'``) and the average squared distance of each
        example to its nearest prototype, measured during the last
        assignment pass.
    """
    # randrange excludes len(data); the original randint(0, len(data)) is
    # inclusive and could index one past the end.
    muList = [data[random.randrange(len(data))] for _ in range(numClusters)]
    r_vectors = [[0] * numClusters for _ in data]

    # Convergence when no examples are reassigned.
    somethingChanged = True
    while somethingChanged:
        somethingChanged = False
        # Accumulates the squared distances; averaged after the pass.
        ave_msd = 0
        for i in range(len(data)):
            # Pair each distance with its real index in muList.  The
            # original skipped 'Empty' slots and then used the position in
            # the shortened list, so later clusters were shifted.
            candidates = [(utils.squareDistance(data[i], muVec), idx)
                          for idx, muVec in enumerate(muList)
                          if not isinstance(muVec, str)]
            # min with a key returns the first minimum, like list.index did.
            minDist, k = min(candidates, key=lambda pair: pair[0])
            ave_msd += minDist
            if r_vectors[i][k] != 1:
                # The example has been reassigned to a different cluster.
                somethingChanged = True
            r_vectors[i] = [1 if j == k else 0 for j in range(numClusters)]
        # Float divisor avoids Python 2 integer truncation.
        ave_msd = ave_msd / float(len(data))

        # Recenter the prototype vectors on the mean of their members.
        for k in range(len(muList)):
            members = [data[i] for i in range(len(data))
                       if r_vectors[i][k] == 1]
            if members:
                count = float(len(members))
                muList[k] = [sum(column) / count for column in zip(*members)]
            else:
                muList[k] = 'Empty'

    return muList, ave_msd
예제 #8
0
def MSE(regions, germs, pMeasure, distrib, dataset):
    '''
        Estimate the mean squared error from pMeasure random points.

        Each point is drawn with utils.f, mapped to a region with
        findRegion, and compared with that region's germ.  The summed
        squared distance is divided by the number of dimensions
        (len(regions[0, 0]) - 1) and by pMeasure.
    '''
    nDimensions = len(regions[0, 0]) - 1
    total = 0.
    for _ in range(pMeasure):
        sample = utils.f(nDimensions, distrib, dataset)
        region = findRegion(sample, regions)
        total += utils.squareDistance(sample, germs[region])
    return total / float(nDimensions) / float(pMeasure)
예제 #9
0
파일: clust.py 프로젝트: acutkosky/cs181
def mean_squared(means, rlist, xs):
    """Return the membership-weighted mean squared distance to the means.

    Args:
        means: sequence of cluster means.
        rlist: membership matrix; ``rlist[n][k]`` weights the squared
            distance between ``xs[n]`` and ``means[k]``.
        xs: sequence of examples.

    Returns:
        The weighted sum of ``utils.squareDistance`` values divided by
        ``len(xs)``, as a float.

    Raises:
        ZeroDivisionError: if ``xs`` is empty.
    """
    # Float accumulator so the final division never truncates under
    # Python 2 when the weights and distances are integers.
    tot = 0.0
    for n, x in enumerate(xs):
        for k, mean in enumerate(means):
            tot += rlist[n][k] * utils.squareDistance(x, mean)
    return tot / len(xs)
예제 #10
0
파일: clust.py 프로젝트: acutkosky/cs181
def Distance(xs, ys):
    """Return the square root of ``utils.squareDistance(xs, ys)``."""
    squared = utils.squareDistance(xs, ys)
    return math.sqrt(squared)