示例#1
0
def main(tolerance=0.01):
    """Drive the Gaussian-mixture EM MapReduce jobs until the centroids settle.

    Runs ``MrGMixEmInit`` once on the input file, then repeatedly runs
    ``MrGMixEm`` on the same input. After every run the parameter file at
    ``emPath`` is re-read, and element ``[1]`` of its JSON payload (treated
    here as the list of centroid means) is compared with the previous list
    using ``dist``. The summed distance is printed after each pass, and the
    loop stops as soon as it is no longer greater than ``tolerance``.

    NOTE(review): the jobs are presumed to write ``intermediateResults.txt``
    themselves -- this function only ever reads it. Confirm against the job
    classes.

    Args:
        tolerance: Convergence threshold on the summed centroid movement.
            Defaults to 0.01, the value that was previously hard-coded.
    """
    filePath = '//home//mike-bowles//pyWorkspace//mapReducers//src//mr_GMixEm//inputWide.txt'
    emPath = "//home//mike-bowles//pyWorkspace//mapReducers//src//mr_GMixEm//intermediateResults.txt"

    # First run the initializer to get starting centroids.
    initJob = MrGMixEmInit(args=[filePath])
    with initJob.make_runner() as runner:
        runner.run()

    # Load the starting parameters; `with` guarantees the handle is closed
    # even if reading or parsing fails.
    with open(emPath) as fileIn:
        params = json.load(fileIn)

    # Start above any finite tolerance so at least one EM pass always runs.
    delta = float('inf')
    while delta > tolerance:
        # The parameters parsed on the previous pass are the "old" ones now,
        # so there is no need to parse the same JSON text a second time.
        oldMeans = params[1]

        # Run one EM iteration.
        emJob = MrGMixEm(args=[filePath])
        with emJob.make_runner() as runner:
            runner.run()

        with open(emPath) as fileIn:
            params = json.load(fileIn)
        newMeans = params[1]

        # Pair the means by index (rather than zip) so that a shorter
        # oldMeans still raises IndexError instead of being silently
        # truncated. The 0.0 start keeps delta a float even with no means.
        delta = sum(
            (dist(newMeans[i], oldMeans[i]) for i in range(len(newMeans))),
            0.0)

        # Single-argument parenthesised form prints the same on Python 2 and 3.
        print(delta)
示例#2
0
def main(tolerance=0.01):
    """Run the Canopy + Gaussian-mixture EM pipeline on ``data/input.txt``.

    All files live in the ``data`` directory under the current working
    directory. The steps are:

    1. Run ``MrCanopy`` on the input and write the value parsed from its
       output stream, JSON-encoded, to ``canopylist.txt``.
    2. Run ``MrGMixEmInit`` on the input to get starting centroids.
    3. Repeatedly run ``MrGMixEm`` on the input. After each run
       ``intermediateResults.txt`` is re-read, and element ``[1]`` of its
       JSON payload (treated here as the list of centroid means) is compared
       with the previous list using ``dist``. The summed distance is printed
       each pass, and the loop stops once it is no longer greater than
       ``tolerance``.

    NOTE(review): the EM jobs are presumed to write
    ``intermediateResults.txt`` themselves -- this function only reads it.
    Confirm against the job classes.

    Args:
        tolerance: Convergence threshold on the summed centroid movement.
            Defaults to 0.01, the value that was previously hard-coded.

    Raises:
        RuntimeError: If ``MrCanopy`` produces no output lines, so there is
            nothing to write to the canopy list.
    """
    # Data path parameters.
    dataDir = os.path.join(os.getcwd(), "data")
    inputPath = os.path.join(dataDir, "input.txt")  # dataset to cluster
    emPath = os.path.join(dataDir, "intermediateResults.txt")  # EM parameters
    canopyPath = os.path.join(dataDir, "canopylist.txt")  # canopy centers

    print('Canopy-EM cluster by CanEM Team')

    # Generate canopies.
    print('Generating Canopies...')
    canopies = None
    gotCanopies = False
    canopyJob = MrCanopy(args=[inputPath])
    with canopyJob.make_runner() as runner:
        runner.run()
        for line in runner.stream_output():
            # Only one key is expected, hence only one line; should several
            # arrive, the last one wins, as before.
            _, canopies = canopyJob.parse_output_line(line)
            gotCanopies = True

    # Fail with a clear message instead of an unbound-variable error when
    # the job emitted nothing.
    if not gotCanopies:
        raise RuntimeError(
            'MrCanopy produced no output; cannot write the canopy list')

    # Serialize before opening so a failed dump does not truncate the file.
    canOut = json.dumps(canopies)
    with open(canopyPath, 'w') as fileOut:
        fileOut.write(canOut)

    # Run the EM initializer to get starting centroids.
    print('Initializing...')
    initJob = MrGMixEmInit(args=[inputPath])
    with initJob.make_runner() as runner:
        runner.run()

    # Load the starting parameters; `with` guarantees the handle is closed
    # even if reading or parsing fails.
    with open(emPath) as fileIn:
        params = json.load(fileIn)

    # Begin iteration on change in centroids.
    print('Iterating...')
    # NOTE: a per-iteration MrAssignCentToCan step (assign centroids to
    # canopies) was sketched here in the past but is disabled.

    # Start above any finite tolerance so at least one EM pass always runs.
    delta = float('inf')
    while delta > tolerance:
        # The parameters parsed on the previous pass are the "old" ones now,
        # so there is no need to parse the same JSON text a second time.
        oldMeans = params[1]

        # Run one EM iteration.
        emJob = MrGMixEm(args=[inputPath])
        with emJob.make_runner() as runner:
            runner.run()

        with open(emPath) as fileIn:
            params = json.load(fileIn)
        newMeans = params[1]

        # Pair the means by index (rather than zip) so that a shorter
        # oldMeans still raises IndexError instead of being silently
        # truncated. The 0.0 start keeps delta a float even with no means.
        delta = sum(
            (dist(newMeans[i], oldMeans[i]) for i in range(len(newMeans))),
            0.0)

        # Single-argument parenthesised form prints the same on Python 2 and 3.
        print(delta)