def main():
    """Drive the Gaussian-mixture EM MapReduce jobs until the means settle.

    Runs ``MrGMixEmInit`` once on the input file, then repeatedly runs
    ``MrGMixEm`` on the same input.  After every run the parameter file
    ``intermediateResults.txt`` is re-read and element ``[1]`` of its JSON
    content (the means) is compared with the previous value.  Iteration
    stops once the summed ``dist`` between old and new means is at most
    0.01.  The summed distance is printed after every iteration.

    Presumably the two jobs themselves write ``intermediateResults.txt``;
    this driver only reads it -- verify against the job classes.
    """
    filePath = '//home//mike-bowles//pyWorkspace//mapReducers//src//mr_GMixEm//inputWide.txt'
    emPath = "//home//mike-bowles//pyWorkspace//mapReducers//src//mr_GMixEm//intermediateResults.txt"

    # first run the initializer to get starting centroids
    mrJob = MrGMixEmInit(args=[filePath])
    with mrJob.make_runner() as runner:
        runner.run()

    # pull out the centroid values to compare with values after one
    # iteration; `with` closes the file even if the read or parse fails
    with open(emPath) as fileIn:
        oldMeans = json.loads(fileIn.read())[1]

    # any starting value above the 0.01 threshold forces a first pass
    delta = 10

    # iterate on the change in centroids
    while delta > 0.01:
        # run one iteration
        mrJob2 = MrGMixEm(args=[filePath])
        with mrJob2.make_runner() as runner:
            runner.run()

        # compare new centroids to old ones
        with open(emPath) as fileIn:
            newMeans = json.loads(fileIn.read())[1]

        # indexing (rather than zip) keeps a length mismatch loud:
        # a shorter oldMeans raises IndexError instead of being truncated
        delta = sum(
            (dist(newMeans[i], oldMeans[i]) for i in range(len(newMeans))),
            0.0,
        )

        # NOTE(review): reported once per iteration -- confirm this matches
        # the intended placement of the progress print
        print(delta)

        # the means just read are the baseline for the next pass
        oldMeans = newMeans
def main():
    """Drive the Canopy-EM clustering pipeline on ``<cwd>/data/input.txt``.

    Steps, in order:

    1. Run ``MrCanopy`` on the input and write the value of its output
       line as JSON to ``canopylist.txt``.
    2. Run ``MrGMixEmInit`` on the input, then read the parameter file
       ``intermediateResults.txt``.
    3. Repeatedly run ``MrGMixEm`` on the input, re-reading the parameter
       file after each run, until the summed ``dist`` between the old and
       new element ``[1]`` of the parameters (the means) is at most 0.01.
       The summed distance is printed after every iteration.

    Presumably the EM jobs themselves write ``intermediateResults.txt``;
    this driver only reads it -- verify against the job classes.

    Raises:
        RuntimeError: if the canopy job produces no output line.
    """
    # data path parameters
    filePath = os.getcwd() + "/data/"
    inputDataName = "input.txt"  # the dataset you want to run clustering on
    intermediateDataName = "intermediateResults.txt"  # intermediate file for EM
    canopyList = "canopylist.txt"  # list of canopy centers
    #canopyCentroidAssign="canopyCentroidAssign.txt"

    inputPath = filePath + inputDataName
    intermediatePath = filePath + intermediateDataName

    print('Canopy-EM cluster by CanEM Team')

    # Generate canopies
    print('Generating Canopies...')
    canopies = None
    gotCanopies = False
    mrJob0 = MrCanopy(args=[inputPath])
    with mrJob0.make_runner() as runner:
        runner.run()
        # the output must be consumed while the runner is still open
        for line in runner.stream_output():
            # only one key, so only one line is expected; if there are
            # several, the last one wins (as before)
            key, canopies = mrJob0.parse_output_line(line)
            gotCanopies = True

    if not gotCanopies:
        # fail with a clear message instead of an unbound-variable error
        raise RuntimeError('MrCanopy produced no output; cannot write '
                           + canopyList)

    # write canopies to file; `with` closes the handle even on failure
    with open(filePath + canopyList, 'w') as fileOut:
        fileOut.write(json.dumps(canopies))

    # Run the EM initializer to get starting centroids
    print('Initializing...')
    mrJob = MrGMixEmInit(args=[inputPath])
    with mrJob.make_runner() as runner:
        runner.run()

    # pull out the centroid values to compare with values after one iteration
    with open(intermediatePath) as fileIn:
        oldMeans = json.loads(fileIn.read())[1]

    # any starting value above the 0.01 threshold forces a first pass
    delta = 10

    # Begin iteration on change in centroids
    print('Iterating...')
    while delta > 0.01:
        # Disabled step, kept for reference:
        # #assign centroid to canopy
        # mrJob3 = MrAssignCentToCan(args=[filePath+intermediateDataName])
        # with mrJob3.make_runner() as runner:
        #     runner.run()

        # run one iteration
        mrJob2 = MrGMixEm(args=[inputPath])
        with mrJob2.make_runner() as runner:
            runner.run()

        # compare new centroids to old ones
        with open(intermediatePath) as fileIn:
            newMeans = json.loads(fileIn.read())[1]

        # indexing (rather than zip) keeps a length mismatch loud:
        # a shorter oldMeans raises IndexError instead of being truncated
        delta = sum(
            (dist(newMeans[i], oldMeans[i]) for i in range(len(newMeans))),
            0.0,
        )

        # NOTE(review): reported once per iteration -- confirm this matches
        # the intended placement of the progress print
        print(delta)

        # the means just read are the baseline for the next pass
        oldMeans = newMeans