def kmeans_doit(self, csvFilename, csvPathname, timeoutSecs=30): print "\nStarting KMeans of", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10) # hastie has two values, 1 and -1. # we could not specify cols, but this is more fun cols = ",".join(map(str,range(11))) kwargs = { 'k': 1, 'epsilon': 1e-6, 'cols': cols, 'destination_key': 'KMeansModel.hex' } start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) # compare this kmeans to the first one. since the files are replications, the results # should be similar? inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key']) KMeansModel = inspect['KMeansModel'] clusters = KMeansModel['clusters'][0] print "clusters:", h2o.dump_json(clusters) if self.clusters1: h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1) else: self.clusters1 = copy.deepcopy(clusters)
def kmeans_doit(self, csvFilename, csvPathname, timeoutSecs=30): print "\nStarting KMeans of", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10) # hastie has two values, 1 and -1. # we could not specify cols, but this is more fun cols = ",".join(map(str, range(11))) kwargs = { 'k': 1, 'epsilon': 1e-6, 'cols': cols, 'destination_key': 'KMeansModel.hex' } start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key']) ### print h2o.dump_json(inspect) # compare this kmeans to the first one. since the files are replications, the results # should be similar? KMeansModel = inspect['KMeansModel'] clusters = KMeansModel['clusters'][0] print "clusters:", h2o.dump_json(clusters) if self.clusters1: h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1) else: self.clusters1 = copy.deepcopy(clusters)
def kmeans_doit(self, csvFilename, bucket, csvPathname, num_rows, timeoutSecs=30): print "\nStarting KMeans of", csvFilename parseResult = h2i.import_parse( bucket=bucket, path=csvPathname, schema="put", hex_key=csvFilename + ".hex", timeoutSecs=10 ) # hastie has two values, 1 and -1. # we could not specify cols, but this is more fun kwargs = { "k": 1, "initialization": "Furthest", "destination_key": "KMeansModel.hex", "max_iter": 25, # reuse the same seed, to get deterministic results (otherwise sometimes fails "seed": 265211114317615310, } start = time.time() kmeans = h2o_cmd.runKMeans( parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs ) elapsed = time.time() - start print "kmeans end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed / timeoutSecs) * 100 ) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, "d", **kwargs) expected = [ ( [ -0.0006628900000000158, -0.0004671200060434639, 0.0009330300069879741, 0.0007883800000000272, 0.0007548200000000111, 0.0005617899864856153, 0.0013246499999999897, 0.0004036299999999859, -0.0014307100000000314, 0.0021324000161308796, 0.00154, ], num_rows, None, ) ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0) # compare this kmeans to the first one. since the files are replications, the results # should be similar? inspect = h2o_cmd.runInspect(None, key=kmeans["destination_key"]) KMeansModel = inspect["KMeansModel"] clusters = KMeansModel["centers"][0] print "clusters:", h2o.dump_json(clusters) if self.clusters1: h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1) else: self.clusters1 = copy.deepcopy(clusters)
def kmeans_doit(self, csvFilename, csvPathname, num_rows, timeoutSecs=30): print "\nStarting KMeans of", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10) # hastie has two values, 1 and -1. # we could not specify cols, but this is more fun cols = ",".join(map(str, range(11))) kwargs = { 'k': 1, 'epsilon': 1e-6, 'cols': cols, 'destination_key': 'KMeansModel.hex', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310, } start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) expected = [([ -0.0006628900000000158, -0.0004671200060434639, 0.0009330300069879741, 0.0007883800000000272, 0.0007548200000000111, 0.0005617899864856153, 0.0013246499999999897, 0.0004036299999999859, -0.0014307100000000314, 0.0021324000161308796, 0.00154 ], num_rows, None)] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0) # compare this kmeans to the first one. since the files are replications, the results # should be similar? inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key']) KMeansModel = inspect['KMeansModel'] clusters = KMeansModel['clusters'][0] print "clusters:", h2o.dump_json(clusters) if self.clusters1: h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1) else: self.clusters1 = copy.deepcopy(clusters)
def kmeans_doit(self, csvFilename, bucket, csvPathname, numRows, timeoutSecs=30): print "\nStarting KMeans of", csvFilename parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=csvFilename + ".hex", timeoutSecs=20) # hastie has two values, 1 and -1. # we could not specify cols, but this is more fun kwargs = { 'k': 1, 'initialization': 'Furthest', 'destination_key': 'KMeansModel.hex', 'max_iter': 25, # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310, } start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs) expected = [ ([-0.0006628900000000158, -0.0004671200060434639, 0.0009330300069879741, 0.0007883800000000272, 0.0007548200000000111, 0.0005617899864856153, 0.0013246499999999897, 0.0004036299999999859, -0.0014307100000000314, 0.0021324000161308796, 0.00154], numRows, None) ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0) # compare this kmeans to the first one. since the files are replications, the results # should be similar? # inspect doesn't work # inspect = h2o_cmd.runInspect(None, key=kmeans['model']['_key']) # KMeansModel = inspect['KMeansModel'] modelView = h2o.nodes[0].kmeans_view(model='KMeansModel.hex') h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView)) model = modelView['model'] clusters = model['centers'] within_cluster_variances = model['within_cluster_variances'] total_within_SS = model['total_within_SS'] print "within_cluster_variances:", within_cluster_variances print "total_within_SS:", total_within_SS if self.clusters1: h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1) else: self.clusters1 = copy.deepcopy(clusters)
def kmeans_doit(self, csvFilename, csvPathname, num_rows, timeoutSecs=30): print "\nStarting KMeans of", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10) # hastie has two values, 1 and -1. # we could not specify cols, but this is more fun cols = ",".join(map(str,range(11))) kwargs = { 'k': 1, 'epsilon': 1e-6, 'cols': cols, 'destination_key': 'KMeansModel.hex', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310, } start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) expected = [ ([-0.0006628900000000158, -0.0004671200060434639, 0.0009330300069879741, 0.0007883800000000272, 0.0007548200000000111, 0.0005617899864856153, 0.0013246499999999897, 0.0004036299999999859, -0.0014307100000000314, 0.0021324000161308796, 0.00154], num_rows, None) ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0) # compare this kmeans to the first one. since the files are replications, the results # should be similar? inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key']) KMeansModel = inspect['KMeansModel'] clusters = KMeansModel['clusters'][0] print "clusters:", h2o.dump_json(clusters) if self.clusters1: h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1) else: self.clusters1 = copy.deepcopy(clusters)
def kmeans_doit(self, csvFilename, bucket, csvPathname, numRows, timeoutSecs=30): print "\nStarting KMeans of", csvFilename parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=csvFilename + ".hex", timeoutSecs=20) # hastie has two values, 1 and -1. # we could not specify cols, but this is more fun kwargs = { 'k': 1, 'initialization': 'Furthest', 'destination_key': 'KMeansModel.hex', 'max_iter': 25, # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310, } start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs) expected = [([ -0.0006628900000000158, -0.0004671200060434639, 0.0009330300069879741, 0.0007883800000000272, 0.0007548200000000111, 0.0005617899864856153, 0.0013246499999999897, 0.0004036299999999859, -0.0014307100000000314, 0.0021324000161308796, 0.00154 ], numRows, None)] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0) # compare this kmeans to the first one. since the files are replications, the results # should be similar? # inspect doesn't work # inspect = h2o_cmd.runInspect(None, key=kmeans['model']['_key']) # KMeansModel = inspect['KMeansModel'] modelView = h2o.nodes[0].kmeans_view(model='KMeansModel.hex') h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView)) model = modelView['model'] clusters = model['centers'] within_cluster_variances = model['within_cluster_variances'] total_within_SS = model['total_within_SS'] print "within_cluster_variances:", within_cluster_variances print "total_within_SS:", total_within_SS if self.clusters1: h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1) else: self.clusters1 = copy.deepcopy(clusters)