def test_simple2(self):
    # h2o-dev doesn't take ../.. type paths? make find_file return absolute path
    a_node = h2o.nodes[0]

    # import_result = a_node.import_files(path=find_file("smalldata/logreg/prostate.csv"))
    import_result = a_node.import_files(path=find_file("smalldata/poker/poker-hand-testing.data"))
    # print dump_json(import_result)
    k = import_result['keys'][0]
    # frames_result = a_node.frames(key=k[0], len=5)
    frames_result = a_node.frames(key=k)

    frame = frames_result['frames'][0]
    byteSize = frame['byteSize']
    rows = frame['rows']
    columns = frame['columns']
    for c in columns:
        label = c['label']
        missing = c['missing']
        stype = c['type']
        zeros = c['zeros']
        domain = c['domain']

    # print dump_json(frame)

    # how do you parse multiple files
    parse_result = a_node.parse(key=k)
    frame = parse_result['frames'][0]
    hex_key = frame['key']['name']
    verboseprint(hex_key, ":", dump_json(parse_result))

def test_simple2(self):
    # h2o-dev doesn't take ../.. type paths? make find_file return absolute path
    # csvPathname = find_file("bigdata/laptop/poker-hand-testing.data")
    csvPathname = find_file("smalldata/logreg/prostate.csv")
    import_result = h2o.n0.import_files(path=csvPathname)
    # print dump_json(import_result)
    k = import_result['keys'][0]
    frames_result = h2o.n0.frames(key=k)

    frame = frames_result['frames'][0]
    rows = frame['rows']
    columns = frame['columns']
    for c in columns:
        label = c['label']
        missing = c['missing_count']
        stype = c['type']
        domain = c['domain']

    # print dump_json(frame)

    # let's see what ray's util does
    frames = h2o.n0.frames()['frames']
    frames_dict = h2o_util.list_to_dict(frames, 'key/name')
    # print "frames:", dump_json(frames)
    # print "frames_dict:", dump_json(frames_dict)
    # use a distinct loop variable so we don't clobber the import key 'k' that parse() needs below
    for fk, fv in frames_dict.items():
        print "frames_dict key:", fk

    # interesting. we can do dictionary comprehensions
    # { k:v for k,v in my_dict.items() if 'Peter' in k }

    # how do you parse multiple files
    parse_result = h2o.n0.parse(key=k, intermediateResults=DO_INTERMEDIATE_RESULTS)
    frame = parse_result['frames'][0]
    hex_key = frame['key']['name']

    colCount = 9
    rowCount = 380
    # colCount = 11
    # rowCount = 1000000
    start = time.time()
    inspect = h2o_cmd.runInspect(None, hex_key)
    print "Inspect:", hex_key, "took", time.time() - start, "seconds"
    numCols = len(inspect['frames'][0]['columns'])
    numRows = inspect['frames'][0]['rows']
    print "\n" + csvPathname, \
        " rows:", "{:,}".format(numRows), \
        " len(columns):", "{:,}".format(numCols)

    # should match # of cols in header or ??
    self.assertEqual(numCols, colCount,
        "parse created result with the wrong number of cols %s %s" % (numCols, colCount))
    self.assertEqual(numRows, rowCount,
        "parse created result with the wrong number of rows (header shouldn't count) %s %s" %
        (numRows, rowCount))

    verboseprint(hex_key, ":", dump_json(parse_result))

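# A hypothetical sketch (not h2o_util's actual implementation) of what a
# list_to_dict helper like the one used above might do: index a list of dicts
# by a '/'-separated path into each element, e.g. 'key/name'.
def list_to_dict_sketch(items, path):
    result = {}
    for item in items:
        value = item
        for part in path.split('/'):
            value = value[part]
        result[value] = item
    return result
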
def upload_file(self, f, progress=None):
    # FIX! we won't find it here if it's hdfs://172.16.2.151/ file
    f = find_file(f)
    if f not in self.uploaded:
        start = time.time()
        import md5
        m = md5.new()
        m.update(open(f).read())
        m.update(getpass.getuser())
        dest = '/tmp/' + m.hexdigest() + "-" + os.path.basename(f)

        # sigh. we rm/create sandbox in build_cloud now
        # (because nosetests doesn't exec h2o_main and we
        # don't want to code "clean_sandbox()" in all the tests.
        # So: we don't have a sandbox here, or if we do, we're going to delete it.
        # Just don't log anything until build_cloud()? that should be okay?
        # we were just logging this upload message..not needed.
        # log('Uploading to %s: %s -> %s' % (self.http_addr, f, dest))
        sftp = self.ssh.open_sftp()
        # check if file exists on remote side
        # does paramiko have issues with big files? (>1GB, or 650MB?). maybe we don't care.
        # This would arise (as mentioned in the source, line no 667,
        # http://www.lag.net/paramiko/docs/paramiko.sftp_client-pysrc.html) when there is
        # any error reading the packet or when there is EOFError
        # but I'm getting sftp close here randomly at sm.
        # http://stackoverflow.com/questions/22708942/python-paramiko-module-error-with-callback
        # http://stackoverflow.com/questions/15010540/paramiko-sftp-server-connection-dropped
        # http://stackoverflow.com/questions/12322210/handling-paramiko-sshexception-server-connection-dropped
        try:
            # note we don't do a md5 compare. so if a corrupted file was uploaded we won't re-upload
            # until we do another build.
            sftp.stat(dest)
            print "{0} Skipping upload of file {1}. File {2} exists on remote side!".format(self, f, dest)
        except IOError, e:
            # if self.channel.closed or self.channel.exit_status_ready():
            #     raise Exception("something bad happened to our %s being used for sftp. keepalive? %s %s" % \
            #         (self, self.channel.closed, self.channel.exit_status_ready()))
            if e.errno == errno.ENOENT:  # no such file or directory
                verboseprint("{0} uploading file {1}".format(self, f))
                sftp.put(f, dest, callback=progress)
                # if you want to track upload times
                ### print "\n{0:.3f} seconds".format(time.time() - start)
            elif e.errno == errno.EEXIST:  # File Exists
                pass
            else:
                print "Got unexpected errno: %s on paramiko sftp." % e.errno
                print "Lookup here: https://docs.python.org/2/library/errno.html"
                # throw the exception again, if not what we expected
                exc_info = sys.exc_info()
                raise exc_info[1], None, exc_info[2]
        finally:
            sftp.close()
        # remember where the file landed so repeated calls skip the upload
        self.uploaded[f] = dest
    return self.uploaded[f]

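# A minimal standalone sketch (not part of the harness) of the /tmp dest-naming
# scheme used above: content hash plus the uploader's username, so re-uploading
# the same build is skipped, while a new build gets a fresh remote path.
# remote_dest_for() and its argument are illustrative names, not harness code.
import getpass
import hashlib
import os

def remote_dest_for(local_path):
    m = hashlib.md5()
    m.update(open(local_path, 'rb').read())
    m.update(getpass.getuser())
    return '/tmp/' + m.hexdigest() + "-" + os.path.basename(local_path)
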
def test_simple2(self):
    # h2o-dev doesn't take ../.. type paths? make find_file return absolute path
    a_node = h2o.nodes[0]

    import_result = a_node.import_files(path=find_file("smalldata/logreg/prostate.csv"))
    print dump_json(import_result)

    frames = a_node.frames(key=import_result['keys'][0], len=5)['frames']
    print dump_json(frames)

    parse_result = a_node.parse(key=import_result['keys'][0])
    hex_key = parse_result['frames'][0]['key']['name']
    verboseprint(hex_key, ":", dump_json(parse_result))

def upload_jar_to_remote_hosts(hosts, slow_connection=False):
    def prog(sofar, total):
        # output is bad for jenkins.
        username = getpass.getuser()
        if username != 'jenkins':
            p = int((10.0*sofar)/total)
            sys.stdout.write('\rUploading jar [%s%s] %02d%%' % ('#'*p, ' ' * (10-p), (100*sofar)/total))
            sys.stdout.flush()

    if not slow_connection:
        for h in hosts:
            f = find_file('target/h2o.jar')
            h.upload_file(f, progress=prog)
            # skipping progress indicator for the flatfile
            h.upload_file(flatfile_pathname())
    else:
        f = find_file('target/h2o.jar')
        hosts[0].upload_file(f, progress=prog)
        hosts[0].push_file_to_remotes(f, hosts[1:])

        f = find_file(flatfile_pathname())
        hosts[0].upload_file(f, progress=prog)
        hosts[0].push_file_to_remotes(f, hosts[1:])

def upload_jar_to_remote_hosts(hosts, slow_connection=False):
    def prog(sofar, total):
        # output is bad for jenkins.
        username = getpass.getuser()
        if username != 'jenkins':
            p = int((10.0 * sofar) / total)
            sys.stdout.write('\rUploading jar [%s%s] %02d%%' % ('#' * p, ' ' * (10 - p), (100 * sofar) / total))
            sys.stdout.flush()

    if not slow_connection:
        for h in hosts:
            f = find_file('build/h2o.jar')
            h.upload_file(f, progress=prog)
            # skipping progress indicator for the flatfile
            h.upload_file(h2o_bc.flatfile_pathname())
    else:
        f = find_file('build/h2o.jar')
        hosts[0].upload_file(f, progress=prog)
        hosts[0].push_file_to_remotes(f, hosts[1:])

        f = find_file(h2o_bc.flatfile_pathname())
        hosts[0].upload_file(f, progress=prog)
        hosts[0].push_file_to_remotes(f, hosts[1:])

def get_h2o_jar(self):
    return find_file('build/h2o.jar')

def get_args(self):
    args = ['java']

    # I guess it doesn't matter if we use flatfile for both now
    # defaults to not specifying
    # FIX! we need to check that it's not outside the limits of the dram of the machine it's running on?
    if self.java_heap_GB is not None:
        if not (1 <= self.java_heap_GB <= 256):
            raise Exception('java_heap_GB <1 or >256 (GB): %s' % (self.java_heap_GB))
        args += ['-Xms%dG' % self.java_heap_GB]
        args += ['-Xmx%dG' % self.java_heap_GB]

    if self.java_heap_MB is not None:
        if not (1 <= self.java_heap_MB <= 256000):
            raise Exception('java_heap_MB <1 or >256000 (MB): %s' % (self.java_heap_MB))
        args += ['-Xms%dm' % self.java_heap_MB]
        args += ['-Xmx%dm' % self.java_heap_MB]

    if self.java_extra_args is not None:
        args += ['%s' % self.java_extra_args]

    if self.use_debugger:
        # currently hardwire the base port for debugger to 8000
        # increment by one for every node we add
        # since this order is different than h2o cluster order, print out the ip and port for the user
        # we could save debugger_port state per node, but not really necessary (but would be more consistent)
        debuggerBasePort = 8000
        if self.node_id is None:
            debuggerPort = debuggerBasePort
        else:
            debuggerPort = debuggerBasePort + self.node_id

        if self.http_addr:
            a = self.http_addr
        else:
            a = "localhost"

        if self.port:
            b = str(self.port)
        else:
            b = "h2o determined"
        # I guess we always specify port?
        print "You can attach debugger at port %s for jvm at %s:%s" % (debuggerPort, a, b)
        args += ['-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=%s' % debuggerPort]

    if self.disable_assertions:
        print "WARNING: h2o is running with assertions disabled"
    else:
        args += ["-ea"]

    if self.use_maprfs:
        args += ["-Djava.library.path=/opt/mapr/lib"]

    if self.classpath:
        entries = [find_file('build/classes'), find_file('lib/javassist.jar')]
        entries += glob.glob(find_file('lib') + '/*/*.jar')
        entries += glob.glob(find_file('lib') + '/*/*/*.jar')
        args += ['-classpath', os.pathsep.join(entries), 'water.Boot']
    else:
        args += ["-jar", self.get_h2o_jar()]

    if 1==1:
        if self.hdfs_config:
            args += ['-hdfs_config ' + self.hdfs_config]

    if h2o_args.beta_features:
        # no -beta
        # args += ["-beta"]
        pass

    if self.network:
        args += ["-network " + self.network]

    # H2O should figure it out, if not specified
    # DON'T EVER USE on multi-machine...h2o should always get it right, to be able to run on hadoop
    # where it's not told
    # new 10/22/14. Allow forcing the ip when we do remote, for networks with bridges, where
    # h2o can't self identify (does -network work?)
    if self.force_ip and self.h2o_addr:
        # should always have an addr if force_ip...but..
        args += ['-ip %s' % self.h2o_addr]

    # Need to specify port, since there can be multiple ports for an ip in the flatfile
    if self.port is not None:
        args += ["-port %d" % self.port]

    if self.use_flatfile:
        args += ['-flatfile ' + self.flatfile]

    args += [
        '-ice_root %s' % self.get_ice_dir(),
        # if I have multiple jenkins projects doing different h2o clouds,
        # I need different ports and different cloud name.
        # does different cloud name prevent them from joining up
        # (even if same multicast ports?)
        # I suppose I can force a base address. or run on another machine?
    ]
    args += ['-name ' + self.cloud_name]

    # ignore the other -hdfs args if the config is used?
    if 1==0:
        if self.hdfs_config:
            args += ['-hdfs_config ' + self.hdfs_config]

    # UPDATE: no longer valid to h2o?
    if 1==0 and self.use_hdfs:
        args += [
            # it's fine if hdfs_name has a ":9000" port or something too
            '-hdfs hdfs://' + self.hdfs_name_node,
            '-hdfs_version ' + self.hdfs_version,
        ]

    # UPDATE: no longer valid to h2o?
    if 1==0 and self.use_maprfs:
        args += [
            # 3 slashes?
            '-hdfs maprfs:///' + self.hdfs_name_node,
            '-hdfs_version ' + self.hdfs_version,
        ]

    if self.aws_credentials:
        args += ['-aws_credentials ' + self.aws_credentials]

    # passed thru build_cloud in test, or global from commandline arg
    if self.random_udp_drop or h2o_args.random_udp_drop:
        args += ['-random_udp_drop']

    if self.force_tcp:
        args += ['-force_tcp']

    if self.disable_h2o_log:
        args += ['-nolog']

    # psutil Popen needs param/value in different arg elements
    # otherwise we'd need to pass as joined string, and run /bin/sh
    # this joins them up with space, then splits on space.
    # works as long as no pathnames have embedded space, which should be true
    # for unix, maybe not windows. For windows we join them as string before use in Popen
    argsSplitByWhiteSpace = " ".join(args).split()
    return argsSplitByWhiteSpace

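# A minimal standalone sketch (not part of the harness) of the join/split
# flattening that get_args() returns: multi-token entries like '-port 54321'
# become separate argv elements for the psutil Popen call. The values below
# are made up for illustration; note this breaks if a pathname contains a space.
args = ['java', '-Xmx4G', '-jar', 'h2o.jar', '-name pytest_cloud', '-port 54321']
print " ".join(args).split()
# -> ['java', '-Xmx4G', '-jar', 'h2o.jar', '-name', 'pytest_cloud', '-port', '54321']
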
def get_h2o_jar(self):
    return find_file('target/h2o.jar')
