def import_frame(path=None, vecs=None):
    """
    Build an H2OFrame either from existing vecs or from a path.

    :param path: Passed to H2OFrame as ``remote_fname`` when ``vecs`` is falsy.
    :param vecs: When truthy, the frame is built from these vecs and ``path`` is not used.
    :return: A new H2OFrame
    """
    if vecs:
        return H2OFrame(vecs=vecs)
    return H2OFrame(remote_fname=path)
def import_frame(path=None, vecs=None):
    """
    Import a frame from a file (remote or local machine). If you run H2O on
    Hadoop, you can also access HDFS.

    :param path: A path specifying the location of the data to import. Only
        used when ``vecs`` is falsy.
    :param vecs: When truthy, the frame is built directly from these vecs.
    :return: A new H2OFrame
    """
    if not vecs:
        return H2OFrame(remote_fname=path)
    return H2OFrame(vecs=vecs)
def ifelse(test, yes, no):
    """
    Semantically equivalent to R's ifelse. Based on the booleans in the test
    vector, the output has the values of the yes and no vectors interleaved
    (or merged together).

    :param test: A "test" H2OFrame (an H2OVec or a plain bool is also handled)
    :param yes: A "yes" H2OFrame (an H2OVec, a number or None is also handled)
    :param no: A "no" H2OFrame (an H2OVec, a number or None is also handled)
    :return: An H2OFrame
    """
    def _value_operand(value):
        # Returns (expression text, key to clean up afterwards or None).
        if isinstance(value, (int, float)):
            return "#{}".format(str(value)), None
        if value is None:
            return "#NaN", None
        if isinstance(value, H2OVec):
            key = value._expr.eager()
        else:
            key = value.key()
        return "'" + key + "'", key

    # The operands are resolved in the order test, yes, no.
    if isinstance(test, bool):
        test_key = None
        test_arg = "%TRUE" if test else "%FALSE"
    else:
        if isinstance(test, H2OVec):
            test_key = test._expr.eager()
        else:
            test_key = test.key()
        test_arg = "'" + test_key + "'"
    yes_arg, yes_key = _value_operand(yes)
    no_arg, no_key = _value_operand(no)

    result_key = H2OFrame.py_tmp_key()
    rapids("(= !{} (ifelse {} {} {}))".format(result_key, test_arg, yes_arg, no_arg))

    # Fetch the result frame as JSON and peel the vecs out of it.
    meta = frame(result_key)['frames'][0]
    nrows = meta['rows']
    vec_ids = meta['vec_ids']
    labels = [column['label'] for column in meta['columns']]
    vecs = H2OVec.new_vecs(zip(labels, vec_ids), nrows)

    # Shallow-remove the result frame and every operand key obtained above.
    removeFrameShallow(result_key)
    for operand_key in (yes_key, no_key, test_key):
        if operand_key is not None:
            removeFrameShallow(str(operand_key))
    return H2OFrame(vecs=vecs)
def upload_file(path, destination_key=""):
    """
    Upload a dataset at the path given from the local machine to the H2O cluster.

    :param path: A path specifying the location of the data to upload.
    :param destination_key: The name of the H2O Frame in the H2O Cluster. An
        empty string means a temporary key is generated.
    :return: A new H2OFrame
    """
    upload_info = {"file": os.path.abspath(path)}
    if destination_key == "":
        dest_key = H2OFrame.py_tmp_key()
    else:
        dest_key = destination_key
    H2OConnection.post_json(url_suffix="PostFile",
                            file_upload_info=upload_info,
                            destination_key=dest_key)
    return H2OFrame(text_key=dest_key)
def get_frame(frame_id):
    """
    Obtain a handle to the frame in H2O with the frame_id key.

    :param frame_id: The key forwarded to ``H2OFrame.get_frame``.
    :return: An H2OFrame
    """
    handle = H2OFrame.get_frame(frame_id)
    return handle
def get_timezone():
    """
    Get the Time Zone on the H2O Cloud.

    :return: the time zone (string)
    """
    query = ExprNode("getTimeZone")
    return H2OFrame(expr=query)._scalar()
def ls():
    """
    List Keys on an H2O Cluster.

    :return: The keys in the current H2O instance, as produced by calling
        ``as_data_frame()`` on the frame of the "ls" expression.
    """
    keys_frame = H2OFrame(expr=ExprNode("ls"))._frame()
    return keys_frame.as_data_frame()
def list_timezones():
    """
    Get a list of all the timezones.

    :return: the time zones (as an H2OFrame)
    """
    query = ExprNode("listTimeZones")
    return H2OFrame(expr=query)._frame()
def as_list(data, use_pandas=True):
    """
    Convert an H2O data object into a python-specific object.

    WARNING: This will pull all data local!

    If Pandas is available (and use_pandas is True), then pandas will be used
    to parse the data frame. Otherwise, a list-of-lists populated by character
    data will be returned (so the types of data will all be str).

    :param data: An H2O data object (H2OFrame, Expr or H2OVec). Any other
        type falls through and yields None.
    :param use_pandas: Try to use pandas for reading in the data.
    :return: List of list (Rows x Columns).
    """
    # Probe for pandas; it is only used when the caller also asked for it.
    try:
        imp.find_module('pandas')
    except ImportError:
        have_pandas = False
    else:
        have_pandas = True

    if isinstance(data, H2OFrame):
        # Send the frame, convert it, then shallow-remove the sent copy.
        sent_key = H2OFrame.send_frame(data)
        converted = _as_data_frame(sent_key, use_pandas and have_pandas)
        removeFrameShallow(sent_key)
        return converted

    if isinstance(data, Expr):
        if data.is_local():
            return data._data
        if data.is_pending():
            data.eager()
            if data.is_local():
                if isinstance(data._data, list):
                    return [data._data]
                return [[data._data]]
        return _as_data_frame(data._data, use_pandas and have_pandas)

    if isinstance(data, H2OVec):
        expr = data._expr
        if expr.is_local():
            return expr._data
        if expr.is_pending():
            expr.eager()
            if expr.is_local():
                return [[expr._data]]
        # Otherwise wrap the vec in a one-column frame and convert that.
        return as_list(H2OFrame(vecs=[data]), use_pandas)
def parse_raw(setup, id=None, first_line_is_header=(-1, 0, 1)):
    """
    Used in conjunction with import_file and parse_setup in order to make
    alterations before parsing.

    :param setup: Result of h2o.parse_setup
    :param id: An optional id for the frame; a temporary key is generated
        when it is None.
    :param first_line_is_header: -1,0,1 if the first line is to be used as the header
    :return: An H2OFrame object
    """
    if id is None:
        id = H2OFrame.py_tmp_key()
    parsed = parse(setup, id, first_line_is_header)
    vec_ids = parsed['vec_ids']
    nrows = parsed['rows']
    if parsed["column_names"]:
        names = parsed['column_names']
    else:
        # No names came back: fall back to C1, C2, ... one per vec.
        names = ["C" + str(i) for i in range(1, len(vec_ids) + 1)]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(names, vec_ids), nrows))
def get_frame(frame_id):
    """
    Fetch the frame stored under ``frame_id`` and wrap it in an H2OFrame.

    :param frame_id: Key of the frame; it is URL-quoted before being
        appended to the "Frames/" endpoint.
    :return: An H2OFrame built from the column labels, vec ids and row count
        in the JSON response.
    :raises ValueError: If ``frame_id`` is None.
    """
    if frame_id is None:
        raise ValueError("frame_id must not be None")
    response = H2OConnection.get_json("Frames/" + urllib.quote(frame_id))
    meta = response["frames"][0]
    labels = [column["label"] for column in meta["columns"]]
    vec_ids = meta["vec_ids"]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(labels, vec_ids), meta["rows"]))
def ls():
    """
    List Keys on an H2O Cluster.

    Runs an "ls" expression into a temporary key, wraps the result in an
    H2OFrame whose column is renamed to "keys", prints a header and shows
    the frame.

    :return: Returns a list of keys in the current H2O instance (the result
        of ``as_list`` with ``use_pandas=False``).
    """
    tmp_key = H2OFrame.py_tmp_key()
    rapids("(= !{} (ls ))".format(tmp_key))

    meta = frame(tmp_key)['frames'][0]
    nrows = meta['rows']
    vec_ids = meta['vec_ids']
    labels = [column['label'] for column in meta['columns']]

    keys_frame = H2OFrame(vecs=H2OVec.new_vecs(zip(labels, vec_ids), nrows))
    keys_frame.setNames(["keys"])
    # Parenthesised form prints the same text under the Python 2 print
    # statement and is also valid as a Python 3 function call.
    print("First 10 Keys: ")
    keys_frame.show()
    return as_list(keys_frame, use_pandas=False)
def export_file(frame, path, force=False):
    """
    Export a given H2OFrame to a path on the machine this python session is
    currently connected to. To view the current session, call
    h2o.cluster_info().

    :param frame: The Frame to save to disk.
    :param path: The path to the save point on disk.
    :param force: Overwrite any preexisting file with the same path
    :return: None
    """
    frame_key = H2OFrame.send_frame(frame)
    if force:
        overwrite = "true"
    else:
        overwrite = "false"
    endpoint = "Frames/" + str(frame_key) + "/export/" + path + "/overwrite/" + overwrite
    H2OConnection.get_json(endpoint)
def ifelse(test, yes, no):
    """
    Semantically equivalent to R's ifelse. Based on the booleans in the test
    vector, the output has the values of the yes and no vectors interleaved
    (or merged together).

    :param test: A "test" H2OFrame
    :param yes: A "yes" H2OFrame
    :param no: A "no" H2OFrame
    :return: An H2OFrame
    """
    node = ExprNode("ifelse", test, yes, no)
    return H2OFrame(expr=node)._frame()
def upload_file(path, destination_frame=""):
    """
    Upload a dataset at the path given from the local machine to the H2O cluster.

    :param path: A path specifying the location of the data to upload.
    :param destination_frame: The name of the H2O Frame in the H2O Cluster.
        An empty string means a temporary key is generated.
    :return: A new H2OFrame
    """
    upload_info = {"file": os.path.abspath(path)}
    if destination_frame == "":
        destination_frame = H2OFrame.py_tmp_key()
    H2OConnection.post_json(url_suffix="PostFile",
                            file_upload_info=upload_info,
                            destination_frame=destination_frame)
    return H2OFrame(text_key=destination_frame)
def ifelse(test, yes, no):
    """
    Semantically equivalent to R's ifelse. Based on the booleans in the test
    vector, the output has the values of the yes and no vectors interleaved
    (or merged together).

    :param test: A "test" H2OFrame (an H2OVec or a plain bool is also handled)
    :param yes: A "yes" H2OFrame (an H2OVec, a number or None is also handled)
    :param no: A "no" H2OFrame (an H2OVec, a number or None is also handled)
    :return: An H2OFrame
    """
    def _branch_operand(value):
        # Returns (expression text, key to clean up afterwards or None).
        if isinstance(value, (int, float)):
            return "#{}".format(str(value)), None
        if value is None:
            return "#NaN", None
        if isinstance(value, H2OVec):
            key = value._expr.eager()
        else:
            key = value.key()
        return "'" + key + "'", key

    # The operands are resolved in the order test, yes, no.
    if isinstance(test, bool):
        test_key = None
        test_arg = "%TRUE" if test else "%FALSE"
    else:
        if isinstance(test, H2OVec):
            test_key = test._expr.eager()
        else:
            test_key = test.key()
        test_arg = "'" + test_key + "'"
    yes_arg, yes_key = _branch_operand(yes)
    no_arg, no_key = _branch_operand(no)

    out_key = H2OFrame.py_tmp_key()
    rapids("(= !{} (ifelse {} {} {}))".format(out_key, test_arg, yes_arg, no_arg))

    # Fetch the result frame as JSON; only the first (only) frame is used.
    info = frame(out_key)['frames'][0]
    row_count = info['rows']
    vec_keys = info['vec_ids']
    col_labels = [col['label'] for col in info['columns']]
    vecs = H2OVec.new_vecs(zip(col_labels, vec_keys), row_count)

    # Shallow-remove the result frame and every operand key obtained above.
    removeFrameShallow(out_key)
    for operand_key in (yes_key, no_key, test_key):
        if operand_key is not None:
            removeFrameShallow(str(operand_key))
    return H2OFrame(vecs=vecs)
def _check_frame(x, y, response):
    """
    Validate the training data and, when a response vec is given, attach it.

    :param x: An H2OFrame, or a list that is wrapped into an H2OFrame.
    :param y: Optional H2OVec; when truthy it is stored in ``x`` under
        ``response._name``.
    :param response: Object whose ``_name`` is used as the column name for ``y``.
    :return: The (possibly newly built) H2OFrame.
    :raises ValueError: If ``x`` or ``y`` has the wrong type, or if a vec in
        ``x`` already has the same name as ``y``.
    """
    if not isinstance(x, H2OFrame):
        if not isinstance(x, list):
            raise ValueError("`x` must be an H2OFrame or a list of H2OVecs. Got: " + str(type(x)))
        x = H2OFrame(vecs=x)

    if not y:
        return x

    if not isinstance(y, H2OVec):
        raise ValueError("`y` must be an H2OVec. Got: " + str(type(y)))
    # The response must not already be one of the training columns.
    if any(y._name == vec._name for vec in x._vecs):
        raise ValueError("Found response " + y._name + " in training `x` data")
    x[response._name] = y
    return x
def export_file(frame, path, force=False):
    """
    Export a given H2OFrame to a path on the machine this python session is
    currently connected to. To view the current session, call
    h2o.cluster_info().

    :param frame: The Frame to save to disk.
    :param path: The path to the save point on disk.
    :param force: Overwrite any preexisting file with the same path
    :return: None
    """
    sent_key = H2OFrame.send_frame(frame)
    overwrite_flag = "true" if force else "false"
    url = "Frames/" + str(sent_key) + "/export/" + path
    url = url + "/overwrite/" + overwrite_flag
    H2OConnection.get_json(url)
def _simple_un_math_op(op, data):
    """
    Element-wise math operations on H2OFrame and H2OVec.

    :param op: the math operation
    :param data: the H2OFrame or H2OVec object to operate on.
    :return: H2OFrame or H2OVec, with lazy operation
    :raises ValueError: If ``data`` is neither an H2OFrame nor an H2OVec.
    """
    if isinstance(data, H2OFrame):
        # Apply the operation to each vec of the frame.
        return H2OFrame(vecs=[_simple_un_math_op(op, vec) for vec in data._vecs])
    if isinstance(data, H2OVec):
        return H2OVec(data._name, Expr(op, left=data, length=len(data)))
    # Call form instead of ``raise ValueError, msg``: equivalent on Python 2
    # and also valid syntax on Python 3.
    raise ValueError(op + " only operates on H2OFrame or H2OVec objects")
def cbind(left, right):
    """
    Column-bind two H2O objects into a new frame.

    :param left: H2OFrame or H2OVec
    :param right: H2OFrame or H2OVec
    :return: new H2OFrame with left|right cbinded
    :raises ValueError: If either argument is neither an H2OFrame nor an H2OVec.
    """
    def _as_vecs(obj):
        # A frame contributes all of its vecs, a vec contributes itself.
        if isinstance(obj, H2OFrame):
            return list(obj._vecs)
        if isinstance(obj, H2OVec):
            return [obj]
        raise ValueError("left and right data must be H2OVec or H2OFrame")

    vecs = _as_vecs(left) + _as_vecs(right)
    # The original names are captured first and re-applied to the result.
    names = [vec.name() for vec in vecs]

    tmp_key = H2OFrame.py_tmp_key()
    expr = "(= !" + tmp_key + " (cbind %FALSE %"
    expr += " %".join([vec._expr.eager() for vec in vecs]) + "))"
    rapids(expr)

    meta = frame(tmp_key)['frames'][0]
    nrows = meta['rows']
    vec_ids = meta['vec_ids']
    labels = [column['label'] for column in meta['columns']]

    result = H2OFrame(vecs=H2OVec.new_vecs(zip(labels, vec_ids), nrows))
    result.setNames(names)
    return result
def as_list(data, use_pandas=True):
    """
    Convert an H2O data object into a python-specific object.

    WARNING: This will pull all data local!

    If Pandas is available (and use_pandas is True), then pandas will be used
    to parse the data frame. Otherwise, a list-of-lists populated by character
    data will be returned (so the types of data will all be str).

    :param data: An H2O data object.
    :param use_pandas: Try to use pandas for reading in the data.
    :return: List of list (Rows x Columns).
    """
    converted = H2OFrame.as_data_frame(data, use_pandas)
    return converted
def parse_raw(setup, id=None, first_line_is_header=(-1, 0, 1)):
    """
    Used in conjunction with import_file and parse_setup in order to make
    alterations before parsing.

    :param setup: Result of h2o.parse_setup
    :param id: An optional id for the frame; a temporary key is generated
        when it is None.
    :param first_line_is_header: -1,0,1 if the first line is to be used as the header
    :return: An H2OFrame object
    """
    if id is None:
        id = H2OFrame.py_tmp_key()
    parse_result = parse(setup, id, first_line_is_header)
    vec_keys = parse_result['vec_ids']
    row_count = parse_result['rows']
    if parse_result["column_names"]:
        col_names = parse_result['column_names']
    else:
        # No names came back: fall back to C1, C2, ... one per vec.
        col_names = ["C" + str(n) for n in range(1, len(vec_keys) + 1)]
    vecs = H2OVec.new_vecs(zip(col_names, vec_keys), row_count)
    return H2OFrame(vecs=vecs)
def rep_len(data, length_out):
    """
    Repeat ``data`` out to ``length_out``.

    A ``str`` or ``int`` is sent to the cluster as a rep_len expression and
    the result is returned as an H2OFrame; any other object is asked to do
    the work through its own ``rep_len`` method.

    :param data: A str, an int, or an object providing ``rep_len(length_out=...)``.
    :param length_out: The requested length.
    :return: An H2OFrame for scalar input, otherwise whatever
        ``data.rep_len`` returns.
    """
    if not isinstance(data, (str, int)):
        return data.rep_len(length_out=length_out)

    tmp_key = H2OFrame.py_tmp_key()
    # Ints are written with a '#' prefix, strings are double-quoted.
    if isinstance(data, int):
        scalar = '#{}'.format(data)
    else:
        scalar = '\"{}\"'.format(data)
    rapids("(= !{} (rep_len {} {}))".format(tmp_key, scalar, '#{}'.format(length_out)))

    meta = frame(tmp_key)['frames'][0]
    nrows = meta['rows']
    vec_ids = meta['vec_ids']
    labels = [column['label'] for column in meta['columns']]
    vecs = H2OVec.new_vecs(zip(labels, vec_ids), nrows)
    # The vecs have been taken out, so the temporary frame is shallow-removed.
    removeFrameShallow(tmp_key)
    return H2OFrame(vecs=vecs)
def parse_raw(setup, id=None, first_line_is_header=(-1, 0, 1)):
    """
    Used in conjunction with import_file and parse_setup in order to make
    alterations before parsing.

    :param setup: Result of h2o.parse_setup; its "destination_frame" entry
        is used as the frame id.
    :param id: An optional id for the frame. NOTE: the value passed in is
        not used; it is replaced by ``setup["destination_frame"]``.
    :param first_line_is_header: -1,0,1 if the first line is to be used as the header
    :return: An H2OFrame object
    """
    frame_id = setup["destination_frame"]
    result = H2OFrame()
    parsed = parse(setup, frame_id, first_line_is_header)

    # Fill in the frame's metadata from the parse result.
    result._nrows = parsed['rows']
    result._col_names = parsed['column_names']
    result._ncols = len(result._col_names)
    result._computed = True
    result._id = frame_id
    return result
def as_list(data, use_pandas=True):
    """
    Convert an H2O data object into a python-specific object.

    WARNING: This will pull all data local!

    If Pandas is available (and use_pandas is True), then pandas will be used
    to parse the data frame. Otherwise, a list-of-lists populated by character
    data will be returned (so the types of data will all be str).

    :param data: An H2O data object (H2OFrame, Expr or H2OVec). Any other
        type falls through and yields None.
    :param use_pandas: Try to use pandas for reading in the data.
    :return: List of list (Rows x Columns).
    """
    # Probe for pandas; it is only used when the caller also asked for it.
    try:
        imp.find_module('pandas')
    except ImportError:
        pandas_found = False
    else:
        pandas_found = True

    if isinstance(data, H2OFrame):
        # Send the frame, convert it, then shallow-remove the sent copy.
        key = H2OFrame.send_frame(data)
        local_copy = _as_data_frame(key, use_pandas and pandas_found)
        removeFrameShallow(key)
        return local_copy

    if isinstance(data, Expr):
        if data.is_local():
            return data._data
        if data.is_pending():
            data.eager()
            if data.is_local():
                if isinstance(data._data, list):
                    return [data._data]
                return [[data._data]]
        return _as_data_frame(data._data, use_pandas and pandas_found)

    if isinstance(data, H2OVec):
        vec_expr = data._expr
        if vec_expr.is_local():
            return vec_expr._data
        if vec_expr.is_pending():
            vec_expr.eager()
            if vec_expr.is_local():
                return [[vec_expr._data]]
        # Otherwise wrap the vec in a one-column frame and convert that.
        return as_list(H2OFrame(vecs=[data]), use_pandas)
def ls():
    """
    List Keys on an H2O Cluster.

    Runs an "ls" expression into a temporary key, wraps the result in an
    H2OFrame, prints a header and shows the frame.

    :return: Returns a list of keys in the current H2O instance (the result
        of ``as_list`` with ``use_pandas=False``).
    """
    tmp_key = H2OFrame.py_tmp_key()
    rapids("(= !{} (ls ))".format(tmp_key))

    meta = frame(tmp_key)['frames'][0]
    nrows = meta['rows']
    vec_ids = meta['vec_ids']
    labels = [column['label'] for column in meta['columns']]

    keys_frame = H2OFrame(vecs=H2OVec.new_vecs(zip(labels, vec_ids), nrows))
    # Parenthesised form prints the same text under the Python 2 print
    # statement and is also valid as a Python 3 function call.
    print("First 10 Keys: ")
    keys_frame.show()
    return as_list(keys_frame, use_pandas=False)
def remove(object):
    """
    Remove object from H2O. This is a "hard" delete of the object. It removes
    all subparts.

    :param object: The object pointing to the object to be removed: an
        H2OFrame, an H2OVec, or a key that can be appended to "DKV/".
    :return: None
    :raises ValueError: If ``object`` is None.
    """
    if object is None:
        raise ValueError("remove with no object is not supported, for your protection")

    if isinstance(object, H2OFrame):
        # Send the frame, delete it by key, then empty the local handle.
        frame_key = H2OFrame.send_frame(object)
        remove(frame_key)
        object._vecs = []
    elif isinstance(object, H2OVec):
        H2OConnection.delete("DKV/" + str(object.key()))
        # Clear the vec's expression. Rebinding the local name to None
        # afterwards had no effect on the caller, so that dead store is gone.
        object._expr = None
    else:
        H2OConnection.delete("DKV/" + object)
def remove(object):
    """
    Remove object from H2O. This is a "hard" delete of the object. It removes
    all subparts.

    :param object: The object pointing to the object to be removed: an
        H2OFrame, an H2OVec, or a key that can be appended to "DKV/".
    :return: None
    :raises ValueError: If ``object`` is None.
    """
    if object is None:
        raise ValueError(
            "remove with no object is not supported, for your protection")

    if isinstance(object, H2OFrame):
        # Send the frame, delete it by key, then empty the local handle.
        sent_key = H2OFrame.send_frame(object)
        remove(sent_key)
        object._vecs = []
        return
    if isinstance(object, H2OVec):
        H2OConnection.delete("DKV/" + str(object.key()))
        # Clear the vec's expression. Rebinding the local name to None
        # afterwards had no effect on the caller, so that dead store is gone.
        object._expr = None
        return
    H2OConnection.delete("DKV/" + object)
def cbind(left, right):
    """
    Column-bind two H2O objects into a new frame.

    :param left: H2OFrame or H2OVec
    :param right: H2OFrame or H2OVec
    :return: new H2OFrame with left|right cbinded
    :raises ValueError: If either argument is neither an H2OFrame nor an H2OVec.
    """
    def _vecs_of(obj):
        # A frame contributes all of its vecs, a vec contributes itself.
        if isinstance(obj, H2OFrame):
            return list(obj._vecs)
        if isinstance(obj, H2OVec):
            return [obj]
        raise ValueError("left and right data must be H2OVec or H2OFrame")

    vecs = _vecs_of(left) + _vecs_of(right)
    # The original names are captured first and re-applied to the result.
    names = [vec.name() for vec in vecs]

    result_key = H2OFrame.py_tmp_key()
    expr = "(= !" + result_key + " (cbind %FALSE %"
    expr += " %".join([vec._expr.eager() for vec in vecs]) + "))"
    rapids(expr)

    info = frame(result_key)['frames'][0]
    row_count = info['rows']
    vec_ids = info['vec_ids']
    col_labels = [col['label'] for col in info['columns']]

    bound = H2OFrame(vecs=H2OVec.new_vecs(zip(col_labels, vec_ids), row_count))
    bound.setNames(names)
    return bound
def which(condition):
    """
    Find the rows for which a condition holds.

    :param condition: A conditional statement.
    :return: A H2OFrame of 1 column filled with 0-based indices for which the
        condition is True
    """
    node = ExprNode("h2o.which", condition, False)
    return H2OFrame(expr=node)._frame()
def export_file(frame, path, force=False):
    """
    Export a frame by issuing a GET on its "export" endpoint.

    :param frame: The frame to export; it is first passed to
        ``H2OFrame.send_frame``.
    :param path: The path inserted into the export URL.
    :param force: When truthy the URL carries "overwrite/true", otherwise
        "overwrite/false".
    :return: None
    """
    frame_key = H2OFrame.send_frame(frame)
    if force:
        overwrite = "true"
    else:
        overwrite = "false"
    H2OConnection.get_json("Frames/" + str(frame_key) + "/export/" + path + "/overwrite/" + overwrite)