def show(self, noprint=False):
    """
    Evaluate and print.
    :return: None
    """
    self.eager()
    if noprint:
        if isinstance(self._data, unicode):
            j = h2o.frame(self._data)
            data = j['frames'][0]['columns'][0]['data'][0:10]
            return data
        return self._data
    else:
        if isinstance(self._data, unicode):
            j = h2o.frame(self._data)
            data = [c['data'] for c in j['frames'][0]['columns'][:]]
        elif isinstance(self._data, (int, float, str, list)):
            print self._data
            print
            return
        else:
            data = [self._data]
        t_data = map(list, zip(*data))
        for didx, d in enumerate(t_data):
            t_data[didx].insert(0, didx)
        headers = ["Row ID"]
        for i in range(len(t_data[0])):
            headers.append('')
        print "Displaying first " + str(len(t_data)) + " row(s)"
        print tabulate.tabulate(t_data, headers=headers)
        print

def describe(self):
    """
    Generate an in-depth description of this H2OFrame.

    The description is a tabular print of the type, min, max, sigma, number of zeros,
    and number of missing elements for each H2OVec in this H2OFrame.

    :return: None (print to stdout)
    """
    if self._vecs is None or self._vecs == []:
        raise ValueError("Frame Removed")
    thousands_sep = h2o.H2ODisplay.THOUSANDS
    print "Rows:", thousands_sep.format(len(self._vecs[0])), "Cols:", thousands_sep.format(len(self))
    headers = [vec._name for vec in self._vecs]
    table = [
        self._row('type', None),
        self._row('mins', 0),
        self._row('mean', None),
        self._row('maxs', 0),
        self._row('sigma', None),
        self._row('zero_count', None),
        self._row('missing_count', None)
    ]
    chunk_summary_tmp_key = H2OFrame.send_frame(self)
    chunk_summary = h2o.frame(chunk_summary_tmp_key)["frames"][0]["chunk_summary"]
    dist_summary = h2o.frame(chunk_summary_tmp_key)["frames"][0]["distribution_summary"]
    h2o.delete(chunk_summary_tmp_key)
    chunk_summary.show()
    dist_summary.show()
    h2o.H2ODisplay(table, [""] + headers, "Column-by-Column Summary")

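# A minimal usage sketch for describe() above, not part of the original source:
# it assumes a running H2O cluster and the legacy vec-backed H2OFrame API used
# throughout this file; the csv path is hypothetical.
def describe_usage_example():
    fr = h2o.import_file("smalldata/logreg/benign.csv")  # hypothetical path
    fr.describe()  # prints row/col counts, the per-column stats table, and the
                   # chunk/distribution summaries fetched via h2o.frame()
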
def import_folder():
    tol_time = 200        # comparing in ms or ns
    tol_numeric = 1e-5    # tolerance for comparing other numeric fields
    numElements2Compare = 100  # choose number of elements per column to compare.  Save test time.

    # compressed the whole directory of files.
    multi_file_gzip_comp = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/hexdev_497/milsongs_csv.zip"))
    # directory containing the gzip version of csv files here.
    multi_file_csv = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/hexdev_497/milsongs_csv_gzip"))

    try:
        # make sure the two agree
        assert pyunit_utils.compare_frames(multi_file_csv, multi_file_gzip_comp, numElements2Compare,
                                           tol_time, tol_numeric, True), \
            "H2O frames parsed from the zip file and the gzipped csv directory are different!"
    except:
        # in case the files are listed differently, we can always just check to see if the summary agrees.
        multi_file_gzip_comp.summary()
        zip_summary = h2o.frame(multi_file_gzip_comp.frame_id)["frames"][0]["columns"]

        multi_file_csv.summary()
        csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

        pyunit_utils.compare_frame_summary(zip_summary, csv_summary)

def show(self, noprint=False):
    """
    Evaluate and print.
    :return: None
    """
    self.eager()
    if noprint:
        if isinstance(self._data, unicode):
            j = h2o.frame(self._data)
            data = j['frames'][0]['columns'][0]['data'][0:10]
            return data
        return self._data
    else:
        if isinstance(self._data, unicode):
            j = h2o.frame(self._data)
            data = j['frames'][0]['columns'][0]['data'][0:10]
        elif isinstance(self._data, int):
            print self._data
            return
        else:
            data = [self._data]
        header = self._vecname + " (first " + str(len(data)) + " row(s))"
        rows = range(1, len(data) + 1, 1)
        print tabulate.tabulate(zip(rows, data), headers=["Row ID", header])
        print

def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        hdfs_orc_file = "/datasets/orc_parser/air05_orc"
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        hdfs_csv_file = "/datasets/orc_parser/air05_csv"
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

        startcsv = time.time()
        multi_file_csv = h2o.import_file(url_csv, na_strings=['\\N'])
        endcsv = time.time()

        csv_type_dict = multi_file_csv.types

        multi_file_csv.summary()
        csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

        col_ind_name = dict()
        # change column types from real to enum according to multi_file_csv column types
        for key_name in list(csv_type_dict):
            col_ind = key_name.split('C')
            new_ind = int(str(col_ind[1])) - 1
            col_ind_name[new_ind] = key_name

        col_types = []
        for ind in range(len(col_ind_name)):
            col_types.append(csv_type_dict[col_ind_name[ind]])

        startorc1 = time.time()
        multi_file_orc1 = h2o.import_file(url_orc)
        endorc1 = time.time()
        h2o.remove(multi_file_orc1)

        startorc = time.time()
        multi_file_orc = h2o.import_file(url_orc, col_types=col_types)
        endorc = time.time()

        multi_file_orc.summary()
        orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

        print("************** CSV parse time is {0}".format(endcsv - startcsv))
        print("************** ORC (without column type forcing) parse time is {0}".format(endorc1 - startorc1))
        print("************** ORC (with column type forcing) parse time is {0}".format(endorc - startorc))

        # compare the csv frame with the orc frame parsed with forced column types
        pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError

def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_milsongs_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/milsongs_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/milsongs_csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            multi_file_csv = h2o.import_file(url_csv)
            multi_file_orc = h2o.import_file(url_orc)

            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError

def additional_parameters():
    # col_types as list
    dest_frame = "dev29&hex%"
    c_names = ["a", "b", "c"]
    c_types = ["enum", "enum", "string"]

    fhex = h2o.import_file(tests.locate("smalldata/jira/hexdev_29.csv"),
                           destination_frame=dest_frame,
                           col_names=c_names,
                           col_types=c_types)
    fhex.describe()

    assert fhex._id == dest_frame.replace("%", ".").replace("&", ".")
    assert fhex.col_names == c_names

    col_summary = h2o.frame(fhex._id)["frames"][0]["columns"]
    for i in range(len(col_summary)):
        assert col_summary[i]["type"] == c_types[i]

    # col_types as dictionary
    dest_frame = "dev29&hex%"
    c_names = ["a", "b", "c"]
    c_types = {"c": "string", "a": "enum", "b": "enum"}

    fhex = h2o.import_file(tests.locate("smalldata/jira/hexdev_29.csv"),
                           destination_frame=dest_frame,
                           col_names=c_names,
                           col_types=c_types)
    fhex.describe()

    assert fhex._id == dest_frame.replace("%", ".").replace("&", ".")
    assert fhex.col_names == c_names

    col_summary = h2o.frame(fhex._id)["frames"][0]["columns"]
    for i in range(len(col_summary)):
        assert col_summary[i]["type"] == c_types[c_names[i]]

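# The two col_types forms accepted above are interchangeable: a dict keys the
# types by column name, so its order does not matter. A self-contained sketch
# using the same names and types as additional_parameters() above:
def col_types_forms_example():
    c_names = ["a", "b", "c"]
    as_list = ["enum", "enum", "string"]                 # positional: a, b, c
    as_dict = {"c": "string", "a": "enum", "b": "enum"}  # keyed by column name
    assert [as_dict[n] for n in c_names] == as_list
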
def import_folder():
    tol_time = 200        # comparing in ms or ns for timestamp columns
    tol_numeric = 1e-5    # tolerance for comparing other numeric fields
    numElements2Compare = 0  # choose number of elements per column to compare.  Save test time.

    multi_file_csv = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header"))
    multi_file_gzip_comp = \
        h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header.zip"))

    try:
        # make sure the two agree
        assert pyunit_utils.compare_frames(multi_file_csv, multi_file_gzip_comp, numElements2Compare,
                                           tol_time, tol_numeric, True), \
            "H2O frames parsed from the csv directory and the zip file are different!"
    except:
        # in case the files are listed differently, we can always just check to see if the summary agrees.
        multi_file_gzip_comp.summary()
        zip_summary = h2o.frame(multi_file_gzip_comp.frame_id)["frames"][0]["columns"]

        multi_file_csv.summary()
        csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

        pyunit_utils.compare_frame_summary(zip_summary, csv_summary)

def show(self, noprint=False):
    """
    Evaluate and print.
    :return: None
    """
    self.eager()
    if noprint:
        if isinstance(self._data, unicode):
            j = h2o.frame(self._data)
            data = [c['data'] for c in j['frames'][0]['columns'][:]]
            data = map(list, zip(*data))
            return data[0:min(10, len(data))]
        return self._data
    else:
        if isinstance(self._data, unicode):
            j = h2o.frame(self._data)
            data = [c['data'] for c in j['frames'][0]['columns'][:]]
        elif isinstance(self._data, (int, float, str, list)):
            print self._data
            print
            return
        else:
            data = [self._data]
        t_data = map(list, zip(*data))
        t_data = t_data[0:min(10, len(t_data))]
        for didx, d in enumerate(t_data):
            t_data[didx].insert(0, didx)
        headers = ["Row ID"]
        for i in range(len(t_data[0])):
            headers.append('')
        print "Displaying first " + str(len(t_data)) + " row(s)"
        print tabulate.tabulate(t_data, headers=headers)
        print

def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        hdfs_orc_file = "/datasets/orc_parser/milsongs_orc"
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        hdfs_csv_file = "/datasets/orc_parser/milsongs_csv"
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

        multi_file_csv = h2o.import_file(url_csv)
        multi_file_orc = h2o.import_file(url_orc)

        multi_file_csv.summary()
        csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

        multi_file_orc.summary()
        orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

        pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError

def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_airline_05_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/air05_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/air05_csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            startcsv = time.time()
            multi_file_csv = h2o.import_file(url_csv, na_strings=['\\N'])
            endcsv = time.time()

            csv_type_dict = multi_file_csv.types

            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

            col_ind_name = dict()
            # change column types from real to enum according to multi_file_csv column types
            for key_name in list(csv_type_dict):
                col_ind = key_name.split('C')
                new_ind = int(str(col_ind[1])) - 1
                col_ind_name[new_ind] = key_name

            col_types = []
            for ind in range(len(col_ind_name)):
                col_types.append(csv_type_dict[col_ind_name[ind]])

            startorc1 = time.time()
            multi_file_orc1 = h2o.import_file(url_orc)
            endorc1 = time.time()
            h2o.remove(multi_file_orc1)

            startorc = time.time()
            multi_file_orc = h2o.import_file(url_orc, col_types=col_types)
            endorc = time.time()

            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

            print("************** CSV parse time is {0}".format(endcsv - startcsv))
            print("************** ORC (without column type forcing) parse time is {0}".format(endorc1 - startorc1))
            print("************** ORC (with column type forcing) parse time is {0}".format(endorc - startorc))

            # compare the csv frame with the orc frame parsed with forced column types
            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError

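# The column type forcing loops above depend on H2O's default column names
# "C1", "C2", ...: splitting on 'C' recovers the 1-based position, which is
# shifted to a 0-based index so col_types lines up with column order. A
# self-contained sketch of that mapping, with hypothetical types:
def col_type_ordering_example():
    csv_type_dict = {"C2": "enum", "C1": "real", "C3": "int"}
    col_ind_name = dict()
    for key_name in list(csv_type_dict):
        new_ind = int(key_name.split('C')[1]) - 1
        col_ind_name[new_ind] = key_name
    col_types = [csv_type_dict[col_ind_name[ind]] for ind in range(len(col_ind_name))]
    assert col_types == ["real", "enum", "int"]  # ordered by column position
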
def import_folder():
    multi_file_csv = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header"))
    multi_file_gzip_comp = \
        h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header.zip"))

    multi_file_gzip_comp.summary()
    zip_summary = h2o.frame(multi_file_gzip_comp.frame_id)["frames"][0]["columns"]

    multi_file_csv.summary()
    csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

    pyunit_utils.compare_frame_summary(zip_summary, csv_summary)

def import_folder():
    """
    This test builds an H2O frame by importing the folder
    bigdata/laptop/parser/orc/airlines_05p_orc_csv and builds another H2O frame with the
    multi-file orc parser using multiple orc files saved in the directory
    bigdata/laptop/parser/orc/airlines_05p_orc.  It then compares the two frames to make
    sure they are equal.
    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    startcsv = time.time()
    multi_file_csv = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/pubdev_3200/air05_csv"),
                                     na_strings=['\\N'])
    endcsv = time.time()

    csv_type_dict = multi_file_csv.types

    multi_file_csv.summary()
    csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

    col_ind_name = dict()
    # change column types from real to enum according to multi_file_csv column types
    for key_name in list(csv_type_dict):
        col_ind = key_name.split('C')
        new_ind = int(str(col_ind[1])) - 1
        col_ind_name[new_ind] = key_name

    col_types = []
    for ind in range(len(col_ind_name)):
        col_types.append(csv_type_dict[col_ind_name[ind]])

    startorc1 = time.time()
    multi_file_orc1 = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/pubdev_3200/air05_orc"))
    endorc1 = time.time()
    h2o.remove(multi_file_orc1)

    startorc = time.time()
    multi_file_orc = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/pubdev_3200/air05_orc"),
                                     col_types=col_types)
    endorc = time.time()

    multi_file_orc.summary()
    orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

    print("************** CSV parse time is {0}".format(endcsv - startcsv))
    print("************** ORC (without column type forcing) parse time is {0}".format(endorc1 - startorc1))
    print("************** ORC (with column type forcing) parse time is {0}".format(endorc - startorc))

    # compare the csv frame with the orc frame parsed with forced column types
    pyunit_utils.compare_frame_summary(csv_summary, orc_summary)

def parquet_parse_simple():
    """
    Tests Parquet parser by comparing the summary of the original csv frame with the h2o parsed Parquet frame.
    Basic use case of importing files with auto-detection of column types.
    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    csv = h2o.import_file(path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    parquet = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/parquet/airlines-simple.snappy.parquet"))

    csv.summary()
    csv_summary = h2o.frame(csv.frame_id)["frames"][0]["columns"]

    parquet.summary()
    parquet_summary = h2o.frame(parquet.frame_id)["frames"][0]["columns"]

    pyunit_utils.compare_frame_summary(csv_summary, parquet_summary)

def summary(self):
    self.eager()
    if self.is_local():
        x = self._data[0]
        t = 'int' if isinstance(x, int) else ('enum' if isinstance(x, str) else 'real')
        mins = [min(self._data)]
        maxs = [max(self._data)]
        n = len(self._data)
        mean = sum(self._data) / n if t != 'enum' else None
        ssq = 0
        zeros = 0
        missing = 0
        for x in self._data:
            if t != 'enum':  # guard: mean is None for enum columns
                delta = x - mean
                ssq += delta * delta
            if x == 0:
                zeros += 1
            if x is None or (t != 'enum' and isnan(x)):
                missing += 1
        stddev = sqrt(ssq / (n - 1)) if t != 'enum' else None
        return {'type': t, 'mins': mins, 'maxs': maxs, 'mean': mean,
                'sigma': stddev, 'zeros': zeros, 'missing': missing}
    if self._summary:
        return self._summary
    j = h2o.frame(self._data)
    self._summary = j['frames'][0]['columns'][0]
    return self._summary

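# A pure-Python check of the local branch of summary() above: for a small
# numeric column the stats follow the textbook formulas, with sigma using the
# sample (n - 1) denominator. The data values are hypothetical.
def local_summary_check():
    data = [1.0, 2.0, 3.0, 4.0]
    n = len(data)
    mean = sum(data) / n
    ssq = sum((x - mean) ** 2 for x in data)
    sigma = (ssq / (n - 1)) ** 0.5
    assert mean == 2.5
    assert abs(sigma - 1.2909944487) < 1e-9
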
def ddply(self, cols, fun):
    """
    :param cols: Column names used to control grouping
    :param fun: Function to execute on each group.  Right now limited to textual Rapids expression
    :return: New frame with 1 row per-group, of results from 'fun'
    """
    if self._vecs is None or self._vecs == []:
        raise ValueError("Frame Removed")
    # Confirm all names present in dataset; collect column indices
    rapids_series = "(llist #" + " #".join([str(self._find_idx(name)) for name in cols]) + ")"

    # Eagerly eval and send the cbind'd frame over
    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()
    expr = "(= !{} (h2o.ddply %{} {} {}))".format(tmp_key, key, rapids_series, fun)
    h2o.rapids(expr)  # ddply in h2o
    # Remove h2o temp frame after ddply
    h2o.remove(key)
    # Make backing H2OVecs for the remote h2o vecs
    j = h2o.frame(tmp_key)   # Fetch the frame as JSON
    fr = j['frames'][0]      # Just the first (only) frame
    rows = fr['rows']        # Row count
    veckeys = fr['vec_ids']  # List of h2o vec keys
    cols = fr['columns']     # List of columns
    colnames = [col['label'] for col in cols]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))

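# Sketch of the Rapids expression ddply() above assembles, assuming grouping
# columns at indices 2 and 5; the key names and the textual Rapids function are
# hypothetical placeholders, not a documented Rapids program.
def ddply_expr_example():
    tmp_key, key = "py_tmp1", "frame1"
    rapids_series = "(llist #2 #5)"
    fun = "%some_rapids_fun"  # placeholder for a textual Rapids expression
    expr = "(= !{} (h2o.ddply %{} {} {}))".format(tmp_key, key, rapids_series, fun)
    assert expr == "(= !py_tmp1 (h2o.ddply %frame1 (llist #2 #5) %some_rapids_fun))"
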
def cbind(self, data):
    """
    :param data: H2OFrame or H2OVec
    :return: new H2OFrame with data cbinded to the end
    """
    # Check data type
    vecs = []
    if isinstance(data, H2OFrame):
        vecs.append(self)
        vecs.extend(data._vecs)
    elif isinstance(data, H2OVec):
        vecs = [self, data]
    else:
        raise ValueError("data parameter must be H2OVec or H2OFrame")
    names = [vec.name() for vec in vecs]

    fr = H2OFrame.py_tmp_key()
    cbind = "(= !" + fr + " (cbind %"
    cbind += " %".join([vec._expr.eager() for vec in vecs]) + "))"
    h2o.rapids(cbind)

    j = h2o.frame(fr)
    fr = j['frames'][0]
    rows = fr['rows']
    veckeys = fr['vec_ids']
    cols = fr['columns']
    colnames = [col['label'] for col in cols]
    result = H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))
    result.setNames(names)
    return result

def predict(self, test_data):
    """
    Predict on a dataset.
    :param test_data: Data to be predicted on.
    :return: A new H2OFrame filled with predictions.
    """
    if not test_data:
        raise ValueError("Must specify test data")
    # cbind the test_data vecs together and produce a temp key
    test_data_key = H2OFrame.send_frame(test_data)
    # get the predictions
    # this job call is blocking
    j = H2OConnection.post_json("Predictions/models/" + self._key + "/frames/" + test_data_key)
    # toast the cbound frame
    h2o.removeFrameShallow(test_data_key)
    # retrieve the prediction frame
    prediction_frame_key = j["model_metrics"][0]["predictions"]["frame_id"]["name"]
    # get the actual frame meta data
    pred_frame_meta = h2o.frame(prediction_frame_key)["frames"][0]
    # toast the prediction frame
    h2o.removeFrameShallow(prediction_frame_key)
    # collect the vec_ids
    vec_ids = pred_frame_meta["vec_ids"]
    # get the number of rows
    rows = pred_frame_meta["rows"]
    # get the column names
    cols = [col["label"] for col in pred_frame_meta["columns"]]
    # create a set of H2OVec objects
    vecs = H2OVec.new_vecs(zip(cols, vec_ids), rows)
    # return a new H2OFrame object
    return H2OFrame(vecs=vecs)

def quantile(self, prob=None, combine_method="interpolate"):
    """
    Compute quantiles over a given H2OFrame.

    :param prob: A list of probabilities, default is [0.01,0.1,0.25,0.333,0.5,0.667,0.75,0.9,0.99].
    You may provide any sequence of any length.
    :param combine_method: For even samples, how to combine quantiles.  Should be one of
    ["interpolate", "average", "low", "high"]
    :return: an H2OFrame containing the quantiles and probabilities.
    """
    if self._vecs is None or self._vecs == []:
        raise ValueError("Frame Removed")
    if len(self) == 0:
        return self
    if not prob:
        prob = [0.01, 0.1, 0.25, 0.333, 0.5, 0.667, 0.75, 0.9, 0.99]
    if not isinstance(prob, list):
        raise ValueError("prob must be a list")
    probs = "(dlist #" + " #".join([str(p) for p in prob]) + ")"
    if combine_method not in ["interpolate", "average", "low", "high"]:
        raise ValueError("combine_method must be one of: [" + ",".join(["interpolate", "average", "low", "high"]) + "]")
    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()
    expr = "(= !{} (quantile '{}' {} '{}'))".format(tmp_key, key, probs, combine_method)
    h2o.rapids(expr)
    # Remove h2o temp frame after quantile
    h2o.remove(key)
    # Make backing H2OVecs for the remote h2o vecs
    j = h2o.frame(tmp_key)
    fr = j['frames'][0]      # Just the first (only) frame
    rows = fr['rows']        # Row count
    veckeys = fr['vec_ids']  # List of h2o vec keys
    cols = fr['columns']     # List of columns
    colnames = [col['label'] for col in cols]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))

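# The prob argument to quantile() above is serialized into a Rapids "(dlist ...)"
# literal; a self-contained sketch of that serialization with hypothetical
# probabilities:
def quantile_probs_example():
    prob = [0.25, 0.5, 0.75]
    probs = "(dlist #" + " #".join([str(p) for p in prob]) + ")"
    assert probs == "(dlist #0.25 #0.5 #0.75)"
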
def __getitem__(self, i):
    if isinstance(i, int):
        return self._vecs[i]
    if isinstance(i, str):
        return self._find(i)
    # Slice; return a Frame not a Vec
    if isinstance(i, slice):
        return H2OFrame(vecs=self._vecs[i])
    # Row selection from a boolean Vec
    if isinstance(i, H2OVec):
        self._len_check(i)
        return H2OFrame(vecs=[x.row_select(i) for x in self._vecs])
    # have a list/tuple of numbers or strings
    if isinstance(i, list) or (isinstance(i, tuple) and len(i) != 2):
        vecs = []
        for it in i:
            if isinstance(it, int):
                vecs.append(self._vecs[it])
            elif isinstance(it, str):
                vecs.append(self._find(it))
            else:
                raise NotImplementedError
        return H2OFrame(vecs=vecs)
    # multi-dimensional slicing via 2-tuple
    if isinstance(i, tuple):
        j = h2o.frame(self.send_frame())
        fr = j['frames'][0]
        veckeys = [str(v['name']) for v in fr['vec_keys']]
        left = Expr(veckeys)
        rite = Expr((i[0], i[1]))
        return Expr("[", left, rite, length=2)
    raise NotImplementedError("Slicing by unknown type: " + str(type(i)))

def predict(self, test_data):
    """
    Predict on a dataset.

    :param test_data: Data to be predicted on.
    :return: A new H2OFrame filled with predictions.
    """
    if not test_data:
        raise ValueError("Must specify test data")
    # cbind the test_data vecs together and produce a temp key
    test_data_key = H2OFrame.send_frame(test_data)
    # get the predictions
    # this job call is blocking
    j = H2OConnection.post_json("Predictions/models/" + self._key + "/frames/" + test_data_key)
    # retrieve the prediction frame
    prediction_frame_key = j["model_metrics"][0]["predictions"]["frame_id"]["name"]
    # get the actual frame meta data
    pred_frame_meta = h2o.frame(prediction_frame_key)["frames"][0]
    # collect the vec_ids
    vec_ids = pred_frame_meta["vec_ids"]
    # get the number of rows
    rows = pred_frame_meta["rows"]
    # get the column names
    cols = [col["label"] for col in pred_frame_meta["columns"]]
    # create a set of H2OVec objects
    vecs = H2OVec.new_vecs(zip(cols, vec_ids), rows)
    # toast the cbound frame
    h2o.delete(test_data_key)
    # return a new H2OFrame object
    return H2OFrame(vecs=vecs)

def describe(self):
    """
    Generate an in-depth description of this H2OFrame.

    The description is a tabular print of the type, min, max, sigma, number of zeros,
    and number of missing elements for each H2OVec in this H2OFrame.

    :return: None (print to stdout)
    """
    if self._vecs is None or self._vecs == []:
        raise ValueError("Frame Removed")
    print "Rows:", len(self._vecs[0]), "Cols:", len(self)
    headers = [vec._name for vec in self._vecs]
    table = [
        self._row('type', None),
        self._row('mins', 0),
        self._row('mean', None),
        self._row('maxs', 0),
        self._row('sigma', None),
        self._row('zero_count', None),
        self._row('missing_count', None)
    ]
    chunk_summary_tmp_key = H2OFrame.send_frame(self)
    chunk_summary = h2o.frame(chunk_summary_tmp_key)["frames"][0]["chunk_summary"]
    h2o.remove(chunk_summary_tmp_key)
    print tabulate.tabulate(table, headers)
    print
    print chunk_summary
    print

def deepfeatures(self, test_data, layer):
    """
    Return hidden layer details

    :param test_data: Data to create a feature space on
    :param layer: 0 index hidden layer
    """
    if not test_data:
        raise ValueError("Must specify test data")
    # create test_data by cbinding vecs
    test_data_key = H2OFrame.send_frame(test_data)
    # get the deepfeatures of the dataset
    j = H2OConnection.post_json("Predictions/models/" + self._key + "/frames/" + test_data_key,
                                deep_features_hidden_layer=layer)
    # retrieve the frame data
    deepfeatures_frame_key = j["predictions_frame"]["name"]
    df_frame_meta = h2o.frame(deepfeatures_frame_key)["frames"][0]
    # create vecs by extracting vec_ids, col length, and col names
    vec_ids = df_frame_meta["vec_ids"]
    rows = df_frame_meta["rows"]
    cols = [col["label"] for col in df_frame_meta["columns"]]
    vecs = H2OVec.new_vecs(zip(cols, vec_ids), rows)
    # remove test data from kv
    h2o.delete(test_data_key)
    # finally return frame
    return H2OFrame(vecs=vecs)

def import_folder():
    """
    This test builds an H2O frame by importing the folder
    bigdata/laptop/parser/orc/milsongs_orc_csv and builds another H2O frame with the
    multi-file orc parser using multiple orc files saved in the directory
    bigdata/laptop/parser/orc/milsongs_orc.  It then compares the two frames to make sure
    they are equal.
    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    multi_file_csv = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/milsongs_orc_csv"))
    multi_file_orc = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/milsongs_orc"))

    multi_file_csv.summary()
    csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

    multi_file_orc.summary()
    orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

    pyunit_utils.compare_frame_summary(csv_summary, orc_summary)

def h2oframe():
    """
    Python API test: h2o.frame(frame_id)
    """
    training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
    frame_summary = h2o.frame(training_data.frame_id)
    assert_is_type(frame_summary, H2OResponse)
    assert frame_summary["frames"][0]['rows'] == training_data.nrow, "h2o.frame() command is not working."
    assert frame_summary["frames"][0]['column_count'] == training_data.ncol, "h2o.frame() command is not working."

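# Shape of the h2o.frame() response that the tests in this file index into -- a
# sketch of only the keys actually used above, not the full schema; all values
# are hypothetical.
def frame_response_shape_example():
    frame_summary = {
        "frames": [{
            "rows": 100,            # row count
            "column_count": 3,      # column count
            "vec_ids": [],          # backing vector keys
            "columns": [            # per-column summaries
                {"label": "a", "type": "real", "data": []},
            ],
        }]
    }
    assert frame_summary["frames"][0]["columns"][0]["label"] == "a"
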
def dim(self):
    """
    Eagerly evaluate the Expr.  If it's an H2OFrame, return the number of rows and columns.

    :return: The number of rows and columns in the H2OFrame as a list [rows, cols].
    """
    self.eager()
    if isinstance(self._data, unicode):
        frame = h2o.frame(self._data)
        return [frame['frames'][0]['rows'], len(frame['frames'][0]['columns'])]
    raise ValueError("data must be a (unicode) key")

def row_select(self, vec):
    """
    Boolean column select lookup

    :param vec: An H2OVec.
    :return: A new H2OVec.
    """
    e = Expr("[", self, vec)
    j = h2o.frame(e.eager())
    e.set_len(j['frames'][0]['rows'])
    return H2OVec(self._name, e)

def show(self, noprint=False):
    """
    Evaluate and print.

    :return: None
    """
    self.eager()
    if noprint:
        if isinstance(self._data, unicode):
            j = h2o.frame(self._data)
            data = [c['data'] if c['type'] != "string" else c["string_data"]
                    for c in j['frames'][0]['columns'][:]]
            domains = [c['domain'] for c in j['frames'][0]['columns']]
            for i in range(len(data)):
                if domains[i] is not None:
                    # k, not j: the frame response above is still needed
                    for k in range(len(data[i])):
                        if data[i][k] == "NaN":
                            continue
                        data[i][k] = domains[i][int(data[i][k])]
            data = map(list, zip(*data))
            return data[0:min(10, len(data))]
        return self._data
    else:
        if isinstance(self._data, unicode):
            j = h2o.frame(self._data)
            data = [c['data'] for c in j['frames'][0]['columns'][:]]
        elif isinstance(self._data, (int, float, str, list)):
            print self._data
            print
            return
        else:
            data = [self._data]
        t_data = map(list, zip(*data))
        t_data = t_data[0:min(10, len(t_data))]
        for didx, d in enumerate(t_data):
            t_data[didx].insert(0, didx)
        headers = ["Row ID"]
        for i in range(len(t_data[0])):
            headers.append('')
        print "Displaying first " + str(len(t_data)) + " row(s)"
        h2o.H2ODisplay(t_data, headers)

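# The domain-mapping loop in show() above turns enum level indices back into
# their string labels, skipping "NaN" markers for missing cells. A pure-Python
# sketch with hypothetical data:
def domain_mapping_example():
    data = [[0, "NaN", 1]]      # one enum column of level indices
    domains = [["cat", "dog"]]  # its domain (level labels)
    for i in range(len(data)):
        if domains[i] is not None:
            for k in range(len(data[i])):
                if data[i][k] == "NaN":
                    continue
                data[i][k] = domains[i][int(data[i][k])]
    assert data == [["cat", "NaN", "dog"]]
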
def parquet_parse_dates():
    parquet_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/parser/parquet/parquet-file-with-date-column.snappy.parquet"))

    parquet_data.summary()
    parquet_summary = h2o.frame(parquet_data.frame_id)["frames"][0]["columns"]
    date_converted_column_type = parquet_summary[2]['type']
    assert date_converted_column_type == "time"

    date_string_rows = parquet_data[:, "date_string"]
    date_converted_rows = parquet_data[:, "date_converted"]
    pyunit_utils.compare_frames(date_string_rows, date_converted_rows, 1)

def dim(self):
    """
    Eagerly evaluate the Expr.  If it's an H2OFrame, return the number of rows and columns.

    :return: The number of rows and columns in the H2OFrame as a list [rows, cols].
    """
    self.eager()
    if self.is_remote():  # potentially big data
        frame = h2o.frame(self._data)
        return [frame['frames'][0]['rows'], len(frame['frames'][0]['columns'])]
    elif self.is_local():  # small data
        return [1, 1] if not hasattr(self._data, '__len__') else [1, len(self._data)]
    raise ValueError("data must be local or remote")

def row_select(self, vec):
    """
    Boolean column select lookup

    :param vec: An H2OVec.
    :return: A new H2OVec.
    """
    e = Expr("[", self, vec)
    r = e.eager()
    if isinstance(r, (float, int)):
        e.set_len(1)
    else:
        j = h2o.frame(r)
        e.set_len(j['frames'][0]['rows'])
    return H2OVec(self._name, e)

def group_by(self, cols, a):
    """
    GroupBy

    :param cols: The columns to group on.
    :param a: A dictionary of aggregates having the following shape:
              {"colname": [aggregate, column, naMethod]}
              e.g.: {"bikes": ["count", 0, "all"]}

              The naMethod is one of "all", "ignore", or "rm", which specifies how to handle
              NAs that appear in columns that are being aggregated.

              "all" - include NAs
              "rm" - exclude NAs
              "ignore" - ignore NAs in aggregates, but count them (e.g. in denominators for
              mean, var, sd, etc.)
    :return: The group by frame.
    """
    if self._vecs is None or self._vecs == []:
        raise ValueError("Frame Removed")
    rapids_series = "(llist #" + " #".join([str(self._find_idx(name)) for name in cols]) + ")"
    aggregates = copy.deepcopy(a)
    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()

    aggs = []
    # transform cols in aggregates to their indices...
    for k in aggregates:
        if isinstance(aggregates[k][1], str):
            aggregates[k][1] = '#' + str(self._find_idx(aggregates[k][1]))
        else:
            aggregates[k][1] = '#' + str(aggregates[k][1])
        aggs += ["\"{1}\" {2} \"{3}\" \"{0}\"".format(str(k), *aggregates[k])]
    aggs = "(agg {})".format(" ".join(aggs))

    expr = "(= !{} (GB %{} {} {}))".format(tmp_key, key, rapids_series, aggs)
    h2o.rapids(expr)  # group by
    # Remove h2o temp frame after groupby
    h2o.delete(key)
    # Make backing H2OVecs for the remote h2o vecs
    j = h2o.frame(tmp_key)
    fr = j['frames'][0]      # Just the first (only) frame
    rows = fr['rows']        # Row count
    veckeys = fr['vec_ids']  # List of h2o vec keys
    cols = fr['columns']     # List of columns
    colnames = [col['label'] for col in cols]
    vecs = H2OVec.new_vecs(zip(colnames, veckeys), rows)  # Peel the Vecs out of the returned Frame
    h2o.delete(tmp_key)
    return H2OFrame(vecs=vecs)

def quantile(self, prob=None):
    if len(self) == 0:
        return self
    if not prob:
        prob = [0.01, 0.1, 0.25, 0.333, 0.5, 0.667, 0.75, 0.9, 0.99]
    if not isinstance(prob, list):
        raise ValueError("prob must be a list")
    probs = "(dlist #" + " #".join([str(p) for p in prob]) + ")"
    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()
    expr = "(= !{} (quantile '{}' {}))".format(tmp_key, key, probs)
    h2o.rapids(expr)
    j = h2o.frame(tmp_key)
    fr = j['frames'][0]       # Just the first (only) frame
    rows = fr['rows']         # Row count
    veckeys = fr['vec_keys']  # List of h2o vec keys
    cols = fr['columns']      # List of columns
    colnames = [col['label'] for col in cols]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))

def additional_parameters(ip, port):
    dest_frame = "29&devhex%"
    c_names = ["a", "b", "c"]
    c_types = ["enum", "enum", "enum"]

    fhex = h2o.import_file(h2o.locate("smalldata/jira/hexdev_29.csv"),
                           destination_frame=dest_frame,
                           col_names=c_names,
                           col_types=c_types)
    fhex.describe()

    assert fhex._id == dest_frame.replace("%", ".").replace("&", ".")
    assert fhex._col_names == c_names

    col_summary = h2o.frame(fhex._id)["frames"][0]["columns"]
    for i in range(len(col_summary)):
        assert col_summary[i]["type"] == c_types[i]

def var(self):
    """
    :return: The covariance matrix of the columns in this H2OFrame.
    """
    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()
    expr = "(= !{} (var %{} \"null\" %FALSE \"everything\"))".format(tmp_key, key)
    h2o.rapids(expr)
    # Remove h2o temp frame after var
    h2o.remove(key)
    j = h2o.frame(tmp_key)
    fr = j['frames'][0]
    rows = fr['rows']
    veckeys = fr['vec_keys']
    cols = fr['columns']
    colnames = [col['label'] for col in cols]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))

def group_by(self, cols, a):
    """
    GroupBy

    :param cols: The columns to group on.
    :param a: A dictionary of aggregates having the following shape:
              {"colname": [aggregate, column, naMethod]}
              e.g.: {"bikes": ["count", 0, "all"]}

              The naMethod is one of "all", "ignore", or "rm", which specifies how to handle
              NAs that appear in columns that are being aggregated.

              "all" - include NAs
              "rm" - exclude NAs
              "ignore" - ignore NAs in aggregates, but count them (e.g. in denominators for
              mean, var, sd, etc.)
    :return: The group by frame.
    """
    if self._vecs is None or self._vecs == []:
        raise ValueError("Frame Removed")
    rapids_series = "(llist #" + " #".join([str(self._find_idx(name)) for name in cols]) + ")"
    aggregates = copy.deepcopy(a)
    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()

    aggs = []
    # transform cols in aggregates to their indices...
    for k in aggregates:
        if isinstance(aggregates[k][1], str):
            aggregates[k][1] = '#' + str(self._find_idx(aggregates[k][1]))
        else:
            aggregates[k][1] = '#' + str(aggregates[k][1])
        aggs += ["\"{1}\" {2} \"{3}\" \"{0}\"".format(str(k), *aggregates[k])]
    aggs = "(agg {})".format(" ".join(aggs))

    expr = "(= !{} (GB %{} {} {}))".format(tmp_key, key, rapids_series, aggs)
    h2o.rapids(expr)  # group by
    # Remove h2o temp frame after groupby
    h2o.remove(key)
    # Make backing H2OVecs for the remote h2o vecs
    j = h2o.frame(tmp_key)
    fr = j['frames'][0]      # Just the first (only) frame
    rows = fr['rows']        # Row count
    veckeys = fr['vec_ids']  # List of h2o vec keys
    cols = fr['columns']     # List of columns
    colnames = [col['label'] for col in cols]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))

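# Shape of the aggregates argument that group_by()/groupby() above expect:
# {"colname": [aggregate, column, naMethod]}, where column may be an index or a
# name. The entries here are hypothetical.
def group_by_aggregates_example():
    a = {
        "bikes": ["count", 0, "all"],         # count column 0, keeping NAs
        "mean_temp": ["mean", "temp", "rm"],  # mean of column "temp", dropping NAs
    }
    for colname in a:
        agg, target, na = a[colname]
        assert na in ("all", "ignore", "rm")
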
def biases(self, vector_id=0):
    """
    Return the frame for the respective bias vector

    :param vector_id: an integer, ranging from 0 to number of layers, that specifies the bias vector to return.
    :return: an H2OFrame which represents the bias vector identified by vector_id
    """
    num_bias_vectors = len(self._model_json['output']['biases'])
    if vector_id not in range(num_bias_vectors):
        raise ValueError("Bias vector does not exist. Model has {0} bias vectors (0-based indexing), but vector {1} "
                         "was requested.".format(num_bias_vectors, vector_id))
    j = h2o.frame(self._model_json['output']['biases'][vector_id]['URL'].split('/')[3])
    fr = j['frames'][0]
    rows = fr['rows']
    vec_ids = fr['vec_ids']
    cols = fr['columns']
    colnames = [col['label'] for col in cols]
    result = H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, vec_ids), rows))
    return result

def weights(self, matrix_id=0):
    """
    Return the frame for the respective weight matrix

    :param matrix_id: an integer, ranging from 0 to number of layers, that specifies the weight matrix to return.
    :return: an H2OFrame which represents the weight matrix identified by matrix_id
    """
    num_weight_matrices = len(self._model_json['output']['weights'])
    if matrix_id not in range(num_weight_matrices):
        raise ValueError("Weight matrix does not exist. Model has {0} weight matrices (0-based indexing), but matrix "
                         "{1} was requested.".format(num_weight_matrices, matrix_id))
    j = h2o.frame(self._model_json['output']['weights'][matrix_id]['URL'].split('/')[3])
    fr = j['frames'][0]
    rows = fr['rows']
    vec_ids = fr['vec_ids']
    cols = fr['columns']
    colnames = [col['label'] for col in cols]
    result = H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, vec_ids), rows))
    return result

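# Usage sketch for weights()/biases() above; `model` stands in for a trained
# legacy deep learning model, so this is illustrative rather than runnable
# end-to-end. Both methods fetch the backing frame through h2o.frame() on a key
# parsed out of the model JSON.
def inspect_first_layer(model):
    w0 = model.weights(matrix_id=0)  # H2OFrame backing the input -> hidden matrix
    b0 = model.biases(vector_id=0)   # H2OFrame backing the hidden-layer biases
    return w0, b0
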
def groupby(self, cols, a):
    """
    GroupBy

    :param cols: The columns to group on.
    :param a: A dictionary of aggregates having the following shape:
              {"colname": [aggregate, column, naMethod]}
              e.g.: {"bikes": ["count", 0, "all"]}

              The naMethod is one of "all", "ignore", or "rm", which specifies how to handle
              NAs that appear in columns that are being aggregated.

              "all" - include NAs
              "rm" - exclude NAs
              "ignore" - ignore NAs in aggregates, but count them (e.g. in denominators for
              mean, var, sd, etc.)
    :return: The group by frame.
    """
    colnums = [str(self._find_idx(name)) for name in cols]
    rapids_series = "{" + ";".join(colnums) + "}"
    aggregates = copy.deepcopy(a)
    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()

    nAggs = len(aggregates)
    aggs = []
    # transform cols in aggregates to their indices...
    for k in aggregates:
        if isinstance(aggregates[k][1], str):
            aggregates[k][1] = '#' + str(self._find_idx(aggregates[k][1]))
        else:
            aggregates[k][1] = '#' + str(aggregates[k][1])
        aggs += ["\"{1}\" {2} \"{3}\" \"{0}\"".format(str(k), *aggregates[k])]
    aggs = "(agg #{} {})".format(nAggs, " ".join(aggs))

    expr = "(= !{} (GB %{} {} {}))".format(tmp_key, key, rapids_series, aggs)
    h2o.rapids(expr)  # group by
    j = h2o.frame(tmp_key)
    fr = j['frames'][0]       # Just the first (only) frame
    rows = fr['rows']         # Row count
    veckeys = fr['vec_keys']  # List of h2o vec keys
    cols = fr['columns']      # List of columns
    colnames = [col['label'] for col in cols]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))

def __getitem__(self, i):
    """
    Column selection via integer, string(name).
    Column selection via slice returns a subset of the H2OFrame.

    :param i: An int, str, slice, H2OVec, or list/tuple
    :return: An H2OVec, an H2OFrame, or scalar depending on the input slice.
    """
    if self._vecs is None or self._vecs == []:
        raise ValueError("Frame Removed")
    if isinstance(i, int):
        return self._vecs[i]
    if isinstance(i, str):
        return self._find(i)
    # Slice; return a Frame not a Vec
    if isinstance(i, slice):
        return H2OFrame(vecs=self._vecs[i])
    # Row selection from a boolean Vec
    if isinstance(i, H2OVec):
        self._len_check(i)
        return H2OFrame(vecs=[x.row_select(i) for x in self._vecs])
    # have a list/tuple of numbers or strings
    if isinstance(i, list) or (isinstance(i, tuple) and len(i) != 2):
        vecs = []
        for it in i:
            if isinstance(it, int):
                vecs.append(self._vecs[it])
            elif isinstance(it, str):
                vecs.append(self._find(it))
            else:
                raise NotImplementedError
        return H2OFrame(vecs=vecs)
    # multi-dimensional slicing via 2-tuple
    if isinstance(i, tuple):
        veckeys = [str(v._expr._data) for v in self._vecs]
        left = Expr(veckeys)
        rite = Expr((i[0], i[1]))
        res = Expr("[", left, rite, length=2)
        if not isinstance(i[0], int) or not isinstance(i[1], int):
            return res  # possible big data
        # small data (single value)
        res.eager()
        if res.is_local():
            return res._data
        j = h2o.frame(res._data)  # data is remote
        return map(list, zip(*[c['data'] for c in j['frames'][0]['columns'][:]]))[0][0]
    raise NotImplementedError("Slicing by unknown type: " + str(type(i)))

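# Which branch of __getitem__ above each index type hits -- an illustrative
# sketch; `fr` stands in for a vec-backed H2OFrame with a column named "a".
def getitem_dispatch_example(fr):
    v1 = fr[0]           # int     -> a single H2OVec
    v2 = fr["a"]         # str     -> H2OVec looked up by name
    sub1 = fr[1:3]       # slice   -> H2OFrame of those vecs
    sub2 = fr[[0, "a"]]  # list    -> H2OFrame of mixed index/name picks
    cell = fr[0, 1]      # 2-tuple -> big-data Expr, or a scalar fetched via h2o.frame()
    return v1, v2, sub1, sub2, cell
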