def levels(self, col=None):
    """
    Get the factor levels for this frame and the specified column index.

    When the frame has exactly one column the levels of that column are
    returned and ``col`` is not consulted. Otherwise ``col`` selects the
    column to inspect.

    :param col: A column index in this H2OFrame.
    :return: a list of strings that are the factor levels for the column, or
      None when the frame has several columns and no ``col`` was given, or
      when no level rows come back.
    """
    if self.ncol() == 1:
        target = self
    elif col is not None:
        target = ExprNode("[", self, None, col)
    else:
        # Several columns and nothing selected: there is nothing to look up.
        return None

    # The first entry returned by as_list is skipped; the remaining rows
    # each carry one level in their first cell.
    rows = h2o.as_list(H2OFrame(expr=ExprNode("levels", target))._frame(), False)[1:]
    if rows is None or rows == []:
        return None
    return [row[0] for row in rows]
def levels(self, col=None):
    """
    Get the factor levels for this frame and the specified column index.

    :param col: A column index in this H2OFrame. Not consulted when the frame
      has exactly one column. When omitted on a multi-column frame, the
      levels of the whole frame are requested.
    :return: For a single-column frame, or when ``col`` is given, a flat list
      of the level strings. For a multi-column frame with no ``col``, the
      rows returned by ``h2o.as_list`` (minus the first entry) as a list of
      lists. None when the result is empty.
    """
    single_column = self.ncol == 1
    whole_frame = single_column or col is None

    # Ask for the levels of the whole frame, or of just the selected column.
    target = self if whole_frame else ExprNode("[", self, None, col)
    rows = h2o.as_list(H2OFrame(expr=ExprNode("levels", target))._frame(), False)[1:]

    # Only the multi-column / no-col request keeps the row structure; every
    # other request refers to one column and is flattened.
    if single_column or col is not None:
        result = [level for row in rows for level in row]
    else:
        result = rows
    return result if result else None
def hist(self, breaks="Sturges", plot=True, **kwargs):
    """
    Compute a histogram over a numeric column. If breaks=="FD", the MAD is used over the IQR
    in computing bin width.

    :param breaks: Can be one of the following: A string: "Sturges", "Rice", "sqrt", "Doane", "FD", "Scott."
      A single number for the number of breaks splitting the range of the vec into number of breaks bins of
      equal width. Or, A vector of numbers giving the split points, e.g., c(-50,213.2123,9324834)
    :param plot: A logical value indicating whether or not a plot should be generated (default is True).
    :param kwargs: Only ``server`` is read: when truthy, the 'Agg' matplotlib backend is selected and
      ``plt.show()`` is not called.
    :return: if plot is True, then return None, else, an H2OFrame with these columns: breaks, counts,
      mids_true, mids, and density
    """
    frame = H2OFrame(expr=ExprNode("hist", self, breaks))._frame()
    total = frame["counts"].sum()

    # Density of bin i = (count_i / total) / (break_i - break_{i-1}).
    # Row 0 has no preceding break, so its density is pinned to 0.
    densities = [0]
    for i in range(1, frame["counts"].nrow()):
        fraction = (frame["counts"][i, :] / total)._scalar()
        bin_width = frame["breaks"][i, :]._scalar() - frame["breaks"][i - 1, :]._scalar()
        # 1.0 keeps the reciprocal from truncating under Python 2 integer division.
        densities.append(fraction * (1.0 / bin_width))

    densities_frame = H2OFrame(python_obj=[[d] for d in densities])
    densities_frame.setNames(["density"])
    frame = frame.cbind(densities_frame)

    if not plot:
        return frame

    headless = bool(kwargs.get('server'))
    try:
        # The import itself raises ImportError when matplotlib is missing, so
        # no separate module probe is needed.
        import matplotlib
        if headless:
            # NOTE(review): the `warn` keyword is not accepted by newer matplotlib
            # releases -- confirm against the supported matplotlib version.
            matplotlib.use('Agg', warn=False)
        import matplotlib.pyplot as plt
    except ImportError:
        print("matplotlib is required to make the histogram plot. Set `plot` to False, if a plot is not desired.")
        return

    lower = float(frame["breaks"][0, :])
    # The first two entries of each listing are dropped (presumably the header
    # row and the first data row) and replaced by a 0 count placed at the
    # lowest break.
    count_rows = h2o.as_list(frame["counts"], use_pandas=False)[2:]
    mid_rows = h2o.as_list(frame["mids"], use_pandas=False)[2:]
    counts = [0] + [float(row[0]) for row in count_rows]
    mids = [lower] + [float(row[0]) for row in mid_rows]

    column_name = self._col_names[0]
    plt.xlabel(column_name)
    plt.ylabel('Frequency')
    plt.title('Histogram of {0}'.format(column_name))
    plt.bar(mids, counts)
    if not headless:
        plt.show()
def hist(self, breaks="Sturges", plot=True, **kwargs):
    """
    Compute a histogram over a numeric column. If breaks=="FD", the MAD is used over the IQR
    in computing bin width.

    :param breaks: Can be one of the following: A string: "Sturges", "Rice", "sqrt", "Doane", "FD", "Scott."
      A single number for the number of breaks splitting the range of the vec into number of breaks bins of
      equal width. Or, A vector of numbers giving the split points, e.g., c(-50,213.2123,9324834)
    :param plot: A logical value indicating whether or not a plot should be generated (default is True).
    :param kwargs: Only ``server`` is read: when truthy, the 'Agg' matplotlib backend is selected and
      ``plt.show()`` is not called.
    :return: if plot is True, then return None, else, an H2OFrame with these columns: breaks, counts,
      mids_true, mids, and density
    """
    frame = H2OFrame(expr=ExprNode("hist", self, breaks))._frame()
    total = frame["counts"].sum()

    # Density of bin i = (count_i / total) / (break_i - break_{i-1}).
    # Row 0 has no preceding break, so its density is pinned to 0.
    densities = [0]
    for i in range(1, frame["counts"].nrow):
        fraction = (frame["counts"][i, :] / total)._scalar()
        bin_width = frame["breaks"][i, :]._scalar() - frame["breaks"][i - 1, :]._scalar()
        # 1.0 keeps the reciprocal from truncating under Python 2 integer division.
        densities.append(fraction * (1.0 / bin_width))

    densities_frame = H2OFrame(python_obj=[[d] for d in densities])
    densities_frame.setNames(["density"])
    frame = frame.cbind(densities_frame)

    if not plot:
        return frame

    headless = bool(kwargs.get('server'))
    try:
        # The import itself raises ImportError when matplotlib is missing, so
        # no separate module probe is needed.
        import matplotlib
        if headless:
            # NOTE(review): the `warn` keyword is not accepted by newer matplotlib
            # releases -- confirm against the supported matplotlib version.
            matplotlib.use('Agg', warn=False)
        import matplotlib.pyplot as plt
    except ImportError:
        print("matplotlib is required to make the histogram plot. Set `plot` to False, if a plot is not desired.")
        return

    lower = float(frame["breaks"][0, :])
    # The first two entries of each listing are dropped (presumably the header
    # row and the first data row) and replaced by a 0 count placed at the
    # lowest break.
    count_rows = h2o.as_list(frame["counts"], use_pandas=False)[2:]
    mid_rows = h2o.as_list(frame["mids"], use_pandas=False)[2:]
    counts = [0] + [float(row[0]) for row in count_rows]
    mids = [lower] + [float(row[0]) for row in mid_rows]

    column_name = self._col_names[0]
    plt.xlabel(column_name)
    plt.ylabel('Frequency')
    plt.title('Histogram of {0}'.format(column_name))
    plt.bar(mids, counts)
    if not headless:
        plt.show()
def structure(self):
    """
    Similar to R's str method: Compactly Display the Structure of this H2OFrame instance.

    Prints one header line with the frame id, row count and column count,
    then one line per column. Factor columns show their level count, their
    levels, and the result of matching the first rows against those levels.
    Other columns show the first rows' values prefixed with ``num``.

    :return: None
    """
    df = self.head().as_data_frame(use_pandas=False)
    nr = self.nrow
    nc = len(df[0])
    cn = df.pop(0)  # first row of the listing holds the column names
    width = max(len(c) for c in cn)
    isfactor = [c.isfactor() for c in self]
    numlevels = [self.nlevels(i) for i in range(nc)]
    lvls = self.levels()

    # Transpose the level table once so level_columns[i] holds column i's
    # levels. Only needed when at least one factor column exists.
    level_columns = None
    if any(isfactor):
        if lvls and not isinstance(lvls[0], (list, tuple)):
            # levels() hands back a flat list of strings for a single-column
            # frame; zipping that would transpose characters, not levels.
            level_columns = [tuple(lvls)]
        else:
            level_columns = list(zip(*lvls))

    print("H2OFrame '{}': \t {} obs. of {} variables(s)".format(self._id, nr, nc))
    for i in range(nc):
        # Pad after the name so the type descriptions line up.
        parts = ["$ {} {}: ".format(cn[i], ' ' * (width - len(cn[i])))]
        if isfactor[i]:
            col_levels = level_columns[i]
            parts.append("Factor w/ {} level(s) {},..: ".format(
                numlevels[i], '"' + '","'.join(col_levels) + '"'))
            matched = h2o.as_list(self[:10, i].match(list(col_levels)), False)[1:]
            parts.append(" ".join(it[0] for it in matched))
            parts.append("...")
        else:
            values = h2o.as_list(self[:10, i], False)[1:]
            parts.append("num {} ...".format(" ".join(it[0] for it in values)))
        # One space between pieces reproduces what consecutive Python 2
        # trailing-comma print statements emitted.
        print(" ".join(parts))