def weights_check(ip,port):
    """Check that GLM observation weights match the equivalent unweighted data.

    Random data is generated (100 rows, 20 columns; column 0 holds "a"/"b",
    the remaining columns are Gaussian draws) and three scenarios are compared:

    * one constant weight on every row versus no weights at all,
    * 0/1 weights versus keeping only the rows whose weight is 1,
    * 1/2 weights versus appending a second copy of every row weighted 2.

    :param ip: forwarded to ``h2o.init``.
    :param port: forwarded to ``h2o.init``.
    :raises AssertionError: when a model pair differs by 1e-6 or more in MSE
        (regression) or AUC (binomial).
    """
    # Connect to h2o
    h2o.init(ip,port)

    n_rows = 100
    n_cols = 20

    def check_same(data1, data2):
        # data2 is expected to carry one extra trailing "weights" column, which
        # is why its predictor slices end at 21 instead of 20.
        regression_plain = h2o.glm(x=data1[2:20], y=data1[1])
        regression_weighted = h2o.glm(x=data2[2:21], y=data2[1], weights_column="weights")
        binomial_plain = h2o.glm(x=data1[1:20], y=data1[0], family="binomial")
        binomial_weighted = h2o.glm(x=data2[1:21], y=data2[0], weights_column="weights", family="binomial")

        mse_gap = abs(regression_plain.mse() - regression_weighted.mse())
        assert mse_gap < 1e-6, "Expected mse's to be the same, but got {0}, and {1}".format(
            regression_plain.mse(), regression_weighted.mse())

        auc_gap = abs(binomial_plain.auc() - binomial_weighted.auc())
        assert auc_gap < 1e-6, "Expected auc's to be the same, but got {0}, and {1}".format(
            binomial_plain.auc(), binomial_weighted.auc())

    # Build the rows one cell at a time so the random draws happen in
    # row-major order.
    data = []
    for _ in range(n_rows):
        row = []
        for c in range(n_cols):
            if c == 0:
                row.append("ab"[random.randint(0,1)])
            else:
                row.append(random.gauss(0,1))
        data.append(row)
    h2o_data = h2o.H2OFrame(python_obj=data)

    # uniform weights same as no weights
    weight = random.uniform(.1,100)
    uniform_weights = [[weight] for _ in range(n_rows)]
    h2o_uniform_weights = h2o.H2OFrame(python_obj=uniform_weights)
    h2o_uniform_weights.setNames(["weights"])
    h2o_data_uniform_weights = h2o.cbind(h2o_data, h2o_uniform_weights)

    print("Checking that using uniform weights is equivalent to no weights:")
    print("")
    check_same(h2o_data, h2o_data_uniform_weights)

    # zero weights same as removed observations
    zero_weights = []
    for _ in range(n_rows):
        if random.randint(0,1):
            zero_weights.append([0])
        else:
            zero_weights.append([1])
    h2o_zero_weights = h2o.H2OFrame(python_obj=zero_weights)
    h2o_zero_weights.setNames(["weights"])
    h2o_data_zero_weights = h2o.cbind(h2o_data, h2o_zero_weights)
    h2o_data_zeros_removed = h2o_data[h2o_zero_weights["weights"] == 1]

    print("Checking that using some zero weights is equivalent to removing those observations:")
    print("")
    check_same(h2o_data_zeros_removed, h2o_data_zero_weights)

    # doubled weights same as doubled observations
    doubled_weights = []
    for _ in range(n_rows):
        if random.randint(0,1):
            doubled_weights.append([1])
        else:
            doubled_weights.append([2])
    h2o_doubled_weights = h2o.H2OFrame(python_obj=doubled_weights)
    h2o_doubled_weights.setNames(["weights"])
    h2o_data_doubled_weights = h2o.cbind(h2o_data, h2o_doubled_weights)

    # Append a second copy of every row whose weight is 2.
    doubled_data = copy.deepcopy(data)
    doubled_data.extend(row for row, w in zip(data, doubled_weights) if w[0] == 2)
    h2o_data_doubled = h2o.H2OFrame(python_obj=doubled_data)

    print("Checking that doubling some weights is equivalent to doubling those observations:")
    print("")
    check_same(h2o_data_doubled, h2o_data_doubled_weights)
def missing(ip,port):
    """Train deep learning models on weather data with growing shares of missing values.

    For each ratio in a fixed list, the weather CSV is uploaded afresh, seven
    columns are converted to factors, missing values are inserted into every
    column except column 23 (the response), the rows are split roughly 75/25
    into train and test, and a deep learning model is fitted. The value
    ``hh.error()[0][1]`` is recorded for every ratio.

    :param ip: forwarded to ``h2o.init``.
    :param port: forwarded to ``h2o.init``.
    :raises AssertionError: when the recorded errors sum to 2.2 or more.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip,port)

    missing_ratios = [0, 0.1, 0.25, 0.5, 0.75, 0.99]
    errors = [0] * len(missing_ratios)

    factor_columns = (
        15,  # ChangeTempDir
        16,  # ChangeTempMag
        17,  # ChangeWindDirect
        18,  # MaxWindPeriod
        19,  # RainToday
        21,  # PressureChange
        23,  # RainTomorrow
    )

    for slot, fraction in enumerate(missing_ratios):
        data = h2o.upload_file(h2o.locate("smalldata/junit/weather.csv"))
        for col in factor_columns:
            data[col] = data[col].asfactor()

        print("For missing {0}%".format(fraction * 100))

        # add missing values to the data section of the file (leave the response alone)
        if fraction > 0:
            resp = data[23]
            predictor_cols = list(set(range(data.ncol())) - set([23]))
            pred = data[predictor_cols]
            data_missing = pred.insert_missing_values(fraction=fraction)
            data_fin = h2o.cbind(data_missing, resp)
        else:
            data_fin = data

        # split into train + test datasets
        split = data_fin[0].runif()
        train = data_fin[split <= .75]
        test = data_fin[split > .75]

        hh = h2o.deeplearning(x=train[2:22],
                              y=train[23],
                              validation_x=test[2:22],
                              validation_y=test[23],
                              epochs=5,
                              reproducible=True,
                              seed=12345,
                              activation='RectifierWithDropout',
                              l1=1e-5,
                              input_dropout_ratio=0.2)
        errors[slot] = hh.error()[0][1]

    for fraction, error in zip(missing_ratios, errors):
        print("missing ratio: {0}% --> classification error: {1}".format(fraction * 100, error))

    assert sum(errors) < 2.2, "Sum of classification errors is too large!"
def robustness_index(self, data, model, targetcol=None, labels=None, nsamples=10, nrecods=100, clevels=None,
                     random_state=None):
    """
    Score ``model`` on bootstrap resamples of ``data`` and summarise the results.

    ``nsamples`` samples of ``nrecods`` rows each are drawn with replacement.
    For every sample the actual target values and the model predictions are
    passed to ``self.prepare_stats_sample``; afterwards
    ``self.prepare_robustness_index(clevels)`` is called and ``self.stats_df``
    is returned.

    :param data: pandas DataFrame or H2O frame holding the observations.
    :param model: object exposing ``predict(frame)``.
    :param targetcol: target column in ``data``; required when ``labels`` is empty.
        It is replaced internally when ``labels`` is supplied.
    :param labels: optional target values, one per row of ``data``; when given they
        are appended to ``data`` as the last column. Defaults to no labels.
    :param nsamples: number of bootstrap samples to draw.
    :param nrecods: number of rows in each bootstrap sample.
    :param clevels: levels forwarded to ``self.prepare_robustness_index``;
        defaults to ``[95]``.
    :param random_state: seed, ``RandomState`` instance or None. An integer seed
        is turned into a single generator shared by all samples, so the samples
        differ from each other while the run as a whole stays reproducible.
    :return: ``self.stats_df``
    :raises AssertionError: if neither ``targetcol`` nor ``labels`` is given, or
        if the number of labels differs from the number of rows.
    """
    # None sentinels instead of mutable default arguments.
    if labels is None:
        labels = []
    if clevels is None:
        clevels = [95]

    is_h2o_model = False
    col_types = None
    cols = data.columns

    # cbind features and target
    nlen = len(labels)
    if targetcol is None and nlen == 0:
        # Raised explicitly so the check is not stripped under ``python -O``.
        raise AssertionError("Either targetcol or labels must be specified")

    if nlen > 0:
        if nlen != data.shape[0]:
            raise AssertionError("Number of observations and number of labels must match")

        if isinstance(cols, pd.RangeIndex):
            targetcol = pd.Index([len(cols)])
            cols = pd.RangeIndex(start=0, stop=len(cols) + 1, step=1)
        else:
            targetcol = 'target'
            # pandas Index.append returns a new index (and rejects a bare
            # string), so build an explicit list; this also covers the case
            # where ``cols`` is already a plain list.
            cols = list(cols) + [targetcol]

        if is_h2o_frame(data):
            data = h2o.cbind(data, labels)
        else:
            data = pd.concat([data.reset_index(drop=True), labels], axis=1)

    # check the type of data
    if is_h2o_frame(data):
        # if h2o then convert it
        np_data = h2o.as_list(data).values
        col_types = list(data.types.values())
        is_h2o_model = True
    else:
        np_data = data.values

    # An integer seed handed to every resample() call would reproduce the very
    # same sample each time; seed one generator up front and let it advance.
    import numbers
    import numpy as np
    if isinstance(random_state, numbers.Integral):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    # The target is always treated as the last column of the resampled frame.
    target_pos = len(cols) - 1

    for i in range(1, nsamples + 1):
        logging.info("Sampling " + str(i))
        data_boot = resample(np_data, replace=True, n_samples=nrecods, random_state=rng)
        data_boot_df = pd.DataFrame(data=data_boot[0:, 0:], columns=cols)

        if is_h2o_model:
            data_boot_df = h2o.H2OFrame(data_boot_df, column_types=col_types)
            y_act = h2o.as_list(data_boot_df[targetcol]).values
            # remove the target column for predicting as model
            if nlen > 0:
                data_boot_df = data_boot_df.drop([target_pos], axis=1)
        else:
            # Positional access works for both integer and named column labels.
            y_act = data_boot_df.iloc[:, target_pos]
            # remove the target column for predicting as model
            if nlen > 0:
                data_boot_df = data_boot_df.iloc[:, :target_pos]

        preds = model.predict(data_boot_df)
        if preds.ndim == 1:
            y_preds = preds
        else:
            y_preds = preds[:, 0]

        # make necessary transformation for h2o frames
        if is_h2o_frame(y_preds):
            y_preds = convert_h2o_list(y_preds)
        if is_h2o_frame(y_act):
            y_act = convert_h2o_list(y_act)

        self.prepare_stats_sample(y_act, y_preds)

    # We have estimations from multiple samples. Now get the mean, se, and CI
    self.prepare_robustness_index(clevels)
    return self.stats_df
def cbind(ip, port):
    """Exercise column binding via ``h2o.cbind``, ``H2OFrame.cbind`` and ``H2OVec.cbind``.

    The checks assume the imported ``pub-180.csv`` frame has 12 rows and 4
    columns and that binding it with the ``v-11.csv`` frame raises
    ``EnvironmentError``.

    :param ip: forwarded to ``h2o.init``.
    :param port: forwarded to ``h2o.init``.
    :raises AssertionError: when any dimension or column-name check fails.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    frame = h2o.import_frame(path=h2o.locate('smalldata/jira/pub-180.csv'))
    other_frame = h2o.import_frame(path=h2o.locate('smalldata/jira/v-11.csv'))
    n_rows, n_cols = frame.dim()

    ##################################
    ##### non-mutating h2o.cbind #####
    ##################################
    # frame to frame
    frame_frame = h2o.cbind(frame, frame)
    r, c = frame_frame.dim()
    assert (n_rows, n_cols) == (12, 4), "unexpected dimensions in original"
    assert (r, c) == (12, 8), "unexpected dimensions in result"

    # vec to vec
    xx = frame[0]
    yy = frame[1]
    vec_vec = h2o.cbind(xx, yy)
    r, c = vec_vec.dim()
    assert (n_rows, n_cols) == (12, 4), "unexpected dimensions in original"
    assert (r, c) == (12, 2), "unexpected dimensions in result"

    # vec to frame
    frame_vec = h2o.cbind(frame, frame[1])
    r, c = frame_vec.dim()
    assert (n_rows, n_cols) == (12, 4), "unexpected dimensions in original"
    assert (r, c) == (12, 5), "unexpected dimensions in result"

    # frame to vec
    vec_frame = h2o.cbind(yy, frame)
    r, c = vec_frame.dim()
    assert (n_rows, n_cols) == (12, 4), "unexpected dimensions in original"
    assert (r, c) == (12, 5), "unexpected dimensions in result"

    # logical expressions
    logical_pair = h2o.cbind(frame[2] <= 5, frame[3] >= 4)
    r, c = logical_pair.dim()
    assert (r, c) == (12, 2), "unexpected dimensions in result"

    # sets column names correctly
    bound_names = h2o.cbind(xx, yy).names()
    assert bound_names == ['colgroup', 'colgroup2'], "expected column names to be the same"

    # unequal rows should fail
    try:
        h2o.cbind(frame, other_frame)
    except EnvironmentError:
        pass
    else:
        assert False, "expected an error"

    ###################################
    ##### mutating H2OFrame.cbind #####
    ###################################
    # frame to frame
    frame.cbind(frame)
    n_rows, n_cols = frame.dim()
    assert (n_rows, n_cols) == (12, 8), "unexpected dimensions in result"

    # frame to vec
    frame.cbind(yy)
    n_rows, n_cols = frame.dim()
    assert (n_rows, n_cols) == (12, 9), "unexpected dimensions in result"

    # logical expressions
    frame.cbind(frame[2] <= 5)
    n_rows, n_cols = frame.dim()
    assert (n_rows, n_cols) == (12, 10), "unexpected dimensions in result"

    # sets column names correctly
    expected_names = ['colgroup', 'colgroup2', 'col1', 'col2',
                      'colgroup', 'colgroup2', 'col1', 'col2',
                      'colgroup2', 'col1']
    assert frame.names() == expected_names, "expected column names to be the same"

    # unequal rows should fail (check currently disabled)
    #try:
    #    frame.cbind(other_frame)
    #    assert False, "expected an error"
    #except EnvironmentError:
    #    assert True

    ###################################
    ##### non-mutating H2OVec.cbind ###
    ###################################
    frame = h2o.import_frame(path=h2o.locate('smalldata/jira/pub-180.csv'))
    n_rows, n_cols = frame.dim()

    # vec to frame
    vec_with_frame = frame[1].cbind(frame)
    r, c = vec_with_frame.dim()
    assert (n_rows, n_cols) == (12, 4), "unexpected dimensions in original"
    assert (r, c) == (12, 5), "unexpected dimensions in result"

    # vec to vec
    vec_with_vec = frame[1].cbind(frame[2])
    r, c = vec_with_vec.dim()
    assert (n_rows, n_cols) == (12, 4), "unexpected dimensions in original"
    assert (r, c) == (12, 2), "unexpected dimensions in result"

    # logical expressions
    logical_with_logical = (frame[3] >= 4).cbind(frame[2] <= 5)
    r, c = logical_with_logical.dim()
    assert (n_rows, n_cols) == (12, 4), "unexpected dimensions in original"
    assert (r, c) == (12, 2), "unexpected dimensions in result"

    # sets column names correctly
    bound_names = xx.cbind(yy).names()
    assert bound_names == ['colgroup', 'colgroup2'], "expected column names to be the same"
def cbind(ip,port):
    """Run the column-binding checks for ``h2o.cbind``, ``H2OFrame.cbind`` and ``H2OVec.cbind``.

    The assertions expect the ``pub-180.csv`` frame to have 12 rows and 4
    columns, and expect binding it with the ``v-11.csv`` frame to raise
    ``EnvironmentError``.

    :param ip: forwarded to ``h2o.init``.
    :param port: forwarded to ``h2o.init``.
    :raises AssertionError: when any dimension or column-name check fails.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip,port)

    source = h2o.import_frame(path=h2o.locate('smalldata/jira/pub-180.csv'))
    unequal = h2o.import_frame(path=h2o.locate('smalldata/jira/v-11.csv'))
    src_rows, src_cols = source.dim()

    ##################################
    ##### non-mutating h2o.cbind #####
    ##################################
    # frame to frame
    doubled = h2o.cbind(source, source)
    out_rows, out_cols = doubled.dim()
    assert (src_rows, src_cols) == (12, 4), "unexpected dimensions in original"
    assert (out_rows, out_cols) == (12, 8), "unexpected dimensions in result"

    # vec to vec
    xx = source[0]
    yy = source[1]
    two_vecs = h2o.cbind(xx,yy)
    out_rows, out_cols = two_vecs.dim()
    assert (src_rows, src_cols) == (12, 4), "unexpected dimensions in original"
    assert (out_rows, out_cols) == (12, 2), "unexpected dimensions in result"

    # vec to frame
    frame_plus_vec = h2o.cbind(source, source[1])
    out_rows, out_cols = frame_plus_vec.dim()
    assert (src_rows, src_cols) == (12, 4), "unexpected dimensions in original"
    assert (out_rows, out_cols) == (12, 5), "unexpected dimensions in result"

    # frame to vec
    vec_plus_frame = h2o.cbind(yy,source)
    out_rows, out_cols = vec_plus_frame.dim()
    assert (src_rows, src_cols) == (12, 4), "unexpected dimensions in original"
    assert (out_rows, out_cols) == (12, 5), "unexpected dimensions in result"

    # logical expressions
    two_logicals = h2o.cbind(source[2] <= 5, source[3] >= 4)
    out_rows, out_cols = two_logicals.dim()
    assert (out_rows, out_cols) == (12, 2), "unexpected dimensions in result"

    # sets column names correctly
    names_after_bind = h2o.cbind(xx,yy).names()
    assert names_after_bind == ['colgroup', 'colgroup2'], "expected column names to be the same"

    # unequal rows should fail
    try:
        h2o.cbind(source, unequal)
    except EnvironmentError:
        pass
    else:
        assert False, "expected an error"

    ###################################
    ##### mutating H2OFrame.cbind #####
    ###################################
    # frame to frame
    source.cbind(source)
    src_rows, src_cols = source.dim()
    assert (src_rows, src_cols) == (12, 8), "unexpected dimensions in result"

    # frame to vec
    source.cbind(yy)
    src_rows, src_cols = source.dim()
    assert (src_rows, src_cols) == (12, 9), "unexpected dimensions in result"

    # logical expressions
    source.cbind(source[2] <= 5)
    src_rows, src_cols = source.dim()
    assert (src_rows, src_cols) == (12, 10), "unexpected dimensions in result"

    # sets column names correctly
    names_expected = ['colgroup', 'colgroup2', 'col1', 'col2',
                      'colgroup', 'colgroup2', 'col1', 'col2',
                      'colgroup2', 'col1']
    assert source.names() == names_expected, "expected column names to be the same"

    # unequal rows should fail (check currently disabled)
    #try:
    #    source.cbind(unequal)
    #    assert False, "expected an error"
    #except EnvironmentError:
    #    assert True

    ###################################
    ##### non-mutating H2OVec.cbind ###
    ###################################
    source = h2o.import_frame(path=h2o.locate('smalldata/jira/pub-180.csv'))
    src_rows, src_cols = source.dim()

    # vec to frame
    vec_then_frame = source[1].cbind(source)
    out_rows, out_cols = vec_then_frame.dim()
    assert (src_rows, src_cols) == (12, 4), "unexpected dimensions in original"
    assert (out_rows, out_cols) == (12, 5), "unexpected dimensions in result"

    # vec to vec
    vec_then_vec = source[1].cbind(source[2])
    out_rows, out_cols = vec_then_vec.dim()
    assert (src_rows, src_cols) == (12, 4), "unexpected dimensions in original"
    assert (out_rows, out_cols) == (12, 2), "unexpected dimensions in result"

    # logical expressions
    logical_then_logical = (source[3] >= 4).cbind(source[2] <= 5)
    out_rows, out_cols = logical_then_logical.dim()
    assert (src_rows, src_cols) == (12, 4), "unexpected dimensions in original"
    assert (out_rows, out_cols) == (12, 2), "unexpected dimensions in result"

    # sets column names correctly
    names_after_bind = xx.cbind(yy).names()
    assert names_after_bind == ['colgroup', 'colgroup2'], "expected column names to be the same"