예제 #1
0
def weights_check(ip,port):
    """Check that GLM observation weights act like row removal / replication.

    Builds a random 100 x 20 data set (column 0 holds 'a'/'b', the other 19
    columns hold gaussian draws) and compares GLM fits for three scenarios:
    a single uniform weight vs. no weights, 0/1 weights vs. dropping the
    zero-weight rows, and 1/2 weights vs. physically duplicating the rows
    that carry weight 2.

    :param ip: address handed to ``h2o.init``.
    :param port: port handed to ``h2o.init``.
    :raises AssertionError: when the MSE or AUC of a pair of fits differ by
        1e-6 or more.
    """
    # Connect to h2o
    h2o.init(ip,port)

    def check_same(data1, data2):
        """Fit regression and binomial GLMs on both frames and compare MSE / AUC.

        ``data2`` is expected to carry an extra trailing "weights" column,
        which is why its predictor slices reach one column further.
        """
        plain_reg = h2o.glm(x=data1[2:20], y=data1[1])
        weighted_reg = h2o.glm(x=data2[2:21], y=data2[1], weights_column="weights")
        plain_bin = h2o.glm(x=data1[1:20], y=data1[0], family="binomial")
        weighted_bin = h2o.glm(x=data2[1:21], y=data2[0], weights_column="weights", family="binomial")

        mse_message = "Expected mse's to be the same, but got {0}, and {1}"
        auc_message = "Expected auc's to be the same, but got {0}, and {1}"
        assert abs(plain_reg.mse() - weighted_reg.mse()) < 1e-6, \
            mse_message.format(plain_reg.mse(), weighted_reg.mse())
        assert abs(plain_bin.auc() - weighted_bin.auc()) < 1e-6, \
            auc_message.format(plain_bin.auc(), weighted_bin.auc())

    def random_row():
        # One categorical draw followed by 19 gaussian draws, in that order.
        return ["ab"[random.randint(0,1)]] + [random.gauss(0,1) for _ in range(19)]

    data = [random_row() for _ in range(100)]
    h2o_data = h2o.H2OFrame(python_obj=data)

    def attach_weights(weight_rows):
        # Upload the weights, name the column, and bind it to the base data.
        weights_frame = h2o.H2OFrame(python_obj=weight_rows)
        weights_frame.setNames(["weights"])
        return weights_frame, h2o.cbind(h2o_data, weights_frame)

    # --- uniform weights should equal no weights at all ---
    weight = random.uniform(.1,100)
    uniform_weights = [[weight] for _ in range(100)]
    _, h2o_data_uniform_weights = attach_weights(uniform_weights)

    print("Checking that using uniform weights is equivalent to no weights:")
    print("")
    check_same(h2o_data, h2o_data_uniform_weights)

    # --- zero weights should equal removing those observations ---
    zero_weights = []
    for _ in range(100):
        coin = random.randint(0,1)
        zero_weights.append([0] if coin else [1])
    h2o_zero_weights, h2o_data_zero_weights = attach_weights(zero_weights)
    h2o_data_zeros_removed = h2o_data[h2o_zero_weights["weights"] == 1]

    print("Checking that using some zero weights is equivalent to removing those observations:")
    print("")
    check_same(h2o_data_zeros_removed, h2o_data_zero_weights)

    # --- doubled weights should equal duplicating those observations ---
    doubled_weights = []
    for _ in range(100):
        coin = random.randint(0,1)
        doubled_weights.append([1] if coin else [2])
    _, h2o_data_doubled_weights = attach_weights(doubled_weights)

    # Every row once, plus a second copy of each row whose weight is 2.
    extra_rows = [row for row, w in zip(data, doubled_weights) if w[0] == 2]
    doubled_data = copy.deepcopy(data) + extra_rows
    h2o_data_doubled = h2o.H2OFrame(python_obj=doubled_data)

    print("Checking that doubling some weights is equivalent to doubling those observations:")
    print("")
    check_same(h2o_data_doubled, h2o_data_doubled_weights)
def missing(ip,port):
    """Train deep-learning models on weather data with growing missing-value ratios.

    For each ratio the weather data set is uploaded afresh, seven columns are
    converted to factors, missing values are injected into every column except
    the response (column 23), and a model is trained on a ~75/25 random split.
    The value ``hh.error()[0][1]`` of each model is collected and the sum over
    all ratios must stay below 2.2.

    :param ip: address handed to ``h2o.init``.
    :param port: port handed to ``h2o.init``.
    :raises AssertionError: when the summed errors reach 2.2 or more.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip,port)

    missing_ratios = [0, 0.1, 0.25, 0.5, 0.75, 0.99]
    # ChangeTempDir, ChangeTempMag, ChangeWindDirect, MaxWindPeriod,
    # RainToday, PressureChange, RainTomorrow
    categorical_columns = [15, 16, 17, 18, 19, 21, 23]
    errors = []

    for fraction in missing_ratios:
        data = h2o.upload_file(h2o.locate("smalldata/junit/weather.csv"))
        for col_idx in categorical_columns:
            data[col_idx] = data[col_idx].asfactor()

        print("For missing {0}%".format(fraction*100))

        # add missing values to the data section of the file (leave the response alone)
        data_fin = data
        if fraction > 0:
            resp = data[23]
            pred = data[list(set(range(data.ncol())) - set([23]))]
            data_missing = pred.insert_missing_values(fraction=fraction)
            data_fin = h2o.cbind(data_missing,resp)

        # split into train + test datasets using one uniform draw per row
        split_key = data_fin[0].runif()
        train = data_fin[split_key <= .75]
        test  = data_fin[split_key >  .75]

        hh = h2o.deeplearning(x=train[2:22], y=train[23],
                              validation_x=test[2:22], validation_y=test[23],
                              epochs=5, reproducible=True, seed=12345,
                              activation='RectifierWithDropout', l1=1e-5,
                              input_dropout_ratio=0.2)

        errors.append(hh.error()[0][1])

    for fraction, err in zip(missing_ratios, errors):
        print("missing ratio: {0}% --> classification error: {1}".format(fraction*100, err))

    assert sum(errors) < 2.2, "Sum of classification errors is too large!"
예제 #3
0
    def robustness_index(self, data, model, targetcol=None, labels=[], nsamples=10, nrecods=100, clevels=[95], random_state=None):
        """
        Score ``model`` on bootstrap resamples of ``data`` and summarise the results.

        Draws ``nsamples`` samples of ``nrecods`` rows each (with replacement),
        predicts every sample with ``model`` and passes the actual and predicted
        targets to ``self.prepare_stats_sample``. After the last sample
        ``self.prepare_robustness_index(clevels)`` is called.

        :param data: pandas DataFrame or H2OFrame holding the observations.
        :param model: object whose ``predict`` is called on every resample.
        :param targetcol: label of the target column inside ``data``; required
            when ``labels`` is empty and overwritten when ``labels`` is given.
        :param labels: optional target values, one per row of ``data``. They are
            appended to ``data`` as the last column and removed again before
            predicting. For pandas input they are handed to ``pd.concat``.
        :param nsamples: number of bootstrap samples to draw.
        :param nrecods: number of rows in each bootstrap sample.
        :param clevels: forwarded to ``self.prepare_robustness_index``.
        :param random_state: seed or generator forwarded to ``resample``.
        :return: ``self.stats_df``.
        :raises AssertionError: when neither ``targetcol`` nor ``labels`` is
            given, or when ``labels`` and ``data`` differ in length.
        """
        # NOTE(review): the mutable defaults are kept so the signature stays
        # unchanged; ``labels`` is never mutated in this method.
        is_h2o_model = False
        col_types = None
        cols = data.columns
        nlen = len(labels)

        # Explicit raises instead of ``assert False`` so the validation still
        # runs under ``python -O``; the exception type is unchanged.
        if targetcol is None and nlen == 0:
            raise AssertionError("Either targetcol or labels must be specified")

        if nlen > 0:
            if nlen != data.shape[0]:
                raise AssertionError("Number of observations and number of labels must match")

            # The labels become one extra trailing column; extend the column
            # labels accordingly and remember the label of that new column.
            if isinstance(cols, pd.RangeIndex):
                target_label = len(cols)
                targetcol = pd.Index([target_label])
                cols = pd.RangeIndex(start=0, stop=target_label + 1, step=1)
            else:
                target_label = targetcol = 'target'
                # ``Index.append`` returns a new index (and rejects a plain
                # string), so build the extended label list explicitly.
                cols = list(cols) + [target_label]

            if is_h2o_frame(data):
                data = h2o.cbind(data, labels)
            else:
                data = pd.concat([data.reset_index(drop=True), labels], axis=1)
        else:
            target_label = targetcol

        # check the type of data
        if is_h2o_frame(data):  # if h2o then convert it
            np_data = h2o.as_list(data).values
            col_types = list(data.types.values())
            is_h2o_model = True
        else:
            np_data = data.values

        # Assuming ``resample`` is sklearn.utils.resample: an integer seed passed
        # on every call re-seeds each time and yields the same sample nsamples
        # times. Seed one generator up front so the samples differ while the
        # whole run stays reproducible.
        import numpy as np
        if isinstance(random_state, (int, np.integer)):
            random_state = np.random.RandomState(random_state)

        for i in range(1, nsamples + 1):
            logging.info("Sampling %s", i)
            data_boot = resample(np_data, replace=True, n_samples=nrecods, random_state=random_state)
            data_boot_df = pd.DataFrame(data=data_boot[0:, 0:], columns=cols)
            if is_h2o_model:
                data_boot_df = h2o.H2OFrame(data_boot_df, column_types=col_types)
                y_act = h2o.as_list(data_boot_df[targetcol]).values
                drop_key = len(cols) - 1  # positional, as before
            else:
                # Look the target up by its label; a hard-coded integer label
                # only exists when the columns are integer-labelled.
                y_act = data_boot_df[target_label]
                drop_key = target_label

            # remove the appended target column before predicting
            if nlen > 0:
                data_boot_df = data_boot_df.drop([drop_key], axis=1)

            preds = model.predict(data_boot_df)
            y_preds = preds if preds.ndim == 1 else preds[:, 0]

            # make necessary transformation for h2o frames
            if is_h2o_frame(y_preds):
                y_preds = convert_h2o_list(y_preds)

            if is_h2o_frame(y_act):
                y_act = convert_h2o_list(y_act)

            self.prepare_stats_sample(y_act, y_preds)

        # We have estimations from multiple samples. Now get the mean, se, and CI
        self.prepare_robustness_index(clevels)
        return self.stats_df
예제 #4
0
def cbind(ip, port):
    """Exercise column binding via h2o.cbind, H2OFrame.cbind and vec.cbind.

    Uses the 12 x 4 frame from 'smalldata/jira/pub-180.csv' and checks the
    dimensions and column names of every combination, plus the failure when
    frames of unequal row count are bound with ``h2o.cbind``.

    :param ip: address handed to ``h2o.init``.
    :param port: port handed to ``h2o.init``.
    :raises AssertionError: when any dimension or name check fails.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    hdf = h2o.import_frame(path=h2o.locate('smalldata/jira/pub-180.csv'))
    otherhdf = h2o.import_frame(path=h2o.locate('smalldata/jira/v-11.csv'))
    src_rows, src_cols = hdf.dim()

    def verify(result, expected_cols, check_source=True):
        # Read the result's dimensions, optionally re-check the recorded
        # source dimensions, then check the result has 12 rows and the
        # expected number of columns.
        res_rows, res_cols = result.dim()
        if check_source:
            assert src_rows == 12 and src_cols == 4, "unexpected dimensions in original"
        assert res_rows == 12 and res_cols == expected_cols, "unexpected dimensions in result"

    # ---------------- non-mutating h2o.cbind ----------------
    # frame to frame
    verify(h2o.cbind(hdf, hdf), 8)

    # vec to vec
    xx = hdf[0]
    yy = hdf[1]
    verify(h2o.cbind(xx, yy), 2)

    # vec to frame
    verify(h2o.cbind(hdf, hdf[1]), 5)

    # frame to vec
    verify(h2o.cbind(yy, hdf), 5)

    # logical expressions
    verify(h2o.cbind(hdf[2] <= 5, hdf[3] >= 4), 2, check_source=False)

    # sets column names correctly
    pair_names = h2o.cbind(xx, yy).names()
    assert pair_names == ['colgroup', 'colgroup2'], "expected column names to be the same"

    # unequal rows should fail
    try:
        h2o.cbind(hdf, otherhdf)
    except EnvironmentError:
        pass
    else:
        assert False, "expected an error"

    # ---------------- mutating H2OFrame.cbind ----------------
    # frame to frame
    hdf.cbind(hdf)
    verify(hdf, 8, check_source=False)

    # frame to vec
    hdf.cbind(yy)
    verify(hdf, 9, check_source=False)

    # logical expressions
    hdf.cbind(hdf[2] <= 5)
    verify(hdf, 10, check_source=False)

    # sets column names correctly
    expected_names = ['colgroup', 'colgroup2', 'col1', 'col2',
                      'colgroup', 'colgroup2', 'col1', 'col2',
                      'colgroup2', 'col1']
    assert hdf.names() == expected_names, "expected column names to be the same"

    # NOTE: the unequal-rows check for the mutating form (hdf.cbind(otherhdf))
    # was disabled in the source and is not exercised here.

    # ---------------- non-mutating vec.cbind ----------------
    hdf = h2o.import_frame(path=h2o.locate('smalldata/jira/pub-180.csv'))
    src_rows, src_cols = hdf.dim()

    # vec to frame
    verify(hdf[1].cbind(hdf), 5)

    # vec to vec
    verify(hdf[1].cbind(hdf[2]), 2)

    # logical expressions
    verify((hdf[3] >= 4).cbind(hdf[2] <= 5), 2)

    # sets column names correctly
    pair_names = xx.cbind(yy).names()
    assert pair_names == ['colgroup', 'colgroup2'], "expected column names to be the same"
예제 #5
0
def cbind(ip,port):
  """Check column binding through h2o.cbind, H2OFrame.cbind and vec.cbind.

  Works on the 12 x 4 frame loaded from 'smalldata/jira/pub-180.csv': every
  bind is checked for the expected dimensions and column names, and binding
  frames with different row counts through ``h2o.cbind`` must raise
  ``EnvironmentError``.

  :param ip: address handed to ``h2o.init``.
  :param port: port handed to ``h2o.init``.
  :raises AssertionError: when any dimension or name check fails.
  """
  # Connect to a pre-existing cluster
  h2o.init(ip,port)

  BAD_ORIGINAL = "unexpected dimensions in original"
  BAD_RESULT = "unexpected dimensions in result"
  BAD_NAMES = "expected column names to be the same"
  ORIGINAL_SHAPE = (12, 4)
  PAIR_NAMES = ['colgroup', 'colgroup2']

  hdf = h2o.import_frame(path=h2o.locate('smalldata/jira/pub-180.csv'))
  otherhdf = h2o.import_frame(path=h2o.locate('smalldata/jira/v-11.csv'))
  rows, cols = hdf.dim()

  # ===== non-mutating h2o.cbind =====
  # frame to frame
  frame_frame = h2o.cbind(hdf, hdf)
  r, c = frame_frame.dim()
  assert (rows, cols) == ORIGINAL_SHAPE, BAD_ORIGINAL
  assert (r, c) == (12, 8), BAD_RESULT

  # vec to vec
  xx = hdf[0]
  yy = hdf[1]
  vec_vec = h2o.cbind(xx,yy)
  r, c = vec_vec.dim()
  assert (rows, cols) == ORIGINAL_SHAPE, BAD_ORIGINAL
  assert (r, c) == (12, 2), BAD_RESULT

  # vec to frame
  frame_vec = h2o.cbind(hdf, hdf[1])
  r, c = frame_vec.dim()
  assert (rows, cols) == ORIGINAL_SHAPE, BAD_ORIGINAL
  assert (r, c) == (12, 5), BAD_RESULT

  # frame to vec
  vec_frame = h2o.cbind(yy,hdf)
  r, c = vec_frame.dim()
  assert (rows, cols) == ORIGINAL_SHAPE, BAD_ORIGINAL
  assert (r, c) == (12, 5), BAD_RESULT

  # logical expressions
  logical_pair = h2o.cbind(hdf[2] <= 5, hdf[3] >= 4)
  r, c = logical_pair.dim()
  assert (r, c) == (12, 2), BAD_RESULT

  # sets column names correctly
  assert h2o.cbind(xx,yy).names() == PAIR_NAMES, BAD_NAMES

  # unequal rows should fail
  try:
    h2o.cbind(hdf, otherhdf)
  except EnvironmentError:
    pass
  else:
    assert False, "expected an error"

  # ===== mutating H2OFrame.cbind =====
  # frame to frame
  hdf.cbind(hdf)
  rows, cols = hdf.dim()
  assert (rows, cols) == (12, 8), BAD_RESULT

  # frame to vec
  hdf.cbind(yy)
  rows, cols = hdf.dim()
  assert (rows, cols) == (12, 9), BAD_RESULT

  # logical expressions
  hdf.cbind(hdf[2] <= 5)
  rows, cols = hdf.dim()
  assert (rows, cols) == (12, 10), BAD_RESULT

  # sets column names correctly
  grown_names = ['colgroup','colgroup2','col1','col2',
                 'colgroup','colgroup2','col1','col2',
                 'colgroup2','col1']
  assert hdf.names() == grown_names, BAD_NAMES

  # NOTE: the unequal-rows check for the mutating form (hdf.cbind(otherhdf))
  # was disabled in the source and is not exercised here.

  # ===== non-mutating vec.cbind =====
  hdf = h2o.import_frame(path=h2o.locate('smalldata/jira/pub-180.csv'))
  rows, cols = hdf.dim()

  # vec to frame
  vec_frame = hdf[1].cbind(hdf)
  r, c = vec_frame.dim()
  assert (rows, cols) == ORIGINAL_SHAPE, BAD_ORIGINAL
  assert (r, c) == (12, 5), BAD_RESULT

  # vec to vec
  vec_vec = hdf[1].cbind(hdf[2])
  r, c = vec_vec.dim()
  assert (rows, cols) == ORIGINAL_SHAPE, BAD_ORIGINAL
  assert (r, c) == (12, 2), BAD_RESULT

  # logical expressions
  logical_pair = (hdf[3] >= 4).cbind(hdf[2] <= 5)
  r, c = logical_pair.dim()
  assert (rows, cols) == ORIGINAL_SHAPE, BAD_ORIGINAL
  assert (r, c) == (12, 2), BAD_RESULT

  # sets column names correctly
  assert xx.cbind(yy).names() == PAIR_NAMES, BAD_NAMES