예제 #1
0
def import_svmlight(path, headers=""):
    raw = h2o.lazy_import(path)
    if settings.debug and len(headers) < 100:
        print utils.time() + "import with headers: " + str(headers)
    #parsesetup = h2o.parse_setup(raw,column_names=headers)
    parsesetup = h2o.parse_setup(
        raw
    )  # Issue: H2O 3.8 tests length of header vs. columns, but still imports the "pseudotarget" additionally
    parsesetup['parse_type'] = 'SVMLight'
    loaded_frame = h2o.parse_raw(parsesetup)
    if settings.debug:
        print "......HEader length: " + str(len(headers))
        print "......Frame imported: " + str(loaded_frame.ncol)
    if (len(headers) > loaded_frame.ncol):
        n = len(headers) - loaded_frame.ncol
        print "Remove last " + str(n) + " header entries"
        del headers[-n:]
    loaded_frame.set_names(headers)  #Workaround, Set names now
    print "First column: " + loaded_frame.names[
        0]  #needed because lazy name setting
    if settings.debug and len(headers) < 100: loaded_frame.head(show=True)
    loaded_frame.pop(0)  #remove first ('pseudotarget') columnn
    #if loaded_frame.ncol>len(headers)-1: #workaround: H2O reads info from svmlight into columns -> remove everything that is not in headers
    #    delete = []
    #    for i in xrange(len(headers)-1,loaded_frame.ncol):
    #        delete.append(loaded_frame.names[i])
    #    loaded_frame = remove_vecs(loaded_frame,delete)
    if settings.debug and len(headers) < 100: loaded_frame.head(show=True)
    return loaded_frame
예제 #2
0
def parse_false():
    """Import without parsing, then parse manually and verify the result type."""
    raw_files = h2o.import_file(tests.locate("smalldata/jira/hexdev_29.csv"),
                                parse=False)
    assert isinstance(raw_files, list)

    parsed = h2o.parse_raw(h2o.parse_setup(raw_files))
    parsed.summary()
    assert parsed.__class__.__name__ == "H2OFrame"
예제 #3
0
def parse_false():
    """Deferred-parse import: import_file(parse=False) returns a list of keys
    which are then parsed explicitly via parse_setup/parse_raw."""
    csv_path = tests.locate("smalldata/jira/hexdev_29.csv")
    unparsed = h2o.import_file(csv_path, parse=False)
    assert isinstance(unparsed, list)

    setup = h2o.parse_setup(unparsed)
    frame = h2o.parse_raw(setup)
    frame.summary()
    assert frame.__class__.__name__ == "H2OFrame"
예제 #4
0
def h2oparse_raw():
    """
    Python API test: h2o.parse_raw(setup, id=None, first_line_is_header=0)

    copied from pyunit_hexdev_29_parse_false.py
    """
    unparsed = h2o.import_file(pyunit_utils.locate("smalldata/jira/hexdev_29.csv"),
                               parse=False)
    assert isinstance(unparsed, list)

    setup = h2o.parse_setup(unparsed)
    frame = h2o.parse_raw(setup, id='hexdev_29.hex', first_line_is_header=0)
    frame.summary()
    assert_is_type(frame, H2OFrame)
def continuous_or_categorical():
  """Force the first three columns to ENUM in the parse setup, then verify
  they parse as factor (categorical) columns."""
  raw = h2o.lazy_import(h2o.locate("smalldata/jira/hexdev_29.csv"))
  setup = h2o.parse_setup(raw)
  for idx in (0, 1, 2):
    setup["column_types"][idx] = "ENUM"

  parsed = h2o.parse_raw(setup)

  parsed.summary()

  for col in ('h1', 'h2', 'h3'):
    assert parsed[col].isfactor()
예제 #6
0
def continuous_or_categorical():
    """Override the first three inferred column types to ENUM and confirm the
    parsed frame exposes them as factor columns."""
    setup = h2o.parse_setup(
        h2o.lazy_import(tests.locate("smalldata/jira/hexdev_29.csv")))
    col_types = setup["column_types"]
    col_types[0] = col_types[1] = col_types[2] = "ENUM"

    frame = h2o.parse_raw(setup)

    frame.summary()

    assert frame['h1'].isfactor()
    assert frame['h2'].isfactor()
    assert frame['h3'].isfactor()
예제 #7
0
def hexdev_394():
  path = tests.locate("smalldata/covtype/covtype.20k.data")
  trainraw = h2o.lazy_import(path)
  tsetup = h2o.parse_setup(trainraw)
  tsetup["column_types"][10] = "ENUM"
  tsetup["column_types"][11] = "ENUM"
  tsetup["column_types"][12] = "ENUM"
  train = h2o.parse_raw(tsetup)
  
  cols = train.col_names  # This returned space for first column name
  x_cols = [colname for colname in cols if colname != "C55"]
  x_cols
  
  
  splits = train.split_frame()
  newtrain = splits[0]
  newvalid = splits[1]
  newtrain_x = newtrain[x_cols]
  newtrain_y = newtrain[54].asfactor()
  newvalid_x = newvalid[x_cols]
  newvalid_y = newvalid[54].asfactor()
  
  
  my_gbm = h2o.gbm(y=newtrain_y,
                   validation_y=newvalid_y,
                   x=newtrain_x,
                   validation_x=newvalid_x,
                   distribution =  "multinomial",
                   ntrees=100,
                   learn_rate=0.1,
                   max_depth=6)
  
  split1, split2 = train.split_frame()
  
  newtrain_x = split1[x_cols]
  newtrain_y = split1[54].asfactor()
  newvalid_x = split2[x_cols]
  newvalid_y = split2[54].asfactor()
  
  my_gbm = h2o.gbm(y=newtrain_y,
                   validation_y=newvalid_y,
                   x=newtrain_x,
                   validation_x=newvalid_x,
                   distribution = "multinomial",
                   ntrees=100,
                   learn_rate=0.1,
                   max_depth=6) 

  print "KEEPING FRAME???"
  print train._keep
예제 #8
0
def continuous_or_categorical(ip, port):
    """Mark the first three columns ENUM in the parse setup and confirm they
    come back as factor columns.  ip/port are the legacy test-runner
    connection arguments; unused in the body."""
    unparsed = h2o.import_file(h2o.locate("smalldata/jira/hexdev_29.csv"))
    setup = h2o.parse_setup(unparsed)
    for position in (0, 1, 2):
        setup["column_types"][position] = "ENUM"

    frame = h2o.parse_raw(setup)

    frame.summary()

    for name in ("h1", "h2", "h3"):
        assert frame[name].isfactor()
예제 #9
0
def hexdev_394():
    path = tests.locate("smalldata/covtype/covtype.20k.data")
    trainraw = h2o.lazy_import(path)
    tsetup = h2o.parse_setup(trainraw)
    tsetup["column_types"][10] = "ENUM"
    tsetup["column_types"][11] = "ENUM"
    tsetup["column_types"][12] = "ENUM"
    train = h2o.parse_raw(tsetup)

    cols = train.col_names  # This returned space for first column name
    x_cols = [colname for colname in cols if colname != "C55"]
    x_cols

    splits = train.split_frame()
    newtrain = splits[0]
    newvalid = splits[1]
    newtrain_x = newtrain[x_cols]
    newtrain_y = newtrain[54].asfactor()
    newvalid_x = newvalid[x_cols]
    newvalid_y = newvalid[54].asfactor()

    my_gbm = h2o.gbm(y=newtrain_y,
                     validation_y=newvalid_y,
                     x=newtrain_x,
                     validation_x=newvalid_x,
                     distribution="multinomial",
                     ntrees=100,
                     learn_rate=0.1,
                     max_depth=6)

    split1, split2 = train.split_frame()

    newtrain_x = split1[x_cols]
    newtrain_y = split1[54].asfactor()
    newvalid_x = split2[x_cols]
    newvalid_y = split2[54].asfactor()

    my_gbm = h2o.gbm(y=newtrain_y,
                     validation_y=newvalid_y,
                     x=newtrain_x,
                     validation_x=newvalid_x,
                     distribution="multinomial",
                     ntrees=100,
                     learn_rate=0.1,
                     max_depth=6)

    print "KEEPING FRAME???"
    print train._keep
    def get_data(self, src_bucket="cargo.ml.training", obj_name="training_sample.csv"):
        """Load the training-sample CSV into an H2O frame.

        :param src_bucket: S3 bucket name — currently unused because the S3
            download below is commented out; kept for interface compatibility.
        :param obj_name: S3 object key — unused, see above.
        :returns: the parsed H2OFrame.
        """
        # boto3.setup_default_session(region_name='us-west-2')
        # s3_client = boto3.client('s3', aws_access_key_id=ACCESS_KID, aws_secret_access_key=ACCESS_KEY)
        input_path = os.path.join(INPUT_PATH, 'training_sample_input/training_sample.csv')
        # s3_client.download_file(src_bucket, obj_name, input_path)

        df_raw = h2o.import_file(input_path, parse=False)
        setup = h2o.parse_setup(df_raw,
                                destination_frame="training.hex",
                                header=1,
                                column_names=self.col_headers,
                                column_types=self.col_types)
        # Bug fix: parse with the configured setup. The original called
        # h2o.parse_raw(h2o.parse_setup(df_raw), ...), which silently
        # discarded the destination frame, header flag, and column
        # names/types prepared above.
        df = h2o.parse_raw(setup,
                           id='training.csv',
                           first_line_is_header=1)

        print("Input dataframe: ", df)
        return df