def glrm_iris():
  """Check that H2OPCA run with pca_method="GLRM" agrees with a GLRM model on iris.

  Both models are trained on every column of the iris frame with k=5, the
  STANDARDIZE transform and seed 21.  Their "importance" and "eigenvectors"
  output tables must match within 1e-6, and the proportion-of-variance cell
  inspected at the end must not exceed 1.
  """
  print("Importing iris.csv data...")
  iris_frame = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris.csv"))
  iris_frame.describe()

  print("@@@@@@  Building PCA with GramSVD...\n")
  pca_model = H2OPCA(k=5, transform="STANDARDIZE", pca_method="GLRM", use_all_factor_levels=True, seed=21)
  pca_model.train(x=iris_frame.names, training_frame=iris_frame)

  glrm_model = H2OGeneralizedLowRankEstimator(k=5, loss="Quadratic", transform="STANDARDIZE", recover_svd=True,
                                              seed=21)
  glrm_model.train(x=iris_frame.names, training_frame=iris_frame)

  # both models are trained at this point; keep a handle on their output sections
  pca_out = pca_model._model_json["output"]
  glrm_out = glrm_model._model_json["output"]

  # singular values and related statistics
  print("@@@@@@  Comparing eigenvalues between GramSVD and GLRM...\n")
  importance_columns = ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"]
  pyunit_utils.assert_H2OTwoDimTable_equal(pca_out["importance"], glrm_out["importance"], importance_columns,
                                           tolerance=1e-6)
  print("@@@@@@  Comparing eigenvectors between GramSVD and GLRM...\n")

  # singular vectors
  pyunit_utils.assert_H2OTwoDimTable_equal(pca_out["eigenvectors"], glrm_out["eigenvectors"], glrm_out["names"],
                                           tolerance=1e-6, check_sign=True)

  # the maximum proportional variance must be <= 1
  max_variance_share = pca_out["importance"].cell_values[1][1]
  assert max_variance_share <= 1, "Expected value <= 1.0 but received {0}".format(max_variance_share)
示例#2
0
    def init_for_pipeline(self):
        """
        Build an H2OPCA object from this estimator's parameters, for use as a step in sklearn.Pipeline.

        Only the entries of ``self._parms`` whose names appear among the ``co_varnames`` of
        ``H2OPCA.__init__`` are forwarded as keyword arguments; all other entries are dropped.

        :returns: H2OPCA object

        :examples:

        >>> from sklearn.pipeline import Pipeline
        >>> from h2o.transforms.preprocessing import H2OScaler
        >>> from h2o.estimators import H2ORandomForestEstimator
        >>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
        >>> pipe = Pipeline([("standardize", H2OScaler()),
        ...                  ("pca", H2OPrincipalComponentAnalysisEstimator(k=2).init_for_pipeline()),
        ...                  ("rf", H2ORandomForestEstimator(seed=42,ntrees=5))])
        >>> pipe.fit(iris[:4], iris[4])
        """
        from h2o.transforms.decomposition import H2OPCA
        # names known to H2OPCA.__init__ (its arguments and locals)
        accepted_names = H2OPCA.__init__.__code__.co_varnames
        init_kwargs = {}
        for name, value in self._parms.items():
            if name in accepted_names:
                init_kwargs[name] = value
        return H2OPCA(**init_kwargs)
示例#3
0
def glrm_arrests():
    """Compare the variance metrics reported by PCA and GLRM on USArrests.

    A standardized PCA (k=4, first four columns) and a GLRM with quadratic
    loss, no regularization weight, SVD initialization and recover_svd=True
    are trained; their "importance" tables must agree within 1e-4.
    """
    print("Importing USArrests.csv data...")
    arrests_path = pyunit_utils.locate("smalldata/pca_test/USArrests.csv")
    arrests = h2o.upload_file(arrests_path)

    pca_model = H2OPCA(k=4, transform="STANDARDIZE")
    pca_model.train(x=list(range(4)), training_frame=arrests)
    pca_model.summary()
    pca_model.show()

    print("H2O GLRM on standardized data with quadratic loss:\n")
    glrm_settings = dict(k=4,
                         transform="STANDARDIZE",
                         loss="Quadratic",
                         gamma_x=0,
                         gamma_y=0,
                         init="SVD",
                         recover_svd=True)
    glrm_model = H2OGeneralizedLowRankEstimator(**glrm_settings)
    glrm_model.train(x=arrests.names, training_frame=arrests)
    glrm_model.show()

    # the importance tables of PCA and GLRM must hold the same values
    pca_cells = pca_model._model_json["output"]["importance"]._cell_values
    glrm_cells = glrm_model._model_json["output"]["importance"]._cell_values
    tables_agree = pyunit_utils.equal_2D_tables(pca_cells, glrm_cells, tolerance=1e-4)
    assert tables_agree, "PCA and GLRM variance metrics do not agree.  Fix it please."

    sys.stdout.flush()
def pca_wideDataset_rotterdam():
    """Compare GramSVD and Power PCA on the rotterdam dataset.

    Every column except "relapse" is used as a predictor.  One transform is
    picked with randint, both PCA variants are trained with identical
    settings (k=8, impute_missing=True, seed=12345) and their "importance"
    and "eigenvectors" tables are compared with a 1e-6 tolerance.
    """
    h2o.remove_all()
    print("Importing Rotterdam.csv data...")
    rotterdam = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip"))
    predictors = list(set(rotterdam.names) - {"relapse"})

    transforms = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    chosen_transform = transforms[randint(0, len(transforms) - 1)]
    print("transform used on dataset is {0}.\n".format(chosen_transform))

    # settings shared by both PCA variants
    shared_settings = dict(k=8, impute_missing=True, transform=chosen_transform, seed=12345)

    gram_model = H2OPCA(**shared_settings)
    gram_model.train(x=predictors, training_frame=rotterdam)

    power_model = H2OPCA(pca_method="Power", **shared_settings)  # power
    power_model.train(x=predictors, training_frame=rotterdam)

    gram_out = gram_model._model_json["output"]
    power_out = power_model._model_json["output"]

    # singular values and related statistics, GramSVD as reference
    print("@@@@@@  Comparing eigenvalues between GramSVD and Power...\n")
    importance_columns = ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"]
    pyunit_utils.assert_H2OTwoDimTable_equal(gram_out["importance"],
                                             power_out["importance"],
                                             importance_columns,
                                             tolerance=1e-6,
                                             check_all=False)
    print("@@@@@@  Comparing eigenvectors between GramSVD and Power...\n")

    # singular vectors
    pyunit_utils.assert_H2OTwoDimTable_equal(gram_out["eigenvectors"],
                                             power_out["eigenvectors"],
                                             power_out["names"],
                                             tolerance=1e-6,
                                             check_sign=True,
                                             check_all=False)
示例#5
0
def screeplot_test():
    """Train a standardized PCA on AustraliaCoast and draw both scree plot types.

    ``screeplot`` is called once with type "barplot" and once with type
    "lines"; ``server=True`` is forwarded to each call.
    """
    plot_options = {'server': True}
    coast = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/AustraliaCoast.csv"))
    coast_pca = H2OPCA(k=4, transform="STANDARDIZE")
    coast_pca.train(x=list(range(8)), training_frame=coast)
    for plot_type in ("barplot", "lines"):
        coast_pca.screeplot(type=plot_type, **plot_options)
示例#6
0
def pca_pubdev_4167_OOM():
    """
    Compare GramSVD and Power PCA on a customer dataset.

    This pyunit is written to make sure PCA works with customer data.  It is mainly used by the customer to verify
    PCA operations and is not meant to run as a regular test, because the customer data must not be exposed.
    """
    h2o.remove_all()
    # one of the transforms is picked per run, so that repeated runs cover all of them
    transforms = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    chosen_transform = transforms[randint(0, len(transforms) - 1)]
    print("transform used on dataset is {0}.\n".format(chosen_transform))

    data_path = pyunit_utils.locate("/Users/wendycwong/gitBackup/SDatasets/pubdev_4167_Avkash/m120K.tar")
    training_data = h2o.import_file(path=data_path)  # Nidhi: import may not work

    gram_model = H2OPCA(k=training_data.ncols, transform=chosen_transform)
    gram_model.train(x=list(range(0, training_data.ncols)), training_frame=training_data)

    power_model = H2OPCA(k=training_data.ncols, transform=chosen_transform, pca_method="Power")
    power_model.train(x=list(range(0, training_data.ncols)), training_frame=training_data)

    gram_out = gram_model._model_json["output"]
    power_out = power_model._model_json["output"]

    # singular values and related statistics of the Power and GramSVD methods
    print("@@@@@@  Comparing eigenvalues between GramSVD and Power...\n")
    importance_columns = ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"]
    pyunit_utils.assert_H2OTwoDimTable_equal(gram_out["importance"],
                                             power_out["importance"],
                                             importance_columns,
                                             tolerance=1e-5,
                                             check_all=False)
    print("@@@@@@  Comparing eigenvectors between GramSVD and Power...\n")
    # singular vectors (compared with a much looser tolerance than the eigenvalues)
    pyunit_utils.assert_H2OTwoDimTable_equal(gram_out["eigenvectors"],
                                             power_out["eigenvectors"],
                                             power_out["names"],
                                             tolerance=1e-1,
                                             check_sign=True)
示例#7
0
def pca_arrests():
    """Train PCA models with k = 1, 2, 3 and 4 on the USArrests data.

    Each model is trained on the first four columns of the uploaded frame.
    """
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrestsH2O.describe()

    for k in range(1, 5):
        # Report k itself: the message used to print the zero-based loop index
        # ("0 dimensions") while the model was built with k = index + 1.
        print("H2O PCA with " + str(k) + " dimensions:\n")
        print("Using these columns: {0}".format(arrestsH2O.names))
        pca_h2o = H2OPCA(k=k)
        pca_h2o.train(x=list(range(4)), training_frame=arrestsH2O)
def _get_pca_model(predictor_col, response_col, train_f, val_f):
    """Train a 10-component Power-method PCA on ``train_f`` and log one varimp value.

    :param predictor_col: columns of ``train_f`` handed to ``train`` as ``x``.
    :param response_col: not used by this function.
    :param train_f: frame the PCA model is trained on.
    :param val_f: not used by this function.
    :returns: the trained ``H2OPCA`` model.
    """
    from h2o.transforms.decomposition import H2OPCA
    k = 10
    pca_decomp = H2OPCA(k=k,
                        transform="NONE",
                        pca_method="Power",
                        impute_missing=True)
    # Use the ``predictor_col`` argument; the previous code referenced the
    # undefined name ``predictor_columns`` and never read the parameter.
    pca_decomp.train(x=predictor_col, training_frame=train_f)
    pca_decomp.summary()
    # Explained Variance
    # NOTE(review): presumably row 2 of varimp() is the cumulative proportion
    # and index k-1 its value for the last component - confirm against H2O.
    logr.log_event('Training Accuracy', f'{pca_decomp.varimp()[2][k-1]}')
    return pca_decomp
示例#9
0
文件: pca.py 项目: zawlazaw/h2o-3
    def init_for_pipeline(self):
        """
        Create an H2OPCA object configured from this estimator, for use as a step in sklearn.Pipeline.

        Entries of ``self._parms`` are passed to ``H2OPCA`` only when their name is listed in the
        ``co_varnames`` of ``H2OPCA.__init__``.

        :returns: H2OPCA object
        """
        from h2o.transforms.decomposition import H2OPCA
        # names known to H2OPCA.__init__ decide which parameters are forwarded
        known_names = H2OPCA.__init__.__code__.co_varnames
        forwarded = dict((name, value) for name, value in self._parms.items() if name in known_names)
        return H2OPCA(**forwarded)
def scale_pca_rf_pipe():
    """Fit an sklearn Pipeline of H2OScaler, H2OPCA (k=2) and a random forest on iris.

    The first four columns of the frame are the predictors and column 4 is
    the response.
    """
    from h2o.transforms.preprocessing import H2OScaler
    from h2o.transforms.decomposition import H2OPCA
    from h2o.estimators.random_forest import H2ORandomForestEstimator
    from sklearn.pipeline import Pipeline

    iris_path = pyunit_utils.locate("smalldata/iris/iris_wheader.csv")
    iris = h2o.import_file(path=iris_path)

    # transformation pipeline built from sklearn's Pipeline and H2O transforms
    steps = [
        ("standardize", H2OScaler()),
        ("pca", H2OPCA(k=2)),
        ("rf", H2ORandomForestEstimator(seed=42, ntrees=50)),
    ]
    pipeline = Pipeline(steps)
    predictors, response = iris[:4], iris[4]
    pipeline.fit(predictors, response)
示例#11
0
def pca_car():
  """Train PCA on car.arff.txt ten times and check that no run is too slow.

  The duration of a run is the difference between the end_time and
  start_time fields of the model output; the largest one must stay below
  1000.
  """
  total_runs = 10
  durations = []

  car = h2o.import_file(path=pyunit_utils.locate("smalldata/pca_test/car.arff.txt"))  # Nidhi: import may not work
  for _ in range(total_runs):  # several runs, to get an idea of the run time
    car_pca = H2OPCA(k=car.ncols, transform="STANDARDIZE")
    car_pca.train(x=list(range(0, car.ncols)), training_frame=car)
    duration = car_pca._model_json['output']['end_time'] - car_pca._model_json['output']['start_time']
    durations.append(duration)
    print("PCA model training time with car.arff.txt data in ms is {0}".format(duration))

    h2o.remove(car_pca)

  slowest = max(durations)
  assert slowest < 1000, "PCA runs for car.arff.txt take too much time!"
示例#12
0
def pca_pubdev_4314():
    """Check the return types of ``varimp`` on a GramSVD PCA model.

    ``varimp(use_pandas=True)`` must be a DataFrame and ``varimp()`` must be
    a list.
    """
    print("Importing prostate_cat.csv data...\n")
    prostate_path = pyunit_utils.locate("smalldata/prostate/prostate_cat.csv")
    prostate = h2o.upload_file(prostate_path)
    prostate.describe()
    print("PCA with k = 3, retx = FALSE, transform = 'STANDARDIZE'")
    pca_model = H2OPCA(k=3, transform="StANDARDIZE", pca_method="GramSVD")
    pca_model.train(x=list(range(0,8)), training_frame=prostate)
    print(pca_model.summary())

    # pandas flavour first, then the plain-list flavour
    varimp_frame = pca_model.varimp(use_pandas=True)
    assert_is_type(varimp_frame, DataFrame)
    varimp_rows = pca_model.varimp()
    print(varimp_rows)
    assert_is_type(varimp_rows, list)
    sys.stdout.flush()
示例#13
0
def pca_arrests():
    """Train PCA models with k = 1, 2, 3 and 4 on the USArrests data.

    Each model is trained on the first four columns of the uploaded frame.
    The function runs on both Python 2 and Python 3.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3;
    # the former print statements were a SyntaxError on Python 3.
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrestsH2O.describe()

    from h2o.transforms.decomposition import H2OPCA

    for k in range(1, 5):
        # Report k itself: the message used to print the zero-based loop index
        # ("0 dimensions") while the model was built with k = index + 1.
        print("H2O PCA with " + str(k) + " dimensions:\n")
        print("Using these columns: {0}".format(arrestsH2O.names))
        pca_h2o = H2OPCA(k=k)
        # materialize the indices: range() is lazy on Python 3
        pca_h2o.train(x=list(range(4)), training_frame=arrestsH2O)
def pca_scoring():
    """Train a DEMEAN PCA (k=4) on USArrests and project the training data with it."""
    print("Importing arrests.csv data...")
    arrests_path = pyunit_utils.locate("smalldata/pca_test/USArrests.csv")
    arrests = h2o.upload_file(arrests_path)

    print("Run PCA with transform = 'DEMEAN'")

    demean_pca = H2OPCA(k=4, transform="DEMEAN")
    demean_pca.train(x=list(range(4)), training_frame=arrests)
    # TODO: demean_pca.show()

    print("Project training data into eigenvector subspace")
    projection = demean_pca.predict(arrests)
    print("H2O Projection:")
    projection.head()
示例#15
0
def scale_pca_rf_pipe():
    """Fit an sklearn Pipeline of H2OScaler, H2OPCA (k=2) and a random forest on iris.

    The first four columns of the frame are the predictors and column 4 is
    the response.
    """
    from h2o.transforms.preprocessing import H2OScaler
    from h2o.transforms.decomposition import H2OPCA
    # the import below should work as a replacement, but it does not yet:
    # https://0xdata.atlassian.net/browse/PUBDEV-5236
    #from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator as H2OPCA
    from h2o.estimators.random_forest import H2ORandomForestEstimator
    from sklearn.pipeline import Pipeline

    iris_path = pyunit_utils.locate("smalldata/iris/iris_wheader.csv")
    iris = h2o.import_file(path=iris_path)

    # transformation pipeline built from sklearn's Pipeline and H2O transforms
    stages = [
        ("standardize", H2OScaler()),
        ("pca", H2OPCA(k=2)),
        ("rf", H2ORandomForestEstimator(seed=42, ntrees=50)),
    ]
    pipeline = Pipeline(stages)
    features, target = iris[:4], iris[4]
    pipeline.fit(features, target)
示例#16
0
def scale_pca_rf_pipe():
    """Run a randomized hyper-parameter search over a scaler/PCA/random-forest pipeline.

    The pipeline is built from H2OScaler, H2OPCA and H2ORandomForestEstimator.
    RandomizedSearchCV draws 5 parameter settings, scores them with
    h2o_r2_score on a 5-fold H2OKFold split, and is fitted with column 0 of
    iris as the response and the remaining columns as predictors.  The best
    estimator is printed at the end.
    """
    from h2o.transforms.preprocessing import H2OScaler
    from h2o.transforms.decomposition import H2OPCA
    from h2o.estimators.random_forest import H2ORandomForestEstimator
    from sklearn.pipeline import Pipeline
    # NOTE(review): sklearn.grid_search and sklearn.metrics.scorer were removed
    # from later scikit-learn releases; kept as-is so the test keeps targeting
    # the sklearn version it was written for - confirm before upgrading.
    from sklearn.grid_search import RandomizedSearchCV
    from h2o.cross_validation import H2OKFold
    from h2o.model.regression import h2o_r2_score
    from sklearn.metrics.scorer import make_scorer
    from scipy.stats import randint

    iris = h2o.import_file(
        path=tests.locate("smalldata/iris/iris_wheader.csv"))

    # build  transformation pipeline using sklearn's Pipeline and H2O transforms
    pipe = Pipeline([("standardize", H2OScaler()),
                     ("pca", H2OPCA(n_components=2)),
                     ("rf", H2ORandomForestEstimator(seed=42, ntrees=50))])

    params = {
        "standardize__center": [True, False],  # Parameters to test
        "standardize__scale": [True, False],
        "pca__n_components": randint(2, iris[1:].shape[1]),
        "rf__ntrees": randint(50, 60),
        "rf__max_depth": randint(4, 8),
        "rf__min_rows": randint(5, 10),
    }

    custom_cv = H2OKFold(iris, n_folds=5, seed=42)
    random_search = RandomizedSearchCV(pipe,
                                       params,
                                       n_iter=5,
                                       scoring=make_scorer(h2o_r2_score),
                                       cv=custom_cv,
                                       random_state=42,
                                       n_jobs=1)

    random_search.fit(iris[1:], iris[0])

    # print(...) with one argument works on Python 2 and 3; the former print
    # statement was a SyntaxError on Python 3.
    print(random_search.best_estimator_)
示例#17
0
def pca_arrests():
    """Train PCA with k = 1..4 on USArrests through both PCA entry points.

    The same four models are built first with ``H2OPCA`` and then with
    ``H2OPrincipalComponentAnalysisEstimator``, each on the first four
    columns of the uploaded frame.
    """
    print("Importing USArrests.csv data...")
    arrests = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrests.describe()

    # Per the original comments, H2OPCA is the import from
    # h2o.transforms.decomposition and H2OPrincipalComponentAnalysisEstimator
    # the import from h2o.estimators.pca.
    for pca_class in (H2OPCA, H2OPrincipalComponentAnalysisEstimator):
        for k in range(1, 5):
            # Report k itself: the message used to print the zero-based loop
            # index ("0 dimensions") while the model was built with k = index + 1.
            print("H2O PCA with " + str(k) + " dimensions:\n")
            print("Using these columns: {0}".format(arrests.names))
            pca_h2o = pca_class(k=k)
            pca_h2o.train(x=list(range(4)), training_frame=arrests)
            # TODO: pca_h2o.show()
示例#18
0
def pca_prostate():
    """Train a Power-method PCA (k=3) on the prostate data and project the frame.

    CAPSULE, RACE, DPROS and DCAPS are converted to factors first; the model
    is trained on column indices 2 through 8.  The function runs on both
    Python 2 and Python 3.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3;
    # the former print statements were a SyntaxError on Python 3.
    print("Importing prostate.csv data...\n")
    prostate = h2o.upload_file(
        pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    print("Converting CAPSULE, RACE, DPROS and DCAPS columns to factors")
    for column in ("CAPSULE", "RACE", "DPROS", "DCAPS"):
        prostate[column] = prostate[column].asfactor()
    prostate.describe()

    # NOTE(review): the message announces transform = 'STANDARDIZE' while the
    # model below is built with transform="NONE" - confirm which is intended.
    print("PCA on columns 3 to 9 with k = 3, retx = FALSE, transform = 'STANDARDIZE'")

    fitPCA = H2OPCA(k=3, transform="NONE", pca_method="Power")
    # materialize the indices: range() is lazy on Python 3
    fitPCA.train(x=list(range(2, 9)), training_frame=prostate)
    pred = fitPCA.predict(prostate)

    print("Projection matrix:\n")
    pred.head()
    def setup_model(self):
        """
        Prepare the grid-search hyper-parameters that are used later on:

        1. Train a bare-bone PCA model and record its run time in self.model_run_time.
        2. Collect the griddable parameters, their types and their defaults from the model's parameter JSON.
        3. Generate hyper-parameter values with pyunit_utils.gen_grid_search.  Parameters that are flagged as
        griddable but in practice are not had to be discovered manually; they are listed in
        self.exclude_parameter_lists and handed to the generator.
        4. Scale max_runtime_secs by the measured run time and max_iterations by self.max_iter_scale.
        5. Derive self.final_hyper_params and self.possible_number_models, make sure max_runtime_secs and k
        are part of the final hyper-parameters, and write them to a JSON file.

        :return: None
        """
        # bare-bone model, needed to get hold of all parameters
        base_model = H2OPCA(k=10, transform="NONE", pca_method=self.pca_method)
        base_model.train(x=self.x_indices, training_frame=self.training1_data)

        self.model_run_time = pyunit_utils.find_grid_runtime([base_model])  # model train time
        print("Time taken to build a base barebone model is {0}".format(self.model_run_time))

        # all gridable parameters and their types
        gridables = pyunit_utils.get_gridables(base_model._model_json["parameters"])
        self.gridable_parameters, self.gridable_types, self.gridable_defaults = gridables

        # randomly generate griddable parameters, including values outside the legal range (for example alpha
        # values outside of 0 and 1)
        int_value_count = random.randint(1, self.max_int_number)
        real_value_count = random.randint(1, self.max_real_number)
        generated = pyunit_utils.gen_grid_search(base_model.full_parameters.keys(), self.hyper_params,
                                                 self.exclude_parameter_lists,
                                                 self.gridable_parameters, self.gridable_types,
                                                 self.gridable_defaults,
                                                 int_value_count,
                                                 self.max_int_val, self.min_int_val,
                                                 real_value_count,
                                                 self.max_real_val, self.min_real_val)
        self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults = generated

        # scale the max_runtime_secs values by the measured run time
        runtime_scale = self.time_scale * self.model_run_time
        if "max_runtime_secs" in self.hyper_params:
            self.hyper_params["max_runtime_secs"] = [runtime_scale * secs
                                                     for secs in self.hyper_params["max_runtime_secs"]]

        if 'max_iterations' in self.hyper_params:
            self.hyper_params['max_iterations'] = [self.max_iter_scale * iters
                                                   for iters in self.hyper_params['max_iterations']]

        # final_hyper_params only takes a subset of all griddable parameters, while hyper_params holds all of
        # them together with the generated grid-search values
        counted = pyunit_utils.check_and_count_models(self.hyper_params, self.params_zero_one,
                                                      self.params_more_than_zero,
                                                      self.params_more_than_one, self.params_zero_positive,
                                                      self.max_grid_model)
        self.possible_number_models, self.final_hyper_params = counted

        # max_runtime_secs must be present to restrict the unit test run time (and as a promise to Arno to test
        # for it); k must be included in the hyper-parameters as well.  Each forced parameter multiplies the
        # model count by the number of its values that pass the paired check.
        forced_parameters = (("max_runtime_secs", lambda value: value >= 0),
                             ("k", lambda value: value > 0))
        for name, is_usable in forced_parameters:
            if name in self.hyper_params and name not in self.final_hyper_params:
                self.final_hyper_params[name] = self.hyper_params[name]
                usable_count = sum(1 for value in self.hyper_params[name] if is_usable(value))
                self.possible_number_models = self.possible_number_models * usable_count

        # write the hyper-parameters that are used into a json file
        pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename,
                                                 self.final_hyper_params)
示例#20
0
def algo_max_runtime_secs():
    '''
    This pyunit test is written to ensure that max_runtime_secs can restrict the model training time for all
    h2o algos.  See PUBDEV-4702.

    For each algorithm a dataset is imported, an estimator is built and handed to grabRuntimeInfo together with
    an error bound and a second numeric factor; cleanUp is then called on the objects that are no longer needed.
    The process exits with status 1 when the sum of the module-level list model_within_max_runtime is positive.

    NOTE(review): grabRuntimeInfo and cleanUp are defined outside this block; presumably grabRuntimeInfo records
    its verdict in model_within_max_runtime - confirm against their definitions.
    '''
    # module-level state; it is only read inside this function
    global model_within_max_runtime
    global err_bound
    seed = 12345

    # deeplearning
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/gaussian_training1_set.csv"))
    # the last column is the response, every column before it is a predictor
    y_index = training1_data.ncol - 1
    x_indices = list(range(y_index))
    model = H2ODeepLearningEstimator(distribution='gaussian',
                                     seed=seed,
                                     hidden=[10, 10, 10])
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices, y_index)
    cleanUp([training1_data, model])

    # stack ensemble, stacking part is not iterative
    print(
        "******************** Skip testing stack ensemble.  Not an iterative algo."
    )

    # GBM run
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/multinomial_training1_set.csv"))
    y_index = training1_data.ncol - 1
    x_indices = list(range(y_index))
    # the response is rounded and turned into a factor for the multinomial models
    training1_data[y_index] = training1_data[y_index].round().asfactor()
    model = H2OGradientBoostingEstimator(distribution="multinomial", seed=seed)
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices, y_index)
    # only the model is cleaned up: the frame is reused by the GLM and random forest runs
    cleanUp([model])

    # GLM run
    model = H2OGeneralizedLinearEstimator(family='multinomial', seed=seed)
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices, y_index)
    cleanUp([model])

    # naivebayes, not iterative
    print(
        "******************** Skip testing Naives Bayes.  Not an iterative algo."
    )

    # random forest
    model = H2ORandomForestEstimator(ntrees=100, score_tree_interval=0)
    # NOTE(review): unlike the GBM and GLM calls, y_index is not passed here - confirm that the default
    # used by grabRuntimeInfo is the intended response column.
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices)
    cleanUp([model, training1_data])

    # deepwater (only exercised when H2ODeepWaterEstimator reports itself as available)
    if H2ODeepWaterEstimator.available():
        training1_data = h2o.import_file(
            path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
        training1_data = training1_data.drop('Site')
        training1_data['Angaus'] = training1_data['Angaus'].asfactor()
        y_index = "Angaus"
        x_indices = list(range(1, training1_data.ncol))
        model = H2ODeepWaterEstimator(epochs=50,
                                      hidden=[4096, 4096, 4096],
                                      hidden_dropout_ratios=[0.2, 0.2, 0.2])
        grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices,
                        y_index)
        cleanUp([training1_data, model])

    # GLRM, do not make sense to stop in the middle of an iteration
    training1_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/glrmdata1000x25.csv"))
    # every column is used as a predictor
    x_indices = list(range(training1_data.ncol))
    model = H2OGeneralizedLowRankEstimator(k=10,
                                           loss="Quadratic",
                                           gamma_x=0.3,
                                           gamma_y=0.3,
                                           transform="STANDARDIZE")
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices)
    cleanUp([training1_data, model])

    # PCA (called with a three times larger error bound and a factor of 1.2 instead of 2.0)
    training1_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/pca1000by25.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OPCA(k=10,
                   transform="STANDARDIZE",
                   pca_method="Power",
                   compute_metrics=True)
    grabRuntimeInfo(err_bound * 3, 1.2, model, training1_data, x_indices)
    cleanUp([training1_data, model])

    # kmeans (called with a doubled error bound)
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/kmeans_8_centers_3_coords.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OKMeansEstimator(k=10)
    grabRuntimeInfo(err_bound * 2, 2.0, model, training1_data, x_indices)
    cleanUp([training1_data, model])

    # word2vec
    train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/text8.gz"),
                            header=1,
                            col_types=["string"])
    # only the first 170000 rows of the first column are used
    used = train[0:170000, 0]
    w2v_model = H2OWord2vecEstimator()
    grabRuntimeInfo(err_bound, 2.0, w2v_model, used, [], 0)
    cleanUp([train, used, w2v_model])

    # a positive sum fails the test by exiting with a non-zero status
    if sum(model_within_max_runtime) > 0:
        sys.exit(1)
示例#21
0
def javapredict(algo,
                equality,
                train,
                test,
                x,
                y,
                compile_only=False,
                **kwargs):
    """Train an H2O model, compile its POJO and compare POJO predictions with H2O's.

    The model is trained, its POJO and h2o-genmodel.jar are downloaded into a
    fresh ``../results/<pojoname>`` directory next to this file, and the POJO
    is compiled with javac.  Unless ``compile_only`` is set, ``test`` is then
    scored both in H2O and through ``hex.genmodel.tools.PredictCsv``, and the
    first prediction column of both results is compared row by row.

    :param algo: one of "gbm", "random_forest", "deeplearning", "glm",
        "naive_bayes", "kmeans" or "pca".
    :param equality: "numeric" (values must agree within 1e-4) or "class"
        (values must be equal).
    :param train: frame the model is trained on.
    :param test: frame that is scored by H2O and by the POJO.
    :param x: predictor columns.
    :param y: response column; not passed to ``train`` for "kmeans" and "pca".
    :param compile_only: when true, stop once the POJO has been compiled.
    :param kwargs: forwarded to the estimator constructor.
    :raises ValueError: for an unsupported ``algo`` or ``equality``.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3;
    # the former print statements were a SyntaxError on Python 3.
    print("Creating model in H2O")
    if algo == "gbm":
        model = H2OGradientBoostingEstimator(**kwargs)
    elif algo == "random_forest":
        model = H2ORandomForestEstimator(**kwargs)
    elif algo == "deeplearning":
        model = H2ODeepLearningEstimator(**kwargs)
    elif algo == "glm":
        model = H2OGeneralizedLinearEstimator(**kwargs)
    elif algo == "naive_bayes":
        model = H2ONaiveBayesEstimator(**kwargs)
    elif algo == "kmeans":
        model = H2OKMeansEstimator(**kwargs)
    elif algo == "pca":
        model = H2OPCA(**kwargs)
    else:
        # Raise a real exception instance: ``raise (ValueError, "...")`` raised
        # a tuple, which drops the message on Python 2 and is a TypeError on
        # Python 3.
        raise ValueError("algo {0} is not supported".format(algo))
    if algo == "kmeans" or algo == "pca":
        model.train(x=x, training_frame=train)
    else:
        model.train(x=x, y=y, training_frame=train)
    print(model)

    # HACK: munge model._id so that it conforms to Java class name. For example, change K-means to K_means.
    # TODO: clients should extract Java class name from header.
    regex = re.compile("[+\\-* !@#$%^&()={}\\[\\]|;:'\"<>,.?/]")
    pojoname = regex.sub("_", model._id)

    print("Downloading Java prediction model code from H2O")
    tmpdir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                     "results", pojoname))
    os.mkdir(tmpdir)
    h2o.download_pojo(model, path=tmpdir)
    h2o_genmodel_jar = os.path.join(tmpdir, "h2o-genmodel.jar")
    assert os.path.exists(
        h2o_genmodel_jar
    ), "Expected file {0} to exist, but it does not.".format(h2o_genmodel_jar)
    print("h2o-genmodel.jar saved in {0}".format(h2o_genmodel_jar))
    java_file = os.path.join(tmpdir, pojoname + ".java")
    assert os.path.exists(
        java_file), "Expected file {0} to exist, but it does not.".format(
            java_file)
    print("java code saved in {0}".format(java_file))

    print("Compiling Java Pojo")
    javac_cmd = [
        "javac", "-cp", h2o_genmodel_jar, "-J-Xmx12g",
        "-J-XX:MaxPermSize=256m", java_file
    ]
    subprocess.check_call(javac_cmd)

    if compile_only:
        return

    print("Predicting in H2O")
    predictions = model.predict(test)
    predictions.summary()
    predictions.head()
    out_h2o_csv = os.path.join(tmpdir, "out_h2o.csv")
    h2o.download_csv(predictions, out_h2o_csv)
    assert os.path.exists(
        out_h2o_csv
    ), "Expected file {0} to exist, but it does not.".format(out_h2o_csv)
    print("H2O Predictions saved in {0}".format(out_h2o_csv))

    print("Setting up for Java POJO")
    in_csv = os.path.join(tmpdir, "in.csv")
    h2o.download_csv(test[x], in_csv)

    # hack: the PredictCsv driver can't handle quoted strings, so remove them.
    # The file is rewritten in place; ``with`` closes it even if a step fails.
    with open(in_csv, 'r+') as f:
        csv_text = f.read().replace('"', '')
        f.seek(0)
        f.write(csv_text)
        f.truncate()
    assert os.path.exists(
        in_csv), "Expected file {0} to exist, but it does not.".format(
            in_csv)
    print("Input CSV to PredictCsv saved in {0}".format(in_csv))

    print("Running PredictCsv Java Program")
    out_pojo_csv = os.path.join(tmpdir, "out_pojo.csv")
    cp_sep = ";" if sys.platform == "win32" else ":"
    java_cmd = [
        "java", "-ea", "-cp", h2o_genmodel_jar + cp_sep + tmpdir,
        "-Xmx12g", "-XX:MaxPermSize=2g", "-XX:ReservedCodeCacheSize=256m",
        "hex.genmodel.tools.PredictCsv", "--header", "--model", pojoname,
        "--input", in_csv, "--output", out_pojo_csv
    ]
    # stderr is merged into stdout, so ``e`` carries nothing of interest
    p = subprocess.Popen(java_cmd, stdout=PIPE, stderr=STDOUT)
    o, e = p.communicate()
    print("Java output: {0}".format(o))
    assert os.path.exists(
        out_pojo_csv
    ), "Expected file {0} to exist, but it does not.".format(out_pojo_csv)
    predictions2 = h2o.upload_file(path=out_pojo_csv)
    print("Pojo predictions saved in {0}".format(out_pojo_csv))

    print("Comparing predictions between H2O and Java POJO")
    # Dimensions
    hr, hc = predictions.dim
    pr, pc = predictions2.dim
    assert hr == pr, "Expected the same number of rows, but got {0} and {1}".format(
        hr, pr)
    assert hc == pc, "Expected the same number of cols, but got {0} and {1}".format(
        hc, pc)

    # Value
    for r in range(hr):
        hp = predictions[r, 0]
        if equality == "numeric":
            pp = float.fromhex(predictions2[r, 0])
            assert abs(
                hp - pp
            ) < 1e-4, "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(
                r, hp, pp)
        elif equality == "class":
            pp = predictions2[r, 0]
            assert hp == pp, "Expected predictions to be the same for row {0}, but got {1} and {2}".format(
                r, hp, pp)
        else:
            # same tuple-raise defect as above, fixed the same way
            raise ValueError(
                "equality type {0} is not supported".format(equality))
# Script fragment: prostate_df and iris_df are created before this point.
prostate_df.describe()

# Binomial GLM with 10-fold cross-validation, predicting CAPSULE from four columns.
glm_classifier = H2OGeneralizedLinearEstimator(family="binomial",
                                               nfolds=10,
                                               alpha=0.5)
glm_classifier.train(x=["AGE", "RACE", "PSA", "DCAPS"],
                     y="CAPSULE",
                     training_frame=prostate_df)
# A bare expression only displays the model in an interactive session; it has no effect in a script.
glm_classifier

# K-means with three clusters on the first four columns of iris_df.
from h2o.estimators.kmeans import H2OKMeansEstimator
cluster_estimator = H2OKMeansEstimator(k=3)
cluster_estimator.train(x=[0, 1, 2, 3], training_frame=iris_df)
cluster_estimator

# Two-component PCA (Power method, no transform) on the same four columns.
from h2o.transforms.decomposition import H2OPCA
pca_decomp = H2OPCA(k=2, transform="NONE", pca_method="Power")
pca_decomp.train(x=range(0, 4), training_frame=iris_df)
pca_decomp

# Project iris_df with the trained PCA model.
pred = pca_decomp.predict(iris_df)
pred.head()  # Projection results

# Grid Search

# Candidate values for each hyper-parameter of the grid.
ntrees_opt = [5, 10, 15]
max_depth_opt = [2, 3, 4]
learn_rate_opt = [0.1, 0.2]
hyper_parameters = {
    "ntrees": ntrees_opt,
    "max_depth": max_depth_opt,
    "learn_rate": learn_rate_opt
def pca_wideDataset_rotterdam():
    """
    Compare PCA results on the Rotterdam dataset between GramSVD and one other method.

    A transform type and exactly one of three experiments (GLRM, Power, Randomized) are
    chosen at random.  For the chosen experiment a GramSVD PCA model and the alternative
    model are trained on every column except "relapse", and their importance and
    eigenvector tables are compared with pyunit_utils.assert_H2OTwoDimTable_equal.
    All H2O objects are removed before and after the test.
    """
    h2o.remove_all()
    print("Importing Rotterdam.csv data...")
    rotterdamH2O = h2o.upload_file(
        pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip"))
    y = set(["relapse"])
    x = list(set(rotterdamH2O.names) - y)

    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types) - 1)]
    print("transform used on dataset is {0}.\n".format(transformN))
    # exactly one of the three experiments below is enabled per run
    buildModel = [False, False, False]
    buildModel[randint(0, len(buildModel) - 1)] = True

    expNum = 0
    if (buildModel[expNum]):
        # special test with GLRM.  Need use_all_levels to be true
        print("------  Testing GLRM PCA --------")
        gramSVD = H2OPCA(k=8,
                         impute_missing=True,
                         transform=transformN,
                         seed=12345,
                         use_all_factor_levels=True)
        gramSVD.train(x=x, training_frame=rotterdamH2O)

        glrmPCA = H2OGeneralizedLowRankEstimator(k=8,
                                                 transform=transformN,
                                                 seed=12345,
                                                 init="Random",
                                                 max_iterations=10,
                                                 recover_svd=True,
                                                 regularization_x="None",
                                                 regularization_y="None")
        glrmPCA.train(x=x, training_frame=rotterdamH2O)

        # compare singular values and stuff with GramSVD
        print("@@@@@@  Comparing eigenvalues between GramSVD and GLRM...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["importance"],
            glrmPCA._model_json["output"]["importance"], [
                "Standard deviation", "Cumulative Proportion",
                "Cumulative Proportion"
            ],
            tolerance=1,
            check_all=False)

        # compare singular vectors
        print("@@@@@@  Comparing eigenvectors between GramSVD and GLRM...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["eigenvectors"],
            glrmPCA._model_json["output"]["eigenvectors"],
            glrmPCA._model_json["output"]["names"],
            tolerance=1e-6,
            check_sign=True,
            check_all=False)
        h2o.remove(gramSVD)
        h2o.remove(glrmPCA)

    expNum = expNum + 1
    if (buildModel[expNum]):
        print("------  Testing Power PCA --------")
        gramSVD = H2OPCA(k=8,
                         impute_missing=True,
                         transform=transformN,
                         seed=12345)
        gramSVD.train(x=x, training_frame=rotterdamH2O)
        powerPCA = H2OPCA(k=8,
                          impute_missing=True,
                          transform=transformN,
                          pca_method="Power",
                          seed=12345)  # power
        powerPCA.train(x=x, training_frame=rotterdamH2O)
        # compare singular values and stuff with GramSVD
        print("@@@@@@  Comparing eigenvalues between GramSVD and Power...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["importance"],
            powerPCA._model_json["output"]["importance"], [
                "Standard deviation", "Cumulative Proportion",
                "Cumulative Proportion"
            ],
            tolerance=1e-6,
            check_all=False)
        print("@@@@@@  Comparing eigenvectors between GramSVD and Power...\n")
        # compare singular vectors

        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["eigenvectors"],
            powerPCA._model_json["output"]["eigenvectors"],
            powerPCA._model_json["output"]["names"],
            tolerance=1e-6,
            check_sign=True,
            check_all=False)

    expNum = expNum + 1
    if (buildModel[expNum]):
        print("------  Testing Randomized PCA --------")
        gramSVD = H2OPCA(k=8,
                         impute_missing=True,
                         transform=transformN,
                         seed=12345)
        gramSVD.train(x=x, training_frame=rotterdamH2O)
        randomizedPCA = H2OPCA(k=8,
                               impute_missing=True,
                               transform=transformN,
                               pca_method="Randomized",
                               seed=12345,
                               max_iterations=5)  # randomized
        randomizedPCA.train(x=x, training_frame=rotterdamH2O)

        # compare singular values and stuff with GramSVD
        print(
            "@@@@@@  Comparing eigenvalues between GramSVD and Randomized...\n"
        )
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["importance"],
            randomizedPCA._model_json["output"]["importance"], [
                "Standard deviation", "Cumulative Proportion",
                "Cumulative Proportion"
            ],
            tolerance=1e-1,
            check_all=False)

        print(
            "@@@@@@  Comparing eigenvectors between GramSVD and Randomized...\n"
        )
        # compare singular vectors
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["eigenvectors"],
            randomizedPCA._model_json["output"]["eigenvectors"],
            randomizedPCA._model_json["output"]["names"],
            tolerance=1e-6,
            check_sign=True,
            check_all=False)
    h2o.remove_all()
def pca_scoring_history_importance():
    """
    Check that PCA returns a scoring history and importance table for every pca_method.

    PCA models are built on australia.csv with GramSVD, Randomized, Power and GLRM, using
    a randomly chosen transform.  The importance and eigenvector tables of each
    non-GramSVD model are compared against the GramSVD model within a per-method
    tolerance, and every model's scoring history is asserted to be non-empty.
    """
    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types) - 1)]

    print("Importing australia.csv data...\n")
    australia = h2o.upload_file(
        pyunit_utils.locate("smalldata/extdata/australia.csv"))
    col_indices = list(range(0, australia.ncol))

    print("transform is {0}.\n".format(transformN))
    # checking out PCA with GramSVD
    print("@@@@@@  Building PCA with GramSVD...\n")
    gramSVD = H2OPCA(k=3, transform=transformN)
    gramSVD.train(x=col_indices, training_frame=australia)

    # check PCA with PCA set to Randomized
    print("@@@@@@  Building PCA with Randomized...\n")
    randomizedPCA = H2OPCA(k=3,
                           transform=transformN,
                           pca_method="Randomized",
                           compute_metrics=True,
                           use_all_factor_levels=True)
    randomizedPCA.train(x=col_indices, training_frame=australia)

    # compare singular values and stuff with GramSVD
    print("@@@@@@  Comparing eigenvalues between GramSVD and Randomized...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gramSVD._model_json["output"]["importance"],
        randomizedPCA._model_json["output"]["importance"], [
            "Standard deviation", "Cumulative Proportion",
            "Cumulative Proportion"
        ],
        tolerance=1e-3)
    print("@@@@@@  Comparing eigenvectors between GramSVD and Randomized...\n")
    # compare singular vectors
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gramSVD._model_json["output"]["eigenvectors"],
        randomizedPCA._model_json["output"]["eigenvectors"],
        randomizedPCA._model_json["output"]["names"],
        tolerance=5e-2,
        check_sign=True)

    # check PCA with PCA set to Power
    print("@@@@@@  Building PCA with Power...\n")
    powerPCA = H2OPCA(k=3,
                      transform=transformN,
                      pca_method="Power",
                      compute_metrics=True,
                      use_all_factor_levels=True)
    powerPCA.train(x=col_indices, training_frame=australia)

    # compare singular values and stuff with GramSVD
    # (no explicit tolerance here: the helper's default is used)
    print("@@@@@@  Comparing eigenvalues between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gramSVD._model_json["output"]["importance"],
        powerPCA._model_json["output"]["importance"], [
            "Standard deviation", "Cumulative Proportion",
            "Cumulative Proportion"
        ])
    print("@@@@@@  Comparing eigenvectors between GramSVD and Power...\n")
    # compare singular vectors
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gramSVD._model_json["output"]["eigenvectors"],
        powerPCA._model_json["output"]["eigenvectors"],
        powerPCA._model_json["output"]["names"],
        tolerance=1e-5,
        check_sign=True)

    # check PCA with PCA set to GLRM
    print("@@@@@@  Building PCA with GLRM...\n")
    glrmPCA = H2OPCA(k=3,
                     transform=transformN,
                     pca_method="GLRM",
                     compute_metrics=True,
                     use_all_factor_levels=True)
    glrmPCA.train(x=col_indices, training_frame=australia)

    # compare singular values and stuff with GramSVD
    print("@@@@@@  Comparing eigenvalues between GramSVD and GLRM...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gramSVD._model_json["output"]["importance"],
        glrmPCA._model_json["output"]["importance"], [
            "Standard deviation", "Cumulative Proportion",
            "Cumulative Proportion"
        ],
        tolerance=2e-2)
    print("@@@@@@  Comparing eigenvectors between GramSVD and GLRM...\n")
    # compare singular vectors
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gramSVD._model_json["output"]["eigenvectors"],
        glrmPCA._model_json["output"]["eigenvectors"],
        glrmPCA._model_json["output"]["names"],
        tolerance=2e-1,
        check_sign=True)

    # make sure we find the scoring history and it is not empty for all the PCA modes
    # just check and make sure the cell_values exceed 0
    assert len(gramSVD._model_json["output"]["scoring_history"].cell_values) > 0, "PCA Scoring history setting " \
                                                                                "pca_method to GramSVD is empty."
    assert len(powerPCA._model_json["output"]["scoring_history"].cell_values) > 0, "PCA Scoring history setting " \
                                                                                 "pca_method to Power is empty."
    assert len(randomizedPCA._model_json["output"]["scoring_history"].cell_values) > 0, "PCA Scoring history setting " \
                                                                                      "pca_method to Randomized is " \
                                                                                      "empty."
    assert len(glrmPCA._model_json["output"]["scoring_history"].cell_values) > 0, "PCA Scoring history setting " \
                                                                                  "pca_method to GLRM is empty."
# Example #25
# 0
# Demo script fragment: GLM on the prostate data, k-means and PCA on iris_df, then a
# GBM grid search.  `iris_df` is not defined in this fragment -- presumably loaded
# earlier in the script; verify.
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
prostate_data_path = h2o.system_file("prostate.csv")
prostate_df = h2o.import_file(path=prostate_data_path)
# Convert RACE to a factor column before modelling.
prostate_df["RACE"] = prostate_df["RACE"].asfactor()
prostate_df.describe()
# Binomial GLM with 10-fold cross-validation and alpha=0.5, predicting CAPSULE.
glm_classifier = H2OGeneralizedLinearEstimator(family="binomial", nfolds=10, alpha=0.5)
glm_classifier.train(x=["AGE","RACE","PSA","DCAPS"],y="CAPSULE", training_frame=prostate_df)
# Bare expression: only has a visible effect in an interactive session
# (looks like a notebook export -- confirm).
glm_classifier

from h2o.estimators.kmeans import H2OKMeansEstimator
# k-means with 3 clusters over the first four columns of iris_df.
cluster_estimator = H2OKMeansEstimator(k=3)
cluster_estimator.train(x=[0,1,2,3], training_frame=iris_df)
cluster_estimator

from h2o.transforms.decomposition import H2OPCA
# Two-component PCA using the "Power" method with no transform, on columns 0..3.
pca_decomp = H2OPCA(k=2, transform="NONE", pca_method="Power")
pca_decomp.train(x=range(0,4), training_frame=iris_df)
pca_decomp

# Apply the fitted PCA model to the same frame it was trained on.
pred = pca_decomp.predict(iris_df)
pred.head()  # Projection results

# Grid Search

# Candidate values for each hyper-parameter; the grid is built from these lists.
ntrees_opt = [5, 10, 15]
max_depth_opt = [2, 3, 4]
learn_rate_opt = [0.1, 0.2]
hyper_parameters = {"ntrees": ntrees_opt, "max_depth":max_depth_opt, "learn_rate":learn_rate_opt}
from h2o.grid.grid_search import H2OGridSearch
# NOTE(review): H2OGradientBoostingEstimator is not imported in this fragment --
# presumably imported earlier in the script; confirm.
gs = H2OGridSearch(H2OGradientBoostingEstimator(distribution="multinomial"), hyper_params=hyper_parameters)
# Last column is the response, all preceding columns are predictors.
# NOTE(review): the GLM above uses `nfolds`; confirm `nfold` is the intended keyword here.
gs.train(x=range(0,iris_df.ncol-1), y=iris_df.ncol-1, training_frame=iris_df, nfold=10)
    def test_PCA_grid_search_over_params(self):
        """
        test_pca_grid_search_over_params: test for condition 1 and performs the following:
        a. build H2O PCA models using grid search.  Count and make sure models
           are only built for hyper-parameters set to legal values.  No model is built for bad hyper-parameters
           values.  We should instead get a warning/error message printed out.
        b. For each model built using grid search, we will extract the parameters used in building
           that model and manually build a H2O PCA model.  Training metrics are calculated from the
           gridsearch model and the manually built model.  If their metrics
           differ by too much, print a warning message but don't fail the test.
        c. we will check and make sure the models are built within the max_runtime_secs time limit that was set
           for it as well.  If max_runtime_secs was exceeded, declare test failure.

        Failures are recorded by incrementing self.test_failed; nothing is raised to the caller.
        An exception inside the test body only counts as a failure when
        self.possible_number_models > 0.
        """
        print("*******************************************************************************************")
        print("test_PCA_grid_search_over_params for PCA ")
        h2o.cluster_info()

        try:
            print("Hyper-parameters used here is {0}".format(self.final_hyper_params))

            # start grid search
            grid_model = H2OGridSearch(H2OPCA(pca_method=self.pca_method),
                                       hyper_params=self.final_hyper_params)
            grid_model.train(x=self.x_indices, training_frame=self.training1_data)

            self.correct_model_number = len(grid_model)     # store number of models built

            # make sure the correct number of models are built by gridsearch
            if not (self.correct_model_number == self.possible_number_models):  # wrong grid model number
                self.test_failed += 1
                print("test_PCA_grid_search_over_params for PCA failed: number of models built by gridsearch "
                      "does not equal to all possible combinations of hyper-parameters")
            else:
                # add parameters into params_dict.  Use this to manually build model
                params_dict = dict()
                params_dict["pca_method"] = self.pca_method
                total_run_time_limits = 0.0   # calculate upper bound of max_runtime_secs
                true_run_time_limits = 0.0
                manual_run_runtime = 0.0

                # compare performance metric of model built by gridsearch with manually built model
                for each_model in grid_model:

                    params_list = grid_model.get_hyperparams_dict(each_model._id)
                    params_list.update(params_dict)

                    model_params = dict()

                    # need to taken out max_runtime_secs from model parameters, it is now set in .train()
                    if "max_runtime_secs" in params_list:
                        model_params["max_runtime_secs"] = params_list["max_runtime_secs"]
                        max_runtime = params_list["max_runtime_secs"]
                        del params_list["max_runtime_secs"]
                    else:
                        max_runtime = 0

                    # NOTE(review): the grid model's runtime is computed here but never used below;
                    # the call is kept as-is in case the helper has side effects -- confirm and remove.
                    each_model_runtime = pyunit_utils.find_grid_runtime([each_model])

                    manual_model = H2OPCA(**params_list)
                    manual_model.train(x=self.x_indices, training_frame=self.training1_data,
                                       **model_params)

                    # collect the time taken to manually built all models
                    model_runtime = pyunit_utils.find_grid_runtime([manual_model])  # time taken to build this model
                    manual_run_runtime += model_runtime

                    if max_runtime > 0:
                        # shortest possible time it takes to build this model
                        if max_runtime < self.model_run_time:
                            total_run_time_limits += model_runtime
                        else:
                            total_run_time_limits += max_runtime

                    true_run_time_limits += max_runtime

                    # compute and compare test metrics between the two models
                    grid_model_metrics = \
                        sum(each_model._model_json["output"]["model_summary"].cell_values[0][1:params_list["k"]])
                    manual_model_metrics = \
                        sum(manual_model._model_json["output"]["model_summary"].cell_values[0][1:params_list["k"]])

                    # just compare the metrics in this case within tolerance:
                    if not (isinstance(grid_model_metrics, str) or isinstance(manual_model_metrics, str)):
                        # divide by the magnitude so the relative difference stays meaningful
                        # even if the grid metric is negative
                        if (abs(grid_model_metrics) > 0) and \
                                (abs(grid_model_metrics - manual_model_metrics) / abs(grid_model_metrics)
                                 > self.allowed_diff):
                            print("test_PCA_grid_search_over_params for PCA warning: grid search model metric ({0}) "
                                  "and manually built H2O model metric ({1}) differ too much"
                                  "!".format(grid_model_metrics, manual_model_metrics))

                total_run_time_limits = max(total_run_time_limits, true_run_time_limits) * (1+self.extra_time_fraction)

                # make sure the max_runtime_secs is working to restrict model built time
                if not(manual_run_runtime <= total_run_time_limits):
                    self.test_failed += 1
                    print("test_PCA_grid_search_over_params for PCA failed: time taken to manually build models is {0}."
                          "  Maximum allowed time is {1}".format(manual_run_runtime, total_run_time_limits))
                else:
                    print("time taken to manually build all models is {0}. Maximum allowed time is "
                          "{1}".format(manual_run_runtime, total_run_time_limits))

                if self.test_failed == 0:
                    print("test_PCA_grid_search_over_params for PCA has passed!")
        except Exception as e:
            # an exception is only treated as a failure when at least one model was expected
            if self.possible_number_models > 0:
                print("test_PCA_grid_search_over_params for PCA failed: exception ({0}) was thrown for no reason.".format(e))
                self.test_failed += 1
# Example #27
# 0
#gs = H2OGridSearch(H2OGradientBoostingEstimator(distribution="multinomial"), hyper_params=hyper_parameters)
#gs.train(x=range(0, iris_df.ncol-1), y=iris_df.ncol-1, training_frame=iris_df, nfold=10)

##
## Pipeline
##

from h2o.transforms.preprocessing import H2OScaler
from h2o.transforms.decomposition import H2OPCA
from sklearn.pipeline import Pipeline

# Presumably silences H2O progress output while the pipeline is fitted -- confirm.
h2o.no_progress()

# Three-stage sklearn pipeline: H2OScaler, then a 2-component H2OPCA, then a multinomial GBM.
pipeline = Pipeline([
    ("standardize", H2OScaler()), ("pca", H2OPCA(k=2)),
    ("gbm", H2OGradientBoostingEstimator(distribution="multinomial"))
])

# Fit on the first four columns with column 4 as the response; print() with a single
# argument behaves the same on Python 2 and Python 3.
print(pipeline.fit(iris_df[:4], iris_df[4]))

##
## Randomized Grid Search
##
from sklearn.grid_search import RandomizedSearchCV
from h2o.cross_validation import H2OKFold
from h2o.model.regression import h2o_r2_score
from sklearn.metrics.scorer import make_scorer

params = {
    "standardize__center": [True, False],
def pca_max_k():
    """
    Check H2O PCA when k is set to -1 for the GramSVD, Power, Randomized and GLRM methods.

    On the Rotterdam dataset (all columns except "relapse"), GramSVD and Power models are
    trained and compared, and the number of eigenvalues in each importance table is
    checked against the actual value of k reported by the model.  On prostate_cat.csv,
    Randomized and Power models are compared with and without use_all_factor_levels, and
    the GLRM model's eigenvalue count is checked against its actual k.
    """
    data = h2o.upload_file(
        pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip"))
    y = set(["relapse"])
    x = list(set(data.names) - y)
    pcaGramSVD = H2OPCA(k=-1,
                        transform="STANDARDIZE",
                        pca_method="GramSVD",
                        impute_missing=True,
                        max_iterations=100)
    pcaGramSVD.train(x, training_frame=data)

    pcaPower = H2OPCA(k=-1,
                      transform="STANDARDIZE",
                      pca_method="Power",
                      impute_missing=True,
                      max_iterations=100,
                      seed=12345)
    pcaPower.train(x, training_frame=data)

    # compare singular values and stuff with GramSVD
    print("@@@@@@  Comparing eigenvalues between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        pcaGramSVD._model_json["output"]["importance"],
        pcaPower._model_json["output"]["importance"], [
            "Standard deviation", "Cumulative Proportion",
            "Cumulative Proportion"
        ],
        tolerance=1)

    correctEigNum = pcaPower.full_parameters["k"]["actual_value"]
    # first cell of each row is skipped when counting, hence the "- 1"
    gramSVDNum = len(
        pcaGramSVD._model_json["output"]["importance"].cell_values[0]) - 1
    powerNum = len(
        pcaPower._model_json["output"]["importance"].cell_values[0]) - 1
    # messages use format(): concatenating the integer counts onto a str would raise
    # TypeError and hide the assertion failure
    assert correctEigNum == gramSVDNum, \
        "PCA GramSVD FAIL: expected number of eigenvalues: {0}, actual: {1}.".format(correctEigNum, gramSVDNum)
    assert correctEigNum == powerNum, \
        "PCA Power FAIL: expected number of eigenvalues: {0}, actual: {1}.".format(correctEigNum, powerNum)

    # Randomized and GLRM does not have wide dataset implementation.  Check with smaller datasets
    data = h2o.upload_file(
        pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    x = list(set(data.names))
    pcaRandomized = H2OPCA(k=-1,
                           transform="STANDARDIZE",
                           pca_method="Randomized",
                           impute_missing=True,
                           max_iterations=100,
                           seed=12345)
    pcaRandomized.train(x, training_frame=data)
    # should still work with rank deficient dataset
    pcaRandomizedF = H2OPCA(k=-1,
                            transform="STANDARDIZE",
                            pca_method="Randomized",
                            use_all_factor_levels=True,
                            impute_missing=True,
                            max_iterations=100,
                            seed=12345)
    pcaRandomizedF.train(x, training_frame=data)

    pcaPower = H2OPCA(k=-1,
                      transform="STANDARDIZE",
                      pca_method="Power",
                      impute_missing=True,
                      max_iterations=100,
                      seed=12345)
    pcaPower.train(x, training_frame=data)
    # should still work with rank deficient dataset
    pcaPowerF = H2OPCA(k=-1,
                       transform="STANDARDIZE",
                       pca_method="Power",
                       use_all_factor_levels=True,
                       impute_missing=True,
                       max_iterations=100,
                       seed=12345)
    pcaPowerF.train(x, training_frame=data)

    # eigenvalues between the PCA and Randomize should be close, I hope...
    print(
        "@@@@@@  Comparing eigenvalues between Randomized and Power PCA...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        pcaRandomized._model_json["output"]["importance"],
        pcaPower._model_json["output"]["importance"], [
            "Standard deviation", "Cumulative Proportion",
            "Cumulative Proportion"
        ])

    # eigenvalues between the PCA and Randomize should be close with rank deficient dataset, I hope...
    print(
        "@@@@@@  Comparing eigenvalues between Randomized and Power PCA with rank deficient dataset...\n"
    )
    pyunit_utils.assert_H2OTwoDimTable_equal(
        pcaRandomizedF._model_json["output"]["importance"],
        pcaPowerF._model_json["output"]["importance"], [
            "Standard deviation", "Cumulative Proportion",
            "Cumulative Proportion"
        ])

    pcaGLRM = H2OPCA(k=-1,
                     transform="STANDARDIZE",
                     pca_method="GLRM",
                     use_all_factor_levels=True,
                     max_iterations=100,
                     seed=12345)
    pcaGLRM.train(x, training_frame=data)
    correctEigNum = pcaGLRM.full_parameters["k"]["actual_value"]
    glrmNum = len(
        pcaGLRM._model_json["output"]["importance"].cell_values[0]) - 1
    assert correctEigNum == glrmNum, \
        "PCA GLRM FAIL: expected number of eigenvalues: {0}, actual: {1}.".format(correctEigNum, glrmNum)