예제 #1
0
def pca_arrests():
    """Check that all PCA implementations agree on the USArrests eigenvectors.

    Trains one 4-component PCA model per ``pca_impl`` value and compares the
    eigenvector table of every later model against the table produced by the
    first implementation in the list.
    """
    print("Importing USArrests.csv data...")
    arrests_path = pyunit_utils.locate("smalldata/pca_test/USArrests.csv")
    arrests_frame = h2o.upload_file(arrests_path)

    print(
        "Testing to see whether the trained PCA are essentially the same using different implementation..."
    )

    implementations = (
        "MTJ_EVD_DENSEMATRIX",
        "MTJ_EVD_SYMMMATRIX",
        "MTJ_SVD_DENSEMATRIX",
        "JAMA",
    )
    baseline = None  # eigenvectors of the first implementation trained
    for impl in implementations:
        print("Run PCA with implementation: " + impl)
        pca = H2OPCA(k=4, pca_impl=impl, seed=1234)
        pca.train(x=list(range(4)), training_frame=arrests_frame)
        output = pca._model_json["output"]
        current = output["eigenvectors"]

        if baseline is None:
            # Nothing to compare against yet: remember this table.
            baseline = current
            continue

        # Compare to see if they are fundamentally the same
        pyunit_utils.assert_H2OTwoDimTable_equal(
            baseline,
            current,
            output["names"],
            tolerance=1e-6,
            check_sign=True,
            check_all=False)
예제 #2
0
def glrm_iris():
    """Compare PCA trained with ``pca_method="GLRM"`` against a GLRM model on iris.

    Both models use k=5, the STANDARDIZE transform and seed 21.  Their
    importance and eigenvector tables must match within 1e-6, and one cell
    of the PCA importance table must not exceed 1.
    """
    print("Importing iris.csv data...")
    iris = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris.csv"))
    iris.describe()

    # NOTE(review): the log text says GramSVD, but the model below is built
    # with pca_method="GLRM" -- confirm which wording is intended.
    print("@@@@@@  Building PCA with GramSVD...\n")
    pca_model = H2OPCA(k=5,
                       transform="STANDARDIZE",
                       pca_method="GLRM",
                       use_all_factor_levels=True,
                       seed=21)
    pca_model.train(x=iris.names, training_frame=iris)

    glrm_model = H2OGeneralizedLowRankEstimator(k=5,
                                                loss="Quadratic",
                                                transform="STANDARDIZE",
                                                recover_svd=True,
                                                seed=21)
    glrm_model.train(x=iris.names, training_frame=iris)

    pca_out = pca_model._model_json["output"]
    glrm_out = glrm_model._model_json["output"]
    importance_rows = [
        "Standard deviation", "Cumulative Proportion", "Cumulative Proportion"
    ]

    # compare singular values and stuff with GramSVD
    print("@@@@@@  Comparing eigenvalues between GramSVD and GLRM...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(pca_out["importance"],
                                             glrm_out["importance"],
                                             importance_rows,
                                             tolerance=1e-6)

    # compare singular vectors
    print("@@@@@@  Comparing eigenvectors between GramSVD and GLRM...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(pca_out["eigenvectors"],
                                             glrm_out["eigenvectors"],
                                             glrm_out["names"],
                                             tolerance=1e-6,
                                             check_sign=True)

    # check to make sure maximum proportional variance <= 1
    # (presumably cell_values[1][1] is the first component's proportion of
    # variance -- TODO confirm against the importance table layout)
    proportion = pca_out["importance"].cell_values[1][1]
    assert proportion <= 1, \
        "Expected value <= 1.0 but received {0}".format(proportion)
def pca_pubdev_4167_OOM():
    """
    Make sure PCA works with customer data (PUBDEV-4167).

    This pyunit is mainly used by the customer to verify PCA operations and
    is not meant to run as a regular test, since the customer data must not
    be exposed.  The data set is read from a hard-coded absolute path.  A
    GramSVD model (the default method) and a Power model are trained with a
    randomly chosen transform and their outputs are compared.
    """
    h2o.remove_all()
    # make sure we check all transforms
    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    chosen_transform = transform_types[randint(0, len(transform_types) - 1)]
    print("transform used on dataset is {0}.\n".format(chosen_transform))

    # Nidhi: import may not work
    data_path = pyunit_utils.locate(
        "/Users/wendycwong/gitBackup/SDatasets/pubdev_4167_Avkash/m120K.tar")
    frame = h2o.import_file(path=data_path)
    n_cols = frame.ncols

    gram_model = H2OPCA(k=n_cols, transform=chosen_transform)
    gram_model.train(x=list(range(0, n_cols)), training_frame=frame)

    power_model = H2OPCA(k=n_cols,
                         transform=chosen_transform,
                         pca_method="Power")
    power_model.train(x=list(range(0, n_cols)), training_frame=frame)

    gram_out = gram_model._model_json["output"]
    power_out = power_model._model_json["output"]

    # compare singular values and stuff between power and GramSVD methods
    print("@@@@@@  Comparing eigenvalues between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gram_out["importance"],
        power_out["importance"],
        ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
        tolerance=1e-5,
        check_all=False)

    # compare singular vectors
    print("@@@@@@  Comparing eigenvectors between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gram_out["eigenvectors"],
        power_out["eigenvectors"],
        power_out["names"],
        tolerance=1e-1,
        check_sign=True)
예제 #4
0
def glrm_iris():
    """Cross-check H2OPCA (GLRM method) against a standalone GLRM model on iris.

    The two models share k=5, the STANDARDIZE transform and seed 21; their
    importance and eigenvector tables are required to agree within 1e-6.
    Finally one cell of the PCA importance table is asserted to be <= 1.
    """
    print("Importing iris.csv data...")
    iris_frame = h2o.upload_file(
        pyunit_utils.locate("smalldata/iris/iris.csv"))
    iris_frame.describe()

    # NOTE(review): message mentions GramSVD while pca_method is "GLRM";
    # the runtime text is kept unchanged here.
    print("@@@@@@  Building PCA with GramSVD...\n")
    pca_kwargs = dict(k=5,
                      transform="STANDARDIZE",
                      pca_method="GLRM",
                      use_all_factor_levels=True,
                      seed=21)
    pca_glrm = H2OPCA(**pca_kwargs)
    pca_glrm.train(x=iris_frame.names, training_frame=iris_frame)

    glrm_kwargs = dict(k=5,
                       loss="Quadratic",
                       transform="STANDARDIZE",
                       recover_svd=True,
                       seed=21)
    glrm = H2OGeneralizedLowRankEstimator(**glrm_kwargs)
    glrm.train(x=iris_frame.names, training_frame=iris_frame)

    pca_output = pca_glrm._model_json["output"]
    glrm_output = glrm._model_json["output"]

    # compare singular values and stuff with GramSVD
    print("@@@@@@  Comparing eigenvalues between GramSVD and GLRM...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        pca_output["importance"],
        glrm_output["importance"],
        ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
        tolerance=1e-6)

    # compare singular vectors
    print("@@@@@@  Comparing eigenvectors between GramSVD and GLRM...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        pca_output["eigenvectors"],
        glrm_output["eigenvectors"],
        glrm_output["names"],
        tolerance=1e-6,
        check_sign=True)

    # check to make sure maximum proportional variance <= 1
    checked_cell = pca_output["importance"].cell_values[1][1]
    assert checked_cell <= 1, \
        "Expected value <= 1.0 but received {0}".format(checked_cell)
예제 #5
0
def pca_wideDataset_rotterdam_glrm():
    """Compare GramSVD PCA with a GLRM model on the Rotterdam data set.

    Both models use k=8, the DEMEAN transform and seed 12345 on every column
    except ``relapse``.  Importance tables are compared with a loose
    tolerance of 1, eigenvectors with a tolerance of 2e-5.
    """
    eigenvector_tol = 2e-5
    h2o.remove_all()
    print("Importing Rotterdam.csv data...")
    rotterdam = h2o.upload_file(
        pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip"))
    # every column except the response is used as a predictor
    predictors = list(set(rotterdam.names) - {"relapse"})

    # special test with GLRM.  Need use_all_levels to be true
    print("------  Testing GLRM PCA --------")
    gram_model = H2OPCA(k=8,
                        impute_missing=True,
                        transform="DEMEAN",
                        seed=12345,
                        use_all_factor_levels=True)
    gram_model.train(x=predictors, training_frame=rotterdam)

    glrm_model = H2OGeneralizedLowRankEstimator(k=8,
                                                transform="DEMEAN",
                                                seed=12345,
                                                init="Random",
                                                recover_svd=True,
                                                regularization_x="None",
                                                regularization_y="None",
                                                max_iterations=11)
    glrm_model.train(x=predictors, training_frame=rotterdam)

    print(
        "@@@@@@  Comparing eigenvectors and eigenvalues between GramSVD and GLRM...\n"
    )
    gram_out = gram_model._model_json["output"]
    glrm_out = glrm_model._model_json["output"]

    # compare singular values and stuff with GramSVD
    importance_rows = [
        "Standard deviation", "Cumulative Proportion", "Cumulative Proportion"
    ]
    pyunit_utils.assert_H2OTwoDimTable_equal(gram_out["importance"],
                                             glrm_out["importance"],
                                             importance_rows,
                                             tolerance=1,
                                             check_all=False)

    # compare singular vectors
    pyunit_utils.assert_H2OTwoDimTable_equal(gram_out["eigenvectors"],
                                             glrm_out["eigenvectors"],
                                             glrm_out["names"],
                                             tolerance=eigenvector_tol,
                                             check_sign=True,
                                             check_all=False)
def pca_wideDataset_rotterdam():
    """Compare GramSVD and Power PCA on the Rotterdam data set.

    A transform is picked at random, then two k=8 models (default method and
    ``pca_method="Power"``) are trained on every column except ``relapse``.
    Their importance and eigenvector tables must agree within 1e-6.
    """
    h2o.remove_all()
    print("Importing Rotterdam.csv data...")
    rotterdam = h2o.upload_file(
        pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip"))
    predictors = list(set(rotterdam.names) - {"relapse"})

    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    chosen_transform = transform_types[randint(0, len(transform_types) - 1)]
    print("transform used on dataset is {0}.\n".format(chosen_transform))

    # Settings shared by both models; only pca_method differs.
    shared = dict(k=8,
                  impute_missing=True,
                  transform=chosen_transform,
                  seed=12345)

    gram_model = H2OPCA(**shared)
    gram_model.train(x=predictors, training_frame=rotterdam)

    power_model = H2OPCA(pca_method="Power", **shared)
    power_model.train(x=predictors, training_frame=rotterdam)

    gram_out = gram_model._model_json["output"]
    power_out = power_model._model_json["output"]

    # compare singular values and stuff with GramSVD
    print("@@@@@@  Comparing eigenvalues between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gram_out["importance"],
        power_out["importance"],
        ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
        tolerance=1e-6,
        check_all=False)

    # compare singular vectors
    print("@@@@@@  Comparing eigenvectors between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gram_out["eigenvectors"],
        power_out["eigenvectors"],
        power_out["names"],
        tolerance=1e-6,
        check_sign=True,
        check_all=False)
예제 #7
0
def pca_pubdev_4167_OOM():
    """
    Check that PCA runs on the customer data set from PUBDEV-4167.

    This pyunit is written for the customer to verify PCA operations; it is
    not to be used as a regular test because the customer data must not be
    exposed.  The input is loaded from a hard-coded absolute path.  GramSVD
    (default) and Power models are trained with a randomly selected
    transform and their importance/eigenvector tables are compared.
    """
    h2o.remove_all()
    # make sure we check all transforms
    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    picked = transform_types[randint(0, len(transform_types) - 1)]
    print("transform used on dataset is {0}.\n".format(picked))

    # Nidhi: import may not work
    customer_file = pyunit_utils.locate(
        "/Users/wendycwong/gitBackup/SDatasets/pubdev_4167_Avkash/m120K.tar")
    customer_frame = h2o.import_file(path=customer_file)
    column_count = customer_frame.ncols

    gram_pca = H2OPCA(k=column_count, transform=picked)
    gram_pca.train(x=list(range(0, column_count)),
                   training_frame=customer_frame)

    power_pca = H2OPCA(k=column_count, transform=picked, pca_method="Power")
    power_pca.train(x=list(range(0, column_count)),
                    training_frame=customer_frame)

    gram_output = gram_pca._model_json["output"]
    power_output = power_pca._model_json["output"]
    importance_rows = [
        "Standard deviation", "Cumulative Proportion", "Cumulative Proportion"
    ]

    # compare singular values and stuff between power and GramSVD methods
    print("@@@@@@  Comparing eigenvalues between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(gram_output["importance"],
                                             power_output["importance"],
                                             importance_rows,
                                             tolerance=1e-5,
                                             check_all=False)

    # compare singular vectors
    print("@@@@@@  Comparing eigenvectors between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(gram_output["eigenvectors"],
                                             power_output["eigenvectors"],
                                             power_output["names"],
                                             tolerance=1e-1,
                                             check_sign=True)
예제 #8
0
def pca_max_k():
    """Check PCA with ``k=-1`` for the GramSVD, Power, Randomized and GLRM methods.

    For each method the number of eigenvalues reported in the importance
    table must equal the actual ``k`` value recorded in the model's
    ``full_parameters``.  Importance tables are also compared between
    GramSVD and Power, and between Randomized and Power.

    NOTE(review): k=-1 presumably asks H2O for the maximum number of
    components -- confirm against the H2OPCA documentation.
    """
    data = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/SDSS_quasar.txt.zip"))
    x = list(set(data.names))
    importance_rows = ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"]
    # str.format is used instead of "+" so that a failing assertion reports
    # the integer counts rather than raising TypeError on str + int.
    fail_msg = "PCA {0} FAIL: expected number of eigenvalues: {1}, actual: {2}."

    pcaGramSVD = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="GramSVD", impute_missing=True, max_iterations=100)
    pcaGramSVD.train(x, training_frame=data)
    pcaPower = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Power", impute_missing=True, max_iterations=100,
                      seed=12345)
    pcaPower.train(x, training_frame=data)

    # compare singular values and stuff with GramSVD
    print("@@@@@@  Comparing eigenvalues between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(pcaGramSVD._model_json["output"]["importance"],
                                             pcaPower._model_json["output"]["importance"],
                                             importance_rows,
                                             tolerance=1)

    correctEigNum = pcaPower.full_parameters["k"]["actual_value"]
    # The first entry of each importance row is skipped when counting
    # eigenvalues, hence the "- 1".
    gramSVDNum = len(pcaGramSVD._model_json["output"]["importance"].cell_values[0]) - 1
    powerNum = len(pcaPower._model_json["output"]["importance"].cell_values[0]) - 1
    assert correctEigNum == gramSVDNum, fail_msg.format("GramSVD", correctEigNum, gramSVDNum)
    assert correctEigNum == powerNum, fail_msg.format("Power", correctEigNum, powerNum)

    pcaRandomized = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Randomized",
                           impute_missing=True, max_iterations=100, seed=12345)
    pcaRandomized.train(x, training_frame=data)

    # eigenvalues between the PCA and Randomize should be close, I hope...
    print("@@@@@@  Comparing eigenvalues between Randomized and Power PCA...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(pcaRandomized._model_json["output"]["importance"],
                                             pcaPower._model_json["output"]["importance"],
                                             importance_rows)

    pcaGLRM = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="GLRM", use_all_factor_levels=True,
                     max_iterations=100, seed=12345)
    pcaGLRM.train(x, training_frame=data)
    correctEigNum = pcaGLRM.full_parameters["k"]["actual_value"]
    glrmNum = len(pcaGLRM._model_json["output"]["importance"].cell_values[0]) - 1
    assert correctEigNum == glrmNum, fail_msg.format("GLRM", correctEigNum, glrmNum)
예제 #9
0
def pca_wideDataset_rotterdam_pcapower():
    """Compare GramSVD and Power PCA on the Rotterdam data set (STANDARDIZE).

    Two k=8 models with seed 12345 are trained on every column except
    ``relapse``.  Importance tables must agree within 1e-6 and eigenvectors
    within 2e-5.
    """
    eigenvector_tol = 2e-5
    h2o.remove_all()
    print("Importing Rotterdam.csv data...")
    rotterdam = h2o.upload_file(
        pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip"))
    predictors = list(set(rotterdam.names) - {"relapse"})

    print("------  Testing Power PCA --------")
    # Settings common to both models; only pca_method differs.
    shared = dict(k=8,
                  impute_missing=True,
                  transform="STANDARDIZE",
                  seed=12345)
    gram_model = H2OPCA(**shared)
    gram_model.train(x=predictors, training_frame=rotterdam)
    power_model = H2OPCA(pca_method="Power", **shared)
    power_model.train(x=predictors, training_frame=rotterdam)

    gram_out = gram_model._model_json["output"]
    power_out = power_model._model_json["output"]

    # compare singular values and stuff with GramSVD
    print("@@@@@@  Comparing eigenvalues between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gram_out["importance"],
        power_out["importance"],
        ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
        tolerance=1e-6,
        check_all=False)

    # compare singular vectors
    print("@@@@@@  Comparing eigenvectors between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gram_out["eigenvectors"],
        power_out["eigenvectors"],
        power_out["names"],
        tolerance=eigenvector_tol,
        check_sign=True,
        check_all=False)
def pca_arrests():
    """Verify that each ``pca_impl`` back end gives the same USArrests eigenvectors.

    The first implementation's eigenvector table serves as the reference;
    every subsequent implementation is compared against it with a tolerance
    of 1e-6.
    """
    print("Importing USArrests.csv data...")
    usarrests = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    print("Testing to see whether the trained PCA are essentially the same using different implementation...")

    impl_names = ["MTJ_EVD_DENSEMATRIX", "MTJ_EVD_SYMMMATRIX", "MTJ_SVD_DENSEMATRIX", "JAMA"]
    reference_vectors = None
    for impl in impl_names:
        print("Run PCA with implementation: " + impl)
        estimator = H2OPCA(k=4, pca_impl=impl, seed=1234)
        estimator.train(x=list(range(4)), training_frame=usarrests)
        model_output = estimator._model_json["output"]
        vectors = model_output["eigenvectors"]

        if reference_vectors is None:
            # First table seen becomes the reference for later comparisons.
            reference_vectors = vectors
            continue

        # Compare to see if they are fundamentally the same
        pyunit_utils.assert_H2OTwoDimTable_equal(
            reference_vectors,
            vectors,
            model_output["names"],
            tolerance=1e-6,
            check_sign=True,
            check_all=False)
def pca_wideDataset_rotterdam():
    """Compare GramSVD PCA with one randomly chosen alternative on Rotterdam data.

    A transform and one of three experiments (GLRM, Power, Randomized) are
    picked at random.  The chosen model and a GramSVD model are trained with
    k=8 on every column except ``relapse``, and their importance and
    eigenvector tables are compared.  All H2O objects are removed at the end.
    """
    h2o.remove_all()
    print("Importing Rotterdam.csv data...")
    rotterdamH2O = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip"))
    y = set(["relapse"])
    x = list(set(rotterdamH2O.names) - y)

    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types) - 1)]
    print("transform used on dataset is {0}.\n".format(transformN))
    # Exactly one experiment runs per call: 0 = GLRM, 1 = Power, 2 = Randomized.
    experiment = randint(0, 2)
    importance_rows = ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"]

    if experiment == 0:
        # special test with GLRM.  Need use_all_levels to be true
        print("------  Testing GLRM PCA --------")
        gramSVD = H2OPCA(k=8, impute_missing=True, transform=transformN, seed=12345, use_all_factor_levels=True)
        gramSVD.train(x=x, training_frame=rotterdamH2O)

        glrmPCA = H2OGeneralizedLowRankEstimator(k=8, transform=transformN, seed=12345, init="Random",
                                                 max_iterations=10, recover_svd=True, regularization_x="None",
                                                 regularization_y="None")
        glrmPCA.train(x=x, training_frame=rotterdamH2O)

        # compare singular values and stuff with GramSVD
        print("@@@@@@  Comparing eigenvalues between GramSVD and GLRM...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["importance"],
                                                 glrmPCA._model_json["output"]["importance"],
                                                 importance_rows,
                                                 tolerance=1, check_all=False)

        # compare singular vectors (banner is printed right before the check
        # it describes, not ahead of the eigenvalue comparison)
        print("@@@@@@  Comparing eigenvectors between GramSVD and GLRM...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["eigenvectors"],
                                                 glrmPCA._model_json["output"]["eigenvectors"],
                                                 glrmPCA._model_json["output"]["names"], tolerance=1e-6,
                                                 check_sign=True, check_all=False)
        h2o.remove(gramSVD)
        h2o.remove(glrmPCA)
    elif experiment == 1:
        print("------  Testing Power PCA --------")
        gramSVD = H2OPCA(k=8, impute_missing=True, transform=transformN, seed=12345)
        gramSVD.train(x=x, training_frame=rotterdamH2O)
        powerPCA = H2OPCA(k=8, impute_missing=True, transform=transformN, pca_method="Power", seed=12345)
        powerPCA.train(x=x, training_frame=rotterdamH2O)

        # compare singular values and stuff with GramSVD
        print("@@@@@@  Comparing eigenvalues between GramSVD and Power...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["importance"],
                                                 powerPCA._model_json["output"]["importance"],
                                                 importance_rows,
                                                 tolerance=1e-6, check_all=False)

        # compare singular vectors
        print("@@@@@@  Comparing eigenvectors between GramSVD and Power...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["eigenvectors"],
                                                 powerPCA._model_json["output"]["eigenvectors"],
                                                 powerPCA._model_json["output"]["names"], tolerance=1e-6,
                                                 check_sign=True, check_all=False)
    else:
        print("------  Testing Randomized PCA --------")
        gramSVD = H2OPCA(k=8, impute_missing=True, transform=transformN, seed=12345)
        gramSVD.train(x=x, training_frame=rotterdamH2O)
        randomizedPCA = H2OPCA(k=8, impute_missing=True, transform=transformN, pca_method="Randomized", seed=12345,
                               max_iterations=5)
        randomizedPCA.train(x=x, training_frame=rotterdamH2O)

        # compare singular values and stuff with GramSVD
        print("@@@@@@  Comparing eigenvalues between GramSVD and Randomized...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["importance"],
                                                 randomizedPCA._model_json["output"]["importance"],
                                                 importance_rows,
                                                 tolerance=1e-1, check_all=False)

        # compare singular vectors (message now names Randomized, the model
        # actually being compared in this branch)
        print("@@@@@@  Comparing eigenvectors between GramSVD and Randomized...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["eigenvectors"],
                                                 randomizedPCA._model_json["output"]["eigenvectors"],
                                                 randomizedPCA._model_json["output"]["names"], tolerance=1e-6,
                                                 check_sign=True, check_all=False)
    h2o.remove_all()
def pca_scoring_history_importance():
    """
    Check that PCA returns the scoring history and importance for every PCA mode.

    These outputs were reported missing for certain PCA modes.  A GramSVD
    model (the default method) is trained on australia.csv with a randomly
    chosen transform, and Randomized, Power and GLRM models are each compared
    with it.  Finally the scoring history of every model must be non-empty.
    """
    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types) - 1)]

    print("Importing australia.csv data...\n")
    australia = h2o.upload_file(
        pyunit_utils.locate("smalldata/extdata/australia.csv"))
    col_indices = list(range(0, australia.ncol))
    importance_rows = [
        "Standard deviation", "Cumulative Proportion", "Cumulative Proportion"
    ]

    print("transform is {0}.\n".format(transformN))
    # checking out PCA with GramSVD
    print("@@@@@@  Building PCA with GramSVD...\n")
    gramSVD = H2OPCA(k=3, transform=transformN)
    gramSVD.train(x=col_indices, training_frame=australia)
    gram_out = gramSVD._model_json["output"]

    # check PCA with PCA set to Randomized
    print("@@@@@@  Building PCA with Randomized...\n")
    randomizedPCA = H2OPCA(k=3,
                           transform=transformN,
                           pca_method="Randomized",
                           compute_metrics=True,
                           use_all_factor_levels=True)
    randomizedPCA.train(x=col_indices, training_frame=australia)
    randomized_out = randomizedPCA._model_json["output"]

    # compare singular values and stuff with GramSVD
    print("@@@@@@  Comparing eigenvalues between GramSVD and Randomized...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(gram_out["importance"],
                                             randomized_out["importance"],
                                             importance_rows,
                                             tolerance=1e-3)
    print("@@@@@@  Comparing eigenvectors between GramSVD and Randomized...\n")
    # compare singular vectors
    pyunit_utils.assert_H2OTwoDimTable_equal(gram_out["eigenvectors"],
                                             randomized_out["eigenvectors"],
                                             randomized_out["names"],
                                             tolerance=5e-2,
                                             check_sign=True)

    # check PCA with PCA set to Power
    print("@@@@@@  Building PCA with Power...\n")
    powerPCA = H2OPCA(k=3,
                      transform=transformN,
                      pca_method="Power",
                      compute_metrics=True,
                      use_all_factor_levels=True)
    powerPCA.train(x=col_indices, training_frame=australia)
    power_out = powerPCA._model_json["output"]

    # compare singular values and stuff with GramSVD
    # (no explicit tolerance: the helper's default is used here)
    print("@@@@@@  Comparing eigenvalues between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(gram_out["importance"],
                                             power_out["importance"],
                                             importance_rows)
    print("@@@@@@  Comparing eigenvectors between GramSVD and Power...\n")
    # compare singular vectors
    pyunit_utils.assert_H2OTwoDimTable_equal(gram_out["eigenvectors"],
                                             power_out["eigenvectors"],
                                             power_out["names"],
                                             tolerance=1e-5,
                                             check_sign=True)

    # check PCA with PCA set to GLRM
    print("@@@@@@  Building PCA with GLRM...\n")
    glrmPCA = H2OPCA(k=3,
                     transform=transformN,
                     pca_method="GLRM",
                     compute_metrics=True,
                     use_all_factor_levels=True)
    glrmPCA.train(x=col_indices, training_frame=australia)
    glrm_out = glrmPCA._model_json["output"]

    # compare singular values and stuff with GramSVD
    print("@@@@@@  Comparing eigenvalues between GramSVD and GLRM...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(gram_out["importance"],
                                             glrm_out["importance"],
                                             importance_rows,
                                             tolerance=2e-2)
    print("@@@@@@  Comparing eigenvectors between GramSVD and GLRM...\n")
    # compare singular vectors
    pyunit_utils.assert_H2OTwoDimTable_equal(gram_out["eigenvectors"],
                                             glrm_out["eigenvectors"],
                                             glrm_out["names"],
                                             tolerance=2e-1,
                                             check_sign=True)

    # make sure we find the scoring history and it is not empty for all the PCA modes
    # just check and make sure the cell_values exceed 0
    # One message template keeps the method name in each failure message
    # correct (the Power message previously read "pca_method to using").
    empty_msg = "PCA Scoring history setting pca_method to {0} is empty."
    for method_name, model_out in (("GramSVD", gram_out),
                                   ("Power", power_out),
                                   ("Randomized", randomized_out),
                                   ("GLRM", glrm_out)):
        assert len(model_out["scoring_history"].cell_values) > 0, \
            empty_msg.format(method_name)
예제 #13
0
def pca_wideDataset_rotterdam():
    """Compare GramSVD PCA with one randomly selected alternative on Rotterdam data.

    One transform and one of three experiments (GLRM, Power, Randomized) are
    chosen at random.  The selected model and a GramSVD model are trained
    with k=8 on every column except ``relapse``; their importance and
    eigenvector tables are then compared.  All H2O objects are removed
    before returning.
    """
    h2o.remove_all()
    print("Importing Rotterdam.csv data...")
    rotterdamH2O = h2o.upload_file(
        pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip"))
    y = set(["relapse"])
    x = list(set(rotterdamH2O.names) - y)

    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types) - 1)]
    print("transform used on dataset is {0}.\n".format(transformN))
    # Exactly one experiment runs per call: 0 = GLRM, 1 = Power, 2 = Randomized.
    experiment = randint(0, 2)
    importance_rows = [
        "Standard deviation", "Cumulative Proportion", "Cumulative Proportion"
    ]

    if experiment == 0:
        # special test with GLRM.  Need use_all_levels to be true
        print("------  Testing GLRM PCA --------")
        gramSVD = H2OPCA(k=8,
                         impute_missing=True,
                         transform=transformN,
                         seed=12345,
                         use_all_factor_levels=True)
        gramSVD.train(x=x, training_frame=rotterdamH2O)

        glrmPCA = H2OGeneralizedLowRankEstimator(k=8,
                                                 transform=transformN,
                                                 seed=12345,
                                                 init="Random",
                                                 max_iterations=10,
                                                 recover_svd=True,
                                                 regularization_x="None",
                                                 regularization_y="None")
        glrmPCA.train(x=x, training_frame=rotterdamH2O)

        # compare singular values and stuff with GramSVD
        print("@@@@@@  Comparing eigenvalues between GramSVD and GLRM...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["importance"],
            glrmPCA._model_json["output"]["importance"],
            importance_rows,
            tolerance=1,
            check_all=False)

        # compare singular vectors (banner is printed right before the check
        # it describes, not ahead of the eigenvalue comparison)
        print("@@@@@@  Comparing eigenvectors between GramSVD and GLRM...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["eigenvectors"],
            glrmPCA._model_json["output"]["eigenvectors"],
            glrmPCA._model_json["output"]["names"],
            tolerance=1e-6,
            check_sign=True,
            check_all=False)
        h2o.remove(gramSVD)
        h2o.remove(glrmPCA)
    elif experiment == 1:
        print("------  Testing Power PCA --------")
        gramSVD = H2OPCA(k=8,
                         impute_missing=True,
                         transform=transformN,
                         seed=12345)
        gramSVD.train(x=x, training_frame=rotterdamH2O)
        powerPCA = H2OPCA(k=8,
                          impute_missing=True,
                          transform=transformN,
                          pca_method="Power",
                          seed=12345)
        powerPCA.train(x=x, training_frame=rotterdamH2O)

        # compare singular values and stuff with GramSVD
        print("@@@@@@  Comparing eigenvalues between GramSVD and Power...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["importance"],
            powerPCA._model_json["output"]["importance"],
            importance_rows,
            tolerance=1e-6,
            check_all=False)

        # compare singular vectors
        print("@@@@@@  Comparing eigenvectors between GramSVD and Power...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["eigenvectors"],
            powerPCA._model_json["output"]["eigenvectors"],
            powerPCA._model_json["output"]["names"],
            tolerance=1e-6,
            check_sign=True,
            check_all=False)
    else:
        print("------  Testing Randomized PCA --------")
        gramSVD = H2OPCA(k=8,
                         impute_missing=True,
                         transform=transformN,
                         seed=12345)
        gramSVD.train(x=x, training_frame=rotterdamH2O)
        randomizedPCA = H2OPCA(k=8,
                               impute_missing=True,
                               transform=transformN,
                               pca_method="Randomized",
                               seed=12345,
                               max_iterations=5)
        randomizedPCA.train(x=x, training_frame=rotterdamH2O)

        # compare singular values and stuff with GramSVD
        print(
            "@@@@@@  Comparing eigenvalues between GramSVD and Randomized...\n"
        )
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["importance"],
            randomizedPCA._model_json["output"]["importance"],
            importance_rows,
            tolerance=1e-1,
            check_all=False)

        # compare singular vectors (message now names Randomized, the model
        # actually being compared in this branch)
        print(
            "@@@@@@  Comparing eigenvectors between GramSVD and Randomized...\n"
        )
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["eigenvectors"],
            randomizedPCA._model_json["output"]["eigenvectors"],
            randomizedPCA._model_json["output"]["names"],
            tolerance=1e-6,
            check_sign=True,
            check_all=False)
    h2o.remove_all()
def pca_max_k():
    """
    Train PCA models with k=-1 under several pca_method settings and cross-check them.

    Steps performed:
      1. On the rotterdam dataset (all columns except "relapse"), build GramSVD and Power
         models, compare their importance tables, and check that the number of reported
         eigenvalues matches the resolved k read from full_parameters["k"]["actual_value"].
      2. On the prostate_cat dataset, build Randomized and Power models with and without
         use_all_factor_levels=True and compare their importance tables pairwise.
      3. On the prostate_cat dataset, build a GLRM model and check its eigenvalue count
         against its resolved k.

    NOTE(review): k=-1 presumably asks the backend to pick the maximum usable k; only the
    resolved value is inspected here -- confirm against the H2O PCA documentation.

    Raises:
        AssertionError: when an eigenvalue count differs from the resolved k, or when
            pyunit_utils.assert_H2OTwoDimTable_equal rejects a comparison.
    """
    data = h2o.upload_file(
        pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip"))
    y = set(["relapse"])
    x = list(set(data.names) - y)
    pcaGramSVD = H2OPCA(k=-1,
                        transform="STANDARDIZE",
                        pca_method="GramSVD",
                        impute_missing=True,
                        max_iterations=100)
    pcaGramSVD.train(x, training_frame=data)

    pcaPower = H2OPCA(k=-1,
                      transform="STANDARDIZE",
                      pca_method="Power",
                      impute_missing=True,
                      max_iterations=100,
                      seed=12345)
    pcaPower.train(x, training_frame=data)

    # compare singular values and stuff with GramSVD
    print("@@@@@@  Comparing eigenvalues between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        pcaGramSVD._model_json["output"]["importance"],
        pcaPower._model_json["output"]["importance"], [
            "Standard deviation", "Cumulative Proportion",
            "Cumulative Proportion"
        ],
        tolerance=1)

    correctEigNum = pcaPower.full_parameters["k"]["actual_value"]
    # Length of the first importance row minus one.
    # NOTE(review): the "- 1" presumably skips a leading row-label cell -- confirm.
    gramSVDNum = len(
        pcaGramSVD._model_json["output"]["importance"].cell_values[0]) - 1
    powerNum = len(
        pcaPower._model_json["output"]["importance"].cell_values[0]) - 1
    # The counts are integers, so the messages are built with format(); concatenating them
    # to a str with "+" would raise TypeError and hide the real assertion failure.
    assert correctEigNum == gramSVDNum, \
        "PCA GramSVD FAIL: expected number of eigenvalues: {0}, actual: {1}.".format(
            correctEigNum, gramSVDNum)
    assert correctEigNum == powerNum, \
        "PCA Power FAIL: expected number of eigenvalues: {0}, actual: {1}.".format(
            correctEigNum, powerNum)

    # Randomized and GLRM do not have a wide dataset implementation.  Check with smaller datasets
    data = h2o.upload_file(
        pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    x = list(set(data.names))
    pcaRandomized = H2OPCA(k=-1,
                           transform="STANDARDIZE",
                           pca_method="Randomized",
                           impute_missing=True,
                           max_iterations=100,
                           seed=12345)
    pcaRandomized.train(x, training_frame=data)
    # should still work with rank deficient dataset
    pcaRandomizedF = H2OPCA(k=-1,
                            transform="STANDARDIZE",
                            pca_method="Randomized",
                            use_all_factor_levels=True,
                            impute_missing=True,
                            max_iterations=100,
                            seed=12345)
    pcaRandomizedF.train(x, training_frame=data)

    pcaPower = H2OPCA(k=-1,
                      transform="STANDARDIZE",
                      pca_method="Power",
                      impute_missing=True,
                      max_iterations=100,
                      seed=12345)
    pcaPower.train(x, training_frame=data)
    # should still work with rank deficient dataset
    pcaPowerF = H2OPCA(k=-1,
                       transform="STANDARDIZE",
                       pca_method="Power",
                       use_all_factor_levels=True,
                       impute_missing=True,
                       max_iterations=100,
                       seed=12345)
    pcaPowerF.train(x, training_frame=data)

    # eigenvalues between the Power and Randomized PCA should be close
    print(
        "@@@@@@  Comparing eigenvalues between Randomized and Power PCA...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        pcaRandomized._model_json["output"]["importance"],
        pcaPower._model_json["output"]["importance"], [
            "Standard deviation", "Cumulative Proportion",
            "Cumulative Proportion"
        ])

    # eigenvalues between the Power and Randomized PCA should be close with rank deficient dataset
    print(
        "@@@@@@  Comparing eigenvalues between Randomized and Power PCA with rank deficient dataset...\n"
    )
    pyunit_utils.assert_H2OTwoDimTable_equal(
        pcaRandomizedF._model_json["output"]["importance"],
        pcaPowerF._model_json["output"]["importance"], [
            "Standard deviation", "Cumulative Proportion",
            "Cumulative Proportion"
        ])

    pcaGLRM = H2OPCA(k=-1,
                     transform="STANDARDIZE",
                     pca_method="GLRM",
                     use_all_factor_levels=True,
                     max_iterations=100,
                     seed=12345)
    pcaGLRM.train(x, training_frame=data)
    correctEigNum = pcaGLRM.full_parameters["k"]["actual_value"]
    glrmNum = len(
        pcaGLRM._model_json["output"]["importance"].cell_values[0]) - 1
    assert correctEigNum == glrmNum, \
        "PCA GLRM FAIL: expected number of eigenvalues: {0}, actual: {1}.".format(
            correctEigNum, glrmNum)
def pca_max_k():
    """
    Run one randomly chosen PCA sub-test with k=-1.

    Exactly one of four sub-tests is enabled per invocation (picked with randint):
      test 1: GramSVD vs Power on the rotterdam dataset (all columns except "relapse"),
              comparing importance tables and checking the eigenvalue count against the
              resolved k read from full_parameters["k"]["actual_value"].
      test 2: Randomized vs Power on the prostate_cat dataset.
      test 3: Randomized vs Power on the prostate_cat dataset with use_all_factor_levels=True.
      test 4: GLRM on the prostate_cat dataset, checking the eigenvalue count against the
              resolved k.

    Both datasets are uploaded regardless of which sub-test is selected.

    NOTE(review): k=-1 presumably asks the backend to pick the maximum usable k; only the
    resolved value is inspected here -- confirm against the H2O PCA documentation.

    Raises:
        AssertionError: when an eigenvalue count differs from the resolved k, or when
            pyunit_utils.assert_H2OTwoDimTable_equal rejects a comparison.
    """
    data = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip"))
    y = set(["relapse"])
    x = list(set(data.names) - y)

    # Enable a single sub-test at random.
    buildModel = [False, False, False, False]
    buildModel[randint(0, len(buildModel)-1)] = True

    # test 1
    if buildModel[0]:
        pcaGramSVD = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="GramSVD", impute_missing=True, max_iterations=100)
        pcaGramSVD.train(x, training_frame=data)
        pcaPower = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Power", impute_missing=True, max_iterations=100,
                          seed=12345)
        pcaPower.train(x, training_frame=data)

        # compare singular values and stuff with GramSVD
        print("@@@@@@  Comparing eigenvalues between GramSVD and Power...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(pcaGramSVD._model_json["output"]["importance"],
                                                 pcaPower._model_json["output"]["importance"],
                                                 ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
                                                 tolerance=1)

        correctEigNum = pcaPower.full_parameters["k"]["actual_value"]
        # Length of the first importance row minus one.
        # NOTE(review): the "- 1" presumably skips a leading row-label cell -- confirm.
        gramSVDNum = len(pcaGramSVD._model_json["output"]["importance"].cell_values[0]) - 1
        powerNum = len(pcaPower._model_json["output"]["importance"].cell_values[0]) - 1
        # The counts are integers, so the messages are built with format(); concatenating
        # them to a str with "+" would raise TypeError and hide the real assertion failure.
        assert correctEigNum == gramSVDNum, \
            "PCA GramSVD FAIL: expected number of eigenvalues: {0}, actual: {1}.".format(correctEigNum, gramSVDNum)
        assert correctEigNum == powerNum, \
            "PCA Power FAIL: expected number of eigenvalues: {0}, actual: {1}.".format(correctEigNum, powerNum)

    # Randomized and GLRM do not have a wide dataset implementation.  Check with smaller datasets
    data = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    x = list(set(data.names))

    # test 2
    if buildModel[1]:
        pcaRandomized = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Randomized",
                               impute_missing=True, max_iterations=100, seed=12345)
        pcaRandomized.train(x, training_frame=data)

        pcaPower = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Power",
                          impute_missing=True, max_iterations=100, seed=12345)
        pcaPower.train(x, training_frame=data)
        # eigenvalues between the Power and Randomized PCA should be close
        print("@@@@@@  Comparing eigenvalues between Randomized and Power PCA...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(pcaRandomized._model_json["output"]["importance"],
                                                 pcaPower._model_json["output"]["importance"],
                                                 ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"])

    # test 3
    if buildModel[2]:
        # should still work with rank deficient dataset
        pcaRandomizedF = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Randomized", use_all_factor_levels=True,
                                impute_missing=True, max_iterations=100, seed=12345)
        pcaRandomizedF.train(x, training_frame=data)
        # should still work with rank deficient dataset
        pcaPowerF = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Power", use_all_factor_levels=True,
                           impute_missing=True, max_iterations=100, seed=12345)
        pcaPowerF.train(x, training_frame=data)

        # eigenvalues between the Power and Randomized PCA should be close with rank deficient dataset
        print("@@@@@@  Comparing eigenvalues between Randomized and Power PCA with rank deficient dataset...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(pcaRandomizedF._model_json["output"]["importance"],
                                                 pcaPowerF._model_json["output"]["importance"],
                                                 ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"])

    # test 4
    if buildModel[3]:
        pcaGLRM = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="GLRM", use_all_factor_levels=True,
                         max_iterations=100, seed=12345)
        pcaGLRM.train(x, training_frame=data)
        correctEigNum = pcaGLRM.full_parameters["k"]["actual_value"]
        glrmNum = len(pcaGLRM._model_json["output"]["importance"].cell_values[0]) - 1
        assert correctEigNum == glrmNum, \
            "PCA GLRM FAIL: expected number of eigenvalues: {0}, actual: {1}.".format(correctEigNum, glrmNum)
def pca_scoring_history_importance():
    """
    Check that PCA returns a non-empty scoring history and a consistent importance table
    for every pca_method, since these were reported missing for certain PCA modes.

    A transform type is picked at random from NONE/STANDARDIZE/NORMALIZE/DEMEAN/DESCALE and
    used for every model.  A baseline model is built without an explicit pca_method (it is
    labelled "GramSVD" in the messages) and its importance table and eigenvectors are
    compared against models built with pca_method set to Randomized, Power and GLRM, each
    with its own tolerance.  Finally the scoring history of all four models is checked to
    contain at least one row.

    Raises:
        AssertionError: when a scoring history is empty, or when
            pyunit_utils.assert_H2OTwoDimTable_equal rejects a comparison.
    """
    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types)-1)]

    print("Importing australia.csv data...\n")
    australia = h2o.upload_file(pyunit_utils.locate("smalldata/extdata/australia.csv"))
    col_indices = list(range(0, australia.ncol))

    print("transform is {0}.\n".format(transformN))
    # checking out PCA with GramSVD
    print("@@@@@@  Building PCA with GramSVD...\n")
    gramSVD = H2OPCA(k=3, transform=transformN)
    gramSVD.train(x=col_indices, training_frame=australia)

    # check PCA with pca_method set to Randomized
    print("@@@@@@  Building PCA with Randomized...\n")
    randomizedPCA = H2OPCA(k=3, transform=transformN, pca_method="Randomized", compute_metrics=True,
                           use_all_factor_levels=True)
    randomizedPCA.train(x=col_indices, training_frame=australia)

    # compare singular values and stuff with GramSVD
    print("@@@@@@  Comparing eigenvalues between GramSVD and Randomized...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["importance"],
                                             randomizedPCA._model_json["output"]["importance"],
                                             ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
                                             tolerance=1e-3)
    print("@@@@@@  Comparing eigenvectors between GramSVD and Randomized...\n")
    # compare singular vectors
    pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["eigenvectors"],
                                             randomizedPCA._model_json["output"]["eigenvectors"],
                                             randomizedPCA._model_json["output"]["names"], tolerance=5e-2,
                                             check_sign=True)

    # check PCA with pca_method set to Power
    print("@@@@@@  Building PCA with Power...\n")
    powerPCA = H2OPCA(k=3, transform=transformN, pca_method="Power", compute_metrics=True, use_all_factor_levels=True)
    powerPCA.train(x=col_indices, training_frame=australia)

    # compare singular values and stuff with GramSVD
    print("@@@@@@  Comparing eigenvalues between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["importance"],
                                             powerPCA._model_json["output"]["importance"],
                                             ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"])
    print("@@@@@@  Comparing eigenvectors between GramSVD and Power...\n")
    # compare singular vectors
    pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["eigenvectors"],
                                             powerPCA._model_json["output"]["eigenvectors"],
                                             powerPCA._model_json["output"]["names"], tolerance=1e-5, check_sign=True)

    # check PCA with pca_method set to GLRM
    print("@@@@@@  Building PCA with GLRM...\n")
    glrmPCA = H2OPCA(k=3, transform=transformN, pca_method="GLRM", compute_metrics=True, use_all_factor_levels=True)
    glrmPCA.train(x=col_indices, training_frame=australia)

    # compare singular values and stuff with GramSVD
    print("@@@@@@  Comparing eigenvalues between GramSVD and GLRM...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["importance"],
                                             glrmPCA._model_json["output"]["importance"],
                                             ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
                                             tolerance=2e-2)
    print("@@@@@@  Comparing eigenvectors between GramSVD and GLRM...\n")
    # compare singular vectors
    pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["eigenvectors"],
                                             glrmPCA._model_json["output"]["eigenvectors"],
                                             glrmPCA._model_json["output"]["names"], tolerance=2e-1, check_sign=True)

    # make sure we find the scoring history and it is not empty for all the PCA modes
    # just check and make sure the cell_values exceed 0
    models_by_method = [
        ("GramSVD", gramSVD),
        ("Power", powerPCA),
        ("Randomized", randomizedPCA),
        ("GLRM", glrmPCA),
    ]
    for method_name, model in models_by_method:
        # Name the offending pca_method in the message so a failure is attributable.
        assert len(model._model_json["output"]["scoring_history"].cell_values) > 0, \
            "PCA Scoring history setting pca_method to {0} is empty.".format(method_name)