def word2vec_export():
    print("###### WORD2VEC ######")
    words = h2o.create_frame(rows=1000, cols=1, string_fraction=1.0, missing_fraction=0.0)
    embeddings = h2o.create_frame(rows=1000, cols=100, real_fraction=1.0, missing_fraction=0.0)
    frame = words.cbind(embeddings)
    model = H2OWord2vecEstimator(pre_trained=frame)
    model.train(training_frame=frame)
    expect_error(model.download_pojo, model="Word2Vec", format="POJO")
    model.download_mojo(path=RESULT_DIR)
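The test above asserts that Word2Vec supports MOJO but not POJO export. A minimal follow-up sketch, assuming the generic h2o.upload_mojo API is available (it is not part of the test itself; model and RESULT_DIR are as above):

import h2o
mojo_path = model.download_mojo(path=RESULT_DIR)  # download_mojo returns the saved file's path
reloaded = h2o.upload_mojo(mojo_path)             # generic H2O model wrapping the MOJO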
Example #2
def pubdev_5112():
    words = h2o.create_frame(rows=10,
                             cols=1,
                             string_fraction=1.0,
                             missing_fraction=0.0)
    embeddings = h2o.create_frame(rows=10,
                                  cols=100,
                                  real_fraction=1.0,
                                  missing_fraction=0.0)
    word_embeddings = words.cbind(embeddings)

    w2v_model = H2OWord2vecEstimator.from_external(external=word_embeddings)

    model_id = w2v_model.model_id
    model = h2o.get_model(model_id)

    assert model, "Worder2Vec model without a training frame was retrived"

    # Only leading column should be of type String
    leading_column_string_error = False
    try:
        string_frame = h2o.create_frame(rows=10,
                                        cols=10,
                                        real_fraction=1.0,
                                        missing_fraction=0.0)
        H2OWord2vecEstimator.from_external(external=string_frame)
    except H2OValueError:
        leading_column_string_error = True

    assert leading_column_string_error, "Word2Vec pre-trained frame should require the leading column" \
                                        " to be of String type"
    # Other columns should be non-string type
    multiple_string_columns_error = False
    try:
        string_frame = h2o.create_frame(rows=10,
                                        cols=10,
                                        string_fraction=1.0,
                                        missing_fraction=0.0)
        H2OWord2vecEstimator.from_external(external=string_frame)
    except H2OValueError:
        multiple_string_columns_error = True

    assert multiple_string_columns_error, "Word2Vec pre-trained frame should reject String columns" \
                                          " other than the leading one"
Example #3
def word2vec_get_model():
    print("Test retrieving a word2vec model by a key")

    words = h2o.create_frame(rows=1000, cols=1, string_fraction=1.0, missing_fraction=0.0)
    embeddings = h2o.create_frame(rows=1000, cols=100, real_fraction=1.0, missing_fraction=0.0)
    word_embeddings = words.cbind(embeddings)

    w2v_model = H2OWord2vecEstimator(pre_trained=word_embeddings)
    w2v_model.train(training_frame=word_embeddings)

    model_id = w2v_model.model_id
    model = h2o.get_model(model_id)

    assert model, "Model was retrived"
Example #4
def word2vec():
    print("word2vec smoke test on text8 dataset")

    train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/text8.gz"),
                            header=1,
                            col_types=["string"])

    w2v_model = H2OWord2vecEstimator(epochs=1)
    w2v_model.train(training_frame=train)

    synonyms = w2v_model.find_synonyms("horse", 3)
    print(synonyms)

    assert bool(synonyms), "synonyms should not be empty"
Example #5
def word2vec_to_frame():
    print("Test converting a word2vec model to a Frame")

    words = h2o.create_frame(rows=1000, cols=1, string_fraction=1.0, missing_fraction=0.0)
    embeddings = h2o.create_frame(rows=1000, cols=100, real_fraction=1.0, missing_fraction=0.0)
    word_embeddings = words.cbind(embeddings)

    w2v_model = H2OWord2vecEstimator(pre_trained=word_embeddings)
    w2v_model.train(training_frame=word_embeddings)

    w2v_frame = w2v_model.to_frame()

    word_embeddings.names = w2v_frame.names
    assert word_embeddings.as_data_frame().equals(w2v_frame.as_data_frame()), "Source and generated embeddings do not match"
Example #6
def word2vec():
    for word_model in ["SkipGram", "CBOW"]:
        print("word2vec %s smoke test on text8 dataset" % word_model)

        train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/text8.gz"),
                                header=1,
                                col_types=["string"])

        w2v_model = H2OWord2vecEstimator(epochs=1, word_model=word_model)
        w2v_model.train(training_frame=train)

        synonyms = w2v_model.find_synonyms("horse", 3)
        print(synonyms)

        assert len(synonyms) == 3, "there should be three synonyms"
Example #7
def train_w2v(df, epochs=None, save_dir=None):
    """ trains word2vec model on all text columns of df.
        Returns w2v model object that can transform data.
    """
    print("training word2vec model ...")
    args = {}
    if epochs is not None:
        args['epochs'] = int(epochs)
    if save_dir is not None:
        args['export_checkpoints_dir'] = os.path.join(save_dir, "h2o_model/")

    df = df.copy()
    text_columns = get_text_cols(df)
    print("Text columns are: ", text_columns)
    df_text = df[text_columns]
    text_frame = H2OFrame(df_text)
    for col in text_columns:
        text_frame[col] = text_frame[col].ascharacter()

    words = text_frame.tokenize(" ")
    w2v_model = H2OWord2vecEstimator(sent_sample_rate=0.0, **args)
    w2v_model.train(training_frame=words)
    w2v_model.text_columns = text_columns
    return w2v_model
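A hedged usage sketch of train_w2v (df is assumed to be a pandas DataFrame; get_text_cols and H2OFrame come from the snippet above, and the aggregation step mirrors the transform calls in the later examples):

w2v = train_w2v(df, epochs=5)
text_frame = H2OFrame(df[w2v.text_columns])
for col in w2v.text_columns:
    text_frame[col] = text_frame[col].ascharacter()
words = text_frame.tokenize(" ")
vecs = w2v.transform(words, aggregate_method="AVERAGE")  # one averaged vector per text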
Example #8
def algo_max_runtime_secs():
    '''
    This pyunit test is written to ensure that the various models will not crash if max_runtime_secs
    is set too short.  See PUBDEV-4802.
    '''
    global model_within_max_runtime
    seed = 12345
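    # grabRuntimeInfo is defined elsewhere in the original test; judging from the
    # calls below, it trains the given model on the given frame/columns with a very
    # small max_runtime_secs and records in model_within_max_runtime whether the
    # run failed to respect the limit.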

    # word2vec
    train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/text8.gz"),
                            header=1,
                            col_types=["string"])
    used = train[0:170000, 0]
    w2v_model = H2OWord2vecEstimator()
    grabRuntimeInfo(w2v_model, used, [], 0)
    cleanUp([train, used, w2v_model])

    # kmeans
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/kmeans_8_centers_3_coords.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OKMeansEstimator(k=10)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([training1_data, model])

    # PCA, pca_method=Power
    training1_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/pca1000by25.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OPCA(k=10,
                   transform="STANDARDIZE",
                   pca_method="Power",
                   compute_metrics=True)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([model])

    # PCA, pca_method=Randomized
    model = H2OPCA(k=10,
                   transform="STANDARDIZE",
                   pca_method="Randomized",
                   compute_metrics=True)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([model])

    # PCA, pca_method=GLRM
    model = H2OPCA(k=10,
                   transform="STANDARDIZE",
                   pca_method="GLRM",
                   compute_metrics=True,
                   use_all_factor_levels=True)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([model])

    # deeplearning
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/gaussian_training1_set.csv"))
    y_index = training1_data.ncol - 1
    x_indices = list(range(y_index))
    model = H2ODeepLearningEstimator(distribution='gaussian',
                                     seed=seed,
                                     hidden=[10, 10, 10])
    grabRuntimeInfo(model, training1_data, x_indices, y_index)
    cleanUp([training1_data, model])

    # stack ensemble, stacking part is not iterative
    print(
        "******************** Skip testing stack ensemble.  Not an iterative algo."
    )

    # GBM run
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/multinomial_training1_set.csv"))
    y_index = training1_data.ncol - 1
    x_indices = list(range(y_index))
    training1_data[y_index] = training1_data[y_index].round().asfactor()
    model = H2OGradientBoostingEstimator(distribution="multinomial", seed=seed)
    grabRuntimeInfo(model, training1_data, x_indices, y_index)
    cleanUp([model])

    # GLM run
    model = H2OGeneralizedLinearEstimator(family='multinomial', seed=seed)
    grabRuntimeInfo(model, training1_data, x_indices, y_index)
    cleanUp([model])

    # naivebayes, not iterative
    print(
        "******************** Skip testing Naive Bayes.  Not an iterative algo."
    )

    # random forest
    model = H2ORandomForestEstimator(ntrees=100, score_tree_interval=0)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([model, training1_data])

    # GLRM, it does not make sense to stop in the middle of an iteration
    training1_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/glrmdata1000x25.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OGeneralizedLowRankEstimator(k=10,
                                           loss="Quadratic",
                                           gamma_x=0.3,
                                           gamma_y=0.3,
                                           transform="STANDARDIZE",
                                           recover_svd=True)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([training1_data, model])

    if sum(model_within_max_runtime) > 0:
        sys.exit(1)
Example #9
def algo_max_runtime_secs():
    '''
    This pyunit test is written to ensure that max_runtime_secs can restrict the model training time for all
    H2O algos.  See PUBDEV-4702.
    '''
    global model_within_max_runtime
    global err_bound
    seed = 12345

    # deeplearning
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/gaussian_training1_set.csv"))
    y_index = training1_data.ncol - 1
    x_indices = list(range(y_index))
    model = H2ODeepLearningEstimator(distribution='gaussian',
                                     seed=seed,
                                     hidden=[10, 10, 10])
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices, y_index)
    cleanUp([training1_data, model])

    # stack ensemble, stacking part is not iterative
    print(
        "******************** Skip testing stack ensemble.  Not an iterative algo."
    )

    # GBM run
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/multinomial_training1_set.csv"))
    y_index = training1_data.ncol - 1
    x_indices = list(range(y_index))
    training1_data[y_index] = training1_data[y_index].round().asfactor()
    model = H2OGradientBoostingEstimator(distribution="multinomial", seed=seed)
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices, y_index)
    cleanUp([model])

    # GLM run
    model = H2OGeneralizedLinearEstimator(family='multinomial', seed=seed)
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices, y_index)
    cleanUp([model])

    # naivebayes, not iterative
    print(
        "******************** Skip testing Naive Bayes.  Not an iterative algo."
    )

    # random forest
    model = H2ORandomForestEstimator(ntrees=100, score_tree_interval=0)
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices)
    cleanUp([model, training1_data])

    # deepwater
    if H2ODeepWaterEstimator.available():
        training1_data = h2o.import_file(
            path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
        training1_data = training1_data.drop('Site')
        training1_data['Angaus'] = training1_data['Angaus'].asfactor()
        y_index = "Angaus"
        x_indices = list(range(1, training1_data.ncol))
        model = H2ODeepWaterEstimator(epochs=50,
                                      hidden=[4096, 4096, 4096],
                                      hidden_dropout_ratios=[0.2, 0.2, 0.2])
        grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices,
                        y_index)
        cleanUp([training1_data, model])

    # GLRM, it does not make sense to stop in the middle of an iteration
    training1_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/glrmdata1000x25.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OGeneralizedLowRankEstimator(k=10,
                                           loss="Quadratic",
                                           gamma_x=0.3,
                                           gamma_y=0.3,
                                           transform="STANDARDIZE")
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices)
    cleanUp([training1_data, model])

    # PCA
    training1_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/pca1000by25.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OPCA(k=10,
                   transform="STANDARDIZE",
                   pca_method="Power",
                   compute_metrics=True)
    grabRuntimeInfo(err_bound * 3, 1.2, model, training1_data, x_indices)
    cleanUp([training1_data, model])

    # kmeans
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/kmeans_8_centers_3_coords.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OKMeansEstimator(k=10)
    grabRuntimeInfo(err_bound * 2, 2.0, model, training1_data, x_indices)
    cleanUp([training1_data, model])

    # word2vec
    train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/text8.gz"),
                            header=1,
                            col_types=["string"])
    used = train[0:170000, 0]
    w2v_model = H2OWord2vecEstimator()
    grabRuntimeInfo(err_bound, 2.0, w2v_model, used, [], 0)
    cleanUp([train, used, w2v_model])

    if sum(model_within_max_runtime) > 0:
        sys.exit(1)
Example #10

def predict(title, w2v, gbm):
    words = tokenize(h2o.H2OFrame(title).ascharacter())
    title_vec = w2v.transform(words, aggregate_method="AVERAGE")
    print(gbm.predict(test_data=title_vec))


print("Break malware asm codes into sequence of words")
words = tokenize(malware_dataset["asm_codes"])

print("Build word2vec model")

w2v_model = H2OWord2vecEstimator(model_id="word2vec_model_malware",
                                 init_learning_rate=1.0,
                                 window_size=5,
                                 vec_size=200,
                                 sent_sample_rate=0.0,
                                 epochs=10)

w2v_model.train(training_frame=words)

print("Calculate a vector for each malware asm codes")
malware_vecs = w2v_model.transform(words, aggregate_method="AVERAGE")

print(
    "Prepare training&validation data (keep only malware made of known words)")
valid_malware_codes = ~malware_vecs["C1"].isna()
data = malware_dataset[valid_malware_codes, :].cbind(
    malware_vecs[valid_malware_codes, :])
train_split, valid_split = data.split_frame(ratios=[0.8])
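A hedged continuation (the response column name "label" is an assumption): train a GBM on the averaged embeddings and score a new asm-code sequence with the predict() helper defined above.

from h2o.estimators.gbm import H2OGradientBoostingEstimator
gbm = H2OGradientBoostingEstimator()
gbm.train(x=malware_vecs.names,
          y="label",                              # assumed response column
          training_frame=train_split,
          validation_frame=valid_split)
predict(["push ebp mov ebp esp"], w2v_model, gbm)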
Example #11
def algo_max_runtime_secs():
    '''
    This pyunit test is written to ensure that column names and column types are returned in the model
      output for every algorithm supported by H2O.  See PUBDEV-5801.
    '''
    seed = 12345
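    # checkColumnNamesTypesReturned is defined elsewhere in the original test;
    # judging from the calls below, it trains the model on the given columns and
    # asserts that the trained model reports the same column names and types.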
    print("Checking GLM.....")
    training1_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    model = H2OGeneralizedLinearEstimator(family="binomial",
                                          alpha=1.0,
                                          lambda_search=False,
                                          max_iterations=2,
                                          seed=seed)
    checkColumnNamesTypesReturned(
        training1_data,
        model, ["displacement", "power", "weight", "acceleration", "year"],
        y_index="economy_20mpg")

    print("Checking GLRM.....")
    irisH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=3,
                                              loss="Quadratic",
                                              gamma_x=0.5,
                                              gamma_y=0.5,
                                              transform="STANDARDIZE")
    checkColumnNamesTypesReturned(irisH2O, glrm_h2o, irisH2O.names)

    print("Checking NaiveBayes......")
    model = H2ONaiveBayesEstimator(laplace=0.25)
    x_indices = irisH2O.names
    y_index = x_indices[-1]
    x_indices.remove(y_index)
    checkColumnNamesTypesReturned(irisH2O, model, x_indices, y_index=y_index)

    # deeplearning
    print("Checking deeplearning.....")
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/gaussian_training1_set.csv"))
    x_indices = training1_data.names
    y_index = x_indices[-1]
    x_indices.remove(y_index)
    model = H2ODeepLearningEstimator(distribution='gaussian',
                                     seed=seed,
                                     hidden=[10, 10, 10])
    checkColumnNamesTypesReturned(training1_data,
                                  model,
                                  x_indices,
                                  y_index=y_index)

    # stack ensemble, stacking part is not iterative
    print(
        "******************** Skip testing stack ensemble.  Test done in pyunit_stackedensemble_regression.py."
    )

    # GBM run
    print("Checking GBM.....")
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/multinomial_training1_set.csv"))
    x_indices = training1_data.names
    y_index = x_indices[-1]
    x_indices.remove(y_index)
    training1_data[y_index] = training1_data[y_index].round().asfactor()
    model = H2OGradientBoostingEstimator(distribution="multinomial", seed=seed)
    checkColumnNamesTypesReturned(training1_data,
                                  model,
                                  x_indices,
                                  y_index=y_index)

    # random forest
    print("Checking Random Forest.....")
    model = H2ORandomForestEstimator(ntrees=100, score_tree_interval=0)
    checkColumnNamesTypesReturned(training1_data,
                                  model,
                                  x_indices,
                                  y_index=y_index)

    # PCA
    print("Checking PCA.....")
    training1_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/pca1000by25.csv"))
    x_indices = training1_data.names
    model = H2OPCA(k=10,
                   transform="STANDARDIZE",
                   pca_method="Power",
                   compute_metrics=True)
    checkColumnNamesTypesReturned(training1_data, model, x_indices)

    # kmeans
    print("Checking kmeans....")
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/kmeans_8_centers_3_coords.csv"))
    x_indices = training1_data.names
    model = H2OKMeansEstimator(k=10)
    checkColumnNamesTypesReturned(training1_data, model, x_indices)

    # word2vec
    print("Checking word2vec....")
    train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/text8.gz"),
                            header=1,
                            col_types=["string"])
    used = train[0:170000, 0]
    w2v_model = H2OWord2vecEstimator()
    checkColumnNamesTypesReturned(train, w2v_model, [], 0)
Example #12
def tokenize(sentences):
    # (the earlier tokenization and filtering steps of this helper were truncated
    #  in the source; splitting on non-word characters is the assumed first step)
    tokenized_words = sentences.tokenize("\\W+")
    tokenized_words = tokenized_words[(tokenized_words.isna()) |
                                      (~tokenized_words.isin(STOP_WORDS)), :]
    return tokenized_words


# In[118]:

# Break loan description into sequence of words
words = tokenize(loans["desc"].ascharacter())

# In[119]:

# Train Word2Vec Model
from h2o.estimators.word2vec import H2OWord2vecEstimator

w2v_model = H2OWord2vecEstimator(vec_size=100, model_id="w2v.hex")
w2v_model.train(training_frame=words)

# In[120]:

# Sanity check - find synonyms for the word 'car'
w2v_model.find_synonyms("car", count=5)

# In[121]:

# Calculate a vector for each description
desc_vecs = w2v_model.transform(words, aggregate_method="AVERAGE")

# In[122]:

desc_vecs.head()
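# In[123]:

# A hedged next step mirroring the malware example above (not part of the
# original notebook): keep only rows whose description produced a valid
# vector, then combine the vectors with the loan data for supervised training.
valid_desc = ~ desc_vecs["C1"].isna()
data = loans[valid_desc, :].cbind(desc_vecs[valid_desc, :])
train_split, valid_split = data.split_frame(ratios=[0.8])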