def word2vec_export():
    print("###### WORD2VEC ######")
    words = h2o.create_frame(rows=1000, cols=1, string_fraction=1.0, missing_fraction=0.0)
    embeddings = h2o.create_frame(rows=1000, cols=100, real_fraction=1.0, missing_fraction=0.0)
    frame = words.cbind(embeddings)

    model = H2OWord2vecEstimator(pre_trained=frame)
    model.train(training_frame=frame)

    expect_error(model.download_pojo, model="Word2Vec", format="POJO")
    model.download_mojo(path=RESULT_DIR)
def pubdev_5112():
    words = h2o.create_frame(rows=10, cols=1, string_fraction=1.0, missing_fraction=0.0)
    embeddings = h2o.create_frame(rows=10, cols=100, real_fraction=1.0, missing_fraction=0.0)
    word_embeddings = words.cbind(embeddings)

    w2v_model = H2OWord2vecEstimator.from_external(external=word_embeddings)
    model_id = w2v_model.model_id
    model = h2o.get_model(model_id)
    assert model, "Word2Vec model without a training frame was not retrieved"

    # The leading column must be of type String
    leading_column_string_error = False
    try:
        string_frame = h2o.create_frame(rows=10, cols=10, real_fraction=1.0, missing_fraction=0.0)
        H2OWord2vecEstimator.from_external(external=string_frame)
    except H2OValueError:
        leading_column_string_error = True
    assert leading_column_string_error, "Word2Vec pre-trained model should be checked for the leading column" \
                                        " to be string"

    # All other columns must be of a non-string type
    multiple_string_columns_error = False
    try:
        string_frame = h2o.create_frame(rows=10, cols=10, string_fraction=1.0, missing_fraction=0.0)
        H2OWord2vecEstimator.from_external(external=string_frame)
    except H2OValueError:
        multiple_string_columns_error = True
    assert multiple_string_columns_error, "Word2Vec pre-trained model should be checked for columns not to have a" \
                                          " String type except for the leading column"
def word2vec_get_model():
    print("Test retrieving a word2vec model by a key")

    words = h2o.create_frame(rows=1000, cols=1, string_fraction=1.0, missing_fraction=0.0)
    embeddings = h2o.create_frame(rows=1000, cols=100, real_fraction=1.0, missing_fraction=0.0)
    word_embeddings = words.cbind(embeddings)

    w2v_model = H2OWord2vecEstimator(pre_trained=word_embeddings)
    w2v_model.train(training_frame=word_embeddings)

    model_id = w2v_model.model_id
    model = h2o.get_model(model_id)
    assert model, "Model was not retrieved"
def word2vec():
    print("word2vec smoke test on text8 dataset")

    train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/text8.gz"), header=1, col_types=["string"])

    w2v_model = H2OWord2vecEstimator(epochs=1)
    w2v_model.train(training_frame=train)

    synonyms = w2v_model.find_synonyms("horse", 3)
    print(synonyms)
    assert bool(synonyms), "synonyms should not be empty"
def word2vec_to_frame():
    print("Test converting a word2vec model to a Frame")

    words = h2o.create_frame(rows=1000, cols=1, string_fraction=1.0, missing_fraction=0.0)
    embeddings = h2o.create_frame(rows=1000, cols=100, real_fraction=1.0, missing_fraction=0.0)
    word_embeddings = words.cbind(embeddings)

    w2v_model = H2OWord2vecEstimator(pre_trained=word_embeddings)
    w2v_model.train(training_frame=word_embeddings)

    w2v_frame = w2v_model.to_frame()

    # Compare the source embeddings with the frame generated from the model
    word_embeddings.names = w2v_frame.names
    assert word_embeddings.as_data_frame().equals(w2v_frame.as_data_frame()), \
        "Source and generated embeddings should match"
def word2vec():
    for word_model in ["SkipGram", "CBOW"]:
        print("word2vec %s smoke test on text8 dataset" % word_model)

        train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/text8.gz"), header=1, col_types=["string"])

        w2v_model = H2OWord2vecEstimator(epochs=1, word_model=word_model)
        w2v_model.train(training_frame=train)

        synonyms = w2v_model.find_synonyms("horse", 3)
        print(synonyms)
        assert len(synonyms) == 3, "there should be three synonyms"
def train_w2v(df, epochs=None, save_dir=None):
    """Trains a word2vec model on all text columns of df.
       Returns a w2v model object that can transform data.
    """
    print("training word2vec model ...")
    args = {}
    if epochs is not None:
        args['epochs'] = int(epochs)
    if save_dir is not None:
        args['export_checkpoints_dir'] = os.path.join(save_dir, "h2o_model/")

    df = df.copy()
    text_columns = get_text_cols(df)
    print("Text columns are: ", text_columns)
    df_text = df[text_columns]
    text_frame = H2OFrame(df_text)
    for col in text_columns:
        text_frame[col] = text_frame[col].ascharacter()
    words = text_frame.tokenize(" ")

    w2v_model = H2OWord2vecEstimator(sent_sample_rate=0.0, **args)
    w2v_model.train(training_frame=words)
    w2v_model.text_columns = text_columns
    return w2v_model
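# Hedged sketch (not part of the original code): one way the model returned by
# train_w2v could be used to embed the text columns of another pandas frame.
# The name transform_w2v is an assumption; it relies only on the text_columns
# attribute stashed on the model above.
def transform_w2v(df, w2v_model):
    """Averages the word vectors of each row's tokenized text columns."""
    df_text = df[w2v_model.text_columns].copy()
    text_frame = H2OFrame(df_text)
    for col in w2v_model.text_columns:
        text_frame[col] = text_frame[col].ascharacter()
    words = text_frame.tokenize(" ")
    # AVERAGE collapses the word vectors of each tokenized sequence into one row
    return w2v_model.transform(words, aggregate_method="AVERAGE")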
def algo_max_runtime_secs():
    '''
    This pyunit test is written to ensure that the various models will not crash if the
    max_runtime_secs is set to be too short.  See PUBDEV-4802.
    '''
    global model_within_max_runtime
    seed = 12345

    # word2vec
    train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/text8.gz"), header=1, col_types=["string"])
    used = train[0:170000, 0]
    w2v_model = H2OWord2vecEstimator()
    grabRuntimeInfo(w2v_model, used, [], 0)
    cleanUp([train, used, w2v_model])

    # kmeans
    training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/kmeans_8_centers_3_coords.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OKMeansEstimator(k=10)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([training1_data, model])

    # PCA, pca_method=Power
    training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/pca1000by25.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OPCA(k=10, transform="STANDARDIZE", pca_method="Power", compute_metrics=True)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([model])

    # PCA, pca_method=Randomized
    model = H2OPCA(k=10, transform="STANDARDIZE", pca_method="Randomized", compute_metrics=True)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([model])

    # PCA, pca_method=GLRM
    model = H2OPCA(k=10, transform="STANDARDIZE", pca_method="GLRM", compute_metrics=True,
                   use_all_factor_levels=True)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([model])

    # deeplearning
    training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/gaussian_training1_set.csv"))
    y_index = training1_data.ncol - 1
    x_indices = list(range(y_index))
    model = H2ODeepLearningEstimator(distribution='gaussian', seed=seed, hidden=[10, 10, 10])
    grabRuntimeInfo(model, training1_data, x_indices, y_index)
    cleanUp([training1_data, model])

    # stacked ensemble, the stacking part is not iterative
    print("******************** Skip testing stacked ensemble.  Not an iterative algo.")

    # GBM run
    training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/multinomial_training1_set.csv"))
    y_index = training1_data.ncol - 1
    x_indices = list(range(y_index))
    training1_data[y_index] = training1_data[y_index].round().asfactor()
    model = H2OGradientBoostingEstimator(distribution="multinomial", seed=seed)
    grabRuntimeInfo(model, training1_data, x_indices, y_index)
    cleanUp([model])

    # GLM run
    model = H2OGeneralizedLinearEstimator(family='multinomial', seed=seed)
    grabRuntimeInfo(model, training1_data, x_indices, y_index)
    cleanUp([model])

    # Naive Bayes, not iterative
    print("******************** Skip testing Naive Bayes.  Not an iterative algo.")

    # random forest
    model = H2ORandomForestEstimator(ntrees=100, score_tree_interval=0)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([model, training1_data])

    # GLRM, it does not make sense to stop in the middle of an iteration
    training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/glrmdata1000x25.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OGeneralizedLowRankEstimator(k=10, loss="Quadratic", gamma_x=0.3, gamma_y=0.3,
                                           transform="STANDARDIZE", recover_svd=True)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([training1_data, model])

    if sum(model_within_max_runtime) > 0:
        sys.exit(1)
def algo_max_runtime_secs():
    '''
    This pyunit test is written to ensure that max_runtime_secs can restrict the model training time
    for all h2o algos.  See PUBDEV-4702.
    '''
    global model_within_max_runtime
    global err_bound
    seed = 12345

    # deeplearning
    training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/gaussian_training1_set.csv"))
    y_index = training1_data.ncol - 1
    x_indices = list(range(y_index))
    model = H2ODeepLearningEstimator(distribution='gaussian', seed=seed, hidden=[10, 10, 10])
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices, y_index)
    cleanUp([training1_data, model])

    # stacked ensemble, the stacking part is not iterative
    print("******************** Skip testing stacked ensemble.  Not an iterative algo.")

    # GBM run
    training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/multinomial_training1_set.csv"))
    y_index = training1_data.ncol - 1
    x_indices = list(range(y_index))
    training1_data[y_index] = training1_data[y_index].round().asfactor()
    model = H2OGradientBoostingEstimator(distribution="multinomial", seed=seed)
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices, y_index)
    cleanUp([model])

    # GLM run
    model = H2OGeneralizedLinearEstimator(family='multinomial', seed=seed)
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices, y_index)
    cleanUp([model])

    # Naive Bayes, not iterative
    print("******************** Skip testing Naive Bayes.  Not an iterative algo.")

    # random forest
    model = H2ORandomForestEstimator(ntrees=100, score_tree_interval=0)
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices)
    cleanUp([model, training1_data])

    # deepwater
    if H2ODeepWaterEstimator.available():
        training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
        training1_data = training1_data.drop('Site')
        training1_data['Angaus'] = training1_data['Angaus'].asfactor()
        y_index = "Angaus"
        x_indices = list(range(1, training1_data.ncol))
        model = H2ODeepWaterEstimator(epochs=50, hidden=[4096, 4096, 4096],
                                      hidden_dropout_ratios=[0.2, 0.2, 0.2])
        grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices, y_index)
        cleanUp([training1_data, model])

    # GLRM, it does not make sense to stop in the middle of an iteration
    training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/glrmdata1000x25.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OGeneralizedLowRankEstimator(k=10, loss="Quadratic", gamma_x=0.3, gamma_y=0.3,
                                           transform="STANDARDIZE")
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices)
    cleanUp([training1_data, model])

    # PCA
    training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/pca1000by25.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OPCA(k=10, transform="STANDARDIZE", pca_method="Power", compute_metrics=True)
    grabRuntimeInfo(err_bound * 3, 1.2, model, training1_data, x_indices)
    cleanUp([training1_data, model])

    # kmeans
    training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/kmeans_8_centers_3_coords.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OKMeansEstimator(k=10)
    grabRuntimeInfo(err_bound * 2, 2.0, model, training1_data, x_indices)
    cleanUp([training1_data, model])

    # word2vec
    train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/text8.gz"), header=1, col_types=["string"])
    used = train[0:170000, 0]
    w2v_model = H2OWord2vecEstimator()
    grabRuntimeInfo(err_bound, 2.0, w2v_model, used, [], 0)
    cleanUp([train, used, w2v_model])

    if sum(model_within_max_runtime) > 0:
        sys.exit(1)
def predict(title, w2v, gbm):
    words = tokenize(h2o.H2OFrame(title).ascharacter())
    title_vec = w2v.transform(words, aggregate_method="AVERAGE")
    print(gbm.predict(test_data=title_vec))


print("Break malware asm codes into a sequence of words")
words = tokenize(malware_dataset["asm_codes"])

print("Build word2vec model")
w2v_model = H2OWord2vecEstimator(model_id="word2vec_model_malware",
                                 init_learning_rate=1.0,
                                 window_size=5,
                                 vec_size=200,
                                 sent_sample_rate=0.0,
                                 epochs=10)
w2v_model.train(training_frame=words)

print("Calculate a vector for each sequence of malware asm codes")
malware_vecs = w2v_model.transform(words, aggregate_method="AVERAGE")

print("Prepare training & validation data (keep only malware made of known words)")
valid_malware_codes = ~malware_vecs["C1"].isna()
data = malware_dataset[valid_malware_codes, :].cbind(malware_vecs[valid_malware_codes, :])
train_split, valid_split = data.split_frame(ratios=[0.8])
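# Hedged continuation sketch (not from the original snippet): a GBM could be fit on
# the averaged embeddings and handed to the predict() helper above. The response
# column name "label" is an assumption, as is using every embedding column as a feature.
from h2o.estimators.gbm import H2OGradientBoostingEstimator

gbm_model = H2OGradientBoostingEstimator(seed=1234)
gbm_model.train(x=malware_vecs.names,        # C1..C200 embedding columns
                y="label",                   # hypothetical response column from malware_dataset
                training_frame=train_split,
                validation_frame=valid_split)

# Score a new asm code sequence with the helper defined above
predict(["push ebp mov ebp esp"], w2v_model, gbm_model)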
def algo_max_runtime_secs():
    '''
    This pyunit test is written to ensure that column names and column types are returned in the model
    output for every algorithm supported by H2O.  See PUBDEV-5801.
    '''
    seed = 12345

    print("Checking GLM.....")
    training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    model = H2OGeneralizedLinearEstimator(family="binomial", alpha=1.0, lambda_search=False,
                                          max_iterations=2, seed=seed)
    checkColumnNamesTypesReturned(training1_data, model,
                                  ["displacement", "power", "weight", "acceleration", "year"],
                                  y_index="economy_20mpg")

    print("Checking GLRM.....")
    irisH2O = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=3, loss="Quadratic", gamma_x=0.5, gamma_y=0.5,
                                              transform="STANDARDIZE")
    checkColumnNamesTypesReturned(irisH2O, glrm_h2o, irisH2O.names)

    print("Checking NaiveBayes......")
    model = H2ONaiveBayesEstimator(laplace=0.25)
    x_indices = irisH2O.names
    y_index = x_indices[-1]
    x_indices.remove(y_index)
    checkColumnNamesTypesReturned(irisH2O, model, x_indices, y_index=y_index)

    # deeplearning
    print("Checking deeplearning.....")
    training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/gaussian_training1_set.csv"))
    x_indices = training1_data.names
    y_index = x_indices[-1]
    x_indices.remove(y_index)
    model = H2ODeepLearningEstimator(distribution='gaussian', seed=seed, hidden=[10, 10, 10])
    checkColumnNamesTypesReturned(training1_data, model, x_indices, y_index=y_index)

    # stacked ensemble, the stacking part is not iterative
    print("******************** Skip testing stacked ensemble.  Test done in pyunit_stackedensemble_regression.py.")

    # GBM run
    print("Checking GBM.....")
    training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/multinomial_training1_set.csv"))
    x_indices = training1_data.names
    y_index = x_indices[-1]
    x_indices.remove(y_index)
    training1_data[y_index] = training1_data[y_index].round().asfactor()
    model = H2OGradientBoostingEstimator(distribution="multinomial", seed=seed)
    checkColumnNamesTypesReturned(training1_data, model, x_indices, y_index=y_index)

    # random forest
    print("Checking Random Forest.....")
    model = H2ORandomForestEstimator(ntrees=100, score_tree_interval=0)
    checkColumnNamesTypesReturned(training1_data, model, x_indices, y_index=y_index)

    # PCA
    print("Checking PCA.....")
    training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/pca1000by25.csv"))
    x_indices = training1_data.names
    model = H2OPCA(k=10, transform="STANDARDIZE", pca_method="Power", compute_metrics=True)
    checkColumnNamesTypesReturned(training1_data, model, x_indices)

    # kmeans
    print("Checking kmeans....")
    training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/kmeans_8_centers_3_coords.csv"))
    x_indices = training1_data.names
    model = H2OKMeansEstimator(k=10)
    checkColumnNamesTypesReturned(training1_data, model, x_indices)

    # word2vec
    print("Checking word2vec....")
    train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/text8.gz"), header=1, col_types=["string"])
    used = train[0:170000, 0]
    w2v_model = H2OWord2vecEstimator()
    checkColumnNamesTypesReturned(train, w2v_model, [], 0)
    tokenized_words = tokenized_words[(tokenized_words.isna()) | (~tokenized_words.isin(STOP_WORDS)), :]
    return tokenized_words


# In[118]:

# Break loan description into sequence of words
words = tokenize(loans["desc"].ascharacter())


# In[119]:

# Train Word2Vec Model
from h2o.estimators.word2vec import H2OWord2vecEstimator

w2v_model = H2OWord2vecEstimator(vec_size=100, model_id="w2v.hex")
w2v_model.train(training_frame=words)


# In[120]:

# Sanity check - find synonyms for the word 'car'
w2v_model.find_synonyms("car", count=5)


# In[121]:

# Calculate a vector for each description
desc_vecs = w2v_model.transform(words, aggregate_method="AVERAGE")


# In[122]:

desc_vecs.head()
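# In[ ]:

# Hedged sketch of a possible next cell (not from the original notebook): attach the
# description vectors to the loans frame so a supervised model can use them as features.
# The ext_loans name is an assumption.
ext_loans = loans.cbind(desc_vecs)
ext_loans.head()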