def pred(datasource,
         estimator_string,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         feature_column_names_map,
         result_col_name,
         feature_metas={},
         model_params={},
         save="",
         batch_size=1,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         is_pai=False,
         pai_table=""):
    # import custom model package
    sqlflow_submitter.import_model_def(estimator_string, globals())
    estimator = eval(estimator_string)

    if not is_pai:
        conn = db.connect_with_data_source(datasource)

    model_params.update(feature_columns)

    is_estimator = issubclass(
        estimator,
        (tf.estimator.Estimator, tf.estimator.BoostedTreesClassifier,
         tf.estimator.BoostedTreesRegressor))
    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table, is_pai,
                      pai_table, feature_column_names, feature_metas,
                      result_col_name, datasource, select,
                      hdfs_namenode_addr, hive_location, hdfs_user, hdfs_pass)
    else:
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(estimator, model_params, save, result_table,
                          feature_column_names, feature_column_names_map,
                          feature_columns, feature_metas, result_col_name,
                          datasource, select, hdfs_namenode_addr,
                          hive_location, hdfs_user, hdfs_pass, is_pai,
                          pai_table)

    print("Done predicting. Predict table: %s" % result_table)
def evaluate(datasource,
             estimator_string,
             select,
             result_table,
             feature_columns,
             feature_column_names,
             feature_metas={},
             label_meta={},
             model_params={},
             validation_metrics=["Accuracy"],
             save="",
             batch_size=1,
             validation_steps=None,
             verbose=0,
             hdfs_namenode_addr="",
             hive_location="",
             hdfs_user="",
             hdfs_pass="",
             is_pai=False,
             pai_table=""):
    # import custom model package
    sqlflow_submitter.import_model_def(estimator_string, globals())
    estimator_cls = eval(estimator_string)
    is_estimator = is_tf_estimator(estimator_cls)
    set_log_level(verbose, is_estimator)

    eval_dataset, _ = get_dataset_fn(select,
                                     "",
                                     datasource,
                                     feature_column_names,
                                     feature_metas,
                                     label_meta,
                                     is_pai,
                                     pai_table,
                                     "",
                                     1,
                                     batch_size,
                                     1,
                                     is_estimator=is_estimator)

    model_params.update(feature_columns)
    if is_estimator:
        if is_pai:
            FLAGS = tf.app.flags.FLAGS
            model_params["model_dir"] = FLAGS.checkpointDir
        else:
            model_params["model_dir"] = save
        # TF estimators always accept the feature_columns argument
        estimator = estimator_cls(**model_params)
        result_metrics = estimator_evaluate(estimator, eval_dataset,
                                            validation_metrics)
    else:
        keras_model = init_model_with_feature_column(estimator_cls,
                                                     model_params)
        keras_model_pkg = sys.modules[estimator_cls.__module__]
        result_metrics = keras_evaluate(keras_model, eval_dataset, save,
                                        keras_model_pkg, validation_metrics)

    # write result metrics to a table
    if is_pai:
        driver = "pai_maxcompute"
        conn = None
    else:
        conn = connect_with_data_source(datasource)
        driver = conn.driver
    if result_table:
        metric_name_list = ["loss"] + validation_metrics
        write_result_metrics(result_metrics,
                             metric_name_list,
                             result_table,
                             driver,
                             conn,
                             hdfs_namenode_addr=hdfs_namenode_addr,
                             hive_location=hive_location,
                             hdfs_user=hdfs_user,
                             hdfs_pass=hdfs_pass)
def train(datasource,
          estimator_string,
          select,
          validation_select,
          feature_columns,
          feature_column_names,
          feature_metas={},
          label_meta={},
          model_params={},
          validation_metrics=["Accuracy"],
          save="",
          batch_size=1,
          epoch=1,
          validation_steps=1,
          verbose=0,
          max_steps=None,
          validation_start_delay_secs=0,
          validation_throttle_secs=0,
          save_checkpoints_steps=100,
          log_every_n_iter=10,
          load_pretrained_model=False,
          is_pai=False,
          pai_table="",
          pai_val_table="",
          feature_columns_code="",
          model_repo_image="",
          original_sql=""):
    model_meta = collect_model_metadata(original_sql, select,
                                        validation_select, estimator_string,
                                        model_params, feature_columns_code,
                                        feature_metas, label_meta, None,
                                        model_repo_image)
    # import custom model package
    sqlflow_submitter.import_model_def(estimator_string, globals())
    estimator = eval(estimator_string)
    is_estimator = is_tf_estimator(estimator)

    if is_pai and verbose < 1:
        # always use verbose == 1 when using PAI to get more logs
        verbose = 1
    set_log_level(verbose, is_estimator)

    # fill in feature columns parameters
    model_params.update(feature_columns)

    FLAGS = None
    num_workers = 1
    worker_id = 0
    # only support distributed training on PAI (TF version 1.x)
    if is_pai:
        FLAGS = define_tf_flags()
        set_oss_environs(FLAGS)
        num_workers = len(FLAGS.worker_hosts.split(","))
        worker_id = FLAGS.task_index

    # TODO(typhoonzero): remove this after updating the Keras models.
    # copy feature_name to name field for Keras functional models:
    # https://github.com/sql-machine-learning/models/blob/develop/sqlflow_models/dnnclassifier_functional_api_example.py
    for k in feature_metas:
        feature_metas[k]["name"] = feature_metas[k]["feature_name"]

    train_dataset_fn, val_dataset_fn = get_dataset_fn(
        select,
        validation_select,
        datasource,
        feature_column_names,
        feature_metas,
        label_meta,
        is_pai,
        pai_table,
        pai_val_table,
        epoch,
        batch_size,
        1000,
        num_workers=num_workers,
        worker_id=worker_id,
        is_estimator=is_estimator)

    if not is_estimator:  # keras
        if isinstance(estimator, types.FunctionType):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        keras_train_and_save(estimator, model_params, save, is_pai, FLAGS,
                             train_dataset_fn, val_dataset_fn, label_meta,
                             epoch, verbose, validation_metrics,
                             validation_steps, load_pretrained_model,
                             model_meta)
    else:
        estimator_train_and_save(estimator, model_params, save, is_pai,
                                 FLAGS, train_dataset_fn, val_dataset_fn,
                                 log_every_n_iter, max_steps,
                                 validation_start_delay_secs,
                                 validation_throttle_secs,
                                 save_checkpoints_steps, validation_metrics,
                                 load_pretrained_model, model_meta)

    # remove cache files
    any(map(os.remove, glob.glob('cache_train.*')))
    any(map(os.remove, glob.glob('cache_validation.*')))
    print("Done training")
def explain(datasource,
            estimator_string,
            select,
            feature_columns,
            feature_column_names,
            feature_metas={},
            label_meta={},
            model_params={},
            save="",
            is_pai=False,
            pai_table="",
            plot_type='bar',
            result_table="",
            hdfs_namenode_addr="",
            hive_location="",
            hdfs_user="",
            hdfs_pass="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None):
    # import custom model package
    sqlflow_submitter.import_model_def(estimator_string, globals())
    estimator_cls = eval(estimator_string)

    if is_pai:
        FLAGS = tf.app.flags.FLAGS
        model_params["model_dir"] = FLAGS.checkpointDir
    else:
        model_params['model_dir'] = save

    def _input_fn():
        if is_pai:
            dataset = input_fn("",
                               datasource,
                               feature_column_names,
                               feature_metas,
                               label_meta,
                               is_pai=True,
                               pai_table=pai_table)
        else:
            dataset = input_fn(select, datasource, feature_column_names,
                               feature_metas, label_meta)
        return dataset.batch(1).cache()

    model_params.update(feature_columns)
    estimator = init_model_with_feature_column(estimator_cls, model_params)
    if estimator_cls in (tf.estimator.BoostedTreesClassifier,
                         tf.estimator.BoostedTreesRegressor):
        explain_boosted_trees(datasource, estimator, _input_fn, plot_type,
                              result_table, feature_column_names, is_pai,
                              pai_table, hdfs_namenode_addr, hive_location,
                              hdfs_user, hdfs_pass, oss_dest, oss_ak, oss_sk,
                              oss_endpoint, oss_bucket_name)
    else:
        shap_dataset = pd.DataFrame(columns=feature_column_names)
        for i, (features, label) in enumerate(_input_fn()):
            shap_dataset.loc[i] = [
                item.numpy()[0][0] for item in features.values()
            ]
        explain_dnns(datasource, estimator, shap_dataset, plot_type,
                     result_table, feature_column_names, is_pai, pai_table,
                     hdfs_namenode_addr, hive_location, hdfs_user, hdfs_pass,
                     oss_dest, oss_ak, oss_sk, oss_endpoint, oss_bucket_name)
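
# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the submitter API): the commented
# call below only illustrates how the code generator might invoke pred().
# It assumes a reachable MySQL instance, a model previously trained and saved
# under "my_model", and the custom model package on PYTHONPATH. All table
# names, column names, and parameter values are placeholders, not values
# taken from this module.
# ---------------------------------------------------------------------------
# pred(datasource="mysql://user:pass@tcp(127.0.0.1:3306)/",
#      estimator_string="sqlflow_models.DNNClassifier",
#      select="SELECT * FROM iris.test",
#      result_table="iris.predict",
#      feature_columns={"feature_columns": []},
#      feature_column_names=["sepal_length", "sepal_width"],
#      feature_column_names_map={},
#      result_col_name="class",
#      feature_metas={},
#      model_params={"n_classes": 3},
#      save="my_model")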