def evaluate(datasource,
             estimator_string,
             select,
             result_table,
             feature_columns,
             feature_column_names,
             feature_metas={},
             label_meta={},
             model_params={},
             validation_metrics=["Accuracy"],
             save="",
             batch_size=1,
             validation_steps=None,
             verbose=0,
             pai_table=""):
    FLAGS = define_tf_flags()
    set_oss_environs(FLAGS)

    estimator_cls = import_model(estimator_string)
    is_estimator = is_tf_estimator(estimator_cls)
    set_log_level(verbose, is_estimator)

    is_pai = True if pai_table else False
    eval_dataset = get_dataset_fn(select,
                                  datasource,
                                  feature_column_names,
                                  feature_metas,
                                  label_meta,
                                  is_pai=is_pai,
                                  pai_table=pai_table,
                                  batch_size=batch_size)

    model_params.update(feature_columns)
    pop_optimizer_and_loss(model_params)

    if is_estimator:
        with open("exported_path", "r") as fid:
            exported_path = str(fid.read())

        model_params["warm_start_from"] = exported_path
        estimator = estimator_cls(**model_params)
        result_metrics = estimator_evaluate(estimator, eval_dataset,
                                            validation_metrics)
    else:
        keras_model = init_model_with_feature_column(estimator_cls,
                                                     model_params)
        keras_model_pkg = sys.modules[estimator_cls.__module__]
        result_metrics = keras_evaluate(keras_model, eval_dataset, save,
                                        keras_model_pkg, validation_metrics)

    if result_table:
        metric_name_list = ["loss"] + validation_metrics
        if is_pai:
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)
        write_result_metrics(result_metrics, metric_name_list, result_table,
                             conn)
        conn.close()


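# `pop_optimizer_and_loss` above is defined elsewhere in the runtime. Judging
# from the optimizer/loss handling in keras_compile below, it is assumed to
# strip the "optimizer" and "loss" entries out of model_params before the
# model constructor is called. A minimal sketch under that assumption, with a
# hypothetical name:
def sketch_pop_optimizer_and_loss(model_params):
    model_params.pop("optimizer", None)
    model_params.pop("loss", None)

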
def evaluate(datasource,
             estimator_string,
             select,
             result_table,
             feature_columns,
             feature_column_names,
             feature_metas={},
             label_meta={},
             model_params={},
             validation_metrics=["Accuracy"],
             save="",
             batch_size=1,
             validation_steps=None,
             verbose=0,
             hdfs_namenode_addr="",
             hive_location="",
             hdfs_user="",
             hdfs_pass=""):
    estimator_cls = import_model(estimator_string)
    is_estimator = is_tf_estimator(estimator_cls)
    set_log_level(verbose, is_estimator)
    eval_dataset = get_dataset_fn(select,
                                  datasource,
                                  feature_column_names,
                                  feature_metas,
                                  label_meta,
                                  is_pai=False,
                                  pai_table="",
                                  batch_size=batch_size)

    model_params.update(feature_columns)

    if is_estimator:
        model_params["model_dir"] = save
        estimator = estimator_cls(**model_params)
        result_metrics = estimator_evaluate(estimator, eval_dataset,
                                            validation_metrics)
    else:
        keras_model = init_model_with_feature_column(estimator_cls,
                                                     model_params)
        keras_model_pkg = sys.modules[estimator_cls.__module__]
        result_metrics = keras_evaluate(keras_model, eval_dataset, save,
                                        keras_model_pkg, validation_metrics)

    # write result metrics to a table
    conn = connect_with_data_source(datasource)
    driver = conn.driver
    if result_table:
        metric_name_list = ["loss"] + validation_metrics
        write_result_metrics(result_metrics,
                             metric_name_list,
                             result_table,
                             driver,
                             conn,
                             hdfs_namenode_addr=hdfs_namenode_addr,
                             hive_location=hive_location,
                             hdfs_user=hdfs_user,
                             hdfs_pass=hdfs_pass)


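# Both evaluate() variants above take `feature_metas` and `label_meta`
# dictionaries keyed by column name. A simplified, hypothetical example of
# their shape, restricted to the keys that functions in this file actually
# read ("dtype", "is_sparse", "delimiter_kv"); the metadata SQLFlow derives
# in practice carries additional fields:
SKETCH_FEATURE_METAS = {
    "sepal_length": {
        "dtype": "float32",
        "is_sparse": False,
        "delimiter_kv": "",
    },
}
SKETCH_LABEL_META = {"dtype": "int64"}

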
def _explain(datasource,
             estimator_string,
             select,
             feature_columns,
             feature_column_names,
             feature_metas={},
             label_meta={},
             model_params={},
             save="",
             pai_table="",
             plot_type='bar',
             result_table="",
             oss_dest=None,
             oss_ak=None,
             oss_sk=None,
             oss_endpoint=None,
             oss_bucket_name=None):
    estimator_cls = import_model(estimator_string)
    FLAGS = tf.app.flags.FLAGS
    model_params["model_dir"] = FLAGS.checkpointDir
    model_params.update(feature_columns)

    def _input_fn():
        dataset = input_fn("",
                           datasource,
                           feature_column_names,
                           feature_metas,
                           label_meta,
                           is_pai=True,
                           pai_table=pai_table)
        return dataset.batch(1).cache()

    estimator = init_model_with_feature_column(estimator_cls, model_params)
    driver = "pai_maxcompute"
    conn = None
    if estimator_cls in (tf.estimator.BoostedTreesClassifier,
                         tf.estimator.BoostedTreesRegressor):
        explain_boosted_trees(datasource, estimator, _input_fn, plot_type,
                              result_table, feature_column_names, driver,
                              conn, "", "", "", "", oss_dest, oss_ak, oss_sk,
                              oss_endpoint, oss_bucket_name)
    else:
        shap_dataset = pd.DataFrame(columns=feature_column_names)
        for i, (features, label) in enumerate(_input_fn()):
            shap_dataset.loc[i] = [
                item.numpy()[0][0] for item in features.values()
            ]
        explain_dnns(datasource, estimator, shap_dataset, plot_type,
                     result_table, feature_column_names, driver, conn, "", "",
                     "", "", oss_dest, oss_ak, oss_sk, oss_endpoint,
                     oss_bucket_name)


def _evaluate(datasource,
              estimator_string,
              select,
              result_table,
              feature_columns,
              feature_column_names,
              feature_metas={},
              label_meta={},
              model_params={},
              validation_metrics=["Accuracy"],
              save="",
              batch_size=1,
              validation_steps=None,
              verbose=0,
              pai_table=""):
    estimator_cls = import_model(estimator_string)
    is_estimator = is_tf_estimator(estimator_cls)
    set_log_level(verbose, is_estimator)
    eval_dataset = get_dataset_fn(select,
                                  datasource,
                                  feature_column_names,
                                  feature_metas,
                                  label_meta,
                                  is_pai=True,
                                  pai_table=pai_table,
                                  batch_size=batch_size)

    model_params.update(feature_columns)

    if is_estimator:
        FLAGS = tf.app.flags.FLAGS
        model_params["model_dir"] = FLAGS.checkpointDir
        estimator = estimator_cls(**model_params)
        result_metrics = estimator_evaluate(estimator, eval_dataset,
                                            validation_metrics)
    else:
        # NOTE: must use the estimator class here; the `estimator` instance
        # is only constructed in the tf.estimator branch above.
        keras_model = init_model_with_feature_column(estimator_cls,
                                                     model_params)
        keras_model_pkg = sys.modules[estimator_cls.__module__]
        result_metrics = keras_evaluate(keras_model, eval_dataset, save,
                                        keras_model_pkg, validation_metrics)

    if result_table:
        metric_name_list = ["loss"] + validation_metrics
        write_result_metrics(result_metrics,
                             metric_name_list,
                             result_table,
                             "pai_maxcompute",
                             None,
                             hdfs_namenode_addr="",
                             hive_location="",
                             hdfs_user="",
                             hdfs_pass="")


def explain(datasource,
            estimator_string,
            select,
            feature_columns,
            feature_column_names,
            feature_metas={},
            label_meta={},
            model_params={},
            save="",
            pai_table="",
            plot_type='bar',
            result_table="",
            hdfs_namenode_addr="",
            hive_location="",
            hdfs_user="",
            hdfs_pass="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None):
    estimator_cls = import_model(estimator_string)
    model_params['model_dir'] = save
    model_params.update(feature_columns)

    def _input_fn():
        dataset = input_fn(select, datasource, feature_column_names,
                           feature_metas, label_meta)
        return dataset.batch(1).cache()

    estimator = init_model_with_feature_column(estimator_cls, model_params)
    conn = connect_with_data_source(datasource)
    if estimator_cls in (tf.estimator.BoostedTreesClassifier,
                         tf.estimator.BoostedTreesRegressor):
        explain_boosted_trees(datasource, estimator, _input_fn, plot_type,
                              result_table, feature_column_names, conn.driver,
                              conn, hdfs_namenode_addr, hive_location,
                              hdfs_user, hdfs_pass, oss_dest, oss_ak, oss_sk,
                              oss_endpoint, oss_bucket_name)
    else:
        shap_dataset = pd.DataFrame(columns=feature_column_names)
        for i, (features, label) in enumerate(_input_fn()):
            shap_dataset.loc[i] = [
                item.numpy()[0][0] for item in features.values()
            ]
        explain_dnns(datasource, estimator, shap_dataset, plot_type,
                     result_table, feature_column_names, conn.driver, conn,
                     hdfs_namenode_addr, hive_location, hdfs_user, hdfs_pass,
                     oss_dest, oss_ak, oss_sk, oss_endpoint, oss_bucket_name)


def keras_train_and_save_legacy(estimator, model_params, save, FLAGS,
                                train_dataset_fn, val_dataset_fn, label_meta,
                                epochs, verbose, metric_names,
                                validation_steps, load_pretrained_model,
                                model_meta, is_pai):
    print("Start training using keras model...")
    try:
        classifier, has_none_optimizer = keras_compile(
            estimator, model_params, metric_names)
    except Exception as e:
        if hasattr(estimator, "sqlflow_train_loop"):
            sys.stderr.write(
                "compile keras model failed, ignoring this error "
                "since the model seems to have defined sqlflow_train_loop.")
            classifier = init_model_with_feature_column(
                estimator, model_params, has_none_optimizer=True)
            has_none_optimizer = True
        else:
            raise e

    train_dataset = train_dataset_fn()
    if val_dataset_fn is not None:
        validate_dataset = val_dataset_fn()
    else:
        validate_dataset = None

    if load_pretrained_model:
        # Must run one batch to initialize parameters before load_weights
        inputs, targets = next(iter(train_dataset.take(1)))
        classifier.evaluate(inputs, targets)

        # NOTE(sneaxiy): should we save/load optimizer info for incremental
        # training, or let users write the same WITH statements in SQL?
        load_keras_model_weights(classifier, save)

    if len(FLAGS.worker_hosts.split(",")) > 1:
        keras_train_distributed(classifier, model_params, save, model_meta,
                                FLAGS, train_dataset_fn, val_dataset_fn,
                                is_pai)
    else:
        keras_train_compiled(classifier, save, train_dataset,
                             validate_dataset, label_meta, epochs, verbose,
                             model_meta, validation_steps,
                             has_none_optimizer)

    if is_pai:
        print("saving keras model to: %s" % FLAGS.sqlflow_oss_modeldir)
        oss.save_dir(FLAGS.sqlflow_oss_modeldir, save)
        oss.save_file(FLAGS.sqlflow_oss_modeldir, "model_meta.json")


def keras_train_and_save(estimator, model_params, save, FLAGS,
                         train_dataset_fn, val_dataset_fn, label_meta, epochs,
                         verbose, metric_names, validation_steps, load,
                         model_meta, is_pai):
    print("Start training using keras model...")
    try:
        classifier, has_none_optimizer = keras_compile(
            estimator, model_params, metric_names)
    except Exception:
        if hasattr(estimator, "sqlflow_train_loop"):
            sys.stderr.write(
                "compile keras model failed, ignoring this error "
                "since the model seems to have defined sqlflow_train_loop.")
            classifier = init_model_with_feature_column(
                estimator, model_params, has_none_optimizer=True)
            has_none_optimizer = True
        else:
            six.reraise(*sys.exc_info())

    train_dataset = train_dataset_fn()
    if val_dataset_fn is not None:
        validate_dataset = val_dataset_fn()
    else:
        validate_dataset = None

    if load:
        # FIXME(typhoonzero): copied from runtime.tensorflow.train_keras
        inputs, targets = next(iter(train_dataset.take(1)))
        classifier.evaluate(inputs, targets)
        load_keras_model_weights(classifier, load)

    if len(FLAGS.worker_hosts.split(",")) > 1:
        keras_train_distributed(classifier, model_params, save, model_meta,
                                FLAGS, train_dataset_fn, val_dataset_fn,
                                is_pai)
    else:
        keras_train_compiled(classifier, save, train_dataset,
                             validate_dataset, label_meta, epochs, verbose,
                             model_meta, validation_steps,
                             has_none_optimizer)

    if is_pai:
        print("saving keras model to: %s" % FLAGS.sqlflow_oss_modeldir)
        oss.save_dir(FLAGS.sqlflow_oss_modeldir, save)
        oss.save_file(FLAGS.sqlflow_oss_modeldir, "model_meta.json")


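# keras_train_and_save (and the legacy variant above) tolerate a failed
# compile() only when the model class defines `sqlflow_train_loop`, i.e. the
# model drives its own training instead of relying on model.fit(). A minimal,
# hypothetical sketch of such a model follows; the class name and the exact
# sqlflow_train_loop signature are illustrative assumptions (see the ARIMA
# model linked in keras_compile for a real example):
class SketchCustomLoopModel(tf.keras.Model):
    def __init__(self):
        super(SketchCustomLoopModel, self).__init__()
        self.dense = tf.keras.layers.Dense(1)

    def call(self, inputs, training=False):
        return self.dense(inputs)

    def sqlflow_train_loop(self, dataset, epochs=1, verbose=0):
        # the model owns the whole training procedure; this is also why an
        # optimizer() hook returning None is accepted for such models.
        for _ in range(epochs):
            for _features, _label in dataset:
                pass  # a custom optimization step would go here

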
def keras_predict(estimator, model_params, save, result_table,
                  feature_column_names, feature_metas, train_label_name,
                  result_col_name, driver, conn, predict_generator,
                  selected_cols, hdfs_namenode_addr, hive_location, hdfs_user,
                  hdfs_pass):
    classifier = init_model_with_feature_column(estimator, model_params)

    def eval_input_fn(batch_size, cache=False):
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))
        tf_gen = tf_generator(predict_generator, selected_cols,
                              feature_column_names, feature_metas)
        dataset = tf.data.Dataset.from_generator(tf_gen,
                                                 (tuple(feature_types), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset

    if not hasattr(classifier, 'sqlflow_predict_one'):
        # NOTE: load_weights should be called by keras models only.
        # NOTE: always use batch_size=1 when predicting to get the pairs of
        # features and predict results to insert into result table.
        pred_dataset = eval_input_fn(1)
        one_batch = next(iter(pred_dataset))
        # NOTE: must run predict one batch to initialize parameters. See:
        # https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models # noqa: E501
        classifier.predict_on_batch(one_batch)
        classifier.load_weights(save)
    pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()

    column_names = selected_cols[:]
    try:
        train_label_index = selected_cols.index(train_label_name)
    except:  # noqa: E722
        train_label_index = -1
    if train_label_index != -1:
        del column_names[train_label_index]
    column_names.append(result_col_name)

    with db.buffered_db_writer(driver, conn, result_table, column_names, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for features in pred_dataset:
            if hasattr(classifier, 'sqlflow_predict_one'):
                result = classifier.sqlflow_predict_one(features)
            else:
                result = classifier.predict_on_batch(features)
            # FIXME(typhoonzero): determine the predict result is
            # classification by adding the prediction result together
            # to see if it is close to 1.0.
            if len(result[0]) == 1:  # regression result
                result = result[0][0]
            else:
                sum = 0
                for i in result[0]:
                    sum += i
                if np.isclose(sum, 1.0):  # classification result
                    result = result[0].argmax(axis=-1)
                else:
                    result = result[0]  # multiple regression result
            row = []
            for idx, name in enumerate(feature_column_names):
                val = features[name].numpy()[0][0]
                row.append(str(val))
            if isinstance(result, np.ndarray):
                if len(result) > 1:
                    # NOTE(typhoonzero): if the output dimension > 1, format
                    # output tensor using a comma separated string. Only
                    # available for keras models.
                    row.append(",".join([str(i) for i in result]))
                else:
                    row.append(str(result[0]))
            else:
                row.append(str(result))
            w.write(row)
    del pred_dataset


def keras_compile(estimator, model_params, metric_names):
    # remove the optimizer and loss params from model_params and use them
    # when calling compile()
    optimizer = None
    loss = None
    if "optimizer" in model_params:
        optimizer = model_params["optimizer"]
        del model_params["optimizer"]

    if "loss" in model_params:
        loss = model_params["loss"]
        del model_params["loss"]

    classifier_pkg = sys.modules[estimator.__module__]
    model_metrics = []
    if hasattr(classifier_pkg, "eval_metrics_fn"):
        metrics_functions = classifier_pkg.eval_metrics_fn()
        for key, func in metrics_functions.items():
            func.__name__ = key
            model_metrics.append(func)
    # use WITH specified metrics if it's not default.
    if metric_names != ["Accuracy"]:
        keras_metrics = metrics.get_keras_metrics(metric_names)
    else:
        if len(model_metrics) > 0:
            keras_metrics = model_metrics
        else:
            keras_metrics = metrics.get_keras_metrics(["Accuracy"])

    # setting optimizer
    has_none_optimizer = False
    if optimizer is None:
        # use keras model default optimizer if optimizer is not specified in
        # WITH clause.
        members = inspect.getmembers(classifier_pkg)
        # default optimizer
        optimizer = tf.keras.optimizers.Adagrad(lr=0.001)
        for m, func in members:
            if m == "optimizer":
                optimizer = classifier_pkg.optimizer()
                if optimizer is None:
                    has_none_optimizer = True
                    warnings.warn('optimizer() returns None')

    if loss is None:
        members = inspect.getmembers(classifier_pkg)
        # FIXME(typhoonzero): default loss may cause error if model's output
        # shape does not fit.
        loss = "sparse_categorical_crossentropy"
        for m, func in members:
            if m == "loss":
                loss = classifier_pkg.loss

    classifier = init_model_with_feature_column(
        estimator, model_params, has_none_optimizer=has_none_optimizer)

    # FIXME(sneaxiy): some models defined by other frameworks (not TensorFlow
    # or XGBoost) may return a None optimizer.
    # For example:
    # https://github.com/sql-machine-learning/models/blob/ce970d14a524e20de10a645c99b6bf8724be17d9/sqlflow_models/arima_with_stl_decomposition.py#L123 # noqa: E501
    if has_none_optimizer:
        assert hasattr(
            classifier,
            "sqlflow_train_loop"), "optimizer() should not return None"
    else:
        classifier.compile(optimizer=optimizer,
                           loss=loss,
                           metrics=keras_metrics)

    return classifier, has_none_optimizer


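# keras_compile probes the model's defining module (via inspect.getmembers)
# for optional hooks named `optimizer`, `loss` and `eval_metrics_fn`. The
# sketch below shows, under hypothetical names, what such hooks are assumed
# to look like; in a real model package they would be module-level functions
# with the exact names keras_compile checks for.
def sketch_optimizer():
    # module-level optimizer(): returns the optimizer instance to compile with
    return tf.keras.optimizers.Adam(learning_rate=0.001)


def sketch_loss(labels, output):
    # module-level loss: keras_compile reads the attribute as-is and passes it
    # to classifier.compile(loss=...)
    return tf.keras.losses.sparse_categorical_crossentropy(labels, output)


def sketch_eval_metrics_fn():
    # module-level eval_metrics_fn(): returns {metric_name: metric_fn};
    # keras_compile renames each function to its key and uses the list as the
    # default compile metrics when no WITH metrics are given.
    return {
        "accuracy":
        lambda labels, pred: tf.keras.metrics.sparse_categorical_accuracy(
            labels, pred)
    }

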
def keras_predict(estimator, model_params, save, result_table,
                  feature_column_names, feature_metas, train_label_name,
                  result_col_name, conn, predict_generator, selected_cols,
                  extra_result_cols):
    pop_optimizer_and_loss(model_params)
    classifier = init_model_with_feature_column(estimator, model_params)

    def eval_input_fn(batch_size, cache=False):
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))
        tf_gen = tf_generator(predict_generator, selected_cols,
                              feature_column_names, feature_metas)
        dataset = tf.data.Dataset.from_generator(tf_gen,
                                                 (tuple(feature_types), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset

    def to_feature_sample(row, selected_cols):
        features = {}
        for name in feature_column_names:
            row_val = row[selected_cols.index(name)]
            if feature_metas[name].get("delimiter_kv", "") != "":
                # kv list that should be parsed to two features.
                if feature_metas[name]["is_sparse"]:
                    features[name] = tf.SparseTensor(
                        row_val[0],
                        tf.ones_like(tf.reshape(row_val[0], [-1])),
                        row_val[2])
                    features["_".join([name,
                                       "weight"])] = tf.SparseTensor(*row_val)
                else:
                    raise ValueError(
                        "not supported DENSE column with key:value "
                        "list format.")
            else:
                if feature_metas[name]["is_sparse"]:
                    features[name] = tf.SparseTensor(*row_val)
                else:
                    features[name] = tf.constant(([row_val], ))
        return features

    if not hasattr(classifier, 'sqlflow_predict_one'):
        # NOTE: load_weights should be called by keras models only.
        # NOTE: always use batch_size=1 when predicting to get the pairs of
        # features and predict results to insert into result table.
        pred_dataset = eval_input_fn(1)
        one_batch = next(iter(pred_dataset))
        # NOTE: must run predict one batch to initialize parameters. See:
        # https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models # noqa: E501
        classifier.predict_on_batch(one_batch)
        load_keras_model_weights(classifier, save)
    pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()

    column_names = selected_cols[:]
    try:
        train_label_index = selected_cols.index(train_label_name)
    except:  # noqa: E722
        train_label_index = -1
    if train_label_index != -1:
        del column_names[train_label_index]
    column_names.append(result_col_name)
    column_names.extend(extra_result_cols)

    with db.buffered_db_writer(conn, result_table, column_names, 100) as w:
        for row, _ in predict_generator():
            features = to_feature_sample(row, column_names)
            if hasattr(classifier, 'sqlflow_predict_one'):
                result = classifier.sqlflow_predict_one(features)
            else:
                result = classifier.predict_on_batch(features)
            if extra_result_cols:
                assert isinstance(result, tuple), \
                    "TO PREDICT must return a tuple when " \
                    "predict.extra_outputs is not empty"
                assert len(extra_result_cols) + 1 <= len(result), \
                    "TO PREDICT must return at least %d items instead of " \
                    "%d" % (len(extra_result_cols) + 1, len(result))
                extra_pred_outputs = result[1:len(extra_result_cols) + 1]
                result = result[0:1]
            else:
                extra_pred_outputs = None

            # FIXME(typhoonzero): determine the predict result is
            # classification by adding the prediction result together
            # to see if it is close to 1.0.
            if len(result[0]) == 1:  # regression result
                result = result[0][0]
            else:
                sum = 0
                for i in result[0]:
                    sum += i
                if np.isclose(sum, 1.0):  # classification result
                    result = result[0].argmax(axis=-1)
                else:
                    result = result[0]  # multiple regression result

            row.append(encode_pred_result(result))
            if extra_pred_outputs is not None:
                row.extend(
                    [encode_pred_result(p) for p in extra_pred_outputs])
            if train_label_index != -1 and len(row) > train_label_index:
                del row[train_label_index]
            w.write(row)
    del pred_dataset


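# `encode_pred_result` above is defined elsewhere in the runtime. Based on
# the inline formatting in the legacy keras_predict (comma-joining
# multi-dimensional outputs, plain str() otherwise), it is assumed to behave
# roughly like this hypothetical sketch:
def sketch_encode_pred_result(result):
    if isinstance(result, np.ndarray):
        if len(result) > 1:
            # output dimension > 1: format as a comma separated string
            return ",".join([str(i) for i in result])
        return str(result[0])
    return str(result)

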
def explain(datasource,
            estimator_string,
            select,
            feature_columns,
            feature_column_names,
            feature_metas={},
            label_meta={},
            model_params={},
            save="",
            pai_table="",
            plot_type='bar',
            result_table="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None):
    estimator_cls = import_model(estimator_string)
    is_pai = True if pai_table else False
    if is_pai:
        FLAGS = tf.app.flags.FLAGS
        model_params["model_dir"] = FLAGS.checkpointDir
        select = ""
    else:
        if is_tf_estimator(estimator_cls):
            model_params['model_dir'] = save
    model_params.update(feature_columns)
    pop_optimizer_and_loss(model_params)

    def _input_fn():
        dataset = input_fn(select,
                           datasource,
                           feature_column_names,
                           feature_metas,
                           label_meta,
                           is_pai=is_pai,
                           pai_table=pai_table)
        return dataset.batch(1).cache()

    estimator = init_model_with_feature_column(estimator_cls, model_params)
    if not is_tf_estimator(estimator_cls):
        load_keras_model_weights(estimator, save)

    if is_pai:
        conn = PaiIOConnection.from_table(
            result_table) if result_table else None
    else:
        conn = connect_with_data_source(datasource)

    if estimator_cls in (tf.estimator.BoostedTreesClassifier,
                         tf.estimator.BoostedTreesRegressor):
        explain_boosted_trees(datasource, estimator, _input_fn, plot_type,
                              result_table, feature_column_names, conn,
                              oss_dest, oss_ak, oss_sk, oss_endpoint,
                              oss_bucket_name)
    else:
        shap_dataset = pd.DataFrame(columns=feature_column_names)
        for i, (features, label) in enumerate(_input_fn()):
            shap_dataset.loc[i] = [
                item.numpy()[0][0] for item in features.values()
            ]
        explain_dnns(datasource, estimator, shap_dataset, plot_type,
                     result_table, feature_column_names, conn, oss_dest,
                     oss_ak, oss_sk, oss_endpoint, oss_bucket_name)

    if conn is not None:
        conn.close()