def evaluate(datasource,
             estimator_string,
             select,
             result_table,
             feature_columns,
             feature_column_names,
             feature_metas=None,
             label_meta=None,
             model_params=None,
             validation_metrics=None,
             save="",
             batch_size=1,
             validation_steps=None,
             verbose=0,
             pai_table=""):
    """Evaluate a trained model (PAI-aware) and optionally write metrics.

    Args:
        datasource: data source connection string.
        estimator_string: import path of the model class.
        select: SQL statement selecting the evaluation data.
        result_table: table to write metrics into; "" skips writing.
        feature_columns: dict of feature column objects, merged into
            model_params before constructing the model.
        feature_column_names: ordered list of feature column names.
        feature_metas: per-column metadata dict (default {}).
        label_meta: label metadata dict (default {}).
        model_params: keyword arguments for the model constructor (default {}).
        validation_metrics: metric names to compute (default ["Accuracy"]).
        save: directory holding saved Keras model weights.
        batch_size: evaluation batch size.
        validation_steps: unused here; kept for interface compatibility.
        verbose: log verbosity level.
        pai_table: PAI input table name; non-empty means running on PAI.
    """
    # Replace mutable defaults: the original `={}` / `=["Accuracy"]` defaults
    # were shared across calls and mutated below (update/pop).
    feature_metas = {} if feature_metas is None else feature_metas
    label_meta = {} if label_meta is None else label_meta
    model_params = {} if model_params is None else model_params
    if validation_metrics is None:
        validation_metrics = ["Accuracy"]

    FLAGS = define_tf_flags()
    set_oss_environs(FLAGS)

    estimator_cls = import_model(estimator_string)
    is_estimator = is_tf_estimator(estimator_cls)
    set_log_level(verbose, is_estimator)

    is_pai = bool(pai_table)
    eval_dataset = get_dataset_fn(select,
                                  datasource,
                                  feature_column_names,
                                  feature_metas,
                                  label_meta,
                                  is_pai=is_pai,
                                  pai_table=pai_table,
                                  batch_size=batch_size)

    model_params.update(feature_columns)
    pop_optimizer_and_loss(model_params)

    if is_estimator:
        # The exported checkpoint path was written to this file by the
        # training step; warm-start the estimator from it.
        with open("exported_path", "r") as fid:
            exported_path = str(fid.read())

        model_params["warm_start_from"] = exported_path
        estimator = estimator_cls(**model_params)
        result_metrics = estimator_evaluate(estimator, eval_dataset,
                                            validation_metrics)
    else:
        keras_model = init_model_with_feature_column(estimator_cls,
                                                     model_params)
        keras_model_pkg = sys.modules[estimator_cls.__module__]
        result_metrics = keras_evaluate(keras_model, eval_dataset, save,
                                        keras_model_pkg, validation_metrics)

    if result_table:
        metric_name_list = ["loss"] + validation_metrics
        if is_pai:
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)
        write_result_metrics(result_metrics, metric_name_list, result_table,
                             conn)
        conn.close()
def pred(datasource,
         estimator_string,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         feature_column_names_map,
         train_label_name,
         result_col_name,
         feature_metas=None,
         model_params=None,
         pred_params=None,
         save="",
         batch_size=1,
         pai_table=""):
    """Run prediction with a trained model and write rows to result_table.

    Dispatches to `keras_predict` or `estimator_predict` depending on the
    model class. Reads input either from a PAI table (when `pai_table` is
    non-empty) or from the `select` statement against `datasource`.

    Args:
        datasource: data source connection string.
        estimator_string: import path of the model class.
        select: SQL statement selecting the prediction input.
        result_table: table to write predictions into.
        feature_columns: dict of feature column objects, merged into
            model_params.
        feature_column_names: ordered list of feature column names.
        feature_column_names_map: unused here; kept for interface
            compatibility.
        train_label_name: label column name used during training.
        result_col_name: name of the prediction output column.
        feature_metas: per-column metadata dict (default {}).
        model_params: keyword arguments for the model constructor (default {}).
        pred_params: prediction attributes; `extra_outputs` is a
            comma-separated list of extra result columns.
        save: directory of the saved model.
        batch_size: unused here; kept for interface compatibility.
        pai_table: PAI input table name; non-empty means running on PAI.
    """
    # Replace mutable defaults mutated below (update/pop). Note: defaulting
    # pred_params to None is behavior-identical to the old `={}` default,
    # since {}.get("extra_outputs", "") also produces an empty column list.
    feature_metas = {} if feature_metas is None else feature_metas
    model_params = {} if model_params is None else model_params

    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)

    if pai_table != "":
        conn = PaiIOConnection.from_table(pai_table)
        selected_cols = db.selected_cols(conn, None)
        predict_generator = db.db_generator(conn, None)
    else:
        conn = db.connect_with_data_source(datasource)
        selected_cols = db.selected_cols(conn, select)
        predict_generator = db.db_generator(conn, select)

    pop_optimizer_and_loss(model_params)

    if pred_params is None:
        extra_result_cols = []
    else:
        extra_result_cols = pred_params.get("extra_outputs", "")
        extra_result_cols = [
            c.strip() for c in extra_result_cols.split(",") if c.strip()
        ]

    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional model need field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      result_col_name, conn, predict_generator, selected_cols,
                      extra_result_cols)
    else:
        # TODO(sneaxiy): support extra_result_cols for estimator
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(result_table, feature_column_names, feature_metas,
                          train_label_name, result_col_name, conn,
                          predict_generator, selected_cols)

    print("Done predicting. Predict table : %s" % result_table)
    # The original leaked this connection; sibling functions close theirs.
    conn.close()
def _explain(datasource,
             estimator_string,
             select,
             feature_columns,
             feature_column_names,
             feature_metas=None,
             label_meta=None,
             model_params=None,
             save="",
             pai_table="",
             plot_type='bar',
             result_table="",
             oss_dest=None,
             oss_ak=None,
             oss_sk=None,
             oss_endpoint=None,
             oss_bucket_name=None):
    """Explain a trained model on PAI, writing plots/values to OSS or a table.

    Uses the built-in feature-importance path for BoostedTrees estimators and
    SHAP for other models.

    Args:
        datasource: data source connection string.
        estimator_string: import path of the model class.
        select: unused here (input always comes from `pai_table`); kept for
            interface compatibility.
        feature_columns: dict of feature column objects, merged into
            model_params.
        feature_column_names: ordered list of feature column names.
        feature_metas: per-column metadata dict (default {}).
        label_meta: label metadata dict (default {}).
        model_params: keyword arguments for the model constructor (default {}).
        save: unused here; kept for interface compatibility.
        pai_table: PAI input table holding the data to explain.
        plot_type: SHAP plot type, e.g. 'bar'.
        result_table: table to write explain values into ("" to skip).
        oss_dest/oss_ak/oss_sk/oss_endpoint/oss_bucket_name: OSS upload
            destination and credentials for the generated plot.
    """
    # Replace mutable defaults mutated below (update/pop).
    feature_metas = {} if feature_metas is None else feature_metas
    label_meta = {} if label_meta is None else label_meta
    model_params = {} if model_params is None else model_params

    estimator_cls = import_model(estimator_string)
    FLAGS = tf.app.flags.FLAGS
    model_params["model_dir"] = FLAGS.checkpointDir
    model_params.update(feature_columns)
    pop_optimizer_and_loss(model_params)

    def _input_fn():
        dataset = input_fn("",
                           datasource,
                           feature_column_names,
                           feature_metas,
                           label_meta,
                           is_pai=True,
                           pai_table=pai_table)
        return dataset.batch(1).cache()

    estimator = init_model_with_feature_column(estimator_cls, model_params)
    conn = PaiIOConnection.from_table(result_table) if result_table else None
    if estimator_cls in (tf.estimator.BoostedTreesClassifier,
                         tf.estimator.BoostedTreesRegressor):
        explain_boosted_trees(datasource, estimator, _input_fn, plot_type,
                              result_table, feature_column_names, conn,
                              oss_dest, oss_ak, oss_sk, oss_endpoint,
                              oss_bucket_name)
    else:
        shap_dataset = pd.DataFrame(columns=feature_column_names)
        for i, (features, label) in enumerate(_input_fn()):
            shap_dataset.loc[i] = [
                item.numpy()[0][0] for item in features.values()
            ]
        explain_dnns(datasource, estimator, shap_dataset, plot_type,
                     result_table, feature_column_names, conn, oss_dest,
                     oss_ak, oss_sk, oss_endpoint, oss_bucket_name)
    # The original leaked this connection when result_table was set.
    if conn is not None:
        conn.close()
def explain(datasource,
            estimator_string,
            select,
            feature_columns,
            feature_column_names,
            feature_metas=None,
            label_meta=None,
            model_params=None,
            save="",
            pai_table="",
            plot_type='bar',
            result_table="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None):
    """Explain a trained model, writing plots/values to OSS or a table.

    Uses the built-in feature-importance path for BoostedTrees estimators and
    SHAP for other models. Input rows come from `select` on `datasource`.

    Args:
        datasource: data source connection string.
        estimator_string: import path of the model class.
        select: SQL statement selecting the data to explain.
        feature_columns: dict of feature column objects, merged into
            model_params.
        feature_column_names: ordered list of feature column names.
        feature_metas: per-column metadata dict (default {}).
        label_meta: label metadata dict (default {}).
        model_params: keyword arguments for the model constructor (default {}).
        save: saved model directory (used as model_dir for estimators).
        pai_table: unused here; kept for interface compatibility.
        plot_type: SHAP plot type, e.g. 'bar'.
        result_table: table to write explain values into ("" to skip).
        oss_dest/oss_ak/oss_sk/oss_endpoint/oss_bucket_name: OSS upload
            destination and credentials for the generated plot.
    """
    # Replace mutable defaults mutated below (update/pop).
    feature_metas = {} if feature_metas is None else feature_metas
    label_meta = {} if label_meta is None else label_meta
    model_params = {} if model_params is None else model_params

    estimator_cls = import_model(estimator_string)
    if is_tf_estimator(estimator_cls):
        model_params['model_dir'] = save
    model_params.update(feature_columns)
    pop_optimizer_and_loss(model_params)

    def _input_fn():
        dataset = input_fn(select, datasource, feature_column_names,
                           feature_metas, label_meta)
        return dataset.batch(1).cache()

    estimator = init_model_with_feature_column(estimator_cls, model_params)
    conn = connect_with_data_source(datasource)
    if estimator_cls in (tf.estimator.BoostedTreesClassifier,
                         tf.estimator.BoostedTreesRegressor):
        explain_boosted_trees(datasource, estimator, _input_fn, plot_type,
                              result_table, feature_column_names, conn,
                              oss_dest, oss_ak, oss_sk, oss_endpoint,
                              oss_bucket_name)
    else:
        shap_dataset = pd.DataFrame(columns=feature_column_names)
        for i, (features, label) in enumerate(_input_fn()):
            shap_dataset.loc[i] = [
                item.numpy()[0][0] for item in features.values()
            ]
        explain_dnns(datasource, estimator, shap_dataset, plot_type,
                     result_table, feature_column_names, conn, oss_dest,
                     oss_ak, oss_sk, oss_endpoint, oss_bucket_name)
    conn.close()
def evaluate(datasource,
             estimator_string,
             select,
             result_table,
             feature_columns,
             feature_column_names,
             feature_metas=None,
             label_meta=None,
             model_params=None,
             validation_metrics=None,
             save="",
             batch_size=1,
             validation_steps=None,
             verbose=0):
    """Evaluate a trained model and optionally write metrics to a table.

    Non-PAI variant: input always comes from `select` against `datasource`.

    Args:
        datasource: data source connection string.
        estimator_string: import path of the model class.
        select: SQL statement selecting the evaluation data.
        result_table: table to write metrics into; "" skips writing.
        feature_columns: dict of feature column objects, merged into
            model_params.
        feature_column_names: ordered list of feature column names.
        feature_metas: per-column metadata dict (default {}).
        label_meta: label metadata dict (default {}).
        model_params: keyword arguments for the model constructor (default {}).
        validation_metrics: metric names to compute (default ["Accuracy"]).
        save: saved model directory (model_dir for estimators, weights dir
            for Keras models).
        batch_size: evaluation batch size.
        validation_steps: unused here; kept for interface compatibility.
        verbose: log verbosity level.
    """
    # Replace mutable defaults mutated below (update/pop).
    feature_metas = {} if feature_metas is None else feature_metas
    label_meta = {} if label_meta is None else label_meta
    model_params = {} if model_params is None else model_params
    if validation_metrics is None:
        validation_metrics = ["Accuracy"]

    estimator_cls = import_model(estimator_string)
    is_estimator = is_tf_estimator(estimator_cls)
    set_log_level(verbose, is_estimator)

    eval_dataset = get_dataset_fn(select,
                                  datasource,
                                  feature_column_names,
                                  feature_metas,
                                  label_meta,
                                  is_pai=False,
                                  pai_table="",
                                  batch_size=batch_size)

    model_params.update(feature_columns)
    pop_optimizer_and_loss(model_params)

    if is_estimator:
        model_params["model_dir"] = save
        estimator = estimator_cls(**model_params)
        result_metrics = estimator_evaluate(estimator, eval_dataset,
                                            validation_metrics)
    else:
        keras_model = init_model_with_feature_column(estimator_cls,
                                                     model_params)
        keras_model_pkg = sys.modules[estimator_cls.__module__]
        result_metrics = keras_evaluate(keras_model, eval_dataset, save,
                                        keras_model_pkg, validation_metrics)

    # Write result metrics to a table. Only open the connection when there
    # is actually a table to write (the original connected unconditionally).
    if result_table:
        metric_name_list = ["loss"] + validation_metrics
        conn = connect_with_data_source(datasource)
        write_result_metrics(result_metrics, metric_name_list, result_table,
                             conn)
        conn.close()
def _evaluate(datasource,
              estimator_string,
              select,
              result_table,
              feature_columns,
              feature_column_names,
              feature_metas=None,
              label_meta=None,
              model_params=None,
              validation_metrics=None,
              save="",
              batch_size=1,
              validation_steps=None,
              verbose=0,
              pai_table=""):
    """Evaluate a trained model on PAI and write metrics to a PAI table.

    Args:
        datasource: data source connection string.
        estimator_string: import path of the model class.
        select: SQL statement selecting the evaluation data.
        result_table: PAI table to write metrics into; "" skips writing.
        feature_columns: dict of feature column objects, merged into
            model_params.
        feature_column_names: ordered list of feature column names.
        feature_metas: per-column metadata dict (default {}).
        label_meta: label metadata dict (default {}).
        model_params: keyword arguments for the model constructor (default {}).
        validation_metrics: metric names to compute (default ["Accuracy"]).
        save: directory holding saved Keras model weights.
        batch_size: evaluation batch size.
        validation_steps: unused here; kept for interface compatibility.
        verbose: log verbosity level.
        pai_table: PAI input table holding the evaluation data.
    """
    # Replace mutable defaults mutated below (update/pop).
    feature_metas = {} if feature_metas is None else feature_metas
    label_meta = {} if label_meta is None else label_meta
    model_params = {} if model_params is None else model_params
    if validation_metrics is None:
        validation_metrics = ["Accuracy"]

    estimator_cls = import_model(estimator_string)
    is_estimator = is_tf_estimator(estimator_cls)
    set_log_level(verbose, is_estimator)

    eval_dataset = get_dataset_fn(select,
                                  datasource,
                                  feature_column_names,
                                  feature_metas,
                                  label_meta,
                                  is_pai=True,
                                  pai_table=pai_table,
                                  batch_size=batch_size)

    model_params.update(feature_columns)
    pop_optimizer_and_loss(model_params)

    if is_estimator:
        FLAGS = tf.app.flags.FLAGS
        model_params["model_dir"] = FLAGS.checkpointDir
        estimator = estimator_cls(**model_params)
        result_metrics = estimator_evaluate(estimator, eval_dataset,
                                            validation_metrics)
    else:
        # BUGFIX: the original passed `estimator`, which is unbound on this
        # branch (NameError) — the model class is `estimator_cls`.
        keras_model = init_model_with_feature_column(estimator_cls,
                                                     model_params)
        keras_model_pkg = sys.modules[estimator_cls.__module__]
        result_metrics = keras_evaluate(keras_model, eval_dataset, save,
                                        keras_model_pkg, validation_metrics)

    if result_table:
        metric_name_list = ["loss"] + validation_metrics
        write_result_metrics(result_metrics, metric_name_list, result_table,
                             PaiIOConnection.from_table(result_table))
def _predict(datasource,
             estimator_string,
             select,
             result_table,
             feature_columns,
             feature_column_names,
             feature_column_names_map,
             train_label_name,
             result_col_name,
             feature_metas=None,
             model_params=None,
             save="",
             batch_size=1,
             pai_table=""):
    """Run prediction on PAI and write rows to result_table.

    Dispatches to `keras_predict` or `estimator_predict` depending on the
    model class. Input always comes from `pai_table`.

    Args:
        datasource: data source connection string (unused here; rows come
            from pai_table).
        estimator_string: import path of the model class.
        select: unused here; kept for interface compatibility.
        result_table: table to write predictions into.
        feature_columns: dict of feature column objects, merged into
            model_params.
        feature_column_names: ordered list of feature column names.
        feature_column_names_map: unused here; kept for interface
            compatibility.
        train_label_name: label column name used during training.
        result_col_name: name of the prediction output column.
        feature_metas: per-column metadata dict (default {}).
        model_params: keyword arguments for the model constructor (default {}).
        save: directory of the saved model.
        batch_size: unused here; kept for interface compatibility.
        pai_table: PAI input table holding the prediction input.
    """
    # Replace mutable defaults mutated below (update/pop).
    feature_metas = {} if feature_metas is None else feature_metas
    model_params = {} if model_params is None else model_params

    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)

    conn = PaiIOConnection.from_table(pai_table)
    selected_cols = db.selected_cols(conn, None)
    predict_generator = db.db_generator(conn, None)

    pop_optimizer_and_loss(model_params)

    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional model need field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      result_col_name, conn, predict_generator, selected_cols)
    else:
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(result_table, feature_column_names, feature_metas,
                          train_label_name, result_col_name, conn,
                          predict_generator, selected_cols)

    print("Done predicting. Predict table : %s" % result_table)
def keras_predict(estimator, model_params, save, result_table,
                  feature_column_names, feature_metas, train_label_name,
                  result_col_name, conn, predict_generator, selected_cols,
                  extra_result_cols):
    """Predict with a Keras (or custom) model and write rows to result_table.

    Supports models that define `sqlflow_predict_one` (custom models) as well
    as plain Keras models (weights loaded from `save`). When
    `extra_result_cols` is non-empty the model must return a tuple whose
    first item is the prediction and the rest are the extra outputs.

    Args:
        estimator: the model class to instantiate.
        model_params: constructor keyword arguments (optimizer/loss popped).
        save: directory holding saved Keras weights.
        result_table: table to write predictions into.
        feature_column_names: ordered list of feature column names.
        feature_metas: per-column metadata dict.
        train_label_name: label column name used during training; removed
            from the output columns if present.
        result_col_name: name of the prediction output column.
        conn: open DB connection used for writing.
        predict_generator: zero-arg callable yielding (row, label) pairs.
        selected_cols: column names of the rows the generator yields.
        extra_result_cols: names of extra model outputs to write.
    """
    pop_optimizer_and_loss(model_params)
    classifier = init_model_with_feature_column(estimator, model_params)

    def eval_input_fn(batch_size, cache=False):
        # Build a tf.data pipeline over the DB generator.
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))
        tf_gen = tf_generator(predict_generator, selected_cols,
                              feature_column_names, feature_metas)
        dataset = tf.data.Dataset.from_generator(tf_gen,
                                                 (tuple(feature_types), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset

    def to_feature_sample(row, selected_cols):
        # Convert one raw DB row into the feature dict the model expects.
        features = {}
        for name in feature_column_names:
            row_val = row[selected_cols.index(name)]
            if feature_metas[name].get("delimiter_kv", "") != "":
                # kv list that should be parsed to two features.
                if feature_metas[name]["is_sparse"]:
                    features[name] = tf.SparseTensor(
                        row_val[0],
                        tf.ones_like(tf.reshape(row_val[0], [-1])),
                        row_val[2])
                    features["_".join([name, "weight"])] = tf.SparseTensor(
                        *row_val)
                else:
                    # BUGFIX: the original message was missing a space
                    # between the concatenated string parts.
                    raise ValueError("not supported DENSE column with "
                                     "key:value list format.")
            else:
                if feature_metas[name]["is_sparse"]:
                    features[name] = tf.SparseTensor(*row_val)
                else:
                    features[name] = tf.constant(([row_val], ))
        return features

    if not hasattr(classifier, 'sqlflow_predict_one'):
        # NOTE: load_weights should be called by keras models only.
        # NOTE: always use batch_size=1 when predicting to get the pairs of
        # features and predict results to insert into result table.
        pred_dataset = eval_input_fn(1)
        one_batch = next(iter(pred_dataset))
        # NOTE: must run predict one batch to initialize parameters. See:
        # https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models # noqa: E501
        classifier.predict_on_batch(one_batch)
        load_keras_model_weights(classifier, save)

    # NOTE(review): this iterator is never consumed below (rows are read from
    # predict_generator directly); kept for compatibility — confirm whether
    # it can be dropped.
    pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()

    column_names = selected_cols[:]
    try:
        train_label_index = selected_cols.index(train_label_name)
    except ValueError:
        # Training label is not among the selected columns.
        train_label_index = -1
    if train_label_index != -1:
        del column_names[train_label_index]
    column_names.append(result_col_name)
    column_names.extend(extra_result_cols)

    with db.buffered_db_writer(conn, result_table, column_names, 100) as w:
        for row, _ in predict_generator():
            # NOTE(review): `column_names` (label removed, result columns
            # appended) is used for indexing the raw row here — verify this
            # layout matches what predict_generator yields.
            features = to_feature_sample(row, column_names)
            if hasattr(classifier, 'sqlflow_predict_one'):
                result = classifier.sqlflow_predict_one(features)
            else:
                result = classifier.predict_on_batch(features)

            if extra_result_cols:
                assert isinstance(
                    result, tuple
                ), "TO PREDICT must return a " \
                   "tuple when predict.extra_outputs is not empty"
                assert len(extra_result_cols) + 1 <= len(
                    result
                ), "TO PREDICT must return at least " \
                   "%d items instead of %d" % (len(extra_result_cols) + 1,
                                               len(result))
                extra_pred_outputs = result[1:len(extra_result_cols) + 1]
                result = result[0:1]
            else:
                extra_pred_outputs = None

            # FIXME(typhoonzero): determine the predict result is
            # classification by adding the prediction result together
            # to see if it is close to 1.0.
            if len(result[0]) == 1:  # regression result
                result = result[0][0]
            else:
                # `total` instead of shadowing the builtin `sum`.
                total = 0
                for i in result[0]:
                    total += i
                if np.isclose(total, 1.0):  # classification result
                    result = result[0].argmax(axis=-1)
                else:
                    result = result[0]  # multiple regression result

            row.append(encode_pred_result(result))
            if extra_pred_outputs is not None:
                row.extend(
                    [encode_pred_result(p) for p in extra_pred_outputs])
            if train_label_index != -1 and len(row) > train_label_index:
                del row[train_label_index]
            w.write(row)
    del pred_dataset
def predict_step(datasource,
                 select,
                 result_table,
                 label_name,
                 model,
                 pai_table=None):
    """Run prediction for a saved model and write rows into result_table.

    Loads the model metadata (features, label, attributes, class name),
    creates the result table, then dispatches to `keras_predict` or
    `estimator_predict`.

    Args:
        datasource: data source connection string.
        select: SQL statement selecting the prediction input; overridden
            when running on PAI.
        result_table: table to write predictions into.
        label_name: name of the prediction output column.
        model: a model name (str) to load from the DB, or a loaded Model.
        pai_table: PAI input table name; non-None means running on PAI.
    """
    # Accept either a stored model name or an in-memory Model object.
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    label_meta = model.get_meta("label")
    # The trained label may be absent (e.g. clustering models).
    train_label_desc = label_meta.get_field_desc()[0] if label_meta else None
    train_label_name = train_label_desc.name if train_label_desc else None
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    is_pai = True if pai_table else False
    if is_pai:
        select = "SELECT * FROM %s" % pai_table

    # The regular DB connection is always needed first to create the result
    # table; on PAI it is then replaced by a PaiIOConnection for reading.
    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, label_name)

    if is_pai:
        conn.close()
        conn = PaiIOConnection.from_table(pai_table)
        # PaiIOConnection reads the whole table; no SELECT statement needed.
        select = None

    selected_cols = result_column_names[0:-1]
    if train_label_idx >= 0:
        # Re-insert the training label at its original position so the
        # generator's row layout matches the training-time column order.
        selected_cols = selected_cols[0:train_label_idx] + [
            train_label_name
        ] + selected_cols[train_label_idx:]

    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)
    predict_generator = db.db_generator(conn, select)

    pop_optimizer_and_loss(model_params)

    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional model need field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      label_name, conn, predict_generator, selected_cols)
    else:
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(result_table, feature_column_names, feature_metas,
                          train_label_name, label_name, conn,
                          predict_generator, selected_cols)

    print("Done predicting. Predict table : %s" % result_table)
    conn.close()
def keras_predict(estimator, model_params, save, result_table,
                  feature_column_names, feature_metas, train_label_name,
                  result_col_name, conn, predict_generator, selected_cols):
    """Predict with a Keras (or custom) model and write rows to result_table.

    Supports models that define `sqlflow_predict_one` (custom models) as well
    as plain Keras models (weights loaded from `save`). Rows are read via a
    tf.data pipeline built over `predict_generator`.

    Args:
        estimator: the model class to instantiate.
        model_params: constructor keyword arguments (optimizer/loss popped).
        save: directory holding saved Keras weights.
        result_table: table to write predictions into.
        feature_column_names: ordered list of feature column names.
        feature_metas: per-column metadata dict.
        train_label_name: label column name used during training; removed
            from the output columns if present.
        result_col_name: name of the prediction output column.
        conn: open DB connection used for writing.
        predict_generator: zero-arg callable yielding (row, label) pairs.
        selected_cols: column names of the rows the generator yields.
    """
    pop_optimizer_and_loss(model_params)
    classifier = init_model_with_feature_column(estimator, model_params)

    def eval_input_fn(batch_size, cache=False):
        # Build a tf.data pipeline over the DB generator.
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))
        tf_gen = tf_generator(predict_generator, selected_cols,
                              feature_column_names, feature_metas)
        dataset = tf.data.Dataset.from_generator(tf_gen,
                                                 (tuple(feature_types), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset

    if not hasattr(classifier, 'sqlflow_predict_one'):
        # NOTE: load_weights should be called by keras models only.
        # NOTE: always use batch_size=1 when predicting to get the pairs of
        # features and predict results to insert into result table.
        pred_dataset = eval_input_fn(1)
        one_batch = next(iter(pred_dataset))
        # NOTE: must run predict one batch to initialize parameters. See:
        # https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models # noqa: E501
        classifier.predict_on_batch(one_batch)
        load_keras_model_weights(classifier, save)

    pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()
    column_names = selected_cols[:]
    try:
        train_label_index = selected_cols.index(train_label_name)
    except ValueError:
        # Training label is not among the selected columns.
        train_label_index = -1
    if train_label_index != -1:
        del column_names[train_label_index]
    column_names.append(result_col_name)

    with db.buffered_db_writer(conn, result_table, column_names, 100) as w:
        for features in pred_dataset:
            if hasattr(classifier, 'sqlflow_predict_one'):
                result = classifier.sqlflow_predict_one(features)
            else:
                result = classifier.predict_on_batch(features)
            # FIXME(typhoonzero): determine the predict result is
            # classification by adding the prediction result together
            # to see if it is close to 1.0.
            if len(result[0]) == 1:  # regression result
                result = result[0][0]
            else:
                # `total` instead of shadowing the builtin `sum`.
                total = 0
                for i in result[0]:
                    total += i
                if np.isclose(total, 1.0):  # classification result
                    result = result[0].argmax(axis=-1)
                else:
                    result = result[0]  # multiple regression result

            row = []
            for name in feature_column_names:
                val = features[name].numpy()[0][0]
                row.append(str(val))
            if isinstance(result, np.ndarray):
                if len(result) > 1:
                    # NOTE(typhoonzero): if the output dimension > 1, format
                    # output tensor using a comma separated string. Only
                    # available for keras models.
                    row.append(",".join([str(i) for i in result]))
                else:
                    row.append(str(result[0]))
            else:
                row.append(str(result))
            w.write(row)
    del pred_dataset