def keras_evaluate(keras_model, eval_dataset_fn, save, keras_model_pkg,
                   validation_metrics):
    """Evaluate a trained Keras model and return its metric values.

    Args:
        keras_model: the Keras model instance to evaluate.
        eval_dataset_fn: zero-arg callable returning the evaluation dataset
            of (features, label) pairs.
        save: directory containing the trained model weights.
        keras_model_pkg: the model's Python package; may provide `loss` and
            an optional `eval_metrics_fn` returning {name: metric_fn}.
        validation_metrics: list of metric names requested by the user.

    Returns:
        dict mapping "loss" and each validation metric name to a float.
    """
    custom_metrics = []
    if hasattr(keras_model_pkg, "eval_metrics_fn"):
        for key, func in keras_model_pkg.eval_metrics_fn().items():
            func.__name__ = key
            custom_metrics.append(func)

    # Metrics explicitly requested in the WITH clause take precedence over
    # the model package's own eval_metrics_fn; otherwise fall back to it,
    # and finally to the default Accuracy metric.
    if validation_metrics != ["Accuracy"]:
        keras_metrics = metrics.get_keras_metrics(validation_metrics)
    elif custom_metrics:
        keras_metrics = custom_metrics
    else:
        keras_metrics = metrics.get_keras_metrics(["Accuracy"])

    use_custom_loop = hasattr(keras_model, 'sqlflow_evaluate_loop')
    if not use_custom_loop:
        # Compile with default arguments only; evaluation runs the forward
        # pass only, so optimizer settings are irrelevant.
        keras_model.compile(loss=keras_model_pkg.loss, metrics=keras_metrics)

    eval_dataset = eval_dataset_fn()
    features_only = eval_dataset.map(lambda sample, label: sample)

    if use_custom_loop:
        result = keras_model.sqlflow_evaluate_loop(eval_dataset,
                                                   validation_metrics)
    else:
        # Run one forward batch first so a subclassed model builds its
        # variables before the saved weights are loaded.
        keras_model.predict_on_batch(next(iter(features_only)))
        load_keras_model_weights(keras_model, save)
        result = keras_model.evaluate(eval_dataset)

    assert (len(result) == len(validation_metrics) + 1)
    return {
        name: float(value)
        for name, value in zip(["loss"] + validation_metrics, result)
    }
def keras_train_and_save_legacy(estimator, model_params, save, FLAGS,
                                train_dataset_fn, val_dataset_fn, label_meta,
                                epochs, verbose, metric_names,
                                validation_steps, load_pretrained_model,
                                model_meta, is_pai):
    """Train a Keras model (legacy entry point) and save it.

    Args:
        estimator: the Keras model class or factory.
        model_params: dict of model constructor parameters.
        save: local directory to save the trained model into.
        FLAGS: TF flags; `worker_hosts` decides distributed training and
            `sqlflow_oss_modeldir` is the OSS destination on PAI.
        train_dataset_fn / val_dataset_fn: zero-arg callables returning the
            training / validation datasets (val may be None).
        label_meta: label metadata passed through to the training loop.
        epochs, verbose, metric_names, validation_steps: training options.
        load_pretrained_model: if True, restore weights from `save` before
            training (incremental training).
        model_meta: model metadata to save alongside the weights.
        is_pai: whether running on PAI; if so, upload the model to OSS.
    """
    print("Start training using keras model...")
    try:
        classifier, has_none_optimizer = keras_compile(
            estimator, model_params, metric_names)
    except Exception:
        if hasattr(estimator, "sqlflow_train_loop"):
            sys.stderr.write(
                "compile keras model failed, ignoring this error "
                "since the model seems to defined sqlflow_train_loop.")
            classifier = init_model_with_feature_column(
                estimator, model_params, has_none_optimizer=True)
            has_none_optimizer = True
        else:
            # Bare `raise` preserves the original traceback; `raise e`
            # (the previous code) would re-point it at this line. This also
            # matches the sibling keras_train_and_save implementation.
            raise

    train_dataset = train_dataset_fn()
    if val_dataset_fn is not None:
        validate_dataset = val_dataset_fn()
    else:
        validate_dataset = None

    if load_pretrained_model:
        # Must run one batch to initialize parameters before load_weights
        inputs, targets = next(iter(train_dataset.take(1)))
        classifier.evaluate(inputs, targets)
        # NOTE(sneaxiy): should we save/load optimizer info for incremental
        # training, or let users to write the same WITH statements in SQL?
        load_keras_model_weights(classifier, save)

    if len(FLAGS.worker_hosts.split(",")) > 1:
        keras_train_distributed(classifier, model_params, save, model_meta,
                                FLAGS, train_dataset_fn, val_dataset_fn,
                                is_pai)
    else:
        keras_train_compiled(classifier, save, train_dataset,
                             validate_dataset, label_meta, epochs, verbose,
                             model_meta, validation_steps,
                             has_none_optimizer)

    if is_pai:
        print("saving keras model to: %s" % FLAGS.sqlflow_oss_modeldir)
        oss.save_dir(FLAGS.sqlflow_oss_modeldir, save)
        oss.save_file(FLAGS.sqlflow_oss_modeldir, "model_meta.json")
def keras_train_and_save(estimator, model_params, save, FLAGS,
                         train_dataset_fn, val_dataset_fn, label_meta, epochs,
                         verbose, metric_names, validation_steps, load,
                         model_meta, is_pai):
    """Compile, train and save a Keras model.

    Args:
        estimator: the Keras model class or factory.
        model_params: dict of model constructor parameters.
        save: local directory to save the trained model into.
        FLAGS: TF flags; `worker_hosts` selects distributed training and
            `sqlflow_oss_modeldir` is the OSS destination on PAI.
        train_dataset_fn / val_dataset_fn: zero-arg dataset factories
            (val_dataset_fn may be None).
        label_meta: label metadata forwarded to the training loop.
        epochs, verbose, metric_names, validation_steps: training options.
        load: if truthy, a directory with pretrained weights to restore
            before training (incremental training).
        model_meta: model metadata saved next to the weights.
        is_pai: whether running on PAI; if so, upload the model to OSS.
    """
    print("Start training using keras model...")
    try:
        classifier, has_none_optimizer = keras_compile(
            estimator, model_params, metric_names)
    except Exception:
        # Guard clause: only models providing a custom train loop may
        # survive a compile failure; anything else re-raises unchanged.
        if not hasattr(estimator, "sqlflow_train_loop"):
            six.reraise(*sys.exc_info())
        sys.stderr.write(
            "compile keras model failed, ignoring this error "
            "since the model seems to defined sqlflow_train_loop.")
        classifier = init_model_with_feature_column(
            estimator, model_params, has_none_optimizer=True)
        has_none_optimizer = True

    train_dataset = train_dataset_fn()
    validate_dataset = val_dataset_fn() if val_dataset_fn is not None else None

    if load:
        # FIXME(typhoonzero): copied from runtime.tensorflow.train_keras
        inputs, targets = next(iter(train_dataset.take(1)))
        classifier.evaluate(inputs, targets)
        load_keras_model_weights(classifier, load)

    distributed = len(FLAGS.worker_hosts.split(",")) > 1
    if distributed:
        keras_train_distributed(classifier, model_params, save, model_meta,
                                FLAGS, train_dataset_fn, val_dataset_fn,
                                is_pai)
    else:
        keras_train_compiled(classifier, save, train_dataset,
                             validate_dataset, label_meta, epochs, verbose,
                             model_meta, validation_steps,
                             has_none_optimizer)

    if is_pai:
        print("saving keras model to: %s" % FLAGS.sqlflow_oss_modeldir)
        oss.save_dir(FLAGS.sqlflow_oss_modeldir, save)
        oss.save_file(FLAGS.sqlflow_oss_modeldir, "model_meta.json")
def keras_predict(estimator, model_params, save, result_table,
                  feature_column_names, feature_metas, train_label_name,
                  result_col_name, conn, predict_generator, selected_cols,
                  extra_result_cols):
    """Run prediction with a Keras model and write rows to result_table.

    Args:
        estimator: the Keras model class or factory.
        model_params: model constructor params; optimizer/loss entries are
            popped since they are not needed for inference.
        save: directory holding the trained model weights.
        result_table: destination table for prediction results.
        feature_column_names: ordered feature column names.
        feature_metas: per-column metadata (dtype, is_sparse, delimiters).
        train_label_name: label column used at training time; dropped from
            the output row when present in selected_cols.
        result_col_name: output column for the prediction result.
        conn: database connection for the buffered writer.
        predict_generator: generator yielding (row, label) tuples.
        selected_cols: column names of rows yielded by predict_generator.
        extra_result_cols: extra output columns; when non-empty the model
            must return a tuple with one extra item per column.
    """
    pop_optimizer_and_loss(model_params)
    classifier = init_model_with_feature_column(estimator, model_params)

    def eval_input_fn(batch_size, cache=False):
        # Build a tf.data pipeline over predict_generator for the parameter
        # warm-up pass below.
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))
        tf_gen = tf_generator(predict_generator, selected_cols,
                              feature_column_names, feature_metas)
        dataset = tf.data.Dataset.from_generator(tf_gen,
                                                 (tuple(feature_types), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset

    def to_feature_sample(row, selected_cols):
        # Convert one raw DB row into the {name: tensor} dict the model eats.
        features = {}
        for name in feature_column_names:
            row_val = row[selected_cols.index(name)]
            if feature_metas[name].get("delimiter_kv", "") != "":
                # kv list that should be parsed to two features.
                if feature_metas[name]["is_sparse"]:
                    features[name] = tf.SparseTensor(
                        row_val[0],
                        tf.ones_like(tf.reshape(row_val[0], [-1])),
                        row_val[2])
                    features["_".join([name, "weight"])] = tf.SparseTensor(
                        *row_val)
                else:
                    raise ValueError(
                        "not supported DENSE column with key:value"
                        "list format.")
            else:
                if feature_metas[name]["is_sparse"]:
                    features[name] = tf.SparseTensor(*row_val)
                else:
                    features[name] = tf.constant(([row_val], ))
        return features

    # BUGFIX: initialize so the `del pred_dataset` at the end does not raise
    # NameError when the model implements sqlflow_predict_one (the branch
    # below that assigns pred_dataset is skipped in that case).
    pred_dataset = None
    if not hasattr(classifier, 'sqlflow_predict_one'):
        # NOTE: load_weights should be called by keras models only.
        # NOTE: always use batch_size=1 when predicting to get the pairs of
        # features and predict results to insert into result table.
        pred_dataset = eval_input_fn(1)
        one_batch = next(iter(pred_dataset))
        # NOTE: must run predict one batch to initialize parameters. See:
        # https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models # noqa: E501
        classifier.predict_on_batch(one_batch)
        load_keras_model_weights(classifier, save)
        pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()

    column_names = selected_cols[:]
    # list.index only raises ValueError, e.g. when train_label_name is empty
    # for unsupervised models; the old bare `except:` hid other errors.
    try:
        train_label_index = selected_cols.index(train_label_name)
    except ValueError:
        train_label_index = -1
    if train_label_index != -1:
        del column_names[train_label_index]
    column_names.append(result_col_name)
    column_names.extend(extra_result_cols)

    with db.buffered_db_writer(conn, result_table, column_names, 100) as w:
        for row, _ in predict_generator():
            features = to_feature_sample(row, column_names)
            if hasattr(classifier, 'sqlflow_predict_one'):
                result = classifier.sqlflow_predict_one(features)
            else:
                result = classifier.predict_on_batch(features)
            if extra_result_cols:
                assert isinstance(
                    result, tuple
                ), "TO PREDICT must return a " \
                   "tuple when predict.extra_outputs is not empty"
                assert len(extra_result_cols) + 1 <= len(
                    result
                ), "TO PREDICT must return at least " \
                   "%d items instead of %d" % (len(extra_result_cols) + 1,
                                               len(result))
                extra_pred_outputs = result[1:len(extra_result_cols) + 1]
                result = result[0:1]
            else:
                extra_pred_outputs = None
            # FIXME(typhoonzero): determine the predict result is
            # classification by adding the prediction result together
            # to see if it is close to 1.0.
            if len(result[0]) == 1:  # regression result
                result = result[0][0]
            else:
                # Renamed from `sum` to avoid shadowing the builtin.
                prob_total = 0
                for prob in result[0]:
                    prob_total += prob
                if np.isclose(prob_total, 1.0):  # classification result
                    result = result[0].argmax(axis=-1)
                else:
                    result = result[0]  # multiple regression result
            row.append(encode_pred_result(result))
            if extra_pred_outputs is not None:
                row.extend(
                    [encode_pred_result(p) for p in extra_pred_outputs])
            if train_label_index != -1 and len(row) > train_label_index:
                del row[train_label_index]
            w.write(row)
    del pred_dataset
        # Tail of a method whose `def` line is above this chunk —
        # presumably `default_loss`; verify against the full file.
        return 'categorical_crossentropy'

    def default_training_epochs(self):
        """Default training epochs. Used in model.fit."""
        return 5

    def prepare_prediction_column(self, prediction):
        """Return the class label of highest probability."""
        return prediction.argmax(axis=-1)


# Example driver: build the model and either train-and-save or
# load-weights-and-predict, toggled by `is_training`.
model = DNNClassifier(feature_columns=feature_columns,
                      hidden_units=[10, 10],
                      n_classes=3)

is_training = False
if is_training:
    model.compile(optimizer=model.default_optimizer(),
                  loss=model.default_loss())
    model.fit(train_ds,
              validation_data=val_ds,
              epochs=model.default_training_epochs(),
              verbose=0)
    # save_format="tf" stores the model in TensorFlow SavedModel format.
    model.save('my_model', save_format="tf")
    print("Done training.")
else:
    # Run predict once first so the subclassed model builds its variables;
    # only then can the saved weights be loaded into it.
    model.predict(test_ds)
    load_keras_model_weights(model, 'my_model')
    prediction = model.predict(test_ds)
    print(model.prepare_prediction_column(prediction))
    print("Done predicting.")
def explain(datasource,
            estimator_string,
            select,
            feature_columns,
            feature_column_names,
            feature_metas=None,
            label_meta=None,
            model_params=None,
            save="",
            pai_table="",
            plot_type='bar',
            result_table="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None):
    """Explain a trained model's predictions and optionally store results.

    Args:
        datasource: database connection string.
        estimator_string: import path of the model class.
        select: SQL statement selecting the data to explain.
        feature_columns: dict of feature column definitions merged into
            model_params.
        feature_column_names: ordered feature column names.
        feature_metas: per-column metadata dict (default: empty).
        label_meta: label metadata dict (default: empty).
        model_params: model constructor params dict (default: empty);
            mutated in place below.
        save: directory of the trained model.
        pai_table: when non-empty, read input from this PAI table.
        plot_type: plot type for the explanation figures.
        result_table: optional table to write explanation values to.
        oss_dest/oss_ak/oss_sk/oss_endpoint/oss_bucket_name: OSS upload
            parameters for the generated plots.
    """
    # BUGFIX: these parameters previously defaulted to `{}` — a shared
    # mutable default. model_params is mutated below (model_dir assignment,
    # update(), pop_optimizer_and_loss()), so entries leaked across calls.
    if feature_metas is None:
        feature_metas = {}
    if label_meta is None:
        label_meta = {}
    if model_params is None:
        model_params = {}

    estimator_cls = import_model(estimator_string)
    is_pai = True if pai_table else False
    if is_pai:
        FLAGS = tf.app.flags.FLAGS
        model_params["model_dir"] = FLAGS.checkpointDir
        select = ""
    else:
        if is_tf_estimator(estimator_cls):
            model_params['model_dir'] = save
    model_params.update(feature_columns)
    pop_optimizer_and_loss(model_params)

    def _input_fn():
        dataset = input_fn(select,
                           datasource,
                           feature_column_names,
                           feature_metas,
                           label_meta,
                           is_pai=is_pai,
                           pai_table=pai_table)
        return dataset.batch(1).cache()

    estimator = init_model_with_feature_column(estimator_cls, model_params)
    if not is_tf_estimator(estimator_cls):
        # Keras models restore weights explicitly; estimators load from
        # model_dir on their own.
        load_keras_model_weights(estimator, save)

    if is_pai:
        conn = PaiIOConnection.from_table(
            result_table) if result_table else None
    else:
        conn = connect_with_data_source(datasource)

    if estimator_cls in (tf.estimator.BoostedTreesClassifier,
                         tf.estimator.BoostedTreesRegressor):
        explain_boosted_trees(datasource, estimator, _input_fn, plot_type,
                              result_table, feature_column_names, conn,
                              oss_dest, oss_ak, oss_sk, oss_endpoint,
                              oss_bucket_name)
    else:
        # SHAP needs a plain DataFrame: one row per sample, scalars only.
        shap_dataset = pd.DataFrame(columns=feature_column_names)
        for i, (features, label) in enumerate(_input_fn()):
            shap_dataset.loc[i] = [
                item.numpy()[0][0] for item in features.values()
            ]
        explain_dnns(datasource, estimator, shap_dataset, plot_type,
                     result_table, feature_column_names, conn, oss_dest,
                     oss_ak, oss_sk, oss_endpoint, oss_bucket_name)

    if conn is not None:
        conn.close()
def keras_predict(estimator, model_params, save, result_table,
                  feature_column_names, feature_metas, train_label_name,
                  result_col_name, conn, predict_generator, selected_cols):
    """Run prediction with a Keras model and write rows to result_table.

    Args:
        estimator: the Keras model class or factory.
        model_params: model constructor params; optimizer/loss entries are
            popped since they are not needed for inference.
        save: directory holding the trained model weights.
        result_table: destination table for prediction results.
        feature_column_names: ordered feature column names.
        feature_metas: per-column metadata (dtype, is_sparse, ...).
        train_label_name: label column used at training time; dropped from
            the output columns when present in selected_cols.
        result_col_name: output column for the prediction result.
        conn: database connection for the buffered writer.
        predict_generator: generator of input rows for prediction.
        selected_cols: column names of rows from predict_generator.
    """
    pop_optimizer_and_loss(model_params)
    classifier = init_model_with_feature_column(estimator, model_params)

    def eval_input_fn(batch_size, cache=False):
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))
        tf_gen = tf_generator(predict_generator, selected_cols,
                              feature_column_names, feature_metas)
        dataset = tf.data.Dataset.from_generator(tf_gen,
                                                 (tuple(feature_types), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset

    if not hasattr(classifier, 'sqlflow_predict_one'):
        # NOTE: load_weights should be called by keras models only.
        # NOTE: always use batch_size=1 when predicting to get the pairs of
        # features and predict results to insert into result table.
        pred_dataset = eval_input_fn(1)
        one_batch = next(iter(pred_dataset))
        # NOTE: must run predict one batch to initialize parameters. See:
        # https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models # noqa: E501
        classifier.predict_on_batch(one_batch)
        load_keras_model_weights(classifier, save)
    # BUGFIX: build the prediction dataset unconditionally. It was only
    # assigned inside the branch above, so models implementing
    # sqlflow_predict_one hit a NameError at the loop below.
    pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()

    column_names = selected_cols[:]
    # list.index only raises ValueError (e.g. empty train_label_name for
    # unsupervised models); the old bare `except:` hid other errors.
    try:
        train_label_index = selected_cols.index(train_label_name)
    except ValueError:
        train_label_index = -1
    if train_label_index != -1:
        del column_names[train_label_index]
    column_names.append(result_col_name)

    with db.buffered_db_writer(conn, result_table, column_names, 100) as w:
        for features in pred_dataset:
            if hasattr(classifier, 'sqlflow_predict_one'):
                result = classifier.sqlflow_predict_one(features)
            else:
                result = classifier.predict_on_batch(features)
            # FIXME(typhoonzero): determine the predict result is
            # classification by adding the prediction result together
            # to see if it is close to 1.0.
            if len(result[0]) == 1:  # regression result
                result = result[0][0]
            else:
                # Renamed from `sum` to avoid shadowing the builtin.
                prob_total = 0
                for prob in result[0]:
                    prob_total += prob
                if np.isclose(prob_total, 1.0):  # classification result
                    result = result[0].argmax(axis=-1)
                else:
                    result = result[0]  # multiple regression result
            row = []
            for name in feature_column_names:
                val = features[name].numpy()[0][0]
                row.append(str(val))
            if isinstance(result, np.ndarray):
                if len(result) > 1:
                    # NOTE(typhoonzero): if the output dimension > 1, format
                    # output tensor using a comma separated string. Only
                    # available for keras models.
                    row.append(",".join([str(i) for i in result]))
                else:
                    row.append(str(result[0]))
            else:
                row.append(str(result))
            w.write(row)
    del pred_dataset