Exemplo n.º 1
0
def keras_evaluate(keras_model, eval_dataset_fn, save, keras_model_pkg,
                   validation_metrics):
    model_metrics = []
    if hasattr(keras_model_pkg, "eval_metrics_fn"):
        metrics_functions = keras_model_pkg.eval_metrics_fn()
        for key, func in metrics_functions.items():
            func.__name__ = key
            model_metrics.append(func)
    # use WITH specified metrics if it's not default.
    if validation_metrics != ["Accuracy"]:
        keras_metrics = metrics.get_keras_metrics(validation_metrics)
    else:
        if len(model_metrics) > 0:
            keras_metrics = model_metrics
        else:
            # default
            keras_metrics = metrics.get_keras_metrics(["Accuracy"])
    has_custom_evaluate_func = hasattr(keras_model, 'sqlflow_evaluate_loop')

    if not has_custom_evaluate_func:
        # compile the model with default arguments only for evaluation
        # (run forward only).
        keras_model.compile(loss=keras_model_pkg.loss, metrics=keras_metrics)

    eval_dataset = eval_dataset_fn()

    def get_features(sample, label):
        return sample

    eval_dataset_x = eval_dataset.map(get_features)

    if has_custom_evaluate_func:
        result = keras_model.sqlflow_evaluate_loop(eval_dataset,
                                                   validation_metrics)
    else:
        one_batch = next(iter(eval_dataset_x))
        # NOTE: must run predict one batch to initialize parameters
        # see: https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models # noqa: E501
        keras_model.predict_on_batch(one_batch)
        load_keras_model_weights(keras_model, save)
        result = keras_model.evaluate(eval_dataset)

    assert (len(result) == len(validation_metrics) + 1)
    result_metrics = dict()
    for idx, m in enumerate(["loss"] + validation_metrics):
        result_metrics[m] = float(result[idx])
    return result_metrics
Exemplo n.º 2
0
def keras_train_and_save_legacy(estimator, model_params, save, FLAGS,
                                train_dataset_fn, val_dataset_fn, label_meta,
                                epochs, verbose, metric_names,
                                validation_steps, load_pretrained_model,
                                model_meta, is_pai):
    print("Start training using keras model...")
    try:
        classifier, has_none_optimizer = keras_compile(estimator, model_params,
                                                       metric_names)
    except Exception as e:
        if hasattr(estimator, "sqlflow_train_loop"):
            sys.stderr.write(
                "compile keras model failed, ignoring this error "
                "since the model seems to defined sqlflow_train_loop.")
            classifier = init_model_with_feature_column(
                estimator, model_params, has_none_optimizer=True)
            has_none_optimizer = True
        else:
            raise e

    train_dataset = train_dataset_fn()
    if val_dataset_fn is not None:
        validate_dataset = val_dataset_fn()
    else:
        validate_dataset = None

    if load_pretrained_model:
        # Must run one batch to initialize parameters before load_weights
        inputs, targets = next(iter(train_dataset.take(1)))
        classifier.evaluate(inputs, targets)

        # NOTE(sneaxiy): should we save/load optimizer info for incremental
        # training, or let users to write the same WITH statements in SQL?
        load_keras_model_weights(classifier, save)

    if len(FLAGS.worker_hosts.split(",")) > 1:
        keras_train_distributed(classifier, model_params, save, model_meta,
                                FLAGS, train_dataset_fn, val_dataset_fn,
                                is_pai)
    else:
        keras_train_compiled(classifier, save, train_dataset, validate_dataset,
                             label_meta, epochs, verbose, model_meta,
                             validation_steps, has_none_optimizer)
    if is_pai:
        print("saving keras model to: %s" % FLAGS.sqlflow_oss_modeldir)
        oss.save_dir(FLAGS.sqlflow_oss_modeldir, save)
        oss.save_file(FLAGS.sqlflow_oss_modeldir, "model_meta.json")
Exemplo n.º 3
0
def keras_train_and_save(estimator, model_params, save, FLAGS,
                         train_dataset_fn, val_dataset_fn, label_meta, epochs,
                         verbose, metric_names, validation_steps, load,
                         model_meta, is_pai):
    print("Start training using keras model...")
    try:
        classifier, has_none_optimizer = keras_compile(estimator, model_params,
                                                       metric_names)
    except Exception:
        if hasattr(estimator, "sqlflow_train_loop"):
            sys.stderr.write(
                "compile keras model failed, ignoring this error "
                "since the model seems to defined sqlflow_train_loop.")
            classifier = init_model_with_feature_column(
                estimator, model_params, has_none_optimizer=True)
            has_none_optimizer = True
        else:
            six.reraise(*sys.exc_info())

    train_dataset = train_dataset_fn()
    if val_dataset_fn is not None:
        validate_dataset = val_dataset_fn()
    else:
        validate_dataset = None

    if load:
        # FIXME(typhoonzero): copied from runtime.tensorflow.train_keras
        inputs, targets = next(iter(train_dataset.take(1)))
        classifier.evaluate(inputs, targets)
        load_keras_model_weights(classifier, load)

    if len(FLAGS.worker_hosts.split(",")) > 1:
        keras_train_distributed(classifier, model_params, save, model_meta,
                                FLAGS, train_dataset_fn, val_dataset_fn,
                                is_pai)
    else:
        keras_train_compiled(classifier, save, train_dataset, validate_dataset,
                             label_meta, epochs, verbose, model_meta,
                             validation_steps, has_none_optimizer)

    if is_pai:
        print("saving keras model to: %s" % FLAGS.sqlflow_oss_modeldir)
        oss.save_dir(FLAGS.sqlflow_oss_modeldir, save)
        oss.save_file(FLAGS.sqlflow_oss_modeldir, "model_meta.json")
Exemplo n.º 4
0
def keras_predict(estimator, model_params, save, result_table,
                  feature_column_names, feature_metas, train_label_name,
                  result_col_name, conn, predict_generator, selected_cols,
                  extra_result_cols):
    pop_optimizer_and_loss(model_params)
    classifier = init_model_with_feature_column(estimator, model_params)

    def eval_input_fn(batch_size, cache=False):
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))
        tf_gen = tf_generator(predict_generator, selected_cols,
                              feature_column_names, feature_metas)
        dataset = tf.data.Dataset.from_generator(tf_gen,
                                                 (tuple(feature_types), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset

    def to_feature_sample(row, selected_cols):
        features = {}
        for name in feature_column_names:
            row_val = row[selected_cols.index(name)]
            if feature_metas[name].get("delimiter_kv", "") != "":
                # kv list that should be parsed to two features.
                if feature_metas[name]["is_sparse"]:
                    features[name] = tf.SparseTensor(
                        row_val[0], tf.ones_like(tf.reshape(row_val[0], [-1])),
                        row_val[2])
                    features["_".join([name,
                                       "weight"])] = tf.SparseTensor(*row_val)
                else:
                    raise ValueError(
                        "not supported DENSE column with key:value"
                        "list format.")
            else:
                if feature_metas[name]["is_sparse"]:
                    features[name] = tf.SparseTensor(*row_val)
                else:
                    features[name] = tf.constant(([row_val], ))
        return features

    if not hasattr(classifier, 'sqlflow_predict_one'):
        # NOTE: load_weights should be called by keras models only.
        # NOTE: always use batch_size=1 when predicting to get the pairs of
        #       features and predict results to insert into result table.
        pred_dataset = eval_input_fn(1)
        one_batch = next(iter(pred_dataset))
        # NOTE: must run predict one batch to initialize parameters. See:
        # https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models  # noqa: E501
        classifier.predict_on_batch(one_batch)
        load_keras_model_weights(classifier, save)
    pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()

    column_names = selected_cols[:]
    try:
        train_label_index = selected_cols.index(train_label_name)
    except:  # noqa: E722
        train_label_index = -1
    if train_label_index != -1:
        del column_names[train_label_index]
    column_names.append(result_col_name)

    column_names.extend(extra_result_cols)

    with db.buffered_db_writer(conn, result_table, column_names, 100) as w:
        for row, _ in predict_generator():
            features = to_feature_sample(row, column_names)
            if hasattr(classifier, 'sqlflow_predict_one'):
                result = classifier.sqlflow_predict_one(features)
            else:
                result = classifier.predict_on_batch(features)

            if extra_result_cols:
                assert isinstance(
                    result, tuple
                ), "TO PREDICT must return a " \
                   "tuple when predict.extra_outputs is not empty"
                assert len(extra_result_cols) + 1 <= len(
                    result
                ), "TO PREDICT must return at least " \
                   "%d items instead of %d" % (len(extra_result_cols) + 1,
                                               len(result))
                extra_pred_outputs = result[1:len(extra_result_cols) + 1]
                result = result[0:1]
            else:
                extra_pred_outputs = None

            # FIXME(typhoonzero): determine the predict result is
            # classification by adding the prediction result together
            # to see if it is close to 1.0.
            if len(result[0]) == 1:  # regression result
                result = result[0][0]
            else:
                sum = 0
                for i in result[0]:
                    sum += i
                if np.isclose(sum, 1.0):  # classification result
                    result = result[0].argmax(axis=-1)
                else:
                    result = result[0]  # multiple regression result

            row.append(encode_pred_result(result))
            if extra_pred_outputs is not None:
                row.extend([encode_pred_result(p) for p in extra_pred_outputs])
            if train_label_index != -1 and len(row) > train_label_index:
                del row[train_label_index]
            w.write(row)
    del pred_dataset
Exemplo n.º 5
0
        return 'categorical_crossentropy'

    def default_training_epochs(self):
        """Default training epochs. Used in model.fit."""
        return 5

    def prepare_prediction_column(self, prediction):
        """Return the class label of highest probability."""
        return prediction.argmax(axis=-1)


model = DNNClassifier(feature_columns=feature_columns,
                      hidden_units=[10, 10],
                      n_classes=3)
is_training = False
if is_training:
    model.compile(optimizer=model.default_optimizer(),
                  loss=model.default_loss())
    model.fit(train_ds,
              validation_data=val_ds,
              epochs=model.default_training_epochs(),
              verbose=0)
    model.save('my_model', save_format="tf")
    print("Done training.")
else:
    model.predict(test_ds)
    load_keras_model_weights(model, 'my_model')
    prediction = model.predict(test_ds)
    print(model.prepare_prediction_column(prediction))
    print("Done predicting.")
Exemplo n.º 6
0
def explain(datasource,
            estimator_string,
            select,
            feature_columns,
            feature_column_names,
            feature_metas={},
            label_meta={},
            model_params={},
            save="",
            pai_table="",
            plot_type='bar',
            result_table="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None):
    estimator_cls = import_model(estimator_string)
    is_pai = True if pai_table else False
    if is_pai:
        FLAGS = tf.app.flags.FLAGS
        model_params["model_dir"] = FLAGS.checkpointDir
        select = ""
    else:
        if is_tf_estimator(estimator_cls):
            model_params['model_dir'] = save
    model_params.update(feature_columns)
    pop_optimizer_and_loss(model_params)

    def _input_fn():
        dataset = input_fn(select,
                           datasource,
                           feature_column_names,
                           feature_metas,
                           label_meta,
                           is_pai=is_pai,
                           pai_table=pai_table)
        return dataset.batch(1).cache()

    estimator = init_model_with_feature_column(estimator_cls, model_params)
    if not is_tf_estimator(estimator_cls):
        load_keras_model_weights(estimator, save)

    if is_pai:
        conn = PaiIOConnection.from_table(
            result_table) if result_table else None
    else:
        conn = connect_with_data_source(datasource)

    if estimator_cls in (tf.estimator.BoostedTreesClassifier,
                         tf.estimator.BoostedTreesRegressor):
        explain_boosted_trees(datasource, estimator, _input_fn, plot_type,
                              result_table, feature_column_names, conn,
                              oss_dest, oss_ak, oss_sk, oss_endpoint,
                              oss_bucket_name)
    else:
        shap_dataset = pd.DataFrame(columns=feature_column_names)
        for i, (features, label) in enumerate(_input_fn()):
            shap_dataset.loc[i] = [
                item.numpy()[0][0] for item in features.values()
            ]
        explain_dnns(datasource, estimator, shap_dataset, plot_type,
                     result_table, feature_column_names, conn, oss_dest,
                     oss_ak, oss_sk, oss_endpoint, oss_bucket_name)

    if conn is not None:
        conn.close()
Exemplo n.º 7
0
def keras_predict(estimator, model_params, save, result_table,
                  feature_column_names, feature_metas, train_label_name,
                  result_col_name, conn, predict_generator, selected_cols):
    pop_optimizer_and_loss(model_params)
    classifier = init_model_with_feature_column(estimator, model_params)

    def eval_input_fn(batch_size, cache=False):
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))
        tf_gen = tf_generator(predict_generator, selected_cols,
                              feature_column_names, feature_metas)
        dataset = tf.data.Dataset.from_generator(tf_gen,
                                                 (tuple(feature_types), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset

    if not hasattr(classifier, 'sqlflow_predict_one'):
        # NOTE: load_weights should be called by keras models only.
        # NOTE: always use batch_size=1 when predicting to get the pairs of
        #       features and predict results to insert into result table.
        pred_dataset = eval_input_fn(1)
        one_batch = next(iter(pred_dataset))
        # NOTE: must run predict one batch to initialize parameters. See:
        # https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models  # noqa: E501
        classifier.predict_on_batch(one_batch)
        load_keras_model_weights(classifier, save)
    pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()

    column_names = selected_cols[:]
    try:
        train_label_index = selected_cols.index(train_label_name)
    except:  # noqa: E722
        train_label_index = -1
    if train_label_index != -1:
        del column_names[train_label_index]
    column_names.append(result_col_name)

    with db.buffered_db_writer(conn, result_table, column_names, 100) as w:
        for features in pred_dataset:
            if hasattr(classifier, 'sqlflow_predict_one'):
                result = classifier.sqlflow_predict_one(features)
            else:
                result = classifier.predict_on_batch(features)
            # FIXME(typhoonzero): determine the predict result is
            # classification by adding the prediction result together
            # to see if it is close to 1.0.
            if len(result[0]) == 1:  # regression result
                result = result[0][0]
            else:
                sum = 0
                for i in result[0]:
                    sum += i
                if np.isclose(sum, 1.0):  # classification result
                    result = result[0].argmax(axis=-1)
                else:
                    result = result[0]  # multiple regression result
            row = []
            for idx, name in enumerate(feature_column_names):
                val = features[name].numpy()[0][0]
                row.append(str(val))
            if isinstance(result, np.ndarray):
                if len(result) > 1:
                    # NOTE(typhoonzero): if the output dimension > 1, format
                    # output tensor using a comma separated string. Only
                    # available for keras models.
                    row.append(",".join([str(i) for i in result]))
                else:
                    row.append(str(result[0]))
            else:
                row.append(str(result))
            w.write(row)
    del pred_dataset