Example #1
    def test_trainer_shouldnt_crash(self):
        label_key = "label"
        # Parse the inferred schema and build a numeric feature column
        # (defaulting to 0.0) for every feature it describes.
        feature_spec, _ = Datasets.parse_schema(self.schema_path)
        all_features = {
            name: tf.feature_column.numeric_column(name, default_value=0.0)
            for name in feature_spec.keys()
        }
        # The label must not be passed to the estimator as a feature.
        feature_columns = all_features.copy()
        feature_columns.pop(label_key)

        # Checkpoints and summaries go to a throwaway temporary directory.
        config = tf.estimator.RunConfig(tempfile.mkdtemp())

        estimator = tf.estimator.LinearClassifier(
            feature_columns=feature_columns.values(), config=config)

        def split_features_label_fn(parsed_features):
            label = parsed_features.pop(label_key)
            return parsed_features, label

        def get_in_fn(data):
            # Build an input_fn that reads TFRecords with the full feature spec
            # (including the label) and splits off the label.
            raw_feature_spec = tf.feature_column.make_parse_example_spec(
                all_features.values())

            def in_fn():
                dataset = Datasets.examples_via_feature_spec(
                    data, raw_feature_spec)
                return dataset.map(split_features_label_fn)

            return in_fn

        # Estimator.train returns the estimator itself, so evaluation can be chained.
        estimator.train(get_in_fn(self.train_data)).evaluate(
            get_in_fn(self.eval_data))
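
The test method above relies on module-level imports and fixtures defined elsewhere in the test file. A minimal sketch of that scaffolding, assuming the TFRecord shards sit next to an inferred schema file (the test-case name, the data paths, and the Datasets import path are assumptions, not part of the original example):

import os
import tempfile
import unittest

import tensorflow as tf
from spotify_tensorflow import Datasets  # import path assumed


class LinearClassifierTest(unittest.TestCase):  # hypothetical test-case name
    def setUp(self):
        # Hypothetical layout: TFRecord shards plus the schema inferred upstream.
        data_dir = os.environ.get("TEST_DATA_DIR", "/tmp/test-data")
        self.schema_path = os.path.join(data_dir, "_inferred_schema.pb")
        self.train_data = os.path.join(data_dir, "train", "part-*")
        self.eval_data = os.path.join(data_dir, "eval", "part-*")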
Example #2
def train(_):
    import tempfile

    config = tf.estimator.RunConfig(tempfile.mkdtemp())

    train_data_dir = get_data_dir("train")
    schema_path = os.path.join(train_data_dir, "_inferred_schema.pb")

    feature_spec, _ = Datasets.parse_schema(schema_path)
    # Use an OrderedDict with sorted keys so the feature order is deterministic.
    all_features = OrderedDict([
        (name, tf.feature_column.numeric_column(name, default_value=0.0))
        for name in sorted(feature_spec.keys())
    ])
    feature_columns = all_features.copy()
    # Featran one-hot encodes the class into "class_name_*" columns; collect them
    # as label columns and remove them from the model's feature columns.
    label_keys = sorted(
        l for l in feature_columns.keys() if l.startswith("class_name"))
    for l in label_keys:
        feature_columns.pop(l)

    def split_features_label_fn(spec):
        # TF's canned LinearClassifier expects the label as a single integer class id,
        # but Featran one-hot encodes the class, so collapse the one-hot columns
        # into a single integer with argmax.
        labels = tf.concat([[spec.pop(l)] for l in label_keys], axis=0)
        label = tf.argmax(labels, axis=0)
        # Whatever remains in the spec are the features.
        return spec, label

    def get_in_fn(data):
        raw_feature_spec = tf.feature_column.make_parse_example_spec(
            all_features.values())

        def in_fn():
            dataset = Datasets.examples_via_feature_spec(
                data, raw_feature_spec)
            return dataset.map(split_features_label_fn)

        return in_fn

    # n_classes must match the number of one-hot class_name columns.
    classifier = tf.estimator.LinearClassifier(
        feature_columns=feature_columns.values(), n_classes=3, config=config)

    train_data = os.path.join(train_data_dir, "part-*")
    eval_data = os.path.join(get_data_dir("eval"), "part-*")
    classifier.train(get_in_fn(train_data)).evaluate(get_in_fn(eval_data))
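
To make the label handling concrete, here is a small, self-contained sketch of what split_features_label_fn does to one parsed example. The feature names and values are made up, and it uses a plain TF 1.x session only to print the result; it is an illustration of the concat/argmax step, not part of the training pipeline above.

import tensorflow as tf

# Made-up parsed example: three one-hot "class_name_*" columns plus one real feature.
spec = {
    "class_name_setosa": tf.constant([0.0]),
    "class_name_versicolor": tf.constant([1.0]),
    "class_name_virginica": tf.constant([0.0]),
    "petal_length": tf.constant([1.4]),
}
label_keys = sorted(k for k in spec if k.startswith("class_name"))

# Same operations as split_features_label_fn: pop the one-hot columns,
# stack them, and argmax to recover a single integer class id.
labels = tf.concat([[spec.pop(k)] for k in label_keys], axis=0)
label = tf.argmax(labels, axis=0)

with tf.Session() as sess:
    print(sess.run(label))      # [1] -> "class_name_versicolor"
    print(sorted(spec.keys()))  # remaining feature: ['petal_length']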