Пример #1
0
    def _transform(self, dataset):
        """
        Apply the stored TensorFlow graph to `dataset` block by block.

        The graph definition comes from `self._optimize_for_inference()`.
        The input mapping is iterated as (column name, tensor name) pairs
        and the output mapping as (tensor name, column name) pairs.

        :param dataset: dataframe to run the graph on
        :return: dataframe produced by `tfs.map_blocks`, with output columns
                 renamed according to the output mapping
        """
        graph_def = self._optimize_for_inference()
        col_to_tnsr = self.getInputMapping()
        tnsr_to_col = self.getOutputMapping()

        tf_graph = tf.Graph()
        with tf.Session(graph=tf_graph):
            df_analyzed = tfs.analyze(dataset)

            # Op names of the tensors we want back from the graph.
            fetch_op_names = []
            for out_tnsr_name, _ in tnsr_to_col:
                fetch_op_names.append(tfx.op_name(out_tnsr_name))

            # Import the graph into the session's graph without a name prefix.
            tf.import_graph_def(graph_def=graph_def,
                                name='',
                                return_elements=fetch_op_names)

            # Placeholder name -> dataframe column that feeds it.
            placeholder_to_col = {}
            for in_col, in_tnsr_name in col_to_tnsr:
                placeholder_to_col[self._getSparkDlOpName(in_tnsr_name)] = in_col

            fetch_tensors = []
            for op_name in fetch_op_names:
                fetch_tensors.append(tfx.get_tensor(op_name, tf_graph))

            result_df = tfs.map_blocks(fetch_tensors,
                                       df_analyzed,
                                       feed_dict=placeholder_to_col)

            # Output columns are named after the ops; rename the ones whose
            # requested column name differs.
            for out_tnsr_name, wanted_col in tnsr_to_col:
                produced_col = tfx.op_name(out_tnsr_name, tf_graph)
                if produced_col == wanted_col:
                    continue
                result_df = result_df.withColumnRenamed(produced_col, wanted_col)

        return result_df
Пример #2
0
def residual_of_closest(df,
                        feature_column,
                        residual_column,
                        centers,
                        assigned_column='assigned'):
    """
    Compute, for every point, its closest center and the residual to it.

    :param df: dataframe, contains all points
    :param feature_column: string, the points column within df; also used as
        the name of the input placeholder
    :param residual_column: string, the output column name for the residual
        error between a point and its closest center
    :param centers: numpy.array, [num_centroids, num_features] the k cluster centers
    :param assigned_column: string, the output column name for the index of
        the closest center
    :return: dataframe returned by `tfs.map_blocks` for the assignment and
        residual tensors
    """
    # NOTE(review): `assigned_column` is not referenced anywhere in this body;
    # presumably `_assign_center` fixes the output name itself -- confirm.
    analyzed_df = tfs.analyze(df)
    feature_dim = centers.shape[1]

    graph = tf.Graph()
    with graph.as_default():
        point_block = tf.placeholder(tf.double,
                                     shape=[None, feature_dim],
                                     name=feature_column)
        closest = _assign_center(point_block, centers)
        error = _residual_of_assigned(point_block, closest, centers,
                                      residual_column)
        result_df = tfs.map_blocks([closest, error], analyzed_df)
    return result_df
Пример #3
0
    def _transform(self, dataset):
        """
        Apply the stored TensorFlow graph to `dataset` block by block.

        Logs a warning when the dataframe contains DoubleType columns. The
        input mapping is iterated as (column name, tensor name) pairs and the
        output mapping as (tensor name, column name) pairs.

        :param dataset: dataframe to run the graph on
        :return: dataframe produced by `tfs.map_blocks`, with output columns
                 renamed according to the output mapping
        """
        has_double_col = any(field.dataType == DoubleType() for field in dataset.schema)
        if has_double_col:
            logger.warning("Detected DoubleType columns in dataframe passed to transform(). In "
                           "Deep Learning Pipelines 1.0 and above, DoubleType columns can only be "
                           "fed to input tensors of type tf.float64. To feed dataframe data to "
                           "tensors of other types (e.g. tf.float32, tf.int32, tf.int64), use the "
                           "corresponding Spark SQL data types (FloatType, IntegerType, LongType).")

        graph_def = self._optimize_for_inference()
        col_to_tnsr = self.getInputMapping()
        tnsr_to_col = self.getOutputMapping()

        tf_graph = tf.Graph()
        with tf.Session(graph=tf_graph):
            df_analyzed = tfs.analyze(dataset)

            # Op names of the tensors we want back from the graph.
            fetch_op_names = []
            for out_tnsr_name, _ in tnsr_to_col:
                fetch_op_names.append(tfx.op_name(out_tnsr_name))

            # Import the graph into the session's graph without a name prefix.
            tf.import_graph_def(graph_def=graph_def, name='', return_elements=fetch_op_names)

            # Placeholder name -> dataframe column that feeds it.
            placeholder_to_col = {}
            for in_col, in_tnsr_name in col_to_tnsr:
                placeholder_to_col[tfx.op_name(in_tnsr_name)] = in_col

            fetch_tensors = []
            for op_name in fetch_op_names:
                fetch_tensors.append(tfx.get_tensor(op_name, tf_graph))

            result_df = tfs.map_blocks(fetch_tensors, df_analyzed, feed_dict=placeholder_to_col)

            # Output columns are named after the ops; rename the ones whose
            # requested column name differs.
            for out_tnsr_name, wanted_col in tnsr_to_col:
                produced_col = tfx.op_name(out_tnsr_name, tf_graph)
                if produced_col == wanted_col:
                    continue
                result_df = result_df.withColumnRenamed(produced_col, wanted_col)

        return result_df
Пример #4
0
def simple_example_2():
    """
    Demonstrate a block-wise reduction over a dataframe of 2-element vectors.

    Builds ten rows `[y, -y]` for y in 0..9, analyzes the dataframe, then
    reduces the column with an elementwise sum and (on an aliased copy) an
    elementwise minimum, and prints both results.
    """
    session_builder = SparkSession.builder.appName('simple-tensorframes-example-2')
    spark = session_builder.getOrCreate()
    spark.sparkContext.setLogLevel('WARN')

    rows = []
    for y in range(10):
        rows.append(Row(y=[float(y), float(-y)]))
    vectors_df = spark.createDataFrame(rows)

    vectors_df.show()
    tfs.print_schema(vectors_df)

    # The vector dimensions are only known after analysis.
    analyzed_df = tfs.analyze(vectors_df)
    tfs.print_schema(analyzed_df)

    # Duplicate 'y' as 'z': an inexpensive operation in Spark 2.0+.
    y_col = analyzed_df.y
    two_col_df = analyzed_df.select(y_col, analyzed_df.y.alias('z'))

    # Build and run the tensor graph.
    with tf.Graph().as_default():
        y_block = tfs.block(two_col_df, 'y', tf_name='y_input')
        z_block = tfs.block(two_col_df, 'z', tf_name='z_input')

        # Reduce over the row axis: elementwise sum and elementwise minimum.
        sum_op = tf.reduce_sum(y_block, [0], name='y')
        min_op = tf.reduce_min(z_block, [0], name='z')

        data_sum, data_min = tfs.reduce_blocks([sum_op, min_op], two_col_df)

    print('Elementwise sum: %s and minimum: %s' % (data_sum, data_min))
Пример #5
0
    def _transform(self, dataset):
        """
        Apply the stored TensorFlow graph to `dataset` block by block.

        Logs a warning when the dataframe contains DoubleType columns. The
        input mapping is iterated as (column name, tensor name) pairs and the
        output mapping as (tensor name, column name) pairs.

        :param dataset: dataframe to run the graph on
        :return: dataframe produced by `tfs.map_blocks`, with output columns
                 renamed according to the output mapping
        """
        # `Logger.warn` is a deprecated alias; `warning` is the supported name.
        if any(field.dataType == DoubleType() for field in dataset.schema):
            logger.warning("Detected DoubleType columns in dataframe passed to transform(). In "
                           "Deep Learning Pipelines 1.0 and above, DoubleType columns can only be "
                           "fed to input tensors of type tf.float64. To feed dataframe data to "
                           "tensors of other types (e.g. tf.float32, tf.int32, tf.int64), use the "
                           "corresponding Spark SQL data types (FloatType, IntegerType, LongType).")

        graph_def = self._optimize_for_inference()
        input_mapping = self.getInputMapping()
        output_mapping = self.getOutputMapping()

        graph = tf.Graph()
        with tf.Session(graph=graph):
            analyzed_df = tfs.analyze(dataset)
            out_tnsr_op_names = [tfx.op_name(tnsr_name) for tnsr_name, _ in output_mapping]
            # Load graph (no name prefix, so op names stay as exported)
            tf.import_graph_def(graph_def=graph_def, name='', return_elements=out_tnsr_op_names)
            # Feed dict maps from placeholder name to DF column name
            feed_dict = {tfx.op_name(tnsr_name): col_name for col_name, tnsr_name in input_mapping}
            fetches = [tfx.get_tensor(tnsr_name, graph) for tnsr_name in out_tnsr_op_names]
            out_df = tfs.map_blocks(fetches, analyzed_df, feed_dict=feed_dict)
            # We still have to rename output columns: they are named after the
            # ops, not after the requested column names.
            for tnsr_name, new_colname in output_mapping:
                old_colname = tfx.op_name(tnsr_name, graph)
                if old_colname != new_colname:
                    out_df = out_df.withColumnRenamed(old_colname, new_colname)

        return out_df
Пример #6
0
def tf_serving_with_dataframe(df, model_base_path, model_version=None):
    """
    Run a TensorFlow saved model over a Spark dataframe in batch.

    :param df: spark dataframe, batch input for the model
    :param model_base_path: str, tensorflow saved Model model base path
    :param model_version: int, tensorflow saved Model model version, default None
    :return: spark dataframe, with predicted result.
    """
    import tensorframes as tfs
    g, feed_tensors, fetch_tensors = load_model(model_base_path, model_version)
    with g.as_default():
        # Rename columns for the feed tensors before running the graph
        # (presumably to the tensor names -- confirm against rename_by_mapping).
        df = rename_by_mapping(df, feed_tensors)
        df = tfs.analyze(df)
        # Pass a real list: on Python 3 `dict.values()` is a view object, not
        # a list/tuple, so it would not be recognised as a sequence of fetches.
        df = tfs.map_blocks(list(fetch_tensors.values()), df)
        # Undo the input renaming, then apply the reverse mapping for outputs.
        df = rename_by_mapping(df, feed_tensors, reverse=True)
        return rename_by_mapping(df, fetch_tensors, reverse=True)
Пример #7
0
def _check_transformer_output(transformer, dataset, expected):
    """
    Given a transformer and a spark dataset, check if the transformer
    produces the expected results.

    :param transformer: object whose `transform(df)` is applied to the
        analyzed dataset
    :param dataset: spark dataframe fed to the transformer
    :param expected: numpy array compared against the flattened output
    :raises AssertionError: if the flattened output is not close to
        `expected` within `_all_close_tolerance`, or the two cannot be
        compared because their shapes are incompatible
    """
    analyzed_df = tfs.analyze(dataset)
    out_df = transformer.transform(analyzed_df)

    # Collect transformed values: one flattened array per row, in output
    # column order, then concatenated into a single 1-D array.
    out_colnames = list(_output_mapping.values())
    _results = [
        np.ravel([row[colname] for colname in out_colnames])
        for row in out_df.select(out_colnames).collect()
    ]
    out_tgt = np.hstack(_results)

    # Incompatible shapes make the arithmetic raise ValueError; turn that into
    # a regular failure so the shape-reporting message below is what is shown.
    try:
        max_diff = np.max(np.abs(expected - out_tgt))
        is_close = np.allclose(expected, out_tgt, atol=_all_close_tolerance)
    except ValueError:
        max_diff = float('nan')
        is_close = False

    _err_msg = 'not close => shape {} != {}, max_diff {} > {}'
    err_msg = _err_msg.format(expected.shape, out_tgt.shape, max_diff,
                              _all_close_tolerance)
    assert is_close, err_msg
def _check_transformer_output(transformer, dataset, expected):
    """
    Given a transformer and a spark dataset, check if the transformer
    produces the expected results.

    :param transformer: object whose `transform(df)` is applied to the
        analyzed dataset
    :param dataset: spark dataframe fed to the transformer
    :param expected: numpy array compared against the flattened output
    :raises AssertionError: if the flattened output is not close to
        `expected` within `_all_close_tolerance`, or the two cannot be
        compared because their shapes are incompatible
    """
    analyzed_df = tfs.analyze(dataset)
    out_df = transformer.transform(analyzed_df)

    # Collect transformed values: one flattened array per row, in output
    # column order, then concatenated into a single 1-D array.
    out_colnames = list(_output_mapping.values())
    _results = [
        np.ravel([row[colname] for colname in out_colnames])
        for row in out_df.select(out_colnames).collect()
    ]
    out_tgt = np.hstack(_results)

    # Incompatible shapes make the arithmetic raise ValueError; turn that into
    # a regular failure so the shape-reporting message below is what is shown.
    try:
        max_diff = np.max(np.abs(expected - out_tgt))
        is_close = np.allclose(expected, out_tgt, atol=_all_close_tolerance)
    except ValueError:
        max_diff = float('nan')
        is_close = False

    _err_msg = 'not close => shape {} != {}, max_diff {} > {}'
    err_msg = _err_msg.format(expected.shape, out_tgt.shape,
                              max_diff, _all_close_tolerance)
    assert is_close, err_msg
Пример #9
0
import tensorflow as tf
import tensorframes as tfs
from pyspark.mllib.random import RandomRDDs
import numpy as np

num_features = 4  # dimensionality of each generated point
k = 2  # number of cluster centers
# TODO: does not work with 1
# NOTE(review): `sc` and `sqlContext` are not defined in this snippet;
# presumably they come from an interactive pyspark session -- confirm.
# Each RDD element is wrapped in a one-element list so that the dataframe
# gets a single array-valued column.
data = RandomRDDs.normalVectorRDD(sc,
                                  numCols=num_features,
                                  numRows=100,
                                  seed=1).map(lambda v: [v.tolist()])
df = sqlContext.createDataFrame(data).toDF("features")

# For now, analysis is still required.
df0 = tfs.analyze(df)

# Random initial centers, one row per cluster.
init_centers = np.random.randn(k, num_features)

# For debugging: the first 10 points as a plain array; indexing with 0 on the
# middle axis drops the one-element wrapper added by the map above.
block = np.array(data.take(10))[::, 0, ::]

# Find the distances first
with tf.Graph().as_default() as g:
    # One row per point; the number of rows in a block is not fixed.
    points = tf.placeholder(tf.double,
                            shape=[None, num_features],
                            name='points')
    num_points = tf.shape(points)[0]
    #centers = tf.placeholder(tf.double, shape=[k, num_features], name='centers')
    # The centers are baked into the graph as a constant.
    centers = tf.constant(init_centers)
    # Sum of squared coordinates of each point (reduction over axis 1).
    squares = tf.reduce_sum(tf.square(points), reduction_indices=1)
Пример #10
0
import tensorframes as tfs
from pyspark.mllib.random import RandomRDDs
import numpy as np

num_features = 4  # dimensionality of each generated point
k = 2  # number of cluster centers
# TODO: does not work with 1
# NOTE(review): `sc`, `sqlContext` and `tf` are not defined/imported in this
# snippet; presumably they come from the surrounding session -- confirm.
# Each RDD element is wrapped in a one-element list so that the dataframe
# gets a single array-valued column.
data = RandomRDDs.normalVectorRDD(
    sc,
    numCols=num_features,
    numRows=100,
    seed=1).map(lambda v: [v.tolist()])
df = sqlContext.createDataFrame(data).toDF("features")

# For now, analysis is still required.
df0 = tfs.analyze(df)

# Random initial centers, one row per cluster.
init_centers = np.random.randn(k, num_features)

# For debugging: the first 10 points as a plain array; indexing with 0 on the
# middle axis drops the one-element wrapper added by the map above.
block = np.array(data.take(10))[::,0,::]

# Find the distances first
with tf.Graph().as_default() as g:
    # One row per point; the number of rows in a block is not fixed.
    points = tf.placeholder(tf.double, shape=[None, num_features], name='points')
    num_points = tf.shape(points)[0]
    #centers = tf.placeholder(tf.double, shape=[k, num_features], name='centers')
    # The centers are baked into the graph as a constant.
    centers = tf.constant(init_centers)
    # Sum of squared coordinates of each point (reduction over axis 1).
    squares = tf.reduce_sum(tf.square(points), reduction_indices=1)
    # Sum of squared coordinates of each center.
    center_squares = tf.reduce_sum(tf.square(centers), reduction_indices=1)
    # Dot product of every point with every center: points x centers^T.
    prods = tf.matmul(points, centers, transpose_b = True)
Пример #11
0
import tensorflow as tf
import tensorframes as tfs
from pyspark.sql import Row

# Build a DataFrame of vectors: ten rows, each holding the pair [y, -y].
# NOTE(review): `sqlContext` is not defined in this snippet; presumably it
# comes from an interactive pyspark session -- confirm.
data = [Row(y=[float(y), float(-y)]) for y in range(10)]
df = sqlContext.createDataFrame(data)
# Because the dataframe contains vectors, we need to analyze it first to find the
# dimensions of the vectors.
df2 = tfs.analyze(df)

# The information gathered by TensorFrames can be printed to check the content:
tfs.print_schema(df2)
# TensorFrames has inferred that y contains vectors of size 2
# root
#  |-- y: array (nullable = false) DoubleType[?,2]

# Let's use the analyzed dataframe to compute the sum and the elementwise minimum
# of all the vectors:
# First, let's make a copy of the 'y' column. This will be very cheap in Spark 2.0+
df3 = df2.select(df2.y, df2.y.alias("z"))
with tf.Graph().as_default() as g:
    # The placeholders. Note the special names that end with '_input':
    y_input = tfs.block(df3, 'y', tf_name="y_input")
    z_input = tfs.block(df3, 'z', tf_name="z_input")
    # Reduce over axis 0 (the rows of a block); the outputs are named after
    # the columns 'y' and 'z'.
    y = tf.reduce_sum(y_input, [0], name='y')
    z = tf.reduce_min(z_input, [0], name='z')
    # Run the reduction; the two results come back in the order of the fetches.
    (data_sum, data_min) = tfs.reduce_blocks([y, z], df3)

# The final results are numpy arrays:
Пример #12
0
# `logging` must be imported before it is configured; logging is set up
# ahead of the tensorframes import, as in the original ordering.
import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
import tensorframes as tfs
import tensorflow as tf
from pyspark.sql import Row
from pyspark.sql.functions import *
from pyspark.sql.types import DoubleType, IntegerType, LongType, FloatType

from tensorframes.core import _java_api
japi = _java_api()
_java_api().initialize_logging()


# The input data: five rows with a 2-element vector `x` = [x, 2x] and a
# string `key` that is "0" or "1" depending on the parity of x.
# NOTE(review): `sqlContext` is not defined in this snippet; presumably it
# comes from an interactive pyspark session -- confirm.
data = [Row(x=[float(x), float(2 * x)], key=str(x % 2)) for x in range(1, 6)]
df = sqlContext.createDataFrame(data)
df = tfs.analyze(sqlContext.createDataFrame(data))

# The geometric mean:
# NOTE(review): the graph below produces reciprocals and a count of ones,
# which looks like the ingredients of a harmonic mean -- confirm the label.
# TODO(tjh) make a test out of this, it found some bugs
# - non numeric columns (string)
# - unused columns
# - output that has a child
col_name = "x"
col_key = "key"
with tf.Graph().as_default() as g:
    x = tfs.block(df, col_name)
    # Elementwise reciprocal of x, as a column named "invs".
    invs = tf.inv(tf.to_double(x), name="invs")
    # Append "invs" and a same-shaped column of ones named "count".
    df2 = tfs.map_blocks([invs, tf.ones_like(invs, name="count")], df)


# The geometric mean
Пример #13
0
import logging
# Logging is configured before tensorframes is imported.
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
import tensorframes as tfs
import tensorflow as tf
from pyspark.sql import Row
from pyspark.sql.functions import *
from pyspark.sql.types import DoubleType, IntegerType, LongType, FloatType

from tensorframes.core import _java_api
japi = _java_api()
_java_api().initialize_logging()

# The input data: five rows with a 2-element vector `x` = [x, 2x] and a
# string `key` that is "0" or "1" depending on the parity of x.
# NOTE(review): `sqlContext` is not defined in this snippet; presumably it
# comes from an interactive pyspark session -- confirm.
data = [Row(x=[float(x), float(2 * x)], key=str(x % 2)) for x in range(1, 6)]
# NOTE(review): this first dataframe is immediately replaced by the next
# line, which builds a second one from the same data.
df = sqlContext.createDataFrame(data)
df = tfs.analyze(sqlContext.createDataFrame(data))

# The geometric mean:
# NOTE(review): the graph below produces reciprocals and a count of ones,
# which looks like the ingredients of a harmonic mean -- confirm the label.
# TODO(tjh) make a test out of this, it found some bugs
# - non numeric columns (string)
# - unused columns
# - output that has a child
col_name = "x"
col_key = "key"
with tf.Graph().as_default() as g:
    x = tfs.block(df, col_name)
    # Elementwise reciprocal of x, as a column named "invs".
    invs = tf.inv(tf.to_double(x), name="invs")
    # Append "invs" and a same-shaped column of ones named "count".
    df2 = tfs.map_blocks([invs, tf.ones_like(invs, name="count")], df)

# The geometric mean
# Keep only the key and the two computed columns, grouped by key.
gb = df2.select(col_key, "invs", "count").groupBy("key")
Пример #14
0
def m_kmeans(df,
             feature_column,
             num_centroids_each,
             num_features,
             m_groups,
             max_iter=10):
    """
    M K-means algorithm applied on a dataframe of points

    The first `num_centroids_each` rows of `feature_column` are used as the
    initial centers. Iteration stops early once the centers no longer change
    (per `np.allclose`).

    :param df: dataframe, contains all points
    :param feature_column: string, the points column within df
    :param num_centroids_each: int, k clusters
    :param num_features: int, dimension of a point vector
    :param m_groups: int, number of groups a point is split into
    :param max_iter: int, maximum number of iterations; if it is not
        positive, the initial centers are returned unchanged

    :return: numpy.array: [num_centroids, num_features], the k cluster centers with m groups concatenated
    """
    initial_centers = df.select(feature_column).take(num_centroids_each)
    centers = np.array(initial_centers).reshape(num_centroids_each,
                                                num_features)
    # Feature index range covered by each group. Built as a list because it
    # is reused on every iteration; a lazy `map` object would be exhausted
    # after the first pass on Python 3.
    m_slice = [
        slice(min(r), max(r) + 1)
        for r in np.array_split(np.arange(num_features), m_groups)
    ]
    # Row indices (into the stacked per-group centers) belonging to each group.
    slices = np.array_split(np.arange(m_groups * num_centroids_each), m_groups)
    df = tfs.analyze(df)

    # Defined up front so the return value exists even when no iteration runs.
    new_centers = centers

    for _ in range(max_iter):
        # Map step: per block, counts and vector sums for every center.
        with tf.Graph().as_default():
            points = tf.placeholder(tf.double,
                                    shape=[None, num_features],
                                    name=feature_column)
            counts, vector_sums = calculate_new_centers_for_m_slice(
                m_slice, points, tf.nn.l2_normalize(centers, dim=1),
                num_centroids_each)
            counts = tf.identity(counts, name='counts')
            vector_sums = tf.identity(vector_sums, name='vector_sums')
            df2 = tfs.map_blocks([counts, vector_sums], df, trim=True)

        # Reduce step: add up the per-block counts and sums.
        with tf.Graph().as_default():
            counts = tf.placeholder(
                tf.int64,
                shape=[None, num_centroids_each * m_groups],
                name='counts_input')
            # Integer division: a placeholder dimension must be an int, and
            # `/` yields a float on Python 3.
            vector_sums = tf.placeholder(tf.double,
                                         shape=[
                                             None,
                                             num_centroids_each * m_groups,
                                             num_features // m_groups
                                         ],
                                         name='vector_sums_input')
            count = tf.reduce_sum(counts, axis=0, name='counts')
            vector_sum = tf.reduce_sum(vector_sums, axis=0, name='vector_sums')
            d_count, d_vector_sum = tfs.reduce_blocks([count, vector_sum], df2)
            # Mean per center; the small epsilon avoids division by zero for
            # centers that received no points.
            new_centers = d_vector_sum / (d_count[:, np.newaxis] + 1e-7)
            # Put the per-group centers side by side along the feature axis.
            new_centers = np.concatenate([new_centers[i] for i in slices],
                                         axis=1)
        if np.allclose(centers, new_centers):
            break
        centers = new_centers

    return new_centers
Пример #15
0
# The number of clusters
k = 10
num_points = 100000
num_iters = 10
FEATURES_COL = "features"

# NOTE(review): `num_features`, `sqlContext`, `time`, `KMeans` and the other
# helpers used below are not defined in this snippet; presumably they are set
# up earlier in the original script -- confirm.
# Fixed seed so the generated points are reproducible.
np.random.seed(2)
# Uniform random points in [0, 1), as plain Python lists.
np_data = [x.tolist() for x in np.random.uniform(0.0, 1.0, size=(num_points, num_features))]
# Same points as a single-partition, cached dataframe with a vector column,
# which is what the KMeans estimator below is fitted on.
schema = StructType([StructField(FEATURES_COL, VectorUDT(), False)])
mllib_rows = [Row(_convert_to_vector(x)) for x in np_data]
mllib_df = sqlContext.createDataFrame(mllib_rows, schema).coalesce(1).cache()

# Same points again as a plain array column, for the tensorframes side.
df = sqlContext.createDataFrame([[r] for r in np_data]).toDF(FEATURES_COL).coalesce(1)
# For now, analysis is still required. We cache the output because we are going to perform
# multiple runs on the dataset.
df0 = tfs.analyze(df).cache()


# Run an action on both cached dataframes (presumably to populate the caches
# before timing starts -- confirm).
mllib_df.count()
df0.count()

# Re-seed so the initial centers are reproducible as well.
np.random.seed(2)
init_centers = np.random.randn(k, num_features)
start_centers = init_centers
dataframe = df0


# Time the KMeans fit on the vector dataframe.
ta_0 = time.time()
kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol(FEATURES_COL).setInitMode(
        "random").setMaxIter(num_iters)
mod = kmeans.fit(mllib_df)
Пример #16
0
import tensorflow as tf
import tensorframes as tfs
from pyspark.sql import Row

# Build a DataFrame of vectors: ten rows, each holding the pair [y, -y].
# NOTE(review): `sqlContext` is not defined in this snippet; presumably it
# comes from an interactive pyspark session -- confirm.
data = [Row(y=[float(y), float(-y)]) for y in range(10)]
df = sqlContext.createDataFrame(data)
# Because the dataframe contains vectors, we need to analyze it first to find the
# dimensions of the vectors.
df2 = tfs.analyze(df)

# The information gathered by TensorFrames can be printed to check the content:
tfs.print_schema(df2)
# TensorFrames has inferred that y contains vectors of size 2
# root
#  |-- y: array (nullable = false) DoubleType[?,2]

# Let's use the analyzed dataframe to compute the sum and the elementwise minimum
# of all the vectors:
# First, let's make a copy of the 'y' column. This will be very cheap in Spark 2.0+
df3 = df2.select(df2.y, df2.y.alias("z"))
with tf.Graph().as_default() as g:
    # The placeholders. Note the special names that end with '_input':
    y_input = tfs.block(df3, 'y', tf_name="y_input")
    z_input = tfs.block(df3, 'z', tf_name="z_input")
    # Reduce over axis 0 (the rows of a block); the outputs are named after
    # the columns 'y' and 'z'.
    y = tf.reduce_sum(y_input, [0], name='y')
    z = tf.reduce_min(z_input, [0], name='z')
    # Run the reduction; the two results come back in the order of the fetches.
    (data_sum, data_min) = tfs.reduce_blocks([y, z], df3)

# The final results are numpy arrays:
Пример #17
0
# The number of clusters
k = 10
num_points = 100000
num_iters = 10
FEATURES_COL = "features"

# NOTE(review): `num_features`, `sqlContext`, `time`, `KMeans` and the other
# helpers used below are not defined in this snippet; presumably they are set
# up earlier in the original script -- confirm.
# Fixed seed so the generated points are reproducible.
np.random.seed(2)
# Uniform random points in [0, 1), as plain Python lists.
np_data = [x.tolist() for x in np.random.uniform(0.0, 1.0, size=(num_points, num_features))]
# Same points as a single-partition, cached dataframe with a vector column,
# which is what the KMeans estimator below is fitted on.
schema = StructType([StructField(FEATURES_COL, VectorUDT(), False)])
mllib_rows = [Row(_convert_to_vector(x)) for x in np_data]
mllib_df = sqlContext.createDataFrame(mllib_rows, schema).coalesce(1).cache()

# Same points again as a plain array column, for the tensorframes side.
df = sqlContext.createDataFrame([[r] for r in np_data]).toDF(FEATURES_COL).coalesce(1)
# For now, analysis is still required. We cache the output because we are going to perform
# multiple runs on the dataset.
df0 = tfs.analyze(df).cache()


# Run an action on both cached dataframes (presumably to populate the caches
# before timing starts -- confirm).
mllib_df.count()
df0.count()

# Re-seed so the initial centers are reproducible as well.
np.random.seed(2)
init_centers = np.random.randn(k, num_features)
start_centers = init_centers
dataframe = df0

# Time the KMeans fit on the vector dataframe: ta_0 is taken just before the
# fit and ta_1 right after it.
ta_0 = time.time()
kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol(FEATURES_COL).setInitMode(
        "random").setMaxIter(num_iters)
mod = kmeans.fit(mllib_df)
ta_1 = time.time()