Example #1
def simple_example_2():
    spark = SparkSession.builder.appName(
        'simple-tensorframes-example-2').getOrCreate()
    spark.sparkContext.setLogLevel('WARN')

    data = [Row(y=[float(y), float(-y)]) for y in range(10)]
    df = spark.createDataFrame(data)

    df.show()
    tfs.print_schema(df)

    # Analyze first to find the dimensions of the vectors.
    df2 = tfs.analyze(df)

    tfs.print_schema(df2)

    # Make a copy of the 'y' column: An inexpensive operation in Spark 2.0+.
    df3 = df2.select(df2.y, df2.y.alias('z'))

    # Build and run the TensorFlow graph.
    with tf.Graph().as_default() as graph:
        y_input = tfs.block(df3, 'y', tf_name='y_input')
        z_input = tfs.block(df3, 'z', tf_name='z_input')

        # Perform elementwise sum and minimum.
        y = tf.reduce_sum(y_input, [0], name='y')
        z = tf.reduce_min(z_input, [0], name='z')

        (data_sum, data_min) = tfs.reduce_blocks([y, z], df3)

    print('Elementwise sum: %s and minimum: %s' % (data_sum, data_min))
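The snippets on this page are excerpts and omit their imports. As a point of reference, they assume roughly the following setup (TensorFrames on TensorFlow 1.x and Spark 2.x; the exact import style in the original sources may differ):

from pyspark.sql import Row, SparkSession
import numpy as np
import tensorflow as tf
import tensorframes as tfs

# With these imports in place, the first example can be run directly:
# simple_example_2()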
Example #2
def test_reduce_blocks_1(self):
    data = [Row(x=float(x)) for x in range(5)]
    df = self.sql.createDataFrame(data)
    with tf.Graph().as_default() as g:
        # The placeholder that corresponds to column 'x'
        x_input = tf.placeholder(tf.double, shape=[None], name="x_input")
        # The output that sums the elements of x
        x = tf.reduce_sum(x_input, name='x')
        # The result is a scalar (the sum over all rows), not a dataframe
        res = tfs.reduce_blocks(x, df)
    assert res == sum([r.x for r in data])
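For reference, the value this test checks is just the sum of the 'x' column, which could equally be computed with a plain Spark aggregation. A minimal sketch, assuming the same df as above:

from pyspark.sql import functions as F

# Driver-side check using Spark SQL functions (10.0 for range(5))
expected = df.agg(F.sum('x')).collect()[0][0]
assert expected == sum(float(x) for x in range(5))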
Example #3
data = [Row(y=[float(y), float(-y)]) for y in range(10)]
df = sqlContext.createDataFrame(data)
# Because the dataframe contains vectors, we need to analyze it first to find the
# dimensions of the vectors.
df2 = tfs.analyze(df)

# The information gathered by TensorFrames can be printed to check the content:
tfs.print_schema(df2)
# TensorFrames has inferred that y contains vectors of size 2:
# root
#  |-- y: array (nullable = false) DoubleType[?,2]

# Let's use the analyzed dataframe to compute the sum and the elementwise minimum 
# of all the vectors:
# First, let's make a copy of the 'y' column. This will be very cheap in Spark 2.0+
df3 = df2.select(df2.y, df2.y.alias("z"))
with tf.Graph().as_default() as g:
    # The placeholders. Note the special names that end with '_input':
    y_input = tfs.block(df3, 'y', tf_name="y_input")
    z_input = tfs.block(df3, 'z', tf_name="z_input")
    y = tf.reduce_sum(y_input, [0], name='y')
    z = tf.reduce_min(z_input, [0], name='z')
    # Reduce over all the blocks; the results are brought back to the driver
    (data_sum, data_min) = tfs.reduce_blocks([y, z], df3)

# The final results are numpy arrays:
print(data_sum)
# [45.0, -45.0]
print(data_min)
# [0.0, -9.0]
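The tfs.block calls above are a shorthand for declaring placeholders that match the analyzed schema. The same graph can also be written with explicit placeholders, relying on the '_input' naming convention used in the test example above. A minimal sketch, assuming the analyzed df3 from this example (vectors of size 2):

with tf.Graph().as_default() as g:
    # Explicit placeholders; the names 'y_input' and 'z_input' bind them to columns 'y' and 'z'.
    y_input = tf.placeholder(tf.double, shape=[None, 2], name='y_input')
    z_input = tf.placeholder(tf.double, shape=[None, 2], name='z_input')
    y = tf.reduce_sum(y_input, [0], name='y')
    z = tf.reduce_min(z_input, [0], name='z')
    (data_sum, data_min) = tfs.reduce_blocks([y, z], df3)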
Example #4
def run_one_step2(dataframe, start_centers):
    """
    Performs one iteration of K-Means.

    This function takes a dataframe of dense feature vectors and a set of centroids, and
    returns a new set of centroids along with the total distance of the points to their
    closest centroids.

    This function calculates for each point the closest centroid and then aggregates the newly
    formed clusters to find the new centroids.

    This function performs most of the aggregation in TensorFlow.

    :param dataframe: a dataframe containing a column of features (an array of doubles)
    :param start_centers: a k x m matrix with k the number of centroids and m the number of features
    :return: a k x m matrix, and a positive double
    """
    # The dimensions in the problem
    (num_centroids, num_features) = np.shape(start_centers)
    # For each feature vector, compute the nearest centroid and the distance to that centroid.
    # The index of the nearest centroid is stored in the 'indexes' column.
    # We also add a column of 1's that will be reduced later to count the number of elements in
    # each cluster.
    with tf.Graph().as_default() as g:
        # The placeholder for the input: we use the block format
        points = tf.placeholder(tf.double, shape=[None, num_features], name='features')
        # The distances
        distances = tf_compute_distances(points, start_centers)
        # The rest of this block performs a pre-aggregation step in TF, to limit the
        # communication between TF and Spark.
        # The closest centroids are extracted.
        indexes = tf.argmin(distances, 1, name='indexes')
        min_distances = tf.reduce_min(distances, 1, name='min_distances')
        num_points = tf.stack([tf.shape(points)[0]], name="num_points")
        counts = tf.tile(tf.constant([1]), num_points, name='count')
        # These compute the aggregate based on the indexes.
        block_points = tf.unsorted_segment_sum(points, indexes, num_centroids, name="block_points")
        block_counts = tf.unsorted_segment_sum(counts, indexes, num_centroids, name="block_counts")
        block_distances = tf.reduce_sum(min_distances, name="block_distances")
        # One leading dimension is added to express the fact that the previous elements are just
        # one row in the final dataframe.
        # The final dataframe has one row per block.
        agg_points = tf.expand_dims(block_points, 0, name="agg_points")
        agg_counts = tf.expand_dims(block_counts, 0, name="agg_counts")
        agg_distances = tf.expand_dims(block_distances, 0, name="agg_distances")
        # Using trimming to drop the original data (we are just returning one row of data per
        # block).
        df2 = tfs.map_blocks([agg_points, agg_counts, agg_distances],
                             dataframe, trim=True)
    # Now we simply collect and sum the elements
    with tf.Graph().as_default() as g:
        # See the documentation of tfs.reduce_blocks for the naming conventions of the placeholders.
        x_input = tf.placeholder(tf.double,
                                 shape=[None, num_centroids, num_features],
                                 name='agg_points_input')
        count_input = tf.placeholder(tf.int32,
                                     shape=[None, num_centroids],
                                     name='agg_counts_input')
        md_input = tf.placeholder(tf.double,
                                  shape=[None],
                                  name='agg_distances_input')
        # Each operation is just the sum.
        x = tf.reduce_sum(x_input, [0], name='agg_points')
        count = tf.reduce_sum(count_input, [0], name='agg_counts')
        min_distances = tf.reduce_sum(md_input, [0], name='agg_distances')
        (x_, count_, total_distances) = tfs.reduce_blocks([x, count, min_distances], df2)
    # The new centers
    new_centers = (x_.T / (count_ + 1e-7)).T
    return (new_centers, total_distances)
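The helper tf_compute_distances is not shown in this excerpt. A possible sketch of such a helper (an assumption, not the original implementation) computes the squared Euclidean distance from each point to each centroid by broadcasting:

def tf_compute_distances(points, start_centers):
    # points: [None, num_features] tensor (tf.double);
    # start_centers: [num_centroids, num_features] numpy array.
    centers = tf.constant(np.asarray(start_centers, dtype=np.float64))
    # Broadcast to [None, num_centroids, num_features], then reduce over the feature axis.
    diff = tf.expand_dims(points, 1) - tf.expand_dims(centers, 0)
    return tf.reduce_sum(tf.square(diff), 2, name='distances')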
Example #5
def m_kmeans(df,
             feature_column,
             num_centroids_each,
             num_features,
             m_groups,
             max_iter=10):
    """
    M K-means algorithm applied on a dataframe of points

    :param df: dataframe, contains all points
    :param feature_column: string, the points column within df
    :param num_centroids_each: int, k clusters
    :param num_features: int, dimension of a point vector
    :param m_groups: int, number of groups a point vector is split into
    :param max_iter: int, maximum number of iterations

    :return: numpy.array: [num_centroids, num_features], the k cluster centers with m groups concatenated
    """
    # Use the first k rows of the feature column as the initial centroids.
    initial_centers = df.select(feature_column).take(num_centroids_each)
    centers = np.array(initial_centers).reshape(num_centroids_each,
                                                num_features)
    # Feature-index slices for each of the m groups, and the matching centroid-index groups.
    m_slice = [slice(min(r), max(r) + 1)
               for r in np.array_split(range(num_features), m_groups)]
    slices = np.array_split(range(m_groups * num_centroids_each), m_groups)
    df = tfs.analyze(df)

    while max_iter > 0:
        max_iter -= 1

        with tf.Graph().as_default():
            points = tf.placeholder(tf.double,
                                    shape=[None, num_features],
                                    name=feature_column)
            counts, vector_sums = calculate_new_centers_for_m_slice(
                m_slice, points, tf.nn.l2_normalize(centers, dim=1),
                num_centroids_each)
            counts = tf.identity(counts, name='counts')
            vector_sums = tf.identity(vector_sums, name='vector_sums')
            df2 = tfs.map_blocks([counts, vector_sums], df, trim=True)

        with tf.Graph().as_default():
            counts = tf.placeholder(
                tf.int64,
                shape=[None, num_centroids_each * m_groups],
                name='counts_input')
            vector_sums = tf.placeholder(tf.double,
                                         shape=[
                                             None,
                                             num_centroids_each * m_groups,
                                             num_features // m_groups
                                         ],
                                         name='vector_sums_input')
            count = tf.reduce_sum(counts, axis=0, name='counts')
            vector_sum = tf.reduce_sum(vector_sums, axis=0, name='vector_sums')
            d_count, d_vector_sum = tfs.reduce_blocks([count, vector_sum], df2)
            new_centers = d_vector_sum / (d_count[:, np.newaxis] + 1e-7)
            new_centers = np.concatenate([new_centers[i] for i in slices],
                                         axis=1)
        if np.allclose(centers, new_centers):
            break
        else:
            centers = new_centers

    return new_centers
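A hypothetical usage sketch: the dataframe, the 'features' column name, and the parameter values below are illustrative assumptions, and calculate_new_centers_for_m_slice as well as a SparkSession named spark must already be defined:

# A small dataframe of 2-dimensional points under an assumed 'features' column.
points_df = spark.createDataFrame(
    [Row(features=[float(i % 7), float(i % 5)]) for i in range(100)])

centers = m_kmeans(points_df, 'features',
                   num_centroids_each=4, num_features=2,
                   m_groups=1, max_iter=10)
# centers: numpy array of shape [4, 2]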