def update_article_ctr_feature_to_hbase(self):
    """Build per-article CTR features and write them to HBase.

    Pipeline: read the article profile (keyword weights), keep the 10
    largest weights per article, join with the precomputed article
    vector, assemble everything into a single ``features`` vector with
    ``VectorAssembler`` and store each row in the
    ``ctr_feature_article`` HBase table.

    :return: None (side effect: writes to HBase)
    """
    # Local import keeps the method self-contained; VectorAssembler is
    # not imported elsewhere in this block.
    from pyspark.ml.feature import VectorAssembler

    self.spark.sql("use article")
    article_profile = self.spark.sql("select * from article_profile")

    def article_profile_to_feature(row):
        # Keep the 10 *largest* keyword weights (descending sort); the
        # previous ascending sort picked the smallest ones. Zero-pad so
        # every article yields a fixed-length list — Vectors.dense and
        # VectorAssembler downstream require a constant width.
        try:
            weights = sorted(row.keywords.values(), reverse=True)[:10]
        except Exception:
            # keywords missing/None — fall back to an all-zero profile
            weights = []
        weights += [0.0] * (10 - len(weights))
        return row.article_id, row.channel_id, weights

    article_profile = article_profile.rdd.map(
        article_profile_to_feature).toDF(
        ['article_id', 'channel_id', 'weights'])

    article_vector = self.spark.sql("select * from article_vector")
    article_feature = article_profile.join(
        article_vector, on=['article_id'], how='inner')

    def feature_to_vector(row):
        # Imported inside the closure so the dependency is available on
        # Spark executors when the function is serialized.
        from pyspark.ml.linalg import Vectors
        return row.article_id, row.channel_id, Vectors.dense(
            row.weights), Vectors.dense(row.articlevector)

    article_feature = article_feature.rdd.map(feature_to_vector).toDF(
        ['article_id', 'channel_id', 'weights', 'articlevector'])

    # Merge channel_id, weights and articlevector into one 'features'
    # column for the CTR model.
    cols2 = ['article_id', 'channel_id', 'weights', 'articlevector']
    article_feature_two = VectorAssembler().setInputCols(
        cols2[1:4]).setOutputCol("features").transform(article_feature)

    def save_article_feature_to_hbase(partition):
        # Imported inside the closure so it runs on executors; the pool
        # is scoped to the partition and released by the `with` block.
        import happybase
        pool = happybase.ConnectionPool(size=10, host='hadoop1')
        with pool.connection() as conn:
            table = conn.table('ctr_feature_article')
            for row in partition:
                table.put(
                    '{}'.format(row.article_id).encode(),
                    {
                        'article:{}'.format(row.article_id).encode():
                            str(row.features).encode()
                    })

    article_feature_two.foreachPartition(save_article_feature_to_hbase)
# Script-level variant of the article CTR feature pipeline: map the
# profile to (article_id, channel_id, weights), join with the article
# vector, assemble a single 'features' column and persist it to HBase.
# NOTE(review): relies on `article_profile`, `article_profile_to_feature`,
# `sqlContext` and `sc` being defined earlier in the session.
article_profile = article_profile.rdd.map(article_profile_to_feature).toDF(
    ['article_id', 'channel_id', 'weights'])

article_vector = sqlContext.sql("select * from article_vector")
article_feature = article_profile.join(article_vector, on=['article_id'],
                                       how='inner')


def feature_to_vector(row):
    # Imported inside the closure so the dependency is available on
    # Spark executors when the function is serialized.
    from pyspark.ml.linalg import Vectors
    return row.article_id, row.channel_id, Vectors.dense(
        row.weights), Vectors.dense(row.articlevector)


article_feature = article_feature.rdd.map(feature_to_vector).toDF(
    ['article_id', 'channel_id', 'weights', 'articlevector'])

from pyspark.ml.feature import VectorAssembler

# Merge channel_id, weights and articlevector into one 'features' column.
columns = ['article_id', 'channel_id', 'weights', 'articlevector']
article_feature_two = VectorAssembler().setInputCols(
    columns[1:4]).setOutputCol("features").transform(article_feature)


def save_article_feature_to_hbase(partition):
    # Imported inside the closure so it runs on Spark executors.
    import happybase
    conn = happybase.Connection('localhost')
    try:
        table = conn.table('ctr_feature_article')
        for row in partition:
            table.put(
                '{}'.format(row.article_id).encode(),
                {'article:{}'.format(row.article_id).encode():
                     str(row.features).encode()})
    finally:
        # The original leaked one Thrift socket per partition; always
        # close the connection, even if a put fails.
        conn.close()


article_feature_two.foreachPartition(save_article_feature_to_hbase)
sc.stop()