Example #1
    def update_article_ctr_feature_to_hbase(self):
        """
        Article feature center: build per-article CTR features and write them to HBase
        :return:
        """
        self.spark.sql("use article")
        article_profile = self.spark.sql("select * from article_profile")

        def article_profile_to_feature(row):
            # take 10 of the keyword weights; fall back to ten zeros when the
            # article has no keyword profile
            try:
                weights = sorted(row.keywords.values())[:10]
            except Exception:
                weights = [0.0] * 10
            return row.article_id, row.channel_id, weights

        article_profile = article_profile.rdd.map(
            article_profile_to_feature).toDF(
                ['article_id', 'channel_id', 'weights'])

        article_vector = self.spark.sql("select * from article_vector")
        article_feature = article_profile.join(article_vector,
                                               on=['article_id'],
                                               how='inner')

        def feature_to_vector(row):
            # convert the plain Python lists into dense ML vectors so that
            # VectorAssembler can consume them
            from pyspark.ml.linalg import Vectors
            return row.article_id, row.channel_id, Vectors.dense(
                row.weights), Vectors.dense(row.articlevector)

        article_feature = article_feature.rdd.map(feature_to_vector).toDF(
            ['article_id', 'channel_id', 'weights', 'articlevector'])

        # Assemble channel_id, weights and articlevector into a single
        # "features" vector column
        from pyspark.ml.feature import VectorAssembler

        cols2 = ['article_id', 'channel_id', 'weights', 'articlevector']
        article_feature_two = VectorAssembler().setInputCols(
            cols2[1:4]).setOutputCol("features").transform(article_feature)

        # Save the assembled features to HBase
        def save_article_feature_to_hbase(partition):
            import happybase
            pool = happybase.ConnectionPool(size=10, host='hadoop1')
            with pool.connection() as conn:
                table = conn.table('ctr_feature_article')
                for row in partition:
                    # row key: article_id
                    # column:  article:{article_id}
                    # value:   string form of the assembled feature vector
                    table.put(
                        '{}'.format(row.article_id).encode(), {
                            'article:{}'.format(row.article_id).encode():
                            str(row.features).encode()
                        })

        article_feature_two.foreachPartition(save_article_feature_to_hbase)
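
Both examples write into the HBase table ctr_feature_article with a single column family article, which must exist before the job runs. A minimal one-off setup sketch with happybase (the host 'hadoop1' and the column-family layout are assumptions copied from the connection settings in the code above):

import happybase

# create the target table once, if it is missing
# (host and column family are assumptions based on the code above)
conn = happybase.Connection('hadoop1')
if b'ctr_feature_article' not in conn.tables():
    conn.create_table('ctr_feature_article', {'article': dict()})
conn.close()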
Example #2
# assumes article_profile and article_profile_to_feature were created earlier (as in Example #1)
article_profile = article_profile.rdd.map(article_profile_to_feature).toDF(['article_id', 'channel_id', 'weights'])
# article_profile.show()

article_vector = sqlContext.sql("select * from article_vector")
article_feature = article_profile.join(article_vector, on=['article_id'], how='inner')

def feature_to_vector(row):
    from pyspark.ml.linalg import Vectors
    return row.article_id, row.channel_id, Vectors.dense(row.weights), Vectors.dense(row.articlevector)

article_feature = article_feature.rdd.map(feature_to_vector).toDF(['article_id', 'channel_id', 'weights', 'articlevector'])

from pyspark.ml.feature import VectorAssembler
columns = ['article_id', 'channel_id', 'weights', 'articlevector']
article_feature_two = VectorAssembler().setInputCols(columns[1:4]).setOutputCol("features").transform(article_feature)
# article_feature_two.show()

def save_article_feature_to_hbase(partition):
    import happybase
    conn = happybase.Connection('localhost')
    table = conn.table('ctr_feature_article')
    for row in partition:
        # row key: article_id; column: article:{article_id};
        # value: string form of the assembled feature vector
        table.put('{}'.format(row.article_id).encode(),
                  {'article:{}'.format(row.article_id).encode(): str(row.features).encode()})
    conn.close()

article_feature_two.foreachPartition(save_article_feature_to_hbase)


sc.stop()
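
Note that both examples store str(row.features), i.e. the text form of the assembled vector (for a dense vector something like "[2.0,0.1,...]"), so any consumer has to parse it back. A minimal read-back sketch, assuming the same table layout, a locally reachable HBase Thrift server, and a hypothetical article_id of 13090:

import happybase
import numpy as np

conn = happybase.Connection('localhost')    # host is an assumption
table = conn.table('ctr_feature_article')
article_id = 13090                          # hypothetical example id
row = table.row(str(article_id).encode())
raw = row['article:{}'.format(article_id).encode()]   # e.g. b'[2.0,0.1,...]'
# parse the dense-vector string back into a numpy array
features = np.array(raw.decode().strip('[]').split(','), dtype=float)
conn.close()
print(features)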