Example #1
    def update_article_ctr_feature_to_hbase(self):
        """
        Article feature center: build per-article CTR features and write them to HBase
        :return:
        """
        self.spark.sql("use article")
        article_profile = self.spark.sql("select * from article_profile")

        def article_profile_to_feature(row):
            # take 10 of the keyword weights; fall back to ten zeros when the
            # article has no keyword profile
            try:
                weights = sorted(row.keywords.values())[:10]
            except Exception:
                weights = [0.0] * 10
            return row.article_id, row.channel_id, weights

        article_profile = article_profile.rdd.map(
            article_profile_to_feature).toDF(
                ['article_id', 'channel_id', 'weights'])

        article_vector = self.spark.sql("select * from article_vector")
        article_feature = article_profile.join(article_vector,
                                               on=['article_id'],
                                               how='inner')

        def feature_to_vector(row):
            # convert the plain Python lists into dense ML vectors so that
            # VectorAssembler can consume them
            from pyspark.ml.linalg import Vectors
            return row.article_id, row.channel_id, Vectors.dense(
                row.weights), Vectors.dense(row.articlevector)

        article_feature = article_feature.rdd.map(feature_to_vector).toDF(
            ['article_id', 'channel_id', 'weights', 'articlevector'])

        # Assemble channel_id, weights and articlevector into a single
        # "features" vector column
        from pyspark.ml.feature import VectorAssembler

        cols2 = ['article_id', 'channel_id', 'weights', 'articlevector']
        article_feature_two = VectorAssembler().setInputCols(
            cols2[1:4]).setOutputCol("features").transform(article_feature)

        # Save the assembled features to HBase
        def save_article_feature_to_hbase(partition):
            import happybase
            pool = happybase.ConnectionPool(size=10, host='hadoop1')
            with pool.connection() as conn:
                table = conn.table('ctr_feature_article')
                for row in partition:
                    # row key: article_id
                    # column:  article:{article_id}
                    # value:   string form of the assembled feature vector
                    table.put(
                        '{}'.format(row.article_id).encode(), {
                            'article:{}'.format(row.article_id).encode():
                            str(row.features).encode()
                        })

        article_feature_two.foreachPartition(save_article_feature_to_hbase)
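
Both examples write into the HBase table ctr_feature_article with a single column family article, which must exist before the job runs. A minimal one-off setup sketch with happybase (the host 'hadoop1' and the column-family layout are assumptions copied from the connection settings in the code above):

import happybase

# create the target table once, if it is missing
# (host and column family are assumptions based on the code above)
conn = happybase.Connection('hadoop1')
if b'ctr_feature_article' not in conn.tables():
    conn.create_table('ctr_feature_article', {'article': dict()})
conn.close()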
Example #2
# assumes article_profile and article_profile_to_feature were created earlier (as in Example #1)
article_profile = article_profile.rdd.map(article_profile_to_feature).toDF(['article_id', 'channel_id', 'weights'])
# article_profile.show()

article_vector = sqlContext.sql("select * from article_vector")
article_feature = article_profile.join(article_vector, on=['article_id'], how='inner')

def feature_to_vector(row):
    from pyspark.ml.linalg import Vectors
    return row.article_id, row.channel_id, Vectors.dense(row.weights), Vectors.dense(row.articlevector)

article_feature = article_feature.rdd.map(feature_to_vector).toDF(['article_id', 'channel_id', 'weights', 'articlevector'])

from pyspark.ml.feature import VectorAssembler
columns = ['article_id', 'channel_id', 'weights', 'articlevector']
article_feature_two = VectorAssembler().setInputCols(columns[1:4]).setOutputCol("features").transform(article_feature)
# article_feature_two.show()

def save_article_feature_to_hbase(partition):
    import happybase
    conn = happybase.Connection('localhost')
    table = conn.table('ctr_feature_article')
    for row in partition:
        # row key: article_id; column: article:{article_id};
        # value: string form of the assembled feature vector
        table.put('{}'.format(row.article_id).encode(),
                  {'article:{}'.format(row.article_id).encode(): str(row.features).encode()})
    conn.close()

article_feature_two.foreachPartition(save_article_feature_to_hbase)


sc.stop()
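
Note that both examples store str(row.features), i.e. the text form of the assembled vector (for a dense vector something like "[2.0,0.1,...]"), so any consumer has to parse it back. A minimal read-back sketch, assuming the same table layout, a locally reachable HBase Thrift server, and a hypothetical article_id of 13090:

import happybase
import numpy as np

conn = happybase.Connection('localhost')    # host is an assumption
table = conn.table('ctr_feature_article')
article_id = 13090                          # hypothetical example id
row = table.row(str(article_id).encode())
raw = row['article:{}'.format(article_id).encode()]   # e.g. b'[2.0,0.1,...]'
# parse the dense-vector string back into a numpy array
features = np.array(raw.decode().strip('[]').split(','), dtype=float)
conn.close()
print(features)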