def update_article_ctr_feature_to_hbase(self):
    """Build per-article CTR features and write them to HBase.

    Pipeline: read the article profile (keyword weights), keep the 10
    largest weights per article, join with the precomputed article
    vector, assemble everything into a single ``features`` vector with
    ``VectorAssembler`` and store each row in the
    ``ctr_feature_article`` HBase table.

    :return: None (side effect: writes to HBase)
    """
    # Local import keeps the method self-contained; VectorAssembler is
    # not imported elsewhere in this block.
    from pyspark.ml.feature import VectorAssembler

    self.spark.sql("use article")
    article_profile = self.spark.sql("select * from article_profile")

    def article_profile_to_feature(row):
        # Keep the 10 *largest* keyword weights (descending sort); the
        # previous ascending sort picked the smallest ones. Zero-pad so
        # every article yields a fixed-length list — Vectors.dense and
        # VectorAssembler downstream require a constant width.
        try:
            weights = sorted(row.keywords.values(), reverse=True)[:10]
        except Exception:
            # keywords missing/None — fall back to an all-zero profile
            weights = []
        weights += [0.0] * (10 - len(weights))
        return row.article_id, row.channel_id, weights

    article_profile = article_profile.rdd.map(
        article_profile_to_feature).toDF(
        ['article_id', 'channel_id', 'weights'])

    article_vector = self.spark.sql("select * from article_vector")
    article_feature = article_profile.join(
        article_vector, on=['article_id'], how='inner')

    def feature_to_vector(row):
        # Imported inside the closure so the dependency is available on
        # Spark executors when the function is serialized.
        from pyspark.ml.linalg import Vectors
        return row.article_id, row.channel_id, Vectors.dense(
            row.weights), Vectors.dense(row.articlevector)

    article_feature = article_feature.rdd.map(feature_to_vector).toDF(
        ['article_id', 'channel_id', 'weights', 'articlevector'])

    # Merge channel_id, weights and articlevector into one 'features'
    # column for the CTR model.
    cols2 = ['article_id', 'channel_id', 'weights', 'articlevector']
    article_feature_two = VectorAssembler().setInputCols(
        cols2[1:4]).setOutputCol("features").transform(article_feature)

    def save_article_feature_to_hbase(partition):
        # Imported inside the closure so it runs on executors; the pool
        # is scoped to the partition and released by the `with` block.
        import happybase
        pool = happybase.ConnectionPool(size=10, host='hadoop1')
        with pool.connection() as conn:
            table = conn.table('ctr_feature_article')
            for row in partition:
                table.put(
                    '{}'.format(row.article_id).encode(),
                    {
                        'article:{}'.format(row.article_id).encode():
                            str(row.features).encode()
                    })

    article_feature_two.foreachPartition(save_article_feature_to_hbase)
# Script-level variant of the article CTR feature pipeline: map the
# profile to (article_id, channel_id, weights), join with the article
# vector, assemble a single 'features' column and persist it to HBase.
# NOTE(review): relies on `article_profile`, `article_profile_to_feature`,
# `sqlContext` and `sc` being defined earlier in the session.
article_profile = article_profile.rdd.map(article_profile_to_feature).toDF(
    ['article_id', 'channel_id', 'weights'])

article_vector = sqlContext.sql("select * from article_vector")
article_feature = article_profile.join(article_vector, on=['article_id'],
                                       how='inner')


def feature_to_vector(row):
    # Imported inside the closure so the dependency is available on
    # Spark executors when the function is serialized.
    from pyspark.ml.linalg import Vectors
    return row.article_id, row.channel_id, Vectors.dense(
        row.weights), Vectors.dense(row.articlevector)


article_feature = article_feature.rdd.map(feature_to_vector).toDF(
    ['article_id', 'channel_id', 'weights', 'articlevector'])

from pyspark.ml.feature import VectorAssembler

# Merge channel_id, weights and articlevector into one 'features' column.
columns = ['article_id', 'channel_id', 'weights', 'articlevector']
article_feature_two = VectorAssembler().setInputCols(
    columns[1:4]).setOutputCol("features").transform(article_feature)


def save_article_feature_to_hbase(partition):
    # Imported inside the closure so it runs on Spark executors.
    import happybase
    conn = happybase.Connection('localhost')
    try:
        table = conn.table('ctr_feature_article')
        for row in partition:
            table.put(
                '{}'.format(row.article_id).encode(),
                {'article:{}'.format(row.article_id).encode():
                     str(row.features).encode()})
    finally:
        # The original leaked one Thrift socket per partition; always
        # close the connection, even if a put fails.
        conn.close()


article_feature_two.foreachPartition(save_article_feature_to_hbase)
sc.stop()