Example #1
 def test_persistence(self):
     # Test save/load for LDA, LocalLDAModel, DistributedLDAModel.
     df = self.spark.createDataFrame([
         [1, Vectors.dense([0.0, 1.0])],
         [2, Vectors.sparse(2, {0: 1.0})],
     ], ["id", "features"])
     # Fit model
     lda = LDA(k=2, seed=1, optimizer="em")
     distributedModel = lda.fit(df)
     self.assertTrue(distributedModel.isDistributed())
     localModel = distributedModel.toLocal()
     self.assertFalse(localModel.isDistributed())
     # Define paths
     path = tempfile.mkdtemp()
     lda_path = path + "/lda"
     dist_model_path = path + "/distLDAModel"
     local_model_path = path + "/localLDAModel"
     # Test LDA
     lda.save(lda_path)
     lda2 = LDA.load(lda_path)
     self._compare(lda, lda2)
     # Test DistributedLDAModel
     distributedModel.save(dist_model_path)
     distributedModel2 = DistributedLDAModel.load(dist_model_path)
     self._compare(distributedModel, distributedModel2)
     # Test LocalLDAModel
     localModel.save(local_model_path)
     localModel2 = LocalLDAModel.load(local_model_path)
     self._compare(localModel, localModel2)
     # Clean up
     try:
         rmtree(path)
     except OSError:
         pass
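For reference, a minimal sketch of the imports this test relies on (the `self.spark` session and the `_compare` helper belong to the surrounding test class, which is not shown here):

    import tempfile
    from shutil import rmtree

    from pyspark.ml.clustering import LDA, LocalLDAModel, DistributedLDAModel
    from pyspark.ml.linalg import Vectors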
Example #2
 def load_lda_model(self):
     print('start to load LDA model:')
     self.model = DistributedLDAModel.load(self.lda_model_path)
     self.lda = LDA.load(self.lda_path)
     print('finished loading LDA model')
     return self.model
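Note that DistributedLDAModel.load only works for models fitted with the EM optimizer; the default online optimizer yields a LocalLDAModel instead. A minimal sketch of the save side this loader assumes (train_df and the path attributes are assumptions, not shown in the example):

    # hypothetical save side matching load_lda_model above
    lda = LDA(k=50, seed=1, optimizer="em")   # "em" => fit returns a DistributedLDAModel
    distributed_model = lda.fit(train_df)     # train_df has (id, features) columns
    lda.save(self.lda_path)
    distributed_model.save(self.lda_model_path)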
Example #3
    def data_describe(self):
        print('start to read data into an RDD:')
        rawRdd_nlp = self.read_rdd('track2_title.txt').map(lambda line : eval(line))  # each line is a dict literal; ast.literal_eval would be safer than eval

        # print(rawRdd_nlp.take(10))
        # convert to a DataFrame; without an explicit schema Spark would infer one automatically
        sqlContext = SQLContext(self.sc)
        labels=[
            ('item_id',typ.IntegerType()),
            ('title_features',typ.MapType(typ.StringType(), typ.IntegerType()))]
        Schema=typ.StructType([typ.StructField(e[0],e[1],True) for e in labels])
        df = sqlContext.createDataFrame(rawRdd_nlp,Schema)
        # df.show(10)
        # df.printSchema()

        print("统计title中不同词的个数unique,以及title的长度")
        gdf=df.select("item_id",fn.explode(fn.col("title_features"))).groupBy("item_id")
        df2=gdf.agg(fn.count("key").alias("title_words_unique"))

        df3=gdf.agg(fn.sum("value").alias("title_length"))

        df=df.join(df2,"item_id","left") \
             .join(df3,"item_id","left")
        # keep title_features for now; it is still needed below to build the LDA input
        df.printSchema()


        print('start to process the title_features column and compute the title topic')

        tokens=df.rdd.map(lambda d:list(d.title_features.keys()))  # token ids for each title

        local_tokens=tokens.flatMap(lambda d :[int(token) for token in d]).distinct()
        print('largest token id in local_tokens:')
        print(local_tokens.top(1))
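        # token ids double as sparse-vector indices, so the vocabulary size
        # must be one greater than the largest token id in the corpus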
        vocab_size=max(local_tokens.top(1))+1

        # convert the title_features map into {token_id: count} with proper key/value types
        toMap=udf(lambda counts :{int(token) :float(counts[token]) for token in counts},
                  typ.MapType(typ.IntegerType(), typ.DoubleType()))
        df = df.withColumn("title_features_1", toMap(df.title_features))


        # Vectors.sparse accepts an {index: value} dict directly, turning each
        # title into a vocab_size-dimensional bag-of-words vector
        toVector=udf(lambda vs: Vectors.sparse(vocab_size,vs), VectorUDT())
        rescaledData = df.withColumn("features", toVector(df.title_features_1)).select("item_id", "features")

        df=df.drop("title_features_1")
        # del df
        # gc.collect()
        rescaledData.cache()
        # EM optimizer so that fit returns a DistributedLDAModel, matching the
        # DistributedLDAModel.load call further down
        lda = LDA(k=50,maxIter=200,optimizer="em")
        #lda = LDA(k=2,maxIter=5,optimizer="em")
        ldaModel = lda.fit(rescaledData)


        transformed = ldaModel.transform(rescaledData)   #.select("topicDistribution")
        # transformed holds each document's weight for every topic;
        # next, collapse the topic-distribution vector into a single topic label
        # transformed.show(truncate=False)

        def to_array(col):
            def to_array_(v):
                return v.toArray().tolist()
            return psf.udf(to_array_, typ.ArrayType(typ.DoubleType()))(col)
        df_topic=transformed.withColumn("topic", to_array(psf.col("topicDistribution"))) \
                            .select(["item_id"] + [psf.col("topic")[i] for i in range(50)])

        topicCol=df_topic.columns
        topicCol.remove("item_id")
        print('inspect the topic column names')
        print(topicCol)
        def getTopicID(p):  # build a {column: weight} dict and return the key with the largest weight
            d={}
            for c in topicCol: # build the dict
                d[c]=p[c]
            z = list(d.keys())[list(d.values()).index(max(d.values()))]
            return int(z.replace("topic[",'').replace("]",''))

        df_topic1=df_topic.rdd.map(lambda p: (p.item_id, getTopicID(p)))
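        # A simpler equivalent sketch (assumes numpy-backed vectors): take the
        # argmax of topicDistribution directly instead of rebuilding a dict:
        # df_topic1 = transformed.rdd.map(
        #     lambda p: (p.item_id, int(p.topicDistribution.toArray().argmax())))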

        labels=[
            ('item_id',typ.IntegerType()),
            ('title_topic',typ.IntegerType())]
        Schema=typ.StructType([typ.StructField(e[0],e[1],True) for e in labels])
        df_topic2 = sqlContext.createDataFrame(df_topic1,Schema)

        # df_topic2 = df_topic1.toDF(['item_id','topic'])
        # print('check that df_topic2 has the expected format')
        df_topic2.show(5)

        df_nlp=df.join(df_topic2,"item_id","left")
        df_nlp.printSchema()
        # item_id | title_features | title_words_unique | title_length | title_features_1 | title_topic

        print('------- 5. save the preprocessing results -------')
        file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'nlp_topic_feature2'
        os.system("hadoop fs -rm -r {}".format(file_path))  # remove any existing output first
        df_nlp.rdd.map(tuple).saveAsPickleFile(file_path)
        print('finished saving data')


        print('start to read action data, keeping only uid and item_id:')
        rawRdd_train = self.read_rdd('final_track2_train.txt').map(lambda line : line.split('\t'))
        rawRdd_test = self.read_rdd('final_track2_test_no_anwser.txt').map(lambda line : line.split('\t'))
        actionLogRdd_train = rawRdd_train.map(
            lambda x :(int(x[0]), int(x[2])))
        # total = actionLogRdd_train.count()
        # print('total: ' + str(total))
        actionLogRdd_test = rawRdd_test.map(
            lambda x :(int(x[0]), int(x[2])))

        sqlContext = SQLContext(self.sc)
        labels=[('uid',typ.IntegerType()),
            ('item_id',typ.IntegerType())
            ]

        actionLogSchema=typ.StructType([typ.StructField(e[0],e[1],True) for e in labels])

        dfactionLog_train = sqlContext.createDataFrame(actionLogRdd_train, actionLogSchema)
        dfactionLog_test = sqlContext.createDataFrame(actionLogRdd_test, actionLogSchema)

        # join the NLP features on item_id
        # columns: item_id | title_features | title_words_unique | title_length | title_features_1 | title_topic
        df_nlp=df_nlp.select(["item_id","title_words_unique","title_length"])

        df_uid_nlp_test=dfactionLog_test.select(["uid","item_id"]).join(df_nlp,'item_id','left').drop("item_id")
        df_uid_nlp_train=dfactionLog_train.select(["uid","item_id"]).join(df_nlp,'item_id','left').drop("item_id")
        del dfactionLog_test
        del dfactionLog_train
        gc.collect()

        # aggregate title stats per uid
        gdf=df_uid_nlp_train.groupby("uid")
        df1=gdf.agg(fn.max("title_words_unique").alias("uid_max_title_words_unique"),fn.avg("title_words_unique").alias("uid_avg_title_words_unique"),\
                    fn.max("title_length").alias("uid_max_title_length"),fn.avg("title_length").alias("uid_avg_title_length")
                    )
        df1.show(1,truncate=False)
        df_uid_train=df_uid_nlp_train.join(df1,'uid','left').drop("title_words_unique").drop("title_length")
        df_uid_test=df_uid_nlp_test.join(df1,'uid','left').drop("title_words_unique").drop("title_length")

        print("理论上应该只有uid,uid_max_beauty,uid_avg_beauty,uid_male_ratio")
        df_uid_train.printSchema()
        df_uid_test.printSchema()

        print('------- save the df_uid_nlp data -------')
        file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'df_uid_nlp_train'
        os.system("hadoop fs -rm -r {}".format(file_path))  # remove any existing output first
        df_uid_train.rdd.map(tuple).saveAsPickleFile(file_path)

        file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'df_uid_nlp_test'
        os.system("hadoop fs -rm -r {}".format(file_path))  # remove any existing output first
        df_uid_test.rdd.map(tuple).saveAsPickleFile(file_path)
        print('finished saving data')

        # sanity-check the LDA fit: higher logLikelihood (ll) and lower logPerplexity (lp) are better
        '''
        ll = ldaModel.logLikelihood(rescaledData)
        lp = ldaModel.logPerplexity(rescaledData)
        print(ll)
        print(lp)
        '''

        # save ldaModel so the training-set transform can load it directly;
        # for now saving df_topic alone would suffice
        print("start saving the model")
        distributed_model_path = self.parser.get("hdfs_path", "hdfs_data_path") + "lda_distributed_model"
        ldaModel.save(distributed_model_path)
        print("finished saving the model")
        # how to load it back
        print("loading the model")
        sameLdaModel = DistributedLDAModel.load(distributed_model_path)
        print("finished loading the model")

        # --------------------------------- 3. model description ------------------------------
        # the model is described via describeTopics and topicsMatrix
        '''
        topicIndices = ldaModel.describeTopics(maxTermsPerTopic=5)
        topicIndices.show(truncate=False)
        # per topic: indices of its most important terms and each term's weight
        '''

