Example #1
def embeddingLSH(spark, movieEmbDf):
    # movieEmbSeq = []
    # for key, embedding_list in movieEmbMap.items():
    #     embedding_list = [np.float64(embedding) for embedding in embedding_list]
    #     movieEmbSeq.append((key, Vectors.dense(embedding_list)))
    #
    #
    # movieEmbDF = spark.createDataFrame(movieEmbSeq).toDF("movieId", "emb")

    # In effect this learns random projection vectors: each emb vector is reduced to a
    # scalar via an inner product, and each hash function then maps that value into a
    # bucket of width bucketLength
    bucketProjectionLSH = BucketedRandomProjectionLSH(inputCol="emb",
                                                      outputCol="bucketId",
                                                      bucketLength=0.1,
                                                      numHashTables=3)
    # Fit the model and assign bucket ids
    bucketModel = bucketProjectionLSH.fit(movieEmbDf)
    embBucketResult = bucketModel.transform(movieEmbDf)

    print("movieId, emb, bucketId schema:")
    embBucketResult.printSchema()
    print("movieId, emb, bucketId data result:")
    embBucketResult.show(10, truncate=False)

    # Given any emb vector, compute its buckets and find the nearest embeddings (by Euclidean distance) within those buckets
    print(
        "Approximately searching for 5 nearest neighbors of the sample embedding:"
    )
    sampleEmb = Vectors.dense(0.795, 0.583, 1.120, 0.850, 0.174, -0.839,
                              -0.0633, 0.249, 0.673, -0.237)
    bucketModel.approxNearestNeighbors(movieEmbDf, sampleEmb,
                                       5).show(truncate=False)
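
# Editor's sketch (not from the original example): a NumPy-only illustration of what
# BucketedRandomProjectionLSH does. Each hash table projects the embedding onto a random
# direction and floors the result by bucketLength; vectors that land in the same bucket of
# any table become approximate-nearest-neighbour candidates. The random directions and the
# normalization below are assumptions for illustration only, not Spark's exact internals.
import numpy as np

def toy_bucket_ids(emb, num_hash_tables=3, bucket_length=0.1, seed=42):
    rng = np.random.default_rng(seed)
    # one random unit direction per hash table
    directions = rng.normal(size=(num_hash_tables, len(emb)))
    directions /= np.linalg.norm(directions, axis=1, keepdims=True)
    # project and quantize: nearby embeddings tend to share bucket ids
    return np.floor(directions @ np.asarray(emb) / bucket_length).astype(int)

# e.g. toy_bucket_ids([0.795, 0.583, 1.120, 0.850, 0.174,
#                      -0.839, -0.0633, 0.249, 0.673, -0.237])
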
def embeddingLSH(spark, movieEmbMap):
    movieEmbSeq = []
    for key, embedding_list in movieEmbMap.items():
        embedding_list = [
            np.float64(embedding) for embedding in embedding_list
        ]
        movieEmbSeq.append((key, Vectors.dense(embedding_list)))
    movieEmbDF = spark.createDataFrame(movieEmbSeq).toDF("movieId", "emb")
    bucketProjectionLSH = BucketedRandomProjectionLSH(inputCol="emb",
                                                      outputCol="bucketId",
                                                      bucketLength=0.1,
                                                      numHashTables=3)
    bucketModel = bucketProjectionLSH.fit(movieEmbDF)
    embBucketResult = bucketModel.transform(movieEmbDF)
    print("movieId, emb, bucketId schema:")
    embBucketResult.printSchema()
    print("movieId, emb, bucketId data result:")
    embBucketResult.show(10, truncate=False)
    print(
        "Approximately searching for 5 nearest neighbors of the sample embedding:"
    )
    sampleEmb = Vectors.dense(0.795, 0.583, 1.120, 0.850, 0.174, -0.839,
                              -0.0633, 0.249, 0.673, -0.237)
    bucketModel.approxNearestNeighbors(movieEmbDF, sampleEmb,
                                       5).show(truncate=False)
    def embedding_lsh(self, spark_session: SparkSession, movie_emb_map):
        movie_emb_seq = []
        for movieId, vector in movie_emb_map.items():
            movie_emb_seq.append((movieId, Vectors.dense(vector)))
        movie_emb_df = spark_session.createDataFrame(movie_emb_seq).toDF(
            "movieId", "emb")

        bucket_projection_lsh = BucketedRandomProjectionLSH().setBucketLength(0.1).setNumHashTables(3).setInputCol("emb").\
            setOutputCol("bucketId")
        bucket_model = bucket_projection_lsh.fit(movie_emb_df)

        emb_bucket_result = bucket_model.transform(movie_emb_df)

        print("movieId, emb, bucketId schema:")
        emb_bucket_result.printSchema()
        print("movieId, emb, bucketId data result:")
        emb_bucket_result.show(10, truncate=False)

        print(
            "Approximately searching for 5 nearest neighbors of the sample embedding:"
        )
        sampleEmb = Vectors.dense(0.795, 0.583, 1.120, 0.850, 0.174, -0.839,
                                  -0.0633, 0.249, 0.673, -0.237)
        bucket_model.approxNearestNeighbors(movie_emb_df, sampleEmb,
                                            5).show(truncate=False)
def LSH(spot, recommend_num):
    spark = SparkSession \
        .builder \
        .appName("BucketedRandomProjectionLSHExample") \
        .getOrCreate()
    # All scenic-spot feature data, after normalization
    data = [('嵛山岛', Vectors.dense([0.2, 0.5, 0.7, 0.5])),
            ('仙山牧场', Vectors.dense([0.4, 0.4, 0.1, 0.4])),
            ('大洲岛', Vectors.dense([0.5, 0.1, 0.1, 0.5])),
            ('御茶园', Vectors.dense([0.2, 0.4, 0.3, 0.6])),
            ('洞宫山', Vectors.dense([0.3, 0.1, 0.2, 0.2])),
            ('玉女峰', Vectors.dense([0.4, 0.4, 0.5, 0.4])),
            ('翡翠谷', Vectors.dense([0.6, 0.1, 0.1, 0.5])),
            ('白云寺', Vectors.dense([0.9, 0.1, 0.2, 0.1])),
            ('泰宁地质博物苑', Vectors.dense([0.7, 0.1, 0.3, 0.7])),
            ('晒布岩', Vectors.dense([1, 0.4, 0.5, 0.4]))]

    df = spark.createDataFrame(data, ["name", "features"])
    brp = BucketedRandomProjectionLSH(inputCol="features",
                                      outputCol="hashes",
                                      bucketLength=2.0,
                                      numHashTables=3)
    model = brp.fit(df)

    # key = Vectors.dense([0.5, 0.8, 0.1, 0.5])  # the spot to make recommendations for
    # model.approxNearestNeighbors(df, key, 3).show()  # recommend the 3 most similar spots
    # Look up the query spot's vector and collect the neighbours before stopping the
    # session, so the returned rows remain usable after spark.stop()
    result = model.approxNearestNeighbors(df, dict(data)[spot],
                                          recommend_num).collect()

    spark.stop()

    return result
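
# Editor's sketch (hypothetical usage): recommend the 3 spots most similar to 御茶园.
# approxNearestNeighbors returns the original columns plus the hashes and a "distCol"
# column holding the Euclidean distance; the query spot itself usually comes back first.
# for row in LSH('御茶园', 3):
#     print(row['name'], row['distCol'])
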
def train_forPMML(sparkUrl, dataForTrainPath, savePath):
    # Get the model save paths
    brp_path, model_path = get_model_save_path(savePath)

    # Load the data
    sc = get_conf(sparkUrl, 'LSH_train', "8g")
    df = load_sentence_data_frame(sc, dataForTrainPath)

    # Train the model
    brp = BucketedRandomProjectionLSH() \
        .setBucketLength(BUCKET_LENGTH) \
        .setNumHashTables(NUM_HASH_TABLES) \
        .setInputCol("vector") \
        .setOutputCol("hash")

    # Pipeline: extract the features, then fit the model
    pipeline = Pipeline(stages=[brp])
    pipeline_model = pipeline.fit(df)

    # Show a preview of the results
    # pipeline_model.transform(df).show()
    # Save the model as PMML (PMMLBuilder is assumed to come from the pyspark2pmml package)
    pmmlBuilder = PMMLBuilder(sc, df, pipeline_model)
    pmmlBuilder.buildFile("~/pmmlModels/SM.pmml")
    return
Example #6
    def test_bucketed_random_projection_lsh(self):
        data = self.spark.createDataFrame([(
            0,
            Vectors.dense([-1.0, -1.0]),
        ), (
            1,
            Vectors.dense([-1.0, 1.0]),
        ), (
            2,
            Vectors.dense([1.0, -1.0]),
        ), (
            3,
            Vectors.dense([1.0, 1.0]),
        )], ["id", "features"])
        mh = BucketedRandomProjectionLSH(inputCol="features",
                                         outputCol="hashes",
                                         seed=12345,
                                         bucketLength=1.0)
        model = mh.fit(data)

        feature_count = data.first()[1].size
        model_onnx = convert_sparkml(
            model,
            'Sparkml BucketedRandomProjectionLSH',
            [('features', FloatTensorType([1, feature_count]))],
            spark_session=self.spark)
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        expected = [
            predicted.toPandas().hashes.apply(lambda x: pandas.Series(x).map(
                lambda y: y.values[0])).values.astype(numpy.float32),
        ]
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlBucketedRandomProjectionLSH")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['hashes'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #7
def train_lsh_model():
    sf = SparkConf()\
            .setMaster("local") \
            .setAppName("Spark SVM tutorial") \
            .set("spark.executor.memory", "8g")
    sc = SparkContext(conf=sf)
    df = read_csv()

    sdf = SQLContext(sc).createDataFrame(df)

    brp = BucketedRandomProjectionLSH() \
         .setBucketLength(50.0) \
         .setNumHashTables(3) \
         .setInputCol("vector") \
         .setOutputCol("hash")

    model = brp.fit(sdf)
    model.transform(sdf).show()

    model.approxSimilarityJoin(sdf, sdf, 1.5)
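
# Editor's note: approxSimilarityJoin is a lazy transformation, so the call above computes
# nothing on its own. A sketch of materializing the candidate pairs (column names follow
# the default approxSimilarityJoin output schema):
# pairs = model.approxSimilarityJoin(sdf, sdf, 1.5, distCol="EuclideanDistance")
# pairs.select("datasetA", "datasetB", "EuclideanDistance").show()
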
def validForSpark(sparkUrl, dataForTrainPath, dataForVaildPath, savePath):
    # Get the model save paths
    brp_path, model_path = get_model_save_path(savePath)

    # Load the data
    sc = get_conf(sparkUrl, 'LSH_valid', "8g")
    dft = load_sentence_data_frame(sc, dataForTrainPath)
    dfv = load_sentence_data_frame(sc, dataForVaildPath)

    dft.cache()

    # Load the previously saved models
    brp = BucketedRandomProjectionLSH.load(brp_path)
    model = BucketedRandomProjectionLSHModel.load(model_path)

    sets = dfv.rdd.map(lambda x: {
        x['sentence'], x['vector']
    }).collect()

    # print the nearest-neighbour result for each validation sentence
    for set in sets:
        readFalse = False
        sent = None
        vect = None
        for element in set:
            if isinstance(element, DenseVector) and vect is None:
                vect = element
            elif isinstance(element, DenseVector) and vect is not None:
                print('vect_error', set)
                readFalse = True
            if isinstance(element, unicode) and sent is None:
                sent = element
            elif isinstance(element, unicode) and sent is not None:
                print('sent_error', set)
                readFalse = True

        if sent is None or vect is None:
            readFalse = True

        if readFalse:
            print('read false')
            break

        print('=================================')
        print(sent)
        print('=================================')
        res = model.approxNearestNeighbors(dft, vect, 5)
        s_s = res.select('sentence').rdd.collect()
        for s in s_s:
            print(s['sentence'])
        print('************************************')

    return
def embeddingLSH(spark, movieEmbMap):
    movieEmbSeq = []
    for key, embedding_list in movieEmbMap.items():
        embedding_list = [np.float64(embedding) for embedding in embedding_list]
        movieEmbSeq.append((key, Vectors.dense(embedding_list)))

    # Prepare the dataset: build a DataFrame of (movieId, emb)
    movieEmbDF = spark.createDataFrame(movieEmbSeq).toDF("movieId", "emb")
    # Use Spark MLlib's built-in bucketed random-projection LSH model; numHashTables sets the number of hash tables applied to each embedding, i.e. the number of bucketing (hash) functions
    bucketProjectionLSH = BucketedRandomProjectionLSH(inputCol="emb", outputCol="bucketId",
                                                      bucketLength=0.1, numHashTables=3)
    bucketModel = bucketProjectionLSH.fit(movieEmbDF)
    embBucketResult = bucketModel.transform(movieEmbDF)
    print("movieId, emb, bucketId schema:")
    embBucketResult.printSchema()
    print("movieId, emb, bucketId data result:")
    embBucketResult.show(10, truncate=False)
    print("Approximately searching for 5 nearest neighbors of the given sample embedding:")
    # Take a sample embedding and convert it into a dense Vector
    sampleEmb = Vectors.dense(0.795, 0.583, 1.120, 0.850, 0.174, -0.839, -0.0633, 0.249, 0.673, -0.237)
    # Use the fitted bucketModel to search for its approximate nearest neighbours
    bucketModel.approxNearestNeighbors(dataset=movieEmbDF, key=sampleEmb, numNearestNeighbors=5).show(truncate=False)
Example #10
def train(sparkUrl, dataForTrainPath, savePath):
    # Get the model save paths
    brp_path, model_path = get_model_save_path(savePath)

    # Load the data
    sc = get_conf(sparkUrl, 'LSH_train', "8g")
    df = load_word_data_frame(sc, dataForTrainPath)

    # Train the model
    brp = BucketedRandomProjectionLSH() \
        .setBucketLength(BUCKET_LENGTH) \
        .setNumHashTables(NUM_HASH_TABLES) \
        .setInputCol("vector") \
        .setOutputCol("hash")
    model = brp.fit(df)

    # Save the models
    brp.save(brp_path)
    model.save(model_path)

    # Show a preview of the results
    model.transform(df).show()
Example #11
def embeddingLSH(moviesEmb: DataFrame):
    '''
    Locality-sensitive hashing over movie embeddings
    :param moviesEmb: DataFrame containing a "vector" column of movie embeddings
    :return: None
    '''
    brp = BucketedRandomProjectionLSH(inputCol='vector',
                                      outputCol='bucketId',
                                      numHashTables=3,
                                      bucketLength=0.1)
    model = brp.fit(moviesEmb)
    moviesEmbResult = model.transform(moviesEmb)
    moviesEmbResult.printSchema()
    moviesEmbResult.show(5)
    print(
        "Approximately searching for 5 nearest neighbors of the sample embedding:"
    )
    sampleEmb = Vectors.dense([
        0.795, 0.583, 1.120, 0.850, 0.174, -0.839, -0.0633, 0.249, 0.673,
        -0.237
    ])
    model.approxNearestNeighbors(moviesEmb, sampleEmb, 5).show(5)
Example #12
    def similarities(self, graph, config):

        print("Similarity Analysis\t1\tComputing hashes of feature vectors")
        graph = graph.get_df()

        max_id = graph.agg({"dst": "max"}).collect()[0][0] + 1

        # create features as sparse vectors from col and val columns
        def to_sparse_vector(indices, values):
            indices, values = zip(*sorted(zip(indices, values)))
            return Vectors.sparse(max_id, indices, values)

        def non_zero(v):
            return v.numNonzeros()

        to_sparse_vector_udf = udf(
            lambda indices, values: to_sparse_vector(indices, values),
            VectorUDT())
        non_zero_udf = udf(lambda v: non_zero(v), LongType())

        df = graph.groupby("src").agg(
            to_sparse_vector_udf(
                collect_list("dst"),
                collect_list("numberOfPaths")).alias("features"))

        # ignore vectors with fewer non-zero entries than this threshold
        df = df.filter(
            non_zero_udf("features") >= int(config["sim_min_values"]))

        # calculate the bucket length, here using a sqrt(total_records) rule of thumb
        total_records = df.count()
        buckets_length = math.ceil(math.sqrt(total_records))
        # 		buckets_length = math.pow(total_records, -1/max_id)
        # 		print(total_records)
        # 		print(buckets_length)
        brp = BucketedRandomProjectionLSH(inputCol="features",
                                          outputCol="hashes",
                                          bucketLength=buckets_length,
                                          numHashTables=int(config["t"]))
        model = brp.fit(df)
        df_t = model.transform(df)

        if ("Similarity Join" in config["analyses"]):
            df_t.cache()

            # Compute the locality sensitive hashes for the input rows, then perform approximate similarity join.
            print("Similarity Analysis\t2\tCalculating Similarity Join")
            join_distance = 3
            while True:
                join_results = model.approxSimilarityJoin(df_t, df_t, join_distance, distCol="EuclideanDistance")\
                 .select(
                  col("datasetA.src").alias("idA"),
                  col("datasetB.src").alias("idB"),
                  col("EuclideanDistance"))\
                 .filter(col("idA") != col("idB"))\
                 .orderBy("EuclideanDistance", ascending=True)\
                 .limit(int(config["searchK"]))

                # loop until we find as many results as requested
                if (join_results.count() >= int(config["searchK"])):
                    break

                # else increase distance and try again
                join_distance *= 2

            join_results.coalesce(1).write.csv(config["sim_join_out"],
                                               sep='\t',
                                               mode='overwrite')

        if ("Similarity Search" in config["analyses"]):
            print(
                "Similarity Analysis\t2\tCalculating Top-k Similarity Search results"
            )
            target_id = int(config["target_id"])
            key_vector = df.filter(col("src") == target_id).select(
                col("features")).collect()
            if (len(key_vector) == 0):
                return

            key_vector = key_vector[0]["features"]

            search_results = model.approxNearestNeighbors(
                df, key_vector,
                int(config["searchK"]) + 1).filter(
                    col("src") != target_id).select(lit(config["target_id"]),
                                                    "src", "distCol")
            search_results.coalesce(1).write.csv(config["sim_search_out"],
                                                 sep='\t',
                                                 mode='overwrite')
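
# Editor's sketch: a hypothetical `config` dictionary showing the keys the similarities()
# method above reads. All values are illustrative assumptions, not defaults from the
# original project.
example_config = {
    "sim_min_values": 5,                 # minimum non-zero entries a feature vector must have
    "t": 10,                             # number of LSH hash tables
    "analyses": ["Similarity Join", "Similarity Search"],
    "searchK": 100,                      # how many pairs / neighbours to keep
    "sim_join_out": "out/sim_join",      # output path for the similarity-join CSV
    "target_id": 42,                     # query node for the top-k similarity search
    "sim_search_out": "out/sim_search",  # output path for the similarity-search CSV
}
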
Example #13
    def compute_article_similar(self, articleProfile):
        """
        Compute the similarity between newly added articles and historical articles
        :param articleProfile:
        :return:
        """
        from pyspark.ml.feature import Word2VecModel

        def avg(row):
            x = 0
            for v in row.vectors:
                x += v
            return row.article_id, row.channel_id, x / len(row.vectors)

        for channel_id, channel_name in CHANNEL_INFO.items():
            profile = articleProfile.filter(
                'channel_id = {}'.format(channel_id))
            wv_model = Word2VecModel.load(
                "hdfs://hadoop1:9000/headlines/models/channel_%d_%s.word2vec" %
                (channel_id, channel_name))

            vectors = wv_model.getVectors()

            # Compute the article vectors
            profile.registerTempTable("incremental")
            articleKeywordsWeights = self.spark.sql(
                "select article_id, channel_id, keyword, weight from incremental "
                "LATERAL VIEW explode(keywords) AS keyword, weight"
            )

            articleKeywordsWeightsAndVectors = articleKeywordsWeights.join(
                vectors, vectors.word == articleKeywordsWeights.keyword,
                "inner")
            articleKeywordVectors = articleKeywordsWeightsAndVectors.rdd.map(
                lambda r: (r.article_id, r.channel_id, r.keyword, r.weight * r.
                           vector)).toDF([
                               "article_id", "channel_id", "keyword",
                               "weightVector"
                           ])

            articleKeywordVectors.registerTempTable("temptable")
            articleVector = self.spark.sql(
                "select article_id, min(channel_id) channel_id, collect_set(weightVector) vectors from temptable group by article_id"
            ).rdd.map(avg).toDF(["article_id", "channel_id", "articleVector"])

            # Write to Hive
            def toArray(row):
                return row.article_id, row.channel_id, [
                    float(i) for i in row.articleVector.toArray()
                ]

            articleVector = articleVector.rdd.map(toArray).toDF(
                ["article_id", "channel_id", "articleVector"])
            articleVector.write.insertInto("article_vector")

            import gc
            del wv_model
            del vectors
            del articleKeywordsWeights
            del articleKeywordsWeightsAndVectors
            del articleKeywordVectors
            gc.collect()

            # Load the historical article vectors, convert them to Vector format, and use LSH to find similar articles
            from pyspark.ml.linalg import Vectors
            from pyspark.ml.feature import BucketedRandomProjectionLSH
            train = self.spark.sql(
                "select * from article_vector where channel_id=%d" %
                channel_id)

            def _array_to_vector(row):
                return row.article_id, Vectors.dense(row.articleVector)

            train = train.rdd.map(_array_to_vector).toDF(
                ["article_id", "articleVector"])
            test = articleVector.rdd.map(_array_to_vector).toDF(
                ["article_id", "articleVector"])

            brp = BucketedRandomProjectionLSH(inputCol="articleVector",
                                              outputCol="hashes",
                                              bucketLength=1.0,
                                              seed=12345)
            model = brp.fit(train)
            similar = model.approxSimilarityJoin(test,
                                                 train,
                                                 2.0,
                                                 distCol="EuclideanDistance")

            def save_hbase(partitions):
                import happybase
                pool = happybase.ConnectionPool(size=3, host='hadoop1')

                with pool.connection() as conn:
                    article_similar = conn.table('article_similar')
                    for row in partitions:
                        if row.datasetA.article_id == row.datasetB.article_id:
                            pass
                        else:
                            article_similar.put(
                                str(row.datasetA.article_id).encode(), {
                                    'similar:{}'.format(row.datasetB.article_id).encode(
                                    ):
                                    b'%0.4f' % (row.EuclideanDistance)
                                })

            similar.foreachPartition(save_hbase)
Example #14
article_vector = article_vector.rdd.map(toArray).toDF(["article_id","channel_id","vector"])
# article_vector.write.insertInto("article_vector")

# Use LSH to compute article similarity
# 1. Read the data and convert the article vectors from array to Vector
from pyspark.ml.linalg import Vectors

def toVector(row):
    return row.article_id,Vectors.dense(row.vector)

train = article_vector.rdd.map(toVector).toDF(["article_id","vector"])

# Compute similar articles
from pyspark.ml.feature import BucketedRandomProjectionLSH

brp = BucketedRandomProjectionLSH(inputCol='vector', outputCol='hashes', numHashTables=4, bucketLength=10.0)
model = brp.fit(train)
similar = model.approxSimilarityJoin(train, train, 2.5, distCol='EuclideanDistance')
# Sort by Euclidean distance ascending: the smaller the distance, the more similar the articles
similar = similar.sort('EuclideanDistance')
similar.show()
# The join result has struct columns (datasetA / datasetB), so rows are accessed as row.datasetA.article_id below
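
# Editor's sketch: the struct columns can also be flattened into plain id / id / distance
# columns for inspection (pyspark.sql.functions.col is assumed to be importable here):
from pyspark.sql.functions import col
similar.select(col('datasetA.article_id').alias('idA'),
               col('datasetB.article_id').alias('idB'),
               'EuclideanDistance').show()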

# Store the similar-article results in HBase
# def save_hbase(partitions):
#     import happybase
#     pool = happybase.ConnectionPool(size=3,host='hadoop1')
#
#     with pool.connection() as conn:
#         article_similar = conn.table('article_similar')
#         for row in partitions:
Example #15
    ddf = assembler.transform(
        df.select("*",
                  *(df["scores"].getItem(i)
                    for i in range(keywords_length)))).select("user_score")

    normalizer = Normalizer(inputCol="user_score",
                            outputCol="normFeatures",
                            p=2.0)
    extended_user_df = normalizer.transform(ddf)
    extended_user_df.cache()

    seed_user_df = extended_user_df

    # LSH Algorithm
    brp = BucketedRandomProjectionLSH(inputCol="normFeatures",
                                      outputCol="hashes",
                                      bucketLength=bucketLength,
                                      numHashTables=numHashTables)
    lsh = brp.fit(extended_user_df)
    df_users = lsh.approxSimilarityJoin(seed_user_df,
                                        extended_user_df,
                                        1 - minimum_similarity_score,
                                        distCol="EuclideanDistance")
    df_users = df_users.withColumn(
        'similarity_score',
        udf(lambda x: 1 - x, FloatType())(df_users.EuclideanDistance))

    df_users.coalesce(1000).write.mode('overwrite').parquet(write_path)

    sc.stop()

    print("job is completed")
                        b'%0.4f' % (row.EuclideanDistance)
                    })


if __name__ == '__main__':

    ism = ItemSimilarModel()
    item_embeddings = pd.read_csv('./item_embeddings.csv')
    item_df = ism.spark.createDataFrame(item_embeddings)

    # Convert the 32-dimensional movie_id vectors exported by the YoutubeDNN model into the Vector format LSH expects
    embedding_vecAssembler = VectorAssembler(
        inputCols=cols, outputCol="embeddings").transform(item_df)
    embedding_vecAssembler.registerTempTable('temptable')
    embedding_Vectors = ism.spark.sql(
        "select movie_id, embeddings from temptable")

    # Compute similar items
    brp = BucketedRandomProjectionLSH(inputCol='embeddings',
                                      outputCol='similar',
                                      numHashTables=4.0,
                                      bucketLength=10.0)
    model = brp.fit(embedding_Vectors)
    similar = model.approxSimilarityJoin(embedding_Vectors,
                                         embedding_Vectors,
                                         2.0,
                                         distCol='EuclideanDistance')

    # Write the results to the database
    similar.foreachPartition(save_hbase)
Example #17
    def compute_article_similar(self, articleProfile):
        """
        Compute the similarity between newly added articles and historical articles using word2vec vectors
        :return:
        """

        # Get the channel categories of the new articles to update (not used)
        # all_channel = set(articleProfile.rdd.map(lambda x: x.channel_id).collect())
        def avg(row):
            x = 0
            for v in row.vectors:
                x += v
            #  Use the average of the keyword vectors as the article's vector
            return row.article_id, row.channel_id, x / len(row.vectors)

        for channel_id, channel_name in CHANNEL_INFO.items():

            profile = articleProfile.filter(
                'channel_id = {}'.format(channel_id))
            wv_model = Word2VecModel.load(
                "hdfs://hadoop-master:9000/headlines/models/channel_%d_%s.word2vec"
                % (channel_id, channel_name))
            vectors = wv_model.getVectors()

            # Compute the article vectors
            profile.registerTempTable("incremental")
            articleKeywordsWeights = self.spark.sql(
                "select article_id, channel_id, keyword, weight from incremental LATERAL VIEW explode(keywords) AS keyword, weight where channel_id=%d"
                % channel_id)

            articleKeywordsWeightsAndVectors = articleKeywordsWeights.join(
                vectors, vectors.word == articleKeywordsWeights.keyword,
                "inner")
            articleKeywordVectors = articleKeywordsWeightsAndVectors.rdd.map(
                lambda r: (r.article_id, r.channel_id, r.keyword, r.weight * r.
                           vector)).toDF([
                               "article_id", "channel_id", "keyword",
                               "weightingVector"
                           ])

            articleKeywordVectors.registerTempTable("tempTable")
            articleVector = self.spark.sql(
                "select article_id, min(channel_id) channel_id, collect_set(weightingVector) vectors from tempTable group by article_id"
            ).rdd.map(avg).toDF(["article_id", "channel_id", "articleVector"])

            # Write to the database
            def toArray(row):
                return row.article_id, row.channel_id, [
                    float(i) for i in row.articleVector.toArray()
                ]

            articleVector = articleVector.rdd.map(toArray).toDF(
                ['article_id', 'channel_id', 'articleVector'])
            articleVector.write.insertInto("article_vector")

            import gc
            del wv_model
            del vectors
            del articleKeywordsWeights
            del articleKeywordsWeightsAndVectors
            del articleKeywordVectors
            gc.collect()

            # Load the historical data and convert it into the fixed format LSH needs to find similar articles
            train = self.spark.sql(
                "select * from article_vector where channel_id=%d" %
                channel_id)

            def _array_to_vector(row):
                return row.article_id, Vectors.dense(row.articleVector)

            train = train.rdd.map(_array_to_vector).toDF(
                ['article_id', 'articleVector'])
            test = articleVector.rdd.map(_array_to_vector).toDF(
                ['article_id', 'articleVector'])

            brp = BucketedRandomProjectionLSH(inputCol='articleVector',
                                              outputCol='hashes',
                                              seed=12345,
                                              bucketLength=1.0)
            model = brp.fit(train)
            similar = model.approxSimilarityJoin(test,
                                                 train,
                                                 2.0,
                                                 distCol='EuclideanDistance')

            def save_hbase(partition):
                import happybase
                # one connection pool per partition, not one per row
                pool = happybase.ConnectionPool(size=3, host='hadoop-master')
                # article_similar: row key article_id, column similar:article_id, value sim
                with pool.connection() as conn:
                    table = conn.table("article_similar")
                    for row in partition:
                        if row.datasetA.article_id == row.datasetB.article_id:
                            pass
                        else:
                            table.put(
                                str(row.datasetA.article_id).encode(), {
                                    b"similar:%d" % row.datasetB.article_id:
                                    b"%0.4f" % row.EuclideanDistance
                                })

            similar.foreachPartition(save_hbase)
spark = SparkSession \
    .builder \
    .getOrCreate()

df = spark.read.parquet("user_score")
normalizer = Normalizer(inputCol="user_score", outputCol="normFeatures", p=2.0)
extended_user_df = normalizer.transform(df)
extended_user_df.cache()
# seed_user_df = extended_user_df.sample(0.1, False)

# print("no seed users: ",seed_user_df.count(),"   no of extended users:  ",extended_user_df.count())

# LSH Algorithm
start_time=time.time()
brp = BucketedRandomProjectionLSH(inputCol="normFeatures", outputCol="hashes", bucketLength=10000.0, numHashTables=numHashTables)
brp.setSeed(random.randint(0, 2 ** 31 - 1))  # random.randint requires explicit bounds
model = brp.fit(extended_user_df)

# Get the hashes for the users and convert them into a cluster ID number.
df_users = model.transform(extended_user_df)
df_users = df_users.withColumn('cluster_id', udf(lambda input: reduce(lambda x, y: x | y, [ 0x1 << i if value[0] != 0.0 else 0 for i, value in enumerate(input) ]), IntegerType())(df_users.hashes))
#df_users.select('hashes', 'cluster_id').show(50, truncate=False, vertical=True)
df_count = df_users.groupBy(['cluster_id', 'hashes']).count().cache()
df_count.show(100, truncate=False)
df_count.groupBy().max('count').show()

# df_users = model.approxSimilarityJoin(seed_user_df, extended_user_df, 0.99, distCol="EuclideanDistance")
# df_users.coalesce(100).write.mode('overwrite').parquet("user_similarity")
print("{} seconds time take by the script:  ".format(time.time()-start_time))
train_data = article_vector.select(['article_id', 'articlevector'])


def _array_to_vector(row):
    return row.article_id, Vectors.dense(row.articlevector)


train_data = train_data.rdd.map(_array_to_vector).toDF(
    ['article_id', 'articleVector'])

# train_data.show()

from pyspark.ml.feature import BucketedRandomProjectionLSH

brp = BucketedRandomProjectionLSH(inputCol="articleVector",
                                  outputCol="hashes",
                                  numHashTables=4,
                                  bucketLength=10)
model = brp.fit(train_data)

similarity = model.approxSimilarityJoin(train_data,
                                        train_data,
                                        5.0,
                                        distCol='EuclideanDistance')


def save_hbase(partition):
    import happybase
    conn = happybase.Connection('localhost')
    table = conn.table('article_similar')
    for row in partition:
        if row.datasetA.article_id == row.datasetB.article_id:
Example #20
spark = SparkSession.builder.\
    master('local[2]').\
    appName('scholar').\
    getOrCreate()

sc = spark.sparkContext
sqlContext = SQLContext(sc)

df = sqlContext.read.format('parquet').load(
    'hdfs:/scholar_data/token_embeddings.parquet').select(
        'entities', 'embeddings')

to_vector = F.udf(lambda x: Vectors.dense(x), VectorUDT())
to_normed_vector = F.udf(make_normed_vector, VectorUDT())

df = df.withColumn('normed_embeddings', to_normed_vector('embeddings'))
df = df.withColumn('embeddings', to_vector('embeddings'))

brpLSH = BucketedRandomProjectionLSH(inputCol="normed_embeddings",
                                     outputCol="hashes",
                                     seed=42,
                                     bucketLength=12.0,
                                     numHashTables=20)
brpLSHmodel = brpLSH.fit(df)

brpLSHmodel.save('hdfs:/scholar_model/brpLSH_model')
df.write.save('hdfs:/scholar_data/token_normed_vector_embeddings.parquet',
              format='parquet',
              mode='overwrite')

spark.stop()
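
# Editor's sketch (hypothetical follow-up job): re-load the persisted embeddings and LSH
# model and query them. `query_vec` is an assumed query vector with the same dimensionality
# as the stored 'normed_embeddings' column, so the query lines are left commented out.
from pyspark.sql import SparkSession
from pyspark.ml.feature import BucketedRandomProjectionLSHModel

spark = SparkSession.builder.master('local[2]').appName('scholar_query').getOrCreate()
df = spark.read.parquet('hdfs:/scholar_data/token_normed_vector_embeddings.parquet')
model = BucketedRandomProjectionLSHModel.load('hdfs:/scholar_model/brpLSH_model')
# query_vec = Vectors.dense([...])
# model.approxNearestNeighbors(df, query_vec, 10).select('entities').show()
spark.stop()
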
    # $example on$
    dataA = [(0, Vectors.dense([1.0, 1.0]),),
             (1, Vectors.dense([1.0, -1.0]),),
             (2, Vectors.dense([-1.0, -1.0]),),
             (3, Vectors.dense([-1.0, 1.0]),)]
    dfA = spark.createDataFrame(dataA, ["id", "features"])

    dataB = [(4, Vectors.dense([1.0, 0.0]),),
             (5, Vectors.dense([-1.0, 0.0]),),
             (6, Vectors.dense([0.0, 1.0]),),
             (7, Vectors.dense([0.0, -1.0]),)]
    dfB = spark.createDataFrame(dataB, ["id", "features"])

    key = Vectors.dense([1.0, 0.0])

    brp = BucketedRandomProjectionLSH(inputCol="features", outputCol="hashes", bucketLength=2.0,
                                      numHashTables=3)
    model = brp.fit(dfA)

    # Feature Transformation
    print("The hashed dataset where hashed values are stored in the column 'hashes':")
    model.transform(dfA).show()

    # Compute the locality sensitive hashes for the input rows, then perform approximate
    # similarity join.
    # We could avoid computing hashes by passing in the already-transformed dataset, e.g.
    # `model.approxSimilarityJoin(transformedA, transformedB, 1.5)`
    print("Approximately joining dfA and dfB on Euclidean distance smaller than 1.5:")
    model.approxSimilarityJoin(dfA, dfB, 1.5, distCol="EuclideanDistance")\
        .select(col("datasetA.id").alias("idA"),
                col("datasetB.id").alias("idB"),
                col("EuclideanDistance")).show()
Example #22
# COMMAND ----------

def recommend_by_book(book_id):
  cluster = predictions.filter(predictions.book_id == book_id).select("prediction").collect()[0][0]
  titles = predictions.filter(predictions.prediction == cluster).select("title").collect()
  for title in titles:
    print(title[0])
    
recommend_by_book(100001)

# COMMAND ----------

# DBTITLE 1,Locality Sensitive Hashing: Bucketed Random Projection for Euclidean Distance
from pyspark.ml.feature import BucketedRandomProjectionLSH

brp = BucketedRandomProjectionLSH(inputCol="features", outputCol="hashes", bucketLength=5, numHashTables=10)
model = brp.fit(data_pca)

# COMMAND ----------

def find_nearest_books(book_id, num):
  key = data_pca.filter(data_pca.book_id == book_id).select("features").collect()[0][0]
  res = model.approxNearestNeighbors(data_pca, key, num).select("book_id").collect()
  for r in res:
    print(get_book_title(r[0]))

find_nearest_books(100001, 10)

# COMMAND ----------

# DBTITLE 1,Latent Dirichlet allocation