def LDA_spark():
    # Load and parse the data (assumes an active SparkContext `sc` and the
    # pyspark.mllib imports for LDA, LDAModel and Vectors)
    data = sc.textFile("data/mllib/sample_lda_data.txt")
    parsedData = data.map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
    # Index documents with unique IDs
    corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

    # Cluster the documents into three topics using LDA
    ldaModel = LDA.train(corpus, k=3)

    # Save and load model
    ldaModel.save(sc, "myModelPath")
    sameModel = LDAModel.load(sc, "myModelPath")
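As a quick sanity check, the trained topics can be inspected by their top-weighted terms. A minimal sketch, assuming a trained or loaded mllib model such as the sameModel above:

# Sketch (not in the original): print the top 5 terms of each topic
for tid, (term_ids, term_weights) in enumerate(sameModel.describeTopics(maxTermsPerTopic=5)):
    print("Topic " + str(tid) + ":")
    for term_id, weight in zip(term_ids, term_weights):
        print("  term " + str(term_id) + " -> " + str(weight))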
Example #2
def main():
    # Assumes module-level `spark`, `sc`, `tablenames` and `keyspace`, plus
    # nltk, re, pyspark.ml.feature (CountVectorizer, IDF) and pyspark.mllib
    # (LDA as MLlibLDA, Vectors as MLlibVectors) imports
    for tn in tablenames:
        # Read at most 1000 rows per table from Cassandra
        data = spark.read.format("org.apache.spark.sql.cassandra")\
                    .options(table=tn, keyspace=keyspace).load().limit(1000)

        data = data.sort('imdb_score', ascending=False)

        desc = data.rdd.map(lambda x: x['description']).filter(
            lambda x: x is not None)

        # NLTK English stop words, plus the trailing "See full summary" phrase
        StopWords = nltk.corpus.stopwords.words('english')
        StopWords.extend([" ...                See full summary"])

        # Lowercase, split on spaces, keep alphabetic tokens longer than
        # three characters, drop stop words, and attach a document index
        tokenized = desc.map(lambda y: y.strip().lower()) \
            .map(lambda x: re.split(" ", x)) \
            .map(lambda words: [x for x in words if x.isalpha()]) \
            .map(lambda words: [x for x in words if len(x) > 3]) \
            .map(lambda words: [x for x in words if x not in StopWords]) \
            .zipWithIndex()

        df_txts = spark.createDataFrame(tokenized, ["words", 'index'])
        countVec = CountVectorizer(inputCol="words",
                                   outputCol="raw_features",
                                   vocabSize=5000,
                                   minDF=10.0)
        CountVectMod = countVec.fit(df_txts)
        result = CountVectMod.transform(df_txts)
        idf = IDF(inputCol="raw_features", outputCol="features")
        idfModel = idf.fit(result)
        resultTFIdf = idfModel.transform(result)

        totalTopics = 10
        totalItr = 100
        # Train an MLlib LDA model on [doc id, TF-IDF vector] pairs; the ml
        # vectors are converted to mllib vectors with MLlibVectors.fromML first
        ldaModel = MLlibLDA.train(
            resultTFIdf.select('index', 'features').rdd
                       .mapValues(MLlibVectors.fromML).map(list),
            k=totalTopics, maxIterations=totalItr)

        maxwordsTopic = 5
        topicIndices = sc.parallelize(
            ldaModel.describeTopics(maxTermsPerTopic=maxwordsTopic))
        VCarr = CountVectMod.vocabulary

        def finalTopic(topic):
            terms = topic[0]
            result = []
            for i in range(maxwordsTopic):
                term = VCarr[terms[i]]
                result.append(term)
            return result

        topics_final = topicIndices.map(
            lambda topic: finalTopic(topic)).collect()
        print(topics_final)
        for topic in range(len(topics_final)):
            print("Topic" + str(topic) + ":")
            for term in topics_final[topic]:
                print(term)
            print('\n')
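For comparison, the same topic extraction could stay in the DataFrame API by using pyspark.ml's LDA instead of converting to an RDD. A rough sketch under that assumption; the MLLDA alias is an addition, and k/maxIter mirror totalTopics/totalItr from the example above:

from pyspark.ml.clustering import LDA as MLLDA

# Fit directly on the TF-IDF DataFrame built above
ml_lda = MLLDA(k=10, maxIter=100, featuresCol="features")
ml_model = ml_lda.fit(resultTFIdf)

# describeTopics() returns a DataFrame of (topic, termIndices, termWeights)
for row in ml_model.describeTopics(maxTermsPerTopic=5).collect():
    print("Topic" + str(row['topic']) + ":")
    for idx in row['termIndices']:
        print(CountVectMod.vocabulary[idx])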
Example #3
def test():
    sc = SparkContext(master='local[4]', appName='lda')
    sc.setLogLevel('ERROR')

    def train():
        # corpus_filename, max_iter, seed, checkin_point_interval, K, optimizer,
        # alpha and beta are assumed to be defined at module level
        data = sc.textFile(corpus_filename).map(lambda line: Vectors.dense(
            [float(i) for i in line.strip().split()]))
        corpus = data.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
        # print(corpus.take(5))

        lda_model = LDA.train(rdd=corpus,
                              maxIterations=max_iter,
                              seed=seed,
                              checkpointInterval=checkin_point_interval,
                              k=K,
                              optimizer=optimizer,
                              docConcentration=alpha,
                              topicConcentration=beta)
        # Remove any stale model directory before saving
        if os.path.exists('./ldamodel'):
            import shutil
            shutil.rmtree('./ldamodel')
        lda_model.save(sc, "./ldamodel")

    # train()

    lda_model = LDAModel.load(sc, "./ldamodel")

    # Topic-word matrix (unnormalized; each column corresponds to one topic)
    topics = lda_model.topicsMatrix()
    # for tid in range(3):
    #     print('Topic' + str(tid) + ':')
    #     for wid in range(0, lda_model.vocabSize()):
    #         print(' ' + str(topics[wid, tid] / sum(topics[:, tid])))  # normalize within the topic
    #         # print(' ' + str(topics[wid, tid]))

    # Per-topic term ranking: ([term ids sorted by weight, descending], [term weights within the topic])
    topics_dist = lda_model.describeTopics()
    for tid, topic in enumerate(topics_dist):
        print('Topic' + str(tid) + ':' + '\n', topic)

    # Per-document topic distribution (not exposed by mllib; the ml API is needed for this)
    # doc_topic = lda_model

    sc.stop()
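The comment above notes that per-document topic distributions are not exposed by the mllib LDAModel. A brief, self-contained sketch of how they could be obtained through pyspark.ml instead; the SparkSession, toy corpus, and parameters below are illustrative assumptions, not part of the original:

from pyspark.ml.clustering import LDA as MLLDA
from pyspark.ml.linalg import Vectors as MLVectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('lda_doc_topics').getOrCreate()
# Tiny illustrative corpus: (doc id, term-count vector)
docs_df = spark.createDataFrame(
    [(0, MLVectors.dense([1.0, 2.0, 0.0])),
     (1, MLVectors.dense([0.0, 3.0, 1.0]))],
    ['id', 'features'])
ml_model = MLLDA(k=2, maxIter=20, featuresCol='features').fit(docs_df)
# transform() adds a 'topicDistribution' column: one probability vector per document
ml_model.transform(docs_df).select('id', 'topicDistribution').show(truncate=False)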
Example #5
File: lda.py Project: farokojil/segme
from pyspark import SparkContext

from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors

# Load and parse the data
sc = SparkContext(appName="lda")
# data = sc.textFile("data/sample_lda_data.txt")
# parsedData = data.map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
# # Index documents with unique IDs
# corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
#
# # Cluster the documents into three topics using LDA
# ldaModel = LDA.train(corpus, k=3)

ldaModel = LDAModel.load(sc, "data/model/mymodel")


# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize()) + " words):")
topics = ldaModel.topicsMatrix()
for topic in range(3):
    print("Topic " + str(topic) + ":")
    for word in range(0, ldaModel.vocabSize()):
        print(" " + str(topics[word][topic]))

# Save and load model
# ldaModel.save(sc, "data/model/mymodel")

# sameModel = LDAModel.load(sc, "data/model/mymodel")
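The topic loop above prints raw entries of topicsMatrix(); as the comment in Example #3 notes, these values are not normalized per topic. A minimal numpy sketch (the numpy import and helper names are additions) that turns each column into a proper distribution over the vocabulary and lists the top term indices:

import numpy as np

# Normalize each column (topic) of the topics matrix from the example above
topic_term = np.asarray(topics)
topic_dist = topic_term / topic_term.sum(axis=0)
for topic in range(3):
    top_terms = np.argsort(topic_dist[:, topic])[::-1][:5]
    print("Topic " + str(topic) + " top term ids: " + str(top_terms.tolist()))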
Example #6
# data_features (pre-computed word-count vectors) and sc are assumed to exist
corpus = sc.parallelize(data_features)

ldaModel = LDA.train(corpus, k=3)

# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " +
      str(ldaModel.vocabSize()) + " words):")
topics = ldaModel.topicsMatrix()
for topic in range(3):
    print("Topic " + str(topic) + ":")
    for word in range(0, ldaModel.vocabSize()):
        print(" " + str(topics[word][topic]))

# Save and load model
ldaModel.save(sc, "myModelPath")
sameModel = LDAModel.load(sc, "myModelPath")

Example #7
import os
import sys
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext

sc = SparkContext()
for route, directories, files in os.walk('/media/deepak/data_words.csv'):
    for file in files:
        f_path = os.path.join(route, file)
        f_name = os.path.join(route, file).split('/')
        # Load and parse the data
        data = sc.textFile(f_path)
        dataParsed = data.map(lambda line: Vectors.dense(
            [float(x) for x in line.strip().split(',')]))
        # Index documents with unique IDs
        corpus = dataParsed.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
        # Cluster the documents into three topics using LDA
        ldaModel = LDA.train(corpus, k=3)
        # Output topics. Each is a distribution over words (matching word count vectors)
        print(f_name[-1])
        print("Learned topics (as distributions over vocab of " +
              str(ldaModel.vocabSize()) + " words):")
        topics = ldaModel.topicsMatrix()
        for topic in range(3):
            print("Topic " + str(topic) + ":")
            for word in range(0, ldaModel.vocabSize()):
                print(" " + str(topics[word][topic]))
        # Save and load model
        ldaModel.save(sc, "/media/deepak/lda_output/" + f_name[-1])
        sameModel = LDAModel.load(sc, "/media/deepak/lda_output/" + f_name[-1])
Example #9
def load_model():
    # Assumes a module-level SparkContext `sc` (and `conf`, per the commented-out line)
    from pyspark.mllib.clustering import LDA, LDAModel
    #sc = SparkContext(appName='lda_load', conf=conf)
    path = "/user/rmusters/ldaModel2"
    ldaModel = LDAModel.load(sc, path)
    return ldaModel
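A possible usage of the helper above, assuming the module-level sc it relies on; the calls below are illustrative, not from the original project:

model = load_model()
print("Vocabulary size: " + str(model.vocabSize()))
# Top 3 term indices and weights per topic
for term_ids, weights in model.describeTopics(maxTermsPerTopic=3):
    print(term_ids, weights)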