Example #1
def tfidf_lda(df):
    '''
    TFIDF+LDA
    :param df:
    :return: model
    '''
    # hashingTF
    hashingTF = HashingTF(inputCol="content", outputCol="features")
    df_TF = hashingTF.transform(df)
    print('df_TF')
    df_TF.show(truncate=False)
    # IDF
    idf = IDF(inputCol="features", outputCol="idf")
    model_idf = idf.fit(df_TF)
    df_idf = model_idf.transform(df_TF)
    print('df_idf')
    df_idf.cache()
    df_idf.show(truncate=False)
    # LDA
    lda = LDA(k=20, seed=1, optimizer="em")
    model_lda = lda.fit(df_idf)
    model_lda.describeTopics(maxTermsPerTopic=20)
    df_lda = model_lda.transform(df_idf)
    df_lda.select("content", "topicDistribution").show(truncate=False)
    sparkEntrance.spark.createDataFrame(df_lda.rdd, ['content', 'topicDistribution'])
    return model_lda
Example #2
    def _fit(self, papers):
        """
        Build a LDA representation for each paper in the input data set. Based on papers in the papers corpus, a set of all
        terms is extracted. For each of them a unique id is generated. Term ids are sequential. Then depending on all terms
        and their frequence for a paper, a sparse vector is built. A model that can be used to map a tf vector to each paper 
        based on its paper id is used. Based on the tf representation of all papers - LDA is trained and used for prodicing
        LDA representation.
    
        :param data set: input data set, which is an instance of :py:class:`pyspark.sql.DataFrame`
        :returns: a build model which can be used for transformation of a data set
        """
        Logger.log("Train/Transform TF vectorizer.")
        tfVectorizer = TFVectorizer(self.papers_corpus, paperId_col = self.paperId_col, tf_map_col = self.tf_map_col, output_col = "tf_vector")
        tfVectorizerModel = tfVectorizer.fit(papers)
        # paper_id | tf_vector
        papers_tf_vectors = tfVectorizerModel.transform(papers).select(self.paperId_col, "tf_vector")
        papers_tf_vectors.cache()
        Logger.log("Train LDA. Topics:" + str(self.k_topics))
        # Trains a LDA model.
        # The number of topics to infer. Must be > 1.
        lda = LDA(featuresCol = "tf_vector", k = self.k_topics)
        model = lda.fit(papers_tf_vectors)

        Logger.log("Transform LDA over paper corpus.")
        # format -> paper_id | lda_vector
        papers_lda_vectors = model.transform(papers_tf_vectors).withColumnRenamed("topicDistribution", self.output_col).drop("tf_vector")

        Logger.log("Return LDA model.")
        papers_tf_vectors.unpersist()
        return LDAModel(papers_lda_vectors, self.paperId_col, self.output_col)
Example #3
def lda(features, num_clusters):
    """Does clustering on the features dataset using LDA topic clustering.

    Params:
    - features (pyspark.sql.DataFrame): The data frame containing the features to be used for clustering
    - num_clusters (int): The number of clusters to be used

    Returns:
    - clustered (pyspark.sql.DataFrame): The data frame, with the predicted clusters in a 'cluster' column
    """
    lda = LDA(k=num_clusters,
              featuresCol='features',
              topicDistributionCol='topics')
    lda_model = lda.fit(features)
    clustered = lda_model.transform(features)
    clustered = clustered.rdd.map(
        lambda row: Row(cluster=int(argmax(row['topics'])), **row.asDict()))
    clustered = clustered.map(
        lambda row: Row(closeness=float(row['topics'][row['cluster']]),
                        **row.asDict())).toDF()
    clustered = clustered.drop('topics')
    clustered.show()
    print("=====Clustering Results=====")
    print("LDA log perplexity = ", lda_model.logPerplexity(features))
    cluster_sizes = list()
    for i in range(num_clusters):
        cluster_size = clustered.rdd.filter(
            lambda row: row['cluster'] == i).count()
        cluster_sizes.append(cluster_size)
    print("Cluster sizes = ", cluster_sizes)
    # The 'cluster' column holds the argmax over the topic distribution, i.e. the dominant topic per document
    return clustered
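
A minimal usage sketch, assuming a SparkSession named `spark` and that `argmax` (NumPy) and `Row` are imported in the surrounding module; the sample data and column names below are illustrative assumptions.

from pyspark.ml.feature import CountVectorizer

docs = spark.createDataFrame([(["spark", "lda", "topics"],),
                              (["python", "spark", "clustering"],)], ["words"])
cv_model = CountVectorizer(inputCol="words", outputCol="features").fit(docs)
features = cv_model.transform(docs)
clustered = lda(features, num_clusters=2)  # adds 'cluster' and 'closeness' columns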
Example #4
 def test_persistence(self):
     # Test save/load for LDA, LocalLDAModel, DistributedLDAModel.
     df = self.spark.createDataFrame([
         [1, Vectors.dense([0.0, 1.0])],
         [2, Vectors.sparse(2, {0: 1.0})],
     ], ["id", "features"])
     # Fit model
     lda = LDA(k=2, seed=1, optimizer="em")
     distributedModel = lda.fit(df)
     self.assertTrue(distributedModel.isDistributed())
     localModel = distributedModel.toLocal()
     self.assertFalse(localModel.isDistributed())
     # Define paths
     path = tempfile.mkdtemp()
     lda_path = path + "/lda"
     dist_model_path = path + "/distLDAModel"
     local_model_path = path + "/localLDAModel"
     # Test LDA
     lda.save(lda_path)
     lda2 = LDA.load(lda_path)
     self._compare(lda, lda2)
     # Test DistributedLDAModel
     distributedModel.save(dist_model_path)
     distributedModel2 = DistributedLDAModel.load(dist_model_path)
     self._compare(distributedModel, distributedModel2)
     # Test LocalLDAModel
     localModel.save(local_model_path)
     localModel2 = LocalLDAModel.load(local_model_path)
     self._compare(localModel, localModel2)
     # Clean up
     try:
         rmtree(path)
     except OSError:
         pass
Example #5
    def lda_train(self, file):
        json_rdd, count = self.load_train_titleFeature_rdd(file)
        vocabulary_set = json_rdd.map(lambda line : get_title_words(line))\
                                 .flatMap(lambda word : word).distinct().collect()

        vocab_size = self.sc.broadcast(max(vocabulary_set) + 1)

        print('vocabulary size: ' + str(vocab_size.value))

        sparseVec_rdd = json_rdd.map(lambda line : cast_dict_str2int(line.get('title_features')))\
                                .map(lambda value : SparseVector(vocab_size.value, value))
        zip_rdd = sparseVec_rdd.zipWithIndex()
        lda_train_rdd = zip_rdd.map(lambda x: [x[1], x[0]]).cache()

        K = 4
        max_iter = 10
        seed = 1024

        lda_train_df = self.sqlContext.createDataFrame(lda_train_rdd.collect(),
                                                       ["id", "features"])
        lda = LDA(k=K, maxIter=max_iter, seed=seed)
        lda_model = lda.fit(lda_train_df)

        print('LDA model vocabSize : ' + str(lda_model.vocabSize()))
        print(lda_model.isDistributed())
        lda_model.describeTopics().show()

        #os.system("hadoop fs -rmr {}".format(self.lda_model_path))
        #os.system("hadoop fs -rmr {}".format(self.lda_path))

        lda_model.write().overwrite().save(self.lda_model_path)

        self.sc.stop()
Example #6
File: ml.py Project: ribonj/lsir
def lda(df, column):
    df = preprocess(df, column) # text to list of terms
    (df, voc) = count(df, column) # add a feature column containing term counts
    
    # Trains the LDA model.
    # The input to LDA must be a dataframe containing a "features" column
    # (e.g. 10 topics and 100 iterations: k=10, maxIter=100)
    #lda = None
    lda = LDA(featuresCol=column, topicDistributionCol='_'+column, k=5, maxIter=20) 
    model = lda.fit(df)

    '''
    # compute likelihood and perplexity metrics
    ll = model.logLikelihood(df)
    lp = model.logPerplexity(df)
    print("The lower bound on the log likelihood: " + str(ll))
    print("The upper bound bound on perplexity: " + str(lp))
    #'''
    
    # Describe topics (using the 3 first terms)
    topics = model.describeTopics(3)
    #print("The topics described by their top-weighted terms:")
    #topics.show(truncate=False)

    # Shows the result
    df = model.transform(df)
    #df.show(truncate=False)
    df = replace(df, column, '_'+column)
    
    return (df, topics.collect(), voc)
Example #7
File: lda.py Project: KoferaDS/PySpark_ML
def train(df,hiperparameter):
    '''
    LDA training, returning an LDA model.
    input: - DataFrame
           - config (hyperparameter configuration dict)

    return: LDA model
    '''
    lda = LDA(featuresCol = hiperparameter['featuresCol'],
              maxIter = hiperparameter['maxIter'],
              seed = hiperparameter['seed'],
              checkpointInterval = hiperparameter['checkpointInterval'],
              k = hiperparameter['k'],
              optimizer = hiperparameter['optimizer'],
              learningOffset = hiperparameter['learningOffset'],
              learningDecay = hiperparameter['learningDecay'],
              subsamplingRate = hiperparameter['subsamplingRate'],
              optimizeDocConcentration = hiperparameter['optimizeDocConcentration'],
#              docConcentration = hiperparameter['docConcentration'],
              topicConcentration = hiperparameter['topicConcentration'],
              topicDistributionCol = hiperparameter['topicDistributionCol'],
              keepLastCheckpoint = hiperparameter['keepLastCheckpoint'])
    
    model = lda.fit(df)
    return model
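
As a usage sketch, the config dictionary could look like the following; the keys match the parameters read above, while the values are illustrative assumptions (roughly mirroring Spark's defaults), not tuned recommendations.

hiperparameter = {
    'featuresCol': 'features',
    'maxIter': 20,
    'seed': 42,
    'checkpointInterval': 10,
    'k': 10,
    'optimizer': 'online',
    'learningOffset': 1024.0,
    'learningDecay': 0.51,
    'subsamplingRate': 0.05,
    'optimizeDocConcentration': True,
    'topicConcentration': 0.1,
    'topicDistributionCol': 'topicDistribution',
    'keepLastCheckpoint': True,
}
model = train(df, hiperparameter)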
Example #8
def feature_engineering(class_balancedDf):
    # N-Gram
    ngram = NGram(n=2, inputCol="lemmatized", outputCol="ngrams")
    ngramDataFrame = ngram.transform(class_balancedDf)

    # Hashing TF
    hashingTF = HashingTF(inputCol="ngrams",
                          outputCol="rawFeatures",
                          numFeatures=20)
    featurizedData = hashingTF.transform(ngramDataFrame)

    # IDF
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    # K-Means
    kmeans = KMeans().setK(6).setSeed(1)
    kmodel = kmeans.fit(rescaledData).transform(rescaledData)

    #LDA
    lda = LDA(k=10, maxIter=10)
    ldamodel = lda.fit(kmodel).transform(kmodel)

    # changing label column to int
    data = ldamodel.withColumn(
        "label", ldamodel.label.cast("Integer")).drop("prediction")

    return data
Example #9
def perform_lda(documents, n_topics, n_words, beta, tokens_col):
    '''
    Performs LDA on a list of documents (each document being a list of tokens).
    Assumes that `documents` is a DataFrame with a column of unique ids (uid).
    
    '''
    cv = CountVectorizer(inputCol=tokens_col, outputCol="raw_features")
    cvmodel = cv.fit(documents)
    result_cv = cvmodel.transform(documents)
    
    # We apply tf-idf (term frequency-inverse document frequency) so that threads with many words do not dominate the topics.
    idf = IDF(inputCol="raw_features", outputCol="features")
    idfModel = idf.fit(result_cv)
    result_tfidf = idfModel.transform(result_cv) 
    
    # The commented-out "date" column is kept for time-series purposes.
    corpus = result_tfidf.select("uid", "features")#, "date")
    
    lda = LDA(k=n_topics, topicConcentration=beta)
    
    model = lda.fit(corpus)
    
    #retrieving topics, and the vocabulary constructed by the CountVectorizer
    topics = model.describeTopics(maxTermsPerTopic=n_words)
    vocab = cvmodel.vocabulary
    
    #getting topic distribution per document. 
    #topic_distribution = model.transform(corpus)[['topicDistribution', 'date']]
    
    # The topics are given as term indices; convert them to words and associate each with its weight.
    topics_with_weights = topics.rdd.map(lambda r: (r[0], ([(vocab[t],w) for t,w in zip(r[1], r[2])]), ' '.join([vocab[t] for t in r[1]]))).toDF().selectExpr("_1 as topic_number", "_2 as topic_weight", "_3 as topic")
    
    return topics_with_weights#, topic_distribution
Example #10
 def do_lda_with_count_vectorizer(self, k, rescaled_data, vocab):
     lda = LDA(k=k, seed=1, maxIter=100, optimizer="em", featuresCol="features", topicConcentration=5)
     lda_model = lda.fit(rescaled_data)
     transformed_df = lda_model.transform(rescaled_data).select("url", "topicDistribution")
     topics_description = lda_model.describeTopics().rdd\
         .map(lambda row: row['termIndices'])\
         .map(lambda idx_list: [vocab[idx] for idx in idx_list]) \
         .collect()
     return transformed_df, topics_description
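
A sketch of how the inputs might be prepared; `pages_df` (with "url" and "tokens" columns) and the surrounding object `analyzer` are hypothetical names used only for illustration.

from pyspark.ml.feature import CountVectorizer

cv_model = CountVectorizer(inputCol="tokens", outputCol="features").fit(pages_df)
rescaled_data = cv_model.transform(pages_df)
transformed_df, topics_description = analyzer.do_lda_with_count_vectorizer(
    k=10, rescaled_data=rescaled_data, vocab=cv_model.vocabulary)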
Example #11
def LDA_pipefit (data_ip, ipcol):
  text_col = ipcol
  from sparknlp.base import DocumentAssembler
  documentAssembler = DocumentAssembler().setInputCol(text_col).setOutputCol('document')
  from sparknlp.annotator import Tokenizer
  tokenizer = Tokenizer().setInputCols(['document']).setOutputCol('tokenized')
  from sparknlp.annotator import Normalizer
  normalizer = Normalizer().setInputCols(['tokenized']).setOutputCol('normalized').setLowercase(True)
  from sparknlp.annotator import LemmatizerModel
  lemmatizer = LemmatizerModel.pretrained().setInputCols(['normalized']).setOutputCol('lemmatized')
  from sparknlp.annotator import StopWordsCleaner
  stopwords_cleaner = StopWordsCleaner().setInputCols(['lemmatized']).setOutputCol('unigrams').setStopWords(eng_stopwords)
  from sparknlp.annotator import NGramGenerator
  ngrammer = NGramGenerator().setInputCols(['lemmatized']).setOutputCol('ngrams').setN(3).setEnableCumulative(True).setDelimiter('_')
  from sparknlp.annotator import PerceptronModel
  pos_tagger = PerceptronModel.pretrained('pos_anc').setInputCols(['document', 'lemmatized']).setOutputCol('pos')
  from sparknlp.base import Finisher
  finisher = Finisher().setInputCols(['unigrams', 'ngrams','pos'])
  from pyspark.ml import Pipeline
  pipeline = Pipeline().setStages([documentAssembler,
                                  tokenizer,
                                  normalizer,
                                  lemmatizer,
                                  stopwords_cleaner,
                                  pos_tagger,
                                  ngrammer,
                                  finisher])
  review_text_clean = ipcol
  processed_tweets = pipeline.fit(data_ip).transform(data_ip)
  from pyspark.sql.functions import concat
  processed_tweets = processed_tweets.withColumn('final',concat(F.col('finished_unigrams'), F.col('finished_ngrams')))
  from pyspark.ml.feature import CountVectorizer
  tfizer = CountVectorizer(inputCol='final',outputCol='tf_features')
  tf_model = tfizer.fit(processed_tweets)
  tf_result = tf_model.transform(processed_tweets)
  from pyspark.ml.feature import IDF
  idfizer = IDF(inputCol='tf_features', outputCol='tf_idf_features')
  idf_model = idfizer.fit(tf_result)
  tfidf_result = idf_model.transform(tf_result)
  from pyspark.ml.clustering import LDA

  num_topics = 3
  max_iter = 10

  lda = LDA(k=num_topics, maxIter=max_iter, featuresCol='tf_idf_features')
  lda_model = lda.fit(tfidf_result)
  from pyspark.sql import types as T
  vocab = tf_model.vocabulary
  def get_words(token_list):
      return [vocab[token_id] for token_id in token_list]
  udf_to_words = F.udf(get_words, T.ArrayType(T.StringType()))

  num_top_words = 15
  topics = lda_model.describeTopics(num_top_words).withColumn('topicWords', udf_to_words(F.col('termIndices')))
  topics_p=topics.toPandas()
  return topics_p
def main():
    spark.sql("CLEAR CACHE")
    business = spark.read.parquet("yelp-etl/business_etl").repartition(8)
    business.createOrReplaceTempView("business")
    review = spark.read.parquet("yelp-etl/review_etl").repartition(16)#.cache()
    review.createOrReplaceTempView("review")

    ## Location based reviews
    # spark.sql("SELECT b.state, COUNT(*) AS bus_rev_count FROM business b INNER JOIN review r ON b.business_id = r.business_id GROUP BY b.state ORDER BY bus_rev_count DESC").show()
    #
    # ## Choosing reviews from Ontario(state = "ON")
    on_bus_rev = spark.sql("SELECT r.review_id, b.business_id, r.text, r.label FROM business b INNER JOIN review r ON b.business_id = r.business_id WHERE b.state = 'ON' AND r.label = 0")

    ## Remove punctuations and spaces
    punct_remover = functions.udf(lambda x: remove_punct(x))
    review_df = on_bus_rev.select('review_id', 'business_id', punct_remover('text')).withColumnRenamed('<lambda>(text)', 'text')

    ## Tokenize
    tok = Tokenizer(inputCol="text", outputCol="words")

    ## Remove stop words
    stopwordList = ['','i','get','got','also','really','would','one','good','like','great','tri','love','two','three','took','awesome','me','bad','horrible','disgusting','terrible','fabulous','amazing','terrific','worst','best','fine','excellent','acceptable','my','exceptional','satisfactory','satisfying','super','awful','atrocious','unacceptable','poor','sad','gross','authentic','myself','cheap','expensive','we','our','ours','ourselves','you','your','yours','yourself','yourselves', 'he', 'him', 'his', 'himself','she','her','hers','herself','it','its','itself','they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then','once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each','few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn','weren', 'won', 'wouldn']

    stopword_rm = StopWordsRemover(inputCol="words", outputCol="words_nsw", stopWords=stopwordList)

    pipestages = [tok,stopword_rm]
    pipeline = Pipeline(stages = pipestages)
    model = pipeline.fit(review_df)
    tokenized_df = model.transform(review_df)

    ## Lemmatizing
    lemmatize_udf = functions.udf(lambda x: lemmatize(x), types.ArrayType(types.StringType()))
    lemmatized_df = tokenized_df.withColumn("lemmatized",lemmatize_udf("words_nsw")).select("review_id","business_id","lemmatized")
    # ## Stemming
    # stemmer_udf = functions.udf(lambda x: stem(x), types.ArrayType(types.StringType()))
    # stemmed_df = lemmatized_df.withColumn("stemmed", stemmer_udf("lemmatized")).drop(lemmatized_df["lemmatized"])


    ## Count Vectorizer
    cv = CountVectorizer(inputCol="lemmatized", outputCol="vectors")
    cv_model = cv.fit(lemmatized_df)
    cv_df = cv_model.transform(lemmatized_df).drop(lemmatized_df["lemmatized"])
    cv_model.save("topic_modelling/cvmodel_neg")

    idf = IDF(inputCol="vectors",outputCol="tfidf")
    idf_model = idf.fit(cv_df)
    result = idf_model.transform(cv_df)

    result = result.select("review_id","business_id","tfidf")

    lda = LDA(featuresCol='tfidf', k=5, seed=42, maxIter=50)
    model = lda.fit(result)
    model.write().overwrite().save("topic_modelling/ldamodel_neg")
    transformed = model.transform(result)
    transformed.write.parquet("topic_modelling/review_topics_neg",mode="overwrite")
    spark.stop()
    def clusteredData(self, dataset, cvModel):
        lda = LDA(k=20, seed=123, optimizer="em", featuresCol="features")
        ldamodel = lda.fit(dataset)

        # model.isDistributed()
        # model.vocabSize()

        ldaTopics = ldamodel.describeTopics()
        self.getTheMapping(ldaTopics, cvModel)
Example #14
def lda_train(result_tfidf):
    from pyspark.ml.linalg import Vectors, SparseVector
    from pyspark.ml.clustering import LDA
    #
    lda = LDA(k=10, seed=1, optimizer="em")
    lda.setMaxIter(100)
    #
    model = lda.fit(result_tfidf[['index', 'features']])
    # model = LDA.train(result_tfidf[['index', 'features']].rdd.map(list), k=num_topics, maxIterations=max_iterations)
    return model
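
For context, a sketch of building the expected `result_tfidf` input with 'index' and 'features' columns; `docs_df` with a 'tokens' column is an assumption.

from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.sql import functions as F

tokens_df = docs_df.withColumn("index", F.monotonically_increasing_id())
tf_df = CountVectorizer(inputCol="tokens", outputCol="tf").fit(tokens_df).transform(tokens_df)
result_tfidf = IDF(inputCol="tf", outputCol="features").fit(tf_df).transform(tf_df)
model = lda_train(result_tfidf)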
Example #15
def main():
    spark = SparkSession \
        .builder \
        .appName("Reddit Site:Get Data") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
 
    file="file:////l2/corpora/reddit/submissions/RS_2015-12.bz2"
    output=file[-14:-3]

    sc = spark.sparkContext
    print('\n\n\n starting read and filter')
    df = filterPosts(file,sc,spark)
 
    df= convertToVec(df, sc, spark, output, inputCol='tokens')

    num_topics=10
    
    print('\n\n\n LDA... \n\n\n')
    newLDA=False
    if newLDA:
        lda=LDA(featuresCol='vectors', k=num_topics, maxIter=50)
        lda_model=lda.fit(df.select('id','vectors'))
        lda_model.save(output+'_ldamodel')
    else:
        lda_model=LocalLDAModel.load(output+'_ldamodel')

    print('\n\n\n Describe Topics... \n\n\n')
    topic_indices=lda_model.describeTopics(maxTermsPerTopic=50)
    topic_indices.write.json(output+'_topics.json', mode='overwrite')
    

    print('\n\n\n reduce to subs\n\n\n')
    #subDF=df.select('subreddit','vectors').groupBy(df.subreddit).sum('vectors')
    subDF=df.select('subreddit','vectors').rdd.mapValues(lambda v: v.toArray()) \
        .reduceByKey(lambda x, y: x + y) \
        .mapValues(lambda x: DenseVector(x)) \
        .toDF(["subreddit", "vectors"])
        
    '''
    print('\n\n\n LDA... \n\n\n')

    lda=LDA(featuresCol='vectors', k=num_topics, maxIter=50)
    lda_model=lda.fit(subDF.select('subreddit','vectors'))
    
    print('\n\n\n Describe Topics... \n\n\n')
    topic_indices=lda_model.describeTopics(maxTermsPerTopic=50)
    topic_indices.write.json(output+'_topics.json', mode='overwrite')
    '''
    print('\n\n\n Transform DataSet \n\n\n')
    subDF=lda_model.transform(subDF).drop('vectors')
    #topicDF=lda_model.transform(vecDF)
    subDF.write.json(output+'_transformed.json', mode='overwrite')
def lda_train():
    # Loads data.
    dataset = spark.read.format("libsvm").load("train.libsvm", numFeatures=4758484)

    # Trains a LDA model.
    lda = LDA(k=20, maxIter=200)
    model = lda.fit(dataset)

    ll = model.logLikelihood(dataset)
    lp = model.logPerplexity(dataset)
    print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
    print("The upper bound on perplexity: " + str(lp))

    # Describe topics.
    topics = model.describeTopics(3)
    print("The topics described by their top-weighted terms:")
    topics.show(truncate=False)

    topics = model.describeTopics()
    topics_array = topics.select('termWeights').collect()
    topics_array = np.array([i[0] for i in topics_array])

    # Shows the result
    transformed = model.transform(dataset)
    transformed.show(truncate=False)

    user_vector = transformed.select('topicDistribution').collect()
    with open('idx.pickle', 'rb') as f:
        idx_item, item_idx, idx_user, user_idx, label, train = pickle.load(f)
    user_test = list(user_idx.keys())

    submit = []
    user_test_idx = []
    for uid in user_test:
        user_test_idx.append(user_idx.get(uid))
    for i in user_test_idx:
        item_rec = [idx_user[i]]
        vec = user_vector[i][0].toArray()  # Row('topicDistribution') -> dense NumPy vector
        sim = vec.dot(topics_array) / (np.linalg.norm(vec) * np.linalg.norm(topics_array))
        sim = np.argsort(-sim).tolist()
        [item_rec.append(idx_item[j]) for j in sim]
        submit.append(item_rec)
    df = pd.DataFrame(submit)
    df.to_csv('submit.csv', header=None, index=None)

    # Save
    model.save('lda.model')

    # Stop
    spark.stop()
Example #17
def lda_model_score(df, num_topics):
    """ 
    LDA pipeline: train LDA, extract topics, predict topic for each data point
    -- 
    input : df -> dataframe of reviews, num_topics -> int
    output : lda -> Spark LDA estimator with initialized parameters, model -> trained LDA model, topics -> the identified topics (top terms per topic), transformed -> reviews with predicted topic distributions
    """
    lda = LDA(
        k=num_topics,
        optimizer="em")  # call spark LDA, initialized by number of topics
    model = lda.fit(df)  # fit model
    topics = model.describeTopics(maxTermsPerTopic=30).collect(
    )  # "describe" the topics by topic vocabulary determined by LDA
    transformed = model.transform(df)  # generate predict
    return lda, model, topics, transformed
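
A hypothetical call; `reviews_df` is assumed to carry a "features" column (LDA's default featuresCol), and the loop simply prints the top term indices and weights per topic.

lda, model, topics, transformed = lda_model_score(reviews_df, num_topics=5)
for row in topics:
    print(row['topic'], row['termIndices'][:5], [round(w, 3) for w in row['termWeights'][:5]])
transformed.select('topicDistribution').show(5, truncate=False)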
Example #18
def run_ml_pipeline(nlpPipelineDF, num_topics, max_iterations, vocabSize,
                    minDF, maxDF):
    """Define a Spark LDA topic modelling pipeline"""
    cv = CountVectorizer(
        inputCol="allTokens",
        outputCol="features",
        vocabSize=vocabSize,
        minDF=minDF,
        maxDF=maxDF,
        minTF=1.0,
    )
    idf = IDF(inputCol="features", outputCol="idf")
    lda = LDA(
        k=num_topics,
        maxIter=max_iterations,
        optimizer="online",
        seed=1,
        learningOffset=
        100.0,  # If high, early iterations are downweighted during training
        learningDecay=
        0.51,  # Set between [0.5, 1) to guarantee asymptotic convergence
    )

    mlPipeline = Pipeline(stages=[cv, idf, lda])
    mlModel = mlPipeline.fit(nlpPipelineDF)
    ldaModel = mlModel.stages[2]
    return mlModel, ldaModel
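
A hypothetical invocation is sketched below (all argument values are assumptions). Note that as written, the LDA stage keeps its default featuresCol="features", i.e. the CountVectorizer output, so the "idf" column produced by the IDF stage is carried through the pipeline but is not what the topic model trains on.

mlModel, ldaModel = run_ml_pipeline(nlpPipelineDF, num_topics=20, max_iterations=50,
                                    vocabSize=5000, minDF=5.0, maxDF=0.8)
ldaModel.describeTopics(5).show(truncate=False)
topicsDF = mlModel.transform(nlpPipelineDF).select("topicDistribution")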
Example #19
def UsefulnessPredictionLDA(trainingdata, model):
    # Data Preprocessing
    tokenizer = Tokenizer(inputCol="review_text", outputCol="tokens_word")

    remover = StopWordsRemover(inputCol="tokens_word",
                               outputCol="filtered_tokens_word")
    cv = CountVectorizer(inputCol="filtered_tokens_word",
                         outputCol="raw_features",
                         minDF=2.0)
    idf = IDF(inputCol="raw_features", outputCol="features")

    # Extract LDA topic feature
    lda = LDA(k=30, maxIter=10)
    if model == 'RandomForest':
        model = RandomForestRegressor(featuresCol="topicDistribution")
    pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, lda, model])
    evaluator_rmse = RegressionEvaluator(labelCol="label",
                                         predictionCol="prediction",
                                         metricName="rmse")
    paramGrid = ParamGridBuilder() \
        .addGrid(cv.vocabSize, [150, 200, 250]) \
        .build()
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator_rmse,
                              numFolds=4)  # use 3+ folds in practice
    cvModel = crossval.fit(trainingdata)
    # Explain params for the selected model
    print(cvModel.explainParams())
    return cvModel
Example #20
def UsefulnessPredictionLDAWithoutCV(trainingdata, model):
    # Data Preprocessing
    tokenizer = Tokenizer(inputCol="review_text", outputCol="tokens_word")
    remover = StopWordsRemover(inputCol="tokens_word",
                               outputCol="filtered_tokens_word")
    cv = CountVectorizer(inputCol="filtered_tokens_word",
                         outputCol="raw_features",
                         minDF=2.0,
                         vocabSize=250)
    idf = IDF(inputCol="raw_features", outputCol="features")

    # Extract LDA topic feature
    lda = LDA(k=30, maxIter=10)
    if model == 'RandomForest':
        model = RandomForestRegressor(featuresCol="topicDistribution")

    pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, lda, model])
    evaluator_rmse = RegressionEvaluator(labelCol="label",
                                         predictionCol="prediction",
                                         metricName="rmse")

    cvModel = pipeline.fit(trainingdata)

    # Explain params for the selected model
    print(cvModel.explainParams())
    return cvModel
Example #21
def main():
    subreddit_group = spark.read.parquet(input_file).repartition(2000)
    # subreddit_group.show()

    #hashing = HashingTF(inputCol="comments", outputCol="features")
    count_vectorizer = CountVectorizer(inputCol="comments",
                                       outputCol="features")

    lda = LDA(k=10, maxIter=10, optimizer='online')

    pipeline = Pipeline(stages=[count_vectorizer, lda])
    model = pipeline.fit(subreddit_group)

    predictions = model.transform(subreddit_group).selectExpr(
        'id', 'topicDistribution')

    change_to_str = F.udf(to_text)

    topics_df = predictions.select(
        predictions['id'],
        change_to_str(
            predictions['topicDistribution']).alias('topicDistribution'))

    #topics_df.show(20, False)
    topics_df.write.option('sep', ',').save(output,
                                            format='csv',
                                            mode='overwrite')
Example #22
def training_model(train,
                   k=10,
                   maxiter=120,
                   features_name="features",
                   optimizer_type="online",
                   seed=123):
    lda = LDA(k=k,
              seed=seed,
              optimizer=optimizer_type,
              featuresCol=features_name,
              subsamplingRate=0.1,
              learningDecay=0.5,
              optimizeDocConcentration=True,
              maxIter=maxiter)
    ldamodel = lda.fit(train)
    predictionTrain = ldamodel.transform(train)
    return (ldamodel, predictionTrain)
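
A hypothetical call; `train_df` with a "features" column is assumed (features_name defaults to "features" above).

ldamodel, predictionTrain = training_model(train_df, k=15, maxiter=60)
predictionTrain.select("topicDistribution").show(5, truncate=False)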
Example #23
def main():
    comments = spark.read.json(input_comments,
                               schema=comments_schema).repartition(100)
    comm = comments.select(comments['subreddit'].alias('id'),
                           comments['body']).limit(50)
    preprocess = F.udf(clean_data,
                       returnType=types.ArrayType(types.StringType()))
    comm_split = comm.select(comm['id'],
                             F.split(comm['body'], ' ').alias('comments'))
    sub_group = comm_split.groupBy(comm_split['id']).agg(F.collect_list('comments').alias('comments')) \
                                                    .select(F.col('id'), F.col('comments'))

    comm_lemm = sub_group.select(
        sub_group['id'],
        preprocess(sub_group['comments']).alias('comments')).cache()

    # hashing_model = HashingTF(inputCol="comments", outputCol="features")
    # result = hashing_model.transform(comm_lemm)

    cv = CountVectorizer(inputCol="comments", outputCol="features")
    count_vectorizer_model = cv.fit(comm_lemm)
    result = count_vectorizer_model.transform(comm_lemm)
    result.show(truncate=False)
    #
    # vocabArray = count_vectorizer_model.vocabulary
    # print(vocabArray)

    corpus = result.select(result['id'], result['features']).cache()

    lda = LDA(k=5, optimizer='online')
    lda_model = lda.fit(corpus)

    transformed = lda_model.transform(corpus)
    transformed.show(truncate=False)

    topic_text = F.udf(to_text)
    topics_df = transformed.select(
        transformed['id'],
        topic_text(
            transformed['topicDistribution']).alias('topicDistribution'))
    #topics_df.show(truncate=False)

    topics_df.write.option('sep', ',').save(output_file,
                                            format='csv',
                                            mode='overwrite')
Example #24
def cv_idf_lda(df):
    '''
    CountVectorizer, IDF, LDA
    :param df:
    :return:
    '''
    # CountVectorizer
    cv = CountVectorizer(inputCol="content", outputCol="features")
    model_cv = cv.fit(df)
    vocabulary = model_cv.vocabulary
    df_cv = model_cv.transform(df)
    df_cv.cache()
    # # IDF
    idf = IDF(inputCol="features", outputCol="cv")
    model_idf = idf.fit(df_cv)
    df_idf = model_idf.transform(df_cv)
    df_idf.cache()

    def getwords(row):
        '''
        Map term indices to their corresponding vocabulary words.
        :param row:
        :return:
        '''
        words = list()
        for index in row[3]:
            words.append(vocabulary[index])
        return [row[0], row[1], row[2], words]

    # LDA
    lda = LDA(k=10, seed=1, optimizer="em")
    model_lda = lda.fit(df_idf)
    print(model_lda.describeTopics(maxTermsPerTopic=10))
    # model_lda.save('file:///home/pxz/model_lda/lda15')
    # print(lda[df_idf])
    # print(type(lda[df_idf]))
    # # maxTermsPerTopic: number of top terms to keep per topic
    df_des = model_lda.describeTopics(maxTermsPerTopic=15)
    rdd_des = df_des.select("topic", "termIndices", "termWeights", df_des.termIndices).rdd.map(getwords)
    df_des = sparkEntrance.spark.createDataFrame(rdd_des, ['topic', 'termIndices', 'termWeights', 'words'])
    # return df_des
    df_des.select('topic', 'words', 'termWeights').show(truncate=False)
Example #25
def lda_model(data):

    lda = LDA(
        k=LDA_CLUSTERS,
        # seed=123,
        # optimizer="em",
        featuresCol="vectors")

    # todo Gridsearch best parameters

    model = lda.fit(data)
    topics = model.describeTopics(maxTermsPerTopic=15)
    log.info("Learned topics (as distributions over vocab of " +
             str(model.vocabSize()) + " words):")
    wordNumbers = 10
    topicIndices = model.describeTopics(maxTermsPerTopic=wordNumbers)
    topicIndices.show()
    # Note: does not work as shown in the docs; support appears to be incomplete in the current Python API.
    show_lda_weights(model, topics)
Example #26
def trainModel(docMatrix, savemodel, k, iterations=10, parallelization=16):
    data = mmread(docMatrix)
    rowRange = sc.parallelize(range(data.shape[0]), parallelization)
    dataSpark = spark.createDataFrame(
        rowRange.map(lambda i: Row(
            label=i, features=sparkToScipySparse(data.getrow(i)))))
    lda = LDA(k=k, maxIter=iterations)
    model = lda.fit(dataSpark)
    model.save(savemodel)

    topicMatrix = model.topicsMatrix().toArray()
    topicMatrix = topicMatrix.T
    topicMatrix = topicMatrix / topicMatrix.sum(axis=0)
    print('TODO: give wordXtopic.mtx a path')
    mmwrite('wordXtopic.mtx', topicMatrix)

    print('TODO: give docXtopic.mtx a path')
    docXTopics = model.transform(dataSpark)
    dxT = docXTopics.collect()
    dxT_v2 = np.array([dxtI['topicDistribution'] for dxtI in dxT])
    mmwrite('docXtopic.mtx', dxT_v2)
Example #27
def set_lda_model(params: Dict[str, Any]):
    lda = LDA(
        k=params['topics'],
        maxIter=params['iter'],
        optimizer="online",
        seed=1,
        learningOffset=
        100.0,  # If high, early iterations are downweighted during training
        learningDecay=
        0.51,  # Set between [0.5, 1) to guarantee asymptotic convergence
    )
    return lda
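
A sketch of the expected params dictionary; the keys come from the function body, while the values and `features_df` are illustrative assumptions.

params = {'topics': 20, 'iter': 50}
lda = set_lda_model(params)
model = lda.fit(features_df)  # features_df with a default "features" column is assumed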
Example #28
def content_recom(self,
                  file1,
                  file2,
                  tfidf_model,
                  tfidf_lda_model,
                  sentiment_file,
                  all_business_parquet,
                  key_words,
                  num_results=20):

    from pyspark import SparkContext
    from pyspark.sql import SparkSession
    sparkconf_builder = spark_celery_app.sparkconf_builder
    spark_conf = sparkconf_builder()
    sc = SparkContext.getOrCreate(conf=spark_conf)
    spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()

    data = spark.read.json(file1)
    df_business = spark.read.parquet(file2)

    df = data.select('business_id', 'text')
    review_rdd = df.rdd.map(tuple).reduceByKey(operator.add)
    review_df = spark.createDataFrame(review_rdd).withColumnRenamed(
        '_1', 'business_id').withColumnRenamed('_2', 'text')

    tfidf_model = PipelineModel.load(tfidf_model)
    result_tfidf = tfidf_model.transform(review_df)
    yelp = result_tfidf

    lda = LDA(k=15, maxIter=100)  # note: unused here; the pre-trained model is loaded below instead
    model = LocalLDAModel.load(tfidf_lda_model)
    # lda output column topicDistribution
    lda_df = model.transform(yelp)
    lda_vec = lda_df.select(
        'business_id',
        'topicDistribution').rdd.map(lambda x: (x[0], x[1])).collect()

    result = get_keywords_recoms(key_words, num_results, tfidf_model, model,
                                 lda_vec)
    df_sentiment = spark.read.json(sentiment_file)
    df_content_rest = df_sentiment.join(
        result, 'business_id',
        'inner').orderBy("sentiment_score", ascending=False).limit(num_results)
    all_busi_df = spark.read.parquet(all_business_parquet)
    df_rest_result = all_busi_df.join(df_content_rest, 'business_id',
                                      'right').select('business_id',
                                                      'sentiment_score',
                                                      'name', 'categories',
                                                      'score', 'latitude',
                                                      'longitude')
    df_rest_result.show()
    collected_df_rest_result = df_rest_result.collect()
    return collected_df_rest_result
Example #30
def main():
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)
    sys.stderr = codecs.getwriter('utf8')(sys.stderr)

    spark = SparkSession.builder.appName("LDA Batch Model").getOrCreate()
    sc = spark.sparkContext

    print(AWS_ACCESS_KEY_ID)
    print(AWS_SECRET_ACCESS_KEY)
    sc._jsc.hadoopConfiguration().set(
        "fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    sc._jsc.hadoopConfiguration().set('fs.s3a.access.key', AWS_ACCESS_KEY_ID)
    sc._jsc.hadoopConfiguration().set('fs.s3a.secret.key',
                                      AWS_SECRET_ACCESS_KEY)

    custom_stop_words = utils.load_stop_words(sc)
    texts_df = utils.load_texts(spark)

    pipeline = ml_utils.set_pipeline(custom_stop_words)
    model = pipeline.fit(texts_df)

    result = model.transform(texts_df)

    # Cluster the documents into three topics using LDA
    lda = LDA(k=NUMBER_OF_TOPICS, maxIter=5, featuresCol="vectors")
    lda_model = lda.fit(result)

    # Describe topics
    topics = lda_model.describeTopics(3)
    print("The topics described by their top-weighted terms:")
    topics.show(truncate=False)

    # Shows the result
    transformed = lda_model.transform(result)
    transformed.show(truncate=False)

    # Save and load model
    lda_model.save("s3a://current-models/LDAModel")

    sc.stop()
Example #31
    def modelData(self, corp):
        print("Data modeling")
        #Cluster the data into n topics using LDA
        ldaModel = None

        if (self.persistSteps and not self.recompute
                and os.path.isdir(self.stepsPath + "ldaModel")):
            print("Model exist, loading")
            ldaModel = LocalLDAModel.load(self.stepsPath + "ldaModel")
        else:
            print("Creating Model")
            lda = LDA(k=self.kTopics, maxIter=100, optimizer='online')
            ldaModel = lda.fit(corp)
            if (self.persistSteps):
                print("Saving model")
                if (os.path.isdir(self.stepsPath + "ldaModel")):
                    shutil.rmtree(self.stepsPath + "ldaModel")
                ldaModel.save(self.stepsPath + "ldaModel")

        print("Extracting Topics")
        self.topicIndices = ldaModel.describeTopics(maxTermsPerTopic=5)

        if (self.persistSteps and not self.recompute
                and os.path.isdir(self.stepsPath + "predictions")):
            print("Predictions exist, loading")
            self.predictions = self.spark.read.load(self.stepsPath +
                                                    "predictions")
        else:
            print("Predicting Data")
            self.predictions = ldaModel.transform(corp)
            self.topics = ldaModel.topicsMatrix()
            if (self.persistSteps):
                print("Saving predictions data")
                if (os.path.isdir(self.stepsPath + "predictions")):
                    shutil.rmtree(self.stepsPath + "predictions")
                self.predictions.select(
                    "label", "features",
                    "topicDistribution").write.save(self.stepsPath +
                                                    "predictions")
Example #32
def main():
    spark = SparkSession.builder \
        .appName("Spark CV-job ad matching") \
        .config("spark.some.config.option", "some-value") \
        .master("local[*]") \
        .getOrCreate()

    VOCAB_SIZE = 100
    MIN_DF = 1.0
    TOPIC_NUM = 50

    df_jobs = spark.read.json("alljobs4rdd/alljobs.jsonl").filter(
        "description is not NULL")

    tokenizer = Tokenizer(inputCol="description", outputCol="words")
    tokenized = tokenizer.transform(df_jobs)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    removed = remover.transform(tokenized)

    processed = removed.rdd.map(lambda row: (
        row.jobId, lemmatize(strip_punctuation(row.filtered)))).toDF(
            ["jobid", "processed"])

    countVectorizer = CountVectorizer(inputCol="processed",
                                      outputCol="rawFeatures",
                                      vocabSize=VOCAB_SIZE,
                                      minDF=MIN_DF,
                                      binary=False)
    cv_model = countVectorizer.fit(processed)
    featurizedData = cv_model.transform(processed)

    lda = LDA(k=TOPIC_NUM, seed=4314, optimizer="em")
    lda.setFeaturesCol("rawFeatures")
    model = lda.fit(featurizedData)
    vocab = cv_model.vocabulary
    model.describeTopics().rdd.map(lambda row: (row.topic, [vocab[x] for x in row.termIndices])).toDF(["Topic", "words"])\
    .coalesce(1).rdd.saveAsTextFile('lda-topics-lemmatized')
from pyspark.ml.feature import Tokenizer, CountVectorizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.drop("features"))
cv = CountVectorizer()\
  .setInputCol("DescOut")\
  .setOutputCol("features")\
  .setVocabSize(500)\
  .setMinTF(0)\
  .setMinDF(0)\
  .setBinary(True)
cvFitted = cv.fit(tokenized)
prepped = cvFitted.transform(tokenized)


# COMMAND ----------

from pyspark.ml.clustering import LDA
lda = LDA().setK(10).setMaxIter(5)
print(lda.explainParams())
model = lda.fit(prepped)


# COMMAND ----------

model.describeTopics(3).show()
cvFitted.vocabulary


# COMMAND ----------

Example #34
Run with:
  bin/spark-submit examples/src/main/python/ml/lda_example.py
"""

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("LDAExample") \
        .getOrCreate()

    # $example on$
    # Loads items.
    dataset = spark.read.format("libsvm").load("items/mllib/sample_lda_libsvm_data.txt")

    # Trains a LDA model.
    lda = LDA(k=10, maxIter=10)
    model = lda.fit(dataset)

    ll = model.logLikelihood(dataset)
    lp = model.logPerplexity(dataset)
    print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
    print("The upper bound bound on perplexity: " + str(lp))

    # Describe topics.
    topics = model.describeTopics(3)
    print("The topics described by their top-weighted terms:")
    topics.show(truncate=False)

    # Shows the result
    transformed = model.transform(dataset)
    transformed.show(truncate=False)
featurizedData = cvmodel.transform(clean_text)

vocab = cvmodel.vocabulary
vocab_broadcast = sc.broadcast(vocab)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

################################################################################################
#
#   LDA Clustering - Find Data-driven Topics
#
################################################################################################

lda = LDA(k=25, seed=123, optimizer="em", featuresCol="features")

ldamodel = lda.fit(rescaledData)

#model.isDistributed()
#model.vocabSize()

ldatopics = ldamodel.describeTopics()
ldatopics.show(25)

def map_termID_to_Word(termIndices):
    words = []
    for termID in termIndices:
        words.append(vocab_broadcast.value[termID])
    
    return words
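
This helper is typically applied as a UDF over the describeTopics() output; below is a minimal sketch, assuming the `ldatopics` DataFrame and `vocab_broadcast` defined above are in scope.

from pyspark.sql import functions as F
from pyspark.sql import types as T

map_words_udf = F.udf(map_termID_to_Word, T.ArrayType(T.StringType()))
ldatopics_words = ldatopics.withColumn("topicWords", map_words_udf(F.col("termIndices")))
ldatopics_words.select("topic", "topicWords").show(25, truncate=False)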
Example #36
	def trainModel(self):
		
		logger.info("Training the model...")		

		query = '''select page_id, max(page_title) as page_title from cooladata where date_range(all) and page_id is not null group by page_id;'''

		def SQLtoURL(query):
    
    			data = query.replace('\n', ' ').replace('\t',' ').replace('   ',' ').replace('  ',' ')
    			return data


		def QueryXXXXX(query, file = None):
   
    			session = Session()
    			response = session.post(data = {'tq': query,}, url = 'https://app.XXXXXX.com/api/v2/projects/115659/cql/', headers = {'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'},)
    			return response.content

		table = json.loads(codecs.decode(QueryXXXXX(SQLtoURL(query)),'utf-8'))['table']
		title_list = [x['c'] for x in table['rows']]
		table_cols = [d['label'] for d in table['cols']]  
		def convert_row(row):
    			rowlist = [d['v'] for d in row]
    			return rowlist

		rd = self.sc.parallelize(title_list).map(convert_row)
		titleData = self.spark.createDataFrame(rd, table_cols)
		titleData = titleData.dropna()
		
		hebrew_stopwords = stop_words()
		def rmv(words):
    			for punc in punctuation:
        			words = words.replace(punc,"")
    			for hword in hebrew_stopwords:
        			words = words.replace(hword, " ")
    			return words

		self.spark.udf.register("rmv", rmv, StringType())
		titleData.registerTempTable("wordstable")
		cleanedSentenceData = self.spark.sql("select page_id, page_title, rmv(page_title) as cleanedSentence from wordstable")
		tokenizer = Tokenizer(inputCol="cleanedSentence", outputCol="words")
		wordsData = tokenizer.transform(cleanedSentenceData)

		cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", minDF = 2.0)
		cvModel = cv.fit(wordsData)
		featurizedData = cvModel.transform(wordsData)

		idf = IDF(inputCol="rawFeatures", outputCol="features")
		idfModel = idf.fit(featurizedData)
		rescaledData = idfModel.transform(featurizedData)

		lda = LDA(k=100)
		ldaModel = lda.fit(rescaledData)
		postFactorizedData = ldaModel.transform(rescaledData)

		norm = Normalizer(inputCol = "topicDistribution", outputCol="normTopicDist")
		scaledFactorizedNormalizedData = norm.transform(postFactorizedData)
		
		self.model = scaledFactorizedNormalizedData
		
		logger.info("model is built!")