Example #1
def run_nlp_pipeline(df):
    """Perform lemmatization using Spark-NLP (add-on library)"""
    # Wrap the raw text column into a Spark NLP document annotation
    document_assembler = DocumentAssembler() \
        .setInputCol("words_joined") \
        .setOutputCol("document")

    # Obtain tokens from a string
    tokenizer = Tokenizer() \
        .setInputCols(["document"]) \
        .setOutputCol("token")

    # Use a spaCy lemma dictionary to train the Spark NLP lemmatizer
    lemmatizer = Lemmatizer() \
        .setInputCols(["token"]) \
        .setOutputCol("lemma") \
        .setDictionary(LEMMAS, key_delimiter="->", value_delimiter="\s+", read_as="TEXT")

    finisher = Finisher() \
        .setInputCols(["lemma"]) \
        .setIncludeMetadata(False)

    nlpPipeline = Pipeline(stages=[document_assembler, tokenizer, lemmatizer, finisher])
    nlpPipelineDF = nlpPipeline.fit(df) \
        .transform(df) \
        .withColumnRenamed('finished_lemma', 'allTokens')
    return nlpPipelineDF
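
A minimal usage sketch for run_nlp_pipeline, assuming Spark NLP is installed and the classes used above (DocumentAssembler, Tokenizer, Lemmatizer, Finisher, Pipeline) are in scope; the LEMMAS path and sample data are placeholders, and the input DataFrame only needs a words_joined string column.

import sparknlp

spark = sparknlp.start()           # starts a Spark session with Spark NLP loaded
LEMMAS = "lemmas.txt"              # hypothetical "lemma -> forms" dictionary file
df = spark.createDataFrame([("the cats were running",)], ["words_joined"])
run_nlp_pipeline(df).select("allTokens").show(truncate=False)
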
Example #2
def sparknlp_transform(df):
    """Tokenize, normalize, lemmatize and remove stopwords from the 'review' column."""
    documentAssembler = DocumentAssembler() \
        .setInputCol('review') \
        .setOutputCol('document')
    tokenizer = Tokenizer() \
        .setInputCols(['document']) \
        .setOutputCol('token')
    normalizer = Normalizer() \
        .setInputCols(['token']) \
        .setOutputCol('normalized') \
        .setLowercase(True)
    lemmatizer = LemmatizerModel.pretrained() \
        .setInputCols(['normalized']) \
        .setOutputCol('lemma')
    # eng_stopwords is expected to be a list of English stopwords defined elsewhere
    stopwords_cleaner = StopWordsCleaner() \
        .setInputCols(['lemma']) \
        .setOutputCol('clean_token') \
        .setCaseSensitive(False) \
        .setStopWords(eng_stopwords)
    # The Finisher converts annotations back into plain, human-readable Spark columns
    finisher = Finisher() \
        .setInputCols(['clean_token']) \
        .setCleanAnnotations(True)
    pipeline = Pipeline() \
        .setStages([
            documentAssembler,
            tokenizer,
            normalizer,
            lemmatizer,
            stopwords_cleaner,
            finisher
        ])
    data = pipeline.fit(df).transform(df)
    return data
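
A hedged usage sketch: eng_stopwords can be taken from Spark MLlib's bundled stopword lists, and the input DataFrame only needs a review text column (an active Spark NLP session, created with sparknlp.start(), is assumed).

from pyspark.ml.feature import StopWordsRemover

eng_stopwords = StopWordsRemover.loadDefaultStopWords('english')  # built-in English stopword list
reviews = spark.createDataFrame([("The food was great and the service was friendly",)], ["review"])
# Finisher's default output column is finished_<inputCol>, here finished_clean_token.
sparknlp_transform(reviews).select("finished_clean_token").show(truncate=False)
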
Example #3
def LDA_pipefit(data_ip, ipcol):
    """Clean the text column ipcol with Spark NLP, build TF-IDF features and fit an LDA topic model."""
    from sparknlp.base import DocumentAssembler, Finisher
    from sparknlp.annotator import (Tokenizer, Normalizer, LemmatizerModel,
                                    StopWordsCleaner, NGramGenerator, PerceptronModel)
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import CountVectorizer, IDF
    from pyspark.ml.clustering import LDA
    from pyspark.sql import functions as F, types as T

    text_col = ipcol
    documentAssembler = DocumentAssembler().setInputCol(text_col).setOutputCol('document')
    tokenizer = Tokenizer().setInputCols(['document']).setOutputCol('tokenized')
    normalizer = Normalizer().setInputCols(['tokenized']).setOutputCol('normalized').setLowercase(True)
    lemmatizer = LemmatizerModel.pretrained().setInputCols(['normalized']).setOutputCol('lemmatized')
    # eng_stopwords is expected to be a list of English stopwords defined elsewhere
    stopwords_cleaner = StopWordsCleaner().setInputCols(['lemmatized']).setOutputCol('unigrams').setStopWords(eng_stopwords)
    ngrammer = NGramGenerator().setInputCols(['lemmatized']).setOutputCol('ngrams').setN(3).setEnableCumulative(True).setDelimiter('_')
    pos_tagger = PerceptronModel.pretrained('pos_anc').setInputCols(['document', 'lemmatized']).setOutputCol('pos')
    finisher = Finisher().setInputCols(['unigrams', 'ngrams', 'pos'])
    pipeline = Pipeline().setStages([documentAssembler,
                                     tokenizer,
                                     normalizer,
                                     lemmatizer,
                                     stopwords_cleaner,
                                     pos_tagger,
                                     ngrammer,
                                     finisher])
    processed_tweets = pipeline.fit(data_ip).transform(data_ip)
    processed_tweets = processed_tweets.withColumn('final', F.concat(F.col('finished_unigrams'), F.col('finished_ngrams')))

    # Term-frequency (bag-of-words) features
    tfizer = CountVectorizer(inputCol='final', outputCol='tf_features')
    tf_model = tfizer.fit(processed_tweets)
    tf_result = tf_model.transform(processed_tweets)

    # Inverse-document-frequency re-weighting
    idfizer = IDF(inputCol='tf_features', outputCol='tf_idf_features')
    idf_model = idfizer.fit(tf_result)
    tfidf_result = idf_model.transform(tf_result)

    # Fit the LDA topic model
    num_topics = 3
    max_iter = 10
    lda = LDA(k=num_topics, maxIter=max_iter, featuresCol='tf_idf_features')
    lda_model = lda.fit(tfidf_result)

    # Map each topic's term indices back to vocabulary words
    vocab = tf_model.vocabulary

    def get_words(token_list):
        return [vocab[token_id] for token_id in token_list]

    udf_to_words = F.udf(get_words, T.ArrayType(T.StringType()))

    num_top_words = 15
    topics = lda_model.describeTopics(num_top_words).withColumn('topicWords', udf_to_words(F.col('termIndices')))
    topics_p = topics.toPandas()
    return topics_p
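
A hedged usage sketch for LDA_pipefit; the toy tweets are placeholders, and eng_stopwords and the spark session are assumed to be defined as in the previous sketches.

tweets = spark.createDataFrame(
    [("spark nlp makes topic modelling on tweets straightforward",),
     ("lda groups tweets into latent topics",),
     ("tf idf features feed the lda topic model",)],
    ["tweet_text"])
topics_pdf = LDA_pipefit(tweets, "tweet_text")
# describeTopics yields one row per topic: termIndices, termWeights and the mapped topicWords.
print(topics_pdf[["topic", "topicWords"]])
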
Example #4
def build_data(df):
    document_assembler1 = DocumentAssembler() \
        .setInputCol('question1').setOutputCol('document1')

    tokenizer1 = Tokenizer() \
        .setInputCols(['document1']) \
        .setOutputCol('token1')

    finisher1 = Finisher() \
        .setInputCols(['token1']) \
        .setOutputCols(['ntokens1']) \
        .setOutputAsArray(True) \
        .setCleanAnnotations(True)

    document_assembler2 = DocumentAssembler() \
        .setInputCol('question2').setOutputCol('document2')

    tokenizer2 = Tokenizer() \
        .setInputCols(['document2']) \
        .setOutputCol('token2')

    finisher2 = Finisher() \
        .setInputCols(['token2']) \
        .setOutputCols(['ntokens2']) \
        .setOutputAsArray(True) \
        .setCleanAnnotations(True)

    p_pipeline = Pipeline(stages=[document_assembler1, tokenizer1, finisher1, \
                                  document_assembler2, tokenizer2, finisher2])
    p_model = p_pipeline.fit(df)
    processed1 = p_model.transform(df)
    label1 = processed1.select('is_duplicate').collect()
    label_array1 = np.array(label1)
    label_array1 = label_array1.astype(int)  # np.int was removed from NumPy; the builtin int behaves the same here

    return processed1, label_array1
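
A hedged usage sketch for build_data with toy question pairs; real input would be the Quora question-pairs dataset, and the Spark NLP session is assumed as before.

import numpy as np

pairs = spark.createDataFrame(
    [("how do i learn spark", "how can i learn spark", 1),
     ("what is lda", "where is paris", 0)],
    ["question1", "question2", "is_duplicate"])
processed, labels = build_data(pairs)
processed.select("ntokens1", "ntokens2").show(truncate=False)
print(labels.shape)   # (n_rows, 1) array of 0/1 duplicate labels
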
Example #5
def setup_sentiment_pipeline():
    lexicon = 'lexicon.txt'
    document_assembler = DocumentAssembler() \
        .setInputCol("rawDocument") \
        .setOutputCol("document") \
        .setIdCol("sentence_id")
    sentence_detector = SentenceDetector() \
        .setInputCols(["document"]) \
        .setOutputCol("sentence")
    tokenizer = Tokenizer() \
        .setInputCols(["sentence"]) \
        .setOutputCol("token")
    lemmatizer = Lemmatizer() \
        .setInputCols(["token"]) \
        .setOutputCol("lemma") \
        .setDictionary("txt/corpus/lemmas_small.txt",
                       key_delimiter="->",
                       value_delimiter="\t")
    sentiment_detector = SentimentDetector() \
        .setInputCols(["lemma", "sentence"]) \
        .setOutputCol("sentiment_score") \
        .setDictionary("txt/corpus/{0}".format(lexicon), ",")
    finisher = Finisher() \
        .setInputCols(["sentiment_score"]) \
        .setOutputCols(["sentiment"])
    pipeline = Pipeline(stages=[
        document_assembler, sentence_detector, tokenizer, lemmatizer,
        sentiment_detector, finisher
    ])
    return pipeline
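
setup_sentiment_pipeline only assembles the pipeline; below is a hedged sketch of fitting and applying it, assuming the dictionary files referenced above exist and a Spark NLP session is active.

docs = spark.createDataFrame(
    [(1, "The plot was wonderful and the acting was superb."),
     (2, "A dull, disappointing film.")],
    ["sentence_id", "rawDocument"])
sentiment_pipeline = setup_sentiment_pipeline()
# The pipeline is "trained" only from the lemma and sentiment dictionaries, not from docs itself.
result = sentiment_pipeline.fit(docs).transform(docs)
result.select("sentence_id", "sentiment").show(truncate=False)
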
Example #6
    def spark_nlp_sentiment_analysis(self):
        """
        transform reviews with tokenization, normalization, lemmatization and sentiment dict
        calculate sentiment score and aggregate with business ID
        """
        lemma_file = "s3a://{}/{}/{}".format(
            self.s3_config["BUCKET"], self.s3_config["TEXT_CORPUS_FOLDER"],
            self.s3_config["LEMMA_FILE"])
        sentiment_file = "s3a://{}/{}/{}".format(
            self.s3_config["BUCKET"], self.s3_config["TEXT_CORPUS_FOLDER"],
            self.s3_config["SENTIMENT_FILE"])
        yelp_rating_filename = "s3a://{}/{}/{}".format(
            self.s3_config["BUCKET"], self.s3_config["YELP_FOLDER"],
            self.s3_config["YELP_REVIEW_DATA_FILE"])
        self.df_yelp_review = self.spark.read.json(yelp_rating_filename)
        self.df_yelp_review = self.df_yelp_review \
                                    .select("user_id", "business_id", "stars", "text") \
                                    .withColumnRenamed("stars", "ratings")
        self.df_id_filter = self.df_ranking.select("business_id")
        self.df_yelp_review = self.df_yelp_review \
                                    .join(self.df_id_filter, self.df_yelp_review.business_id
                                                == self.df_id_filter.business_id, 'inner') \
                                    .drop(self.df_id_filter.business_id)

        document_assembler = DocumentAssembler() \
                            .setInputCol("text")
        sentence_detector = SentenceDetector() \
                            .setInputCols(["document"]) \
                            .setOutputCol("sentence")
        tokenizer = Tokenizer() \
                    .setInputCols(["sentence"]) \
                    .setOutputCol("token")
        normalizer = Normalizer() \
                    .setInputCols(["token"]) \
                    .setOutputCol("normal")
        lemmatizer = Lemmatizer() \
                    .setInputCols(["token"]) \
                    .setOutputCol("lemma") \
                    .setDictionary(lemma_file, key_delimiter="->", value_delimiter="\t")
        sentiment_detector = SentimentDetector() \
                            .setInputCols(["lemma", "sentence"]) \
                            .setOutputCol("sentiment_score") \
                            .setDictionary(sentiment_file, delimiter=",")
        finisher = Finisher() \
                    .setInputCols(["sentiment_score"]) \
                    .setOutputCols(["sentiment"])
        pipeline = Pipeline(stages=[
            document_assembler, \
            sentence_detector, \
            tokenizer, \
            normalizer, \
            lemmatizer, \
            sentiment_detector, \
            finisher
        ])

        self.df_sentiment = pipeline \
                            .fit(self.df_yelp_review) \
                            .transform(self.df_yelp_review)
        self.df_sentiment.cache()
        self.df_sentiment = self.df_sentiment \
                                .select(self.df_sentiment.business_id,
                                        functions.when(self.df_sentiment.sentiment == "positive", 1)
                                                 .when(self.df_sentiment.sentiment == "negative", -1)
                                                 .otherwise(0)
                                                 .alias("sentiment"))
        self.df_sentiment = self.df_sentiment \
                                .groupby("business_id") \
                                .agg({"sentiment": "mean"}) \
                                .withColumnRenamed("avg(sentiment)", "avg_sentiment_score")
Example #7
# While slightly more involved than this under the hood, the Tokenizer effectively takes a
# string and splits it along the spaces, so each word becomes its own string. The data then
# becomes the spark-nlp native type "Token".
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")

# The Normalizer cleans up the tokens, removing punctuation and other unwanted characters
# (it can also lowercase them).
normalizer = Normalizer().setInputCols(["token"]).setOutputCol("normalizer")

# The Stemmer takes objects of class "Token" and reduces the words to their
# root form. For instance, the words "cars", "cars'" and "car's" would all be replaced
# with the word "car".
stemmer = Stemmer().setInputCols(["normalizer"]).setOutputCol("stem")

# The Finisher converts spark-nlp annotations back into plain Spark values, which allows
# us to access the data outside of spark-nlp components. For instance, we can now feed
# the data into components from Spark MLlib.
finisher = Finisher().setInputCols(["stem"]).setOutputCols(
    ["to_spark"]).setValueSplitSymbol(" ")

# Stopwords are common words that generally don't add much detail to the meaning
# of a body of text. In English, these are mostly short function words such as "the"
# and "of".
stopword_remover = StopWordsRemover(inputCol="to_spark", outputCol="filtered")

# Here we implement TF-IDF as an input to our LDA model. CountVectorizer (TF) keeps track
# of the vocabulary that's being created so we can map our topics back to their
# corresponding words.
# TF (term frequency) creates a matrix that counts how many times each word in the
# vocabulary appears in each body of text. This then gives each word a weight based
# on its frequency.
tf = CountVectorizer(inputCol="filtered", outputCol="raw_features")

# Here we implement the IDF portion. IDF (inverse document frequency) reduces the weight
# of words that appear across many documents, since they tell us less about any one topic.
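# A hedged sketch of the IDF step itself; the output column name "features" is an
# assumption, while "raw_features" comes from the CountVectorizer above.
from pyspark.ml.feature import IDF

idf = IDF(inputCol="raw_features", outputCol="features")
# idf.fit(...).transform(...) would produce the TF-IDF vectors to feed into the LDA model.
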
tokenizer = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")

lemmatizer = Lemmatizer() \
    .setInputCols(["token"]) \
    .setOutputCol("lemma") \
    .setDictionary("/tmp/lemmas_small.txt", key_delimiter="->", value_delimiter="\t")
        
sentiment_detector = SentimentDetector() \
    .setInputCols(["lemma", "sentence"]) \
    .setOutputCol("sentiment_score") \
    .setDictionary("/tmp/default-sentiment-dict.txt", ",")
    
finisher = Finisher() \
    .setInputCols(["sentiment_score"]) \
    .setOutputCols(["sentiment"])

# COMMAND ----------

# MAGIC %md #### 4. Train the pipeline. It is trained only from external resources (the lemma and sentiment dictionaries), not from the dataset we pass in; the prediction then runs on the target dataset

# COMMAND ----------

pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, lemmatizer, sentiment_detector, finisher])
model = pipeline.fit(data)
result = model.transform(data)

# COMMAND ----------

# MAGIC %md #### 5. Filter the finisher output to find the positive-sentiment lines
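
# COMMAND ----------

# A hedged sketch of the filtering step described above; it assumes the Finisher's
# "sentiment" output column is an array of sentiment labels, one per sentence.
from pyspark.sql import functions as F

positive_lines = result.where(F.array_contains(F.col("sentiment"), "positive"))
positive_lines.show(truncate=False)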

# stopwords is expected to be a predefined list of stopwords
stopwords_cleaner = StopWordsCleaner() \
     .setInputCols(['lemmatized']) \
     .setOutputCol('unigrams') \
     .setStopWords(stopwords)

ngrammer = NGramGenerator() \
    .setInputCols(['lemmatized']) \
    .setOutputCol('ngrams') \
    .setN(3) \
    .setEnableCumulative(True) \
    .setDelimiter('_')

pos_tagger = PerceptronModel.pretrained('pos_anc') \
     .setInputCols(['document', 'lemmatized']) \
     .setOutputCol('pos')

finisher = Finisher() \
     .setInputCols(['unigrams', 'ngrams', 'pos'])



pipeline = Pipeline() \
     .setStages([document_assembler,
                 tokenizer,
                 normalizer,
                 lemmatizer,
                 stopwords_cleaner,
                 pos_tagger,
                 ngrammer,
                 finisher])

processed_text = pipeline.fit(text_data).transform(text_data)
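
Mirroring Example #3, a hedged sketch of what can follow: the Finisher exposes finished_unigrams, finished_ngrams and finished_pos columns, which can be combined and vectorized as input for a topic model.

from pyspark.sql import functions as F
from pyspark.ml.feature import CountVectorizer

# Finisher's default output columns are finished_<inputCol>, e.g. finished_unigrams.
features_df = processed_text.withColumn(
    'final', F.concat(F.col('finished_unigrams'), F.col('finished_ngrams')))
cv = CountVectorizer(inputCol='final', outputCol='tf_features')
tf_df = cv.fit(features_df).transform(features_df)
tf_df.select('tf_features').show(truncate=False)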