Example #1
def sparknlp_transform(df):
	documentAssembler = DocumentAssembler() \
	     .setInputCol('review') \
	     .setOutputCol('document')
	tokenizer = Tokenizer() \
	     .setInputCols(['document']) \
	     .setOutputCol('token')
	normalizer = Normalizer() \
	     .setInputCols(['token']) \
	     .setOutputCol('normalized') \
	     .setLowercase(True)
	lemmatizer = LemmatizerModel.pretrained() \
	     .setInputCols(['normalized']) \
	     .setOutputCol('lemma')
	stopwords_cleaner = StopWordsCleaner() \
	     .setInputCols(['lemma']) \
	     .setOutputCol('clean_token') \
	     .setCaseSensitive(False) \
	     .setStopWords(eng_stopwords)
	# the Finisher converts the annotations back into plain, human-readable output
	finisher = Finisher() \
	     .setInputCols(['clean_token']) \
	     .setCleanAnnotations(True)
	pipeline = Pipeline() \
	     .setStages([
	           documentAssembler,
	           tokenizer,
	           normalizer,
	           lemmatizer,
	           stopwords_cleaner,
	           finisher
	     ])
	data = pipeline.fit(df).transform(df)
	return data
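A minimal usage sketch, assuming Spark NLP is installed and `eng_stopwords` is built from NLTK's stopword list (the sample review row is hypothetical):

import sparknlp
from sparknlp.base import DocumentAssembler, Finisher
from sparknlp.annotator import Tokenizer, Normalizer, LemmatizerModel, StopWordsCleaner
from pyspark.ml import Pipeline
from nltk.corpus import stopwords  # requires the NLTK "stopwords" corpus

spark = sparknlp.start()
eng_stopwords = stopwords.words('english')

# hypothetical single-row DataFrame with the 'review' column the function expects
reviews = spark.createDataFrame([["The movies were surprisingly good."]], ["review"])
sparknlp_transform(reviews).select("finished_clean_token").show(truncate=False)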
Example #2
def run_nlp_pipeline(df):
    """Perform lemmatization using Spark-NLP (add-on library)"""
    document_assembler = DocumentAssembler() \
        .setInputCol("words_joined") \
        .setOutputCol("document")

    # Obtain tokens from a string
    tokenizer = Tokenizer() \
        .setInputCols(["document"]) \
        .setOutputCol("token")

    # Use spaCy lemma dictionary to train Spark NLP lemmatizer
    lemmatizer = Lemmatizer() \
        .setInputCols(["token"]) \
        .setOutputCol("lemma") \
        .setDictionary(LEMMAS, key_delimiter="->", value_delimiter=r"\s+", read_as="TEXT")

    finisher = Finisher() \
        .setInputCols(["lemma"]) \
        .setIncludeMetadata(False)

    nlpPipeline = Pipeline(stages=[document_assembler, tokenizer, lemmatizer, finisher])
    nlpPipelineDF = nlpPipeline.fit(df) \
        .transform(df) \
        .withColumnRenamed('finished_lemma', 'allTokens')
    return nlpPipelineDF
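A hedged usage sketch, reusing the SparkSession from the sketch above; `LEMMAS` is assumed to be the path to a lemma dictionary whose lines look like `be -> am are is was were been`, matching the delimiters configured above:

# hypothetical inputs: a lemma dictionary file and a DataFrame with the 'words_joined' column
LEMMAS = "lemmas.txt"
df = spark.createDataFrame([["the cats were running home"]], ["words_joined"])
run_nlp_pipeline(df).select("allTokens").show(truncate=False)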
Example #3
def LDA_pipefit (data_ip, ipcol):
  text_col = ipcol
  from sparknlp.base import DocumentAssembler
  documentAssembler = DocumentAssembler().setInputCol(text_col).setOutputCol('document')
  from sparknlp.annotator import Tokenizer
  tokenizer = Tokenizer().setInputCols(['document']).setOutputCol('tokenized')
  from sparknlp.annotator import Normalizer
  normalizer = Normalizer().setInputCols(['tokenized']).setOutputCol('normalized').setLowercase(True)
  from sparknlp.annotator import LemmatizerModel
  lemmatizer = LemmatizerModel.pretrained().setInputCols(['normalized']).setOutputCol('lemmatized')
  from sparknlp.annotator import StopWordsCleaner
  stopwords_cleaner = StopWordsCleaner().setInputCols(['lemmatized']).setOutputCol('unigrams').setStopWords(eng_stopwords)
  from sparknlp.annotator import NGramGenerator
  ngrammer = NGramGenerator().setInputCols(['lemmatized']).setOutputCol('ngrams').setN(3).setEnableCumulative(True).setDelimiter('_')
  from sparknlp.annotator import PerceptronModel
  pos_tagger = PerceptronModel.pretrained('pos_anc').setInputCols(['document', 'lemmatized']).setOutputCol('pos')
  from sparknlp.base import Finisher
  finisher = Finisher().setInputCols(['unigrams', 'ngrams','pos'])
  from pyspark.ml import Pipeline
  pipeline = Pipeline().setStages([documentAssembler,
                                  tokenizer,
                                  normalizer,
                                  lemmatizer,
                                  stopwords_cleaner,
                                  pos_tagger,
                                  ngrammer,
                                  finisher])
  processed_tweets = pipeline.fit(data_ip).transform(data_ip)
  from pyspark.sql import functions as F
  processed_tweets = processed_tweets.withColumn('final', F.concat(F.col('finished_unigrams'), F.col('finished_ngrams')))
  from pyspark.ml.feature import CountVectorizer
  tfizer = CountVectorizer(inputCol='final',outputCol='tf_features')
  tf_model = tfizer.fit(processed_tweets)
  tf_result = tf_model.transform(processed_tweets)
  from pyspark.ml.feature import IDF
  idfizer = IDF(inputCol='tf_features', outputCol='tf_idf_features')
  idf_model = idfizer.fit(tf_result)
  tfidf_result = idf_model.transform(tf_result)
  from pyspark.ml.clustering import LDA

  num_topics = 3
  max_iter = 10

  lda = LDA(k=num_topics, maxIter=max_iter, featuresCol='tf_idf_features')
  lda_model = lda.fit(tfidf_result)
  from pyspark.sql import types as T
  vocab = tf_model.vocabulary
  def get_words(token_list):
      return [vocab[token_id] for token_id in token_list]
  udf_to_words = F.udf(get_words, T.ArrayType(T.StringType()))

  num_top_words = 15
  topics = lda_model.describeTopics(num_top_words).withColumn('topicWords', udf_to_words(F.col('termIndices')))
  topics_p=topics.toPandas()
  return topics_p
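A possible invocation with hypothetical input and column names, assuming a running SparkSession and an `eng_stopwords` list as in the earlier sketch:

tweets = spark.read.json("tweets.json")          # hypothetical input path
topics_pdf = LDA_pipefit(tweets, "tweet_text")   # hypothetical text column
print(topics_pdf[["topic", "topicWords"]])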
Example #4
def build_data(df):
    document_assembler1 = DocumentAssembler() \
        .setInputCol('question1').setOutputCol('document1')

    tokenizer1 = Tokenizer() \
        .setInputCols(['document1']) \
        .setOutputCol('token1')

    finisher1 = Finisher() \
        .setInputCols(['token1']) \
        .setOutputCols(['ntokens1']) \
        .setOutputAsArray(True) \
        .setCleanAnnotations(True)

    document_assembler2 = DocumentAssembler() \
        .setInputCol('question2').setOutputCol('document2')

    tokenizer2 = Tokenizer() \
        .setInputCols(['document2']) \
        .setOutputCol('token2')

    finisher2 = Finisher() \
        .setInputCols(['token2']) \
        .setOutputCols(['ntokens2']) \
        .setOutputAsArray(True) \
        .setCleanAnnotations(True)

    p_pipeline = Pipeline(stages=[document_assembler1, tokenizer1, finisher1, \
                                  document_assembler2, tokenizer2, finisher2])
    p_model = p_pipeline.fit(df)
    processed1 = p_model.transform(df)
    label1 = processed1.select('is_duplicate').collect()
    label_array1 = np.array(label1)
    label_array1 = label_array1.astype(int)

    return processed1, label_array1
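A hedged usage sketch with a made-up question pair; it assumes `numpy` is imported as `np` and a SparkSession is available:

pairs = spark.createDataFrame(
    [("How do I learn Spark?", "How can I learn Spark NLP?", 0)],
    ["question1", "question2", "is_duplicate"])
tokens_df, labels = build_data(pairs)
tokens_df.select("ntokens1", "ntokens2").show(truncate=False)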
Example #5
    def tokenize(self, event):
        print('entered tokenizer')
        
        documentAssembler = DocumentAssembler()

        documentAssembler.setInputCol('text')

        documentAssembler.setOutputCol('document')
        
        self.spark_df = self.spark.createDataFrame(self.df.astype(str))
        self.spark_df=documentAssembler.transform(self.spark_df)
        tokenizer = Tokenizer()
        tokenizer.setInputCols(['document'])
        tokenizer.setOutputCol('token')
        tokenizer.setTargetPattern(self.search_pattern_input.value)
        token_df=tokenizer.fit(self.spark_df)
        current_df = token_df.transform(self.spark_df) 
        self.spark_df = current_df

        

        self.display_df = get_all_lines(self.spark_df, 'token.result', col = 'token')
        self.continue_button.disabled = False
Example #6
class ResourceDownloader(object):

    # map each annotator's reader to a constructor for the matching Python class
    factory = {
        DocumentAssembler.reader: lambda: DocumentAssembler(),
        SentenceDetector.reader: lambda: SentenceDetector(),
        Tokenizer.reader: lambda: Tokenizer(),
        PerceptronModel.reader: lambda: PerceptronModel(),
        NerCrfModel.reader: lambda: NerCrfModel()
    }

    def downloadModel(self, reader, name, language):
        # download the pretrained model and wrap the returned Java object in its Python annotator class
        j_obj = _internal._DownloadModel(reader, name, language).apply()
        py_obj = self.factory[reader]()
        py_obj._java_obj = j_obj
        return py_obj

    def downloadPipeline(self, name, language):
        # download a pretrained pipeline and expose it as a generic Spark JavaModel
        j_obj = _internal._DownloadPipeline(name, language).apply()
        jmodel = JavaModel()
        jmodel._java_obj = j_obj
        return jmodel
Example #7
def setup_sentiment_pipeline():
    lexicon = 'lexicon.txt'
    document_assembler = DocumentAssembler() \
        .setInputCol("rawDocument") \
        .setOutputCol("document") \
        .setIdCol("sentence_id")
    sentence_detector = SentenceDetector() \
        .setInputCols(["document"]) \
        .setOutputCol("sentence")
    tokenizer = Tokenizer() \
        .setInputCols(["sentence"]) \
        .setOutputCol("token")
    lemmatizer = Lemmatizer() \
        .setInputCols(["token"]) \
        .setOutputCol("lemma") \
        .setDictionary("txt/corpus/lemmas_small.txt",
                       key_delimiter="->",
                       value_delimiter="\t")
    sentiment_detector = SentimentDetector() \
        .setInputCols(["lemma", "sentence"]) \
        .setOutputCol("sentiment_score") \
        .setDictionary("txt/corpus/{0}".format(lexicon), ",")
    finisher = Finisher() \
        .setInputCols(["sentiment_score"]) \
        .setOutputCols(["sentiment"])
    pipeline = Pipeline(stages=[
        document_assembler, sentence_detector, tokenizer, lemmatizer,
        sentiment_detector, finisher
    ])
    return pipeline
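A sketch of fitting this pipeline; it assumes the dictionary files under `txt/corpus/` exist, and the sample row is hypothetical:

pipeline = setup_sentiment_pipeline()
df = spark.createDataFrame(
    [(1, "I really enjoyed this place, the food was great.")],
    ["sentence_id", "rawDocument"])
pipeline.fit(df).transform(df).select("sentiment").show(truncate=False)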
Example #8
def simplePipeline():
    document_assembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")
    sentenceDetector = SentenceDetector() \
        .setInputCols(["document"]) \
        .setOutputCol("sentences")
    tokenizer = Tokenizer() \
        .setInputCols(["sentences"]) \
        .setOutputCol("token")
    normalizer = Normalizer() \
        .setInputCols(["token"]) \
        .setOutputCol("normal")
    word_embeddings = WordEmbeddingsModel.pretrained() \
        .setInputCols(["document", "normal"]) \
        .setOutputCol("embeddings")
    nlpPipeline = Pipeline(stages=[
        document_assembler,
        sentenceDetector,
        tokenizer,
        normalizer,
        word_embeddings,
    ])
    return nlpPipeline
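A minimal sketch of running the pipeline, assuming `sparknlp.start()` has been called and the default pretrained embeddings can be downloaded:

df = spark.createDataFrame([["Spark NLP makes distributed text processing simple."]], ["text"])
result = simplePipeline().fit(df).transform(df)
result.select("embeddings.result").show(truncate=False)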
Example #9
    def spark_nlp_sentiment_analysis(self):
        """
        transform reviews with tokenization, normalization, lemmatization and sentiment dict
        calculate sentiment score and aggregate with business ID
        """
        lemma_file = "s3a://{}/{}/{}".format(
            self.s3_config["BUCKET"], self.s3_config["TEXT_CORPUS_FOLDER"],
            self.s3_config["LEMMA_FILE"])
        sentiment_file = "s3a://{}/{}/{}".format(
            self.s3_config["BUCKET"], self.s3_config["TEXT_CORPUS_FOLDER"],
            self.s3_config["SENTIMENT_FILE"])
        yelp_rating_filename = "s3a://{}/{}/{}".format(
            self.s3_config["BUCKET"], self.s3_config["YELP_FOLDER"],
            self.s3_config["YELP_REVIEW_DATA_FILE"])
        self.df_yelp_review = self.spark.read.json(yelp_rating_filename)
        self.df_yelp_review = self.df_yelp_review \
                                    .select("user_id", "business_id", "stars", "text") \
                                    .withColumnRenamed("stars", "ratings")
        self.df_id_filter = self.df_ranking.select("business_id")
        self.df_yelp_review = self.df_yelp_review \
                                    .join(self.df_id_filter, self.df_yelp_review.business_id
                                                == self.df_id_filter.business_id, 'inner') \
                                    .drop(self.df_id_filter.business_id)

        document_assembler = DocumentAssembler() \
                            .setInputCol("text") \
                            .setOutputCol("document")
        sentence_detector = SentenceDetector() \
                            .setInputCols(["document"]) \
                            .setOutputCol("sentence")
        tokenizer = Tokenizer() \
                    .setInputCols(["sentence"]) \
                    .setOutputCol("token")
        normalizer = Normalizer() \
                    .setInputCols(["token"]) \
                    .setOutputCol("normal")
        lemmatizer = Lemmatizer() \
                    .setInputCols(["token"]) \
                    .setOutputCol("lemma") \
                    .setDictionary(lemma_file, key_delimiter="->", value_delimiter="\t")
        sentiment_detector = SentimentDetector() \
                            .setInputCols(["lemma", "sentence"]) \
                            .setOutputCol("sentiment_score") \
                            .setDictionary(sentiment_file, delimiter=",")
        finisher = Finisher() \
                    .setInputCols(["sentiment_score"]) \
                    .setOutputCols(["sentiment"])
        pipeline = Pipeline(stages=[
            document_assembler,
            sentence_detector,
            tokenizer,
            normalizer,
            lemmatizer,
            sentiment_detector,
            finisher
        ])

        self.df_sentiment = pipeline \
                            .fit(self.df_yelp_review) \
                            .transform(self.df_yelp_review)
        self.df_sentiment.cache()
        self.df_sentiment = self.df_sentiment \
                                .select(self.df_sentiment.business_id,
                                        functions.when(self.df_sentiment.sentiment == "positive", 1)
                                                 .when(self.df_sentiment.sentiment == "negative", -1)
                                                 .otherwise(0)
                                                 .alias("sentiment"))
        self.df_sentiment = self.df_sentiment \
                                .groupby("business_id") \
                                .agg({"sentiment": "mean"}) \
                                .withColumnRenamed("avg(sentiment)", "avg_sentiment_score")
Example #10
        # Change the name of the new column
        ).alias("text")
    )
)

# Now we begin assembling our pipeline. Each component applies some transformation to the data.
# The DocumentAssembler takes the raw text data and converts it into a format that can
# be tokenized. It becomes one of spark-nlp's native object types, the "Document".
document_assembler = DocumentAssembler().setInputCol("text").setOutputCol(
    "document")

# The Tokenizer takes data of the "Document" type and tokenizes it.
# While slightly more involved than this, it effectively takes a string and splits
# it along the spaces, so each word is its own string. The data then becomes the
# spark-nlp native type "Token".
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")

# The Normalizer cleans the tokens, stripping out punctuation and other unwanted
# characters (it can also lowercase them).
normalizer = Normalizer().setInputCols(["token"]).setOutputCol("normalizer")

# The Stemmer takes objects of class "Token" and reduces each word to its root form.
# For instance, the words "cars", "cars'" and "car's" would all be replaced
# with the word "car".
stemmer = Stemmer().setInputCols(["normalizer"]).setOutputCol("stem")

# The Finisher converts spark-nlp annotations back into plain Spark columns, which lets us
# access the data outside of spark-nlp components. For instance, we can now feed the data
# into components from Spark MLlib, as sketched below.
finisher = Finisher().setInputCols(["stem"]).setOutputCols(
    ["to_spark"]).setValueSplitSymbol(" ")

# Stopwords are common words that generally don't add much detail to the meaning
# of a text, so we filter them out below.
LANG = "english"

spark = sparknlp.start()

path = 'Some path'
data = spark.read.csv(path, header=True)
text_col = 'sentences'
text_data = data.select(text_col).filter(F.col(text_col).isNotNull())

document_assembler = DocumentAssembler() \
    .setInputCol(text_col) \
    .setOutputCol("document")

tokenizer = Tokenizer() \
     .setInputCols(['document']) \
     .setOutputCol('tokens')

normalizer = Normalizer() \
     .setInputCols(['tokens']) \
     .setOutputCol('normalized') \
     .setLowercase(True)

lemmatizer = LemmatizerModel.pretrained() \
     .setInputCols(['normalized']) \
     .setOutputCol('lemmatized')

# assumes `from nltk.corpus import stopwords`; this reuses the name for the English word list
stopwords = stopwords.words(LANG)

stopwords_cleaner = StopWordsCleaner() \
     .setInputCols(['lemmatized']) \