Example #1
class ResourceDownloader(object):

    # Maps each annotator's registered name to a zero-argument constructor for its Python wrapper.
    factory = {
        DocumentAssembler.name: lambda: DocumentAssembler(),
        SentenceDetector.name: lambda: SentenceDetector(),
        Tokenizer.name: lambda: Tokenizer(),
        PerceptronModel.name: lambda: PerceptronModel(),
        NerCrfModel.name: lambda: NerCrfModel(),
        Stemmer.name: lambda: Stemmer(),
        Normalizer.name: lambda: Normalizer(),
        RegexMatcherModel.name: lambda: RegexMatcherModel(),
        LemmatizerModel.name: lambda: LemmatizerModel(),
        DateMatcher.name: lambda: DateMatcher(),
        EntityExtractorModel.name: lambda: EntityExtractorModel(),
        SentimentDetectorModel.name: lambda: SentimentDetectorModel(),
        ViveknSentimentModel.name: lambda: ViveknSentimentModel(),
        NorvigSweetingModel.name: lambda: NorvigSweetingModel(),
        AssertionLogRegModel.name: lambda: AssertionLogRegModel()
    }

    def downloadModel(self, reader, name, language):
        j_obj = _internal._DownloadModel(reader.name, name, language).apply()
        py_obj = self.factory[reader.name]()
        py_obj._java_obj = j_obj
        return py_obj

    def downloadPipeline(self, name, language):
        j_obj = _internal._DownloadPipeline(name, language).apply()
        jmodel = JavaModel()
        jmodel._java_obj = j_obj
        return jmodel

    def clearCache(self, name, language):
        _internal._ClearCache(name, language).apply()
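
The class above maps each annotator's registered name to a constructor so that a downloaded Java model can be re-wrapped in the matching Python object. Below is a minimal usage sketch, assuming a Spark session started through sparknlp and the annotator classes already imported; the model and pipeline names are illustrative only.

import sparknlp

spark = sparknlp.start()
downloader = ResourceDownloader()

# Download a pretrained POS tagger; the class is passed as "reader" so that its
# .name attribute selects the matching entry in the factory above.
pos_model = downloader.downloadModel(PerceptronModel, "pos_anc", "en")

# Download a whole pretrained pipeline, returned as a generic JavaModel wrapper.
pipeline_model = downloader.downloadPipeline("explain_document_ml", "en")

# Drop a cached resource from the local cache folder.
downloader.clearCache("pos_anc", "en")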
Example #2
def run_nlp_pipeline(df):
    """Perform lemmatization using Spark-NLP (add-on library)"""
    document_assembler = DocumentAssembler() \
        .setInputCol("words_joined")  # the output column defaults to "document"

    # Obtain tokens from a string
    tokenizer = Tokenizer() \
        .setInputCols(["document"]) \
        .setOutputCol("token")

    # Use spaCy lemma dictionary to train Spark NLP lemmatizer
    lemmatizer = Lemmatizer() \
        .setInputCols(["token"]) \
        .setOutputCol("lemma") \
        .setDictionary(LEMMAS, key_delimiter="->", value_delimiter=r"\s+", read_as="TEXT")

    finisher = Finisher() \
        .setInputCols(["lemma"]) \
        .setIncludeMetadata(False)

    nlpPipeline = Pipeline(stages=[document_assembler, tokenizer, lemmatizer, finisher])
    nlpPipelineDF = nlpPipeline.fit(df) \
        .transform(df) \
        .withColumnRenamed('finished_lemma', 'allTokens')
    return nlpPipelineDF
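
A hypothetical call to run_nlp_pipeline follows; LEMMAS is assumed to be a module-level path to a "token -> lemma" dictionary file (the path below is a placeholder), and the input DataFrame must carry the words_joined column expected by the assembler.

import sparknlp

spark = sparknlp.start()
LEMMAS = "path/to/lemmas.txt"  # hypothetical dictionary file for the Lemmatizer

df = spark.createDataFrame(
    [("the cats were running quickly",)],
    ["words_joined"],
)
lemmatized = run_nlp_pipeline(df)
lemmatized.select("allTokens").show(truncate=False)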
Example #3
def sparknlp_transform(df):
    documentAssembler = DocumentAssembler() \
        .setInputCol('review') \
        .setOutputCol('document')
    tokenizer = Tokenizer() \
        .setInputCols(['document']) \
        .setOutputCol('token')
    normalizer = Normalizer() \
        .setInputCols(['token']) \
        .setOutputCol('normalized') \
        .setLowercase(True)
    lemmatizer = LemmatizerModel.pretrained() \
        .setInputCols(['normalized']) \
        .setOutputCol('lemma')
    stopwords_cleaner = StopWordsCleaner() \
        .setInputCols(['lemma']) \
        .setOutputCol('clean_token') \
        .setCaseSensitive(False) \
        .setStopWords(eng_stopwords)
    # finisher converts tokens to human-readable output
    finisher = Finisher() \
        .setInputCols(['clean_token']) \
        .setCleanAnnotations(True)
    pipeline = Pipeline() \
        .setStages([
            documentAssembler,
            tokenizer,
            normalizer,
            lemmatizer,
            stopwords_cleaner,
            finisher
        ])
    data = pipeline.fit(df).transform(df)
    return data
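
A hypothetical invocation of sparknlp_transform; eng_stopwords is assumed to be a plain Python list of stop words (here taken from pyspark's default English list), and spark is an active session started with sparknlp.start().

import sparknlp
from pyspark.ml.feature import StopWordsRemover

spark = sparknlp.start()
eng_stopwords = StopWordsRemover.loadDefaultStopWords("english")

reviews = spark.createDataFrame(
    [("The rooms were spotless and the staff was friendly",)],
    ["review"],
)
cleaned = sparknlp_transform(reviews)
# With setCleanAnnotations(True) and default naming, the Finisher output lands in
# a column called finished_clean_token.
cleaned.select("finished_clean_token").show(truncate=False)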
Example #4
def LDA_pipefit(data_ip, ipcol):
  text_col = ipcol
  from pyspark.sql import functions as F
  from sparknlp.base import DocumentAssembler
  documentAssembler = DocumentAssembler().setInputCol(text_col).setOutputCol('document')
  from sparknlp.annotator import Tokenizer
  tokenizer = Tokenizer().setInputCols(['document']).setOutputCol('tokenized')
  from sparknlp.annotator import Normalizer
  normalizer = Normalizer().setInputCols(['tokenized']).setOutputCol('normalized').setLowercase(True)
  from sparknlp.annotator import LemmatizerModel
  lemmatizer = LemmatizerModel.pretrained().setInputCols(['normalized']).setOutputCol('lemmatized')
  from sparknlp.annotator import StopWordsCleaner
  stopwords_cleaner = StopWordsCleaner().setInputCols(['lemmatized']).setOutputCol('unigrams').setStopWords(eng_stopwords)
  from sparknlp.annotator import NGramGenerator
  ngrammer = NGramGenerator().setInputCols(['lemmatized']).setOutputCol('ngrams').setN(3).setEnableCumulative(True).setDelimiter('_')
  from sparknlp.annotator import PerceptronModel
  pos_tagger = PerceptronModel.pretrained('pos_anc').setInputCols(['document', 'lemmatized']).setOutputCol('pos')
  from sparknlp.base import Finisher
  finisher = Finisher().setInputCols(['unigrams', 'ngrams','pos'])
  from pyspark.ml import Pipeline
  pipeline = Pipeline().setStages([documentAssembler,
                                  tokenizer,
                                  normalizer,
                                  lemmatizer,
                                  stopwords_cleaner,
                                  pos_tagger,
                                  ngrammer,
                                  finisher])
  processed_tweets = pipeline.fit(data_ip).transform(data_ip)
  from pyspark.sql.functions import concat
  processed_tweets = processed_tweets.withColumn('final',concat(F.col('finished_unigrams'), F.col('finished_ngrams')))
  from pyspark.ml.feature import CountVectorizer
  tfizer = CountVectorizer(inputCol='final',outputCol='tf_features')
  tf_model = tfizer.fit(processed_tweets)
  tf_result = tf_model.transform(processed_tweets)
  from pyspark.ml.feature import IDF
  idfizer = IDF(inputCol='tf_features', outputCol='tf_idf_features')
  idf_model = idfizer.fit(tf_result)
  tfidf_result = idf_model.transform(tf_result)
  from pyspark.ml.clustering import LDA

  num_topics = 3
  max_iter = 10

  lda = LDA(k=num_topics, maxIter=max_iter, featuresCol='tf_idf_features')
  lda_model = lda.fit(tfidf_result)
  from pyspark.sql import types as T
  vocab = tf_model.vocabulary
  def get_words(token_list):
      return [vocab[token_id] for token_id in token_list]
  udf_to_words = F.udf(get_words, T.ArrayType(T.StringType()))

  num_top_words = 15
  topics = lda_model.describeTopics(num_top_words).withColumn('topicWords', udf_to_words(F.col('termIndices')))
  topics_p=topics.toPandas()
  return topics_p
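
A hypothetical call to LDA_pipefit; spark is assumed to be an active session, the DataFrame and column name are illustrative, and eng_stopwords must already be defined in the calling scope (for example as pyspark's default English stop word list).

from pyspark.ml.feature import StopWordsRemover

eng_stopwords = StopWordsRemover.loadDefaultStopWords("english")
tweets = spark.createDataFrame(
    [("spark nlp makes topic modeling of short texts straightforward",),
     ("latent dirichlet allocation finds hidden topics in documents",),
     ("tokenize normalize lemmatize then vectorize the documents",)],
    ["tweet_text"],
)
topics_pdf = LDA_pipefit(tweets, "tweet_text")
# describeTopics() yields one row per topic; topicWords was added by the UDF above.
print(topics_pdf[["topic", "topicWords"]])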
Example #5
def build_data(df):
    document_assembler1 = DocumentAssembler() \
        .setInputCol('question1').setOutputCol('document1')

    tokenizer1 = Tokenizer() \
        .setInputCols(['document1']) \
        .setOutputCol('token1')

    finisher1 = Finisher() \
        .setInputCols(['token1']) \
        .setOutputCols(['ntokens1']) \
        .setOutputAsArray(True) \
        .setCleanAnnotations(True)

    document_assembler2 = DocumentAssembler() \
        .setInputCol('question2').setOutputCol('document2')

    tokenizer2 = Tokenizer() \
        .setInputCols(['document2']) \
        .setOutputCol('token2')

    finisher2 = Finisher() \
        .setInputCols(['token2']) \
        .setOutputCols(['ntokens2']) \
        .setOutputAsArray(True) \
        .setCleanAnnotations(True)

    p_pipeline = Pipeline(stages=[document_assembler1, tokenizer1, finisher1, \
                                  document_assembler2, tokenizer2, finisher2])
    p_model = p_pipeline.fit(df)
    processed1 = p_model.transform(df)
    label1 = processed1.select('is_duplicate').collect()
    label_array1 = np.array(label1)
    label_array1 = label_array1.astype(int)  # np.int was removed in recent NumPy releases

    return processed1, label_array1
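
A hypothetical usage of build_data on a tiny question-pair DataFrame mirroring the schema implied by the column names above; spark is assumed to be an active session.

pairs = spark.createDataFrame(
    [("How do I learn Spark?", "How can I learn Spark?", 1),
     ("What is NLP?", "Where is Paris?", 0)],
    ["question1", "question2", "is_duplicate"],
)
processed, labels = build_data(pairs)
processed.select("ntokens1", "ntokens2").show(truncate=False)
print(labels.ravel())  # flat array of 0/1 duplicate labels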
Example #6
def setup_pipeline():
    """Create a Spark ML pipeline that ingests raw text and splits it into sentences.

    :returns: Spark ML pipeline
    """

    document_assembler = DocumentAssembler(). \
            setInputCol("rawDocument"). \
            setOutputCol("document"). \
            setIdCol("fileName")
    sentence_detector = SentenceDetector(). \
            setInputCols(["document"]). \
            setOutputCol("sentence")
    pipeline = Pipeline(). \
            setStages([
                document_assembler,
                sentence_detector
                ])
    return pipeline
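
A hypothetical run of setup_pipeline; the input DataFrame needs the rawDocument and fileName columns referenced by the assembler, and spark is assumed to be an active session.

docs = spark.createDataFrame(
    [("First sentence. Second sentence. Third sentence.", "doc1.txt")],
    ["rawDocument", "fileName"],
)
sentences = setup_pipeline().fit(docs).transform(docs)
# Each row now carries an array of sentence annotations; .result extracts the text.
sentences.selectExpr("sentence.result").show(truncate=False)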
Example #7
class ResourceDownloader(object):

    factory = {
        DocumentAssembler.reader: lambda: DocumentAssembler(),
        SentenceDetector.reader: lambda: SentenceDetector(),
        Tokenizer.reader: lambda: Tokenizer(),
        PerceptronModel.reader: lambda: PerceptronModel(),
        NerCrfModel.reader: lambda: NerCrfModel()
    }

    def downloadModel(self, reader, name, language):
        j_obj = _internal._DownloadModel(reader, name, language).apply()
        py_obj = self.factory[reader]()
        py_obj._java_obj = j_obj
        return py_obj

    def downloadPipeline(self, name, language):
        j_obj = _internal._DownloadPipeline(name, language).apply()
        jmodel = JavaModel()
        jmodel._java_obj = j_obj
        return jmodel
Example #8
class ResourceDownloader(object):

    _factory = {
        DocumentAssembler.name: lambda: DocumentAssembler(),
        SentenceDetector.name: lambda: SentenceDetector(),
        Tokenizer.name: lambda: Tokenizer(),
        PerceptronModel.name: lambda: PerceptronModel(),
        NerCrfModel.name: lambda: NerCrfModel(),
        Stemmer.name: lambda: Stemmer(),
        Normalizer.name: lambda: Normalizer(),
        RegexMatcherModel.name: lambda: RegexMatcherModel(),
        LemmatizerModel.name: lambda: LemmatizerModel(),
        DateMatcher.name: lambda: DateMatcher(),
        TextMatcherModel.name: lambda: TextMatcherModel(),
        SentimentDetectorModel.name: lambda: SentimentDetectorModel(),
        ViveknSentimentModel.name: lambda: ViveknSentimentModel(),
        NorvigSweetingModel.name: lambda: NorvigSweetingModel(),
        AssertionLogRegModel.name: lambda: AssertionLogRegModel(),
        AssertionDLModel.name: lambda: AssertionDLModel(),
        NerDLModel.name: lambda: NerDLModel()
    }

    @staticmethod
    def downloadModel(reader, name, language, remote_loc=None):
        j_obj = _internal._DownloadModel(reader.name, name, language,
                                         remote_loc).apply()
        py_obj = ResourceDownloader._factory[reader.name]()
        py_obj._java_obj = j_obj
        return py_obj

    @staticmethod
    def downloadPipeline(name, language, remote_loc=None):
        j_obj = _internal._DownloadPipeline(name, language, remote_loc).apply()
        jmodel = JavaModel()
        jmodel._java_obj = j_obj
        return jmodel

    @staticmethod
    def clearCache(name, language, remote_loc=None):
        _internal._ClearCache(name, language, remote_loc).apply()
Example #9
def setup_sentiment_pipeline():
    lexicon = 'lexicon.txt'
    document_assembler = DocumentAssembler() \
        .setInputCol("rawDocument") \
        .setOutputCol("document") \
        .setIdCol("sentence_id")
    sentence_detector = SentenceDetector() \
        .setInputCols(["document"]) \
        .setOutputCol("sentence")
    tokenizer = Tokenizer() \
        .setInputCols(["sentence"]) \
        .setOutputCol("token")
    lemmatizer = Lemmatizer() \
        .setInputCols(["token"]) \
        .setOutputCol("lemma") \
        .setDictionary("txt/corpus/lemmas_small.txt",
                       key_delimiter="->",
                       value_delimiter="\t")
    sentiment_detector = SentimentDetector() \
        .setInputCols(["lemma", "sentence"]) \
        .setOutputCol("sentiment_score") \
        .setDictionary("txt/corpus/{0}".format(lexicon), ",")
    finisher = Finisher() \
        .setInputCols(["sentiment_score"]) \
        .setOutputCols(["sentiment"])
    pipeline = Pipeline(stages=[
        document_assembler, sentence_detector, tokenizer, lemmatizer,
        sentiment_detector, finisher
    ])
    return pipeline
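
A hypothetical run of setup_sentiment_pipeline; the dictionary files hard-coded in the function (txt/corpus/lemmas_small.txt and txt/corpus/lexicon.txt) must exist locally, the column names match the assembler settings above, and spark is assumed to be an active session.

reviews = spark.createDataFrame(
    [("I loved this place", "1"), ("The service was terrible", "2")],
    ["rawDocument", "sentence_id"],
)
scored = setup_sentiment_pipeline().fit(reviews).transform(reviews)
scored.select("sentiment").show(truncate=False)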
Example #10
    def tokenize(self, event):
        print('entered tokenizer')

        documentAssembler = DocumentAssembler()
        documentAssembler.setInputCol('text')
        documentAssembler.setOutputCol('document')

        self.spark_df = self.spark.createDataFrame(self.df.astype(str))
        self.spark_df = documentAssembler.transform(self.spark_df)

        tokenizer = Tokenizer()
        tokenizer.setInputCols(['document'])
        tokenizer.setOutputCol('token')
        tokenizer.setTargetPattern(self.search_pattern_input.value)
        token_df = tokenizer.fit(self.spark_df)
        current_df = token_df.transform(self.spark_df)
        self.spark_df = current_df

        self.display_df = get_all_lines(self.spark_df, 'token.result', col='token')
        self.continue_button.disabled = False
Example #11
    def spark_nlp_sentiment_analysis(self):
        """
        transform reviews with tokenization, normalization, lemmatization and sentiment dict
        calculate sentiment score and aggregate with business ID
        """
        lemma_file = "s3a://{}/{}/{}".format(
            self.s3_config["BUCKET"], self.s3_config["TEXT_CORPUS_FOLDER"],
            self.s3_config["LEMMA_FILE"])
        sentiment_file = "s3a://{}/{}/{}".format(
            self.s3_config["BUCKET"], self.s3_config["TEXT_CORPUS_FOLDER"],
            self.s3_config["SENTIMENT_FILE"])
        yelp_rating_filename = "s3a://{}/{}/{}".format(
            self.s3_config["BUCKET"], self.s3_config["YELP_FOLDER"],
            self.s3_config["YELP_REVIEW_DATA_FILE"])
        self.df_yelp_review = self.spark.read.json(yelp_rating_filename)
        self.df_yelp_review = self.df_yelp_review \
                                    .select("user_id", "business_id", "stars", "text") \
                                    .withColumnRenamed("stars", "ratings")
        self.df_id_filter = self.df_ranking.select("business_id")
        self.df_yelp_review = self.df_yelp_review \
                                    .join(self.df_id_filter, self.df_yelp_review.business_id
                                                == self.df_id_filter.business_id, 'inner') \
                                    .drop(self.df_id_filter.business_id)

        document_assembler = DocumentAssembler() \
                            .setInputCol("text")
        sentence_detector = SentenceDetector() \
                            .setInputCols(["document"]) \
                            .setOutputCol("sentence")
        tokenizer = Tokenizer() \
                    .setInputCols(["sentence"]) \
                    .setOutputCol("token")
        normalizer = Normalizer() \
                    .setInputCols(["token"]) \
                    .setOutputCol("normal")
        lemmatizer = Lemmatizer() \
                    .setInputCols(["token"]) \
                    .setOutputCol("lemma") \
                    .setDictionary(lemma_file, key_delimiter="->", value_delimiter="\t")
        sentiment_detector = SentimentDetector() \
                            .setInputCols(["lemma", "sentence"]) \
                            .setOutputCol("sentiment_score") \
                            .setDictionary(sentiment_file, delimiter=",")
        finisher = Finisher() \
                    .setInputCols(["sentiment_score"]) \
                    .setOutputCols(["sentiment"])
        pipeline = Pipeline(stages=[
            document_assembler,
            sentence_detector,
            tokenizer,
            normalizer,
            lemmatizer,
            sentiment_detector,
            finisher
        ])

        self.df_sentiment = pipeline \
                            .fit(self.df_yelp_review) \
                            .transform(self.df_yelp_review)
        self.df_sentiment.cache()
        self.df_sentiment = self.df_sentiment \
            .select(self.df_sentiment.business_id,
                    functions.when(self.df_sentiment.sentiment == "positive", 1)
                    .when(self.df_sentiment.sentiment == "negative", -1)
                    .otherwise(0)
                    .alias("sentiment"))
        self.df_sentiment = self.df_sentiment \
                                .groupby("business_id") \
                                .agg({"sentiment": "mean"}) \
                                .withColumnRenamed("avg(sentiment)", "avg_sentiment_score")
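
The scoring step at the end of the method is the least obvious part, so here is an isolated, self-contained sketch of the same pattern on a toy DataFrame: string sentiment labels are mapped to +1 / -1 / 0 and averaged per business_id.

from pyspark.sql import SparkSession, functions

spark = SparkSession.builder.getOrCreate()
df_sentiment = spark.createDataFrame(
    [("b1", "positive"), ("b1", "negative"), ("b2", "positive")],
    ["business_id", "sentiment"],
)
avg_scores = (
    df_sentiment
    .select(
        "business_id",
        functions.when(functions.col("sentiment") == "positive", 1)
        .when(functions.col("sentiment") == "negative", -1)
        .otherwise(0)
        .alias("sentiment"),
    )
    .groupby("business_id")
    .agg(functions.avg("sentiment").alias("avg_sentiment_score"))
)
avg_scores.show()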
Example #12
# NOTE: the opening of this statement was truncated in the source; the DataFrame
# name "df" and the enclosing select() are reconstructed assumptions.
df = (
    df.select(
        concat(
            # First column to concatenate. col() is used to specify that we're referencing a column.
            col("title"),
            # Literal character that will be between the concatenated columns.
            lit(" "),
            # Second column to concatenate.
            col("body")
        # Change the name of the new column.
        ).alias("text")
    )
)

# Now, we begin assembling our pipeline. Each component here applies some transformation to the data.
# The DocumentAssembler takes the raw text data and converts it into a format that can
# be tokenized. It becomes one of spark-nlp's native object types, the "Document".
document_assembler = DocumentAssembler().setInputCol("text").setOutputCol(
    "document")

# The Tokenizer takes data that is of the "Document" type and tokenizes it.
# While slightly more involved than this, this is effectively taking a string and splitting
# it along the spaces, so each word is its own string. The data then becomes the
# spark-nlp native type "Token".
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")

# The Normalizer cleans the tokens, stripping out unwanted characters and punctuation (and can lowercase them).
normalizer = Normalizer().setInputCols(["token"]).setOutputCol("normalizer")

# The Stemmer takes objects of class "Token" and reduces the words to their
# root form. For instance, the words "cars", "cars'" and "car's" would all be replaced
# with the word "car".
stemmer = Stemmer().setInputCols(["normalizer"]).setOutputCol("stem")
# COMMAND ----------

data = spark. \
        read. \
        parquet("/tmp/sentiment.parquet"). \
        limit(10000).cache()

data.show()

# COMMAND ----------

# MAGIC %md #### 3. Create appropriate annotators. We are using sentence detection, tokenizing the sentences, and finding the lemmas of those tokens. The Finisher will only output the Sentiment.

# COMMAND ----------

document_assembler = DocumentAssembler() \
    .setInputCol("text")

sentence_detector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

tokenizer = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")

lemmatizer = Lemmatizer() \
    .setInputCols(["token"]) \
    .setOutputCol("lemma") \
    .setDictionary("/tmp/lemmas_small.txt", key_delimiter="->", value_delimiter="\t")

# NOTE: the original snippet is cut off here; the input/output columns below are
# reconstructed to match the other SentimentDetector examples in this listing.
sentiment_detector = SentimentDetector() \
    .setInputCols(["lemma", "sentence"]) \
    .setOutputCol("sentiment_score")

# COMMAND ----------

from pyspark.sql import types as T
from nltk.corpus import stopwords

from pyspark.ml import Pipeline

LANG = "english"

spark = sparknlp.start()

path = 'Some path'
data = spark.read.csv(path, header=True)
text_col = 'sentences'
text_data = data.select(text_col).filter(F.col(text_col).isNotNull())

document_assembler = DocumentAssembler() \
    .setInputCol(text_col) \
    .setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(['document']) \
    .setOutputCol('tokens')

normalizer = Normalizer() \
    .setInputCols(['tokens']) \
    .setOutputCol('normalized') \
    .setLowercase(True)

lemmatizer = LemmatizerModel.pretrained() \
    .setInputCols(['normalized']) \
    .setOutputCol('lemmatized')
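
The snippet above stops at the lemmatizer. A minimal sketch, not part of the original, of how such a pipeline is commonly finished: a Finisher flattens the annotations and the assembled Pipeline is fit and applied to text_data (Finisher is assumed to be imported alongside the other sparknlp classes).

finisher = Finisher() \
    .setInputCols(['lemmatized']) \
    .setOutputCols(['tokens_clean']) \
    .setOutputAsArray(True)

pipeline = Pipeline(stages=[document_assembler, tokenizer, normalizer,
                            lemmatizer, finisher])
result = pipeline.fit(text_data).transform(text_data)
result.select('tokens_clean').show(truncate=False)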