Example #1
def test_API():
    from sparknlp.pretrained import PretrainedPipeline

    ner_pipeline = PretrainedPipeline("ner_model_finder", "en",
                                      "clinical/models")

    result = ner_pipeline.annotate("medication")
    print(result)
    return result
Example #2
def main():
    spark, sc = init_spark()

    # Download a pre-trained pipeline
    pipeline = PretrainedPipeline('explain_document_dl', lang='en')

    # Your testing dataset
    text = """
    The Mona Lisa is a 16th century oil painting created by Leonardo.
    It's held at the Louvre in Paris.
    """

    # Annotate your testing dataset
    result = pipeline.annotate(text)

    # What's in the pipeline
    print(list(result.keys()))
    print(result['entities'])
Example #3
class ReplaceTerms:
    """ A class to generate sentence perturbations by replacement
    ...
    Methods
    -------
    replace_terms(sentence, importance_scores, num_replacements, num_output_sents, sampling_strategy, sampling_k)
        Generate perturbed versions of an input sentence by term replacement
    """
    global SPARK_NLP_ENABLED

    def __init__(self, rep_type: str = 'synonym', use_ner: bool = True):
        """Instantiate a ReplaceTerms object

        Parameters
        ----------
        rep_type : Optional(str)
            The type of target perturbation. May include `synonym` for word2vec replacement, `mlmsynonym` for MLM-based replacement, or `misspelling` for misspelling replacement.
        use_ner : Optional(bool)
            Flag specifying whether to use entity-aware replacement. If True, when calculating the sampling weights for any perturbation, named entities will be zeroed. In this case, the NER model is loaded here. The default value is True.
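
        Example
        -------
        Illustrative sketch only; assumes the underlying replacement resources
        (word vectors, misspellings, or MLM model) are available locally.

        >>> p = ReplaceTerms(rep_type='misspelling', use_ner=False)
        >>> p.rep_type
        'misspelling'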
        """
        self.use_ner = use_ner if SPARK_NLP_ENABLED else False
        self.rep_type = rep_type
        if rep_type not in ['synonym', 'misspelling', 'mlmsynonym']:
            logger.error('{}:ReplaceTerms __init__ invalid rep_type'.format(
                __file__.split('/')[-1]))
            raise ValueError('Not an accepted generator type')
        self._generator = self._get_generator(rep_type)
        if not self._generator:
            raise RuntimeError('Unable to init generator')
        if self.use_ner:
            try:
                spark = sparknlp.start()
                self._ner_pipeline = PretrainedPipeline(
                    'recognize_entities_dl', lang='en')
            except Exception as e:
                logger.error(
                    '{}:ReplaceTerms __init__ unable to load NER pipeline'.format(
                        __file__.split('/')[-1]))
                raise RuntimeError('Unable to load ner pkg')

    def _get_entities(self, sentence: str):
        """ Tokenize and annotate the sentence, returning a (mask, tokens) pair """

        if self.use_ner:
            # Use spark-nlp tokenizer for entity-aware mask
            allowed_tags = ['PER', 'LOC', 'ORG', 'MISC']

            # Annotate your testing dataset
            result = self._ner_pipeline.annotate(sentence)
            toks = result['token']
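            # Build the mask: 1 protects a token from replacement (named-entity tags
            # PER/LOC/ORG/MISC or non-alphanumeric tokens), 0 leaves it replaceable.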
            mask = [
                1 if
                (any([y in x
                      for y in allowed_tags]) or not toks[i].isalnum()) else 0
                for i, x in enumerate(result['ner'])
            ]
        else:
            # Use simple NLTK tokenizer
            toks = nltk.word_tokenize(sentence)
            mask = [0] * len(toks)
        return mask, toks

    def _get_generator(self, name: str = None):
        if name == 'synonym':
            try:
                _syn = SynonymReplace()
                return _syn
            except Exception as e:
                logger.error(
                    '{}:replace_terms: unable to load word vectors'.format(
                        __file__.split('/')[-1]))
        elif name == 'misspelling':
            try:
                _missp = MisspReplace()
                return _missp
            except Exception as e:
                logger.error(
                    '{}:replace_terms: unable to load misspellings'.format(
                        __file__.split('/')[-1]))
        elif name == 'mlmsynonym':
            try:
                _syn = MLMSynonymReplace()
                return _syn
            except Exception as e:
                logger.error(
                    '{}:replace_terms: unable to load MLM model'.format(
                        __file__.split('/')[-1]))
        return

    def replace_terms(self,
                      sentence: str,
                      importance_scores: List = None,
                      num_replacements: int = 1,
                      num_output_sents: int = 1,
                      sampling_strategy: str = 'random',
                      sampling_k: int = None) -> List:
        """Generate a certain number of sentence perturbations by replacement using either misspelling or synonyms

        Parameters
        ----------
        sentence : str
            The input sentence to be perturbed.
        importance_scores : Optional(List)
            List of tuples defining a weight for each term in the tokenized sentence. These weights are used during sampling to influence perturbation probabilities. If None, uniform sampling is used by default.
        num_replacements : Optional(int)
            Target number of terms to replace in the original sentence. The actual number is chosen randomly, with this target as the upper bound and 1 as the lower bound. The default is 1.
        num_output_sents : Optional(int)
            Target number of perturbed sentences to generate based on the original sentence. The default is 1.
        sampling_strategy : Optional(str)
            Strategy used to sample terms to perturb in the original sentence. The default is random. If importance_scores is given, then sampling_strategy may be `topK` or `bottomK`, in which case the importance_scores (or inverted scores) vector is used for weighted sampling.
        sampling_k : Optional(int)
            The number of terms in the importance score vector to include in topK or bottomK sampling. This parameter is not used by the default sampling_strategy, `random` sampling.
        Returns
        -------
        [str]
            Returns a list of perturbed sentences for the input sentence.

        Example
        -------
        >>> from term_replacement import ReplaceTerms
        >>> p = ReplaceTerms(rep_type="synonym")
        >>> sent = "I was born in a small town"
        >>> num_terms = 1
        >>> num_output_sents = 1
        >>> p.replace_terms(sent, num_replacements=num_terms, num_output_sents=num_output_sents)
        ['I born in a small village']

        >>> from term_replacement import ReplaceTerms
        >>> p = ReplaceTerms(rep_type="misspelling")
        >>> sent = "I was born in a small town"
        >>> num_terms = 1
        >>> num_output_sents = 1
        >>> p.replace_terms(sent, num_replacements=num_terms, num_output_sents=num_output_sents)
        ['I born in a smal town']
        """

        inputs = validate_inputs(num_replacements, num_output_sents,
                                 sampling_strategy)
        num_replacements = inputs.pop(0)
        num_output_sents = inputs.pop(0)
        sampling_strategy = inputs.pop(0)

        # Extract entities in the input sentence to mask
        masked_vector, tokens = self._get_entities(sentence)

        # Check if there are enough candidate terms
        if num_replacements > (len(masked_vector) - sum(masked_vector)):
            logger.warning(
                '{}:replace_terms: unable to generate num_replacements - {} of ({})'
                .format(
                    __file__.split('/')[-1], num_replacements,
                    len(masked_vector) - sum(masked_vector)))
            num_replacements = len(masked_vector) - sum(masked_vector)

        if self.rep_type == 'misspelling':
            remove_stop = True
        else:
            remove_stop = False

        # Initialize sampling scores
        importance_scores = get_scores(tokens, sampling_strategy, sampling_k,
                                       importance_scores, remove_stop)

        if not importance_scores:
            return []

        # Add index and mask to importance scores
        term_score_index = [(word[0], i, masked_vector[i])
                            for i, word in enumerate(importance_scores)]
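        # Each entry of term_score_index is a (token, position, is_masked) tuple.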

        # Store only scores for later sampling
        importance_scores = [
            x[1] if not masked_vector[i] else 0  # set masked scores to zero
            for i, x in enumerate(importance_scores)
        ]

        # Candidate terms for synonym replacement
        rep_term_indices = [w[1] for w in term_score_index if not w[2]]

        # Create List of Lists of term variants
        generated = {x[0]: None for x in term_score_index}
        generated = {
            x[0]: self._generator.generate(x[0].lower(), 10, **{
                'toks': tokens,
                'token_idx': i
            })
            for i, x in enumerate(term_score_index) if not x[2]
        }

        term_variants = {
            x[0]: generated.get(x[0], []) if
            (i in rep_term_indices and not masked_vector[i]) else []
            for i, x in enumerate(term_score_index)
        }
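        # term_variants maps each candidate token to its generated replacements;
        # masked or non-candidate tokens map to an empty list.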

        # Check if there are enough candidate terms
        if not term_variants:
            logger.warning(
                '{}:replace_terms: unable to generate num_variants - {} of ({})'
                .format(
                    __file__.split('/')[-1], num_replacements,
                    len(term_variants) - sum(masked_vector)))
        else:
            term_variants = {
                k: [x[0].upper() + x[1:] for x in v] if k[0].isupper() else v
                for k, v in term_variants.items()
            }

        # Set scores to zero for all terms w/o synonyms
        importance_scores = [
            x if (term_score_index[i][0] in term_variants
                  and len(term_variants[term_score_index[i][0]]) > 0) else 0
            for i, x in enumerate(importance_scores)
        ]

        # Renormalize
        if sum(importance_scores) == 0:
            return []  # avoid division by 0 error

        importance_scores = [
            x / sum(importance_scores) for x in importance_scores
        ]

        # Resize num_replacements to avoid p-sampling errors
        nonzero_entries = sum([x > 0. for x in importance_scores])
        if num_replacements > nonzero_entries:
            num_replacements = nonzero_entries
        '''
        # DEBUG
        # Create a List of Lists of all variants
        candidate_variants = [
            v+[k]
            for k,v in term_variants.items()
        ]

        # Check the total number of variants
        candidate_sents = list(
            itertools.product(*candidate_variants)
        )

        # Set number of output variants to the total possible
        if len(candidate_sents) < num_output_sents:
            num_output_sents = len(candidate_sents)
        '''
        if not term_variants or not any(x[2] == 0
                                        for x in term_score_index):
            raise Exception('no term variants or no replaceable terms')

        max_attempts = 50
        counter = 0
        new_sentences = set()
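        # Repeatedly sample positions to replace (weighted by importance_scores),
        # expand their variants combinatorially, and collect unique sentences until
        # num_output_sents is reached or max_attempts is exhausted.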
        while len(new_sentences) < num_output_sents:
            if counter > max_attempts:
                break

            # Select terms to replace
            rnd_indices = np.random.choice(len(term_score_index),
                                           size=num_replacements,
                                           replace=False,
                                           p=importance_scores)
            replace_terms = [term_score_index[i][0] for i in rnd_indices]

            # Create List of Lists of term variants
            term_combinations = [
                term_variants.get(x[0], [x[0]])
                if x[0] in replace_terms else [x[0]]
                for i, x in enumerate(term_score_index)
            ]

            # Generate combinatorial variants
            candidate_sents = list(itertools.product(*term_combinations))

            for sent in candidate_sents:
                new_sentences.add(' '.join(sent))
            counter += 1

        # Shuffle permutations, sanitize and slice
        new_sentences = list(new_sentences)
        random.shuffle(new_sentences)
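        # Clean up join artifacts: drop the space inserted before punctuation and
        # restore contractions such as "'s".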
        new_sentences = [
            re.sub(r'([A-Za-z0-9])(\s+)([^A-Za-z0-9])', r'\1\3',
                   x.replace('\' s ', '\'s '))
            for x in new_sentences[:num_output_sents]
        ]
        new_sentences = [x for x in new_sentences if x != sentence]

        if len(new_sentences) < num_output_sents:
            logger.debug(
                '{}:replace_terms: unable to generate num_output_sents - {} of ({})'
                .format(
                    __file__.split('/')[-1], len(new_sentences),
                    num_output_sents))
        return new_sentences
Example #4
def cleantext(text):
    text = re.sub('#', '', text)  # Remove '#' hashtag symbols
    text = re.sub(r'RT[\s]+', '', text)  # Remove RT
    text = re.sub(r'https?:\/\/\S+', '', text)  # Remove hyperlinks
    text = re.sub(':', '', text)  # Remove colons
    text = re.sub(r'(\.|\!|\?|\,)', '', text)  # Remove punctuation
    return text


udf_fun = udf(lambda text: cleantext(text), StringType())
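# udf_fun wraps cleantext() as a Spark UDF so it can be applied to the 'text' column below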
preprocessed_text = twitter_df.select('id',
                                      udf_fun('text').alias('text'), 'user')

preprocessed_text.show()

#use pipeline
pipeline = PretrainedPipeline("analyze_sentiment")
result = pipeline.annotate(preprocessed_text, column='text')
#result.select("sentiment.result").show()

#write result to mongodb

cols = ['id', 'text', 'sentiment.result', 'user']
output = result.select(cols)
#output.show()


output.write\
    .format("com.mongodb.spark.sql.DefaultSource") \
    .mode("append") \
    .option("collection", "sentiment_predicted") \
    .save()
Example #5
from pyspark.sql import functions as F

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl
import sparknlp

import warnings
warnings.filterwarnings('ignore')

params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M"
}

print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

spark = sparknlp_jsl.start(SECRET, params=params)

from sparknlp.pretrained import PretrainedPipeline

ner_pipeline = PretrainedPipeline("ner_model_finder", "en", "clinical/models")

result = ner_pipeline.annotate("medication")

print(100 * '-')
print(result)
print(100 * '-')
Example #6
## LEMMATIZATION
import sparknlp
sparknlp.start()

from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
from pyspark.sql.functions import lower, col
from pyspark.ml.feature import StringIndexer

pipeline = PretrainedPipeline('explain_document_ml', 'en')
s_df2 = pipeline.annotate(s_df, "review_text")
s_df2 = s_df2.drop(
    *["document", "sentence", "token", "spell", "stems", "pos", "text"])


def mkString(line):
    return " ".join([str(x[3]) for x in line])


string_udf = udf(lambda z: mkString(z), StringType())
s_df2 = s_df2.withColumn("lemmatizedText", string_udf("lemmas"))
s_df2 = s_df2.withColumn("lemmatizedText", lower(col("lemmatizedText")))

# define the 4 processing steps and execute them with a transformation pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml import Pipeline
## LEMMATIZATION

# 1. Tokenizer, .setPattern("\\p{L}+") means that it removes accents from words (check that it has no impact on the smileys!)
tokenizer = RegexTokenizer().setGaps(False).setPattern("\\p{L}+")
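
# A minimal sketch of the remaining three of the 4 steps mentioned above (stop-word
# removal, CountVectorizer, IDF), chained in a Pipeline; the column names used here
# are assumptions, not from the original snippet.
tokenizer.setInputCol("lemmatizedText").setOutputCol("words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
cv = CountVectorizer(inputCol="filtered", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")
tfidf_pipeline = Pipeline(stages=[tokenizer, remover, cv, idf])
s_df3 = tfidf_pipeline.fit(s_df2).transform(s_df2)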
Example #7
# COMMAND ----------

display(df.select("text"))

# COMMAND ----------

# MAGIC %md #### Extraction

# COMMAND ----------

import sparknlp 
from sparknlp.pretrained import PretrainedPipeline 

pipeline = PretrainedPipeline('recognize_entities_dl_noncontrib', 'en')
result = pipeline.annotate(df, column = 'text') 

# COMMAND ----------

result.cache().count()

# COMMAND ----------

from pyspark.sql.functions import explode, col
from wordcloud import WordCloud, STOPWORDS

import matplotlib.pyplot as plt

exploded = result.select(explode(col('entities.result')).alias("entities"))

l = list(exploded.toPandas()["entities"])
Example #8
def extract_sentiment(aws_conn_id: str, tweets_path: str, summary_path: str,
                      language: str, **kwargs):
    aws_hook = AwsHook(aws_conn_id=aws_conn_id)
    aws_credentials = aws_hook.get_credentials()

    spark = (
        SparkSession.builder
        .master("local[*]")
        .appName("Analyse sentiment of given tweets")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.kryoserializer.buffer.max", "1000M")
        # .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.0,"
        #                                "org.apache.hadoop:hadoop-common:3.2.0,"
        #                                "org.apache.hadoop:hadoop-annotations:3.2.0,"
        #                                "org.apache.hadoop:hadoop-auth:3.2.0,"
        #                                "org.apache.hadoop:hadoop-client:3.2.0")
        .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5")
        .config("spark.hadoop.fs.s3a.access.key", aws_credentials.access_key)
        .config("spark.hadoop.fs.s3a.secret.key", aws_credentials.secret_key)
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("spark.hadoop.fs.s3a.endpoint", "s3-eu-central-1.amazonaws.com")
        .config("spark.hadoop.fs.s3a.path.style.access", "true")
        .config("spark.executor.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true")
        .config("spark.driver.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true")
        .getOrCreate())

    year = kwargs['execution_date'].year
    month = kwargs['execution_date'].month
    day = kwargs['execution_date'].day
    tweets_path = f'{tweets_path}/{year:04d}/{month:02d}/{day:02d}/*.jsonl.gz'
    summary_path = f'{summary_path}/{year:04d}-{month:02d}-{day:02d}.jsonl'

    logging.info(f'Reading tweets from: {tweets_path}')
    tweets = spark.read.json(tweets_path)

    english_tweets_only = tweets.select('full_text').where(
        tweets.lang == language)
    original_english_tweets_only = english_tweets_only.where(
        ~english_tweets_only.full_text.startswith('RT @'))

    sentiment_pipeline = PretrainedPipeline('analyze_sentiment', language)
    analysed_tweets = sentiment_pipeline.annotate(original_english_tweets_only,
                                                  column='full_text')

    main_sentiment = udf(lambda col: Counter(col).most_common(1)[0][0],
                         StringType())

    tweets_with_overall_sentiment = (analysed_tweets.withColumn(
        'overall_sentiment',
        main_sentiment(analysed_tweets.sentiment.result)).drop(
            'document', 'sentence', 'token', 'checked'))

    tweets_sentiment_summary = tweets_with_overall_sentiment.groupBy(
        'overall_sentiment').count()
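    # tweets_sentiment_summary holds one row per sentiment label with its count; it is
    # collapsed below into a single summary record with renamed count keys and
    # date/language metadata.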

    tweets_sentiment_record = dict(
        tweets_sentiment_summary.rdd.map(
            lambda r: (r['overall_sentiment'], r['count'])).collect())
    tweets_sentiment_record[
        'tweets_sentiment_id'] = f'{year:04d}-{month:02d}-{day:02d}({language})'
    tweets_sentiment_record['year'] = year
    tweets_sentiment_record['month'] = month
    tweets_sentiment_record['day'] = day
    tweets_sentiment_record['language'] = language
    tweets_sentiment_record['positive_count'] = tweets_sentiment_record[
        'positive']
    tweets_sentiment_record['negative_count'] = tweets_sentiment_record[
        'negative']
    tweets_sentiment_record['na_count'] = tweets_sentiment_record['na']
    del tweets_sentiment_record['positive']
    del tweets_sentiment_record['negative']
    del tweets_sentiment_record['na']

    logging.info(
        f'Extracted sentiment summary for {year:04d}-{month:02d}-{day:02d}: {tweets_sentiment_record}'
    )

    tweets_sentiment = spark.createDataFrame([tweets_sentiment_record])
    tweets_sentiment.write.json(summary_path, mode='overwrite')
Example #9
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.embeddings import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[4]")\
    .config("spark.driver.memory","16G")\
    .config("spark.driver.maxResultSize", "2G") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5")\
    .config("spark.kryoserializer.buffer.max", "1000M")\
    .getOrCreate()

# Download a pre-trained pipeline
pipeline = PretrainedPipeline('explain_document_dl', lang='en')

# Your testing dataset
text = """
The Mona Lisa is a 16th century oil painting created by Leonardo. 
It’s held at the Louvre in Paris.
"""

# Annotate your testing dataset
result = pipeline.annotate(text)
# What’s in the pipeline
print(list(result.keys()))
print(result['entities'])
Example #10
spark = SparkSession \
    .builder \
    .master('local') \
    .appName('Spark_NLP') \
    .config('spark.jars', 'spark-nlp-assembly-2.6.1.jar') \
    .getOrCreate()

#LOAD DATA

df = spark.read.json('Data/*.json')
df.printSchema()

#EXTRACT DATA FROM DF (STRUCT)

author = 'data.author'
title = 'data.title'
date = 'data.created_utc'

dfAuthorTitle = df.select(author, title, F.to_timestamp(F.from_unixtime(date)))
dfAuthorTitle.limit(5).toPandas()

#COUNT WORDS

dfWordCount = df.select(F.explode(F.split(
    title,
    '\\s+')).alias('word')).groupBy('word').count().orderBy(F.desc('count'))
dfWordCount.limit(10).toPandas()

#USE NLP MODULE

# A pipeline instance is needed here; the original snippet does not name one,
# so 'explain_document_dl' is an assumed placeholder.
pipeline = PretrainedPipeline('explain_document_dl', lang='en')
dfAnnotated = pipeline.annotate(dfAuthorTitle, 'title')
Example #11
dfSpark = dfSpark.withColumn(
    "label", dfSpark.Sentimiento.cast('float')).drop('Sentimiento')

# COMMAND ----------

dfSpark.orderBy(rand()).show(10)

# COMMAND ----------

length = dfSpark.count()
lista_sentimientos = []
#To evaluate the whole dataset, just replace the argument of range with the length variable
for i in range(3):
    columna_content = dfSpark.select('content').collect()[i].__getitem__(
        "content")
    pred = pipeline.annotate(columna_content)
    if pred['sentiment'][0] == 'positive':
        lista_sentimientos.append('1')
    elif pred['sentiment'][0] == 'negative':
        lista_sentimientos.append('0')
    #print(pred['sentiment'])
#print(lista_sentimientos)
spark.createDataFrame(lista_sentimientos, StringType()).show()
#print(dfSpark.show())

# COMMAND ----------

len_udf = udf(lambda s: len(s), IntegerType())
dfSpark = dfSpark.withColumn("token_count", len_udf(col('refined_tokens')))
dfSpark.orderBy(rand()).show(10)