def test_API():
    from sparknlp.pretrained import PretrainedPipeline

    ner_pipeline = PretrainedPipeline("ner_model_finder", "en", "clinical/models")
    result = ner_pipeline.annotate("medication")
    print(result)
    return result
def main():
    spark, sc = init_spark()

    # Download a pre-trained pipeline
    pipeline = PretrainedPipeline('explain_document_dl', lang='en')

    # Your testing dataset
    text = """
    The Mona Lisa is a 16th century oil painting created by Leonardo.
    It's held at the Louvre in Paris.
    """

    # Annotate your testing dataset
    result = pipeline.annotate(text)

    # What's in the pipeline
    print(list(result.keys()))
    print(result['entities'])
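# main() relies on an init_spark() helper and a PretrainedPipeline import that are
# not shown in this snippet. Below is a minimal sketch of what init_spark() might
# look like, assuming Spark NLP is bootstrapped via sparknlp.start(); the helper
# name and (spark, sc) return signature are taken from the call in main() above.
import sparknlp
from sparknlp.pretrained import PretrainedPipeline


def init_spark():
    # sparknlp.start() returns a SparkSession with the Spark NLP jars attached
    spark = sparknlp.start()
    sc = spark.sparkContext
    return spark, sc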
class ReplaceTerms():
    """A class to generate sentence perturbations by replacement

    ...

    Methods
    ----------
    replace_terms(sentence, importance_scores, num_replacements,
                  num_output_sents, sampling_strategy, sampling_k)
        Generate perturbed sentences by replacing terms in the input sentence
    """
    global SPARK_NLP_ENABLED

    def __init__(self, rep_type: str = 'synonym', use_ner: bool = True):
        """Instantiate a ReplaceTerms object

        Parameters
        ----------
        rep_type : Optional(str)
            The type of target perturbation. May be `synonym` for word2vec
            replacement, `mlmsynonym` for MLM-based replacement, or
            `misspelling` for misspelling replacement.
        use_ner : Optional(bool)
            Flag specifying whether to use entity-aware replacement. If True,
            named entities are zeroed when calculating the sampling weights
            for any perturbation, and the NER model is loaded here.
            The default value is True.
        """
        self.use_ner = use_ner if SPARK_NLP_ENABLED else False
        self.rep_type = rep_type
        if rep_type not in ['synonym', 'misspelling', 'mlmsynonym']:
            logger.error('{}:ReplaceTerms __init__ invalid rep_type'.format(
                __file__.split('/')[-1]))
            raise ValueError('Not an accepted generator type')

        self._generator = self._get_generator(rep_type)
        if not self._generator:
            raise RuntimeError('Unable to init generator')

        if self.use_ner:
            try:
                spark = sparknlp.start()
                self._ner_pipeline = PretrainedPipeline(
                    'recognize_entities_dl', lang='en')
            except Exception as e:
                logger.error(
                    '{}:ReplaceTerms __init__ unable to load NER pipeline'.format(
                        __file__.split('/')[-1]))
                raise RuntimeError('Unable to load ner pkg')

    def _get_entities(self, sentence: str):
        """Tokenize the sentence and return (mask, tokens), where the mask
        flags named entities and non-alphanumeric tokens."""
        if self.use_ner:
            # Use the spark-nlp tokenizer for an entity-aware mask
            allowed_tags = ['PER', 'LOC', 'ORG', 'MISC']

            # Annotate the sentence
            result = self._ner_pipeline.annotate(sentence)
            toks = result['token']
            mask = [
                1 if (any([y in x for y in allowed_tags])
                      or not toks[i].isalnum()) else 0
                for i, x in enumerate(result['ner'])
            ]
        else:
            # Use the simple NLTK tokenizer
            toks = nltk.word_tokenize(sentence)
            mask = [0] * len(toks)
        return mask, toks

    def _get_generator(self, name: str = None):
        if name == 'synonym':
            try:
                _syn = SynonymReplace()
                return _syn
            except Exception as e:
                logger.error(
                    '{}:replace_terms: unable to load word vectors'.format(
                        __file__.split('/')[-1]))
        elif name == 'misspelling':
            try:
                _missp = MisspReplace()
                return _missp
            except Exception as e:
                logger.error(
                    '{}:replace_terms: unable to load misspellings'.format(
                        __file__.split('/')[-1]))
        elif name == 'mlmsynonym':
            try:
                _syn = MLMSynonymReplace()
                return _syn
            except Exception as e:
                logger.error(
                    '{}:replace_terms: unable to load word vectors'.format(
                        __file__.split('/')[-1]))
        return

    def replace_terms(self,
                      sentence: str,
                      importance_scores: List = None,
                      num_replacements: int = 1,
                      num_output_sents: int = 1,
                      sampling_strategy: str = 'random',
                      sampling_k: int = None) -> List:
        """Generate a number of sentence perturbations by replacement,
        using either misspellings or synonyms

        Parameters
        ----------
        sentence : str
            The input sentence to be perturbed.
        importance_scores : Optional(List)
            List of tuples defining a weight for each term in the tokenized
            sentence. These weights are used during sampling to influence
            perturbation probabilities. If None, uniform sampling is used
            by default.
        num_replacements : Optional(int)
            Target number of terms to replace in the original sentence. The
            number is chosen randomly using the target as an upper bound and
            a lower bound of 1. The default is 1.
        num_output_sents : Optional(int)
            Target number of perturbed sentences to generate based on the
            original sentence. The default is 1.
        sampling_strategy : Optional(str)
            Strategy used to sample terms to perturb in the original
            sentence. The default is `random`. If importance_scores is
            given, then sampling_strategy may be `topK` or `bottomK`, in
            which case the importance_scores (or inverted scores) vector is
            used for weighted sampling.
        sampling_k : Optional(int)
            The number of terms in the importance score vector to include
            in topK or bottomK sampling. This parameter is not used by the
            default sampling_strategy, `random` sampling.

        Returns
        -------
        [str]
            Returns a list of perturbed sentences for the input sentence.

        Example
        -------
        >>> from term_replacement import ReplaceTerms
        >>> p = ReplaceTerms(rep_type="synonym")
        >>> sent = "I was born in a small town"
        >>> num_terms = 1
        >>> num_output_sents = 1
        >>> p.replace_terms(sent, num_replacements=num_terms,
        ...                 num_output_sents=num_output_sents)
        ['I born in a small village']

        >>> from term_replacement import ReplaceTerms
        >>> p = ReplaceTerms(rep_type="misspelling")
        >>> sent = "I was born in a small town"
        >>> num_terms = 1
        >>> num_output_sents = 1
        >>> p.replace_terms(sent, num_replacements=num_terms,
        ...                 num_output_sents=num_output_sents)
        ['I born in a smal town']
        """
        inputs = validate_inputs(num_replacements, num_output_sents,
                                 sampling_strategy)
        num_replacements = inputs.pop(0)
        num_output_sents = inputs.pop(0)
        sampling_strategy = inputs.pop(0)

        # Extract entities in the input sentence to mask
        masked_vector, tokens = self._get_entities(sentence)

        # Check if there are enough candidate terms
        if num_replacements > (len(masked_vector) - sum(masked_vector)):
            logger.warning(
                '{}:replace_terms: unable to generate num_replacements - {} of ({})'
                .format(
                    __file__.split('/')[-1], num_replacements,
                    len(masked_vector) - sum(masked_vector)))
            num_replacements = len(masked_vector) - sum(masked_vector)

        if self.rep_type == 'misspelling':
            remove_stop = True
        else:
            remove_stop = False

        # Initialize sampling scores
        importance_scores = get_scores(tokens, sampling_strategy, sampling_k,
                                       importance_scores, remove_stop)
        if not importance_scores:
            return []

        # Add index and mask to importance scores
        term_score_index = [(word[0], i, masked_vector[i])
                            for i, word in enumerate(importance_scores)]

        # Store only scores for later sampling
        importance_scores = [
            x[1] if not masked_vector[i] else 0  # set masked scores to zero
            for i, x in enumerate(importance_scores)
        ]

        # Candidate terms for synonym replacement
        rep_term_indices = [w[1] for w in term_score_index if not w[2]]

        # Create List of Lists of term variants
        generated = {x[0]: None for x in term_score_index}
        generated = {
            x[0]: self._generator.generate(x[0].lower(), 10, **{
                'toks': tokens,
                'token_idx': i
            })
            for i, x in enumerate(term_score_index) if not x[2]
        }
        term_variants = {
            x[0]: generated.get(x[0], [])
            if (i in rep_term_indices and not masked_vector[i]) else []
            for i, x in enumerate(term_score_index)
        }

        # Check if there are enough candidate terms
        if not term_variants:
            logger.warning(
                '{}:replace_terms: unable to generate num_variants - {} of ({})'
                .format(
                    __file__.split('/')[-1], num_replacements,
                    len(term_variants) - sum(masked_vector)))
        else:
            term_variants = {
                k: [x[0].upper() + x[1:] for x in v] if k[0].isupper() else v
                for k, v in term_variants.items()
            }

        # Set scores to zero for all terms w/o synonyms
        importance_scores = [
            x if (term_score_index[i][0] in term_variants
                  and len(term_variants[term_score_index[i][0]]) > 0) else 0
            for i, x in enumerate(importance_scores)
        ]

        # Renormalize
        if sum(importance_scores) == 0:
            return []  # avoid division by 0 error
        importance_scores = [
            x / sum(importance_scores) for x in importance_scores
        ]

        # Resize num_replacements to avoid p-sampling errors
        nonzero_entries = sum([x > 0. for x in importance_scores])
        if num_replacements > nonzero_entries:
            num_replacements = nonzero_entries
        '''
        # DEBUG
        # Create a List of Lists of all variants
        candidate_variants = [ v+[k] for k,v in term_variants.items() ]

        # Check the total number of variants
        candidate_sents = list( itertools.product(*candidate_variants) )

        # Set number of output variants to the total possible
        if len(candidate_sents) < num_output_sents:
            num_output_sents = len(candidate_sents)
        '''
        if not term_variants or len(
                [x for x in term_score_index if x[2] == 0]) == 0:
            raise Exception('no term variants or term_score_index')

        max_attempts = 50
        counter = 0
        new_sentences = set()
        while len(new_sentences) < num_output_sents:
            if counter > max_attempts:
                break

            # Select terms to replace
            rnd_indices = np.random.choice(len(term_score_index),
                                           size=num_replacements,
                                           replace=False,
                                           p=importance_scores)
            replace_terms = [term_score_index[i][0] for i in rnd_indices]

            # Create List of Lists of term variants
            term_combinations = [
                term_variants.get(x[0], [x[0]])
                if x[0] in replace_terms else [x[0]]
                for i, x in enumerate(term_score_index)
            ]

            # Generate combinatorial variants
            candidate_sents = list(itertools.product(*term_combinations))
            for sent in candidate_sents:
                new_sentences.add(' '.join(sent))
            counter += 1

        # Shuffle permutations, sanitize and slice
        new_sentences = list(new_sentences)
        random.shuffle(new_sentences)
        new_sentences = [
            re.sub(r'([A-Za-z0-9])(\s+)([^A-Za-z0-9])', r'\1\3',
                   x.replace('\' s ', '\'s '))
            for x in new_sentences[:num_output_sents]
        ]
        new_sentences = [x for x in new_sentences if x != sentence]

        if len(new_sentences) < num_output_sents:
            logger.debug(
                '{}:replace_terms: unable to generate num_output_sents - {} of ({})'
                .format(
                    __file__.split('/')[-1], len(new_sentences),
                    num_output_sents))

        return new_sentences
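# A hedged usage sketch of the API documented above: weighted replacement with
# topK sampling. The (term, weight) tuples follow the docstring's description of
# importance_scores; the weights and parameter values below are illustrative
# assumptions, not taken from the original code.
if __name__ == '__main__':
    replacer = ReplaceTerms(rep_type='synonym', use_ner=False)
    sent = "I was born in a small town"
    scores = [(tok, 1.0 / len(sent.split())) for tok in sent.split()]  # uniform example weights
    variants = replacer.replace_terms(sent,
                                      importance_scores=scores,
                                      num_replacements=2,
                                      num_output_sents=3,
                                      sampling_strategy='topK',
                                      sampling_k=5)
    print(variants)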
import re

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from sparknlp.pretrained import PretrainedPipeline


def cleantext(text):
    text = re.sub(r'#', '', text)  # Remove '#' hash tags
    text = re.sub(r'RT[\s]+', '', text)  # Remove RT
    text = re.sub(r'https?:\/\/\S+', '', text)  # Remove hyperlinks
    text = re.sub(r':', '', text)  # Remove colons
    text = re.sub(r'(\.|\!|\?|\,)', '', text)  # Remove punctuation
    return text


udf_fun = udf(lambda text: cleantext(text), StringType())
preprocessed_text = twitter_df.select('id',
                                      udf_fun('text').alias('text'),
                                      'user')
preprocessed_text.show()

# Use the pretrained sentiment pipeline
pipeline = PretrainedPipeline("analyze_sentiment")
result = pipeline.annotate(preprocessed_text, column='text')
# result.select("sentiment.result").show()

# Write results to MongoDB
cols = ['id', 'text', 'sentiment.result', 'user']
output = result.select(cols)
# output.show()
output.write \
    .format("com.mongodb.spark.sql.DefaultSource") \
    .mode("append") \
    .option("collection", "sentiment_predicted") \
    .save()
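# twitter_df is not defined in the snippet above. A minimal sketch of how it
# might be loaded with the same MongoDB Spark connector used for the write;
# the "tweets" collection name is an assumption, and spark is assumed to be an
# active SparkSession configured with spark.mongodb.input.uri.
twitter_df = spark.read \
    .format("com.mongodb.spark.sql.DefaultSource") \
    .option("collection", "tweets") \
    .load()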
from pyspark.sql import functions as F
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl
import sparknlp

import warnings
warnings.filterwarnings('ignore')

params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M"
}

print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

spark = sparknlp_jsl.start(SECRET, params=params)

from sparknlp.pretrained import PretrainedPipeline

ner_pipeline = PretrainedPipeline("ner_model_finder", "en", "clinical/models")
result = ner_pipeline.annotate("medication")

print(100 * '-')
print(result)
print(100 * '-')
## LEMMATIZATION
import sparknlp
sparknlp.start()

from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
from pyspark.sql.functions import lower, col
from pyspark.ml.feature import StringIndexer

pipeline = PretrainedPipeline('explain_document_ml', 'en')
s_df2 = pipeline.annotate(s_df, "review_text")
s_df2 = s_df2.drop(
    *["document", "sentence", "token", "spell", "stems", "pos", "text"])


def mkString(line):
    return " ".join([str(x[3]) for x in line])


string_udf = udf(lambda z: mkString(z), StringType())
s_df2 = s_df2.withColumn("lemmatizedText", string_udf("lemmas"))
s_df2 = s_df2.withColumn("lemmatizedText", lower(col("lemmatizedText")))

# Define four processing steps and execute them with a transformation pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml import Pipeline

# 1. Tokenizer; .setPattern("\\p{L}+") means it removes accents from words
#    (check that it has no impact on the smileys!)
tokenizer = RegexTokenizer().setGaps(False)\
    .setPattern("\\p{L}+")
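# The comment above announces four processing steps, but the snippet breaks off
# after the tokenizer. A minimal sketch of how the remaining steps might be
# wired, assuming the intermediate column names "words", "filtered",
# "rawFeatures" and "features" (all illustrative, not from the original):
tokenizer = tokenizer.setInputCol("lemmatizedText").setOutputCol("words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered")          # 2. stop-word removal
count_vectorizer = CountVectorizer(inputCol="filtered", outputCol="rawFeatures")  # 3. term counts
idf = IDF(inputCol="rawFeatures", outputCol="features")                     # 4. TF-IDF weighting

ml_pipeline = Pipeline(stages=[tokenizer, remover, count_vectorizer, idf])
features_df = ml_pipeline.fit(s_df2).transform(s_df2)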
# COMMAND ----------

display(df.select("text"))

# COMMAND ----------

# MAGIC %md #### Extraction

# COMMAND ----------

import sparknlp
from sparknlp.pretrained import PretrainedPipeline

pipeline = PretrainedPipeline('recognize_entities_dl_noncontrib', 'en')
result = pipeline.annotate(df, column='text')

# COMMAND ----------

result.cache().count()

# COMMAND ----------

from pyspark.sql.functions import explode, col
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

exploded = result.select(explode(col('entities.result')).alias("entities"))
l = list(exploded.toPandas()["entities"])
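# COMMAND ----------

# The WordCloud/matplotlib imports above are not used in the snippet; a minimal
# sketch of how the extracted entity list `l` might be rendered as a word cloud
# (parameter values are illustrative, not from the original).
wordcloud = WordCloud(stopwords=STOPWORDS, background_color='white',
                      width=800, height=400).generate(' '.join(l))
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()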
def extract_sentiment(aws_conn_id: str, tweets_path: str, summary_path: str,
                      language: str, **kwargs):
    aws_hook = AwsHook(aws_conn_id=aws_conn_id)
    aws_credentials = aws_hook.get_credentials()

    spark = (
        SparkSession.builder
        .master("local[*]")
        .appName("Analyse sentiment of given tweets")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.kryoserializer.buffer.max", "1000M")
        # .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.0,"
        #         "org.apache.hadoop:hadoop-common:3.2.0,"
        #         "org.apache.hadoop:hadoop-annotations:3.2.0,"
        #         "org.apache.hadoop:hadoop-auth:3.2.0,"
        #         "org.apache.hadoop:hadoop-client:3.2.0")
        .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5")
        .config("spark.hadoop.fs.s3a.access.key", aws_credentials.access_key)
        .config("spark.hadoop.fs.s3a.secret.key", aws_credentials.secret_key)
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("spark.hadoop.fs.s3a.endpoint", "s3-eu-central-1.amazonaws.com")
        .config("spark.hadoop.fs.s3a.path.style.access", "true")
        .config("spark.executor.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true")
        .config("spark.driver.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true")
        .getOrCreate())

    year = kwargs['execution_date'].year
    month = kwargs['execution_date'].month
    day = kwargs['execution_date'].day

    tweets_path = f'{tweets_path}/{year:04d}/{month:02d}/{day:02d}/*.jsonl.gz'
    summary_path = f'{summary_path}/{year:04d}-{month:02d}-{day:02d}.jsonl'

    logging.info(f'Reading tweets from: {tweets_path}')
    tweets = spark.read.json(tweets_path)

    english_tweets_only = tweets.select('full_text').where(
        tweets.lang == language)
    original_english_tweets_only = english_tweets_only.where(
        ~english_tweets_only.full_text.startswith('RT @'))

    sentiment_pipeline = PretrainedPipeline('analyze_sentiment', language)
    analysed_tweets = sentiment_pipeline.annotate(original_english_tweets_only,
                                                  column='full_text')

    main_sentiment = udf(lambda col: Counter(col).most_common(1)[0][0],
                         StringType())
    tweets_with_overall_sentiment = (analysed_tweets.withColumn(
        'overall_sentiment',
        main_sentiment(analysed_tweets.sentiment.result)).drop(
            'document', 'sentence', 'token', 'checked'))

    tweets_sentiment_summary = tweets_with_overall_sentiment.groupBy(
        'overall_sentiment').count()
    tweets_sentiment_record = dict(
        tweets_sentiment_summary.rdd.map(
            lambda r: (r['overall_sentiment'], r['count'])).collect())

    tweets_sentiment_record[
        'tweets_sentiment_id'] = f'{year:04d}-{month:02d}-{day:02d}({language})'
    tweets_sentiment_record['year'] = year
    tweets_sentiment_record['month'] = month
    tweets_sentiment_record['day'] = day
    tweets_sentiment_record['language'] = language

    # Rename the sentiment class counts; pop() with a default avoids a KeyError
    # when a class is missing for a given day.
    tweets_sentiment_record['positive_count'] = tweets_sentiment_record.pop('positive', 0)
    tweets_sentiment_record['negative_count'] = tweets_sentiment_record.pop('negative', 0)
    tweets_sentiment_record['na_count'] = tweets_sentiment_record.pop('na', 0)

    logging.info(
        f'Extracted sentiment summary for {year:04d}-{month:02d}-{day:02d}: {tweets_sentiment_record}'
    )

    tweets_sentiment = spark.createDataFrame([tweets_sentiment_record])
    tweets_sentiment.write.json(summary_path, mode='overwrite')
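# A hedged sketch of how extract_sentiment might be wired into an Airflow 1.x DAG
# (the AwsHook and **kwargs['execution_date'] usage above suggest that setup).
# The task id, connection id, S3 paths and dag object are illustrative assumptions.
from airflow.operators.python_operator import PythonOperator

extract_sentiment_task = PythonOperator(
    task_id='extract_sentiment',
    python_callable=extract_sentiment,
    provide_context=True,  # passes execution_date via **kwargs (Airflow 1.x style)
    op_kwargs={
        'aws_conn_id': 'aws_credentials',
        'tweets_path': 's3a://my-bucket/tweets',
        'summary_path': 's3a://my-bucket/sentiment-summaries',
        'language': 'en',
    },
    dag=dag,
)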
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.embeddings import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Spark NLP") \
    .master("local[4]") \
    .config("spark.driver.memory", "16G") \
    .config("spark.driver.maxResultSize", "2G") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5") \
    .config("spark.kryoserializer.buffer.max", "1000M") \
    .getOrCreate()

# Download a pre-trained pipeline
pipeline = PretrainedPipeline('explain_document_dl', lang='en')

# Your testing dataset
text = """
The Mona Lisa is a 16th century oil painting created by Leonardo.
It's held at the Louvre in Paris.
"""

# Annotate your testing dataset
result = pipeline.annotate(text)

# What's in the pipeline
print(list(result.keys()))
print(result['entities'])
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from sparknlp.pretrained import PretrainedPipeline

spark = SparkSession \
    .builder \
    .master('local') \
    .appName('Spark_NLP') \
    .config('spark.jars', 'spark-nlp-assembly-2.6.1.jar') \
    .getOrCreate()

# LOAD DATA
df = spark.read.json('Data/*.json')
df.printSchema()

# EXTRACT DATA FROM DF (STRUCT)
author = 'data.author'
title = 'data.title'
date = 'data.created_utc'
dfAuthorTitle = df.select(author, title,
                          F.to_timestamp(F.from_unixtime(date)))
dfAuthorTitle.limit(5).toPandas()

# COUNT WORDS
dfWordCount = df.select(F.explode(F.split(
    title, '\\s+')).alias('word')).groupBy('word').count().orderBy(
        F.desc('count'))
dfWordCount.limit(10).toPandas()

# USE NLP MODULE
# annotate() must be called on a PretrainedPipeline instance, not on the class;
# the pipeline name below is an illustrative assumption.
pipeline = PretrainedPipeline('explain_document_dl', lang='en')
dfAnnotated = pipeline.annotate(dfAuthorTitle, 'title')
dfSpark = dfSpark.withColumn(
    "label", dfSpark.Sentimiento.cast('float')).drop('Sentimiento')

# COMMAND ----------

dfSpark.orderBy(rand()).show(10)

# COMMAND ----------

length = dfSpark.count()
lista_sentimientos = []

# To evaluate the whole dataset, replace the range() argument with the `length` variable
for i in range(3):
    columna_content = dfSpark.select('content').collect()[i]['content']
    pred = pipeline.annotate(columna_content)
    if pred['sentiment'][0] == 'positive':
        lista_sentimientos.append('1')
    elif pred['sentiment'][0] == 'negative':
        lista_sentimientos.append('0')
    # print(pred['sentiment'])

# print(lista_sentimientos)
spark.createDataFrame(lista_sentimientos, StringType()).show()
# print(dfSpark.show())

# COMMAND ----------

len_udf = udf(lambda s: len(s), IntegerType())
dfSpark = dfSpark.withColumn("token_count", len_udf(col('refined_tokens')))
dfSpark.orderBy(rand()).show(10)
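# COMMAND ----------

# A hedged alternative to the row-by-row collect() loop above: a pretrained
# pipeline can also be applied to the whole DataFrame with transform(). This
# assumes the pipeline's document assembler reads from a column named "text",
# so the "content" column is renamed first (column names are assumptions).
from pyspark.sql import functions as F

annotated_df = pipeline.transform(dfSpark.withColumnRenamed('content', 'text'))
annotated_df.select(F.col('sentiment.result').alias('sentiment_result')).show(10)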