import inspect
from itertools import chain

from pyspark.ml.feature import StopWordsRemover


def task_four(ngram):
    """
    Set the ngram value.

    :param ngram: n-gram size (prompted for interactively)
    :return: None
    """
    # Introspect this function's signature and prompt for each parameter.
    # getargspec() was removed in Python 3.11; use getfullargspec() instead.
    params = list(inspect.getfullargspec(task_four))
    p = list(chain.from_iterable(i for i in params if i is not None))
    param_values = {}
    if len(p) > 0:
        for i, v in enumerate(p):
            try:
                # raw_input() is Python 2 only; input() is the Python 3 equivalent
                value = input("Please enter a value for {} ==> ".format(v))
                param_values.update({v: value})
            except (EOFError, KeyboardInterrupt):
                pass
    ngram = param_values.get(p[0])
    if int(ngram) == 2:
        # --- custom list of stopwords (note: 'a' was originally ' a ', which
        # could never match a whitespace-free token)
        stopwords = {
            'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
            'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
            'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
            'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
            'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is',
            'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
            'had', 'having', 'do', 'does', 'did', 'doing', 'an', 'the',
            'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
            'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
            'into', 'through', 'during', 'before', 'after', 'above', 'below',
            'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over',
            'under', 'again', 'further', 'then', 'once', 'here', 'there',
            'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',
            'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor',
            'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
            'can', 'will', 'just', 'don', 'should', 'now', 'a', 'insured',
            'sured', 'coverage', 'year', 'dob', 'insd', 'left'
        }

        # --- remove stop words: extend Spark's default list with the custom
        # one (the original overwrote `stopwords` with the defaults, silently
        # discarding the custom set above)
        REMOVER = StopWordsRemover()
        REMOVER.setStopWords(REMOVER.getStopWords() + sorted(stopwords))
        REMOVER.setInputCol("inter_wordlist")
        REMOVER.setOutputCol("inter_wordlist_two")
        # VECTOR_DATAFRAME and its columns come from earlier pipeline steps
        # defined elsewhere in this module
        stpwrds_rmvd_sdf = REMOVER.transform(VECTOR_DATAFRAME) \
            .select(["Claim_Id", "filename", "inter_wordlist_two"])
    else:
        pass
from nltk.stem.porter import PorterStemmer
from pyspark.ml.feature import StopWordsRemover, Tokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, StringType


def tokenize_df(df):
    """Clean, tokenize, remove stop words, and stem the 'text' column."""
    tokenizer = Tokenizer(inputCol="text", outputCol="vector")
    remover = StopWordsRemover()
    remover.setInputCol("vector")
    remover.setOutputCol("vector_no_stopw")
    stemmer = PorterStemmer()
    # the original called an undefined stem(x); stem each token individually
    stemmer_udf = udf(lambda tokens: [stemmer.stem(t) for t in tokens],
                      ArrayType(StringType()))
    # clean_text is a UDF defined elsewhere in this project
    df = df.select(clean_text(col("text")).alias("text"))
    df = tokenizer.transform(df).select("vector")
    df = remover.transform(df).select("vector_no_stopw")
    df = (df
          .withColumn("vector_stemmed", stemmer_udf("vector_no_stopw"))
          .select("vector_stemmed"))
    return df
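# --- A minimal usage sketch for tokenize_df (illustrative, not part of the
# original): assumes an active SparkSession named `spark` and the clean_text
# UDF referenced above; the sample sentence is made up.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("tokenize-demo").getOrCreate()
sample_df = spark.createDataFrame(
    [("The quick brown foxes were jumping over the lazy dogs",)], ["text"])
tokenize_df(sample_df).show(truncate=False)
# Expected: one column "vector_stemmed" of stemmed tokens,
# roughly ["quick", "brown", "fox", "jump", "lazi", "dog"]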
def init_base_df(file_path=default_file_path):
    # Set legacy parsing, as Spark 3.0+ cannot use 'E' (day-of-week) in
    # timestamp patterns without it
    spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

    # the original printed default_file_path even when a custom path was given
    print("Loading", file_path)
    raw_df = (
        spark.read.format("csv")
        .option("inferSchema", True)
        .load(file_path)
        .toDF("polarity", "tweet_id", "datetime", "query", "user", "text")
    )

    # Parse the string datetime column into a proper timestamp
    time_parsed_df = raw_df.withColumn(
        "timestamp", to_timestamp("datetime", "EEE MMM dd HH:mm:ss zzz yyyy")
    )
    df = time_parsed_df.drop("query").drop("datetime")

    # Shift polarity from the range [0, 4] to [-1, 1]
    scaled_polarity_df = df.withColumn(
        "sentiment", (col("polarity") / 2) - 1
    ).drop("polarity")

    # clean_text is a UDF defined elsewhere in this project
    clean_text_df = df.select(clean_text(col("text")).alias("text"), "tweet_id")

    tokenizer = Tokenizer(inputCol="text", outputCol="vector")
    vector_df = tokenizer.transform(clean_text_df).select("vector", "tweet_id")

    remover = StopWordsRemover()
    remover.setInputCol("vector")
    remover.setOutputCol("tokens")
    tokens_no_stopw_df = remover.transform(vector_df).select("tokens", "tweet_id")

    tweets_with_tokens_df = scaled_polarity_df.join(tokens_no_stopw_df, on=["tweet_id"])
    return tweets_with_tokens_df
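# --- A small self-contained check of the timestamp pattern used above
# (illustrative, not part of the original): assumes an active SparkSession
# named `spark`; the sample value follows the same
# "EEE MMM dd HH:mm:ss zzz yyyy" layout as the input data.
from pyspark.sql.functions import to_timestamp

spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
demo_df = spark.createDataFrame([("Mon Apr 06 22:19:45 PDT 2009",)], ["datetime"])
demo_df.withColumn(
    "timestamp", to_timestamp("datetime", "EEE MMM dd HH:mm:ss zzz yyyy")
).show(truncate=False)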
def spark_transformation_comments(filename_read_S3, filename_write_elastic,
                                  filename_write_S3):
    """
    Columns in input:
        'archived', 'author', 'author_flair_css_class', 'author_flair_text',
        'body', 'controversiality', 'created_utc', 'distinguished', 'downs',
        'edited', 'gilded', 'id', 'link_id', 'name', 'parent_id',
        'retrieved_on', 'score', 'score_hidden', 'subreddit',
        'subreddit_id', 'ups'

    :param filename_read_S3: file to read from
    :param filename_write_elastic: output file for Elasticsearch
    :param filename_write_S3: cleaned files to S3
    :return: None
    """
    # ---------------------------------------------
    # -------- BASIC TRANSFORMATIONS --------------
    # ---------------------------------------------
    logger.info("Stage 1: read file into Dataframe from S3")
    comments_df1 = sqlContext.read.parquet(filename_read_S3)
    columns = comments_df1.columns
    logger.info("List of columns for Comments - {0}".format(columns))

    logger.info("Stage 2: select required columns from data")
    # NOTE: the 'downs' and 'name' columns are only available from 2006-06 on
    if 'downs' in columns and 'name' in columns:
        comments_df2 = comments_df1.select(
            'subreddit', 'subreddit_id', 'created_utc', 'author', 'id',
            'link_id', 'parent_id', 'body', 'controversiality',
            'distinguished', 'gilded', 'score', 'ups', 'downs', 'name')
    else:
        comments_df2 = comments_df1.select(
            'subreddit', 'subreddit_id', 'created_utc', 'author', 'id',
            'link_id', 'parent_id', 'body', 'controversiality',
            'distinguished', 'gilded', 'score', 'ups')

    logger.info("Stage 3: remove rows where the post has been deleted")
    # TODO: also exclude '[removed]' posts
    comments_df3 = comments_df2.filter(comments_df2['author'] != '[deleted]')

    # Create and register trim_link as a UDF (for both SQL and DataFrame use)
    spark.udf.register("trimlinks", trim_link, StringType())
    trim_link_udf = udf(trim_link)

    logger.info("Stage 4: get submission_id from link_id")
    comments_df4 = comments_df3.withColumn("submission_id",
                                           trim_link_udf(col("link_id")))

    # ---------------------------------------------
    # -------- FEATURE ENGINEERING ----------------
    # ---------------------------------------------
    # Factor the repeated column lists out once; note that they assume
    # 'downs' and 'name' are present (data from 2006-06 on, per Stage 2)
    base_cols = ['subreddit', 'subreddit_id', 'author', 'id', 'parent_id',
                 'body', 'controversiality', 'distinguished', 'gilded',
                 'score', 'ups', 'downs', 'name', 'submission_id']
    time_cols = ['year', 'month', 'day', 'day_of_year', 'hour', 'min',
                 'week_of_year']

    logger.info("Stage 5: convert 'created_utc' to a timestamp")
    comments_df5 = comments_df4.select(
        *base_cols, from_unixtime('created_utc').alias('timestamp'))

    logger.info("Stage 6: add new features: year, month, day, day of year, "
                "hour, minute, week of year")
    comments_df6 = comments_df5.select(
        *base_cols,
        year(comments_df5.timestamp).alias('year'),
        month(comments_df5.timestamp).alias('month'),
        dayofmonth(comments_df5.timestamp).alias('day'),
        dayofyear(comments_df5.timestamp).alias('day_of_year'),
        hour(comments_df5.timestamp).alias('hour'),
        minute(comments_df5.timestamp).alias('min'),
        weekofyear(comments_df5.timestamp).alias('week_of_year'))

    # ---------------------------------------
    # PERSIST data for the following reasons:
    # ---------------------------------------
    # 1. Write data to Elasticsearch after ETL
    # 2. Perform NLP-based data cleaning for comments
    # 3. Identify popular words
    # 4. Load NLP-cleaned data to S3
    # 5. Load words to Elasticsearch
    comments_df6.persist(StorageLevel.MEMORY_AND_DISK_SER)
    logger.info("persisted data after initial cleaning")

    # -------------------------------------------
    # Write to Elasticsearch: NDJSON file
    # -------------------------------------------
    logger.info("starting transforming data to NDJSON - for large ES load")
    nd_json = comments_df6.rdd.map(elastic_search_mapper_body)
    logger.info("completed transformation to NDJSON")

    logger.info("save data as text file")
    if not os.path.exists(filename_write_elastic):
        nd_json.saveAsTextFile(filename_write_elastic)
        ES_WRITE_STATUS = True
    else:
        logger.info("data already loaded")

    # -----------------------------
    # NLP transformations pipeline
    # -----------------------------
    logger.info("Stage 7: remove punctuation")
    # removePunctuation is defined elsewhere; alias its output so the
    # tokenizer below finds its input column
    comments_df7 = comments_df6.select(
        *base_cols, *time_cols,
        removePunctuation(col('body')).alias('cleaned_body'))

    logger.info("Stage 8: word tokenization")
    tokenizer = Tokenizer(inputCol="cleaned_body", outputCol="tokenized_body")
    comments_df8 = tokenizer.transform(comments_df7).select(
        *base_cols, *time_cols, 'tokenized_body')

    logger.info("Stage 9: stop-word removal using Spark's default stopwords")
    remover = StopWordsRemover()
    remover.setInputCol("tokenized_body")
    remover.setOutputCol("no_stop_words_body")
    comments_df9 = remover.transform(comments_df8).select(
        *base_cols, *time_cols, 'no_stop_words_body')

    logger.info("Stage 10: filter with a custom list of words")
    # TODO: get Reddit frequent words
    spark.udf.register("filterExtraStopWords", filter_stop_words,
                       ArrayType(StringType()))
    # declare the array return type so the column isn't stringified
    filter_stop_words_udf = udf(filter_stop_words, ArrayType(StringType()))
    comments_df10 = comments_df9.select(
        *base_cols, *time_cols,
        filter_stop_words_udf("no_stop_words_body").alias(
            "body_without_stopwords"))

    # the original had comments_df10.logger.infoSchema(), a botched
    # print -> logger.info substitution; printSchema() writes to stdout
    comments_df10.printSchema()

    # -------------------------
    # Upload cleaned data to S3
    # -------------------------
    comments_df10.write.parquet(filename_write_S3)
    logger.info("completed loading the data to S3")
    return
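# --- Hypothetical invocation of the pipeline above; the bucket names and
# paths are placeholders, not values from the original job.
if __name__ == "__main__":
    spark_transformation_comments(
        filename_read_S3="s3a://example-bucket/reddit/comments/2016-01.parquet",
        filename_write_elastic="/tmp/comments_ndjson/2016-01",
        filename_write_S3="s3a://example-bucket/reddit/comments_cleaned/2016-01")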
vector_df.show(10)

"""**3. Remove stop words**"""

from pyspark.ml.feature import StopWordsRemover

# Use Spark's default stop-word list
remover = StopWordsRemover()
stopwords = remover.getStopWords()

# Display the first ten default stop words
stopwords[:10]

# Specify input/output columns
remover.setInputCol("vector")
remover.setOutputCol("Body_no_stopw")

# Transform the existing dataframe with the StopWordsRemover
Body_no_stopw_df = remover.transform(vector_df).select("Body_no_stopw")

# Display
Body_no_stopw_df.printSchema()
Body_no_stopw_df.show()

"""**4. Stemming tokens**"""

# Import the Porter stemmer (the posts were already tokenized above)
from nltk.stem.porter import PorterStemmer

# Instantiate stemmer object
stemmer = PorterStemmer()
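# A sketch of applying the stemmer to the token column above via a UDF.
# The column names follow this notebook's cells; the UDF itself is an
# illustrative addition, not part of the original.
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

stem_udf = udf(lambda tokens: [stemmer.stem(t) for t in tokens],
               ArrayType(StringType()))
stemmed_df = Body_no_stopw_df.withColumn("Body_stemmed",
                                         stem_udf("Body_no_stopw"))
stemmed_df.show()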
def import_data(self):
    # FileReader, detect/DetectorFactory (langdetect), tqdm, and spaCy's
    # STOP_WORDS are imported elsewhere in this project

    # metadata dataframe
    meta_df = pd.read_csv(self.metadata_path, dtype={
        'pubmed_id': str,
        'Microsoft Academic Paper ID': str,
        'doi': str
    })

    # collect all JSON papers
    all_json = glob.glob(f"{self.DEFAULT_INPUT_PATH}/**/*.json", recursive=True)

    dict_ = {'paper_id': [], 'doi': [], 'abstract': [], 'body_text': [],
             'authors': [], 'title': [], 'journal': [], 'abstract_summary': []}

    for idx, entry in enumerate(all_json):
        # log progress roughly every 10% (guard against fewer than 10 files)
        if idx % max(len(all_json) // 10, 1) == 0:
            print(f'Processing index: {idx} of {len(all_json)}')

        try:
            content = FileReader(entry)
        except Exception as e:
            continue  # invalid paper format, skip

        # get metadata information
        meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
        # no metadata, skip this paper
        if len(meta_data) == 0:
            continue

        dict_['abstract'].append(content.abstract)
        dict_['paper_id'].append(content.paper_id)
        dict_['body_text'].append(content.body_text)

        # also create a column for the abstract summary, to be used in a plot
        if len(content.abstract) == 0:
            # no abstract provided
            dict_['abstract_summary'].append("Not provided.")
        elif len(content.abstract.split(' ')) > 100:
            # abstract is too long for the plot; take the first 100 words
            # and append "..."
            info = content.abstract.split(' ')[:100]
            summary = self.get_breaks(' '.join(info), 40)
            dict_['abstract_summary'].append(summary + "...")
        else:
            # abstract is short enough
            summary = self.get_breaks(content.abstract, 40)
            dict_['abstract_summary'].append(summary)

        try:
            authors = meta_data['authors'].values[0].split(';')
            if len(authors) > 2:
                # more than 2 authors: keep them all, with HTML line
                # breaks in between
                dict_['authors'].append(self.get_breaks('. '.join(authors), 40))
            else:
                # authors will fit in the plot
                dict_['authors'].append(". ".join(authors))
        except Exception as e:
            # only one author, or a null value
            dict_['authors'].append(meta_data['authors'].values[0])

        # add the title information, adding breaks when needed
        try:
            title = self.get_breaks(meta_data['title'].values[0], 40)
            dict_['title'].append(title)
        except Exception as e:
            # title was not provided
            dict_['title'].append(meta_data['title'].values[0])

        # add the journal information
        dict_['journal'].append(meta_data['journal'].values[0])

        # add the doi
        dict_['doi'].append(meta_data['doi'].values[0])

    df_covid = pd.DataFrame(dict_, columns=['paper_id', 'doi', 'abstract',
                                            'body_text', 'authors', 'title',
                                            'journal', 'abstract_summary'])
    df_covid['abstract_word_count'] = df_covid['abstract'].apply(
        lambda x: len(x.strip().split()))  # word count in abstract
    df_covid['body_word_count'] = df_covid['body_text'].apply(
        lambda x: len(x.strip().split()))  # word count in body
    df_covid['body_unique_words'] = df_covid['body_text'].apply(
        lambda x: len(set(str(x).split())))  # unique words in body

    # remove duplicates and rows with missing values
    df_covid.drop_duplicates(['abstract', 'body_text'], inplace=True)
    df_covid.dropna(inplace=True)

    # handle multiple languages: make langdetect deterministic
    DetectorFactory.seed = 0

    # language label per document
    languages = []

    for ii in tqdm(range(0, len(df_covid))):
        # split the body by spaces and detect on the first 50 tokens
        text = df_covid.iloc[ii]['body_text'].split(" ")
        lang = "en"
        try:
            if len(text) > 50:
                lang = detect(" ".join(text[:50]))
            elif len(text) > 0:
                lang = detect(" ".join(text))
        # the beginning of the document was not in a usable format
        except Exception as e:
            all_words = set(text)
            try:
                lang = detect(" ".join(all_words))
            # still failing; try the abstract summary instead
            except Exception as e:
                try:
                    lang = detect(df_covid.iloc[ii]['abstract_summary'])
                except Exception as e:
                    lang = "unknown"

        languages.append(lang)

    languages_dict = {}
    for lang in set(languages):
        languages_dict[lang] = languages.count(lang)

    df_covid['language'] = languages
    # keep English documents only
    df_covid = df_covid[df_covid['language'] == 'en']

    # change to Spark: enable Arrow-based columnar data transfers
    spark = SparkSession \
        .builder \
        .appName("PySparkKMeans") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    spark.conf.set("spark.sql.execution.arrow.enabled", "true")

    # Create a Spark DataFrame from a pandas DataFrame using Arrow
    df_english = spark.createDataFrame(df_covid)

    clean_text_df = df_english.withColumn("text", self.clean_text(col("body_text")))
    tokenizer = Tokenizer(inputCol="text", outputCol="vector")
    vector_df = tokenizer.transform(clean_text_df)

    # remove stop words: start from spaCy's STOP_WORDS and extend it
    stopwords = list(STOP_WORDS)
    custom_stop_words = [
        'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org',
        'https', 'et', 'al', 'author', 'figure', 'rights', 'reserved',
        'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license',
        'fig', 'fig.', 'al.', 'elsevier', 'pmc', 'czi', 'www',
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
        "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"
    ]
    for w in custom_stop_words:
        if w not in stopwords:
            stopwords.append(w)

    # Use the combined stop-word list
    remover = StopWordsRemover(stopWords=stopwords)
    remover.setInputCol("vector")
    remover.setOutputCol("vector_no_stopw")
    vector_no_stopw_df = remover.transform(vector_df)

    # TF-IDF, using the RDD-based pyspark.mllib API
    # (HashingTF / IDF / RowMatrix)
    hashingTF = HashingTF()
    # mllib's HashingTF.transform expects an RDD of term lists, not a DataFrame
    tf = hashingTF.transform(
        vector_no_stopw_df.select("vector_no_stopw").rdd.map(lambda row: row[0]))
    # Applying HashingTF needs only a single pass over the data, but IDF
    # needs two: one to compute the IDF vector and one to scale the term
    # frequencies by IDF, so cache tf between them.
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)

    # PCA
    mat = RowMatrix(tfidf)

    # Compute the top 1325 principal components
    # (stored in a local dense matrix)
    pc = mat.computePrincipalComponents(1325)

    # Project the rows onto the space spanned by those components
    projected = mat.multiply(pc)

    # RowMatrix has no toPandas(); collect the projected rows to the driver
    # and save via pandas (feasible only if the projection fits in memory)
    projected_pdf = pd.DataFrame(
        projected.rows.map(lambda v: v.toArray()).collect())
    projected_pdf.to_csv(f"{self.DEFAULT_OUTPUT_FILE}")
    return projected
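# --- A toy illustration of the two-pass TF-IDF noted above, using the
# RDD-based pyspark.mllib API assumed in import_data; the documents and
# feature count are illustrative, and an active SparkSession `spark` is
# assumed.
from pyspark.mllib.feature import HashingTF, IDF

sc = spark.sparkContext
docs = sc.parallelize([["spark", "tfidf", "demo"], ["spark", "again"]])
tf = HashingTF(numFeatures=1 << 10).transform(docs)  # pass 1: term frequencies
tf.cache()                       # reused: IDF fit and transform both read it
idf_model = IDF().fit(tf)        # pass 2a: compute the IDF vector
tfidf = idf_model.transform(tf)  # pass 2b: scale term frequencies by IDF
print(tfidf.take(2))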