def get_column_spec(self, source_df: Optional[DataFrame], current_column: Optional[Column]) -> Column:
    column_spec = array_join(
        self.column.get_column_spec(source_df=source_df, current_column=current_column),
        self.delimiter,
    )
    return column_spec
def process_item_sequence(rating_data):
    # Keep only positive ratings, then collect each user's movies in watch order
    # and also materialize the sequence as a space-delimited string.
    sequences = rating_data.filter(F.col("rating") >= 3.5).groupBy("userId") \
        .agg(udf_combine_movies_by_timeline(
            F.collect_list("movieId"), F.collect_list("timestamp")).alias("movieIds")) \
        .withColumn("movieIdStr", F.array_join(F.col("movieIds"), " "))
    print_info(sequences, message="after build movieIdStr: ")
    sequences = sequences.select("movieIds")
    return sequences
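# udf_combine_movies_by_timeline is referenced above but not defined in this
# snippet. A minimal sketch of what it presumably does (order each user's
# movie ids chronologically by timestamp); the string element type is an
# assumption, not the original implementation.
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType

@F.udf(returnType=ArrayType(StringType()))
def udf_combine_movies_by_timeline(movie_ids, timestamps):
    # Pair each movie with its timestamp, sort chronologically,
    # and return the ordered list of movie ids.
    pairs = sorted(zip(movie_ids, timestamps), key=lambda p: p[1])
    return [movie_id for movie_id, _ in pairs]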
def transform_abstracts_words(dataframe):
    udf_function_clean = udf(generate_cleaned_abstracts, StringType())
    udf_function_sentiment = udf(generate_sentiment, DoubleType())
    df_abstracts = (
        dataframe
        # Explode the abstract array, keeping each element's position.
        .select("paper_id", func.posexplode("abstract").alias("pos", "value"))
        .select("paper_id", "pos", "value.text")
        # Rebuild the sentences in their original order.
        .withColumn(
            "ordered_text",
            func.collect_list("text").over(
                Window.partitionBy("paper_id").orderBy("pos")))
        .groupBy("paper_id")
        .agg(func.max("ordered_text").alias("sentences"))
        .select("paper_id", func.array_join("sentences", " ").alias("abstract"))
        .withColumn("words", func.size(func.split("abstract", r"\s+"))))
    df_abstracts = df_abstracts.withColumn("clean_abstract",
                                           udf_function_clean("abstract"))
    df_abstracts = df_abstracts.withColumn("sentiment_abstract",
                                           udf_function_sentiment("clean_abstract"))
    return df_abstracts
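# generate_cleaned_abstracts and generate_sentiment are assumed helpers that
# are not included above. A minimal sketch, assuming TextBlob for polarity;
# the cleaning rules here are illustrative, not the original implementation.
import re
from textblob import TextBlob

def generate_cleaned_abstracts(text):
    # Lowercase and strip everything except letters and whitespace.
    return re.sub(r"[^a-z\s]", "", text.lower()) if text else ""

def generate_sentiment(text):
    # Polarity in [-1.0, 1.0]; 0.0 for empty input.
    return float(TextBlob(text).sentiment.polarity) if text else 0.0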
def viveknSentimentAnalysis(self, infoData):
    dataset = infoData.get(pc.DATASET)
    labelCol = infoData.get(pc.LABELCOLM)
    dataset = self.changeSentimentVal(dataset, labelCol)
    (trainDataset, testDataset) = dataset.randomSplit([0.80, 0.20], seed=0)
    viveknSentiment = ViveknSentimentApproach() \
        .setInputCols(["document", pc.DMXSTOPWORDS]) \
        .setOutputCol("viveknSentiment") \
        .setSentimentCol("original_sentiment")
    viveknSentimentModel = viveknSentiment.fit(trainDataset)
    testDatasetPrediction = viveknSentimentModel.transform(testDataset)

    # Store the model for future use when predicting sentiment; the path
    # mapping keeps the list of all trained models and pretrained pipelines
    # available for sentiment prediction.
    storagePath = infoData.get(pc.STORAGELOCATION)
    modelName = "testViveknSentiment"  # sahil - temporary only
    modelPath = storagePath + modelName
    viveknSentimentModel.write().overwrite().save(modelPath)
    infoData.get(pc.SPARKNLPPATHMAPPING).update({pc.SENTIMENTMODEL: modelPath})

    # Convert the annotation column back to a plain string.
    testDatasetPrediction = testDatasetPrediction.withColumn(
        "viveknSentiment", array_join("viveknSentiment.result", ""))
    infoData.update({pc.TESTDATA: testDatasetPrediction})
    # Both the original and predicted sentiment columns need indexing for evaluation.
    infoData = self.evaluation(self.stringToIndex(infoData))
    """
    --> First check whether the indexing matches the label/original sentiment;
    if not, swap the indices with the method below:
    finalDatasetTest = finalDatasetTest.withColumn(
        "finalDataset_indexed",
        when(finalDatasetTest["finalDataset_indexed"] == 0.0, 1.0)
        .when(finalDatasetTest["finalDataset_indexed"] == 1.0, 0.0))
    """
    return infoData
def main(input_dir, output_dir):
    # main logic starts here
    df_schema = types.StructType([
        types.StructField('title_clean', types.StringType()),
        types.StructField('title', types.StringType()),
        types.StructField('created_utc_iso', types.DateType()),
        types.StructField('polarity_subjectivity', types.ArrayType(types.FloatType()))
    ])
    headlines_df = spark.read.json(input_dir, encoding='utf-8',
                                   schema=df_schema).repartition(80)
    # element_at is 1-based: index 1 is polarity, index 2 is subjectivity.
    split_sentiment_df = headlines_df.withColumn(
        'polarity', functions.element_at(headlines_df['polarity_subjectivity'], 1)
    ).withColumn(
        'subjectivity', functions.element_at(headlines_df['polarity_subjectivity'], 2)
    ).cache()

    for year_int in range(2008, 2020):
        print('Plotting for ' + str(year_int))
        headlines_year = split_sentiment_df.where(
            functions.year(split_sentiment_df['created_utc_iso']) == year_int
        ).withColumn('year', functions.year(split_sentiment_df['created_utc_iso']))
        headlines_grouped = headlines_year.groupBy(headlines_year['year']).agg(
            functions.collect_set(headlines_year['title_clean']).alias('titles_group')
        )
        headlines_joined = headlines_grouped.select(
            functions.array_join(headlines_grouped['titles_group'], ' ').alias('joined')
        )
        # Only one row remains: the concatenated headlines for the year.
        string_to_plot = headlines_joined.collect()[0]['joined']
        wordcloud = WordCloud(background_color='white', stopwords=stopwords,
                              width=1000, height=500).generate(string_to_plot)
        wordcloud.to_file(output_dir + '/' + str(year_int) + '_words.png')
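# The stopwords object used above is not defined in this snippet. A plausible
# setup, assuming the wordcloud package's built-in list (an assumption, not
# the original source):
from wordcloud import STOPWORDS

stopwords = set(STOPWORDS)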
def performSentiment(self, infoData):
    dataset = infoData.get(pc.DATASET)
    sentimentModelPath = (infoData.get(pc.SPARKNLPPATHMAPPING)).get(pc.SENTIMENTMODEL)
    viveknSentimentModel = ViveknSentimentModel.load(sentimentModelPath)
    dataset = viveknSentimentModel.transform(dataset)
    dataset = dataset.withColumn("viveknSentiment",
                                 array_join("viveknSentiment.result", ""))
    infoData.update({pc.DATASET: dataset})
    return infoData
def column_revalue(vcf):
    # The INFO value still needs to be fixed.
    name_list = ["ID", "REF", "ALT", "INFO", "FORMAT"]
    for name in name_list:
        if name == "FORMAT":
            # Deduplicate and sort the per-record FORMAT keys, then rebuild
            # the colon-delimited FORMAT string with GT first.
            vcf = vcf.withColumn(
                name, F.array_sort(F.array_distinct(F.flatten(F.col(name)))))
            vcf = vcf.withColumn(
                name, F.concat(F.lit("GT:"), F.array_join(F.col(name), ":")))
        else:
            vcf = vcf.withColumn(name, F.array_max(F.col(name)))
    return vcf
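# A minimal, illustrative check of the FORMAT branch above on made-up data
# (assumes an active SparkSession named spark): an array of per-record
# FORMAT key arrays collapses to a single "GT:..." string.
df = spark.createDataFrame([([["DP", "AD"], ["AD"]],)], ["FORMAT"])
df = df.withColumn("FORMAT", F.array_sort(F.array_distinct(F.flatten(F.col("FORMAT")))))
df = df.withColumn("FORMAT", F.concat(F.lit("GT:"), F.array_join(F.col("FORMAT"), ":")))
df.show()  # FORMAT == "GT:AD:DP"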
def vivekSentimentPretrained(self, infoData):
    applySentimentOn = infoData.get(vc.APPLY_SENTIMENT_ON)
    dataset = infoData.get(mc.DATASET)
    viveknPretrainedModelPath = infoData.get(vc.VIVEKNPRETRAINEDPATH)
    predictionCol = infoData.get(mc.PREDICTIONCOL)
    # Download the model once; afterwards load it from local storage to
    # avoid depending on the online downloader.
    viveknSentiment = ViveknSentimentModel.load(viveknPretrainedModelPath) \
        .setInputCols(["document", applySentimentOn]).setOutputCol(predictionCol)
    dataset = viveknSentiment.transform(dataset)
    dataset = dataset.withColumn(predictionCol,
                                 array_join(predictionCol + ".result", ""))
    dataset = dataset.select(mc.PA_INDEX, predictionCol)
    return dataset
def variant_SPDI(
        contig="contigName",
        start="start",
        # end="end",
        ref="referenceAllele",
        alt="alternateAllele",
        join_alternate_alleles=False):
    # If the alternate-allele column is an array, concatenate it with ','.
    if join_alternate_alleles:
        alt = f.array_join(alt, ",")
    return f.concat(contig, f.lit(":"), start, f.lit(":"), ref, f.lit(":"), alt)
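# A minimal usage sketch, assuming a Glow-style variant DataFrame named
# variants_df that carries the default column names used above (the
# DataFrame name is an assumption for illustration):
spdi_df = variants_df.select(
    variant_SPDI(join_alternate_alleles=True).alias("spdi"))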
def parquet_revalue(vcf, indel_com):
    temp = indel_com.join(vcf, ["#CHROM", "POS"], "full")
    sample_name = temp.columns[-1]
    # Forward-fill the sample column within each chromosome, ordered by position.
    sample_w = Window.partitionBy(F.col("#CHROM")).orderBy(F.col("POS")) \
        .rangeBetween(Window.unboundedPreceding, Window.currentRow)
    temp = temp.withColumn(
        sample_name,
        F.last(sample_name, ignorenulls=True).over(sample_w)
    ).withColumnRenamed("#CHROM", "CHROM")

    # Rows whose sample map keys do not match FORMAT go through the Scala UDF
    # index2dict to realign the values to the FORMAT keys.
    null_not_value = temp.filter(F.map_keys(F.col(sample_name)) != F.col("FORMAT")) \
        .selectExpr("CHROM", "POS",
                    "index2dict({}, FORMAT) as {}".format(sample_name, sample_name)) \
        .withColumn(sample_name,
                    F.concat(F.lit("./.:"), F.array_join(F.col(sample_name), ":")))
    null_value = temp.filter(F.map_keys(F.col(sample_name)) == F.col("FORMAT")).drop("FORMAT") \
        .withColumn(sample_name,
                    F.concat(F.lit("./.:"),
                             F.array_join(F.map_values(F.col(sample_name)), ":")))
    value_union = null_not_value.union(null_value).withColumnRenamed("CHROM", "#CHROM")
    return value_union
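# index2dict above is a registered Scala UDF whose source is not shown. A
# rough Python analogue of what the call site implies (realigning a sample's
# key->value map to the FORMAT key order, filling gaps with "."), assuming
# FORMAT is an array of keys; this is a guess at its semantics, not the
# original implementation.
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType

@F.udf(returnType=ArrayType(StringType()))
def index2dict_py(sample_map, format_keys):
    return [sample_map.get(key, ".") for key in format_keys]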
def get_column_spec(
    self,
    source_df: Optional[DataFrame],
    current_column: Optional[Column],
    parent_columns: Optional[List[Column]],
) -> Column:
    column_spec = array_join(
        self.column.get_column_spec(
            source_df=source_df,
            current_column=current_column,
            parent_columns=parent_columns,
        ),
        self.delimiter,
    )
    return column_spec
def process_tweet_text(df):
    """Removes punctuation and stop words from the text column.

    Args:
        df (DataFrame): A DataFrame with the column from which stop words
            need to be removed.

    Returns:
        DataFrame: The result of applying StopWordsRemover with text as the
            input column and filtered as the output column, with the filtered
            tokens rejoined into a tweet column.
    """
    df = df.withColumn('text', split(removePunctuation(df['text']), ' ').alias('text'))
    stopWordList = list(string.punctuation) \
        + ['http', 'https', 'rt', 'via', '...', '…', '’', '—', '—:', '“'] \
        + StopWordsRemover.loadDefaultStopWords('english')
    remover = StopWordsRemover(inputCol="text", outputCol="filtered",
                               stopWords=stopWordList)
    df = remover.transform(df)
    df = df.withColumn('tweet', array_join(df['filtered'], ' '))
    return df.select('date', 'tweet', 'hashtags')
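# removePunctuation is referenced above but not defined in this snippet.
# A minimal sketch of a typical implementation (an assumption, not the
# original helper):
from pyspark.sql.functions import lower, regexp_replace, trim

def removePunctuation(column):
    # Strip non-alphanumeric characters, lowercase, and trim whitespace.
    return trim(lower(regexp_replace(column, r"[^a-zA-Z0-9\s]", "")))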
def vivekSentimentPretrained(self, infoData):
    dataset = infoData.get(pc.DATASET)
    viveknPretrainedModelPath = infoData.get(pc.VIVEKNPRETRAINEDMODEL)
    predictionCol = infoData.get(pc.PREDICTIONCOLM)
    # Download the model once; afterwards load it from local storage to
    # avoid depending on the online downloader.
    viveknSentiment = ViveknSentimentModel.load(viveknPretrainedModelPath) \
        .setInputCols(["document", pc.DMXSTOPWORDS]).setOutputCol(predictionCol)
    dataset = viveknSentiment.transform(dataset)
    dataset = dataset.withColumn(predictionCol,
                                 array_join(predictionCol + ".result", ""))
    dataset = dataset.select(pc.DMXINDEX, predictionCol)
    # Map the raw labels onto the project's sentiment constants.
    dataset = dataset.withColumn(
        predictionCol,
        when(dataset[predictionCol] == "negative", pc.NEGATIVE)
        .when(dataset[predictionCol] == "positive", pc.POSITIVE)
        .otherwise(pc.NEUTRAL))
    return dataset
def test_auto_mapper_join_using_delimiter(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (123456789, "Gagan", "Chawla", ["MD", "PhD"]),
        ],
        ["npi", "first_name", "last_name", "suffix"],
    ).createOrReplaceTempView("practitioners")
    source_df: DataFrame = spark_session.table("practitioners")
    df = source_df.select("npi")
    df.createOrReplaceTempView("physicians")

    # Act
    mapper = AutoMapper(
        view="physicians",
        source_view="practitioners",
        keys=["npi"],
    ).columns(my_column=A.join_using_delimiter(A.column("suffix"), ", "))
    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")
    assert_compare_expressions(
        sql_expressions["my_column"],
        array_join(col("b.suffix"), ", ").alias("my_column"),
    )
    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()
    assert (result_df.where("npi == 123456789").select("my_column").collect()[0][0]
            == "MD, PhD")
def stringify(self, result,
              recommendation_info_item_delimiter="\001",
              recommendation_info_field_delimiter="\004",
              item_embedding_value_delimiter="\003",
              user_embedding_value_delimiter="\003"):
    import pyspark.sql.functions as F
    from pyspark.sql.functions import pandas_udf
    output_item_embeddings = self.output_item_embeddings

    @pandas_udf('string')
    def format_rec_info(rec_info):
        import pandas as pd
        output = []
        for record in rec_info:
            string = ''
            for item in record:
                if string:
                    string += recommendation_info_item_delimiter
                string += item['name']
                string += recommendation_info_field_delimiter
                string += str(item['distance'])
                if output_item_embeddings:
                    string += recommendation_info_field_delimiter
                    string += item_embedding_value_delimiter.join(
                        map(str, item['item_embedding']))
            output.append(string)
        return pd.Series(output)

    result = result.withColumn(
        self.recommendation_info_column_name,
        format_rec_info(self.recommendation_info_column_name))
    if self.output_user_embeddings:
        result = result.withColumn(
            self.user_embedding_column_name,
            F.array_join(F.col(self.user_embedding_column_name),
                         user_embedding_value_delimiter))
    return result
def sparkLemmatizer(self, dataset, colName, lemmatizedModelPath):
    # Collapse the token array into a single space-delimited string.
    dataset = dataset.select(
        "SA_index", concat_ws(",", dataset[colName]).alias(colName))
    dataset = dataset.withColumn(colName, regexp_replace(col(colName), ",", " "))
    dataset = dataset.drop('lemma', 'document')
    documentAssembler = DocumentAssembler() \
        .setInputCol(colName) \
        .setOutputCol("document")
    tokenizer = Tokenizer() \
        .setInputCols(["document"]) \
        .setOutputCol("token")
    lemmaModel = LemmatizerModel.load(lemmatizedModelPath) \
        .setInputCols(["document", "token"]).setOutputCol("lemma")
    finisher = Finisher() \
        .setInputCols(["lemma"]) \
        .setOutputCols(["ntokens"]) \
        .setOutputAsArray(True) \
        .setCleanAnnotations(True)
    lemmatizerPipeline = Pipeline(
        stages=[documentAssembler, tokenizer, lemmaModel])
    dataset = lemmatizerPipeline.fit(dataset).transform(dataset)
    dataset = dataset.withColumn("lemma_result", array_join("lemma.result", ""))
    sentimentTokenizer = RegexTokenizer(inputCol="lemma_result",
                                        outputCol="SA_lemma",
                                        toLowercase=True,
                                        pattern="\\W")
    dataset = sentimentTokenizer.transform(dataset)
    return dataset
.registerTempTable("input") #load dictionary as csv to reduce proccessing required to pivot json fields dictSchema = [ StructField('colID', StringType(), True), StructField('colValue', IntegerType(), True) ] finalStruct = StructType(fields=dictSchema) df2 = sqlContext.read.csv(path='inputs/freq_dict_mini.csv', header=True, schema=finalStruct, ignoreLeadingWhiteSpace=True, ignoreTrailingWhiteSpace=True) newDf = df2\ .withColumn("sortedID",F.array_join(F.sort_array(F.split(df2["colID"],"")),"",""))\ .withColumn("wordLen", F.length('colID')) newDf.registerTempTable("dictionary") #join on sorted characters to get unscrambled possibilities sqlContext.sql("""select puzzle_id, letters, keyPositions, colID, colValue, getKeyLetters_udf(colID,keyPositions) as keyLetters, answerLengths from input i inner join dictionary d on d.sortedID = i.sortedLetters """).registerTempTable("unscrambled") #aggregate to posibilities into list unscrambled = sqlContext.sql("""select puzzle_id,
def text_formatting(spark):
    """Extract formatting features from the text of a post

    Args:
        spark (SparkSession): used to run queries and commands

    Returns:
        DataFrame: With columns [
            (post)_Id, #codelines, #html_blocks, #headings, #referencelist,
            #quotes, #codeblocks, #themebreaks, #codespans, #references,
            #links, #inline_images, #mail_addresses, #emphasis, #strong
        ]
    """
    # Replaces formatted text that has already been processed
    FILLER = 'x'
    # Parser helper column
    COLNAME = 'processed_text'
    COL = col(COLNAME)

    # Data loading
    post_history_df = spark.read.parquet("/user/***REMOVED***/StackOverflow/PostHistory.parquet") \
        .select(['_PostId', '_Text', '_PostHistoryTypeId']) \
        .filter(col('_PostHistoryTypeId') == 2) \
        .drop('_PostHistoryTypeId')
    post_df = spark.read.parquet('/user/***REMOVED***/StackOverflow/Posts.parquet') \
        .select(['_Id', '_PostTypeId']) \
        .filter(col('_PostTypeId') == 1) \
        .drop("_PostTypeId")
    df = post_history_df.join(post_df, post_df['_Id'] == post_history_df['_PostId'])

    # Count lines and words of the formatted text
    df = df.withColumn('#lines', size(split(col('_Text'), r'\n'))) \
        .withColumn('#words', size(split(col('_Text'), r'\s+')))

    # BLOCK ELEMENTS
    # Count code lines
    df = df.withColumn(COLNAME, split(col('_Text'), regex.CODE_BLOCK_RE)) \
        .withColumn('#codelines', size(COL) - 1) \
        .withColumn('codeline_ratio', col('#codelines') / col('#lines')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count HTML blocks
    df = df.withColumn(COLNAME, split(COL, regex.HTML_BLOCK_RE)) \
        .withColumn('#html_blocks', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count headings (1/2)
    df = df.withColumn(COLNAME, split(COL, regex.SETEXT_HEADING_RE)) \
        .withColumn('#headings', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count reference list
    df = df.withColumn(COLNAME, split(COL, regex.REFERENCE_LIST_RE)) \
        .withColumn('#referencelist', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count quotes
    df = df.withColumn(COLNAME, split(COL, regex.QUOTE_RE)) \
        .withColumn('#quotes', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count headings (2/2)
    df = df.withColumn(COLNAME, split(COL, regex.HEADING_RE)) \
        .withColumn('#headings', size(COL) - 1 + col('#headings')) \
        .withColumn('heading_ratio', col('#headings') / col('#lines')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count code blocks
    df = df.withColumn(COLNAME, split(COL, regex.FENCED_CODE_RE)) \
        .withColumn('#codeblocks', size(COL) - 1) \
        .withColumn('codeblock_ratio', col('#codeblocks') / col('#lines')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count thematic breaks
    df = df.withColumn(COLNAME, split(COL, regex.THEME_BREAK_RE)) \
        .withColumn('#themebreaks', size(COL) - 1) \
        .withColumn('themebreak_ratio', col('#themebreaks') / col('#lines')) \
        .withColumn(COLNAME, array_join(COL, FILLER))

    # INLINE ELEMENTS
    # Count codespans
    df = df.withColumn(COLNAME, split(COL, regex.CODESPAN_RE)) \
        .withColumn('#codespans', size(COL) - 1) \
        .withColumn('codespan_ratio', col('#codespans') / col('#words')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Remove markdown escapes
    df = df.withColumn(COLNAME, regexp_replace(COL, regex.ESCAPE_RE, FILLER))
    # Count references (1/2)
    df = df.withColumn(COLNAME, split(COL, regex.REFERENCE_RE)) \
        .withColumn('#references', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count links (1/2)
    df = df.withColumn(COLNAME, split(COL, regex.LINK_RE)) \
        .withColumn('#links', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count inline images
    df = df.withColumn(COLNAME, split(COL, regex.INLINE_IMAGE_RE)) \
        .withColumn('#inline_images', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count references (2/2)
    df = df.withColumn(COLNAME, split(COL, regex.SHORT_REFERENCE_RE)) \
        .withColumn('#references', size(COL) - 1 + col('#references')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count links (2/2)
    df = df.withColumn(COLNAME, split(COL, regex.AUTOLINK_RE)) \
        .withColumn('#links', size(COL) - 1 + col('#links')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count mails
    df = df.withColumn(COLNAME, split(COL, regex.AUTOMAIL_RE)) \
        .withColumn('#mail_addresses', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))

    # Remove line breaks, html, stand-alone * or _
    df = df.withColumn(COLNAME, regexp_replace(COL, regex.LINE_BREAK_RE, FILLER))
    df = df.withColumn(COLNAME, regexp_replace(COL, regex.HTML_RE, FILLER))
    df = df.withColumn(COLNAME, regexp_replace(COL, regex.NOT_STRONG_RE, FILLER))

    # Count strong & emphasis
    df = df.withColumn(COLNAME, split(COL, regex.EM_STRONG_RE)) \
        .withColumn('#emphasis', size(COL) - 1) \
        .withColumn('#strong', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.STRONG_EM_RE)) \
        .withColumn('#emphasis', size(COL) - 1 + col('#emphasis')) \
        .withColumn('#strong', size(COL) - 1 + col('#strong')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.STRONG_EM3_RE)) \
        .withColumn('#emphasis', size(COL) - 1 + col('#emphasis')) \
        .withColumn('#strong', size(COL) - 1 + col('#strong')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.STRONG_RE)) \
        .withColumn('#strong', size(COL) - 1 + col('#strong')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.EMPHASIS_RE)) \
        .withColumn('#emphasis', size(COL) - 1 + col('#emphasis')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.EM_STRONG2_RE)) \
        .withColumn('#emphasis', size(COL) - 1 + col('#emphasis')) \
        .withColumn('#strong', size(COL) - 1 + col('#strong')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.STRONG2_RE)) \
        .withColumn('#strong', size(COL) - 1 + col('#strong')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.EMPHASIS2_RE)) \
        .withColumn('#emphasis', size(COL) - 1 + col('#emphasis')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn('emphasis_ratio', col('#emphasis') / col('#words')) \
        .withColumn('strong_ratio', col('#strong') / col('#words'))

    # Remove unnecessary columns, including the parser helper column
    df = df.drop('_Text', '_PostHistoryTypeId', '_PostId', '#lines', '#words', COLNAME)
    return df
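# The counting idiom above relies on split: splitting a string on a pattern
# yields one more piece than there are matches, so size(split(...)) - 1
# counts occurrences while array_join(COL, FILLER) stitches the remainder
# back together for the next pass. A self-contained illustration (assumes an
# active SparkSession named spark; the pattern is illustrative):
from pyspark.sql.functions import col, size, split

demo = spark.createDataFrame([("a `x` b `y` c",)], ["text"])
demo.select((size(split(col("text"), "`[^`]*`")) - 1).alias("n_codespans")).show()
# n_codespans == 2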
        func.col('Au.auid').cast('long'))
    # Select affiliations per author based on the most dominant affiliation
    # in the last year: keep only records from the final year...
    .withColumn(
        'yearRank',
        func.rank().over(
            Window.partitionBy('auid').orderBy(func.desc(sort_pub_year)))
    ).filter('yearRank=1')
    # ...then count the afid occurrences per author.
    .groupBy('auid', func.col('affiliation.afid').alias('affil_id')).agg(
        func.count('*').alias('aff_occurences'),
        func.first(
            func.array_join('affiliation.affiliation_organization', ", ")
        ).alias('affil_name'),
        func.first('affiliation.affiliation_tag_country').alias('cntry'),
        func.max('datesort').alias('max_datesort'),
    ).withColumn(
        'affRank',
        func.rank().over(
            Window.partitionBy('auid').orderBy(
                func.desc('aff_occurences'),
                func.desc('max_datesort'),
                func.asc('affil_id')))
    ).filter('affRank=1').drop('affRank').drop('aff_occurences')
    # Get the name from the ipr record (preferred name); otherwise default
    # to the name as printed on the paper.
    .join(
def export_vector_df(vector_df, vocab_df, output_folder):
    vector_df = vector_df.join(vocab_df, 'id').select('standard_concept_id', 'vector')
    vector_df.withColumn('vector', F.array_join(F.col('vector'), ',')) \
        .select('standard_concept_id', 'vector').repartition(1) \
        .write.option('header', 'true').mode('overwrite') \
        .csv(create_file_path(output_folder, 'embedding_csv'))
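# create_file_path is not defined in this snippet; a minimal sketch of the
# obvious implementation (an assumption, not the original helper):
import os

def create_file_path(folder, name):
    return os.path.join(folder, name)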