def anon_firstlastname_spark(df, blacklist, firstnames, lastnames, *columns_to_anon):
    from pyspark.sql.functions import udf, col
    from pyspark.sql.types import StringType  # StringType lives in pyspark.sql.types, not functions

    sc = df._sc  # SparkContext attached to the DataFrame (private attribute)
    blacklist = set(map(str.lower, blacklist))
    firstnames = set(map(str.lower, firstnames))
    lastnames = set(map(str.lower, lastnames))

    # Broadcast the lookup sets so each executor holds one read-only copy.
    blacklist_bc = sc.broadcast(blacklist)
    firstnames_bc = sc.broadcast(firstnames)
    lastnames_bc = sc.broadcast(lastnames)
    firstlastnames_bc = sc.broadcast(firstnames | lastnames)

    anonymizer = AnonFirstLastName(blacklist_bc, firstnames_bc, lastnames_bc,
                                   firstlastnames_bc)
    anonymizer_udf = udf(anonymizer.__call__, StringType())

    result = df
    for colname in columns_to_anon:
        print("Anonymizing column {}".format(colname))
        result = result.withColumn(colname + "_anonymized",
                                   anonymizer_udf(col(colname))).drop(colname)
    return result
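# A minimal usage sketch for the function above. AnonFirstLastName is assumed
# to be the anonymizer class defined elsewhere in this module; the word lists
# and the DataFrame people_df with free-text columns "notes" and "bio" are
# illustrative, not from the original code.
blacklist = ["acme", "initech"]
firstnames = ["alice", "bob"]
lastnames = ["smith", "jones"]
anonymized = anon_firstlastname_spark(people_df, blacklist, firstnames,
                                      lastnames, "notes", "bio")
anonymized.select("notes_anonymized", "bio_anonymized").show(5)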
def map_tag(momo2tag):
    def mapping(col):
        return momo2tag.get(col, '')
    return udf(mapping, StringType())
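# Hedged usage sketch for this UDF factory (the same pattern as the
# map_tag(app2tag) variant further down); the dict and column names are
# illustrative. The dict is captured in the UDF's closure and serialized to
# the executors with it, so it should stay small; broadcast large tables.
momo2tag = {"momo_123": "games", "momo_456": "music"}
tagged_df = df.withColumn("tag", map_tag(momo2tag)(df["momo_id"]))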
def translate(tag):
    def translate_(col):
        data = json.loads(col)
        return data.get(tag)
    return udf(translate_, StringType())
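# Illustrative use of the factory above: extract a single key from a JSON
# string column. The column name "payload" and the key "city" are assumed;
# a payload of '{"city": "Boston", "zip": "02134"}' would yield "Boston".
df_with_city = df.withColumn("city", translate("city")(df["payload"]))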
# instantiate SparkContext for spark-submit
conf = SparkConf().setAppName("transforming effective care and readmissions")
sc = SparkContext(conf=conf)

# read Hive tables
hive_context = HiveContext(sc)
measure_effective_care = hive_context.table("default.measure_effective_care")

# select the columns of interest
measure_effective_care.registerTempTable("effective_temp")
measure_effective_selected = hive_context.sql(
    "SELECT provider_id, measure_id, score, sample, measure_start, measure_end "
    "FROM effective_temp")

# add a column to flag the data as effective care
measure_effective_with_care_type = measure_effective_selected.withColumn(
    'care_type', lit("effective").cast(StringType()))

# add an empty column for readmission denominators
measure_effective_with_care_type_denominator = measure_effective_with_care_type.withColumn(
    'denominator', lit(None).cast(StringType()))

# read in readmission data
measure_readmission = hive_context.table("default.measure_readmission")
measure_readmission.registerTempTable("readmission_temp")
measure_readmission_selected = hive_context.sql(
    "SELECT provider_id, measure_id, denominator, score, measure_start, measure_end "
    "FROM readmission_temp")

# prepare readmissions for a union with effective measures: add an empty column for sample
measure_readmission_with_sample = measure_readmission_selected.withColumn(
    'sample', lit(None).cast(StringType()))
import spark_functions as sf
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

if __name__ == '__main__':
    table_to_read = 'unified_minute_view'
    table_to_write = 'unified_hour'

    sf.initialize_environment()
    new_sql_context = sf.create_sql_context('spark_minute_to_hour')

    # Aggregate the minute-level view into hourly rows.
    udf_aggregator = udf(sf.minute_aggregator, StringType())
    hour_table = sf.spark_unified_aggregator(
        udf_aggregator, sf.load_and_get_table_df(new_sql_context, table_to_read))
    sf.write_df_to_table(hour_table, table_to_write)
"via","while","posrr","scorr","thankyou","across","alone","another","becoming","bottom","due","ever","formerly","hereafter", "is","moreover","of","please","so","then","thus","w","whither","posrt","scodi","along","any","been","but","during","every", "forty","hereby","it","most","often","put","some","thence","tl""was","who","posdi","scodu","after","already","anyhow","before", "by","each","everyone","fosm","herein","its","mostly","once","rather","somehow","there","tm","we","whoever","posdu","scord","afterwards", "also","anyone","beforehand"]) cleaned=' '.join([w for w in sentence.split() if not w in stops]) cleaned=' '.join([w for w in cleaned.split() if not len(w)<2 ]) cleaned=cleaned if(len(cleaned)<=1): return "NA" else: return cleaned org_val=udf(cleaning_text,StringType()) data=data.withColumn("cleaned",org_val(data.summary)) data = data.filter(data["cleaned"]!= "NA") tokenizer = Tokenizer(inputCol="cleaned", outputCol="words") wordsData = tokenizer.transform(data) hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures") featurizedData = hashingTF.transform(wordsData) idf = IDF(inputCol="rawFeatures", outputCol="features") idfModel = idf.fit(featurizedData) rescaledData = idfModel.transform(featurizedData) # assembler=VectorAssembler(inputCols=['op1','op2','op3','pr1','features'],outputCol='FeaturesFinal') # assembled=assembler.transform(rescaledData) ADS=rescaledData.select('label','features')
def transform(df):
    """
    Transform the dataframe into an acceptable form.

    Args:
        df: raw dataframe
    Returns:
        df: processed dataframe
    """
    # Remove the user field.
    df_flatten_one_level = df.select('message.*', 'kafka_consume_ts')
    df_flatten_one_level = df_flatten_one_level.drop('user')

    # If retweeted_status and quoted_status are present in the JSON objects,
    # pull them out and add them to the dataframe as new rows.
    df_retweeted_status = df_flatten_one_level \
        .where(df_flatten_one_level.retweeted_status.isNotNull()) \
        .select('retweeted_status.*') \
        .drop('user') \
        .withColumn('keyword', lit(None).cast(StringType())) \
        .withColumn('kafka_consume_ts', lit(None).cast(StringType()))
    df_quoted_status = df_flatten_one_level \
        .where(df_flatten_one_level.quoted_status.isNotNull()) \
        .select('quoted_status.*') \
        .drop('user') \
        .withColumn('keyword', lit(None).cast(StringType())) \
        .withColumn('kafka_consume_ts', lit(None).cast(StringType()))
    df_flatten_one_level = df_flatten_one_level.drop("retweeted_status", "quoted_status")

    common_unnested_columns = [
        'created_at', 'favorite_count', 'favorited', 'id', 'id_str',
        'in_reply_to_screen_name', 'in_reply_to_status_id',
        'in_reply_to_status_id_str', 'in_reply_to_user_id',
        'in_reply_to_user_id_str', 'is_quote_status', 'keyword', 'lang',
        'possibly_sensitive', 'quoted_status_id', 'quoted_status_id_str',
        'retweet_count', 'retweeted', 'source', 'text', 'truncated',
        'kafka_consume_ts'
    ]
    df_flatten_one_level = df_flatten_one_level.select(common_unnested_columns) \
        .unionAll(df_retweeted_status.select(common_unnested_columns)) \
        .unionAll(df_quoted_status.select(common_unnested_columns))

    # Remove duplicate tweets.
    df_without_duplicate = df_flatten_one_level.dropDuplicates(subset=['id'])

    # Trim surrounding whitespace from string fields.
    for item in df_without_duplicate.dtypes:
        if item[1].startswith('string'):
            df_without_duplicate = df_without_duplicate.withColumn(
                item[0], trim(df_without_duplicate[item[0]]))

    # Convert the created_at field to a date string in (year-month-day) format.
    split_col = split(df_without_duplicate['created_at'], ' ')
    df_with_date = df_without_duplicate.withColumn(
        'created_at_date',
        concat_ws('-', split_col.getItem(5), split_col.getItem(1),
                  split_col.getItem(2)))
    df_final = df_with_date.withColumn(
        "created_at_date",
        from_unixtime(
            unix_timestamp(df_with_date.created_at_date, 'yyyy-MMM-dd'),
            'yyyy-MM-dd'))
    return df_final
def translate():
    def translate_(col):
        # hashlib expects bytes, so encode the string before hashing.
        hash_md5 = hashlib.md5(col.encode('utf-8'))
        return hash_md5.hexdigest()
    return udf(translate_, StringType())
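# Quick illustration (the column name "email" is assumed): the returned UDF
# replaces plaintext values with their MD5 hex digests.
hashed_df = df.withColumn("email_md5", translate()(df["email"]))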
def __init__(self, sc, data_path):
    self.sc = sc
    self.data_path = data_path
    #sc = SparkContext.getOrCreate()
    self.sqlContext = SQLContext(sc)

    # TODO: include the S3 repository here
    bigT = sc.textFile(data_path, 2)
    bigTT = bigT.map(logs2)
    rows = bigTT.map(lambda x: Row(visitorID=x[0], url=x[1], action=x[2],
                                   pais=x[3], provincia=x[4], time=x[5]))
    dailyMaster = self.sqlContext.createDataFrame(rows)
    ndf2 = dailyMaster.withColumn('_1', dailyMaster['time'].cast(DateType()))

    # Recover each component of the URL.
    def url2(x):
        try:
            return x.split('//')[1]
        except IndexError:
            return "0"
    udf2 = udf(url2, StringType())

    def hosta(x):
        try:
            return x.split('/')[0]
        except IndexError:
            return "0"
    udf3 = udf(hosta, StringType())

    def path(x, n):
        try:
            return x.split('/')[n]
        except IndexError:
            return "0"
    udf4 = udf(lambda x: path(x, 1), StringType())
    udf5 = udf(lambda x: path(x, 2), StringType())
    udf6 = udf(lambda x: path(x, 3), StringType())  # currently unused
    udf7 = udf(lambda x: path(x, 4), StringType())  # currently unused

    ndf_url = ndf2.withColumn('urlClean', udf2(ndf2.url))
    ndf_host = ndf_url.withColumn('host', udf3(ndf_url.urlClean))
    ndf_path1 = ndf_host.withColumn('path1', udf4(ndf_host.urlClean))
    self.ndf5 = ndf_path1.withColumn('path2', udf5(ndf_path1.urlClean))
import spark_functions as sf
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

if __name__ == '__main__':
    table_to_write = 'unified_minute'
    value_name = 'follower_count'
    value_alias = 'follower_count'

    sf.initialize_environment()
    new_sql_context = sf.create_sql_context('spark_live_to_minute')
    udf_aggregator = udf(sf.second_aggregator, StringType())

    # Get live tables and aggregate them into minutes.
    youtube_minute = sf.spark_live_aggregator(
        udf_aggregator,
        sf.load_and_get_table_df(new_sql_context, 'youtube_live_view'),
        value_name, value_alias)
    twitch_minute = sf.spark_live_aggregator(
        udf_aggregator,
        sf.load_and_get_table_df(new_sql_context, 'twitch_live_view'),
        value_name, value_alias)
    twitter_minute = sf.spark_live_aggregator(
        udf_aggregator,
        sf.load_and_get_table_df(new_sql_context, 'twitter_live_view'),
        value_name, value_alias)

    joined_minute_tables = sf.join_3_tables_by_streamer_and_timestamp(
        youtube_minute, twitch_minute, twitter_minute)
    sf.write_df_to_table(
        joined_minute_tables.withColumn(
            'total_count',
            col('youtube_count') + col('twitch_count') + col('twitter_count')),
        table_to_write)
# Run the function over the entire data set.
all_genres = genre_counts.collect()
movies_add_genres = ratings_with_titles
for genre in all_genres:
    movies_add_genres = process_genres(movies_add_genres, genre[0])
print("--Genres function applied to all movies--")
movies_add_genres.show(20, False)

# Extract the year from the movie title.
def extract_year(title_and_year):
    if "(" in str(title_and_year):
        return str(title_and_year).split("(")[-1].replace(")", "")
    else:
        return None

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

year_udf = udf(extract_year, StringType())
movies_with_year = movies_add_genres.withColumn(
    "year", year_udf(movies_add_genres["title"]))
print("--Movie years--")
movies_with_year.show(20, False)

movies_pandas = movies_with_year.toPandas()
movies_pandas.to_csv("movies_pandas.csv")
import spark_functions as sf
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

if __name__ == '__main__':
    table_to_read = 'unified_hour_view'
    table_to_write = 'unified_day'

    sf.initialize_environment()
    new_sql_context = sf.create_sql_context('spark_hour_to_day')
    new_sql_context.addPyFile('./spark_functions.py')

    # Aggregate the hourly view into daily rows.
    udf_aggregator = udf(sf.hour_aggregator, StringType())
    day_table = sf.spark_unified_aggregator(
        udf_aggregator, sf.load_and_get_table_df(new_sql_context, table_to_read))
    sf.write_df_to_table(day_table, table_to_write)
"%s (%d incidents)" % (mostOccurrencesPremise["PREMISE TYPE"], mostOccurrencesPremise["Total"])))) showCrime = CrimeOccurrencesDF.orderBy(col("Total").desc()).show() # ### 4.3 Location Columns Analysis # In[7]: ## Address Columns Anaylsis: from IPython.display import display, Markdown from pyspark.sql.functions import StringType ## Casting ZIP Column into a string. crime_DF.withColumn("ZIP", col("ZIP").cast(StringType())).printSchema() print("Checking for nulls on columns 100 BLOCK ADDR, ZIP:") crime_DF.select([count(when(col(c).isNull(), c)).alias(c) for c in ["ZIP","100 BLOCK ADDR"]]).show() print("Checking amount of distinct values in columns 100 BLOCK ADDR, ZIP:") crime_DF.select([countDistinct(c).alias(c) for c in ["ZIP","100 BLOCK ADDR"]]).show() print ("Most and least frequent occurrences for ZIP and 100 BLOCK ADDR:") ZIPOccurrencesDF = crime_DF.groupBy("ZIP").agg(count(lit(1)).alias("Total")) ADDRDF = crime_DF.groupBy("100 BLOCK ADDR").agg(count(lit(1)).alias("Total")) print("ZIP codes with the LEAST amount of crimes:") leastOccurrencesZIP = ZIPOccurrencesDF.orderBy(col("Total").asc()).show(10) print("ZIP codes with the MOST amount of crimes:") mostOccurrencesZIP = ZIPOccurrencesDF.orderBy(col("Total").desc()).show(10)
data = spark.read.format('com.databricks.spark.csv').option(
    'header', 'true').load('data0201.csv')

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Left-pad month/day/hour to two digits and add a constant year column.
def zerofill(x):
    return x.zfill(2)

strzfill = udf(zerofill, StringType())
data_0 = data.withColumn('month', strzfill(data['month']))\
    .withColumn('day', strzfill(data['day']))\
    .withColumn('hour', strzfill(data['hour']))\
    .withColumn('year', lit('2019'))

# Combine the date parts into a single timestamp column.
from pyspark.sql.functions import to_timestamp

data_1 = data_0.withColumn(
    'dptime0', concat_ws('-', data_0['year'], data_0['month'], data_0['day']))
data_2 = data_1.withColumn('dptime1',
                           concat_ws(' ', data_1['dptime0'], data_1['hour']))
data_2 = data_2.withColumn('tail', lit('00:00'))
# Append ":00:00" to the full date-hour string so to_timestamp can parse it.
data_3 = data_2.withColumn('dptime2',
                           concat_ws(':', data_2['dptime1'], data_2['tail']))
data_use = data_3.withColumn('dptime', to_timestamp(data_3['dptime2']))\
    .drop('dptime0', 'dptime1', 'dptime2')

# Join with passenger data.
data_use.createOrReplaceTempView('datas')
path_distc = spark.sql(
def map_tag(app2tag):
    def mapping(col):
        return app2tag.get(col)
    return udf(mapping, StringType())
"County": County, "Address": Address, "GPSLatitude": GPSLatitude, "GPSLongitude": GPSLongitude, "PlaceID": PlaceID, "MapURL": MapURL, "PartialMatch": PartialMatch, "LocationType": LocationType, "FormattedAddress": FormattedAddress, "ErrorMessage": ErrorMessage, "jsonResults": jsonResults } return json.dumps(returnJson) geocodeUDF = udf(lambda z: geocodeRequest(z), StringType()) def truncLatLng(latLng): if latLng == None: return latLng else: numInt = int(latLng) numLen = len(str(numInt)) if latLng >= 0: res = round(latLng, 6 - numLen) else: res = round(latLng, 7 - numLen) return str(res).rstrip('0').rstrip('.')
data_dir = sc.wholeTextFiles(",".join(li))
data_dir = data_dir.toDF()
data_dir = data_dir.dropDuplicates()
if index == 0:
    data = data_dir
else:
    data = data.union(data_dir)

data = spark.read.format('parquet').options(
    headers=True).load('Downloads/enron_correct')
data = data.withColumnRenamed("_1", "location")
data = data.withColumnRenamed("_2", "message")
data = data.withColumn("user", split(col("location"), "/").getItem(5))
data = data.withColumn("message", regexp_replace("message", "N", "_n"))

# Parse the To: header out of each raw email message.
def ToEmailParse(s):
    msg = email.message_from_string(r'{}'.format(s))
    return msg['to']

ToEmailParses = udf(lambda z: ToEmailParse(z), StringType())
spark.udf.register("ToEmailParse", ToEmailParses)
data = data.withColumn('to', ToEmailParses('message'))

# Parse the From: header the same way.
FromEmailParses = udf(lambda z: FromEmailParse(z), StringType())
spark.udf.register("FromEmailParse", FromEmailParses)

def FromEmailParse(s):