def main():

    # Specify test file, a csv of urls to parse
    TEST_FILE = "test_urls.csv"
    OUTPUT_FILE = "parsed_test_urls.csv"

    # Read in the CSV of test URLs and drop duplicate rows
    data = spark.read.csv(TEST_FILE, header=True).distinct()

    # Split the string on reserved url characters to get canonical url
    data = data.withColumn(
                    "parsed_url",
                    functions.split("script_url", "[\?\#\,\;]")[0]
                ).distinct()

    # Only keep urls that are actually .js files
    data = data.filter(
                data["parsed_url"].rlike("\.js$")
            ).dropDuplicates(["parsed_url"])

    # User Defined Function to convert script URL to a filename usable by ext4
    shorten_udf = functions.udf(shorten_name, returnType=types.StringType())

    # Apply the UDF over the whole list to generate a new column 'filename'
    data = data.withColumn(
                'filename',
                shorten_udf(data.parsed_url)
            )#.sort('filename')

    # Save the data to a CSV file
    data.toPandas().to_csv(OUTPUT_FILE)
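
# The UDF above wraps a shorten_name helper that is not shown in this snippet. A minimal
# sketch of such a helper follows; the hashing/truncation behavior and the ext4 255-byte
# file-name limit it guards against are assumptions, not the original implementation.
import hashlib
from urllib.parse import urlparse

def shorten_name(url):
    """Turn a script URL into a short, filesystem-safe file name (assumed behavior)."""
    if url is None:
        return None
    digest = hashlib.sha1(url.encode("utf-8")).hexdigest()[:16]
    basename = urlparse(url).path.rsplit("/", 1)[-1] or "script.js"
    # Keep a readable tail, replace unsafe characters, and cap the length well under 255 bytes
    safe = "".join(c if c.isalnum() or c in "._-" else "_" for c in basename)[:100]
    return "{}_{}".format(digest, safe)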
Example #2
def reformat_v1_0(flight, pqFolder, pqFileName):
	"""
		Read in the original v1.0 dataframe and save as a new parquet file compatible with v1.1
		@params:        
			flight        - Required  : original v1.0 data (Spark DataFrame)
			pqFolder      - Required  : folder to save the parquet files into (Str)
			pqFileName    - Required  : parquet file name (Str)
	"""
	flight2 = (flight.withColumn('stayDays', correct_stay_days_UDF(col('trip'), col('stay_days')))
					 .drop('stay_days')           
					 .withColumnRenamed('start_date', 'depDate')                 
					 .withColumn('depDate', to_date('depDate'))
					 .selectExpr('*', 'date_add(depDate, stayDays) as retDate')# this is when the return trip starts, might arrive a day later
					 .withColumnRenamed('from_city_name', 'fromCity')
					 .withColumnRenamed('to_city_name', 'toCity')                 
					 .withColumnRenamed('search_date', 'searchDate')                 
					 .withColumn('searchDate', to_date('searchDate'))
					 .withColumnRenamed('company', 'airlineName')                 
					 .withColumnRenamed('dep_time', 'departureTime')                                  
					 .withColumnRenamed('arr_time', 'arrivalTime')                                                   
					 .withColumn('duration_h', split(flight.duration,'h').getItem(0))
					 .withColumn('duration_m', F.substring_index(split(flight.duration,'h').getItem(1), 'm', 1))
	#                  .withColumn('duration', F.struct(col('duration_h'), col('duration_m')))
					 .withColumn('duration_m', (col('duration_h')*60 + col('duration_m')))
					 .drop('duration', 'duration_h', 'flight_number')
					 .withColumnRenamed('price_code', 'currencyCode')                                  
					 .withColumnRenamed('stop', 'stops')
					 .withColumn('stops', col('stops').cast('byte')) 
					 .withColumn('stop_info', split(col('stop_info'), ';'))
	#                  .withColumn('stop_duration', take_all_duration_UDF(col('stop_info')))
					 .withColumn('noOfTicketsLeft', correct_tickets_left_UDF('ticket_left'))
					 .withColumn('noOfTicketsLeft', col('noOfTicketsLeft').cast('byte')) 
					.drop('ticket_left')
				   .withColumnRenamed('table_name', 'tableName')
				   .withColumn('task_id', col('task_id').cast('long')) 
				   .withColumn('span_days', col('span_days').cast('integer')) 
					.select('price', 'version', 'searchDate', 'tableName', 'task_id', 'currencyCode', 
							'fromCity', 'toCity', 'trip', 'depDate', 'retDate',
							'stayDays', 
						   'departureTime', 'arrivalTime', 
							'airlineName',  'duration_m', 
							'flight_code', 'plane', 'stops', 'noOfTicketsLeft',
						   'airline_code', 'airline_codes',
						   'stop_info', 'span_days', 'power', 'video', 'wifi')                #'stop_duration', 
			  )

	flight2.repartition(1).write.parquet(os.path.join(pqFolder, pqFileName))
Example #3
	def parse_raw_df(self,raw_df, train = True):
		"""Convert a DataFrame consisting of rows of comma separated text into labels and feature.

		Args:
			raw_df (DataFrame with a 'text' column): DataFrame containing the raw comma separated data.

		Returns:
			DataFrame: A DataFrame with 'label' and 'feature' columns.
		"""

		temp = raw_df
		temp = temp.withColumn("features",self.parse_point_udf(temp.text))
		if train:
			temp = temp.withColumn("label",split(temp.text,",").getItem(self.LabelPos).cast(DoubleType()))
		else:
			temp = temp.withColumn("label",split(temp.text,",").getItem(self.LabelPos).cast(StringType()))
		return temp
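
# A minimal sketch of the parse_point function that self.parse_point_udf presumably wraps
# (hypothetical -- the real implementation lives elsewhere in this class). It assumes the
# label sits at LabelPos and every remaining comma-separated field is a numeric feature.
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf

def parse_point(text, label_pos):
    values = text.split(",")
    # Drop the label position and cast the remaining fields to floats
    return Vectors.dense([float(v) for i, v in enumerate(values) if i != label_pos])

# parse_point_udf = udf(lambda text: parse_point(text, label_pos=0), VectorUDT())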
Example #4
def parse_points(df):
    """Converts a DataFrame of comma separated unicode strings into a DataFrame of `LabeledPoints`.

    Args:
        df: DataFrame where each row is a comma separated unicode string. The first element in the string
            is the label and the remaining elements are the features.

    Returns:
        DataFrame: Each row is converted into a `LabeledPoint`, which consists of a label and
            features. To convert an RDD to a DataFrame, simply call toDF().
    """
    df2 = df.select(split(df.value, ',').alias('s'))
    rd1 = df2.rdd
    rd2 = rd1.map(lambda x: x[0])
    rd3 = rd2.map(lambda x: (x[0],x[1:]))   
    rd4 = rd3.map(hash_labeledpoint)
    df4 = rd4.toDF()
    return df4
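
# A minimal sketch of the hash_labeledpoint helper mapped over rd3 above (hypothetical --
# the original is not included in this snippet). It assumes each element is a
# (label, feature-strings) pair and simply casts the features to floats; the original may
# apply real feature hashing instead, as its name suggests.
from pyspark.mllib.regression import LabeledPoint

def hash_labeledpoint(pair):
    label, features = pair
    return LabeledPoint(float(label), [float(f) for f in features])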
def main():

    # Specify target directory
    config = configparser.ConfigParser()
    config.read('config.ini')

    datatop = config['DEFAULT']['datatop']
    parquet_dataset = path.join(datatop,config['DEFAULT']['parquet_dataset'])
    output_dir = path.join(datatop,config['DEFAULT']['output_dir'])

    # Read in dataset, selecting the 'script_url' column and filtering duplicates
    data = spark.read.parquet(parquet_dataset).select('script_url').distinct()

    # Split the string on reserved url characters to get canonical url
    data = data.withColumn(
                    "parsed_url",
                    functions.split("script_url", "[\?\#\,\;]")[0]
                ).distinct()

    # Only keep urls that are actually .js files
    data = data.filter(
                data["parsed_url"].rlike("\.js$")
            ).dropDuplicates(["parsed_url"])

    # User Defined Function to convert script URL to a filename usable by ext4
    shorten_udf = functions.udf(shorten_name, returnType=types.StringType())

    # Apply the UDF over the whole list to generate a new column 'filename'
    data = data.withColumn(
                'filename',
                shorten_udf(data.parsed_url)
            ).sort('filename')

    # Save the data to parquet files
    data.write.parquet(output_dir)
Example #6
        return None

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: moa-load.py <input xml> <series json> <output parquet>", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="MoA Load")
    sqlContext = SQLContext(sc)

    raw = sqlContext.read.format('com.databricks.spark.xml').options(rowTag='doc').load(sys.argv[1])

    series = sqlContext.read.json(sys.argv[2])

    convdate = udf(lambda s: normdate(s), StringType())
    mkurl = udf(lambda ser, id: 'http://ebooks.library.cornell.edu/cgi/t/text/text-idx?c=%s;idno=%s' % (ser, id))

    df = raw.select((split(col('docno'), '_')[0]).alias('moaseries'),
                    (split(col('docno'), '_')[1]).alias('id'),
                    convdate(col('date')).alias('date'),
                    regexp_replace(regexp_replace(col('text'), '&', '&amp;'),
                                   '<', '&lt;').alias('text'))\
            .withColumn('issue', col('id'))\
            .withColumn('url', mkurl(col('moaseries'), col('id')))

    df.join(series, (df.moaseries == series.moaseries) \
            & (df.date >= series.startdate) & (df.date <= series.enddate), 'left_outer')\
        .drop('moaseries').drop('startdate').drop('enddate')\
        .write.save(sys.argv[3])

    sc.stop()
from pyspark.sql import SparkSession
from pyspark.sql import functions as func

spark = SparkSession.builder.appName("WordCount").getOrCreate()

# Read each line of my book into a dataframe
inputDF = spark.read.text(
    "C:/Users/eduardo/Desktop/SparkCourse/Spark/book.txt")

# Split using a regular expression that extracts words
words = inputDF.select(
    func.explode(func.split(inputDF.value, "\\W+")).alias("word"))
words = words.filter(words.word != "")

# Normalize everything to lowercase
lowercaseWords = words.select(func.lower(words.word).alias("word"))

# Count up the occurrences of each word
wordCounts = lowercaseWords.groupBy("word").count()

# Sort by counts
wordCountsSorted = wordCounts.sort("count")

# Show the results.
wordCountsSorted.show(wordCountsSorted.count())
Example #8
# Databricks notebook source
# MAGIC %md ###### Read User Data from s3

# COMMAND ----------

filepath = "/mnt/preprocess_user/*.parquet"
userDF = spark.read.parquet(filepath)

# COMMAND ----------

# MAGIC %md 1. Convert friends column to array type
# MAGIC 2. Convert yelping_since column to unix_datetime format

# COMMAND ----------

from pyspark.sql.functions import column as col, split, unix_timestamp, from_unixtime
from pyspark.sql.types import ArrayType, StringType, TimestampType
userDF = (userDF.withColumn(
    "friends",
    split(col("friends"), ",\s*").cast(ArrayType(StringType()))).withColumn(
        "yelping_since", from_unixtime(unix_timestamp("yelping_since"))))

# COMMAND ----------

# MAGIC %md ###### write cleansed user data to s3

# COMMAND ----------

filepath = "/mnt/cleansed_user"
userDF.write.parquet(path=filepath, mode="overwrite")
        .appName("StructuredNetworkWordCountWindowed")\
        .getOrCreate()

    # Create DataFrame representing the stream of input lines from connection to host:port
    lines = spark\
        .readStream\
        .format('socket')\
        .option('host', host)\
        .option('port', port)\
        .option('includeTimestamp', 'true')\
        .load()

    # Split the lines into words, retaining timestamps
    # split() splits each line into an array, and explode() turns the array into multiple rows
    words = lines.select(
        explode(split(lines.value, ' ')).alias('word'), lines.timestamp)

    # Group the data by window and word and compute the count of each group
    windowedCounts = words.groupBy(
        window(words.timestamp, windowDuration, slideDuration),
        words.word).count().orderBy('window')

    # Start running the query that prints the windowed word counts to the console
    query = windowedCounts\
        .writeStream\
        .outputMode('complete')\
        .format('console')\
        .option('truncate', 'false')\
        .start()

    query.awaitTermination()
  .format("kafka")\
  .option("kafka.bootstrap.servers", "localhost:9092")\
  .option("subscribe", "dztopic1")\
  .load()


#######################################################################################
#
#   Data Processing / Transformation on Streaming DF
#
#######################################################################################

#events.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

#events = spark.createDataFrame([('100001|1985|20|1228|81|1328|64|N|0',),('100002|1985|25|1106|77|1354|70|H|0',)],['value'])
events2 = events.withColumn('uid', split(events['value'],'\\|')[0] )    \
        .withColumn('season', split(events['value'],'\\|')[1] )         \
        .withColumn('daynum', split(events['value'],'\\|')[2] )         \
        .withColumn('wteam', split(events['value'],'\\|')[3] )          \
        .withColumn('wscore', split(events['value'],'\\|')[4] )         \
        .withColumn('lteam', split(events['value'],'\\|')[5] )          \
        .withColumn('lscore', split(events['value'],'\\|')[6] )         \
        .withColumn('wloc', split(events['value'],'\\|')[7] )           \
        .withColumn('ot', split(events['value'],'\\|')[8] )             \
        .withColumn('score_diff', col('wscore') - col('lscore') )



#######################################################################################
#
#   Load Static DF
Example #11
print '[ INFO ] Number of Records in movies:  ' + str(movies.count())
print '[ INFO ] Number of Records in ratings: ' + str(ratings.count())
print '[ INFO ] Number of Records in links:   ' + str(links.count())
print '[ INFO ] Number of Records in tags:    ' + str(tags.count())

# Pyspark Dataframe Joins
join1 = ratings.join(movies, ratings.movieId == movies.movieId).drop(movies.movieId)
print '[ INFO ] Number of Records in join1:   ' + str(join1.count())
join1.show()

join2 = join1.join(tags, (join1.movieId == tags.movieId) & (join1.userId == tags.userId), how='leftouter').drop(movies.movieId).drop(tags.movieId).drop(tags.userId).drop(tags.timestamp)
print '[ INFO ] Number of Records in join2:   ' + str(join2.count())
join2.show()

genres = join2.select(explode(split(col('genres'), '\|'))).distinct()
genres.show()
genres = genres.collect()
genres = [genre[0].encode('utf-8') for genre in genres]

def extract_genres(genres_string):
    return [1 if genre in genres_string.split('|') else 0 for genre in genres]

udf_extract_genres = udf(extract_genres, ArrayType(IntegerType()))

#join2.select(['userid','movieid', udf_extract_genres(col('genres'))]).show()

enriched1 = join2.withColumn('genres_vector', udf_extract_genres(col('genres'))) \
                 .withColumn(genres[0].lower(), col('genres_vector')[0]) \
                 .withColumn(genres[1].lower(), col('genres_vector')[1]) \
                 .withColumn(genres[2].lower(), col('genres_vector')[2]) \
# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode
shakeWordsDF = (shakespeareDF
                <FILL IN>)

shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# ANSWER
from pyspark.sql.functions import split, size, explode
shakeWordsDF = (shakespeareDF
                .select(split('sentence', '\s+').alias('words'))
                .select(explode('words').alias('word'))
                .where(col('word') != ''))

shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------
# In[27]:

title = "data.title"
author = "data.author"
dfAuthorTitle = df.select(title, author)
dfAuthorTitle.limit(5).toPandas()

# # Try to implement the equivalent of flatMap in dataframes

# In[28]:

import pyspark.sql.functions as F

dfWordCount = df.select(F.explode(F.split(
    title,
    "\\s+")).alias("word")).groupBy("word").count().orderBy(F.desc("count"))
dfWordCount.limit(10).toPandas()

# # Use an NLP library to do Part-of-Speech Tagging

# In[29]:

from com.johnsnowlabs.nlp.pretrained.pipeline.en import BasicPipeline as bp
dfAnnotated = bp.annotate(dfAuthorTitle, "title")
dfAnnotated.printSchema()

# ## Deal with Map type to query subfields

# In[30]:
Example #14
        for r in collect_cols
    ]

    # Aggregate the context information by session_id
    context_data = context_data.groupBy('session_id').agg(*funcs_collect)
    # Step 3: join the context information onto the click data
    train_data = click_data.join(context_data, on='session_id', how='left')

    # Step 4
    # counter action type
    train_data = train_data.withColumn('action_type_map',
                                       counter('action_type_list'))
    # Step 5
    # explode the data to get each impression with its rank and each price with its rank
    train_data = train_data.withColumn(
        'impressions', F.split("impressions", "\|")).withColumn(
            'prices', F.split("prices", "\|")).withColumn(
                "tmp", zip_imp_price("impressions", 'prices')).withColumn(
                    "tmp", F.explode('tmp')).withColumn(
                        "impresssion", F.col("tmp.item0")).withColumn(
                            'impress_rank', F.col("tmp.item1")).withColumn(
                                "price", F.col("tmp.item2")).withColumn(
                                    "price_rank",
                                    F.col("tmp.item3")).drop('tmp')

    # hour
    train_data = train_data.withColumn(
        'hour', F.hour(F.from_unixtime('timestamp', "yyyy/MM/dd HH:mm:ss")))

    # impression price features
    train_data = train_data.withColumn(
Example #15
                                        "dense_rank_val",
                                        dense_rank().over(spec)).show()

# COMMAND ----------

from pyspark.sql.functions import col, substring, substring_index, instr, split, concat_ws, repeat
from pyspark.sql.types import StringType
#substring
#orders_new_col.show()
func_df = orders_new_col.select(
    'order_status',
    substring('order_status', 1, 2).alias("sub"),
    substring_index('order_status', "E", -3).alias("sub_ind")).select(
        "*",
        instr('sub_ind', 'E').alias("instr_val"),
        split('order_status', "_")[0].alias("split_val")).select(
            "*",
            concat_ws("|", "order_status", "sub").alias("concat_val"))
func_df.withColumn("repeat_val", repeat("instr_val", 3)).select(
    "*",
    concat_ws("|", *func_df.columns).alias("conc_ws")).show(truncate=False)
#orders_new_col.select(substring_index('order_status', "_", 2)).show()
#list_1 = ["col_1", "col_2"]
#df_1 = spark.createDataFrame(list_1, StringType())
#df_1.select(substring_index("value", "_", 1)).show()

# COMMAND ----------

#Date
from pyspark.sql.functions import current_timestamp, current_date, date_format, dayofyear, year, month, date_add, date_sub, datediff, add_months, months_between, next_day, last_day, date_trunc, lit
orders_new_col.select(
dfrmsNS2 = dfrms.where(sf.col("shadow_record") == 0)

dfrmsNS = dfrmsNS1.union(dfrmsNS2)

srcfilePath = "s3://" + bucket + "/" + enriched_path + vendor + "/Parquet/year=" + year + "/month=" + month + "/day=" + day +""

tgtfilePathid = "s3://" + bucket + "/" + enriched_path + vendor + "/DataModels/riskrun/"
tgtfilePathProduct1320 = "s3://" + bucket + "/" + enriched_path + vendor + "/DataModels/Product1320/"
tgtfilePathProduct1325 = "s3://" + bucket + "/" + enriched_path + vendor + "/DataModels/Product1325/"
tgtfilePathProduct1335 = "s3://" + bucket + "/" + enriched_path + vendor + "/DataModels/Product1335/"
tgtfilePathProduct3260 = "s3://" + bucket + "/" + enriched_path + vendor + "/DataModels/Product3260/"

dfparquet = sparkSession.read.format("parquet").load(srcfilePath)

df = dfparquet.withColumn("year",sf.split("createdDate","\-")[0]) \
          .withColumn("month",sf.split("createdDate","\-")[1]) \
          .withColumn("day",sf.split((sf.split((sf.split("createdDate","\-")[2]),"T")[0])," ")[0])

dfbaseData = df.select([col for col in df.columns])

dfjoin = dfbaseData.join(dfrmi,(dfbaseData.loanApplicationId == dfrmi.loan_application_id) & \
                                    (sf.unix_timestamp(dfbaseData.createdDatePT) - sf.unix_timestamp(dfrmi.date_created) >= 0),'left_outer') \
                    .join(dfrmsNS,(dfrmi.id == dfrmsNS.input_id),'left_outer') \
                    .select(dfbaseData.id \
                    ,dfbaseData.mvpApplicantId \
                    ,dfbaseData.loanApplicationId \
                    # ,dfbaseData.mvpLoanApplicationId \
                    ,sf.regexp_replace(dfbaseData.createdDate,"T"," ").cast(TimestampType()).alias("neustarTimestampUTC") \
                    ,dfbaseData.createdDatePT.alias("neustarTimestampPT") \
                    ,dfrmi.id.alias("rmiId") \
Example #17
File: PFP.py  Project: NYU-FIM/PFP
import pyspark
from pyspark.sql.functions import split
from pyspark.ml.fpm import FPGrowth
import os

if __name__ == '__main__':
    spark = pyspark.sql.SparkSession.builder\
            .master("local[*]")\
            .appName("FPGrowth")\
            .getOrCreate()
    inFile = "transData"

    # data: DataFrame
    # \s: matches unicode white spaces
    data = spark.read.text(inFile)\
            .select(split("value", "\s+")\
            .alias("items"))

    data.show(truncate=False)
    
    fp = FPGrowth(minSupport=0.2, minConfidence=0.7)
    fpm = fp.fit(data)
    fpm.freqItemsets.show(5)
    fpm.associationRules.show(5)

    #transData = data.map(lambda s : s.trim.split(' '))

    spark.stop()
Example #18
                        format('kafka').\
                        option('kafka.bootstrap.servers', 'localhost:9092').\
                        option('subscribe', 'india-tweets').\
                        option('startingOffsets', 'earliest').\
                        load()

    tweets_df = tweets_df_raw.selectExpr("cast(value AS STRING)")

    # In this transformation we are applying a filter over the source dataframe and
    # filtering out all the tweets which do not contain '#'.
    tweets_stream_filtered = tweets_df.filter(col('value').contains('#'))

    # In this transformation we are splitting each tweet into words; the transformed
    # data frame will have an array of words in each row.
    tweet_stream_words = tweets_stream_filtered.select(
        split(col('value'), ' ').alias('tweet_words'))

    # In this transformation we are exploding each row, and all the words present in the array
    # will be each inserted as a row.
    tweet_stream_exploded_words = tweet_stream_words.select(
        explode(col('tweet_words')).alias('tweet_word'))

    # In this transformation we are filtering out all the rows whose word does not
    # contain '#', so we keep only the hashtags.
    tweet_stream_hashtags = tweet_stream_exploded_words.filter(
        col('tweet_word').contains('#'))

    # In this transformation we are doing a group by on hashtag and taking the count.
    trending_hashatags_grouped = tweet_stream_hashtags.groupBy(
        lower(col('tweet_word')).alias('tweet_word')).count().sort(
            desc('count'))
Example #19
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, mean, desc

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

review = spark.read.text("review.csv")
business = spark.read.text("business.csv")

review_col = split(review['value'], '::')

#create review dataframe
review = review.withColumn('business_id', review_col.getItem(2)).withColumn(
    'star', review_col.getItem(3)).drop('value')
#calculate average based on business_id
review = review.groupBy("business_id").agg(
    mean("star").alias("avg_star")).orderBy(desc("avg_star")).limit(10)

business_col = split(business['value'], '::')
business = business.withColumn('business_id', business_col.getItem(0)).withColumn('address', business_col.getItem(1)) \
    .withColumn('categories', business_col.getItem(2)).drop('value')

#join review with business
result = business.join(
    review, business['business_id'] == review['business_id'],
    'inner').drop(review.business_id).dropDuplicates().limit(10)

# convert to rdd format for output
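# The snippet is truncated here; a minimal sketch of the step the comment describes,
# assuming a tab-separated text output (the "top_businesses" output path is hypothetical):
result.rdd.map(lambda row: "\t".join(str(v) for v in row)).saveAsTextFile("top_businesses")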
"""
(4d) Words from lines
Before we can use the wordcount() function, we have to address two issues with the format of the DataFrame:
The first issue is that we need to split each line by its spaces.
The second issue is we need to filter out empty lines or words.
Apply a transformation that will split each 'sentence' in the DataFrame by its spaces, and then transform from a DataFrame that contains lists of words into a DataFrame with each word in its own row. To accomplish these two tasks you can use the split and explode functions found in pyspark.sql.functions.
Once you have a DataFrame with one word per row you can apply the DataFrame operation where to remove the rows that contain ''.
Note that shakeWordsDF should be a DataFrame with one column named word.
"""

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode
shakeWordsDF = (shakespeareDF
                .select(
                 split('sentence', ' ') 
                .alias('sentence')
                ))
shakeWordsDF = (shakeWordsDF
                .select(
                 explode('sentence')
                .alias('word')
                ).filter("word != ''"))
shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount


"""
(4e) Count the words
We now have a DataFrame that is only words. Next, let's apply the wordCount() function to produce a list of word counts. We can view the first 20 words by using the show() action; however, we'd like to see the words in descending order of count, so we'll need to apply the orderBy DataFrame method to first sort the DataFrame that is returned from wordCount().
You'll notice that many of the words are common English words. These are called stopwords. In a later lab, we will see how to eliminate them from the results.
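
# A minimal sketch of the step described above, assuming the wordCount() helper defined
# earlier in the lab and the shakeWordsDF built in part (4d):
from pyspark.sql.functions import desc
topWordsAndCountsDF = wordCount(shakeWordsDF).orderBy(desc('count'))
topWordsAndCountsDF.show()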
Example #21
# Transform classic_mode column: t = 1, f = 0

dataset = dataset.withColumn(
    'classic_mode_index',
    F.when(F.col('classic_mode') == 't', 1.0).otherwise(0.0))

# Transform le_mode column: t = 1, f = 0

dataset = dataset.withColumn(
    'le_mode_index',
    F.when(F.col('le_mode') == 't', 1.0).otherwise(0.0))

# Keep the version number from lmp_version

dataset = dataset.withColumn('lmp_version_split',
                             F.split(F.col('lmp_version'), "-").getItem(0))

# Split nap, uap and lap out of the address column

dataset = dataset.withColumn('nap', dataset.address.substr(1, 5))
dataset = dataset.withColumn('uap', dataset.uap_lap.substr(1, 2))
dataset = dataset.withColumn('lap', dataset.uap_lap.substr(4, 11))

# StringIndexer

string_indexer_model_path = "{}/data/stringIndexerModel.bin".format(base_path)
string_indexer = PipelineModel.load(string_indexer_model_path)
dataset = string_indexer.transform(dataset)

# MinMaxScaler
        appName("Read .dat  file and write to hive"). \
        config("spark.sql.warehouse.dir", warehouseLocation). \
        config("hive.metastore.uris", "thrift://localhost:9083"). \
        enableHiveSupport(). \
        getOrCreate()
    logger = Log4j(spark)

    df = spark.read.format("text").option("encode",
                                          "utf-8").load(r"dat/movies.dat")
    df.printSchema()

    logger.info(
        ("Splitting dataframe columns with double/multiple delimiter.."))


    doubleDelimterSplitDF = df.withColumn("data", split(col("value"), "::")). \
        select(expr("data[0]").cast(IntegerType()).alias("Movie_id"),
               expr("data[1]").alias("Movie_Title"),
               expr("data[2]").alias("geners"))

    logger.info(
        "Further splitting dataframe column with single delimiter as pipe")
    furtherSplitDF = doubleDelimterSplitDF. \
        withColumn("exploded_geners", explode(split(col("geners"), "\|+"))).drop("geners")

    logger.info("Extracting value from bracket using regular expressions")
    regexpExtractDF = furtherSplitDF. \
        withColumn("year", regexp_extract(col("Movie_Title"), r"\(([^()]+)\)$", 1))

    regexpExtractDF.show(truncate=False)
Example #23
def quebra_string_tipo(df):
    return df.withColumn("tipo", F.split(df['instance_type'], "\.")[0])
# MAGIC 
# MAGIC Apply a transformation that will split each 'sentence' in the DataFrame by its spaces, and then transform from a DataFrame that contains lists of words into a DataFrame with each word in its own row.  To accomplish these two tasks you can use the `split` and `explode` functions found in [pyspark.sql.functions](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions).
# MAGIC 
# MAGIC Once you have a DataFrame with one word per row you can apply the [DataFrame operation `where`](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.where) to remove the rows that contain ''.
# MAGIC 
# MAGIC > Note that `shakeWordsDF` should be a DataFrame with one column named `word`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode
# shakeWordsDF = (shakespeareDF
#                 .select(shakespeareDF[0].alias('sentence'))
#                ) 
shakeWordsDF = (shakespeareDF
                .select(split(shakespeareDF[0], " ").alias('wordLst'))
               )
shakeWordsDF = (shakeWordsDF
                .select(explode(shakeWordsDF.wordLst).alias('word'))
                .where("word != ''")
               )

shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")
                    F.col('max_daily_temp')))
    date_format_function = udf(lambda x: datetime.strptime(x, '%Y-%m-%d'),
                               DateType())
    df = df.withColumn(
        "weather_date",
        date_format_function(date_format(col("weather_date"), "yyyy-MM-dd")))

    # applying pivot on weather concepts

    df1 = df.groupby('zip_code',
                     'weather_date').pivot('weather_concepts',
                                           ['DBT', 'DPT', 'HUM']).agg(
                                               F.first('temp_set')).orderBy(
                                                   'zip_code', 'weather_date')

    split_col = F.split(df1['DBT'], ',')
    df1 = df1.withColumn('avg_daily_temp_DBT',
                         split_col.getItem(0).cast(DoubleType()))
    df1 = df1.withColumn('min_daily_temp_DBT',
                         split_col.getItem(1).cast(DoubleType()))
    df1 = df1.withColumn('max_daily_temp_DBT',
                         split_col.getItem(2).cast(DoubleType()))

    split_col = F.split(df1['DPT'], ',')
    df1 = df1.withColumn('avg_daily_temp_DPT',
                         split_col.getItem(0).cast(DoubleType()))
    df1 = df1.withColumn('min_daily_temp_DPT',
                         split_col.getItem(1).cast(DoubleType()))
    df1 = df1.withColumn('max_daily_temp_DPT',
                         split_col.getItem(2).cast(DoubleType()))
Example #26
# MAGIC Before we can use the `wordcount()` function, we have to address two issues with the format of the DataFrame:
# MAGIC   + The first issue is that we need to split each line by its spaces.
# MAGIC   + The second issue is we need to filter out empty lines or words.
# MAGIC 
# MAGIC Apply a transformation that will split each 'sentence' in the DataFrame by its spaces, and then transform from a DataFrame that contains lists of words into a DataFrame with each word in its own row.  To accomplish these two tasks you can use the `split` and `explode` functions found in [pyspark.sql.functions](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions).
# MAGIC 
# MAGIC Once you have a DataFrame with one word per row you can apply the [DataFrame operation `where`](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.where) to remove the rows that contain ''.
# MAGIC 
# MAGIC > Note that `shakeWordsDF` should be a DataFrame with one column named `word`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode
shakeWordsDF = (
  shakespeareDF.select(explode(split(shakespeareDF.sentence, " ")))
  .where("col != ''")
  .selectExpr("col as word"))

shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------
Example #27
    StructField("Hashtags", StringType(), True),
    StructField("UserMentionNames", StringType(), True),
    StructField("UserMentionID", StringType(), True),
    StructField("Name", StringType(), True),
    StructField("Place", StringType(), True),
    StructField("Followers", IntegerType(), True),
    StructField("Friends", IntegerType(), True)
])
csvDF =( spark \
    .readStream \
    .schema(userSchema) \
    .option("delimiter", ";") \
    .option("maxFilesPerTrigger",1)\
    .csv(inputpath))
query1 = csvDF.select(explode(
    split("Hashtags",
          ",")).alias("hos")).groupBy("hos").count().orderBy('count',
                                                             ascending=False)
query1.writeStream\
 .outputMode("complete")\
 .format("console")\
 .option("numrows" ,1)\
 .queryName("counts")\
 .start()\

query2 = csvDF.select("Name", "Followers", "Friends")
aria = query2.withColumn("rate", query2.Followers / query2.Friends).select(
    "Name", "rate")
spar = aria.orderBy('rate', ascending=False)
aria.writeStream\
 .outputMode("append")\
 .format("console")\
  "STATION_NAME",
  "STATE_PROVINCE",
  "COUNTY",
  "COUNTRY",
  "CALL_SIGN",
  "LOCATION"
).show(10)
wban_with_location.count() # 1409

#
# Get the Latitude/Longitude of those stations with parsable locations
#
from pyspark.sql.functions import split
wban_with_lat_lon = wban_with_location.select(
  wban_with_location.WBAN_ID,
  (split(wban_with_location.LOCATION, ',')[0]).cast("float").alias("Latitude"),
  (split(wban_with_location.LOCATION, ',')[1]).cast("float").alias("Longitude")
)
wban_with_lat_lon.show(10)

# Count those that got a latitude and longitude
wban_with_lat_lon = wban_with_lat_lon.filter(
  wban_with_lat_lon.Longitude.isNotNull()
)
wban_with_lat_lon.count() # 391

#
# Extend the number of locations through geocoding
#

# Count the number of US WBANs
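
# The snippet is truncated here; a minimal sketch of the count described in the comment
# above, assuming the COUNTRY field stores the value 'US' (an assumption about the data):
us_wbans = wban_with_location.filter(wban_with_location.COUNTRY == 'US')
us_wbans.count()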
Example #29
  .registerTempTable("input")

#load dictionary as csv to reduce processing required to pivot json fields
dictSchema = [
    StructField('colID', StringType(), True),
    StructField('colValue', IntegerType(), True)
]
finalStruct = StructType(fields=dictSchema)
df2 = sqlContext.read.csv(path='inputs/freq_dict_mini.csv',
                          header=True,
                          schema=finalStruct,
                          ignoreLeadingWhiteSpace=True,
                          ignoreTrailingWhiteSpace=True)

newDf = df2\
  .withColumn("sortedID",F.array_join(F.sort_array(F.split(df2["colID"],"")),"",""))\
  .withColumn("wordLen", F.length('colID'))
newDf.registerTempTable("dictionary")

#join on sorted characters to get unscrambled possibilities
sqlContext.sql("""select puzzle_id,
                         letters, 
                         keyPositions, 
                         colID, 
                         colValue, 
                         getKeyLetters_udf(colID,keyPositions) as keyLetters, 
                         answerLengths 
                  from input i inner join dictionary d on d.sortedID = i.sortedLetters
""").registerTempTable("unscrambled")
#aggregate the possibilities into a list
unscrambled = sqlContext.sql("""select puzzle_id, 
Example #30
	.appName("StructuredNetworkWordCount") \
	.getOrCreate()

#("127.0.0.1", 1337)
 # Create DataFrame representing the stream of input lines from connection to localhost:9999
lines = spark\
	.readStream\
	.format('socket')\
	.option('host', '127.0.0.1')\
	.option('port', 1337)\
	.load()

# Split the lines into words
words = lines.select(
	explode(
    	split(lines.value, ' ')
	).alias('word')
)

# Generate running word count
wordCounts = words.groupBy('word').count()


# Start running the query that prints the running counts to the console
query = wordCounts\
	.writeStream\
	.outputMode('complete')\
	.format('console')\
	.start()

# Await Spark Streaming termination
def main():
    sc = SparkSession.builder.appName("SentencingAnalyzer")\
        .config("spark.driver.memory", "10G")\
        .getOrCreate()

    # main df
    cases = sc.read.json("../data/sentencingCases2.jsonl")
    df = cleanDf(cases)

    # read categorized csv
    categorizedCsv = sc.read.csv("../data/categorized.csv", header=True)
    categorizedCsv = categorizedCsv.select(
        'caseName',
        f.split(f.col("type"), " - ").alias('offenseType'), 'duration1',
        'sentenceType1')

    # create the search df
    df = extractOffenseKeywords(df)
    df.cache()
    dfSearch = sc.createDataFrame(searchData, ["term", "offenseKeywords"])

    # CLASSIFICATION OF OFFENSE
    hashingTF = HashingTF(inputCol="offenseKeywords",
                          outputCol="rawFeatures",
                          numFeatures=1000)
    result = hashingTF.transform(df)
    resultSearch = hashingTF.transform(dfSearch)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(result)
    rescaledData = idfModel.transform(result).filter(
        f.size('offenseKeywords') > 0)
    idfModelSearch = idf.fit(resultSearch)
    rescaledDataSearch = idfModelSearch.transform(resultSearch)

    mh = MinHashLSH(inputCol="features",
                    outputCol="hashes",
                    seed=12345,
                    numHashTables=20)
    modelMH = mh.fit(rescaledData)
    transformedData = modelMH.transform(rescaledData)

    modelMHSearch = mh.fit(rescaledDataSearch)
    transformedDataSearch = modelMH.transform(rescaledDataSearch)

    categorizedDf = modelMHSearch.approxSimilarityJoin(
        transformedDataSearch,
        transformedData,
        0.89,
        distCol="JaccardDistance")
    distanceDf = categorizedDf.select([f.col('datasetA.term')] + [f.col('datasetB.caseID')] + [f.col("JaccardDistance")]) \
        .orderBy('caseID', 'JaccardDistance')
    distanceDf = distanceDf.groupBy('caseID').agg(
        f.collect_list('term').alias('predictedOffences'),
        f.collect_list('JaccardDistance').alias('JaccardDistances'))
    distanceDf.cache()
    distanceDf.show()

    # EVALUATE CATEGORIZATION AGAINST MANUAL CATEGORIZATION
    distanceDfEval = distanceDf.join(
        categorizedCsv, distanceDf.caseID == categorizedCsv.caseName)
    distanceDfEval = distanceDfEval.filter(
        distanceDfEval.offenseType[0] != "N/A").filter(
            distanceDfEval.offenseType[0] != "multiple party sentence")
    calcuateDifferenceInPredictedVsActualOffences_udf = f.udf(
        calcuateDifferenceInPredictedVsActualOffences, FloatType())
    distanceDfEval = distanceDfEval.withColumn(
        "error",
        calcuateDifferenceInPredictedVsActualOffences_udf(
            distanceDfEval.predictedOffences, distanceDfEval.offenseType))
    calcuateDifferenceInPredictedVsActualOffencesPercentage_udf = f.udf(
        calcuateDifferenceInPredictedVsActualOffencesPercentage, FloatType())
    distanceDfEval = distanceDfEval.withColumn(
        "pctCorrect",
        calcuateDifferenceInPredictedVsActualOffencesPercentage_udf(
            distanceDfEval.predictedOffences, distanceDfEval.offenseType))
    distanceDfEval.select('caseID', 'predictedOffences', 'offenseType',
                          'JaccardDistances', 'error',
                          'pctCorrect').show(200, truncate=False)
    rmse = (distanceDfEval.groupBy().agg(f.sum('error')).collect()[0][0] /
            distanceDfEval.count())**(1.0 / 2)
    print("Offense category RMSE:", rmse)
    pctCorrectOffense = (distanceDfEval.groupBy().agg(
        f.sum('pctCorrect')).collect()[0][0] / distanceDfEval.count()) * 100
    print("Percentage of offenses correctly categorized: ", pctCorrectOffense)
Example #32
			StructField("Hashtags",StringType(), True),
			StructField("usermentionnames", StringType(), True),
			StructField("usermentionid", StringType(), True), 
			StructField("name", StringType(), True),
			StructField("place", StringType(), True),
			StructField("followers",DoubleType(), True),
			StructField("friends",DoubleType() , True),])
lines=spark \
	.readStream \
	.option("sep",";") \
	.schema(schema) \
	.csv("hdfs://localhost:9000/stream")

tags=lines.select("Hashtags")
hashs = tags.select(
   explode(
       split("Hashtags", ",")
   ).alias("Hashtags")
)

count=hashs.groupby("Hashtags").count()

most_common=count.select("Hashtags","count").orderBy("count",ascending=False).limit(5)

query=most_common.writeStream.outputMode("complete").format("console").start()


query.awaitTermination(60)
query.stop()

Example #33
    def read_spark_df_from_msexchange_data_store(self, **args):
        url = args["hbase_url"]

        r = requests.get(url)

        # Converting api data in json file
        try:
            d = r.json()

        except:
            print("Invalid URL")

        # Checking for data availability
        if len(d) == 0:
            print(
                "There are no events to process. Please enter a different search criteria in the url."
            )

        # Converting API data into Spark Dataframe
        print("Reading the data from profiler...")
        spark = SparkSession.builder.appName(
            'mseapi').enableHiveSupport().getOrCreate()
        sc = spark.sparkContext
        tsRDD = sc.parallelize(d)
        df_mail = spark.read.option('multiline', "true").json(tsRDD)
        total_evt_count = df_mail.count()
        print("Total number of records: " + str(total_evt_count))

        if total_evt_count > 0:
            mail_len = f.udf(lambda s: len(s), LongType())
            mail_sum = f.udf(lambda s: sum(s), LongType())
            # mail_mean  = f.udf(lambda s: round(mean(s),4), FloatType())
            # mail_stdev = f.udf(lambda s: round(stdev(s),4), FloatType())


            df_mail_grp = df_mail.filter(f.length(f.trim(df_mail["mail_size"]))>0)\
                            .withColumn("check", f.when(f.instr(df_mail["mail_size"],',') == 1,f.substring_index(df_mail["mail_size"],',',-1)).otherwise(df_mail["mail_size"]))\
                            .withColumn("ext_sndrs", df_mail["ext_sndrs"].cast(LongType()))\
                            .withColumn("mail_size", f.regexp_replace('check', ' ', ''))\
                            .groupBy(["mail_id"]).agg(f.split(f.concat_ws(",", f.collect_list("mail_size")),',')
                                                            .cast(ArrayType(IntegerType())).alias("email_size"),
                                                    f.sum("ext_sndrs").alias("ext_sndrs"))\
                            .withColumn("no_of_emails", mail_len("email_size"))\
                            .withColumn("tot_email_size", mail_sum("email_size"))\
                            .withColumn("avg_email_size", f.round(f.col("tot_email_size")/ f.col("no_of_emails"),4))\
                            .drop("email_size")
            #.withColumn("email_size_mean", mail_mean("email_size"))\
            #.withColumn("email_size_stdev", f.when(mail_len("email_size") > 1,mail_stdev("email_size")))\

            # df_mail_grp = df_mail.filter(f.length(f.trim(df_mail["mail_size"]))>0)\
            #                 .withColumn("check", f.when(f.instr(df_mail["mail_size"],',') == 1,f.substring_index(df_mail["mail_size"],',',-1)).otherwise(df_mail["mail_size"]))\
            #                 .withColumn("ext_sndrs", df_mail["ext_sndrs"].cast(LongType()))\
            #                 .withColumn("mail_size", f.regexp_replace('check', ' ', ''))\
            #                 .groupBy(["mail_id"]).agg(f.split(f.concat_ws(",", f.collect_list("mail_size")),',')
            #                                                 .cast(ArrayType(IntegerType())).alias("email_size"),
            #                                         f.sum("ext_sndrs").alias("ext_sndrs"))\
            #                 .withColumn("no_of_emails", mail_len("email_size"))\
            #                 .withColumn("tot_email_size", mail_sum("email_size"))\
            #                 .withColumn("avg_email_size", f.round(f.col("tot_email_size")/ f.col("no_of_emails"),4))\
            #                 .drop("email_size")
            #                 #.withColumn("email_size_mean", mail_mean("email_size"))\
            #                 #.withColumn("email_size_stdev", f.when(mail_len("email_size") > 1,mail_stdev("email_size")))\

            # df_mail_grp = df_mail.withColumn("ext_sndrs", df_mail["ext_sndrs"].cast(LongType()))\
            #                     .withColumn("mail_size", f.regexp_replace('mail_size', ' ', ''))\
            #                     .groupBy(["mail_id"]).agg(f.split(f.concat_ws(",", f.collect_list("mail_size")),',')
            #                                                     .cast(ArrayType(IntegerType())).alias("email_size"),
            #                                             f.sum("ext_sndrs").alias("ext_sndrs"))\
            #                     .withColumn("no_of_emails", mail_len("email_size"))\
            #                     .withColumn("tot_email_size", mail_sum("email_size"))\
            #                     .withColumn("email_size_mean", mail_mean("email_size"))\
            #                     .withColumn("email_size_stdev", mail_stdev("email_size"))\
            #                     .drop("email_size")
            df_mail_grp.show(3)
            return df_mail_grp

        else:
            schema = StructType([])
            sqlContext = SQLContext(sc)
            sdf = sqlContext.createDataFrame(sc.emptyRDD(), schema)
            return sdf
Example #34
# Extract Trip time
def time_delta(pickup_time, dropoff_time):
    pickup_time_out  = datetime.datetime.strptime(pickup_time, '%m/%d/%y %H:%M')
    dropoff_time_out = datetime.datetime.strptime(dropoff_time, '%m/%d/%y %H:%M')
    trip_time        = (dropoff_time_out - pickup_time_out).seconds / float(60)
    return trip_time

f = udf(time_delta, FloatType())

# (1) Calculate "trip_time"
# (2) Create a "tip_flag" for any record where a customer leaves a tip
# (3) Extract the Pickup Day (as an integer)
# (4) Extract the Pickup Hour (as an integer)
transformed1 = rawdata.withColumn("trip_time", f(rawdata.pickup_datetime, rawdata.dropoff_datetime)) \
                      .withColumn("tip_flag", (when(rawdata.tip_amount > 0.0, 1).otherwise(0)) ) \
                      .withColumn("pickup_day", split(rawdata.pickup_datetime,"\/")[1].cast("integer") ) \
                      .withColumn("pickup_hour", split(split(rawdata.pickup_datetime," ")[1],":")[0].cast("integer") )


#######################################################################################
#
#   Model Prep
#
#######################################################################################

# String Indexer
strindexer = StringIndexer(inputCol="vehicle_id", outputCol="vehicle_id_index")
modelprep1 = strindexer.fit(transformed1).transform(transformed1)

features = ['pickup_longitude','passenger_count','tolls_amount','tip_amount','trip_distance']
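
# The snippet truncates here; a minimal sketch of how the features list above would
# typically be consumed next (an assumed step, not the original code; it also assumes the
# feature columns are already numeric):
from pyspark.ml.feature import VectorAssembler

# Assemble the selected feature columns into a single vector column for model training
assembler = VectorAssembler(inputCols=features, outputCol="features_vector")
modelprep2 = assembler.transform(modelprep1)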
modifiedCFLogsDF = modifiedCFLogs.toDF()

## Create dynamic frame from optimized (Parquet format) Application Load Balancer logs as the data source. Glue Data Catalog = {database = "reInvent2018_aws_service_logs", table_name = "alb_access_optimized"}
albLogs = glueContext.create_dynamic_frame.from_catalog(database = "reInvent2018_aws_service_logs", table_name = "alb_access_optimized", transformation_ctx = "albLog")

## Drop the "year", "month", "day", "hour" fields
trimmedALBLogs = DropFields.apply(frame = albLogs, paths=["year", "month", "day", "hour"], transformation_ctx ="trimmedALBLogs")

## Rename the time field in the ALB logs to alb_time
modifiedALBLogs = RenameField.apply(frame = trimmedALBLogs, old_name = "time", new_name = "alb_time", transformation_ctx ="modifiedALBLogs" )

## Convert ALB Log dynamic frame to Apache Spark data frame
modfiedALBLogDF = modifiedALBLogs.toDF()

## Extract the custom trace id from the albLog column name trace_id in the alb logs, as the Application Load Balancer would have updated the trace_id value with the self field
split_col = split(modfiedALBLogDF['trace_id'], ';')
finalALBLogDF = modfiedALBLogDF.withColumn("custom_trace_id", split_col.getItem(1))

## Join (left outer join) the Lambda@Edge logs with the ALB logs based on the custom trace id
leALBCombinedLogsDF = trimmedLambdaEdgeLogsDF.join(finalALBLogDF, trimmedLambdaEdgeLogsDF["customtraceid"] == finalALBLogDF["custom_trace_id"], "left_outer")

## Join (left outer join) the CloudFront access logs with the combined Lambda@Edge and ALB logs based on the requestid
combinedLogsDF = modifiedCFLogsDF.join(leALBCombinedLogsDF, modifiedCFLogsDF["cf_requestid"] == leALBCombinedLogsDF["requestid"], "left_outer")

## Convert the combined log data frame to a dynamic frame
combinedLogs = DynamicFrame.fromDF(combinedLogsDF, glueContext, "combinedLogs")

## Drop custom trace id and requestid from combined logs
finalCombinedLogs = DropFields.apply(frame = combinedLogs, paths=["custom_trace_id", "cf_requestid"], transformation_ctx ="finalCombinedLogs")

#Destination S3 location for combined logs
Example #36
# MAGIC 
# MAGIC Before we can use the `wordcount()` function, we have to address two issues with the format of the DataFrame:
# MAGIC   + The first issue is that we need to split each line by its spaces.
# MAGIC   + The second issue is we need to filter out empty lines or words.
# MAGIC 
# MAGIC Apply a transformation that will split each 'sentence' in the DataFrame by its spaces, and then transform from a DataFrame that contains lists of words into a DataFrame with each word in its own row.  To accomplish these two tasks you can use the `split` and `explode` functions found in [pyspark.sql.functions](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions).
# MAGIC 
# MAGIC Once you have a DataFrame with one word per row you can apply the [DataFrame operation `where`](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.where) to remove the rows that contain ''.
# MAGIC 
# MAGIC > Note that `shakeWordsDF` should be a DataFrame with one column named `word`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode
shakeWordsDF = shakespeareDF.where(length(shakespeareDF.sentence) > 0).select(explode(split(shakespeareDF.sentence, '\s+')).alias('word'))
shakespeareDF.show()
shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------

# MAGIC %md
# MAGIC ** (4e) Count the words **
Example #37
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, size

spark = SparkSession.builder.appName("Packt").getOrCreate()

data = spark.read.csv('../../Datasets/netflix_titles_nov_2019.csv',
                      header='true')
# data.show()

# keep only the TV shows rated TV-G or TV-Y
movies = data.filter((col('type') == 'TV Show')
                     & ((col('rating') == 'TV-G') | (col('rating') == 'TV-Y')))
movies.show()

# add a column with the number of categories in 'listed_in'
transformed = movies.withColumn('count_lists',
                                size(split(movies['listed_in'], ',')))

# select a subset of columns to store
selected = transformed.select('title', 'cast', 'rating', 'release_year',
                              'duration', 'count_lists', 'listed_in',
                              'description')
selected.show()

# write the contents of the DataFrame to disk
selected.write.csv('transformed.csv', header='true')
Example #38
sentenceDF.show(truncate=False)
(sentenceDF
 .select(removePunctuation(col('sentence')))
 .show(truncate=False))
 
# 4c

fileName = "dbfs:/databricks-datasets/cs100/lab1/data-001/shakespeare.txt"

shakespeareDF = sqlContext.read.text(fileName).select(removePunctuation(col('value')).alias('sentence'))
shakespeareDF.show(15, truncate=False)

# 4d

from pyspark.sql.functions import split, explode
shakeWordsDF = shakespeareDF.select(split(shakespeareDF.sentence, ' ').alias('word'))
shakeWordsDF = shakeWordsDF.select(explode(shakeWordsDF.word))
shakeWordsDF = shakeWordsDF.select(col('col').alias('word'))
shakeWordsDF = shakeWordsDF.where(shakeWordsDF.word != '')

shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# 4e

from pyspark.sql.functions import desc
topWordsAndCountsDF = wordCount(shakeWordsDF)
topWordsAndCountsDF = topWordsAndCountsDF.select(col("word").alias("word"), col("count").alias("number"))
topWordsAndCountsDF = topWordsAndCountsDF.orderBy(topWordsAndCountsDF.number.desc())
topWordsAndCountsDF.show()
Example #39
decode_udf = F.udf(decode, StringType())


def normalize(string):
    return string.lower()


normalize_udf = F.udf(normalize, StringType())

if __name__ == "__main__":
    spark = SparkSession.builder.appName("Test Stream").getOrCreate()
    spark.sparkContext.setLogLevel("FATAL")

    ds = (
        spark.readStream.format("kafka").option(
            "kafka.bootstrap.servers",
            "localhost:9092")  # can add more to this list, seperate with ','
        .option("subscribe", "test").load())

    decoded_ds = ds.select(decode_udf(F.col("value")).alias("content"))
    words = decoded_ds.withColumn("word",
                                  F.explode(F.split(F.col("content"), " ")))
    normalized_words = words.select(normalize_udf(F.col("word")).alias("word"))
    word_counts = normalized_words.groupBy("word").count()

    query = (
        word_counts.writeStream.outputMode("update").format("console").start())

    # to start stream
    query.awaitTermination()
Example #40
                           StructField("stop_id", StringType(), True),\
                           StructField("stop_sequence", IntegerType(), True),\
                           StructField("pickup_type", IntegerType(), True),
                           StructField("drop_off_type", IntegerType(), True)])

    real_stoptimes_schema = StructType([StructField("ROUTE_ID", StringType(), True),\
                           StructField("TRIP_ID", StringType(), True),\
                           StructField("STOP_ID", StringType(), True),\
                           StructField("time",StringType(), True)])

    real_stoptimes = sqlContext.read.format('com.databricks.spark.csv').options(header='false')\
                                                                       .load('new_predict.csv', schema = real_stoptimes_schema)              
    stoptimes = sqlContext.read.format('com.databricks.spark.csv').options(header='false')\
                                                                  .load('stop_times.txt',schema = stop_times_schema)
    
    new_time = real_stoptimes.withColumn('realtime',split(pyspark.sql.functions.from_unixtime(real_stoptimes.time), ' ')[1])\
                             .withColumn('date',split(pyspark.sql.functions.from_unixtime(real_stoptimes.time), ' ')[0])
   
    new_time.registerTempTable('new_time')
    stoptimes.registerTempTable('stoptimes')

    sqlContext.registerFunction("getsec", lambda x: get_sec(x), IntegerType()) #register python function into sql

    join = sqlContext.sql('SELECT ROUTE_ID,TRIP_ID,STOP_ID,realtime,date,(getsec(realtime)-getsec(arrival_time)) as delay\
                           FROM new_time\
                           INNER JOIN stoptimes\
                           ON (TRIP_ID = trip_id AND STOP_ID = stop_id)') # join with GTFS data

    join.registerTempTable('new_join')

    with open(sys.argv[-2]) as fr: #read sql                                                                                               
Example #41
AJdataDF.count()

# deleting a column from a dataframe using drop()
# drop() is like the opposite of select(): Instead of selecting specific columns from a DataFrame, it drops a specified column from a DataFrame
dataDF.drop('occupation').drop('age').show()

# the sample() transformation returns a new DataFrame with a random sample
sampledDF = dataDF.sample(withReplacement=False, fraction=0.10)
print sampledDF.count()
sampledDF.show()

# split() and explode() transformations
from pyspark.sql.functions import split, explode

shakeWordsSplit = (shakespeareDF
                .select(split(shakespeareDF.word,' ').alias('word'))) # split() splits each sentence at spaces, returning an array of words per row
				
shakeWordsExplode = (shakeWordsSplit
                    .select(explode(shakeWordsSplit.word).alias('word'))) # explode() Returns a new row for each element in the given array
					
shakeWordsDF = shakeWordsExplode.filter(shakeWordsExplode.word != '') # removes all the blanks

shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

###############################################################
###                                                         ###
###                         GROUP BY                        ###
###                                                         ###
###############################################################
Example #42
0
sentenceDF.show(truncate=False)
(sentenceDF
 .select(removePunctuation(col('sentence')))
 .show(truncate=False))
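
# removePunctuation() is defined earlier in the notebook; a minimal sketch of a
# typical implementation (strip punctuation, lowercase, trim) is shown here as
# an assumption, not the original:
from pyspark.sql.functions import regexp_replace, trim, lower

def removePunctuation(column):
    # keep letters, digits and spaces, then lowercase and trim the result
    return trim(lower(regexp_replace(column, '[^a-zA-Z0-9 ]', ''))).alias('sentence')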
 
# loading text file

fileName = ""

shakespeareDF = sqlContext.read.text(fileName).select(removePunctuation(col('value')))
shakespeareDF.show(15, truncate=False)

# sample for splitting lines into words and making each word a separate row in the data frame

from pyspark.sql.functions import split, explode
shakeWordsDF1 = shakespeareDF.select(explode(split(shakespeareDF.sentence,' ')).alias('word'))
shakeWordsDF = shakeWordsDF1.where(shakeWordsDF1.word !="")
shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount
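
# wordCount() is used below but defined earlier in the notebook; a minimal
# sketch of the usual implementation (group by word and count), included here
# as an assumption:
def wordCount(wordListDF):
    # return a DataFrame of (word, count) rows
    return wordListDF.groupBy('word').count()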

from pyspark.sql.functions import desc
topWordsAndCountsDF = wordCount(shakeWordsDF).orderBy(desc("count"))
topWordsAndCountsDF.show()

# sample for creating collection of few records 
from faker import Factory
fake = Factory.create()
fake.seed(4321)
from pyspark.sql import Row
def fake_entry():
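    # The body of fake_entry() is truncated in this excerpt; a minimal sketch of
    # a plausible generator built on the Faker instance above (the field names
    # are assumptions, not from the source):
    return Row(name=fake.name(),
               email=fake.email(),
               age=fake.random_int(min=18, max=90))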
# MAGIC Before we can use the `wordcount()` function, we have to address two issues with the format of the DataFrame:
# MAGIC   + The first issue is that we need to split each line by its spaces.
# MAGIC   + The second issue is that we need to filter out empty lines or words.
# MAGIC
# MAGIC Apply a transformation that will split each 'sentence' in the DataFrame by its spaces, and then transform from a DataFrame that contains lists of words into a DataFrame with each word in its own row.  To accomplish these two tasks you can use the `split` and `explode` functions found in [pyspark.sql.functions](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions).
# MAGIC
# MAGIC Once you have a DataFrame with one word per row you can apply the [DataFrame operation `where`](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.where) to remove the rows that contain ''.
# MAGIC
# MAGIC > Note that `shakeWordsDF` should be a DataFrame with one column named `word`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode
shakeWordsDF = (shakespeareDF
                .select(explode(split(shakespeareDF.value , ' ')).alias('word'))
                .filter("word<>''"))
shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------

# MAGIC %md
# MAGIC ** (4e) Count the words **
Example #44
0
File: wc.py Project: cottrell/notebooks
    .builder \
    .appName("StructuredNetworkWordCount") \
    .getOrCreate()

# Create DataFrame representing the stream of input lines from connection to localhost:9999
lines = spark \
    .readStream \
    .format("socket") \
    .option("host", "localhost") \
    .option("port", 9999) \
    .load()

# Split the lines into words
words = lines.select(
   explode(
       split(lines.value, " ")
   ).alias("word")
)

# Generate running word count
wordCounts = words.groupBy("word").count()

output_mode = "complete"
# output_mode = "append"

# Start running the query that prints the running counts to the console
query = wordCounts \
    .writeStream \
    .outputMode(output_mode) \
    .format("console") \
    .start()
# MAGIC 
# MAGIC Before we can use the `wordcount()` function, we have to address two issues with the format of the DataFrame:
# MAGIC   + The first issue is that we need to split each line by its spaces.
# MAGIC   + The second issue is that we need to filter out empty lines or words.
# MAGIC 
# MAGIC Apply a transformation that will split each 'sentence' in the DataFrame by its spaces, and then transform from a DataFrame that contains lists of words into a DataFrame with each word in its own row.  To accomplish these two tasks you can use the `split` and `explode` functions found in [pyspark.sql.functions](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions).
# MAGIC 
# MAGIC Once you have a DataFrame with one word per row you can apply the [DataFrame operation `where`](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.where) to remove the rows that contain ''.
# MAGIC 
# MAGIC > Note that `shakeWordsDF` should be a DataFrame with one column named `word`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode
shakeWordsDF = (shakespeareDF.select(explode(split(col('sentence'),' ')).alias('word')))

shakeWordsDF = shakeWordsDF.filter(col('word') != '')

shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------
        .appName("StructuredNetworkWordCountWindowed")\
        .getOrCreate()

    # Create DataFrame representing the stream of input lines from connection to host:port
    lines = spark\
        .readStream\
        .format('socket')\
        .option('host', host)\
        .option('port', port)\
        .option('includeTimestamp', 'true')\
        .load()

    # Split the lines into words, retaining timestamps
    # split() splits each line into an array, and explode() turns the array into multiple rows
    words = lines.select(
        explode(split(lines.value, ' ')).alias('word'),
        lines.timestamp
    )

    # Group the data by window and word and compute the count of each group
    windowedCounts = words.groupBy(
        window(words.timestamp, windowDuration, slideDuration),
        words.word
    ).count().orderBy('window')

    # Start running the query that prints the windowed word counts to the console
    query = windowedCounts\
        .writeStream\
        .outputMode('complete')\
        .format('console')\
        .option('truncate', 'false')\
def insert_packets_into_table(**kwargs):
    print("Inserting data into the IoT-23 Packets table...")

    print("Step 1 | Get the year, month, and day for the current interval")
    ds = kwargs["ds"]
    year, month, day = get_ds_time(ds)

    print("Step 2 | Get and check path of current dataset partition")
    path_iot23_partition = get_iot23_partition(year, month, day)
    iot23_partition_exists = os.path.exists(path_iot23_partition)

    if not iot23_partition_exists:
        print("This partition does not exist!")
        kwargs['ti'].xcom_push(key='packets_count', value=0)
    else:
        print("Step 3 | Get or create a Spark Session")
        spark = get_spark_session("IoT-23 Dataset Inserter")

        print("Step 4 | Extract the current partition of the IoT-23 dataset")
        df = spark \
            .read \
            .parquet("file://{}".format(path_iot23_partition))

        print("Step 5 | Add the missing columns")
        df = df.withColumn(
            "originate_network_id",
            F.concat(
                F.split(df.originate_host, "\.").getItem(0), F.lit("."),
                F.split(df.originate_host, "\.").getItem(1), F.lit("."),
                F.split(df.originate_host, "\.").getItem(2))).withColumn(
                    "response_network_id",
                    F.concat(
                        F.split(df.response_host, "\.").getItem(0), F.lit("."),
                        F.split(df.response_host, "\.").getItem(1), F.lit("."),
                        F.split(df.response_host,
                                "\.").getItem(2))).withColumn(
                                    "insert_date",
                                    F.to_date(F.lit(ds), "yyyy-MM-dd"))
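
        # A sketch of an equivalent, arguably more readable way to grab the
        # first three octets with a single regex (a suggestion, not from the
        # source):
        #   prefix = r"^(\d+\.\d+\.\d+)"
        #   df = df.withColumn("originate_network_id",
        #                      F.regexp_extract(df.originate_host, prefix, 1)) \
        #          .withColumn("response_network_id",
        #                      F.regexp_extract(df.response_host, prefix, 1))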

        print("Step 6 | Select and rename the columns for the current table")
        df = df.selectExpr(
            "timestamp as timestamp", "uid as uid",
            "originate_network_id as originate_network_id",
            "response_network_id as response_network_id",
            "protocol as protocol", "service as service",
            "duration as duration", "connection_state as connection_state",
            "missed_bytes as missed_bytes", "history as history",
            "tunnel_parents as tunnel_parents", "label as label",
            "detailed_label as detailed_label", "insert_date as insert_date")

        df.printSchema()
        df.show(10, truncate=False)

        print(
            "Step 7 | Log the count of the DataFrame for data quality checks")
        kwargs['ti'].xcom_push(key='packets_count', value=df.count())

        print("Step 8 | Load the dataset data to the Packets table.")
        write_to_db(df, "append", "packets")

        print("Inserting data into the IoT-23 Packets table completed!")
# COMMAND ----------

# Instead of registering a UDF, call the built-in column functions to perform operations on the columns.
# This usually improves performance: the built-ins run inside the JVM and avoid serializing rows out to a Python worker.

# Convert to a Date type
df = df.withColumn('date', F.to_date(df.end_date))

# Parse out the date only
df = df.withColumn(
    'date_only', F.regexp_replace(df.end_date, ' (\d+)[:](\d+)[:](\d+).*$',
                                  ''))

# Split a string and index a field
df = df.withColumn('city', F.split(df.location, '-')[1])

# Perform a date diff function
df = df.withColumn(
    'date_diff', F.datediff(F.to_date(df.end_date), F.to_date(df.start_date)))
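
# For comparison, a minimal sketch of the UDF-based alternative that the comment
# above advises against; the helper name and lambda are assumptions, not from
# the source.
from pyspark.sql import functions as F, types as T

date_only_udf = F.udf(lambda s: s.split(' ')[0] if s else None, T.StringType())
df = df.withColumn('date_only_via_udf', date_only_udf(df.end_date))
# Every row is shipped to a Python worker to run the lambda, which is why the
# built-in regexp_replace / to_date versions above are preferred.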

# COMMAND ----------

df.createOrReplaceTempView("sample_df")
display(sql("select * from sample_df"))
df.printSchema()

# COMMAND ----------

# MAGIC %md
# MAGIC ## Convert the DataFrame back to JSON strings and send back to Kafka
Example #49
0
    spark = SparkSession\
        .builder\
        .appName("StructuredKafkaWordCount")\
        .getOrCreate()

    lines = spark\
        .readStream\
        .format("kafka")\
        .option("kafka.bootstrap.servers", bootstrapServers)\
        .option(subscribeType, topics)\
        .load()\
        .selectExpr("CAST(value AS STRING)")
    
    probes = lines.select(
        split(lines.value,',')[0].alias('timestamp'),
        split(lines.value,',')[1].alias('mac'),
        split(lines.value,',')[2].alias('SSID'),
        split(lines.value,',')[3].alias('fornecedor'),
        split(lines.value,',')[4].alias('macId')
    )
    
    dispositivosPorFornecedor = probes.select("fornecedor", "mac").distinct().groupBy("fornecedor").count()
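
    # processRow is defined elsewhere in the script; the foreach sink below
    # calls it once per result row (for example, to persist or print each
    # fornecedor/count pair).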
    
    query = dispositivosPorFornecedor\
        .writeStream\
        .outputMode("complete")\
        .foreach(processRow)\
        .start()

    query.awaitTermination()
# MAGIC 
# MAGIC Before we can use the `wordcount()` function, we have to address two issues with the format of the DataFrame:
# MAGIC   + The first issue is that we need to split each line by its spaces.
# MAGIC   + The second issue is that we need to filter out empty lines or words.
# MAGIC 
# MAGIC Apply a transformation that will split each 'sentence' in the DataFrame by its spaces, and then transform from a DataFrame that contains lists of words into a DataFrame with each word in its own row.  To accomplish these two tasks you can use the `split` and `explode` functions found in [pyspark.sql.functions](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions).
# MAGIC 
# MAGIC Once you have a DataFrame with one word per row you can apply the [DataFrame operation `where`](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.where) to remove the rows that contain ''.
# MAGIC 
# MAGIC > Note that `shakeWordsDF` should be a DataFrame with one column named `word`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode
shakeWordsDF = (shakespeareDF.select(explode(split(shakespeareDF[0],"\s+")).alias("word"))).where("length(word) > 0")

shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------

# MAGIC %md
# MAGIC ** (4e) Count the words **
def main(spark):
    path = '../../../../data/census/'
    filename = "PEP_2017_PEPANNRES.csv"
    absolute_file_path = get_absolute_file_path(path, filename)

    # Ingestion of the census data
    intermediate_df = spark.read.format("csv") \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .load(absolute_file_path)

    # Renaming and dropping the columns we do not need
    intermediate_df = intermediate_df.drop("GEO.id") \
        .withColumnRenamed("GEO.id2", "id") \
        .withColumnRenamed("GEO.display-label", "label") \
        .withColumnRenamed("rescen42010", "real2010") \
        .drop("resbase42010") \
        .withColumnRenamed("respop72010", "est2010") \
        .withColumnRenamed("respop72011", "est2011") \
        .withColumnRenamed("respop72012", "est2012") \
        .withColumnRenamed("respop72013", "est2013") \
        .withColumnRenamed("respop72014", "est2014") \
        .withColumnRenamed("respop72015", "est2015") \
        .withColumnRenamed("respop72016", "est2016") \
        .withColumnRenamed("respop72017", "est2017")

    intermediate_df.printSchema()
    intermediate_df.show(5)

    # Creates the additional columns
    intermediate_df = intermediate_df \
    .withColumn("countyState", F.split(F.col("label"), ", ")) \
    .withColumn("stateId", F.expr("int(id/1000)")) \
    .withColumn("countyId", F.expr("id%1000"))

    intermediate_df.printSchema()
    intermediate_df.sample(.01).show(5, False)

    intermediate_df = intermediate_df \
        .withColumn("state", F.col("countyState").getItem(1)) \
        .withColumn("county", F.col("countyState").getItem(0)) \
        .drop("countyState")

    intermediate_df.printSchema()
    intermediate_df.sample(.01).show(5, False)

    # I could split the column in one operation if I wanted:
    # countyStateDf: Dataset[Row] = intermediateDf
    #  .withColumn("state", F.split(F.col("label"), ", ").getItem(1))
    #  .withColumn("county", F.split(F.col("label"), ", ").getItem(0))

    # Performs some statistics on the intermediate dataframe
    statDf = intermediate_df \
        .withColumn("diff", F.expr("est2010-real2010")) \
        .withColumn("growth", F.expr("est2017-est2010")) \
        .drop("id") \
        .drop("label") \
        .drop("real2010") \
        .drop("est2010") \
        .drop("est2011") \
        .drop("est2012") \
        .drop("est2013") \
        .drop("est2014") \
        .drop("est2015") \
        .drop("est2016") \
        .drop("est2017")

    statDf.printSchema()
    statDf.sample(.01).show(5, False)
    def dataB_process_otm(df_s: DataFrame, df_cost: DataFrame,
                          df_sr: DataFrame, df_sstat: DataFrame,
                          df_ss: DataFrame, df_ss_remark: DataFrame,
                          df_inv: DataFrame) -> DataFrame:
        """
        This function takes OTM source data frames and processes Freight Costs for the
        invoices in the dataB database that correspond to the Recycle Mills.
        :param df_s: Shipment table dataframe
        :param df_cost: Shipment_Cost table dataframe
        :param df_sr: Shipment_Refnum table dataframe
        :param df_sstat: Shipment_Status table dataframe
        :param df_ss: Shipment_Stop table dataframe
        :param df_ss_remark: Shipment_Stop_Remark table dataframe
        :param df_inv: dataB Recycle Invoices dataframe to match with OTM data
        :return: df: OTM processed dataframe
        """

        # Filter the Shipment_Refnum dataframe to only include records
        # with domain_name of SSCC and shipment_refnum_qual_gid's that equal MB or SSCC.MB.
        # Including a distinct because of duplicate values in the source.
        df_sr = df_sr.where(df_sr.domain_name == 'SSCC')\
            .where(df_sr.shipment_refnum_qual_gid.isin(['MB', 'SSCC.MB']))\
            .distinct()

        # Join on the shipment_stop_remark table to include sub-BOL's
        df_sr_sub_bol = df_sr.join(df_ss_remark, [df_sr.shipment_gid == df_ss_remark.shipment_gid], 'inner')\
            .select(df_sr.shipment_gid,
                    df_ss_remark.remark_text,
                    df_ss_remark.insert_date)
        df_sr_sub_bol = df_sr_sub_bol.withColumnRenamed(
            'remark_text', 'shipment_refnum_value')
        df_sr = df_sr.select(df_sr.shipment_gid, df_sr.shipment_refnum_value,
                             df_sr.insert_date)
        df_sr = df_sr.union(df_sr_sub_bol)

        # Filter down the Shipment_Refnum dataset to only include the records
        # that match with invoices so that the later joins and calculations
        # are more focused.  Since the bol's are unique in OTM, we only
        # need unique bol numbers here to filter down.
        df_inv = df_inv.where(df_inv.lbs >= 0)\
            .groupBy(df_inv.bol_number)\
            .agg((F.sum(df_inv.lbs)/2000).alias("total_bol_tons_from_inv"))

        df_sr = df_sr.groupBy(df_sr.shipment_gid,
                              df_sr.shipment_refnum_value)\
            .agg(F.max(df_sr.insert_date).alias("max_insert_date"))
        df_sr = df_sr.orderBy(df_sr.shipment_gid,
                              df_sr.shipment_refnum_value,
                              df_sr.max_insert_date.desc())\
            .dropDuplicates(["shipment_gid",
                             "shipment_refnum_value"])

        df_sr = df_inv.join(df_sr, [df_inv.bol_number == df_sr.shipment_refnum_value], 'inner')\
            .select(df_sr.shipment_gid,
                    df_sr.shipment_refnum_value,
                    df_inv.total_bol_tons_from_inv)

        df_sr_join_back = df_sr.select(df_sr.shipment_gid,
                                       df_sr.shipment_refnum_value)\
            .withColumnRenamed('shipment_gid', 'df_sr_join_back_shipment_gid')

        df_sr = df_sr.select(df_sr.shipment_gid,
                             df_sr.total_bol_tons_from_inv)\
            .groupBy(df_sr.shipment_gid)\
            .agg(F.sum(df_inv.total_bol_tons_from_inv).alias("total_shipment_gid_tons_from_inv"))

        df_sr = df_sr.join(df_sr_join_back, [df_sr.shipment_gid == df_sr_join_back.df_sr_join_back_shipment_gid], 'inner')\
            .select(df_sr.shipment_gid,
                    df_sr.total_shipment_gid_tons_from_inv,
                    df_sr_join_back.shipment_refnum_value)\
            .withColumnRenamed('shipment_gid', 'sr_shipment_gid')

        df_sstat = df_sstat.groupBy(df_sstat.domain_name,
                                    df_sstat.shipment_gid,
                                    df_sstat.status_type_gid,
                                    df_sstat.status_value_gid)\
            .agg(F.max(df_sstat.insert_date).alias("max_insert_date"))
        df_sstat = df_sstat.orderBy(df_sstat.shipment_gid,
                                    df_sstat.status_type_gid,
                                    df_sstat.status_value_gid,
                                    df_sstat.max_insert_date.desc())\
            .dropDuplicates(["shipment_gid",
                             "status_type_gid",
                             "status_value_gid"])

        # Filter the Shipment Refnum dataframe to only include records where the Shipment Status
        # matches what we are looking for with domain_name of SSCC and specific status_value_id's
        # At least three of the status_value_gid's need to match in order to put the confidence
        # level high enough to signify a match.
        df_sr = df_sr.join(df_sstat, [df_sr.sr_shipment_gid == df_sstat.shipment_gid], 'inner')\
            .where((df_sstat.domain_name == 'SSCC') &
                   (df_sstat.status_value_gid.isin({
                       'SSCC.BOL_ACTUALS_ENTERED_TRANSMISSION',
                       'SSCC.BOL DELETED_NO',
                       'SSCC.SECURE RESOURCES_ACCEPTED',
                       'SSCC.SECURE RESOURCES_PICKUP NOTIFICATION'})))\
            .groupBy(df_sr.sr_shipment_gid,
                     df_sr.shipment_refnum_value,
                     df_sr.total_shipment_gid_tons_from_inv).count()\
            .where(F.col('count') > 2) \
            .select(df_sr.sr_shipment_gid,
                    df_sr.shipment_refnum_value,
                    df_sr.total_shipment_gid_tons_from_inv)

        df_ss = df_ss.groupBy(df_ss.domain_name,
                              df_ss.shipment_gid,
                              df_ss.stop_num,
                              df_ss.dist_from_prev_stop_base)\
            .agg(F.max(df_ss.insert_date).alias("max_insert_date"))
        df_ss = df_ss.select(df_ss.domain_name,
                             df_ss.shipment_gid,
                             df_ss.stop_num,
                             df_ss.dist_from_prev_stop_base,
                             df_ss.max_insert_date)\
            .orderBy(df_ss.shipment_gid,
                     df_ss.stop_num,
                     df_ss.dist_from_prev_stop_base,
                     df_ss.max_insert_date.desc())\
            .dropDuplicates(["shipment_gid",
                             "stop_num"])

        # Filter the Shipment_Stop dataframe to only include records
        # with domain_name of SSCC and then add up the dist_from_prev_stop_base values
        # to determine the mileage
        df_ss = df_ss.where(df_ss.domain_name == 'SSCC')\
            .groupBy(df_ss.shipment_gid)\
            .agg(F.sum('dist_from_prev_stop_base').alias('mileage'))
        df_ss = df_ss.withColumnRenamed('shipment_gid', 'ss_shipment_gid')

        df_cost = df_cost.groupBy(df_cost.cost_type,
                                  df_cost.cost_base,
                                  df_cost.accessorial_code_gid,
                                  df_cost.is_weighted,
                                  df_cost.domain_name,
                                  df_cost.shipment_gid)\
            .agg(F.max(df_cost.insert_date).alias("max_insert_date"))

        # Drop the extra entries based on the max insert date
        df_cost = df_cost.orderBy(df_cost.shipment_gid,
                                  df_cost.max_insert_date.desc())\
            .dropDuplicates(["shipment_gid", "cost_base"])\
            .select(df_cost.shipment_gid,
                    df_cost.accessorial_code_gid,
                    df_cost.cost_type,
                    df_cost.cost_base,
                    df_cost.is_weighted,
                    df_cost.domain_name)

        # Filter the Shipment_Cost dataframe to only include records
        # with domain_name of SSCC
        df_cost = df_cost.where(df_cost.domain_name == 'SSCC')

        # Create a dataframe from Shipment_Cost that includes the detention costs
        df_cost_det = df_cost.where(df_cost.cost_type == 'A')\
            .where(df_cost.accessorial_code_gid.isin({
                'SSCC.DETENTION',
                'SSCC.DETENTION_DESTINATION',
                'SSCC.DTL LOADING',
                'SSCC.STORAGE'}))\
            .groupBy(df_cost.shipment_gid)\
            .agg(F.sum('cost_base').alias('det_cost_base_sum'))\
            .select(df_cost.shipment_gid, 'det_cost_base_sum')
        df_cost_det = df_cost_det.withColumnRenamed('shipment_gid',
                                                    'det_shipment_gid')
        df_cost_det = df_cost_det\
            .withColumn('det_cost_base_sum',
                        F.when(df_cost_det.det_cost_base_sum.isNotNull(), df_cost_det.det_cost_base_sum)
                        .otherwise(0))

        # Create a dataframe from Shipment_Cost that includes the accessorial costs
        df_cost_acc = df_cost.where(df_cost.cost_type.isin({'A', 'S', 'O'}))\
            .where(df_cost.accessorial_code_gid.isin({
                'SSCC.DETENTION',
                'SSCC.DETENTION_DESTINATION',
                'SSCC.DTL LOADING',
                'SSCC.STORAGE'}) == False)\
            .where(F.split(df_cost.accessorial_code_gid, '\.').getItem(1).contains('FSC') == False)\
            .where(df_cost.is_weighted == 'N')\
            .groupBy(df_cost.shipment_gid)\
            .agg(F.sum('cost_base').alias('acc_cost_base_sum'))\
            .select(df_cost.shipment_gid, 'acc_cost_base_sum')
        df_cost_acc = df_cost_acc.withColumnRenamed('shipment_gid',
                                                    'acc_shipment_gid')
        df_cost_acc = df_cost_acc\
            .withColumn('acc_cost_base_sum',
                        F.when(df_cost_acc.acc_cost_base_sum.isNotNull(), df_cost_acc.acc_cost_base_sum)
                        .otherwise(0))

        # Create a dataframe from Shipment_Cost that includes the accessorial costs
        df_cost_fsrchg = df_cost.where(df_cost.cost_type == 'A')\
            .where(F.split(df_cost.accessorial_code_gid, '\.').getItem(1).contains('FSC'))\
            .groupBy(df_cost.shipment_gid)\
            .agg(F.sum('cost_base').alias('fsrchg_cost_base_sum'))\
            .select(df_cost.shipment_gid, 'fsrchg_cost_base_sum')
        df_cost_fsrchg = df_cost_fsrchg.withColumnRenamed(
            'shipment_gid', 'fsrchg_shipment_gid')
        df_cost_fsrchg = df_cost_fsrchg\
            .withColumn('fsrchg_cost_base_sum',
                        F.when(df_cost_fsrchg.fsrchg_cost_base_sum.isNotNull(), df_cost_fsrchg.fsrchg_cost_base_sum)
                        .otherwise(0))

        # Create a dataframe from Shipment_Cost that includes the base rate costs
        # for LTL based shipments.  This will be later joined to the Shipment
        # table to apply a Freight cost value if the transport mode is LTL
        df_cost_ltl_base = df_cost.where(df_cost.cost_type.isin({'B', 'D'}))\
            .groupBy(df_cost.shipment_gid)\
            .agg(F.sum('cost_base').alias('ltl_base_cost_base_sum'))\
            .select(df_cost.shipment_gid, 'ltl_base_cost_base_sum')
        df_cost_ltl_base = df_cost_ltl_base.withColumnRenamed(
            'shipment_gid', 'ltl_base_shipment_gid')
        df_cost_ltl_base = df_cost_ltl_base\
            .withColumn('ltl_base_cost_base_sum',
                        F.when(df_cost_ltl_base.ltl_base_cost_base_sum.isNotNull(),
                               df_cost_ltl_base.ltl_base_cost_base_sum)
                        .otherwise(0))

        # Create a dataframe from Shipment_Cost that includes the base rate costs
        # for non-LTL based shipments.  This will be later joined to the Shipment
        # table to apply a Freight cost value if the transport mode is not LTL
        df_cost_nonltl_base = df_cost.where(df_cost.cost_type == 'B')\
            .groupBy(df_cost.shipment_gid)\
            .agg(F.sum('cost_base').alias('nonltl_base_cost_base_sum')) \
            .select(df_cost.shipment_gid, 'nonltl_base_cost_base_sum')
        df_cost_nonltl_base = df_cost_nonltl_base.withColumnRenamed(
            'shipment_gid', 'nonltl_base_shipment_gid')
        df_cost_nonltl_base = df_cost_nonltl_base\
            .withColumn('nonltl_base_cost_base_sum',
                        F.when(df_cost_nonltl_base.nonltl_base_cost_base_sum.isNotNull(),
                               df_cost_nonltl_base.nonltl_base_cost_base_sum)
                        .otherwise(0))

        df_s = df_s.groupBy(df_s.shipment_gid,
                            df_s.transport_mode_gid,
                            df_s.total_weight_base) \
            .agg(F.max(df_s.insert_date).alias("max_insert_date"))
        df_s = df_s.orderBy(df_s.shipment_gid, df_s.max_insert_date.desc())\
            .dropDuplicates(["shipment_gid"])

        # This join filters down the ref_nums to only shipments with good statuses that are
        # relevant to our invoices.
        df = df_sr.join(df_s, [df_sr.sr_shipment_gid == df_s.shipment_gid],
                        'left_outer')
        df = df.join(df_ss, [df_ss.ss_shipment_gid == df.sr_shipment_gid],
                     'left_outer')
        df = df.join(df_cost_det,
                     [df_cost_det.det_shipment_gid == df.sr_shipment_gid],
                     'left_outer')
        df = df.join(df_cost_acc,
                     [df_cost_acc.acc_shipment_gid == df.sr_shipment_gid],
                     'left_outer')
        df = df.join(
            df_cost_fsrchg,
            [df_cost_fsrchg.fsrchg_shipment_gid == df.sr_shipment_gid],
            'left_outer')
        df = df.join(
            df_cost_ltl_base,
            [df_cost_ltl_base.ltl_base_shipment_gid == df.sr_shipment_gid],
            'left_outer')
        df = df.join(df_cost_nonltl_base, [
            df_cost_nonltl_base.nonltl_base_shipment_gid == df.sr_shipment_gid
        ], 'left_outer')
        df = df.select(df.shipment_gid, df.transport_mode_gid,
                       df.total_shipment_gid_tons_from_inv,
                       df.shipment_refnum_value, df.mileage,
                       df.det_cost_base_sum, df.acc_cost_base_sum,
                       df.fsrchg_cost_base_sum, df.ltl_base_cost_base_sum,
                       df.nonltl_base_cost_base_sum)
        df = df.withColumn('det_cost_base_sum',
                           F.when(df.det_cost_base_sum.isNotNull(), df.det_cost_base_sum)
                           .otherwise(0))\
            .withColumn('acc_cost_base_sum',
                        F.when(df.acc_cost_base_sum.isNotNull(), df.acc_cost_base_sum)
                        .otherwise(0))\
            .withColumn('fsrchg_cost_base_sum',
                        F.when(df.fsrchg_cost_base_sum.isNotNull(), df.fsrchg_cost_base_sum)
                        .otherwise(0))\
            .withColumn('ltl_base_cost_base_sum',
                        F.when(df.ltl_base_cost_base_sum.isNotNull(), df.ltl_base_cost_base_sum)
                        .otherwise(0))\
            .withColumn('nonltl_base_cost_base_sum',
                        F.when(df.nonltl_base_cost_base_sum.isNotNull(), df.nonltl_base_cost_base_sum)
                        .otherwise(0))

        # Calculate the individual costs based on all of the joined tables.
        df = df.withColumn('tons',
                           F.when(df.total_shipment_gid_tons_from_inv.isNull(), 0)
                           .otherwise(df.total_shipment_gid_tons_from_inv).cast(T.DecimalType(38, 18)))\
            .withColumn('detention',
                        F.when(df.det_cost_base_sum.isNull(), 0)
                        .otherwise(df.det_cost_base_sum).cast(T.DecimalType(38, 18)))\
            .withColumn('accessorials',
                        F.when(df.acc_cost_base_sum.isNull(), 0)
                        .otherwise(df.acc_cost_base_sum).cast(T.DecimalType(38, 18)))\
            .withColumn('fuel_surcharge',
                        F.when(df.fsrchg_cost_base_sum.isNull(), 0)
                        .otherwise(df.fsrchg_cost_base_sum).cast(T.DecimalType(38, 18)))\
            .withColumn('base_rate',
                        F.when((df.ltl_base_cost_base_sum.isNull() & df.nonltl_base_cost_base_sum.isNull()), 0)
                        .when(F.trim(F.upper(df.transport_mode_gid)) == 'LTL', df.ltl_base_cost_base_sum)
                        .otherwise(df.nonltl_base_cost_base_sum).cast(T.DecimalType(38, 18)))
        # Calculate the freight rate per ton
        df = df.withColumn('freight_rate_per_ton',
                           F.when(df.tons > 0,
                                  (df.detention + df.accessorials + df.fuel_surcharge + df.base_rate) / df.tons)
                           .otherwise(0))\
            .withColumnRenamed('shipment_refnum_value', 'bol_number_join')

        # It is possible for multiple shipment_gid to match to the same bol_number and have the same cost.
        # This distinct removes those cases so as not to introduce duplicates when joining with the invoice table.
        df = (df.select(df.freight_rate_per_ton,
                        df.bol_number_join).distinct())

        return df
#
# with this JSON format: {"customerName":"Sam Test","email":"*****@*****.**","phone":"8015551212","birthDay":"2001-01-03"}
decodedCustomerStreamDF = encodedCustomerStreamDF.withColumn("customer", unbase64(encodedCustomerStreamDF.encodedCustomer).cast("string"))

# TO-DO: parse the JSON in the Customer record and store in a temporary view called CustomerRecords
decodedCustomerStreamDF\
    .withColumn("customer", from_json("customer", customerSchema))\
    .select(col('customer.*'))\
    .createOrReplaceTempView("CustomerRecords")

# TO-DO: JSON parsing will set non-existent fields to null, so let's select just the fields we want, where they are not null as a new dataframe called emailAndBirthDayStreamingDF
emailAndBirthDayStreamingDF = sparkApp.sql("select birthDay, email from CustomerRecords where birthDay is not null and email is not null")

# TO-DO: Split the birth year as a separate field from the birthday
# TO-DO: Select only the birth year and email fields as a new streaming data frame called emailAndBirthYearStreamingDF
emailAndBirthYearStreamingDF = emailAndBirthDayStreamingDF.select('email', split(emailAndBirthDayStreamingDF.birthDay,"-").getItem(0).alias("birthYear"))

emailAndBirthYearStreamingDF.writeStream.outputMode("append").format("console").start()
# TO-DO: using the spark application object, read a streaming dataframe from the Kafka topic stedi-events as the source
# Be sure to specify the option that reads all the events from the topic including those that were published before you started the spark stream
print("Start reading file")                                   
# TO-DO: cast the value column in the streaming dataframe as a STRING 

riskStreamingFile = sparkApp.readStream.text('stedi-application')

# TO-DO: parse the JSON from the single column "value" with a json object in it, like this:
# +------------+
# | value      |
# +------------+
# |{"custom"...|
# +------------+
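
# A minimal sketch of the Kafka read described by the TO-DOs above; the
# bootstrap server address and the variable name are assumptions, not from the
# source:
riskEventsRawStreamingDF = sparkApp \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "stedi-events") \
    .option("startingOffsets", "earliest") \
    .load() \
    .selectExpr("cast(value as string) value")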
# COMMAND ----------

df.na.replace([""], ["UNKNOWN"], "Description")


# COMMAND ----------

from pyspark.sql.functions import struct
complexDF = df.select(struct("Description", "InvoiceNo").alias("complex"))
complexDF.createOrReplaceTempView("complexDF")


# COMMAND ----------

from pyspark.sql.functions import split
df.select(split(col("Description"), " ")).show(2)


# COMMAND ----------

df.select(split(col("Description"), " ").alias("array_col"))\
  .selectExpr("array_col[0]").show(2)


# COMMAND ----------

from pyspark.sql.functions import size
df.select(size(split(col("Description"), " "))).show(2) # shows 5 and 3


# COMMAND ----------
Taking a compound field like GARAGEDESCRIPTION and massaging it into something useful is an involved process, so it is helpful to understand early what value you might gain from expanding it. In this example, we will convert our string to a list-like array, explode it, and then inspect the unique values.

Instructions
Import the needed functions split() and explode() from pyspark.sql.functions.
Use split() to create a new column garage_list by splitting df['GARAGEDESCRIPTION'] on ', ', which is a comma followed by a space.
Create a new record for each value in df['garage_list'] using explode() and assign it to a new column ex_garage_list.
Use distinct() to get the unique values of ex_garage_list and show the first 100 rows, truncating them at 50 characters to display the values.
'''

# Code
# Import needed functions
from pyspark.sql.functions import split, explode

# Convert string to list-like array
df = df.withColumn('garage_list', split(df['GARAGEDESCRIPTION'], ', '))

# Explode the values into new records
ex_df = df.withColumn('ex_garage_list', explode(df['garage_list']))

# Inspect the values
ex_df[['ex_garage_list']].distinct().show(100, truncate=50)
'''result
<script.py> output:
    +----------------------------+
    |              ex_garage_list|
    +----------------------------+
    |             Attached Garage|
    |      On-Street Parking Only|
    |                        None|
    | More Parking Onsite for Fee|
        .getOrCreate()

    userSchema = StructType().add("value", "string")

    # Create DataFrame representing the stream of input lines from connection to host:port
    lines = spark\
        .readStream\
        .format('csv')\
        .schema(userSchema)\
        .load('history')

    # Split the lines into words
    words = lines.select(
        # explode turns each item in an array into a separate row
        explode(
            split(lines.value, ' ')
        ).alias('word')
    )

    # Generate running word count
    wordCounts = words.groupBy('word').count()

    # Start running the query that keeps the running counts in an in-memory table named 'table'
    query = wordCounts\
        .writeStream\
        .outputMode('complete')\
        .format('memory')\
        .queryName('table')\
        .start()
    
    # TODO
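    # A sketch of what the TODO above presumably intends: query the in-memory
    # sink a few times while the stream runs (an assumption, not from the
    # source; backticks because the query was named 'table'):
    import time
    for _ in range(5):
        time.sleep(5)
        spark.sql('SELECT * FROM `table`').show()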
Example #57
0
if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("StructuredNetworkWordCountFileSink") \
        .getOrCreate()

    spark.sparkContext.setLogLevel('WARN')

    lines = spark \
        .readStream \
        .format("socket") \
        .option("host", "localhost") \
        .option("port", 9999) \
        .load()

    words = lines.select(explode(split(lines.value, " ")).alias("word"))

    all_length_5_words = words.filter(length("word") == 5)

    query = all_length_5_words \
        .writeStream \
        .outputMode("append") \
        .format("parquet") \
        .option("path", "file:///tmp/filesink") \
        .option("checkpointLocation", "file:///tmp/file-sink-cp") \
        .trigger(processingTime="8 seconds") \
        .start()

    query.awaitTermination()
Example #58
0
# MAGIC   + The second issue is that we need to filter out empty lines or words.
# MAGIC 
# MAGIC Apply a transformation that will split each 'sentence' in the DataFrame by its spaces, and then transform from a DataFrame that contains lists of words into a DataFrame with each word in its own row.  To accomplish these two tasks you can use the `split` and `explode` functions found in [pyspark.sql.functions](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions).
# MAGIC 
# MAGIC Once you have a DataFrame with one word per row you can apply the [DataFrame operation `where`](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.where) to remove the rows that contain ''.
# MAGIC 
# MAGIC > Note that `shakeWordsDF` should be a DataFrame with one column named `word`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode
shakeWordsDF = (shakespeareDF
                .select(
                  explode(
                    split(
                      shakespeareDF.sentence, '\s')
                    ).alias('word')
                  )
               ).where("word != ''")

shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------
# MAGIC 
# MAGIC Before we can use the `wordcount()` function, we have to address two issues with the format of the DataFrame:
# MAGIC   + The first issue is that we need to split each line by its spaces.
# MAGIC   + The second issue is that we need to filter out empty lines or words.
# MAGIC 
# MAGIC Apply a transformation that will split each 'sentence' in the DataFrame by its spaces, and then transform from a DataFrame that contains lists of words into a DataFrame with each word in its own row.  To accomplish these two tasks you can use the `split` and `explode` functions found in [pyspark.sql.functions](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions).
# MAGIC 
# MAGIC Once you have a DataFrame with one word per row you can apply the [DataFrame operation `where`](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.where) to remove the rows that contain ''.
# MAGIC 
# MAGIC > Note that `shakeWordsDF` should be a DataFrame with one column named `word`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode, col
shakeWordsDF = shakespeareDF.select(explode(split(shakespeareDF["word"], "\s+")).alias("word")).where(col("word") != '')
                

shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------

# MAGIC %md