import datetime

import utils


def main():
    print("Starting at " + str(datetime.datetime.now()))

    # Build the expected CSV name and the partition date for this month of trip data
    file_name = '{}_tripdata_{}-{}.csv'.format(taxi_color, year, str(month).zfill(2))
    dt_partition = '{}-{}-01'.format(year, str(month).zfill(2))

    spark = utils.get_spark_session("ny_taxi")

    # Stage the raw CSV from the local directory into HDFS
    load_file_hdfs(local_dir=local_dir,
                   hdfs_path=hdfs_raw_path,
                   file_name=file_name,
                   overwrite=overwrite_file)

    # Parse and clean the raw file, separating rows that fail the schema
    df_transform, df_corrupted = \
        transform_file(hdfs_raw_path,
                       schema=define_schema(taxi_color),
                       partition=dt_partition,
                       spark=spark,
                       file_name=file_name)

    # Write the clean trips as parquet
    utils.write_parquet(df=df_transform,
                        hdfs_path=hdfs_path,
                        file_name='{}_tripdata'.format(taxi_color),
                        schema=write_schema(taxi_color))

    # Write corrupted rows separately, if any were found
    if df_corrupted.count() != 0:
        utils.write_parquet(df=df_corrupted,
                            hdfs_path=hdfs_path,
                            file_name='{}_corrupted'.format(taxi_color),
                            schema=write_schema("corrupted"))

    print("Finished at " + str(datetime.datetime.now()))
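# The module-level job parameters referenced by main() (taxi_color, year, month, the
# local/HDFS paths, and overwrite_file), as well as load_file_hdfs, transform_file,
# define_schema, and write_schema, are defined elsewhere in this project and are not
# part of this fragment. A hypothetical way to wire the parameters up and invoke the
# job -- every name and default below is an assumption, not the original code:
if __name__ == "__main__":
    taxi_color = "yellow"                  # assumed dataset colour
    year, month = 2020, 1                  # assumed load month
    local_dir = "/tmp/taxi"                # assumed local staging directory
    hdfs_raw_path = "/raw/taxi"            # assumed raw landing path in HDFS
    hdfs_path = "/curated/taxi"            # assumed curated output path
    overwrite_file = True

    main()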
from __future__ import print_function

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor

from utils import get_spark_session

if __name__ == "__main__":
    spark = get_spark_session('Real Estate Predictor')

    # Load up our data and convert it to the format MLlib expects.
    data = spark.read.option('header', 'true').option(
        'inferSchema', 'true').csv("/opt/bitnami/spark/spark-data/regression.txt")

    assembler = VectorAssembler().setInputCols(
        ['HouseAge', 'DistanceToMRT', 'NumberConvenienceStores']).setOutputCol('features')
    df = assembler.transform(data).select('PriceOfUnitArea', 'features')

    # Let's split our data into training data and testing data
    trainTest = df.randomSplit([0.5, 0.5])
    trainingDF = trainTest[0]
    testDF = trainTest[1]

    # Now create our decision tree regression model
    dtr = DecisionTreeRegressor().setFeaturesCol('features').setLabelCol(
        'PriceOfUnitArea')

    # Train the model using our training data
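    # The script is cut off right after the comment above. A hedged sketch of the usual
    # remaining steps -- fit the tree, predict on the held-out split, and print
    # predictions next to the known prices. This is an assumption about how the script
    # continues, not the original code.
    model = dtr.fit(trainingDF)

    # Generate predictions for the test data
    fullPredictions = model.transform(testDF).cache()

    # Pull predictions and known labels back to the driver and print them side by side
    predictions = fullPredictions.select('prediction').rdd.map(lambda x: x[0])
    labels = fullPredictions.select('PriceOfUnitArea').rdd.map(lambda x: x[0])
    for prediction, label in zip(predictions.collect(), labels.collect()):
        print(prediction, label)

    spark.stop()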
        func.when(func.col("denominator") != 0,
                  func.col("numerator") / func.col("denominator")) \
        .otherwise(0) \
    ).select("movie1", "movie2", "score", "numPairs")

    return result


# Get movie name by given movie id
def getMovieName(movieNames, movieId):
    result = movieNames.filter(func.col("movieID") == movieId) \
        .select("movieTitle").collect()[0]

    return result[0]


spark = get_spark_session('MovieSimilarities')

movieNamesSchema = StructType([
    StructField("movieID", IntegerType(), True),
    StructField("movieTitle", StringType(), True)
])

moviesSchema = StructType([
    StructField("userID", IntegerType(), True),
    StructField("movieID", IntegerType(), True),
    StructField("rating", IntegerType(), True),
    StructField("timestamp", LongType(), True)
])

# Create a broadcast dataset of movieID and movieTitle.
# Apply ISO-8859-1 charset
movieNames = spark.read \
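    .option("sep", "|") \
    .option("charset", "ISO-8859-1") \
    .schema(movieNamesSchema) \
    .csv(f"{SPARK_DATA_PATH}/ml-100k/u.item")

# NOTE: the read above is truncated in this fragment and has been completed with a
# hedged guess -- the "|" separator, the ISO-8859-1 charset from the comment, the
# MovieLens u.item path, and the SPARK_DATA_PATH constant (imported from utils in the
# sibling scripts) are assumptions, not the original code.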
from pyspark.sql import Row

from utils import SPARK_DATA_PATH, get_spark_session

# Create a SparkSession
spark = get_spark_session('Spark SQL')


def mapper(line):
    fields = line.split(',')
    return Row(ID=int(fields[0]), name=str(fields[1]),
               age=int(fields[2]), numFriends=int(fields[3]))


lines = spark.sparkContext.textFile(f"{SPARK_DATA_PATH}/fakefriends.csv")
people = lines.map(mapper)

# Infer the schema, and register the DataFrame as a table.
schemaPeople = spark.createDataFrame(people).cache()
schemaPeople.createOrReplaceTempView("people")

# SQL can be run over DataFrames that have been registered as a table.
teenagers = spark.sql("SELECT * FROM people WHERE age >= 13 AND age <= 19")

# The results of SQL queries are DataFrames and support all the normal DataFrame operations.
for teen in teenagers.collect():
    print(teen)

# We can also use functions instead of SQL queries:
schemaPeople.groupBy("age").count().orderBy("age").show()
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

from utils import SPARK_DATA_PATH, get_spark_session

spark = get_spark_session('MostPopularSuperhero')

schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True)
])

names = spark.read.schema(schema).option("sep", " ").csv(f"{SPARK_DATA_PATH}/Marvel-names.txt")

lines = spark.read.text(f"{SPARK_DATA_PATH}/Marvel-graph.txt")

# Small tweak vs. what's shown in the video: we trim each line of whitespace as that could
# throw off the counts.
connections = lines.withColumn(
    "id",
    func.split(func.trim(func.col("value")), " ")[0]
).withColumn(
    "connections",
    func.size(func.split(func.trim(func.col("value")), " ")) - 1
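    # The script is truncated at this point. A hedged sketch of the likely remaining
    # steps -- close the chained withColumn, sum connections per hero, and print the
    # most connected name. This is an assumption, not the original code.
).groupBy("id").agg(func.sum("connections").alias("connections"))

mostPopular = connections.sort(func.col("connections").desc()).first()
mostPopularName = names.filter(func.col("id") == mostPopular[0]).select("name").first()

print(f"{mostPopularName[0]} is the most popular superhero with "
      f"{mostPopular[1]} co-appearances.")

spark.stop()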
import codecs

from pyspark.sql.types import StructType, StructField, IntegerType, LongType

from utils import SPARK_DATA_PATH, get_spark_session


def loadMovieNames():
    movieNames = {}
    # CHANGE THIS TO THE PATH TO YOUR u.ITEM FILE:
    with codecs.open(f"{SPARK_DATA_PATH}/ml-100k/u.item", "r",
                     encoding='ISO-8859-1', errors='ignore') as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames


spark = get_spark_session('Popular movies nice')

nameDict = spark.sparkContext.broadcast(loadMovieNames())

# Create schema when reading u.data
schema = StructType([
    StructField("userID", IntegerType(), True),
    StructField("movieID", IntegerType(), True),
    StructField("rating", IntegerType(), True),
    StructField("timestamp", LongType(), True)
])

# Load up movie data as dataframe
moviesDF = spark.read.option(
    "sep", "\t").schema(schema).csv(f"{SPARK_DATA_PATH}/ml-100k/u.data")
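# The script is cut off here. A hedged sketch of the usual remaining steps -- count
# ratings per movie, translate IDs to titles through the broadcast dictionary with a
# UDF, and show the most rated movies. This is an assumption, not the original code.
from pyspark.sql import functions as func


def lookupName(movieID):
    return nameDict.value[movieID]


lookupNameUDF = func.udf(lookupName)

# Count how many times each movie was rated
movieCounts = moviesDF.groupBy("movieID").count()

# Add a movieTitle column using the broadcast lookup
moviesWithNames = movieCounts.withColumn("movieTitle", lookupNameUDF(func.col("movieID")))

# Sort by popularity and show the top 10
moviesWithNames.orderBy(func.desc("count")).show(10, False)

spark.stop()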
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, LongType

from utils import SPARK_DATA_PATH, get_spark_session

spark = get_spark_session('Popular movies')

# Create schema when reading u.data
schema = StructType([
    StructField("userID", IntegerType(), True),
    StructField("movieID", IntegerType(), True),
    StructField("rating", IntegerType(), True),
    StructField("timestamp", LongType(), True)
])

# Load up movie data as dataframe
moviesDF = spark.read.option("sep", "\t").schema(schema).csv(
    f"{SPARK_DATA_PATH}/ml-100k/u.data"
)

# Some SQL-style magic to sort all movies by popularity in one line!
topMovieIDs = moviesDF.groupBy("movieID").count().orderBy(func.desc("count"))

# Grab the top 10
topMovieIDs.show(10)

# Stop the session
spark.stop()
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

from utils import SPARK_DATA_PATH, get_spark_session

spark = get_spark_session('MinTemperatures')

schema = StructType([
    StructField("stationID", StringType(), True),
    StructField("date", IntegerType(), True),
    StructField("measure_type", StringType(), True),
    StructField("temperature", FloatType(), True)
])

# Read the file as a dataframe
df = spark.read.schema(schema).csv(f"{SPARK_DATA_PATH}/1800.csv")
df.printSchema()

# Filter out all but TMIN entries
minTemps = df.filter(df.measure_type == "TMIN")

# Select only stationID and temperature
stationTemps = minTemps.select("stationID", "temperature")

# Aggregate to find minimum temperature for every station
minTempsByStation = stationTemps.groupBy("stationID").min("temperature")
minTempsByStation.show()

# Convert temperature to fahrenheit and sort the dataset
minTempsByStationF = minTempsByStation.withColumn(
    "temperature",
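    # The script is truncated at this point. A hedged completion: assuming the source
    # data is in tenths of a degree Celsius, convert to Fahrenheit, sort, and print the
    # per-station minimums. This is an assumption about the remaining steps, not the
    # original code.
    func.round(func.col("min(temperature)") * 0.1 * (9.0 / 5.0) + 32.0, 2)) \
    .select("stationID", "temperature").sort("temperature")

results = minTempsByStationF.collect()

for result in results:
    print(result[0] + "\t{:.2f}F".format(result[1]))

spark.stop()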
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 18 09:15:05 2019

@author: Frank
"""
from pyspark.sql.functions import regexp_extract

from utils import SPARK_DATA_PATH, get_spark_session

# Create a SparkSession
spark = get_spark_session('Structured Streaming')

# Monitor the logs directory for new log data, and read in the raw lines as accessLines
accessLines = spark.readStream.text(f"{SPARK_DATA_PATH}/logs")

# Parse out the common log format to a DataFrame
contentSizeExp = r'\s(\d+)$'
statusExp = r'\s(\d{3})\s'
generalExp = r'\"(\S+)\s(\S+)\s*(\S*)\"'
timeExp = r'\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} -\d{4})]'
hostExp = r'(^\S+\.[\S+\.]+\S+)\s'

logsDF = accessLines.select(
    regexp_extract('value', hostExp, 1).alias('host'),
    regexp_extract('value', timeExp, 1).alias('timestamp'),
    regexp_extract('value', generalExp, 1).alias('method'),
    regexp_extract('value', generalExp, 2).alias('endpoint'),
    regexp_extract('value', generalExp, 3).alias('protocol'),
    regexp_extract('value', statusExp, 1).cast('integer').alias('status'),
    regexp_extract('value', contentSizeExp,
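    # The select is truncated at this point. A hedged completion plus the typical rest
    # of the job -- count requests per status code and stream the running counts to the
    # console. This is an assumption about the remaining steps, not the original code.
                   1).cast('integer').alias('content_size'))

# Keep a running count of every access by status code
statusCountsDF = logsDF.groupBy(logsDF.status).count()

# Kick off our streaming query, dumping results to the console
query = (statusCountsDF.writeStream
         .outputMode("complete")
         .format("console")
         .queryName("counts")
         .start())

# Run until terminated
query.awaitTermination()

spark.stop()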
# -*- coding: utf-8 -*-
from pyspark.sql.functions import col, current_timestamp, regexp_extract, window

from utils import SPARK_DATA_PATH, get_spark_session

# Create a SparkSession
spark = get_spark_session('Windowed Structured Streaming')

# Monitor the logs directory for new log data, and read in the raw lines as accessLines
accessLines = spark.readStream.text(f"{SPARK_DATA_PATH}/logs")

# Parse out the common log format to a DataFrame
contentSizeExp = r'\s(\d+)$'
statusExp = r'\s(\d{3})\s'
generalExp = r'\"(\S+)\s(\S+)\s*(\S*)\"'
timeExp = r'\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} -\d{4})]'
hostExp = r'(^\S+\.[\S+\.]+\S+)\s'

logsDF = accessLines.select(
    regexp_extract('value', hostExp, 1).alias('host'),
    regexp_extract('value', timeExp, 1).alias('timestamp'),
    regexp_extract('value', generalExp, 1).alias('method'),
    regexp_extract('value', generalExp, 2).alias('endpoint'),
    regexp_extract('value', generalExp, 3).alias('protocol'),
    regexp_extract('value', statusExp, 1).cast('integer').alias('status'),
    regexp_extract('value', contentSizeExp, 1).cast('integer').alias('content_size'))

logsDF = logsDF.withColumn('eventTime', current_timestamp())

# Keep a windowed running count of every access by endpoint
endpointCounts = logsDF.groupBy(
    window(
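        # The script is truncated here. A hedged completion: a sliding window keyed on
        # the processing-time eventTime column, counted per endpoint and streamed to
        # the console. The window and slide durations are assumptions, not the
        # original code.
        col('eventTime'), '30 seconds', '10 seconds'),
    col('endpoint')).count()

# Sort by the most-hit endpoints first
sortedEndpointCounts = endpointCounts.orderBy(col('count').desc())

# Kick off our streaming query, dumping results to the console
query = (sortedEndpointCounts.writeStream
         .outputMode("complete")
         .format("console")
         .queryName("counts")
         .start())

# Run until terminated
query.awaitTermination()

spark.stop()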
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

from utils import SPARK_DATA_PATH, get_spark_session

spark = get_spark_session('MostObscureSuperhero')

schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True)
])

names = spark.read.schema(schema).option(
    "sep", " ").csv(f"{SPARK_DATA_PATH}/Marvel-names.txt")

lines = spark.read.text(f"{SPARK_DATA_PATH}/Marvel-graph.txt")

# Small tweak vs. what's shown in the video: we trim each line of whitespace as that could
# throw off the counts.
connections = lines.withColumn(
    "id",
    func.split(func.trim(func.col("value")), " ")[0]).withColumn(
    "connections",
    func.size(func.split(func.trim(func.col("value")), " ")) - 1).groupBy("id").agg(
    func.sum("connections").alias("connections"))

# Show the minimum number of connections
minConnections = connections.agg(
    func.min('connections').alias('min_connections'))
print(
    f'Minimum number of connections is {minConnections.first().min_connections}'
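)

# The script is cut off above. A hedged sketch of the likely remaining steps: list
# every hero that has only the minimum number of connections, joining back to the
# names lookup. This is an assumption, not the original code.
minConnectionCount = minConnections.first().min_connections

mostObscure = connections.filter(func.col("connections") == minConnectionCount)
mostObscureNames = mostObscure.join(names, "id")

print(f"The following characters have only {minConnectionCount} connection(s):")
mostObscureNames.select("name").show()

spark.stop()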
from pyspark.sql import types, functions

from utils import SPARK_DATA_PATH, get_spark_session

spark = get_spark_session('Customer spend dataframe')

schema = types.StructType([
    types.StructField('customer_id', types.IntegerType(), True),
    types.StructField('order_id', types.IntegerType(), True),
    types.StructField('order_price', types.FloatType(), True),
])

df = spark.read.schema(schema).csv(f"{SPARK_DATA_PATH}/customer-orders.csv")
df.printSchema()

# Remove unused column
df = df.select('customer_id', 'order_price')

# Total spend per customer, rounded to two decimals and sorted
df = df.groupBy('customer_id').sum('order_price')
df = df.withColumn('total', functions.round('sum(order_price)', 2)) \
    .select('customer_id', 'total').sort('total')

df.show(df.count())
from __future__ import print_function

from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression

from utils import SPARK_DATA_PATH, get_spark_session

if __name__ == "__main__":
    # Create a SparkSession
    spark = get_spark_session('LinearRegression')

    # Load up our data and convert it to the format MLlib expects.
    inputLines = spark.sparkContext.textFile(
        "/opt/bitnami/spark/spark-data/regression.txt")
    data = inputLines.map(lambda x: x.split(",")).map(
        lambda x: (float(x[0]), Vectors.dense(float(x[1]))))

    # Convert this RDD to a DataFrame
    colNames = ["label", "features"]
    df = data.toDF(colNames)

    # Note, there are lots of cases where you can avoid going from an RDD to a DataFrame.
    # Perhaps you're importing data from a real database. Or you are using structured streaming
    # to get your data.

    # Let's split our data into training data and testing data
    trainTest = df.randomSplit([0.5, 0.5])
    trainingDF = trainTest[0]
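    testDF = trainTest[1]

    # The script is cut off above. A hedged sketch of the usual remaining steps -- fit a
    # linear regression on the training split and compare predictions against the
    # held-out labels. The hyperparameters are assumptions, not the original code.
    lir = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

    # Train the model using our training data
    model = lir.fit(trainingDF)

    # Generate predictions for the test data
    fullPredictions = model.transform(testDF).cache()

    # Extract the predictions and the known correct labels
    predictions = fullPredictions.select("prediction").rdd.map(lambda x: x[0])
    labels = fullPredictions.select("label").rdd.map(lambda x: x[0])

    # Print the predicted and actual values side by side
    for prediction, label in zip(predictions.collect(), labels.collect()):
        print(prediction, label)

    spark.stop()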
from pyspark.sql import functions as func

from utils import SPARK_DATA_PATH, get_spark_session

spark = get_spark_session('Word count')

# Read each line of my book into a dataframe
inputDF = spark.read.text(f"{SPARK_DATA_PATH}/Book")

# Split using a regular expression that extracts words
words = inputDF.select(
    # Explode out into a new row for every word in the book,
    # splitting at word boundaries and naming the new column "word"
    func.explode(func.split(inputDF.value, "\\W+")).alias("word")
)

# Drop empty strings (filter returns a new DataFrame, so reassign it)
words = words.filter(words.word != "")

# Normalize everything to lowercase
lowercaseWords = words.select(func.lower(words.word).alias("word"))

# Count up the occurrences of each word
wordCounts = lowercaseWords.groupBy("word").count()

# Sort by counts
wordCountsSorted = wordCounts.sort("count")

# Show the results.
wordCountsSorted.show(wordCountsSorted.count())
import codecs

from pyspark.ml.recommendation import ALS
from pyspark.sql.types import StructType, StructField, IntegerType, LongType

from utils import SPARK_DATA_PATH, get_spark_session


def loadMovieNames():
    movieNames = {}
    # CHANGE THIS TO THE PATH TO YOUR u.ITEM FILE:
    with codecs.open("/opt/bitnami/spark/spark-data/ml-100k/u.item", "r",
                     encoding='ISO-8859-1', errors='ignore') as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames


spark = get_spark_session('ALSExample')

moviesSchema = StructType([
    StructField("userID", IntegerType(), True),
    StructField("movieID", IntegerType(), True),
    StructField("rating", IntegerType(), True),
    StructField("timestamp", LongType(), True)])

names = loadMovieNames()

ratings = spark.read.option("sep", "\t").schema(moviesSchema) \
    .csv(f"{SPARK_DATA_PATH}/ml-100k/u.data")

print("Training recommendation model...")

als = ALS().setMaxIter(5).setRegParam(0.01).setUserCol("userID").setItemCol("movieID") \
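    .setRatingCol("rating")

# The script is cut off above; the .setRatingCol completion and everything below are a
# hedged sketch of the likely remaining steps -- fit the ALS model and print top-10
# recommendations for a single user. The user ID and the recommendForUserSubset flow
# are assumptions, not the original code.
model = als.fit(ratings)

userID = 0  # hypothetical user to recommend for
userSchema = StructType([StructField("userID", IntegerType(), True)])
users = spark.createDataFrame([[userID]], userSchema)

recommendations = model.recommendForUserSubset(users, 10).collect()

print(f"Top 10 recommendations for user ID {userID}:")
for userRecs in recommendations:
    for rec in userRecs.recommendations:
        print(f"{names[rec.movieID]}: {rec.rating}")

spark.stop()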