Пример #1
def main():
    # Runs one load: builds the monthly file name and partition value, gets a
    # Spark session, calls load_file_hdfs and transform_file, then writes the
    # results with utils.write_parquet, printing a timestamp before and after.
    # taxi_color, year, month, local_dir, hdfs_raw_path, hdfs_path and
    # overwrite_file are not defined in this function; presumably they are
    # module-level settings -- TODO confirm.
    print("Starting at " + str(datetime.datetime.now()))

    # File name looks like "<color>_tripdata_<year>-<MM>.csv"; the partition
    # value is the first day of that month ("<year>-<MM>-01"). The month is
    # zero-padded to two digits in both.
    file_name = '{}_tripdata_{}-{}.csv'.format(taxi_color, year, str(month).zfill(2))
    dt_partition = '{}-{}-01'.format(year, str(month).zfill(2))

    spark = utils.get_spark_session("ny_taxi")

    # Presumably copies the local file into hdfs_raw_path -- verify against
    # load_file_hdfs, which is defined elsewhere.
    load_file_hdfs(local_dir=local_dir, hdfs_path=hdfs_raw_path, file_name=file_name, overwrite=overwrite_file)

    # transform_file hands back two results: the transformed data and the
    # rows it considered corrupted.
    df_transform, df_corrupted = \
        transform_file(hdfs_raw_path, schema=define_schema(taxi_color),
                       partition=dt_partition, spark=spark, file_name=file_name)

    # NOTE(review): this call is never closed in the text as it stands; its
    # remaining arguments appear to be missing -- restore before running.
    utils.write_parquet(df=df_transform, hdfs_path=hdfs_path, file_name='{}_tripdata'.format(taxi_color),

    # The corrupted-rows output is only written when at least one row exists.
    # NOTE(review): the call below is likewise left unclosed.
    if df_corrupted.count() != 0:
        utils.write_parquet(df=df_corrupted, hdfs_path=hdfs_path, file_name='{}_corrupted'.format(taxi_color),

    print("Finished at " + str(datetime.datetime.now()))
Пример #2
from __future__ import print_function

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor

from utils import get_spark_session

if __name__ == "__main__":

    spark = get_spark_session('Real Estate Predictor')
    # Load up our data and convert it to the format MLLib expects.

    data = spark.read.option('header', 'true').option(

    assembler = VectorAssembler().setInputCols(
        ['HouseAge', 'DistanceToMRT',
    df = assembler.transform(data).select('PriceOfUnitArea', 'features')

    # Let's split our data into training data and testing data
    trainTest = df.randomSplit([0.5, 0.5])
    trainingDF = trainTest[0]
    testDF = trainTest[1]

    # Now create our decision tree regression model
    dtr = DecisionTreeRegressor().setFeaturesCol('features').setLabelCol(

    # Train the model using our training data
        func.when(func.col("denominator") != 0, func.col("numerator") / func.col("denominator")) \
          .otherwise(0) \
      ).select("movie1", "movie2", "score", "numPairs")

    return result

# Get movie name by given movie id
def getMovieName(movieNames, movieId):
    """Filter movieNames to the rows whose movieID equals movieId and return item 0.

    NOTE(review): the filter expression ends with a line continuation that is
    followed only by a blank line, so a chained call may have been lost at
    that point. As written, ``result`` is whatever ``movieNames.filter``
    returns and the function returns ``result[0]`` -- confirm against the
    original script before relying on the return value.
    """
    result = movieNames.filter(func.col("movieID") == movieId) \

    return result[0]

spark = get_spark_session('MovieSimilarities')

movieNamesSchema = StructType([ \
                               StructField("movieID", IntegerType(), True), \
                               StructField("movieTitle", StringType(), True) \

moviesSchema = StructType([ \
                     StructField("userID", IntegerType(), True), \
                     StructField("movieID", IntegerType(), True), \
                     StructField("rating", IntegerType(), True), \
                     StructField("timestamp", LongType(), True)])

# Create a broadcast dataset of movieID and movieTitle.
# Apply ISO-8859-1 charset
movieNames = spark.read \
Пример #4
from pyspark.sql import Row

from utils import SPARK_DATA_PATH, get_spark_session

# Create a SparkSession
spark = get_spark_session('Spark SQL')

def mapper(line):
    """Parse one comma-separated record into a Row.

    Args:
        line: Text record whose fields 0-3 are ID, name, age and number of
            friends, in that order.

    Returns:
        Row with ``ID``, ``age`` and ``numFriends`` as int and ``name`` as str.

    Raises:
        IndexError: If the record has fewer than four fields.
        ValueError: If ID, age or numFriends is not an integer.
    """
    fields = line.split(',')
    # The name is already text. Wrapping it in str(<bytes>) on Python 3 would
    # store the bytes repr (e.g. "b'Will'") instead of the name itself.
    return Row(ID=int(fields[0]), name=fields[1],
               age=int(fields[2]), numFriends=int(fields[3]))

# Read the raw text records and turn each one into a Row via mapper().
lines = spark.sparkContext.textFile(f"{SPARK_DATA_PATH}/fakefriends.csv")
people = lines.map(mapper)

# Infer the schema, and register the DataFrame as a table.
schemaPeople = spark.createDataFrame(people).cache()
# The query below reads from a view called "people"; without this
# registration spark.sql() cannot resolve the table name.
schemaPeople.createOrReplaceTempView("people")

# SQL can be run over DataFrames that have been registered as a table.
teenagers = spark.sql("SELECT * FROM people WHERE age >= 13 AND age <= 19")

# The results of SQL queries are DataFrames; collect() returns their rows as a list.
for teen in teenagers.collect():

# We can also use functions instead of SQL queries:
Пример #5
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

from utils import SPARK_DATA_PATH, get_spark_session

spark = get_spark_session('MostPopularSuperhero')

schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True)]

names = spark.read.schema(schema).option("sep", " ").csv(f"{SPARK_DATA_PATH}/Marvel-names.txt")

lines = spark.read.text(f"{SPARK_DATA_PATH}/Marvel-graph.txt")

# Small tweak vs. what's shown in the video: we trim each line of whitespace as that could
# throw off the counts.
connections = lines.withColumn(
    "id", func.split(
        func.trim(func.col("value")), " "
            ), " "
    ) - 1
Пример #6

def loadMovieNames(path=None):
    """Build a movie-ID -> title lookup from a '|'-separated item file.

    Args:
        path: File to read. When omitted, the location this function always
            used is read: ``f"{SPARK_DATA_PATH}/ml-100k/u.ITEM"``.
            NOTE(review): other code in this project reads ``u.item``
            (lower case); on a case-sensitive filesystem confirm which
            spelling exists.

    Returns:
        dict mapping the integer in field 0 of each line to the text in
        field 1.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If field 0 of a line is not an integer.
    """
    if path is None:
        path = f"{SPARK_DATA_PATH}/ml-100k/u.ITEM"

    movieNames = {}
    # codecs.open() silently ignores ``errors`` when no encoding is given and
    # decodes with the platform default, which fails on non-ASCII titles under
    # a UTF-8 locale. Open with an explicit charset instead.
    # NOTE(review): ISO-8859-1 is assumed for the item file -- confirm.
    with open(path, encoding='ISO-8859-1', errors='ignore') as f:
        for line in f:
            fields = line.split('|')
            # Skip blank or malformed lines that carry no title field.
            if len(fields) < 2:
                continue
            movieNames[int(fields[0])] = fields[1]
    return movieNames

spark = get_spark_session('Popular movies nice')

nameDict = spark.sparkContext.broadcast(loadMovieNames())

# Create schema when reading u.data
schema = StructType([
    StructField("userID", IntegerType(), True),
    StructField("movieID", IntegerType(), True),
    StructField("rating", IntegerType(), True),
    StructField("timestamp", LongType(), True)

# Load up movie data as dataframe
moviesDF = spark.read.option(
    "sep", "\t").schema(schema).csv(f"{SPARK_DATA_PATH}/ml-100k/u.data")
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, LongType

from utils import SPARK_DATA_PATH, get_spark_session

spark = get_spark_session('pop moveis')

# Create schema when reading u.data
schema = StructType([
    StructField("userID", IntegerType(), True),
    StructField("movieID", IntegerType(), True),
    StructField("rating", IntegerType(), True),
    StructField("timestamp", LongType(), True)

# Load up movie data as dataframe
moviesDF = spark.read.option("sep", "\t").schema(schema).csv(

# Some SQL-style magic to sort all movies by popularity in one line!
topMovieIDs = moviesDF.groupBy("movieID").count().orderBy(func.desc("count"))

# Grab the top 10

# Stop the session
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

from utils import SPARK_DATA_PATH, get_spark_session

spark = get_spark_session('MinTemperatures')

schema = StructType([
    StructField("stationID", StringType(), True),
    StructField("date", IntegerType(), True),
    StructField("measure_type", StringType(), True),
    StructField("temperature", FloatType(), True)]

# Read the file as a DataFrame
df = spark.read.schema(schema).csv(f"{SPARK_DATA_PATH}/1800.csv")

# Filter out all but TMIN entries
minTemps = df.filter(df.measure_type == "TMIN")

# Select only stationID and temperature
stationTemps = minTemps.select("stationID", "temperature")

# Aggregate to find minimum temperature for every station
minTempsByStation = stationTemps.groupBy("stationID").min("temperature")

# Convert temperature to fahrenheit and sort the dataset
minTempsByStationF = minTempsByStation.withColumn(
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 18 09:15:05 2019

@author: Frank
"""

from pyspark.sql.functions import regexp_extract
from utils import SPARK_DATA_PATH, get_spark_session

# Create a SparkSession (the config bit is only for Windows!)
spark = get_spark_session('Structured Streaming')

# Monitor the logs directory for new log data, and read in the raw lines as accessLines
accessLines = spark.readStream.text(f"{SPARK_DATA_PATH}/logs")

# Parse out the common log format to a DataFrame
contentSizeExp = r'\s(\d+)$'
statusExp = r'\s(\d{3})\s'
generalExp = r'\"(\S+)\s(\S+)\s*(\S*)\"'
timeExp = r'\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} -\d{4})]'
hostExp = r'(^\S+\.[\S+\.]+\S+)\s'

logsDF = accessLines.select(
    regexp_extract('value', hostExp, 1).alias('host'),
    regexp_extract('value', timeExp, 1).alias('timestamp'),
    regexp_extract('value', generalExp, 1).alias('method'),
    regexp_extract('value', generalExp, 2).alias('endpoint'),
    regexp_extract('value', generalExp, 3).alias('protocol'),
    regexp_extract('value', statusExp, 1).cast('integer').alias('status'),
    regexp_extract('value', contentSizeExp,
# -*- coding: utf-8 -*-
from pyspark.sql.functions import col, current_timestamp, regexp_extract, window
from utils import SPARK_DATA_PATH, get_spark_session

# Create a SparkSession (the config bit is only for Windows!)
spark = get_spark_session('Windowed Structured Streaming')

# Monitor the logs directory for new log data, and read in the raw lines as accessLines
accessLines = spark.readStream.text(f"{SPARK_DATA_PATH}/logs")

# Parse out the common log format to a DataFrame
contentSizeExp = r'\s(\d+)$'
statusExp = r'\s(\d{3})\s'
generalExp = r'\"(\S+)\s(\S+)\s*(\S*)\"'
timeExp = r'\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} -\d{4})]'
hostExp = r'(^\S+\.[\S+\.]+\S+)\s'

logsDF = accessLines.select(
    regexp_extract('value', hostExp, 1).alias('host'),
    regexp_extract('value', timeExp, 1).alias('timestamp'),
    regexp_extract('value', generalExp, 1).alias('method'),
    regexp_extract('value', generalExp, 2).alias('endpoint'),
    regexp_extract('value', generalExp, 3).alias('protocol'),
    regexp_extract('value', statusExp, 1).cast('integer').alias('status'),
    regexp_extract('value', contentSizeExp,
logsDF = logsDF.withColumn('eventTime', current_timestamp())

# Keep a running count of every access by status code
endpointCounts = logsDF.groupBy(
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

from utils import SPARK_DATA_PATH, get_spark_session

spark = get_spark_session('MostObscureSuperhero')

schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True)

names = spark.read.schema(schema).option(
    "sep", " ").csv(f"{SPARK_DATA_PATH}/Marvel-names.txt")

lines = spark.read.text(f"{SPARK_DATA_PATH}/Marvel-graph.txt")

# Small tweak vs. what's shown in the video: we trim each line of whitespace as that could
# throw off the counts.
connections = lines.withColumn(
    func.split(func.trim(func.col("value")), " ")[0]).withColumn(
        func.size(func.split(func.trim(func.col("value")), " ")) -

# Show the minimum number of connections
minConnections = connections.agg(
    f'Minimum number of connections is {minConnections.first().min_connections}'
from pyspark.sql import types, functions

from utils import SPARK_DATA_PATH, get_spark_session

spark = get_spark_session('Customer spend dataframe')

schema = types.StructType([
    types.StructField('customer_id', types.IntegerType(), True),
    types.StructField('order_id', types.IntegerType(), True),
    types.StructField('order_price', types.FloatType(), True),
df = spark.read.schema(schema).csv(f"{SPARK_DATA_PATH}/customer-orders.csv")


# Remove unused column
df = df.select('customer_id', 'order_price')

df = df.groupBy('customer_id').sum('order_price')

df = df.withColumn('total', functions.round('sum(order_price)',

from __future__ import print_function

from pyspark.ml.regression import LinearRegression

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

from utils import get_spark_context, SPARK_DATA_PATH, get_spark_session

if __name__ == "__main__":

    # Create a SparkSession (Note, the config section is only for Windows!)
    spark = get_spark_session('LinearRegression')

    # Load up our data and convert it to the format MLLib expects.
    inputLines = spark.sparkContext.textFile(
    data = inputLines.map(lambda x: x.split(",")).map(
        lambda x: (float(x[0]), Vectors.dense(float(x[1]))))

    # Convert this RDD to a DataFrame
    colNames = ["label", "features"]
    df = data.toDF(colNames)

    # Note, there are lots of cases where you can avoid going from an RDD to a DataFrame.
    # Perhaps you're importing data from a real database. Or you are using structured streaming
    # to get your data.

    # Let's split our data into training data and testing data
    trainTest = df.randomSplit([0.5, 0.5])
    trainingDF = trainTest[0]
from pyspark.sql import functions as func
from utils import SPARK_DATA_PATH, get_spark_session

spark = get_spark_session('Word count')

# Read each line of my book into a dataframe
inputDF = spark.read.text(f"{SPARK_DATA_PATH}/Book")

# Split using a regular expression that extracts words
words = inputDF.select(
    # Explode out into a new row for every word in the book
        # Split at word boundaries
                   "\\W+")).alias("word")  # Name the new column "word"
# Drop the empty strings produced by the split. DataFrame.filter returns a
# new DataFrame rather than modifying `words`, so the result has to be kept
# and used below; otherwise "" would still be counted as a word.
nonEmptyWords = words.filter(words.word != "")

# Normalize everything to lowercase
lowercaseWords = nonEmptyWords.select(func.lower(nonEmptyWords.word).alias("word"))

# Count up the occurrences of each word
wordCounts = lowercaseWords.groupBy("word").count()

# Sort by counts
wordCountsSorted = wordCounts.sort("count")

# Show the results.

def loadMovieNames(path="/opt/bitnami/spark/spark-data/ml-100k/u.item"):
    """Build a movie-ID -> title lookup from a '|'-separated item file.

    Args:
        path: File to read. Defaults to the location this function always
            used.

    Returns:
        dict mapping the integer in field 0 of each line to the text in
        field 1.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If field 0 of a line is not an integer.
    """
    movieNames = {}
    # codecs.open() silently ignores ``errors`` when no encoding is given and
    # decodes with the platform default, which fails on non-ASCII titles under
    # a UTF-8 locale. Open with an explicit charset instead.
    # NOTE(review): ISO-8859-1 is assumed for the item file -- confirm.
    with open(path, encoding='ISO-8859-1', errors='ignore') as f:
        for line in f:
            fields = line.split('|')
            # Skip blank or malformed lines that carry no title field.
            if len(fields) < 2:
                continue
            movieNames[int(fields[0])] = fields[1]
    return movieNames

spark = get_spark_session('ALSExample')

moviesSchema = StructType([ \
                     StructField("userID", IntegerType(), True), \
                     StructField("movieID", IntegerType(), True), \
                     StructField("rating", IntegerType(), True), \
                     StructField("timestamp", LongType(), True)])

names = loadMovieNames()

ratings = spark.read.option("sep", "\t").schema(moviesSchema) \

print("Training recommendation model...")

als = ALS().setMaxIter(5).setRegParam(0.01).setUserCol("userID").setItemCol("movieID") \