from pyspark.sql import SparkSession


def read_csv(spark: SparkSession, path: str, schema=None):
    """Load CSV files from the source directory into a dataframe."""
    df = spark.read.csv(path, header=True, schema=schema)
    if not schema:
        return df
    # Recreate the dataframe with the supplied schema; verifySchema ensures
    # non-nullable columns do not contain null values.
    return spark.createDataFrame(df.rdd, schema, verifySchema=True)
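# A minimal usage sketch (not from the original): assumes an active
# SparkSession named `spark` and a hypothetical ./data/events CSV directory.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

events_schema = StructType([
    StructField("event_id", StringType(), False),  # non-nullable
    StructField("quantity", IntegerType(), True),
])
# verifySchema inside read_csv raises if event_id ever comes back null
events = read_csv(spark, "./data/events", schema=events_schema)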
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType


def read_parquet(spark: SparkSession, path: str, schema: StructType = None, merge_schema: bool = True):
    """Read a directory of parquet files into a dataframe."""
    # Initially all the columns will be nullable.
    df = spark.read.option("mergeSchema", str(merge_schema).lower()).parquet(path)
    if not schema:
        # Return the dataframe without validating the schema.
        return df
    # Recreate the dataframe with the supplied schema; verifySchema ensures
    # non-nullable columns do not contain null values.
    return spark.createDataFrame(df.rdd, schema, verifySchema=True)
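# Illustrative sketch of why merge_schema matters (the paths, columns, and
# partition layout are assumptions, not from the original): two writes with
# different columns land under one table directory, and mergeSchema unions them.
squares = spark.createDataFrame([(i, i ** 2) for i in range(1, 6)], ["value", "square"])
squares.write.parquet("data/test_table/key=1")
cubes = spark.createDataFrame([(i, i ** 3) for i in range(6, 11)], ["value", "cube"])
cubes.write.parquet("data/test_table/key=2")
merged = read_parquet(spark, "data/test_table", merge_schema=True)
merged.printSchema()  # value, square, cube, plus the partition column `key`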
import pyspark
from pyspark.sql.types import StructType, StructField, StringType


def get_channel_mapping(spark: pyspark.sql.SparkSession) -> pyspark.sql.DataFrame:
    """
    Creates the channel mapping dataframe from the hard-coded values.

    Parameters
    ----------
    spark : pyspark.sql.SparkSession
        Spark session used to initialize variables and get data from Hive

    Returns
    -------
    pyspark.sql.DataFrame
        PySpark dataframe with channel mapping data
    """
    channel_mapping = spark.createDataFrame(
        [
            ("01", "Distribution Channel 01"),
            ("10", "Other"),
            ("11", "DSD Bis Intercompany"),
            ("12", "DSD Pizza Intercomp"),
            ("20", "Warehouse/Exports"),
            ("30", "Foodservice"),
            ("40", "DSD Pizza"),
            ("45", "DSD"),
            ("50", "KFI"),
            ("55", "Plant Ingredient"),
            ("60", "Imports"),
            ("65", "Bulk FS - Specialty"),
        ],
        StructType([
            StructField("bic_zdistr_ch", StringType(), True),
            StructField("channel_desc", StringType(), True),
        ]),
    )
    return channel_mapping
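# Hypothetical usage (sales_df and its bic_zdistr_ch column are assumptions,
# not from the original): attach the readable channel description to a fact table.
channel_mapping = get_channel_mapping(spark)
sales_with_channels = sales_df.join(channel_mapping, on="bic_zdistr_ch", how="left")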
# Stub for the API
from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import PipelineModel
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

df1 = spark.createDataFrame([
    (1353, 1347),
], ['user', 'summa'])
va = VectorAssembler(inputCols=['user', 'summa'], outputCol="features")

modelka = KMeansModel.load('./models/clusters.model')
result = modelka.transform(va.transform(df1)).select('prediction').take(1)[0][0]

# Check whether the user is new:
# if it's an existing user, pull their data from Mongo;
# otherwise, spin up a mini-instance.
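# A sketch of how './models/clusters.model' might have been produced
# (k and the training rows are assumptions, not from the original):
train_df = va.transform(spark.createDataFrame(
    [(100, 5000), (200, 150), (1353, 1347)], ['user', 'summa']))
kmeans = KMeans(k=2, featuresCol="features", predictionCol="prediction")
kmeans.fit(train_df).save('./models/clusters.model')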
"GDELT dataset found here: https://aws.amazon.com/public-datasets/gdelt/ # Column headers found here: http://gdeltproject.org/data/lookups/CSV.header.dailyupdates.txt # Load RDD from pyspark.sql import SparkSession from pyspark import SparkContext ,SparkConf #spark = SparkSession.builder.appName("gdelt").getOrCreate() conf = SparkConf().setAppName("gdelt")#.setMaster(master) sc = SparkContext(conf=conf) lines = sc.textFile("s3a://gdelt-open-data/events/2018*") # Loads 73,385,698 records from 2016 # Split lines into columns; change split() argument depending on deliminiter e.g. '\t' parts = lines.map(lambda l: l.split('\t')) # Convert RDD into DataFrame from urllib import urlopen html = urlopen("http://gdeltproject.org/data/lookups/CSV.header.dailyupdates.txt").read().rstrip() columns = html.split('\t') df = sc.createDataFrame(parts, columns) df.printSchema sc.stop()
from datetime import datetime as dt
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("appName").master("local").getOrCreate()
df = spark.createDataFrame([
    {'a': 10, 'date': dt(2017, 1, 1)},
    {'a': 23, 'date': dt(2018, 1, 1, 1)},
])
df.show()  # show() prints the rows itself and returns None
from pyspark.ml.feature import Word2Vec  # the DataFrame API, not pyspark.mllib
from pyspark.sql import SparkSession
import nltk
import os

stop_words = nltk.corpus.stopwords.words('english')
stop_words += ['?', '.', '!', ',']

spark = SparkSession.builder.master("local").appName("Word2Vec") \
    .config("spark.app.id", "Word2Vec").getOrCreate()

# One tokenized tweet per row; Word2Vec expects an array-of-strings column.
tweets = open(os.getcwd() + "/Trump.txt").read().splitlines()
documentDF = spark.createDataFrame(
    [(tweet.split(" "),) for tweet in tweets], ["text"])

word2vec = Word2Vec(inputCol="text", outputCol="result")
model = word2vec.fit(documentDF)
result = model.transform(documentDF)
for row in result.select("text", "result").collect():
    text, vector = row
    print("Text: [%s] =>\nVector: %s\n" % (", ".join(text), str(vector)))
agents
agents.count()

# Filter French agents
fr_agents = agents.filter(agents.country_name == "France")
fr_agents
fr_agents.count()
agent = fr_agents.first()
agent
print(agent.country_name, agent.id)

# Chain several operations
agents.filter(agents.country_name == "France").filter(agents.latitude < 0).count()
agents.filter((agents.country_name == "France") & (agents.latitude < 0)).count()
agents.limit(5).show()

# Create a view (Spark SQL)
agents.createTempView("agents_table")  # or createOrReplaceTempView()
spark.sql("SELECT * FROM agents_table ORDER BY id DESC LIMIT 10").show()

# Cache the dataframe in memory and convert it to an RDD
agents.persist()
agents.rdd.filter(lambda row: row.country_name == "France").count()

# Convert an RDD (containing Rows) into a dataframe
from pyspark.sql import Row
rdd = sc.parallelize([Row(name="Alice"), Row(name="Bob")])
spark.createDataFrame(rdd)
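# The snippet above assumes an `agents` dataframe already loaded; a plausible
# setup (the file name and columns are assumptions, not from the original):
agents = spark.read.csv("agents.csv", header=True, inferSchema=True)
# expected columns include: id, country_name, latitude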
departmentWithEmployees1 = Row(department=department1, employees=[employee1, employee2])
departmentWithEmployees2 = Row(department=department2, employees=[employee3, employee4])
departmentWithEmployees3 = Row(department=department3, employees=[employee1, employee4])
departmentWithEmployees4 = Row(department=department4, employees=[employee2, employee3])

print(department1)
print(departmentWithEmployees1.employees[0].email)

departmentsWithEmployeesSeq1 = [departmentWithEmployees1, departmentWithEmployees2]
df1 = spark.createDataFrame(departmentsWithEmployeesSeq1)

departmentsWithEmployeesSeq2 = [departmentWithEmployees3, departmentWithEmployees4]
df2 = spark.createDataFrame(departmentsWithEmployeesSeq2)

unionDF = df1.union(df2)  # unionAll() is the deprecated name for union()

# dbutils.fs.rm("/tmp/databricks-df-example.parquet", True)
unionDF.write.parquet("databricks-df-example.parquet")

# The nested employees array must be exploded before its fields can be selected as `e`.
from pyspark.sql.functions import explode
explodeDF = unionDF.select(explode("employees").alias("e")) \
    .selectExpr("e.firstName", "e.lastName", "e.email", "e.salary")
explodeDF.show()
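# The Row objects referenced above are assumed to be defined earlier; a
# minimal sketch of what they might look like (all values are made up):
from pyspark.sql import Row

Employee = Row("firstName", "lastName", "email", "salary")
employee1 = Employee("michael", "armbrust", "no-reply@berkeley.edu", 100000)
employee2 = Employee("xiangrui", "meng", "no-reply@stanford.edu", 120000)
employee3 = Employee("matei", None, "no-reply@waterloo.edu", 140000)
employee4 = Employee(None, "wendell", "no-reply@princeton.edu", 160000)
department1 = Row(id="123456", name="Computer Science")
department2 = Row(id="789012", name="Mechanical Engineering")
department3 = Row(id="345678", name="Theater and Drama")
department4 = Row(id="901234", name="Indoor Recreation")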
# Spark
from pyspark.sql import SparkSession
# Spark Streaming
from pyspark.streaming import StreamingContext
# Kafka (Spark 2.x; this module was removed in Spark 3)
from pyspark.streaming.kafka import KafkaUtils
# json parsing
import json
from pyspark.sql.types import StringType

spark = SparkSession.builder.appName("spark1").getOrCreate()
spark.sparkContext.setLogLevel("WARN")

# Two equivalent ways to build a single string column named "age":
df = spark.createDataFrame(["10", "11", "13"], "string").toDF("age")
df2 = spark.createDataFrame(["10", "11", "13"], StringType()).toDF("age")
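# Quick check that both forms yield the same schema (standard printSchema() output):
df.printSchema()
# root
#  |-- age: string (nullable = true)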
rolledUpDF.where("Country IS NULL").show()
rolledUpDF.where("Date IS NULL").show()

# Cubes
from pyspark.sql.functions import col, sum as _sum
dfNoNull.cube("Date", "Country").agg(_sum(col("Quantity"))) \
    .select("Date", "Country", "sum(Quantity)").orderBy("Date").show()

###################
## RDDs and dataframes
a = spark.range(10).rdd
b = spark.range(10).toDF("id").rdd.map(lambda row: row[0])
a.take(4)
# createDataFrame lives on the SparkSession, not on SparkContext
sqldf = spark.createDataFrame(ratings, "string")

## Using Spark dataframes
df = spark.read.csv('KCLT.csv', header=True)
df1 = spark.read.load('KCLT.csv', format='com.databricks.spark.csv',
                      header='true', inferSchema='true')
type(df)
df.describe().show()
df.dtypes
df.count()

# Change the column's data type (the name stays the same)
df = df.withColumn('date', df.date.cast('timestamp'))

## Using Spark RDDs
from pyspark import SparkContext
sc = SparkContext('local', 'example')
rdd = sc.textFile('KCLT.csv').map(lambda line: line.split(",")[1])
ratings = sc.textFile('../Python_Projects/ml-100k/u.data').map(lambda line: line.split()[2])