# Imports required by this class. The pixiedust-internal paths follow the
# package layout; Downloader is defined elsewhere in the same module.
from six import iteritems
from IPython.display import display, HTML
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType
from pixiedust.utils.shellAccess import ShellAccess
from pixiedust.utils.template import PixiedustTemplateEnvironment


class SampleData(object):
    env = PixiedustTemplateEnvironment()

    def __init__(self, dataDefs):
        self.dataDefs = dataDefs

    def sampleData(self, dataId=None):
        if dataId is None:
            self.printSampleDataList()
        elif str(dataId) in self.dataDefs:
            return self.loadSparkDataFrameFromSampleData(self.dataDefs[str(dataId)])
        elif "https://" in str(dataId) or "http://" in str(dataId) or "file://" in str(dataId):
            return self.loadSparkDataFrameFromUrl(str(dataId))
        else:
            print("Unknown sample data identifier. Please choose an id from the list below")
            self.printSampleDataList()

    def printSampleDataList(self):
        display(HTML(
            self.env.getTemplate("sampleData.html").render(
                dataDefs=iteritems(self.dataDefs))))
        #for key, val in iteritems(self.dataDefs):
        #    print("{0}: {1}".format(key, val["displayName"]))

    def dataLoader(self, path, schema=None):
        #TODO: if in Spark 2.0 or higher, use new API to load CSV
        load = ShellAccess["sqlContext"].read.format('com.databricks.spark.csv')
        if schema is not None:
            # Map the short type names used in sample data definitions to Spark SQL types
            def getType(t):
                if t == 'int':
                    return IntegerType()
                elif t == 'double':
                    return DoubleType()
                else:
                    return StringType()

            return load.options(header='true', mode="DROPMALFORMED").load(
                path,
                schema=StructType([
                    StructField(item[0], getType(item[1]), True) for item in schema
                ]))
        else:
            return load.options(header='true', mode="DROPMALFORMED", inferschema='true').load(path)

    def loadSparkDataFrameFromSampleData(self, dataDef):
        return Downloader(dataDef).download(self.dataLoader)

    def loadSparkDataFrameFromUrl(self, dataUrl):
        i = dataUrl.rfind('/')
        dataName = dataUrl[(i + 1):]
        dataDef = {"displayName": dataUrl, "url": dataUrl}
        return Downloader(dataDef).download(self.dataLoader)
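# Usage sketch (not part of the module): how SampleData is driven from a
# notebook. The dict entry and URLs below are hypothetical illustrations;
# real ids come from pixiedust's registered sample data definitions.
# sampleDefs = {"1": {"displayName": "Sample cars data",
#                     "url": "https://example.com/cars.csv"}}
# sd = SampleData(sampleDefs)
# sd.sampleData()                 # no id: render the HTML list of samples
# df = sd.sampleData(1)           # registered id: download and load via Spark
# df = sd.sampleData("https://example.com/other.csv")   # ad hoc URL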
# Imports required by this version (pixiedust-internal paths assumed from the
# package layout; Downloader is defined elsewhere in the same module).
import json
import pandas as pd
from pandas.io.json import json_normalize
from six import iteritems
from IPython.display import display, HTML
from pixiedust.utils.shellAccess import ShellAccess
from pixiedust.utils.template import PixiedustTemplateEnvironment
from pixiedust.utils.environment import Environment


class SampleData(object):
    env = PixiedustTemplateEnvironment()

    def __init__(self, dataDefs):
        self.dataDefs = dataDefs
        self.url = ""

    def sampleData(self, dataId=None, type='csv'):
        if dataId is None:
            self.printSampleDataList()
        elif str(dataId) in self.dataDefs:
            return self.loadSparkDataFrameFromSampleData(self.dataDefs[str(dataId)])
        elif "https://" in str(dataId) or "http://" in str(dataId) or "file://" in str(dataId):
            if type == 'json':
                self.url = str(dataId)
                return self.JSONloadSparkDataFrameFromUrl(str(dataId))
            else:
                return self.loadSparkDataFrameFromUrl(str(dataId))
        else:
            print("Unknown sample data identifier. Please choose an id from the list below")
            self.printSampleDataList()

    def printSampleDataList(self):
        display(HTML(
            self.env.getTemplate("sampleData.html").render(
                dataDefs=iteritems(self.dataDefs))))

    def dataLoader(self, path, schema=None):
        if schema is not None and Environment.hasSpark:
            from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType

            # Map the short type names used in sample data definitions to Spark SQL types
            def getType(t):
                if t == 'int':
                    return IntegerType()
                elif t == 'double':
                    return DoubleType()
                else:
                    return StringType()

        if Environment.sparkVersion == 1:
            print("Loading file using 'com.databricks.spark.csv'")
            load = ShellAccess.sqlContext.read.format('com.databricks.spark.csv')
            if schema is not None:
                return load.options(header='true', mode="DROPMALFORMED").load(
                    path,
                    schema=StructType([
                        StructField(item[0], getType(item[1]), True) for item in schema
                    ]))
            else:
                return load.options(header='true', mode="DROPMALFORMED", inferschema='true').load(path)
        elif Environment.sparkVersion == 2:
            print("Loading file using 'SparkSession'")
            csvload = ShellAccess.SparkSession.builder.getOrCreate() \
                .read \
                .format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat") \
                .option("header", "true") \
                .option("mode", "DROPMALFORMED")
            if schema is not None:
                return csvload.schema(StructType([
                    StructField(item[0], getType(item[1]), True) for item in schema
                ])).load(path)
            else:
                return csvload.option("inferSchema", "true").load(path)
        else:
            print("Loading file using 'pandas'")
            return pd.read_csv(path)

    def JSONdataLoader(self, path, schema=None):
        if schema is not None and Environment.hasSpark:
            from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType

            # Mirrors the schema mapping in dataLoader; the JSON readers below
            # infer types, so this mapping is currently unused.
            def getType(t):
                if t == 'int':
                    return IntegerType()
                elif t == 'double':
                    return DoubleType()
                else:
                    return StringType()

        with open(path, 'r') as f:
            res = f.read()
        if Environment.sparkVersion == 1:
            print("Loading file using a pyspark dataframe for spark 1")
            dataRDD = ShellAccess.sc.parallelize([res])
            return ShellAccess.sqlContext.jsonRDD(dataRDD)
        elif Environment.sparkVersion == 2:
            print("Loading file using a pyspark dataframe for spark 2")
            dataRDD = ShellAccess.sc.parallelize([res])
            return ShellAccess.spark.read.json(dataRDD)
        else:
            print("Loading file using 'pandas'")
            data = json.loads(res)
            df = json_normalize(data)
            return df

    def loadSparkDataFrameFromSampleData(self, dataDef):
        return Downloader(dataDef).download(self.dataLoader)

    def loadSparkDataFrameFromUrl(self, dataUrl):
        i = dataUrl.rfind('/')
        dataName = dataUrl[(i + 1):]
        dataDef = {"displayName": dataUrl, "url": dataUrl}
        return Downloader(dataDef).download(self.dataLoader)

    def JSONloadSparkDataFrameFromUrl(self, dataUrl):
        i = dataUrl.rfind('/')
        dataName = dataUrl[(i + 1):]
        dataDef = {"displayName": dataUrl, "url": dataUrl}
        return Downloader(dataDef).download(self.JSONdataLoader)
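# Usage sketch for the JSON path (not part of the module; URL hypothetical):
# type='json' routes the download through JSONdataLoader instead of the CSV
# dataLoader, and both loaders fall back to pandas when Spark is unavailable.
# sd = SampleData({})
# df = sd.sampleData("https://example.com/records.json", type='json')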
from IPython.core.getipython import get_ipython
from IPython.display import display, HTML, Javascript
from pixiedust.utils.shellAccess import ShellAccess
# Import added for _env below; the path follows the pixiedust package layout.
from pixiedust.utils.template import PixiedustTemplateEnvironment
from functools import reduce
from collections import OrderedDict
from threading import Thread, Lock, Event
import uuid
import json
import sys
import time
import traceback
import pixiedust

myLogger = pixiedust.getLogger(__name__)

_env = PixiedustTemplateEnvironment()

progressMonitor = None
loadingProgressMonitor = False


def enableSparkJobProgressMonitor():
    # Lazily create the monitor at most once; the loading flag guards against
    # repeated calls while the background thread is still starting up.
    global progressMonitor, loadingProgressMonitor
    if progressMonitor is None and not loadingProgressMonitor:
        loadingProgressMonitor = True

        def startSparkJobProgressMonitor():
            global progressMonitor
            progressMonitor = SparkJobProgressMonitor()

        # Daemon thread so the monitor never blocks kernel shutdown
        t = Thread(target=startSparkJobProgressMonitor)
        t.daemon = True
        t.start()


class SparkJobProgressMonitorOutput(Thread):
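    # Usage sketch for enableSparkJobProgressMonitor above (assumption, not
    # part of the module): from a notebook cell,
    #   import pixiedust
    #   enableSparkJobProgressMonitor()
    # starts the monitor once; further calls are no-ops while it is loading.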