Example #1
# Imports assumed from the surrounding PixieDust module (exact paths may vary
# by version); Downloader is defined elsewhere in the same module.
from six import iteritems
from IPython.display import display, HTML
from pixiedust.utils.shellAccess import ShellAccess
from pixiedust.utils.template import PixiedustTemplateEnvironment
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType


class SampleData(object):
    env = PixiedustTemplateEnvironment()

    def __init__(self, dataDefs):
        self.dataDefs = dataDefs

    def sampleData(self, dataId=None):
        if dataId is None:
            self.printSampleDataList()
        elif str(dataId) in self.dataDefs:
            return self.loadSparkDataFrameFromSampleData(
                self.dataDefs[str(dataId)])
        elif "https://" in str(dataId) or "http://" in str(
                dataId) or "file://" in str(dataId):
            return self.loadSparkDataFrameFromUrl(str(dataId))
        else:
            print(
                "Unknown sample data identifier. Please choose an id from the list below"
            )
            self.printSampleDataList()

    def printSampleDataList(self):
        display(
            HTML(
                self.env.getTemplate("sampleData.html").render(
                    dataDefs=iteritems(self.dataDefs))))
        #for key, val in iteritems(self.dataDefs):
        #    print("{0}: {1}".format(key, val["displayName"]))

    def dataLoader(self, path, schema=None):
        #TODO: if in Spark 2.0 or higher, use new API to load CSV
        load = ShellAccess["sqlContext"].read.format(
            'com.databricks.spark.csv')
        if schema is not None:

            def getType(t):
                if t == 'int':
                    return IntegerType()
                elif t == 'double':
                    return DoubleType()
                else:
                    return StringType()

            return load.options(header='true', mode="DROPMALFORMED").load(
                path,
                schema=StructType([
                    StructField(item[0], getType(item[1]), True)
                    for item in schema
                ]))
        else:
            return load.options(header='true',
                                mode="DROPMALFORMED",
                                inferschema='true').load(path)
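
    # The schema argument above is a sequence of (columnName, typeName) pairs,
    # e.g. [("name", "string"), ("age", "int")]; type names other than 'int'
    # and 'double' fall back to StringType.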

    def loadSparkDataFrameFromSampleData(self, dataDef):
        return Downloader(dataDef).download(self.dataLoader)

    def loadSparkDataFrameFromUrl(self, dataUrl):
        i = dataUrl.rfind('/')
        dataName = dataUrl[(i + 1):]
        dataDef = {"displayName": dataUrl, "url": dataUrl}
        return Downloader(dataDef).download(self.dataLoader)
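
A minimal usage sketch, assuming the class above is importable in a PixieDust notebook with a Spark context; the definitions dict, ids, and URLs are illustrative placeholders:

# Hypothetical usage (names and URLs are placeholders, not real datasets).
myDataDefs = {
    "1": {"displayName": "Sample CSV", "url": "https://example.com/sample.csv"},
}
samples = SampleData(myDataDefs)
samples.sampleData()          # no id: renders the HTML list of known datasets
df = samples.sampleData("1")  # known id: downloads, returns a Spark DataFrame
df2 = samples.sampleData("https://example.com/other.csv")  # direct URL load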
Example #2
# Imports assumed from the surrounding PixieDust module (exact paths may vary
# by version); Downloader is defined elsewhere in the same module.
import json
import pandas as pd
from pandas.io.json import json_normalize
from six import iteritems
from IPython.display import display, HTML
from pixiedust.utils.environment import Environment
from pixiedust.utils.shellAccess import ShellAccess
from pixiedust.utils.template import PixiedustTemplateEnvironment


class SampleData(object):
    env = PixiedustTemplateEnvironment()

    def __init__(self, dataDefs):
        self.dataDefs = dataDefs
        self.url = ""

    def sampleData(self, dataId=None, type='csv'):
        if dataId is None:
            self.printSampleDataList()
        elif str(dataId) in self.dataDefs:
            return self.loadSparkDataFrameFromSampleData(
                self.dataDefs[str(dataId)])
        elif "https://" in str(dataId) or "http://" in str(
                dataId) or "file://" in str(dataId):
            if type == 'json':
                self.url = str(dataId)
                return self.JSONloadSparkDataFrameFromUrl(str(dataId))
            else:
                return self.loadSparkDataFrameFromUrl(str(dataId))
        else:
            print(
                "Unknown sample data identifier. Please choose an id from the list below"
            )
            self.printSampleDataList()

    def printSampleDataList(self):
        display(
            HTML(
                self.env.getTemplate("sampleData.html").render(
                    dataDefs=iteritems(self.dataDefs))))

    def dataLoader(self, path, schema=None):
        if schema is not None and Environment.hasSpark:
            from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType

            def getType(t):
                if t == 'int':
                    return IntegerType()
                elif t == 'double':
                    return DoubleType()
                else:
                    return StringType()

        if Environment.sparkVersion == 1:
            print("Loading file using 'com.databricks.spark.csv'")
            load = ShellAccess.sqlContext.read.format(
                'com.databricks.spark.csv')
            if schema is not None:
                return load.options(header='true', mode="DROPMALFORMED").load(
                    path,
                    schema=StructType([
                        StructField(item[0], getType(item[1]), True)
                        for item in schema
                    ]))
            else:
                return load.options(header='true',
                                    mode="DROPMALFORMED",
                                    inferschema='true').load(path)
        elif Environment.sparkVersion == 2:
            print("Loading file using 'SparkSession'")
            csvload = ShellAccess.SparkSession.builder.getOrCreate() \
                .read \
                .format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat") \
                .option("header", "true") \
                .option("mode", "DROPMALFORMED")
            if schema is not None:
                return csvload.schema(
                    StructType([
                        StructField(item[0], getType(item[1]), True)
                        for item in schema
                    ])).load(path)
            else:
                return csvload.option("inferSchema", "true").load(path)
        else:
            print("Loading file using 'pandas'")
            return pd.read_csv(path)

    def JSONdataLoader(self, path, schema=None):
        if schema is not None and Environment.hasSpark:
            from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType

            def getType(t):
                if t == 'int':
                    return IntegerType()
                elif t == 'double':
                    return DoubleType()
                else:
                    return StringType()

        with open(path, 'r') as f:
            res = f.read()

        if Environment.sparkVersion == 1:
            print("Loading file using a pyspark dataframe for spark 1")
            dataRDD = ShellAccess.sc.parallelize([res])
            return ShellAccess.sqlContext.jsonRDD(dataRDD)
        elif Environment.sparkVersion == 2:
            print("Loading file using a pyspark dataframe for spark 2")
            dataRDD = ShellAccess.sc.parallelize([res])
            return ShellAccess.spark.read.json(dataRDD)
        else:
            print("Loading file using 'pandas'")
            data = json.loads(res)
            df = json_normalize(data)
            return df

    def loadSparkDataFrameFromSampleData(self, dataDef):
        return Downloader(dataDef).download(self.dataLoader)

    def loadSparkDataFrameFromUrl(self, dataUrl):
        i = dataUrl.rfind('/')
        dataName = dataUrl[(i + 1):]
        dataDef = {"displayName": dataUrl, "url": dataUrl}

        return Downloader(dataDef).download(self.dataLoader)

    def JSONloadSparkDataFrameFromUrl(self, dataUrl):
        i = dataUrl.rfind('/')
        dataName = dataUrl[(i + 1):]
        dataDef = {"displayName": dataUrl, "url": dataUrl}

        return Downloader(dataDef).download(self.JSONdataLoader)
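
This variant adds a type parameter for JSON sources and a pandas fallback when no Spark context is available; a usage sketch under the same assumptions (the URL is a placeholder):

# Hypothetical usage of the JSON path.
samples = SampleData({})
df = samples.sampleData("https://example.com/records.json", type='json')
# With no Spark context, loading is routed through pandas json_normalize.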
Example #3
from IPython.core.getipython import get_ipython
from IPython.display import display, HTML, Javascript
from pixiedust.utils.shellAccess import ShellAccess
# Assumed import path for PixiedustTemplateEnvironment used below; may vary by version.
from pixiedust.utils.template import PixiedustTemplateEnvironment
from functools import reduce
import uuid
import json
import sys
import traceback
import pixiedust
from collections import OrderedDict
from threading import Thread, Lock, Event
import time

myLogger = pixiedust.getLogger(__name__)
_env = PixiedustTemplateEnvironment()
progressMonitor = None
loadingProgressMonitor = False

def enableSparkJobProgressMonitor():
    global progressMonitor, loadingProgressMonitor
    # Lazily create the monitor exactly once; the loading flag guards against
    # a second call arriving while the background thread is still starting.
    if progressMonitor is None and not loadingProgressMonitor:
        loadingProgressMonitor = True
        def startSparkJobProgressMonitor():
            global progressMonitor
            progressMonitor = SparkJobProgressMonitor()
        # Daemon thread so monitor setup never blocks kernel shutdown.
        t = Thread(target=startSparkJobProgressMonitor)
        t.daemon = True
        t.start()
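
# Usage sketch (assumption: this module is importable as
# pixiedust.utils.sparkJobProgressMonitor; the exact path is not confirmed
# by this listing):
#
#   from pixiedust.utils.sparkJobProgressMonitor import enableSparkJobProgressMonitor
#   enableSparkJobProgressMonitor()  # idempotent: the guard flags prevent a second start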

class SparkJobProgressMonitorOutput(Thread):