def to_spark(self): """Pass URL to spark to load as a DataFrame Note that this requires ``org.apache.spark.sql.avro.AvroFileFormat`` to be installed in your spark classes. This feature is experimental. """ from intake_spark.base import SparkHolder sh = SparkHolder(True, [['read'], ['format', ["com.databricks.spark.avro"]], ['load', [self._urlpath]]], {}) return sh.setup()
def to_spark(self): """Produce Spark DataFrame equivalent This will ignore all arguments except the urlpath, which will be directly interpreted by Spark. If you need to configure the storage, that must be done on the spark side. This method requires intake-spark. See its documentation for how to set up a spark Session. """ from intake_spark.base import SparkHolder args = [['read'], ['parquet', [self._urlpath]]] sh = SparkHolder(True, args, {}) return sh.setup()
def test_cat():
    # HiveContext lives in pyspark.sql, not at the pyspark top level
    from pyspark.sql import HiveContext
    from intake_spark.base import SparkHolder

    # Create the spark session early via the holder
    h = SparkHolder(True, [('catalog', )], {})
    h.setup()
    session = h.session[0]

    # Register a pandas frame (`df`, defined at the test module's top level,
    # as are SparkTablesCatalog and SparkDataFrame) as a Hive table that the
    # catalog driver can then discover
    d = session.createDataFrame(df)
    sql = HiveContext(session.sparkContext)
    sql.registerDataFrameAsTable(d, 'temp')

    cat = SparkTablesCatalog()
    assert 'temp' in list(cat)
    s = cat.temp()
    assert isinstance(s, SparkDataFrame)
    out = s.read()
    assert out.astype(df.dtypes).equals(df)
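# The test above assumes a small pandas frame defined at the test module's
# top level; something like this (illustrative values, not the original
# fixture):
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})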
def to_spark(self):
    """Pass URL to Spark to load as an RDD of text lines"""
    from intake_spark.base import SparkHolder
    h = SparkHolder(False, [('textFile', (self._urlpath, ))], {})
    return h.setup()
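# Sketch of the direct equivalent, assuming the holder's leading False flag
# targets the SparkContext rather than a SparkSession (as the other snippets
# suggest, where True pairs with session methods like `read`): this encodes
# sc.textFile(urlpath). The path below is a hypothetical placeholder.
from pyspark.sql import SparkSession

sc = SparkSession.builder.getOrCreate().sparkContext
rdd = sc.textFile("s3://bucket/logs/*.txt")  # hypothetical path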
def to_spark(self):
    """Pass URL to Spark to load as a DataFrame, via the CSV reader"""
    from intake_spark.base import SparkHolder
    h = SparkHolder(True, [('read', ),
                           ('format', ("csv", )),
                           ('option', ("header", "true")),
                           ('load', (self.urlpath, ))], {})
    return h.setup()
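# Sketch of the direct PySpark chain the call list above encodes; the session
# setup and path are illustrative assumptions, not part of the original source.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sdf = (spark.read
       .format("csv")
       .option("header", "true")
       .load("s3://bucket/table.csv"))  # hypothetical path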