Example #1
    def to_spark(self):
        """Pass URL to spark to load as a DataFrame

        Note that this requires ``org.apache.spark.sql.avro.AvroFileFormat``
        to be installed in your spark classes.

        This feature is experimental.
        """
        from intake_spark.base import SparkHolder
        sh = SparkHolder(True,
                         [['read'], ['format', ["com.databricks.spark.avro"]],
                          ['load', [self._urlpath]]], {})
        return sh.setup()
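For orientation, the argument list above simply replays a chained Spark call. A minimal sketch of the equivalent direct PySpark chain, assuming an active SparkSession, the spark-avro package on the classpath, and a hypothetical path standing in for self._urlpath:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
urlpath = "data/part.avro"  # hypothetical stand-in for self._urlpath

# the same read -> format -> load chain that SparkHolder replays lazily
df = (spark.read
      .format("com.databricks.spark.avro")
      .load(urlpath))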
Example #2
    def to_spark(self):
        """Produce Spark DataFrame equivalent

        This will ignore all arguments except the urlpath, which is
        interpreted directly by Spark. Any storage configuration must be
        done on the Spark side.

        This method requires intake-spark. See its documentation for how to
        set up a Spark session.
        """
        from intake_spark.base import SparkHolder
        args = [['read'], ['parquet', [self._urlpath]]]
        sh = SparkHolder(True, args, {})
        return sh.setup()
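The same correspondence holds here: the args list encodes spark.read.parquet(urlpath). A minimal direct equivalent, again with a hypothetical path standing in for self._urlpath:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
urlpath = "data/table.parquet"  # hypothetical stand-in for self._urlpath

df = spark.read.parquet(urlpath)  # the chain SparkHolder builds lazily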
Example #3
def test_cat():
    import pyspark
    # ``df`` is a sample pandas DataFrame defined at module level in the
    # test suite; SparkHolder, SparkTablesCatalog and SparkDataFrame are
    # imported from intake_spark.
    h = SparkHolder(True, [('catalog', )], {})
    h.setup()  # create spark session early
    session = h.session[0]
    d = session.createDataFrame(df)
    sql = pyspark.HiveContext(session.sparkContext)
    sql.registerDataFrameAsTable(d, 'temp')

    cat = SparkTablesCatalog()
    assert 'temp' in list(cat)   # the registered table is discoverable
    s = cat.temp()               # the catalog entry yields a source
    assert isinstance(s, SparkDataFrame)
    out = s.read()               # materialize back to pandas
    assert out.astype(df.dtypes).equals(df)
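Note that HiveContext.registerDataFrameAsTable is the older API; on Spark 2.x and later the registration is usually done on the session itself. A hedged sketch of the equivalent setup, using a hypothetical sample frame:

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.enableHiveSupport().getOrCreate()
df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})  # hypothetical sample data

d = spark.createDataFrame(df)
d.createOrReplaceTempView('temp')   # modern replacement for registerDataFrameAsTable
print(spark.catalog.listTables())   # the temp view appears in the session catalog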
Example #4
    def to_spark(self):
        from intake_spark.base import SparkHolder
        # False selects the SparkContext rather than the SparkSession, so
        # setup() yields an RDD of text lines instead of a DataFrame
        h = SparkHolder(False, [('textFile', (self._urlpath, ))], {})
        return h.setup()
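Because the first argument to SparkHolder is False here, the call chain targets the SparkContext and produces an RDD rather than a DataFrame. A minimal direct equivalent, with a hypothetical path standing in for self._urlpath:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
urlpath = "data/lines.txt"  # hypothetical stand-in for self._urlpath

rdd = sc.textFile(urlpath)  # an RDD of text lines, as ('textFile', ...) encodes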
Example #5
    def to_spark(self):
        from intake_spark.base import SparkHolder
        # replays spark.read.format("csv").option("header", "true").load(...)
        h = SparkHolder(True, [('read', ), ('format', ("csv", )),
                               ('option', ("header", "true")),
                               ('load', (self.urlpath, ))], {})
        return h.setup()