Example #1
    def __init__(self, sc=None, **create_sc_kwargs):
        if not sc:
            sc = create_sc(**create_sc_kwargs)
        if type(sc) is not SparkContext:
            raise TypeError("sparktk context init requires a valid SparkContext.  Received type %s" % type(sc))
        self._sc = sc
        self._jtc = self._sc._jvm.org.trustedanalytics.sparktk.TkContext(self._sc._jsc)
        self._jutils = JUtils(self._sc)
        self._scala_sc = self._jutils.get_scala_sc()
        loggers.set_spark(self._sc, "off")  # todo: undo this/move to config, I just want it outta my face most of the time
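A minimal usage sketch for the constructor above, assuming pyspark and sparktk are importable and a local Spark install is available; the master and app name values are illustrative, not defaults taken from the example:

# Sketch only; either hand in an existing SparkContext or let the context build one.
from pyspark import SparkConf, SparkContext
import sparktk

# Option 1: wrap an already-created SparkContext.
sc = SparkContext(conf=SparkConf().setMaster("local[2]").setAppName("sparktk-demo"))
tc = sparktk.TkContext(sc=sc)

# Option 2: let TkContext call create_sc(**create_sc_kwargs) itself
# (any keyword accepted by create_sc can be forwarded here).
# tc = sparktk.TkContext(master="local[2]", app_name="sparktk-demo")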
Example #2
class TkContext(object):
    """TK Context - grounding object for the sparktk library"""

    def __init__(self, sc=None, **create_sc_kwargs):
        if not sc:
            sc = create_sc(**create_sc_kwargs)
        if type(sc) is not SparkContext:
            raise TypeError("sparktk context init requires a valid SparkContext.  Received type %s" % type(sc))
        self._sc = sc
        self._jtc = self._sc._jvm.org.trustedanalytics.sparktk.TkContext(self._sc._jsc)
        self._jutils = JUtils(self._sc)
        self._scala_sc = self._jutils.get_scala_sc()
        loggers.set_spark(self._sc, "off")  # todo: undo this/move to config, I just want it outta my face most of the time

    @property
    def sc(self):
        return self._sc

    @property
    def jutils(self):
        return self._jutils

    @property
    def models(self):
        """access to the various models of sparktk"""
        return get_lazy_loader(self, "models", implicit_kwargs={'tc': self})

    @property
    def frame(self):
        return get_lazy_loader(self, "frame", implicit_kwargs={'tc': self}).frame  # .frame to account for extra 'frame' in name vis-a-vis scala

    def load(self, path):
        """loads an object from the given path"""
        scala_obj = self._jtc.load(path)
        return self._create_python_proxy(scala_obj)

    def _create_python_proxy(self, scala_obj):
        """Create a python object for the scala_obj

        Convention is such that the python proxy object is available off the TkContext with the SAME
        path that the object has in Scala, starting with sparktk.

        Example:

        org.trustedanalytics.sparktk.models.clustering.kmeans.KMeansModel

        means a call to

        tc.models.clustering.kmeans.KMeansModel.load(tc, scala_obj)

        The signature is simply the python tc and the reference to the scala obj
        """
        name_parts = scala_obj.getClass().getName().split('.')
        relevant_path = ".".join(name_parts[name_parts.index('sparktk')+1:])
        cmd = "tc.%s.load(tc, scala_obj)" % relevant_path
        logger.debug("tkcontext._create_python_proxy cmd=%s", cmd)
        proxy = eval(cmd, {"tc": self, "scala_obj": scala_obj})
        return proxy
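The proxy-path convention described in _create_python_proxy can be checked in isolation. A standalone sketch of the same name-splitting logic (not part of the sparktk API; the class name string is the one used in the docstring above):

def python_proxy_path(scala_class_name):
    """Return the attribute path off the TkContext for a Scala class name."""
    name_parts = scala_class_name.split('.')
    # everything after the 'sparktk' package segment becomes the Python path
    return ".".join(name_parts[name_parts.index('sparktk') + 1:])

print(python_proxy_path("org.trustedanalytics.sparktk.models.clustering.kmeans.KMeansModel"))
# prints: models.clustering.kmeans.KMeansModel, i.e. tc.models.clustering.kmeans.KMeansModel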
Example #3
    def __init__(self, sc=None, other_libs=None, **create_sc_kwargs):
        if not sc:
            if SparkContext._active_spark_context:
                sc = SparkContext._active_spark_context
            else:
                sc = create_sc(other_libs=other_libs, **create_sc_kwargs)
        if type(sc) is not SparkContext:
            raise TypeError("sparktk context init requires a valid SparkContext.  Received type %s" % type(sc))
        self._sc = sc
        self._sql_context = None
        self._jtc = self._sc._jvm.org.trustedanalytics.sparktk.TkContext(self._sc._jsc)
        self._jutils = JUtils(self._sc)
        self._scala_sc = self._jutils.get_scala_sc()
        self._other_libs = other_libs if other_libs is None or isinstance(other_libs, list) else [other_libs]
        if self._other_libs is not None:
            for lib in self._other_libs:
                lib_obj = lib.get_main_object(self)
                setattr(self, lib.__name__, lib_obj)
        loggers.set_spark(self._sc, "off")  # todo: undo this/move to config, I just want it outta my face most of the time
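The other_libs handling above accepts either a single library or a list of them. A standalone sketch of that normalization, using a hypothetical module named dummylib in place of a real spark-tk-compatible package:

import types

# Hypothetical stand-in for a spark-tk-compatible library module.
dummylib = types.ModuleType("dummylib")
dummylib.get_main_object = lambda tc: {"tc": tc}  # whatever the library exposes off the context

other_libs = dummylib  # caller passed a single module, not a list
# Same normalization expression as in __init__ above:
other_libs = other_libs if other_libs is None or isinstance(other_libs, list) else [other_libs]
print([lib.__name__ for lib in other_libs])  # prints: ['dummylib']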
Example #4
class TkContext(object):
    """TK Context - grounding object for the sparktk library"""

    def __init__(self, sc=None, **create_sc_kwargs):
        if not sc:
            sc = create_sc(**create_sc_kwargs)
        if type(sc) is not SparkContext:
            raise TypeError("sparktk context init requires a valid SparkContext.  Received type %s" % type(sc))
        self._sc = sc
        self._jtc = self._sc._jvm.org.trustedanalytics.sparktk.TkContext(self._sc._jsc)
        self._jutils = JUtils(self._sc)
        self._scala_sc = self._jutils.get_scala_sc()
        loggers.set_spark(self._sc, "off")  # todo: undo this/move to config, I just want it outta my face most of the time

    @property
    def sc(self):
        return self._sc

    @property
    def jutils(self):
        return self._jutils

    @property
    def models(self):
        """access to the various models of sparktk"""
        return get_lazy_loader(self, "models", implicit_kwargs={'tc': self})

    @property
    def frame(self):
        return get_lazy_loader(self, "frame", implicit_kwargs={'tc': self}).frame  # .frame to account for extra 'frame' in name vis-a-vis scala

    @deprecated
    def to_frame(self, data, schema=None):
        """creates a frame from the given data"""
        from sparktk.frame.frame import create
        return create(data, schema, tc=self)

    def load(self, path):
        """loads an object from the given path"""
        scala_obj = self._jtc.load(path)
        return self._create_python_proxy(scala_obj)

    def _create_python_proxy(self, scala_obj):
        """Create a python object for the scala_obj

        Convention is such that the python proxy object is available off the TkContext with the SAME
        path that the object has in Scala, starting with sparktk.

        Example:

        org.trustedanalytics.sparktk.models.clustering.kmeans.KMeansModel

        means a call to

        tc.models.clustering.kmeans.KMeansModel.load(tc, scala_obj)

        The signature is simply the python tc and the reference to the scala obj
        """
        name_parts = scala_obj.getClass().getName().split('.')
        relevant_path = ".".join(name_parts[name_parts.index('sparktk')+1:])
        cmd = "tc.%s.load(tc, scala_obj)" % relevant_path
        logger.debug("tkcontext._create_python_proxy cmd=%s", cmd)
        proxy = eval(cmd, {"tc": self, "scala_obj": scala_obj})
        return proxy
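Since to_frame is marked deprecated in this version, the equivalent call goes through the frame property. A hedged sketch, assuming a live TkContext named tc; the data and schema values are made up for illustration:

# Assumes tc is an existing TkContext backed by a real SparkContext.
data = [[1, "a", 3.0], [2, "b", 5.0]]
schema = [("n", int), ("s", str), ("x", float)]

# Deprecated entry point shown above:
# frame = tc.to_frame(data, schema)

# Preferred path via the lazily loaded frame module:
frame = tc.frame.create(data, schema)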
Example #5
class TkContext(object):
    """TK Context - grounding object for the sparktk library"""
    def __init__(self, sc=None, **create_sc_kwargs):
        if not sc:
            if SparkContext._active_spark_context:
                sc = SparkContext._active_spark_context
            else:
                sc = create_sc(**create_sc_kwargs)
        if type(sc) is not SparkContext:
            raise TypeError(
                "sparktk context init requires a valid SparkContext.  Received type %s"
                % type(sc))
        self._sc = sc
        self._sql_context = None
        self._jtc = self._sc._jvm.org.trustedanalytics.sparktk.TkContext(
            self._sc._jsc)
        self._jutils = JUtils(self._sc)
        self._scala_sc = self._jutils.get_scala_sc()
        loggers.set_spark(
            self._sc, "off"
        )  # todo: undo this/move to config, I just want it outta my face most of the time

    from sparktk.arguments import implicit

    @staticmethod
    def validate(tc, arg_name='tc'):
        """
        Raises a ValueError if the tc variable is not of type TkContext

        Since tc is so commonly used as an implicit variable, it's worth the special code here to save a lot of imports that would otherwise be needed

        """
        require_type(tc, arg_name, TkContext)

    @property
    def sc(self):
        return self._sc

    @property
    def sql_context(self):
        if self._sql_context is None:
            from pyspark.sql import SQLContext
            self._sql_context = SQLContext(self.sc)
        return self._sql_context

    @property
    def jutils(self):
        return self._jutils

    @property
    def models(self):
        """access to the various models of sparktk"""
        return get_lazy_loader(self, "models", implicit_kwargs={'tc': self})

    @property
    def frame(self):
        return get_lazy_loader(self, "frame", implicit_kwargs={
            'tc': self
        }).frame  # .frame to account for extra 'frame' in name vis-a-vis scala

    @property
    def graph(self):
        return get_lazy_loader(self, "graph", implicit_kwargs={
            'tc': self
        }).graph  # .graph to account for extra 'graph' in name vis-a-vis scala

    @property
    def examples(self):
        return get_lazy_loader(self, "examples", implicit_kwargs={'tc': self})

    def load(self, path, validate_type=None):
        """loads object from the given path (if validate_type is provided, error raised if loaded obj does not match"""
        scala_obj = self._jtc.load(path)
        python_obj = self._create_python_proxy(scala_obj)
        if validate_type and not isinstance(python_obj, validate_type):
            raise RuntimeError("load expected to get type %s but got type %s" %
                               (validate_type, type(python_obj)))
        return python_obj

    def _create_python_proxy(self, scala_obj):
        """Create a python object for the scala_obj

        Convention is such that the python proxy object is available off the TkContext with the SAME
        path that the object has in Scala, starting with sparktk.

        Example:

        org.trustedanalytics.sparktk.models.clustering.kmeans.KMeansModel

        means a call to

        tc.models.clustering.kmeans.KMeansModel._from_scala(tc, scala_obj)

        The signature is simply the python tc and the reference to the scala obj
        """
        name_parts = scala_obj.getClass().getName().split('.')
        try:
            relevant_path = ".".join(name_parts[name_parts.index('sparktk') +
                                                1:])
        except ValueError as e:
            raise ValueError("Trouble with class name %s, %s" %
                             ('.'.join(name_parts), str(e)))
        cmd = "tc.%s._from_scala(tc, scala_obj)" % relevant_path
        logger.debug("tkcontext._create_python_proxy cmd=%s", cmd)
        proxy = eval(cmd, {"tc": self, "scala_obj": scala_obj})
        return proxy

    @property
    def agg(self):
        """access to the aggregation function enumeration"""
        from sparktk.frame.ops.group_by import agg
        return agg
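The sql_context property above builds the SQLContext on first access and caches it. A standalone sketch of that lazy-initialization pattern, with a plain object standing in for SQLContext(self.sc):

class LazyHolder(object):
    """Minimal illustration of the cached-property pattern used by sql_context."""
    def __init__(self):
        self._expensive = None

    @property
    def expensive(self):
        if self._expensive is None:
            print("building once...")
            self._expensive = object()  # stands in for SQLContext(self.sc)
        return self._expensive

holder = LazyHolder()
assert holder.expensive is holder.expensive  # built on first access, then reused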
Example #6
    def __init__(self,
                 sc=None,
                 master=None,
                 py_files=None,
                 spark_home=None,
                 sparktk_home=None,
                 pyspark_submit_args=None,
                 app_name=None,
                 other_libs=None,
                 extra_conf_file=None,
                 extra_conf_dict=None,
                 use_local_fs=False,
                 debug=None):
        r"""
        Creates a TkContext object

        :param sc: (SparkContext) Active Spark Context, if not provided a new Spark Context is created with the
                   rest of the args
                   (see https://spark.apache.org/docs/latest/api/java/org/apache/spark/SparkContext.html)
        :param master: (str) override spark master setting; for ex. 'local[4]' or 'yarn-client'
        :param py_files: (list) list of str of paths to python dependencies; Note that the current python
                         package will be freshly zipped up and put in a tmp folder for shipping by spark, and then removed
        :param spark_home: (str) override $SPARK_HOME, the location of spark
        :param sparktk_home: (str) override $SPARKTK_HOME, the location of spark-tk
        :param pyspark_submit_args: (str) extra args passed to the pyspark submit
        :param app_name: (str) name of spark app that will be created
        :param other_libs: (list) other libraries (actual python packages or modules) that are compatible with spark-tk,
                           which need to be added to the spark context.  These libraries must be developed for use with
                           spark-tk and have particular methods implemented.  (See sparkconf.py _validate_other_libs)
        :param extra_conf_file: (str) local file path to a spark conf file to supplement the spark conf
                                File format is basic key-value pairs per line, like:

                                    spark.executor.memory=6g
                                    spark.files.overwrite=true

        (NOTE: if env var $SPARKTK_EXTRA_CONF is set, the file it indicates will be used.)

        :param extra_conf_dict: (dict) dict for any extra spark conf settings,
                                for ex.  {"spark.hadoop.fs.default.name": "file:///"}
                                these will override any matching settings from extra_conf_file, if provided
        :param use_local_fs: (bool) simpler way to specify using local file system, rather than hdfs or other
        :param debug: (int or str) provide a port address to attach a debugger to the JVM that gets started
        :return: TkContext

        Creating a TkContext requires creating or obtaining a SparkContext object.  It is usually recommended to let
        the TkContext create the SparkContext, since it can supply the proper locations of the sparktk-specific
        dependencies (i.e. jars).  Otherwise, specifying the classpath and jars arguments is left to the user.


        Examples
        --------

        <skip>
        Creating a TkContext using no arguments will cause a SparkContext to be created using default settings:

            >>> import sparktk

            >>> tc = sparktk.TkContext()

            >>> print tc.sc._conf.toDebugString()
            spark.app.name=sparktk
            spark.driver.extraClassPath=/opt/lib/spark/lib/*:/opt/spark-tk/sparktk-core/*
            spark.driver.extraLibraryPath=/opt/lib/hadoop/lib/native:/opt/lib/spark/lib:/opt/lib/hadoop/lib/native
            spark.jars=file:/opt/lib/spark/lib/spark-examples-1.6.0-hadoop2.6.0.jar,file:/opt/lib/spark/lib/spark-assembly.jar,file:/opt/lib/spark/lib/spark-examples.jar,file:/opt/lib/spark-tk/sparktk-core/sparktk-core-1.0-SNAPSHOT.jar,file:/opt/lib/spark-tk/sparktk-core/dependencies/spark-mllib_2.10-1.6.0.jar, ...
            spark.master=local[4]
            spark.yarn.jar=local:/opt/lib/spark/lib/spark-assembly.jar


        Another case with arguments to control some Spark Context settings:

            >>> import sparktk

            >>> tc = sparktk.TkContext(master='yarn-client',
            ...                        py_files='mylib.py',
            ...                        pyspark_submit_args='--jars /usr/lib/custom/extra.jar '
            ...                                            '--driver-class-path /usr/lib/custom/* '
            ...                                            '--executor-memory 6g',
            ...                        extra_conf_dict={'spark.files.overwrite': 'true'},
            ...                        app_name='myapp')

            >>> print tc.sc._conf.toDebugString()
            spark.app.name=myapp
            spark.driver.extraClassPath=/usr/lib/custom/*:/opt/lib/spark/lib/*:/opt/spark-tk/sparktk-core/*
            spark.driver.extraLibraryPath=/opt/lib/hadoop/lib/native:/opt/lib/spark/lib:/opt/lib/hadoop/lib/native
            spark.executor.memory=6g
            spark.files.overwrite=true
            spark.jars=file:/usr/local/custom/extra.jar,file:/opt/lib/spark/lib/spark-examples-1.6.0-hadoop2.6.0.jar,file:/opt/lib/spark/lib/spark-assembly.jar,file:/opt/lib/spark/lib/spark-examples.jar,file:/opt/lib/spark-tk/sparktk-core/sparktk-core-1.0-SNAPSHOT.jar,file:/opt/lib/spark-tk/sparktk-core/dependencies/spark-mllib_2.10-1.6.0.jar, ...
            spark.master=yarn-client
            spark.yarn.isPython=true
            spark.yarn.jar=local:/opt/lib/spark/lib/spark-assembly.jar

        </skip>

        """
        if not sc:
            if SparkContext._active_spark_context:
                msg = "New context NOT created because there is already an active SparkContext"
                logger.warn(msg)
                import sys
                sys.stderr.write("[WARNING] %s\n" % msg)
                sys.stderr.flush()
                sc = SparkContext._active_spark_context
            else:
                sc = create_sc(master=master,
                               py_files=py_files,
                               spark_home=spark_home,
                               sparktk_home=sparktk_home,
                               pyspark_submit_args=pyspark_submit_args,
                               app_name=app_name,
                               other_libs=other_libs,
                               extra_conf_file=extra_conf_file,
                               extra_conf_dict=extra_conf_dict,
                               use_local_fs=use_local_fs,
                               debug=debug)
        if type(sc) is not SparkContext:
            if sc is TkContext.__mock:
                return
            raise TypeError("sparktk context init requires a valid SparkContext.  Received type %s" % type(sc))
        self._sc = sc
        self._sql_context = None
        self._jtc = self._sc._jvm.org.trustedanalytics.sparktk.TkContext(self._sc._jsc)
        self._jutils = JUtils(self._sc)
        self._scala_sc = self._jutils.get_scala_sc()
        self._other_libs = other_libs if other_libs is None or isinstance(other_libs, list) else [other_libs]
        if self._other_libs is not None:
            for lib in self._other_libs:
                lib_obj = lib.get_main_object(self)
                setattr(self, lib.__name__, lib_obj)
        loggers.set_spark(self._sc, "off")  # todo: undo this/move to config, I just want it outta my face most of the time
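This revision reuses an already-active SparkContext instead of creating a new one, warning on stderr when it does so. A hedged sketch of that path, assuming pyspark and sparktk are importable; the master and app name are illustrative:

# Sketch only; requires a working Spark installation.
from pyspark import SparkContext
import sparktk

sc = SparkContext(master="local[2]", appName="existing-app")
# A SparkContext is already active, so the constructor logs
# "New context NOT created ..." and wraps the existing one.
tc = sparktk.TkContext()
assert tc.sc is sc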
Example #7
class TkContext(object):
    """
    TK Context

    The sparktk Python API centers around the TkContext object.  This object holds the session's requisite
    SparkContext object in order to work with Spark.  It also provides the entry point to the main APIs.
    """

    _other_libs = None
    __mock = object()

    def __init__(self,
                 sc=None,
                 master=None,
                 py_files=None,
                 spark_home=None,
                 sparktk_home=None,
                 pyspark_submit_args=None,
                 app_name=None,
                 other_libs=None,
                 extra_conf_file=None,
                 extra_conf_dict=None,
                 use_local_fs=False,
                 debug=None):
        r"""
        Creates a TkContext object

        :param sc: (SparkContext) Active Spark Context, if not provided a new Spark Context is created with the
                   rest of the args
                   (see https://spark.apache.org/docs/latest/api/java/org/apache/spark/SparkContext.html)
        :param master: (str) override spark master setting; for ex. 'local[4]' or 'yarn-client'
        :param py_files: (list) list of str of paths to python dependencies; Note that the current python
                         package will be freshly zipped up and put in a tmp folder for shipping by spark, and then removed
        :param spark_home: (str) override $SPARK_HOME, the location of spark
        :param sparktk_home: (str) override $SPARKTK_HOME, the location of spark-tk
        :param pyspark_submit_args: (str) extra args passed to the pyspark submit
        :param app_name: (str) name of spark app that will be created
        :param other_libs: (list) other libraries (actual python packages or modules) that are compatible with spark-tk,
                           which need to be added to the spark context.  These libraries must be developed for use with
                           spark-tk and have particular methods implemented.  (See sparkconf.py _validate_other_libs)
        :param extra_conf_file: (str) local file path to a spark conf file to supplement the spark conf
                                File format is basic key-value pairs per line, like:

                                    spark.executor.memory=6g
                                    spark.files.overwrite=true

        (NOTE: if env var $SPARKTK_EXTRA_CONF is set, the file it indicates will be used.)

        :param extra_conf_dict: (dict) dict for any extra spark conf settings,
                                for ex.  {"spark.hadoop.fs.default.name": "file:///"}
                                these will override any matching settings from extra_conf_file, if provided
        :param use_local_fs: (bool) simpler way to specify using local file system, rather than hdfs or other
        :param debug: (int or str) provide a port address to attach a debugger to the JVM that gets started
        :return: TkContext

        Creating a TkContext requires creating or obtaining a SparkContext object.  It is usually recommended to let
        the TkContext create the SparkContext, since it can supply the proper locations of the sparktk-specific
        dependencies (i.e. jars).  Otherwise, specifying the classpath and jars arguments is left to the user.


        Examples
        --------

        <skip>
        Creating a TkContext using no arguments will cause a SparkContext to be created using default settings:

            >>> import sparktk

            >>> tc = sparktk.TkContext()

            >>> print tc.sc._conf.toDebugString()
            spark.app.name=sparktk
            spark.driver.extraClassPath=/opt/lib/spark/lib/*:/opt/spark-tk/sparktk-core/*
            spark.driver.extraLibraryPath=/opt/lib/hadoop/lib/native:/opt/lib/spark/lib:/opt/lib/hadoop/lib/native
            spark.jars=file:/opt/lib/spark/lib/spark-examples-1.6.0-hadoop2.6.0.jar,file:/opt/lib/spark/lib/spark-assembly.jar,file:/opt/lib/spark/lib/spark-examples.jar,file:/opt/lib/spark-tk/sparktk-core/sparktk-core-1.0-SNAPSHOT.jar,file:/opt/lib/spark-tk/sparktk-core/dependencies/spark-mllib_2.10-1.6.0.jar, ...
            spark.master=local[4]
            spark.yarn.jar=local:/opt/lib/spark/lib/spark-assembly.jar


        Another case with arguments to control some Spark Context settings:

            >>> import sparktk

            >>> tc = sparktk.TkContext(master='yarn-client',
            ...                        py_files='mylib.py',
            ...                        pyspark_submit_args='--jars /usr/lib/custom/extra.jar '
            ...                                            '--driver-class-path /usr/lib/custom/* '
            ...                                            '--executor-memory 6g',
            ...                        extra_conf_dict={'spark.files.overwrite': 'true'},
            ...                        app_name='myapp')

            >>> print tc.sc._conf.toDebugString()
            spark.app.name=myapp
            spark.driver.extraClassPath=/usr/lib/custom/*:/opt/lib/spark/lib/*:/opt/spark-tk/sparktk-core/*
            spark.driver.extraLibraryPath=/opt/lib/hadoop/lib/native:/opt/lib/spark/lib:/opt/lib/hadoop/lib/native
            spark.executor.memory=6g
            spark.files.overwrite=true
            spark.jars=file:/usr/local/custom/extra.jar,file:/opt/lib/spark/lib/spark-examples-1.6.0-hadoop2.6.0.jar,file:/opt/lib/spark/lib/spark-assembly.jar,file:/opt/lib/spark/lib/spark-examples.jar,file:/opt/lib/spark-tk/sparktk-core/sparktk-core-1.0-SNAPSHOT.jar,file:/opt/lib/spark-tk/sparktk-core/dependencies/spark-mllib_2.10-1.6.0.jar, ...
            spark.master=yarn-client
            spark.yarn.isPython=true
            spark.yarn.jar=local:/opt/lib/spark/lib/spark-assembly.jar

        </skip>

        """
        if not sc:
            if SparkContext._active_spark_context:
                msg = "New context NOT created because there is already an active SparkContext"
                logger.warn(msg)
                import sys
                sys.stderr.write("[WARNING] %s\n" % msg)
                sys.stderr.flush()
                sc = SparkContext._active_spark_context
            else:
                sc = create_sc(master=master,
                               py_files=py_files,
                               spark_home=spark_home,
                               sparktk_home=sparktk_home,
                               pyspark_submit_args=pyspark_submit_args,
                               app_name=app_name,
                               other_libs=other_libs,
                               extra_conf_file=extra_conf_file,
                               extra_conf_dict=extra_conf_dict,
                               use_local_fs=use_local_fs,
                               debug=debug)
        if type(sc) is not SparkContext:
            if sc is TkContext.__mock:
                return
            raise TypeError("sparktk context init requires a valid SparkContext.  Received type %s" % type(sc))
        self._sc = sc
        self._sql_context = None
        self._jtc = self._sc._jvm.org.trustedanalytics.sparktk.TkContext(self._sc._jsc)
        self._jutils = JUtils(self._sc)
        self._scala_sc = self._jutils.get_scala_sc()
        self._other_libs = other_libs if other_libs is None or isinstance(other_libs, list) else [other_libs]
        if self._other_libs is not None:
            for lib in self._other_libs:
                lib_obj = lib.get_main_object(self)
                setattr(self, lib.__name__, lib_obj)
        loggers.set_spark(self._sc, "off")  # todo: undo this/move to config, I just want it outta my face most of the time

    from sparktk.arguments import implicit

    @staticmethod
    def validate(tc, arg_name='tc'):
        """
        Validates that the given tc object is indeed a TkContext.  Raises a ValueError if it is not.

        Examples
        --------

            <hide>
            >>> from sparktk import TkContext

            </hide>

            >>> TkContext.validate(tc)

            >>> try:
            ...     TkContext(25)
            ... except TypeError:
            ...     print "Not a TkContext!"
            Not a TkContext!

        """
        # Since tc is so commonly used as an implicit variable, it's worth special code here to save a lot of imports
        require_type(TkContext, tc, arg_name)

    @staticmethod
    def _create_mock_tc():
        """
        Creates a TkContext which does NOT have a valid SparkContext

        (Useful for testing or exploring sparktk without having Spark around)
        """
        return TkContext(TkContext.__mock)

    @property
    def sc(self):
        """
        Access to the underlying SparkContext

        Example
        -------

            >>> tc.sc.version
            u'1.6.0'

        """
        return self._sc

    @property
    def sql_context(self):
        """
        Access to the underlying Spark SQLContext

        Example
        -------

        <skip>

            >>> tc.sql_context.registerDataFrameAsTable(frame.dataframe, "table1")
            >>> df2 = tc.sql_context.sql("SELECT field1 AS f1, field2 as f2 from table1")
            >>> df2.collect()
            [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')]

        </skip>

        """
        if self._sql_context is None:
            from pyspark.sql import SQLContext
            self._sql_context = SQLContext(self.sc)
        return self._sql_context

    @property
    def jutils(self):
        """Utilities for working with the remote JVM"""
        return self._jutils

    @property
    def agg(self):
        """
        Convenient access to the aggregation function enumeration (See the
        <a href="frame.m.html#sparktk.frame.frame.Frame.group_by">group_by operation</a> on sparktk Frames)

        Example
        -------

        For the given frame, count the groups in column 'b':

            <hide>
            >>> data = [[1, "alpha", 3.0],
            ...         [1, "bravo", 5.0],
            ...         [1, "alpha", 5.0],
            ...         [2, "bravo", 8.0],
            ...         [2, "charlie", 12.0],
            ...         [2, "bravo", 7.0],
            ...         [2, "bravo", 12.0]]
            >>> schema = [("a",int), ("b",str), ("c",float)]

            >>> frame = tc.frame.create(data, schema)

            </hide>

            >>> frame.inspect()
            [#]  a  b        c
            =====================
            [0]  1  alpha     3.0
            [1]  1  bravo     5.0
            [2]  1  alpha     5.0
            [3]  2  bravo     8.0
            [4]  2  charlie  12.0
            [5]  2  bravo     7.0
            [6]  2  bravo    12.0

            >>> b_count = frame.group_by('b', tc.agg.count)

            >>> b_count.inspect()
            [#]  b        count
            ===================
            [0]  alpha        2
            [1]  charlie      1
            [2]  bravo        4

        """
        from sparktk.frame.ops.group_by import agg
        return agg

    @property
    def frame(self):
        """
        Access to create or load the sparktk Frames  (See the <a href="frame.m.html">Frame API</a>)

        Example
        -------

            >>> frame = tc.frame.create([[1, 3.14, 'blue'], [7, 1.61, 'red'], [4, 2.72, 'yellow']])

            >>> frame.inspect()
            [#] C0  C1    C2
            =====================
            [0]  1  3.14  blue
            [1]  7  1.61  red
            [2]  4  2.72  yellow


            >>> frame2 = tc.frame.import_csv("../datasets/basic.csv")

            >>> frame2.inspect(5)
            [#]  C0   C1     C2  C3
            ================================
            [0]  132  75.4    0  correction
            [1]  133  77.66   0  fitness
            [2]  134  71.22   1  proposal
            [3]  201   72.3   1  utilization
            [4]  202   80.1   0  commission


        """
        return get_lazy_loader(self, "frame", implicit_kwargs={'tc': self}).frame  # .frame to account for extra 'frame' in name vis-a-vis scala

    @property
    def graph(self):
        """
        Access to create or load the sparktk Graphs (See the <a href="graph.m.html">Graph API</a>)

        Example
        -------

            <hide>
            >>> g = tc.examples.graphs.get_movie_graph()
            >>> g.save('sandbox/my_saved_graph')
            </hide>

            >>> g = tc.graph.load('sandbox/my_saved_graph')

        """
        return get_lazy_loader(self, "graph", implicit_kwargs={'tc': self}).graph  # .graph to account for extra 'graph' in name vis-a-vis scala

    @property
    def dicom(self):
        """
        Access to create or load the sparktk Dicom objects  (See the <a href="dicom.m.html">Dicom API</a>)

        Example
        -------

            <skip>
            >>> d = tc.dicom.import_dcm('path/to/dicom/images/')

            >>> type(d)
            sparktk.dicom.dicom.Dicom
            </skip>

        """
        return get_lazy_loader(self, "dicom", implicit_kwargs={'tc': self}).dicom  # .dicom to account for extra 'dicom' in name vis-a-vis scala

    @property
    def models(self):
        """
        Access to create or load the various models available in sparktk  (See the <a href="models/index.html">Models API</a>)

        Examples
        --------

        Train an SVM model:

            <skip>
            >>> svm_model = tc.models.classification.svm.train(frame, 'label', ['data'])

        Train a Random Forest regression model:

            >>> rf = tc.models.regression.random_forest_regressor.train(frame,
            ...                                                         'Class',
            ...                                                         ['Dim_1', 'Dim_2'],
            ...                                                         num_trees=1,
            ...                                                         impurity="variance",
            ...                                                         max_depth=4,
            ...                                                         max_bins=100)

        Train a KMeans clustering model:

            >>> km = tc.models.clustering.kmeans.train(frame, ["data"], k=3)

            </skip>

        """
        return get_lazy_loader(self, "models", implicit_kwargs={'tc': self})

    @property
    def examples(self):
        """
        Access to some example data structures

        Example
        -------

        Get a small, built-in sparktk Frame object:

            >>> cities = tc.examples.frames.get_cities_frame()

            >>> cities.inspect(5)
            [#]  rank  city       population_2013  population_2010  change  county
            ==========================================================================
            [0]  1     Portland   609456           583776           4.40%   Multnomah
            [1]  2     Salem      160614           154637           3.87%   Marion
            [2]  3     Eugene     159190           156185           1.92%   Lane
            [3]  4     Gresham    109397           105594           3.60%   Multnomah
            [4]  5     Hillsboro  97368            91611            6.28%   Washington

        """
        return get_lazy_loader(self, "examples", implicit_kwargs={'tc': self})

    def load(self, path, validate_type=None):
        """
        Loads object from the given path

        Parameters
        ----------

        :param path: (str) location of the object to load
        :param validate_type: (type) if provided, a RuntimeError is raised if the loaded obj is not of that type
        :return: (object) the loaded object

        Example
        -------

        <skip>
            >>> f = tc.load("/home/user/sandbox/superframe")

            >>> type(f)
            sparktk.frame.frame.Frame

        </skip>

        """
        loaders_map = None
        if self._other_libs is not None:
            other_loaders = []
            for other_lib in self._other_libs:
                other_loaders.append(other_lib.get_loaders(self))
            loaders_map = self.jutils.convert.combine_scala_maps(other_loaders)
        scala_obj = self._jtc.load(path, self.jutils.convert.to_scala_option(loaders_map))
        python_obj = self._create_python_proxy(scala_obj)
        if validate_type and not isinstance(python_obj, validate_type):
          raise RuntimeError("load expected to get type %s but got type %s" % (validate_type, type(python_obj)))
        return python_obj

    def _create_python_proxy(self, scala_obj):
        """
        Create a python object for the scala_obj

        Convention is such that the python proxy object is available off the TkContext with the SAME
        path that the object has in Scala, starting with sparktk.

        Example:

        org.trustedanalytics.sparktk.models.clustering.kmeans.KMeansModel

        means a call to

        tc.models.clustering.kmeans.KMeansModel._from_scala(tc, scala_obj)

        The signature is simply the python tc and the reference to the scala obj
        """
        name_parts = scala_obj.getClass().getName().split('.')
        try:
            relevant_path = ".".join(name_parts[name_parts.index('sparktk')+1:])
        except ValueError as e:
            # check if it's from another library
            relevant_path = ""
            for other_lib in (self._other_libs or []):  # guard against None so an unknown class still raises the ValueError below
                other_lib_name = other_lib.__name__
                if other_lib_name in name_parts:
                    relevant_path = ".".join(name_parts[name_parts.index(other_lib_name):])
                    break
            # if it's not from one of the other libraries, then raise an error
            if relevant_path == "":
                raise ValueError("Trouble with class name %s, %s" % ('.'.join(name_parts), str(e)))
        cmd = "tc.%s._from_scala(tc, scala_obj)" % relevant_path
        logger.debug("tkcontext._create_python_proxy cmd=%s", cmd)
        proxy = eval(cmd, {"tc": self, "scala_obj": scala_obj})
        return proxy
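The __mock class attribute is a sentinel that lets _create_mock_tc build a TkContext with no SparkContext behind it. A standalone sketch of that sentinel-object pattern, outside of sparktk:

class Service(object):
    __mock = object()  # private sentinel, not a real backend

    def __init__(self, backend=None):
        if backend is Service.__mock:  # bail out early; no attributes get set
            return
        if backend is None:
            raise ValueError("a real backend is required")
        self.backend = backend

    @classmethod
    def _create_mock(cls):
        return cls(cls.__mock)

mock = Service._create_mock()  # handy for tests or API exploration
print(hasattr(mock, "backend"))  # prints: False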
Example #8
    def __init__(self,
                 sc=None,
                 master=default_spark_master,
                 py_files=None,
                 spark_home=None,
                 sparktk_home=None,
                 pyspark_submit_args=None,
                 app_name="sparktk",
                 other_libs=None,
                 extra_conf=None,
                 use_local_fs=False,
                 debug=None):
        r"""
        Creates a TkContext object

        :param sc: (SparkContext) Active Spark Context, if not provided a new Spark Context is created with the
                   rest of the args
                   (see https://spark.apache.org/docs/latest/api/java/org/apache/spark/SparkContext.html)
        :param master: (str) override spark master setting; for ex. 'local[4]' or 'yarn-client'
        :param py_files: (list) list of str of paths to python dependencies; Note that the current python
                         package will be freshly zipped up and put in a tmp folder for shipping by spark, and then removed
        :param spark_home: (str) override $SPARK_HOME, the location of spark
        :param sparktk_home: (str) override $SPARKTK_HOME, the location of spark-tk
        :param pyspark_submit_args: (str) extra args passed to the pyspark submit
        :param app_name: (str) name of spark app that will be created
        :param other_libs: (list) other libraries (actual python packages or modules) that are compatible with spark-tk,
                           which need to be added to the spark context.  These libraries must be developed for use with
                           spark-tk and have particular methods implemented.  (See sparkconf.py _validate_other_libs)
        :param extra_conf: (dict) dict for any extra spark conf settings, for ex. {"spark.hadoop.fs.default.name": "file:///"}
        :param use_local_fs: (bool) simpler way to specify using local file system, rather than hdfs or other
        :param debug: (int or str) provide a port address to attach a debugger to the JVM that gets started
        :return: TkContext

        Creating a TkContext requires creating or obtaining a SparkContext object.  It is usually recommended to let
        the TkContext create the SparkContext, since it can supply the proper locations of the sparktk-specific
        dependencies (i.e. jars).  Otherwise, specifying the classpath and jars arguments is left to the user.


        Examples
        --------

        <skip>
        Creating a TkContext using no arguments will cause a SparkContext to be created using default settings:

            >>> import sparktk

            >>> tc = sparktk.TkContext()

            >>> print tc.sc._conf.toDebugString()
            spark.app.name=sparktk
            spark.driver.extraClassPath=/opt/lib/spark/lib/*:/opt/spark-tk/sparktk-core/*
            spark.driver.extraLibraryPath=/opt/lib/hadoop/lib/native:/opt/lib/spark/lib:/opt/lib/hadoop/lib/native
            spark.jars=file:/opt/lib/spark/lib/spark-examples-1.6.0-hadoop2.6.0.jar,file:/opt/lib/spark/lib/spark-assembly.jar,file:/opt/lib/spark/lib/spark-examples.jar,file:/opt/lib/spark-tk/sparktk-core/sparktk-core-1.0-SNAPSHOT.jar,file:/opt/lib/spark-tk/sparktk-core/dependencies/spark-mllib_2.10-1.6.0.jar, ...
            spark.master=local[4]
            spark.yarn.jar=local:/opt/lib/spark/lib/spark-assembly.jar


        Another case with arguments to control some Spark Context settings:

            >>> import sparktk

            >>> tc = sparktk.TkContext(master='yarn-client',
            ...                        py_files='mylib.py',
            ...                        pyspark_submit_args='--jars /usr/lib/custom/extra.jar '
            ...                                            '--driver-class-path /usr/lib/custom/* '
            ...                                            '--executor-memory 6g',
            ...                        extra_conf={'spark.files.overwrite': 'true'},
            ...                        app_name='myapp')

            >>> print tc.sc._conf.toDebugString()
            spark.app.name=myapp
            spark.driver.extraClassPath=/usr/lib/custom/*:/opt/lib/spark/lib/*:/opt/spark-tk/sparktk-core/*
            spark.driver.extraLibraryPath=/opt/lib/hadoop/lib/native:/opt/lib/spark/lib:/opt/lib/hadoop/lib/native
            spark.executor.memory=6g
            spark.files.overwrite=true
            spark.jars=file:/usr/local/custom/extra.jar,file:/opt/lib/spark/lib/spark-examples-1.6.0-hadoop2.6.0.jar,file:/opt/lib/spark/lib/spark-assembly.jar,file:/opt/lib/spark/lib/spark-examples.jar,file:/opt/lib/spark-tk/sparktk-core/sparktk-core-1.0-SNAPSHOT.jar,file:/opt/lib/spark-tk/sparktk-core/dependencies/spark-mllib_2.10-1.6.0.jar, ...
            spark.master=yarn-client
            spark.yarn.isPython=true
            spark.yarn.jar=local:/opt/lib/spark/lib/spark-assembly.jar

        </skip>

        """
        if not sc:
            if SparkContext._active_spark_context:
                sc = SparkContext._active_spark_context
            else:
                sc = create_sc(master=master,
                               py_files=py_files,
                               spark_home=spark_home,
                               sparktk_home=sparktk_home,
                               pyspark_submit_args=pyspark_submit_args,
                               app_name=app_name,
                               other_libs=other_libs,
                               extra_conf=extra_conf,
                               use_local_fs=use_local_fs,
                               debug=debug)
        if type(sc) is not SparkContext:
            if sc is TkContext.__mock:
                return
            raise TypeError(
                "sparktk context init requires a valid SparkContext.  Received type %s"
                % type(sc))
        self._sc = sc
        self._sql_context = None
        self._jtc = self._sc._jvm.org.trustedanalytics.sparktk.TkContext(
            self._sc._jsc)
        self._jutils = JUtils(self._sc)
        self._scala_sc = self._jutils.get_scala_sc()
        self._other_libs = other_libs if other_libs is None or isinstance(
            other_libs, list) else [other_libs]
        if self._other_libs is not None:
            for lib in self._other_libs:
                lib_obj = lib.get_main_object(self)
                setattr(self, lib.__name__, lib_obj)
        loggers.set_spark(
            self._sc, "off"
        )  # todo: undo this/move to config, I just want it outta my face most of the time
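In this variant the extra Spark settings arrive as a single extra_conf dict rather than the extra_conf_file/extra_conf_dict pair. A hedged construction sketch; the settings shown are illustrative, and the conf lookup assumes the option was actually applied by create_sc:

# Sketch only; assumes sparktk is importable and Spark is installed.
import sparktk

tc = sparktk.TkContext(master="local[2]",
                       app_name="myapp",
                       extra_conf={"spark.files.overwrite": "true"},
                       use_local_fs=True)
print(tc.sc._conf.get("spark.files.overwrite"))  # expected: true, if the setting was applied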
Example #9
class TkContext(object):
    """
    TK Context

    The sparktk Python API centers around the TkContext object.  This object holds the session's requisite
    SparkContext object in order to work with Spark.  It also provides the entry point to the main APIs.
    """

    _other_libs = None
    __mock = object()

    def __init__(self,
                 sc=None,
                 master=default_spark_master,
                 py_files=None,
                 spark_home=None,
                 sparktk_home=None,
                 pyspark_submit_args=None,
                 app_name="sparktk",
                 other_libs=None,
                 extra_conf=None,
                 use_local_fs=False,
                 debug=None):
        r"""
        Creates a TkContext object

        :param sc: (SparkContext) Active Spark Context, if not provided a new Spark Context is created with the
                   rest of the args
                   (see https://spark.apache.org/docs/latest/api/java/org/apache/spark/SparkContext.html)
        :param master: (str) override spark master setting; for ex. 'local[4]' or 'yarn-client'
        :param py_files: (list) list of str of paths to python dependencies; Note that the current python
                         package will be freshly zipped up and put in a tmp folder for shipping by spark, and then removed
        :param spark_home: (str) override $SPARK_HOME, the location of spark
        :param sparktk_home: (str) override $SPARKTK_HOME, the location of spark-tk
        :param pyspark_submit_args: (str) extra args passed to the pyspark submit
        :param app_name: (str) name of spark app that will be created
        :param other_libs: (list) other libraries (actual python packages or modules) that are compatible with spark-tk,
                           which need to be added to the spark context.  These libraries must be developed for use with
                           spark-tk and have particular methods implemented.  (See sparkconf.py _validate_other_libs)
        :param extra_conf: (dict) dict for any extra spark conf settings, for ex. {"spark.hadoop.fs.default.name": "file:///"}
        :param use_local_fs: (bool) simpler way to specify using local file system, rather than hdfs or other
        :param debug: (int or str) provide a port address to attach a debugger to the JVM that gets started
        :return: TkContext

        Creating a TkContext requires creating or obtaining a SparkContext object.  It is usually recommended to let
        the TkContext create the SparkContext, since it can supply the proper locations of the sparktk-specific
        dependencies (i.e. jars).  Otherwise, specifying the classpath and jars arguments is left to the user.


        Examples
        --------

        <skip>
        Creating a TkContext using no arguments will cause a SparkContext to be created using default settings:

            >>> import sparktk

            >>> tc = sparktk.TkContext()

            >>> print tc.sc._conf.toDebugString()
            spark.app.name=sparktk
            spark.driver.extraClassPath=/opt/lib/spark/lib/*:/opt/spark-tk/sparktk-core/*
            spark.driver.extraLibraryPath=/opt/lib/hadoop/lib/native:/opt/lib/spark/lib:/opt/lib/hadoop/lib/native
            spark.jars=file:/opt/lib/spark/lib/spark-examples-1.6.0-hadoop2.6.0.jar,file:/opt/lib/spark/lib/spark-assembly.jar,file:/opt/lib/spark/lib/spark-examples.jar,file:/opt/lib/spark-tk/sparktk-core/sparktk-core-1.0-SNAPSHOT.jar,file:/opt/lib/spark-tk/sparktk-core/dependencies/spark-mllib_2.10-1.6.0.jar, ...
            spark.master=local[4]
            spark.yarn.jar=local:/opt/lib/spark/lib/spark-assembly.jar


        Another case with arguments to control some Spark Context settings:

            >>> import sparktk

            >>> tc = sparktk.TkContext(master='yarn-client',
            ...                        py_files='mylib.py',
            ...                        pyspark_submit_args='--jars /usr/lib/custom/extra.jar '
            ...                                            '--driver-class-path /usr/lib/custom/* '
            ...                                            '--executor-memory 6g',
            ...                        extra_conf={'spark.files.overwrite': 'true'},
            ...                        app_name='myapp')

            >>> print tc.sc._conf.toDebugString()
            spark.app.name=myapp
            spark.driver.extraClassPath=/usr/lib/custom/*:/opt/lib/spark/lib/*:/opt/spark-tk/sparktk-core/*
            spark.driver.extraLibraryPath=/opt/lib/hadoop/lib/native:/opt/lib/spark/lib:/opt/lib/hadoop/lib/native
            spark.executor.memory=6g
            spark.files.overwrite=true
            spark.jars=file:/usr/local/custom/extra.jar,file:/opt/lib/spark/lib/spark-examples-1.6.0-hadoop2.6.0.jar,file:/opt/lib/spark/lib/spark-assembly.jar,file:/opt/lib/spark/lib/spark-examples.jar,file:/opt/lib/spark-tk/sparktk-core/sparktk-core-1.0-SNAPSHOT.jar,file:/opt/lib/spark-tk/sparktk-core/dependencies/spark-mllib_2.10-1.6.0.jar, ...
            spark.master=yarn-client
            spark.yarn.isPython=true
            spark.yarn.jar=local:/opt/lib/spark/lib/spark-assembly.jar

        </skip>

        """
        if not sc:
            if SparkContext._active_spark_context:
                sc = SparkContext._active_spark_context
            else:
                sc = create_sc(master=master,
                               py_files=py_files,
                               spark_home=spark_home,
                               sparktk_home=sparktk_home,
                               pyspark_submit_args=pyspark_submit_args,
                               app_name=app_name,
                               other_libs=other_libs,
                               extra_conf=extra_conf,
                               use_local_fs=use_local_fs,
                               debug=debug)
        if type(sc) is not SparkContext:
            if sc is TkContext.__mock:
                return
            raise TypeError(
                "sparktk context init requires a valid SparkContext.  Received type %s"
                % type(sc))
        self._sc = sc
        self._sql_context = None
        self._jtc = self._sc._jvm.org.trustedanalytics.sparktk.TkContext(
            self._sc._jsc)
        self._jutils = JUtils(self._sc)
        self._scala_sc = self._jutils.get_scala_sc()
        self._other_libs = other_libs if other_libs is None or isinstance(
            other_libs, list) else [other_libs]
        if self._other_libs is not None:
            for lib in self._other_libs:
                lib_obj = lib.get_main_object(self)
                setattr(self, lib.__name__, lib_obj)
        loggers.set_spark(
            self._sc, "off"
        )  # todo: undo this/move to config, I just want it outta my face most of the time

    from sparktk.arguments import implicit

    @staticmethod
    def validate(tc, arg_name='tc'):
        """
        Validates that the given tc object is indeed a TkContext.  Raises a ValueError if it is not.

        Examples
        --------

            <hide>
            >>> from sparktk import TkContext

            </hide>

            >>> TkContext.validate(tc)

            >>> try:
            ...     TkContext(25)
            ... except TypeError:
            ...     print "Not a TkContext!"
            Not a TkContext!

        """
        # Since tc is so commonly used as an implicit variable, it's worth special code here to save a lot of imports
        require_type(TkContext, tc, arg_name)

    @staticmethod
    def _create_mock_tc():
        """
        Creates a TkContext which does NOT have a valid SparkContext

        (Useful for testing or exploring sparktk without having Spark around)
        """
        return TkContext(TkContext.__mock)

    @property
    def sc(self):
        """
        Access to the underlying SparkContext

        Example
        -------

            >>> tc.sc.version
            u'1.6.0'

        """
        return self._sc

    @property
    def sql_context(self):
        """
        Access to the underlying Spark SQLContext

        Example
        -------

        <skip>

            >>> tc.sql_context.registerDataFrameAsTable(frame.dataframe, "table1")
            >>> df2 = tc.sql_context.sql("SELECT field1 AS f1, field2 as f2 from table1")
            >>> df2.collect()
            [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')]

        </skip>

        """
        if self._sql_context is None:
            from pyspark.sql import SQLContext
            self._sql_context = SQLContext(self.sc)
        return self._sql_context

    @property
    def jutils(self):
        """Utilities for working with the remote JVM"""
        return self._jutils

    @property
    def agg(self):
        """
        Convenient access to the aggregation function enumeration (See the
        <a href="frame.m.html#sparktk.frame.frame.Frame.group_by">group_by operation</a> on sparktk Frames)

        Example
        -------

        For the given frame, count the groups in column 'b':

            <hide>
            >>> data = [[1, "alpha", 3.0],
            ...         [1, "bravo", 5.0],
            ...         [1, "alpha", 5.0],
            ...         [2, "bravo", 8.0],
            ...         [2, "charlie", 12.0],
            ...         [2, "bravo", 7.0],
            ...         [2, "bravo", 12.0]]
            >>> schema = [("a",int), ("b",str), ("c",float)]

            >>> frame = tc.frame.create(data, schema)

            </hide>

            >>> frame.inspect()
            [#]  a  b        c
            =====================
            [0]  1  alpha     3.0
            [1]  1  bravo     5.0
            [2]  1  alpha     5.0
            [3]  2  bravo     8.0
            [4]  2  charlie  12.0
            [5]  2  bravo     7.0
            [6]  2  bravo    12.0

            >>> b_count = frame.group_by('b', tc.agg.count)

            >>> b_count.inspect()
            [#]  b        count
            ===================
            [0]  alpha        2
            [1]  charlie      1
            [2]  bravo        4

        """
        from sparktk.frame.ops.group_by import agg
        return agg

    @property
    def frame(self):
        """
        Access to create or load the sparktk Frames  (See the <a href="frame.m.html">Frame API</a>)

        Example
        -------

            >>> frame = tc.frame.create([[1, 3.14, 'blue'], [7, 1.61, 'red'], [4, 2.72, 'yellow']])

            >>> frame.inspect()
            [#] C0  C1    C2
            =====================
            [0]  1  3.14  blue
            [1]  7  1.61  red
            [2]  4  2.72  yellow


            >>> frame2 = tc.frame.import_csv("../datasets/basic.csv")

            >>> frame2.inspect(5)
            [#]  C0   C1     C2  C3
            ================================
            [0]  132  75.4    0  correction
            [1]  133  77.66   0  fitness
            [2]  134  71.22   1  proposal
            [3]  201   72.3   1  utilization
            [4]  202   80.1   0  commission


        """
        return get_lazy_loader(self, "frame", implicit_kwargs={
            'tc': self
        }).frame  # .frame to account for extra 'frame' in name vis-a-vis scala

    @property
    def graph(self):
        """
        Access to create or load the sparktk Graphs (See the <a href="graph.m.html">Graph API</a>)

        Example
        -------

            <hide>
            >>> g = tc.examples.graphs.get_movie_graph()
            >>> g.save('sandbox/my_saved_graph')
            </hide>

            >>> g = tc.graph.load('sandbox/my_saved_graph')

        """
        return get_lazy_loader(self, "graph", implicit_kwargs={
            'tc': self
        }).graph  # .graph to account for extra 'graph' in name vis-a-vis scala

    @property
    def dicom(self):
        """
        Access to create or load the sparktk Dicom objects  (See the <a href="dicom.m.html">Dicom API</a>)

        Example
        -------

            <skip>
            >>> d = tc.dicom.import_dcm('path/to/dicom/images/')

            >>> type(d)
            sparktk.dicom.dicom.Dicom
            </skip>

        """
        return get_lazy_loader(self, "dicom", implicit_kwargs={
            'tc': self
        }).dicom  # .dicom to account for extra 'dicom' in name vis-a-vis scala

    @property
    def models(self):
        """
        Access to create or load the various models available in sparktk  (See the <a href="models/index.html">Models API</a>)

        Examples
        --------

        Train an SVM model:

            <skip>
            >>> svm_model = tc.models.classification.svm.train(frame, 'label', ['data'])

        Train a Random Forest regression model:

            >>> rf = tc.models.regression.random_forest_regressor.train(frame,
            ...                                                         'Class',
            ...                                                         ['Dim_1', 'Dim_2'],
            ...                                                         num_trees=1,
            ...                                                         impurity="variance",
            ...                                                         max_depth=4,
            ...                                                         max_bins=100)

        Train a KMeans clustering model:

            >>> km = tc.models.clustering.kmeans.train(frame, ["data"], k=3)

            </skip>

        """
        return get_lazy_loader(self, "models", implicit_kwargs={'tc': self})

    @property
    def examples(self):
        """
        Access to some example data structures

        Example
        -------

        Get a small, built-in sparktk Frame object:

            >>> cities = tc.examples.frames.get_cities_frame()

            >>> cities.inspect(5)
            [#]  rank  city       population_2013  population_2010  change  county
            ==========================================================================
            [0]  1     Portland   609456           583776           4.40%   Multnomah
            [1]  2     Salem      160614           154637           3.87%   Marion
            [2]  3     Eugene     159190           156185           1.92%   Lane
            [3]  4     Gresham    109397           105594           3.60%   Multnomah
            [4]  5     Hillsboro  97368            91611            6.28%   Washington

        """
        return get_lazy_loader(self, "examples", implicit_kwargs={'tc': self})

    def load(self, path, validate_type=None):
        """
        Loads an object from the given path

        Parameters
        ----------

        :param path: (str) location of the object to load
        :param validate_type: (type) if provided, a RuntimeError is raised if the loaded obj is not of that type
        :return: (object) the loaded object

        Example
        -------

            <skip>
            >>> f = tc.load("/home/user/sandbox/superframe")

            >>> type(f)
            sparktk.frame.frame.Frame

            </skip>

        """
        loaders_map = None
        if self._other_libs is not None:
            other_loaders = []
            for other_lib in self._other_libs:
                other_loaders.append(other_lib.get_loaders(self))
            loaders_map = self.jutils.convert.combine_scala_maps(other_loaders)
        scala_obj = self._jtc.load(
            path, self.jutils.convert.to_scala_option(loaders_map))
        python_obj = self._create_python_proxy(scala_obj)
        if validate_type and not isinstance(python_obj, validate_type):
            raise RuntimeError("load expected to get type %s but got type %s" %
                               (validate_type, type(python_obj)))
        return python_obj

    def _create_python_proxy(self, scala_obj):
        """
        Create a python object for the scala_obj

        Convention is such that the python proxy object is available off the TkContext with the SAME
        path that the object has in Scala, starting with sparktk.

        Example:

        org.trustedanalytics.sparktk.models.clustering.kmeans.KMeansModel

        means a call to

        tc.models.clustering.kmeans.KMeansModel._from_scala(tc, scala_obj)

        The signature is simply the python tc and the reference to the scala obj
        """
        name_parts = scala_obj.getClass().getName().split('.')
        try:
            relevant_path = ".".join(name_parts[name_parts.index('sparktk') +
                                                1:])
        except ValueError as e:
            # check if it's from another library
            relevant_path = ""
            for other_lib in self._other_libs:
                other_lib_name = other_lib.__name__
                if other_lib_name in name_parts:
                    relevant_path = ".".join(
                        name_parts[name_parts.index(other_lib_name):])
                    break
            # if it's not from one of the other libraries, then raise an error
            if relevant_path == "":
                raise ValueError("Trouble with class name %s, %s" %
                                 ('.'.join(name_parts), str(e)))
        cmd = "tc.%s._from_scala(tc, scala_obj)" % relevant_path
        logger.debug("tkcontext._create_python_proxy cmd=%s", cmd)
        proxy = eval(cmd, {"tc": self, "scala_obj": scala_obj})
        return proxy
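
The path-mapping convention described in the docstring above is easiest to see in isolation. The following standalone sketch (not part of the library) reduces a fully qualified Scala class name to the command string that _create_python_proxy evaluates against tc; the sample class name is the one quoted in the docstring, and only the happy path is shown.

def scala_name_to_proxy_cmd(scala_class_name):
    # Sketch only: mirrors the happy path of _create_python_proxy above,
    # without the other_libs fallback or error handling.
    name_parts = scala_class_name.split('.')
    relevant_path = ".".join(name_parts[name_parts.index('sparktk') + 1:])
    return "tc.%s._from_scala(tc, scala_obj)" % relevant_path

print(scala_name_to_proxy_cmd("org.trustedanalytics.sparktk.models.clustering.kmeans.KMeansModel"))
# tc.models.clustering.kmeans.KMeansModel._from_scala(tc, scala_obj)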
Example #10
0
class TkContext(object):
    """TK Context - grounding object for the sparktk library"""
    _other_libs = None

    def __init__(self, sc=None, other_libs=None, **create_sc_kwargs):
        if not sc:
            if SparkContext._active_spark_context:
                sc = SparkContext._active_spark_context
            else:
                sc = create_sc(other_libs=other_libs, **create_sc_kwargs)
        if type(sc) is not SparkContext:
            raise TypeError("sparktk context init requires a valid SparkContext.  Received type %s" % type(sc))
        self._sc = sc
        self._sql_context = None
        self._jtc = self._sc._jvm.org.trustedanalytics.sparktk.TkContext(self._sc._jsc)
        self._jutils = JUtils(self._sc)
        self._scala_sc = self._jutils.get_scala_sc()
        self._other_libs = other_libs if other_libs is None or isinstance(other_libs, list) else [other_libs]
        if self._other_libs is not None:
            for lib in self._other_libs:
                lib_obj = lib.get_main_object(self)
                setattr(self, lib.__name__, lib_obj)
        loggers.set_spark(self._sc, "off")  # todo: undo this/move to config, I just want it outta my face most of the time

    from sparktk.arguments import implicit

    @staticmethod
    def validate(tc, arg_name='tc'):
        """
        Raises a ValueError if the tc variable is not of type TkContext

        Since tc is so commonly used as an implicit variable, it's worth the special code here to save a lot of imports that would otherwise be needed

        """
        require_type(tc, arg_name, TkContext)

    @property
    def sc(self):
        return self._sc

    @property
    def sql_context(self):
        if self._sql_context is None:
            from pyspark.sql import SQLContext
            self._sql_context = SQLContext(self.sc)
        return self._sql_context

    @property
    def jutils(self):
        return self._jutils

    @property
    def models(self):
        """access to the various models of sparktk"""
        return get_lazy_loader(self, "models", implicit_kwargs={'tc': self})

    @property
    def frame(self):
        return get_lazy_loader(self, "frame", implicit_kwargs={'tc': self}).frame  # .frame to account for extra 'frame' in name vis-a-vis scala

    @property
    def graph(self):
        return get_lazy_loader(self, "graph", implicit_kwargs={'tc': self}).graph  # .graph to account for extra 'graph' in name vis-a-vis scala

    @property
    def dicom(self):
        return get_lazy_loader(self, "dicom", implicit_kwargs={'tc': self}).dicom  # .dicom to account for extra 'dicom' in name vis-a-vis scala


    @property
    def examples(self):
        return get_lazy_loader(self, "examples", implicit_kwargs={'tc': self})

    def load(self, path, validate_type=None):
        """loads object from the given path (if validate_type is provided, error raised if loaded obj does not match"""
        loaders_map = None
        if self._other_libs is not None:
            other_loaders = []
            for other_lib in self._other_libs:
                other_loaders.append(other_lib.get_loaders(self))
            loaders_map = self.jutils.convert.combine_scala_maps(other_loaders)
        scala_obj = self._jtc.load(path, self.jutils.convert.to_scala_option(loaders_map))
        python_obj = self._create_python_proxy(scala_obj)
        if validate_type and not isinstance(python_obj, validate_type):
            raise RuntimeError("load expected to get type %s but got type %s" % (validate_type, type(python_obj)))
        return python_obj

    def _create_python_proxy(self, scala_obj):
        """Create a python object for the scala_obj

        Convention is such that the python proxy object is available off the TkContext with the SAME
        path that the object has in Scala, starting with sparktk.

        Example:

        org.trustedanalytics.sparktk.models.clustering.kmeans.KMeansModel

        means a call to

        tc.models.clustering.kmeans.KMeansModel._from_scala(tc, scala_obj)

        The signature is simply the python tc and the reference to the scala obj
        """
        name_parts = scala_obj.getClass().getName().split('.')
        try:
            relevant_path = ".".join(name_parts[name_parts.index('sparktk')+1:])
        except ValueError as e:
            # check if it's from another library
            relevant_path = ""
            for other_lib in self._other_libs:
                other_lib_name = other_lib.__name__
                if other_lib_name in name_parts:
                    relevant_path = ".".join(name_parts[name_parts.index(other_lib_name):])
                    break
            # if it's not from one of the other libraries, then raise an error
            if relevant_path == "":
                raise ValueError("Trouble with class name %s, %s" % ('.'.join(name_parts), str(e)))
        cmd = "tc.%s._from_scala(tc, scala_obj)" % relevant_path
        logger.debug("tkcontext._create_python_proxy cmd=%s", cmd)
        proxy = eval(cmd, {"tc": self, "scala_obj": scala_obj})
        return proxy

    @property
    def agg(self):
        """access to the aggregation function enumeration"""
        from sparktk.frame.ops.group_by import agg
        return agg
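
Because tc.agg simply re-exports the aggregation enumeration used by Frame.group_by, it can be passed straight into a group_by call. A hedged usage sketch, reusing the cities example frame shown earlier (agg.count is assumed to be one of the enumeration members):

# group the example cities frame by its 'county' column and count rows per group
cities = tc.examples.frames.get_cities_frame()
by_county = cities.group_by('county', tc.agg.count)
by_county.inspect()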
Example #11
0
    def __init__(self,
                 sc=None,
                 master=None,
                 py_files=None,
                 spark_home=None,
                 sparktk_home=None,
                 pyspark_submit_args=None,
                 app_name="sparktk",
                 other_libs=None,
                 extra_conf=None,
                 use_local_fs=False,
                 debug=None):
        """
        Creates a TkContext.

        If a SparkContext sc is not provided, a new Spark context will be created using the
        given settings, with default values for anything unspecified.

        :param sc: (SparkContext) Active Spark Context; if not provided, a new Spark Context is created with the
                   rest of the args
        :param master: (str) override spark master setting; for ex. 'local[4]' or 'yarn-client'
        :param py_files: (list) list of str of paths to python dependencies; Note that the current python
                         package will be freshly zipped up and put in a tmp folder for shipping by spark, and then removed
        :param spark_home: (str) override $SPARK_HOME, the location of spark
        :param sparktk_home: (str) override $SPARKTK_HOME, the location of spark-tk
        :param pyspark_submit_args: (str) extra args passed to the pyspark submit
        :param app_name: (str) name of the spark app that will be created
        :param other_libs: (list) other libraries (actual python packages or modules) that are compatible with spark-tk,
                           which need to be added to the spark context.  These libraries must be developed for use with
                           spark-tk and have particular methods implemented.  (See sparkconf.py _validate_other_libs)
        :param extra_conf: (dict) dict for any extra spark conf settings, for ex. {"spark.hadoop.fs.default.name": "file:///"}
        :param use_local_fs: (bool) simpler way to specify using the local file system, rather than hdfs or other
        :param debug: (int or str) provide a port address to attach a debugger to the JVM that gets started
        :return: TkContext
        """
        if not sc:
            if SparkContext._active_spark_context:
                sc = SparkContext._active_spark_context
            else:
                sc = create_sc(master=master,
                               py_files=py_files,
                               spark_home=spark_home,
                               sparktk_home=sparktk_home,
                               pyspark_submit_args=pyspark_submit_args,
                               app_name=app_name,
                               other_libs=other_libs,
                               extra_conf=extra_conf,
                               use_local_fs=use_local_fs,
                               debug=debug)
        if type(sc) is not SparkContext:
            raise TypeError(
                "sparktk context init requires a valid SparkContext.  Received type %s"
                % type(sc))
        self._sc = sc
        self._sql_context = None
        self._jtc = self._sc._jvm.org.trustedanalytics.sparktk.TkContext(
            self._sc._jsc)
        self._jutils = JUtils(self._sc)
        self._scala_sc = self._jutils.get_scala_sc()
        self._other_libs = other_libs if other_libs is None or isinstance(
            other_libs, list) else [other_libs]
        if self._other_libs is not None:
            for lib in self._other_libs:
                lib_obj = lib.get_main_object(self)
                setattr(self, lib.__name__, lib_obj)
        loggers.set_spark(
            self._sc, "off"
        )  # todo: undo this/move to config, I just want it outta my face most of the time
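
The keyword arguments documented above map directly onto TkContext construction. A minimal sketch, assuming the package-level TkContext export and using only values quoted in the docstring:

import sparktk

# local master, local file system; extra_conf could be used instead of use_local_fs,
# e.g. extra_conf={"spark.hadoop.fs.default.name": "file:///"}
tc = sparktk.TkContext(master='local[4]',
                       app_name='sparktk',
                       use_local_fs=True)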
Example #12
0
class TkContext(object):
    """TK Context - grounding object for the sparktk library"""
    _other_libs = None

    def __init__(self,
                 sc=None,
                 master=None,
                 py_files=None,
                 spark_home=None,
                 sparktk_home=None,
                 pyspark_submit_args=None,
                 app_name="sparktk",
                 other_libs=None,
                 extra_conf=None,
                 use_local_fs=False,
                 debug=None):
        """
        Creates a TkContext.

        If a SparkContext sc is not provided, a new Spark context will be created using the
        given settings, with default values for anything unspecified.

        :param sc: (SparkContext) Active Spark Context; if not provided, a new Spark Context is created with the
                   rest of the args
        :param master: (str) override spark master setting; for ex. 'local[4]' or 'yarn-client'
        :param py_files: (list) list of str of paths to python dependencies; Note that the current python
                         package will be freshly zipped up and put in a tmp folder for shipping by spark, and then removed
        :param spark_home: (str) override $SPARK_HOME, the location of spark
        :param sparktk_home: (str) override $SPARKTK_HOME, the location of spark-tk
        :param pyspark_submit_args: (str) extra args passed to the pyspark submit
        :param app_name: (str) name of the spark app that will be created
        :param other_libs: (list) other libraries (actual python packages or modules) that are compatible with spark-tk,
                           which need to be added to the spark context.  These libraries must be developed for use with
                           spark-tk and have particular methods implemented.  (See sparkconf.py _validate_other_libs)
        :param extra_conf: (dict) dict for any extra spark conf settings, for ex. {"spark.hadoop.fs.default.name": "file:///"}
        :param use_local_fs: (bool) simpler way to specify using the local file system, rather than hdfs or other
        :param debug: (int or str) provide a port address to attach a debugger to the JVM that gets started
        :return: TkContext
        """
        if not sc:
            if SparkContext._active_spark_context:
                sc = SparkContext._active_spark_context
            else:
                sc = create_sc(master=master,
                               py_files=py_files,
                               spark_home=spark_home,
                               sparktk_home=sparktk_home,
                               pyspark_submit_args=pyspark_submit_args,
                               app_name=app_name,
                               other_libs=other_libs,
                               extra_conf=extra_conf,
                               use_local_fs=use_local_fs,
                               debug=debug)
        if type(sc) is not SparkContext:
            raise TypeError(
                "sparktk context init requires a valid SparkContext.  Received type %s"
                % type(sc))
        self._sc = sc
        self._sql_context = None
        self._jtc = self._sc._jvm.org.trustedanalytics.sparktk.TkContext(
            self._sc._jsc)
        self._jutils = JUtils(self._sc)
        self._scala_sc = self._jutils.get_scala_sc()
        self._other_libs = other_libs if other_libs is None or isinstance(
            other_libs, list) else [other_libs]
        if self._other_libs is not None:
            for lib in self._other_libs:
                lib_obj = lib.get_main_object(self)
                setattr(self, lib.__name__, lib_obj)
        loggers.set_spark(
            self._sc, "off"
        )  # todo: undo this/move to config, I just want it outta my face most of the time

    from sparktk.arguments import implicit

    @staticmethod
    def validate(tc, arg_name='tc'):
        """
        Raises a ValueError if the tc variable is not of type TkContext

        Since tc is so commonly used as an implicit variable, it's worth the special code here to save a lot of imports that would otherwise be needed

        """
        require_type(TkContext, tc, arg_name)

    @property
    def sc(self):
        return self._sc

    @property
    def sql_context(self):
        if self._sql_context is None:
            from pyspark.sql import SQLContext
            self._sql_context = SQLContext(self.sc)
        return self._sql_context

    @property
    def jutils(self):
        return self._jutils

    @property
    def models(self):
        """access to the various models of sparktk"""
        return get_lazy_loader(self, "models", implicit_kwargs={'tc': self})

    @property
    def frame(self):
        return get_lazy_loader(self, "frame", implicit_kwargs={
            'tc': self
        }).frame  # .frame to account for extra 'frame' in name vis-a-vis scala

    @property
    def graph(self):
        return get_lazy_loader(self, "graph", implicit_kwargs={
            'tc': self
        }).graph  # .graph to account for extra 'graph' in name vis-a-vis scala

    @property
    def dicom(self):
        return get_lazy_loader(self, "dicom", implicit_kwargs={
            'tc': self
        }).dicom  # .dicom to account for extra 'dicom' in name vis-a-vis scala

    @property
    def examples(self):
        return get_lazy_loader(self, "examples", implicit_kwargs={'tc': self})

    def load(self, path, validate_type=None):
        """loads object from the given path (if validate_type is provided, error raised if loaded obj does not match"""
        loaders_map = None
        if self._other_libs is not None:
            other_loaders = []
            for other_lib in self._other_libs:
                other_loaders.append(other_lib.get_loaders(self))
            loaders_map = self.jutils.convert.combine_scala_maps(other_loaders)
        scala_obj = self._jtc.load(
            path, self.jutils.convert.to_scala_option(loaders_map))
        python_obj = self._create_python_proxy(scala_obj)
        if validate_type and not isinstance(python_obj, validate_type):
            raise RuntimeError("load expected to get type %s but got type %s" %
                               (validate_type, type(python_obj)))
        return python_obj

    def _create_python_proxy(self, scala_obj):
        """Create a python object for the scala_obj

        Convention is such that the python proxy object is available off the TkContext with the SAME
        path that the object has in Scala, starting with sparktk.

        Example:

        org.trustedanalytics.sparktk.models.clustering.kmeans.KMeansModel

        means a call to

        tc.models.clustering.kmeans.KMeansModel._from_scala(tc, scala_obj)

        The signature is simply the python tc and the reference to the scala obj
        """
        name_parts = scala_obj.getClass().getName().split('.')
        try:
            relevant_path = ".".join(name_parts[name_parts.index('sparktk') +
                                                1:])
        except ValueError as e:
            # check if it's from another library
            relevant_path = ""
            for other_lib in self._other_libs:
                other_lib_name = other_lib.__name__
                if other_lib_name in name_parts:
                    relevant_path = ".".join(
                        name_parts[name_parts.index(other_lib_name):])
                    break
            # if it's not from one of the other libraries, then raise an error
            if relevant_path == "":
                raise ValueError("Trouble with class name %s, %s" %
                                 ('.'.join(name_parts), str(e)))
        cmd = "tc.%s._from_scala(tc, scala_obj)" % relevant_path
        logger.debug("tkcontext._create_python_proxy cmd=%s", cmd)
        proxy = eval(cmd, {"tc": self, "scala_obj": scala_obj})
        return proxy

    @property
    def agg(self):
        """access to the aggregation function enumeration"""
        from sparktk.frame.ops.group_by import agg
        return agg
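
TkContext.validate exists to guard functions that receive tc implicitly (the lazy loaders above inject it via implicit_kwargs={'tc': self}). A hedged sketch of that pattern; cities_sample is a hypothetical helper, and TkContext.implicit refers to the class-level import shown above:

def cities_sample(n=5, tc=TkContext.implicit):
    # Hypothetical helper: raises a ValueError unless tc was bound to a real TkContext
    TkContext.validate(tc)
    return tc.examples.frames.get_cities_frame().take(n)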
Example #13
0
    def __init__(self,
                 sc=None,
                 master=None,
                 py_files=None,
                 spark_home=None,
                 sparktk_home=None,
                 pyspark_submit_args=None,
                 app_name="sparktk",
                 other_libs=None,
                 extra_conf=None,
                 use_local_fs=False,
                 debug=None):
        """
        Creates a TkContext.

        If a SparkContext sc is not provided, a new Spark context will be created using the
        given settings, with default values for anything unspecified.

        :param sc: (SparkContext) Active Spark Context; if not provided, a new Spark Context is created with the
                   rest of the args
        :param master: (str) override spark master setting; for ex. 'local[4]' or 'yarn-client'
        :param py_files: (list) list of str of paths to python dependencies; Note that the current python
                         package will be freshly zipped up and put in a tmp folder for shipping by spark, and then removed
        :param spark_home: (str) override $SPARK_HOME, the location of spark
        :param sparktk_home: (str) override $SPARKTK_HOME, the location of spark-tk
        :param pyspark_submit_args: (str) extra args passed to the pyspark submit
        :param app_name: (str) name of the spark app that will be created
        :param other_libs: (list) other libraries (actual python packages or modules) that are compatible with spark-tk,
                           which need to be added to the spark context.  These libraries must be developed for use with
                           spark-tk and have particular methods implemented.  (See sparkconf.py _validate_other_libs)
        :param extra_conf: (dict) dict for any extra spark conf settings, for ex. {"spark.hadoop.fs.default.name": "file:///"}
        :param use_local_fs: (bool) simpler way to specify using the local file system, rather than hdfs or other
        :param debug: (int or str) provide a port address to attach a debugger to the JVM that gets started
        :return: TkContext
        """
        if not sc:
            if SparkContext._active_spark_context:
                sc = SparkContext._active_spark_context
            else:
                sc = create_sc(master=master,
                               py_files=py_files,
                               spark_home=spark_home,
                               sparktk_home=sparktk_home,
                               pyspark_submit_args=pyspark_submit_args,
                               app_name=app_name,
                               other_libs=other_libs,
                               extra_conf=extra_conf,
                               use_local_fs=use_local_fs,
                               debug=debug)
        if type(sc) is not SparkContext:
            raise TypeError("sparktk context init requires a valid SparkContext.  Received type %s" % type(sc))
        self._sc = sc
        self._sql_context = None
        self._jtc = self._sc._jvm.org.trustedanalytics.sparktk.TkContext(self._sc._jsc)
        self._jutils = JUtils(self._sc)
        self._scala_sc = self._jutils.get_scala_sc()
        self._other_libs = other_libs if other_libs is None or isinstance(other_libs, list) else [other_libs]
        if self._other_libs is not None:
            for lib in self._other_libs:
                lib_obj = lib.get_main_object(self)
                setattr(self, lib.__name__, lib_obj)
        loggers.set_spark(self._sc, "off")  # todo: undo this/move to config, I just want it outta my face most of the time
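
The other_libs handling above (get_main_object attached onto the context, get_loaders merged during load in the earlier examples) implies a small plugin contract. A hedged sketch of what such a module might look like; the name mylib and everything in it are hypothetical, and only the two functions referenced by TkContext are shown:

# mylib/__init__.py  (hypothetical spark-tk-compatible library)

class MyLibMainObject(object):
    def __init__(self, tc):
        self.tc = tc

def get_main_object(tc):
    # TkContext.__init__ attaches this return value as tc.mylib
    return MyLibMainObject(tc)

def get_loaders(tc):
    # TkContext.load combines this with sparktk's own loaders via
    # jutils.convert.combine_scala_maps; actual wiring is library-specific
    # and omitted from this sketch.
    raise NotImplementedError("loader map construction is library-specific")

# usage (hypothetical):
#   import mylib
#   tc = TkContext(other_libs=[mylib])
#   tc.mylib  # -> MyLibMainObject instance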
Example #14
0
class TkContext(object):
    """TK Context - grounding object for the sparktk library"""
    _other_libs = None

    def __init__(self,
                 sc=None,
                 master=None,
                 py_files=None,
                 spark_home=None,
                 sparktk_home=None,
                 pyspark_submit_args=None,
                 app_name="sparktk",
                 other_libs=None,
                 extra_conf=None,
                 use_local_fs=False,
                 debug=None):
        """
        Creates a TkContext.

        If a SparkContext sc is not provided, a new Spark context will be created using the
        given settings, with default values for anything unspecified.

        :param sc: (SparkContext) Active Spark Context; if not provided, a new Spark Context is created with the
                   rest of the args
        :param master: (str) override spark master setting; for ex. 'local[4]' or 'yarn-client'
        :param py_files: (list) list of str of paths to python dependencies; Note that the current python
                         package will be freshly zipped up and put in a tmp folder for shipping by spark, and then removed
        :param spark_home: (str) override $SPARK_HOME, the location of spark
        :param sparktk_home: (str) override $SPARKTK_HOME, the location of spark-tk
        :param pyspark_submit_args: (str) extra args passed to the pyspark submit
        :param app_name: (str) name of the spark app that will be created
        :param other_libs: (list) other libraries (actual python packages or modules) that are compatible with spark-tk,
                           which need to be added to the spark context.  These libraries must be developed for use with
                           spark-tk and have particular methods implemented.  (See sparkconf.py _validate_other_libs)
        :param extra_conf: (dict) dict for any extra spark conf settings, for ex. {"spark.hadoop.fs.default.name": "file:///"}
        :param use_local_fs: (bool) simpler way to specify using the local file system, rather than hdfs or other
        :param debug: (int or str) provide a port address to attach a debugger to the JVM that gets started
        :return: TkContext
        """
        if not sc:
            if SparkContext._active_spark_context:
                sc = SparkContext._active_spark_context
            else:
                sc = create_sc(master=master,
                               py_files=py_files,
                               spark_home=spark_home,
                               sparktk_home=sparktk_home,
                               pyspark_submit_args=pyspark_submit_args,
                               app_name=app_name,
                               other_libs=other_libs,
                               extra_conf=extra_conf,
                               use_local_fs=use_local_fs,
                               debug=debug)
        if type(sc) is not SparkContext:
            raise TypeError("sparktk context init requires a valid SparkContext.  Received type %s" % type(sc))
        self._sc = sc
        self._sql_context = None
        self._jtc = self._sc._jvm.org.trustedanalytics.sparktk.TkContext(self._sc._jsc)
        self._jutils = JUtils(self._sc)
        self._scala_sc = self._jutils.get_scala_sc()
        self._other_libs = other_libs if other_libs is None or isinstance(other_libs, list) else [other_libs]
        if self._other_libs is not None:
            for lib in self._other_libs:
                lib_obj = lib.get_main_object(self)
                setattr(self, lib.__name__, lib_obj)
        loggers.set_spark(self._sc, "off")  # todo: undo this/move to config, I just want it outta my face most of the time

    from sparktk.arguments import implicit

    @staticmethod
    def validate(tc, arg_name='tc'):
        """
        Raises a ValueError if the tc variable is not of type TkContext

        Since tc is so commonly used as an implicit variable, it's worth the special code here to save a lot of imports that would otherwise be needed

        """
        require_type(TkContext, tc, arg_name)

    @property
    def sc(self):
        return self._sc

    @property
    def sql_context(self):
        if self._sql_context is None:
            from pyspark.sql import SQLContext
            self._sql_context = SQLContext(self.sc)
        return self._sql_context

    @property
    def jutils(self):
        return self._jutils

    @property
    def models(self):
        """access to the various models of sparktk"""
        return get_lazy_loader(self, "models", implicit_kwargs={'tc': self})

    @property
    def frame(self):
        return get_lazy_loader(self, "frame", implicit_kwargs={'tc': self}).frame  # .frame to account for extra 'frame' in name vis-a-vis scala

    @property
    def graph(self):
        return get_lazy_loader(self, "graph", implicit_kwargs={'tc': self}).graph  # .graph to account for extra 'graph' in name vis-a-vis scala

    @property
    def dicom(self):
        return get_lazy_loader(self, "dicom", implicit_kwargs={'tc': self}).dicom  # .dicom to account for extra 'dicom' in name vis-a-vis scala


    @property
    def examples(self):
        return get_lazy_loader(self, "examples", implicit_kwargs={'tc': self})

    def load(self, path, validate_type=None):
        """loads object from the given path (if validate_type is provided, error raised if loaded obj does not match"""
        loaders_map = None
        if self._other_libs is not None:
            other_loaders = []
            for other_lib in self._other_libs:
                other_loaders.append(other_lib.get_loaders(self))
            loaders_map = self.jutils.convert.combine_scala_maps(other_loaders)
        scala_obj = self._jtc.load(path, self.jutils.convert.to_scala_option(loaders_map))
        python_obj = self._create_python_proxy(scala_obj)
        if validate_type and not isinstance(python_obj, validate_type):
            raise RuntimeError("load expected to get type %s but got type %s" % (validate_type, type(python_obj)))
        return python_obj

    def _create_python_proxy(self, scala_obj):
        """Create a python object for the scala_obj

        Convention is such that the python proxy object is available off the TkContext with the SAME
        path that the object has in Scala, starting with sparktk.

        Example:

        org.trustedanalytics.sparktk.models.clustering.kmeans.KMeansModel

        means a call to

        tc.models.clustering.kmeans.KMeansModel._from_scala(tc, scala_obj)

        The signature is simply the python tc and the reference to the scala obj
        """
        name_parts = scala_obj.getClass().getName().split('.')
        try:
            relevant_path = ".".join(name_parts[name_parts.index('sparktk')+1:])
        except ValueError as e:
            # check if it's from another library
            relevant_path = ""
            for other_lib in self._other_libs:
                other_lib_name = other_lib.__name__
                if other_lib_name in name_parts:
                    relevant_path = ".".join(name_parts[name_parts.index(other_lib_name):])
                    break
            # if it's not from one of the other libraries, then raise an error
            if relevant_path == "":
                raise ValueError("Trouble with class name %s, %s" % ('.'.join(name_parts), str(e)))
        cmd = "tc.%s._from_scala(tc, scala_obj)" % relevant_path
        logger.debug("tkcontext._create_python_proxy cmd=%s", cmd)
        proxy = eval(cmd, {"tc": self, "scala_obj": scala_obj})
        return proxy

    @property
    def agg(self):
        """access to the aggregation function enumeration"""
        from sparktk.frame.ops.group_by import agg
        return agg
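
tc.sql_context is just a lazily created pyspark SQLContext built from the same SparkContext, so standard pyspark calls apply. A small usage sketch with made-up data:

# build a pyspark DataFrame through the context's lazily created SQLContext
df = tc.sql_context.createDataFrame([(1, "a"), (2, "b")], ["id", "letter"])
df.show()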