示例#1
0
    def colsWithLabel(self, labels=None):
        """Return the names of the columns that carry every one of the given labels.

            When ``labels`` is empty (or None), the names of the columns that
            have no label at all are returned instead.

            Args:
                labels (list(string)): labels a column must all carry to be selected

            Returns:
                (list(string)) names of the matching columns, in field order

            Raises:
                SmvRuntimeError: if no column satisfies the criterion
        """
        def _is_selected(meta):
            if not labels:
                # no label requested: keep only the unlabeled columns
                return not _getMetaLabels(meta)
            # the requested labels must be a subset of the column's labels
            return set(labels) <= set(_getMetaLabels(meta))

        selected = []
        for field in self.fields:
            if _is_selected(field.metadata):
                selected.append(field.name)

        if selected:
            return selected

        if labels:
            raise SmvRuntimeError(
                "there are no columns labeled with {{{}}} in {}".format(
                    ", ".join(labels), self.df))
        raise SmvRuntimeError(
            "there are no unlabeled columns in {}".format(self.df))
示例#2
0
def infer_full_name_from_part(full_names, part_name):
    """Resolve a partial name (a postfix) to the single full name ending with it.

    Args:
        full_names: iterable of candidate full names
        part_name: the postfix to look for

    Returns:
        The only element of `full_names` that ends with `part_name`.

    Raises:
        SmvRuntimeError: if no full name, or more than one, ends with `part_name`
    """
    matches = list(filter(lambda full: full.endswith(part_name), full_names))

    if len(matches) == 1:
        return matches[0]
    if not matches:
        raise SmvRuntimeError("Can't find name {}".format(part_name))
    raise SmvRuntimeError("Partial name {} is ambiguous".format(part_name))
示例#3
0
 def run_delayed_postAction(mod, state):
     """Run the post action and the metadata handling of `mod` if it is still pending.

         Nothing happens unless `mod` is in the run set carried by `state`;
         once processed, `mod` is discarded from that run set.

         `self` and `not_persisted_or_no_edd_when_forced` are not parameters:
         they are taken from the enclosing scope, which is outside this view.

         Args:
             mod: the module to process
             state: a `(run_set, coll)` pair; `coll` is handed to
                 `mod._collect_runinfo_and_update_hist`

         Raises:
             SmvRuntimeError: if metadata has to be computed but `mod.data` is None
     """
     (_run_set, coll) = state
     if (mod in _run_set):
         smv.logger.debug("Run post_action of {} from {}".format(
             mod.fqn(), self.fqn()))
         mod._post_action()
         meta_io_strategy = mod.metaStrategy()
         if (not_persisted_or_no_edd_when_forced(meta_io_strategy)):
             # data cache should be populated by this step
             if (mod.data is None):
                 raise SmvRuntimeError(
                     "Module {}'s data is None, can't run postAction".
                     format(mod.fqn()))
             # Since the ancestor list will be visited as depth-first, although
             # user_meta may trigger actions, the upper stream modules' post action
             # are already run. No need to call _run_ancestor_and_me_postAction
             # in the calculate_user_meta() any more
             mod._calculate_user_meta()
             mod._finalize_meta()
             mod._validate_meta()
             mod._persist_meta()
             mod._collect_runinfo_and_update_hist(coll)
         else:
             # metadata is not recomputed in this branch: read it back through the strategy
             meta_json = meta_io_strategy.read()
             # NOTE(review): the metadata read from `mod`'s strategy is stored on
             # `self` (the enclosing module), not on `mod` -- confirm this is
             # intended and should not be `mod.module_meta`
             self.module_meta = SmvMetaData().fromJson(meta_json)
         # mark `mod` as handled so that it is not processed again
         _run_set.discard(mod)
示例#4
0
    def _all_providers(self):
        """Collect the "provider" classes found in the user libraries and the smv libraries.

            Returns:
                (dict) discovered provider classes, keyed by their provider type fqn

            Raises:
                SmvRuntimeError: if two discovered providers share the same fqn
        """
        def _qualifies(klass):
            """Predicate: `IS_PROVIDER` is True and `provider_type()` is truthy.

               The base `SmvProvider` returns an empty string for its provider
               type, so it does not qualify. A class missing either attribute
               does not qualify either.
            """
            try:
                if klass.IS_PROVIDER is not True:
                    return False
                return klass.provider_type()
            except AttributeError:
                return False

        found = {}

        # providers can be in user libs dir or builtin smv
        for lib_name in self.smvApp.userLibs() + self.smvApp.smvLibs():
            pymodule = self.load_pymodule(lib_name)
            for provider in self._matchingClassesInPyModule(pymodule, _qualifies):
                fqn = provider.provider_type_fqn()
                if fqn in found:
                    raise SmvRuntimeError(
                        "multiple providers with same fqn: " + fqn)
                found[fqn] = provider

        return found
示例#5
0
 def _assure_output_type(self, run_output):
     """Check that the output of the run method is an H2OFrame.

         Args:
             run_output: the object returned by the run method

         Raises:
             SmvRuntimeError: if `run_output` is not an `h2o.H2OFrame`
     """
     # TODO move this back to top
     import h2o
     if isinstance(run_output, h2o.H2OFrame):
         return
     raise SmvRuntimeError(
         'The run method output should be an H2OFrame, but {} is given.'
         .format(type(run_output)))
示例#6
0
文件: smvapp.py 项目: bakhalea/SMV
 def getStageFromModuleFqn(self, fqn):
     """Return the name of the stage a module fqn belongs to.

         A stage owns the fqn when the fqn starts with the stage name followed
         by a dot; the first such stage in `self.stages()` is returned.

         Raises:
             SmvRuntimeError: if no stage owns the fqn
     """
     owning = list(
         filter(lambda stage: fqn.startswith(stage + "."), self.stages()))
     if owning:
         return owning[0]
     raise SmvRuntimeError("Can't find {} from stages {}".format(
         fqn, ", ".join(self.stages())))
示例#7
0
    def all_data_dirs(self):
        """Build the dictionary of data directory settings.

            The data dir is the first non-empty value among the `dataDir`
            command line option, the `smv.dataDir` property and the deprecated
            `DATA_DIR` environment variable (a warning is printed when the
            latter is used). Each sub dir is taken from its command line
            option, else from its `smv.<name>` property, else it defaults to a
            folder under the data dir.

            Returns:
                (dict) with keys `dataDir`, `inputDir`, `outputDir`, `lockDir`,
                `historyDir`, `publishDir` and `publishVersion`

            Raises:
                SmvRuntimeError: if the data dir is not specified anywhere
        """
        props = self.merged_props()

        data_dir = self.cmdline.get('dataDir') or props.get('smv.dataDir')
        if not data_dir:
            data_dir = os.getenv('DATA_DIR', None)
            if not data_dir:
                raise SmvRuntimeError("Must specify a data-dir either on command line or in conf.")
            print("WARNING: use of DATA_DIR environment variable is deprecated. use smv.dataDir instead!!!")

        def sub_dir(name, default):
            # command line first, then props, then the default under data_dir
            return (self.cmdline.get(name)
                    or props.get('smv.' + name)
                    or "{}/{}".format(data_dir, default))

        dirs = {'dataDir': data_dir}
        for key, folder in (('inputDir', "input"),
                            ('outputDir', "output"),
                            ('lockDir', "lock"),
                            ('historyDir', "history"),
                            ('publishDir', 'publish')):
            dirs[key] = sub_dir(key, folder)
        dirs['publishVersion'] = self.cmdline.get('publish')
        return dirs
示例#8
0
 def dependencies(self):
     """Return the required model followed by the modules from `requiresDS()`.

         Raises:
             SmvRuntimeError: if `requiresModel()` does not return an SmvModel
                 or a link to one
     """
     model_mod = self.requiresModel()
     if self._targetIsSmvModel(model_mod):
         return [model_mod] + self.requiresDS()
     raise SmvRuntimeError(
         "requiresModel method must return an SmvModel or a link to one"
     )
示例#9
0
    def smvGetRunConfig(self, key):
        """Return the current user run configuration value for the given key.

            Args:
                key: a configuration key, which must be listed by `requiresConfig()`

            Raises:
                SmvRuntimeError: if `key` is not listed by `requiresConfig()`
        """
        if key in self.requiresConfig():
            return self.smvApp.getConf(key)

        raise SmvRuntimeError(
            "RunConfig key {} was not specified in requiresConfig method{}."
            .format(key, self.requiresConfig()))
示例#10
0
 def metadataJson(self, jdf):
     """Compute the user's metadata for `jdf` and serialize it to JSON for py4j transport.

         Raises:
             SmvRuntimeError: if the user's metadata is not a dict
     """
     user_meta = self.metadata(DataFrame(jdf, self.smvApp.sqlContext))
     if isinstance(user_meta, dict):
         return json.dumps(user_meta)
     raise SmvRuntimeError("User metadata {} is not a dict".format(repr(user_meta)))
示例#11
0
    def write(self, smvSchema):
        """Write the schema, one entry per line, to `self._file_path`.

            Only the "overwrite" write mode (case-insensitive) is supported:
            the existing target is removed before writing.

            Raises:
                SmvRuntimeError: if the write mode is not "overwrite"
        """
        schema_lines = scala_seq_to_list(self.smvApp._jvm, smvSchema.toStringsWithMeta())
        schema_str = "\n".join(schema_lines)

        if self._write_mode.lower() != "overwrite":
            raise SmvRuntimeError("Write mode {} is not implemented yet. (Only support overwrite)".format(self._write_mode))

        self._remove()
        self.smvApp._jvm.SmvHDFS.writeToFile(schema_str, self._file_path)
示例#12
0
 def validateMetadataJson(self, currentJson, historyJson):
     """Decode the JSON metadata (jsonified for py4j transport) and run the user's validation.

         Args:
             currentJson: JSON string of the current metadata
             historyJson: iterable of JSON strings of the historical metadata

         Returns:
             What `validateMetadata` returned: None or a string.

         Raises:
             SmvRuntimeError: if the validation result is neither None nor a string
     """
     current = json.loads(currentJson)
     history = list(map(json.loads, historyJson))
     res = self.validateMetadata(current, history)
     if res is None or is_string(res):
         return res
     raise SmvRuntimeError("Validation failure message {} is not a string".format(repr(res)))
示例#13
0
 def _validate_meta(self):
     """Run the user's metadata validation against the metadata history.

         Raises:
             SmvRuntimeError: if the validation result is neither None nor a string
             SmvMetadataValidationError: if the validation result is a non-empty
                 string; that string is the error message
     """
     hist = self.smvApp._read_meta_hist(self)
     res = self.validateMetadata(self.module_meta, hist)
     res_is_text = is_string(res)

     if res is not None and not res_is_text:
         raise SmvRuntimeError(
             "Validation failure message {} is not a string".format(
                 repr(res)))

     if res_is_text and len(res) > 0:
         raise SmvMetadataValidationError(res)
示例#14
0
    def write(self, raw_data):
        """Save `raw_data` as CSV at `self._file_path` using `self._smv_schema`.

            Only the "overwrite" write mode (case-insensitive) is supported:
            the existing target is removed before saving.

            Raises:
                SmvRuntimeError: if the write mode is not "overwrite"
        """
        jdf = raw_data._jdf

        if self._write_mode.lower() != "overwrite":
            raise SmvRuntimeError("Write mode {} is not implemented yet. (Only support overwrite)".format(self._write_mode))

        self._remove()
        io_handler = self.smvApp.j_smvPyClient.createFileIOHandler(self._file_path)
        io_handler.saveAsCsv(jdf, self._smv_schema)
示例#15
0
 def _assert_single_input(self):
     """Ensure that `requiresDS()` lists exactly one module.

         This method will not be called when SmvOutput is used as a mixin. It
         should be called by the doRun method when SmvOutput is used as a base
         class.

         Raises:
             SmvRuntimeError: if the number of required modules is not one
     """
     if len(self.requiresDS()) == 1:
         return
     input_fqns = ", ".join([m.fqn() for m in self.requiresDS()])
     raise SmvRuntimeError(
         "SmvOutput modules depend on a single input, more are given: {}".format(input_fqns))
示例#16
0
    def _checkColExistence(self, colNames):
        """Verify that every given column name exists in the DataFrame.

            Args:
                colNames (list(string)) a list of column names to check

            Raises:
                SmvRuntimeError: if some of the column names are not found
        """
        missing = set(colNames) - set(self.df.columns)
        if not missing:
            return
        raise SmvRuntimeError("{} does not have columns {}".format(
            self.df, ", ".join(missing)))
示例#17
0
    def _calculate_user_meta(self):
        """Add system metadata, then compute and add the user defined metadata.

            The user's `metadata` method is run on `self.data` through
            `_do_action_on_df`, so it could have action on the result df. The
            elapsed time reported by that call is stored in
            `self.userMetadataTimeElapsed`.

            Raises:
                SmvRuntimeError: if the user metadata is not a dict
        """
        self.module_meta.addSystemMeta(self)

        action_result = self._do_action_on_df(
            self.metadata, self.data, "GENERATE USER METADATA")
        user_meta, elapsed = action_result
        self.userMetadataTimeElapsed = elapsed

        if isinstance(user_meta, dict):
            self.module_meta.addUserMeta(user_meta)
            return

        raise SmvRuntimeError("User metadata {} is not a dict".format(
            repr(user_meta)))
示例#18
0
    def get_connection_by_name(self, name):
        """Instantiate the connection configured under the given name.

            The connection type is read from the `smv.conn.<name>.type`
            property; the provider registered as `conn.<type>` is then
            instantiated with the name and the merged props.

            Raises:
                SmvRuntimeError: if no type is configured for the connection name
        """
        props = self.py_smvconf.merged_props()
        type_key = "smv.conn.{}.type".format(name)

        if type_key not in props:
            raise SmvRuntimeError(
                "Connection name {} is not configured with a type".format(
                    name))

        provider_cls = self.get_provider_by_fqn(
            "conn.{}".format(props.get(type_key)))
        return provider_cls(name, props)
示例#19
0
 def resolveDataSet(self, ds):
     """Return cached resolved version of given SmvGenericModule if it exists, or resolve
         it otherwise.

         While a module is being resolved its fqn sits on `self.resolveStack`,
         which is how dependency cycles are detected. A successfully resolved
         module is time-stamped with `self.transaction_time` and cached in
         `self.fqn2res` under its fqn.

         Args:
             ds: the module to resolve

         Returns:
             The resolved module.

         Raises:
             SmvRuntimeError: if `ds` is already being resolved (dependency cycle)
     """
     fqn = ds.fqn()

     if fqn in self.resolveStack:
         raise SmvRuntimeError("Cycle found while resolving {}: {}".format(
             fqn, ", ".join(self.resolveStack)))

     if fqn in self.fqn2res:
         return self.fqn2res.get(fqn)

     self.resolveStack.append(fqn)
     try:
         resolvedDs = ds._resolve(self)
         resolvedDs._setTimestamp(self.transaction_time)
     finally:
         # always unwind the stack: an fqn left behind by a failed resolution
         # would be reported as a (false) cycle on the next attempt
         self.resolveStack.pop()

     self.fqn2res.update({fqn: resolvedDs})
     return resolvedDs
示例#20
0
    def _getMetaByName(self, colName):
        """Look up the metadata of the first column carrying the given name.

            Args:
                colName (string) the name of the column that is being looked for

            Returns:
                (dict) the metadata of the given column

            Raises:
                SmvRuntimeError: if there's no column matching the specified name
        """
        matching_meta = (field.metadata for field in self.fields
                         if field.name == colName)
        try:
            return next(matching_meta)
        except StopIteration:
            raise SmvRuntimeError("column name {} not found".format(colName))
示例#21
0
 def loadDataSet(self, fqns):
     """Return the resolved SmvGenericModules for a list of FQNs.

         A module already present in `self.fqn2res` is taken from there;
         otherwise its unresolved version is loaded from the repo and resolved.
         It is up to the caller to check that each fqn is in a stage of the
         SmvConfig stages.

         Args:
             fqns: iterable of module FQNs

         Returns:
             (list) resolved modules, in the order of `fqns`

         Raises:
             SmvRuntimeError: if the repo has no module for one of the FQNs
     """
     def _resolved(fqn):
         if fqn in self.fqn2res:
             return self.fqn2res.get(fqn)
         mod = self.repo.loadDataSet(fqn)
         if mod is None:
             raise SmvRuntimeError(
                 "Module {} does not exist".format(fqn))
         return self.resolveDataSet(mod)

     return [_resolved(fqn) for fqn in fqns]
示例#22
0
    def get_connection(self):
        """Get the data connection instance named by `connectionName()`.

            The connection is looked up by name through
            `smvApp.get_connection_by_name`, and its provider type must equal
            `connectionType()`.

            Raises:
                SmvRuntimeError: if the connection has a different type than expected
        """
        name = self.connectionName()
        conn = self.smvApp.get_connection_by_name(name)

        # the connection found under that name must be of the type this class expects
        actual_type = conn.provider_type()
        if actual_type != self.connectionType():
            template = "Connection {} has type {}, while {} need connection type {}"
            raise SmvRuntimeError(
                template.format(name, actual_type, self.__class__.__name__,
                                self.connectionType()))
        return conn
示例#23
0
    def addDesc(self, *colDescs):
        """Add descriptions to columns.

            Args:
                colDescs: (name, description) pairs

            Returns:
                Whatever `_updateColMeta` returns for the update.

            Raises:
                SmvRuntimeError: if no pair is provided, or (from the existence
                    check) if a named column does not exist
        """
        if not colDescs:
            raise SmvRuntimeError(
                "must provide (name, description) pair to add")

        self._checkColExistence([pair[0] for pair in colDescs])

        desc_by_name = dict(colDescs)

        return self._updateColMeta(
            lambda col: col.name in desc_by_name,
            lambda col: _setMetaDesc(col.metadata, desc_by_name[col.name]))
示例#24
0
    def addLabel(self, colNames, labels):
        """Add labels to the specified columns.

            If colNames are empty, the same set of labels is added to all columns.

            Args:
                colNames: names of the columns to label
                labels: labels to add

            Returns:
                Whatever `_updateColMeta` returns for the update.

            Raises:
                SmvRuntimeError: if no label is provided, or (from the existence
                    check) if a named column does not exist
        """
        if not labels:
            raise SmvRuntimeError("must provide a list of labels to add")

        # None stands for "every column"
        targets = None
        if colNames:
            self._checkColExistence(colNames)
            targets = set(colNames)

        return self._updateColMeta(
            lambda col: targets is None or col.name in targets,
            lambda col: _setMetaLabel(col.metadata, labels))
示例#25
0
    def readAsDF(self, readerLogger):
        """Read every non-hidden file under `fullPath()` as CSV and union them.

            Args:
                readerLogger: passed through to `readCsvFromFile` for each file

            Returns:
                A DataFrame wrapping the union of the data read from all files.

            Raises:
                SmvRuntimeError: if the dir holds no non-hidden file
        """
        entries = self.smvApp.j_smvPyClient.getDirList(self.fullPath())

        data_files = []
        for entry in entries:
            # ignore all hidden files in the data dir
            if entry.startswith("."):
                continue
            data_files.append("{}/{}".format(self.fullPath(), entry))

        if not data_files:
            raise SmvRuntimeError("There are no data files in {}".format(
                self.fullPath()))

        unioned = None
        for path in data_files:
            jdf = self.smvApp.j_smvPyClient.readCsvFromFile(
                path, self.smvSchema(), self.csvAttr(), readerLogger)
            if unioned is None:
                unioned = jdf
            else:
                unioned = unioned.unionAll(jdf)

        return DataFrame(unioned, self.smvApp.sqlContext)
示例#26
0
    def _all_providers(self):
        """Collect the "provider" classes found in the user, semi and smv libraries.

            A library that fails to load is skipped: its traceback is printed
            and a debug message is logged.

            Returns:
                (dict) discovered provider classes, keyed by their provider type fqn

            Raises:
                SmvRuntimeError: if two discovered providers share the same fqn
        """
        def _qualifies(klass):
            """Predicate: `IS_PROVIDER` is True and `provider_type()` is truthy.

               The base `SmvProvider` returns an empty string for its provider
               type, so it does not qualify. A class missing either attribute
               does not qualify either.
            """
            try:
                if klass.IS_PROVIDER is not True:
                    return False
                return klass.provider_type()
            except AttributeError:
                return False

        # providers can be in user libs dir or builtin smv
        lib_names = (self.smvApp.userLibs() + self.smvApp.semiLibs()
                     + self.smvApp.smvLibs())
        found = {}

        for lib_name in lib_names:
            try:
                pymodule = self.load_pymodule(lib_name)
            except Exception as err:
                # ignore the library if there is any loading error
                traceback.print_exc()
                reason = "{0}({1!r})".format(type(err).__name__, err.args)
                smv.logger.debug("Ignoring {} because it has error: {}".format(
                    lib_name, reason))
            else:
                candidates = self._matchingClassesInPyModule(pymodule,
                                                             _qualifies,
                                                             skip_abs=False)
                for provider in candidates:
                    fqn = provider.provider_type_fqn()
                    if fqn in found:
                        raise SmvRuntimeError(
                            "multiple providers with same fqn: " + fqn)
                    found[fqn] = provider

        return found
示例#27
0
文件: inputs.py 项目: bakhalea/SMV
    def doRun(self, known):
        """Read every non-hidden file of the connection's sub dir as CSV and union them.

            The dir is `dirName()` under the connection's `path`. All files are
            read with the same schema and the same reader logger.

            Args:
                known: not used by this method

            Returns:
                The union of the data read from all files.

            Raises:
                SmvRuntimeError: if the dir holds no non-hidden file
        """
        dir_path = os.path.join(self.get_connection().path, self.dirName())
        smv_schema = self.smvSchema()

        entries = self.smvApp._jvm.SmvHDFS.dirList(dir_path).array()

        data_files = []
        for entry in entries:
            # ignore all hidden files in the data dir
            if entry.startswith("."):
                continue
            data_files.append(os.path.join(dir_path, entry))

        if not data_files:
            raise SmvRuntimeError(
                "There are no data files in {}".format(dir_path))

        reader_logger = self._readerLogger()
        unioned = None
        for path in data_files:
            df = SmvCsvOnHdfsIoStrategy(self.smvApp, path, smv_schema,
                                        reader_logger).read()
            if unioned is None:
                unioned = df
            else:
                unioned = unioned.unionAll(df)

        return unioned
示例#28
0
文件: smvapp.py 项目: bakhalea/SMV
    def __init__(self, arglist, _sparkSession, py_module_hotload=True):
        """Set up the application: JVM access, configuration, dataset manager and provider cache.

            Args:
                arglist: command line arguments, handed to `SmvConfig`
                _sparkSession: the Spark session to attach to, or None. With
                    None, a gateway is launched only to obtain a JVM handle and
                    the Spark-dependent attributes (`sc`, `sqlContext`,
                    `j_smvPyClient`, `j_smvApp`) are not set by this method
                py_module_hotload (bool): stored as `self.py_module_hotload`;
                    not otherwise used in this method

            Raises:
                SmvRuntimeError: if the SMV_HOME environment variable is not set
        """
        self.smvHome = os.environ.get("SMV_HOME")
        if (self.smvHome is None):
            raise SmvRuntimeError("SMV_HOME env variable not set!")

        self.sparkSession = _sparkSession

        if (self.sparkSession is not None):
            sc = self.sparkSession.sparkContext
            sc.setLogLevel("ERROR")

            self.sc = sc
            self.sqlContext = self.sparkSession._wrapped
            self._jvm = sc._jvm
            self.j_smvPyClient = self._jvm.org.tresamigos.smv.python.SmvPyClientFactory.init(
                self.sparkSession._jsparkSession)
            self.j_smvApp = self.j_smvPyClient.j_smvApp()
        else:
            # no Spark session: only a JVM handle is set up
            # NOTE(review): presumably pyspark's launch_gateway -- confirm against the file's imports
            _gw = launch_gateway(None)
            self._jvm = _gw.jvm

        self.py_module_hotload = py_module_hotload

        # make the SMV JVM classes reachable by their short names on self._jvm
        java_import(self._jvm, "org.tresamigos.smv.ColumnHelper")
        java_import(self._jvm, "org.tresamigos.smv.SmvDFHelper")
        java_import(self._jvm, "org.tresamigos.smv.dqm.*")
        java_import(self._jvm, "org.tresamigos.smv.panel.*")
        java_import(self._jvm, "org.tresamigos.smv.python.SmvPythonHelper")
        java_import(self._jvm, "org.tresamigos.smv.SmvHDFS")
        java_import(self._jvm, "org.tresamigos.smv.DfCreator")

        self.smvSchemaObj = self._jvm.SmvPythonHelper.getSmvSchema()

        self.py_smvconf = SmvConfig(arglist, self._jvm)

        # configure spark sql params
        if (self.sparkSession is not None):
            for k, v in self.py_smvconf.spark_sql_props().items():
                self.sqlContext.setConf(k, v)

        # issue #429 set application name from smv config
        if (self.sparkSession is not None):
            # `sc` was bound in the Spark-session branch above, under this same condition
            sc._conf.setAppName(self.appName())

        # CmdLine is static, so can be an attribute
        cl = self.py_smvconf.cmdline
        # expose the command line options as attributes of a namedtuple
        self.cmd_line = namedtuple("CmdLine", cl.keys())(*cl.values())

        # shortcut is meant for internal use only
        self.dsm = DataSetMgr(self._jvm, self.py_smvconf)

        # computed df cache, keyed by m.versioned_fqn
        self.data_cache = {}

        # AFTER app is available but BEFORE stages,
        # use the dynamically configured app dir to set the source path, library path
        self.prependDefaultDirs()

        self.repoFactory = DataSetRepoFactory(self)
        self.dsm.register(self.repoFactory)

        # provider cache, keyed by providers' fqn
        self.provider_cache = {}
        self.refresh_provider_cache()

        # Initialize DataFrame and Column with helper methods
        smv.helpers.init_helpers()
示例#29
0
文件: smvapp.py 项目: bakhalea/SMV
 def getInstance(cls):
     """Return the instance stored in `cls._instance`.

         Raises:
             SmvRuntimeError: if `cls._instance` is None, i.e. no instance of
                 SmvApp has been created
     """
     if cls._instance is not None:
         return cls._instance
     raise SmvRuntimeError("An instance of SmvApp has not been created")
示例#30
0
文件: smvapp.py 项目: bakhalea/SMV
 def jdbcDriver(self):
     """Return the value of the `smv.jdbc.driver` property.

         Raises:
             SmvRuntimeError: if the property is absent from the merged props
     """
     driver = self.py_smvconf.merged_props().get('smv.jdbc.driver')
     if driver is not None:
         return driver
     raise SmvRuntimeError("JDBC driver is not specified in SMV config")