Exemplo n.º 1
0
    def getRunInfoByPartialName(self, name, runConfig):
        """Fetch the latest run information for a module and its dependencies.

        The target module is identified by a partial name.

        This differs from runModule(): the collector returned by
        runModule() only describes that particular run, so when nothing
        actually ran (e.g. the code did not change and the data was read
        from persistent storage) that SmvRunInfoCollector is empty.  The
        SmvRunInfoCollector returned here instead holds the most recent
        run information of all dependent modules.

        Args:
            name (str): unique suffix of the target module's fqn
            runConfig (dict): runConfig to apply while collecting the info.
                              If the module was run with a config, the same
                              config must be given here to retrieve the info.

        Returns:
            SmvRunInfoCollector
        """
        client = self.j_smvPyClient
        latest_info = client.getRunInfoByPartialName(name, runConfig)
        return SmvRunInfoCollector(latest_info)
Exemplo n.º 2
0
    def run(self, forceRun=False):
        """Run the modules and return their data plus the run info collector.

        See docs/dev/SmvGenericModule/SmvModuleRunner.md for details.

        Args:
            forceRun (bool): forwarded unchanged to self._create_df

        Returns:
            (list, SmvRunInfoCollector) tuple
            - list holds the `data` attribute of every module in self.roots,
              in the same order as self.roots
            - SmvRunInfoCollector is the collector handed to the helper steps
        """
        # Modules whose post_action still has to run.  Tracking them makes
        # sure post_action runs one and only one time for each TX.  The
        # helper steps (_create_df, _create_meta and _force_post) update this
        # set, and it is eventually emptied out.
        pending_post_actions = set(self.visitor.modules_needed_for_run)

        # Already-computed DFs.  The `run` interface of SmvModule takes a map
        # of class => df, so this map has to be keyed by class rather than by
        # `versioned_fqn`, which only exists on the resolved instance.
        computed = {}

        run_info = SmvRunInfoCollector()

        # Step 1: the real module calculation.  Where there is persistence,
        # this also runs the post actions of the module and of its ancestor
        # ephemeral modules.
        self._create_df(computed, pending_post_actions, run_info, forceRun)

        # Step 2: ephemeral modules with no persisting module downstream
        # (they must be part of the roots) get a forced action so their post
        # actions run as well.
        self._force_post(pending_post_actions, run_info)

        root_data = []
        for root in self.roots:
            root_data.append(root.data)
        return (root_data, run_info)
Exemplo n.º 3
0
    def runModuleByName(self,
                        name,
                        forceRun=False,
                        version=None,
                        runConfig=None,
                        quickRun=False):
        """Run a SmvModule identified by its name (a partial FQN is accepted).

        Name-based counterpart of the `runModule` method.

        Args:
            name (str): The unique name of a module. Does not have to be the FQN.
            forceRun (bool): True if the module should be forced to run even if it has persisted output. False otherwise.
            version (str): The name of the published version to load from
            runConfig (dict): runtime configuration to use when running the module
            quickRun (bool): skip computing dqm+metadata and persisting csv

        Returns:
            (DataFrame, SmvRunInfoCollector) tuple
            - DataFrame is the computed result of the module
            - SmvRunInfoCollector contains additional information
              about the run, such as validation results.
        """
        # TODO call setDynamicRunConfig() here not on scala side
        run_by_name = self.j_smvPyClient.runModuleByName
        # `version` goes through self.scalaOption() before being handed over
        j_version = self.scalaOption(version)
        outcome = run_by_name(name, forceRun, j_version, runConfig, quickRun)

        result_df = DataFrame(outcome.df(), self.sqlContext)
        run_info = SmvRunInfoCollector(outcome.collector())
        return (result_df, run_info)
Exemplo n.º 4
0
    def get_runinfo(self):
        """Collect run information for the modules visited by self.visitor.

        A fresh SmvRunInfoCollector is filled by a visit over the modules
        (restricted with need_to_run_only=True).  For each visited module
        its fqn, its metadata and its metadata history are recorded.

        Returns:
            SmvRunInfoCollector
        """
        run_info = SmvRunInfoCollector()

        def _record(mod, coll):
            # History is read through the app; metadata comes from the module
            meta_history = self.smvApp._read_meta_hist(mod)
            coll.add_runinfo(mod.fqn(), mod._get_metadata(), meta_history)

        self.visitor.dfs_visit(_record, run_info, need_to_run_only=True)
        return run_info
Exemplo n.º 5
0
    def runModuleByName(self,
                        name,
                        forceRun=False,
                        version=None,
                        runConfig=None):
        """Run a SmvModule identified by its name (a partial FQN is accepted).

        Name-based counterpart of the `runModule` method.

        Args:
            name (str): name of the module; can be a partial FQN
            forceRun (bool): passed unchanged to the underlying client call
            version (str): passed through self.scalaOption() to the client call
            runConfig (dict): passed unchanged to the underlying client call

        Returns:
            (DataFrame, SmvRunInfoCollector) tuple
            - DataFrame is the computed result of the module
            - SmvRunInfoCollector contains additional information
              about the run, such as validation results.
        """
        # TODO call setDynamicRunConfig() here not on scala side
        run_by_name = self.j_smvPyClient.runModuleByName
        j_version = self.scalaOption(version)
        outcome = run_by_name(name, forceRun, j_version, runConfig)

        result_df = DataFrame(outcome.df(), self.sqlContext)
        run_info = SmvRunInfoCollector(outcome.collector())
        return (result_df, run_info)
Exemplo n.º 6
0
    def runModule(self, urn, forceRun=False, version=None, runConfig=None):
        """Run a Scala or Python SmvModule given its Fully Qualified Name (fqn).

        The call goes through j_smvPyClient rather than j_smvApp directly,
        so that the SmvRunCollector does not have to be constructed from
        the python side.

        Args:
            urn (str): identifier of the module, e.g.
                'mod:package.module.SmvModuleClass'
            forceRun (bool): passed unchanged to the underlying client call
            version (str): passed through self.scalaOption() to the client call
            runConfig (dict): passed unchanged to the underlying client call

        Example:
            To get just the dataframe of the module:
                dataframe = smvApp.runModule('mod:package.module.SmvModuleClass')[0]
            To get both the dataframe and the run info collector:
                dataframe, collector = smvApp.runModule('mod:package.module.SmvModuleClass')

        Returns:
            (DataFrame, SmvRunInfoCollector) tuple
            - DataFrame is the computed result of the module
            - SmvRunInfoCollector contains additional information
              about the run, such as validation results.
        """
        # TODO call setDynamicRunConfig() here not on scala side
        run_module = self.j_smvPyClient.runModule
        j_version = self.scalaOption(version)
        outcome = run_module(urn, forceRun, j_version, runConfig)

        result_df = DataFrame(outcome.df(), self.sqlContext)
        run_info = SmvRunInfoCollector(outcome.collector())
        return (result_df, run_info)
Exemplo n.º 7
0
    def runModule(self,
                  urn,
                  forceRun=False,
                  version=None,
                  runConfig=None,
                  quickRun=False):
        """Run a Scala or Python SmvModule given its Fully Qualified Name (fqn).

        The call goes through j_smvPyClient rather than j_smvApp directly,
        so that the SmvRunCollector does not have to be constructed from
        the python side.

        Args:
            urn (str): The URN of a module
            forceRun (bool): True if the module should be forced to run even if it has persisted output. False otherwise.
            version (str): The name of the published version to load from
            runConfig (dict): runtime configuration to use when running the module
            quickRun (bool): skip computing dqm+metadata and persisting csv

        Example:
            To get just the dataframe of the module:
                dataframe = smvApp.runModule('mod:package.module.SmvModuleClass')[0]
            To get both the dataframe and the run info collector:
                dataframe, collector = smvApp.runModule('mod:package.module.SmvModuleClass')

        Returns:
            (DataFrame, SmvRunInfoCollector) tuple
            - DataFrame is the computed result of the module
            - SmvRunInfoCollector contains additional information
              about the run, such as validation results.
        """
        # TODO call setDynamicRunConfig() here not on scala side
        run_module = self.j_smvPyClient.runModule
        # `version` goes through self.scalaOption() before being handed over
        j_version = self.scalaOption(version)
        outcome = run_module(urn, forceRun, j_version, runConfig, quickRun)

        result_df = DataFrame(outcome.df(), self.sqlContext)
        run_info = SmvRunInfoCollector(outcome.collector())
        return (result_df, run_info)
Exemplo n.º 8
0
 def quick_run(self, forceRun=False):
     """Run the modules through self._create_df with is_quick_run=True.

     Args:
         forceRun (bool): forwarded to self._create_df

     Returns:
         (list, SmvRunInfoCollector) tuple
         - list holds the `data` attribute of every module in self.roots
         - SmvRunInfoCollector is created here and returned as-is; this
           method never passes it to any helper
     """
     computed = {}
     run_info = SmvRunInfoCollector()
     # An empty set is passed as the second argument.
     # NOTE(review): run_info is not passed to _create_df, so forceRun is
     # the third positional argument -- confirm this matches the
     # _create_df signature in this revision.
     self._create_df(computed, set(), forceRun, is_quick_run=True)
     root_data = []
     for root in self.roots:
         root_data.append(root.data)
     return (root_data, run_info)