예제 #1
0
    def test_need_to_run_list(self):
        """Check `modules_needed_for_run` for a visitor rooted at M5.

        `self.df` is called on M2 first (presumably so M2 is already
        computed -- confirm against the test base class), then the visitor
        is expected to report M2 followed by M5.
        """
        self.df("stage.modules.M2")

        root = self.load("stage.modules.M5")[0]
        visitor = ModulesVisitor([root])

        # Drop the first 14 characters of each fqn (presumably the
        # "stage.modules." prefix) so only the short module name is compared
        short_names = []
        for mod in visitor.modules_needed_for_run:
            short_names.append(mod.fqn()[14:])

        self.assertEqual(short_names, ['M2', 'M5'])
예제 #2
0
    def test_visit_queue(self):
        """Check the order of `ModulesVisitor.queue` for roots M3 and M2.

        The expected queue is I1, M1, M2, M3 (short names, after dropping
        the first 14 characters of each fqn).
        """
        roots = self.load("stage.modules.M3", "stage.modules.M2")
        visitor = ModulesVisitor(roots)

        # fqn()[14:] strips what is presumably the "stage.modules." prefix
        short_names = [mod.fqn()[14:] for mod in visitor.queue]

        expected = ['I1', 'M1', 'M2', 'M3']
        self.assertEqual(short_names, expected)
예제 #3
0
파일: smvapp.py 프로젝트: bakhalea/SMV
 def get_need_to_run(self, roots, keep_roots=False):
     """Return the modules that will actually run and be persisted.

         The result is the sub-list of the visitor's `modules_needed_for_run`
         (in that same order) containing every module that is neither
         persisted yet nor ephemeral.

         Args:
             roots: list of target modules to run.
             keep_roots: when True, modules that are in `roots` are kept
                 even if they would otherwise be filtered out. By default
                 some roots may be missing from the result.

         Returns:
             list of modules, in the order they should run.
     """
     def should_keep(mod):
         # Same short-circuit order as a single boolean expression:
         # persisted check, then ephemeral check, then the roots override
         if not (mod._is_persisted() or mod.isEphemeral()):
             return True
         return keep_roots and mod in roots

     candidates = ModulesVisitor(roots).modules_needed_for_run
     return [mod for mod in candidates if should_keep(mod)]
예제 #4
0
 def __init__(self, modules, smvApp):
     """Remember the root modules and the app, and build a visitor over the roots.

         Args:
             modules: the root modules; stored as `self.roots` and handed
                 to `ModulesVisitor`.
             smvApp: the application object; stored as `self.smvApp`.
     """
     self.roots, self.smvApp = modules, smvApp
     self.log = smv.logger
     # The visitor is built once, from the same list kept in self.roots
     visitor = ModulesVisitor(modules)
     self.visitor = visitor
예제 #5
0
class SmvModuleRunner(object):
    """Represent the run-transaction. Provides the single entry point to run
        a group of modules

        Instance attributes (all set in `__init__`):
            roots: the modules handed in by the caller
            smvApp: the application object handed in by the caller
            log: `smv.logger`
            visitor: a `ModulesVisitor` built over `roots`
    """
    def __init__(self, modules, smvApp):
        """Store the root modules and app, and build a visitor over the roots."""
        self.roots = modules
        self.smvApp = smvApp
        self.log = smv.logger
        self.visitor = ModulesVisitor(modules)

    def run(self, forceRun=False):
        """Run all modules needed for the roots, including post actions.

        Args:
            forceRun: forwarded to each module's `_get_data`.

        Returns:
            tuple `(dfs, collector)` where `dfs` is the list of `m.data`
            for each root and `collector` is the `SmvRunInfoCollector`
            that was passed through the run.
        """
        # a set of modules which need to run post_action, keep tracking
        # to make sure post_action run one and only one time for each TX
        # the set will be updated by _create_df, _create_meta and _force_post
        # and eventually be emptied out
        # See docs/dev/SmvGenericModule/SmvModuleRunner.md for details
        mods_to_run_post_action = set(self.visitor.modules_needed_for_run)

        # a map from fqn to already run DF, since the `run` interface of
        # SmvModule takes a map of class => df, the map here have to be
        # keyed by class method instead of `versioned_fqn`, which is only
        # in the resolved instance
        known = {}

        collector = SmvRunInfoCollector()

        # Do the real module calculation, when there are persistence, run
        # the post_actions and ancestor ephemeral modules post actions
        self._create_df(known, mods_to_run_post_action, collector, forceRun)

        # If there are ephemeral modules who has no persisting module
        # down stream, (must be part of roots), force an action and run
        # post actions
        self._force_post(mods_to_run_post_action, collector)

        dfs = [m.data for m in self.roots]
        return (dfs, collector)

    def quick_run(self, forceRun=False):
        """Run the roots through `_create_df` with `is_quick_run=True`.

        Unlike `run`, this starts from an empty post-action set, does not
        call `_force_post`, and returns only the data.

        Args:
            forceRun: forwarded to each module's `_get_data`.

        Returns:
            list of `m.data` for each root.
        """
        known = {}
        # `_create_df` takes (known, need_post, collector, forceRun, ...).
        # A collector must occupy the third slot; otherwise `forceRun`
        # would be consumed as the collector and the real force flag would
        # silently fall back to its default of False.
        collector = SmvRunInfoCollector()
        self._create_df(known, set(), collector, forceRun, is_quick_run=True)
        return [m.data for m in self.roots]

    def get_runinfo(self):
        """Collect metadata and metadata history of the visited modules.

        Returns:
            a `SmvRunInfoCollector` with one `add_runinfo` entry per module
            visited by `dfs_visit(..., need_to_run_only=True)`.
        """
        collector = SmvRunInfoCollector()

        def add_to_coll(m, _collector):
            hist = self.smvApp._read_meta_hist(m)
            _collector.add_runinfo(m.fqn(), m._get_metadata(), hist)

        self.visitor.dfs_visit(add_to_coll, collector, need_to_run_only=True)
        return collector

    # TODO: All the publish* methods below should be removed when move to generic output module
    def publish(self, publish_dir=None):
        """Run the modules, then write each root's csv, meta and hist files.

        Args:
            publish_dir: target directory. When None, it defaults to
                "<publishDir>/<publishVersion>" taken from
                `smvApp.all_data_dirs()`.
        """
        # run before publish
        self.run()

        if publish_dir is None:
            pubdir = self.smvApp.all_data_dirs().publishDir
            version = self.smvApp.all_data_dirs().publishVersion
            publish_dir = "{}/{}".format(pubdir, version)

        for m in self.roots:
            # All three outputs share the "<publish_dir>/<fqn>" base path
            publish_base_path = "{}/{}".format(publish_dir, m.fqn())
            publish_csv_path = publish_base_path + ".csv"
            publish_meta_path = publish_base_path + ".meta"
            publish_hist_path = publish_base_path + ".hist"

            SmvCsvPersistenceStrategy(m.smvApp, m.fqn(),
                                      publish_csv_path).write(m.data)
            SmvJsonOnHdfsPersistenceStrategy(
                m.smvApp, publish_meta_path).write(m.module_meta.toJson())
            hist = self.smvApp._read_meta_hist(m)
            SmvJsonOnHdfsPersistenceStrategy(
                m.smvApp, publish_hist_path).write(hist.toJson())

    def publish_to_hive(self):
        """Run the modules, then call `exportToHive` on each root."""
        # run before publish
        self.run()

        for m in self.roots:
            m.exportToHive()

    def publish_to_jdbc(self):
        """Run the modules, then call `publishThroughJDBC` on each root."""
        self.run()

        for m in self.roots:
            m.publishThroughJDBC()

    def publish_local(self, local_dir):
        """Run the modules, then export each root's data as csv.

        Args:
            local_dir: directory under which each root is exported to
                "<local_dir>/<versioned_fqn>".
        """
        self.run()

        for m in self.roots:
            csv_path = "{}/{}".format(local_dir, m.versioned_fqn)
            m.data.smvExportCsv(csv_path)

    def purge_persisted(self):
        """Call `remove()` on the persist and meta strategies of every visited module."""
        def cleaner(m, state):
            m.persistStrategy().remove()
            m.metaStrategy().remove()

        # No shared state is needed by the cleaner, hence None
        self.visitor.dfs_visit(cleaner, None)

    def _create_df(self,
                   known,
                   need_post,
                   collector,
                   forceRun=False,
                   is_quick_run=False):
        """Call `_get_data` on each module visited by `dfs_visit`.

        Args:
            known: dict passed to `_get_data` as its fqn-to-df map.
            need_post: set of modules still needing post action.
            collector: run-info collector passed to `_get_data`.
            forceRun: forwarded to `_get_data`.
            is_quick_run: forwarded to `_get_data`.
        """
        # run module and create df. when persisting, post_action
        # will run on current module and all upstream modules
        def runner(m, state):
            (fqn2df, run_set, collector) = state
            m._get_data(fqn2df, run_set, collector, forceRun, is_quick_run)

        self.visitor.dfs_visit(runner, (known, need_post, collector),
                               need_to_run_only=True)

    def _force_post(self, need_post, collector):
        """Force post actions for the modules left in `need_post`, if any.

        Args:
            need_post: collection of modules still needing post action;
                nothing happens when it is empty.
            collector: run-info collector passed to `_force_post_action`.
        """
        # If there are still module left for post_action, force a run here
        # to run them and all left over on their upstream
        if len(need_post) > 0:
            self.log.debug("leftover mods need to run post action: {}".format(
                [m.fqn() for m in need_post]))

            def force_run(mod, state):
                (run_set, coll) = state
                mod._force_post_action(run_set, coll)

            # Note: we used bfs_visit here run downstream first
            # In case of A<-B<-C all need to run, this way will only
            # need to force action on C, and A and B's post action can
            # also be calculated
            self.visitor.bfs_visit(force_run, (need_post, collector),
                                   need_to_run_only=True)
예제 #6
0
 def _ancestor_and_me_visitor(self):
     """Return a new `ModulesVisitor` whose only root is this object."""
     only_root = [self]
     return ModulesVisitor(only_root)
예제 #7
0
 def __init__(self, modules, smvApp, runMonitorCallback=None):
     """Keep the roots, app and optional callback, and build the visitor.

         Args:
             modules: the root modules; stored as `self.roots` and handed
                 to `ModulesVisitor`.
             smvApp: the application object; stored as `self.smvApp`.
             runMonitorCallback: optional value stored as-is on the
                 instance (None when not supplied).
     """
     self.roots, self.smvApp = modules, smvApp
     self.log = smv.logger
     # One visitor per runner, built over the same roots list
     roots_visitor = ModulesVisitor(modules)
     self.visitor = roots_visitor
     self.runMonitorCallback = runMonitorCallback