def runModule(self, fqn, forceRun=False, quickRun=False):
    """Run a single SmvModule identified by its Fully Qualified Name (FQN).

    Args:
        fqn (str): The FQN of a module.
        forceRun (bool): True if the module should be forced to run even if
            it has persisted output. False otherwise.
        quickRun (bool): Skip computing dqm+metadata and persisting csv.

    Example:
        To get just the dataframe of the module:
            dataframe = smvApp.runModule('package.module.SmvModuleClass')[0]
        To get both the dataframe and the run info collector:
            dataframe, collector = smvApp.runModule('package.module.SmvModuleClass')

    Returns:
        (DataFrame, SmvRunInfoCollector) tuple
        - DataFrame is the computed result of the module
        - SmvRunInfoCollector contains additional information about the
          run, such as validation results.
    """
    target = self.dsm.load(fqn)[0]
    runner = SmvModuleRunner([target], self)
    # Select the runner entry point up front; either way the result is
    # passed through _to_single_run_res before being returned.
    execute = runner.quick_run if quickRun else runner.run
    return self._to_single_run_res(execute(forceRun))
def getRunInfo(self, fqn):
    """Return the run information of a module and all its dependencies
    from the last run.

    Unlike the runModule() method, which returns the run information just
    for that run, this method returns the run information from the last
    run.

    If no module was run (e.g. the code did not change, so the data is
    read from persistent storage), the SmvRunInfoCollector returned from
    the runModule() method would be empty. But the SmvRunInfoCollector
    returned from this method would contain all latest run information
    about all dependent modules.

    Args:
        fqn (str): fqn of target module

    Returns:
        SmvRunInfoCollector
    """
    target = self.dsm.load(fqn)[0]
    runner = SmvModuleRunner([target], self)
    return runner.get_runinfo()
def test_basic_metadata_creation(self):
    """After running M2, its metadata should carry the module FQN and a
    single recorded error for the 'b_lt_04' DQM rule.
    """
    fqn = "stage.modules.M2"
    module = self.load(fqn)[0]

    runner = SmvModuleRunner([module], self.smvApp)
    runner.run()

    # Walk the nested validation snapshot one level at a time.
    dqm_validation = module.module_meta._metadata['_dqmValidation']
    rule_errors = dqm_validation['dqmStateSnapshot']['ruleErrors']
    b_lt_04_total = rule_errors['b_lt_04']['total']

    self.assertEqual(module.module_meta._metadata['_fqn'], fqn)
    self.assertEqual(b_lt_04_total, 1)
def test_publish(self):
    """Publishing M3 should leave .csv, .meta and .hist files, named after
    the module FQN, in the publish directory.
    """
    fqn = "stage.modules.M3"
    pub_dir = self.smvApp.all_data_dirs().publishDir
    module = self.load(fqn)[0]

    runner = SmvModuleRunner([module], self.smvApp)
    runner.publish(pub_dir)

    # Build every expected path first, then check them in csv/meta/hist order.
    path_templates = ('{}/{}.csv', '{}/{}.meta', '{}/{}.hist')
    expected_paths = [
        template.format(pub_dir, module.fqn()) for template in path_templates
    ]
    for expected_path in expected_paths:
        self.assertTrue(os.path.exists(expected_path))
def test_purge_persisted(self):
    """Purging through a runner built on M3 should delete M2's persisted
    file.

    NOTE(review): this presumably relies on M3 depending on M2, so that
    computing M3 persists M2 and purging M3 also purges M2 -- confirm
    against the stage.modules fixtures.
    """
    m2_fqn = "stage.modules.M2"
    m3_fqn = "stage.modules.M3"
    m2_module, m3_module = self.load(m2_fqn, m3_fqn)

    # Computing M3's DataFrame is what triggers persistence.
    self.df(m3_fqn)

    # Should be persisted
    persisted_before = os.path.exists(m2_module.persistStrategy()._file_path)
    self.assertTrue(persisted_before)

    # Should be removed
    runner = SmvModuleRunner([m3_module], self.smvApp)
    runner.purge_persisted()
    persisted_after = os.path.exists(m2_module.persistStrategy()._file_path)
    self.assertFalse(persisted_after)
def _generate_output_modules(self, mods):
    """Run the given modules through an SmvModuleRunner.

    Args:
        mods: modules handed to SmvModuleRunner.

    The runner's result is discarded; this method returns None.
    """
    runner = SmvModuleRunner(mods, self)
    runner.run()
def _publish_modules_locally(self, mods):
    """Publish the given modules locally via SmvModuleRunner.publish_local.

    The target directory is read from ``self.cmd_line.exportCsv``.

    Args:
        mods: modules handed to SmvModuleRunner.

    The runner's result is discarded; this method returns None.
    """
    export_dir = self.cmd_line.exportCsv
    runner = SmvModuleRunner(mods, self)
    runner.publish_local(export_dir)
def _publish_modules_through_jdbc(self, mods):
    """Publish the given modules via SmvModuleRunner.publish_to_jdbc.

    Args:
        mods: modules handed to SmvModuleRunner.

    The runner's result is discarded; this method returns None.
    """
    runner = SmvModuleRunner(mods, self)
    runner.publish_to_jdbc()
def _publish_modules_to_hive(self, mods):
    """Publish the given modules via SmvModuleRunner.publish_to_hive.

    Args:
        mods: modules handed to SmvModuleRunner.

    The runner's result is discarded; this method returns None.
    """
    runner = SmvModuleRunner(mods, self)
    runner.publish_to_hive()
def _publish_modules(self, mods):
    """Publish the given modules via SmvModuleRunner.publish.

    No publish directory is passed, so the runner's own default applies.

    Args:
        mods: modules handed to SmvModuleRunner.

    The runner's result is discarded; this method returns None.
    """
    runner = SmvModuleRunner(mods, self)
    runner.publish()
def _purge_current_output_files(self, mods):
    """Purge persisted output for the given modules via
    SmvModuleRunner.purge_persisted.

    Args:
        mods: modules handed to SmvModuleRunner.

    The runner's result is discarded; this method returns None.
    """
    runner = SmvModuleRunner(mods, self)
    runner.purge_persisted()
def publishModuleToHiveByName(self, name):
    """Publish an SmvModule to Hive by its name (can be partial FQN).

    Args:
        name (str): module name; resolved to an FQN with
            ``self.dsm.inferFqn``.

    Returns:
        Whatever SmvModuleRunner.publish_to_hive returns.
    """
    resolved_fqn = self.dsm.inferFqn(name)
    target = self.load_single_ds(resolved_fqn)
    runner = SmvModuleRunner([target], self)
    return runner.publish_to_hive()
def quickRunModule(self, fqn):
    """Quick-run a single module identified by its FQN.

    Args:
        fqn (str): FQN of the module to load and run.

    Returns:
        The first element of the result of SmvModuleRunner.quick_run()
        for the loaded module.
    """
    target = self.dsm.load(fqn)[0]
    runner = SmvModuleRunner([target], self)
    quick_results = runner.quick_run()
    return quick_results[0]
def test_publish_to_hive2(self):
    """Publishing module M to Hive should make table "M" return the same
    data as the module's DataFrame.
    """
    module = self.load("stage.modules.M")[0]
    expected_df = self.df("stage.modules.M")

    runner = SmvModuleRunner([module], self.smvApp)
    runner.publish_to_hive()

    # Read the published table back through the SQL context.
    table_name = "M"
    query = "select * from " + table_name
    read_back = self.smvApp.sqlContext.sql(query)

    self.should_be_same(expected_df, read_back)