def test_configure_logging(self):
    """Check the command line built when logging options are configured.

    The expected command is handed to ``mock_executor``; that helper is
    assumed to compare it with the command the job issues (its definition
    is not visible here).
    """
    expected_command = 'pig -logfile pig.log -brief -debug -f "wordcount.pig"'
    job = Pig.load_commands_from_file(
        path='wordcount.pig',
        command_executor=mock_executor(expected_command),
    )
    job.log_config(logfile="pig.log", debug=True, brief=True).run()
def test_with_param_query(self):
    """Check the command line built when parameters come from a file.

    ``load_parameters_from_file("params.properties")`` is expected to add
    ``-param_file params.properties`` ahead of the ``-f`` script argument.
    """
    expected_command = 'pig -param_file params.properties -f "wordcount.pig"'
    job = Pig.load_commands_from_file(
        path='wordcount.pig',
        command_executor=mock_executor(expected_command),
    )
    job.load_parameters_from_file("params.properties").run()
def test_with_property_file(self):
    """Check the command line built with a property file and default mode.

    ``using_mode()`` is called without arguments; the expected command
    contains ``-x mapreduce`` for that call.
    """
    expected_command = (
        'pig -propertyFile pig.properties -x mapreduce -f "wordcount.pig"'
    )
    job = Pig.load_commands_from_file(
        path='wordcount.pig',
        command_executor=mock_executor(expected_command),
    )
    job = job.with_property_file("pig.properties")
    job.using_mode().run()
def test_log4j_configs_injections(self):
    """Check the command line built when a log4j config file is supplied.

    ``log4j_config("~/log4j.properties")`` is expected to add
    ``-log4jconf ~/log4j.properties`` ahead of the ``-f`` script argument.
    """
    expected_command = 'pig -log4jconf ~/log4j.properties -f "wordcount.pig"'
    job = Pig.load_commands_from_file(
        path='wordcount.pig',
        command_executor=mock_executor(expected_command),
    )
    job.log4j_config("~/log4j.properties").run()
def test_with_param_file(self):
    """Check the command line built from two inline parameters and a mode.

    ``using_mode()`` is called between the two ``with_parameter`` calls;
    the expected command lists both ``-param`` options before
    ``-x mapreduce``.
    """
    expected_command = (
        'pig -param param001=value001 -param param002=value002 '
        '-x mapreduce -f "wordcount.pig"'
    )
    job = Pig.load_commands_from_file(
        path='wordcount.pig',
        command_executor=mock_executor(expected_command),
    )
    job = job.with_parameter("param001", "value001")
    job = job.using_mode()
    job.with_parameter("param002", "value002").run()
def test_run_preconfigured_job_without_parameters_substitution(self):
    """Run a preconfigured Pig job whose script has its paths inlined.

    The input and output locations are formatted directly into the Pig
    commands, the job is described through a ``Configuration`` object and
    executed, and the test then checks that the output path exists in HDFS.
    Both HDFS paths are removed afterwards, whether or not the run
    succeeded.
    """
    _test_id = str(uuid.uuid4())
    _job_name = "TEST_PIG_{}".format(_test_id)
    _input_dir = self.copy_file_from_local(
        self.temp_file("hello,world,world", ".txt"))
    _output_dir = "/tmp/data_{}".format(_test_id)

    _commands = "A = load '{}' using PigStorage(',');".format(_input_dir)
    # "\\$0" yields the same backslash-dollar text as the former "\$0",
    # without relying on an invalid (deprecated) escape sequence.
    _commands += "B = foreach A generate \\$0 as id;"
    _commands += "STORE B into '{}';".format(_output_dir)

    # create job configuration. can also be loaded from .ini file
    _config = Configuration.create()
    _config.set(_job_name, TaskOptions.CONFIG_KEY_COMMANDS_STRING, _commands)
    _config.set(_job_name, TaskOptions.CONFIG_KEY_LOG_BRIEF, 'enabled')
    _config.set(
        _job_name,
        TaskOptions.CONFIG_KEY_PARAMETER_VALUE,
        'input_dir={}\noutput_dir={}'.format(_input_dir, _output_dir))
    try:
        _pig = Pig.load_preconfigured_job(config=_config, job_name=_job_name)
        _result = _pig.run()
        _result.if_failed_raise(
            AssertionError("test_run_preconfigured_job failed"))
        self.assertTrue(
            HDFS(_output_dir).exists(), "Cannot find job output")
    finally:
        self.delete_file_in_hdfs(_input_dir)
        self.delete_file_in_hdfs(_output_dir)
def test_run_preconfigured_job_without_parameters_substitution(self):
    """Execute a Pig job defined entirely through a ``Configuration``.

    The script text (with input/output paths already formatted in), the
    brief-logging flag and a parameter value are stored under a unique job
    name; the job is then loaded, run, and its HDFS output checked for
    existence. Input and output paths are deleted in ``finally``.
    """
    _test_id = str(uuid.uuid4())
    _job_name = "TEST_PIG_{}".format(_test_id)
    _input_dir = self.copy_file_from_local(self.temp_file("hello,world,world", ".txt"))
    _output_dir = "/tmp/data_{}".format(_test_id)
    _commands = "".join(
        [
            "A = load '{}' using PigStorage(',');".format(_input_dir),
            # Explicit "\\$" replaces the invalid "\$" escape; the resulting
            # text (backslash followed by "$0") is unchanged.
            "B = foreach A generate \\$0 as id;",
            "STORE B into '{}';".format(_output_dir),
        ]
    )
    # create job configuration. can also be loaded from .ini file
    _config = Configuration.create()
    _config.set(_job_name, TaskOptions.CONFIG_KEY_COMMANDS_STRING, _commands)
    _config.set(_job_name, TaskOptions.CONFIG_KEY_LOG_BRIEF, "enabled")
    _config.set(
        _job_name,
        TaskOptions.CONFIG_KEY_PARAMETER_VALUE,
        "input_dir={}\noutput_dir={}".format(_input_dir, _output_dir),
    )
    try:
        _pig = Pig.load_preconfigured_job(config=_config, job_name=_job_name)
        _result = _pig.run()
        _result.if_failed_raise(AssertionError("test_run_preconfigured_job failed"))
        self.assertTrue(HDFS(_output_dir).exists(), "Cannot find job output")
    finally:
        self.delete_file_in_hdfs(_input_dir)
        self.delete_file_in_hdfs(_output_dir)
def test_logging_configuration(self):
    """Run a parameterised Pig script with a log file inside a temp directory.

    A local directory is created for the Pig log, the script is run with
    ``input_dir``/``output_dir`` parameters, and the test asserts that the
    run's ``is_ok()`` equals the existence of that directory. The local
    directory, the HDFS paths and the temporary script are cleaned up
    afterwards.
    """
    import os
    import shutil

    files = self.copy_file_from_local(self.temp_file("hello,world,world", ".txt"))
    path = "/tmp/pig_log"
    commands = "A = load '$input_dir' using PigStorage(',');"
    # "\\$0" is the same text as the former "\$0" but is a valid escape.
    commands += "B = foreach A generate \\$0 as id;"
    commands += "STORE B into '$output_dir';"
    files_s = self.temp_file(commands)
    try:
        os.makedirs(path)
        pig = (
            Pig.load_commands_from_file(files_s)
            .with_parameter("input_dir", files)
            .with_parameter("output_dir", "/tmp/data")
        )
        # Join instead of concatenating: path + "pig" produced
        # "/tmp/pig_logpig", a file outside the directory removed below.
        pig.log_config(logfile=os.path.join(path, "pig"))
        self.assertEqual(os.path.exists(path), pig.run().is_ok())
    finally:
        # Best-effort removal so a failure here cannot skip the HDFS and
        # local-file cleanup that follows.
        shutil.rmtree(path, ignore_errors=True)
        # NOTE(review): called without a path; presumably targets the
        # "/tmp/data" output used above - confirm against the helper.
        self.delete_file_in_hdfs()
        self.delete_file_in_hdfs(files)
        self.delete_local(files_s)
def merge_snapshot_with_updates(context):
    """Run the Pig script that merges the active snapshot with new updates.

    Stores today's date (``YYYYMMDD``) in ``context["partition"]`` and
    passes it, together with the snapshot, updates and output locations
    taken from module-level settings, to the Pig job as parameters.

    :param context: mutable mapping; its ``"partition"`` key is overwritten.
    """
    partition = datetime.now().strftime('%Y%m%d')
    context["partition"] = partition

    job = Pig.load_commands_from_file(_pig_script)
    job = job.with_parameter("active_snapshot", _scd_active_snapshot)

    # The updates file is addressed by its base name inside the HDFS
    # temporary directory.
    updates_path = os.path.join(_hdfs_tmpdir.path, os.path.basename(_scd_updates))
    job = job.with_parameter("data_updates", updates_path)
    job = job.with_parameter('output', _hdfs_job_output)
    job = job.with_parameter("date", context["partition"])
    job.run()
def test_load_preconfigured_job(self):
    """Load the 'pig test' job from ``resources/pig/pig.ini`` and run it.

    ``without_split_filter()`` is applied on top of the loaded job; the
    expected command is given to ``mock_executor`` (assumed to verify the
    issued command - its definition is not visible here).
    """
    _command = 'pig -brief -optimizer_off SplitFilter -optimizer_off ColumnMapKeyPrune -e "ls /"'
    ini_path = os.path.join(os.path.dirname(__file__), 'resources/pig/pig.ini')
    metastore = IniFileMetaStore(file=ini_path)
    job_config = Configuration.load(
        metastore=metastore, readonly=False, accepts_nulls=True)
    executor = mock_executor(expected_command=_command)
    pig = Pig.load_preconfigured_job(
        job_name='pig test',
        config=job_config,
        command_executor=executor)
    pig.without_split_filter().run()
def test_load_preconfigured_job(self):
    """Run a job read from the ini-backed metastore with one extra option.

    The job named 'pig test' is loaded from ``resources/pig/pig.ini`` next
    to this module, ``without_split_filter()`` is applied, and the job is
    run against an executor created with the expected command string.
    """
    resources_dir = os.path.dirname(__file__)
    metastore = IniFileMetaStore(
        file=os.path.join(resources_dir, 'resources/pig/pig.ini'))
    loaded_config = Configuration.load(
        metastore=metastore,
        readonly=False,
        accepts_nulls=True,
    )
    _command = 'pig -brief -optimizer_off SplitFilter -optimizer_off ColumnMapKeyPrune -e "ls /"'
    pig = Pig.load_preconfigured_job(
        job_name='pig test',
        config=loaded_config,
        command_executor=mock_executor(expected_command=_command),
    )
    pig = pig.without_split_filter()
    pig.run()
def test_wrap_with_quotes(self):
    """Check ``_wrap_with_quotes_`` against a table of inputs.

    Expectations: empty string and ``None`` come back unchanged, plain
    text is wrapped in double quotes, text already in single quotes is
    left alone, text containing a double quote is wrapped in single
    quotes, and text containing a single quote is wrapped in double quotes.
    """
    _pc = Pig(config=Configuration.create(), job_name=None, command_executor=None)
    # (expected result, raw input) pairs, checked in order.
    cases = (
        ("", ""),
        (None, None),
        ('"test"', "test"),
        ("'test'", "'test'"),
        ("'te\"st'", 'te"st'),
        ('"te\'st"', "te'st"),
    )
    for expected, raw in cases:
        self.assertEqual(expected, _pc._wrap_with_quotes_(raw))
def test_run_commands_from_string_without_param_substitution(self):
    """Run Pig commands passed as a string with paths already inlined.

    The input file is copied via ``copy_file_from_local``, the script is
    built with the concrete input/output paths, executed, and the output
    path is checked for existence in HDFS. Both paths are deleted in
    ``finally``.
    """
    _test_id = str(uuid.uuid4())
    _output_dir = "/tmp/data_{}".format(_test_id)
    _input_dir = self.copy_file_from_local(self.temp_file("hello,world,world", ".txt"))

    commands = "A = load '{}' using PigStorage(',');".format(_input_dir)
    # "\\$0" produces the same backslash-dollar text as the former "\$0"
    # without using an invalid (deprecated) escape sequence.
    commands += "B = foreach A generate \\$0 as id;"
    commands += "STORE B into '{}';".format(_output_dir)
    try:
        _pig = Pig.load_commands_from_string(commands)
        _result = _pig.run()
        _result.if_failed_raise(AssertionError("test_run_commands_from_string failed"))
        self.assertTrue(HDFS(_output_dir).exists(), "Cannot find job output")
    finally:
        self.delete_file_in_hdfs(_input_dir)
        self.delete_file_in_hdfs(_output_dir)
def test_run_commands_from_string_without_param_substitution(self):
    """Execute an inline Pig script that needs no parameter substitution.

    Input and output locations are formatted straight into the command
    text. After the run, a failed result raises ``AssertionError`` and the
    output path must exist in HDFS. Cleanup of both paths always happens.
    """
    _test_id = str(uuid.uuid4())
    _output_dir = "/tmp/data_{}".format(_test_id)
    _input_dir = self.copy_file_from_local(
        self.temp_file("hello,world,world", ".txt"))
    commands = "".join([
        "A = load '{}' using PigStorage(',');".format(_input_dir),
        # Valid "\\$" escape; same resulting text as the previous "\$".
        "B = foreach A generate \\$0 as id;",
        "STORE B into '{}';".format(_output_dir),
    ])
    try:
        _pig = Pig.load_commands_from_string(commands)
        _result = _pig.run()
        _result.if_failed_raise(
            AssertionError("test_run_commands_from_string failed"))
        self.assertTrue(
            HDFS(_output_dir).exists(), "Cannot find job output")
    finally:
        self.delete_file_in_hdfs(_input_dir)
        self.delete_file_in_hdfs(_output_dir)
def test_run_commands_from_file(self):
    """Run a parameterised Pig script loaded from a temporary file.

    ``$input_dir`` and ``$output_dir`` in the script are supplied through
    ``with_parameter``. The test asserts that the run is OK and that the
    output path exists in HDFS, then removes the script, the output and
    the input.
    """
    _test_id = str(uuid.uuid4())
    # Defined before ``try`` so the cleanup in ``finally`` can always
    # reference it.
    _output_dir = "/tmp/data_{}".format(_test_id)
    _inputs = self.copy_file_from_local(
        self.temp_file("hello,world,world", ".txt"))
    commands = "A = load '$input_dir' using PigStorage(',');"
    # "\\$0" is the same text as the former "\$0" but is a valid escape.
    commands += "B = foreach A generate \\$0 as id;"
    commands += "STORE B into '$output_dir';"
    files_s = self.temp_file(commands)
    try:
        pig = Pig.load_commands_from_file(files_s) \
            .with_parameter("input_dir", _inputs) \
            .with_parameter("output_dir", _output_dir)
        self.assertTrue(pig.run().is_ok())
        self.assertTrue(HDFS(_output_dir).exists())
    finally:
        self.delete_local(files_s)
        # Delete the unique output path explicitly; the previous
        # argument-less call did not name it, so it was left behind.
        self.delete_file_in_hdfs(_output_dir)
        self.delete_file_in_hdfs(_inputs)
def test_run_commands_from_file(self):
    """Execute a Pig script file with input/output passed as parameters.

    The script is written to a temporary local file, run with
    ``input_dir`` and ``output_dir`` parameters, and the output path is
    checked in HDFS. The script file, the output path and the input path
    are removed in ``finally``.
    """
    _test_id = str(uuid.uuid4())
    _inputs = self.copy_file_from_local(self.temp_file("hello,world,world", ".txt"))
    # Computed up front so ``finally`` can clean it up even if building
    # the job fails.
    _output_dir = "/tmp/data_{}".format(_test_id)
    commands = "".join(
        [
            "A = load '$input_dir' using PigStorage(',');",
            # Valid "\\$" escape; same resulting text as the previous "\$".
            "B = foreach A generate \\$0 as id;",
            "STORE B into '$output_dir';",
        ]
    )
    files_s = self.temp_file(commands)
    try:
        pig = (
            Pig.load_commands_from_file(files_s)
            .with_parameter("input_dir", _inputs)
            .with_parameter("output_dir", _output_dir)
        )
        self.assertTrue(pig.run().is_ok())
        self.assertTrue(HDFS(_output_dir).exists())
    finally:
        self.delete_local(files_s)
        # Pass the output path explicitly so the per-test directory is
        # actually removed (the call previously had no argument).
        self.delete_file_in_hdfs(_output_dir)
        self.delete_file_in_hdfs(_inputs)
def test_logging_configuration(self):
    """Run a Pig script file with its log file placed in a temp directory.

    Creates ``/tmp/pig_log`` locally, configures the job's log file inside
    it, runs the script with ``input_dir``/``output_dir`` parameters and
    asserts that ``is_ok()`` matches the existence of the log directory.
    All temporary local and HDFS artefacts are cleaned up afterwards.
    """
    import os
    import shutil

    files = self.copy_file_from_local(
        self.temp_file("hello,world,world", ".txt"))
    path = "/tmp/pig_log"
    commands = "".join([
        "A = load '$input_dir' using PigStorage(',');",
        # Valid "\\$" escape; same resulting text as the previous "\$".
        "B = foreach A generate \\$0 as id;",
        "STORE B into '$output_dir';",
    ])
    files_s = self.temp_file(commands)
    try:
        os.makedirs(path)
        pig = Pig.load_commands_from_file(files_s).with_parameter("input_dir", files) \
            .with_parameter("output_dir", "/tmp/data")
        # os.path.join keeps the log inside the directory created above;
        # plain concatenation yielded "/tmp/pig_logpig", which was never
        # cleaned up.
        pig.log_config(logfile=os.path.join(path, "pig"))
        self.assertEqual(os.path.exists(path), pig.run().is_ok())
    finally:
        # ignore_errors keeps a failed directory removal from skipping the
        # remaining cleanup steps.
        shutil.rmtree(path, ignore_errors=True)
        # NOTE(review): no path is passed; presumably this targets the
        # "/tmp/data" output used above - confirm against the helper.
        self.delete_file_in_hdfs()
        self.delete_file_in_hdfs(files)
        self.delete_local(files_s)
def test_run_script_from_string(self):
    """Check the command line built for commands supplied as a string.

    The string ``ls /`` is expected to be passed to pig via ``-e`` in
    double quotes.
    """
    executor = mock_executor('pig -e "ls /"')
    job = Pig.load_commands_from_string(
        commands="ls /",
        command_executor=executor,
    )
    job.run()
def test_optimization_disabling(self):
    """Check the ``-optimizer_off`` options produced by each switch method.

    Each single-rule case pairs the rule name expected on the command line
    with the ``Pig`` method that should emit it. Two combined cases follow:
    two switches together, and two switches plus tez mode and
    ``without_multiquery``.
    """
    script = 'wordcount.pig'

    # (rule name in the expected command, Pig method that disables it)
    single_rule_cases = (
        ('SplitFilter', 'without_split_filter'),
        ('PushUpFilter', 'without_pushup_filter'),
        ('MergeFilter', 'without_merge_filter'),
        ('PushDownForeachFlatten', 'without_push_down_foreach_flatten'),
        ('LimitOptimizer', 'without_limit_optimizer'),
        ('ColumnMapKeyPrune', 'without_column_map_key_prune'),
        ('AddForEach', 'without_add_foreach'),
        ('MergeForEach', 'without_merge_foreach'),
        ('GroupByConstParallelSetter', 'without_groupby_const_parallel_setter'),
        ('All', 'disable_all_optimizations'),
    )
    for rule, switch_name in single_rule_cases:
        expected = 'pig -optimizer_off {} -f "wordcount.pig"'.format(rule)
        job = Pig.load_commands_from_file(
            path=script,
            command_executor=mock_executor(expected),
        )
        getattr(job, switch_name)().run()

    # Two switches combined; the expected command lists LimitOptimizer
    # before AddForEach although the calls are made in the opposite order.
    two_rules = (
        'pig -optimizer_off LimitOptimizer -optimizer_off AddForEach '
        '-f "wordcount.pig"'
    )
    job = Pig.load_commands_from_file(
        path=script,
        command_executor=mock_executor(two_rules),
    )
    job.without_add_foreach().without_limit_optimizer().run()

    # Switches mixed with an execution mode and the multiquery toggle.
    tez_rules = (
        'pig -x tez -optimizer_off LimitOptimizer -optimizer_off AddForEach '
        '-no_multiquery -f "wordcount.pig"'
    )
    job = Pig.load_commands_from_file(
        path=script,
        command_executor=mock_executor(tez_rules),
    )
    job = job.without_add_foreach().using_mode(type="tez")
    job = job.without_limit_optimizer()
    job.without_multiquery().run()
def test_run_script_from_file_verbose(self):
    """Call ``debug()`` on a file-based job expecting the ``-verbose`` flag.

    NOTE(review): ``run()`` is never called here; ``debug()`` presumably
    executes the job itself - confirm against the ``Pig`` class.
    """
    executor = mock_executor('pig -verbose -f "wordcount.pig"')
    job = Pig.load_commands_from_file(
        path='wordcount.pig',
        command_executor=executor,
    )
    job.debug()
def test_try_execute_empty_command(self):
    """Expect ``PigCommandError`` when running a job with no commands.

    The job is built from an empty configuration with no job name and no
    command executor; only the ``run`` call is expected to raise.
    """
    empty_job = Pig(
        config=Configuration.create(),
        job_name=None,
        command_executor=None,
    )
    with self.assertRaises(PigCommandError):
        empty_job.run()