def main():
    # parse command-line args
    option_parser = make_option_parser()
    options, args = option_parser.parse_args()
    if args:
        option_parser.error('takes no arguments')

    # set up logging
    if not options.quiet:
        log_to_stream(name='mrjob', debug=options.verbose)

    # create the persistent job
    runner_kwargs = {
        'conf_path': options.conf_path,
        'ec2_instance_type': options.ec2_instance_type,
        'ec2_master_instance_type': options.ec2_master_instance_type,
        'ec2_slave_instance_type': options.ec2_slave_instance_type,
        'label': options.label,
        'num_ec2_instances': options.num_ec2_instances,
        'owner': options.owner,
    }
    runner = EMRJobRunner(**runner_kwargs)
    emr_job_flow_id = runner.make_persistent_job_flow()
    print emr_job_flow_id

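# A sketch of the imports these excerpts assume, based on mrjob's module
# layout at the time; the testify assertion helpers in particular are an
# assumption about the test harness. make_option_parser() is defined
# alongside main() in the tool module and is not shown here.
import os

from mrjob.emr import EMRJobRunner
from mrjob.job import MRJob
from mrjob.parse import parse_s3_uri
from mrjob.util import log_to_stream

from testify import assert_equal, assert_in, assert_not_in
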
def test_create_scratch_uri(self):
    # the "walrus" and "zebra" buckets will be ignored; they don't start
    # with "mrjob-"
    self.add_mock_s3_data({'walrus': {}, 'zebra': {}})

    runner = EMRJobRunner(conf_path=False, s3_sync_wait_time=0.01)

    # bucket name should be mrjob- plus 16 random hex digits
    s3_scratch_uri = runner._opts['s3_scratch_uri']
    assert_equal(s3_scratch_uri[:11], 's3://mrjob-')
    assert_equal(s3_scratch_uri[27:], '/tmp/')

    # bucket shouldn't actually exist yet
    scratch_bucket, _ = parse_s3_uri(s3_scratch_uri)
    assert_not_in(scratch_bucket, self.mock_s3_fs.keys())

    # need to do something to ensure that the bucket actually gets
    # created. let's launch a (mock) job flow
    jfid = runner.make_persistent_job_flow()
    assert_in(scratch_bucket, self.mock_s3_fs.keys())
    runner.make_emr_conn().terminate_jobflow(jfid)

    # once our scratch bucket is created, we should re-use it
    runner2 = EMRJobRunner(conf_path=False)
    assert_equal(runner2._opts['s3_scratch_uri'], s3_scratch_uri)

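# For reference, parse_s3_uri() splits an S3 URI into a (bucket, key) pair;
# an illustration, assuming mrjob.parse semantics (the bucket name here is
# hypothetical):
#
#   parse_s3_uri('s3://mrjob-0123456789abcdef/tmp/')
#       -> ('mrjob-0123456789abcdef', 'tmp/')
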
def test_local_bootstrap_action(self):
    # make sure that local bootstrap action scripts get uploaded to S3
    action_path = os.path.join(self.tmp_dir, 'apt-install.sh')
    with open(action_path, 'w') as f:
        f.write('for pkg in "$@"; do sudo apt-get install $pkg; done\n')

    bootstrap_actions = [
        action_path + ' python-scipy mysql-server']

    runner = EMRJobRunner(conf_path=False,
                          bootstrap_actions=bootstrap_actions,
                          s3_sync_wait_time=0.01)

    job_flow_id = runner.make_persistent_job_flow()

    emr_conn = runner.make_emr_conn()
    job_flow = emr_conn.describe_jobflow(job_flow_id)
    actions = job_flow.bootstrapactions

    assert_equal(len(actions), 2)

    assert actions[0].path.startswith('s3://mrjob-')
    assert actions[0].path.endswith('/apt-install.sh')
    assert_equal(actions[0].name, 'apt-install.sh')
    assert_equal(actions[0].args, ['python-scipy', 'mysql-server'])

    # check for master bootstrap script
    assert actions[1].path.startswith('s3://mrjob-')
    assert actions[1].path.endswith('b.py')
    assert_equal(actions[1].args, [])
    assert_equal(actions[1].name, 'master')

    # make sure master bootstrap script is on S3
    assert runner.path_exists(actions[1].path)

def main():
    # parse command-line args
    option_parser = make_option_parser()
    options, args = option_parser.parse_args()
    if args:
        option_parser.error('takes no arguments')

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # create the persistent job
    runner_kwargs = options.__dict__.copy()
    del runner_kwargs['quiet']
    del runner_kwargs['verbose']

    runner = EMRJobRunner(**runner_kwargs)
    emr_job_flow_id = runner.make_persistent_job_flow()
    print emr_job_flow_id

def main():
    # parse command-line args
    option_parser = make_option_parser()
    options, args = option_parser.parse_args()
    if args:
        option_parser.error('takes no arguments')

    # set up logging
    if not options.quiet:
        log_to_stream(name='mrjob', debug=options.verbose)

    # create the persistent job
    runner_kwargs = options.__dict__.copy()
    del runner_kwargs['quiet']
    del runner_kwargs['verbose']

    runner = EMRJobRunner(**runner_kwargs)
    emr_job_flow_id = runner.make_persistent_job_flow()
    print emr_job_flow_id

def test_bootstrap_actions_get_added(self):
    bootstrap_actions = [
        's3://elasticmapreduce/bootstrap-actions/configure-hadoop'
        ' -m,mapred.tasktracker.map.tasks.maximum=1',
        's3://foo/bar#xyzzy',  # use alternate name for script
    ]

    runner = EMRJobRunner(conf_path=False,
                          bootstrap_actions=bootstrap_actions,
                          s3_sync_wait_time=0.01)

    job_flow_id = runner.make_persistent_job_flow()

    emr_conn = runner.make_emr_conn()
    job_flow = emr_conn.describe_jobflow(job_flow_id)
    actions = job_flow.bootstrapactions

    assert_equal(len(actions), 3)

    assert_equal(
        actions[0].path,
        's3://elasticmapreduce/bootstrap-actions/configure-hadoop')
    assert_equal(
        actions[0].args, ['-m,mapred.tasktracker.map.tasks.maximum=1'])
    assert_equal(actions[0].name, 'configure-hadoop')

    assert_equal(actions[1].path, 's3://foo/bar')
    assert_equal(actions[1].args, [])
    assert_equal(actions[1].name, 'xyzzy')

    # check for master bootstrap script
    assert actions[2].path.startswith('s3://mrjob-')
    assert actions[2].path.endswith('b.py')
    assert_equal(actions[2].args, [])
    assert_equal(actions[2].name, 'master')

    # make sure master bootstrap script is on S3
    assert runner.path_exists(actions[2].path)

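# As the two tests above exercise, the bootstrap_actions option packs each
# action into a single string: a script path (local or S3), an optional
# '#name' suffix to rename the script, then space-separated args.
# Illustrating with the values these tests use:
#
#   's3://foo/bar python-scipy mysql-server'
#       -> path='s3://foo/bar', args=['python-scipy', 'mysql-server']
#   's3://foo/bar#xyzzy'
#       -> path='s3://foo/bar', name='xyzzy', args=[]
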
def main(args=None):
    """Run the create_job_flow tool with arguments from ``sys.argv``,
    printing the new job flow ID to ``sys.stdout``."""
    runner = EMRJobRunner(**runner_kwargs(args))
    emr_job_flow_id = runner.make_persistent_job_flow()
    print(emr_job_flow_id)

def main(args=None):
    """Run the create_job_flow tool with arguments from ``sys.argv``,
    printing the new job flow ID to ``sys.stdout``."""
    runner = EMRJobRunner(**runner_kwargs(args))
    emr_job_flow_id = runner.make_persistent_job_flow()
    print emr_job_flow_id

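# runner_kwargs() is not shown in these excerpts. A plausible sketch, assuming
# it factors out the option handling that the earlier versions of main() did
# inline (parse args, set up logging, strip the non-runner options):
def runner_kwargs(args=None):
    option_parser = make_option_parser()
    options, leftover_args = option_parser.parse_args(args)
    if leftover_args:
        option_parser.error('takes no arguments')

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # everything left on the options object is a keyword arg for EMRJobRunner
    kwargs = options.__dict__.copy()
    del kwargs['quiet']
    del kwargs['verbose']

    return kwargs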