def test_bootstrap_actions():
    bootstrap_actions = [
        BootstrapAction(
            name="bs1",
            path="path/to/script",
            bootstrap_action_args=["arg1", "arg2&arg3"],
        ),
        BootstrapAction(name="bs2", path="path/to/anotherscript", bootstrap_action_args=[]),
    ]

    conn = boto.connect_emr()
    cluster_id = conn.run_jobflow(bootstrap_actions=bootstrap_actions, **run_jobflow_args)

    jf = conn.describe_jobflow(cluster_id)
    for x, y in zip(jf.bootstrapactions, bootstrap_actions):
        x.name.should.equal(y.name)
        x.path.should.equal(y.path)
        list(o.value for o in x.args).should.equal(y.args())

    resp = conn.list_bootstrap_actions(cluster_id)
    for i, y in enumerate(bootstrap_actions):
        x = resp.actions[i]
        x.name.should.equal(y.name)
        x.scriptpath.should.equal(y.path)
        list(arg.value for arg in x.args).should.equal(y.args())
def get_bootstrap_actions(self):
    '''Get list of bootstrap actions from property'''
    actions = []
    for bootstrap_action in self.prop.emr.bootstrap_actions:
        assert len(bootstrap_action) >= 2, \
            'Wrong bootstrap action definition: ' + str(bootstrap_action)
        actions.append(
            BootstrapAction(bootstrap_action[0], bootstrap_action[1],
                            bootstrap_action[2:]))
    return actions
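# A small hedged sketch of the shape this helper expects in
# self.prop.emr.bootstrap_actions: each entry is a list whose first two items
# are the action name and script path, and any remaining items are passed
# through as script arguments. The concrete names, paths and flags below are
# illustrative placeholders, not values from the original configuration.
bootstrap_actions_config = [
    ['install-deps', 's3://my-bucket/bootstrap/install-deps.sh', '--verbose'],
    ['configure-hadoop', 's3://elasticmapreduce/bootstrap-actions/configure-hadoop'],
]
# get_bootstrap_actions() would turn each entry into
# BootstrapAction(entry[0], entry[1], entry[2:])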
def run_emr(profile, input_path, output_path, errors_path, log_path, ec2_keyname):
    c = boto.connect_s3(profile_name=profile)
    jar_bucket = c.get_bucket(input_path.split("/")[0])
    r = get_valid_region(jar_bucket.get_location())

    bootstrap_actions = [
        BootstrapAction("Install Spark",
                        "s3://support.elasticmapreduce/spark/install-spark",
                        ["-x"])
    ]

    args = [
        "/home/hadoop/spark/bin/spark-submit",
        "--deploy-mode", "cluster",
        "--master", "yarn-cluster",
        "--class", "com.snowplowanalytics.schemaguru.sparkjob.SchemaDeriveJob",
        "s3://snowplow-hosted-assets/schema-guru/spark/" + JAR_FILE,
        "--ndjson",                                 # assuming your source files contain many JSONs each, one per line
        "--errors-path", "s3n://" + errors_path,    # trailing slash is required
        "--output", "s3n://" + output_path,         # ...here too
        "s3n://" + input_path,                      # ...here too
    ]

    steps = [
        InstallHiveStep(),
        ScriptRunnerStep("Run SchemaDeriveJob", step_args=args)
    ]

    conn = boto.emr.connect_to_region(r, profile_name=profile)
    job_id = conn.run_jobflow(name="Schema Derive Spark",
                              log_uri="s3://" + log_path,
                              ec2_keyname=ec2_keyname,
                              master_instance_type="m3.xlarge",
                              slave_instance_type="m3.xlarge",
                              num_instances=3,
                              enable_debugging=True,
                              ami_version="3.8",
                              steps=steps,
                              bootstrap_actions=bootstrap_actions,
                              job_flow_role="EMR_EC2_DefaultRole",
                              service_role="EMR_DefaultRole")

    print("Started jobflow " + job_id)
def launch_cluster(script_name, keep_alive=False, instance_types=None, subnet_id=None):
    '''launch new cluster'''
    if instance_types is None:
        instance_type = 'm2.4xlarge'
        instance_count = 3
    else:
        match = re.match(r'^([^:]+)(:\d+)?$', instance_types)
        if not match:
            raise ValueError('invalid instance types: %s' % instance_types)
        instance_type, instance_count = match.groups()
        # the ":count" suffix is optional; fall back to 3 core nodes if omitted
        instance_count = int(instance_count[1:]) if instance_count else 3

    instance_groups = [
        InstanceGroup(1, 'MASTER', instance_type, 'ON_DEMAND', 'MASTER_GROUP'),
        InstanceGroup(instance_count, 'CORE', instance_type, 'ON_DEMAND', 'CORE_GROUP')
    ]
    bootstrap_actions = [
        BootstrapAction('install-pig', install_pig_script, [pig_version]),
    ]

    api_params = {}
    if subnet_id is not None:
        api_params['Instances.Ec2SubnetId'] = subnet_id

    name = name_prefix + '-' + script_name
    jobid = emr_conn.run_jobflow(name=name,
                                 keep_alive=keep_alive,
                                 ami_version=ami_version,
                                 visible_to_all_users=True,
                                 ec2_keyname=ec2_keyname,
                                 service_role='EMR_DefaultRole',
                                 job_flow_role='EMR_EC2_DefaultRole',
                                 log_uri=log_uri,
                                 action_on_failure='CONTINUE',
                                 instance_groups=instance_groups,
                                 bootstrap_actions=bootstrap_actions,
                                 api_params=api_params)
    print('launched %s (%s)' % (name, jobid))
    return jobid
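# A brief usage sketch for the instance_types argument above, which accepts
# either 'type' or 'type:count'. The script name is a placeholder and the
# module-level configuration (emr_conn, name_prefix, install_pig_script,
# pig_version, ami_version, ec2_keyname, log_uri) is assumed to be set up
# elsewhere in the original module.
launch_cluster('daily_report.pig')                  # defaults: 1 master + 3 core m2.4xlarge nodes
launch_cluster('daily_report.pig',                  # explicit type and core-node count,
               keep_alive=True,                     # keeping the cluster alive after the job
               instance_types='m3.xlarge:5')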
def run_emr(profile, bucket, ec2_keyname, vpc_subnet_id):
    c = boto.connect_s3(profile_name=profile)
    b = c.get_bucket(bucket)
    r = get_valid_region(b.get_location())

    bootstrap_actions = [
        BootstrapAction("Install Spark",
                        "s3://support.elasticmapreduce/spark/install-spark",
                        ["-x"])
    ]

    args = [
        "/home/hadoop/spark/bin/spark-submit",
        "--deploy-mode", "cluster",
        "--master", "yarn-cluster",
        "--class", "com.snowplowanalytics.spark.WordCountJob",
        "s3://" + bucket + "/jar/" + JAR_FILE,
        "s3n://" + bucket + "/" + HELLO_TXT,
        "s3n://" + bucket + "/out"
    ]

    steps = [
        InstallHiveStep(),
        ScriptRunnerStep("Run WordCountJob", step_args=args)
    ]

    conn = boto.emr.connect_to_region(r, profile_name=profile)
    job_id = conn.run_jobflow(name="Spark Example Project",
                              log_uri="s3://" + bucket + "/logs",  # "/" added so logs land under the bucket's logs/ prefix
                              ec2_keyname=ec2_keyname,
                              master_instance_type="m3.xlarge",
                              slave_instance_type="m3.xlarge",
                              num_instances=3,
                              enable_debugging=True,
                              ami_version="3.6",
                              steps=steps,
                              bootstrap_actions=bootstrap_actions,
                              job_flow_role="EMR_EC2_DefaultRole",
                              service_role="EMR_DefaultRole")

    print("Started jobflow " + job_id)
def _bootstrap_actions(cls):
    name = cls.BOOTSTRAP_NAME
    path = cls.BOOTSTRAP_SCRIPT
    bootstrap_action_args = [g.TRAFFIC_SRC_DIR, g.tracking_secret]
    bootstrap = BootstrapAction(name, path, bootstrap_action_args)
    return [bootstrap]
    if o in ('--test',):
        params['test_mode'] = True

required = ['aws_key', 'secret', 'keypair']
for pname in required:
    if not params.get(pname, None):
        print('\nERROR: %s is required' % pname)
        usage()

for p, v in params.items():
    print("param:" + repr(p) + " value:" + repr(v))

conn = boto.connect_emr(params['aws_key'], params['secret'])

bootstrap_step1 = BootstrapAction("install_cc",
                                  "s3://commoncrawl-public/config64.sh",
                                  [params['aws_key'], params['secret']])
bootstrap_step2 = BootstrapAction(
    "configure_hadoop",
    "s3://elasticmapreduce/bootstrap-actions/configure-hadoop",
    [
        "-m", "mapred.tasktracker.map.tasks.maximum=8",
        "-m", "mapred.child.java.opts=-XX:ErrorFile=/tmp/hs_err_${mapred.tip.id}.log -Xmx700m -XX:+UseParNewGC -XX:ParallelGCThreads=8 -XX:NewSize=100m -XX:+UseConcMarkSweepGC -XX:+UseTLAB -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:CMSIncrementalDutyCycle=10"
    ])
bootstrap_step3 = BootstrapAction("configure_jobtrackerheap",
                                  "s3://elasticmapreduce/bootstrap-actions/configure-daemons",
                                  ["--jobtracker-heap-size=12096"])

namenode_instance_group = InstanceGroup(1, "MASTER", "c1.xlarge", "ON_DEMAND", "MASTER_GROUP")
core_instance_group = InstanceGroup(params['num_core'], "CORE", "c1.xlarge", "ON_DEMAND", "CORE_GROUP")

instance_groups = []
if params['num_spot'] <= 0:
    instance_groups = [namenode_instance_group, core_instance_group]
else:
from boto.emr.bootstrap_action import BootstrapAction
from boto.emr.connection import EmrConnection

# Description:
# BootstrapAction is an object representing a bootstrap action in Elastic Map
# Reduce (EMR), a script that gets run before the EMR job executes.

# initialize a bootstrap action
bootstrapSetup = BootstrapAction("Bootstrap Name",
                                 "s3://<my-bucket>/<my-bootstrap-action>",
                                 ["arg1=hello", "arg2=world"])

# initialize emr connection
emr_job = EmrConnection("<aws-access-key-id>", "<aws-secret-access-key>")

# run emr job flow with defined bootstrap action
# (run_jobflow also requires a job flow name; the id of the new flow is returned)
jobflow_id = emr_job.run_jobflow(name="Bootstrap Example",
                                 bootstrap_actions=[bootstrapSetup])
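# A minimal follow-up sketch, assuming the connection and job flow id from the
# example just above: the registered bootstrap actions can be read back with
# describe_jobflow. The attribute names used here (bootstrapactions, name,
# path, args[...].value) are the same ones exercised by the
# test_bootstrap_actions example at the top of this section.
jobflow = emr_job.describe_jobflow(jobflow_id)
for action in jobflow.bootstrapactions:
    print(action.name, action.path, [arg.value for arg in action.args])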
def __init__(self):
    name = 'memory intensive'
    path = 's3://elasticmapreduce/bootstrap-actions/' \
           'configurations/latest/memory-intensive'
    args = []
    BootstrapAction.__init__(self, name, path, args)
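# A hedged usage sketch for the constructor above. The enclosing class name is
# not shown in the snippet, so MemoryIntensiveBootstrapAction is assumed here
# purely for illustration, as are the cluster name and instance settings.
import boto
from boto.emr.bootstrap_action import BootstrapAction

class MemoryIntensiveBootstrapAction(BootstrapAction):   # hypothetical name
    def __init__(self):
        path = ('s3://elasticmapreduce/bootstrap-actions/'
                'configurations/latest/memory-intensive')
        BootstrapAction.__init__(self, 'memory intensive', path, [])

conn = boto.connect_emr()
jobflow_id = conn.run_jobflow(
    name='memory-intensive-cluster',                     # illustrative values throughout
    bootstrap_actions=[MemoryIntensiveBootstrapAction()],
    master_instance_type='m1.xlarge',
    slave_instance_type='m1.xlarge',
    num_instances=3,
)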