def run(self):
    adam_cmd = ("{adam_home}/bin/adam-submit --master {spark_master} flatten "
                "{source} {target}").format(
        adam_home=eggo_config.get("worker_env", "adam_home"),
        spark_master=eggo_config.get("worker_env", "spark_master"),
        source=ToastConfig().edition_url(edition=self.source_edition),
        target=ToastConfig().edition_url(edition=self.edition))
    check_call(adam_cmd, shell=True)
def teardown():
    teardown_cmd = ("{spark_home}/ec2/spark-ec2 -k {ec2_key_pair} "
                    "-i {ec2_private_key_file} destroy {stack_name}")
    interp_cmd = teardown_cmd.format(
        spark_home=eggo_config.get("client_env", "spark_home"),
        ec2_key_pair=eggo_config.get("aws", "ec2_key_pair"),
        ec2_private_key_file=eggo_config.get("aws", "ec2_private_key_file"),
        stack_name=eggo_config.get("spark_ec2", "stack_name"))
    local(interp_cmd)
def get_master_host():
    getmaster_cmd = ("{spark_home}/ec2/spark-ec2 -k {ec2_key_pair} "
                     "-i {ec2_private_key_file} get-master {stack_name}")
    interp_cmd = getmaster_cmd.format(
        spark_home=eggo_config.get("client_env", "spark_home"),
        ec2_key_pair=eggo_config.get("aws", "ec2_key_pair"),
        ec2_private_key_file=eggo_config.get("aws", "ec2_private_key_file"),
        stack_name=eggo_config.get("spark_ec2", "stack_name"))
    result = local(interp_cmd, capture=True)
    # the master hostname appears on the third line of spark-ec2's output
    return result.split("\n")[2].strip()
def create_SUCCESS_file(path):
    if path.startswith("s3:") or path.startswith("s3n:") or path.startswith("s3a:"):
        s3_client = S3Client(eggo_config.get("aws", "aws_access_key_id"),
                             eggo_config.get("aws", "aws_secret_access_key"))
        s3_client.put_string("", os.path.join(path, "_SUCCESS"))
    elif path.startswith("hdfs:"):
        hdfs_client = HdfsClient()
        hdfs_client.put("/dev/null", os.path.join(path, "_SUCCESS"))
    elif path.startswith("file:"):
        open(os.path.join(path, "_SUCCESS"), "a").close()
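# Illustrative usage sketch (not part of the original module). The URLs below
# are hypothetical; they only show which branch each scheme hits.
# create_SUCCESS_file("s3n://my-bucket/datasets/1kg/bdg/basic")  # S3: empty _SUCCESS key via S3Client
# create_SUCCESS_file("hdfs:///datasets/1kg/bdg/basic")          # HDFS: put /dev/null via HdfsClient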
def mapper(self, line):
    # drop the leading key field added by NLineInputFormat; rejoin the rest
    # in case the JSON itself contains tab characters
    source = json.loads("\t".join(line.split("\t")[1:]))
    dest_name = build_dest_filename(source["url"],
                                    decompress=source["compression"])
    dest_url = os.path.join(self.destination, dest_name)
    if dest_url.startswith("s3:") or dest_url.startswith("s3n:"):
        client = S3Client(eggo_config.get("aws", "aws_access_key_id"),
                          eggo_config.get("aws", "aws_secret_access_key"))
    else:
        client = HdfsClient()
    if not client.exists(dest_url):
        _dnload_to_local_upload_to_dfs(
            source["url"], dest_url, source["compression"])
    yield (source["url"], 1)  # dummy output
def do():
    with open(config, 'r') as ip:
        config_data = json.load(ip)
    dag_class = config_data['dag']
    # push the toast config to the remote machine
    toast_config_worker_path = os.path.join(
        eggo_config.get('worker_env', 'work_path'),
        build_dest_filename(config))
    put(local_path=config, remote_path=toast_config_worker_path)
    # TODO: run on central scheduler instead
    toast_cmd = ('toaster.py --local-scheduler {clazz} '
                 '--ToastConfig-config {toast_config}'.format(
                     clazz=dag_class, toast_config=toast_config_worker_path))
    hadoop_bin = os.path.join(eggo_config.get('worker_env', 'hadoop_home'),
                              'bin')
    toast_env = {
        # toaster.py imports eggo_config, which needs EGGO_HOME on the worker
        'EGGO_HOME': eggo_config.get('worker_env', 'eggo_home'),
        # eggo_config must also be initialized on the worker
        'EGGO_CONFIG': eggo_config.get('worker_env', 'eggo_config_path'),
        'LUIGI_CONFIG_PATH': eggo_config.get('worker_env', 'luigi_config_path'),
        # the dataset download pushes data to S3
        # TODO: should only be added if the dfs is S3
        'AWS_ACCESS_KEY_ID': eggo_config.get('aws', 'aws_access_key_id'),
        # TODO: should only be added if the dfs is S3
        'AWS_SECRET_ACCESS_KEY': eggo_config.get('aws', 'aws_secret_access_key'),
        'SPARK_HOME': eggo_config.get('worker_env', 'spark_home')}
    if exec_ctx == 'local':
        # this should copy vars that maintain venv info
        env_copy = os.environ.copy()
        env_copy.update(toast_env)
        toast_env = env_copy
    with path(hadoop_bin):
        with shell_env(**toast_env):
            wrun(toast_cmd)
def update_eggo():
    work_path = eggo_config.get('worker_env', 'work_path')
    venv_path = eggo_config.get('worker_env', 'venv_path')
    eggo_fork = eggo_config.get('versions', 'eggo_fork')
    eggo_branch = eggo_config.get('versions', 'eggo_branch')
    eggo_home = eggo_config.get('worker_env', 'eggo_home')

    def do():
        env.parallel = True
        if exec_ctx in ['director', 'spark_ec2']:
            wrun('rm -rf {0}'.format(eggo_home))
        install_eggo(work_path, eggo_home, eggo_fork, eggo_branch)

    execute(do, hosts=get_worker_hosts())
def run(self):
    delete_raw_cmd = "{hadoop_home}/bin/hadoop fs -rm -r {raw} {target}".format(
        hadoop_home=eggo_config.get("worker_env", "hadoop_home"),
        raw=ToastConfig().raw_data_url(),
        target=ToastConfig().dataset_url())
    check_call(delete_raw_cmd, shell=True)
def job_runner(self):
    addl_conf = {"mapred.map.tasks.speculative.execution": "false",
                 "mapred.task.timeout": 12000000}
    # TODO: can we delete the AWS vars with Director? does it set AWS cred
    # in core-site.xml?
    streaming_args = [
        "-cmdenv", "EGGO_HOME=" + eggo_config.get("worker_env", "eggo_home"),
        "-cmdenv", "EGGO_CONFIG=" + eggo_config.get("worker_env",
                                                    "eggo_config_path"),
        "-cmdenv", "AWS_ACCESS_KEY_ID=" + eggo_config.get(
            "aws", "aws_access_key_id"),
        "-cmdenv", "AWS_SECRET_ACCESS_KEY=" + eggo_config.get(
            "aws", "aws_secret_access_key"),
    ]
    return HadoopJobRunner(
        streaming_jar=eggo_config.get("worker_env", "streaming_jar"),
        streaming_args=streaming_args,
        jobconfs=addl_conf,
        input_format="org.apache.hadoop.mapred.lib.NLineInputFormat",
        output_format="org.apache.hadoop.mapred.lib.NullOutputFormat",
        end_job_with_atomic_move_dir=False)
def provision():
    if exec_ctx == 'spark_ec2':
        eggo.spark_ec2.provision()
    elif exec_ctx == 'director':
        eggo.director.provision()
    # at this point, get_master() should be valid

    # if the DFS is on the local fs, the directories may need to be created
    url = urlparse(eggo_config.get('dfs', 'dfs_root_url'))
    if url.scheme == 'file':
        local('mkdir -p {0}'.format(url.path))
        url = urlparse(eggo_config.get('dfs', 'dfs_raw_data_url'))
        local('mkdir -p {0}'.format(url.path))
        url = urlparse(eggo_config.get('dfs', 'dfs_tmp_data_url'))
        local('mkdir -p {0}'.format(url.path))

    # tag all the provisioned instances
    if exec_ctx in ['spark_ec2', 'director']:
        conn = connect_to_region(eggo_config.get(exec_ctx, 'region'))
        instances = conn.get_only_instances(
            filters={'key-name': [eggo_config.get('aws', 'ec2_key_pair')]})
        for instance in instances:
            instance.add_tag('owner', getuser())
            instance.add_tag('stack_name',
                             eggo_config.get(exec_ctx, 'stack_name'))
def run(self):
    format = ToastConfig().config["sources"][0]["format"].lower()
    if format not in self.allowed_file_formats:
        raise ValueError("Format '{0}' not in allowed formats {1}.".format(
            format, self.allowed_file_formats))

    # 1. Copy the data from source (e.g. S3) to Hadoop's default filesystem
    tmp_hadoop_path = "/tmp/{rand_id}.{format}".format(rand_id=random_id(),
                                                       format=format)
    distcp_cmd = "{hadoop_home}/bin/hadoop distcp {source} {target}".format(
        hadoop_home=eggo_config.get("worker_env", "hadoop_home"),
        source=ToastConfig().raw_data_url(),
        target=tmp_hadoop_path)
    check_call(distcp_cmd, shell=True)

    # 2. Run the adam-submit job
    adam_cmd = ("{adam_home}/bin/adam-submit --master {spark_master} "
                "{adam_command} {source} {target}").format(
        adam_home=eggo_config.get("worker_env", "adam_home"),
        spark_master=eggo_config.get("worker_env", "spark_master"),
        adam_command=self.adam_command,
        source=tmp_hadoop_path,
        target=ToastConfig().edition_url(edition=self.edition))
    check_call(adam_cmd, shell=True)
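# For illustration only (all values hypothetical, not from the original
# config): with adam_command == "vcf2adam", the two shell commands above
# would render roughly as:
#
#   /opt/hadoop/bin/hadoop distcp s3n://my-bucket/raw/1kg /tmp/abc123.vcf
#   /opt/adam/bin/adam-submit --master spark://master:7077 vcf2adam \
#       /tmp/abc123.vcf s3n://my-bucket/datasets/1kg/bdg/basic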
def run(self):
    tmp_dir = mkdtemp(prefix="tmp_eggo_",
                      dir=eggo_config.get("worker_env", "work_path"))
    try:
        # build the remote command for each source
        tmp_command_file = "{0}/command_file".format(tmp_dir)
        with open(tmp_command_file, "w") as command_file:
            for source in ToastConfig().config["sources"]:
                command_file.write("{0}\n".format(json.dumps(source)))
        # copy the command file to the Hadoop filesystem
        hdfs_client = HdfsClient()
        hdfs_client.mkdir(os.path.dirname(self.hdfs_path), True)
        hdfs_client.put(tmp_command_file, self.hdfs_path)
    finally:
        rmtree(tmp_dir)
def delete_toasted(config):
    with open(config, 'r') as ip:
        config_data = json.load(ip)
    url = os.path.join(eggo_config.get('dfs', 'dfs_root_url'),
                       config_data['name'])
    url = urlparse(url)
    if url.scheme == 's3n':
        conn = S3Connection()
        bucket = conn.get_bucket(url.netloc)
        keys = bucket.list(url.path.lstrip('/'))
        bucket.delete_keys(keys)
    elif url.scheme == 'file':
        rmtree(url.path, ignore_errors=True)
    else:
        raise NotImplementedError(
            "{0} dfs scheme not supported".format(url.scheme))
def provision():
    provision_cmd = ("{spark_home}/ec2/spark-ec2 -k {ec2_key_pair} "
                     "-i {ec2_private_key_file} -s {slaves} -t {type_} "
                     "-r {region} {zone_arg} {spot_price_arg} "
                     "--copy-aws-credentials launch {stack_name}")
    az = eggo_config.get("spark_ec2", "availability_zone")
    zone_arg = "--zone {0}".format(az) if az != "" else ""
    spot_price = eggo_config.get("spark_ec2", "spot_price")
    spot_price_arg = ("--spot-price {0}".format(spot_price)
                      if spot_price != "" else "")
    interp_cmd = provision_cmd.format(
        spark_home=eggo_config.get("client_env", "spark_home"),
        ec2_key_pair=eggo_config.get("aws", "ec2_key_pair"),
        ec2_private_key_file=eggo_config.get("aws", "ec2_private_key_file"),
        slaves=eggo_config.get("spark_ec2", "num_slaves"),
        type_=eggo_config.get("spark_ec2", "instance_type"),
        region=eggo_config.get("spark_ec2", "region"),
        zone_arg=zone_arg,
        spot_price_arg=spot_price_arg,
        stack_name=eggo_config.get("spark_ec2", "stack_name"))
    local(interp_cmd)
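# For illustration only (hypothetical values): with num_slaves=4,
# instance_type=m3.xlarge, region=us-east-1, an empty availability_zone, and
# no spot price set, the interpolated command passed to local() would look like
# (the empty zone/spot-price arguments simply leave extra whitespace):
#
#   /opt/spark/ec2/spark-ec2 -k my-keypair -i ~/.ssh/my-key.pem -s 4 \
#       -t m3.xlarge -r us-east-1   --copy-aws-credentials launch eggo-cluster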
def raw_data_url(self):
    return os.path.join(eggo_config.get("dfs", "dfs_raw_data_url"),
                        self.config["name"])
from cStringIO import StringIO

from fabric.api import (
    task, env, execute, local, open_shell, put, cd, run, prefix, shell_env,
    require, hosts, path, sudo, lcd)
from fabric.contrib.files import append, exists
from boto.ec2 import connect_to_region
from boto.s3.connection import S3Connection

import eggo.director
import eggo.spark_ec2
from eggo.util import build_dest_filename
from eggo.config import eggo_config, generate_luigi_cfg


exec_ctx = eggo_config.get('execution', 'context')
work_path = eggo_config.get('worker_env', 'work_path')
eggo_config_path = eggo_config.get('worker_env', 'eggo_config_path')
luigi_config_path = eggo_config.get('worker_env', 'luigi_config_path')
adam_fork = eggo_config.get('versions', 'adam_fork')
adam_branch = eggo_config.get('versions', 'adam_branch')
adam_home = eggo_config.get('worker_env', 'adam_home')
eggo_fork = eggo_config.get('versions', 'eggo_fork')
eggo_branch = eggo_config.get('versions', 'eggo_branch')
eggo_home = eggo_config.get('worker_env', 'eggo_home')
maven_version = eggo_config.get('versions', 'maven')

# the different execution contexts have different permissions
if exec_ctx == 'local':
    wrun = local
def _dnload_to_local_upload_to_dfs(source, destination, compression):
    # source: (string) URL suitable for curl
    # destination: (string) full URL of destination file name
    # compression: (bool) whether file needs to be decompressed
    tmp_local_dir = mkdtemp(prefix="tmp_eggo_",
                            dir=eggo_config.get("worker_env", "work_path"))
    try:
        # 1. dnload file
        dnload_cmd = "pushd {tmp_local_dir} && curl -L -O {source} && popd"
        check_call(dnload_cmd.format(tmp_local_dir=tmp_local_dir,
                                     source=source), shell=True)
        # 2. decompress if necessary
        if compression:
            compression_type = os.path.splitext(source)[-1]
            if compression_type == ".gz":
                decompr_cmd = "pushd {tmp_local_dir} && gunzip *.gz && popd"
            else:
                raise ValueError("Unknown compression type: {0}".format(
                    compression_type))
            check_call(decompr_cmd.format(tmp_local_dir=tmp_local_dir),
                       shell=True)
        try:
            # 3. upload to tmp distributed filesystem location (e.g. S3)
            tmp_staged_dir = os.path.join(
                eggo_config.get("dfs", "dfs_tmp_data_url"), "staged",
                random_id())
            # get the name of the local file that we're uploading
            local_files = os.listdir(tmp_local_dir)
            if len(local_files) != 1:
                # TODO: generate warning/error here
                pass
            filename = local_files[0]
            # ensure the dfs directory exists; this cmd may fail if the dir
            # already exists, but that's ok (though it shouldn't already exist)
            create_dir_cmd = "{hadoop_home}/bin/hadoop fs -mkdir -p {tmp_dfs_dir}"
            call(create_dir_cmd.format(
                     hadoop_home=eggo_config.get("worker_env", "hadoop_home"),
                     tmp_dfs_dir=tmp_staged_dir),
                 shell=True)
            upload_cmd = "{hadoop_home}/bin/hadoop fs -put {tmp_local_file} {tmp_dfs_file}"
            check_call(upload_cmd.format(
                           hadoop_home=eggo_config.get("worker_env", "hadoop_home"),
                           tmp_local_file=os.path.join(tmp_local_dir, filename),
                           tmp_dfs_file=os.path.join(tmp_staged_dir, filename)),
                       shell=True)
            # 4. rename to final target location
            rename_cmd = "{hadoop_home}/bin/hadoop fs -mv {tmp_path} {final_path}"
            check_call(rename_cmd.format(
                           hadoop_home=eggo_config.get("worker_env", "hadoop_home"),
                           tmp_path=os.path.join(tmp_staged_dir, filename),
                           final_path=destination),
                       shell=True)
        finally:
            pass  # TODO: clean up dfs tmp dir
    finally:
        rmtree(tmp_local_dir)
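# Illustrative call (not in the original module); the URLs are hypothetical.
# This would curl the gzipped file into a local temp dir, gunzip it, `hadoop
# fs -put` it into a staged temp location on the DFS, and finally `hadoop fs
# -mv` it to the destination URL.
# _dnload_to_local_upload_to_dfs(
#     source="ftp://example.org/pub/release/genotypes.vcf.gz",
#     destination="s3n://my-bucket/datasets/1kg/raw/genotypes.vcf",
#     compression=True)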
def dfs_tmp_data_url(self):
    return os.path.join(eggo_config.get("dfs", "dfs_tmp_data_url"),
                        self.config["name"],
                        eggo_config.get("execution", "random_id"))
def edition_url(self, format="bdg", edition="basic"):
    return os.path.join(eggo_config.get("dfs", "dfs_root_url"),
                        self.config["name"], format, edition)
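# For illustration only (hypothetical config values): with
# dfs_root_url = "s3n://my-bucket/datasets" and a dataset named "1kg",
# edition_url() returns "s3n://my-bucket/datasets/1kg/bdg/basic" by default,
# and edition_url(edition="flat") returns "s3n://my-bucket/datasets/1kg/bdg/flat".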
# regarding copyright ownership. The BDG licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys

from boto.ec2 import connect_to_region

from eggo.config import eggo_config


exec_ctx = eggo_config.get('execution', 'context')

# check that we're running on EC2
if exec_ctx not in ['spark_ec2', 'director']:
    sys.exit()

conn = connect_to_region(eggo_config.get(exec_ctx, 'region'))
instances = conn.get_only_instances(
    filters={'tag:stack_name': [eggo_config.get(exec_ctx, 'stack_name')]})
for instance in instances:
    print instance
    instance.terminate()
def dataset_url(self):
    return os.path.join(eggo_config.get("dfs", "dfs_root_url"),
                        self.config["name"])
import os
import sys
import time
from tempfile import mkdtemp
from datetime import datetime

import boto.ec2
import boto.cloudformation
from boto.ec2.networkinterface import (
    NetworkInterfaceCollection, NetworkInterfaceSpecification)
from fabric.api import local, env, run, execute, prefix, put, open_shell

from eggo.config import eggo_config


AWS_ACCESS_KEY_ID = eggo_config.get('aws', 'aws_access_key_id')
AWS_SECRET_ACCESS_KEY = eggo_config.get('aws', 'aws_secret_access_key')
EC2_KEY_PAIR = eggo_config.get('aws', 'ec2_key_pair')
EC2_PRIVATE_KEY_FILE = eggo_config.get('aws', 'ec2_private_key_file')
REGION = eggo_config.get('director', 'region')
LAUNCHER_INSTANCE_TYPE = eggo_config.get('director', 'launcher_instance_type')
LAUNCHER_AMI = eggo_config.get('director', 'launcher_ami')
CLUSTER_AMI = eggo_config.get('director', 'cluster_ami')
NUM_WORKERS = eggo_config.get('director', 'num_workers')
STACK_NAME = eggo_config.get('director', 'stack_name')
CLOUDFORMATION_TEMPLATE = eggo_config.get('director', 'cloudformation_template')
DIRECTOR_CONF_TEMPLATE = eggo_config.get('director', 'director_conf_template')


def provision():
    start_time = datetime.now()