Example #1
 import json
 import os

 from fabric.api import put, path, shell_env

 # eggo_config, build_dest_filename, exec_ctx, and wrun are defined at module
 # scope in eggo; wrun is eggo's wrapper that runs a command locally or on the
 # remote worker depending on the execution context.

 def do(config):
     with open(config, 'r') as ip:
         config_data = json.load(ip)
     dag_class = config_data['dag']
     # push the toast config to the remote machine
     toast_config_worker_path = os.path.join(
         eggo_config.get('worker_env', 'work_path'),
         build_dest_filename(config))
     put(local_path=config,
         remote_path=toast_config_worker_path)
     # TODO: run on central scheduler instead
     toast_cmd = ('toaster.py --local-scheduler {clazz} '
                  '--ToastConfig-config {toast_config}'.format(
                      clazz=dag_class,
                      toast_config=toast_config_worker_path))

     hadoop_bin = os.path.join(
         eggo_config.get('worker_env', 'hadoop_home'), 'bin')
     # toaster.py imports eggo_config on the worker, so EGGO_HOME and
     # EGGO_CONFIG must be set there; the AWS credentials are needed because
     # the dataset download pushes data to S3.
     # TODO: the AWS keys should only be added when the DFS is S3
     toast_env = {
         'EGGO_HOME': eggo_config.get('worker_env', 'eggo_home'),
         'EGGO_CONFIG': eggo_config.get('worker_env', 'eggo_config_path'),
         'LUIGI_CONFIG_PATH': eggo_config.get('worker_env', 'luigi_config_path'),
         'AWS_ACCESS_KEY_ID': eggo_config.get('aws', 'aws_access_key_id'),
         'AWS_SECRET_ACCESS_KEY': eggo_config.get('aws', 'aws_secret_access_key'),
         'SPARK_HOME': eggo_config.get('worker_env', 'spark_home'),
     }
     if exec_ctx == 'local':
         # copy the parent environment (preserving virtualenv vars) and
         # overlay the toast-specific settings on top
         env_copy = os.environ.copy()
         env_copy.update(toast_env)
         toast_env = env_copy
     with path(hadoop_bin):
         with shell_env(**toast_env):
             wrun(toast_cmd)
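For context, the toast config read by do() is a JSON file whose 'dag' key names the Luigi task class handed to toaster.py, and whose 'sources' list is what Examples #2 and #3 iterate over. A minimal sketch of such a config written from Python; only the 'dag', 'sources', 'url', and 'compression' keys appear on this page, and the class path and URL are hypothetical:

 import json

 example_toast_config = {
     "dag": "eggo.dag.ToastDataset",  # hypothetical Luigi task class
     "sources": [
         {"url": "http://example.com/data.vcf.gz", "compression": True},
     ],
 }
 with open('example_config.json', 'w') as op:
     json.dump(example_toast_config, op)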
Example #2
File: dag.py Project: ryan-williams/eggo
 def requires(self):
     # yield one download task per source listed in the toast config
     for source in ToastConfig().config["sources"]:
         dest_name = build_dest_filename(source["url"], decompress=source["compression"])
         yield DownloadFileToDFSTask(
             source=source["url"],
             target=os.path.join(self.destination, dest_name),
             compression=source["compression"],
         )
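The generator form of requires() used above makes each yielded task an independent upstream dependency, so all sources can be fetched (potentially in parallel) before anything downstream runs. A self-contained sketch of the same pattern, with FetchOne as a hypothetical stand-in for DownloadFileToDFSTask:

 import luigi

 class FetchOne(luigi.Task):
     # hypothetical stand-in for DownloadFileToDFSTask
     url = luigi.Parameter()

     def output(self):
         return luigi.LocalTarget('/tmp/fetched-%d.txt' % abs(hash(self.url)))

     def run(self):
         with self.output().open('w') as op:
             op.write(self.url)

 class FetchAll(luigi.WrapperTask):
     # complete only when every yielded dependency is done
     def requires(self):
         for url in ['http://example.com/a', 'http://example.com/b']:
             yield FetchOne(url=url)

 if __name__ == '__main__':
     luigi.build([FetchAll()], local_scheduler=True)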
Example #3
File: dag.py Project: ryan-williams/eggo
    def mapper(self, line):
        # Hadoop streaming input arrives as '<key>\t<json payload>'; drop the
        # key and rejoin the rest before parsing, in case the payload itself
        # contains tabs
        source = json.loads("\t".join(line.split("\t")[1:]))
        dest_name = build_dest_filename(source["url"], decompress=source["compression"])
        dest_url = os.path.join(self.destination, dest_name)
        if dest_url.startswith("s3:") or dest_url.startswith("s3n:"):
            # target lives on S3: authenticate with the configured AWS keys
            client = S3Client(
                eggo_config.get("aws", "aws_access_key_id"),
                eggo_config.get("aws", "aws_secret_access_key"),
            )
        else:
            client = HdfsClient()
        if not client.exists(dest_url):
            _dnload_to_local_upload_to_dfs(source["url"], dest_url, source["compression"])

        yield (source["url"], 1)  # dummy output
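The mapper's first statement is worth a standalone illustration: the leading key field is stripped and the remaining fields are rejoined before JSON-decoding, which keeps the payload intact even if it were to contain tabs. The input line below is made up for demonstration:

 import json

 # made-up streaming input line: '<key>\t<json payload>'
 line = '0\t{"url": "http://example.com/data.vcf.gz", "compression": true}'
 source = json.loads("\t".join(line.split("\t")[1:]))
 assert source["url"] == "http://example.com/data.vcf.gz"
 assert source["compression"] is True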