Example #1
import os

from luigi.s3 import S3Client      # assumed import path (varies across luigi versions)
from luigi.hdfs import HdfsClient  # assumed import path
from eggo.config import eggo_config  # assumed location of eggo's config object


def create_SUCCESS_file(path):
    # Write an empty Hadoop-style _SUCCESS marker; the URL scheme picks the client.
    if path.startswith("s3:") or path.startswith("s3n:") or path.startswith("s3a:"):
        s3_client = S3Client(
            eggo_config.get("aws", "aws_access_key_id"),
            eggo_config.get("aws", "aws_secret_access_key"),
        )
        s3_client.put_string("", os.path.join(path, "_SUCCESS"))
    elif path.startswith("hdfs:"):
        hdfs_client = HdfsClient()
        hdfs_client.put("/dev/null", os.path.join(path, "_SUCCESS"))
    elif path.startswith("file:"):
        # Local filesystem: just touch an empty file.
        open(os.path.join(path, "_SUCCESS"), "a").close()
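
A minimal usage sketch (the dataset paths below are made up): call the function once a dataset directory has been fully written, and the scheme prefix determines where the marker lands.

# Hypothetical paths; the scheme prefix selects the S3 or HDFS client.
create_SUCCESS_file("hdfs:///datasets/1kg/processed")
create_SUCCESS_file("s3n://example-bucket/datasets/1kg/processed")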
Example #2
    def mapper(self, line):
        # The input line is "<key>\t<json record>"; rejoin the trailing fields
        # in case the JSON itself contains tab characters.
        source = json.loads("\t".join(line.split("\t")[1:]))
        dest_name = build_dest_filename(source["url"], decompress=source["compression"])
        dest_url = os.path.join(self.destination, dest_name)
        # Pick the client that matches the destination URL scheme.
        if dest_url.startswith("s3:") or dest_url.startswith("s3n:"):
            client = S3Client(
                eggo_config.get("aws", "aws_access_key_id"),
                eggo_config.get("aws", "aws_secret_access_key"),
            )
        else:
            client = HdfsClient()
        # Download and upload only if the target does not already exist.
        if not client.exists(dest_url):
            _dnload_to_local_upload_to_dfs(source["url"], dest_url, source["compression"])

        yield (source["url"], 1)  # dummy output to satisfy the MapReduce contract
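
To make the mapper's first line concrete, here is a sketch of the expected input: a leading key field, a tab, then the JSON source record; rejoining the trailing fields protects JSON that happens to contain tabs. The key and record contents below are hypothetical.

import json

line = '0\t{"url": "http://example.com/sample.vcf.gz", "compression": true}'
source = json.loads("\t".join(line.split("\t")[1:]))
assert source["compression"] is True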
Example #3
    def run(self):
        tmp_dir = mkdtemp(prefix="tmp_eggo_", dir=eggo_config.get("worker_env", "work_path"))
        try:
            # Build the command file locally: one JSON-encoded source per line.
            tmp_command_file = "{0}/command_file".format(tmp_dir)
            with open(tmp_command_file, "w") as command_file:
                for source in ToastConfig().config["sources"]:
                    command_file.write("{0}\n".format(json.dumps(source)))

            # Copy the command file to the Hadoop filesystem, creating
            # parent directories as needed.
            hdfs_client = HdfsClient()
            hdfs_client.mkdir(os.path.dirname(self.hdfs_path), True)
            hdfs_client.put(tmp_command_file, self.hdfs_path)
        finally:
            # Always clean up the local scratch directory.
            rmtree(tmp_dir)
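
For illustration, the command file ends up with one JSON document per line, which a mapper like the one in Example #2 can consume record by record. A sketch with hypothetical sources standing in for ToastConfig().config["sources"]:

import json

sources = [
    {"url": "http://example.com/a.vcf.gz", "compression": True},
    {"url": "http://example.com/b.vcf", "compression": False},
]
for source in sources:
    print(json.dumps(source))
# -> {"url": "http://example.com/a.vcf.gz", "compression": true}
# -> {"url": "http://example.com/b.vcf", "compression": false}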