def extract_one_path(file_path, file_type, env):
    file_url_splits = file_path.split("://")
    prefix = file_url_splits[0]
    if prefix == "s3":
        file_paths = list_s3_file(file_url_splits[1], file_type, env)
    elif prefix == "hdfs":
        import pyarrow as pa
        fs = pa.hdfs.connect()
        if fs.isfile(file_path):
            return [file_path]
        else:
            file_paths = get_file_list(file_path)
            # only get json/csv files
            file_paths = [file for file in file_paths
                          if os.path.splitext(file)[1] == "." + file_type]
    else:
        if os.path.isfile(file_path):
            return [file_path]
        else:
            file_paths = get_file_list(file_path)
            # only get json/csv files
            file_paths = [file for file in file_paths
                          if os.path.splitext(file)[1] == "." + file_type]
    return file_paths
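# A minimal usage sketch for extract_one_path above. It assumes `os` and the
# helpers list_s3_file / get_file_list are imported at module level in the real
# codebase; the throwaway temporary file is created here only so the local
# single-file branch can run without those helpers.
import os
import tempfile

if __name__ == "__main__":
    # Create a temporary CSV file so os.path.isfile(file_path) is True.
    with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp:
        tmp.write(b"a,b\n1,2\n")
    # A plain local path contains no "://", so the prefix matches neither "s3"
    # nor "hdfs" and the single existing file is returned as-is.
    print(extract_one_path(tmp.name, "csv", os.environ))  # [tmp.name]
    os.remove(tmp.name)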
def extract_one_path(file_path, file_type, env):
    file_url_splits = file_path.split("://")
    prefix = file_url_splits[0]
    if prefix == "s3":
        file_paths = list_s3_file(file_url_splits[1], file_type, env)
    else:
        file_paths = get_file_list(file_path)
    # only get json/csv files
    file_paths = [file for file in file_paths
                  if os.path.splitext(file)[1] == "." + file_type]
    return file_paths
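# A small, self-contained sketch of the extension filter used in the variant
# above: only paths whose suffix equals "." + file_type are kept. The helper
# name _filter_by_type and the file names are hypothetical; os.path.splitext
# never touches the filesystem, so this runs as-is.
import os

def _filter_by_type(paths, file_type):
    # Keep only files whose extension matches the requested type.
    return [p for p in paths if os.path.splitext(p)[1] == "." + file_type]

print(_filter_by_type(["a.csv", "b.json", "c.txt", "d.csv"], "csv"))
# ['a.csv', 'd.csv']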
def extract_one_path(file_path, env):
    file_url_splits = file_path.split("://")
    prefix = file_url_splits[0]
    if prefix == "s3":
        file_paths = list_s3_file(file_url_splits[1], env)
    elif prefix == "hdfs":
        import pyarrow as pa
        fs = pa.hdfs.connect()
        if fs.isfile(file_path):
            file_paths = [file_path]
        else:
            file_paths = get_file_list(file_path)
    else:
        # Local file path; could be a relative path.
        from os.path import isfile, abspath, join
        if isfile(file_path):
            file_paths = [abspath(file_path)]
        else:
            # An error would be already raised here if the path is invalid.
            file_paths = [abspath(join(file_path, file))
                          for file in os.listdir(file_path)]
    return file_paths
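# A runnable sketch of the local-path branch in the variant above: a relative
# or bare directory path is expanded into absolute paths of its direct children
# via os.listdir + abspath + join. The temporary directory and file names are
# only for illustration.
import os
import tempfile
from os.path import abspath, join

with tempfile.TemporaryDirectory() as d:
    for name in ("part-0.json", "part-1.json"):
        open(join(d, name), "w").close()
    resolved = [abspath(join(d, f)) for f in os.listdir(d)]
    print(sorted(resolved))  # two absolute paths under the temporary directory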
def read_file_spark(context, file_path, file_type, **kwargs):
    file_url_splits = file_path.split("://")
    prefix = file_url_splits[0]
    node_num, core_num = get_node_and_core_number()

    # Resolve the input path into a list of data files, then spread them across
    # one partition per available core.
    if prefix == "s3":
        data_paths = list_s3_file(file_url_splits[1], file_type, os.environ)
    else:
        data_paths = get_file_list(file_path)
    rdd = context.parallelize(data_paths, node_num * core_num)

    if prefix == "hdfs":
        def loadFile(iterator):
            # Read each HDFS file in the partition into a pandas DataFrame.
            import pandas as pd
            import pyarrow as pa
            fs = pa.hdfs.connect()
            for x in iterator:
                with fs.open(x, 'rb') as f:
                    if file_type == "csv":
                        df = pd.read_csv(f, **kwargs)
                    elif file_type == "json":
                        df = pd.read_json(f, **kwargs)
                    else:
                        raise Exception("Unsupported file type")
                yield df

        pd_rdd = rdd.mapPartitions(loadFile)
    elif prefix == "s3":
        def loadFile(iterator):
            # Fetch each S3 object with boto3 and parse it into a pandas DataFrame.
            access_key_id = os.environ["AWS_ACCESS_KEY_ID"]
            secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]
            import boto3
            import pandas as pd
            s3_client = boto3.Session(
                aws_access_key_id=access_key_id,
                aws_secret_access_key=secret_access_key,
            ).client('s3', verify=False)
            for x in iterator:
                path_parts = x.split("://")[1].split('/')
                bucket = path_parts.pop(0)
                key = "/".join(path_parts)
                obj = s3_client.get_object(Bucket=bucket, Key=key)
                if file_type == "json":
                    df = pd.read_json(obj['Body'], **kwargs)
                elif file_type == "csv":
                    df = pd.read_csv(obj['Body'], **kwargs)
                else:
                    raise Exception("Unsupported file type")
                yield df

        pd_rdd = rdd.mapPartitions(loadFile)
    else:
        def loadFile(iterator):
            # Local (or otherwise directly readable) paths: let pandas open them.
            import pandas as pd
            for x in iterator:
                if file_type == "csv":
                    df = pd.read_csv(x, **kwargs)
                elif file_type == "json":
                    df = pd.read_json(x, **kwargs)
                else:
                    raise Exception("Unsupported file type")
                yield df

        pd_rdd = rdd.mapPartitions(loadFile)

    data_shards = SparkDataShards(pd_rdd)
    return data_shards
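# A hedged, self-contained sketch of the mapPartitions pattern that
# read_file_spark relies on for its local branch: each partition holds file
# paths, and a loader generator turns them into pandas DataFrames on the
# executors. It assumes pyspark and pandas are installed and runs with a local
# SparkContext; SparkDataShards and the other module-level helpers from the
# real codebase are not needed here, and load_csv_partition is a hypothetical
# stand-in for the loadFile closures above.
import os
import tempfile
import pandas as pd
from pyspark import SparkContext

def load_csv_partition(iterator):
    # Lazily read every CSV path in this partition into a DataFrame.
    for path in iterator:
        yield pd.read_csv(path)

if __name__ == "__main__":
    sc = SparkContext.getOrCreate()
    tmp_dir = tempfile.mkdtemp()
    paths = []
    for i in range(2):
        p = os.path.join(tmp_dir, "part-%d.csv" % i)
        pd.DataFrame({"x": [i, i + 1]}).to_csv(p, index=False)
        paths.append(p)
    rdd = sc.parallelize(paths, 2)
    dfs = rdd.mapPartitions(load_csv_partition).collect()
    print(len(dfs), "DataFrames loaded")  # 2 DataFrames loaded
    sc.stop()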