Example #1
import os


# list_s3_file and get_file_list are helpers assumed to be defined in the surrounding module.
def extract_one_path(file_path, file_type, env):
    # Split "scheme://path" to detect the file system: s3, hdfs, or local.
    file_url_splits = file_path.split("://")
    prefix = file_url_splits[0]
    if prefix == "s3":
        file_paths = list_s3_file(file_url_splits[1], file_type, env)
    elif prefix == "hdfs":
        import pyarrow as pa
        fs = pa.hdfs.connect()
        if fs.isfile(file_path):
            return [file_path]
        else:
            file_paths = get_file_list(file_path)
            # only get json/csv files
            file_paths = [
                file for file in file_paths
                if os.path.splitext(file)[1] == "." + file_type
            ]
    else:
        if os.path.isfile(file_path):
            return [file_path]
        else:
            file_paths = get_file_list(file_path)
            # only get json/csv files
            file_paths = [
                file for file in file_paths
                if os.path.splitext(file)[1] == "." + file_type
            ]
    return file_paths
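
A minimal usage sketch for the variant above, assuming the function and its helpers (list_s3_file, get_file_list) are in scope; the local directory and file names are purely illustrative:

import os

# Collect all CSV files under an illustrative local directory.
# env is only consulted for s3 paths, so os.environ is a safe default here.
csv_paths = extract_one_path("/tmp/data", "csv", os.environ)
print(csv_paths)  # e.g. ['/tmp/data/part-0.csv', '/tmp/data/part-1.csv']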
Example #2
import os


def extract_one_path(file_path, file_type, env):
    # Detect s3 paths; anything else goes through get_file_list.
    file_url_splits = file_path.split("://")
    prefix = file_url_splits[0]
    if prefix == "s3":
        file_paths = list_s3_file(file_url_splits[1], file_type, env)
    else:
        file_paths = get_file_list(file_path)
    # Keep only files whose extension matches file_type (json or csv).
    file_paths = [
        file for file in file_paths
        if os.path.splitext(file)[1] == "." + file_type
    ]
    return file_paths
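
This variant applies the extension filter uniformly after the branch. A hedged usage sketch for an s3 prefix (the bucket and key are illustrative, and list_s3_file is assumed to pick up AWS credentials from the env mapping, as the boto3-based example further below does):

import os

# List every JSON file under an illustrative s3 prefix.
json_paths = extract_one_path("s3://my-bucket/datasets/", "json", os.environ)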
Example #3
import os


def extract_one_path(file_path, env):
    # Split "scheme://path" to detect the file system: s3, hdfs, or local.
    file_url_splits = file_path.split("://")
    prefix = file_url_splits[0]
    if prefix == "s3":
        file_paths = list_s3_file(file_url_splits[1], env)
    elif prefix == "hdfs":
        import pyarrow as pa
        fs = pa.hdfs.connect()
        if fs.isfile(file_path):
            file_paths = [file_path]
        else:
            file_paths = get_file_list(file_path)
    else:  # Local file path; could be a relative path.
        from os.path import isfile, abspath, join
        if isfile(file_path):
            file_paths = [abspath(file_path)]
        else:
            # An error would be already raised here if the path is invalid.
            file_paths = [abspath(join(file_path, file)) for file in os.listdir(file_path)]
    return file_paths
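
This variant drops the file_type filter and resolves local relative paths to absolute ones. A small sketch, assuming the function above is in scope and "./data" is an illustrative directory:

import os

# Every entry directly under ./data comes back as an absolute path.
paths = extract_one_path("./data", os.environ)
print(paths)  # e.g. ['/home/user/project/data/a.csv', '/home/user/project/data/b.json']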
Example #4
import os


def read_file_spark(context, file_path, file_type, **kwargs):
    # Split "scheme://path" to detect the file system: s3, hdfs, or local.
    file_url_splits = file_path.split("://")
    prefix = file_url_splits[0]
    # Spread the file list over one partition per core in the cluster.
    node_num, core_num = get_node_and_core_number()

    if prefix == "s3":
        data_paths = list_s3_file(file_url_splits[1], file_type, os.environ)
    else:
        data_paths = get_file_list(file_path)
    rdd = context.parallelize(data_paths, node_num * core_num)

    if prefix == "hdfs":

        def loadFile(iterator):
            import pandas as pd
            import pyarrow as pa
            fs = pa.hdfs.connect()

            for x in iterator:
                with fs.open(x, 'rb') as f:
                    if file_type == "csv":
                        df = pd.read_csv(f, **kwargs)
                    elif file_type == "json":
                        df = pd.read_json(f, **kwargs)
                    else:
                        raise Exception("Unsupported file type")
                    yield df

        pd_rdd = rdd.mapPartitions(loadFile)
    elif prefix == "s3":

        def loadFile(iterator):
            access_key_id = os.environ["AWS_ACCESS_KEY_ID"]
            secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]
            import boto3
            import pandas as pd
            s3_client = boto3.Session(
                aws_access_key_id=access_key_id,
                aws_secret_access_key=secret_access_key,
            ).client('s3', verify=False)
            for x in iterator:
                path_parts = x.split("://")[1].split('/')
                bucket = path_parts.pop(0)
                key = "/".join(path_parts)
                obj = s3_client.get_object(Bucket=bucket, Key=key)
                if file_type == "json":
                    df = pd.read_json(obj['Body'], **kwargs)
                elif file_type == "csv":
                    df = pd.read_csv(obj['Body'], **kwargs)
                else:
                    raise Exception("Unsupported file type")
                yield df

        pd_rdd = rdd.mapPartitions(loadFile)
    else:

        def loadFile(iterator):
            import pandas as pd
            for x in iterator:
                if file_type == "csv":
                    df = pd.read_csv(x, **kwargs)
                elif file_type == "json":
                    df = pd.read_json(x, **kwargs)
                else:
                    raise Exception("Unsupported file type")
                yield df

        pd_rdd = rdd.mapPartitions(loadFile)

    data_shards = SparkDataShards(pd_rdd)
    return data_shards
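
A minimal driver-side sketch of calling read_file_spark; the SparkContext setup is standard pyspark, while read_file_spark itself and its helpers (get_node_and_core_number, list_s3_file, get_file_list, SparkDataShards) are assumed to come from the surrounding module, and the HDFS path is illustrative:

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
# Each partition reads its share of the matched files into pandas DataFrames,
# and the resulting RDD is wrapped in SparkDataShards.
shards = read_file_spark(sc, "hdfs://namenode:9000/data/events", "json")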