Пример #1
0
    def csv(path, sep=',', header='true', infer_schema='true', *args, **kwargs):
        """
        Load a csv file into a Spark dataframe using a set of predefined reader options.

        This wraps the Spark ``read.csv`` call; rows that Spark considers malformed are
        dropped (reader mode ``DROPMALFORMED``).

        :param path: path or location of the file. When it is a URL it is first handed to
        ``Load._data_loader`` and the value returned is read instead.
        :param sep: delimiter mark, usually ',' or ';'.
        :param header: tells the reader whether the dataset has a header row. 'true' default.
        :param infer_schema: infer the input schema automatically from the data.
        It requires one extra pass over the data. 'true' default.
        :param args: extra positional arguments forwarded to the Spark csv reader
        :param kwargs: extra keyword arguments forwarded to the Spark csv reader

        :return: dataframe passed through ``replace_columns_special_characters``
        """
        source = Load._data_loader(path, "csv") if is_url(path) else path

        reader_options = {
            "header": header,
            "mode": "DROPMALFORMED",
            "delimiter": sep,
            "inferSchema": infer_schema,
        }

        try:
            reader = Spark.instance.spark.read.options(**reader_options)
            loaded = reader.csv(source, *args, **kwargs)
        except IOError as error:
            # Log the failure and let the caller deal with the original exception
            logger.print(error)
            raise
        return replace_columns_special_characters(loaded)
Пример #2
0
    def excel(path, sheet_name=0, *args, **kwargs):
        """
        Return a Spark dataframe from an excel file.

        The file is read with pandas and the resulting pandas dataframe is converted to a
        Spark dataframe.

        :param path: Path or location of the file. Must be string dataType. When it is a URL
        it is first handed to ``Load._data_loader`` and the value returned is read instead.
        :param sheet_name: excel sheet name (or index). First sheet by default.
        :param args: custom positional arguments passed to ``pd.read_excel`` after ``sheet_name``
        :param kwargs: custom keyword arguments to be passed to ``pd.read_excel``
        :return: Spark Dataframe passed through ``replace_columns_special_characters``
        :raises IOError: logged and re-raised when the file cannot be read
        """
        if is_url(path):
            path = Load._data_loader(path, "xls")

        try:
            # sheet_name must be positional here: given as a keyword, anything in *args would
            # land in the same slot and raise "got multiple values for argument 'sheet_name'".
            pdf = pd.read_excel(path, sheet_name, *args, **kwargs)

            # Parse object column data type to string to ensure that Spark can handle it. With this we try to reduce
            # exception when Spark try to infer the column data type
            column_dtype = {col: str for col in pdf.select_dtypes(include=['object'])}

            # Convert object columns to string
            pdf = pdf.astype(column_dtype)

            # Create spark data frame
            df = Spark.instance.spark.createDataFrame(pdf)
        except IOError as error:
            logger.print(error)
            raise

        return replace_columns_special_characters(df)
Пример #3
0
def prepare_path(path, file_format):
    """
    Helper to resolve the file to be loaded together with its file name.

    :param path: Path to the file to be loaded
    :param file_format: format file; only used when ``path`` is a URL
    :return: tuple ``(file, file_name)``. ``file`` is the result of ``downloader`` for a URL,
    otherwise ``path`` itself; ``file_name`` is the base name of ``path``.
    """

    # The base name is always taken from the original path, before any download happens
    base_name = ntpath.basename(path)
    resolved = downloader(path, file_format) if is_url(path) else path
    return resolved, base_name
Пример #4
0
    def parquet(path, *args, **kwargs):
        """
        Load a parquet file into a Spark dataframe.

        :param path: path or location of the file. Must be string dataType. When it is a URL
        it is first handed to ``Load._data_loader`` and the value returned is read instead.
        :param args: custom argument to be passed to the spark parquet function
        :param kwargs: custom keyword arguments to be passed to the spark parquet function
        :return: Spark Dataframe
        """
        location = Load._data_loader(path, "parquet") if is_url(path) else path

        try:
            return Spark.instance.spark.read.parquet(location, *args, **kwargs)
        except IOError as error:
            # Log the failure and let the caller deal with the original exception
            logger.print(error)
            raise
Пример #5
0
    def json(path, multiline=False, *args, **kwargs):
        """
        Load a json file into a Spark dataframe.

        The reader runs in ``PERMISSIVE`` mode.

        :param path: path or location of the file. When it is a URL it is first handed to
        ``Load._data_loader`` and the value returned is read instead.
        :param multiline: value for the Spark ``multiLine`` reader option. False by default.
        :param args: extra positional arguments forwarded to the Spark json reader
        :param kwargs: extra keyword arguments forwarded to the Spark json reader
        :return: dataframe passed through ``replace_columns_special_characters``
        """
        if is_url(path):
            path = Load._data_loader(path, "json")

        try:
            # TODO: Check a better way to handle this Spark.instance.spark. Very verbose.
            reader = Spark.instance.spark.read
            reader = reader.option("multiLine", multiline)
            reader = reader.option("mode", "PERMISSIVE")
            loaded = reader.json(path, *args, **kwargs)
        except IOError as error:
            # Log the failure and let the caller deal with the original exception
            logger.print(error)
            raise
        return replace_columns_special_characters(loaded)
Пример #6
0
    def avro(path, *args, **kwargs):
        """
        Return a dataframe from a avro file.

        The data source name depends on the running Spark version: the external
        ``com.databricks.spark.avro`` package is used before Spark 2.4 and the built-in
        ``avro`` source from 2.4 onwards.

        :param path: path or location of the file. Must be string dataType. When it is a URL
        it is first handed to ``Load._data_loader`` and the value returned is read instead.
        :param args: custom argument to be passed to the spark avro function
        :param kwargs: custom keyword arguments to be passed to the spark avro function
        :return: Spark Dataframe
        :raises IOError: logged and re-raised when the file cannot be read
        """
        if is_url(path):
            path = Load._data_loader(path, "avro")

        try:
            if version.parse(Spark.instance.spark.version) < version.parse("2.4"):
                avro_version = "com.databricks.spark.avro"
            else:
                # The short name must match exactly; a trailing space makes the lookup fail
                avro_version = "avro"
            df = Spark.instance.spark.read.format(avro_version).load(path, *args, **kwargs)

        except IOError as error:
            logger.print(error)
            raise

        return df