Example #1
    def csv(path,
            sep=',',
            header='true',
            infer_schema='true',
            charset="UTF-8",
            *args,
            **kwargs):
        """
        Return a dataframe from a csv file. It is the same read.csv Spark function with some predefined
        params

        :param path: path or location of the file.
        :param sep: usually delimiter mark are ',' or ';'.
        :param header: tell the function whether dataset has a header row. 'true' default.
        :param infer_schema: infers the input schema automatically from data.
        :param charset: Charset file encoding
        It requires one extra pass over the data. 'true' default.

        :return dataFrame
        """
        file, file_name = prepare_path(path, "csv")

        try:
            df = (Spark.instance.spark.read
                  .options(header=header)
                  .options(mode="DROPMALFORMED")
                  .options(delimiter=sep)
                  .options(inferSchema=infer_schema)
                  .option("charset", charset)
                  .csv(file, *args, **kwargs))

            df.set_meta("file_name", file_name)
        except IOError as error:
            logger.print(error)
            raise
        df = replace_columns_special_characters(df)
        return df
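A minimal usage sketch for this variant. The `op` session object and the file path are assumptions, following the Optimus-style API these snippets belong to:

    # Semicolon-delimited CSV in a non-default encoding
    df = op.load.csv("data/sales.csv", sep=";", charset="ISO-8859-1")
    df.show()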
Example #2
    def excel(path, sheet_name=0, *args, **kwargs):
        """
        Return a dataframe from a excel file.
        :param path: Path or location of the file. Must be string dataType
        :param sheet_name: excel sheet name
        :param args: custom argument to be passed to the excel function
        :param kwargs: custom keyword arguments to be passed to the excel function
        :return: Spark Dataframe
        """
        file, file_name = prepare_path(path, "xls")

        try:
            pdf = pd.read_excel(file, sheet_name=sheet_name, *args, **kwargs)

            # Cast object columns to string so that Spark can handle them. This reduces
            # exceptions when Spark tries to infer the column data types
            col_names = list(pdf.select_dtypes(include=['object']))

            column_dtype = {}
            for col in col_names:
                column_dtype[col] = str

            # Convert object columns to string
            pdf = pdf.astype(column_dtype)

            # Create spark data frame
            df = Spark.instance.spark.createDataFrame(pdf)
            df.set_meta("file_name", ntpath.basename(file_name))
        except IOError as error:
            logger.print(error)
            raise

        df = replace_columns_special_characters(df)
        return df
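A usage sketch under the same assumptions (the `op` accessor, path, and sheet layout are hypothetical). Note how extra keyword arguments flow straight into pandas.read_excel:

    # Read the second sheet, skipping two title rows (skiprows is a pandas.read_excel kwarg)
    df = op.load.excel("data/report.xlsx", sheet_name=1, skiprows=2)
    df.show()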
Example #3
    def csv(path, sep=',', header='true', infer_schema='true', *args, **kwargs):
        """
        Return a dataframe from a csv file. It is the same read.csv Spark function with some predefined
        params

        :param path: path or location of the file.
        :param sep: usually delimiter mark are ',' or ';'.
        :param header: tell the function whether dataset has a header row. 'true' default.
        :param infer_schema: infers the input schema automatically from data.
        It requires one extra pass over the data. 'true' default.

        :return dataFrame
        """
        if is_url(path):
            path = Load._data_loader(path, "csv")

        try:
            df = (Spark.instance.spark.read
                  .options(header=header)
                  .options(mode="DROPMALFORMED")
                  .options(delimiter=sep)
                  .options(inferSchema=infer_schema)
                  .csv(path, *args, **kwargs))
        except IOError as error:
            logger.print(error)
            raise
        return replace_columns_special_characters(df)
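Because this variant routes URLs through Load._data_loader, a remote file can be read directly. A sketch (the URL is illustrative only, `op` is assumed as above):

    # The loader fetches the remote file first, then hands the local path to Spark
    df = op.load.csv("https://example.com/data.csv", sep=",")
    df.printSchema()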
Example #4
    def json(path, multiline=False, *args, **kwargs):
        """
        Return a dataframe from a json file.
        :param path: path or location of the file.
        :param multiline:
        :return:
        """
        file, file_name = prepare_path(path, "json")

        try:
            df = Spark.instance.spark.read \
                .option("multiLine", multiline) \
                .option("mode", "PERMISSIVE") \
                .json(file, *args, **kwargs)

            df.set_meta("file_name", file_name)

        except IOError as error:
            logger.print(error)
            raise
        df = replace_columns_special_characters(df)

        df = df.action_meta("columns", df.cols.names())
        df.reset()
        return df
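A short sketch of the multiline switch (the `op` accessor and path are assumptions):

    # JSON Lines (one record per line) works with the default multiline=False;
    # a single record spread over several lines needs multiline=True
    df = op.load.json("data/events.json", multiline=True)
    df.show()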
Example #5
    def json(path, multiline=False, *args, **kwargs):
        """
        Return a dataframe from a json file.
        :param path: path or location of the file.
        :param multiline:
        :return:
        """
        if is_url(path):
            path = Load._data_loader(path, "json")

        try:
            # TODO: Find a less verbose way to handle Spark.instance.spark.
            df = Spark.instance.spark.read \
                .option("multiLine", multiline) \
                .option("mode", "PERMISSIVE") \
                .json(path, *args, **kwargs)

        except IOError as error:
            logger.print(error)
            raise
        return replace_columns_special_characters(df)
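As with the csv counterpart in Example #3, URLs are fetched before reading. A sketch (URL illustrative, `op` assumed):

    df = op.load.json("https://example.com/events.json")
    print(df.cols.names())  # cleaned names, after replace_columns_special_characters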