def csv(path, sep=',', header='true', infer_schema='true', charset="UTF-8", *args, **kwargs):
    """
    Return a dataframe from a csv file. It is the same read.csv Spark function with some predefined params

    :param path: path or location of the file.
    :param sep: usually the delimiter mark is ',' or ';'.
    :param header: tell the function whether the dataset has a header row. 'true' by default.
    :param infer_schema: infers the input schema automatically from data.
    It requires one extra pass over the data. 'true' by default.
    :param charset: charset file encoding.
    :return: dataFrame
    """
    file, file_name = prepare_path(path, "csv")

    try:
        df = (Spark.instance.spark.read
              .options(header=header)
              .options(mode="DROPMALFORMED")
              .options(delimiter=sep)
              .options(inferSchema=infer_schema)
              .option("charset", charset)
              .csv(file, *args, **kwargs))
        df.set_meta("file_name", file_name)
    except IOError as error:
        logger.print(error)
        raise

    df = replace_columns_special_characters(df)
    return df
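
# A minimal standalone sketch of the "charset" reader option used above. Spark's
# CSV reader accepts "charset" as an alias for its "encoding" option; the file
# path below is hypothetical, and a plain SparkSession stands in for
# Spark.instance.spark.
from pyspark.sql import SparkSession

_spark_sketch = SparkSession.builder.getOrCreate()
_df_latin1 = (_spark_sketch.read
              .option("header", "true")
              .option("charset", "ISO-8859-1")  # decode a Latin-1 encoded file
              .csv("data/latin1.csv"))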
def excel(path, sheet_name=0, *args, **kwargs):
    """
    Return a dataframe from an excel file.

    :param path: path or location of the file. Must be a string.
    :param sheet_name: excel sheet name.
    :param args: custom arguments to be passed to the excel function.
    :param kwargs: custom keyword arguments to be passed to the excel function.
    :return: Spark dataframe
    """
    file, file_name = prepare_path(path, "xls")

    try:
        pdf = pd.read_excel(file, sheet_name=sheet_name, *args, **kwargs)

        # Cast object columns to string so that Spark can handle them. This reduces
        # exceptions when Spark tries to infer the column data types.
        col_names = list(pdf.select_dtypes(include=['object']))

        column_dtype = {}
        for col in col_names:
            column_dtype[col] = str

        # Convert object columns to string
        pdf = pdf.astype(column_dtype)

        # Create the Spark dataframe
        df = Spark.instance.spark.createDataFrame(pdf)
        df.set_meta("file_name", ntpath.basename(file_name))
    except IOError as error:
        logger.print(error)
        raise

    df = replace_columns_special_characters(df)
    return df
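
# Standalone sketch of the object-to-string cast performed above (pandas only,
# no Spark required). Columns of dtype "object" can hold mixed Python types,
# which makes createDataFrame() schema inference fail; casting them to str
# first gives Spark a uniform column. The sample data is illustrative.
import pandas as pd

_pdf = pd.DataFrame({"a": [1, "x", None], "b": [1, 2, 3]})  # "a" has dtype object
_object_cols = list(_pdf.select_dtypes(include=["object"]))
_pdf = _pdf.astype({col: str for col in _object_cols})
print(_pdf.dtypes)  # "a" is now a uniform string column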
def csv(path, sep=',', header='true', infer_schema='true', *args, **kwargs):
    """
    Return a dataframe from a csv file. It is the same read.csv Spark function with some predefined params

    :param path: path or location of the file.
    :param sep: usually the delimiter mark is ',' or ';'.
    :param header: tell the function whether the dataset has a header row. 'true' by default.
    :param infer_schema: infers the input schema automatically from data.
    It requires one extra pass over the data. 'true' by default.
    :return: dataFrame
    """
    if is_url(path):
        path = Load._data_loader(path, "csv")

    try:
        df = (Spark.instance.spark.read
              .options(header=header)
              .options(mode="DROPMALFORMED")
              .options(delimiter=sep)
              .options(inferSchema=infer_schema)
              .csv(path, *args, **kwargs))
    except IOError as error:
        logger.print(error)
        raise

    return replace_columns_special_characters(df)
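
# Hypothetical sketch of the is_url() check used above; the real helper may
# differ. The intent is simply to route remote resources through the downloader
# before handing a local path to Spark.
from urllib.parse import urlparse

def _is_url_sketch(path):
    # Anything with an http(s)/ftp scheme is treated as a remote file.
    return urlparse(path).scheme in ("http", "https", "ftp")

print(_is_url_sketch("https://example.com/data.csv"))  # True
print(_is_url_sketch("data/local.csv"))                # False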
def json(path, multiline=False, *args, **kwargs):
    """
    Return a dataframe from a json file.

    :param path: path or location of the file.
    :param multiline: whether each json record spans multiple lines. False by default.
    :return: dataFrame
    """
    file, file_name = prepare_path(path, "json")

    try:
        df = Spark.instance.spark.read \
            .option("multiLine", multiline) \
            .option("mode", "PERMISSIVE") \
            .json(file, *args, **kwargs)
        df.set_meta("file_name", file_name)
    except IOError as error:
        logger.print(error)
        raise

    df = replace_columns_special_characters(df)
    df = df.action_meta("columns", df.cols.names())
    df.reset()
    return df
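
# Minimal sketch of what mode="PERMISSIVE" buys for JSON reads: malformed
# records are kept and surface in Spark's "_corrupt_record" column instead of
# failing the whole read. The temp file below is created just for the demo.
from pyspark.sql import SparkSession

_spark_sketch = SparkSession.builder.getOrCreate()
with open("/tmp/permissive_demo.json", "w") as f:
    f.write('{"a": 1}\n{"a": ')  # second line is deliberately malformed
_demo_df = _spark_sketch.read.option("mode", "PERMISSIVE").json("/tmp/permissive_demo.json")
_demo_df.show()  # the broken line appears under _corrupt_record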
def json(path, multiline=False, *args, **kwargs):
    """
    Return a dataframe from a json file.

    :param path: path or location of the file.
    :param multiline: whether each json record spans multiple lines. False by default.
    :return: dataFrame
    """
    if is_url(path):
        path = Load._data_loader(path, "json")

    try:
        # TODO: Check a better way to handle this Spark.instance.spark. Very verbose.
        df = Spark.instance.spark.read \
            .option("multiLine", multiline) \
            .option("mode", "PERMISSIVE") \
            .json(path, *args, **kwargs)
    except IOError as error:
        logger.print(error)
        raise

    return replace_columns_special_characters(df)
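
# Sketch of the multiLine flag: with multiline=False (the default above) Spark
# expects one JSON object per input line; set it to True for a file holding a
# single pretty-printed document or a top-level array. The path is hypothetical.
from pyspark.sql import SparkSession

_spark_sketch = SparkSession.builder.getOrCreate()
_pretty_df = (_spark_sketch.read
              .option("multiLine", True)
              .option("mode", "PERMISSIVE")
              .json("data/pretty_records.json"))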