def csv(path, sep=',', header='true', infer_schema='true', *args, **kwargs):
    """
    Load a csv file into a dataframe using Spark's csv reader with a fixed set of reader options.

    :param path: path or location of the file. When is_url(path) is true the path is first handed to
        Load._data_loader and the value it returns is read instead.
    :param sep: field delimiter, usually ',' or ';'. Forwarded as the reader's "delimiter" option.
    :param header: whether the dataset has a header row. 'true' by default.
    :param infer_schema: whether the schema is inferred from the data (requires one extra pass over
        the data). 'true' by default.
    :param args: extra positional arguments forwarded to the Spark csv reader
    :param kwargs: extra keyword arguments forwarded to the Spark csv reader
    :return: the loaded dataframe after being passed through replace_columns_special_characters
    """
    source = Load._data_loader(path, "csv") if is_url(path) else path

    # The reader always runs in DROPMALFORMED mode; the remaining options come from the caller.
    reader_options = {
        "header": header,
        "mode": "DROPMALFORMED",
        "delimiter": sep,
        "inferSchema": infer_schema,
    }

    try:
        reader = Spark.instance.spark.read.options(**reader_options)
        loaded = reader.csv(source, *args, **kwargs)
    except IOError as error:
        # Log the failure, then let the caller see the original exception.
        logger.print(error)
        raise
    else:
        return replace_columns_special_characters(loaded)
def excel(path, sheet_name=0, *args, **kwargs):
    """
    Load an excel sheet into a Spark dataframe, going through pandas.

    :param path: Path or location of the file. Must be string dataType. When is_url(path) is true the
        path is first handed to Load._data_loader and the value it returns is read instead.
    :param sheet_name: excel sheet to read, forwarded to pandas read_excel
    :param args: custom argument to be passed to the excel function
    :param kwargs: custom keyword arguments to be passed to the excel function
    :return: Spark Dataframe, after being passed through replace_columns_special_characters
    """
    source = Load._data_loader(path, "xls") if is_url(path) else path

    try:
        pandas_frame = pd.read_excel(source, *args, sheet_name=sheet_name, **kwargs)

        # Object-typed columns are cast to str before handing the data to Spark, to reduce
        # failures when Spark tries to infer each column's data type.
        object_columns = list(pandas_frame.select_dtypes(include=['object']))
        pandas_frame = pandas_frame.astype(dict.fromkeys(object_columns, str))

        spark_frame = Spark.instance.spark.createDataFrame(pandas_frame)
    except IOError as error:
        # Log the failure, then let the caller see the original exception.
        logger.print(error)
        raise
    else:
        return replace_columns_special_characters(spark_frame)
def prepare_path(path, file_format):
    """
    Helper that resolves what should be loaded for a given path, together with the file name.

    :param path: Path to the file to be loaded
    :param file_format: file format, forwarded to downloader when the path is a url
    :return: tuple (file, file_name). file is the result of downloader(path, file_format) when
        is_url(path) is true, otherwise the path itself; file_name is ntpath.basename(path).
    """
    # The name is always derived from the path as given, before any download happens.
    base_name = ntpath.basename(path)
    target = downloader(path, file_format) if is_url(path) else path
    return target, base_name
def parquet(path, *args, **kwargs):
    """
    Load a parquet file into a Spark dataframe.

    :param path: path or location of the file. Must be string dataType. When is_url(path) is true the
        path is first handed to Load._data_loader and the value it returns is read instead.
    :param args: custom argument to be passed to the spark parquet function
    :param kwargs: custom keyword arguments to be passed to the spark parquet function
    :return: Spark Dataframe, returned exactly as produced by the reader
    """
    source = Load._data_loader(path, "parquet") if is_url(path) else path

    try:
        reader = Spark.instance.spark.read
        loaded = reader.parquet(source, *args, **kwargs)
    except IOError as error:
        # Log the failure, then let the caller see the original exception.
        logger.print(error)
        raise
    else:
        return loaded
def json(path, multiline=False, *args, **kwargs):
    """
    Load a json file into a Spark dataframe.

    :param path: path or location of the file. When is_url(path) is true the path is first handed to
        Load._data_loader and the value it returns is read instead.
    :param multiline: value forwarded as the reader's "multiLine" option. False by default.
    :param args: extra positional arguments forwarded to the Spark json reader
    :param kwargs: extra keyword arguments forwarded to the Spark json reader
    :return: the loaded dataframe after being passed through replace_columns_special_characters
    """
    source = Load._data_loader(path, "json") if is_url(path) else path

    try:
        # TODO: Check a better way to handle this Spark.instance.spark. Very verbose.
        reader = Spark.instance.spark.read
        reader = reader.option("multiLine", multiline)
        reader = reader.option("mode", "PERMISSIVE")
        loaded = reader.json(source, *args, **kwargs)
    except IOError as error:
        # Log the failure, then let the caller see the original exception.
        logger.print(error)
        raise
    else:
        return replace_columns_special_characters(loaded)
def avro(path, *args, **kwargs):
    """
    Return a dataframe from an avro file.

    :param path: path or location of the file. Must be string dataType. When is_url(path) is true the
        path is first handed to Load._data_loader and the value it returns is read instead.
    :param args: custom argument to be passed to the spark load function
    :param kwargs: custom keyword arguments to be passed to the spark load function
    :return: Spark Dataframe, returned exactly as produced by the reader
    :raises IOError: logged through logger.print and re-raised unchanged
    """
    if is_url(path):
        path = Load._data_loader(path, "avro")

    try:
        # Spark versions before 2.4 use the Databricks avro package as the format; from 2.4 onwards
        # the data source is addressed by its short name.
        if version.parse(Spark.instance.spark.version) < version.parse("2.4"):
            avro_version = "com.databricks.spark.avro"
        else:
            # The short name has to match exactly; the previous value "avro " (trailing space)
            # is not a registered format name.
            avro_version = "avro"
        df = Spark.instance.spark.read.format(avro_version).load(path, *args, **kwargs)
    except IOError as error:
        logger.print(error)
        raise
    return df