def append(dfs, like="columns"):
    """
    Concat multiple dataFrames columns or rows wise
    :param dfs: List of DataFrames
    :param like: concat as columns or rows
    :return:
    """
    # FIX: Because monotonically_increasing_id can create different
    # sequence for different dataframes the result could be wrong.
    if like == "columns":
        join_col = "id_" + random_int()
        # Tag every dataframe with a synthetic id column so they can be joined
        indexed = [
            df.withColumn(join_col, F.monotonically_increasing_id())
            for df in val_to_list(dfs)
        ]
        df_result = reduce(
            lambda left, right: left.join(right, join_col, "outer"),
            indexed).drop(join_col)
    elif like == "rows":
        df_result = reduce(DataFrame.union, dfs)
    else:
        RaiseIt.value_error(like, ["columns", "rows"])
    return df_result
def concat(dfs, like="columns"):
    """
    Concat multiple dataframes as columns or rows
    :param dfs:
    :param like: The way dataframes is going to be concat. like columns or rows
    :return:
    """
    # Add increasing Ids, and they should be the same.
    if like == "columns":
        join_col = "id_" + random_int()
        indexed = [
            df.withColumn(join_col, F.monotonically_increasing_id())
            for df in dfs
        ]
        # Outer-join every dataframe pairwise on the synthetic id column
        df_result = reduce(
            lambda a, b: a.join(b, join_col, "outer"), indexed).drop(join_col)
    elif like == "rows":
        df_result = reduce(DataFrame.union, dfs)
    else:
        RaiseIt.value_error(like, ["columns", "rows"])
    return df_result
def abstract_udf(col, func, func_return_type=None, attrs=None, func_type=None, verbose=False):
    """
    Abstract User defined functions. This is a helper function to create udf, pandas udf or a Column Exp
    :param col: Column to created or transformed
    :param func: Function to be applied to the data
    :param attrs: If required attributes to be passed to the function
    :param func_return_type: Required by UDF and Pandas UDF.
    :param func_type: pandas_udf or udf. The function is going to try to use pandas_udf if func_type is not defined
    :param verbose: print additional info
    :return: A function, UDF or Pandas UDF
    """
    # Prefer a pandas UDF when pyarrow is available and no type was forced
    if func_type is None and is_pyarrow_installed() is True:
        func_type = "pandas_udf"

    valid_types = ["column_exp", "udf", "pandas_udf"]
    if func_type not in valid_types:
        RaiseIt.value_error(func_type, valid_types)

    factory = func_factory(func_type, func_return_type)
    return factory(attrs, func)(col)
def delete_check_point_folder(path, file_system):
    """
    Delete the temporal folder where temp files were stored.
    The path required is the same provided by user in setCheckPointFolder().
    :param path: path where the info will be saved
    :param file_system: Describes if file system is local or hadoop file system.
    :return:
    """
    # Folder path (same layout for both file systems):
    folder_path = path + "/" + "checkPointFolder"
    if file_system == "hadoop":
        logger.print("Deleting checkpoint folder...")
        command = "hadoop fs -rm -r " + folder_path
        os.system(command)
        logger.print("$" + command)
        logger.print("Folder deleted.")
    elif file_system == "local":
        logger.print("Deleting checkpoint folder...")
        # Checking if tempFolder exits:
        if os.path.isdir(folder_path):
            # Deletes folder if exits:
            rmtree(folder_path)
            logger.print("Folder deleted.")
        else:
            # FIX: previously logged "Folder deleted." even when the folder
            # never existed, which was misleading.
            logger.print("Folder does not exist.")
    else:
        RaiseIt.value_error(file_system, ["hadoop", "local"])
def get(driver_type) -> AbstractDriver:
    """
    Returns a driver implementation given a database name
    :param driver_type: name of the database
    :return: a database driver
    """
    # Map database name -> driver class; instantiate lazily on lookup
    drivers = {
        DriverProperties.CASSANDRA.value["name"]: CassandraDriver,
        DriverProperties.MYSQL.value["name"]: MySQLDriver,
        DriverProperties.ORACLE.value["name"]: OracleDriver,
        DriverProperties.POSTGRESQL.value["name"]: PostgreSQLDriver,
        DriverProperties.PRESTO.value["name"]: PrestoDriver,
        DriverProperties.REDSHIFT.value["name"]: RedshiftDriver,
        DriverProperties.SQLITE.value["name"]: SQLiteDriver,
        DriverProperties.SQLSERVER.value["name"]: SQLServerDriver,
        DriverProperties.BIGQUERY.value["name"]: BigQueryDriver,
        DriverProperties.IMPALA.value["name"]: ImpalaDriver,
    }
    driver_class = drivers.get(driver_type)
    if driver_class is not None:
        return driver_class()
    RaiseIt.value_error(
        driver_type,
        [database["name"] for database in DriverProperties.list()])
def nest(input_cols, output_col, shape="string", separator=""):
    """
    Concat multiple columns to one with the format specified
    :param input_cols: columns to be nested
    :param output_col: final column with the nested content
    :param separator: char to be used as separator at the concat time
    :param shape: final data type, 'array', 'string' or 'vector'
    :return: Spark DataFrame
    """
    df = self
    if has_(input_cols, F.Column):
        # Transform non Column data to lit
        columns = [F.lit(col) if not is_(col, F.Column) else col for col in input_cols]
    else:
        columns = parse_columns(self, input_cols)

    # FIX: compare strings with == instead of the identity operator `is`,
    # which only worked by accident via CPython string interning.
    if shape == "vector":
        # Vectors can only be assembled from numeric columns
        columns = parse_columns(self, input_cols,
                                filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)
        vector_assembler = VectorAssembler(inputCols=columns, outputCol=output_col)
        df = vector_assembler.transform(df)
    elif shape == "array":
        df = apply_expr(output_col, F.array(*columns))
    elif shape == "string":
        df = apply_expr(output_col, F.concat_ws(separator, *columns))
    else:
        RaiseIt.value_error(shape, ["vector", "array", "string"])
    return df
def sort(col_sort):
    """
    Sort rows taking in account multiple columns
    :param col_sort: column and sort type combination (col_name, "asc")
    :type col_sort: list of tuples
    """
    # If a list of columns names are given order this by desc. If you need to specify the order of every
    # column use a list of tuples (col_name, "asc")
    df = self
    if is_list_of_str_or_int(col_sort):
        col_sort = [(col_name, "desc") for col_name in col_sort]

    func = []
    for cs in col_sort:
        col_name = one_list_to_val(cs[0])
        order = cs[1]
        if order == "asc":
            sort_func = F.asc
        elif order == "desc":
            sort_func = F.desc
        else:
            # FIX: report the invalid user value `order`; the previous code
            # passed `sort_func`, which is unbound on this branch (NameError).
            RaiseIt.value_error(order, ["asc", "desc"])
        func.append(sort_func(col_name))
        df = df.preserve_meta(self, Actions.SORT_ROW.value, col_name)

    df = df.sort(*func)
    return df
def cast_factory(cls):
    """
    Build the pieces needed to cast a column to ``cls``: the expected
    return type, the casting function and the kind of function to create.
    :param cls: target type (a Vector type or a standard Spark data type)
    :return: tuple (func_return_type, cast function, func_type)
    """
    # Parse to Vector
    if is_type(cls, Vectors):
        def cast_to_vectors(val, attr):
            return Vectors.dense(val)
        return VectorUDT(), cast_to_vectors, "udf"
    # Parse standard data types
    if get_spark_dtypes_object(cls):
        def cast_to_vectors(col_name, attr):
            return F.col(col_name).cast(get_spark_dtypes_object(cls))
        return None, cast_to_vectors, "column_exp"
    # Add here any other parse you want
    RaiseIt.value_error(cls)
def nest(input_cols, output_col, shape=None, separator=" "):
    """
    Concat multiple columns to one with the format specified
    :param input_cols: columns to be nested
    :param output_col: final column with the nested content
    :param separator: char to be used as separator at the concat time
    :param shape: final data type, 'array', 'string' or 'vector'
    :return: Spark DataFrame
    """
    columns = parse_columns(self, input_cols)
    df = self
    # FIX: use == for string comparison; `is` checks object identity and
    # only worked because of CPython string interning.
    if shape == "vector":
        vector_assembler = VectorAssembler(inputCols=input_cols, outputCol=output_col)
        df = vector_assembler.transform(self)
    elif shape == "array":
        df = apply_expr(output_col, F.array(*columns))
    elif shape == "string":
        df = apply_expr(output_col, F.concat_ws(separator, *columns))
    else:
        RaiseIt.value_error(shape, ["vector", "array", "string"])
    return df
def to_file(self, path=None, output="html"):
    """
    Save profiler data to a file in the specified format (html, json)
    :param output: html or json
    :param path: filename in which the data will be saved
    :return:
    """
    if path is None:
        RaiseIt.value_error(path, "str")

    # We need to append a some extra html tags to display it correctly in the browser.
    # FIX: string equality must use ==, not the identity operator `is`.
    if output == "html":
        if self.html is None:
            RaiseIt.not_ready_error(
                "You must first run the profiler, then it can be exported. Try op.profiler.run(df, '*')"
            )
        write_html(HEADER + self.html + FOOTER, path)
    elif output == "json":
        if self.json is None:
            RaiseIt.not_ready_error(
                "You must first run the profiler, then it can be exported. Try op.profiler.run(df, '*')"
            )
        write_json(self.json, path)
    else:
        RaiseIt.type_error(output, ["html", "json"])
def check_for_missing_columns(df, col_names):
    """
    Check if the columns you want to select exits in the dataframe
    :param df: Dataframe to be checked
    :param col_names: cols names to
    :return:
    """
    # Columns requested but not present in the dataframe schema
    missing = list(OrderedSet(col_names) - OrderedSet(df.schema.names))
    if missing:
        RaiseIt.value_error(missing, df.columns)
    return False
def check_column_numbers(columns, number=0):
    """
    Check if the columns number match number expected
    :param columns: list of column names (or a zip of them)
    :param number: Number of columns to check. Accepts an int, "*" (one or
        more) or ">1" (more than one)
    :return:
    """
    if columns is None:
        RaiseIt.value_error(
            columns, ["str", "list"],
            extra_text=
            "Maybe the columns selected do not match a specified datatype filter."
        )
    if isinstance(columns, zip):
        columns = list(columns)

    # FIX: count must be the number of columns; it was a copy of the list,
    # which produced a nonsensical error message below.
    count = len(columns)

    if number == "*":
        if not count >= 1:
            RaiseIt.value_error(count, ["1 or greater"])
    elif number == ">1":
        if not count > 1:
            RaiseIt.value_error(count, ["more than 1"])
    elif count != number:
        # FIX: message previously interpolated (number, columns) which read
        # as "<expected> columns, <list> needed"; report received vs needed.
        RaiseIt.value_error(count, "{} columns, {} needed".format(count, number))
def table_name(self, name=None):
    """
    Create a temp view for a data frame
    :param self:
    :param name: name for the temporary view; must be a non-empty string
    :return: self, so the call can be chained
    """
    if not is_str(name):
        RaiseIt.type_error(name, ["string"])
    # FIX: `len(name) is 0` relied on CPython small-int caching; use value
    # equality for integer comparison.
    if len(name) == 0:
        RaiseIt.value_error(name, ["> 0"])
    self.createOrReplaceTempView(name)
    return self
def check_column_numbers(columns, number=0):
    """
    Check if the columns number match number expected
    :param columns: list of column names
    :param number: Number of columns to check ("*" accepts one or more)
    :return:
    """
    count = len(columns)
    # FIX: compare strings with ==; `is` checks identity and only worked
    # because of CPython string interning.
    if number == "*":
        if not len(columns) >= 1:
            RaiseIt.value_error(len(columns), ["more than 1"])
    elif not len(columns) == number:
        RaiseIt.value_error(count, "Receive {} columns, {} needed".format(number, columns))
def set_name(self, value=None):
    """
    Create a temp view for a data frame also used in the json output profiling
    :param self:
    :param value: name for the temp view; must be a non-empty string
    :return:
    """
    # FIX: validate before mutating state; previously self._name was
    # assigned first, so an invalid value left the object half-updated.
    if not is_str(value):
        RaiseIt.type_error(value, ["string"])
    if len(value) == 0:
        RaiseIt.value_error(value, ["> 0"])
    self._name = value
    self.createOrReplaceTempView(value)
def sort(order="asc"):
    """
    Sort dataframes columns asc or desc
    :param order: 'asc' or 'desc' accepted
    :return: Spark DataFrame
    """
    if order not in ("asc", "desc"):
        RaiseIt.value_error(order, ["asc", "desc"])
    # reverse=True yields descending column-name order
    return self.select(sorted(self.columns, reverse=(order == "desc")))
def to_file(self, path=None, output=None):
    """
    Save profiler data to a file in the specified format (html, json)
    :param output: html or json
    :param path: filename in which the data will be saved
    :return:
    """
    if path is None:
        RaiseIt.value_error(path, ["Invalid file path"])

    # We need to append a some extra html tags to display it correctly in the browser.
    # FIX: string equality must use == (not `is`); removed the redundant
    # `if x is None: assert x is not None` wrapper and a leftover
    # `print("sdf")` debug statement.
    if output == "html":
        assert self.html is not None, "Please run the profiler first"
        header = '''<!doctype html>
<html class="no-js" lang="">
<head>
<meta charset="utf-8">
<meta http-equiv="x-ua-compatible" content="ie=edge">
<title></title>
<meta name="description" content="">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<link rel="manifest" href="site.webmanifest">
<link rel="apple-touch-icon" href="icon.png">
<!-- Place favicon.ico in the root directory -->
<link rel="stylesheet" href="css/normalize.css">
<link rel="stylesheet" href="css/main.css">
</head>
<body>'''
        footer = '''</body></html>'''
        write_html(header + self.html + footer, path)
    elif output == "json":
        assert self.json is not None, "Please run the profiler first"
        write_json(self.json, path)
    else:
        RaiseIt.type_error(output, ["html", "json"])
def absolute_path(files, format="posix"):
    """
    User project base folder to construct and absolute path
    :param files: path files
    :param format: posix or uri
    :return:
    """
    # Pick the converter once, then apply it to every file
    if format == "uri":
        convert = lambda f: Path(ROOT_DIR + f).as_uri()
    elif format == "posix":
        convert = lambda f: Path(ROOT_DIR + f).as_posix()
    else:
        RaiseIt.value_error(format, ["posix", "uri"])
    result = [convert(file) for file in val_to_list(files)]
    return one_list_to_val(result)
def _set_check_point_folder(path, file_system):
    """
    Function that receives a workspace path where a folder is created.
    This folder will store temporal dataframes when user writes the .checkPoint().

    :param path: Location of the dataset (string).
    :param file_system: Describes if file system is local or hadoop file system.
    """
    print_check_point_config(file_system)
    if file_system == "hadoop":
        folder_path = path + "/" + "checkPointFolder"
        # Wipe any previous checkpoint folder before recreating it
        Optimus.delete_check_point_folder(path=path, file_system=file_system)
        # Creating file:
        logger.print("Creating the hadoop folder...")
        command = "hadoop fs -mkdir " + folder_path
        logger.print("$" + command)
        # Shell out to the hadoop CLI; return code is not checked here
        os.system(command)
        logger.print("Hadoop folder created. \n")
        logger.print("Setting created folder as checkpoint folder...")
        Spark.instance.sc.setCheckpointDir(folder_path)
    elif file_system == "local":
        # Folder path:
        folder_path = path + "/" + "checkPointFolder"
        # Checking if tempFolder exits:
        logger.print("Deleting previous folder if exists...")
        if os.path.isdir(folder_path):
            # Deletes folder if exits:
            rmtree(folder_path)
        logger.print("Creating the checkpoint directory...")
        # Creates new folder:
        os.mkdir(folder_path)
        # NOTE(review): builds a file:// URI by string concatenation;
        # assumes a POSIX-style absolute path — confirm behavior on Windows.
        Spark.instance.sc.setCheckpointDir(dirName="file:///" + folder_path)
    else:
        RaiseIt.value_error(file_system, ["hadoop", "local"])
def abstract_udf(col, func, func_return_type=None, args=None, func_type=None):
    """
    Abstract User defined functions. This is a helper function to create udf, pandas udf or a Column Exp
    :param col: Column to created or transformed
    :param func: Function to be applied to the data
    :param args: If required attributes to be passed to the function
    :param func_return_type: Required by UDF and Pandas UDF.
    :param func_type: pandas_udf or udf. The function is going to try to use pandas_udf if func_type is not defined
    :return: A function, UDF or Pandas UDF
    """
    # Without a declared return type a plain column expression is used,
    # overriding any func_type the caller passed.
    if func_return_type is None:
        func_type = "column_expr"
    # By default is going to try to use pandas UDF
    elif func_type is None and is_pyarrow_installed() is True:
        func_type = "pandas_udf"
    types = ["column_expr", "udf", "pandas_udf"]
    if func_type not in types:
        RaiseIt.value_error(func_type, types)

    # It handle if func param is a plain expression or a function returning and expression
    def func_col_exp(col_name, attr):
        return func

    # If `func` is already a Column, wrap it so the factory receives a callable
    if is_column(func):
        _func = func_col_exp
    else:
        _func = func

    logger.print(
        "Using '{func_type}' to process column '{column}' with function {func_name}"
        .format(func_type=func_type, column=col, func_name=_func.__name__))

    df_func = func_factory(func_type, func_return_type)
    # The factory expects the extra attributes packed as a tuple
    if not is_tuple(args):
        args = (args, )
    return df_func(_func, args)(col)
def move(column, position, ref_col=None): """ Move a column to specific position :param column: Column to be moved :param position: Column new position. Accepts 'after', 'before', 'beginning', 'end' :param ref_col: Column taken as reference :return: Spark DataFrame """ # Check that column is a string or a list column = parse_columns(self, column) ref_col = parse_columns(self, ref_col) # Get dataframe columns columns = self.columns # Get source and reference column index position new_index = columns.index(ref_col[0]) # Column to move column_to_move_index = columns.index(column[0]) if position == 'after': # Check if the movement is from right to left: if new_index < column_to_move_index: new_index = new_index + 1 elif position == 'before': # If position if before: if new_index >= column_to_move_index: # Check if the movement if from right to left: new_index = new_index - 1 elif position == 'beginning': new_index = 0 elif position == 'end': new_index = len(columns) else: RaiseIt.value_error(position, ["after", "before", "beginning", "end"]) # Move the column to the new place columns.insert(new_index, columns.pop(column_to_move_index)) # insert and delete a element return self[columns]
def sample_n(self, n=10, random=False):
    """
    Return a n number of sample from a dataFrame
    :param self:
    :param n: Number of samples
    :param random: if true get a semi random sample
    :return:
    """
    # A fixed seed (0) makes the sample reproducible unless random=True
    if random is True:
        seed = random_int()
    elif random is False:
        seed = 0
    else:
        RaiseIt.value_error(random, ["True", "False"])

    total = self.count()
    fraction = n / total if n < total else 1.0
    return self.sample(False, fraction, seed=seed)
def sample_n(self, n=10, random=False):
    """
    Return a n number of sample from a dataFrame
    :param self:
    :param n: Number of samples
    :param random: if true get a semi random sample
    :return:
    """
    if random is True:
        seed = random_int()
    elif random is False:
        seed = 0
    else:
        RaiseIt.value_error(random, ["True", "False"])

    total = self.count()
    if n < total:
        # n/total can under-shoot the expected sample size, so pad the
        # fraction by 10% and trim back down with limit(n) below.
        fraction = (n / total) * 1.1
    else:
        fraction = 1.0
    return self.sample(False, fraction, seed=seed).limit(n)
def check_column_numbers(columns, number=0):
    """
    Check if the columns number match number expected
    :param columns: list of column names
    :param number: Number of columns to check. Accepts an int, "*" (one or
        more) or ">1" (more than one)
    :return:
    """
    if columns is None:
        RaiseIt.value_error(columns, "not None")
    count = len(columns)
    if number == "*":
        if not len(columns) >= 1:
            RaiseIt.value_error(len(columns), ["1 or greater"])
    elif number == ">1":
        if not len(columns) > 1:
            RaiseIt.value_error(len(columns), ["more than 1"])
    # FIX: the error must fire when the count does NOT match the expected
    # number; the original condition (==) raised on the valid case.
    elif len(columns) != number:
        RaiseIt.value_error(count, "{} columns, {} needed".format(number, columns))
def optimus(engine=Engine.DASK.value, *args, **kwargs):
    """
    This is the entry point to initialize the selected engine.
    :param engine: A string identifying an engine :classL`Engine`.
    :param args: positional arguments forwarded to the engine constructor
    :param kwargs: keyword arguments forwarded to the engine constructor
    :return: the initialized engine instance
    """
    logger.print("ENGINE", engine)

    # NLTK corpora needed later; downloaded quietly and cached by nltk.
    # lemmatizer
    nltk.download('wordnet', quiet=True)
    # Stopwords
    nltk.download('stopwords', quiet=True)

    # Init engine — engine modules are imported lazily so only the selected
    # backend's dependencies are required.
    if engine == Engine.PANDAS.value:
        from optimus.engines.pandas.engine import PandasEngine
        op = PandasEngine(*args, **kwargs)
    elif engine == Engine.VAEX.value:
        from optimus.engines.vaex.engine import VaexEngine
        op = VaexEngine(*args, **kwargs)
    elif engine == Engine.SPARK.value:
        from optimus.engines.spark.engine import SparkEngine
        op = SparkEngine(*args, **kwargs)
    elif engine == Engine.DASK.value:
        from optimus.engines.dask.engine import DaskEngine
        op = DaskEngine(*args, **kwargs)
    elif engine == Engine.IBIS.value:
        from optimus.engines.ibis.engine import IbisEngine
        op = IbisEngine(*args, **kwargs)
    elif engine == Engine.CUDF.value:
        from optimus.engines.cudf.engine import CUDFEngine
        op = CUDFEngine(*args, **kwargs)
    elif engine == Engine.DASK_CUDF.value:
        from optimus.engines.dask_cudf.engine import DaskCUDFEngine
        op = DaskCUDFEngine(*args, **kwargs)
    else:
        RaiseIt.value_error(engine, Engine.list())

    # Set cupy yo user RMM
    def switch_to_rmm_allocator():
        # Route cupy allocations through the RAPIDS Memory Manager
        import rmm
        import cupy
        cupy.cuda.set_allocator(rmm.rmm_cupy_allocator)
        return True

    if engine == Engine.CUDF.value:
        switch_to_rmm_allocator()

    if engine == Engine.DASK_CUDF.value:
        # For dask-cudf the allocator switch must run on every worker
        if op.client:
            op.client.run(switch_to_rmm_allocator)

    return op
def __init__(self, host, database, user, password, port=None, driver=None, schema="public",
             oracle_tns=None, oracle_service_name=None, oracle_sid=None, presto_catalog=None,
             cassandra_keyspace=None, cassandra_table=None):
    """
    Create the JDBC connection object
    :return:
    """
    self.port = None
    self.db_driver = driver
    self.oracle_sid = oracle_sid
    self.cassandra_keyspace = cassandra_keyspace
    self.cassandra_table = cassandra_table

    # TODO: add mongo?
    # Handle the default port — only applied when the caller did not pass one.
    if self.db_driver == DriverResolver.REDSHIFT.__str__():
        if port is None:
            self.port = DriverResolver.REDSHIFT.port()
        # "com.databricks.spark.redshift"
    elif self.db_driver == DriverResolver.POSTGRES_SQL.__str__():
        if port is None:
            self.port = DriverResolver.POSTGRES_SQL.port()
        self.driver_option = DriverResolver.POSTGRES_SQL.java_class()
    elif self.db_driver == DriverResolver.POSTGRES.__str__():  # backward compat
        if port is None:
            self.port = DriverResolver.POSTGRES.port()
        self.driver_option = DriverResolver.POSTGRES.java_class()
        # Normalize the legacy name to the canonical postgres driver
        self.db_driver = DriverResolver.POSTGRES_SQL.__str__()
    elif self.db_driver == DriverResolver.MY_SQL.__str__():
        if port is None:
            self.port = DriverResolver.MY_SQL.port()
        # "com.mysql.jdbc.Driver"
    elif self.db_driver == DriverResolver.SQL_SERVER.__str__():
        if port is None:
            self.port = DriverResolver.SQL_SERVER.port()
        # "com.microsoft.jdbc.sqlserver.SQLServerDriver"
    elif self.db_driver == DriverResolver.ORACLE.__str__():
        if port is None:
            self.port = DriverResolver.ORACLE.port()
        self.driver_option = DriverResolver.ORACLE.java_class()
    elif self.db_driver == DriverResolver.PRESTO.__str__():
        if port is None:
            self.port = DriverResolver.PRESTO.port()
        self.driver_option = DriverResolver.PRESTO.java_class()
    elif self.db_driver == DriverResolver.SQL_LITE.__str__():
        # SQlite do not need port
        pass
    elif database == DriverResolver.CASSANDRA.__str__():
        # When using Cassandra there is no jdbc url since we are going to use the spark cassandra connector
        pass
    else:
        RaiseIt.value_error(
            driver,
            [database["name"] for database in DriverResolver.list()])

    # Use the resolved default port when one was chosen above
    if self.port is not None:
        port = self.port
    if database is None:
        database = ""

    # Create string connection
    url = ""
    # Reference SQLite https://mitzen.blogspot.com/2017/06/pyspark-working-with-jdbc-sqlite.html
    if self.db_driver == DriverResolver.SQL_LITE.__str__():
        url = "jdbc:{DB_DRIVER}:{HOST}".format(DB_DRIVER=driver, HOST=host, DATABASE=database)
    elif self.db_driver == DriverResolver.POSTGRES_SQL.__str__() \
            or self.db_driver == DriverResolver.REDSHIFT.__str__() \
            or self.db_driver == DriverResolver.MY_SQL.__str__():
        # url = "jdbc:" + db_type + "://" + url + ":" + port + "/" + database + "?currentSchema=" + schema
        url = "jdbc:{DB_DRIVER}://{HOST}:{PORT}/{DATABASE}?currentSchema={SCHEMA}".format(
            DB_DRIVER=self.db_driver, HOST=host, PORT=port, DATABASE=database, SCHEMA=schema)
    elif self.db_driver == DriverResolver.SQL_SERVER.__str__():
        url = "jdbc:{DB_DRIVER}://{HOST}:{PORT};databaseName={DATABASE}".format(
            DB_DRIVER=self.db_driver, HOST=host, PORT=port, DATABASE=database, SCHEMA=schema)
    elif self.db_driver == DriverResolver.ORACLE.__str__():
        # Oracle supports three addressing modes: SID, service name or TNS
        if oracle_sid:
            url = "jdbc:{DB_DRIVER}:thin:@{HOST}:{PORT}/{ORACLE_SID}".format(
                DB_DRIVER=driver, HOST=host, PORT=port, DATABASE=database,
                ORACLE_SID=oracle_sid, SCHEMA=schema)
        elif oracle_service_name:
            url = "jdbc:{DB_DRIVER}:thin:@//{HOST}:{PORT}/{ORACLE_SERVICE_NAME}".format(
                DB_DRIVER=driver, HOST=host, PORT=port, DATABASE=database,
                ORACLE_SERVICE_NAME=oracle_service_name)
        elif oracle_tns:
            url = "jdbc:{DB_DRIVER}:thin:@//{TNS}".format(DB_DRIVER=driver, TNS=oracle_tns)
    elif self.db_driver == DriverResolver.PRESTO.__str__():
        url = "jdbc:{DB_DRIVER}://{HOST}:{PORT}/{CATALOG}/{DATABASE}".format(
            DB_DRIVER=self.db_driver, HOST=host, PORT=port, CATALOG=presto_catalog, DATABASE=database)

    logger.print(url)
    self.url = url
    self.database = database
    self.user = user
    self.password = password
    self.schema = schema
def __init__(self, driver, host, database, user, password, port=None, schema="public",
             oracle_tns=None, oracle_service_name=None, oracle_sid=None):
    """
    Create the JDBC connection object
    :return:
    """
    self.db_driver = driver
    self.oracle_sid = oracle_sid

    # Handle the default port
    # NOTE(review): self.port is only assigned inside these branches when the
    # caller passed port=None; when a port is given, self.port is never set —
    # confirm no later code reads self.port unconditionally.
    if self.db_driver == "redshift":
        if port is None:
            self.port = 5439
        # "com.databricks.spark.redshift"
    elif self.db_driver == "postgres":
        if port is None:
            self.port = 5432
        # "org.postgresql.Driver"
    elif self.db_driver == "mysql":
        if port is None:
            self.port = 3306
        # "com.mysql.jdbc.Driver"
    elif self.db_driver == "sqlserver":
        if port is None:
            self.port = 1433
        # "com.microsoft.jdbc.sqlserver.SQLServerDriver"
    elif self.db_driver == "oracle":
        if port is None:
            self.port = 1521
        self.driver_option = "oracle.jdbc.OracleDriver"
    # TODO: add mongo?
    else:
        RaiseIt.value_error(driver, ["redshift", "postgres", "mysql", "sqlite"])

    if database is None:
        database = ""

    # Create string connection
    if self.db_driver == "sqlite":
        url = "jdbc:{DB_DRIVER}://{HOST}/{DATABASE}".format(
            DB_DRIVER=driver, HOST=host, DATABASE=database)
    elif self.db_driver == "postgres" or self.db_driver == "redshift" or self.db_driver == "mysql":
        # url = "jdbc:" + db_type + "://" + url + ":" + port + "/" + database + "?currentSchema=" + schema
        url = "jdbc:{DB_DRIVER}://{HOST}:{PORT}/{DATABASE}?currentSchema={SCHEMA}".format(
            DB_DRIVER=driver, HOST=host, PORT=port, DATABASE=database, SCHEMA=schema)
    elif self.db_driver == "oracle":
        # Oracle supports three addressing modes: SID, service name or TNS
        if oracle_sid:
            url = "jdbc:{DB_DRIVER}:thin:@{HOST}:{PORT}/{ORACLE_SID}".format(
                DB_DRIVER=driver, HOST=host, PORT=port, DATABASE=database,
                ORACLE_SID=oracle_sid, SCHEMA=schema)
        elif oracle_service_name:
            url = "jdbc:{DB_DRIVER}:thin:@//{HOST}:{PORT}/{ORACLE_SERVICE_NAME}".format(
                DB_DRIVER=driver, HOST=host, PORT=port, DATABASE=database,
                ORACLE_SERVICE_NAME=oracle_service_name)
        elif oracle_tns:
            url = "jdbc:{DB_DRIVER}:thin:@//{TNS}".format(DB_DRIVER=driver, TNS=oracle_tns)

    logger.print(url)
    self.url = url
    self.database = database
    self.user = user
    self.password = password
    self.schema = schema
def parse_columns(df, cols_args, get_args=False, is_regex=None, filter_by_column_dtypes=None,
                  accepts_missing_cols=False, invert=False):
    """
    Return a list of columns and check that columns exists in the dataframe
    Accept '*' as parameter in which case return a list of all columns in the dataframe.
    Also accept a regex.
    If a list of tuples return to list. The first element is the columns name the others element are params.
    This params can be used to create custom transformation functions. You can find and example in cols().cast()
    :param df: Dataframe in which the columns are going to be checked
    :param cols_args: Accepts * as param to return all the string columns in the dataframe
    :param get_args:
    :param is_regex: Use True is col_attrs is a regex
    :param filter_by_column_dtypes: A data type for which a columns list is going be filtered
    :param accepts_missing_cols: if true not check if column exist in the dataframe
    :param invert: Invert the final selection. For example if you want to select not integers
    :return: A list of columns string names
    """
    attrs = None

    # if columns value is * get all dataframes columns
    if is_regex is True:
        r = re.compile(cols_args[0])
        cols = list(filter(r.match, df.columns))
    elif cols_args == "*" or cols_args is None:
        cols = df.columns
    # In case we have a list of tuples we use the first element of the tuple is taken as the column name
    # and the rest as params. We can use the param in a custom function as follow
    #   def func(attrs):  # attrs return (1,2) and (3,4)
    #       return attrs[0] + 1
    #   df.cols().apply([('col_1',1,2),('cols_2', 3 ,4)], func)
    # Verify if we have a list with tuples
    elif is_tuple(cols_args) or is_list_of_tuples(cols_args):
        cols_args = val_to_list(cols_args)
        # Extract a specific position in the tuple
        cols = [(i[0:1][0]) for i in cols_args]
        attrs = [(i[1:]) for i in cols_args]
    else:
        # if not a list convert to list
        cols = val_to_list(cols_args)
        # Get col name from index
        cols = [c if is_str(c) else df.columns[c] for c in cols]

    # Check for missing columns
    if accepts_missing_cols is False:
        check_for_missing_columns(df, cols)

    # Filter by column data type
    if filter_by_column_dtypes is not None:
        filter_by_column_dtypes = val_to_list(filter_by_column_dtypes)

    columns_residual = None

    # If necessary filter the columns by data type
    if filter_by_column_dtypes:
        # Get columns for every data type
        columns_filtered = filter_col_name_by_dtypes(df, filter_by_column_dtypes)
        # Intersect the columns filtered per data type from the whole dataframe with the columns passed to the function
        final_columns = list(OrderedSet(cols).intersection(columns_filtered))
        # This columns match filtered data type
        columns_residual = list(OrderedSet(cols) - OrderedSet(columns_filtered))
    else:
        final_columns = cols

    # Return cols or cols an params
    cols_params = []
    if invert:
        final_columns = list(OrderedSet(cols) - OrderedSet(final_columns))
    if get_args is True:
        cols_params = final_columns, attrs
    elif get_args is False:
        cols_params = final_columns
    else:
        RaiseIt.value_error(get_args, ["True", "False"])

    if columns_residual:
        # Warn about requested columns skipped by the dtype filter
        logger.print("%s %s %s", ",".join(escape_columns(columns_residual)),
                     "column(s) was not processed because is/are not",
                     ",".join(filter_by_column_dtypes))

    return cols_params