Example #1
def append(dfs, like="columns"):
    """
    Concatenate multiple DataFrames column-wise or row-wise
    :param dfs: List of DataFrames
    :param like: concat as "columns" or "rows"
    :return: Spark DataFrame
    """

    # FIX: Because monotonically_increasing_id() can generate a different
    # sequence for each DataFrame, the joined result could be wrong.

    if like == "columns":
        temp_dfs = []
        col_temp_name = "id_" + random_int()

        dfs = val_to_list(dfs)
        for df in dfs:
            temp_dfs.append(
                df.withColumn(col_temp_name, F.monotonically_increasing_id()))

        def _append(df1, df2):
            return df1.join(df2, col_temp_name, "outer")

        df_result = reduce(_append, temp_dfs).drop(col_temp_name)

    elif like == "rows":
        df_result = reduce(DataFrame.union, dfs)
    else:
        RaiseIt.value_error(like, ["columns", "rows"])

    return df_result
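A minimal sketch of a positionally safe alternative: derive a consecutive row index with zipWithIndex() instead of monotonically_increasing_id(), whose values depend on partitioning. The SparkSession setup, the with_row_index helper, and the column names are illustrative assumptions, not part of the library.

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()

def with_row_index(df, name="_row_id"):
    # zipWithIndex assigns consecutive ids 0..n-1 in the DataFrame's current order,
    # unlike monotonically_increasing_id(), whose values depend on partitioning.
    return (df.rdd.zipWithIndex()
            .map(lambda pair: Row(**pair[0].asDict(), **{name: pair[1]}))
            .toDF())

left = spark.createDataFrame([("a",), ("b",)], ["x"])
right = spark.createDataFrame([(1,), (2,)], ["y"])

(with_row_index(left)
 .join(with_row_index(right), "_row_id", "outer")
 .drop("_row_id")
 .show())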
Example #2
def concat(dfs, like="columns"):
    """
    Concatenate multiple DataFrames as columns or rows
    :param dfs: List of DataFrames
    :param like: how the DataFrames are concatenated: "columns" or "rows"
    :return: Spark DataFrame
    """
    # Add an increasing id to every DataFrame; the sequences must match so the join lines rows up.
    if like == "columns":
        temp_dfs = []
        col_temp_name = "id_" + random_int()
        for df in dfs:
            temp_dfs.append(
                df.withColumn(col_temp_name, F.monotonically_increasing_id()))

        def _append_df(df1, df2):
            return df1.join(df2, col_temp_name, "outer")

        df_result = reduce(_append_df, temp_dfs).drop(col_temp_name)

    elif like == "rows":
        df_result = reduce(DataFrame.union, dfs)
    else:
        RaiseIt.value_error(like, ["columns", "rows"])

    return df_result
Example #3
def abstract_udf(col,
                 func,
                 func_return_type=None,
                 attrs=None,
                 func_type=None,
                 verbose=False):
    """
    Abstract User defined functions. This is a helper function to create udf, pandas udf or a Column Exp
    :param col: Column to be created or transformed
    :param func: Function to be applied to the data
    :param attrs: Attributes to be passed to the function, if required
    :param func_return_type: Required by UDF and Pandas UDF.
    :param func_type: pandas_udf or udf. If func_type is not defined, pandas_udf is tried first
    :param verbose: print additional info
    :return: A function, UDF or Pandas UDF
    """

    # By default, try to use a pandas UDF
    if func_type is None and is_pyarrow_installed() is True:
        func_type = "pandas_udf"

    types = ["column_exp", "udf", "pandas_udf"]
    if func_type not in types:
        RaiseIt.value_error(func_type, types)

    # if verbose is True:
    #    logging.info("Using '{func_type}' to process column '{column}' with function {func_name}"
    #                 .format(func_type=func_type, column=col, func_name=func.__name__))

    df_func = func_factory(func_type, func_return_type)
    return df_func(attrs, func)(col)
Example #4
    def delete_check_point_folder(path, file_system):
        """
        Deletes the temporary folder where checkpoint files were stored.
        The path required is the same one provided by the user in setCheckPointFolder().

        :param path: path where the checkpoint folder was created
        :param file_system: "local" or "hadoop" file system
        :return:
        """

        if file_system == "hadoop":
            # Folder path:
            folder_path = path + "/" + "checkPointFolder"
            logger.print("Deleting checkpoint folder...")
            command = "hadoop fs -rm -r " + folder_path
            os.system(command)
            logger.print("$" + command)
            logger.print("Folder deleted.")
        elif file_system == "local":
            logger.print("Deleting checkpoint folder...")
            # Folder path:
            folder_path = path + "/" + "checkPointFolder"
            # Check if the checkpoint folder exists:
            if os.path.isdir(folder_path):
                # Delete the folder if it exists:
                rmtree(folder_path)
                logger.print("Folder deleted.")
            else:
                logger.print("Folder does not exist; nothing to delete.")
        else:
            RaiseIt.value_error(file_system, ["hadoop", "local"])
Example #5
    def get(driver_type) -> AbstractDriver:
        """
        Returns a driver implementation given a database name

        :param driver_type: name of the database
        :return: a database driver
        """
        if driver_type == DriverProperties.CASSANDRA.value["name"]:
            return CassandraDriver()
        elif driver_type == DriverProperties.MYSQL.value["name"]:
            return MySQLDriver()
        elif driver_type == DriverProperties.ORACLE.value["name"]:
            return OracleDriver()
        elif driver_type == DriverProperties.POSTGRESQL.value["name"]:
            return PostgreSQLDriver()
        elif driver_type == DriverProperties.PRESTO.value["name"]:
            return PrestoDriver()
        elif driver_type == DriverProperties.REDSHIFT.value["name"]:
            return RedshiftDriver()
        elif driver_type == DriverProperties.SQLITE.value["name"]:
            return SQLiteDriver()
        elif driver_type == DriverProperties.SQLSERVER.value["name"]:
            return SQLServerDriver()
        elif driver_type == DriverProperties.BIGQUERY.value["name"]:
            return BigQueryDriver()
        elif driver_type == DriverProperties.IMPALA.value["name"]:
            return ImpalaDriver()
        else:
            RaiseIt.value_error(
                driver_type,
                [database["name"] for database in DriverProperties.list()])
Example #6
    def nest(input_cols, output_col, shape="string", separator=""):
        """
        Concatenate multiple columns into one with the specified format
        :param input_cols: columns to be nested
        :param output_col: final column with the nested content
        :param separator: character used as the separator when concatenating
        :param shape: final data type: 'array', 'string' or 'vector'
        :return: Spark DataFrame
        """

        df = self

        if has_(input_cols, F.Column):
            # Transform non Column data to lit
            columns = [F.lit(col) if not is_(col, F.Column) else col for col in input_cols]
        else:
            columns = parse_columns(self, input_cols)

        if shape is "vector":
            columns = parse_columns(self, input_cols, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)

            vector_assembler = VectorAssembler(
                inputCols=columns,
                outputCol=output_col)
            df = vector_assembler.transform(df)

        elif shape is "array":
            df = apply_expr(output_col, F.array(*columns))

        elif shape is "string":
            df = apply_expr(output_col, F.concat_ws(separator, *columns))
        else:
            RaiseIt.value_error(shape, ["vector", "array", "string"])

        return df
Example #7
    def sort(col_sort):
        """
        Sort rows taking into account multiple columns
        :param col_sort: column and sort type combination (col_name, "asc")
        :type col_sort: list of tuples
        """
        # If a list of column names is given, every column is ordered desc. To specify the order of each
        # column, use a list of tuples (col_name, "asc")
        df = self

        t = []
        if is_list_of_str_or_int(col_sort):
            for col_name in col_sort:
                t.append(tuple([col_name, "desc"]))
            col_sort = t

        func = []
        for cs in col_sort:
            col_name = one_list_to_val(cs[0])
            order = cs[1]

            if order == "asc":
                sort_func = F.asc
            elif order == "desc":
                sort_func = F.desc
            else:
                RaiseIt.value_error(order, ["asc", "desc"])

            func.append(sort_func(col_name))
            df = df.preserve_meta(self, Actions.SORT_ROW.value, col_name)

        df = df.sort(*func)
        return df
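For reference, the (col_name, "asc"/"desc") convention maps onto plain PySpark as below; the SparkSession and the sample data are illustrative assumptions.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[2]").getOrCreate()

# One sort expression per (column, order) tuple, mirroring the loop above.
col_sort = [("age", "desc"), ("name", "asc")]
sort_exprs = [F.desc(c) if order == "desc" else F.asc(c) for c, order in col_sort]

df = spark.createDataFrame([("ana", 30), ("bob", 30), ("eva", 25)], ["name", "age"])
df.sort(*sort_exprs).show()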
Example #8
        def cast_factory(cls):

            # Parse to Vector
            if is_type(cls, Vectors):
                func_type = "udf"

                def cast_to_vectors(val, attr):
                    return Vectors.dense(val)

                func_return_type = VectorUDT()
            # Parse standard data types
            elif get_spark_dtypes_object(cls):

                func_type = "column_exp"

                def cast_to_vectors(col_name, attr):
                    return F.col(col_name).cast(get_spark_dtypes_object(cls))

                func_return_type = None

            # Add here any other parse you want
            else:
                RaiseIt.value_error(cls)

            return func_return_type, cast_to_vectors, func_type
Example #9
    def nest(input_cols, output_col, shape=None, separator=" "):
        """
        Concatenate multiple columns into one with the specified format
        :param input_cols: columns to be nested
        :param output_col: final column with the nested content
        :param separator: character used as the separator when concatenating
        :param shape: final data type: 'array', 'string' or 'vector'
        :return: Spark DataFrame
        """
        columns = parse_columns(self, input_cols)
        df = self

        if shape is "vector":
            vector_assembler = VectorAssembler(inputCols=input_cols,
                                               outputCol=output_col)
            df = vector_assembler.transform(self)

        elif shape is "array":
            df = apply_expr(output_col, F.array(*columns))

        elif shape is "string":

            df = apply_expr(output_col, F.concat_ws(separator, *columns))
        else:
            RaiseIt.value_error(shape, ["vector", "array", "string"])

        return df
Example #10
    def to_file(self, path=None, output="html"):
        """
        Save profiler data to a file in the specified format (html, json)
        :param output: html or json
        :param path: filename in which the data will be saved
        :return:
        """

        if path is None:
            RaiseIt.value_error(path, "str")

        # We need to append some extra HTML tags to display it correctly in the browser.
        if output == "html":
            if self.html is None:
                RaiseIt.not_ready_error(
                    "You must first run the profiler, then it can be exported. Try op.profiler.run(df, '*')"
                )

            write_html(HEADER + self.html + FOOTER, path)
        elif output is "json":
            if self.json is None:
                RaiseIt.not_ready_error(
                    "You must first run the profiler, then it can be exported. Try op.profiler.run(df, '*')"
                )

            write_json(self.json, path)
        else:
            RaiseIt.type_error(output, ["html", "json"])
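String flags such as output should be compared with ==, not is: is checks object identity and only happens to work when both strings are the same interned object. A quick standalone illustration:

# `is` compares object identity, `==` compares values; equal strings are not
# guaranteed to be the same object, so identity checks can silently fail.
a = "html"
b = "".join(["ht", "ml"])
print(a == b)  # True
print(a is b)  # False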
Example #11
def check_for_missing_columns(df, col_names):
    """
    Check if the columns you want to select exist in the dataframe
    :param df: Dataframe to be checked
    :param col_names: column names to check
    :return:
    """
    missing_columns = list(OrderedSet(col_names) - OrderedSet(df.schema.names))

    if len(missing_columns) > 0:
        RaiseIt.value_error(missing_columns, df.columns)
    return False
Example #12
def check_column_numbers(columns, number=0):
    """
    Check if the number of columns matches the expected number
    :param columns: columns to check
    :param number: Number of columns to check
    :return:
    """
    if columns is None:
        RaiseIt.value_error(
            columns, ["str", "list"],
            extra_text=
            "Maybe the columns selected do not match a specified datatype filter."
        )

    if isinstance(columns, zip):
        columns = list(columns)

    count = len(columns)

    if number == "*":
        if not len(columns) >= 1:
            RaiseIt.value_error(len(columns), ["1 or greater"])
    elif number == ">1":
        if not len(columns) > 1:
            RaiseIt.value_error(len(columns), ["more than 1"])
    elif len(columns) != number:
        RaiseIt.value_error(count,
                            "{} columns, {} needed".format(count, number))
Example #13
def table_name(self, name=None):
    """
    Create a temp view for a data frame
    :param self:
    :param name: name of the temp view
    :return:
    """
    if not is_str(name):
        RaiseIt.type_error(name, ["string"])

    if len(name) == 0:
        RaiseIt.value_error(name, ["> 0"])

    self.createOrReplaceTempView(name)
    return self
Example #14
def check_column_numbers(columns, number=0):
    """
    Check if the number of columns matches the expected number
    :param columns: columns to check
    :param number: Number of columns to check
    :return:
    """
    count = len(columns)

    if number is "*":
        if not len(columns) >= 1:
            RaiseIt.value_error(len(columns), ["more than 1"])
    elif not len(columns) == number:

        RaiseIt.value_error(count, "Receive {} columns, {} needed".format(number, columns))
Example #15
def set_name(self, value=None):
    """
    Create a temp view for a DataFrame; the name is also used in the JSON profiling output
    :param self:
    :param value: name of the temp view
    :return:
    """
    if not is_str(value):
        RaiseIt.type_error(value, ["string"])

    if len(value) == 0:
        RaiseIt.value_error(value, ["> 0"])

    # Assign and register the temp view only after validation
    self._name = value
    self.createOrReplaceTempView(value)
Example #16
    def sort(order="asc"):
        """
        Sort dataframes columns asc or desc
        :param order: 'asc' or 'desc' accepted
        :return: Spark DataFrame
        """

        if order == "asc":
            sorted_col_names = sorted(self.columns)
        elif order == "desc":
            sorted_col_names = sorted(self.columns, reverse=True)
        else:
            RaiseIt.value_error(order, ["asc", "desc"])

        return self.select(sorted_col_names)
Example #17
    def to_file(self, path=None, output=None):
        """
        Save profiler data to a file in the specified format (html, json)
        :param output: html or json
        :param path: filename in which the data will be saved
        :return:
        """

        if path is None:
            RaiseIt.value_error(path, ["Invalid file path"])

        # We need to append some extra HTML tags to display it correctly in the browser.
        if output == "html":
            assert self.html is not None, "Please run the profiler first"

            header = '''<!doctype html>
<html class="no-js" lang="">

<head>
  <meta charset="utf-8">
  <meta http-equiv="x-ua-compatible" content="ie=edge">
  <title></title>
  <meta name="description" content="">
  <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">

  <link rel="manifest" href="site.webmanifest">
  <link rel="apple-touch-icon" href="icon.png">
  <!-- Place favicon.ico in the root directory -->

  <link rel="stylesheet" href="css/normalize.css">
  <link rel="stylesheet" href="css/main.css">
</head>

<body>'''

            footer = '''</body></html>'''

            write_html(header + self.html + footer, path)
        elif output is "json":
            if self.json is None:
                assert self.json is not None, "Please run the profiler first"

            write_json(self.json, path)
        else:
            RaiseIt.type_error(output, ["html", "json"])
Example #18
def absolute_path(files, format="posix"):
    """
    Use the project base folder to construct an absolute path
    :param files: relative file path(s)
    :param format: posix or uri
    :return: absolute path(s)
    """
    files = val_to_list(files)
    if format == "uri":
        result = [Path(ROOT_DIR + file).as_uri() for file in files]
    elif format == "posix":
        result = [Path(ROOT_DIR + file).as_posix() for file in files]
    else:
        RaiseIt.value_error(format, ["posix", "uri"])

    result = one_list_to_val(result)
    return result
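A small standalone illustration of the two output formats on a POSIX system (the path is made up):

from pathlib import Path

p = Path("/opt/project/data/demo.csv")   # made-up path
print(p.as_posix())  # /opt/project/data/demo.csv
print(p.as_uri())    # file:///opt/project/data/demo.csv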
Example #19
    def _set_check_point_folder(path, file_system):
        """
        Receives a workspace path where a folder is created.
        This folder stores temporary dataframes when the user calls .checkPoint().

        :param path: Location of the dataset (string).
        :param file_system: Describes if file system is local or hadoop file system.

        """

        print_check_point_config(file_system)

        if file_system == "hadoop":
            folder_path = path + "/" + "checkPointFolder"
            Optimus.delete_check_point_folder(path=path,
                                              file_system=file_system)

            # Create the hadoop checkpoint folder:
            logger.print("Creating the hadoop folder...")
            command = "hadoop fs -mkdir " + folder_path
            logger.print("$" + command)
            os.system(command)
            logger.print("Hadoop folder created. \n")

            logger.print("Setting created folder as checkpoint folder...")
            Spark.instance.sc.setCheckpointDir(folder_path)
        elif file_system == "local":
            # Folder path:
            folder_path = path + "/" + "checkPointFolder"
            # Check if the checkpoint folder already exists:
            logger.print("Deleting previous folder if exists...")
            if os.path.isdir(folder_path):
                # Delete the folder if it exists:
                rmtree(folder_path)

            logger.print("Creating the checkpoint directory...")
            # Creates new folder:
            os.mkdir(folder_path)

            Spark.instance.sc.setCheckpointDir(dirName="file:///" +
                                               folder_path)
        else:
            RaiseIt.value_error(file_system, ["hadoop", "local"])
Example #20
File: audf.py  Project: a-domingu/tbcnn
def abstract_udf(col, func, func_return_type=None, args=None, func_type=None):
    """
    Abstract User defined functions. This is a helper function to create udf, pandas udf or a Column Exp
    :param col: Column to be created or transformed
    :param func: Function to be applied to the data
    :param args: Arguments to be passed to the function, if required
    :param func_return_type: Required by UDF and Pandas UDF.
    :param func_type: pandas_udf or udf. The function is going to try to use pandas_udf if func_type is not defined
    :return: A function, UDF or Pandas UDF
    """

    if func_return_type is None:
        func_type = "column_expr"
    # By default, try to use a pandas UDF
    elif func_type is None and is_pyarrow_installed() is True:
        func_type = "pandas_udf"

    types = ["column_expr", "udf", "pandas_udf"]
    if func_type not in types:
        RaiseIt.value_error(func_type, types)

    # Handle whether the func param is a plain expression or a function returning an expression
    def func_col_exp(col_name, attr):
        return func

    if is_column(func):
        _func = func_col_exp
    else:
        _func = func
    # print(func_type)
    logger.print(
        "Using '{func_type}' to process column '{column}' with function {func_name}"
        .format(func_type=func_type, column=col, func_name=_func.__name__))

    df_func = func_factory(func_type, func_return_type)
    if not is_tuple(args):
        args = (args, )

    # print("-----------------df_func(_func, args)(col)", df_func(_func, args)(col))
    return df_func(_func, args)(col)
Example #21
    def move(column, position, ref_col=None):
        """
        Move a column to specific position
        :param column: Column to be moved
        :param position: Column new position. Accepts 'after', 'before', 'beginning', 'end'
        :param ref_col: Column taken as reference
        :return: Spark DataFrame
        """
        # Check that column is a string or a list
        column = parse_columns(self, column)
        ref_col = parse_columns(self, ref_col)

        # Get dataframe columns
        columns = self.columns

        # Get source and reference column index position
        new_index = columns.index(ref_col[0])

        # Column to move
        column_to_move_index = columns.index(column[0])

        if position == 'after':
            # Check if the movement is from right to left:
            if new_index < column_to_move_index:
                new_index = new_index + 1
        elif position == 'before':  # If position is before:
            if new_index >= column_to_move_index:  # Check if the movement is from right to left:
                new_index = new_index - 1
        elif position == 'beginning':
            new_index = 0
        elif position == 'end':
            new_index = len(columns)
        else:
            RaiseIt.value_error(position, ["after", "before", "beginning", "end"])

        # Move the column to the new place
        columns.insert(new_index, columns.pop(column_to_move_index))  # insert and delete an element

        return self[columns]
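The ±1 adjustments above exist because pop() shifts every element that sits after the removed one. A plain-list sketch of the 'after' case with made-up column names:

columns = ["a", "b", "c", "d"]

# Move "d" so it sits right after "a" (position="after", ref_col="a").
column_to_move_index = columns.index("d")   # 3
new_index = columns.index("a")              # 0
new_index += 1                              # moving right-to-left, so target the slot after the ref column
columns.insert(new_index, columns.pop(column_to_move_index))
print(columns)  # ['a', 'd', 'b', 'c']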
Example #22
def sample_n(self, n=10, random=False):
    """
    Return n sample rows from a DataFrame
    :param self:
    :param n: Number of samples
    :param random: if True, use a random seed to get a semi-random sample
    :return:
    """
    if random is True:
        seed = random_int()
    elif random is False:
        seed = 0
    else:
        RaiseIt.value_error(random, ["True", "False"])

    rows_count = self.count()
    if n < rows_count:
        fraction = n / rows_count
    else:
        fraction = 1.0

    return self.sample(False, fraction, seed=seed)
Example #23
def sample_n(self, n=10, random=False):
    """
    Return n sample rows from a DataFrame
    :param self:
    :param n: Number of samples
    :param random: if True, use a random seed to get a semi-random sample
    :return:
    """
    if random is True:
        seed = random_int()
    elif random is False:
        seed = 0
    else:
        RaiseIt.value_error(random, ["True", "False"])

    rows_count = self.count()
    if n < rows_count:
        # n / rows_count can yield fewer rows than expected, so over-sample by 10% and trim with limit(n)
        fraction = (n / rows_count) * 1.1
    else:
        fraction = 1.0

    return self.sample(False, fraction, seed=seed).limit(n)
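DataFrame.sample() returns only approximately fraction * count rows, which is what the 1.1 over-sampling factor plus limit(n) compensates for. A minimal sketch, assuming a local SparkSession:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()

# Bernoulli sampling returns roughly fraction * count rows, not exactly n,
# so over-sample slightly and trim to n with limit().
df = spark.range(1000)
n = 10
fraction = min((n / df.count()) * 1.1, 1.0)
print(df.sample(False, fraction, seed=42).limit(n).count())  # at most n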
Example #24
def check_column_numbers(columns, number=0):
    """
    Check if the number of columns matches the expected number
    :param columns: columns to check
    :param number: Number of columns to check
    :return:
    """
    if columns is None:
        RaiseIt.value_error(columns, "not None")

    count = len(columns)

    if number == "*":
        if not len(columns) >= 1:
            RaiseIt.value_error(len(columns), ["1 or greater"])
    elif number == ">1":
        if not len(columns) > 1:
            RaiseIt.value_error(len(columns), ["more than 1"])
    elif len(columns) != number:
        RaiseIt.value_error(count, "{} columns, {} needed".format(count, number))
Example #25
def optimus(engine=Engine.DASK.value, *args, **kwargs):
    """
    This is the entry point to initialize the selected engine.
    :param engine: A string identifying an engine; see :class:`Engine`.
    :param args:
    :param kwargs:
    :return:
    """
    logger.print("ENGINE", engine)

    # lemmatizer
    nltk.download('wordnet', quiet=True)

    # Stopwords
    nltk.download('stopwords', quiet=True)

    # Init engine
    if engine == Engine.PANDAS.value:
        from optimus.engines.pandas.engine import PandasEngine
        op = PandasEngine(*args, **kwargs)

    elif engine == Engine.VAEX.value:
        from optimus.engines.vaex.engine import VaexEngine
        op = VaexEngine(*args, **kwargs)

    elif engine == Engine.SPARK.value:
        from optimus.engines.spark.engine import SparkEngine
        op = SparkEngine(*args, **kwargs)

    elif engine == Engine.DASK.value:
        from optimus.engines.dask.engine import DaskEngine
        op = DaskEngine(*args, **kwargs)

    elif engine == Engine.IBIS.value:
        from optimus.engines.ibis.engine import IbisEngine
        op = IbisEngine(*args, **kwargs)

    elif engine == Engine.CUDF.value:
        from optimus.engines.cudf.engine import CUDFEngine
        op = CUDFEngine(*args, **kwargs)

    elif engine == Engine.DASK_CUDF.value:
        from optimus.engines.dask_cudf.engine import DaskCUDFEngine
        op = DaskCUDFEngine(*args, **kwargs)

    else:
        RaiseIt.value_error(engine, Engine.list())

    # Set cupy to use the RMM allocator
    def switch_to_rmm_allocator():
        import rmm
        import cupy
        cupy.cuda.set_allocator(rmm.rmm_cupy_allocator)
        return True

    if engine == Engine.CUDF.value:
        switch_to_rmm_allocator()

    if engine == Engine.DASK_CUDF.value:
        if op.client:
            op.client.run(switch_to_rmm_allocator)

    return op
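Each branch above imports its engine module only when that engine is selected. The same lazy-import dispatch can be sketched as a lookup table; ENGINES and make_engine below are illustrative names, not part of the library, and only two of the module paths from the code above are listed.

import importlib

ENGINES = {
    "pandas": ("optimus.engines.pandas.engine", "PandasEngine"),
    "dask": ("optimus.engines.dask.engine", "DaskEngine"),
}

def make_engine(name, *args, **kwargs):
    try:
        module_name, class_name = ENGINES[name]
    except KeyError:
        raise ValueError("{!r} is not one of {}".format(name, sorted(ENGINES)))
    # Import the engine module only when it is actually requested.
    cls = getattr(importlib.import_module(module_name), class_name)
    return cls(*args, **kwargs)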
Example #26
    def __init__(self,
                 host,
                 database,
                 user,
                 password,
                 port=None,
                 driver=None,
                 schema="public",
                 oracle_tns=None,
                 oracle_service_name=None,
                 oracle_sid=None,
                 presto_catalog=None,
                 cassandra_keyspace=None,
                 cassandra_table=None):
        """
        Create the JDBC connection object
        :return:
        """
        self.port = None

        self.db_driver = driver
        self.oracle_sid = oracle_sid
        self.cassandra_keyspace = cassandra_keyspace
        self.cassandra_table = cassandra_table

        # TODO: add mongo?
        # Handle the default port
        if self.db_driver == DriverResolver.REDSHIFT.__str__():
            if port is None: self.port = DriverResolver.REDSHIFT.port()
            # "com.databricks.spark.redshift"

        elif self.db_driver == DriverResolver.POSTGRES_SQL.__str__():
            if port is None: self.port = DriverResolver.POSTGRES_SQL.port()
            self.driver_option = DriverResolver.POSTGRES_SQL.java_class()

        elif self.db_driver == DriverResolver.POSTGRES.__str__():  # backward compat
            if port is None: self.port = DriverResolver.POSTGRES.port()
            self.driver_option = DriverResolver.POSTGRES.java_class()
            self.db_driver = DriverResolver.POSTGRES_SQL.__str__()

        elif self.db_driver == DriverResolver.MY_SQL.__str__():

            if port is None:
                self.port = DriverResolver.MY_SQL.port()
            # "com.mysql.jdbc.Driver"

        elif self.db_driver == DriverResolver.SQL_SERVER.__str__():
            if port is None: self.port = DriverResolver.SQL_SERVER.port()
            # "com.microsoft.jdbc.sqlserver.SQLServerDriver"

        elif self.db_driver == DriverResolver.ORACLE.__str__():
            if port is None: self.port = DriverResolver.ORACLE.port()
            self.driver_option = DriverResolver.ORACLE.java_class()

        elif self.db_driver == DriverResolver.PRESTO.__str__():
            if port is None: self.port = DriverResolver.PRESTO.port()
            self.driver_option = DriverResolver.PRESTO.java_class()

        elif self.db_driver == DriverResolver.SQL_LITE.__str__():
            # SQLite does not need a port
            pass

        elif database == DriverResolver.CASSANDRA.__str__():
            # When using Cassandra there is no jdbc url since we are going to use the spark cassandra connector
            pass

        else:
            # print("Driver not supported")
            RaiseIt.value_error(
                driver,
                [database["name"] for database in DriverResolver.list()])

        if self.port is not None:
            port = self.port

        if database is None:
            database = ""

        # Build the connection string

        url = ""
        # Reference SQLite https://mitzen.blogspot.com/2017/06/pyspark-working-with-jdbc-sqlite.html
        if self.db_driver == DriverResolver.SQL_LITE.__str__():
            url = "jdbc:{DB_DRIVER}:{HOST}".format(DB_DRIVER=driver,
                                                   HOST=host,
                                                   DATABASE=database)

        elif self.db_driver == DriverResolver.POSTGRES_SQL.__str__() \
                or self.db_driver == DriverResolver.REDSHIFT.__str__() \
                or self.db_driver == DriverResolver.MY_SQL.__str__():
            # url = "jdbc:" + db_type + "://" + url + ":" + port + "/" + database + "?currentSchema=" + schema
            url = "jdbc:{DB_DRIVER}://{HOST}:{PORT}/{DATABASE}?currentSchema={SCHEMA}".format(
                DB_DRIVER=self.db_driver,
                HOST=host,
                PORT=port,
                DATABASE=database,
                SCHEMA=schema)

        elif self.db_driver == DriverResolver.SQL_SERVER.__str__():
            url = "jdbc:{DB_DRIVER}://{HOST}:{PORT};databaseName={DATABASE}".format(
                DB_DRIVER=self.db_driver,
                HOST=host,
                PORT=port,
                DATABASE=database,
                SCHEMA=schema)
        elif self.db_driver == DriverResolver.ORACLE.__str__():
            if oracle_sid:
                url = "jdbc:{DB_DRIVER}:thin:@{HOST}:{PORT}/{ORACLE_SID}".format(
                    DB_DRIVER=driver,
                    HOST=host,
                    PORT=port,
                    DATABASE=database,
                    ORACLE_SID=oracle_sid,
                    SCHEMA=schema)
            elif oracle_service_name:
                url = "jdbc:{DB_DRIVER}:thin:@//{HOST}:{PORT}/{ORACLE_SERVICE_NAME}".format(
                    DB_DRIVER=driver,
                    HOST=host,
                    PORT=port,
                    DATABASE=database,
                    ORACLE_SERVICE_NAME=oracle_service_name)

            elif oracle_tns:
                url = "jdbc:{DB_DRIVER}:thin:@//{TNS}".format(DB_DRIVER=driver,
                                                              TNS=oracle_tns)

        elif self.db_driver == DriverResolver.PRESTO.__str__():
            url = "jdbc:{DB_DRIVER}://{HOST}:{PORT}/{CATALOG}/{DATABASE}".format(
                DB_DRIVER=self.db_driver,
                HOST=host,
                PORT=port,
                CATALOG=presto_catalog,
                DATABASE=database)

        logger.print(url)

        self.url = url
        self.database = database
        self.user = user
        self.password = password
        self.schema = schema
Example #27
    def __init__(self,
                 driver,
                 host,
                 database,
                 user,
                 password,
                 port=None,
                 schema="public",
                 oracle_tns=None,
                 oracle_service_name=None,
                 oracle_sid=None):
        """
        Create the JDBC connection object
        :return:
        """

        self.db_driver = driver
        self.oracle_sid = oracle_sid

        # Handle the default port
        if self.db_driver == "redshift":
            if port is None: self.port = 5439
            # "com.databricks.spark.redshift"

        elif self.db_driver == "postgres":
            if port is None: self.port = 5432
            # "org.postgresql.Driver"

        elif self.db_driver == "mysql":
            if port is None: self.port = 3306
            # "com.mysql.jdbc.Driver"

        elif self.db_driver == "sqlserver":
            if port is None: self.port = 1433
            # "com.microsoft.jdbc.sqlserver.SQLServerDriver"

        elif self.db_driver == "oracle":
            if port is None: self.port = 1521
            self.driver_option = "oracle.jdbc.OracleDriver"

        elif self.db_driver == "sqlite":
            # SQLite does not need a port
            pass

        # TODO: add mongo?
        else:
            # print("Driver not supported")
            RaiseIt.value_error(driver,
                                ["redshift", "postgres", "mysql", "sqlserver", "oracle", "sqlite"])

        if database is None:
            database = ""

        # Build the connection string
        if self.db_driver == "sqlite":
            url = "jdbc:{DB_DRIVER}://{HOST}/{DATABASE}".format(
                DB_DRIVER=driver, HOST=host, DATABASE=database)
        elif self.db_driver == "postgres" or self.db_driver == "redshift" or self.db_driver == "mysql":
            # url = "jdbc:" + db_type + "://" + url + ":" + port + "/" + database + "?currentSchema=" + schema
            url = "jdbc:{DB_DRIVER}://{HOST}:{PORT}/{DATABASE}?currentSchema={SCHEMA}".format(
                DB_DRIVER=driver,
                HOST=host,
                PORT=port,
                DATABASE=database,
                SCHEMA=schema)

        elif self.db_driver == "oracle":
            if oracle_sid:
                url = "jdbc:{DB_DRIVER}:thin:@{HOST}:{PORT}/{ORACLE_SID}".format(
                    DB_DRIVER=driver,
                    HOST=host,
                    PORT=port,
                    DATABASE=database,
                    ORACLE_SID=oracle_sid,
                    SCHEMA=schema)
            elif oracle_service_name:
                url = "jdbc:{DB_DRIVER}:thin:@//{HOST}:{PORT}/{ORACLE_SERVICE_NAME}".format(
                    DB_DRIVER=driver,
                    HOST=host,
                    PORT=port,
                    DATABASE=database,
                    ORACLE_SERVICE_NAME=oracle_service_name)

            elif oracle_tns:
                url = "jdbc:{DB_DRIVER}:thin:@//{TNS}".format(DB_DRIVER=driver,
                                                              TNS=oracle_tns)

        logger.print(url)

        self.url = url
        self.database = database
        self.user = user
        self.password = password
        self.schema = schema
Example #28
def parse_columns(df,
                  cols_args,
                  get_args=False,
                  is_regex=None,
                  filter_by_column_dtypes=None,
                  accepts_missing_cols=False,
                  invert=False):
    """
    Return a list of columns and check that columns exists in the dataframe
    Accept '*' as parameter in which case return a list of all columns in the dataframe.
    Also accept a regex.
    If a list of tuples is given, the first element of each tuple is taken as the column name and the rest as params.
    These params can be used to create custom transformation functions. You can find an example in cols().cast()
    :param df: Dataframe in which the columns are going to be checked
    :param cols_args: Accepts '*' as param to return all the columns in the dataframe
    :param get_args: if True, return the columns together with their params
    :param is_regex: Use True if cols_args is a regex
    :param filter_by_column_dtypes: Data type(s) used to filter the columns list
    :param accepts_missing_cols: if True, do not check whether the columns exist in the dataframe
    :param invert: Invert the final selection. For example if you want to select not integers

    :return: A list of columns string names
    """

    attrs = None

    # if the columns value is '*' get all the dataframe's columns
    if is_regex is True:
        r = re.compile(cols_args[0])
        cols = list(filter(r.match, df.columns))

    elif cols_args == "*" or cols_args is None:
        cols = df.columns

    # In case we have a list of tuples, the first element of each tuple is taken as the column name
    # and the rest as params. The params can be used in a custom function as follows:
    # def func(attrs):  # attrs receives (1, 2) and (3, 4)
    #     return attrs[0] + 1
    # df.cols().apply([('col_1', 1, 2), ('cols_2', 3, 4)], func)

    # Verify if we have a list with tuples
    elif is_tuple(cols_args) or is_list_of_tuples(cols_args):
        cols_args = val_to_list(cols_args)
        # Extract a specific position in the tuple
        cols = [(i[0:1][0]) for i in cols_args]
        attrs = [(i[1:]) for i in cols_args]
    else:
        # if not a list convert to list
        cols = val_to_list(cols_args)
        # Get col name from index
        cols = [c if is_str(c) else df.columns[c] for c in cols]

    # Check for missing columns
    if accepts_missing_cols is False:
        check_for_missing_columns(df, cols)

    # Filter by column data type
    if filter_by_column_dtypes is not None:
        filter_by_column_dtypes = val_to_list(filter_by_column_dtypes)

    columns_residual = None

    # If necessary filter the columns by data type
    if filter_by_column_dtypes:
        # Get columns for every data type

        columns_filtered = filter_col_name_by_dtypes(df,
                                                     filter_by_column_dtypes)

        # Intersect the columns filtered per data type from the whole dataframe with the columns passed to the function
        final_columns = list(OrderedSet(cols).intersection(columns_filtered))

        # These columns do not match the filtered data types
        columns_residual = list(
            OrderedSet(cols) - OrderedSet(columns_filtered))
    else:
        final_columns = cols

    # Return cols, or cols and params
    cols_params = []

    if invert:
        final_columns = list(OrderedSet(cols) - OrderedSet(final_columns))

    if get_args is True:
        cols_params = final_columns, attrs
    elif get_args is False:
        cols_params = final_columns
    else:
        RaiseIt.value_error(get_args, ["True", "False"])

    if columns_residual:
        logger.print("%s %s %s", ",".join(escape_columns(columns_residual)),
                     "column(s) was not processed because is/are not",
                     ",".join(filter_by_column_dtypes))

    return cols_params
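The tuple handling described in the docstring reduces to the slicing below; a standalone sketch reusing the column names from the comment in the code:

cols_args = [("col_1", 1, 2), ("cols_2", 3, 4)]

# First element of each tuple is the column name, the rest are its params.
cols = [t[0] for t in cols_args]     # ['col_1', 'cols_2']
attrs = [t[1:] for t in cols_args]   # [(1, 2), (3, 4)]
print(cols, attrs)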