Example #1
File: base.py  Project: nils-braun/dask-sql
    def fix_dtype_to_row_type(
        dc: DataContainer, row_type: "org.apache.calcite.rel.type.RelDataType"
    ):
        """
        Fix the dtype of the given data container (or: the df within it)
        to the data type given as argument.
        To prevent unneeded conversions, only convert if really needed,
        e.g. if the two types are "similar" enough, do not convert.
        Similarity means the same general type (int, float, string etc.),
        but not necessarily the same size (int64 and int32 are compatible)
        or the same nullability.
        TODO: we should check the nullability of the SQL type
        """
        df = dc.df
        cc = dc.column_container

        field_types = {
            int(field.getIndex()): str(field.getType())
            for field in row_type.getFieldList()
        }

        for index, field_type in field_types.items():
            expected_type = sql_to_python_type(field_type)
            field_name = cc.get_backend_by_frontend_index(index)

            df = cast_column_type(df, field_name, expected_type)

        return DataContainer(df, dc.column_container)
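For context, the pattern in this example is: map each SQL field type to a Python/numpy type via sql_to_python_type, then cast the matching backend column. A minimal, self-contained sketch of the same idea in plain pandas follows; the mapping dict and the fix_dtypes helper are simplified stand-ins for illustration only, not dask-sql's actual API.

import numpy as np
import pandas as pd

# Simplified stand-in for dask_sql.mappings.sql_to_python_type (assumption:
# the real mapping covers many more SQL types plus nullability handling).
_SQL_TO_NUMPY = {
    "INTEGER": np.int32,
    "BIGINT": np.int64,
    "DOUBLE": np.float64,
    "VARCHAR": object,
}


def fix_dtypes(df: pd.DataFrame, sql_types: dict) -> pd.DataFrame:
    # Cast every column whose dtype differs from the expected SQL type
    for col, sql_type in sql_types.items():
        expected = _SQL_TO_NUMPY[sql_type.upper()]
        if df[col].dtype != expected:
            df[col] = df[col].astype(expected)
    return df


df = pd.DataFrame({"a": [1.0, 2.0], "b": ["x", "y"]})
df = fix_dtypes(df, {"a": "BIGINT", "b": "VARCHAR"})  # "a" becomes int64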
Example #2
def try_to_infer_type_of_operation(operation,
                                   column_types,
                                   default_type=np.float64):
    """
    Tries to infer the return type for an operation passed to aggregate
    or extended_projection methods.
    In order to work with dask-sql, the return type should be a pandas
    or numpy type.

    Parameters
    ----------
    operation : Union[Callable, str]
        The operation to infer the type for
    column_types : pd.Series
        The dtype series mapping each column name to its dtype.
        Used if operation references a known column.
    default_type : Type, optional
        The return value if the type cannot be inferred, by default np.float64

    Returns
    -------
    Type
        An inferred return type for the operation.
    """
    try:
        # 1. First we try to guess the return type of the operation
        if isinstance(operation, (types.FunctionType, types.MethodType)):
            # operation is a custom function
            rtype = typing_callable_from_annotated_function(operation)
            rtype = get_args(rtype)[1]
        elif isinstance(operation, types.BuiltinFunctionType):
            # operation is something like 'sum'
            rtype = infer_type_builtins(operation)
            rtype = get_args(rtype)[1]
        else:
            if isinstance(operation, str):
                default_type = str
                # check if it's one of SQLAlchemy's known functions, like count
                if hasattr(functions, operation):
                    rtype = getattr(functions, operation).type
                    if inspect.isclass(rtype):
                        rtype = rtype()
                    rtype = sql_to_python_type(rtype.compile())
                else:
                    # otherwise operation is probably a str or
                    # RelationalAlgebraStringExpression representing a column
                    # literal, like 'col_a + 1', or a constant like '0'.
                    # We try to parse the expression to get the type of the
                    # variable or constant.
                    rtype = type_of_expression(
                        ast.parse(operation, mode="eval").body, column_types)
            else:
                rtype = type(operation)
    except (ValueError, TypeError, NotImplementedError, SyntaxError):
        LOG.warning(f"Unable to infer type of operation {operation}"
                    f", assuming default {default_type} type instead.")
        rtype = default_type
    return rtype
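The custom-function branch above depends on helpers from the host project (typing_callable_from_annotated_function, get_args, infer_type_builtins). The core idea for annotated functions can be sketched with the standard library alone; infer_return_type below is a hypothetical helper written for this illustration, not part of dask-sql.

import numpy as np
from typing import get_type_hints


def infer_return_type(func, default=np.float64):
    # Read the annotated return type; fall back to the default if missing
    hints = get_type_hints(func)
    return hints.get("return", default)


def my_agg(values) -> np.int64:
    return np.int64(sum(values))


assert infer_return_type(my_agg) is np.int64
assert infer_return_type(lambda x: x) is np.float64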
Example #3
File: call.py  Project: gallamine/dask-sql
    def div(self, lhs, rhs, rex=None):
        result = lhs / rhs

        output_type = str(rex.getType())
        output_type = sql_to_python_type(output_type.upper())

        is_float = pd.api.types.is_float_dtype(output_type)
        if not is_float:
            result = da.trunc(result)

        return result
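SQL integer division truncates towards zero, while Python's / always produces a float, which is why the non-float branch applies trunc. A small pandas/numpy illustration of the difference; np.trunc plays the role of da.trunc here.

import numpy as np
import pandas as pd

lhs = pd.Series([7, -7])
rhs = pd.Series([2, 2])

result = lhs / rhs            # plain float division: 3.5, -3.5
truncated = np.trunc(result)  # SQL-style result: 3.0, -3.0 (towards zero, unlike floor)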
Example #4
File: call.py  Project: gallamine/dask-sql
    def cast(self, operand, rex=None) -> SeriesOrScalar:
        if not is_frame(operand):
            return operand

        output_type = str(rex.getType())
        output_type = sql_to_python_type(output_type.upper())

        return_column = cast_column_to_type(operand, output_type)

        if return_column is None:
            return operand
        else:
            return return_column
Example #5
    def cast(self, operand, rex=None) -> SeriesOrScalar:
        output_type = str(rex.getType())
        python_type = sql_to_python_type(output_type.upper())

        return_column = cast_column_to_type(operand, python_type)

        if return_column is None:
            return_column = operand

        # TODO: ideally we don't want to directly access the datetimes,
        # but Pandas can't truncate timezone datetimes and cuDF can't
        # truncate datetimes
        if output_type == "DATE":
            return return_column.dt.floor("D").astype(python_type)

        return return_column
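The DATE branch strips the time-of-day part by flooring the timestamps to day precision before the final astype. A minimal pandas illustration of that flooring step:

import pandas as pd

s = pd.Series(pd.to_datetime(["2021-03-01 13:45:00", "2021-03-02 00:10:30"]))
dates = s.dt.floor("D")  # both values now have the time set to 00:00:00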
Example #6
    def fix_dtype_to_row_type(
            dc: DataContainer,
            row_type: "org.apache.calcite.rel.type.RelDataType"):
        """
        Fix the dtype of the given data container (or: the df within it)
        to the data type given as argument.
        To prevent unneeded conversions, only convert if really needed,
        e.g. if the two types are "similar" enough, do not convert.
        Similarity means the same general type (int, float, string etc.),
        but not necessarily the same size (int64 and int32 are compatible)
        or the same nullability.
        TODO: we should check the nullability of the SQL type
        """
        df = dc.df
        cc = dc.column_container

        field_types = {
            int(field.getIndex()): str(field.getType())
            for field in row_type.getFieldList()
        }

        for index, field_type in field_types.items():
            expected_type = sql_to_python_type(field_type)
            field_name = cc.get_backend_by_frontend_index(index)
            current_type = df[field_name].dtype

            logger.debug(
                f"Column {field_name} has type {current_type}, expecting {expected_type}..."
            )

            if similar_type(current_type, expected_type):
                logger.debug("...not converting.")
                continue

            current_float = pd.api.types.is_float_dtype(current_type)
            expected_integer = pd.api.types.is_integer_dtype(expected_type)
            if current_float and expected_integer:
                logger.debug("...truncating...")
                df[field_name] = da.trunc(df[field_name])

            logger.debug(
                f"Need to cast {field_name} from {current_type} to {expected_type}"
            )
            df[field_name] = df[field_name].astype(expected_type)

        return DataContainer(df, dc.column_container)
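The similar_type check used above is not part of this excerpt; a rough stand-in based on numpy dtype kinds conveys the idea from the docstring (same general type, size ignored). This is an approximation for illustration, not dask-sql's actual implementation.

import numpy as np


def similar_type_sketch(lhs, rhs) -> bool:
    # Dtypes count as "similar" if they share the same numpy kind,
    # e.g. int64/int32 are both 'i', float64/float32 are both 'f'
    return np.dtype(lhs).kind == np.dtype(rhs).kind


assert similar_type_sketch(np.int64, np.int32)
assert not similar_type_sketch(np.float64, np.int64)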
Example #7
    def to_dc(self,
              input_item: Any,
              table_name: str,
              format: str = None,
              **kwargs):  # pragma: no cover
        table_name = kwargs.pop("hive_table_name", table_name)
        schema = kwargs.pop("hive_schema_name", "default")

        parsed = self._parse_hive_table_description(input_item, schema,
                                                    table_name)
        (
            column_information,
            table_information,
            storage_information,
            partition_information,
        ) = parsed

        logger.debug("Extracted hive information: ")
        logger.debug(f"column information: {column_information}")
        logger.debug(f"table information: {table_information}")
        logger.debug(f"storage information: {storage_information}")
        logger.debug(f"partition information: {partition_information}")

        # Convert column information
        column_information = {
            col: sql_to_python_type(col_type.upper())
            for col, col_type in column_information.items()
        }

        # Extract format information
        if "InputFormat" in storage_information:
            format = storage_information["InputFormat"].split(".")[-1]
        # databricks format is different, see https://github.com/nils-braun/dask-sql/issues/83
        elif "InputFormat" in table_information:
            format = table_information["InputFormat"].split(".")[-1]
        else:
            raise RuntimeError(
                "Do not understand the output of 'DESCRIBE FORMATTED <table>'")

        if format == "TextInputFormat" or format == "SequenceFileInputFormat":
            storage_description = storage_information.get(
                "Storage Desc Params", {})
            read_function = partial(
                dd.read_csv,
                sep=storage_description.get("field.delim", ","),
                header=None,
            )
        elif format == "ParquetInputFormat" or format == "MapredParquetInputFormat":
            read_function = dd.read_parquet
        elif format == "OrcInputFormat":
            read_function = dd.read_orc
        elif format == "JsonInputFormat":
            read_function = dd.read_json
        else:
            raise AttributeError(
                f"Do not understand hive's table format {format}")

        def _normalize(loc):
            if loc.startswith("dbfs:/") and not loc.startswith("dbfs://"):
                # dask (or better: fsspec) needs to have the URL in a specific form
                # starting with two // after the protocol
                loc = f"dbfs://{loc.lstrip('dbfs:')}"
            # file:// is not a known protocol
            loc = loc.lstrip("file:")
            # Only allow files which do not start with . or _
            # Especially, not allow the _SUCCESS files
            return os.path.join(loc, "[A-Za-z0-9-]*")

        def wrapped_read_function(location, column_information, **kwargs):
            location = _normalize(location)
            logger.debug(f"Reading in hive data from {location}")
            df = read_function(location, **kwargs)

            logger.debug(f"Applying column information: {column_information}")
            df = df.rename(
                columns=dict(zip(df.columns, column_information.keys())))

            for col, expected_type in column_information.items():
                df = cast_column_type(df, col, expected_type)

            return df

        if partition_information:
            partition_list = self._parse_hive_partition_description(
                input_item, schema, table_name)
            logger.debug(f"Reading in partitions from {partition_list}")

            tables = []
            for partition in partition_list:
                parsed = self._parse_hive_table_description(
                    input_item, schema, table_name, partition=partition)
                (
                    partition_column_information,
                    partition_table_information,
                    _,
                    _,
                ) = parsed

                location = partition_table_information["Location"]
                table = wrapped_read_function(location,
                                              partition_column_information,
                                              **kwargs)

                # Now add the additional partition columns
                partition_values = ast.literal_eval(
                    partition_table_information["Partition Value"])

                logger.debug(
                    f"Applying additional partition information as columns: {partition_information}"
                )

                partition_id = 0
                for partition_key, partition_type in partition_information.items():
                    table[partition_key] = partition_values[partition_id]
                    table = cast_column_type(table, partition_key,
                                             partition_type)

                    partition_id += 1

                tables.append(table)

            return dd.concat(tables)

        location = table_information["Location"]
        df = wrapped_read_function(location, column_information, **kwargs)
        return df
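The if/elif chain that maps a Hive InputFormat to a dask.dataframe reader could equally be written as a dispatch table. The sketch below uses the same dask.dataframe readers as the example above; it is an alternative formulation, not the project's code, and the field-delimiter lookup from the TextInputFormat branch is omitted for brevity.

from functools import partial

import dask.dataframe as dd

_READERS = {
    "TextInputFormat": partial(dd.read_csv, header=None),
    "SequenceFileInputFormat": partial(dd.read_csv, header=None),
    "ParquetInputFormat": dd.read_parquet,
    "MapredParquetInputFormat": dd.read_parquet,
    "OrcInputFormat": dd.read_orc,
    "JsonInputFormat": dd.read_json,
}


def pick_read_function(format: str):
    # Same error behaviour as the original: unknown formats raise AttributeError
    try:
        return _READERS[format]
    except KeyError:
        raise AttributeError(f"Do not understand hive's table format {format}")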