Example No. 1
    def _read_parquet_schema(self, engine) -> Tuple[Dict[str, Any], ...]:
        if engine == "pyarrow":
            from pyarrow import parquet

            from pyathena.arrow.util import to_column_info

            if not self._unload_location:
                raise ProgrammingError("UnloadLocation is none or empty.")
            bucket, key = parse_output_location(self._unload_location)
            try:
                dataset = parquet.ParquetDataset(f"{bucket}/{key}",
                                                 filesystem=self._fs,
                                                 use_legacy_dataset=False)
                return to_column_info(dataset.schema)
            except Exception as e:
                _logger.exception(f"Failed to read schema {bucket}/{key}.")
                raise OperationalError(*e.args) from e
        elif engine == "fastparquet":
            from fastparquet import ParquetFile

            # TODO: https://github.com/python/mypy/issues/1153
            from pyathena.fastparquet.util import to_column_info  # type: ignore

            if not self._data_manifest:
                self._data_manifest = self._read_data_manifest()
            bucket, key = parse_output_location(self._data_manifest[0])
            try:
                file = ParquetFile(f"{bucket}/{key}", open_with=self._fs.open)
                return to_column_info(file.schema)
            except Exception as e:
                _logger.exception(f"Failed to read schema {bucket}/{key}.")
                raise OperationalError(*e.args) from e
        else:
            raise ProgrammingError(
                "Engine must be one of `pyarrow`, `fastparquet`.")
Example No. 2
    def format(self,
               operation: str,
               parameters: Optional[Dict[str, Any]] = None) -> str:
        if not operation or not operation.strip():
            raise ProgrammingError("Query is none or empty.")
        operation = operation.strip()

        operation_upper = operation.upper()
        if (operation_upper.startswith("SELECT")
                or operation_upper.startswith("WITH")
                or operation_upper.startswith("INSERT")):
            escaper = _escape_presto
        else:
            escaper = _escape_hive

        kwargs: Optional[Dict[str, Any]] = None
        if parameters is not None:
            kwargs = dict()
            if isinstance(parameters, dict):
                for k, v in parameters.items():
                    func = self.get(v)
                    if not func:
                        raise TypeError("{0} is not defined formatter.".format(
                            type(v)))
                    kwargs.update({k: func(self, escaper, v)})
            else:
                raise ProgrammingError(
                    "Unsupported parameter " +
                    "(Support for dict only): {0}".format(parameters))

        return (operation %
                kwargs).strip() if kwargs is not None else operation.strip()
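
As a side note, the final operation % kwargs step above is plain %-style mapping substitution over pyformat placeholders; a tiny illustration with a made-up query and value:

    # The formatter escapes each value, then interpolates it into %(name)s placeholders.
    operation = "SELECT * FROM many_rows WHERE name = %(name)s"
    kwargs = {"name": "'bar'"}  # already escaped and quoted by the formatter
    print((operation % kwargs).strip())  # SELECT * FROM many_rows WHERE name = 'bar'
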
Example No. 3
    def format(self,
               operation: str,
               parameters: Optional[List[str]] = None) -> str:
        if not operation or not operation.strip():
            raise ProgrammingError("Query is none or empty.")
        operation = operation.strip()

        if operation.upper().startswith(
                "SELECT") or operation.upper().startswith("WITH"):
            escaper = _escape_presto
        else:
            escaper = _escape_hive

        kwargs: Optional[List[str]] = None
        if parameters is not None:
            kwargs = list()
            if isinstance(parameters, list):
                for v in parameters:

                    # TODO Review this annoying Decimal hack, unsure if issue in dbt, agate or pyathena
                    if isinstance(v, Decimal) and v == int(v):
                        v = int(v)

                    func = self.get(v)
                    if not func:
                        raise TypeError("{0} is not defined formatter.".format(
                            type(v)))
                    kwargs.append(func(self, escaper, v))
            else:
                raise ProgrammingError(
                    "Unsupported parameter " +
                    "(Support for list only): {0}".format(parameters))
        return (operation % tuple(kwargs)
                ).strip() if kwargs is not None else operation.strip()
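
The list-based variant above substitutes positional %s placeholders instead of named ones; a tiny illustration with a made-up query:

    operation = "SELECT * FROM many_rows WHERE id = %s AND name = %s"
    kwargs = ["1", "'bar'"]  # values already escaped by the formatter
    print((operation % tuple(kwargs)).strip())
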
Example No. 4
 def __fetch(self, next_token=None):
     if not self._query_execution.query_id:
         raise ProgrammingError('QueryExecutionId is none or empty.')
     if self._query_execution.state != 'SUCCEEDED':
         raise ProgrammingError('QueryExecutionState is not SUCCEEDED.')
     request = {
         'QueryExecutionId': self._query_execution.query_id,
         'MaxResults': self._arraysize,
     }
     if next_token:
         request.update({'NextToken': next_token})
     try:
         response = retry_api_call(
             self._connection.client.get_query_results,
             exceptions=self.retry_exceptions,
             attempt=self.retry_attempt,
             multiplier=self.retry_multiplier,
             max_delay=self.retry_max_delay,
             exp_base=self.retry_exponential_base,
             logger=_logger,
             **request)
     except Exception as e:
         _logger.exception('Failed to fetch result set.')
         raise_from(OperationalError(*e.args), e)
     else:
         return response
Example No. 5
    def format(self, operation, parameters=None):
        if not operation or not operation.strip():
            raise ProgrammingError("Query is none or empty.")
        operation = operation.strip()

        if operation.upper().startswith("SELECT") or operation.upper().startswith(
            "WITH"
        ):
            escaper = _escape_presto
        else:
            escaper = _escape_hive

        kwargs = dict()
        if parameters:
            if isinstance(parameters, dict):
                for k, v in iteritems(parameters):
                    func = self.get(v)
                    if not func:
                        raise TypeError("{0} is not defined formatter.".format(type(v)))
                    kwargs.update({k: func(self, escaper, v)})
            else:
                raise ProgrammingError(
                    "Unsupported parameter "
                    + "(Support for dict only): {0}".format(parameters)
                )

        return (operation % kwargs).strip() if kwargs else operation.strip()
Example No. 6
 def fetchmany(
     self, size: int = None
 ) -> List[Union[Tuple[Optional[Any], ...], Dict[Any, Optional[Any]]]]:
     if not self.has_result_set:
         raise ProgrammingError("No result set.")
     result_set = cast(AthenaResultSet, self.result_set)
     return result_set.fetchmany(size)
Example No. 7
 def fetchall(
     self,
 ) -> List[Union[Tuple[Optional[Any], ...], Dict[Any, Optional[Any]]]]:
     if not self.has_result_set:
         raise ProgrammingError("No result set.")
     result_set = cast(AthenaPandasResultSet, self.result_set)
     return result_set.fetchall()
Example No. 8
 def _as_pandas(self):
     import pandas as pd
     if not self.output_location:
         raise ProgrammingError('OutputLocation is none or empty.')
     bucket, key = self._parse_output_location(self.output_location)
     try:
         response = retry_api_call(self._client.get_object,
                                   config=self._retry_config,
                                   logger=_logger,
                                   Bucket=bucket,
                                   Key=key)
     except Exception as e:
         _logger.exception('Failed to download csv.')
         raise_from(OperationalError(*e.args), e)
     else:
         length = response['ContentLength']
         if length:
             df = pd.read_csv(io.BytesIO(response['Body'].read()),
                              dtype=self.dtypes,
                              converters=self.converters,
                              parse_dates=self.parse_dates,
                              infer_datetime_format=True)
             df = self._trunc_date(df)
         else:  # Allow empty response so DDL can be used
             df = pd.DataFrame()
         return df
Example No. 9
    def wrap_unload(
        operation: str,
        s3_staging_dir: str,
        format_: str = AthenaFileFormat.FILE_FORMAT_PARQUET,
        compression: str = AthenaCompression.COMPRESSION_SNAPPY,
    ):
        if not operation or not operation.strip():
            raise ProgrammingError("Query is none or empty.")

        operation_upper = operation.strip().upper()
        if operation_upper.startswith("SELECT") or operation_upper.startswith(
                "WITH"):
            now = datetime.utcnow().strftime("%Y%m%d")
            location = f"{s3_staging_dir}unload/{now}/{str(uuid.uuid4())}/"
            operation = textwrap.dedent(f"""
                UNLOAD (
                \t{operation.strip()}
                )
                TO '{location}'
                WITH (
                \tformat = '{format_}',
                \tcompression = '{compression}'
                )
                """)
        else:
            location = None
        return operation, location
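
For reference, the wrapped operation produced above looks roughly like the following; the staging directory, date, and UUID segments are placeholders, and the default format and compression constants presumably render as PARQUET and SNAPPY:

    UNLOAD (
        SELECT * FROM many_rows
    )
    TO 's3://example-bucket/athena/unload/20230101/<uuid>/'
    WITH (
        format = 'PARQUET',
        compression = 'SNAPPY'
    )
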
Example No. 10
 def arraysize(self, value):
     if value <= 0 or value > self.DEFAULT_FETCH_SIZE:
         raise ProgrammingError(
             "MaxResults is more than maximum allowed length {0}.".format(
                 self.DEFAULT_FETCH_SIZE
             )
         )
     self._arraysize = value
Example No. 11
 def arraysize(self, value: int) -> None:
     if value <= 0 or value > CursorIterator.DEFAULT_FETCH_SIZE:
         raise ProgrammingError(
             "MaxResults is more than maximum allowed length {0}.".format(
                 CursorIterator.DEFAULT_FETCH_SIZE
             )
         )
     self._arraysize = value
Example No. 12
 def fetchone(self):
     if not self._query_id:
         raise ProgrammingError('QueryExecutionId is none or empty.')
     if not self._result_set and self._next_token:
         self._fetch()
     if not self._result_set:
         return None
     else:
         self._rownumber += 1
         return self._result_set.popleft()
Example No. 13
 def __fetch(self, next_token=None):
     if not self._query_execution.query_id:
         raise ProgrammingError('QueryExecutionId is none or empty.')
     if self._query_execution.state != AthenaQueryExecution.STATE_SUCCEEDED:
         raise ProgrammingError('QueryExecutionState is not SUCCEEDED.')
     request = {
         'QueryExecutionId': self._query_execution.query_id,
         'MaxResults': self._arraysize,
     }
     if next_token:
         request.update({'NextToken': next_token})
     try:
         response = retry_api_call(self._connection.client.get_query_results,
                                   config=self._retry_config,
                                   logger=_logger,
                                   **request)
     except Exception as e:
         _logger.exception('Failed to fetch result set.')
         raise_from(OperationalError(*e.args), e)
     else:
         return response
Example No. 14
    def format(self, operation, parameters=None):
        if not operation or not operation.strip():
            raise ProgrammingError('Query is none or empty.')
        operation = operation.strip()

        if operation.upper().startswith('SELECT') or operation.upper().startswith('WITH'):
            escaper = _escape_presto
        else:
            escaper = _escape_hive

        kwargs = dict()
        if parameters:
            if isinstance(parameters, dict):
                for k, v in iteritems(parameters):
                    func = self.get_formatter(v)
                    kwargs.update({k: func(self, escaper, v)})
            else:
                raise ProgrammingError('Unsupported parameter ' +
                                       '(Support for dict only): {0}'.format(parameters))

        return (operation % kwargs).strip() if kwargs else operation.strip()
Example No. 15
    def _read_csv(self) -> "Table":
        import pyarrow as pa
        from pyarrow import csv

        if not self.output_location:
            raise ProgrammingError("OutputLocation is none or empty.")
        if not self.output_location.endswith((".csv", ".txt")):
            return pa.Table.from_pydict(dict())
        length = self._get_content_length()
        if length and self.output_location.endswith(".txt"):
            description = self.description if self.description else []
            column_names = [d[0] for d in description]
            read_opts = csv.ReadOptions(
                skip_rows=0,
                column_names=column_names,
                block_size=self._block_size,
                use_threads=True,
            )
            parse_opts = csv.ParseOptions(
                delimiter="\t",
                quote_char=False,
                double_quote=False,
                escape_char=False,
            )
        elif length and self.output_location.endswith(".csv"):
            read_opts = csv.ReadOptions(skip_rows=0,
                                        block_size=self._block_size,
                                        use_threads=True)
            parse_opts = csv.ParseOptions(
                delimiter=",",
                quote_char='"',
                double_quote=True,
                escape_char=False,
            )
        else:
            return pa.Table.from_pydict(dict())

        bucket, key = parse_output_location(self.output_location)
        try:
            return csv.read_csv(
                self._fs.open_input_stream(f"{bucket}/{key}"),
                read_options=read_opts,
                parse_options=parse_opts,
                convert_options=csv.ConvertOptions(
                    quoted_strings_can_be_null=False,
                    timestamp_parsers=self.timestamp_parsers,
                    column_types=self.column_types,
                ),
            )
        except Exception as e:
            _logger.exception(f"Failed to read {bucket}/{key}.")
            raise OperationalError(*e.args) from e
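
To make the pyarrow CSV options above concrete, a small self-contained sketch that reads tab-separated data with explicit column names; the data and names are made up.

    import io

    from pyarrow import csv

    data = io.BytesIO(b"1\tfoo\n2\tbar\n")
    table = csv.read_csv(
        data,
        read_options=csv.ReadOptions(column_names=["id", "name"]),
        parse_options=csv.ParseOptions(delimiter="\t", quote_char=False),
    )
    print(table.to_pydict())  # {'id': [1, 2], 'name': ['foo', 'bar']}
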
Example No. 16
    def _poll(self):
        if not self._query_id:
            raise ProgrammingError('QueryExecutionId is none or empty.')
        while True:
            try:
                request = {'QueryExecutionId': self._query_id}
                response = retry_api_call(self._connection.get_query_execution,
                                          exceptions=self.retry_exceptions,
                                          attempt=self.retry_attempt,
                                          multiplier=self.retry_multiplier,
                                          max_delay=self.retry_max_deply,
                                          exp_base=self.retry_exponential_base,
                                          logger=_logger,
                                          **request)
            except Exception as e:
                _logger.exception('Failed to poll query result.')
                raise_from(OperationalError(*e.args), e)
            else:
                query_execution = response.get('QueryExecution', None)
                if not query_execution:
                    raise DataError('KeyError `QueryExecution`')
                status = query_execution.get('Status', None)
                if not status:
                    raise DataError('KeyError `Status`')

                state = status.get('State', None)
                if state == 'SUCCEEDED':
                    self._completion_date_time = status.get(
                        'CompletionDateTime', None)
                    self._submission_date_time = status.get(
                        'SubmissionDateTime', None)

                    statistics = query_execution.get('Statistics', {})
                    self._data_scanned_in_bytes = statistics.get(
                        'DataScannedInBytes', None)
                    self._execution_time_in_millis = statistics.get(
                        'EngineExecutionTimeInMillis', None)

                    result_conf = query_execution.get('ResultConfiguration',
                                                      {})
                    self._output_location = result_conf.get(
                        'OutputLocation', None)
                    break
                elif state == 'FAILED':
                    raise OperationalError(
                        status.get('StateChangeReason', None))
                elif state == 'CANCELLED':
                    raise OperationalError(
                        status.get('StateChangeReason', None))
                else:
                    time.sleep(self._poll_interval)
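
For reference, the subset of the get_query_execution response that the polling loop above reads is roughly shaped like this; the values are placeholders.

    {
        'QueryExecution': {
            'Status': {
                'State': 'SUCCEEDED',  # FAILED and CANCELLED raise; anything else keeps polling
                'SubmissionDateTime': ...,
                'CompletionDateTime': ...,
                'StateChangeReason': ...,
            },
            'Statistics': {
                'DataScannedInBytes': 0,
                'EngineExecutionTimeInMillis': 0,
            },
            'ResultConfiguration': {'OutputLocation': 's3://...'},
        }
    }
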
Example No. 17
 def __fetch(self, next_token: Optional[str] = None):
     if not self.query_id:
         raise ProgrammingError("QueryExecutionId is none or empty.")
     if self.state != AthenaQueryExecution.STATE_SUCCEEDED:
         raise ProgrammingError("QueryExecutionState is not SUCCEEDED.")
     if self.is_closed:
         raise ProgrammingError("AthenaResultSet is closed.")
     request = {
         "QueryExecutionId": self.query_id,
         "MaxResults": self._arraysize,
     }
     if next_token:
         request.update({"NextToken": next_token})
     try:
         connection = cast("Connection", self._connection)
         response = retry_api_call(connection.client.get_query_results,
                                   config=self._retry_config,
                                   logger=_logger,
                                   **request)
     except Exception as e:
         _logger.exception("Failed to fetch result set.")
         raise OperationalError(*e.args) from e
     else:
         return response
Example No. 18
    def _as_pandas(self) -> "DataFrame":
        import pandas as pd

        if not self.output_location:
            raise ProgrammingError("OutputLocation is none or empty.")
        bucket, key = parse_output_location(self.output_location)
        try:
            response = retry_api_call(
                self._client.get_object,
                config=self._retry_config,
                logger=_logger,
                Bucket=bucket,
                Key=key,
            )
        except Exception as e:
            _logger.exception("Failed to download csv.")
            raise OperationalError(*e.args) from e
        else:
            length = response["ContentLength"]
            if length:
                if self.output_location.endswith(".txt"):
                    sep = "\t"
                    header = None
                    description = self.description if self.description else []
                    names: Optional[Any] = [d[0] for d in description]
                else:  # csv format
                    sep = ","
                    header = 0
                    names = None
                df = pd.read_csv(
                    response["Body"],
                    sep=sep,
                    header=header,
                    names=names,
                    dtype=self.dtypes,
                    converters=self.converters,
                    parse_dates=self.parse_dates,
                    infer_datetime_format=True,
                    skip_blank_lines=False,
                    keep_default_na=self._keep_default_na,
                    na_values=self._na_values,
                    quoting=self._quoting,
                    **self._kwargs,
                )
                df = self._trunc_date(df)
            else:  # Allow empty response
                df = pd.DataFrame()
            return df
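
A minimal local sketch of the pandas call above, with an in-memory buffer standing in for the S3 response body; column names and values are made up.

    import io

    import pandas as pd

    body = io.BytesIO(b"id,name\n1,foo\n2,\n")
    df = pd.read_csv(body, sep=",", header=0, skip_blank_lines=False)
    print(df)  # the missing field in the second row is read as NaN
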
Example No. 19
 def cancel(self):
     if not self._query_id:
         raise ProgrammingError('QueryExecutionId is none or empty.')
     try:
         request = {'QueryExecutionId': self._query_id}
         retry_api_call(self._connection.stop_query_execution,
                        exceptions=self.retry_exceptions,
                        attempt=self.retry_attempt,
                        multiplier=self.retry_multiplier,
                        max_delay=self.retry_max_deply,
                        exp_base=self.retry_exponential_base,
                        logger=_logger,
                        **request)
     except Exception as e:
         _logger.exception('Failed to cancel query.')
         raise_from(OperationalError(*e.args), e)
Example No. 20
    def _read_csv(self) -> "DataFrame":
        import pandas as pd

        if not self.output_location:
            raise ProgrammingError("OutputLocation is none or empty.")
        if not self.output_location.endswith((".csv", ".txt")):
            return pd.DataFrame()
        length = self._get_content_length()
        if length and self.output_location.endswith(".txt"):
            sep = "\t"
            header = None
            description = self.description if self.description else []
            names = [d[0] for d in description]
        elif length and self.output_location.endswith(".csv"):
            sep = ","
            header = 0
            names = None
        else:
            return pd.DataFrame()
        try:
            # TODO chunksize
            df = pd.read_csv(
                self.output_location,
                sep=sep,
                header=header,
                names=names,
                dtype=self.dtypes,
                converters=self.converters,
                parse_dates=self.parse_dates,
                infer_datetime_format=True,
                skip_blank_lines=False,
                keep_default_na=self._keep_default_na,
                na_values=self._na_values,
                quoting=self._quoting,
                storage_options={
                    "profile": self.connection.profile_name,
                    "client_kwargs": {
                        "region_name": self.connection.region_name,
                        **self.connection._client_kwargs,
                    },
                },
                **self._kwargs,
            )
            return self._trunc_date(df)
        except Exception as e:
            _logger.exception(f"Failed to read {self.output_location}.")
            raise OperationalError(*e.args) from e
Example No. 21
 def _as_pandas(self):
     import pandas as pd
     if not self.output_location:
         raise ProgrammingError('OutputLocation is none or empty.')
     bucket, key = self._parse_output_location(self.output_location)
     try:
         response = retry_api_call(self._client.get_object,
                                   Bucket=bucket,
                                   Key=key)
     except Exception as e:
         _logger.exception('Failed to download csv.')
         raise_from(OperationalError(*e.args), e)
     else:
         df = pd.read_csv(io.BytesIO(response['Body'].read()),
                          dtype=self._dtypes(),
                          converters=self._converters(),
                          parse_dates=self._parse_dates(),
                          infer_datetime_format=True)
         df = self._trunc_date(df)
         return df
Example No. 22
    def _read_parquet(self, engine) -> "DataFrame":
        import pandas as pd

        self._data_manifest = self._read_data_manifest()
        if not self._data_manifest:
            return pd.DataFrame()
        if not self._unload_location:
            self._unload_location = (
                "/".join(self._data_manifest[0].split("/")[:-1]) + "/")

        if engine == "pyarrow":
            unload_location = self._unload_location
            kwargs = {
                "use_threads": True,
                "use_legacy_dataset": False,
            }
        elif engine == "fastparquet":
            unload_location = f"{self._unload_location}*"
            kwargs = {}
        else:
            raise ProgrammingError(
                "Engine must be one of `pyarrow`, `fastparquet`.")
        kwargs.update(self._kwargs)

        try:
            return pd.read_parquet(
                unload_location,
                engine=self._engine,
                storage_options={
                    "profile": self.connection.profile_name,
                    "client_kwargs": {
                        "region_name": self.connection.region_name,
                        **self.connection._client_kwargs,
                    },
                },
                use_nullable_dtypes=False,
                **kwargs,
            )
        except Exception as e:
            _logger.exception(f"Failed to read {self.output_location}.")
            raise OperationalError(*e.args) from e
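
A hedged standalone sketch of the pandas call above; the bucket, prefix, and profile name are placeholders, and s3fs is assumed to be installed so that pandas can resolve the s3:// URL and the storage_options.

    import pandas as pd

    df = pd.read_parquet(
        "s3://example-bucket/unload/20230101/some-uuid/",  # placeholder UNLOAD location
        engine="pyarrow",
        use_threads=True,
        storage_options={"profile": "default"},  # assumed AWS profile name
    )
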
Example No. 23
 def _pre_fetch(self):
     if not self._query_id:
         raise ProgrammingError('QueryExecutionId is none or empty.')
     try:
         request = {
             'QueryExecutionId': self._query_id,
             'MaxResults': self._arraysize,
         }
         response = retry_api_call(self._connection.get_query_results,
                                   exceptions=self.retry_exceptions,
                                   attempt=self.retry_attempt,
                                   multiplier=self.retry_multiplier,
                                   max_delay=self.retry_max_deply,
                                   exp_base=self.retry_exponential_base,
                                   logger=_logger,
                                   **request)
     except Exception as e:
         _logger.exception('Failed to fetch result set.')
         raise_from(OperationalError(*e.args), e)
     else:
         self._process_meta_data(response)
         self._process_result_set(response)
Example No. 24
 def _as_pandas(self):
     import pandas as pd
     if not self.output_location:
         raise ProgrammingError('OutputLocation is none or empty.')
     bucket, key = parse_output_location(self.output_location)
     try:
         response = retry_api_call(self._client.get_object,
                                   config=self._retry_config,
                                   logger=_logger,
                                   Bucket=bucket,
                                   Key=key)
     except Exception as e:
         _logger.exception('Failed to download csv.')
         raise_from(OperationalError(*e.args), e)
     else:
         length = response['ContentLength']
         if length:
             if self.output_location.endswith('.txt'):
                 sep = '\t'
                 header = None
                 names = [d[0] for d in self.description]
             else:  # csv format
                 sep = ','
                 header = 0
                 names = None
             df = pd.read_csv(io.BytesIO(response['Body'].read()),
                              sep=sep,
                              header=header,
                              names=names,
                              dtype=self.dtypes,
                              converters=self.converters,
                              parse_dates=self.parse_dates,
                              infer_datetime_format=True,
                              skip_blank_lines=False)
             df = self._trunc_date(df)
         else:  # Allow empty response
             df = pd.DataFrame()
         return df
Example No. 25
 def _process_result_set(self, response):
     if self._meta_data is None:
         raise ProgrammingError('ResultSetMetadata is none.')
     result_set = response.get('ResultSet', None)
     if not result_set:
         raise DataError('KeyError `ResultSet`')
     rows = result_set.get('Rows', None)
     if rows is None:
         raise DataError('KeyError `Rows`')
     processed_rows = []
     if len(rows) > 0:
         offset = 1 if not self._next_token and self._is_first_row_column_labels(
             rows) else 0
         processed_rows = [
             tuple([
                 self._converter.convert(meta.get('Type', None),
                                         row.get('VarCharValue', None))
                 for meta, row in zip(self._meta_data, rows[i].get(
                     'Data', []))
             ]) for i in xrange(offset, len(rows))
         ]
     self._result_set.extend(processed_rows)
     self._next_token = response.get('NextToken', None)
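
For reference, the get_query_results rows converted here look roughly like the structure below; on the first page the first row carries the column labels, which is what the offset logic skips.

    {
        'ResultSet': {
            'Rows': [
                {'Data': [{'VarCharValue': 'id'}, {'VarCharValue': 'name'}]},  # header row
                {'Data': [{'VarCharValue': '1'}, {'VarCharValue': 'foo'}]},
            ]
        },
        'NextToken': '...',
    }
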
Example No. 26
 def cancel(self):
     if not self._query_id:
         raise ProgrammingError("QueryExecutionId is none or empty.")
     self._cancel(self._query_id)
Example No. 27
 def as_pandas(self):
     if not self.has_result_set:
         raise ProgrammingError("No result set.")
     return self._result_set.as_pandas()
Example No. 28
 def fetchall(self):
     if not self.has_result_set:
         raise ProgrammingError("No result set.")
     return self._result_set.fetchall()
Example No. 29
 def fetchmany(self, size=None):
     if not self.has_result_set:
         raise ProgrammingError("No result set.")
     return self._result_set.fetchmany(size)
Example No. 30
 def fetchone(self):
     if not self.has_result_set:
         raise ProgrammingError('No result set.')
     return self._result_set.fetchone()