Example No. 1
    def _poll(self):
        if not self._query_id:
            raise ProgrammingError('QueryExecutionId is none or empty.')
        while True:
            try:
                request = {'QueryExecutionId': self._query_id}
                response = retry_api_call(self._connection.get_query_execution,
                                          exceptions=self.retry_exceptions,
                                          attempt=self.retry_attempt,
                                          multiplier=self.retry_multiplier,
                                          max_delay=self.retry_max_deply,
                                          exp_base=self.retry_exponential_base,
                                          logger=_logger,
                                          **request)
            except Exception as e:
                _logger.exception('Failed to poll query result.')
                raise_from(OperationalError(*e.args), e)
            else:
                query_execution = response.get('QueryExecution', None)
                if not query_execution:
                    raise DataError('KeyError `QueryExecution`')
                status = query_execution.get('Status', None)
                if not status:
                    raise DataError('KeyError `Status`')

                state = status.get('State', None)
                if state == 'SUCCEEDED':
                    self._completion_date_time = status.get(
                        'CompletionDateTime', None)
                    self._submission_date_time = status.get(
                        'SubmissionDateTime', None)

                    statistics = query_execution.get('Statistics', {})
                    self._data_scanned_in_bytes = statistics.get(
                        'DataScannedInBytes', None)
                    self._execution_time_in_millis = statistics.get(
                        'EngineExecutionTimeInMillis', None)

                    result_conf = query_execution.get('ResultConfiguration',
                                                      {})
                    self._output_location = result_conf.get(
                        'OutputLocation', None)
                    break
                elif state == 'FAILED':
                    raise OperationalError(
                        status.get('StateChangeReason', None))
                elif state == 'CANCELLED':
                    raise OperationalError(
                        status.get('StateChangeReason', None))
                else:
                    time.sleep(self._poll_interval)
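The examples on this page only show call sites of retry_api_call; its definition is not included here. As a rough sketch of what such a retry helper could look like, built on the tenacity library and using the parameter names visible in the call above (the default values and the error-code check are illustrative assumptions, not PyAthena's actual implementation):

    import tenacity
    from tenacity import retry_if_exception, stop_after_attempt, wait_exponential

    def retry_api_call(func, exceptions=("ThrottlingException",), attempt=5,
                       multiplier=1, max_delay=100, exp_base=2, logger=None,
                       *args, **kwargs):
        # Retry only when the boto3 error response carries one of the given
        # error codes (an assumption about how the string-based ``exceptions``
        # argument is meant to be used).
        def _is_retryable(exc):
            code = getattr(exc, "response", {}).get("Error", {}).get("Code", None)
            return code in exceptions

        retrying = tenacity.Retrying(
            retry=retry_if_exception(_is_retryable),
            stop=stop_after_attempt(attempt),
            wait=wait_exponential(multiplier=multiplier, max=max_delay,
                                  exp_base=exp_base),
            reraise=True,
        )
        # ``logger`` is accepted for signature compatibility; logging is omitted.
        return retrying(func, *args, **kwargs)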
Example No. 2
 def _list_table_metadata(
     self,
     max_results: Optional[int] = None,
     catalog_name: Optional[str] = None,
     schema_name: Optional[str] = None,
     expression: Optional[str] = None,
     next_token: Optional[str] = None,
 ) -> Tuple[Optional[str], List[AthenaTableMetadata]]:
     request = self._build_list_table_metadata_request(
         max_results=max_results,
         catalog_name=catalog_name,
         schema_name=schema_name,
         expression=expression,
         next_token=next_token,
     )
     try:
         response = retry_api_call(
             self.connection._client.list_table_metadata,
             config=self._retry_config,
             logger=_logger,
             **request
         )
     except Exception as e:
         _logger.exception("Failed to list table metadata.")
         raise OperationalError(*e.args) from e
     else:
         return response.get("NextToken", None), [
             AthenaTableMetadata({"TableMetadata": r})
             for r in response.get("TableMetadataList", [])
         ]
Example No. 3
 def execute(
     self,
     operation: str,
     parameters: Optional[Dict[str, Any]] = None,
     work_group: Optional[str] = None,
     s3_staging_dir: Optional[str] = None,
     cache_size: int = 0,
     cache_expiration_time: int = 0,
 ):
     self._reset_state()
     self.query_id = self._execute(
         operation,
         parameters=parameters,
         work_group=work_group,
         s3_staging_dir=s3_staging_dir,
         cache_size=cache_size,
         cache_expiration_time=cache_expiration_time,
     )
     query_execution = self._poll(self.query_id)
     if query_execution.state == AthenaQueryExecution.STATE_SUCCEEDED:
         self.result_set = self._result_set_class(
             self._connection,
             self._converter,
             query_execution,
             self.arraysize,
             self._retry_config,
         )
     else:
         raise OperationalError(query_execution.state_change_reason)
     return self
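For context, an execute() like this one is normally reached through PyAthena's DB-API interface. A usage sketch along the lines of the documented quick start (bucket and region values are placeholders):

    from pyathena import connect

    cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/",
                     region_name="us-west-2").cursor()
    cursor.execute("SELECT one FROM one_row")
    print(cursor.fetchall())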
Example No. 4
 def execute(
     self,
     operation,
     parameters=None,
     work_group=None,
     s3_staging_dir=None,
     cache_size=0,
 ):
     self._reset_state()
     self._query_id = self._execute(
         operation,
         parameters=parameters,
         work_group=work_group,
         s3_staging_dir=s3_staging_dir,
         cache_size=cache_size,
     )
     query_execution = self._poll(self._query_id)
     if query_execution.state == AthenaQueryExecution.STATE_SUCCEEDED:
         self._result_set = AthenaResultSet(
             self._connection,
             self._converter,
             query_execution,
             self.arraysize,
             self._retry_config,
         )
     else:
         raise OperationalError(query_execution.state_change_reason)
     return self
Example No. 5
 def _list_query_executions(
     self,
     max_results: Optional[int] = None,
     work_group: Optional[str] = None,
     next_token: Optional[str] = None,
 ) -> Tuple[Optional[str], List[AthenaQueryExecution]]:
     request = self._build_list_query_executions_request(
         max_results=max_results, work_group=work_group, next_token=next_token
     )
     try:
         response = retry_api_call(
             self.connection._client.list_query_executions,
             config=self._retry_config,
             logger=_logger,
             **request
         )
     except Exception as e:
         _logger.exception("Failed to list query executions.")
         raise OperationalError(*e.args) from e
     else:
         next_token = response.get("NextToken", None)
         query_ids = response.get("QueryExecutionIds", None)
         if not query_ids:
             return next_token, []
         return next_token, self._batch_get_query_execution(query_ids)
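Because the method returns a (next_token, results) pair, a caller could page through all query executions with a loop along these lines (illustrative only; the work group name is a placeholder and the private method is called directly just to mirror the example above):

    executions, next_token = [], None
    while True:
        next_token, page = cursor._list_query_executions(
            max_results=50, work_group="primary", next_token=next_token)
        executions.extend(page)
        if not next_token:
            break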
Example No. 6
 def _as_pandas(self):
     import pandas as pd
     if not self.output_location:
         raise ProgrammingError('OutputLocation is none or empty.')
     bucket, key = self._parse_output_location(self.output_location)
     try:
         response = retry_api_call(self._client.get_object,
                                   config=self._retry_config,
                                   logger=_logger,
                                   Bucket=bucket,
                                   Key=key)
     except Exception as e:
         _logger.exception('Failed to download csv.')
         raise_from(OperationalError(*e.args), e)
     else:
         length = response['ContentLength']
         if length:
             df = pd.read_csv(io.BytesIO(response['Body'].read()),
                              dtype=self.dtypes,
                              converters=self.converters,
                              parse_dates=self.parse_dates,
                              infer_datetime_format=True)
             df = self._trunc_date(df)
         else:  # Allow empty response so DDL can be used
             df = pd.DataFrame()
         return df
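Result sets like this one are usually consumed through the pandas cursor rather than by calling _as_pandas() directly. A hedged usage sketch (the import path is pyathena.pandas.cursor in recent releases, pyathena.pandas_cursor in older ones; bucket and region are placeholders):

    from pyathena import connect
    from pyathena.pandas.cursor import PandasCursor

    cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/",
                     region_name="us-west-2",
                     cursor_class=PandasCursor).cursor()
    df = cursor.execute("SELECT * FROM many_rows").as_pandas()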
Example No. 7
 def __fetch(self, next_token=None):
     if not self._query_execution.query_id:
         raise ProgrammingError('QueryExecutionId is none or empty.')
     if self._query_execution.state != 'SUCCEEDED':
         raise ProgrammingError('QueryExecutionState is not SUCCEEDED.')
     request = {
         'QueryExecutionId': self._query_execution.query_id,
         'MaxResults': self._arraysize,
     }
     if next_token:
         request.update({'NextToken': next_token})
     try:
         response = retry_api_call(
             self._connection.client.get_query_results,
             exceptions=self.retry_exceptions,
             attempt=self.retry_attempt,
             multiplier=self.retry_multiplier,
             max_delay=self.retry_max_delay,
             exp_base=self.retry_exponential_base,
             logger=_logger,
             **request)
     except Exception as e:
         _logger.exception('Failed to fetch result set.')
         raise_from(OperationalError(*e.args), e)
     else:
         return response
Example No. 8
 def execute(
     self,
     operation,
     parameters=None,
     work_group=None,
     s3_staging_dir=None,
     cache_size=0,
     keep_default_na=False,
     na_values=None,
     quoting=1,
 ):
     self._reset_state()
     self._query_id = self._execute(
         operation,
         parameters=parameters,
         work_group=work_group,
         s3_staging_dir=s3_staging_dir,
         cache_size=cache_size,
     )
     query_execution = self._poll(self._query_id)
     if query_execution.state == AthenaQueryExecution.STATE_SUCCEEDED:
         self._result_set = AthenaPandasResultSet(
             connection=self._connection,
             converter=self._converter,
             query_execution=query_execution,
             arraysize=self.arraysize,
             retry_config=self._retry_config,
             keep_default_na=keep_default_na,
             na_values=na_values,
             quoting=quoting,
         )
     else:
         raise OperationalError(query_execution.state_change_reason)
     return self
Example No. 9
 def _cancel(self, query_id: str) -> None:
     request = {"QueryExecutionId": query_id}
     try:
         retry_api_call(self._connection.client.stop_query_execution,
                        config=self._retry_config,
                        logger=_logger,
                        **request)
     except Exception as e:
         _logger.exception("Failed to cancel query.")
         raise OperationalError(*e.args) from e
Example No. 10
 def _cancel(self, query_id):
     request = {'QueryExecutionId': query_id}
     try:
         retry_api_call(self._connection.client.stop_query_execution,
                        config=self._retry_config,
                        logger=_logger,
                        **request)
     except Exception as e:
         _logger.exception('Failed to cancel query.')
         raise_from(OperationalError(*e.args), e)
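A _cancel() such as this backs the public cancel() call, which is typically issued from another thread while execute() is still blocking. A sketch of that pattern (query, bucket, and region are placeholders; the sleep simply gives the query time to start):

    import time
    from concurrent.futures import ThreadPoolExecutor

    from pyathena import connect

    cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/",
                     region_name="us-west-2").cursor()
    with ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(cursor.execute, "SELECT * FROM a_large_table")
        time.sleep(5)
        cursor.cancel()  # stops the in-flight query via stop_query_execution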
Example No. 11
 def execute(self, operation, parameters=None):
     self._reset_state()
     self._query_id = self._execute(operation, parameters)
     query_execution = self._poll(self._query_id)
     if query_execution.state == AthenaQueryExecution.STATE_SUCCEEDED:
         self._result_set = AthenaResultSet(self._connection,
                                            self._converter,
                                            query_execution, self.arraysize,
                                            self._retry_config)
     else:
         raise OperationalError(query_execution.state_change_reason)
     return self
Example No. 12
 def execute(
     self: _T,
     operation: str,
     parameters: Optional[Dict[str, Any]] = None,
     work_group: Optional[str] = None,
     s3_staging_dir: Optional[str] = None,
     cache_size: int = 0,
     cache_expiration_time: int = 0,
     keep_default_na: bool = False,
     na_values: Optional[Iterable[str]] = ("", ),
     quoting: int = 1,
     **kwargs,
 ) -> _T:
     self._reset_state()
     if self._unload:
         s3_staging_dir = s3_staging_dir if s3_staging_dir else self._s3_staging_dir
         assert (
             s3_staging_dir
         ), "If the unload option is used, s3_staging_dir is required."
         operation, unload_location = self._formatter.wrap_unload(
             operation,
             s3_staging_dir=s3_staging_dir,
             format_=AthenaFileFormat.FILE_FORMAT_PARQUET,
             compression=AthenaCompression.COMPRESSION_SNAPPY,
         )
     else:
         unload_location = None
     self.query_id = self._execute(
         operation,
         parameters=parameters,
         work_group=work_group,
         s3_staging_dir=s3_staging_dir,
         cache_size=cache_size,
         cache_expiration_time=cache_expiration_time,
     )
     query_execution = self._poll(self.query_id)
     if query_execution.state == AthenaQueryExecution.STATE_SUCCEEDED:
         self.result_set = AthenaPandasResultSet(
             connection=self._connection,
             converter=self._converter,
             query_execution=query_execution,
             arraysize=self.arraysize,
             retry_config=self._retry_config,
             keep_default_na=keep_default_na,
             na_values=na_values,
             quoting=quoting,
             unload=self._unload,
             unload_location=unload_location,
             **kwargs,
         )
     else:
         raise OperationalError(query_execution.state_change_reason)
     return self
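The unload branch above is only taken when the pandas cursor is created with unloading enabled. A hedged usage sketch (whether the option is spelled unload=True at cursor creation may vary between versions; bucket and region are placeholders):

    from pyathena import connect
    from pyathena.pandas.cursor import PandasCursor

    cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/",
                     region_name="us-west-2",
                     cursor_class=PandasCursor).cursor(unload=True)
    df = cursor.execute("SELECT * FROM many_rows").as_pandas()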
Example No. 13
 def execute(self, operation, parameters=None):
     self._reset_state()
     self._query_id = self._execute(operation, parameters)
     query_execution = self._poll(self._query_id)
     if query_execution.state == AthenaQueryExecution.STATE_SUCCEEDED:
         self._result_set = AthenaResultSet(
             self._connection, self._converter, query_execution,
             self.arraysize, self.retry_exceptions, self.retry_attempt,
             self.retry_multiplier, self.retry_max_delay,
             self.retry_exponential_base)
     else:
         raise OperationalError(query_execution.state_change_reason)
Example No. 14
 def _get_query_execution(self, query_id: str) -> AthenaQueryExecution:
     request = {"QueryExecutionId": query_id}
     try:
         response = retry_api_call(
             self._connection.client.get_query_execution,
             config=self._retry_config,
             logger=_logger,
             **request)
     except Exception as e:
         _logger.exception("Failed to get query execution.")
         raise OperationalError(*e.args) from e
     else:
         return AthenaQueryExecution(response)
Example No. 15
 def _get_query_execution(self, query_id):
     request = {'QueryExecutionId': query_id}
     try:
         response = retry_api_call(
             self._connection.client.get_query_execution,
             config=self._retry_config,
             logger=_logger,
             **request)
     except Exception as e:
         _logger.exception('Failed to get query execution.')
         raise_from(OperationalError(*e.args), e)
     else:
         return AthenaQueryExecution(response)
Example No. 16
 def _cancel(self, query_id):
     request = {'QueryExecutionId': query_id}
     try:
         retry_api_call(self._connection.stop_query_execution,
                        exceptions=self.retry_exceptions,
                        attempt=self.retry_attempt,
                        multiplier=self.retry_multiplier,
                        max_delay=self.retry_max_delay,
                        exp_base=self.retry_exponential_base,
                        logger=_logger,
                        **request)
     except Exception as e:
         _logger.exception('Failed to cancel query.')
         raise_from(OperationalError(*e.args), e)
Example No. 17
    def _as_pandas(self) -> "DataFrame":
        import pandas as pd

        if not self.output_location:
            raise ProgrammingError("OutputLocation is none or empty.")
        bucket, key = parse_output_location(self.output_location)
        try:
            response = retry_api_call(
                self._client.get_object,
                config=self._retry_config,
                logger=_logger,
                Bucket=bucket,
                Key=key,
            )
        except Exception as e:
            _logger.exception("Failed to download csv.")
            raise OperationalError(*e.args) from e
        else:
            length = response["ContentLength"]
            if length:
                if self.output_location.endswith(".txt"):
                    sep = "\t"
                    header = None
                    description = self.description if self.description else []
                    names: Optional[Any] = [d[0] for d in description]
                else:  # csv format
                    sep = ","
                    header = 0
                    names = None
                df = pd.read_csv(
                    response["Body"],
                    sep=sep,
                    header=header,
                    names=names,
                    dtype=self.dtypes,
                    converters=self.converters,
                    parse_dates=self.parse_dates,
                    infer_datetime_format=True,
                    skip_blank_lines=False,
                    keep_default_na=self._keep_default_na,
                    na_values=self._na_values,
                    quoting=self._quoting,
                    **self._kwargs,
                )
                df = self._trunc_date(df)
            else:  # Allow empty response
                df = pd.DataFrame()
            return df
Example No. 18
 def cancel(self):
     if not self._query_id:
         raise ProgrammingError('QueryExecutionId is none or empty.')
     try:
         request = {'QueryExecutionId': self._query_id}
         retry_api_call(self._connection.stop_query_execution,
                        exceptions=self.retry_exceptions,
                        attempt=self.retry_attempt,
                        multiplier=self.retry_multiplier,
                        max_delay=self.retry_max_deply,
                        exp_base=self.retry_exponential_base,
                        logger=_logger,
                        **request)
     except Exception as e:
         _logger.exception('Failed to cancel query.')
         raise_from(OperationalError(*e.args), e)
Example No. 19
 def _query_execution(self, query_id):
     request = {'QueryExecutionId': query_id}
     try:
         response = retry_api_call(self._connection.get_query_execution,
                                   exceptions=self.retry_exceptions,
                                   attempt=self.retry_attempt,
                                   multiplier=self.retry_multiplier,
                                   max_delay=self.retry_max_delay,
                                   exp_base=self.retry_exponential_base,
                                   logger=_logger,
                                   **request)
     except Exception as e:
         _logger.exception('Failed to get query execution.')
         raise_from(OperationalError(*e.args), e)
     else:
         return AthenaQueryExecution(response)
Example No. 20
    def execute(self, sql, bindings=None):
        if bindings is not None:
            # Presto doesn't actually pass bindings along, so we have to do the
            # escaping and formatting ourselves
            bindings = tuple(self._escape_value(b) for b in bindings)
            sql = sql % bindings

        query_id, future = self._cursor.execute(sql)
        result_set = future.result()

        if result_set.state != AthenaQueryExecution.STATE_SUCCEEDED:
            raise OperationalError(result_set.state_change_reason)

        self._fetch_result = result_set.fetchall()
        self._query_id = query_id
        self._state = result_set.state
        return self
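The wrapper above assumes an asynchronous cursor whose execute() returns a (query_id, future) pair, which is how PyAthena's AsyncCursor behaves. A stand-alone sketch of that underlying usage (placeholders as before):

    from pyathena import connect
    from pyathena.async_cursor import AsyncCursor

    cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/",
                     region_name="us-west-2",
                     cursor_class=AsyncCursor).cursor()
    query_id, future = cursor.execute("SELECT one FROM one_row")
    result_set = future.result()
    print(result_set.state, result_set.fetchall())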
Example No. 21
 def _batch_get_query_execution(
     self, query_ids: List[str]
 ) -> List[AthenaQueryExecution]:
     try:
         response = retry_api_call(
             self.connection._client.batch_get_query_execution,
             config=self._retry_config,
             logger=_logger,
             QueryExecutionIds=query_ids,
         )
     except Exception as e:
         _logger.exception("Failed to batch get query execution.")
         raise OperationalError(*e.args) from e
     else:
         return [
             AthenaQueryExecution({"QueryExecution": r})
             for r in response.get("QueryExecutions", [])
         ]
Example No. 22
 def _as_pandas(self):
     import pandas as pd
     if not self.output_location:
         raise ProgrammingError('OutputLocation is none or empty.')
     bucket, key = self._parse_output_location(self.output_location)
     try:
         response = retry_api_call(self._client.get_object,
                                   Bucket=bucket,
                                   Key=key)
     except Exception as e:
         _logger.exception('Failed to download csv.')
         raise_from(OperationalError(*e.args), e)
     else:
         df = pd.read_csv(io.BytesIO(response['Body'].read()),
                          dtype=self._dtypes(),
                          converters=self._converters(),
                          parse_dates=self._parse_dates(),
                          infer_datetime_format=True)
         df = self._trunc_date(df)
         return df
Example No. 23
 def __fetch(self, next_token=None):
     if not self._query_execution.query_id:
         raise ProgrammingError('QueryExecutionId is none or empty.')
     if self._query_execution.state != AthenaQueryExecution.STATE_SUCCEEDED:
         raise ProgrammingError('QueryExecutionState is not SUCCEEDED.')
     request = {
         'QueryExecutionId': self._query_execution.query_id,
         'MaxResults': self._arraysize,
     }
     if next_token:
         request.update({'NextToken': next_token})
     try:
         response = retry_api_call(self._connection.client.get_query_results,
                                   config=self._retry_config,
                                   logger=_logger,
                                   **request)
     except Exception as e:
         _logger.exception('Failed to fetch result set.')
         raise_from(OperationalError(*e.args), e)
     else:
         return response
Example No. 24
 def _pre_fetch(self):
     if not self._query_id:
         raise ProgrammingError('QueryExecutionId is none or empty.')
     try:
         request = {
             'QueryExecutionId': self._query_id,
             'MaxResults': self._arraysize,
         }
         response = retry_api_call(self._connection.get_query_results,
                                   exceptions=self.retry_exceptions,
                                   attempt=self.retry_attempt,
                                   multiplier=self.retry_multiplier,
                                   max_delay=self.retry_max_deply,
                                   exp_base=self.retry_exponential_base,
                                   logger=_logger,
                                   **request)
     except Exception as e:
         _logger.exception('Failed to fetch result set.')
         raise_from(OperationalError(*e.args), e)
     else:
         self._process_meta_data(response)
         self._process_result_set(response)
Example No. 25
 def execute(
     self,
     operation: str,
     parameters: Optional[Dict[str, Any]] = None,
     work_group: Optional[str] = None,
     s3_staging_dir: Optional[str] = None,
     cache_size: int = 0,
     cache_expiration_time: int = 0,
     keep_default_na: bool = False,
     na_values: Optional[Iterable[str]] = ("", ),
     quoting: int = 1,
     **kwargs,
 ):
     self._reset_state()
     self.query_id = self._execute(
         operation,
         parameters=parameters,
         work_group=work_group,
         s3_staging_dir=s3_staging_dir,
         cache_size=cache_size,
         cache_expiration_time=cache_expiration_time,
     )
     query_execution = self._poll(self.query_id)
     if query_execution.state == AthenaQueryExecution.STATE_SUCCEEDED:
         self.result_set = AthenaPandasResultSet(
             connection=self._connection,
             converter=self._converter,
             query_execution=query_execution,
             arraysize=self.arraysize,
             retry_config=self._retry_config,
             keep_default_na=keep_default_na,
             na_values=na_values,
             quoting=quoting,
             **kwargs,
         )
     else:
         raise OperationalError(query_execution.state_change_reason)
     return self
Example No. 26
 def _as_pandas(self):
     import pandas as pd
     if not self.output_location:
         raise ProgrammingError('OutputLocation is none or empty.')
     bucket, key = parse_output_location(self.output_location)
     try:
         response = retry_api_call(self._client.get_object,
                                   config=self._retry_config,
                                   logger=_logger,
                                   Bucket=bucket,
                                   Key=key)
     except Exception as e:
         _logger.exception('Failed to download csv.')
         raise_from(OperationalError(*e.args), e)
     else:
         length = response['ContentLength']
         if length:
             if self.output_location.endswith('.txt'):
                 sep = '\t'
                 header = None
                 names = [d[0] for d in self.description]
             else:  # csv format
                 sep = ','
                 header = 0
                 names = None
             df = pd.read_csv(io.BytesIO(response['Body'].read()),
                              sep=sep,
                              header=header,
                              names=names,
                              dtype=self.dtypes,
                              converters=self.converters,
                              parse_dates=self.parse_dates,
                              infer_datetime_format=True,
                              skip_blank_lines=False)
             df = self._trunc_date(df)
         else:  # Allow empty response
             df = pd.DataFrame()
         return df
Example No. 27
 def _get_table_metadata(
     self,
     table_name: str,
     catalog_name: Optional[str] = None,
     schema_name: Optional[str] = None,
 ) -> AthenaTableMetadata:
     request = {
         "CatalogName": catalog_name if catalog_name else self._catalog_name,
         "DatabaseName": schema_name if schema_name else self._schema_name,
         "TableName": table_name,
     }
     try:
         response = retry_api_call(
             self._connection.client.get_table_metadata,
             config=self._retry_config,
             logger=_logger,
             **request
         )
     except Exception as e:
         _logger.exception("Failed to get table metadata.")
         raise OperationalError(*e.args) from e
     else:
         return AthenaTableMetadata(response)
Example No. 28
 def __fetch(self, next_token: Optional[str] = None):
     if not self.query_id:
         raise ProgrammingError("QueryExecutionId is none or empty.")
     if self.state != AthenaQueryExecution.STATE_SUCCEEDED:
         raise ProgrammingError("QueryExecutionState is not SUCCEEDED.")
     if self.is_closed:
         raise ProgrammingError("AthenaResultSet is closed.")
     request = {
         "QueryExecutionId": self.query_id,
         "MaxResults": self._arraysize,
     }
     if next_token:
         request.update({"NextToken": next_token})
     try:
         connection = cast("Connection", self._connection)
         response = retry_api_call(connection.client.get_query_results,
                                   config=self._retry_config,
                                   logger=_logger,
                                   **request)
     except Exception as e:
         _logger.exception("Failed to fetch result set.")
         raise OperationalError(*e.args) from e
     else:
         return response
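__fetch() returns the raw GetQueryResults payload. Rows can then be pulled out of the standard boto3 response shape, roughly as follows (a sketch based on the AWS API structure, not on PyAthena's own row conversion):

    def iter_rows(response):
        # ``response`` is a get_query_results payload like the one returned above.
        for row in response["ResultSet"]["Rows"]:
            yield [col.get("VarCharValue") for col in row["Data"]]
        # response.get("NextToken") indicates whether more pages remain.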