def _read_parquet_schema(self, engine) -> Tuple[Dict[str, Any], ...]:
    if engine == "pyarrow":
        from pyarrow import parquet

        from pyathena.arrow.util import to_column_info

        if not self._unload_location:
            raise ProgrammingError("UnloadLocation is none or empty.")
        bucket, key = parse_output_location(self._unload_location)
        try:
            dataset = parquet.ParquetDataset(
                f"{bucket}/{key}", filesystem=self._fs, use_legacy_dataset=False
            )
            return to_column_info(dataset.schema)
        except Exception as e:
            _logger.exception(f"Failed to read schema {bucket}/{key}.")
            raise OperationalError(*e.args) from e
    elif engine == "fastparquet":
        from fastparquet import ParquetFile

        # TODO: https://github.com/python/mypy/issues/1153
        from pyathena.fastparquet.util import to_column_info  # type: ignore

        if not self._data_manifest:
            self._data_manifest = self._read_data_manifest()
        bucket, key = parse_output_location(self._data_manifest[0])
        try:
            file = ParquetFile(f"{bucket}/{key}", open_with=self._fs.open)
            return to_column_info(file.schema)
        except Exception as e:
            _logger.exception(f"Failed to read schema {bucket}/{key}.")
            raise OperationalError(*e.args) from e
    else:
        raise ProgrammingError("Engine must be one of `pyarrow`, `fastparquet`.")
def format(
    self, operation: str, parameters: Optional[Dict[str, Any]] = None
) -> str:
    if not operation or not operation.strip():
        raise ProgrammingError("Query is none or empty.")
    operation = operation.strip()
    operation_upper = operation.upper()
    if (
        operation_upper.startswith("SELECT")
        or operation_upper.startswith("WITH")
        or operation_upper.startswith("INSERT")
    ):
        escaper = _escape_presto
    else:
        escaper = _escape_hive

    kwargs: Optional[Dict[str, Any]] = None
    if parameters is not None:
        kwargs = dict()
        if isinstance(parameters, dict):
            for k, v in parameters.items():
                func = self.get(v)
                if not func:
                    raise TypeError("{0} is not defined formatter.".format(type(v)))
                kwargs.update({k: func(self, escaper, v)})
        else:
            raise ProgrammingError(
                "Unsupported parameter (Support for dict only): {0}".format(parameters)
            )
    return (operation % kwargs).strip() if kwargs is not None else operation.strip()
def format(self, operation: str, parameters: Optional[List[str]] = None) -> str:
    if not operation or not operation.strip():
        raise ProgrammingError("Query is none or empty.")
    operation = operation.strip()
    if operation.upper().startswith("SELECT") or operation.upper().startswith("WITH"):
        escaper = _escape_presto
    else:
        escaper = _escape_hive

    kwargs: Optional[List[str]] = None
    if parameters is not None:
        kwargs = list()
        if isinstance(parameters, list):
            for v in parameters:
                # TODO Review this annoying Decimal hack, unsure if issue
                # is in dbt, agate or pyathena.
                if isinstance(v, Decimal) and v == int(v):
                    v = int(v)
                func = self.get(v)
                if not func:
                    raise TypeError("{0} is not defined formatter.".format(type(v)))
                kwargs.append(func(self, escaper, v))
        else:
            raise ProgrammingError(
                "Unsupported parameter (Support for list only): {0}".format(parameters)
            )
    return (operation % tuple(kwargs)).strip() if kwargs is not None else operation.strip()
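# A minimal sketch of how the escaped parameters complete the query via
# %-style substitution (the escaped values below are assumptions about what
# _escape_presto would return, not its actual output). The dict-based
# format() expects pyformat placeholders; the list-based variant expects
# positional %s placeholders:
def _demo_pyformat_substitution() -> None:
    escaped = {"name": "'foo'", "id": "1"}  # assumed escaper output
    operation = "SELECT * FROM t WHERE name = %(name)s AND id = %(id)s"
    print((operation % escaped).strip())
    # -> SELECT * FROM t WHERE name = 'foo' AND id = 1
    print(("SELECT * FROM t WHERE id = %s" % tuple(["1"])).strip())
    # -> SELECT * FROM t WHERE id = 1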
def __fetch(self, next_token=None):
    if not self._query_execution.query_id:
        raise ProgrammingError('QueryExecutionId is none or empty.')
    if self._query_execution.state != 'SUCCEEDED':
        raise ProgrammingError('QueryExecutionState is not SUCCEEDED.')
    request = {
        'QueryExecutionId': self._query_execution.query_id,
        'MaxResults': self._arraysize,
    }
    if next_token:
        request.update({'NextToken': next_token})
    try:
        response = retry_api_call(
            self._connection.client.get_query_results,
            exceptions=self.retry_exceptions,
            attempt=self.retry_attempt,
            multiplier=self.retry_multiplier,
            max_delay=self.retry_max_delay,
            exp_base=self.retry_exponential_base,
            logger=_logger,
            **request)
    except Exception as e:
        _logger.exception('Failed to fetch result set.')
        raise_from(OperationalError(*e.args), e)
    else:
        return response
def format(self, operation, parameters=None): if not operation or not operation.strip(): raise ProgrammingError("Query is none or empty.") operation = operation.strip() if operation.upper().startswith("SELECT") or operation.upper().startswith( "WITH" ): escaper = _escape_presto else: escaper = _escape_hive kwargs = dict() if parameters: if isinstance(parameters, dict): for k, v in iteritems(parameters): func = self.get(v) if not func: raise TypeError("{0} is not defined formatter.".format(type(v))) kwargs.update({k: func(self, escaper, v)}) else: raise ProgrammingError( "Unsupported parameter " + "(Support for dict only): {0}".format(parameters) ) return (operation % kwargs).strip() if kwargs else operation.strip()
def fetchmany(
    self, size: Optional[int] = None
) -> List[Union[Tuple[Optional[Any], ...], Dict[Any, Optional[Any]]]]:
    if not self.has_result_set:
        raise ProgrammingError("No result set.")
    result_set = cast(AthenaResultSet, self.result_set)
    return result_set.fetchmany(size)
def fetchall(
    self,
) -> List[Union[Tuple[Optional[Any], ...], Dict[Any, Optional[Any]]]]:
    if not self.has_result_set:
        raise ProgrammingError("No result set.")
    result_set = cast(AthenaPandasResultSet, self.result_set)
    return result_set.fetchall()
def _as_pandas(self):
    import pandas as pd

    if not self.output_location:
        raise ProgrammingError('OutputLocation is none or empty.')
    bucket, key = self._parse_output_location(self.output_location)
    try:
        response = retry_api_call(self._client.get_object,
                                  config=self._retry_config,
                                  logger=_logger,
                                  Bucket=bucket,
                                  Key=key)
    except Exception as e:
        _logger.exception('Failed to download csv.')
        raise_from(OperationalError(*e.args), e)
    else:
        length = response['ContentLength']
        if length:
            df = pd.read_csv(io.BytesIO(response['Body'].read()),
                             dtype=self.dtypes,
                             converters=self.converters,
                             parse_dates=self.parse_dates,
                             infer_datetime_format=True)
            df = self._trunc_date(df)
        else:  # Allow empty response so DDL can be used
            df = pd.DataFrame()
        return df
def wrap_unload(
    operation: str,
    s3_staging_dir: str,
    format_: str = AthenaFileFormat.FILE_FORMAT_PARQUET,
    compression: str = AthenaCompression.COMPRESSION_SNAPPY,
):
    if not operation or not operation.strip():
        raise ProgrammingError("Query is none or empty.")
    operation_upper = operation.strip().upper()
    if operation_upper.startswith("SELECT") or operation_upper.startswith("WITH"):
        now = datetime.utcnow().strftime("%Y%m%d")
        location = f"{s3_staging_dir}unload/{now}/{str(uuid.uuid4())}/"
        operation = textwrap.dedent(
            f"""
            UNLOAD (
            \t{operation.strip()}
            )
            TO '{location}'
            WITH (
            \tformat = '{format_}',
            \tcompression = '{compression}'
            )
            """
        )
    else:
        location = None
    return operation, location
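# Hedged usage sketch of wrap_unload (the staging dir, date, and UUID below
# are illustrative, and the default format/compression constants are assumed
# to render as PARQUET/SNAPPY):
#
#   wrapped, location = wrap_unload(
#       "SELECT * FROM sales", s3_staging_dir="s3://bucket/path/")
#
# location -> s3://bucket/path/unload/20240101/6f9619ff-8b86-d011-b42d-00cf4fc964ff/
# wrapped  -> UNLOAD (
#                 SELECT * FROM sales
#             )
#             TO 's3://bucket/path/unload/20240101/6f9619ff-8b86-d011-b42d-00cf4fc964ff/'
#             WITH (
#                 format = 'PARQUET',
#                 compression = 'SNAPPY'
#             )
#
# A non-SELECT statement is returned unchanged with location=None.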
def arraysize(self, value):
    if value <= 0 or value > self.DEFAULT_FETCH_SIZE:
        raise ProgrammingError(
            "MaxResults must be greater than zero and less than or equal to "
            "the maximum allowed length {0}.".format(self.DEFAULT_FETCH_SIZE)
        )
    self._arraysize = value
def arraysize(self, value: int) -> None:
    if value <= 0 or value > CursorIterator.DEFAULT_FETCH_SIZE:
        raise ProgrammingError(
            "MaxResults must be greater than zero and less than or equal to "
            "the maximum allowed length {0}.".format(CursorIterator.DEFAULT_FETCH_SIZE)
        )
    self._arraysize = value
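# Hedged usage sketch (cursor construction assumed): arraysize caps the
# MaxResults page size sent to GetQueryResults, so the setter rejects values
# outside 1..DEFAULT_FETCH_SIZE before any API call is made.
#
#   cursor = connection.cursor()
#   cursor.arraysize = 500      # accepted
#   cursor.arraysize = 0        # raises ProgrammingError
#   cursor.arraysize = 100_000  # raises ProgrammingError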
def fetchone(self):
    if not self._query_id:
        raise ProgrammingError('QueryExecutionId is none or empty.')
    if not self._result_set and self._next_token:
        self._fetch()
    if not self._result_set:
        return None
    else:
        self._rownumber += 1
        return self._result_set.popleft()
def __fetch(self, next_token=None):
    if not self._query_execution.query_id:
        raise ProgrammingError('QueryExecutionId is none or empty.')
    if self._query_execution.state != AthenaQueryExecution.STATE_SUCCEEDED:
        raise ProgrammingError('QueryExecutionState is not SUCCEEDED.')
    request = {
        'QueryExecutionId': self._query_execution.query_id,
        'MaxResults': self._arraysize,
    }
    if next_token:
        request.update({'NextToken': next_token})
    try:
        response = retry_api_call(self._connection.client.get_query_results,
                                  config=self._retry_config,
                                  logger=_logger,
                                  **request)
    except Exception as e:
        _logger.exception('Failed to fetch result set.')
        raise_from(OperationalError(*e.args), e)
    else:
        return response
def format(self, operation, parameters=None):
    if not operation or not operation.strip():
        raise ProgrammingError('Query is none or empty.')
    operation = operation.strip()
    if operation.upper().startswith('SELECT') or operation.upper().startswith('WITH'):
        escaper = _escape_presto
    else:
        escaper = _escape_hive

    kwargs = dict()
    if parameters:
        if isinstance(parameters, dict):
            for k, v in iteritems(parameters):
                func = self.get_formatter(v)
                kwargs.update({k: func(self, escaper, v)})
        else:
            raise ProgrammingError('Unsupported parameter '
                                   '(Support for dict only): {0}'.format(parameters))
    return (operation % kwargs).strip() if kwargs else operation.strip()
def _read_csv(self) -> "Table": import pyarrow as pa from pyarrow import csv if not self.output_location: raise ProgrammingError("OutputLocation is none or empty.") if not self.output_location.endswith((".csv", ".txt")): return pa.Table.from_pydict(dict()) length = self._get_content_length() if length and self.output_location.endswith(".txt"): description = self.description if self.description else [] column_names = [d[0] for d in description] read_opts = csv.ReadOptions( skip_rows=0, column_names=column_names, block_size=self._block_size, use_threads=True, ) parse_opts = csv.ParseOptions( delimiter="\t", quote_char=False, double_quote=False, escape_char=False, ) elif length and self.output_location.endswith(".csv"): read_opts = csv.ReadOptions(skip_rows=0, block_size=self._block_size, use_threads=True) parse_opts = csv.ParseOptions( delimiter=",", quote_char='"', double_quote=True, escape_char=False, ) else: return pa.Table.from_pydict(dict()) bucket, key = parse_output_location(self.output_location) try: return csv.read_csv( self._fs.open_input_stream(f"{bucket}/{key}"), read_options=read_opts, parse_options=parse_opts, convert_options=csv.ConvertOptions( quoted_strings_can_be_null=False, timestamp_parsers=self.timestamp_parsers, column_types=self.column_types, ), ) except Exception as e: _logger.exception(f"Failed to read {bucket}/{key}.") raise OperationalError(*e.args) from e
def _poll(self):
    if not self._query_id:
        raise ProgrammingError('QueryExecutionId is none or empty.')
    while True:
        try:
            request = {'QueryExecutionId': self._query_id}
            response = retry_api_call(self._connection.get_query_execution,
                                      exceptions=self.retry_exceptions,
                                      attempt=self.retry_attempt,
                                      multiplier=self.retry_multiplier,
                                      max_delay=self.retry_max_delay,
                                      exp_base=self.retry_exponential_base,
                                      logger=_logger,
                                      **request)
        except Exception as e:
            _logger.exception('Failed to poll query result.')
            raise_from(OperationalError(*e.args), e)
        else:
            query_execution = response.get('QueryExecution', None)
            if not query_execution:
                raise DataError('KeyError `QueryExecution`')
            status = query_execution.get('Status', None)
            if not status:
                raise DataError('KeyError `Status`')
            state = status.get('State', None)
            if state == 'SUCCEEDED':
                self._completion_date_time = status.get('CompletionDateTime', None)
                self._submission_date_time = status.get('SubmissionDateTime', None)
                statistics = query_execution.get('Statistics', {})
                self._data_scanned_in_bytes = statistics.get(
                    'DataScannedInBytes', None)
                self._execution_time_in_millis = statistics.get(
                    'EngineExecutionTimeInMillis', None)
                result_conf = query_execution.get('ResultConfiguration', {})
                self._output_location = result_conf.get('OutputLocation', None)
                break
            elif state in ('FAILED', 'CANCELLED'):
                raise OperationalError(status.get('StateChangeReason', None))
            else:
                time.sleep(self._poll_interval)
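# Hedged sketch of the GetQueryExecution response shape the poll loop above
# consumes (field names from the Athena API; values illustrative):
_EXAMPLE_QUERY_EXECUTION_RESPONSE = {
    'QueryExecution': {
        'QueryExecutionId': 'abc123...',
        'Status': {
            'State': 'SUCCEEDED',  # QUEUED | RUNNING | SUCCEEDED | FAILED | CANCELLED
            'SubmissionDateTime': '...',
            'CompletionDateTime': '...',
        },
        'Statistics': {
            'DataScannedInBytes': 1024,
            'EngineExecutionTimeInMillis': 1200,
        },
        'ResultConfiguration': {
            'OutputLocation': 's3://bucket/path/abc123....csv',
        },
    },
}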
def __fetch(self, next_token: Optional[str] = None):
    if not self.query_id:
        raise ProgrammingError("QueryExecutionId is none or empty.")
    if self.state != AthenaQueryExecution.STATE_SUCCEEDED:
        raise ProgrammingError("QueryExecutionState is not SUCCEEDED.")
    if self.is_closed:
        raise ProgrammingError("AthenaResultSet is closed.")
    request = {
        "QueryExecutionId": self.query_id,
        "MaxResults": self._arraysize,
    }
    if next_token:
        request.update({"NextToken": next_token})
    try:
        connection = cast("Connection", self._connection)
        response = retry_api_call(
            connection.client.get_query_results,
            config=self._retry_config,
            logger=_logger,
            **request,
        )
    except Exception as e:
        _logger.exception("Failed to fetch result set.")
        raise OperationalError(*e.args) from e
    else:
        return response
def _as_pandas(self) -> "DataFrame": import pandas as pd if not self.output_location: raise ProgrammingError("OutputLocation is none or empty.") bucket, key = parse_output_location(self.output_location) try: response = retry_api_call( self._client.get_object, config=self._retry_config, logger=_logger, Bucket=bucket, Key=key, ) except Exception as e: _logger.exception("Failed to download csv.") raise OperationalError(*e.args) from e else: length = response["ContentLength"] if length: if self.output_location.endswith(".txt"): sep = "\t" header = None description = self.description if self.description else [] names: Optional[Any] = [d[0] for d in description] else: # csv format sep = "," header = 0 names = None df = pd.read_csv( response["Body"], sep=sep, header=header, names=names, dtype=self.dtypes, converters=self.converters, parse_dates=self.parse_dates, infer_datetime_format=True, skip_blank_lines=False, keep_default_na=self._keep_default_na, na_values=self._na_values, quoting=self._quoting, **self._kwargs, ) df = self._trunc_date(df) else: # Allow empty response df = pd.DataFrame() return df
def cancel(self):
    if not self._query_id:
        raise ProgrammingError('QueryExecutionId is none or empty.')
    try:
        request = {'QueryExecutionId': self._query_id}
        retry_api_call(self._connection.stop_query_execution,
                       exceptions=self.retry_exceptions,
                       attempt=self.retry_attempt,
                       multiplier=self.retry_multiplier,
                       max_delay=self.retry_max_delay,
                       exp_base=self.retry_exponential_base,
                       logger=_logger,
                       **request)
    except Exception as e:
        _logger.exception('Failed to cancel query.')
        raise_from(OperationalError(*e.args), e)
def _read_csv(self) -> "DataFrame": import pandas as pd if not self.output_location: raise ProgrammingError("OutputLocation is none or empty.") if not self.output_location.endswith((".csv", ".txt")): return pd.DataFrame() length = self._get_content_length() if length and self.output_location.endswith(".txt"): sep = "\t" header = None description = self.description if self.description else [] names = [d[0] for d in description] elif length and self.output_location.endswith(".csv"): sep = "," header = 0 names = None else: return pd.DataFrame() try: # TODO chunksize df = pd.read_csv( self.output_location, sep=sep, header=header, names=names, dtype=self.dtypes, converters=self.converters, parse_dates=self.parse_dates, infer_datetime_format=True, skip_blank_lines=False, keep_default_na=self._keep_default_na, na_values=self._na_values, quoting=self._quoting, storage_options={ "profile": self.connection.profile_name, "client_kwargs": { "region_name": self.connection.region_name, **self.connection._client_kwargs, }, }, **self._kwargs, ) return self._trunc_date(df) except Exception as e: _logger.exception(f"Failed to read {self.output_location}.") raise OperationalError(*e.args) from e
def _as_pandas(self):
    import pandas as pd

    if not self.output_location:
        raise ProgrammingError('OutputLocation is none or empty.')
    bucket, key = self._parse_output_location(self.output_location)
    try:
        response = retry_api_call(self._client.get_object,
                                  Bucket=bucket,
                                  Key=key)
    except Exception as e:
        _logger.exception('Failed to download csv.')
        raise_from(OperationalError(*e.args), e)
    else:
        df = pd.read_csv(io.BytesIO(response['Body'].read()),
                         dtype=self._dtypes(),
                         converters=self._converters(),
                         parse_dates=self._parse_dates(),
                         infer_datetime_format=True)
        df = self._trunc_date(df)
        return df
def _read_parquet(self, engine) -> "DataFrame":
    import pandas as pd

    self._data_manifest = self._read_data_manifest()
    if not self._data_manifest:
        return pd.DataFrame()
    if not self._unload_location:
        self._unload_location = (
            "/".join(self._data_manifest[0].split("/")[:-1]) + "/"
        )
    if engine == "pyarrow":
        unload_location = self._unload_location
        kwargs = {
            "use_threads": True,
            "use_legacy_dataset": False,
        }
    elif engine == "fastparquet":
        unload_location = f"{self._unload_location}*"
        kwargs = {}
    else:
        raise ProgrammingError("Engine must be one of `pyarrow`, `fastparquet`.")
    kwargs.update(self._kwargs)
    try:
        return pd.read_parquet(
            unload_location,
            engine=engine,
            storage_options={
                "profile": self.connection.profile_name,
                "client_kwargs": {
                    "region_name": self.connection.region_name,
                    **self.connection._client_kwargs,
                },
            },
            use_nullable_dtypes=False,
            **kwargs,
        )
    except Exception as e:
        _logger.exception(f"Failed to read {self._unload_location}.")
        raise OperationalError(*e.args) from e
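# Hedged sketch of the two read paths above: pyarrow's dataset reader accepts
# the unload directory itself, while fastparquet is pointed at a glob over
# the part files (paths illustrative):
#
#   pd.read_parquet("s3://bucket/unload/20240101/uuid/",
#                   engine="pyarrow", use_threads=True,
#                   use_legacy_dataset=False)
#   pd.read_parquet("s3://bucket/unload/20240101/uuid/*",
#                   engine="fastparquet")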
def _pre_fetch(self):
    if not self._query_id:
        raise ProgrammingError('QueryExecutionId is none or empty.')
    try:
        request = {
            'QueryExecutionId': self._query_id,
            'MaxResults': self._arraysize,
        }
        response = retry_api_call(self._connection.get_query_results,
                                  exceptions=self.retry_exceptions,
                                  attempt=self.retry_attempt,
                                  multiplier=self.retry_multiplier,
                                  max_delay=self.retry_max_delay,
                                  exp_base=self.retry_exponential_base,
                                  logger=_logger,
                                  **request)
    except Exception as e:
        _logger.exception('Failed to fetch result set.')
        raise_from(OperationalError(*e.args), e)
    else:
        self._process_meta_data(response)
        self._process_result_set(response)
def _as_pandas(self):
    import pandas as pd

    if not self.output_location:
        raise ProgrammingError('OutputLocation is none or empty.')
    bucket, key = parse_output_location(self.output_location)
    try:
        response = retry_api_call(self._client.get_object,
                                  config=self._retry_config,
                                  logger=_logger,
                                  Bucket=bucket,
                                  Key=key)
    except Exception as e:
        _logger.exception('Failed to download csv.')
        raise_from(OperationalError(*e.args), e)
    else:
        length = response['ContentLength']
        if length:
            if self.output_location.endswith('.txt'):
                sep = '\t'
                header = None
                names = [d[0] for d in self.description]
            else:  # csv format
                sep = ','
                header = 0
                names = None
            df = pd.read_csv(io.BytesIO(response['Body'].read()),
                             sep=sep,
                             header=header,
                             names=names,
                             dtype=self.dtypes,
                             converters=self.converters,
                             parse_dates=self.parse_dates,
                             infer_datetime_format=True,
                             skip_blank_lines=False)
            df = self._trunc_date(df)
        else:  # Allow empty response
            df = pd.DataFrame()
        return df
def _process_result_set(self, response):
    if self._meta_data is None:
        raise ProgrammingError('ResultSetMetadata is none.')
    result_set = response.get('ResultSet', None)
    if not result_set:
        raise DataError('KeyError `ResultSet`')
    rows = result_set.get('Rows', None)
    if rows is None:
        raise DataError('KeyError `Rows`')
    processed_rows = []
    if len(rows) > 0:
        offset = (
            1 if not self._next_token and self._is_first_row_column_labels(rows)
            else 0
        )
        processed_rows = [
            tuple([
                self._converter.convert(meta.get('Type', None),
                                        row.get('VarCharValue', None))
                for meta, row in zip(self._meta_data, rows[i].get('Data', []))
            ])
            for i in xrange(offset, len(rows))
        ]
    self._result_set.extend(processed_rows)
    self._next_token = response.get('NextToken', None)
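# Hedged sketch of the GetQueryResults payload consumed above (field names
# from the Athena API; values illustrative). The first row of the first page
# repeats the column labels, which is why an offset of 1 is applied when
# there is no NextToken yet:
_EXAMPLE_QUERY_RESULTS_RESPONSE = {
    'ResultSet': {
        'Rows': [
            {'Data': [{'VarCharValue': 'id'}, {'VarCharValue': 'name'}]},
            {'Data': [{'VarCharValue': '1'}, {'VarCharValue': 'foo'}]},
        ],
        'ResultSetMetadata': {
            'ColumnInfo': [
                {'Name': 'id', 'Type': 'integer'},
                {'Name': 'name', 'Type': 'varchar'},
            ],
        },
    },
    'NextToken': '...',
}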
def cancel(self):
    if not self._query_id:
        raise ProgrammingError("QueryExecutionId is none or empty.")
    self._cancel(self._query_id)

def as_pandas(self):
    if not self.has_result_set:
        raise ProgrammingError("No result set.")
    return self._result_set.as_pandas()

def fetchall(self):
    if not self.has_result_set:
        raise ProgrammingError("No result set.")
    return self._result_set.fetchall()

def fetchmany(self, size=None):
    if not self.has_result_set:
        raise ProgrammingError("No result set.")
    return self._result_set.fetchmany(size)

def fetchone(self):
    if not self.has_result_set:
        raise ProgrammingError('No result set.')
    return self._result_set.fetchone()