Example no. 1
0
    def read_sql_athena(self,
                        sql,
                        database,
                        s3_output=None,
                        max_result_size=None):
        """
        Run a SQL query on AWS Athena and load the result as a Dataframe.

        When ``max_result_size`` is given, an iterator of Dataframes is
        returned instead of a single Dataframe.

        :param sql: SQL Query
        :param database: Glue/Athena Database
        :param s3_output: AWS S3 path
        :param max_result_size: Max number of bytes on each request to S3
        :return: Pandas Dataframe or Iterator of Pandas Dataframes if max_result_size != None
        """
        if not s3_output:
            s3_output = self._session.athena.create_athena_bucket()
        query_execution_id = self._session.athena.run_query(query=sql,
                                                            database=database,
                                                            s3_output=s3_output)
        response = self._session.athena.wait_query(
            query_execution_id=query_execution_id)
        status = response["QueryExecution"]["Status"]
        if status["State"] in ("FAILED", "CANCELLED"):
            # Surface Athena's own failure reason to the caller.
            reason = status["StateChangeReason"]
            raise AthenaQueryError(f"Query error: {reason}")
        dtype, parse_timestamps, parse_dates, converters = self._session.athena.get_query_dtype(
            query_execution_id=query_execution_id)
        result = self.read_csv(path=f"{s3_output}{query_execution_id}.csv",
                               dtype=dtype,
                               parse_dates=parse_timestamps,
                               converters=converters,
                               quoting=csv.QUOTE_ALL,
                               max_result_size=max_result_size)
        if max_result_size is not None:
            # Iterator case: date conversion is applied lazily per chunk.
            return Pandas._apply_dates_to_generator(generator=result,
                                                    parse_dates=parse_dates)
        if len(result.index) > 0:
            # Columns listed in parse_dates are narrowed from timestamps
            # to plain date objects.
            for col in parse_dates:
                result[col] = result[col].dt.date
        return result
Example no. 2
0
 def read_sql_athena(self, sql, database, s3_output=None):
     """
     Run a SQL query on AWS Athena and return the result as a Dataframe.

     :param sql: SQL query
     :param database: Glue/Athena database name
     :param s3_output: AWS S3 path for the query results; when omitted, a
                       default path is derived from the caller's account id
                       and the session region
     :return: Pandas Dataframe with the query result
     :raises AthenaQueryError: if the query ends in FAILED or CANCELLED state
     """
     if not s3_output:
         account_id = (self._session.boto3_session.client(
             service_name="sts", config=self._session.botocore_config).
                       get_caller_identity().get("Account"))
         session_region = self._session.boto3_session.region_name
         bucket_name = f"aws-athena-query-results-{account_id}-{session_region}"
         s3_output = f"s3://{bucket_name}/"
         s3_resource = self._session.boto3_session.resource("s3")
         # Fix: boto3's Bucket() takes the bare bucket name, not the
         # "s3://..." URI that was previously passed here.
         # NOTE(review): Bucket() only builds a local resource handle and
         # does not create the bucket on S3 — confirm whether a .create()
         # (or a session helper) is expected at this point.
         s3_resource.Bucket(bucket_name)
     query_execution_id = self._session.athena.run_query(
         query=sql, database=database, s3_output=s3_output)
     query_response = self._session.athena.wait_query(
         query_execution_id=query_execution_id)
     state = query_response.get("QueryExecution").get("Status").get("State")
     if state in ["FAILED", "CANCELLED"]:
         reason = (query_response.get("QueryExecution").get("Status").get(
             "StateChangeReason"))
         message_error = f"Query error: {reason}"
         raise AthenaQueryError(message_error)
     # Athena writes the result set as <output-path>/<execution-id>.csv.
     path = f"{s3_output}{query_execution_id}.csv"
     return self.read_csv(path=path)