def __execute_query(self, database, query, s3_output_url, return_results=True, save_results=True):
    """Execute `query` on `database` through Athena, staging output under `s3_output_url`.

    Parameters:
        database (str): Athena database to run the query against.
        query (str): SQL or descriptive statement (e.g. DESCRIBE/SHOW) text.
        s3_output_url (str): s3://bucket/prefix location for Athena result files.
        return_results (bool): when True, fetch the result set and return it.
        save_results (bool): when False, delete the result objects from s3 after reading.

    Returns:
        (result, query_execution_id) when return_results is True — `result` is a
        pandas DataFrame for .csv output, or a decoded str for the .txt output
        that descriptive queries produce — otherwise just query_execution_id.

    Raises:
        Exceptions.QueryNotSupported: neither a .csv nor .txt result object exists.
        Exceptions.QueryExecutionFailedException: the query reached FAILED state.
        Exceptions.QueryUnknownStatusException: the query is in any other state.
    """
    s3_bucket, s3_path = self.__parse_s3_path(s3_output_url)
    # FIX: __init__ stores the Athena client on self._athena (single underscore);
    # the original referenced self.__athena, which is never assigned and raises
    # AttributeError under class-private name mangling.
    response = self._athena.start_query_execution(
        QueryString=query,
        QueryExecutionContext={'Database': database},
        ResultConfiguration={
            'OutputLocation': 's3://' + s3_bucket + "/" + s3_path,
        })
    query_execution_id = response['QueryExecutionId']
    status = self.__poll_status(query_execution_id)
    if status == 'SUCCEEDED':
        s3_key = s3_path + "/" + query_execution_id + '.csv'
        if return_results:
            # Dirty patch to support descriptive queries such as "describe table"
            # and "show partitions": those write a .txt object instead of .csv.
            try:
                obj = self.__s3.get_object(Bucket=s3_bucket, Key=s3_key)
                df = pd.read_csv(io.BytesIO(obj['Body'].read()))
            except ClientError as csv_err:  # renamed to avoid shadowing in the nested handler
                if csv_err.response['Error']['Code'] != 'NoSuchKey':
                    raise
                try:
                    s3_key = s3_path + "/" + query_execution_id + '.txt'
                    obj = self.__s3.get_object(Bucket=s3_bucket, Key=s3_key)
                    df = obj['Body'].read().decode('utf-8')
                except ClientError as txt_err:
                    if txt_err.response['Error']['Code'] == 'NoSuchKey':
                        raise Exceptions.QueryNotSupported(
                            "The specified query is not supported by this package."
                        )
                    raise
            # Remove result file from s3 (s3_key now points at whichever object was read).
            if not save_results:
                self.__s3.delete_object(Bucket=s3_bucket, Key=s3_key)
                self.__s3.delete_object(Bucket=s3_bucket, Key=s3_key + '.metadata')
            return df, query_execution_id
        else:
            return query_execution_id
    elif status == "FAILED":
        raise Exceptions.QueryExecutionFailedException(
            "Query Failed. Check athena logs for more info.")
    else:
        raise Exceptions.QueryUnknownStatusException(
            "Query is in an unknown status. Check athena logs for more info."
        )
def get_result(self, query_execution_id, save_results=False):
    '''
    Fetch the output of a previously started execution as a pandas DataFrame.

    The result objects are deleted from s3 afterwards unless save_results is
    True. Raises a status-specific exception when the query did not succeed.
    '''
    # One lookup gives us both the state and where Athena wrote the output.
    res = self._athena.get_query_execution(QueryExecutionId=query_execution_id)
    state = res['QueryExecution']['Status']['State']
    bucket, key = self.__parse_s3_path(res['QueryExecution']['ResultConfiguration']['OutputLocation'])
    if state == 'SUCCEEDED':
        raw = self.__s3.get_object(Bucket=bucket, Key=key)['Body'].read()
        frame = pd.read_csv(io.BytesIO(raw))
        if not save_results:
            # Drop both the result object and its companion metadata file.
            self.__s3.delete_object(Bucket=bucket, Key=key)
            self.__s3.delete_object(Bucket=bucket, Key=key + '.metadata')
        return frame
    if state == 'FAILED':
        raise Exceptions.QueryExecutionFailedException("Query failed with response: %s" % (self.get_query_error(query_execution_id)))
    if state == 'RUNNING':
        raise Exceptions.QueryStillRunningException("Query has not finished executing.")
    raise Exceptions.QueryUnknownStatusException("Query is in an unknown status. Check athena logs for more info.")
def __init__(self, database, region='us-east-1', session=None):
    """
    Bind boto3 clients (athena, s3, glue) for `region` and verify that
    `database` is visible via Utils.get_databases.

    Raises Exceptions.NoRegionFoundError when no region can be resolved, and
    Exceptions.DatabaseNotFound when the database does not exist.
    """
    self.__database = database
    # Fall back to a fresh boto3 session when the caller supplies none.
    if session is not None:
        self._session = session
    else:
        self._session = boto3.session.Session()
    # An explicit region wins; otherwise use the session's configured default.
    if region is None:
        region = self._session.region_name
    if region is None:
        raise Exceptions.NoRegionFoundError("No default aws region configuration found. Must specify a region.")
    self.__region = region
    self._athena = self._session.client('athena', region_name=region)
    self.__s3 = self._session.client('s3', region_name=region)
    self.__glue = self._session.client('glue', region_name=region)
    # Fail fast on a missing / misspelled database name.
    if database not in Utils.get_databases(region=region, session=self._session):
        raise Exceptions.DatabaseNotFound("Database " + database + " not found.")
def __parse_s3_path(self, s3_path):
    """
    Split an s3://bucket/key url into (bucket, key).

    Raises Exceptions.InvalidS3PathException when the url does not match
    the pattern in self.__s3_path_regex.
    """
    # re.match(pattern, s) is equivalent to re.compile(pattern).match(s).
    if re.match(self.__s3_path_regex, s3_path) is None:
        raise Exceptions.InvalidS3PathException("s3 Path must follow format: " + self.__s3_path_regex)
    parsed = urlparse(s3_path)
    return parsed.netloc, parsed.path.lstrip('/')
def __poll_status(self, query_execution_id):
    # Resolve the state of a query execution, returning only terminal states.
    #
    # NOTE(review): despite the name, this performs no loop — it checks the
    # status once; presumably self.get_query_status does any waiting — TODO
    # confirm. Any non-terminal state is surfaced as a timeout.
    #
    # NOTE(review): a second __poll_status definition appears later in this
    # file; if both live in the same class, the later one shadows this one —
    # confirm which is intended.
    status = self.get_query_status(query_execution_id)
    if status in ['SUCCEEDED', 'FAILED']:
        return status
    else:
        raise Exceptions.QueryExecutionTimeoutException(
            "Query to athena has timed out. Try running the query in the athena or asynchronously"
        )
def __poll_status(self, query_execution_id):
    """
    Return the terminal state ('SUCCEEDED' or 'FAILED') of a query execution.

    Raises:
        Exceptions.QueryExecutionTimeoutException: the query is in any
        non-terminal state (e.g. still RUNNING/QUEUED).
    """
    # FIX: __init__ stores the Athena client on self._athena; the original
    # referenced self.__athena, which is never assigned and raises
    # AttributeError under class-private name mangling.
    res = self._athena.get_query_execution(
        QueryExecutionId=query_execution_id)
    status = res['QueryExecution']['Status']['State']
    # Collapse the two identical terminal-state branches into one membership test.
    if status in ('SUCCEEDED', 'FAILED'):
        return status
    # FIX: error-message typo ("Try running in query in" -> "Try running the query in").
    raise Exceptions.QueryExecutionTimeoutException(
        "Query to athena has timed out. Try running the query in the athena"
    )