Exemplo n.º 1
0
    def __execute_query(self,
                        database,
                        query,
                        s3_output_url,
                        return_results=True,
                        save_results=True):
        """Run a query on Athena and optionally fetch its result from S3.

        Args:
            database: Name of the database passed as the query execution context.
            query: Query string submitted to Athena.
            s3_output_url: ``s3://bucket/prefix`` location for the query output.
                A trailing slash or an empty prefix is accepted.
            return_results: When true, the result object is downloaded and
                returned together with the execution id.
            save_results: When false (and ``return_results`` is true), the
                result object and its ``.metadata`` companion are deleted from
                S3 after being read.

        Returns:
            ``(result, query_execution_id)`` when ``return_results`` is true,
            where ``result`` is a pandas DataFrame for ``.csv`` output or the
            decoded text for ``.txt`` output; otherwise just
            ``query_execution_id``.

        Raises:
            Exceptions.QueryExecutionFailedException: The query status is FAILED.
            Exceptions.QueryUnknownStatusException: The query status is neither
                SUCCEEDED nor FAILED.
            Exceptions.QueryNotSupported: Neither a ``.csv`` nor a ``.txt``
                result object exists for the query.
            ClientError: Any other S3 error while fetching the result.
        """
        s3_bucket, s3_path = self.__parse_s3_path(s3_output_url)

        response = self.__athena.start_query_execution(
            QueryString=query,
            QueryExecutionContext={'Database': database},
            ResultConfiguration={
                'OutputLocation': 's3://' + s3_bucket + "/" + s3_path,
            })

        query_execution_id = response['QueryExecutionId']
        status = self.__poll_status(query_execution_id)

        if status == "FAILED":
            raise Exceptions.QueryExecutionFailedException(
                "Query Failed. Check athena logs for more info.")
        if status != 'SUCCEEDED':
            raise Exceptions.QueryUnknownStatusException(
                "Query is in an unknown status. Check athena logs for more info."
            )

        if not return_results:
            return query_execution_id

        # Normalise the prefix so that "prefix/" or an empty prefix does not
        # produce keys such as "prefix//<id>.csv" or "/<id>.csv", which would
        # never match the object written under the output location.
        key_prefix = s3_path.rstrip('/')
        if key_prefix:
            base_key = key_prefix + "/" + query_execution_id
        else:
            base_key = query_execution_id

        s3_key = base_key + '.csv'
        try:
            obj = self.__s3.get_object(Bucket=s3_bucket, Key=s3_key)
        except ClientError as csv_error:
            if csv_error.response['Error']['Code'] != 'NoSuchKey':
                raise
            # Dirty patch to support descriptive queries such as describe
            # table and show partition: their output is looked up as a .txt
            # object when no .csv object exists.
            s3_key = base_key + '.txt'
            try:
                obj = self.__s3.get_object(Bucket=s3_bucket, Key=s3_key)
            except ClientError as txt_error:
                if txt_error.response['Error']['Code'] != 'NoSuchKey':
                    raise
                raise Exceptions.QueryNotSupported(
                    "The specified query is not supported by this package."
                )
            df = obj['Body'].read().decode('utf-8')
        else:
            df = pd.read_csv(io.BytesIO(obj['Body'].read()))

        # Remove result file from s3
        if not save_results:
            self.__s3.delete_object(Bucket=s3_bucket, Key=s3_key)
            self.__s3.delete_object(Bucket=s3_bucket,
                                    Key=s3_key + '.metadata')

        return df, query_execution_id
Exemplo n.º 2
0
    def get_result(self, query_execution_id, save_results=False):
        '''
        Given an execution id, return the query result as a pandas DataFrame.

        The result object is read from the output location reported by Athena
        for this execution. Unless save_results is true, the result object and
        its '.metadata' companion are deleted from S3 after being read.

        Raises:
            Exceptions.QueryExecutionFailedException: the query state is FAILED.
            Exceptions.QueryStillRunningException: the query state is QUEUED or
                RUNNING, i.e. it has not finished yet.
            Exceptions.QueryUnknownStatusException: any other non-SUCCEEDED state.
        '''
        res = self._athena.get_query_execution(QueryExecutionId=query_execution_id)
        execution = res['QueryExecution']
        state = execution['Status']['State']

        # Report the query state first, so that a missing or unparsable output
        # location on an unfinished or failed query cannot hide the real status.
        if state == 'FAILED':
            raise Exceptions.QueryExecutionFailedException("Query failed with response: %s" % (self.get_query_error(query_execution_id)))
        # QUEUED is a non-terminal state just like RUNNING: the query has been
        # submitted but has not produced a result yet.
        if state in ('QUEUED', 'RUNNING'):
            raise Exceptions.QueryStillRunningException("Query has not finished executing.")
        if state != 'SUCCEEDED':
            raise Exceptions.QueryUnknownStatusException("Query is in an unknown status. Check athena logs for more info.")

        # Split the reported output location into bucket and object key.
        s3_bucket, s3_key = self.__parse_s3_path(execution['ResultConfiguration']['OutputLocation'])

        obj = self.__s3.get_object(Bucket=s3_bucket, Key=s3_key)
        df = pd.read_csv(io.BytesIO(obj['Body'].read()))

        # Remove results from s3
        if not save_results:
            self.__s3.delete_object(Bucket=s3_bucket, Key=s3_key)
            self.__s3.delete_object(Bucket=s3_bucket, Key=s3_key + '.metadata')

        return df
Exemplo n.º 3
0
 def __init__(self, database, region='us-east-1', session=None):
     """Set up Athena, S3 and Glue clients bound to one database.

     Args:
         database: Name of the database this instance works against. It must
             be among the names returned by ``Utils.get_databases``.
         region: AWS region used for every client. Passing ``None`` falls
             back to the region configured on the session.
         session: Existing boto3 session to reuse; a new default session is
             created when omitted.

     Raises:
         Exceptions.NoRegionFoundError: ``region`` is ``None`` and the session
             has no region configured.
         Exceptions.DatabaseNotFound: ``database`` is not among the available
             databases.
     """
     self.__database = database

     if session is None:
         session = boto3.session.Session()
     self._session = session

     # An explicit region wins; otherwise use whatever the session carries.
     resolved_region = region if region is not None else session.region_name
     if resolved_region is None:
         raise Exceptions.NoRegionFoundError("No default aws region configuration found. Must specify a region.")
     self.__region = resolved_region

     self._athena = session.client('athena', region_name=resolved_region)
     self.__s3 = session.client('s3', region_name=resolved_region)
     self.__glue = session.client('glue', region_name=resolved_region)

     available = Utils.get_databases(region=resolved_region, session=session)
     if database not in available:
         raise Exceptions.DatabaseNotFound("Database " + database + " not found.")
Exemplo n.º 4
0
 def __parse_s3_path(self, s3_path):
     """Split an S3 URL into its bucket and key components.

     Args:
         s3_path: URL that must match the instance's S3 path pattern.

     Returns:
         ``(bucket, path)`` where ``bucket`` is the URL's network location and
         ``path`` is the URL path with leading slashes removed.

     Raises:
         Exceptions.InvalidS3PathException: ``s3_path`` does not match the
             expected pattern.
     """
     pattern = self.__s3_path_regex
     if re.match(pattern, s3_path) is None:
         raise Exceptions.InvalidS3PathException("s3 Path must follow format: " + pattern)

     parsed = urlparse(s3_path)
     return parsed.netloc, parsed.path.lstrip('/')
Exemplo n.º 5
0
 def __poll_status(self, query_execution_id):
     """Return the query status if it is SUCCEEDED or FAILED.

     The status is read once through ``get_query_status``.
     NOTE(review): no retry loop is visible in this method; repeated polling
     is presumably driven by a decorator or by the caller -- confirm.

     Raises:
         Exceptions.QueryExecutionTimeoutException: The status is anything
             other than SUCCEEDED or FAILED.
     """
     state = self.get_query_status(query_execution_id)
     if state not in ('SUCCEEDED', 'FAILED'):
         raise Exceptions.QueryExecutionTimeoutException(
             "Query to athena has timed out. Try running the query in the athena or asynchronously"
         )
     return state
Exemplo n.º 6
0
    def __poll_status(self, query_execution_id):
        """Return the execution state if it is SUCCEEDED or FAILED.

        The state is read once from Athena's ``get_query_execution`` response.
        NOTE(review): no retry loop is visible in this method; repeated
        polling is presumably driven by a decorator or by the caller -- confirm.

        Raises:
            Exceptions.QueryExecutionTimeoutException: The state is anything
                other than SUCCEEDED or FAILED.
        """
        execution = self.__athena.get_query_execution(
            QueryExecutionId=query_execution_id)
        state = execution['QueryExecution']['Status']['State']

        if state not in ('SUCCEEDED', 'FAILED'):
            raise Exceptions.QueryExecutionTimeoutException(
                "Query to athena has timed out. Try running in query in the athena"
            )
        return state