def scan_athena(query, env):
    """Run *query* on the Athena schema for the given environment.

    Args:
        query: SQL string to execute.
        env: an ``Env`` member value (the original compared against ``.value``).

    Returns:
        pandas DataFrame with the query results, or ``None`` when *env*
        matches no known environment (preserves the original fall-through).
    """
    # Per-environment connection settings:
    # (profile_name, s3_staging_dir, region_name, schema_name).
    # NOTE(review): SHEBA_PREP duplicates KCL_PREP entirely, including the
    # 'kclprep' schema — looks like a copy-paste; confirm the intended
    # Sheba prep schema before relying on it.
    settings = {
        Env.KCL_PREP.value: (
            'health-customers',
            's3://aws-athena-query-results-eu-west-1-036573440528/',
            'eu-west-1', 'kclprep'),
        Env.KCL_TEST.value: (
            'health',
            's3://aws-athena-query-results-458907533143-us-west-2/',
            'us-west-2', 'test'),
        Env.SHEBA_TEST.value: (
            'health',
            's3://aws-athena-query-results-458907533143-us-west-2/',
            'us-west-2', 'shebatest'),
        Env.SHEBA_PREP.value: (
            'health-customers',
            's3://aws-athena-query-results-eu-west-1-036573440528/',
            'eu-west-1', 'kclprep'),
    }
    if env in settings:
        profile, staging_dir, region, schema = settings[env]
        conn = connect(profile_name=profile,
                       s3_staging_dir=staging_dir,
                       region_name=region,
                       schema_name=schema)
        return pd.read_sql(query, conn)
def query_from_athena_using_jdbc():
    """Run a sample views-per-process-time aggregation and print the result."""
    staging_dir = ('s3://krux-attribute-metrics-staging/'
                   'stream-processor/pixel_attributes_stream/')
    jar_path = ('/home/vagrant/.virtualenvs/pycharm/lib/python2.7/'
                'site-packages/pyathenajdbc/AthenaJDBC41-1.1.0.jar')
    conn = connect(s3_staging_dir=staging_dir,
                   region_name='us-east-1',
                   driver_path=jar_path)
    try:
        with conn.cursor() as cursor:
            cursor.execute("""
                select process_time
                      ,sum(views)
                from spectrum_db.pixel_attributes_stream
                where organization = 'c819846f-b6e4-4ef4-b357-28399862b473'
                  and process_time > '2017-10-30_18:55'
                group by process_time
                order by 1 desc
                limit 10
            """)
            print(cursor.description)
            print(cursor.fetchall())
    finally:
        conn.close()
def _execute_query(self, params, user, chart_query, db):
    """Render *chart_query* with jinjasql, execute it, and return a Table.

    Args:
        params: template parameters, exposed to the query as ``params``.
        user: current user, exposed to the query as ``user``.
        chart_query: jinjasql query template.
        db: key into Django ``connections``; a connection whose NAME is
            'Athena' is swapped for a pyathenajdbc JDBC connection.

    Returns:
        ``Table(columns=..., data=...)`` with one list per result row.
    """
    query, bind_params = jinjasql.prepare_query(chart_query, {
        "params": params,
        "user": user
    })
    conn = connections[str(db)]
    if conn.settings_dict['NAME'] == 'Athena':
        # Athena is reached over JDBC rather than through a Django backend.
        conn = connect(driver_path=os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'athena-jdbc/AthenaJDBC41-1.0.0.jar'))
    with conn.cursor() as cursor:
        cursor.execute(query, bind_params)
        cols = [desc[0] for desc in cursor.description]
        rows = []
        for db_row in cursor:
            # Encode str cells to UTF-8 (the original's comment: handles
            # non-english alphabets). The pointless `else: value = value`
            # branch from the original is dropped.
            # NOTE(review): on Python 3 ``.encode`` yields bytes — confirm
            # downstream consumers of Table expect that; behavior is
            # unchanged from the original.
            rows.append([
                cell.encode('utf-8') if isinstance(cell, str) else cell
                for cell in db_row
            ])
    return Table(columns=cols, data=rows)
def run_query(self, query, user):
    """Execute *query* against Athena.

    Args:
        query: SQL string to run.
        user: requesting user (unused here; kept for the runner interface).

    Returns:
        Tuple ``(json_data, error)`` — a JSON string of columns/rows on
        success (error is None), or ``(None, error_message)`` on failure.
    """
    jvm_options = self.configuration.get('jvm_options', None)
    conn = connect(
        s3_staging_dir=self.configuration['s3_staging_dir'],
        region_name=self.configuration['region'],
        access_key=self.configuration.get('aws_access_key', None),
        secret_key=self.configuration.get('aws_secret_key', None),
        schema_name=self.configuration.get('schema', 'default'),
        jvm_path=self.configuration.get('jvm_path', None),
        # BUG FIX: the original subscripted the bound method,
        # `self.configuration.get['jvm_options']`, which raises TypeError
        # whenever jvm_options is configured. Call it instead.
        jvm_options=jvm_options.split(',') if jvm_options else None)
    try:
        with conn.cursor() as cursor:
            cursor.execute(query)
            column_tuples = [(i[0], _ATHENA_TYPES_MAPPING.get(i[1], None))
                             for i in cursor.description]
            columns = self.fetch_columns(column_tuples)
            rows = [dict(zip([c['name'] for c in columns], r))
                    for r in cursor.fetchall()]
            data = {'columns': columns, 'rows': rows}
            json_data = json.dumps(data, cls=JSONEncoder)
            error = None
    except Exception as e:
        json_data = None
        # BUG FIX: Exception.message does not exist on Python 3.
        error = str(e)
    finally:
        if conn:
            conn.close()
    return json_data, error
def refresh_paritions(self, temp_athena_staging_dir=None, database_name=None,
                      region_name='eu-west-1'):
    """Refresh the partitions in a table, if they exist.

    Args:
        temp_athena_staging_dir: s3 directory for Athena to cache query
            results; falls back to the registered database's temp folder.
        database_name: database containing the table; falls back to the
            registered database's name.
        region_name: AWS region for the connection (default 'eu-west-1',
            previously hard-coded).

    Raises:
        ValueError: no staging dir supplied and no database registered.
        KeyError: no database name supplied and no database registered.
    """
    if not self.partitions:
        return
    if not temp_athena_staging_dir:
        if self.database:
            temp_athena_staging_dir = self.database.s3_athena_temp_folder
        else:
            raise ValueError(
                "You must provide a path to a directory in s3 for Athena to cache query results"
            )
    conn = connect(s3_staging_dir=temp_athena_staging_dir,
                   region_name=region_name)
    if not database_name:
        if self.database:
            database_name = self.database.name
        else:
            raise KeyError(
                "You must provide a database name, or register a database object against the table"
            )
    # MSCK REPAIR scans s3 and registers any new partitions on the table.
    sql = "MSCK REPAIR TABLE {}.{}".format(database_name, self.name)
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql)
    finally:
        conn.close()
def execute_query(self, query, data=None):
    """Run a SELECT statement and load its result CSV from s3.

    Args:
        query: the SELECT statement to be executed.
        data: optional parameters for a parametrized query.

    Returns:
        The query result as a pandas DataFrame.
    """
    # The original wrapped everything in `except Exception as ex: raise ex`,
    # which only obscured tracebacks; the bare code raises identically.
    with contextlib.closing(connect()) as conn:
        with conn.cursor() as cursor:
            cursor.execute(query, data)
            result_file = cursor.query_id + '.csv'
            self.__query_id = cursor.query_id
    bucket_name = os.environ["AWS_ATHENA_S3_STAGING_DIR"]
    s3_client = boto3.client('s3')
    # Remove the 's3://' part from the bucket name.
    # NOTE(review): this assumes the env var is exactly 's3://<bucket>'
    # with no trailing path — a staging dir with a key prefix would break
    # the get_object call; confirm the deployment convention.
    obj = s3_client.get_object(Bucket=bucket_name[5:], Key=result_file)
    df = pd.read_csv(io.BytesIO(obj['Body'].read()), encoding='utf8')
    return df
def _new_connection(self):
    """Open a fresh Athena connection according to ``self.mode``.

    'jdbc' uses pyathenajdbc; 'rest' and 'csv' use pyathena, with 'csv'
    swapping in the PandasCursor.
    """
    cfg = self.config
    if self.mode == 'jdbc':
        kwargs = {
            'Workgroup': cfg.workgroup,
            'AwsRegion': cfg.region,
            'S3OutputLocation': cfg.output_location,
        }
        if cfg.default_database is not None:
            kwargs['Schema'] = cfg.default_database
        return pyathenajdbc.connect(**kwargs)
    # rest, csv
    assert self.mode in ['rest', 'csv']
    kwargs = {
        'work_group': cfg.workgroup,
        'region_name': cfg.region,
        'output_location': cfg.output_location,
    }
    if cfg.default_database is not None:
        kwargs['schema_name'] = cfg.default_database
    if self.mode == 'csv':
        kwargs['cursor_class'] = PandasCursor
    return pyathena.connect(**kwargs)
def get_conn(self):
    """Returns a connection object"""
    db = self.get_connection(self.athena_conn_id)
    extra = db.extra_dejson
    return pyathenajdbc.connect(
        s3_staging_dir=extra.get('s3_staging_dir', None),
        region_name=db.host,
        access_key=db.login,
        secret_key=db.password,
        catalog=extra.get('catalog', 'hive'),
    )
def get(query):
    """Run *query* on Athena and return the result rows as a list of lists.

    Staging dir and region come from the ATHENA_BUCKET / AWS_REGION
    environment variables ('UNSET' placeholders when missing).
    """
    print("running query..")
    with contextlib.closing(
            connect(s3_staging_dir=os.getenv('ATHENA_BUCKET', 'UNSET'),
                    region_name=os.getenv('AWS_REGION', 'UNSET'))) as conn:
        with conn.cursor() as cursor:
            cursor.execute(query)
            df = as_pandas(cursor)
    # BUG FIX: the original returned the bound method `df.values.tolist`
    # (a callable), not the data — it must be called.
    return df.values.tolist()
def check_table(schema_name, table_name):
    """Check if a table exists in the given athena schema.

    Args:
        schema_name: schema to list tables from.
        table_name: table to look for.

    Returns:
        True when *table_name* appears in ``SHOW TABLES FROM schema_name``.
    """
    query = "SHOW TABLES FROM {0}".format(schema_name)
    # The original's `except Exception as ex: raise ex` added nothing and is
    # dropped; its comprehension also shadowed the `table_name` parameter,
    # renamed here for clarity (same result either way in Python 3).
    with contextlib.closing(connect()) as conn:
        with conn.cursor() as cursor:
            cursor.execute(query)
            return table_name in [row[0] for row in cursor.fetchall()]
def execute_sql_athena(query_str):
    """Execute *query_str* on Athena and return all fetched rows.

    Returns None if execution fails before any rows are fetched (the
    connection is always closed).
    """
    results = None
    conn = connect(
        s3_staging_dir='s3://aws-athena-query-results-827860021338-eu-west-1/',
        region_name='eu-west-1')
    try:
        with conn.cursor() as cursor:
            cursor.execute(query_str)
            results = cursor.fetchall()
    finally:
        conn.close()
    return results
def make_partitions(db_name, table_name, temp_dir):
    """
    Temp dir: e.g 's3://alpha-dag-data-warehouse-template/temp_delete/'
    """
    conn = connect(s3_staging_dir=temp_dir, region_name='eu-west-1')
    sql = """
    MSCK REPAIR TABLE {}.{};
    """.format(db_name, table_name)
    # FIX: the original leaked both cursor and connection; close them
    # even when execute() raises.
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql)
        conn.commit()
    finally:
        conn.close()
def __init__(self):
    """Set up s3 resource/client handles and an Athena connection."""
    credentials = dict(aws_access_key_id=S3_KEY,
                       aws_secret_access_key=S3_SECRET_ACCESS_KEY)
    self.s3 = boto3.resource("s3", **credentials)
    self.s3_client = boto3.client("s3", **credentials)
    staging_dir = 's3://{bucket}/athena/aijus/query_stage/'.format(
        bucket=DB_BUCKET)
    self.conn = connect(access_key=S3_KEY,
                        secret_key=S3_SECRET_ACCESS_KEY,
                        region_name='us-east-1',
                        s3_staging_dir=staging_dir)
def execute_non_query(self, query, data=None):
    """Run an insert/select/ddl statement, discarding any result set.

    Args:
        query: the statement to be executed.
        data: optional parameters for a parametrized query.

    Returns:
        None. (The original docstring claimed a DataFrame was returned,
        which was never true — nothing is fetched here.)
    """
    # The original's `except Exception as ex: raise ex` wrapper added
    # nothing and is dropped; exceptions propagate unchanged.
    with contextlib.closing(connect()) as conn:
        with conn.cursor() as cursor:
            cursor.execute(query, data)
def _select_pd_jdbc(sql: str,
                    aws_region: str,
                    database: Optional[str] = None,
                    params: Union[dict, list, None] = None,
                    workgroup: Optional[str] = None,
                    s3_output_location: Optional[str] = None) -> pd.DataFrame:
    """Execute *sql* on Athena over JDBC and return the rows as a DataFrame.

    The query is collapsed to one line, converted from the module paramstyle
    to pyformat, executed, and read back with as_pandas.
    """
    sql_one_line = ' '.join(sql.split())
    klog.trace(f'running query on athena, method jdbc: {sql_one_line}')
    # Render a parameter-free copy purely for trace logging.
    plain_sql, _ = kdb.convert_paramstyle(sql_one_line, params,
                                          in_style=paramstyle,
                                          out_style='plain')
    klog.trace(f'plain query: {plain_sql}')
    if klog.log_level > 0:
        klog.debug('running query on athena, method jdbc')
    _sql, _params = kdb.convert_paramstyle(sql_one_line, params,
                                           in_style=paramstyle,
                                           out_style='pyformat')
    conn_kwargs = {
        'Workgroup': workgroup,
        'AwsRegion': aws_region,
        'S3OutputLocation': s3_output_location
    }
    if database is not None:
        conn_kwargs['Schema'] = database
    with contextlib.closing(pyathenajdbc.connect(**conn_kwargs)) as conn:
        with contextlib.closing(conn.cursor()) as cursor:
            results = cursor.execute(_sql, _params)
            klog.trace('query executed')
            if klog.log_level > 0:
                klog.debug('query execution completed.')
            df = pyathenajdbc.util.as_pandas(results)
            klog.trace('query results read into dataframe with {} rows.',
                       len(df))
    return df
def query_from_athena_using_jdbc():
    """Query the pixel-attributes stream over JDBC and print the top rows."""
    conn = connect(
        s3_staging_dir='s3://krux-attribute-metrics-staging/stream-processor/pixel_attributes_stream/',
        region_name='us-east-1',
        driver_path='/home/vagrant/.virtualenvs/pycharm/lib/python2.7/site-packages/pyathenajdbc/AthenaJDBC41-1.1.0.jar',
    )
    sql = """
        select process_time
              ,sum(views)
        from spectrum_db.pixel_attributes_stream
        where organization = 'c819846f-b6e4-4ef4-b357-28399862b473'
          and process_time > '2017-10-30_18:55'
        group by process_time
        order by 1 desc
        limit 10
    """
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql)
            print(cursor.description)
            print(cursor.fetchall())
    finally:
        conn.close()
from pyathenajdbc import connect
import pandas as pd


def processCategories(line):
    """Split an underscore-delimited category string into a list of
    non-empty segments (consecutive underscores are collapsed)."""
    # The original did list(map(str, ...)) then list(filter(None, ...));
    # str() on items of str.split() is a no-op, and one comprehension
    # does the split+filter in a single pass.
    return [part for part in line.split("_") if part]


if __name__ == "__main__":
    # NOTE(review): ACCESS_KEY and AWS_SECRET_ACCESS_KEY are not defined in
    # this view of the file — confirm they are imported/defined elsewhere.
    conn = connect(
        access_key=ACCESS_KEY,
        secret_key=AWS_SECRET_ACCESS_KEY,
        s3_staging_dir='s3://athena-bucket-reviews',
        region_name='us-east-2',
    )
    # Random sample of 5000 products with review count and average rating.
    df_false = pd.read_sql(
        """
        SELECT product_id as asin, product_category,
        count(review_headline) as count_reviews,
        avg(star_rating) as rating
        FROM amazon_reviews_parquet
        GROUP BY product_id, product_category
        ORDER BY RANDOM() LIMIT 5000;""", conn)
    df_false['product_category'] = df_false['product_category'].apply(
        processCategories)
    data_true = "/reviews/products.csv"
    data_true = pd.read_csv(data_true, engine='python')
def connect(self, work_group=None):
    """Open a JDBC Athena connection on the module SCHEMA.

    Args:
        work_group: optional Athena workgroup to run under.
    """
    from pyathenajdbc import connect
    conn = connect(Schema=SCHEMA, Workgroup=work_group)
    return conn
def connect(self):
    """Open an Athena connection with all-default settings."""
    conn = connect()
    return conn
def teardown_module(module):
    """Module-level pytest teardown: drop the test database."""
    with contextlib.closing(connect()) as conn:
        with conn.cursor() as cur:
            _drop_database(cur)
def setup_module(module):
    """Module-level pytest setup: create the test database and table."""
    with contextlib.closing(connect()) as conn:
        with conn.cursor() as cur:
            _create_database(cur)
            _create_table(cur)
def _teardown_session():
    """Session finalizer: drop the database created for the test session."""
    with contextlib.closing(connect()) as conn:
        with conn.cursor() as cur:
            _drop_database(cur)
def _setup_session(request):
    """Session fixture body: create database + table, tear down at exit."""
    request.addfinalizer(_teardown_session)
    with contextlib.closing(connect()) as conn:
        with conn.cursor() as cur:
            _create_database(cur)
            _create_table(cur)
# Using JDBC https://github.com/laughingman7743/PyAthenaJDBC/
import os
import pyathenajdbc
from pyathenajdbc import connect

# First connection: named profile, parametrized (pyformat) query.
conn = connect(S3OutputLocation='s3://YOUR_S3_BUCKET/path/1/',
               AwsRegion='us-west-2',
               Profile='YOUR_PROFILE_NAME')
try:
    with conn.cursor() as cursor:
        cursor.execute("""
            SELECT col_string FROM one_row_complex
            WHERE col_string = %(param)s
            """, {'param': 'a string'})
        print(cursor.fetchall())
finally:
    conn.close()

# Second connection: default credentials, iterate rows one at a time.
conn = connect(S3OutputLocation='s3://YOUR_S3_BUCKET/path/2/',
               AwsRegion='us-west-2')
try:
    with conn.cursor() as cursor:
        cursor.execute("""
            SELECT * FROM many_rows LIMIT 10
            """)
        for record in cursor:
            print(record)
finally:
    conn.close()
def execute_other_thread():
    """Run a trivial query on a fresh connection and return all rows."""
    with contextlib.closing(connect(schema_name=SCHEMA)) as conn:
        with conn.cursor() as cur:
            cur.execute('SELECT * FROM one_row')
            return cur.fetchall()
def connection_athena(access_key, secret_key, s3_staging_dir, region_name):
    """Create and return an Athena connection with explicit credentials.

    Args:
        access_key: AWS access key id.
        secret_key: AWS secret access key.
        s3_staging_dir: s3 location for Athena query results.
        region_name: AWS region to connect to.
    """
    return connect(access_key=access_key,
                   secret_key=secret_key,
                   s3_staging_dir=s3_staging_dir,
                   region_name=region_name)
def connect(self):
    """Open an Athena connection on the module-level SCHEMA."""
    conn = connect(schema_name=SCHEMA)
    return conn
def connecti(self):
    """Open the JDBC Athena connection and store it on ``self.conn``.

    NOTE(review): the name looks like a typo for ``connect`` — kept as-is
    since callers may depend on it.
    """
    kwargs = dict(
        s3_staging_dir="s3://athena-internship",
        region_name="us-east-1",
        log_path=log_path,
    )
    self.conn = pyathenajdbc.connect(**kwargs)