import pandas as pd
from pyathena import connect


class read_athena():
    # The connection and the query both run at class-definition time; the
    # results are exposed as class attributes.
    conn = connect(s3_staging_dir='s3://willjeventdata/seatgeek/main data',
                   region_name='us-west-2')

    df = pd.read_sql("select * from tickets_db.seatgeek_events limit 100",
                     conn)
Example No. 2
    def run_query(self, query, user):
        cursor = pyathena.connect(
            s3_staging_dir=self.configuration['s3_staging_dir'],
            region_name=self.configuration['region'],
            aws_access_key_id=self.configuration.get('aws_access_key', None),
            aws_secret_access_key=self.configuration.get(
                'aws_secret_key', None),
            schema_name=self.configuration.get('schema', 'default'),
            encryption_option=self.configuration.get('encryption_option',
                                                     None),
            kms_key=self.configuration.get('kms_key', None),
            formatter=SimpleFormatter()).cursor()

        try:
            cursor.execute(query)
            column_tuples = [(i[0], _TYPE_MAPPINGS.get(i[1], None))
                             for i in cursor.description]
            columns = self.fetch_columns(column_tuples)
            rows = [
                dict(zip(([c['name'] for c in columns]), r))
                for i, r in enumerate(cursor.fetchall())
            ]
            data = {'columns': columns, 'rows': rows}
            json_data = json.dumps(data, cls=JSONEncoder)
            error = None
        except KeyboardInterrupt:
            if cursor.query_id:
                cursor.cancel()
            error = "Query cancelled by user."
            json_data = None
        except Exception as ex:
            if cursor.query_id:
                cursor.cancel()
            error = str(ex)
            json_data = None

        return json_data, error
Example No. 3
    def _connect(cfg):
        global conn, cursor, databases
        try:
            info("Connect.")
            conn = connect(aws_access_key_id=cfg['aws']['access_key'],
                           aws_secret_access_key=cfg['aws']['secret_key'],
                           s3_staging_dir=cfg['aws']['s3_stage_dir'],
                           region_name='ap-northeast-2')

            cursor = conn.cursor()
            databases = [
                rv[0] for rv in cursor.execute('show databases').fetchall()
            ]

        except Exception as e:
            error("=== DB Error Message ===")
            error(str(e))
            messagebox.showerror("Error", "DB connection error. Please check your AWS settings.")
            aws_btn['state'] = 'normal'
        else:
            warning("Connect success.")
            # Set the databases on every profile
            if databases is not None:
                for k, pro in profiles.items():
                    pro.set_databases(databases)
            win.after(10, _first_db_set)
            # enable_controls()
        finally:
            wait_dlg.top.destroy()
            unset_wait_cursor()
    def __post_init__(self):
        # set up the datasets
        (
            self.target_time_series,
            self.related_time_series,
            self.item_metadata,
        ) = self._get_datasets()

        # get the dataset group (from the provided forecast)
        self.dataset_group = self.forecast._dataset_group

        # set up column mappings
        column_mappings = {}
        for n in range(1, MAX_DIMENSIONS + 1):
            column_mappings[f"dimension_{n}"] = None
        for n in range(1, MAX_METADATA_FIELDS + 1):
            column_mappings[f"metadata_{n}"] = None
        self.column_mappings = column_mappings

        # set up the connection to AWS Athena
        self.cursor = connect(
            region_name=get_aws_region(),
            work_group=self.workgroup,
            schema_name=self.schema,
            cursor_class=AsyncCursor,
        ).cursor()
Example No. 5
    def _open_connection(self):
        return pyathena.connect(
            aws_access_key_id=self.config.access_key,
            aws_secret_access_key=self.config.secret_key,
            s3_staging_dir=self.config.staging_dir,
            region_name=self.config.region,
        )
Example No. 6
def athena_python(sql='', s3_staging_dir=''):
    cnx = connect(aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
                  aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
                  region_name=os.environ['AWS_DEFAULT_REGION'],
                  s3_staging_dir=s3_staging_dir)
    athena_df = pd.read_sql(sql, cnx)
    return athena_df
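
# A minimal usage sketch for athena_python (assumes AWS credentials are set in
# the environment; the table and staging bucket below are placeholders):
events_df = athena_python(sql="SELECT * FROM my_db.my_table LIMIT 10",
                          s3_staging_dir="s3://my-athena-results/staging/")
print(events_df.head())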
Example No. 7
    def query(self, q):
        _conn_args = {
            's3_staging_dir': self.staging_dir,
            'database': self.database
        }

        if self.access_key is not None and self.secret_key is not None:
            _conn_args['aws_access_key_id'] = self.access_key
            _conn_args['aws_secret_access_key'] = self.secret_key

        if self.region_name is not None:
            _conn_args['region_name'] = self.region_name

        conn = connect(**_conn_args)
        cursor = conn.cursor()

        cursor.execute(q)

        # Load query results into Pandas DataFrame and show results
        df = as_pandas(cursor)

        col_map = {}
        for col in df.columns:
            col_map[col] = col

        return df, col_map
Example No. 8
    def _new_connection(self):
        config = self.config
        if self.mode == 'jdbc':
            conn_params = {
                'Workgroup': config.workgroup,
                'AwsRegion': config.region,
                'S3OutputLocation': config.output_location
            }
            if config.default_database is not None:
                conn_params['Schema'] = config.default_database

            conn = pyathenajdbc.connect(**conn_params)

        else:  # rest, csv
            assert self.mode in ['rest', 'csv']

            conn_params = {
                'work_group': config.workgroup,
                'region_name': config.region,
                'output_location': config.output_location
            }
            if config.default_database is not None:
                conn_params['schema_name'] = config.default_database
            if self.mode == 'csv':
                conn_params['cursor_class'] = PandasCursor
            conn = pyathena.connect(**conn_params)

        return conn
Example No. 9
def get_device_data(data=None, devices=None):
    # Avoid shared mutable default arguments
    data = {} if data is None else data
    devices = [] if devices is None else devices
    added = 0
    start = int(get_start().strftime('%s')) * 1000
    stop = int(get_stop().strftime('%s')) * 1000

    c = connect(s3_staging_dir='s3://parsyl-athena-output-production-useast1',
                region_name='us-east-1')

    for device_id in tqdm_notebook(devices, desc='device data loaded'):
        if device_id in data:
            continue

        stmt = """
            SELECT time, temperature, humidity
            FROM parsyl_device_data_database.parsyl_data_lake_production_useast1_v3
            WHERE device=%(device_id)s AND temperature IS NOT NULL AND time >= %(start)d AND time <= %(stop)d
            ORDER BY time
        """
        try:
            with c.cursor() as cursor:
                cursor.execute(stmt, {
                    'device_id': device_id,
                    'start': start,
                    'stop': stop
                })
                data[device_id] = as_pandas(cursor)
                added += 1
        except Exception as e:
            print('ERROR querying device data - {}'.format(e))

    c.close()

    return data, added
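
# A minimal usage sketch (the device IDs are placeholders; get_start/get_stop
# and the Athena table queried above are assumed to exist):
device_data, added = get_device_data(data={}, devices=['device-001', 'device-002'])
print('loaded data for', added, 'devices')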
Example No. 10
def create_connection(pyathena_rc_path=None):
    """Creates SQL Server connection using AWS Athena credentials

    Args:
        pyathena_rc_path (str): Path to the PyAthena RC file with the AWS Athena variables (default: None)

    Returns:
        Connection Cursor
    """
    if pyathena_rc_path is None:
        pyathena_rc_path = Path(__file__).parent / "../../../pyathena/pyathena.rc"

    # Open Pyathena RC file and get list of all connection variables in a processable format
    with open(pyathena_rc_path) as f:
        lines = f.readlines()

    lines = [x.strip() for x in lines]
    lines = [x.split('export ')[1] for x in lines]
    lines = [line.replace('=', '="') + '"' if '="' not in line else line for line in lines]
    variables = [line.split('=') for line in lines]

    # Create variables using the processed variable names from the RC file
    AWS_CREDS = {}
    for key, var in variables:
        exec("{} = {}".format(key, var), AWS_CREDS)

    # Create connection
    cursor = connect(aws_access_key_id=AWS_CREDS['AWS_ACCESS_KEY_ID'],
                     aws_secret_access_key=AWS_CREDS['AWS_SECRET_ACCESS_KEY'],
                     s3_staging_dir=AWS_CREDS['AWS_ATHENA_S3_STAGING_DIR'],
                     region_name=AWS_CREDS['AWS_DEFAULT_REGION'],
                     work_group=AWS_CREDS['AWS_ATHENA_WORK_GROUP'],
                     schema_name=SCHEMA_NAME).cursor(PandasCursor)
    return cursor
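
# A minimal usage sketch (assumes a pyathena.rc file exists at the default path
# and SCHEMA_NAME is defined at module level):
cursor = create_connection()
df = cursor.execute("SELECT 1 AS ok").as_pandas()
print(df)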
Example No. 11
    def run_query(self, query, user):
        cursor = pyathena.connect(
            s3_staging_dir=self.configuration['s3_staging_dir'],
            region_name=self.configuration['region'],
            aws_access_key_id=self.configuration.get('aws_access_key', None),
            aws_secret_access_key=self.configuration.get('aws_secret_key', None),
            schema_name=self.configuration.get('schema', 'default'),
            encryption_option=self.configuration.get('encryption_option', None),
            kms_key=self.configuration.get('kms_key', None),
            formatter=SimpleFormatter()).cursor()

        try:
            cursor.execute(query)
            column_tuples = [(i[0], _TYPE_MAPPINGS.get(i[1], None)) for i in cursor.description]
            columns = self.fetch_columns(column_tuples)
            rows = [dict(zip(([c['name'] for c in columns]), r)) for i, r in enumerate(cursor.fetchall())]
            data = {'columns': columns, 'rows': rows}
            json_data = json.dumps(data, cls=JSONEncoder)
            error = None
        except KeyboardInterrupt:
            if cursor.query_id:
                cursor.cancel()
            error = "Query cancelled by user."
            json_data = None
        except Exception as ex:
            if cursor.query_id:
                cursor.cancel()
            error = str(ex)
            json_data = None

        return json_data, error
Example No. 12
def _setup_session(request):
    request.addfinalizer(_teardown_session)
    _upload_rows()
    with contextlib.closing(connect()) as conn:
        with conn.cursor() as cursor:
            _create_database(cursor)
            _create_table(cursor)
Example No. 13
def _cursor(cursor_class, request):
    if not hasattr(request, "param"):
        setattr(request, "param", {})
    with contextlib.closing(
        connect(schema_name=ENV.schema, cursor_class=cursor_class, **request.param)
    ) as conn:
        with conn.cursor() as cursor:
            yield cursor
def lambda_handler(event, context):
    ATHENA_OUTPUT_BUCKET = os.environ['Athena_bucket']
    DB_SCHEMA = os.environ['Db_schema']
    USE_WEATHER_DATA = os.environ['With_weather_data']

    pathParameter = event["pathParameters"]
    queryParameter = event["queryStringParameters"]

    if ("meter_id" not in pathParameter) \
            or ("data_start" not in queryParameter) \
            or ("data_end" not in queryParameter):
        return {
            'statusCode': 500,
            'body': "error: meter_id, data_start, and data_end need to be provided."
        }

    METER_ID = pathParameter['meter_id']
    ML_ENDPOINT_NAME = load_json_from_file(
        WORKING_BUCKET, "meteranalytics/initial_pass")["ML_endpoint_name"]
    DATA_START = queryParameter['data_start'].replace("-", "")
    DATA_END = queryParameter['data_end'].replace("-", "")

    connection = connect(
        s3_staging_dir='s3://{}/'.format(ATHENA_OUTPUT_BUCKET),
        region_name=REGION)
    query = '''select date_trunc('HOUR', reading_date_time) as datetime, sum(reading_value) as consumption
                from "{}".daily
                where meter_id = '{}' and date_str >= '{}'
                and  date_str < '{}'
                group by 1;
                '''.format(DB_SCHEMA, METER_ID, DATA_START, DATA_END)
    result = pd.read_sql(query, connection)
    result = result.set_index('datetime')

    data_kw = result.resample('1H').sum()
    timeseries = data_kw.iloc[:, 0]  # np.trim_zeros(data_kw.iloc[:, 0], trim='f')

    freq = 'H'
    df_weather = None
    if USE_WEATHER_DATA == '1':  # environment variables are strings
        df_weather = get_weather(connection, DATA_START, DB_SCHEMA)

    runtime = boto3.client('runtime.sagemaker')
    response = runtime.invoke_endpoint(EndpointName=ML_ENDPOINT_NAME,
                                       ContentType='application/json',
                                       Body=encode_request(
                                           timeseries[:], df_weather))
    prediction_time = timeseries.index[-1] + pd.Timedelta(1, unit='H')
    df_prediction = decode_response(response['Body'].read(), freq,
                                    prediction_time)

    df_prediction.columns = ['consumption']
    prediction_result = df_prediction.to_json()

    return {"statusCode": 200, "body": prediction_result}
Example No. 15
def create_client(config, logger: Logger):
    """Generates an athena client object

    Args:
        config ([type]): [description]
        logger (Logger): [description]

    Returns:
        cursor: athena client object
    """

    logger.info("Attempting to create Athena session")

    # Get the required parameters from config file and/or environment variables
    aws_access_key_id = config.get("aws_access_key_id") or os.environ.get(
        "AWS_ACCESS_KEY_ID"
    )
    aws_secret_access_key = config.get("aws_secret_access_key") or os.environ.get(
        "AWS_SECRET_ACCESS_KEY"
    )
    aws_session_token = config.get("aws_session_token") or os.environ.get(
        "AWS_SESSION_TOKEN"
    )
    aws_profile = config.get("aws_profile") or os.environ.get("AWS_PROFILE")
    aws_region = config.get("aws_region") or os.environ.get("AWS_REGION")
    s3_staging_dir = config.get("s3_staging_dir") or os.environ.get("S3_STAGING_DIR")
    logger.info(f"Using Athena region {aws_region}")

    # AWS credentials based authentication
    if aws_access_key_id and aws_secret_access_key:
        cursor = connect(
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region_name=aws_region,
            s3_staging_dir=s3_staging_dir,
        ).cursor()

    # AWS Profile based authentication
    else:
        cursor = connect(
            profile_name=aws_profile,
            region_name=aws_region,
            s3_staging_dir=s3_staging_dir,
        ).cursor()
    return cursor
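
# A minimal usage sketch (the region and staging bucket are placeholders; any
# missing keys fall back to environment variables):
import logging
cursor = create_client({"aws_region": "us-east-1",
                        "s3_staging_dir": "s3://my-athena-results/"},
                       logging.getLogger(__name__))
cursor.execute("SELECT 1")
print(cursor.fetchall())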
Example No. 16
def get_conn() -> AthenaConn:
    """Allow passing of pyathena connection."""
    # load the environment
    env_name = ".env"
    load_dotenv(dotenv_path=os.path.join(PROJ_DIR, env_name))
    return pyathena.connect(aws_access_key_id=os.getenv("ACCESS_KEY"),
                            aws_secret_access_key=os.getenv("SECRET_KEY"),
                            s3_staging_dir=os.getenv("S3_DIR"),
                            region_name=os.getenv("REGION_NAME"))
def lambda_handler(event, context):
    ATHENA_OUTPUT_BUCKET = os.environ['Athena_bucket']
    S3_BUCKET = os.environ['Working_bucket']
    DB_SCHEMA = os.environ['Db_schema']

    BATCH_START = event['Batch_start']
    BATCH_END = event['Batch_end']
    FORECAST_START = event['Data_end']
    FORECAST_PERIOD = event['Forecast_period']
    prediction_length = FORECAST_PERIOD * 24

    connection = connect(
        s3_staging_dir='s3://{}/'.format(ATHENA_OUTPUT_BUCKET),
        region_name=REGION)

    output = 'meteranalytics/inference/batch_%s_%s/batch.json.out' % (
        BATCH_START, BATCH_END)
    boto3.Session().resource('s3').Bucket(S3_BUCKET).Object(
        output).download_file('/tmp/batch.out.json')
    print('get inference result')

    freq = 'H'
    prediction_time = pd.Timestamp(FORECAST_START, freq=freq)
    prediction_index = pd.date_range(
        start=prediction_time,
        end=prediction_time + pd.Timedelta(prediction_length - 1, unit='H'),
        freq=freq)
    dict_of_samples = {}

    meterids = get_meters(connection, BATCH_START, BATCH_END, DB_SCHEMA)

    results = pd.DataFrame(columns=['meterid', 'datetime', 'kwh'])
    i = 0
    with open('/tmp/batch.out.json') as fp:
        for line in fp:
            df = pd.DataFrame(
                data={**json.loads(line)['quantiles'], **dict_of_samples},
                index=prediction_index)
            dataframe = pd.DataFrame({
                'meter_id':
                np.array([meterids[i] for x in range(df['0.9'].count())]),
                'datetime':
                df.index.values,
                'consumption':
                df['0.9'].values
            })
            i = i + 1
            results = pd.concat([results, dataframe])  # DataFrame.append was removed in pandas 2.0

    results.to_csv('/tmp/forecast.csv', index=False)
    boto3.Session().resource('s3').Bucket(S3_BUCKET).Object(
        os.path.join(
            'meteranalytics', 'forecast/{}/batch_{}_{}.csv'.format(
                FORECAST_START, BATCH_START,
                BATCH_END))).upload_file('/tmp/forecast.csv')
Example No. 18
    def create_connection(self):
        # pyathena.connect will do the role resolving
        # aws_credentials = self.aws_credentials.resolve_role('soda_scan')
        return pyathena.connect(
            aws_access_key_id=self.aws_credentials.access_key_id,
            aws_secret_access_key=self.aws_credentials.secret_access_key,
            s3_staging_dir=self.athena_staging_dir,
            region_name=self.aws_credentials.region_name,
            role_arn=self.aws_credentials.role_arn)
Example No. 19
    def run_query(self, query, user):
        cursor = pyathena.connect(
            s3_staging_dir=self.configuration['s3_staging_dir'],
            region_name=self.configuration['region'],
            aws_access_key_id=self.configuration.get('aws_access_key', None),
            aws_secret_access_key=self.configuration.get(
                'aws_secret_key', None),
            schema_name=self.configuration.get('schema', 'default'),
            encryption_option=self.configuration.get('encryption_option',
                                                     None),
            kms_key=self.configuration.get('kms_key', None),
            formatter=SimpleFormatter()).cursor()

        try:
            cursor.execute(query)
            column_tuples = [(i[0], _TYPE_MAPPINGS.get(i[1], None))
                             for i in cursor.description]
            columns = self.fetch_columns(column_tuples)
            rows = [
                dict(zip(([c['name'] for c in columns]), r))
                for i, r in enumerate(cursor.fetchall())
            ]
            qbytes = None
            athena_query_id = None
            try:
                qbytes = cursor.data_scanned_in_bytes
            except AttributeError as e:
                logger.debug(
                    "Athena Upstream can't get data_scanned_in_bytes: %s", e)
            try:
                athena_query_id = cursor.query_id
            except AttributeError as e:
                logger.debug("Athena Upstream can't get query_id: %s", e)
            data = {
                'columns': columns,
                'rows': rows,
                'metadata': {
                    'data_scanned': qbytes,
                    'athena_query_id': athena_query_id
                }
            }
            json_data = simplejson.dumps(data,
                                         ignore_nan=True,
                                         cls=SimpleJSONEncoder)
            error = None
        except KeyboardInterrupt:
            if cursor.query_id:
                cursor.cancel()
            error = "Query cancelled by user."
            json_data = None
        except Exception as ex:
            if cursor.query_id:
                cursor.cancel()
            error = str(ex)
            json_data = None

        return json_data, error
Example No. 20
    def authenticate(self):
        conn = connect(  #jvm_path='/Library/Java/JavaVirtualMachines/jdk1.8.0_161.jdk/Contents/MacOS/libjli.dylib',
            profile_name=self.profile_name,
            aws_access_key_id=self.credentials.access_key,
            aws_secret_access_key=self.credentials.secret_key,
            aws_session_token=self.credentials.token,
            region_name=self.region_name,
            s3_staging_dir=self.s3_staging_dir)

        return conn
Example No. 21
    def __init__(self):
        athena_settings = settings.EXTERNAL_SERVICES["AWS"]["ATHENA"]
        self.cursor = connect(
            aws_access_key_id=athena_settings["ACCESS_KEY"],
            aws_secret_access_key=athena_settings["SECRET_KEY"],
            region_name=athena_settings["REGION_NAME"],
            s3_staging_dir=athena_settings["STAGING_DIR"]).cursor()
Example No. 22
def athena_add_partition(cluster, d_yyyy, d_mm, d_dd):
    cursor = connect(aws_access_key_id=conf.athena_access_key,
                     aws_secret_access_key=conf.athena_secret_access_key,
                     s3_staging_dir=conf.s3_staging_dir,
                     region_name=conf.region_name).cursor()

    command = "ALTER TABLE " + conf.athena_schema + "." + conf.athena_table + " ADD IF NOT EXISTS PARTITION "
    command = command + "(cluster='" + cluster + "', yyyy='" + d_yyyy + "', mm='" + d_mm + "', dd='" + d_dd + "') "
    command = command + "location 's3://" + conf.s3_path + cluster + "/" + d_yyyy + "/" + d_mm + "/" + d_dd + "'"
    Mylogger.info("Athena_add_partition:" + command)
    cursor.execute(command)
Example No. 23
def create_connection():
    # Create a database connection based on all of the environment information set above.
    app.logger.info("creating connection...")
    aws_credentials = get_raven_athena_credentials()
    return connect(
        s3_staging_dir=ATHENA_BUCKET,
        region_name=AWS_REGION,
        aws_access_key_id=aws_credentials["AccessKeyId"],
        aws_secret_access_key=aws_credentials["SecretAccessKey"],
        aws_session_token=aws_credentials["SessionToken"],
    )
Example No. 24
    def _get_client(self):
        """
        Connect to Athena lazily

        :rtype: pyathena.connection.Connection
        """
        self.logger.info('Connecting to Athena in "%s"...',
                         self._connection_params['region_name'])

        # https://pypi.org/project/PyAthena/
        return connect(**self._connection_params)
def connect_athena(path='configs/athena.yaml'):
    """Gets an Athena connection given an Athena configuration file.

    Returns
    -------
    pyathena.connection.Connection
        Athena database connection
    """

    with open(path, 'r') as f:
        athena_config = yaml.safe_load(f)

    return connect(**athena_config)
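
# A minimal usage sketch (assumes configs/athena.yaml holds keyword arguments
# accepted by pyathena.connect, e.g. s3_staging_dir and region_name):
import pandas as pd
conn = connect_athena('configs/athena.yaml')
print(pd.read_sql("SELECT 1 AS ok", conn))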
Example No. 26
    def run_query(self, query, user):
        cursor = pyathena.connect(
            s3_staging_dir=self.configuration["s3_staging_dir"],
            schema_name=self.configuration.get("schema", "default"),
            encryption_option=self.configuration.get("encryption_option", None),
            kms_key=self.configuration.get("kms_key", None),
            work_group=self.configuration.get("work_group", "primary"),
            formatter=SimpleFormatter(),
            **self._get_iam_credentials(user=user)
        ).cursor()

        try:
            cursor.execute(query)
            column_tuples = [
                (i[0], _TYPE_MAPPINGS.get(i[1], None)) for i in cursor.description
            ]
            columns = self.fetch_columns(column_tuples)
            rows = [
                dict(zip(([c["name"] for c in columns]), r))
                for i, r in enumerate(cursor.fetchall())
            ]
            qbytes = None
            athena_query_id = None
            try:
                qbytes = cursor.data_scanned_in_bytes
            except AttributeError as e:
                logger.debug("Athena Upstream can't get data_scanned_in_bytes: %s", e)
            try:
                athena_query_id = cursor.query_id
            except AttributeError as e:
                logger.debug("Athena Upstream can't get query_id: %s", e)
            data = {
                "columns": columns,
                "rows": rows,
                "metadata": {
                    "data_scanned": qbytes,
                    "athena_query_id": athena_query_id,
                },
            }
            json_data = json_dumps(data, ignore_nan=True)
            error = None
        except (KeyboardInterrupt, InterruptException):
            if cursor.query_id:
                cursor.cancel()
            error = "Query cancelled by user."
            json_data = None
        except Exception as ex:
            if cursor.query_id:
                cursor.cancel()
            error = str(ex)
            json_data = None

        return json_data, error
Example No. 27
    def connect(self, database=None):
        conn = pyathena.connect(
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_secret_access_key,
            region_name=self.region_name,
            s3_staging_dir=self.s3_staging_dir,
            schema_name=database or self.database)
        self.database = database or self.database

        if hasattr(self, 'conn'):
            self.conn.close()
        self.conn = conn
Example No. 28
def run_query(sql):
    from pyathena import connect

    cursor = connect(
        profile_name=aws_profile,
        s3_staging_dir=result_uri,
        schema_name="learn_athena",
    ).cursor()

    cursor.execute(sql)
    for row in cursor.fetchall():
        print(row)
def lambda_handler(event, context):
    ATHENA_OUTPUT_BUCKET = os.environ['Athena_bucket']
    DB_SCHEMA = os.environ['Db_schema']
    USE_WEATHER_DATA = os.environ['With_weather_data']

    pathParameter = event["pathParameters"]
    queryParameter = event["queryStringParameters"]

    if ("meter_id" not in pathParameter) \
            or ("data_start" not in queryParameter) \
            or ("data_end" not in queryParameter) \
            or ("outlier_only" not in queryParameter):
        return {
            'statusCode': 500,
            'body': "error: meter_id, data_start, data_end, and outlier_only need to be provided."
        }

    METER_ID = pathParameter['meter_id']
    DATA_START = queryParameter['data_start']
    DATA_END = queryParameter['data_end']
    OUTLIER_ONLY = queryParameter['outlier_only']

    connection = connect(
        s3_staging_dir='s3://{}/'.format(ATHENA_OUTPUT_BUCKET),
        region_name=REGION)

    if USE_WEATHER_DATA == '1':  # environment variables are strings
        query = '''with weather_daily as (
            select date_trunc('day', date_parse(time,'%Y-%m-%d %H:%i:%s')) as datetime,
            avg(temperature) as temperature, avg(apparenttemperature) as apparenttemperature, avg(humidity) as humidity
            from default.weather group by 1
        )
        select ma.*, mw.temperature, mw.apparenttemperature
        from "{}".anomaly ma, weather_daily mw
        where meter_id = '{}'
        and cast(ma.ds as timestamp) >= timestamp '{}' and cast(ma.ds as timestamp) < timestamp '{}'
        and cast(ma.ds as timestamp) = mw.datetime
        '''.format(DB_SCHEMA, METER_ID, DATA_START, DATA_END)
    else:
        query = '''
        select * from "{}".anomaly
        where meter_id = '{}'
        and cast(ds as timestamp) >= timestamp '{}' and cast(ds as timestamp) < timestamp '{}'
        '''.format(DB_SCHEMA, METER_ID, DATA_START, DATA_END)

    if OUTLIER_ONLY == '1':  # query string parameters are strings
        query = query + ' and anomaly <> 0'

    df_consumption = pd.read_sql(query, connection)

    return {"statusCode": 200, "body": df_consumption.to_json()}
Example No. 30
    def connect(self):

        if not self._connected:

            # Check for database
            if not athena_exist(self._dbname, self._s3bucket, self._remotepath, verbose=False):
                self._connected = False
                pm.print_error('Athena [{0}] does not exist'.format(self._dbname), exit_code=1)

            if check_aws_env():

                try:
                    self._connection = pyathena.connect(aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
                                                        aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
                                                        s3_staging_dir=self._output_location,
                                                        region_name=os.environ['AWS_REGION'])

                except Exception as e:
                    pm.print_error('Error connecting to database')
                    pm.print_separator()
                    pm.print_error(str(e))
                    pm.print_separator()
                    self._connected = False
                    return

            else:

                try:
                    self._connection = pyathena.connect(s3_staging_dir=self._output_location,
                                                        region_name=os.environ['AWS_REGION'])

                except Exception as e:
                    pm.print_error('Error connecting to database')
                    pm.print_separator()
                    pm.print_error(str(e))
                    pm.print_separator()
                    self._connected = False
                    return

            # Only mark as connected when no exception occurred
            self._connected = True
Example No. 31
def make_athena_connection(s3_staging_dir=None,
                           region_name=None,
                           schema_name=None,
                           work_group=None):
    """
    Function that connects to an Athena database.

    Parameters
    ----------
    s3_staging_dir : str
        s3 path where Athena stages query results. Starts with 's3://'.
    region_name : str
        AWS region, e.g., 'us-west-2'
    schema_name : str
        name of the Athena database schema (a.k.a. database_name in AthenaEngine)
    work_group : None or str
        name of the AWS work_group

    Returns
    -------
    con : pyathena.connection.Connection
    cur : pyathena.cursor.Cursor
    engine : sqlalchemy.engine.Engine
    """
    # consolidate direct and configured arguments
    kwargs = _get_aws_connection_kwargs(s3_staging_dir=s3_staging_dir,
                                        region_name=region_name,
                                        schema_name=schema_name,
                                        work_group=work_group)

    con = pyathena.connect(**kwargs)
    cur = con.cursor()

    engine = None
    if kwargs['schema_name'] is not None:
        assert kwargs['region_name'] is not None, kwargs
        conn_str = make_athena_connection.CONNECTION_STRING.format(
            region_name=kwargs['region_name'],
            database_name=kwargs['schema_name'])
        sep = '?'
        if kwargs['s3_staging_dir'] is not None:
            conn_str += '{sep}s3_staging_dir={s3_staging_dir}'.format(
                sep=sep, s3_staging_dir=kwargs['s3_staging_dir'])
            sep = '&'
        if kwargs['work_group'] is not None:
            conn_str += '{sep}work_group={work_group}'.format(
                sep=sep, work_group=kwargs['work_group'])
        logger.debug(
            f"Connecting to AWS Athena with connection string:{conn_str}")
        engine = sa.create_engine(conn_str)

    return con, cur, engine
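
# A minimal usage sketch (the staging bucket, region, and schema are placeholders;
# make_athena_connection.CONNECTION_STRING is assumed to be set elsewhere):
con, cur, engine = make_athena_connection(
    s3_staging_dir='s3://my-athena-results/staging/',
    region_name='us-west-2',
    schema_name='my_database')
cur.execute('SELECT 1')
print(cur.fetchall())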
Example No. 32
    def run_query(self, query, user):
        cursor = pyathena.connect(
            s3_staging_dir=self.configuration['s3_staging_dir'],
            region_name=self.configuration['region'],
            aws_access_key_id=self.configuration.get('aws_access_key', None),
            aws_secret_access_key=self.configuration.get('aws_secret_key', None),
            schema_name=self.configuration.get('schema', 'default'),
            encryption_option=self.configuration.get('encryption_option', None),
            kms_key=self.configuration.get('kms_key', None),
            work_group=self.configuration.get('work_group', 'primary'),
            formatter=SimpleFormatter()).cursor()

        try:
            cursor.execute(query)
            column_tuples = [(i[0], _TYPE_MAPPINGS.get(i[1], None)) for i in cursor.description]
            columns = self.fetch_columns(column_tuples)
            rows = [dict(zip(([c['name'] for c in columns]), r)) for i, r in enumerate(cursor.fetchall())]
            qbytes = None
            athena_query_id = None
            try:
                qbytes = cursor.data_scanned_in_bytes
            except AttributeError as e:
                logger.debug("Athena Upstream can't get data_scanned_in_bytes: %s", e)
            try:
                athena_query_id = cursor.query_id
            except AttributeError as e:
                logger.debug("Athena Upstream can't get query_id: %s", e)
            data = {
                'columns': columns,
                'rows': rows,
                'metadata': {
                    'data_scanned': qbytes,
                    'athena_query_id': athena_query_id
                }
            }
            json_data = json_dumps(data, ignore_nan=True)
            error = None
        except (KeyboardInterrupt, InterruptException):
            if cursor.query_id:
                cursor.cancel()
            error = "Query cancelled by user."
            json_data = None
        except Exception as ex:
            if cursor.query_id:
                cursor.cancel()
            error = str(ex)
            json_data = None

        return json_data, error