def duplication_rate(request):
    query = Query(os.environ[c.ENV_DEPLOYMENT_STACK_ARN])
    results = query.execute_with_format('''
        with source as (
            select ''' + schema.SERVER_TIMESTAMP.long_name + ''' as srv_tmutc,
                   ''' + schema.UUID.long_name + ''' as uuid
            from "{0}"."{1}''' + DEFAULT_EVENTS.SESSIONSTART + '''"
            WHERE p_''' + schema.SERVER_TIMESTAMP.long_name + '''_strftime > date_format((current_timestamp - interval '24' hour), '%Y%m%d%H0000')
            UNION
            select ''' + schema.SERVER_TIMESTAMP.long_name + ''' as srv_tmutc,
                   ''' + schema.UUID.long_name + ''' as uuid
            from "{0}"."{1}''' + DEFAULT_EVENTS.CLIENTINITCOMPLETE + '''"
            WHERE p_''' + schema.SERVER_TIMESTAMP.long_name + '''_strftime > date_format((current_timestamp - interval '24' hour), '%Y%m%d%H0000')
        )
        SELECT to_unixtime(from_iso8601_timestamp(T1.tmp)) AS Timestmp,
               round((T1.value1 - T1.value2) / (T1.value2 * 1.0), 6) AS DuplicationRate
        FROM (
            SELECT date_format(from_unixtime(srv_tmutc), '%Y-%m-%dT%H:00:00Z') AS tmp,
                   count(uuid) AS value1,
                   count(distinct uuid) AS value2
            FROM source
            GROUP BY 1
        ) AS T1
        ORDER BY 1 asc''')
    return convert_to_tuple_dataset(results)

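# Illustrative sketch only, not used by the handlers in this module: the Athena
# query in duplication_rate computes, per UTC hour bucket, (total events -
# distinct client UUIDs) / distinct client UUIDs. The same calculation over an
# in-memory list of (server_timestamp, uuid) pairs, with a hypothetical helper
# name, looks like this:
def _duplication_rate_example(rows):
    from collections import defaultdict
    from datetime import datetime

    buckets = defaultdict(list)
    for srv_tmutc, uuid in rows:
        # Truncate the epoch timestamp to the start of its UTC hour, mirroring
        # date_format(from_unixtime(srv_tmutc), '%Y-%m-%dT%H:00:00Z') above.
        hour = datetime.utcfromtimestamp(srv_tmutc).replace(minute=0, second=0, microsecond=0)
        buckets[hour].append(uuid)
    return {
        hour: round((len(uuids) - len(set(uuids))) / (len(set(uuids)) * 1.0), 6)
        for hour, uuids in buckets.items()
    }
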
def platforms(request):
    query = Query(os.environ[c.ENV_DEPLOYMENT_STACK_ARN])
    results = query.execute_with_format(
        "select distinct T3.plt "
        "from "
        "( "
        "SELECT distinct " + schema.PLATFORM_ID.long_name + " as plt FROM \"{0}\".\"{1}" + DEFAULT_EVENTS.CLIENTINITCOMPLETE + "\" as T1 "
        ") as T3 order by 1 asc ")
    return convert_to_dataset(results)

def __update_partitions(paths):
    alter = StringIO()
    alter.write("ALTER TABLE {0}.{1} ADD ")
    for path in paths:
        if path.sensitivity_level == sensitivity.SENSITIVITY_TYPE.NONE and path.buildid == '1.0.2' and (
                path.platform == 'Android' or path.platform == 'OSX'):
            alter.write(
                " PARTITION (idx_source='{1}', idx_bldid='{2}', idx_year='{3}', idx_month='{4}', "
                "idx_day='{5}', idx_hour='{6}', idx_platform='{7}', idx_event='{8}') "
                "LOCATION 's3://<bucket>/{0}/{1}/{2}/{3}/{4}/{5}/{6}/{7}/{8}/'"
                .format(path.sensitivity_level, path.source, path.buildid, path.year,
                        path.month, path.day, path.hour, path.platform, path.event))
    query = Query(
        type('obj', (object,), {c.ENV_STACK_ID: os.environ[c.ENV_DEPLOYMENT_STACK_ARN]}))
    query.execute(alter.getvalue())

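# For one matching path, the statement assembled above renders along these lines
# (the partition values below are hypothetical, '<bucket>' is the elided bucket
# name, and the unformatted {0}.{1} after "ALTER TABLE" appears to be left for
# the Query layer to fill in):
#
#   ALTER TABLE {0}.{1} ADD
#    PARTITION (idx_source='cloudgemmetric', idx_bldid='1.0.2', idx_year='2018',
#               idx_month='07', idx_day='15', idx_hour='03', idx_platform='Android',
#               idx_event='sessionstart')
#    LOCATION 's3://<bucket>/<sensitivity_level>/cloudgemmetric/1.0.2/2018/07/15/03/Android/sessionstart/'
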
def query_results(request, id):
    query = Query(os.environ[c.ENV_DEPLOYMENT_STACK_ARN])
    results = query.client.get_query_execution(id)

    # the JSON serializer doesn't support these types right now
    del results['Status']['SubmissionDateTime']
    if 'CompletionDateTime' in results['Status']:
        del results['Status']['CompletionDateTime']

    if results['Status']['State'] == 'SUCCEEDED':
        results['Result'] = query.client.get_output(
            results['ResultConfiguration']['OutputLocation'])
    return results

def main(event, request):
    context = dict({})
    context[c.KEY_LAMBDA_FUNCTION] = request.function_name if hasattr(
        request, 'function_name') else None
    context[c.KEY_REQUEST_ID] = request.aws_request_id if hasattr(
        request, 'aws_request_id') else None
    stackid = os.environ[c.ENV_DEPLOYMENT_STACK_ARN]
    context[c.KEY_DB] = DynamoDb(context)
    context[c.KEY_ATHENA_QUERY] = Query(stackid)
    context[c.KEY_GLUE_CRAWLER] = Glue()
    thread_pool = ThreadPool(size=3)
    crawler_name = context[c.KEY_GLUE_CRAWLER].get_crawler_name(stackid)
    crawler = Crawler(context, os.environ[c.ENV_S3_STORAGE])
    glue = Glue()
    events = glue.get_events()

    # Walk the last two hours of partitions, hour by hour, for each event type
    # and stop at the first prefix that already exists in S3.
    start = datetime.datetime.utcnow() - datetime.timedelta(hours=2)
    now = datetime.datetime.utcnow()
    found = False
    for event_type in events:
        dt = start
        while dt <= now:
            prefix = metric_schema.s3_key_format().format(
                context[c.KEY_SEPERATOR_PARTITION], dt.year, dt.month, dt.day,
                dt.hour, event_type, dt.strftime(util.partition_date_format()))
            found = crawler.exists(prefix)
            if found:
                print("FOUND new events=>", prefix)
                break
            dt += datetime.timedelta(hours=1)
        if found:
            break

    # Only launch the Glue crawler when at least one new partition was found.
    if found:
        thread_pool.add(crawl, context, crawler_name,
                        context[c.KEY_ATHENA_QUERY].execute_with_format)
        thread_pool.wait()
    return custom_resource_response.success_response({}, "*")

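# Illustrative sketch only: the scan above enumerates the hour buckets from two
# hours ago up to now. Standalone, without the CloudGem imports, and with a
# hypothetical helper name, the same window can be produced like this:
def _recent_hour_buckets(hours_back=2):
    import datetime
    now = datetime.datetime.utcnow()
    dt = now - datetime.timedelta(hours=hours_back)
    buckets = []
    while dt <= now:
        buckets.append(dt.replace(minute=0, second=0, microsecond=0))
        dt += datetime.timedelta(hours=1)
    return buckets
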
def query(request, sql, sync=False):
    sql = sql["sql"]
    query = Query(os.environ[c.ENV_DEPLOYMENT_STACK_ARN])
    return query.execute(sql, sync=sync)

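# Example invocation (illustrative; the SQL text and the sync flag are
# hypothetical). The caller passes the SQL wrapped in a dict under the "sql" key:
#
#   query(request, {"sql": "SELECT 1"}, sync=True)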