def __init__( self, url="https://druid.broker.develop.otonomousmobility.com/", endpoint="druid/v2", datasource="mytaxi_gps_probes_index_parallel_v4", username=None, password=None, ): super().__init__("druid") self.url = url self.endpoint = endpoint self.datasource = datasource self.connector = PyDruid(url, endpoint) self.connector.set_basic_auth_credentials( username or os.environ["USERNAME"], password or os.environ["PASSWORD"]) interval = self.connector.time_boundary( datasource=self.datasource).result[0]["result"] self.interval = f'{interval["minTime"]}/{interval["maxTime"]}' self.default_query = { "datasource": self.datasource, "granularity": "all", "intervals": self.interval, "paging_spec": { "paging_identifiers": {}, "threshold": 100 }, }
class DruidLoader(BaseLoader):

    def __init__(
            self,
            url="https://druid.broker.develop.otonomousmobility.com/",
            endpoint="druid/v2",
            datasource="mytaxi_gps_probes_index_parallel_v4",
            username=None,
            password=None,
    ):
        super().__init__("druid")
        self.url = url
        self.endpoint = endpoint
        self.datasource = datasource
        self.connector = PyDruid(url, endpoint)
        self.connector.set_basic_auth_credentials(
            username or os.environ["USERNAME"],
            password or os.environ["PASSWORD"])
        interval = self.connector.time_boundary(
            datasource=self.datasource).result[0]["result"]
        self.interval = f'{interval["minTime"]}/{interval["maxTime"]}'
        self.default_query = {
            "datasource": self.datasource,
            "granularity": "all",
            "intervals": self.interval,
            "paging_spec": {
                "paging_identifiers": {},
                "threshold": 100
            },
        }

    def load(self, **kwargs):
        query = deepcopy(self.default_query)
        query.update(kwargs)
        for trace in self.connector.select(**query):
            probes = [
                Probe.from_druid(probe)
                for probe in trace["result"]["events"]
            ]
            yield Trace(probes, identifier=self._extract_booking_id(trace))

    @staticmethod
    def _extract_booking_id(trace):
        probe_groups = {
            k: len(list(v))
            for k, v in itertools.groupby(
                trace["result"]["events"],
                key=lambda event: event["event"]["bookingid"])
        }
        if len(probe_groups) > 1:
            raise ValueError(
                f"Trace has probes from different bookings: {probe_groups.keys()}"
            )
        return list(probe_groups.keys())[0]
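A minimal usage sketch for the loader above, assuming USERNAME/PASSWORD are set in the environment and the project's BaseLoader, Probe and Trace classes are importable; the interval override value is illustrative.

loader = DruidLoader()
# Any keyword argument overrides a key of the default select query,
# e.g. narrowing the interval (the value shown is an assumption).
for trace in loader.load(intervals="2021-01-01/2021-01-02"):
    print(trace)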
def get_client(self):
    conn = self.get_connection(self.druid_broker_conn_id)
    druid_client = PyDruid(
        url="{conn.schema}://{conn.host}".format(conn=conn),
        endpoint=conn.extra_dejson.get("endpoint", "druid/v2/"),
    )
    druid_client.set_basic_auth_credentials(
        username=conn.login,
        password=conn.password,
    )
    return druid_client
def query_druid():
    client = PyDruid(DRUID_URL, 'druid/v2')
    query = client.select(
        datasource='pageviews1',
        granularity='all',
        dimensions=["url", "user"],
        filter=Dimension('user') == 'ethan',
        paging_spec={"pagingIdentifiers": {}, "threshold": 5},
        intervals=["2016-07-08/2017-09-13"]
    )
    # print json.dumps(query.result, indent=2)
    return query.result
def get_conn(self):
    """
    Returns a druid connection object for query
    """
    conn = self.get_connection(self.druid_query_conn_id)
    return PyDruid(
        "http://{conn.host}:{conn.port}".format(**locals()),
        conn.extra_dejson.get('endpoint', ''))
def druid_simple_groupby(dimensions, filter_list=[], filter_type="and",
                         datasource=settings.DRUID_SPRAYDAY_DATASOURCE):
    """
    Inputs:
        dimensions => list of dimensions to group by
        filter_list => list of list of things to filter with e.g.
            filter_list=[['target_area_id', operator.ne, 1],
                         ['sprayable', operator.eq, "true"],
                         ['dimension', operator, "value"]])
        filter_type => type of Druid filter to perform
    """
    query = PyDruid(get_druid_broker_url(), 'druid/v2')
    params = dict(
        datasource=datasource,
        granularity='all',
        intervals=settings.DRUID_INTERVAL,
        limit_spec={
            "type": "default",
            "limit": 50000,
        }
    )
    params['dimensions'] = dimensions
    if filter_list:
        fields = []
        for this_filter in filter_list:
            compare_dim = filters.Dimension(this_filter[0])
            comparison_operator = this_filter[1]  # e.g. operator.eq
            compare_dim_value = this_filter[2]
            fields.append(comparison_operator(compare_dim, compare_dim_value))
        params['filter'] = filters.Filter(
            type=filter_type,
            fields=fields
        )
    try:
        request = query.groupby(**params)
    except OSError:
        pass
    else:
        return request.result
    return []
def test_cube_query(self):
    query = PyDruid("http://pipeline.qiniu.com", 'v2/stream/cubes/query')
    query.set_qiniu("", "")
    top = query.topn(
        datasource='domain_top_statics',
        granularity='all',
        intervals='2019-08-13/pt1h',
        aggregations={'count': doublesum('count')},
        metric='count',
        dimension='Country',
        threshold=10)
    df = query.export_pandas()
    print(df)
    top.export_tsv('top.tsv')
def get_druid_data(dimensions=None, filter_list=[], filter_type="and",
                   order_by=["target_area_name"],
                   datasource=settings.DRUID_SPRAYDAY_DATASOURCE):
    """
    Runs a query against Druid, returns data with metrics
    Inputs:
        dimensions => list of dimensions to group by
        filter_list => list of list of things to filter with e.g.
            filter_list=[['target_area_id', operator.ne, 1],
                         ['sprayable', operator.eq, "true"],
                         ['dimension', operator, "value"]])
        filter_type => type of Druid filter to perform,
        order_by => field(s) to order the data by
    """
    query = PyDruid(get_druid_broker_url(), 'druid/v2')
    params = dict(
        datasource=datasource,
        granularity='all',
        intervals=settings.DRUID_INTERVAL,
        aggregations={
            'num_not_sprayable': aggregators.filtered(
                filters.Filter(
                    type='and',
                    fields=[filters.Dimension('sprayable') == 'false']
                ),
                aggregators.longsum('count')
            ),
            'num_not_sprayed': aggregators.filtered(
                filters.Filter(
                    type='and',
                    fields=[filters.Dimension('sprayable') == 'true',
                            filters.Dimension('sprayed') ==
                            settings.MSPRAY_WAS_NOT_SPRAYED_VALUE]
                ),
                aggregators.longsum('count')
            ),
            'num_sprayed': aggregators.filtered(
                filters.Dimension('sprayed') ==
                settings.MSPRAY_WAS_SPRAYED_VALUE,
                aggregators.longsum('count')
            ),
            'num_new': aggregators.filtered(
                filters.Dimension('is_new') == 'true',
                aggregators.longsum('count')
            ),
            'num_new_no_duplicates': aggregators.filtered(
                filters.Filter(
                    type='and',
                    fields=[filters.Dimension('is_duplicate') == 'false',
                            filters.Dimension('is_new') == 'true']
                ),
                aggregators.longsum('count')
            ),
            'num_duplicate': aggregators.filtered(
                filters.Dimension('is_duplicate') == 'true',
                aggregators.longsum('count')
            ),
            'num_sprayed_no_duplicates': aggregators.filtered(
                filters.Filter(
                    type='and',
                    fields=[filters.Dimension('is_duplicate') == 'false',
                            filters.Dimension('sprayed') ==
                            settings.MSPRAY_WAS_SPRAYED_VALUE]
                ),
                aggregators.longsum('count')
            ),
            'num_not_sprayed_no_duplicates': aggregators.filtered(
                filters.Filter(
                    type='and',
                    fields=[filters.Dimension('is_duplicate') == 'false',
                            filters.Dimension('sprayable') == 'true',
                            filters.Dimension('sprayed') ==
                            settings.MSPRAY_WAS_NOT_SPRAYED_VALUE]
                ),
                aggregators.longsum('count')
            ),
            'num_sprayed_duplicates': aggregators.filtered(
                filters.Filter(
                    type='and',
                    fields=[filters.Dimension('is_duplicate') == 'true',
                            filters.Dimension('sprayable') == 'true',
                            filters.Dimension('sprayed') ==
                            settings.MSPRAY_WAS_SPRAYED_VALUE]
                ),
                aggregators.longsum('count')
            ),
            'num_not_sprayable_no_duplicates': aggregators.filtered(
                filters.Filter(
                    type='and',
                    fields=[filters.Dimension('is_duplicate') == 'false',
                            filters.Dimension('sprayable') == 'false']
                ),
                aggregators.longsum('count')
            ),
            'num_refused': aggregators.filtered(
                filters.Filter(
                    type='and',
                    fields=[filters.Dimension('is_duplicate') == 'false',
                            filters.Dimension('is_refused') == 'true',
                            filters.Dimension('sprayed') ==
                            settings.MSPRAY_WAS_NOT_SPRAYED_VALUE]
                ),
                aggregators.longsum('count')
            ),
        },
        post_aggregations={
            'num_found': Field('num_sprayed_no_duplicates') +
            Field('num_sprayed_duplicates') +
            Field('num_not_sprayed_no_duplicates')
        },
        limit_spec={
            "type": "default",
            "limit": 50000,
            "columns": order_by
        }
    )
    if filter_list:
        fields = []
        for this_filter in filter_list:
            compare_dim = filters.Dimension(this_filter[0])
            comparison_operator = this_filter[1]  # e.g. operator.eq
            compare_dim_value = this_filter[2]
            fields.append(comparison_operator(compare_dim, compare_dim_value))
        params['filter'] = filters.Filter(
            type=filter_type,
            fields=fields
        )
    if dimensions is None:
        params['dimensions'] = ['target_area_id', 'target_area_name',
                                'target_area_structures']
    else:
        params['dimensions'] = dimensions
    try:
        request = query.groupby(**params)
    except OSError:
        return []
    else:
        return request.result
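A hedged call sketch for get_druid_data, reusing the filter_list shape given in its own docstring; the grouping dimension name is an assumption.

import operator

# Group by spray operator code (dimension name is an assumption),
# excluding target area 1 and keeping only sprayable structures.
rows = get_druid_data(
    dimensions=['sprayoperator_code'],
    filter_list=[['target_area_id', operator.ne, 1],
                 ['sprayable', operator.eq, 'true']],
    filter_type='and')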
def create_client():
    return PyDruid("http://localhost:8083", "druid/v2/")
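A small sketch of querying through the client returned above; the 'wikipedia' datasource and interval are assumptions for illustration.

from pydruid.utils.aggregators import doublesum

client = create_client()
ts = client.timeseries(
    datasource='wikipedia',              # assumed datasource
    granularity='hour',
    intervals='2016-06-27/2016-06-28',   # assumed interval
    aggregations={'edits': doublesum('count')})
print(ts.result)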
class MeDruidHelper(object):
    """
    Market Events on Druid Helper
    Auxiliary class for working with Market Events in Druid
    """
    events_dir = 'G:/work'
    in_vm_dir = '/mnt/hgfs/G/work'

    def __init__(self):
        self.client = PyDruid(DRUID_BROKER_URL, 'druid/v2')

    @staticmethod
    def index_market_events(file_name, market_events):
        """
        Creates data file from list of market_events at location accessible
        to Druid and submits indexing task
        :type file_name: Union[str,unicode]
        :type market_events: list
        :param file_name: name of the data file
        :param market_events: list of events
        :return:
        """
        task_proto_path = base_path + '/market_event_indexing_task_proto.json'
        with open(task_proto_path) as fh:
            indexing_task_spec = json.loads(fh.read())
        if indexing_task_spec is None:
            raise DruidPocException(
                'unable to load indexing task proto from ' + task_proto_path)
        # model for indexing task is needed for production use
        indexing_task_spec['spec']['ioConfig']['inputSpec']['paths'] = \
            MeDruidHelper.in_vm_dir + '/' + file_name
        with open(MeDruidHelper.events_dir + '/' + file_name, 'w') as events_fh:
            for event in market_events:
                events_fh.write(json.dumps(vars(event), sort_keys=True) + '\n')
        MeDruidHelper.submit_synchronous_indexing_task(indexing_task_spec)

    @staticmethod
    def submit_synchronous_indexing_task(indexing_task_spec):
        submit_response = requests.post(
            OVERLORD_URL,
            headers={'Content-Type': 'application/json'},
            data=json.dumps(indexing_task_spec))
        if submit_response.status_code == 200 and submit_response.reason == 'OK':
            task_id = json.loads(submit_response.text)['task']
            tracking_url = '%s/%s/status' % (OVERLORD_URL, task_id)
            print 'Indexing should begin shortly. Tracking URL: %s' % tracking_url
            MeDruidHelper.track_indexing_task(task_id)
        else:
            print 'Failed submitting task, reason:' + submit_response.reason

    @staticmethod
    def track_indexing_task(task_id):
        tracking_url = '%s/%s/status' % (OVERLORD_URL, task_id)
        status_response = requests.get(tracking_url)
        print status_response.json()
        task_status = status_response.json()['status']['status']
        while status_response.status_code == 200 and \
                task_status not in ['SUCCESS', 'FAILED']:
            time.sleep(10)
            status_response = requests.get(tracking_url)
            task_status = status_response.json()['status']['status']
            print '[%d] %s - %s' % (status_response.status_code, task_status,
                                    status_response.json())

    @staticmethod
    def post_to_tranquility(record, table_name=TABLE_NAME):
        """
        used for streaming into Druid through tranquility
        :param record:
        :param table_name:
        :return:
        """
        payload = json.dumps(record.__dict__)
        print payload
        load_response = requests.post(
            url=TRANQUILITY_URL + '/' + table_name,
            headers={'Content-Type': 'application/json'},
            data=payload)
        print "[%d] %s\n" % (load_response.status_code, load_response.text)

    @staticmethod
    def shutdown_streaming_task(task_id):
        task_shutdown_url = '%s/%s/shutdown' % (OVERLORD_URL, task_id)
        response = requests.post(task_shutdown_url)
        print '[%d] %s' % (response.status_code, response.json())

    def select_one_market_event(self, product_name):
        query = self.client.select(
            datasource=TABLE_NAME,
            granularity='all',
            dimensions=['product_name'],
            filter=Dimension('product_name') == product_name,
            paging_spec={"pagingIdentifiers": {}, "threshold": 1},
            intervals=["2016-07-08/2017-09-13"]
        )
        events = [segment_result['result']['events']
                  for segment_result in query.result]
        if len(events) >= 1:
            return events[0]
        return []

    def positions_delta(self, product_name, min_num_employees, start_dt, end_dt):
        """
        :type product_name: Union[str,unicode]
        :type min_num_employees: int
        :type start_dt: datetime
        :type end_dt: datetime
        """
        query = self.client.timeseries(
            datasource=TABLE_NAME,
            granularity='month',
            intervals=[start_dt.strftime(YMD_FORMAT) + '/' +
                       end_dt.strftime(YMD_FORMAT)],
            filter=((Dimension('product_name') == product_name) &
                    (Dimension('customer_num_employees') > min_num_employees)),
            aggregations={"qty": doublesum("qty")},
        )
        print query.result
        delta = 0
        for item in query.result:
            delta += item['result']['qty']
        return delta

    @staticmethod
    def yesterday():
        return (datetime.now() - timedelta(days=1)).strftime(YMD_FORMAT)
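A brief usage sketch for the helper above, assuming the module-level constants (DRUID_BROKER_URL, TABLE_NAME, OVERLORD_URL, and so on) are configured; the product name is hypothetical.

helper = MeDruidHelper()
events = helper.select_one_market_event('ProductX')  # hypothetical product
print(events)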
def init_druid():
    fraud_druid = PyDruid(cm.FRAUD_DRUID_URL, cm.FRAUD_DRUID_PATH)
    fraud_druid.set_basic_auth_credentials(cm.FRAUD_DRUID_USER,
                                           cm.FRAUD_DRUID_PASS)
    return fraud_druid
class DruidAccessLayer(object):
    timeseries_granularities = ['none', 'second', 'minute', 'fifteen_minute',
                                'thirty_minute', 'hour', 'day', 'week',
                                'month', 'quarter', 'year']
    select_granularities = ['all', 'second', 'minute', 'fifteen_minute',
                            'thirty_minute', 'hour', 'day', 'week',
                            'month', 'quarter', 'year']

    def __init__(self):
        self.connection = None
        self.plyql = None

    def connect(self, uri):
        self.connection = PyDruid('http://{0}'.format(uri), 'druid/v2/')
        self.plyql = PlyQL(uri)
        try:
            tables = self.tables()
            if {'Tables_in_database': 'supervisor'} not in tables:
                raise Exception('Druid connection error: missing '
                                '"supervisor" table')
        except Exception:
            raise Exception('Druid connection error: {0}'.format(uri))

    def __validate_granularity__(self, granularity, supported_granularities):
        if granularity in supported_granularities:
            query_granularity = granularity
        elif validators.duration(granularity):
            query_granularity = {'type': 'period', 'period': granularity}
        else:
            raise ValueError(
                'Unsupported granularity "{0}"'.format(granularity))
        return query_granularity

    def __validate_intervals__(self, intervals):
        if not validators.interval(intervals):
            raise ValueError('Unsupported interval "{0}"'.format(intervals))
        return intervals

    def tables(self):
        return self.plyql.query('SHOW TABLES')

    def processes(self, agent_id, period='P6W'):
        return self.plyql.query('SELECT process_name AS process, '
                                'COUNT() AS count, MAX(__time) AS time '
                                'FROM supervisor WHERE agent_id = "{0}" '
                                'GROUP BY process_name;'
                                .format(agent_id), period)

    def timeseries(self, agent_id, process_name, granularity='none',
                   intervals='P6W', descending=False):
        query_granularity = self.__validate_granularity__(
            granularity, self.timeseries_granularities)
        intervals = self.__validate_intervals__(intervals)
        return self.connection.timeseries(
            datasource='supervisor',
            granularity=query_granularity,
            descending=descending,
            intervals=intervals,
            aggregations={'cpu': doublemax('cpu'), 'mem': longmax('mem')},
            context={'skipEmptyBuckets': 'true'},
            filter=(Dimension('agent_id') == agent_id) &
                   (Dimension('process_name') == process_name))

    def select(self, agent_id, process_name, granularity='all',
               intervals='P6W', descending=True):
        query_granularity = self.__validate_granularity__(
            granularity, self.select_granularities)
        intervals = self.__validate_intervals__(intervals)
        return self.connection.select(
            datasource='supervisor',
            granularity=query_granularity,
            intervals=intervals,
            descending=descending,
            dimensions=['process_name'],
            metrics=['cpu', 'mem'],
            filter=(Dimension('agent_id') == agent_id) &
                   (Dimension('process_name') == process_name),
            paging_spec={'pagingIdentifiers': {}, "threshold": 1}
        )
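A wiring sketch for DruidAccessLayer, assuming a broker reachable at localhost:8082 that serves the 'supervisor' datasource this class queries; the agent and process names are hypothetical.

dal = DruidAccessLayer()
dal.connect('localhost:8082')          # assumed broker host:port
cpu_mem = dal.timeseries('agent-01', 'nginx', granularity='hour')
print(cpu_mem.result)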
class DruidManager(object):
    """Manager for Druid engine."""

    __metaclass__ = Singleton

    def __init__(self):
        super(DruidManager, self).__init__()
        self._proto = DBACCESS['druid']['data']['protocol']
        self._host = DBACCESS['druid']['data']['host']
        self._port = DBACCESS['druid']['data']['port']
        self._url_root_path = DBACCESS['druid']['data']['url_root_path']
        self._qtimeout = DBACCESS['druid']['data']['query_timeout']
        self._qlimit = DBACCESS['druid']['data']['query_limit']
        self._granularities = {'second': 'PT1S', 'minute': 'PT1M',
                               'fifteen_minute': 'PT15M',
                               'thirty_minute': 'PT30M', 'hour': 'PT1H',
                               'day': 'P1D', 'week': 'P1W', 'month': 'P1M',
                               'quarter': 'P3M', 'year': 'P1Y'}
        self._urlconn = self._proto + '://' + self._host + ':' + \
            self._port.__str__()
        # Druid connection:
        self._query = PyDruid(self._urlconn, self._url_root_path)
        # For console debugging:
        if DEBUG == True:
            print 'Druid instance created.'

    #-----------------------------------------------------
    def sql_to_pydruid(self, dicc):
        """ SQL to JSON converter. """
        params = dict()
        # Datasource:
        if 'FROM' in dicc:
            params['datasource'] = dicc['FROM']
        else:
            raise QueryError('Undefined datasource (from)')
        # Define query interval and granularity:
        if 'QINTERVAL' in dicc:
            params['intervals'] = dicc['QINTERVAL']
        else:
            raise QueryError('Undefined query time interval (qinterval)')
        if 'GRANULARITY' in dicc:
            params['granularity'] = dicc['GRANULARITY'].lower()
        else:
            raise QueryError('Undefined time granularity (granularity)')
        # Parse GROUP BY clause: grouping.
        # >> If no GROUP BY clause is defined, the query will be processed
        # >> as a timeseries query.
        if 'GROUP BY' in dicc:
            params['dimensions'] = dicc['GROUP BY']
        # Parse ORDER BY clause for TopN queries:
        if 'ORDER BY' in dicc:
            params['metric'] = re.sub(r'\s(DE|A)SC', '', dicc['ORDER BY'])
        # Parse LIMIT clause for TopN queries:
        if 'LIMIT' in dicc:
            params['threshold'] = dicc['LIMIT']
        else:
            params['threshold'] = self._qlimit
        # Parse WHERE clause (always an optional clause):
        if 'WHERE' in dicc:
            clauses_ = re.sub(
                r'(?P<id>[\w.]+)\s?(?P<op>[<>]=?)\s?'
                r'(?P<value>[\w.-]+|[\'\"].*?[\'\"])',
                '(getattr(filters,\'JavaScript\')(\'\g<id>\') = '
                '\"function(v) { return v \g<op> \g<value> }\")',
                dicc['WHERE'], re.M|re.S)
            clauses_ = re.sub(
                r'(?P<id>[\w.]+)\s?(?P<op>\!=|=)\s?'
                '(?P<value>[\w.-]+|[\'\"].*?[\'\"])',
                '(getattr(filters,\'Dimension\')(\'\g<id>\') \g<op> \g<value>)',
                clauses_, re.M|re.S)
            clauses = re.sub(r'[^<>!]=', ' ==', clauses_, re.M|re.S)
            clauses = re.sub(r'AND', '&', clauses, re.M|re.S)
            params['filter'] = re.sub(r'OR', '|', clauses, re.M|re.S)
            if DEBUG:
                print 'WHERE: ', params['filter']
        else:
            params['filter'] = ''
        # Parse SELECT aggregations and match them with GROUP BY:
        params['aggregations'] = dict()
        params['post_aggregations'] = dict()

        # Function to use in re.sub, in order to parse (post)aggregation
        # operations and fill in the 'aggregations' and 'post_aggregations'
        # dictionaries.
        # The function output is captured by re.sub but is not used.
        def repl_agg(m):
            """ Auxiliary function to use in the re.sub method. """
            op = {'SUM': 'doublesum', 'MIN': 'min', 'MAX': 'max',
                  'COUNT': 'count'}
            post_op = ('AVG', )
            if m.group(4) != None:
                name = 'alias'
            else:
                name = 'value'
            # Standard aggregation operators:
            if m.group('op') in op.keys():
                # rep = '"' + m.group(name) + '": ' + op[m.group('op')] + \
                #     '("' + m.group('value') + '")'
                rep = 'aggregator'
                params['aggregations'][m.group(name)] = \
                    getattr(aggregators, op[m.group('op')])(m.group('value'))
            # Advanced aggregation operators -> postaggregations:
            elif m.group('op') in post_op:
                op1 = 'op1_' + m.group('value')
                op2 = 'op2_' + m.group('value')
                params['aggregations'][op1] = \
                    getattr(aggregators, op['SUM'])(m.group('value'))
                params['aggregations'][op2] = \
                    getattr(aggregators, op['COUNT'])(m.group('value'))
                ## AVG operator case:
                if m.group('op') == 'AVG':
                    params['post_aggregations'][m.group(name)] = \
                        getattr(postaggregator, 'Field')(op1) / \
                        getattr(postaggregator, 'Field')(op2)
                ## support for other postagg operators..
                rep = 'postaggregator'
            # Unknown aggregation operation:
            else:
                raise QueryError('Unknown aggregation operator.')
            return rep

        clauses = re.sub(
            r'((?P<op>[A-Z]+?)\s*?\(\s*?(?P<value>[\w.-]+)\s*?\))'
            r'(\s*?(?P<as>AS)\s*?(?P<alias>[\w.-]+))?',
            repl_agg, dicc['SELECT'], re.M|re.S)
        if DEBUG:
            print 'Aggregations: ', params['aggregations']
            print 'Postaggregations: ', params['post_aggregations']
        # TODO: HAVING clause (grouping conditions)
        return params

    #-----------------------------------------------------
    def timeseries(self, params):
        """
        Wrapper for pydruid Timeseries query that returns a JSON string
        containing the query result.
        """
        query_id = "tseries-" + time.time().__str__()
        if DEBUG:
            print "Query-ID:", query_id
        result = self._query.timeseries(
            datasource=params['datasource'],
            granularity=params['granularity'],
            intervals=params['intervals'],
            aggregations=params['aggregations'],
            post_aggregations=params['post_aggregations'],
            filter=eval(params['filter']),
            context={"timeout": self._qtimeout, "queryId": query_id}
        )
        return result.result_json

    #-----------------------------------------------------
    def topn(self, params):
        """
        Wrapper for pydruid TopN query that returns a JSON string
        containing the query result.
        """
        # Current time in UNIX timestamp format to build the query-id.
        query_id = "topn-" + time.time().__str__()
        if DEBUG:
            print "Query-ID:", query_id
        result = self._query.topn(
            datasource=params['datasource'],
            granularity=params['granularity'],
            intervals=params['intervals'],
            aggregations=params['aggregations'],
            post_aggregations=params['post_aggregations'],
            filter=eval(params['filter']),
            dimension=params['dimensions'],
            metric=params['metric'],
            threshold=params['threshold'],
            context={"timeout": self._qtimeout, "queryId": query_id}
        )
        return result.result_json

    #-----------------------------------------------------
    def groupby(self, params):
        """
        Wrapper for pydruid GroupBy query that returns a JSON string
        containing the query result.
        """
        query_id = "gby-" + time.time().__str__()
        if DEBUG:
            print "Query-ID:", query_id
        result = self._query.groupby(
            datasource=params['datasource'],
            granularity=params['granularity'],
            intervals=params['intervals'],
            aggregations=params['aggregations'],
            post_aggregations=params['post_aggregations'],
            filter=eval(params['filter']),
            dimensions=params['dimensions'],
            context={"timeout": self._qtimeout, "queryId": query_id}
        )
        return result.result_json

    #-----------------------------------------------------
    def nested_topn(self, params, dim_ord):
        """
        Nested Top-N query: alternative to an expensive groupby.
        -- An accuracy error can exist due to the pruning of intermediate
        results from previous queries.

        Args:
            :params: dictionary with SQL clauses as keys and values from the
                original SQL query.
            :dim_ord: list that contains the dimensions in the order that the
                grouping queries must be performed. Currently, only 2
                dimensions are supported.

        Operation steps:
            0) N = query threshold = value of the LIMIT SQL clause
            1) Execute Top-N over dimension-1, aggregating over metrics.
            2) Execute 'N' x Top-N/2 over dimension-2, filtering by results
               from 1), aggregating over metrics again.

        Returns a list of dictionaries {timestamp: ts, result: res}, where
        'res' is a list of dictionaries in the format
        {dim1: val1, dim2: val2, metric_aggs: val}.
        """
        th_l1 = params['threshold']
        th_l2 = ceil(float(th_l1) / 2)
        dim1 = dim_ord[0]
        dim2 = dim_ord[1]
        # Initial query: TopN over the first dimension
        query_id = "nestopn-" + time.time().__str__()
        if DEBUG:
            print "Query-ID:", query_id
        res = self._query.topn(
            datasource=params['datasource'],
            granularity=params['granularity'],
            intervals=params['intervals'],
            aggregations=params['aggregations'],
            post_aggregations=params['post_aggregations'],
            filter=eval(params['filter']),
            dimension=dim1,
            metric=params['metric'],
            threshold=th_l1,
            context={"timeout": self._qtimeout, "queryId": query_id}
        )
        if DEBUG:
            print 'L1 query result: ', res.result_json
        # Load the JSON string as a Python object, transforming unicode
        # strings to UTF-8 format.
        qresult = json_loads_byteified(res.result_json)
        # Make a dictionary where the keys are the intervals (a function of
        # the query granularity), and the values are the TopN values of the
        # first dimension, "dim1", for each interval.
        dic_dim1 = dict()
        # qresult is a list of dictionaries:
        for qres in qresult:
            interval = qres['timestamp']
            list_dim1 = list()
            for elem in qres['result']:
                list_dim1.append(elem[dim1])
            dic_dim1.update({interval: list_dim1})
        # List for the final result
        result = list()
        # Then, launch N queries per interval, one for each value of
        # dimension-1 in the interval. The queries are TopN too, but the
        # dimension will be dimension-2, filtering by dimension-1 every time.
        for interval, values_dim1 in dic_dim1.items():
            if DEBUG:
                print 'L2 query interval: ', interval
            l2_result = list()
            for val_dim1 in values_dim1:
                query_id = "nestopn-" + time.time().__str__()
                if DEBUG:
                    print "Query-ID:", query_id
                # TopN query: dimension = dim2, threshold = th_l2
                res = self._query.topn(
                    datasource=params['datasource'],
                    granularity='all',
                    intervals=interval + '/' +
                        self._granularities[params['granularity']],
                    aggregations=params['aggregations'],
                    post_aggregations=params['post_aggregations'],
                    filter=(Dimension(dim1) == val_dim1) &
                           (eval(params['filter'])),
                    dimension=dim2,
                    metric=params['metric'],
                    threshold=th_l2,
                    context={"timeout": self._qtimeout, "queryId": query_id}
                )
                if DEBUG:
                    print 'L2 query result, dim1 = ', val_dim1
                    print res.result_json
                # Load the JSON string as a Python object, transforming
                # unicode strings to UTF-8 format.
                qresult = json_loads_byteified(res.result_json)
                for qres in qresult:
                    for elem in qres['result']:
                        elem_merged = elem.copy()
                        elem_merged.update({dim1: val_dim1})
                        l2_result.append(elem_merged)
            result.append({"timestamp": interval, "result": l2_result})
        return result

    #-----------------------------------------------------
    def query(self, qtype, statement):
        """
        Two types of queries: JSON (native format) or SQL.
        An SQL statement looks like a dictionary where the keys are the SQL
        clause keywords and the values are the companion identifiers of the
        SQL keywords.
        """
        if qtype == 'json':
            # Native query for tests using the 'requests' module
            # TODO
            pass
        elif qtype == 'sql':
            # The SQL query will be translated to parameters PyDruid
            # understands. SQL statements become a list of dictionaries with
            # SQL clauses as keys, one query per dicc in the dicc-list:
            parser = SQLP()
            dicc = parser.parse(statement)
            # Map SQL clauses to Druid query parameters:
            try:
                params = self.sql_to_pydruid(dicc)
                # The length of the dimension list determines the type of
                # query:
                #
                #   - Timeseries: len(dimensions) = 0
                #   - TopN: len(dimensions) = 1
                #   - Nested topN: len(dimensions) = 2 (currently)
                #   - GroupBy (nested topN): len(dimensions) = 3..N
                if 'dimensions' in params:
                    dimensions = list(params['dimensions'].split(','))
                    num_dim = dimensions.__len__()
                else:
                    num_dim = 0
                # Current time in UNIX timestamp format
                ts = time.time().__str__()
                # -- Timeseries query --
                if num_dim == 0:
                    result = self.timeseries(params)
                # -- Basic Top-N query --
                elif num_dim == 1:
                    result = self.topn(params)
                # -- Nested Top-N query: alternative to expensive groupby --
                #
                # TODO: add support for 3 dimensions
                # elif num_dim in (2, 3):
                #
                elif num_dim == 2:
                    # Operation steps:
                    # 0) N = query threshold = value of the LIMIT SQL clause
                    # 1) Execute Top-N over dimension-1, aggregating over
                    #    metrics.
                    # 2) Execute 'N' x Top-N/2 over dimension-2, filtering by
                    #    results from 1), aggregating over metrics again.
                    #
                    # >> Steps 1) and 2) are both implemented in nested_topn
                    #
                    # 3) Repeat 1) and 2) swapping dimension-1 and
                    #    dimension-2. This is carried out by means of another
                    #    call to nested_topn with a different order of
                    #    dimensions.
                    #
                    # 4) Group results from 2) and 3), and return the
                    #    greatest N results based on the metrics aggregation.
                    dim_ord = [dimensions[0].strip(), dimensions[1].strip()]
                    result_1 = self.nested_topn(params, dim_ord)
                    dim_ord = [dimensions[1].strip(), dimensions[0].strip()]
                    result_2 = self.nested_topn(params, dim_ord)
                    # Merge result_1 and result_2, order results by the
                    # aggregation metric and discard rows beyond the value of
                    # the LIMIT clause.
                    result_1.extend(result_2)
                    # Merge results from both lists grouping by timestamp:
                    merged = dict()
                    for item in result_1 + result_2:
                        if item['timestamp'] in merged:
                            merged[item['timestamp']] = \
                                merged[item['timestamp']] + item['result']
                        else:
                            # item['result'] is a list of dictionaries
                            merged[item['timestamp']] = item['result']
                    # Delete duplicates and order by metric: a list is an
                    # ordered collection, whereas a set is an unordered and
                    # unique container. A tuple is an immutable list.
                    # It is necessary to transform the original dictionary
                    # into a set to delete duplicates, and order the elements
                    # before assigning the final dict to a particular
                    # timestamp key.
                    result = [
                        {'timestamp': k,
                         'result': sorted(
                             [dict(y) for y in
                              set(tuple(x.items()) for x in v)],
                             key=lambda k: k[params['metric']],
                             reverse=True)}
                        for k, v in merged.items()]
                    # Finally, order the dictionary list by timestamp:
                    result = json.dumps(
                        sorted(result, key=lambda k: k['timestamp']))
                # -- GroupBy query (no limit over results) --
                else:
                    result = self.groupby(params)
            except Exception as err:
                # Re-raise the exception to manage it in the main program:
                if DEBUG:
                    import traceback
                    traceback.print_exc()
                # Re-raise the last exception:
                raise
        else:
            # result = 'Query type not recognized.'
            pass
        return result

    #-----------------------------------------------------


# ----------------------------------------------------------------------------
# Pydruid query examples:
# ----------------------------------------------------------------------------
# ts = query.timeseries(
#     datasource='twitterstream',
#     granularity='day',
#     intervals='2014-02-02/p4w',
#     aggregations={'length': doublesum('tweet_length'),
#                   'count': doublesum('count')},
#     post_aggregations={'avg_tweet_length': (Field('length') / Field('count'))},
#     filter=Dimension('first_hashtag') == 'sochi2014'
# )
# top = query.topn(
#     datasource='twitterstream',
#     granularity='all',
#     intervals='2014-03-03/p1d',  # utc time of 2014 oscars
#     aggregations={'count': doublesum('count')},
#*#  dimension='user_mention_name',
#     filter=(Dimension('user_lang') == 'en') &
#            (Dimension('first_hashtag') == 'oscars') &
#            (Dimension('user_time_zone') == 'Pacific Time (US & Canada)') &
#            ~(Dimension('user_mention_name') == 'No Mention'),
#*#  metric='count',
#*#  threshold=10
# )
# group = query.groupby(
#     datasource='twitterstream',
#     granularity='hour',
#     intervals='2013-10-04/pt12h',
#*#  dimensions=["user_name", "reply_to_name"],
#     filter=(~(Dimension("reply_to_name") == "Not A Reply")) &
#            (Dimension("user_location") == "California"),
#     aggregations={"count": doublesum("count")}
# )
def get_pydruid_client(self):
    cli = PyDruid(
        "http://{0}:{1}/".format(self.broker_host, self.broker_port),
        self.broker_endpoint)
    return cli
def __init__(self, address, port=8082):
    url = f"http://{address}:{port}"
    self.async_client = AsyncPyDruid(url, 'druid/v2/')
    self.client = PyDruid(url, 'druid/v2/')
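A sketch of using the two handles created above; `clients` stands for an instance of the (unnamed) enclosing class, and the datasource and column names are assumptions. pydruid's AsyncPyDruid query methods are awaitable coroutines.

from pydruid.utils.aggregators import longsum

async def hourly_totals(clients, interval):
    # Same query shape as the blocking client, but through the
    # non-blocking handle ('metrics' datasource and 'value' column
    # are assumptions for illustration).
    query = await clients.async_client.timeseries(
        datasource='metrics',
        granularity='hour',
        intervals=interval,
        aggregations={'total': longsum('value')})
    return query.result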