def load_dimensions_from_druid(self):
    '''Query the distinct values of each filterable dimension from Druid and
    store them in self.dimension_map, sorted by display name.
    '''
    base_query = GroupByQueryBuilder(
        datasource=self.datasource.name,
        granularity='all',
        grouping_fields=[],
        intervals=INTERVAL,
        calculation=COUNT_CALCULATION,
    )
    for ordered_dimensions in list(self.filter_dimensions.values()):
        # Special case: meta-dimensions (like Nation) are prefixed with '_'
        # and are handled elsewhere - don't query them in druid.
        queryable_dimensions = [
            d for d in ordered_dimensions if d[0] != '_'
        ]
        for dimension in queryable_dimensions:
            dimensions = self.dimension_slices.get(dimension, [dimension])
            base_query.dimensions = dimensions
            base_query.query_filter = Dimension(dimension) != None
            LOG.info('Querying distinct %s from Druid...', dimensions)
            query_result = self.query_client.run_query(base_query)
            output_rows = []
            for row in query_result.result:
                event = row['event']
                output_row = dict(event)
                del output_row[COUNT_AGGREGATION_NAME]

                # Create a display version of this dimension that includes
                # the parent dimensions to help disambiguate dimension
                # values that are the same with a different hierarchy.
                dimension_display = event[dimension]
                num_dimensions = len(dimensions)
                if num_dimensions > 1:
                    # NOTE(ian): This logic matches logic used on the
                    # frontend in SelectFilter.jsx. Walk the parent
                    # dimensions from most to least specific, skipping the
                    # dimension itself (the last entry in `dimensions`) so
                    # its value is not repeated in its own suffix.
                    start = num_dimensions - 2
                    disambiguation = [
                        event[d] for d in dimensions[start::-1] if event[d]
                    ]
                    dimension_display = '%s (%s)' % (
                        dimension_display,
                        ', '.join(disambiguation),
                    )
                output_row[DISPLAY_FIELD] = dimension_display
                output_rows.append(output_row)
            self.dimension_map[dimension] = sorted(
                output_rows, key=lambda a: a[DISPLAY_FIELD])
            LOG.info('%s values loaded for dimension: %s',
                     len(output_rows), dimension)
    LOG.info('Done preloading dimension values.')
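# Illustration only: the display-label construction above, factored into a
# pure function. `dimensions` is assumed to be ordered least to most
# specific with the target dimension last (e.g.
# ['RegionName', 'ZoneName', 'WoredaName']); the names here are
# hypothetical and not part of the module above.
def _build_display_label(event, dimensions):
    label = event[dimensions[-1]]
    if len(dimensions) > 1:
        # Parent values, most specific first, skipping empty values.
        parents = [event[d] for d in dimensions[-2::-1] if event[d]]
        if parents:
            label = '%s (%s)' % (label, ', '.join(parents))
    return label

# Example: two woredas both named 'Central' disambiguate as
# 'Central (Zone A, Region 1)' and 'Central (Zone B, Region 2)'.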
def get_field_summary(self, field_id):
    druid_context = current_app.druid_context
    # Simulate building a query so we can access the query filter this field
    # would normally use.
    calculation = current_app.zen_config.aggregation_rules.get_calculation_for_fields(
        [field_id])
    interval = druid_context.data_time_boundary.get_full_time_interval()
    # HACK(ian): Setting granularity to month so that type=STOCK
    # aggregations are counted properly. This fails for
    # stock_granularity != month, but there are very few of those.
    query = GroupByQueryBuilder('', 'month', [], [interval], calculation)
    # TODO(stephen): These values will be underreported for time interval
    # aggregations. Fix this.
    time_boundary = druid_context.data_time_boundary.get_field_time_boundary(
        field_id, query.query_filter)
    total_count = druid_context.row_count_lookup.get_row_count(
        query.query_filter, field_id)
    human_readable_formula = self.get_human_readable_formula_html(field_id)
    if not time_boundary or not total_count:
        # No rows (or no time boundary) for this field: return an empty
        # summary that still carries the formula for display.
        return FieldSummary(
            field_id, 0, human_readable_formula=human_readable_formula)
    return FieldSummary(
        field_id,
        total_count,
        time_boundary['min'],
        time_boundary['max'],
        human_readable_formula=human_readable_formula,
    )
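# Illustration only: FieldSummary is constructed above positionally as
# FieldSummary(field_id, count, min_time, max_time) with an optional
# human_readable_formula keyword. A hypothetical stand-in matching those
# call sites (the real class lives elsewhere in the codebase):
import collections

FieldSummarySketch = collections.namedtuple(
    'FieldSummarySketch',
    ['field_id', 'count', 'min_time', 'max_time', 'human_readable_formula'],
)

def _empty_field_summary(field_id, formula=None):
    # Mirrors the early-return branch: no rows means count=0 and no time
    # boundary, but the formula is still attached for display.
    return FieldSummarySketch(field_id, 0, None, None, formula)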
def __init__(
    self,
    fields,
    start_date,
    end_date,
    dimensions,
    dimension_values,
    granularity='all',
    query_client=None,
    compute_field_calculation=None,
):
    self.fields = fields
    self.start_date = start_date
    self.end_date = end_date
    self.dimension_values = dimension_values
    if not compute_field_calculation:
        # Imported here, rather than at module scope, so this module can be
        # loaded without pulling in the config package.
        from config.aggregation_rules import get_calculation_for_fields

        compute_field_calculation = get_calculation_for_fields

    # Build the params needed for ValidationRow.
    intervals = [build_time_interval(self.start_date, self.end_date)]
    calculation = compute_field_calculation(fields)
    dimension_filter = GroupByQueryBuilder.build_dimension_filter(
        [dimension_values])
    super(FieldValidationRow, self).__init__(
        intervals,
        calculation,
        dimensions,
        dimension_filter,
        granularity,
        query_client,
        compute_field_calculation,
    )
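# Illustration only: Druid expects query intervals as ISO-8601 'start/end'
# strings. A helper like build_time_interval (used above) can be sketched
# as follows; the real implementation lives elsewhere in the codebase and
# may differ in how it handles inclusive vs. exclusive end dates.
import datetime

def _build_time_interval_sketch(start_date, end_date):
    # e.g. datetime.date(2020, 1, 1), datetime.date(2021, 1, 1)
    #      -> '2020-01-01/2021-01-01'
    return '%s/%s' % (start_date.isoformat(), end_date.isoformat())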
def _build_query(self): return GroupByQueryBuilder( datasource=self.datasource.name, granularity=self.granularity, grouping_fields=self.dimensions, intervals=[self.interval], calculation=self.calculation, dimension_filter=self.query_filter, )
def build_query(self, datasource_name): return GroupByQueryBuilder( datasource=datasource_name, granularity=self.granularity, grouping_fields=self.dimensions, intervals=self.intervals, calculation=self.calculation, dimension_filter=self.dimension_filter, optimize=True, )
def build_query(self): calculations = [ build_calculation(field) for field in self.request.fields ] return GroupByQueryBuilder( datasource=self.datasource.name, granularity=self.request.build_granularity(), grouping_fields=self.request.build_dimensions(), intervals=self.request.build_intervals(), calculation=CalculationMerger(calculations), dimension_filter=self.request.build_query_filter(), )
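# Illustration only: CalculationMerger (used above) combines the per-field
# calculations into a single set of Druid aggregations so the whole request
# runs as one groupby query. Conceptually, assuming each calculation exposes
# a dict of named aggregations (a sketch, not the real implementation):
def _merge_aggregations_sketch(calculations):
    merged = {}
    for calculation in calculations:
        for name, aggregation in calculation.aggregations.items():
            # Later fields must not silently overwrite earlier ones.
            assert name not in merged or merged[name] == aggregation, (
                'Conflicting aggregation name: %s' % name)
            merged[name] = aggregation
    return merged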
def to_druid_query(self, datasource):
    query_filter = self.build_query_filter()
    # Exclude the synthetic 'Nation' RegionName values from AQT results.
    # This hack only applies to the Ethiopia ('et') datasources.
    use_nation_hack = datasource.startswith('et')
    if use_nation_hack:
        query_filter &= DimensionFilter('RegionName') != 'Nation'
    return GroupByQueryBuilder(
        datasource=datasource,
        granularity=self.build_granularity(),
        grouping_fields=self.build_dimensions(),
        intervals=self.build_intervals(),
        calculation=self.build_calculation(),
        dimension_filter=query_filter,
        subtotal_dimensions=self.build_subtotal_dimensions(),
        subtotal_result_label=SUBTOTAL_RESULT_LABEL,
    )
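# Illustration only: the `&=` and `!=` used on filters above follow
# pydruid's operator overloads, assuming DimensionFilter is pydruid's
# Dimension (or a wrapper with the same interface). A minimal standalone
# example of composing filters this way:
from pydruid.utils.filters import Dimension

# A selector filter negated: rows whose RegionName is not 'Nation'...
base_filter = Dimension('RegionName') != 'Nation'
# ...AND-ed with a second condition into a compound filter.
combined = base_filter & (Dimension('ZoneName') != '')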
def load_ranges_from_druid(self):
    '''Return a dictionary mapping each data source name to a dict with
    MIN_TIME_FIELD and MAX_TIME_FIELD keys holding datetime objects.
    '''
    date_ranges = {}
    LOG.info('Querying time ranges of data from Druid...')
    aggregations = {
        MIN_TIME_FIELD: {
            'type': 'longMin',
            'fieldName': '__time',
        },
        MAX_TIME_FIELD: {
            'type': 'longMax',
            'fieldName': '__time',
        },
    }
    calculation = BaseCalculation(aggregations=aggregations)
    query = GroupByQueryBuilder(
        datasource=self.datasource.name,
        granularity='all',
        grouping_fields=[SOURCE_FIELD],
        intervals=INTERVAL,
        calculation=calculation,
    )
    query.query_filter &= Dimension(SOURCE_FIELD) != None
    query_result = self.query_client.run_query(query)
    for row in query_result.result:
        event = row['event']
        # Build {data_source: {MIN_TIME_FIELD: ..., MAX_TIME_FIELD: ...}}.
        date_ranges[event[SOURCE_FIELD]] = {
            MIN_TIME_FIELD:
                self.date_from_timestamp(event[MIN_TIME_FIELD]),
            MAX_TIME_FIELD:
                self.date_from_timestamp(event[MAX_TIME_FIELD]),
        }
    LOG.info('Done querying date ranges of data')
    return date_ranges
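# Illustration only: Druid's __time column is a millisecond epoch
# timestamp, so a date_from_timestamp helper like the one used above can be
# sketched as follows (hypothetical; the real helper lives on the class):
import datetime

def _date_from_timestamp_sketch(timestamp_ms):
    # e.g. 1577836800000 -> datetime.datetime(2020, 1, 1, 0, 0)
    return datetime.datetime.utcfromtimestamp(timestamp_ms / 1000.0)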
def get_no_date_filter_df(self):
    interval = (
        current_app.druid_context.data_time_boundary.get_full_time_interval()
    )
    # TODO(david): Update this when we work out a way of getting the first
    # report date for each geography without retrieving all report dates.
    # TODO(david): Work out a way of separating the existing time and
    # geographical filters so that the geo filters can be included here.
    # This will do for now: the only effect is when a dimension value is
    # split across several higher-level dimension values, e.g. a county
    # split across two regions whose parts have different first report
    # dates.
    earliest_report_query = GroupByQueryBuilder(
        self.datasource.name,
        'day',
        self.request.build_dimensions(),
        [interval],
        self.request.build_calculation(),
    )
    raw_df = self.query_client.run_query(
        earliest_report_query).export_pandas()
    return self.build_df(raw_df)
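# Illustration only: with the day-granularity result above exported to a
# pandas DataFrame, the first report date per dimension value can be
# derived with a groupby/min. Column names here are hypothetical; pydruid's
# export_pandas emits one row per (timestamp, dimension combination).
import pandas as pd

def _first_report_dates_sketch(df, dimension_columns,
                               timestamp_column='timestamp'):
    # One row per dimension combination, holding its earliest report date.
    return (
        df.groupby(dimension_columns)[timestamp_column]
        .min()
        .reset_index()
    )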
def run_query(self):
    '''Constructs and runs the Druid request for this query. The query is
    blocking.
    '''
    LOG.info('Running query...')
    # Filter the dimensions using the location filters passed in.
    dimension_filter = GroupByQueryBuilder.build_dimension_filter(
        self.location_filters
    )
    # AND the selected locations with the non-location filters requested.
    dimension_filter &= self.non_hierarchical_filter

    if self.geo_field:
        # Restrict query to non-null values for the given geo dimension.
        dimension_filter &= Dimension(self.geo_field) != ''

    # Slice by the selected granularity plus all fields less specific than
    # it. For example, if the user makes a Woreda query, we also want to
    # slice by Zone and Region.
    self.druid_slice_dimensions = self.get_slice_dimensions()
    if self.latitude_field and self.longitude_field:
        self.druid_geo_dimensions = [
            self.latitude_field,
            self.longitude_field,
        ]
    grouping_fields = self.druid_slice_dimensions + self.druid_geo_dimensions

    batches = []
    overall_interval = build_time_interval(self.start_date, self.end_date)
    for selected_granularity in self.selected_granularities:
        # Druid expects time intervals as a list.
        intervals = [overall_interval]
        granularity = current_app.zen_config.aggregation_rules.get_granularity_for_interval(
            selected_granularity, self.start_date, self.end_date
        )
        query = GroupByQueryBuilder(
            datasource=current_app.druid_context.current_datasource.name,
            granularity=granularity,
            grouping_fields=grouping_fields,
            intervals=intervals,
            calculation=self.calculation,
            dimension_filter=dimension_filter,
        )
        batch = QueryBatch(
            query,
            selected_granularity,
            self.geo_field,
            self.latitude_field,
            self.longitude_field,
            self.ordered_fields,
            self.denom,
            self.druid_slice_dimensions,
            self.query_client,
        )
        batches.append(batch)

    num_granularities = len(self.selected_granularities)
    if USE_THREAD_POOL and num_granularities > 1:
        # Run one batch per granularity in parallel.
        pool = ThreadPool(num_granularities)
        pool.map(QueryBatch.run, batches)
        pool.close()
        pool.join()
    else:
        for batch in batches:
            batch.run()

    self.batches = batches
    return True
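# Illustration only: the USE_THREAD_POOL branch above uses a thread pool
# because the Druid queries are I/O-bound, so threads let the HTTP requests
# overlap despite the GIL. A minimal standalone equivalent using the
# standard library (batch objects assumed to expose a run() method):
from multiprocessing.pool import ThreadPool

def _run_all_sketch(batches, use_thread_pool=True):
    if use_thread_pool and len(batches) > 1:
        pool = ThreadPool(len(batches))
        try:
            # Equivalent to calling batch.run() for each batch, in parallel.
            pool.map(lambda batch: batch.run(), batches)
        finally:
            pool.close()
            pool.join()
    else:
        for batch in batches:
            batch.run()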