def sync_to_db(cls, name, cluster, merge):
    """Fetches metadata for that datasource and merges the Superset db

    Ensures a datasource row exists for ``name``, then pulls the latest
    segment's column metadata and creates/updates a ``DruidColumn`` row
    per column, flagging capabilities (groupby/filterable/aggregations)
    from the Druid datatype and regenerating each column's metrics.

    :param name: Druid datasource name to sync
    :param cluster: the DruidCluster this datasource belongs to
    :param merge: stored on the datasource's ``merge_flag``
    """
    # Lazy %-style args: the message is only formatted if the level is on.
    logging.info("Syncing Druid datasource [%s]", name)
    session = get_session()
    datasource = session.query(cls).filter_by(datasource_name=name).first()
    if not datasource:
        datasource = cls(datasource_name=name)
        session.add(datasource)
        flasher("Adding new datasource [{}]".format(name), "success")
    else:
        flasher("Refreshing datasource [{}]".format(name), "info")
    session.flush()
    datasource.cluster = cluster
    datasource.merge_flag = merge
    session.flush()

    cols = datasource.latest_metadata()
    if not cols:
        logging.error("Failed at fetching the latest segment")
        return
    for col in cols:
        # Skip the time column
        if col == "__time":
            continue
        col_obj = (
            session
            .query(DruidColumn)
            .filter_by(datasource_name=name, column_name=col)
            .first()
        )
        datatype = cols[col]['type']
        if not col_obj:
            col_obj = DruidColumn(datasource_name=name, column_name=col)
            session.add(col_obj)
        if datatype == "STRING":
            col_obj.groupby = True
            col_obj.filterable = True
        if datatype in ("hyperUnique", "thetaSketch"):
            col_obj.count_distinct = True
        # If long or double, allow sum/min/max
        if datatype in ("LONG", "DOUBLE"):
            col_obj.sum = True
            col_obj.min = True
            col_obj.max = True
        # col_obj is always bound by here (fetched or freshly created),
        # so no guard is needed; reuse the datatype already looked up.
        col_obj.type = datatype
        session.flush()
        col_obj.datasource = datasource
        col_obj.generate_metrics()
        session.flush()
def generate_metrics(self):
    """Generate metrics based on the column metadata

    Builds a COUNT(*) metric plus, per the column's capability flags
    (``sum``/``avg``/``min``/``max``/``count_distinct``), the matching
    Druid aggregator metrics, then persists any metric not already
    present in the Superset db for this datasource.
    """
    M = DruidMetric  # noqa
    metrics = [DruidMetric(
        metric_name='count',
        verbose_name='COUNT(*)',
        metric_type='count',
        json=json.dumps({'type': 'count', 'name': 'count'}))]
    # Somehow we need to reassign this for UDAFs
    if self.type in ('DOUBLE', 'FLOAT'):
        corrected_type = 'DOUBLE'
    else:
        corrected_type = self.type

    # sum/avg/min/max metrics all share the same shape; build them in
    # one loop instead of four copy-pasted branches.
    for enabled, agg in (
            (self.sum, 'sum'),
            (self.avg, 'avg'),
            (self.min, 'min'),
            (self.max, 'max')):
        if not (enabled and self.is_num):
            continue
        mt = corrected_type.lower() + agg.capitalize()  # e.g. doubleSum
        name = agg + '__' + self.column_name
        metrics.append(DruidMetric(
            metric_name=name,
            metric_type=agg,
            verbose_name='{}({})'.format(agg.upper(), self.column_name),
            json=json.dumps({
                'type': mt,
                'name': name,
                'fieldName': self.column_name})))

    if self.count_distinct:
        name = 'count_distinct__' + self.column_name
        if self.type in ('hyperUnique', 'thetaSketch'):
            # These Druid column types embed their own approximate
            # distinct-count aggregator; reuse the type directly.
            metrics.append(DruidMetric(
                metric_name=name,
                verbose_name='COUNT(DISTINCT {})'.format(self.column_name),
                metric_type=self.type,
                json=json.dumps({
                    'type': self.type,
                    'name': name,
                    'fieldName': self.column_name})))
        else:
            # Plain columns fall back to Druid's cardinality aggregator.
            # (Removed a dead `mt = 'count_distinct'` local here.)
            metrics.append(DruidMetric(
                metric_name=name,
                verbose_name='COUNT(DISTINCT {})'.format(self.column_name),
                metric_type='count_distinct',
                json=json.dumps({
                    'type': 'cardinality',
                    'name': name,
                    'fieldNames': [self.column_name]})))

    session = get_session()
    new_metrics = []
    for metric in metrics:
        m = (
            session.query(M)
            .filter(M.metric_name == metric.metric_name)
            .filter(M.datasource_name == self.datasource_name)
            # NOTE(review): DruidCluster is filtered without being
            # joined, so this relies on an implicit cross join —
            # presumably a join through the datasource's cluster is
            # intended; confirm before changing query semantics.
            .filter(DruidCluster.cluster_name ==
                    self.datasource.cluster_name)
            .first())
        metric.datasource_name = self.datasource_name
        if not m:
            new_metrics.append(metric)
            session.add(metric)
            session.flush()
def generate_metrics(self):
    """Generate metrics based on the column metadata"""
    M = DruidMetric  # noqa
    # UDAF types are folded onto DOUBLE for aggregator naming.
    corrected_type = 'DOUBLE' if self.type in ('DOUBLE', 'FLOAT') else self.type

    def simple_metric(agg):
        # One sum/avg/min/max metric for this column, e.g. the 'sum'
        # aggregation yields metric 'sum__<col>' of Druid type
        # '<type>Sum' reading field <col>.
        metric_name = '{}__{}'.format(agg, self.column_name)
        return DruidMetric(
            metric_name=metric_name,
            metric_type=agg,
            verbose_name='{}({})'.format(agg.upper(), self.column_name),
            json=json.dumps({
                'type': corrected_type.lower() + agg.capitalize(),
                'name': metric_name,
                'fieldName': self.column_name}))

    # COUNT(*) is always produced.
    metrics = [DruidMetric(
        metric_name='count',
        verbose_name='COUNT(*)',
        metric_type='count',
        json=json.dumps({'type': 'count', 'name': 'count'}))]

    if self.is_num:
        for enabled, agg in (
                (self.sum, 'sum'), (self.avg, 'avg'),
                (self.min, 'min'), (self.max, 'max')):
            if enabled:
                metrics.append(simple_metric(agg))

    if self.count_distinct:
        distinct_name = 'count_distinct__' + self.column_name
        verbose = 'COUNT(DISTINCT {})'.format(self.column_name)
        if self.type in ('hyperUnique', 'thetaSketch'):
            # Sketch-typed columns use their own aggregator type.
            metrics.append(DruidMetric(
                metric_name=distinct_name,
                verbose_name=verbose,
                metric_type=self.type,
                json=json.dumps({
                    'type': self.type,
                    'name': distinct_name,
                    'fieldName': self.column_name})))
        else:
            # Anything else goes through the cardinality aggregator.
            metrics.append(DruidMetric(
                metric_name=distinct_name,
                verbose_name=verbose,
                metric_type='count_distinct',
                json=json.dumps({
                    'type': 'cardinality',
                    'name': distinct_name,
                    'fieldNames': [self.column_name]})))

    # Persist only metrics that don't already exist for this datasource.
    session = get_session()
    new_metrics = []
    for metric in metrics:
        existing = (
            session.query(M)
            .filter(M.metric_name == metric.metric_name)
            .filter(M.datasource_name == self.datasource_name)
            .filter(DruidCluster.cluster_name ==
                    self.datasource.cluster_name)
            .first())
        metric.datasource_name = self.datasource_name
        if not existing:
            new_metrics.append(metric)
            session.add(metric)
            session.flush()