def _get_sq_field_for_blender_field(field, queries, field_maps, reference=None):
    unmodified_field = find_field_in_modified_field(field)
    field_alias = alias_selector(reference_type_alias(field, reference))

    # Search for the field in each field map to determine which subquery it will be in
    for query, field_map in zip(queries, field_maps):
        if query is None or unmodified_field not in field_map:
            continue

        mapped_field = field_map[unmodified_field]
        mapped_field_alias = alias_selector(reference_type_alias(mapped_field, reference))

        subquery_field = query[mapped_field_alias]
        # Case #1: modified fields, e.g. day(timestamp) or rollup(dimension)
        return field.for_(subquery_field).as_(field_alias)

    # The metrics need to be copied when there are references so that the `get_sql`
    # monkey patch does not conflict. Since some of them might have nested metrics
    # themselves, the clone is performed recursively.
    definition = field.definition
    while isinstance(definition, Field):
        definition = definition.definition

    # Case #2: complex blender fields
    return _deepcopy_recursive(definition).as_(field_alias)
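# A standalone sketch of the definition-unwrapping loop above (toy wrapper class,
# not fireant's Field): wrappers are peeled until a non-Field definition, such as
# an arithmetic expression, remains.
class _FieldWrapperSketch:
    def __init__(self, definition):
        self.definition = definition


def _innermost_definition_sketch(field):
    definition = field.definition
    while isinstance(definition, _FieldWrapperSketch):
        definition = definition.definition
    return definition


# e.g. _innermost_definition_sketch(_FieldWrapperSketch(_FieldWrapperSketch("a + b")))
# returns "a + b"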
def map_blender_field_to_dataset_field(field, field_map, dataset):
    field_from_blender = find_field_in_modified_field(field)
    if field_from_blender in dataset.fields:
        return field

    if field_from_blender in field_map:
        return field.for_(field_map[field_from_blender])

    # Fields that neither belong to the dataset nor have a mapping resolve to None.
def make_reference_dimensions(dimensions, ref_dimension, offset_func, field_transformer, trunc_date):
    return [
        _replace_reference_dimension(dimension, offset_func, field_transformer, trunc_date)
        if ref_dimension is find_field_in_modified_field(dimension)
        else dimension
        for dimension in dimensions
    ]
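# A minimal standalone sketch of the replace-by-identity pattern used above (toy
# list values, not fireant dimensions): matching is by `is`, so only the exact
# `ref_dimension` object is swapped, never an equal-but-distinct one.
def _replace_by_identity_sketch(items, ref, replacement):
    return [replacement if item is ref else item for item in items]


def _replace_by_identity_demo():
    ref = [1]
    items = [ref, [1]]  # second element is equal to `ref` but a different object
    assert _replace_by_identity_sketch(items, ref, [2]) == [[2], [1]]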
def _map_fields(fields):
    """
    Maps blender fields to dataset fields, using the `dataset` and `field_map`
    from the enclosing scope. Fields that already belong to the dataset are
    yielded unchanged, mapped fields are re-targeted with `for_`, and fields
    with no mapping are skipped.
    """
    for field in fields:
        field_from_blender = find_field_in_modified_field(field)
        if field_from_blender in dataset.fields:
            yield field
            continue

        if field_from_blender not in field_map:
            continue

        yield field.for_(field_map[field_from_blender])
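# `_map_fields` is the generator counterpart of `map_blender_field_to_dataset_field`
# above: pass through, remap, or drop. A standalone sketch with plain dicts
# standing in for fireant's field containers (hypothetical toy helper):
def _map_keys_sketch(keys, own, mapping):
    for key in keys:
        if key in own:
            yield key  # already a dataset field: pass through unchanged
        elif key in mapping:
            yield mapping[key]  # remap the blender field to the dataset's field
        # otherwise: skip silently, mirroring the bare `continue` above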
def fetch_data(
    database: Database,
    queries: List[QueryBuilder],
    dimensions: Iterable[Field],
    share_dimensions: Iterable[Field] = (),
    reference_groups=(),
) -> Tuple[int, pd.DataFrame]:
    queries = [str(query) for query in queries]

    # Indicate which dimensions need to be parsed as date types by building a
    # dictionary with the dimension alias as key and PANDAS_TO_DATETIME_FORMAT as value
    pandas_parse_dates = {}
    for dimension in dimensions:
        unmodified_dimension = find_field_in_modified_field(dimension)
        if unmodified_dimension.data_type == DataType.date:
            pandas_parse_dates[alias_selector(unmodified_dimension.alias)] = PANDAS_TO_DATETIME_FORMAT

    results = database.fetch_dataframes(*queries, parse_dates=pandas_parse_dates)

    max_rows_returned = 0
    for result_df in results:
        row_count = len(result_df)
        if row_count > max_rows_returned:
            max_rows_returned = row_count

        if row_count > database.max_result_set_size:
            logger.warning('row_count_over_max', extra={'row_count': row_count, 'database': str(database)})
            # Drop all result rows above database.max_result_set_size, in place
            result_df.drop(result_df.index[database.max_result_set_size:], inplace=True)

    logger.info('max_rows_returned', extra={'row_count': max_rows_returned, 'database': str(database)})

    return max_rows_returned, reduce_result_set(results, reference_groups, dimensions, share_dimensions)
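# Standalone sketch of the in-place truncation used above, assuming only pandas
# (the toy `max_rows` stands in for `database.max_result_set_size`). Dropping by
# index labels mutates the same DataFrame object, which matters because `results`
# keeps references to these frames.
import pandas as pd


def _truncate_in_place_sketch(df: pd.DataFrame, max_rows: int) -> None:
    df.drop(df.index[max_rows:], inplace=True)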
def _get_sq_field_for_blender_field(field, reference=None):
    # Closure variant: `queries` and `field_maps` come from the enclosing scope.
    unmodified_field = find_field_in_modified_field(field)
    field_alias = alias_selector(reference_alias(field, reference))

    # Search for the field in each field map to determine which subquery it will be in
    for query, field_map in zip(queries, field_maps):
        if unmodified_field not in field_map:
            continue

        mapped_field = field_map[unmodified_field]
        mapped_field_alias = alias_selector(reference_alias(mapped_field, reference))

        subquery_field = query[mapped_field_alias]
        # Case #1: modified fields, e.g. day(timestamp) or rollup(dimension)
        return field.for_(subquery_field).as_(field_alias)

    # Need to copy the metrics if there are references so that the `get_sql` monkey patch does not conflict
    definition = copy.deepcopy(field.definition)

    # Case #2: complex blender fields
    return definition.as_(field_alias)
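# Why the copy matters: a per-instance patch of `get_sql` on a shared term would
# leak into every query that reuses that term. A minimal standalone sketch (toy
# class, not pypika's):
import copy


def _deepcopy_patch_sketch():
    class Term:
        def get_sql(self):
            return "original"

    shared = Term()
    patched = copy.deepcopy(shared)
    patched.get_sql = lambda: "patched"  # monkey patch only the copy
    assert shared.get_sql() == "original"
    assert patched.get_sql() == "patched"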
def _blender_join_criteria(base_query, join_query, dimensions, base_field_map, join_field_map):
    """
    Build the criteria for joining this join query to the base query in dataset
    blender queries. This should be a set of equality conditions, like
    A0=B0 AND A1=B1 AND ... AND An=Bn, for each dimension mapped between the
    datasets in `DataSetBlender.dimension_map`.
    """
    join_criteria = None
    for dimension in dimensions:
        dimension = find_field_in_modified_field(dimension)
        # Dimensions can only be joined on if they are mapped in both datasets
        if not all([dimension in base_field_map, dimension in join_field_map]):
            continue

        alias0, alias1 = [
            alias_selector(field_map[dimension].alias)
            for field_map in [base_field_map, join_field_map]
        ]

        next_criteria = base_query[alias0] == join_query[alias1]
        join_criteria = next_criteria if join_criteria is None else (join_criteria & next_criteria)

    return join_criteria
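# The None-seeded accumulation above is a manual AND-fold. With toy boolean
# conditions standing in for pypika criteria (which overload `&`), it is
# equivalent to a reduce over a sequence; the explicit loop presumably stays
# because criteria are built while unmapped dimensions are filtered out.
from functools import reduce
from operator import and_


def _and_fold_sketch(conditions):
    # `conditions` is a (possibly empty) list; an empty list yields None,
    # matching the behavior of the loop above.
    return reduce(and_, conditions) if conditions else None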