def new_leaves(self, column_name):
    """
    Collect the columns matching column_name, examined from the perspective
    of every path in self.snowflake.sorted_query_paths.

    :param column_name: name (prefix) to match; it is normalized with unnest_path()
    :return: set OF ALL COLUMNS THAT START WITH column_name, INCLUDING DEEP COLUMNS
    """
    column_name = unnest_path(column_name)
    columns = self.columns
    all_paths = self.snowflake.sorted_query_paths
    # LOOP INVARIANT: MARKERS OF ELASTICSEARCH field TYPES INSIDE AN es_column
    typed_markers = tuple(
        "." + t + "." for t in (STRING_TYPE, NUMBER_TYPE, BOOLEAN_TYPE)
    )

    output = {}  # MAP FROM QUERY PATH TO LIST OF MATCHING COLUMNS
    for c in columns:
        if c.name == "_id" and column_name != "_id":
            continue
        if c.jx_type in OBJECTS:
            continue
        if c.cardinality == 0:
            continue
        for path in all_paths:
            if not startswith_field(
                unnest_path(relative_field(c.name, path)), column_name
            ):
                continue
            existing = output.get(path)
            if not existing:
                # FIRST MATCH FOR THIS PATH IS ALWAYS ACCEPTED
                output[path] = [c]
                continue
            if len(path) > len(c.nested_path[0]):
                continue
            if any(marker in c.es_column for marker in typed_markers):
                # ELASTICSEARCH field TYPES ARE NOT ALLOWED
                continue
            # ONLY THE DEEPEST COLUMN WILL BE CHOSEN
            output[path].append(c)

    # THE VALUES OF output ARE LISTS (UNHASHABLE), SO set(output.values())
    # WOULD RAISE TypeError; FLATTEN THEM INTO ONE SET OF COLUMNS INSTEAD
    return {c for matched in output.values() for c in matched}
def new_leaves(self, column_name):
    """
    Collect the columns matching column_name, examined from the perspective
    of every path in self.snowflake.sorted_query_paths.

    :param column_name: name (prefix) to match; it is normalized with unnest_path()
    :return: set OF ALL COLUMNS THAT START WITH column_name, INCLUDING DEEP COLUMNS
    """
    column_name = unnest_path(column_name)
    columns = self.columns
    all_paths = self.snowflake.sorted_query_paths
    # LOOP INVARIANT: MARKERS OF ELASTICSEARCH field TYPES INSIDE AN es_column
    typed_markers = tuple(
        "." + t + "." for t in (STRING_TYPE, NUMBER_TYPE, BOOLEAN_TYPE)
    )

    output = {}  # MAP FROM QUERY PATH TO LIST OF MATCHING COLUMNS
    for c in columns:
        if c.name == "_id" and column_name != "_id":
            continue
        if c.jx_type in OBJECTS:
            continue
        if c.cardinality == 0:
            continue
        for path in all_paths:
            if not startswith_field(
                unnest_path(relative_field(c.name, path)), column_name
            ):
                continue
            existing = output.get(path)
            if not existing:
                # FIRST MATCH FOR THIS PATH IS ALWAYS ACCEPTED
                output[path] = [c]
                continue
            if len(path) > len(c.nested_path[0]):
                continue
            if any(marker in c.es_column for marker in typed_markers):
                # ELASTICSEARCH field TYPES ARE NOT ALLOWED
                continue
            # ONLY THE DEEPEST COLUMN WILL BE CHOSEN
            output[path].append(c)

    # THE VALUES OF output ARE LISTS (UNHASHABLE), SO set(output.values())
    # WOULD RAISE TypeError; FLATTEN THEM INTO ONE SET OF COLUMNS INSTEAD
    return {c for matched in output.values() for c in matched}
def _indexer(columns, query_path):
    """
    Build the name -> set-of-columns lookup tables for one query path.

    Every table is keyed by both the name and its untype_path() form. When
    query_path is not ".", the tables computed for "." are merged in for any
    key that the relative tables do not already hold.

    :param columns: the columns to index
    :param query_path: the path whose relative names (c.names[query_path]) are indexed
    :return: tuple (relative_lookup, lookup_leaves, lookup_variables)
    """

    def _register(table, key, column):
        # ADD column TO THE SET STORED UNDER key, CREATING THE SET IF NEEDED
        table.setdefault(key, set()).add(column)

    all_names = {unnest_path(n) for c in columns for n in c.names.values()} | {"."}

    # ALL LEAF VARIABLES
    lookup_leaves = {}
    for full_name in all_names:
        for c in columns:
            nfp = unnest_path(c.names[query_path])
            if not startswith_field(nfp, full_name):
                continue
            if c.type in [EXISTS, OBJECT, NESTED]:
                continue
            if c.es_column == "_id" and full_name != "_id":
                continue
            _register(lookup_leaves, full_name, c)
            _register(lookup_leaves, untype_path(full_name), c)

    # ALL NOT-NESTED VARIABLES
    lookup_variables = {}
    for full_name in all_names:
        for c in columns:
            nfp = unnest_path(c.names[query_path])
            if not startswith_field(nfp, full_name):
                continue
            if c.type in [EXISTS, OBJECT]:
                continue
            if c.es_column == "_id" and full_name != "_id":
                continue
            if not startswith_field(c.nested_path[0], query_path):
                continue
            _register(lookup_variables, full_name, c)
            _register(lookup_variables, untype_path(full_name), c)

    # EXACT RELATIVE NAMES (TYPED AND UNTYPED)
    relative_lookup = {}
    for c in columns:
        try:
            cname = c.names[query_path]
            _register(relative_lookup, cname, c)
            _register(relative_lookup, untype_path(cname), c)
        except Exception as e:
            Log.error("Should not happen", cause=e)

    if query_path != ".":
        # ADD ABSOLUTE NAMES TO THE NAMESPACE, WITHOUT OVERWRITING RELATIVE ONES
        absolute_lookup, more_leaves, more_variables = _indexer(columns, ".")
        for k, cs in absolute_lookup.items():
            relative_lookup.setdefault(k, cs)
        for k, cs in more_leaves.items():
            lookup_leaves.setdefault(k, cs)
        for k, cs in more_variables.items():
            lookup_variables.setdefault(k, cs)

    return relative_lookup, lookup_leaves, lookup_variables
def values(self, name):
    """
    RETURN VALUES FOR THE GIVEN PATH NAME

    :param name: path name; it is normalized with unnest_path() before the lookup
    :return: list built from the self.lookup_variables entry for that name
             (Null is used as the default when the name is not present)
    """
    variables = self.lookup_variables
    key = unnest_path(name)
    found = variables.get(key, Null)
    return list(found)
def denormalized(self):
    """
    THE INTERNAL STRUCTURE FOR THE COLUMN METADATA IS VERY DIFFERENT FROM
    THE DENORMALIZED PERSPECTIVE. THIS PROVIDES THAT PERSPECTIVE FOR QUERIES

    :return: ListContainer named self.name, holding one plain record for each
             column in self.data whose jx_type is not in STRUCT
    """
    with self.locker:
        self._update_meta()

        output = []
        for _tname, by_name in self.data.items():
            for _cname, cs in by_name.items():
                for c in cs:
                    if c.jx_type in STRUCT:  # and c.es_column != "_id"
                        continue
                    record = {
                        "table": c.es_index,
                        "name": untype_path(c.name),
                        "cardinality": c.cardinality,
                        "es_column": c.es_column,
                        "es_index": c.es_index,
                        "last_updated": c.last_updated,
                        "count": c.count,
                        "nested_path": [unnest_path(n) for n in c.nested_path],
                        "es_type": c.es_type,
                        "type": c.jx_type,
                    }
                    output.append(record)

    # IMPORTED HERE, AT CALL TIME, AS IN THE ORIGINAL
    from jx_python.containers.list_usingPythonList import ListContainer

    return ListContainer(
        self.name,
        data=output,
        schema=jx_base.Schema("meta.columns", SIMPLE_METADATA_COLUMNS),
    )
def denormalized(self):
    """
    THE INTERNAL STRUCTURE FOR THE COLUMN METADATA IS VERY DIFFERENT FROM
    THE DENORMALIZED PERSPECTIVE. THIS PROVIDES THAT PERSPECTIVE FOR QUERIES

    One record is produced for every (table, name) pair in c.names of every
    column in self.data whose type is not in STRUCT. self.meta_schema is
    derived from the records the first time it is found to be empty.

    :return: ListContainer over the records, using self.meta_schema
    """
    output = []
    for _tname, by_name in self.data.items():
        for _cname, cs in by_name.items():
            for c in cs:
                if c.type in STRUCT:  # and c.es_column != "_id"
                    continue
                for table, name in c.names.items():
                    output.append({
                        "table": concat_field(c.es_index, untype_path(table)),
                        "name": untype_path(name),
                        "cardinality": c.cardinality,
                        "es_column": c.es_column,
                        "es_index": c.es_index,
                        "last_updated": c.last_updated,
                        "count": c.count,
                        "nested_path": [unnest_path(n) for n in c.nested_path],
                        "type": c.type
                    })

    if not self.meta_schema:
        self.meta_schema = get_schema_from_list("meta\\.columns", output)

    # IMPORTED HERE, AT CALL TIME, AS IN THE ORIGINAL
    from jx_python.containers.list_usingPythonList import ListContainer

    return ListContainer("meta\\.columns", data=output, schema=self.meta_schema)
def leaves(self, column_name):
    """
    :param column_name: name (prefix) to match
    :return: set OF ALL COLUMNS THAT START WITH column_name, NOT INCLUDING DEEPER NESTED COLUMNS;
             THE FIRST QUERY PATH THAT YIELDS ANY MATCH WINS, OTHERWISE AN EMPTY set
    """
    clean_name = unnest_path(column_name)
    if clean_name == column_name:
        cleaner = unnest_path
    else:
        # unnest_path() CHANGED THE NAME: MATCH ON THE NAME AS GIVEN, WITHOUT CLEANING
        clean_name = column_name
        cleaner = lambda x: x

    columns = self.columns

    def _is_leaf(c, path):
        # _id IS ONLY RETURNED WHEN ASKED FOR EXPLICITLY
        if c.name == "_id" and clean_name != "_id":
            return False
        acceptable_type = (
            (c.jx_type == EXISTS and column_name.endswith("." + EXISTS_TYPE))
            or c.jx_type not in OBJECTS
            or (clean_name == '.' and c.cardinality == 0)
        )
        if not acceptable_type:
            return False
        return startswith_field(cleaner(relative_field(c.name, path)), clean_name)

    # TODO: '.' IMPLIES ALL FIELDS FROM ABSOLUTE PERSPECTIVE, ALL OTHERS ARE A RELATIVE PERSPECTIVE
    # TODO: HOW TO REFER TO FIELDS THAT MAY BE SHADOWED BY A RELATIVE NAME?
    if clean_name == '.':
        paths = reversed(self.query_path)
    else:
        paths = self.query_path

    for path in paths:
        output = [c for c in columns if _is_leaf(c, path)]
        if output:
            return set(output)
    return set()
def denormalized(self):
    """
    THE INTERNAL STRUCTURE FOR THE COLUMN METADATA IS VERY DIFFERENT FROM
    THE DENORMALIZED PERSPECTIVE. THIS PROVIDES THAT PERSPECTIVE FOR QUERIES

    :return: ListContainer named self.name, holding one plain record for each
             column in self.data whose jx_type is not in STRUCT
    """
    with self.locker:
        self._update_meta()

        output = []
        for _tname, by_name in self.data.items():
            for _cname, cs in by_name.items():
                for c in cs:
                    if c.jx_type in STRUCT:  # and c.es_column != "_id"
                        continue
                    record = {
                        "table": c.es_index,
                        "name": untype_path(c.name),
                        "cardinality": c.cardinality,
                        "es_column": c.es_column,
                        "es_index": c.es_index,
                        "last_updated": c.last_updated,
                        "count": c.count,
                        "nested_path": [unnest_path(n) for n in c.nested_path],
                        "es_type": c.es_type,
                        "type": c.jx_type,
                    }
                    output.append(record)

    # IMPORTED HERE, AT CALL TIME, AS IN THE ORIGINAL
    from jx_python.containers.list_usingPythonList import ListContainer

    return ListContainer(
        self.name,
        data=output,
        schema=jx_base.Schema(META_COLUMNS_NAME, SIMPLE_METADATA_COLUMNS),
    )
def leaves(self, column_name):
    """
    :param column_name: name (prefix) to match
    :return: set OF ALL COLUMNS THAT START WITH column_name, NOT INCLUDING DEEPER NESTED COLUMNS;
             THE FIRST QUERY PATH THAT YIELDS ANY MATCH WINS, OTHERWISE AN EMPTY set
    """
    clean_name = unnest_path(column_name)
    if clean_name == column_name:
        cleaner = unnest_path
    else:
        # unnest_path() CHANGED THE NAME: MATCH ON THE NAME AS GIVEN, WITHOUT CLEANING
        clean_name = column_name
        cleaner = lambda x: x

    columns = self.columns

    # TODO: '.' IMPLIES ALL FIELDS FROM ABSOLUTE PERSPECTIVE, ALL OTHERS ARE A RELATIVE PERSPECTIVE
    # TODO: HOW TO REFER TO FIELDS THAT MAY BE SHADOWED BY A RELATIVE NAME?
    if clean_name == '.':
        paths = reversed(self.query_path)
    else:
        paths = self.query_path

    for path in paths:
        output = []
        for c in columns:
            # _id IS ONLY RETURNED WHEN ASKED FOR EXPLICITLY
            if c.name == "_id" and clean_name != "_id":
                continue
            acceptable_type = (
                (c.jx_type == EXISTS and column_name.endswith("." + EXISTS_TYPE))
                or c.jx_type not in OBJECTS
                or (clean_name == '.' and c.cardinality == 0)
            )
            if not acceptable_type:
                continue
            if startswith_field(cleaner(relative_field(c.name, path)), clean_name):
                output.append(c)
        if output:
            return set(output)
    return set()
def leaves(self, column_name):
    """
    :param column_name: name (prefix) to match; it is normalized with unnest_path()
    :return: list OF ALL COLUMNS THAT START WITH column_name, NOT INCLUDING DEEPER NESTED COLUMNS;
             THE FIRST QUERY PATH THAT YIELDS ANY MATCH WINS, OTHERWISE AN EMPTY list
    """
    column_name = unnest_path(column_name)
    columns = self.columns
    # (THE UNUSED LOCAL deep_path = self.query_path[0] WAS REMOVED)
    for path in self.query_path:
        output = [
            c
            for c in columns
            if (
                # _id IS ONLY RETURNED WHEN ASKED FOR EXPLICITLY
                (c.names['.'] != "_id" or column_name == "_id")
                and c.jx_type not in OBJECTS
                and startswith_field(unnest_path(c.names[path]), column_name)
            )
        ]
        if output:
            return output
    return []
def leaves(self, name):
    """
    RETURN LEAVES OF GIVEN PATH NAME

    pull leaves, considering query_path and namespace
    pull all first-level properties
    pull leaves, including parent leaves
    pull the head of any tree by name

    :param name: path name; it is normalized with unnest_path() before the lookup
    :return: list built from the self.lookup_leaves entry for that name
             (Null is used as the default when the name is not present)
    """
    table = self.lookup_leaves
    key = unnest_path(name)
    found = table.get(key, Null)
    return list(found)
def leaves(self, name, meta=False):
    """
    RETURN LEAVES OF GIVEN PATH NAME

    pull leaves, considering query_path and namespace
    pull all first-level properties
    pull leaves, including parent leaves
    pull the head of any tree by name

    :param name: path name; it is normalized with unnest_path() before the lookup
    :param meta: accepted but not used by this implementation
    :return: list built from the self.lookup_leaves entry for that name
             (Null is used as the default when the name is not present)
    """
    table = self.lookup_leaves
    key = unnest_path(name)
    found = table.get(key, Null)
    return list(found)
def leaves(self, column_name):
    """
    :param column_name: name (prefix) to match; it is normalized with unnest_path()
    :return: list OF ALL COLUMNS THAT START WITH column_name, NOT INCLUDING DEEPER NESTED COLUMNS;
             THE FIRST QUERY PATH THAT YIELDS ANY MATCH WINS, OTHERWISE AN EMPTY list
    """
    column_name = unnest_path(column_name)
    columns = self.columns
    # (THE UNUSED LOCAL deep_path = self.query_path[0] WAS REMOVED)
    for path in self.query_path:
        output = [
            c
            for c in columns
            if (
                # _id IS ONLY RETURNED WHEN ASKED FOR EXPLICITLY
                (c.names['.'] != "_id" or column_name == "_id")
                and c.jx_type not in OBJECTS
                and startswith_field(unnest_path(c.names[path]), column_name)
            )
        ]
        if output:
            return output
    return []
def values(self, name):
    """
    RETURN VALUES FOR THE GIVEN PATH NAME

    :param name: path name; it is normalized with unnest_path() before the lookup
    :return: list of the distinct columns found in self.lookup under that name
             whose type is in PRIMITIVE and whose es_column is not "_id"
    """
    full_name = unnest_path(name)
    candidates = self.lookup.get(full_name, Null)

    distinct = set()
    for c in candidates:
        if c.type not in PRIMITIVE:
            continue
        if c.es_column == "_id":
            continue
        # MULTIVALUES ARE LEGIT, SO NESTED IS FINE: and self.query_path == c.nested_path[0]
        distinct.add(c)
    return list(distinct)
def map_to_es(self):
    """
    RETURN A MAP FROM THE NAMESPACE TO THE es_column NAME

    For every query path, each non-STRUCT column contributes three keys: its
    name relative to that path, the untype_path() form and the unnest_path()
    form. The per-path maps are merged into the result with set_default().
    """
    output = {}
    for path in self.query_path:
        names_for_path = {}
        for c in self.snowflake.columns:
            if c.jx_type in STRUCT:
                continue
            rel_name = c.names[path]
            aliases = [rel_name, untype_path(rel_name), unnest_path(rel_name)]
            for k in aliases:
                names_for_path[k] = c.es_column
        set_default(output, names_for_path)
    return output
def values(self, column_name):
    """
    RETURN ALL COLUMNS THAT column_name REFERS TO

    :param column_name: name to match; it is normalized with unnest_path()
    :return: list of the non-STRUCT columns whose untyped name, relative to
             the first query path that yields any match, equals column_name;
             an empty list when nothing matches
    """
    column_name = unnest_path(column_name)
    columns = self.columns
    # BOUND UP FRONT SO THE FINAL return IS DEFINED EVEN WHEN NO QUERY PATH IS ITERATED
    # (THE UNUSED LOCAL deep_path = self.query_path[0] WAS REMOVED)
    output = []
    for path in self.query_path:
        output = [
            c
            for c in columns
            if c.jx_type not in STRUCT and untype_path(c.names[path]) == column_name
        ]
        if output:
            return output
    return output
def map_to_es(self):
    """
    RETURN A MAP FROM THE NAMESPACE TO THE es_column NAME

    For every query path, each non-STRUCT column contributes three keys: its
    name relative to that path, the untype_path() form and the unnest_path()
    form. The per-path maps are merged into the result with set_default().
    """
    output = {}
    for path in self.query_path:
        names_for_path = {}
        for c in self.snowflake.columns:
            if c.jx_type in STRUCT:
                continue
            rel_name = c.names[path]
            aliases = [rel_name, untype_path(rel_name), unnest_path(rel_name)]
            for k in aliases:
                names_for_path[k] = c.es_column
        set_default(output, names_for_path)
    return output
def values(self, column_name, exclude_type=STRUCT):
    """
    RETURN ALL COLUMNS THAT column_name REFERS TO

    :param column_name: name to match; it is normalized with unnest_path()
    :param exclude_type: columns whose jx_type is in this collection are skipped
    :return: list of the columns whose untyped name equals the untyped
             concatenation of a query path and column_name; the first query
             path that yields any match wins, otherwise an empty list
    """
    column_name = unnest_path(column_name)
    columns = self.columns
    for path in self.query_path:
        full_path = untype_path(concat_field(path, column_name))
        matched = [
            c
            for c in columns
            if c.jx_type not in exclude_type and untype_path(c.name) == full_path
        ]
        if matched:
            return matched
    return []
def values(self, column_name):
    """
    RETURN ALL COLUMNS THAT column_name REFERS TO

    :param column_name: name to match; it is normalized with unnest_path()
    :return: list of the non-STRUCT columns whose untyped name, relative to
             the first query path that yields any match, equals column_name;
             an empty list when nothing matches
    """
    column_name = unnest_path(column_name)
    columns = self.columns
    # BOUND UP FRONT SO THE FINAL return IS DEFINED EVEN WHEN NO QUERY PATH IS ITERATED
    # (THE UNUSED LOCAL deep_path = self.query_path[0] WAS REMOVED)
    output = []
    for path in self.query_path:
        output = [
            c
            for c in columns
            if (
                c.jx_type not in STRUCT
                and untype_path(c.names[path]) == column_name
            )
        ]
        if output:
            return output
    return output
def es_setop(es, query):
    """
    Run a set-operation (non-aggregate) query against ES and format the hits.

    The select clauses are expanded into `new_select` entries (name, value,
    put, pull) while the ES fields/scripts to request are collected in one
    ESSelect per nested path (`split_select`). The ES query is then built
    with es_query_proto(), posted with es_post(), and the hits are handed to
    the formatter registered in format_dispatch for query.format.

    :param es: passed through to es_post()
    :param query: the query; reads frum.schema, select, where, limit, sort and format
    :return: the formatter output, with meta.timing.es, meta.content_type and
             meta.es_query filled in
    """
    schema = query.frum.schema
    query_path = schema.query_path[0]

    # ONE ESSelect PER NESTED PATH; THE ROOT "." ALWAYS EXISTS
    split_select = {".": ESSelect('.')}

    def get_select(path):
        # RETURN THE ESSelect FOR path, CREATING IT ON FIRST USE
        es_select = split_select.get(path)
        if not es_select:
            es_select = split_select[path] = ESSelect(path)
        return es_select

    selects = wrap([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()

    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(select.name, relative_field(untype_path(c.name), term.var))
                if c.jx_type == NESTED:
                    get_select('.').use_source = True
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."},
                        "pull": get_pull_source(c.es_column)
                    })
                    put_index += 1
                else:
                    get_select(c.nested_path[0]).fields.append(c.es_column)
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."}
                    })
                    put_index += 1
        elif is_op(select.value, Variable):
            s_column = select.value.var

            if s_column == ".":
                # PULL ALL SOURCE
                get_select('.').use_source = True
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {"name": select.name, "index": put_index, "child": "."},
                    "pull": get_pull_source(".")
                })
                # NOTE(review): put_index IS NOT ADVANCED ON THIS PATH, SO A FOLLOWING
                # SELECT REUSES THE SAME INDEX - CONFIRM THIS IS INTENDED
                continue

            leaves = schema.leaves(s_column)  # LEAVES OF OBJECT
            # nested_selects = {}
            if leaves:
                if any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    get_select('.').use_source = True
                    for c in leaves:
                        if len(c.nested_path) == 1:  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            pre_child = join_field(decode_property(n) for n in split_field(c.name))
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))},
                                "pull": get_pull_source(c.es_column)
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        c_nested_path = c.nested_path[0]
                        if c_nested_path == ".":
                            if c.es_column == "_id":
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": "."},
                                    "pull": lambda row: row._id
                                })
                            elif c.jx_type == NESTED:
                                get_select('.').use_source = True
                                pre_child = join_field(decode_property(n) for n in split_field(c.name))
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))},
                                    "pull": get_pull_source(c.es_column)
                                })
                            else:
                                get_select(c_nested_path).fields.append(c.es_column)
                                pre_child = join_field(decode_property(n) for n in split_field(c.name))
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))}
                                })
                        else:
                            # COLUMN LIVES UNDER A NESTED PATH: REQUEST IT FROM THAT PATH'S ESSelect
                            es_select = get_select(c_nested_path)
                            es_select.fields.append(c.es_column)

                            child = relative_field(untype_path(relative_field(c.name, schema.query_path[0])), s_column)
                            pull = accumulate_nested_doc(c_nested_path, Variable(relative_field(s_column, unnest_path(c_nested_path))))
                            new_select.append({
                                "name": select.name,
                                "value": select.value,
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": child
                                },
                                "pull": pull
                            })
            else:
                # NO MATCHING COLUMNS: KEEP THE OUTPUT SLOT, BACKED BY A PLACEHOLDER VARIABLE
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {"name": select.name, "index": put_index, "child": "."}
                })
            # ALL ENTRIES ADDED FOR THIS VARIABLE SHARE ONE put_index
            put_index += 1
        else:
            # ANY OTHER EXPRESSION IS SENT TO ES AS A SCRIPT, SPLIT BY NESTED PATH
            split_scripts = split_expression_by_path(select.value, schema, lang=Painless)
            for p, script in split_scripts.items():
                es_select = get_select(p)
                es_select.scripts[select.name] = {"script": text_type(Painless[first(script)].partial_eval().to_es_script(schema))}
                new_select.append({
                    "name": select.name,
                    "pull": jx_expression_to_function("fields." + literal_field(select.name)),
                    "put": {"name": select.name, "index": put_index, "child": "."}
                })
                put_index += 1

    # FILL IN THE pull FUNCTION FOR EVERY ENTRY THAT DOES NOT HAVE ONE YET
    for n in new_select:
        if n.pull:
            continue
        elif is_op(n.value, Variable):
            if get_select('.').use_source:
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")

    split_wheres = split_expression_by_path(query.where, schema, lang=ES52)
    es_query = es_query_proto(query_path, split_select, split_wheres, schema)
    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
    es_query.sort = jx_sort_to_es_sort(query.sort, schema)

    with Timer("call to ES", silent=True) as call_timer:
        data = es_post(es, es_query, query.limit)

    T = data.hits.hits

    # Log.note("{{output}}", output=T)

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        with Timer("formatter", silent=True):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
def get_selects(query):
    """
    Expand the select clauses of `query` into per-column select descriptors.

    Each descriptor appended to `new_select` carries a name, a value, a `put`
    (output name, index and child) and, when known, a `pull` function. The ES
    fields and scripts to request are collected in one ESSelect per nested
    path (`split_select`).

    :param query: the query; reads frum.schema and select
    :return: tuple (new_select, split_select)
    """
    schema = query.frum.schema
    # ONE ESSelect PER NESTED PATH; THE ROOT "." ALWAYS EXISTS
    split_select = {".": ESSelect(".")}

    def get_select(path):
        # RETURN THE ESSelect FOR path, CREATING IT ON FIRST USE
        es_select = split_select.get(path)
        if not es_select:
            es_select = split_select[path] = ESSelect(path)
        return es_select

    selects = wrap([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()
    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(
                    select.name, relative_field(untype_path(c.name), term.var)
                )
                if c.jx_type == NESTED:
                    get_select(".").set_op = True
                    new_select.append(
                        {
                            "name": full_name,
                            "value": Variable(c.es_column),
                            "put": {
                                "name": literal_field(full_name),
                                "index": put_index,
                                "child": ".",
                            },
                            "pull": get_pull_source(c.es_column),
                        }
                    )
                    put_index += 1
                else:
                    get_select(c.nested_path[0]).fields.append(c.es_column)
                    new_select.append(
                        {
                            "name": full_name,
                            "value": Variable(c.es_column),
                            "put": {
                                "name": literal_field(full_name),
                                "index": put_index,
                                "child": ".",
                            },
                        }
                    )
                    put_index += 1
        elif is_op(select.value, Variable):
            s_column = select.value.var
            if s_column == ".":
                # PULL ALL SOURCE
                get_select(".").set_op = True
                new_select.append(
                    {
                        "name": select.name,
                        "value": select.value,
                        "put": {"name": select.name, "index": put_index, "child": "."},
                        "pull": get_pull_source("."),
                    }
                )
                # NOTE(review): put_index IS NOT ADVANCED ON THIS PATH, SO A FOLLOWING
                # SELECT REUSES THE SAME INDEX - CONFIRM THIS IS INTENDED
                continue
            leaves = schema.leaves(s_column)  # LEAVES OF OBJECT
            # nested_selects = {}
            if leaves:
                if any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    get_select(".").set_op = True
                    for c in leaves:
                        if (
                            len(c.nested_path) == 1
                        ):  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            pre_child = join_field(
                                decode_property(n) for n in split_field(c.name)
                            )
                            new_select.append(
                                {
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": untype_path(
                                            relative_field(pre_child, s_column)
                                        ),
                                    },
                                    "pull": get_pull_source(c.es_column),
                                }
                            )
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        c_nested_path = c.nested_path[0]
                        if c_nested_path == ".":
                            if c.es_column == "_id":
                                new_select.append(
                                    {
                                        "name": select.name,
                                        "value": Variable(c.es_column),
                                        "put": {
                                            "name": select.name,
                                            "index": put_index,
                                            "child": ".",
                                        },
                                        "pull": lambda row: row._id,
                                    }
                                )
                            elif c.jx_type == NESTED:
                                get_select(".").set_op = True
                                pre_child = join_field(
                                    decode_property(n) for n in split_field(c.name)
                                )
                                new_select.append(
                                    {
                                        "name": select.name,
                                        "value": Variable(c.es_column),
                                        "put": {
                                            "name": select.name,
                                            "index": put_index,
                                            "child": untype_path(
                                                relative_field(pre_child, s_column)
                                            ),
                                        },
                                        "pull": get_pull_source(c.es_column),
                                    }
                                )
                            else:
                                get_select(c_nested_path).fields.append(c.es_column)
                                pre_child = join_field(
                                    decode_property(n) for n in split_field(c.name)
                                )
                                new_select.append(
                                    {
                                        "name": select.name,
                                        "value": Variable(c.es_column),
                                        "put": {
                                            "name": select.name,
                                            "index": put_index,
                                            "child": untype_path(
                                                relative_field(pre_child, s_column)
                                            ),
                                        },
                                    }
                                )
                        else:
                            # COLUMN LIVES UNDER A NESTED PATH: REQUEST IT FROM THAT PATH'S ESSelect
                            es_select = get_select(c_nested_path)
                            es_select.fields.append(c.es_column)
                            child = relative_field(
                                untype_path(
                                    relative_field(c.name, schema.query_path[0])
                                ),
                                s_column,
                            )
                            pull = accumulate_nested_doc(
                                c_nested_path,
                                Variable(
                                    relative_field(s_column, unnest_path(c_nested_path))
                                ),
                            )
                            new_select.append(
                                {
                                    "name": select.name,
                                    "value": select.value,
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": child,
                                    },
                                    "pull": pull,
                                }
                            )
            else:
                # NO MATCHING COLUMNS: KEEP THE OUTPUT SLOT, BACKED BY A PLACEHOLDER VARIABLE
                new_select.append(
                    {
                        "name": select.name,
                        "value": Variable("$dummy"),
                        "put": {"name": select.name, "index": put_index, "child": "."},
                    }
                )
            # ALL ENTRIES ADDED FOR THIS VARIABLE SHARE ONE put_index
            put_index += 1
        else:
            # ANY OTHER EXPRESSION IS SENT TO ES AS A SCRIPT, SPLIT BY NESTED PATH
            split_scripts = split_expression_by_path(
                select.value, schema, lang=Painless
            )
            for p, script in split_scripts.items():
                es_select = get_select(p)
                es_select.scripts[select.name] = {
                    "script": text(
                        Painless[first(script)].partial_eval().to_es_script(schema)
                    )
                }
                new_select.append(
                    {
                        "name": select.name,
                        "pull": jx_expression_to_function(
                            "fields." + literal_field(select.name)
                        ),
                        "put": {"name": select.name, "index": put_index, "child": "."},
                    }
                )
                put_index += 1
    # FILL IN THE pull FUNCTION FOR EVERY ENTRY THAT DOES NOT HAVE ONE YET
    for n in new_select:
        if n.pull:
            continue
        elif is_op(n.value, Variable):
            if get_select(".").set_op:
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(
                    concat_field("fields", literal_field(n.value.var))
                )
        else:
            Log.error("Do not know what to do")
    return new_select, split_select
def __new__(cls, e=None, query=None, *args, **kwargs):
    """
    Pick the decoder class that fits edge `e` and return a bare instance of it.

    Instances are created with object.__new__(<DecoderClass>), so no
    constructor arguments are passed here. Along the way `e` is mutated:
    e.allowNulls gets a default of True and e.domain may be replaced.

    :param e: the edge; reads value, domain, range and allowNulls
    :param query: the query; reads frum.schema and limit
    :return: a new, uninitialized decoder instance
    """
    e.allowNulls = coalesce(e.allowNulls, True)

    if e.value and e.domain.type == "default":
        # if query.groupby:
        #     return object.__new__(DefaultDecoder, e)

        if is_text(e.value):
            Log.error("Expecting Variable or Expression, not plain string")

        if is_op(e.value, LeavesOp):
            return object.__new__(ObjectDecoder)
        elif is_op(e.value, TupleOp):
            # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
            # JUST PULL THE FIELDS
            if not all(is_op(t, Variable) for t in e.value.terms):
                Log.error("Can only handle variables in tuples")

            e.domain = Data(dimension={"fields": e.value.terms})
            return object.__new__(DimFieldListDecoder)
        elif is_op(e.value, Variable):
            schema = query.frum.schema
            cols = schema.leaves(e.value.var)
            if not cols:
                return object.__new__(DefaultDecoder)
            if len(cols) > 1:
                return object.__new__(ObjectDecoder)
            col = first(cols)
            limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT)

            # NOTE(review): `== None` (NOT `is None`) IS KEPT AS WRITTEN; PRESUMABLY
            # SO THAT A Null-LIKE PLACEHOLDER ALSO MATCHES - CONFIRM BEFORE CHANGING
            if col.cardinality == None:
                DEBUG and Log.warning(
                    "metadata for column {{name|quote}} (id={{id}}) is not ready",
                    name=concat_field(col.es_index, col.es_column),
                    id=id(col))
                if unnest_path(e.value.var) in KNOWN_MULTITYPES:
                    Log.warning("{{var}} is not multivalued", var=e.value.var)
                    return object.__new__(MultivalueDecoder)

                e.domain = set_default(DefaultDomain(limit=limit), e.domain.__data__())
                return object.__new__(DefaultDecoder)
            elif col.multi <= 1 and col.partitions == None:
                if unnest_path(e.value.var) in KNOWN_MULTITYPES:
                    Log.warning("{{var}} is not multivalued", var=e.value.var)
                    return object.__new__(MultivalueDecoder)
                e.domain = set_default(DefaultDomain(limit=limit), e.domain.__data__())
                return object.__new__(DefaultDecoder)
            else:
                DEBUG and Log.note("id={{id}} has parts!!!", id=id(col))
                if col.multi > 1:
                    return object.__new__(MultivalueDecoder)

                # USE THE KNOWN PARTITIONS (AT MOST limit OF THEM) AS AN EXPLICIT DOMAIN
                partitions = col.partitions[:limit:]
                if e.domain.sort == -1:
                    partitions = list(reversed(sorted(partitions)))
                else:
                    partitions = sorted(partitions)
                e.domain = SimpleSetDomain(partitions=partitions, limit=limit)
                # NO return HERE: CONTROL FALLS THROUGH TO THE DOMAIN CHECKS BELOW
        else:
            return object.__new__(DefaultDecoder)

    if e.value and e.domain.type in PARTITION:
        return object.__new__(SetDecoder)
    if isinstance(e.domain.dimension, Dimension):
        e.domain = e.domain.dimension.getDomain()
        return object.__new__(SetDecoder)
    if e.value and e.domain.type == "time":
        return object.__new__(TimeDecoder)
    if e.range:
        return object.__new__(GeneralRangeDecoder)
    if e.value and e.domain.type == "duration":
        return object.__new__(DurationDecoder)
    elif e.value and e.domain.type == "range":
        return object.__new__(RangeDecoder)
    elif not e.value and e.domain.dimension.fields:
        # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
        # JUST PULL THE FIELDS
        fields = e.domain.dimension.fields
        if is_data(fields):
            Log.error("No longer allowed: All objects are expressions")
        else:
            return object.__new__(DimFieldListDecoder)
    elif not e.value and all(e.domain.partitions.where):
        return object.__new__(GeneralSetDecoder)
    else:
        Log.error("domain type of {{type}} is not supported yet", type=e.domain.type)
def _parse_properties(self, alias, mapping):
    """
    Turn the ES mapping of `alias` into registered column metadata.

    Parses mapping.properties into columns, records the query paths for the
    alias (and for every index in self.index_to_alias that maps to it), sets
    jx_type on each column, registers the kept columns in self.meta.columns
    and queues the registered columns on self.todo.

    :param alias: the alias (table name) the mapping belongs to
    :param mapping: the ES mapping; only mapping.properties is read
    :return: list of the canonical columns returned by self.meta.columns.add()
    """
    abs_columns = elasticsearch.parse_properties(alias, ".", ROOT_PATH, mapping.properties)
    if DEBUG and any(c.cardinality == 0 and c.name != '_id' for c in abs_columns):
        Log.warning(
            "Some columns are not stored in {{url}} {{index|quote}} table:\n{{names}}",
            url=self.es_cluster.url,
            index=alias,
            names=[
                ".".join((c.es_index, c.name))
                for c in abs_columns
                if c.cardinality == 0
            ]
        )

    with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, silent=not DEBUG):
        # LIST OF EVERY NESTED PATH
        query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"]
        # FOR EVERY PAIR, INSERT a'S PATH INTO b WHEN b'S PATH STARTS WITH IT
        # (THE INNER LISTS ARE MUTATED IN PLACE)
        for a, b in itertools.product(query_paths, query_paths):
            aa = a[0]
            bb = b[0]
            if aa and bb.startswith(aa):
                for i, b_prefix in enumerate(b):
                    if len(b_prefix) > len(aa):
                        continue
                    if aa == b_prefix:
                        break  # SPLIT ALREADY FOUND
                    b.insert(i, aa)
                    break
        # EVERY QUERY PATH ENDS WITH THE ROOT
        for q in query_paths:
            q.append(".")
        query_paths.append(ROOT_PATH)

        # ENSURE ALL TABLES HAVE THE QUERY PATHS SET
        self.alias_to_query_paths[alias] = query_paths
        for i, a in self.index_to_alias.items():
            if a == alias:
                self.alias_to_query_paths[i] = query_paths

        # ENSURE COLUMN HAS CORRECT jx_type
        # PICK DEEPEST NESTED PROPERTY AS REPRESENTATIVE
        output = []
        best = {}  # MAP FROM unnest_path(name) TO THE COLUMN CURRENTLY KEPT FOR IT
        for abs_column in abs_columns:
            abs_column.jx_type = jx_type(abs_column)
            if abs_column.jx_type not in STRUCT:
                clean_name = unnest_path(abs_column.name)
                other = best.get(clean_name)
                if other:
                    if len(other.nested_path) < len(abs_column.nested_path):
                        # THIS COLUMN IS DEEPER: DROP THE ONE KEPT SO FAR
                        output.remove(other)
                        self.meta.columns.update({"clear": ".", "where": {"eq": {"es_column": other.es_column, "es_index": other.es_index}}})
                    else:
                        continue
                best[clean_name] = abs_column
            output.append(abs_column)

        # REGISTER ALL COLUMNS
        canonicals = []
        for abs_column in output:
            canonical = self.meta.columns.add(abs_column)
            canonicals.append(canonical)

        self.todo.extend(canonicals)
        return canonicals
def es_setop(es, query):
    """
    Run a set-operation (non-aggregate) query against ES and format the hits.

    The select clauses are expanded into `new_select` entries (name, value,
    put, pull) while the ES fields/scripts to request are collected in one
    ESSelect per nested path (`split_select`). The ES query is then built
    with es_query_proto(), posted with es_post(), and the hits are handed to
    the formatter registered in format_dispatch for query.format.

    :param es: passed through to es_post()
    :param query: the query; reads frum.schema, select, where, limit, sort and format
    :return: the formatter output, with meta.timing.es, meta.content_type and
             meta.es_query filled in
    """
    schema = query.frum.schema
    query_path = schema.query_path[0]

    # ONE ESSelect PER NESTED PATH; THE ROOT "." ALWAYS EXISTS
    split_select = {".": ESSelect('.')}

    def get_select(path):
        # RETURN THE ESSelect FOR path, CREATING IT ON FIRST USE
        es_select = split_select.get(path)
        if not es_select:
            es_select = split_select[path] = ESSelect(path)
        return es_select

    selects = wrap([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()

    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(
                    select.name, relative_field(untype_path(c.name), term.var))
                if c.jx_type == NESTED:
                    get_select('.').use_source = True
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {
                            "name": literal_field(full_name),
                            "index": put_index,
                            "child": "."
                        },
                        "pull": get_pull_source(c.es_column)
                    })
                    put_index += 1
                else:
                    get_select(c.nested_path[0]).fields.append(c.es_column)
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {
                            "name": literal_field(full_name),
                            "index": put_index,
                            "child": "."
                        }
                    })
                    put_index += 1
        elif is_op(select.value, Variable):
            s_column = select.value.var

            if s_column == ".":
                # PULL ALL SOURCE
                get_select('.').use_source = True
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                    "pull": get_pull_source(".")
                })
                # NOTE(review): put_index IS NOT ADVANCED ON THIS PATH, SO A FOLLOWING
                # SELECT REUSES THE SAME INDEX - CONFIRM THIS IS INTENDED
                continue

            leaves = schema.leaves(s_column)  # LEAVES OF OBJECT
            # nested_selects = {}
            if leaves:
                if any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    get_select('.').use_source = True
                    for c in leaves:
                        if len(
                                c.nested_path
                        ) == 1:  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            pre_child = join_field(
                                decode_property(n) for n in split_field(c.name))
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": untype_path(
                                        relative_field(pre_child, s_column))
                                },
                                "pull": get_pull_source(c.es_column)
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        c_nested_path = c.nested_path[0]
                        if c_nested_path == ".":
                            if c.es_column == "_id":
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": "."
                                    },
                                    "pull": lambda row: row._id
                                })
                            elif c.jx_type == NESTED:
                                get_select('.').use_source = True
                                pre_child = join_field(
                                    decode_property(n)
                                    for n in split_field(c.name))
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": untype_path(
                                            relative_field(
                                                pre_child, s_column))
                                    },
                                    "pull": get_pull_source(c.es_column)
                                })
                            else:
                                get_select(c_nested_path).fields.append(
                                    c.es_column)
                                pre_child = join_field(
                                    decode_property(n)
                                    for n in split_field(c.name))
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": untype_path(
                                            relative_field(
                                                pre_child, s_column))
                                    }
                                })
                        else:
                            # COLUMN LIVES UNDER A NESTED PATH: REQUEST IT FROM THAT PATH'S ESSelect
                            es_select = get_select(c_nested_path)
                            es_select.fields.append(c.es_column)

                            child = relative_field(
                                untype_path(
                                    relative_field(c.name, schema.query_path[0])),
                                s_column)
                            pull = accumulate_nested_doc(
                                c_nested_path,
                                Variable(
                                    relative_field(
                                        s_column, unnest_path(c_nested_path))))
                            new_select.append({
                                "name": select.name,
                                "value": select.value,
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": child
                                },
                                "pull": pull
                            })
            else:
                # NO MATCHING COLUMNS: KEEP THE OUTPUT SLOT, BACKED BY A PLACEHOLDER VARIABLE
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    }
                })
            # ALL ENTRIES ADDED FOR THIS VARIABLE SHARE ONE put_index
            put_index += 1
        else:
            # ANY OTHER EXPRESSION IS SENT TO ES AS A SCRIPT, SPLIT BY NESTED PATH
            split_scripts = split_expression_by_path(select.value,
                                                     schema,
                                                     lang=Painless)
            for p, script in split_scripts.items():
                es_select = get_select(p)
                es_select.scripts[select.name] = {
                    "script": text_type(Painless[first(
                        script)].partial_eval().to_es_script(schema))
                }
                new_select.append({
                    "name": select.name,
                    "pull": jx_expression_to_function("fields." +
                                                      literal_field(select.name)),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    }
                })
                put_index += 1

    # FILL IN THE pull FUNCTION FOR EVERY ENTRY THAT DOES NOT HAVE ONE YET
    for n in new_select:
        if n.pull:
            continue
        elif is_op(n.value, Variable):
            if get_select('.').use_source:
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(
                    concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")

    split_wheres = split_expression_by_path(query.where, schema, lang=ES52)
    es_query = es_query_proto(query_path, split_select, split_wheres, schema)
    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
    es_query.sort = jx_sort_to_es_sort(query.sort, schema)

    # THE TIMER IS SILENT UNLESS DEBUGGING (WAS silent=DEBUG, WHICH MUTED IT
    # EXACTLY WHEN DEBUG WAS ON)
    with Timer("call to ES", silent=not DEBUG) as call_timer:
        data = es_post(es, es_query, query.limit)

    T = data.hits.hits

    # Log.note("{{output}}", output=T)

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        with Timer("formatter", silent=True):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
def _parse_properties(self, alias, mapping):
    """
    Turn the ES mapping of `alias` into registered column metadata.

    Parses mapping.properties into columns, records the query paths for the
    alias (and for every index in self.index_to_alias that maps to it), sets
    jx_type on each column, registers the kept columns in self.meta.columns
    and queues the registered columns on self.todo.

    :param alias: the alias (table name) the mapping belongs to
    :param mapping: the ES mapping; only mapping.properties is read
    :return: list of the canonical columns returned by self.meta.columns.add()
    """
    abs_columns = elasticsearch.parse_properties(alias, ".", ROOT_PATH,
                                                 mapping.properties)
    if DEBUG and any(c.cardinality == 0 and c.name != '_id'
                     for c in abs_columns):
        Log.warning(
            "Some columns are not stored in {{url}} {{index|quote}} table:\n{{names}}",
            url=self.es_cluster.url,
            index=alias,
            names=[
                ".".join((c.es_index, c.name)) for c in abs_columns
                if c.cardinality == 0
            ])

    with Timer("upserting {{num}} columns", {"num": len(abs_columns)},
               silent=not DEBUG):
        # LIST OF EVERY NESTED PATH
        query_paths = [[c.es_column] for c in abs_columns
                       if c.es_type == "nested"]
        # FOR EVERY PAIR, INSERT a'S PATH INTO b WHEN b'S PATH STARTS WITH IT
        # (THE INNER LISTS ARE MUTATED IN PLACE)
        for a, b in itertools.product(query_paths, query_paths):
            aa = a[0]
            bb = b[0]
            if aa and bb.startswith(aa):
                for i, b_prefix in enumerate(b):
                    if len(b_prefix) > len(aa):
                        continue
                    if aa == b_prefix:
                        break  # SPLIT ALREADY FOUND
                    b.insert(i, aa)
                    break
        # EVERY QUERY PATH ENDS WITH THE ROOT
        for q in query_paths:
            q.append(".")
        query_paths.append(ROOT_PATH)

        # ENSURE ALL TABLES HAVE THE QUERY PATHS SET
        self.alias_to_query_paths[alias] = query_paths
        for i, a in self.index_to_alias.items():
            if a == alias:
                self.alias_to_query_paths[i] = query_paths

        # ENSURE COLUMN HAS CORRECT jx_type
        # PICK DEEPEST NESTED PROPERTY AS REPRESENTATIVE
        output = []
        best = {}  # MAP FROM unnest_path(name) TO THE COLUMN CURRENTLY KEPT FOR IT
        for abs_column in abs_columns:
            abs_column.jx_type = jx_type(abs_column)
            if abs_column.jx_type not in STRUCT:
                clean_name = unnest_path(abs_column.name)
                other = best.get(clean_name)
                if other:
                    if len(other.nested_path) < len(
                            abs_column.nested_path):
                        # THIS COLUMN IS DEEPER: DROP THE ONE KEPT SO FAR
                        output.remove(other)
                        self.meta.columns.update({
                            "clear": ".",
                            "where": {
                                "eq": {
                                    "es_column": other.es_column,
                                    "es_index": other.es_index
                                }
                            }
                        })
                    else:
                        continue
                best[clean_name] = abs_column
            output.append(abs_column)

        # REGISTER ALL COLUMNS
        canonicals = []
        for abs_column in output:
            canonical = self.meta.columns.add(abs_column)
            canonicals.append(canonical)

        self.todo.extend(canonicals)
        return canonicals
def es_setop(es, query):
    """
    TRANSLATE query.select INTO AN ES REQUEST (stored_fields, script_fields AND
    NESTED inner_hits), POST IT, AND FORMAT THE RETURNED HITS

    :param es: PASSED UNCHANGED TO es_post()
    :param query: READS frum.schema, where, limit, sort, select AND format
    :return: OUTPUT OF THE FORMATTER FOUND IN format_dispatch[query.format], WITH
             meta.timing.es, meta.content_type AND meta.es_query FILLED IN
    """
    schema = query.frum.schema

    es_query, filters = es_query_template(schema.query_path[0])
    nested_filter = None  # BECOMES A LIST OF CLAUSES WHEN THE FIRST NESTED LEAF IS SELECTED
    set_default(filters[0], query.where.partial_eval().to_esfilter(schema))
    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
    es_query.stored_fields = FlatList()

    # WORK ON COPIES OF THE SELECT CLAUSES
    selects = wrap([s.copy() for s in listwrap(query.select)])
    new_select = FlatList()  # ONE ENTRY PER OUTPUT VALUE: name, value, put AND (EVENTUALLY) pull
    schema = query.frum.schema
    # columns = schema.columns
    # nested_columns = set(c.names["."] for c in columns if c.nested_path[0] != ".")

    es_query.sort = jx_sort_to_es_sort(query.sort, schema)

    put_index = 0  # STORED AS put.index OF EACH new_select ENTRY
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(select.value, LeavesOp) and isinstance(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                # EACH LEAF GETS ITS OWN NAME AND ITS OWN put_index
                full_name = concat_field(select.name, relative_field(untype_path(c.names["."]), term.var))
                if c.jx_type == NESTED:
                    es_query.stored_fields = ["_source"]
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."},
                        "pull": get_pull_source(c.es_column)
                    })
                    put_index += 1
                elif c.nested_path[0] != ".":
                    pass  # THE NESTED PARENT WILL CAPTURE THIS
                else:
                    es_query.stored_fields += [c.es_column]
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."}
                    })
                    put_index += 1
        elif isinstance(select.value, Variable):
            s_column = select.value.var
            # LEAVES OF OBJECT
            leaves = schema.leaves(s_column)
            nested_selects = {}  # nested_path -> NESTED CLAUSE, RESET FOR EVERY SELECT
            if leaves:
                # ALL LEAVES OF THIS SELECT SHARE ONE put_index; put.child TELLS THEM APART
                if s_column == '.':
                    # PULL ALL SOURCE
                    es_query.stored_fields = ["_source"]
                    new_select.append({
                        "name": select.name,
                        "value": select.value,
                        "put": {"name": select.name, "index": put_index, "child": "."},
                        "pull": get_pull_source(".")
                    })
                elif any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    es_query.stored_fields = ["_source"]
                    for c in leaves:
                        if len(c.nested_path) == 1:  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            jx_name = untype_path(c.names["."])
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {"name": select.name, "index": put_index, "child": relative_field(jx_name, s_column)},
                                "pull": get_pull_source(c.es_column)
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        if len(c.nested_path) == 1:
                            jx_name = untype_path(c.names["."])
                            if c.jx_type == NESTED:
                                es_query.stored_fields = ["_source"]
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": relative_field(jx_name, s_column)},
                                    "pull": get_pull_source(c.es_column)
                                })
                            else:
                                es_query.stored_fields += [c.es_column]
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": relative_field(jx_name, s_column)}
                                })
                        else:
                            if not nested_filter:
                                # FIRST NESTED LEAF: REPLACE filters[0] WITH (ORIGINAL FILTER) AND (ANY NESTED CLAUSE)
                                where = filters[0].copy()
                                nested_filter = [where]
                                # NOTE(review): ASSIGNING None PRESUMABLY REMOVES THE KEY SO set_default() CAN REFILL IT - CONFIRM
                                for k in filters[0].keys():
                                    filters[0][k] = None
                                set_default(
                                    filters[0],
                                    es_and([where, es_or(nested_filter)])
                                )

                            nested_path = c.nested_path[0]
                            if nested_path not in nested_selects:
                                # FIRST COLUMN SEEN FOR THIS NESTED PATH: ADD THE NESTED CLAUSE AND THE SELECT ENTRY
                                where = nested_selects[nested_path] = Data()
                                # NOTE(review): EXTENDS THE LIST ALREADY HANDED TO es_or(); ASSUMES es_or() KEEPS A REFERENCE - CONFIRM
                                nested_filter += [where]
                                where.nested.path = nested_path
                                where.nested.query.match_all = {}
                                where.nested.inner_hits._source = False
                                where.nested.inner_hits.stored_fields += [c.es_column]

                                child = relative_field(untype_path(c.names[schema.query_path[0]]), s_column)
                                pull = accumulate_nested_doc(nested_path, Variable(relative_field(s_column, unnest_path(nested_path))))
                                new_select.append({
                                    "name": select.name,
                                    "value": select.value,
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": child
                                    },
                                    "pull": pull
                                })
                            else:
                                # LATER COLUMNS OF THE SAME NESTED PATH ONLY ADD THEIR FIELD TO THE EXISTING CLAUSE
                                nested_selects[nested_path].nested.inner_hits.stored_fields += [c.es_column]
            else:
                # NO LEAVES FOUND FOR THIS VARIABLE: SELECT A PLACEHOLDER
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {"name": select.name, "index": put_index, "child": "."}
                })
            put_index += 1
        else:
            # ANY OTHER EXPRESSION IS SENT AS A SCRIPT FIELD
            painless = select.value.partial_eval().to_es_script(schema)
            es_query.script_fields[literal_field(select.name)] = es_script(painless.script(schema))
            new_select.append({
                "name": select.name,
                "pull": jx_expression_to_function("fields." + literal_field(select.name)),
                "put": {"name": select.name, "index": put_index, "child": "."}
            })
            put_index += 1

    # FILL IN THE pull FOR EVERY SELECT THAT DOES NOT HAVE ONE YET
    for n in new_select:
        if n.pull:
            continue
        elif isinstance(n.value, Variable):
            if es_query.stored_fields[0] == "_source":
                es_query.stored_fields = ["_source"]
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")

    with Timer("call to ES") as call_timer:
        data = es_post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        with Timer("formatter"):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
def _indexer(columns, query_path):
    """
    BUILD THE NAME -> SET-OF-COLUMNS LOOKUPS FOR columns, USING THE NAMES RELATIVE TO query_path

    :param columns: COLUMNS TO INDEX; c.names[query_path] IS READ FOR EVERY COLUMN
    :param query_path: KEY INTO c.names, ALSO COMPARED WITH c.nested_path[0]
    :return: TUPLE (relative_lookup, lookup_leaves, lookup_variables) OF dict
        relative_lookup  - NAME OF EACH COLUMN, AS-IS AND AFTER untype_path()
        lookup_leaves    - FOR EACH KNOWN NAME, THE COLUMNS (NOT EXISTS, OBJECT NOR
                           NESTED) WHOSE NAME MATCHES IT PER startswith_field()
        lookup_variables - SAME, BUT NESTED COLUMNS ARE ALLOWED AND c.nested_path[0]
                           MUST MATCH query_path PER startswith_field()
    WHEN query_path IS NOT ".", THE ENTRIES FOR "." ARE ADDED UNDER ANY KEY NOT ALREADY USED
    """
    all_names = set(unnest_path(n) for c in columns for n in c.names.values()) | {"."}

    # THE NAME OF A COLUMN DOES NOT DEPEND ON full_name: RESOLVE IT ONCE PER COLUMN
    # INSTEAD OF ONCE PER (NAME, COLUMN) PAIR
    named_columns = [(c, unnest_path(c.names[query_path])) for c in columns]

    lookup_leaves = {}  # ALL LEAF VARIABLES
    lookup_variables = {}  # ALL NOT-NESTED VARIABLES
    for full_name in all_names:
        untyped_name = untype_path(full_name)
        for c, nfp in named_columns:
            # CONDITIONS SHARED BY BOTH LOOKUPS
            if not startswith_field(nfp, full_name):
                continue
            if c.es_type in [EXISTS, OBJECT]:
                continue
            if c.es_column == "_id" and full_name != "_id":
                continue  # _id IS ONLY FOUND WHEN ASKED FOR BY NAME

            if c.es_type not in [EXISTS, OBJECT, NESTED]:
                lookup_leaves.setdefault(full_name, set()).add(c)
                lookup_leaves.setdefault(untyped_name, set()).add(c)
            if startswith_field(c.nested_path[0], query_path):
                lookup_variables.setdefault(full_name, set()).add(c)
                lookup_variables.setdefault(untyped_name, set()).add(c)

    relative_lookup = {}
    for c in columns:
        try:
            cname = c.names[query_path]
            relative_lookup.setdefault(cname, set()).add(c)
            relative_lookup.setdefault(untype_path(cname), set()).add(c)
        except Exception as e:
            Log.error("Should not happen", cause=e)

    if query_path != ".":
        # ADD ABSOLUTE NAMES TO THE NAMESPACE, RELATIVE NAMES TAKE PRECEDENCE
        absolute_lookup, more_leaves, more_variables = _indexer(columns, ".")
        for k, cs in absolute_lookup.items():
            relative_lookup.setdefault(k, cs)
        for k, cs in more_leaves.items():
            lookup_leaves.setdefault(k, cs)
        for k, cs in more_variables.items():
            lookup_variables.setdefault(k, cs)

    return relative_lookup, lookup_leaves, lookup_variables
def es_setop(es, query):
    """
    TRANSLATE query.select INTO AN ES REQUEST (stored_fields, script_fields AND
    NESTED inner_hits), POST IT, AND FORMAT THE RETURNED HITS

    :param es: PASSED UNCHANGED TO es_post()
    :param query: READS frum.schema, where, limit, sort, select AND format
    :return: OUTPUT OF THE FORMATTER FOUND IN format_dispatch[query.format], WITH
             meta.timing.es, meta.content_type AND meta.es_query FILLED IN
    """
    schema = query.frum.schema

    es_query, filters = es_query_template(schema.query_path[0])
    # BECOMES A LIST OF CLAUSES WHEN THE FIRST NESTED LEAF IS SELECTED
    nested_filter = None
    set_default(filters[0], query.where.partial_eval().to_esfilter(schema))
    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
    es_query.stored_fields = FlatList()

    # WORK ON COPIES OF THE SELECT CLAUSES
    selects = wrap([s.copy() for s in listwrap(query.select)])
    # ONE ENTRY PER OUTPUT VALUE: name, value, put AND (EVENTUALLY) pull
    new_select = FlatList()
    schema = query.frum.schema
    # columns = schema.columns
    # nested_columns = set(c.names["."] for c in columns if c.nested_path[0] != ".")

    es_query.sort = jx_sort_to_es_sort(query.sort, schema)

    # STORED AS put.index OF EACH new_select ENTRY
    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(select.value, LeavesOp) and isinstance(
                select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                # EACH LEAF GETS ITS OWN NAME AND ITS OWN put_index
                full_name = concat_field(
                    select.name,
                    relative_field(untype_path(c.names["."]), term.var))
                if c.jx_type == NESTED:
                    es_query.stored_fields = ["_source"]
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {
                            "name": literal_field(full_name),
                            "index": put_index,
                            "child": "."
                        },
                        "pull": get_pull_source(c.es_column)
                    })
                    put_index += 1
                elif c.nested_path[0] != ".":
                    pass  # THE NESTED PARENT WILL CAPTURE THIS
                else:
                    es_query.stored_fields += [c.es_column]
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {
                            "name": literal_field(full_name),
                            "index": put_index,
                            "child": "."
                        }
                    })
                    put_index += 1
        elif isinstance(select.value, Variable):
            s_column = select.value.var
            # LEAVES OF OBJECT
            leaves = schema.leaves(s_column)
            # nested_path -> NESTED CLAUSE, RESET FOR EVERY SELECT
            nested_selects = {}
            if leaves:
                # ALL LEAVES OF THIS SELECT SHARE ONE put_index; put.child TELLS THEM APART
                if s_column == '.':
                    # PULL ALL SOURCE
                    es_query.stored_fields = ["_source"]
                    new_select.append({
                        "name": select.name,
                        "value": select.value,
                        "put": {
                            "name": select.name,
                            "index": put_index,
                            "child": "."
                        },
                        "pull": get_pull_source(".")
                    })
                elif any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    es_query.stored_fields = ["_source"]
                    for c in leaves:
                        if len(
                                c.nested_path
                        ) == 1:  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            jx_name = untype_path(c.names["."])
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": relative_field(jx_name, s_column)
                                },
                                "pull": get_pull_source(c.es_column)
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        if len(c.nested_path) == 1:
                            jx_name = untype_path(c.names["."])
                            if c.jx_type == NESTED:
                                es_query.stored_fields = ["_source"]
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": relative_field(jx_name, s_column)
                                    },
                                    "pull": get_pull_source(c.es_column)
                                })
                            else:
                                es_query.stored_fields += [c.es_column]
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": relative_field(jx_name, s_column)
                                    }
                                })
                        else:
                            if not nested_filter:
                                # FIRST NESTED LEAF: REPLACE filters[0] WITH
                                # (ORIGINAL FILTER) AND (ANY NESTED CLAUSE)
                                where = filters[0].copy()
                                nested_filter = [where]
                                # NOTE(review): ASSIGNING None PRESUMABLY REMOVES THE KEY
                                # SO set_default() CAN REFILL IT - CONFIRM
                                for k in filters[0].keys():
                                    filters[0][k] = None
                                set_default(
                                    filters[0],
                                    es_and([where, es_or(nested_filter)]))

                            nested_path = c.nested_path[0]
                            if nested_path not in nested_selects:
                                # FIRST COLUMN SEEN FOR THIS NESTED PATH: ADD THE
                                # NESTED CLAUSE AND THE SELECT ENTRY
                                where = nested_selects[nested_path] = Data()
                                # NOTE(review): EXTENDS THE LIST ALREADY HANDED TO es_or();
                                # ASSUMES es_or() KEEPS A REFERENCE - CONFIRM
                                nested_filter += [where]
                                where.nested.path = nested_path
                                where.nested.query.match_all = {}
                                where.nested.inner_hits._source = False
                                where.nested.inner_hits.stored_fields += [
                                    c.es_column
                                ]

                                child = relative_field(
                                    untype_path(c.names[schema.query_path[0]]),
                                    s_column)
                                pull = accumulate_nested_doc(
                                    nested_path,
                                    Variable(
                                        relative_field(
                                            s_column,
                                            unnest_path(nested_path))))
                                new_select.append({
                                    "name": select.name,
                                    "value": select.value,
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": child
                                    },
                                    "pull": pull
                                })
                            else:
                                # LATER COLUMNS OF THE SAME NESTED PATH ONLY ADD
                                # THEIR FIELD TO THE EXISTING CLAUSE
                                nested_selects[
                                    nested_path].nested.inner_hits.stored_fields += [
                                        c.es_column
                                    ]
            else:
                # NO LEAVES FOUND FOR THIS VARIABLE: SELECT A PLACEHOLDER
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    }
                })
            put_index += 1
        else:
            # ANY OTHER EXPRESSION IS SENT AS A SCRIPT FIELD
            painless = select.value.partial_eval().to_es_script(schema)
            es_query.script_fields[literal_field(select.name)] = es_script(
                painless.script(schema))
            new_select.append({
                "name": select.name,
                "pull": jx_expression_to_function("fields." +
                                                  literal_field(select.name)),
                "put": {
                    "name": select.name,
                    "index": put_index,
                    "child": "."
                }
            })
            put_index += 1

    # FILL IN THE pull FOR EVERY SELECT THAT DOES NOT HAVE ONE YET
    for n in new_select:
        if n.pull:
            continue
        elif isinstance(n.value, Variable):
            if es_query.stored_fields[0] == "_source":
                es_query.stored_fields = ["_source"]
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(
                    concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")

    with Timer("call to ES") as call_timer:
        data = es_post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        with Timer("formatter"):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)