def map_to_sql(self, var=""): """ RETURN A MAP FROM THE RELATIVE AND ABSOLUTE NAME SPACE TO COLUMNS """ origin = self.nested_path[0] if startswith_field(var, origin) and origin != var: var = relative_field(var, origin) fact_dict = {} origin_dict = {} for k, cs in self.namespace.items(): for c in cs: if c.jx_type in STRUCT: continue if startswith_field(get_property_name(k), var): origin_dict.setdefault(relative_field(c.name, origin), []).append(c) if origin != c.nested_path[0]: fact_dict.setdefault(c.name, []).append(c) elif origin == var: origin_dict.setdefault( concat_field(var, relative_field(c.name, origin)), []).append(c) if origin != c.nested_path[0]: fact_dict.setdefault(concat_field(var, c.name), []).append(c) return set_default(origin_dict, fact_dict)
def to_bq(self, schema, not_null=False, boolean=False):
    if not is_op(self.term, Variable):
        Log.error("Can only handle Variable")
    term = self.term.var
    prefix_length = len(split_field(term))
    output = wrap(
        [
            {
                "name": join_field(
                    split_field(schema.get_column_name(c))[prefix_length:]
                ),
                "sql": Variable(schema.get_column_name(c)).to_bq(schema)[0].sql,
            }
            for c in schema.columns
            if startswith_field(c.name, term)
            and (
                (
                    c.jx_type not in (EXISTS, OBJECT, NESTED)
                    and startswith_field(schema.nested_path[0], c.nested_path[0])
                )
                or (
                    c.jx_type not in (EXISTS, OBJECT)
                    and schema.nested_path[0] == c.nested_path[0]
                )
            )
        ]
    )
    return output
def __and__(self, other):
    """
    MERGE TWO NestedOp
    """
    if not is_op(other, NestedOp):
        return AndOp([self, other])

    # MERGE
    elif self.path == other.frum:
        return NestedOp(
            self.path,
            listwrap(self.select) + listwrap(other.select),
            AndOp([self.where, other.where]),
            coalesce(self.sort, other.sort),
            coalesce(self.limit, other.limit),
        )

    # NEST
    elif startswith_field(other.frum.var, self.path.var):
        # WE ACHIEVE INTERSECTION BY LIMITING OURSELF TO ONLY THE DEEP OBJECTS
        # WE ASSUME frum SELECTS WHOLE DOCUMENT, SO self.select IS POSSIBLE
        return NestedOp(other, self.select, self.where, self.sort, self.limit)
    elif startswith_field(self.path.var, other.frum.var):
        return NestedOp(self, other.select, other.where, other.sort, other.limit)
    else:
        return AndOp([self, other])
def _indexer(columns, query_path):
    all_names = set(unnest_path(n) for c in columns for n in c.names.values()) | {"."}

    lookup_leaves = {}  # ALL LEAF VARIABLES
    for full_name in all_names:
        for c in columns:
            cname = c.names[query_path]
            nfp = unnest_path(cname)
            if (
                startswith_field(nfp, full_name)
                and c.type not in [EXISTS, OBJECT, NESTED]
                and (c.es_column != "_id" or full_name == "_id")
            ):
                cs = lookup_leaves.setdefault(full_name, set())
                cs.add(c)
                cs = lookup_leaves.setdefault(untype_path(full_name), set())
                cs.add(c)

    lookup_variables = {}  # ALL NOT-NESTED VARIABLES
    for full_name in all_names:
        for c in columns:
            cname = c.names[query_path]
            nfp = unnest_path(cname)
            if (
                startswith_field(nfp, full_name)
                and c.type not in [EXISTS, OBJECT]
                and (c.es_column != "_id" or full_name == "_id")
                and startswith_field(c.nested_path[0], query_path)
            ):
                cs = lookup_variables.setdefault(full_name, set())
                cs.add(c)
                cs = lookup_variables.setdefault(untype_path(full_name), set())
                cs.add(c)

    relative_lookup = {}
    for c in columns:
        try:
            cname = c.names[query_path]
            cs = relative_lookup.setdefault(cname, set())
            cs.add(c)

            ucname = untype_path(cname)
            cs = relative_lookup.setdefault(ucname, set())
            cs.add(c)
        except Exception as e:
            Log.error("Should not happen", cause=e)

    if query_path != ".":
        # ADD ABSOLUTE NAMES TO THE NAMESPACE
        absolute_lookup, more_leaves, more_variables = _indexer(columns, ".")
        for k, cs in absolute_lookup.items():
            if k not in relative_lookup:
                relative_lookup[k] = cs
        for k, cs in more_leaves.items():
            if k not in lookup_leaves:
                lookup_leaves[k] = cs
        for k, cs in more_variables.items():
            if k not in lookup_variables:
                lookup_variables[k] = cs

    return relative_lookup, lookup_leaves, lookup_variables
def __new__(cls, name, bases, dct):
    x = type.__new__(cls, name, bases, dct)
    x.lang = None
    if startswith_field(x.__module__, expression_module):
        # ALL OPS IN expression_module ARE GIVEN AN ID, NO OTHERS
        setattr(x, ID, next_id())
    return x
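# The metaclass above gives every operator class defined under expression_module
# a unique integer id. A sketch of the allocator it assumes; ID, next_id, and
# the module name below are stand-ins for illustration, not the real definitions.

from itertools import count

ID = "id"                                   # attribute name written by the metaclass (assumed)
expression_module = "jx_base.expressions"   # hypothetical module prefix
_id_allocator = count()

def next_id():
    # hand out monotonically increasing integers, one per operator class
    return next(_id_allocator)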
def _get_sql_schema(self, frum):
    """
    :param frum: the path to the nested sub-table
    :return: relative schema for the sub-table; change `es_index` to sql alias
    """
    # WE MUST HAVE THE ALIAS NAMES FOR THE TABLES
    nest_to_alias = {
        nested_path: "__" + unichr(ord('a') + i) + "__"
        for i, (nested_path, sub_table) in enumerate(self.nested_tables.items())
    }

    def paths(field):
        path = split_field(field)
        for i in range(len(path) + 1):
            yield join_field(path[0:i])

    columns = Data()
    for k in set(kk for k in self.columns.keys() for kk in paths(k)):
        for j, c in ((j, cc) for j, c in self.columns.items() for cc in c):
            if startswith_field(j, k):
                if c.type in STRUCT:
                    continue
                c = copy(c)
                c.es_index = nest_to_alias[c.nested_path[0]]
                columns[literal_field(k)] += [c]
    columns._db = self.db
    return unwrap(columns)
def leaves(self, column_name):
    """
    :param column_name:
    :return: ALL COLUMNS THAT START WITH column_name, NOT INCLUDING DEEPER NESTED COLUMNS
    """
    clean_name = unnest_path(column_name)

    if clean_name != column_name:
        clean_name = column_name
        cleaner = lambda x: x
    else:
        cleaner = unnest_path

    columns = self.columns
    # TODO: '.' IMPLIES ALL FIELDS FROM ABSOLUTE PERSPECTIVE, ALL OTHERS ARE A RELATIVE PERSPECTIVE
    # TODO: HOW TO REFER TO FIELDS THAT MAY BE SHADOWED BY A RELATIVE NAME?
    for path in reversed(self.query_path) if clean_name == '.' else self.query_path:
        output = [
            c
            for c in columns
            if (
                (c.name != "_id" or clean_name == "_id")
                and (
                    (c.jx_type == EXISTS and column_name.endswith("." + EXISTS_TYPE))
                    or c.jx_type not in OBJECTS
                    or (clean_name == '.' and c.cardinality == 0)
                )
                and startswith_field(cleaner(relative_field(c.name, path)), clean_name)
            )
        ]
        if output:
            return set(output)
    return set()
def new_leaves(self, column_name):
    """
    :param column_name:
    :return: ALL COLUMNS THAT START WITH column_name, INCLUDING DEEP COLUMNS
    """
    column_name = unnest_path(column_name)
    columns = self.columns
    all_paths = self.snowflake.sorted_query_paths

    output = {}
    for c in columns:
        if c.name == "_id" and column_name != "_id":
            continue
        if c.jx_type in OBJECTS:
            continue
        if c.cardinality == 0:
            continue
        for path in all_paths:
            if not startswith_field(unnest_path(relative_field(c.name, path)), column_name):
                continue
            existing = output.get(path)
            if not existing:
                output[path] = [c]
                continue
            if len(path) > len(c.nested_path[0]):
                continue
            if any(
                "." + t + "." in c.es_column
                for t in (STRING_TYPE, NUMBER_TYPE, BOOLEAN_TYPE)
            ):
                # ELASTICSEARCH field TYPES ARE NOT ALLOWED
                continue
            # ONLY THE DEEPEST COLUMN WILL BE CHOSEN
            output[path].append(c)
    return set(output.values())
def leaves(self, prefix):
    full_name = concat_field(self.nested_path, prefix)
    return set(
        c
        for c in self.snowflake.namespace.columns.find(self.snowflake.fact_name)
        for k in [c.name]
        if startswith_field(k, full_name) and k != GUID or k == full_name
        if c.jx_type not in [OBJECT, EXISTS]
    )
def __getitem__(self, item):
    if isinstance(item, text):
        sub_schema = self.schema
        for n in split_field(item):
            if n in sub_schema.more:
                sub_schema = sub_schema.more.get(n)
            else:
                sub_schema = sub_schema.values.get(n)

        return Table(
            {k: v for k, v in self.values.items() if startswith_field(k, item)},
            {k: v for k, v in self.reps.items() if startswith_field(k, item)},
            {k: v for k, v in self.defs.items() if startswith_field(k, item)},
            self.num_rows,
            sub_schema,
            self.max_definition_level
        )
    elif isinstance(item, slice):
        start = coalesce(item.start, 0)
        stop = coalesce(item.stop, self.num_rows)
        if start == 0 and stop == self.num_rows:
            return self

        first = 0
        last = 0
        counter = 0
        for i, r in enumerate(self.reps):
            if counter == start:
                first = i
            elif counter == stop:
                last = i
                break
            if r == 0:
                counter += 1

        return Table(
            {k: v[first:last] for k, v in self.values.items()},
            {k: v[first:last] for k, v in self.reps.items()},
            {k: v[first:last] for k, v in self.defs.items()},
            stop - start,
            self.schema
        )
def _edges_op(self, query, frum):
    query = query.copy()  # WE WILL BE MARKING UP THE QUERY
    index_to_column = {}  # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE)
    outer_selects = []  # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE)
    frum_path = split_field(frum)
    base_table = join_field(frum_path[0:1])
    path = join_field(frum_path[1:])
    nest_to_alias = {
        nested_path: quote_column("__" + unichr(ord('a') + i) + "__")
        for i, (nested_path, sub_table) in enumerate(self.sf.tables.items())
    }

    schema = self.sf.tables[relative_field(frum, self.sf.fact)].schema

    tables = []
    for n, a in nest_to_alias.items():
        if startswith_field(path, n):
            tables.append({"nest": n, "alias": a})
    tables = jx.sort(tables, {"value": {"length": "nest"}})

    from_sql = quote_column(join_field([base_table] + split_field(tables[0].nest))) + tables[0].alias
    for previous, t in zip(tables, tables[1::]):
        from_sql += (
            SQL_LEFT_JOIN + quote_column(concat_field(base_table, t.nest)) + t.alias
            + SQL_ON
            + join_column(t.alias, quoted_PARENT)
            + " = "
            + join_column(previous.alias, quoted_UID)
        )

    main_filter = query.where.to_sql(schema, boolean=True)[0].sql.b

    # SHIFT THE COLUMN DEFINITIONS BASED ON THE NESTED QUERY DEPTH
    ons = []
    join_types = []
    wheres = []
    null_ons = [EXISTS_COLUMN + SQL_IS_NULL]
    groupby = []
    null_groupby = []
    orderby = []
    domains = []

    select_clause = [SQL_ONE + EXISTS_COLUMN] + [quote_column(c.es_column) for c in self.sf.tables['.'].columns]

    for edge_index, query_edge in enumerate(query.edges):
        edge_alias = quote_column("e" + text_type(edge_index))

        if query_edge.value:
            edge_values = [p for c in query_edge.value.to_sql(schema).sql for p in c.items()]
        elif not query_edge.value and any(query_edge.domain.partitions.where):
            case = SQL_CASE
            for pp, p in enumerate(query_edge.domain.partitions):
                w = p.where.to_sql(schema)[0].sql.b
                t = quote_value(pp)
                case += SQL_WHEN + w + SQL_THEN + t
            case += SQL_ELSE + SQL_NULL + SQL_END  # QUOTE VALUE WITH LENGTH OF PARTITIONS
            edge_values = [("n", case)]
        elif query_edge.range:
            edge_values = (
                query_edge.range.min.to_sql(schema)[0].sql.items()
                + query_edge.range.max.to_sql(schema)[0].sql.items()
            )
def _nest_column(self, column, new_path):
    destination_table = join_field([self.name] + split_field(new_path))
    existing_table = join_field([self.name] + split_field(column.nested_path[0]))

    # FIND THE INNER COLUMNS WE WILL BE MOVING
    new_columns = {}
    for cname, cols in self.columns.items():
        if startswith_field(cname, column.names[self.name]):
            new_columns[cname] = set()
            for col in cols:
                new_columns[cname].add(col)
                col.nested_path = [new_path] + col.nested_path

    # TODO: IF THERE ARE CHILD TABLES, WE MUST UPDATE THEIR RELATIONS TOO?

    # DEFINE A NEW TABLE?
    # LOAD THE COLUMNS
    command = "PRAGMA table_info(" + quote_table(destination_table) + ")"
    details = self.db.query(command)
    if details.data:
        raise Log.error("not expected, new nesting!")
    from jx_sqlite.query_table import QueryTable
    self.nested_tables[new_path] = sub_table = QueryTable(destination_table, self.db, exists=False)

    self.db.execute(
        "ALTER TABLE " + quote_table(sub_table.name) + " ADD COLUMN " + quoted_PARENT + " INTEGER"
    )
    self.db.execute(
        "ALTER TABLE " + quote_table(sub_table.name) + " ADD COLUMN " + quote_table(ORDER) + " INTEGER"
    )
    for cname, cols in new_columns.items():
        for c in cols:
            sub_table.add_column(c)

    # TEST IF THERE IS ANY DATA IN THE NEW NESTED ARRAY
    all_cols = [c for _, cols in sub_table.columns.items() for c in cols]
    if not all_cols:
        has_nested_data = "0"
    elif len(all_cols) == 1:
        has_nested_data = _quote_column(all_cols[0]) + " IS NOT NULL"
    else:
        has_nested_data = (
            "COALESCE("
            + ",".join(_quote_column(c) for c in all_cols)
            + ") IS NOT NULL"
        )

    # FILL TABLE WITH EXISTING COLUMN DATA
    command = (
        "INSERT INTO " + quote_table(destination_table) + "(\n"
        + ",\n".join(
            [quoted_UID, quoted_PARENT, quote_table(ORDER)]
            + [_quote_column(c) for _, cols in sub_table.columns.items() for c in cols]
        )
        + "\n)\n"
        + "\nSELECT\n"
        + ",".join(
            [quoted_UID, quoted_UID, "0"]
            + [_quote_column(c) for _, cols in sub_table.columns.items() for c in cols]
        )
        + "\nFROM\n" + quote_table(existing_table)
        + "\nWHERE\n" + has_nested_data
    )
    self.db.execute(command)
def needed(name, required):
    """
    FOR EACH FIELD IN required, RETURN IT RELATIVE TO name, OR None IF NOT UNDER name
    """
    return [
        relative_field(r, name) if r and startswith_field(r, name) else None
        for r in required
    ]
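# A quick usage sketch of needed(), reusing the helper sketches above; the
# sample field names are hypothetical. Fields under "a" come back relative to
# "a"; everything else maps to None, so the output stays aligned with `required`.
assert needed("a", ["a.b.c", "x.y"]) == ["b.c", None]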
def place(parent_doc_details):
    if startswith_field(step, parent_doc_details['nested_path'][0]):
        for c in parent_doc_details['children']:
            if place(c):
                return True
        parent_doc_details['children'].append(nested_doc_details)
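# place() walks a tree of doc details to find the deepest ancestor whose
# nested_path prefixes `step`, then attaches the new level there; it relies on
# `step` and `nested_doc_details` from its enclosing scope. A self-contained
# sketch of the same placement on plain dicts (names here are stand-ins):

def place_in_tree(node, step, new_details):
    prefix = node["nested_path"][0]
    if not (prefix == "." or step == prefix or step.startswith(prefix + ".")):
        return False
    # prefer attaching under a deeper matching child
    for child in node["children"]:
        if place_in_tree(child, step, new_details):
            return True
    node["children"].append(new_details)
    return True

root = {"nested_path": ["."], "children": []}
a = {"nested_path": ["a"], "children": []}
place_in_tree(root, "a", a)                                           # lands under root
place_in_tree(root, "a.b", {"nested_path": ["a.b"], "children": []})  # lands under "a"
assert root["children"] == [a] and len(a["children"]) == 1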
def leaves(self, prefix):
    head = self.namespace.get(prefix, None)
    if not head:
        return Null
    full_name = first(head).name
    return set(
        c
        for k, cs in self.namespace.items()
        if startswith_field(k, full_name) and k != GUID or k == full_name
        for c in cs
        if c.jx_type not in [OBJECT, EXISTS]
    )
def to_es(self, schema, query_path="."): output = Aggs.to_es(self, schema, self.path) if query_path == self.path: Log.error("this should have been cancelled out") elif startswith_field(self.path, query_path): output['nested'] = {"path": self.path} else: output["reverse_nested"] = {"path": None if self.path == "." else self.path} return output
def __init__(self, frum, nests):
    Expression.__init__(self, nests)
    self.frum = frum
    self.nests = nests

    last = "."
    for n in reversed(nests):
        path = n.path.var
        if not startswith_field(path, last):
            Log.error("Expecting nests to be reverse nested order")
        last = path
def __getitem__(self, item):
    if isinstance(item, text_type):
        sub_schema = self.schema
        for n in split_field(item):
            if n in sub_schema.more:
                sub_schema = sub_schema.more.get(n)
            else:
                sub_schema = sub_schema.values.get(n)

        return Table(
            {k: v for k, v in self.values.items() if startswith_field(k, item)},
            {k: v for k, v in self.reps.items() if startswith_field(k, item)},
            {k: v for k, v in self.defs.items() if startswith_field(k, item)},
            self.num_rows,
            sub_schema,
            self.max_definition_level
        )
    elif isinstance(item, slice):
        start = coalesce(item.start, 0)
        stop = coalesce(item.stop, self.num_rows)
        if start == 0 and stop == self.num_rows:
            return self

        first = 0
        last = 0
        counter = 0
        for i, r in enumerate(self.reps):
            if counter == start:
                first = i
            elif counter == stop:
                last = i
                break
            if r == 0:
                counter += 1

        return Table(
            {k: v[first:last] for k, v in self.values.items()},
            {k: v[first:last] for k, v in self.reps.items()},
            {k: v[first:last] for k, v in self.defs.items()},
            stop - start,
            self.schema
        )
def execute_query(self, query):
    try:
        if startswith_field(query["from"], self._index.name):
            return self._index.query(deepcopy(query))
        elif query["from"] == "meta.columns":
            return self._index.query_metadata(deepcopy(query))
        else:
            Log.error("Do not know how to handle")
    except Exception as e:
        Log.error("Failed query", e)
def __init__(self, db):
    self.db = db
    self._snowflakes = {}  # MAP FROM BASE TABLE TO LIST OF NESTED TABLES
    self._columns = ColumnList()

    # FIND ALL TABLES
    result = self.db.query("SELECT * FROM sqlite_master WHERE type='table' ORDER BY name")
    tables = wrap([{k: d[i] for i, k in enumerate(result.header)} for d in result.data])
    last_nested_path = []
    for table in tables:
        if table.name.startswith("__"):
            continue
        base_table, nested_path = tail_field(table.name)

        # FIND COMMON NESTED PATH SUFFIX
        for i, p in enumerate(last_nested_path):
            if startswith_field(nested_path, p):
                last_nested_path = last_nested_path[i:]
                break
        else:
            last_nested_path = []
        full_nested_path = [nested_path] + last_nested_path
        nested_tables = self._snowflakes.setdefault(base_table, [nested_path] + last_nested_path)
        nested_tables.append(jx_base.TableDesc(name=table.name, nested_path=full_nested_path))

        # LOAD THE COLUMNS
        command = "PRAGMA table_info" + sql_iso(quote_column(table.name))
        details = self.db.query(command)

        for cid, name, dtype, notnull, dfft_value, pk in details.data:
            if name.startswith("__"):
                continue
            cname, ctype = untyped_column(name)
            self._columns.add(Column(
                name=cname,  # I THINK COLUMNS HAVE THEIR FULL PATH
                jx_type=coalesce(ctype, {"TEXT": "string", "REAL": "number", "INTEGER": "integer"}.get(dtype)),
                nested_path=full_nested_path,
                es_type=dtype,
                es_column=name,
                es_index=table.name
            ))
        last_nested_path = full_nested_path
def map_to_sql(self, var=""): """ RETURN A MAP FROM THE RELATIVE AND ABSOLUTE NAME SPACE TO COLUMNS """ origin = self.nested_path[0] if startswith_field(var, origin) and origin != var: var = relative_field(var, origin) fact_dict = {} origin_dict = {} for k, cs in self.map.items(): for c in cs: if c.type not in STRUCT: if (startswith_field(get_property_name(k), var)): if c.names[origin] in origin_dict: origin_dict[c.names[origin]].append(c) else: origin_dict[c.names[origin]] = [c] if origin != c.nested_path[0]: if c.names["."] in fact_dict: fact_dict[c.names["."]].append(c) else: fact_dict[c.names["."]] = [c] elif origin == var: if concat_field(var, c.names[origin]) in origin_dict: origin_dict[concat_field( var, c.names[origin])].append(c) else: origin_dict[concat_field(var, c.names[origin])] = [c] if origin != c.nested_path[0]: if c.names["."] in fact_dict: fact_dict[concat_field(var, c.names["."])].append(c) else: fact_dict[concat_field(var, c.names["."])] = [c] return set_default(origin_dict, fact_dict)
def _load_from_database(self):
    # FIND ALL TABLES
    result = self.db.query(sql_query({
        "from": "sqlite_master",
        "where": {"eq": {"type": "table"}},
        "orderby": "name"
    }))
    tables = wrap([{k: d for k, d in zip(result.header, row)} for row in result.data])
    last_nested_path = ["."]
    for table in tables:
        if table.name.startswith("__"):
            continue
        base_table, nested_path = tail_field(table.name)

        # FIND COMMON NESTED PATH SUFFIX
        if nested_path == ".":
            last_nested_path = []
        else:
            for i, p in enumerate(last_nested_path):
                if startswith_field(nested_path, p):
                    last_nested_path = last_nested_path[i:]
                    break
            else:
                last_nested_path = []
        full_nested_path = [nested_path] + last_nested_path
        self._snowflakes[literal_field(base_table)] += [full_nested_path]

        # LOAD THE COLUMNS
        details = self.db.about(table.name)

        for cid, name, dtype, notnull, dfft_value, pk in details:
            if name.startswith("__"):
                continue
            cname, ctype = untyped_column(name)
            self.add(Column(
                name=cname,
                jx_type=coalesce(sql_type_to_json_type.get(ctype), IS_NULL),
                nested_path=full_nested_path,
                es_type=dtype,
                es_column=name,
                es_index=table.name,
                last_updated=Date.now()
            ))
        last_nested_path = full_nested_path
def read_db(self):
    """
    PULL SCHEMA FROM DATABASE, BUILD THE MODEL
    :return: True IF ANY TABLES WERE FOUND
    """

    # FIND ALL TABLES
    result = self.db.query("SELECT * FROM sqlite_master WHERE type='table' ORDER BY name")
    tables = wrap([{k: d[i] for i, k in enumerate(result.header)} for d in result.data])

    tables_found = False
    for table in tables:
        if table.name.startswith("__"):
            continue
        tables_found = True
        nested_path = [
            join_field(split_field(tab.name)[1:])
            for tab in jx.reverse(tables)
            if startswith_field(table.name, tab.name)
        ]
        self.add_table_to_schema(nested_path)

        # LOAD THE COLUMNS
        command = "PRAGMA table_info" + sql_iso(quote_column(table.name))
        details = self.db.query(command)

        for cid, name, dtype, notnull, dfft_value, pk in details.data:
            if name.startswith("__"):
                continue
            cname, ctype = untyped_column(name)
            column = Column(
                names={np: relative_field(cname, np) for np in nested_path},
                type=coalesce(ctype, {"TEXT": "string", "REAL": "number", "INTEGER": "integer"}.get(dtype)),
                nested_path=nested_path,
                es_column=name,
                es_index=table.name
            )
            self.add_column_to_schema(column)

    return tables_found
def __init__(self, frum, nests):
    """
    A SEQUENCE OF NESTED (INNER) JOINS FOR A QUERY
    :param frum: THE TABLE OF DOCUMENTS
    :param nests: LIST OF INNER JOINS (deepest first)
    """
    Expression.__init__(self, nests)
    self.frum = frum
    self.nests = nests

    last = "."
    for n in reversed(nests):
        path = n.path.var
        if not startswith_field(path, last):
            Log.error("Expecting nests to be reverse nested order")
        last = path
def split_nested_inner_variables(where, focal_path, var_to_columns):
    """
    SOME VARIABLES ARE BOTH NESTED AND INNER, EXPAND QUERY TO HANDLE BOTH
    :param where:
    :param focal_path:
    :param var_to_columns:
    :return:
    """
    wheres = [where]
    # WE DO THIS EXPANSION TO CAPTURE A VARIABLE OVER DIFFERENT NESTED LEVELS
    # EXPAND VARS TO COLUMNS, MULTIPLY THE EXPRESSIONS
    for v, cols in var_to_columns.items():
        more_exprs = []
        if not cols:
            for e in wheres:
                more_exprs.append(e.map({v: NULL}))
        else:
            for c in cols:
                deepest = c.nested_path[0]
                for e in wheres:
                    if startswith_field(focal_path, deepest):
                        more_exprs.append(e.map(
                            {v: Variable(c.es_column, type=c.jx_type, multi=c.multi)}
                        ))
                    else:
                        more_exprs.append(e.map({
                            v: NestedOp(
                                path=Variable(deepest),
                                select=Variable(c.es_column),
                                where=Variable(c.es_column).exists(),
                            )
                        }))
        wheres = more_exprs

    var_to_columns = {c.es_column: [c] for cs in var_to_columns.values() for c in cs}
    return OrOp(wheres)
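# The loop above is a cross-product: a variable with N candidate columns
# multiplies the list of candidate where-clauses by N, and the results are
# OR'ed together. A toy sketch of the same fan-out on plain strings, with
# hypothetical names, showing how the list grows:

def fan_out(wheres, var, replacements):
    # one rewritten clause per (existing clause, candidate replacement) pair
    return [w.replace(var, r) for w in wheres for r in replacements]

wheres = ["x = 1 AND y = 2"]
wheres = fan_out(wheres, "x", ["a.x", "a.b.x"])  # x exists at two nesting depths
wheres = fan_out(wheres, "y", ["a.y"])
assert wheres == ["a.x = 1 AND a.y = 2", "a.b.x = 1 AND a.y = 2"]
# the real code then combines the candidates: OrOp(wheres)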
def leaves(self, column_name):
    """
    :param column_name:
    :return: ALL COLUMNS THAT START WITH column_name, NOT INCLUDING DEEPER NESTED COLUMNS
    """
    column_name = unnest_path(column_name)
    columns = self.columns
    deep_path = self.query_path[0]
    for path in self.query_path:
        output = [
            c
            for c in columns
            if (
                (c.names['.'] != "_id" or column_name == "_id")
                and c.jx_type not in OBJECTS
                and startswith_field(unnest_path(c.names[path]), column_name)
            )
        ]
        if output:
            return output
    return []
def _load_from_database(self):
    # FIND ALL TABLES
    result = self.db.query("SELECT * FROM sqlite_master WHERE type='table' ORDER BY name")
    tables = wrap([{k: d for k, d in zip(result.header, row)} for row in result.data])
    last_nested_path = []
    for table in tables:
        if table.name.startswith("__"):
            continue
        base_table, nested_path = tail_field(table.name)

        # FIND COMMON NESTED PATH SUFFIX
        for i, p in enumerate(last_nested_path):
            if startswith_field(nested_path, p):
                last_nested_path = last_nested_path[i:]
                break
        else:
            last_nested_path = []
        full_nested_path = [nested_path] + last_nested_path
        self._snowflakes[literal_field(base_table)] += [full_nested_path]

        # LOAD THE COLUMNS
        command = "PRAGMA table_info" + sql_iso(quote_column(table.name))
        details = self.db.query(command)

        for cid, name, dtype, notnull, dfft_value, pk in details.data:
            if name.startswith("__"):
                continue
            cname, ctype = untyped_column(name)
            self.add(Column(
                name=cname,
                jx_type=coalesce(sql_type_to_json_type.get(ctype), IS_NULL),
                nested_path=full_nested_path,
                es_type=dtype,
                es_column=name,
                es_index=table.name,
                last_updated=Date.now()
            ))
        last_nested_path = full_nested_path
def to_sql(self, schema, not_null=False, boolean=False):
    if not isinstance(self.term, Variable):
        Log.error("Can only handle Variable")
    term = self.term.var
    prefix_length = len(split_field(term))
    db_columns = []
    for n, cols in schema.map_to_sql(term).items():
        for c in cols:
            col = schema.get_column_name(c)
            if startswith_field(col, term):
                db_columns.append({
                    "name": join_field(split_field(col)[prefix_length:]),
                    "sql": Variable(col).to_sql(schema)[0].sql
                })
            else:
                db_columns.append({
                    "name": col,
                    "sql": Variable(col).to_sql(schema)[0].sql
                })
    return wrap(db_columns)
def query(self, query):
    """
    :param query: JSON Query Expression, SET `format="container"` TO MAKE NEW TABLE OF RESULT
    :return:
    """
    if not startswith_field(query['from'], self.sf.fact):
        Log.error("Expecting table, or some nested table")
    frum, query['from'] = query['from'], self
    table = self.sf.tables[relative_field(frum, self.sf.fact)]
    schema = table.schema
    query = QueryOp.wrap(query, table=table, schema=schema)
    new_table = "temp_" + unique_name()

    if query.format == "container":
        create_table = "CREATE TABLE " + quote_column(new_table) + " AS "
    else:
        create_table = ""

    if query.groupby and query.format != "cube":
        op, index_to_columns = self._groupby_op(query, frum)
        command = create_table + op
    elif query.groupby:
        query.edges, query.groupby = query.groupby, query.edges
        op, index_to_columns = self._edges_op(query, frum)
        command = create_table + op
        query.edges, query.groupby = query.groupby, query.edges
    elif query.edges or any(a != "none" for a in listwrap(query.select).aggregate):
        op, index_to_columns = self._edges_op(query, frum)
        command = create_table + op
    else:
        op = self._set_op(query, frum)
        return op

    result = self.db.query(command)

    if query.format == "container":
        output = QueryTable(new_table, db=self.db, uid=self.uid, exists=True)
    elif query.format == "cube" or (not query.format and query.edges):
        column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1)
        for c in index_to_columns.values():
            column_names[c.push_column] = c.push_column_name

        if len(query.edges) == 0 and len(query.groupby) == 0:
            data = {n: Data() for n in column_names}
            for s in index_to_columns.values():
                data[s.push_name][s.push_child] = unwrap(s.pull(result.data[0]))
            if isinstance(query.select, list):
                select = [{"name": s.name} for s in query.select]
            else:
                select = {"name": query.select.name}
            return Data(
                data=unwrap(data),
                select=select,
                meta={"format": "cube"}
            )

        if not result.data:
            edges = []
            dims = []
            for i, e in enumerate(query.edges + query.groupby):
                allowNulls = coalesce(e.allowNulls, True)

                if e.domain.type == "set" and e.domain.partitions:
                    domain = SimpleSetDomain(partitions=e.domain.partitions.name)
                elif e.domain.type == "range":
                    domain = e.domain
                elif isinstance(e.value, TupleOp):
                    pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name], "push_child").pull
                    parts = [tuple(p(d) for p in pulls) for d in result.data]
                    domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
                else:
                    domain = SimpleSetDomain(partitions=[])

                dims.append(1 if allowNulls else 0)
                edges.append(Data(
                    name=e.name,
                    allowNulls=allowNulls,
                    domain=domain
                ))

            data = {}
            for si, s in enumerate(listwrap(query.select)):
                if s.aggregate == "count":
                    data[s.name] = Matrix(dims=dims, zeros=0)
                else:
                    data[s.name] = Matrix(dims=dims)

            if isinstance(query.select, list):
                select = [{"name": s.name} for s in query.select]
            else:
                select = {"name": query.select.name}

            return Data(
                meta={"format": "cube"},
                edges=edges,
                select=select,
                data={k: v.cube for k, v in data.items()}
            )

        columns = None

        edges = []
        dims = []
        for g in query.groupby:
            g.is_groupby = True

        for i, e in enumerate(query.edges + query.groupby):
            allowNulls = coalesce(e.allowNulls, True)

            if e.domain.type == "set" and e.domain.partitions:
                domain = SimpleSetDomain(partitions=e.domain.partitions.name)
            elif e.domain.type == "range":
                domain = e.domain
            elif e.domain.type == "time":
                domain = wrap(mo_json.scrub(e.domain))
            elif e.domain.type == "duration":
                domain = wrap(mo_json.scrub(e.domain))
            elif isinstance(e.value, TupleOp):
                pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name], "push_child").pull
                parts = [tuple(p(d) for p in pulls) for d in result.data]
                domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
            else:
                if not columns:
                    columns = zip(*result.data)
                parts = set(columns[i])
                if e.is_groupby and None in parts:
                    allowNulls = True
                parts -= {None}

                if query.sort[i].sort == -1:
                    domain = SimpleSetDomain(partitions=wrap(sorted(parts, reverse=True)))
                else:
                    domain = SimpleSetDomain(partitions=jx.sort(parts))

            dims.append(len(domain.partitions) + (1 if allowNulls else 0))
            edges.append(Data(
                name=e.name,
                allowNulls=allowNulls,
                domain=domain
            ))

        data_cubes = {}
        for si, s in enumerate(listwrap(query.select)):
            if s.aggregate == "count":
                data_cubes[s.name] = Matrix(dims=dims, zeros=0)
            else:
                data_cubes[s.name] = Matrix(dims=dims)

        r2c = index_to_coordinate(dims)  # WORKS BECAUSE THE DATABASE SORTED THE EDGES TO CONFORM
        for rownum, row in enumerate(result.data):
            coord = r2c(rownum)

            for i, s in enumerate(index_to_columns.values()):
                if s.is_edge:
                    continue
                if s.push_child == ".":
                    data_cubes[s.push_name][coord] = s.pull(row)
                else:
                    data_cubes[s.push_name][coord][s.push_child] = s.pull(row)

        if query.select == None:
            select = Null
        elif isinstance(query.select, list):
            select = [{"name": s.name} for s in query.select]
        else:
            select = {"name": query.select.name}

        return Data(
            meta={"format": "cube"},
            edges=edges,
            select=select,
            data={k: v.cube for k, v in data_cubes.items()}
        )
    elif query.format == "table" or (not query.format and query.groupby):
        column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1)
        for c in index_to_columns.values():
            column_names[c.push_column] = c.push_column_name
        data = []
        for d in result.data:
            row = [None for _ in column_names]
            for s in index_to_columns.values():
                if s.push_child == ".":
                    row[s.push_column] = s.pull(d)
                elif s.num_push_columns:
                    tuple_value = row[s.push_column]
                    if tuple_value == None:
                        tuple_value = row[s.push_column] = [None] * s.num_push_columns
                    tuple_value[s.push_child] = s.pull(d)
                elif row[s.push_column] == None:
                    row[s.push_column] = Data()
                    row[s.push_column][s.push_child] = s.pull(d)
                else:
                    row[s.push_column][s.push_child] = s.pull(d)
            data.append(tuple(unwrap(r) for r in row))

        output = Data(
            meta={"format": "table"},
            header=column_names,
            data=data
        )
    elif query.format == "list" or (not query.edges and not query.groupby):
        if not query.edges and not query.groupby and any(listwrap(query.select).aggregate):
            if isinstance(query.select, list):
                data = Data()
                for c in index_to_columns.values():
                    if c.push_child == ".":
                        if data[c.push_name] == None:
                            data[c.push_name] = c.pull(result.data[0])
                        elif isinstance(data[c.push_name], list):
                            data[c.push_name].append(c.pull(result.data[0]))
                        else:
                            data[c.push_name] = [data[c.push_name], c.pull(result.data[0])]
                    else:
                        data[c.push_name][c.push_child] = c.pull(result.data[0])

                output = Data(
                    meta={"format": "value"},
                    data=data
                )
            else:
                data = Data()
                for s in index_to_columns.values():
                    if not data[s.push_child]:
                        data[s.push_child] = s.pull(result.data[0])
                    else:
                        data[s.push_child] += [s.pull(result.data[0])]
                output = Data(
                    meta={"format": "value"},
                    data=unwrap(data)
                )
        else:
            data = []
            for rownum in result.data:
                row = Data()
                for c in index_to_columns.values():
                    if c.push_child == ".":
                        row[c.push_name] = c.pull(rownum)
                    elif c.num_push_columns:
                        tuple_value = row[c.push_name]
                        if not tuple_value:
                            tuple_value = row[c.push_name] = [None] * c.num_push_columns
                        tuple_value[c.push_child] = c.pull(rownum)
                    else:
                        row[c.push_name][c.push_child] = c.pull(rownum)
                data.append(row)

            output = Data(
                meta={"format": "list"},
                data=data
            )
    else:
        Log.error("unknown format {{format}}", format=query.format)

    return output
def _none_to_column(schema, path, rep_level, def_level):
    for full_path in all_schema.leaves:
        if startswith_field(full_path, path):
            reps[full_path].append(rep_level)
            defs[full_path].append(def_level)
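# _none_to_column records Dremel-style repetition/definition levels for every
# leaf under a missing branch; `all_schema`, `reps`, and `defs` come from its
# enclosing scope. A toy illustration of that bookkeeping, with hypothetical
# leaf paths and levels:

from collections import defaultdict

reps = defaultdict(list)
defs = defaultdict(list)
leaves = ["a.b", "a.c"]

def none_to_column(path, rep_level, def_level):
    # a missing object at `path` contributes one (rep, def) pair to every leaf below it
    for full_path in leaves:
        if full_path == path or full_path.startswith(path + "."):
            reps[full_path].append(rep_level)
            defs[full_path].append(def_level)

none_to_column("a", 0, 0)  # the whole "a" object is absent from this record
assert reps["a.b"] == [0] and defs["a.c"] == [0]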
def __init__(
    self,
    host,
    index,
    type=None,
    name=None,
    port=9200,
    read_only=True,
    timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
    wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
    typed=None,
    kwargs=None
):
    Container.__init__(self)
    if not container.config.default:
        container.config.default = {
            "type": "elasticsearch",
            "settings": unwrap(kwargs)
        }
    self.settings = kwargs
    self.name = name = coalesce(name, index)
    if read_only:
        self.es = elasticsearch.Alias(alias=index, kwargs=kwargs)
    else:
        self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(read_only=read_only, kwargs=kwargs)

    self._namespace = ElasticsearchMetadata(kwargs=kwargs)
    self.settings.type = self.es.settings.type
    self.edges = Data()
    self.worker = None

    columns = self.snowflake.columns  # ABSOLUTE COLUMNS
    is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

    if typed == None:
        # SWITCH ON TYPED MODE
        self.typed = is_typed
    else:
        if is_typed != typed:
            Log.error("Expecting given typed {{typed}} to match {{is_typed}}", typed=typed, is_typed=is_typed)
        self.typed = typed

    if not typed:
        # ADD EXISTENCE COLUMNS
        all_paths = {".": None}  # MAP FROM path TO parent TO MAKE A TREE

        def nested_path_of(v):
            if not v:
                return []
            else:
                return [v] + nested_path_of(all_paths[v])

        all = sort_using_key(
            set(step for path in self.snowflake.query_paths for step in path),
            key=lambda p: len(split_field(p))
        )
        for step in sorted(all):
            if step in all_paths:
                continue
            else:
                best = '.'
                for candidate in all_paths.keys():
                    if startswith_field(step, candidate):
                        if startswith_field(candidate, best):
                            best = candidate
                all_paths[step] = best
        for p in all_paths.keys():
            nested_path = nested_path_of(all_paths[p])
            if not nested_path:
                nested_path = ['.']
            self.namespace.meta.columns.add(Column(
                name=p,
                es_column=p,
                es_index=self.name,
                es_type=OBJECT,
                jx_type=EXISTS,
                nested_path=nested_path,
                last_updated=Date.now()
            ))
def _flatten(data, uid, parent_id, order, full_path, nested_path, row=None, guid=None):
    """
    :param data: the data we are pulling apart
    :param uid: the uid we are giving this doc
    :param parent_id: the parent id of this (sub)doc
    :param order: the number of siblings before this one
    :param full_path: path to this (sub)doc
    :param nested_path: list of paths, deepest first
    :param row: we will be filling this
    :return:
    """
    table = concat_field(self.sf.fact, nested_path[0])
    insertion = doc_collection[nested_path[0]]
    if not row:
        row = {GUID: guid, UID: uid, PARENT: parent_id, ORDER: order}
        insertion.rows.append(row)

    if not isinstance(data, Mapping):
        data = {".": data}

    for k, v in data.items():
        insertion = doc_collection[nested_path[0]]
        cname = concat_field(full_path, literal_field(k))
        value_type = get_type(v)
        if value_type is None:
            continue

        if value_type in STRUCT:
            c = unwraplist([cc for cc in abs_schema[cname] if cc.type in STRUCT])
        else:
            c = unwraplist([cc for cc in abs_schema[cname] if cc.type == value_type])

        if not c:
            # WHAT IS THE NESTING LEVEL FOR THIS PATH?
            deeper_nested_path = "."
            for path, _ in nested_tables.items():
                if startswith_field(cname, path) and len(deeper_nested_path) < len(path):
                    deeper_nested_path = path

            c = Column(
                names={".": cname},
                type=value_type,
                es_column=typed_column(cname, value_type),
                es_index=table,
                nested_path=nested_path
            )
            abs_schema.add(cname, c)
            if value_type == "nested":
                nested_tables[cname] = "fake table"

            required_changes.append({"add": c})

            # INSIDE IF BLOCK BECAUSE WE DO NOT WANT IT TO ADD WHAT WE columns.get() ALREADY
            insertion.active_columns.add(c)
        elif c.type == "nested" and value_type == "object":
            value_type = "nested"
            v = [v]
        elif len(c.nested_path) < len(nested_path):
            from_doc = doc_collection.get(c.nested_path[0], None)
            column = c.es_column
            from_doc.active_columns.remove(c)
            abs_schema.remove(cname, c)
            required_changes.append({"nest": (c, nested_path[0])})
            deep_c = Column(
                names={".": cname},
                type=value_type,
                es_column=typed_column(cname, value_type),
                es_index=table,
                nested_path=nested_path
            )
            abs_schema.add(cname, deep_c)
            insertion.active_columns.add(deep_c)

            for r in from_doc.rows:
                r1 = unwrap(r)
                if column in r1:
                    row1 = {
                        UID: self.next_uid(),
                        PARENT: r1["__id__"],
                        ORDER: 0,
                        column: r1[column]
                    }
                    insertion.rows.append(row1)
        elif len(c.nested_path) > len(nested_path):
            insertion = doc_collection[c.nested_path[0]]
            row = {UID: self.next_uid(), PARENT: uid, ORDER: order}
            insertion.rows.append(row)

        # BE SURE TO NEST VALUES, IF NEEDED
        if value_type == "nested":
            row[c.es_column] = "."
            deeper_nested_path = [cname] + nested_path
            insertion = doc_collection.get(cname, None)
            if not insertion:
                insertion = doc_collection[cname] = Data(active_columns=set(), rows=[])
            for i, r in enumerate(v):
                child_uid = self.next_uid()
                _flatten(r, child_uid, uid, i, cname, deeper_nested_path)
        elif value_type == "object":
            row[c.es_column] = "."
            _flatten(v, uid, parent_id, order, cname, nested_path, row=row)
        elif c.type:
            row[c.es_column] = v
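# To make the shape of _flatten's output concrete: a standalone toy flattener,
# assuming hypothetical UID/PARENT/ORDER bookkeeping. It shows how one nested
# document becomes a fact row plus child-table rows, the same row layout the
# code above maintains.

from itertools import count

UID, PARENT, ORDER = "__id__", "__parent__", "__order__"

def flatten(doc, tables, uids, path=".", parent=None, order=0):
    # one output row per (sub)document; nested arrays get their own child table
    row = {UID: next(uids), PARENT: parent, ORDER: order}
    tables.setdefault(path, []).append(row)
    for k, v in doc.items():
        if isinstance(v, list):
            # each element becomes a row in the child table, pointing back at
            # this row via PARENT, with ORDER preserving its position
            child_path = k if path == "." else path + "." + k
            for i, child in enumerate(v):
                flatten(child, tables, uids, child_path, row[UID], i)
        else:
            row[k] = v
    return tables

tables = flatten({"a": 1, "b": [{"c": 2}, {"c": 3}]}, {}, count())
assert tables["."][0]["a"] == 1
assert [r["c"] for r in tables["b"]] == [2, 3]
assert all(r[PARENT] == tables["."][0][UID] for r in tables["b"])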
def _indexer(columns, query_path):
    all_names = set(unnest_path(n) for c in columns for n in c.names.values()) | {"."}

    lookup_leaves = {}  # ALL LEAF VARIABLES
    for full_name in all_names:
        for c in columns:
            cname = c.names[query_path]
            nfp = unnest_path(cname)
            if (
                startswith_field(nfp, full_name)
                and c.es_type not in [EXISTS, OBJECT, NESTED]
                and (c.es_column != "_id" or full_name == "_id")
            ):
                cs = lookup_leaves.setdefault(full_name, set())
                cs.add(c)
                cs = lookup_leaves.setdefault(untype_path(full_name), set())
                cs.add(c)

    lookup_variables = {}  # ALL NOT-NESTED VARIABLES
    for full_name in all_names:
        for c in columns:
            cname = c.names[query_path]
            nfp = unnest_path(cname)
            if (
                startswith_field(nfp, full_name)
                and c.es_type not in [EXISTS, OBJECT]
                and (c.es_column != "_id" or full_name == "_id")
                and startswith_field(c.nested_path[0], query_path)
            ):
                cs = lookup_variables.setdefault(full_name, set())
                cs.add(c)
                cs = lookup_variables.setdefault(untype_path(full_name), set())
                cs.add(c)

    relative_lookup = {}
    for c in columns:
        try:
            cname = c.names[query_path]
            cs = relative_lookup.setdefault(cname, set())
            cs.add(c)

            ucname = untype_path(cname)
            cs = relative_lookup.setdefault(ucname, set())
            cs.add(c)
        except Exception as e:
            Log.error("Should not happen", cause=e)

    if query_path != ".":
        # ADD ABSOLUTE NAMES TO THE NAMESPACE
        absolute_lookup, more_leaves, more_variables = _indexer(columns, ".")
        for k, cs in absolute_lookup.items():
            if k not in relative_lookup:
                relative_lookup[k] = cs
        for k, cs in more_leaves.items():
            if k not in lookup_leaves:
                lookup_leaves[k] = cs
        for k, cs in more_variables.items():
            if k not in lookup_variables:
                lookup_variables[k] = cs

    return relative_lookup, lookup_leaves, lookup_variables
def es_deepop(es, query):
    schema = query.frum.schema
    query_path = schema.query_path[0]

    # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions
    # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER: ES ALLOWS
    # {"inner_hit":{"script_fields":[{"script":""}...]}}, BUT THEN YOU
    # LOSE "_source" BUT GAIN "fields", FORCING ALL FIELDS TO BE EXPLICIT
    post_expressions = {}
    es_query, es_filters = es_query_template(query_path)

    # SPLIT WHERE CLAUSE BY DEPTH
    wheres = split_expression_by_depth(query.where, schema)
    for f, w in zip_longest(es_filters, wheres):
        script = ES52[AndOp(w)].partial_eval().to_esfilter(schema)
        set_default(f, script)

    if not wheres[1]:
        # INCLUDE DOCS WITH NO NESTED DOCS
        more_filter = {
            "bool": {
                "filter": [AndOp(wheres[0]).partial_eval().to_esfilter(schema)],
                "must_not": {
                    "nested": {
                        "path": query_path,
                        "query": MATCH_ALL
                    }
                }
            }
        }
    else:
        more_filter = None

    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)

    map_to_es_columns = schema.map_to_es()
    query_for_es = query.map(map_to_es_columns)
    es_query.sort = jx_sort_to_es_sort(query_for_es.sort, schema)

    es_query.stored_fields = []

    is_list = is_list_(query.select)
    selects = wrap([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()

    put_index = 0
    for select in selects:
        if is_op(select.value, LeavesOp) and is_op(select.value.term, Variable):
            # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
            leaves = schema.leaves(select.value.term.var)
            col_names = set()
            for c in leaves:
                if c.nested_path[0] == ".":
                    if c.jx_type == NESTED:
                        continue
                    es_query.stored_fields += [c.es_column]
                c_name = untype_path(relative_field(c.name, query_path))
                col_names.add(c_name)

                new_select.append({
                    "name": concat_field(select.name, c_name),
                    "nested_path": c.nested_path[0],
                    "put": {"name": concat_field(select.name, literal_field(c_name)), "index": put_index, "child": "."},
                    "pull": get_pull_function(c)
                })
                put_index += 1

            # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS
            for n in new_select:
                if n.name.startswith("..") and n.name.lstrip(".") not in col_names:
                    n.put.name = n.name = n.name.lstrip(".")
                    col_names.add(n.name)
        elif is_op(select.value, Variable):
            net_columns = schema.leaves(select.value.var)
            if not net_columns:
                new_select.append({
                    "name": select.name,
                    "nested_path": ".",
                    "put": {"name": select.name, "index": put_index, "child": "."},
                    "pull": NULL
                })
            else:
                for n in net_columns:
                    pull = get_pull_function(n)
                    if n.nested_path[0] == ".":
                        if n.jx_type == NESTED:
                            continue
                        es_query.stored_fields += [n.es_column]

                    # WE MUST FIGURE OUT WHICH NAMESPACE select.value.var IS USING SO WE CAN EXTRACT THE child
                    for np in n.nested_path:
                        c_name = untype_path(relative_field(n.name, np))
                        if startswith_field(c_name, select.value.var):
                            # PREFER THE MOST-RELATIVE NAME
                            child = relative_field(c_name, select.value.var)
                            break
                    else:
                        continue

                    new_select.append({
                        "name": select.name,
                        "pull": pull,
                        "nested_path": n.nested_path[0],
                        "put": {
                            "name": select.name,
                            "index": put_index,
                            "child": child
                        }
                    })
            put_index += 1
        else:
            expr = select.value
            for v in expr.vars():
                for c in schema[v.var]:
                    if c.nested_path[0] == ".":
                        es_query.stored_fields += [c.es_column]
                    # else:
                    #     Log.error("deep field not expected")

            pull_name = EXPRESSION_PREFIX + select.name
            map_to_local = MapToLocal(schema)
            pull = jx_expression_to_function(pull_name)
            post_expressions[pull_name] = jx_expression_to_function(expr.map(map_to_local))

            new_select.append({
                "name": select.name if is_list else ".",
                "pull": pull,
                "value": expr.__data__(),
                "put": {"name": select.name, "index": put_index, "child": "."}
            })
            put_index += 1

    es_query.stored_fields = sorted(es_query.stored_fields)

    # <COMPLICATED> ES needs two calls to get all documents
    more = []

    def get_more(please_stop):
        more.append(es_post(
            es,
            Data(
                query=more_filter,
                stored_fields=es_query.stored_fields
            ),
            query.limit
        ))

    if more_filter:
        need_more = Thread.run("get more", target=get_more)

    with Timer("call to ES") as call_timer:
        data = es_post(es, es_query, query.limit)

    # EACH HIT IS RETURNED MULTIPLE TIMES, ONCE FOR EACH INNER HIT, WITH THE INNER HIT INCLUDED
    def inners():
        for t in data.hits.hits:
            for i in t.inner_hits[literal_field(query_path)].hits.hits:
                t._inner = i._source
                for k, e in post_expressions.items():
                    t[k] = e(t)
                yield t
        if more_filter:
            Thread.join(need_more)
            for t in more[0].hits.hits:
                yield t
    # </COMPLICATED>

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(inners(), new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
def es_deepop(es, query):
    schema = query.frum.schema
    query_path = schema.query_path[0]

    # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions
    # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER: ES ALLOWS
    # {"inner_hit":{"script_fields":[{"script":""}...]}}, BUT THEN YOU
    # LOSE "_source" BUT GAIN "fields", FORCING ALL FIELDS TO BE EXPLICIT
    post_expressions = {}
    es_query, es_filters = es_query_template(query_path)

    # SPLIT WHERE CLAUSE BY DEPTH
    wheres = split_expression_by_depth(query.where, schema)
    for i, f in enumerate(es_filters):
        script = AndOp("and", wheres[i]).partial_eval().to_esfilter(schema)
        set_default(f, script)

    if not wheres[1]:
        # WITHOUT NESTED CONDITIONS, WE MUST ALSO RETURN DOCS WITH NO NESTED RECORDS
        more_filter = {
            "and": [
                es_filters[0],
                {"missing": {"field": untype_path(query_path) + "." + EXISTS_TYPE}}
            ]
        }
    else:
        more_filter = None

    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)

    # es_query.sort = jx_sort_to_es_sort(query.sort)
    map_to_es_columns = schema.map_to_es()
    # {c.names["."]: c.es_column for c in schema.leaves(".")}
    query_for_es = query.map(map_to_es_columns)
    es_query.sort = jx_sort_to_es_sort(query_for_es.sort, schema)

    es_query.fields = []

    is_list = isinstance(query.select, list)
    new_select = FlatList()

    i = 0
    for s in listwrap(query.select):
        if isinstance(s.value, LeavesOp) and isinstance(s.value.term, Variable):
            # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
            leaves = schema.leaves(s.value.term.var)
            col_names = set()
            for c in leaves:
                if c.nested_path[0] == ".":
                    if c.jx_type == NESTED:
                        continue
                    es_query.fields += [c.es_column]
                c_name = untype_path(c.names[query_path])
                col_names.add(c_name)

                new_select.append({
                    "name": concat_field(s.name, c_name),
                    "nested_path": c.nested_path[0],
                    "put": {"name": concat_field(s.name, literal_field(c_name)), "index": i, "child": "."},
                    "pull": get_pull_function(c)
                })
                i += 1

            # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS
            for n in new_select:
                if n.name.startswith("..") and n.name.lstrip(".") not in col_names:
                    n.put.name = n.name = n.name.lstrip(".")
                    col_names.add(n.name)
        elif isinstance(s.value, Variable):
            net_columns = schema.leaves(s.value.var)
            if not net_columns:
                new_select.append({
                    "name": s.name,
                    "nested_path": ".",
                    "put": {"name": s.name, "index": i, "child": "."},
                    "pull": NULL
                })
            else:
                for n in net_columns:
                    pull = get_pull_function(n)
                    if n.nested_path[0] == ".":
                        if n.jx_type == NESTED:
                            continue
                        es_query.fields += [n.es_column]

                    # WE MUST FIGURE OUT WHICH NAMESPACE s.value.var IS USING SO WE CAN EXTRACT THE child
                    for np in n.nested_path:
                        c_name = untype_path(n.names[np])
                        if startswith_field(c_name, s.value.var):
                            child = relative_field(c_name, s.value.var)
                            break
                    else:
                        child = relative_field(untype_path(n.names[n.nested_path[0]]), s.value.var)

                    new_select.append({
                        "name": s.name,
                        "pull": pull,
                        "nested_path": n.nested_path[0],
                        "put": {
                            "name": s.name,
                            "index": i,
                            "child": child
                        }
                    })
            i += 1
        else:
            expr = s.value
            for v in expr.vars():
                for c in schema[v.var]:
                    if c.nested_path[0] == ".":
                        es_query.fields += [c.es_column]
                    # else:
                    #     Log.error("deep field not expected")

            pull_name = EXPRESSION_PREFIX + s.name
            map_to_local = MapToLocal(schema)
            pull = jx_expression_to_function(pull_name)
            post_expressions[pull_name] = compile_expression(expr.map(map_to_local).to_python())

            new_select.append({
                "name": s.name if is_list else ".",
                "pull": pull,
                "value": expr.__data__(),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    # <COMPLICATED> ES needs two calls to get all documents
    more = []

    def get_more(please_stop):
        more.append(es_post(
            es,
            Data(
                query={"filtered": {"filter": more_filter}},
                fields=es_query.fields
            ),
            query.limit
        ))

    if more_filter:
        need_more = Thread.run("get more", target=get_more)

    with Timer("call to ES") as call_timer:
        data = es_post(es, es_query, query.limit)

    # EACH HIT IS RETURNED MULTIPLE TIMES, ONCE FOR EACH INNER HIT, WITH THE INNER HIT INCLUDED
    def inners():
        for t in data.hits.hits:
            for i in t.inner_hits[literal_field(query_path)].hits.hits:
                t._inner = i._source
                for k, e in post_expressions.items():
                    t[k] = e(t)
                yield t
        if more_filter:
            Thread.join(need_more)
            for t in more[0].hits.hits:
                yield t
    # </COMPLICATED>

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(inners(), new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)