def get_pull(column):
    if column.nested_path[0] == ".":
        return concat_field("fields", literal_field(column.es_column))
    else:
        depth = len(split_field(column.nested_path[0]))
        rel_name = split_field(column.es_column)[depth:]
        return join_field(["_inner"] + rel_name)

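# The helpers split_field/join_field/concat_field used throughout this file
# come from mo_dots. A minimal, hedged sketch of their contract (the real
# versions also handle escaped literal dots), for reference only:
def _split_field_sketch(field):
    # "a.b.c" -> ["a", "b", "c"]; "." names the root and splits to []
    return [] if field == "." else field.split(".")

def _join_field_sketch(path):
    # ["a", "b", "c"] -> "a.b.c"; the empty path names the root "."
    return ".".join(path) or "."

assert _split_field_sketch("a.b.c") == ["a", "b", "c"]
assert _join_field_sketch([]) == "."
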
def untype_path(encoded):
    if encoded.startswith(".."):
        remainder = encoded.lstrip(".")
        back = len(encoded) - len(remainder) - 1
        return ("." * back) + join_field(
            decode_property(c)
            for c in split_field(remainder)
            if not c.startswith(TYPE_PREFIX)
        )
    else:
        return join_field(
            decode_property(c)
            for c in split_field(encoded)
            if not c.startswith(TYPE_PREFIX)
        )

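# Hedged usage note: assuming the typed-encoding markers from mo_json
# (TYPE_PREFIX == "~", so segments such as "~s~", "~e~", "~N~" are type
# annotations), untype_path strips those segments and keeps the plain names:
#
#     untype_path("a.~N~.b.~s~")  # -> "a.b"
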
def select(self, selectList, fromPath, varName, sourceVar):
    path = split_field(fromPath)
    is_deep = len(path) > 1
    heads = []
    list = []
    for s in selectList:
        if is_deep:
            if s.value and is_variable_name(s.value):
                shortForm = self._translate(s.value)
                list.append("Value2Pipe(" + shortForm + ")\n")
            else:
                Log.error("do not know how to handle yet")
        else:
            if s.value and is_variable_name(s.value):
                list.append("Value2Pipe(getDocValue(" + value2MVEL(s.value) + "))\n")
            elif s.value:
                shortForm = self._translate(s.value)
                list.append("Value2Pipe(" + shortForm + ")\n")
            else:
                code, decode = self.Parts2Term(s.domain)
                heads.append(code.head)
                list.append("Value2Pipe(" + code.body + ")\n")

    if len(split_field(fromPath)) > 1:
        output = 'if (' + varName + ' != "") ' + varName + '+="|";\n' + varName + '+=' + '+"|"+'.join(["Value2Pipe(" + v + ")\n" for v in list]) + ';\n'
    else:
        output = varName + ' = ' + '+"|"+'.join(["Value2Pipe(" + v + ")\n" for v in list]) + ';\n'

    return Data(
        head="".join(heads),
        body=output
    )

def tuple(data, field_name):
    """
    RETURN LIST OF TUPLES
    """
    if isinstance(data, Cube):
        Log.error("not supported yet")

    if isinstance(data, FlatList):
        Log.error("not supported yet")

    if is_data(field_name) and "value" in field_name:
        # SIMPLIFY {"value":value} AS STRING
        field_name = field_name["value"]

    # SIMPLE PYTHON ITERABLE ASSUMED
    if is_text(field_name):
        if len(split_field(field_name)) == 1:
            return [(d[field_name],) for d in data]
        else:
            path = split_field(field_name)
            output = []
            flat_list._tuple1(data, path, 0, output)
            return output
    elif is_list(field_name):
        paths = [_select_a_field(f) for f in field_name]
        output = FlatList()
        _tuple((), unwrap(data), paths, 0, output)
        return output
    else:
        paths = [_select_a_field(field_name)]
        output = FlatList()
        _tuple((), data, paths, 0, output)
        return output

def _select_a_field(field):
    if is_text(field):
        return wrap({"name": field, "value": split_field(field)})
    elif is_text(wrap(field).value):
        field = wrap(field)
        return wrap({"name": field.name, "value": split_field(field.value)})
    else:
        return wrap({"name": field.name, "value": field.value})

def _select_a_field(field):
    if isinstance(field, basestring):
        return wrap({"name": field, "value": split_field(field)})
    elif isinstance(wrap(field).value, basestring):
        field = wrap(field)
        return wrap({"name": field.name, "value": split_field(field.value)})
    else:
        return wrap({"name": field.name, "value": field.value})

def select(self, fields):
    if isinstance(fields, Mapping):
        fields = fields.value

    if isinstance(fields, text_type):
        # RETURN LIST OF VALUES
        if len(split_field(fields)) == 1:
            if self.path[0] == fields:
                return [d[1] for d in self.data]
            else:
                return [d[0][fields] for d in self.data]
        else:
            keys = split_field(fields)
            depth = coalesce(MIN([i for i, (k, p) in enumerate(zip(keys, self.path)) if k != p]), len(self.path))  # LENGTH OF COMMON PREFIX
            short_key = keys[depth:]

            output = FlatList()
            _select1((wrap(d[depth]) for d in self.data), short_key, 0, output)
            return output

    if isinstance(fields, list):
        output = FlatList()

        meta = []
        for f in fields:
            if hasattr(f.value, "__call__"):
                meta.append((f.name, f.value))
            else:
                meta.append((f.name, functools.partial(lambda v, d: d[v], f.value)))

        for row in self._values():
            agg = Data()
            for name, f in meta:
                agg[name] = f(row)
            output.append(agg)
        return output

        # meta = []
        # for f in fields:
        #     keys = split_field(f.value)
        #     depth = coalesce(MIN([i for i, (k, p) in enumerate(zip(keys, self.path)) if k != p]), len(self.path))  # LENGTH OF COMMON PREFIX
        #     short_key = join_field(keys[depth:])
        #
        #     meta.append((f.name, depth, short_key))
        #
        # for row in self._data:
        #     agg = Data()
        #     for name, depth, short_key in meta:
        #         if short_key:
        #             agg[name] = row[depth][short_key]
        #         else:
        #             agg[name] = row[depth]
        #     output.append(agg)
        # return output

    Log.error("multiselect over FlatList not supported")

def _select_a_field(field):
    if is_text(field):
        return dict_to_data({"name": field, "value": split_field(field)})
    elif is_text(to_data(field).value):
        field = to_data(field)
        return dict_to_data({
            "name": field.name,
            "value": split_field(field.value)
        })
    else:
        return dict_to_data({"name": field.name, "value": field.value})

def defParent(name):
    # DO NOT MAKE THE SAME PARENT TWICE
    if name in parentVarNames:
        return
    parentVarNames.add(name)

    if len(split_field(name)) == 1:
        contextVariables.append("Map " + name + " = new HashMap();\n")
    else:
        defParent(join_field(split_field(name)[0:-1]))
        contextVariables.append(name + " = new HashMap();\n")

def getFrameVariables(self, body):
    contextVariables = []
    columns = self.fromData.columns
    parentVarNames = set()  # ALL PARENTS OF VARIABLES WITH "." IN NAME
    body = body.replace(".?", ".")

    for i, c in enumerate(columns):
        j = body.find(c.name, 0)
        while j >= 0:
            s = j
            j = body.find(c.name, s + 1)

            test0 = body[s - 1: s + len(c.name) + 1:]
            test3 = body[s - 8: s + len(c.name):]

            if test0[:-1] == "\"" + c.name:
                continue
            if test3 == "_source." + c.name:
                continue

            def defParent(name):
                # DO NOT MAKE THE SAME PARENT TWICE
                if name in parentVarNames:
                    return
                parentVarNames.add(name)

                if len(split_field(name)) == 1:
                    contextVariables.append("Map " + name + " = new HashMap();\n")
                else:
                    defParent(join_field(split_field(name)[0:-1]))
                    contextVariables.append(name + " = new HashMap();\n")

            body = body.replace(c.name, "-" * len(c.name))
            if self.isLean or c.useSource:
                if len(split_field(c.name)) > 1:
                    defParent(join_field(split_field(c.name)[0:-1]))
                    contextVariables.append(c.name + " = getSourceValue(\"" + c.name + "\");\n")
                else:
                    contextVariables.append(c.name + " = _source[\"" + c.name + "\"];\n")
            else:
                if len(split_field(c.name)) > 1:
                    defParent(join_field(split_field(c.name)[0:-1]))
                    contextVariables.append(c.name + " = getDocValue(\"" + c.name + "\");\n")
                else:
                    contextVariables.append(c.name + " = getDocValue(\"" + c.name + "\");\n")
            break

    return "".join(contextVariables)

def setValues(expression, constants):
    if not constants:
        return expression

    constants = constants.copy()

    # EXPAND ALL CONSTANTS TO PRIMITIVE VALUES (MVEL CAN ONLY ACCEPT PRIMITIVE VALUES)
    for c in constants:
        value = c.value
        n = c.name
        if len(split_field(n)) >= 3:
            continue  # DO NOT GO TOO DEEP
        if isinstance(value, list):
            continue  # DO NOT MESS WITH ARRAYS

        if isinstance(value, Mapping):
            for k, v in value.items():
                constants.append({"name": n + "." + k, "value": v})

    for c in reverse(constants):  # REVERSE ORDER, SO LONGER NAMES ARE TESTED FIRST
        s = 0
        while True:
            s = expression.find(c.name, s)
            if s == -1:
                break
            if re.match(r"\w", expression[s - 1]):
                break
            if re.match(r"\w", expression[s + len(c.name)]):
                break

            v = value2MVEL(c.value)
            # SPLICE THE MVEL LITERAL IN PLACE OF THE NAME
            expression = expression[:s] + v + expression[s + len(c.name):]
    return expression

def get_column(self, name):
    sub_schema = self.schema
    while '.' in sub_schema.more:
        sub_schema = sub_schema.more.get('.')

    for n in split_field(name):
        m = sub_schema.more
        if n in m:
            sub_schema = m.get(n)
        else:
            Log.error("{{name}} not found in schema", name=name)
        while '.' in sub_schema.more:
            sub_schema = sub_schema.more.get('.')

    return Column(
        name,
        self.values[name],
        self.reps[name],
        self.defs[name],
        self.num_rows,
        sub_schema,
        self.schema.max_repetition_level(name),
        self.schema.max_definition_level(name)
    )

def is_setop(es, query):
    if not any(map(es.cluster.version.startswith, ["1.4.", "1.5.", "1.6.", "1.7."])):
        return False

    select = listwrap(query.select)

    if not query.edges:
        isDeep = len(split_field(query.frum.name)) > 1  # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
        simpleAgg = AND([s.aggregate in ("count", "none") for s in select])  # CONVERTING esfilter DEFINED PARTS WILL REQUIRE SCRIPT

        # NO EDGES IMPLIES SIMPLER QUERIES: EITHER A SET OPERATION, OR RETURN SINGLE AGGREGATE
        if simpleAgg or isDeep:
            return True
    else:
        isSmooth = AND((e.domain.type in ALGEBRAIC and e.domain.interval == "none") for e in query.edges)
        if isSmooth:
            return True

    return False

def not_monitor(self, please_stop):
    Log.alert("metadata scan has been disabled")
    please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
    while not please_stop:
        column = self.todo.pop()
        if column == THREAD_STOP:
            break

        if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
            DEBUG and Log.note("{{column.es_column}} is a struct", column=column)
            column.last_updated = Date.now()
            continue
        elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None:
            # DO NOT UPDATE FRESH COLUMN METADATA
            DEBUG and Log.note("{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now() - Date(column.last_updated)).seconds)
            continue

        with Timer("Update {{col.es_index}}.{{col.es_column}}", param={"col": column}, silent=not DEBUG, too_long=0.05):
            if untype_path(column.name) in ["build.type", "run.type"]:
                try:
                    self._update_cardinality(column)
                except Exception as e:
                    Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            else:
                column.last_updated = Date.now()

def unnest_path(encoded): if encoded.startswith(".."): encoded = encoded.lstrip(".") if not encoded: encoded = "." return join_field(decode_property(c) for c in split_field(encoded) if c != NESTED_TYPE)
def to_sql(self, schema, not_null=False, boolean=False):
    if not is_op(self.term, Variable):
        Log.error("Can only handle Variable")
    term = self.term.var
    prefix_length = len(split_field(term))
    output = wrap([
        {
            "name": join_field(split_field(schema.get_column_name(c))[prefix_length:]),
            "sql": Variable(schema.get_column_name(c)).to_sql(schema)[0].sql,
        }
        for c in schema.columns
        if startswith_field(c.name, term) and (
            (c.jx_type not in (EXISTS, OBJECT, NESTED) and startswith_field(schema.nested_path[0], c.nested_path[0]))
            or (c.jx_type not in (EXISTS, OBJECT) and schema.nested_path[0] == c.nested_path[0])
        )
    ])
    return output

def typed_encode(value, flake):
    """
    RETURN (typed_value, flake_update, added_nested) TUPLES
    :param value: THE RECORD TO CONVERT TO STRICT TYPED FORM
    :param flake: LOOKUP SCHEMA, WILL BE UPDATED WITH CHANGES
    :return: (record, update, nested) TUPLE
    """
    _ = flake.columns  # ENSURE WE HAVE INTERNAL STRUCTURES FILLED
    output, update, nested = _typed_encode(value, flake.schema)
    if update:
        # REFRESH COLUMNS
        flake._columns = None
        _ = flake.columns

    worker = to_data(output)
    for path, field in flake._top_level_fields.items():
        worker[field] = worker[path]
        worker[path] = None

        # DO NOT LEAVE ANY EMPTY OBJECT RESIDUE
        _path = split_field(path)
        for i, _ in jx.reverse(enumerate(_path)):
            sub_path = join_field(_path[:i])
            v = worker[sub_path]
            if is_data(v) and not worker[sub_path].keys():
                worker[sub_path] = None
            else:
                break

    return output, update, nested

def get_columns(self, table_name, column_name=None, force=False): """ RETURN METADATA COLUMNS """ try: # LAST TIME WE GOT INFO FOR THIS TABLE short_name = join_field(split_field(table_name)[0:1]) table = self.get_table(short_name)[0] if not table: table = Table( name=short_name, url=None, query_path=None, timestamp=Date.now() ) with self.meta.tables.locker: self.meta.tables.add(table) self._get_columns(table=short_name) elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE: table.timestamp = Date.now() self._get_columns(table=short_name) with self.meta.columns.locker: columns = self.meta.columns.find(table_name, column_name) if columns: columns = jx.sort(columns, "name") # AT LEAST WAIT FOR THE COLUMNS TO UPDATE while len(self.todo) and not all(columns.get("last_updated")): if DEBUG: Log.note("waiting for columns to update {{columns|json}}", columns=[c.table+"."+c.es_column for c in columns if not c.last_updated]) Till(seconds=1).wait() return columns except Exception, e: Log.error("Not expected", cause=e)
def to_python(self, not_null=False, boolean=False, many=False):
    path = split_field(self.var)
    agg = "row"
    if not path:
        return agg
    elif path[0] in ["row", "rownum"]:
        # MAGIC VARIABLES
        agg = path[0]
        path = path[1:]
        if len(path) == 0:
            return agg
    elif path[0] == "rows":
        if len(path) == 1:
            return "rows"
        elif path[1] in ["first", "last"]:
            agg = "rows." + path[1] + "()"
            path = path[2:]
        else:
            Log.error("do not know what {{var}} of `rows` is", var=path[1])

    for p in path[:-1]:
        if not_null:
            agg = agg + ".get(" + convert.value2quote(p) + ")"
        else:
            agg = agg + ".get(" + convert.value2quote(p) + ", EMPTY_DICT)"
    output = agg + ".get(" + convert.value2quote(path[-1]) + ")"
    if many:
        output = "listwrap(" + output + ")"
    return output

def find_container(frum, after):
    """
    :param frum:
    :return:
    """
    global namespace
    if not namespace:
        if not container.config.default.settings:
            Log.error("expecting jx_base.container.config.default.settings to contain default elasticsearch connection info")
        namespace = ElasticsearchMetadata(container.config.default.settings)
    if not frum:
        Log.error("expecting json query expression with from clause")

    # FORCE A RELOAD
    namespace.get_columns(frum, after=after)

    if is_text(frum):
        if frum in container_cache:
            return container_cache[frum]

        path = split_field(frum)
        if path[0] == "meta":
            if path[1] == "columns":
                return namespace.meta.columns.denormalized()
            elif path[1] == "tables":
                return namespace.meta.tables
            else:
                fact_table_name = join_field(path[:2])
        else:
            fact_table_name = path[0]

        type_ = container.config.default.type

        settings = set_default(
            {
                "alias": fact_table_name,
                "name": frum,
                "exists": True
            },
            container.config.default.settings,
        )
        settings.type = None
        output = container.type2container[type_](settings)
        container_cache[frum] = output
        return output
    elif is_data(frum) and frum.type and container.type2container[frum.type]:
        # TODO: Ensure the frum.name is set, so we capture the deep queries
        if not frum.type:
            Log.error("Expecting from clause to have a 'type' property")
        return container.type2container[frum.type](frum.settings)
    elif is_data(frum) and (frum["from"] or is_container(frum["from"])):
        from jx_base.query import QueryOp
        return QueryOp.wrap(frum)
    elif is_container(frum):
        return ListContainer("test_list", frum)
    else:
        return frum

def parse_field(fieldname, data, depth):
    """
    RETURN (first, rest) OF fieldname
    """
    col = split_field(fieldname)
    d = data
    for i, c in enumerate(col):
        try:
            d = d[c]
        except Exception as e:
            Log.error("{{name}} does not exist", name=fieldname)
        if is_list(d) and len(col) > 1:
            if len(primary_column) <= depth + i:
                primary_nested.append(True)
                primary_column.append(c)
                primary_branch.append(d)
            elif primary_nested[depth] and primary_column[depth + i] != c:
                Log.error("only one branch of tree allowed")
            else:
                primary_nested[depth + i] = True
                primary_column[depth + i] = c
                primary_branch[depth + i] = d

            return c, join_field(col[i + 1:])
        else:
            if len(primary_column) <= depth + i:
                primary_nested.append(False)
                primary_column.append(c)
                primary_branch.append([d])

    return fieldname, None

def _get_schema_from_list(frum, table_name, prefix_path, nested_path, columns):
    """
    :param frum: The list
    :param table_name: Name of the table this list holds records for
    :param prefix_path: parent path
    :param nested_path: each nested array, in reverse order
    :param columns: map from full name to column definition
    :return:
    """
    for d in frum:
        row_type = _type_to_name[d.__class__]
        if row_type != "object":
            full_name = join_field(prefix_path)
            column = columns[full_name]
            if not column:
                column = Column(
                    names={table_name: full_name},
                    es_column=full_name,
                    es_index=".",
                    type="undefined",
                    nested_path=nested_path
                )
                columns.add(column)
            column.type = _merge_type[column.type][row_type]
        else:
            for name, value in d.items():
                full_name = join_field(prefix_path + [name])
                column = columns[full_name]
                if not column:
                    column = Column(
                        names={table_name: full_name},
                        es_column=full_name,
                        es_index=".",
                        type="undefined",
                        nested_path=nested_path
                    )
                    columns.add(column)
                if isinstance(value, list):
                    if len(value) == 0:
                        this_type = "undefined"
                    elif len(value) == 1:
                        this_type = _type_to_name[value[0].__class__]
                    else:
                        this_type = _type_to_name[value[0].__class__]
                        if this_type == "object":
                            this_type = "nested"
                else:
                    this_type = _type_to_name[value.__class__]
                new_type = _merge_type[column.type][this_type]
                column.type = new_type

                if this_type == "object":
                    _get_schema_from_list([value], table_name, prefix_path + [name], nested_path, columns)
                elif this_type == "nested":
                    np = listwrap(nested_path)
                    newpath = unwraplist([join_field(split_field(np[0]) + [name])] + np)
                    _get_schema_from_list(value, table_name, prefix_path + [name], newpath, columns)

def __getitem__(self, name):
    def _get(node, path):
        if not path:
            # RETURN THE ELEMENT AT THE NODE WE DESCENDED TO
            return node.element
        return _get(node.more[path[0]], path[1:])

    return _get(self, split_field(name))

def set(constants): """ REACH INTO THE MODULES AND OBJECTS TO SET CONSTANTS. THINK OF THIS AS PRIMITIVE DEPENDENCY INJECTION FOR MODULES. USEFUL FOR SETTING DEBUG FLAGS. """ if not constants: return constants = wrap(constants) for full_path, new_value in constants.leaves(): errors = [] k_path = split_field(full_path) if len(k_path) < 2: from mo_logs import Log Log.error( "expecting <module>.<constant> format, not {{path|quote}}", path=k_path) name = k_path[-1] try: old_value = mo_dots_set_attr(sys.modules, k_path, new_value) continue except Exception as e: errors.append(e) # ONE MODULE IS MISSING, THE CALLING MODULE try: caller_globals = sys._getframe(1).f_globals caller_file = caller_globals["__file__"] if not caller_file.endswith(".py"): raise Exception("do not know how to handle non-python caller") caller_module = caller_file[:-3].replace("\\", "/") module_path = caller_module.split("/") # ENSURE THERE IS SOME EVIDENCE THE MODULE MATCHES THE PATH if k_path[-2] != module_path[-1]: continue old_value = mo_dots_set_attr(caller_globals, [name], new_value) if DEBUG: from mo_logs import Log Log.note( "Changed {{module}}[{{attribute}}] from {{old_value}} to {{new_value}}", module=caller_module, attribute=name, old_value=old_value, new_value=new_value) break except Exception as e: errors.append(e) if errors: from mo_logs import Log Log.error("Can not set constant {{path}}", path=full_path, cause=errors)
def _get(v, k, d):
    for p in split_field(k):
        try:
            v = v.get(p)
            if v is None:
                return d
        except Exception:
            v = [vv.get(p) for vv in v]
    return v

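# Hedged usage sketch for _get (hypothetical data, not from the source):
#
#     doc = {"build": {"type": "opt"}, "runs": [{"ok": True}, {"ok": False}]}
#     _get(doc, "build.type", None)  # -> "opt"
#     _get(doc, "runs.ok", None)     # -> [True, False]  (fans out over the list
#                                    #    via the except branch)
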
def _edges_op(self, query, frum):
    query = query.copy()  # WE WILL BE MARKING UP THE QUERY
    index_to_column = {}  # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE)
    outer_selects = []  # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE)

    # RESOLVE THE SCHEMA BEFORE IT IS USED
    schema = self.sf.tables[relative_field(frum, self.sf.fact_name)].schema
    base_table, path = schema.snowflake.fact_name, schema.nested_path
    nest_to_alias = {
        nested_path: quote_column("__" + unichr(ord('a') + i) + "__")
        for i, (nested_path, sub_table) in enumerate(self.sf.tables)
    }

    tables = []
    for n, a in nest_to_alias.items():
        if startswith_field(path, n):
            tables.append({"nest": n, "alias": a})
    tables = jx.sort(tables, {"value": {"length": "nest"}})

    from_sql = quote_column(join_field([base_table] + split_field(tables[0].nest))) + tables[0].alias
    for previous, t in zip(tables, tables[1::]):
        from_sql += (
            SQL_LEFT_JOIN + quote_column(concat_field(base_table, t.nest)) + t.alias
            + SQL_ON + join_column(t.alias, quoted_PARENT)
            + " = " + join_column(previous.alias, quoted_UID)
        )

    main_filter = query.where.to_sql(schema, boolean=True)[0].sql.b

    # SHIFT THE COLUMN DEFINITIONS BASED ON THE NESTED QUERY DEPTH
    ons = []
    join_types = []
    wheres = []
    null_ons = [EXISTS_COLUMN + SQL_IS_NULL]
    groupby = []
    null_groupby = []
    orderby = []
    domains = []

    select_clause = [SQL_ONE + EXISTS_COLUMN] + [quote_column(c.es_column) for c in self.sf.tables["."].columns]

    for edge_index, query_edge in enumerate(query.edges):
        edge_alias = quote_column("e" + text_type(edge_index))

        if query_edge.value:
            edge_values = [p for c in query_edge.value.to_sql(schema).sql for p in c.items()]
        elif not query_edge.value and any(query_edge.domain.partitions.where):
            case = SQL_CASE
            for pp, p in enumerate(query_edge.domain.partitions):
                w = p.where.to_sql(schema)[0].sql.b
                t = quote_value(pp)
                case += SQL_WHEN + w + SQL_THEN + t
            case += SQL_ELSE + SQL_NULL + SQL_END  # quote value with length of partitions
            edge_values = [("n", case)]
        elif query_edge.range:
            edge_values = query_edge.range.min.to_sql(schema)[0].sql.items() + query_edge.range.max.to_sql(schema)[0].sql.items()

def __call__(self, row, rownum=None, rows=None):
    path = split_field(self.var)
    for p in path:
        row = row.get(p)
        if row is None:
            return None
    if isinstance(row, list) and len(row) == 1:
        return row[0]
    return row

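# Hedged usage sketch, assuming this __call__ belongs to a Variable-like
# expression class whose self.var holds a dotted path (hypothetical data):
#
#     v = Variable("a.b")
#     v({"a": {"b": 42}})    # -> 42
#     v({"a": {"b": [42]}})  # -> 42  (single-element lists are unwrapped)
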
def to_python(self, not_null=False, boolean=False, many=False): agg = "rows[rownum+" + self.offset.to_python() + "]" path = split_field(json2value(self.var.json)) if not path: return agg for p in path[:-1]: agg = agg + ".get(" + convert.value2quote(p) + ", EMPTY_DICT)" return agg + ".get(" + convert.value2quote(path[-1]) + ")"
def get_columns(self, table_name, column_name=None, force=False): """ RETURN METADATA COLUMNS """ table_path = split_field(table_name) root_table_name = table_path[0] alias = self._find_alias(root_table_name) if not alias: self.es_cluster.get_metadata(force=True) alias = self._find_alias(root_table_name) if not alias: Log.error("{{table|quote}} does not exist", table=table_name) try: last_update = MAX([ self.es_cluster.index_last_updated[i] for i in self.index_to_alias.get_domain(alias) ]) table = self.get_table(alias)[0] # LAST TIME WE GOT INFO FOR THIS TABLE if not table: table = TableDesc(name=alias, url=None, query_path=['.'], timestamp=Date.MIN) with self.meta.tables.locker: self.meta.tables.add(table) self._reload_columns(table) elif force or table.timestamp < last_update: self._reload_columns(table) columns = self.meta.columns.find(alias, column_name) columns = jx.sort(columns, "names.\\.") # AT LEAST WAIT FOR THE COLUMNS TO UPDATE while len(self.todo) and not all(columns.get("last_updated")): if DEBUG: if len(columns) > 10: Log.note("waiting for {{num}} columns to update", num=len([ c for c in columns if not c.last_updated ])) else: Log.note( "waiting for columns to update {{columns|json}}", columns=[ c.es_index + "." + c.es_column for c in columns if not c.last_updated ]) Till(seconds=1).wait() return columns except Exception as e: Log.error("Not expected", cause=e) return []
def untyped_column(column_name):
    """
    :param column_name: DATABASE COLUMN NAME
    :return: (NAME, TYPE) PAIR
    """
    if "$" in column_name:
        path = split_field(column_name)
        return join_field(path[:-1]), path[-1][1:]
    else:
        return column_name, None

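# Hedged examples, assuming the jx_sqlite convention that a storage column
# carries its type as a "$"-prefixed final path segment:
#
#     untyped_column("a.b.$string")  # -> ("a.b", "string")
#     untyped_column("a.b")          # -> ("a.b", None)
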
def add_column_to_schema(self, nest_to_schema, column):
    abs_table = literal_field(self.name)
    abs_name = column.names[abs_table]

    for nest, schema in nest_to_schema.items():
        rel_table = literal_field(join_field([self.name] + split_field(nest)))
        rel_name = relative_field(abs_name, nest)
        column.names[rel_table] = rel_name

def get_nested_path(typed_path):
    # CONSTRUCT THE nested_path FROM THE typed_path
    path = split_field(typed_path)
    parent = "."
    nested_path = (parent,)
    for i, p in enumerate(path[:-1]):
        if p == ARRAY_KEY:
            step = concat_field(parent, join_field(path[0:i + 1]))
            nested_path = (step,) + nested_path
    return nested_path

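# Hedged example: writing A for ARRAY_KEY (its exact value is defined
# elsewhere), a typed path "a.A.b.A.c" yields the deepest-first tuple
# ("a.A.b.A", "a.A", "."): one entry per enclosing array, plus the root.
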
def exists_variable(path):
    """
    RETURN THE VARIABLE THAT WILL INDICATE OBJECT (OR ARRAY) EXISTS (~e~)
    """
    steps = split_field(path)
    if not steps:
        return EXISTS_TYPE
    if steps[-1] == NESTED_TYPE:
        steps = steps[:-1]
    return join_field(steps + [EXISTS_TYPE])

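# Hedged examples, assuming EXISTS_TYPE == "~e~" and NESTED_TYPE == "~N~"
# (the marker values used by mo_json's typed encoder):
#
#     exists_variable(".")      # -> "~e~"
#     exists_variable("a.~N~")  # -> "a.~e~"  (array marker replaced)
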
def get_document_value(document, column):
    """
    RETURN DOCUMENT VALUE IF MATCHES THE column (name, type)

    :param document: THE DOCUMENT
    :param column: A (name, type) PAIR
    :return: VALUE, IF IT IS THE SAME NAME AND TYPE
    """
    v = document.get(split_field(column.name)[0], None)
    return get_if_type(v, column.type)

def schema_element(self, path):
    if isinstance(path, text_type):
        path = split_field(path)

    output = self
    while '.' in output.more:
        output = output.more['.']
    for p in path:
        output = output.more.get(p)
        while '.' in output.more:
            output = output.more['.']
    return output.element if output else None

def monitor(self, please_stop):
    please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
    while not please_stop:
        try:
            if not self.todo:
                old_columns = [
                    c
                    for c in self.meta.columns
                    if ((c.last_updated < Date.now() - MAX_COLUMN_METADATA_AGE) or c.cardinality == None)
                    and c.jx_type not in STRUCT
                ]
                if old_columns:
                    DEBUG and Log.note(
                        "Old columns {{names|json}} last updated {{dates|json}}",
                        names=wrap(old_columns).es_column,
                        dates=[Date(t).format() for t in wrap(old_columns).last_updated]
                    )
                    self.todo.extend(old_columns)
                else:
                    DEBUG and Log.note("no more metadata to update")

            column = self.todo.pop(Till(seconds=(10 * MINUTE).seconds))
            if column:
                if column is THREAD_STOP:
                    continue

                with Timer("update {{table}}.{{column}}", param={"table": column.es_index, "column": column.es_column}, silent=not DEBUG):
                    if column.es_index in self.index_does_not_exist:
                        DEBUG and Log.note("{{column.es_column}} does not exist", column=column)
                        self.meta.columns.update({
                            "clear": ".",
                            "where": {"eq": {"es_index": column.es_index}}
                        })
                        continue
                    if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
                        DEBUG and Log.note("{{column.es_column}} is a struct", column=column)
                        column.last_updated = Date.now()
                        continue
                    elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None:
                        # DO NOT UPDATE FRESH COLUMN METADATA
                        DEBUG and Log.note("{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now() - Date(column.last_updated)).seconds)
                        continue

                    try:
                        self._update_cardinality(column)
                        (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX)) and Log.note("updated {{column.name}}", column=column)
                    except Exception as e:
                        if '"status":404' in e:
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                            })
                        else:
                            Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
        except Exception as e:
            Log.warning("problem in cardinality monitor", cause=e)

def schema_element(self, path):
    if isinstance(path, text):
        path = split_field(path)

    output = self
    while '.' in output.more:
        output = output.more['.']
    for p in path:
        output = output.more.get(p)
        while '.' in output.more:
            output = output.more['.']
    return output.element if output else None

def _test_mode_wait(query):
    """
    WAIT FOR METADATA TO ARRIVE ON INDEX

    :param query: dict() OF REQUEST BODY
    :return: nothing
    """
    try:
        m = meta.singlton
        now = Date.now()
        end_time = now + MINUTE

        # MARK COLUMNS DIRTY
        m.meta.columns.update({
            "clear": ["partitions", "count", "cardinality", "last_updated"],
            "where": {"eq": {"es_index": join_field(split_field(query["from"])[0:1])}}
        })

        # BE SURE THEY ARE ON THE todo QUEUE FOR RE-EVALUATION
        cols = [c for c in m.get_columns(table_name=query["from"], force=True) if c.type not in STRUCT]
        for c in cols:
            Log.note("Mark {{column}} dirty at {{time}}", column=c.names["."], time=now)
            c.last_updated = now - TOO_OLD
            m.todo.push(c)

        while end_time > now:
            # GET FRESH VERSIONS
            cols = [c for c in m.get_columns(table_name=query["from"]) if c.type not in STRUCT]
            for c in cols:
                if not c.last_updated or c.cardinality == None:
                    Log.note("wait for column (table={{col.es_index}}, name={{col.es_column}}) metadata to arrive", col=c)
                    break
            else:
                break
            Till(seconds=1).wait()

        for c in cols:
            Log.note(
                "fresh column name={{column.name}} updated={{column.last_updated|date}} parts={{column.partitions}}",
                column=c
            )
    except Exception as e:
        Log.warning("could not pickup columns", cause=e)

def __radd__(self, path):
    """
    RETURN self AT THE END OF path
    :param path
    """
    acc = self
    for step in reversed(split_field(path)):
        if IS_PRIMITIVE_KEY.match(step):
            continue
        acc = JsonType(**{step: acc})
    return acc

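# Hedged usage note: this supports expressions like "a.b" + T, wrapping the
# type object in one JsonType layer per path step, outermost first:
#
#     "a.b" + T  ~>  JsonType(a=JsonType(b=T))
#
# Steps matching IS_PRIMITIVE_KEY (typed-encoding markers) are skipped.
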
def find_container(frum, schema=None):
    """
    :param frum:
    :param schema:
    :return:
    """
    if not _meta:
        _delayed_imports()

    frum = wrap(frum)

    if isinstance(frum, text_type):
        if not container.config.default.settings:
            Log.error("expecting jx_base.container.config.default.settings to contain default elasticsearch connection info")

        type_ = None
        if frum.startswith("meta."):
            if frum == "meta.columns":
                return _meta.singlton.meta.columns.denormalized()
            elif frum == "meta.tables":
                return _meta.singlton.meta.tables
            else:
                Log.error("{{name}} not a recognized table", name=frum)

        type_ = container.config.default.type
        fact_table_name = split_field(frum)[0]

        settings = set_default(
            {
                "index": fact_table_name,
                "name": frum,
                "exists": True,
            },
            container.config.default.settings
        )
        settings.type = None
        return container.type2container[type_](settings)
    elif isinstance(frum, Mapping) and frum.type and container.type2container[frum.type]:
        # TODO: Ensure the frum.name is set, so we capture the deep queries
        if not frum.type:
            Log.error("Expecting from clause to have a 'type' property")
        return container.type2container[frum.type](frum.settings)
    elif isinstance(frum, Mapping) and (frum["from"] or isinstance(frum["from"], (list, set))):
        from jx_base.query import QueryOp
        return QueryOp.wrap(frum, namespace=schema)
    elif isinstance(frum, (list, set)):
        return _ListContainer("test_list", frum)
    else:
        return frum

def set(constants): """ REACH INTO THE MODULES AND OBJECTS TO SET CONSTANTS. THINK OF THIS AS PRIMITIVE DEPENDENCY INJECTION FOR MODULES. USEFUL FOR SETTING DEBUG FLAGS. """ if not constants: return constants = wrap(constants) for k, new_value in constants.leaves(): errors = [] try: old_value = mo_dots_set_attr(sys.modules, k, new_value) continue except Exception as e: errors.append(e) # ONE MODULE IS MISSING, THE CALLING MODULE try: caller_globals = sys._getframe(1).f_globals caller_file = caller_globals["__file__"] if not caller_file.endswith(".py"): raise Exception("do not know how to handle non-python caller") caller_module = caller_file[:-3].replace("/", ".") path = split_field(k) for i, p in enumerate(path): if i == 0: continue prefix = join_field(path[:1]) name = join_field(path[i:]) if caller_module.endswith(prefix): old_value = mo_dots_set_attr(caller_globals, name, new_value) if DEBUG: from mo_logs import Log Log.note( "Changed {{module}}[{{attribute}}] from {{old_value}} to {{new_value}}", module=prefix, attribute=name, old_value=old_value, new_value=new_value ) break except Exception as e: errors.append(e) if errors: from mo_logs import Log Log.error("Can not set constant {{path}}", path=k, cause=errors)
def __getitem__(self, key):
    if isinstance(key, slice):
        return Null
    elif isinstance(key, str):
        key = key.decode("utf8")
    elif isinstance(key, int):
        return NullType(self, key)

    path = split_field(key)
    output = self
    for p in path:
        output = NullType(output, p)
    return output

def _get_columns(self, table=None):
    # TODO: HANDLE MORE THAN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
    table_path = split_field(table)
    es_index = table_path[0]
    query_path = join_field(table_path[1:])
    meta = self.es_metadata.indices[es_index]
    if not meta or self.last_es_metadata < Date.now() - OLD_METADATA:
        self.es_metadata = self.default_es.get_metadata(force=True)
        meta = self.es_metadata.indices[es_index]

    for _, properties in meta.mappings.items():
        properties.properties["_id"] = {"type": "string", "index": "not_analyzed"}
        self._parse_properties(meta.index, properties, meta)

def wrap_from(frum, schema=None):
    """
    :param frum:
    :param schema:
    :return:
    """
    if not _containers:
        _delayed_imports()

    frum = wrap(frum)

    if isinstance(frum, basestring):
        if not _containers.config.default.settings:
            Log.error("expecting pyLibrary.queries.query.config.default.settings to contain default elasticsearch connection info")

        type_ = None
        index = frum
        if frum.startswith("meta."):
            if frum == "meta.columns":
                return _meta.singlton.meta.columns.denormalized()
            elif frum == "meta.tables":
                return _meta.singlton.meta.tables
            else:
                Log.error("{{name}} not a recognized table", name=frum)
        else:
            type_ = _containers.config.default.type
            index = split_field(frum)[0]

        settings = set_default(
            {
                "index": index,
                "name": frum,
                "exists": True,
            },
            _containers.config.default.settings
        )
        settings.type = None
        return _containers.type2container[type_](settings)
    elif isinstance(frum, Mapping) and frum.type and _containers.type2container[frum.type]:
        # TODO: Ensure the frum.name is set, so we capture the deep queries
        if not frum.type:
            Log.error("Expecting from clause to have a 'type' property")
        return _containers.type2container[frum.type](frum.settings)
    elif isinstance(frum, Mapping) and (frum["from"] or isinstance(frum["from"], (list, set))):
        from pyLibrary.queries.query import QueryOp
        return QueryOp.wrap(frum, schema=schema)
    elif isinstance(frum, (list, set)):
        return _ListContainer("test_list", frum)
    else:
        return frum

def is_deepop(es, query):
    if query.edges or query.groupby:
        return False
    if all(s.aggregate not in (None, "none") for s in listwrap(query.select)):
        return False
    if len(split_field(query.frum.name)) > 1:
        return True

    # ASSUME IT IS NESTED IF WE ARE ASKING FOR NESTED COLUMNS
    # vars_ = query_get_all_vars(query)
    # columns = query.frum.get_columns()
    # if any(c for c in columns if len(c.nested_path) != 1 and c.name in vars_):
    #     return True

    return False

def unnest_path(encoded): if encoded.startswith(".."): remainder = encoded.lstrip(".") back = len(encoded) - len(remainder) return ("." * back) + unnest_path(remainder) path = split_field(encoded) if not path: return "." if path[-1] == NESTED_TYPE: path = path[:-1] if not path: return "." return join_field([decode_property(c) for c in path[:-1] if not c.startswith(TYPE_PREFIX)] + [decode_property(path[-1])])
def __init__(self, table_name, columns):
    """
    :param table_name: THE FACT TABLE
    :param query_path: PATH TO ARM OF SNOWFLAKE
    :param columns: ALL COLUMNS IN SNOWFLAKE
    """
    table_path = split_field(table_name)
    self.table = table_path[0]  # USED AS AN EXPLICIT STATEMENT OF PERSPECTIVE IN THE DATABASE
    self.query_path = join_field(table_path[1:])
    self._columns = copy(columns)

    lookup = self.lookup = _index(columns, self.query_path)
    if self.query_path != ".":
        alternate = _index(columns, ".")
        for k, v in alternate.items():
            lookup.setdefault(k, v)

def is_setop(es, query):
    select = listwrap(query.select)

    if not query.edges:
        isDeep = len(split_field(query.frum.name)) > 1  # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
        simpleAgg = AND([s.aggregate in ("count", "none") for s in select])  # CONVERTING esfilter DEFINED PARTS WILL REQUIRE SCRIPT

        # NO EDGES IMPLIES SIMPLER QUERIES: EITHER A SET OPERATION, OR RETURN SINGLE AGGREGATE
        if simpleAgg or isDeep:
            return True
    else:
        isSmooth = AND((e.domain.type in ALGEBRAIC and e.domain.interval == "none") for e in query.edges)
        if isSmooth:
            return True

    return False