def _parse_properties(self, alias, mapping, meta):
    """
    Parse an ES mapping into Column records, compute the nested query
    paths for the alias, and queue every column for metadata collection.

    :param alias: ES alias name; used as the table name for the columns
    :param mapping: ES mapping; `mapping.properties` is the property tree
    :param meta: index metadata (accepted for interface parity; unused here)
    """
    abs_columns = elasticsearch.parse_properties(alias, None, mapping.properties)
    with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
        # LIST OF EVERY NESTED PATH
        query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"]
        for a, b in itertools.product(query_paths, query_paths):
            aa = a[0]
            bb = b[0]
            if aa and bb.startswith(aa):
                # aa IS A PREFIX OF bb: INSERT aa INTO b AS AN INTERMEDIATE SPLIT,
                # KEEPING b SORTED LONGEST-FIRST
                for i, b_prefix in enumerate(b):
                    if len(b_prefix) > len(aa):
                        continue
                    if aa == b_prefix:
                        break  # SPLIT ALREADY FOUND
                    b.insert(i, aa)
                    break
        for q in query_paths:
            q.append(SELF_PATH)
        query_paths.append(ROOT_PATH)
        self.alias_to_query_paths[alias] = query_paths

        # ADD RELATIVE NAMES
        for abs_column in abs_columns:
            # last_updated=None marks the column as needing a stats refresh later
            abs_column.last_updated = None
            abs_column.jx_type = es_type_to_json_type[abs_column.es_type]
            for query_path in query_paths:
                abs_column.names[query_path[0]] = relative_field(abs_column.names["."], query_path[0])
            self.todo.add(self.meta.columns.add(abs_column))
        # (removed dead trailing `pass` from the original)
def _get_columns(self, table=None):
    """
    Pull column metadata for `table` from the default ES cluster and
    upsert one Column record per (index-or-alias, property).

    :param table: dotted table name; first segment is the ES index,
        the rest is the nested query path within it
    """
    # TODO: HANDLE MORE THEN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
    alias_done = set()
    index = split_field(table)[0]
    query_path = split_field(table)[1:]
    metadata = self.default_es.get_metadata(index=index)
    # presumably qb.sort orders indices newest-first so the freshest mapping wins — TODO confirm
    for index, meta in qb.sort(metadata.indices.items(), {"value": 0, "sort": -1}):
        for _, properties in meta.mappings.items():
            columns = _elasticsearch.parse_properties(index, None, properties.properties)
            # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
            columns = columns.filter(lambda r: not r.abs_name.startswith("other.") and not r.abs_name.startswith("previous_values.cf_"))
            with Timer("upserting {{num}} columns", {"num": len(columns)}, debug=DEBUG):
                with self.columns.locker:
                    for c in columns:
                        # ABSOLUTE
                        c.table = join_field([index]+query_path)
                        self.upsert_column(c)
                        for alias in meta.aliases:
                            # ONLY THE LATEST ALIAS IS CHOSEN TO GET COLUMNS
                            # NOTE(review): alias_done is shared across ALL columns, so only
                            # the first column that reaches an alias produces an alias copy —
                            # later columns skip it; looks unintended, confirm before relying on it
                            if alias in alias_done:
                                continue
                            alias_done.add(alias)
                            c = copy(c)
                            c.table = join_field([alias]+query_path)
                            self.upsert_column(c)
def __init__(self, es=None, id_column="_id"):
    """
    Prepare an encoder bound to an ES index.

    :param es: ES index wrapper; when provided, its properties are parsed
        into a flat name->column lookup stored on `self.schema`
    :param id_column: name of the property that holds the document id
    """
    self.es = es
    self.id_column = id_column
    # was `True if id_column == "_id" else False` — the comparison already
    # yields a bool, so the conditional was redundant.
    # presumably "_id" must be stripped because ES manages it — confirm at call site
    self.remove_id = id_column == "_id"
    if es:
        _schema = Data()
        for c in parse_properties(es.settings.alias, ".", es.get_properties()):
            # keep only scalar columns; OBJECT/NESTED are containers, not values
            if c.type not in (OBJECT, NESTED):
                _schema[c.names["."]] = c
        self.schema = unwrap(_schema)
    else:
        self.schema = {}
def __init__(self, es=None, id_expression="_id"):
    """
    Prepare an encoder bound to an ES index, with an expression-driven id.

    :param es: ES index wrapper; when provided, its properties are parsed
        into a flat name->column lookup stored on `self.schema`
    :param id_expression: jx expression that extracts the document id
    """
    self.es = es
    self.id_column = id_expression
    # compile the id expression once; applied per-document later
    self.get_id = jx_expression_to_function(id_expression)
    # was `True if id_expression == "_id" else False` — the comparison
    # already yields a bool, so the conditional was redundant
    self.remove_id = id_expression == "_id"
    if es:
        _schema = Data()
        for c in parse_properties(es.settings.alias, ".", ROOT_PATH, es.get_properties()):
            # keep only scalar columns; OBJECT/NESTED are containers, not values
            if c.es_type not in (OBJECT, NESTED):
                _schema[c.name] = c
        self.schema = unwrap(_schema)
    else:
        self.schema = {}
def _parse_properties(self, abs_index, properties, meta):
    """
    Parse ES properties into columns, then upsert each column once per
    nested query path (and once per alias of the index).

    :param abs_index: name of the concrete ES index
    :param properties: mapping object; `properties.properties` is the tree
    :param meta: index metadata; `meta.aliases` lists the index's aliases
    """
    # IT IS IMPORTANT THAT NESTED PROPERTIES NAME ALL COLUMNS, AND
    # ALL COLUMNS ARE GIVEN NAMES FOR ALL NESTED PROPERTIES
    abs_columns = _elasticsearch.parse_properties(abs_index, None, properties.properties)
    # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
    abs_columns = abs_columns.filter(
        lambda r: not r.es_column.startswith("other.") and not r.es_column.startswith("previous_values.cf_") and not r.es_index.startswith("debug") and r.es_column.find("=") == -1 and r.es_column.find(" ") == -1)

    def add_column(c, query_path):
        # stamp and (for non-root paths) attach the path-relative name,
        # then upsert for the index and for each of its aliases
        c.last_updated = Date.now()
        if query_path[0] != ".":
            c.names[query_path[0]] = relative_field(c.names["."], query_path[0])
        with self.meta.columns.locker:
            self._upsert_column(c)
            for alias in meta.aliases:
                c = copy(c)
                c.es_index = alias
                self._upsert_column(c)

    with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
        # LIST OF EVERY NESTED PATH
        query_paths = [[c.es_column] for c in abs_columns if c.type == "nested"]
        for a, b in itertools.product(query_paths, query_paths):
            aa = a[0]
            bb = b[0]
            if aa and bb.startswith(aa):
                # aa is a prefix of bb: record aa as an intermediate split of b
                for i, b_prefix in enumerate(b):
                    if len(b_prefix) > len(aa):
                        continue
                    if aa == b_prefix:
                        break  # SPLIT ALREADY FOUND
                    b.insert(i, aa)
                    break
        for q in query_paths:
            q.append(".")
        query_paths.append(ROOT_PATH)

        # ADD RELATIVE COLUMNS
        # NOTE(review): add_column mutates abs_column in place (no copy here),
        # so names accumulate across query paths on the same object — confirm intended
        for abs_column in abs_columns:
            for query_path in query_paths:
                add_column(abs_column, query_path)
def __init__(self, es=None, id_info=None):
    """
    Prepare an encoder bound to an ES index, with id/version extractors.

    :param es: ES index wrapper; when provided, its properties are parsed
        into a flat name->column lookup stored on `self.schema`
    :param id_info: object with `field` and `version` jx accessors
    """
    self.es = es
    self.id_info = id_info
    self.get_id = jx.get(id_info.field)
    self.get_version = jx.get(id_info.version)
    if not es:
        self.schema = {}
        return
    _schema = Data()
    for column in parse_properties(es.settings.alias, ".", ROOT_PATH, es.get_properties()):
        # containers get an empty dict placeholder; scalars keep the column itself
        _schema[column.name] = {} if column.es_type in (OBJECT, NESTED) else column
    self.schema = unwrap(_schema)
def _parse_properties(self, abs_index, properties, meta):
    """
    Parse ES properties into columns and upsert each once per nested
    query path, with alias copies; converts es types to json types.

    :param abs_index: name of the concrete ES index
    :param properties: mapping object; `properties.properties` is the tree
    :param meta: index metadata; `meta.aliases` lists the index's aliases
    """
    # IT IS IMPORTANT THAT NESTED PROPERTIES NAME ALL COLUMNS, AND
    # ALL COLUMNS ARE GIVEN NAMES FOR ALL NESTED PROPERTIES
    def add_column(c, query_path):
        # backdated timestamp (now - TOO_OLD) presumably forces an immediate
        # stats refresh downstream — TODO confirm
        c.last_updated = Date.now() - TOO_OLD
        if query_path[0] != ".":
            c.names[query_path[0]] = relative_field(c.names["."], query_path[0])
        with self.meta.columns.locker:
            # aliases are upserted from copies so the original keeps abs_index
            for alias in meta.aliases:
                c_ = copy(c)
                c_.es_index = alias
                self._upsert_column(c_)
            self._upsert_column(c)

    abs_columns = elasticsearch.parse_properties(abs_index, None, properties.properties)
    self.abs_columns.update(abs_columns)
    with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
        # LIST OF EVERY NESTED PATH
        query_paths = [[c.es_column] for c in abs_columns if c.type == "nested"]
        for a, b in itertools.product(query_paths, query_paths):
            aa = a[0]
            bb = b[0]
            if aa and bb.startswith(aa):
                # aa is a prefix of bb: record aa as an intermediate split of b
                for i, b_prefix in enumerate(b):
                    if len(b_prefix) > len(aa):
                        continue
                    if aa == b_prefix:
                        break  # SPLIT ALREADY FOUND
                    b.insert(i, aa)
                    break
        for q in query_paths:
            q.append(".")
        query_paths.append(SELF_PATH)

        # ADD RELATIVE COLUMNS
        for abs_column in abs_columns:
            # work on a copy so the cached self.abs_columns keep their es types
            abs_column = abs_column.__copy__()
            abs_column.type = es_type_to_json_type[abs_column.type]
            for query_path in query_paths:
                add_column(abs_column, query_path)
    pass
def _parse_properties(self, abs_index, properties, meta):
    """
    Parse ES properties into columns and upsert each once per nested
    query path (and once per alias of the index).

    :param abs_index: name of the concrete ES index
    :param properties: mapping object; `properties.properties` is the tree
    :param meta: index metadata; `meta.aliases` lists the index's aliases
    """
    # IT IS IMPORTANT THAT NESTED PROPERTIES NAME ALL COLUMNS, AND
    # ALL COLUMNS ARE GIVEN NAMES FOR ALL NESTED PROPERTIES
    abs_columns = _elasticsearch.parse_properties(abs_index, None, properties.properties)
    # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
    abs_columns = abs_columns.filter(
        lambda r: not r.es_column.startswith("other.") and not r.es_column.startswith("previous_values.cf_") and not r.es_index.startswith("debug") and r.es_column.find("=") == -1 and r.es_column.find(" ") == -1
    )

    def add_column(c, query_path):
        # stamp and (for non-root paths) attach the path-relative name,
        # then upsert for the index and for each of its aliases
        c.last_updated = Date.now()
        if query_path[0] != ".":
            c.names[query_path[0]] = relative_field(c.names["."], query_path[0])
        with self.meta.columns.locker:
            self._upsert_column(c)
            for alias in meta.aliases:
                c = copy(c)
                c.es_index = alias
                self._upsert_column(c)

    with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
        # LIST OF EVERY NESTED PATH
        query_paths = [[c.es_column] for c in abs_columns if c.type == "nested"]
        for a, b in itertools.product(query_paths, query_paths):
            aa = a[0]
            bb = b[0]
            if aa and bb.startswith(aa):
                # aa is a prefix of bb: record aa as an intermediate split of b
                for i, b_prefix in enumerate(b):
                    if len(b_prefix) > len(aa):
                        continue
                    if aa == b_prefix:
                        break  # SPLIT ALREADY FOUND
                    b.insert(i, aa)
                    break
        for q in query_paths:
            q.append(".")
        query_paths.append(ROOT_PATH)

        # ADD RELATIVE COLUMNS
        # NOTE(review): abs_column is passed without a copy, so add_column's
        # mutations persist across query paths — confirm intended
        for abs_column in abs_columns:
            for query_path in query_paths:
                add_column(abs_column, query_path)
def _parse_properties(self, alias, mapping, meta):
    """
    Parse an ES mapping into Column records, warn on unstored columns,
    compute nested query paths for the alias (and its indexes), and queue
    every column for metadata collection.

    :param alias: ES alias name; used as the table name for the columns
    :param mapping: ES mapping; `mapping.properties` is the property tree
    :param meta: index metadata (accepted for interface parity; unused here)
    """
    abs_columns = elasticsearch.parse_properties(alias, None, mapping.properties)
    # cardinality == 0 indicates the property is not stored; "_id" is exempt
    if any(c.cardinality == 0 and c.names['.'] != '_id' for c in abs_columns):
        Log.warning(
            "Some columns are not stored {{names}}",
            names=[
                ".".join((c.es_index, c.names['.']))
                for c in abs_columns
                if c.cardinality == 0
            ]
        )
    with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, silent=not DEBUG):
        # LIST OF EVERY NESTED PATH
        query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"]
        for a, b in itertools.product(query_paths, query_paths):
            aa = a[0]
            bb = b[0]
            if aa and bb.startswith(aa):
                # aa is a prefix of bb: record aa as an intermediate split of b
                for i, b_prefix in enumerate(b):
                    if len(b_prefix) > len(aa):
                        continue
                    if aa == b_prefix:
                        break  # SPLIT ALREADY FOUND
                    b.insert(i, aa)
                    break
        for q in query_paths:
            q.append(SELF_PATH)
        query_paths.append(ROOT_PATH)
        # share the query paths with every index behind this alias
        self.alias_to_query_paths[alias] = query_paths
        for i in self.index_to_alias.get_domain(alias):
            self.alias_to_query_paths[i] = query_paths

        # ADD RELATIVE NAMES
        for abs_column in abs_columns:
            # last_updated=None marks the column as needing a stats refresh later
            abs_column.last_updated = None
            abs_column.jx_type = jx_type(abs_column)
            for query_path in query_paths:
                abs_column.names[query_path[0]] = relative_field(abs_column.names["."], query_path[0])
            self.todo.add(self.meta.columns.add(abs_column))
        # (removed dead trailing `pass` from the original)
def _parse_properties(self, alias, mapping, meta):
    """
    Parse an ES mapping into Column records, warn on unstored columns,
    compute nested query paths for the alias (and its indexes), and queue
    every column for metadata collection.

    :param alias: ES alias name; used as the table name for the columns
    :param mapping: ES mapping; `mapping.properties` is the property tree
    :param meta: index metadata (accepted for interface parity; unused here)
    """
    abs_columns = elasticsearch.parse_properties(alias, None, mapping.properties)
    # cardinality == 0 indicates the property is not stored; "_id" is exempt
    if any(c.cardinality == 0 and c.names['.'] != '_id' for c in abs_columns):
        Log.warning(
            "Some columns are not stored {{names}}",
            names=[
                ".".join((c.es_index, c.names['.']))
                for c in abs_columns
                if c.cardinality == 0
            ]
        )
    with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, silent=not DEBUG):
        # LIST OF EVERY NESTED PATH
        query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"]
        for a, b in itertools.product(query_paths, query_paths):
            aa = a[0]
            bb = b[0]
            if aa and bb.startswith(aa):
                # aa is a prefix of bb: record aa as an intermediate split of b
                for i, b_prefix in enumerate(b):
                    if len(b_prefix) > len(aa):
                        continue
                    if aa == b_prefix:
                        break  # SPLIT ALREADY FOUND
                    b.insert(i, aa)
                    break
        for q in query_paths:
            q.append(SELF_PATH)
        query_paths.append(ROOT_PATH)
        # share the query paths with every index behind this alias
        self.alias_to_query_paths[alias] = query_paths
        for i in self.index_to_alias.get_domain(alias):
            self.alias_to_query_paths[i] = query_paths

        # ADD RELATIVE NAMES
        for abs_column in abs_columns:
            # last_updated=None marks the column as needing a stats refresh later
            abs_column.last_updated = None
            abs_column.jx_type = jx_type(abs_column)
            for query_path in query_paths:
                abs_column.names[query_path[0]] = relative_field(abs_column.names["."], query_path[0])
            self.todo.add(self.meta.columns.add(abs_column))
    pass
def _parse_properties(self, abs_index, properties, meta):
    """
    Parse ES properties into columns and register both absolute and
    path-relative versions of every column, for the index and its aliases.

    :param abs_index: name of the concrete ES index
    :param properties: mapping object; `properties.properties` is the tree
    :param meta: index metadata; `meta.aliases` lists the index's aliases
    """
    abs_columns = _elasticsearch.parse_properties(abs_index, None, properties.properties)
    # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
    abs_columns = abs_columns.filter(
        lambda r: not r.es_column.startswith("other.") and not r.es_column.startswith("previous_values.cf_") and not r.es_index.startswith("debug"))
    with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
        def add_column(c, query_path):
            # stamp, derive the table name from the deepest path element,
            # then upsert for the index and for each of its aliases
            c.last_updated = Date.now()
            if query_path:
                c.table = c.es_index + "." + query_path.last()
            else:
                c.table = c.es_index
            with self.meta.columns.locker:
                self._upsert_column(c)
                for alias in meta.aliases:
                    c = copy(c)
                    if query_path:
                        c.table = alias + "." + query_path.last()
                    else:
                        c.table = alias
                    self._upsert_column(c)

        # EACH query_path IS A LIST OF EVER-INCREASING PATHS THROUGH EACH NESTED LEVEL
        query_paths = wrap([[c.es_column] for c in abs_columns if c.type == "nested"])
        for a, b in itertools.product(query_paths, query_paths):
            aa = a.last()
            bb = b.last()
            if aa and bb.startswith(aa):
                # aa is a prefix of bb: record aa as an intermediate split of b
                for i, b_prefix in enumerate(b):
                    if len(b_prefix) < len(aa):
                        continue
                    if aa == b_prefix:
                        break  # SPLIT ALREADY FOUND
                    b.insert(0, aa)
                    break
        query_paths.append([])

        for c in abs_columns:
            # ADD RELATIVE COLUMNS
            full_path = listwrap(c.nested_path)
            abs_depth = len(full_path)
            abs_parent = coalesce(full_path.last(), "")
            for query_path in query_paths:
                rel_depth = len(query_path)
                # ABSOLUTE
                add_column(copy(c), query_path)
                cc = copy(c)
                cc.relative = True
                if not query_path:
                    # root path: relative name equals absolute name
                    add_column(cc, query_path)
                    continue
                rel_parent = query_path.last()
                if c.es_column.startswith(rel_parent + "."):
                    # column is inside this nested path: strip the prefix
                    cc.name = c.es_column[len(rel_parent) + 1:]
                    add_column(cc, query_path)
                elif c.es_column == rel_parent:
                    cc.name = "."
                    add_column(cc, query_path)
                elif not abs_parent:
                    # THIS RELATIVE NAME (..o) ALSO NEEDS A RELATIVE NAME (o)
                    # AND THEN REMOVE THE SHADOWED
                    cc.name = "." + ("." * (rel_depth - abs_depth)) + c.es_column
                    add_column(cc, query_path)
                elif rel_parent.startswith(abs_parent + "."):
                    cc.name = "." + ("." * (rel_depth - abs_depth)) + c.es_column
                    add_column(cc, query_path)
                elif rel_parent != abs_parent:
                    # SIBLING NESTED PATHS ARE INVISIBLE
                    pass
                else:
                    Log.error("logic error")
def _parse_properties(self, abs_index, properties, meta):
    """
    Parse ES properties into columns and register both absolute and
    path-relative versions of every column, for the index and its aliases.

    :param abs_index: name of the concrete ES index
    :param properties: mapping object; `properties.properties` is the tree
    :param meta: index metadata; `meta.aliases` lists the index's aliases
    """
    abs_columns = _elasticsearch.parse_properties(abs_index, None, properties.properties)
    # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
    abs_columns = abs_columns.filter(
        lambda r: not r.es_column.startswith("other.") and not r.es_column.startswith("previous_values.cf_") and not r.es_index.startswith("debug")
    )
    with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
        def add_column(c, query_path):
            # stamp, derive the table name from the deepest path element,
            # then upsert for the index and for each of its aliases
            c.last_updated = Date.now()
            if query_path:
                c.table = c.es_index + "." + query_path.last()
            else:
                c.table = c.es_index
            with self.meta.columns.locker:
                self._upsert_column(c)
                for alias in meta.aliases:
                    c = copy(c)
                    if query_path:
                        c.table = alias + "." + query_path.last()
                    else:
                        c.table = alias
                    self._upsert_column(c)

        # EACH query_path IS A LIST OF EVER-INCREASING PATHS THROUGH EACH NESTED LEVEL
        query_paths = wrap([[c.es_column] for c in abs_columns if c.type == "nested"])
        for a, b in itertools.product(query_paths, query_paths):
            aa = a.last()
            bb = b.last()
            if aa and bb.startswith(aa):
                # aa is a prefix of bb: record aa as an intermediate split of b
                for i, b_prefix in enumerate(b):
                    if len(b_prefix) < len(aa):
                        continue
                    if aa == b_prefix:
                        break  # SPLIT ALREADY FOUND
                    b.insert(0, aa)
                    break
        query_paths.append([])

        for c in abs_columns:
            # ADD RELATIVE COLUMNS
            full_path = listwrap(c.nested_path)
            abs_depth = len(full_path)
            abs_parent = coalesce(full_path.last(), "")
            for query_path in query_paths:
                rel_depth = len(query_path)
                # ABSOLUTE
                add_column(copy(c), query_path)
                cc = copy(c)
                cc.relative = True
                if not query_path:
                    # root path: relative name equals absolute name
                    add_column(cc, query_path)
                    continue
                rel_parent = query_path.last()
                if c.es_column.startswith(rel_parent+"."):
                    # column is inside this nested path: strip the prefix
                    cc.name = c.es_column[len(rel_parent)+1:]
                    add_column(cc, query_path)
                elif c.es_column == rel_parent:
                    cc.name = "."
                    add_column(cc, query_path)
                elif not abs_parent:
                    # THIS RELATIVE NAME (..o) ALSO NEEDS A RELATIVE NAME (o)
                    # AND THEN REMOVE THE SHADOWED
                    cc.name = "." + ("." * (rel_depth - abs_depth)) + c.es_column
                    add_column(cc, query_path)
                elif rel_parent.startswith(abs_parent+"."):
                    cc.name = "." + ("." * (rel_depth - abs_depth)) + c.es_column
                    add_column(cc, query_path)
                elif rel_parent != abs_parent:
                    # SIBLING NESTED PATHS ARE INVISIBLE
                    pass
                else:
                    Log.error("logic error")
def _parse_properties(self, alias, mapping):
    """
    Parse an ES mapping into Column records, compute nested query paths,
    deduplicate same-named columns (deepest nested wins), and register
    the canonical columns.

    :param alias: ES alias name
    :param mapping: ES mapping; `mapping.properties` is the property tree
    :return: list of canonical Column records added to the metadata store
    """
    abs_columns = elasticsearch.parse_properties(alias, ".", ROOT_PATH, mapping.properties)
    # cardinality == 0 indicates the property is not stored; "_id" is exempt
    if DEBUG and any(c.cardinality == 0 and c.name != '_id' for c in abs_columns):
        Log.warning(
            "Some columns are not stored in {{url}} {{index|quote}} table:\n{{names}}",
            url=self.es_cluster.url,
            index=alias,
            names=[
                ".".join((c.es_index, c.name))
                for c in abs_columns
                if c.cardinality == 0
            ])
    with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, silent=not DEBUG):
        # LIST OF EVERY NESTED PATH
        query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"]
        for a, b in itertools.product(query_paths, query_paths):
            aa = a[0]
            bb = b[0]
            if aa and bb.startswith(aa):
                # aa is a prefix of bb: record aa as an intermediate split of b
                for i, b_prefix in enumerate(b):
                    if len(b_prefix) > len(aa):
                        continue
                    if aa == b_prefix:
                        break  # SPLIT ALREADY FOUND
                    b.insert(i, aa)
                    break
        for q in query_paths:
            q.append(".")
        query_paths.append(ROOT_PATH)

        # ENSURE ALL TABLES HAVE THE QUERY PATHS SET
        self.alias_to_query_paths[alias] = query_paths
        for i, a in self.index_to_alias.items():
            if a == alias:
                self.alias_to_query_paths[i] = query_paths

        # ENSURE COLUMN HAS CORRECT jx_type
        # PICK DEEPEST NESTED PROPERTY AS REPRESENTATIVE
        output = []
        best = {}
        for abs_column in abs_columns:
            abs_column.jx_type = jx_type(abs_column)
            if abs_column.jx_type not in STRUCT:
                clean_name = unnest_path(abs_column.name)
                other = best.get(clean_name)
                if other:
                    if len(other.nested_path) < len(abs_column.nested_path):
                        # deeper duplicate wins: evict the shallower one and
                        # clear its record from the metadata store
                        output.remove(other)
                        self.meta.columns.update({
                            "clear": ".",
                            "where": {"eq": {"es_column": other.es_column, "es_index": other.es_index}}
                        })
                    else:
                        continue  # existing column is at least as deep; skip this one
                best[clean_name] = abs_column
            output.append(abs_column)

        # REGISTER ALL COLUMNS
        canonicals = []
        for abs_column in output:
            canonical = self.meta.columns.add(abs_column)
            canonicals.append(canonical)
        self.todo.extend(canonicals)
        return canonicals
def _parse_properties(self, abs_index, properties, meta):
    """
    Parse ES properties into columns and register both absolute and
    path-relative versions of every column, for the index and its aliases.

    :param abs_index: name of the concrete ES index
    :param properties: mapping object; `properties.properties` is the tree
    :param meta: index metadata; `meta.aliases` lists the index's aliases
    """
    abs_columns = _elasticsearch.parse_properties(abs_index, None, properties.properties)
    # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
    abs_columns = abs_columns.filter(
        lambda r: not r.es_column.startswith("other.") and not r.es_column.startswith("previous_values.cf_") and not r.es_index.startswith("debug") and r.es_column.find("=")==-1 and r.es_column.find(" ")==-1
    )
    with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
        def add_column(c, query_path):
            # stamp, derive the dotted table name from the query path,
            # then upsert for the index and for each of its aliases
            c.last_updated = Date.now()
            c.table = join_field([c.es_index]+split_field(query_path[0]))
            with self.meta.columns.locker:
                self._upsert_column(c)
                for alias in meta.aliases:
                    c = copy(c)
                    c.table = join_field([alias]+split_field(query_path[0]))
                    self._upsert_column(c)

        # LIST OF EVERY NESTED PATH
        query_paths = [[c.es_column] for c in abs_columns if c.type == "nested"]
        for a, b in itertools.product(query_paths, query_paths):
            aa = a[0]
            bb = b[0]
            if aa and bb.startswith(aa):
                # aa is a prefix of bb: record aa as an intermediate split of b
                for i, b_prefix in enumerate(b):
                    if len(b_prefix) > len(aa):
                        continue
                    if aa == b_prefix:
                        break  # SPLIT ALREADY FOUND
                    b.insert(i, aa)
                    break
        for q in query_paths:
            q.append(".")
        query_paths.append(ROOT_PATH)

        # ADD RELATIVE COLUMNS
        for abs_column in abs_columns:
            full_path = abs_column.nested_path
            abs_depth = len(full_path)-1
            abs_parent = full_path[1] if abs_depth else ""
            for query_path in query_paths:
                rel_depth = len(query_path)-1
                rel_parent = query_path[0]
                rel_column = copy(abs_column)
                rel_column.relative = True
                # ABSOLUTE version first
                add_column(copy(abs_column), query_path)
                if rel_parent == ".":
                    # root path: relative name equals absolute name
                    add_column(rel_column, query_path)
                elif abs_column.es_column.startswith(rel_parent+"."):
                    # column is inside this nested path: strip the prefix
                    rel_column.name = abs_column.es_column[len(rel_parent)+1:]
                    add_column(rel_column, query_path)
                elif abs_column.es_column == rel_parent:
                    rel_column.name = "."
                    add_column(rel_column, query_path)
                elif not abs_parent:
                    # THIS RELATIVE NAME (..o) ALSO NEEDS A RELATIVE NAME (o)
                    # AND THEN REMOVE THE SHADOWED
                    rel_column.name = "." + ("." * (rel_depth - abs_depth)) + abs_column.es_column
                    add_column(rel_column, query_path)
                elif rel_parent.startswith(abs_parent+"."):
                    rel_column.name = "." + ("." * (rel_depth - abs_depth)) + abs_column.es_column
                    add_column(rel_column, query_path)
                elif rel_parent != abs_parent:
                    # SIBLING NESTED PATHS ARE INVISIBLE
                    pass
                else:
                    Log.error("logic error")
def _parse_properties(self, alias, mapping):
    """
    Parse an ES mapping into Column records, compute nested query paths,
    deduplicate same-named columns (deepest nested wins), and register
    the canonical columns.

    :param alias: ES alias name
    :param mapping: ES mapping; `mapping.properties` is the property tree
    :return: list of canonical Column records added to the metadata store
    """
    abs_columns = elasticsearch.parse_properties(alias, ".", ROOT_PATH, mapping.properties)
    # cardinality == 0 indicates the property is not stored; "_id" is exempt
    if DEBUG and any(c.cardinality == 0 and c.name != '_id' for c in abs_columns):
        Log.warning(
            "Some columns are not stored in {{url}} {{index|quote}} table:\n{{names}}",
            url=self.es_cluster.url,
            index=alias,
            names=[
                ".".join((c.es_index, c.name))
                for c in abs_columns
                if c.cardinality == 0
            ]
        )
    with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, silent=not DEBUG):
        # LIST OF EVERY NESTED PATH
        query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"]
        for a, b in itertools.product(query_paths, query_paths):
            aa = a[0]
            bb = b[0]
            if aa and bb.startswith(aa):
                # aa is a prefix of bb: record aa as an intermediate split of b
                for i, b_prefix in enumerate(b):
                    if len(b_prefix) > len(aa):
                        continue
                    if aa == b_prefix:
                        break  # SPLIT ALREADY FOUND
                    b.insert(i, aa)
                    break
        for q in query_paths:
            q.append(".")
        query_paths.append(ROOT_PATH)

        # ENSURE ALL TABLES HAVE THE QUERY PATHS SET
        self.alias_to_query_paths[alias] = query_paths
        for i, a in self.index_to_alias.items():
            if a == alias:
                self.alias_to_query_paths[i] = query_paths

        # ENSURE COLUMN HAS CORRECT jx_type
        # PICK DEEPEST NESTED PROPERTY AS REPRESENTATIVE
        output = []
        best = {}
        for abs_column in abs_columns:
            abs_column.jx_type = jx_type(abs_column)
            if abs_column.jx_type not in STRUCT:
                clean_name = unnest_path(abs_column.name)
                other = best.get(clean_name)
                if other:
                    if len(other.nested_path) < len(abs_column.nested_path):
                        # deeper duplicate wins: evict the shallower one and
                        # clear its record from the metadata store
                        output.remove(other)
                        self.meta.columns.update({"clear": ".", "where": {"eq": {"es_column": other.es_column, "es_index": other.es_index}}})
                    else:
                        continue  # existing column is at least as deep; skip this one
                best[clean_name] = abs_column
            output.append(abs_column)

        # REGISTER ALL COLUMNS
        canonicals = []
        for abs_column in output:
            canonical = self.meta.columns.add(abs_column)
            canonicals.append(canonical)
        self.todo.extend(canonicals)
        return canonicals